From 4b8672256d7818455bcaa4c0797ce96d989134cd Mon Sep 17 00:00:00 2001 From: Aleksandra Martyniuk Date: Thu, 23 Apr 2026 14:13:21 +0200 Subject: [PATCH] service: skip load_sketch unload for excluded nodes on RF shrink When an RF change shrinks replicas on a DC and the node being shrunk is excluded, refresh_tablet_load_stats() only provides load_stats for that node if it has a cached snapshot from when the node was still up. If the snapshot is missing or predates the tables being shrunk (e.g. they were created after the node went down), stats stay incomplete. In that case load_sketch::unload() called from make_rf_change_plan() throws: Can't provide accurate load computation with incomplete load_stats for host: Since an excluded node is not expected to come back, load_stats will never become complete, and the topology coordinator retries the plan infinitely, hanging ALTER KEYSPACE. Add a check for excluded nodes and skip unload() for them: we are removing the replica, so accurate load data for that node is not needed. For all other node states the throw-and-retry behavior is preserved. Modify test_excludenode_shrink_rf to always trigger the bug: a new error injection 'force_down_node_load_stats_invalid' forces the invalid-stats path in refresh_tablet_load_stats() for a down node, so the test does not depend on whether the load-stats refresher happened to cache the excluded node's stats while it was still up. Fixes: SCYLLADB-2056. Closes scylladb/scylladb#29622 (cherry picked from commit d874d355c24410d00963a4c95c38fc80391ed148) Closes scylladb/scylladb#29927 --- service/tablet_allocator.cc | 3 ++- service/topology_coordinator.cc | 3 ++- test/cluster/test_tablets.py | 8 +++++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/service/tablet_allocator.cc b/service/tablet_allocator.cc index a0aa421885..938e8a38b5 100644 --- a/service/tablet_allocator.cc +++ b/service/tablet_allocator.cc @@ -1849,7 +1849,8 @@ public: auto mig_streaming_info = get_migration_streaming_info(topo, ti, mig); // The node being shrunk may be excluded/down and lack complete tablet stats. // Since we're removing a replica (not placing one), accurate load data isn't needed. - if (_load_sketch->has_node(replica->host)) { + auto* rep_node = topo.find_node(replica->host); + if (_load_sketch->has_node(replica->host) && !(rep_node && rep_node->is_excluded())) { unload(*_load_sketch, replica->host, replica->shard, source_tablets); } if (can_accept_load(nodes, mig_streaming_info)) { diff --git a/service/topology_coordinator.cc b/service/topology_coordinator.cc index 6972004410..424f92b22d 100644 --- a/service/topology_coordinator.cc +++ b/service/topology_coordinator.cc @@ -4437,7 +4437,8 @@ future<> topology_coordinator::refresh_tablet_load_stats() { locator::load_stats node_stats; if (!_gossiper.is_alive(dst)) { - if (_load_stats_per_node.contains(dst)) { + if (_load_stats_per_node.contains(dst) && + !utils::get_local_injector().enter("force_down_node_load_stats_invalid")) { node_stats = _load_stats_per_node[dst]; } else { rtlogger.debug("raft topology: Unable to refresh table load on {} because it's down.", dst); diff --git a/test/cluster/test_tablets.py b/test/cluster/test_tablets.py index 9aa6381dc1..6c73290f65 100644 --- a/test/cluster/test_tablets.py +++ b/test/cluster/test_tablets.py @@ -1745,6 +1745,7 @@ async def test_excludenode(manager: ManagerClient): await manager.remove_node(live_node.server_id, server_id=node_to_remove.server_id) @pytest.mark.asyncio +@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode') async def test_excludenode_shrink_rf(manager: ManagerClient): """ Verifies that ALTER keyspace removing replicas from a DC succeeds when the only @@ -1762,6 +1763,11 @@ async def test_excludenode_shrink_rf(manager: ManagerClient): dc1_servers = await manager.servers_add(servers_num=2, config=config, cmdline=cmdline, property_file={'dc': 'dc1', 'rack': 'rack1'}) dc2_server = await manager.server_add(config=config, cmdline=cmdline, property_file={'dc': 'dc2', 'rack': 'rack2'}) + # Force incomplete load_stats for the down node on every potential topology + # coordinator, so the shrink plan deterministically hits the excluded-node + # path instead of relying on stale cached stats. + await asyncio.gather(*(manager.api.enable_injection(s.ip_addr, "force_down_node_load_stats_invalid", one_shot=False) for s in dc1_servers)) + cql = manager.get_cql() async with new_test_keyspace(manager, "WITH replication = { 'class': 'NetworkTopologyStrategy', " "'dc1': 1, 'dc2': 1} AND tablets = { 'initial': 4 }") as ks: @@ -1778,7 +1784,7 @@ async def test_excludenode_shrink_rf(manager: ManagerClient): # ALTER keyspace to remove dc2 from replication. # This should succeed without timing out, even though the excluded node # has tablet replicas that need to be shrunk away. - await cql.run_async(f"ALTER KEYSPACE {ks} WITH REPLICATION = {{ 'class': 'NetworkTopologyStrategy', 'dc1': 1, 'dc2': 0}}") + await asyncio.wait_for(cql.run_async(f"ALTER KEYSPACE {ks} WITH REPLICATION = {{ 'class': 'NetworkTopologyStrategy', 'dc1': 1, 'dc2': 0}}"), timeout=120) await asyncio.gather(*(read_barrier(manager.api, s.ip_addr) for s in dc1_servers))