service: skip load_sketch unload for excluded nodes on RF shrink

When an RF change shrinks replicas on a DC and the node being shrunk is
excluded, refresh_tablet_load_stats() only provides load_stats for that
node if it has a cached snapshot from when the node was still up. If the
snapshot is missing or predates the tables being shrunk (e.g. they were
created after the node went down), stats stay incomplete. In that case
load_sketch::unload() called from make_rf_change_plan() throws:

    Can't provide accurate load computation with incomplete load_stats
    for host: <uuid>

Since an excluded node is not expected to come back, load_stats will
never become complete, and the topology coordinator retries the plan
infinitely, hanging ALTER KEYSPACE.

Add a check for excluded nodes and skip unload() for them: we are
removing the replica, so accurate load data for that node is not
needed. For all other node states the throw-and-retry behavior is
preserved.

Modify test_excludenode_shrink_rf to always trigger the bug: a new
error injection 'force_down_node_load_stats_invalid' forces the
invalid-stats path in refresh_tablet_load_stats() for a down node, so
the test does not depend on whether the load-stats refresher happened
to cache the excluded node's stats while it was still up.

Fixes: SCYLLADB-2056.

Closes scylladb/scylladb#29622

(cherry picked from commit d874d355c2)

Closes scylladb/scylladb#29927
This commit is contained in:
Aleksandra Martyniuk
2026-04-23 14:13:21 +02:00
committed by Avi Kivity
parent 51e187b6e2
commit 4b8672256d
3 changed files with 11 additions and 3 deletions

View File

@@ -1849,7 +1849,8 @@ public:
auto mig_streaming_info = get_migration_streaming_info(topo, ti, mig);
// The node being shrunk may be excluded/down and lack complete tablet stats.
// Since we're removing a replica (not placing one), accurate load data isn't needed.
if (_load_sketch->has_node(replica->host)) {
auto* rep_node = topo.find_node(replica->host);
if (_load_sketch->has_node(replica->host) && !(rep_node && rep_node->is_excluded())) {
unload(*_load_sketch, replica->host, replica->shard, source_tablets);
}
if (can_accept_load(nodes, mig_streaming_info)) {

View File

@@ -4437,7 +4437,8 @@ future<> topology_coordinator::refresh_tablet_load_stats() {
locator::load_stats node_stats;
if (!_gossiper.is_alive(dst)) {
if (_load_stats_per_node.contains(dst)) {
if (_load_stats_per_node.contains(dst) &&
!utils::get_local_injector().enter("force_down_node_load_stats_invalid")) {
node_stats = _load_stats_per_node[dst];
} else {
rtlogger.debug("raft topology: Unable to refresh table load on {} because it's down.", dst);

View File

@@ -1745,6 +1745,7 @@ async def test_excludenode(manager: ManagerClient):
await manager.remove_node(live_node.server_id, server_id=node_to_remove.server_id)
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_excludenode_shrink_rf(manager: ManagerClient):
"""
Verifies that ALTER keyspace removing replicas from a DC succeeds when the only
@@ -1762,6 +1763,11 @@ async def test_excludenode_shrink_rf(manager: ManagerClient):
dc1_servers = await manager.servers_add(servers_num=2, config=config, cmdline=cmdline, property_file={'dc': 'dc1', 'rack': 'rack1'})
dc2_server = await manager.server_add(config=config, cmdline=cmdline, property_file={'dc': 'dc2', 'rack': 'rack2'})
# Force incomplete load_stats for the down node on every potential topology
# coordinator, so the shrink plan deterministically hits the excluded-node
# path instead of relying on stale cached stats.
await asyncio.gather(*(manager.api.enable_injection(s.ip_addr, "force_down_node_load_stats_invalid", one_shot=False) for s in dc1_servers))
cql = manager.get_cql()
async with new_test_keyspace(manager, "WITH replication = { 'class': 'NetworkTopologyStrategy', "
"'dc1': 1, 'dc2': 1} AND tablets = { 'initial': 4 }") as ks:
@@ -1778,7 +1784,7 @@ async def test_excludenode_shrink_rf(manager: ManagerClient):
# ALTER keyspace to remove dc2 from replication.
# This should succeed without timing out, even though the excluded node
# has tablet replicas that need to be shrunk away.
await cql.run_async(f"ALTER KEYSPACE {ks} WITH REPLICATION = {{ 'class': 'NetworkTopologyStrategy', 'dc1': 1, 'dc2': 0}}")
await asyncio.wait_for(cql.run_async(f"ALTER KEYSPACE {ks} WITH REPLICATION = {{ 'class': 'NetworkTopologyStrategy', 'dc1': 1, 'dc2': 0}}"), timeout=120)
await asyncio.gather(*(read_barrier(manager.api, s.ip_addr) for s in dc1_servers))