From bceb054bbdf33211d97a5194a1d975b5e5eb515b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 27 Apr 2026 21:08:12 +0300 Subject: [PATCH] test/cluster/test_incremental_repair: fix flaky coordinator-change scenario The test_incremental_repair_race_window_promotes_unrepaired_data test was flaky because it hardcoded servers[1] as the restart target but did not ensure servers[1] was NOT the topology coordinator. When servers[1] happened to be the Raft group0 leader (topology coordinator), restarting it killed the leader, forced a new election, and the new coordinator re-initiated tablet repair. This re-repair flushes memtables on all replicas via take_storage_snapshot() and marks the resulting sstables as repaired -- causing post-repair keys to appear in repaired sstables on servers[0] and servers[2]. The test then hit the wrong assertion (servers[0]/[2] contaminated). Fix: before starting the repair, check whether servers[1] is the topology coordinator. If so, move leadership to another server via ensure_group0_leader_on() so that restarting servers[1] only kills a follower -- which does not trigger an election or coordinator change. Reproducibility was confirmed by forcing leadership to servers[1] via ensure_group0_leader_on() and observing deterministic failure with all three servers showing post-repair keys in repaired sstables (confirming the re-repair scenario), then verifying the fix passes reliably. Fixes: SCYLLADB-1478 Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1903 (cherry picked from commit 914b70c75b8acf6bfce78cf68f9c90d9035764d3) Signed-off-by: Raphael S. 
Carvalho --- test/cluster/test_incremental_repair.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/test/cluster/test_incremental_repair.py b/test/cluster/test_incremental_repair.py index 6ece1eedab..cf59112fc0 100644 --- a/test/cluster/test_incremental_repair.py +++ b/test/cluster/test_incremental_repair.py @@ -9,7 +9,7 @@ from test.cluster.conftest import skip_mode from test.pylib.repair import load_tablet_sstables_repaired_at, create_table_insert_data_for_repair from test.pylib.tablets import get_all_tablet_replicas from test.cluster.tasks.task_manager_client import TaskManagerClient -from test.cluster.util import reconnect_driver, find_server_by_host_id, get_topology_coordinator, new_test_keyspace, new_test_table, trigger_stepdown +from test.cluster.util import reconnect_driver, find_server_by_host_id, get_topology_coordinator, ensure_group0_leader_on, new_test_keyspace, new_test_table, trigger_stepdown from cassandra.query import ConsistencyLevel @@ -965,8 +965,18 @@ async def test_incremental_repair_race_window_promotes_unrepaired_data(manager: await manager.api.flush_keyspace(s.ip_addr, ks) current_key += 10 + # Ensure servers[1] is not the topology coordinator. If the coordinator is + # restarted, the Raft leader dies, a new election occurs, and the new + # coordinator re-initiates tablet repair -- flushing memtables on all replicas + # and marking post-repair data as repaired. That legitimate re-repair masks + # the compaction-merge bug this test detects. coord = await get_topology_coordinator(manager) coord_serv = await find_server_by_host_id(manager, servers, coord) + if coord_serv == servers[1]: + other = next(s for s in servers if s != servers[1]) + await ensure_group0_leader_on(manager, other) + coord = await get_topology_coordinator(manager) + coord_serv = await find_server_by_host_id(manager, servers, coord) coord_log = await manager.server_open_log(coord_serv.server_id) coord_mark = await coord_log.mark()