From bceb054bbdf33211d97a5194a1d975b5e5eb515b Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@scylladb.com>
Date: Mon, 27 Apr 2026 21:08:12 +0300
Subject: [PATCH] test/cluster/test_incremental_repair: fix flaky
 coordinator-change scenario

The test_incremental_repair_race_window_promotes_unrepaired_data test
was flaky because it hardcoded servers[1] as the restart target but did
not ensure that servers[1] was not the topology coordinator.

When servers[1] happened to be the Raft group0 leader (topology
coordinator), restarting it killed the leader and forced a new
election, after which the new coordinator re-initiated tablet repair.
This re-repair flushed memtables on all replicas via
take_storage_snapshot() and marked the resulting sstables as repaired
-- causing post-repair keys to appear in repaired sstables on
servers[0] and servers[2].  The test then failed on the wrong
assertion (servers[0] and servers[2] contaminated).

Fix: before starting the repair, check whether servers[1] is the
topology coordinator.  If so, move leadership to another server via
ensure_group0_leader_on() so that restarting servers[1] only kills a
follower -- which does not trigger an election or coordinator change.

Reproducibility was confirmed by forcing leadership to servers[1] via
ensure_group0_leader_on() and observing deterministic failure with all
three servers showing post-repair keys in repaired sstables (confirming
the re-repair scenario), then verifying the fix passes reliably.

Fixes: SCYLLADB-1478
Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1903
(cherry picked from commit 914b70c75b8acf6bfce78cf68f9c90d9035764d3)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
---
 test/cluster/test_incremental_repair.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/test/cluster/test_incremental_repair.py b/test/cluster/test_incremental_repair.py
index 6ece1eedab..cf59112fc0 100644
--- a/test/cluster/test_incremental_repair.py
+++ b/test/cluster/test_incremental_repair.py
@@ -9,7 +9,7 @@ from test.cluster.conftest import skip_mode
 from test.pylib.repair import load_tablet_sstables_repaired_at, create_table_insert_data_for_repair
 from test.pylib.tablets import get_all_tablet_replicas
 from test.cluster.tasks.task_manager_client import TaskManagerClient
-from test.cluster.util import reconnect_driver, find_server_by_host_id, get_topology_coordinator, new_test_keyspace, new_test_table, trigger_stepdown
+from test.cluster.util import reconnect_driver, find_server_by_host_id, get_topology_coordinator, ensure_group0_leader_on, new_test_keyspace, new_test_table, trigger_stepdown
 
 from cassandra.query import ConsistencyLevel
 
@@ -965,8 +965,18 @@ async def test_incremental_repair_race_window_promotes_unrepaired_data(manager:
         await manager.api.flush_keyspace(s.ip_addr, ks)
     current_key += 10
 
+    # Ensure servers[1] is not the topology coordinator.  If the coordinator is
+    # restarted, the Raft leader dies, a new election occurs, and the new
+    # coordinator re-initiates tablet repair -- flushing memtables on all replicas
+    # and marking post-repair data as repaired.  That legitimate re-repair masks
+    # the compaction-merge bug this test detects.
     coord = await get_topology_coordinator(manager)
     coord_serv = await find_server_by_host_id(manager, servers, coord)
+    if coord_serv == servers[1]:
+        other = next(s for s in servers if s != servers[1])
+        await ensure_group0_leader_on(manager, other)
+        coord = await get_topology_coordinator(manager)
+        coord_serv = await find_server_by_host_id(manager, servers, coord)
     coord_log = await manager.server_open_log(coord_serv.server_id)
     coord_mark = await coord_log.mark()