From 0fcae725308ba95bdc100ca46deb2fc3e88248fe Mon Sep 17 00:00:00 2001 From: Piotr Smaron Date: Mon, 11 May 2026 12:29:16 +0200 Subject: [PATCH] test: bootstrap tombstone gc repair cluster sequentially Avoid concurrent topology changes in the tombstone GC repair setup, where debug-mode nodes running hinted handoff and materialized view startup work can time out while applying Raft entries before the test starts. Keep the sequential path opt-in so unrelated repair tests still exercise concurrent bootstrap behavior. Closes scylladb/scylladb#29829 --- test/cluster/test_incremental_repair.py | 4 +++- test/pylib/repair.py | 13 ++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/test/cluster/test_incremental_repair.py b/test/cluster/test_incremental_repair.py index a4eeaaab65..8d4dcfbcf5 100644 --- a/test/cluster/test_incremental_repair.py +++ b/test/cluster/test_incremental_repair.py @@ -1200,9 +1200,11 @@ async def _setup_tombstone_gc_cluster(manager, *, tablets=2, extra_cmdline=None) cmdline = ['--logger-log-level', 'repair=debug'] if extra_cmdline: cmdline += extra_cmdline + # These tests enable hinted handoff and materialized views, which make debug-mode + # concurrent bootstrap occasionally exceed the topology timeout before the test starts. servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair( manager, nr_keys=0, cmdline=cmdline, tablets=tablets, - disable_flush_cache_time=True) + disable_flush_cache_time=True, sequential_server_add=True) # Lower propagation_delay to 0 so gc_before = repair_time, making tombstones # GC-eligible immediately after a successful repair rather than 1h later. await cql.run_async( diff --git a/test/pylib/repair.py b/test/pylib/repair.py index 7a3f80c34f..df0b2f1169 100644 --- a/test/pylib/repair.py +++ b/test/pylib/repair.py @@ -64,7 +64,9 @@ async def load_tablet_repair_task_infos(cql, host, table_id): return repair_task_infos -async def create_table_insert_data_for_repair(manager, rf = 3 , tablets = 8, fast_stats_refresh = True, nr_keys = 256, disable_flush_cache_time = False, cmdline = None) -> (list[ServerInfo], CassandraSession, list[Host], str, str): +async def create_table_insert_data_for_repair(manager, rf=3, tablets=8, fast_stats_refresh=True, nr_keys=256, + disable_flush_cache_time=False, cmdline=None, + sequential_server_add=False) -> tuple[list[ServerInfo], CassandraSession, list[Host], str, str]: assert rf <= 3, "A keyspace with RF > 3 will be RF-rack-invalid if there are fewer racks than the RF" if fast_stats_refresh: @@ -73,8 +75,13 @@ async def create_table_insert_data_for_repair(manager, rf = 3 , tablets = 8, fas config = {} if disable_flush_cache_time: config.update({'repair_hints_batchlog_flush_cache_time_in_ms': 0}) - servers = await manager.servers_add(3, config=config, cmdline=cmdline, - property_file=[{"dc": "dc1", "rack": f"r{i % rf}"} for i in range(rf)]) + property_files = [{"dc": "dc1", "rack": f"r{i % rf}"} for i in range(3)] + if sequential_server_add: + servers = [] + for property_file in property_files: + servers.append(await manager.server_add(config=config, cmdline=cmdline, property_file=property_file)) + else: + servers = await manager.servers_add(len(property_files), config=config, cmdline=cmdline, property_file=property_files) cql = manager.get_cql() ks = await create_new_test_keyspace(cql, "WITH replication = {{'class': 'NetworkTopologyStrategy', " "'replication_factor': {}}} AND tablets = {{'initial': {}}};".format(rf, tablets))