From e414b2b0b9a2a244946963ecb9bfe5cec3685db6 Mon Sep 17 00:00:00 2001
From: Marcin Maliszkiewicz
Date: Thu, 16 Apr 2026 15:14:36 +0200
Subject: [PATCH] test/cluster: scale failure_detector_timeout_in_ms by build mode

Six cluster test files override failure_detector_timeout_in_ms to 2000ms
for faster failure detection. In debug and sanitize builds, this causes
flaky node join failures. The following log analysis shows how.

The coordinator (server 614, IP 127.2.115.3) accepts the joining node
(server 615, host_id 53b01f0b, IP 127.2.115.2) into group0:

    20:10:57,049 [shard 0] raft_group0 - server 614 entered 'join group0' transition state for 53b01f0b

The joining node begins receiving the raft snapshot 100ms later:

    20:10:57,150 [shard 0] raft_group0 - transfer snapshot from 9fa48539

It then spends ~280ms applying schema changes -- creating 6 keyspaces
and 12+ tables from the snapshot:

    20:10:57,511 [shard 0] migration_manager - Creating keyspace system_auth_v2
    ...
    20:10:57,788 [shard 0] migration_manager - Creating system_auth_v2.role_members

Meanwhile, the coordinator's failure detector pings the joining node.
Under debug+ASan load the RPC call times out after ~4.6 seconds:

    20:11:01,643 [shard 0] direct_failure_detector - unexpected exception when pinging 53b01f0b: seastar::rpc::timeout_error (rpc call timed out)

25ms later, the coordinator marks the joining node DOWN and removes it:

    20:11:01,668 [shard 0] raft_group0 - failure_detector_loop: Mark node 53b01f0b as DOWN
    20:11:01,717 [shard 0] raft_group0 - bootstrap: failed to accept 53b01f0b

The joining node was still retrying the snapshot transfer at that point:

    20:11:01,745 [shard 0] raft_group0 - transfer snapshot from 9fa48539

It then receives the ban notification and aborts:

    20:11:01,844 [shard 0] raft_group0 - received notification of being banned from the cluster

Replace the hardcoded 2000ms with the failure_detector_timeout fixture
from conftest.py, which scales by MODES_TIMEOUT_FACTOR: 3x for
debug/sanitize (6000ms), 2x for dev (4000ms), 1x for release (2000ms).
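
For reference, a minimal sketch of how such a fixture can be built. This
is illustrative only -- the actual definition lives in conftest.py, and
the build_mode fixture name used below is an assumption, not the real
dependency:

    import pytest

    # Assumed shape of the per-mode multipliers described above;
    # conftest.py may define MODES_TIMEOUT_FACTOR differently.
    MODES_TIMEOUT_FACTOR = {'debug': 3, 'sanitize': 3, 'dev': 2, 'release': 1}

    @pytest.fixture
    def failure_detector_timeout(build_mode: str) -> int:
        # Scale the 2000ms base: 6000 for debug/sanitize, 4000 for dev,
        # 2000 for release. Unknown modes fall back to the base value.
        return 2000 * MODES_TIMEOUT_FACTOR.get(build_mode, 1)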

Test measurements (before -> after fix):

debug mode:
    test_replace_with_same_ip_twice                                24.02s -> 25.02s
    test_banned_node_notification                                 217.22s -> 221.72s
    test_kill_coordinator_during_op                               116.11s -> 127.13s
    test_node_failure_during_tablet_migration [streaming-source]  183.25s -> 192.69s
    test_replace (4 tests)           skipped in debug (skip_in_debug)
    test_raft_replace_ignore_nodes   skipped in debug (run_in_dev only)

dev mode:
    test_replace_different_ip                  10.51s -> 11.50s
    test_replace_different_ip_using_host_id    10.01s -> 12.01s
    test_replace_reuse_ip                      10.51s -> 12.03s
    test_replace_reuse_ip_using_host_id        13.01s -> 12.01s
    test_raft_replace_ignore_nodes             19.52s -> 19.52s
---
 .../test_crash_coordinator_before_streaming.py   |  4 ++--
 test/cluster/test_node_isolation.py              |  4 ++--
 test/cluster/test_raft_ignore_nodes.py           |  4 ++--
 test/cluster/test_replace.py                     | 16 ++++++++--------
 test/cluster/test_replace_with_same_ip_twice.py  |  4 ++--
 test/cluster/test_tablets_migration.py           |  4 ++--
 6 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/test/cluster/test_crash_coordinator_before_streaming.py b/test/cluster/test_crash_coordinator_before_streaming.py
index 6dd586b816..b22b8481ba 100644
--- a/test/cluster/test_crash_coordinator_before_streaming.py
+++ b/test/cluster/test_crash_coordinator_before_streaming.py
@@ -19,7 +19,7 @@ logger = logging.getLogger(__name__)
 
 @pytest.mark.asyncio
 @pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
-async def test_kill_coordinator_during_op(manager: ManagerClient) -> None:
+async def test_kill_coordinator_during_op(manager: ManagerClient, failure_detector_timeout) -> None:
     """
     Kill coordinator with error injection while topology operation is running
     for cluster: decommission, bootstrap, removenode, replace.
@@ -41,7 +41,7 @@ async def test_kill_coordinator_during_op(manager: ManagerClient) -> None:
     """
     # Decrease the failure detector threshold so we don't have to wait for too long.
     config = {
-        'failure_detector_timeout_in_ms': 2000
+        'failure_detector_timeout_in_ms': failure_detector_timeout
     }
     cmdline = [
         '--logger-log-level', 'raft_topology=trace',
diff --git a/test/cluster/test_node_isolation.py b/test/cluster/test_node_isolation.py
index 6ca257f644..ae8c5c8ff9 100644
--- a/test/cluster/test_node_isolation.py
+++ b/test/cluster/test_node_isolation.py
@@ -22,11 +22,11 @@ logger = logging.getLogger(__name__)
 
 @pytest.mark.asyncio
 @pytest.mark.nightly
-async def test_banned_node_notification(manager: ManagerClient) -> None:
+async def test_banned_node_notification(manager: ManagerClient, failure_detector_timeout) -> None:
     """Test that a node banned from the cluster get notification about been banned"""
     # Decrease the failure detector threshold so we don't have to wait for too long.
     config = {
-        'failure_detector_timeout_in_ms': 2000
+        'failure_detector_timeout_in_ms': failure_detector_timeout
     }
     srvs = await manager.servers_add(3, config=config)
     cql = manager.get_cql()
diff --git a/test/cluster/test_raft_ignore_nodes.py b/test/cluster/test_raft_ignore_nodes.py
index 86cfdd273a..453f10f85e 100644
--- a/test/cluster/test_raft_ignore_nodes.py
+++ b/test/cluster/test_raft_ignore_nodes.py
@@ -60,14 +60,14 @@ async def make_servers(manager: ManagerClient, servers_num: int,
 
 
 @pytest.mark.asyncio
-async def test_raft_replace_ignore_nodes(manager: ManagerClient) -> None:
+async def test_raft_replace_ignore_nodes(manager: ManagerClient, failure_detector_timeout) -> None:
     """Replace 3 dead nodes.
 
     This is a slow test with a 7 node cluster and 3 replace operations,
     we want to run it only in dev mode.
     """
     logger.info("Booting initial cluster")
-    servers = await make_servers(manager, 7, config={'failure_detector_timeout_in_ms': 2000})
+    servers = await make_servers(manager, 7, config={'failure_detector_timeout_in_ms': failure_detector_timeout})
 
     s1_id = await manager.get_host_id(servers[1].server_id)
     s2_id = await manager.get_host_id(servers[2].server_id)
diff --git a/test/cluster/test_replace.py b/test/cluster/test_replace.py
index 788c370d53..e76f71a745 100644
--- a/test/cluster/test_replace.py
+++ b/test/cluster/test_replace.py
@@ -21,9 +21,9 @@ logger = logging.getLogger(__name__)
 
 
 @pytest.mark.asyncio
-async def test_replace_different_ip(manager: ManagerClient) -> None:
+async def test_replace_different_ip(manager: ManagerClient, failure_detector_timeout) -> None:
     """Replace an existing node with new node using a different IP address"""
-    servers = await manager.servers_add(3, config={'failure_detector_timeout_in_ms': 2000})
+    servers = await manager.servers_add(3, config={'failure_detector_timeout_in_ms': failure_detector_timeout})
     logger.info(f"cluster started, servers {servers}")
 
     logger.info(f"replacing server {servers[0]}")
@@ -67,18 +67,18 @@ async def test_replace_different_ip(manager: ManagerClient) -> None:
         logger.info(f"server {s} system.peers and gossiper state is valid")
 
 @pytest.mark.asyncio
-async def test_replace_different_ip_using_host_id(manager: ManagerClient) -> None:
+async def test_replace_different_ip_using_host_id(manager: ManagerClient, failure_detector_timeout) -> None:
     """Replace an existing node with new node reusing the replaced node host id"""
-    servers = await manager.servers_add(3, config={'failure_detector_timeout_in_ms': 2000})
+    servers = await manager.servers_add(3, config={'failure_detector_timeout_in_ms': failure_detector_timeout})
     await manager.server_stop(servers[0].server_id)
     replace_cfg = ReplaceConfig(replaced_id = servers[0].server_id, reuse_ip_addr = False, use_host_id = True)
     await manager.server_add(replace_cfg)
     await wait_for_token_ring_and_group0_consistency(manager, time.time() + 30)
 
 @pytest.mark.asyncio
-async def test_replace_reuse_ip(request, manager: ManagerClient) -> None:
+async def test_replace_reuse_ip(request, manager: ManagerClient, failure_detector_timeout) -> None:
     """Replace an existing node with new node using the same IP address"""
-    servers = await manager.servers_add(3, config={'failure_detector_timeout_in_ms': 2000}, auto_rack_dc="dc1")
+    servers = await manager.servers_add(3, config={'failure_detector_timeout_in_ms': failure_detector_timeout}, auto_rack_dc="dc1")
     host2 = (await wait_for_cql_and_get_hosts(manager.get_cql(), [servers[2]], time.time() + 60))[0]
 
     logger.info(f"creating test table")
@@ -130,9 +130,9 @@ async def test_replace_reuse_ip(request, manager: ManagerClient) -> None:
         await manager.server_sees_other_server(servers[2].ip_addr, servers[0].ip_addr)
 
 @pytest.mark.asyncio
-async def test_replace_reuse_ip_using_host_id(manager: ManagerClient) -> None:
+async def test_replace_reuse_ip_using_host_id(manager: ManagerClient, failure_detector_timeout) -> None:
     """Replace an existing node with new node using the same IP address and same host id"""
-    servers = await manager.servers_add(3, config={'failure_detector_timeout_in_ms': 2000})
+    servers = await manager.servers_add(3, config={'failure_detector_timeout_in_ms': failure_detector_timeout})
     await manager.server_stop(servers[0].server_id)
     replace_cfg = ReplaceConfig(replaced_id = servers[0].server_id, reuse_ip_addr = True, use_host_id = True)
     await manager.server_add(replace_cfg)
diff --git a/test/cluster/test_replace_with_same_ip_twice.py b/test/cluster/test_replace_with_same_ip_twice.py
index 4bf77ebf16..f3c8fb52c4 100644
--- a/test/cluster/test_replace_with_same_ip_twice.py
+++ b/test/cluster/test_replace_with_same_ip_twice.py
@@ -14,9 +14,9 @@ logger = logging.getLogger(__name__)
 
 
 @pytest.mark.asyncio
-async def test_replace_with_same_ip_twice(manager: ManagerClient) -> None:
+async def test_replace_with_same_ip_twice(manager: ManagerClient, failure_detector_timeout) -> None:
     logger.info("starting a cluster with two nodes")
-    servers = await manager.servers_add(3, config={'failure_detector_timeout_in_ms': 2000})
+    servers = await manager.servers_add(3, config={'failure_detector_timeout_in_ms': failure_detector_timeout})
     logger.info(f"cluster started {servers}")
 
     async def replace_with_same_ip(s: ServerInfo) -> ServerInfo:
diff --git a/test/cluster/test_tablets_migration.py b/test/cluster/test_tablets_migration.py
index be2dbd80ba..cd6e223e34 100644
--- a/test/cluster/test_tablets_migration.py
+++ b/test/cluster/test_tablets_migration.py
@@ -118,14 +118,14 @@ async def test_tablet_transition_sanity(manager: ManagerClient, action):
 @pytest.mark.parametrize("fail_stage", ["streaming", "allow_write_both_read_old", "write_both_read_old", "write_both_read_new",
                                         "use_new", "cleanup", "cleanup_target", "end_migration", "revert_migration"])
 @pytest.mark.asyncio
 @pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
-async def test_node_failure_during_tablet_migration(manager: ManagerClient, fail_replica, fail_stage):
+async def test_node_failure_during_tablet_migration(manager: ManagerClient, fail_replica, fail_stage, failure_detector_timeout):
     if fail_stage == 'cleanup' and fail_replica == 'destination':
         pytest.skip('Failing destination during cleanup is pointless')
     if fail_stage == 'cleanup_target' and fail_replica == 'source':
         pytest.skip('Failing source during target cleanup is pointless')
 
     logger.info("Bootstrapping cluster")
-    cfg = {'enable_user_defined_functions': False, 'tablets_mode_for_new_keyspaces': 'enabled', 'failure_detector_timeout_in_ms': 2000}
+    cfg = {'enable_user_defined_functions': False, 'tablets_mode_for_new_keyspaces': 'enabled', 'failure_detector_timeout_in_ms': failure_detector_timeout}
     host_ids = []
     servers = []