Merge 'test/cluster: scale failure_detector_timeout_in_ms by build mode' from Marcin Maliszkiewicz

The failure_detector_timeout_in_ms override of 2000ms in 6 cluster test files is too aggressive for debug/sanitize builds. During node joins, the coordinator's failure detector times out on RPC pings to the joining node while it is still applying schema snapshots, marks it DOWN, and bans it — causing flaky test failures. Scale the timeout by MODES_TIMEOUT_FACTOR (3x for debug/sanitize, 2x for dev, 1x for release) via a shared failure_detector_timeout fixture in conftest.py.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1587

Backport: no, elasticsearch analyser shows only a single failure

Closes scylladb/scylladb#29522

* github.com:scylladb/scylladb:
  test/cluster: scale failure_detector_timeout_in_ms by build mode
  test/cluster: add failure_detector_timeout fixture
test/cluster: scale failure_detector_timeout_in_ms by build mode
2026-04-25 11:00:35 +00:00 · 2026-04-24 09:10:43 +03:00 · 2026-04-20 15:28:34 +02:00 · 2026-04-20 15:28:33 +02:00
8 changed files with 25 additions and 20 deletions
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -1088,7 +1088,7 @@ void compaction_manager::register_metrics() {
        sm::make_gauge("normalized_backlog", [this] { return _last_backlog / available_memory(); },
                       sm::description("Holds the sum of normalized compaction backlog for all tables in the system. Backlog is normalized by dividing backlog by shard's available memory.")),
        sm::make_counter("validation_errors", [this] { return _validation_errors; },
-                       sm::description("Holds the number of encountered validation errors.")).set_skip_when_empty(),
+                       sm::description("Holds the number of encountered validation errors.")),
    });
 }

--- a/test/cluster/conftest.py
+++ b/test/cluster/conftest.py
@@ -18,7 +18,7 @@ from concurrent.futures.thread import ThreadPoolExecutor
 from multiprocessing import Event
 from pathlib import Path
 from typing import TYPE_CHECKING
-from test import TOP_SRC_DIR, path_to
+from test import TOP_SRC_DIR, MODES_TIMEOUT_FACTOR, path_to
 from test.pylib.random_tables import RandomTables
 from test.pylib.skip_types import skip_env
 from test.pylib.util import unique_name
@@ -394,3 +394,8 @@ async def key_provider(request, tmpdir, scylla_binary):
    """Encryption providers fixture"""
    async with make_key_provider_factory(request.param, tmpdir, scylla_binary) as res:
        yield res
+
+
+@pytest.fixture(scope="function")
+def failure_detector_timeout(build_mode):
+    return 2000 * MODES_TIMEOUT_FACTOR[build_mode]
--- a/test/cluster/test_crash_coordinator_before_streaming.py
+++ b/test/cluster/test_crash_coordinator_before_streaming.py
@@ -19,7 +19,7 @@ logger = logging.getLogger(__name__)

@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
-async def test_kill_coordinator_during_op(manager: ManagerClient) -> None:
+async def test_kill_coordinator_during_op(manager: ManagerClient, failure_detector_timeout) -> None:
    """ Kill coordinator with error injection while topology operation is running for cluster: decommission,
    bootstrap, removenode, replace.

@@ -41,7 +41,7 @@ async def test_kill_coordinator_during_op(manager: ManagerClient) -> None:
    """
    # Decrease the failure detector threshold so we don't have to wait for too long.
    config = {
-        'failure_detector_timeout_in_ms': 2000
+        'failure_detector_timeout_in_ms': failure_detector_timeout
    }
    cmdline = [
        '--logger-log-level', 'raft_topology=trace',
--- a/test/cluster/test_node_isolation.py
+++ b/test/cluster/test_node_isolation.py
@@ -22,11 +22,11 @@ logger = logging.getLogger(__name__)

@pytest.mark.asyncio
@pytest.mark.nightly
-async def test_banned_node_notification(manager: ManagerClient) -> None:
+async def test_banned_node_notification(manager: ManagerClient, failure_detector_timeout) -> None:
    """Test that a node banned from the cluster get notification about been banned"""
    # Decrease the failure detector threshold so we don't have to wait for too long.
    config = {
-        'failure_detector_timeout_in_ms': 2000
+        'failure_detector_timeout_in_ms': failure_detector_timeout
    }
    srvs = await manager.servers_add(3, config=config, auto_rack_dc="dc")
    cql = manager.get_cql()
--- a/test/cluster/test_raft_ignore_nodes.py
+++ b/test/cluster/test_raft_ignore_nodes.py
@@ -60,14 +60,14 @@ async def make_servers(manager: ManagerClient, servers_num: int,


@pytest.mark.asyncio
-async def test_raft_replace_ignore_nodes(manager: ManagerClient) -> None:
+async def test_raft_replace_ignore_nodes(manager: ManagerClient, failure_detector_timeout) -> None:
    """Replace 3 dead nodes.

       This is a slow test with a 7 node cluster and 3 replace operations,
       we want to run it only in dev mode.
    """
    logger.info("Booting initial cluster")
-    servers = await make_servers(manager, 7, config={'failure_detector_timeout_in_ms': 2000})
+    servers = await make_servers(manager, 7, config={'failure_detector_timeout_in_ms': failure_detector_timeout})

    s1_id = await manager.get_host_id(servers[1].server_id)
    s2_id = await manager.get_host_id(servers[2].server_id)
--- a/test/cluster/test_replace.py
+++ b/test/cluster/test_replace.py
@@ -21,9 +21,9 @@ logger = logging.getLogger(__name__)


@pytest.mark.asyncio
-async def test_replace_different_ip(manager: ManagerClient) -> None:
+async def test_replace_different_ip(manager: ManagerClient, failure_detector_timeout) -> None:
    """Replace an existing node with new node using a different IP address"""
-    servers = await manager.servers_add(3, config={'failure_detector_timeout_in_ms': 2000})
+    servers = await manager.servers_add(3, config={'failure_detector_timeout_in_ms': failure_detector_timeout})
    logger.info(f"cluster started, servers {servers}")

    logger.info(f"replacing server {servers[0]}")
@@ -67,18 +67,18 @@ async def test_replace_different_ip(manager: ManagerClient) -> None:
        logger.info(f"server {s} system.peers and gossiper state is valid")

@pytest.mark.asyncio
-async def test_replace_different_ip_using_host_id(manager: ManagerClient) -> None:
+async def test_replace_different_ip_using_host_id(manager: ManagerClient, failure_detector_timeout) -> None:
    """Replace an existing node with new node reusing the replaced node host id"""
-    servers = await manager.servers_add(3, config={'failure_detector_timeout_in_ms': 2000})
+    servers = await manager.servers_add(3, config={'failure_detector_timeout_in_ms': failure_detector_timeout})
    await manager.server_stop(servers[0].server_id)
    replace_cfg = ReplaceConfig(replaced_id = servers[0].server_id, reuse_ip_addr = False, use_host_id = True)
    await manager.server_add(replace_cfg)
    await wait_for_token_ring_and_group0_consistency(manager, time.time() + 30)

@pytest.mark.asyncio
-async def test_replace_reuse_ip(request, manager: ManagerClient) -> None:
+async def test_replace_reuse_ip(request, manager: ManagerClient, failure_detector_timeout) -> None:
    """Replace an existing node with new node using the same IP address"""
-    servers = await manager.servers_add(3, config={'failure_detector_timeout_in_ms': 2000}, auto_rack_dc="dc1")
+    servers = await manager.servers_add(3, config={'failure_detector_timeout_in_ms': failure_detector_timeout}, auto_rack_dc="dc1")
    host2 = (await wait_for_cql_and_get_hosts(manager.get_cql(), [servers[2]], time.time() + 60))[0]

    logger.info(f"creating test table")
@@ -130,9 +130,9 @@ async def test_replace_reuse_ip(request, manager: ManagerClient) -> None:
    await manager.server_sees_other_server(servers[2].ip_addr, servers[0].ip_addr)

@pytest.mark.asyncio
-async def test_replace_reuse_ip_using_host_id(manager: ManagerClient) -> None:
+async def test_replace_reuse_ip_using_host_id(manager: ManagerClient, failure_detector_timeout) -> None:
    """Replace an existing node with new node using the same IP address and same host id"""
-    servers = await manager.servers_add(3, config={'failure_detector_timeout_in_ms': 2000})
+    servers = await manager.servers_add(3, config={'failure_detector_timeout_in_ms': failure_detector_timeout})
    await manager.server_stop(servers[0].server_id)
    replace_cfg = ReplaceConfig(replaced_id = servers[0].server_id, reuse_ip_addr = True, use_host_id = True)
    await manager.server_add(replace_cfg)
--- a/test/cluster/test_replace_with_same_ip_twice.py
+++ b/test/cluster/test_replace_with_same_ip_twice.py
@@ -14,9 +14,9 @@ logger = logging.getLogger(__name__)


@pytest.mark.asyncio
-async def test_replace_with_same_ip_twice(manager: ManagerClient) -> None:
+async def test_replace_with_same_ip_twice(manager: ManagerClient, failure_detector_timeout) -> None:
    logger.info("starting a cluster with two nodes")
-    servers = await manager.servers_add(3, config={'failure_detector_timeout_in_ms': 2000})
+    servers = await manager.servers_add(3, config={'failure_detector_timeout_in_ms': failure_detector_timeout})
    logger.info(f"cluster started {servers}")

    async def replace_with_same_ip(s: ServerInfo) -> ServerInfo:
--- a/test/cluster/test_tablets_migration.py
+++ b/test/cluster/test_tablets_migration.py
@@ -119,14 +119,14 @@ async def test_tablet_transition_sanity(manager: ManagerClient, action):
@pytest.mark.parametrize("fail_stage", ["streaming", "allow_write_both_read_old", "write_both_read_old", "write_both_read_new", "use_new", "cleanup", "cleanup_target", "end_migration", "revert_migration"])
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
-async def test_node_failure_during_tablet_migration(manager: ManagerClient, fail_replica, fail_stage):
+async def test_node_failure_during_tablet_migration(manager: ManagerClient, fail_replica, fail_stage, failure_detector_timeout):
    if fail_stage == 'cleanup' and fail_replica == 'destination':
        skip_env('Failing destination during cleanup is pointless')
    if fail_stage == 'cleanup_target' and fail_replica == 'source':
        skip_env('Failing source during target cleanup is pointless')

    logger.info("Bootstrapping cluster")
-    cfg = {'enable_user_defined_functions': False, 'tablets_mode_for_new_keyspaces': 'enabled', 'failure_detector_timeout_in_ms': 2000}
+    cfg = {'enable_user_defined_functions': False, 'tablets_mode_for_new_keyspaces': 'enabled', 'failure_detector_timeout_in_ms': failure_detector_timeout}
    host_ids = []
    servers = []