mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-26 11:30:36 +00:00
The failure_detector_timeout_in_ms override of 2000ms in 6 cluster test files is too aggressive for debug/sanitize builds. During node joins, the coordinator's failure detector times out on RPC pings to the joining node while it is still applying schema snapshots, marks it DOWN, and bans it — causing flaky test failures. Scale the timeout by MODES_TIMEOUT_FACTOR (3x for debug/sanitize, 2x for dev, 1x for release) via a shared failure_detector_timeout fixture in conftest.py. Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1587 Backport: no — the Elasticsearch analyser shows only a single failure. Closes scylladb/scylladb#29522 * github.com:scylladb/scylladb: test/cluster: scale failure_detector_timeout_in_ms by build mode test/cluster: add failure_detector_timeout fixture
51 lines
2.2 KiB
Python
51 lines
2.2 KiB
Python
#
|
|
# Copyright (C) 2023-present ScyllaDB
|
|
#
|
|
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
|
#
|
|
import logging
|
|
import pytest
|
|
|
|
from cassandra.cluster import ConsistencyLevel, ExecutionProfile, EXEC_PROFILE_DEFAULT # type: ignore
|
|
from cassandra.cluster import NoHostAvailable, OperationTimedOut # type: ignore
|
|
from cassandra.cluster import Cluster # type: ignore
|
|
from cassandra.query import SimpleStatement # type: ignore
|
|
from cassandra.policies import WhiteListRoundRobinPolicy # type: ignore
|
|
|
|
from test.pylib.manager_client import ManagerClient
|
|
from test.pylib.rest_client import read_barrier
|
|
from test.cluster.util import create_new_test_keyspace
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.nightly
async def test_banned_node_notification(manager: ManagerClient, failure_detector_timeout) -> None:
    """Test that a node banned from the cluster gets a notification about being banned.

    Starts a 3-node cluster, pauses one node (without shutting it down), removes
    it via another node, then unpauses it and checks that it logs the
    "banned from the cluster" notification.
    """
    # Decrease the failure detector threshold so we don't have to wait for too long.
    # `failure_detector_timeout` is a shared fixture (conftest.py) scaled by build
    # mode, so debug/sanitize builds don't spuriously mark joining nodes as DOWN.
    config = {
        'failure_detector_timeout_in_ms': failure_detector_timeout
    }
    srvs = await manager.servers_add(3, config=config, auto_rack_dc="dc")
    # NOTE(review): `cql` is never used below; the call is kept in case
    # get_cql() has the side effect of establishing the driver session — confirm.
    cql = manager.get_cql()

    # Pause one of the servers so other nodes mark it as dead and we can remove it.
    # We deliberately don't shut it down, but only pause it - we want to test
    # that we solved the harder problem of safely removing nodes which didn't shut down.
    logger.info(f"Pausing server {srvs[2]}")
    await manager.server_pause(srvs[2].server_id)
    logger.info(f"Removing {srvs[2]} using {srvs[0]}")
    await manager.remove_node(srvs[0].server_id, srvs[2].server_id)
    # Perform a read barrier on srvs[1] so it learns about the ban.
    logger.info(f"Performing read barrier on server {srvs[1]}")
    await read_barrier(manager.api, srvs[1].ip_addr)
    logger.info(f"Unpausing {srvs[2]}")
    await manager.server_unpause(srvs[2].server_id)

    log = await manager.server_open_log(srvs[2].server_id)
    # Was a pointless f-string (no placeholders); plain literal is byte-identical.
    _, matches = await log.wait_for("received notification of being banned from the cluster from")

    # check that the node is notified about being banned
    assert len(matches) != 0, "The node did not log being banned from the cluster"