diff --git a/test/cluster/test_change_replication_factor_1_to_0.py b/test/cluster/test_change_replication_factor_1_to_0.py index dcb706f1dc..78e0f4586d 100644 --- a/test/cluster/test_change_replication_factor_1_to_0.py +++ b/test/cluster/test_change_replication_factor_1_to_0.py @@ -12,7 +12,7 @@ from cassandra import ConsistencyLevel # type: ignore from cassandra.query import SimpleStatement # type: ignore from test.pylib.manager_client import ManagerClient from test.pylib.util import wait_for_cql_and_get_hosts -from test.cluster.util import check_token_ring_and_group0_consistency, new_test_keyspace +from test.cluster.util import new_test_keyspace, wait_for_token_ring_and_group0_consistency from test.pylib.util import wait_for logger = logging.getLogger(__name__) @@ -106,7 +106,7 @@ async def test_change_replication_factor_1_to_0_and_decommission(request: pytest # decommission dc1 await manager.decommission_node(srvs[1].server_id) - await check_token_ring_and_group0_consistency(manager) + await wait_for_token_ring_and_group0_consistency(manager, time.time() + 30) # ensure this no-op alter still works async with asyncio.timeout(30): diff --git a/test/cluster/test_data_resurrection_after_cleanup.py b/test/cluster/test_data_resurrection_after_cleanup.py index 97a57ce242..4b223c6211 100644 --- a/test/cluster/test_data_resurrection_after_cleanup.py +++ b/test/cluster/test_data_resurrection_after_cleanup.py @@ -7,7 +7,7 @@ from test.pylib.manager_client import ManagerClient from test.pylib.rest_client import inject_error_one_shot from test.cluster.conftest import skip_mode -from test.cluster.util import check_token_ring_and_group0_consistency, new_test_keyspace +from test.cluster.util import new_test_keyspace, wait_for_token_ring_and_group0_consistency import pytest import asyncio @@ -67,7 +67,7 @@ async def test_data_resurrection_after_cleanup(manager: ManagerClient): logger.info(f"Decommissioning node {servers[1]}") await manager.decommission_node(servers[1].server_id) - await check_token_ring_and_group0_consistency(manager) + await wait_for_token_ring_and_group0_consistency(manager, time.time() + 30) time.sleep(1) await check(range(128)) diff --git a/test/cluster/test_decommission.py b/test/cluster/test_decommission.py index 38cc568130..9d1d5a4bb1 100644 --- a/test/cluster/test_decommission.py +++ b/test/cluster/test_decommission.py @@ -4,10 +4,11 @@ # SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0 # import logging +import time import pytest from test.pylib.manager_client import ManagerClient -from test.cluster.util import check_token_ring_and_group0_consistency +from test.cluster.util import wait_for_token_ring_and_group0_consistency logger = logging.getLogger(__name__) @@ -27,7 +28,7 @@ async def test_decommissioned_node_cant_rejoin(request, manager: ManagerClient): # to communicate with other nodes to discover a leader. logger.info(f"Decommissioning node {servers[1]}") await manager.decommission_node(servers[1].server_id) - await check_token_ring_and_group0_consistency(manager) + await wait_for_token_ring_and_group0_consistency(manager, time.time() + 30) logger.info(f"Attempting to start the node {servers[1]} after it was decommissioned") await manager.server_start(servers[1].server_id, expected_error='This node was decommissioned and will not rejoin the ring') diff --git a/test/cluster/test_topology_ops.py b/test/cluster/test_topology_ops.py index f46d53ea17..12004ea65f 100644 --- a/test/cluster/test_topology_ops.py +++ b/test/cluster/test_topology_ops.py @@ -7,7 +7,7 @@ from test.pylib.scylla_cluster import ReplaceConfig from test.pylib.manager_client import ManagerClient from test.pylib.util import wait_for_cql_and_get_hosts from test.cluster.util import check_token_ring_and_group0_consistency, reconnect_driver, \ - check_node_log_for_failed_mutations, start_writes + check_node_log_for_failed_mutations, start_writes, wait_for_token_ring_and_group0_consistency from cassandra.cluster import ConsistencyLevel @@ -45,7 +45,7 @@ async def test_topology_ops(request, manager: ManagerClient, tablets_enabled: bo logger.info(f"Decommissioning node {servers[0]}") await manager.decommission_node(servers[0].server_id) - await check_token_ring_and_group0_consistency(manager) + await wait_for_token_ring_and_group0_consistency(manager, time.time() + 30) servers = servers[1:] logger.info(f"Restarting node {servers[0]} when other nodes have bootstrapped") diff --git a/test/cluster/test_topology_ops_encrypted.py b/test/cluster/test_topology_ops_encrypted.py index 9843edb423..ea0462d5ec 100644 --- a/test/cluster/test_topology_ops_encrypted.py +++ b/test/cluster/test_topology_ops_encrypted.py @@ -6,7 +6,8 @@ from test.pylib.scylla_cluster import ReplaceConfig from test.pylib.manager_client import ManagerClient from test.pylib.util import wait_for_cql_and_get_hosts -from test.cluster.util import check_token_ring_and_group0_consistency, reconnect_driver +from test.cluster.util import check_token_ring_and_group0_consistency, reconnect_driver, \ + wait_for_token_ring_and_group0_consistency from test.cluster.test_topology_ops import check_node_log_for_failed_mutations, start_writes from cassandra.cluster import ConsistencyLevel @@ -51,7 +52,7 @@ async def test_topology_ops_encrypted(request, manager: ManagerClient, tablets_e logger.info(f"Decommissioning node {servers[0]}") await manager.decommission_node(servers[0].server_id) - await check_token_ring_and_group0_consistency(manager) + await wait_for_token_ring_and_group0_consistency(manager, time.time() + 30) servers = servers[1:] logger.info(f"Restarting node {servers[0]} when other nodes have bootstrapped") diff --git a/test/cluster/util.py b/test/cluster/util.py index 3f15d9aa10..3d7922687a 100644 --- a/test/cluster/util.py +++ b/test/cluster/util.py @@ -116,15 +116,22 @@ async def check_token_ring_and_group0_consistency(manager: ManagerClient) -> Non async def wait_for_token_ring_and_group0_consistency(manager: ManagerClient, deadline: float) -> None: - """Weaker version of the above check; the token ring is not immediately updated after - bootstrap/replace/decommission - the normal tokens of the new node propagate through gossip. + """ + Weaker version of the above check. + + In the Raft-based topology, a decommissioning node is removed from group 0 after the decommission request is + considered finished (and the token ring is updated). + + Moreover, in the gossip-based topology, the token ring is not immediately updated after + bootstrap/replace/decommission - the normal tokens propagate through gossip. + Take this into account and wait for the equality condition to hold, with a timeout. """ servers = await manager.running_servers() for srv in servers: - group0_members = await get_current_group0_config(manager, srv) - group0_ids = {m[0] for m in group0_members} - async def token_ring_matches(): + async def token_ring_and_group0_match(): + group0_members = await get_current_group0_config(manager, srv) + group0_ids = {m[0] for m in group0_members} token_ring_ids = await get_token_ring_host_ids(manager, srv) diff = token_ring_ids ^ group0_ids if diff: @@ -132,7 +139,7 @@ async def wait_for_token_ring_and_group0_consistency(manager: ManagerClient, dea f" according to {srv}, symmetric difference: {diff}") return None return True - await wait_for(token_ring_matches, deadline, period=.5) + await wait_for(token_ring_and_group0_match, deadline, period=.5) async def wait_for_upgrade_state(state: str, cql: Session, host: Host, deadline: float) -> None: