test: wait for others_not_see_server before exclude

Between stopping a server and excluding it, wait for the other nodes to
see the server as down; otherwise, the exclude operation may see the
server as still alive and fail.

Fixes SCYLLADB-2110

Closes scylladb/scylladb#29966

(cherry picked from commit eecbead541)

Closes scylladb/scylladb#29975
This commit is contained in:
Michael Litvak
2026-05-19 16:40:17 +02:00
committed by Botond Dénes
parent 17a61e0015
commit 846ff3ce7f
2 changed files with 17 additions and 15 deletions

View File

@@ -223,14 +223,15 @@ async def test_remove_node_violating_rf_rack(manager: ManagerClient, op: str):
cfg = {'rf_rack_valid_keyspaces': False, 'error_injections_at_startup': [{'name': 'suppress_features', 'value': 'RACK_LIST_RF'}]}
cmdline = ['--logger-log-level', 'tablets=debug', '--logger-log-level', 'raft_topology=debug']
async def remove_node(server_id: str, expected_error: str = None):
async def remove_node(srv_to_remove, expected_error: str = None):
if op == "remove":
await manager.server_stop_gracefully(server_id)
await manager.server_stop_gracefully(srv_to_remove.server_id)
await manager.others_not_see_server(srv_to_remove.ip_addr)
# If remove_node fails, the node may be left not excluded, blocking later remove_node.
await manager.api.exclude_node(servers[0].ip_addr, [await manager.get_host_id(server_id)])
await manager.remove_node(servers[0].server_id, server_id, expected_error=expected_error)
await manager.api.exclude_node(servers[0].ip_addr, [await manager.get_host_id(srv_to_remove.server_id)])
await manager.remove_node(servers[0].server_id, srv_to_remove.server_id, expected_error=expected_error)
elif op == "decommission":
await manager.decommission_node(server_id, expected_error=expected_error)
await manager.decommission_node(srv_to_remove.server_id, expected_error=expected_error)
servers = await manager.servers_add(4, config=cfg, cmdline=cmdline, property_file=[
{"dc": "dc1", "rack": "r1"},
@@ -245,13 +246,13 @@ async def test_remove_node_violating_rf_rack(manager: ManagerClient, op: str):
await cql.run_async("CREATE MATERIALIZED VIEW ks.mv AS SELECT * FROM ks.t WHERE v IS NOT NULL PRIMARY KEY (v, p)")
# First removal: Remove one node from rack r3 (should always succeed)
await remove_node(servers[3].server_id)
await remove_node(servers[3])
# Second removal: Try to remove the other node from rack r3
# This would eliminate rack r3 entirely, violating RF-rack constraints
await remove_node(servers[2].server_id, expected_error=f"node {op} rejected: Cannot remove the node because its removal would make some existing keyspace RF-rack-invalid")
await remove_node(servers[2], expected_error=f"node {op} rejected: Cannot remove the node because its removal would make some existing keyspace RF-rack-invalid")
# Drop the materialized view and verify we can now remove the rack
await cql.run_async("DROP MATERIALIZED VIEW ks.mv")
await cql.run_async("ALTER KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'dc1': 2}")
await remove_node(servers[2].server_id)
await remove_node(servers[2])

View File

@@ -314,14 +314,15 @@ async def test_remove_node_violating_rf_rack_with_rack_list(manager: ManagerClie
cfg = {}
cmdline = ['--logger-log-level', 'tablets=debug', '--logger-log-level', 'raft_topology=debug']
async def remove_node(server_id: str, expected_error: str = None):
async def remove_node(srv_to_remove, expected_error: str = None):
if op == "remove":
await manager.server_stop_gracefully(server_id)
await manager.server_stop_gracefully(srv_to_remove.server_id)
await manager.others_not_see_server(srv_to_remove.ip_addr)
# If remove_node fails, the node may be left not excluded, blocking later remove_node.
await manager.api.exclude_node(servers[0].ip_addr, [await manager.get_host_id(server_id)])
await manager.remove_node(servers[0].server_id, server_id, expected_error=expected_error)
await manager.api.exclude_node(servers[0].ip_addr, [await manager.get_host_id(srv_to_remove.server_id)])
await manager.remove_node(servers[0].server_id, srv_to_remove.server_id, expected_error=expected_error)
elif op == "decommission":
await manager.decommission_node(server_id, expected_error=expected_error)
await manager.decommission_node(srv_to_remove.server_id, expected_error=expected_error)
servers = await manager.servers_add(5, config=cfg, cmdline=cmdline, property_file=[
{"dc": "dc1", "rack": "r1"},
@@ -339,8 +340,8 @@ async def test_remove_node_violating_rf_rack_with_rack_list(manager: ManagerClie
# Try to remove node from r4 (listed rack) - should be rejected
# This would eliminate rack r4 from the available racks, violating RF-rack constraints
await remove_node(servers[3].server_id, expected_error=f"node {op} rejected: Cannot remove the node because its removal would make some existing keyspace RF-rack-invalid")
await remove_node(servers[3], expected_error=f"node {op} rejected: Cannot remove the node because its removal would make some existing keyspace RF-rack-invalid")
# Remove node from r3 (unlisted rack) - should succeed
# This doesn't affect RF-rack validity since r3 is not in the keyspace's rack list
await remove_node(servers[2].server_id)
await remove_node(servers[2])