mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-23 00:02:37 +00:00
test: wait for others_not_see_server before exclude
Between stopping a server and excluding it, wait until the other nodes
see the server as down; otherwise, exclude may still see the server as
alive and fail.
Fixes SCYLLADB-2110
Closes scylladb/scylladb#29966
(cherry picked from commit eecbead541)
Closes scylladb/scylladb#29975
This commit is contained in:
committed by
Botond Dénes
parent
17a61e0015
commit
846ff3ce7f
@@ -223,14 +223,15 @@ async def test_remove_node_violating_rf_rack(manager: ManagerClient, op: str):
|
||||
cfg = {'rf_rack_valid_keyspaces': False, 'error_injections_at_startup': [{'name': 'suppress_features', 'value': 'RACK_LIST_RF'}]}
|
||||
cmdline = ['--logger-log-level', 'tablets=debug', '--logger-log-level', 'raft_topology=debug']
|
||||
|
||||
async def remove_node(server_id: str, expected_error: str = None):
|
||||
async def remove_node(srv_to_remove, expected_error: str = None):
|
||||
if op == "remove":
|
||||
await manager.server_stop_gracefully(server_id)
|
||||
await manager.server_stop_gracefully(srv_to_remove.server_id)
|
||||
await manager.others_not_see_server(srv_to_remove.ip_addr)
|
||||
# If remove_node fails, the node may be left not excluded, blocking later remove_node.
|
||||
await manager.api.exclude_node(servers[0].ip_addr, [await manager.get_host_id(server_id)])
|
||||
await manager.remove_node(servers[0].server_id, server_id, expected_error=expected_error)
|
||||
await manager.api.exclude_node(servers[0].ip_addr, [await manager.get_host_id(srv_to_remove.server_id)])
|
||||
await manager.remove_node(servers[0].server_id, srv_to_remove.server_id, expected_error=expected_error)
|
||||
elif op == "decommission":
|
||||
await manager.decommission_node(server_id, expected_error=expected_error)
|
||||
await manager.decommission_node(srv_to_remove.server_id, expected_error=expected_error)
|
||||
|
||||
servers = await manager.servers_add(4, config=cfg, cmdline=cmdline, property_file=[
|
||||
{"dc": "dc1", "rack": "r1"},
|
||||
@@ -245,13 +246,13 @@ async def test_remove_node_violating_rf_rack(manager: ManagerClient, op: str):
|
||||
await cql.run_async("CREATE MATERIALIZED VIEW ks.mv AS SELECT * FROM ks.t WHERE v IS NOT NULL PRIMARY KEY (v, p)")
|
||||
|
||||
# First removal: Remove one node from rack r3 (should always succeed)
|
||||
await remove_node(servers[3].server_id)
|
||||
await remove_node(servers[3])
|
||||
|
||||
# Second removal: Try to remove the other node from rack r3
|
||||
# This would eliminate rack r3 entirely, violating RF-rack constraints
|
||||
await remove_node(servers[2].server_id, expected_error=f"node {op} rejected: Cannot remove the node because its removal would make some existing keyspace RF-rack-invalid")
|
||||
await remove_node(servers[2], expected_error=f"node {op} rejected: Cannot remove the node because its removal would make some existing keyspace RF-rack-invalid")
|
||||
|
||||
# Drop the materialized view and verify we can now remove the rack
|
||||
await cql.run_async("DROP MATERIALIZED VIEW ks.mv")
|
||||
await cql.run_async("ALTER KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'dc1': 2}")
|
||||
await remove_node(servers[2].server_id)
|
||||
await remove_node(servers[2])
|
||||
|
||||
@@ -314,14 +314,15 @@ async def test_remove_node_violating_rf_rack_with_rack_list(manager: ManagerClie
|
||||
cfg = {}
|
||||
cmdline = ['--logger-log-level', 'tablets=debug', '--logger-log-level', 'raft_topology=debug']
|
||||
|
||||
async def remove_node(server_id: str, expected_error: str = None):
|
||||
async def remove_node(srv_to_remove, expected_error: str = None):
|
||||
if op == "remove":
|
||||
await manager.server_stop_gracefully(server_id)
|
||||
await manager.server_stop_gracefully(srv_to_remove.server_id)
|
||||
await manager.others_not_see_server(srv_to_remove.ip_addr)
|
||||
# If remove_node fails, the node may be left not excluded, blocking later remove_node.
|
||||
await manager.api.exclude_node(servers[0].ip_addr, [await manager.get_host_id(server_id)])
|
||||
await manager.remove_node(servers[0].server_id, server_id, expected_error=expected_error)
|
||||
await manager.api.exclude_node(servers[0].ip_addr, [await manager.get_host_id(srv_to_remove.server_id)])
|
||||
await manager.remove_node(servers[0].server_id, srv_to_remove.server_id, expected_error=expected_error)
|
||||
elif op == "decommission":
|
||||
await manager.decommission_node(server_id, expected_error=expected_error)
|
||||
await manager.decommission_node(srv_to_remove.server_id, expected_error=expected_error)
|
||||
|
||||
servers = await manager.servers_add(5, config=cfg, cmdline=cmdline, property_file=[
|
||||
{"dc": "dc1", "rack": "r1"},
|
||||
@@ -339,8 +340,8 @@ async def test_remove_node_violating_rf_rack_with_rack_list(manager: ManagerClie
|
||||
|
||||
# Try to remove node from r4 (listed rack) - should be rejected
|
||||
# This would eliminate rack r4 from the available racks, violating RF-rack constraints
|
||||
await remove_node(servers[3].server_id, expected_error=f"node {op} rejected: Cannot remove the node because its removal would make some existing keyspace RF-rack-invalid")
|
||||
await remove_node(servers[3], expected_error=f"node {op} rejected: Cannot remove the node because its removal would make some existing keyspace RF-rack-invalid")
|
||||
|
||||
# Remove node from r3 (unlisted rack) - should succeed
|
||||
# This doesn't affect RF-rack validity since r3 is not in the keyspace's rack list
|
||||
await remove_node(servers[2].server_id)
|
||||
await remove_node(servers[2])
|
||||
|
||||
Reference in New Issue
Block a user