test: Annotate server_stop() calls where conviction is useless

Pass convict=False explicitly to server_stop() calls where conviction
provides no benefit because there is no consumer of the failure
detection:

 - single-node clusters (no other node to call the API on)
 - all nodes being stopped concurrently (no live node remains)
 - immediate restart (no test logic between stop and start
   depends on other nodes detecting the stopped node as dead)
 - node stopped for file manipulation or bootstrap abort
 - majority killed with no quorum on surviving nodes to react
 - no test logic depends on other nodes detecting the failure

This is a no-op change since the default is already convict=False,
but makes the intent explicit for each call site.
This commit is contained in:
Tomasz Grabiec
2026-05-06 13:04:41 +02:00
parent a19e3f6f64
commit 624fe11178
29 changed files with 37 additions and 37 deletions

View File

@@ -503,7 +503,7 @@ class ScyllaNode:
if gently:
self.cluster.manager.server_stop_gracefully(server_id=self.server_id)
else:
self.cluster.manager.server_stop(server_id=self.server_id)
self.cluster.manager.server_stop(server_id=self.server_id, convict=False)
if wait or wait_other_notice:
self.wait_until_stopped(wait_seconds=wait_seconds, marks=marks, dump_core=gently)

View File

@@ -672,7 +672,7 @@ async def kill_non_coordinator_node(manager: ManagerClient,
yield
LOGGER.info("Kill a non-coordinator node")
await manager.server_stop(server_id=(await get_non_coordinator_host(manager=manager)).server_id)
await manager.server_stop(server_id=(await get_non_coordinator_host(manager=manager)).server_id, convict=False)
LOGGER.info("Sleep for 2 seconds")
await asyncio.sleep(2)

View File

@@ -95,4 +95,4 @@ async def space_limited_servers(manager: ManagerClient, volumes_factory: Callabl
yield servers
finally:
# Stop servers to be able to unmount volumes
await gather_safely(*(manager.server_stop(server.server_id) for server in servers))
await gather_safely(*(manager.server_stop(server.server_id, convict=False) for server in servers))

View File

@@ -420,7 +420,7 @@ async def test_localnodes_joining_nodes(manager: ManagerClient):
# server" error).
# Without this trick, this test will take 2 minutes of wait to finish.
for server in await manager.starting_servers():
await manager.server_stop(server.server_id)
await manager.server_stop(server.server_id, convict=False)
try:
await task
except Exception as e:

View File

@@ -124,7 +124,7 @@ async def test_change_two(manager, random_tables, build_mode):
if build_mode != 'release':
s0_logs = await manager.server_open_log(servers[0].server_id)
await s0_logs.wait_for('crash-before-prev-ip-removed hit, killing the node')
await manager.server_stop(servers[0].server_id)
await manager.server_stop(servers[0].server_id, convict=False)
await manager.server_start(servers[0].server_id)
await manager.api.enable_injection(servers[0].ip_addr, 'ip-change-raft-sync-delay', one_shot=False)
await reconnect_driver(manager)

View File

@@ -80,7 +80,7 @@ async def test_client_routes_node_restart(request, manager: ManagerClient):
cql, hosts = await manager.get_ready_cql(servers)
server_to_restart = servers[2]
await manager.server_stop(server_to_restart.server_id)
await manager.server_stop(server_to_restart.server_id, convict=False)
await manager.api.client.post("/v2/client-routes", host=servers[0].ip_addr, json=[generate_client_routes_entry(1)])
await wait_for_expected_client_routes_size(cql, 1)
@@ -147,7 +147,7 @@ async def test_client_routes_lost_quorum(request, manager: ManagerClient):
await wait_for_expected_client_routes_size(cql, 1)
for server in servers[1:]:
await manager.server_stop(server.server_id)
await manager.server_stop(server.server_id, convict=False)
async def fail_req(f):
with pytest.raises(HTTPError) as exc:
@@ -246,7 +246,7 @@ async def test_client_routes_snapshot_transfer(request, manager: ManagerClient,
error_to_inject = "block_group0_transfer_snapshot"
await manager.server_update_config(server_to_restart.server_id, "error_injections_at_startup", [error_to_inject])
await manager.server_stop(server_to_restart.server_id)
await manager.server_stop(server_to_restart.server_id, convict=False)
await manager.api.client.post("/v2/client-routes", host=servers[0].ip_addr, json=[generate_client_routes_entry(1)])
await wait_for_expected_client_routes_size(cql, 1)

View File

@@ -56,7 +56,7 @@ async def change_support_for_test_feature_and_restart(manager: ManagerClient, sr
injections.remove(TEST_FEATURE_ENABLE_ERROR_INJECTION)
await manager.server_update_config(srv.server_id, ERROR_INJECTIONS_AT_STARTUP_CONFIG_KEY, list(injections))
await asyncio.gather(*(manager.server_stop(srv.server_id) for srv in srvs))
await asyncio.gather(*(manager.server_stop(srv.server_id, convict=False) for srv in srvs))
await asyncio.gather(*(adjust_feature_in_config(manager, srv, enable) for srv in srvs))
await asyncio.gather(*(manager.server_start(srv.server_id, expected_error) for srv in srvs))

View File

@@ -78,7 +78,7 @@ async def test_reboot(request, manager: ManagerClient):
logger.info("Some new data is written into test table")
table_content_before_crash = await load_table()
await manager.server_stop(server_info.server_id)
await manager.server_stop(server_info.server_id, convict=False)
await manager.server_start(server_info.server_id)
cql = await reconnect_driver(manager)
await wait_for_cql_and_get_hosts(cql, [server_info], time.time() + 60)

View File

@@ -121,7 +121,7 @@ async def test_pinned_cl_segment_doesnt_resurrect_data(manager: ManagerClient):
logger.debug(f"The following segments were removed: {removed_segments}")
logger.debug("Kill + restart the node")
await manager.server_stop(server.server_id)
await manager.server_stop(server.server_id, convict=False)
await manager.server_start(server.server_id)
manager.driver_close()

View File

@@ -27,7 +27,7 @@ async def test_different_group0_ids(manager: ManagerClient):
id_b = await manager.get_host_id(scylla_b.server_id)
await manager.server_stop(scylla_b.server_id)
await manager.server_stop(scylla_b.server_id, convict=False)
await manager.server_start(scylla_b.server_id, seeds=[scylla_a.ip_addr])
# Since scylla_a and scylla_b have different group0 IDs and didn't join each other,

View File

@@ -275,7 +275,7 @@ async def test_reboot(manager, key_provider):
async def restart(manager: ManagerClient, servers: list[ServerInfo], table_names: list[str]):
# pylint: disable=unused-argument
for s in servers:
await manager.server_stop(s.server_id)
await manager.server_stop(s.server_id, convict=False)
await manager.server_start(s.server_id)
num_servers = 3
@@ -535,7 +535,7 @@ async def test_system_encryption_reboot(manager: ManagerClient, tmpdir):
async def restart(manager: ManagerClient, servers: list[ServerInfo], table_names: list[str]):
# pylint: disable=unused-argument
for s in servers:
await manager.server_stop(s.server_id)
await manager.server_stop(s.server_id, convict=False)
await manager.server_start(s.server_id)
options = {"commitlog_sync": "batch",

View File

@@ -76,7 +76,7 @@ async def test_group0_recovers_after_partial_command_application(manager: Manage
# The error injection enters an infinite loop, so kill the process.
logger.info("Kill the first node")
await manager.server_stop(srv1.server_id)
await manager.server_stop(srv1.server_id, convict=False)
# Before restarting, enable an error injection which causes the state
# machine to load the group0 state before (re)applying any group0

View File

@@ -569,7 +569,7 @@ async def do_test_tablet_incremental_repair_merge_error(manager, error):
await inject_error_on(manager, "tablet_force_tablet_count_decrease_once", servers)
await coord_log.wait_for(f'Got {error}', from_mark=mark)
await inject_error_off(manager, "tablet_force_tablet_count_decrease", servers)
await manager.server_stop(coord_serv.server_id)
await manager.server_stop(coord_serv.server_id, convict=False)
await manager.server_start(coord_serv.server_id)
for server in servers:
@@ -784,7 +784,7 @@ async def test_repair_sigsegv_with_diff_shard_count(manager: ManagerClient, use_
logger.info("Adding data only on first node")
await manager.api.flush_keyspace(servers[1].ip_addr, ks)
await manager.server_stop(servers[1].server_id)
await manager.server_stop(servers[1].server_id, convict=False)
manager.driver_close()
cql = await reconnect_driver(manager)
await write_with_cl_one(0, 10)
@@ -793,7 +793,7 @@ async def test_repair_sigsegv_with_diff_shard_count(manager: ManagerClient, use_
logger.info("Adding data only on second node")
await manager.server_start(servers[1].server_id)
await manager.api.flush_keyspace(servers[0].ip_addr, ks)
await manager.server_stop(servers[0].server_id)
await manager.server_stop(servers[0].server_id, convict=False)
manager.driver_close()
cql = await reconnect_driver(manager)
await write_with_cl_one(10, 20)

View File

@@ -159,7 +159,7 @@ async def do_test_internode_compression_between_datacenters(manager: ManagerClie
verifier(msg_size, node1_proxy, node2_proxy, node3_proxy)
await asyncio.gather(*[manager.server_stop(s.server_id) for s,_ in servers])
await asyncio.gather(*[manager.server_stop(s.server_id, convict=False) for s,_ in servers])
await asyncio.gather(*[p.stop() for p in proxies])
# these will all except, because we just stopped them above
for coro in proxy_futs:

View File

@@ -37,7 +37,7 @@ async def test_broken_bootstrap(manager: ManagerClient):
except Exception:
pass
await gather_safely(*(manager.server_stop(srv.server_id) for srv in [server_a, server_b]))
await gather_safely(*(manager.server_stop(srv.server_id, convict=False) for srv in [server_a, server_b]))
await manager.server_start(server_a.server_id)
await manager.driver_connect()
@@ -95,7 +95,7 @@ async def test_full_shutdown_during_replace(manager: ManagerClient, reuse_ip: bo
replacing_host_id = await manager.get_host_id(replacing_server.server_id)
logger.info(f'Stopping {live_servers + [replacing_server]}')
await gather_safely(*(manager.server_stop(srv.server_id) for srv in live_servers + [replacing_server]))
await gather_safely(*(manager.server_stop(srv.server_id, convict=False) for srv in live_servers + [replacing_server]))
replacing_task.cancel()
for srv in live_servers:

View File

@@ -60,7 +60,7 @@ async def test_cannot_disable_cluster_feature_after_all_declare_support(manager:
# Try to downgrade one node
await manager.server_update_config(servers[0].server_id, 'error_injections_at_startup', [])
await manager.server_stop(servers[0].server_id)
await manager.server_stop(servers[0].server_id, convict=False)
await manager.server_start(servers[0].server_id,
expected_error="Feature 'TEST_ONLY_FEATURE' was previously supported by all nodes in the cluster")

View File

@@ -247,7 +247,7 @@ async def test_can_restart(manager: ManagerClient, raft_op_timeout: int) -> None
servers = await manager.servers_add(5)
logger.info(f"Stopping {servers}")
await asyncio.gather(*(manager.server_stop(srv.server_id) for srv in servers))
await asyncio.gather(*(manager.server_stop(srv.server_id, convict=False) for srv in servers))
# This ensures the read barriers below fail quickly without group 0 quorum.
await asyncio.gather(*(update_group0_raft_op_timeout(srv.server_id, manager, raft_op_timeout) for srv in servers))

View File

@@ -87,7 +87,7 @@ async def test_raft_recovery_during_join(manager: ManagerClient):
dead_hosts.append(failed_server_host)
logging.info(f'Killing {dead_servers}')
await asyncio.gather(*(manager.server_stop(server_id=srv.server_id) for srv in dead_servers))
await asyncio.gather(*(manager.server_stop(server_id=srv.server_id, convict=False) for srv in dead_servers))
logging.info('Checking that group 0 has no majority')
with pytest.raises(Exception, match="raft operation \\[read_barrier\\] timed out"):

View File

@@ -93,7 +93,7 @@ async def test_raft_recovery_entry_loss(manager: ManagerClient):
logging.info(f'Found group 0 schema version {v_group0}')
logging.info(f'Killing {dead_servers}')
await asyncio.gather(*(manager.server_stop(server_id=srv.server_id) for srv in dead_servers))
await asyncio.gather(*(manager.server_stop(server_id=srv.server_id, convict=False) for srv in dead_servers))
logging.info(f'Starting {live_servers}')
await asyncio.gather(*(manager.server_start(server_id=srv.server_id) for srv in live_servers))

View File

@@ -97,7 +97,7 @@ async def test_raft_recovery_user_data(manager: ManagerClient, remove_dead_nodes
await asyncio.sleep(1)
logging.info(f'Killing {dead_servers}')
await gather_safely(*(manager.server_stop(server_id=srv.server_id) for srv in dead_servers))
await gather_safely(*(manager.server_stop(server_id=srv.server_id, convict=False) for srv in dead_servers))
logging.info('Checking that group 0 has no majority')
with pytest.raises(Exception, match="raft operation \\[read_barrier\\] timed out"):

View File

@@ -500,7 +500,7 @@ async def test_sc_persistence_after_crash(manager: ManagerClient):
# Collect raft state and log entry counts before crash
state_before_crash = await collect_all_raft_state(cql, hosts[0])
await manager.server_stop(server.server_id)
await manager.server_stop(server.server_id, convict=False)
await manager.server_start(server.server_id)
await reconnect_driver(manager)

View File

@@ -1630,7 +1630,7 @@ async def test_orphaned_sstables_on_startup(manager: ManagerClient):
logger.info("Migration done")
logger.info("Stop node1 and copy the sstables from node2")
await manager.server_stop(servers[0].server_id)
await manager.server_stop(servers[0].server_id, convict=False)
for src_path in glob.glob(os.path.join(node1_table_dir, sstable_filename_glob)):
dst_path = os.path.join(node0_table_dir, os.path.basename(src_path))
shutil.copy(src_path, dst_path)

View File

@@ -540,7 +540,7 @@ async def test_tablet_cleanup(manager: ManagerClient):
# Kill and restart first node.
# Check that this doesn't resurrect cleaned data.
logger.info("Brutally restart first node")
await manager.server_stop(servers[0].server_id)
await manager.server_stop(servers[0].server_id, convict=False)
await manager.server_start(servers[0].server_id)
hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
await manager.servers_see_each_other(servers)
@@ -2180,7 +2180,7 @@ async def test_split_and_incremental_repair_synchronization(manager: ManagerClie
# Give enough time for split to happen in debug mode
await wait_for(finished_splitting, time.time() + 120)
await manager.server_stop(servers[0].server_id)
await manager.server_stop(servers[0].server_id, convict=False)
await manager.server_start(servers[0].server_id)
hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
await manager.servers_see_each_other(servers)

View File

@@ -147,7 +147,7 @@ async def test_move_tablet(manager: ManagerClient, move_table: str):
# Now the dst node should hold both tablets. Stop the other node and
# verify we can read from both tables.
await manager.server_stop(servers[src_server].server_id)
await manager.server_stop(servers[src_server].server_id, convict=False)
rows = await cql.run_async(f"SELECT * FROM {ks}.test")
assert len(rows) == row_count
@@ -403,7 +403,7 @@ async def test_repair_colocated_base_and_view(manager: ManagerClient):
await manager.api.flush_keyspace(servers[1].ip_addr, ks)
# Stop node 2 and write data while it is down
await manager.server_stop(servers[1].server_id)
await manager.server_stop(servers[1].server_id, convict=False)
cql.execute(SimpleStatement(f"INSERT INTO {ks}.test(pk, c) VALUES(2, 20)", consistency_level=ConsistencyLevel.ONE))
await manager.api.flush_keyspace(servers[0].ip_addr, ks)

View File

@@ -89,7 +89,7 @@ async def test_crash_during_intranode_migration(manager: ManagerClient):
s0_logs = await manager.server_open_log(servers[0].server_id)
await s0_logs.wait_for('crash-in-tablet-write-both-read-new hit')
await manager.server_stop(servers[0].server_id)
await manager.server_stop(servers[0].server_id, convict=False)
await manager.server_start(servers[0].server_id)
await wait_for_cql_and_get_hosts(manager.cql, servers, time.time() + 60)

View File

@@ -642,4 +642,4 @@ async def test_background_merge_deadlock(manager: ManagerClient):
await wait_for(finished_merge, time.time() + 120)
await manager.server_stop(servers[0].server_id)
await manager.server_stop(servers[0].server_id, convict=False)

View File

@@ -541,7 +541,7 @@ async def test_restart_in_cleanup_stage_after_cleanup(manager: ManagerClient):
await log.wait_for("Waiting after tablet cleanup", from_mark=mark, timeout=60)
# Restart the leaving replica (src_server)
await manager.server_stop(src_server.server_id)
await manager.server_stop(src_server.server_id, convict=False)
await manager.server_start(src_server.server_id)
await wait_for_cql_and_get_hosts(manager.get_cql(), servers, time.time() + 30)

View File

@@ -43,7 +43,7 @@ async def test_truncation_records_pruned_on_dirty_restart(manager: ManagerClient
cql = manager.get_cql()
async def restart():
await manager.server_stop(server.server_id)
await manager.server_stop(server.server_id, convict=False)
await manager.server_start(server.server_id)
manager.driver_close()
await manager.driver_connect()

View File

@@ -200,7 +200,7 @@ async def test_truncate_with_coordinator_crash(manager: ManagerClient):
trunc_future = cql.run_async(f'TRUNCATE TABLE {ks}.test', host=trunc_host)
# Wait for the topology coordinator to crash
await raft_leader_log.wait_for('truncate_crash_after_session_clear hit, killing the node')
await manager.server_stop(raft_leader.server_id)
await manager.server_stop(raft_leader.server_id, convict=False)
# Restart the crashed node
await manager.server_start(raft_leader.server_id)
# Wait for truncate to complete