test: Fix flakiness in topology_experimental_raft/test_tablets
One source of flakiness is in test_tablet_metadata_propagates_with_schema_changes_in_snapshot_mode due to gossiper being aborted prematurely, and causing reconnection storm. Another is test_tablet_missing_data_repair which is flaky due an issue in python driver that session might not reconnect on rolling restart (tracked by https://github.com/scylladb/python-driver/issues/230) Refs #15356. Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
This commit is contained in:
@@ -57,6 +57,22 @@ async def load_repair_history(cql, hosts):
|
||||
logging.info(f"Got repair_history_entry={row}")
|
||||
return all_rows
|
||||
|
||||
async def safe_server_stop_gracefully(manager, server_id, timeout: float = 60, reconnect: bool = False):
|
||||
# Explicitly close the driver to avoid reconnections if scylla fails to update gossiper state on shutdown.
|
||||
# It's a problem until https://github.com/scylladb/scylladb/issues/15356 is fixed.
|
||||
manager.driver_close()
|
||||
await manager.server_stop_gracefully(server_id, timeout)
|
||||
cql = None
|
||||
if reconnect:
|
||||
cql = await reconnect_driver(manager)
|
||||
return cql
|
||||
|
||||
async def safe_rolling_restart(manager, servers, with_down):
|
||||
# https://github.com/scylladb/python-driver/issues/230 is not fixed yet, so for sake of CI stability,
|
||||
# driver must be reconnected after rolling restart of servers.
|
||||
await manager.rolling_restart(servers, with_down)
|
||||
cql = await reconnect_driver(manager)
|
||||
return cql
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tablet_metadata_propagates_with_schema_changes_in_snapshot_mode(manager: ManagerClient):
|
||||
@@ -79,9 +95,8 @@ async def test_tablet_metadata_propagates_with_schema_changes_in_snapshot_mode(m
|
||||
not_s0 = servers[1:]
|
||||
|
||||
# s0 should miss schema and tablet changes
|
||||
await manager.server_stop_gracefully(s0)
|
||||
cql = await safe_server_stop_gracefully(manager, s0, reconnect=True)
|
||||
|
||||
cql = manager.get_cql()
|
||||
await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3} AND tablets = {'initial': 100};")
|
||||
|
||||
# force s0 to catch up later from the snapshot and not the raft log
|
||||
@@ -428,7 +443,7 @@ async def test_tablet_missing_data_repair(manager: ManagerClient):
|
||||
await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});")
|
||||
for k in keys_for_server[down_server.server_id]])
|
||||
|
||||
await manager.rolling_restart(servers, with_down=insert_with_down)
|
||||
cql = await safe_rolling_restart(manager, servers, with_down=insert_with_down)
|
||||
|
||||
await repair_on_node(manager, servers[0], servers)
|
||||
|
||||
@@ -440,7 +455,7 @@ async def test_tablet_missing_data_repair(manager: ManagerClient):
|
||||
for r in rows:
|
||||
assert r.c == r.pk
|
||||
|
||||
await manager.rolling_restart(servers, with_down=check_with_down)
|
||||
cql = await safe_rolling_restart(manager, servers, with_down=insert_with_down)
|
||||
|
||||
|
||||
@pytest.mark.repair
|
||||
@@ -896,11 +911,7 @@ async def test_tablet_load_and_stream(manager: ManagerClient, primary_replica_on
|
||||
|
||||
await create_table("test2", 16)
|
||||
|
||||
# Explicitly close the driver to avoid reconnections if scylla fails to update gossiper state on shutdown.
|
||||
# It's a problem until https://github.com/scylladb/scylladb/issues/15356 is fixed.
|
||||
manager.driver_close()
|
||||
cql = None
|
||||
await manager.server_stop_gracefully(servers[0].server_id)
|
||||
cql = await safe_server_stop_gracefully(manager, servers[0].server_id)
|
||||
|
||||
table_dir = glob.glob(os.path.join(node_workdir, "data", "test", "test-*"))[0]
|
||||
logger.info(f"Table dir: {table_dir}")
|
||||
|
||||
Reference in New Issue
Block a user