test: Fix flakiness in topology_experimental_raft/test_tablets

One source of flakiness is in test_tablet_metadata_propagates_with_schema_changes_in_snapshot_mode,
where the gossiper is aborted prematurely, causing a reconnection
storm.
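
The mitigation, captured by the safe_server_stop_gracefully() helper
introduced in the diff below, is to close the driver before the node is
stopped and only reconnect afterwards. A minimal sketch of that pattern,
assuming the ManagerClient and reconnect_driver() helpers already used by
this test module (the wrapper name here is hypothetical):

    async def stop_node_without_reconnect_storm(manager, server_id, timeout: float = 60):
        # Hypothetical wrapper for illustration; the commit's actual helper is
        # safe_server_stop_gracefully() below.
        # Close the driver first so it does not keep retrying connections to a
        # node whose gossiper state was not updated on shutdown (scylladb#15356).
        manager.driver_close()
        await manager.server_stop_gracefully(server_id, timeout)
        # Re-establish the session against the remaining nodes.
        return await reconnect_driver(manager)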

Another is test_tablet_missing_data_repair, which is flaky due to an
issue in the python driver where the session might not reconnect on a
rolling restart (tracked by https://github.com/scylladb/python-driver/issues/230).
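
Until that driver issue is fixed, the workaround, captured by the
safe_rolling_restart() helper below, is to force a driver reconnect
right after the rolling restart. A minimal sketch under the same
assumptions (wrapper name again hypothetical):

    async def restart_and_reconnect(manager, servers, with_down):
        # Hypothetical wrapper for illustration; the commit's actual helper is
        # safe_rolling_restart() below. The session may come back without a
        # usable connection after a rolling restart (python-driver#230), so
        # always reconnect and hand back a fresh session.
        await manager.rolling_restart(servers, with_down)
        return await reconnect_driver(manager)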

Refs #15356.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Author: Raphael S. Carvalho
Date:   2024-05-22 09:14:52 -03:00
parent 551bf9dd58
commit e7246751b6


@@ -57,6 +57,22 @@ async def load_repair_history(cql, hosts):
logging.info(f"Got repair_history_entry={row}")
return all_rows
async def safe_server_stop_gracefully(manager, server_id, timeout: float = 60, reconnect: bool = False):
# Explicitly close the driver to avoid reconnections if scylla fails to update gossiper state on shutdown.
# It's a problem until https://github.com/scylladb/scylladb/issues/15356 is fixed.
manager.driver_close()
await manager.server_stop_gracefully(server_id, timeout)
cql = None
if reconnect:
cql = await reconnect_driver(manager)
return cql
async def safe_rolling_restart(manager, servers, with_down):
# https://github.com/scylladb/python-driver/issues/230 is not fixed yet, so for sake of CI stability,
# driver must be reconnected after rolling restart of servers.
await manager.rolling_restart(servers, with_down)
cql = await reconnect_driver(manager)
return cql
@pytest.mark.asyncio
async def test_tablet_metadata_propagates_with_schema_changes_in_snapshot_mode(manager: ManagerClient):
@@ -79,9 +95,8 @@ async def test_tablet_metadata_propagates_with_schema_changes_in_snapshot_mode(m
     not_s0 = servers[1:]

     # s0 should miss schema and tablet changes
-    await manager.server_stop_gracefully(s0)
-    cql = manager.get_cql()
+    cql = await safe_server_stop_gracefully(manager, s0, reconnect=True)

     await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3} AND tablets = {'initial': 100};")

     # force s0 to catch up later from the snapshot and not the raft log
@@ -428,7 +443,7 @@ async def test_tablet_missing_data_repair(manager: ManagerClient):
         await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});")
                                for k in keys_for_server[down_server.server_id]])

-    await manager.rolling_restart(servers, with_down=insert_with_down)
+    cql = await safe_rolling_restart(manager, servers, with_down=insert_with_down)

     await repair_on_node(manager, servers[0], servers)
@@ -440,7 +455,7 @@ async def test_tablet_missing_data_repair(manager: ManagerClient):
         for r in rows:
             assert r.c == r.pk

-    await manager.rolling_restart(servers, with_down=check_with_down)
+    cql = await safe_rolling_restart(manager, servers, with_down=check_with_down)


 @pytest.mark.repair
@@ -896,11 +911,7 @@ async def test_tablet_load_and_stream(manager: ManagerClient, primary_replica_on
     await create_table("test2", 16)

-    # Explicitly close the driver to avoid reconnections if scylla fails to update gossiper state on shutdown.
-    # It's a problem until https://github.com/scylladb/scylladb/issues/15356 is fixed.
-    manager.driver_close()
-    cql = None
-    await manager.server_stop_gracefully(servers[0].server_id)
+    cql = await safe_server_stop_gracefully(manager, servers[0].server_id)

     table_dir = glob.glob(os.path.join(node_workdir, "data", "test", "test-*"))[0]
     logger.info(f"Table dir: {table_dir}")