test: test group0 tombstone GC in the Raft-based recovery procedure

We add a regression test for the bug fixed in the previous commits. (cherry picked from commit c57f097630)
2026-05-12 19:02:12 +00:00 · 2025-10-16 18:39:56 +02:00
parent 8a11535a12
commit 76560ca095
1 changed files with 64 additions and 1 deletions
--- a/test/cluster/test_tombstone_gc.py
+++ b/test/cluster/test_tombstone_gc.py
@@ -11,10 +11,11 @@ import logging
 import time
 import pytest

+from test.pylib.internal_types import ServerInfo
 from test.pylib.manager_client import ManagerClient
 from test.pylib.util import wait_for, wait_for_cql_and_get_hosts
 from test.cluster.conftest import skip_mode
-from test.cluster.util import disable_schema_agreement_wait, new_test_keyspace, new_test_table
+from test.cluster.util import delete_discovery_state_and_group0_id, delete_raft_group_data, disable_schema_agreement_wait, new_test_keyspace, new_test_table, reconnect_driver

 logger = logging.getLogger(__name__)

@@ -83,6 +84,19 @@ async def test_group0_tombstone_gc(manager: ManagerClient):
            - tombstones exist from the previous test
        Assert:
            - the tombstones are cleaned up eventually
+
+    Test #4:
+        Regression test for https://github.com/scylladb/scylladb/issues/26534
+
+        Arrange:
+            - kill one of the nodes
+        Act:
+            - create new group0 tombstones by updating the schema (create/alter/delete random tables),
+            - perform the Raft-based recovery procedure, which creates a new group0
+        Assert:
+            - (4a) the tombstones are not cleaned up after both live nodes join the new group0 even though the killed
+              node doesn't belong to it,
+            - (4b) the tombstones are cleaned up after removing the killed node
    """
    cmdline = [
        # disabling caches as the tombstones still remain in the cache even after the compaction
@@ -203,6 +217,55 @@ async def test_group0_tombstone_gc(manager: ManagerClient):
            # test #3: the tombstones are cleaned up after the node is started again
            await verify_tombstone_gc(tombstone_mark)

+            first_group0_id = (await cql.run_async(
+                "SELECT value FROM system.scylla_local WHERE key = 'raft_group0_id'"))[0].value
+
+            # Kill one server.
+            await manager.server_stop(down_server.server_id)
+            servers.pop()
+
+            await alter_system_schema(keyspace)
+            tombstone_mark = datetime.now(timezone.utc)
+
+            # Start the Raft-based recovery procedure.
+            await manager.rolling_restart(servers, wait_for_cql=False)
+
+            # TODO: do not reconnect the driver (here and below) once scylladb/python-driver#295 is fixed.
+            # Reconnecting doesn't go well with disable_schema_agreement_wait above, so max_schema_agreement_wait is zeroed again after each reconnect.
+            cql = await reconnect_driver(manager)
+            cql.cluster.max_schema_agreement_wait = 0
+            hosts = [(await wait_for_cql_and_get_hosts(cql, [s], time.time() + 60))[0]
+                     for s in servers]
+
+            for h in hosts:
+                await delete_discovery_state_and_group0_id(cql, h)
+
+            recovery_leader_id = await manager.get_host_id(servers[0].server_id)
+
+            async def set_recovery_leader(srv: ServerInfo):
+                await manager.server_update_config(srv.server_id, 'recovery_leader', recovery_leader_id)
+
+            await manager.rolling_restart(servers, with_down=set_recovery_leader, wait_for_cql=False)
+
+            cql = await reconnect_driver(manager)
+            cql.cluster.max_schema_agreement_wait = 0
+            await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
+
+            # test #4a: the tombstones are not cleaned up after both live nodes join the new group0
+            with pytest.raises(AssertionError, match="Deadline exceeded"):
+                await verify_tombstone_gc(tombstone_mark, timeout=5)
+
+            await manager.remove_node(servers[0].server_id, down_server.server_id)
+
+            # test #4b: the tombstones are cleaned up after removing the killed node
+            await verify_tombstone_gc(tombstone_mark)
+
+            # Finish the Raft-based recovery procedure. We leave the cluster in a clean state for future tests.
+            for srv in servers:
+                await manager.server_remove_config_option(srv.server_id, 'recovery_leader')
+            for h in hosts:
+                await delete_raft_group_data(first_group0_id, cql, h)
+

@pytest.mark.asyncio
@skip_mode('release', "test only needs to run once - allowing only the 'dev' mode")