test: test group0 tombstone GC in the Raft-based recovery procedure

We add a regression test for the bug fixed in the previous commits.

(cherry picked from commit c57f097630)
This commit is contained in:
Patryk Jędrzejczak
2025-10-16 18:39:56 +02:00
committed by GitHub Action
parent 8a11535a12
commit 76560ca095

View File

@@ -11,10 +11,11 @@ import logging
import time
import pytest
from test.pylib.internal_types import ServerInfo
from test.pylib.manager_client import ManagerClient
from test.pylib.util import wait_for, wait_for_cql_and_get_hosts
from test.cluster.conftest import skip_mode
from test.cluster.util import disable_schema_agreement_wait, new_test_keyspace, new_test_table
from test.cluster.util import delete_discovery_state_and_group0_id, delete_raft_group_data, disable_schema_agreement_wait, new_test_keyspace, new_test_table, reconnect_driver
logger = logging.getLogger(__name__)
@@ -83,6 +84,19 @@ async def test_group0_tombstone_gc(manager: ManagerClient):
- tombstones exist from the previous test
Assert:
- the tombstones are cleaned up eventually
Test #4:
Regression test for https://github.com/scylladb/scylladb/issues/26534
Arrange:
- kill one of the nodes
Act:
- create new group0 tombstones by updating the schema (create/alter/delete random tables),
- perform the Raft-based recovery procedure, which creates a new group0
Assert:
- (4a) the tombstones are not cleaned up after both live nodes join the new group0 even though the killed
node doesn't belong to it,
- (4b) the tombstones are cleaned up after removing the killed node
"""
cmdline = [
# disabling caches as the tombstones still remain in the cache even after the compaction
@@ -203,6 +217,55 @@ async def test_group0_tombstone_gc(manager: ManagerClient):
# test #3: the tombstones are cleaned up after the node is started again
await verify_tombstone_gc(tombstone_mark)
first_group0_id = (await cql.run_async(
"SELECT value FROM system.scylla_local WHERE key = 'raft_group0_id'"))[0].value
# Kill one server.
await manager.server_stop(down_server.server_id)
servers.pop()
await alter_system_schema(keyspace)
tombstone_mark = datetime.now(timezone.utc)
# Start the Raft-based recovery procedure.
await manager.rolling_restart(servers, wait_for_cql=False)
# TODO: do not reconnect the driver (here and below) once scylladb/python-driver#295 is fixed.
# It doesn't go well with disable_schema_agreement_wait above.
cql = await reconnect_driver(manager)
cql.cluster.max_schema_agreement_wait = 0
hosts = [(await wait_for_cql_and_get_hosts(cql, [s], time.time() + 60))[0]
for s in servers]
for h in hosts:
await delete_discovery_state_and_group0_id(cql, h)
recovery_leader_id = await manager.get_host_id(servers[0].server_id)
async def set_recovery_leader(srv: ServerInfo):
await manager.server_update_config(srv.server_id, 'recovery_leader', recovery_leader_id)
await manager.rolling_restart(servers, with_down=set_recovery_leader, wait_for_cql=False)
cql = await reconnect_driver(manager)
cql.cluster.max_schema_agreement_wait = 0
await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
# test #4a: the tombstones are not cleaned up after both live nodes join the new group0
with pytest.raises(AssertionError, match="Deadline exceeded"):
await verify_tombstone_gc(tombstone_mark, timeout=5)
await manager.remove_node(servers[0].server_id, down_server.server_id)
# test #4b: the tombstones are cleaned up after removing the killed node
await verify_tombstone_gc(tombstone_mark)
# Finish the Raft-based recovery procedure. We leave the cluster in a clean state for future tests.
for srv in servers:
await manager.server_remove_config_option(srv.server_id, 'recovery_leader')
for h in hosts:
await delete_raft_group_data(first_group0_id, cql, h)
@pytest.mark.asyncio
@skip_mode('release', "test only needs to run once - allowing only the 'dev' mode")