test: Restore resilience test

The test checks that losing one of the nodes from the cluster while a
restore is in progress is handled. In particular:

- losing the node serving the API request makes the task-waiting API
  call throw (with a connection error)
- losing the coordinator or a replica node makes the API call fail,
  because some tablets fail to get restored. If the coordinator is
  lost, a coordinator re-election is triggered, and the new coordinator
  still notices that a tablet that was replicated to the "old"
  coordinator failed to get restored, and fails the restore anyway

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Pavel Emelyanov
2026-03-26 19:33:22 +03:00
parent 69b8f76a32
commit 3bcefa42c5
2 changed files with 55 additions and 0 deletions
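
For orientation before the diffs: the test parks a restore on one replica via
the pause_tablet_restore error injection, stops a node while the restore is
held there, and then releases (or abandons) the pause. Below is a condensed
sketch of that handshake using the manager helpers that appear in the test
itself; replica, api_node, victim, and restore_args are hypothetical stand-ins.

    # Condensed sketch of the test's pause/stop/release handshake; `replica`,
    # `api_node`, `victim`, and `restore_args` are hypothetical stand-ins.
    async def node_loss_during_restore(manager, replica, api_node, victim, restore_args):
        # Arm the pause on the replica that will download the tablet's sstables.
        await manager.api.enable_injection(replica.ip_addr, "pause_tablet_restore", one_shot=True)
        log = await manager.server_open_log(replica.server_id)
        mark = await log.mark()

        # Start the restore and wait until it is parked at the injection point.
        tid = await manager.api.restore_tablets(api_node.ip_addr, *restore_args)
        await log.wait_for("pause_tablet_restore: waiting for message", from_mark=mark)

        # Lose a node mid-restore, release the pause, and observe how the task
        # ends: it throws when api_node itself was the victim, fails otherwise.
        await manager.server_stop(victim.server_id)
        await manager.api.message_injection(replica.ip_addr, "pause_tablet_restore")
        await manager.api.wait_task(api_node.ip_addr, tid)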


@@ -986,6 +986,8 @@ future<> sstables_loader::download_tablet_sstables(locator::global_tablet_id tid
         throw std::logic_error("sstables_partially_contained");
     }
     llog.debug("{} SSTables filtered by range {} for tablet {}", fully.size(), tablet_range, tid);
+
+    co_await utils::get_local_injector().inject("pause_tablet_restore", utils::wait_for_message(60s));
     if (fully.empty()) {
         // It can happen that a tablet exists and contains no data. Just skip it
         co_return;
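
The injected wait_for_message(60s) suspends the download coroutine at exactly
this point until the test sends a message through the REST error-injection
endpoint, or until 60 seconds elapse; when the injection is not enabled, the
call is a no-op. Roughly, as a conceptual model (an illustration under stated
assumptions, not Scylla's actual implementation):

    import asyncio

    # A conceptual model of inject("pause_tablet_restore", wait_for_message(60s)):
    # when enabled, the restore coroutine parks on an event until the test
    # "messages" it, or until the timeout expires.
    class PausePoint:
        def __init__(self) -> None:
            self.enabled = False
            self._event = asyncio.Event()

        async def pause(self, timeout: float = 60.0) -> None:
            if not self.enabled:   # injections are no-ops unless explicitly enabled
                return
            try:
                await asyncio.wait_for(self._event.wait(), timeout)
            except asyncio.TimeoutError:
                pass               # stop waiting here; how the real injection
                                   # reacts to a timeout is not shown in this diff

        def message(self) -> None:
            self._event.set()      # the effect of the test's message_injection()

    async def demo() -> None:
        point = PausePoint()
        point.enabled = True
        parked = asyncio.create_task(point.pause())
        await asyncio.sleep(0)     # let the "restore" coroutine park on the event
        point.message()            # release it, as the test does
        await parked

    asyncio.run(demo())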


@@ -7,6 +7,7 @@ import asyncio
 import subprocess
 import tempfile
 import itertools
+import aiohttp
 import pytest
 import time
@@ -810,6 +811,58 @@ async def test_restore_tablets_download_failure(build_mode: str, manager: Manage
     assert 'error' in status and 'Failed to download' in status['error']

+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("target", ['coordinator', 'replica', 'api'])
+@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
+async def test_restore_tablets_node_loss_resiliency(build_mode: str, manager: ManagerClient, object_storage, target):
+    '''Check how restore handles node loss in the middle of the operation'''
+    topology = topo(rf = 2, nodes = 4, racks = 2, dcs = 1)
+    servers, host_ids = await create_cluster(topology, manager, logger, object_storage)
+    log = await manager.server_open_log(servers[0].server_id)
+    await log.wait_for("raft_topology - start topology coordinator fiber", timeout=10)
+    await manager.disable_tablet_balancing()
+    cql = manager.get_cql()
+
+    num_keys = 24
+    tablet_count = 8
+
+    async with new_test_keyspace(manager, f"WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': {topology.rf}}}") as ks:
+        await cql.run_async(f"CREATE TABLE {ks}.test ( pk text primary key, value int ) WITH tablets = {{'min_tablet_count': {tablet_count}}};")
+        insert_stmt = cql.prepare(f"INSERT INTO {ks}.test (pk, value) VALUES (?, ?)")
+        insert_stmt.consistency_level = ConsistencyLevel.ALL
+        await asyncio.gather(*(cql.run_async(insert_stmt, (str(i), i)) for i in range(num_keys)))
+        snap_name, sstables = await take_snapshot(ks, servers, manager, logger)
+        await asyncio.gather(*(do_backup(s, snap_name, f'{s.server_id}/{snap_name}', ks, 'test', object_storage, manager, logger) for s in servers))
+
+    async with new_test_keyspace(manager, f"WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': {topology.rf}}}") as ks:
+        await cql.run_async(f"CREATE TABLE {ks}.test ( pk text primary key, value int ) WITH tablets = {{'min_tablet_count': {tablet_count}, 'max_tablet_count': {tablet_count}}};")
+
+        await manager.api.enable_injection(servers[2].ip_addr, "pause_tablet_restore", one_shot=True)
+        log = await manager.server_open_log(servers[2].server_id)
+        mark = await log.mark()
+
+        manifests = [ f'{s.server_id}/{snap_name}/manifest.json' for s in servers ]
+        tid = await manager.api.restore_tablets(servers[1].ip_addr, ks, 'test', snap_name, servers[0].datacenter, object_storage.address, object_storage.bucket_name, manifests)
+        await log.wait_for("pause_tablet_restore: waiting for message", from_mark=mark)
+
+        if target == 'api':
+            await manager.server_stop(servers[1].server_id)
+            with pytest.raises(aiohttp.client_exceptions.ClientConnectorError):
+                await manager.api.wait_task(servers[1].ip_addr, tid)
+        else:
+            if target == 'coordinator':
+                await manager.server_stop(servers[0].server_id)
+                await manager.api.message_injection(servers[2].ip_addr, "pause_tablet_restore")
+            elif target == 'replica':
+                await manager.server_stop(servers[2].server_id)
+            # Sometimes the node being stopped manages to restore its tablets
+            # before it goes down, so the best we can do is make sure the
+            # restore task finishes at all
+            await asyncio.wait_for(manager.api.wait_task(servers[1].ip_addr, tid), timeout=60)
+
+
 @pytest.mark.asyncio
 async def test_restore_with_non_existing_sstable(manager: ManagerClient, object_storage):
     '''Check that restore task fails well when given a non-existing sstable'''
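
Note how the three parametrized cases map onto the cluster: servers[1] serves
the restore API request, servers[0] is the initial topology coordinator (the
test waits for its coordinator fiber in the log), and servers[2] hosts the
paused replica. Stopping servers[1] kills the API endpoint itself, which is
why only the 'api' case expects a ClientConnectorError rather than a failed
restore task.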