sstables_loader: Fail tablet-restore task if not all sstables were downloaded

When the storage_service::restore_tablets() resolves, it only means that
tablet transitions are done, including restore transitions, but not
necessarily that they succeeded. So before resolving the restoration
task with success need to check if all sstables were downloaded and, if
not, resolve the task with exception.

Test included. It uses fault-injection to abort downloading of a single
sstable early, then checks that the error was properly propagated back
to the task waiting API

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This commit is contained in:
Pavel Emelyanov
2026-03-26 15:42:19 +03:00
parent 781e7b1f38
commit f85c5184f3
3 changed files with 51 additions and 0 deletions

View File

@@ -778,6 +778,40 @@ async def test_restore_tablets(build_mode: str, manager: ManagerClient, object_s
await check_mutation_replicas(cql, manager, servers, range(num_keys), topology, logger, ks, 'test')
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_restore_tablets_download_failure(build_mode: str, manager: ManagerClient, object_storage):
'''Check that failure to download an sstable propagates back to API'''
topology = topo(rf = 1, nodes = 2, racks = 1, dcs = 1)
servers, host_ids = await create_cluster(topology, manager, logger, object_storage)
await manager.disable_tablet_balancing()
cql = manager.get_cql()
num_keys = 12
tablet_count=4
async with new_test_keyspace(manager, f"WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': {topology.rf}}}") as ks:
await cql.run_async(create_schema(ks, 'test', min_tablet_count=tablet_count))
insert_stmt = cql.prepare(f"INSERT INTO {ks}.test (pk, value) VALUES (?, ?)")
insert_stmt.consistency_level = ConsistencyLevel.ALL
await asyncio.gather(*(cql.run_async(insert_stmt, (str(i), i)) for i in range(num_keys)))
snap_name, sstables = await take_snapshot(ks, servers, manager, logger)
await asyncio.gather(*(do_backup(s, snap_name, f'{s.server_id}/{snap_name}', ks, 'test', object_storage, manager, logger) for s in servers))
await manager.api.enable_injection(servers[1].ip_addr, "fail_download_sstable", one_shot=True)
async with new_test_keyspace(manager, f"WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': {topology.rf}}}") as ks:
await cql.run_async(f"CREATE TABLE {ks}.test ( pk text primary key, value int ) WITH tablets = {{'min_tablet_count': {tablet_count}, 'max_tablet_count': {tablet_count}}};")
logger.info(f'Restore cluster via {servers[0].ip_addr}')
manifests = [ f'{s.server_id}/{snap_name}/manifest.json' for s in servers ]
tid = await manager.api.restore_tablets(servers[0].ip_addr, ks, 'test', snap_name, servers[0].datacenter, object_storage.address, object_storage.bucket_name, manifests)
status = await manager.api.wait_task(servers[0].ip_addr, tid)
assert 'state' in status and status['state'] == 'failed'
assert 'error' in status and 'Failed to download' in status['error']
@pytest.mark.asyncio
async def test_restore_with_non_existing_sstable(manager: ManagerClient, object_storage):
'''Check that restore task fails well when given a non-existing sstable'''