Files
scylladb/test/object_store/test_backup.py
Kefu Chai 9f8adcd207 backup_task: track the first failure uploading sstables
before this change, we only record the exception returned
by `upload_file()`, and rethrow the exception. but the exception
thrown by `upload_file()` is not propagated to its caller. instead, the
exceptional future is ignored on purpose -- we need to perform
the uploads in parallel.  this is why the task is not marked as failed
even if some of the uploads performed by it fail.

in this change, we

- coroutinize `backup_task_impl::do_backup()`. strictly speaking,
  this is not necessary to propagate the exception. but, in order
  to ensure that the possible exception is captured before the
  gate is closed, and to reduce the indentation, the teardown
  steps are performed explicitly.
- in addition to noting down the exception in the logging message,
  we also store it in a local variable, which is rethrown
  before this function returns.

Fixes scylladb/scylladb#21248
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes scylladb/scylladb#21254
2024-10-28 12:54:27 +03:00

232 lines
9.6 KiB
Python

#!/usr/bin/env python3
import asyncio
import os
import requests
import pytest
import logging
from test.pylib.manager_client import ManagerClient
from test.object_store.conftest import format_tuples
from test.object_store.conftest import get_s3_resource
from test.topology.conftest import skip_mode
from test.pylib.util import unique_name
logger = logging.getLogger(__name__)
def create_ks_and_cf(cql):
    """Create the test keyspace and table, insert three rows, and return (ks, cf)."""
    ks = 'test_ks'
    cf = 'test_cf'
    replication_opts = format_tuples({'class': 'NetworkTopologyStrategy', 'replication_factor': '1'})
    cql.execute(f"CREATE KEYSPACE {ks} WITH REPLICATION = {replication_opts};")
    cql.execute(f"CREATE TABLE {ks}.{cf} ( name text primary key, value text );")
    # Seed a handful of rows so backup/restore round-trips have data to compare.
    for name, value in (('0', 'zero'), ('1', 'one'), ('2', 'two')):
        cql.execute(f"INSERT INTO {ks}.{cf} ( name, value ) VALUES ('{name}', '{value}');")
    return ks, cf
async def prepare_snapshot_for_backup(manager: ManagerClient, server, snap_name='backup'):
    """Create the test keyspace, flush it to sstables, and take a snapshot.

    Returns the (keyspace, table) pair so callers can locate the snapshot
    directory under the server workdir and query the data afterwards.

    Fix: the original fetched the server workdir via an awaited RPC into a
    local that was never used — a dead variable and a needless round-trip;
    callers that need the workdir already query it themselves.
    """
    cql = manager.get_cql()
    print('Create keyspace')
    ks, cf = create_ks_and_cf(cql)
    # Flush so the snapshot actually contains sstables on disk.
    print('Flush keyspace')
    await manager.api.flush_keyspace(server.ip_addr, ks)
    print('Take keyspace snapshot')
    await manager.api.take_snapshot(server.ip_addr, ks, snap_name)
    return ks, cf
@pytest.mark.asyncio
async def test_simple_backup(manager: ManagerClient, s3_server):
    '''check that backing up a snapshot for a keyspace works'''
    cfg = {
        'enable_user_defined_functions': False,
        'object_storage_config_file': str(s3_server.config_file),
        'experimental_features': ['keyspace-storage-options'],
        'task_ttl_in_seconds': 300,
    }
    cmd = ['--logger-log-level', 'snapshots=trace:task_manager=trace']
    server = await manager.server_add(config=cfg, cmdline=cmd)
    ks, cf = await prepare_snapshot_for_backup(manager, server)
    workdir = await manager.server_get_workdir(server.server_id)
    cf_dir = os.listdir(f'{workdir}/data/{ks}')[0]
    snapshot_files = set(os.listdir(f'{workdir}/data/{ks}/{cf_dir}/snapshots/backup'))
    assert len(snapshot_files) > 0
    print('Backup snapshot')
    prefix = f'{cf}/backup'
    tid = await manager.api.backup(server.ip_addr, ks, cf, 'backup', s3_server.address, s3_server.bucket_name, prefix)
    print(f'Started task {tid}')
    status = await manager.api.get_task_status(server.ip_addr, tid)
    print(f'Status: {status}, waiting to finish')
    status = await manager.api.wait_task(server.ip_addr, tid)
    assert status is not None
    assert status['state'] == 'done'
    # Every sstable file of the snapshot must have landed in the bucket.
    objects = {o.key for o in get_s3_resource(s3_server).Bucket(s3_server.bucket_name).objects.all()}
    for f in snapshot_files:
        print(f'Check {f} is in backup')
        assert f'{prefix}/{f}' in objects
    # Check that task runs in the streaming sched group
    log = await manager.server_open_log(server.server_id)
    res = await log.grep(r'INFO.*\[shard [0-9]:([a-z]+)\] .* Backup sstables from .* to')
    assert len(res) == 1 and res[0][1].group(1) == 'strm'
@pytest.mark.asyncio
async def test_backup_to_non_existent_bucket(manager: ManagerClient, s3_server):
    '''backup should fail if the destination bucket does not exist'''
    cfg = {
        'enable_user_defined_functions': False,
        'object_storage_config_file': str(s3_server.config_file),
        'experimental_features': ['keyspace-storage-options'],
        'task_ttl_in_seconds': 300,
    }
    cmd = ['--logger-log-level', 'snapshots=trace:task_manager=trace']
    server = await manager.server_add(config=cfg, cmdline=cmd)
    ks, cf = await prepare_snapshot_for_backup(manager, server)
    workdir = await manager.server_get_workdir(server.server_id)
    cf_dir = os.listdir(f'{workdir}/data/{ks}')[0]
    snapshot_files = set(os.listdir(f'{workdir}/data/{ks}/{cf_dir}/snapshots/backup'))
    assert len(snapshot_files) > 0
    prefix = f'{cf}/backup'
    # Point the backup at a bucket that was never created; the task must fail.
    tid = await manager.api.backup(server.ip_addr, ks, cf, 'backup', s3_server.address, "non-existant-bucket", prefix)
    status = await manager.api.wait_task(server.ip_addr, tid)
    assert status is not None
    assert status['state'] == 'failed'
@pytest.mark.asyncio
@skip_mode('release', 'error injections are not supported in release mode')
async def test_backup_is_abortable(manager: ManagerClient, s3_server):
    '''check that backing up a snapshot for a keyspace works'''
    cfg = {
        'enable_user_defined_functions': False,
        'object_storage_config_file': str(s3_server.config_file),
        'experimental_features': ['keyspace-storage-options'],
        'task_ttl_in_seconds': 300,
    }
    cmd = ['--logger-log-level', 'snapshots=trace:task_manager=trace']
    server = await manager.server_add(config=cfg, cmdline=cmd)
    ks, cf = await prepare_snapshot_for_backup(manager, server)
    workdir = await manager.server_get_workdir(server.server_id)
    cf_dir = os.listdir(f'{workdir}/data/{ks}')[0]
    files = set(os.listdir(f'{workdir}/data/{ks}/{cf_dir}/snapshots/backup'))
    # Need more than one sstable file so an abort can land mid-upload.
    assert len(files) > 1
    await manager.api.enable_injection(server.ip_addr, "backup_task_pause", one_shot=True)
    log = await manager.server_open_log(server.server_id)
    mark = await log.mark()
    print('Backup snapshot')
    prefix = f'{cf}/backup'
    tid = await manager.api.backup(server.ip_addr, ks, cf, 'backup', s3_server.address, s3_server.bucket_name, prefix)
    print(f'Started task {tid}, aborting it early')
    # Wait for the injection to pause the task, abort it, then release the pause.
    await log.wait_for('backup task: waiting', from_mark=mark)
    await manager.api.abort_task(server.ip_addr, tid)
    await manager.api.message_injection(server.ip_addr, "backup_task_pause")
    status = await manager.api.wait_task(server.ip_addr, tid)
    print(f'Status: {status}')
    assert status is not None
    assert status['state'] == 'failed'
    objects = {o.key for o in get_s3_resource(s3_server).Bucket(s3_server.bucket_name).objects.all()}
    uploaded_count = 0
    for f in files:
        print(f'Check {f} is in backup')
        uploaded_count += f'{prefix}/{f}' in objects
    # Some, but not all, files made it to the bucket before the abort.
    assert 0 < uploaded_count < len(files)
@pytest.mark.asyncio
async def test_simple_backup_and_restore(manager: ManagerClient, s3_server):
    '''check that restoring from backed up snapshot for a keyspace:table works'''
    cfg = {
        'enable_user_defined_functions': False,
        'object_storage_config_file': str(s3_server.config_file),
        'experimental_features': ['keyspace-storage-options'],
        'task_ttl_in_seconds': 300,
    }
    cmd = ['--logger-log-level', 'sstables_loader=debug:sstable_directory=trace:snapshots=trace:s3=trace:sstable=debug:http=debug']
    server = await manager.server_add(config=cfg, cmdline=cmd)
    cql = manager.get_cql()
    workdir = await manager.server_get_workdir(server.server_id)
    # This test is sensitive not to share the bucket with any other test
    # that can run in parallel, so generate some unique name for the snapshot
    snap_name = unique_name('backup_')
    print(f'Create and backup keyspace (snapshot name is {snap_name})')
    ks, cf = await prepare_snapshot_for_backup(manager, server, snap_name)
    cf_dir = os.listdir(f'{workdir}/data/{ks}')[0]

    def list_sstables():
        # All regular files in the table directory (i.e. the live sstables).
        return [e for e in os.scandir(f'{workdir}/data/{ks}/{cf_dir}') if e.is_file()]

    orig_res = cql.execute(f"SELECT * FROM {ks}.{cf}")
    orig_rows = {row.name: row.value for row in orig_res}
    # include a "suffix" in the key to mimic the use case where scylla-manager
    # 1. backups sstables of multiple snapshots, and deduplicate the backup'ed
    # sstables by only upload the new sstables
    # 2. restore a given snapshot by collecting all sstables of this snapshot from
    # multiple places
    #
    # in this test, we:
    # 1. upload:
    # prefix: {prefix}/{suffix}
    # sstables:
    # - 1-TOC.txt
    # - 2-TOC.txt
    # - ...
    # 2. download:
    # prefix = {prefix}
    # sstables:
    # - {suffix}/1-TOC.txt
    # - {suffix}/2-TOC.txt
    # - ...
    suffix = 'suffix'
    toc_names = [f'{suffix}/{entry.name}' for entry in list_sstables() if entry.name.endswith('TOC.txt')]
    prefix = f'{cf}/{snap_name}'
    tid = await manager.api.backup(server.ip_addr, ks, cf, snap_name, s3_server.address, s3_server.bucket_name, f'{prefix}/{suffix}')
    status = await manager.api.wait_task(server.ip_addr, tid)
    assert status is not None
    assert status['state'] == 'done'
    print("Drop the table data and validate it's gone")
    cql.execute(f"TRUNCATE TABLE {ks}.{cf};")
    assert len(list_sstables()) == 0
    res = cql.execute(f"SELECT * FROM {ks}.{cf};")
    assert not res
    objects = {o.key for o in get_s3_resource(s3_server).Bucket(s3_server.bucket_name).objects.filter(Prefix=prefix)}
    assert len(objects) > 0
    print('Try to restore')
    tid = await manager.api.restore(server.ip_addr, ks, cf, s3_server.address, s3_server.bucket_name, prefix, toc_names)
    status = await manager.api.wait_task(server.ip_addr, tid)
    assert status is not None
    assert status['state'] == 'done'
    print('Check that sstables came back')
    assert len(list_sstables()) > 0
    print('Check that data came back too')
    res = cql.execute(f"SELECT * FROM {ks}.{cf};")
    rows = {x.name: x.value for x in res}
    assert rows == orig_rows, "Unexpected table contents after restore"
    print('Check that backup files are still there')  # regression test for #20938
    post_objects = {o.key for o in get_s3_resource(s3_server).Bucket(s3_server.bucket_name).objects.filter(Prefix=prefix)}
    assert objects == post_objects