mirror of
https://github.com/scylladb/scylladb.git
synced 2026-06-03 21:47:10 +00:00
sstables_loader: Don't bypass synchronization with busy topology
The patch c543059f86 fixed the synchronization issue between tablet split and load-and-stream. The synchronization worked only with raft topology, and therefore was disabled with gossip. To do the check, storage_service::raft_topology_change_enabled() is used, but the topology kind is only available/set on shard 0, so it caused the synchronization to be bypassed when load-and-stream runs on any shard other than 0. The reason the reproducer didn't catch it is that it was restricted to a single CPU. It will now run with multiple CPUs and catch the problem observed. Fixes #22707 Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com> Closes scylladb/scylladb#26730 (cherry picked from commit 7f34366b9d)
This commit is contained in:
committed by
GitHub Action
parent
4d3e896eae
commit
4c466ace4f
@@ -1974,3 +1974,188 @@ async def test_split_correctness_on_tablet_count_change(manager: ManagerClient):
|
||||
await manager.api.message_injection(server.ip_addr, "splitting_mutation_writer_switch_wait")
|
||||
await asyncio.sleep(.1)
|
||||
await manager.api.message_injection(server.ip_addr, "merge_completion_fiber")
|
||||
<<<<<<< HEAD
|
||||
||||||| parent of 7f34366b9d (sstables_loader: Don't bypass synchronization with busy topology)
|
||||
|
||||
# Reproducer for https://github.com/scylladb/scylladb/issues/26041.
|
||||
@pytest.mark.parametrize("primary_replica_only", [False, True])
@skip_mode('release', 'error injections are not supported in release mode')
async def test_tablet_load_and_stream_and_split_synchronization(manager: ManagerClient, primary_replica_only):
    """Check that loading sstables while a tablet split is in flight loses no rows.

    Scenario, on a single node started with ``--smp 1``:

    1. Write 100 rows into a table created with ``min_tablet_count`` 1, flush,
       and verify them.
    2. Stop the node, move the table's sstable files into its ``upload``
       directory, restart, and verify the table now reads as empty.
    3. Arm the ``tablet_resize_finalization_post_barrier`` injection, raise
       ``min_tablet_count`` to 2, and wait until the node logs that it is
       waiting at that injection.
    4. Arm ``stream_mutation_fragments``, start ``load_new_sstables`` in the
       background, then release both injections in turn, waiting on the
       corresponding log lines in between.
    5. Await the load task and verify that all 100 rows are readable again.

    The test is parametrized over ``primary_replica_only``, which is passed
    straight through to ``manager.api.load_new_sstables``.
    """
    logger.info("Bootstrapping cluster")
    cmdline = [
        '--logger-log-level', 'storage_service=debug',
        '--logger-log-level', 'table=debug',
        # Run the node with a single shard.
        '--smp', '1',
    ]
    servers = [await manager.server_add(config={
        'tablet_load_stats_refresh_interval_in_seconds': 1
    }, cmdline=cmdline)]
    server = servers[0]

    # Keep tablet balancing off until the injection below has been armed.
    await manager.api.disable_tablet_balancing(servers[0].ip_addr)

    cql = manager.get_cql()

    initial_tablets = 1

    async with new_test_keyspace(manager, f"WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}}") as ks:
        await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int) WITH tablets = {{'min_tablet_count': {initial_tablets}}};")

        # Every row is written with c == pk, which check() relies on.
        keys = range(100)
        await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys])

        async def check(ks_name: str):
            """Assert that the table holds exactly len(keys) rows, each with c == pk."""
            logger.info("Checking table")
            # Fetch a fresh session from the manager rather than using the outer cql.
            cql = manager.get_cql()
            rows = await cql.run_async(f"SELECT * FROM {ks_name}.test BYPASS CACHE;")
            assert len(rows) == len(keys)
            for r in rows:
                assert r.c == r.pk

        # Flush before the sanity check so the data is in sstable files on disk.
        await manager.api.flush_keyspace(servers[0].ip_addr, ks)
        await check(ks)

        node_workdir = await manager.server_get_workdir(servers[0].server_id)

        # NOTE(review): the value bound to cql here is never used; cql is
        # reassigned from manager.get_cql() after the restart below.
        cql = await safe_server_stop_gracefully(manager, servers[0].server_id)

        # Takes the first directory matching "test-*" under the keyspace data dir.
        table_dir = glob.glob(os.path.join(node_workdir, "data", ks, "test-*"))[0]
        logger.info(f"Table dir: {table_dir}")

        def move_sstables_to_upload(table_dir: str):
            """Move every file sharing a prefix with a "*-Data.db" file into table_dir/upload."""
            logger.info("Moving sstables to upload dir")
            table_upload_dir = os.path.join(table_dir, "upload")
            for sst in glob.glob(os.path.join(table_dir, "*-Data.db")):
                # sst is already a full path; strip the "-Data.db" suffix and
                # glob on the remaining prefix to pick up the sibling files.
                for src_path in glob.glob(os.path.join(table_dir, sst.removesuffix("-Data.db") + "*")):
                    dst_path = os.path.join(table_upload_dir, os.path.basename(src_path))
                    logger.info(f"Moving sstable file {src_path} to {dst_path}")
                    os.rename(src_path, dst_path)

        move_sstables_to_upload(table_dir)

        await manager.server_start(servers[0].server_id)
        cql = manager.get_cql()
        await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)

        # The sstables were moved out of the table directory, so nothing is readable.
        rows = await cql.run_async(f"SELECT * FROM {ks}.test BYPASS CACHE;")
        assert len(rows) == 0

        # Balancing is disabled again after the restart; presumably the setting
        # does not survive a restart -- TODO confirm.
        await manager.api.disable_tablet_balancing(servers[0].ip_addr)

        # Arm a one-shot injection. Judging by the "...: waiting" log line awaited
        # below, the server blocks at this point until it is messaged -- confirm
        # against the server-side injection.
        await manager.api.enable_injection(servers[0].ip_addr, "tablet_resize_finalization_post_barrier", one_shot=True)

        # Only log lines emitted after this mark are considered by the waits below.
        s1_log = await manager.server_open_log(servers[0].server_id)
        s1_mark = await s1_log.mark()

        await manager.api.enable_tablet_balancing(servers[0].ip_addr)

        # Raise min_tablet_count from 1 to 2; presumably this is what starts the
        # tablet split that the test later waits to see detected.
        await cql.run_async(f"ALTER TABLE {ks}.test WITH tablets = {{'min_tablet_count': {initial_tablets * 2}}}")

        await s1_log.wait_for(f"tablet_resize_finalization_post_barrier: waiting", from_mark=s1_mark)

        # Arm the second injection before load-and-stream starts.
        await manager.api.enable_injection(servers[0].ip_addr, "stream_mutation_fragments", one_shot=True)

        # Run the load in the background so the test can keep driving the injections.
        load_and_stream_task = asyncio.create_task(manager.api.load_new_sstables(servers[0].ip_addr, ks, "test", primary_replica_only))
        await s1_log.wait_for(f"Loading new SSTables for keyspace", from_mark=s1_mark)

        # Release the resize-finalization injection only once loading has begun.
        await manager.api.message_injection(server.ip_addr, "tablet_resize_finalization_post_barrier")
        await s1_log.wait_for('Detected tablet split for table', from_mark=s1_mark)

        # Release streaming only after it has reached its injection point.
        await s1_log.wait_for(f"stream_mutation_fragments: waiting", from_mark=s1_mark)
        await manager.api.message_injection(server.ip_addr, "stream_mutation_fragments")

        await load_and_stream_task

        # All rows must be readable again after load-and-stream completes.
        await check(ks)
|
||||
=======
|
||||
|
||||
# Reproducer for https://github.com/scylladb/scylladb/issues/26041.
|
||||
@pytest.mark.parametrize("primary_replica_only", [False, True])
@skip_mode('release', 'error injections are not supported in release mode')
async def test_tablet_load_and_stream_and_split_synchronization(manager: ManagerClient, primary_replica_only):
    """Check that loading sstables while a tablet split is in flight loses no rows.

    Scenario, on a single node started without an ``--smp`` override:

    1. Write 100 rows into a table created with ``min_tablet_count`` 1, flush,
       and verify them.
    2. Stop the node, move the table's sstable files into its ``upload``
       directory, restart, and verify the table now reads as empty.
    3. Arm the ``tablet_resize_finalization_post_barrier`` injection, raise
       ``min_tablet_count`` to 2, and wait until the node logs that it is
       waiting at that injection.
    4. Arm ``stream_mutation_fragments``, start ``load_new_sstables`` in the
       background, then release both injections in turn, waiting on the
       corresponding log lines in between.
    5. Await the load task and verify that all 100 rows are readable again.

    The test is parametrized over ``primary_replica_only``, which is passed
    straight through to ``manager.api.load_new_sstables``.
    """
    logger.info("Bootstrapping cluster")
    # No --smp option is passed, so the node's shard count is left at its default.
    cmdline = [
        '--logger-log-level', 'storage_service=debug',
        '--logger-log-level', 'table=debug',
    ]
    servers = [await manager.server_add(config={
        'tablet_load_stats_refresh_interval_in_seconds': 1
    }, cmdline=cmdline)]
    server = servers[0]

    # Keep tablet balancing off until the injection below has been armed.
    await manager.api.disable_tablet_balancing(servers[0].ip_addr)

    cql = manager.get_cql()

    initial_tablets = 1

    async with new_test_keyspace(manager, f"WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}}") as ks:
        await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int) WITH tablets = {{'min_tablet_count': {initial_tablets}}};")

        # Every row is written with c == pk, which check() relies on.
        keys = range(100)
        await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys])

        async def check(ks_name: str):
            """Assert that the table holds exactly len(keys) rows, each with c == pk."""
            logger.info("Checking table")
            # Fetch a fresh session from the manager rather than using the outer cql.
            cql = manager.get_cql()
            rows = await cql.run_async(f"SELECT * FROM {ks_name}.test BYPASS CACHE;")
            assert len(rows) == len(keys)
            for r in rows:
                assert r.c == r.pk

        # Flush before the sanity check so the data is in sstable files on disk.
        await manager.api.flush_keyspace(servers[0].ip_addr, ks)
        await check(ks)

        node_workdir = await manager.server_get_workdir(servers[0].server_id)

        # NOTE(review): the value bound to cql here is never used; cql is
        # reassigned from manager.get_cql() after the restart below.
        cql = await safe_server_stop_gracefully(manager, servers[0].server_id)

        # Takes the first directory matching "test-*" under the keyspace data dir.
        table_dir = glob.glob(os.path.join(node_workdir, "data", ks, "test-*"))[0]
        logger.info(f"Table dir: {table_dir}")

        def move_sstables_to_upload(table_dir: str):
            """Move every file sharing a prefix with a "*-Data.db" file into table_dir/upload."""
            logger.info("Moving sstables to upload dir")
            table_upload_dir = os.path.join(table_dir, "upload")
            for sst in glob.glob(os.path.join(table_dir, "*-Data.db")):
                # sst is already a full path; strip the "-Data.db" suffix and
                # glob on the remaining prefix to pick up the sibling files.
                for src_path in glob.glob(os.path.join(table_dir, sst.removesuffix("-Data.db") + "*")):
                    dst_path = os.path.join(table_upload_dir, os.path.basename(src_path))
                    logger.info(f"Moving sstable file {src_path} to {dst_path}")
                    os.rename(src_path, dst_path)

        move_sstables_to_upload(table_dir)

        await manager.server_start(servers[0].server_id)
        cql = manager.get_cql()
        await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)

        # The sstables were moved out of the table directory, so nothing is readable.
        rows = await cql.run_async(f"SELECT * FROM {ks}.test BYPASS CACHE;")
        assert len(rows) == 0

        # Balancing is disabled again after the restart; presumably the setting
        # does not survive a restart -- TODO confirm.
        await manager.api.disable_tablet_balancing(servers[0].ip_addr)

        # Arm a one-shot injection. Judging by the "...: waiting" log line awaited
        # below, the server blocks at this point until it is messaged -- confirm
        # against the server-side injection.
        await manager.api.enable_injection(servers[0].ip_addr, "tablet_resize_finalization_post_barrier", one_shot=True)

        # Only log lines emitted after this mark are considered by the waits below.
        s1_log = await manager.server_open_log(servers[0].server_id)
        s1_mark = await s1_log.mark()

        await manager.api.enable_tablet_balancing(servers[0].ip_addr)

        # Raise min_tablet_count from 1 to 2; presumably this is what starts the
        # tablet split that the test later waits to see detected.
        await cql.run_async(f"ALTER TABLE {ks}.test WITH tablets = {{'min_tablet_count': {initial_tablets * 2}}}")

        await s1_log.wait_for(f"tablet_resize_finalization_post_barrier: waiting", from_mark=s1_mark)

        # Arm the second injection before load-and-stream starts.
        await manager.api.enable_injection(servers[0].ip_addr, "stream_mutation_fragments", one_shot=True)

        # Run the load in the background so the test can keep driving the injections.
        load_and_stream_task = asyncio.create_task(manager.api.load_new_sstables(servers[0].ip_addr, ks, "test", primary_replica_only))
        await s1_log.wait_for(f"Loading new SSTables for keyspace", from_mark=s1_mark)

        # Release the resize-finalization injection only once loading has begun.
        await manager.api.message_injection(server.ip_addr, "tablet_resize_finalization_post_barrier")
        await s1_log.wait_for('Detected tablet split for table', from_mark=s1_mark)

        # Release streaming only after it has reached its injection point.
        await s1_log.wait_for(f"stream_mutation_fragments: waiting", from_mark=s1_mark)
        await manager.api.message_injection(server.ip_addr, "stream_mutation_fragments")

        await load_and_stream_task

        # All rows must be readable again after load-and-stream completes.
        await check(ks)
|
||||
>>>>>>> 7f34366b9d (sstables_loader: Don't bypass synchronization with busy topology)
|
||||
|
||||
Reference in New Issue
Block a user