Files
scylladb/test/cluster/test_tablets_merge.py
Raphael S. Carvalho 28056344ba replica: Fix take_storage_snapshot() running concurrently to merge completion
Some background:
When merge happens, a background fiber wakes up to merge compaction
groups of sibling tablets into main one. It cannot happen when
rebuilding the storage group list, since token metadata update is
not preemptable. So a storage group, post merge, has the main
compaction group and two other groups to be merged into the main.
When the merge happens, those two groups are empty and will be
freed.

Consider this scenario:
1) merge happens, from 2 to 1 tablet
2) produces a single storage group, containing main and two
other compaction groups to be merged into main.
3) take_storage_snapshot(), triggered by migration post merge,
gets a list of pointers to all compaction groups.
4) t__s__s() iterates first on main group, yields.
5) background fiber wakes up, moves the data into main
and frees the two groups
6) t__s__s() advances to other groups that are now freed,
since step 5.
7) segmentation fault

In addition to memory corruption, there's also a potential for
data to escape the iteration in take_storage_snapshot(), since
data can be moved across compaction groups in background, all
belonging to the same storage group. That could result in
data loss.

Readers should all operate on storage group level since it can
provide a view on all the data owned by a tablet replica.
The movement of sstable from group A to B is atomic, but
iteration first on A, then later on B, might miss data that
was moved from B to A, before the iteration reached B.
By switching to storage group in the interface that retrieves
groups by token range, we guarantee that all data of a given
replica can be found regardless of which compaction group they
sit on.

Fixes #23162.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes scylladb/scylladb#24058
2025-05-09 14:07:06 +03:00

462 lines
21 KiB
Python

#
# Copyright (C) 2025-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#
from test.pylib.internal_types import ServerInfo
from test.pylib.manager_client import ManagerClient
from test.pylib.rest_client import inject_error_one_shot, read_barrier
from test.pylib.tablets import get_tablet_replica, get_all_tablet_replicas
from test.pylib.util import wait_for
from test.cluster.conftest import skip_mode
from test.cluster.util import new_test_keyspace, create_new_test_keyspace
import pytest
import asyncio
import logging
import time
import random
logger = logging.getLogger(__name__)
async def inject_error_one_shot_on(manager, error_name, servers):
    """Request the one-shot error injection ``error_name`` on every server.

    One ``inject_error_one_shot`` call is created per entry of ``servers``
    (addressed through its ``ip_addr``) using ``manager.api``; all of them are
    then awaited together.
    """
    pending = []
    for node in servers:
        pending.append(inject_error_one_shot(manager.api, node.ip_addr, error_name))
    await asyncio.gather(*pending)
async def inject_error_on(manager, error_name, servers):
    """Enable the error injection ``error_name`` on every server in ``servers``.

    ``manager.api.enable_injection`` is called once per server with the
    server's ``ip_addr``, the injection name and ``False`` as third positional
    argument (presumably ``one_shot``, judging by the keyword used elsewhere in
    this file). All requests are awaited together.
    """
    requests = (
        manager.api.enable_injection(node.ip_addr, error_name, False)
        for node in servers
    )
    await asyncio.gather(*requests)
async def disable_injection_on(manager, error_name, servers):
    """Disable the error injection ``error_name`` on every server in ``servers``.

    ``manager.api.disable_injection`` is called once per server with the
    server's ``ip_addr`` and the injection name; all requests are awaited
    together.
    """
    def _request(node):
        return manager.api.disable_injection(node.ip_addr, error_name)

    await asyncio.gather(*map(_request, servers))
async def get_tablet_count(manager: ManagerClient, server: ServerInfo, keyspace_name: str, table_name: str):
    """Return the ``tablet_count`` stored in ``system.tablets`` for a table, as seen by ``server``.

    The query is pinned to ``server`` (via the ``host`` argument) and is
    preceded by a read barrier on that same node.

    Raises ``IndexError`` if the query returns no rows.
    """
    target_host = manager.cql.cluster.metadata.get_host(server.ip_addr)

    # read_barrier is needed to ensure that local tablet metadata on the queried node
    # reflects the finalized tablet movement.
    await read_barrier(manager.api, server.ip_addr)

    table_id = await manager.get_table_id(keyspace_name, table_name)
    query = f"SELECT tablet_count FROM system.tablets where table_id = {table_id}"
    result = await manager.cql.run_async(query, host=target_host)
    first_row = result[0]
    return first_row.tablet_count
@pytest.mark.asyncio
@skip_mode('release', 'error injections are not supported in release mode')
async def test_tablet_merge_simple(manager: ManagerClient):
    """Drive one table through a tablet split and then a merge, checking the data after each step.

    Outline:
      1. Start one node, write 200 rows of 2000 random bytes each into a
         single-tablet table and flush.
      2. Add a second node, enable balancing and wait until the split is logged.
      3. Delete all rows but one, flush and major-compact, then wait for the
         merge decision, the co-location of sibling tablets and the merge itself.
      4. Re-populate the table and move a tablet to the second node while the
         cleanup of its compaction groups is artificially delayed (issue #21867).

    ``check()`` is called throughout; it asserts that a full scan returns exactly
    as many rows as the current ``keys`` range holds.
    """
    logger.info("Bootstrapping cluster")
    cmdline = [
        '--logger-log-level', 'storage_service=debug',
        '--logger-log-level', 'table=debug',
        '--logger-log-level', 'load_balancer=debug',
        '--target-tablet-size-in-bytes', '30000',
    ]
    servers = [await manager.server_add(config={
        'error_injections_at_startup': ['short_tablet_stats_refresh_interval']
    }, cmdline=cmdline)]

    await manager.api.disable_tablet_balancing(servers[0].ip_addr)

    cql = manager.get_cql()
    async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as ks:
        await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c blob) WITH gc_grace_seconds=0 AND bloom_filter_fp_chance=1;")

        # Initial average table size of 400k (1 tablet), so triggers some splits.
        total_keys = 200
        keys = range(total_keys)

        def populate(keys):
            insert = cql.prepare(f"INSERT INTO {ks}.test(pk, c) VALUES(?, ?)")
            for pk in keys:
                value = random.randbytes(2000)
                cql.execute(insert, [pk, value])

        populate(keys)

        async def check():
            # Reads `keys` from the enclosing scope at call time, so it always
            # compares against the most recently assigned key range.
            logger.info("Checking table")
            cql = manager.get_cql()
            rows = await cql.run_async(f"SELECT * FROM {ks}.test BYPASS CACHE;")
            assert len(rows) == len(keys)

        await check()

        await manager.api.flush_keyspace(servers[0].ip_addr, ks)

        tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
        assert tablet_count == 1

        logger.info("Adding new server")
        servers.append(await manager.server_add(cmdline=cmdline))
        s1_host_id = await manager.get_host_id(servers[1].server_id)

        # Increases the chance of tablet migration concurrent with split
        await inject_error_one_shot_on(manager, "tablet_allocator_shuffle", servers)
        await inject_error_on(manager, "tablet_load_stats_refresh_before_rebalancing", servers)

        s1_log = await manager.server_open_log(servers[0].server_id)
        s1_mark = await s1_log.mark()

        # Now there's a split and migration need, so they'll potentially run concurrently.
        await manager.api.enable_tablet_balancing(servers[0].ip_addr)

        await check()
        # Give load balancer some time to do work. asyncio.sleep() is used instead
        # of time.sleep() so the event loop is not blocked while waiting.
        await asyncio.sleep(2)

        await s1_log.wait_for('Detected tablet split for table', from_mark=s1_mark)

        await check()

        tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
        assert tablet_count > 1

        # Allow shuffling of tablet replicas to make co-location work harder
        async def shuffle():
            await inject_error_on(manager, "tablet_allocator_shuffle", servers)
            await asyncio.sleep(2)
            await disable_injection_on(manager, "tablet_allocator_shuffle", servers)

        await shuffle()

        # This will allow us to simulate some balancing after co-location with shuffling, to make sure that
        # balancer won't break co-location.
        await inject_error_on(manager, "tablet_merge_completion_bypass", servers)

        # Shrinks table significantly, forcing merge.
        delete_keys = range(total_keys - 1)
        await asyncio.gather(*[cql.run_async(f"DELETE FROM {ks}.test WHERE pk={k};") for k in delete_keys])
        keys = range(total_keys - 1, total_keys)

        # To avoid race of major with migration
        await manager.api.disable_tablet_balancing(servers[0].ip_addr)

        for server in servers:
            await manager.api.flush_keyspace(server.ip_addr, ks)
            await manager.api.keyspace_compaction(server.ip_addr, ks)
        await manager.api.enable_tablet_balancing(servers[0].ip_addr)

        await s1_log.wait_for("Emitting resize decision of type merge", from_mark=s1_mark)
        # Waits for balancer to co-locate sibling tablets
        await s1_log.wait_for("All sibling tablets are co-located")
        # Do some shuffling to make sure balancer works with co-located tablets
        await shuffle()

        old_tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
        s1_mark = await s1_log.mark()

        await inject_error_on(manager, "replica_merge_completion_wait", servers)
        await disable_injection_on(manager, "tablet_merge_completion_bypass", servers)

        await s1_log.wait_for('Detected tablet merge for table', from_mark=s1_mark)

        tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
        assert tablet_count < old_tablet_count
        await check()

        # Reproduces https://github.com/scylladb/scylladb/issues/21867 that could cause compaction group
        # to be destroyed without being stopped first.
        # That's done by:
        #   1) Migrating a tablet to another node, and putting an artificial delay in cleanup stage when stopping groups
        #   2) Force tablet split, causing new groups to be added in a tablet being cleaned up
        # Without the fix, new groups are added to tablet being migrated away and never closed, potentially
        # resulting in a use-after-free.
        keys = range(total_keys)
        populate(keys)
        # Migrates a tablet to another node and put artificial delay on cleanup stage
        await manager.api.enable_injection(servers[0].ip_addr, "delay_tablet_compaction_groups_cleanup", one_shot=True)
        tablet_replicas = await get_all_tablet_replicas(manager, servers[0], ks, 'test')
        assert len(tablet_replicas) > 0
        t = tablet_replicas[0]
        migration_task = asyncio.create_task(
            manager.api.move_tablet(servers[0].ip_addr, ks, "test", *t.replicas[0], *(s1_host_id, 0), t.last_token))
        # Trigger split
        for server in servers:
            await manager.api.flush_keyspace(server.ip_addr, ks)

        try:
            await migration_task
        except Exception as e:
            # move_tablet() fails if tablet is already in transit.
            # forgive if balancer decided to migrate the target tablet post split.
            # Only Exception is caught so that task cancellation and
            # KeyboardInterrupt still propagate.
            logger.info(f"Ignoring failure of explicit tablet migration: {e}")

        await s1_log.wait_for('Merge completion fiber finished', from_mark=s1_mark)

        for server in servers:
            await manager.api.flush_keyspace(server.ip_addr, ks)
            await manager.api.keyspace_compaction(server.ip_addr, ks)
        await check()
# Multiple cycles of split and merge, with topology changes in parallel and RF > 1.
# NOTE(review): the keyspace below is created with 'replication_factor': 1, so the
# "RF > 1" part of the line above looks stale -- confirm which one is intended.
@pytest.mark.asyncio
@skip_mode('release', 'error injections are not supported in release mode')
async def test_tablet_split_and_merge_with_concurrent_topology_changes(manager: ManagerClient):
    """Run two split-then-merge cycles on one table while nodes are replaced in the background.

    A 3-node cluster is started and, in each of the two cycles:
      1. 200 rows of 2000 random bytes each are written and flushed.
      2. Balancing is enabled, a decommission + add of the last server is started
         as a background task, and the test waits for the split to be logged.
      3. All rows but one are deleted, the keyspace is flushed and major-compacted,
         and the test waits for the merge decision, co-location of sibling tablets,
         the merge itself and the end of the merge completion fiber, again with a
         decommission + add running in the background.

    The row count is verified with ``check()`` after the main steps, and the
    tablet count is asserted to grow after the split and shrink after the merge.
    """
    logger.info("Bootstrapping cluster")
    cmdline = [
        '--logger-log-level', 'storage_service=info',
        '--logger-log-level', 'table=info',
        '--logger-log-level', 'raft_topology=info',
        '--logger-log-level', 'group0_raft_sm=info',
        '--logger-log-level', 'load_balancer=info',
        '--target-tablet-size-in-bytes', '30000',
    ]
    config = {
        'error_injections_at_startup': ['short_tablet_stats_refresh_interval']
    }
    servers = [await manager.server_add(config=config, cmdline=cmdline),
               await manager.server_add(config=config, cmdline=cmdline),
               await manager.server_add(config=config, cmdline=cmdline)]

    cql = manager.get_cql()
    async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as ks:
        await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c blob) WITH gc_grace_seconds=0 AND bloom_filter_fp_chance=1;")

        # Replaces the last server of `servers`: decommission it, drop it from the
        # list, then add a fresh node. Mutates the shared `servers` list in place.
        async def perform_topology_ops():
            logger.info("Topology ops in background")
            server_id_to_decommission = servers[-1].server_id
            logger.info("Decommissioning old server with id {}".format(server_id_to_decommission))
            await manager.decommission_node(server_id_to_decommission)
            servers.pop()
            logger.info("Adding new server")
            # Note: unlike the initial servers, the replacement is added without `config`.
            servers.append(await manager.server_add(cmdline=cmdline))
            logger.info("Completed topology ops")

        for cycle in range(2):
            logger.info("Running split-merge cycle #{}".format(cycle))

            await manager.api.disable_tablet_balancing(servers[0].ip_addr)

            logger.info("Inserting data")
            # Initial average table size of (400k + metadata_overhead). Enough to trigger a few splits.
            total_keys = 200
            keys = range(total_keys)
            insert = cql.prepare(f"INSERT INTO {ks}.test(pk, c) VALUES(?, ?)")
            for pk in keys:
                value = random.randbytes(2000)
                cql.execute(insert, [pk, value])

            # Asserts that a full scan returns as many rows as the current `keys`
            # range holds; `keys` is looked up in the enclosing scope at call time.
            async def check():
                logger.info("Checking table")
                cql = manager.get_cql()
                rows = await cql.run_async(f"SELECT * FROM {ks}.test BYPASS CACHE;")
                assert len(rows) == len(keys)

            await check()

            logger.info("Flushing keyspace")
            for server in servers:
                await manager.api.flush_keyspace(server.ip_addr, ks)

            tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')

            # Increases the chance of tablet migration concurrent with split
            await inject_error_on(manager, "tablet_allocator_shuffle", servers)
            await inject_error_on(manager, "tablet_load_stats_refresh_before_rebalancing", servers)

            s1_log = await manager.server_open_log(servers[0].server_id)
            s1_mark = await s1_log.mark()

            logger.info("Enabling balancing")
            # Now there's a split and migration need, so they'll potentially run concurrently.
            await manager.api.enable_tablet_balancing(servers[0].ip_addr)

            # Runs the node replacement concurrently with the split; awaited below.
            topology_ops_task = asyncio.create_task(perform_topology_ops())

            await check()

            logger.info("Waiting for split")
            await disable_injection_on(manager, "tablet_allocator_shuffle", servers)
            await s1_log.wait_for('Detected tablet split for table', from_mark=s1_mark)

            logger.info("Waiting for topology ops")
            await topology_ops_task

            await check()

            old_tablet_count = tablet_count
            tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
            assert tablet_count > old_tablet_count
            logger.info("Split increased number of tablets from {} to {}".format(old_tablet_count, tablet_count))

            # Allow shuffling of tablet replicas to make co-location work harder
            await inject_error_on(manager, "tablet_allocator_shuffle", servers)
            # This will allow us to simulate some balancing after co-location with shuffling, to make sure that
            # balancer won't break co-location.
            await inject_error_on(manager, "tablet_merge_completion_bypass", servers)

            logger.info("Deleting data")
            # Delete almost all keys, enough to trigger a few merges.
            delete_keys = range(total_keys - 1)
            await asyncio.gather(*[cql.run_async(f"DELETE FROM {ks}.test WHERE pk={k};") for k in delete_keys])
            # Only the last key survives; check() compares against this range from now on.
            keys = range(total_keys - 1, total_keys)

            await disable_injection_on(manager, "tablet_allocator_shuffle", servers)

            # To avoid race of major with migration
            await manager.api.disable_tablet_balancing(servers[0].ip_addr)

            logger.info("Flushing keyspace and performing major")
            for server in servers:
                await manager.api.flush_keyspace(server.ip_addr, ks)
                await manager.api.keyspace_compaction(server.ip_addr, ks)
            await manager.api.enable_tablet_balancing(servers[0].ip_addr)

            logger.info("Waiting for merge decision")
            await s1_log.wait_for("Emitting resize decision of type merge", from_mark=s1_mark)
            # Waits for balancer to co-locate sibling tablets
            await s1_log.wait_for("All sibling tablets are co-located")
            # Do some shuffling to make sure balancer works with co-located tablets
            await inject_error_on(manager, "tablet_allocator_shuffle", servers)

            old_tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')

            # Second node replacement of the cycle, this time concurrent with the merge.
            topology_ops_task = asyncio.create_task(perform_topology_ops())

            await inject_error_on(manager, "replica_merge_completion_wait", servers)
            await disable_injection_on(manager, "tablet_merge_completion_bypass", servers)
            await disable_injection_on(manager, "tablet_allocator_shuffle", servers)

            await s1_log.wait_for('Detected tablet merge for table', from_mark=s1_mark)
            await s1_log.wait_for('Merge completion fiber finished', from_mark=s1_mark)

            logger.info("Waiting for topology ops")
            await topology_ops_task

            tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
            assert tablet_count < old_tablet_count
            logger.info("Merge decreased number of tablets from {} to {}".format(old_tablet_count, tablet_count))
            await check()

            logger.info("Flushing keyspace and performing major")
            for server in servers:
                await manager.api.flush_keyspace(server.ip_addr, ks)
                await manager.api.keyspace_compaction(server.ip_addr, ks)
            await check()
@pytest.mark.parametrize("racks", [2, 3])
@pytest.mark.asyncio
@skip_mode('release', 'error injections are not supported in release mode')
async def test_tablet_merge_cross_rack_migrations(manager: ManagerClient, racks):
    """Split and then merge a table on a multi-rack cluster with cross-rack migration attempts forbidden.

    The cluster has ``racks`` racks of three nodes each in datacenter ``mydc``,
    and the keyspace replication factor equals the number of racks. The
    ``forbid_cross_rack_migration_attempt`` injection is enabled on all servers
    before any data is written. The test then waits (up to 120 seconds each)
    for the tablet count to reach at least 16, and, after deleting all rows
    but one, for it to drop below the value observed right after the delete.
    """
    config = {'error_injections_at_startup': ['short_tablet_stats_refresh_interval']}
    cmdline = ['--target-tablet-size-in-bytes', '30000']

    rf = racks
    servers = []
    for rack_number in range(1, racks + 1):
        placement = {'dc': 'mydc', 'rack': f'rack{rack_number}'}
        rack_servers = await manager.servers_add(3, config=config, cmdline=cmdline, property_file=placement)
        servers.extend(rack_servers)

    async def flush_everywhere(*, then_compact):
        # Flush the keyspace on each server in turn, optionally following each
        # flush with a keyspace compaction on that same server.
        for node in servers:
            await manager.api.flush_keyspace(node.ip_addr, ks)
            if then_compact:
                await manager.api.keyspace_compaction(node.ip_addr, ks)

    cql = manager.get_cql()
    ks = await create_new_test_keyspace(cql, f"WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': {rf}}} AND tablets = {{'initial': 1}}")
    await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c blob) WITH compression = {{'sstable_compression': ''}};")

    await inject_error_on(manager, "forbid_cross_rack_migration_attempt", servers)

    total_keys = 400
    keys = range(total_keys)
    insert = cql.prepare(f"INSERT INTO {ks}.test(pk, c) VALUES(?, ?)")
    for pk in keys:
        cql.execute(insert, [pk, random.randbytes(2000)])

    await flush_everywhere(then_compact=False)

    async def finished_splitting():
        # FIXME: fragile since it's expecting on-disk size will be enough to produce a few splits.
        #   (raw_data=800k / target_size=30k) = ~26, lower power-of-two is 16. Compression was disabled.
        #   Per-table hints (min_tablet_count) can be used to improve this.
        current = await get_tablet_count(manager, servers[0], ks, 'test')
        return True if current >= 16 else None

    # Give enough time for split to happen in debug mode
    split_deadline = time.time() + 120
    await wait_for(finished_splitting, split_deadline)

    delete_keys = range(total_keys - 1)
    deletions = [cql.run_async(f"DELETE FROM {ks}.test WHERE pk={k};") for k in delete_keys]
    await asyncio.gather(*deletions)
    # Only the last key remains in the table from here on.
    keys = range(total_keys - 1, total_keys)

    old_tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')

    await flush_everywhere(then_compact=True)

    async def finished_merging():
        current = await get_tablet_count(manager, servers[0], ks, 'test')
        return True if current < old_tablet_count else None

    merge_deadline = time.time() + 120
    await wait_for(finished_merging, merge_deadline)
# Reproduces use-after-free when migration right after merge, but concurrently to background
# merge completion handler.
# See: https://github.com/scylladb/scylladb/issues/24045
@pytest.mark.asyncio
@skip_mode('release', 'error injections are not supported in release mode')
async def test_migration_running_concurrently_to_merge_completion_handling(manager: ManagerClient):
    """Interleave a tablet migration with the background merge completion step and verify no rows are lost.

    Sequence enforced through error injections and log waits on the first server:
      1. A 2-tablet table with 100 rows is merged down to 1 tablet.
      2. The test waits for "merge_completion_fiber: waiting" to be logged, then
         starts migrating the single tablet to a second node.
      3. Once "take_storage_snapshot: waiting" is logged, the
         "merge_completion_fiber" injection is messaged and the test waits for
         "Merge completion fiber finished".
      4. Only then the "take_storage_snapshot" injection is messaged, the
         migration is awaited, and the row count is checked.
    """
    cmdline = []
    cfg = {}
    servers = [await manager.server_add(cmdline=cmdline, config=cfg)]

    await manager.api.disable_tablet_balancing(servers[0].ip_addr)

    cql = manager.get_cql()

    async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 2}") as ks:
        await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);")

        tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
        assert tablet_count == 2
        old_tablet_count = tablet_count

        keys = range(100)
        await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys])

        # Lowering the initial tablet count is what requests the merge from 2 to 1.
        await cql.run_async(f"ALTER KEYSPACE {ks} WITH tablets = {{'initial': 1}};")

        s0_log = await manager.server_open_log(servers[0].server_id)
        s0_mark = await s0_log.mark()

        await manager.api.enable_injection(servers[0].ip_addr, "merge_completion_fiber", one_shot=True)
        await manager.api.enable_injection(servers[0].ip_addr, "replica_merge_completion_wait", one_shot=True)
        await manager.api.enable_tablet_balancing(servers[0].ip_addr)

        servers.append(await manager.server_add(cmdline=cmdline, config=cfg))
        s1_host_id = await manager.get_host_id(servers[1].server_id)

        async def finished_merging():
            tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
            return tablet_count < old_tablet_count or None

        await wait_for(finished_merging, time.time() + 120)

        # Balancing is turned off again so that the only migration is the one
        # started explicitly below.
        await manager.api.disable_tablet_balancing(servers[0].ip_addr)

        await manager.api.enable_injection(servers[0].ip_addr, "take_storage_snapshot", one_shot=True)

        await s0_log.wait_for("merge_completion_fiber: waiting", from_mark=s0_mark)

        tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
        assert tablet_count == 1

        tablet_token = 0  # Doesn't matter since there is one tablet
        replica = await get_tablet_replica(manager, servers[0], ks, 'test', tablet_token)
        src_shard = replica[1]
        dst_shard = src_shard

        # Started as a task: the move is awaited only after both injections have been messaged.
        migration = asyncio.create_task(manager.api.move_tablet(servers[0].ip_addr, ks, "test", replica[0], src_shard, s1_host_id, dst_shard, tablet_token))

        await s0_log.wait_for("take_storage_snapshot: waiting", from_mark=s0_mark)

        # Merge completion is messaged first and waited for; the snapshot
        # injection is messaged only afterwards.
        await manager.api.message_injection(servers[0].ip_addr, "merge_completion_fiber")
        await s0_log.wait_for("Merge completion fiber finished", from_mark=s0_mark)
        await manager.api.message_injection(servers[0].ip_addr, "take_storage_snapshot")

        await migration

        rows = await cql.run_async(f"SELECT * FROM {ks}.test;")
        assert len(rows) == len(keys)