From dc55a566fa288725d82af7f5c755ee4ddb6c4405 Mon Sep 17 00:00:00 2001 From: Ferenc Szili Date: Wed, 15 Jan 2025 15:32:15 +0100 Subject: [PATCH 1/2] table: run set_split_mode() on all storage groups during all_storage_groups_split() tablet_storage_group_manager::all_storage_groups_split() calls set_split_mode() for each of its storage groups to create split ready compaction groups. It does this by iterating through storage groups using std::ranges::all_of() which is not guaranteed to iterate through the entire range, and will stop iterating on the first occurrence of the predicate (set_split_mode()) returning false. set_split_mode() creates the split compaction groups and returns false if the storage group's main compaction group or merging groups are not empty. This means that in cases where the tablet storage group manager has non-empty storage groups, we could have a situation where split compaction groups are not created for all storage groups. The missing split compaction groups are later created in tablet_storage_group_manager::split_all_storage_groups() which also calls set_split_mode(), and that is the reason why split completes successfully. The problem is that tablet_storage_group_manager::all_storage_groups_split() runs under a group0 guard, and tablet_storage_group_manager::split_all_storage_groups() does not. This can cause problems with operations which should exclude with compaction group creation, e.g. 
DROP TABLE/DROP KEYSPACE (cherry picked from commit 24e8d2a55c72b8c2c2463fee525a57a406e58ea1) --- replica/table.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/replica/table.cc b/replica/table.cc index c3a0adddbe..fac90171cd 100644 --- a/replica/table.cc +++ b/replica/table.cc @@ -1029,8 +1029,10 @@ bool tablet_storage_group_manager::all_storage_groups_split() { return true; } - auto split_ready = std::ranges::all_of(_storage_groups | std::views::values, - std::mem_fn(&storage_group::set_split_mode)); + bool split_ready = true; + for (const storage_group_ptr& sg : _storage_groups | std::views::values) { + split_ready &= sg->set_split_mode(); + } // The table replica will say to coordinator that its split status is ready by // mirroring the sequence number from tablet metadata into its local state, From fe869fd902323f636075de4f6b6edfd6967c0075 Mon Sep 17 00:00:00 2001 From: Ferenc Szili Date: Wed, 15 Jan 2025 16:23:07 +0100 Subject: [PATCH 2/2] test: add reproducer and test for fix to split ready CG creation This adds a reproducer for #22431 In cases where a tablet storage group manager had more than one storage group, it was possible to create compaction groups outside the group0 guard, which could create problems with operations which should exclude with compaction group creation. 
(cherry picked from commit 8bff7786a8041a430e707af65452e9211642e31f) --- replica/database.cc | 6 ++++ replica/table.cc | 6 ++++ test/topology_custom/test_tablets.py | 51 ++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+) diff --git a/replica/database.cc b/replica/database.cc index 4a8579446b..1c70173f61 100644 --- a/replica/database.cc +++ b/replica/database.cc @@ -2557,6 +2557,12 @@ future<> database::truncate_table_on_all_shards(sharded& sharded_db, s }); }); + co_await utils::get_local_injector().inject("truncate_compaction_disabled_wait", [] (auto& handler) -> future<> { + dblog.info("truncate_compaction_disabled_wait: wait"); + co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::minutes{5}); + dblog.info("truncate_compaction_disabled_wait: done"); + }, false); + const auto should_flush = with_snapshot && cf.can_flush(); dblog.trace("{} {}.{} and views on all shards", should_flush ? "Flushing" : "Clearing", s->ks_name(), s->cf_name()); std::function(replica::table&)> flush_or_clear = should_flush ? 
diff --git a/replica/table.cc b/replica/table.cc index fac90171cd..c155d5f0ec 100644 --- a/replica/table.cc +++ b/replica/table.cc @@ -1060,6 +1060,12 @@ sstables::compaction_type_options::split tablet_storage_group_manager::split_com future<> tablet_storage_group_manager::split_all_storage_groups(tasks::task_info tablet_split_task_info) { sstables::compaction_type_options::split opt = split_compaction_options(); + co_await utils::get_local_injector().inject("split_storage_groups_wait", [] (auto& handler) -> future<> { + dblog.info("split_storage_groups_wait: waiting"); + co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::minutes{5}); + dblog.info("split_storage_groups_wait: done"); + }, false); + co_await for_each_storage_group_gently([opt, tablet_split_task_info] (storage_group& storage_group) { return storage_group.split(opt, tablet_split_task_info); }); diff --git a/test/topology_custom/test_tablets.py b/test/topology_custom/test_tablets.py index 268cabb797..4da4a020f0 100644 --- a/test/topology_custom/test_tablets.py +++ b/test/topology_custom/test_tablets.py @@ -754,3 +754,54 @@ async def test_replace_with_no_normal_token_owners_in_dc(manager: ManagerClient, assert len(rows) == len(keys) for r in rows: assert r.c == r.pk + +@pytest.mark.asyncio +@skip_mode('release', 'error injections are not supported in release mode') +async def test_drop_keyspace_while_split(manager: ManagerClient): + + # Reproducer for: https://github.com/scylladb/scylladb/issues/22431 + # This tests if the split ready compaction groups are correctly created + # on a shard with several storage groups for the same table + + logger.info("Bootstrapping cluster") + cmdline = [ '--target-tablet-size-in-bytes', '8192', + '--smp', '2' ] + config = { 'error_injections_at_startup': ['short_tablet_stats_refresh_interval'] } + servers = [await manager.server_add(config=config, cmdline=cmdline)] + + s0_log = await manager.server_open_log(servers[0].server_id) + + cql = 
manager.get_cql() await wait_for_cql_and_get_hosts(cql, [servers[0]], time.time() + 60) await manager.api.disable_tablet_balancing(servers[0].ip_addr) # create a table so that it has at least 2 tablets (and storage groups) per shard await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 4};") await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") await manager.api.disable_autocompaction(servers[0].ip_addr, 'test') keys = range(2048) await asyncio.gather(*[cql.run_async(f'INSERT INTO test.test (pk, c) VALUES ({k}, {k});') for k in keys]) await manager.api.flush_keyspace(servers[0].ip_addr, 'test') await manager.api.enable_injection(servers[0].ip_addr, 'truncate_compaction_disabled_wait', one_shot=False) await manager.api.enable_injection(servers[0].ip_addr, 'split_storage_groups_wait', one_shot=False) # enable the load balancer which should emit a tablet split await manager.api.enable_tablet_balancing(servers[0].ip_addr) # wait for compaction groups to be created and split to begin await s0_log.wait_for('split_storage_groups_wait: wait') # start a DROP and wait for it to disable compaction drop_ks_task = cql.run_async('DROP KEYSPACE test;') await s0_log.wait_for('truncate_compaction_disabled_wait: wait') # release split await manager.api.message_injection(servers[0].ip_addr, "split_storage_groups_wait") # release drop and wait for it to complete await manager.api.message_injection(servers[0].ip_addr, "truncate_compaction_disabled_wait") await drop_ks_task