From cb329b10bf215da0b4e2c72f7065596b719d5196 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov
Date: Wed, 4 Feb 2026 18:25:46 +0300
Subject: [PATCH] code: Add maintenance/maintenance group

And move some activities from the streaming group into it, namely:

- tablet_allocator background group
- sstables_manager's components reclaimer
- tablet storage group manager's merge completion fiber
- prometheus

All other activity that was in the streaming group remains there, but
can be moved to this group (or to a new maintenance subgroup) later.

All but prometheus are patched here; prometheus still uses the
maintenance_scheduling_group variable in main.cc, so it transparently
moves into the new group.

Signed-off-by: Pavel Emelyanov
---
 CMakeLists.txt      | 2 +-
 configure.py        | 2 +-
 main.cc             | 9 +++++----
 replica/database.cc | 7 +++++--
 replica/database.hh | 3 +++
 replica/table.cc    | 2 +-
 6 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 58461a49aa..742b984f11 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -96,7 +96,7 @@ else()
   set(Seastar_EXCLUDE_APPS_FROM_ALL ON CACHE BOOL "" FORCE)
   set(Seastar_EXCLUDE_TESTS_FROM_ALL ON CACHE BOOL "" FORCE)
   set(Seastar_IO_URING ON CACHE BOOL "" FORCE)
-  set(Seastar_SCHEDULING_GROUPS_COUNT 23 CACHE STRING "" FORCE)
+  set(Seastar_SCHEDULING_GROUPS_COUNT 24 CACHE STRING "" FORCE)
   set(Seastar_UNUSED_RESULT_ERROR ON CACHE BOOL "" FORCE)
   add_subdirectory(seastar)
   target_compile_definitions (seastar
diff --git a/configure.py b/configure.py
index 924cfb832f..4c89d27a5a 100755
--- a/configure.py
+++ b/configure.py
@@ -2148,7 +2148,7 @@ def configure_seastar(build_dir, mode, mode_config, compiler_cache=None):
         '-DSeastar_DEPRECATED_OSTREAM_FORMATTERS=OFF',
         '-DSeastar_UNUSED_RESULT_ERROR=ON',
         '-DCMAKE_EXPORT_COMPILE_COMMANDS=ON',
-        '-DSeastar_SCHEDULING_GROUPS_COUNT=23',
+        '-DSeastar_SCHEDULING_GROUPS_COUNT=24',
         '-DSeastar_IO_URING=ON',
     ]
diff --git a/main.cc b/main.cc
index 89e34674dc..46f37bd541 100644
--- a/main.cc
+++ b/main.cc
@@ -945,8 +945,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
     auto maintenance_supergroup = create_scheduling_supergroup(200).get();
     auto bandwidth_updater = io_throughput_updater("maintenance supergroup", maintenance_supergroup,
             cfg->maintenance_io_throughput_mb_per_sec.is_set() ? cfg->maintenance_io_throughput_mb_per_sec : cfg->stream_io_throughput_mb_per_sec);
-    auto maintenance_scheduling_group = create_scheduling_group("streaming", "strm", 200, maintenance_supergroup).get();
-    debug::streaming_scheduling_group = maintenance_scheduling_group;
+    auto maintenance_scheduling_group = create_scheduling_group("maintenance", "mant", 200, maintenance_supergroup).get();
 
     smp::invoke_on_all([&cfg, background_reclaim_scheduling_group] {
         logalloc::tracker::config st_cfg;
@@ -1186,7 +1185,9 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
     dbcfg.compaction_scheduling_group = create_scheduling_group("compaction", "comp", 1000).get();
     dbcfg.maintenance_compaction_scheduling_group = create_scheduling_group("maintenance_compaction", "manc", 200, maintenance_supergroup).get();
     dbcfg.memory_compaction_scheduling_group = create_scheduling_group("mem_compaction", "mcmp", 1000).get();
-    dbcfg.streaming_scheduling_group = maintenance_scheduling_group;
+    dbcfg.streaming_scheduling_group = create_scheduling_group("streaming", "strm", 200, maintenance_supergroup).get();
+    debug::streaming_scheduling_group = dbcfg.streaming_scheduling_group;
+    dbcfg.maintenance_scheduling_group = maintenance_scheduling_group;
     dbcfg.statement_scheduling_group = create_scheduling_group("statement", "stmt", 1000, user_ssg).get();
     dbcfg.memtable_scheduling_group = create_scheduling_group("memtable", "mt", 1000).get();
     dbcfg.memtable_to_cache_scheduling_group = create_scheduling_group("memtable_to_cache", "mt2c", 200).get();
@@ -1761,7 +1762,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
 
         checkpoint(stop_signal, "starting tablet allocator");
         service::tablet_allocator::config tacfg {
-            .background_sg = dbcfg.streaming_scheduling_group,
+            .background_sg = dbcfg.maintenance_scheduling_group,
         };
         sharded<service::tablet_allocator> tablet_allocator;
         tablet_allocator.start(tacfg, std::ref(mm_notifier), std::ref(db)).get();
diff --git a/replica/database.cc b/replica/database.cc
index 50fe4fc49b..47e21943dd 100644
--- a/replica/database.cc
+++ b/replica/database.cc
@@ -460,8 +460,8 @@ database::database(const db::config& cfg, database_config dbcfg, service::migrat
     , _nop_large_data_handler(std::make_unique<db::nop_large_data_handler>())
     , _corrupt_data_handler(std::make_unique<db::system_table_corrupt_data_handler>(db::system_table_corrupt_data_handler::config{.entry_ttl = std::chrono::days(10)}, db::corrupt_data_handler::register_metrics::yes))
     , _nop_corrupt_data_handler(std::make_unique<db::nop_corrupt_data_handler>(db::corrupt_data_handler::register_metrics::no))
-    , _user_sstables_manager(std::make_unique<sstables::sstables_manager>("user", *_large_data_handler, *_corrupt_data_handler, configure_sstables_manager(_cfg, dbcfg), feat, _row_cache_tracker, sst_dir_sem, [&stm]{ return stm.get()->get_my_id(); }, scf, abort, _cfg.extensions().sstable_file_io_extensions(), dbcfg.streaming_scheduling_group, &sstm))
-    , _system_sstables_manager(std::make_unique<sstables::sstables_manager>("system", *_nop_large_data_handler, *_nop_corrupt_data_handler, configure_sstables_manager(_cfg, dbcfg), feat, _row_cache_tracker, sst_dir_sem, [&stm]{ return stm.get()->get_my_id(); }, scf, abort, _cfg.extensions().sstable_file_io_extensions(), dbcfg.streaming_scheduling_group))
+    , _user_sstables_manager(std::make_unique<sstables::sstables_manager>("user", *_large_data_handler, *_corrupt_data_handler, configure_sstables_manager(_cfg, dbcfg), feat, _row_cache_tracker, sst_dir_sem, [&stm]{ return stm.get()->get_my_id(); }, scf, abort, _cfg.extensions().sstable_file_io_extensions(), dbcfg.maintenance_scheduling_group, &sstm))
+    , _system_sstables_manager(std::make_unique<sstables::sstables_manager>("system", *_nop_large_data_handler, *_nop_corrupt_data_handler, configure_sstables_manager(_cfg, dbcfg), feat, _row_cache_tracker, sst_dir_sem, [&stm]{ return stm.get()->get_my_id(); }, scf, abort, _cfg.extensions().sstable_file_io_extensions(), dbcfg.maintenance_scheduling_group))
     , _result_memory_limiter(dbcfg.available_memory / 10)
     , _data_listeners(std::make_unique<db::data_listeners>())
     , _mnotifier(mn)
@@ -1570,6 +1570,7 @@ keyspace::make_column_family_config(const schema& s, const database& db) const {
     cfg.memtable_scheduling_group = _config.memtable_scheduling_group;
     cfg.memtable_to_cache_scheduling_group = _config.memtable_to_cache_scheduling_group;
     cfg.streaming_scheduling_group = _config.streaming_scheduling_group;
+    cfg.maintenance_scheduling_group = _config.maintenance_scheduling_group;
     cfg.enable_metrics_reporting = db_config.enable_keyspace_column_family_metrics();
     cfg.enable_node_aggregated_table_metrics = db_config.enable_node_aggregated_table_metrics();
     cfg.tombstone_warn_threshold = db_config.tombstone_warn_threshold();
@@ -1712,6 +1713,7 @@ request_class classify_request(const database_config& _dbcfg) {
     // Requests done on behalf of view update generation run in the streaming group
     } else if (current_group == _dbcfg.streaming_scheduling_group
                || current_group == _dbcfg.backup_scheduling_group
+               || current_group == _dbcfg.maintenance_scheduling_group
                || current_group == _dbcfg.maintenance_compaction_scheduling_group) {
         return request_class::maintenance;
     // Everything else is considered a user request
@@ -2521,6 +2523,7 @@ database::make_keyspace_config(const keyspace_metadata& ksm, system_keyspace is_
     cfg.memtable_scheduling_group = _dbcfg.memtable_scheduling_group;
     cfg.memtable_to_cache_scheduling_group = _dbcfg.memtable_to_cache_scheduling_group;
     cfg.streaming_scheduling_group = _dbcfg.streaming_scheduling_group;
+    cfg.maintenance_scheduling_group = _dbcfg.maintenance_scheduling_group;
     cfg.enable_metrics_reporting = _cfg.enable_keyspace_column_family_metrics();
     cfg.view_update_memory_semaphore_limit = max_memory_pending_view_updates();
 
diff --git a/replica/database.hh b/replica/database.hh
index 3b66d708be..b7f3b639a7 100644
--- a/replica/database.hh
+++ b/replica/database.hh
@@ -470,6 +470,7 @@ public:
         seastar::scheduling_group memtable_to_cache_scheduling_group;
         seastar::scheduling_group memory_compaction_scheduling_group;
         seastar::scheduling_group streaming_scheduling_group;
+        seastar::scheduling_group maintenance_scheduling_group;
         bool enable_metrics_reporting = false;
         bool enable_node_aggregated_table_metrics = true;
         size_t view_update_memory_semaphore_limit;
@@ -1456,6 +1457,7 @@ public:
         seastar::scheduling_group memtable_to_cache_scheduling_group;
         seastar::scheduling_group memory_compaction_scheduling_group;
         seastar::scheduling_group streaming_scheduling_group;
+        seastar::scheduling_group maintenance_scheduling_group;
         bool enable_metrics_reporting = false;
         size_t view_update_memory_semaphore_limit;
     };
@@ -1536,6 +1538,7 @@ struct database_config {
     seastar::scheduling_group memory_compaction_scheduling_group;
     seastar::scheduling_group statement_scheduling_group;
     seastar::scheduling_group streaming_scheduling_group;
+    seastar::scheduling_group maintenance_scheduling_group;
     seastar::scheduling_group gossip_scheduling_group;
     seastar::scheduling_group commitlog_scheduling_group;
     seastar::scheduling_group schema_commitlog_scheduling_group;
diff --git a/replica/table.cc b/replica/table.cc
index 426b94399e..294a9ab8bf 100644
--- a/replica/table.cc
+++ b/replica/table.cc
@@ -3313,7 +3313,7 @@ void tablet_storage_group_manager::handle_tablet_split_completion(const locator:
 }
 
 future<> tablet_storage_group_manager::merge_completion_fiber() {
-    co_await coroutine::switch_to(_t.get_config().streaming_scheduling_group);
+    co_await coroutine::switch_to(_t.get_config().maintenance_scheduling_group);
     while (!_t.async_gate().is_closed()) {
         try {
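
For reference, below is a minimal, self-contained Seastar sketch of the pattern this patch applies in merge_completion_fiber() and the tablet allocator: create a dedicated maintenance scheduling group and have a background fiber switch itself into it via coroutine::switch_to(), so its CPU time is accounted to that group rather than to the streaming group. The names (background_fiber, mlog, "maintenance-demo") and the fiber body are illustrative assumptions, not ScyllaDB code; only the 200-share weight mirrors the patch.

// Minimal sketch (illustrative, not ScyllaDB code): a background fiber that
// moves itself into a dedicated "maintenance" scheduling group.

#include <seastar/core/app-template.hh>
#include <seastar/core/coroutine.hh>
#include <seastar/core/scheduling.hh>
#include <seastar/core/sleep.hh>
#include <seastar/coroutine/switch_to.hh>
#include <seastar/util/log.hh>

using namespace std::chrono_literals;

static seastar::logger mlog("maintenance-demo");

// The fiber is started from whatever group its caller runs in and switches
// itself to the maintenance group before doing any work, the same way
// merge_completion_fiber() does after this patch.
seastar::future<> background_fiber(seastar::scheduling_group maintenance_sg) {
    co_await seastar::coroutine::switch_to(maintenance_sg);
    for (int i = 0; i < 3; ++i) {
        mlog.info("iteration {} in scheduling group '{}'", i, seastar::current_scheduling_group().name());
        co_await seastar::sleep(100ms);
    }
}

int main(int argc, char** argv) {
    seastar::app_template app;
    return app.run(argc, argv, [] () -> seastar::future<> {
        // 200 shares, the same weight the patch gives the maintenance-related groups.
        auto maintenance_sg = co_await seastar::create_scheduling_group("maintenance", 200);
        co_await background_fiber(maintenance_sg);
    });
}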