From ea3615de1edd1904d81aef6a847952bd0385cf83 Mon Sep 17 00:00:00 2001 From: "Raphael S. Carvalho" Date: Fri, 22 May 2026 15:11:52 -0300 Subject: [PATCH] compaction: fail resharding when out of space prevention is activated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When out-of-space prevention is activated, the compaction manager is drained and disabled. This caused resharding to silently succeed without actually processing any SSTables, because: 1. run_custom_job() calls start_compaction() which returns nullopt when is_disabled() is true, and run_custom_job() would just return immediately — appearing as a successful no-op. 2. reshard() used throw_if_stopping::no, so even within the compaction task executor, stopping would be silently swallowed rather than propagated as an exception. The SSTable loader interprets a successful return from resharding as "all SSTables processed", so it proceeds without error, leaving the unprocessed SSTables orphaned and their data missing from the table. Fix this with two changes: - run_custom_job(): when start_compaction() returns nullopt, check is_disabled() and throw via make_disabled_exception() rather than returning silently. This ensures callers are always informed when a job was skipped because compaction is disabled (e.g. due to disk space pressure), as opposed to a benign skip (e.g. table removed). - reshard(): change throw_if_stopping::no to throw_if_stopping::yes. Resharding is mandatory for correct SSTable loading — unlike reshape which is optional and can be safely skipped, resharding failure must be propagated to the caller so the loader does not proceed with incomplete data. Fixes https://scylladb.atlassian.net/browse/SCYLLADB-2085. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Raphael S. Carvalho Closes scylladb/scylladb#30041 --- compaction/compaction_manager.cc | 3 +++ compaction/task_manager_module.cc | 8 ++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/compaction/compaction_manager.cc b/compaction/compaction_manager.cc index f9a6b69f19..c070c26159 100644 --- a/compaction/compaction_manager.cc +++ b/compaction/compaction_manager.cc @@ -710,6 +710,9 @@ protected: future<> compaction_manager::run_custom_job(compaction_group_view& t, compaction_type type, const char* desc, noncopyable_function(compaction_data&, compaction_progress_monitor&)> job, tasks::task_info info, throw_if_stopping do_throw_if_stopping) { auto gh = start_compaction(t); if (!gh) { + if (is_disabled()) { + co_return co_await coroutine::return_exception_ptr(make_disabled_exception(t)); + } co_return; } diff --git a/compaction/task_manager_module.cc b/compaction/task_manager_module.cc index 81456edd9b..09edc5bcce 100644 --- a/compaction/task_manager_module.cc +++ b/compaction/task_manager_module.cc @@ -170,8 +170,12 @@ future<> reshard(sstables::sstable_directory& dir, sstables::sstable_directory:: on_internal_error(tasks::tmlogger, format("No compaction group found for table {}.{}", table.schema()->ks_name(), table.schema()->cf_name())); } auto& t = cg->view_for_unrepaired_data(); + // Resharding is mandatory for SSTable loading — if it cannot proceed (e.g. due to + // out-of-space prevention disabling compaction), we must fail loudly rather than + // silently skipping and leaving SSTables orphaned. + auto& cm = table.get_compaction_manager(); co_await coroutine::parallel_for_each(buckets, [&] (std::vector& sstlist) mutable { - return table.get_compaction_manager().run_custom_job(t, compaction_type::Reshard, "Reshard compaction", [&] (compaction_data& info, compaction_progress_monitor& progress_monitor) -> future<> { + return cm.run_custom_job(t, compaction_type::Reshard, "Reshard compaction", [&] (compaction_data& info, compaction_progress_monitor& progress_monitor) -> future<> { auto erm = table.get_effective_replication_map(); // keep alive around compaction. compaction_descriptor desc(sstlist); @@ -184,7 +188,7 @@ future<> reshard(sstables::sstable_directory& dir, sstables::sstable_directory:: // input sstables are moved, to guarantee their resources are released once we're done // resharding them. co_await when_all_succeed(dir.collect_output_unshared_sstables(std::move(result.new_sstables), sstables::sstable_directory::can_be_remote::yes), dir.remove_sstables(std::move(sstlist))).discard_result(); - }, parent_info, throw_if_stopping::no); + }, parent_info, throw_if_stopping::yes); }); }