From cccdd6aaaeb85fe66dfeb417e369ecb50343afb3 Mon Sep 17 00:00:00 2001 From: "Raphael S. Carvalho" Date: Fri, 12 Mar 2021 13:22:23 -0300 Subject: [PATCH] compaction_manager: Fix performance of cleanup compaction due to unlimited parallelism Prior to 463d0ab, only one table could be cleaned up at a time on a given shard. Since then, all tables belonging to a given keyspace are cleaned up in parallel. Cleanup serialization on each shard was enforced with a semaphore, which was incorrectly removed by the patch aforementioned. So space requirement for cleanup to succeed can be up to the size of keyspace, increasing the chances of node running out of space. Node could also run out of memory if there are tons of tables in the keyspace. Memory requirement is at least #_of_tables * 128k (not taking into account write behind, etc). With 5k tables, it's ~0.64G per shard. Also all tables being cleaned up in parallel will compete for the same disk and cpu bandwidth, so making them all much slower, and consequently the operation time is significantly higher. This problem was detected with cleanup, but scrub and upgrade go through the same rewrite procedure, so they're affected by exact the same problem. Fixes #8247. Signed-off-by: Raphael S. Carvalho Message-Id: <20210312162223.149993-1-raphaelsc@scylladb.com> (cherry picked from commit 71712448444989f0ecbd017e335338e2f5dc478c) --- sstables/compaction_manager.cc | 16 +++++++++------- sstables/compaction_manager.hh | 1 + 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/sstables/compaction_manager.cc b/sstables/compaction_manager.cc index fb05f126f5..4f0b12eb67 100644 --- a/sstables/compaction_manager.cc +++ b/sstables/compaction_manager.cc @@ -673,13 +673,15 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa compacting->release_compacting(exhausted_sstables); }; - _stats.pending_tasks--; - _stats.active_tasks++; - task->compaction_running = true; - compaction_backlog_tracker user_initiated(std::make_unique(_compaction_controller.backlog_of_shares(200), _available_memory)); - return do_with(std::move(user_initiated), [this, &cf, descriptor = std::move(descriptor)] (compaction_backlog_tracker& bt) mutable { - return with_scheduling_group(_scheduling_group, [this, &cf, descriptor = std::move(descriptor)] () mutable { - return cf.run_compaction(std::move(descriptor)); + return with_semaphore(_rewrite_sstables_sem, 1, [this, task, &cf, descriptor = std::move(descriptor)] () mutable { + _stats.pending_tasks--; + _stats.active_tasks++; + task->compaction_running = true; + compaction_backlog_tracker user_initiated(std::make_unique(_compaction_controller.backlog_of_shares(200), _available_memory)); + return do_with(std::move(user_initiated), [this, &cf, descriptor = std::move(descriptor)] (compaction_backlog_tracker& bt) mutable { + return with_scheduling_group(_scheduling_group, [this, &cf, descriptor = std::move(descriptor)]() mutable { + return cf.run_compaction(std::move(descriptor)); + }); }); }).then_wrapped([this, task, compacting] (future<> f) mutable { task->compaction_running = false; diff --git a/sstables/compaction_manager.hh b/sstables/compaction_manager.hh index e499bbbabc..8193e8ef17 100644 --- a/sstables/compaction_manager.hh +++ b/sstables/compaction_manager.hh @@ -110,6 +110,7 @@ private: std::unordered_map _compaction_locks; semaphore _custom_job_sem{1}; + seastar::named_semaphore _rewrite_sstables_sem = {1, named_semaphore_exception_factory{"rewrite sstables"}}; std::function compaction_submission_callback(); // all registered column families are submitted for compaction at a constant interval.