From cccdd6aaaeb85fe66dfeb417e369ecb50343afb3 Mon Sep 17 00:00:00 2001
From: "Raphael S. Carvalho" <raphaelsc@scylladb.com>
Date: Fri, 12 Mar 2021 13:22:23 -0300
Subject: [PATCH] compaction_manager: Fix performance of cleanup compaction due
 to unlimited parallelism

Prior to 463d0ab, only one table could be cleaned up at a time on a given shard.
Since then, all tables belonging to a given keyspace are cleaned up in parallel.
Cleanup serialization on each shard was enforced with a semaphore, which was
incorrectly removed by the patch aforementioned.

So space requirement for cleanup to succeed can be up to the size of keyspace,
increasing the chances of node running out of space.

Node could also run out of memory if there are tons of tables in the keyspace.
Memory requirement is at least #_of_tables * 128k (not taking into account write
behind, etc). With 5k tables, it's ~0.64G per shard.

Also all tables being cleaned up in parallel will compete for the same
disk and cpu bandwidth, so making them all much slower, and consequently
the operation time is significantly higher.

This problem was detected with cleanup, but scrub and upgrade go through the
same rewrite procedure, so they're affected by exact the same problem.

Fixes #8247.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20210312162223.149993-1-raphaelsc@scylladb.com>
(cherry picked from commit 71712448444989f0ecbd017e335338e2f5dc478c)
---
 sstables/compaction_manager.cc | 16 +++++++++-------
 sstables/compaction_manager.hh |  1 +
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/sstables/compaction_manager.cc b/sstables/compaction_manager.cc
index fb05f126f5..4f0b12eb67 100644
--- a/sstables/compaction_manager.cc
+++ b/sstables/compaction_manager.cc
@@ -673,13 +673,15 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
                 compacting->release_compacting(exhausted_sstables);
             };
 
-            _stats.pending_tasks--;
-            _stats.active_tasks++;
-            task->compaction_running = true;
-            compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_compaction_controller.backlog_of_shares(200), _available_memory));
-            return do_with(std::move(user_initiated), [this, &cf, descriptor = std::move(descriptor)] (compaction_backlog_tracker& bt) mutable {
-                return with_scheduling_group(_scheduling_group, [this, &cf, descriptor = std::move(descriptor)] () mutable {
-                    return cf.run_compaction(std::move(descriptor));
+            return with_semaphore(_rewrite_sstables_sem, 1, [this, task, &cf, descriptor = std::move(descriptor)] () mutable {
+                _stats.pending_tasks--;
+                _stats.active_tasks++;
+                task->compaction_running = true;
+                compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_compaction_controller.backlog_of_shares(200), _available_memory));
+                return do_with(std::move(user_initiated), [this, &cf, descriptor = std::move(descriptor)] (compaction_backlog_tracker& bt) mutable {
+                    return with_scheduling_group(_scheduling_group, [this, &cf, descriptor = std::move(descriptor)]() mutable {
+                        return cf.run_compaction(std::move(descriptor));
+                    });
                 });
             }).then_wrapped([this, task, compacting] (future<> f) mutable {
                 task->compaction_running = false;
diff --git a/sstables/compaction_manager.hh b/sstables/compaction_manager.hh
index e499bbbabc..8193e8ef17 100644
--- a/sstables/compaction_manager.hh
+++ b/sstables/compaction_manager.hh
@@ -110,6 +110,7 @@ private:
     std::unordered_map<column_family*, rwlock> _compaction_locks;
 
     semaphore _custom_job_sem{1};
+    seastar::named_semaphore _rewrite_sstables_sem = {1, named_semaphore_exception_factory{"rewrite sstables"}};
 
     std::function<void()> compaction_submission_callback();
     // all registered column families are submitted for compaction at a constant interval.