factor out some of the controller code

The control algorithm we are using for memtables has proven itself quite successful. We will very likely use the same for other processes, like compactions. Make the code a bit more generic, so that a new controller only has to set the desired parameters. Signed-off-by: Glauber Costa <glauber@scylladb.com>
2026-05-12 19:02:12 +00:00 · 2017-08-24 13:32:07 -04:00
parent bb29d082d2
commit 1671d9c433
4 changed files with 157 additions and 112 deletions
--- a/backlog_controller.hh
+++ b/backlog_controller.hh
@@ -0,0 +1,140 @@
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+#include <seastar/core/thread.hh>
+#include <seastar/core/timer.hh>
+#include <chrono>
+
+// Simple proportional controller to adjust shares for processes for which a backlog can be clearly
+// defined.
+//
+// Goal is to consume the backlog as fast as we can, but not so fast that we steal all the CPU from
+// incoming requests, and at the same time minimize user-visible fluctuations in the quota.
+//
+// What that translates to is we'll try to keep the backlog's first derivative at 0 (IOW, we keep
+// backlog constant). As the backlog grows we increase CPU usage, decreasing CPU usage as the
+// backlog diminishes.
+//
+// The exact point at which the controller stops determines the desired CPU usage. As the backlog
+// grows and approaches the maximum desired, we need to be more aggressive. The response is
+// therefore described by a list of control points (input = backlog, output = quota), and the
+// output is linearly interpolated between the two points surrounding the current backlog.
+//
+// For example, a controller that defines two thresholds below its maximum divides the range in
+// three (before the first, between first and second, and after the second threshold), and can be
+// slow to grow in the first region, grow normally in the second region, and aggressively in the
+// third region.
+//
+// The slope between consecutive control points is the proportional factor of each stage.
+class backlog_controller {
+protected:
+    // One point of the piecewise-linear response curve: at backlog `input` the controller
+    // produces `output`.
+    struct control_point {
+        float input;
+        float output;
+    };
+
+    // Period at which adjust() is invoked by _update_timer.
+    std::chrono::milliseconds _interval;
+    // Periodic timer driving adjust(). Its callback captures `this`.
+    // NOTE(review): derived classes declare a defaulted move constructor; moving an armed
+    // controller would presumably leave the callback pointing at the old object - confirm.
+    timer<> _update_timer;
+
+    // Response curve. The constructor always places {0, 0} first, followed by the caller's points.
+    // NOTE(review): assumed to be sorted by increasing input - nothing here enforces it.
+    std::vector<control_point> _control_points;
+
+    // Callback returning the current backlog, in the same units as control_point::input.
+    std::function<float()> _current_backlog;
+
+    // Applies a freshly computed output. Called from adjust() with the interpolated value.
+    virtual void update_controller(float quota) = 0;
+
+    // Samples the backlog, interpolates the output and hands it to update_controller().
+    void adjust();
+
+    // Builds an active controller: the response curve is {0, 0} followed by `control_points`, and
+    // adjust() is scheduled to run every `interval`.
+    backlog_controller(std::chrono::milliseconds interval, std::vector<control_point> control_points, std::function<float()> backlog)
+        : _interval(interval)
+        , _update_timer([this] { adjust(); })
+        , _control_points({{0,0}})
+        , _current_backlog(std::move(backlog))
+    {
+        _control_points.insert(_control_points.end(), control_points.begin(), control_points.end());
+        _update_timer.arm_periodic(_interval);
+    }
+
+    // Used when the controllers are disabled. When we deprecate the --auto-adjust-flush-quota
+    // parameter we can delete this constructor. No control points are set up and the timer is
+    // never armed by this constructor.
+    backlog_controller() = default;
+    virtual ~backlog_controller() = default;
+};
+
+
+// backlog_controller whose output is applied as the usage quota of a
+// seastar::thread_scheduling_group.
+class backlog_cpu_controller : public backlog_controller {
+public:
+    // Tag selecting the disabled mode: the controller owns no active scheduling group and hands
+    // out `backup` from scheduling_group() instead.
+    struct disabled {
+        seastar::thread_scheduling_group *backup;
+    };
+
+    // Scheduling group callers should run under: the controller's own group when enabled, the
+    // backup group passed through `disabled` otherwise.
+    seastar::thread_scheduling_group* scheduling_group() {
+        return _current_scheduling_group;
+    }
+
+    // Last quota stored by update_controller() (0 until the first update).
+    float current_quota() const {
+        return _current_quota;
+    }
+protected:
+    // Declared before _scheduling_group because the enabled constructor reads it while
+    // initializing that member.
+    float _current_quota = 0.0f;
+
+    void update_controller(float quota) override;
+
+    seastar::thread_scheduling_group _scheduling_group;
+    // Either &_scheduling_group (enabled) or the external backup group (disabled).
+    // NOTE(review): when enabled this points into the object itself, so a moved-to copy would
+    // keep referring to the moved-from object's group - confirm controllers are never moved
+    // after construction.
+    seastar::thread_scheduling_group *_current_scheduling_group = nullptr;
+
+    // Enabled mode: forwards the curve and backlog callback to backlog_controller and creates the
+    // controller's own scheduling group with a 1ms period and the initial quota.
+    backlog_cpu_controller(std::chrono::milliseconds interval, std::vector<backlog_controller::control_point> control_points, std::function<float()> backlog)
+        : backlog_controller(interval, std::move(control_points), std::move(backlog))
+        , _scheduling_group(std::chrono::milliseconds(1), _current_quota)
+        , _current_scheduling_group(&_scheduling_group)
+    {}
+
+    // Disabled mode: no periodic adjustment is set up; scheduling_group() returns d.backup.
+    backlog_cpu_controller(disabled d)
+        : backlog_controller()
+        , _scheduling_group(std::chrono::nanoseconds(0), 0)
+        , _current_scheduling_group(d.backup) {}
+};
+
+// memtable flush CPU controller.
+//
+// - First threshold is the soft limit line,
+// - Maximum is the point at which we'd stop consuming requests,
+// - Second threshold is halfway between them.
+//
+// Below the soft limit, we are in no particular hurry to flush, since it means we're set to
+// complete flushing before a new memtable is ready. The quota goes from 0 (at no dirty memory) to
+// 0.1 at the soft limit.
+//
+// The first half of the virtual dirty region is where we expect to be usually, so we have a low
+// slope corresponding to a sluggish response: the quota goes from 0.1 at the soft limit to 0.2 at
+// the halfway point.
+//
+// In the second half, we're getting close to the hard dirty limit so we increase the slope and
+// become more responsive, up to a maximum quota of 1 at the hard dirty limit.
+class flush_cpu_controller : public backlog_cpu_controller {
+    // Dirty memory is expressed as a fraction of the hard limit, so the hard limit itself is 1.
+    static constexpr float hard_dirty_limit = 1.0f;
+public:
+    // Disabled controller: scheduling_group() returns d.backup and no adjustment takes place.
+    flush_cpu_controller(backlog_cpu_controller::disabled d) : backlog_cpu_controller(d) {}
+    // NOTE(review): the base classes capture `this` in a timer callback and may point at their
+    // own scheduling group; confirm this is only used to move freshly built temporaries.
+    flush_cpu_controller(flush_cpu_controller&&) = default;
+    // Enabled controller sampling `current_dirty` every `interval`. `soft_limit` is given in the
+    // same normalized units as hard_dirty_limit.
+    flush_cpu_controller(std::chrono::milliseconds interval, float soft_limit, std::function<float()> current_dirty)
+        : backlog_cpu_controller(interval,
+          std::vector<backlog_controller::control_point>({{soft_limit, 0.1f}, {soft_limit + (hard_dirty_limit - soft_limit) / 2, 0.2f}, {hard_dirty_limit, 1.0f}}),
+          std::move(current_dirty)
+        )
+    {}
+};
--- a/cpu_controller.hh
+++ b/cpu_controller.hh
@@ -1,89 +0,0 @@
-/*
- * Copyright (C) 2017 ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-#include <seastar/core/thread.hh>
-#include <seastar/core/timer.hh>
-#include <chrono>
-
-// Simple proportional controller to adjust shares of memtable/streaming flushes.
-//
-// Goal is to flush as fast as we can, but not so fast that we steal all the CPU from incoming
-// requests, and at the same time minimize user-visible fluctuations in the flush quota.
-//
-// What that translates to is we'll try to keep virtual dirty's firt derivative at 0 (IOW, we keep
-// virtual dirty constant), which means that the rate of incoming writes is equal to the rate of
-// flushed bytes.
-//
-// The exact point at which the controller stops determines the desired flush CPU usage. As we
-// approach the hard dirty limit, we need to be more aggressive. We will therefore define two
-// thresholds, and increase the constant as we cross them.
-//
-//  1) the soft limit line
-//  2) halfway between soft limit and dirty limit
-//
-// The constants q1 and q2 are used to determine the proportional factor at each stage.
-//
-// Below the soft limit, we are in no particular hurry to flush, since it means we're set to
-// complete flushing before we a new memtable is ready. The quota is dirty * q1, and q1 is set to a
-// low number.
-//
-// The first half of the virtual dirty region is where we expect to be usually, so we have a low
-// slope corresponding to a sluggish response between q1 * soft_limit and q2.
-//
-// In the second half, we're getting close to the hard dirty limit so we increase the slope and
-// become more responsive, up to a maximum quota of qmax.
-//
-// For now we'll just set them in the structure not to complicate the constructor. But q1, q2 and
-// qmax can easily become parameters if we find another user.
-class flush_cpu_controller {
-    static constexpr float hard_dirty_limit = 0.50;
-    static constexpr float q1 = 0.01;
-    static constexpr float q2 = 0.2;
-    static constexpr float qmax = 1;
-
-    float _current_quota = 0.0f;
-    float _goal;
-    std::function<float()> _current_dirty;
-    std::chrono::milliseconds _interval;
-    timer<> _update_timer;
-
-    seastar::thread_scheduling_group _scheduling_group;
-    seastar::thread_scheduling_group *_current_scheduling_group = nullptr;
-
-    void adjust();
-public:
-    seastar::thread_scheduling_group* scheduling_group() {
-        return _current_scheduling_group;
-    }
-    float current_quota() const {
-        return _current_quota;
-    }
-
-    struct disabled {
-        seastar::thread_scheduling_group *backup;
-    };
-    flush_cpu_controller(disabled d) : _scheduling_group(std::chrono::nanoseconds(0), 0), _current_scheduling_group(d.backup) {}
-    flush_cpu_controller(std::chrono::milliseconds interval, float soft_limit, std::function<float()> current_dirty);
-    flush_cpu_controller(flush_cpu_controller&&) = default;
-};
-
-
--- a/database.cc
+++ b/database.cc
@@ -2049,7 +2049,7 @@ make_flush_cpu_controller(db::config& cfg, seastar::thread_scheduling_group* bac
    if (cfg.auto_adjust_flush_quota()) {
        return flush_cpu_controller(250ms, cfg.virtual_dirty_soft_limit(), std::move(fn));
    }
-    return flush_cpu_controller(flush_cpu_controller::disabled{backup});
+    return flush_cpu_controller(backlog_cpu_controller::disabled{backup});
 }

 utils::UUID database::empty_version = utils::UUID_gen::get_name_UUID(bytes{});
@@ -2066,7 +2066,7 @@ database::database(const db::config& cfg)
    , _dirty_memory_manager(*this, memory::stats().total_memory() * 0.45, cfg.virtual_dirty_soft_limit())
    , _streaming_dirty_memory_manager(*this, memory::stats().total_memory() * 0.10, cfg.virtual_dirty_soft_limit())
    , _background_writer_scheduling_group(1ms, _cfg->background_writer_scheduling_quota())
-    , _memtable_cpu_controller(make_flush_cpu_controller(*_cfg, &_background_writer_scheduling_group, [this, limit = 2.0f * _dirty_memory_manager.throttle_threshold()] {
+    , _memtable_cpu_controller(make_flush_cpu_controller(*_cfg, &_background_writer_scheduling_group, [this, limit = float(_dirty_memory_manager.throttle_threshold())] {
        return (_dirty_memory_manager.virtual_dirty_memory()) / limit;
    }))
    , _version(empty_version)
@@ -2079,31 +2079,25 @@ database::database(const db::config& cfg)
    dblog.info("Row: max_vector_size: {}, internal_count: {}", size_t(row::max_vector_size), size_t(row::internal_count));
 }

-void flush_cpu_controller::adjust() {
-    auto mid = _goal + (hard_dirty_limit - _goal) / 2;
+// Samples the current backlog, maps it onto the piecewise-linear curve described by
+// _control_points and passes the resulting output to update_controller().
+//
+// A backlog at or beyond the last control point yields that point's output, which therefore acts
+// as the maximum. The control points are expected to be sorted by increasing input.
+void backlog_controller::adjust() {
+    auto backlog = _current_backlog();
+
+    // With fewer than two points there is no segment to interpolate on.
+    if (_control_points.size() < 2) {
+        return;
+    }
+
+    // Clamp at the last point. Without this the search below would run off the end of the vector
+    // and the interpolation would read past it.
+    if (backlog >= _control_points.back().input) {
+        update_controller(_control_points.back().output);
+        return;
+    }

-    auto dirty = _current_dirty();
-    if (dirty < _goal) {
-        _current_quota = dirty * q1 / _goal;
-    } else if ((dirty >= _goal) && (dirty < mid)) {
-        _current_quota = q1 + (dirty - _goal) * (q2 - q1)/(mid - _goal);
-    } else {
-        _current_quota = q2 + (dirty - mid) * (qmax - q2) / (hard_dirty_limit - mid);
+    // Find the first control point whose input is not below the backlog. This runs infrequently
+    // and there is a fixed number of points, so a simple loop will do. The clamp above guarantees
+    // the loop stops on a valid index.
+    size_t idx = 1;
+    while ((idx < _control_points.size()) && (_control_points[idx].input < backlog)) {
+        idx++;
    }

-    dblog.trace("dirty {}, goal {}, mid {} quota {}", dirty, _goal, mid, _current_quota);
-    _scheduling_group.update_usage(_current_quota);
+    // Linear interpolation between the two points surrounding the backlog.
+    const control_point& cp = _control_points[idx];
+    const control_point& last = _control_points[idx - 1];
+    float result = last.output + (backlog - last.input) * (cp.output - last.output)/(cp.input - last.input);
+    update_controller(result);
 }

-flush_cpu_controller::flush_cpu_controller(std::chrono::milliseconds interval, float soft_limit, std::function<float()> current_dirty)
-    : _goal(soft_limit / 2)
-    , _current_dirty(std::move(current_dirty))
-    , _interval(interval)
-    , _update_timer([this] { adjust(); })
-    , _scheduling_group(1ms, 0.0f)
-    , _current_scheduling_group(&_scheduling_group)
-{
-    _update_timer.arm_periodic(_interval);
+// Stores the newly computed quota and applies it to the controller's own scheduling group.
+void backlog_cpu_controller::update_controller(float quota) {
+    _current_quota = quota;
+    _scheduling_group.update_usage(quota);
 }

 void
--- a/database.hh
+++ b/database.hh
@@ -78,7 +78,7 @@
 #include "db/view/view.hh"
 #include "lister.hh"
 #include "utils/phased_barrier.hh"
-#include "cpu_controller.hh"
+#include "backlog_controller.hh"
 #include "dirty_memory_manager.hh"
 #include "reader_resource_tracker.hh"