Files
scylladb/db/rate_limiter.hh
Piotr Dulikowski 0fe8b55427 db: add rate_limiter
Introduces the rate_limiter, a replica-side data structure meant for
tracking the frequency with which each partition is being accessed
(separately for reads and writes) and deciding whether the request
should be accepted and processed further or rejected.

The limiter is implemented as a statically allocated hashmap which keeps
track of the frequency with which partitions are accessed. Its entries
are incremented when an operation is admitted and are decayed
exponentially over time.

If a partition is detected to be accessed more than its limit allows,
requests are rejected with a probability calculated in such a way that,
on average, the number of accepted requests is kept at the limit.

The structure currently weighs a bit over 1 MB and each shard is meant
to keep a separate instance. All operations are O(1), including the
periodic timer.
2022-06-22 20:16:48 +02:00

179 lines
5.5 KiB
C++

/*
* Copyright (C) 2022-present ScyllaDB
*/
/*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
#pragma once
#include <cstdint>
#include <cstddef>
#include <chrono>
#include <limits>
#include <concepts>
#include <vector>
#include <optional>
#include <random>
#include <seastar/core/future.hh>
#include <seastar/core/timer.hh>
#include <seastar/core/lowres_clock.hh>
#include <seastar/core/metrics_registration.hh>
#include <seastar/util/bool_class.hh>
#include "utils/chunked_vector.hh"
#include "db/per_partition_rate_limit_info.hh"
// A data structure used to implement per-partition rate limiting. It accounts
// for operations and enforces limits when it detects that the operation rate
// is too high.
namespace db {
// Clock-independent core of the per-partition rate limiter.
//
// Declares the hashmap entry layout, the bookkeeping state and the accounting
// API. The periodic tick is exposed as the protected on_timer() so that a
// derived class which owns a timer can drive it (generic_rate_limiter, declared
// later in this header, does exactly that).
//
// Instances are neither copyable nor movable.
class rate_limiter_base {
public:
// Widths, in bits, of the entry::op_count and entry::time_window bit-fields.
// The two widths sum to 32.
static constexpr size_t op_count_bits = 20;
static constexpr size_t time_window_bits = 12;
private:
// Internal statistics counters.
// NOTE(review): presumably exported via _metric_group by register_metrics();
// the exact meaning of each counter is defined by the implementation, which
// is not part of this header.
struct metrics {
uint64_t allocations_on_empty = 0;
uint64_t successful_lookups = 0;
uint64_t failed_allocations = 0;
uint64_t probe_count = 0;
};
// Represents a piece of the hashmap storage.
struct entry {
public:
// The partition key token of the operation which allocated this entry.
uint64_t token = 0;
// The label of the operation which allocated this entry.
// Labels are used to differentiate operations which should be counted
// separately, e.g. reads and writes to the same table or writes
// to two different tables.
uint32_t label = 0;
// The number of operations counted for given token/label.
// It is virtually decremented on each window change, so the real
// operation count is actually `op_count - _current_bucket`.
// If the number drops to zero or below, the entry is considered
// "expired" and may be overwritten by another operation.
uint32_t op_count : op_count_bits = 0;
// ID of the time window in which the entry was allocated.
uint32_t time_window : time_window_bits = 0;
};
// Bookkeeping record kept for a single time window
// (stored in _time_window_history).
struct time_window_entry {
// How many entries are there active within this time window?
uint32_t entries_active = 0;
// By how much should the counter be decreased within
// this time window?
uint32_t lossy_counting_decrease = 0;
};
public:
// Strongly-typed boolean returned by account_operation():
// either can_proceed::yes or can_proceed::no.
struct can_proceed_tag{};
using can_proceed = seastar::bool_class<can_proceed_tag>;
// Identifies a type of operation which is counted separately from other
// operations. For example, reads and writes for given table should have
// separate labels.
struct label {
private:
// The current ID used to identify the label in the rate limiter.
// It is assigned on first use.
// NOTE(review): starts at 0 while _next_label starts at 1, so 0 presumably
// means "not assigned yet" - confirm against the implementation.
uint32_t _label = 0;
friend class rate_limiter_base;
};
private:
// Value referred to by the entry::op_count comment: an entry's real
// operation count is `op_count - _current_bucket`.
uint32_t _current_bucket = 0;
uint32_t _current_ops_in_bucket = 0;
uint32_t _current_entries_in_time_window = 0;
// NOTE(review): presumably the next ID handed to a label on its first use.
uint32_t _next_label = 1;
uint32_t _current_time_window = 0;
// Fixed for the lifetime of the limiter.
// NOTE(review): presumably mixed into compute_hash() - confirm.
const uint32_t _salt;
// Storage for the hashmap entries.
utils::chunked_vector<entry> _entries;
// One time_window_entry record per tracked time window.
std::vector<time_window_entry> _time_window_history;
metrics _metrics;
seastar::metrics::metric_groups _metric_group;
private:
// Internal helpers; their definitions are not part of this header.
// NOTE(review): get_entry() returns a raw pointer - presumably nullptr when
// no entry could be found or allocated (cf. metrics::failed_allocations);
// confirm against the implementation.
entry* get_entry(uint32_t label, uint64_t token) noexcept;
size_t compute_hash(uint32_t label, uint64_t token) noexcept;
void entry_refresh(entry& b) noexcept;
bool entry_is_empty(const entry& b) noexcept;
void register_metrics();
protected:
// Periodic tick of the limiter. Protected so that the derived class which
// owns the timer can call it; generic_rate_limiter invokes it once per
// second.
void on_timer() noexcept;
public:
rate_limiter_base();
// Non-copyable and non-movable.
rate_limiter_base(const rate_limiter_base&) = delete;
rate_limiter_base(rate_limiter_base&&) = delete;
rate_limiter_base& operator=(const rate_limiter_base&) = delete;
rate_limiter_base& operator=(rate_limiter_base&&) = delete;
// (For testing purposes only)
// Increments the counter for given (label, token) and returns
// the new value of the counter.
uint64_t increase_and_get_counter(label& l, uint64_t token) noexcept;
// Increments the counter for given (label, token).
// If the counter indicates that the partition is over the limit,
// returns can_proceed::no with some probability.
//
// The source of randomness is a `random_variable` value from range [0, 1):
// the function chooses a threshold and accepts if and only if
// `random_variable` is below it.
// NOTE(review): the signature has no `random_variable` parameter; the value
// is presumably carried inside `rate_limit_info` - confirm against
// db/per_partition_rate_limit_info.hh.
//
// The probability is calculated in such a way that statistically
// only `limit` operations per second are admitted.
can_proceed account_operation(label& l, uint64_t token, uint64_t limit,
const db::per_partition_rate_limit::info& rate_limit_info) noexcept;
};
// Couples rate_limiter_base with a periodic seastar timer running on
// ClockType. Each timer tick calls on_timer().
template<typename ClockType>
class generic_rate_limiter : public rate_limiter_base {
    seastar::timer<ClockType> _timer;

public:
    // Sets up the tick callback and arms the timer with a one-second period.
    generic_rate_limiter() {
        const std::chrono::seconds tick_interval{1};

        // The limiter is more accurate when the timers of all nodes tick
        // in step. Under the assumption that the nodes' wall clocks are
        // synchronized, the first tick is aligned to the start of the next
        // full wall-clock second.
        const auto since_epoch = std::chrono::system_clock::now().time_since_epoch();
        const auto into_current_second = since_epoch % tick_interval;
        const auto first_tick_delay = tick_interval - into_current_second;

        _timer.set_callback([this] { this->on_timer(); });
        _timer.arm(ClockType::now() + first_tick_delay, tick_interval);
    }
};
// Explicit instantiation declaration: translation units that include this
// header do not implicitly instantiate
// generic_rate_limiter<seastar::lowres_clock>.
// NOTE(review): the matching explicit instantiation definition is expected
// in a source file (presumably db/rate_limiter.cc) - confirm.
extern template class generic_rate_limiter<seastar::lowres_clock>;
// Convenience alias for the limiter driven by seastar's lowres_clock.
using rate_limiter = generic_rate_limiter<seastar::lowres_clock>;
}