From efc3953c0aafa2a49f610c11273725714a6b4f77 Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Mon, 13 Dec 2021 19:11:18 +0100 Subject: [PATCH 01/29] transport: add rate_limit_error Adds a CQL protocol extension which introduces the rate_limit_error. The new error code will be used to indicate that the operation failed due to it exceeding the allowed per-partition rate limit. The error code is supposed to be returned only if the corresponding CQL extension is enabled by the client - if it's not enabled, then Config_error will be returned in its stead. --- db/operation_type.hh | 23 ++++++++++++++++ docs/design-notes/protocol-extensions.md | 34 ++++++++++++++++++++++++ exceptions/coordinator_result.hh | 3 ++- exceptions/exceptions.cc | 9 ++++++- exceptions/exceptions.hh | 20 +++++++++++++- replica/database.cc | 9 +++++++ transport/cql_protocol_extension.cc | 6 ++++- transport/cql_protocol_extension.hh | 6 +++-- transport/server.cc | 18 +++++++++++++ transport/server.hh | 2 ++ 10 files changed, 124 insertions(+), 6 deletions(-) create mode 100644 db/operation_type.hh diff --git a/db/operation_type.hh b/db/operation_type.hh new file mode 100644 index 0000000000..def6c35bfc --- /dev/null +++ b/db/operation_type.hh @@ -0,0 +1,23 @@ +/* + * Copyright (C) 2022-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#pragma once + +#include +#include + +namespace db { + +enum class operation_type : uint8_t { + read = 0, + write = 1 +}; + +std::ostream& operator<<(std::ostream& os, operation_type op_type); + +} diff --git a/docs/design-notes/protocol-extensions.md b/docs/design-notes/protocol-extensions.md index 2bfe5f372d..8fdfeadd79 100644 --- a/docs/design-notes/protocol-extensions.md +++ b/docs/design-notes/protocol-extensions.md @@ -146,3 +146,37 @@ parameters: the bit mask that should be used by the client to test against when checking prepared statement metadata flags to see if the current query is conditional or not. 
+ +## Rate limit error + +This extension allows the server to send a new type of error in case the operation +goes over the allowed per-partition rate limit. This kind of error does not fit +other existing error codes well, hence the need for the protocol extension. + +On receiving this error, the driver should not retry the request; instead, +the error should be propagated to the user so that they can decide what to do +with it - sometimes it might make sense to propagate the error, in other cases +it might make sense to retry with backoff. + +The body of the error consists of the usual error code, error message and then +the following fields: ``, where: + +- `op_type` is a byte which identifies the operation which is the origin + of the rate limit. + - 0: read + - 1: write +- `rejected_by_coordinator` is a byte which is 1 if the operation was rejected + on the coordinator and 0 if it was rejected by replicas. + +If the driver does not understand this extension and does not enable it, +the Config_error will be used instead of the new error code. + +In order to be forward compatible with error codes added in the future protocol +versions, this extension doesn't reserve a fixed error code - instead, it +advertises the integer value used as the error code in the SUPPORTED response. + +This extension is identified by the `SCYLLA_RATE_LIMIT_ERROR` key. +The string map in the SUPPORTED response will contain the following parameters: + + - `ERROR_CODE`: a 32-bit signed decimal integer which Scylla + will use as the error code for the rate limit exception. 
diff --git a/exceptions/coordinator_result.hh b/exceptions/coordinator_result.hh index 0195e08194..79e09e780f 100644 --- a/exceptions/coordinator_result.hh +++ b/exceptions/coordinator_result.hh @@ -31,7 +31,8 @@ namespace exceptions { using coordinator_exception_container = utils::exception_container< mutation_write_timeout_exception, read_timeout_exception, - read_failure_exception + read_failure_exception, + rate_limit_exception >; template diff --git a/exceptions/exceptions.cc b/exceptions/exceptions.cc index 59cbfc2671..d75428ecd4 100644 --- a/exceptions/exceptions.cc +++ b/exceptions/exceptions.cc @@ -39,7 +39,8 @@ const std::unordered_map& exception_map() { {exception_code::INVALID, "invalid"}, {exception_code::CONFIG_ERROR, "config_error"}, {exception_code::ALREADY_EXISTS, "already_exists"}, - {exception_code::UNPREPARED, "unprepared"} + {exception_code::UNPREPARED, "unprepared"}, + {exception_code::RATE_LIMIT_ERROR, "rate_limit_error"} }; return map; } @@ -77,6 +78,12 @@ overloaded_exception::overloaded_exception(size_t c) noexcept : cassandra_exception(exception_code::OVERLOADED, prepare_message("Too many in flight hints: {}", c)) {} +rate_limit_exception::rate_limit_exception(const sstring& ks, const sstring& cf, db::operation_type op_type_, bool rejected_by_coordinator_) noexcept + : cassandra_exception(exception_code::CONFIG_ERROR, prepare_message("Per-partition rate limit reached for {} in table {}.{}, rejected by {}", op_type, ks, cf, rejected_by_coordinator_ ? 
"coordinator" : "replicas")) + , op_type(op_type_) + , rejected_by_coordinator(rejected_by_coordinator_) + { } + prepared_query_not_found_exception::prepared_query_not_found_exception(bytes id) noexcept : request_validation_exception{exception_code::UNPREPARED, prepare_message("No prepared statement with ID {} found.", id)} , id{id} diff --git a/exceptions/exceptions.hh b/exceptions/exceptions.hh index d662f81506..a62f46a642 100644 --- a/exceptions/exceptions.hh +++ b/exceptions/exceptions.hh @@ -12,6 +12,7 @@ #include "db/consistency_level_type.hh" #include "db/write_type.hh" +#include "db/operation_type.hh" #include #include #include "bytes.hh" @@ -42,7 +43,17 @@ enum class exception_code : int32_t { INVALID = 0x2200, CONFIG_ERROR = 0x2300, ALREADY_EXISTS = 0x2400, - UNPREPARED = 0x2500 + UNPREPARED = 0x2500, + + // Scylla-specific error codes + // The error codes below are advertised to the drivers during connection + // handshake using the protocol extension negotiation, and are only + // enabled if the drivers explicitly enable them. Therefore it's perfectly + // fine to change them in case some new error codes are introduced + // in Cassandra. + // NOTE TO DRIVER DEVELOPERS: These constants must not be relied upon, + // they must be learned from protocol extensions instead. 
+ RATE_LIMIT_ERROR = 0xF000 }; const std::unordered_map& exception_map(); @@ -183,6 +194,13 @@ struct overloaded_exception : public cassandra_exception { cassandra_exception(exception_code::OVERLOADED, std::move(msg)) {} }; +struct rate_limit_exception : public cassandra_exception { + db::operation_type op_type; + bool rejected_by_coordinator; + + rate_limit_exception(const sstring& ks, const sstring& cf, db::operation_type op_type_, bool rejected_by_coordinator_) noexcept; +}; + class request_validation_exception : public cassandra_exception { public: using cassandra_exception::cassandra_exception; diff --git a/replica/database.cc b/replica/database.cc index be58bb3cea..bfe3a55b4f 100644 --- a/replica/database.cc +++ b/replica/database.cc @@ -42,6 +42,7 @@ #include "gms/feature_service.hh" #include "timeout_config.hh" #include "service/storage_proxy.hh" +#include "db/operation_type.hh" #include "utils/human_readable.hh" #include "utils/fb_utilities.hh" @@ -1933,6 +1934,14 @@ std::ostream& operator<<(std::ostream& os, db::consistency_level cl) { } } +std::ostream& operator<<(std::ostream& os, operation_type op_type) { + switch (op_type) { + case operation_type::read: return os << "read"; + case operation_type::write: return os << "write"; + } + abort(); +} + } std::ostream& diff --git a/transport/cql_protocol_extension.cc b/transport/cql_protocol_extension.cc index 7b0f1dbb4b..5193541a93 100644 --- a/transport/cql_protocol_extension.cc +++ b/transport/cql_protocol_extension.cc @@ -9,13 +9,15 @@ #include #include "transport/cql_protocol_extension.hh" #include "cql3/result_set.hh" +#include "exceptions/exceptions.hh" #include namespace cql_transport { static const std::map EXTENSION_NAMES = { - {cql_protocol_extension::LWT_ADD_METADATA_MARK, "SCYLLA_LWT_ADD_METADATA_MARK"} + {cql_protocol_extension::LWT_ADD_METADATA_MARK, "SCYLLA_LWT_ADD_METADATA_MARK"}, + {cql_protocol_extension::RATE_LIMIT_ERROR, "SCYLLA_RATE_LIMIT_ERROR"} }; cql_protocol_extension_enum_set 
supported_cql_protocol_extensions() { @@ -30,6 +32,8 @@ std::vector additional_options_for_proto_ext(cql_protocol_exte switch (ext) { case cql_protocol_extension::LWT_ADD_METADATA_MARK: return {format("LWT_OPTIMIZATION_META_BIT_MASK={:d}", cql3::prepared_metadata::LWT_FLAG_MASK)}; + case cql_protocol_extension::RATE_LIMIT_ERROR: + return {format("ERROR_CODE={:d}", exceptions::exception_code::RATE_LIMIT_ERROR)}; default: return {}; } diff --git a/transport/cql_protocol_extension.hh b/transport/cql_protocol_extension.hh index ede06e8a52..413358d8c0 100644 --- a/transport/cql_protocol_extension.hh +++ b/transport/cql_protocol_extension.hh @@ -28,11 +28,13 @@ namespace cql_transport { * `docs/protocol-extensions.md`. */ enum class cql_protocol_extension { - LWT_ADD_METADATA_MARK + LWT_ADD_METADATA_MARK, + RATE_LIMIT_ERROR }; using cql_protocol_extension_enum = super_enum; + cql_protocol_extension::LWT_ADD_METADATA_MARK, + cql_protocol_extension::RATE_LIMIT_ERROR>; using cql_protocol_extension_enum_set = enum_set; diff --git a/transport/server.cc b/transport/server.cc index b0f2ac332e..e9edff9e47 100644 --- a/transport/server.cc +++ b/transport/server.cc @@ -37,6 +37,7 @@ #include #include "utils/result_try.hh" #include "utils/result_combinators.hh" +#include "db/operation_type.hh" #include "enum_set.hh" #include "service/query_state.hh" @@ -483,6 +484,9 @@ future>> }), utils::result_catch([&] (const auto& ex) { try { ++_server._stats.errors[ex.code()]; } catch(...) {} return make_function_failure_error(stream, ex.code(), ex.what(), ex.ks_name, ex.func_name, ex.args, trace_state); + }), utils::result_catch([&] (const auto& ex) { + try { ++_server._stats.errors[ex.code()]; } catch(...) {} + return make_rate_limit_error(stream, ex.code(), ex.what(), ex.op_type, ex.rejected_by_coordinator, trace_state, client_state); }), utils::result_catch([&] (const auto& ex) { // Note: the CQL protocol specifies that many types of errors have // mandatory parameters. 
These cassandra_exception subclasses MUST @@ -1275,6 +1279,20 @@ std::unique_ptr cql_server::connection::make_function_fail return response; } +std::unique_ptr cql_server::connection::make_rate_limit_error(int16_t stream, exceptions::exception_code err, sstring msg, db::operation_type op_type, bool rejected_by_coordinator, const tracing::trace_state_ptr& tr_state, const service::client_state& client_state) const +{ + if (!client_state.is_protocol_extension_set(cql_protocol_extension::RATE_LIMIT_ERROR)) { + return make_error(stream, exceptions::exception_code::CONFIG_ERROR, std::move(msg), tr_state); + } + + auto response = std::make_unique(stream, cql_binary_opcode::ERROR, tr_state); + response->write_int(static_cast(err)); + response->write_string(msg); + response->write_byte(static_cast(op_type)); + response->write_byte(static_cast(rejected_by_coordinator)); + return response; +} + std::unique_ptr cql_server::connection::make_error(int16_t stream, exceptions::exception_code err, sstring msg, const tracing::trace_state_ptr& tr_state) const { auto response = std::make_unique(stream, cql_binary_opcode::ERROR, tr_state); diff --git a/transport/server.hh b/transport/server.hh index e78a0c2736..d2437c39b6 100644 --- a/transport/server.hh +++ b/transport/server.hh @@ -31,6 +31,7 @@ #include "transport/messages/result_message.hh" #include "utils/chunked_vector.hh" #include "exceptions/coordinator_result.hh" +#include "db/operation_type.hh" namespace cql3 { @@ -247,6 +248,7 @@ private: std::unique_ptr make_already_exists_error(int16_t stream, exceptions::exception_code err, sstring msg, sstring ks_name, sstring cf_name, const tracing::trace_state_ptr& tr_state) const; std::unique_ptr make_unprepared_error(int16_t stream, exceptions::exception_code err, sstring msg, bytes id, const tracing::trace_state_ptr& tr_state) const; std::unique_ptr make_function_failure_error(int16_t stream, exceptions::exception_code err, sstring msg, sstring ks_name, sstring func_name, 
std::vector args, const tracing::trace_state_ptr& tr_state) const; + std::unique_ptr make_rate_limit_error(int16_t stream, exceptions::exception_code err, sstring msg, db::operation_type op_type, bool rejected_by_coordinator, const tracing::trace_state_ptr& tr_state, const service::client_state& client_state) const; std::unique_ptr make_error(int16_t stream, exceptions::exception_code err, sstring msg, const tracing::trace_state_ptr& tr_state) const; std::unique_ptr make_ready(int16_t stream, const tracing::trace_state_ptr& tr_state) const; std::unique_ptr make_supported(int16_t stream, const tracing::trace_state_ptr& tr_state) const; From a55d7ad46dd053c93a4609a9fd6e28d7411aab43 Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Mon, 16 May 2022 20:28:21 +0200 Subject: [PATCH 02/29] docs: design doc for per-partition rate limiting --- docs/design-notes/cql-extensions.md | 40 +++++ docs/design-notes/per-partition-rate-limit.md | 153 ++++++++++++++++++ 2 files changed, 193 insertions(+) create mode 100644 docs/design-notes/per-partition-rate-limit.md diff --git a/docs/design-notes/cql-extensions.md b/docs/design-notes/cql-extensions.md index 4339e49f74..1dab7b06f3 100644 --- a/docs/design-notes/cql-extensions.md +++ b/docs/design-notes/cql-extensions.md @@ -140,3 +140,43 @@ Subscripting a list in a WHERE clause is supported as are maps. ```cql WHERE some_list[:index] = :value ``` + +## Per-partition rate limit + +The `per_partition_rate_limit` option can be used to limit the allowed +rate of requests to each partition in a given table. When the cluster detects +that the rate of requests exceeds configured limit, the cluster will start +rejecting some of them in order to bring the throughput back to the configured +limit. Rejected requests are less costly which can help reduce overload. 
+ +_NOTE_: Due to Scylla's distributed nature, tracking per-partition request rates +is not perfect and the actual rate of accepted requests may be higher up to +a factor of keyspace's `RF`. This feature should not be used to enforce precise +limits but rather serve as an overload protection feature. + +_NOTE_: This feature works best when shard-aware drivers are used (rejected +requests have the least cost). + +Limits are configured separately for reads and writes. Some examples: + +```cql + ALTER TABLE t WITH per_partition_rate_limit = { + 'max_reads_per_second': 100, + 'max_writes_per_second': 200 + }; +``` + +Limit reads only, no limit for writes: +```cql + ALTER TABLE t WITH per_partition_rate_limit = { + 'max_reads_per_second': 200 + }; +``` + +Rejected requests receive the scylla-specific "Rate limit exceeded" error. +If the driver doesn't support it, `Config_error` will be sent instead. + +For more details, see: + +- Detailed [`design notes`](./per-partition-rate-limit.md) +- Description of the [rate limit exceeded](./protocol-extensions.md#rate-limit-error) error diff --git a/docs/design-notes/per-partition-rate-limit.md b/docs/design-notes/per-partition-rate-limit.md new file mode 100644 index 0000000000..5f592af898 --- /dev/null +++ b/docs/design-notes/per-partition-rate-limit.md @@ -0,0 +1,153 @@ +# Per-partition rate limiting + +Scylla clusters operate best when the data is spread across a large number +of small partitions, and reads/writes are spread uniformly across all shards +and nodes. Due to various reasons (bugs, malicious end users etc.) this +assumption may suddenly not hold anymore and one partition may start getting +a disproportionate number of requests. In turn, this usually leads to the owning +shards being overloaded - a scenario called "hot partition" - and the total +cluster latency becoming worse. + +The _per partition rate limit_ feature allows users to limit the rate +of accepted requests on a per-partition basis. 
When a partition exceeds +the configured limit of operations of given type (reads/writes) per second, +the cluster will start responding with errors to some of the operations for that +partition so that, statistically, the rate of accepted requests is kept +at the configured limit. Rejected operations use less resources, therefore +this feature can help in the "hot partition" situation. + +_NOTE_: this is an overload protection mechanism and may not be used to reliably +enforce limits in some situations. Due to Scylla's distributed nature, +the actual number of accepted requests depends on the cluster and driver +configuration and may be larger by a factor of RF (keyspace's replication +factor). It is recommended to set the limit to a value an order of magnitude +larger than the maximum expected per-partition throughput. See the +[Inaccurracies](#inaccurracies) section for more information. + + +## Usage + +### Server-side configuration + +Per-partition limits are set separately for reads and writes, on a per-table +basis. Limits can be set with the `per_partition_rate_limit` extension when +CREATE'ing or ALTER'ing a table using a schema extension: + +```cql +ALTER TABLE ks.tbl WITH per_partition_rate_limit = { + 'max_reads_per_second': 123, + 'max_writes_per_second': 456 +}; +``` + +Both `max_reads_per_second` and `max_writes_per_second` are optional - omitting +one of them means "no limit" for that type of operation. + +### Driver response + +Rejected operations are reported as an ERROR response to the driver. +If the driver supports it, the response contains a scylla-specific error code +indicating that the operation was rejected. For more details about the error +code, see the [Rate limit error](./protocol-extensions.md#Rate%20limit%20error) +section in the `protocol-extensions.md` doc. + +If the driver doesn't support the new error code, the `Config_error` code +is returned instead. 
The code was chosen in order for the retry policies +of the drivers not to retry the requests and instead propagate them directly +to the users. + +## How it works + +Accounting related to tracking per-partition limits is done by replicas. +Each replica keeps a map of counters which are identified by a combination +of (token, table, operation type). When the replica accounts an operation, +it increments the relevant counter. All counters are halved every second. + +Depending on whether the coordinator is a replica or not, the flow is +a bit different. Here, "coordinator == replica" requirement also means +that the operation is handled on the correct shard. + +Only reads and writes explicitly issued by the user are counted to the limit. +Read repair, hints, batch replay, CDC preimage query and internal system queries +are _not_ counted to the limit. + +Paxos and counters are not covered in current implementation. + +### Coordinator is not a replica + +Coordinator generates a random number from range `[0, 1)` with uniform +distribution and sends it to replicas along with the operation request. +Each replica accounts the operation and then calculates a rejection threshold +based on the local counter value. If the number received from the coordinator +is above the threshold, the operation is rejected. + +The assumption is that all replicas will converge to similar counter values. +Most of the time they will agree on the decision and not much work +will be wasted due to some replicas accepting and other rejecting. + +### Coordinator is a replica + +As before, the coordinator generates a random number. However, it does not +send requests to replicas immediately but rather calculates local rejection +threshold. If the number is above threshold, the whole operation is skipped +and the operation is only accounted on the coordinator. Otherwise, coordinator +proceeds with sending the requests, and replicas are told only to account +the operation but never reject it. 
+ +This strategy leads to no wasted replica work. However, when the coordinator +rejects the operation other replicas do not account it, so it may lead to +a bit more requests being accepted (but still not more than `RF * limit`). + +### How to calculate rejection threshold + +Let's assume the simplest case where there is only one replica. It will +increment its counter on every operation. Because all counters are halved +every second, assuming the rate of `V` ops/s the counter will eventually +oscillate between `V` and `2V`. If the limit is `L` ops/s, then we would +like to admit only `L` operation within each second - therefore the probability +should satisfy the following: + +``` + L = Sum(i = V..2V) { P(i) } +``` + +This can be approximated with a definite integral: + +``` + L = Int(x = V..2V) { P(x) } +``` + +A solution to this integral is: + +``` + P(x) = L / (x * ln 2) +``` + +where `x` is the current value of the counter. This is the formula used +in the current implementation. + +### Inaccurracies + +In practice, RF is rarely 1 so there is more than one replica. Depending on +the type of the operation, this introduces some inaccurracies in counting. + +- Writes are counted relatively well because all live replicas participate + in a write operation, so all replicas should have an up-to-date counter + value. Because of the "coordinator is replica" case, rejected writes + will not be accounted on all replicas. In tests, the amount of accepted + operations was quite close to the limit and much less than the theoretical + `RF * limit`. +- Reads are less accurate because not all replicas may participate in a given + read operation (this depends on CL). In the worst case of CL=ONE and + round-robin strategy, up to `RF * limit` ops/s will be accepted. Higher + consistencies are counted better, e.g. CL=ALL - although they are also + susceptible to the inaccurracy introduced by "coordinator is replica" case. 
+- In case of non-shard-aware drivers, it is best to keep the clocks in sync. + When the coordinator is not a replica, each replica decides whether to accept + or not, based on the random number sent by coordinator. If the replicas have + their clocks in sync, then their per-partition counters should have close + values and they will agree on the decision whether to reject or not most of + the time. If not, they will disagree more frequently which will result in + wasted replica work and the effective rate limit will be lower or higher, + depending on the consistency. In the worst case, it might be 30% lower or + 45% higher than the real limit. From 621b7f35e2341906841612cdfb181a2acd4d3972 Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Thu, 10 Mar 2022 14:59:10 +0100 Subject: [PATCH 03/29] replica: add rate_limit_exception and a simple serialization framework Introduces `replica::rate_limit_exception` - an exception that is supposed to be thrown/returned on the replica side when the request is rejected due to exceeding the per-partition rate limit. Additionally, introduces the `exception_variant` type which allows transporting the new exception over RPC while preserving the type information. This will be useful in later commits, as the coordinator will have to know whether a replica has failed due to rate limit being exceeded or another kind of error. The `exception_variant` currently can only either hold "other exception" (std::monostate) or the aforementioned `rate_limit_exception`, but can be extended in a backwards-compatible way in the future to be able to hold more exceptions that need to be handled in a different way. 
--- configure.py | 2 + exceptions/exceptions.cc | 2 +- idl/replica_exception.idl.hh | 25 ++++++++++++ replica/exceptions.cc | 40 ++++++++++++++++++++ replica/exceptions.hh | 73 ++++++++++++++++++++++++++++++++++++ 5 files changed, 141 insertions(+), 1 deletion(-) create mode 100644 idl/replica_exception.idl.hh create mode 100644 replica/exceptions.cc create mode 100644 replica/exceptions.hh diff --git a/configure.py b/configure.py index 69b4febc00..e35e4a6cf2 100755 --- a/configure.py +++ b/configure.py @@ -668,6 +668,7 @@ scylla_core = (['replica/database.cc', 'replica/table.cc', 'replica/distributed_loader.cc', 'replica/memtable.cc', + 'replica/exceptions.cc', 'absl-flat_hash_map.cc', 'atomic_cell.cc', 'caching_options.cc', @@ -1128,6 +1129,7 @@ idls = ['idl/gossip_digest.idl.hh', 'idl/storage_proxy.idl.hh', 'idl/group0_state_machine.idl.hh', 'idl/forward_request.idl.hh', + 'idl/replica_exception.idl.hh', ] rusts = [ diff --git a/exceptions/exceptions.cc b/exceptions/exceptions.cc index d75428ecd4..0c6da158e2 100644 --- a/exceptions/exceptions.cc +++ b/exceptions/exceptions.cc @@ -79,7 +79,7 @@ overloaded_exception::overloaded_exception(size_t c) noexcept {} rate_limit_exception::rate_limit_exception(const sstring& ks, const sstring& cf, db::operation_type op_type_, bool rejected_by_coordinator_) noexcept - : cassandra_exception(exception_code::CONFIG_ERROR, prepare_message("Per-partition rate limit reached for {} in table {}.{}, rejected by {}", op_type, ks, cf, rejected_by_coordinator_ ? "coordinator" : "replicas")) + : cassandra_exception(exception_code::CONFIG_ERROR, prepare_message("Per-partition rate limit reached for {} in table {}.{}, rejected by {}", op_type_, ks, cf, rejected_by_coordinator_ ? 
"coordinator" : "replicas")) , op_type(op_type_) , rejected_by_coordinator(rejected_by_coordinator_) { } diff --git a/idl/replica_exception.idl.hh b/idl/replica_exception.idl.hh new file mode 100644 index 0000000000..650a494fe3 --- /dev/null +++ b/idl/replica_exception.idl.hh @@ -0,0 +1,25 @@ +/* + * Copyright 2022-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +namespace replica { + +struct unknown_exception {}; + +struct no_exception {}; + +class rate_limit_exception { +}; + +struct exception_variant { + std::variant reason; +}; + +} diff --git a/replica/exceptions.cc b/replica/exceptions.cc new file mode 100644 index 0000000000..4c2ed82aec --- /dev/null +++ b/replica/exceptions.cc @@ -0,0 +1,40 @@ +/* + * Copyright 2022-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#include +#include +#include +#include + +#include "replica/exceptions.hh" +#include "utils/exceptions.hh" + + +namespace replica { + +exception_variant try_encode_replica_exception(std::exception_ptr eptr) { + try { + std::rethrow_exception(std::move(eptr)); + } catch (rate_limit_exception&) { + return rate_limit_exception(); + } catch (...) 
{ + return no_exception{}; + } +} + +std::exception_ptr exception_variant::into_exception_ptr() noexcept { + return std::visit([] (Ex&& ex) { + if constexpr (std::is_same_v) { + return std::make_exception_ptr(std::runtime_error("unknown exception")); + } else { + return std::make_exception_ptr(std::move(ex)); + } + }, std::move(reason)); +} + +} diff --git a/replica/exceptions.hh b/replica/exceptions.hh new file mode 100644 index 0000000000..1b3caf01ac --- /dev/null +++ b/replica/exceptions.hh @@ -0,0 +1,73 @@ +/* + * Copyright 2022-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#pragma once + +#include +#include +#include +#include + +#include "seastar/core/sstring.hh" +#include "seastar/core/timed_out_error.hh" + +#include "utils/exception_container.hh" +#include "utils/result.hh" + +namespace replica { + +// A marker indicating that the exception_variant holds an unknown exception. +// For example, replica sends a new type of error and coordinator does not +// understand it because it wasn't upgraded to a newer version yet. +struct unknown_exception {}; + +// A marker indicating that the exception variant doesn't hold any exception. +struct no_exception {}; + +class replica_exception : public std::exception { +public: + replica_exception() noexcept {}; +}; + +class rate_limit_exception final : public replica_exception { +public: + rate_limit_exception() noexcept + : replica_exception() + { } + + virtual const char* what() const noexcept override { return "rate limit exceeded"; } +}; + +struct exception_variant { + std::variant reason; + + exception_variant() + : reason(no_exception{}) + { } + + template + exception_variant(Ex&& ex) + : reason(std::move(ex)) + { } + + std::exception_ptr into_exception_ptr() noexcept; + + inline operator bool() const noexcept { + return !std::holds_alternative(reason); + } +}; + +// Tries to encode the exception into an exception_variant. 
+// If given exception cannot be encoded into one of the replica exception types, +// returns no_exception. +exception_variant try_encode_replica_exception(std::exception_ptr eptr); + +} From 51546b0609b0d9e433e122bfe11270c0ccb6c41a Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Mon, 16 May 2022 18:55:59 +0200 Subject: [PATCH 04/29] storage_proxy: pass rate_limit_exception through write RPC This commit modifies the storage_proxy logic so that the coordinator knows whether a write operation failed due to rate limit being exceeded, and returns `exceptions::rate_limit_exception` when that happens. --- idl/storage_proxy.idl.hh | 2 +- message/messaging_service.cc | 3 +++ service/storage_proxy.cc | 52 ++++++++++++++++++++++++++---------- service/storage_proxy.hh | 4 ++- 4 files changed, 45 insertions(+), 16 deletions(-) diff --git a/idl/storage_proxy.idl.hh b/idl/storage_proxy.idl.hh index ec3cbd0dfa..5a16736a08 100644 --- a/idl/storage_proxy.idl.hh +++ b/idl/storage_proxy.idl.hh @@ -8,7 +8,7 @@ verb [[with_client_info, with_timeout, one_way]] mutation (frozen_mutation fm, inet_address_vector_replica_set forward, gms::inet_address reply_to, unsigned shard, uint64_t response_id, std::optional trace_info [[version 1.3.0]]); verb [[with_client_info, one_way]] mutation_done (unsigned shard, uint64_t response_id, db::view::update_backlog backlog [[version 3.1.0]]); -verb [[with_client_info, one_way]] mutation_failed (unsigned shard, uint64_t response_id, size_t num_failed, db::view::update_backlog backlog [[version 3.1.0]]); +verb [[with_client_info, one_way]] mutation_failed (unsigned shard, uint64_t response_id, size_t num_failed, db::view::update_backlog backlog [[version 3.1.0]], replica::exception_variant exception [[version 5.1.0]]); verb [[with_client_info, with_timeout]] counter_mutation (std::vector fms, db::consistency_level cl, std::optional trace_info); verb [[with_client_info, with_timeout, one_way]] hint_mutation (frozen_mutation fm, 
inet_address_vector_replica_set forward, gms::inet_address reply_to, unsigned shard, uint64_t response_id, std::optional trace_info [[version 1.3.0]] /* this verb was mistakenly introduced with optional trace_info */); verb [[with_client_info, with_timeout]] read_data (query::read_command cmd, ::compat::wrapping_partition_range pr, query::digest_algorithm digest [[version 3.0.0]]) -> query::result [[lw_shared_ptr]], cache_temperature [[version 2.0.0]]; diff --git a/message/messaging_service.cc b/message/messaging_service.cc index 59e2371806..2c564d7ecc 100644 --- a/message/messaging_service.cc +++ b/message/messaging_service.cc @@ -42,6 +42,7 @@ #include "cache_temperature.hh" #include "raft/raft.hh" #include "service/raft/messaging.hh" +#include "replica/exceptions.hh" #include "serializer.hh" #include "idl/consistency_level.dist.hh" #include "idl/tracing.dist.hh" @@ -67,6 +68,7 @@ #include "idl/raft_storage.dist.hh" #include "idl/raft.dist.hh" #include "idl/group0.dist.hh" +#include "idl/replica_exception.dist.hh" #include "idl/storage_proxy.dist.hh" #include "serializer_impl.hh" #include "serialization_visitors.hh" @@ -94,6 +96,7 @@ #include "idl/raft.dist.impl.hh" #include "idl/group0.dist.impl.hh" #include "idl/view.dist.impl.hh" +#include "idl/replica_exception.dist.impl.hh" #include "idl/storage_proxy.dist.impl.hh" #include #include diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index 524043e522..baf30261ac 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -93,6 +93,8 @@ #include "utils/overloaded_functor.hh" #include "utils/result_try.hh" #include "utils/error_injection.hh" +#include "replica/exceptions.hh" +#include "db/operation_type.hh" namespace bi = boost::intrusive; @@ -417,6 +419,8 @@ public: } else { _ready.set_exception(mutation_write_failure_exception(*_message, _cl, _cl_acks, _failed, _total_block_for, _type)); } + } else if (_error == error::RATE_LIMIT) { + 
_ready.set_value(exceptions::rate_limit_exception(get_schema()->ks_name(), get_schema()->cf_name(), db::operation_type::write, false)); } if (_cdc_operation_result_tracker) { _cdc_operation_result_tracker->on_mutation_failed(); @@ -2795,6 +2799,9 @@ void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type respo std::optional msg; try { std::rethrow_exception(eptr); + } catch (replica::rate_limit_exception&) { + // There might be a lot of those, so ignore + err = error::RATE_LIMIT; } catch(rpc::closed_error&) { // ignore, disconnect will be logged by gossiper } catch(seastar::gate_closed_exception&) { @@ -4965,10 +4972,15 @@ storage_proxy::handle_write(netw::messaging_service::msg_addr src_addr, rpc::opt timeout = *t; } - return do_with(std::move(in), get_local_shared_storage_proxy(), size_t(0), [this, src_addr = std::move(src_addr), + struct errors_info { + size_t count = 0; + replica::exception_variant local; + }; + + return do_with(std::move(in), get_local_shared_storage_proxy(), errors_info{}, [this, src_addr = std::move(src_addr), forward = std::move(forward), reply_to, shard, response_id, trace_state_ptr, timeout, schema_version, apply_fn = std::move(apply_fn), forward_fn = std::move(forward_fn)] - (const auto& m, shared_ptr& p, size_t& errors) mutable { + (const auto& m, shared_ptr& p, errors_info& errors) mutable { ++p->get_stats().received_mutations; p->get_stats().forwarded_mutations += forward.size(); return when_all( @@ -4995,14 +5007,15 @@ storage_proxy::handle_write(netw::messaging_service::msg_addr src_addr, rpc::opt f.ignore_ready_future(); }); }).handle_exception([reply_to, shard, &p, &errors] (std::exception_ptr eptr) { + errors.count++; + errors.local = replica::try_encode_replica_exception(eptr); seastar::log_level l = seastar::log_level::warn; - if (is_timeout_exception(eptr)) { - // ignore timeouts so that logs are not flooded. - // database total_writes_timedout counter was incremented. 
+ if (is_timeout_exception(eptr) || std::holds_alternative(errors.local.reason)) { + // ignore timeouts and rate limit exceptions so that logs are not flooded. + // database's total_writes_timedout or total_writes_rate_limited counter was incremented. l = seastar::log_level::debug; } slogger.log(l, "Failed to apply mutation from {}#{}: {}", reply_to, shard, eptr); - errors++; }), parallel_for_each(forward.begin(), forward.end(), [reply_to, shard, response_id, &m, &p, trace_state_ptr, timeout, &errors, forward_fn = std::move(forward_fn)] (gms::inet_address forward) { @@ -5012,7 +5025,7 @@ storage_proxy::handle_write(netw::messaging_service::msg_addr src_addr, rpc::opt .then_wrapped([&p, &errors] (future<> f) { if (f.failed()) { ++p->get_stats().forwarding_errors; - errors++; + errors.count++; }; f.ignore_ready_future(); }); @@ -5020,14 +5033,15 @@ storage_proxy::handle_write(netw::messaging_service::msg_addr src_addr, rpc::opt ).then_wrapped([trace_state_ptr, reply_to, shard, response_id, &errors, &p] (future, future<>>>&& f) { // ignore results, since we'll be returning them via MUTATION_DONE/MUTATION_FAILURE verbs auto fut = make_ready_future(netw::messaging_service::no_wait()); - if (errors) { - tracing::trace(trace_state_ptr, "Sending mutation_failure with {} failures to /{}", errors, reply_to); + if (errors.count) { + tracing::trace(trace_state_ptr, "Sending mutation_failure with {} failures to /{}", errors.count, reply_to); fut = ser::storage_proxy_rpc_verbs::send_mutation_failed(&p->_messaging, netw::messaging_service::msg_addr{reply_to, shard}, shard, response_id, - errors, - p->get_view_update_backlog()).then_wrapped([] (future<> f) { + errors.count, + p->get_view_update_backlog(), + std::move(errors.local)).then_wrapped([] (future<> f) { f.ignore_ready_future(); return netw::messaging_service::no_wait(); }); @@ -5092,11 +5106,21 @@ storage_proxy::handle_mutation_done(const rpc::client_info& cinfo, unsigned shar } future 
-storage_proxy::handle_mutation_failed(const rpc::client_info& cinfo, unsigned shard, storage_proxy::response_id_type response_id, size_t num_failed, rpc::optional backlog) { +storage_proxy::handle_mutation_failed(const rpc::client_info& cinfo, unsigned shard, storage_proxy::response_id_type response_id, size_t num_failed, rpc::optional backlog, rpc::optional exception) { auto& from = cinfo.retrieve_auxiliary("baddr"); get_stats().replica_cross_shard_ops += shard != this_shard_id(); - return container().invoke_on(shard, _write_ack_smp_service_group, [from, response_id, num_failed, backlog = std::move(backlog)] (storage_proxy& sp) mutable { - sp.got_failure_response(response_id, from, num_failed, std::move(backlog), error::FAILURE, std::nullopt); + return container().invoke_on(shard, _write_ack_smp_service_group, [from, response_id, num_failed, backlog = std::move(backlog), exception = std::move(exception)] (storage_proxy& sp) mutable { + error err = error::FAILURE; + if (exception) { + err = std::visit([] (Ex&) { + if constexpr (std::is_same_v) { + return error::RATE_LIMIT; + } else if constexpr (std::is_same_v || std::is_same_v) { + return error::FAILURE; + } + }, exception->reason); + } + sp.got_failure_response(response_id, from, num_failed, std::move(backlog), err, std::nullopt); return netw::messaging_service::no_wait(); }); } diff --git a/service/storage_proxy.hh b/service/storage_proxy.hh index a7eebb3933..a4861e7d4f 100644 --- a/service/storage_proxy.hh +++ b/service/storage_proxy.hh @@ -46,6 +46,7 @@ #include "partition_range_compat.hh" #include "exceptions/exceptions.hh" #include "exceptions/coordinator_result.hh" +#include "replica/exceptions.hh" class reconcilable_result; class frozen_mutation_and_schema; @@ -126,6 +127,7 @@ public: NONE, TIMEOUT, FAILURE, + RATE_LIMIT, }; template using result = exceptions::coordinator_result; @@ -435,7 +437,7 @@ private: inet_address_vector_replica_set forward, gms::inet_address reply_to, unsigned shard, 
storage_proxy::response_id_type response_id, std::optional trace_info); future handle_mutation_done(const rpc::client_info& cinfo, unsigned shard, storage_proxy::response_id_type response_id, rpc::optional backlog); - future handle_mutation_failed(const rpc::client_info& cinfo, unsigned shard, storage_proxy::response_id_type response_id, size_t num_failed, rpc::optional backlog); + future handle_mutation_failed(const rpc::client_info& cinfo, unsigned shard, storage_proxy::response_id_type response_id, size_t num_failed, rpc::optional backlog, rpc::optional exception); future>, cache_temperature>> handle_read_data(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional oda); future>, cache_temperature>> handle_read_mutation_data(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr); future> handle_read_digest(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional oda); future<> handle_truncate(rpc::opt_time_point timeout, sstring ksname, sstring cfname); future>> handle_paxos_prepare(const rpc::client_info& cinfo, rpc::opt_time_point timeout, query::read_command cmd, partition_key
Only after it is enabled will replicas start returning exceptions in the new way, and until then all exceptions will be reported using seastar's type-erasure mechanism. --- gms/feature_service.hh | 1 + 1 file changed, 1 insertion(+) diff --git a/gms/feature_service.hh b/gms/feature_service.hh index 9912e0afc1..5ec669ad84 100644 --- a/gms/feature_service.hh +++ b/gms/feature_service.hh @@ -108,6 +108,7 @@ public: gms::feature tombstone_gc_options { *this, "TOMBSTONE_GC_OPTIONS"sv }; gms::feature parallelized_aggregation { *this, "PARALLELIZED_AGGREGATION"sv }; gms::feature keyspace_storage_options { *this, "KEYSPACE_STORAGE_OPTIONS"sv }; + gms::feature typed_errors_in_read_rpc { *this, "TYPED_ERRORS_IN_READ_RPC"sv }; public: From 2162bb9f3bce49559037ed66ccdfe3377f12ade8 Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Tue, 29 Mar 2022 17:24:27 +0200 Subject: [PATCH 06/29] storage_proxy: propagate rate_limit_exception through read RPC This commit modifies the read RPC and the storage_proxy logic so that the coordinator knows whether a read operation failed due to rate limit being exceeded, and returns `exceptions::rate_limit_exception` if that happens.
--- idl/storage_proxy.idl.hh | 6 +-- service/storage_proxy.cc | 98 +++++++++++++++++++++++++++++++--------- service/storage_proxy.hh | 9 ++-- 3 files changed, 86 insertions(+), 27 deletions(-) diff --git a/idl/storage_proxy.idl.hh b/idl/storage_proxy.idl.hh index 5a16736a08..14fc707f6e 100644 --- a/idl/storage_proxy.idl.hh +++ b/idl/storage_proxy.idl.hh @@ -11,9 +11,9 @@ verb [[with_client_info, one_way]] mutation_done (unsigned shard, uint64_t respo verb [[with_client_info, one_way]] mutation_failed (unsigned shard, uint64_t response_id, size_t num_failed, db::view::update_backlog backlog [[version 3.1.0]], replica::exception_variant exception [[version 5.1.0]]); verb [[with_client_info, with_timeout]] counter_mutation (std::vector fms, db::consistency_level cl, std::optional trace_info); verb [[with_client_info, with_timeout, one_way]] hint_mutation (frozen_mutation fm, inet_address_vector_replica_set forward, gms::inet_address reply_to, unsigned shard, uint64_t response_id, std::optional trace_info [[version 1.3.0]] /* this verb was mistakenly introduced with optional trace_info */); -verb [[with_client_info, with_timeout]] read_data (query::read_command cmd, ::compat::wrapping_partition_range pr, query::digest_algorithm digest [[version 3.0.0]]) -> query::result [[lw_shared_ptr]], cache_temperature [[version 2.0.0]]; -verb [[with_client_info, with_timeout]] read_mutation_data (query::read_command cmd, ::compat::wrapping_partition_range pr) -> reconcilable_result [[lw_shared_ptr]], cache_temperature [[version 2.0.0]]; -verb [[with_client_info, with_timeout]] read_digest (query::read_command cmd, ::compat::wrapping_partition_range pr, query::digest_algorithm digest [[version 3.0.0]]) -> query::result_digest, api::timestamp_type [[version 1.2.0]], cache_temperature [[version 2.0.0]]; +verb [[with_client_info, with_timeout]] read_data (query::read_command cmd, ::compat::wrapping_partition_range pr, query::digest_algorithm digest [[version 3.0.0]]) -> query::result 
[[lw_shared_ptr]], cache_temperature [[version 2.0.0]], replica::exception_variant [[version 5.1.0]]; +verb [[with_client_info, with_timeout]] read_mutation_data (query::read_command cmd, ::compat::wrapping_partition_range pr) -> reconcilable_result [[lw_shared_ptr]], cache_temperature [[version 2.0.0]], replica::exception_variant [[version 5.1.0]]; +verb [[with_client_info, with_timeout]] read_digest (query::read_command cmd, ::compat::wrapping_partition_range pr, query::digest_algorithm digest [[version 3.0.0]]) -> query::result_digest, api::timestamp_type [[version 1.2.0]], cache_temperature [[version 2.0.0]], replica::exception_variant [[version 5.1.0]]; verb [[with_timeout]] truncate (sstring, sstring); verb [[with_client_info, with_timeout]] paxos_prepare (query::read_command cmd, partition_key key, utils::UUID ballot, bool only_digest, query::digest_algorithm da, std::optional trace_info) -> service::paxos::prepare_response [[unique_ptr]]; verb [[with_client_info, with_timeout]] paxos_accept (service::paxos::proposal proposal [[ref]], std::optional trace_info) -> bool; diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index baf30261ac..e26368bf5d 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -2845,6 +2845,11 @@ future> storage_proxy::schedule_repair(std::unordered_map> _done_promise; // all target responded @@ -2876,18 +2881,21 @@ public: _timeout.arm(timeout); } virtual ~abstract_read_resolver() {}; - virtual void on_error(gms::inet_address ep, bool disconnect) = 0; + virtual void on_error(gms::inet_address ep, error_kind kind) = 0; future> done() { return _done_promise.get_future(); } void error(gms::inet_address ep, std::exception_ptr eptr) { sstring why; - bool disconnect = false; + error_kind kind = error_kind::FAILURE; try { std::rethrow_exception(eptr); + } catch (replica::rate_limit_exception&) { + // There might be a lot of those, so ignore + kind = error_kind::RATE_LIMIT; } catch (rpc::closed_error&) { // do 
not report connection closed exception, gossiper does that - disconnect = true; + kind = error_kind::DISCONNECT; } catch (rpc::timeout_error&) { // do not report timeouts, the whole operation will timeout and be reported return; // also do not report timeout as replica failure for the same reason @@ -2909,7 +2917,7 @@ public: } if (!_request_failed) { // request may fail only once. - on_error(ep, disconnect); + on_error(ep, kind); } } }; @@ -2990,18 +2998,26 @@ public: _done_promise.set_value(bo::success()); } } - void on_error(gms::inet_address ep, bool disconnect) override { + void on_error(gms::inet_address ep, error_kind kind) override { if (waiting_for(ep)) { _failed++; } - if (disconnect && _block_for == _target_count_for_cl) { + if (kind == error_kind::DISCONNECT && _block_for == _target_count_for_cl) { // if the error is because of a connection disconnect and there is no targets to speculate // wait for timeout in hope that the client will issue speculative read // FIXME: resolver should have access to all replicas and try another one in this case return; } if (_block_for + _failed > _target_count_for_cl) { - fail_request(read_failure_exception(_schema->ks_name(), _schema->cf_name(), _cl, _cl_responses, _failed, _block_for, _data_result)); + switch (kind) { + case error_kind::RATE_LIMIT: + fail_request(exceptions::rate_limit_exception(_schema->ks_name(), _schema->cf_name(), db::operation_type::read, false)); + break; + case error_kind::DISCONNECT: + case error_kind::FAILURE: + fail_request(read_failure_exception(_schema->ks_name(), _schema->cf_name(), _cl, _cl_responses, _failed, _block_for, _data_result)); + break; + } } } future> has_cl() { @@ -3314,8 +3330,16 @@ public: } } } - void on_error(gms::inet_address ep, bool disconnect) override { - fail_request(read_failure_exception(_schema->ks_name(), _schema->cf_name(), _cl, response_count(), 1, _targets_count, response_count() != 0)); + void on_error(gms::inet_address ep, error_kind kind) override { + 
switch (kind) { + case error_kind::RATE_LIMIT: + fail_request(exceptions::rate_limit_exception(_schema->ks_name(), _schema->cf_name(), db::operation_type::read, false)); + break; + case error_kind::DISCONNECT: + case error_kind::FAILURE: + fail_request(read_failure_exception(_schema->ks_name(), _schema->cf_name(), _cl, response_count(), 1, _targets_count, response_count() != 0)); + break; + } } uint32_t max_live_count() const { return _max_live_count; @@ -3565,8 +3589,11 @@ protected: return _proxy->query_mutations_locally(_schema, cmd, _partition_range, timeout, _trace_state); } else { tracing::trace(_trace_state, "read_mutation_data: sending a message to /{}", ep); - return ser::storage_proxy_rpc_verbs::send_read_mutation_data(&_proxy->_messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *cmd, _partition_range).then([this, ep](rpc::tuple> result_and_hit_rate) { - auto&& [result, hit_rate] = result_and_hit_rate; + return ser::storage_proxy_rpc_verbs::send_read_mutation_data(&_proxy->_messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *cmd, _partition_range).then([this, ep](rpc::tuple, rpc::optional> result_and_hit_rate) { + auto&& [result, hit_rate, opt_exception] = result_and_hit_rate; + if (opt_exception.has_value() && *opt_exception) { + return make_exception_future>, cache_temperature>>((*opt_exception).into_exception_ptr()); + } tracing::trace(_trace_state, "read_mutation_data: got response from /{}", ep); return make_ready_future>, cache_temperature>>(rpc::tuple(make_foreign(::make_lw_shared(std::move(result))), hit_rate.value_or(cache_temperature::invalid()))); }); @@ -3582,8 +3609,11 @@ protected: return _proxy->query_result_local(_schema, _cmd, _partition_range, opts, _trace_state, timeout); } else { tracing::trace(_trace_state, "read_data: sending a message to /{}", ep); - return ser::storage_proxy_rpc_verbs::send_read_data(&_proxy->_messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *_cmd, _partition_range, 
opts.digest_algo).then([this, ep](rpc::tuple> result_hit_rate) { - auto&& [result, hit_rate] = result_hit_rate; + return ser::storage_proxy_rpc_verbs::send_read_data(&_proxy->_messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *_cmd, _partition_range, opts.digest_algo).then([this, ep](rpc::tuple, rpc::optional> result_hit_rate) { + auto&& [result, hit_rate, opt_exception] = result_hit_rate; + if (opt_exception.has_value() && *opt_exception) { + return make_exception_future>, cache_temperature>>((*opt_exception).into_exception_ptr()); + } tracing::trace(_trace_state, "read_data: got response from /{}", ep); return make_ready_future>, cache_temperature>>(rpc::tuple(make_foreign(::make_lw_shared(std::move(result))), hit_rate.value_or(cache_temperature::invalid()))); }); @@ -3599,8 +3629,11 @@ protected: tracing::trace(_trace_state, "read_digest: sending a message to /{}", ep); return ser::storage_proxy_rpc_verbs::send_read_digest(&_proxy->_messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *_cmd, _partition_range, digest_algorithm(*_proxy)).then([this, ep] ( - rpc::tuple, rpc::optional> digest_timestamp_hit_rate) { - auto&& [d, t, hit_rate] = digest_timestamp_hit_rate; + rpc::tuple, rpc::optional, rpc::optional> digest_timestamp_hit_rate) { + auto&& [d, t, hit_rate, opt_exception] = digest_timestamp_hit_rate; + if (opt_exception.has_value() && *opt_exception) { + return make_exception_future>((*opt_exception).into_exception_ptr()); + } tracing::trace(_trace_state, "read_digest: got response from /{}", ep); return make_ready_future>(rpc::tuple(d, t ? 
t.value() : api::missing_timestamp, hit_rate.value_or(cache_temperature::invalid()))); }); @@ -5125,7 +5158,27 @@ storage_proxy::handle_mutation_failed(const rpc::client_info& cinfo, unsigned sh }); } -future>, cache_temperature>> +template +future> storage_proxy::encode_replica_exception_for_rpc(future>&& f, auto&& default_tuple_maker) { + using original_std_tuple_type = std::tuple; + using final_tuple_type = rpc::tuple; + + if (!f.failed()) { + return make_ready_future(std::tuple_cat(original_std_tuple_type(f.get()), std::tuple(replica::exception_variant()))); + } + + std::exception_ptr eptr = f.get_exception(); + if (features().typed_errors_in_read_rpc) { + replica::exception_variant ex = replica::try_encode_replica_exception(eptr); + if (ex) { + return make_ready_future(std::tuple_cat(default_tuple_maker(), std::tuple(std::move(ex)))); + } + } + + return make_exception_future(std::move(eptr)); +} + +future>, cache_temperature, replica::exception_variant>> storage_proxy::handle_read_data(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional oda) { tracing::trace_state_ptr trace_state_ptr; auto src_addr = netw::messaging_service::get_source(cinfo); @@ -5154,13 +5207,14 @@ storage_proxy::handle_read_data(const rpc::client_info& cinfo, rpc::opt_time_poi opts.request = da == query::digest_algorithm::none ? query::result_request::only_result : query::result_request::result_and_digest; auto timeout = t ? 
*t : db::no_timeout; return p->query_result_local(std::move(s), cmd, std::move(pr2.first), opts, trace_state_ptr, timeout); - }).finally([&trace_state_ptr, src_ip] () mutable { + }).then_wrapped([this, &trace_state_ptr, src_ip] (future>, cache_temperature>> f) mutable { tracing::trace(trace_state_ptr, "read_data handling is done, sending a response to /{}", src_ip); + return encode_replica_exception_for_rpc(std::move(f), [] { return std::make_tuple(foreign_ptr(make_lw_shared()), cache_temperature::invalid()); }); }); }); } -future>, cache_temperature>> +future>, cache_temperature, replica::exception_variant>> storage_proxy::handle_read_mutation_data(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr) { tracing::trace_state_ptr trace_state_ptr; auto src_addr = netw::messaging_service::get_source(cinfo); @@ -5187,13 +5241,14 @@ storage_proxy::handle_read_mutation_data(const rpc::client_info& cinfo, rpc::opt unwrapped = ::compat::unwrap(std::move(pr), *s); auto timeout = t ? 
*t : db::no_timeout; return p->query_mutations_locally(std::move(s), std::move(cmd), unwrapped, timeout, trace_state_ptr); - }).finally([&trace_state_ptr, src_ip] () mutable { + }).then_wrapped([this, &trace_state_ptr, src_ip] (future>, cache_temperature>> f) mutable { tracing::trace(trace_state_ptr, "read_mutation_data handling is done, sending a response to /{}", src_ip); + return encode_replica_exception_for_rpc(std::move(f), [] { return std::make_tuple(foreign_ptr(make_lw_shared()), cache_temperature::invalid()); }); }); }); } -future> +future> storage_proxy::handle_read_digest(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional oda) { tracing::trace_state_ptr trace_state_ptr; auto src_addr = netw::messaging_service::get_source(cinfo); @@ -5217,8 +5272,9 @@ storage_proxy::handle_read_digest(const rpc::client_info& cinfo, rpc::opt_time_p } auto timeout = t ? *t : db::no_timeout; return p->query_result_local_digest(std::move(s), cmd, std::move(pr2.first), trace_state_ptr, timeout, da); - }).finally([&trace_state_ptr, src_ip] () mutable { + }).then_wrapped([this, &trace_state_ptr, src_ip] (future> f) mutable { tracing::trace(trace_state_ptr, "read_digest handling is done, sending a response to /{}", src_ip); + return encode_replica_exception_for_rpc(std::move(f), [] { return std::make_tuple(query::result_digest(), api::missing_timestamp, cache_temperature::invalid()); }); }); }); } diff --git a/service/storage_proxy.hh b/service/storage_proxy.hh index a4861e7d4f..2e16dbf74e 100644 --- a/service/storage_proxy.hh +++ b/service/storage_proxy.hh @@ -426,6 +426,9 @@ private: void retire_view_response_handlers(noncopyable_function filter_fun); void connection_dropped(gms::inet_address); private: + template + future> encode_replica_exception_for_rpc(future>&& f, auto&& default_tuple_maker); + future<> handle_counter_mutation(const rpc::client_info& cinfo, rpc::opt_time_point t, std::vector 
fms, db::consistency_level cl, std::optional trace_info); future handle_write(netw::msg_addr src_addr, rpc::opt_time_point t, utils::UUID schema_version, auto in, inet_address_vector_replica_set forward, gms::inet_address reply_to, @@ -438,9 +441,9 @@ private: storage_proxy::response_id_type response_id, std::optional trace_info); future handle_mutation_done(const rpc::client_info& cinfo, unsigned shard, storage_proxy::response_id_type response_id, rpc::optional backlog); future handle_mutation_failed(const rpc::client_info& cinfo, unsigned shard, storage_proxy::response_id_type response_id, size_t num_failed, rpc::optional backlog, rpc::optional exception); - future>, cache_temperature>> handle_read_data(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional oda); - future>, cache_temperature>> handle_read_mutation_data(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr); - future> handle_read_digest(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional oda); + future>, cache_temperature, replica::exception_variant>> handle_read_data(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional oda); + future>, cache_temperature, replica::exception_variant>> handle_read_mutation_data(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr); + future> handle_read_digest(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional oda); future<> handle_truncate(rpc::opt_time_point timeout, sstring ksname, sstring cfname); future>> handle_paxos_prepare(const rpc::client_info& cinfo, rpc::opt_time_point timeout, query::read_command cmd, partition_key 
key, utils::UUID ballot, bool only_digest, query::digest_algorithm da, From 0fe8b55427914bfd79fb749d715d0c5d28fe52ba Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Thu, 2 Dec 2021 14:25:43 +0100 Subject: [PATCH 07/29] db: add rate_limiter Introduces the rate_limiter, a replica-side data structure meant for tracking the frequency with which each partition is being accessed (separately for reads and writes) and deciding whether the request should be accepted and processed further or rejected. The limiter is implemented as a statically allocated hashmap which keeps track of the frequency with which partitions are accessed. Its entries are incremented when an operation is admitted and are decayed exponentially over time. If a partition is detected to be accessed more than its limit allows, requests are rejected with a probability calculated in such a way that, on average, the number of accepted requests is kept at the limit. The structure currently weighs a bit above 1MB and each shard is meant to keep a separate instance. All operations are O(1), including the periodic timer.
--- CMakeLists.txt | 1 + configure.py | 4 + db/per_partition_rate_limit_info.hh | 49 ++++ db/rate_limiter.cc | 305 +++++++++++++++++++++++ db/rate_limiter.hh | 178 +++++++++++++ idl/per_partition_rate_limit_info.idl.hh | 23 ++ test/boost/rate_limiter_test.cc | 130 ++++++++++ 7 files changed, 690 insertions(+) create mode 100644 db/per_partition_rate_limit_info.hh create mode 100644 db/rate_limiter.cc create mode 100644 db/rate_limiter.hh create mode 100644 idl/per_partition_rate_limit_info.idl.hh create mode 100644 test/boost/rate_limiter_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 7484d0a43d..312dd620e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -450,6 +450,7 @@ set(scylla_sources db/large_data_handler.cc db/legacy_schema_migrator.cc db/marshal/type_parser.cc + db/rate_limiter.cc db/schema_tables.cc db/size_estimates_virtual_reader.cc db/snapshot-ctl.cc diff --git a/configure.py b/configure.py index e35e4a6cf2..1550ac08ff 100755 --- a/configure.py +++ b/configure.py @@ -505,6 +505,7 @@ scylla_tests = set([ 'test/boost/group0_test', 'test/boost/exception_container_test', 'test/boost/result_utils_test', + 'test/boost/rate_limiter_test', 'test/boost/expr_test', 'test/manual/ec2_snitch_test', 'test/manual/enormous_table_scan_test', @@ -887,6 +888,7 @@ scylla_core = (['replica/database.cc', 'db/view/row_locking.cc', 'db/sstables-format-selector.cc', 'db/snapshot-ctl.cc', + 'db/rate_limiter.cc', 'index/secondary_index_manager.cc', 'index/secondary_index.cc', 'utils/UUID_gen.cc', @@ -1130,6 +1132,7 @@ idls = ['idl/gossip_digest.idl.hh', 'idl/group0_state_machine.idl.hh', 'idl/forward_request.idl.hh', 'idl/replica_exception.idl.hh', + 'idl/per_partition_rate_limit_info.idl.hh', ] rusts = [ @@ -1281,6 +1284,7 @@ deps['test/boost/linearizing_input_stream_test'] = [ "test/lib/log.cc", ] deps['test/boost/expr_test'] = ['test/boost/expr_test.cc'] + scylla_core +deps['test/boost/rate_limiter_test'] = ['test/boost/rate_limiter_test.cc', 
'db/rate_limiter.cc'] deps['test/boost/duration_test'] += ['test/lib/exception_utils.cc'] deps['test/boost/schema_loader_test'] += ['tools/schema_loader.cc'] diff --git a/db/per_partition_rate_limit_info.hh b/db/per_partition_rate_limit_info.hh new file mode 100644 index 0000000000..e9cf0dfd45 --- /dev/null +++ b/db/per_partition_rate_limit_info.hh @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2022-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#pragma once + +#include +#include +#include + +namespace db { + +using allow_per_partition_rate_limit = seastar::bool_class; + +namespace per_partition_rate_limit { + +// Tells the replica to account the operation (increase the corresponding counter) +// and accept it regardless of the value of the counter. +// +// Used when the coordinator IS a replica (correct node and shard). +struct account_only {}; + +// Tells the replica to account the operation and decide whether to reject +// or not, based on the random variable sent by the coordinator. +// +// Used when the coordinator IS NOT a replica (wrong node or shard). +struct account_and_enforce { + // A random 32-bit number generated by the coordinator. + // Replicas are supposed to use it in order to decide whether + // to accept or reject.
+ uint32_t random_variable; + + inline double get_random_variable_as_double() const { + return double(random_variable) / double(1LL << 32); + } +}; + +// std::monostate -> do not count to the rate limit and never reject +// account_and_enforce -> account to the rate limit and optionally reject +using info = std::variant; + +} // namespace per_partition_rate_limit + +} // namespace db + diff --git a/db/rate_limiter.cc b/db/rate_limiter.cc new file mode 100644 index 0000000000..195387289a --- /dev/null +++ b/db/rate_limiter.cc @@ -0,0 +1,305 @@ +/* + * Copyright (C) 2022-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "utils/small_vector.hh" +#include "utils/murmur_hash.hh" +#include "db/rate_limiter.hh" + +// The rate limiter keeps a hashmap of counters differentiated by operation type +// (e.g. read or write) and the partition token. On each operation, +// the corresponding counter is increased by 1. +// +// The counters are decremented via two mechanisms: +// +// 1. Every `time_window_duration`, all counters are halved. +// 2. Within a time window, on every `bucket_size` operations all counters +// are decremented by 1. +// +// The mechanism 1) makes sure that we do not forget about very frequent +// operations too quick and makes it possible to reject in a probabilistic +// manner (this is described in more detail in design notes). +// +// The mechanism 2) protects the internal hashmap from being flooded with +// counters with low values. This causes the rate limiter to underestimate +// the counter values by the current number of "buckets" within this +// time window. This strategy is also known as "lossy counting". +// +// Both mechanisms 1) and 2) are implemented in a lazy manner. 
+ +namespace db { + +static constexpr size_t hash_bits = 16; +static constexpr size_t entry_count = 1 << hash_bits; +static constexpr size_t bucket_size = 10000; + + +void rate_limiter_base::on_timer() noexcept { + _time_window_history.pop_back(); + _time_window_history.insert(_time_window_history.begin(), time_window_entry { + .entries_active = _current_entries_in_time_window, + .lossy_counting_decrease = _current_bucket, + }); + + _current_bucket = 0; + _current_ops_in_bucket = 0; + _current_entries_in_time_window = 0; + + _current_time_window = (_current_time_window + 1) % (1 << time_window_bits); + + // Because time window ids are 12 bit numbers and we increase the current + // time window number by 1 every second, it wraps around every 4096 + // seconds (more than an hour). Because of this, some very old entry + // updated last 4096 seconds may accidentally become valid again. + // + // In order to prevent this, we make sure to update the entries + // more frequently. We do this by refreshing all the entries within half + // of the wraparound period (2048 seconds). + // + // Instead of clearing everything at once, we divide this operation + // into many small steps and perform them during time window change. + // + // All of this should make sure that each entry's time window is not + // older than 2048 seconds from the current generation. + + constexpr size_t period = 1 << (time_window_bits - 1); + constexpr size_t entries_per_step = entry_count / period; + + const size_t begin = _current_time_window * entries_per_step; + for (size_t i = 0; i < entries_per_step; i++) { + entry_refresh(_entries[(begin + i) % entry_count]); + } +} + +rate_limiter_base::entry* rate_limiter_base::get_entry(uint32_t label, uint64_t token) noexcept { + // We need to either find the existing entry for this (label, token) combination + // or otherwise find an invalid entry which we can initialize and use. 
+ // + // We start by looking at the entry corresponding to the computed hash, + // if it's occupied by another (label, token) try other entries using + // the quadratic probing strategy. + // + // We limit ourselves to 32 attempts - if no suitable entry is found + // then we return nullptr and admit the operation unconditionally. + + // Because we use quadratic probing and entries can be deleted (lazily), + // a situation can occur where an entry A suddenly becomes inaccessible + // because another entry B which is earlier on the probe chain is deleted. + // One of the following will happen: + // + // 1. Either we will allocate a new entry over B and A becomes accessible + // again, + // 2. Or we will allocate a new entry for the same operation/partition as A + // and A will eventually expire. + // + // In the worst case, A might be a "hot" entry and be actively rate limited + // and the described situation will cause a large number of operations + // to be admitted. Fortunately, this will move the entry earlier in the + // probe chain, so this situation will happen a limited number of times (if + // any at all) for a single "hot" entry. 
+
+    size_t hash = compute_hash(label, token);
+
+    static constexpr size_t max_probes = 32;
+    for (size_t i = 0; i < max_probes; i++) {
+        // Quadratic probing - every iteration jumps further than the previous one
+        hash = (hash + i) % entry_count;
+        entry& b = _entries[hash];
+        ++_metrics.probe_count;
+
+        entry_refresh(b);
+
+        if (entry_is_empty(b)) {
+            ++_metrics.allocations_on_empty;
+            b.token = token;
+            b.label = label;
+            b.op_count = _current_bucket;
+            return &b;
+        } else if (b.token == token && b.label == label) {
+            ++_metrics.successful_lookups;
+            return &b;
+        }
+    }
+
+    ++_metrics.failed_allocations;
+    return nullptr;
+}
+
+size_t rate_limiter_base::compute_hash(uint32_t label, uint64_t token) noexcept {
+    // The map key is a tuple (token, label) + salt
+    // The key is hashed with murmur hash for good hash quality
+
+    static constexpr size_t key_length = sizeof(token) + sizeof(label) + sizeof(_salt);
+
+    std::array<uint8_t, key_length> key;
+    uint8_t* ptr = key.data();
+    memcpy(ptr, &token, sizeof(token));
+    ptr += sizeof(token);
+    memcpy(ptr, &label, sizeof(label));
+    ptr += sizeof(label);
+    memcpy(ptr, &_salt, sizeof(_salt));
+
+    std::array<uint64_t, 2> out;
+    utils::murmur_hash::hash3_x64_128(key.data(), key_length, 0, out);
+    return out[0];
+}
+
+void rate_limiter_base::entry_refresh(rate_limiter_base::entry& b) noexcept {
+    uint32_t window_delta = _current_time_window - b.time_window;
+
+    if (window_delta == 0) {
+        // The entry is fresh, it was allocated in this time window
+        return;
+    }
+
+    if (window_delta < _time_window_history.size()) {
+        // The entry is not that old so we have to apply the effects
+        // of lossy counting and halving on time window switch
+        --_time_window_history[window_delta - 1].entries_active;
+        while (window_delta > 0) {
+            if (b.op_count > _time_window_history[window_delta - 1].lossy_counting_decrease) {
+                b.op_count -= _time_window_history[window_delta - 1].lossy_counting_decrease;
+            } else {
+                b.op_count = 0;
+            }
+            b.op_count /= 2;
+
+            --window_delta;
+        }
+    } else
{ + // The entry is very old and the op_count can be safely decreased to zero + b.op_count = 0; + } + + ++_current_entries_in_time_window; + b.time_window = _current_time_window; +} + +bool rate_limiter_base::entry_is_empty(const rate_limiter_base::entry& b) noexcept { + return b.op_count <= _current_bucket; +} + +void rate_limiter_base::register_metrics() { + namespace sm = seastar::metrics; + + _metric_group.add_group("per_partition_rate_limiter", { + // TODO: Most of the following metrics are pretty low-level and not useful for users, + // perhaps they should be hidden behind a configuration flag + + sm::make_counter("allocations", _metrics.allocations_on_empty, + sm::description("Number of times a entry was allocated over an empty/expired entry.")), + + sm::make_counter("successful_lookups", _metrics.successful_lookups, + sm::description("Number of times a lookup returned an already allocated entry.")), + + sm::make_counter("failed_allocations", _metrics.failed_allocations, + sm::description("Number of times the rate limiter gave up trying to allocate.")), + + sm::make_counter("probe_count", _metrics.probe_count, + sm::description("Number of probes made during lookups.")), + + sm::make_gauge("load_factor", [&] { + uint32_t occupied_entry_count = _current_entries_in_time_window; + for (const auto& twe : _time_window_history) { + occupied_entry_count += twe.entries_active; + } + return double(occupied_entry_count) / double(entry_count); + }, + sm::description("Current load factor of the hash table (upper bound, may be overestimated).")), + }); +} + +rate_limiter_base::rate_limiter_base() + : _salt(std::random_device{}()) + , _entries(entry_count) + , _time_window_history(op_count_bits - 1) { + + register_metrics(); +} + +uint64_t rate_limiter_base::increase_and_get_counter(label& l, uint64_t token) noexcept { + // Assign a label if not done yet + if (l._label == 0) { + l._label = _next_label++; + } + + entry* b = get_entry(l._label, token); + if (!b) { + // We 
failed to allocate a entry for this partition. This means that + // we won't track hit count for this partition during this time window. + // Assume that it's OK to admit the operation. + return 0; + } + + // Protect from wrap-around + b->op_count = std::min((1 << op_count_bits) - 1, b->op_count + 1); + ++_current_ops_in_bucket; + if (_current_ops_in_bucket >= bucket_size) { + // Every `bucket_size` operations, virtually decrement all entries + // by one. We implement it by always subtracting the `_current_bucket` + // when comparing the count in the entry with the limit. + ++_current_bucket; + _current_ops_in_bucket -= bucket_size; + } + + return b->op_count - _current_bucket; +} + + +rate_limiter_base::can_proceed rate_limiter_base::account_operation( + label& l, uint64_t token, uint64_t limit, + const db::per_partition_rate_limit::info& rate_limit_info) noexcept { + + if (std::holds_alternative(rate_limit_info)) { + // Rate limiting turned off + return can_proceed::yes; + } + + const uint64_t count = increase_and_get_counter(l, token); + + if (auto* info = std::get_if(&rate_limit_info)) { + // On each time window change we halve the entry counts, therefore + // a partition with X ops/s will stabilize at 2X hits at the end + // of each time window. + if (count <= 2 * limit) { + return can_proceed::yes; + } else { + // As mentioned before, assuming a fixed operation rate, the operation + // count in a entry will oscillate between X at the beginning of the + // time window and 2X at the end. In order to only accept `limit` + // operations within a time window, we need to reject with probability + // P_c(x), where P_c(x) is a function such that, integrated over [X, 2X] + // will be equal to `limit`. `P_c(x) = limit / (x * ln 2)` satisfies + // this criterion. + // + // All replicas get the same value for the random variable X, with an + // expectation that all replicas' counters oscillate between the same + // values. 
Because of that, most of the time replicas will agree + // and either all accept or reject. + if (info->get_random_variable_as_double() * double(count) * std::numbers::ln2 < double(limit)) { + return can_proceed::yes; + } else { + return can_proceed::no; + } + } + } else { + return can_proceed::yes; + } +} + +template class generic_rate_limiter; + +} diff --git a/db/rate_limiter.hh b/db/rate_limiter.hh new file mode 100644 index 0000000000..9e891db025 --- /dev/null +++ b/db/rate_limiter.hh @@ -0,0 +1,178 @@ +/* + * Copyright (C) 2022-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "utils/chunked_vector.hh" +#include "db/per_partition_rate_limit_info.hh" + +// A data structure used to implement per-partition rate limiting. It accounts +// operations and enforces limits when it is detected that the operation rate +// is too high. + +namespace db { + +class rate_limiter_base { +public: + static constexpr size_t op_count_bits = 20; + static constexpr size_t time_window_bits = 12; + +private: + struct metrics { + uint64_t allocations_on_empty = 0; + uint64_t successful_lookups = 0; + uint64_t failed_allocations = 0; + uint64_t probe_count = 0; + }; + + // Represents a piece of the hashmap storage. + struct entry { + public: + // The partition key token of the operation which allocated this entry. + uint64_t token = 0; + + // The label of the operation which allocated this entry. + // Labels are used to differentiate operations which should be counted + // separately, e.g. reads and writes to the same table or writes + // to two different tables. + uint32_t label = 0; + + // The number of operations counted for given token/label. + // It is virtually decremented on each window change, so the real + // operation count is actually `op_count - _current_bucket`. 
+        // If the number drops to zero or below, the entry is considered
+        // "expired" and may be overwritten by another operation.
+        uint32_t op_count : op_count_bits = 0;
+
+        // ID of the time window in which the entry was allocated.
+        uint32_t time_window : time_window_bits = 0;
+    };
+
+    struct time_window_entry {
+        // How many entries are there active within this time window?
+        uint32_t entries_active = 0;
+
+        // By how much should the counter be decreased within
+        // this time window?
+        uint32_t lossy_counting_decrease = 0;
+    };
+
+public:
+    struct can_proceed_tag{};
+    using can_proceed = seastar::bool_class<can_proceed_tag>;
+
+    // Identifies a type of operation which is counted separately from other
+    // operations. For example, reads and writes for given table should have
+    // separate labels.
+    struct label {
+    private:
+        // The current ID used to identify the label in the rate limiter.
+        // It is assigned on first use.
+        uint32_t _label = 0;
+
+        friend class rate_limiter_base;
+    };
+
+private:
+    uint32_t _current_bucket = 0;
+    uint32_t _current_ops_in_bucket = 0;
+    uint32_t _current_entries_in_time_window = 0;
+
+    uint32_t _next_label = 1;
+    uint32_t _current_time_window = 0;
+
+    const uint32_t _salt;
+
+    utils::chunked_vector<entry> _entries;
+    std::vector<time_window_entry> _time_window_history;
+
+    metrics _metrics;
+    seastar::metrics::metric_groups _metric_group;
+
+private:
+    entry* get_entry(uint32_t label, uint64_t token) noexcept;
+    size_t compute_hash(uint32_t label, uint64_t token) noexcept;
+
+    void entry_refresh(entry& b) noexcept;
+    bool entry_is_empty(const entry& b) noexcept;
+
+    void register_metrics();
+
+protected:
+    void on_timer() noexcept;
+
+public:
+    rate_limiter_base();
+
+    rate_limiter_base(const rate_limiter_base&) = delete;
+    rate_limiter_base(rate_limiter_base&&) = delete;
+
+    rate_limiter_base& operator=(const rate_limiter_base&) = delete;
+    rate_limiter_base& operator=(rate_limiter_base&&) = delete;
+
+    // (For testing purposes only)
+    // Increments the counter for
given (label, token) and returns + // the new value of the counter. + uint64_t increase_and_get_counter(label& l, uint64_t token) noexcept; + + // Increments the counter for given (label, token). + // If the counter indicates that the partition is over the limit, + // returns can_proceed::no with some probability. + // + // The `random_variable` parameter should be a value from range [0, 1). + // It is used as the source of randomness - the function chooses a threshold + // and accepts if and only if `random_variable` is below it. + // + // The probability is calculated in such a way that statistically + // only `limit` operations per second are admitted. + can_proceed account_operation(label& l, uint64_t token, uint64_t limit, + const db::per_partition_rate_limit::info& rate_limit_info) noexcept; +}; + +template +class generic_rate_limiter : public rate_limiter_base { +private: + seastar::timer _timer; + +public: + generic_rate_limiter() + : rate_limiter_base() { + + // Rate limiting is more accurate when the rate limiter timers + // on all nodes are synchronized. Assume that the nodes' clocks + // are synchronized and schedule the first tick on the beginning + // of the closest second. 
+ + const auto period = std::chrono::seconds(1); + const auto now = std::chrono::system_clock::now(); + const auto initial_delay = period - now.time_since_epoch() % period; + + _timer.set_callback([this] { on_timer(); }); + _timer.arm(ClockType::now() + initial_delay, period); + } +}; + +extern template class generic_rate_limiter; +using rate_limiter = generic_rate_limiter; + +} diff --git a/idl/per_partition_rate_limit_info.idl.hh b/idl/per_partition_rate_limit_info.idl.hh new file mode 100644 index 0000000000..ee51ceb04c --- /dev/null +++ b/idl/per_partition_rate_limit_info.idl.hh @@ -0,0 +1,23 @@ +/* + * Copyright 2022-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +namespace db { + +namespace per_partition_rate_limit { + +struct account_only {}; + +struct account_and_enforce { + uint32_t random_variable; +}; + +// using info = std::variant; + +} // namespace per_partition_rate_limit + +} // namespace db diff --git a/test/boost/rate_limiter_test.cc b/test/boost/rate_limiter_test.cc new file mode 100644 index 0000000000..f54c4319ea --- /dev/null +++ b/test/boost/rate_limiter_test.cc @@ -0,0 +1,130 @@ +/* + * Copyright (C) 2022-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "db/rate_limiter.hh" + +using namespace seastar; +using test_rate_limiter = db::generic_rate_limiter; + +future<> step_seconds(int seconds) { + for (int i = 0; i < seconds; i++) { + // The rate limiter's timer executes periodically every second + // and we want the timer to run `seconds` times. + // Because `manual_clock::advance` executes each timer only once + // even if they reschedule, we cannot just advance by requested + // number of seconds - instead, we must advance multiple times + // by one second. 
+ manual_clock::advance(std::chrono::seconds(1)); + co_await yield(); + } +} + +SEASTAR_TEST_CASE(test_rate_limiter_no_rejections_on_sequential) { + const uint64_t token_count = 1000 * 1000; + const uint64_t limit = 1; + test_rate_limiter::label lbl; + + test_rate_limiter limiter; + + for (uint64_t token = 0; token < token_count; token++) { + BOOST_REQUIRE_LE(limiter.increase_and_get_counter(lbl, token), 1); + co_await maybe_yield(); + } +} + +SEASTAR_TEST_CASE(test_rate_limiter_partition_label_separation) { + const uint64_t token_count = 30; + const uint64_t repeat_count = 10; + std::vector labels{3}; + + test_rate_limiter limiter; + + for (uint64_t i = 0; i < repeat_count; i++) { + for (uint64_t token = 0; token < token_count; token++) { + for (auto& l : labels) { + BOOST_REQUIRE_EQUAL(limiter.increase_and_get_counter(l, token), i + 1); + co_await maybe_yield(); + } + } + } +} + +SEASTAR_TEST_CASE(test_rate_limiter_halving_over_time) { + test_rate_limiter::label lbl; + test_rate_limiter limiter; + + for (int i = 0; i < 16; i++) { + limiter.increase_and_get_counter(lbl, 0); + } + + // Should be cut in half + co_await step_seconds(1); + BOOST_REQUIRE_EQUAL(limiter.increase_and_get_counter(lbl, 0), (16 / 2) + 1); + + // Should decrease four times (9 -> 2) + co_await step_seconds(2); + BOOST_REQUIRE_EQUAL(limiter.increase_and_get_counter(lbl, 0), (9 / 4) + 1); + + // Should be reset + co_await step_seconds(10); + BOOST_REQUIRE_EQUAL(limiter.increase_and_get_counter(lbl, 0), 1); +} + +SEASTAR_TEST_CASE(test_rate_limiter_time_window_wraparound_handling) { + test_rate_limiter::label lbl; + test_rate_limiter limiter; + + BOOST_REQUIRE_EQUAL(limiter.increase_and_get_counter(lbl, 0), 1); + BOOST_REQUIRE_EQUAL(limiter.increase_and_get_counter(lbl, 0), 2); + BOOST_REQUIRE_EQUAL(limiter.increase_and_get_counter(lbl, 0), 3); + + // Advance far into the future so that the time window wraps around + co_await step_seconds(1 << test_rate_limiter::time_window_bits); + + 
BOOST_REQUIRE_EQUAL(limiter.increase_and_get_counter(lbl, 0), 1); + BOOST_REQUIRE_EQUAL(limiter.increase_and_get_counter(lbl, 0), 2); + BOOST_REQUIRE_EQUAL(limiter.increase_and_get_counter(lbl, 0), 3); + + // TODO: Workaround for seastar#1072. Calling `manual_clock::advance` + // multiple times and then quitting the test immediately causes + // the test framework to hang. I didn't have the time to debug it, but I + // suspect there are some pending tasks which need to finish before exiting + // from the main test task. + co_await seastar::sleep(std::chrono::seconds(1)); +} + +SEASTAR_TEST_CASE(test_rate_limiter_account_operation) { + const uint64_t limit = 1; + const int ops_per_loop = 1000; + test_rate_limiter::label lbl; + + test_rate_limiter limiter; + + // We use UINT_MAX as the random parameter so that we get rejected quickly + db::per_partition_rate_limit::account_and_enforce info { + .random_variable = UINT32_MAX, + }; + + bool encountered_rejection = false; + for (int i = 0; i < ops_per_loop; i++) { + if (limiter.account_operation(lbl, 0, limit, info) == test_rate_limiter::can_proceed::no) { + encountered_rejection = true; + break; + } + co_await maybe_yield(); + } + BOOST_REQUIRE(encountered_rejection); +} From dccb8a5729e40fa7533a2a34de029b5007ec5623 Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Tue, 19 Oct 2021 11:46:20 +0200 Subject: [PATCH 08/29] schema: add per_partition_rate_limit schema extension Adds the new `per_partition_rate_limit` schema extension. It has two parameters: `max_writes_per_second` and `max_reads_per_second`. In the future commits they will control how many operations of given type are allowed for each partition in the given table. 
--- configure.py | 1 + db/per_partition_rate_limit_extension.hh | 42 ++++++++++++++++ db/per_partition_rate_limit_options.cc | 63 ++++++++++++++++++++++++ db/per_partition_rate_limit_options.hh | 61 +++++++++++++++++++++++ main.cc | 2 + schema.cc | 12 +++++ schema.hh | 6 +++ schema_builder.hh | 5 ++ 8 files changed, 192 insertions(+) create mode 100644 db/per_partition_rate_limit_extension.hh create mode 100644 db/per_partition_rate_limit_options.cc create mode 100644 db/per_partition_rate_limit_options.hh diff --git a/configure.py b/configure.py index 1550ac08ff..08f99d3fc2 100755 --- a/configure.py +++ b/configure.py @@ -889,6 +889,7 @@ scylla_core = (['replica/database.cc', 'db/sstables-format-selector.cc', 'db/snapshot-ctl.cc', 'db/rate_limiter.cc', + 'db/per_partition_rate_limit_options.cc', 'index/secondary_index_manager.cc', 'index/secondary_index.cc', 'utils/UUID_gen.cc', diff --git a/db/per_partition_rate_limit_extension.hh b/db/per_partition_rate_limit_extension.hh new file mode 100644 index 0000000000..e8fb55ac3c --- /dev/null +++ b/db/per_partition_rate_limit_extension.hh @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2022-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#pragma once + +#include "db/per_partition_rate_limit_options.hh" + +namespace db { + +class per_partition_rate_limit_extension : public schema_extension { + per_partition_rate_limit_options _options; +public: + static constexpr auto NAME = "per_partition_rate_limit"; + + per_partition_rate_limit_extension() = default; + per_partition_rate_limit_extension(const per_partition_rate_limit_options& opts) : _options(opts) {} + + explicit per_partition_rate_limit_extension(const std::map& tags) : _options(tags) {} + explicit per_partition_rate_limit_extension(const bytes& b) : _options(deserialize(b)) {} + explicit per_partition_rate_limit_extension(const sstring& s) { + throw std::logic_error("Cannot create per partition rate limit info from string"); + } + + bytes 
serialize() const override { + return ser::serialize_to_buffer(_options.to_map()); + } + static std::map deserialize(const bytes_view& buffer) { + return ser::deserialize_from_buffer(buffer, boost::type>()); + } + const per_partition_rate_limit_options& get_options() const { + return _options; + } + +}; + +} + diff --git a/db/per_partition_rate_limit_options.cc b/db/per_partition_rate_limit_options.cc new file mode 100644 index 0000000000..fb2107c80e --- /dev/null +++ b/db/per_partition_rate_limit_options.cc @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2022-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#include +#include + +#include "serializer.hh" +#include "schema.hh" +#include "log.hh" + +namespace db { + +const char* per_partition_rate_limit_options::max_writes_per_second_key = "max_writes_per_second"; +const char* per_partition_rate_limit_options::max_reads_per_second_key = "max_reads_per_second"; + +per_partition_rate_limit_options::per_partition_rate_limit_options(std::map map) { + auto handle_uint32_arg = [&] (const char* key) -> std::optional { + auto it = map.find(key); + if (it == map.end()) { + return std::nullopt; + } + try { + const uint32_t ret = std::stol(it->second); + map.erase(it); + return ret; + } catch (std::invalid_argument&) { + throw exceptions::configuration_exception(format( + "Invalid value for {} option: expected a non-negative number", + key)); + } catch (std::out_of_range&) { + throw exceptions::configuration_exception(format( + "Value for {} is out of range accepted by 32-bit numbers", + key)); + } + }; + + _max_writes_per_second = handle_uint32_arg(max_writes_per_second_key); + _max_reads_per_second = handle_uint32_arg(max_reads_per_second_key); + + if (!map.empty()) { + throw exceptions::configuration_exception(format( + "Unknown keys in map for per_partition_rate_limit extension: {}", + ::join(", ", map | boost::adaptors::map_keys))); + } +} + +std::map per_partition_rate_limit_options::to_map() 
const { + std::map ret; + if (_max_writes_per_second) { + ret.insert_or_assign(max_writes_per_second_key, std::to_string(*_max_writes_per_second)); + } + if (_max_reads_per_second) { + ret.insert_or_assign(max_reads_per_second_key, std::to_string(*_max_reads_per_second)); + } + return ret; +} + +} diff --git a/db/per_partition_rate_limit_options.hh b/db/per_partition_rate_limit_options.hh new file mode 100644 index 0000000000..f75a40a3ae --- /dev/null +++ b/db/per_partition_rate_limit_options.hh @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2022-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#pragma once + +#include + +#include "operation_type.hh" +#include "serializer.hh" +#include "schema.hh" +#include "log.hh" + +namespace db { + +class per_partition_rate_limit_options final { +private: + static const char* max_writes_per_second_key; + static const char* max_reads_per_second_key; + +private: + std::optional _max_writes_per_second; + std::optional _max_reads_per_second; + +public: + per_partition_rate_limit_options() = default; + per_partition_rate_limit_options(std::map map); + + std::map to_map() const; + + inline std::optional get_max_ops_per_second(operation_type op_type) const { + switch (op_type) { + case operation_type::write: + return _max_writes_per_second; + case operation_type::read: + return _max_reads_per_second; + } + } + + inline void set_max_writes_per_second(std::optional v) { + _max_writes_per_second = v; + } + + inline std::optional get_max_writes_per_second() const { + return _max_writes_per_second; + } + + inline void set_max_reads_per_second(std::optional v) { + _max_reads_per_second = v; + } + + inline std::optional get_max_reads_per_second() const { + return _max_reads_per_second; + } +}; + +} diff --git a/main.cc b/main.cc index 23f01bdbe3..e0f66cfccd 100644 --- a/main.cc +++ b/main.cc @@ -86,6 +86,7 @@ #include "alternator/controller.hh" #include "alternator/ttl.hh" #include "tools/entry_point.hh" +#include 
"db/per_partition_rate_limit_extension.hh" #include "service/raft/raft_group_registry.hh" #include "service/raft/raft_group0_client.hh" @@ -459,6 +460,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl ext->add_schema_extension(cdc::cdc_extension::NAME); ext->add_schema_extension(db::paxos_grace_seconds_extension::NAME); ext->add_schema_extension(tombstone_gc_extension::NAME); + ext->add_schema_extension(db::per_partition_rate_limit_extension::NAME); auto cfg = make_lw_shared(ext); auto init = app.get_options_description().add_options(); diff --git a/schema.cc b/schema.cc index d91fe4e4a0..5984c44b35 100644 --- a/schema.cc +++ b/schema.cc @@ -31,6 +31,7 @@ #include "db/paxos_grace_seconds_extension.hh" #include "utils/rjson.hh" #include "tombstone_gc_options.hh" +#include "db/per_partition_rate_limit_extension.hh" constexpr int32_t schema::NAME_LENGTH; @@ -1269,6 +1270,12 @@ schema_ptr schema_builder::build() { dynamic_pointer_cast(it->second)->get_paxos_grace_seconds(); } + // cache the `per_partition_rate_limit` parameters for fast access through the schema object. 
+ if (auto it = new_raw._extensions.find(db::per_partition_rate_limit_extension::NAME); it != new_raw._extensions.end()) { + new_raw._per_partition_rate_limit_options = + dynamic_pointer_cast(it->second)->get_options(); + } + return make_lw_shared(schema::private_tag{}, new_raw, _view_info); } @@ -1302,6 +1309,11 @@ schema_builder& schema_builder::with_tombstone_gc_options(const tombstone_gc_opt return *this; } +schema_builder& schema_builder::with_per_partition_rate_limit_options(const db::per_partition_rate_limit_options& opts) { + add_extension(db::per_partition_rate_limit_extension::NAME, ::make_shared(opts)); + return *this; +} + schema_builder& schema_builder::set_paxos_grace_seconds(int32_t seconds) { add_extension(db::paxos_grace_seconds_extension::NAME, ::make_shared(seconds)); return *this; diff --git a/schema.hh b/schema.hh index d86ea4b029..4938bbd197 100644 --- a/schema.hh +++ b/schema.hh @@ -29,6 +29,7 @@ #include "column_computation.hh" #include "timestamp.hh" #include "tombstone_gc_options.hh" +#include "db/per_partition_rate_limit_options.hh" namespace dht { @@ -621,6 +622,7 @@ private: double _dc_local_read_repair_chance = 0.0; double _read_repair_chance = 0.0; double _crc_check_chance = 1; + db::per_partition_rate_limit_options _per_partition_rate_limit_options; int32_t _min_compaction_threshold = DEFAULT_MIN_COMPACTION_THRESHOLD; int32_t _max_compaction_threshold = DEFAULT_MAX_COMPACTION_THRESHOLD; int32_t _min_index_interval = DEFAULT_MIN_INDEX_INTERVAL; @@ -813,6 +815,10 @@ public: const ::tombstone_gc_options& tombstone_gc_options() const; + const db::per_partition_rate_limit_options& per_partition_rate_limit_options() const { + return _raw._per_partition_rate_limit_options; + } + const ::speculative_retry& speculative_retry() const { return _raw._speculative_retry; } diff --git a/schema_builder.hh b/schema_builder.hh index cfe8ef1a1e..88d70e6191 100644 --- a/schema_builder.hh +++ b/schema_builder.hh @@ -14,6 +14,10 @@ #include 
"dht/i_partitioner.hh" #include "tombstone_gc_options.hh" +namespace db { +class per_partition_rate_limit_options; +} + struct schema_builder { public: enum class compact_storage { no, yes }; @@ -280,6 +284,7 @@ public: schema_builder& with_cdc_options(const cdc::options&); schema_builder& with_tombstone_gc_options(const tombstone_gc_options& opts); + schema_builder& with_per_partition_rate_limit_options(const db::per_partition_rate_limit_options&); default_names get_default_names() const { return default_names(_raw); From ec635ba170efc1a31ee1215668414388a17c8b85 Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Wed, 11 May 2022 07:54:55 +0200 Subject: [PATCH 09/29] database: move and rename: classify_query -> classify_request Moves the classify_query higher and renames it to classify_request. The function will be reused in further commits to protect non-user queries from accidentally being rate limited. --- replica/database.cc | 84 ++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/replica/database.cc b/replica/database.cc index bfe3a55b4f..f5110fbff5 100644 --- a/replica/database.cc +++ b/replica/database.cc @@ -1284,6 +1284,40 @@ database::existing_index_names(const sstring& ks_name, const sstring& cf_to_excl return names; } +namespace { + +enum class request_class { + user, + system, + maintenance, +}; + +request_class classify_request(const database_config& _dbcfg) { + const auto current_group = current_scheduling_group(); + + // Everything running in the statement group is considered a user request + if (current_group == _dbcfg.statement_scheduling_group) { + return request_class::user; + // System requests run in the default (main) scheduling group + // All requests executed on behalf of internal work also uses the system semaphore + } else if (current_group == default_scheduling_group() + || current_group == _dbcfg.compaction_scheduling_group + || current_group == _dbcfg.gossip_scheduling_group + 
|| current_group == _dbcfg.memory_compaction_scheduling_group + || current_group == _dbcfg.memtable_scheduling_group + || current_group == _dbcfg.memtable_to_cache_scheduling_group) { + return request_class::system; + // Requests done on behalf of view update generation run in the streaming group + } else if (current_scheduling_group() == _dbcfg.streaming_scheduling_group) { + return request_class::maintenance; + // Everything else is considered a user request + } else { + return request_class::user; + } +} + +} // anonymous namespace + future, cache_temperature>> database::query(schema_ptr s, const query::read_command& cmd, query::result_options opts, const dht::partition_range_vector& ranges, tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) { @@ -1405,56 +1439,22 @@ database::query_mutations(schema_ptr s, const query::read_command& cmd, const dh co_return std::tuple(std::move(result), hit_rate); } -namespace { - -enum class query_class { - user, - system, - maintenance, -}; - -query_class classify_query(const database_config& _dbcfg) { - const auto current_group = current_scheduling_group(); - - // Everything running in the statement group is considered a user query - if (current_group == _dbcfg.statement_scheduling_group) { - return query_class::user; - // System queries run in the default (main) scheduling group - // All queries executed on behalf of internal work also uses the system semaphore - } else if (current_group == default_scheduling_group() - || current_group == _dbcfg.compaction_scheduling_group - || current_group == _dbcfg.gossip_scheduling_group - || current_group == _dbcfg.memory_compaction_scheduling_group - || current_group == _dbcfg.memtable_scheduling_group - || current_group == _dbcfg.memtable_to_cache_scheduling_group) { - return query_class::system; - // Reads done on behalf of view update generation run in the streaming group - } else if (current_scheduling_group() == _dbcfg.streaming_scheduling_group) { - 
return query_class::maintenance; - // Everything else is considered a user query - } else { - return query_class::user; - } -} - -} // anonymous namespace - query::max_result_size database::get_unlimited_query_max_result_size() const { - switch (classify_query(_dbcfg)) { - case query_class::user: + switch (classify_request(_dbcfg)) { + case request_class::user: return query::max_result_size(_cfg.max_memory_for_unlimited_query_soft_limit(), _cfg.max_memory_for_unlimited_query_hard_limit()); - case query_class::system: [[fallthrough]]; - case query_class::maintenance: + case request_class::system: [[fallthrough]]; + case request_class::maintenance: return query::max_result_size(query::result_memory_limiter::unlimited_result_size); } std::abort(); } reader_concurrency_semaphore& database::get_reader_concurrency_semaphore() { - switch (classify_query(_dbcfg)) { - case query_class::user: return _read_concurrency_sem; - case query_class::system: return _system_read_concurrency_sem; - case query_class::maintenance: return _streaming_concurrency_sem; + switch (classify_request(_dbcfg)) { + case request_class::user: return _read_concurrency_sem; + case request_class::system: return _system_read_concurrency_sem; + case request_class::maintenance: return _streaming_concurrency_sem; } std::abort(); } From cc9a2ad41fadb5e77adcad4f3651cc5e9d6b03b7 Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Mon, 16 May 2022 10:57:30 +0200 Subject: [PATCH 10/29] database: apply per-partition rate limiting for reads/writes Adds the `db::rate_limiter` to the `database` class and modifies the `query` and `apply` methods so that they account the read/write operations in the rate limiter and optionally reject them. 
---
 replica/database.cc | 57 +++++++++++++++++++++++++++++++++++++++++----
 replica/database.hh | 35 ++++++++++++++++++++++++----
 2 files changed, 83 insertions(+), 9 deletions(-)

diff --git a/replica/database.cc b/replica/database.cc
index f5110fbff5..9cfd11e540 100644
--- a/replica/database.cc
+++ b/replica/database.cc
@@ -7,6 +7,7 @@
  */
 
 #include "log.hh"
+#include "replica/database_fwd.hh"
 #include "utils/lister.hh"
 #include "replica/database.hh"
 #include
@@ -63,6 +64,7 @@
 #include "tombstone_gc.hh"
 
 #include "replica/data_dictionary_impl.hh"
+#include "replica/exceptions.hh"
 #include "readers/multi_range.hh"
 #include "readers/multishard.hh"
 
@@ -1318,15 +1320,51 @@ request_class classify_request(const database_config& _dbcfg) {
 
 } // anonymous namespace
 
+
+static db::rate_limiter::can_proceed account_singular_ranges_to_rate_limit(
+        db::rate_limiter& limiter, column_family& cf,
+        const dht::partition_range_vector& ranges,
+        const database_config& dbcfg,
+        db::per_partition_rate_limit::info rate_limit_info) {
+    using can_proceed = db::rate_limiter::can_proceed;
+
+    if (std::holds_alternative<std::monostate>(rate_limit_info) || !can_apply_per_partition_rate_limit(*cf.schema(), dbcfg, db::operation_type::read)) {
+        // Rate limiting is disabled for this query
+        return can_proceed::yes;
+    }
+
+    auto table_limit = *cf.schema()->per_partition_rate_limit_options().get_max_reads_per_second();
+    can_proceed ret = can_proceed::yes;
+
+    auto& read_label = cf.get_rate_limiter_label_for_reads();
+    for (const auto& range : ranges) {
+        if (!range.is_singular()) {
+            continue;
+        }
+        auto token = dht::token::to_int64(range.start()->value().token());
+        if (limiter.account_operation(read_label, token, table_limit, rate_limit_info) == db::rate_limiter::can_proceed::no) {
+            // Don't return immediately - account all ranges first
+            ret = can_proceed::no;
+        }
+    }
+
+    return ret;
+}
+
 future<std::tuple<foreign_ptr<lw_shared_ptr<query::result>>, cache_temperature>> database::query(schema_ptr s, const query::read_command& cmd, query::result_options
opts, const dht::partition_range_vector& ranges, - tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) { + tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout, db::per_partition_rate_limit::info rate_limit_info) { const auto reversed = cmd.slice.is_reversed(); if (reversed) { s = s->make_reversed(); } column_family& cf = find_column_family(cmd.cf_id); + + if (account_singular_ranges_to_rate_limit(_rate_limiter, cf, ranges, _dbcfg, rate_limit_info) == db::rate_limiter::can_proceed::no) { + co_await coroutine::return_exception(replica::rate_limit_exception()); + } + auto& semaphore = get_reader_concurrency_semaphore(); auto max_result_size = cmd.max_result_size ? *cmd.max_result_size : get_unlimited_query_max_result_size(); @@ -1770,13 +1808,22 @@ future<> database::apply_with_commitlog(column_family& cf, const mutation& m, db } } -future<> database::do_apply(schema_ptr s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::timeout_clock::time_point timeout, db::commitlog::force_sync sync) { +future<> database::do_apply(schema_ptr s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::timeout_clock::time_point timeout, db::commitlog::force_sync sync, db::per_partition_rate_limit::info rate_limit_info) { // I'm doing a nullcheck here since the init code path for db etc // is a little in flux and commitlog is created only when db is // initied from datadir. 
auto uuid = m.column_family_id(); auto& cf = find_column_family(uuid); + if (!std::holds_alternative(rate_limit_info) && can_apply_per_partition_rate_limit(*s, db::operation_type::write)) { + auto table_limit = *s->per_partition_rate_limit_options().get_max_writes_per_second(); + auto& write_label = cf.get_rate_limiter_label_for_writes(); + auto token = dht::token::to_int64(dht::get_token(*s, m.key())); + if (_rate_limiter.account_operation(write_label, token, table_limit, rate_limit_info) == db::rate_limiter::can_proceed::no) { + co_await coroutine::return_exception(replica::rate_limit_exception()); + } + } + sync = sync || db::commitlog::force_sync(s->wait_for_sync_to_commitlog()); // Signal to view building code that a write is in progress, @@ -1833,7 +1880,7 @@ void database::update_write_metrics_for_timed_out_write() { ++_stats->total_writes_timedout; } -future<> database::apply(schema_ptr s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, db::timeout_clock::time_point timeout) { +future<> database::apply(schema_ptr s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, db::timeout_clock::time_point timeout, db::per_partition_rate_limit::info rate_limit_info) { if (dblog.is_enabled(logging::log_level::trace)) { dblog.trace("apply {}", m.pretty_printer(s)); } @@ -1844,7 +1891,7 @@ future<> database::apply(schema_ptr s, const frozen_mutation& m, tracing::trace_ if (!s->is_synced()) { on_internal_error(dblog, format("attempted to apply mutation using not synced schema of {}.{}, version={}", s->ks_name(), s->cf_name(), s->version())); } - return update_write_metrics(_apply_stage(this, std::move(s), seastar::cref(m), std::move(tr_state), timeout, sync)); + return update_write_metrics(_apply_stage(this, std::move(s), seastar::cref(m), std::move(tr_state), timeout, sync, rate_limit_info)); } future<> database::apply_hint(schema_ptr s, const frozen_mutation& m, 
tracing::trace_state_ptr tr_state, db::timeout_clock::time_point timeout) { @@ -1855,7 +1902,7 @@ future<> database::apply_hint(schema_ptr s, const frozen_mutation& m, tracing::t on_internal_error(dblog, format("attempted to apply hint using not synced schema of {}.{}, version={}", s->ks_name(), s->cf_name(), s->version())); } return with_scheduling_group(_dbcfg.streaming_scheduling_group, [this, s = std::move(s), &m, tr_state = std::move(tr_state), timeout] () mutable { - return update_write_metrics(_apply_stage(this, std::move(s), seastar::cref(m), std::move(tr_state), timeout, db::commitlog::force_sync::no)); + return update_write_metrics(_apply_stage(this, std::move(s), seastar::cref(m), std::move(tr_state), timeout, db::commitlog::force_sync::no, std::monostate{})); }); } diff --git a/replica/database.hh b/replica/database.hh index 564eb298b2..0496c80a47 100644 --- a/replica/database.hh +++ b/replica/database.hh @@ -66,6 +66,8 @@ #include "absl-flat_hash_map.hh" #include "utils/cross-shard-barrier.hh" #include "sstables/generation_type.hh" +#include "db/rate_limiter.hh" +#include "db/per_partition_rate_limit_info.hh" class cell_locker; class cell_locker_stats; @@ -452,6 +454,11 @@ private: std::vector _views; std::unique_ptr _counter_cell_locks; // Memory-intensive; allocate only when needed. + + // Labels used to identify writes and reads for this table in the rate_limiter structure. 
+ db::rate_limiter::label _rate_limiter_label_for_writes; + db::rate_limiter::label _rate_limiter_label_for_reads; + void set_metrics(); seastar::metrics::metric_groups _metrics; @@ -744,6 +751,23 @@ public: return _cache; } + db::rate_limiter::label& get_rate_limiter_label_for_op_type(db::operation_type op_type) { + switch (op_type) { + case db::operation_type::write: + return _rate_limiter_label_for_writes; + case db::operation_type::read: + return _rate_limiter_label_for_reads; + } + } + + db::rate_limiter::label& get_rate_limiter_label_for_writes() { + return _rate_limiter_label_for_writes; + } + + db::rate_limiter::label& get_rate_limiter_label_for_reads() { + return _rate_limiter_label_for_reads; + } + future> lock_counter_cells(const mutation& m, db::timeout_clock::time_point timeout); logalloc::occupancy_stats occupancy() const; @@ -1294,7 +1318,8 @@ private: const frozen_mutation&, tracing::trace_state_ptr, db::timeout_clock::time_point, - db::commitlog_force_sync> _apply_stage; + db::commitlog_force_sync, + db::per_partition_rate_limit::info> _apply_stage; flat_hash_map _keyspaces; std::unordered_map> _column_families; @@ -1332,6 +1357,8 @@ private: std::unique_ptr _wasm_engine; utils::cross_shard_barrier _stop_barrier; + db::rate_limiter _rate_limiter; + public: data_dictionary::database as_data_dictionary() const; std::shared_ptr as_user_types_storage() const noexcept; @@ -1361,7 +1388,7 @@ private: void setup_metrics(); void setup_scylla_memory_diagnostics_producer(); - future<> do_apply(schema_ptr, const frozen_mutation&, tracing::trace_state_ptr tr_state, db::timeout_clock::time_point timeout, db::commitlog_force_sync sync); + future<> do_apply(schema_ptr, const frozen_mutation&, tracing::trace_state_ptr tr_state, db::timeout_clock::time_point timeout, db::commitlog_force_sync sync, db::per_partition_rate_limit::info rate_limit_info); future<> apply_with_commitlog(column_family& cf, const mutation& m, db::timeout_clock::time_point timeout); future 
do_apply_counter_update(column_family& cf, const frozen_mutation& fm, schema_ptr m_schema, db::timeout_clock::time_point timeout, @@ -1487,12 +1514,12 @@ public: future, cache_temperature>> query(schema_ptr, const query::read_command& cmd, query::result_options opts, const dht::partition_range_vector& ranges, tracing::trace_state_ptr trace_state, - db::timeout_clock::time_point timeout); + db::timeout_clock::time_point timeout, db::per_partition_rate_limit::info rate_limit_info = std::monostate{}); future> query_mutations(schema_ptr, const query::read_command& cmd, const dht::partition_range& range, tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout); // Apply the mutation atomically. // Throws timed_out_error when timeout is reached. - future<> apply(schema_ptr, const frozen_mutation&, tracing::trace_state_ptr tr_state, db::commitlog_force_sync sync, db::timeout_clock::time_point timeout); + future<> apply(schema_ptr, const frozen_mutation&, tracing::trace_state_ptr tr_state, db::commitlog_force_sync sync, db::timeout_clock::time_point timeout, db::per_partition_rate_limit::info rate_limit_info = std::monostate{}); future<> apply_hint(schema_ptr, const frozen_mutation&, tracing::trace_state_ptr tr_state, db::timeout_clock::time_point timeout); future apply_counter_update(schema_ptr, const frozen_mutation& m, db::timeout_clock::time_point timeout, tracing::trace_state_ptr trace_state); keyspace::config make_keyspace_config(const keyspace_metadata& ksm); From c06376b383a52167ce17a033f73fff0ae6edd8ce Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Mon, 2 May 2022 12:50:59 +0200 Subject: [PATCH 11/29] storage_proxy: add per partition rate limit info to mutate_locally Now, mutate_locally accepts a parameter that controls the rate limiter behavior on the replica. 
--- service/storage_proxy.cc | 23 ++++++++++++----------- service/storage_proxy.hh | 17 +++++++++-------- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index e26368bf5d..5e22e26248 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -1869,7 +1869,7 @@ void storage_proxy::connection_dropped(gms::inet_address addr) { } future<> -storage_proxy::mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout, smp_service_group smp_grp) { +storage_proxy::mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout, smp_service_group smp_grp, db::per_partition_rate_limit::info rate_limit_info) { auto shard = m.shard_of(); get_stats().replica_cross_shard_ops += shard != this_shard_id(); return _db.invoke_on(shard, {smp_grp, timeout}, @@ -1877,32 +1877,33 @@ storage_proxy::mutate_locally(const mutation& m, tracing::trace_state_ptr tr_sta m = freeze(m), gtr = tracing::global_trace_state_ptr(std::move(tr_state)), timeout, - sync] (replica::database& db) mutable -> future<> { - return db.apply(s, m, gtr.get(), sync, timeout); + sync, + rate_limit_info] (replica::database& db) mutable -> future<> { + return db.apply(s, m, gtr.get(), sync, timeout, rate_limit_info); }); } future<> storage_proxy::mutate_locally(const schema_ptr& s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout, - smp_service_group smp_grp) { + smp_service_group smp_grp, db::per_partition_rate_limit::info rate_limit_info) { auto shard = m.shard_of(*s); get_stats().replica_cross_shard_ops += shard != this_shard_id(); return _db.invoke_on(shard, {smp_grp, timeout}, - [&m, gs = global_schema_ptr(s), gtr = tracing::global_trace_state_ptr(std::move(tr_state)), timeout, sync] (replica::database& db) mutable -> 
future<> { - return db.apply(gs, m, gtr.get(), sync, timeout); + [&m, gs = global_schema_ptr(s), gtr = tracing::global_trace_state_ptr(std::move(tr_state)), timeout, sync, rate_limit_info] (replica::database& db) mutable -> future<> { + return db.apply(gs, m, gtr.get(), sync, timeout, rate_limit_info); }); } future<> -storage_proxy::mutate_locally(std::vector mutations, tracing::trace_state_ptr tr_state, clock_type::time_point timeout, smp_service_group smp_grp) { +storage_proxy::mutate_locally(std::vector mutations, tracing::trace_state_ptr tr_state, clock_type::time_point timeout, smp_service_group smp_grp, db::per_partition_rate_limit::info rate_limit_info) { co_await coroutine::parallel_for_each(mutations, [&] (const mutation& m) mutable { - return mutate_locally(m, tr_state, db::commitlog::force_sync::no, timeout, smp_grp); + return mutate_locally(m, tr_state, db::commitlog::force_sync::no, timeout, smp_grp, rate_limit_info); }); } future<> -storage_proxy::mutate_locally(std::vector mutation, tracing::trace_state_ptr tr_state, clock_type::time_point timeout) { - return mutate_locally(std::move(mutation), tr_state, timeout, _write_smp_service_group); +storage_proxy::mutate_locally(std::vector mutation, tracing::trace_state_ptr tr_state, clock_type::time_point timeout, db::per_partition_rate_limit::info rate_limit_info) { + return mutate_locally(std::move(mutation), tr_state, timeout, _write_smp_service_group, rate_limit_info); } future<> storage_proxy::mutate_hint(const schema_ptr& s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, clock_type::time_point timeout) { @@ -5097,7 +5098,7 @@ storage_proxy::receive_mutation_handler(smp_service_group smp_grp, const rpc::cl trace_info ? 
*trace_info : std::nullopt, /* apply_fn */ [smp_grp] (shared_ptr& p, tracing::trace_state_ptr tr_state, schema_ptr s, const frozen_mutation& m, clock_type::time_point timeout) { - return p->mutate_locally(std::move(s), m, std::move(tr_state), db::commitlog::force_sync::no, timeout, smp_grp); + return p->mutate_locally(std::move(s), m, std::move(tr_state), db::commitlog::force_sync::no, timeout, smp_grp, std::monostate()); // TODO: Pass the correct rate limit info }, /* forward_fn */ [] (shared_ptr& p, netw::messaging_service::msg_addr addr, clock_type::time_point timeout, const frozen_mutation& m, gms::inet_address reply_to, unsigned shard, response_id_type response_id, diff --git a/service/storage_proxy.hh b/service/storage_proxy.hh index 2e16dbf74e..ead5469d3d 100644 --- a/service/storage_proxy.hh +++ b/service/storage_proxy.hh @@ -47,6 +47,7 @@ #include "exceptions/exceptions.hh" #include "exceptions/coordinator_result.hh" #include "replica/exceptions.hh" +#include "db/per_partition_rate_limit_info.hh" class reconcilable_result; class frozen_mutation_and_schema; @@ -500,29 +501,29 @@ public: private: // Applies mutation on this node. // Resolves with timed_out_error when timeout is reached. - future<> mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout, smp_service_group smp_grp); + future<> mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout, smp_service_group smp_grp, db::per_partition_rate_limit::info rate_limit_info); // Applies mutation on this node. // Resolves with timed_out_error when timeout is reached. 
future<> mutate_locally(const schema_ptr&, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout, - smp_service_group smp_grp); + smp_service_group smp_grp, db::per_partition_rate_limit::info rate_limit_info); // Applies mutations on this node. // Resolves with timed_out_error when timeout is reached. - future<> mutate_locally(std::vector mutation, tracing::trace_state_ptr tr_state, clock_type::time_point timeout, smp_service_group smp_grp); + future<> mutate_locally(std::vector mutation, tracing::trace_state_ptr tr_state, clock_type::time_point timeout, smp_service_group smp_grp, db::per_partition_rate_limit::info rate_limit_info); public: // Applies mutation on this node. // Resolves with timed_out_error when timeout is reached. - future<> mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout = clock_type::time_point::max()) { - return mutate_locally(m, tr_state, sync, timeout, _write_smp_service_group); + future<> mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout = clock_type::time_point::max(), db::per_partition_rate_limit::info rate_limit_info = std::monostate()) { + return mutate_locally(m, tr_state, sync, timeout, _write_smp_service_group, rate_limit_info); } // Applies mutation on this node. // Resolves with timed_out_error when timeout is reached. 
- future<> mutate_locally(const schema_ptr& s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout = clock_type::time_point::max()) { - return mutate_locally(s, m, tr_state, sync, timeout, _write_smp_service_group); + future<> mutate_locally(const schema_ptr& s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout = clock_type::time_point::max(), db::per_partition_rate_limit::info rate_limit_info = std::monostate()) { + return mutate_locally(s, m, tr_state, sync, timeout, _write_smp_service_group, rate_limit_info); } // Applies mutations on this node. // Resolves with timed_out_error when timeout is reached. - future<> mutate_locally(std::vector mutation, tracing::trace_state_ptr tr_state, clock_type::time_point timeout = clock_type::time_point::max()); + future<> mutate_locally(std::vector mutation, tracing::trace_state_ptr tr_state, clock_type::time_point timeout = clock_type::time_point::max(), db::per_partition_rate_limit::info rate_limit_info = std::monostate()); future<> mutate_hint(const schema_ptr&, const frozen_mutation& m, tracing::trace_state_ptr tr_state, clock_type::time_point timeout = clock_type::time_point::max()); From 02469e0b15c3c0042c90cae1918dc3a15fe62d55 Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Mon, 2 May 2022 12:51:37 +0200 Subject: [PATCH 12/29] storage_proxy: add per partition rate limit info to write RPC Adds db::per_partition_rate_limit::info parameter to the write RPC. The rate limit info controls the behavior of the rate limiter on the replica. 
--- idl/storage_proxy.idl.hh | 2 +- message/messaging_service.cc | 3 +++ service/storage_proxy.cc | 20 ++++++++++++-------- service/storage_proxy.hh | 2 +- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/idl/storage_proxy.idl.hh b/idl/storage_proxy.idl.hh index 14fc707f6e..d233c46d08 100644 --- a/idl/storage_proxy.idl.hh +++ b/idl/storage_proxy.idl.hh @@ -6,7 +6,7 @@ * SPDX-License-Identifier: AGPL-3.0-or-later */ -verb [[with_client_info, with_timeout, one_way]] mutation (frozen_mutation fm, inet_address_vector_replica_set forward, gms::inet_address reply_to, unsigned shard, uint64_t response_id, std::optional trace_info [[version 1.3.0]]); +verb [[with_client_info, with_timeout, one_way]] mutation (frozen_mutation fm, inet_address_vector_replica_set forward, gms::inet_address reply_to, unsigned shard, uint64_t response_id, std::optional trace_info [[version 1.3.0]], db::per_partition_rate_limit::info rate_limit_info [[version 5.1.0]]); verb [[with_client_info, one_way]] mutation_done (unsigned shard, uint64_t response_id, db::view::update_backlog backlog [[version 3.1.0]]); verb [[with_client_info, one_way]] mutation_failed (unsigned shard, uint64_t response_id, size_t num_failed, db::view::update_backlog backlog [[version 3.1.0]], replica::exception_variant exception [[version 5.1.0]]); verb [[with_client_info, with_timeout]] counter_mutation (std::vector fms, db::consistency_level cl, std::optional trace_info); diff --git a/message/messaging_service.cc b/message/messaging_service.cc index 2c564d7ecc..d3accb8d70 100644 --- a/message/messaging_service.cc +++ b/message/messaging_service.cc @@ -44,6 +44,7 @@ #include "service/raft/messaging.hh" #include "replica/exceptions.hh" #include "serializer.hh" +#include "db/per_partition_rate_limit_info.hh" #include "idl/consistency_level.dist.hh" #include "idl/tracing.dist.hh" #include "idl/result.dist.hh" @@ -69,6 +70,7 @@ #include "idl/raft.dist.hh" #include "idl/group0.dist.hh" #include 
"idl/replica_exception.dist.hh" +#include "idl/per_partition_rate_limit_info.dist.hh" #include "idl/storage_proxy.dist.hh" #include "serializer_impl.hh" #include "serialization_visitors.hh" @@ -97,6 +99,7 @@ #include "idl/group0.dist.impl.hh" #include "idl/view.dist.impl.hh" #include "idl/replica_exception.dist.impl.hh" +#include "idl/per_partition_rate_limit_info.dist.impl.hh" #include "idl/storage_proxy.dist.impl.hh" #include #include diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index 5e22e26248..845545664f 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -224,7 +224,8 @@ public: return ser::storage_proxy_rpc_verbs::send_mutation(&sp._messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *m, std::move(forward), utils::fb_utilities::get_broadcast_address(), this_shard_id(), - response_id, tracing::make_trace_info(tr_state)); + response_id, tracing::make_trace_info(tr_state), + std::monostate()); // TODO: propagate the correct flag } sp.got_response(response_id, ep, std::nullopt); return make_ready_future<>(); @@ -271,7 +272,8 @@ public: return ser::storage_proxy_rpc_verbs::send_mutation(&sp._messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *_mutation, std::move(forward), utils::fb_utilities::get_broadcast_address(), this_shard_id(), - response_id, tracing::make_trace_info(tr_state)); + response_id, tracing::make_trace_info(tr_state), + std::monostate()); // TODO: propagate the correct flag } virtual bool is_shared() override { return true; @@ -4944,7 +4946,7 @@ void storage_proxy::init_messaging_service(shared_ptr mm) { _mm = std::move(mm); ser::storage_proxy_rpc_verbs::register_counter_mutation(&_messaging, std::bind_front(&storage_proxy::handle_counter_mutation, this)); ser::storage_proxy_rpc_verbs::register_mutation(&_messaging, std::bind_front(&storage_proxy::receive_mutation_handler, this, _write_smp_service_group)); - ser::storage_proxy_rpc_verbs::register_hint_mutation(&_messaging, 
std::bind_front(&storage_proxy::receive_mutation_handler, this, _hints_write_smp_service_group)); + ser::storage_proxy_rpc_verbs::register_hint_mutation(&_messaging, [this] (Args&&... args) { return receive_mutation_handler(_hints_write_smp_service_group, std::forward(args)..., std::monostate()); }); ser::storage_proxy_rpc_verbs::register_paxos_learn(&_messaging, std::bind_front(&storage_proxy::handle_paxos_learn, this)); ser::storage_proxy_rpc_verbs::register_mutation_done(&_messaging, std::bind_front(&storage_proxy::handle_mutation_done, this)); ser::storage_proxy_rpc_verbs::register_mutation_failed(&_messaging, std::bind_front(&storage_proxy::handle_mutation_failed, this)); @@ -5089,22 +5091,24 @@ storage_proxy::handle_write(netw::messaging_service::msg_addr src_addr, rpc::opt future storage_proxy::receive_mutation_handler(smp_service_group smp_grp, const rpc::client_info& cinfo, rpc::opt_time_point t, frozen_mutation in, inet_address_vector_replica_set forward, - gms::inet_address reply_to, unsigned shard, storage_proxy::response_id_type response_id, rpc::optional> trace_info) { + gms::inet_address reply_to, unsigned shard, storage_proxy::response_id_type response_id, rpc::optional> trace_info, + rpc::optional rate_limit_info_opt) { tracing::trace_state_ptr trace_state_ptr; auto src_addr = netw::messaging_service::get_source(cinfo); + auto rate_limit_info = rate_limit_info_opt.value_or(std::monostate()); utils::UUID schema_version = in.schema_version(); return handle_write(src_addr, t, schema_version, std::move(in), std::move(forward), reply_to, shard, response_id, trace_info ? 
*trace_info : std::nullopt, - /* apply_fn */ [smp_grp] (shared_ptr& p, tracing::trace_state_ptr tr_state, schema_ptr s, const frozen_mutation& m, + /* apply_fn */ [smp_grp, rate_limit_info] (shared_ptr& p, tracing::trace_state_ptr tr_state, schema_ptr s, const frozen_mutation& m, clock_type::time_point timeout) { - return p->mutate_locally(std::move(s), m, std::move(tr_state), db::commitlog::force_sync::no, timeout, smp_grp, std::monostate()); // TODO: Pass the correct rate limit info + return p->mutate_locally(std::move(s), m, std::move(tr_state), db::commitlog::force_sync::no, timeout, smp_grp, rate_limit_info); }, - /* forward_fn */ [] (shared_ptr& p, netw::messaging_service::msg_addr addr, clock_type::time_point timeout, const frozen_mutation& m, + /* forward_fn */ [rate_limit_info] (shared_ptr& p, netw::messaging_service::msg_addr addr, clock_type::time_point timeout, const frozen_mutation& m, gms::inet_address reply_to, unsigned shard, response_id_type response_id, std::optional trace_info) { return ser::storage_proxy_rpc_verbs::send_mutation(&p->_messaging, - addr, timeout, m, {}, reply_to, shard, response_id, std::move(trace_info)); + addr, timeout, m, {}, reply_to, shard, response_id, std::move(trace_info), rate_limit_info); }); } diff --git a/service/storage_proxy.hh b/service/storage_proxy.hh index ead5469d3d..4cb9c6f1c7 100644 --- a/service/storage_proxy.hh +++ b/service/storage_proxy.hh @@ -436,7 +436,7 @@ private: unsigned shard, storage_proxy::response_id_type response_id, std::optional trace_info, auto&& apply_fn, auto&& forward_fn); future receive_mutation_handler (smp_service_group smp_grp, const rpc::client_info& cinfo, rpc::opt_time_point t, frozen_mutation in, inet_address_vector_replica_set forward, - gms::inet_address reply_to, unsigned shard, storage_proxy::response_id_type response_id, rpc::optional> trace_info); + gms::inet_address reply_to, unsigned shard, storage_proxy::response_id_type response_id, rpc::optional> trace_info, 
rpc::optional rate_limit_info_opt); future handle_paxos_learn(const rpc::client_info& cinfo, rpc::opt_time_point t, paxos::proposal decision, inet_address_vector_replica_set forward, gms::inet_address reply_to, unsigned shard, storage_proxy::response_id_type response_id, std::optional trace_info); From 3f88ecdea6ea40e8b4e29718787427258c2a9099 Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Wed, 4 May 2022 00:33:46 +0200 Subject: [PATCH 13/29] storage_proxy: add per partition rate limit to mutation_holders Now, `apply_locally` and `apply_remotely` accept the per partition rate limit info parameter. --- service/storage_proxy.cc | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index 845545664f..651f3a0313 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -163,10 +163,10 @@ public: virtual ~mutation_holder() {} virtual bool store_hint(db::hints::manager& hm, gms::inet_address ep, tracing::trace_state_ptr tr_state) = 0; virtual future<> apply_locally(storage_proxy& sp, storage_proxy::clock_type::time_point timeout, - tracing::trace_state_ptr tr_state) = 0; + tracing::trace_state_ptr tr_state, db::per_partition_rate_limit::info rate_limit_info) = 0; virtual future<> apply_remotely(storage_proxy& sp, gms::inet_address ep, inet_address_vector_replica_set&& forward, storage_proxy::response_id_type response_id, storage_proxy::clock_type::time_point timeout, - tracing::trace_state_ptr tr_state) = 0; + tracing::trace_state_ptr tr_state, db::per_partition_rate_limit::info rate_limit_info) = 0; virtual bool is_shared() = 0; size_t size() const { return _size; @@ -207,17 +207,17 @@ public: } } virtual future<> apply_locally(storage_proxy& sp, storage_proxy::clock_type::time_point timeout, - tracing::trace_state_ptr tr_state) override { + tracing::trace_state_ptr tr_state, db::per_partition_rate_limit::info rate_limit_info) override { auto m = 
_mutations[utils::fb_utilities::get_broadcast_address()]; if (m) { tracing::trace(tr_state, "Executing a mutation locally"); - return sp.mutate_locally(_schema, *m, std::move(tr_state), db::commitlog::force_sync::no, timeout); + return sp.mutate_locally(_schema, *m, std::move(tr_state), db::commitlog::force_sync::no, timeout, rate_limit_info); } return make_ready_future<>(); } virtual future<> apply_remotely(storage_proxy& sp, gms::inet_address ep, inet_address_vector_replica_set&& forward, storage_proxy::response_id_type response_id, storage_proxy::clock_type::time_point timeout, - tracing::trace_state_ptr tr_state) override { + tracing::trace_state_ptr tr_state, db::per_partition_rate_limit::info rate_limit_info) override { auto m = _mutations[ep]; if (m) { tracing::trace(tr_state, "Sending a mutation to /{}", ep); @@ -225,7 +225,7 @@ public: netw::messaging_service::msg_addr{ep, 0}, timeout, *m, std::move(forward), utils::fb_utilities::get_broadcast_address(), this_shard_id(), response_id, tracing::make_trace_info(tr_state), - std::monostate()); // TODO: propagate the correct flag + rate_limit_info); } sp.got_response(response_id, ep, std::nullopt); return make_ready_future<>(); @@ -261,19 +261,19 @@ public: return hm.store_hint(ep, _schema, _mutation, tr_state); } virtual future<> apply_locally(storage_proxy& sp, storage_proxy::clock_type::time_point timeout, - tracing::trace_state_ptr tr_state) override { + tracing::trace_state_ptr tr_state, db::per_partition_rate_limit::info rate_limit_info) override { tracing::trace(tr_state, "Executing a mutation locally"); - return sp.mutate_locally(_schema, *_mutation, std::move(tr_state), db::commitlog::force_sync::no, timeout); + return sp.mutate_locally(_schema, *_mutation, std::move(tr_state), db::commitlog::force_sync::no, timeout, rate_limit_info); } virtual future<> apply_remotely(storage_proxy& sp, gms::inet_address ep, inet_address_vector_replica_set&& forward, storage_proxy::response_id_type response_id, 
storage_proxy::clock_type::time_point timeout, - tracing::trace_state_ptr tr_state) override { + tracing::trace_state_ptr tr_state, db::per_partition_rate_limit::info rate_limit_info) override { tracing::trace(tr_state, "Sending a mutation to /{}", ep); return ser::storage_proxy_rpc_verbs::send_mutation(&sp._messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *_mutation, std::move(forward), utils::fb_utilities::get_broadcast_address(), this_shard_id(), response_id, tracing::make_trace_info(tr_state), - std::monostate()); // TODO: propagate the correct flag + rate_limit_info); } virtual bool is_shared() override { return true; @@ -291,14 +291,14 @@ public: throw std::runtime_error("Attempted to store a hint for a hint"); } virtual future<> apply_locally(storage_proxy& sp, storage_proxy::clock_type::time_point timeout, - tracing::trace_state_ptr tr_state) override { + tracing::trace_state_ptr tr_state, db::per_partition_rate_limit::info rate_limit_info) override { // A hint will be sent to all relevant endpoints when the endpoint it was originally intended for // becomes unavailable - this might include the current node return sp.mutate_hint(_schema, *_mutation, std::move(tr_state), timeout); } virtual future<> apply_remotely(storage_proxy& sp, gms::inet_address ep, inet_address_vector_replica_set&& forward, storage_proxy::response_id_type response_id, storage_proxy::clock_type::time_point timeout, - tracing::trace_state_ptr tr_state) override { + tracing::trace_state_ptr tr_state, db::per_partition_rate_limit::info rate_limit_info) override { tracing::trace(tr_state, "Sending a hint to /{}", ep); return ser::storage_proxy_rpc_verbs::send_hint_mutation(&sp._messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *_mutation, @@ -320,14 +320,16 @@ public: return false; // CAS does not save hints yet } virtual future<> apply_locally(storage_proxy& sp, storage_proxy::clock_type::time_point timeout, - tracing::trace_state_ptr tr_state) override { + 
tracing::trace_state_ptr tr_state, db::per_partition_rate_limit::info rate_limit_info) override { tracing::trace(tr_state, "Executing a learn locally"); + // TODO: Enforce per partition rate limiting in paxos return paxos::paxos_state::learn(sp, _schema, *_proposal, timeout, tr_state); } virtual future<> apply_remotely(storage_proxy& sp, gms::inet_address ep, inet_address_vector_replica_set&& forward, storage_proxy::response_id_type response_id, storage_proxy::clock_type::time_point timeout, - tracing::trace_state_ptr tr_state) override { + tracing::trace_state_ptr tr_state, db::per_partition_rate_limit::info rate_limit_info) override { tracing::trace(tr_state, "Sending a learn to /{}", ep); + // TODO: Enforce per partition rate limiting in paxos return ser::storage_proxy_rpc_verbs::send_paxos_learn(&sp._messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *_proposal, std::move(forward), utils::fb_utilities::get_broadcast_address(), this_shard_id(), response_id, tracing::make_trace_info(tr_state)); @@ -609,12 +611,12 @@ public: return _mutation_holder->store_hint(hm, ep, tr_state); } future<> apply_locally(storage_proxy::clock_type::time_point timeout, tracing::trace_state_ptr tr_state) { - return _mutation_holder->apply_locally(*_proxy, timeout, std::move(tr_state)); + return _mutation_holder->apply_locally(*_proxy, timeout, std::move(tr_state), std::monostate()); // TODO: Pass the correct info } future<> apply_remotely(gms::inet_address ep, inet_address_vector_replica_set&& forward, storage_proxy::response_id_type response_id, storage_proxy::clock_type::time_point timeout, tracing::trace_state_ptr tr_state) { - return _mutation_holder->apply_remotely(*_proxy, ep, std::move(forward), response_id, timeout, std::move(tr_state)); + return _mutation_holder->apply_remotely(*_proxy, ep, std::move(forward), response_id, timeout, std::move(tr_state), std::monostate()); // TODO: Pass the correct info } const schema_ptr& get_schema() const { return 
_mutation_holder->schema(); From 2a7ba76c3e6c1d0da87146abfac2f89d95d3a9c8 Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Thu, 19 May 2022 09:19:13 +0200 Subject: [PATCH 14/29] storage_proxy: resultize return types of write handler creation path The mutate_prepare and create_write_response_handler(_helper) functions are modified to be able to return exceptions without throwing them. In an upcoming commit, create_write_response_handler will sometimes return rate limit exception, and it is preferable to return them without throwing. --- service/storage_proxy.cc | 44 ++++++++++++++++++++++------------------ service/storage_proxy.hh | 18 ++++++++-------- 2 files changed, 33 insertions(+), 29 deletions(-) diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index 651f3a0313..0bfca09b11 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -1439,7 +1439,7 @@ future> storage_proxy::response_wait(storage_proxy::response_id_type id return _response_handlers.find(id)->second; } -storage_proxy::response_id_type storage_proxy::create_write_response_handler(replica::keyspace& ks, db::consistency_level cl, db::write_type type, std::unique_ptr m, +result storage_proxy::create_write_response_handler(replica::keyspace& ks, db::consistency_level cl, db::write_type type, std::unique_ptr m, inet_address_vector_replica_set targets, const inet_address_vector_topology_change& pending_endpoints, inet_address_vector_topology_change dead_endpoints, tracing::trace_state_ptr tr_state, storage_proxy::write_stats& stats, service_permit permit) { @@ -1455,7 +1455,7 @@ storage_proxy::response_id_type storage_proxy::create_write_response_handler(rep } else { h = ::make_shared(shared_from_this(), ks, cl, type, std::move(m), std::move(targets), pending_endpoints, std::move(dead_endpoints), std::move(tr_state), stats, std::move(permit)); } - return register_response_handler(std::move(h)); + return bo::success(register_response_handler(std::move(h))); } 
seastar::metrics::label storage_proxy_stats::split_stats::datacenter_label("datacenter"); @@ -1944,7 +1944,7 @@ storage_proxy::mutate_counter_on_leader_and_replicate(const schema_ptr& s, froze }); } -storage_proxy::response_id_type +result storage_proxy::create_write_response_handler_helper(schema_ptr s, const dht::token& token, std::unique_ptr mh, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit) { auto keyspace_name = s->ks_name(); @@ -2009,19 +2009,19 @@ storage_proxy::create_write_response_handler_helper(schema_ptr s, const dht::tok * Since ordering is (maybe?) significant, we need to carry some info across from here * to the hint method below (dead nodes). */ -storage_proxy::response_id_type +result storage_proxy::create_write_response_handler(const mutation& m, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit) { return create_write_response_handler_helper(m.schema(), m.token(), std::make_unique(m), cl, type, tr_state, std::move(permit)); } -storage_proxy::response_id_type +result storage_proxy::create_write_response_handler(const hint_wrapper& h, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit) { return create_write_response_handler_helper(h.mut.schema(), h.mut.token(), std::make_unique(h.mut), cl, type, tr_state, std::move(permit)); } -storage_proxy::response_id_type +result storage_proxy::create_write_response_handler(const std::unordered_map>& m, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit) { inet_address_vector_replica_set endpoints; endpoints.reserve(m.size()); @@ -2037,7 +2037,7 @@ storage_proxy::create_write_response_handler(const std::unordered_map storage_proxy::create_write_response_handler(const std::tuple, schema_ptr, shared_ptr, dht::token>& meta, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr 
tr_state, service_permit permit) { auto& [commit, s, h, t] = meta; @@ -2046,7 +2046,7 @@ storage_proxy::create_write_response_handler(const std::tuple storage_proxy::create_write_response_handler(const std::tuple, schema_ptr, dht::token, inet_address_vector_replica_set>& meta, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit) { auto& [commit, s, token, endpoints] = meta; @@ -2087,20 +2087,24 @@ storage_proxy::hint_to_dead_endpoints(response_id_type id, db::consistency_level } template -future storage_proxy::mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, service_permit permit, CreateWriteHandler create_handler) { +future> storage_proxy::mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, service_permit permit, CreateWriteHandler create_handler) { // apply is used to convert exceptions to exceptional future return futurize_invoke([this] (Range&& mutations, db::consistency_level cl, db::write_type type, service_permit permit, CreateWriteHandler create_handler) { unique_response_handler_vector ids; ids.reserve(std::distance(std::begin(mutations), std::end(mutations))); for (auto& m : mutations) { - ids.emplace_back(*this, create_handler(m, cl, type, permit)); + auto r_handler = create_handler(m, cl, type, permit); + if (!r_handler) { + return make_ready_future>(std::move(r_handler).as_failure()); + } + ids.emplace_back(*this, std::move(r_handler).value()); } - return make_ready_future(std::move(ids)); + return make_ready_future>(std::move(ids)); }, std::forward(mutations), cl, type, std::move(permit), std::move(create_handler)); } template -future storage_proxy::mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit) { +future> storage_proxy::mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit 
permit) { return mutate_prepare<>(std::forward(mutations), cl, type, std::move(permit), [this, tr_state = std::move(tr_state)] (const typename std::decay_t::value_type& m, db::consistency_level cl, db::write_type type, service_permit permit) mutable { return create_write_response_handler(m, cl, type, tr_state, std::move(permit)); }); @@ -2399,11 +2403,11 @@ storage_proxy::mutate_internal(Range mutations, db::consistency_level cl, bool c utils::latency_counter lc; lc.start(); - return mutate_prepare(mutations, cl, type, tr_state, std::move(permit)).then([this, cl, timeout_opt, tracker = std::move(cdc_tracker), + return mutate_prepare(mutations, cl, type, tr_state, std::move(permit)).then(utils::result_wrap([this, cl, timeout_opt, tracker = std::move(cdc_tracker), tr_state] (storage_proxy::unique_response_handler_vector ids) mutable { register_cdc_operation_result_tracker(ids, tracker); return mutate_begin(std::move(ids), cl, tr_state, timeout_opt); - }).then_wrapped([this, p = shared_from_this(), lc, tr_state] (future> f) mutable { + })).then_wrapped([this, p = shared_from_this(), lc, tr_state] (future> f) mutable { return p->mutate_end(std::move(f), lc, get_stats(), std::move(tr_state)); }); } @@ -2490,10 +2494,10 @@ storage_proxy::mutate_atomically_result(std::vector mutations, db::con return _p.mutate_prepare<>(std::array{std::move(m)}, cl, db::write_type::BATCH_LOG, _permit, [this] (const mutation& m, db::consistency_level cl, db::write_type type, service_permit permit) { auto& ks = _p._db.local().find_keyspace(m.schema()->ks_name()); return _p.create_write_response_handler(ks, cl, type, std::make_unique(m), _batchlog_endpoints, {}, {}, _trace_state, _stats, std::move(permit)); - }).then([this, cl] (unique_response_handler_vector ids) { + }).then(utils::result_wrap([this, cl] (unique_response_handler_vector ids) { _p.register_cdc_operation_result_tracker(ids, _cdc_tracker); return _p.mutate_begin(std::move(ids), cl, _trace_state, _timeout); - }); + })); } 
future> sync_write_to_batchlog() { auto m = _p.get_batchlog_mutation_for(_mutations, _batch_uuid, netw::messaging_service::current_version, db_clock::now()); @@ -2522,7 +2526,7 @@ storage_proxy::mutate_atomically_result(std::vector mutations, db::con }; future> run() { - return _p.mutate_prepare(_mutations, _cl, db::write_type::BATCH, _trace_state, _permit).then([this] (unique_response_handler_vector ids) { + return _p.mutate_prepare(_mutations, _cl, db::write_type::BATCH, _trace_state, _permit).then(utils::result_wrap([this] (unique_response_handler_vector ids) { return sync_write_to_batchlog().then(utils::result_wrap([this, ids = std::move(ids)] () mutable { tracing::trace(_trace_state, "Sending batch mutations"); _p.register_cdc_operation_result_tracker(ids, _cdc_tracker); @@ -2530,7 +2534,7 @@ storage_proxy::mutate_atomically_result(std::vector mutations, db::con })).then(utils::result_wrap([this] { return utils::then_ok_result>(async_remove_from_batchlog()); })); - }); + })); } }; @@ -2632,9 +2636,9 @@ future<> storage_proxy::send_to_endpoint( tr_state, stats, std::move(permit)); - }).then([this, cl, tr_state = std::move(tr_state), timeout = std::move(timeout)] (unique_response_handler_vector ids) mutable { + }).then(utils::result_wrap([this, cl, tr_state = std::move(tr_state), timeout = std::move(timeout)] (unique_response_handler_vector ids) mutable { return mutate_begin(std::move(ids), cl, std::move(tr_state), std::move(timeout)); - }).then_wrapped([p = shared_from_this(), lc, &stats] (future> f) { + })).then_wrapped([p = shared_from_this(), lc, &stats] (future> f) { return p->mutate_end(std::move(f), lc, stats, nullptr).then(utils::result_into_future>); }); } diff --git a/service/storage_proxy.hh b/service/storage_proxy.hh index 4cb9c6f1c7..9901c9eeb2 100644 --- a/service/storage_proxy.hh +++ b/service/storage_proxy.hh @@ -310,17 +310,17 @@ private: void got_failure_response(response_id_type id, gms::inet_address from, size_t count, std::optional backlog, 
error err, std::optional msg); future> response_wait(response_id_type id, clock_type::time_point timeout); ::shared_ptr& get_write_response_handler(storage_proxy::response_id_type id); - response_id_type create_write_response_handler_helper(schema_ptr s, const dht::token& token, + result create_write_response_handler_helper(schema_ptr s, const dht::token& token, std::unique_ptr mh, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit); - response_id_type create_write_response_handler(replica::keyspace& ks, db::consistency_level cl, db::write_type type, std::unique_ptr m, inet_address_vector_replica_set targets, + result create_write_response_handler(replica::keyspace& ks, db::consistency_level cl, db::write_type type, std::unique_ptr m, inet_address_vector_replica_set targets, const inet_address_vector_topology_change& pending_endpoints, inet_address_vector_topology_change, tracing::trace_state_ptr tr_state, storage_proxy::write_stats& stats, service_permit permit); - response_id_type create_write_response_handler(const mutation&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit); - response_id_type create_write_response_handler(const hint_wrapper&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit); - response_id_type create_write_response_handler(const std::unordered_map>&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit); - response_id_type create_write_response_handler(const std::tuple, schema_ptr, shared_ptr, dht::token>& proposal, + result create_write_response_handler(const mutation&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit); + result create_write_response_handler(const hint_wrapper&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit 
permit); + result create_write_response_handler(const std::unordered_map>&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit); + result create_write_response_handler(const std::tuple, schema_ptr, shared_ptr, dht::token>& proposal, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit); - response_id_type create_write_response_handler(const std::tuple, schema_ptr, dht::token, inet_address_vector_replica_set>& meta, + result create_write_response_handler(const std::tuple, schema_ptr, dht::token, inet_address_vector_replica_set>& meta, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit); void register_cdc_operation_result_tracker(const storage_proxy::unique_response_handler_vector& ids, lw_shared_ptr tracker); void send_to_live_endpoints(response_id_type response_id, clock_type::time_point timeout); @@ -379,9 +379,9 @@ private: db::consistency_level cl, coordinator_query_options optional_params); template - future mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, service_permit permit, CreateWriteHandler handler); + future> mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, service_permit permit, CreateWriteHandler handler); template - future mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit); + future> mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit); future> mutate_begin(unique_response_handler_vector ids, db::consistency_level cl, tracing::trace_state_ptr trace_state, std::optional timeout_opt = { }); future> mutate_end(future> mutate_result, utils::latency_counter, write_stats& stats, tracing::trace_state_ptr trace_state); future> schedule_repair(std::unordered_map>> diffs, 
db::consistency_level cl, tracing::trace_state_ptr trace_state, service_permit permit); From 76e95e7ae8e618ce506fed09ae87822936434887 Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Wed, 4 May 2022 00:37:56 +0200 Subject: [PATCH 15/29] storage_proxy: choose the right per partition rate limit info in write handler Now, write response handler calculates the appropriate rate limit info parameter and passes it to the mutation holder. --- replica/database.cc | 17 +++++ replica/database.hh | 15 ++++ service/storage_proxy.cc | 143 +++++++++++++++++++++++++++++---------- service/storage_proxy.hh | 14 ++-- 4 files changed, 147 insertions(+), 42 deletions(-) diff --git a/replica/database.cc b/replica/database.cc index 9cfd11e540..2a5f4929eb 100644 --- a/replica/database.cc +++ b/replica/database.cc @@ -1320,6 +1320,23 @@ request_class classify_request(const database_config& _dbcfg) { } // anonymous namespace +static bool can_apply_per_partition_rate_limit(const schema& s, const database_config& dbcfg, db::operation_type op_type) { + return s.per_partition_rate_limit_options().get_max_ops_per_second(op_type).has_value() + && classify_request(dbcfg) == request_class::user; +} + +bool database::can_apply_per_partition_rate_limit(const schema& s, db::operation_type op_type) const { + return replica::can_apply_per_partition_rate_limit(s, _dbcfg, op_type); +} + +std::optional database::account_coordinator_operation_to_rate_limit(table& tbl, const dht::token& token, + db::per_partition_rate_limit::account_and_enforce account_and_enforce_info, + db::operation_type op_type) { + + std::optional table_limit = tbl.schema()->per_partition_rate_limit_options().get_max_ops_per_second(op_type); + db::rate_limiter::label& lbl = tbl.get_rate_limiter_label_for_op_type(op_type); + return _rate_limiter.account_operation(lbl, dht::token::to_int64(token), *table_limit, account_and_enforce_info); +} static db::rate_limiter::can_proceed account_singular_ranges_to_rate_limit( db::rate_limiter& 
limiter, column_family& cf, diff --git a/replica/database.hh b/replica/database.hh index 0496c80a47..fc7dd7556d 100644 --- a/replica/database.hh +++ b/replica/database.hh @@ -68,6 +68,7 @@ #include "sstables/generation_type.hh" #include "db/rate_limiter.hh" #include "db/per_partition_rate_limit_info.hh" +#include "db/operation_type.hh" class cell_locker; class cell_locker_stats; @@ -1512,6 +1513,20 @@ public: future<> stop(); future<> close_tables(table_kind kind_to_close); + /// Checks whether per-partition rate limit can be applied to the operation or not. + bool can_apply_per_partition_rate_limit(const schema& s, db::operation_type op_type) const; + + /// Tries to account given operation to the rate limit when the coordinator is a replica. + /// This function can be called ONLY when rate limiting can be applied to the operation (see `can_apply_per_partition_rate_limit`) + /// AND the current node/shard is a replica for the given operation. + /// + /// nullopt -> the decision should be delegated to replicas + /// can_proceed::no -> operation should be rejected + /// can_proceed::yes -> operation should be accepted + std::optional account_coordinator_operation_to_rate_limit(table& tbl, const dht::token& token, + db::per_partition_rate_limit::account_and_enforce account_and_enforce_info, + db::operation_type op_type); + future, cache_temperature>> query(schema_ptr, const query::read_command& cmd, query::result_options opts, const dht::partition_range_vector& ranges, tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout, db::per_partition_rate_limit::info rate_limit_info = std::monostate{}); diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index 0bfca09b11..8f08663f94 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -8,6 +8,7 @@ * SPDX-License-Identifier: (AGPL-3.0-or-later and Apache-2.0) */ +#include #include #include #include "partition_range_compat.hh" @@ -155,6 +156,59 @@ unsigned 
storage_proxy::cas_shard(const schema& s, dht::token token) { return dht::shard_of(s, token); } +static uint32_t random_variable_for_rate_limit() { + static thread_local std::default_random_engine re{std::random_device{}()}; + static thread_local std::uniform_int_distribution dist(0, 0xFFFFFFFF); + return dist(re); +} + +static result choose_rate_limit_info( + replica::database& db, + bool coordinator_in_replica_set, + db::operation_type op_type, + const schema_ptr& s, + const dht::token& token, + tracing::trace_state_ptr tr_state) { + + db::per_partition_rate_limit::account_and_enforce enforce_info{ + .random_variable = random_variable_for_rate_limit(), + }; + if (coordinator_in_replica_set && dht::shard_of(*s, token) == this_shard_id()) { + auto& cf = db.find_column_family(s); + auto decision = db.account_coordinator_operation_to_rate_limit(cf, token, enforce_info, op_type); + if (decision) { + if (*decision == db::rate_limiter::can_proceed::yes) { + // The coordinator has decided to accept the operation. + // Tell other replicas only to account, but not reject + slogger.trace("Per-partition rate limiting: coordinator accepted"); + tracing::trace(tr_state, "Per-partition rate limiting: coordinator accepted"); + return db::per_partition_rate_limit::account_only{}; + } else { + // The coordinator has decided to reject, abort the operation + slogger.trace("Per-partition rate limiting: coordinator rejected"); + tracing::trace(tr_state, "Per-partition rate limiting: coordinator rejected"); + return coordinator_exception_container(exceptions::rate_limit_exception(s->ks_name(), s->cf_name(), op_type, true)); + } + } + } + + // The coordinator is not a replica. The decision whether to accept + // or reject is left for replicas. 
+ slogger.trace("Per-partition rate limiting: replicas will decide"); + tracing::trace(tr_state, "Per-partition rate limiting: replicas will decide"); + return enforce_info; +} + +static inline db::per_partition_rate_limit::info adjust_rate_limit_for_local_operation( + const db::per_partition_rate_limit::info& info) { + if (std::holds_alternative(info)) { + // In this case, the coordinator has already accounted the operation, + // so don't do it again on this shard + return std::monostate(); + } + return info; +} + class mutation_holder { protected: size_t _size = 0; @@ -382,6 +436,7 @@ protected: lw_shared_ptr _cdc_operation_result_tracker; timer _expire_timer; service_permit _permit; // holds admission permit until operation completes + db::per_partition_rate_limit::info _rate_limit_info; protected: virtual bool waited_for(gms::inet_address from) = 0; @@ -395,9 +450,11 @@ protected: public: abstract_write_response_handler(shared_ptr p, replica::keyspace& ks, db::consistency_level cl, db::write_type type, std::unique_ptr mh, inet_address_vector_replica_set targets, tracing::trace_state_ptr trace_state, - storage_proxy::write_stats& stats, service_permit permit, size_t pending_endpoints = 0, inet_address_vector_topology_change dead_endpoints = {}) + storage_proxy::write_stats& stats, service_permit permit, db::per_partition_rate_limit::info rate_limit_info, size_t pending_endpoints = 0, + inet_address_vector_topology_change dead_endpoints = {}) : _id(p->get_next_response_id()), _proxy(std::move(p)), _trace_state(trace_state), _cl(cl), _type(type), _mutation_holder(std::move(mh)), _targets(std::move(targets)), - _dead_endpoints(std::move(dead_endpoints)), _stats(stats), _expire_timer([this] { timeout_cb(); }), _permit(std::move(permit)) { + _dead_endpoints(std::move(dead_endpoints)), _stats(stats), _expire_timer([this] { timeout_cb(); }), _permit(std::move(permit)), + _rate_limit_info(rate_limit_info) { // original comment from cassandra: // during bootstrap, 
include pending endpoints in the count // or we may fail the consistency level guarantees (see #833, #8058) @@ -611,12 +668,12 @@ public: return _mutation_holder->store_hint(hm, ep, tr_state); } future<> apply_locally(storage_proxy::clock_type::time_point timeout, tracing::trace_state_ptr tr_state) { - return _mutation_holder->apply_locally(*_proxy, timeout, std::move(tr_state), std::monostate()); // TODO: Pass the correct info + return _mutation_holder->apply_locally(*_proxy, timeout, std::move(tr_state), adjust_rate_limit_for_local_operation(_rate_limit_info)); } future<> apply_remotely(gms::inet_address ep, inet_address_vector_replica_set&& forward, storage_proxy::response_id_type response_id, storage_proxy::clock_type::time_point timeout, tracing::trace_state_ptr tr_state) { - return _mutation_holder->apply_remotely(*_proxy, ep, std::move(forward), response_id, timeout, std::move(tr_state), std::monostate()); // TODO: Pass the correct info + return _mutation_holder->apply_remotely(*_proxy, ep, std::move(forward), response_id, timeout, std::move(tr_state), _rate_limit_info); } const schema_ptr& get_schema() const { return _mutation_holder->schema(); @@ -648,9 +705,9 @@ public: datacenter_write_response_handler(shared_ptr p, replica::keyspace& ks, db::consistency_level cl, db::write_type type, std::unique_ptr mh, inet_address_vector_replica_set targets, const inet_address_vector_topology_change& pending_endpoints, inet_address_vector_topology_change dead_endpoints, tracing::trace_state_ptr tr_state, - storage_proxy::write_stats& stats, service_permit permit) : + storage_proxy::write_stats& stats, service_permit permit, db::per_partition_rate_limit::info rate_limit_info) : abstract_write_response_handler(std::move(p), ks, cl, type, std::move(mh), - std::move(targets), std::move(tr_state), stats, std::move(permit), db::count_local_endpoints(pending_endpoints), std::move(dead_endpoints)) { + std::move(targets), std::move(tr_state), stats, std::move(permit), 
rate_limit_info, db::count_local_endpoints(pending_endpoints), std::move(dead_endpoints)) { _total_endpoints = db::count_local_endpoints(_targets); } }; @@ -663,9 +720,9 @@ public: write_response_handler(shared_ptr p, replica::keyspace& ks, db::consistency_level cl, db::write_type type, std::unique_ptr mh, inet_address_vector_replica_set targets, const inet_address_vector_topology_change& pending_endpoints, inet_address_vector_topology_change dead_endpoints, tracing::trace_state_ptr tr_state, - storage_proxy::write_stats& stats, service_permit permit) : + storage_proxy::write_stats& stats, service_permit permit, db::per_partition_rate_limit::info rate_limit_info) : abstract_write_response_handler(std::move(p), ks, cl, type, std::move(mh), - std::move(targets), std::move(tr_state), stats, std::move(permit), pending_endpoints.size(), std::move(dead_endpoints)) { + std::move(targets), std::move(tr_state), stats, std::move(permit), rate_limit_info, pending_endpoints.size(), std::move(dead_endpoints)) { _total_endpoints = _targets.size(); } }; @@ -675,9 +732,9 @@ public: view_update_write_response_handler(shared_ptr p, replica::keyspace& ks, db::consistency_level cl, std::unique_ptr mh, inet_address_vector_replica_set targets, const inet_address_vector_topology_change& pending_endpoints, inet_address_vector_topology_change dead_endpoints, tracing::trace_state_ptr tr_state, - storage_proxy::write_stats& stats, service_permit permit): + storage_proxy::write_stats& stats, service_permit permit, db::per_partition_rate_limit::info rate_limit_info): write_response_handler(p, ks, cl, db::write_type::VIEW, std::move(mh), - std::move(targets), pending_endpoints, std::move(dead_endpoints), std::move(tr_state), stats, std::move(permit)) { + std::move(targets), pending_endpoints, std::move(dead_endpoints), std::move(tr_state), stats, std::move(permit), rate_limit_info) { register_in_intrusive_list(*p); } ~view_update_write_response_handler(); @@ -754,8 +811,9 @@ class 
datacenter_sync_write_response_handler : public abstract_write_response_ha public: datacenter_sync_write_response_handler(shared_ptr p, replica::keyspace& ks, db::consistency_level cl, db::write_type type, std::unique_ptr mh, inet_address_vector_replica_set targets, const inet_address_vector_topology_change& pending_endpoints, - inet_address_vector_topology_change dead_endpoints, tracing::trace_state_ptr tr_state, storage_proxy::write_stats& stats, service_permit permit) : - abstract_write_response_handler(std::move(p), ks, cl, type, std::move(mh), targets, std::move(tr_state), stats, std::move(permit), 0, dead_endpoints) { + inet_address_vector_topology_change dead_endpoints, tracing::trace_state_ptr tr_state, storage_proxy::write_stats& stats, service_permit permit, + db::per_partition_rate_limit::info rate_limit_info) : + abstract_write_response_handler(std::move(p), ks, cl, type, std::move(mh), targets, std::move(tr_state), stats, std::move(permit), rate_limit_info, 0, dead_endpoints) { auto& snitch_ptr = locator::i_endpoint_snitch::get_local_snitch_ptr(); for (auto& target : targets) { @@ -1441,19 +1499,19 @@ future> storage_proxy::response_wait(storage_proxy::response_id_type id result storage_proxy::create_write_response_handler(replica::keyspace& ks, db::consistency_level cl, db::write_type type, std::unique_ptr m, inet_address_vector_replica_set targets, const inet_address_vector_topology_change& pending_endpoints, inet_address_vector_topology_change dead_endpoints, tracing::trace_state_ptr tr_state, - storage_proxy::write_stats& stats, service_permit permit) + storage_proxy::write_stats& stats, service_permit permit, db::per_partition_rate_limit::info rate_limit_info) { shared_ptr h; auto& rs = ks.get_replication_strategy(); if (db::is_datacenter_local(cl)) { - h = ::make_shared(shared_from_this(), ks, cl, type, std::move(m), std::move(targets), pending_endpoints, std::move(dead_endpoints), std::move(tr_state), stats, std::move(permit)); + h = 
::make_shared(shared_from_this(), ks, cl, type, std::move(m), std::move(targets), pending_endpoints, std::move(dead_endpoints), std::move(tr_state), stats, std::move(permit), rate_limit_info); } else if (cl == db::consistency_level::EACH_QUORUM && rs.get_type() == locator::replication_strategy_type::network_topology){ - h = ::make_shared(shared_from_this(), ks, cl, type, std::move(m), std::move(targets), pending_endpoints, std::move(dead_endpoints), std::move(tr_state), stats, std::move(permit)); + h = ::make_shared(shared_from_this(), ks, cl, type, std::move(m), std::move(targets), pending_endpoints, std::move(dead_endpoints), std::move(tr_state), stats, std::move(permit), rate_limit_info); } else if (type == db::write_type::VIEW) { - h = ::make_shared(shared_from_this(), ks, cl, std::move(m), std::move(targets), pending_endpoints, std::move(dead_endpoints), std::move(tr_state), stats, std::move(permit)); + h = ::make_shared(shared_from_this(), ks, cl, std::move(m), std::move(targets), pending_endpoints, std::move(dead_endpoints), std::move(tr_state), stats, std::move(permit), rate_limit_info); } else { - h = ::make_shared(shared_from_this(), ks, cl, type, std::move(m), std::move(targets), pending_endpoints, std::move(dead_endpoints), std::move(tr_state), stats, std::move(permit)); + h = ::make_shared(shared_from_this(), ks, cl, type, std::move(m), std::move(targets), pending_endpoints, std::move(dead_endpoints), std::move(tr_state), stats, std::move(permit), rate_limit_info); } return bo::success(register_response_handler(std::move(h))); } @@ -1946,7 +2004,7 @@ storage_proxy::mutate_counter_on_leader_and_replicate(const schema_ptr& s, froze result storage_proxy::create_write_response_handler_helper(schema_ptr s, const dht::token& token, std::unique_ptr mh, - db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit) { + db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit 
permit, db::allow_per_partition_rate_limit allow_limit) { auto keyspace_name = s->ks_name(); replica::keyspace& ks = _db.local().find_keyspace(keyspace_name); auto erm = ks.get_effective_replication_map(); @@ -1956,14 +2014,15 @@ storage_proxy::create_write_response_handler_helper(schema_ptr s, const dht::tok slogger.trace("creating write handler for token: {} natural: {} pending: {}", token, natural_endpoints, pending_endpoints); tracing::trace(tr_state, "Creating write handler for token: {} natural: {} pending: {}", token, natural_endpoints ,pending_endpoints); + const bool coordinator_in_replica_set = std::find(natural_endpoints.begin(), natural_endpoints.end(), + utils::fb_utilities::get_broadcast_address()) != natural_endpoints.end(); + // Check if this node, which is serving as a coordinator for // the mutation, is also a replica for the partition being // changed. Mutations sent by drivers unaware of token // distribution create a lot of network noise and thus should be // accounted in the metrics. 
- if (std::find(natural_endpoints.begin(), natural_endpoints.end(), - utils::fb_utilities::get_broadcast_address()) == natural_endpoints.end()) { - + if (!coordinator_in_replica_set) { get_stats().writes_coordinator_outside_replica_set++; } @@ -1993,13 +2052,24 @@ storage_proxy::create_write_response_handler_helper(schema_ptr s, const dht::tok std::partition_copy(all.begin(), all.end(), std::back_inserter(live_endpoints), std::back_inserter(dead_endpoints), std::bind_front(std::mem_fn(&gms::gossiper::is_alive), &_gossiper)); + db::per_partition_rate_limit::info rate_limit_info; + if (allow_limit && _db.local().can_apply_per_partition_rate_limit(*s, db::operation_type::write)) { + auto r_rate_limit_info = choose_rate_limit_info(_db.local(), coordinator_in_replica_set, db::operation_type::write, s, token, tr_state); + if (!r_rate_limit_info) { + return std::move(r_rate_limit_info).as_failure(); + } + rate_limit_info = r_rate_limit_info.value(); + } else { + slogger.trace("Operation is not rate limited"); + } + slogger.trace("creating write handler with live: {} dead: {}", live_endpoints, dead_endpoints); tracing::trace(tr_state, "Creating write handler with live: {} dead: {}", live_endpoints, dead_endpoints); db::assure_sufficient_live_nodes(cl, ks, live_endpoints, pending_endpoints); return create_write_response_handler(ks, cl, type, std::move(mh), std::move(live_endpoints), pending_endpoints, - std::move(dead_endpoints), std::move(tr_state), get_stats(), std::move(permit)); + std::move(dead_endpoints), std::move(tr_state), get_stats(), std::move(permit), rate_limit_info); } /** @@ -2010,19 +2080,19 @@ storage_proxy::create_write_response_handler_helper(schema_ptr s, const dht::tok * to the hint method below (dead nodes). 
*/ result -storage_proxy::create_write_response_handler(const mutation& m, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit) { +storage_proxy::create_write_response_handler(const mutation& m, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit) { return create_write_response_handler_helper(m.schema(), m.token(), std::make_unique(m), cl, type, tr_state, - std::move(permit)); + std::move(permit), allow_limit); } result -storage_proxy::create_write_response_handler(const hint_wrapper& h, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit) { +storage_proxy::create_write_response_handler(const hint_wrapper& h, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit) { return create_write_response_handler_helper(h.mut.schema(), h.mut.token(), std::make_unique(h.mut), cl, type, tr_state, - std::move(permit)); + std::move(permit), allow_limit); } result -storage_proxy::create_write_response_handler(const std::unordered_map>& m, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit) { +storage_proxy::create_write_response_handler(const std::unordered_map>& m, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit) { inet_address_vector_replica_set endpoints; endpoints.reserve(m.size()); boost::copy(m | boost::adaptors::map_keys, std::inserter(endpoints, endpoints.begin())); @@ -2034,21 +2104,22 @@ storage_proxy::create_write_response_handler(const std::unordered_mapschema()->ks_name(); replica::keyspace& ks = _db.local().find_keyspace(keyspace_name); - return create_write_response_handler(ks, cl, type, std::move(mh), 
std::move(endpoints), inet_address_vector_topology_change(), inet_address_vector_topology_change(), std::move(tr_state), get_stats(), std::move(permit)); + // No rate limiting for read repair + return create_write_response_handler(ks, cl, type, std::move(mh), std::move(endpoints), inet_address_vector_topology_change(), inet_address_vector_topology_change(), std::move(tr_state), get_stats(), std::move(permit), std::monostate()); } result storage_proxy::create_write_response_handler(const std::tuple, schema_ptr, shared_ptr, dht::token>& meta, - db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit) { + db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit) { auto& [commit, s, h, t] = meta; return create_write_response_handler_helper(s, t, std::make_unique(std::move(commit), s, std::move(h)), cl, - db::write_type::CAS, tr_state, std::move(permit)); + db::write_type::CAS, tr_state, std::move(permit), allow_limit); } result storage_proxy::create_write_response_handler(const std::tuple, schema_ptr, dht::token, inet_address_vector_replica_set>& meta, - db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit) { + db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit) { auto& [commit, s, token, endpoints] = meta; slogger.trace("creating write handler for paxos repair token: {} endpoint: {}", token, endpoints); @@ -2057,8 +2128,9 @@ storage_proxy::create_write_response_handler(const std::tupleks_name(); replica::keyspace& ks = _db.local().find_keyspace(keyspace_name); + // No rate limiting for paxos (yet) return create_write_response_handler(ks, cl, db::write_type::CAS, std::make_unique(std::move(commit), s, nullptr), std::move(endpoints), - inet_address_vector_topology_change(), 
inet_address_vector_topology_change(), std::move(tr_state), get_stats(), std::move(permit)); + inet_address_vector_topology_change(), inet_address_vector_topology_change(), std::move(tr_state), get_stats(), std::move(permit), std::monostate()); } void storage_proxy::register_cdc_operation_result_tracker(const storage_proxy::unique_response_handler_vector& ids, lw_shared_ptr tracker) { @@ -2106,7 +2178,7 @@ future> storage_proxy::mut template future> storage_proxy::mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit) { return mutate_prepare<>(std::forward(mutations), cl, type, std::move(permit), [this, tr_state = std::move(tr_state)] (const typename std::decay_t::value_type& m, db::consistency_level cl, db::write_type type, service_permit permit) mutable { - return create_write_response_handler(m, cl, type, tr_state, std::move(permit)); + return create_write_response_handler(m, cl, type, tr_state, std::move(permit), /* TODO */ db::allow_per_partition_rate_limit::no); }); } @@ -2493,7 +2565,7 @@ storage_proxy::mutate_atomically_result(std::vector mutations, db::con future> send_batchlog_mutation(mutation m, db::consistency_level cl = db::consistency_level::ONE) { return _p.mutate_prepare<>(std::array{std::move(m)}, cl, db::write_type::BATCH_LOG, _permit, [this] (const mutation& m, db::consistency_level cl, db::write_type type, service_permit permit) { auto& ks = _p._db.local().find_keyspace(m.schema()->ks_name()); - return _p.create_write_response_handler(ks, cl, type, std::make_unique(m), _batchlog_endpoints, {}, {}, _trace_state, _stats, std::move(permit)); + return _p.create_write_response_handler(ks, cl, type, std::make_unique(m), _batchlog_endpoints, {}, {}, _trace_state, _stats, std::move(permit), std::monostate()); }).then(utils::result_wrap([this, cl] (unique_response_handler_vector ids) { _p.register_cdc_operation_result_tracker(ids, _cdc_tracker); return 
_p.mutate_begin(std::move(ids), cl, _trace_state, _timeout); @@ -2635,7 +2707,8 @@ future<> storage_proxy::send_to_endpoint( std::move(dead_endpoints), tr_state, stats, - std::move(permit)); + std::move(permit), + std::monostate()); // TODO: Pass the correct enforcement type }).then(utils::result_wrap([this, cl, tr_state = std::move(tr_state), timeout = std::move(timeout)] (unique_response_handler_vector ids) mutable { return mutate_begin(std::move(ids), cl, std::move(tr_state), std::move(timeout)); })).then_wrapped([p = shared_from_this(), lc, &stats] (future> f) { diff --git a/service/storage_proxy.hh b/service/storage_proxy.hh index 9901c9eeb2..af0c9e805e 100644 --- a/service/storage_proxy.hh +++ b/service/storage_proxy.hh @@ -312,16 +312,16 @@ private: ::shared_ptr& get_write_response_handler(storage_proxy::response_id_type id); result create_write_response_handler_helper(schema_ptr s, const dht::token& token, std::unique_ptr mh, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, - service_permit permit); + service_permit permit, db::allow_per_partition_rate_limit allow_limit); result create_write_response_handler(replica::keyspace& ks, db::consistency_level cl, db::write_type type, std::unique_ptr m, inet_address_vector_replica_set targets, - const inet_address_vector_topology_change& pending_endpoints, inet_address_vector_topology_change, tracing::trace_state_ptr tr_state, storage_proxy::write_stats& stats, service_permit permit); - result create_write_response_handler(const mutation&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit); - result create_write_response_handler(const hint_wrapper&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit); - result create_write_response_handler(const std::unordered_map>&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit); + 
const inet_address_vector_topology_change& pending_endpoints, inet_address_vector_topology_change, tracing::trace_state_ptr tr_state, storage_proxy::write_stats& stats, service_permit permit, db::per_partition_rate_limit::info rate_limit_info); + result create_write_response_handler(const mutation&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit); + result create_write_response_handler(const hint_wrapper&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit); + result create_write_response_handler(const std::unordered_map>&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit); result create_write_response_handler(const std::tuple, schema_ptr, shared_ptr, dht::token>& proposal, - db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit); + db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit); result create_write_response_handler(const std::tuple, schema_ptr, dht::token, inet_address_vector_replica_set>& meta, - db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit); + db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit); void register_cdc_operation_result_tracker(const storage_proxy::unique_response_handler_vector& ids, lw_shared_ptr tracker); void send_to_live_endpoints(response_id_type response_id, clock_type::time_point timeout); template From 1e4e92ed8b48a2f9bc3e985ff6941316febed240 Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Wed, 4 May 2022 00:54:25 +0200 
Subject: [PATCH 16/29] storage_proxy: add allow rate limit flag to mutate_prepare Now, mutate_prepare accepts a flag which decides whether a given write should be rate limited or not. --- service/storage_proxy.cc | 10 +++++----- service/storage_proxy.hh | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index 8f08663f94..94344c883a 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -2176,9 +2176,9 @@ future> storage_proxy::mut } template -future> storage_proxy::mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit) { - return mutate_prepare<>(std::forward(mutations), cl, type, std::move(permit), [this, tr_state = std::move(tr_state)] (const typename std::decay_t::value_type& m, db::consistency_level cl, db::write_type type, service_permit permit) mutable { - return create_write_response_handler(m, cl, type, tr_state, std::move(permit), /* TODO */ db::allow_per_partition_rate_limit::no); +future> storage_proxy::mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit) { + return mutate_prepare<>(std::forward(mutations), cl, type, std::move(permit), [this, tr_state = std::move(tr_state), allow_limit] (const typename std::decay_t::value_type& m, db::consistency_level cl, db::write_type type, service_permit permit) mutable { + return create_write_response_handler(m, cl, type, tr_state, std::move(permit), allow_limit); }); } @@ -2475,7 +2475,7 @@ storage_proxy::mutate_internal(Range mutations, db::consistency_level cl, bool c utils::latency_counter lc; lc.start(); - return mutate_prepare(mutations, cl, type, tr_state, 
std::move(permit), /* TODO */ db::allow_per_partition_rate_limit::no).then(utils::result_wrap([this, cl, timeout_opt, tracker = std::move(cdc_tracker), tr_state] (storage_proxy::unique_response_handler_vector ids) mutable { register_cdc_operation_result_tracker(ids, tracker); return mutate_begin(std::move(ids), cl, tr_state, timeout_opt); @@ -2598,7 +2598,7 @@ storage_proxy::mutate_atomically_result(std::vector mutations, db::con }; future> run() { - return _p.mutate_prepare(_mutations, _cl, db::write_type::BATCH, _trace_state, _permit).then(utils::result_wrap([this] (unique_response_handler_vector ids) { + return _p.mutate_prepare(_mutations, _cl, db::write_type::BATCH, _trace_state, _permit, db::allow_per_partition_rate_limit::no).then(utils::result_wrap([this] (unique_response_handler_vector ids) { return sync_write_to_batchlog().then(utils::result_wrap([this, ids = std::move(ids)] () mutable { tracing::trace(_trace_state, "Sending batch mutations"); _p.register_cdc_operation_result_tracker(ids, _cdc_tracker); diff --git a/service/storage_proxy.hh b/service/storage_proxy.hh index af0c9e805e..ab5291eb96 100644 --- a/service/storage_proxy.hh +++ b/service/storage_proxy.hh @@ -381,7 +381,7 @@ private: template future> mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, service_permit permit, CreateWriteHandler handler); template - future> mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit); + future> mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit); future> mutate_begin(unique_response_handler_vector ids, db::consistency_level cl, tracing::trace_state_ptr trace_state, std::optional timeout_opt = { }); future> mutate_end(future> mutate_result, utils::latency_counter, write_stats& stats, tracing::trace_state_ptr 
trace_state); future> schedule_repair(std::unordered_map>> diffs, db::consistency_level cl, tracing::trace_state_ptr trace_state, service_permit permit); From 1f65c4e001f97978966b0c945b3063f9167989a8 Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Wed, 4 May 2022 00:55:21 +0200 Subject: [PATCH 17/29] storage_proxy: add allow rate limit flag to mutate_internal Now, mutate_internal accepts a flag which decides whether the write should be rate limited or not. --- service/storage_proxy.cc | 5 +++-- service/storage_proxy.hh | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index 94344c883a..7571838e31 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -2458,7 +2458,8 @@ future<> storage_proxy::replicate_counter_from_leader(mutation m, db::consistenc template future> storage_proxy::mutate_internal(Range mutations, db::consistency_level cl, bool counters, tracing::trace_state_ptr tr_state, service_permit permit, - std::optional timeout_opt, lw_shared_ptr cdc_tracker) { + std::optional timeout_opt, lw_shared_ptr cdc_tracker, + db::allow_per_partition_rate_limit allow_limit) { if (boost::empty(mutations)) { return make_ready_future>(bo::success()); } @@ -2475,7 +2476,7 @@ storage_proxy::mutate_internal(Range mutations, db::consistency_level cl, bool c utils::latency_counter lc; lc.start(); - return mutate_prepare(mutations, cl, type, tr_state, std::move(permit), /* TODO */ db::allow_per_partition_rate_limit::no).then(utils::result_wrap([this, cl, timeout_opt, tracker = std::move(cdc_tracker), + return mutate_prepare(mutations, cl, type, tr_state, std::move(permit), allow_limit).then(utils::result_wrap([this, cl, timeout_opt, tracker = std::move(cdc_tracker), tr_state] (storage_proxy::unique_response_handler_vector ids) mutable { register_cdc_operation_result_tracker(ids, tracker); return mutate_begin(std::move(ids), cl, tr_state, timeout_opt); diff --git 
a/service/storage_proxy.hh b/service/storage_proxy.hh index ab5291eb96..25d6d5f15c 100644 --- a/service/storage_proxy.hh +++ b/service/storage_proxy.hh @@ -389,7 +389,7 @@ private: void unthrottle(); void handle_read_error(std::variant failure, bool range); template - future> mutate_internal(Range mutations, db::consistency_level cl, bool counter_write, tracing::trace_state_ptr tr_state, service_permit permit, std::optional timeout_opt = { }, lw_shared_ptr cdc_tracker = { }); + future> mutate_internal(Range mutations, db::consistency_level cl, bool counter_write, tracing::trace_state_ptr tr_state, service_permit permit, std::optional timeout_opt = { }, lw_shared_ptr cdc_tracker = { }, db::allow_per_partition_rate_limit allow_limit = db::allow_per_partition_rate_limit::no); future>, cache_temperature>> query_nonsingular_mutations_locally( schema_ptr s, lw_shared_ptr cmd, const dht::partition_range_vector&& pr, tracing::trace_state_ptr trace_state, clock_type::time_point timeout); From e6beab3106078d2f0fdb0f41e76100433399b8f8 Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Thu, 28 Apr 2022 13:58:49 +0200 Subject: [PATCH 18/29] storage_proxy: add allow rate limit flag to mutate/mutate_result Now, mutate/mutate_result accept a flag which decides whether the write should be rate limited or not. The new parameter is mandatory and all call sites were updated. 
--- alternator/executor.cc | 7 ++++--- alternator/ttl.cc | 3 ++- cql3/statements/batch_statement.cc | 2 +- cql3/statements/modification_statement.cc | 2 +- db/batchlog_manager.cc | 2 +- db/system_distributed_keyspace.cc | 3 +++ db/view/view.cc | 2 +- redis/mutation_utils.cc | 8 ++++---- service/storage_proxy.cc | 20 ++++++++++---------- service/storage_proxy.hh | 10 ++++++---- thrift/handler.cc | 10 +++++----- 11 files changed, 38 insertions(+), 31 deletions(-) diff --git a/alternator/executor.cc b/alternator/executor.cc index bd5521e8ef..e87a63ca12 100644 --- a/alternator/executor.cc +++ b/alternator/executor.cc @@ -1543,7 +1543,7 @@ future rmw_operation::execute(service::storage_pr if (!m) { return make_ready_future(api_error::conditional_check_failed("Failed condition.")); } - return proxy.mutate(std::vector{std::move(*m)}, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), trace_state, std::move(permit)).then([this] () mutable { + return proxy.mutate(std::vector{std::move(*m)}, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), trace_state, std::move(permit), db::allow_per_partition_rate_limit::yes).then([this] () mutable { return rmw_operation_return(std::move(_return_attributes)); }); }); @@ -1551,7 +1551,7 @@ future rmw_operation::execute(service::storage_pr } else if (_write_isolation != write_isolation::LWT_ALWAYS) { std::optional m = apply(nullptr, api::new_timestamp()); assert(m); // !needs_read_before_write, so apply() did not check a condition - return proxy.mutate(std::vector{std::move(*m)}, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), trace_state, std::move(permit)).then([this] () mutable { + return proxy.mutate(std::vector{std::move(*m)}, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), trace_state, std::move(permit), db::allow_per_partition_rate_limit::yes).then([this] () mutable { return rmw_operation_return(std::move(_return_attributes)); }); } @@ -1896,7 +1896,8 @@ static 
future<> do_batch_write(service::storage_proxy& proxy, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), trace_state, - std::move(permit)); + std::move(permit), + db::allow_per_partition_rate_limit::yes); } else { // Do the write via LWT: // Multiple mutations may be destined for the same partition, adding diff --git a/alternator/ttl.cc b/alternator/ttl.cc index 3396073a9c..eb8eee73fb 100644 --- a/alternator/ttl.cc +++ b/alternator/ttl.cc @@ -284,7 +284,8 @@ static future<> expire_item(service::storage_proxy& proxy, return proxy.mutate(std::vector{std::move(m)}, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), // FIXME - which timeout? - qs.get_trace_state(), qs.get_permit()); + qs.get_trace_state(), qs.get_permit(), + db::allow_per_partition_rate_limit::no); } static size_t random_offset(size_t min, size_t max) { diff --git a/cql3/statements/batch_statement.cc b/cql3/statements/batch_statement.cc index 9994706ecb..20a89b8990 100644 --- a/cql3/statements/batch_statement.cc +++ b/cql3/statements/batch_statement.cc @@ -317,7 +317,7 @@ future> batch_statement::execute_without_conditions( mutate_atomic = false; } } - return qp.proxy().mutate_with_triggers(std::move(mutations), cl, timeout, mutate_atomic, std::move(tr_state), std::move(permit)); + return qp.proxy().mutate_with_triggers(std::move(mutations), cl, timeout, mutate_atomic, std::move(tr_state), std::move(permit), db::allow_per_partition_rate_limit::yes); } future> batch_statement::execute_with_conditions( diff --git a/cql3/statements/modification_statement.cc b/cql3/statements/modification_statement.cc index 9c627b8394..872168cb25 100644 --- a/cql3/statements/modification_statement.cc +++ b/cql3/statements/modification_statement.cc @@ -284,7 +284,7 @@ modification_statement::execute_without_condition(query_processor& qp, service:: return make_ready_future>(bo::success()); } - return qp.proxy().mutate_with_triggers(std::move(mutations), cl, timeout, false, qs.get_trace_state(), 
qs.get_permit(), this->is_raw_counter_shard_write()); + return qp.proxy().mutate_with_triggers(std::move(mutations), cl, timeout, false, qs.get_trace_state(), qs.get_permit(), db::allow_per_partition_rate_limit::yes, this->is_raw_counter_shard_write()); }); } diff --git a/db/batchlog_manager.cc b/db/batchlog_manager.cc index 81a9c790e4..4b755f78a5 100644 --- a/db/batchlog_manager.cc +++ b/db/batchlog_manager.cc @@ -243,7 +243,7 @@ future<> db::batchlog_manager::replay_all_failed_batches() { // send to partially or wholly fail in actually sending stuff. Since we don't // have hints (yet), send with CL=ALL, and hope we can re-do this soon. // See below, we use retry on write failure. - return _qp.proxy().mutate(mutations, db::consistency_level::ALL, db::no_timeout, nullptr, empty_service_permit()); + return _qp.proxy().mutate(mutations, db::consistency_level::ALL, db::no_timeout, nullptr, empty_service_permit(), db::allow_per_partition_rate_limit::no); }); }).then_wrapped([this, id](future<> batch_result) { try { diff --git a/db/system_distributed_keyspace.cc b/db/system_distributed_keyspace.cc index eda4d0cb23..c1ebf38dd3 100644 --- a/db/system_distributed_keyspace.cc +++ b/db/system_distributed_keyspace.cc @@ -565,6 +565,7 @@ system_distributed_keyspace::insert_cdc_generation( db::timeout_clock::now() + 60s, nullptr, // trace_state empty_service_permit(), + db::allow_per_partition_rate_limit::no, false // raw_counters ); }); @@ -661,6 +662,7 @@ system_distributed_keyspace::create_cdc_desc( db::timeout_clock::now() + 30s, nullptr, // trace_state empty_service_permit(), + db::allow_per_partition_rate_limit::no, false // raw_counters ); }); @@ -704,6 +706,7 @@ system_distributed_keyspace::cdc_desc_exists( db::timeout_clock::now() + 10s, nullptr, // trace_state empty_service_permit(), + db::allow_per_partition_rate_limit::no, false // raw_counters ); diff --git a/db/view/view.cc b/db/view/view.cc index 243d8df8a5..f67b7ee8e1 100644 --- a/db/view/view.cc +++ 
b/db/view/view.cc @@ -2248,7 +2248,7 @@ void delete_ghost_rows_visitor::accept_new_row(const clustering_key& ck, const q auto& row = m.partition().clustered_row(*_view, ck); row.apply(tombstone(api::new_timestamp(), gc_clock::now())); timeout = db::timeout_clock::now() + _timeout_duration; - _proxy.mutate({m}, db::consistency_level::ALL, timeout, _state.get_trace_state(), empty_service_permit()).get(); + _proxy.mutate({m}, db::consistency_level::ALL, timeout, _state.get_trace_state(), empty_service_permit(), db::allow_per_partition_rate_limit::no).get(); } } diff --git a/redis/mutation_utils.cc b/redis/mutation_utils.cc index cd3bc59492..9bac16d6ca 100644 --- a/redis/mutation_utils.cc +++ b/redis/mutation_utils.cc @@ -50,7 +50,7 @@ future<> write_hashes(service::storage_proxy& proxy, redis::redis_options& optio m.set_clustered_cell(ckey, column, std::move(cell)); auto write_consistency_level = options.get_write_consistency_level(); - return proxy.mutate(std::vector {std::move(m)}, write_consistency_level, timeout, nullptr, permit); + return proxy.mutate(std::vector {std::move(m)}, write_consistency_level, timeout, nullptr, permit, db::allow_per_partition_rate_limit::yes); } @@ -68,7 +68,7 @@ future<> write_strings(service::storage_proxy& proxy, redis::redis_options& opti db::timeout_clock::time_point timeout = db::timeout_clock::now() + options.get_write_timeout(); auto m = make_mutation(proxy, options, std::move(key), std::move(data), ttl); auto write_consistency_level = options.get_write_consistency_level(); - return proxy.mutate(std::vector {std::move(m)}, write_consistency_level, timeout, nullptr, permit); + return proxy.mutate(std::vector {std::move(m)}, write_consistency_level, timeout, nullptr, permit, db::allow_per_partition_rate_limit::yes); } @@ -87,7 +87,7 @@ future<> delete_objects(service::storage_proxy& proxy, redis::redis_options& opt auto remove = [&proxy, timeout, write_consistency_level, permit, &options, keys = std::move(keys)] (const sstring& 
cf_name) { return parallel_for_each(keys.begin(), keys.end(), [&proxy, timeout, write_consistency_level, &options, permit, cf_name] (const bytes& key) { auto m = make_tombstone(proxy, options, cf_name, key); - return proxy.mutate(std::vector {std::move(m)}, write_consistency_level, timeout, nullptr, permit); + return proxy.mutate(std::vector {std::move(m)}, write_consistency_level, timeout, nullptr, permit, db::allow_per_partition_rate_limit::yes); }); }; return parallel_for_each(tables.begin(), tables.end(), remove); @@ -107,7 +107,7 @@ future<> delete_fields(service::storage_proxy& proxy, redis::redis_options& opti m.partition().apply_delete(*schema, ckey, tombstone { ts, clk }); mutations.push_back(m); } - return proxy.mutate(mutations, write_consistency_level, timeout, nullptr, permit); + return proxy.mutate(mutations, write_consistency_level, timeout, nullptr, permit, db::allow_per_partition_rate_limit::yes); } } diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index 7571838e31..67e7e75768 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -2414,29 +2414,29 @@ storage_proxy::get_paxos_participants(const sstring& ks_name, const dht::token & * @param consistency_level the consistency level for the operation * @param tr_state trace state handle */ -future<> storage_proxy::mutate(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, bool raw_counters) { - return mutate_result(std::move(mutations), cl, timeout, std::move(tr_state), std::move(permit), raw_counters) +future<> storage_proxy::mutate(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit, bool raw_counters) { + return mutate_result(std::move(mutations), cl, timeout, std::move(tr_state), std::move(permit), allow_limit, raw_counters) 
.then(utils::result_into_future>); } -future> storage_proxy::mutate_result(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, bool raw_counters) { +future> storage_proxy::mutate_result(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit, bool raw_counters) { if (_cdc && _cdc->needs_cdc_augmentation(mutations)) { - return _cdc->augment_mutation_call(timeout, std::move(mutations), tr_state, cl).then([this, cl, timeout, tr_state, permit = std::move(permit), raw_counters, cdc = _cdc->shared_from_this()](std::tuple, lw_shared_ptr>&& t) mutable { + return _cdc->augment_mutation_call(timeout, std::move(mutations), tr_state, cl).then([this, cl, timeout, tr_state, permit = std::move(permit), raw_counters, cdc = _cdc->shared_from_this(), allow_limit](std::tuple, lw_shared_ptr>&& t) mutable { auto mutations = std::move(std::get<0>(t)); auto tracker = std::move(std::get<1>(t)); - return _mutate_stage(this, std::move(mutations), cl, timeout, std::move(tr_state), std::move(permit), raw_counters, std::move(tracker)); + return _mutate_stage(this, std::move(mutations), cl, timeout, std::move(tr_state), std::move(permit), raw_counters, allow_limit, std::move(tracker)); }); } - return _mutate_stage(this, std::move(mutations), cl, timeout, std::move(tr_state), std::move(permit), raw_counters, nullptr); + return _mutate_stage(this, std::move(mutations), cl, timeout, std::move(tr_state), std::move(permit), raw_counters, allow_limit, nullptr); } -future> storage_proxy::do_mutate(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, bool raw_counters, lw_shared_ptr cdc_tracker) { +future> storage_proxy::do_mutate(std::vector mutations, db::consistency_level cl, clock_type::time_point 
timeout, tracing::trace_state_ptr tr_state, service_permit permit, bool raw_counters, db::allow_per_partition_rate_limit allow_limit, lw_shared_ptr cdc_tracker) { auto mid = raw_counters ? mutations.begin() : boost::range::partition(mutations, [] (auto&& m) { return m.schema()->is_counter(); }); return seastar::when_all_succeed( mutate_counters(boost::make_iterator_range(mutations.begin(), mid), cl, tr_state, permit, timeout), - mutate_internal(boost::make_iterator_range(mid, mutations.end()), cl, false, tr_state, permit, timeout, std::move(cdc_tracker)) + mutate_internal(boost::make_iterator_range(mid, mutations.end()), cl, false, tr_state, permit, timeout, std::move(cdc_tracker), allow_limit) ).then([] (std::tuple> res) { // For now, only mutate_internal returns a result<> return std::get<0>(std::move(res)); @@ -2488,13 +2488,13 @@ storage_proxy::mutate_internal(Range mutations, db::consistency_level cl, bool c future> storage_proxy::mutate_with_triggers(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, - bool should_mutate_atomically, tracing::trace_state_ptr tr_state, service_permit permit, bool raw_counters) { + bool should_mutate_atomically, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit, bool raw_counters) { warn(unimplemented::cause::TRIGGERS); if (should_mutate_atomically) { assert(!raw_counters); return mutate_atomically_result(std::move(mutations), cl, timeout, std::move(tr_state), std::move(permit)); } - return mutate_result(std::move(mutations), cl, timeout, std::move(tr_state), std::move(permit), raw_counters); + return mutate_result(std::move(mutations), cl, timeout, std::move(tr_state), std::move(permit), allow_limit, raw_counters); } /** diff --git a/service/storage_proxy.hh b/service/storage_proxy.hh index 25d6d5f15c..48f619ddeb 100644 --- a/service/storage_proxy.hh +++ b/service/storage_proxy.hh @@ -271,6 +271,7 @@ private: tracing::trace_state_ptr, 
service_permit, bool, + db::allow_per_partition_rate_limit, lw_shared_ptr> _mutate_stage; netw::connection_drop_slot_t _connection_dropped; netw::connection_drop_registration_t _condrop_registration; @@ -404,7 +405,7 @@ private: gms::inet_address find_leader_for_counter_update(const mutation& m, db::consistency_level cl); - future> do_mutate(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, bool, lw_shared_ptr cdc_tracker); + future> do_mutate(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, bool, db::allow_per_partition_rate_limit allow_limit, lw_shared_ptr cdc_tracker); future<> send_to_endpoint( std::unique_ptr m, @@ -537,14 +538,14 @@ public: * @param consistency_level the consistency level for the operation * @param tr_state trace state handle */ - future<> mutate(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, bool raw_counters = false); + future<> mutate(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit, bool raw_counters = false); /** * See mutate. Does the same, but returns some exceptions * through the result<>, which allows for efficient inspection * of the exception on the exception handling path. 
*/ - future> mutate_result(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, bool raw_counters = false); + future> mutate_result(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit, bool raw_counters = false); paxos_participants get_paxos_participants(const sstring& ks_name, const dht::token& token, db::consistency_level consistency_for_paxos); @@ -553,7 +554,8 @@ public: clock_type::time_point timeout, service_permit permit); future> mutate_with_triggers(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, - bool should_mutate_atomically, tracing::trace_state_ptr tr_state, service_permit permit, bool raw_counters = false); + bool should_mutate_atomically, tracing::trace_state_ptr tr_state, service_permit permit, + db::allow_per_partition_rate_limit allow_limit, bool raw_counters = false); /** * See mutate. Adds additional steps before and after writing a batch. 
diff --git a/thrift/handler.cc b/thrift/handler.cc index c06aa6ca1b..4ee1097a20 100644 --- a/thrift/handler.cc +++ b/thrift/handler.cc @@ -511,7 +511,7 @@ public: add_to_mutation(*schema, column, m_to_apply); return _query_state.get_client_state().has_schema_access(_db, *schema, auth::permission::MODIFY).then([this, m_to_apply = std::move(m_to_apply), consistency_level, permit = std::move(permit)] () mutable { auto timeout = db::timeout_clock::now() + _timeout_config.write_timeout; - return _proxy.local().mutate({std::move(m_to_apply)}, cl_from_thrift(consistency_level), timeout, nullptr, std::move(permit)); + return _proxy.local().mutate({std::move(m_to_apply)}, cl_from_thrift(consistency_level), timeout, nullptr, std::move(permit), db::allow_per_partition_rate_limit::yes); }); }); } @@ -527,7 +527,7 @@ public: add_to_mutation(*schema, column, m_to_apply); return _query_state.get_client_state().has_schema_access(_db, *schema, auth::permission::MODIFY).then([this, m_to_apply = std::move(m_to_apply), consistency_level, permit = std::move(permit)] () mutable { auto timeout = db::timeout_clock::now() + _timeout_config.write_timeout; - return _proxy.local().mutate({std::move(m_to_apply)}, cl_from_thrift(consistency_level), timeout, nullptr, std::move(permit)); + return _proxy.local().mutate({std::move(m_to_apply)}, cl_from_thrift(consistency_level), timeout, nullptr, std::move(permit), db::allow_per_partition_rate_limit::yes); }); }); } @@ -564,7 +564,7 @@ public: return _query_state.get_client_state().has_schema_access(_db, *schema, auth::permission::MODIFY).then([this, m_to_apply = std::move(m_to_apply), consistency_level, permit = std::move(permit)] () mutable { auto timeout = db::timeout_clock::now() + _timeout_config.write_timeout; - return _proxy.local().mutate({std::move(m_to_apply)}, cl_from_thrift(consistency_level), timeout, nullptr, std::move(permit)); + return _proxy.local().mutate({std::move(m_to_apply)}, cl_from_thrift(consistency_level), timeout, 
nullptr, std::move(permit), db::allow_per_partition_rate_limit::yes); }); }); } @@ -591,7 +591,7 @@ public: return _query_state.get_client_state().has_schema_access(_db, *schema, auth::permission::MODIFY).then([this, m_to_apply = std::move(m_to_apply), consistency_level, permit = std::move(permit)] () mutable { // This mutation contains only counter tombstones so it can be applied like non-counter mutations. auto timeout = db::timeout_clock::now() + _timeout_config.counter_write_timeout; - return _proxy.local().mutate({std::move(m_to_apply)}, cl_from_thrift(consistency_level), timeout, nullptr, std::move(permit)); + return _proxy.local().mutate({std::move(m_to_apply)}, cl_from_thrift(consistency_level), timeout, nullptr, std::move(permit), db::allow_per_partition_rate_limit::yes); }); }); } @@ -604,7 +604,7 @@ public: return _query_state.get_client_state().has_schema_access(_db, *schema, auth::permission::MODIFY); }).then([this, muts = std::move(p.first), consistency_level, permit = std::move(permit)] () mutable { auto timeout = db::timeout_clock::now() + _timeout_config.write_timeout; - return _proxy.local().mutate(std::move(muts), cl_from_thrift(consistency_level), timeout, nullptr, std::move(permit)); + return _proxy.local().mutate(std::move(muts), cl_from_thrift(consistency_level), timeout, nullptr, std::move(permit), db::allow_per_partition_rate_limit::yes); }); }); } From e8e8ada4b45661f48fafbeb07fb8e3269041617e Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Wed, 4 May 2022 01:53:14 +0200 Subject: [PATCH 19/29] storage_proxy: add per partition rate limit info to query_result_local(_digest) The query_result_local and query_result_local_digest methods were updated to accept db::per_partition_rate_limit::info structure and pass it on to database::accept. 
--- service/storage_proxy.cc | 18 +++++++++--------- service/storage_proxy.hh | 6 ++++-- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index 67e7e75768..6e642201c1 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -3689,7 +3689,7 @@ protected: : query::result_options{query::result_request::only_result, query::digest_algorithm::none}; if (fbu::is_me(ep)) { tracing::trace(_trace_state, "read_data: querying locally"); - return _proxy->query_result_local(_schema, _cmd, _partition_range, opts, _trace_state, timeout); + return _proxy->query_result_local(_schema, _cmd, _partition_range, opts, _trace_state, timeout, std::monostate()); // TODO: Pass the correct rate limit type } else { tracing::trace(_trace_state, "read_data: sending a message to /{}", ep); return ser::storage_proxy_rpc_verbs::send_read_data(&_proxy->_messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *_cmd, _partition_range, opts.digest_algo).then([this, ep](rpc::tuple, rpc::optional> result_hit_rate) { @@ -3707,7 +3707,7 @@ protected: if (fbu::is_me(ep)) { tracing::trace(_trace_state, "read_digest: querying locally"); return _proxy->query_result_local_digest(_schema, _cmd, _partition_range, _trace_state, - timeout, digest_algorithm(*_proxy)); + timeout, digest_algorithm(*_proxy), std::monostate()); // TODO: Pass the correct rate limit type } else { tracing::trace(_trace_state, "read_digest: sending a message to /{}", ep); return ser::storage_proxy_rpc_verbs::send_read_digest(&_proxy->_messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *_cmd, @@ -4169,8 +4169,8 @@ db::read_repair_decision storage_proxy::new_read_repair_decision(const schema& s } future> -storage_proxy::query_result_local_digest(schema_ptr s, lw_shared_ptr cmd, const dht::partition_range& pr, tracing::trace_state_ptr trace_state, storage_proxy::clock_type::time_point timeout, query::digest_algorithm da) { - return 
query_result_local(std::move(s), std::move(cmd), pr, query::result_options::only_digest(da), std::move(trace_state), timeout).then([] (rpc::tuple>, cache_temperature> result_and_hit_rate) { +storage_proxy::query_result_local_digest(schema_ptr s, lw_shared_ptr cmd, const dht::partition_range& pr, tracing::trace_state_ptr trace_state, storage_proxy::clock_type::time_point timeout, query::digest_algorithm da, db::per_partition_rate_limit::info rate_limit_info) { + return query_result_local(std::move(s), std::move(cmd), pr, query::result_options::only_digest(da), std::move(trace_state), timeout, rate_limit_info).then([] (rpc::tuple>, cache_temperature> result_and_hit_rate) { auto&& [result, hit_rate] = result_and_hit_rate; return make_ready_future>(rpc::tuple(*result->digest(), result->last_modified(), hit_rate)); }); @@ -4178,15 +4178,15 @@ storage_proxy::query_result_local_digest(schema_ptr s, lw_shared_ptr>, cache_temperature>> storage_proxy::query_result_local(schema_ptr s, lw_shared_ptr cmd, const dht::partition_range& pr, query::result_options opts, - tracing::trace_state_ptr trace_state, storage_proxy::clock_type::time_point timeout) { + tracing::trace_state_ptr trace_state, storage_proxy::clock_type::time_point timeout, db::per_partition_rate_limit::info rate_limit_info) { cmd->slice.options.set_if(opts.request != query::result_request::only_result); if (pr.is_singular()) { unsigned shard = dht::shard_of(*s, pr.start()->value().token()); get_stats().replica_cross_shard_ops += shard != this_shard_id(); - return _db.invoke_on(shard, _read_smp_service_group, [gs = global_schema_ptr(s), prv = dht::partition_range_vector({pr}) /* FIXME: pr is copied */, cmd, opts, timeout, gt = tracing::global_trace_state_ptr(std::move(trace_state))] (replica::database& db) mutable { + return _db.invoke_on(shard, _read_smp_service_group, [gs = global_schema_ptr(s), prv = dht::partition_range_vector({pr}) /* FIXME: pr is copied */, cmd, opts, timeout, gt = 
tracing::global_trace_state_ptr(std::move(trace_state)), rate_limit_info] (replica::database& db) mutable { auto trace_state = gt.get(); tracing::trace(trace_state, "Start querying singular range {}", prv.front()); - return db.query(gs, *cmd, opts, prv, trace_state, timeout).then([trace_state](std::tuple, cache_temperature>&& f_ht) { + return db.query(gs, *cmd, opts, prv, trace_state, timeout, rate_limit_info).then([trace_state](std::tuple, cache_temperature>&& f_ht) { auto&& [f, ht] = f_ht; tracing::trace(trace_state, "Querying is done"); return make_ready_future>, cache_temperature>>(rpc::tuple(make_foreign(std::move(f)), ht)); @@ -5291,7 +5291,7 @@ storage_proxy::handle_read_data(const rpc::client_info& cinfo, rpc::opt_time_poi opts.digest_algo = da; opts.request = da == query::digest_algorithm::none ? query::result_request::only_result : query::result_request::result_and_digest; auto timeout = t ? *t : db::no_timeout; - return p->query_result_local(std::move(s), cmd, std::move(pr2.first), opts, trace_state_ptr, timeout); + return p->query_result_local(std::move(s), cmd, std::move(pr2.first), opts, trace_state_ptr, timeout, std::monostate()); // TODO: Pass the correct rate limit info }).then_wrapped([this, &trace_state_ptr, src_ip] (future>, cache_temperature>> f) mutable { tracing::trace(trace_state_ptr, "read_data handling is done, sending a response to /{}", src_ip); return encode_replica_exception_for_rpc(std::move(f), [] { return std::make_tuple(foreign_ptr(make_lw_shared()), cache_temperature::invalid()); }); @@ -5356,7 +5356,7 @@ storage_proxy::handle_read_digest(const rpc::client_info& cinfo, rpc::opt_time_p throw std::runtime_error("READ_DIGEST called with wrapping range"); } auto timeout = t ? 
*t : db::no_timeout; - return p->query_result_local_digest(std::move(s), cmd, std::move(pr2.first), trace_state_ptr, timeout, da); + return p->query_result_local_digest(std::move(s), cmd, std::move(pr2.first), trace_state_ptr, timeout, da, std::monostate()); // TODO: Pass the correct rate limit info }).then_wrapped([this, &trace_state_ptr, src_ip] (future> f) mutable { tracing::trace(trace_state_ptr, "read_digest handling is done, sending a response to /{}", src_ip); return encode_replica_exception_for_rpc(std::move(f), [] { return std::make_tuple(query::result_digest(), api::missing_timestamp, cache_temperature::invalid()); }); diff --git a/service/storage_proxy.hh b/service/storage_proxy.hh index 48f619ddeb..7f06e85043 100644 --- a/service/storage_proxy.hh +++ b/service/storage_proxy.hh @@ -347,11 +347,13 @@ private: future>, cache_temperature>> query_result_local(schema_ptr, lw_shared_ptr cmd, const dht::partition_range& pr, query::result_options opts, tracing::trace_state_ptr trace_state, - clock_type::time_point timeout); + clock_type::time_point timeout, + db::per_partition_rate_limit::info rate_limit_info); future> query_result_local_digest(schema_ptr, lw_shared_ptr cmd, const dht::partition_range& pr, tracing::trace_state_ptr trace_state, clock_type::time_point timeout, - query::digest_algorithm da); + query::digest_algorithm da, + db::per_partition_rate_limit::info rate_limit_info); future> query_partition_key_range(lw_shared_ptr cmd, dht::partition_range_vector partition_ranges, db::consistency_level cl, From d3d9add219e5f90dce85ada5f0715088fe016a1b Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Thu, 14 Apr 2022 15:47:07 +0200 Subject: [PATCH 20/29] storage_proxy: add per partition rate limit info to read RPC Now, the read RPC accepts the per partition rate limit info parameter. It is passed on to query_result_local(_digest) methods. 
--- idl/storage_proxy.idl.hh | 4 ++-- service/storage_proxy.cc | 22 ++++++++++++---------- service/storage_proxy.hh | 4 ++-- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/idl/storage_proxy.idl.hh b/idl/storage_proxy.idl.hh index d233c46d08..740dd97156 100644 --- a/idl/storage_proxy.idl.hh +++ b/idl/storage_proxy.idl.hh @@ -11,9 +11,9 @@ verb [[with_client_info, one_way]] mutation_done (unsigned shard, uint64_t respo verb [[with_client_info, one_way]] mutation_failed (unsigned shard, uint64_t response_id, size_t num_failed, db::view::update_backlog backlog [[version 3.1.0]], replica::exception_variant exception [[version 5.1.0]]); verb [[with_client_info, with_timeout]] counter_mutation (std::vector fms, db::consistency_level cl, std::optional trace_info); verb [[with_client_info, with_timeout, one_way]] hint_mutation (frozen_mutation fm, inet_address_vector_replica_set forward, gms::inet_address reply_to, unsigned shard, uint64_t response_id, std::optional trace_info [[version 1.3.0]] /* this verb was mistakenly introduced with optional trace_info */); -verb [[with_client_info, with_timeout]] read_data (query::read_command cmd, ::compat::wrapping_partition_range pr, query::digest_algorithm digest [[version 3.0.0]]) -> query::result [[lw_shared_ptr]], cache_temperature [[version 2.0.0]], replica::exception_variant [[version 5.1.0]]; +verb [[with_client_info, with_timeout]] read_data (query::read_command cmd, ::compat::wrapping_partition_range pr, query::digest_algorithm digest [[version 3.0.0]], db::per_partition_rate_limit::info rate_limit_info [[version 5.1.0]]) -> query::result [[lw_shared_ptr]], cache_temperature [[version 2.0.0]], replica::exception_variant [[version 5.1.0]]; verb [[with_client_info, with_timeout]] read_mutation_data (query::read_command cmd, ::compat::wrapping_partition_range pr) -> reconcilable_result [[lw_shared_ptr]], cache_temperature [[version 2.0.0]], replica::exception_variant [[version 5.1.0]]; -verb 
[[with_client_info, with_timeout]] read_digest (query::read_command cmd, ::compat::wrapping_partition_range pr, query::digest_algorithm digest [[version 3.0.0]]) -> query::result_digest, api::timestamp_type [[version 1.2.0]], cache_temperature [[version 2.0.0]], replica::exception_variant [[version 5.1.0]]; +verb [[with_client_info, with_timeout]] read_digest (query::read_command cmd, ::compat::wrapping_partition_range pr, query::digest_algorithm digest [[version 3.0.0]], db::per_partition_rate_limit::info rate_limit_info [[version 5.1.0]]) -> query::result_digest, api::timestamp_type [[version 1.2.0]], cache_temperature [[version 2.0.0]], replica::exception_variant [[version 5.1.0]]; verb [[with_timeout]] truncate (sstring, sstring); verb [[with_client_info, with_timeout]] paxos_prepare (query::read_command cmd, partition_key key, utils::UUID ballot, bool only_digest, query::digest_algorithm da, std::optional trace_info) -> service::paxos::prepare_response [[unique_ptr]]; verb [[with_client_info, with_timeout]] paxos_accept (service::paxos::proposal proposal [[ref]], std::optional trace_info) -> bool; diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index 6e642201c1..d5b55b986b 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -3692,7 +3692,7 @@ protected: return _proxy->query_result_local(_schema, _cmd, _partition_range, opts, _trace_state, timeout, std::monostate()); // TODO: Pass the correct rate limit type } else { tracing::trace(_trace_state, "read_data: sending a message to /{}", ep); - return ser::storage_proxy_rpc_verbs::send_read_data(&_proxy->_messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *_cmd, _partition_range, opts.digest_algo).then([this, ep](rpc::tuple, rpc::optional> result_hit_rate) { + return ser::storage_proxy_rpc_verbs::send_read_data(&_proxy->_messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *_cmd, _partition_range, opts.digest_algo, /* TODO: Pass the correct rate limit info */ 
std::monostate()).then([this, ep](rpc::tuple, rpc::optional> result_hit_rate) { auto&& [result, hit_rate, opt_exception] = result_hit_rate; if (opt_exception.has_value() && *opt_exception) { return make_exception_future>, cache_temperature>>((*opt_exception).into_exception_ptr()); @@ -3711,7 +3711,7 @@ protected: } else { tracing::trace(_trace_state, "read_digest: sending a message to /{}", ep); return ser::storage_proxy_rpc_verbs::send_read_digest(&_proxy->_messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *_cmd, - _partition_range, digest_algorithm(*_proxy)).then([this, ep] ( + _partition_range, digest_algorithm(*_proxy), /* TODO: Pass the correct rate limit info */ std::monostate()).then([this, ep] ( rpc::tuple, rpc::optional, rpc::optional> digest_timestamp_hit_rate) { auto&& [d, t, hit_rate, opt_exception] = digest_timestamp_hit_rate; if (opt_exception.has_value() && *opt_exception) { @@ -5264,7 +5264,7 @@ future> storage_proxy::encod } future>, cache_temperature, replica::exception_variant>> -storage_proxy::handle_read_data(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional oda) { +storage_proxy::handle_read_data(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional oda, rpc::optional rate_limit_info_opt) { tracing::trace_state_ptr trace_state_ptr; auto src_addr = netw::messaging_service::get_source(cinfo); if (cmd.trace_info) { @@ -5274,14 +5274,15 @@ storage_proxy::handle_read_data(const rpc::client_info& cinfo, rpc::opt_time_poi } auto da = oda.value_or(query::digest_algorithm::MD5); auto sp = get_local_shared_storage_proxy(); + auto rate_limit_info = rate_limit_info_opt.value_or(std::monostate()); if (!cmd.max_result_size) { auto& cfg = sp->local_db().get_config(); cmd.max_result_size.emplace(cfg.max_memory_for_unlimited_query_soft_limit(), 
cfg.max_memory_for_unlimited_query_hard_limit()); } - return do_with(std::move(pr), std::move(sp), std::move(trace_state_ptr), [this, &cinfo, cmd = make_lw_shared(std::move(cmd)), src_addr = std::move(src_addr), da, t] (::compat::wrapping_partition_range& pr, shared_ptr& p, tracing::trace_state_ptr& trace_state_ptr) mutable { + return do_with(std::move(pr), std::move(sp), std::move(trace_state_ptr), [this, &cinfo, cmd = make_lw_shared(std::move(cmd)), src_addr = std::move(src_addr), da, t, rate_limit_info] (::compat::wrapping_partition_range& pr, shared_ptr& p, tracing::trace_state_ptr& trace_state_ptr) mutable { p->get_stats().replica_data_reads++; auto src_ip = src_addr.addr; - return _mm->get_schema_for_read(cmd->schema_version, std::move(src_addr), p->_messaging).then([cmd, da, &pr, &p, &trace_state_ptr, t] (schema_ptr s) { + return _mm->get_schema_for_read(cmd->schema_version, std::move(src_addr), p->_messaging).then([cmd, da, &pr, &p, &trace_state_ptr, t, rate_limit_info] (schema_ptr s) { auto pr2 = ::compat::unwrap(std::move(pr), *s); if (pr2.second) { // this function assumes singular queries but doesn't validate @@ -5291,7 +5292,7 @@ storage_proxy::handle_read_data(const rpc::client_info& cinfo, rpc::opt_time_poi opts.digest_algo = da; opts.request = da == query::digest_algorithm::none ? query::result_request::only_result : query::result_request::result_and_digest; auto timeout = t ? 
*t : db::no_timeout; - return p->query_result_local(std::move(s), cmd, std::move(pr2.first), opts, trace_state_ptr, timeout, std::monostate()); // TODO: Pass the correct rate limit info + return p->query_result_local(std::move(s), cmd, std::move(pr2.first), opts, trace_state_ptr, timeout, rate_limit_info); }).then_wrapped([this, &trace_state_ptr, src_ip] (future>, cache_temperature>> f) mutable { tracing::trace(trace_state_ptr, "read_data handling is done, sending a response to /{}", src_ip); return encode_replica_exception_for_rpc(std::move(f), [] { return std::make_tuple(foreign_ptr(make_lw_shared()), cache_temperature::invalid()); }); @@ -5334,7 +5335,7 @@ storage_proxy::handle_read_mutation_data(const rpc::client_info& cinfo, rpc::opt } future> -storage_proxy::handle_read_digest(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional oda) { +storage_proxy::handle_read_digest(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional oda, rpc::optional rate_limit_info_opt) { tracing::trace_state_ptr trace_state_ptr; auto src_addr = netw::messaging_service::get_source(cinfo); if (cmd.trace_info) { @@ -5343,20 +5344,21 @@ storage_proxy::handle_read_digest(const rpc::client_info& cinfo, rpc::opt_time_p tracing::trace(trace_state_ptr, "read_digest: message received from /{}", src_addr.addr); } auto da = oda.value_or(query::digest_algorithm::MD5); + auto rate_limit_info = rate_limit_info_opt.value_or(std::monostate()); if (!cmd.max_result_size) { cmd.max_result_size.emplace(cinfo.retrieve_auxiliary("max_result_size")); } - return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), [this, &cinfo, cmd = make_lw_shared(std::move(cmd)), src_addr = std::move(src_addr), da, t] (::compat::wrapping_partition_range& pr, shared_ptr& p, tracing::trace_state_ptr& trace_state_ptr) mutable { + return 
do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), [this, &cinfo, cmd = make_lw_shared(std::move(cmd)), src_addr = std::move(src_addr), da, t, rate_limit_info] (::compat::wrapping_partition_range& pr, shared_ptr& p, tracing::trace_state_ptr& trace_state_ptr) mutable { p->get_stats().replica_digest_reads++; auto src_ip = src_addr.addr; - return _mm->get_schema_for_read(cmd->schema_version, std::move(src_addr), p->_messaging).then([cmd, &pr, &p, &trace_state_ptr, t, da] (schema_ptr s) { + return _mm->get_schema_for_read(cmd->schema_version, std::move(src_addr), p->_messaging).then([cmd, &pr, &p, &trace_state_ptr, t, da, rate_limit_info] (schema_ptr s) { auto pr2 = ::compat::unwrap(std::move(pr), *s); if (pr2.second) { // this function assumes singular queries but doesn't validate throw std::runtime_error("READ_DIGEST called with wrapping range"); } auto timeout = t ? *t : db::no_timeout; - return p->query_result_local_digest(std::move(s), cmd, std::move(pr2.first), trace_state_ptr, timeout, da, std::monostate()); // TODO: Pass the correct rate limit info + return p->query_result_local_digest(std::move(s), cmd, std::move(pr2.first), trace_state_ptr, timeout, da, rate_limit_info); }).then_wrapped([this, &trace_state_ptr, src_ip] (future> f) mutable { tracing::trace(trace_state_ptr, "read_digest handling is done, sending a response to /{}", src_ip); return encode_replica_exception_for_rpc(std::move(f), [] { return std::make_tuple(query::result_digest(), api::missing_timestamp, cache_temperature::invalid()); }); diff --git a/service/storage_proxy.hh b/service/storage_proxy.hh index 7f06e85043..a280c9be3a 100644 --- a/service/storage_proxy.hh +++ b/service/storage_proxy.hh @@ -445,9 +445,9 @@ private: storage_proxy::response_id_type response_id, std::optional trace_info); future handle_mutation_done(const rpc::client_info& cinfo, unsigned shard, storage_proxy::response_id_type response_id, rpc::optional backlog); future 
handle_mutation_failed(const rpc::client_info& cinfo, unsigned shard, storage_proxy::response_id_type response_id, size_t num_failed, rpc::optional backlog, rpc::optional exception); - future>, cache_temperature, replica::exception_variant>> handle_read_data(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional oda); + future>, cache_temperature, replica::exception_variant>> handle_read_data(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional oda, rpc::optional rate_limit_info_opt); future>, cache_temperature, replica::exception_variant>> handle_read_mutation_data(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr); - future> handle_read_digest(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional oda); + future> handle_read_digest(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional oda, rpc::optional rate_limit_info_opt); future<> handle_truncate(rpc::opt_time_point timeout, sstring ksname, sstring cfname); future>> handle_paxos_prepare(const rpc::client_info& cinfo, rpc::opt_time_point timeout, query::read_command cmd, partition_key key, utils::UUID ballot, bool only_digest, query::digest_algorithm da, From 3357066387655cf9576c3700008542de3f6c0838 Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Thu, 19 May 2022 08:59:36 +0200 Subject: [PATCH 21/29] storage_proxy: resultize return type of get_read_executor Now, get_read_executor is able to return coordinator exceptions without throwing them. In an upcoming commit, it will start returning rate limit exceptions in some cases and it is preferable to return them without throwing. 
--- service/storage_proxy.cc | 13 ++++++++----- service/storage_proxy.hh | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index d5b55b986b..5739715374 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -4092,7 +4092,7 @@ db::read_repair_decision storage_proxy::new_read_repair_decision(const schema& s return db::read_repair_decision::NONE; } -::shared_ptr storage_proxy::get_read_executor(lw_shared_ptr cmd, +result<::shared_ptr> storage_proxy::get_read_executor(lw_shared_ptr cmd, schema_ptr schema, dht::partition_range pr, db::consistency_level cl, @@ -4254,11 +4254,14 @@ storage_proxy::query_singular(lw_shared_ptr cmd, const auto replicas = it == query_options.preferred_replicas.end() ? inet_address_vector_replica_set{} : replica_ids_to_endpoints(*tmptr, it->second); - auto read_executor = get_read_executor(cmd, schema, std::move(pr), cl, repair_decision, - query_options.trace_state, replicas, is_read_non_local, - query_options.permit); + auto r_read_executor = get_read_executor(cmd, schema, std::move(pr), cl, repair_decision, + query_options.trace_state, replicas, is_read_non_local, + query_options.permit); + if (!r_read_executor) { + co_return std::move(r_read_executor).as_failure(); + } - exec.emplace_back(read_executor, std::move(token_range)); + exec.emplace_back(r_read_executor.value(), std::move(token_range)); } if (is_read_non_local) { get_stats().reads_coordinator_outside_replica_set++; diff --git a/service/storage_proxy.hh b/service/storage_proxy.hh index a280c9be3a..df8c0738a3 100644 --- a/service/storage_proxy.hh +++ b/service/storage_proxy.hh @@ -335,7 +335,7 @@ private: static void sort_endpoints_by_proximity(inet_address_vector_replica_set& eps); inet_address_vector_replica_set get_live_sorted_endpoints(replica::keyspace& ks, const dht::token& token) const; db::read_repair_decision new_read_repair_decision(const schema& s); - ::shared_ptr 
get_read_executor(lw_shared_ptr cmd, + result<::shared_ptr> get_read_executor(lw_shared_ptr cmd, schema_ptr schema, dht::partition_range pr, db::consistency_level cl, From c691e94190c1c4f9060068b3b9070e03eddb56d7 Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Wed, 4 May 2022 02:06:42 +0200 Subject: [PATCH 22/29] storage_proxy: add allow rate limit flag to get_read_executor Adds a flag to get_read_executor which decides whether the read should be rate limited or not. The read executors were modified to choose the appropriate per partition rate limit info parameter and send it to the replicas. --- service/storage_proxy.cc | 43 ++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index 5739715374..78f39266dd 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -3636,6 +3636,7 @@ protected: lw_shared_ptr _cf; bool _foreground = true; service_permit _permit; // holds admission permit until operation completes + db::per_partition_rate_limit::info _rate_limit_info; private: void on_read_resolved() noexcept { @@ -3645,9 +3646,9 @@ private: } public: abstract_read_executor(schema_ptr s, lw_shared_ptr cf, shared_ptr proxy, lw_shared_ptr cmd, dht::partition_range pr, db::consistency_level cl, size_t block_for, - inet_address_vector_replica_set targets, tracing::trace_state_ptr trace_state, service_permit permit) : + inet_address_vector_replica_set targets, tracing::trace_state_ptr trace_state, service_permit permit, db::per_partition_rate_limit::info rate_limit_info) : _schema(std::move(s)), _proxy(std::move(proxy)), _cmd(std::move(cmd)), _partition_range(std::move(pr)), _cl(cl), _block_for(block_for), _targets(std::move(targets)), _trace_state(std::move(trace_state)), - _cf(std::move(cf)), _permit(std::move(permit)) { + _cf(std::move(cf)), _permit(std::move(permit)), _rate_limit_info(rate_limit_info) { _proxy->get_stats().reads++; 
_proxy->get_stats().foreground_reads++; } @@ -3689,10 +3690,10 @@ protected: : query::result_options{query::result_request::only_result, query::digest_algorithm::none}; if (fbu::is_me(ep)) { tracing::trace(_trace_state, "read_data: querying locally"); - return _proxy->query_result_local(_schema, _cmd, _partition_range, opts, _trace_state, timeout, std::monostate()); // TODO: Pass the correct rate limit type + return _proxy->query_result_local(_schema, _cmd, _partition_range, opts, _trace_state, timeout, adjust_rate_limit_for_local_operation(_rate_limit_info)); } else { tracing::trace(_trace_state, "read_data: sending a message to /{}", ep); - return ser::storage_proxy_rpc_verbs::send_read_data(&_proxy->_messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *_cmd, _partition_range, opts.digest_algo, /* TODO: Pass the correct rate limit info */ std::monostate()).then([this, ep](rpc::tuple, rpc::optional> result_hit_rate) { + return ser::storage_proxy_rpc_verbs::send_read_data(&_proxy->_messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *_cmd, _partition_range, opts.digest_algo, _rate_limit_info).then([this, ep](rpc::tuple, rpc::optional> result_hit_rate) { auto&& [result, hit_rate, opt_exception] = result_hit_rate; if (opt_exception.has_value() && *opt_exception) { return make_exception_future>, cache_temperature>>((*opt_exception).into_exception_ptr()); @@ -3707,11 +3708,11 @@ protected: if (fbu::is_me(ep)) { tracing::trace(_trace_state, "read_digest: querying locally"); return _proxy->query_result_local_digest(_schema, _cmd, _partition_range, _trace_state, - timeout, digest_algorithm(*_proxy), std::monostate()); // TODO: Pass the correct rate limit type + timeout, digest_algorithm(*_proxy), adjust_rate_limit_for_local_operation(_rate_limit_info)); } else { tracing::trace(_trace_state, "read_digest: sending a message to /{}", ep); return ser::storage_proxy_rpc_verbs::send_read_digest(&_proxy->_messaging, netw::messaging_service::msg_addr{ep, 0}, 
timeout, *_cmd, - _partition_range, digest_algorithm(*_proxy), /* TODO: Pass the correct rate limit info */ std::monostate()).then([this, ep] ( + _partition_range, digest_algorithm(*_proxy), _rate_limit_info).then([this, ep] ( rpc::tuple, rpc::optional, rpc::optional> digest_timestamp_hit_rate) { auto&& [d, t, hit_rate, opt_exception] = digest_timestamp_hit_rate; if (opt_exception.has_value() && *opt_exception) { @@ -4003,9 +4004,9 @@ private: class never_speculating_read_executor : public abstract_read_executor { public: - never_speculating_read_executor(schema_ptr s, lw_shared_ptr cf, shared_ptr proxy, lw_shared_ptr cmd, dht::partition_range pr, db::consistency_level cl, inet_address_vector_replica_set targets, tracing::trace_state_ptr trace_state, - service_permit permit) : - abstract_read_executor(std::move(s), std::move(cf), std::move(proxy), std::move(cmd), std::move(pr), cl, 0, std::move(targets), std::move(trace_state), std::move(permit)) { + never_speculating_read_executor(schema_ptr s, lw_shared_ptr cf, shared_ptr proxy, lw_shared_ptr cmd, dht::partition_range pr, db::consistency_level cl, inet_address_vector_replica_set targets, tracing::trace_state_ptr trace_state, service_permit permit, + db::per_partition_rate_limit::info rate_limit_info) : + abstract_read_executor(std::move(s), std::move(cf), std::move(proxy), std::move(cmd), std::move(pr), cl, 0, std::move(targets), std::move(trace_state), std::move(permit), rate_limit_info) { _block_for = _targets.size(); } }; @@ -4137,24 +4138,36 @@ result<::shared_ptr> storage_proxy::get_read_executor(lw size_t block_for = db::block_for(ks, cl); auto p = shared_from_this(); + + db::per_partition_rate_limit::info rate_limit_info; + if (false /* TODO: Pass the correct allow_limit */ && _db.local().can_apply_per_partition_rate_limit(*schema, db::operation_type::read)) { + auto r_rate_limit_info = choose_rate_limit_info(_db.local(), !is_read_non_local, db::operation_type::read, schema, token, trace_state); + if 
(!r_rate_limit_info) { + return std::move(r_rate_limit_info).as_failure(); + } + rate_limit_info = r_rate_limit_info.value(); + } else { + slogger.trace("Operation is not rate limited"); + } + // Speculative retry is disabled *OR* there are simply no extra replicas to speculate. if (retry_type == speculative_retry::type::NONE || block_for == all_replicas.size() || (repair_decision == db::read_repair_decision::DC_LOCAL && is_datacenter_local(cl) && block_for == target_replicas.size())) { - return ::make_shared(schema, cf, p, cmd, std::move(pr), cl, std::move(target_replicas), std::move(trace_state), std::move(permit)); + return ::make_shared(schema, cf, p, cmd, std::move(pr), cl, std::move(target_replicas), std::move(trace_state), std::move(permit), rate_limit_info); } if (target_replicas.size() == all_replicas.size()) { // CL.ALL, RRD.GLOBAL or RRD.DC_LOCAL and a single-DC. // We are going to contact every node anyway, so ask for 2 full data requests instead of 1, for redundancy // (same amount of requests in total, but we turn 1 digest request into a full blown data request). - return ::make_shared(schema, cf, p, cmd, std::move(pr), cl, block_for, std::move(target_replicas), std::move(trace_state), std::move(permit)); + return ::make_shared(schema, cf, p, cmd, std::move(pr), cl, block_for, std::move(target_replicas), std::move(trace_state), std::move(permit), rate_limit_info); } // RRD.NONE or RRD.DC_LOCAL w/ multiple DCs. 
if (target_replicas.size() == block_for) { // If RRD.DC_LOCAL extra replica may already be present if (is_datacenter_local(cl) && !db::is_local(extra_replica)) { slogger.trace("read executor no extra target to speculate"); - return ::make_shared(schema, cf, p, cmd, std::move(pr), cl, std::move(target_replicas), std::move(trace_state), std::move(permit)); + return ::make_shared(schema, cf, p, cmd, std::move(pr), cl, std::move(target_replicas), std::move(trace_state), std::move(permit), rate_limit_info); } else { target_replicas.push_back(extra_replica); slogger.trace("creating read executor with extra target {}", extra_replica); @@ -4162,9 +4175,9 @@ result<::shared_ptr> storage_proxy::get_read_executor(lw } if (retry_type == speculative_retry::type::ALWAYS) { - return ::make_shared(schema, cf, p, cmd, std::move(pr), cl, block_for, std::move(target_replicas), std::move(trace_state), std::move(permit)); + return ::make_shared(schema, cf, p, cmd, std::move(pr), cl, block_for, std::move(target_replicas), std::move(trace_state), std::move(permit), rate_limit_info); } else {// PERCENTILE or CUSTOM. 
- return ::make_shared(schema, cf, p, cmd, std::move(pr), cl, block_for, std::move(target_replicas), std::move(trace_state), std::move(permit)); + return ::make_shared(schema, cf, p, cmd, std::move(pr), cl, block_for, std::move(target_replicas), std::move(trace_state), std::move(permit), rate_limit_info); } } @@ -4469,7 +4482,7 @@ storage_proxy::query_partition_key_range_concurrent(storage_proxy::clock_type::t throw; } - exec.push_back(::make_shared(schema, cf.shared_from_this(), p, cmd, std::move(range), cl, std::move(filtered_endpoints), trace_state, permit)); + exec.push_back(::make_shared(schema, cf.shared_from_this(), p, cmd, std::move(range), cl, std::move(filtered_endpoints), trace_state, permit, std::monostate())); ranges_per_exec.emplace(exec.back().get(), std::move(merged_ranges)); } From a7ad70600d69b2b724c1c253f038f14e61cc14ba Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Thu, 2 Jun 2022 14:17:38 +0200 Subject: [PATCH 23/29] query-request: add allow_limit flag Adds allow_limit flag to the read_command. The flag decides whether rate limiting of this operation is allowed. 
--- alternator/executor.cc | 2 ++ cql3/statements/select_statement.cc | 2 ++ query-request.hh | 7 ++++++- redis/query_utils.cc | 1 + service/storage_proxy.cc | 2 +- thrift/handler.cc | 9 +++++++-- 6 files changed, 19 insertions(+), 4 deletions(-) diff --git a/alternator/executor.cc b/alternator/executor.cc index e87a63ca12..ec42030291 100644 --- a/alternator/executor.cc +++ b/alternator/executor.cc @@ -1510,6 +1510,7 @@ static future> get_previous_item( stats.reads_before_write++; auto selection = cql3::selection::selection::wildcard(schema); auto command = previous_item_read_command(proxy, schema, ck, selection); + command->allow_limit = db::allow_per_partition_rate_limit::yes; auto cl = db::consistency_level::LOCAL_QUORUM; return proxy.query(schema, command, to_partition_ranges(*schema, pk), cl, service::storage_proxy::coordinator_query_options(executor::default_timeout(), std::move(permit), client_state)).then( @@ -3253,6 +3254,7 @@ future executor::batch_get_item(client_state& cli auto selection = cql3::selection::selection::wildcard(rs.schema); auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options()); auto command = ::make_lw_shared(rs.schema->id(), rs.schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice)); + command->allow_limit = db::allow_per_partition_rate_limit::yes; future> f = _proxy.query(rs.schema, std::move(command), std::move(partition_ranges), rs.cl, service::storage_proxy::coordinator_query_options(executor::default_timeout(), permit, client_state, trace_state)).then( [schema = rs.schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = rs.attrs_to_get] (service::storage_proxy::coordinator_query_result qr) mutable { diff --git a/cql3/statements/select_statement.cc b/cql3/statements/select_statement.cc index 46831b04b0..fb6ec88fcd 100644 --- a/cql3/statements/select_statement.cc +++ 
b/cql3/statements/select_statement.cc @@ -360,6 +360,7 @@ select_statement::do_execute(query_processor& qp, utils::UUID(), query::is_first_page::no, options.get_timestamp(state)); + command->allow_limit = db::allow_per_partition_rate_limit::yes; int32_t page_size = options.get_page_size(); @@ -530,6 +531,7 @@ indexed_table_select_statement::prepare_command_for_base_query(query_processor& utils::UUID(), query::is_first_page::no, options.get_timestamp(state)); + cmd->allow_limit = db::allow_per_partition_rate_limit::yes; return cmd; } diff --git a/query-request.hh b/query-request.hh index bc62331b37..767a1c7c42 100644 --- a/query-request.hh +++ b/query-request.hh @@ -18,6 +18,7 @@ #include "tracing/tracing.hh" #include "utils/small_vector.hh" #include "query_class_config.hh" +#include "db/per_partition_rate_limit_info.hh" #include "bytes.hh" @@ -298,6 +299,7 @@ public: std::optional max_result_size; uint32_t row_limit_high_bits; api::timestamp_type read_timestamp; // not serialized + db::allow_per_partition_rate_limit allow_limit; // not serialized public: // IDL constructor read_command(utils::UUID cf_id, @@ -323,6 +325,7 @@ public: , max_result_size(max_result_size) , row_limit_high_bits(row_limit_high_bits) , read_timestamp(api::new_timestamp()) + , allow_limit(db::allow_per_partition_rate_limit::no) { } read_command(utils::UUID cf_id, @@ -335,7 +338,8 @@ public: std::optional ti = std::nullopt, utils::UUID query_uuid = utils::UUID(), query::is_first_page is_first_page = query::is_first_page::no, - api::timestamp_type rt = api::new_timestamp()) + api::timestamp_type rt = api::new_timestamp(), + db::allow_per_partition_rate_limit allow_limit = db::allow_per_partition_rate_limit::no) : cf_id(std::move(cf_id)) , schema_version(std::move(schema_version)) , slice(std::move(slice)) @@ -348,6 +352,7 @@ public: , max_result_size(max_result_size) , row_limit_high_bits(static_cast(static_cast(row_limit) >> 32)) , read_timestamp(rt) + , allow_limit(allow_limit) { } diff 
--git a/redis/query_utils.cc b/redis/query_utils.cc index 222f3764a8..c6c2ee18a3 100644 --- a/redis/query_utils.cc +++ b/redis/query_utils.cc @@ -8,6 +8,7 @@ #include "redis/query_utils.hh" +#include "db/per_partition_rate_limit_info.hh" #include "redis/options.hh" #include "timeout_config.hh" #include "service/client_state.hh" diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index 78f39266dd..eb1418f765 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -4140,7 +4140,7 @@ result<::shared_ptr> storage_proxy::get_read_executor(lw auto p = shared_from_this(); db::per_partition_rate_limit::info rate_limit_info; - if (false /* TODO: Pass the correct allow_limit */ && _db.local().can_apply_per_partition_rate_limit(*schema, db::operation_type::read)) { + if (cmd->allow_limit && _db.local().can_apply_per_partition_rate_limit(*schema, db::operation_type::read)) { auto r_rate_limit_info = choose_rate_limit_info(_db.local(), !is_read_non_local, db::operation_type::read, schema, token, trace_state); if (!r_rate_limit_info) { return std::move(r_rate_limit_info).as_failure(); diff --git a/thrift/handler.cc b/thrift/handler.cc index 4ee1097a20..6843959058 100644 --- a/thrift/handler.cc +++ b/thrift/handler.cc @@ -405,8 +405,10 @@ public: clustering_ranges.emplace_back(query::clustering_range::make_open_ended_both_sides()); auto slice = query::partition_slice(std::move(clustering_ranges), { }, std::move(regular_columns), opts, std::move(specific_ranges), cql_serialization_format::internal()); - return make_lw_shared(s.id(), s.version(), std::move(slice), proxy.get_max_result_size(slice), + auto cmd = make_lw_shared(s.id(), s.version(), std::move(slice), proxy.get_max_result_size(slice), query::row_limit(row_limit), query::partition_limit(partition_limit)); + cmd->allow_limit = db::allow_per_partition_rate_limit::yes; + return cmd; } static future<> do_get_paged_slice( @@ -692,6 +694,7 @@ public: auto& proxy = _proxy.local(); auto cmd = 
make_lw_shared(schema->id(), schema->version(), std::move(slice), proxy.get_max_result_size(slice), query::row_limit(row_limit)); + cmd->allow_limit = db::allow_per_partition_rate_limit::yes; auto f = _query_state.get_client_state().has_schema_access(_db, *schema, auth::permission::SELECT); return f.then([this, &proxy, dk = std::move(dk), cmd, schema, column_limit = request.count, cl = request.consistency_level, permit = std::move(permit)] () mutable { auto timeout = db::timeout_clock::now() + _timeout_config.read_timeout; @@ -1611,7 +1614,9 @@ private: } auto slice = query::partition_slice(std::move(clustering_ranges), {}, std::move(regular_columns), opts, nullptr, cql_serialization_format::internal(), per_partition_row_limit); - return make_lw_shared(s.id(), s.version(), std::move(slice), proxy.get_max_result_size(slice)); + auto cmd = make_lw_shared(s.id(), s.version(), std::move(slice), proxy.get_max_result_size(slice)); + cmd->allow_limit = db::allow_per_partition_rate_limit::yes; + return cmd; } static ColumnParent column_path_to_column_parent(const ColumnPath& column_path) { ColumnParent ret; From 1a36029ab5b6cc19ad81885df47719deae8d466b Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Mon, 13 Jun 2022 16:30:14 +0200 Subject: [PATCH 24/29] cf_prop_defs: guard per-partition rate limit with a feature The per-partition rate limit feature requires all nodes in the cluster to support it in order to work well. This commit adds a check which disallows creating/altering tables with per-partition rate limit until the node is sure that all nodes in the cluster support it. 
--- cql3/statements/cf_prop_defs.cc | 17 +++++++++++++++++ cql3/statements/cf_prop_defs.hh | 1 + 2 files changed, 18 insertions(+) diff --git a/cql3/statements/cf_prop_defs.cc b/cql3/statements/cf_prop_defs.cc index 457ab6227c..23d6fc4e46 100644 --- a/cql3/statements/cf_prop_defs.cc +++ b/cql3/statements/cf_prop_defs.cc @@ -17,6 +17,8 @@ #include "gms/feature_service.hh" #include "tombstone_gc_extension.hh" #include "tombstone_gc.hh" +#include "db/per_partition_rate_limit_extension.hh" +#include "db/per_partition_rate_limit_options.hh" #include @@ -127,6 +129,11 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name, throw exceptions::configuration_exception("CDC not supported by the cluster"); } + auto per_partition_rate_limit_options = get_per_partition_rate_limit_options(schema_extensions); + if (per_partition_rate_limit_options && !db.features().typed_errors_in_read_rpc) { + throw exceptions::configuration_exception("Per-partition rate limit is not supported yet by the whole cluster"); + } + auto tombstone_gc_options = get_tombstone_gc_options(schema_extensions); validate_tombstone_gc_options(tombstone_gc_options, db, ks_name); @@ -219,6 +226,16 @@ const tombstone_gc_options* cf_prop_defs::get_tombstone_gc_options(const schema: return &ext->get_options(); } +const db::per_partition_rate_limit_options* cf_prop_defs::get_per_partition_rate_limit_options(const schema::extensions_map& schema_exts) const { + auto it = schema_exts.find(db::per_partition_rate_limit_extension::NAME); + if (it == schema_exts.end()) { + return nullptr; + } + + auto ext = dynamic_pointer_cast(it->second); + return &ext->get_options(); +} + void cf_prop_defs::apply_to_builder(schema_builder& builder, schema::extensions_map schema_extensions) const { if (has_property(KW_COMMENT)) { builder.set_comment(get_string(KW_COMMENT, "")); diff --git a/cql3/statements/cf_prop_defs.hh b/cql3/statements/cf_prop_defs.hh index 30699f3170..261835bacb 100644 --- 
a/cql3/statements/cf_prop_defs.hh +++ b/cql3/statements/cf_prop_defs.hh @@ -78,6 +78,7 @@ public: const cdc::options* get_cdc_options(const schema::extensions_map&) const; std::optional get_caching_options() const; const tombstone_gc_options* get_tombstone_gc_options(const schema::extensions_map&) const; + const db::per_partition_rate_limit_options* get_per_partition_rate_limit_options(const schema::extensions_map&) const; #if 0 public CachingOptions getCachingOptions() throws SyntaxException, ConfigurationException { From 761a037afb6767dd60d62e3c57718b4b8e7df2b6 Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Mon, 20 Jun 2022 14:39:47 +0200 Subject: [PATCH 25/29] config: add add_per_partition_rate_limit_extension function for testing ...and use it in cql_test_env to enable the per_partition_rate_limit extension for all tests that use it. --- db/config.cc | 5 +++++ db/config.hh | 1 + test/lib/cql_test_env.cc | 1 + 3 files changed, 7 insertions(+) diff --git a/db/config.cc b/db/config.cc index a7e8c0ec6f..9fb19a1eca 100644 --- a/db/config.cc +++ b/db/config.cc @@ -23,6 +23,7 @@ #include "cdc/cdc_extension.hh" #include "tombstone_gc_extension.hh" +#include "db/per_partition_rate_limit_extension.hh" #include "config.hh" #include "extensions.hh" #include "log.hh" @@ -910,6 +911,10 @@ void db::config::add_cdc_extension() { _extensions->add_schema_extension(cdc::cdc_extension::NAME); } +void db::config::add_per_partition_rate_limit_extension() { + _extensions->add_schema_extension(db::per_partition_rate_limit_extension::NAME); +} + void db::config::setup_directories() { maybe_in_workdir(commitlog_directory, "commitlog"); maybe_in_workdir(data_file_directories, "data"); diff --git a/db/config.hh b/db/config.hh index d2ba29786c..7f87358aec 100644 --- a/db/config.hh +++ b/db/config.hh @@ -105,6 +105,7 @@ public: // For testing only void add_cdc_extension(); + void add_per_partition_rate_limit_extension(); /// True iff the feature is enabled. 
bool check_experimental(experimental_features_t::feature f) const; diff --git a/test/lib/cql_test_env.cc b/test/lib/cql_test_env.cc index b1d68125e0..21b577d276 100644 --- a/test/lib/cql_test_env.cc +++ b/test/lib/cql_test_env.cc @@ -101,6 +101,7 @@ cql_test_config::cql_test_config(shared_ptr cfg) db_config->commitlog_use_o_dsync.set(false); db_config->add_cdc_extension(); + db_config->add_per_partition_rate_limit_extension(); db_config->flush_schema_tables_after_modification.set(false); } From bc5016301643d214fddba13efbbdb920d2767e25 Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Mon, 20 Jun 2022 14:40:40 +0200 Subject: [PATCH 26/29] tests: add per_partition_rate_limit_test Adds the per_partition_rate_limit_test.cc file. Currently, it only contains a test which verifies that the feature correctly switches off rate limiting for internal queries (!allow_limit || internal sg). --- configure.py | 1 + test/boost/per_partition_rate_limit_test.cc | 109 ++++++++++++++++++++ 2 files changed, 110 insertions(+) create mode 100644 test/boost/per_partition_rate_limit_test.cc diff --git a/configure.py b/configure.py index 08f99d3fc2..eda12da0ca 100755 --- a/configure.py +++ b/configure.py @@ -506,6 +506,7 @@ scylla_tests = set([ 'test/boost/exception_container_test', 'test/boost/result_utils_test', 'test/boost/rate_limiter_test', + 'test/boost/per_partition_rate_limit_test', 'test/boost/expr_test', 'test/manual/ec2_snitch_test', 'test/manual/enormous_table_scan_test', diff --git a/test/boost/per_partition_rate_limit_test.cc b/test/boost/per_partition_rate_limit_test.cc new file mode 100644 index 0000000000..0c52e310f3 --- /dev/null +++ b/test/boost/per_partition_rate_limit_test.cc @@ -0,0 +1,109 @@ +#include +#include +#include +#include + +#include "test/lib/cql_test_env.hh" +#include "test/lib/cql_assertions.hh" + +#include "mutation.hh" +#include "service/storage_proxy.hh" + +SEASTAR_TEST_CASE(test_internal_operation_filtering) { + return do_with_cql_env_thread([] 
(cql_test_env& e) -> future<> { + // The test requires at least two shards + // so that it can test the shard!=coordinator case + BOOST_REQUIRE_GT(smp::count, 1); + + cquery_nofail(e, "CREATE TABLE ks.tbl (pk int PRIMARY KEY) \ + WITH per_partition_rate_limit = {'max_reads_per_second': 1, 'max_writes_per_second': 1}"); + + auto& db = e.db(); + auto& qp = e.qp(); + const auto sptr = db.local().find_schema("ks", "tbl"); + + auto pk = partition_key::from_singular(*sptr, int32_t(0)); + + unsigned local_shard = dht::shard_of(*sptr, dht::get_token(*sptr, pk.view())); + unsigned foreign_shard = (local_shard + 1) % smp::count; + + auto run_writes = [&qp, &db, pk] (db::allow_per_partition_rate_limit allow_limit) -> future<> { + BOOST_TEST_MESSAGE("Testing writes"); + + const auto sptr = db.local().find_schema("ks", "tbl"); + auto m = mutation(sptr, partition_key(pk)); + + // Rejection is probabilistic, so try many times + for (int i = 0; i < 100; i++) { + qp.local().proxy().mutate({m}, + db::consistency_level::ALL, + service::storage_proxy::clock_type::now() + std::chrono::seconds(10), + nullptr, + empty_service_permit(), + allow_limit).get(); + } + + return make_ready_future<>(); + }; + + auto run_reads = [&qp, &db, pk] (db::allow_per_partition_rate_limit allow_limit) -> future<> { + BOOST_TEST_MESSAGE("Testing reads"); + + const auto sptr = db.local().find_schema("ks", "tbl"); + auto pk_def = sptr->get_column_definition("pk"); + auto dk = dht::decorate_key(*sptr, partition_key(pk)); + auto selection = cql3::selection::selection::for_columns(sptr, {pk_def}); + auto opts = selection->get_query_options(); + auto partition_slice = query::partition_slice( + {query::clustering_range::make_open_ended_both_sides()}, {}, {}, std::move(opts)); + + auto cmd = make_lw_shared(sptr->id(), sptr->version(), partition_slice, query::max_result_size(1), query::row_limit(1)); + cmd->allow_limit = allow_limit; + + // Rejection is probabilistic, so try many times + for (int i = 0; i < 100; 
i++) { + qp.local().proxy().query(sptr, + cmd, + {dht::partition_range(dk)}, + db::consistency_level::ALL, + service::storage_proxy::coordinator_query_options( + db::timeout_clock::now() + std::chrono::seconds(10), + empty_service_permit(), + service::client_state::for_internal_calls())).get(); + } + + return make_ready_future<>(); + }; + + auto sgroups = get_scheduling_groups().get(); + + for (unsigned shard : {local_shard, foreign_shard}) { + for (scheduling_group sg : {sgroups.statement_scheduling_group, sgroups.streaming_scheduling_group}) { + for (db::allow_per_partition_rate_limit allow_limit : {db::allow_per_partition_rate_limit::yes, db::allow_per_partition_rate_limit::no}) { + // Rate limiting must be explicitly enabled and handled on the correct scheduling group. + const bool expect_limiting = (sg == sgroups.statement_scheduling_group) && bool(allow_limit); + + BOOST_TEST_MESSAGE(format("Test config, shard: {}, scheduling_group: {}, allow_limit: {}, expect_limiting: {}", + (shard == local_shard) ? "local" : "foreign", + (sg == sgroups.statement_scheduling_group) ? 
"statement" : "streaming", + allow_limit, + expect_limiting)); + + smp::submit_to(shard, [&] () mutable { + return seastar::async(thread_attributes{sg}, [&] { + if (expect_limiting) { + BOOST_REQUIRE_THROW(run_writes(allow_limit).get(), exceptions::rate_limit_exception); + BOOST_REQUIRE_THROW(run_reads(allow_limit).get(), exceptions::rate_limit_exception); + } else { + BOOST_REQUIRE_NO_THROW(run_writes(allow_limit).get()); + BOOST_REQUIRE_NO_THROW(run_reads(allow_limit).get()); + } + }); + }).get(); + } + } + } + + return make_ready_future<>(); + }); +} \ No newline at end of file From 13a5022499ff30934ebad77b1bd988b1fc0ad994 Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Wed, 22 Jun 2022 14:32:56 +0200 Subject: [PATCH 27/29] database: add stats for per partition rate limiting Adds statistics which count how many times a replica has decided to reject a write ("total_writes_rate_limited") or a read ("total_reads_rate_limited"). --- replica/database.cc | 13 +++++++++++++ replica/database.hh | 2 ++ 2 files changed, 15 insertions(+) diff --git a/replica/database.cc b/replica/database.cc index 2a5f4929eb..1c66faf172 100644 --- a/replica/database.cc +++ b/replica/database.cc @@ -543,6 +543,9 @@ database::setup_metrics() { sm::make_counter("total_writes_timedout", _stats->total_writes_timedout, sm::description("Counts write operations failed due to a timeout. 
A positive value is a sign of storage being overloaded.")), + sm::make_counter("total_writes_rate_limited", _stats->total_writes_rate_limited, + sm::description("Counts write operations which were rejected on the replica side because the per-partition limit was reached.")), + sm::make_counter("total_reads", _read_concurrency_sem.get_stats().total_successful_reads, sm::description("Counts the total number of successful user reads on this shard."), {user_label_instance}), @@ -561,6 +564,9 @@ database::setup_metrics() { "Add the total_reads to this value to get the total amount of reads issued on this shard."), {system_label_instance}), + sm::make_counter("total_reads_rate_limited", _stats->total_reads_rate_limited, + sm::description("Counts read operations which were rejected on the replica side because the per-partition limit was reached.")), + sm::make_current_bytes("view_update_backlog", [this] { return get_view_update_backlog().current; }, sm::description("Holds the current size in bytes of the pending view updates for all tables")), @@ -1379,6 +1385,7 @@ database::query(schema_ptr s, const query::read_command& cmd, query::result_opti column_family& cf = find_column_family(cmd.cf_id); if (account_singular_ranges_to_rate_limit(_rate_limiter, cf, ranges, _dbcfg, rate_limit_info) == db::rate_limiter::can_proceed::no) { + ++_stats->total_reads_rate_limited; co_await coroutine::return_exception(replica::rate_limit_exception()); } @@ -1884,6 +1891,12 @@ Future database::update_write_metrics(Future&& f) { if (is_timeout_exception(ep)) { ++s->total_writes_timedout; } + try { + std::rethrow_exception(ep); + } catch (replica::rate_limit_exception&) { + ++s->total_writes_rate_limited; + } catch (...) 
{ + } return futurize::make_exception_future(std::move(ep)); } ++s->total_writes; diff --git a/replica/database.hh b/replica/database.hh index fc7dd7556d..f642ee2c61 100644 --- a/replica/database.hh +++ b/replica/database.hh @@ -1277,8 +1277,10 @@ private: uint64_t total_writes = 0; uint64_t total_writes_failed = 0; uint64_t total_writes_timedout = 0; + uint64_t total_writes_rate_limited = 0; uint64_t total_reads = 0; uint64_t total_reads_failed = 0; + uint64_t total_reads_rate_limited = 0; uint64_t short_data_queries = 0; uint64_t short_mutation_queries = 0; From 6e5d48697087407e423c5e881028e7c2bba525d5 Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Wed, 22 Jun 2022 14:45:47 +0200 Subject: [PATCH 28/29] storage_proxy: metrics for per-partition rate limiting of writes Adds a metric "write_rate_limited" which indicates how many times a write operation was rejected due to per-partition rate limiting. The metric differentiates between writes rejected by the coordinator and writes rejected by replicas. 
--- service/storage_proxy.cc | 18 ++++++++++++++++++ service/storage_proxy_stats.hh | 2 ++ 2 files changed, 20 insertions(+) diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index eb1418f765..4e2cbe0d24 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -113,6 +113,7 @@ static const sstring COORDINATOR_STATS_CATEGORY("storage_proxy_coordinator"); static const sstring REPLICA_STATS_CATEGORY("storage_proxy_replica"); static const seastar::metrics::label op_type_label("op_type"); static const seastar::metrics::label scheduling_group_label("scheduling_group_name"); +static const seastar::metrics::label rejected_by_coordinator_label("rejected_by_coordinator"); seastar::metrics::label_instance current_scheduling_group_label() { return scheduling_group_label(current_scheduling_group().name()); @@ -1579,6 +1580,14 @@ void storage_proxy_stats::write_stats::register_stats() { sm::description("number write requests failed due to an \"unavailable\" error"), {storage_proxy_stats::current_scheduling_group_label()}), + sm::make_total_operations("write_rate_limited", write_rate_limited_by_replicas._count, + sm::description("number of write requests which were rejected by replicas because rate limit for the partition was reached."), + {storage_proxy_stats::current_scheduling_group_label(), storage_proxy_stats::rejected_by_coordinator_label(false)}), + + sm::make_total_operations("write_rate_limited", write_rate_limited_by_coordinator._count, + sm::description("number of write requests which were rejected directly on the coordinator because rate limit for the partition was reached."), + {storage_proxy_stats::current_scheduling_group_label(),storage_proxy_stats::rejected_by_coordinator_label(true)}), + sm::make_total_operations("background_writes_failed", background_writes_failed, sm::description("number of write requests that failed after CL was reached"), {storage_proxy_stats::current_scheduling_group_label()}), @@ -2244,6 +2253,15 @@ future> 
storage_proxy::mutate_end(future> mutate_result, utils stats.write_unavailables.mark(); slogger.trace("Unavailable"); return handle.into_future(); + }), utils::result_catch([&] (const auto& ex, auto&& handle) { + tracing::trace(trace_state, "Mutation failed: rate limit exceeded"); + if (ex.rejected_by_coordinator) { + stats.write_rate_limited_by_coordinator.mark(); + } else { + stats.write_rate_limited_by_replicas.mark(); + } + slogger.trace("Rate limit exceeded"); + return handle.into_future(); }), utils::result_catch([&] (const auto& ex, auto&& handle) { tracing::trace(trace_state, "Mutation failed: overloaded"); stats.write_unavailables.mark(); diff --git a/service/storage_proxy_stats.hh b/service/storage_proxy_stats.hh index eb9cf55815..ade1a901c5 100644 --- a/service/storage_proxy_stats.hh +++ b/service/storage_proxy_stats.hh @@ -75,6 +75,8 @@ struct write_stats { utils::timed_rate_moving_average write_unavailables; utils::timed_rate_moving_average write_timeouts; + utils::timed_rate_moving_average write_rate_limited_by_replicas; + utils::timed_rate_moving_average write_rate_limited_by_coordinator; utils::timed_rate_moving_average_and_histogram write; utils::time_estimated_histogram estimated_write; From 442901f14aeb0f2957ea68b07ae690b05bce691e Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Wed, 22 Jun 2022 14:46:13 +0200 Subject: [PATCH 29/29] storage_proxy: metrics for per-partition rate limiting of reads Adds a metric "read_rate_limited" which indicates how many times a read operation was rejected due to per-partition rate limiting. The metric differentiates between reads rejected by the coordinator and reads rejected by replicas. 
--- service/storage_proxy.cc | 16 ++++++++++++++++ service/storage_proxy_stats.hh | 2 ++ 2 files changed, 18 insertions(+) diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index 4e2cbe0d24..16ae4c889a 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -1672,6 +1672,14 @@ void storage_proxy_stats::stats::register_stats() { sm::description("number read requests failed due to an \"unavailable\" error"), {storage_proxy_stats::current_scheduling_group_label()}), + sm::make_total_operations("read_rate_limited", read_rate_limited_by_replicas._count, + sm::description("number of read requests which were rejected by replicas because rate limit for the partition was reached."), + {storage_proxy_stats::current_scheduling_group_label(), storage_proxy_stats::rejected_by_coordinator_label(false)}), + + sm::make_total_operations("read_rate_limited", read_rate_limited_by_coordinator._count, + sm::description("number of read requests which were rejected directly on the coordinator because rate limit for the partition was reached."), + {storage_proxy_stats::current_scheduling_group_label(), storage_proxy_stats::rejected_by_coordinator_label(true)}), + sm::make_total_operations("range_timeouts", range_slice_timeouts._count, sm::description("number of range read operations failed due to a timeout"), {storage_proxy_stats::current_scheduling_group_label()}), @@ -4251,6 +4259,14 @@ void storage_proxy::handle_read_error(std::variant([&] (const auto& ex) { + slogger.debug("Read was rate limited"); + if (ex.rejected_by_coordinator) { + get_stats().read_rate_limited_by_coordinator.mark(); + } else { + get_stats().read_rate_limited_by_replicas.mark(); + } + return bo::success(); }), utils::result_catch_dots([&] (auto&& handle) { slogger.debug("Error during read query {}", handle.as_inner()); return bo::success(); diff --git a/service/storage_proxy_stats.hh b/service/storage_proxy_stats.hh index ade1a901c5..fcecbd508b 100644 --- 
a/service/storage_proxy_stats.hh +++ b/service/storage_proxy_stats.hh @@ -127,6 +127,8 @@ struct stats : public write_stats { seastar::metrics::metric_groups _metrics; utils::timed_rate_moving_average read_timeouts; utils::timed_rate_moving_average read_unavailables; + utils::timed_rate_moving_average read_rate_limited_by_replicas; + utils::timed_rate_moving_average read_rate_limited_by_coordinator; utils::timed_rate_moving_average range_slice_timeouts; utils::timed_rate_moving_average range_slice_unavailables;