diff --git a/CMakeLists.txt b/CMakeLists.txt index 7484d0a43d..312dd620e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -450,6 +450,7 @@ set(scylla_sources db/large_data_handler.cc db/legacy_schema_migrator.cc db/marshal/type_parser.cc + db/rate_limiter.cc db/schema_tables.cc db/size_estimates_virtual_reader.cc db/snapshot-ctl.cc diff --git a/alternator/executor.cc b/alternator/executor.cc index 4eddf38ebd..9976c0b1d5 100644 --- a/alternator/executor.cc +++ b/alternator/executor.cc @@ -1510,6 +1510,7 @@ static future> get_previous_item( stats.reads_before_write++; auto selection = cql3::selection::selection::wildcard(schema); auto command = previous_item_read_command(proxy, schema, ck, selection); + command->allow_limit = db::allow_per_partition_rate_limit::yes; auto cl = db::consistency_level::LOCAL_QUORUM; return proxy.query(schema, command, to_partition_ranges(*schema, pk), cl, service::storage_proxy::coordinator_query_options(executor::default_timeout(), std::move(permit), client_state)).then( @@ -1543,7 +1544,7 @@ future rmw_operation::execute(service::storage_pr if (!m) { return make_ready_future(api_error::conditional_check_failed("Failed condition.")); } - return proxy.mutate(std::vector{std::move(*m)}, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), trace_state, std::move(permit)).then([this] () mutable { + return proxy.mutate(std::vector{std::move(*m)}, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), trace_state, std::move(permit), db::allow_per_partition_rate_limit::yes).then([this] () mutable { return rmw_operation_return(std::move(_return_attributes)); }); }); @@ -1551,7 +1552,7 @@ future rmw_operation::execute(service::storage_pr } else if (_write_isolation != write_isolation::LWT_ALWAYS) { std::optional m = apply(nullptr, api::new_timestamp()); assert(m); // !needs_read_before_write, so apply() did not check a condition - return proxy.mutate(std::vector{std::move(*m)}, db::consistency_level::LOCAL_QUORUM, 
executor::default_timeout(), trace_state, std::move(permit)).then([this] () mutable { + return proxy.mutate(std::vector{std::move(*m)}, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), trace_state, std::move(permit), db::allow_per_partition_rate_limit::yes).then([this] () mutable { return rmw_operation_return(std::move(_return_attributes)); }); } @@ -1896,7 +1897,8 @@ static future<> do_batch_write(service::storage_proxy& proxy, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), trace_state, - std::move(permit)); + std::move(permit), + db::allow_per_partition_rate_limit::yes); } else { // Do the write via LWT: // Multiple mutations may be destined for the same partition, adding @@ -3252,6 +3254,7 @@ future executor::batch_get_item(client_state& cli auto selection = cql3::selection::selection::wildcard(rs.schema); auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options()); auto command = ::make_lw_shared(rs.schema->id(), rs.schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice)); + command->allow_limit = db::allow_per_partition_rate_limit::yes; future> f = _proxy.query(rs.schema, std::move(command), std::move(partition_ranges), rs.cl, service::storage_proxy::coordinator_query_options(executor::default_timeout(), permit, client_state, trace_state)).then( [schema = rs.schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = rs.attrs_to_get] (service::storage_proxy::coordinator_query_result qr) mutable { diff --git a/alternator/ttl.cc b/alternator/ttl.cc index 3396073a9c..eb8eee73fb 100644 --- a/alternator/ttl.cc +++ b/alternator/ttl.cc @@ -284,7 +284,8 @@ static future<> expire_item(service::storage_proxy& proxy, return proxy.mutate(std::vector{std::move(m)}, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), // FIXME - which timeout? 
- qs.get_trace_state(), qs.get_permit()); + qs.get_trace_state(), qs.get_permit(), + db::allow_per_partition_rate_limit::no); } static size_t random_offset(size_t min, size_t max) { diff --git a/configure.py b/configure.py index 69b4febc00..eda12da0ca 100755 --- a/configure.py +++ b/configure.py @@ -505,6 +505,8 @@ scylla_tests = set([ 'test/boost/group0_test', 'test/boost/exception_container_test', 'test/boost/result_utils_test', + 'test/boost/rate_limiter_test', + 'test/boost/per_partition_rate_limit_test', 'test/boost/expr_test', 'test/manual/ec2_snitch_test', 'test/manual/enormous_table_scan_test', @@ -668,6 +670,7 @@ scylla_core = (['replica/database.cc', 'replica/table.cc', 'replica/distributed_loader.cc', 'replica/memtable.cc', + 'replica/exceptions.cc', 'absl-flat_hash_map.cc', 'atomic_cell.cc', 'caching_options.cc', @@ -886,6 +889,8 @@ scylla_core = (['replica/database.cc', 'db/view/row_locking.cc', 'db/sstables-format-selector.cc', 'db/snapshot-ctl.cc', + 'db/rate_limiter.cc', + 'db/per_partition_rate_limit_options.cc', 'index/secondary_index_manager.cc', 'index/secondary_index.cc', 'utils/UUID_gen.cc', @@ -1128,6 +1133,8 @@ idls = ['idl/gossip_digest.idl.hh', 'idl/storage_proxy.idl.hh', 'idl/group0_state_machine.idl.hh', 'idl/forward_request.idl.hh', + 'idl/replica_exception.idl.hh', + 'idl/per_partition_rate_limit_info.idl.hh', ] rusts = [ @@ -1279,6 +1286,7 @@ deps['test/boost/linearizing_input_stream_test'] = [ "test/lib/log.cc", ] deps['test/boost/expr_test'] = ['test/boost/expr_test.cc'] + scylla_core +deps['test/boost/rate_limiter_test'] = ['test/boost/rate_limiter_test.cc', 'db/rate_limiter.cc'] deps['test/boost/duration_test'] += ['test/lib/exception_utils.cc'] deps['test/boost/schema_loader_test'] += ['tools/schema_loader.cc'] diff --git a/cql3/statements/batch_statement.cc b/cql3/statements/batch_statement.cc index 9994706ecb..20a89b8990 100644 --- a/cql3/statements/batch_statement.cc +++ b/cql3/statements/batch_statement.cc @@ -317,7 +317,7 @@ 
future> batch_statement::execute_without_conditions( mutate_atomic = false; } } - return qp.proxy().mutate_with_triggers(std::move(mutations), cl, timeout, mutate_atomic, std::move(tr_state), std::move(permit)); + return qp.proxy().mutate_with_triggers(std::move(mutations), cl, timeout, mutate_atomic, std::move(tr_state), std::move(permit), db::allow_per_partition_rate_limit::yes); } future> batch_statement::execute_with_conditions( diff --git a/cql3/statements/cf_prop_defs.cc b/cql3/statements/cf_prop_defs.cc index 457ab6227c..23d6fc4e46 100644 --- a/cql3/statements/cf_prop_defs.cc +++ b/cql3/statements/cf_prop_defs.cc @@ -17,6 +17,8 @@ #include "gms/feature_service.hh" #include "tombstone_gc_extension.hh" #include "tombstone_gc.hh" +#include "db/per_partition_rate_limit_extension.hh" +#include "db/per_partition_rate_limit_options.hh" #include @@ -127,6 +129,11 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name, throw exceptions::configuration_exception("CDC not supported by the cluster"); } + auto per_partition_rate_limit_options = get_per_partition_rate_limit_options(schema_extensions); + if (per_partition_rate_limit_options && !db.features().typed_errors_in_read_rpc) { + throw exceptions::configuration_exception("Per-partition rate limit is not supported yet by the whole cluster"); + } + auto tombstone_gc_options = get_tombstone_gc_options(schema_extensions); validate_tombstone_gc_options(tombstone_gc_options, db, ks_name); @@ -219,6 +226,16 @@ const tombstone_gc_options* cf_prop_defs::get_tombstone_gc_options(const schema: return &ext->get_options(); } +const db::per_partition_rate_limit_options* cf_prop_defs::get_per_partition_rate_limit_options(const schema::extensions_map& schema_exts) const { + auto it = schema_exts.find(db::per_partition_rate_limit_extension::NAME); + if (it == schema_exts.end()) { + return nullptr; + } + + auto ext = dynamic_pointer_cast(it->second); + return &ext->get_options(); +} + void 
cf_prop_defs::apply_to_builder(schema_builder& builder, schema::extensions_map schema_extensions) const { if (has_property(KW_COMMENT)) { builder.set_comment(get_string(KW_COMMENT, "")); diff --git a/cql3/statements/cf_prop_defs.hh b/cql3/statements/cf_prop_defs.hh index 30699f3170..261835bacb 100644 --- a/cql3/statements/cf_prop_defs.hh +++ b/cql3/statements/cf_prop_defs.hh @@ -78,6 +78,7 @@ public: const cdc::options* get_cdc_options(const schema::extensions_map&) const; std::optional get_caching_options() const; const tombstone_gc_options* get_tombstone_gc_options(const schema::extensions_map&) const; + const db::per_partition_rate_limit_options* get_per_partition_rate_limit_options(const schema::extensions_map&) const; #if 0 public CachingOptions getCachingOptions() throws SyntaxException, ConfigurationException { diff --git a/cql3/statements/modification_statement.cc b/cql3/statements/modification_statement.cc index 9c627b8394..872168cb25 100644 --- a/cql3/statements/modification_statement.cc +++ b/cql3/statements/modification_statement.cc @@ -284,7 +284,7 @@ modification_statement::execute_without_condition(query_processor& qp, service:: return make_ready_future>(bo::success()); } - return qp.proxy().mutate_with_triggers(std::move(mutations), cl, timeout, false, qs.get_trace_state(), qs.get_permit(), this->is_raw_counter_shard_write()); + return qp.proxy().mutate_with_triggers(std::move(mutations), cl, timeout, false, qs.get_trace_state(), qs.get_permit(), db::allow_per_partition_rate_limit::yes, this->is_raw_counter_shard_write()); }); } diff --git a/cql3/statements/select_statement.cc b/cql3/statements/select_statement.cc index 46831b04b0..fb6ec88fcd 100644 --- a/cql3/statements/select_statement.cc +++ b/cql3/statements/select_statement.cc @@ -360,6 +360,7 @@ select_statement::do_execute(query_processor& qp, utils::UUID(), query::is_first_page::no, options.get_timestamp(state)); + command->allow_limit = db::allow_per_partition_rate_limit::yes; int32_t 
page_size = options.get_page_size(); @@ -530,6 +531,7 @@ indexed_table_select_statement::prepare_command_for_base_query(query_processor& utils::UUID(), query::is_first_page::no, options.get_timestamp(state)); + cmd->allow_limit = db::allow_per_partition_rate_limit::yes; return cmd; } diff --git a/db/batchlog_manager.cc b/db/batchlog_manager.cc index 81a9c790e4..4b755f78a5 100644 --- a/db/batchlog_manager.cc +++ b/db/batchlog_manager.cc @@ -243,7 +243,7 @@ future<> db::batchlog_manager::replay_all_failed_batches() { // send to partially or wholly fail in actually sending stuff. Since we don't // have hints (yet), send with CL=ALL, and hope we can re-do this soon. // See below, we use retry on write failure. - return _qp.proxy().mutate(mutations, db::consistency_level::ALL, db::no_timeout, nullptr, empty_service_permit()); + return _qp.proxy().mutate(mutations, db::consistency_level::ALL, db::no_timeout, nullptr, empty_service_permit(), db::allow_per_partition_rate_limit::no); }); }).then_wrapped([this, id](future<> batch_result) { try { diff --git a/db/config.cc b/db/config.cc index a7e8c0ec6f..9fb19a1eca 100644 --- a/db/config.cc +++ b/db/config.cc @@ -23,6 +23,7 @@ #include "cdc/cdc_extension.hh" #include "tombstone_gc_extension.hh" +#include "db/per_partition_rate_limit_extension.hh" #include "config.hh" #include "extensions.hh" #include "log.hh" @@ -910,6 +911,10 @@ void db::config::add_cdc_extension() { _extensions->add_schema_extension(cdc::cdc_extension::NAME); } +void db::config::add_per_partition_rate_limit_extension() { + _extensions->add_schema_extension(db::per_partition_rate_limit_extension::NAME); +} + void db::config::setup_directories() { maybe_in_workdir(commitlog_directory, "commitlog"); maybe_in_workdir(data_file_directories, "data"); diff --git a/db/config.hh b/db/config.hh index 5d33762731..0905ad35e5 100644 --- a/db/config.hh +++ b/db/config.hh @@ -107,6 +107,7 @@ public: // For testing only void add_cdc_extension(); + void 
add_per_partition_rate_limit_extension(); /// True iff the feature is enabled. bool check_experimental(experimental_features_t::feature f) const; diff --git a/db/operation_type.hh b/db/operation_type.hh new file mode 100644 index 0000000000..def6c35bfc --- /dev/null +++ b/db/operation_type.hh @@ -0,0 +1,23 @@ +/* + * Copyright (C) 2022-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#pragma once + +#include +#include + +namespace db { + +enum class operation_type : uint8_t { + read = 0, + write = 1 +}; + +std::ostream& operator<<(std::ostream& os, operation_type op_type); + +} diff --git a/db/per_partition_rate_limit_extension.hh b/db/per_partition_rate_limit_extension.hh new file mode 100644 index 0000000000..e8fb55ac3c --- /dev/null +++ b/db/per_partition_rate_limit_extension.hh @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2022-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#pragma once + +#include "db/per_partition_rate_limit_options.hh" + +namespace db { + +class per_partition_rate_limit_extension : public schema_extension { + per_partition_rate_limit_options _options; +public: + static constexpr auto NAME = "per_partition_rate_limit"; + + per_partition_rate_limit_extension() = default; + per_partition_rate_limit_extension(const per_partition_rate_limit_options& opts) : _options(opts) {} + + explicit per_partition_rate_limit_extension(const std::map& tags) : _options(tags) {} + explicit per_partition_rate_limit_extension(const bytes& b) : _options(deserialize(b)) {} + explicit per_partition_rate_limit_extension(const sstring& s) { + throw std::logic_error("Cannot create per partition rate limit info from string"); + } + + bytes serialize() const override { + return ser::serialize_to_buffer(_options.to_map()); + } + static std::map deserialize(const bytes_view& buffer) { + return ser::deserialize_from_buffer(buffer, boost::type>()); + } + const per_partition_rate_limit_options& get_options() const 
{ + return _options; + } + +}; + +} + diff --git a/db/per_partition_rate_limit_info.hh b/db/per_partition_rate_limit_info.hh new file mode 100644 index 0000000000..e9cf0dfd45 --- /dev/null +++ b/db/per_partition_rate_limit_info.hh @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2022-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#pragma once + +#include +#include +#include + +namespace db { + +using allow_per_partition_rate_limit = seastar::bool_class; + +namespace per_partition_rate_limit { + +// Tells the replica to account the operation (increase the corresponding counter) +// and accept it regardless from the value of the counter. +// +// Used when the coordinator IS a replica (correct node and shard). +struct account_only {}; + +// Tells the replica to account the operation and decide whether to reject +// or not, based on the random variable sent by the coordinator. +// +// Used when the coordinator IS NOT a replica (wrong node or shard). +struct account_and_enforce { + // A random 32-bit number generated by the coordinator. + // Replicas are supposed to use it in order to decide whether + // to accept or reject. 
+ uint32_t random_variable; + + inline double get_random_variable_as_double() const { + return double(random_variable) / double(1LL << 32); + } +}; + +// std::monostate -> do not count to the rate limit and never reject +// account_and_enforce -> account to the rate limit and optionally reject +using info = std::variant; + +} // namespace per_partition_rate_limit + +} // namespace db + diff --git a/db/per_partition_rate_limit_options.cc b/db/per_partition_rate_limit_options.cc new file mode 100644 index 0000000000..fb2107c80e --- /dev/null +++ b/db/per_partition_rate_limit_options.cc @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2022-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#include +#include + +#include "serializer.hh" +#include "schema.hh" +#include "log.hh" + +namespace db { + +const char* per_partition_rate_limit_options::max_writes_per_second_key = "max_writes_per_second"; +const char* per_partition_rate_limit_options::max_reads_per_second_key = "max_reads_per_second"; + +per_partition_rate_limit_options::per_partition_rate_limit_options(std::map map) { + auto handle_uint32_arg = [&] (const char* key) -> std::optional { + auto it = map.find(key); + if (it == map.end()) { + return std::nullopt; + } + try { + const uint32_t ret = std::stol(it->second); + map.erase(it); + return ret; + } catch (std::invalid_argument&) { + throw exceptions::configuration_exception(format( + "Invalid value for {} option: expected a non-negative number", + key)); + } catch (std::out_of_range&) { + throw exceptions::configuration_exception(format( + "Value for {} is out of range accepted by 32-bit numbers", + key)); + } + }; + + _max_writes_per_second = handle_uint32_arg(max_writes_per_second_key); + _max_reads_per_second = handle_uint32_arg(max_reads_per_second_key); + + if (!map.empty()) { + throw exceptions::configuration_exception(format( + "Unknown keys in map for per_partition_rate_limit extension: {}", + ::join(", ", map | 
boost::adaptors::map_keys))); + } +} + +std::map per_partition_rate_limit_options::to_map() const { + std::map ret; + if (_max_writes_per_second) { + ret.insert_or_assign(max_writes_per_second_key, std::to_string(*_max_writes_per_second)); + } + if (_max_reads_per_second) { + ret.insert_or_assign(max_reads_per_second_key, std::to_string(*_max_reads_per_second)); + } + return ret; +} + +} diff --git a/db/per_partition_rate_limit_options.hh b/db/per_partition_rate_limit_options.hh new file mode 100644 index 0000000000..f75a40a3ae --- /dev/null +++ b/db/per_partition_rate_limit_options.hh @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2022-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#pragma once + +#include + +#include "operation_type.hh" +#include "serializer.hh" +#include "schema.hh" +#include "log.hh" + +namespace db { + +class per_partition_rate_limit_options final { +private: + static const char* max_writes_per_second_key; + static const char* max_reads_per_second_key; + +private: + std::optional _max_writes_per_second; + std::optional _max_reads_per_second; + +public: + per_partition_rate_limit_options() = default; + per_partition_rate_limit_options(std::map map); + + std::map to_map() const; + + inline std::optional get_max_ops_per_second(operation_type op_type) const { + switch (op_type) { + case operation_type::write: + return _max_writes_per_second; + case operation_type::read: + return _max_reads_per_second; + } + } + + inline void set_max_writes_per_second(std::optional v) { + _max_writes_per_second = v; + } + + inline std::optional get_max_writes_per_second() const { + return _max_writes_per_second; + } + + inline void set_max_reads_per_second(std::optional v) { + _max_reads_per_second = v; + } + + inline std::optional get_max_reads_per_second() const { + return _max_reads_per_second; + } +}; + +} diff --git a/db/rate_limiter.cc b/db/rate_limiter.cc new file mode 100644 index 0000000000..195387289a --- /dev/null +++ 
b/db/rate_limiter.cc @@ -0,0 +1,305 @@ +/* + * Copyright (C) 2022-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "utils/small_vector.hh" +#include "utils/murmur_hash.hh" +#include "db/rate_limiter.hh" + +// The rate limiter keeps a hashmap of counters differentiated by operation type +// (e.g. read or write) and the partition token. On each operation, +// the corresponding counter is increased by 1. +// +// The counters are decremented via two mechanisms: +// +// 1. Every `time_window_duration`, all counters are halved. +// 2. Within a time window, on every `bucket_size` operations all counters +// are decremented by 1. +// +// The mechanism 1) makes sure that we do not forget about very frequent +// operations too quick and makes it possible to reject in a probabilistic +// manner (this is described in more detail in design notes). +// +// The mechanism 2) protects the internal hashmap from being flooded with +// counters with low values. This causes the rate limiter to underestimate +// the counter values by the current number of "buckets" within this +// time window. This strategy is also known as "lossy counting". +// +// Both mechanisms 1) and 2) are implemented in a lazy manner. 
+ +namespace db { + +static constexpr size_t hash_bits = 16; +static constexpr size_t entry_count = 1 << hash_bits; +static constexpr size_t bucket_size = 10000; + + +void rate_limiter_base::on_timer() noexcept { + _time_window_history.pop_back(); + _time_window_history.insert(_time_window_history.begin(), time_window_entry { + .entries_active = _current_entries_in_time_window, + .lossy_counting_decrease = _current_bucket, + }); + + _current_bucket = 0; + _current_ops_in_bucket = 0; + _current_entries_in_time_window = 0; + + _current_time_window = (_current_time_window + 1) % (1 << time_window_bits); + + // Because time window ids are 12 bit numbers and we increase the current + // time window number by 1 every second, it wraps around every 4096 + // seconds (more than an hour). Because of this, some very old entry + // updated last 4096 seconds may accidentally become valid again. + // + // In order to prevent this, we make sure to update the entries + // more frequently. We do this by refreshing all the entries within half + // of the wraparound period (2048 seconds). + // + // Instead of clearing everything at once, we divide this operation + // into many small steps and perform them during time window change. + // + // All of this should make sure that each entry's time window is not + // older than 2048 seconds from the current generation. + + constexpr size_t period = 1 << (time_window_bits - 1); + constexpr size_t entries_per_step = entry_count / period; + + const size_t begin = _current_time_window * entries_per_step; + for (size_t i = 0; i < entries_per_step; i++) { + entry_refresh(_entries[(begin + i) % entry_count]); + } +} + +rate_limiter_base::entry* rate_limiter_base::get_entry(uint32_t label, uint64_t token) noexcept { + // We need to either find the existing entry for this (label, token) combination + // or otherwise find an invalid entry which we can initialize and use. 
+ // + // We start by looking at the entry corresponding to the computed hash, + // if it's occupied by another (label, token) try other entries using + // the quadratic probing strategy. + // + // We limit ourselves to 32 attempts - if no suitable entry is found + // then we return nullptr and admit the operation unconditionally. + + // Because we use quadratic probing and entries can be deleted (lazily), + // a situation can occur where an entry A suddenly becomes inaccessible + // because another entry B which is earlier on the probe chain is deleted. + // One of the following will happen: + // + // 1. Either we will allocate a new entry over B and A becomes accessible + // again, + // 2. Or we will allocate a new entry for the same operation/partition as A + // and A will eventually expire. + // + // In the worst case, A might be a "hot" entry and be actively rate limited + // and the described situation will cause a large number of operations + // to be admitted. Fortunately, this will move the entry earlier in the + // probe chain, so this situation will happen a limited number of times (if + // any at all) for a single "hot" entry. 
+ + size_t hash = compute_hash(label, token); + + static constexpr size_t max_probes = 32; + for (size_t i = 0; i < max_probes; i++) { + // Quadratic probing - every iteration jumps further than the previous one + hash = (hash + i) % entry_count; + entry& b = _entries[hash]; + ++_metrics.probe_count; + + entry_refresh(b); + + if (entry_is_empty(b)) { + ++_metrics.allocations_on_empty; + b.token = token; + b.label = label; + b.op_count = _current_bucket; + return &b; + } else if (b.token == token && b.label == label) { + ++_metrics.successful_lookups; + return &b; + } + } + + ++_metrics.failed_allocations; + return nullptr; +} + +size_t rate_limiter_base::compute_hash(uint32_t label, uint64_t token) noexcept { + // The map key is a tuple (token, key) + salt + // The key is hashed with murmur hash for good hash quality + + static constexpr size_t key_length = sizeof(token) + sizeof(label) + sizeof(_salt); + + std::array key; + uint8_t* ptr = key.data(); + memcpy(ptr, &token, sizeof(token)); + ptr += sizeof(token); + memcpy(ptr, &label, sizeof(label)); + ptr += sizeof(label); + memcpy(ptr, &_salt, sizeof(_salt)); + + std::array out; + utils::murmur_hash::hash3_x64_128(key.data(), key_length, 0, out); + return out[0]; +} + +void rate_limiter_base::entry_refresh(rate_limiter_base::entry& b) noexcept { + uint32_t window_delta = _current_time_window - b.time_window; + + if (window_delta == 0) { + // The entry is fresh, it was allocated in this time window + return; + } + + if (window_delta < _time_window_history.size()) { + // The entry is not that old so we have to apply the effects + // of lossy counting and halving on time window switch + --_time_window_history[window_delta - 1].entries_active; + while (window_delta > 0) { + if (b.op_count > _time_window_history[window_delta - 1].lossy_counting_decrease) { + b.op_count -= _time_window_history[window_delta - 1].lossy_counting_decrease; + } else { + b.op_count = 0; + } + b.op_count /= 2; + + --window_delta; + } + } else 
{ + // The entry is very old and the op_count can be safely decreased to zero + b.op_count = 0; + } + + ++_current_entries_in_time_window; + b.time_window = _current_time_window; +} + +bool rate_limiter_base::entry_is_empty(const rate_limiter_base::entry& b) noexcept { + return b.op_count <= _current_bucket; +} + +void rate_limiter_base::register_metrics() { + namespace sm = seastar::metrics; + + _metric_group.add_group("per_partition_rate_limiter", { + // TODO: Most of the following metrics are pretty low-level and not useful for users, + // perhaps they should be hidden behind a configuration flag + + sm::make_counter("allocations", _metrics.allocations_on_empty, + sm::description("Number of times a entry was allocated over an empty/expired entry.")), + + sm::make_counter("successful_lookups", _metrics.successful_lookups, + sm::description("Number of times a lookup returned an already allocated entry.")), + + sm::make_counter("failed_allocations", _metrics.failed_allocations, + sm::description("Number of times the rate limiter gave up trying to allocate.")), + + sm::make_counter("probe_count", _metrics.probe_count, + sm::description("Number of probes made during lookups.")), + + sm::make_gauge("load_factor", [&] { + uint32_t occupied_entry_count = _current_entries_in_time_window; + for (const auto& twe : _time_window_history) { + occupied_entry_count += twe.entries_active; + } + return double(occupied_entry_count) / double(entry_count); + }, + sm::description("Current load factor of the hash table (upper bound, may be overestimated).")), + }); +} + +rate_limiter_base::rate_limiter_base() + : _salt(std::random_device{}()) + , _entries(entry_count) + , _time_window_history(op_count_bits - 1) { + + register_metrics(); +} + +uint64_t rate_limiter_base::increase_and_get_counter(label& l, uint64_t token) noexcept { + // Assign a label if not done yet + if (l._label == 0) { + l._label = _next_label++; + } + + entry* b = get_entry(l._label, token); + if (!b) { + // We 
failed to allocate a entry for this partition. This means that + // we won't track hit count for this partition during this time window. + // Assume that it's OK to admit the operation. + return 0; + } + + // Protect from wrap-around + b->op_count = std::min((1 << op_count_bits) - 1, b->op_count + 1); + ++_current_ops_in_bucket; + if (_current_ops_in_bucket >= bucket_size) { + // Every `bucket_size` operations, virtually decrement all entries + // by one. We implement it by always subtracting the `_current_bucket` + // when comparing the count in the entry with the limit. + ++_current_bucket; + _current_ops_in_bucket -= bucket_size; + } + + return b->op_count - _current_bucket; +} + + +rate_limiter_base::can_proceed rate_limiter_base::account_operation( + label& l, uint64_t token, uint64_t limit, + const db::per_partition_rate_limit::info& rate_limit_info) noexcept { + + if (std::holds_alternative(rate_limit_info)) { + // Rate limiting turned off + return can_proceed::yes; + } + + const uint64_t count = increase_and_get_counter(l, token); + + if (auto* info = std::get_if(&rate_limit_info)) { + // On each time window change we halve the entry counts, therefore + // a partition with X ops/s will stabilize at 2X hits at the end + // of each time window. + if (count <= 2 * limit) { + return can_proceed::yes; + } else { + // As mentioned before, assuming a fixed operation rate, the operation + // count in a entry will oscillate between X at the beginning of the + // time window and 2X at the end. In order to only accept `limit` + // operations within a time window, we need to reject with probability + // P_c(x), where P_c(x) is a function such that, integrated over [X, 2X] + // will be equal to `limit`. `P_c(x) = limit / (x * ln 2)` satisfies + // this criterion. + // + // All replicas get the same value for the random variable X, with an + // expectation that all replicas' counters oscillate between the same + // values. 
Because of that, most of the time replicas will agree + // and either all accept or reject. + if (info->get_random_variable_as_double() * double(count) * std::numbers::ln2 < double(limit)) { + return can_proceed::yes; + } else { + return can_proceed::no; + } + } + } else { + return can_proceed::yes; + } +} + +template class generic_rate_limiter; + +} diff --git a/db/rate_limiter.hh b/db/rate_limiter.hh new file mode 100644 index 0000000000..9e891db025 --- /dev/null +++ b/db/rate_limiter.hh @@ -0,0 +1,178 @@ +/* + * Copyright (C) 2022-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "utils/chunked_vector.hh" +#include "db/per_partition_rate_limit_info.hh" + +// A data structure used to implement per-partition rate limiting. It accounts +// operations and enforces limits when it is detected that the operation rate +// is too high. + +namespace db { + +class rate_limiter_base { +public: + static constexpr size_t op_count_bits = 20; + static constexpr size_t time_window_bits = 12; + +private: + struct metrics { + uint64_t allocations_on_empty = 0; + uint64_t successful_lookups = 0; + uint64_t failed_allocations = 0; + uint64_t probe_count = 0; + }; + + // Represents a piece of the hashmap storage. + struct entry { + public: + // The partition key token of the operation which allocated this entry. + uint64_t token = 0; + + // The label of the operation which allocated this entry. + // Labels are used to differentiate operations which should be counted + // separately, e.g. reads and writes to the same table or writes + // to two different tables. + uint32_t label = 0; + + // The number of operations counted for given token/label. + // It is virtually decremented on each window change, so the real + // operation count is actually `op_count - _current_bucket`. 
+ // If the number drops to zero or below, the entry is considered + // "expired" and may be overwritten by another operation. + uint32_t op_count : op_count_bits = 0; + + // ID of the time window in which the entry was allocated. + uint32_t time_window : time_window_bits = 0; + }; + + struct time_window_entry { + // How many entries are there active within this time window? + uint32_t entries_active = 0; + + // By how much should the counter should be decreased within + // this time window? + uint32_t lossy_counting_decrease = 0; + }; + +public: + struct can_proceed_tag{}; + using can_proceed = seastar::bool_class; + + // Identifies a type of operation which is counted separately from other + // operations. For example, reads and writes for given table should have + // separate labels. + struct label { + private: + // The current ID used to identify the label in the rate limiter. + // It is assigned on first use. + uint32_t _label = 0; + + friend class rate_limiter_base; + }; + +private: + uint32_t _current_bucket = 0; + uint32_t _current_ops_in_bucket = 0; + uint32_t _current_entries_in_time_window = 0; + + uint32_t _next_label = 1; + uint32_t _current_time_window = 0; + + const uint32_t _salt; + + utils::chunked_vector _entries; + std::vector _time_window_history; + + metrics _metrics; + seastar::metrics::metric_groups _metric_group; + +private: + entry* get_entry(uint32_t label, uint64_t token) noexcept; + size_t compute_hash(uint32_t label, uint64_t token) noexcept; + + void entry_refresh(entry& b) noexcept; + bool entry_is_empty(const entry& b) noexcept; + + void register_metrics(); + +protected: + void on_timer() noexcept; + +public: + rate_limiter_base(); + + rate_limiter_base(const rate_limiter_base&) = delete; + rate_limiter_base(rate_limiter_base&&) = delete; + + rate_limiter_base& operator=(const rate_limiter_base&) = delete; + rate_limiter_base& operator=(rate_limiter_base&&) = delete; + + // (For testing purposes only) + // Increments the counter for 
given (label, token) and returns + // the new value of the counter. + uint64_t increase_and_get_counter(label& l, uint64_t token) noexcept; + + // Increments the counter for given (label, token). + // If the counter indicates that the partition is over the limit, + // returns can_proceed::no with some probability. + // + // The `random_variable` parameter should be a value from range [0, 1). + // It is used as the source of randomness - the function chooses a threshold + // and accepts if and only if `random_variable` is below it. + // + // The probability is calculated in such a way that statistically + // only `limit` operations per second are admitted. + can_proceed account_operation(label& l, uint64_t token, uint64_t limit, + const db::per_partition_rate_limit::info& rate_limit_info) noexcept; +}; + +template +class generic_rate_limiter : public rate_limiter_base { +private: + seastar::timer _timer; + +public: + generic_rate_limiter() + : rate_limiter_base() { + + // Rate limiting is more accurate when the rate limiter timers + // on all nodes are synchronized. Assume that the nodes' clocks + // are synchronized and schedule the first tick on the beginning + // of the closest second. 
+ + const auto period = std::chrono::seconds(1); + const auto now = std::chrono::system_clock::now(); + const auto initial_delay = period - now.time_since_epoch() % period; + + _timer.set_callback([this] { on_timer(); }); + _timer.arm(ClockType::now() + initial_delay, period); + } +}; + +extern template class generic_rate_limiter; +using rate_limiter = generic_rate_limiter; + +} diff --git a/db/system_distributed_keyspace.cc b/db/system_distributed_keyspace.cc index eda4d0cb23..c1ebf38dd3 100644 --- a/db/system_distributed_keyspace.cc +++ b/db/system_distributed_keyspace.cc @@ -565,6 +565,7 @@ system_distributed_keyspace::insert_cdc_generation( db::timeout_clock::now() + 60s, nullptr, // trace_state empty_service_permit(), + db::allow_per_partition_rate_limit::no, false // raw_counters ); }); @@ -661,6 +662,7 @@ system_distributed_keyspace::create_cdc_desc( db::timeout_clock::now() + 30s, nullptr, // trace_state empty_service_permit(), + db::allow_per_partition_rate_limit::no, false // raw_counters ); }); @@ -704,6 +706,7 @@ system_distributed_keyspace::cdc_desc_exists( db::timeout_clock::now() + 10s, nullptr, // trace_state empty_service_permit(), + db::allow_per_partition_rate_limit::no, false // raw_counters ); diff --git a/db/view/view.cc b/db/view/view.cc index 5be551cd4c..5b59f0afcd 100644 --- a/db/view/view.cc +++ b/db/view/view.cc @@ -2253,7 +2253,7 @@ void delete_ghost_rows_visitor::accept_new_row(const clustering_key& ck, const q auto& row = m.partition().clustered_row(*_view, ck); row.apply(tombstone(api::new_timestamp(), gc_clock::now())); timeout = db::timeout_clock::now() + _timeout_duration; - _proxy.mutate({m}, db::consistency_level::ALL, timeout, _state.get_trace_state(), empty_service_permit()).get(); + _proxy.mutate({m}, db::consistency_level::ALL, timeout, _state.get_trace_state(), empty_service_permit(), db::allow_per_partition_rate_limit::no).get(); } } diff --git a/docs/design-notes/cql-extensions.md b/docs/design-notes/cql-extensions.md 
index 4339e49f74..1dab7b06f3 100644 --- a/docs/design-notes/cql-extensions.md +++ b/docs/design-notes/cql-extensions.md @@ -140,3 +140,43 @@ Subscripting a list in a WHERE clause is supported as are maps. ```cql WHERE some_list[:index] = :value ``` + +## Per-partition rate limit + +The `per_partition_rate_limit` option can be used to limit the allowed +rate of requests to each partition in a given table. When the cluster detects +that the rate of requests exceeds the configured limit, the cluster will start +rejecting some of them in order to bring the throughput back to the configured +limit. Rejected requests are less costly which can help reduce overload. + +_NOTE_: Due to Scylla's distributed nature, tracking per-partition request rates +is not perfect and the actual rate of accepted requests may be higher up to +a factor of keyspace's `RF`. This feature should not be used to enforce precise +limits but rather serve as an overload protection feature. + +_NOTE_: This feature works best when shard-aware drivers are used (rejected +requests have the least cost). + +Limits are configured separately for reads and writes. Some examples: + +```cql + ALTER TABLE t WITH per_partition_rate_limit = { + 'max_reads_per_second': 100, + 'max_writes_per_second': 200 + }; +``` + +Limit reads only, no limit for writes: +```cql + ALTER TABLE t WITH per_partition_rate_limit = { + 'max_reads_per_second': 200 + }; +``` + +Rejected requests receive the scylla-specific "Rate limit exceeded" error. +If the driver doesn't support it, `Config_error` will be sent instead. 
+ +For more details, see: + +- Detailed [`design notes`](./per-partition-rate-limit.md) +- Description of the [rate limit exceeded](./protocol-extensions.md#rate-limit-error) error diff --git a/docs/design-notes/per-partition-rate-limit.md b/docs/design-notes/per-partition-rate-limit.md new file mode 100644 index 0000000000..5f592af898 --- /dev/null +++ b/docs/design-notes/per-partition-rate-limit.md @@ -0,0 +1,153 @@ +# Per-partition rate limiting + +Scylla clusters operate best when the data is spread across a large number +of small partitions, and reads/writes are spread uniformly across all shards +and nodes. Due to various reasons (bugs, malicious end users etc.) this +assumption may suddenly not hold anymore and one partition may start getting +a disproportionate number of requests. In turn, this usually leads to the owning +shards being overloaded - a scenario called "hot partition" - and the total +cluster latency becoming worse. + +The _per partition rate limit_ feature allows users to limit the rate +of accepted requests on a per-partition basis. When a partition exceeds +the configured limit of operations of given type (reads/writes) per second, +the cluster will start responding with errors to some of the operations for that +partition so that, statistically, the rate of accepted requests is kept +at the configured limit. Rejected operations use less resources, therefore +this feature can help in the "hot partition" situation. + +_NOTE_: this is an overload protection mechanism and may not be used to reliably +enforce limits in some situations. Due to Scylla's distributed nature, +the actual number of accepted requests depends on the cluster and driver +configuration and may be larger by a factor of RF (keyspace's replication +factor). It is recommended to set the limit to a value an order of magnitude +larger than the maximum expected per-partition throughput. See the +[Inaccurracies](#inaccurracies) section for more information. 
+ + +## Usage + +### Server-side configuration + +Per-partition limits are set separately for reads and writes, on a per-table +basis. Limits can be set with the `per_partition_rate_limit` extension when +CREATE'ing or ALTER'ing a table using a schema extension: + +```cql +ALTER TABLE ks.tbl WITH per_partition_rate_limit = { + 'max_reads_per_second': 123, + 'max_writes_per_second': 456 +}; +``` + +Both `max_reads_per_second` and `max_writes_per_second` are optional - omitting +one of them means "no limit" for that type of operation. + +### Driver response + +Rejected operations are reported as an ERROR response to the driver. +If the driver supports it, the response contains a scylla-specific error code +indicating that the operation was rejected. For more details about the error +code, see the [Rate limit error](./protocol-extensions.md#Rate%20limit%20error) +section in the `protocol-extensions.md` doc. + +If the driver doesn't support the new error code, the `Config_error` code +is returned instead. The code was chosen in order for the retry policies +of the drivers not to retry the requests and instead propagate them directly +to the users. + +## How it works + +Accounting related to tracking per-partition limits is done by replicas. +Each replica keeps a map of counters which are identified by a combination +of (token, table, operation type). When the replica accounts an operation, +it increments the relevant counter. All counters are halved every second. + +Depending on whether the coordinator is a replica or not, the flow is +a bit different. Here, "coordinator == replica" requirement also means +that the operation is handled on the correct shard. + +Only reads and writes explicitly issued by the user are counted to the limit. +Read repair, hints, batch replay, CDC preimage query and internal system queries +are _not_ counted to the limit. + +Paxos and counters are not covered in the current implementation. 
+ +### Coordinator is not a replica + +Coordinator generates a random number from range `[0, 1)` with uniform +distribution and sends it to replicas along with the operation request. +Each replica accounts the operation and then calculates a rejection threshold +based on the local counter value. If the number received from the coordinator +is above the threshold, the operation is rejected. + +The assumption is that all replicas will converge to similar counter values. +Most of the time they will agree on the decision and not much work +will be wasted due to some replicas accepting and other rejecting. + +### Coordinator is a replica + +As before, the coordinator generates a random number. However, it does not +send requests to replicas immediately but rather calculates local rejection +threshold. If the number is above threshold, the whole operation is skipped +and the operation is only accounted on the coordinator. Otherwise, coordinator +proceeds with sending the requests, and replicas are told only to account +the operation but never reject it. + +This strategy leads to no wasted replica work. However, when the coordinator +rejects the operation other replicas do not account it, so it may lead to +a bit more requests being accepted (but still not more than `RF * limit`). + +### How to calculate rejection threshold + +Let's assume the simplest case where there is only one replica. It will +increment its counter on every operation. Because all counters are halved +every second, assuming the rate of `V` ops/s the counter will eventually +oscillate between `V` and `2V`. 
If the limit is `L` ops/s, then we would +like to admit only `L` operations within each second - therefore the probability +should satisfy the following: + +``` + L = Sum(i = V..2V) { P(i) } +``` + +This can be approximated with a definite integral: + +``` + L = Int(x = V..2V) { P(x) } +``` + +A solution to this integral is: + +``` + P(x) = L / (x * ln 2) +``` + +where `x` is the current value of the counter. This is the formula used +in the current implementation. + +### Inaccurracies + +In practice, RF is rarely 1 so there is more than one replica. Depending on +the type of the operation, this introduces some inaccurracies in counting. + +- Writes are counted relatively well because all live replicas participate + in a write operation, so all replicas should have an up-to-date counter + value. Because of the "coordinator is replica" case, rejected writes + will not be accounted on all replicas. In tests, the amount of accepted + operations was quite close to the limit and much less than the theoretical + `RF * limit`. +- Reads are less accurate because not all replicas may participate in a given + read operation (this depends on CL). In the worst case of CL=ONE and + round-robin strategy, up to `RF * limit` ops/s will be accepted. Higher + consistencies are counted better, e.g. CL=ALL - although they are also + susceptible to the inaccurracy introduced by "coordinator is replica" case. +- In case of non-shard-aware drivers, it is best to keep the clocks in sync. + When the coordinator is not a replica, each replica decides whether to accept + or not, based on the random number sent by coordinator. If the replicas have + their clocks in sync, then their per-partition counters should have close + values and they will agree on the decision whether to reject or not most of + the time. If not, they will disagree more frequently which will result in + wasted replica work and the effective rate limit will be lower or higher, + depending on the consistency. 
In the worst case, it might be 30% lower or + 45% higher than the real limit. diff --git a/docs/design-notes/protocol-extensions.md b/docs/design-notes/protocol-extensions.md index 2bfe5f372d..8fdfeadd79 100644 --- a/docs/design-notes/protocol-extensions.md +++ b/docs/design-notes/protocol-extensions.md @@ -146,3 +146,37 @@ parameters: the bit mask that should be used by the client to test against when checking prepared statement metadata flags to see if the current query is conditional or not. + +## Rate limit error + +This extension allows the driver to send a new type of error in case the operation +goes over the allowed per-partition rate limit. This kind of error does not fit +other existing error codes well, hence the need for the protocol extension. + +On receiving this error, the driver should not retry the request; instead, +the error should be propagated to the user so that they can decide what to do +with it - sometimes it might make sense to propagate the error, in other cases +it might make sense to retry with backoff. + +The body of the error consists of the usual error code, error message and then +the following fields: ``, where: + +- `op_type` is a byte which identifies the operation which is the origin + of the rate limit. + - 0: read + - 1: write +- `rejected_by_coordinator` is a byte which is 1 if the operation was rejected + on the coordinator and 0 if it was rejected by replicas. + +If the driver does not understand this extension and does not enable it, +the Config_error will be used instead of the new error code. + +In order to be forward compatible with error codes added in the future protocol +versions, this extension doesn't reserve a fixed error code - instead, it +advertises the integer value used as the error code in the SUPPORTED response. + +This extension is identified by the `SCYLLA_RATE_LIMIT_ERROR` key. 
+The string map in the SUPPORTED response will contain the following parameters: + + - `ERROR_CODE`: a 32-bit signed decimal integer which Scylla + will use as the error code for the rate limit exception. diff --git a/exceptions/coordinator_result.hh b/exceptions/coordinator_result.hh index 0195e08194..79e09e780f 100644 --- a/exceptions/coordinator_result.hh +++ b/exceptions/coordinator_result.hh @@ -31,7 +31,8 @@ namespace exceptions { using coordinator_exception_container = utils::exception_container< mutation_write_timeout_exception, read_timeout_exception, - read_failure_exception + read_failure_exception, + rate_limit_exception >; template diff --git a/exceptions/exceptions.cc b/exceptions/exceptions.cc index 59cbfc2671..0c6da158e2 100644 --- a/exceptions/exceptions.cc +++ b/exceptions/exceptions.cc @@ -39,7 +39,8 @@ const std::unordered_map& exception_map() { {exception_code::INVALID, "invalid"}, {exception_code::CONFIG_ERROR, "config_error"}, {exception_code::ALREADY_EXISTS, "already_exists"}, - {exception_code::UNPREPARED, "unprepared"} + {exception_code::UNPREPARED, "unprepared"}, + {exception_code::RATE_LIMIT_ERROR, "rate_limit_error"} }; return map; } @@ -77,6 +78,12 @@ overloaded_exception::overloaded_exception(size_t c) noexcept : cassandra_exception(exception_code::OVERLOADED, prepare_message("Too many in flight hints: {}", c)) {} +rate_limit_exception::rate_limit_exception(const sstring& ks, const sstring& cf, db::operation_type op_type_, bool rejected_by_coordinator_) noexcept + : cassandra_exception(exception_code::CONFIG_ERROR, prepare_message("Per-partition rate limit reached for {} in table {}.{}, rejected by {}", op_type_, ks, cf, rejected_by_coordinator_ ? 
"coordinator" : "replicas")) + , op_type(op_type_) + , rejected_by_coordinator(rejected_by_coordinator_) + { } + prepared_query_not_found_exception::prepared_query_not_found_exception(bytes id) noexcept : request_validation_exception{exception_code::UNPREPARED, prepare_message("No prepared statement with ID {} found.", id)} , id{id} diff --git a/exceptions/exceptions.hh b/exceptions/exceptions.hh index d662f81506..a62f46a642 100644 --- a/exceptions/exceptions.hh +++ b/exceptions/exceptions.hh @@ -12,6 +12,7 @@ #include "db/consistency_level_type.hh" #include "db/write_type.hh" +#include "db/operation_type.hh" #include #include #include "bytes.hh" @@ -42,7 +43,17 @@ enum class exception_code : int32_t { INVALID = 0x2200, CONFIG_ERROR = 0x2300, ALREADY_EXISTS = 0x2400, - UNPREPARED = 0x2500 + UNPREPARED = 0x2500, + + // Scylla-specific error codes + // The error codes below are advertised to the drivers during connection + // handshake using the protocol extension negotiation, and are only + // enabled if the drivers explicitly enable them. Therefore it's perfectly + // fine to change them in case some new error codes are introduced + // in Cassandra. + // NOTE TO DRIVER DEVELOPERS: These constants must not be relied upon, + // they must be learned from protocol extensions instead. 
+ RATE_LIMIT_ERROR = 0xF000 }; const std::unordered_map& exception_map(); @@ -183,6 +194,13 @@ struct overloaded_exception : public cassandra_exception { cassandra_exception(exception_code::OVERLOADED, std::move(msg)) {} }; +struct rate_limit_exception : public cassandra_exception { + db::operation_type op_type; + bool rejected_by_coordinator; + + rate_limit_exception(const sstring& ks, const sstring& cf, db::operation_type op_type_, bool rejected_by_coordinator_) noexcept; +}; + class request_validation_exception : public cassandra_exception { public: using cassandra_exception::cassandra_exception; diff --git a/gms/feature_service.hh b/gms/feature_service.hh index 9912e0afc1..5ec669ad84 100644 --- a/gms/feature_service.hh +++ b/gms/feature_service.hh @@ -108,6 +108,7 @@ public: gms::feature tombstone_gc_options { *this, "TOMBSTONE_GC_OPTIONS"sv }; gms::feature parallelized_aggregation { *this, "PARALLELIZED_AGGREGATION"sv }; gms::feature keyspace_storage_options { *this, "KEYSPACE_STORAGE_OPTIONS"sv }; + gms::feature typed_errors_in_read_rpc { *this, "TYPED_ERRORS_IN_READ_RPC"sv }; public: diff --git a/idl/per_partition_rate_limit_info.idl.hh b/idl/per_partition_rate_limit_info.idl.hh new file mode 100644 index 0000000000..ee51ceb04c --- /dev/null +++ b/idl/per_partition_rate_limit_info.idl.hh @@ -0,0 +1,23 @@ +/* + * Copyright 2022-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +namespace db { + +namespace per_partition_rate_limit { + +struct account_only {}; + +struct account_and_enforce { + uint32_t random_variable; +}; + +// using info = std::variant; + +} // namespace per_partition_rate_limit + +} // namespace db diff --git a/idl/replica_exception.idl.hh b/idl/replica_exception.idl.hh new file mode 100644 index 0000000000..650a494fe3 --- /dev/null +++ b/idl/replica_exception.idl.hh @@ -0,0 +1,25 @@ +/* + * Copyright 2022-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +namespace replica { + 
+struct unknown_exception {}; + +struct no_exception {}; + +class rate_limit_exception { +}; + +struct exception_variant { + std::variant reason; +}; + +} diff --git a/idl/storage_proxy.idl.hh b/idl/storage_proxy.idl.hh index ec3cbd0dfa..740dd97156 100644 --- a/idl/storage_proxy.idl.hh +++ b/idl/storage_proxy.idl.hh @@ -6,14 +6,14 @@ * SPDX-License-Identifier: AGPL-3.0-or-later */ -verb [[with_client_info, with_timeout, one_way]] mutation (frozen_mutation fm, inet_address_vector_replica_set forward, gms::inet_address reply_to, unsigned shard, uint64_t response_id, std::optional trace_info [[version 1.3.0]]); +verb [[with_client_info, with_timeout, one_way]] mutation (frozen_mutation fm, inet_address_vector_replica_set forward, gms::inet_address reply_to, unsigned shard, uint64_t response_id, std::optional trace_info [[version 1.3.0]], db::per_partition_rate_limit::info rate_limit_info [[version 5.1.0]]); verb [[with_client_info, one_way]] mutation_done (unsigned shard, uint64_t response_id, db::view::update_backlog backlog [[version 3.1.0]]); -verb [[with_client_info, one_way]] mutation_failed (unsigned shard, uint64_t response_id, size_t num_failed, db::view::update_backlog backlog [[version 3.1.0]]); +verb [[with_client_info, one_way]] mutation_failed (unsigned shard, uint64_t response_id, size_t num_failed, db::view::update_backlog backlog [[version 3.1.0]], replica::exception_variant exception [[version 5.1.0]]); verb [[with_client_info, with_timeout]] counter_mutation (std::vector fms, db::consistency_level cl, std::optional trace_info); verb [[with_client_info, with_timeout, one_way]] hint_mutation (frozen_mutation fm, inet_address_vector_replica_set forward, gms::inet_address reply_to, unsigned shard, uint64_t response_id, std::optional trace_info [[version 1.3.0]] /* this verb was mistakenly introduced with optional trace_info */); -verb [[with_client_info, with_timeout]] read_data (query::read_command cmd, ::compat::wrapping_partition_range pr, 
query::digest_algorithm digest [[version 3.0.0]]) -> query::result [[lw_shared_ptr]], cache_temperature [[version 2.0.0]]; -verb [[with_client_info, with_timeout]] read_mutation_data (query::read_command cmd, ::compat::wrapping_partition_range pr) -> reconcilable_result [[lw_shared_ptr]], cache_temperature [[version 2.0.0]]; -verb [[with_client_info, with_timeout]] read_digest (query::read_command cmd, ::compat::wrapping_partition_range pr, query::digest_algorithm digest [[version 3.0.0]]) -> query::result_digest, api::timestamp_type [[version 1.2.0]], cache_temperature [[version 2.0.0]]; +verb [[with_client_info, with_timeout]] read_data (query::read_command cmd, ::compat::wrapping_partition_range pr, query::digest_algorithm digest [[version 3.0.0]], db::per_partition_rate_limit::info rate_limit_info [[version 5.1.0]]) -> query::result [[lw_shared_ptr]], cache_temperature [[version 2.0.0]], replica::exception_variant [[version 5.1.0]]; +verb [[with_client_info, with_timeout]] read_mutation_data (query::read_command cmd, ::compat::wrapping_partition_range pr) -> reconcilable_result [[lw_shared_ptr]], cache_temperature [[version 2.0.0]], replica::exception_variant [[version 5.1.0]]; +verb [[with_client_info, with_timeout]] read_digest (query::read_command cmd, ::compat::wrapping_partition_range pr, query::digest_algorithm digest [[version 3.0.0]], db::per_partition_rate_limit::info rate_limit_info [[version 5.1.0]]) -> query::result_digest, api::timestamp_type [[version 1.2.0]], cache_temperature [[version 2.0.0]], replica::exception_variant [[version 5.1.0]]; verb [[with_timeout]] truncate (sstring, sstring); verb [[with_client_info, with_timeout]] paxos_prepare (query::read_command cmd, partition_key key, utils::UUID ballot, bool only_digest, query::digest_algorithm da, std::optional trace_info) -> service::paxos::prepare_response [[unique_ptr]]; verb [[with_client_info, with_timeout]] paxos_accept (service::paxos::proposal proposal [[ref]], std::optional 
trace_info) -> bool; diff --git a/main.cc b/main.cc index 950502992e..82fe253d25 100644 --- a/main.cc +++ b/main.cc @@ -86,6 +86,7 @@ #include "alternator/controller.hh" #include "alternator/ttl.hh" #include "tools/entry_point.hh" +#include "db/per_partition_rate_limit_extension.hh" #include "service/raft/raft_group_registry.hh" #include "service/raft/raft_group0_client.hh" @@ -459,6 +460,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl ext->add_schema_extension(cdc::cdc_extension::NAME); ext->add_schema_extension(db::paxos_grace_seconds_extension::NAME); ext->add_schema_extension(tombstone_gc_extension::NAME); + ext->add_schema_extension(db::per_partition_rate_limit_extension::NAME); auto cfg = make_lw_shared(ext); auto init = app.get_options_description().add_options(); diff --git a/message/messaging_service.cc b/message/messaging_service.cc index 59e2371806..d3accb8d70 100644 --- a/message/messaging_service.cc +++ b/message/messaging_service.cc @@ -42,7 +42,9 @@ #include "cache_temperature.hh" #include "raft/raft.hh" #include "service/raft/messaging.hh" +#include "replica/exceptions.hh" #include "serializer.hh" +#include "db/per_partition_rate_limit_info.hh" #include "idl/consistency_level.dist.hh" #include "idl/tracing.dist.hh" #include "idl/result.dist.hh" @@ -67,6 +69,8 @@ #include "idl/raft_storage.dist.hh" #include "idl/raft.dist.hh" #include "idl/group0.dist.hh" +#include "idl/replica_exception.dist.hh" +#include "idl/per_partition_rate_limit_info.dist.hh" #include "idl/storage_proxy.dist.hh" #include "serializer_impl.hh" #include "serialization_visitors.hh" @@ -94,6 +98,8 @@ #include "idl/raft.dist.impl.hh" #include "idl/group0.dist.impl.hh" #include "idl/view.dist.impl.hh" +#include "idl/replica_exception.dist.impl.hh" +#include "idl/per_partition_rate_limit_info.dist.impl.hh" #include "idl/storage_proxy.dist.impl.hh" #include #include diff --git a/query-request.hh b/query-request.hh index bc62331b37..767a1c7c42 
100644 --- a/query-request.hh +++ b/query-request.hh @@ -18,6 +18,7 @@ #include "tracing/tracing.hh" #include "utils/small_vector.hh" #include "query_class_config.hh" +#include "db/per_partition_rate_limit_info.hh" #include "bytes.hh" @@ -298,6 +299,7 @@ public: std::optional max_result_size; uint32_t row_limit_high_bits; api::timestamp_type read_timestamp; // not serialized + db::allow_per_partition_rate_limit allow_limit; // not serialized public: // IDL constructor read_command(utils::UUID cf_id, @@ -323,6 +325,7 @@ public: , max_result_size(max_result_size) , row_limit_high_bits(row_limit_high_bits) , read_timestamp(api::new_timestamp()) + , allow_limit(db::allow_per_partition_rate_limit::no) { } read_command(utils::UUID cf_id, @@ -335,7 +338,8 @@ public: std::optional ti = std::nullopt, utils::UUID query_uuid = utils::UUID(), query::is_first_page is_first_page = query::is_first_page::no, - api::timestamp_type rt = api::new_timestamp()) + api::timestamp_type rt = api::new_timestamp(), + db::allow_per_partition_rate_limit allow_limit = db::allow_per_partition_rate_limit::no) : cf_id(std::move(cf_id)) , schema_version(std::move(schema_version)) , slice(std::move(slice)) @@ -348,6 +352,7 @@ public: , max_result_size(max_result_size) , row_limit_high_bits(static_cast(static_cast(row_limit) >> 32)) , read_timestamp(rt) + , allow_limit(allow_limit) { } diff --git a/redis/mutation_utils.cc b/redis/mutation_utils.cc index cd3bc59492..9bac16d6ca 100644 --- a/redis/mutation_utils.cc +++ b/redis/mutation_utils.cc @@ -50,7 +50,7 @@ future<> write_hashes(service::storage_proxy& proxy, redis::redis_options& optio m.set_clustered_cell(ckey, column, std::move(cell)); auto write_consistency_level = options.get_write_consistency_level(); - return proxy.mutate(std::vector {std::move(m)}, write_consistency_level, timeout, nullptr, permit); + return proxy.mutate(std::vector {std::move(m)}, write_consistency_level, timeout, nullptr, permit, db::allow_per_partition_rate_limit::yes); 
} @@ -68,7 +68,7 @@ future<> write_strings(service::storage_proxy& proxy, redis::redis_options& opti db::timeout_clock::time_point timeout = db::timeout_clock::now() + options.get_write_timeout(); auto m = make_mutation(proxy, options, std::move(key), std::move(data), ttl); auto write_consistency_level = options.get_write_consistency_level(); - return proxy.mutate(std::vector {std::move(m)}, write_consistency_level, timeout, nullptr, permit); + return proxy.mutate(std::vector {std::move(m)}, write_consistency_level, timeout, nullptr, permit, db::allow_per_partition_rate_limit::yes); } @@ -87,7 +87,7 @@ future<> delete_objects(service::storage_proxy& proxy, redis::redis_options& opt auto remove = [&proxy, timeout, write_consistency_level, permit, &options, keys = std::move(keys)] (const sstring& cf_name) { return parallel_for_each(keys.begin(), keys.end(), [&proxy, timeout, write_consistency_level, &options, permit, cf_name] (const bytes& key) { auto m = make_tombstone(proxy, options, cf_name, key); - return proxy.mutate(std::vector {std::move(m)}, write_consistency_level, timeout, nullptr, permit); + return proxy.mutate(std::vector {std::move(m)}, write_consistency_level, timeout, nullptr, permit, db::allow_per_partition_rate_limit::yes); }); }; return parallel_for_each(tables.begin(), tables.end(), remove); @@ -107,7 +107,7 @@ future<> delete_fields(service::storage_proxy& proxy, redis::redis_options& opti m.partition().apply_delete(*schema, ckey, tombstone { ts, clk }); mutations.push_back(m); } - return proxy.mutate(mutations, write_consistency_level, timeout, nullptr, permit); + return proxy.mutate(mutations, write_consistency_level, timeout, nullptr, permit, db::allow_per_partition_rate_limit::yes); } } diff --git a/redis/query_utils.cc b/redis/query_utils.cc index 222f3764a8..c6c2ee18a3 100644 --- a/redis/query_utils.cc +++ b/redis/query_utils.cc @@ -8,6 +8,7 @@ #include "redis/query_utils.hh" +#include "db/per_partition_rate_limit_info.hh" #include 
"redis/options.hh" #include "timeout_config.hh" #include "service/client_state.hh" diff --git a/replica/database.cc b/replica/database.cc index be58bb3cea..1c66faf172 100644 --- a/replica/database.cc +++ b/replica/database.cc @@ -7,6 +7,7 @@ */ #include "log.hh" +#include "replica/database_fwd.hh" #include "utils/lister.hh" #include "replica/database.hh" #include @@ -42,6 +43,7 @@ #include "gms/feature_service.hh" #include "timeout_config.hh" #include "service/storage_proxy.hh" +#include "db/operation_type.hh" #include "utils/human_readable.hh" #include "utils/fb_utilities.hh" @@ -62,6 +64,7 @@ #include "tombstone_gc.hh" #include "replica/data_dictionary_impl.hh" +#include "replica/exceptions.hh" #include "readers/multi_range.hh" #include "readers/multishard.hh" @@ -540,6 +543,9 @@ database::setup_metrics() { sm::make_counter("total_writes_timedout", _stats->total_writes_timedout, sm::description("Counts write operations failed due to a timeout. A positive value is a sign of storage being overloaded.")), + sm::make_counter("total_writes_rate_limited", _stats->total_writes_rate_limited, + sm::description("Counts write operations which were rejected on the replica side because the per-partition limit was reached.")), + sm::make_counter("total_reads", _read_concurrency_sem.get_stats().total_successful_reads, sm::description("Counts the total number of successful user reads on this shard."), {user_label_instance}), @@ -558,6 +564,9 @@ database::setup_metrics() { "Add the total_reads to this value to get the total amount of reads issued on this shard."), {system_label_instance}), + sm::make_counter("total_reads_rate_limited", _stats->total_reads_rate_limited, + sm::description("Counts read operations which were rejected on the replica side because the per-partition limit was reached.")), + sm::make_current_bytes("view_update_backlog", [this] { return get_view_update_backlog().current; }, sm::description("Holds the current size in bytes of the pending view updates for 
all tables")), @@ -1283,15 +1292,103 @@ database::existing_index_names(const sstring& ks_name, const sstring& cf_to_excl return names; } +namespace { + +enum class request_class { + user, + system, + maintenance, +}; + +request_class classify_request(const database_config& _dbcfg) { + const auto current_group = current_scheduling_group(); + + // Everything running in the statement group is considered a user request + if (current_group == _dbcfg.statement_scheduling_group) { + return request_class::user; + // System requests run in the default (main) scheduling group + // All requests executed on behalf of internal work also uses the system semaphore + } else if (current_group == default_scheduling_group() + || current_group == _dbcfg.compaction_scheduling_group + || current_group == _dbcfg.gossip_scheduling_group + || current_group == _dbcfg.memory_compaction_scheduling_group + || current_group == _dbcfg.memtable_scheduling_group + || current_group == _dbcfg.memtable_to_cache_scheduling_group) { + return request_class::system; + // Requests done on behalf of view update generation run in the streaming group + } else if (current_scheduling_group() == _dbcfg.streaming_scheduling_group) { + return request_class::maintenance; + // Everything else is considered a user request + } else { + return request_class::user; + } +} + +} // anonymous namespace + +static bool can_apply_per_partition_rate_limit(const schema& s, const database_config& dbcfg, db::operation_type op_type) { + return s.per_partition_rate_limit_options().get_max_ops_per_second(op_type).has_value() + && classify_request(dbcfg) == request_class::user; +} + +bool database::can_apply_per_partition_rate_limit(const schema& s, db::operation_type op_type) const { + return replica::can_apply_per_partition_rate_limit(s, _dbcfg, op_type); +} + +std::optional database::account_coordinator_operation_to_rate_limit(table& tbl, const dht::token& token, + db::per_partition_rate_limit::account_and_enforce 
account_and_enforce_info, + db::operation_type op_type) { + + std::optional table_limit = tbl.schema()->per_partition_rate_limit_options().get_max_ops_per_second(op_type); + db::rate_limiter::label& lbl = tbl.get_rate_limiter_label_for_op_type(op_type); + return _rate_limiter.account_operation(lbl, dht::token::to_int64(token), *table_limit, account_and_enforce_info); +} + +static db::rate_limiter::can_proceed account_singular_ranges_to_rate_limit( + db::rate_limiter& limiter, column_family& cf, + const dht::partition_range_vector& ranges, + const database_config& dbcfg, + db::per_partition_rate_limit::info rate_limit_info) { + using can_proceed = db::rate_limiter::can_proceed; + + if (std::holds_alternative(rate_limit_info) || !can_apply_per_partition_rate_limit(*cf.schema(), dbcfg, db::operation_type::read)) { + // Rate limiting is disabled for this query + return can_proceed::yes; + } + + auto table_limit = *cf.schema()->per_partition_rate_limit_options().get_max_reads_per_second(); + can_proceed ret = can_proceed::yes; + + auto& read_label = cf.get_rate_limiter_label_for_reads(); + for (const auto& range : ranges) { + if (!range.is_singular()) { + continue; + } + auto token = dht::token::to_int64(ranges.front().start()->value().token()); + if (limiter.account_operation(read_label, token, table_limit, rate_limit_info) == db::rate_limiter::can_proceed::no) { + // Don't return immediately - account all ranges first + ret = can_proceed::no; + } + } + + return ret; +} + future, cache_temperature>> database::query(schema_ptr s, const query::read_command& cmd, query::result_options opts, const dht::partition_range_vector& ranges, - tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) { + tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout, db::per_partition_rate_limit::info rate_limit_info) { const auto reversed = cmd.slice.is_reversed(); if (reversed) { s = s->make_reversed(); } column_family& cf = 
find_column_family(cmd.cf_id); + + if (account_singular_ranges_to_rate_limit(_rate_limiter, cf, ranges, _dbcfg, rate_limit_info) == db::rate_limiter::can_proceed::no) { + ++_stats->total_reads_rate_limited; + co_await coroutine::return_exception(replica::rate_limit_exception()); + } + auto& semaphore = get_reader_concurrency_semaphore(); auto max_result_size = cmd.max_result_size ? *cmd.max_result_size : get_unlimited_query_max_result_size(); @@ -1404,56 +1501,22 @@ database::query_mutations(schema_ptr s, const query::read_command& cmd, const dh co_return std::tuple(std::move(result), hit_rate); } -namespace { - -enum class query_class { - user, - system, - maintenance, -}; - -query_class classify_query(const database_config& _dbcfg) { - const auto current_group = current_scheduling_group(); - - // Everything running in the statement group is considered a user query - if (current_group == _dbcfg.statement_scheduling_group) { - return query_class::user; - // System queries run in the default (main) scheduling group - // All queries executed on behalf of internal work also uses the system semaphore - } else if (current_group == default_scheduling_group() - || current_group == _dbcfg.compaction_scheduling_group - || current_group == _dbcfg.gossip_scheduling_group - || current_group == _dbcfg.memory_compaction_scheduling_group - || current_group == _dbcfg.memtable_scheduling_group - || current_group == _dbcfg.memtable_to_cache_scheduling_group) { - return query_class::system; - // Reads done on behalf of view update generation run in the streaming group - } else if (current_scheduling_group() == _dbcfg.streaming_scheduling_group) { - return query_class::maintenance; - // Everything else is considered a user query - } else { - return query_class::user; - } -} - -} // anonymous namespace - query::max_result_size database::get_unlimited_query_max_result_size() const { - switch (classify_query(_dbcfg)) { - case query_class::user: + switch (classify_request(_dbcfg)) { + 
case request_class::user: return query::max_result_size(_cfg.max_memory_for_unlimited_query_soft_limit(), _cfg.max_memory_for_unlimited_query_hard_limit()); - case query_class::system: [[fallthrough]]; - case query_class::maintenance: + case request_class::system: [[fallthrough]]; + case request_class::maintenance: return query::max_result_size(query::result_memory_limiter::unlimited_result_size); } std::abort(); } reader_concurrency_semaphore& database::get_reader_concurrency_semaphore() { - switch (classify_query(_dbcfg)) { - case query_class::user: return _read_concurrency_sem; - case query_class::system: return _system_read_concurrency_sem; - case query_class::maintenance: return _streaming_concurrency_sem; + switch (classify_request(_dbcfg)) { + case request_class::user: return _read_concurrency_sem; + case request_class::system: return _system_read_concurrency_sem; + case request_class::maintenance: return _streaming_concurrency_sem; } std::abort(); } @@ -1769,13 +1832,22 @@ future<> database::apply_with_commitlog(column_family& cf, const mutation& m, db } } -future<> database::do_apply(schema_ptr s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::timeout_clock::time_point timeout, db::commitlog::force_sync sync) { +future<> database::do_apply(schema_ptr s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::timeout_clock::time_point timeout, db::commitlog::force_sync sync, db::per_partition_rate_limit::info rate_limit_info) { // I'm doing a nullcheck here since the init code path for db etc // is a little in flux and commitlog is created only when db is // initied from datadir. 
auto uuid = m.column_family_id(); auto& cf = find_column_family(uuid); + if (!std::holds_alternative(rate_limit_info) && can_apply_per_partition_rate_limit(*s, db::operation_type::write)) { + auto table_limit = *s->per_partition_rate_limit_options().get_max_writes_per_second(); + auto& write_label = cf.get_rate_limiter_label_for_writes(); + auto token = dht::token::to_int64(dht::get_token(*s, m.key())); + if (_rate_limiter.account_operation(write_label, token, table_limit, rate_limit_info) == db::rate_limiter::can_proceed::no) { + co_await coroutine::return_exception(replica::rate_limit_exception()); + } + } + sync = sync || db::commitlog::force_sync(s->wait_for_sync_to_commitlog()); // Signal to view building code that a write is in progress, @@ -1819,6 +1891,12 @@ Future database::update_write_metrics(Future&& f) { if (is_timeout_exception(ep)) { ++s->total_writes_timedout; } + try { + std::rethrow_exception(ep); + } catch (replica::rate_limit_exception&) { + ++s->total_writes_rate_limited; + } catch (...) 
{ + } return futurize::make_exception_future(std::move(ep)); } ++s->total_writes; @@ -1832,7 +1910,7 @@ void database::update_write_metrics_for_timed_out_write() { ++_stats->total_writes_timedout; } -future<> database::apply(schema_ptr s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, db::timeout_clock::time_point timeout) { +future<> database::apply(schema_ptr s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, db::timeout_clock::time_point timeout, db::per_partition_rate_limit::info rate_limit_info) { if (dblog.is_enabled(logging::log_level::trace)) { dblog.trace("apply {}", m.pretty_printer(s)); } @@ -1843,7 +1921,7 @@ future<> database::apply(schema_ptr s, const frozen_mutation& m, tracing::trace_ if (!s->is_synced()) { on_internal_error(dblog, format("attempted to apply mutation using not synced schema of {}.{}, version={}", s->ks_name(), s->cf_name(), s->version())); } - return update_write_metrics(_apply_stage(this, std::move(s), seastar::cref(m), std::move(tr_state), timeout, sync)); + return update_write_metrics(_apply_stage(this, std::move(s), seastar::cref(m), std::move(tr_state), timeout, sync, rate_limit_info)); } future<> database::apply_hint(schema_ptr s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::timeout_clock::time_point timeout) { @@ -1854,7 +1932,7 @@ future<> database::apply_hint(schema_ptr s, const frozen_mutation& m, tracing::t on_internal_error(dblog, format("attempted to apply hint using not synced schema of {}.{}, version={}", s->ks_name(), s->cf_name(), s->version())); } return with_scheduling_group(_dbcfg.streaming_scheduling_group, [this, s = std::move(s), &m, tr_state = std::move(tr_state), timeout] () mutable { - return update_write_metrics(_apply_stage(this, std::move(s), seastar::cref(m), std::move(tr_state), timeout, db::commitlog::force_sync::no)); + return update_write_metrics(_apply_stage(this, std::move(s), 
seastar::cref(m), std::move(tr_state), timeout, db::commitlog::force_sync::no, std::monostate{})); }); } @@ -1933,6 +2011,14 @@ std::ostream& operator<<(std::ostream& os, db::consistency_level cl) { } } +std::ostream& operator<<(std::ostream& os, operation_type op_type) { + switch (op_type) { + case operation_type::read: return os << "read"; + case operation_type::write: return os << "write"; + } + abort(); +} + } std::ostream& diff --git a/replica/database.hh b/replica/database.hh index 564eb298b2..f642ee2c61 100644 --- a/replica/database.hh +++ b/replica/database.hh @@ -66,6 +66,9 @@ #include "absl-flat_hash_map.hh" #include "utils/cross-shard-barrier.hh" #include "sstables/generation_type.hh" +#include "db/rate_limiter.hh" +#include "db/per_partition_rate_limit_info.hh" +#include "db/operation_type.hh" class cell_locker; class cell_locker_stats; @@ -452,6 +455,11 @@ private: std::vector _views; std::unique_ptr _counter_cell_locks; // Memory-intensive; allocate only when needed. + + // Labels used to identify writes and reads for this table in the rate_limiter structure. 
+ db::rate_limiter::label _rate_limiter_label_for_writes; + db::rate_limiter::label _rate_limiter_label_for_reads; + void set_metrics(); seastar::metrics::metric_groups _metrics; @@ -744,6 +752,23 @@ public: return _cache; } + db::rate_limiter::label& get_rate_limiter_label_for_op_type(db::operation_type op_type) { + switch (op_type) { + case db::operation_type::write: + return _rate_limiter_label_for_writes; + case db::operation_type::read: + return _rate_limiter_label_for_reads; + } + } + + db::rate_limiter::label& get_rate_limiter_label_for_writes() { + return _rate_limiter_label_for_writes; + } + + db::rate_limiter::label& get_rate_limiter_label_for_reads() { + return _rate_limiter_label_for_reads; + } + future> lock_counter_cells(const mutation& m, db::timeout_clock::time_point timeout); logalloc::occupancy_stats occupancy() const; @@ -1252,8 +1277,10 @@ private: uint64_t total_writes = 0; uint64_t total_writes_failed = 0; uint64_t total_writes_timedout = 0; + uint64_t total_writes_rate_limited = 0; uint64_t total_reads = 0; uint64_t total_reads_failed = 0; + uint64_t total_reads_rate_limited = 0; uint64_t short_data_queries = 0; uint64_t short_mutation_queries = 0; @@ -1294,7 +1321,8 @@ private: const frozen_mutation&, tracing::trace_state_ptr, db::timeout_clock::time_point, - db::commitlog_force_sync> _apply_stage; + db::commitlog_force_sync, + db::per_partition_rate_limit::info> _apply_stage; flat_hash_map _keyspaces; std::unordered_map> _column_families; @@ -1332,6 +1360,8 @@ private: std::unique_ptr _wasm_engine; utils::cross_shard_barrier _stop_barrier; + db::rate_limiter _rate_limiter; + public: data_dictionary::database as_data_dictionary() const; std::shared_ptr as_user_types_storage() const noexcept; @@ -1361,7 +1391,7 @@ private: void setup_metrics(); void setup_scylla_memory_diagnostics_producer(); - future<> do_apply(schema_ptr, const frozen_mutation&, tracing::trace_state_ptr tr_state, db::timeout_clock::time_point timeout, 
db::commitlog_force_sync sync); + future<> do_apply(schema_ptr, const frozen_mutation&, tracing::trace_state_ptr tr_state, db::timeout_clock::time_point timeout, db::commitlog_force_sync sync, db::per_partition_rate_limit::info rate_limit_info); future<> apply_with_commitlog(column_family& cf, const mutation& m, db::timeout_clock::time_point timeout); future do_apply_counter_update(column_family& cf, const frozen_mutation& fm, schema_ptr m_schema, db::timeout_clock::time_point timeout, @@ -1485,14 +1515,28 @@ public: future<> stop(); future<> close_tables(table_kind kind_to_close); + /// Checks whether per-partition rate limit can be applied to the operation or not. + bool can_apply_per_partition_rate_limit(const schema& s, db::operation_type op_type) const; + + /// Tries to account given operation to the rate limit when the coordinator is a replica. + /// This function can be called ONLY when rate limiting can be applied to the operation (see `can_apply_per_partition_rate_limit`) + /// AND the current node/shard is a replica for the given operation. 
+ /// + /// nullopt -> the decision should be delegated to replicas + /// can_proceed::no -> operation should be rejected + /// can_proceed::yes -> operation should be accepted + std::optional account_coordinator_operation_to_rate_limit(table& tbl, const dht::token& token, + db::per_partition_rate_limit::account_and_enforce account_and_enforce_info, + db::operation_type op_type); + future, cache_temperature>> query(schema_ptr, const query::read_command& cmd, query::result_options opts, const dht::partition_range_vector& ranges, tracing::trace_state_ptr trace_state, - db::timeout_clock::time_point timeout); + db::timeout_clock::time_point timeout, db::per_partition_rate_limit::info rate_limit_info = std::monostate{}); future> query_mutations(schema_ptr, const query::read_command& cmd, const dht::partition_range& range, tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout); // Apply the mutation atomically. // Throws timed_out_error when timeout is reached. - future<> apply(schema_ptr, const frozen_mutation&, tracing::trace_state_ptr tr_state, db::commitlog_force_sync sync, db::timeout_clock::time_point timeout); + future<> apply(schema_ptr, const frozen_mutation&, tracing::trace_state_ptr tr_state, db::commitlog_force_sync sync, db::timeout_clock::time_point timeout, db::per_partition_rate_limit::info rate_limit_info = std::monostate{}); future<> apply_hint(schema_ptr, const frozen_mutation&, tracing::trace_state_ptr tr_state, db::timeout_clock::time_point timeout); future apply_counter_update(schema_ptr, const frozen_mutation& m, db::timeout_clock::time_point timeout, tracing::trace_state_ptr trace_state); keyspace::config make_keyspace_config(const keyspace_metadata& ksm); diff --git a/replica/exceptions.cc b/replica/exceptions.cc new file mode 100644 index 0000000000..4c2ed82aec --- /dev/null +++ b/replica/exceptions.cc @@ -0,0 +1,40 @@ +/* + * Copyright 2022-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + 
+#include +#include +#include +#include + +#include "replica/exceptions.hh" +#include "utils/exceptions.hh" + + +namespace replica { + +exception_variant try_encode_replica_exception(std::exception_ptr eptr) { + try { + std::rethrow_exception(std::move(eptr)); + } catch (rate_limit_exception&) { + return rate_limit_exception(); + } catch (...) { + return no_exception{}; + } +} + +std::exception_ptr exception_variant::into_exception_ptr() noexcept { + return std::visit([] (Ex&& ex) { + if constexpr (std::is_same_v) { + return std::make_exception_ptr(std::runtime_error("unknown exception")); + } else { + return std::make_exception_ptr(std::move(ex)); + } + }, std::move(reason)); +} + +} diff --git a/replica/exceptions.hh b/replica/exceptions.hh new file mode 100644 index 0000000000..1b3caf01ac --- /dev/null +++ b/replica/exceptions.hh @@ -0,0 +1,73 @@ +/* + * Copyright 2022-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#pragma once + +#include +#include +#include +#include + +#include "seastar/core/sstring.hh" +#include "seastar/core/timed_out_error.hh" + +#include "utils/exception_container.hh" +#include "utils/result.hh" + +namespace replica { + +// A marker indicating that the exception_variant holds an unknown exception. +// For example, replica sends a new type of error and coordinator does not +// understand it because it wasn't upgraded to a newer version yet. +struct unknown_exception {}; + +// A marker indicating that the exception variant doesn't hold any exception. 
+struct no_exception {}; + +class replica_exception : public std::exception { +public: + replica_exception() noexcept {}; +}; + +class rate_limit_exception final : public replica_exception { +public: + rate_limit_exception() noexcept + : replica_exception() + { } + + virtual const char* what() const noexcept override { return "rate limit exceeded"; } +}; + +struct exception_variant { + std::variant reason; + + exception_variant() + : reason(no_exception{}) + { } + + template + exception_variant(Ex&& ex) + : reason(std::move(ex)) + { } + + std::exception_ptr into_exception_ptr() noexcept; + + inline operator bool() const noexcept { + return !std::holds_alternative(reason); + } +}; + +// Tries to encode the exception into an exception_variant. +// If given exception cannot be encoded into one of the replica exception types, +// returns no_exception. +exception_variant try_encode_replica_exception(std::exception_ptr eptr); + +} diff --git a/schema.cc b/schema.cc index d91fe4e4a0..5984c44b35 100644 --- a/schema.cc +++ b/schema.cc @@ -31,6 +31,7 @@ #include "db/paxos_grace_seconds_extension.hh" #include "utils/rjson.hh" #include "tombstone_gc_options.hh" +#include "db/per_partition_rate_limit_extension.hh" constexpr int32_t schema::NAME_LENGTH; @@ -1269,6 +1270,12 @@ schema_ptr schema_builder::build() { dynamic_pointer_cast(it->second)->get_paxos_grace_seconds(); } + // cache the `per_partition_rate_limit` parameters for fast access through the schema object. 
+ if (auto it = new_raw._extensions.find(db::per_partition_rate_limit_extension::NAME); it != new_raw._extensions.end()) { + new_raw._per_partition_rate_limit_options = + dynamic_pointer_cast(it->second)->get_options(); + } + return make_lw_shared(schema::private_tag{}, new_raw, _view_info); } @@ -1302,6 +1309,11 @@ schema_builder& schema_builder::with_tombstone_gc_options(const tombstone_gc_opt return *this; } +schema_builder& schema_builder::with_per_partition_rate_limit_options(const db::per_partition_rate_limit_options& opts) { + add_extension(db::per_partition_rate_limit_extension::NAME, ::make_shared(opts)); + return *this; +} + schema_builder& schema_builder::set_paxos_grace_seconds(int32_t seconds) { add_extension(db::paxos_grace_seconds_extension::NAME, ::make_shared(seconds)); return *this; diff --git a/schema.hh b/schema.hh index d86ea4b029..4938bbd197 100644 --- a/schema.hh +++ b/schema.hh @@ -29,6 +29,7 @@ #include "column_computation.hh" #include "timestamp.hh" #include "tombstone_gc_options.hh" +#include "db/per_partition_rate_limit_options.hh" namespace dht { @@ -621,6 +622,7 @@ private: double _dc_local_read_repair_chance = 0.0; double _read_repair_chance = 0.0; double _crc_check_chance = 1; + db::per_partition_rate_limit_options _per_partition_rate_limit_options; int32_t _min_compaction_threshold = DEFAULT_MIN_COMPACTION_THRESHOLD; int32_t _max_compaction_threshold = DEFAULT_MAX_COMPACTION_THRESHOLD; int32_t _min_index_interval = DEFAULT_MIN_INDEX_INTERVAL; @@ -813,6 +815,10 @@ public: const ::tombstone_gc_options& tombstone_gc_options() const; + const db::per_partition_rate_limit_options& per_partition_rate_limit_options() const { + return _raw._per_partition_rate_limit_options; + } + const ::speculative_retry& speculative_retry() const { return _raw._speculative_retry; } diff --git a/schema_builder.hh b/schema_builder.hh index cfe8ef1a1e..88d70e6191 100644 --- a/schema_builder.hh +++ b/schema_builder.hh @@ -14,6 +14,10 @@ #include 
"dht/i_partitioner.hh" #include "tombstone_gc_options.hh" +namespace db { +class per_partition_rate_limit_options; +} + struct schema_builder { public: enum class compact_storage { no, yes }; @@ -280,6 +284,7 @@ public: schema_builder& with_cdc_options(const cdc::options&); schema_builder& with_tombstone_gc_options(const tombstone_gc_options& opts); + schema_builder& with_per_partition_rate_limit_options(const db::per_partition_rate_limit_options&); default_names get_default_names() const { return default_names(_raw); diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index bece0f18ff..4a41ce1624 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -8,6 +8,7 @@ * SPDX-License-Identifier: (AGPL-3.0-or-later and Apache-2.0) */ +#include #include #include #include "partition_range_compat.hh" @@ -93,6 +94,8 @@ #include "utils/overloaded_functor.hh" #include "utils/result_try.hh" #include "utils/error_injection.hh" +#include "replica/exceptions.hh" +#include "db/operation_type.hh" namespace bi = boost::intrusive; @@ -110,6 +113,7 @@ static const sstring COORDINATOR_STATS_CATEGORY("storage_proxy_coordinator"); static const sstring REPLICA_STATS_CATEGORY("storage_proxy_replica"); static const seastar::metrics::label op_type_label("op_type"); static const seastar::metrics::label scheduling_group_label("scheduling_group_name"); +static const seastar::metrics::label rejected_by_coordinator_label("rejected_by_coordinator"); seastar::metrics::label_instance current_scheduling_group_label() { return scheduling_group_label(current_scheduling_group().name()); @@ -141,6 +145,59 @@ unsigned storage_proxy::cas_shard(const schema& s, dht::token token) { return dht::shard_of(s, token); } +static uint32_t random_variable_for_rate_limit() { + static thread_local std::default_random_engine re{std::random_device{}()}; + static thread_local std::uniform_int_distribution dist(0, 0xFFFFFFFF); + return dist(re); +} + +static result choose_rate_limit_info( + 
replica::database& db, + bool coordinator_in_replica_set, + db::operation_type op_type, + const schema_ptr& s, + const dht::token& token, + tracing::trace_state_ptr tr_state) { + + db::per_partition_rate_limit::account_and_enforce enforce_info{ + .random_variable = random_variable_for_rate_limit(), + }; + if (coordinator_in_replica_set && dht::shard_of(*s, token) == this_shard_id()) { + auto& cf = db.find_column_family(s); + auto decision = db.account_coordinator_operation_to_rate_limit(cf, token, enforce_info, op_type); + if (decision) { + if (*decision == db::rate_limiter::can_proceed::yes) { + // The coordinator has decided to accept the operation. + // Tell other replicas only to account, but not reject + slogger.trace("Per-partition rate limiting: coordinator accepted"); + tracing::trace(tr_state, "Per-partition rate limiting: coordinator accepted"); + return db::per_partition_rate_limit::account_only{}; + } else { + // The coordinator has decided to reject, abort the operation + slogger.trace("Per-partition rate limiting: coordinator rejected"); + tracing::trace(tr_state, "Per-partition rate limiting: coordinator rejected"); + return coordinator_exception_container(exceptions::rate_limit_exception(s->ks_name(), s->cf_name(), op_type, true)); + } + } + } + + // The coordinator is not a replica. The decision whether to accept + // or reject is left for replicas. 
+ slogger.trace("Per-partition rate limiting: replicas will decide"); + tracing::trace(tr_state, "Per-partition rate limiting: replicas will decide"); + return enforce_info; +} + +static inline db::per_partition_rate_limit::info adjust_rate_limit_for_local_operation( + const db::per_partition_rate_limit::info& info) { + if (std::holds_alternative(info)) { + // In this case, the coordinator has already accounted the operation, + // so don't do it again on this shard + return std::monostate(); + } + return info; +} + class mutation_holder { protected: size_t _size = 0; @@ -149,10 +206,10 @@ public: virtual ~mutation_holder() {} virtual bool store_hint(db::hints::manager& hm, gms::inet_address ep, tracing::trace_state_ptr tr_state) = 0; virtual future<> apply_locally(storage_proxy& sp, storage_proxy::clock_type::time_point timeout, - tracing::trace_state_ptr tr_state) = 0; + tracing::trace_state_ptr tr_state, db::per_partition_rate_limit::info rate_limit_info) = 0; virtual future<> apply_remotely(storage_proxy& sp, gms::inet_address ep, inet_address_vector_replica_set&& forward, storage_proxy::response_id_type response_id, storage_proxy::clock_type::time_point timeout, - tracing::trace_state_ptr tr_state) = 0; + tracing::trace_state_ptr tr_state, db::per_partition_rate_limit::info rate_limit_info) = 0; virtual bool is_shared() = 0; size_t size() const { return _size; @@ -193,24 +250,25 @@ public: } } virtual future<> apply_locally(storage_proxy& sp, storage_proxy::clock_type::time_point timeout, - tracing::trace_state_ptr tr_state) override { + tracing::trace_state_ptr tr_state, db::per_partition_rate_limit::info rate_limit_info) override { auto m = _mutations[utils::fb_utilities::get_broadcast_address()]; if (m) { tracing::trace(tr_state, "Executing a mutation locally"); - return sp.mutate_locally(_schema, *m, std::move(tr_state), db::commitlog::force_sync::no, timeout); + return sp.mutate_locally(_schema, *m, std::move(tr_state), db::commitlog::force_sync::no, 
timeout, rate_limit_info); } return make_ready_future<>(); } virtual future<> apply_remotely(storage_proxy& sp, gms::inet_address ep, inet_address_vector_replica_set&& forward, storage_proxy::response_id_type response_id, storage_proxy::clock_type::time_point timeout, - tracing::trace_state_ptr tr_state) override { + tracing::trace_state_ptr tr_state, db::per_partition_rate_limit::info rate_limit_info) override { auto m = _mutations[ep]; if (m) { tracing::trace(tr_state, "Sending a mutation to /{}", ep); return ser::storage_proxy_rpc_verbs::send_mutation(&sp._messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *m, std::move(forward), utils::fb_utilities::get_broadcast_address(), this_shard_id(), - response_id, tracing::make_trace_info(tr_state)); + response_id, tracing::make_trace_info(tr_state), + rate_limit_info); } sp.got_response(response_id, ep, std::nullopt); return make_ready_future<>(); @@ -246,18 +304,19 @@ public: return hm.store_hint(ep, _schema, _mutation, tr_state); } virtual future<> apply_locally(storage_proxy& sp, storage_proxy::clock_type::time_point timeout, - tracing::trace_state_ptr tr_state) override { + tracing::trace_state_ptr tr_state, db::per_partition_rate_limit::info rate_limit_info) override { tracing::trace(tr_state, "Executing a mutation locally"); - return sp.mutate_locally(_schema, *_mutation, std::move(tr_state), db::commitlog::force_sync::no, timeout); + return sp.mutate_locally(_schema, *_mutation, std::move(tr_state), db::commitlog::force_sync::no, timeout, rate_limit_info); } virtual future<> apply_remotely(storage_proxy& sp, gms::inet_address ep, inet_address_vector_replica_set&& forward, storage_proxy::response_id_type response_id, storage_proxy::clock_type::time_point timeout, - tracing::trace_state_ptr tr_state) override { + tracing::trace_state_ptr tr_state, db::per_partition_rate_limit::info rate_limit_info) override { tracing::trace(tr_state, "Sending a mutation to /{}", ep); return 
ser::storage_proxy_rpc_verbs::send_mutation(&sp._messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *_mutation, std::move(forward), utils::fb_utilities::get_broadcast_address(), this_shard_id(), - response_id, tracing::make_trace_info(tr_state)); + response_id, tracing::make_trace_info(tr_state), + rate_limit_info); } virtual bool is_shared() override { return true; @@ -275,14 +334,14 @@ public: throw std::runtime_error("Attempted to store a hint for a hint"); } virtual future<> apply_locally(storage_proxy& sp, storage_proxy::clock_type::time_point timeout, - tracing::trace_state_ptr tr_state) override { + tracing::trace_state_ptr tr_state, db::per_partition_rate_limit::info rate_limit_info) override { // A hint will be sent to all relevant endpoints when the endpoint it was originally intended for // becomes unavailable - this might include the current node return sp.mutate_hint(_schema, *_mutation, std::move(tr_state), timeout); } virtual future<> apply_remotely(storage_proxy& sp, gms::inet_address ep, inet_address_vector_replica_set&& forward, storage_proxy::response_id_type response_id, storage_proxy::clock_type::time_point timeout, - tracing::trace_state_ptr tr_state) override { + tracing::trace_state_ptr tr_state, db::per_partition_rate_limit::info rate_limit_info) override { tracing::trace(tr_state, "Sending a hint to /{}", ep); return ser::storage_proxy_rpc_verbs::send_hint_mutation(&sp._messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *_mutation, @@ -304,14 +363,16 @@ public: return false; // CAS does not save hints yet } virtual future<> apply_locally(storage_proxy& sp, storage_proxy::clock_type::time_point timeout, - tracing::trace_state_ptr tr_state) override { + tracing::trace_state_ptr tr_state, db::per_partition_rate_limit::info rate_limit_info) override { tracing::trace(tr_state, "Executing a learn locally"); + // TODO: Enforce per partition rate limiting in paxos return paxos::paxos_state::learn(sp, _schema, *_proposal, 
timeout, tr_state); } virtual future<> apply_remotely(storage_proxy& sp, gms::inet_address ep, inet_address_vector_replica_set&& forward, storage_proxy::response_id_type response_id, storage_proxy::clock_type::time_point timeout, - tracing::trace_state_ptr tr_state) override { + tracing::trace_state_ptr tr_state, db::per_partition_rate_limit::info rate_limit_info) override { tracing::trace(tr_state, "Sending a learn to /{}", ep); + // TODO: Enforce per partition rate limiting in paxos return ser::storage_proxy_rpc_verbs::send_paxos_learn(&sp._messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *_proposal, std::move(forward), utils::fb_utilities::get_broadcast_address(), this_shard_id(), response_id, tracing::make_trace_info(tr_state)); @@ -364,6 +425,7 @@ protected: lw_shared_ptr _cdc_operation_result_tracker; timer _expire_timer; service_permit _permit; // holds admission permit until operation completes + db::per_partition_rate_limit::info _rate_limit_info; protected: virtual bool waited_for(gms::inet_address from) = 0; @@ -377,9 +439,11 @@ protected: public: abstract_write_response_handler(shared_ptr p, replica::keyspace& ks, db::consistency_level cl, db::write_type type, std::unique_ptr mh, inet_address_vector_replica_set targets, tracing::trace_state_ptr trace_state, - storage_proxy::write_stats& stats, service_permit permit, size_t pending_endpoints = 0, inet_address_vector_topology_change dead_endpoints = {}) + storage_proxy::write_stats& stats, service_permit permit, db::per_partition_rate_limit::info rate_limit_info, size_t pending_endpoints = 0, + inet_address_vector_topology_change dead_endpoints = {}) : _id(p->get_next_response_id()), _proxy(std::move(p)), _trace_state(trace_state), _cl(cl), _type(type), _mutation_holder(std::move(mh)), _targets(std::move(targets)), - _dead_endpoints(std::move(dead_endpoints)), _stats(stats), _expire_timer([this] { timeout_cb(); }), _permit(std::move(permit)) { + _dead_endpoints(std::move(dead_endpoints)), 
_stats(stats), _expire_timer([this] { timeout_cb(); }), _permit(std::move(permit)), + _rate_limit_info(rate_limit_info) { // original comment from cassandra: // during bootstrap, include pending endpoints in the count // or we may fail the consistency level guarantees (see #833, #8058) @@ -405,6 +469,8 @@ public: } else { _ready.set_exception(mutation_write_failure_exception(*_message, _cl, _cl_acks, _failed, _total_block_for, _type)); } + } else if (_error == error::RATE_LIMIT) { + _ready.set_value(exceptions::rate_limit_exception(get_schema()->ks_name(), get_schema()->cf_name(), db::operation_type::write, false)); } if (_cdc_operation_result_tracker) { _cdc_operation_result_tracker->on_mutation_failed(); @@ -591,12 +657,12 @@ public: return _mutation_holder->store_hint(hm, ep, tr_state); } future<> apply_locally(storage_proxy::clock_type::time_point timeout, tracing::trace_state_ptr tr_state) { - return _mutation_holder->apply_locally(*_proxy, timeout, std::move(tr_state)); + return _mutation_holder->apply_locally(*_proxy, timeout, std::move(tr_state), adjust_rate_limit_for_local_operation(_rate_limit_info)); } future<> apply_remotely(gms::inet_address ep, inet_address_vector_replica_set&& forward, storage_proxy::response_id_type response_id, storage_proxy::clock_type::time_point timeout, tracing::trace_state_ptr tr_state) { - return _mutation_holder->apply_remotely(*_proxy, ep, std::move(forward), response_id, timeout, std::move(tr_state)); + return _mutation_holder->apply_remotely(*_proxy, ep, std::move(forward), response_id, timeout, std::move(tr_state), _rate_limit_info); } const schema_ptr& get_schema() const { return _mutation_holder->schema(); @@ -628,9 +694,9 @@ public: datacenter_write_response_handler(shared_ptr p, replica::keyspace& ks, db::consistency_level cl, db::write_type type, std::unique_ptr mh, inet_address_vector_replica_set targets, const inet_address_vector_topology_change& pending_endpoints, inet_address_vector_topology_change 
dead_endpoints, tracing::trace_state_ptr tr_state, - storage_proxy::write_stats& stats, service_permit permit) : + storage_proxy::write_stats& stats, service_permit permit, db::per_partition_rate_limit::info rate_limit_info) : abstract_write_response_handler(std::move(p), ks, cl, type, std::move(mh), - std::move(targets), std::move(tr_state), stats, std::move(permit), db::count_local_endpoints(pending_endpoints), std::move(dead_endpoints)) { + std::move(targets), std::move(tr_state), stats, std::move(permit), rate_limit_info, db::count_local_endpoints(pending_endpoints), std::move(dead_endpoints)) { _total_endpoints = db::count_local_endpoints(_targets); } }; @@ -643,9 +709,9 @@ public: write_response_handler(shared_ptr p, replica::keyspace& ks, db::consistency_level cl, db::write_type type, std::unique_ptr mh, inet_address_vector_replica_set targets, const inet_address_vector_topology_change& pending_endpoints, inet_address_vector_topology_change dead_endpoints, tracing::trace_state_ptr tr_state, - storage_proxy::write_stats& stats, service_permit permit) : + storage_proxy::write_stats& stats, service_permit permit, db::per_partition_rate_limit::info rate_limit_info) : abstract_write_response_handler(std::move(p), ks, cl, type, std::move(mh), - std::move(targets), std::move(tr_state), stats, std::move(permit), pending_endpoints.size(), std::move(dead_endpoints)) { + std::move(targets), std::move(tr_state), stats, std::move(permit), rate_limit_info, pending_endpoints.size(), std::move(dead_endpoints)) { _total_endpoints = _targets.size(); } }; @@ -655,9 +721,9 @@ public: view_update_write_response_handler(shared_ptr p, replica::keyspace& ks, db::consistency_level cl, std::unique_ptr mh, inet_address_vector_replica_set targets, const inet_address_vector_topology_change& pending_endpoints, inet_address_vector_topology_change dead_endpoints, tracing::trace_state_ptr tr_state, - storage_proxy::write_stats& stats, service_permit permit): + storage_proxy::write_stats& 
stats, service_permit permit, db::per_partition_rate_limit::info rate_limit_info): write_response_handler(p, ks, cl, db::write_type::VIEW, std::move(mh), - std::move(targets), pending_endpoints, std::move(dead_endpoints), std::move(tr_state), stats, std::move(permit)) { + std::move(targets), pending_endpoints, std::move(dead_endpoints), std::move(tr_state), stats, std::move(permit), rate_limit_info) { register_in_intrusive_list(*p); } ~view_update_write_response_handler(); @@ -734,8 +800,9 @@ class datacenter_sync_write_response_handler : public abstract_write_response_ha public: datacenter_sync_write_response_handler(shared_ptr p, replica::keyspace& ks, db::consistency_level cl, db::write_type type, std::unique_ptr mh, inet_address_vector_replica_set targets, const inet_address_vector_topology_change& pending_endpoints, - inet_address_vector_topology_change dead_endpoints, tracing::trace_state_ptr tr_state, storage_proxy::write_stats& stats, service_permit permit) : - abstract_write_response_handler(std::move(p), ks, cl, type, std::move(mh), targets, std::move(tr_state), stats, std::move(permit), 0, dead_endpoints) { + inet_address_vector_topology_change dead_endpoints, tracing::trace_state_ptr tr_state, storage_proxy::write_stats& stats, service_permit permit, + db::per_partition_rate_limit::info rate_limit_info) : + abstract_write_response_handler(std::move(p), ks, cl, type, std::move(mh), targets, std::move(tr_state), stats, std::move(permit), rate_limit_info, 0, dead_endpoints) { auto& topology = _proxy->get_token_metadata_ptr()->get_topology(); for (auto& target : targets) { @@ -1419,23 +1486,23 @@ future> storage_proxy::response_wait(storage_proxy::response_id_type id return _response_handlers.find(id)->second; } -storage_proxy::response_id_type storage_proxy::create_write_response_handler(replica::keyspace& ks, db::consistency_level cl, db::write_type type, std::unique_ptr m, +result storage_proxy::create_write_response_handler(replica::keyspace& ks, 
db::consistency_level cl, db::write_type type, std::unique_ptr m, inet_address_vector_replica_set targets, const inet_address_vector_topology_change& pending_endpoints, inet_address_vector_topology_change dead_endpoints, tracing::trace_state_ptr tr_state, - storage_proxy::write_stats& stats, service_permit permit) + storage_proxy::write_stats& stats, service_permit permit, db::per_partition_rate_limit::info rate_limit_info) { shared_ptr h; auto& rs = ks.get_replication_strategy(); if (db::is_datacenter_local(cl)) { - h = ::make_shared(shared_from_this(), ks, cl, type, std::move(m), std::move(targets), pending_endpoints, std::move(dead_endpoints), std::move(tr_state), stats, std::move(permit)); + h = ::make_shared(shared_from_this(), ks, cl, type, std::move(m), std::move(targets), pending_endpoints, std::move(dead_endpoints), std::move(tr_state), stats, std::move(permit), rate_limit_info); } else if (cl == db::consistency_level::EACH_QUORUM && rs.get_type() == locator::replication_strategy_type::network_topology){ - h = ::make_shared(shared_from_this(), ks, cl, type, std::move(m), std::move(targets), pending_endpoints, std::move(dead_endpoints), std::move(tr_state), stats, std::move(permit)); + h = ::make_shared(shared_from_this(), ks, cl, type, std::move(m), std::move(targets), pending_endpoints, std::move(dead_endpoints), std::move(tr_state), stats, std::move(permit), rate_limit_info); } else if (type == db::write_type::VIEW) { - h = ::make_shared(shared_from_this(), ks, cl, std::move(m), std::move(targets), pending_endpoints, std::move(dead_endpoints), std::move(tr_state), stats, std::move(permit)); + h = ::make_shared(shared_from_this(), ks, cl, std::move(m), std::move(targets), pending_endpoints, std::move(dead_endpoints), std::move(tr_state), stats, std::move(permit), rate_limit_info); } else { - h = ::make_shared(shared_from_this(), ks, cl, type, std::move(m), std::move(targets), pending_endpoints, std::move(dead_endpoints), std::move(tr_state), stats, 
std::move(permit)); + h = ::make_shared(shared_from_this(), ks, cl, type, std::move(m), std::move(targets), pending_endpoints, std::move(dead_endpoints), std::move(tr_state), stats, std::move(permit), rate_limit_info); } - return register_response_handler(std::move(h)); + return bo::success(register_response_handler(std::move(h))); } seastar::metrics::label storage_proxy_stats::split_stats::datacenter_label("datacenter"); @@ -1501,6 +1568,14 @@ void storage_proxy_stats::write_stats::register_stats() { sm::description("number write requests failed due to an \"unavailable\" error"), {storage_proxy_stats::current_scheduling_group_label()}), + sm::make_total_operations("write_rate_limited", write_rate_limited_by_replicas._count, + sm::description("number of write requests which were rejected by replicas because rate limit for the partition was reached."), + {storage_proxy_stats::current_scheduling_group_label(), storage_proxy_stats::rejected_by_coordinator_label(false)}), + + sm::make_total_operations("write_rate_limited", write_rate_limited_by_coordinator._count, + sm::description("number of write requests which were rejected directly on the coordinator because rate limit for the partition was reached."), + {storage_proxy_stats::current_scheduling_group_label(),storage_proxy_stats::rejected_by_coordinator_label(true)}), + sm::make_total_operations("background_writes_failed", background_writes_failed, sm::description("number of write requests that failed after CL was reached"), {storage_proxy_stats::current_scheduling_group_label()}), @@ -1585,6 +1660,14 @@ void storage_proxy_stats::stats::register_stats() { sm::description("number read requests failed due to an \"unavailable\" error"), {storage_proxy_stats::current_scheduling_group_label()}), + sm::make_total_operations("read_rate_limited", read_rate_limited_by_replicas._count, + sm::description("number of read requests which were rejected by replicas because rate limit for the partition was reached."), + 
{storage_proxy_stats::current_scheduling_group_label(), storage_proxy_stats::rejected_by_coordinator_label(false)}), + + sm::make_total_operations("read_rate_limited", read_rate_limited_by_coordinator._count, + sm::description("number of read requests which were rejected directly on the coordinator because rate limit for the partition was reached."), + {storage_proxy_stats::current_scheduling_group_label(), storage_proxy_stats::rejected_by_coordinator_label(true)}), + sm::make_total_operations("range_timeouts", range_slice_timeouts._count, sm::description("number of range read operations failed due to a timeout"), {storage_proxy_stats::current_scheduling_group_label()}), @@ -1852,7 +1935,7 @@ void storage_proxy::connection_dropped(gms::inet_address addr) { } future<> -storage_proxy::mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout, smp_service_group smp_grp) { +storage_proxy::mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout, smp_service_group smp_grp, db::per_partition_rate_limit::info rate_limit_info) { auto shard = m.shard_of(); get_stats().replica_cross_shard_ops += shard != this_shard_id(); return _db.invoke_on(shard, {smp_grp, timeout}, @@ -1860,32 +1943,33 @@ storage_proxy::mutate_locally(const mutation& m, tracing::trace_state_ptr tr_sta m = freeze(m), gtr = tracing::global_trace_state_ptr(std::move(tr_state)), timeout, - sync] (replica::database& db) mutable -> future<> { - return db.apply(s, m, gtr.get(), sync, timeout); + sync, + rate_limit_info] (replica::database& db) mutable -> future<> { + return db.apply(s, m, gtr.get(), sync, timeout, rate_limit_info); }); } future<> storage_proxy::mutate_locally(const schema_ptr& s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout, - smp_service_group smp_grp) { + 
smp_service_group smp_grp, db::per_partition_rate_limit::info rate_limit_info) { auto shard = m.shard_of(*s); get_stats().replica_cross_shard_ops += shard != this_shard_id(); return _db.invoke_on(shard, {smp_grp, timeout}, - [&m, gs = global_schema_ptr(s), gtr = tracing::global_trace_state_ptr(std::move(tr_state)), timeout, sync] (replica::database& db) mutable -> future<> { - return db.apply(gs, m, gtr.get(), sync, timeout); + [&m, gs = global_schema_ptr(s), gtr = tracing::global_trace_state_ptr(std::move(tr_state)), timeout, sync, rate_limit_info] (replica::database& db) mutable -> future<> { + return db.apply(gs, m, gtr.get(), sync, timeout, rate_limit_info); }); } future<> -storage_proxy::mutate_locally(std::vector mutations, tracing::trace_state_ptr tr_state, clock_type::time_point timeout, smp_service_group smp_grp) { +storage_proxy::mutate_locally(std::vector mutations, tracing::trace_state_ptr tr_state, clock_type::time_point timeout, smp_service_group smp_grp, db::per_partition_rate_limit::info rate_limit_info) { co_await coroutine::parallel_for_each(mutations, [&] (const mutation& m) mutable { - return mutate_locally(m, tr_state, db::commitlog::force_sync::no, timeout, smp_grp); + return mutate_locally(m, tr_state, db::commitlog::force_sync::no, timeout, smp_grp, rate_limit_info); }); } future<> -storage_proxy::mutate_locally(std::vector mutation, tracing::trace_state_ptr tr_state, clock_type::time_point timeout) { - return mutate_locally(std::move(mutation), tr_state, timeout, _write_smp_service_group); +storage_proxy::mutate_locally(std::vector mutation, tracing::trace_state_ptr tr_state, clock_type::time_point timeout, db::per_partition_rate_limit::info rate_limit_info) { + return mutate_locally(std::move(mutation), tr_state, timeout, _write_smp_service_group, rate_limit_info); } future<> storage_proxy::mutate_hint(const schema_ptr& s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, clock_type::time_point timeout) { @@ -1922,9 +2006,9 @@ 
storage_proxy::mutate_counter_on_leader_and_replicate(const schema_ptr& s, froze }); } -storage_proxy::response_id_type +result storage_proxy::create_write_response_handler_helper(schema_ptr s, const dht::token& token, std::unique_ptr mh, - db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit) { + db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit) { auto keyspace_name = s->ks_name(); replica::keyspace& ks = _db.local().find_keyspace(keyspace_name); auto erm = ks.get_effective_replication_map(); @@ -1934,14 +2018,15 @@ storage_proxy::create_write_response_handler_helper(schema_ptr s, const dht::tok slogger.trace("creating write handler for token: {} natural: {} pending: {}", token, natural_endpoints, pending_endpoints); tracing::trace(tr_state, "Creating write handler for token: {} natural: {} pending: {}", token, natural_endpoints ,pending_endpoints); + const bool coordinator_in_replica_set = std::find(natural_endpoints.begin(), natural_endpoints.end(), + utils::fb_utilities::get_broadcast_address()) != natural_endpoints.end(); + // Check if this node, which is serving as a coordinator for // the mutation, is also a replica for the partition being // changed. Mutations sent by drivers unaware of token // distribution create a lot of network noise and thus should be // accounted in the metrics. 
- if (std::find(natural_endpoints.begin(), natural_endpoints.end(), - utils::fb_utilities::get_broadcast_address()) == natural_endpoints.end()) { - + if (!coordinator_in_replica_set) { get_stats().writes_coordinator_outside_replica_set++; } @@ -1971,13 +2056,24 @@ storage_proxy::create_write_response_handler_helper(schema_ptr s, const dht::tok std::partition_copy(all.begin(), all.end(), std::back_inserter(live_endpoints), std::back_inserter(dead_endpoints), std::bind_front(std::mem_fn(&gms::gossiper::is_alive), &_gossiper)); + db::per_partition_rate_limit::info rate_limit_info; + if (allow_limit && _db.local().can_apply_per_partition_rate_limit(*s, db::operation_type::write)) { + auto r_rate_limit_info = choose_rate_limit_info(_db.local(), coordinator_in_replica_set, db::operation_type::write, s, token, tr_state); + if (!r_rate_limit_info) { + return std::move(r_rate_limit_info).as_failure(); + } + rate_limit_info = r_rate_limit_info.value(); + } else { + slogger.trace("Operation is not rate limited"); + } + slogger.trace("creating write handler with live: {} dead: {}", live_endpoints, dead_endpoints); tracing::trace(tr_state, "Creating write handler with live: {} dead: {}", live_endpoints, dead_endpoints); db::assure_sufficient_live_nodes(cl, ks, live_endpoints, pending_endpoints); return create_write_response_handler(ks, cl, type, std::move(mh), std::move(live_endpoints), pending_endpoints, - std::move(dead_endpoints), std::move(tr_state), get_stats(), std::move(permit)); + std::move(dead_endpoints), std::move(tr_state), get_stats(), std::move(permit), rate_limit_info); } /** @@ -1987,20 +2083,20 @@ storage_proxy::create_write_response_handler_helper(schema_ptr s, const dht::tok * Since ordering is (maybe?) significant, we need to carry some info across from here * to the hint method below (dead nodes). 
*/ -storage_proxy::response_id_type -storage_proxy::create_write_response_handler(const mutation& m, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit) { +result +storage_proxy::create_write_response_handler(const mutation& m, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit) { return create_write_response_handler_helper(m.schema(), m.token(), std::make_unique(m), cl, type, tr_state, - std::move(permit)); + std::move(permit), allow_limit); } -storage_proxy::response_id_type -storage_proxy::create_write_response_handler(const hint_wrapper& h, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit) { +result +storage_proxy::create_write_response_handler(const hint_wrapper& h, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit) { return create_write_response_handler_helper(h.mut.schema(), h.mut.token(), std::make_unique(h.mut), cl, type, tr_state, - std::move(permit)); + std::move(permit), allow_limit); } -storage_proxy::response_id_type -storage_proxy::create_write_response_handler(const std::unordered_map>& m, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit) { +result +storage_proxy::create_write_response_handler(const std::unordered_map>& m, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit) { inet_address_vector_replica_set endpoints; endpoints.reserve(m.size()); boost::copy(m | boost::adaptors::map_keys, std::inserter(endpoints, endpoints.begin())); @@ -2012,21 +2108,22 @@ storage_proxy::create_write_response_handler(const std::unordered_mapschema()->ks_name(); replica::keyspace& ks = 
_db.local().find_keyspace(keyspace_name); - return create_write_response_handler(ks, cl, type, std::move(mh), std::move(endpoints), inet_address_vector_topology_change(), inet_address_vector_topology_change(), std::move(tr_state), get_stats(), std::move(permit)); + // No rate limiting for read repair + return create_write_response_handler(ks, cl, type, std::move(mh), std::move(endpoints), inet_address_vector_topology_change(), inet_address_vector_topology_change(), std::move(tr_state), get_stats(), std::move(permit), std::monostate()); } -storage_proxy::response_id_type +result storage_proxy::create_write_response_handler(const std::tuple, schema_ptr, shared_ptr, dht::token>& meta, - db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit) { + db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit) { auto& [commit, s, h, t] = meta; return create_write_response_handler_helper(s, t, std::make_unique(std::move(commit), s, std::move(h)), cl, - db::write_type::CAS, tr_state, std::move(permit)); + db::write_type::CAS, tr_state, std::move(permit), allow_limit); } -storage_proxy::response_id_type +result storage_proxy::create_write_response_handler(const std::tuple, schema_ptr, dht::token, inet_address_vector_replica_set>& meta, - db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit) { + db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit) { auto& [commit, s, token, endpoints] = meta; slogger.trace("creating write handler for paxos repair token: {} endpoint: {}", token, endpoints); @@ -2035,8 +2132,9 @@ storage_proxy::create_write_response_handler(const std::tupleks_name(); replica::keyspace& ks = _db.local().find_keyspace(keyspace_name); + // No rate limiting for paxos (yet) return 
create_write_response_handler(ks, cl, db::write_type::CAS, std::make_unique(std::move(commit), s, nullptr), std::move(endpoints), - inet_address_vector_topology_change(), inet_address_vector_topology_change(), std::move(tr_state), get_stats(), std::move(permit)); + inet_address_vector_topology_change(), inet_address_vector_topology_change(), std::move(tr_state), get_stats(), std::move(permit), std::monostate()); } void storage_proxy::register_cdc_operation_result_tracker(const storage_proxy::unique_response_handler_vector& ids, lw_shared_ptr tracker) { @@ -2065,22 +2163,26 @@ storage_proxy::hint_to_dead_endpoints(response_id_type id, db::consistency_level } template -future storage_proxy::mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, service_permit permit, CreateWriteHandler create_handler) { +future> storage_proxy::mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, service_permit permit, CreateWriteHandler create_handler) { // apply is used to convert exceptions to exceptional future return futurize_invoke([this] (Range&& mutations, db::consistency_level cl, db::write_type type, service_permit permit, CreateWriteHandler create_handler) { unique_response_handler_vector ids; ids.reserve(std::distance(std::begin(mutations), std::end(mutations))); for (auto& m : mutations) { - ids.emplace_back(*this, create_handler(m, cl, type, permit)); + auto r_handler = create_handler(m, cl, type, permit); + if (!r_handler) { + return make_ready_future>(std::move(r_handler).as_failure()); + } + ids.emplace_back(*this, std::move(r_handler).value()); } - return make_ready_future(std::move(ids)); + return make_ready_future>(std::move(ids)); }, std::forward(mutations), cl, type, std::move(permit), std::move(create_handler)); } template -future storage_proxy::mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit) { - return 
mutate_prepare<>(std::forward(mutations), cl, type, std::move(permit), [this, tr_state = std::move(tr_state)] (const typename std::decay_t::value_type& m, db::consistency_level cl, db::write_type type, service_permit permit) mutable { - return create_write_response_handler(m, cl, type, tr_state, std::move(permit)); +future> storage_proxy::mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit) { + return mutate_prepare<>(std::forward(mutations), cl, type, std::move(permit), [this, tr_state = std::move(tr_state), allow_limit] (const typename std::decay_t::value_type& m, db::consistency_level cl, db::write_type type, service_permit permit) mutable { + return create_write_response_handler(m, cl, type, tr_state, std::move(permit), allow_limit); }); } @@ -2146,6 +2248,15 @@ future> storage_proxy::mutate_end(future> mutate_result, utils stats.write_unavailables.mark(); slogger.trace("Unavailable"); return handle.into_future(); + }), utils::result_catch([&] (const auto& ex, auto&& handle) { + tracing::trace(trace_state, "Mutation failed: rate limit exceeded"); + if (ex.rejected_by_coordinator) { + stats.write_rate_limited_by_coordinator.mark(); + } else { + stats.write_rate_limited_by_replicas.mark(); + } + slogger.trace("Rate limit exceeded"); + return handle.into_future(); }), utils::result_catch([&] (const auto& ex, auto&& handle) { tracing::trace(trace_state, "Mutation failed: overloaded"); stats.write_unavailables.mark(); @@ -2316,29 +2427,29 @@ storage_proxy::get_paxos_participants(const sstring& ks_name, const dht::token & * @param consistency_level the consistency level for the operation * @param tr_state trace state handle */ -future<> storage_proxy::mutate(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, bool raw_counters) { - return 
mutate_result(std::move(mutations), cl, timeout, std::move(tr_state), std::move(permit), raw_counters) +future<> storage_proxy::mutate(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit, bool raw_counters) { + return mutate_result(std::move(mutations), cl, timeout, std::move(tr_state), std::move(permit), allow_limit, raw_counters) .then(utils::result_into_future>); } -future> storage_proxy::mutate_result(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, bool raw_counters) { +future> storage_proxy::mutate_result(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit, bool raw_counters) { if (_cdc && _cdc->needs_cdc_augmentation(mutations)) { - return _cdc->augment_mutation_call(timeout, std::move(mutations), tr_state, cl).then([this, cl, timeout, tr_state, permit = std::move(permit), raw_counters, cdc = _cdc->shared_from_this()](std::tuple, lw_shared_ptr>&& t) mutable { + return _cdc->augment_mutation_call(timeout, std::move(mutations), tr_state, cl).then([this, cl, timeout, tr_state, permit = std::move(permit), raw_counters, cdc = _cdc->shared_from_this(), allow_limit](std::tuple, lw_shared_ptr>&& t) mutable { auto mutations = std::move(std::get<0>(t)); auto tracker = std::move(std::get<1>(t)); - return _mutate_stage(this, std::move(mutations), cl, timeout, std::move(tr_state), std::move(permit), raw_counters, std::move(tracker)); + return _mutate_stage(this, std::move(mutations), cl, timeout, std::move(tr_state), std::move(permit), raw_counters, allow_limit, std::move(tracker)); }); } - return _mutate_stage(this, std::move(mutations), cl, timeout, std::move(tr_state), std::move(permit), raw_counters, nullptr); + 
return _mutate_stage(this, std::move(mutations), cl, timeout, std::move(tr_state), std::move(permit), raw_counters, allow_limit, nullptr); } -future> storage_proxy::do_mutate(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, bool raw_counters, lw_shared_ptr cdc_tracker) { +future> storage_proxy::do_mutate(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, bool raw_counters, db::allow_per_partition_rate_limit allow_limit, lw_shared_ptr cdc_tracker) { auto mid = raw_counters ? mutations.begin() : boost::range::partition(mutations, [] (auto&& m) { return m.schema()->is_counter(); }); return seastar::when_all_succeed( mutate_counters(boost::make_iterator_range(mutations.begin(), mid), cl, tr_state, permit, timeout), - mutate_internal(boost::make_iterator_range(mid, mutations.end()), cl, false, tr_state, permit, timeout, std::move(cdc_tracker)) + mutate_internal(boost::make_iterator_range(mid, mutations.end()), cl, false, tr_state, permit, timeout, std::move(cdc_tracker), allow_limit) ).then([] (std::tuple> res) { // For now, only mutate_internal returns a result<> return std::get<0>(std::move(res)); @@ -2360,7 +2471,8 @@ future<> storage_proxy::replicate_counter_from_leader(mutation m, db::consistenc template future> storage_proxy::mutate_internal(Range mutations, db::consistency_level cl, bool counters, tracing::trace_state_ptr tr_state, service_permit permit, - std::optional timeout_opt, lw_shared_ptr cdc_tracker) { + std::optional timeout_opt, lw_shared_ptr cdc_tracker, + db::allow_per_partition_rate_limit allow_limit) { if (boost::empty(mutations)) { return make_ready_future>(bo::success()); } @@ -2377,11 +2489,11 @@ storage_proxy::mutate_internal(Range mutations, db::consistency_level cl, bool c utils::latency_counter lc; lc.start(); - return mutate_prepare(mutations, cl, type, tr_state, 
std::move(permit)).then([this, cl, timeout_opt, tracker = std::move(cdc_tracker), + return mutate_prepare(mutations, cl, type, tr_state, std::move(permit), allow_limit).then(utils::result_wrap([this, cl, timeout_opt, tracker = std::move(cdc_tracker), tr_state] (storage_proxy::unique_response_handler_vector ids) mutable { register_cdc_operation_result_tracker(ids, tracker); return mutate_begin(std::move(ids), cl, tr_state, timeout_opt); - }).then_wrapped([this, p = shared_from_this(), lc, tr_state] (future> f) mutable { + })).then_wrapped([this, p = shared_from_this(), lc, tr_state] (future> f) mutable { return p->mutate_end(std::move(f), lc, get_stats(), std::move(tr_state)); }); } @@ -2389,13 +2501,13 @@ storage_proxy::mutate_internal(Range mutations, db::consistency_level cl, bool c future> storage_proxy::mutate_with_triggers(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, - bool should_mutate_atomically, tracing::trace_state_ptr tr_state, service_permit permit, bool raw_counters) { + bool should_mutate_atomically, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit, bool raw_counters) { warn(unimplemented::cause::TRIGGERS); if (should_mutate_atomically) { assert(!raw_counters); return mutate_atomically_result(std::move(mutations), cl, timeout, std::move(tr_state), std::move(permit)); } - return mutate_result(std::move(mutations), cl, timeout, std::move(tr_state), std::move(permit), raw_counters); + return mutate_result(std::move(mutations), cl, timeout, std::move(tr_state), std::move(permit), allow_limit, raw_counters); } /** @@ -2468,11 +2580,11 @@ storage_proxy::mutate_atomically_result(std::vector mutations, db::con future> send_batchlog_mutation(mutation m, db::consistency_level cl = db::consistency_level::ONE) { return _p.mutate_prepare<>(std::array{std::move(m)}, cl, db::write_type::BATCH_LOG, _permit, [this] (const mutation& m, db::consistency_level cl, db::write_type 
type, service_permit permit) { auto& ks = _p._db.local().find_keyspace(m.schema()->ks_name()); - return _p.create_write_response_handler(ks, cl, type, std::make_unique(m), _batchlog_endpoints, {}, {}, _trace_state, _stats, std::move(permit)); - }).then([this, cl] (unique_response_handler_vector ids) { + return _p.create_write_response_handler(ks, cl, type, std::make_unique(m), _batchlog_endpoints, {}, {}, _trace_state, _stats, std::move(permit), std::monostate()); + }).then(utils::result_wrap([this, cl] (unique_response_handler_vector ids) { _p.register_cdc_operation_result_tracker(ids, _cdc_tracker); return _p.mutate_begin(std::move(ids), cl, _trace_state, _timeout); - }); + })); } future> sync_write_to_batchlog() { auto m = _p.get_batchlog_mutation_for(_mutations, _batch_uuid, netw::messaging_service::current_version, db_clock::now()); @@ -2501,7 +2613,7 @@ storage_proxy::mutate_atomically_result(std::vector mutations, db::con }; future> run() { - return _p.mutate_prepare(_mutations, _cl, db::write_type::BATCH, _trace_state, _permit).then([this] (unique_response_handler_vector ids) { + return _p.mutate_prepare(_mutations, _cl, db::write_type::BATCH, _trace_state, _permit, db::allow_per_partition_rate_limit::no).then(utils::result_wrap([this] (unique_response_handler_vector ids) { return sync_write_to_batchlog().then(utils::result_wrap([this, ids = std::move(ids)] () mutable { tracing::trace(_trace_state, "Sending batch mutations"); _p.register_cdc_operation_result_tracker(ids, _cdc_tracker); @@ -2509,7 +2621,7 @@ storage_proxy::mutate_atomically_result(std::vector mutations, db::con })).then(utils::result_wrap([this] { return utils::then_ok_result>(async_remove_from_batchlog()); })); - }); + })); } }; @@ -2610,10 +2722,11 @@ future<> storage_proxy::send_to_endpoint( std::move(dead_endpoints), tr_state, stats, - std::move(permit)); - }).then([this, cl, tr_state = std::move(tr_state), timeout = std::move(timeout)] (unique_response_handler_vector ids) mutable { + 
std::move(permit), + std::monostate()); // TODO: Pass the correct enforcement type + }).then(utils::result_wrap([this, cl, tr_state = std::move(tr_state), timeout = std::move(timeout)] (unique_response_handler_vector ids) mutable { return mutate_begin(std::move(ids), cl, std::move(tr_state), std::move(timeout)); - }).then_wrapped([p = shared_from_this(), lc, &stats] (future> f) { + })).then_wrapped([p = shared_from_this(), lc, &stats] (future> f) { return p->mutate_end(std::move(f), lc, stats, nullptr).then(utils::result_into_future>); }); } @@ -2785,6 +2898,9 @@ void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type respo std::optional msg; try { std::rethrow_exception(eptr); + } catch (replica::rate_limit_exception&) { + // There might be a lot of those, so ignore + err = error::RATE_LIMIT; } catch(rpc::closed_error&) { // ignore, disconnect will be logged by gossiper } catch(seastar::gate_closed_exception&) { @@ -2828,6 +2944,11 @@ future> storage_proxy::schedule_repair(std::unordered_map> _done_promise; // all target responded @@ -2859,18 +2980,21 @@ public: _timeout.arm(timeout); } virtual ~abstract_read_resolver() {}; - virtual void on_error(gms::inet_address ep, bool disconnect) = 0; + virtual void on_error(gms::inet_address ep, error_kind kind) = 0; future> done() { return _done_promise.get_future(); } void error(gms::inet_address ep, std::exception_ptr eptr) { sstring why; - bool disconnect = false; + error_kind kind = error_kind::FAILURE; try { std::rethrow_exception(eptr); + } catch (replica::rate_limit_exception&) { + // There might be a lot of those, so ignore + kind = error_kind::RATE_LIMIT; } catch (rpc::closed_error&) { // do not report connection closed exception, gossiper does that - disconnect = true; + kind = error_kind::DISCONNECT; } catch (rpc::timeout_error&) { // do not report timeouts, the whole operation will timeout and be reported return; // also do not report timeout as replica failure for the same reason @@ -2892,7 
+3016,7 @@ public: } if (!_request_failed) { // request may fail only once. - on_error(ep, disconnect); + on_error(ep, kind); } } }; @@ -2973,18 +3097,26 @@ public: _done_promise.set_value(bo::success()); } } - void on_error(gms::inet_address ep, bool disconnect) override { + void on_error(gms::inet_address ep, error_kind kind) override { if (waiting_for(ep)) { _failed++; } - if (disconnect && _block_for == _target_count_for_cl) { + if (kind == error_kind::DISCONNECT && _block_for == _target_count_for_cl) { // if the error is because of a connection disconnect and there is no targets to speculate // wait for timeout in hope that the client will issue speculative read // FIXME: resolver should have access to all replicas and try another one in this case return; } if (_block_for + _failed > _target_count_for_cl) { - fail_request(read_failure_exception(_schema->ks_name(), _schema->cf_name(), _cl, _cl_responses, _failed, _block_for, _data_result)); + switch (kind) { + case error_kind::RATE_LIMIT: + fail_request(exceptions::rate_limit_exception(_schema->ks_name(), _schema->cf_name(), db::operation_type::read, false)); + break; + case error_kind::DISCONNECT: + case error_kind::FAILURE: + fail_request(read_failure_exception(_schema->ks_name(), _schema->cf_name(), _cl, _cl_responses, _failed, _block_for, _data_result)); + break; + } } } future> has_cl() { @@ -3297,8 +3429,16 @@ public: } } } - void on_error(gms::inet_address ep, bool disconnect) override { - fail_request(read_failure_exception(_schema->ks_name(), _schema->cf_name(), _cl, response_count(), 1, _targets_count, response_count() != 0)); + void on_error(gms::inet_address ep, error_kind kind) override { + switch (kind) { + case error_kind::RATE_LIMIT: + fail_request(exceptions::rate_limit_exception(_schema->ks_name(), _schema->cf_name(), db::operation_type::read, false)); + break; + case error_kind::DISCONNECT: + case error_kind::FAILURE: + fail_request(read_failure_exception(_schema->ks_name(), 
_schema->cf_name(), _cl, response_count(), 1, _targets_count, response_count() != 0)); + break; + } } uint32_t max_live_count() const { return _max_live_count; @@ -3512,6 +3652,7 @@ protected: lw_shared_ptr _cf; bool _foreground = true; service_permit _permit; // holds admission permit until operation completes + db::per_partition_rate_limit::info _rate_limit_info; private: void on_read_resolved() noexcept { @@ -3526,9 +3667,9 @@ private: public: abstract_read_executor(schema_ptr s, lw_shared_ptr cf, shared_ptr proxy, lw_shared_ptr cmd, dht::partition_range pr, db::consistency_level cl, size_t block_for, - inet_address_vector_replica_set targets, tracing::trace_state_ptr trace_state, service_permit permit) : + inet_address_vector_replica_set targets, tracing::trace_state_ptr trace_state, service_permit permit, db::per_partition_rate_limit::info rate_limit_info) : _schema(std::move(s)), _proxy(std::move(proxy)), _cmd(std::move(cmd)), _partition_range(std::move(pr)), _cl(cl), _block_for(block_for), _targets(std::move(targets)), _trace_state(std::move(trace_state)), - _cf(std::move(cf)), _permit(std::move(permit)) { + _cf(std::move(cf)), _permit(std::move(permit)), _rate_limit_info(rate_limit_info) { _proxy->get_stats().reads++; _proxy->get_stats().foreground_reads++; } @@ -3553,8 +3694,11 @@ protected: return _proxy->query_mutations_locally(_schema, cmd, _partition_range, timeout, _trace_state); } else { tracing::trace(_trace_state, "read_mutation_data: sending a message to /{}", ep); - return ser::storage_proxy_rpc_verbs::send_read_mutation_data(&_proxy->_messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *cmd, _partition_range).then([this, ep](rpc::tuple> result_and_hit_rate) { - auto&& [result, hit_rate] = result_and_hit_rate; + return ser::storage_proxy_rpc_verbs::send_read_mutation_data(&_proxy->_messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *cmd, _partition_range).then([this, ep](rpc::tuple, rpc::optional> result_and_hit_rate) { + 
auto&& [result, hit_rate, opt_exception] = result_and_hit_rate; + if (opt_exception.has_value() && *opt_exception) { + return make_exception_future>, cache_temperature>>((*opt_exception).into_exception_ptr()); + } tracing::trace(_trace_state, "read_mutation_data: got response from /{}", ep); return make_ready_future>, cache_temperature>>(rpc::tuple(make_foreign(::make_lw_shared(std::move(result))), hit_rate.value_or(cache_temperature::invalid()))); }); @@ -3567,11 +3711,14 @@ protected: : query::result_options{query::result_request::only_result, query::digest_algorithm::none}; if (fbu::is_me(ep)) { tracing::trace(_trace_state, "read_data: querying locally"); - return _proxy->query_result_local(_schema, _cmd, _partition_range, opts, _trace_state, timeout); + return _proxy->query_result_local(_schema, _cmd, _partition_range, opts, _trace_state, timeout, adjust_rate_limit_for_local_operation(_rate_limit_info)); } else { tracing::trace(_trace_state, "read_data: sending a message to /{}", ep); - return ser::storage_proxy_rpc_verbs::send_read_data(&_proxy->_messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *_cmd, _partition_range, opts.digest_algo).then([this, ep](rpc::tuple> result_hit_rate) { - auto&& [result, hit_rate] = result_hit_rate; + return ser::storage_proxy_rpc_verbs::send_read_data(&_proxy->_messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *_cmd, _partition_range, opts.digest_algo, _rate_limit_info).then([this, ep](rpc::tuple, rpc::optional> result_hit_rate) { + auto&& [result, hit_rate, opt_exception] = result_hit_rate; + if (opt_exception.has_value() && *opt_exception) { + return make_exception_future>, cache_temperature>>((*opt_exception).into_exception_ptr()); + } tracing::trace(_trace_state, "read_data: got response from /{}", ep); return make_ready_future>, cache_temperature>>(rpc::tuple(make_foreign(::make_lw_shared(std::move(result))), hit_rate.value_or(cache_temperature::invalid()))); }); @@ -3582,13 +3729,16 @@ protected: if 
(fbu::is_me(ep)) { tracing::trace(_trace_state, "read_digest: querying locally"); return _proxy->query_result_local_digest(_schema, _cmd, _partition_range, _trace_state, - timeout, digest_algorithm(*_proxy)); + timeout, digest_algorithm(*_proxy), adjust_rate_limit_for_local_operation(_rate_limit_info)); } else { tracing::trace(_trace_state, "read_digest: sending a message to /{}", ep); return ser::storage_proxy_rpc_verbs::send_read_digest(&_proxy->_messaging, netw::messaging_service::msg_addr{ep, 0}, timeout, *_cmd, - _partition_range, digest_algorithm(*_proxy)).then([this, ep] ( - rpc::tuple, rpc::optional> digest_timestamp_hit_rate) { - auto&& [d, t, hit_rate] = digest_timestamp_hit_rate; + _partition_range, digest_algorithm(*_proxy), _rate_limit_info).then([this, ep] ( + rpc::tuple, rpc::optional, rpc::optional> digest_timestamp_hit_rate) { + auto&& [d, t, hit_rate, opt_exception] = digest_timestamp_hit_rate; + if (opt_exception.has_value() && *opt_exception) { + return make_exception_future>((*opt_exception).into_exception_ptr()); + } tracing::trace(_trace_state, "read_digest: got response from /{}", ep); return make_ready_future>(rpc::tuple(d, t ? 
t.value() : api::missing_timestamp, hit_rate.value_or(cache_temperature::invalid()))); }); @@ -3875,9 +4025,9 @@ private: class never_speculating_read_executor : public abstract_read_executor { public: - never_speculating_read_executor(schema_ptr s, lw_shared_ptr cf, shared_ptr proxy, lw_shared_ptr cmd, dht::partition_range pr, db::consistency_level cl, inet_address_vector_replica_set targets, tracing::trace_state_ptr trace_state, - service_permit permit) : - abstract_read_executor(std::move(s), std::move(cf), std::move(proxy), std::move(cmd), std::move(pr), cl, 0, std::move(targets), std::move(trace_state), std::move(permit)) { + never_speculating_read_executor(schema_ptr s, lw_shared_ptr cf, shared_ptr proxy, lw_shared_ptr cmd, dht::partition_range pr, db::consistency_level cl, inet_address_vector_replica_set targets, tracing::trace_state_ptr trace_state, service_permit permit, + db::per_partition_rate_limit::info rate_limit_info) : + abstract_read_executor(std::move(s), std::move(cf), std::move(proxy), std::move(cmd), std::move(pr), cl, 0, std::move(targets), std::move(trace_state), std::move(permit), rate_limit_info) { _block_for = _targets.size(); } }; @@ -3964,7 +4114,7 @@ db::read_repair_decision storage_proxy::new_read_repair_decision(const schema& s return db::read_repair_decision::NONE; } -::shared_ptr storage_proxy::get_read_executor(lw_shared_ptr cmd, +result<::shared_ptr> storage_proxy::get_read_executor(lw_shared_ptr cmd, schema_ptr schema, dht::partition_range pr, db::consistency_level cl, @@ -4009,24 +4159,36 @@ db::read_repair_decision storage_proxy::new_read_repair_decision(const schema& s size_t block_for = db::block_for(ks, cl); auto p = shared_from_this(); + + db::per_partition_rate_limit::info rate_limit_info; + if (cmd->allow_limit && _db.local().can_apply_per_partition_rate_limit(*schema, db::operation_type::read)) { + auto r_rate_limit_info = choose_rate_limit_info(_db.local(), !is_read_non_local, db::operation_type::read, schema, token, 
trace_state); + if (!r_rate_limit_info) { + return std::move(r_rate_limit_info).as_failure(); + } + rate_limit_info = r_rate_limit_info.value(); + } else { + slogger.trace("Operation is not rate limited"); + } + // Speculative retry is disabled *OR* there are simply no extra replicas to speculate. if (retry_type == speculative_retry::type::NONE || block_for == all_replicas.size() || (repair_decision == db::read_repair_decision::DC_LOCAL && is_datacenter_local(cl) && block_for == target_replicas.size())) { - return ::make_shared(schema, cf, p, cmd, std::move(pr), cl, std::move(target_replicas), std::move(trace_state), std::move(permit)); + return ::make_shared(schema, cf, p, cmd, std::move(pr), cl, std::move(target_replicas), std::move(trace_state), std::move(permit), rate_limit_info); } if (target_replicas.size() == all_replicas.size()) { // CL.ALL, RRD.GLOBAL or RRD.DC_LOCAL and a single-DC. // We are going to contact every node anyway, so ask for 2 full data requests instead of 1, for redundancy // (same amount of requests in total, but we turn 1 digest request into a full blown data request). - return ::make_shared(schema, cf, p, cmd, std::move(pr), cl, block_for, std::move(target_replicas), std::move(trace_state), std::move(permit)); + return ::make_shared(schema, cf, p, cmd, std::move(pr), cl, block_for, std::move(target_replicas), std::move(trace_state), std::move(permit), rate_limit_info); } // RRD.NONE or RRD.DC_LOCAL w/ multiple DCs. 
if (target_replicas.size() == block_for) { // If RRD.DC_LOCAL extra replica may already be present if (is_datacenter_local(cl) && !db::is_local(extra_replica)) { slogger.trace("read executor no extra target to speculate"); - return ::make_shared(schema, cf, p, cmd, std::move(pr), cl, std::move(target_replicas), std::move(trace_state), std::move(permit)); + return ::make_shared(schema, cf, p, cmd, std::move(pr), cl, std::move(target_replicas), std::move(trace_state), std::move(permit), rate_limit_info); } else { target_replicas.push_back(extra_replica); slogger.trace("creating read executor with extra target {}", extra_replica); @@ -4034,15 +4196,15 @@ db::read_repair_decision storage_proxy::new_read_repair_decision(const schema& s } if (retry_type == speculative_retry::type::ALWAYS) { - return ::make_shared(schema, cf, p, cmd, std::move(pr), cl, block_for, std::move(target_replicas), std::move(trace_state), std::move(permit)); + return ::make_shared(schema, cf, p, cmd, std::move(pr), cl, block_for, std::move(target_replicas), std::move(trace_state), std::move(permit), rate_limit_info); } else {// PERCENTILE or CUSTOM. 
- return ::make_shared(schema, cf, p, cmd, std::move(pr), cl, block_for, std::move(target_replicas), std::move(trace_state), std::move(permit)); + return ::make_shared(schema, cf, p, cmd, std::move(pr), cl, block_for, std::move(target_replicas), std::move(trace_state), std::move(permit), rate_limit_info); } } future> -storage_proxy::query_result_local_digest(schema_ptr s, lw_shared_ptr cmd, const dht::partition_range& pr, tracing::trace_state_ptr trace_state, storage_proxy::clock_type::time_point timeout, query::digest_algorithm da) { - return query_result_local(std::move(s), std::move(cmd), pr, query::result_options::only_digest(da), std::move(trace_state), timeout).then([] (rpc::tuple>, cache_temperature> result_and_hit_rate) { +storage_proxy::query_result_local_digest(schema_ptr s, lw_shared_ptr cmd, const dht::partition_range& pr, tracing::trace_state_ptr trace_state, storage_proxy::clock_type::time_point timeout, query::digest_algorithm da, db::per_partition_rate_limit::info rate_limit_info) { + return query_result_local(std::move(s), std::move(cmd), pr, query::result_options::only_digest(da), std::move(trace_state), timeout, rate_limit_info).then([] (rpc::tuple>, cache_temperature> result_and_hit_rate) { auto&& [result, hit_rate] = result_and_hit_rate; return make_ready_future>(rpc::tuple(*result->digest(), result->last_modified(), hit_rate)); }); @@ -4050,15 +4212,15 @@ storage_proxy::query_result_local_digest(schema_ptr s, lw_shared_ptr>, cache_temperature>> storage_proxy::query_result_local(schema_ptr s, lw_shared_ptr cmd, const dht::partition_range& pr, query::result_options opts, - tracing::trace_state_ptr trace_state, storage_proxy::clock_type::time_point timeout) { + tracing::trace_state_ptr trace_state, storage_proxy::clock_type::time_point timeout, db::per_partition_rate_limit::info rate_limit_info) { cmd->slice.options.set_if(opts.request != query::result_request::only_result); if (pr.is_singular()) { unsigned shard = dht::shard_of(*s, 
pr.start()->value().token()); get_stats().replica_cross_shard_ops += shard != this_shard_id(); - return _db.invoke_on(shard, _read_smp_service_group, [gs = global_schema_ptr(s), prv = dht::partition_range_vector({pr}) /* FIXME: pr is copied */, cmd, opts, timeout, gt = tracing::global_trace_state_ptr(std::move(trace_state))] (replica::database& db) mutable { + return _db.invoke_on(shard, _read_smp_service_group, [gs = global_schema_ptr(s), prv = dht::partition_range_vector({pr}) /* FIXME: pr is copied */, cmd, opts, timeout, gt = tracing::global_trace_state_ptr(std::move(trace_state)), rate_limit_info] (replica::database& db) mutable { auto trace_state = gt.get(); tracing::trace(trace_state, "Start querying singular range {}", prv.front()); - return db.query(gs, *cmd, opts, prv, trace_state, timeout).then([trace_state](std::tuple, cache_temperature>&& f_ht) { + return db.query(gs, *cmd, opts, prv, trace_state, timeout, rate_limit_info).then([trace_state](std::tuple, cache_temperature>&& f_ht) { auto&& [f, ht] = f_ht; tracing::trace(trace_state, "Querying is done"); return make_ready_future>, cache_temperature>>(rpc::tuple(make_foreign(std::move(f)), ht)); @@ -4092,6 +4254,14 @@ void storage_proxy::handle_read_error(std::variant([&] (const auto& ex) { + slogger.debug("Read was rate limited"); + if (ex.rejected_by_coordinator) { + get_stats().read_rate_limited_by_coordinator.mark(); + } else { + get_stats().read_rate_limited_by_replicas.mark(); + } + return bo::success(); }), utils::result_catch_dots([&] (auto&& handle) { slogger.debug("Error during read query {}", handle.as_inner()); return bo::success(); @@ -4126,11 +4296,14 @@ storage_proxy::query_singular(lw_shared_ptr cmd, const auto replicas = it == query_options.preferred_replicas.end() ? 
inet_address_vector_replica_set{} : replica_ids_to_endpoints(*tmptr, it->second); - auto read_executor = get_read_executor(cmd, schema, std::move(pr), cl, repair_decision, - query_options.trace_state, replicas, is_read_non_local, - query_options.permit); + auto r_read_executor = get_read_executor(cmd, schema, std::move(pr), cl, repair_decision, + query_options.trace_state, replicas, is_read_non_local, + query_options.permit); + if (!r_read_executor) { + co_return std::move(r_read_executor).as_failure(); + } - exec.emplace_back(read_executor, std::move(token_range)); + exec.emplace_back(r_read_executor.value(), std::move(token_range)); } if (is_read_non_local) { get_stats().reads_coordinator_outside_replica_set++; @@ -4338,7 +4511,7 @@ storage_proxy::query_partition_key_range_concurrent(storage_proxy::clock_type::t throw; } - exec.push_back(::make_shared(schema, cf.shared_from_this(), p, cmd, std::move(range), cl, std::move(filtered_endpoints), trace_state, permit)); + exec.push_back(::make_shared(schema, cf.shared_from_this(), p, cmd, std::move(range), cl, std::move(filtered_endpoints), trace_state, permit, std::monostate())); ranges_per_exec.emplace(exec.back().get(), std::move(merged_ranges)); } @@ -4898,7 +5071,7 @@ void storage_proxy::init_messaging_service(shared_ptr mm) { _mm = std::move(mm); ser::storage_proxy_rpc_verbs::register_counter_mutation(&_messaging, std::bind_front(&storage_proxy::handle_counter_mutation, this)); ser::storage_proxy_rpc_verbs::register_mutation(&_messaging, std::bind_front(&storage_proxy::receive_mutation_handler, this, _write_smp_service_group)); - ser::storage_proxy_rpc_verbs::register_hint_mutation(&_messaging, std::bind_front(&storage_proxy::receive_mutation_handler, this, _hints_write_smp_service_group)); + ser::storage_proxy_rpc_verbs::register_hint_mutation(&_messaging, [this] (Args&&... 
args) { return receive_mutation_handler(_hints_write_smp_service_group, std::forward(args)..., std::monostate()); }); ser::storage_proxy_rpc_verbs::register_paxos_learn(&_messaging, std::bind_front(&storage_proxy::handle_paxos_learn, this)); ser::storage_proxy_rpc_verbs::register_mutation_done(&_messaging, std::bind_front(&storage_proxy::handle_mutation_done, this)); ser::storage_proxy_rpc_verbs::register_mutation_failed(&_messaging, std::bind_front(&storage_proxy::handle_mutation_failed, this)); @@ -4960,10 +5133,15 @@ storage_proxy::handle_write(netw::messaging_service::msg_addr src_addr, rpc::opt timeout = *t; } - return do_with(std::move(in), get_local_shared_storage_proxy(), size_t(0), [this, src_addr = std::move(src_addr), + struct errors_info { + size_t count = 0; + replica::exception_variant local; + }; + + return do_with(std::move(in), get_local_shared_storage_proxy(), errors_info{}, [this, src_addr = std::move(src_addr), forward = std::move(forward), reply_to, shard, response_id, trace_state_ptr, timeout, schema_version, apply_fn = std::move(apply_fn), forward_fn = std::move(forward_fn)] - (const auto& m, shared_ptr& p, size_t& errors) mutable { + (const auto& m, shared_ptr& p, errors_info& errors) mutable { ++p->get_stats().received_mutations; p->get_stats().forwarded_mutations += forward.size(); return when_all( @@ -4990,14 +5168,15 @@ storage_proxy::handle_write(netw::messaging_service::msg_addr src_addr, rpc::opt f.ignore_ready_future(); }); }).handle_exception([reply_to, shard, &p, &errors] (std::exception_ptr eptr) { + errors.count++; + errors.local = replica::try_encode_replica_exception(eptr); seastar::log_level l = seastar::log_level::warn; - if (is_timeout_exception(eptr)) { - // ignore timeouts so that logs are not flooded. - // database total_writes_timedout counter was incremented. + if (is_timeout_exception(eptr) || std::holds_alternative(errors.local.reason)) { + // ignore timeouts and rate limit exceptions so that logs are not flooded. 
+ // database's total_writes_timedout or total_writes_rate_limited counter was incremented. l = seastar::log_level::debug; } slogger.log(l, "Failed to apply mutation from {}#{}: {}", reply_to, shard, eptr); - errors++; }), parallel_for_each(forward.begin(), forward.end(), [reply_to, shard, response_id, &m, &p, trace_state_ptr, timeout, &errors, forward_fn = std::move(forward_fn)] (gms::inet_address forward) { @@ -5007,7 +5186,7 @@ storage_proxy::handle_write(netw::messaging_service::msg_addr src_addr, rpc::opt .then_wrapped([&p, &errors] (future<> f) { if (f.failed()) { ++p->get_stats().forwarding_errors; - errors++; + errors.count++; }; f.ignore_ready_future(); }); @@ -5015,14 +5194,15 @@ storage_proxy::handle_write(netw::messaging_service::msg_addr src_addr, rpc::opt ).then_wrapped([trace_state_ptr, reply_to, shard, response_id, &errors, &p] (future, future<>>>&& f) { // ignore results, since we'll be returning them via MUTATION_DONE/MUTATION_FAILURE verbs auto fut = make_ready_future(netw::messaging_service::no_wait()); - if (errors) { - tracing::trace(trace_state_ptr, "Sending mutation_failure with {} failures to /{}", errors, reply_to); + if (errors.count) { + tracing::trace(trace_state_ptr, "Sending mutation_failure with {} failures to /{}", errors.count, reply_to); fut = ser::storage_proxy_rpc_verbs::send_mutation_failed(&p->_messaging, netw::messaging_service::msg_addr{reply_to, shard}, shard, response_id, - errors, - p->get_view_update_backlog()).then_wrapped([] (future<> f) { + errors.count, + p->get_view_update_backlog(), + std::move(errors.local)).then_wrapped([] (future<> f) { f.ignore_ready_future(); return netw::messaging_service::no_wait(); }); @@ -5036,22 +5216,24 @@ storage_proxy::handle_write(netw::messaging_service::msg_addr src_addr, rpc::opt future storage_proxy::receive_mutation_handler(smp_service_group smp_grp, const rpc::client_info& cinfo, rpc::opt_time_point t, frozen_mutation in, inet_address_vector_replica_set forward, - 
gms::inet_address reply_to, unsigned shard, storage_proxy::response_id_type response_id, rpc::optional> trace_info) { + gms::inet_address reply_to, unsigned shard, storage_proxy::response_id_type response_id, rpc::optional> trace_info, + rpc::optional rate_limit_info_opt) { tracing::trace_state_ptr trace_state_ptr; auto src_addr = netw::messaging_service::get_source(cinfo); + auto rate_limit_info = rate_limit_info_opt.value_or(std::monostate()); utils::UUID schema_version = in.schema_version(); return handle_write(src_addr, t, schema_version, std::move(in), std::move(forward), reply_to, shard, response_id, trace_info ? *trace_info : std::nullopt, - /* apply_fn */ [smp_grp] (shared_ptr& p, tracing::trace_state_ptr tr_state, schema_ptr s, const frozen_mutation& m, + /* apply_fn */ [smp_grp, rate_limit_info] (shared_ptr& p, tracing::trace_state_ptr tr_state, schema_ptr s, const frozen_mutation& m, clock_type::time_point timeout) { - return p->mutate_locally(std::move(s), m, std::move(tr_state), db::commitlog::force_sync::no, timeout, smp_grp); + return p->mutate_locally(std::move(s), m, std::move(tr_state), db::commitlog::force_sync::no, timeout, smp_grp, rate_limit_info); }, - /* forward_fn */ [] (shared_ptr& p, netw::messaging_service::msg_addr addr, clock_type::time_point timeout, const frozen_mutation& m, + /* forward_fn */ [rate_limit_info] (shared_ptr& p, netw::messaging_service::msg_addr addr, clock_type::time_point timeout, const frozen_mutation& m, gms::inet_address reply_to, unsigned shard, response_id_type response_id, std::optional trace_info) { return ser::storage_proxy_rpc_verbs::send_mutation(&p->_messaging, - addr, timeout, m, {}, reply_to, shard, response_id, std::move(trace_info)); + addr, timeout, m, {}, reply_to, shard, response_id, std::move(trace_info), rate_limit_info); }); } @@ -5087,17 +5269,47 @@ storage_proxy::handle_mutation_done(const rpc::client_info& cinfo, unsigned shar } future -storage_proxy::handle_mutation_failed(const 
rpc::client_info& cinfo, unsigned shard, storage_proxy::response_id_type response_id, size_t num_failed, rpc::optional backlog) { +storage_proxy::handle_mutation_failed(const rpc::client_info& cinfo, unsigned shard, storage_proxy::response_id_type response_id, size_t num_failed, rpc::optional backlog, rpc::optional exception) { auto& from = cinfo.retrieve_auxiliary("baddr"); get_stats().replica_cross_shard_ops += shard != this_shard_id(); - return container().invoke_on(shard, _write_ack_smp_service_group, [from, response_id, num_failed, backlog = std::move(backlog)] (storage_proxy& sp) mutable { - sp.got_failure_response(response_id, from, num_failed, std::move(backlog), error::FAILURE, std::nullopt); + return container().invoke_on(shard, _write_ack_smp_service_group, [from, response_id, num_failed, backlog = std::move(backlog), exception = std::move(exception)] (storage_proxy& sp) mutable { + error err = error::FAILURE; + if (exception) { + err = std::visit([] (Ex&) { + if constexpr (std::is_same_v) { + return error::RATE_LIMIT; + } else if constexpr (std::is_same_v || std::is_same_v) { + return error::FAILURE; + } + }, exception->reason); + } + sp.got_failure_response(response_id, from, num_failed, std::move(backlog), err, std::nullopt); return netw::messaging_service::no_wait(); }); } -future>, cache_temperature>> -storage_proxy::handle_read_data(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional oda) { +template +future> storage_proxy::encode_replica_exception_for_rpc(future>&& f, auto&& default_tuple_maker) { + using original_std_tuple_type = std::tuple; + using final_tuple_type = rpc::tuple; + + if (!f.failed()) { + return make_ready_future(std::tuple_cat(original_std_tuple_type(f.get()), std::tuple(replica::exception_variant()))); + } + + std::exception_ptr eptr = f.get_exception(); + if (features().typed_errors_in_read_rpc) { + replica::exception_variant ex = 
replica::try_encode_replica_exception(eptr); + if (ex) { + return make_ready_future(std::tuple_cat(default_tuple_maker(), std::tuple(std::move(ex)))); + } + } + + return make_exception_future(std::move(eptr)); +} + +future>, cache_temperature, replica::exception_variant>> +storage_proxy::handle_read_data(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional oda, rpc::optional rate_limit_info_opt) { tracing::trace_state_ptr trace_state_ptr; auto src_addr = netw::messaging_service::get_source(cinfo); if (cmd.trace_info) { @@ -5107,14 +5319,15 @@ storage_proxy::handle_read_data(const rpc::client_info& cinfo, rpc::opt_time_poi } auto da = oda.value_or(query::digest_algorithm::MD5); auto sp = get_local_shared_storage_proxy(); + auto rate_limit_info = rate_limit_info_opt.value_or(std::monostate()); if (!cmd.max_result_size) { auto& cfg = sp->local_db().get_config(); cmd.max_result_size.emplace(cfg.max_memory_for_unlimited_query_soft_limit(), cfg.max_memory_for_unlimited_query_hard_limit()); } - return do_with(std::move(pr), std::move(sp), std::move(trace_state_ptr), [this, &cinfo, cmd = make_lw_shared(std::move(cmd)), src_addr = std::move(src_addr), da, t] (::compat::wrapping_partition_range& pr, shared_ptr& p, tracing::trace_state_ptr& trace_state_ptr) mutable { + return do_with(std::move(pr), std::move(sp), std::move(trace_state_ptr), [this, &cinfo, cmd = make_lw_shared(std::move(cmd)), src_addr = std::move(src_addr), da, t, rate_limit_info] (::compat::wrapping_partition_range& pr, shared_ptr& p, tracing::trace_state_ptr& trace_state_ptr) mutable { p->get_stats().replica_data_reads++; auto src_ip = src_addr.addr; - return _mm->get_schema_for_read(cmd->schema_version, std::move(src_addr), p->_messaging).then([cmd, da, &pr, &p, &trace_state_ptr, t] (schema_ptr s) { + return _mm->get_schema_for_read(cmd->schema_version, std::move(src_addr), p->_messaging).then([cmd, da, &pr, &p, 
&trace_state_ptr, t, rate_limit_info] (schema_ptr s) { auto pr2 = ::compat::unwrap(std::move(pr), *s); if (pr2.second) { // this function assumes singular queries but doesn't validate @@ -5124,14 +5337,15 @@ storage_proxy::handle_read_data(const rpc::client_info& cinfo, rpc::opt_time_poi opts.digest_algo = da; opts.request = da == query::digest_algorithm::none ? query::result_request::only_result : query::result_request::result_and_digest; auto timeout = t ? *t : db::no_timeout; - return p->query_result_local(std::move(s), cmd, std::move(pr2.first), opts, trace_state_ptr, timeout); - }).finally([&trace_state_ptr, src_ip] () mutable { + return p->query_result_local(std::move(s), cmd, std::move(pr2.first), opts, trace_state_ptr, timeout, rate_limit_info); + }).then_wrapped([this, &trace_state_ptr, src_ip] (future>, cache_temperature>> f) mutable { tracing::trace(trace_state_ptr, "read_data handling is done, sending a response to /{}", src_ip); + return encode_replica_exception_for_rpc(std::move(f), [] { return std::make_tuple(foreign_ptr(make_lw_shared()), cache_temperature::invalid()); }); }); }); } -future>, cache_temperature>> +future>, cache_temperature, replica::exception_variant>> storage_proxy::handle_read_mutation_data(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr) { tracing::trace_state_ptr trace_state_ptr; auto src_addr = netw::messaging_service::get_source(cinfo); @@ -5158,14 +5372,15 @@ storage_proxy::handle_read_mutation_data(const rpc::client_info& cinfo, rpc::opt unwrapped = ::compat::unwrap(std::move(pr), *s); auto timeout = t ? 
*t : db::no_timeout; return p->query_mutations_locally(std::move(s), std::move(cmd), unwrapped, timeout, trace_state_ptr); - }).finally([&trace_state_ptr, src_ip] () mutable { + }).then_wrapped([this, &trace_state_ptr, src_ip] (future>, cache_temperature>> f) mutable { tracing::trace(trace_state_ptr, "read_mutation_data handling is done, sending a response to /{}", src_ip); + return encode_replica_exception_for_rpc(std::move(f), [] { return std::make_tuple(foreign_ptr(make_lw_shared()), cache_temperature::invalid()); }); }); }); } -future> -storage_proxy::handle_read_digest(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional oda) { +future> +storage_proxy::handle_read_digest(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional oda, rpc::optional rate_limit_info_opt) { tracing::trace_state_ptr trace_state_ptr; auto src_addr = netw::messaging_service::get_source(cinfo); if (cmd.trace_info) { @@ -5174,22 +5389,24 @@ storage_proxy::handle_read_digest(const rpc::client_info& cinfo, rpc::opt_time_p tracing::trace(trace_state_ptr, "read_digest: message received from /{}", src_addr.addr); } auto da = oda.value_or(query::digest_algorithm::MD5); + auto rate_limit_info = rate_limit_info_opt.value_or(std::monostate()); if (!cmd.max_result_size) { cmd.max_result_size.emplace(cinfo.retrieve_auxiliary("max_result_size")); } - return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), [this, &cinfo, cmd = make_lw_shared(std::move(cmd)), src_addr = std::move(src_addr), da, t] (::compat::wrapping_partition_range& pr, shared_ptr& p, tracing::trace_state_ptr& trace_state_ptr) mutable { + return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), [this, &cinfo, cmd = make_lw_shared(std::move(cmd)), src_addr = std::move(src_addr), da, t, rate_limit_info] 
(::compat::wrapping_partition_range& pr, shared_ptr& p, tracing::trace_state_ptr& trace_state_ptr) mutable { p->get_stats().replica_digest_reads++; auto src_ip = src_addr.addr; - return _mm->get_schema_for_read(cmd->schema_version, std::move(src_addr), p->_messaging).then([cmd, &pr, &p, &trace_state_ptr, t, da] (schema_ptr s) { + return _mm->get_schema_for_read(cmd->schema_version, std::move(src_addr), p->_messaging).then([cmd, &pr, &p, &trace_state_ptr, t, da, rate_limit_info] (schema_ptr s) { auto pr2 = ::compat::unwrap(std::move(pr), *s); if (pr2.second) { // this function assumes singular queries but doesn't validate throw std::runtime_error("READ_DIGEST called with wrapping range"); } auto timeout = t ? *t : db::no_timeout; - return p->query_result_local_digest(std::move(s), cmd, std::move(pr2.first), trace_state_ptr, timeout, da); - }).finally([&trace_state_ptr, src_ip] () mutable { + return p->query_result_local_digest(std::move(s), cmd, std::move(pr2.first), trace_state_ptr, timeout, da, rate_limit_info); + }).then_wrapped([this, &trace_state_ptr, src_ip] (future> f) mutable { tracing::trace(trace_state_ptr, "read_digest handling is done, sending a response to /{}", src_ip); + return encode_replica_exception_for_rpc(std::move(f), [] { return std::make_tuple(query::result_digest(), api::missing_timestamp, cache_temperature::invalid()); }); }); }); } diff --git a/service/storage_proxy.hh b/service/storage_proxy.hh index a7eebb3933..df8c0738a3 100644 --- a/service/storage_proxy.hh +++ b/service/storage_proxy.hh @@ -46,6 +46,8 @@ #include "partition_range_compat.hh" #include "exceptions/exceptions.hh" #include "exceptions/coordinator_result.hh" +#include "replica/exceptions.hh" +#include "db/per_partition_rate_limit_info.hh" class reconcilable_result; class frozen_mutation_and_schema; @@ -126,6 +128,7 @@ public: NONE, TIMEOUT, FAILURE, + RATE_LIMIT, }; template using result = exceptions::coordinator_result; @@ -268,6 +271,7 @@ private: tracing::trace_state_ptr, 
service_permit, bool, + db::allow_per_partition_rate_limit, lw_shared_ptr> _mutate_stage; netw::connection_drop_slot_t _connection_dropped; netw::connection_drop_registration_t _condrop_registration; @@ -307,18 +311,18 @@ private: void got_failure_response(response_id_type id, gms::inet_address from, size_t count, std::optional backlog, error err, std::optional msg); future> response_wait(response_id_type id, clock_type::time_point timeout); ::shared_ptr& get_write_response_handler(storage_proxy::response_id_type id); - response_id_type create_write_response_handler_helper(schema_ptr s, const dht::token& token, + result create_write_response_handler_helper(schema_ptr s, const dht::token& token, std::unique_ptr mh, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, - service_permit permit); - response_id_type create_write_response_handler(replica::keyspace& ks, db::consistency_level cl, db::write_type type, std::unique_ptr m, inet_address_vector_replica_set targets, - const inet_address_vector_topology_change& pending_endpoints, inet_address_vector_topology_change, tracing::trace_state_ptr tr_state, storage_proxy::write_stats& stats, service_permit permit); - response_id_type create_write_response_handler(const mutation&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit); - response_id_type create_write_response_handler(const hint_wrapper&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit); - response_id_type create_write_response_handler(const std::unordered_map>&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit); - response_id_type create_write_response_handler(const std::tuple, schema_ptr, shared_ptr, dht::token>& proposal, - db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit); - response_id_type 
create_write_response_handler(const std::tuple, schema_ptr, dht::token, inet_address_vector_replica_set>& meta, - db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit); + service_permit permit, db::allow_per_partition_rate_limit allow_limit); + result create_write_response_handler(replica::keyspace& ks, db::consistency_level cl, db::write_type type, std::unique_ptr m, inet_address_vector_replica_set targets, + const inet_address_vector_topology_change& pending_endpoints, inet_address_vector_topology_change, tracing::trace_state_ptr tr_state, storage_proxy::write_stats& stats, service_permit permit, db::per_partition_rate_limit::info rate_limit_info); + result create_write_response_handler(const mutation&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit); + result create_write_response_handler(const hint_wrapper&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit); + result create_write_response_handler(const std::unordered_map>&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit); + result create_write_response_handler(const std::tuple, schema_ptr, shared_ptr, dht::token>& proposal, + db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit); + result create_write_response_handler(const std::tuple, schema_ptr, dht::token, inet_address_vector_replica_set>& meta, + db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit); void register_cdc_operation_result_tracker(const storage_proxy::unique_response_handler_vector& ids, 
lw_shared_ptr tracker); void send_to_live_endpoints(response_id_type response_id, clock_type::time_point timeout); template @@ -331,7 +335,7 @@ private: static void sort_endpoints_by_proximity(inet_address_vector_replica_set& eps); inet_address_vector_replica_set get_live_sorted_endpoints(replica::keyspace& ks, const dht::token& token) const; db::read_repair_decision new_read_repair_decision(const schema& s); - ::shared_ptr get_read_executor(lw_shared_ptr cmd, + result<::shared_ptr> get_read_executor(lw_shared_ptr cmd, schema_ptr schema, dht::partition_range pr, db::consistency_level cl, @@ -343,11 +347,13 @@ private: future>, cache_temperature>> query_result_local(schema_ptr, lw_shared_ptr cmd, const dht::partition_range& pr, query::result_options opts, tracing::trace_state_ptr trace_state, - clock_type::time_point timeout); + clock_type::time_point timeout, + db::per_partition_rate_limit::info rate_limit_info); future> query_result_local_digest(schema_ptr, lw_shared_ptr cmd, const dht::partition_range& pr, tracing::trace_state_ptr trace_state, clock_type::time_point timeout, - query::digest_algorithm da); + query::digest_algorithm da, + db::per_partition_rate_limit::info rate_limit_info); future> query_partition_key_range(lw_shared_ptr cmd, dht::partition_range_vector partition_ranges, db::consistency_level cl, @@ -376,9 +382,9 @@ private: db::consistency_level cl, coordinator_query_options optional_params); template - future mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, service_permit permit, CreateWriteHandler handler); + future> mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, service_permit permit, CreateWriteHandler handler); template - future mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit); + future> mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, 
tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit); future> mutate_begin(unique_response_handler_vector ids, db::consistency_level cl, tracing::trace_state_ptr trace_state, std::optional timeout_opt = { }); future> mutate_end(future> mutate_result, utils::latency_counter, write_stats& stats, tracing::trace_state_ptr trace_state); future> schedule_repair(std::unordered_map>> diffs, db::consistency_level cl, tracing::trace_state_ptr trace_state, service_permit permit); @@ -386,7 +392,7 @@ private: void unthrottle(); void handle_read_error(std::variant failure, bool range); template - future> mutate_internal(Range mutations, db::consistency_level cl, bool counter_write, tracing::trace_state_ptr tr_state, service_permit permit, std::optional timeout_opt = { }, lw_shared_ptr cdc_tracker = { }); + future> mutate_internal(Range mutations, db::consistency_level cl, bool counter_write, tracing::trace_state_ptr tr_state, service_permit permit, std::optional timeout_opt = { }, lw_shared_ptr cdc_tracker = { }, db::allow_per_partition_rate_limit allow_limit = db::allow_per_partition_rate_limit::no); future>, cache_temperature>> query_nonsingular_mutations_locally( schema_ptr s, lw_shared_ptr cmd, const dht::partition_range_vector&& pr, tracing::trace_state_ptr trace_state, clock_type::time_point timeout); @@ -401,7 +407,7 @@ private: gms::inet_address find_leader_for_counter_update(const mutation& m, db::consistency_level cl); - future> do_mutate(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, bool, lw_shared_ptr cdc_tracker); + future> do_mutate(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, bool, db::allow_per_partition_rate_limit allow_limit, lw_shared_ptr cdc_tracker); future<> send_to_endpoint( std::unique_ptr m, @@ -424,21 +430,24 @@ 
private: void retire_view_response_handlers(noncopyable_function filter_fun); void connection_dropped(gms::inet_address); private: + template + future> encode_replica_exception_for_rpc(future>&& f, auto&& default_tuple_maker); + future<> handle_counter_mutation(const rpc::client_info& cinfo, rpc::opt_time_point t, std::vector fms, db::consistency_level cl, std::optional trace_info); future handle_write(netw::msg_addr src_addr, rpc::opt_time_point t, utils::UUID schema_version, auto in, inet_address_vector_replica_set forward, gms::inet_address reply_to, unsigned shard, storage_proxy::response_id_type response_id, std::optional trace_info, auto&& apply_fn, auto&& forward_fn); future receive_mutation_handler (smp_service_group smp_grp, const rpc::client_info& cinfo, rpc::opt_time_point t, frozen_mutation in, inet_address_vector_replica_set forward, - gms::inet_address reply_to, unsigned shard, storage_proxy::response_id_type response_id, rpc::optional> trace_info); + gms::inet_address reply_to, unsigned shard, storage_proxy::response_id_type response_id, rpc::optional> trace_info, rpc::optional rate_limit_info_opt); future handle_paxos_learn(const rpc::client_info& cinfo, rpc::opt_time_point t, paxos::proposal decision, inet_address_vector_replica_set forward, gms::inet_address reply_to, unsigned shard, storage_proxy::response_id_type response_id, std::optional trace_info); future handle_mutation_done(const rpc::client_info& cinfo, unsigned shard, storage_proxy::response_id_type response_id, rpc::optional backlog); - future handle_mutation_failed(const rpc::client_info& cinfo, unsigned shard, storage_proxy::response_id_type response_id, size_t num_failed, rpc::optional backlog); - future>, cache_temperature>> handle_read_data(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional oda); - future>, cache_temperature>> handle_read_mutation_data(const rpc::client_info& cinfo, rpc::opt_time_point 
t, query::read_command cmd, ::compat::wrapping_partition_range pr); - future> handle_read_digest(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional oda); + future handle_mutation_failed(const rpc::client_info& cinfo, unsigned shard, storage_proxy::response_id_type response_id, size_t num_failed, rpc::optional backlog, rpc::optional exception); + future>, cache_temperature, replica::exception_variant>> handle_read_data(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional oda, rpc::optional rate_limit_info_opt); + future>, cache_temperature, replica::exception_variant>> handle_read_mutation_data(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr); + future> handle_read_digest(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional oda, rpc::optional rate_limit_info_opt); future<> handle_truncate(rpc::opt_time_point timeout, sstring ksname, sstring cfname); future>> handle_paxos_prepare(const rpc::client_info& cinfo, rpc::opt_time_point timeout, query::read_command cmd, partition_key key, utils::UUID ballot, bool only_digest, query::digest_algorithm da, @@ -495,29 +504,29 @@ public: private: // Applies mutation on this node. // Resolves with timed_out_error when timeout is reached. - future<> mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout, smp_service_group smp_grp); + future<> mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout, smp_service_group smp_grp, db::per_partition_rate_limit::info rate_limit_info); // Applies mutation on this node. // Resolves with timed_out_error when timeout is reached. 
future<> mutate_locally(const schema_ptr&, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout, - smp_service_group smp_grp); + smp_service_group smp_grp, db::per_partition_rate_limit::info rate_limit_info); // Applies mutations on this node. // Resolves with timed_out_error when timeout is reached. - future<> mutate_locally(std::vector mutation, tracing::trace_state_ptr tr_state, clock_type::time_point timeout, smp_service_group smp_grp); + future<> mutate_locally(std::vector mutation, tracing::trace_state_ptr tr_state, clock_type::time_point timeout, smp_service_group smp_grp, db::per_partition_rate_limit::info rate_limit_info); public: // Applies mutation on this node. // Resolves with timed_out_error when timeout is reached. - future<> mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout = clock_type::time_point::max()) { - return mutate_locally(m, tr_state, sync, timeout, _write_smp_service_group); + future<> mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout = clock_type::time_point::max(), db::per_partition_rate_limit::info rate_limit_info = std::monostate()) { + return mutate_locally(m, tr_state, sync, timeout, _write_smp_service_group, rate_limit_info); } // Applies mutation on this node. // Resolves with timed_out_error when timeout is reached. 
- future<> mutate_locally(const schema_ptr& s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout = clock_type::time_point::max()) { - return mutate_locally(s, m, tr_state, sync, timeout, _write_smp_service_group); + future<> mutate_locally(const schema_ptr& s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout = clock_type::time_point::max(), db::per_partition_rate_limit::info rate_limit_info = std::monostate()) { + return mutate_locally(s, m, tr_state, sync, timeout, _write_smp_service_group, rate_limit_info); } // Applies mutations on this node. // Resolves with timed_out_error when timeout is reached. - future<> mutate_locally(std::vector mutation, tracing::trace_state_ptr tr_state, clock_type::time_point timeout = clock_type::time_point::max()); + future<> mutate_locally(std::vector mutation, tracing::trace_state_ptr tr_state, clock_type::time_point timeout = clock_type::time_point::max(), db::per_partition_rate_limit::info rate_limit_info = std::monostate()); future<> mutate_hint(const schema_ptr&, const frozen_mutation& m, tracing::trace_state_ptr tr_state, clock_type::time_point timeout = clock_type::time_point::max()); @@ -531,14 +540,14 @@ public: * @param consistency_level the consistency level for the operation * @param tr_state trace state handle */ - future<> mutate(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, bool raw_counters = false); + future<> mutate(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit, bool raw_counters = false); /** * See mutate. 
Does the same, but returns some exceptions * through the result<>, which allows for efficient inspection * of the exception on the exception handling path. */ - future> mutate_result(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, bool raw_counters = false); + future> mutate_result(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit, bool raw_counters = false); paxos_participants get_paxos_participants(const sstring& ks_name, const dht::token& token, db::consistency_level consistency_for_paxos); @@ -547,7 +556,8 @@ public: clock_type::time_point timeout, service_permit permit); future> mutate_with_triggers(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, - bool should_mutate_atomically, tracing::trace_state_ptr tr_state, service_permit permit, bool raw_counters = false); + bool should_mutate_atomically, tracing::trace_state_ptr tr_state, service_permit permit, + db::allow_per_partition_rate_limit allow_limit, bool raw_counters = false); /** * See mutate. Adds additional steps before and after writing a batch. 
diff --git a/service/storage_proxy_stats.hh b/service/storage_proxy_stats.hh index 3d344e0347..66d54f7e0b 100644 --- a/service/storage_proxy_stats.hh +++ b/service/storage_proxy_stats.hh @@ -77,6 +77,8 @@ struct write_stats { utils::timed_rate_moving_average write_unavailables; utils::timed_rate_moving_average write_timeouts; + utils::timed_rate_moving_average write_rate_limited_by_replicas; + utils::timed_rate_moving_average write_rate_limited_by_coordinator; utils::timed_rate_moving_average_and_histogram write; utils::time_estimated_histogram estimated_write; @@ -127,6 +129,8 @@ struct stats : public write_stats { seastar::metrics::metric_groups _metrics; utils::timed_rate_moving_average read_timeouts; utils::timed_rate_moving_average read_unavailables; + utils::timed_rate_moving_average read_rate_limited_by_replicas; + utils::timed_rate_moving_average read_rate_limited_by_coordinator; utils::timed_rate_moving_average range_slice_timeouts; utils::timed_rate_moving_average range_slice_unavailables; diff --git a/test/boost/per_partition_rate_limit_test.cc b/test/boost/per_partition_rate_limit_test.cc new file mode 100644 index 0000000000..0c52e310f3 --- /dev/null +++ b/test/boost/per_partition_rate_limit_test.cc @@ -0,0 +1,109 @@ +#include +#include +#include +#include + +#include "test/lib/cql_test_env.hh" +#include "test/lib/cql_assertions.hh" + +#include "mutation.hh" +#include "service/storage_proxy.hh" + +SEASTAR_TEST_CASE(test_internal_operation_filtering) { + return do_with_cql_env_thread([] (cql_test_env& e) -> future<> { + // The test requires at least two shards + // so that it can test the shard!=coordinator case + BOOST_REQUIRE_GT(smp::count, 1); + + cquery_nofail(e, "CREATE TABLE ks.tbl (pk int PRIMARY KEY) \ + WITH per_partition_rate_limit = {'max_reads_per_second': 1, 'max_writes_per_second': 1}"); + + auto& db = e.db(); + auto& qp = e.qp(); + const auto sptr = db.local().find_schema("ks", "tbl"); + + auto pk = partition_key::from_singular(*sptr, 
int32_t(0)); + + unsigned local_shard = dht::shard_of(*sptr, dht::get_token(*sptr, pk.view())); + unsigned foreign_shard = (local_shard + 1) % smp::count; + + auto run_writes = [&qp, &db, pk] (db::allow_per_partition_rate_limit allow_limit) -> future<> { + BOOST_TEST_MESSAGE("Testing writes"); + + const auto sptr = db.local().find_schema("ks", "tbl"); + auto m = mutation(sptr, partition_key(pk)); + + // Rejection is probabilistic, so try many times + for (int i = 0; i < 100; i++) { + qp.local().proxy().mutate({m}, + db::consistency_level::ALL, + service::storage_proxy::clock_type::now() + std::chrono::seconds(10), + nullptr, + empty_service_permit(), + allow_limit).get(); + } + + return make_ready_future<>(); + }; + + auto run_reads = [&qp, &db, pk] (db::allow_per_partition_rate_limit allow_limit) -> future<> { + BOOST_TEST_MESSAGE("Testing reads"); + + const auto sptr = db.local().find_schema("ks", "tbl"); + auto pk_def = sptr->get_column_definition("pk"); + auto dk = dht::decorate_key(*sptr, partition_key(pk)); + auto selection = cql3::selection::selection::for_columns(sptr, {pk_def}); + auto opts = selection->get_query_options(); + auto partition_slice = query::partition_slice( + {query::clustering_range::make_open_ended_both_sides()}, {}, {}, std::move(opts)); + + auto cmd = make_lw_shared(sptr->id(), sptr->version(), partition_slice, query::max_result_size(1), query::row_limit(1)); + cmd->allow_limit = allow_limit; + + // Rejection is probabilistic, so try many times + for (int i = 0; i < 100; i++) { + qp.local().proxy().query(sptr, + cmd, + {dht::partition_range(dk)}, + db::consistency_level::ALL, + service::storage_proxy::coordinator_query_options( + db::timeout_clock::now() + std::chrono::seconds(10), + empty_service_permit(), + service::client_state::for_internal_calls())).get(); + } + + return make_ready_future<>(); + }; + + auto sgroups = get_scheduling_groups().get(); + + for (unsigned shard : {local_shard, foreign_shard}) { + for (scheduling_group sg : 
{sgroups.statement_scheduling_group, sgroups.streaming_scheduling_group}) { + for (db::allow_per_partition_rate_limit allow_limit : {db::allow_per_partition_rate_limit::yes, db::allow_per_partition_rate_limit::no}) { + // Rate limiting must be explicitly enabled and handled on the correct scheduling group. + const bool expect_limiting = (sg == sgroups.statement_scheduling_group) && bool(allow_limit); + + BOOST_TEST_MESSAGE(format("Test config, shard: {}, scheduling_group: {}, allow_limit: {}, expect_limiting: {}", + (shard == local_shard) ? "local" : "foreign", + (sg == sgroups.statement_scheduling_group) ? "statement" : "streaming", + allow_limit, + expect_limiting)); + + smp::submit_to(shard, [&] () mutable { + return seastar::async(thread_attributes{sg}, [&] { + if (expect_limiting) { + BOOST_REQUIRE_THROW(run_writes(allow_limit).get(), exceptions::rate_limit_exception); + BOOST_REQUIRE_THROW(run_reads(allow_limit).get(), exceptions::rate_limit_exception); + } else { + BOOST_REQUIRE_NO_THROW(run_writes(allow_limit).get()); + BOOST_REQUIRE_NO_THROW(run_reads(allow_limit).get()); + } + }); + }).get(); + } + } + } + + return make_ready_future<>(); + }); +} \ No newline at end of file diff --git a/test/boost/rate_limiter_test.cc b/test/boost/rate_limiter_test.cc new file mode 100644 index 0000000000..f54c4319ea --- /dev/null +++ b/test/boost/rate_limiter_test.cc @@ -0,0 +1,130 @@ +/* + * Copyright (C) 2022-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "db/rate_limiter.hh" + +using namespace seastar; +using test_rate_limiter = db::generic_rate_limiter; + +future<> step_seconds(int seconds) { + for (int i = 0; i < seconds; i++) { + // The rate limiter's timer executes periodically every second + // and we want the timer to run `seconds` times. 
+ // Because `manual_clock::advance` executes each timer only once + // even if they reschedule, we cannot just advance by requested + // number of seconds - instead, we must advance multiple times + // by one second. + manual_clock::advance(std::chrono::seconds(1)); + co_await yield(); + } +} + +SEASTAR_TEST_CASE(test_rate_limiter_no_rejections_on_sequential) { + const uint64_t token_count = 1000 * 1000; + const uint64_t limit = 1; + test_rate_limiter::label lbl; + + test_rate_limiter limiter; + + for (uint64_t token = 0; token < token_count; token++) { + BOOST_REQUIRE_LE(limiter.increase_and_get_counter(lbl, token), 1); + co_await maybe_yield(); + } +} + +SEASTAR_TEST_CASE(test_rate_limiter_partition_label_separation) { + const uint64_t token_count = 30; + const uint64_t repeat_count = 10; + std::vector labels{3}; + + test_rate_limiter limiter; + + for (uint64_t i = 0; i < repeat_count; i++) { + for (uint64_t token = 0; token < token_count; token++) { + for (auto& l : labels) { + BOOST_REQUIRE_EQUAL(limiter.increase_and_get_counter(l, token), i + 1); + co_await maybe_yield(); + } + } + } +} + +SEASTAR_TEST_CASE(test_rate_limiter_halving_over_time) { + test_rate_limiter::label lbl; + test_rate_limiter limiter; + + for (int i = 0; i < 16; i++) { + limiter.increase_and_get_counter(lbl, 0); + } + + // Should be cut in half + co_await step_seconds(1); + BOOST_REQUIRE_EQUAL(limiter.increase_and_get_counter(lbl, 0), (16 / 2) + 1); + + // Should decrease four times (9 -> 2) + co_await step_seconds(2); + BOOST_REQUIRE_EQUAL(limiter.increase_and_get_counter(lbl, 0), (9 / 4) + 1); + + // Should be reset + co_await step_seconds(10); + BOOST_REQUIRE_EQUAL(limiter.increase_and_get_counter(lbl, 0), 1); +} + +SEASTAR_TEST_CASE(test_rate_limiter_time_window_wraparound_handling) { + test_rate_limiter::label lbl; + test_rate_limiter limiter; + + BOOST_REQUIRE_EQUAL(limiter.increase_and_get_counter(lbl, 0), 1); + BOOST_REQUIRE_EQUAL(limiter.increase_and_get_counter(lbl, 0), 2); + 
BOOST_REQUIRE_EQUAL(limiter.increase_and_get_counter(lbl, 0), 3); + + // Advance far into the future so that the time window wraps around + co_await step_seconds(1 << test_rate_limiter::time_window_bits); + + BOOST_REQUIRE_EQUAL(limiter.increase_and_get_counter(lbl, 0), 1); + BOOST_REQUIRE_EQUAL(limiter.increase_and_get_counter(lbl, 0), 2); + BOOST_REQUIRE_EQUAL(limiter.increase_and_get_counter(lbl, 0), 3); + + // TODO: Workaround for seastar#1072. Calling `manual_clock::advance` + // multiple times and then quitting the test immediately causes + // the test framework to hang. I didn't have the time to debug it, but I + // suspect there are some pending tasks which need to finish before exiting + // from the main test task. + co_await seastar::sleep(std::chrono::seconds(1)); +} + +SEASTAR_TEST_CASE(test_rate_limiter_account_operation) { + const uint64_t limit = 1; + const int ops_per_loop = 1000; + test_rate_limiter::label lbl; + + test_rate_limiter limiter; + + // We use UINT_MAX as the random parameter so that we get rejected quickly + db::per_partition_rate_limit::account_and_enforce info { + .random_variable = UINT32_MAX, + }; + + bool encountered_rejection = false; + for (int i = 0; i < ops_per_loop; i++) { + if (limiter.account_operation(lbl, 0, limit, info) == test_rate_limiter::can_proceed::no) { + encountered_rejection = true; + break; + } + co_await maybe_yield(); + } + BOOST_REQUIRE(encountered_rejection); +} diff --git a/test/lib/cql_test_env.cc b/test/lib/cql_test_env.cc index b1d68125e0..21b577d276 100644 --- a/test/lib/cql_test_env.cc +++ b/test/lib/cql_test_env.cc @@ -101,6 +101,7 @@ cql_test_config::cql_test_config(shared_ptr cfg) db_config->commitlog_use_o_dsync.set(false); db_config->add_cdc_extension(); + db_config->add_per_partition_rate_limit_extension(); db_config->flush_schema_tables_after_modification.set(false); } diff --git a/thrift/handler.cc b/thrift/handler.cc index c06aa6ca1b..6843959058 100644 --- a/thrift/handler.cc +++ 
b/thrift/handler.cc @@ -405,8 +405,10 @@ public: clustering_ranges.emplace_back(query::clustering_range::make_open_ended_both_sides()); auto slice = query::partition_slice(std::move(clustering_ranges), { }, std::move(regular_columns), opts, std::move(specific_ranges), cql_serialization_format::internal()); - return make_lw_shared(s.id(), s.version(), std::move(slice), proxy.get_max_result_size(slice), + auto cmd = make_lw_shared(s.id(), s.version(), std::move(slice), proxy.get_max_result_size(slice), query::row_limit(row_limit), query::partition_limit(partition_limit)); + cmd->allow_limit = db::allow_per_partition_rate_limit::yes; + return cmd; } static future<> do_get_paged_slice( @@ -511,7 +513,7 @@ public: add_to_mutation(*schema, column, m_to_apply); return _query_state.get_client_state().has_schema_access(_db, *schema, auth::permission::MODIFY).then([this, m_to_apply = std::move(m_to_apply), consistency_level, permit = std::move(permit)] () mutable { auto timeout = db::timeout_clock::now() + _timeout_config.write_timeout; - return _proxy.local().mutate({std::move(m_to_apply)}, cl_from_thrift(consistency_level), timeout, nullptr, std::move(permit)); + return _proxy.local().mutate({std::move(m_to_apply)}, cl_from_thrift(consistency_level), timeout, nullptr, std::move(permit), db::allow_per_partition_rate_limit::yes); }); }); } @@ -527,7 +529,7 @@ public: add_to_mutation(*schema, column, m_to_apply); return _query_state.get_client_state().has_schema_access(_db, *schema, auth::permission::MODIFY).then([this, m_to_apply = std::move(m_to_apply), consistency_level, permit = std::move(permit)] () mutable { auto timeout = db::timeout_clock::now() + _timeout_config.write_timeout; - return _proxy.local().mutate({std::move(m_to_apply)}, cl_from_thrift(consistency_level), timeout, nullptr, std::move(permit)); + return _proxy.local().mutate({std::move(m_to_apply)}, cl_from_thrift(consistency_level), timeout, nullptr, std::move(permit), 
db::allow_per_partition_rate_limit::yes); }); }); } @@ -564,7 +566,7 @@ public: return _query_state.get_client_state().has_schema_access(_db, *schema, auth::permission::MODIFY).then([this, m_to_apply = std::move(m_to_apply), consistency_level, permit = std::move(permit)] () mutable { auto timeout = db::timeout_clock::now() + _timeout_config.write_timeout; - return _proxy.local().mutate({std::move(m_to_apply)}, cl_from_thrift(consistency_level), timeout, nullptr, std::move(permit)); + return _proxy.local().mutate({std::move(m_to_apply)}, cl_from_thrift(consistency_level), timeout, nullptr, std::move(permit), db::allow_per_partition_rate_limit::yes); }); }); } @@ -591,7 +593,7 @@ public: return _query_state.get_client_state().has_schema_access(_db, *schema, auth::permission::MODIFY).then([this, m_to_apply = std::move(m_to_apply), consistency_level, permit = std::move(permit)] () mutable { // This mutation contains only counter tombstones so it can be applied like non-counter mutations. auto timeout = db::timeout_clock::now() + _timeout_config.counter_write_timeout; - return _proxy.local().mutate({std::move(m_to_apply)}, cl_from_thrift(consistency_level), timeout, nullptr, std::move(permit)); + return _proxy.local().mutate({std::move(m_to_apply)}, cl_from_thrift(consistency_level), timeout, nullptr, std::move(permit), db::allow_per_partition_rate_limit::yes); }); }); } @@ -604,7 +606,7 @@ public: return _query_state.get_client_state().has_schema_access(_db, *schema, auth::permission::MODIFY); }).then([this, muts = std::move(p.first), consistency_level, permit = std::move(permit)] () mutable { auto timeout = db::timeout_clock::now() + _timeout_config.write_timeout; - return _proxy.local().mutate(std::move(muts), cl_from_thrift(consistency_level), timeout, nullptr, std::move(permit)); + return _proxy.local().mutate(std::move(muts), cl_from_thrift(consistency_level), timeout, nullptr, std::move(permit), db::allow_per_partition_rate_limit::yes); }); }); } @@ -692,6 +694,7 
@@ public: auto& proxy = _proxy.local(); auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), proxy.get_max_result_size(slice), query::row_limit(row_limit)); + cmd->allow_limit = db::allow_per_partition_rate_limit::yes; auto f = _query_state.get_client_state().has_schema_access(_db, *schema, auth::permission::SELECT); return f.then([this, &proxy, dk = std::move(dk), cmd, schema, column_limit = request.count, cl = request.consistency_level, permit = std::move(permit)] () mutable { auto timeout = db::timeout_clock::now() + _timeout_config.read_timeout; @@ -1611,7 +1614,9 @@ private: } auto slice = query::partition_slice(std::move(clustering_ranges), {}, std::move(regular_columns), opts, nullptr, cql_serialization_format::internal(), per_partition_row_limit); - return make_lw_shared(s.id(), s.version(), std::move(slice), proxy.get_max_result_size(slice)); + auto cmd = make_lw_shared(s.id(), s.version(), std::move(slice), proxy.get_max_result_size(slice)); + cmd->allow_limit = db::allow_per_partition_rate_limit::yes; + return cmd; } static ColumnParent column_path_to_column_parent(const ColumnPath& column_path) { ColumnParent ret; diff --git a/transport/cql_protocol_extension.cc b/transport/cql_protocol_extension.cc index 7b0f1dbb4b..5193541a93 100644 --- a/transport/cql_protocol_extension.cc +++ b/transport/cql_protocol_extension.cc @@ -9,13 +9,15 @@ #include #include "transport/cql_protocol_extension.hh" #include "cql3/result_set.hh" +#include "exceptions/exceptions.hh" #include namespace cql_transport { static const std::map EXTENSION_NAMES = { - {cql_protocol_extension::LWT_ADD_METADATA_MARK, "SCYLLA_LWT_ADD_METADATA_MARK"} + {cql_protocol_extension::LWT_ADD_METADATA_MARK, "SCYLLA_LWT_ADD_METADATA_MARK"}, + {cql_protocol_extension::RATE_LIMIT_ERROR, "SCYLLA_RATE_LIMIT_ERROR"} }; cql_protocol_extension_enum_set supported_cql_protocol_extensions() { @@ -30,6 +32,8 @@ std::vector additional_options_for_proto_ext(cql_protocol_exte switch (ext) { 
case cql_protocol_extension::LWT_ADD_METADATA_MARK: return {format("LWT_OPTIMIZATION_META_BIT_MASK={:d}", cql3::prepared_metadata::LWT_FLAG_MASK)}; + case cql_protocol_extension::RATE_LIMIT_ERROR: + return {format("ERROR_CODE={:d}", exceptions::exception_code::RATE_LIMIT_ERROR)}; default: return {}; } diff --git a/transport/cql_protocol_extension.hh b/transport/cql_protocol_extension.hh index ede06e8a52..413358d8c0 100644 --- a/transport/cql_protocol_extension.hh +++ b/transport/cql_protocol_extension.hh @@ -28,11 +28,13 @@ namespace cql_transport { * `docs/protocol-extensions.md`. */ enum class cql_protocol_extension { - LWT_ADD_METADATA_MARK + LWT_ADD_METADATA_MARK, + RATE_LIMIT_ERROR }; using cql_protocol_extension_enum = super_enum; + cql_protocol_extension::LWT_ADD_METADATA_MARK, + cql_protocol_extension::RATE_LIMIT_ERROR>; using cql_protocol_extension_enum_set = enum_set; diff --git a/transport/server.cc b/transport/server.cc index b0f2ac332e..e9edff9e47 100644 --- a/transport/server.cc +++ b/transport/server.cc @@ -37,6 +37,7 @@ #include #include "utils/result_try.hh" #include "utils/result_combinators.hh" +#include "db/operation_type.hh" #include "enum_set.hh" #include "service/query_state.hh" @@ -483,6 +484,9 @@ future>> }), utils::result_catch([&] (const auto& ex) { try { ++_server._stats.errors[ex.code()]; } catch(...) {} return make_function_failure_error(stream, ex.code(), ex.what(), ex.ks_name, ex.func_name, ex.args, trace_state); + }), utils::result_catch([&] (const auto& ex) { + try { ++_server._stats.errors[ex.code()]; } catch(...) {} + return make_rate_limit_error(stream, ex.code(), ex.what(), ex.op_type, ex.rejected_by_coordinator, trace_state, client_state); }), utils::result_catch([&] (const auto& ex) { // Note: the CQL protocol specifies that many types of errors have // mandatory parameters. 
These cassandra_exception subclasses MUST @@ -1275,6 +1279,20 @@ std::unique_ptr cql_server::connection::make_function_fail return response; } +std::unique_ptr cql_server::connection::make_rate_limit_error(int16_t stream, exceptions::exception_code err, sstring msg, db::operation_type op_type, bool rejected_by_coordinator, const tracing::trace_state_ptr& tr_state, const service::client_state& client_state) const +{ + if (!client_state.is_protocol_extension_set(cql_protocol_extension::RATE_LIMIT_ERROR)) { + return make_error(stream, exceptions::exception_code::CONFIG_ERROR, std::move(msg), tr_state); + } + + auto response = std::make_unique(stream, cql_binary_opcode::ERROR, tr_state); + response->write_int(static_cast(err)); + response->write_string(msg); + response->write_byte(static_cast(op_type)); + response->write_byte(static_cast(rejected_by_coordinator)); + return response; +} + std::unique_ptr cql_server::connection::make_error(int16_t stream, exceptions::exception_code err, sstring msg, const tracing::trace_state_ptr& tr_state) const { auto response = std::make_unique(stream, cql_binary_opcode::ERROR, tr_state); diff --git a/transport/server.hh b/transport/server.hh index e78a0c2736..d2437c39b6 100644 --- a/transport/server.hh +++ b/transport/server.hh @@ -31,6 +31,7 @@ #include "transport/messages/result_message.hh" #include "utils/chunked_vector.hh" #include "exceptions/coordinator_result.hh" +#include "db/operation_type.hh" namespace cql3 { @@ -247,6 +248,7 @@ private: std::unique_ptr make_already_exists_error(int16_t stream, exceptions::exception_code err, sstring msg, sstring ks_name, sstring cf_name, const tracing::trace_state_ptr& tr_state) const; std::unique_ptr make_unprepared_error(int16_t stream, exceptions::exception_code err, sstring msg, bytes id, const tracing::trace_state_ptr& tr_state) const; std::unique_ptr make_function_failure_error(int16_t stream, exceptions::exception_code err, sstring msg, sstring ks_name, sstring func_name, 
std::vector args, const tracing::trace_state_ptr& tr_state) const; + std::unique_ptr make_rate_limit_error(int16_t stream, exceptions::exception_code err, sstring msg, db::operation_type op_type, bool rejected_by_coordinator, const tracing::trace_state_ptr& tr_state, const service::client_state& client_state) const; std::unique_ptr make_error(int16_t stream, exceptions::exception_code err, sstring msg, const tracing::trace_state_ptr& tr_state) const; std::unique_ptr make_ready(int16_t stream, const tracing::trace_state_ptr& tr_state) const; std::unique_ptr make_supported(int16_t stream, const tracing::trace_state_ptr& tr_state) const;