Compare commits


225 Commits

Author SHA1 Message Date
Calle Wilund
2bbf3cf669 system_keyspace: Prune dropped tables from truncation on start/drop
Fixes #25683

Once a table drop is complete, there should be no reason to retain
truncation records for it, as any replay should skip mutations
anyway (no CF), and if we somehow resurrect a dropped table,
this replay-resurrected data is the least of our problems anyway.

Adds a prune phase to the startup drop_truncation_rp_records run,
which skips updating and instead deletes records for non-existent
tables (which should also patch any existing servers with lingering
data).

Also does an explicit delete of records on actual table DROP, to
ensure we don't grow this table more than needed, even on nodes
with long uptimes.

Small unit test included.
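The prune phase described above can be sketched in Python (a hypothetical model of the idea, not the actual C++ code; `prune_truncation_records` and its arguments are illustrative names):

```python
def prune_truncation_records(records, existing_tables):
    """Split truncation records into those to keep and those to delete.

    records: dict mapping table id -> truncation record
    existing_tables: set of table ids that still exist

    Records for tables that no longer exist are deleted rather than
    updated, which also cleans up lingering data on existing servers.
    """
    kept = {t: r for t, r in records.items() if t in existing_tables}
    deleted = [t for t in records if t not in existing_tables]
    return kept, deleted
```

For example, pruning `{"t1": rec1, "t2": rec2}` against a schema that only contains `t1` keeps the `t1` record and deletes the `t2` record.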

Closes scylladb/scylladb#25699

(cherry picked from commit bc20861afb)

Closes scylladb/scylladb#25815
2025-09-05 19:02:39 +03:00
Botond Dénes
c30c1ec40a Merge '[Backport 2025.3] drop table: fix crash on drop table with concurrent cleanup' from Scylladb[bot]
Consider the following scenario:

- A tablet is migrated away from a shard
- The tablet cleanup stage closes the storage group's async_gate
- A drop table runs truncate which attempts to disable compaction on the tablet with its gate closed. This fails, because table::parallel_foreach_compaction_group() ultimately calls storage_group_manager::parallel_foreach_storage_group() which will not disable compaction if it can't hold the storage group's gate
- Truncate calls table::discard_sstables() which checks if the compaction has been disabled, and because it hasn't, it then runs on_internal_error() with "compaction not disabled on table ks.cf during TRUNCATE" which causes a crash

Fixes: #25706

This needs to be backported to all supported versions with tablets

- (cherry picked from commit a0934cf80d)

- (cherry picked from commit 1b8a44af75)

Parent PR: #25708

Closes scylladb/scylladb#25785

* github.com:scylladb/scylladb:
  test: reproducer and test for drop with concurrent cleanup
  truncate: check for closed storage group's gate in discard_sstables
2025-09-05 19:02:04 +03:00
Andrei Chekun
2ee1082561 test.py: modify run to use different junit output filenames
Currently, run executes pytest twice without modifying the path of the
JUnit XML report, so the report of the second pytest execution
overrides that of the first. This PR fixes the issue so that both
reports are stored.

Closes scylladb/scylladb#25726

(cherry picked from commit e55c8a9936)

Closes scylladb/scylladb#25778
2025-09-05 19:01:22 +03:00
Pavel Emelyanov
f1e3dedcd6 Revert "test/gossiper: add reproducible test for race condition during node decommission"
This reverts commit 4e17330a1b because
parent PR had been reverted as per #25803
2025-09-05 10:08:29 +03:00
Nadav Har'El
5d6aa6e8c2 utils, alternator: fix detection of invalid base-64
This patch fixes an error-path bug in the base-64 decoding code in
utils/base64.cc, which among other things is used in Alternator to decode
blobs in JSON requests.

The base-64 decoding code has a lookup table, which was wrongly sized 255
bytes, but needed to be 256 bytes. This meant that if the byte 255 (0xFF)
was included in an invalid base-64 string, instead of detecting that this
is an invalid byte (since the only valid bytes in a base-64 string are
A-Z,a-z,0-9,+,/ and =), the code would either think it's valid with a
nonsense 6-bit part, or even crash on an out-of-bounds read.
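A minimal Python model of the table-sizing bug: the decode table must cover all 256 possible byte values so that byte 0xFF maps to an explicit invalid marker instead of falling off the end of a 255-entry array (names here are illustrative, not the actual `utils/base64.cc` code):

```python
INVALID = 255  # marker for bytes outside the base-64 alphabet
ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"

# The table must have 256 entries; a 255-entry table would make a lookup
# for byte 0xFF read out of bounds in C (or treat it as valid garbage).
DECODE_TABLE = [INVALID] * 256
for i, c in enumerate(ALPHABET):
    DECODE_TABLE[ord(c)] = i

def is_valid_base64_byte(b: int) -> bool:
    # '=' is padding, hence valid even though it's not in the alphabet.
    return b == ord("=") or DECODE_TABLE[b] != INVALID
```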

Besides the trivial fix, this patch also includes a reproducing test,
which tries to write a blob as a supposedly base-64 encoded string with
a 0xFF byte in it. The test fails before this patch (the write succeeds,
unexpectedly), and passes after this patch (the write fails as
expected). The test also passes on DynamoDB.

Fixes #25701

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes scylladb/scylladb#25705

(cherry picked from commit ff91027eac)

Closes scylladb/scylladb#25767
2025-09-04 11:38:55 +03:00
Pavel Emelyanov
1c8e10231a Merge '[Backport 2025.3] service/qos: Modularize service level controller to avoid invalid access to auth::service' from Scylladb[bot]
Move management over effective service levels from `service_level_controller`
to a new dedicated type -- `auth_integration`.

Before these changes, it was possible for the service level controller to try
to access `auth::service` after it was deinitialized. For instance, it could
happen when reloading the cache. That HAS happened as described in the following
issue: scylladb/scylladb#24792.

Although the problem might have been mitigated or even resolved in
scylladb/scylladb@10214e13bd, it's not clear
how the service will be used in the future. It's better to prevent similar
bugs than to fix them later on.

The logic responsible for preventing access to an uninitialized `auth::service`
was also either non-existent, complex, or insufficient.

To prevent accessing `auth::service` by the service level controller, we extract
the relevant portion of the code to a separate entity -- `auth_integration`.
It's an internal helper type whose sole purpose is to manage effective service
levels.

Thanks to that, we were able to nest the lifetime of `auth_integration` within
the lifetime of `auth::service`. It's now impossible to attempt to dereference
it while it's uninitialized.

If a bug related to an invalid access is spotted again, though, it might also
be easier to debug it now.

There should be no visible change to the users of the interface of the service
level controller. We strived to make the patch minimal, and the only affected
part of the logic should be related to how `auth::service` is accessed.

The relevant portion of the initialization and deinitialization flow:

(a) Before the changes:

1. Initialize `service_level_controller`. Pass a reference to an uninitialized
   `auth::service` to it.
2. Initialize other services.
3. Initialize and start `auth::service`.
4. (work)
5. Stop and deinitialize `auth::service`.
6. Deinitialize other services.
7. Deinitialize `service_level_controller`.

(b) After the changes:

1. Initialize `service_level_controller`. Pass a reference to an uninitialized
   `auth::service` to it. (*)
2. Initialize other services.
3. Initialize and start `auth::service`.
4. Initialize `auth_integration`. Register it in `service_level_controller`.
5. (work)
6. Unregister `auth_integration` in `service_level_controller` and deinitialize
   it.
7. Stop and deinitialize `auth::service`.
8. Deinitialize other services.
9. Deinitialize `service_level_controller`.

(*):
    The reference to `auth::service` in `service_level_controller` is still
    necessary. We need to access the service when dropping a distributed
    service level.

    Although it would be best to cut that link between the service level
    controller and `auth::service` too, effectively separating the entities,
    it would require more work, so we leave it as-is for now.

    It shouldn't prove problematic as far as accessing an uninitialized service
    goes. Trying to drop a service level at the point when we're de-initializing
    auth should be impossible.

    For more context, see the function `drop_distributed_service_level` in
    `service_level_controller`.
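The ordering in (b) can be modeled with nested scopes; a hypothetical Python sketch (illustrative names, not the actual C++ lifecycle code) showing why `auth_integration` can never observe a deinitialized `auth::service`:

```python
from contextlib import contextmanager

log = []

@contextmanager
def service(name):
    # Model a service whose deinit runs when its scope is exited.
    log.append(f"init {name}")
    try:
        yield name
    finally:
        log.append(f"deinit {name}")

# Nesting auth_integration's lifetime strictly inside auth_service's
# guarantees it is torn down first, mirroring flow (b) above.
with service("service_level_controller"):
    with service("auth_service"):
        with service("auth_integration"):
            log.append("work")
```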

A trivial test has been included in the PR. Although its value is questionable
as we only try to reload the service level cache at a specific moment, it's
probably the best we can deliver to provide a reproducer of the issue this patch
is resolving.

Fixes scylladb/scylladb#24792

Backport: The impact of the bug was minimal as it only affected the shutdown.
However, since CI is failing because of it, let's backport the change to all
supported versions.

- (cherry picked from commit 7d0086b093)

- (cherry picked from commit 34afb6cdd9)

- (cherry picked from commit e929279d74)

- (cherry picked from commit dd5a35dc67)

- (cherry picked from commit fc1c41536c)

Parent PR: #25478

Closes scylladb/scylladb#25753

* github.com:scylladb/scylladb:
  service/qos: Move effective SL cache to auth_integration
  service/qos: Add auth::service to auth_integration
  service/qos: Reload effective SL cache conditionally
  service/qos: Add gate to auth_integration
  service/qos: Introduce auth_integration
2025-09-04 11:38:17 +03:00
Pavel Emelyanov
d484837a2a Merge '[Backport 2025.3] db/hints: Improve logs' from Scylladb[bot]
Before these changes, the logs in hinted handoff often didn't provide
crucial information like the identifier of the node that hints were
being sent to. Also, some of the logs were misleading and referred to
other places in the code than the one where an exception or some other
situation really occurred.

We modify those logs, extending them by more valuable information
and fixing existing issues. What's more, all of the logs in
`hint_endpoint_manager` and `hint_sender` follow a consistent format
now:

```
<class_name>[<destination host ID>]:<function_name>: <message>
```

This way, we should always have AT LEAST the basic information.
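A tiny Python sketch of the consistent format above (the helper name is hypothetical):

```python
def format_hint_log(class_name, host_id, function_name, message):
    # <class_name>[<destination host ID>]:<function_name>: <message>
    return f"{class_name}[{host_id}]:{function_name}: {message}"
```

So a log line from `hint_sender` would look like `hint_sender[abc-123]:send_hints: sending`.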

Fixes scylladb/scylladb#25466

Backport:
There is no risk in backporting these changes. They only have
impact on the logs. On the other hand, they might prove helpful
when debugging an issue in hinted handoff.

- (cherry picked from commit 2327d4dfa3)

- (cherry picked from commit d7bc9edc6c)

- (cherry picked from commit 6f1fb7cfb5)

Parent PR: #25470

Closes scylladb/scylladb#25538

* github.com:scylladb/scylladb:
  db/hints: Add new logs
  db/hints: Adjust log levels
  db/hints: Improve logs
2025-09-04 11:36:30 +03:00
Pavel Emelyanov
ad6dbcfdc5 Merge '[Backport 2025.3] generic server: 2 step shutdown' from Scylladb[bot]
This PR implements solution proposed in scylladb/scylladb#24481

Instead of terminating connections immediately, the shutdown now proceeds in two stages: first closing the receive (input) side to stop new requests, then waiting for all active requests to complete before fully closing the connections.

The updated shutdown process is as follows:

1. Initial Shutdown Phase
   * Close the accept gate to block new incoming connections.
   * Abort all accept() calls.
   * For all active connections:
      * Close only the input side of the connection to prevent new requests.
      * Keep the output side open to allow responses to be sent.

2. Drain Phase
   * Wait for all in-progress requests to either complete or fail.

3. Final Shutdown Phase
   * Fully close all connections.
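The three phases above can be sketched as follows (a hypothetical Python model, not the actual Seastar `generic_server` code):

```python
class Connection:
    def __init__(self):
        self.input_open = True
        self.output_open = True

    def shutdown_input(self):
        self.input_open = False    # no new requests accepted

    def shutdown_output(self):
        self.output_open = False   # responses no longer possible

def two_step_shutdown(connections, drain):
    # Phase 1: stop new requests, keep outputs open for responses.
    for c in connections:
        c.shutdown_input()
    # Phase 2: wait for in-flight requests to complete or fail.
    drain(connections)
    # Phase 3: fully close all connections.
    for c in connections:
        c.shutdown_output()
```

The key property is that during the drain phase the output side is still open, so responses to in-flight requests can be delivered.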

Fixes scylladb/scylladb#24481

- (cherry picked from commit 122e940872)

- (cherry picked from commit 3848d10a8d)

- (cherry picked from commit 3610cf0bfd)

- (cherry picked from commit 27b3d5b415)

- (cherry picked from commit 061089389c)

- (cherry picked from commit 7334bf36a4)

- (cherry picked from commit ea311be12b)

- (cherry picked from commit 4f63e1df58)

Parent PR: #24499

Closes scylladb/scylladb#25519

* github.com:scylladb/scylladb:
  test: Set `request_timeout_on_shutdown_in_seconds` to `request_timeout_in_ms`,  decrease request timeout.
  generic_server: Two-step connection shutdown.
  transport: cosmetic change, remove extra blanks.
  transport: Handle sleep aborted exception in sleep_until_timeout_passes
  generic_server: replace empty destructor with `= default`
  generic_server: refactor connection::shutdown to use `shutdown_input` and `shutdown_output`
  generic_server: add `shutdown_input` and `shutdown_output` functions to `connection` class.
  test: Add test for query execution during CQL server shutdown
2025-09-04 11:35:55 +03:00
Ran Regev
a79cbd9a9a docs: backup and restore feature
added backup and restore as a feature
to documentation

Signed-off-by: Ran Regev <ran.regev@scylladb.com>

Closes scylladb/scylladb#25608

(cherry picked from commit 515d9f3e21)

Closes scylladb/scylladb#25748
2025-09-03 12:37:45 +03:00
Emil Maskovsky
4e17330a1b test/gossiper: add reproducible test for race condition during node decommission
This change introduces a targeted test that simulates the gossiper race
condition observed during node decommissioning. The test delays gossip
state application and host ID lookup to reliably reproduce the scenario
where `gossiper::get_host_id()` is called on a removed endpoint,
potentially triggering an abort in `apply_new_states`.

There is a specific error injection added to widen the race window, in
order to increase the likelihood of hitting the race condition. The
error injection is designed to delay the application of gossip state
updates, for the specific node that is being decommissioned. This should
then result in the server abort in the gossiper.

Refs: scylladb/scylladb#25621
Fixes: scylladb/scylladb#25721

Backport: The test is primarily for an issue found in 2025.1, so it
needs to be backported to all the 2025.x branches.

Closes scylladb/scylladb#25685

(cherry picked from commit 5dac4b38fb)

Closes scylladb/scylladb#25781
2025-09-02 08:29:27 +02:00
Ferenc Szili
6a7a5f5edc test: reproducer and test for drop with concurrent cleanup
This change adds a reproducer and test for issue #25706

(cherry picked from commit 1b8a44af75)
2025-09-02 02:18:56 +00:00
Ferenc Szili
34b403747a truncate: check for closed storage group's gate in discard_sstables
Consider the following scenario:

- A tablet is migrated away from a shard
- The tablet cleanup stage closes the storage group's async_gate
- A drop table runs truncate which attempts to disable compaction on the
  tablet with its gate closed. This fails, because
  table::parallel_foreach_compaction_group() ultimately calls
  storage_group_manager::parallel_foreach_storage_group() which will not
  disable compaction if it can't hold the storage group's gate
- Truncate calls table::discard_sstables() which checks if the compaction
  has been disabled, and because it hasn't, it then runs
  on_internal_error() with "compaction not disabled on table ks.cf during
  TRUNCATE" which causes a crash

This patch makes discard_sstables check whether the storage group's gate
is closed when checking for disabled compaction.
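The check described above can be modeled in Python (a hypothetical sketch of the idea, not the actual C++ code):

```python
class StorageGroup:
    def __init__(self, gate_closed=False, compaction_disabled=False):
        self.gate_closed = gate_closed
        self.compaction_disabled = compaction_disabled

def check_compaction_disabled(group):
    if group.gate_closed:
        # The tablet cleanup stage owns the group: compaction could not
        # be disabled through the closed gate, and that's expected.
        return
    if not group.compaction_disabled:
        raise RuntimeError(
            "compaction not disabled on table during TRUNCATE")
```

With the gate-closed case handled first, a concurrent cleanup no longer trips the internal error.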

(cherry picked from commit a0934cf80d)
2025-09-02 02:18:56 +00:00
Piotr Dulikowski
debc637ac1 Merge '[Backport 2025.3] system_keyspace: add peers cache to get_ip_from_peers_table' from Scylladb[bot]
The gossiper can call `storage_service::on_change` frequently (see  scylladb/scylla-enterprise#5613), which may cause high CPU load and even trigger OOMs or related issues.

This PR adds a temporary cache for `system.peers` to resolve host_id -> ip without hitting storage on every call. The cache is short-lived to handle the unlikely case where `system.peers` is updated directly via CQL.

This is a temporary fix; a more thorough solution is tracked in https://github.com/scylladb/scylladb/issues/25620.
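Such a short-lived cache can be sketched in Python (the `PeersCache` name and the loader interface are hypothetical; the real code lives in system_keyspace):

```python
import time

class PeersCache:
    """Short-lived cache for host_id -> ip lookups."""

    def __init__(self, loader, ttl_seconds=5.0, clock=time.monotonic):
        self._loader = loader        # reads the full map from storage
        self._ttl = ttl_seconds
        self._clock = clock
        self._map = None
        self._loaded_at = float("-inf")

    def get_ip(self, host_id):
        now = self._clock()
        if self._map is None or now - self._loaded_at > self._ttl:
            # Hit storage only when the cache has expired, so frequent
            # on_change calls don't re-read system.peers every time.
            self._map = self._loader()
            self._loaded_at = now
        return self._map.get(host_id)
```

The short TTL covers the unlikely case where system.peers is updated directly via CQL: stale entries are only served for at most `ttl_seconds`.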

Fixes scylladb/scylladb#25660

backport: this patch needs to be backported to all supported versions (2025.1/2/3).

- (cherry picked from commit 91c633371e)

- (cherry picked from commit de5dc4c362)

- (cherry picked from commit 4b907c7711)

Parent PR: #25658

Closes scylladb/scylladb#25766

* github.com:scylladb/scylladb:
  storage_service: move get_host_id_to_ip_map to system_keyspace
  system_keyspace: use peers cache in get_ip_from_peers_table
  storage_service: move get_ip_from_peers_table to system_keyspace
2025-09-01 21:21:26 +02:00
Taras Veretilnyk
ddb7c8ea12 keys: from_nodetool_style_string don't split single partition keys
Users with single-column partition keys that contain colon characters
were unable to use certain REST APIs and 'nodetool' commands, because the
API split key by colon regardless of the partition key schema.

Affected commands:
- 'nodetool getendpoints'
- 'nodetool getsstables'
Affected endpoints:
- '/column_family/sstables/by_key'
- '/storage_service/natural_endpoints'

Refs: #16596 - This does not fully fix the issue, as users with compound
keys will face the issue if any column of the partition key contains
a colon character.
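The fix's behavior can be sketched in Python (hypothetical function name, modeling the idea rather than the actual C++ parser):

```python
def parse_nodetool_style_key(key, num_key_columns):
    # A single-column partition key may legitimately contain ':' and
    # must be taken verbatim; only compound keys are split on ':'.
    if num_key_columns == 1:
        return [key]
    return key.split(":")
```

As the commit notes, compound keys whose components themselves contain colons still parse incorrectly; this only fixes the single-column case.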

Closes scylladb/scylladb#24829

Closes scylladb/scylladb#25565
2025-09-01 15:36:56 +03:00
Petr Gusev
c4386c2aa4 storage_service: move get_host_id_to_ip_map to system_keyspace
Reimplemented the function to use the peers cache. It could be replaced
with get_ip_from_peers_table, but that would create a coroutine frame for
each call.

(cherry picked from commit 4b907c7711)
2025-09-01 11:22:55 +02:00
Petr Gusev
7ec3e166c6 system_keyspace: use peers cache in get_ip_from_peers_table
The storage_service::on_change method can be called quite often
by the gossiper, see scylladb/scylla-enterprise#5613. In this commit
we introduce a temporary cache for system.peers so that we don't have
to go to storage each time we need to resolve host_id -> ip.
We keep the cache only for a short time to handle the
(unlikely) scenario where the user updates the system.peers table
from CQL.

Fixes scylladb/scylladb#25660

(cherry picked from commit de5dc4c362)
2025-09-01 11:22:05 +02:00
Petr Gusev
5f8664757a storage_service: move get_ip_from_peers_table to system_keyspace
We plan to add a cache to get_ip_from_peers_table in upcoming commits.
It's more convenient to do this from system_keyspace, since the only two
methods that mutate system.peers (remove_endpoint and update_peers_info)
are already there.

(cherry picked from commit 91c633371e)
2025-09-01 11:21:55 +02:00
Calle Wilund
2e08d651a8 system_keyspace: Limit parallelism in drop_truncation_records
Fixes #25682
Refs scylla-enterprise#5580

If the truncation table has many entries, we might create a
huge parallel execution, quite possibly consuming loads of resources
to do something quite trivial.
Limit concurrency to a small-ish number.
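Limiting the fan-out this way can be sketched with a semaphore (a hypothetical asyncio model, not the actual Seastar code):

```python
import asyncio

async def drop_records_limited(record_ids, drop_one, max_concurrency=16):
    # Cap the fan-out so a large truncation table doesn't spawn one
    # in-flight operation per record all at once.
    sem = asyncio.Semaphore(max_concurrency)

    async def worker(rid):
        async with sem:
            await drop_one(rid)

    await asyncio.gather(*(worker(r) for r in record_ids))
```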

Closes scylladb/scylladb#25678

(cherry picked from commit 2eccd17e70)

Closes scylladb/scylladb#25751
2025-09-01 09:13:44 +03:00
Emil Maskovsky
05f8b0d543 storage: pass host_id as parameter to maybe_reconnect_to_preferred_ip()
Previously, `maybe_reconnect_to_preferred_ip()` retrieved the host ID
using `gossiper::get_host_id()`. Since the host ID is already available
in the calling function, we now pass it directly as a parameter.

This change simplifies the code and eliminates a potential race condition
where `gossiper::get_host_id()` could fail, as described in scylladb/scylladb#25621.

Refs: scylladb/scylladb#25621
Fixes: scylladb/scylladb#25715

Backport: Recommended for 2025.x release branches to avoid potential issues
from unnecessary calls to `gossiper::get_host_id()` in subscribers.

(cherry picked from commit cfc87746b6)

Closes scylladb/scylladb#25718
2025-09-01 09:13:21 +03:00
kendrick-ren
d7a36c6d8e Update launch-on-gcp.rst
Add the missing '=' sign to the --zone option; otherwise the command complains.

Closes scylladb/scylladb#25471

(cherry picked from commit d6e62aeb6a)

Closes scylladb/scylladb#25645
2025-09-01 09:11:21 +03:00
Benny Halevy
2a6791d246 api: storage_service: fix token_range documentation
Note that the token_range type is used only by describe_ring.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes scylladb/scylladb#25609

(cherry picked from commit 45c496c276)

Closes scylladb/scylladb#25640
2025-09-01 09:11:12 +03:00
Pavel Emelyanov
95d953b7b9 Merge '[Backport 2025.3] cql3: Warn when creating RF-rack-invalid keyspace' from Scylladb[bot]
Although RF-rack-valid keyspaces are not universally enforced
yet (they're governed by the configuration option
`rf_rack_valid_keyspaces`), we'd like to encourage the user to
abide by the restriction.

To that end, we're introducing a warning when creating or
altering a keyspace. If the configuration option is disabled,
but the user is trying to create an RF-rack-invalid keyspace,
they'll receive a warning.

If the option is turned off, we will also log all of the
RF-rack-invalid keyspaces at start-up.

We provide validation tests.

Fixes scylladb/scylladb#23330

Backport: we'd like to encourage the user to abide by the restriction
even when they don't enforce it to make it easier in the future to
adjust the schema when there's no way to disable it anymore. Because
of that, we'd like to backport it to all relevant versions, starting with 2025.1.

- (cherry picked from commit 60ea22d887)

- (cherry picked from commit af8a3dd17b)

- (cherry picked from commit 837d267cbf)

Parent PR: #24785

Closes scylladb/scylladb#25635

* github.com:scylladb/scylladb:
  main: Log RF-rack-invalid keyspaces at startup
  cql3/statements: Fix indentation
  cql3: Warn when creating RF-rack-invalid keyspace
2025-09-01 09:11:01 +03:00
David Garcia
3db935f30f docs: expose alternator metrics
Renders in the docs some metrics introduced in https://github.com/scylladb/scylladb/pull/24046/files that were not being displayed in https://docs.scylladb.com/manual/stable/reference/metrics.html

Closes scylladb/scylladb#25561

(cherry picked from commit c3c70ba73f)

Closes scylladb/scylladb#25629
2025-09-01 09:10:41 +03:00
Michał Chojnowski
05ea29ee8d sstables/types.hh: fix fmt::formatter<sstables::deletion_time>
Obvious typo.

Fixes scylladb/scylladb#25556

Closes scylladb/scylladb#25557

(cherry picked from commit c1b513048c)

Closes scylladb/scylladb#25588
2025-09-01 09:10:21 +03:00
Dawid Mędrek
7f58681482 db/commitlog: Extend error messages for corrupted data
We're providing additional information in error messages when throwing
an exception related to data corruption: when a segment is truncated
and when its content is invalid. That might prove helpful when debugging.

Closes scylladb/scylladb#25190

(cherry picked from commit 408b45fa7e)

Closes scylladb/scylladb#25461
2025-09-01 09:08:29 +03:00
Andrei Chekun
8163f4edaa test.py: use unique hostname for Minio
To avoid a situation where the port on localhost is occupied, use a
unique hostname for Minio.

(cherry picked from commit c6c3e9f492)

Closes scylladb/scylladb#24775
2025-09-01 08:59:00 +03:00
Pavel Emelyanov
0c6c507704 Merge '[Backport 2025.3] test.py: add missed parameters that should be passed from test.py to pytest' from Scylladb[bot]
Several parameters that `test.py` should pass through to pytest->boost were missing. This PR adds handling for these parameters: `--random-seed` and `--x-log2-compaction-groups`.

Since 2025.3 is affected by this issue and this is a framework-only change, a backport for that version is needed.

Fixes: https://github.com/scylladb/scylladb/issues/24927

- (cherry picked from commit 71b875c932)

- (cherry picked from commit f7c7877ba6)

Parent PR: #24928

Closes scylladb/scylladb#25035

* github.com:scylladb/scylladb:
  test.py: add bypassing x_log2_compaction_groups to boost tests
  test.py: add bypassing random seed to boost tests
2025-09-01 08:58:48 +03:00
Jenkins Promoter
3da82e8572 Update pgo profiles - aarch64 2025-09-01 05:24:53 +03:00
Jenkins Promoter
16c7bd4c6e Update pgo profiles - x86_64 2025-09-01 05:01:14 +03:00
Jenkins Promoter
3c922d68f0 Update ScyllaDB version to: 2025.3.1 2025-08-31 11:05:24 +03:00
Calle Wilund
fe87af4674 commitlog: Ensure segment deletion is re-entrant
Fixes #25709

If we have large allocations, spanning more than one segment, and
the internal segment references from lead to secondary are the
only thing keeping a segment alive, the implicit drop in
discard_unused_segments and orphan_all can cause a recursive call
to discard_unused_segments, which in turn can lead to vector
corruption/crash, or even double free of segment (iterator confusion).

We need to separate the modification of the vector (_segments) from
the actual releasing of objects. Using temporaries is the easiest
solution.

To further reduce recursion, we can also do an early clear of
segment dependencies in callbacks from segment release (cf release).
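Separating list modification from object release can be sketched in Python (a hypothetical model of the idea; the real code operates on the C++ _segments vector):

```python
def discard_unused_segments(segments):
    # Move the segments to release into a temporary first, then update
    # the shared list, and only release afterwards. A release callback
    # that re-enters this function now sees a consistent list and cannot
    # corrupt the iteration or free a segment twice.
    unused = [s for s in segments if not s["in_use"]]
    segments[:] = [s for s in segments if s["in_use"]]
    for s in unused:  # releasing happens after the list is updated
        s["released"] = True
    return unused
```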

Closes scylladb/scylladb#25719

(cherry picked from commit cc9eb321a1)

Closes scylladb/scylladb#25756
2025-08-30 18:50:47 +03:00
Dawid Mędrek
5c947a936e service/qos: Move effective SL cache to auth_integration
Since `auth_integration` manages effective service levels, let's move
the relevant cache from `service_level_controller` to it.

(cherry picked from commit fc1c41536c)
2025-08-29 22:58:21 +00:00
Dawid Mędrek
fcdd21948d service/qos: Add auth::service to auth_integration
The new service, `auth_integration`, has taken over the responsibility
for managing effective service levels from `service_level_controller`.
However, before these changes, it still accessed `auth::service` via
the service level controller. Let's change that.

Note that we also remove a check that `auth::service` has been
initialized. It's not necessary anymore because the lifetime of
`auth_integration` is strictly nested within the lifetime of `auth::service`.

In actuality, `service_level_controller` should lose its reference to
`auth::service` completely. All of the management of effective service
levels has already been moved to `auth_integration`. However, the
reference is still needed when dropping a distributed service level,
because we need to update the corresponding attribute for the relevant
roles.

That should not lead to invalid accesses, though. Dropping a service level
should not be possible when `auth::service` is not initialized.

(cherry picked from commit dd5a35dc67)
2025-08-29 22:58:21 +00:00
Dawid Mędrek
aea5805c1f service/qos: Reload effective SL cache conditionally
Since `service_level_controller` outlives `auth_integration`, it may
happen that we try to access it when it has already been deinitialized.
To prevent that, we only try to reload or clear the effective service
level cache when the object is still alive.

These changes solve an existing problem with an invalid memory access.
For more context, see issue scylladb/scylladb#24792.

We provide a reproducer test that consistently fails before these
changes but passes after them.

Fixes scylladb/scylladb#24792

(cherry picked from commit e929279d74)
2025-08-29 22:58:20 +00:00
Dawid Mędrek
753305763a service/qos: Add gate to auth_integration
We add a named gate to `auth_integration` that will aid us in synchronizing
ongoing tasks with stopping the service.

(cherry picked from commit 34afb6cdd9)
2025-08-29 22:58:20 +00:00
Dawid Mędrek
4b69c74385 service/qos: Introduce auth_integration
We introduce a new type, `auth_integration`, that will be used internally
by `service_level_controller`. Its purpose is to take over the responsibility
for managing effective service levels.

The main problem of the current implementation of the service level
controller is its dependency on `auth::service`, whose lifetime is strictly
nested within the lifetime of the service level controller. That may lead,
and already has led, to invalid memory accesses; for an example, see issue
scylladb/scylladb#24792.

Our strategy is to split service level controller into smaller parts and
ensure that we access `auth::service` only when it's valid to do so.
This commit is the first step towards that.

We don't change anything in the logic yet, just add the new type. Further
adjustments will be made in following commits.

(cherry picked from commit 7d0086b093)
2025-08-29 22:58:20 +00:00
Jenkins Promoter
d9e492a90c Update ScyllaDB version to: 2025.3.0 2025-08-27 14:38:30 +03:00
Andrei Chekun
6ee92600e2 test.py: add bypassing x_log2_compaction_groups to boost tests
Bypassing argument to pytest->boost that was missing.

(cherry picked from commit f7c7877ba6)
2025-08-25 15:15:30 +02:00
Andrei Chekun
c55919242d test.py: add bypassing random seed to boost tests
Bypassing argument to pytest->boost that was missing.

Fixes: https://github.com/scylladb/scylladb/issues/24927
(cherry picked from commit 71b875c932)
2025-08-25 15:14:52 +02:00
Dawid Mędrek
9652a1260f main: Log RF-rack-invalid keyspaces at startup
When the configuration option `rf_rack_valid_keyspaces` is enabled and there
is an RF-rack-invalid keyspace, starting a node fails. However, when the
configuration option is disabled, but there still is a keyspace that violates
the condition, we'd like Scylla to print a warning informing the user about
the fact. That's what happens in this commit.

We provide a validation test.

(cherry picked from commit 837d267cbf)
2025-08-22 14:31:49 +00:00
Dawid Mędrek
5f13044627 cql3/statements: Fix indentation
(cherry picked from commit af8a3dd17b)
2025-08-22 14:31:49 +00:00
Dawid Mędrek
cd795170b4 cql3: Warn when creating RF-rack-invalid keyspace
Although RF-rack-valid keyspaces are not universally enforced
yet (they're governed by the configuration option
`rf_rack_valid_keyspaces`), we'd like to encourage the user to
abide by the restriction.

To that end, we're introducing a warning when creating or
altering a keyspace. If the configuration option is disabled,
but the user is trying to create an RF-rack-invalid keyspace,
they'll receive a warning.

We provide a validation test.

(cherry picked from commit 60ea22d887)
2025-08-22 14:31:49 +00:00
Ferenc Szili
acb542606e test: remove test_tombstone_gc_disabled_on_pending_replica
The test test_tombstone_gc_disabled_on_pending_replica was added when
we fixed (#20788) the potential problem with data resurrection during
file based streaming. The issue was occurring only in Enterprise, but
we added the fix in OSS to limit code divergence. This test was added
together with the fix, with the idea of guarding this change in OSS.
The real reproducer and test for this fix was added later, after the
fix was ported into Enterprise.
It is in: test/cluster/test_resurrection.py

Since Enterprise has been merged into OSS, there is no more need to
keep the test test_tombstone_gc_disabled_on_pending_replica. Also,
it is flaky with very low probability of failure, making it difficult
to investigate the cause of failure.

Fixes: #22182

Refs: scylladb/scylladb#25448

Closes scylladb/scylladb#25134

(cherry picked from commit 7ce96345bf)

Closes scylladb/scylladb#25573
2025-08-19 16:01:22 +03:00
Piotr Dulikowski
8bd92d4dd0 Merge '[Backport 2025.3] test: test_mv_backlog: fix to consider internal writes' from Scylladb[bot]
The PR fixes a test flakiness issue in test_mv_backlog related to reading metrics.

The first commit fixes a more general issue in the ScyllaMetrics helper class where it doesn't return the value of all matching lines when a specific shard is requested; instead, it breaks after the first match.

The second commit fixes a test issue where it expects exactly one write to be throttled, not taking into account other internal writes that may be executed during this time.

Fixes https://github.com/scylladb/scylladb/issues/23139

backport to improve CI stability - test only change

- (cherry picked from commit 5c28cffdb4)

- (cherry picked from commit 276a09ac6e)

Parent PR: #25279

Closes scylladb/scylladb#25475

* github.com:scylladb/scylladb:
  test: test_mv_backlog: fix to consider internal writes
  test/pylib/rest_client: fix ScyllaMetrics filtering
2025-08-19 09:48:01 +02:00
Patryk Jędrzejczak
e631d2e872 test: test_maintenance_socket: use cluster_con for driver sessions
The test creates all driver sessions by itself. As a consequence, all
sessions use the default request timeout of 10s. This can be too low for
the debug mode, as observed in scylladb/scylla-enterprise#5601.

In this commit, we change the test to use `cluster_con`, so that the
sessions have the request timeout set to 200s from now on.

Fixes scylladb/scylla-enterprise#5601

This commit changes only the test and is a CI stability improvement,
so it should be backported all the way to 2024.2. 2024.1 doesn't have
this test.

Closes scylladb/scylladb#25510

(cherry picked from commit 03cc34e3a0)

Closes scylladb/scylladb#25547
2025-08-18 16:41:03 +02:00
Dawid Mędrek
d12fdcaa75 db/hints: Add new logs
We're adding new logs in just a few places that may however prove
important when debugging issues in hinted handoff in the future.

(cherry picked from commit 6f1fb7cfb5)
2025-08-18 16:02:01 +02:00
Dawid Mędrek
325831afad db/hints: Adjust log levels
Some of the logs could be clogging Scylla's logs, so we demote their
level to a lower one.

On the other hand, some of the logs would most likely not do that,
and they could be useful when debugging -- we promote them to debug
level.

(cherry picked from commit d7bc9edc6c)
2025-08-18 16:02:00 +02:00
Dawid Mędrek
7b212edd0c db/hints: Improve logs
Before these changes, the logs in hinted handoff often didn't provide
crucial information like the identifier of the node that hints were
being sent to. Also, some of the logs were misleading and referred to
other places in the code than the one where an exception or some other
situation really occurred.

We modify those logs, extending them by more valuable information
and fixing existing issues. What's more, all of the logs in
`hint_endpoint_manager` and `hint_sender` follow a consistent format
now:

```
<class_name>[<destination host ID>]:<function_name>: <message>
```

This way, we should always have AT LEAST the basic information.
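As a rough illustration (a hypothetical Python helper, not the actual C++ logger code), a log line in this format could be composed like:

```python
def format_hint_log(class_name: str, host_id: str,
                    function_name: str, message: str) -> str:
    # Compose "<class_name>[<destination host ID>]:<function_name>: <message>"
    return f"{class_name}[{host_id}]:{function_name}: {message}"

print(format_hint_log("hint_sender", "host-1", "send_hints",
                      "sending hints to endpoint"))
```

Every line then carries at least the component, destination, and call site.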

(cherry picked from commit 2327d4dfa3)
2025-08-18 16:01:57 +02:00
Sergey Zolotukhin
bad157453b test: Set request_timeout_on_shutdown_in_seconds to request_timeout_in_ms,
decrease request timeout.

In debug mode, queries may sometimes take longer than the default 30 seconds.
To address this, the timeout value `request_timeout_on_shutdown_in_seconds`
used during tests is aligned with the other request timeouts.
The request timeout for tests is also reduced from 180s to 90s: the request
timeout during shutdown must stay significantly lower than the graceful
shutdown timeout (2m), or else a request timeout would cause a graceful
shutdown timeout and fail a test.

(cherry picked from commit 4f63e1df58)
2025-08-18 15:47:08 +02:00
Sergey Zolotukhin
9b7886ed71 generic_server: Two-step connection shutdown.
When shutting down in `generic_server`, connections are now closed in two steps.
First, only the RX (receive) side is shut down. Then, after all ongoing requests
have completed or a timeout has elapsed, the connections are fully closed.
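The two-step scheme can be sketched in Python asyncio (an illustrative model, not the Seastar implementation; names like `two_step_shutdown` are invented here):

```python
import asyncio

async def two_step_shutdown(inflight, close_conn, timeout):
    """Stop accepting new requests first, then drain, then fully close.

    `inflight` holds asyncio.Tasks for requests already being processed;
    `close_conn` closes the socket in both directions."""
    # Step 1 (implied): only the receive side is shut down, so no new
    # requests arrive while existing ones drain.
    # Step 2: wait for ongoing requests to complete, or give up on timeout.
    if inflight:
        done, pending = await asyncio.wait(inflight, timeout=timeout)
        for task in pending:
            task.cancel()
    # Step 3: fully close the connection.
    close_conn()

async def demo():
    closed = []
    req = asyncio.create_task(asyncio.sleep(0.01))  # a fast in-flight request
    await two_step_shutdown({req}, lambda: closed.append(True), timeout=1.0)
    return closed

print(asyncio.run(demo()))  # prints [True]
```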

Fixes scylladb/scylladb#24481

(cherry picked from commit ea311be12b)
2025-08-18 15:46:46 +02:00
Sergey Zolotukhin
e2aed2e860 transport: cosmetic change, remove extra blanks.
(cherry picked from commit 7334bf36a4)
2025-08-18 14:55:16 +02:00
Anna Stuchlik
977a4a110a doc: add support for RHEL 10
This commit adds RHEL 10 to the list of supported platforms.

Fixes https://github.com/scylladb/scylladb/issues/25436

Closes scylladb/scylladb#25437

(cherry picked from commit 1322f301f6)

Closes scylladb/scylladb#25447
2025-08-18 12:20:19 +03:00
Wojciech Przytuła
666985bbe0 Fix link to ScyllaDB manual
The link would point to outdated OS docs. I fixed it to point to up-to-date Enterprise docs.

Closes scylladb/scylladb#25328

(cherry picked from commit 7600ccfb20)

Closes scylladb/scylladb#25486
2025-08-15 13:31:06 +03:00
Wojciech Mitros
bb6e681b58 test: run mv tests depending on metrics on a standalone instance
The test_base_partition_deletion_with_metrics test case (and the batch
variant) uses the metric of view updates done during its runtime to check
if we didn't perform too many of them. The test runs in the cqlpy suite,
which runs all test cases sequentially on one Scylla instance. Because
of this, if another test case starts a process which generates view
updates and doesn't wait for it to finish before it exits, we may
observe too many view updates in test_base_partition_deletion_with_metrics
and fail the test.
In all test cases we make sure that all tables that were created
during the test are dropped at the end. However, that doesn't
stop the view building process immediately, so the issue can happen
even if we drop the view. I confirmed it by adding a test just before
test_base_partition_deletion_with_metrics which builds a big
materialized view and drops it at the end - the metrics check still failed.

The issue could be caused by any of the existing test cases where we create
a view and don't wait for it to be built. Note that even if we start adding
rows after creating the view, some of them may still be included in the view
building, as the view building process is started asynchronously. In such
a scenario, the view building also doesn't cause any issues with the data in
these tests - writes performed after view creation generate view updates
synchronously when they're local (and we're running a single Scylla server),
so the corresponding view updates generated during view building are redundant.

Because we have many test cases which could be causing this issue, instead
of waiting for the view building to finish in every single one of them, we
move the susceptible test cases to be run on separate Scylla instances, in
the "cluster" suite. There, no other test cases will influence the results.

Fixes https://github.com/scylladb/scylladb/issues/20379

Closes scylladb/scylladb#25209

(cherry picked from commit 2ece08ba43)

Closes scylladb/scylladb#25504
2025-08-15 13:30:53 +03:00
Ernest Zaslavsky
8a017834a0 s3_client: add memory fallback in chunked_download_source
Introduce fallback logic in `chunked_download_source` to handle
memory exhaustion. When memory is low, feed the `deque` with only
one uncounted buffer at a time. This allows slow but steady progress
without getting stuck on the memory semaphore.
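A minimal sketch of the fallback idea (toy names, not the actual Seastar semaphore API): try to account the buffer against the memory semaphore, and when that fails, hand out a single uncounted buffer so progress continues.

```python
import threading

class MemorySemaphore:
    """Toy counting semaphore tracking available buffer memory in bytes."""
    def __init__(self, available: int):
        self._available = available
        self._lock = threading.Lock()

    def try_acquire(self, units: int) -> bool:
        with self._lock:
            if self._available >= units:
                self._available -= units
                return True
            return False

def next_buffer(sem: MemorySemaphore, chunk_size: int):
    # Normal path: the buffer is counted against the memory semaphore.
    if sem.try_acquire(chunk_size):
        return ("counted", chunk_size)
    # Fallback path: memory is exhausted, so feed a single *uncounted*
    # buffer -- slow but steady progress instead of blocking forever.
    return ("uncounted", chunk_size)
```

With 10 bytes available and 8-byte chunks, the first buffer is counted and the second falls back to the uncounted path.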

Fixes: https://github.com/scylladb/scylladb/issues/25453
Fixes: https://github.com/scylladb/scylladb/issues/25262

Closes scylladb/scylladb#25452

(cherry picked from commit dd51e50f60)

Closes scylladb/scylladb#25511
2025-08-15 13:30:38 +03:00
Anna Stuchlik
d8d5ab1032 doc: document support for new z3 instance types
This commit adds new z3 instances we now support to the list of GCP instance types.

Fixes https://github.com/scylladb/scylladb/issues/25438

Closes scylladb/scylladb#25446

(cherry picked from commit 841ba86609)

Closes scylladb/scylladb#25512
2025-08-15 13:30:11 +03:00
Andrzej Jackowski
82ee1bf9cb test: audit: add logging of get_audit_log_list and set_of_rows_before
Without those logs, analysing some test failures is difficult.

Refs: scylladb/scylladb#25442

Closes scylladb/scylladb#25485

(cherry picked from commit bf8be01086)

Closes scylladb/scylladb#25514
2025-08-15 13:29:56 +03:00
Abhinav Jha
5e018831f8 raft: replication test: change rpc_propose_conf_change test to SEASTAR_THREAD_TEST_CASE
The RAFT_TEST_CASE macro creates 2 test cases, one with random 20% packet
loss, named name_drops. The framework makes hard-coded assumptions about
the leader which don't hold well in the case of packet losses.

This short term fix disables the packet drop variant of the specified test.
It should be safe to re-enable it once the whole framework is re-worked to
remove these hard coded assumptions.

This PR fixes a bug. Hence we need to backport it.

Fixes: scylladb/scylladb#23816

Closes scylladb/scylladb#25489

(cherry picked from commit a0ee5e4b85)

Closes scylladb/scylladb#25528
2025-08-15 13:29:42 +03:00
Jenkins Promoter
a162e0256e Update pgo profiles - aarch64 2025-08-15 05:28:08 +03:00
Jenkins Promoter
adbbbf87c3 Update pgo profiles - x86_64 2025-08-15 05:05:35 +03:00
Sergey Zolotukhin
e2dcd559b6 transport: Handle sleep aborted exception in sleep_until_timeout_passes
In PR #23156, a new function `sleep_until_timeout_passes` was introduced
to wait until a read request times out or completes. However, the function
did not handle cases where the sleep is aborted via _abort_source, which
could result in WARN messages like "Exceptional future is ignored" during
shutdown.

This change adds proper handling for that exception, eliminating the warning.
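The shape of the fix can be modeled in Python asyncio (illustrative only; the real code uses Seastar futures and `_abort_source`): race the sleep against the abort signal and explicitly handle the cancelled branch so nothing surfaces as an ignored exceptional future.

```python
import asyncio

async def sleep_until_timeout_passes(delay: float,
                                     abort_event: asyncio.Event) -> str:
    """Sleep for `delay` seconds, but wake up cleanly if aborted first."""
    sleep_task = asyncio.create_task(asyncio.sleep(delay))
    abort_task = asyncio.create_task(abort_event.wait())
    done, pending = await asyncio.wait({sleep_task, abort_task},
                                       return_when=asyncio.FIRST_COMPLETED)
    for task in pending:
        task.cancel()  # handle, rather than leak, the aborted branch
    return "aborted" if abort_task in done else "slept"
```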

(cherry picked from commit 061089389c)
2025-08-14 13:22:36 +00:00
Sergey Zolotukhin
665530e479 generic_server: replace empty destructor with = default
This change improves code readability by explicitly marking the destructor as defaulted.

(cherry picked from commit 27b3d5b415)
2025-08-14 13:22:36 +00:00
Sergey Zolotukhin
d729529226 generic_server: refactor connection::shutdown to use shutdown_input and shutdown_output
This change improves logging and modifies the behavior to attempt closing
the output side of a connection even if an error occurs while closing the input side.

(cherry picked from commit 3610cf0bfd)
2025-08-14 13:22:36 +00:00
Sergey Zolotukhin
2fef421534 generic_server: add shutdown_input and shutdown_output functions to
`connection` class.

The functions are just wrappers for `_fd.shutdown_input()` and `_fd.shutdown_output()`, with added error reporting.
Needed by later changes.

(cherry picked from commit 3848d10a8d)
2025-08-14 13:22:36 +00:00
Sergey Zolotukhin
0f99fa76de test: Add test for query execution during CQL server shutdown
This test simulates a scenario where a query is being executed while
the query coordinator begins shutting down the CQL server and client
connections. The shutdown process should wait until the query execution
is either completed or timed out.

Test for scylladb/scylladb#24481

(cherry picked from commit 122e940872)
2025-08-14 13:22:36 +00:00
Ernest Zaslavsky
c70ba8384e s3_client: make memory semaphore acquisition abortable
Add `abort_source` to the `get_units` call for the memory semaphore
in the S3 client, allowing the acquisition process to be aborted.

Fixes: https://github.com/scylladb/scylladb/issues/25454

Closes scylladb/scylladb#25469

(cherry picked from commit 380c73ca03)

Closes scylladb/scylladb#25499
2025-08-14 10:34:28 +02:00
Michael Litvak
761f722b6f test: test_mv_backlog: fix to consider internal writes
The test executes a single write, fetching metrics before and after the
write, and expects the total throttled writes count to be increased
exactly by one.

However, other internal writes (compaction for example) may be executed
during this time and be throttled, causing the metrics to be increased
by more than expected.

To address this, we filter the metrics by the scheduling group label of
the user write, to filter out the compaction writes that run in the
compaction scheduling group.

Fixes scylladb/scylladb#23139

(cherry picked from commit 276a09ac6e)
2025-08-12 14:51:10 +00:00
Michael Litvak
3a3b5bb14c test/pylib/rest_client: fix ScyllaMetrics filtering
In the ScyllaMetrics `get` function, when requesting the value for a
specific shard, it is expected to return the sum of all values of
metrics for that shard that match the labels.

However, it would return the value of the first matching line it finds
instead of summing all matching lines.

For example, if we have two lines for one shard like:
some_metric{scheduling_group_name="compaction",shard="0"} 1
some_metric{scheduling_group_name="sl:default",shard="0"} 2

The result of this call would be 1 instead of 3:
get('some_metric', shard="0")

We fix this to sum all matching lines.

The filtering of lines by labels is fixed to allow specifying only some
of the labels. Previously, for the line to match the filter, either the
filter needs to be empty, or all the labels in the metric line had to be
specified in the filter parameter and match its value, which is
unexpected, and breaks when more labels are added.

We also simplify the function signature and the implementation - instead
of having the shard as a separate parameter, it can be specified as a
label, like any other label.
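A hypothetical re-implementation of the fixed behavior (names are illustrative, not the real ScyllaMetrics code): sum the values of all lines matching the metric name and the given subset of labels.

```python
import re

LINE_RE = re.compile(r'^(?P<name>\w+)\{(?P<labels>[^}]*)\}\s+(?P<value>\S+)$')

def metric_sum(metrics_text: str, name: str, **labels) -> float:
    total = 0.0
    for line in metrics_text.splitlines():
        m = LINE_RE.match(line.strip())
        if not m or m.group('name') != name:
            continue
        line_labels = {}
        for kv in filter(None, m.group('labels').split(',')):
            k, v = kv.split('=', 1)
            line_labels[k] = v.strip('"')
        # Only the labels present in the filter must match; extra labels
        # on the line are ignored (previously an exact set was required).
        if all(line_labels.get(k) == v for k, v in labels.items()):
            total += float(m.group('value'))
    return total

TEXT = '''some_metric{scheduling_group_name="compaction",shard="0"} 1
some_metric{scheduling_group_name="sl:default",shard="0"} 2'''
print(metric_sum(TEXT, 'some_metric', shard='0'))  # prints 3.0
```

Note that the shard is passed as an ordinary label, matching the simplified signature described above.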

(cherry picked from commit 5c28cffdb4)
2025-08-12 14:51:09 +00:00
Patryk Jędrzejczak
b999aa85b9 docs: Raft recovery procedure: recommend verifying participation in Raft recovery
This instruction adds additional safety. The faster we notice that
a node didn't restart properly, the better.

The old gossip-based recovery procedure had a similar recommendation
to verify that each restarting node entered `RECOVERY` mode.

Fixes #25375

This is a documentation improvement. We should backport it to all
branches with the new recovery procedure, so 2025.2 and 2025.3.

Closes scylladb/scylladb#25376

(cherry picked from commit 7b77c6cc4a)

Closes scylladb/scylladb#25440
2025-08-11 15:49:20 +02:00
Anna Stuchlik
a655c0e193 doc: add new and removed metrics to the 2025.3 upgrade guide
This commit adds the list of new and removed metrics to the already existing upgrade guide
from 2025.2 to 2025.3.

Fixes https://github.com/scylladb/scylladb/issues/24697

Closes scylladb/scylladb#25385

(cherry picked from commit f3d9d0c1c7)

Closes scylladb/scylladb#25416
2025-08-11 06:56:31 +03:00
Botond Dénes
9775b2768b Merge '[Backport 2025.3] GCP Key Provider: Fix authentication issues' from Scylladb[bot]
* Fix discovery of application default credentials by using fully expanded pathnames (no tildes).
* Fix grant type in token request with user credentials.

Fixes #25345.

- (cherry picked from commit 77cc6a7bad)

- (cherry picked from commit b1d5a67018)

Parent PR: #25351

Closes scylladb/scylladb#25407

* github.com:scylladb/scylladb:
  encryption: gcp: Fix the grant type for user credentials
  encryption: gcp: Expand tilde in pathnames for credentials file
2025-08-11 06:52:38 +03:00
Botond Dénes
2048ac88f1 Merge '[Backport 2025.3] test.py: native pytest repeats' from Scylladb[bot]
The previous way of executing repeats was to launch pytest once per repeat.
That was resource-consuming, since pytest re-ran test discovery each time.
Now all repeats are done inside one pytest process.

Backport to 2025.3 is needed, since this functionality is framework-only, and 2025.3 is affected by the slow repeats as well.

Fixes: https://github.com/scylladb/scylladb/issues/25391

- (cherry picked from commit cc75197efd)

- (cherry picked from commit 557293995b)

- (cherry picked from commit 853bdec3ec)

- (cherry picked from commit d0e4045103)

Parent PR: #25073

Closes scylladb/scylladb#25392

* github.com:scylladb/scylladb:
  test.py: add repeats in pytest
  test.py: add directories and filename to the log files
  test.py: rename log sink file for boost tests
  test.py: better error handling in boost facade
2025-08-11 06:51:55 +03:00
Szymon Malewski
4c375b257b test/alternator: enable more relevant logs in CI.
This patch sets, for the alternator test suite, all 'alternator-*' loggers and the 'paxos' logger to trace level. This should significantly ease debugging of failed tests, while it has no effect on test time and increases log size by only 7%.
This affects running alternator tests only with `test.py`, not with `test/alternator/run`.

Closes #24645

Closes scylladb/scylladb#25327

(cherry picked from commit eb11485969)

Closes scylladb/scylladb#25383
2025-08-11 06:51:23 +03:00
Botond Dénes
ea6d0c880a Merge '[Backport 2025.3] test: audit: ignore cassandra user audit logs in AUTH tests' from Scylladb[bot]
Audit tests are vulnerable to noise from LOGIN queries (because AUTH
audit logs can appear at any time). Most tests already use the
`filter_out_noise` mechanism to remove this noise, but tests
focused on AUTH verification did not, leading to sporadic failures.

This change adds a filter to ignore AUTH logs generated by the default
"cassandra" user, so tests only verify logs from the user created
specifically for each test.

Additionally, this PR:
 - Adds missing `nonlocal new_rows` statement that prevented some checks from being called
 - Adds a testcase for audit logs of `cassandra` user

Fixes: https://github.com/scylladb/scylladb/issues/25069

It is better to backport those test changes to 2025.3; 2025.2 and earlier don't have `./cluster/dtest/audit_test.py`.

- (cherry picked from commit e634a2cb4f)

- (cherry picked from commit daf1c58e21)

- (cherry picked from commit aef6474537)

- (cherry picked from commit 21aedeeafb)

Parent PR: #25111

Closes scylladb/scylladb#25140

* github.com:scylladb/scylladb:
  test: audit: add cassandra user test case
  test: audit: ignore cassandra user audit logs in AUTH tests
  test: audit: change names of `filter_out_noise` parameters
2025-08-11 06:49:36 +03:00
Andrei Chekun
a4ea7b42c8 test.py: add repeats in pytest
The previous way of executing repeats was to launch pytest once per repeat.
That was resource-consuming, since pytest re-ran test discovery each time.
Now all repeats are done inside one pytest process.

(cherry picked from commit d0e4045103)
2025-08-08 15:27:25 +02:00
Benny Halevy
9a34622a47 scylla-sstable: print_query_results_json: continue loop if row is disengaged
Otherwise it is accessed right when exiting the if block.
Add a unit test reproducing the issue and validating the fix.

Fixes #25325

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes scylladb/scylladb#25326

(cherry picked from commit 5e5e63af10)

Closes scylladb/scylladb#25379
2025-08-08 11:43:34 +03:00
Nikos Dragazis
8838d8df5f encryption: gcp: Fix the grant type for user credentials
Exchanging a refresh token for an access token requires the
"refresh_token" grant type [1].

[1] https://datatracker.ietf.org/doc/html/rfc6749#section-6
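As a sketch of the correct request shape (a generic OAuth 2.0 token request body, not the actual Scylla GCP client code), exchanging a refresh token uses the `refresh_token` grant type:

```python
from urllib.parse import parse_qs, urlencode

def refresh_token_request_body(client_id: str, client_secret: str,
                               refresh_token: str) -> str:
    # RFC 6749 section 6: exchanging a refresh token for an access token
    # requires grant_type=refresh_token.
    return urlencode({
        "grant_type": "refresh_token",
        "refresh_token": refresh_token,
        "client_id": client_id,
        "client_secret": client_secret,
    })
```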

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
(cherry picked from commit b1d5a67018)
2025-08-07 21:46:24 +00:00
Nikos Dragazis
a69afb0d0b encryption: gcp: Expand tilde in pathnames for credentials file
The GCP host searches for application default credentials in known
locations within the user's home directory using
`seastar::file_exists()`. However, this function does not perform tilde
expansion in pathnames.

Replace tildes with the home directory from the HOME environment
variable.
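The expansion itself amounts to something like the following sketch (an illustrative Python equivalent; the credentials path shown is hypothetical):

```python
import os

def expand_tilde(path: str) -> str:
    """Expand a leading '~' using $HOME, since the plain file-existence
    check performs no tilde expansion by itself."""
    home = os.environ.get("HOME")
    if home and (path == "~" or path.startswith("~/")):
        return home + path[1:]
    return path
```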

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>
(cherry picked from commit 77cc6a7bad)
2025-08-07 21:46:24 +00:00
Andrei Chekun
8766750228 test.py: add directories and filename to the log files
Currently, only test function name used for output and log files. For better
clarity adding the relative path from the test directory of the file name
without extension to these files.
Before:
test_aggregate_avg.1.log
test_aggregate_avg_stdout.1.log
After:
boost.aggregate_fcts_test.test_aggregate_avg.1.log
boost.aggregate_fcts_test.test_aggregate_avg_stdout.3.log

(cherry picked from commit 853bdec3ec)
2025-08-07 10:46:58 +00:00
Andrei Chekun
c4cefc5195 test.py: rename log sink file for boost tests
Log sink is outputted in XML format not just simple text file. Renaming to have better clarity

(cherry picked from commit 557293995b)
2025-08-07 10:46:58 +00:00
Andrei Chekun
5f8e69a5d9 test.py: better error handling in boost facade
If test was not executed for some reason, for example not known parameter passed to the test, but boost framework was able to finish correctly, log file will have data but it will be parsed to an empty list. This will raise an exception in pytest execution, rather than produce test output. This change will handle this situation.

(cherry picked from commit cc75197efd)
2025-08-07 10:46:58 +00:00
Avi Kivity
0d54b72f21 Merge '[Backport 2025.3] truncate: change check for write during truncate into a log warning' from Scylladb[bot]
TRUNCATE TABLE performs a memtable flush and then discards the sstables of the table being truncated. It collects the highest replay position for both of these. When the highest replay position of the discarded sstables is higher than the highest replay position of the flushed memtable, that means that we have had writes during truncate which have been flushed to disk independently of the truncate process. We check for this and trigger an on_internal_error() which throws an exception, informing the user that writing data concurrently with TRUNCATE TABLE is not advised.

The problem with this is that truncate is also called from DROP KEYSPACE and DROP TABLE. These are raft operations and exceptions thrown by them are caught by the (...) exception handler in the raft applier fiber, which then exits leaving the node without the ability to execute subsequent raft commands.

This commit changes the on_internal_error() into a warning log entry. It also logs the keyspace/table names and the offending replay positions which caused the check to fail.

This PR also adds a test which validates that TRUNCATE works correctly with concurrent writes. More specifically, it checks that:
- all data written before TRUNCATE starts is deleted
- none of the data after TRUNCATE completes is deleted

Fixes: #25173
Fixes: #25013

Backport is needed in versions which check for truncate with concurrent writes using `on_internal_error()`: 2025.3 2025.2 2025.1

- (cherry picked from commit 268ec72dc9)

- (cherry picked from commit 33488ba943)

Parent PR: #25174

Closes scylladb/scylladb#25350

* github.com:scylladb/scylladb:
  truncate: add test for truncate with concurrent writes
  truncate: change check for write during truncate into a log warning
2025-08-07 12:19:45 +03:00
Andrzej Jackowski
1be1306233 test: audit: add cassandra user test case
Audit tests use the `filter_out_noise` function to remove noise from
audit logs generated by user authentication. As a result, none of the
existing tests covered audit logs for the default `cassandra` user.
This change adds a test case for that user.

Refs: scylladb/scylladb#25069
(cherry picked from commit 21aedeeafb)
2025-08-07 10:03:27 +02:00
Patryk Jędrzejczak
1863386bc8 Merge '[Backport 2025.3] Raft-based recovery procedure: simplify rolling restart with recovery_leader' from Scylladb[bot]
The following steps are performed in sequence as part of the
Raft-based recovery procedure:
- set `recovery_leader` to the host ID of the recovery leader in
  `scylla.yaml` on all live nodes,
- send the `SIGHUP` signal to all Scylla processes to reload the config,
- perform a rolling restart (with the recovery leader being restarted
  first).

These steps are not intuitive and more complicated than they could be.

In this PR, we simplify these steps. From now on, we will be able to
simply set `recovery_leader` on each node just before restarting it.

Apart from making necessary changes in the code, we also update all
tests of the Raft-based recovery procedure and the user-facing
documentation.

Fixes scylladb/scylladb#25015

The Raft-based procedure was added in 2025.2. This PR makes the
procedure simpler and less error-prone, so it should be backported
to 2025.2 and 2025.3.

- (cherry picked from commit ec69028907)

- (cherry picked from commit 445a15ff45)

- (cherry picked from commit 23f59483b6)

- (cherry picked from commit ba5b5c7d2f)

- (cherry picked from commit 9e45e1159b)

- (cherry picked from commit f408d1fa4f)

Parent PR: #25032

Closes scylladb/scylladb#25335

* https://github.com/scylladb/scylladb:
  docs: document the option to set recovery_leader later
  test: delay setting recovery_leader in the recovery procedure tests
  gossip: add recovery_leader to gossip_digest_syn
  db: system_keyspace: peers_table_read_fixup: remove rows with null host_id
  db/config, gms/gossiper: change recovery_leader to UUID
  db/config, utils: allow using UUID as a config option
2025-08-07 09:58:13 +02:00
Taras Veretilnyk
606db56cf3 docs: Sort commands list in nodetool.rst
Fixes scylladb/scylladb#25330

Closes scylladb/scylladb#25331

(cherry picked from commit bcb90c42e4)

Closes scylladb/scylladb#25372
2025-08-06 20:49:21 +03:00
Nikos Dragazis
26174a9c67 test: kmip: Fix segfault from premature destruction of port_promise
`kmip_test_helper()` is a utility function to spawn a dedicated PyKMIP
server for a particular Boost test case. The function runs the server as
an external process and uses a thread to parse the port from the
server's logs. The thread communicates the port to the main thread via
a promise.

The current implementation has a bug where the thread may set a value
to the promise after its destruction, causing a segfault. This happens
when the server does not start within 20 seconds, in which case the port
future throws and the stack unwinding machinery destroys the port
promise before the thread that writes to it.

Fix the bug by declaring the promise before the cleanup action.

The bug has been encountered in CI runs on slow machines, where the
PyKMIP server takes too long to create its internal tables (due to slow
fdatasync calls from SQLite). This patch does not improve CI stability -
it only ensures that the error condition is properly reflected in the
test output.

This patch is not a backport. The same bug has been fixed in master as
part of a larger rewrite of the `kmip_test_helper()` (see 722e2bce96).

Refs #24747, #24842.
Fixes #24574.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>

Closes scylladb/scylladb#25030
2025-08-06 11:59:40 +03:00
Pavel Emelyanov
4fcf0a620c Merge '[Backport 2025.3] Simplify credential reload: remove internal expiration checks' from Scylladb[bot]
This PR introduces a refinement in how credential renewal is triggered. Previously, the system attempted to renew credentials one hour before their expiration, but the credentials provider did not recognize them as expired—resulting in a no-op renewal that returned existing credentials. This led the timer fiber to immediately retry renewal, causing a renewal storm.

To resolve this, we remove the expiration check (and any other checks) from the `reload` method, assuming that callers of this method know what they are doing.

Fixes: https://github.com/scylladb/scylladb/issues/25044

Should be backported to 2025.3 since we need this fix for the restore

- (cherry picked from commit 68855c90ca)

- (cherry picked from commit e4ebe6a309)

- (cherry picked from commit 837475ec6f)

Parent PR: #24961

Closes scylladb/scylladb#25347

* github.com:scylladb/scylladb:
  s3_creds: code cleanup
  s3_creds: Make `reload` unconditional
  s3_creds: Add test exposing credentials renewal issue
2025-08-06 11:33:19 +03:00
Aleksandra Martyniuk
2282a11405 tasks: change _finished_children type
Parent task keeps a vector of statuses (task_essentials) of its finished
children. When the number of children is large - for example because we
have many tables and a child task is created for each table - we may hit
an oversized allocation while adding new child essentials to the vector.

Keep the task_essentials of children in a chunked_vector.
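The idea behind a chunked vector is to split storage into fixed-size chunks so that no single contiguous allocation grows with the element count. A toy Python model (not Scylla's `utils::chunked_vector`):

```python
class ChunkedVector:
    """Appends go into fixed-size chunks, avoiding one large
    contiguous allocation that grows with the number of elements."""
    def __init__(self, chunk_size: int = 128):
        self._chunk_size = chunk_size
        self._chunks = [[]]

    def append(self, item):
        if len(self._chunks[-1]) == self._chunk_size:
            self._chunks.append([])  # start a new small allocation
        self._chunks[-1].append(item)

    def __len__(self):
        return sum(len(c) for c in self._chunks)

    def __iter__(self):
        for chunk in self._chunks:
            yield from chunk
```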

Fixes: #25040.

Closes scylladb/scylladb#25064

(cherry picked from commit b5026edf49)

Closes scylladb/scylladb#25319
2025-08-06 07:36:04 +03:00
Michał Jadwiszczak
c31f47026d storage_service, group0_state_machine: move SL cache update from topology_state_load() to load_snapshot()
Currently the service levels cache is unnecessarily updated in every
call of `topology_state_load()`.
But it is enough to reload it only when a snapshot is loaded.
(The cache is also already updated when there is a change to one of
`service_levels_v2`, `role_members`, `role_attributes` tables.)

Fixes scylladb/scylladb#25114
Fixes scylladb/scylladb#23065

Closes scylladb/scylladb#25116

(cherry picked from commit 10214e13bd)

Closes scylladb/scylladb#25305
2025-08-06 07:27:48 +03:00
Aleksandra Martyniuk
4f0e5bf429 api: storage_service: do not log the exception that is passed to user
The exceptions that are thrown by tasks started via the API are
propagated to users. Hence, there is no need to log them.

Remove the logging of exceptions in user-started tasks.

Fixes: https://github.com/scylladb/scylladb/issues/16732.

Closes scylladb/scylladb#25153

(cherry picked from commit e607ef10cd)

Closes scylladb/scylladb#25298
2025-08-06 07:27:05 +03:00
Andrei Chekun
85769131a2 docs: update documentation with new way of running C++ tests
The documentation had outdated information on how to run C++ tests.
Additionally, some information is added about the gathered test metrics.

Closes scylladb/scylladb#25180

(cherry picked from commit a6a3d119e8)

Closes scylladb/scylladb#25291
2025-08-06 07:25:44 +03:00
Dawid Mędrek
c5e1e28076 test: Enable RF-rack-valid keyspaces in all Python suites
We're enabling the configuration option `rf_rack_valid_keyspaces`
in all Python test suites. All relevant tests have been adjusted
to work with it enabled.

That encompasses the following suites:

* alternator,
* broadcast_tables,
* cluster (already enabled in scylladb/scylladb@ee96f8dcfc),
* cql,
* cqlpy (already enabled in scylladb/scylladb@be0877ce69),
* nodetool,
* rest_api.

Two remaining suites that use tests written in Python, redis and scylla_gdb,
are not affected, at least not directly.

The redis suite requires creating an instance of Scylla manually, and the tests
don't do anything that could violate the restriction.

The scylla_gdb suite focuses on testing the capabilities of scylla-gdb.py, but
even then it reuses the `run` file from the cqlpy suite.

Fixes scylladb/scylladb#25126

Closes scylladb/scylladb#24617

(cherry picked from commit b41151ff1a)

Closes scylladb/scylladb#25231
2025-08-06 07:17:40 +03:00
Tomasz Grabiec
71dd30fc25 topology_coordinator: Trigger load stats refresh after replace
Otherwise, tablet rebuild will be delayed for up to 60s, as the tablet
scheduler needs load stats for the new (replacing) node to make
decisions.

Fixes #25163

Closes scylladb/scylladb#25181

(cherry picked from commit 55116ee660)

Closes scylladb/scylladb#25216
2025-08-06 07:16:45 +03:00
Ferenc Szili
db3777c703 truncate: add test for truncate with concurrent writes
test_validate_truncate_with_concurrent_writes checks if truncate deletes
all the data written before the truncate starts, and does not delete any
data after truncate completes.

(cherry picked from commit 33488ba943)
2025-08-06 00:52:15 +00:00
Ferenc Szili
0248f555da truncate: change check for write during truncate into a log warning
TRUNCATE TABLE performs a memtable flush and then discards the sstables
of the table being truncated. It collects the highest replay position
for both of these. When the highest replay position of the discarded
sstables is higher than the highest replay position of the flushed
memtable, that means that we have had writes during truncate which have
been flushed to disk independently of the truncate process. We check for
this and trigger an on_internal_error() which throws an exception,
informing the user that writing data concurrently with TRUNCATE TABLE is
not advised.

The problem with this is that truncate is also called from DROP KEYSPACE
and DROP TABLE. These are raft operations and exceptions thrown by them
are caught by the (...) exception handler in the raft applier fiber,
which then exits leaving the node without the ability to execute
subsequent raft commands.

This commit changes the on_internal_error() into a warning log entry. It
also logs the keyspace/table names, the truncated_at timepoint, and the
offending replay positions which caused the check to fail.
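The check can be sketched as follows (an illustrative Python model with replay positions as plain integers; `check_writes_during_truncate` is an invented name, not the actual function):

```python
import logging

logger = logging.getLogger("truncate")

def check_writes_during_truncate(ks: str, cf: str, truncated_at,
                                 flushed_rp: int, discarded_rp: int) -> bool:
    """Return False (and warn) if the discarded sstables carry a higher
    replay position than the flushed memtable, i.e. writes raced with
    TRUNCATE."""
    if discarded_rp > flushed_rp:
        # Previously this condition tripped on_internal_error(); a warning
        # keeps DROP KEYSPACE/TABLE (which also truncate) from killing the
        # raft applier fiber with an uncaught exception.
        logger.warning("writes during TRUNCATE of %s.%s (truncated_at=%s): "
                       "discarded rp %s > flushed rp %s",
                       ks, cf, truncated_at, discarded_rp, flushed_rp)
        return False
    return True
```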

Fixes: #25173
Fixes: #25013
(cherry picked from commit 268ec72dc9)
2025-08-06 00:52:15 +00:00
Ernest Zaslavsky
88779a6884 s3_creds: code cleanup
Remove unnecessary code which is no longer used

(cherry picked from commit 837475ec6f)
2025-08-06 00:50:36 +00:00
Ernest Zaslavsky
ccdc98c8f0 s3_creds: Make reload unconditional
Assume that any caller invoking `reload` intends to refresh credentials.
Remove conditional logic that checks for expiration before reloading.

(cherry picked from commit e4ebe6a309)
2025-08-06 00:50:36 +00:00
Ernest Zaslavsky
bd79ae3826 s3_creds: Add test exposing credentials renewal issue
Add a test demonstrating that renewing credentials does not update
their expiration. After requesting credentials again, the expiration
remains unchanged, indicating no actual update occurred.

(cherry picked from commit 68855c90ca)
2025-08-06 00:50:36 +00:00
Botond Dénes
f212f6af28 Merge 'repair: distribute tablet_repair_task_metas between shards' from Aleksandra Martyniuk
Currently, in repair_service::repair_tablets a shard that initiates
the repair keeps repair_tablet_metas of all tablets that have a replica
on this node (on any shard). This may lead to oversized allocations.

Modify tablet_repair_task_impl to repair only the tablets whose replicas
are kept on this shard. Modify repair_service::repair_tablets to gather
repair_tablet_metas only on the local shard. repair_tablets is invoked on
all shards.

Add a new legacy_tablet_repair_task_impl that covers tablet repair started with
async_repair. A user can use the sequence number of this task to manage the repair
using storage_service API.

In a test that reproduced this, we have seen 11136 tablets and a 5636096-byte
allocation failure. If we had a node with 250 shards, with 100 tablets each,
we could reach 12MB kept on one shard for the whole repair time.

Fixes: https://github.com/scylladb/scylladb/issues/23632

Needs backport to all live branches as they are all vulnerable to such crashes.

Closes scylladb/scylladb#24194

* github.com:scylladb/scylladb:
  repair: distribute tablet_repair_task_meta among shards
  repair: do not keep erm in tablet_repair_task_meta
2025-08-05 22:36:59 +03:00
Avi Kivity
a8193bd503 Merge '[Backport 2025.3] transport: remove throwing protocol_exception on connection start' from Dario Mirovic
`protocol_exception` is thrown in several places. This has become a performance issue, especially when starting/restarting a server. To alleviate this issue, throwing the exception has to be replaced with returning it as a result or an exceptional future.

This PR replaces throws in the `transport/server` module. This is achieved by using result_with_exception, and in some places, where suitable, just by creating and returning an exceptional future.

There are four commits in this PR. The first commit introduces tests in `test/cqlpy`. The second commit refactors transport server `handle_error` to not rethrow exceptions. The third commit refactors reusable buffer writer callbacks. The fourth commit replaces throwing `protocol_exception` to returning it.

Based on the comments on an issue linked in https://github.com/scylladb/scylladb/issues/24567, the main culprit from the side of protocol exceptions is the invalid protocol version one, so I tested that exception for performance.

In order to see if there is a measurable difference, a modified version of the `test_protocol_version_mismatch` Python test is used, with 100'000 runs across 10 processes (not threads, to avoid the Python GIL). One test run consisted of 1 warm-up run and 5 measured runs. The first test run was executed on the current code, with throwing protocol exceptions. The second test run was executed on the new code, with returning protocol exceptions. The performance report is in https://github.com/scylladb/scylladb/pull/24738#issuecomment-3051611069. It shows ~10% gains in real, user, and sys time for this test.

Testing

Build: `release`

Test file: `test/cqlpy/test_protocol_exceptions.py`
Test name: `test_protocol_version_mismatch` (modified for mass connection requests)

Test arguments:
```
max_attempts=100'000
num_parallel=10
```

Throwing `protocol_exception` results:
```
real=1:26.97  user=10:00.27  sys=2:34.55  cpu=867%
real=1:26.95  user=9:57.10  sys=2:32.50  cpu=862%
real=1:26.93  user=9:56.54  sys=2:35.59  cpu=865%
real=1:26.96  user=9:54.95  sys=2:32.33  cpu=859%
real=1:26.96  user=9:53.39  sys=2:33.58  cpu=859%

real=1:26.95 user=9:56.85 sys=2:34.11 cpu=862%   # average
```

Returning `protocol_exception` as `result_with_exception` or an exceptional future:
```
real=1:18.46  user=9:12.21  sys=2:19.08  cpu=881%
real=1:18.44  user=9:04.03  sys=2:17.91  cpu=869%
real=1:18.47  user=9:12.94  sys=2:19.68  cpu=882%
real=1:18.49  user=9:13.60  sys=2:19.88  cpu=883%
real=1:18.48  user=9:11.76  sys=2:17.32  cpu=878%

real=1:18.47 user=9:10.91 sys=2:18.77 cpu=879%   # average
```

This PR replaced `transport/server` throws of `protocol_exception` with returns. There are a few other places where protocol exceptions are thrown, and there are many places where `invalid_request_exception` is thrown. That is out of scope of this single PR, so the PR just refs, and does not resolve issue #24567.

Refs: #24567
Fixes: #25271

This PR improves performance in cases when protocol exceptions happen, for example during connection storms. It will require backporting.

* (cherry picked from commit 7aaeed012e)

* (cherry picked from commit 30d424e0d3)

* (cherry picked from commit 9f4344a435)

* (cherry picked from commit 5390f92afc)

* (cherry picked from commit 4a6f71df68)

Parent PR: #24738

Closes scylladb/scylladb#25117

* github.com:scylladb/scylladb:
  test/cqlpy: add cpp exception metric test conditions
  transport/server: replace protocol_exception throws with returns
  utils/reusable_buffer: accept non-throwing writer callbacks via result_with_exception
  transport/server: avoid exception-throw overhead in handle_error
  test/cqlpy: add protocol_exception tests
2025-08-05 14:16:14 +03:00
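The pattern the PR applies is "return the error instead of throwing it". A minimal Python sketch of the idea (names hypothetical, not ScyllaDB code): raising an exception pays for stack unwinding on every bad connection attempt, while returning an error object is an ordinary, cheap code path the caller branches on, analogous to result_with_exception or an exceptional future.

```python
from dataclasses import dataclass

# Hypothetical supported protocol versions; the real set lives in the server.
SUPPORTED_VERSIONS = {3, 4, 5}

@dataclass
class ProtocolError:
    message: str

def check_version_throwing(v):
    # Old style: the failure path unwinds the stack.
    if v not in SUPPORTED_VERSIONS:
        raise ValueError(f"unsupported protocol version: {v}")
    return v

def check_version_returning(v):
    # New style: the failure is a plain return value.
    if v not in SUPPORTED_VERSIONS:
        return ProtocolError(f"unsupported protocol version: {v}")
    return v
```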
Patryk Jędrzejczak
eb8ea703d5 docs: document the option to set recovery_leader later
In one of the previous commits, we made it possible to set
`recovery_leader` on each node just before restarting it. Here, we
update the corresponding documentation.

(cherry picked from commit f408d1fa4f)
2025-08-05 10:59:39 +00:00
Patryk Jędrzejczak
ac7945e044 test: delay setting recovery_leader in the recovery procedure tests
In the previous commit, we made it possible to set `recovery_leader`
on each node just before restarting it. Here, we change all the
tests of the Raft-based recovery procedure to use and test this option.

(cherry picked from commit 9e45e1159b)
2025-08-05 10:59:39 +00:00
Patryk Jędrzejczak
79c27454d4 gossip: add recovery_leader to gossip_digest_syn
In the new Raft-based recovery procedure, live nodes join the new
group 0 one by one during a rolling restart. There is a time window when
some of them are in the old group 0, while others are in the new group
0. This causes a group 0 mismatch in `gossiper::handle_syn_msg`. The
current solution for this problem is to ignore group 0 mismatches if
`recovery_leader` is set on the local node and to ask the administrator
to perform the rolling restart in the following way:
- set `recovery_leader` in `scylla.yaml` on all live nodes,
- send the `SIGHUP` signal to all Scylla processes to reload the config,
- proceed with the rolling restart.

This commit makes `gossiper::handle_syn_msg` ignore group 0 mismatches
when exactly one of the two gossiping nodes has `recovery_leader` set.
We achieve this by adding `recovery_leader` to `gossip_digest_syn`.
This change makes setting `recovery_leader` earlier on all nodes and
reloading the config unnecessary. From now on, the administrator can
simply restart each node with `recovery_leader` set.

However, note that nodes that join group 0 must have `recovery_leader`
set until all nodes join the new group 0. For example, assume that we
are in the middle of the rolling restart and one of the nodes in the new
group 0 crashes. It must be restarted with `recovery_leader` set, or
else it would reject `gossip_digest_syn` messages from nodes in the old
group 0. To avoid problems in such cases, we will continue to recommend
setting `recovery_leader` in `scylla.yaml` instead of passing it as
a command line argument.

(cherry picked from commit ba5b5c7d2f)
2025-08-05 10:59:39 +00:00
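The rule described above, ignore the group 0 mismatch when exactly one of the two gossiping nodes has `recovery_leader` set, is an XOR. A toy sketch (hypothetical helper, not the real gossiper code):

```python
def ignore_group0_mismatch(local_recovery_leader, remote_recovery_leader):
    """Return True iff exactly one side of the SYN exchange has
    recovery_leader set, i.e. one node is in the new group 0 and the
    other is still in the old one during the rolling restart."""
    return (local_recovery_leader is not None) != (remote_recovery_leader is not None)
```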
Patryk Jędrzejczak
4294669e72 db: system_keyspace: peers_table_read_fixup: remove rows with null host_id
Currently, `peers_table_read_fixup` removes rows with no `host_id`, but
not with null `host_id`. Null host IDs are known to appear in system
tables, for example in `system.cluster_status` after a failed bootstrap.
We should make sure we handle them properly if they ever appear
in `system.peers`.

This commit guarantees that null UUID cannot belong to
`loaded_endpoints` in `storage_service::join_cluster`, which in
particular ensures that we throw a runtime error when a user sets
`recovery_leader` to null UUID during the recovery procedure. This is
handled by the code verifying that `recovery_leader` belongs to
`loaded_endpoints`.

(cherry picked from commit 23f59483b6)
2025-08-05 10:59:39 +00:00
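The fixup rule amounts to dropping rows whose host_id is either absent or the all-zeros (null) UUID. A sketch with illustrative names (not the actual system_keyspace code):

```python
import uuid

NULL_UUID = uuid.UUID(int=0)  # the all-zeros "null" UUID

def peers_table_read_fixup(rows):
    # Drop peer rows whose host_id is absent *or* the null UUID; either
    # would otherwise leak an unusable entry into loaded_endpoints.
    return [r for r in rows if r.get("host_id") not in (None, NULL_UUID)]
```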
Patryk Jędrzejczak
74cf95a675 db/config, gms/gossiper: change recovery_leader to UUID
We change the type of the `recovery_leader` config parameter and
`gossip_config::recovery_leader` from sstring to UUID. `recovery_leader`
is supposed to store host ID, so UUID is a natural choice.

After changing the type to UUID, if the user provides an incorrect UUID,
parsing `recovery_leader` will fail early, but the start-up will
continue. Outside the recovery procedure, `recovery_leader` will then be
ignored. In the recovery procedure, the start-up will fail on:

```
throw std::runtime_error(
        "Cannot start - Raft-based topology has been enabled but persistent group 0 ID is not present. "
        "If you are trying to run the Raft-based recovery procedure, you must set recovery_leader.");
```

(cherry picked from commit 445a15ff45)
2025-08-05 10:59:39 +00:00
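Parsing the option as a UUID up front means a malformed value fails at config-load time. A sketch of the eager-parse behavior (hypothetical helper; the real option handling lives in db/config):

```python
import uuid

def parse_recovery_leader(raw):
    """Parse recovery_leader eagerly; a malformed UUID fails here,
    at config-load time, rather than deep inside the recovery path."""
    if not raw:
        return None  # option unset
    return uuid.UUID(raw)  # raises ValueError on a malformed UUID
```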
Patryk Jędrzejczak
d18d2fa0cf db/config, utils: allow using UUID as a config option
We change the `recovery_leader` option to UUID in the following commit.

(cherry picked from commit ec69028907)
2025-08-05 10:59:39 +00:00
Jenkins Promoter
3d4ec918ff Update ScyllaDB version to: 2025.3.0-rc3 2025-08-03 15:50:47 +03:00
Nikos Dragazis
257ebbeca9 test: Use in-memory SQLite for PyKMIP server
The PyKMIP server uses an SQLite database to store artifacts such as
encryption keys. By default, SQLite performs a full journal and data
flush to disk on every CREATE TABLE operation. Each operation triggers
three fdatasync(2) calls. If we multiply this by 16, the number of
tables created by the server, we get a significant number of file
syncs, which can take several seconds on slow machines.

This behavior has led to CI stability issues from KMIP unit tests where
the server failed to complete its schema creation within the 20-second
timeout (observed on spider9 and spider11).

Fix this by configuring the server to use an in-memory SQLite.

Fixes #24842.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>

Closes scylladb/scylladb#24995

(cherry picked from commit 2656fca504)

Closes scylladb/scylladb#25300
2025-08-02 17:12:05 +03:00
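The effect is easy to see with Python's stdlib sqlite3: an in-memory database performs DDL without touching disk, so the per-table fdatasync cost disappears. (This only illustrates the SQLite side; how PyKMIP is configured to use it is not shown here.)

```python
import sqlite3

# ":memory:" gives SQLite a database that never touches disk, so
# CREATE TABLE does not trigger any fdatasync(2) calls.
conn = sqlite3.connect(":memory:")
for i in range(16):  # the PyKMIP server creates 16 tables at startup
    conn.execute(f"CREATE TABLE t{i} (id INTEGER PRIMARY KEY, blob_data BLOB)")
tables = [r[0] for r in conn.execute(
    "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")]
```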
Ran Regev
7aa7f50b3a scylla.yaml: add recommended value for stream_io_throughput_mb_per_sec
Fixes: #24758

Updated scylla.yaml and the help for
scylla --help

Closes scylladb/scylladb#24793

(cherry picked from commit db4f301f0c)

Closes scylladb/scylladb#25280
2025-08-01 15:02:01 +03:00
Piotr Dulikowski
0dc700de70 Merge '[Backport 2025.3] qos: don't populate effective service level cache until auth is migrated to raft' from Scylladb[bot]
Right now, service levels are migrated in one group0 command and auth is migrated in the next one. This has a bad effect on the group0 state reload logic - modifying service levels in group0 causes the effective service levels cache to be recalculated, and to do so we need to fetch information about all roles. If the reload happens after SL upgrade and before auth upgrade, the query for roles will be directed to the legacy auth tables in system_auth - and the query, being a potentially remote query, has a timeout. If the query times out, it will throw an exception which will break the group0 apply fiber and the node will need to be restarted to bring it back to work.

In order to solve this issue, make sure that the service level module does not start populating and using the service level cache until both service levels and auth are migrated to raft. This is achieved by adding the check both to the cache population logic and the effective service level getter - they now consult the service level accessor's new method, `can_use_effective_service_level_cache`, which takes the auth version into account.

Fixes: scylladb/scylladb#24963

Should be backported to all versions which support upgrade to topology over raft - the issue described here may put the cluster into a state which is difficult to get out of (group0 apply fiber can break on multiple nodes, which necessitates their restart).

- (cherry picked from commit 2bb800c004)

- (cherry picked from commit 3a082d314c)

Parent PR: #25188

Closes scylladb/scylladb#25285

* github.com:scylladb/scylladb:
  test: sl: verify that legacy auth is not queried in sl to raft upgrade
  qos: don't populate effective service level cache until auth is migrated to raft
2025-08-01 08:49:13 +02:00
Jenkins Promoter
308400895f Update pgo profiles - aarch64 2025-08-01 05:19:18 +03:00
Jenkins Promoter
54b259bec9 Update pgo profiles - x86_64 2025-08-01 05:02:34 +03:00
Piotr Dulikowski
f27a3be62b test: sl: verify that legacy auth is not queried in sl to raft upgrade
Adjust `test_service_levels_upgrade`: right before upgrade to topology
on raft, enable an error injection which triggers when the standard role
manager is about to query the legacy auth tables in the
system_auth keyspace. The preceding commit which fixes
scylladb/scylladb#24963 makes sure that the legacy tables are not
queried during upgrade to topology on raft, so the error injection does
not trigger and does not cause a problem; without that commit, the test
fails.

(cherry picked from commit 3a082d314c)
2025-07-31 15:13:57 +00:00
Piotr Dulikowski
ba70b39486 qos: don't populate effective service level cache until auth is migrated to raft
Right now, service levels are migrated in one group0 command and auth
is migrated in the next one. This has a bad effect on the group0 state
reload logic - modifying service levels in group0 causes the effective
service levels cache to be recalculated, and to do so we need to fetch
information about all roles. If the reload happens after SL upgrade and
before auth upgrade, the query for roles will be directed to the legacy
auth tables in system_auth - and the query, being a potentially remote
query, has a timeout. If the query times out, it will throw
an exception which will break the group0 apply fiber and the node will
need to be restarted to bring it back to work.

In order to solve this issue, make sure that the service level module
does not start populating and using the service level cache until both
service levels and auth are migrated to raft. This is achieved by adding
the check both to the cache population logic and the effective service
level getter - they now consult the service level accessor's new method,
`can_use_effective_service_level_cache`, which takes the auth version
into account.

Fixes: scylladb/scylladb#24963
(cherry picked from commit 2bb800c004)
2025-07-31 15:13:57 +00:00
Andrzej Jackowski
12866e8f2e test: audit: ignore cassandra user audit logs in AUTH tests
Audit tests are vulnerable to noise from LOGIN queries (because AUTH
audit logs can appear at any time). Most tests already use the
`filter_out_noise` mechanism to remove this noise, but tests
focused on AUTH verification did not, leading to sporadic failures.

This change adds a filter to ignore AUTH logs generated by the default
"cassandra" user, so tests only verify logs from the user created
specifically for each test.

Fixes: scylladb/scylladb#25069
(cherry picked from commit aef6474537)
2025-07-31 17:01:29 +02:00
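The added filter can be sketched as follows (field names are hypothetical, not the actual audit-log schema): drop AUTH rows from the default "cassandra" user so each test only sees logs from its own test-specific user.

```python
def filter_out_noise(rows):
    """Drop AUTH audit rows generated by the default 'cassandra' user;
    AUTH rows from per-test users and all non-AUTH rows are kept."""
    return [r for r in rows
            if not (r["category"] == "AUTH" and r["username"] == "cassandra")]
```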
Andrzej Jackowski
e77a190f1a test: audit: change names of filter_out_noise parameters
This is a refactoring commit that changes the names of the parameters
of the `filter_out_noise` function, as well as names of related
variables. The motivation for the change is the introduction of more
complex filtering logic in the next commit of this patch series.

Refs: scylladb/scylladb#25069
(cherry picked from commit daf1c58e21)
2025-07-31 16:58:36 +02:00
Aleksandra Martyniuk
132e6495a3 repair: distribute tablet_repair_task_meta among shards
Currently, in repair_service::repair_tablets a shard that initiates
the repair keeps tablet_repair_task_meta of all tablets that have a replica
on this node (on any shard). This may lead to oversized allocations.

Add remote_metas class which takes care of distributing tablet_repair_task_meta
among different shards. An additional class remote_metas_builder was
added in order to ensure safety and separate writes and reads to meta
vectors.

Fixes: #23632
2025-07-31 15:56:53 +02:00
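The distribution step can be sketched like this (a toy model; the real remote_metas class also handles cross-shard memory ownership, which this ignores):

```python
def distribute_metas(metas, shard_count, shard_of):
    """Bucket per-tablet repair metas by their owning shard so that no
    single shard holds the metas for the whole node."""
    buckets = [[] for _ in range(shard_count)]
    for meta in metas:
        buckets[shard_of(meta)].append(meta)
    return buckets
```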
Aleksandra Martyniuk
603a2dbb10 repair: do not keep erm in tablet_repair_task_meta
Do not keep erm in tablet_repair_task_meta to avoid non-owner shared
pointer access when metas are distributed among shards.

Pass std::chunked_vector of erms to tablet_repair_task_impl to
preserve safety.
2025-07-31 15:56:43 +02:00
Dario Mirovic
7d300367c0 test/cqlpy: add cpp exception metric test conditions
Tested code paths should not throw exceptions. `scylla_reactor_cpp_exceptions`
metric is used. This is a global metric. To address potential test flakiness,
each test runs multiple times:
- `run_count = 100`
- `cpp_exception_threshold = 10`

If a change in the code introduced an exception, the expectation is that the
number of registered exceptions will exceed `cpp_exception_threshold` across
`run_count` runs, in which case the test fails.

Fixes: #25271
(cherry picked from commit 4a6f71df68)
2025-07-31 11:53:00 +02:00
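The thresholding amounts to comparing a metric delta against a budget over many runs, which tolerates unrelated background exceptions in a global metric. A sketch with stand-in counters (not the real cqlpy helpers):

```python
RUN_COUNT = 100
CPP_EXCEPTION_THRESHOLD = 10

def exceptions_within_budget(read_metric, run_once,
                             runs=RUN_COUNT, threshold=CPP_EXCEPTION_THRESHOLD):
    """Run the scenario many times; pass iff the global exception counter
    grew by at most `threshold` (tolerating unrelated noise)."""
    before = read_metric()
    for _ in range(runs):
        run_once()
    return (read_metric() - before) <= threshold
```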
Anna Stuchlik
4bc531d48d doc: add the upgrade guide from 2025.2 to 2025.3
This PR adds the upgrade guide from version 2025.2 to 2025.3.
Also, it removes the upgrade guide existing for the previous version
that is irrelevant in 2025.2 (upgrade from 2025.1 to 2025.2).

Note that the new guide does not include the "Enable Consistent Topology Updates" page and note,
as users upgrading to 2025.3 have consistent topology updates already enabled.

Fixes https://github.com/scylladb/scylladb/issues/24696

Closes scylladb/scylladb#25219

(cherry picked from commit 8365219d40)

Closes scylladb/scylladb#25248
2025-07-31 12:19:33 +03:00
Anna Stuchlik
f3ca644a55 doc: add OS support for ScyllaDB 2025.3
This commit adds the information about support for platforms in ScyllaDB version 2025.3.

Fixes https://github.com/scylladb/scylladb/issues/24698

Closes scylladb/scylladb#25220

(cherry picked from commit b67bb641bc)

Closes scylladb/scylladb#25249
2025-07-31 12:17:36 +03:00
Anna Stuchlik
573bbace20 doc: add tablets support information to the Drivers table
This commit:

- Extends the Drivers support table with information on which driver supports tablets
  and since which version.
- Adds the driver support policy to the Drivers page.
- Reorganizes the Drivers page to accommodate the updates.

In addition:
- The CPP-over-Rust driver is added to the table.
- The information about Serverless (which we don't support) is removed
  and replaced with tablets to correctly describe the contents of the table.

Fixes https://github.com/scylladb/scylladb/issues/19471

Refs https://github.com/scylladb/scylladb-docs-homepage/issues/69

Closes scylladb/scylladb#24635

(cherry picked from commit 18b4d4a77c)

Closes scylladb/scylladb#25251
2025-07-31 12:17:21 +03:00
Aleksandra Martyniuk
4630a2f9c5 streaming: close sink when exception is thrown
If an exception is thrown in result_handling_cont in streaming,
then the sink does not get closed. This leads to a node crash.

Close sink in exception handler.

Fixes: https://github.com/scylladb/scylladb/issues/25165.

Closes scylladb/scylladb#25238

(cherry picked from commit 99ff08ae78)

Closes scylladb/scylladb#25268
2025-07-31 12:17:05 +03:00
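The fix pattern (names illustrative, not the actual streaming code): close the sink on the failure path too, then re-raise, so a failing producer never leaves the sink open.

```python
import asyncio

class Sink:
    """Stand-in for a streaming sink that must always be closed."""
    def __init__(self):
        self.closed = False

    async def close(self):
        self.closed = True

async def stream_with_cleanup(sink, produce):
    try:
        await produce(sink)
    except Exception:
        # Without this close, a failing producer left the sink open,
        # eventually crashing the node. Close, then re-raise.
        await sink.close()
        raise
    await sink.close()
```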
Dario Mirovic
38a8318466 transport/server: replace protocol_exception throws with returns
Replace throwing protocol_exception with returning it as a result
or an exceptional future in the transport server module. This
improves performance, for example during connection storms and
server restarts, where protocol exceptions are more frequent.

In functions already returning a future, protocol exceptions are
propagated using an exceptional future. In functions not already
returning a future, result_with_exception is used.

Notable change is checking v.failed() before calling v.get() in
process_request function, to avoid throwing in case of an
exceptional future.

Refs: #24567
Fixes: #25271
(cherry picked from commit 5390f92afc)
2025-07-30 21:35:24 +02:00
Dario Mirovic
1078a1f03a utils/reusable_buffer: accept non-throwing writer callbacks via result_with_exception
Make make_bytes_ostream and make_fragmented_temporary_buffer accept
writer callbacks that return utils::result_with_exception instead of
forcing them to throw on error. This lets callers propagate failures
by returning an error result rather than throwing an exception.

Introduce buffer_writer_for, bytes_ostream_writer, and fragmented_buffer_writer
concepts to simplify and document the template requirements on writer callbacks.

This patch does not modify the actual callbacks passed, except for the syntax
changes needed for successful compilation, without changing the logic.

Refs: #24567
Fixes: #25271
(cherry picked from commit 9f4344a435)
2025-07-30 21:35:15 +02:00
Dario Mirovic
0679a7bb78 transport/server: avoid exception-throw overhead in handle_error
Previously, connection::handle_error always called f.get() inside a try/catch,
forcing every failed future to throw and immediately catch an exception just to
classify it. This change eliminates that extra throw/catch cycle by first checking
f.failed(), getting the stored std::exception_ptr via f.get_exception(), and
then dispatching on its type via utils::try_catch<T>(eptr).

The error-response logic is not changed - cassandra_exception, std::exception,
and unknown exceptions are caught and processed, and any exceptions thrown by
write_response while handling those exceptions continue to escape handle_error.

Refs: #24567
Fixes: #25271
(cherry picked from commit 30d424e0d3)
2025-07-30 21:34:56 +02:00
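Python's concurrent.futures.Future offers the same distinction: Future.exception() hands back the stored exception without re-raising, so it can be classified with isinstance instead of a throw/catch cycle. A sketch of the dispatch (the exception classes stand in for cassandra_exception and friends):

```python
from concurrent.futures import Future

class CassandraError(Exception):  # stands in for cassandra_exception
    pass

def handle_error(f: Future) -> str:
    # Check for failure first and fetch the stored exception directly,
    # instead of calling f.result() inside try/except, which would force
    # an extra raise/catch cycle just to classify the error.
    exc = f.exception()
    if exc is None:
        return "ok"
    if isinstance(exc, CassandraError):
        return "cassandra_error"
    if isinstance(exc, Exception):
        return "std_error"
    return "unknown_error"
```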
Dario Mirovic
918d4ab5fb test/cqlpy: add protocol_exception tests
Add a helper to fetch scylla_transport_cql_errors_total{type="protocol_error"} counter
from Scylla's metrics endpoint. These metrics are used to track protocol error
count before and after each test.

Add cql_with_protocol context manager utility for session creation with parameterized
protocol_version value. This is used for testing connection establishment with
different protocol versions, and proper disposal of successfully established sessions.

The tests cover two failure scenarios:
- Protocol version mismatch in test_protocol_version_mismatch which tests both supported
and unsupported protocol version
- Malformed frames via raw socket in _protocol_error_impl, used by several test functions,
and also by the test_no_protocol_exceptions test, which asserts that the error counters
never decrease during test execution, catching unintended metric resets

Refs: #24567
Fixes: #25271
(cherry picked from commit 7aaeed012e)
2025-07-30 21:34:31 +02:00
Patryk Jędrzejczak
7164f11b99 Merge '[Backport 2025.3] Revert 24418: main.cc: fix group0 shutdown order' from Petr Gusev
This PR reverts the changes of #24418 since they can cause use-after-free.

The `raft_group0::abort()` was called in `storage_service::do_drain` (introduced in #24418) to stop the group0 Raft server before destroying local storage. This was necessary because `raft::server` depends on storage (via `raft_sys_table_storage` and `group0_state_machine`).

However, this caused issues: services like `sstable_dict_autotrainer` and `auth::service`, which use `group0_client` but are not stopped by `storage_service`, could trigger use-after-free if `raft_group0` was destroyed too early. This can happen both during normal shutdown and when 'nodetool drain' is used.

This PR reverts two of the three commits from #24418. The commit [e456d2d](e456d2d507) is not reverted because it only affects logging and does not impact correctness.

Fixes scylladb/scylladb#25221

Backport: this PR is a backport

Closes scylladb/scylladb#25206

* https://github.com/scylladb/scylladb:
  Revert "main.cc: fix group0 shutdown order"
  Revert "storage_service: test_group0_apply_while_node_is_being_shutdown"
2025-07-30 16:18:13 +02:00
Pavel Emelyanov
99f328b7a7 Merge '[Backport 2025.3] s3_client: Enhance s3_client error handling' from Scylladb[bot]
Enhance and fix error handling in the `chunked_download_source` to prevent errors seeping from the request callback. Also stop retrying on seastar's side, since retrying would break the integrity of the data, which may be downloaded more than once for the same range.

Fixes: https://github.com/scylladb/scylladb/issues/25043

Should be backported to 2025.3 since we have an intention to release native backup/restore feature

- (cherry picked from commit d53095d72f)

- (cherry picked from commit b7ae6507cd)

- (cherry picked from commit ba910b29ce)

- (cherry picked from commit fc2c9dd290)

Parent PR: #24883

Closes scylladb/scylladb#25137

* github.com:scylladb/scylladb:
  s3_client: Disable Seastar-level retries in HTTP client creation
  s3_test: Validate handling of non-`aws_error` exceptions
  s3_client: Improve error handling in chunked_download_source
  aws_error: Add factory method for `aws_error` from exception
2025-07-29 14:42:45 +03:00
Pavel Emelyanov
07f46a4ad5 Merge '[Backport 2025.3] storage_service: cancel all write requests after stopping transports' from Scylladb[bot]
When a node shuts down, in storage service, after storage_proxy RPCs are stopped, some write handlers within storage_proxy may still be waiting for background writes to complete. These handlers hold appropriate ERMs to block schema changes before the write finishes. After the RPCs are stopped, these writes cannot receive the replies anymore.

If, at the same time, there are RPC commands executing `barrier_and_drain`, they may get stuck waiting for these ERM holders to finish, potentially blocking node shutdown until the writes time out.

This change introduces cancellation of all outstanding write handlers from storage_service after the storage proxy RPCs were stopped.

Fixes scylladb/scylladb#23665

Backport: since this fixes an issue that frequently causes issues in CI, backport to 2025.1, 2025.2, and 2025.3.

- (cherry picked from commit bc934827bc)

- (cherry picked from commit e0dc73f52a)

Parent PR: #24714

Closes scylladb/scylladb#25170

* github.com:scylladb/scylladb:
  storage_service: Cancel all write requests on storage_proxy shutdown
  test: Add test for unfinished writes during shutdown and topology change
2025-07-29 14:42:25 +03:00
Taras Veretilnyk
a9f5e7d18f docs: fix typo in command name enbleautocompaction -> enableautocompaction
Renamed the file and updated all references from 'enbleautocompaction' to the correct 'enableautocompaction'.

Fixes scylladb/scylladb#25172

Closes scylladb/scylladb#25175

(cherry picked from commit 6b6622e07a)

Closes scylladb/scylladb#25218
2025-07-29 14:41:50 +03:00
Petr Gusev
d8f6a497a5 Revert "main.cc: fix group0 shutdown order"
This reverts commit 6b85ab79d6.
2025-07-28 17:50:38 +02:00
Petr Gusev
c98dde92db Revert "storage_service: test_group0_apply_while_node_is_being_shutdown"
This reverts commit b1050944a3.
2025-07-28 17:49:03 +02:00
Aleksandra Martyniuk
8efee38d6f tasks: do not use binary progress for task manager tasks
Currently, progress of a parent task depends on expected_total_workload,
expected_children_number, and children progresses. Basically, if total
workload is known or all children have already been created, progresses
of children are summed up. Otherwise binary progress is returned.

As a result, two tasks of the same type may return progress in different
units. If they are children of the same task and this parent gathers the
progress - it becomes meaningless.

Drop expected_children_number as we can't assume that children are able
to report their progress.

Modify get_progress method - progress is calculated based on children
progresses. If expected_total_workload isn't specified, the total
progress of a task may grow. If expected_total_workload isn't specified
and no children are created, empty progress (0/0) is returned.

Fixes: https://github.com/scylladb/scylladb/issues/24650.

Closes scylladb/scylladb#25113

(cherry picked from commit a7ee2bbbd8)

Closes scylladb/scylladb#25200
2025-07-28 13:11:45 +03:00
Michael Litvak
934260e9a9 storage service: drain view builder before group0
The view builder uses group0 operations to coordinate view building, so
we should drain the view builder before stopping group0.

Fixes scylladb/scylladb#25096

Closes scylladb/scylladb#25101

(cherry picked from commit 3ff388cd94)

Closes scylladb/scylladb#25198
2025-07-28 13:05:14 +03:00
Nadav Har'El
583c118ccd Merge '[Backport 2025.3] alternator: avoid oversized allocation in Query/Scan' from Scylladb[bot]
This series fixes one cause of oversized allocations - and therefore potentially stalls and increased tail latencies - in Alternator.

The first patch in the series is the main fix - the later patches are cleanups requested by reviewers but also involved other pre-existing code, so I did those cleanups as separate patches.

Alternator's Scan or Query operation return a page of results. When the number of items is not limited by a "Limit" parameter, the default is to return a 1 MB page. If items are short, a large number of them can fit in that 1MB. The test test_query.py::test_query_large_page_small_rows has 30,000 items returned in a single page.

In the response JSON, all these items are returned in a single array "Items". Before this patch, we build the full response as a RapidJSON object before sending it. The problem is that unfortunately, RapidJSON stores arrays as contiguous allocations. This results in large contiguous allocations in workloads that scan many small items, and large contiguous allocations can also cause stalls and high tail latencies. For example, before this patch, running

    test/alternator/run --runveryslow \
        test_query.py::test_query_large_page_small_rows

reports in the log:

    oversized allocation: 573440 bytes.

After this patch, this warning no longer appears.
The patch solves the problem by collecting the scanned items not in a RapidJSON array, but rather in a chunked_vector<rjson::value>, i.e, a chunked (non-contiguous) array of items (each a JSON value). After collecting this array separately from the response object, we need to print its content without actually inserting it into the object - we add a new function print_with_extra_array() to do that.

The new separate-chunked-vector technique is used when a large number (currently, >256) of items were scanned. When there is a smaller number of items in a page (this is typical when each item is longer), we just insert those items in the object and print it as before.

Beyond the original slow test that demonstrated the oversized allocation (which is now gone), this patch also includes a new test which exercises the new code with a scan of 700 (>256) items in a page - but this new test is fast enough to be permanently in our test suite and not a manual "veryslow" test as the other test.

Fixes #23535

The stalls caused by large allocations were seen by actual users, so it makes sense to backport this patch. On the other hand, the patch, while not big, is fairly intrusive (it modifies the normal Scan and Query path, and the later patches do some cleanup of additional code), so there is some small risk involved in the backport.

- (cherry picked from commit 2385fba4b6)

- (cherry picked from commit d8fab2a01a)

- (cherry picked from commit 13ec94107a)

- (cherry picked from commit a248336e66)

Parent PR: #24480

Closes scylladb/scylladb#25194

* github.com:scylladb/scylladb:
  alternator: clean up by co-routinizing
  alternator: avoid spamming the log when failing to write response
  alternator: clean up and simplify request_return_type
  alternator: avoid oversized allocation in Query/Scan
2025-07-27 14:12:49 +03:00
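The core idea behind the fix, sketched as a toy chunked array (ScyllaDB's actual chunked_vector is C++ and more sophisticated): storage grows in fixed-size chunks, so no single contiguous allocation scales with the item count, and printing can stream the items chunk by chunk without ever building one big array.

```python
class ChunkedVector:
    """Items live in fixed-size chunks; no contiguous buffer grows
    with the total item count."""
    def __init__(self, chunk=256):
        self._chunk = chunk
        self._chunks = []

    def append(self, item):
        if not self._chunks or len(self._chunks[-1]) >= self._chunk:
            self._chunks.append([])  # allocate another small chunk
        self._chunks[-1].append(item)

    def __len__(self):
        return sum(len(c) for c in self._chunks)

    def __iter__(self):
        for chunk in self._chunks:
            yield from chunk
```

With a chunk size of 256, a 700-item page (as in the new test) occupies three small allocations instead of one large one.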
Nadav Har'El
f1c5350141 alternator: clean up by co-routinizing
Reviewers of the previous patch complained on some ugly pre-existing
code in alternator/executor.cc, where returning from an asynchronous
(future) function require lengthy verbose casts. So this patch cleans
up a few instances of these ugly casts by using co_return instead of
return.

For example, the long and verbose

    return make_ready_future<executor::request_return_type>(
        rjson::print(std::move(response)));

can be changed to the shorter and more readable

    co_return rjson::print(std::move(response));

This patch should not have any functional implications, nor any
performance implications: I only coroutinized slow-path functions and
one function that was already "partially" coroutinized (and this one was
especially ugly and deserved being fixed).

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
(cherry picked from commit a248336e66)
2025-07-27 07:42:01 +00:00
Nadav Har'El
f897f38003 alternator: avoid spamming the log when failing to write response
Both the make_streamed() and new make_streamed_with_extra_array()
functions, used when returning a long response in Alternator, would
write an error-level log message if they failed to write the response.
This log message is probably not helpful, and may spam the log if the
application causes repeated errors, intentionally or accidentally.

So drop these log messages. The exception is still thrown as usual.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
(cherry picked from commit 13ec94107a)
2025-07-27 07:42:01 +00:00
Nadav Har'El
fe037663ea alternator: clean up and simplify request_return_type
The previous patch introduced a function make_streamed_with_extra_array
which was a duplicate of the existing make_streamed. Reviewers
complained about how baroque the new function is (just like the old
function), having to jump through hoops to return a copyable function
working on non-copyable objects, making strangely-named copies and
shared pointers of everything.

We needed to return a copyable function (std::function) just because
Alternator used Seastar's json::json_return_type in the return type
of executor functions (request_return_type). This json_return_type
contained either an sstring or an std::function, but neither was ever
really appropriate:

  1. We want to return a noncopyable_function, not an std::function!
  2. We want to return an std::string (which rjson::print() returns),
     not an sstring!

So in this patch we stop using seastar::json::json_return_type
entirely in Alternator.

Alternator's request_return_type is now an std::variant of *three* types:
  1. std::string for short responses,
  2. noncopyable_function for long streamed response
  3. api_error for errors.

The ugliest parts of make_streamed() where we made copies and shared
pointers to allow for a copyable function are all gone. Even nicer, a
lot of other ugly relics of using seastar::json_return_type are gone:

1. We no longer need obscure classes and functions like make_jsonable()
   and json_string() to convert strings to response bodies - an operation
   can simply return a string directly - usually returning
   rjson::print(value) or a fixed string like "" and it just works.

2. There is no more usage of seastar::json in Alternator (except one
   minor use of seastar::json::formatter::to_json in streams.cc that
   can be removed later). Alternator uses RapidJSON for its JSON
   needs; we don't need to use random pieces from a different JSON
   library.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
(cherry picked from commit d8fab2a01a)
2025-07-27 07:42:01 +00:00
Nadav Har'El
b7da50d781 alternator: avoid oversized allocation in Query/Scan
This patch fixes one cause of oversized allocations - and therefore
potentially stalls and increased tail latencies - in Alternator.

Alternator's Scan or Query operation return a page of results. When the
number of items is not limited by a "Limit" parameter, the default is
to return a 1 MB page. If items are short, a large number of them can
fit in that 1MB. The test test_query.py::test_query_large_page_small_rows
has 30,000 items returned in a single page.

In the response JSON, all these items are returned in a single array
"Items". Before this patch, we build the full response as a RapidJSON
object before sending it. The problem is that unfortunately, RapidJSON
stores arrays as contiguous allocations. This results in large
contiguous allocations in workloads that scan many small items, and
large contiguous allocations can also cause stalls and high tail
latencies. For example, before this patch, running

    test/alternator/run --runveryslow \
        test_query.py::test_query_large_page_small_rows

reports in the log:

    oversized allocation: 573440 bytes.

After this patch, this warning no longer appears.
The patch solves the problem by collecting the scanned items not in a
RapidJSON array, but rather in a chunked_vector<rjson::value>, i.e.,
a chunked (non-contiguous) array of items (each a JSON value).
After collecting this array separately from the response object, we
need to print its content without actually inserting it into the object -
we add a new function print_with_extra_array() to do that.

The new separate-chunked-vector technique is used when a large number
(currently, >256) of items were scanned. When there is a smaller number
of items in a page (this is typical when each item is longer), we just
insert those items in the object and print it as before.

Beyond the original slow test that demonstrated the oversized allocation
(which is now gone), this patch also includes a new test which
exercises the new code with a scan of 700 (>256) items in a page -
but this new test is fast enough to be permanently in our test suite
and not a manual "veryslow" test as the other test.

Fixes #23535

(cherry picked from commit 2385fba4b6)
2025-07-27 07:42:01 +00:00
Pavel Emelyanov
7c04619ecf Merge '[Backport 2025.3] encryption_at_rest_test: Fix some spurious errors' from Scylladb[bot]
Fixes #24574

* Ensure we close the embedded load_cache objects on encryption shutdown; otherwise, in unit testing, we can get these destroyed while a timer is still active -> assert
* Add extra exception handling to `network_error_test_helper`, so even if the test framework lets an exception escape, we properly stop the network proxy to avoid use-after-free.

- (cherry picked from commit ee98f5d361)

- (cherry picked from commit 8d37e5e24b)

Parent PR: #24633

Closes scylladb/scylladb#24772

* github.com:scylladb/scylladb:
  encryption_at_rest_test: Add exception handler to ensure proxy stop
  encryption: Ensure stopping timers in provider cache objects
2025-07-24 16:35:53 +03:00
Pavel Emelyanov
b07f4fb26b Merge '[Backport 2025.3] streaming: Avoid deadlock by running view checks in a separate scheduling group' from Scylladb[bot]
This issue happens with removenode, when RBNO is disabled, so range
streamer is used.

The deadlock happens in a scenario like this:
1. Start 3 nodes: {A, B, C}, RF=2
2. Node A is lost
3. removenode A
4. Both B and C gain ownership of ranges.
5. Streaming sessions are started with crossed directions: B->C, C->B

Readers created by sender side exhaust streaming semaphore on B and C.
Receiver side attempts to obtain a permit indirectly by calling
check_needs_view_update_path(), which reads local tables. That read is
blocked and times out, causing streaming to fail. The streaming writer
is already using a tracking-only permit.

Even if we didn't deadlock, and the streaming semaphore was simply exhausted
by other receiving sessions (via tracking-only permits), the query may still time out due to starvation.

To avoid that, run the query under a different scheduling group, which
translates to the system semaphore instead of the maintenance
semaphore, to break the dependency. The gossip group was chosen
because it shouldn't be contended and this change should not interfere
with it much.

Fixes #24807
Fixes #24925

- (cherry picked from commit ee2fa58bd6)

- (cherry picked from commit dff2b01237)

Parent PR: #24929

Closes scylladb/scylladb#25058

* github.com:scylladb/scylladb:
  streaming: Avoid deadlock by running view checks in a separate scheduling group
  service: migration_manager: Run group0 barrier in gossip scheduling group
2025-07-24 16:35:24 +03:00
Ran Regev
c5f4ad3665 nodetool restore: sstable list from a file
Fixes: #25045

Added the ability to supply the list of files to
restore from a given file.
Mainly required for local testing.

Signed-off-by: Ran Regev <ran.regev@scylladb.com>

Closes scylladb/scylladb#25077

(cherry picked from commit dd67d22825)

Closes scylladb/scylladb#25124
2025-07-24 16:35:04 +03:00
Ran Regev
013e0d685c docs: update nodetool restore documentation for --sstables-file-list
Fixes: #25128
A leftover from #25077

Closes scylladb/scylladb#25129

(cherry picked from commit 3d82b9485e)

Closes scylladb/scylladb#25139
2025-07-24 16:34:39 +03:00
Jakub Smolar
800f819b5b gdb: handle zero-size reads in managed_bytes
Fixes: https://github.com/scylladb/scylladb/issues/25048

Closes scylladb/scylladb#25050

(cherry picked from commit 6e0a063ce3)

Closes scylladb/scylladb#25142
2025-07-24 16:34:04 +03:00
Sergey Zolotukhin
8ac6aaadaf storage_service: Cancel all write requests on storage_proxy shutdown
During a graceful node shutdown, RPC listeners are stopped in `storage_service::drain_on_shutdown`
as one of the first steps. However, even after RPCs are shut down, some write handlers in
`storage_proxy` may still be waiting for background writes to complete. These handlers retain the ERM.
Since the RPC subsystem is no longer active, replies cannot be received, and if any RPC commands are
concurrently executing `barrier_and_drain`, they may get stuck waiting for those writes. This can block
the messaging server shutdown and delay the entire shutdown process until the write timeout occurs.

This change introduces the cancellation of all outstanding write handlers in `storage_proxy`
during shutdown to prevent unnecessary delays.

Fixes scylladb/scylladb#23665

(cherry picked from commit e0dc73f52a)
2025-07-24 13:03:32 +00:00
Sergey Zolotukhin
16a8cd9514 test: Add test for unfinished writes during shutdown and topology change
This test reproduces an issue where a topology change and an ongoing write query
during query coordinator shutdown can cause the node to get stuck.

When a node receives a write request, it creates a write handler that holds
a copy of the current table's ERM (Effective Replication Map). The ERM ensures
that no topology or schema changes occur while the request is being processed.

After the query coordinator receives the required number of replica write ACKs
to satisfy the consistency level (CL), it sends a reply to the client. However,
the write response handler remains alive until all replicas respond — the remaining
writes are handled in the background.

During shutdown, when all network connections are closed, these responses can no longer
be received. As a result, the write response handler is only destroyed once the write
timeout is reached.

This becomes problematic because the ERM held by the handler blocks topology or schema
change commands from executing. Since shutdown waits for these commands to complete,
this can lead to unnecessary delays in node shutdown and restarts, and occasional
test case failures.

Test for: scylladb/scylladb#23665

(cherry picked from commit bc934827bc)
2025-07-24 13:03:32 +00:00
Ernest Zaslavsky
e45852a595 s3_client: Disable Seastar-level retries in HTTP client creation
Prevent Seastar from retrying HTTP requests to avoid buffer double-feed
issues when an entire request is retried. This could cause data
corruption in `chunked_download_source`. The change is global for every
instance of `s3_client`, but it is still safe because:
* Seastar's `http_client` resets connections regardless of retry behavior
* `s3_client` retry logic handles all error types—exceptions, HTTP errors,
  and AWS-specific errors—via `http_retryable_client`

(cherry picked from commit fc2c9dd290)
2025-07-22 16:46:54 +00:00
Ernest Zaslavsky
fdf706a6eb s3_test: Validate handling of non-aws_error exceptions
Inject exceptions not wrapped in `aws_error` from request callback
lambda to verify they are properly caught and handled.

(cherry picked from commit ba910b29ce)
2025-07-22 16:46:53 +00:00
Ernest Zaslavsky
2bc3accf9c s3_client: Improve error handling in chunked_download_source
Create aws_error from raised exceptions when possible and respond
appropriately. Previously, non-aws_exception types leaked from the
request handler and were treated as non-retryable, causing potential
data corruption during download.

(cherry picked from commit b7ae6507cd)
2025-07-22 16:46:53 +00:00
Ernest Zaslavsky
0106d132bd aws_error: Add factory method for aws_error from exception
Move `aws_error` creation logic out of `retryable_http_client` and
into the `aws_error` class to support reuse across components.

(cherry picked from commit d53095d72f)
2025-07-22 16:46:53 +00:00
Pavel Emelyanov
53637fdf61 Merge '[Backport 2025.3] storage: add make_data_or_index_source to the storages' from Scylladb[bot]
Add `make_data_or_index_source` to the storages to utilize new S3 based data source which should improve restore performance

* Introduce the `encrypted_data_source` class that wraps an existing data source to read and decrypt data on the fly using block encryption. Also add unit tests to verify correct decryption behavior.
* Add `make_data_or_index_source` to the `storage` interface; implement it for `filesystem_storage`, which simply creates a `data_source` from a file, and for `s3_storage`, which creates a (possibly) decrypting source from s3 make_download_source. This change should improve performance when reading large objects from S3, and should not affect anything for the `filesystem_storage`

Fixes: https://github.com/scylladb/scylladb/issues/22458

- (cherry picked from commit 211daeaa40)

- (cherry picked from commit 7e5e3c5569)

- (cherry picked from commit 0de61f56a2)

- (cherry picked from commit 8ac2978239)

- (cherry picked from commit dff9a229a7)

- (cherry picked from commit 8d49bb8af2)

Parent PR: #23695

Closes scylladb/scylladb#25016

* github.com:scylladb/scylladb:
  sstables: Start using `make_data_or_index_source` in `sstable`
  sstables: refactor readers and sources to use coroutines
  sstables: coroutinize futurized readers
  sstables: add `make_data_or_index_source` to the `storage`
  encryption: refactor key retrieval
  encryption: add `encrypted_data_source` class
2025-07-21 18:05:53 +03:00
Piotr Dulikowski
fdfcd67a6e Merge '[Backport 2025.3] cdc: Forbid altering columns of CDC log tables directly' from Scylladb[bot]
The set of columns of a CDC log table should be managed automatically
by Scylla, and the user should not have the ability to manipulate them
directly. That could lead to disastrous consequences such as a
segmentation fault.

In this commit, we're restricting those operations. We also provide two
validation tests.

One of the existing tests had to be adjusted as it modified the type
of a column in a CDC log table. Since the test simply verifies that
the user has sufficient permissions to perform `ALTER TABLE` on the log
table, the test is still valid.

Fixes scylladb/scylladb#24643

Backport: we should backport the change to all affected
branches to prevent the consequences that may affect the user.

- (cherry picked from commit 20d0050f4e)

- (cherry picked from commit 59800b1d66)

Parent PR: #25008

Closes scylladb/scylladb#25108

* github.com:scylladb/scylladb:
  cdc: Forbid altering columns of inactive CDC log table
  cdc: Forbid altering columns of CDC log tables directly
2025-07-21 16:22:31 +02:00
Dawid Mędrek
dc6cb5cfad cdc: Forbid altering columns of inactive CDC log table
When CDC becomes disabled on the base table, the CDC log table
still exists (cf. scylladb/scylladb@adda43edc7).
If it continues to exist up to the point when CDC is re-enabled
on the base table, no new log table will be created -- instead,
the old log table will be *re-attached*.

Since we want to avoid situations when the definition of the log
table has become misaligned with the definition of the base table
due to actions of the user, we forbid modifying the set of columns
or renaming them in CDC log tables, even when they're inactive.

Validation tests are provided.

(cherry picked from commit 59800b1d66)
2025-07-21 11:43:49 +00:00
Dawid Mędrek
10a9ced4d1 cdc: Forbid altering columns of CDC log tables directly
The set of columns of a CDC log table should be managed automatically
by Scylla, and the user should not have the ability to manipulate them
directly. That could lead to disastrous consequences such as a
segmentation fault.

In this commit, we're restricting those operations. We also provide two
validation tests.

One of the existing tests had to be adjusted as it modified the type
of a column in a CDC log table. Since the test simply verifies that
the user has sufficient permissions to perform `ALTER TABLE` on the log
table, the test is still valid.

Fixes scylladb/scylladb#24643

(cherry picked from commit 20d0050f4e)
2025-07-21 11:43:49 +00:00
Ernest Zaslavsky
934359ea28 s3_client: parse multipart response XML defensively
Ensure robust handling of XML responses when initiating multipart
uploads. Check for the existence of required nodes before accessing
them, and throw an exception if the XML is empty or malformed.

Refs: https://github.com/scylladb/scylladb/issues/24676

Closes scylladb/scylladb#24990

(cherry picked from commit 342e94261f)

Closes scylladb/scylladb#25057
2025-07-21 12:03:00 +02:00
Piotr Dulikowski
74d97711fd Merge '[Backport 2025.3] cdc: throw error if column doesn't exist' from Scylladb[bot]
In the CDC log transformer, when creating a CDC mutation based on some
base table mutation, for each value of a base column we set the value in
the CDC column with the same name.

When looking up the column in the CDC schema by name, we may get a null
pointer if a column by that name is not found. This shouldn't happen
normally because the base schema and CDC schema should be compatible,
and for each base column there should be a CDC column with the same
name.

However, there are scenarios where the base schema and CDC schema are
incompatible for a short period of time when they are being altered.
When a base column is being added or dropped, we could get a base
mutation with this column set, and then the CDC transformer picks up the
latest CDC schema which doesn't have this column.

If such a thing happens, we fix the code to throw an exception instead
of crashing on a null pointer dereference. Currently we don't have a
safer approach to handle this, but this might change in the future. The
other alternative is dropping that data silently, which we prefer not
to do.

Throwing an error is acceptable because this scenario most likely
indicates one of these user behaviors:
* The user adds a new column, and starts writing values to the column
  before the ALTER is complete; or,
* The user drops a column, and continues writing values to the column
  while it's being dropped.

Both cases might as well fail with an error because the column is not
found in the base table.

Fixes scylladb/scylladb#24952

backport needed - simple fix for a node crash

- (cherry picked from commit b336f282ae)

- (cherry picked from commit 86dfa6324f)

Parent PR: #24986

Closes scylladb/scylladb#25067

* github.com:scylladb/scylladb:
  test: cdc: add test_cdc_with_alter
  cdc: throw error if column doesn't exist
2025-07-21 11:18:06 +02:00
Jenkins Promoter
fc7a6b66e2 Update ScyllaDB version to: 2025.3.0-rc2 2025-07-20 15:44:21 +03:00
Michael Litvak
594ec7d66d test: cdc: add test_cdc_with_alter
Add a test that tests adding and dropping a column to a table with CDC
enabled while writing to it.

(cherry picked from commit 86dfa6324f)
2025-07-20 09:04:00 +02:00
Michael Litvak
338ff18dfe cdc: throw error if column doesn't exist
In the CDC log transformer, when creating a CDC mutation based on some
base table mutation, for each value of a base column we set the value in
the CDC column with the same name.

When looking up the column in the CDC schema by name, we may get a null
pointer if a column by that name is not found. This shouldn't happen
normally because the base schema and CDC schema should be compatible,
and for each base column there should be a CDC column with the same
name.

However, there are scenarios where the base schema and CDC schema are
incompatible for a short period of time when they are being altered.
When a base column is being added or dropped, we could get a base
mutation with this column set, and then the CDC transformer picks up the
latest CDC schema which doesn't have this column.

If such a thing happens, we fix the code to throw an exception instead
of crashing on a null pointer dereference. Currently we don't have a
safer approach to handle this, but this might change in the future. The
other alternative is dropping that data silently, which we prefer not
to do.

Throwing an error is acceptable because this scenario most likely
indicates one of these user behaviors:
* The user adds a new column, and starts writing values to the column
  before the ALTER is complete; or,
* The user drops a column, and continues writing values to the column
  while it's being dropped.

Both cases might as well fail with an error because the column is not
found in the base table.

Fixes scylladb/scylladb#24952

(cherry picked from commit b336f282ae)
2025-07-18 10:36:44 +00:00
Tomasz Grabiec
888e92c969 streaming: Avoid deadlock by running view checks in a separate scheduling group
This issue happens with removenode, when RBNO is disabled, so range
streamer is used.

The deadlock happens in a scenario like this:
1. Start 3 nodes: {A, B, C}, RF=2
2. Node A is lost
3. removenode A
4. Both B and C gain ownership of ranges.
5. Streaming sessions are started with crossed directions: B->C, C->B

Readers created by sender side exhaust streaming semaphore on B and C.
Receiver side attempts to obtain a permit indirectly by calling
check_needs_view_update_path(), which reads local tables. That read is
blocked and times out, causing streaming to fail. The streaming writer
is already using a tracking-only permit.

To avoid that, run the query under a different scheduling group, which
translates to the system semaphore instead of the maintenance
semaphore, to break the dependency. The gossip group was chosen
because it shouldn't be contended and this change should not interfere
with it much.

Fixes: #24807
(cherry picked from commit dff2b01237)
2025-07-17 17:25:44 +00:00
Tomasz Grabiec
f424c773a4 service: migration_manager: Run group0 barrier in gossip scheduling group
Fixes two issues.

One is potential priority inversion. The barrier will be executed
using scheduling group of the first fiber which triggers it, the rest
will block waiting on it. For example, CQL statements which need to
sync the schema on replica side can block on the barrier triggered by
streaming. That's undesirable. This is theoretical, not proved in the
field.

The second problem is blocking the error path. This barrier is called
from the streaming error handling path. If the streaming concurrency
semaphore is exhausted, and streaming fails due to timeout on
obtaining the permit in check_needs_view_update_path(), the error path
will block too because it will also attempt to obtain the permit as
part of the group0 barrier. Running it in the gossip scheduling group
prevents this.

Fixes #24925

(cherry picked from commit ee2fa58bd6)
2025-07-17 17:25:44 +00:00
Piotr Dulikowski
e49b312be9 auth: fix crash when migration code runs parallel with raft upgrade
The functions password_authenticator::start and
standard_role_manager::start have a similar structure: they spawn a
fiber which invokes a callback that performs some migration until that
migration succeeds. Both handlers set a shared promise called
_superuser_created_promise (those are actually two promises, one for the
password authenticator and the other for the role manager).

The handlers are similar in both cases. They check if auth is in legacy
mode, and behave differently depending on that. If in legacy mode, the
promise is set (if it was not set before), and some legacy migration
actions follow. In auth-on-raft mode, an attempt is made to create the
superuser, and if it succeeds, the promise is _unconditionally_ set.

While it makes sense at a glance to set the promise unconditionally,
there is a non-obvious corner case during upgrade to topology on raft.
During the upgrade, auth switches from the legacy mode to auth on raft
mode. Thus, if the callback didn't succeed in legacy mode and then tries
to run in auth-on-raft mode and succeeds, it will unconditionally set a
promise that was already set - this is a bug and triggers an assertion
in seastar.

Fix the issue by surrounding the `shared_promise::set_value` call with
an `if` - like it is already done for the legacy case.

Fixes: scylladb/scylladb#24975

Closes scylladb/scylladb#24976

(cherry picked from commit a14b7f71fe)

Closes scylladb/scylladb#25019
2025-07-17 13:32:35 +02:00
Ernest Zaslavsky
549d139e84 sstables: Start using make_data_or_index_source in sstable
Convert all necessary methods to be awaitable. Start using `make_data_or_index_source`
when creating data_source for data and index components.

For proper working of compressed/checksummed input streams, start passing
stream creator functors to `make_(checksummed/compressed)_file_(k_l/m)_format_input_stream`.

(cherry picked from commit 8d49bb8af2)
2025-07-16 12:45:58 +00:00
Ernest Zaslavsky
4a47262167 sstables: refactor readers and sources to use coroutines
Refactor readers and sources to support coroutine usage in
preparation for integration with `make_data_or_index_source`.
Move coroutine-based member initialization out of constructors
where applicable, and defer initialization until first use.

(cherry picked from commit dff9a229a7)
2025-07-16 12:45:58 +00:00
Ernest Zaslavsky
81d356315b sstables: coroutinize futurized readers
Coroutinize futurized readers and sources to get ready for using `make_data_or_index_source` in `sstable`

(cherry picked from commit 8ac2978239)
2025-07-16 12:45:58 +00:00
Ernest Zaslavsky
4ffd72e597 sstables: add make_data_or_index_source to the storage
Add `make_data_or_index_source` to the `storage` interface; implement it
for `filesystem_storage`, which simply creates a `data_source` from a
file, and for `s3_storage`, which creates a (possibly) decrypting source
from s3 make_download_source.

This change should improve performance when reading large objects
from S3, and should not affect anything for the `filesystem_storage`.

(cherry picked from commit 0de61f56a2)
2025-07-16 12:45:58 +00:00
Ernest Zaslavsky
8998f221ab encryption: refactor key retrieval
Get the encryption schema extension retrieval code out of
`wrap_file` method to make it reusable elsewhere

(cherry picked from commit 7e5e3c5569)
2025-07-16 12:45:58 +00:00
Ernest Zaslavsky
243ba1fb66 encryption: add encrypted_data_source class
Introduce the `encrypted_data_source` class that wraps an existing data
source to read and decrypt data on the fly using block encryption. Also add
unit tests to verify correct decryption behavior.
NOTE: The wrapped source MUST start reading from offset 0; `encrypted_data_source` assumes it does.

Co-authored-by: Calle Wilund <calle@scylladb.com>
(cherry picked from commit 211daeaa40)
2025-07-16 12:45:58 +00:00
Patryk Jędrzejczak
7caacf958b test: test_zero_token_nodes_multidc: properly handle reads with CL=ONE
The test could fail with RF={DC1: 2, DC2: 0} and CL=ONE when:
- both writes succeeded with the same replica responding first,
- one of the following reads succeeded with the other replica
  responding before it applied mutations from any of the writes.

We fix the test by not expecting reads with CL=ONE to return a row.

We also harden the test by inserting different rows for every pair
(CL, coordinator), where one of the two coordinators is a normal
node from DC1, and the other one is a zero-token node from DC2.
This change makes sure that, for example, every write really
inserts a row.

Fixes scylladb/scylladb#22967

The fix addresses CI flakiness and only changes the test, so it
should be backported.

Closes scylladb/scylladb#23518

(cherry picked from commit 21edec1ace)

Closes scylladb/scylladb#24985
2025-07-15 15:47:43 +02:00
Botond Dénes
489e4fdb4e Merge '[Backport 2025.3] S3 chunked download source bug fixes' from Scylladb[bot]
- Fix missing negation in the `if` in the background downloading fiber
- Add test to catch this case
- Improve the s3 proxy to inject errors if the same resource requested more than once
- Suppress client retry, since retrying the same request when each produces multiple buffers may lead to the same data appearing more than once in the buffer deque
- Inject exception from the test to simulate response callback failure in the middle

No need to backport anything since this class is not used yet

- (cherry picked from commit f1d0690194)

- (cherry picked from commit e73b83e039)

- (cherry picked from commit 6d9cec558a)

- (cherry picked from commit ec59fcd5e4)

- (cherry picked from commit c75acd274c)

- (cherry picked from commit d2d69cbc8c)

- (cherry picked from commit e50f247bf1)

- (cherry picked from commit 49e8c14a86)

- (cherry picked from commit a5246bbe53)

- (cherry picked from commit acf15eba8e)

Parent PR: #24657

Closes scylladb/scylladb#24943

* github.com:scylladb/scylladb:
  s3_test: Add s3_client test for non-retryable error handling
  s3_test: Add trace logging for default_retry_strategy
  s3_client: Fix edge case when the range is exhausted
  s3_client: Fix indentation in try..catch block
  s3_client: Stop retries in chunked download source
  s3_client: Enhance test coverage for retry logic
  s3_client: Add test for Content-Range fix
  s3_client: Fix missing negation
  s3_client: Refine logging
  s3_client: Improve logging placement for current_range output
2025-07-15 15:28:48 +03:00
Michael Litvak
26738588db tablets: stop storage group on deallocation
When a tablet transitions to a post-cleanup stage on the leaving replica
we deallocate its storage group. Before the storage can be deallocated
and destroyed, we must make sure it's cleaned up and stopped properly.

Normally this happens during the tablet cleanup stage, when
table::cleanup_table is called, so by the time we transition to the next
stage the storage group is already stopped.

However, it's possible that tablet cleanup did not run in some scenario:
1. The topology coordinator runs tablet cleanup on the leaving replica.
2. The leaving replica is restarted.
3. When the leaving replica starts, still in `cleanup` stage, it
   allocates a storage group for the tablet.
4. The topology coordinator moves to the next stage.
5. The leaving replica deallocates the storage group, but it was not
   stopped.

To address this scenario, we always stop the storage group when
deallocating it. Usually it will be already stopped and complete
immediately, and otherwise it will be stopped in the background.

Fixes scylladb/scylladb#24857
Fixes scylladb/scylladb#24828

Closes scylladb/scylladb#24896

(cherry picked from commit fa24fd7cc3)

Closes scylladb/scylladb#24909
2025-07-15 13:14:35 +03:00
Aleksandra Martyniuk
f69f59afbd repair: Reduce max row buf size when small table optimization is on
If small_table_optimization is on, a repair works on a whole table
simultaneously. It may be distributed across the whole cluster and
all nodes might participate in repair.

On a repair master, row buffer is copied for each repair peer.
This means that the memory scales with the number of peers.

In large clusters, repair with small_table_optimization leads to OOM.

Divide the max_row_buf_size by the number of repair peers if
small_table_optimization is on.

Use max_row_buf_size to calculate number of units taken from mem_sem.

Fixes: https://github.com/scylladb/scylladb/issues/22244.

Closes scylladb/scylladb#24868

(cherry picked from commit 17272c2f3b)

Closes scylladb/scylladb#24907
2025-07-15 13:13:49 +03:00
Łukasz Paszkowski
e1e0c721e7 test.py: Fix test_compactionhistory_rows_merged_time_window_compaction_strategy
The test had three major problems:
1. Wrongly computed time windows. Data was not spread across two 1-minute
   windows, causing the test to generate three sstables instead
   of two.
2. The timestamp was not propagated to the prepared CQL statements, so
   in fact a current time was used implicitly.
3. Because of the incorrect timestamp issue, the remaining tests
   testing purged tombstones were affected as well.

Fixes https://github.com/scylladb/scylladb/issues/24532

Closes scylladb/scylladb#24609

(cherry picked from commit a22d1034af)

Closes scylladb/scylladb#24791
2025-07-15 13:12:39 +03:00
Yaron Kaikov
05a6d4da23 dist/common/scripts/scylla_sysconfig_setup: fix SyntaxWarning: invalid escape sequence
There are invalid escape sequence warnings where raw strings should be used for the regex patterns
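The class of warning fixed here is easy to reproduce in any Python script: `"\d"` in a plain string literal is an invalid escape sequence, while a raw string is unambiguous. The pattern below is illustrative, not the script's actual regex:

```python
import re

# "\d" in a plain string literal triggers SyntaxWarning ("invalid escape
# sequence '\d'") on modern Python; the regex still happens to work only
# because unknown escapes currently fall through unchanged. A raw string
# makes the intent explicit and silences the warning.
pattern = r"SET_NICE_PRIO=(\d+)"  # hypothetical pattern for illustration

def parse_nice_prio(line: str):
    m = re.search(pattern, line)
    return int(m.group(1)) if m else None
```

The fix is mechanical: prefix every regex literal containing backslashes with `r`.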

Fixes: https://github.com/scylladb/scylladb/issues/24915

Closes scylladb/scylladb#24916

(cherry picked from commit fdcaa9a7e7)

Closes scylladb/scylladb#24970
2025-07-15 11:01:28 +02:00
Yaron Kaikov
1e1aeed3cd auto-backport.py: Avoid bot push to existing backport branches
Changed the backport logic so that the bot only pushes the backport branch if it does not already exist in the remote fork.
If the branch exists, the bot skips the push, allowing only users to update (force-push) the branch after the backport PR is open.
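The existence check described above can be sketched with `git ls-remote`, which prints a ref line when the branch exists in the remote and nothing otherwise. This is a simplified standalone sketch, not the bot's actual code:

```python
import subprocess

def branch_exists_in_fork(fork_repo: str, branch: str) -> bool:
    """Probe a remote fork for an existing backport branch.

    `git ls-remote --heads <remote> <branch>` prints "<sha>\trefs/heads/<branch>"
    when the branch exists and produces no output otherwise.
    """
    out = subprocess.run(
        ["git", "ls-remote", "--heads", fork_repo, branch],
        capture_output=True, text=True, check=True,
    ).stdout
    return bool(out.strip())

def should_push(remote_refs_output: str) -> bool:
    # Mirrors the patched logic: push only when ls-remote returned nothing,
    # leaving existing branches for users to force-push themselves.
    return not remote_refs_output.strip()
```

An empty `ls-remote` output is the signal that the bot is free to create the branch with a regular (non-force) push.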

Fixes: https://github.com/scylladb/scylladb/issues/24953

Closes scylladb/scylladb#24954

(cherry picked from commit ed7c7784e4)

Closes scylladb/scylladb#24969
2025-07-15 10:25:30 +02:00
Jenkins Promoter
af10d6f03b Update pgo profiles - aarch64 2025-07-15 05:21:25 +03:00
Jenkins Promoter
0d3742227d Update pgo profiles - x86_64 2025-07-15 04:58:36 +03:00
Yaron Kaikov
c6987e3fed packaging: add ps command to dependencies
The ScyllaDB container image doesn't have the ps command installed, while this command is used by the perftune.py script shipped in the same image. This breaks node and container tuning in Scylla Operator.

Fixes: #24827

Closes scylladb/scylladb#24830

(cherry picked from commit 66ff6ab6f9)

Closes scylladb/scylladb#24956
2025-07-14 14:19:17 +03:00
Ernest Zaslavsky
873c8503cd s3_test: Add s3_client test for non-retryable error handling
Introduce a test that injects a non-retryable error and verifies
that the chunked download source throws an exception as expected.

(cherry picked from commit acf15eba8e)
2025-07-13 13:17:14 +00:00
Ernest Zaslavsky
dbf4bd162e s3_test: Add trace logging for default_retry_strategy
Introduce trace-level logging for `default_retry_strategy` in
`s3_test` to improve visibility into retry logic during test
execution.

(cherry picked from commit a5246bbe53)
2025-07-13 13:17:14 +00:00
Ernest Zaslavsky
7f303bfda3 s3_client: Fix edge case when the range is exhausted
Handle case where the download loop exits after consuming all data,
but before receiving an empty buffer signaling EOF. Without this, the
next request is sent with a non-zero offset and zero length, resulting
in "Range request cannot be satisfied" errors. Now, an empty buffer is
pushed to indicate completion and exit the fiber properly.
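The loop structure described above can be modeled in a few lines; this is a toy Python sketch of the fixed behavior (names invented), where `fetch` stands in for one ranged S3 GET:

```python
import queue

EOF = b""  # empty buffer used as the end-of-stream sentinel

def download_range(fetch, total_length: int, chunks: "queue.Queue[bytes]") -> None:
    """Toy model of the chunked download fiber after the fix.

    Once the requested range is exhausted we push an explicit empty
    buffer instead of issuing a zero-length ranged request, which a
    real server would reject with "Range request cannot be satisfied".
    """
    offset = 0
    while offset < total_length:
        buf = fetch(offset, total_length - offset)
        if not buf:
            break  # server signalled EOF early
        chunks.put(buf)
        offset += len(buf)
    chunks.put(EOF)  # signal completion; the consumer exits on this
```

The consumer simply drains the queue until it sees the empty buffer, so the fiber always terminates cleanly whether EOF came from the server or from exhausting the range.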

(cherry picked from commit 49e8c14a86)
2025-07-13 13:17:14 +00:00
Ernest Zaslavsky
22739df69f s3_client: Fix indentation in try..catch block
Correct indentation in the `try..catch` block to improve code
readability and maintain consistent formatting.

(cherry picked from commit e50f247bf1)
2025-07-13 13:17:14 +00:00
Ernest Zaslavsky
54db6ca088 s3_client: Stop retries in chunked download source
Disable retries for S3 requests in the chunked download source to
prevent duplicate chunks from corrupting the buffer queue. The
response handler now throws an exception to bypass the retry
strategy, allowing the next range to be attempted cleanly.

This exception is only triggered for retryable errors; unretryable
ones immediately halt further requests.
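The control flow described above can be sketched as a response handler that converts retryable failures into an exception, so the generic retry strategy never re-issues the request. The real code is ScyllaDB's C++ S3 client; every name below is invented for illustration:

```python
class BypassRetryError(Exception):
    """Raised by the response handler for *retryable* errors so the
    generic retry strategy never re-issues the ranged GET; duplicated
    chunks would corrupt the buffer queue. The download loop catches
    this and cleanly attempts the next range instead."""

def handle_response(status: int) -> str:
    if status in (200, 206):
        return "ok"
    # 5xx and throttling responses are normally retryable, but inside
    # the chunked download source we bypass the retry strategy.
    if status in (429, 500, 502, 503):
        raise BypassRetryError(f"bypassing retry for status {status}")
    # Unretryable errors (e.g. 403, 416) halt further requests entirely.
    raise RuntimeError(f"fatal S3 error {status}")
```

The point of the two exception types is that the caller treats them differently: `BypassRetryError` means "re-request the next range cleanly", while the fatal error propagates and stops the download.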

(cherry picked from commit d2d69cbc8c)
2025-07-13 13:17:14 +00:00
Ernest Zaslavsky
c841ffe398 s3_client: Enhance test coverage for retry logic
Extend the S3 proxy to support error injection when the client
makes multiple requests to the same resource—useful for testing
retry behavior and failure handling.

(cherry picked from commit c75acd274c)
2025-07-13 13:17:14 +00:00
Ernest Zaslavsky
c748a97170 s3_client: Add test for Content-Range fix
Introduce a test that accurately verifies the Content-Range
behavior, ensuring the previous fix is properly validated.

(cherry picked from commit ec59fcd5e4)
2025-07-13 13:17:14 +00:00
Ernest Zaslavsky
00f10e7f1d s3_client: Fix missing negation
Restore a missing `not` in a conditional check that caused
incorrect behavior during S3 client execution.

(cherry picked from commit 6d9cec558a)
2025-07-13 13:17:14 +00:00
Ernest Zaslavsky
4cd1792528 s3_client: Refine logging
Fix typo in log message to improve clarity and accuracy during
S3 operations.

(cherry picked from commit e73b83e039)
2025-07-13 13:17:14 +00:00
Ernest Zaslavsky
115e8c85e4 s3_client: Improve logging placement for current_range output
Relocated logging to occur after determining the `current_range`,
ensuring more relevant output during S3 client operations.

(cherry picked from commit f1d0690194)
2025-07-13 13:17:14 +00:00
Gleb Natapov
087d3bb957 api: unregister raft_topology_get_cmd_status on shutdown
In c8ce9d1c60 we introduced
raft_topology_get_cmd_status REST api but the commit forgot to
unregister the handler during shutdown.

Fixes #24910

Closes scylladb/scylladb#24911

(cherry picked from commit 89f2edf308)

Closes scylladb/scylladb#24923
2025-07-13 15:15:52 +03:00
Avi Kivity
f3297824e3 Revert "config: decrease default large allocation warning threshold to 128k"
This reverts commit 04fb2c026d. 2025.3 got
the reduced threshold, but won't get many of the fixes the warning will
generate, leaving it very noisy. Better to avoid the noise for this release.

Fixes #24384.
2025-07-10 14:12:14 +03:00
Avi Kivity
4eb220d3ab service: tablet_allocator: avoid large contiguous vector in make_repair_plan()
make_repair_plan() allocates a temporary vector which can grow larger
than our 128k basic allocation unit. Use a chunked vector to avoid
stalls due to large allocations.

Fixes #24713.

Closes scylladb/scylladb#24801

(cherry picked from commit 0138afa63b)

Closes scylladb/scylladb#24902
2025-07-10 12:41:35 +03:00
Patryk Jędrzejczak
c9de7d68f2 Merge '[Backport 2025.3] Make it easier to debug stuck raft topology operation.' from Scylladb[bot]
The series adds more logging and provides new REST api around topology command rpc execution to allow easier debugging of stuck topology operations.

Backport since we want to have in the production as quick as possible.

Fixes #24860

- (cherry picked from commit c8ce9d1c60)

- (cherry picked from commit 4e6369f35b)

Parent PR: #24799

Closes scylladb/scylladb#24881

* https://github.com/scylladb/scylladb:
  topology coordinator: log a start and an end of topology coordinator command execution at info level
  topology coordinator: add REST endpoint to query the status of ongoing topology cmd rpc
2025-07-09 12:55:48 +02:00
Piotr Dulikowski
b535f44db2 Merge '[Backport 2025.3] batchlog_manager: abort replay of a failed batch on shutdown or node down' from Scylladb[bot]
When replaying a failed batch and sending the mutation to all replicas, make the write response handler cancellable and abort it on shutdown or if some target is marked down. Also set a reasonable timeout so it gets aborted if it's stuck for some other unexpected reason.

Previously, the write response handler was not cancellable and had no timeout. This could cause a scenario where some write operation by the batchlog manager was stuck indefinitely, and node shutdown got stuck as well because it waited for the batchlog manager to complete, without aborting the operation.

backport to relevant versions since the issue can cause node shutdown to hang

Fixes scylladb/scylladb#24599

- (cherry picked from commit 8d48b27062)

- (cherry picked from commit fc5ba4a1ea)

- (cherry picked from commit 7150632cf2)

- (cherry picked from commit 74a3fa9671)

- (cherry picked from commit a9b476e057)

- (cherry picked from commit d7af26a437)

Parent PR: #24595

Closes scylladb/scylladb#24882

* github.com:scylladb/scylladb:
  test: test_batchlog_manager: batchlog replay includes cdc
  test: test_batchlog_manager: test batch replay when a node is down
  batchlog_manager: set timeout on writes
  batchlog_manager: abort writes on shutdown
  batchlog_manager: create cancellable write response handler
  storage_proxy: add write type parameter to mutate_internal
2025-07-08 12:35:55 +02:00
Michael Litvak
ec1dd1bf31 test: test_batchlog_manager: batchlog replay includes cdc
Add a new test that verifies that when replaying batch mutations from
the batchlog, the mutations include cdc augmentation if needed.

This is done in order to verify that it works currently as expected and
doesn't break in the future.

(cherry picked from commit d7af26a437)
2025-07-08 06:25:36 +00:00
Michael Litvak
7b30f487dd test: test_batchlog_manager: test batch replay when a node is down
Add a test of the batchlog manager replay loop applying failed batches
while some replica is down.

The test reproduces an issue where the batchlog manager tries to replay
a failed batch, doesn't get a response from some replica, and becomes
stuck.

It verifies that the batchlog manager can eventually recover from this
situation and continue applying failed batches.

(cherry picked from commit a9b476e057)
2025-07-08 06:25:36 +00:00
Michael Litvak
c3c489d3d4 batchlog_manager: set timeout on writes
Set a timeout on writes of replayed batches by the batchlog manager.

We want to avoid having infinite timeout for the writes in case it gets
stuck for some unexpected reason.

The timeout is set to be high enough to allow any reasonable write to
complete.

(cherry picked from commit 74a3fa9671)
2025-07-08 06:25:36 +00:00
Michael Litvak
6fb6bb8dc7 batchlog_manager: abort writes on shutdown
On shutdown of batchlog manager, abort all writes of replayed batches
by the batchlog manager.

To achieve this we set the appropriate write_type to BATCH, and on
shutdown cancel all write handlers with this type.
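The tag-and-cancel scheme above can be sketched with a small registry keyed by write type. This is a toy asyncio model, not ScyllaDB's storage_proxy; the class and method names are invented:

```python
import asyncio
from enum import Enum, auto

class WriteType(Enum):
    SIMPLE = auto()
    COUNTER = auto()
    BATCH = auto()

class ResponseHandlerRegistry:
    """Toy registry mirroring the shutdown behaviour described above:
    replayed-batch writes are tagged BATCH, and shutdown cancels every
    pending handler carrying that tag."""
    def __init__(self):
        self._handlers = {}
        self._next_id = 0

    def register(self, write_type: WriteType, fut: asyncio.Future) -> int:
        hid = self._next_id
        self._next_id += 1
        self._handlers[hid] = (write_type, fut)
        return hid

    def cancel_by_type(self, write_type: WriteType) -> int:
        cancelled = 0
        for wt, fut in self._handlers.values():
            if wt is write_type and not fut.done():
                fut.cancel()
                cancelled += 1
        return cancelled
```

On shutdown, `cancel_by_type(WriteType.BATCH)` aborts only the batchlog replay writes while leaving unrelated in-flight writes untouched.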

(cherry picked from commit 7150632cf2)
2025-07-08 06:25:36 +00:00
Michael Litvak
02c038efa8 batchlog_manager: create cancellable write response handler
When replaying a batch mutation from the batchlog manager and sending it
to all replicas, create the write response handler as cancellable.

To achieve this we define a new wrapper type for batchlog mutations -
batchlog_replay_mutation, and this allows us to overload
create_write_response_handler for this type. This is similar to how it's
done with hint_wrapper and read_repair_mutation.

(cherry picked from commit fc5ba4a1ea)
2025-07-08 06:25:36 +00:00
Michael Litvak
d3175671b7 storage_proxy: add write type parameter to mutate_internal
Currently mutate_internal has a boolean parameter `counter_write` that
indicates whether the write is of counter type or not.

We replace it with a more general parameter that allows to indicate the
write type.

It is compatible with the previous behavior - for a counter write, the
type COUNTER is passed, and otherwise a default value will be used
as before.

(cherry picked from commit 8d48b27062)
2025-07-08 06:25:36 +00:00
Gleb Natapov
4651c44747 topology coordinator: log a start and an end of topology coordinator command execution at info level
Those calls are relatively rare and the output may help analyze issues
in production.

(cherry picked from commit 4e6369f35b)
2025-07-08 06:24:22 +00:00
Gleb Natapov
0e67f6f6c2 topology coordinator: add REST endpoint to query the status of ongoing topology cmd rpc
The topology coordinator executes several topology cmd rpc against some nodes
during a topology change. A topology operation will not proceed unless
rpc completes (successfully or not), but sometimes it appears that it
hangs and it is hard to tell on which nodes it did not complete yet.
Introduce new REST endpoint that can help with debugging such cases.
If executed on the topology coordinator it returns currently running
topology rpc (if any) and a list of nodes that did not reply yet.

(cherry picked from commit c8ce9d1c60)
2025-07-08 06:24:21 +00:00
Avi Kivity
859d9dd3b1 Merge '[Backport 2025.3] Improve background disposal of tablet_metadata' from Scylladb[bot]
As seen in #23284, when the tablet_metadata contains many tables, even empty ones,
we're seeing a long queue of seastar tasks coming from the individual destruction of
`tablet_map_ptr = foreign_ptr<lw_shared_ptr<const tablet_map>>`.

This change improves `tablet_metadata::clear_gently` to destroy the `tablet_map_ptr` objects
on their owner shard by sorting them into per-owner-shard vectors.

Also, background call to clear_gently was added to `~token_metadata`, as it is destroyed
arbitrarily when automatic token_metadata_ptr variables go out of scope, so that the
contained tablet_metadata would be cleared gently.

Finally, a unit test was added to reproduce the `Too long queue accumulated for gossip` symptom
and verify that it is gone with this change.

Fixes #24814
Refs #23284

This change is not marked as fixing the issue since we still need to verify that there is no impact on query performance, reactor stalls, or large allocations, with a large number of tablet-based tables.

* Since the issue exists in 2025.1, requesting backport to 2025.1 and upwards

- (cherry picked from commit 3acca0aa63)

- (cherry picked from commit 493a2303da)

- (cherry picked from commit e0a19b981a)

- (cherry picked from commit 2b2cfaba6e)

- (cherry picked from commit 2c0bafb934)

- (cherry picked from commit 4a3d14a031)

- (cherry picked from commit 6e4803a750)

Parent PR: #24618

Closes scylladb/scylladb#24864

* github.com:scylladb/scylladb:
  token_metadata_impl: clear_gently: release version tracker early
  test: cluster: test_tablets_merge: add test_tablet_split_merge_with_many_tables
  token_metadata: clear_and_destroy_impl when destroyed
  token_metadata: keep a reference to shared_token_metadata
  token_metadata: move make_token_metadata_ptr into shared_token_metadata class
  replica: database: get and expose a mutable locator::shared_token_metadata
  locator: tablets: tablet_metadata: clear_gently: optimize foreign ptr destruction
2025-07-07 14:02:19 +03:00
Gleb Natapov
a25bd068bf topology coordinator: do not set request_type field for truncation command if topology_global_request_queue feature is not enabled yet
Old nodes do not expect global topology request names to be in
request_type field, so set it only if a cluster is fully upgraded
already.

Closes scylladb/scylladb#24731

(cherry picked from commit ca7837550d)

Closes scylladb/scylladb#24833
2025-07-07 11:50:55 +02:00
Benny Halevy
9bc487e79e token_metadata_impl: clear_gently: release version tracker early
No need to wait for all members to be cleared gently.
We can release the version earlier since the
held version may be awaited for in barriers.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 6e4803a750)
2025-07-07 09:42:29 +03:00
Benny Halevy
41dc86ffa8 test: cluster: test_tablets_merge: add test_tablet_split_merge_with_many_tables
Reproduces #23284

Currently skipped in release mode since it requires
the `short_tablet_stats_refresh_interval` interval.
Ref #24641

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 4a3d14a031)
2025-07-07 09:42:26 +03:00
Benny Halevy
f78a352a29 token_metadata: clear_and_destroy_impl when destroyed
We have a lot of places in the code where
a token_metadata_ptr is kept in an automatic
variable and destroyed when it leaves the scope.
Since it's a reference-counted lw_shared_ptr,
the token_metadata object is rarely destroyed in
those cases, but when it is, it doesn't go through
clear_gently, and in particular its tablet_metadata
is not cleared gently, leading to inefficient destruction
of potentially many foreign_ptr:s.

This patch calls clear_and_destroy_impl that gently
clears and destroys the impl object in the background
using the shared_token_metadata.

Fixes #13381

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 2c0bafb934)
2025-07-07 09:38:17 +03:00
Benny Halevy
b647dbd547 token_metadata: keep a reference to shared_token_metadata
To be used by a following patch to gently clear and destroy
the token_metadata_impl in the background.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 2b2cfaba6e)
2025-07-07 09:34:10 +03:00
Benny Halevy
0e7d3b4eb9 token_metadata: move make_token_metadata_ptr into shared_token_metadata class
So we can use the local shared_token_metadata instance
for safe background destroy of token_metadata_impl:s.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit e0a19b981a)
2025-07-07 09:30:01 +03:00
Benny Halevy
c8043e05c1 replica: database: get and expose a mutable locator::shared_token_metadata
Prepare for the next patch, which will use this shared_token_metadata
to make mutable_token_metadata_ptr:s.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 493a2303da)
2025-07-07 09:27:06 +03:00
Benny Halevy
54fb9ed03b locator: tablets: tablet_metadata: clear_gently: optimize foreign ptr destruction
Sort all tablet_map_ptr:s by shard_id
and then destroy them on each shard to prevent
long cross-shard task queues for foreign_ptr destructions.
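The bucketing step above can be sketched in a few lines. This is a toy Python model of the idea (the real code destroys C++ `foreign_ptr`s via seastar cross-shard tasks); the function name and pair representation are invented:

```python
from collections import defaultdict

def group_by_owner_shard(tablet_maps):
    """Bucket foreign pointers by their owner shard so each shard can
    destroy its own batch in one task, instead of queueing one
    cross-shard task per pointer.

    `tablet_maps` is an iterable of (owner_shard, obj) pairs.
    """
    buckets = defaultdict(list)
    for shard, obj in tablet_maps:
        buckets[shard].append(obj)
    # A real implementation would now submit one task per shard that
    # destroys buckets[shard] locally; cross-shard messages drop from
    # O(number of tablet maps) to O(number of shards).
    return buckets
```

This is what eliminates the long cross-shard task queue: the number of inter-shard messages becomes proportional to the shard count, not the table count.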

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 3acca0aa63)
2025-07-07 09:27:01 +03:00
Avi Kivity
f60c54df77 storage_proxy: avoid large allocation when storing batch in system.batchlog
Currently, when computing the mutation to be stored in system.batchlog,
we go through data_value. In turn this goes through `bytes` type
(#24810), so it causes a large contiguous allocation if the batch is
large.

Fix by going through the more primitive, but less contiguous,
atomic_cell API.
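The shape of the fix can be illustrated by the contrast between one contiguous buffer and fixed-size fragments (the atomic_cell path assembles the cell from fragments rather than one large `bytes` value). The sketch below is a generic Python illustration; the chunk size matches the 128 KiB allocation-warning threshold mentioned elsewhere in this log, but is otherwise arbitrary:

```python
CHUNK = 128 * 1024  # 128 KiB: keep every allocation under this threshold

def fragment(payload: bytes, chunk_size: int = CHUNK):
    """Split a large payload into fragments no larger than chunk_size,
    so no single allocation exceeds the threshold. Concatenating the
    fragments reproduces the original payload."""
    return [payload[i:i + chunk_size] for i in range(0, len(payload), chunk_size)]
```

A 300 KiB batch thus becomes three small allocations instead of one large contiguous one.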

Fixes #24809.

Closes scylladb/scylladb#24811

(cherry picked from commit 60f407bff4)

Closes scylladb/scylladb#24846
2025-07-05 00:37:09 +03:00
Patryk Jędrzejczak
f1ec51133e docs: handling-node-failures: fix typo
Replacing "from" is incorrect. The typo comes from recently
merged #24583.

Fixes #24732

Requires backport to 2025.2 since #24583 has been backported to 2025.2.

Closes scylladb/scylladb#24733

(cherry picked from commit fa982f5579)

Closes scylladb/scylladb#24832
2025-07-04 19:35:00 +02:00
Jenkins Promoter
648fe6a4e8 Update ScyllaDB version to: 2025.3.0-rc1 2025-07-03 11:35:01 +03:00
Michał Chojnowski
1bd536a228 utils/alien_worker: fix a data race in submit()
We move a `seastar::promise` on the external worker thread,
after the matching `seastar::future` was returned to the shard.

That's illegal. If the `promise` move occurs concurrently with some
operation (move, await) on the `future`, it becomes a data race
which could cause various kinds of corruption.

This patch fixes that by keeping the promise at a stable address
on the shard (inside a coroutine frame) and only passing a reference
to it through the worker.

Fixes #24751

Closes scylladb/scylladb#24752

(cherry picked from commit a29724479a)

Closes scylladb/scylladb#24780
2025-07-03 10:45:51 +03:00
Avi Kivity
d5b11098e8 repair: row_level: unstall to_repair_rows_on_wire() destroying its input
to_repair_rows_on_wire() moves the contents of its input std::list
and is careful to yield after each element, but the final destruction
of the input list still deals with all of the list elements without
yielding. This is expensive as not all contents of repair_row are moved
(_dk_with_hash is of type lw_shared_ptr<const decorated_key_with_hash>).

To fix, destroy each row element as we move along. This is safe as we
own the input and don't reference row_list other than for the iteration.
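The destroy-as-you-go idea can be modeled with a queue that is drained element by element. This is a toy Python sketch (the real code frees C++ list nodes holding `lw_shared_ptr`s); the function name and on-wire representation are invented:

```python
from collections import deque

def to_rows_on_wire(rows: deque):
    """Toy model of the fix above: pop each input row as it is
    converted, so whatever shared state it holds is released
    incrementally instead of all at once when the whole input
    list is finally destroyed."""
    out = []
    while rows:
        row = rows.popleft()        # destroy each element as we move along
        out.append(("on-wire", row))
        # a real coroutine would also yield to the reactor here
    return out
```

After the call the input is empty, so there is no deferred bulk destruction left to stall the reactor.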

Fixes #24725.

Closes scylladb/scylladb#24726

(cherry picked from commit 6aa71205d8)

Closes scylladb/scylladb#24771
2025-07-03 10:44:58 +03:00
Tomasz Grabiec
775916132e Merge '[Backport 2025.3] repair: postpone repair until topology is not busy ' from Scylladb[bot]
Currently, repair_service::repair_tablets starts repair if there
is no ongoing tablet operations. The check does not consider global
topology operations, like tablet resize finalization.

Hence, if:
- topology is in the tablet_resize_finalization state;
- repair starts (as there is no tablet transitions) and holds the erm;
- resize finalization finishes;

then the repair sees a topology state different from the actual one -
it does not see that the storage groups were already split.
The repair code does not handle this case, which results in
on_internal_error.

Start repair only when the topology is not busy. The check isn't atomic,
as it's done on shard 0. Thus, we compare the topology versions
to ensure that the busyness check is still valid.
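The version comparison can be sketched as a guard; this is a hypothetical Python model with invented names, not ScyllaDB's repair_service code:

```python
def can_start_repair(observed_version: int, current_version: int,
                     topology_busy: bool) -> bool:
    """Guard for the non-atomic busyness check described above.

    The busy check runs on shard 0; by the time its answer arrives,
    topology may have moved on. Comparing the topology version captured
    alongside the check against the current one detects that race: if
    the versions differ, the check is stale and repair must re-check.
    """
    if topology_busy:
        return False
    return observed_version == current_version
```

A stale "not busy" answer (versions differ) is treated exactly like a busy topology: repair is postponed and the check is repeated.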

Fixes: https://github.com/scylladb/scylladb/issues/24195.

Needs backport to all branches since they are affected

- (cherry picked from commit df152d9824)

- (cherry picked from commit 83c9af9670)

Parent PR: #24202

Closes scylladb/scylladb#24783

* github.com:scylladb/scylladb:
  test: add test for repair and resize finalization
  repair: postpone repair until topology is not busy
2025-07-02 13:17:08 +02:00
Calle Wilund
46e3794bde encryption_at_rest_test: Add exception handler to ensure proxy stop
If the boost test throws even inside a test macro such as
BOOST_REQUIRE_THROW, we could end up not stopping the net proxy
used, causing a use-after-free.

(cherry picked from commit 8d37e5e24b)
2025-07-02 10:13:08 +00:00
Calle Wilund
b7a82898f0 encryption: Ensure stopping timers in provider cache objects
utils::loading_cache has a timer that can, if we're unlucky, still be
running while the encryption context/extensions referencing the various
host objects containing them are destroyed during unit testing.

Add a stop phase in encryption context shutdown closing the caches.

(cherry picked from commit ee98f5d361)
2025-07-02 10:13:08 +00:00
Jenkins Promoter
76bf279e0e Update pgo profiles - aarch64 2025-07-02 13:06:18 +03:00
Jenkins Promoter
61364624e3 Update pgo profiles - x86_64 2025-07-02 12:34:58 +03:00
Botond Dénes
6e6c00dcfe docs: cql/types.rst: remove reference to frozen-only UDTs
ScyllaDB supports non-frozen UDTs since 3.2, no need to keep referencing
this limitation in the current docs. Replace the description of the
limitation with general description of frozen semantics for UDTs.

Fixes: #22929

Closes scylladb/scylladb#24763

(cherry picked from commit 37ef9efb4e)

Closes scylladb/scylladb#24784
2025-07-02 12:11:25 +03:00
Aleksandra Martyniuk
c26eb8ef14 test: add test for repair and resize finalization
Add test that checks whether repair does not start if there is an
ongoing resize finalization.

(cherry picked from commit 83c9af9670)
2025-07-01 20:26:53 +00:00
Aleksandra Martyniuk
8a1d09862e repair: postpone repair until topology is not busy
Currently, repair_service::repair_tablets starts repair if there
is no ongoing tablet operations. The check does not consider global
topology operations, like tablet resize finalization. This may cause
a data race and unexpected behavior.

Start repair when topology is not busy.

(cherry picked from commit df152d9824)
2025-07-01 20:26:53 +00:00
Yaron Kaikov
e64bb3819c Update ScyllaDB version to: 2025.3.0-rc0 2025-07-01 10:34:39 +03:00
207 changed files with 5282 additions and 1624 deletions

View File

@@ -112,10 +112,15 @@ def backport(repo, pr, version, commits, backport_base_branch, is_collaborator):
             is_draft = True
         repo_local.git.add(A=True)
         repo_local.git.cherry_pick('--continue')
-        repo_local.git.push(fork_repo, new_branch_name, force=True)
-        create_pull_request(repo, new_branch_name, backport_base_branch, pr, backport_pr_title, commits,
-                            is_draft, is_collaborator)
+        # Check if the branch already exists in the remote fork
+        remote_refs = repo_local.git.ls_remote('--heads', fork_repo, new_branch_name)
+        if not remote_refs:
+            # Branch does not exist, create it with a regular push
+            repo_local.git.push(fork_repo, new_branch_name)
+            create_pull_request(repo, new_branch_name, backport_base_branch, pr, backport_pr_title, commits,
+                                is_draft, is_collaborator)
+        else:
+            logging.info(f"Remote branch {new_branch_name} already exists in fork. Skipping push.")
     except GitCommandError as e:
         logging.warning(f"GitCommandError: {e}")

View File

@@ -78,7 +78,7 @@ fi
 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=2025.3.0-dev
+VERSION=2025.3.1
 if test -f version
 then

View File

@@ -38,7 +38,6 @@
 #include <optional>
 #include "utils/assert.hh"
 #include "utils/overloaded_functor.hh"
-#include <seastar/json/json_elements.hh>
 #include "collection_mutation.hh"
 #include "schema/schema.hh"
 #include "db/tags/extension.hh"
@@ -121,47 +120,50 @@ static lw_shared_ptr<stats> get_stats_from_schema(service::storage_proxy& sp, co
     }
 }
-make_jsonable::make_jsonable(rjson::value&& value)
-    : _value(std::move(value))
-{}
-std::string make_jsonable::to_json() const {
-    return rjson::print(_value);
-}
-json::json_return_type make_streamed(rjson::value&& value) {
-    // CMH. json::json_return_type uses std::function, not noncopyable_function.
-    // Need to make a copyable version of value. Gah.
-    auto rs = make_shared<rjson::value>(std::move(value));
-    std::function<future<>(output_stream<char>&&)> func = [rs](output_stream<char>&& os) mutable -> future<> {
-        // move objects to coroutine frame.
-        auto los = std::move(os);
-        auto lrs = std::move(rs);
+executor::body_writer make_streamed(rjson::value&& value) {
+    return [value = std::move(value)](output_stream<char>&& _out) mutable -> future<> {
+        auto out = std::move(_out);
         std::exception_ptr ex;
         try {
-            co_await rjson::print(*lrs, los);
+            co_await rjson::print(value, out);
         } catch (...) {
             // at this point, we cannot really do anything. HTTP headers and return code are
             // already written, and quite potentially a portion of the content data.
             // just log + rethrow. It is probably better the HTTP server closes connection
             // abruptly or something...
             ex = std::current_exception();
             elogger.error("Exception during streaming HTTP response: {}", ex);
         }
-        co_await los.close();
-        co_await rjson::destroy_gently(std::move(*lrs));
+        co_await out.close();
+        co_await rjson::destroy_gently(std::move(value));
         if (ex) {
             co_await coroutine::return_exception_ptr(std::move(ex));
         }
         co_return;
     };
-    return func;
 }
-json_string::json_string(std::string&& value)
-    : _value(std::move(value))
-{}
-std::string json_string::to_json() const {
-    return _value;
-}
+// make_streamed_with_extra_array() is variant of make_streamed() above, which
+// builds a streaming response (a function writing to an output stream) from a
+// JSON object (rjson::value) but adds to it at the end an additional array.
+// The extra array is given a separate chunked_vector to avoid putting it
+// inside the rjson::value - because RapidJSON does contiguous allocations for
+// arrays which we want to avoid for potentially long arrays in Query/Scan
+// responses (see #23535).
+// If we ever fix RapidJSON to avoid contiguous allocations for arrays, or
+// replace it entirely (#24458), we can remove this function and the function
+// rjson::print_with_extra_array() which it calls.
+executor::body_writer make_streamed_with_extra_array(rjson::value&& value,
+        std::string array_name, utils::chunked_vector<rjson::value>&& array) {
+    return [value = std::move(value), array_name = std::move(array_name), array = std::move(array)](output_stream<char>&& _out) mutable -> future<> {
+        auto out = std::move(_out);
+        std::exception_ptr ex;
+        try {
+            co_await rjson::print_with_extra_array(value, array_name, array, out);
+        } catch (...) {
+            ex = std::current_exception();
+        }
+        co_await out.close();
+        co_await rjson::destroy_gently(std::move(value));
+        // TODO: can/should we also destroy the array gently?
+        if (ex) {
+            co_await coroutine::return_exception_ptr(std::move(ex));
+        }
+    };
+}
 // This function throws api_error::validation if input value is not an object.
@@ -764,7 +766,7 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
     rjson::value response = rjson::empty_object();
     rjson::add(response, "Table", std::move(table_description));
     elogger.trace("returning {}", response);
-    co_return make_jsonable(std::move(response));
+    co_return rjson::print(std::move(response));
 }
 // Check CQL's Role-Based Access Control (RBAC) permission_to_check (MODIFY,
@@ -881,7 +883,7 @@ future<executor::request_return_type> executor::delete_table(client_state& clien
     rjson::value response = rjson::empty_object();
     rjson::add(response, "TableDescription", std::move(table_description));
     elogger.trace("returning {}", response);
-    co_return make_jsonable(std::move(response));
+    co_return rjson::print(std::move(response));
 }
 static data_type parse_key_type(std::string_view type) {
@@ -1165,7 +1167,7 @@ future<executor::request_return_type> executor::tag_resource(client_state& clien
     co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [tags](std::map<sstring, sstring>& tags_map) {
         update_tags_map(*tags, tags_map, update_tags_action::add_tags);
     });
-    co_return json_string("");
+    co_return ""; // empty response
 }
 future<executor::request_return_type> executor::untag_resource(client_state& client_state, service_permit permit, rjson::value request) {
@@ -1186,7 +1188,7 @@ future<executor::request_return_type> executor::untag_resource(client_state& cli
     co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [tags](std::map<sstring, sstring>& tags_map) {
         update_tags_map(*tags, tags_map, update_tags_action::delete_tags);
     });
-    co_return json_string("");
+    co_return ""; // empty response
 }
 future<executor::request_return_type> executor::list_tags_of_resource(client_state& client_state, service_permit permit, rjson::value request) {
@@ -1212,7 +1214,7 @@ future<executor::request_return_type> executor::list_tags_of_resource(client_sta
         rjson::push_back(tags, std::move(new_entry));
     }
-    return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
+    return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
 }
 struct billing_mode_type {
@@ -1674,7 +1676,7 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
     rjson::value status = rjson::empty_object();
     executor::supplement_table_info(request, *schema, sp);
     rjson::add(status, "TableDescription", std::move(request));
-    co_return make_jsonable(std::move(status));
+    co_return rjson::print(std::move(status));
 }
 future<executor::request_return_type> executor::create_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
@@ -1951,7 +1953,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
         rjson::value status = rjson::empty_object();
         supplement_table_info(request, *schema, p.local());
         rjson::add(status, "TableDescription", std::move(request));
-        co_return make_jsonable(std::move(status));
+        co_return rjson::print(std::move(status));
     });
 }
@@ -2417,7 +2419,7 @@ static future<executor::request_return_type> rmw_operation_return(rjson::value&&
     if (!attributes.IsNull()) {
         rjson::add(ret, "Attributes", std::move(attributes));
     }
-    return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
+    return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
 }
 static future<std::unique_ptr<rjson::value>> get_previous_item(
@@ -3009,7 +3011,7 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
         rjson::add(ret, "ConsumedCapacity", std::move(consumed_capacity));
     }
     _stats.api_operations.batch_write_item_latency.mark(std::chrono::steady_clock::now() - start_time);
-    co_return make_jsonable(std::move(ret));
+    co_return rjson::print(std::move(ret));
 }
 static const std::string_view get_item_type_string(const rjson::value& v) {
@@ -4249,18 +4251,17 @@ future<executor::request_return_type> executor::get_item(client_state& client_st
     verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "GetItem");
     rcu_consumed_capacity_counter add_capacity(request, cl == db::consistency_level::LOCAL_QUORUM);
     co_await verify_permission(_enforce_authorization, client_state, schema, auth::permission::SELECT);
-    co_return co_await _proxy.query(schema, std::move(command), std::move(partition_ranges), cl,
-            service::storage_proxy::coordinator_query_options(executor::default_timeout(), std::move(permit), client_state, trace_state)).then(
-            [per_table_stats, this, schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = std::move(attrs_to_get), start_time = std::move(start_time), add_capacity=std::move(add_capacity)] (service::storage_proxy::coordinator_query_result qr) mutable {
-        per_table_stats->api_operations.get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
-        _stats.api_operations.get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
-        uint64_t rcu_half_units = 0;
-        auto res = make_ready_future<executor::request_return_type>(make_jsonable(describe_item(schema, partition_slice, *selection, *qr.query_result, std::move(attrs_to_get), add_capacity, rcu_half_units)));
-        per_table_stats->rcu_half_units_total += rcu_half_units;
-        _stats.rcu_half_units_total += rcu_half_units;
-        return res;
-    });
+    service::storage_proxy::coordinator_query_result qr =
+        co_await _proxy.query(
+            schema, std::move(command), std::move(partition_ranges), cl,
+            service::storage_proxy::coordinator_query_options(executor::default_timeout(), std::move(permit), client_state, trace_state));
+    per_table_stats->api_operations.get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
+    _stats.api_operations.get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
+    uint64_t rcu_half_units = 0;
+    rjson::value res = describe_item(schema, partition_slice, *selection, *qr.query_result, std::move(attrs_to_get), add_capacity, rcu_half_units);
+    per_table_stats->rcu_half_units_total += rcu_half_units;
+    _stats.rcu_half_units_total += rcu_half_units;
+    co_return rjson::print(std::move(res));
 }
 static void check_big_object(const rjson::value& val, int& size_left);
@@ -4505,7 +4506,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
if (is_big(response)) {
co_return make_streamed(std::move(response));
} else {
co_return make_jsonable(std::move(response));
co_return rjson::print(std::move(response));
}
}
@@ -4649,7 +4650,11 @@ class describe_items_visitor {
const filter& _filter;
typename columns_t::const_iterator _column_it;
rjson::value _item;
rjson::value _items;
// _items is a chunked_vector<rjson::value> instead of a RapidJson array
// (rjson::value) because unfortunately RapidJson arrays are stored
// contiguously in memory, and cause large allocations when a Query/Scan
// returns a long list of short items (issue #23535).
utils::chunked_vector<rjson::value> _items;
size_t _scanned_count;
public:
@@ -4659,7 +4664,6 @@ public:
, _filter(filter)
, _column_it(columns.begin())
, _item(rjson::empty_object())
, _items(rjson::empty_array())
, _scanned_count(0)
{
// _filter.check() may need additional attributes not listed in
@@ -4738,13 +4742,13 @@ public:
rjson::remove_member(_item, attr);
}
rjson::push_back(_items, std::move(_item));
_items.push_back(std::move(_item));
}
_item = rjson::empty_object();
++_scanned_count;
}
rjson::value get_items() && {
utils::chunked_vector<rjson::value> get_items() && {
return std::move(_items);
}
@@ -4753,13 +4757,25 @@ public:
}
};
static future<std::tuple<rjson::value, size_t>> describe_items(const cql3::selection::selection& selection, std::unique_ptr<cql3::result_set> result_set, std::optional<attrs_to_get>&& attrs_to_get, filter&& filter) {
// describe_items() returns a JSON object that includes members "Count"
// and "ScannedCount", but *not* "Items" - that is returned separately
// as a chunked_vector to avoid large contiguous allocations which
// RapidJSON does of its array. The caller should add "Items" to the
// returned JSON object if needed, or print it separately.
// The returned chunked_vector (the items) is std::optional<>, because
// the user may have requested only to count items, and not return any
// items - which is different from returning an empty list of items.
static future<std::tuple<rjson::value, std::optional<utils::chunked_vector<rjson::value>>, size_t>> describe_items(
const cql3::selection::selection& selection,
std::unique_ptr<cql3::result_set> result_set,
std::optional<attrs_to_get>&& attrs_to_get,
filter&& filter) {
describe_items_visitor visitor(selection.get_columns(), attrs_to_get, filter);
co_await result_set->visit_gently(visitor);
auto scanned_count = visitor.get_scanned_count();
rjson::value items = std::move(visitor).get_items();
utils::chunked_vector<rjson::value> items = std::move(visitor).get_items();
rjson::value items_descr = rjson::empty_object();
auto size = items.Size();
auto size = items.size();
rjson::add(items_descr, "Count", rjson::value(size));
rjson::add(items_descr, "ScannedCount", rjson::value(scanned_count));
// If attrs_to_get && attrs_to_get->empty(), this means the user asked not
@@ -4769,10 +4785,11 @@ static future<std::tuple<rjson::value, size_t>> describe_items(const cql3::selec
// In that case, we currently build a list of empty items and here drop
// it. We could just count the items and not bother with the empty items.
// (However, remember that when we do have a filter, we need the items).
std::optional<utils::chunked_vector<rjson::value>> opt_items;
if (!attrs_to_get || !attrs_to_get->empty()) {
rjson::add(items_descr, "Items", std::move(items));
opt_items = std::move(items);
}
co_return std::tuple<rjson::value, size_t>{std::move(items_descr), size};
co_return std::tuple(std::move(items_descr), std::move(opt_items), size);
}
static rjson::value encode_paging_state(const schema& schema, const service::pager::paging_state& paging_state) {
@@ -4810,6 +4827,12 @@ static rjson::value encode_paging_state(const schema& schema, const service::pag
return last_evaluated_key;
}
// RapidJSON allocates arrays contiguously in memory, so we want to avoid
// returning a large number of items as a single rapidjson array, and use
// a chunked_vector instead. The following constant is an arbitrary cutoff
// point for when to switch from a rapidjson array to a chunked_vector.
static constexpr int max_items_for_rapidjson_array = 256;
static future<executor::request_return_type> do_query(service::storage_proxy& proxy,
schema_ptr table_schema,
const rjson::value* exclusive_start_key,
@@ -4882,19 +4905,35 @@ static future<executor::request_return_type> do_query(service::storage_proxy& pr
}
auto paging_state = rs->get_metadata().paging_state();
bool has_filter = filter;
auto [items, size] = co_await describe_items(*selection, std::move(rs), std::move(attrs_to_get), std::move(filter));
auto [items_descr, opt_items, size] = co_await describe_items(*selection, std::move(rs), std::move(attrs_to_get), std::move(filter));
if (paging_state) {
rjson::add(items, "LastEvaluatedKey", encode_paging_state(*table_schema, *paging_state));
rjson::add(items_descr, "LastEvaluatedKey", encode_paging_state(*table_schema, *paging_state));
}
if (has_filter){
cql_stats.filtered_rows_read_total += p->stats().rows_read_total;
// update our "filtered_rows_matched_total" for all the rows matched, despite the filter
cql_stats.filtered_rows_matched_total += size;
}
if (is_big(items)) {
co_return executor::request_return_type(make_streamed(std::move(items)));
if (opt_items) {
if (opt_items->size() >= max_items_for_rapidjson_array) {
// There are many items, better print the JSON and the array of
// items (opt_items) separately to avoid RapidJSON's contiguous
// allocation of arrays.
co_return make_streamed_with_extra_array(std::move(items_descr), "Items", std::move(*opt_items));
}
// There aren't many items in the chunked vector opt_items,
// let's just insert them into the JSON object and print the
// full JSON normally.
rjson::value items_json = rjson::empty_array();
for (auto& item : *opt_items) {
rjson::push_back(items_json, std::move(item));
}
rjson::add(items_descr, "Items", std::move(items_json));
}
co_return executor::request_return_type(make_jsonable(std::move(items)));
if (is_big(items_descr)) {
co_return make_streamed(std::move(items_descr));
}
co_return rjson::print(std::move(items_descr));
}
static dht::token token_for_segment(int segment, int total_segments) {
@@ -5489,7 +5528,7 @@ future<executor::request_return_type> executor::list_tables(client_state& client
std::string exclusive_start = exclusive_start_json ? exclusive_start_json->GetString() : "";
int limit = limit_json ? limit_json->GetInt() : 100;
if (limit < 1 || limit > 100) {
return make_ready_future<request_return_type>(api_error::validation("Limit must be greater than 0 and no greater than 100"));
co_return api_error::validation("Limit must be greater than 0 and no greater than 100");
}
auto tables = _proxy.data_dictionary().get_tables(); // hold on to temporary, table_names isn't a container, it's a view
@@ -5531,7 +5570,7 @@ future<executor::request_return_type> executor::list_tables(client_state& client
rjson::add(response, "LastEvaluatedTableName", rjson::copy(last_table_name));
}
return make_ready_future<executor::request_return_type>(make_jsonable(std::move(response)));
co_return rjson::print(std::move(response));
}
future<executor::request_return_type> executor::describe_endpoints(client_state& client_state, service_permit permit, rjson::value request, std::string host_header) {
@@ -5542,8 +5581,8 @@ future<executor::request_return_type> executor::describe_endpoints(client_state&
if (!override.empty()) {
if (override == "disabled") {
_stats.unsupported_operations++;
return make_ready_future<request_return_type>(api_error::unknown_operation(
"DescribeEndpoints disabled by configuration (alternator_describe_endpoints=disabled)"));
co_return api_error::unknown_operation(
"DescribeEndpoints disabled by configuration (alternator_describe_endpoints=disabled)");
}
host_header = std::move(override);
}
@@ -5555,13 +5594,13 @@ future<executor::request_return_type> executor::describe_endpoints(client_state&
// A "Host:" header includes both host name and port, exactly what we need
// to return.
if (host_header.empty()) {
return make_ready_future<request_return_type>(api_error::validation("DescribeEndpoints needs a 'Host:' header in request"));
co_return api_error::validation("DescribeEndpoints needs a 'Host:' header in request");
}
rjson::add(response, "Endpoints", rjson::empty_array());
rjson::push_back(response["Endpoints"], rjson::empty_object());
rjson::add(response["Endpoints"][0], "Address", rjson::from_string(host_header));
rjson::add(response["Endpoints"][0], "CachePeriodInMinutes", rjson::value(1440));
return make_ready_future<executor::request_return_type>(make_jsonable(std::move(response)));
co_return rjson::print(std::move(response));
}
static std::map<sstring, sstring> get_network_topology_options(service::storage_proxy& sp, gms::gossiper& gossiper, int rf) {
@@ -5596,7 +5635,7 @@ future<executor::request_return_type> executor::describe_continuous_backups(clie
rjson::add(desc, "PointInTimeRecoveryDescription", std::move(pitr));
rjson::value response = rjson::empty_object();
rjson::add(response, "ContinuousBackupsDescription", std::move(desc));
co_return make_jsonable(std::move(response));
co_return rjson::print(std::move(response));
}
// Create the metadata for the keyspace in which we put the alternator

View File

@@ -10,8 +10,8 @@
#include <seastar/core/future.hh>
#include "seastarx.hh"
#include <seastar/json/json_elements.hh>
#include <seastar/core/sharded.hh>
#include <seastar/util/noncopyable_function.hh>
#include "service/migration_manager.hh"
#include "service/client_state.hh"
@@ -58,29 +58,6 @@ namespace alternator {
class rmw_operation;
struct make_jsonable : public json::jsonable {
rjson::value _value;
public:
explicit make_jsonable(rjson::value&& value);
std::string to_json() const override;
};
/**
* Make return type for serializing the object "streamed",
* i.e. direct to HTTP output stream. Note: only useful for
* (very) large objects as there are overhead issues with this
* as well, but for massive lists of return objects this can
* help avoid large allocations/many re-allocs
*/
json::json_return_type make_streamed(rjson::value&&);
struct json_string : public json::jsonable {
std::string _value;
public:
explicit json_string(std::string&& value);
std::string to_json() const override;
};
namespace parsed {
class path;
};
@@ -169,7 +146,19 @@ class executor : public peering_sharded_service<executor> {
public:
using client_state = service::client_state;
using request_return_type = std::variant<json::json_return_type, api_error>;
// request_return_type is the return type of the executor methods, which
// can be one of:
// 1. A string, which is the response body for the request.
// 2. A body_writer, an asynchronous function (returning future<>) that
// takes an output_stream and writes the response body into it.
// 3. An api_error, which is an error response that should be returned to
// the client.
// The body_writer is used for streaming responses, where the response body
// is written in chunks to the output_stream. This allows for efficient
// handling of large responses without needing to allocate a large buffer
// in memory.
using body_writer = noncopyable_function<future<>(output_stream<char>&&)>;
using request_return_type = std::variant<std::string, body_writer, api_error>;
stats _stats;
// The metric_groups object holds this stat object's metrics registered
// as long as the stats object is alive.
@@ -275,4 +264,13 @@ bool is_big(const rjson::value& val, int big_size = 100'000);
// appropriate user-readable api_error::access_denied is thrown.
future<> verify_permission(bool enforce_authorization, const service::client_state&, const schema_ptr&, auth::permission);
/**
* Make return type for serializing the object "streamed",
* i.e. direct to HTTP output stream. Note: only useful for
* (very) large objects as there are overhead issues with this
* as well, but for massive lists of return objects this can
* help avoid large allocations/many re-allocs
*/
executor::body_writer make_streamed(rjson::value&&);
}

View File

@@ -13,7 +13,6 @@
#include <seastar/http/function_handlers.hh>
#include <seastar/http/short_streams.hh>
#include <seastar/core/coroutine.hh>
#include <seastar/json/json_elements.hh>
#include <seastar/util/defer.hh>
#include <seastar/util/short_streams.hh>
#include "seastarx.hh"
@@ -124,22 +123,22 @@ public:
}
auto res = resf.get();
std::visit(overloaded_functor {
[&] (const json::json_return_type& json_return_value) {
slogger.trace("api_handler success case");
if (json_return_value._body_writer) {
// Unfortunately, write_body() forces us to choose
// from a fixed and irrelevant list of "mime-types"
// at this point. But we'll override it with the
// one (application/x-amz-json-1.0) below.
rep->write_body("json", std::move(json_return_value._body_writer));
} else {
rep->_content += json_return_value._res;
}
},
[&] (const api_error& err) {
generate_error_reply(*rep, err);
}
}, res);
[&] (std::string&& str) {
// Note that despite the move, there is a copy here -
// as str is std::string and rep->_content is sstring.
rep->_content = std::move(str);
},
[&] (executor::body_writer&& body_writer) {
// Unfortunately, write_body() forces us to choose
// from a fixed and irrelevant list of "mime-types"
// at this point. But we'll override it with the
// correct one (application/x-amz-json-1.0) below.
rep->write_body("json", std::move(body_writer));
},
[&] (const api_error& err) {
generate_error_reply(*rep, err);
}
}, std::move(res));
return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
});

View File

@@ -217,7 +217,7 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
rjson::add(ret, "LastEvaluatedStreamArn", *last);
}
return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
}
struct shard_id {
@@ -491,7 +491,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
if (!opts.enabled()) {
rjson::add(ret, "StreamDescription", std::move(stream_desc));
return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
}
// TODO: label
@@ -617,7 +617,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
rjson::add(stream_desc, "Shards", std::move(shards));
rjson::add(ret, "StreamDescription", std::move(stream_desc));
return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
});
}
@@ -770,7 +770,7 @@ future<executor::request_return_type> executor::get_shard_iterator(client_state&
auto ret = rjson::empty_object();
rjson::add(ret, "ShardIterator", iter);
return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
}
struct event_id {
@@ -1021,7 +1021,7 @@ future<executor::request_return_type> executor::get_records(client_state& client
// will notice the end of the shard and not return NextShardIterator.
rjson::add(ret, "NextShardIterator", next_iter);
_stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
}
// ugh. figure out if we are at an end-of-shard
@@ -1047,7 +1047,7 @@ future<executor::request_return_type> executor::get_records(client_state& client
if (is_big(ret)) {
return make_ready_future<executor::request_return_type>(make_streamed(std::move(ret)));
}
return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
});
});
}

View File

@@ -118,7 +118,7 @@ future<executor::request_return_type> executor::update_time_to_live(client_state
// basically identical to the request's
rjson::value response = rjson::empty_object();
rjson::add(response, "TimeToLiveSpecification", std::move(*spec));
co_return make_jsonable(std::move(response));
co_return rjson::print(std::move(response));
}
future<executor::request_return_type> executor::describe_time_to_live(client_state& client_state, service_permit permit, rjson::value request) {
@@ -135,7 +135,7 @@ future<executor::request_return_type> executor::describe_time_to_live(client_sta
}
rjson::value response = rjson::empty_object();
rjson::add(response, "TimeToLiveDescription", std::move(desc));
co_return make_jsonable(std::move(response));
co_return rjson::print(std::move(response));
}
// expiration_service is a sharded service responsible for cleaning up expired

View File

@@ -3161,6 +3161,22 @@
]
}
]
},
{
"path":"/storage_service/raft_topology/cmd_rpc_status",
"operations":[
{
"method":"GET",
"summary":"Get information about currently running topology cmd rpc",
"type":"string",
"nickname":"raft_topology_get_cmd_status",
"produces":[
"application/json"
],
"parameters":[
]
}
]
}
],
"models":{
@@ -3297,11 +3313,11 @@
"properties":{
"start_token":{
"type":"string",
"description":"The range start token"
"description":"The range start token (exclusive)"
},
"end_token":{
"type":"string",
"description":"The range start token"
"description":"The range end token (inclusive)"
},
"endpoints":{
"type":"array",

View File

@@ -749,13 +749,7 @@ rest_force_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
fmopt = flush_mode::skip;
}
auto task = co_await compaction_module.make_and_start_task<global_major_compaction_task_impl>({}, db, fmopt, consider_only_existing_data);
try {
co_await task->done();
} catch (...) {
apilog.error("force_compaction failed: {}", std::current_exception());
throw;
}
co_await task->done();
co_return json_void();
}
@@ -774,13 +768,7 @@ rest_force_keyspace_compaction(http_context& ctx, std::unique_ptr<http::request>
fmopt = flush_mode::skip;
}
auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt, consider_only_existing_data);
try {
co_await task->done();
} catch (...) {
apilog.error("force_keyspace_compaction: keyspace={} tables={} failed: {}", task->get_status().keyspace, table_infos, std::current_exception());
throw;
}
co_await task->done();
co_return json_void();
}
@@ -805,13 +793,7 @@ rest_force_keyspace_cleanup(http_context& ctx, sharded<service::storage_service>
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<cleanup_keyspace_compaction_task_impl>(
{}, std::move(keyspace), db, table_infos, flush_mode::all_tables, tasks::is_user_task::yes);
try {
co_await task->done();
} catch (...) {
apilog.error("force_keyspace_cleanup: keyspace={} tables={} failed: {}", task->get_status().keyspace, table_infos, std::current_exception());
throw;
}
co_await task->done();
co_return json::json_return_type(0);
}
@@ -833,12 +815,7 @@ rest_cleanup_all(http_context& ctx, sharded<service::storage_service>& ss, std::
auto& db = ctx.db;
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<global_cleanup_compaction_task_impl>({}, db);
try {
co_await task->done();
} catch (...) {
apilog.error("cleanup_all failed: {}", std::current_exception());
throw;
}
co_await task->done();
co_return json::json_return_type(0);
}
@@ -850,13 +827,7 @@ rest_perform_keyspace_offstrategy_compaction(http_context& ctx, std::unique_ptr<
bool res = false;
auto& compaction_module = ctx.db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<offstrategy_keyspace_compaction_task_impl>({}, std::move(keyspace), ctx.db, table_infos, &res);
try {
co_await task->done();
} catch (...) {
apilog.error("perform_keyspace_offstrategy_compaction: keyspace={} tables={} failed: {}", task->get_status().keyspace, table_infos, std::current_exception());
throw;
}
co_await task->done();
co_return json::json_return_type(res);
}
@@ -871,13 +842,7 @@ rest_upgrade_sstables(http_context& ctx, std::unique_ptr<http::request> req) {
auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
auto task = co_await compaction_module.make_and_start_task<upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, table_infos, exclude_current_version);
try {
co_await task->done();
} catch (...) {
apilog.error("upgrade_sstables: keyspace={} tables={} failed: {}", keyspace, table_infos, std::current_exception());
throw;
}
co_await task->done();
co_return json::json_return_type(0);
}
@@ -1670,6 +1635,18 @@ rest_raft_topology_upgrade_status(sharded<service::storage_service>& ss, std::un
co_return sstring(format("{}", ustate));
}
static
future<json::json_return_type>
rest_raft_topology_get_cmd_status(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
const auto status = co_await ss.invoke_on(0, [] (auto& ss) {
return ss.get_topology_cmd_status();
});
if (status.active_dst.empty()) {
co_return sstring("none");
}
co_return sstring(fmt::format("{}[{}]: {}", status.current, status.index, fmt::join(status.active_dst, ",")));
}
static
future<json::json_return_type>
rest_move_tablet(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
@@ -1902,6 +1879,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::reload_raft_topology_state.set(r, rest_bind(rest_reload_raft_topology_state, ss, group0_client));
ss::upgrade_to_raft_topology.set(r, rest_bind(rest_upgrade_to_raft_topology, ss));
ss::raft_topology_upgrade_status.set(r, rest_bind(rest_raft_topology_upgrade_status, ss));
ss::raft_topology_get_cmd_status.set(r, rest_bind(rest_raft_topology_get_cmd_status, ss));
ss::move_tablet.set(r, rest_bind(rest_move_tablet, ctx, ss));
ss::add_tablet_replica.set(r, rest_bind(rest_add_tablet_replica, ctx, ss));
ss::del_tablet_replica.set(r, rest_bind(rest_del_tablet_replica, ctx, ss));
@@ -1983,6 +1961,7 @@ void unset_storage_service(http_context& ctx, routes& r) {
ss::reload_raft_topology_state.unset(r);
ss::upgrade_to_raft_topology.unset(r);
ss::raft_topology_upgrade_status.unset(r);
ss::raft_topology_get_cmd_status.unset(r);
ss::move_tablet.unset(r);
ss::add_tablet_replica.unset(r);
ss::del_tablet_replica.unset(r);

View File

@@ -227,7 +227,9 @@ future<> password_authenticator::start() {
utils::get_local_injector().inject("password_authenticator_start_pause", utils::wait_for_message(5min)).get();
if (!legacy_mode(_qp)) {
maybe_create_default_password_with_retries().get();
_superuser_created_promise.set_value();
if (!_superuser_created_promise.available()) {
_superuser_created_promise.set_value();
}
}
});
});

View File

@@ -9,6 +9,7 @@
#include "auth/standard_role_manager.hh"
#include <optional>
#include <stdexcept>
#include <unordered_set>
#include <vector>
@@ -28,6 +29,7 @@
#include "cql3/util.hh"
#include "db/consistency_level_type.hh"
#include "exceptions/exceptions.hh"
#include "utils/error_injection.hh"
#include "utils/log.hh"
#include <seastar/core/loop.hh>
#include <seastar/coroutine/maybe_yield.hh>
@@ -321,7 +323,9 @@ future<> standard_role_manager::start() {
}
if (!legacy) {
co_await maybe_create_default_role_with_retries();
_superuser_created_promise.set_value();
if (!_superuser_created_promise.available()) {
_superuser_created_promise.set_value();
}
}
};
@@ -671,6 +675,12 @@ future<role_set> standard_role_manager::query_all() {
// To avoid many copies of a view.
static const auto role_col_name_string = sstring(meta::roles_table::role_col_name);
if (utils::get_local_injector().enter("standard_role_manager_fail_legacy_query")) {
if (legacy_mode(_qp)) {
throw std::runtime_error("standard_role_manager::query_all: failed due to error injection");
}
}
const auto results = co_await _qp.execute_internal(
query,
db::consistency_level::QUORUM,

View File

@@ -960,8 +960,12 @@ public:
// Given a reference to such a column from the base schema, this function sets the corresponding column
// in the log to the given value for the given row.
void set_value(const clustering_key& log_ck, const column_definition& base_cdef, const managed_bytes_view& value) {
auto& log_cdef = *_log_schema.get_column_definition(log_data_column_name_bytes(base_cdef.name()));
_log_mut.set_cell(log_ck, log_cdef, atomic_cell::make_live(*base_cdef.type, _ts, value, _ttl));
auto log_cdef_ptr = _log_schema.get_column_definition(log_data_column_name_bytes(base_cdef.name()));
if (!log_cdef_ptr) {
throw exceptions::invalid_request_exception(format("CDC log schema for {}.{} does not have base column {}",
_log_schema.ks_name(), _log_schema.cf_name(), base_cdef.name_as_text()));
}
_log_mut.set_cell(log_ck, *log_cdef_ptr, atomic_cell::make_live(*base_cdef.type, _ts, value, _ttl));
}
// Each regular and static column in the base schema has a corresponding column in the log schema
@@ -969,7 +973,13 @@ public:
// Given a reference to such a column from the base schema, this function sets the corresponding column
// in the log to `true` for the given row. If not called, the column will be `null`.
void set_deleted(const clustering_key& log_ck, const column_definition& base_cdef) {
_log_mut.set_cell(log_ck, log_data_column_deleted_name_bytes(base_cdef.name()), data_value(true), _ts, _ttl);
auto log_cdef_ptr = _log_schema.get_column_definition(log_data_column_deleted_name_bytes(base_cdef.name()));
if (!log_cdef_ptr) {
throw exceptions::invalid_request_exception(format("CDC log schema for {}.{} does not have base column {}",
_log_schema.ks_name(), _log_schema.cf_name(), base_cdef.name_as_text()));
}
auto& log_cdef = *log_cdef_ptr;
_log_mut.set_cell(log_ck, *log_cdef_ptr, atomic_cell::make_live(*log_cdef.type, _ts, log_cdef.type->decompose(true), _ttl));
}
// Each regular and static non-atomic column in the base schema has a corresponding column in the log schema
@@ -978,7 +988,12 @@ public:
// Given a reference to such a column from the base schema, this function sets the corresponding column
// in the log to the given set of keys for the given row.
void set_deleted_elements(const clustering_key& log_ck, const column_definition& base_cdef, const managed_bytes& deleted_elements) {
auto& log_cdef = *_log_schema.get_column_definition(log_data_column_deleted_elements_name_bytes(base_cdef.name()));
auto log_cdef_ptr = _log_schema.get_column_definition(log_data_column_deleted_elements_name_bytes(base_cdef.name()));
if (!log_cdef_ptr) {
throw exceptions::invalid_request_exception(format("CDC log schema for {}.{} does not have base column {}",
_log_schema.ks_name(), _log_schema.cf_name(), base_cdef.name_as_text()));
}
auto& log_cdef = *log_cdef_ptr;
_log_mut.set_cell(log_ck, log_cdef, atomic_cell::make_live(*log_cdef.type, _ts, deleted_elements, _ttl));
}
@@ -1865,5 +1880,10 @@ bool cdc::cdc_service::needs_cdc_augmentation(const std::vector<mutation>& mutat
future<std::tuple<std::vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>
cdc::cdc_service::augment_mutation_call(lowres_clock::time_point timeout, std::vector<mutation>&& mutations, tracing::trace_state_ptr tr_state, db::consistency_level write_cl) {
if (utils::get_local_injector().enter("sleep_before_cdc_augmentation")) {
return seastar::sleep(std::chrono::milliseconds(100)).then([this, timeout, mutations = std::move(mutations), tr_state = std::move(tr_state), write_cl] () mutable {
return _impl->augment_mutation_call(timeout, std::move(mutations), std::move(tr_state), write_cl);
});
}
return _impl->augment_mutation_call(timeout, std::move(mutations), std::move(tr_state), write_cl);
}

View File

@@ -855,3 +855,18 @@ rf_rack_valid_keyspaces: false
# Maximum number of items in single BatchWriteItem command. Default is 100.
# Note: DynamoDB has a hard-coded limit of 25.
# alternator_max_items_in_batch_write: 100
#
# io-streaming rate limiting
# When this value is set to non-zero, Scylla throttles disk throughput for
# streaming (network) activities such as backup, repair, tablet migration and more.
# This limit protects user queries, so the network interface does
# not get saturated by streaming activities.
# The recommended value is 75% of network bandwidth.
# E.g for i4i.8xlarge (https://github.com/scylladb/scylla-machine-image/tree/next/common/aws_net_params.json):
# network: 18.75 GiB/s --> 18750 Mib/s --> 1875 MB/s (from network bits to network bytes: divide by 10, not 8)
# Converted to disk bytes: 1875 * 1000 / 1024 = 1831 MB/s (disk wise)
# 75% of disk bytes is: 0.75 * 1831 = 1373 megabytes/s
# stream_io_throughput_mb_per_sec: 1373
#

View File

@@ -245,12 +245,18 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
qp.db().real_database().validate_keyspace_update(*ks_md_update);
service::topology_mutation_builder builder(ts);
service::topology_request_tracking_mutation_builder rtbuilder{global_request_id, qp.proxy().features().topology_requests_type_column};
rtbuilder.set("done", false)
.set("start_time", db_clock::now());
if (!qp.proxy().features().topology_global_request_queue) {
builder.set_global_topology_request(service::global_topology_request::keyspace_rf_change);
builder.set_global_topology_request_id(global_request_id);
builder.set_new_keyspace_rf_change_data(_name, ks_options);
} else {
builder.queue_global_topology_request_id(global_request_id);
rtbuilder.set("request_type", service::global_topology_request::keyspace_rf_change)
.set_new_keyspace_rf_change_data(_name, ks_options);
};
service::topology_change change{{builder.build()}};
@@ -259,13 +265,6 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
return cm.to_mutation(topo_schema);
});
service::topology_request_tracking_mutation_builder rtbuilder{global_request_id, qp.proxy().features().topology_requests_type_column};
rtbuilder.set("done", false)
.set("start_time", db_clock::now())
.set("request_type", service::global_topology_request::keyspace_rf_change);
if (qp.proxy().features().topology_global_request_queue) {
rtbuilder.set_new_keyspace_rf_change_data(_name, ks_options);
}
service::topology_change req_change{{rtbuilder.build()}};
auto topo_req_schema = qp.db().find_schema(db::system_keyspace::NAME, db::system_keyspace::TOPOLOGY_REQUESTS);
@@ -277,33 +276,44 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
muts.insert(muts.begin(), schema_mutations.begin(), schema_mutations.end());
}
auto rs = locator::abstract_replication_strategy::create_replication_strategy(
ks_md_update->strategy_name(),
locator::replication_strategy_params(ks_md_update->strategy_options(), ks_md_update->initial_tablets()));
// If `rf_rack_valid_keyspaces` is enabled, it's forbidden to perform a schema change that
// would lead to an RF-rack-invalid keyspace. Verify that this change does not.
// For more context, see: scylladb/scylladb#23071.
if (qp.db().get_config().rf_rack_valid_keyspaces()) {
auto rs = locator::abstract_replication_strategy::create_replication_strategy(
ks_md_update->strategy_name(),
locator::replication_strategy_params(ks_md_update->strategy_options(), ks_md_update->initial_tablets()));
try {
// There are two things to note here:
// 1. We hold a group0_guard, so it's correct to check this here.
// The topology or schema cannot change while we're performing this query.
// 2. The replication strategy we use here does NOT represent the actual state
// we will arrive at after applying the schema change. For instance, if the user
// did not specify the RF for some of the DCs, it's equal to 0 in the replication
// strategy we pass to this function, while in reality that means that the RF
// will NOT change. That is not a problem:
// - RF=0 is valid for all DCs, so it won't trigger an exception on its own,
// - the keyspace must've been RF-rack-valid before this change. We check that
// condition for all keyspaces at startup.
// The second bullet point is not really true because currently topological changes can
// disturb it (see scylladb/scylladb#23345), but we ignore that.
locator::assert_rf_rack_valid_keyspace(_name, tmptr, *rs);
} catch (const std::exception& e) {
try {
// There are two things to note here:
// 1. We hold a group0_guard, so it's correct to check this here.
// The topology or schema cannot change while we're performing this query.
// 2. The replication strategy we use here does NOT represent the actual state
// we will arrive at after applying the schema change. For instance, if the user
// did not specify the RF for some of the DCs, it's equal to 0 in the replication
// strategy we pass to this function, while in reality that means that the RF
// will NOT change. That is not a problem:
// - RF=0 is valid for all DCs, so it won't trigger an exception on its own,
// - the keyspace must've been RF-rack-valid before this change. We check that
// condition for all keyspaces at startup.
// The second bullet point is not really true because currently topological changes can
// disturb it (see scylladb/scylladb#23345), but we ignore that.
locator::assert_rf_rack_valid_keyspace(_name, tmptr, *rs);
} catch (const std::exception& e) {
if (qp.db().get_config().rf_rack_valid_keyspaces()) {
// There's no guarantee what the type of the exception will be, so we need to
// wrap it manually here in a type that can be passed to the user.
throw exceptions::invalid_request_exception(e.what());
} else {
// Even when the configuration option `rf_rack_valid_keyspaces` is set to false,
// we'd like to inform the user that the keyspace they're altering will not
// satisfy the restriction after the change--but just as a warning.
// For more context, see issue: scylladb/scylladb#23330.
warnings.push_back(seastar::format(
"Keyspace '{}' is not RF-rack-valid: the replication factor doesn't match "
"the rack count in at least one datacenter. A rack failure may reduce availability. "
"For more context, see: "
"https://docs.scylladb.com/manual/stable/reference/glossary.html#term-RF-rack-valid-keyspace.",
_name));
}
}

View File

@@ -8,6 +8,7 @@
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
*/
#include "cdc/log.hh"
#include "utils/assert.hh"
#include <seastar/core/coroutine.hh>
#include "cql3/query_options.hh"
@@ -27,6 +28,7 @@
#include "db/view/view.hh"
#include "cql3/query_processor.hh"
#include "cdc/cdc_extension.hh"
#include "cdc/cdc_partitioner.hh"
namespace cql3 {
@@ -290,6 +292,53 @@ std::pair<schema_ptr, std::vector<view_ptr>> alter_table_statement::prepare_sche
throw exceptions::invalid_request_exception("Cannot use ALTER TABLE on Materialized View");
}
const bool is_cdc_log_table = cdc::is_log_for_some_table(db.real_database(), s->ks_name(), s->cf_name());
// Only a CDC log table will have this partitioner name. User tables should
// not be able to set this. Note that we perform a similar check when trying to
// re-enable CDC for a table, when the log table has been replaced by a user table.
// For better visualization of the above, consider this
//
// cqlsh> CREATE TABLE ks.t (p int PRIMARY KEY, v int) WITH cdc = {'enabled': true};
// cqlsh> INSERT INTO ks.t (p, v) VALUES (1, 2);
// cqlsh> ALTER TABLE ks.t WITH cdc = {'enabled': false};
// cqlsh> DESC TABLE ks.t_scylla_cdc_log WITH INTERNALS; # Save this output!
// cqlsh> DROP TABLE ks.t_scylla_cdc_log;
// cqlsh> [Recreate the log table using the received statement]
// cqlsh> ALTER TABLE ks.t WITH cdc = {'enabled': true};
//
// InvalidRequest: Error from server: code=2200 [Invalid query] message="Cannot create CDC log
// table for table ks.t because a table of name ks.t_scylla_cdc_log already exists"
//
// See commit adda43edc75b901b2329bca8f3eb74596698d05f for more information on THAT case.
// We reuse the same technique here.
const bool was_cdc_log_table = s->get_partitioner().name() == cdc::cdc_partitioner::classname;
if (_column_changes.size() != 0 && is_cdc_log_table) {
throw exceptions::invalid_request_exception(
"You cannot modify the set of columns of a CDC log table directly. "
"Modify the base table instead.");
}
if (_column_changes.size() != 0 && was_cdc_log_table) {
throw exceptions::invalid_request_exception(
"You cannot modify the set of columns of a CDC log table directly. "
"Although the base table has deactivated CDC, this table will continue being "
"a CDC log table until it is dropped. If you want to modify the columns in it, "
"you can only do that by reenabling CDC on the base table, which will reattach "
"this log table. Then you will be able to modify the columns in the base table, "
"and that will have effect on the log table too. Modifying the columns of a CDC "
"log table directly is never allowed.");
}
if (_renames.size() != 0 && is_cdc_log_table) {
throw exceptions::invalid_request_exception("Cannot rename a column of a CDC log table.");
}
if (_renames.size() != 0 && was_cdc_log_table) {
throw exceptions::invalid_request_exception(
"You cannot rename a column of a CDC log table. Although the base table "
"has deactivated CDC, this table will continue being a CDC log table until it "
"is dropped.");
}
auto cfm = schema_builder(s);
if (_properties->get_id()) {

View File

@@ -124,15 +124,26 @@ future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, std::vector
// If `rf_rack_valid_keyspaces` is enabled, it's forbidden to create an RF-rack-invalid keyspace.
// Verify that it's RF-rack-valid.
// For more context, see: scylladb/scylladb#23071.
if (cfg.rf_rack_valid_keyspaces()) {
try {
// We hold a group0_guard, so it's correct to check this here.
// The topology or schema cannot change while we're performing this query.
locator::assert_rf_rack_valid_keyspace(_name, tmptr, *rs);
} catch (const std::exception& e) {
try {
// We hold a group0_guard, so it's correct to check this here.
// The topology or schema cannot change while we're performing this query.
locator::assert_rf_rack_valid_keyspace(_name, tmptr, *rs);
} catch (const std::exception& e) {
if (cfg.rf_rack_valid_keyspaces()) {
// There's no guarantee what the type of the exception will be, so we need to
// wrap it manually here in a type that can be passed to the user.
throw exceptions::invalid_request_exception(e.what());
} else {
// Even when the configuration option `rf_rack_valid_keyspaces` is set to false,
// we'd like to inform the user that the keyspace they're creating does not
// satisfy the restriction--but just as a warning.
// For more context, see issue: scylladb/scylladb#23330.
warnings.push_back(seastar::format(
"Keyspace '{}' is not RF-rack-valid: the replication factor doesn't match "
"the rack count in at least one datacenter. A rack failure may reduce availability. "
"For more context, see: "
"https://docs.scylladb.com/manual/stable/reference/glossary.html#term-RF-rack-valid-keyspace.",
_name));
}
}
} catch (const exceptions::already_exists_exception& e) {

View File

@@ -36,7 +36,7 @@
static logging::logger blogger("batchlog_manager");
const uint32_t db::batchlog_manager::replay_interval;
const std::chrono::seconds db::batchlog_manager::replay_interval;
const uint32_t db::batchlog_manager::page_size;
db::batchlog_manager::batchlog_manager(cql3::query_processor& qp, db::system_keyspace& sys_ks, batchlog_manager_config config)
@@ -116,7 +116,8 @@ future<> db::batchlog_manager::batchlog_replay_loop() {
} catch (...) {
blogger.error("Exception in batch replay: {}", std::current_exception());
}
delay = std::chrono::milliseconds(replay_interval);
delay = utils::get_local_injector().is_enabled("short_batchlog_manager_replay_interval") ?
std::chrono::seconds(1) : replay_interval;
}
}
@@ -132,6 +133,8 @@ future<> db::batchlog_manager::drain() {
_sem.broken();
}
co_await _qp.proxy().abort_batch_writes();
co_await std::move(_loop_done);
blogger.info("Drained");
}
@@ -173,6 +176,11 @@ future<> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cle
return make_ready_future<stop_iteration>(stop_iteration::no);
}
if (utils::get_local_injector().is_enabled("skip_batch_replay")) {
blogger.debug("Skipping batch replay due to skip_batch_replay injection");
return make_ready_future<stop_iteration>(stop_iteration::no);
}
// check version of serialization format
if (!row.has("version")) {
blogger.warn("Skipping logged batch because of unknown version");
@@ -242,7 +250,8 @@ future<> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cle
// send to partially or wholly fail in actually sending stuff. Since we don't
// have hints (yet), send with CL=ALL, and hope we can re-do this soon.
// See below, we use retry on write failure.
return _qp.proxy().mutate(mutations, db::consistency_level::ALL, db::no_timeout, nullptr, empty_service_permit(), db::allow_per_partition_rate_limit::no);
auto timeout = db::timeout_clock::now() + write_timeout;
return _qp.proxy().send_batchlog_replay_to_all_replicas(std::move(mutations), timeout);
});
}).then_wrapped([this, id](future<> batch_result) {
try {

View File

@@ -43,8 +43,9 @@ public:
using post_replay_cleanup = bool_class<class post_replay_cleanup_tag>;
private:
static constexpr uint32_t replay_interval = 60 * 1000; // milliseconds
static constexpr std::chrono::seconds replay_interval = std::chrono::seconds(60);
static constexpr uint32_t page_size = 128; // same as HHOM, for now, w/out using any heuristics. TODO: set based on avg batch size.
static constexpr std::chrono::seconds write_timeout = std::chrono::seconds(300);
using clock_type = lowres_clock;

View File

@@ -800,6 +800,8 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
void end_flush() {
_segment_manager->end_flush();
if (can_delete()) {
// #25709 - do this early if possible
_extended_segments.clear();
_segment_manager->discard_unused_segments();
}
}
@@ -875,6 +877,8 @@ public:
void release_cf_count(const cf_id_type& cf) {
mark_clean(cf, 1);
if (can_delete()) {
// #25709 - do this early if possible
_extended_segments.clear();
_segment_manager->discard_unused_segments();
}
}
@@ -2576,20 +2580,24 @@ struct fmt::formatter<db::commitlog::segment::cf_mark> {
void db::commitlog::segment_manager::discard_unused_segments() noexcept {
clogger.trace("Checking for unused segments ({} active)", _segments.size());
std::erase_if(_segments, [=](sseg_ptr s) {
if (s->can_delete()) {
clogger.debug("Segment {} is unused", *s);
return true;
}
if (s->is_still_allocating()) {
clogger.debug("Not safe to delete segment {}; still allocating.", *s);
} else if (!s->is_clean()) {
clogger.debug("Not safe to delete segment {}; dirty is {}", *s, segment::cf_mark {*s});
} else {
clogger.debug("Not safe to delete segment {}; disk ops pending", *s);
}
return false;
});
// #25709 ensure we don't free any segment until after prune.
{
auto tmp = _segments;
std::erase_if(_segments, [=](sseg_ptr s) {
if (s->can_delete()) {
clogger.debug("Segment {} is unused", *s);
return true;
}
if (s->is_still_allocating()) {
clogger.debug("Not safe to delete segment {}; still allocating.", *s);
} else if (!s->is_clean()) {
clogger.debug("Not safe to delete segment {}; dirty is {}", *s, segment::cf_mark {*s});
} else {
clogger.debug("Not safe to delete segment {}; disk ops pending", *s);
}
return false;
});
}
// launch in background, but guard with gate so this deletion is
// sure to finish in shutdown, because at least through this path,
@@ -2878,7 +2886,10 @@ future<> db::commitlog::segment_manager::do_pending_deletes() {
}
future<> db::commitlog::segment_manager::orphan_all() {
_segments.clear();
// #25709. The actual process of destroying the elements here
// might cause a call into discard_unused_segments.
// Ensure the target vector is empty when we get to the destructors.
auto tmp = std::exchange(_segments, {});
return clear_reserve_segments();
}
@@ -3255,9 +3266,13 @@ const db::commitlog::config& db::commitlog::active_config() const {
return _segment_manager->cfg;
}
db::commitlog::segment_data_corruption_error::segment_data_corruption_error(std::string_view msg, uint64_t s)
: _msg(fmt::format("Segment data corruption: {}", msg))
, _bytes(s)
{}
db::commitlog::segment_truncation::segment_truncation(uint64_t pos)
: _msg(fmt::format("Segment truncation at {}", pos))
db::commitlog::segment_truncation::segment_truncation(std::string_view reason, uint64_t pos)
: _msg(fmt::format("Segment truncation at {}. Reason: {}", pos, reason))
, _pos(pos)
{}
@@ -3447,7 +3462,8 @@ db::commitlog::read_log_file(const replay_state& state, sstring filename, sstrin
while (rem < size) {
if (eof) {
throw segment_truncation(block_boundry);
auto reason = fmt::format("unexpected EOF, rem={}, size={}", rem, size);
throw segment_truncation(std::move(reason), block_boundry);
}
auto block_size = alignment - initial.size_bytes();
@@ -3458,7 +3474,8 @@ db::commitlog::read_log_file(const replay_state& state, sstring filename, sstrin
if (tmp.size_bytes() == 0) {
eof = true;
throw segment_truncation(block_boundry);
auto reason = fmt::format("read 0 bytes, while tried to read {}", block_size);
throw segment_truncation(std::move(reason), block_boundry);
}
crc32_nbo crc;
@@ -3493,10 +3510,12 @@ db::commitlog::read_log_file(const replay_state& state, sstring filename, sstrin
auto checksum = crc.checksum();
if (check != checksum) {
throw segment_data_corruption_error("Data corruption", alignment);
auto reason = fmt::format("checksums do not match: {:x} vs. {:x}", check, checksum);
throw segment_data_corruption_error(std::move(reason), alignment);
}
if (id != this->id) {
throw segment_truncation(pos + rem);
auto reason = fmt::format("IDs do not match: {} vs. {}", id, this->id);
throw segment_truncation(std::move(reason), pos + rem);
}
}
tmp.remove_suffix(detail::sector_overhead_size);
@@ -3771,7 +3790,8 @@ db::commitlog::read_log_file(const replay_state& state, sstring filename, sstrin
co_await read_chunk();
}
if (corrupt_size > 0) {
throw segment_data_corruption_error("Data corruption", corrupt_size);
auto reason = fmt::format("corrupted size while reading file: {}", corrupt_size);
throw segment_data_corruption_error(std::move(reason), corrupt_size);
}
} catch (...) {
p = std::current_exception();

View File

@@ -392,9 +392,7 @@ public:
class segment_data_corruption_error: public segment_error {
std::string _msg;
public:
segment_data_corruption_error(std::string msg, uint64_t s)
: _msg(std::move(msg)), _bytes(s) {
}
segment_data_corruption_error(std::string_view msg, uint64_t s);
uint64_t bytes() const {
return _bytes;
}
@@ -425,7 +423,7 @@ public:
std::string _msg;
uint64_t _pos;
public:
segment_truncation(uint64_t);
segment_truncation(std::string_view reason, uint64_t position);
uint64_t position() const;
const char* what() const noexcept override;

View File

@@ -86,6 +86,12 @@ object_storage_endpoints_to_json(const std::vector<db::object_storage_endpoint_p
return value_to_json(m);
}
static
json::json_return_type
uuid_to_json(const db::config::UUID& uuid) {
return value_to_json(format("{}", uuid));
}
// Convert a value that can be printed with fmt::format, or a vector of
// such values, to JSON. An example is enum_option<T>, because enum_option<T>
// has a specialization for fmt::formatter.
@@ -294,6 +300,12 @@ const config_type& config_type_for<std::vector<db::object_storage_endpoint_param
return ct;
}
template <>
const config_type& config_type_for<db::config::UUID>() {
static config_type ct("UUID", uuid_to_json);
return ct;
}
}
namespace YAML {
@@ -491,6 +503,22 @@ struct convert<db::object_storage_endpoint_param> {
}
};
template<>
struct convert<utils::UUID> {
static bool decode(const Node& node, utils::UUID& uuid) {
std::string uuid_string;
if (!convert<std::string>::decode(node, uuid_string)) {
return false;
}
try {
std::istringstream(uuid_string) >> uuid;
} catch (boost::program_options::invalid_option_value&) {
return false;
}
return true;
}
};
}
#if defined(DEBUG)
@@ -819,7 +847,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
, inter_dc_stream_throughput_outbound_megabits_per_sec(this, "inter_dc_stream_throughput_outbound_megabits_per_sec", value_status::Unused, 0,
"Throttles all streaming file transfer between the data centers. This setting allows throttles streaming throughput betweens data centers in addition to throttling all network stream traffic as configured with stream_throughput_outbound_megabits_per_sec.")
, stream_io_throughput_mb_per_sec(this, "stream_io_throughput_mb_per_sec", liveness::LiveUpdate, value_status::Used, 0,
"Throttles streaming I/O to the specified total throughput (in MiBs/s) across the entire system. Streaming I/O includes the one performed by repair and both RBNO and legacy topology operations such as adding or removing a node. Setting the value to 0 disables stream throttling.")
"Throttles streaming I/O to the specified total throughput (in MiBs/s) across the entire system. Streaming I/O includes the one performed by repair and both RBNO and legacy topology operations such as adding or removing a node. Setting the value to 0 disables stream throttling. It is recommended to set the value for this parameter to be 75% of network bandwidth")
, stream_plan_ranges_fraction(this, "stream_plan_ranges_fraction", liveness::LiveUpdate, value_status::Used, 0.1,
"Specify the fraction of ranges to stream in a single stream plan. Value is between 0 and 1.")
, enable_file_stream(this, "enable_file_stream", liveness::LiveUpdate, value_status::Used, true, "Set true to use file based stream for tablet instead of mutation based stream")
@@ -942,6 +970,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
"The default timeout for other, miscellaneous operations.\n"
"\n"
"Related information: About hinted handoff writes")
, request_timeout_on_shutdown_in_seconds(this, "request_timeout_on_shutdown_in_seconds", value_status::Used, 30,
"Timeout for CQL server requests on shutdown. After this timeout the server will shutdown all connections.")
, group0_raft_op_timeout_in_ms(this, "group0_raft_op_timeout_in_ms", liveness::LiveUpdate, value_status::Used, 60000,
"The time in milliseconds that group0 allows a Raft operation to complete.")
/**
@@ -1230,7 +1260,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
, sstable_summary_ratio(this, "sstable_summary_ratio", value_status::Used, 0.0005, "Enforces that 1 byte of summary is written for every N (2000 by default)"
"bytes written to data file. Value must be between 0 and 1.")
, components_memory_reclaim_threshold(this, "components_memory_reclaim_threshold", liveness::LiveUpdate, value_status::Used, .2, "Ratio of available memory for all in-memory components of SSTables in a shard beyond which the memory will be reclaimed from components until it falls back under the threshold. Currently, this limit is only enforced for bloom filters.")
, large_memory_allocation_warning_threshold(this, "large_memory_allocation_warning_threshold", value_status::Used, (size_t(128) << 10) + 1, "Warn about memory allocations above this size; set to zero to disable.")
, large_memory_allocation_warning_threshold(this, "large_memory_allocation_warning_threshold", value_status::Used, size_t(1) << 20, "Warn about memory allocations above this size; set to zero to disable.")
, enable_deprecated_partitioners(this, "enable_deprecated_partitioners", value_status::Used, false, "Enable the byteordered and random partitioners. These partitioners are deprecated and will be removed in a future version.")
, enable_keyspace_column_family_metrics(this, "enable_keyspace_column_family_metrics", value_status::Used, false, "Enable per keyspace and per column family metrics reporting.")
, enable_node_aggregated_table_metrics(this, "enable_node_aggregated_table_metrics", value_status::Used, true, "Enable aggregated per node, per keyspace and per table metrics reporting, applicable if enable_keyspace_column_family_metrics is false.")
@@ -1399,7 +1429,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
"The maximum fraction of cache memory permitted for use by index cache. Clamped to the [0.0; 1.0] range. Must be small enough to not deprive the row cache of memory, but should be big enough to fit a large fraction of the index. The default value 0.2 means that at least 80\% of cache memory is reserved for the row cache, while at most 20\% is usable by the index cache.")
, consistent_cluster_management(this, "consistent_cluster_management", value_status::Deprecated, true, "Use RAFT for cluster management and DDL.")
, force_gossip_topology_changes(this, "force_gossip_topology_changes", value_status::Used, false, "Force gossip-based topology operations in a fresh cluster. Only the first node in the cluster must use it. The rest will fall back to gossip-based operations anyway. This option should be used only for testing. Note: gossip topology changes are incompatible with tablets.")
, recovery_leader(this, "recovery_leader", liveness::LiveUpdate, value_status::Used, "", "Host ID of the node restarted first while performing the Manual Raft-based Recovery Procedure. Warning: this option disables some guardrails for the needs of the Manual Raft-based Recovery Procedure. Make sure you unset it at the end of the procedure.")
, recovery_leader(this, "recovery_leader", liveness::LiveUpdate, value_status::Used, utils::null_uuid(), "Host ID of the node restarted first while performing the Manual Raft-based Recovery Procedure. Warning: this option disables some guardrails for the needs of the Manual Raft-based Recovery Procedure. Make sure you unset it at the end of the procedure.")
, wasm_cache_memory_fraction(this, "wasm_cache_memory_fraction", value_status::Used, 0.01, "Maximum total size of all WASM instances stored in the cache as fraction of total shard memory.")
, wasm_cache_timeout_in_ms(this, "wasm_cache_timeout_in_ms", value_status::Used, 5000, "Time after which an instance is evicted from the cache.")
, wasm_cache_instance_size_limit(this, "wasm_cache_instance_size_limit", value_status::Used, 1024*1024, "Instances with size above this limit will not be stored in the cache.")

View File

@@ -207,6 +207,7 @@ public:
using seed_provider_type = db::seed_provider_type;
using hinted_handoff_enabled_type = db::hints::host_filter;
using error_injection_at_startup = db::error_injection_at_startup;
using UUID = utils::UUID;
/*
* All values and documentation taken from
@@ -322,6 +323,7 @@ public:
named_value<uint32_t> truncate_request_timeout_in_ms;
named_value<uint32_t> write_request_timeout_in_ms;
named_value<uint32_t> request_timeout_in_ms;
named_value<uint32_t> request_timeout_on_shutdown_in_seconds;
named_value<uint32_t> group0_raft_op_timeout_in_ms;
named_value<bool> cross_node_timeout;
named_value<uint32_t> internode_send_buff_size_in_bytes;
@@ -521,7 +523,7 @@ public:
named_value<bool> consistent_cluster_management;
named_value<bool> force_gossip_topology_changes;
named_value<sstring> recovery_leader;
named_value<UUID> recovery_leader;
named_value<double> wasm_cache_memory_fraction;
named_value<uint32_t> wasm_cache_timeout_in_ms;

View File

@@ -65,18 +65,18 @@ future<> hint_endpoint_manager::do_store_hint(schema_ptr s, lw_shared_ptr<const
const replay_position rp = rh.release();
if (_last_written_rp < rp) {
_last_written_rp = rp;
manager_logger.debug("[{}] Updated last written replay position to {}", end_point_key(), rp);
manager_logger.trace("hint_endpoint_manager[{}]:do_store_hint: Updated last written replay position to {}", end_point_key(), rp);
}
++shard_stats().written;
manager_logger.trace("Hint to {} was stored", end_point_key());
manager_logger.trace("hint_endpoint_manager[{}]:do_store_hint: Hint has been stored", end_point_key());
tracing::trace(tr_state, "Hint to {} was stored", end_point_key());
} catch (...) {
++shard_stats().errors;
const auto eptr = std::current_exception();
manager_logger.debug("store_hint(): got the exception when storing a hint to {}: {}", end_point_key(), eptr);
manager_logger.debug("hint_endpoint_manager[{}]:do_store_hint: Exception when storing a hint: {}", end_point_key(), eptr);
tracing::trace(tr_state, "Failed to store a hint to {}: {}", end_point_key(), eptr);
}
@@ -92,7 +92,7 @@ bool hint_endpoint_manager::store_hint(schema_ptr s, lw_shared_ptr<const frozen_
return do_store_hint(std::move(s), std::move(fm), tr_state);
});
} catch (...) {
manager_logger.trace("Failed to store a hint to {}: {}", end_point_key(), std::current_exception());
manager_logger.trace("hint_endpoint_manager[{}]:store_hint: Failed to store a hint: {}", end_point_key(), std::current_exception());
tracing::trace(tr_state, "Failed to store a hint to {}: {}", end_point_key(), std::current_exception());
++shard_stats().dropped;
@@ -109,16 +109,23 @@ future<> hint_endpoint_manager::populate_segments_to_replay() {
}
void hint_endpoint_manager::start() {
manager_logger.debug("hint_endpoint_manager[{}]:start: Starting", end_point_key());
clear_stopped();
allow_hints();
_sender.start();
manager_logger.debug("hint_endpoint_manager[{}]:start: Finished", end_point_key());
}
future<> hint_endpoint_manager::stop(drain should_drain) noexcept {
if(stopped()) {
if (stopped()) {
manager_logger.warn("hint_endpoint_manager[{}]:stop: Stop had already been called", end_point_key());
return make_exception_future<>(std::logic_error(format("ep_manager[{}]: stop() is called twice", _key).c_str()));
}
manager_logger.debug("hint_endpoint_manager[{}]:stop: Starting", end_point_key());
return seastar::async([this, should_drain] {
std::exception_ptr eptr;
@@ -139,10 +146,11 @@ future<> hint_endpoint_manager::stop(drain should_drain) noexcept {
}).handle_exception([&eptr] (auto e) { eptr = std::move(e); }).get();
if (eptr) {
manager_logger.error("ep_manager[{}]: exception: {}", _key, eptr);
manager_logger.error("hint_endpoint_manager[{}]:stop: Exception occurred: {}", _key, eptr);
}
set_stopped();
manager_logger.debug("hint_endpoint_manager[{}]:stop: Finished", end_point_key());
});
}
@@ -194,7 +202,7 @@ future<hints_store_ptr> hint_endpoint_manager::get_or_load() {
}
future<db::commitlog> hint_endpoint_manager::add_store() noexcept {
manager_logger.trace("Going to add a store to {}", _hints_dir.c_str());
manager_logger.debug("hint_endpoint_manager[{}]:add_store: Going to add a store: {}", end_point_key(), _hints_dir.native());
return futurize_invoke([this] {
return io_check([name = _hints_dir.c_str()] { return recursive_touch_directory(name); }).then([this] () {
@@ -289,6 +297,8 @@ future<db::commitlog> hint_endpoint_manager::add_store() noexcept {
_sender.add_segment(std::move(seg));
}
manager_logger.debug("hint_endpoint_manager[{}]:add_store: Finished", end_point_key());
co_return l;
});
});

View File

@@ -56,8 +56,8 @@ future<> hint_sender::flush_maybe() noexcept {
if (current_time >= _next_flush_tp) {
return _ep_manager.flush_current_hints().then([this, current_time] {
_next_flush_tp = current_time + manager::hints_flush_period;
}).handle_exception([] (auto eptr) {
manager_logger.trace("flush_maybe() failed: {}", eptr);
}).handle_exception([this] (auto eptr) {
manager_logger.debug("hint_sender[{}]:flush_maybe: Failed with {}", _ep_key, eptr);
return make_ready_future<>();
});
}
@@ -115,7 +115,7 @@ const column_mapping& hint_sender::get_column_mapping(lw_shared_ptr<send_one_fil
throw no_column_mapping(fm.schema_version());
}
manager_logger.debug("new schema version {}", fm.schema_version());
manager_logger.trace("hint_sender[{}]:get_column_mapping: new schema version {}", _ep_key, fm.schema_version());
cm_it = ctx_ptr->schema_ver_to_column_mapping.emplace(fm.schema_version(), *hr.get_column_mapping()).first;
}
@@ -175,23 +175,22 @@ future<> hint_sender::stop(drain should_drain) noexcept {
//
// The next call for send_hints_maybe() will send the last hints to the current end point and when it is
// done there is going to be no more pending hints and the corresponding hints directory may be removed.
manager_logger.trace("Draining for {}: start", end_point_key());
manager_logger.trace("hint_sender[{}]:stop: Draining starts", end_point_key());
set_draining();
send_hints_maybe();
_ep_manager.flush_current_hints().handle_exception([] (auto e) {
manager_logger.error("Failed to flush pending hints: {}. Ignoring...", e);
_ep_manager.flush_current_hints().handle_exception([this] (auto e) {
manager_logger.error("hint_sender[{}]:stop: Failed to flush pending hints: {}. Ignoring", _ep_key, e);
}).get();
send_hints_maybe();
manager_logger.trace("Draining for {}: end", end_point_key());
manager_logger.trace("hint_sender[{}]:stop: Draining finished", end_point_key());
}
// TODO: Change this log to match the class name, but first make sure no test
// relies on the old one.
manager_logger.trace("ep_manager({})::sender: exiting", end_point_key());
manager_logger.debug("hint_sender[{}]:stop: Finished", end_point_key());
});
}
void hint_sender::cancel_draining() {
manager_logger.info("Draining of {} has been marked as canceled", _ep_key);
manager_logger.info("hint_sender[{}]:cancel_draining: Marking as canceled", _ep_key);
if (_state.contains(state::draining)) {
_state.remove(state::draining);
}
@@ -222,9 +221,8 @@ void hint_sender::start() {
attr.sched_group = _hints_cpu_sched_group;
_stopped = seastar::async(std::move(attr), [this] {
// TODO: Change this log to match the class name, but first make sure no test
// relies on the old one.
manager_logger.trace("ep_manager({})::sender: started", end_point_key());
manager_logger.debug("hint_sender[{}]:start: Starting", end_point_key());
while (!stopping()) {
try {
flush_maybe().get();
@@ -237,11 +235,11 @@ void hint_sender::start() {
break;
} catch (...) {
// log and keep on spinning
// TODO: Change this log to match the class name, but first make sure no test
// relies on the old one.
manager_logger.trace("sender: got the exception: {}", std::current_exception());
manager_logger.debug("hint_sender[{}]:start: Exception in the loop: {}", _ep_key, std::current_exception());
}
}
manager_logger.debug("hint_sender[{}]:start: Exited the loop", _ep_key);
});
}
@@ -257,14 +255,14 @@ future<> hint_sender::send_one_mutation(frozen_mutation_and_schema m) {
const auto dst = end_point_key();
if (std::ranges::contains(natural_endpoints, dst) && !tm.is_leaving(dst)) {
manager_logger.trace("Sending directly to {}", dst);
manager_logger.trace("hint_sender[{}]:send_one_mutation: Sending directly", dst);
return _proxy.send_hint_to_endpoint(std::move(m), std::move(ermp), dst);
} else {
if (manager_logger.is_enabled(log_level::trace)) {
if (tm.is_leaving(end_point_key())) {
manager_logger.trace("The original target endpoint {} is leaving. Mutating from scratch...", dst);
manager_logger.trace("hint_sender[{}]:send_one_mutation: Original target is leaving. Mutating from scratch", dst);
} else {
manager_logger.trace("Endpoints set has changed and {} is no longer a replica. Mutating from scratch...", dst);
manager_logger.trace("hint_sender[{}]:send_one_mutation: Endpoint set has changed and original target is no longer a replica. Mutating from scratch", dst);
}
}
return _proxy.send_hint_to_all_replicas(std::move(m));
@@ -288,9 +286,9 @@ future<> hint_sender::send_one_hint(lw_shared_ptr<send_one_file_ctx> ctx_ptr, fr
// Files are aggregated for at most manager::hints_timer_period therefore the oldest hint there is
// (last_modification - manager::hints_timer_period) old.
if (const auto now = gc_clock::now().time_since_epoch(); now - secs_since_file_mod > gc_grace_sec - manager::hints_flush_period) {
manager_logger.debug("send_hints(): the hint is too old, skipping it, "
manager_logger.trace("hint_sender[{}]:send_hints: Hint is too old, skipping it, "
"secs since file last modification {}, gc_grace_sec {}, hints_flush_period {}",
now - secs_since_file_mod, gc_grace_sec, manager::hints_flush_period);
_ep_key, now - secs_since_file_mod, gc_grace_sec, manager::hints_flush_period);
return make_ready_future<>();
}
@@ -299,24 +297,24 @@ future<> hint_sender::send_one_hint(lw_shared_ptr<send_one_file_ctx> ctx_ptr, fr
++this->shard_stats().sent_total;
this->shard_stats().sent_hints_bytes_total += mutation_size;
}).handle_exception([this, ctx_ptr] (auto eptr) {
manager_logger.trace("send_one_hint(): failed to send to {}: {}", end_point_key(), eptr);
manager_logger.trace("hint_sender[{}]:send_one_hint: Failed to send: {}", end_point_key(), eptr);
++this->shard_stats().send_errors;
return make_exception_future<>(std::move(eptr));
});
// ignore these errors and move on - probably this hint is too old and the KS/CF has been deleted...
} catch (replica::no_such_column_family& e) {
manager_logger.debug("send_hints(): no_such_column_family: {}", e.what());
manager_logger.debug("hint_sender[{}]:send_one_hint: no_such_column_family: {}", _ep_key, e.what());
++this->shard_stats().discarded;
} catch (replica::no_such_keyspace& e) {
manager_logger.debug("send_hints(): no_such_keyspace: {}", e.what());
manager_logger.debug("hint_sender[{}]:send_one_hint: no_such_keyspace: {}", _ep_key, e.what());
++this->shard_stats().discarded;
} catch (no_column_mapping& e) {
manager_logger.debug("send_hints(): {} at {}: {}", fname, rp, e.what());
manager_logger.debug("hint_sender[{}]:send_one_hint: no_column_mapping: {} at {}: {}", _ep_key, fname, rp, e.what());
++this->shard_stats().discarded;
} catch (...) {
auto eptr = std::current_exception();
manager_logger.debug("send_hints(): unexpected error in file {} at {}: {}", fname, rp, eptr);
manager_logger.debug("hint_sender[{}]:send_one_hint: Unexpected error in file {} at {}: {}", _ep_key, fname, rp, eptr);
++this->shard_stats().send_errors;
return make_exception_future<>(std::move(eptr));
}
@@ -338,21 +336,24 @@ future<> hint_sender::send_one_hint(lw_shared_ptr<send_one_file_ctx> ctx_ptr, fr
}
f.ignore_ready_future();
});
}).handle_exception([ctx_ptr, rp] (auto eptr) {
manager_logger.trace("send_one_file(): Hmmm. Something bad had happened: {}", eptr);
}).handle_exception([this, ctx_ptr, rp] (auto eptr) {
manager_logger.trace("hint_sender[{}]:send_one_hint: Exception occurred: {}", _ep_key, eptr);
ctx_ptr->on_hint_send_failure(rp);
});
}
void hint_sender::notify_replay_waiters() noexcept {
if (!_foreign_segments_to_replay.empty()) {
manager_logger.trace("[{}] notify_replay_waiters(): not notifying because there are still {} foreign segments to replay", end_point_key(), _foreign_segments_to_replay.size());
manager_logger.trace("hint_sender[{}]:notify_replay_waiters: Not notifying because there are still {} foreign segments to replay",
end_point_key(), _foreign_segments_to_replay.size());
return;
}
manager_logger.trace("[{}] notify_replay_waiters(): replay position upper bound was updated to {}", end_point_key(), _sent_upper_bound_rp);
manager_logger.trace("hint_sender[{}]:notify_replay_waiters: Replay position upper bound was updated to {}", end_point_key(), _sent_upper_bound_rp);
while (!_replay_waiters.empty() && _replay_waiters.begin()->first < _sent_upper_bound_rp) {
manager_logger.trace("[{}] notify_replay_waiters(): notifying one ({} < {})", end_point_key(), _replay_waiters.begin()->first, _sent_upper_bound_rp);
manager_logger.trace("hint_sender[{}]:notify_replay_waiters: Notifying one ({} < {})",
end_point_key(), _replay_waiters.begin()->first, _sent_upper_bound_rp);
auto ptr = _replay_waiters.begin()->second;
(**ptr).set_value();
(*ptr) = std::nullopt; // Prevent it from being resolved by abort source subscription
@@ -362,7 +363,7 @@ void hint_sender::notify_replay_waiters() noexcept {
void hint_sender::dismiss_replay_waiters() noexcept {
for (auto& p : _replay_waiters) {
manager_logger.debug("[{}] dismiss_replay_waiters(): dismissing one", end_point_key());
manager_logger.debug("hint_sender[{}]:dismiss_replay_waiters: Dismissing one", end_point_key());
auto ptr = p.second;
(**ptr).set_exception(std::runtime_error(format("Hints manager for {} is stopping", end_point_key())));
(*ptr) = std::nullopt; // Prevent it from being resolved by abort source subscription
@@ -371,14 +372,15 @@ void hint_sender::dismiss_replay_waiters() noexcept {
}
future<> hint_sender::wait_until_hints_are_replayed_up_to(abort_source& as, db::replay_position up_to_rp) {
manager_logger.debug("[{}] wait_until_hints_are_replayed_up_to(): entering with target {}", end_point_key(), up_to_rp);
manager_logger.debug("hint_sender[{}]:wait_until_hints_are_replayed_up_to: Entering with target {}", end_point_key(), up_to_rp);
if (_foreign_segments_to_replay.empty() && up_to_rp < _sent_upper_bound_rp) {
manager_logger.debug("[{}] wait_until_hints_are_replayed_up_to(): hints were already replayed above the point ({} < {})", end_point_key(), up_to_rp, _sent_upper_bound_rp);
manager_logger.debug("hint_sender[{}]:wait_until_hints_are_replayed_up_to: Hints were already replayed above the point ({} < {})",
end_point_key(), up_to_rp, _sent_upper_bound_rp);
return make_ready_future<>();
}
if (as.abort_requested()) {
manager_logger.debug("[{}] wait_until_hints_are_replayed_up_to(): already aborted - stopping", end_point_key());
manager_logger.debug("hint_sender[{}]:wait_until_hints_are_replayed_up_to: Already aborted - stopping", end_point_key());
return make_exception_future<>(abort_requested_exception());
}
@@ -389,7 +391,7 @@ future<> hint_sender::wait_until_hints_are_replayed_up_to(abort_source& as, db::
// The promise already was resolved by `notify_replay_waiters` and removed from the map
return;
}
manager_logger.debug("[{}] wait_until_hints_are_replayed_up_to(): abort requested - stopping", end_point_key());
manager_logger.debug("hint_sender[{}]:wait_until_hints_are_replayed_up_to: Abort requested - stopping", end_point_key());
_replay_waiters.erase(it);
(**ptr).set_exception(abort_requested_exception());
});
@@ -398,7 +400,7 @@ future<> hint_sender::wait_until_hints_are_replayed_up_to(abort_source& as, db::
// therefore we cannot capture `this`
auto ep = end_point_key();
return (**ptr).get_future().finally([sub = std::move(sub), ep] {
manager_logger.debug("[{}] wait_until_hints_are_replayed_up_to(): returning after the future was satisfied", ep);
manager_logger.debug("hint_sender[{}]:wait_until_hints_are_replayed_up_to: Returning after the future was satisfied", ep);
});
}
@@ -470,7 +472,7 @@ bool hint_sender::send_one_file(const sstring& fname) {
}
if (canceled_draining()) {
manager_logger.debug("[{}] Exiting reading from commitlog because of canceled draining", _ep_key);
manager_logger.debug("hint_sender[{}]:send_one_file: Exiting reading from commitlog because of canceled draining", _ep_key);
// We need to throw an exception here to cancel reading the segment.
throw canceled_draining_exception{};
}
@@ -502,13 +504,15 @@ bool hint_sender::send_one_file(const sstring& fname) {
};
}, _last_not_complete_rp.pos, &_db.extensions()).get();
} catch (db::commitlog::segment_error& ex) {
manager_logger.error("{}: {}. Dropping...", fname, ex.what());
manager_logger.error("hint_sender[{}]:send_one_file: Segment error in {}: {}. Last not complete position={}",
_ep_key, fname, ex.what(), _last_not_complete_rp);
ctx_ptr->segment_replay_failed = false;
++this->shard_stats().corrupted_files;
} catch (const canceled_draining_exception&) {
manager_logger.debug("[{}] Loop in send_one_file finishes due to canceled draining", _ep_key);
manager_logger.debug("hint_sender[{}]:send_one_file: Loop in send_one_file finishes due to canceled draining", _ep_key);
} catch (...) {
manager_logger.trace("sending of {} failed: {}", fname, std::current_exception());
manager_logger.debug("hint_sender[{}]:send_one_file: Sending of {} failed: {}. Last not complete position={}",
_ep_key, fname, std::current_exception(), _last_not_complete_rp);
ctx_ptr->segment_replay_failed = true;
}
@@ -523,7 +527,7 @@ bool hint_sender::send_one_file(const sstring& fname) {
// If we are draining ignore failures and drop the segment even if we failed to send it.
if (draining() && ctx_ptr->segment_replay_failed) {
manager_logger.trace("send_one_file(): we are draining so we are going to delete the segment anyway");
manager_logger.debug("hint_sender[{}]:send_one_file: We are draining, so we are going to delete the segment anyway", _ep_key);
ctx_ptr->segment_replay_failed = false;
}
@@ -533,7 +537,7 @@ bool hint_sender::send_one_file(const sstring& fname) {
// If there was an error thrown by read_log_file function itself, we will retry sending from
// the last hint that was successfully sent (last_succeeded_rp).
_last_not_complete_rp = ctx_ptr->first_failed_rp.value_or(ctx_ptr->last_succeeded_rp.value_or(_last_not_complete_rp));
manager_logger.trace("send_one_file(): error while sending hints from {}, last RP is {}", fname, _last_not_complete_rp);
manager_logger.debug("hint_sender[{}]:send_one_file: Error while sending hints from {}, last RP is {}", _ep_key, fname, _last_not_complete_rp);
return false;
}
@@ -546,7 +550,7 @@ bool hint_sender::send_one_file(const sstring& fname) {
// clear the replay position - we are going to send the next segment...
_last_not_complete_rp = replay_position();
_last_schema_ver_to_column_mapping.clear();
manager_logger.trace("send_one_file(): segment {} was sent in full and deleted", fname);
manager_logger.debug("hint_sender[{}]:send_one_file: Segment {} has been sent in full and deleted", _ep_key, fname);
return true;
}
@@ -572,14 +576,15 @@ void hint_sender::pop_current_segment() {
// Runs in the seastar::async context
void hint_sender::send_hints_maybe() noexcept {
using namespace std::literals::chrono_literals;
manager_logger.trace("send_hints(): going to send hints to {}, we have {} segment to replay", end_point_key(), _segments_to_replay.size() + _foreign_segments_to_replay.size());
manager_logger.trace("hint_sender[{}]:send_hints_maybe: Going to send hints. We have {} segment to replay",
end_point_key(), _segments_to_replay.size() + _foreign_segments_to_replay.size());
int replayed_segments_count = 0;
try {
while (true) {
if (canceled_draining()) {
manager_logger.debug("[{}] Exiting loop in send_hints_maybe because of canceled draining", _ep_key);
manager_logger.debug("hint_sender[{}]:send_hints_maybe: Exiting loop in send_hints_maybe because of canceled draining", _ep_key);
break;
}
const sstring* seg_name = name_of_current_segment();
@@ -598,7 +603,7 @@ void hint_sender::send_hints_maybe() noexcept {
// Ignore exceptions, we will retry sending this file from where we left off the next time.
// Exceptions are not expected here during the regular operation, so just log them.
} catch (...) {
manager_logger.trace("send_hints(): got the exception: {}", std::current_exception());
manager_logger.debug("hint_sender[{}]:send_hints_maybe: Exception occurred while sending: {}", _ep_key, std::current_exception());
}
if (have_segments()) {
@@ -609,7 +614,7 @@ void hint_sender::send_hints_maybe() noexcept {
_next_send_retry_tp = _next_flush_tp;
}
manager_logger.trace("send_hints(): we handled {} segments", replayed_segments_count);
manager_logger.debug("hint_sender[{}]:send_hints_maybe: We handled {} segments", _ep_key, replayed_segments_count);
}
hint_stats& hint_sender::shard_stats() {


@@ -505,20 +505,20 @@ bool manager::can_hint_for(endpoint_id ep) const noexcept {
// hints where N is the total number of nodes in the cluster.
const auto hipf = hints_in_progress_for(ep);
if (_stats.size_of_hints_in_progress > max_size_of_hints_in_progress() && hipf > 0) {
manager_logger.trace("size_of_hints_in_progress {} hints_in_progress_for({}) {}",
manager_logger.trace("can_hint_for: size_of_hints_in_progress {} hints_in_progress_for({}) {}",
_stats.size_of_hints_in_progress, ep, hipf);
return false;
}
// Check that the destination DC is "hintable".
if (!check_dc_for(ep)) {
manager_logger.trace("{}'s DC is not hintable", ep);
manager_logger.trace("can_hint_for: {}'s DC is not hintable", ep);
return false;
}
const bool node_is_alive = local_gossiper().get_endpoint_downtime(ep) <= _max_hint_window_us;
if (!node_is_alive) {
manager_logger.trace("{} has been down for too long, not hinting", ep);
manager_logger.trace("can_hint_for: {} has been down for too long, not hinting", ep);
return false;
}


@@ -11,9 +11,11 @@
#include <boost/functional/hash.hpp>
#include <boost/icl/interval_map.hpp>
#include <fmt/ranges.h>
#include <ranges>
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/parallel_for_each.hh>
#include <seastar/core/loop.hh>
#include <seastar/core/on_internal_error.hh>
#include "system_keyspace.hh"
#include "cql3/untyped_result_set.hh"
@@ -1694,6 +1696,12 @@ future<> system_keyspace::peers_table_read_fixup() {
continue;
}
const auto host_id = row.get_as<utils::UUID>("host_id");
if (!host_id) {
slogger.error("Peer {} has null host_id in system.{}, the record is broken, removing it",
peer, system_keyspace::PEERS);
co_await remove_endpoint(gms::inet_address{peer});
continue;
}
const auto ts = row.get_as<int64_t>("ts");
const auto it = map.find(host_id);
if (it == map.end()) {
@@ -1757,8 +1765,15 @@ future<> system_keyspace::drop_truncation_rp_records() {
auto rs = co_await execute_cql(req);
bool any = false;
co_await coroutine::parallel_for_each(*rs, [&] (const cql3::untyped_result_set_row& row) -> future<> {
std::unordered_set<table_id> to_delete;
auto db = _qp.db();
auto max_concurrency = std::min(1024u, smp::count * 8);
co_await seastar::max_concurrent_for_each(*rs, max_concurrency, [&] (const cql3::untyped_result_set_row& row) -> future<> {
auto table_uuid = table_id(row.get_as<utils::UUID>("table_uuid"));
if (!db.try_find_table(table_uuid)) {
to_delete.emplace(table_uuid);
co_return;
}
auto shard = row.get_as<int32_t>("shard");
auto segment_id = row.get_as<int64_t>("segment_id");
@@ -1768,11 +1783,26 @@ future<> system_keyspace::drop_truncation_rp_records() {
co_await execute_cql(req);
}
});
if (!to_delete.empty()) {
// IN has a limit to how many values we can put into it.
for (auto&& chunk : to_delete | std::views::transform(&table_id::to_sstring) | std::views::chunk(100)) {
auto str = std::ranges::to<std::string>(chunk | std::views::join_with(','));
auto req = fmt::format("DELETE FROM system.{} WHERE table_uuid IN ({})", TRUNCATED, str);
co_await execute_cql(req);
}
any = true;
}
if (any) {
co_await force_blocking_flush(TRUNCATED);
}
}
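The hunk above batches the pruned table ids because CQL limits how many values an `IN` clause can carry, so the deletes are issued in chunks of 100 (the `std::views::chunk(100)` in the C++). A minimal Python sketch of the same chunking, with made-up ids; the table name `system.truncated` stands in for the `TRUNCATED` constant:

```python
from uuid import uuid4

def build_prune_statements(table_ids, chunk_size=100):
    """Build DELETE statements with at most chunk_size ids per IN clause."""
    ids = [str(t) for t in table_ids]
    statements = []
    for i in range(0, len(ids), chunk_size):
        chunk = ids[i:i + chunk_size]
        statements.append(
            "DELETE FROM system.truncated WHERE table_uuid IN ({})".format(",".join(chunk))
        )
    return statements

# 250 dropped tables -> 3 statements (100 + 100 + 50 ids)
stmts = build_prune_statements([uuid4() for _ in range(250)])
```

The chunk size is a trade-off between the number of round trips and the per-statement value limit; 100 is simply the value the hunk picked.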
future<> system_keyspace::remove_truncation_records(table_id id) {
auto req = format("DELETE FROM system.{} WHERE table_uuid = {}", TRUNCATED, id);
co_await execute_cql(req);
co_await force_blocking_flush(TRUNCATED);
}
future<> system_keyspace::save_truncation_record(const replica::column_family& cf, db_clock::time_point truncated_at, db::replay_position rp) {
sstring req = format("INSERT INTO system.{} (table_uuid, shard, position, segment_id, truncated_at) VALUES(?,?,?,?,?)", TRUNCATED);
co_await _qp.execute_internal(req, {cf.schema()->id().uuid(), int32_t(rp.shard_id()), int32_t(rp.pos), int64_t(rp.base_id()), truncated_at}, cql3::query_processor::cache_internal::yes);
@@ -2155,7 +2185,59 @@ future<> system_keyspace::update_peer_info(gms::inet_address ep, locator::host_i
slogger.debug("{}: values={}", query, values);
co_await _qp.execute_internal(query, db::consistency_level::ONE, values, cql3::query_processor::cache_internal::yes);
const auto guard = co_await get_units(_peers_cache_lock, 1);
try {
co_await _qp.execute_internal(query, db::consistency_level::ONE, values, cql3::query_processor::cache_internal::yes);
if (auto* cache = get_peers_cache()) {
cache->host_id_to_inet_ip[hid] = ep;
cache->inet_ip_to_host_id[ep] = hid;
}
} catch (...) {
_peers_cache = nullptr;
throw;
}
}
system_keyspace::peers_cache* system_keyspace::get_peers_cache() {
auto* cache = _peers_cache.get();
if (cache && (lowres_clock::now() > cache->expiration_time)) {
_peers_cache = nullptr;
return nullptr;
}
return cache;
}
future<lw_shared_ptr<const system_keyspace::peers_cache>> system_keyspace::get_or_load_peers_cache() {
const auto guard = co_await get_units(_peers_cache_lock, 1);
if (auto* cache = get_peers_cache()) {
co_return cache->shared_from_this();
}
auto cache = make_lw_shared<peers_cache>();
cache->inet_ip_to_host_id = co_await load_host_ids();
cache->host_id_to_inet_ip.reserve(cache->inet_ip_to_host_id.size());
for (const auto [ip, id]: cache->inet_ip_to_host_id) {
const auto [it, inserted] = cache->host_id_to_inet_ip.insert({id, ip});
if (!inserted) {
on_internal_error(slogger, ::format("duplicate IP for host_id {}, first IP {}, second IP {}",
id, it->second, ip));
}
}
cache->expiration_time = lowres_clock::now() + std::chrono::milliseconds(200);
_peers_cache = cache;
co_return std::move(cache);
}
future<std::optional<gms::inet_address>> system_keyspace::get_ip_from_peers_table(locator::host_id id) {
const auto cache = co_await get_or_load_peers_cache();
if (const auto it = cache->host_id_to_inet_ip.find(id); it != cache->host_id_to_inet_ip.end()) {
co_return it->second;
}
co_return std::nullopt;
}
future<system_keyspace::host_id_to_ip_map_t> system_keyspace::get_host_id_to_ip_map() {
const auto cache = co_await get_or_load_peers_cache();
co_return cache->host_id_to_inet_ip;
}
template <typename T>
@@ -2205,7 +2287,22 @@ future<> system_keyspace::update_schema_version(table_schema_version version) {
future<> system_keyspace::remove_endpoint(gms::inet_address ep) {
const sstring req = format("DELETE FROM system.{} WHERE peer = ?", PEERS);
slogger.debug("DELETE FROM system.{} WHERE peer = {}", PEERS, ep);
co_await execute_cql(req, ep.addr()).discard_result();
const auto guard = co_await get_units(_peers_cache_lock, 1);
try {
co_await execute_cql(req, ep.addr()).discard_result();
if (auto* cache = get_peers_cache()) {
const auto it = cache->inet_ip_to_host_id.find(ep);
if (it != cache->inet_ip_to_host_id.end()) {
const auto id = it->second;
cache->inet_ip_to_host_id.erase(it);
cache->host_id_to_inet_ip.erase(id);
}
}
} catch (...) {
_peers_cache = nullptr;
throw;
}
}
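The peers cache introduced above follows a common pattern: serve reads from an in-memory map until a short TTL (200 ms in the hunk) expires, rebuild the map under a lock, and drop the cache entirely when a write path fails so the next read reloads from the table. A synchronous Python sketch of that pattern, with a hypothetical loader standing in for `load_host_ids()`:

```python
import time

class TTLCache:
    def __init__(self, loader, ttl_seconds=0.2):
        self._loader = loader        # rebuilds the whole mapping
        self._ttl = ttl_seconds
        self._data = None
        self._expires_at = 0.0

    def get(self):
        # An expired cache is discarded and rebuilt, as in get_peers_cache().
        if self._data is None or time.monotonic() > self._expires_at:
            self._data = self._loader()
            self._expires_at = time.monotonic() + self._ttl
        return self._data

    def mutate(self, fn):
        # On failure, invalidate so the next get() reloads, mirroring the
        # try/catch blocks in update_peer_info() and remove_endpoint().
        try:
            if self._data is not None:
                fn(self._data)
        except Exception:
            self._data = None
            raise

cache = TTLCache(lambda: {"host-1": "10.0.0.1"})
assert cache.get() == {"host-1": "10.0.0.1"}
cache.mutate(lambda d: d.pop("host-1"))
```

The real code additionally guards the rebuild with a semaphore so concurrent readers do not load the table twice; that coordination is omitted in this single-threaded sketch.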
future<> system_keyspace::update_tokens(const std::unordered_set<dht::token>& tokens) {


@@ -143,6 +143,17 @@ class system_keyspace : public seastar::peering_sharded_service<system_keyspace>
// and this node crashes after adding a new IP but before removing the old one. The
// record with older timestamp is removed, the warning is written to the log.
future<> peers_table_read_fixup();
struct peers_cache: public enable_lw_shared_from_this<peers_cache> {
std::unordered_map<gms::inet_address, locator::host_id> inet_ip_to_host_id;
std::unordered_map<locator::host_id, gms::inet_address> host_id_to_inet_ip;
lowres_clock::time_point expiration_time;
};
lw_shared_ptr<peers_cache> _peers_cache;
semaphore _peers_cache_lock{1};
peers_cache* get_peers_cache();
future<lw_shared_ptr<const peers_cache>> get_or_load_peers_cache();
public:
static schema_ptr size_estimates();
public:
@@ -308,6 +319,12 @@ public:
future<> update_peer_info(gms::inet_address ep, locator::host_id hid, const peer_info& info);
// Return ip of the peers table entry with given host id
future<std::optional<gms::inet_address>> get_ip_from_peers_table(locator::host_id id);
using host_id_to_ip_map_t = std::unordered_map<locator::host_id, gms::inet_address>;
future<host_id_to_ip_map_t> get_host_id_to_ip_map();
future<> remove_endpoint(gms::inet_address ep);
// Saves the key-value pair into system.scylla_local table.
@@ -418,6 +435,7 @@ public:
future<> save_truncation_record(const replica::column_family&, db_clock::time_point truncated_at, db::replay_position);
future<replay_positions> get_truncated_positions(table_id);
future<> drop_truncation_rp_records();
future<> remove_truncation_records(table_id);
// Converts a `dht::token_range` object to the left-open integer range (x,y] form.
//


@@ -86,9 +86,9 @@ if __name__ == '__main__':
ethpciid = ''
if network_mode == 'dpdk':
dpdk_status = out('/opt/scylladb/scripts/dpdk-devbind.py --status')
match = re.search('if={} drv=(\S+)'.format(ifname), dpdk_status, flags=re.MULTILINE)
match = re.search(r'if={} drv=(\S+)'.format(ifname), dpdk_status, flags=re.MULTILINE)
ethdrv = match.group(1)
match = re.search('^(\\S+:\\S+:\\S+\.\\S+) [^\n]+ if={} '.format(ifname), dpdk_status, flags=re.MULTILINE)
match = re.search(r'^(\S+:\S+:\S+\.\S+) [^\n]+ if={} '.format(ifname), dpdk_status, flags=re.MULTILINE)
ethpciid = match.group(1)
if args.mode:
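The change above only adds the `r` prefix: `'\S'` is an invalid string escape that recent Pythons warn about (and may eventually reject), while `r'\S'` passes the backslash through to the regex engine unchanged. A small sketch against a made-up `dpdk-devbind` status line (not real tool output):

```python
import re

ifname = "eth0"
# Hypothetical line in the format the script parses.
dpdk_status = "0000:00:04.0 'Virtio network device' if=eth0 drv=virtio-pci unused="

# Driver name after the interface.
match = re.search(r'if={} drv=(\S+)'.format(ifname), dpdk_status, flags=re.MULTILINE)
ethdrv = match.group(1)    # 'virtio-pci'

# PCI id at the start of the line.
match = re.search(r'^(\S+:\S+:\S+\.\S+) [^\n]+ if={} '.format(ifname), dpdk_status, flags=re.MULTILINE)
ethpciid = match.group(1)  # '0000:00:04.0'
```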


@@ -18,7 +18,7 @@ Breaks: scylla-enterprise-conf (<< 2025.1.0~)
Package: %{product}-server
Architecture: any
Depends: ${misc:Depends}, %{product}-conf (= ${binary:Version}), %{product}-python3 (= ${binary:Version})
Depends: ${misc:Depends}, %{product}-conf (= ${binary:Version}), %{product}-python3 (= ${binary:Version}), procps
Replaces: %{product}-tools (<<5.5), scylla-enterprise-tools (<< 2024.2.0~), scylla-enterprise-server (<< 2025.1.0~)
Breaks: %{product}-tools (<<5.5), scylla-enterprise-tools (<< 2024.2.0~), scylla-enterprise-server (<< 2025.1.0~)
Description: Scylla database server binaries


@@ -88,7 +88,7 @@ bcp LICENSE-ScyllaDB-Source-Available.md /licenses/
run microdnf clean all
run microdnf --setopt=tsflags=nodocs -y update
run microdnf --setopt=tsflags=nodocs -y install hostname python3 python3-pip kmod
run microdnf --setopt=tsflags=nodocs -y install hostname kmod procps-ng python3 python3-pip
run microdnf clean all
run pip3 install --no-cache-dir --prefix /usr supervisor
run bash -ec "echo LANG=C.UTF-8 > /etc/locale.conf"


@@ -76,6 +76,7 @@ Group: Applications/Databases
Summary: The Scylla database server
Requires: %{product}-conf = %{version}-%{release}
Requires: %{product}-python3 = %{version}-%{release}
Requires: procps-ng
AutoReqProv: no
Provides: %{product}-tools:%{_bindir}/nodetool
Provides: %{product}-tools:%{_sysconfigdir}/bash_completion.d/nodetool-completion


@@ -2,10 +2,19 @@
"Linux Distributions": {
"Ubuntu": ["22.04", "24.04"],
"Debian": ["11"],
"Rocky / CentOS / RHEL": ["8", "9"],
"Rocky / CentOS / RHEL": ["8", "9", "10"],
"Amazon Linux": ["2023"]
},
"ScyllaDB Versions": [
{
"version": "ScyllaDB 2025.3",
"supported_OS": {
"Ubuntu": ["22.04", "24.04"],
"Debian": ["11"],
"Rocky / CentOS / RHEL": ["8", "9", "10"],
"Amazon Linux": ["2023"]
}
},
{
"version": "ScyllaDB 2025.2",
"supported_OS": {


@@ -172,4 +172,7 @@
/stable/upgrade/upgrade-opensource/upgrade-guide-from-4.5-to-4.6/metric-update-4.5-to-4.6.html: /stable/upgrade/index.html
# Divide API reference to smaller files
# /stable/reference/api-reference.html: /stable/reference/api/index.html
# Fixed typo in the file name
/stable/operating-scylla/nodetool-commands/enbleautocompaction.html: /stable/operating-scylla/nodetool-commands/enableautocompaction.html


@@ -481,7 +481,8 @@ Creating a new user-defined type is done using a ``CREATE TYPE`` statement defin
field_definition: `identifier` `cql_type`
A UDT has a name (``udt_name``), which is used to declare columns of that type and is a set of named and typed fields. The ``udt_name`` can be any
type, including collections or other UDTs. UDTs and collections inside collections must always be frozen (no matter which version of ScyllaDB you are using).
type, including collections or other UDTs.
Similar to collections, a UDT can be frozen or non-frozen. A frozen UDT is immutable and can only be updated as a whole. Nested UDTs or UDTs used in keys must always be frozen.
For example::
@@ -506,26 +507,15 @@ For example::
CREATE TABLE superheroes (
name frozen<full_name> PRIMARY KEY,
home frozen<address>
home address
);
.. note::
- Attempting to create an already existing type will result in an error unless the ``IF NOT EXISTS`` option is used. If it is used, the statement will be a no-op if the type already exists.
- A type is intrinsically bound to the keyspace in which it is created and can only be used in that keyspace. At creation, if the type name is prefixed by a keyspace name, it is created in that keyspace. Otherwise, it is created in the current keyspace.
- As of ScyllaDB Open Source 3.2, UDTs not inside collections do not have to be frozen, but in all versions prior to ScyllaDB Open Source 3.2, and in all ScyllaDB Enterprise versions, UDTs **must** be frozen.
A non-frozen UDT example with ScyllaDB Open Source 3.2 and higher::
CREATE TYPE ut (a int, b int);
CREATE TABLE cf (a int primary key, b ut);
Same UDT in versions prior::
CREATE TYPE ut (a int, b int);
CREATE TABLE cf (a int primary key, b frozen<ut>);
UDT literals
~~~~~~~~~~~~


@@ -10,7 +10,7 @@ This is a manual for `test.py`.
## Installation
To run `test.py`, Python 3.7 or higher is required.
To run `test.py`, Python 3.11 or higher is required.
`./install-dependencies.sh` should install all the required Python
modules. If `install-dependencies.sh` does not support your distribution,
please manually install all Python modules it lists with `pip`.
@@ -106,6 +106,19 @@ shed more light on this.
Build artefacts, such as test output and harness output, are stored
in `./testlog`. Scylla data files are stored in `/tmp`.
There are several test directories that are excluded from orchestration by `test.py`:
- test/boost
- test/raft
- test/ldap
- test/unit
This means that `test.py` will not run tests directly, but will delegate all work to `pytest`.
That's why all these directories do not have `suite.yaml` files.
Additionally, these directories do not follow the abstract `suite/testname` naming
convention, and instead use the `pytest` naming convention, i.e. to run a test you need to provide the path to the file
and optionally the test name, e.g. `test/boost/aggregate_fcts_test.cc::test_aggregate_avg`.
## How it works
On start, `test.py` invokes `ninja` to find out configured build modes. Then
@@ -176,11 +189,11 @@ Scylla (possibly started in debugger) using `cqlsh`.
The same unit test can be run in different seastar configurations, i.e. with
different command line arguments. The custom arguments can be set in
`custom_args` key of the `suite.yaml` file.
`custom_args` key of the `test_config.yaml` file.
Tests from boost suite are divided into test-cases. These are top-level
functions wrapped by `BOOST_AUTO_TEST_CASE`, `SEASTAR_TEST_CASE` or alike.
Boost tests support `suitename/testname::casename` selection described above.
Boost tests support `path/to/file_name.cc::casename` selection described above.
### Debugging unit tests
@@ -328,6 +341,19 @@ as it was at the beginning of the test, is considered "dirty".
Such clusters are not returned to the pool, but destroyed, and
the pool is replenished with a new cluster instead.
## Test metrics
The parameter `--gather-metrics` is used to gather per-test CPU/RAM usage from the cgroup, as well as overall system CPU/RAM
usage.
For that, an SQLite database is used to store the metrics in `testlog/sqlite.db`.
The database is created in the `testlog` directory and contains the following tables:
- `tests` - contains the list of tests that were executed with information about the test name, directory, architecture,
and mode
- `test_metrics` - contains the metrics for each test, such as memory peak usage, CPU usage, and duration
- `system_resource_metrics` - contains system CPU and memory utilization in percent during the whole run
- `cgroup_memory_metrics` - contains cgroup memory usage during the test run
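A hedged sketch of inspecting that database with the stdlib `sqlite3` module. The table names come from the list above; the column names and the sample row are assumptions for illustration, not the real schema:

```python
import sqlite3

# Use an in-memory DB standing in for testlog/sqlite.db; the real file
# appears in the testlog directory after a --gather-metrics run.
conn = sqlite3.connect(":memory:")
conn.execute(
    "CREATE TABLE test_metrics (test_name TEXT, memory_peak_kb INTEGER, "
    "cpu_usage_percent REAL, duration_s REAL)"  # assumed columns
)
conn.execute(
    "INSERT INTO test_metrics VALUES ('boost.aggregate_fcts_test', 51200, 73.5, 12.4)"
)

# Which test had the highest memory peak?
row = conn.execute(
    "SELECT test_name, memory_peak_kb FROM test_metrics "
    "ORDER BY memory_peak_kb DESC LIMIT 1"
).fetchone()
```

Against the real database, point `sqlite3.connect` at `testlog/sqlite.db` and check the actual schema first with `SELECT name FROM sqlite_master`.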
## Automation, CI, and Jenkins
If any of the tests fails, `test.py` returns a non-zero exit status.


@@ -0,0 +1,53 @@
===========================
Backup And Restore Overview
===========================
Backup and restore are critical components of data management, ensuring that your data is safe and can be recovered in case of loss or corruption. This document provides an overview of the backup and restore process, including best practices, tools, procedures, metrics, and more.
Process Overview
----------------
**The Backup process** is managed by ScyllaDB Manager as a whole.
The overview given here is for a single node; ScyllaDB Manager is responsible for orchestrating backups across the cluster.
For backup, a snapshot is created, and then the data is copied to a remote location - normally an S3 bucket, Google Cloud Storage, or a similar service.
**The Restore process** is also managed by ScyllaDB Manager, and it involves copying the data back from the remote location to an empty ScyllaDB node.
Restoring to a live cluster is not yet supported.
Backup Process
--------------
#. **Snapshot Creation**: A snapshot of the data is created on the ScyllaDB node.
This is a point-in-time copy of the data.
#. **Upload Data**: The snapshot data is transferred to a remote storage location,
such as an S3 bucket or Google Cloud Storage. You can upload data in two ways:
* **rclone** - the tool responsible for the upload is the Scylla Manager Agent
that runs on the node.
- It runs side by side with ScyllaDB and therefore may interfere
with ScyllaDB performance.
- It supports many cloud storage providers.
* **Native upload** - ScyllaDB itself is responsible for the upload.
- It takes ScyllaDB performance into consideration and does
not interfere with it.
- It supports only S3-compatible storage providers.
See the `ScyllaDB Manager backup documentation <https://manager.docs.scylladb.com/stable/backup/index.html>`_
for more details on how to configure the upload method.
#. **Native Configuration**:
* For `native` backup to work without interference to users' workload, it is
best to limit io-scheduling. See :ref:`stream_io_throughput_mb_per_sec <confprop_stream_io_throughput_mb_per_sec>` for details.
* For `native` backup to work, ScyllaDB node must have access to the S3 bucket.
See :ref:`Configuring Object Storage <object-storage-configuration>` for details.
Restore Process
---------------
The restore process is managed completely by ScyllaDB Manager.
No special configuration is needed.
Restore may be executed with rclone or natively, regardless of the backup method used.
See `ScyllaDB Manager restore documentation <https://manager.docs.scylladb.com/stable/restore/index.html>`_ for more details on how to restore data.


@@ -15,6 +15,7 @@ This document highlights ScyllaDB's key data modeling features.
Change Data Capture </features/cdc/index>
Workload Attributes </features/workload-attributes>
Workload Prioritization </features/workload-prioritization>
Backup and Restore </features/backup-and-restore>
.. panel-box::
:title: ScyllaDB Features
@@ -36,3 +37,5 @@ This document highlights ScyllaDB's key data modeling features.
state and the history of all changes made to tables in the database.
* :doc:`Workload Attributes </features/workload-attributes>` assigned to your workloads
specify how ScyllaDB will handle requests depending on the workload.
* :doc:`Backup and Restore </features/backup-and-restore>` allows you to create
backups of your data and restore it when needed.

View File

@@ -113,8 +113,8 @@ Pick a zone where Haswell CPUs are found. Local SSD performance offers, accordin
Image with NVMe disk interface is recommended.
(`More info <https://cloud.google.com/compute/docs/disks/local-ssd>`_)
Recommended instances types are `z3-highmem-highlssd <https://cloud.google.com/compute/docs/storage-optimized-machines#z3_machine_types>`_,
`n1-highmem <https://cloud.google.com/compute/docs/general-purpose-machines#n1_machines>`_, and `n2-highmem <https://cloud.google.com/compute/docs/general-purpose-machines#n2_machines>`_
Recommended instance types are `z3-highmem-highlssd and z3-highmem-standardlssd <https://cloud.google.com/compute/docs/storage-optimized-machines#z3_machine_types>`_,
`n1-highmem <https://cloud.google.com/compute/docs/general-purpose-machines#n1_machines>`_, and `n2-highmem <https://cloud.google.com/compute/docs/general-purpose-machines#n2_machines>`_.
.. list-table::
@@ -145,6 +145,39 @@ Recommended instances types are `z3-highmem-highlssd <https://cloud.google.com/c
- 44
- 352
- 18,000
* - z3-highmem-88-highlssd
- 88
- 704
- 36,000
.. list-table::
:widths: 30 20 20 30
:header-rows: 1
* - Model
- vCPU
- Mem (GB)
- Storage (GB)
* - z3-highmem-14-standardlssd
- 14
- 112
- 3,000
* - z3-highmem-22-standardlssd
- 22
- 176
- 6,000
* - z3-highmem-44-standardlssd
- 44
- 352
- 9,000
* - z3-highmem-88-standardlssd
- 88
- 704
- 18,000
* - z3-highmem-176-standardlssd
- 176
- 1,406
- 36,000
.. list-table::
:widths: 30 20 20 30

View File

@@ -30,7 +30,7 @@ Launching ScyllaDB on GCP
.. code-block:: console
gcloud compute instances create <name of new instance> --image <ScyllaDB image name> --image-project < ScyllaDB project name> --local-ssd interface=nvme --zone <GCP zone - optional> --machine-type=<machine type>
gcloud compute instances create <name of new instance> --image <ScyllaDB image name> --image-project <ScyllaDB project name> --local-ssd interface=nvme --zone=<GCP zone - optional> --machine-type=<machine type>
For example:

View File

@@ -83,7 +83,7 @@ Additional References
* `Jepsen and ScyllaDB: Putting Consistency to the Test blog post <https://www.scylladb.com/2020/12/23/jepsen-and-scylla-putting-consistency-to-the-test/>`_
* `Nauto: Achieving Consistency in an Eventually Consistent Environment blog post <https://www.scylladb.com/2020/02/20/nauto-achieving-consistency-in-an-eventually-consistent-environment/>`_
* `Consistency Levels documentation <https://docs.scylladb.com/stable/cql/consistency.html>`_
* `Consistency Levels documentation <https://docs.scylladb.com/manual/stable/cql/consistency.html>`_
* `High Availability lesson on ScyllaDB University <https://university.scylladb.com/courses/scylla-essentials-overview/lessons/high-availability/>`_
* `Lightweight Transactions lesson on ScyllaDB University <https://university.scylladb.com/courses/data-modeling/lessons/lightweight-transactions/>`_
* `Getting the Most out of Lightweight Transactions in ScyllaDB blog post <https://www.scylladb.com/2020/07/15/getting-the-most-out-of-lightweight-transactions-in-scylla/>`_

View File

@@ -26,6 +26,7 @@ Syntax
--table <table>
[--nowait]
[--scope <scope>]
[--sstables-file-list <file>]
<sstables>...
Example
@@ -51,6 +52,7 @@ Options
* ``--table`` - Name of the table to load SSTables into
* ``--nowait`` - Don't wait on the restore process
* ``--scope <scope>`` - Use specified load-and-stream scope
* ``--sstables-file-list <file>`` - Restore the SSTables listed in the given ``<file>``. The list should be newline-separated.
* ``<sstables>`` - Remainder of keys of the TOC (Table of Contents) components of SSTables to restore, relative to the specified prefix
The `scope` parameter describes the subset of cluster nodes where you want to load data:
@@ -60,6 +62,8 @@ The `scope` parameter describes the subset of cluster nodes where you want to lo
* `dc` - In the datacenter (DC) where the local node lives.
* `all` (default) - Everywhere across the cluster.
``--sstables-file-list <file>`` and ``<sstables>`` can be combined; ``nodetool restore`` will attempt to restore the combined list. Duplicates are *not* removed.
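For example, a sketch combining both sources (all names below are placeholders,
and other required connection options are omitted):

.. code-block:: console

   nodetool restore --keyspace ks --table cf --scope dc \
       --sstables-file-list /tmp/sstable-list.txt \
       sstables/me-1-big-TOC.txt sstables/me-2-big-TOC.txt

Here ``/tmp/sstable-list.txt`` contains one TOC key per line. Since duplicates
are not removed, avoid listing the same key both in the file and on the command
line.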
To fully restore a cluster, you should combine the ``scope`` parameter with the correct list of
SSTables to restore to each node.
On one extreme, one node is given all SSTables with the scope ``all``; on the other extreme, all

View File

@@ -14,9 +14,9 @@ Nodetool
nodetool-commands/cleanup
nodetool-commands/clearsnapshot
nodetool-commands/cluster/index
nodetool-commands/compact
nodetool-commands/compactionhistory
nodetool-commands/compactionstats
nodetool-commands/compact
nodetool-commands/decommission
nodetool-commands/describecluster
nodetool-commands/describering
@@ -25,13 +25,15 @@ Nodetool
nodetool-commands/disablebinary
nodetool-commands/disablegossip
nodetool-commands/drain
nodetool-commands/enbleautocompaction
nodetool-commands/enableautocompaction
nodetool-commands/enablebackup
nodetool-commands/enablebinary
nodetool-commands/enablegossip
nodetool-commands/flush
nodetool-commands/getcompactionthroughput
nodetool-commands/getendpoints
nodetool-commands/getsstables
nodetool-commands/getstreamthroughput
nodetool-commands/gettraceprobability
nodetool-commands/gossipinfo
nodetool-commands/help
@@ -46,25 +48,23 @@ Nodetool
nodetool-commands/restore
nodetool-commands/ring
nodetool-commands/scrub
nodetool-commands/settraceprobability
nodetool-commands/setcompactionthroughput
nodetool-commands/setlogginglevel
nodetool-commands/setstreamthroughput
nodetool-commands/settraceprobability
nodetool-commands/snapshot
nodetool-commands/sstableinfo
nodetool-commands/status
nodetool-commands/statusbackup
nodetool-commands/statusbinary
nodetool-commands/statusgossip
nodetool-commands/status
Nodetool stop compaction <nodetool-commands/stop>
nodetool-commands/tablestats
nodetool-commands/tasks/index
nodetool-commands/toppartitions
nodetool-commands/upgradesstables
nodetool-commands/viewbuildstatus
nodetool-commands/version
nodetool-commands/getcompactionthroughput
nodetool-commands/setcompactionthroughput
nodetool-commands/getstreamthroughput
nodetool-commands/setstreamthroughput
nodetool-commands/viewbuildstatus
The ``nodetool`` utility provides a simple command-line interface to the following exposed operations and attributes.
@@ -87,9 +87,9 @@ Operations that are not listed below are currently not available.
* :doc:`cleanup </operating-scylla/nodetool-commands/cleanup/>` - Triggers the immediate cleanup of keys no longer belonging to a node.
* :doc:`clearsnapshot </operating-scylla/nodetool-commands/clearsnapshot/>` - This command removes snapshots.
* :doc:`cluster <nodetool-commands/cluster/index>` - Run a cluster operation.
* :doc:`compact </operating-scylla/nodetool-commands/compact/>` - Force a (major) compaction on one or more column families.
* :doc:`compactionhistory </operating-scylla/nodetool-commands/compactionhistory/>` - Provides the history of compactions.
* :doc:`compactionstats </operating-scylla/nodetool-commands/compactionstats/>` - Print statistics on compactions.
* :doc:`compact </operating-scylla/nodetool-commands/compact/>` - Force a (major) compaction on one or more column families.
* :doc:`decommission </operating-scylla/nodetool-commands/decommission/>` - Decommission the node.
* :doc:`describecluster </operating-scylla/nodetool-commands/describecluster/>` - Print the name, snitch, partitioner and schema version of a cluster.
* :doc:`describering </operating-scylla/nodetool-commands/describering/>` :code:`<keyspace>` - Shows the partition ranges of a given keyspace.
@@ -98,14 +98,16 @@ Operations that are not listed below are currently not available.
* :doc:`disablebinary </operating-scylla/nodetool-commands/disablebinary/>` - Disable native transport (binary protocol).
* :doc:`disablegossip </operating-scylla/nodetool-commands/disablegossip/>` - Disable gossip (effectively marking the node down).
* :doc:`drain </operating-scylla/nodetool-commands/drain/>` - Drain the node (stop accepting writes and flush all column families).
* :doc:`enableautocompaction </operating-scylla/nodetool-commands/enbleautocompaction/>` - Enable automatic compaction of a keyspace or table.
* :doc:`enableautocompaction </operating-scylla/nodetool-commands/enableautocompaction/>` - Enable automatic compaction of a keyspace or table.
* :doc:`enablebackup </operating-scylla/nodetool-commands/enablebackup/>` - Enable incremental backup.
* :doc:`enablebinary </operating-scylla/nodetool-commands/enablebinary/>` - Re-enable native transport (binary protocol).
* :doc:`enablegossip </operating-scylla/nodetool-commands/enablegossip/>` - Re-enable gossip.
* :doc:`flush </operating-scylla/nodetool-commands/flush/>` - Flush one or more column families.
* :doc:`getcompactionthroughput </operating-scylla/nodetool-commands/getcompactionthroughput>` - Print the throughput cap for compaction in the system
* :doc:`getendpoints <nodetool-commands/getendpoints/>` :code:`<keyspace>` :code:`<table>` :code:`<key>` - Print the endpoints that own the key.
* **getlogginglevels** - Get the runtime logging levels.
* :doc:`getsstables </operating-scylla/nodetool-commands/getsstables>` - Print the sstable filenames that own the key.
* :doc:`getstreamthroughput </operating-scylla/nodetool-commands/getstreamthroughput>` - Print the throughput cap for SSTables streaming in the system
* :doc:`gettraceprobability </operating-scylla/nodetool-commands/gettraceprobability>` - Displays the current trace probability value. 0 is disabled, 1 is enabled.
* :doc:`gossipinfo </operating-scylla/nodetool-commands/gossipinfo/>` - Shows the gossip information for the cluster.
* :doc:`help </operating-scylla/nodetool-commands/help/>` - Display list of available nodetool commands.
@@ -118,28 +120,26 @@ Operations that are not listed below are currently not available.
* :doc:`refresh </operating-scylla/nodetool-commands/refresh/>` - Load newly placed SSTables to the system without restart
* :doc:`removenode </operating-scylla/nodetool-commands/removenode/>` - Remove node with the provided ID
* :doc:`repair <nodetool-commands/repair/>` :code:`<keyspace>` :code:`<table>` - Repair one or more vnode tables.
* :doc:`restore </operating-scylla/nodetool-commands/restore/>` - Load SSTables from a designated bucket in object store into a specified keyspace or table
* :doc:`resetlocalschema </operating-scylla/nodetool-commands/resetlocalschema/>` - Reset the node's local schema.
* :doc:`restore </operating-scylla/nodetool-commands/restore/>` - Load SSTables from a designated bucket in object store into a specified keyspace or table
* :doc:`ring <nodetool-commands/ring/>` - Displays the token ring information.
* :doc:`scrub </operating-scylla/nodetool-commands/scrub>` :code:`[-m mode] [--no-snapshot] <keyspace> [<table>...]` - Scrub the SSTable files in the specified keyspace or table(s)
* :doc:`setcompactionthroughput </operating-scylla/nodetool-commands/setcompactionthroughput>` - Set the throughput cap for compaction in the system
* :doc:`setlogginglevel </operating-scylla/nodetool-commands/setlogginglevel>` - Sets the logging level threshold for ScyllaDB classes
* :doc:`setstreamthroughput </operating-scylla/nodetool-commands/setstreamthroughput>` - Set the throughput cap for SSTables streaming in the system
* :doc:`settraceprobability </operating-scylla/nodetool-commands/settraceprobability/>` ``<value>`` - Sets the probability for tracing a request.
* :doc:`snapshot </operating-scylla/nodetool-commands/snapshot>` :code:`[-t tag] [-cf column_family] <keyspace>` - Take a snapshot of specified keyspaces or a snapshot of the specified table.
* :doc:`sstableinfo </operating-scylla/nodetool-commands/sstableinfo>` - Get information about sstables per keyspace/table.
* :doc:`status </operating-scylla/nodetool-commands/status/>` - Print cluster information.
* :doc:`statusbackup </operating-scylla/nodetool-commands/statusbackup/>` - Status of incremental backup.
* :doc:`statusbinary </operating-scylla/nodetool-commands/statusbinary/>` - Status of native transport (binary protocol).
* :doc:`statusgossip </operating-scylla/nodetool-commands/statusgossip/>` - Status of gossip.
* :doc:`status </operating-scylla/nodetool-commands/status/>` - Print cluster information.
* :doc:`stop </operating-scylla/nodetool-commands/stop/>` - Stop compaction operation.
* **tablehistograms** - see :doc:`cfhistograms <nodetool-commands/cfhistograms/>`
* :doc:`tablestats </operating-scylla/nodetool-commands/tablestats/>` - Provides in-depth diagnostics regarding a table.
* :doc:`tasks </operating-scylla/nodetool-commands/tasks/index>` - Manage tasks manager tasks.
* :doc:`toppartitions </operating-scylla/nodetool-commands/toppartitions/>` - Samples cluster writes and reads and reports the most active partitions in a specified table and time frame.
* :doc:`upgradesstables </operating-scylla/nodetool-commands/upgradesstables>` - Upgrades each table that is not running the latest ScyllaDB version, by rewriting SSTables.
* :doc:`viewbuildstatus </operating-scylla/nodetool-commands/viewbuildstatus/>` - Shows the progress of a materialized view build.
* :doc:`version </operating-scylla/nodetool-commands/version>` - Print the DB version.
* :doc:`getcompactionthroughput </operating-scylla/nodetool-commands/getcompactionthroughput>` - Print the throughput cap for compaction in the system
* :doc:`setcompactionthroughput </operating-scylla/nodetool-commands/setcompactionthroughput>` - Set the throughput cap for compaction in the system
* :doc:`getstreamthroughput </operating-scylla/nodetool-commands/getstreamthroughput>` - Print the throughput cap for SSTables streaming in the system
* :doc:`setstreamthroughput </operating-scylla/nodetool-commands/setstreamthroughput>` - Set the throughput cap for SSTables streaming in the system
* :doc:`viewbuildstatus </operating-scylla/nodetool-commands/viewbuildstatus/>` - Shows the progress of a materialized view build.

View File

@@ -148,16 +148,25 @@ will leave the recovery mode and remove the obsolete internal Raft data.
cqlsh> TRUNCATE TABLE system.discovery;
cqlsh> DELETE value FROM system.scylla_local WHERE key = 'raft_group0_id';
#. Add the ``recovery_leader`` property to the ``scylla.yaml`` file and set it to the host ID of the recovery leader on
**every live node**. Make sure the change is applied on all nodes by sending the ``SIGHUP`` signal to all ScyllaDB
processes.
#. Perform a :doc:`rolling restart </operating-scylla/procedures/config-change/rolling-restart/>` of all live nodes,
however, this time **the recovery leader must be restarted first**.
but:
* **restart the recovery leader first**,
* before restarting each node, add the ``recovery_leader`` property to its ``scylla.yaml`` file and set it to the
host ID of the recovery leader,
* after restarting each node, make sure it participated in Raft recovery; look for one of the following messages
in its logs:
.. code-block:: console
storage_service - Performing Raft-based recovery procedure with recovery leader <host ID of the recovery leader>/<IP address of the recovery leader>
storage_service - Raft-based recovery procedure - found group 0 with ID <ID of the new group 0; different from the one used in other steps>
After completing this step, Raft should be fully functional.
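For reference, the property set in the bullets above is a single line in each
node's ``scylla.yaml`` (the value shown is a placeholder for the actual host ID):

.. code-block:: yaml

   recovery_leader: <host ID of the recovery leader>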
#. Replace all dead nodes from the cluster using the
#. Replace all dead nodes in the cluster using the
:doc:`node replacement procedure </operating-scylla/procedures/cluster-management/replace-dead-node/>`.
.. note::

View File

@@ -4,7 +4,7 @@ Upgrade ScyllaDB
.. toctree::
ScyllaDB 2025.1 to ScyllaDB 2025.2 <upgrade-guide-from-2025.1-to-2025.2/index>
ScyllaDB 2025.2 to ScyllaDB 2025.3 <upgrade-guide-from-2025.2-to-2025.3/index>
ScyllaDB Image <ami-upgrade>

View File

@@ -1,13 +0,0 @@
==========================================================
Upgrade - ScyllaDB 2025.1 to ScyllaDB 2025.2
==========================================================
.. toctree::
:maxdepth: 2
:hidden:
Upgrade ScyllaDB <upgrade-guide-from-2025.1-to-2025.2>
Metrics Update <metric-update-2025.1-to-2025.2>
* :doc:`Upgrade from ScyllaDB 2025.1.x to ScyllaDB 2025.2.y <upgrade-guide-from-2025.1-to-2025.2>`
* :doc:`Metrics Update Between 2025.1 and 2025.2 <metric-update-2025.1-to-2025.2>`

View File

@@ -1,61 +0,0 @@
.. |SRC_VERSION| replace:: 2025.1
.. |NEW_VERSION| replace:: 2025.2
Metrics Update Between |SRC_VERSION| and |NEW_VERSION|
================================================================
.. toctree::
:maxdepth: 2
:hidden:
ScyllaDB |NEW_VERSION| Dashboards are available as part of the latest |mon_root|.
New Metrics
------------
The following metrics are new in ScyllaDB |NEW_VERSION| compared to |SRC_VERSION|:
.. list-table::
:widths: 25 150
:header-rows: 1
* - Metric
- Description
* - scylla_alternator_batch_item_count_histogram
- A histogram of the number of items in a batch request.
* - scylla_database_total_view_updates_failed_pairing
- Total number of view updates for which we failed base/view pairing.
* - scylla_group_name_cross_rack_collocations
- The number of co-locating migrations that move replica across racks.
* - scylla_network_bytes_received
- The number of bytes received from network sockets.
* - scylla_network_bytes_sent
- The number of bytes written to network sockets.
* - scylla_reactor_awake_time_ms_total
- Total reactor awake time (wall_clock).
* - scylla_reactor_cpu_used_time_ms
- Total reactor thread CPU time (from CLOCK_THREAD_CPUTIME).
* - scylla_reactor_sleep_time_ms_total
- Total reactor sleep time (wall clock).
* - scylla_sstable_compression_dicts_total_live_memory_bytes
- Total amount of memory consumed by SSTable compression dictionaries in RAM.
* - scylla_transport_connections_blocked
- Holds an incrementing counter with the CQL connections that were blocked
before being processed due to the threshold configured via
uninitialized_connections_semaphore_cpu_concurrency. Blocks are normal
when we have multiple connections initialized at once. If connections are
timing out and this value is high, it indicates either a connection storm
or unusually slow processing.
* - scylla_transport_connections_shed
- Holds an incrementing counter with the CQL connections that were shed
due to concurrency semaphore timeout (threshold configured via
uninitialized_connections_semaphore_cpu_concurrency). This typically
happens during a connection storm.

View File

@@ -0,0 +1,13 @@
==========================================================
Upgrade - ScyllaDB 2025.2 to ScyllaDB 2025.3
==========================================================
.. toctree::
:maxdepth: 2
:hidden:
Upgrade ScyllaDB <upgrade-guide-from-2025.2-to-2025.3>
Metrics Update <metric-update-2025.2-to-2025.3>
* :doc:`Upgrade from ScyllaDB 2025.2.x to ScyllaDB 2025.3.y <upgrade-guide-from-2025.2-to-2025.3>`
* :doc:`Metrics Update Between 2025.2 and 2025.3 <metric-update-2025.2-to-2025.3>`

View File

@@ -0,0 +1,95 @@
.. |SRC_VERSION| replace:: 2025.2
.. |NEW_VERSION| replace:: 2025.3
================================================================
Metrics Update Between |SRC_VERSION| and |NEW_VERSION|
================================================================
.. toctree::
:maxdepth: 2
:hidden:
ScyllaDB |NEW_VERSION| Dashboards are available as part of the latest |mon_root|.
New Metrics
------------
The following metrics are new in ScyllaDB |NEW_VERSION| compared to |SRC_VERSION|.
Alternator Per-table Metrics
===================================
.. list-table::
:widths: 25 150
:header-rows: 1
* - Metric
- Description
* - scylla_alternator_table_batch_item_count
- The total number of items processed across all batches.
* - scylla_alternator_table_batch_item_count_histogram
- A histogram of the number of items in a batch request.
* - scylla_alternator_table_filtered_rows_dropped_total
- The number of rows read and dropped during filtering operations.
* - scylla_alternator_table_filtered_rows_matched_total
- The number of rows read and matched during filtering operations.
* - scylla_alternator_table_filtered_rows_read_total
- The number of rows read during filtering operations.
* - scylla_alternator_table_op_latency
- A latency histogram of an operation via Alternator API.
* - scylla_alternator_table_op_latency_summary
- A latency summary of an operation via Alternator API.
* - scylla_alternator_table_operation
- The number of operations via Alternator API.
* - scylla_alternator_table_rcu_total
- The total number of consumed read units.
* - scylla_alternator_table_reads_before_write
- The number of performed read-before-write operations.
* - scylla_alternator_table_requests_blocked_memory
- Counts the number of requests blocked due to memory pressure.
* - scylla_alternator_table_requests_shed
- Counts the number of requests shed due to overload.
* - scylla_alternator_table_shard_bounce_for_lwt
- The number of writes that had to be bounced from this shard because of LWT requirements.
* - scylla_alternator_table_total_operations
- The number of total operations via Alternator API.
* - scylla_alternator_table_unsupported_operations
- The number of unsupported operations via Alternator API.
* - scylla_alternator_table_wcu_total
- The total number of consumed write units.
* - scylla_alternator_table_write_using_lwt
- The number of writes that used LWT.
Other Metrics
===============
.. list-table::
:widths: 25 150
:header-rows: 1
* - Metric
- Description
* - scylla_batchlog_manager_total_write_replay_attempts
- Counts write operations issued in a batchlog replay flow.
A high value of this metric indicates that there is a long batch replay list.
* - scylla_corrupt_data_entries_reported
- Counts the number of corrupt data instances reported to the corrupt data handler.
A non-zero value indicates that the database suffered data corruption.
* - scylla_memory_oversized_allocs
- The total count of oversized memory allocations.
* - scylla_reactor_internal_errors
- The total number of internal errors (subset of cpp_exceptions) that usually
indicate a malfunction in the code
* - scylla_stall_detector_io_threaded_fallbacks
- The total number of io-threaded-fallbacks operations.
Removed Metrics
---------------------
The following metrics have been removed in 2025.3:
* scylla_cql_authorized_prepared_statements_cache_evictions
* scylla_lsa_large_objects_total_space_bytes
* scylla_lsa_small_objects_total_space_bytes
* scylla_lsa_small_objects_used_space_bytes

View File

@@ -1,13 +1,13 @@
.. |SCYLLA_NAME| replace:: ScyllaDB
.. |SRC_VERSION| replace:: 2025.1
.. |NEW_VERSION| replace:: 2025.2
.. |SRC_VERSION| replace:: 2025.2
.. |NEW_VERSION| replace:: 2025.3
.. |ROLLBACK| replace:: rollback
.. _ROLLBACK: ./#rollback-procedure
.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - ScyllaDB 2025.1 to 2025.2
.. _SCYLLA_METRICS: ../metric-update-2025.1-to-2025.2
.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - ScyllaDB 2025.2 to 2025.3
.. _SCYLLA_METRICS: ../metric-update-2025.2-to-2025.3
=======================================================================================
Upgrade from |SCYLLA_NAME| |SRC_VERSION| to |SCYLLA_NAME| |NEW_VERSION|
@@ -44,14 +44,6 @@ We recommend upgrading the Monitoring Stack to the latest version.
See the ScyllaDB Release Notes for the latest updates. The Release Notes are published
at the `ScyllaDB Community Forum <https://forum.scylladb.com/>`_.
.. note::
If you previously upgraded from 2024.x to 2025.1 without enabling consistent
topology updates, ensure you enable the feature before you upgrade to 2025.2.
For instructions, see
`Enable Consistent Topology Updates <https://docs.scylladb.com/manual/branch-2025.1/upgrade/upgrade-guides/upgrade-guide-from-2024.x-to-2025.1/enable-consistent-topology.html>`_
in the upgrade guide for version 2025.1.
Upgrade Procedure
=================
@@ -158,7 +150,7 @@ You should take note of the current version in case you want to |ROLLBACK|_ the
.. code-block:: console
sudo wget -O /etc/apt/sources.list.d/scylla.list https://downloads.scylladb.com/deb/debian/scylla-2025.2.list
sudo wget -O /etc/apt/sources.list.d/scylla.list https://downloads.scylladb.com/deb/debian/scylla-2025.3.list
#. Install the new ScyllaDB version:
@@ -176,7 +168,7 @@ You should take note of the current version in case you want to |ROLLBACK|_ the
.. code-block:: console
sudo curl -o /etc/yum.repos.d/scylla.repo -L https://downloads.scylladb.com/rpm/centos/scylla-2025.2.repo
sudo curl -o /etc/yum.repos.d/scylla.repo -L https://downloads.scylladb.com/rpm/centos/scylla-2025.3.repo
#. Install the new ScyllaDB version:

View File

@@ -16,42 +16,99 @@ ScyllaDB CQL Drivers
ScyllaDB Drivers
-----------------
The following ScyllaDB drivers are available:
* :doc:`Python Driver </using-scylla/drivers/cql-drivers/scylla-python-driver>`
* :doc:`Java Driver </using-scylla/drivers/cql-drivers/scylla-java-driver>`
* :doc:`Go Driver </using-scylla/drivers/cql-drivers/scylla-go-driver>`
* :doc:`Go Extension </using-scylla/drivers/cql-drivers/scylla-gocqlx-driver>`
* :doc:`C++ Driver </using-scylla/drivers/cql-drivers/scylla-cpp-driver>`
* `CPP-over-Rust Driver <https://github.com/scylladb/cpp-rust-driver>`_
* :doc:`Rust Driver </using-scylla/drivers/cql-drivers/scylla-rust-driver>`
We recommend using ScyllaDB drivers. All ScyllaDB drivers are shard-aware and provide additional
benefits over third-party drivers.
ScyllaDB supports the CQL binary protocol version 3, so any Apache Cassandra/CQL driver that implements
the same version works with ScyllaDB.
The following table lists the available ScyllaDB drivers, specifying which support
`ScyllaDB Cloud Serverless <https://cloud.docs.scylladb.com/stable/serverless/index.html>`_
or include a library for :doc:`CDC </features/cdc/cdc-intro>`.
CDC Integration with ScyllaDB Drivers
-------------------------------------------
The following table specifies which ScyllaDB drivers include a library for
:doc:`CDC </features/cdc/cdc-intro>`.
.. list-table::
:widths: 30 35 35
:widths: 40 60
:header-rows: 1
* -
- ScyllaDB Driver
* - ScyllaDB Driver
- CDC Connector
* - :doc:`Python</using-scylla/drivers/cql-drivers/scylla-python-driver>`
- |v|
* - :doc:`Python </using-scylla/drivers/cql-drivers/scylla-python-driver>`
- |x|
* - :doc:`Java </using-scylla/drivers/cql-drivers/scylla-java-driver>`
- |v|
- |v|
* - :doc:`Go </using-scylla/drivers/cql-drivers/scylla-go-driver>`
- |v|
- |v|
* - :doc:`Go Extension </using-scylla/drivers/cql-drivers/scylla-gocqlx-driver>`
- |v|
- |x|
* - :doc:`C++ </using-scylla/drivers/cql-drivers/scylla-cpp-driver>`
- |v|
- |x|
* - `CPP-over-Rust Driver <https://github.com/scylladb/cpp-rust-driver>`_
- |x|
* - :doc:`Rust </using-scylla/drivers/cql-drivers/scylla-rust-driver>`
- |v|
- |v|
Support for Tablets
-------------------------
The following table specifies which ScyllaDB drivers support
:doc:`tablets </architecture/tablets>` and since which version.
.. list-table::
:widths: 30 35 35
:header-rows: 1
* - ScyllaDB Driver
- Support for Tablets
- Since Version
* - :doc:`Python </using-scylla/drivers/cql-drivers/scylla-python-driver>`
- |v|
- 3.26.5
* - :doc:`Java </using-scylla/drivers/cql-drivers/scylla-java-driver>`
- |v|
- 4.18.0 (Java Driver 4.x)
3.11.5.2 (Java Driver 3.x)
* - :doc:`Go </using-scylla/drivers/cql-drivers/scylla-go-driver>`
- |v|
- 1.13.0
* - :doc:`Go Extension </using-scylla/drivers/cql-drivers/scylla-gocqlx-driver>`
- |x|
- N/A
* - :doc:`C++ </using-scylla/drivers/cql-drivers/scylla-cpp-driver>`
- |x|
- N/A
* - `CPP-over-Rust Driver <https://github.com/scylladb/cpp-rust-driver>`_
- |v|
- All versions
* - :doc:`Rust </using-scylla/drivers/cql-drivers/scylla-rust-driver>`
- |v|
- 0.13.0
Driver Support Policy
-------------------------------
We support the **two most recent minor releases** of our drivers.
* We test and validate the latest two minor versions.
* We typically patch only the latest minor release.
We recommend staying up to date with the latest supported versions to receive
updates and fixes.
At a minimum, upgrade your driver when upgrading to a new ScyllaDB version
to ensure compatibility between the driver and the database.
Third-party Drivers
----------------------

View File

@@ -701,5 +701,97 @@ std::unique_ptr<data_sink_impl> make_encrypted_sink(data_sink sink, shared_ptr<s
return std::make_unique<encrypted_data_sink>(std::move(sink), std::move(k));
}
class encrypted_data_source : public data_source_impl, public block_encryption_base {
input_stream<char> _input;
temporary_buffer<char> _next;
size_t _current_position = 0;
size_t _skip = 0;
public:
encrypted_data_source(data_source source, shared_ptr<symmetric_key> k)
: block_encryption_base(std::move(k))
, _input(std::move(source))
{}
future<temporary_buffer<char>> get() override {
// First, get as much as we can get now (or the remainder of previous call)
auto buf1 = _next.empty()
? co_await _input.read()
: std::exchange(_next, {})
;
// eof?
if (buf1.empty()) {
co_return buf1;
}
// now we need one page more to be able to save one for next lap
auto fill_size = align_up(buf1.size(), block_size) + block_size - buf1.size();
auto buf2 = co_await _input.read_exactly(fill_size);
temporary_buffer<char> output(buf1.size() + buf2.size());
// we copy data even for the part we will cache. this to
// fix block alignment of the resulting shared buffer.
std::copy(buf1.begin(), buf1.end(), output.get_write());
std::copy(buf2.begin(), buf2.end(), output.get_write() + buf1.size());
// we need to keep one page buffered (beyond the input stream buffer - would be neat
// to share it), to be able to detect actual eof stream size. We always need
// at least _two_ pages of data to process, to be able to handle the case where
// actual size is <aligned block size> - <less than key block size>.
// I.e. stream size = 8180, encrypted data size will be 8192, and data stream
// will be 8196. So we need to make sure buf1 == [4096-8192], and buf2 == [8192-8196].
if (is_aligned(output.size(), block_size) && output.size() >= 2*block_size) {
_next = output.share(output.size() - block_size, block_size);
output.trim(output.size() - block_size);
}
const size_t key_block_size = _key->block_size();
// decrypt all blocks we have to return. might include the last, partial block
for (size_t offset = 0; offset < output.size(); offset += block_size, _current_position += block_size) {
auto iv = iv_for(_current_position);
auto rem = std::min(block_size, output.size() - offset);
_key->transform_unpadded(mode::decrypt, output.get() + offset, align_down(rem, key_block_size), output.get_write() + offset, iv.data());
}
// now, if the output buffer is not aligned, we are at eof, and
// also need to trim result.
if (!is_aligned(output.size(), key_block_size)) {
output.trim(output.size() - std::min(output.size(), key_block_size));
}
assert(is_aligned(_current_position, block_size));
// finally trim front to handle any skip remainders
output.trim_front(std::min(std::exchange(_skip, 0), output.size()));
co_return output;
}
future<temporary_buffer<char>> skip(uint64_t n) override {
if (n >= block_size) {
// since we only give back data aligned to block_size chunks,
// a client would only ever skip from a block boundary.
auto to_skip = align_down(n, block_size);
assert(is_aligned(_next.size(), block_size));
co_await _input.skip(to_skip - _next.size());
n -= to_skip;
_current_position += to_skip;
_next = {};
}
_skip = n;
co_return temporary_buffer<char>{};
}
future<> close() override {
return _input.close();
}
};
std::unique_ptr<data_source_impl> make_encrypted_source(data_source source, shared_ptr<symmetric_key> k) {
return std::make_unique<encrypted_data_source>(std::move(source), std::move(k));
}
}
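The decryption and skip paths above lean heavily on power-of-two alignment arithmetic (`align_down`, `align_up`, `is_aligned`). A minimal standalone sketch of those helpers — names mirror the diff, but this is a reimplementation, not Scylla's actual utility header:

```cpp
#include <cassert>
#include <cstddef>

// Sketch of the power-of-two alignment helpers the encrypted data
// source relies on. 'a' is assumed to be a power of two.
constexpr std::size_t align_down(std::size_t v, std::size_t a) {
    return v & ~(a - 1);
}
constexpr std::size_t align_up(std::size_t v, std::size_t a) {
    return (v + a - 1) & ~(a - 1);
}
constexpr bool is_aligned(std::size_t v, std::size_t a) {
    return (v & (a - 1)) == 0;
}
```

With the numbers from the comment above (stream size 8180, 16-byte key blocks, 4096-byte pages): `align_up(8180, 16)` gives 8192, and `align_down(8196, 4096)` gives 8192.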

View File

@@ -25,4 +25,6 @@ shared_ptr<file_impl> make_delayed_encrypted_file(file, size_t, get_key_func);
std::unique_ptr<data_sink_impl> make_encrypted_sink(data_sink, ::shared_ptr<symmetric_key>);
std::unique_ptr<data_source_impl> make_encrypted_source(data_source source, shared_ptr<symmetric_key> k);
}

View File

@@ -472,6 +472,14 @@ public:
for (auto&& [id, h] : _per_thread_kmip_host_cache[this_shard_id()]) {
co_await h->disconnect();
}
static auto stop_all = [](auto&& cache) -> future<> {
for (auto& [k, host] : cache) {
co_await host->stop();
}
};
co_await stop_all(_per_thread_kms_host_cache[this_shard_id()]);
co_await stop_all(_per_thread_gcp_host_cache[this_shard_id()]);
_per_thread_provider_cache[this_shard_id()].clear();
_per_thread_system_key_cache[this_shard_id()].clear();
_per_thread_kmip_host_cache[this_shard_id()].clear();
@@ -676,6 +684,33 @@ public:
return res;
}
std::tuple<opt_bytes, shared_ptr<encryption_schema_extension>> get_encryption_schema_extension(const sstables::sstable& sst,
sstables::component_type type) const {
const auto& sc = sst.get_shared_components();
if (!sc.scylla_metadata) {
return {};
}
const auto* ext_attr = sc.scylla_metadata->get_extension_attributes();
if (!ext_attr) {
return {};
}
bool ok = ext_attr->map.contains(encryption_attribute_ds);
if (ok && type != sstables::component_type::Data) {
ok = (ser::deserialize_from_buffer(ext_attr->map.at(encrypted_components_attribute_ds).value, std::type_identity<uint32_t>{}, 0) & (1 << static_cast<int>(type))) > 0;
}
if (!ok) {
return {};
}
auto esx = encryption_schema_extension::create(*_ctxt, ext_attr->map.at(encryption_attribute_ds).value);
opt_bytes id;
if (ext_attr->map.contains(key_id_attribute_ds)) {
id = ext_attr->map.at(key_id_attribute_ds).value;
}
return {std::move(id), std::move(esx)};
}
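The component check factored out above boils down to: the Data component is considered encrypted whenever the encryption attribute is present at all, while every other component must have its bit set in the stored mask. A sketch of that rule, with a hypothetical `component` enum standing in for `sstables::component_type`:

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical stand-in for sstables::component_type; the real enum
// lives in Scylla's sstables code.
enum class component { Data = 0, Index = 1, Statistics = 2 };

// Mirrors get_encryption_schema_extension(): Data is encrypted whenever
// the encryption attribute exists; other components consult the bitmask
// (earlier builds only encrypted the Data component).
bool component_encrypted(uint32_t mask, component c) {
    if (c == component::Data) {
        return true;
    }
    return (mask & (1u << static_cast<int>(c))) != 0;
}
```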
future<file> wrap_file(const sstables::sstable& sst, sstables::component_type type, file f, open_flags flags) override {
switch (type) {
case sstables::component_type::Scylla:
@@ -688,44 +723,21 @@ public:
if (flags == open_flags::ro) {
// open existing. check read opts.
auto& sc = sst.get_shared_components();
if (sc.scylla_metadata) {
auto* exta = sc.scylla_metadata->get_extension_attributes();
if (exta) {
auto i = exta->map.find(encryption_attribute_ds);
// note: earlier builds of the encryption extension would only encrypt the data component,
// so if we are opening old sstables we need to check whether this component is actually
// encrypted. We use a bitmask attribute for this.
auto [id, esx] = get_encryption_schema_extension(sst, type);
if (esx) {
if (esx->should_delay_read(id)) {
logg.debug("Encrypted sstable component {} using delayed opening {} (id: {})", sst.component_basename(type), *esx, id);
bool ok = i != exta->map.end();
if (ok && type != sstables::component_type::Data) {
ok = exta->map.count(encrypted_components_attribute_ds) &&
(ser::deserialize_from_buffer(exta->map.at(encrypted_components_attribute_ds).value, std::type_identity<uint32_t>{}, 0) & (1 << int(type)));
}
if (ok) {
auto esx = encryption_schema_extension::create(*_ctxt, i->second.value);
opt_bytes id;
if (exta->map.count(key_id_attribute_ds)) {
id = exta->map.at(key_id_attribute_ds).value;
}
if (esx->should_delay_read(id)) {
logg.debug("Encrypted sstable component {} using delayed opening {} (id: {})", sst.component_basename(type), *esx, id);
co_return make_delayed_encrypted_file(f, esx->key_block_size(), [esx, comp = sst.component_basename(type), id = std::move(id)] {
logg.trace("Delayed component {} using {} (id: {}) resolve", comp, *esx, id);
return esx->key_for_read(id);
});
}
logg.debug("Open encrypted sstable component {} using {} (id: {})", sst.component_basename(type), *esx, id);
auto k = co_await esx->key_for_read(std::move(id));
co_return make_encrypted_file(f, std::move(k));
}
co_return make_delayed_encrypted_file(f, esx->key_block_size(), [esx, comp = sst.component_basename(type), id = std::move(id)] {
logg.trace("Delayed component {} using {} (id: {}) resolve", comp, *esx, id);
return esx->key_for_read(id);
});
}
logg.debug("Open encrypted sstable component {} using {} (id: {})", sst.component_basename(type), *esx, id);
auto k = co_await esx->key_for_read(std::move(id));
co_return make_encrypted_file(f, std::move(k));
}
} else {
if (co_await wrap_writeonly(sst, type, [&f](shared_ptr<symmetric_key> k) { f = make_encrypted_file(std::move(f), std::move(k)); })) {
@@ -823,6 +835,36 @@ public:
});
co_return sink;
}
future<data_source> wrap_source(const sstables::sstable& sst,
sstables::component_type type,
sstables::data_source_creator_fn data_source_creator,
uint64_t offset,
uint64_t len) override {
switch (type) {
case sstables::component_type::Scylla:
case sstables::component_type::TemporaryTOC:
case sstables::component_type::TOC:
co_return data_source_creator(offset, len);
case sstables::component_type::CompressionInfo:
case sstables::component_type::CRC:
case sstables::component_type::Data:
case sstables::component_type::Digest:
case sstables::component_type::Filter:
case sstables::component_type::Index:
case sstables::component_type::Statistics:
case sstables::component_type::Summary:
case sstables::component_type::TemporaryStatistics:
case sstables::component_type::Unknown:
auto [id, esx] = get_encryption_schema_extension(sst, type);
if (esx) {
auto key = co_await esx->key_for_read(std::move(id));
auto block_size = key->block_size();
co_return data_source(make_encrypted_source(data_source_creator(align_down(offset, block_size), align_up(len, block_size)), std::move(key)));
}
co_return data_source_creator(offset, len);
}
}
};
std::string encryption_provider(const sstables::sstable& sst) {

View File

@@ -97,6 +97,7 @@ public:
~impl() = default;
future<> init();
future<> stop();
const host_options& options() const {
return _options;
}
@@ -477,13 +478,14 @@ encryption::gcp_host::impl::get_default_credentials() {
}
}
{
auto home = std::getenv("HOME");
if (home) {
std::string well_known_file;
auto env_path = std::getenv("CLOUDSDK_CONFIG");
if (env_path) {
well_known_file = fmt::format("~/{}/{}", env_path, WELL_KNOWN_CREDENTIALS_FILE);
well_known_file = fmt::format("{}/{}/{}", home, env_path, WELL_KNOWN_CREDENTIALS_FILE);
} else {
well_known_file = fmt::format("~/.config/{}/{}", CLOUDSDK_CONFIG_DIRECTORY, WELL_KNOWN_CREDENTIALS_FILE);
well_known_file = fmt::format("{}/.config/{}/{}", home, CLOUDSDK_CONFIG_DIRECTORY, WELL_KNOWN_CREDENTIALS_FILE);
}
if (co_await seastar::file_exists(well_known_file)) {
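The bug fixed in this hunk is that `~` was passed literally in the path: the shell expands `~`, but `stat()`/`open()` do not, so `$HOME` must be substituted explicitly. A sketch of the fixed construction (the gcloud file and directory names are the usual defaults and are assumed here):

```cpp
#include <cassert>
#include <string>

// Mirrors the fixed path building: substitute $HOME explicitly instead
// of emitting a literal '~' that the filesystem API won't expand.
std::string well_known_credentials_path(const std::string& home, const char* cloudsdk_config) {
    const std::string file = "application_default_credentials.json"; // assumed gcloud default
    if (cloudsdk_config) {
        return home + "/" + cloudsdk_config + "/" + file;
    }
    return home + "/.config/gcloud/" + file;
}
```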
@@ -671,7 +673,7 @@ encryption::gcp_host::impl::get_access_token(const google_credentials& creds, co
{ "client_id", c.client_id },
{ "client_secret", c.client_secret },
{ "refresh_token", c.refresh_token },
{ "grant_type", "grant_type" },
{ "grant_type", "refresh_token" },
}), "", httpd::operation_type::POST);
co_return access_token{ json };
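The fix here is the OAuth2 grant type: per RFC 6749 §6, refreshing an access token requires `grant_type=refresh_token`, not the literal string `"grant_type"`. A sketch of the corrected form body (field names follow the RFC; real code must also URL-encode values):

```cpp
#include <cassert>
#include <string>
#include <utility>
#include <vector>

// Builds the refresh-grant form body with the corrected grant_type value.
std::string refresh_form(const std::string& id, const std::string& secret, const std::string& token) {
    std::vector<std::pair<std::string, std::string>> fields = {
        {"client_id", id}, {"client_secret", secret},
        {"refresh_token", token}, {"grant_type", "refresh_token"},
    };
    std::string body;
    for (auto& [k, v] : fields) {
        if (!body.empty()) body += '&';
        body += k + "=" + v; // note: values are not URL-encoded in this sketch
    }
    return body;
}
```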
@@ -827,6 +829,11 @@ future<> encryption::gcp_host::impl::init() {
_initialized = true;
}
future<> encryption::gcp_host::impl::stop() {
co_await _attr_cache.stop();
co_await _id_cache.stop();
}
std::tuple<std::string, std::string> encryption::gcp_host::impl::parse_key(std::string_view spec) {
auto i = spec.find_last_of('/');
if (i == std::string_view::npos) {
@@ -989,6 +996,10 @@ future<> encryption::gcp_host::init() {
return _impl->init();
}
future<> encryption::gcp_host::stop() {
return _impl->stop();
}
const encryption::gcp_host::host_options& encryption::gcp_host::options() const {
return _impl->options();
}

View File

@@ -65,6 +65,8 @@ public:
~gcp_host();
future<> init();
future<> stop();
const host_options& options() const;
struct option_override : public t_credentials_source<std::optional<std::string>> {

View File

@@ -724,9 +724,11 @@ future<> kmip_host::impl::connect() {
}
future<> kmip_host::impl::disconnect() {
return do_for_each(_options.hosts, [this](const sstring& host) {
co_await do_for_each(_options.hosts, [this](const sstring& host) {
return clear_connections(host);
});
co_await _attr_cache.stop();
co_await _id_cache.stop();
}
static unsigned from_str(unsigned (*f)(char*, int, int*), const sstring& s, const sstring& what) {

View File

@@ -154,6 +154,8 @@ public:
~impl() = default;
future<> init();
future<> stop();
const host_options& options() const {
return _options;
}
@@ -826,6 +828,11 @@ future<> encryption::kms_host::impl::init() {
_initialized = true;
}
future<> encryption::kms_host::impl::stop() {
co_await _attr_cache.stop();
co_await _id_cache.stop();
}
future<encryption::kms_host::impl::key_and_id_type> encryption::kms_host::impl::create_key(const attr_cache_key& k) {
auto& master_key = k.master_key;
auto& aws_assume_role_arn = k.aws_assume_role_arn;
@@ -988,6 +995,10 @@ future<> encryption::kms_host::init() {
return _impl->init();
}
future<> encryption::kms_host::stop() {
return _impl->stop();
}
const encryption::kms_host::host_options& encryption::kms_host::options() const {
return _impl->options();
}

View File

@@ -63,6 +63,8 @@ public:
~kms_host();
future<> init();
future<> stop();
const host_options& options() const;
struct option_override {

View File

@@ -8,7 +8,6 @@
#include "generic_server.hh"
#include <exception>
#include <fmt/ranges.h>
#include <seastar/core/when_all.hh>
@@ -229,11 +228,28 @@ void connection::on_connection_ready()
void connection::shutdown()
{
shutdown_input();
shutdown_output();
}
bool connection::shutdown_input() {
try {
_fd.shutdown_input();
} catch (...) {
_server._logger.warn("Error shutting down input side of connection {}->{}, exception: {}", _fd.remote_address(), _fd.local_address(), std::current_exception());
return false;
}
return true;
}
bool connection::shutdown_output() {
try {
_fd.shutdown_output();
} catch (...) {
_server._logger.warn("Error shutting down output side of connection {}->{}, exception: {}", _fd.remote_address(), _fd.local_address(), std::current_exception());
return false;
}
return true;
}
server::server(const sstring& server_name, logging::logger& logger, config cfg)
@@ -256,10 +272,7 @@ server::server(const sstring& server_name, logging::logger& logger, config cfg)
}))
, _prev_conns_cpu_concurrency(_conns_cpu_concurrency)
, _conns_cpu_concurrency_semaphore(_conns_cpu_concurrency, named_semaphore_exception_factory{"connections cpu concurrency semaphore"})
{
}
server::~server()
, _shutdown_timeout(std::chrono::seconds{cfg.shutdown_timeout_in_seconds})
{
}
@@ -272,7 +285,11 @@ future<> server::shutdown() {
if (_gate.is_closed()) {
co_return;
}
_all_connections_stopped = _gate.close();
shared_future<> connections_stopped{_gate.close()};
_all_connections_stopped = connections_stopped.get_future();
// Stop all listeners.
size_t nr = 0;
size_t nr_total = _listeners.size();
_logger.debug("abort accept nr_total={}", nr_total);
@@ -280,14 +297,35 @@ future<> server::shutdown() {
l.abort_accept();
_logger.debug("abort accept {} out of {} done", ++nr, nr_total);
}
co_await std::move(_listeners_stopped);
// Shut down the RX side of the connections, so no new requests can be received.
// Leave the TX side up so the responses to ongoing requests can still be sent.
_logger.debug("Shutting down RX side of {} connections", _connections_list.size());
co_await for_each_gently([](auto& connection) {
if (!connection.shutdown_input()) {
// If we failed to shut down the input side, attempt to shut down the output side, which should fully close the connection.
connection.shutdown_output();
}
});
// Wait for the remaining requests to finish.
_logger.debug("Waiting for connections to stop");
try {
co_await connections_stopped.get_future(seastar::lowres_clock::now() + _shutdown_timeout);
} catch (const timed_out_error& _) {
_logger.info("Timed out waiting for connections shutdown.");
}
// Either all requests stopped or a timeout occurred, do the full shutdown of the connections.
size_t nr_conn = 0;
auto nr_conn_total = _connections_list.size();
_logger.debug("shutdown connection nr_total={}", nr_conn_total);
for (auto& c : _connections_list) {
co_await for_each_gently([&nr_conn, nr_conn_total, this](connection& c) {
c.shutdown();
_logger.debug("shutdown connection {} out of {} done", ++nr_conn, nr_conn_total);
}
co_await std::move(_listeners_stopped);
});
_abort_source.request_abort();
}
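The new shutdown shape is: close the RX side, wait for in-flight requests up to a deadline, then force-close whatever remains. The wait-or-force step can be sketched without seastar, with `std::future` standing in for the `shared_future` used in the diff:

```cpp
#include <cassert>
#include <chrono>
#include <future>

// Returns true if all in-flight work finished within the timeout,
// false if we hit the deadline and must fall through to the forced
// shutdown path (full connection shutdown).
bool wait_or_force(std::future<void>& all_done, std::chrono::milliseconds timeout) {
    return all_done.wait_for(timeout) == std::future_status::ready;
}
```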

View File

@@ -39,6 +39,7 @@ class server;
// member function to perform request processing. This base class provides a
// `_read_buf` and a `_write_buf` for reading requests and writing responses.
class connection : public boost::intrusive::list_base_hook<> {
friend class server;
public:
using connection_process_loop = noncopyable_function<future<> ()>;
using execute_under_tenant_type = noncopyable_function<future<> (connection_process_loop)>;
@@ -61,6 +62,8 @@ protected:
private:
future<> process_until_tenant_switch();
bool shutdown_input();
bool shutdown_output();
public:
connection(server& server, connected_socket&& fd, named_semaphore& sem, semaphore_units<named_semaphore_exception_factory> initial_sem_units);
virtual ~connection();
@@ -82,6 +85,7 @@ public:
struct config {
utils::updateable_value<uint32_t> uninitialized_connections_semaphore_cpu_concurrency;
utils::updateable_value<uint32_t> shutdown_timeout_in_seconds;
};
// A generic TCP socket server.
@@ -126,10 +130,11 @@ private:
utils::observer<uint32_t> _conns_cpu_concurrency_observer;
uint32_t _prev_conns_cpu_concurrency;
named_semaphore _conns_cpu_concurrency_semaphore;
std::chrono::seconds _shutdown_timeout;
public:
server(const sstring& server_name, logging::logger& logger, config cfg);
virtual ~server();
virtual ~server() = default;
// Makes sure listening sockets no longer generate new connections and aborts the
// connected sockets, so that new requests are not served and existing requests don't
@@ -140,9 +145,9 @@ public:
future<> shutdown();
future<> stop();
future<> listen(socket_address addr,
std::shared_ptr<seastar::tls::credentials_builder> creds,
bool is_shard_aware, bool keepalive,
future<> listen(socket_address addr,
std::shared_ptr<seastar::tls::credentials_builder> creds,
bool is_shard_aware, bool keepalive,
std::optional<file_permissions> unix_domain_socket_permissions,
std::function<server&()> get_shard_instance = {}
);

View File

@@ -13,8 +13,8 @@
auto fmt::formatter<gms::gossip_digest_syn>::format(const gms::gossip_digest_syn& syn, fmt::format_context& ctx) const
-> decltype(ctx.out()) {
auto out = ctx.out();
out = fmt::format_to(out, "cluster_id:{},partioner:{},group0_id{},",
syn._cluster_id, syn._partioner, syn._group0_id);
out = fmt::format_to(out, "cluster_id:{},partioner:{},group0_id:{},recovery_leader:{}",
syn._cluster_id, syn._partioner, syn._group0_id, syn._recovery_leader);
out = fmt::format_to(out, "digests:{{");
for (auto& d : syn._digests) {
out = fmt::format_to(out, "{} ", d);

View File

@@ -28,15 +28,19 @@ private:
sstring _partioner;
utils::chunked_vector<gossip_digest> _digests;
utils::UUID _group0_id;
utils::UUID _recovery_leader;
public:
gossip_digest_syn() {
}
gossip_digest_syn(sstring id, sstring p, utils::chunked_vector<gossip_digest> digests, utils::UUID group0_id)
gossip_digest_syn(
sstring id, sstring p, utils::chunked_vector<gossip_digest> digests,
utils::UUID group0_id, utils::UUID recovery_leader)
: _cluster_id(std::move(id))
, _partioner(std::move(p))
, _digests(std::move(digests))
, _group0_id(std::move(group0_id)) {
, _group0_id(std::move(group0_id))
, _recovery_leader(std::move(recovery_leader)) {
}
sstring cluster_id() const {
@@ -47,6 +51,10 @@ public:
return _group0_id;
}
utils::UUID recovery_leader() const {
return _recovery_leader;
}
sstring partioner() const {
return _partioner;
}
@@ -59,6 +67,10 @@ public:
return group0_id();
}
utils::UUID get_recovery_leader() const {
return _recovery_leader;
}
sstring get_partioner() const {
return partioner();
}

View File

@@ -76,6 +76,10 @@ const utils::UUID& gossiper::get_group0_id() const noexcept {
return _gcfg.group0_id;
}
const utils::UUID& gossiper::get_recovery_leader() const noexcept {
return _gcfg.recovery_leader;
}
const std::set<inet_address>& gossiper::get_seeds() const noexcept {
return _gcfg.seeds;
}
@@ -172,8 +176,11 @@ void gossiper::do_sort(utils::chunked_vector<gossip_digest>& g_digest_list) cons
// Depends on
// - no external dependency
future<> gossiper::handle_syn_msg(locator::host_id from, gossip_digest_syn syn_msg) {
logger.trace("handle_syn_msg():from={},cluster_name:peer={},local={},group0_id:peer={},local={},partitioner_name:peer={},local={}",
from, syn_msg.cluster_id(), get_cluster_name(), syn_msg.group0_id(), get_group0_id(), syn_msg.partioner(), get_partitioner_name());
logger.trace(
"handle_syn_msg():from={},cluster_name:peer={},local={},group0_id:peer={},local={},"
"recovery_leader:peer={},local={},partitioner_name:peer={},local={}",
from, syn_msg.cluster_id(), get_cluster_name(), syn_msg.group0_id(), get_group0_id(),
syn_msg.recovery_leader(), get_recovery_leader(), syn_msg.partioner(), get_partitioner_name());
if (!is_enabled()) {
co_return;
}
@@ -184,10 +191,20 @@ future<> gossiper::handle_syn_msg(locator::host_id from, gossip_digest_syn syn_m
co_return;
}
// Recovery leader mismatch implies an administrator's mistake during the Raft-based recovery procedure.
// Throw away the message and signal that something is wrong.
bool both_nodes_in_recovery = syn_msg.recovery_leader() && get_recovery_leader();
if (both_nodes_in_recovery && syn_msg.recovery_leader() != get_recovery_leader()) {
logger.warn("Recovery leader mismatch from {} {} != {},",
from, syn_msg.recovery_leader(), get_recovery_leader());
co_return;
}
// If the message is from a node with a different group0 id throw it away.
// A group0 id mismatch is expected during a rolling restart in the Raft-based recovery procedure.
if (_gcfg.recovery_leader().empty()
&& syn_msg.group0_id() && get_group0_id() && syn_msg.group0_id() != get_group0_id()) {
bool no_recovery = !syn_msg.recovery_leader() && !get_recovery_leader();
bool group0_ids_mismatch = syn_msg.group0_id() && get_group0_id() && syn_msg.group0_id() != get_group0_id();
if (no_recovery && group0_ids_mismatch) {
logger.warn("Group0Id mismatch from {} {} != {}", from, syn_msg.group0_id(), get_group0_id());
co_return;
}
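The two rejection rules introduced above can be read as predicates: drop the SYN on recovery-leader mismatch when both nodes are in recovery, and drop on group0 id mismatch only when neither node is in recovery (a group0 id mismatch is expected mid-recovery). A sketch, with `int` standing in for `utils::UUID` and empty optionals for unset values:

```cpp
#include <cassert>
#include <optional>

using uuid = int; // stand-in for utils::UUID in this sketch

// Mirrors the SYN-handling rules from the diff.
bool should_drop_syn(std::optional<uuid> peer_leader, std::optional<uuid> local_leader,
                     std::optional<uuid> peer_g0, std::optional<uuid> local_g0) {
    bool both_in_recovery = peer_leader && local_leader;
    if (both_in_recovery && *peer_leader != *local_leader) {
        return true; // administrator mistake during Raft-based recovery
    }
    bool no_recovery = !peer_leader && !local_leader;
    if (no_recovery && peer_g0 && local_g0 && *peer_g0 != *local_g0) {
        return true; // plain group0 id mismatch outside recovery
    }
    return false;
}
```

Note that when exactly one side is in recovery, neither rule fires and the message is accepted, which is what tolerates the rolling restart.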
@@ -1102,7 +1119,8 @@ void gossiper::run() {
utils::chunked_vector<gossip_digest> g_digests = make_random_gossip_digest();
if (g_digests.size() > 0) {
gossip_digest_syn message(get_cluster_name(), get_partitioner_name(), g_digests, get_group0_id());
gossip_digest_syn message(
get_cluster_name(), get_partitioner_name(), g_digests, get_group0_id(), get_recovery_leader());
if (_endpoints_to_talk_with.empty() && !_live_endpoints.empty()) {
auto live_endpoints = _live_endpoints | std::ranges::to<std::vector>();

View File

@@ -69,7 +69,7 @@ struct gossip_config {
uint32_t skip_wait_for_gossip_to_settle = -1;
utils::updateable_value<uint32_t> failure_detector_timeout_ms;
utils::updateable_value<int32_t> force_gossip_generation;
utils::updateable_value<sstring> recovery_leader;
utils::updateable_value<utils::UUID> recovery_leader;
};
struct loaded_endpoint_state {
@@ -136,6 +136,8 @@ public:
void set_group0_id(utils::UUID group0_id);
const utils::UUID& get_group0_id() const noexcept;
const utils::UUID& get_recovery_leader() const noexcept;
const sstring& get_partitioner_name() const noexcept {
return _gcfg.partitioner;
}

View File

@@ -57,6 +57,7 @@ class gossip_digest_syn {
sstring get_partioner();
utils::chunked_vector<gms::gossip_digest> get_gossip_digests();
utils::UUID get_group0_id()[[version 5.4]];
utils::UUID get_recovery_leader()[[version 2025.2.2]];
};
class gossip_digest_ack {

keys.cc
View File

@@ -38,12 +38,18 @@ partition_key_view::ring_order_tri_compare(const schema& s, partition_key_view k
partition_key partition_key::from_nodetool_style_string(const schema_ptr s, const sstring& key) {
std::vector<sstring> vec;
boost::split(vec, key, boost::is_any_of(":"));
if (s->partition_key_type()->types().size() == 1) {
// For a single-column partition key, don't try to split the key.
// See #16596
vec.push_back(key);
} else {
boost::split(vec, key, boost::is_any_of(":"));
if (vec.size() != s->partition_key_type()->types().size()) {
throw std::invalid_argument(fmt::format("partition key '{}' has mismatched number of components: expected {}, got {}", key, s->partition_key_type()->types().size(), vec.size()));
}
}
auto it = std::begin(vec);
if (vec.size() != s->partition_key_type()->types().size()) {
throw std::invalid_argument("partition key '" + key + "' has mismatch number of components");
}
std::vector<bytes> r;
r.reserve(vec.size());
for (auto t : s->partition_key_type()->types()) {

View File

@@ -335,18 +335,25 @@ void tablet_metadata::drop_tablet_map(table_id id) {
}
future<> tablet_metadata::clear_gently() {
for (auto&& [id, map] : _tablets) {
const auto shard = map.get_owner_shard();
co_await smp::submit_to(shard, [map = std::move(map)] () mutable {
auto map_ptr = map.release();
// Other copies exist; we simply drop ours, no need to clear anything.
if (map_ptr.use_count() > 1) {
return make_ready_future<>();
}
return const_cast<tablet_map&>(*map_ptr).clear_gently().finally([map_ptr = std::move(map_ptr)] { });
});
tablet_logger.debug("tablet_metadata::clear_gently {}", fmt::ptr(this));
// First, sort the tablet maps per shard to avoid destroying all the foreign
// tablet map ptrs on this shard. We don't use sharded<> here since it would
// require a similar submit_to to each shard owner per tablet map.
std::vector<std::vector<tablet_map_ptr>> tablet_maps_per_shard;
tablet_maps_per_shard.resize(smp::count);
for (auto& [_, map_ptr] : _tablets) {
tablet_maps_per_shard[map_ptr.get_owner_shard()].emplace_back(std::move(map_ptr));
}
_tablets.clear();
// Now destroy the foreign tablet map pointers on each shard.
co_await smp::invoke_on_all([&] -> future<> {
for (auto& map_ptr : tablet_maps_per_shard[this_shard_id()]) {
auto map = map_ptr.release();
co_await utils::clear_gently(map);
}
});
co_return;
}
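The core of the rewritten `clear_gently()` is a bucketing step: pointers tagged with an owner shard are grouped so each shard destroys its own batch locally, instead of paying one cross-shard `submit_to` per tablet map. The grouping can be sketched without seastar:

```cpp
#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

// Hypothetical item with an owner shard, standing in for tablet_map_ptr.
struct owned_item {
    unsigned owner_shard;
    int payload;
};

// Bucket items by owner so each shard can dispose of its batch locally.
std::vector<std::vector<owned_item>> bucket_by_shard(std::vector<owned_item> items, unsigned shard_count) {
    std::vector<std::vector<owned_item>> per_shard(shard_count);
    for (auto& it : items) {
        per_shard[it.owner_shard].push_back(std::move(it));
    }
    return per_shard;
}
```

In the diff, the per-shard batches are then handed to `smp::invoke_on_all`, so the actual `clear_gently()` of each map runs on its owner shard.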

View File

@@ -357,6 +357,7 @@ future<std::unique_ptr<token_metadata_impl>> token_metadata_impl::clone_only_tok
}
future<> token_metadata_impl::clear_gently() noexcept {
_version_tracker = {};
co_await utils::clear_gently(_token_to_endpoint_map);
co_await utils::clear_gently(_normal_token_owners);
co_await utils::clear_gently(_bootstrap_tokens);
@@ -834,16 +835,30 @@ token_metadata::token_metadata(std::unique_ptr<token_metadata_impl> impl)
{
}
token_metadata::token_metadata(config cfg)
: _impl(std::make_unique<token_metadata_impl>(cfg))
token_metadata::token_metadata(shared_token_metadata& stm, config cfg)
: _shared_token_metadata(&stm)
, _impl(std::make_unique<token_metadata_impl>(std::move(cfg)))
{
}
token_metadata::~token_metadata() = default;
token_metadata::~token_metadata() {
clear_and_dispose_impl();
}
token_metadata::token_metadata(token_metadata&&) noexcept = default;
token_metadata& token_metadata::token_metadata::operator=(token_metadata&&) noexcept = default;
token_metadata& token_metadata::token_metadata::operator=(token_metadata&& o) noexcept {
if (this != &o) {
clear_and_dispose_impl();
_shared_token_metadata = std::exchange(o._shared_token_metadata, nullptr);
_impl = std::exchange(o._impl, nullptr);
}
return *this;
}
void token_metadata::set_shared_token_metadata(shared_token_metadata& stm) {
_shared_token_metadata = &stm;
}
const std::vector<token>&
token_metadata::sorted_tokens() const {
@@ -1027,6 +1042,15 @@ token_metadata::clone_after_all_left() const noexcept {
co_return token_metadata(co_await _impl->clone_after_all_left());
}
void token_metadata::clear_and_dispose_impl() noexcept {
if (!_shared_token_metadata) {
return;
}
if (auto impl = std::exchange(_impl, nullptr)) {
_shared_token_metadata->clear_and_dispose(std::move(impl));
}
}
future<> token_metadata::clear_gently() noexcept {
return _impl->clear_gently();
}
@@ -1143,6 +1167,17 @@ version_tracker shared_token_metadata::new_tracker(token_metadata::version_t ver
return tracker;
}
future<> shared_token_metadata::stop() noexcept {
co_await _background_dispose_gate.close();
}
void shared_token_metadata::clear_and_dispose(std::unique_ptr<token_metadata_impl> impl) noexcept {
// Safe to drop the future since the gate is closed in stop()
if (auto gh = _background_dispose_gate.try_hold()) {
(void)impl->clear_gently().finally([i = std::move(impl), gh = std::move(gh)] {});
}
}
void shared_token_metadata::set(mutable_token_metadata_ptr tmptr) noexcept {
if (_shared->get_ring_version() >= tmptr->get_ring_version()) {
on_internal_error(tlogger, format("shared_token_metadata: must not set non-increasing ring_version: {} -> {}", _shared->get_ring_version(), tmptr->get_ring_version()));
@@ -1154,6 +1189,7 @@ void shared_token_metadata::set(mutable_token_metadata_ptr tmptr) noexcept {
_stale_versions_in_use = _versions_barrier.advance_and_await();
}
tmptr->set_shared_token_metadata(*this);
_shared = std::move(tmptr);
_shared->set_version_tracker(new_tracker(_shared->get_version()));
@@ -1216,7 +1252,7 @@ future<> shared_token_metadata::mutate_on_all_shards(sharded<shared_token_metada
std::vector<mutable_token_metadata_ptr> pending_token_metadata_ptr;
pending_token_metadata_ptr.resize(smp::count);
auto tmptr = make_token_metadata_ptr(co_await stm.local().get()->clone_async());
auto tmptr = stm.local().make_token_metadata_ptr(co_await stm.local().get()->clone_async());
auto& tm = *tmptr;
// bump the token_metadata ring_version
// to invalidate cached token/replication mappings
@@ -1227,7 +1263,7 @@ future<> shared_token_metadata::mutate_on_all_shards(sharded<shared_token_metada
// Apply the mutated token_metadata only after successfully cloning it on all shards.
pending_token_metadata_ptr[base_shard] = tmptr;
co_await smp::invoke_on_others(base_shard, [&] () -> future<> {
pending_token_metadata_ptr[this_shard_id()] = make_token_metadata_ptr(co_await tm.clone_async());
pending_token_metadata_ptr[this_shard_id()] = stm.local().make_token_metadata_ptr(co_await tm.clone_async());
});
co_await stm.invoke_on_all([&] (shared_token_metadata& stm) {

View File

@@ -47,7 +47,7 @@ class abstract_replication_strategy;
using token = dht::token;
class token_metadata;
class shared_token_metadata;
class tablet_metadata;
struct host_id_or_endpoint {
@@ -166,6 +166,7 @@ private:
};
class token_metadata final {
shared_token_metadata* _shared_token_metadata = nullptr;
std::unique_ptr<token_metadata_impl> _impl;
private:
friend class token_metadata_ring_splitter;
@@ -178,7 +179,7 @@ public:
using version_t = service::topology::version_t;
using version_tracker_t = version_tracker;
token_metadata(config cfg);
token_metadata(shared_token_metadata& stm, config cfg);
explicit token_metadata(std::unique_ptr<token_metadata_impl> impl);
token_metadata(token_metadata&&) noexcept; // Can't use "= default;" - hits some static_assert in unique_ptr
token_metadata& operator=(token_metadata&&) noexcept;
@@ -355,6 +356,11 @@ public:
friend class shared_token_metadata;
private:
void set_version_tracker(version_tracker_t tracker);
void set_shared_token_metadata(shared_token_metadata& stm);
// Clears and disposes the token metadata impl in the background, if present.
void clear_and_dispose_impl() noexcept;
};
struct topology_change_info {
@@ -371,12 +377,8 @@ struct topology_change_info {
using token_metadata_lock = semaphore_units<>;
using token_metadata_lock_func = noncopyable_function<future<token_metadata_lock>() noexcept>;
template <typename... Args>
mutable_token_metadata_ptr make_token_metadata_ptr(Args... args) {
return make_lw_shared<token_metadata>(std::forward<Args>(args)...);
}
class shared_token_metadata {
class shared_token_metadata : public peering_sharded_service<shared_token_metadata> {
named_gate _background_dispose_gate{"shared_token_metadata::background_dispose_gate"};
mutable_token_metadata_ptr _shared;
token_metadata_lock_func _lock_func;
std::chrono::steady_clock::duration _stall_detector_threshold = std::chrono::seconds(2);
@@ -408,7 +410,7 @@ public:
// used to construct the shared object as a sharded<> instance
// lock_func returns semaphore_units<>
explicit shared_token_metadata(token_metadata_lock_func lock_func, token_metadata::config cfg)
: _shared(make_token_metadata_ptr(std::move(cfg)))
: _shared(make_lw_shared<token_metadata>(*this, cfg))
, _lock_func(std::move(lock_func))
, _versions_barrier("shared_token_metadata::versions_barrier")
{
@@ -418,6 +420,17 @@ public:
shared_token_metadata(const shared_token_metadata& x) = delete;
shared_token_metadata(shared_token_metadata&& x) = default;
future<> stop() noexcept;
mutable_token_metadata_ptr make_token_metadata_ptr() {
return make_lw_shared<token_metadata>(*this, token_metadata::config{_shared->get_topology().get_config()});
}
mutable_token_metadata_ptr make_token_metadata_ptr(token_metadata&& tm) {
tm.set_shared_token_metadata(*this);
return make_lw_shared<token_metadata>(std::move(tm));
}
token_metadata_ptr get() const noexcept {
return _shared;
}
@@ -467,6 +480,8 @@ public:
// Must be called on shard 0.
static future<> mutate_on_all_shards(sharded<shared_token_metadata>& stm, seastar::noncopyable_function<future<> (token_metadata&)> func);
void clear_and_dispose(std::unique_ptr<token_metadata_impl> impl) noexcept;
private:
// for testing only, unsafe to be called without awaiting get_lock() first
void mutate_token_metadata_for_test(seastar::noncopyable_function<void (token_metadata&)> func);

main.cc
View File

@@ -2182,11 +2182,8 @@ sharded<locator::shared_token_metadata> token_metadata;
// At this point, `locator::topology` should be stable, i.e. we should have complete information
// about the layout of the cluster (= list of nodes along with the racks/DCs).
if (cfg->rf_rack_valid_keyspaces()) {
startlog.info("Verifying that all of the keyspaces are RF-rack-valid");
db.local().check_rf_rack_validity(token_metadata.local().get());
startlog.info("All keyspaces are RF-rack-valid");
}
startlog.info("Verifying that all of the keyspaces are RF-rack-valid");
db.local().check_rf_rack_validity(cfg->rf_rack_valid_keyspaces(), token_metadata.local().get());
dictionary_service dict_service(
dict_sampler,
@@ -2249,6 +2246,13 @@ sharded<locator::shared_token_metadata> token_metadata;
const qualified_name qualified_authenticator_name(auth::meta::AUTH_PACKAGE_NAME, cfg->authenticator());
const qualified_name qualified_role_manager_name(auth::meta::AUTH_PACKAGE_NAME, cfg->role_manager());
// Reproducer of scylladb/scylladb#24792.
auto i24792_reproducer = defer([] {
if (utils::get_local_injector().enter("reload_service_level_cache_after_auth_service_is_stopped")) {
sl_controller.local().update_cache(qos::update_both_cache_levels::yes).get();
}
});
checkpoint(stop_signal, "starting auth service");
auth::service_config auth_config;
auth_config.authorizer_java_name = qualified_authorizer_name;
@@ -2272,6 +2276,17 @@ sharded<locator::shared_token_metadata> token_metadata;
api::unset_server_authorization_cache(ctx).get();
});
// Precondition: we can only call this after `auth::service` has been initialized and started on all shards.
sl_controller.invoke_on_all([] (qos::service_level_controller& controller) {
controller.register_auth_integration(auth_service.local());
}).get();
auto unregister_sl_controller_integration = defer([] {
sl_controller.invoke_on_all([] (qos::service_level_controller& controller) {
return controller.unregister_auth_integration();
}).get();
});
// update the service level cache after the SL data accessor and auth service are initialized.
if (sl_controller.local().is_v2()) {
sl_controller.local().update_cache(qos::update_both_cache_levels::yes).get();

View File

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f69e30ac03713e439d4f9fe347aafe2201d8605880358d3142b6f6bc706c3014
size 5966816
oid sha256:4c899bb3c62675ca4e6a6f7ffc75a37a6b2703dae0b781f1a0c9fc93e5bfe0b2
size 6025944

View File

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9ec68edb2980fae1fcf63046b399f30b882fc7b77b4bc316c7055f75820d26f1
size 5975376
oid sha256:d7a9bdc43add8030e1e5b4d3920052e0667b8cefeabe7a0f4a6ffea5162120e1
size 6046544

View File

@@ -31,7 +31,7 @@ namespace redis_transport {
static logging::logger logging("redis_server");
redis_server::redis_server(seastar::sharded<redis::query_processor>& qp, auth::service& auth_service, const db::config& cfg, redis_server_config config)
: server("Redis", logging, generic_server::config{cfg.uninitialized_connections_semaphore_cpu_concurrency})
: server("Redis", logging, generic_server::config{cfg.uninitialized_connections_semaphore_cpu_concurrency, cfg.request_timeout_on_shutdown_in_seconds})
, _query_processor(qp)
, _config(std::move(config))
, _max_request_size(_config._max_request_size)

View File

@@ -12,6 +12,7 @@
#include "repair/row_level.hh"
#include "locator/network_topology_strategy.hh"
#include "seastar/core/on_internal_error.hh"
#include "streaming/stream_reason.hh"
#include "gms/inet_address.hh"
#include "gms/gossiper.hh"
@@ -1452,10 +1453,6 @@ future<std::optional<double>> repair::user_requested_repair_task_impl::expected_
co_return _ranges.size() * _cfs.size() * smp::count;
}
std::optional<double> repair::user_requested_repair_task_impl::expected_children_number() const {
return smp::count;
}
future<int> repair_start(seastar::sharded<repair_service>& repair, sharded<gms::gossip_address_map>& am,
sstring keyspace, std::unordered_map<sstring, sstring> options) {
return repair.invoke_on(0, [keyspace = std::move(keyspace), options = std::move(options), &am] (repair_service& local_repair) {
@@ -1624,10 +1621,6 @@ future<std::optional<double>> repair::data_sync_repair_task_impl::expected_total
co_return _cfs_size ? std::make_optional<double>(_ranges.size() * _cfs_size * smp::count) : std::nullopt;
}
std::optional<double> repair::data_sync_repair_task_impl::expected_children_number() const {
return smp::count;
}
future<> repair_service::bootstrap_with_repair(locator::token_metadata_ptr tmptr, std::unordered_set<dht::token> bootstrap_tokens) {
SCYLLA_ASSERT(this_shard_id() == 0);
return seastar::async([this, tmptr = std::move(tmptr), tokens = std::move(bootstrap_tokens)] () mutable {
@@ -2244,7 +2237,7 @@ future<> repair_service::replace_with_repair(std::unordered_map<sstring, locator
auto reason = streaming::stream_reason::replace;
// update a cloned version of tmptr
// no need to set the original version
auto cloned_tmptr = make_token_metadata_ptr(std::move(cloned_tm));
auto cloned_tmptr = _db.local().get_shared_token_metadata().make_token_metadata_ptr(std::move(cloned_tm));
cloned_tmptr->update_topology(tmptr->get_my_id(), myloc, locator::node::state::replacing);
co_await cloned_tmptr->update_normal_tokens(replacing_tokens, tmptr->get_my_id());
auto source_dc = utils::optional_param(myloc.dc);
@@ -2267,9 +2260,92 @@ static std::unordered_set<locator::host_id> get_token_owners_in_dcs(std::vector<
return dc_endpoints;
}
future<> repair::remote_metas::for_each_local_meta(std::function<future<>(const tablet_repair_task_meta&)> func) const {
size_t this_shard = this_shard_id();
if (!_metas_on_shards[this_shard]) {
co_return;
}
for (size_t i = 0; i < _metas_on_shards[this_shard]->metas.size(); ++i) {
co_await func(_metas_on_shards[this_shard]->metas[i]);
}
}
future<repair::hosts_and_tables> repair::remote_metas::get_hosts_and_tables() const {
repair::hosts_and_tables ret;
co_await coroutine::parallel_for_each(smp::all_cpus(), [this, &ret](const auto& shard) -> future<> {
auto hosts_and_tables = co_await smp::submit_to(shard, [this]() mutable -> future<repair::hosts_and_tables> {
repair::hosts_and_tables shard_ret;
co_await for_each_local_meta([&shard_ret](const auto& meta) {
shard_ret.hosts.insert(meta.neighbors.all.begin(), meta.neighbors.all.end());
shard_ret.tables.insert(meta.tid);
return make_ready_future<>();
});
co_return shard_ret;
});
ret.hosts.insert(hosts_and_tables.hosts.begin(), hosts_and_tables.hosts.end());
ret.tables.insert(hosts_and_tables.tables.begin(), hosts_and_tables.tables.end());
});
co_return ret;
}
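The `get_hosts_and_tables()` implementation above fans out to every shard and unions each shard's neighbor hosts and table ids into one result. A single-process sketch of that union step (plain C++, with Seastar's `smp::submit_to` replaced by a direct loop; the `Meta` fields and `int` ids are illustrative stand-ins, not Scylla's actual types):

```cpp
#include <cassert>
#include <set>
#include <vector>

// Illustrative stand-in for tablet_repair_task_meta.
struct Meta {
    std::vector<int> neighbor_hosts;
    int table_id;
};

struct HostsAndTables {
    std::set<int> hosts;
    std::set<int> tables;
};

// Union every shard's metas into one deduplicated host/table set,
// mirroring the aggregation done by remote_metas::get_hosts_and_tables().
HostsAndTables collect(const std::vector<std::vector<Meta>>& per_shard) {
    HostsAndTables ret;
    for (const auto& shard_metas : per_shard) {   // one entry per shard
        for (const auto& m : shard_metas) {
            ret.hosts.insert(m.neighbor_hosts.begin(), m.neighbor_hosts.end());
            ret.tables.insert(m.table_id);
        }
    }
    return ret;
}
```

In the real code each inner loop runs on its owning shard and the partial sets are merged on the caller's shard; the dedup-by-set shape is the same.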
size_t repair::remote_metas::size() const {
return _metas_count;
}
void repair::remote_metas::clear() {
_metas_on_shards.clear();
_metas_count = 0;
}
future<> repair::remote_metas_builder::allocate_on_shard(size_t shard_id) {
_remote_metas._metas_on_shards[shard_id] = co_await smp::submit_to(shard_id, []() -> future<remote_metas::remote_data_ptr> {
auto ptr = make_lw_shared<remote_data>();
co_return remote_metas::remote_data_ptr(std::move(ptr));
});
}
future<> repair::remote_metas_builder::flush(size_t shard_id) {
auto it = _pending_metas.find(shard_id);
if (it == _pending_metas.end() || it->second.empty()) {
co_return;
}
auto local_pending_metas = std::move(it->second);
_pending_metas.erase(it);
auto& metas_ptr = _remote_metas._metas_on_shards[shard_id];
if (!metas_ptr) {
co_await allocate_on_shard(shard_id);
}
co_await smp::submit_to(shard_id, [metas = std::move(local_pending_metas), &metas_ptr]() {
metas_ptr->metas.insert(metas_ptr->metas.end(), std::make_move_iterator(metas.begin()), std::make_move_iterator(metas.end()));
});
}
future<> repair::remote_metas_builder::add_on_shard(size_t shard_id, tablet_repair_task_meta meta) {
++_remote_metas._metas_count;
auto& local_pending_metas = _pending_metas[shard_id];
local_pending_metas.push_back(std::move(meta));
if (local_pending_metas.size() >= max_pending_metas_per_shard) {
co_await flush(shard_id);
}
}
future<repair::remote_metas> repair::remote_metas_builder::build() && {
for (size_t shard_id = 0; shard_id < smp::count; ++shard_id) {
co_await flush(shard_id);
}
co_return std::move(_remote_metas);
}
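The builder above stages metas per shard and only ships a batch across shards once `max_pending_metas_per_shard` accumulate, amortizing the cost of `smp::submit_to`. A minimal single-process sketch of that batching policy (plain C++, cross-shard submission replaced by direct vector appends; `Builder`, `Meta`, and `kMaxPending` are illustrative names):

```cpp
#include <cassert>
#include <cstddef>
#include <unordered_map>
#include <vector>

// Illustrative stand-in for tablet_repair_task_meta.
struct Meta { int tablet_id; };

// Sketch of the remote_metas_builder batching policy: metas are staged
// per shard and moved into the shard's final vector once kMaxPending
// accumulate, or when build() drains everything.
class Builder {
    static constexpr std::size_t kMaxPending = 32;
    std::unordered_map<std::size_t, std::vector<Meta>> pending_;
    std::vector<std::vector<Meta>> per_shard_;
    std::size_t count_ = 0;

    void flush(std::size_t shard) {
        auto it = pending_.find(shard);
        if (it == pending_.end() || it->second.empty()) {
            return;
        }
        auto& dst = per_shard_[shard];
        dst.insert(dst.end(), it->second.begin(), it->second.end());
        pending_.erase(it);
    }
public:
    explicit Builder(std::size_t shards) : per_shard_(shards) {}

    void add(std::size_t shard, Meta m) {
        ++count_;
        auto& p = pending_[shard];
        p.push_back(m);
        if (p.size() >= kMaxPending) {
            flush(shard);   // in the real code, one cross-shard hop per batch
        }
    }

    std::vector<std::vector<Meta>> build() {
        for (std::size_t s = 0; s < per_shard_.size(); ++s) {
            flush(s);
        }
        return std::move(per_shard_);
    }

    std::size_t size() const { return count_; }
};
```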
// Repair all tablets belonging to this node for the given table
future<> repair_service::repair_tablets(repair_uniq_id rid, sstring keyspace_name, std::vector<sstring> table_names, bool primary_replica_only, dht::token_range_vector ranges_specified, std::vector<sstring> data_centers, std::unordered_set<locator::host_id> hosts, std::unordered_set<locator::host_id> ignore_nodes, std::optional<int> ranges_parallelism) {
std::vector<tablet_repair_task_meta> task_metas;
utils::chunked_vector<locator::effective_replication_map_ptr> erms;
repair::remote_metas_builder task_metas_builder;
for (auto& table_name : table_names) {
lw_shared_ptr<replica::table> t;
try {
@@ -2283,7 +2359,8 @@ future<> repair_service::repair_tablets(repair_uniq_id rid, sstring keyspace_nam
}
table_id tid = t->schema()->id();
// Invoke group0 read barrier before obtaining erm pointer so that it sees all prior metadata changes
auto dropped = co_await streaming::table_sync_and_check(_db.local(), _mm, tid);
auto dropped = !utils::get_local_injector().enter("repair_tablets_no_sync") &&
co_await streaming::table_sync_and_check(_db.local(), _mm, tid);
if (dropped) {
rlogger.debug("repair[{}] Table {}.{} does not exist anymore", rid.uuid(), keyspace_name, table_name);
continue;
@@ -2292,11 +2369,15 @@ future<> repair_service::repair_tablets(repair_uniq_id rid, sstring keyspace_nam
while (true) {
_repair_module->check_in_shutdown();
erm = t->get_effective_replication_map();
auto local_version = erm->get_token_metadata().get_version();
const locator::tablet_map& tmap = erm->get_token_metadata_ptr()->tablets().get_tablet_map(tid);
if (!tmap.has_transitions()) {
if (!tmap.has_transitions() && co_await container().invoke_on(0, [local_version] (repair_service& rs) {
// We need to ensure that there is no ongoing global request.
return local_version == rs._tsm.local()._topology.version && !rs._tsm.local()._topology.is_busy();
})) {
break;
}
rlogger.info("repair[{}] Table {}.{} has tablet transitions, waiting for topology to quiesce", rid.uuid(), keyspace_name, table_name);
rlogger.info("repair[{}] Topology is busy, waiting for it to quiesce", rid.uuid());
erm = nullptr;
co_await container().invoke_on(0, [] (repair_service& rs) {
return rs._tsm.local().await_not_busy();
@@ -2369,6 +2450,7 @@ future<> repair_service::repair_tablets(repair_uniq_id rid, sstring keyspace_nam
}
size_t nr = 0;
bool metas_added = false;
for (auto& m : metas) {
nr++;
rlogger.debug("repair[{}] Collect {} out of {} tablets: table={}.{} tablet_id={} range={} replicas={} primary_replica_only={}",
@@ -2428,14 +2510,17 @@ future<> repair_service::repair_tablets(repair_uniq_id rid, sstring keyspace_nam
}
}
for (auto& r : intersection_ranges) {
metas_added = true;
rlogger.debug("repair[{}] Repair tablet task table={}.{} master_shard_id={} range={} neighbors={} replicas={}",
rid.uuid(), keyspace_name, table_name, master_shard_id, r, repair_neighbors(nodes, shards).shard_map, m.replicas);
task_metas.push_back(tablet_repair_task_meta{keyspace_name, table_name, tid, master_shard_id, r, repair_neighbors(nodes, shards), m.replicas, erm});
co_await coroutine::maybe_yield();
co_await task_metas_builder.add_on_shard(master_shard_id, tablet_repair_task_meta{keyspace_name, table_name, tid, master_shard_id, r, repair_neighbors(nodes, shards), m.replicas});
}
}
if (metas_added) {
erms.push_back(std::move(erm));
}
}
auto task = co_await _repair_module->make_and_start_task<repair::tablet_repair_task_impl>({}, rid, keyspace_name, tasks::task_id::create_null_id(), table_names, streaming::stream_reason::repair, std::move(task_metas), ranges_parallelism, service::default_session_id);
auto task = co_await _repair_module->make_and_start_task<repair::tablet_repair_task_impl>({}, rid, keyspace_name, tasks::task_id::create_null_id(), table_names, streaming::stream_reason::repair, co_await std::move(task_metas_builder).build(), ranges_parallelism, service::default_session_id, std::move(erms));
}
// It is called by the repair_tablet rpc verb to repair the given tablet
@@ -2494,11 +2579,11 @@ future<gc_clock::time_point> repair_service::repair_tablet(gms::gossip_address_m
co_return flush_time;
}
std::vector<tablet_repair_task_meta> task_metas;
repair::remote_metas_builder task_metas_builder;
auto ranges_parallelism = std::nullopt;
auto start = std::chrono::steady_clock::now();
task_metas.push_back(tablet_repair_task_meta{keyspace_name, table_name, table_id, *master_shard_id, range, repair_neighbors(nodes, shards), replicas});
auto task_impl_ptr = seastar::make_shared<repair::tablet_repair_task_impl>(_repair_module, id, keyspace_name, global_tablet_repair_task_info.id, table_names, streaming::stream_reason::repair, std::move(task_metas), ranges_parallelism, topo_guard, rebuild_replicas.has_value());
co_await task_metas_builder.add_on_shard(*master_shard_id, tablet_repair_task_meta{keyspace_name, table_name, table_id, *master_shard_id, range, repair_neighbors(nodes, shards), replicas});
auto task_impl_ptr = seastar::make_shared<repair::tablet_repair_task_impl>(_repair_module, id, keyspace_name, global_tablet_repair_task_info.id, table_names, streaming::stream_reason::repair, co_await std::move(task_metas_builder).build(), ranges_parallelism, topo_guard, utils::chunked_vector<locator::effective_replication_map_ptr>{}, rebuild_replicas.has_value());
task_impl_ptr->sched_by_scheduler = true;
auto task = co_await _repair_module->make_task(task_impl_ptr, global_tablet_repair_task_info);
task->start();
@@ -2521,13 +2606,14 @@ tasks::is_user_task repair::tablet_repair_task_impl::is_user_task() const noexce
future<> repair::tablet_repair_task_impl::release_resources() noexcept {
_metas_size = _metas.size();
_metas = {};
_metas.clear();
_tables = {};
_erms = {};
return make_ready_future();
}
size_t repair::tablet_repair_task_impl::get_metas_size() const noexcept {
return _metas.size() > 0 ? _metas.size() : _metas_size;
return _metas_size == 0 ? _metas.size() : _metas_size;
}
future<> repair::tablet_repair_task_impl::run() {
@@ -2545,11 +2631,9 @@ future<> repair::tablet_repair_task_impl::run() {
// Start the off strategy updater
std::unordered_set<locator::host_id> participants;
std::unordered_set<table_id> table_ids;
for (auto& meta : _metas) {
thread::maybe_yield();
participants.insert(meta.neighbors.all.begin(), meta.neighbors.all.end());
table_ids.insert(meta.tid);
}
auto hosts_and_tables = _metas.get_hosts_and_tables().get();
participants = std::move(hosts_and_tables.hosts);
table_ids = std::move(hosts_and_tables.tables);
abort_source as;
auto off_strategy_updater = seastar::async([&rs, uuid = id.uuid().uuid(), &table_ids, &participants, &as] {
auto tables = std::list<table_id>(table_ids.begin(), table_ids.end());
@@ -2590,19 +2674,17 @@ future<> repair::tablet_repair_task_impl::run() {
auto parent_shard = this_shard_id();
std::vector<gc_clock::time_point> flush_times(smp::count);
rs.container().invoke_on_all([&idx, &flush_times, id, metas = _metas, parent_data, reason = _reason, tables = _tables, sched_by_scheduler = sched_by_scheduler, ranges_parallelism = _ranges_parallelism, parent_shard, topo_guard = _topo_guard, skip_flush = _skip_flush] (repair_service& rs) -> future<> {
rs.container().invoke_on_all([&idx, &flush_times, id, &metas = _metas, &parent_data, reason = _reason, &tables = _tables, sched_by_scheduler = sched_by_scheduler, ranges_parallelism = _ranges_parallelism, parent_shard, topo_guard = _topo_guard, skip_flush = _skip_flush] (repair_service& rs) -> future<> {
std::exception_ptr error;
for (auto& m : metas) {
if (m.master_shard_id != this_shard_id()) {
continue;
}
co_await metas.for_each_local_meta(coroutine::lambda([&rs, metas_size = metas.size(), &idx, id, &flush_times, parent_data, reason, &tables, sched_by_scheduler = sched_by_scheduler, ranges_parallelism, parent_shard, topo_guard, skip_flush, &error] (const tablet_repair_task_meta& m) -> future<> {
co_await coroutine::maybe_yield();
auto nr = idx.fetch_add(1);
rlogger.info("repair[{}] Repair {} out of {} tablets: table={}.{} range={} replicas={}",
id.uuid(), nr, metas.size(), m.keyspace_name, m.table_name, m.range, m.replicas);
id.uuid(), nr, metas_size, m.keyspace_name, m.table_name, m.range, m.replicas);
lw_shared_ptr<replica::table> t = rs._db.local().get_tables_metadata().get_table_if_exists(m.tid);
if (!t) {
rlogger.debug("repair[{}] Table {}.{} does not exist anymore", id.uuid(), m.keyspace_name, m.table_name);
continue;
co_return;
}
if (co_await rs.get_repair_module().is_aborted(id.uuid(), parent_shard)) {
throw abort_requested_exception();
@@ -2650,7 +2732,7 @@ future<> repair::tablet_repair_task_impl::run() {
auto current = flush_times[this_shard_id()];
auto time = task_impl_ptr->get_flush_time();
flush_times[this_shard_id()] = current == gc_clock::time_point() ? time : std::min(current, time);
}
}));
if (error) {
co_await coroutine::return_exception_ptr(std::move(error));
}
@@ -2677,10 +2759,6 @@ future<std::optional<double>> repair::tablet_repair_task_impl::expected_total_wo
co_return sz ? std::make_optional<double>(sz) : std::nullopt;
}
std::optional<double> repair::tablet_repair_task_impl::expected_children_number() const {
return get_metas_size();
}
node_ops_cmd_category categorize_node_ops_cmd(node_ops_cmd cmd) noexcept {
switch (cmd) {
case node_ops_cmd::removenode_prepare:

View File

@@ -268,7 +268,6 @@ struct tablet_repair_task_meta {
dht::token_range range;
repair_neighbors neighbors;
locator::tablet_replica_set replicas;
locator::effective_replication_map_ptr erm;
};
namespace std {

View File

@@ -1448,7 +1448,9 @@ private:
size_t row_bytes = co_await get_repair_rows_size(row_list);
_metrics.tx_row_nr += row_list.size();
_metrics.tx_row_bytes += row_bytes;
for (repair_row& r : row_list) {
while (!row_list.empty()) {
repair_row r = std::move(row_list.front());
row_list.pop_front();
const auto& dk_with_hash = r.get_dk_with_hash();
// No need to search from the beginning of the rows. Look at the end of repair_rows_on_wire is enough.
if (rows.empty()) {
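In the hunk above, the range-for over `row_list` becomes a destructive pop-front loop, so each `repair_row` is moved out and destroyed as soon as it has been folded into `rows`, instead of the entire input list staying alive until the loop ends. A minimal sketch of the pattern (plain C++; `Row` and `drain` are illustrative):

```cpp
#include <cassert>
#include <list>
#include <string>
#include <vector>

// Illustrative stand-in for repair_row.
struct Row { std::string payload; };

// Destructive consumption: each element is moved out and its list node
// freed as it is processed, so peak memory is roughly one element plus
// the output, not the whole input list plus the output.
std::vector<std::string> drain(std::list<Row>& rows) {
    std::vector<std::string> out;
    while (!rows.empty()) {
        Row r = std::move(rows.front());
        rows.pop_front();            // the node is freed here
        out.push_back(std::move(r.payload));
    }
    return out;
}
```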
@@ -2762,7 +2764,12 @@ private:
size_t get_max_row_buf_size(row_level_diff_detect_algorithm algo) {
// Max buffer size per repair round
return is_rpc_stream_supported(algo) ? repair::task_manager_module::max_repair_memory_per_range : 256 * 1024;
size_t size = is_rpc_stream_supported(algo) ? repair::task_manager_module::max_repair_memory_per_range : 256 * 1024;
if (_small_table_optimization) {
// For small table optimization, we shrink the buffer size to limit memory consumption.
size /= _all_live_peer_nodes.size();
}
return size;
}
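The change above splits the per-round buffer across the live peers when small-table optimization is on, since every peer streams the full table. A back-of-the-envelope sketch of the sizing (the 256 KiB legacy fallback comes from the hunk above; `kMaxRepairMemoryPerRange` is an illustrative constant standing in for `repair::task_manager_module::max_repair_memory_per_range`, and the `live_peers > 0` guard is added here only so the sketch is total):

```cpp
#include <cassert>
#include <cstddef>

// Illustrative cap; the real value lives in
// repair::task_manager_module::max_repair_memory_per_range.
constexpr std::size_t kMaxRepairMemoryPerRange = 16 * 1024 * 1024;

// Mirrors get_max_row_buf_size(): RPC-stream-capable repairs get the
// large buffer, legacy repairs get 256 KiB, and small-table
// optimization divides the result by the number of live peers.
std::size_t max_row_buf_size(bool rpc_stream, bool small_table_opt,
                             std::size_t live_peers) {
    std::size_t size = rpc_stream ? kMaxRepairMemoryPerRange : 256 * 1024;
    if (small_table_opt && live_peers > 0) {
        size /= live_peers;
    }
    return size;
}
```

Note the second hunk in this file makes the memory-budget request use the same `max_row_buf_size` instead of the raw per-range constant, keeping the semaphore grab consistent with the smaller buffers.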
// Step A: Negotiate sync boundary to use
@@ -3096,7 +3103,7 @@ public:
auto& mem_sem = _shard_task.rs.memory_sem();
auto max = _shard_task.rs.max_repair_memory();
auto wanted = (_all_live_peer_nodes.size() + 1) * repair::task_manager_module::max_repair_memory_per_range;
auto wanted = (_all_live_peer_nodes.size() + 1) * max_row_buf_size;
wanted = std::min(max, wanted);
rlogger.trace("repair[{}]: Started to get memory budget, wanted={}, available={}, max_repair_memory={}",
_shard_task.global_repair_id.uuid(), wanted, mem_sem.current(), max);

View File

@@ -13,9 +13,12 @@
#include "service/topology_guard.hh"
#include "streaming/stream_reason.hh"
#include "tasks/task_manager.hh"
#include <cstddef>
namespace repair {
class remote_metas;
class repair_task_impl : public tasks::task_manager::task::impl {
protected:
streaming::stream_reason _reason;
@@ -74,7 +77,6 @@ protected:
future<> run() override;
virtual future<std::optional<double>> expected_total_workload() const override;
virtual std::optional<double> expected_children_number() const override;
};
class data_sync_repair_task_impl : public repair_task_impl {
@@ -103,30 +105,72 @@ protected:
future<> run() override;
virtual future<std::optional<double>> expected_total_workload() const override;
virtual std::optional<double> expected_children_number() const override;
};
struct hosts_and_tables {
std::unordered_set<locator::host_id> hosts;
std::unordered_set<table_id> tables;
};
struct remote_data {
utils::chunked_vector<tablet_repair_task_meta> metas;
};
class remote_metas {
public:
using remote_data_ptr = foreign_ptr<lw_shared_ptr<remote_data>>;
private:
std::vector<remote_data_ptr> _metas_on_shards;
size_t _metas_count = 0;
remote_metas() : _metas_on_shards(smp::count) {}
public:
future<> for_each_local_meta(std::function<future<>(const tablet_repair_task_meta&)> func) const;
size_t size() const;
void clear();
future<hosts_and_tables> get_hosts_and_tables() const;
friend class remote_metas_builder;
};
class remote_metas_builder {
private:
remote_metas _remote_metas;
std::unordered_map<size_t, std::vector<tablet_repair_task_meta>> _pending_metas;
constexpr static size_t max_pending_metas_per_shard = 32;
public:
remote_metas_builder() : _remote_metas() {}
future<> add_on_shard(size_t shard_id, tablet_repair_task_meta meta);
future<remote_metas> build() &&;
private:
future<> flush(size_t shard_id);
future<> allocate_on_shard(size_t shard_id);
};
class tablet_repair_task_impl : public repair_task_impl {
private:
sstring _keyspace;
std::vector<sstring> _tables;
std::vector<tablet_repair_task_meta> _metas;
remote_metas _metas;
optimized_optional<abort_source::subscription> _abort_subscription;
std::optional<int> _ranges_parallelism;
size_t _metas_size = 0;
gc_clock::time_point _flush_time;
service::frozen_topology_guard _topo_guard;
utils::chunked_vector<locator::effective_replication_map_ptr> _erms;
bool _skip_flush;
public:
bool sched_by_scheduler = false;
public:
tablet_repair_task_impl(tasks::task_manager::module_ptr module, repair_uniq_id id, sstring keyspace, tasks::task_id parent_id, std::vector<sstring> tables, streaming::stream_reason reason, std::vector<tablet_repair_task_meta> metas, std::optional<int> ranges_parallelism, service::frozen_topology_guard topo_guard, bool skip_flush = false)
tablet_repair_task_impl(tasks::task_manager::module_ptr module, repair_uniq_id id, sstring keyspace, tasks::task_id parent_id, std::vector<sstring> tables, streaming::stream_reason reason, remote_metas metas, std::optional<int> ranges_parallelism, service::frozen_topology_guard topo_guard, utils::chunked_vector<locator::effective_replication_map_ptr> erms, bool skip_flush = false)
: repair_task_impl(module, id.uuid(), id.id, "keyspace", keyspace, "", "", parent_id, reason)
, _keyspace(std::move(keyspace))
, _tables(std::move(tables))
, _metas(std::move(metas))
, _ranges_parallelism(ranges_parallelism)
, _topo_guard(topo_guard)
, _erms(std::move(erms))
, _skip_flush(skip_flush)
{
}
@@ -145,7 +189,6 @@ protected:
future<> run() override;
virtual future<std::optional<double>> expected_total_workload() const override;
virtual std::optional<double> expected_children_number() const override;
};
class shard_repair_task_impl : public repair_task_impl {

View File

@@ -355,7 +355,7 @@ database::view_update_read_concurrency_sem() {
return *sem;
}
database::database(const db::config& cfg, database_config dbcfg, service::migration_notifier& mn, gms::feature_service& feat, const locator::shared_token_metadata& stm,
database::database(const db::config& cfg, database_config dbcfg, service::migration_notifier& mn, gms::feature_service& feat, locator::shared_token_metadata& stm,
compaction_manager& cm, sstables::storage_manager& sstm, lang::manager& langm, sstables::directory_semaphore& sst_dir_sem, sstable_compressor_factory& scf, const abort_source& abort, utils::cross_shard_barrier barrier)
: _stats(make_lw_shared<db_stats>())
, _user_types(std::make_shared<db_user_types_storage>(*this))
@@ -1097,6 +1097,7 @@ future<> database::drop_table_on_all_shards(sharded<database>& sharded_db, shard
return table_shards->stop();
});
f.get(); // re-throw exception from truncate() if any
co_await sys_ks.local().remove_truncation_records(table_shards->schema()->id());
co_await table_shards->destroy_storage();
}
@@ -2671,7 +2672,8 @@ future<> database::truncate(db::system_keyspace& sys_ks, column_family& cf, cons
// since we don't want to leave behind data on disk with RP lower than the one we set
// in the truncation table.
if (st.did_flush && rp != db::replay_position() && st.low_mark < rp) {
on_internal_error(dblog, "Data written after truncation time was incorrectly truncated. Truncate is known to not work well with concurrent writes. Retry!");
dblog.warn("Data in table {}.{} was written after the truncation time and was incorrectly truncated. truncated_at: {} low_mark: {} rp: {}",
cf.schema()->ks_name(), cf.schema()->cf_name(), truncated_at, st.low_mark, rp);
}
if (rp == db::replay_position()) {
// If this shard had no mutations, st.low_mark will be an empty, default constructed
@@ -3230,11 +3232,34 @@ database::on_effective_service_levels_cache_reloaded() {
co_return;
}
void database::check_rf_rack_validity(const locator::token_metadata_ptr tmptr) const {
SCYLLA_ASSERT(get_config().rf_rack_valid_keyspaces());
void database::check_rf_rack_validity(const bool enforce_rf_rack_valid_keyspaces, const locator::token_metadata_ptr tmptr) const {
const auto& keyspaces = get_keyspaces();
std::vector<std::string_view> invalid_keyspaces{};
for (const auto& [name, info] : get_keyspaces()) {
locator::assert_rf_rack_valid_keyspace(name, tmptr, info.get_replication_strategy());
for (const auto& [name, info] : keyspaces) {
try {
locator::assert_rf_rack_valid_keyspace(name, tmptr, info.get_replication_strategy());
} catch (...) {
if (enforce_rf_rack_valid_keyspaces) {
throw;
}
invalid_keyspaces.push_back(std::string_view(name));
}
}
if (invalid_keyspaces.size() == 0) {
dblog.info("All keyspaces are RF-rack-valid");
} else {
const auto ks_list = invalid_keyspaces
| std::views::join_with(std::string_view(", "))
| std::ranges::to<std::string>();
dblog.warn("Some existing keyspaces are not RF-rack-valid, i.e. the replication factor "
"does not match the number of racks in one of the datacenters. That may reduce "
"availability in case of a failure (cf. "
"https://docs.scylladb.com/manual/stable/reference/glossary.html#term-RF-rack-valid-keyspace). "
"Those keyspaces are: {}", ks_list);
}
}
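The warning path above joins the offending keyspace names with the C++23 ranges pipeline `std::views::join_with(", ") | std::ranges::to<std::string>()`. For readers on older toolchains, a plain C++17 equivalent of that join (the function name and default separator are illustrative):

```cpp
#include <cassert>
#include <cstddef>
#include <string>
#include <string_view>
#include <vector>

// Plain-loop equivalent of:
//   names | std::views::join_with(", ") | std::ranges::to<std::string>()
std::string join_names(const std::vector<std::string_view>& names,
                       std::string_view sep = ", ") {
    std::string out;
    for (std::size_t i = 0; i < names.size(); ++i) {
        if (i != 0) {
            out += sep;   // separator between elements, not after the last
        }
        out += names[i];
    }
    return out;
}
```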

View File

@@ -1599,7 +1599,7 @@ private:
service::migration_notifier& _mnotifier;
gms::feature_service& _feat;
std::vector<std::any> _listeners;
const locator::shared_token_metadata& _shared_token_metadata;
locator::shared_token_metadata& _shared_token_metadata;
lang::manager& _lang_manager;
reader_concurrency_semaphore_group _reader_concurrency_semaphores_group;
@@ -1684,7 +1684,7 @@ public:
// (keyspace/table definitions, column mappings etc.)
future<> parse_system_tables(distributed<service::storage_proxy>&, sharded<db::system_keyspace>&);
database(const db::config&, database_config dbcfg, service::migration_notifier& mn, gms::feature_service& feat, const locator::shared_token_metadata& stm,
database(const db::config&, database_config dbcfg, service::migration_notifier& mn, gms::feature_service& feat, locator::shared_token_metadata& stm,
compaction_manager& cm, sstables::storage_manager& sstm, lang::manager& langm, sstables::directory_semaphore& sst_dir_sem, sstable_compressor_factory&, const abort_source& abort,
utils::cross_shard_barrier barrier = utils::cross_shard_barrier(utils::cross_shard_barrier::solo{}) /* for single-shard usage */);
database(database&&) = delete;
@@ -1719,7 +1719,7 @@ public:
return _compaction_manager;
}
const locator::shared_token_metadata& get_shared_token_metadata() const { return _shared_token_metadata; }
locator::shared_token_metadata& get_shared_token_metadata() const { return _shared_token_metadata; }
locator::token_metadata_ptr get_token_metadata_ptr() const { return _shared_token_metadata.get(); }
const locator::token_metadata& get_token_metadata() const { return *_shared_token_metadata.get(); }
@@ -1991,11 +1991,16 @@ public:
// Verify that the existing keyspaces are all RF-rack-valid.
//
// Result:
// * If `enforce_rf_rack_valid_keyspaces`, throws an exception with a relevant message
// if there is a keyspace that violates RF-rack-validity.
// * If not `enforce_rf_rack_valid_keyspaces`, a warning will be printed for each keyspace
// that is not RF-rack-valid, but no exception should be produced.
//
// Preconditions:
// * the option `rf_rack_valid_keyspaces` is enabled,
// * the `locator::topology` instance corresponding to the passed `locator::token_metadata_ptr`
// must contain a complete list of racks and data centers in the cluster.
void check_rf_rack_validity(const locator::token_metadata_ptr) const;
void check_rf_rack_validity(const bool enforce_rf_rack_valid_keyspaces, const locator::token_metadata_ptr) const;
private:
// SSTable sampling might require considerable amounts of memory,
// so we want to limit the number of concurrent sampling operations.

View File

@@ -2737,7 +2737,17 @@ void tablet_storage_group_manager::update_effective_replication_map(const locato
_storage_groups[tid.value()] = allocate_storage_group(*new_tablet_map, tid, std::move(range));
tablet_migrating_in = true;
} else if (_storage_groups.contains(tid.value()) && locator::is_post_cleanup(this_replica, new_tablet_map->get_tablet_info(tid), transition_info)) {
// The storage group should be cleaned up and stopped at this point, usually by the tablet cleanup stage,
// unless the storage group was allocated after tablet cleanup was completed for this node. This could
// happen if the node was restarted after tablet cleanup was run but before moving to the next stage. To
// handle this case we stop the storage group here if it's not stopped already.
auto sg = _storage_groups[tid.value()];
remove_storage_group(tid.value());
(void) with_gate(_t.async_gate(), [sg] {
return sg->stop("tablet post-cleanup").then([sg] {});
});
}
}
@@ -3175,8 +3185,13 @@ future<db::replay_position> table::discard_sstables(db_clock::time_point truncat
// materialized view was created right after truncation started, and it
// would not have compaction disabled when this function is called on it.
if (!schema()->is_view()) {
auto compaction_disabled = std::ranges::all_of(storage_groups() | std::views::values,
std::mem_fn(&storage_group::compaction_disabled));
// Check if the storage groups have compaction disabled, but also check if they have been stopped.
// This is to avoid races with tablet cleanup which stops the storage group, and then stops the
// compaction groups. We could have a situation where compaction couldn't have been disabled by
// truncate because the storage group has been stopped, but the compaction groups have not yet been stopped.
auto compaction_disabled = std::ranges::all_of(storage_groups() | std::views::values, [] (const storage_group_ptr& sgp) {
return sgp->async_gate().is_closed() || sgp->compaction_disabled();
});
if (!compaction_disabled) {
utils::on_internal_error(fmt::format("compaction not disabled on table {}.{} during TRUNCATE",
schema()->ks_name(), schema()->cf_name()));
@@ -4075,6 +4090,12 @@ future<> storage_group::stop(sstring reason) noexcept {
// picking this group that is being stopped.
auto closed_gate_fut = _async_gate.close();
co_await utils::get_local_injector().inject("wait_before_stop_compaction_groups", [] (auto& handler) -> future<> {
dblog.info("wait_before_stop_compaction_groups: wait");
co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::minutes{5});
dblog.info("wait_before_stop_compaction_groups: done");
}, false);
// Synchronizes with in-flight writes if any, and also takes care of flushing if needed.
// The reason we have to stop main cg first, is because an ongoing split always run in main cg

View File

@@ -67,5 +67,6 @@
"_queue_name + \"_tx_frags\"": ["queue"]
"_queue_name + \"_rx_frags\"": ["queue"]
"alternator/stats.cc":
allowmismatch: true
params:
group_name: "alternator"
group_name: "alternator"

View File

@@ -999,6 +999,8 @@ class managed_bytes:
inf = gdb.selected_inferior()
def to_bytes(data, size):
if size == 0:
return b''
return bytes(inf.read_memory(data, size))
if self.is_inline():

View File

@@ -56,7 +56,9 @@ migration_manager::migration_manager(migration_notifier& notifier, gms::feature_
, _group0_barrier(this_shard_id() == 0 ?
std::function<future<>()>([this] () -> future<> {
// This will run raft barrier and will sync schema with the leader
(void)co_await start_group0_operation();
return with_scheduling_group(_storage_proxy.get_db().local().get_gossip_scheduling_group(), [this] {
return start_group0_operation().discard_result();
});
}) :
std::function<future<>()>([this] () -> future<> {
co_await container().invoke_on(0, [] (migration_manager& mm) -> future<> {

View File

@@ -100,6 +100,10 @@ bool raft_service_level_distributed_data_accessor::is_v2() const {
return true;
}
bool raft_service_level_distributed_data_accessor::can_use_effective_service_level_cache() const {
return !auth::legacy_mode(_qp);
}
::shared_ptr<service_level_controller::service_level_distributed_data_accessor> raft_service_level_distributed_data_accessor::upgrade_to_v2(cql3::query_processor& qp, service::raft_group0_client& group0_client) const {
return nullptr;
}

View File

@@ -39,6 +39,7 @@ public:
virtual future<> commit_mutations(service::group0_batch&& mc, abort_source& as) const override;
virtual bool is_v2() const override;
virtual bool can_use_effective_service_level_cache() const override;
virtual ::shared_ptr<service_level_distributed_data_accessor> upgrade_to_v2(cql3::query_processor& qp, service::raft_group0_client& group0_client) const override;
};

View File

@@ -42,6 +42,20 @@ constexpr const char* scheduling_group_name_pattern = "sl:{}";
constexpr const char* deleted_scheduling_group_name_pattern = "sl_deleted:{}";
constexpr const char* temp_scheduling_group_name_pattern = "sl_temp:{}";
service_level_controller::auth_integration::auth_integration(service_level_controller& sl_controller, auth::service& auth_service)
: _sl_controller(sl_controller)
, _auth_service(auth_service)
, _stop_gate("service_level_controller_auth_integration_stop_gate")
{}
future<> service_level_controller::auth_integration::stop() {
co_await _stop_gate.close();
}
void service_level_controller::auth_integration::clear_cache() {
_cache.clear();
}
service_level_controller::service_level_controller(sharded<auth::service>& auth_service, locator::shared_token_metadata& tm, abort_source& as, service_level_options default_service_level_config, scheduling_group default_scheduling_group, bool destroy_default_sg_on_drain)
: _sl_data_accessor(nullptr)
, _auth_service(auth_service)
@@ -268,7 +282,11 @@ future<> service_level_controller::update_service_levels_cache(qos::query_contex
sl_logger.info("service level \"{}\" was updated. New values: (timeout: {}, workload_type: {}, shares: {})",
sl.first, sl.second.timeout, sl.second.workload, sl.second.shares);
}
_effective_service_levels_db.clear();
if (_auth_integration) {
_auth_integration->clear_cache();
}
for (auto&& sl : service_levels_for_add) {
bool make_room = false;
std::map<sstring, service_level>::reverse_iterator it;
@@ -300,17 +318,24 @@ future<> service_level_controller::update_service_levels_cache(qos::query_contex
});
}
future<> service_level_controller::update_effective_service_levels_cache() {
future<> service_level_controller::auth_integration::reload_cache() {
SCYLLA_ASSERT(this_shard_id() == global_controller);
if (!_auth_service.local_is_initialized()) {
// Because cache update is triggered in `topology_state_load()`, the auth service
// might not be initialized yet.
const auto _ = _stop_gate.hold();
if (!_sl_controller._sl_data_accessor || !_sl_controller._sl_data_accessor->can_use_effective_service_level_cache()) {
// Don't populate the effective service level cache until auth is migrated to raft.
// Otherwise, executing the code that follows would read roles data
// from system_auth tables; that would be bad because reading from
// those tables is prone to timeouts, and `reload_cache`
// is called from the group0 context - a timeout like that would render
// group0 non-functional on the node until restart.
//
// See scylladb/scylladb#24963 for more details.
co_return;
}
auto units = co_await get_units(_global_controller_db->notifications_serializer, 1);
auto units = co_await get_units(_sl_controller._global_controller_db->notifications_serializer, 1);
auto& role_manager = _auth_service.local().underlying_role_manager();
auto& role_manager = _auth_service.underlying_role_manager();
const auto all_roles = co_await role_manager.query_all();
const auto hierarchy = co_await role_manager.query_all_directly_granted();
// includes only roles with attached service level
@@ -328,11 +353,11 @@ future<> service_level_controller::update_effective_service_levels_cache() {
std::optional<service_level_options> sl_options;
if (auto sl_name_it = attributes.find(role); sl_name_it != attributes.end()) {
if (auto sl_it = _service_levels_db.find(sl_name_it->second); sl_it != _service_levels_db.end()) {
if (auto sl_it = _sl_controller._service_levels_db.find(sl_name_it->second); sl_it != _sl_controller._service_levels_db.end()) {
sl_options = sl_it->second.slo;
sl_options->init_effective_names(sl_name_it->second);
sl_options->shares_name = sl_name_it->second;
} else if (_effectively_dropped_sls.contains(sl_name_it->second)) {
} else if (_sl_controller._effectively_dropped_sls.contains(sl_name_it->second)) {
// service level might be effective dropped, then it's not present in `_service_levels_db`
sl_logger.warn("Service level {} is effectively dropped and its values are ignored.", sl_name_it->second);
} else {
@@ -360,8 +385,12 @@ future<> service_level_controller::update_effective_service_levels_cache() {
co_await coroutine::maybe_yield();
}
co_await container().invoke_on_all([effective_sl_map] (service_level_controller& sl_controller) -> future<> {
sl_controller._effective_service_levels_db = std::move(effective_sl_map);
co_await _sl_controller.container().invoke_on_all([effective_sl_map] (service_level_controller& sl_controller) -> future<> {
// We probably cannot predict if `auth_integration` is still in place on another shard,
// so let's play it safe here.
if (sl_controller._auth_integration) {
sl_controller._auth_integration->_cache = std::move(effective_sl_map);
}
co_await sl_controller.notify_effective_service_levels_cache_reloaded();
});
}
@@ -371,7 +400,10 @@ future<> service_level_controller::update_cache(update_both_cache_levels update_
if (update_both_cache_levels) {
co_await update_service_levels_cache(ctx);
}
co_await update_effective_service_levels_cache();
if (_auth_integration) {
co_await _auth_integration->reload_cache();
}
}
void service_level_controller::stop_legacy_update_from_distributed_data() {
@@ -383,14 +415,16 @@ void service_level_controller::stop_legacy_update_from_distributed_data() {
_global_controller_db->dist_data_update_aborter.request_abort();
}
future<std::optional<service_level_options>> service_level_controller::find_effective_service_level(const sstring& role_name) {
if (_sl_data_accessor->is_v2()) {
auto effective_sl_it = _effective_service_levels_db.find(role_name);
co_return effective_sl_it != _effective_service_levels_db.end()
future<std::optional<service_level_options>> service_level_controller::auth_integration::find_effective_service_level(const sstring& role_name) {
const auto _ = _stop_gate.hold();
if (_sl_controller._sl_data_accessor->can_use_effective_service_level_cache()) {
auto effective_sl_it = _cache.find(role_name);
co_return effective_sl_it != _cache.end()
? std::optional<service_level_options>(effective_sl_it->second)
: std::nullopt;
} else {
auto& role_manager = _auth_service.local().underlying_role_manager();
auto& role_manager = _auth_service.underlying_role_manager();
auto roles = co_await role_manager.query_granted(role_name, auth::recursive_role_query::yes);
// converts a list of roles into the chosen service level.
@@ -401,8 +435,8 @@ future<std::optional<service_level_options>> service_level_controller::find_effe
if (!sl_name) {
return std::nullopt;
}
auto sl_it = _service_levels_db.find(*sl_name);
if ( sl_it == _service_levels_db.end()) {
auto sl_it = _sl_controller._service_levels_db.find(*sl_name);
if ( sl_it == _sl_controller._service_levels_db.end()) {
return std::nullopt;
}
@@ -427,17 +461,27 @@ future<std::optional<service_level_options>> service_level_controller::find_effe
}
}
std::optional<service_level_options> service_level_controller::find_cached_effective_service_level(const sstring& role_name) {
if (!_sl_data_accessor->is_v2()) {
future<std::optional<service_level_options>> service_level_controller::find_effective_service_level(const sstring& role_name) {
SCYLLA_ASSERT(_auth_integration != nullptr);
return _auth_integration->find_effective_service_level(role_name);
}
std::optional<service_level_options> service_level_controller::auth_integration::find_cached_effective_service_level(const sstring& role_name) {
if (!_sl_controller._sl_data_accessor->is_v2()) {
return std::nullopt;
}
auto effective_sl_it = _effective_service_levels_db.find(role_name);
return effective_sl_it != _effective_service_levels_db.end()
auto effective_sl_it = _cache.find(role_name);
return effective_sl_it != _cache.end()
? std::optional<service_level_options>(effective_sl_it->second)
: std::nullopt;
}
std::optional<service_level_options> service_level_controller::find_cached_effective_service_level(const sstring& role_name) {
SCYLLA_ASSERT(_auth_integration != nullptr);
return _auth_integration->find_cached_effective_service_level(role_name);
}
future<> service_level_controller::notify_service_level_added(sstring name, service_level sl_data) {
return seastar::async( [this, name, sl_data] {
service_level_info sl_info = {
@@ -534,17 +578,24 @@ scheduling_group service_level_controller::get_scheduling_group(sstring service_
}
}
future<scheduling_group> service_level_controller::get_user_scheduling_group(const std::optional<auth::authenticated_user>& usr) {
future<scheduling_group> service_level_controller::auth_integration::get_user_scheduling_group(const std::optional<auth::authenticated_user>& usr) {
const auto _ = _stop_gate.hold();
if (usr && usr->name) {
auto sl_opt = co_await find_effective_service_level(*usr->name);
auto& sl_name = (sl_opt && sl_opt->shares_name) ? *sl_opt->shares_name : default_service_level_name;
co_return get_scheduling_group(sl_name);
co_return _sl_controller.get_scheduling_group(sl_name);
}
else {
co_return get_default_scheduling_group();
co_return _sl_controller.get_default_scheduling_group();
}
}
future<scheduling_group> service_level_controller::get_user_scheduling_group(const std::optional<auth::authenticated_user>& usr) {
SCYLLA_ASSERT(_auth_integration != nullptr);
return _auth_integration->get_user_scheduling_group(usr);
}
std::optional<sstring> service_level_controller::get_active_service_level() {
unsigned sched_idx = internal::scheduling_group_index(current_scheduling_group());
if (_sl_lookup[sched_idx].first) {
@@ -982,8 +1033,10 @@ future<std::vector<cql3::description>> service_level_controller::describe_create
co_return result;
}
future<std::vector<cql3::description>> service_level_controller::describe_attached_service_levels() {
const auto attached_service_levels = co_await _auth_service.local().underlying_role_manager().query_attribute_for_all("service_level");
future<std::vector<cql3::description>> service_level_controller::auth_integration::describe_attached_service_levels() {
const auto _ = _stop_gate.hold();
const auto attached_service_levels = co_await _auth_service.underlying_role_manager().query_attribute_for_all("service_level");
std::vector<cql3::description> result{};
result.reserve(attached_service_levels.size());
@@ -1012,8 +1065,10 @@ future<std::vector<cql3::description>> service_level_controller::describe_attach
}
future<std::vector<cql3::description>> service_level_controller::describe_service_levels() {
SCYLLA_ASSERT(_auth_integration != nullptr);
std::vector<cql3::description> created_service_levels_descs = co_await describe_created_service_levels();
std::vector<cql3::description> attached_service_levels_descs = co_await describe_attached_service_levels();
std::vector<cql3::description> attached_service_levels_descs = co_await _auth_integration->describe_attached_service_levels();
created_service_levels_descs.insert(created_service_levels_descs.end(),
std::make_move_iterator(attached_service_levels_descs.begin()), std::make_move_iterator(attached_service_levels_descs.end()));
@@ -1021,6 +1076,19 @@ future<std::vector<cql3::description>> service_level_controller::describe_servic
co_return created_service_levels_descs;
}
void service_level_controller::register_auth_integration(auth::service& auth_service) {
SCYLLA_ASSERT(_auth_integration == nullptr);
_auth_integration = std::make_unique<auth_integration>(*this, auth_service);
}
future<> service_level_controller::unregister_auth_integration() {
SCYLLA_ASSERT(_auth_integration != nullptr);
// First, prevent new tasks from reaching `auth_integration`.
auto tmp = std::exchange(_auth_integration, nullptr);
// Now we can stop it.
co_await tmp->stop();
}
future<shared_ptr<service_level_controller::service_level_distributed_data_accessor>>
get_service_level_distributed_data_accessor_for_current_version(
db::system_keyspace& sys_ks,

View File

@@ -114,11 +114,70 @@ public:
virtual future<> commit_mutations(service::group0_batch&& mc, abort_source& as) const = 0;
virtual bool is_v2() const = 0;
Returns whether the effective service level cache can be populated and used.
This is equivalent to checking whether auth has been migrated to raft.
virtual bool can_use_effective_service_level_cache() const = 0;
Returns the v2 (raft) data accessor. If the data accessor is already a raft one, returns nullptr.
virtual ::shared_ptr<service_level_distributed_data_accessor> upgrade_to_v2(cql3::query_processor& qp, service::raft_group0_client& group0_client) const = 0;
};
using service_level_distributed_data_accessor_ptr = ::shared_ptr<service_level_distributed_data_accessor>;
class auth_integration {
private:
friend class service_level_controller;
private:
service_level_controller& _sl_controller;
auth::service& _auth_service;
/// Mappings `role name` -> `service level options`.
std::map<sstring, service_level_options> _cache;
/// This gate is supposed to synchronize `stop` with other tasks that
/// this interface performs. Because of that, EVERY coroutine function
/// of this class should hold it throughout its execution.
///
/// Failing to do so may result in segmentation faults and the like.
seastar::named_gate _stop_gate;
public:
auth_integration(service_level_controller&, auth::service&);
future<> stop();
/// Find the effective service level for a given role.
/// If there is no applicable service level for it, `std::nullopt` is returned instead.
future<std::optional<service_level_options>> find_effective_service_level(const sstring& role_name);
/// Synchronous version of `find_effective_service_level` that only checks the cache.
std::optional<service_level_options> find_cached_effective_service_level(const sstring& role_name);
future<scheduling_group> get_user_scheduling_group(const std::optional<auth::authenticated_user>& usr);
template <typename Func, typename Ret = std::invoke_result_t<Func>>
requires std::invocable<Func>
futurize_t<Ret> with_user_service_level(const std::optional<auth::authenticated_user>& user, Func&& func) {
// No need to hold `_stop_gate` here. It'll be held during the call to `find_effective_service_level`,
// and after that it's not necessary. We do NOT hold it here to avoid postponing finishing `stop`.
if (user && user->name) {
const std::optional<service_level_options> maybe_sl_opts = co_await find_effective_service_level(*user->name);
const sstring& sl_name = maybe_sl_opts && maybe_sl_opts->shares_name
? *maybe_sl_opts->shares_name
: service_level_controller::default_service_level_name;
co_return co_await _sl_controller.with_service_level(sl_name, std::forward<Func>(func));
} else {
co_return co_await _sl_controller.with_service_level(service_level_controller::default_service_level_name, std::forward<Func>(func));
}
}
future<std::vector<cql3::description>> describe_attached_service_levels();
/// Must be executed on shard 0.
future<> reload_cache();
void clear_cache();
};
private:
struct global_controller_data {
service_levels_info static_configurations{};
@@ -147,8 +206,10 @@ private:
// service level name -> service_level object
std::map<sstring, service_level> _service_levels_db;
// role name -> effective service_level_options
std::map<sstring, service_level_options> _effective_service_levels_db;
// Invariant: Non-null strictly within the lifetime of `auth::service`.
std::unique_ptr<auth_integration> _auth_integration = nullptr;
// Keeps names of effectively dropped service levels. Those service levels exist in the table but are not present in the _service_levels_db cache
std::set<sstring> _effectively_dropped_sls;
std::pair<const sstring*, service_level*> _sl_lookup[max_scheduling_groups()];
@@ -172,6 +233,10 @@ public:
*/
future<> start();
void register_auth_integration(auth::service&);
future<> unregister_auth_integration();
void set_distributed_data_accessor(service_level_distributed_data_accessor_ptr sl_data_accessor);
/**
@@ -219,14 +284,8 @@ public:
template <typename Func, typename Ret = std::invoke_result_t<Func>>
requires std::invocable<Func>
futurize_t<Ret> with_user_service_level(const std::optional<auth::authenticated_user>& usr, Func&& func) {
if (usr && usr->name) {
return find_effective_service_level(*usr->name).then([this, func = std::move(func)] (std::optional<service_level_options> opts) mutable {
auto& service_level_name = (opts && opts->shares_name) ? *opts->shares_name : default_service_level_name;
return with_service_level(service_level_name, std::move(func));
});
} else {
return with_service_level(default_service_level_name, std::move(func));
}
SCYLLA_ASSERT(_auth_integration != nullptr);
return _auth_integration->with_user_service_level(usr, std::forward<Func>(func));
}
/**
@@ -298,15 +357,6 @@ public:
*/
future<> update_service_levels_cache(qos::query_context ctx = qos::query_context::unspecified);
/**
* Updates effective service levels cache.
* The method uses service levels cache (_service_levels_db)
* and data from auth tables.
* Must be executed on shard 0.
* @return a future that is resolved when the update is done
*/
future<> update_effective_service_levels_cache();
/**
* Service levels cache consists of two levels: service levels cache and effective service levels cache
* The second one is dependent on the first one.

View File

@@ -37,6 +37,10 @@ bool standard_service_level_distributed_data_accessor::is_v2() const {
return false;
}
bool standard_service_level_distributed_data_accessor::can_use_effective_service_level_cache() const {
return false;
}
::shared_ptr<service_level_controller::service_level_distributed_data_accessor> standard_service_level_distributed_data_accessor::upgrade_to_v2(cql3::query_processor& qp, service::raft_group0_client& group0_client) const {
return ::static_pointer_cast<service_level_controller::service_level_distributed_data_accessor>(
::make_shared<raft_service_level_distributed_data_accessor>(qp, group0_client));

View File

@@ -31,6 +31,7 @@ public:
virtual future<> commit_mutations(service::group0_batch&& mc, abort_source& as) const override { return make_ready_future(); }
virtual bool is_v2() const override;
virtual bool can_use_effective_service_level_cache() const override;
virtual ::shared_ptr<service_level_distributed_data_accessor> upgrade_to_v2(cql3::query_processor& qp, service::raft_group0_client& group0_client) const override;
};
}

View File

@@ -333,6 +333,7 @@ future<> group0_state_machine::load_snapshot(raft::snapshot_id id) {
if (_feature_service.compression_dicts) {
co_await _ss.compression_dictionary_updated_callback_all();
}
co_await _ss.update_service_levels_cache(qos::update_both_cache_levels::yes, qos::query_context::group0);
_ss._topology_state_machine.event.broadcast();
}

View File

@@ -414,12 +414,6 @@ future<group0_info> persistent_discovery::run(
}
future<> raft_group0::abort() {
if (_aborted) {
co_return;
}
_aborted = true;
group0_log.debug("Raft group0 service is aborting...");
co_await smp::invoke_on_all([this]() {
return uninit_rpc_verbs(_ms.local());
});
@@ -431,8 +425,6 @@ future<> raft_group0::abort() {
co_await std::move(_leadership_monitor);
co_await stop_group0();
group0_log.debug("Raft group0 service is aborted");
}
future<> raft_group0::start_server_for_group0(raft::group_id group0_id, service::storage_service& ss, cql3::query_processor& qp, service::migration_manager& mm, bool topology_change_enabled) {
@@ -543,7 +535,7 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
// created in the Raft-based recovery procedure). The persistent topology state is present on that node
// when it creates the new group 0. Also, it joins the new group 0 using legacy_handshaker, so there is
// no need to create a join request.
if (topology_change_enabled && qp.db().get_config().recovery_leader().empty()) {
if (topology_change_enabled && !qp.db().get_config().recovery_leader.is_set()) {
co_await ss.raft_initialize_discovery_leader(params);
}
@@ -718,7 +710,7 @@ future<> raft_group0::setup_group0_if_exist(db::system_keyspace& sys_ks, service
} else {
// We'll disable them once we complete the upgrade procedure.
}
} else if (qp.db().get_config().recovery_leader().empty()) {
} else if (!qp.db().get_config().recovery_leader.is_set()) {
// Scylla has bootstrapped earlier but group 0 ID is not present and we are not recovering from majority loss
// using the Raft-based procedure. This means we're upgrading.
// Upgrade will start through a feature listener created after we enter NORMAL state.

View File

@@ -133,7 +133,6 @@ class raft_group0 {
future<> _leadership_monitor = make_ready_future<>();
abort_source _leadership_monitor_as;
utils::updateable_value_source<bool> _leadership_observable;
bool _aborted = false;
public:
// Passed to `setup_group0` when replacing a node.

View File

@@ -600,6 +600,8 @@ private:
++p->get_stats().received_mutations;
p->get_stats().forwarded_mutations += forward_host_id.size();
co_await utils::get_local_injector().inject("storage_proxy_write_response_pause", utils::wait_for_message(5min));
if (auto stale = _sp.apply_fence(fence, src_addr)) {
errors.count += (forward_host_id.size() + 1);
errors.local = std::move(*stale);
@@ -1101,26 +1103,23 @@ private:
global_request_id = guard.new_group0_state_id();
std::vector<canonical_mutation> updates;
topology_mutation_builder builder(guard.write_timestamp());
topology_request_tracking_mutation_builder trbuilder(global_request_id, _sp._features.topology_requests_type_column);
trbuilder.set_truncate_table_data(table_id)
.set("done", false)
.set("start_time", db_clock::now());
if (!_sp._features.topology_global_request_queue) {
builder.set_global_topology_request(global_topology_request::truncate_table)
.set_global_topology_request_id(global_request_id);
} else {
builder.queue_global_topology_request_id(global_request_id);
trbuilder.set("request_type", global_topology_request::truncate_table);
}
updates.emplace_back(builder.build());
updates.emplace_back(topology_request_tracking_mutation_builder(global_request_id, _sp._features.topology_requests_type_column)
.set_truncate_table_data(table_id)
.set("done", false)
.set("start_time", db_clock::now())
.set("request_type", global_topology_request::truncate_table)
.build());
slogger.info("Creating TRUNCATE global topology request for table {}.{}", ks_name, cf_name);
topology_change change{std::move(updates)};
topology_change change{{builder.build(), trbuilder.build()}};
sstring reason = "Truncating table";
group0_command g0_cmd = _group0_client.prepare_command(std::move(change), guard, reason);
try {
@@ -1615,6 +1614,10 @@ public:
return _type == db::write_type::VIEW;
}
bool is_batch() const noexcept {
return _type == db::write_type::BATCH;
}
void set_cdc_operation_result_tracker(lw_shared_ptr<cdc::operation_result_tracker> tracker) {
_cdc_operation_result_tracker = std::move(tracker);
}
@@ -2120,7 +2123,7 @@ paxos_response_handler::begin_and_repair_paxos(client_state& cs, unsigned& conte
// create_write_response_handler is overloaded for paxos::proposal and will
// create cas_mutation holder, which consequently will ensure paxos::learn is
// used.
auto f = _proxy->mutate_internal(std::move(m), db::consistency_level::ANY, false, tr_state, _permit, _timeout)
auto f = _proxy->mutate_internal(std::move(m), db::consistency_level::ANY, tr_state, _permit, _timeout)
.then(utils::result_into_future<result<>>);
// TODO: provided commits did not invalidate the prepare we just did above (which they
@@ -2472,7 +2475,7 @@ future<> paxos_response_handler::learn_decision(lw_shared_ptr<paxos::proposal> d
return v.schema()->id() == base_tbl_id;
});
if (!mutations.empty()) {
f_cdc = _proxy->mutate_internal(std::move(mutations), _cl_for_learn, false, tr_state, _permit, _timeout, std::move(tracker))
f_cdc = _proxy->mutate_internal(std::move(mutations), _cl_for_learn, tr_state, _permit, _timeout, {}, std::move(tracker))
.then(utils::result_into_future<result<>>);
}
}
@@ -2480,7 +2483,7 @@ future<> paxos_response_handler::learn_decision(lw_shared_ptr<paxos::proposal> d
// Path for the "base" mutations
std::array<std::tuple<lw_shared_ptr<paxos::proposal>, schema_ptr, shared_ptr<paxos_response_handler>, dht::token>, 1> m{std::make_tuple(std::move(decision), _schema, shared_from_this(), _key.token())};
future<> f_lwt = _proxy->mutate_internal(std::move(m), _cl_for_learn, false, tr_state, _permit, _timeout)
future<> f_lwt = _proxy->mutate_internal(std::move(m), _cl_for_learn, tr_state, _permit, _timeout)
.then(utils::result_into_future<result<>>);
co_await when_all_succeed(std::move(f_cdc), std::move(f_lwt)).discard_result();
@@ -3071,6 +3074,10 @@ struct hint_wrapper {
mutation mut;
};
struct batchlog_replay_mutation {
mutation mut;
};
struct read_repair_mutation {
std::unordered_map<locator::host_id, std::optional<mutation>> value;
locator::effective_replication_map_ptr ermp;
@@ -3084,6 +3091,12 @@ template <> struct fmt::formatter<service::hint_wrapper> : fmt::formatter<string
}
};
template <> struct fmt::formatter<service::batchlog_replay_mutation> : fmt::formatter<string_view> {
auto format(const service::batchlog_replay_mutation& h, fmt::format_context& ctx) const {
return fmt::format_to(ctx.out(), "batchlog_replay_mutation{{{}}}", h.mut);
}
};
template <>
struct fmt::formatter<service::read_repair_mutation> : fmt::formatter<string_view> {
auto format(const service::read_repair_mutation& m, fmt::format_context& ctx) const {
@@ -3449,6 +3462,12 @@ storage_proxy::create_write_response_handler(const hint_wrapper& h, db::consiste
std::move(permit), allow_limit, is_cancellable::yes);
}
result<storage_proxy::response_id_type>
storage_proxy::create_write_response_handler(const batchlog_replay_mutation& m, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit) {
return create_write_response_handler_helper(m.mut.schema(), m.mut.token(), std::make_unique<shared_mutation>(m.mut), cl, type, tr_state,
std::move(permit), allow_limit, is_cancellable::yes);
}
result<storage_proxy::response_id_type>
storage_proxy::create_write_response_handler(const read_repair_mutation& mut, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit) {
host_id_vector_replica_set endpoints;
@@ -3843,7 +3862,7 @@ future<result<>> storage_proxy::do_mutate(std::vector<mutation> mutations, db::c
}).begin();
return seastar::when_all_succeed(
mutate_counters(std::ranges::subrange(mutations.begin(), mid), cl, tr_state, permit, timeout),
mutate_internal(std::ranges::subrange(mid, mutations.end()), cl, false, tr_state, permit, timeout, std::move(cdc_tracker), allow_limit)
mutate_internal(std::ranges::subrange(mid, mutations.end()), cl, tr_state, permit, timeout, {}, std::move(cdc_tracker), allow_limit)
).then([] (std::tuple<result<>> res) {
// For now, only mutate_internal returns a result<>
return std::get<0>(std::move(res));
@@ -3852,8 +3871,10 @@ future<result<>> storage_proxy::do_mutate(std::vector<mutation> mutations, db::c
future<> storage_proxy::replicate_counter_from_leader(mutation m, db::consistency_level cl, tracing::trace_state_ptr tr_state,
clock_type::time_point timeout, service_permit permit) {
// we need to pass correct db::write_type in case of a timeout so that
// client doesn't attempt to retry the request.
// FIXME: do not send the mutation to itself, it has already been applied (it is not incorrect to do so, though)
return mutate_internal(std::array<mutation, 1>{std::move(m)}, cl, true, std::move(tr_state), std::move(permit), timeout)
return mutate_internal(std::array<mutation, 1>{std::move(m)}, cl, std::move(tr_state), std::move(permit), timeout, db::write_type::COUNTER)
.then(utils::result_into_future<result<>>);
}
@@ -3864,8 +3885,8 @@ future<> storage_proxy::replicate_counter_from_leader(mutation m, db::consistenc
*/
template<typename Range>
future<result<>>
storage_proxy::mutate_internal(Range mutations, db::consistency_level cl, bool counters, tracing::trace_state_ptr tr_state, service_permit permit,
std::optional<clock_type::time_point> timeout_opt, lw_shared_ptr<cdc::operation_result_tracker> cdc_tracker,
storage_proxy::mutate_internal(Range mutations, db::consistency_level cl, tracing::trace_state_ptr tr_state, service_permit permit,
std::optional<clock_type::time_point> timeout_opt, std::optional<db::write_type> type_opt, lw_shared_ptr<cdc::operation_result_tracker> cdc_tracker,
db::allow_per_partition_rate_limit allow_limit) {
if (std::ranges::empty(mutations)) {
return make_ready_future<result<>>(bo::success());
@@ -3874,12 +3895,10 @@ storage_proxy::mutate_internal(Range mutations, db::consistency_level cl, bool c
slogger.trace("mutate cl={}", cl);
mlogger.trace("mutations={}", mutations);
// If counters is set it means that we are replicating counter shards. There
// is no need for special handling anymore, since the leader has already
// done its job, but we need to return correct db::write_type in case of
// a timeout so that client doesn't attempt to retry the request.
auto type = counters ? db::write_type::COUNTER
: (std::next(std::begin(mutations)) == std::end(mutations) ? db::write_type::SIMPLE : db::write_type::UNLOGGED_BATCH);
// the parameter type_opt allows passing a specific type if needed for
// special handling, e.g. counters. Otherwise, a default type is used.
auto type = type_opt.value_or(std::next(std::begin(mutations)) == std::end(mutations) ? db::write_type::SIMPLE : db::write_type::UNLOGGED_BATCH);
utils::latency_counter lc;
lc.start();
@@ -4065,6 +4084,7 @@ storage_proxy::mutate_atomically_result(std::vector<mutation> mutations, db::con
};
future<> async_remove_from_batchlog() {
// delete batch
utils::get_local_injector().inject("storage_proxy_fail_remove_from_batchlog", [] { throw std::runtime_error("Error injection: failing remove from batchlog"); });
auto key = partition_key::from_exploded(*_schema, {uuid_type->decompose(_batch_uuid)});
auto now = service::client_state(service::client_state::internal_tag()).get_timestamp();
mutation m(_schema, key);
@@ -4136,13 +4156,15 @@ mutation storage_proxy::do_get_batchlog_mutation_for(schema_ptr schema, const st
for (auto& m : fm) {
ser::serialize(out, m);
}
return to_bytes(out.linearize());
return std::move(out).to_managed_bytes();
}();
mutation m(schema, key);
m.set_cell(clustering_key_prefix::make_empty(), to_bytes("version"), version, timestamp);
m.set_cell(clustering_key_prefix::make_empty(), to_bytes("written_at"), now, timestamp);
m.set_cell(clustering_key_prefix::make_empty(), to_bytes("data"), data_value(std::move(data)), timestamp);
// Avoid going through data_value and therefore `bytes`, as it can be large (#24809).
auto cdef_data = schema->get_column_definition(to_bytes("data"));
m.set_cell(clustering_key_prefix::make_empty(), *cdef_data, atomic_cell::make_live(*cdef_data->type, timestamp, std::move(data)));
return m;
}
@@ -4248,7 +4270,16 @@ future<> storage_proxy::send_hint_to_endpoint(frozen_mutation_and_schema fm_a_s,
future<> storage_proxy::send_hint_to_all_replicas(frozen_mutation_and_schema fm_a_s) {
std::array<hint_wrapper, 1> ms{hint_wrapper { fm_a_s.fm.unfreeze(fm_a_s.s) }};
return mutate_internal(std::move(ms), db::consistency_level::ALL, false, nullptr, empty_service_permit())
return mutate_internal(std::move(ms), db::consistency_level::ALL, nullptr, empty_service_permit())
.then(utils::result_into_future<result<>>);
}
future<> storage_proxy::send_batchlog_replay_to_all_replicas(std::vector<mutation> mutations, clock_type::time_point timeout) {
std::vector<batchlog_replay_mutation> ms = mutations | std::views::transform([] (auto&& m) {
return batchlog_replay_mutation(std::move(m));
}) | std::ranges::to<std::vector<batchlog_replay_mutation>>();
return mutate_internal(std::move(ms), db::consistency_level::ALL, nullptr, empty_service_permit(), timeout, db::write_type::BATCH)
.then(utils::result_into_future<result<>>);
}
@@ -4431,7 +4462,7 @@ future<result<>> storage_proxy::schedule_repair(locator::effective_replication_m
std::views::transform([ermp] (auto& v) { return read_repair_mutation{std::move(v), ermp}; }) |
// The transform above is destructive, materialize into a vector to make the range re-iterable.
std::ranges::to<std::vector<read_repair_mutation>>()
, cl, false, std::move(trace_state), std::move(permit));
, cl, std::move(trace_state), std::move(permit));
}
class abstract_read_resolver {
@@ -6953,7 +6984,7 @@ future<> storage_proxy::drain_on_shutdown() {
//NOTE: the thread is spawned here because there are delicate lifetime issues to consider
// and writing them down with plain futures is error-prone.
return async([this] {
cancel_write_handlers([] (const abstract_write_response_handler&) { return true; });
cancel_all_write_response_handlers().get();
_hints_resource_manager.stop().get();
});
}
@@ -6964,6 +6995,12 @@ future<> storage_proxy::abort_view_writes() {
});
}
future<> storage_proxy::abort_batch_writes() {
return async([this] {
cancel_write_handlers([] (const abstract_write_response_handler& handler) { return handler.is_batch(); });
});
}
future<>
storage_proxy::stop() {
return make_ready_future<>();
@@ -6977,4 +7014,13 @@ future<utils::chunked_vector<dht::token_range_endpoints>> storage_proxy::describ
return locator::describe_ring(_db.local(), _remote->gossiper(), keyspace, include_only_local_dc);
}
future<> storage_proxy::cancel_all_write_response_handlers() {
while (!_response_handlers.empty()) {
_response_handlers.begin()->second->timeout_cb();
if (!_response_handlers.empty()) {
co_await maybe_yield();
}
}
}
}
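The drain loop in cancel_all_write_response_handlers() above repeatedly fires the timeout callback of the first remaining handler; each callback unregisters its handler from the map, and the real code co_awaits maybe_yield() between iterations so the reactor is not stalled. A minimal synchronous sketch of the same pattern, with stand-in types in place of Seastar's handler registry:

```cpp
#include <cassert>
#include <functional>
#include <map>

// Illustrative stand-in for the handler registry drained by
// cancel_all_write_response_handlers(). Not the real Scylla types.
struct handler_registry {
    std::map<int, std::function<void()>> handlers;
    int cancelled = 0;

    // Mirrors timeout_cb(): completes the first handler and removes it.
    void timeout_first() {
        auto it = handlers.begin();
        auto cb = std::move(it->second);  // detach the callback before erasing
        handlers.erase(it);               // so it can run safely after removal
        cb();
        ++cancelled;
    }

    void cancel_all() {
        while (!handlers.empty()) {
            timeout_first();
            // real code: if (!handlers.empty()) co_await maybe_yield();
        }
    }
};
```

Taking `handlers.begin()` on every iteration (rather than iterating with a cached iterator) is what makes the loop safe when a callback removes entries or registers follow-up work.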


@@ -87,6 +87,7 @@ class mutation_holder;
class client_state;
class migration_manager;
struct hint_wrapper;
struct batchlog_replay_mutation;
struct read_repair_mutation;
using replicas_per_token_range = std::unordered_map<dht::token_range, std::vector<locator::host_id>>;
@@ -340,6 +341,7 @@ private:
const host_id_vector_topology_change& pending_endpoints, host_id_vector_topology_change, tracing::trace_state_ptr tr_state, storage_proxy::write_stats& stats, service_permit permit, db::per_partition_rate_limit::info rate_limit_info, is_cancellable);
result<response_id_type> create_write_response_handler(const mutation&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit);
result<response_id_type> create_write_response_handler(const hint_wrapper&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit);
result<response_id_type> create_write_response_handler(const batchlog_replay_mutation&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit);
result<response_id_type> create_write_response_handler(const read_repair_mutation&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit);
result<response_id_type> create_write_response_handler(const std::tuple<lw_shared_ptr<paxos::proposal>, schema_ptr, shared_ptr<paxos_response_handler>, dht::token>& proposal,
db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit);
@@ -427,7 +429,7 @@ private:
void unthrottle();
void handle_read_error(std::variant<exceptions::coordinator_exception_container, std::exception_ptr> failure, bool range);
template<typename Range>
-future<result<>> mutate_internal(Range mutations, db::consistency_level cl, bool counter_write, tracing::trace_state_ptr tr_state, service_permit permit, std::optional<clock_type::time_point> timeout_opt = { }, lw_shared_ptr<cdc::operation_result_tracker> cdc_tracker = { }, db::allow_per_partition_rate_limit allow_limit = db::allow_per_partition_rate_limit::no);
+future<result<>> mutate_internal(Range mutations, db::consistency_level cl, tracing::trace_state_ptr tr_state, service_permit permit, std::optional<clock_type::time_point> timeout_opt = { }, std::optional<db::write_type> type = { }, lw_shared_ptr<cdc::operation_result_tracker> cdc_tracker = { }, db::allow_per_partition_rate_limit allow_limit = db::allow_per_partition_rate_limit::no);
future<rpc::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>> query_nonsingular_mutations_locally(
schema_ptr s, lw_shared_ptr<query::read_command> cmd, const dht::partition_range_vector&& pr, tracing::trace_state_ptr trace_state,
clock_type::time_point timeout);
@@ -521,6 +523,8 @@ public:
bool is_me(gms::inet_address addr) const noexcept;
bool is_me(const locator::effective_replication_map& erm, locator::host_id id) const noexcept;
future<> cancel_all_write_response_handlers();
private:
bool only_me(const locator::effective_replication_map& erm, const host_id_vector_replica_set& replicas) const noexcept;
@@ -631,6 +635,8 @@ public:
future<> send_hint_to_all_replicas(frozen_mutation_and_schema fm_a_s);
future<> send_batchlog_replay_to_all_replicas(std::vector<mutation> mutations, clock_type::time_point timeout);
// Send a mutation to one specific remote target.
// Inspired by Cassandra's StorageProxy.sendToHintedEndpoints but without
// hinted handoff support, and just one target. See also
@@ -705,6 +711,7 @@ public:
void allow_replaying_hints() noexcept;
future<> drain_hints_for_left_nodes();
future<> abort_view_writes();
future<> abort_batch_writes();
future<> change_hints_host_filter(db::hints::host_filter new_filter);
const db::hints::host_filter& get_hints_host_filter() const;
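The mutate_internal() signature change above replaces the `bool counter_write` flag with a defaulted `std::optional<db::write_type>`, so a caller like batchlog replay can force `BATCH` while most call sites pass nothing. A simplified sketch of that design choice, with stand-in types (the enum and function here are illustrative, not the real Scylla API):

```cpp
#include <cassert>
#include <optional>
#include <string>

// Hypothetical stand-in for db::write_type.
enum class write_type { SIMPLE, BATCH, COUNTER };

// Sketch of the new-style signature: an absent type means "infer it",
// the way mutate_internal derives the write type from the mutation kind.
static std::string mutate(std::optional<write_type> type = {}) {
    write_type t = type.value_or(write_type::SIMPLE);  // inferred default
    switch (t) {
        case write_type::BATCH:   return "BATCH";
        case write_type::COUNTER: return "COUNTER";
        default:                  return "SIMPLE";
    }
}
```

Compared with a boolean flag, the optional enum scales to more than two write kinds without another parameter and keeps existing call sites unchanged.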


@@ -111,7 +111,6 @@
#include "node_ops/task_manager_module.hh"
#include "service/task_manager_module.hh"
#include "service/topology_mutation.hh"
#include "service/topology_coordinator.hh"
#include "cql3/query_processor.hh"
#include "service/qos/service_level_controller.hh"
#include "service/qos/standard_service_level_distributed_data_accessor.hh"
@@ -426,21 +425,6 @@ static locator::node::state to_topology_node_state(node_state ns) {
on_internal_error(rtlogger, format("unhandled node state: {}", ns));
}
future<storage_service::host_id_to_ip_map_t> storage_service::get_host_id_to_ip_map() {
host_id_to_ip_map_t map;
const auto ep_to_id_map = co_await _sys_ks.local().load_host_ids();
map.reserve(ep_to_id_map.size());
for (const auto& [ep, id]: ep_to_id_map) {
const auto [it, inserted] = map.insert({id, ep});
if (!inserted) {
on_internal_error(slogger, ::format("duplicate IP for host_id {}, first IP {}, second IP {}",
id, it->second, ep));
}
}
co_return map;
};
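The removed get_host_id_to_ip_map() above (moved into db::system_keyspace per the hunk that follows) inverts an IP-to-host_id map and treats a host_id seen with two different IPs as an internal error, detected via the `inserted` result of `unordered_map::insert`. A minimal sketch of that inversion, with `std::string` standing in for the real inet_address/host_id types:

```cpp
#include <cassert>
#include <string>
#include <unordered_map>

using ip_to_id = std::unordered_map<std::string, std::string>;
using id_to_ip = std::unordered_map<std::string, std::string>;

// Invert ip -> host_id into host_id -> ip; a host_id that appears with
// two different IPs is flagged (the real code calls on_internal_error()).
static id_to_ip invert(const ip_to_id& src, bool& duplicate_seen) {
    id_to_ip map;
    map.reserve(src.size());
    for (const auto& [ep, id] : src) {
        auto [it, inserted] = map.insert({id, ep});
        if (!inserted) {
            duplicate_seen = true;  // second IP for the same host_id
        }
    }
    return map;
}
```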
future<> storage_service::raft_topology_update_ip(locator::host_id id, gms::inet_address ip, const host_id_to_ip_map_t& host_id_to_ip_map, nodes_to_notify_after_sync* nodes_to_notify) {
const auto& t = _topology_state_machine._topology;
raft::server_id raft_id{id.uuid()};
@@ -623,7 +607,7 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
sys_ks_futures.reserve(t.left_nodes.size() + t.normal_nodes.size() + t.transition_nodes.size());
-auto id_to_ip_map = co_await get_host_id_to_ip_map();
+auto id_to_ip_map = co_await _sys_ks.local().get_host_id_to_ip_map();
for (const auto& id: t.left_nodes) {
locator::host_id host_id{id.uuid()};
auto ip = _address_map.find(host_id);
@@ -707,7 +691,6 @@ future<> storage_service::topology_state_load(state_change_hint hint) {
co_await _sl_controller.invoke_on_all([this] (qos::service_level_controller& sl_controller) {
sl_controller.upgrade_to_v2(_qp, _group0->client());
});
co_await update_service_levels_cache(qos::update_both_cache_levels::yes, qos::query_context::group0);
// the view_builder is migrated to v2 in view_builder::migrate_to_v2.
// it writes a v2 version mutation as topology_change, then we get here
@@ -740,9 +723,7 @@ future<> storage_service::topology_state_load(state_change_hint hint) {
auto saved_tmpr = get_token_metadata_ptr();
{
auto tmlock = co_await get_token_metadata_lock();
-auto tmptr = make_token_metadata_ptr(token_metadata::config {
-get_token_metadata().get_topology().get_config()
-});
+auto tmptr = _shared_token_metadata.make_token_metadata_ptr();
tmptr->invalidate_cached_rings();
tmptr->set_version(_topology_state_machine._topology.version);
@@ -817,10 +798,6 @@ future<> storage_service::topology_state_load(state_change_hint hint) {
for (const auto& gen_id : _topology_state_machine._topology.committed_cdc_generations) {
rtlogger.trace("topology_state_load: process committed cdc generation {}", gen_id);
co_await utils::get_local_injector().inject("topology_state_load_before_update_cdc", [](auto& handler) -> future<> {
rtlogger.info("topology_state_load_before_update_cdc hit, wait for message");
co_await handler.wait_for_message(db::timeout_clock::now() + std::chrono::minutes(5));
});
co_await _cdc_gens.local().handle_cdc_generation(gen_id);
if (gen_id == _topology_state_machine._topology.committed_cdc_generations.back()) {
co_await _sys_ks.local().update_cdc_generation_id(gen_id);
@@ -903,7 +880,10 @@ future<> storage_service::merge_topology_snapshot(raft_snapshot snp) {
future<> storage_service::update_service_levels_cache(qos::update_both_cache_levels update_only_effective_cache, qos::query_context ctx) {
SCYLLA_ASSERT(this_shard_id() == 0);
-co_await _sl_controller.local().update_cache(update_only_effective_cache, ctx);
+if (_sl_controller.local().is_v2()) {
+// Skip cache update unless the topology upgrade is done
+co_await _sl_controller.local().update_cache(update_only_effective_cache, ctx);
+}
}
future<> storage_service::compression_dictionary_updated_callback_all() {
@@ -943,7 +923,7 @@ class storage_service::ip_address_updater: public gms::i_endpoint_state_change_s
rslog.debug("ip_address_updater::on_endpoint_change({}) {} {}", ev, endpoint, id);
// If id maps to different ip in peers table it needs to be updated which is done by sync_raft_topology_nodes below
-std::optional<gms::inet_address> prev_ip = co_await _ss.get_ip_from_peers_table(id);
+std::optional<gms::inet_address> prev_ip = co_await _ss._sys_ks.local().get_ip_from_peers_table(id);
if (prev_ip == endpoint) {
co_return;
}
@@ -972,7 +952,7 @@ class storage_service::ip_address_updater: public gms::i_endpoint_state_change_s
co_await utils::get_local_injector().inject("ip-change-raft-sync-delay", std::chrono::milliseconds(500));
// Set notify_join to true since here we detected address change and drivers have to be notified
nodes_to_notify_after_sync nodes_to_notify;
-co_await _ss.raft_topology_update_ip(id, endpoint, co_await _ss.get_host_id_to_ip_map(), &nodes_to_notify);
+co_await _ss.raft_topology_update_ip(id, endpoint, co_await _ss._sys_ks.local().get_host_id_to_ip_map(), &nodes_to_notify);
co_await _ss.notify_nodes_after_sync(std::move(nodes_to_notify));
}));
}
@@ -1134,7 +1114,8 @@ future<> storage_service::raft_state_monitor_fiber(raft::server& raft, gate::hol
_tablet_allocator.local(),
get_ring_delay(),
_lifecycle_notifier,
-_feature_service);
+_feature_service,
+_topology_cmd_rpc_tracker);
}
} catch (...) {
rtlogger.info("raft_state_monitor_fiber aborted with {}", std::current_exception());
@@ -1773,7 +1754,7 @@ future<> storage_service::join_topology(sharded<service::storage_proxy>& proxy,
// the topology coordinator. We can assume this node has already been accepted by the topology coordinator once
// and joined topology.
::shared_ptr<group0_handshaker> handshaker =
-raft_topology_change_enabled() && _db.local().get_config().recovery_leader().empty()
+raft_topology_change_enabled() && !_db.local().get_config().recovery_leader.is_set()
? ::make_shared<join_node_rpc_handshaker>(*this, join_params)
: _group0->make_legacy_handshaker(can_vote::no);
co_await _group0->setup_group0(_sys_ks.local(), initial_contact_nodes, std::move(handshaker),
@@ -2347,7 +2328,7 @@ future<> storage_service::handle_state_normal(inet_address endpoint, locator::ho
// Old node in replace-with-same-IP scenario.
std::optional<locator::host_id> replaced_id;
-auto id_to_ip_map = co_await get_host_id_to_ip_map();
+auto id_to_ip_map = co_await _sys_ks.local().get_host_id_to_ip_map();
std::optional<inet_address> existing;
@@ -2619,14 +2600,6 @@ future<> storage_service::on_alive(gms::inet_address endpoint, locator::host_id
}
}
future<std::optional<gms::inet_address>> storage_service::get_ip_from_peers_table(locator::host_id id) {
auto peers = co_await _sys_ks.local().load_host_ids();
if (auto it = std::ranges::find_if(peers, [&id] (const auto& e) { return e.second == id; }); it != peers.end()) {
co_return it->first;
}
co_return std::nullopt;
}
future<> storage_service::on_change(gms::inet_address endpoint, locator::host_id host_id, const gms::application_state_map& states_, gms::permit_id pid) {
// copy the states map locally since the coroutine may yield
auto states = states_;
@@ -2670,7 +2643,7 @@ future<> storage_service::on_change(gms::inet_address endpoint, locator::host_id
// overwrites the IP back to its old value.
// In essence, the code under the 'if' should fire if the given IP belongs
// to a cluster member.
-if (node && node->is_member() && (co_await get_ip_from_peers_table(host_id)) == endpoint) {
+if (node && node->is_member() && (co_await _sys_ks.local().get_ip_from_peers_table(host_id)) == endpoint) {
if (!is_me(endpoint)) {
slogger.debug("endpoint={}/{} on_change: updating system.peers table", endpoint, host_id);
if (auto info = get_peer_info_for_update(host_id, states)) {
@@ -2682,18 +2655,18 @@ future<> storage_service::on_change(gms::inet_address endpoint, locator::host_id
co_await notify_cql_change(endpoint, host_id, ep_state->is_cql_ready());
}
if (auto it = states.find(application_state::INTERNAL_IP); it != states.end()) {
-co_await maybe_reconnect_to_preferred_ip(endpoint, inet_address(it->second.value()));
+co_await maybe_reconnect_to_preferred_ip(endpoint, inet_address(it->second.value()), host_id);
}
}
}
-future<> storage_service::maybe_reconnect_to_preferred_ip(inet_address ep, inet_address local_ip) {
+future<> storage_service::maybe_reconnect_to_preferred_ip(inet_address ep, inet_address local_ip, locator::host_id host_id) {
if (!_snitch.local()->prefer_local()) {
co_return;
}
const auto& topo = get_token_metadata().get_topology();
-if (topo.get_datacenter() == topo.get_datacenter(_gossiper.get_host_id(ep)) && _messaging.local().get_preferred_ip(ep) != local_ip) {
+if (topo.get_datacenter() == topo.get_datacenter(host_id) && _messaging.local().get_preferred_ip(ep) != local_ip) {
slogger.debug("Initiated reconnect to an Internal IP {} for the {}", local_ip, ep);
co_await _messaging.invoke_on_all([ep, local_ip] (auto& local_ms) {
local_ms.cache_preferred_ip(ep, local_ip);
@@ -2936,7 +2909,7 @@ future<> storage_service::join_cluster(sharded<service::storage_proxy>& proxy,
gms::inet_address recovery_leader_ip;
locator::host_id recovery_leader_id;
-if (!_db.local().get_config().recovery_leader().empty()) {
+if (_db.local().get_config().recovery_leader.is_set()) {
if (_group0->joined_group0()) {
// Something is wrong unless it is a noninitial (and unneeded) restart while recreating the new group 0 in
// the Raft-based recovery procedure.
@@ -2946,7 +2919,7 @@ future<> storage_service::join_cluster(sharded<service::storage_proxy>& proxy,
"the Raft-based recovery procedure, please follow the steps in the documentation.",
_db.local().get_config().recovery_leader(), _group0->load_my_id());
} else {
-recovery_leader_id = locator::host_id(utils::UUID(_db.local().get_config().recovery_leader()));
+recovery_leader_id = locator::host_id(_db.local().get_config().recovery_leader());
auto recovery_leader_it = loaded_endpoints.find(recovery_leader_id);
if (recovery_leader_id != my_host_id() && recovery_leader_it == loaded_endpoints.end()) {
throw std::runtime_error(
@@ -3146,9 +3119,10 @@ future<> storage_service::replicate_to_all_cores(mutable_token_metadata_ptr tmpt
try {
auto base_shard = this_shard_id();
pending_token_metadata_ptr[base_shard] = tmptr;
auto& sharded_token_metadata = _shared_token_metadata.container();
// clone a local copy of updated token_metadata on all other shards
co_await smp::invoke_on_others(base_shard, [&, tmptr] () -> future<> {
-pending_token_metadata_ptr[this_shard_id()] = make_token_metadata_ptr(co_await tmptr->clone_async());
+pending_token_metadata_ptr[this_shard_id()] = sharded_token_metadata.local().make_token_metadata_ptr(co_await tmptr->clone_async());
});
// Precalculate new effective_replication_map for all keyspaces
@@ -4701,17 +4675,13 @@ future<> storage_service::drain() {
}
future<> storage_service::do_drain() {
// Need to stop transport before group0, otherwise RPCs may fail with raft_group_not_found.
co_await stop_transport();
// group0 persistence relies on local storage, so we need to stop group0 first.
// This must be kept in sync with defer_verbose_shutdown for group0 in main.cc to
// handle the case when initialization fails before reaching drain_on_shutdown for ss.
_sl_controller.local().abort_group0_operations();
// Drain view builder before group0, because the view builder uses group0 to coordinate view building.
// Drain after transport is stopped, because view_builder::drain aborts view writes for user writes as well.
co_await _view_builder.invoke_on_all(&db::view::view_builder::drain);
co_await wait_for_group0_stop();
if (_group0) {
co_await _group0->abort();
}
co_await tracing::tracing::tracing_instance().invoke_on_all(&tracing::tracing::shutdown);
@@ -4719,7 +4689,6 @@ future<> storage_service::do_drain() {
return bm.drain();
});
-co_await _view_builder.invoke_on_all(&db::view::view_builder::drain);
co_await _db.invoke_on_all(&replica::database::drain);
co_await _sys_ks.invoke_on_all(&db::system_keyspace::shutdown);
co_await _repair.invoke_on_all(&repair_service::shutdown);
@@ -5747,7 +5716,7 @@ future<> storage_service::snitch_reconfigured() {
future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft::term_t term, uint64_t cmd_index, const raft_topology_cmd& cmd) {
raft_topology_cmd_result result;
-rtlogger.debug("topology cmd rpc {} is called", cmd.cmd);
+rtlogger.info("topology cmd rpc {} is called index={}", cmd.cmd, cmd_index);
try {
auto& raft_server = _group0->group0_server();
@@ -5816,6 +5785,7 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
}
break;
case raft_topology_cmd::command::barrier_and_drain: {
co_await utils::get_local_injector().inject("pause_before_barrier_and_drain", utils::wait_for_message(std::chrono::minutes(5)));
if (_topology_state_machine._topology.tstate == topology::transition_state::write_both_read_old) {
for (auto& n : _topology_state_machine._topology.transition_nodes) {
if (!_address_map.find(locator::host_id{n.first.uuid()})) {
@@ -6077,6 +6047,9 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
} catch (...) {
rtlogger.error("raft_topology_cmd {} failed with: {}", cmd.cmd, std::current_exception());
}
rtlogger.info("topology cmd rpc {} completed with status={} index={}",
cmd.cmd, (result.status == raft_topology_cmd_result::command_status::success) ? "succeeded" : "failed", cmd_index);
co_return result;
}


@@ -48,6 +48,7 @@
#include "timestamp.hh"
#include "utils/user_provided_param.hh"
#include "utils/sequenced_set.hh"
#include "service/topology_coordinator.hh"
class node_ops_cmd_request;
class node_ops_cmd_response;
@@ -282,12 +283,12 @@ private:
future<> snitch_reconfigured();
future<mutable_token_metadata_ptr> get_mutable_token_metadata_ptr() noexcept {
-return _shared_token_metadata.get()->clone_async().then([] (token_metadata tm) {
+return _shared_token_metadata.get()->clone_async().then([this] (token_metadata tm) {
// bump the token_metadata ring_version
// to invalidate cached token/replication mappings
// when the modified token_metadata is committed.
tm.invalidate_cached_rings();
-return make_ready_future<mutable_token_metadata_ptr>(make_token_metadata_ptr(std::move(tm)));
+return _shared_token_metadata.make_token_metadata_ptr(std::move(tm));
});
}
@@ -626,10 +627,8 @@ private:
// needs to be modified to accept either a keyspace or ARS.
future<std::unordered_multimap<dht::token_range, locator::host_id>> get_changed_ranges_for_leaving(locator::vnode_effective_replication_map_ptr erm, locator::host_id endpoint);
-future<> maybe_reconnect_to_preferred_ip(inet_address ep, inet_address local_ip);
+future<> maybe_reconnect_to_preferred_ip(inet_address ep, inet_address local_ip, locator::host_id host_id);
// Return ip of the peers table entry with given host id
future<std::optional<gms::inet_address>> get_ip_from_peers_table(locator::host_id id);
public:
sstring get_release_version();
@@ -873,6 +872,11 @@ private:
std::optional<shared_future<>> _rebuild_result;
std::unordered_map<raft::server_id, std::optional<shared_future<>>> _remove_result;
tablet_op_registry _tablet_ops;
// This tracks the active topology cmd rpc. Only one cmd can be
// running at a time, and by inspecting this structure it can be
// checked which cmd is currently executing and which nodes have
// not yet replied.
topology_coordinator_cmd_rpc_tracker _topology_cmd_rpc_tracker;
struct {
raft::term_t term{0};
uint64_t last_index{0};
@@ -941,6 +945,10 @@ public:
// Waits for topology state in which none of tablets has replaced_id as a replica.
// Must be called on shard 0.
future<> await_tablets_rebuilt(raft::server_id replaced_id);
topology_coordinator_cmd_rpc_tracker get_topology_cmd_status() {
return _topology_cmd_rpc_tracker;
}
private:
// Tracks progress of the upgrade to topology coordinator.
future<> _upgrade_to_topology_coordinator_fiber = make_ready_future<>();
@@ -994,8 +1002,7 @@ private:
std::vector<std::pair<gms::inet_address, locator::host_id>> joined;
};
-using host_id_to_ip_map_t = std::unordered_map<locator::host_id, gms::inet_address>;
-future<host_id_to_ip_map_t> get_host_id_to_ip_map();
+using host_id_to_ip_map_t = db::system_keyspace::host_id_to_ip_map_t;
future<> raft_topology_update_ip(locator::host_id id, gms::inet_address ip, const host_id_to_ip_map_t& map, nodes_to_notify_after_sync* nodes_to_notify);
// Synchronizes the local node state (token_metadata, system.peers/system.local tables,
// gossiper) to align it with the other raft topology nodes.


@@ -842,7 +842,7 @@ public:
db_clock::duration repair_time_diff;
};
-std::vector<repair_plan> plans;
+utils::chunked_vector<repair_plan> plans;
auto migration_tablet_ids = co_await mplan.get_migration_tablet_ids();
for (auto&& [table, tmap_] : _tm->tablets().all_tables()) {
auto& tmap = *tmap_;
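The switch from std::vector to utils::chunked_vector above avoids one large contiguous allocation when the list of repair plans grows with the tablet count: a chunked container stores elements in fixed-size blocks, so growth only ever allocates small blocks. A minimal illustrative stand-in (not the real utils::chunked_vector interface):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Toy chunked container: elements live in fixed-size blocks, so
// push_back never triggers a single large contiguous reallocation.
template <typename T, std::size_t ChunkSize = 128>
class chunked {
    std::vector<std::vector<T>> _chunks;
public:
    void push_back(T v) {
        if (_chunks.empty() || _chunks.back().size() == ChunkSize) {
            _chunks.emplace_back();
            _chunks.back().reserve(ChunkSize);  // each block stays small
        }
        _chunks.back().push_back(std::move(v));
    }
    std::size_t size() const {
        std::size_t n = 0;
        for (const auto& c : _chunks) n += c.size();
        return n;
    }
    T& operator[](std::size_t i) {
        return _chunks[i / ChunkSize][i % ChunkSize];
    }
};
```

This matters under Seastar, where large contiguous allocations are expensive and can fail under memory fragmentation even when total free memory is ample.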


@@ -147,6 +147,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
group0_voter_handler _voter_handler;
topology_coordinator_cmd_rpc_tracker& _topology_cmd_rpc_tracker;
const locator::token_metadata& get_token_metadata() const noexcept {
return *_shared_tm.get();
}
@@ -389,6 +391,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
future<> exec_direct_command_helper(raft::server_id id, uint64_t cmd_index, const raft_topology_cmd& cmd) {
rtlogger.debug("send {} command with term {} and index {} to {}",
cmd.cmd, _term, cmd_index, id);
_topology_cmd_rpc_tracker.active_dst.emplace(id);
auto _ = seastar::defer([this, id] { _topology_cmd_rpc_tracker.active_dst.erase(id); });
auto result = _db.get_token_metadata().get_topology().is_me(to_host_id(id)) ?
co_await _raft_topology_cmd_handler(_term, cmd_index, cmd) :
co_await ser::storage_service_rpc_verbs::send_raft_topology_cmd(
@@ -403,12 +408,16 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
auto id = node.id;
release_node(std::move(node));
const auto cmd_index = ++_last_cmd_index;
_topology_cmd_rpc_tracker.current = cmd.cmd;
_topology_cmd_rpc_tracker.index = cmd_index;
co_await exec_direct_command_helper(id, cmd_index, cmd);
co_return retake_node(co_await start_operation(), id);
};
future<> exec_global_command_helper(auto nodes, const raft_topology_cmd& cmd) {
const auto cmd_index = ++_last_cmd_index;
_topology_cmd_rpc_tracker.current = cmd.cmd;
_topology_cmd_rpc_tracker.index = cmd_index;
auto f = co_await coroutine::as_future(
seastar::parallel_for_each(std::move(nodes), [this, &cmd, cmd_index] (raft::server_id id) {
return exec_direct_command_helper(id, cmd_index, cmd);
@@ -1510,7 +1519,13 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
}
rtlogger.info("Initiating tablet cleanup of {} on {}", gid, dst);
return ser::storage_service_rpc_verbs::send_tablet_cleanup(&_messaging,
-dst.host, _as, raft::server_id(dst.host.uuid()), gid);
+dst.host, _as, raft::server_id(dst.host.uuid()), gid)
+.then([] {
+return utils::get_local_injector().inject("wait_after_tablet_cleanup", [] (auto& handler) -> future<> {
+rtlogger.info("Waiting after tablet cleanup");
+return handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::seconds{60});
+});
+});
})) {
transition_to(locator::tablet_transition_stage::end_migration);
}
@@ -1730,6 +1745,11 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
}
future<> handle_tablet_resize_finalization(group0_guard g) {
co_await utils::get_local_injector().inject("handle_tablet_resize_finalization_wait", [] (auto& handler) -> future<> {
rtlogger.info("handle_tablet_resize_finalization: waiting");
co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::seconds{60});
});
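The `wait_after_tablet_cleanup` and `handle_tablet_resize_finalization_wait` hooks above follow ScyllaDB's error-injection pattern: production code names an injection point, and only a test that enables that name causes extra code (usually a pause awaiting a message) to run there. A minimal synchronous stand-in for the idea; the real `utils::get_local_injector()` is asynchronous and compiled out of release builds:

```cpp
#include <cassert>
#include <functional>
#include <set>
#include <string>

// Toy injection registry: hooks are inert unless a test enables them
// by name, mirroring utils::get_local_injector().inject(...).
struct injector {
    std::set<std::string> enabled;

    void inject(const std::string& name, const std::function<void()>& body) {
        if (enabled.count(name)) {
            body();  // runs only when a test enabled this hook
        }
    }
};
```

This lets a test freeze the coordinator at an exact point in a state transition, which is how races like the drop-table-during-cleanup crash are reproduced deterministically.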
// Executes a global barrier to guarantee that any process (e.g. repair) holding stale version
// of token metadata will complete before we update topology.
auto guard = co_await global_tablet_token_metadata_barrier(std::move(g));
@@ -1916,6 +1936,12 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
}
}
void trigger_load_stats_refresh() {
(void)_tablet_load_stats_refresh.trigger().handle_exception([] (auto ep) {
rtlogger.warn("Error during tablet load stats refresh: {}", ep);
});
}
future<> cancel_all_requests(group0_guard guard, std::unordered_set<raft::server_id> dead_nodes) {
std::vector<canonical_mutation> muts;
std::vector<raft::server_id> reject_join;
@@ -2406,10 +2432,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
muts.emplace_back(rtbuilder.build());
co_await update_topology_state(take_guard(std::move(node)), std::move(muts),
"bootstrap: read fence completed");
// Make sure the load balancer knows the capacity for the new node immediately.
-(void)_tablet_load_stats_refresh.trigger().handle_exception([] (auto ep) {
-rtlogger.warn("Error during tablet load stats refresh: {}", ep);
-});
+trigger_load_stats_refresh();
}
co_await _voter_handler.on_node_added(node.id, _as);
break;
@@ -2468,6 +2491,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
co_await db::view::view_builder::generate_mutations_on_node_left(_db, _sys_ks, node.guard.write_timestamp(), locator::host_id(replaced_node_id.uuid()), muts);
co_await update_topology_state(take_guard(std::move(node)), std::move(muts),
"replace: read fence completed");
trigger_load_stats_refresh();
}
co_await _voter_handler.on_node_added(node.id, _as);
break;
@@ -2988,7 +3012,8 @@ public:
raft_topology_cmd_handler_type raft_topology_cmd_handler,
tablet_allocator& tablet_allocator,
std::chrono::milliseconds ring_delay,
-gms::feature_service& feature_service)
+gms::feature_service& feature_service,
+topology_coordinator_cmd_rpc_tracker& topology_cmd_rpc_tracker)
: _sys_dist_ks(sys_dist_ks), _gossiper(gossiper), _messaging(messaging)
, _shared_tm(shared_tm), _sys_ks(sys_ks), _db(db)
, _group0(group0), _topo_sm(topo_sm), _as(as)
@@ -3000,6 +3025,7 @@ public:
, _ring_delay(ring_delay)
, _group0_holder(_group0.hold_group0_gate())
, _voter_handler(group0, topo_sm._topology, gossiper, feature_service)
, _topology_cmd_rpc_tracker(topology_cmd_rpc_tracker)
, _async_gate("topology_coordinator")
{}
@@ -3614,7 +3640,8 @@ future<> run_topology_coordinator(
tablet_allocator& tablet_allocator,
std::chrono::milliseconds ring_delay,
endpoint_lifecycle_notifier& lifecycle_notifier,
-gms::feature_service& feature_service) {
+gms::feature_service& feature_service,
+topology_coordinator_cmd_rpc_tracker& topology_cmd_rpc_tracker) {
topology_coordinator coordinator{
sys_dist_ks, gossiper, messaging, shared_tm,
@@ -3622,7 +3649,8 @@ future<> run_topology_coordinator(
std::move(raft_topology_cmd_handler),
tablet_allocator,
ring_delay,
-feature_service};
+feature_service,
+topology_cmd_rpc_tracker};
std::exception_ptr ex;
lifecycle_notifier.register_subscriber(&coordinator);


@@ -62,6 +62,12 @@ future<> wait_for_gossiper(raft::server_id id, const gms::gossiper& g, seastar::
using raft_topology_cmd_handler_type = noncopyable_function<future<raft_topology_cmd_result>(
raft::term_t, uint64_t, const raft_topology_cmd&)>;
struct topology_coordinator_cmd_rpc_tracker {
raft_topology_cmd::command current;
uint64_t index;
std::set<raft::server_id> active_dst;
};
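As exec_direct_command_helper() shows earlier in this diff, the tracker is maintained with a scope guard: the destination id is added to active_dst before the RPC is sent and erased when it completes (via seastar::defer), so inspecting the tracker at any moment shows which nodes have not yet replied. A plain-RAII sketch of that usage, with `int` standing in for raft::server_id:

```cpp
#include <cassert>
#include <cstdint>
#include <set>

// Simplified stand-in for topology_coordinator_cmd_rpc_tracker.
struct cmd_rpc_tracker {
    uint64_t index = 0;
    std::set<int> active_dst;
};

// RAII guard in place of seastar::defer: registers the destination on
// entry, unregisters it when the RPC scope ends (success or failure).
struct scoped_dst {
    cmd_rpc_tracker& t;
    int id;
    scoped_dst(cmd_rpc_tracker& tracker, int dst) : t(tracker), id(dst) {
        t.active_dst.insert(id);
    }
    ~scoped_dst() { t.active_dst.erase(id); }
};
```

Using a guard rather than an explicit erase keeps the tracker accurate even when the RPC path exits via an exception.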
future<> run_topology_coordinator(
seastar::sharded<db::system_distributed_keyspace>& sys_dist_ks, gms::gossiper& gossiper,
netw::messaging_service& messaging, locator::shared_token_metadata& shared_tm,
@@ -71,6 +77,7 @@ future<> run_topology_coordinator(
tablet_allocator& tablet_allocator,
std::chrono::milliseconds ring_delay,
endpoint_lifecycle_notifier& lifecycle_notifier,
-gms::feature_service& feature_service);
+gms::feature_service& feature_service,
+topology_coordinator_cmd_rpc_tracker& topology_cmd_rpc_tracker);
}

Some files were not shown because too many files have changed in this diff.