doc: update the warning about shared dictionary training

This commit updates the inadequate warning on the Advanced Internode (RPC) Compression page. The warning is replaced with a note about how training data is encrypted. Fixes https://github.com/scylladb/scylladb/issues/29109 Closes scylladb/scylladb#29111 (cherry picked from commit 88b98fac3a)
Update pgo profiles - aarch64
2026-03-25 10:59:12 +02:00 · 2026-03-15 05:09:46 +02:00 · 2026-03-10 22:45:44 +02:00 · 2026-03-05 21:12:27 +02:00 · 2026-03-05 17:35:21 +02:00 · 2026-03-03 13:27:50 +02:00
399 changed files with 12659 additions and 3577 deletions
--- a/.github/scripts/auto-backport.py
+++ b/.github/scripts/auto-backport.py
@@ -52,7 +52,7 @@ def create_pull_request(repo, new_branch_name, base_branch_name, pr, backport_pr
        if is_draft:
            backport_pr.add_to_labels("conflicts")
            pr_comment = f"@{pr.user.login} - This PR was marked as draft because it has conflicts\n"
-            pr_comment += "Please resolve them and mark this PR as ready for review"
+            pr_comment += "Please resolve them and remove the 'conflicts' label. The PR will be made ready for review automatically."
            backport_pr.create_issue_comment(pr_comment)
        logging.info(f"Assigned PR to original author: {pr.user}")
        return backport_pr
@@ -112,29 +112,45 @@ def backport(repo, pr, version, commits, backport_base_branch, is_collaborator):
                    is_draft = True
                    repo_local.git.add(A=True)
                    repo_local.git.cherry_pick('--continue')
-            repo_local.git.push(fork_repo, new_branch_name, force=True)
-            create_pull_request(repo, new_branch_name, backport_base_branch, pr, backport_pr_title, commits,
-                                is_draft, is_collaborator)
-
+            # Check if the branch already exists in the remote fork
+            remote_refs = repo_local.git.ls_remote('--heads', fork_repo, new_branch_name)
+            if not remote_refs:
+                # Branch does not exist, create it with a regular push
+                repo_local.git.push(fork_repo, new_branch_name)
+                create_pull_request(repo, new_branch_name, backport_base_branch, pr, backport_pr_title, commits,
+                                    is_draft, is_collaborator)
+            else:
+                logging.info(f"Remote branch {new_branch_name} already exists in fork. Skipping push.")
        except GitCommandError as e:
            logging.warning(f"GitCommandError: {e}")


 def with_github_keyword_prefix(repo, pr):
-    pattern = rf"(?:fix(?:|es|ed))\s*:?\s*(?:(?:(?:{repo.full_name})?#)|https://github\.com/{repo.full_name}/issues/)(\d+)"
-    match = re.findall(pattern, pr.body, re.IGNORECASE)
-    if not match:
-        for commit in pr.get_commits():
-            match = re.findall(pattern, commit.commit.message, re.IGNORECASE)
-            if match:
-                print(f'{pr.number} has a valid close reference in commit message {commit.sha}')
-                break
-    if not match:
-        print(f'No valid close reference for {pr.number}')
-        return False
-    else:
+    # GitHub issue pattern: #123, scylladb/scylladb#123, or full GitHub URLs
+    github_pattern = rf"(?:fix(?:|es|ed))\s*:?\s*(?:(?:(?:{repo.full_name})?#)|https://github\.com/{repo.full_name}/issues/)(\d+)"
+    
+    # JIRA issue pattern: PKG-92 or https://scylladb.atlassian.net/browse/PKG-92
+    jira_pattern = r"(?:fix(?:|es|ed))\s*:?\s*(?:(?:https://scylladb\.atlassian\.net/browse/)?([A-Z]+-\d+))"
+    
+    # Check PR body for GitHub issues
+    github_match = re.findall(github_pattern, pr.body, re.IGNORECASE)
+    # Check PR body for JIRA issues
+    jira_match = re.findall(jira_pattern, pr.body, re.IGNORECASE)
+    
+    match = github_match or jira_match
+
+    if match:
        return True

+    for commit in pr.get_commits():
+        github_match = re.findall(github_pattern, commit.commit.message, re.IGNORECASE)
+        jira_match = re.findall(jira_pattern, commit.commit.message, re.IGNORECASE)
+        if github_match or jira_match:
+            print(f'{pr.number} has a valid close reference in commit message {commit.sha}')
+            return True
+
+    print(f'No valid close reference for {pr.number}')
+    return False

 def main():
    args = parse_args()
--- a/.github/workflows/backport-pr-fixes-validation.yaml
+++ b/.github/workflows/backport-pr-fixes-validation.yaml
@@ -18,7 +18,7 @@ jobs:
            
            // Regular expression pattern to check for "Fixes" prefix
            // Adjusted to dynamically insert the repository full name
-            const pattern = `Fixes:? (?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)`;
+            const pattern = `Fixes:? ((?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)|(?:https://scylladb\\.atlassian\\.net/browse/)?([A-Z]+-\\d+))`;
            const regex = new RegExp(pattern);
            
            if (!regex.test(body)) {
--- a/.github/workflows/call_backport_with_jira.yaml
+++ b/.github/workflows/call_backport_with_jira.yaml
@@ -0,0 +1,53 @@
+name: Backport with Jira Integration
+
+on:
+  push:
+    branches:
+      - master
+      - next-*.*
+      - branch-*.*
+  pull_request_target:
+    types: [labeled, closed]
+    branches: 
+      - master
+      - next
+      - next-*.*
+      - branch-*.*
+
+jobs:
+  backport-on-push:
+    if: github.event_name == 'push'
+    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
+    with:
+      event_type: 'push'
+      base_branch: ${{ github.ref }}
+      commits: ${{ github.event.before }}..${{ github.sha }}
+    secrets:
+      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
+      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
+
+  backport-on-label:
+    if: github.event_name == 'pull_request_target' && github.event.action == 'labeled'
+    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
+    with:
+      event_type: 'labeled'
+      base_branch: refs/heads/${{ github.event.pull_request.base.ref }}
+      pull_request_number: ${{ github.event.pull_request.number }}
+      head_commit: ${{ github.event.pull_request.base.sha }}
+      label_name: ${{ github.event.label.name }}
+      pr_state: ${{ github.event.pull_request.state }}
+    secrets:
+      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
+      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
+
+  backport-chain:
+    if: github.event_name == 'pull_request_target' && github.event.action == 'closed' && github.event.pull_request.merged == true
+    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
+    with:
+      event_type: 'chain'
+      base_branch: refs/heads/${{ github.event.pull_request.base.ref }}
+      pull_request_number: ${{ github.event.pull_request.number }}
+      pr_body: ${{ github.event.pull_request.body }}
+    secrets:
+      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
+      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/2
+++ b/2
@@ -78,7 +78,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=2025.3.0-dev
+VERSION=2025.3.9

 if test -f version
 then
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -38,7 +38,6 @@
 #include <optional>
 #include "utils/assert.hh"
 #include "utils/overloaded_functor.hh"
-#include <seastar/json/json_elements.hh>
 #include "collection_mutation.hh"
 #include "schema/schema.hh"
 #include "db/tags/extension.hh"
@@ -121,47 +120,50 @@ static lw_shared_ptr<stats> get_stats_from_schema(service::storage_proxy& sp, co
    }
 }

-make_jsonable::make_jsonable(rjson::value&& value)
-    : _value(std::move(value))
-{}
-std::string make_jsonable::to_json() const {
-    return rjson::print(_value);
-}
-
-json::json_return_type make_streamed(rjson::value&& value) {
-    // CMH. json::json_return_type uses std::function, not noncopyable_function.
-    // Need to make a copyable version of value. Gah.
-    auto rs = make_shared<rjson::value>(std::move(value));
-    std::function<future<>(output_stream<char>&&)> func = [rs](output_stream<char>&& os) mutable -> future<> {
-        // move objects to coroutine frame.
-        auto los = std::move(os);
-        auto lrs = std::move(rs);
+executor::body_writer make_streamed(rjson::value&& value) {
+    return [value = std::move(value)](output_stream<char>&& _out) mutable -> future<> {
+        auto out = std::move(_out);
        std::exception_ptr ex;
        try {
-            co_await rjson::print(*lrs, los);
+            co_await rjson::print(value, out);
        } catch (...) {
-            // at this point, we cannot really do anything. HTTP headers and return code are
-            // already written, and quite potentially a portion of the content data.
-            // just log + rethrow. It is probably better the HTTP server closes connection
-            // abruptly or something...
            ex = std::current_exception();
-            elogger.error("Exception during streaming HTTP response: {}", ex);
        }
-        co_await los.close();
-        co_await rjson::destroy_gently(std::move(*lrs));
+        co_await out.close();
+        co_await rjson::destroy_gently(std::move(value));
        if (ex) {
            co_await coroutine::return_exception_ptr(std::move(ex));
        }
-        co_return;
    };
-    return func;
 }

-json_string::json_string(std::string&& value)
-    : _value(std::move(value))
-{}
-std::string json_string::to_json() const {
-    return _value;
+// make_streamed_with_extra_array() is variant of make_streamed() above, which
+// builds a streaming response (a function writing to an output stream) from a
+// JSON object (rjson::value) but adds to it at the end an additional array.
+// The extra array is given a separate chunked_vector to avoid putting it
+// inside the rjson::value - because RapidJSON does contiguous allocations for
+// arrays which we want to avoid for potentially long arrays in Query/Scan
+// responses (see #23535).
+// If we ever fix RapidJSON to avoid contiguous allocations for arrays, or
+// replace it entirely (#24458), we can remove this function and the function
+// rjson::print_with_extra_array() which it calls.
+executor::body_writer make_streamed_with_extra_array(rjson::value&& value,
+    std::string array_name, utils::chunked_vector<rjson::value>&& array) {
+    return [value = std::move(value), array_name = std::move(array_name), array = std::move(array)](output_stream<char>&& _out) mutable -> future<> {
+        auto out = std::move(_out);
+        std::exception_ptr ex;
+        try {
+            co_await rjson::print_with_extra_array(value, array_name, array, out);
+        } catch (...) {
+            ex = std::current_exception();
+        }
+        co_await out.close();
+        co_await rjson::destroy_gently(std::move(value));
+        // TODO: can/should we also destroy the array gently?
+        if (ex) {
+            co_await coroutine::return_exception_ptr(std::move(ex));
+        }
+    };
 }

 // This function throws api_error::validation if input value is not an object.
@@ -764,7 +766,7 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
    rjson::value response = rjson::empty_object();
    rjson::add(response, "Table", std::move(table_description));
    elogger.trace("returning {}", response);
-    co_return make_jsonable(std::move(response));
+    co_return rjson::print(std::move(response));
 }

 // Check CQL's Role-Based Access Control (RBAC) permission_to_check (MODIFY,
@@ -837,7 +839,7 @@ future<executor::request_return_type> executor::delete_table(client_state& clien
                throw api_error::resource_not_found(fmt::format("Requested resource not found: Table: {} not found", table_name));
            }

-            auto m = co_await service::prepare_column_family_drop_announcement(_proxy, keyspace_name, table_name, group0_guard.write_timestamp(), service::drop_views::yes);
+            auto m = co_await service::prepare_column_family_drop_announcement(p.local(), keyspace_name, table_name, group0_guard.write_timestamp(), service::drop_views::yes);
            auto m2 = co_await service::prepare_keyspace_drop_announcement(_proxy.local_db(), keyspace_name, group0_guard.write_timestamp());

            std::move(m2.begin(), m2.end(), std::back_inserter(m));
@@ -881,7 +883,7 @@ future<executor::request_return_type> executor::delete_table(client_state& clien
    rjson::value response = rjson::empty_object();
    rjson::add(response, "TableDescription", std::move(table_description));
    elogger.trace("returning {}", response);
-    co_return make_jsonable(std::move(response));
+    co_return rjson::print(std::move(response));
 }

 static data_type parse_key_type(std::string_view type) {
@@ -1165,7 +1167,7 @@ future<executor::request_return_type> executor::tag_resource(client_state& clien
    co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [tags](std::map<sstring, sstring>& tags_map) {
        update_tags_map(*tags, tags_map, update_tags_action::add_tags);
    });
-    co_return json_string("");
+    co_return ""; // empty response
 }

 future<executor::request_return_type> executor::untag_resource(client_state& client_state, service_permit permit, rjson::value request) {
@@ -1186,7 +1188,7 @@ future<executor::request_return_type> executor::untag_resource(client_state& cli
    co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [tags](std::map<sstring, sstring>& tags_map) {
        update_tags_map(*tags, tags_map, update_tags_action::delete_tags);
    });
-    co_return json_string("");
+    co_return ""; // empty response
 }

 future<executor::request_return_type> executor::list_tags_of_resource(client_state& client_state, service_permit permit, rjson::value request) {
@@ -1212,7 +1214,7 @@ future<executor::request_return_type> executor::list_tags_of_resource(client_sta
        rjson::push_back(tags, std::move(new_entry));
    }

-    return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
+    return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
 }

 struct billing_mode_type {
@@ -1674,7 +1676,7 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
    rjson::value status = rjson::empty_object();
    executor::supplement_table_info(request, *schema, sp);
    rjson::add(status, "TableDescription", std::move(request));
-    co_return make_jsonable(std::move(status));
+    co_return rjson::print(std::move(status));
 }

 future<executor::request_return_type> executor::create_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
@@ -1951,7 +1953,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
        rjson::value status = rjson::empty_object();
        supplement_table_info(request, *schema, p.local());
        rjson::add(status, "TableDescription", std::move(request));
-        co_return make_jsonable(std::move(status));
+        co_return rjson::print(std::move(status));
    });
 }

@@ -2417,7 +2419,7 @@ static future<executor::request_return_type> rmw_operation_return(rjson::value&&
    if (!attributes.IsNull()) {
        rjson::add(ret, "Attributes", std::move(attributes));
    }
-    return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
+    return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
 }

 static future<std::unique_ptr<rjson::value>> get_previous_item(
@@ -3009,7 +3011,7 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
        rjson::add(ret, "ConsumedCapacity", std::move(consumed_capacity));
    }
    _stats.api_operations.batch_write_item_latency.mark(std::chrono::steady_clock::now() - start_time);
-    co_return make_jsonable(std::move(ret));
+    co_return rjson::print(std::move(ret));
 }

 static const std::string_view get_item_type_string(const rjson::value& v) {
@@ -3407,16 +3409,16 @@ future<std::vector<rjson::value>> executor::describe_multi_item(schema_ptr schem
        shared_ptr<cql3::selection::selection> selection,
        foreign_ptr<lw_shared_ptr<query::result>> query_result,
        shared_ptr<const std::optional<attrs_to_get>> attrs_to_get,
-        uint64_t& rcu_half_units) {
+        noncopyable_function<void(uint64_t)> item_callback) {
    cql3::selection::result_set_builder builder(*selection, gc_clock::now());
    query::result_view::consume(*query_result, slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));
    auto result_set = builder.build();
    std::vector<rjson::value> ret;
    for (auto& result_row : result_set->rows()) {
        rjson::value item = rjson::empty_object();
-        rcu_consumed_capacity_counter consumed_capacity;
-        describe_single_item(*selection, result_row, *attrs_to_get, item, &consumed_capacity._total_bytes);
-        rcu_half_units += consumed_capacity.get_half_units();
+        uint64_t item_length_in_bytes = 0;
+        describe_single_item(*selection, result_row, *attrs_to_get, item, &item_length_in_bytes);
+        item_callback(item_length_in_bytes);
        ret.push_back(std::move(item));
        co_await coroutine::maybe_yield();
    }
@@ -4007,9 +4009,6 @@ update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::t
            }
        }
    }
-    if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
-        _return_attributes = std::move(*previous_item);
-    }
    if (_attribute_updates) {
        for (auto it = _attribute_updates->MemberBegin(); it != _attribute_updates->MemberEnd(); ++it) {
            // Note that it.key() is the name of the column, *it is the operation
@@ -4119,6 +4118,9 @@ update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::t
        // don't report the new item in the returned Attributes.
        _return_attributes = rjson::null_value();
    }
+    if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
+        _return_attributes = std::move(*previous_item);
+    }
    // ReturnValues=UPDATED_OLD/NEW never return an empty Attributes field,
    // even if a new item was created. Instead it should be missing entirely.
    if (_returnvalues == returnvalues::UPDATED_OLD || _returnvalues == returnvalues::UPDATED_NEW) {
@@ -4249,18 +4251,17 @@ future<executor::request_return_type> executor::get_item(client_state& client_st
    verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "GetItem");
    rcu_consumed_capacity_counter add_capacity(request, cl == db::consistency_level::LOCAL_QUORUM);
    co_await verify_permission(_enforce_authorization, client_state, schema, auth::permission::SELECT);
-    co_return co_await _proxy.query(schema, std::move(command), std::move(partition_ranges), cl,
-            service::storage_proxy::coordinator_query_options(executor::default_timeout(), std::move(permit), client_state, trace_state)).then(
-            [per_table_stats, this, schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = std::move(attrs_to_get), start_time = std::move(start_time), add_capacity=std::move(add_capacity)] (service::storage_proxy::coordinator_query_result qr) mutable {
-
-        per_table_stats->api_operations.get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
-        _stats.api_operations.get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
-        uint64_t rcu_half_units = 0;
-        auto res = make_ready_future<executor::request_return_type>(make_jsonable(describe_item(schema, partition_slice, *selection, *qr.query_result, std::move(attrs_to_get), add_capacity, rcu_half_units)));
-        per_table_stats->rcu_half_units_total += rcu_half_units;
-        _stats.rcu_half_units_total += rcu_half_units;
-        return res;
-    });
+    service::storage_proxy::coordinator_query_result qr =
+        co_await _proxy.query(
+            schema, std::move(command), std::move(partition_ranges), cl,
+            service::storage_proxy::coordinator_query_options(executor::default_timeout(), std::move(permit), client_state, trace_state));
+    per_table_stats->api_operations.get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
+    _stats.api_operations.get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
+    uint64_t rcu_half_units = 0;
+    rjson::value res = describe_item(schema, partition_slice, *selection, *qr.query_result, std::move(attrs_to_get), add_capacity, rcu_half_units);
+    per_table_stats->rcu_half_units_total += rcu_half_units;
+    _stats.rcu_half_units_total += rcu_half_units;
+    co_return rjson::print(std::move(res));
 }

 static void check_big_object(const rjson::value& val, int& size_left);
@@ -4358,7 +4359,6 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
        }
    };
    std::vector<table_requests> requests;
-    std::vector<std::vector<uint64_t>> responses_sizes;
    uint batch_size = 0;
    for (auto it = request_items.MemberBegin(); it != request_items.MemberEnd(); ++it) {
        table_requests rs(get_table_from_batch_request(_proxy, it));
@@ -4386,11 +4386,10 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
    // If we got here, all "requests" are valid, so let's start the
    // requests for the different partitions all in parallel.
    std::vector<future<std::vector<rjson::value>>> response_futures;
-    responses_sizes.resize(requests.size());
-    size_t responses_sizes_pos = 0;
-    for (const auto& rs : requests) {
-        responses_sizes[responses_sizes_pos].resize(rs.requests.size());
-        size_t pos = 0;
+    std::vector<uint64_t> consumed_rcu_half_units_per_table(requests.size());
+    for (size_t i = 0; i < requests.size(); i++) {
+        const table_requests& rs = requests[i];
+        bool is_quorum = rs.cl == db::consistency_level::LOCAL_QUORUM;
        lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *rs.schema);
        per_table_stats->api_operations.batch_get_item_histogram.add(rs.requests.size());
        for (const auto &r : rs.requests) {
@@ -4413,16 +4412,17 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
            auto command = ::make_lw_shared<query::read_command>(rs.schema->id(), rs.schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice),
                    query::tombstone_limit(_proxy.get_tombstone_limit()));
            command->allow_limit = db::allow_per_partition_rate_limit::yes;
+            const auto item_callback = [is_quorum, &rcus_per_table = consumed_rcu_half_units_per_table[i]](uint64_t size) {
+                rcus_per_table += rcu_consumed_capacity_counter::get_half_units(size, is_quorum);
+            };
            future<std::vector<rjson::value>> f = _proxy.query(rs.schema, std::move(command), std::move(partition_ranges), rs.cl,
                    service::storage_proxy::coordinator_query_options(executor::default_timeout(), permit, client_state, trace_state)).then(
-                    [schema = rs.schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = rs.attrs_to_get, &response_size = responses_sizes[responses_sizes_pos][pos]] (service::storage_proxy::coordinator_query_result qr) mutable {
+                    [schema = rs.schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = rs.attrs_to_get, item_callback = std::move(item_callback)] (service::storage_proxy::coordinator_query_result qr) mutable {
                utils::get_local_injector().inject("alternator_batch_get_item", [] { throw std::runtime_error("batch_get_item injection"); });
-                return describe_multi_item(std::move(schema), std::move(partition_slice), std::move(selection), std::move(qr.query_result), std::move(attrs_to_get), response_size);
+                return describe_multi_item(std::move(schema), std::move(partition_slice), std::move(selection), std::move(qr.query_result), std::move(attrs_to_get), std::move(item_callback));
            });
-            pos++;
            response_futures.push_back(std::move(f));
        }
-        responses_sizes_pos++;
    }

    // Wait for all requests to complete, and then return the response.
@@ -4434,14 +4434,11 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
    rjson::value response = rjson::empty_object();
    rjson::add(response, "Responses", rjson::empty_object());
    rjson::add(response, "UnprocessedKeys", rjson::empty_object());
-    size_t rcu_half_units;
    auto fut_it = response_futures.begin();
-    responses_sizes_pos = 0;
    rjson::value consumed_capacity = rjson::empty_array();
-    for (const auto& rs : requests) {
+    for (size_t i = 0; i < requests.size(); i++) {
+        const table_requests& rs = requests[i];
        std::string table = table_name(*rs.schema);
-        size_t pos = 0;
-        rcu_half_units = 0;
        for (const auto &r : rs.requests) {
            auto& pk = r.first;
            auto& cks = r.second;
@@ -4456,7 +4453,6 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
                for (rjson::value& json : results) {
                    rjson::push_back(response["Responses"][table], std::move(json));
                }
-                rcu_half_units += rcu_consumed_capacity_counter::get_half_units(responses_sizes[responses_sizes_pos][pos], rs.cl == db::consistency_level::LOCAL_QUORUM);
            } catch(...) {
                eptr = std::current_exception();
                // This read of potentially several rows in one partition,
@@ -4480,8 +4476,8 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
                    rjson::push_back(response["UnprocessedKeys"][table]["Keys"], std::move(*ck.second));
                }
            }
-            pos++;
        }
+        uint64_t rcu_half_units = consumed_rcu_half_units_per_table[i];
        _stats.rcu_half_units_total += rcu_half_units;
        lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *rs.schema);
        per_table_stats->rcu_half_units_total += rcu_half_units;
@@ -4491,7 +4487,6 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
            rjson::add(entry, "CapacityUnits", rcu_half_units*0.5);
            rjson::push_back(consumed_capacity, std::move(entry));
        }
-        responses_sizes_pos++;
    }

    if (should_add_rcu) {
@@ -4505,7 +4500,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
    if (is_big(response)) {
        co_return make_streamed(std::move(response));
    } else {
-        co_return make_jsonable(std::move(response));
+        co_return rjson::print(std::move(response));
    }
 }

@@ -4649,7 +4644,11 @@ class describe_items_visitor {
    const filter& _filter;
    typename columns_t::const_iterator _column_it;
    rjson::value _item;
-    rjson::value _items;
+    // _items is a chunked_vector<rjson::value> instead of a RapidJson array
+    // (rjson::value) because unfortunately RapidJson arrays are stored
+    // contiguously in memory, and cause large allocations when a Query/Scan
+    // returns a long list of short items (issue #23535).
+    utils::chunked_vector<rjson::value> _items;
    size_t _scanned_count;

 public:
@@ -4659,7 +4658,6 @@ public:
            , _filter(filter)
            , _column_it(columns.begin())
            , _item(rjson::empty_object())
-            , _items(rjson::empty_array())
            , _scanned_count(0)
    {
        // _filter.check() may need additional attributes not listed in
@@ -4738,13 +4736,13 @@ public:
                rjson::remove_member(_item, attr);
            }

-            rjson::push_back(_items, std::move(_item));
+            _items.push_back(std::move(_item));
        }
        _item = rjson::empty_object();
        ++_scanned_count;
    }

-    rjson::value get_items() && {
+    utils::chunked_vector<rjson::value> get_items() && {
        return std::move(_items);
    }

@@ -4753,13 +4751,25 @@ public:
    }
 };

-static future<std::tuple<rjson::value, size_t>> describe_items(const cql3::selection::selection& selection, std::unique_ptr<cql3::result_set> result_set, std::optional<attrs_to_get>&& attrs_to_get, filter&& filter) {
+// describe_items() returns a JSON object that includes members "Count"
+// and "ScannedCount", but *not* "Items" - that is returned separately
+// as a chunked_vector to avoid large contiguous allocations which
+// RapidJSON does of its array. The caller should add "Items" to the
+// returned JSON object if needed, or print it separately.
+// The returned chunked_vector (the items) is std::optional<>, because
+// the user may have requested only to count items, and not return any
+// items - which is different from returning an empty list of items.
+static future<std::tuple<rjson::value, std::optional<utils::chunked_vector<rjson::value>>, size_t>> describe_items(
+        const cql3::selection::selection& selection,
+        std::unique_ptr<cql3::result_set> result_set,
+        std::optional<attrs_to_get>&& attrs_to_get,
+        filter&& filter) {
    describe_items_visitor visitor(selection.get_columns(), attrs_to_get, filter);
    co_await result_set->visit_gently(visitor);
    auto scanned_count = visitor.get_scanned_count();
-    rjson::value items = std::move(visitor).get_items();
+    utils::chunked_vector<rjson::value> items = std::move(visitor).get_items();
    rjson::value items_descr = rjson::empty_object();
-    auto size = items.Size();
+    auto size = items.size();
    rjson::add(items_descr, "Count", rjson::value(size));
    rjson::add(items_descr, "ScannedCount", rjson::value(scanned_count));
    // If attrs_to_get && attrs_to_get->empty(), this means the user asked not
@@ -4769,10 +4779,11 @@ static future<std::tuple<rjson::value, size_t>> describe_items(const cql3::selec
    // In that case, we currently build a list of empty items and here drop
    // it. We could just count the items and not bother with the empty items.
    // (However, remember that when we do have a filter, we need the items).
+    std::optional<utils::chunked_vector<rjson::value>> opt_items;
    if (!attrs_to_get || !attrs_to_get->empty()) {
-        rjson::add(items_descr, "Items", std::move(items));
+        opt_items = std::move(items);
    }
-    co_return std::tuple<rjson::value, size_t>{std::move(items_descr), size};
+    co_return std::tuple(std::move(items_descr), std::move(opt_items), size);
 }

 static rjson::value encode_paging_state(const schema& schema, const service::pager::paging_state& paging_state) {
@@ -4810,6 +4821,12 @@ static rjson::value encode_paging_state(const schema& schema, const service::pag
    return last_evaluated_key;
 }

+// RapidJSON allocates arrays contiguously in memory, so we want to avoid
+// returning a large number of items as a single rapidjson array, and use
+// a chunked_vector instead. The following constant is an arbitrary cutoff
+// point for when to switch from a rapidjson array to a chunked_vector.
+static constexpr int max_items_for_rapidjson_array = 256;
+
 static future<executor::request_return_type> do_query(service::storage_proxy& proxy,
        schema_ptr table_schema,
        const rjson::value* exclusive_start_key,
@@ -4882,19 +4899,35 @@ static future<executor::request_return_type> do_query(service::storage_proxy& pr
    }
    auto paging_state = rs->get_metadata().paging_state();
    bool has_filter = filter;
-    auto [items, size] = co_await describe_items(*selection, std::move(rs), std::move(attrs_to_get), std::move(filter));
+    auto [items_descr, opt_items, size] = co_await describe_items(*selection, std::move(rs), std::move(attrs_to_get), std::move(filter));
    if (paging_state) {
-        rjson::add(items, "LastEvaluatedKey", encode_paging_state(*table_schema, *paging_state));
+        rjson::add(items_descr, "LastEvaluatedKey", encode_paging_state(*table_schema, *paging_state));
    }
    if (has_filter){
        cql_stats.filtered_rows_read_total += p->stats().rows_read_total;
        // update our "filtered_row_matched_total" for all the rows matched, despited the filter
        cql_stats.filtered_rows_matched_total += size;
    }
-    if (is_big(items)) {
-        co_return executor::request_return_type(make_streamed(std::move(items)));
+    if (opt_items) {
+        if (opt_items->size() >= max_items_for_rapidjson_array) {
+            // There are many items, better print the JSON and the array of
+            // items (opt_items) separately to avoid RapidJSON's contiguous
+            // allocation of arrays.
+            co_return make_streamed_with_extra_array(std::move(items_descr), "Items", std::move(*opt_items));
+        }
+        // There aren't many items in the chunked vector opt_items,
+        // let's just insert them into the JSON object and print the
+        // full JSON normally.
+        rjson::value items_json = rjson::empty_array();
+        for (auto& item : *opt_items) {
+            rjson::push_back(items_json, std::move(item));
+        }
+        rjson::add(items_descr, "Items", std::move(items_json));
    }
-    co_return executor::request_return_type(make_jsonable(std::move(items)));
+    if (is_big(items_descr)) {
+        co_return make_streamed(std::move(items_descr));
+    }
+    co_return rjson::print(std::move(items_descr));
 }

 static dht::token token_for_segment(int segment, int total_segments) {
@@ -5489,7 +5522,7 @@ future<executor::request_return_type> executor::list_tables(client_state& client
    std::string exclusive_start = exclusive_start_json ? exclusive_start_json->GetString() : "";
    int limit = limit_json ? limit_json->GetInt() : 100;
    if (limit < 1 || limit > 100) {
-        return make_ready_future<request_return_type>(api_error::validation("Limit must be greater than 0 and no greater than 100"));
+        co_return api_error::validation("Limit must be greater than 0 and no greater than 100");
    }

    auto tables = _proxy.data_dictionary().get_tables(); // hold on to temporary, table_names isn't a container, it's a view
@@ -5531,7 +5564,7 @@ future<executor::request_return_type> executor::list_tables(client_state& client
        rjson::add(response, "LastEvaluatedTableName", rjson::copy(last_table_name));
    }

-    return make_ready_future<executor::request_return_type>(make_jsonable(std::move(response)));
+    co_return rjson::print(std::move(response));
 }

 future<executor::request_return_type> executor::describe_endpoints(client_state& client_state, service_permit permit, rjson::value request, std::string host_header) {
@@ -5542,8 +5575,8 @@ future<executor::request_return_type> executor::describe_endpoints(client_state&
    if (!override.empty()) {
        if (override == "disabled") {
            _stats.unsupported_operations++;
-            return make_ready_future<request_return_type>(api_error::unknown_operation(
-                "DescribeEndpoints disabled by configuration (alternator_describe_endpoints=disabled)"));
+            co_return api_error::unknown_operation(
+                "DescribeEndpoints disabled by configuration (alternator_describe_endpoints=disabled)");
        }
        host_header = std::move(override);
    }
@@ -5555,13 +5588,13 @@ future<executor::request_return_type> executor::describe_endpoints(client_state&
    // A "Host:" header includes both host name and port, exactly what we need
    // to return.
    if (host_header.empty()) {
-        return make_ready_future<request_return_type>(api_error::validation("DescribeEndpoints needs a 'Host:' header in request"));
+        co_return api_error::validation("DescribeEndpoints needs a 'Host:' header in request");
    }
    rjson::add(response, "Endpoints", rjson::empty_array());
    rjson::push_back(response["Endpoints"], rjson::empty_object());
    rjson::add(response["Endpoints"][0], "Address", rjson::from_string(host_header));
    rjson::add(response["Endpoints"][0], "CachePeriodInMinutes", rjson::value(1440));
-    return make_ready_future<executor::request_return_type>(make_jsonable(std::move(response)));
+    co_return rjson::print(std::move(response));
 }

 static std::map<sstring, sstring> get_network_topology_options(service::storage_proxy& sp, gms::gossiper& gossiper, int rf) {
@@ -5596,7 +5629,7 @@ future<executor::request_return_type> executor::describe_continuous_backups(clie
    rjson::add(desc, "PointInTimeRecoveryDescription", std::move(pitr));
    rjson::value response = rjson::empty_object();
    rjson::add(response, "ContinuousBackupsDescription", std::move(desc));
-    co_return make_jsonable(std::move(response));
+    co_return rjson::print(std::move(response));
 }

 // Create the metadata for the keyspace in which we put the alternator
--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -10,8 +10,8 @@

 #include <seastar/core/future.hh>
 #include "seastarx.hh"
-#include <seastar/json/json_elements.hh>
 #include <seastar/core/sharded.hh>
+#include <seastar/util/noncopyable_function.hh>

 #include "service/migration_manager.hh"
 #include "service/client_state.hh"
@@ -58,29 +58,6 @@ namespace alternator {

 class rmw_operation;

-struct make_jsonable : public json::jsonable {
-    rjson::value _value;
-public:
-    explicit make_jsonable(rjson::value&& value);
-    std::string to_json() const override;
-};
-
-/**
- * Make return type for serializing the object "streamed",
- * i.e. direct to HTTP output stream. Note: only useful for
- * (very) large objects as there are overhead issues with this
- * as well, but for massive lists of return objects this can
- * help avoid large allocations/many re-allocs
- */
-json::json_return_type make_streamed(rjson::value&&);
-
-struct json_string : public json::jsonable {
-    std::string _value;
-public:
-    explicit json_string(std::string&& value);
-    std::string to_json() const override;
-};
-
 namespace parsed {
 class path;
 };
@@ -169,7 +146,19 @@ class executor : public peering_sharded_service<executor> {

 public:
    using client_state = service::client_state;
-    using request_return_type = std::variant<json::json_return_type, api_error>;
+    // request_return_type is the return type of the executor methods, which
+    // can be one of:
+    // 1. A string, which is the response body for the request.
+    // 2. A body_writer, an asynchronous function (returning future<>) that
+    //    takes an output_stream and writes the response body into it.
+    // 3. An api_error, which is an error response that should be returned to
+    //    the client.
+    // The body_writer is used for streaming responses, where the response body
+    // is written in chunks to the output_stream. This allows for efficient
+    // handling of large responses without needing to allocate a large buffer
+    // in memory.
+    using body_writer = noncopyable_function<future<>(output_stream<char>&&)>;
+    using request_return_type = std::variant<std::string, body_writer, api_error>;
    stats _stats;
    // The metric_groups object holds this stat object's metrics registered
    // as long as the stats object is alive.
@@ -240,12 +229,15 @@ public:
        const std::optional<attrs_to_get>&,
        uint64_t* = nullptr);

+    // Converts a multi-row selection result to JSON compatible with DynamoDB.
+    // For each row, this method calls item_callback, which takes the size of
+    // the item as the parameter.
    static future<std::vector<rjson::value>> describe_multi_item(schema_ptr schema,
        const query::partition_slice&& slice,
        shared_ptr<cql3::selection::selection> selection,
        foreign_ptr<lw_shared_ptr<query::result>> query_result,
        shared_ptr<const std::optional<attrs_to_get>> attrs_to_get,
-        uint64_t& rcu_half_units);
+        noncopyable_function<void(uint64_t)> item_callback = {});

    static void describe_single_item(const cql3::selection::selection&,
        const std::vector<managed_bytes_opt>&,
@@ -275,4 +267,13 @@ bool is_big(const rjson::value& val, int big_size = 100'000);
 // appropriate user-readable api_error::access_denied is thrown.
 future<> verify_permission(bool enforce_authorization, const service::client_state&, const schema_ptr&, auth::permission);

+/**
+ * Make return type for serializing the object "streamed",
+ * i.e. direct to HTTP output stream. Note: only useful for
+ * (very) large objects as there are overhead issues with this
+ * as well, but for massive lists of return objects this can
+ * help avoid large allocations/many re-allocs
+ */
+executor::body_writer make_streamed(rjson::value&&);
+
 }
--- a/alternator/expressions.g
+++ b/alternator/expressions.g
@@ -91,6 +91,18 @@ options {
        throw expressions_syntax_error(format("{} at char {}", err,
            ex->get_charPositionInLine()));
    }
+
+    // ANTLR3 tries to recover missing tokens - it tries to finish parsing
+    // and create valid objects, as if the missing token was there.
+    // But it has a bug and leaks these tokens.
+    // We override offending method and handle abandoned pointers.
+    std::vector<std::unique_ptr<TokenType>> _missing_tokens;
+    TokenType* getMissingSymbol(IntStreamType* istream, ExceptionBaseType* e,
+                                ANTLR_UINT32 expectedTokenType, BitsetListType* follow) {
+        auto token = BaseType::getMissingSymbol(istream, e, expectedTokenType, follow);
+        _missing_tokens.emplace_back(token);
+        return token;
+    }
 }
@lexer::context {
    void displayRecognitionError(ANTLR_UINT8** token_names, ExceptionBaseType* ex) {
--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -13,7 +13,6 @@
 #include <seastar/http/function_handlers.hh>
 #include <seastar/http/short_streams.hh>
 #include <seastar/core/coroutine.hh>
-#include <seastar/json/json_elements.hh>
 #include <seastar/util/defer.hh>
 #include <seastar/util/short_streams.hh>
 #include "seastarx.hh"
@@ -124,22 +123,22 @@ public:
             }
             auto res = resf.get();
             std::visit(overloaded_functor {
-                 [&] (const json::json_return_type& json_return_value) {
-                     slogger.trace("api_handler success case");
-                     if (json_return_value._body_writer) {
-                         // Unfortunately, write_body() forces us to choose
-                         // from a fixed and irrelevant list of "mime-types"
-                         // at this point. But we'll override it with the
-                         // one (application/x-amz-json-1.0) below.
-                         rep->write_body("json", std::move(json_return_value._body_writer));
-                     } else {
-                         rep->_content += json_return_value._res;
-                     }
-                 },
-                 [&] (const api_error& err) {
-                     generate_error_reply(*rep, err);
-                 }
-             }, res);
+                [&] (std::string&& str) {
+                    // Note that despite the move, there is a copy here -
+                    // as str is std::string and rep->_content is sstring.
+                    rep->_content = std::move(str);
+                },
+                [&] (executor::body_writer&& body_writer) {
+                    // Unfortunately, write_body() forces us to choose
+                    // from a fixed and irrelevant list of "mime-types"
+                    // at this point. But we'll override it with the
+                    // correct one (application/x-amz-json-1.0) below.
+                    rep->write_body("json", std::move(body_writer));
+                },
+                [&] (const api_error& err) {
+                    generate_error_reply(*rep, err);
+                }
+             }, std::move(res));

             return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
         });
--- a/alternator/streams.cc
+++ b/alternator/streams.cc
@@ -217,7 +217,7 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
        rjson::add(ret, "LastEvaluatedStreamArn", *last);
    }

-    return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
+    return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
 }

 struct shard_id {
@@ -491,7 +491,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl

    if (!opts.enabled()) {
        rjson::add(ret, "StreamDescription", std::move(stream_desc));
-        return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
+        return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
    }

    // TODO: label
@@ -617,7 +617,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
        rjson::add(stream_desc, "Shards", std::move(shards));
        rjson::add(ret, "StreamDescription", std::move(stream_desc));
            
-        return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
+        return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
    });
 }

@@ -770,7 +770,7 @@ future<executor::request_return_type> executor::get_shard_iterator(client_state&
    auto ret = rjson::empty_object();
    rjson::add(ret, "ShardIterator", iter);

-    return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
+    return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
 }

 struct event_id {
@@ -1021,7 +1021,7 @@ future<executor::request_return_type> executor::get_records(client_state& client
            // will notice end end of shard and not return NextShardIterator.
            rjson::add(ret, "NextShardIterator", next_iter);
            _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
-            return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
+            return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
        }

        // ugh. figure out if we are and end-of-shard
@@ -1047,7 +1047,7 @@ future<executor::request_return_type> executor::get_records(client_state& client
            if (is_big(ret)) {
                return make_ready_future<executor::request_return_type>(make_streamed(std::move(ret)));
            }
-            return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
+            return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
        });
    });
 }
--- a/alternator/ttl.cc
+++ b/alternator/ttl.cc
@@ -118,7 +118,7 @@ future<executor::request_return_type> executor::update_time_to_live(client_state
    // basically identical to the request's
    rjson::value response = rjson::empty_object();
    rjson::add(response, "TimeToLiveSpecification", std::move(*spec));
-    co_return make_jsonable(std::move(response));
+    co_return rjson::print(std::move(response));
 }

 future<executor::request_return_type> executor::describe_time_to_live(client_state& client_state, service_permit permit, rjson::value request) {
@@ -135,7 +135,7 @@ future<executor::request_return_type> executor::describe_time_to_live(client_sta
    }
    rjson::value response = rjson::empty_object();
    rjson::add(response, "TimeToLiveDescription", std::move(desc));
-    co_return make_jsonable(std::move(response));
+    co_return rjson::print(std::move(response));
 }

 // expiration_service is a sharded service responsible for cleaning up expired
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -984,7 +984,7 @@
         ]
      },
      {
-         "path":"/storage_service/cleanup_all",
+         "path":"/storage_service/cleanup_all/",
         "operations":[
            {
               "method":"POST",
@@ -994,6 +994,30 @@
               "produces":[
                  "application/json"
               ],
+               "parameters":[
+                    {
+                     "name":"global",
+                     "description":"true if cleanup of entire cluster is requested",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"boolean",
+                     "paramType":"query"
+                  }
+               ]
+            }
+         ]
+      },
+      {
+         "path":"/storage_service/mark_node_as_clean",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Mark the node as clean. After that the node will not be considered as needing cleanup during automatic cleanup which is triggered by some topology operations",
+               "type":"void",
+               "nickname":"reset_cleanup_needed",
+               "produces":[
+                  "application/json"
+               ],
               "parameters":[]
            }
         ]
@@ -3161,6 +3185,22 @@
               ]
            }
         ]
+      },
+      {
+         "path":"/storage_service/raft_topology/cmd_rpc_status",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"Get information about currently running topology cmd rpc",
+               "type":"string",
+               "nickname":"raft_topology_get_cmd_status",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+               ]
+            }
+         ]
      }
   ],
   "models":{
@@ -3297,11 +3337,11 @@
         "properties":{
            "start_token":{
               "type":"string",
-               "description":"The range start token"
+               "description":"The range start token (exclusive)"
            },
            "end_token":{
               "type":"string",
-               "description":"The range start token"
+               "description":"The range end token (inclusive)"
            },
            "endpoints":{
               "type":"array",
--- a/api/api-doc/tasks.json
+++ b/api/api-doc/tasks.json
@@ -42,6 +42,14 @@
                     "allowMultiple":false,
                     "type":"boolean",
                     "paramType":"query"
+                  },
+                  {
+                     "name":"consider_only_existing_data",
+                     "description":"Set to \"true\" to flush all memtables and force tombstone garbage collection to check only the sstables being compacted (false by default). The memtable, commitlog and other uncompacted sstables will not be checked during tombstone garbage collection.",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"boolean",
+                     "paramType":"query"
                  }
               ]
            }
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -12,6 +12,7 @@
 #include "api/api-doc/storage_service.json.hh"
 #include "api/api-doc/storage_proxy.json.hh"
 #include "api/scrub_status.hh"
+#include "api/tasks.hh"
 #include "db/config.hh"
 #include "db/schema_tables.hh"
 #include "gms/feature_service.hh"
@@ -20,6 +21,7 @@
 #include "utils/hash.hh"
 #include <optional>
 #include <sstream>
+#include <stdexcept>
 #include <time.h>
 #include <algorithm>
 #include <functional>
@@ -749,77 +751,39 @@ rest_force_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
            fmopt = flush_mode::skip;
        }
        auto task = co_await compaction_module.make_and_start_task<global_major_compaction_task_impl>({}, db, fmopt, consider_only_existing_data);
-        try {
-            co_await task->done();
-        } catch (...) {
-            apilog.error("force_compaction failed: {}", std::current_exception());
-            throw;
-        }
-
+        co_await task->done();
        co_return json_void();
 }

 static
 future<json::json_return_type>
 rest_force_keyspace_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
-        auto& db = ctx.db;
-        auto [ keyspace, table_infos ] = parse_table_infos(ctx, *req, "cf");
-        auto flush = validate_bool_x(req->get_query_param("flush_memtables"), true);
-        auto consider_only_existing_data = validate_bool_x(req->get_query_param("consider_only_existing_data"), false);
-        apilog.info("force_keyspace_compaction: keyspace={} tables={}, flush={} consider_only_existing_data={}", keyspace, table_infos, flush, consider_only_existing_data);
-
-        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        std::optional<flush_mode> fmopt;
-        if (!flush && !consider_only_existing_data) {
-            fmopt = flush_mode::skip;
-        }
-        auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt, consider_only_existing_data);
-        try {
-            co_await task->done();
-        } catch (...) {
-            apilog.error("force_keyspace_compaction: keyspace={} tables={} failed: {}", task->get_status().keyspace, table_infos, std::current_exception());
-            throw;
-        }
-
+        auto task = co_await force_keyspace_compaction(ctx, std::move(req));
+        co_await task->done();
        co_return json_void();
 }

 static
 future<json::json_return_type>
 rest_force_keyspace_cleanup(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
-        auto& db = ctx.db;
-        auto [keyspace, table_infos] = parse_table_infos(ctx, *req);
-        const auto& rs = db.local().find_keyspace(keyspace).get_replication_strategy();
-        if (rs.get_type() == locator::replication_strategy_type::local || !rs.is_vnode_based()) {
-            auto reason = rs.get_type() == locator::replication_strategy_type::local ? "require" : "support";
-            apilog.info("Keyspace {} does not {} cleanup", keyspace, reason);
-            co_return json::json_return_type(0);
-        }
-        apilog.info("force_keyspace_cleanup: keyspace={} tables={}", keyspace, table_infos);
-        if (!co_await ss.local().is_cleanup_allowed(keyspace)) {
-            auto msg = "Can not perform cleanup operation when topology changes";
-            apilog.warn("force_keyspace_cleanup: keyspace={} tables={}: {}", keyspace, table_infos, msg);
-            co_await coroutine::return_exception(std::runtime_error(msg));
-        }
-
-        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        auto task = co_await compaction_module.make_and_start_task<cleanup_keyspace_compaction_task_impl>(
-            {}, std::move(keyspace), db, table_infos, flush_mode::all_tables, tasks::is_user_task::yes);
-        try {
+        auto task = co_await force_keyspace_cleanup(ctx, ss, std::move(req));
+        if (task) {
            co_await task->done();
-        } catch (...) {
-            apilog.error("force_keyspace_cleanup: keyspace={} tables={} failed: {}", task->get_status().keyspace, table_infos, std::current_exception());
-            throw;
        }
-
        co_return json::json_return_type(0);
 }

 static
 future<json::json_return_type>
 rest_cleanup_all(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
-        apilog.info("cleanup_all");
-        auto done = co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<bool> {
+        bool global = true;
+        if (auto global_param = req->get_query_param("global"); !global_param.empty()) {
+            global = validate_bool(global_param);
+        }
+
+        apilog.info("cleanup_all global={}", global);
+
+        auto done = !global ? false : co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<bool> {
            if (!ss.is_topology_coordinator_enabled()) {
                co_return false;
            }
@@ -829,19 +793,35 @@ rest_cleanup_all(http_context& ctx, sharded<service::storage_service>& ss, std::
        if (done) {
            co_return json::json_return_type(0);
        }
-        // fall back to the local global cleanup if topology coordinator is not enabled
+        // fall back to the local cleanup if topology coordinator is not enabled or local cleanup is requested
        auto& db = ctx.db;
        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
        auto task = co_await compaction_module.make_and_start_task<global_cleanup_compaction_task_impl>({}, db);
-        try {
-            co_await task->done();
-        } catch (...) {
-            apilog.error("cleanup_all failed: {}", std::current_exception());
-            throw;
-        }
+        co_await task->done();
+
+        // Mark this node as clean
+        co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<> {
+            if (ss.is_topology_coordinator_enabled()) {
+                co_await ss.reset_cleanup_needed();
+            }
+        });
+
        co_return json::json_return_type(0);
 }

+static
+future<json::json_return_type>
+rest_reset_cleanup_needed(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
+        apilog.info("reset_cleanup_needed");
+        co_await ss.invoke_on(0, [] (service::storage_service& ss) {
+            if (!ss.is_topology_coordinator_enabled()) {
+                throw std::runtime_error("mark_node_as_clean is only supported when topology over raft is enabled");
+            }
+            return ss.reset_cleanup_needed();
+        });
+        co_return json_void();
+}
+
 static
 future<json::json_return_type>
 rest_perform_keyspace_offstrategy_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
@@ -850,34 +830,16 @@ rest_perform_keyspace_offstrategy_compaction(http_context& ctx, std::unique_ptr<
        bool res = false;
        auto& compaction_module = ctx.db.local().get_compaction_manager().get_task_manager_module();
        auto task = co_await compaction_module.make_and_start_task<offstrategy_keyspace_compaction_task_impl>({}, std::move(keyspace), ctx.db, table_infos, &res);
-        try {
-            co_await task->done();
-        } catch (...) {
-            apilog.error("perform_keyspace_offstrategy_compaction: keyspace={} tables={} failed: {}", task->get_status().keyspace, table_infos, std::current_exception());
-            throw;
-        }
-
+        co_await task->done();
        co_return json::json_return_type(res);
 }

 static
 future<json::json_return_type>
 rest_upgrade_sstables(http_context& ctx, std::unique_ptr<http::request> req) {
-        auto& db = ctx.db;
        auto [keyspace, table_infos] = parse_table_infos(ctx, *req);
-        bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);
-
-        apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, table_infos, exclude_current_version);
-
-        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        auto task = co_await compaction_module.make_and_start_task<upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, table_infos, exclude_current_version);
-        try {
-            co_await task->done();
-        } catch (...) {
-            apilog.error("upgrade_sstables: keyspace={} tables={} failed: {}", keyspace, table_infos, std::current_exception());
-            throw;
-        }
-
+        auto task = co_await upgrade_sstables(ctx, std::move(req), std::move(keyspace), std::move(table_infos));
+        co_await task->done();
        co_return json::json_return_type(0);
 }

@@ -1670,6 +1632,18 @@ rest_raft_topology_upgrade_status(sharded<service::storage_service>& ss, std::un
        co_return sstring(format("{}", ustate));
 }

+static
+future<json::json_return_type>
+rest_raft_topology_get_cmd_status(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
+        const auto status = co_await ss.invoke_on(0, [] (auto& ss) {
+            return ss.get_topology_cmd_status();
+        });
+        if (status.active_dst.empty()) {
+            co_return sstring("none");
+        }
+        co_return sstring(fmt::format("{}[{}]: {}", status.current, status.index, fmt::join(status.active_dst, ",")));
+}
+
 static
 future<json::json_return_type>
 rest_move_tablet(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
@@ -1845,6 +1819,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    ss::force_keyspace_compaction.set(r, rest_bind(rest_force_keyspace_compaction, ctx));
    ss::force_keyspace_cleanup.set(r, rest_bind(rest_force_keyspace_cleanup, ctx, ss));
    ss::cleanup_all.set(r, rest_bind(rest_cleanup_all, ctx, ss));
+    ss::reset_cleanup_needed.set(r, rest_bind(rest_reset_cleanup_needed, ctx, ss));
    ss::perform_keyspace_offstrategy_compaction.set(r, rest_bind(rest_perform_keyspace_offstrategy_compaction, ctx));
    ss::upgrade_sstables.set(r, rest_bind(rest_upgrade_sstables, ctx));
    ss::force_flush.set(r, rest_bind(rest_force_flush, ctx));
@@ -1902,6 +1877,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    ss::reload_raft_topology_state.set(r, rest_bind(rest_reload_raft_topology_state, ss, group0_client));
    ss::upgrade_to_raft_topology.set(r, rest_bind(rest_upgrade_to_raft_topology, ss));
    ss::raft_topology_upgrade_status.set(r, rest_bind(rest_raft_topology_upgrade_status, ss));
+    ss::raft_topology_get_cmd_status.set(r, rest_bind(rest_raft_topology_get_cmd_status, ss));
    ss::move_tablet.set(r, rest_bind(rest_move_tablet, ctx, ss));
    ss::add_tablet_replica.set(r, rest_bind(rest_add_tablet_replica, ctx, ss));
    ss::del_tablet_replica.set(r, rest_bind(rest_del_tablet_replica, ctx, ss));
@@ -1928,6 +1904,7 @@ void unset_storage_service(http_context& ctx, routes& r) {
    ss::force_keyspace_compaction.unset(r);
    ss::force_keyspace_cleanup.unset(r);
    ss::cleanup_all.unset(r);
+    ss::reset_cleanup_needed.unset(r);
    ss::perform_keyspace_offstrategy_compaction.unset(r);
    ss::upgrade_sstables.unset(r);
    ss::force_flush.unset(r);
@@ -1983,6 +1960,7 @@ void unset_storage_service(http_context& ctx, routes& r) {
    ss::reload_raft_topology_state.unset(r);
    ss::upgrade_to_raft_topology.unset(r);
    ss::raft_topology_upgrade_status.unset(r);
+    ss::raft_topology_get_cmd_status.unset(r);
    ss::move_tablet.unset(r);
    ss::add_tablet_replica.unset(r);
    ss::del_tablet_replica.unset(r);
--- a/api/tasks.cc
+++ b/api/tasks.cc
@@ -36,37 +36,65 @@ static auto wrap_ks_cf(http_context &ctx, ks_cf_func f) {
    };
 }

+future<tasks::task_manager::task_ptr> force_keyspace_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
+    auto& db = ctx.db;
+    auto [ keyspace, table_infos ] = parse_table_infos(ctx, *req, "cf");
+    auto flush = validate_bool_x(req->get_query_param("flush_memtables"), true);
+    auto consider_only_existing_data = validate_bool_x(req->get_query_param("consider_only_existing_data"), false);
+    apilog.info("force_keyspace_compaction: keyspace={} tables={}, flush={} consider_only_existing_data={}", keyspace, table_infos, flush, consider_only_existing_data);
+
+    auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
+    std::optional<compaction::flush_mode> fmopt;
+    if (!flush && !consider_only_existing_data) {
+        fmopt = compaction::flush_mode::skip;
+    }
+    return compaction_module.make_and_start_task<compaction::major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt, consider_only_existing_data);
+}
+
+future<tasks::task_manager::task_ptr> upgrade_sstables(http_context& ctx, std::unique_ptr<http::request> req, sstring keyspace, std::vector<table_info> table_infos) {
+    auto& db = ctx.db;
+    bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);
+
+    apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, table_infos, exclude_current_version);
+
+    auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
+    return compaction_module.make_and_start_task<compaction::upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, table_infos, exclude_current_version);
+}
+
+future<tasks::task_manager::task_ptr> force_keyspace_cleanup(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
+    auto& db = ctx.db;
+    auto [keyspace, table_infos] = parse_table_infos(ctx, *req);
+    const auto& rs = db.local().find_keyspace(keyspace).get_replication_strategy();
+    if (rs.get_type() == locator::replication_strategy_type::local || !rs.is_vnode_based()) {
+        auto reason = rs.get_type() == locator::replication_strategy_type::local ? "require" : "support";
+        apilog.info("Keyspace {} does not {} cleanup", keyspace, reason);
+        co_return nullptr;
+    }
+    apilog.info("force_keyspace_cleanup: keyspace={} tables={}", keyspace, table_infos);
+    if (!co_await ss.local().is_cleanup_allowed(keyspace)) {
+        auto msg = "Can not perform cleanup operation when topology changes";
+        apilog.warn("force_keyspace_cleanup: keyspace={} tables={}: {}", keyspace, table_infos, msg);
+        co_await coroutine::return_exception(std::runtime_error(msg));
+    }
+
+    auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
+    co_return co_await compaction_module.make_and_start_task<compaction::cleanup_keyspace_compaction_task_impl>(
+        {}, std::move(keyspace), db, table_infos, compaction::flush_mode::all_tables, tasks::is_user_task::yes);
+}
+
 void set_tasks_compaction_module(http_context& ctx, routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& snap_ctl) {
    t::force_keyspace_compaction_async.set(r, [&ctx](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        auto& db = ctx.db;
-        auto [ keyspace, table_infos ] = parse_table_infos(ctx, *req, "cf");
-        auto flush = validate_bool_x(req->get_query_param("flush_memtables"), true);
-        apilog.debug("force_keyspace_compaction_async: keyspace={} tables={}, flush={}", keyspace, table_infos, flush);
-
-        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        std::optional<flush_mode> fmopt;
-        if (!flush) {
-            fmopt = flush_mode::skip;
-        }
-        auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt);
-
+        auto task = co_await force_keyspace_compaction(ctx, std::move(req));
        co_return json::json_return_type(task->get_status().id.to_sstring());
    });

    t::force_keyspace_cleanup_async.set(r, [&ctx, &ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        auto& db = ctx.db;
-        auto [keyspace, table_infos] = parse_table_infos(ctx, *req);
-        apilog.info("force_keyspace_cleanup_async: keyspace={} tables={}", keyspace, table_infos);
-        if (!co_await ss.local().is_cleanup_allowed(keyspace)) {
-            auto msg = "Can not perform cleanup operation when topology changes";
-            apilog.warn("force_keyspace_cleanup_async: keyspace={} tables={}: {}", keyspace, table_infos, msg);
-            co_await coroutine::return_exception(std::runtime_error(msg));
+        tasks::task_id id = tasks::task_id::create_null_id();
+        auto task = co_await force_keyspace_cleanup(ctx, ss, std::move(req));
+        if (task) {
+            id = task->get_status().id;
        }
-
-        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        auto task = co_await compaction_module.make_and_start_task<cleanup_keyspace_compaction_task_impl>({}, std::move(keyspace), db, table_infos, flush_mode::all_tables, tasks::is_user_task::yes);
-
-        co_return json::json_return_type(task->get_status().id.to_sstring());
+        co_return json::json_return_type(id.to_sstring());
    });

    t::perform_keyspace_offstrategy_compaction_async.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<http::request> req, sstring keyspace, std::vector<table_info> table_infos) -> future<json::json_return_type> {
@@ -78,14 +106,7 @@ void set_tasks_compaction_module(http_context& ctx, routes& r, sharded<service::
    }));

    t::upgrade_sstables_async.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<http::request> req, sstring keyspace, std::vector<table_info> table_infos) -> future<json::json_return_type> {
-        auto& db = ctx.db;
-        bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);
-
-        apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, table_infos, exclude_current_version);
-
-        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        auto task = co_await compaction_module.make_and_start_task<upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, table_infos, exclude_current_version);
-
+        auto task = co_await upgrade_sstables(ctx, std::move(req), std::move(keyspace), std::move(table_infos));
        co_return json::json_return_type(task->get_status().id.to_sstring());
    }));

--- a/api/tasks.hh
+++ b/api/tasks.hh
@@ -15,6 +15,10 @@ namespace seastar::httpd {
 class routes;
 }

+namespace seastar::http {
+struct request;
+}
+
 namespace service {
 class storage_service;
 }
@@ -25,4 +29,8 @@ struct http_context;
 void set_tasks_compaction_module(http_context& ctx, httpd::routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& snap_ctl);
 void unset_tasks_compaction_module(http_context& ctx, httpd::routes& r);

+future<tasks::task_manager::task_ptr> force_keyspace_compaction(http_context& ctx, std::unique_ptr<http::request> req);
+future<tasks::task_manager::task_ptr> force_keyspace_cleanup(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req);
+future<tasks::task_manager::task_ptr> upgrade_sstables(http_context& ctx, std::unique_ptr<http::request> req, sstring keyspace, std::vector<table_info> table_infos);
+
 }
--- a/auth/allow_all_authenticator.cc
+++ b/auth/allow_all_authenticator.cc
@@ -9,6 +9,7 @@
 #include "auth/allow_all_authenticator.hh"

 #include "service/migration_manager.hh"
+#include "utils/alien_worker.hh"
 #include "utils/class_registrator.hh"

 namespace auth {
@@ -21,6 +22,7 @@ static const class_registrator<
        allow_all_authenticator,
        cql3::query_processor&,
        ::service::raft_group0_client&,
-        ::service::migration_manager&> registration("org.apache.cassandra.auth.AllowAllAuthenticator");
+        ::service::migration_manager&,
+        utils::alien_worker&> registration("org.apache.cassandra.auth.AllowAllAuthenticator");

 }
--- a/auth/allow_all_authenticator.hh
+++ b/auth/allow_all_authenticator.hh
@@ -13,6 +13,7 @@
 #include "auth/authenticated_user.hh"
 #include "auth/authenticator.hh"
 #include "auth/common.hh"
+#include "utils/alien_worker.hh"

 namespace cql3 {
 class query_processor;
@@ -28,7 +29,7 @@ extern const std::string_view allow_all_authenticator_name;

 class allow_all_authenticator final : public authenticator {
 public:
-    allow_all_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&) {
+    allow_all_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&) {
    }

    virtual future<> start() override {
--- a/auth/certificate_authenticator.cc
+++ b/auth/certificate_authenticator.cc
@@ -33,13 +33,14 @@ static const class_registrator<auth::authenticator
    , auth::certificate_authenticator
    , cql3::query_processor&
    , ::service::raft_group0_client&
-    , ::service::migration_manager&> cert_auth_reg(CERT_AUTH_NAME);
+    , ::service::migration_manager&
+    , utils::alien_worker&> cert_auth_reg(CERT_AUTH_NAME);

 enum class auth::certificate_authenticator::query_source {
    subject, altname
 };

-auth::certificate_authenticator::certificate_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&)
+auth::certificate_authenticator::certificate_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&)
    : _queries([&] {
        auto& conf = qp.db().get_config();
        auto queries = conf.auth_certificate_role_queries();
--- a/auth/certificate_authenticator.hh
+++ b/auth/certificate_authenticator.hh
@@ -10,6 +10,7 @@
 #pragma once

 #include "auth/authenticator.hh"
+#include "utils/alien_worker.hh"
 #include <boost/regex_fwd.hpp>  // IWYU pragma: keep

 namespace cql3 {
@@ -31,7 +32,7 @@ class certificate_authenticator : public authenticator {
    enum class query_source;
    std::vector<std::pair<query_source, boost::regex>> _queries;
 public:
-    certificate_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&);
+    certificate_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&);
    ~certificate_authenticator();

    future<> start() override;
--- a/auth/ldap_role_manager.cc
+++ b/auth/ldap_role_manager.cc
@@ -233,9 +233,9 @@ future<role_set> ldap_role_manager::query_granted(std::string_view grantee_name,
 }

 future<role_to_directly_granted_map>
-ldap_role_manager::query_all_directly_granted() {
+ldap_role_manager::query_all_directly_granted(::service::query_state& qs) {
    role_to_directly_granted_map result;
-    auto roles = co_await query_all();
+    auto roles = co_await query_all(qs);
    for (auto& role: roles) {
        auto granted_set = co_await query_granted(role, recursive_role_query::no);
        for (auto& granted: granted_set) {
@@ -247,8 +247,8 @@ ldap_role_manager::query_all_directly_granted() {
    co_return result;
 }

-future<role_set> ldap_role_manager::query_all() {
-    return _std_mgr.query_all();
+future<role_set> ldap_role_manager::query_all(::service::query_state& qs) {
+    return _std_mgr.query_all(qs);
 }

 future<> ldap_role_manager::create_role(std::string_view role_name) {
@@ -311,12 +311,12 @@ future<bool> ldap_role_manager::can_login(std::string_view role_name) {
 }

 future<std::optional<sstring>> ldap_role_manager::get_attribute(
-        std::string_view role_name, std::string_view attribute_name) {
-    return _std_mgr.get_attribute(role_name, attribute_name);
+        std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
+    return _std_mgr.get_attribute(role_name, attribute_name, qs);
 }

-future<role_manager::attribute_vals> ldap_role_manager::query_attribute_for_all(std::string_view attribute_name) {
-    return _std_mgr.query_attribute_for_all(attribute_name);
+future<role_manager::attribute_vals> ldap_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state& qs) {
+    return _std_mgr.query_attribute_for_all(attribute_name, qs);
 }

 future<> ldap_role_manager::set_attribute(
--- a/auth/ldap_role_manager.hh
+++ b/auth/ldap_role_manager.hh
@@ -75,9 +75,9 @@ class ldap_role_manager : public role_manager {

    future<role_set> query_granted(std::string_view, recursive_role_query) override;

-    future<role_to_directly_granted_map> query_all_directly_granted() override;
+    future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state&) override;

-    future<role_set> query_all() override;
+    future<role_set> query_all(::service::query_state&) override;

    future<bool> exists(std::string_view) override;

@@ -85,9 +85,9 @@ class ldap_role_manager : public role_manager {

    future<bool> can_login(std::string_view) override;

-    future<std::optional<sstring>> get_attribute(std::string_view, std::string_view) override;
+    future<std::optional<sstring>> get_attribute(std::string_view, std::string_view, ::service::query_state&) override;

-    future<role_manager::attribute_vals> query_attribute_for_all(std::string_view) override;
+    future<role_manager::attribute_vals> query_attribute_for_all(std::string_view, ::service::query_state&) override;

    future<> set_attribute(std::string_view, std::string_view, std::string_view, ::service::group0_batch& mc) override;

--- a/auth/maintenance_socket_role_manager.cc
+++ b/auth/maintenance_socket_role_manager.cc
@@ -78,11 +78,11 @@ future<role_set> maintenance_socket_role_manager::query_granted(std::string_view
    return operation_not_supported_exception<role_set>("QUERY GRANTED");
 }

-future<role_to_directly_granted_map> maintenance_socket_role_manager::query_all_directly_granted() {
+future<role_to_directly_granted_map> maintenance_socket_role_manager::query_all_directly_granted(::service::query_state&) {
    return operation_not_supported_exception<role_to_directly_granted_map>("QUERY ALL DIRECTLY GRANTED");
 }

-future<role_set> maintenance_socket_role_manager::query_all() {
+future<role_set> maintenance_socket_role_manager::query_all(::service::query_state&) {
    return operation_not_supported_exception<role_set>("QUERY ALL");
 }

@@ -98,11 +98,11 @@ future<bool> maintenance_socket_role_manager::can_login(std::string_view role_na
    return make_ready_future<bool>(true);
 }

-future<std::optional<sstring>> maintenance_socket_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name) {
+future<std::optional<sstring>> maintenance_socket_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state&) {
    return operation_not_supported_exception<std::optional<sstring>>("GET ATTRIBUTE");
 }

-future<role_manager::attribute_vals> maintenance_socket_role_manager::query_attribute_for_all(std::string_view attribute_name) {
+future<role_manager::attribute_vals> maintenance_socket_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state&) {
    return operation_not_supported_exception<role_manager::attribute_vals>("QUERY ATTRIBUTE");
 }

--- a/auth/maintenance_socket_role_manager.hh
+++ b/auth/maintenance_socket_role_manager.hh
@@ -53,9 +53,9 @@ public:

    virtual future<role_set> query_granted(std::string_view grantee_name, recursive_role_query) override;

-    virtual future<role_to_directly_granted_map> query_all_directly_granted() override;
+    virtual future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state&) override;

-    virtual future<role_set> query_all() override;
+    virtual future<role_set> query_all(::service::query_state&) override;

    virtual future<bool> exists(std::string_view role_name) override;

@@ -63,9 +63,9 @@ public:

    virtual future<bool> can_login(std::string_view role_name) override;

-    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name) override;
+    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state&) override;

-    virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name) override;
+    virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name, ::service::query_state&) override;

    virtual future<> set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) override;

--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -48,14 +48,14 @@ static const class_registrator<
        password_authenticator,
        cql3::query_processor&,
        ::service::raft_group0_client&,
-        ::service::migration_manager&> password_auth_reg("org.apache.cassandra.auth.PasswordAuthenticator");
+        ::service::migration_manager&,
+        utils::alien_worker&> password_auth_reg("org.apache.cassandra.auth.PasswordAuthenticator");

 static thread_local auto rng_for_salt = std::default_random_engine(std::random_device{}());

 static std::string_view get_config_value(std::string_view value, std::string_view def) {
    return value.empty() ? def : value;
 }
-
 std::string password_authenticator::default_superuser(const db::config& cfg) {
    return std::string(get_config_value(cfg.auth_superuser_name(), DEFAULT_USER_NAME));
 }
@@ -63,12 +63,13 @@ std::string password_authenticator::default_superuser(const db::config& cfg) {
 password_authenticator::~password_authenticator() {
 }

-password_authenticator::password_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm)
+password_authenticator::password_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, utils::alien_worker& hashing_worker)
    : _qp(qp)
    , _group0_client(g0)
    , _migration_manager(mm)
    , _stopped(make_ready_future<>()) 
    , _superuser(default_superuser(qp.db().get_config()))
+    , _hashing_worker(hashing_worker)
 {}

 static bool has_salted_hash(const cql3::untyped_result_set_row& row) {
@@ -125,7 +126,7 @@ future<> password_authenticator::legacy_create_default_if_missing() {
    }
    std::string salted_pwd(get_config_value(_qp.db().get_config().auth_superuser_salted_password(), ""));
    if (salted_pwd.empty()) {
-        salted_pwd = passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt);
+        salted_pwd = passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt, _scheme);
    }
    const auto query = update_row_query();
    co_await _qp.execute_internal(
@@ -172,7 +173,7 @@ future<> password_authenticator::maybe_create_default_password() {
    // Set default superuser's password.
    std::string salted_pwd(get_config_value(_qp.db().get_config().auth_superuser_salted_password(), ""));
    if (salted_pwd.empty()) {
-        salted_pwd = passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt);
+        salted_pwd = passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt, _scheme);
    }
    const auto update_query = update_row_query();
    co_await collect_mutations(_qp, batch, update_query, {salted_pwd, _superuser});
@@ -202,6 +203,10 @@ future<> password_authenticator::maybe_create_default_password_with_retries() {

 future<> password_authenticator::start() {
    return once_among_shards([this] {
+        // Verify that at least one hashing scheme is supported.
+        passwords::detail::verify_scheme(_scheme);
+        plogger.info("Using password hashing scheme: {}", passwords::detail::prefix_for_scheme(_scheme));
+
        _stopped = do_after_system_ready(_as, [this] {
            return async([this] {
                if (legacy_mode(_qp)) {
@@ -227,7 +232,9 @@ future<> password_authenticator::start() {
                utils::get_local_injector().inject("password_authenticator_start_pause", utils::wait_for_message(5min)).get();
                if (!legacy_mode(_qp)) {
                    maybe_create_default_password_with_retries().get();
-                    _superuser_created_promise.set_value();
+                    if (!_superuser_created_promise.available()) {
+                        _superuser_created_promise.set_value();
+                    }
                }
            });
        });
@@ -287,7 +294,13 @@ future<authenticated_user> password_authenticator::authenticate(

    try {
        const std::optional<sstring> salted_hash = co_await get_password_hash(username);
-        if (!salted_hash || !passwords::check(password, *salted_hash)) {
+        if (!salted_hash) {
+            throw exceptions::authentication_exception("Username and/or password are incorrect");
+        }
+        const bool password_match = co_await _hashing_worker.submit<bool>([password = std::move(password), salted_hash = std::move(salted_hash)]{
+            return passwords::check(password, *salted_hash);
+        });
+        if (!password_match) {
            throw exceptions::authentication_exception("Username and/or password are incorrect");
        }
        co_return username;
@@ -311,7 +324,7 @@ future<> password_authenticator::create(std::string_view role_name, const authen
    auto maybe_hash = options.credentials.transform([&] (const auto& creds) -> sstring {
        return std::visit(make_visitor(
                [&] (const password_option& opt) {
-                    return passwords::hash(opt.password, rng_for_salt);
+                    return passwords::hash(opt.password, rng_for_salt, _scheme);
                },
                [] (const hashed_password_option& opt) {
                    return opt.hashed_password;
@@ -354,11 +367,11 @@ future<> password_authenticator::alter(std::string_view role_name, const authent
                query,
                consistency_for_user(role_name),
                internal_distributed_query_state(),
-                {passwords::hash(password, rng_for_salt), sstring(role_name)},
+                {passwords::hash(password, rng_for_salt, _scheme), sstring(role_name)},
                cql3::query_processor::cache_internal::no).discard_result();
    } else {
        co_await collect_mutations(_qp, mc, query,
-                {passwords::hash(password, rng_for_salt), sstring(role_name)});
+                {passwords::hash(password, rng_for_salt, _scheme), sstring(role_name)});
    }
 }

--- a/auth/password_authenticator.hh
+++ b/auth/password_authenticator.hh
@@ -15,7 +15,9 @@

 #include "db/consistency_level_type.hh"
 #include "auth/authenticator.hh"
+#include "auth/passwords.hh"
 #include "service/raft/raft_group0_client.hh"
+#include "utils/alien_worker.hh"

 namespace db {
    class config;
@@ -43,12 +45,15 @@ class password_authenticator : public authenticator {
    abort_source _as;
    std::string _superuser; // default superuser name from the config (may or may not be present in roles table)
    shared_promise<> _superuser_created_promise;
+    // We used to also support bcrypt, SHA-256, and MD5 (ref. scylladb#24524).
+    constexpr static auth::passwords::scheme _scheme = passwords::scheme::sha_512;
+    utils::alien_worker& _hashing_worker;

 public:
    static db::consistency_level consistency_for_user(std::string_view role_name);
    static std::string default_superuser(const db::config&);

-    password_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&);
+    password_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&);

    ~password_authenticator();

--- a/auth/passwords.cc
+++ b/auth/passwords.cc
@@ -21,18 +21,14 @@ static thread_local crypt_data tlcrypt = {};

 namespace detail {

-scheme identify_best_supported_scheme() {
-    const auto all_schemes = { scheme::bcrypt_y, scheme::bcrypt_a, scheme::sha_512, scheme::sha_256, scheme::md5 };
-    // "Random", for testing schemes.
+void verify_scheme(scheme scheme) {
    const sstring random_part_of_salt = "aaaabbbbccccdddd";

-    for (scheme c : all_schemes) {
-        const sstring salt = sstring(prefix_for_scheme(c)) + random_part_of_salt;
-        const char* e = crypt_r("fisk", salt.c_str(), &tlcrypt);
+    const sstring salt = sstring(prefix_for_scheme(scheme)) + random_part_of_salt;
+    const char* e = crypt_r("fisk", salt.c_str(), &tlcrypt);

-        if (e && (e[0] != '*')) {
-            return c;
-        }
+    if (e && (e[0] != '*')) {
+        return;
    }

    throw no_supported_schemes();
--- a/auth/passwords.hh
+++ b/auth/passwords.hh
@@ -21,10 +21,11 @@ class no_supported_schemes : public std::runtime_error {
 public:
    no_supported_schemes();
 };
-
 ///
-/// Apache Cassandra uses a library to provide the bcrypt scheme. Many Linux implementations do not support bcrypt, so
-/// we support alternatives. The cost is loss of direct compatibility with Apache Cassandra system tables.
+/// Apache Cassandra uses a library to provide the bcrypt scheme. In ScyllaDB, we use SHA-512
+/// instead of bcrypt for performance and for historical reasons (see scylladb#24524).
+/// Currently, SHA-512 is always chosen as the hashing scheme for new passwords, but the other
+/// algorithms remain supported for CREATE ROLE WITH HASHED PASSWORD and backward compatibility.
 ///
 enum class scheme {
    bcrypt_y,
@@ -51,11 +52,11 @@ sstring generate_random_salt_bytes(RandomNumberEngine& g) {
 }

 ///
-/// Test each allowed hashing scheme and report the best supported one on the current system.
+/// Test given hashing scheme on the current system.
 ///
-/// \throws \ref no_supported_schemes when none of the known schemes is supported.
+/// \throws \ref no_supported_schemes when scheme is unsupported.
 ///
-scheme identify_best_supported_scheme();
+void verify_scheme(scheme scheme);

 std::string_view prefix_for_scheme(scheme) noexcept;

@@ -67,8 +68,7 @@ std::string_view prefix_for_scheme(scheme) noexcept;
 /// \throws \ref no_supported_schemes when no known hashing schemes are supported on the system.
 ///
 template <typename RandomNumberEngine>
-sstring generate_salt(RandomNumberEngine& g) {
-    static const scheme scheme = identify_best_supported_scheme();
+sstring generate_salt(RandomNumberEngine& g, scheme scheme) {
    static const sstring prefix = sstring(prefix_for_scheme(scheme));
    return prefix + generate_random_salt_bytes(g);
 }
@@ -93,8 +93,8 @@ sstring hash_with_salt(const sstring& pass, const sstring& salt);
 /// \throws \ref std::system_error when the implementation-specific implementation fails to hash the cleartext.
 ///
 template <typename RandomNumberEngine>
-sstring hash(const sstring& pass, RandomNumberEngine& g) {
-    return detail::hash_with_salt(pass, detail::generate_salt(g));
+sstring hash(const sstring& pass, RandomNumberEngine& g, scheme scheme) {
+    return detail::hash_with_salt(pass, detail::generate_salt(g, scheme));
 }

 ///
--- a/auth/role_manager.hh
+++ b/auth/role_manager.hh
@@ -17,12 +17,17 @@
 #include <seastar/core/format.hh>
 #include <seastar/core/sstring.hh>

+#include "auth/common.hh"
 #include "auth/resource.hh"
 #include "cql3/description.hh"
 #include "seastarx.hh"
 #include "exceptions/exceptions.hh"
 #include "service/raft/raft_group0_client.hh"

+namespace service {
+class query_state;
+};
+
 namespace auth {

 struct role_config final {
@@ -167,9 +172,9 @@ public:
    ///   (role2, role3)
    /// }
    ///  
-    virtual future<role_to_directly_granted_map> query_all_directly_granted() = 0;
+    virtual future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state& = internal_distributed_query_state()) = 0;

-    virtual future<role_set> query_all() = 0;
+    virtual future<role_set> query_all(::service::query_state& = internal_distributed_query_state()) = 0;

    virtual future<bool> exists(std::string_view role_name) = 0;

@@ -186,12 +191,12 @@ public:
    ///
    /// \returns the value of the named attribute, if one is set.
    ///
-    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name) = 0;
+    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& = internal_distributed_query_state()) = 0;

    ///
    /// \returns a mapping of each role's value for the named attribute, if one is set for the role.
    ///
-    virtual future<attribute_vals> query_attribute_for_all(std::string_view attribute_name) = 0;
+    virtual future<attribute_vals> query_attribute_for_all(std::string_view attribute_name, ::service::query_state& = internal_distributed_query_state()) = 0;

    /// Sets `attribute_name` with `attribute_value` for `role_name`.
    /// \returns an exceptional future with nonexistant_role if the role does not exist.
--- a/auth/saslauthd_authenticator.cc
+++ b/auth/saslauthd_authenticator.cc
@@ -34,9 +34,10 @@ static const class_registrator<
        saslauthd_authenticator,
        cql3::query_processor&,
        ::service::raft_group0_client&,
-        ::service::migration_manager&> saslauthd_auth_reg("com.scylladb.auth.SaslauthdAuthenticator");
+        ::service::migration_manager&,
+        utils::alien_worker&> saslauthd_auth_reg("com.scylladb.auth.SaslauthdAuthenticator");

-saslauthd_authenticator::saslauthd_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&)
+saslauthd_authenticator::saslauthd_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&)
    : _socket_path(qp.db().get_config().saslauthd_socket_path())
 {}

--- a/auth/saslauthd_authenticator.hh
+++ b/auth/saslauthd_authenticator.hh
@@ -11,6 +11,7 @@
 #pragma once

 #include "auth/authenticator.hh"
+#include "utils/alien_worker.hh"

 namespace cql3 {
 class query_processor;
@@ -28,7 +29,7 @@ namespace auth {
 class saslauthd_authenticator : public authenticator {
    sstring _socket_path; ///< Path to the domain socket on which saslauthd is listening.
 public:
-    saslauthd_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&);
+    saslauthd_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&);

    future<> start() override;

--- a/auth/service.cc
+++ b/auth/service.cc
@@ -187,14 +187,15 @@ service::service(
        ::service::migration_notifier& mn,
        ::service::migration_manager& mm,
        const service_config& sc,
-        maintenance_socket_enabled used_by_maintenance_socket)
+        maintenance_socket_enabled used_by_maintenance_socket,
+        utils::alien_worker& hashing_worker)
            : service(
                      std::move(c),
                      qp,
                      g0,
                      mn,
                      create_object<authorizer>(sc.authorizer_java_name, qp, g0, mm),
-                      create_object<authenticator>(sc.authenticator_java_name, qp, g0, mm),
+                      create_object<authenticator>(sc.authenticator_java_name, qp, g0, mm, hashing_worker),
                      create_object<role_manager>(sc.role_manager_java_name, qp, g0, mm),
                      used_by_maintenance_socket) {
 }
--- a/auth/service.hh
+++ b/auth/service.hh
@@ -26,6 +26,7 @@
 #include "cql3/description.hh"
 #include "seastarx.hh"
 #include "service/raft/raft_group0_client.hh"
+#include "utils/alien_worker.hh"
 #include "utils/observable.hh"
 #include "utils/serialized_action.hh"
 #include "service/maintenance_mode.hh"
@@ -126,7 +127,8 @@ public:
            ::service::migration_notifier&,
            ::service::migration_manager&,
            const service_config&,
-            maintenance_socket_enabled);
+            maintenance_socket_enabled,
+            utils::alien_worker&);

    future<> start(::service::migration_manager&, db::system_keyspace&);

--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -9,6 +9,7 @@
 #include "auth/standard_role_manager.hh"

 #include <optional>
+#include <stdexcept>
 #include <unordered_set>
 #include <vector>

@@ -28,6 +29,7 @@
 #include "cql3/util.hh"
 #include "db/consistency_level_type.hh"
 #include "exceptions/exceptions.hh"
+#include "utils/error_injection.hh"
 #include "utils/log.hh"
 #include <seastar/core/loop.hh>
 #include <seastar/coroutine/maybe_yield.hh>
@@ -321,7 +323,9 @@ future<> standard_role_manager::start() {
            }
            if (!legacy) {
                co_await maybe_create_default_role_with_retries();
-                _superuser_created_promise.set_value();
+                if (!_superuser_created_promise.available()) {
+                    _superuser_created_promise.set_value();
+                }
            }
        };

@@ -648,21 +652,30 @@ future<role_set> standard_role_manager::query_granted(std::string_view grantee_n
    });
 }

-future<role_to_directly_granted_map> standard_role_manager::query_all_directly_granted() {
+future<role_to_directly_granted_map> standard_role_manager::query_all_directly_granted(::service::query_state& qs) {
    const sstring query = seastar::format("SELECT * FROM {}.{}",
            get_auth_ks_name(_qp),
            meta::role_members_table::name);

+    const auto results = co_await _qp.execute_internal(
+            query,
+            db::consistency_level::ONE,
+            qs,
+            cql3::query_processor::cache_internal::yes);
+
    role_to_directly_granted_map roles_map;
-    co_await _qp.query_internal(query, [&roles_map] (const cql3::untyped_result_set_row& row) -> future<stop_iteration> {
-        roles_map.insert({row.get_as<sstring>("member"), row.get_as<sstring>("role")});
-        co_return stop_iteration::no;
-    });
+    std::transform(
+            results->begin(),
+            results->end(),
+            std::inserter(roles_map, roles_map.begin()),
+            [] (const cql3::untyped_result_set_row& row) {
+                return std::make_pair(row.get_as<sstring>("member"), row.get_as<sstring>("role")); }
+    );

    co_return roles_map;
 }

-future<role_set> standard_role_manager::query_all() {
+future<role_set> standard_role_manager::query_all(::service::query_state& qs) {
    const sstring query = seastar::format("SELECT {} FROM {}.{}",
            meta::roles_table::role_col_name,
            get_auth_ks_name(_qp),
@@ -671,10 +684,16 @@ future<role_set> standard_role_manager::query_all() {
    // To avoid many copies of a view.
    static const auto role_col_name_string = sstring(meta::roles_table::role_col_name);

+    if (utils::get_local_injector().enter("standard_role_manager_fail_legacy_query")) {
+        if (legacy_mode(_qp)) {
+            throw std::runtime_error("standard_role_manager::query_all: failed due to error injection");
+        }
+    }
+
    const auto results = co_await _qp.execute_internal(
            query,
            db::consistency_level::QUORUM,
-            internal_distributed_query_state(),
+            qs,
            cql3::query_processor::cache_internal::yes);

    role_set roles;
@@ -706,11 +725,11 @@ future<bool> standard_role_manager::can_login(std::string_view role_name) {
    });
 }

-future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name) {
+future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
    const sstring query = seastar::format("SELECT name, value FROM {}.{} WHERE role = ? AND name = ?",
            get_auth_ks_name(_qp),
            meta::role_attributes_table::name);
-    const auto result_set = co_await _qp.execute_internal(query, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes);
+    const auto result_set = co_await _qp.execute_internal(query, db::consistency_level::ONE, qs, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes);
    if (!result_set->empty()) {
        const cql3::untyped_result_set_row &row = result_set->one();
        co_return std::optional<sstring>(row.get_as<sstring>("value"));
@@ -718,11 +737,11 @@ future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_
    co_return std::optional<sstring>{};
 }

-future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all (std::string_view attribute_name) {
-    return query_all().then([this, attribute_name] (role_set roles) {
-        return do_with(attribute_vals{}, [this, attribute_name, roles = std::move(roles)] (attribute_vals &role_to_att_val) {
-            return parallel_for_each(roles.begin(), roles.end(), [this, &role_to_att_val, attribute_name] (sstring role) {
-                return get_attribute(role, attribute_name).then([&role_to_att_val, role] (std::optional<sstring> att_val) {
+future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all (std::string_view attribute_name, ::service::query_state& qs) {
+    return query_all(qs).then([this, attribute_name, &qs] (role_set roles) {
+        return do_with(attribute_vals{}, [this, attribute_name, roles = std::move(roles), &qs] (attribute_vals &role_to_att_val) {
+            return parallel_for_each(roles.begin(), roles.end(), [this, &role_to_att_val, attribute_name, &qs] (sstring role) {
+                return get_attribute(role, attribute_name, qs).then([&role_to_att_val, role] (std::optional<sstring> att_val) {
                    if (att_val) {
                        role_to_att_val.emplace(std::move(role), std::move(*att_val));
                    }
@@ -767,7 +786,7 @@ future<> standard_role_manager::remove_attribute(std::string_view role_name, std
 future<std::vector<cql3::description>> standard_role_manager::describe_role_grants() {
    std::vector<cql3::description> result{};

-    const auto grants = co_await query_all_directly_granted();
+    const auto grants = co_await query_all_directly_granted(internal_distributed_query_state());
    result.reserve(grants.size());

    for (const auto& [grantee_role, granted_role] : grants) {
--- a/auth/standard_role_manager.hh
+++ b/auth/standard_role_manager.hh
@@ -66,9 +66,9 @@ public:

    virtual future<role_set> query_granted(std::string_view grantee_name, recursive_role_query) override;

-    virtual future<role_to_directly_granted_map> query_all_directly_granted() override;
+    virtual future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state&) override;

-    virtual future<role_set> query_all() override;
+    virtual future<role_set> query_all(::service::query_state&) override;

    virtual future<bool> exists(std::string_view role_name) override;

@@ -76,9 +76,9 @@ public:

    virtual future<bool> can_login(std::string_view role_name) override;

-    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name) override;
+    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state&) override;

-    virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name) override;
+    virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name, ::service::query_state&) override;

    virtual future<> set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) override;

--- a/auth/transitional.cc
+++ b/auth/transitional.cc
@@ -37,8 +37,8 @@ class transitional_authenticator : public authenticator {
 public:
    static const sstring PASSWORD_AUTHENTICATOR_NAME;

-    transitional_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm)
-            : transitional_authenticator(std::make_unique<password_authenticator>(qp, g0, mm)) {
+    transitional_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, utils::alien_worker& hashing_worker)
+            : transitional_authenticator(std::make_unique<password_authenticator>(qp, g0, mm, hashing_worker)) {
    }
    transitional_authenticator(std::unique_ptr<authenticator> a)
            : _authenticator(std::move(a)) {
@@ -239,7 +239,8 @@ static const class_registrator<
        auth::transitional_authenticator,
        cql3::query_processor&,
        ::service::raft_group0_client&,
-        ::service::migration_manager&> transitional_authenticator_reg(auth::PACKAGE_NAME + "TransitionalAuthenticator");
+        ::service::migration_manager&,
+        utils::alien_worker&> transitional_authenticator_reg(auth::PACKAGE_NAME + "TransitionalAuthenticator");

 static const class_registrator<
        auth::authorizer,
--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -56,8 +56,17 @@ using namespace std::chrono_literals;

 logging::logger cdc_log("cdc");

+namespace {
+
+// When dropping a column from a CDC log table, we set the drop timestamp
+// `column_drop_leeway` seconds into the future to ensure that for writes concurrent
+// with column drop, the write timestamp is before the column drop timestamp.
+constexpr auto column_drop_leeway = std::chrono::seconds(5);
+
+} // anonymous namespace
+
 namespace cdc {
-static schema_ptr create_log_schema(const schema&, std::optional<table_id> = {}, schema_ptr = nullptr);
+static schema_ptr create_log_schema(const schema&, api::timestamp_type, std::optional<table_id> = {}, schema_ptr = nullptr);
 }

 static constexpr auto cdc_group_name = "cdc";
@@ -167,7 +176,7 @@ public:
            ensure_that_table_uses_vnodes(ksm, schema);

            // in seastar thread
-            auto log_schema = create_log_schema(schema);
+            auto log_schema = create_log_schema(schema, timestamp);

            auto log_mut = db::schema_tables::make_create_table_mutations(log_schema, timestamp);

@@ -205,7 +214,7 @@ public:
            ensure_that_table_has_no_counter_columns(new_schema);
            ensure_that_table_uses_vnodes(*keyspace.metadata(), new_schema);

-            auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt, log_schema);
+            auto new_log_schema = create_log_schema(new_schema, timestamp, log_schema ? std::make_optional(log_schema->id()) : std::nullopt, log_schema);

            auto log_mut = log_schema 
                ? db::schema_tables::make_update_table_mutations(db, keyspace.metadata(), log_schema, new_log_schema, timestamp)
@@ -496,7 +505,7 @@ bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name) {
    return to_bytes(cdc_deleted_elements_column_prefix) + column_name;
 }

-static schema_ptr create_log_schema(const schema& s, std::optional<table_id> uuid, schema_ptr old) {
+static schema_ptr create_log_schema(const schema& s, api::timestamp_type timestamp, std::optional<table_id> uuid, schema_ptr old) {
    schema_builder b(s.ks_name(), log_name(s.cf_name()));
    b.with_partitioner(cdc::cdc_partitioner::classname);
    b.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
@@ -531,6 +540,28 @@ static schema_ptr create_log_schema(const schema& s, std::optional<table_id> uui
    b.with_column(log_meta_column_name_bytes("ttl"), long_type);
    b.with_column(log_meta_column_name_bytes("end_of_batch"), boolean_type);
    b.set_caching_options(caching_options::get_disabled_caching_options());
+
+    auto validate_new_column = [&] (const sstring& name) {
+        // When dropping a column from a CDC log table, we set the drop timestamp to be
+        // `column_drop_leeway` seconds into the future (see `create_log_schema`).
+        // Therefore, when recreating a column with the same name, we need to validate
+        // that it's not recreated too soon and that the drop timestamp has passed.
+        if (old && old->dropped_columns().contains(name)) {
+            const auto& drop_info = old->dropped_columns().at(name);
+            auto create_time = api::timestamp_clock::time_point(api::timestamp_clock::duration(timestamp));
+            auto drop_time = api::timestamp_clock::time_point(api::timestamp_clock::duration(drop_info.timestamp));
+            if (drop_time > create_time) {
+                throw exceptions::invalid_request_exception(format("Cannot add column {} because a column with the same name was dropped too recently. Please retry after {} seconds",
+                        name, std::chrono::duration_cast<std::chrono::seconds>(drop_time - create_time).count() + 1));
+            }
+        }
+    };
+
+    auto add_column = [&] (sstring name, data_type type) {
+        validate_new_column(name);
+        b.with_column(to_bytes(name), type);
+    };
+
    auto add_columns = [&] (const schema::const_iterator_range_type& columns, bool is_data_col = false) {
        for (const auto& column : columns) {
            auto type = column.type;
@@ -552,9 +583,9 @@ static schema_ptr create_log_schema(const schema& s, std::optional<table_id> uui
                    }
                ));
            }
-            b.with_column(log_data_column_name_bytes(column.name()), type);
+            add_column(log_data_column_name(column.name_as_text()), type);
            if (is_data_col) {
-                b.with_column(log_data_column_deleted_name_bytes(column.name()), boolean_type);
+                add_column(log_data_column_deleted_name(column.name_as_text()), boolean_type);
            }
            if (column.type->is_multi_cell()) {
                auto dtype = visit(*type, make_visitor(
@@ -570,7 +601,7 @@ static schema_ptr create_log_schema(const schema& s, std::optional<table_id> uui
                        throw std::invalid_argument("Should not reach");
                    }
                ));
-                b.with_column(log_data_column_deleted_elements_name_bytes(column.name()), dtype);
+                add_column(log_data_column_deleted_elements_name(column.name_as_text()), dtype);
            }
        }
    };
@@ -592,7 +623,8 @@ static schema_ptr create_log_schema(const schema& s, std::optional<table_id> uui
        // not super efficient, but we don't do this often.
        for (auto& col : old->all_columns()) {
            if (!b.has_column({col.name(), col.name_as_text() })) {
-                b.without_column(col.name_as_text(), col.type, api::new_timestamp());
+                auto drop_ts = api::timestamp_clock::now() + column_drop_leeway;
+                b.without_column(col.name_as_text(), col.type, drop_ts.time_since_epoch().count());
            }
        }
    }
@@ -960,8 +992,12 @@ public:
    // Given a reference to such a column from the base schema, this function sets the corresponding column
    // in the log to the given value for the given row.
    void set_value(const clustering_key& log_ck, const column_definition& base_cdef, const managed_bytes_view& value) {
-        auto& log_cdef = *_log_schema.get_column_definition(log_data_column_name_bytes(base_cdef.name()));
-        _log_mut.set_cell(log_ck, log_cdef, atomic_cell::make_live(*base_cdef.type, _ts, value, _ttl));
+        auto log_cdef_ptr = _log_schema.get_column_definition(log_data_column_name_bytes(base_cdef.name()));
+        if (!log_cdef_ptr) {
+            throw exceptions::invalid_request_exception(format("CDC log schema for {}.{} does not have base column {}",
+                _log_schema.ks_name(), _log_schema.cf_name(), base_cdef.name_as_text()));
+        }
+        _log_mut.set_cell(log_ck, *log_cdef_ptr, atomic_cell::make_live(*base_cdef.type, _ts, value, _ttl));
    }

    // Each regular and static column in the base schema has a corresponding column in the log schema
@@ -969,7 +1005,13 @@ public:
    // Given a reference to such a column from the base schema, this function sets the corresponding column
    // in the log to `true` for the given row. If not called, the column will be `null`.
    void set_deleted(const clustering_key& log_ck, const column_definition& base_cdef) {
-        _log_mut.set_cell(log_ck, log_data_column_deleted_name_bytes(base_cdef.name()), data_value(true), _ts, _ttl);
+        auto log_cdef_ptr = _log_schema.get_column_definition(log_data_column_deleted_name_bytes(base_cdef.name()));
+        if (!log_cdef_ptr) {
+            throw exceptions::invalid_request_exception(format("CDC log schema for {}.{} does not have base column {}",
+                _log_schema.ks_name(), _log_schema.cf_name(), base_cdef.name_as_text()));
+        }
+        auto& log_cdef = *log_cdef_ptr;
+        _log_mut.set_cell(log_ck, *log_cdef_ptr, atomic_cell::make_live(*log_cdef.type, _ts, log_cdef.type->decompose(true), _ttl));
    }

    // Each regular and static non-atomic column in the base schema has a corresponding column in the log schema
@@ -978,7 +1020,12 @@ public:
    // Given a reference to such a column from the base schema, this function sets the corresponding column
    // in the log to the given set of keys for the given row.
    void set_deleted_elements(const clustering_key& log_ck, const column_definition& base_cdef, const managed_bytes& deleted_elements) {
-        auto& log_cdef = *_log_schema.get_column_definition(log_data_column_deleted_elements_name_bytes(base_cdef.name()));
+        auto log_cdef_ptr = _log_schema.get_column_definition(log_data_column_deleted_elements_name_bytes(base_cdef.name()));
+        if (!log_cdef_ptr) {
+            throw exceptions::invalid_request_exception(format("CDC log schema for {}.{} does not have base column {}",
+                _log_schema.ks_name(), _log_schema.cf_name(), base_cdef.name_as_text()));
+        }
+        auto& log_cdef = *log_cdef_ptr;
        _log_mut.set_cell(log_ck, log_cdef, atomic_cell::make_live(*log_cdef.type, _ts, deleted_elements, _ttl));
    }

@@ -1865,5 +1912,10 @@ bool cdc::cdc_service::needs_cdc_augmentation(const std::vector<mutation>& mutat

 future<std::tuple<std::vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>
 cdc::cdc_service::augment_mutation_call(lowres_clock::time_point timeout, std::vector<mutation>&& mutations, tracing::trace_state_ptr tr_state, db::consistency_level write_cl) {
+    if (utils::get_local_injector().enter("sleep_before_cdc_augmentation")) {
+        return seastar::sleep(std::chrono::milliseconds(100)).then([this, timeout, mutations = std::move(mutations), tr_state = std::move(tr_state), write_cl] () mutable {
+            return _impl->augment_mutation_call(timeout, std::move(mutations), std::move(tr_state), write_cl);
+        });
+    }
    return _impl->augment_mutation_call(timeout, std::move(mutations), std::move(tr_state), write_cl);
 }
--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -1933,7 +1933,11 @@ static future<compaction_result> scrub_sstables_validate_mode(sstables::compacti
    using scrub = sstables::compaction_type_options::scrub;
    if (validation_errors != 0 && descriptor.options.as<scrub>().quarantine_sstables == scrub::quarantine_invalid_sstables::yes) {
        for (auto& sst : descriptor.sstables) {
-            co_await sst->change_state(sstables::sstable_state::quarantine);
+            try {
+                co_await sst->change_state(sstables::sstable_state::quarantine);
+            } catch (...) {
+                clogger.error("Moving {} to quarantine failed due to {}, continuing.", sst->get_filename(), std::current_exception());
+            }
        }
    }

--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -1155,9 +1155,6 @@ future<> compaction_manager::drain() {

 future<> compaction_manager::stop() {
    do_stop();
-    if (auto cm = std::exchange(_task_manager_module, nullptr)) {
-        co_await cm->stop();
-    }
    if (_stop_future) {
        co_await std::exchange(*_stop_future, make_ready_future());
    }
@@ -1168,14 +1165,15 @@ future<> compaction_manager::really_do_stop() noexcept {
    // Reset the metrics registry
    _metrics.clear();
    co_await stop_ongoing_compactions("shutdown");
-    if (!_tasks.empty()) {
-        on_fatal_internal_error(cmlog, format("{} tasks still exist after being stopped", _tasks.size()));
-    }
+    co_await _task_manager_module->stop();
    co_await coroutine::parallel_for_each(_compaction_state | std::views::values, [] (compaction_state& cs) -> future<> {
        if (!cs.gate.is_closed()) {
            co_await cs.gate.close();
        }
    });
+    if (!_tasks.empty()) {
+        on_fatal_internal_error(cmlog, format("{} tasks still exist after being stopped", _tasks.size()));
+    }
    reevaluate_postponed_compactions();
    co_await std::move(_waiting_reevalution);
    co_await _sys_ks.close();
@@ -1839,8 +1837,21 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_sst
    if (!gh) {
        co_return compaction_stats_opt{};
    }
-    // All sstables must be included, even the ones being compacted, such that everything in table is validated.
-    auto all_sstables = get_all_sstables(t);
+
+    // Collect and register all sstables as compacting while compaction is disabled, to avoid a race condition where
+    // regular compaction runs in between and picks the same files.
+    std::vector<sstables::shared_sstable> all_sstables;
+    compacting_sstable_registration compacting(*this, get_compaction_state(&t));
+    co_await run_with_compaction_disabled(t, [&all_sstables, &compacting, &t] () -> future<> {
+        // All sstables must be included.
+        all_sstables = get_all_sstables(t);
+        compacting.register_compacting(all_sstables);
+        return make_ready_future<>();
+    });
+    if (all_sstables.empty()) {
+        co_return compaction_stats_opt{};
+    }
+
    co_return co_await perform_compaction<validate_sstables_compaction_task_executor>(throw_if_stopping::no, info, &t, info.id, std::move(all_sstables), quarantine_sstables);
 }

--- a/compaction/compaction_manager.hh
+++ b/compaction/compaction_manager.hh
@@ -301,6 +301,11 @@ public:
    // unless it is moved back to enabled state.
    future<> drain();

+    // Check if compaction manager is running, i.e. it was enabled or drained
+    bool is_running() const noexcept {
+        return _state == state::enabled || _state == state::disabled;
+    }
+
    using compaction_history_consumer = noncopyable_function<future<>(const db::compaction_history_entry&)>;
    future<> get_compaction_history(compaction_history_consumer&& f);

--- a/compress.cc
+++ b/compress.cc
@@ -24,6 +24,7 @@
 #include "sstables/sstable_compressor_factory.hh"
 #include "compress.hh"
 #include "exceptions/exceptions.hh"
+#include "utils/config_file_impl.hh"
 #include "utils/class_registrator.hh"
 #include "gms/feature_service.hh"

@@ -488,6 +489,8 @@ compression_parameters::compression_parameters(const std::map<sstring, sstring>&

    if (auto v = get_option(SSTABLE_COMPRESSION)) {
        _algorithm = name_to_algorithm(*v);
+    } else if (!options.empty()) {
+        throw exceptions::configuration_exception(seastar::format("Missing compression option '{}'", SSTABLE_COMPRESSION));
    } else {
        _algorithm = algorithm::none;
    }
@@ -511,7 +514,7 @@ compression_parameters::compression_parameters(const std::map<sstring, sstring>&
        try {
            _crc_check_chance = std::stod(*v);
        } catch (const std::exception& e) {
-            throw exceptions::syntax_exception(sstring("Invalid double value ") + *v + "for " + CRC_CHECK_CHANCE);
+            throw exceptions::syntax_exception(sstring("Invalid double value ") + *v + " for " + CRC_CHECK_CHANCE);
        }
    }

@@ -536,7 +539,7 @@ compression_parameters::compression_parameters(const std::map<sstring, sstring>&
    }
 }

-void compression_parameters::validate(dicts_feature_enabled dicts_enabled, dicts_usage_allowed dicts_allowed) {
+void compression_parameters::validate(dicts_feature_enabled dicts_enabled, dicts_usage_allowed dicts_allowed) const {
    if (_algorithm == algorithm::zstd_with_dicts || _algorithm == algorithm::lz4_with_dicts) {
        if (!dicts_enabled) {
            throw std::runtime_error(std::format("sstable_compression {} can't be used before "
@@ -593,6 +596,13 @@ std::map<sstring, sstring> compression_parameters::get_options() const {
    return opts;
 }

+std::istream& operator>>(std::istream& is, compression_parameters& cp) {
+    std::unordered_map<sstring, sstring> options_map;
+    is >> options_map;
+    cp = compression_parameters(options_map | std::ranges::to<std::map>());
+    return is;
+}
+
 lz4_processor::lz4_processor(cdict_ptr cdict, ddict_ptr ddict)
    : _cdict(std::move(cdict))
    , _ddict(std::move(ddict))
--- a/compress.hh
+++ b/compress.hh
@@ -107,7 +107,7 @@ public:

    using dicts_feature_enabled = bool_class<struct dicts_feature_enabled_tag>;
    using dicts_usage_allowed = bool_class<struct dicts_usage_allowed_tag>;
-    void validate(dicts_feature_enabled, dicts_usage_allowed);
+    void validate(dicts_feature_enabled, dicts_usage_allowed) const;

    std::map<sstring, sstring> get_options() const;

@@ -124,3 +124,13 @@ private:
    static void validate_options(const std::map<sstring, sstring>&);
    static algorithm name_to_algorithm(std::string_view name);
 };
+
+// Stream operator for boost::program_options support
+std::istream& operator>>(std::istream& is, compression_parameters& cp);
+
+template <>
+struct fmt::formatter<compression_parameters> : fmt::formatter<std::string_view> {
+    auto format(const compression_parameters& cp, fmt::format_context& ctx) const -> decltype(ctx.out()) {
+        return fmt::format_to(ctx.out(), "{}", cp.get_options());
+    }
+};
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -855,3 +855,18 @@ rf_rack_valid_keyspaces: false
 # Maximum number of items in single BatchWriteItem command. Default is 100.
 # Note: DynamoDB has a hard-coded limit of 25.
 # alternator_max_items_in_batch_write: 100
+
+# 
+# io-streaming rate limiting
+# When setting this value to be non-zero scylla throttles disk throughput for
+# stream (network) activities such as backup, repair, tablet migration and more.
+# This limit is useful for user queries so the network interface does 
+# not get saturated by streaming activities.
+# The recommended value is 75% of network bandwidth
+# E.g for i4i.8xlarge (https://github.com/scylladb/scylla-machine-image/tree/next/common/aws_net_params.json):
+# network: 18.75 GiB/s --> 18750 Mib/s --> 1875 MB/s (from network bits to network bytes: divide by 10, not 8)
+# Converted to disk bytes: 1875 * 1000 / 1024 = 1831 MB/s (disk wise)
+# 75% of disk bytes is: 0.75 * 1831 = 1373 megabytes/s
+# stream_io_throughput_mb_per_sec: 1373
+# 
+
--- a/configure.py
+++ b/configure.py
@@ -1043,7 +1043,6 @@ scylla_core = (['message/messaging_service.cc',
                'utils/s3/client.cc',
                'utils/s3/retryable_http_client.cc',
                'utils/s3/retry_strategy.cc',
-                'utils/s3/s3_retry_strategy.cc',
                'utils/s3/credentials_providers/aws_credentials_provider.cc',
                'utils/s3/credentials_providers/environment_aws_credentials_provider.cc',
                'utils/s3/credentials_providers/instance_profile_credentials_provider.cc',
@@ -1537,6 +1536,7 @@ deps['test/boost/combined_tests'] += [
    'test/boost/query_processor_test.cc',
    'test/boost/reader_concurrency_semaphore_test.cc',
    'test/boost/repair_test.cc',
+    'test/boost/replicator_test.cc',
    'test/boost/restrictions_test.cc',
    'test/boost/role_manager_test.cc',
    'test/boost/row_cache_test.cc',
@@ -1546,6 +1546,7 @@ deps['test/boost/combined_tests'] += [
    'test/boost/sessions_test.cc',
    'test/boost/sstable_compaction_test.cc',
    'test/boost/sstable_compressor_factory_test.cc',
+    'test/boost/sstable_compression_config_test.cc',
    'test/boost/sstable_directory_test.cc',
    'test/boost/sstable_set_test.cc',
    'test/boost/statement_restrictions_test.cc',
--- a/cql3/expr/expression.cc
+++ b/cql3/expr/expression.cc
@@ -1349,7 +1349,7 @@ static managed_bytes reserialize_value(View value_bytes,
    if (type.is_map()) {
        std::vector<std::pair<managed_bytes, managed_bytes>> elements = partially_deserialize_map(value_bytes);

-        const map_type_impl mapt = dynamic_cast<const map_type_impl&>(type);
+        const map_type_impl& mapt = dynamic_cast<const map_type_impl&>(type);
        const abstract_type& key_type = mapt.get_keys_type()->without_reversed();
        const abstract_type& value_type = mapt.get_values_type()->without_reversed();

@@ -1391,7 +1391,7 @@ static managed_bytes reserialize_value(View value_bytes,
        const vector_type_impl& vtype = dynamic_cast<const vector_type_impl&>(type);
        std::vector<managed_bytes> elements = vtype.split_fragmented(value_bytes);

-        auto elements_type = vtype.get_elements_type()->without_reversed();
+        const auto& elements_type = vtype.get_elements_type()->without_reversed();

        if (elements_type.bound_value_needs_to_be_reserialized()) {
            for (size_t i = 0; i < elements.size(); i++) {
--- a/cql3/statements/alter_keyspace_statement.cc
+++ b/cql3/statements/alter_keyspace_statement.cc
@@ -245,12 +245,18 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
            qp.db().real_database().validate_keyspace_update(*ks_md_update);

            service::topology_mutation_builder builder(ts);
+            service::topology_request_tracking_mutation_builder rtbuilder{global_request_id, qp.proxy().features().topology_requests_type_column};
+            rtbuilder.set("done", false)
+                     .set("start_time", db_clock::now());
            if (!qp.proxy().features().topology_global_request_queue) {
                builder.set_global_topology_request(service::global_topology_request::keyspace_rf_change);
                builder.set_global_topology_request_id(global_request_id);
                builder.set_new_keyspace_rf_change_data(_name, ks_options);
            } else {
                builder.queue_global_topology_request_id(global_request_id);
+                rtbuilder.set("request_type", service::global_topology_request::keyspace_rf_change)
+                         .set_new_keyspace_rf_change_data(_name, ks_options);
+
            };
            service::topology_change change{{builder.build()}};

@@ -259,13 +265,6 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
                return cm.to_mutation(topo_schema);
            });

-            service::topology_request_tracking_mutation_builder rtbuilder{global_request_id, qp.proxy().features().topology_requests_type_column};
-            rtbuilder.set("done", false)
-                     .set("start_time", db_clock::now())
-                     .set("request_type", service::global_topology_request::keyspace_rf_change);
-            if (qp.proxy().features().topology_global_request_queue) {
-                rtbuilder.set_new_keyspace_rf_change_data(_name, ks_options);
-            }
            service::topology_change req_change{{rtbuilder.build()}};

            auto topo_req_schema = qp.db().find_schema(db::system_keyspace::NAME, db::system_keyspace::TOPOLOGY_REQUESTS);
@@ -277,33 +276,44 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
            muts.insert(muts.begin(), schema_mutations.begin(), schema_mutations.end());
        }

+        auto rs = locator::abstract_replication_strategy::create_replication_strategy(
+                ks_md_update->strategy_name(),
+                locator::replication_strategy_params(ks_md_update->strategy_options(), ks_md_update->initial_tablets()));
+
        // If `rf_rack_valid_keyspaces` is enabled, it's forbidden to perform a schema change that
        // would lead to an RF-rack-valid keyspace. Verify that this change does not.
        // For more context, see: scylladb/scylladb#23071.
-        if (qp.db().get_config().rf_rack_valid_keyspaces()) {
-            auto rs = locator::abstract_replication_strategy::create_replication_strategy(
-                    ks_md_update->strategy_name(),
-                    locator::replication_strategy_params(ks_md_update->strategy_options(), ks_md_update->initial_tablets()));
-
-            try {
-                // There are two things to note here:
-                // 1. We hold a group0_guard, so it's correct to check this here.
-                //    The topology or schema cannot change while we're performing this query.
-                // 2. The replication strategy we use here does NOT represent the actual state
-                //    we will arrive at after applying the schema change. For instance, if the user
-                //    did not specify the RF for some of the DCs, it's equal to 0 in the replication
-                //    strategy we pass to this function, while in reality that means that the RF
-                //    will NOT change. That is not a problem:
-                //    - RF=0 is valid for all DCs, so it won't trigger an exception on its own,
-                //    - the keyspace must've been RF-rack-valid before this change. We check that
-                //      condition for all keyspaces at startup.
-                //    The second hyphen is not really true because currently topological changes can
-                //    disturb it (see scylladb/scylladb#23345), but we ignore that.
-                locator::assert_rf_rack_valid_keyspace(_name, tmptr, *rs);
-            } catch (const std::exception& e) {
+        try {
+            // There are two things to note here:
+            // 1. We hold a group0_guard, so it's correct to check this here.
+            //    The topology or schema cannot change while we're performing this query.
+            // 2. The replication strategy we use here does NOT represent the actual state
+            //    we will arrive at after applying the schema change. For instance, if the user
+            //    did not specify the RF for some of the DCs, it's equal to 0 in the replication
+            //    strategy we pass to this function, while in reality that means that the RF
+            //    will NOT change. That is not a problem:
+            //    - RF=0 is valid for all DCs, so it won't trigger an exception on its own,
+            //    - the keyspace must've been RF-rack-valid before this change. We check that
+            //      condition for all keyspaces at startup.
+            //    The second hyphen is not really true because currently topological changes can
+            //    disturb it (see scylladb/scylladb#23345), but we ignore that.
+            locator::assert_rf_rack_valid_keyspace(_name, tmptr, *rs);
+        } catch (const std::exception& e) {
+            if (qp.db().get_config().rf_rack_valid_keyspaces()) {
                // There's no guarantee what the type of the exception will be, so we need to
                // wrap it manually here in a type that can be passed to the user.
                throw exceptions::invalid_request_exception(e.what());
+            } else {
+                // Even when the configuration option `rf_rack_valid_keyspaces` is set to false,
+                // we'd like to inform the user that the keyspace they're altering will not
+                // satisfy the restriction after the change--but just as a warning.
+                // For more context, see issue: scylladb/scylladb#23330.
+                warnings.push_back(seastar::format(
+                    "Keyspace '{}' is not RF-rack-valid: the replication factor doesn't match "
+                    "the rack count in at least one datacenter. A rack failure may reduce availability. "
+                    "For more context, see: "
+                    "https://docs.scylladb.com/manual/stable/reference/glossary.html#term-RF-rack-valid-keyspace.",
+                    _name));
            }
        }

--- a/cql3/statements/alter_table_statement.cc
+++ b/cql3/statements/alter_table_statement.cc
@@ -8,6 +8,7 @@
 * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
 */

+#include "cdc/log.hh"
 #include "utils/assert.hh"
 #include <seastar/core/coroutine.hh>
 #include "cql3/query_options.hh"
@@ -27,6 +28,7 @@
 #include "db/view/view.hh"
 #include "cql3/query_processor.hh"
 #include "cdc/cdc_extension.hh"
+#include "cdc/cdc_partitioner.hh"

 namespace cql3 {

@@ -290,6 +292,53 @@ std::pair<schema_ptr, std::vector<view_ptr>> alter_table_statement::prepare_sche
        throw exceptions::invalid_request_exception("Cannot use ALTER TABLE on Materialized View");
    }

+    const bool is_cdc_log_table = cdc::is_log_for_some_table(db.real_database(), s->ks_name(), s->cf_name());
+    // Only a CDC log table will have this partitioner name. User tables should
+    // not be able to set this. Note that we perform a similar check when trying to
+    // re-enable CDC for a table, when the log table has been replaced by a user table.
+    // For better visualization of the above, consider this
+    //
+    // cqlsh> CREATE TABLE ks.t (p int PRIMARY KEY, v int) WITH cdc = {'enabled': true};
+    // cqlsh> INSERT INTO ks.t (p, v) VALUES (1, 2);
+    // cqlsh> ALTER TABLE ks.t WITH cdc = {'enabled': false};
+    // cqlsh> DESC TABLE ks.t_scylla_cdc_log WITH INTERNALS; # Save this output!
+    // cqlsh> DROP TABLE ks.t_scylla_cdc_log;
+    // cqlsh> [Recreate the log table using the received statement]
+    // cqlsh> ALTER TABLE ks.t WITH cdc = {'enabled': true};
+    //
+    // InvalidRequest: Error from server: code=2200 [Invalid query] message="Cannot create CDC log
+    //                 table for table ks.t because a table of name ks.t_scylla_cdc_log already exists"
+    //
+    // See commit adda43edc75b901b2329bca8f3eb74596698d05f for more information on THAT case.
+    // We reuse the same technique here.
+    const bool was_cdc_log_table = s->get_partitioner().name() == cdc::cdc_partitioner::classname;
+
+    if (_column_changes.size() != 0 && is_cdc_log_table) {
+        throw exceptions::invalid_request_exception(
+                "You cannot modify the set of columns of a CDC log table directly. "
+                "Modify the base table instead.");
+    }
+    if (_column_changes.size() != 0 && was_cdc_log_table) {
+        throw exceptions::invalid_request_exception(
+                "You cannot modify the set of columns of a CDC log table directly. "
+                "Although the base table has deactivated CDC, this table will continue being "
+                "a CDC log table until it is dropped. If you want to modify the columns in it, "
+                "you can only do that by reenabling CDC on the base table, which will reattach "
+                "this log table. Then you will be able to modify the columns in the base table, "
+                "and that will have effect on the log table too. Modifying the columns of a CDC "
+                "log table directly is never allowed.");
+    }
+
+    if (_renames.size() != 0 && is_cdc_log_table) {
+        throw exceptions::invalid_request_exception("Cannot rename a column of a CDC log table.");
+    }
+    if (_renames.size() != 0 && was_cdc_log_table) {
+        throw exceptions::invalid_request_exception(
+                "You cannot rename a column of a CDC log table. Although the base table "
+                "has deactivated CDC, this table will continue being a CDC log table until it "
+                "is dropped.");
+    }
+
    auto cfm = schema_builder(s);

    if (_properties->get_id()) {
--- a/cql3/statements/create_keyspace_statement.cc
+++ b/cql3/statements/create_keyspace_statement.cc
@@ -124,15 +124,26 @@ future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, std::vector
        // If `rf_rack_valid_keyspaces` is enabled, it's forbidden to create an RF-rack-invalid keyspace.
        // Verify that it's RF-rack-valid.
        // For more context, see: scylladb/scylladb#23071.
-        if (cfg.rf_rack_valid_keyspaces()) {
-            try {
-                // We hold a group0_guard, so it's correct to check this here.
-                // The topology or schema cannot change while we're performing this query.
-                locator::assert_rf_rack_valid_keyspace(_name, tmptr, *rs);
-            } catch (const std::exception& e) {
+        try {
+            // We hold a group0_guard, so it's correct to check this here.
+            // The topology or schema cannot change while we're performing this query.
+            locator::assert_rf_rack_valid_keyspace(_name, tmptr, *rs);
+        } catch (const std::exception& e) {
+            if (cfg.rf_rack_valid_keyspaces()) {
                // There's no guarantee what the type of the exception will be, so we need to
                // wrap it manually here in a type that can be passed to the user.
                throw exceptions::invalid_request_exception(e.what());
+            } else {
+                // Even when the configuration option `rf_rack_valid_keyspaces` is set to false,
+                // we'd like to inform the user that the keyspace they're creating does not
+                // satisfy the restriction--but just as a warning.
+                // For more context, see issue: scylladb/scylladb#23330.
+                warnings.push_back(seastar::format(
+                    "Keyspace '{}' is not RF-rack-valid: the replication factor doesn't match "
+                    "the rack count in at least one datacenter. A rack failure may reduce availability. "
+                    "For more context, see: "
+                    "https://docs.scylladb.com/manual/stable/reference/glossary.html#term-RF-rack-valid-keyspace.",
+                    _name));
            }
        }
    } catch (const exceptions::already_exists_exception& e) {
--- a/cql3/statements/create_table_statement.cc
+++ b/cql3/statements/create_table_statement.cc
@@ -31,6 +31,8 @@
 #include "db/config.hh"
 #include "compaction/time_window_compaction_strategy.hh"

+bool is_internal_keyspace(std::string_view name);
+
 namespace cql3 {

 namespace statements {
@@ -122,6 +124,10 @@ void create_table_statement::apply_properties_to(schema_builder& builder, const
        addColumnMetadataFromAliases(cfmd, Collections.singletonList(valueAlias), defaultValidator, ColumnDefinition.Kind.COMPACT_VALUE);
 #endif

+    if (!_properties->get_compression_options() && !is_internal_keyspace(keyspace())) {
+        builder.set_compressor_params(db.get_config().sstable_compression_user_table_options());
+    }
+
    _properties->apply_to_builder(builder, _properties->make_schema_extensions(db.extensions()), db, keyspace());
 }

--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -233,7 +233,10 @@ future<> select_statement::check_access(query_processor& qp, const service::clie
    try {
        const data_dictionary::database db = qp.db();
        auto&& s = db.find_schema(keyspace(), column_family());
-        auto& cf_name = s->is_view() ? s->view_info()->base_name() : column_family();
+        auto cdc = db.get_cdc_base_table(*s);
+        auto& cf_name = s->is_view()
+            ? s->view_info()->base_name()
+            : (cdc ? cdc->cf_name() : column_family());
        co_await state.has_column_family_access(keyspace(), cf_name, auth::permission::SELECT);
    } catch (const data_dictionary::no_such_column_family& e) {
        // Will be validated afterwards.
--- a/db/batchlog_manager.cc
+++ b/db/batchlog_manager.cc
@@ -18,6 +18,7 @@
 #include <seastar/core/sleep.hh>

 #include "batchlog_manager.hh"
+#include "data_dictionary/data_dictionary.hh"
 #include "mutation/canonical_mutation.hh"
 #include "service/storage_proxy.hh"
 #include "system_keyspace.hh"
@@ -36,7 +37,7 @@

 static logging::logger blogger("batchlog_manager");

-const uint32_t db::batchlog_manager::replay_interval;
+const std::chrono::seconds db::batchlog_manager::replay_interval;
 const uint32_t db::batchlog_manager::page_size;

 db::batchlog_manager::batchlog_manager(cql3::query_processor& qp, db::system_keyspace& sys_ks, batchlog_manager_config config)
@@ -58,27 +59,31 @@ db::batchlog_manager::batchlog_manager(cql3::query_processor& qp, db::system_key
    });
 }

-future<> db::batchlog_manager::do_batch_log_replay(post_replay_cleanup cleanup) {
-    return container().invoke_on(0, [cleanup] (auto& bm) -> future<> {
+future<db::all_batches_replayed> db::batchlog_manager::do_batch_log_replay(post_replay_cleanup cleanup) {
+    return container().invoke_on(0, [cleanup] (auto& bm) -> future<db::all_batches_replayed> {
        auto gate_holder = bm._gate.hold();
        auto sem_units = co_await get_units(bm._sem, 1);

        auto dest = bm._cpu++ % smp::count;
        blogger.debug("Batchlog replay on shard {}: starts", dest);
        auto last_replay = gc_clock::now();
+        all_batches_replayed all_replayed = all_batches_replayed::yes;
        if (dest == 0) {
-            co_await bm.replay_all_failed_batches(cleanup);
+            all_replayed = co_await bm.replay_all_failed_batches(cleanup);
        } else {
-            co_await bm.container().invoke_on(dest, [cleanup] (auto& bm) {
+            all_replayed = co_await bm.container().invoke_on(dest, [cleanup] (auto& bm) {
                return with_gate(bm._gate, [&bm, cleanup] {
                    return bm.replay_all_failed_batches(cleanup);
                });
            });
        }
-        co_await bm.container().invoke_on_all([last_replay] (auto& bm) {
-            bm._last_replay = last_replay;
-        });
+        if (all_replayed == all_batches_replayed::yes) {
+            co_await bm.container().invoke_on_all([last_replay] (auto& bm) {
+                bm._last_replay = last_replay;
+            });
+        }
        blogger.debug("Batchlog replay on shard {}: done", dest);
+        co_return all_replayed;
    });
 }

@@ -116,7 +121,8 @@ future<> db::batchlog_manager::batchlog_replay_loop() {
        } catch (...) {
            blogger.error("Exception in batch replay: {}", std::current_exception());
        }
-        delay = std::chrono::milliseconds(replay_interval);
+        delay = utils::get_local_injector().is_enabled("short_batchlog_manager_replay_interval") ?
+                std::chrono::seconds(1) : replay_interval;
    }
 }

@@ -132,6 +138,8 @@ future<> db::batchlog_manager::drain() {
        _sem.broken();
    }

+    co_await _qp.proxy().abort_batch_writes();
+
    co_await std::move(_loop_done);
    blogger.info("Drained");
 }
@@ -155,116 +163,127 @@ db_clock::duration db::batchlog_manager::get_batch_log_timeout() const {
    return _write_request_timeout * 2;
 }

-future<> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cleanup) {
+future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cleanup) {
    typedef db_clock::rep clock_type;

+    db::all_batches_replayed all_replayed = all_batches_replayed::yes;
    // rate limit is in bytes per second. Uses Double.MAX_VALUE if disabled (set to 0 in cassandra.yaml).
    // max rate is scaled by the number of nodes in the cluster (same as for HHOM - see CASSANDRA-5272).
    auto throttle = _replay_rate / _qp.proxy().get_token_metadata_ptr()->count_normal_token_owners();
    auto limiter = make_lw_shared<utils::rate_limiter>(throttle);

-    auto batch = [this, limiter](const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
+    auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
+    auto delete_batch = [this, schema = std::move(schema)] (utils::UUID id) {
+        auto key = partition_key::from_singular(*schema, id);
+        mutation m(schema, key);
+        auto now = service::client_state(service::client_state::internal_tag()).get_timestamp();
+        m.partition().apply_delete(*schema, clustering_key_prefix::make_empty(), tombstone(now, gc_clock::now()));
+        return _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
+    };
+
+    auto batch = [this, limiter, delete_batch = std::move(delete_batch), &all_replayed](const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
        auto written_at = row.get_as<db_clock::time_point>("written_at");
        auto id = row.get_as<utils::UUID>("id");
        // enough time for the actual write + batchlog entry mutation delivery (two separate requests).
+        auto now = db_clock::now();
        auto timeout = get_batch_log_timeout();
-        if (db_clock::now() < written_at + timeout) {
-            blogger.debug("Skipping replay of {}, too fresh", id);
-            return make_ready_future<stop_iteration>(stop_iteration::no);
+
+        if (utils::get_local_injector().is_enabled("skip_batch_replay")) {
+            blogger.debug("Skipping batch replay due to skip_batch_replay injection");
+            all_replayed = all_batches_replayed::no;
+            co_return stop_iteration::no;
        }

        // check version of serialization format
        if (!row.has("version")) {
            blogger.warn("Skipping logged batch because of unknown version");
-            return make_ready_future<stop_iteration>(stop_iteration::no);
+            co_await delete_batch(id);
+            co_return stop_iteration::no;
        }

        auto version = row.get_as<int32_t>("version");
        if (version != netw::messaging_service::current_version) {
-            blogger.warn("Skipping logged batch because of incorrect version");
-            return make_ready_future<stop_iteration>(stop_iteration::no);
+            blogger.warn("Skipping logged batch because of incorrect version {}; current version = {}", version, netw::messaging_service::current_version);
+            co_await delete_batch(id);
+            co_return stop_iteration::no;
        }

        auto data = row.get_blob_unfragmented("data");

        blogger.debug("Replaying batch {}", id);

-        auto fms = make_lw_shared<std::deque<canonical_mutation>>();
-        auto in = ser::as_input_stream(data);
-        while (in.size()) {
-            fms->emplace_back(ser::deserialize(in, std::type_identity<canonical_mutation>()));
-        }
-
-        auto size = data.size();
-
-        return map_reduce(*fms, [this, written_at] (canonical_mutation& fm) {
-            const auto& cf = _qp.proxy().local_db().find_column_family(fm.column_family_id());
-            return make_ready_future<canonical_mutation*>(written_at > cf.get_truncation_time() ? &fm : nullptr);
-        },
-        std::vector<mutation>(),
-        [this] (std::vector<mutation> mutations, canonical_mutation* fm) {
-            if (fm) {
-                schema_ptr s = _qp.db().find_schema(fm->column_family_id());
-                mutations.emplace_back(fm->to_mutation(s));
+        try {
+            auto fms = make_lw_shared<std::deque<canonical_mutation>>();
+            auto in = ser::as_input_stream(data);
+            while (in.size()) {
+                fms->emplace_back(ser::deserialize(in, std::type_identity<canonical_mutation>()));
+                schema_ptr s = _qp.db().find_schema(fms->back().column_family_id());
+                timeout = std::min(timeout, std::chrono::duration_cast<db_clock::duration>(s->tombstone_gc_options().propagation_delay_in_seconds()));
            }
-            return mutations;
-        }).then([this, limiter, written_at, size, fms] (std::vector<mutation> mutations) {
-            if (mutations.empty()) {
-                return make_ready_future<>();
+
+            if (now < written_at + timeout) {
+                blogger.debug("Skipping replay of {}, too fresh", id);
+                co_return stop_iteration::no;
            }
-            const auto ttl = [written_at]() -> clock_type {
-                /*
-                 * Calculate ttl for the mutations' hints (and reduce ttl by the time the mutations spent in the batchlog).
-                 * This ensures that deletes aren't "undone" by an old batch replay.
-                 */
-                auto unadjusted_ttl = std::numeric_limits<gc_clock::rep>::max();
-                warn(unimplemented::cause::HINT);
-#if 0
-                for (auto& m : *mutations) {
-                    unadjustedTTL = Math.min(unadjustedTTL, HintedHandOffManager.calculateHintTTL(mutation));
+
+            auto size = data.size();
+
+            auto mutations = co_await map_reduce(*fms, [this, written_at] (canonical_mutation& fm) {
+                const auto& cf = _qp.proxy().local_db().find_column_family(fm.column_family_id());
+                return make_ready_future<canonical_mutation*>(written_at > cf.get_truncation_time() ? &fm : nullptr);
+            },
+            std::vector<mutation>(),
+            [this] (std::vector<mutation> mutations, canonical_mutation* fm) {
+                if (fm) {
+                    schema_ptr s = _qp.db().find_schema(fm->column_family_id());
+                    mutations.emplace_back(fm->to_mutation(s));
                }
-#endif
-                return unadjusted_ttl - std::chrono::duration_cast<gc_clock::duration>(db_clock::now() - written_at).count();
-            }();
-
-            if (ttl <= 0) {
-                return make_ready_future<>();
-            }
-            // Origin does the send manually, however I can't see a super great reason to do so.
-            // Our normal write path does not add much redundancy to the dispatch, and rate is handled after send
-            // in both cases.
-            // FIXME: verify that the above is reasonably true.
-            return limiter->reserve(size).then([this, mutations = std::move(mutations)] {
-                _stats.write_attempts += mutations.size();
-                // #1222 - change cl level to ALL, emulating origins behaviour of sending/hinting
-                // to all natural end points.
-                // Note however that origin uses hints here, and actually allows for this
-                // send to partially or wholly fail in actually sending stuff. Since we don't
-                // have hints (yet), send with CL=ALL, and hope we can re-do this soon.
-                // See below, we use retry on write failure.
-                return _qp.proxy().mutate(mutations, db::consistency_level::ALL, db::no_timeout, nullptr, empty_service_permit(), db::allow_per_partition_rate_limit::no);
+                return mutations;
            });
-        }).then_wrapped([this, id](future<> batch_result) {
-            try {
-                batch_result.get();
-            } catch (data_dictionary::no_such_keyspace& ex) {
-                // should probably ignore and drop the batch
-            } catch (...) {
-                blogger.warn("Replay failed (will retry): {}", std::current_exception());
-                // timeout, overload etc.
-                // Do _not_ remove the batch, assuning we got a node write error.
-                // Since we don't have hints (which origin is satisfied with),
-                // we have to resort to keeping this batch to next lap.
-                return make_ready_future<>();
+
+            if (!mutations.empty()) {
+                const auto ttl = [written_at]() -> clock_type {
+                    /*
+                    * Calculate ttl for the mutations' hints (and reduce ttl by the time the mutations spent in the batchlog).
+                    * This ensures that deletes aren't "undone" by an old batch replay.
+                    */
+                    auto unadjusted_ttl = std::numeric_limits<gc_clock::rep>::max();
+                    warn(unimplemented::cause::HINT);
+#if 0
+                    for (auto& m : *mutations) {
+                        unadjustedTTL = Math.min(unadjustedTTL, HintedHandOffManager.calculateHintTTL(mutation));
+                    }
+#endif
+                    return unadjusted_ttl - std::chrono::duration_cast<gc_clock::duration>(db_clock::now() - written_at).count();
+                }();
+
+                if (ttl > 0) {
+                    // Origin does the send manually, however I can't see a super great reason to do so.
+                    // Our normal write path does not add much redundancy to the dispatch, and rate is handled after send
+                    // in both cases.
+                    // FIXME: verify that the above is reasonably true.
+                    co_await limiter->reserve(size);
+                        _stats.write_attempts += mutations.size();
+                        auto timeout = db::timeout_clock::now() + write_timeout;
+                        co_await _qp.proxy().send_batchlog_replay_to_all_replicas(std::move(mutations), timeout);
+                }
            }
-            // delete batch
-            auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
-            auto key = partition_key::from_singular(*schema, id);
-            mutation m(schema, key);
-            auto now = service::client_state(service::client_state::internal_tag()).get_timestamp();
-            m.partition().apply_delete(*schema, clustering_key_prefix::make_empty(), tombstone(now, gc_clock::now()));
-            return _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
-        }).then([] { return make_ready_future<stop_iteration>(stop_iteration::no); });
+        } catch (data_dictionary::no_such_keyspace& ex) {
+            // should probably ignore and drop the batch
+        } catch (const data_dictionary::no_such_column_family&) {
+            // As above -- we should drop the batch if the table doesn't exist anymore.
+        } catch (...) {
+            blogger.warn("Replay failed (will retry): {}", std::current_exception());
+            all_replayed = all_batches_replayed::no;
+            // timeout, overload etc.
+            // Do _not_ remove the batch, assuning we got a node write error.
+            // Since we don't have hints (which origin is satisfied with),
+            // we have to resort to keeping this batch to next lap.
+            co_return stop_iteration::no;
+        }
+        // delete batch
+        co_await delete_batch(id);
+        co_return stop_iteration::no;
    };

    co_await with_gate(_gate, [this, cleanup, batch = std::move(batch)] () mutable -> future<> {
@@ -286,4 +305,6 @@ future<> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cle
            blogger.debug("Finished replayAllFailedBatches");
        });
    });
+
+    co_return all_replayed;
 }
--- a/db/batchlog_manager.hh
+++ b/db/batchlog_manager.hh
@@ -31,6 +31,8 @@ namespace db {

 class system_keyspace;

+using all_batches_replayed = bool_class<struct all_batches_replayed_tag>;
+
 struct batchlog_manager_config {
    std::chrono::duration<double> write_request_timeout;
    uint64_t replay_rate = std::numeric_limits<uint64_t>::max();
@@ -43,8 +45,9 @@ public:
    using post_replay_cleanup = bool_class<class post_replay_cleanup_tag>;

 private:
-    static constexpr uint32_t replay_interval = 60 * 1000; // milliseconds
+    static constexpr std::chrono::seconds replay_interval = std::chrono::seconds(60);
    static constexpr uint32_t page_size = 128; // same as HHOM, for now, w/out using any heuristics. TODO: set based on avg batch size.
+    static constexpr std::chrono::seconds write_timeout = std::chrono::seconds(300);

    using clock_type = lowres_clock;

@@ -68,7 +71,7 @@ private:

    gc_clock::time_point _last_replay;

-    future<> replay_all_failed_batches(post_replay_cleanup cleanup);
+    future<all_batches_replayed> replay_all_failed_batches(post_replay_cleanup cleanup);
 public:
    // Takes a QP, not a distributes. Because this object is supposed
    // to be per shard and does no dispatching beyond delegating the the
@@ -79,7 +82,7 @@ public:
    future<> drain();
    future<> stop();

-    future<> do_batch_log_replay(post_replay_cleanup cleanup);
+    future<all_batches_replayed> do_batch_log_replay(post_replay_cleanup cleanup);

    future<size_t> count_all_batches() const;
    db_clock::duration get_batch_log_timeout() const;
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -501,6 +501,9 @@ public:
    void flush_segments(uint64_t size_to_remove);
    void check_no_data_older_than_allowed();

+    // whitebox testing
+    std::function<future<>()> _oversized_pre_wait_memory_func;
+
 private:
    class shutdown_marker{};

@@ -800,6 +803,8 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
    void end_flush() {
        _segment_manager->end_flush();
        if (can_delete()) {
+            // #25709 - do this early if possible
+            _extended_segments.clear();
            _segment_manager->discard_unused_segments();
        }
    }
@@ -875,6 +880,8 @@ public:
    void release_cf_count(const cf_id_type& cf) {
        mark_clean(cf, 1);
        if (can_delete()) {
+            // #25709 - do this early if possible
+            _extended_segments.clear();
            _segment_manager->discard_unused_segments();
        }
    }
@@ -1592,8 +1599,15 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ

    scope_increment_counter allocating(totals.active_allocations);

+    // #27992 - whitebox testing. signal we are trying to lock out 
+    // all allocators
+    if (_oversized_pre_wait_memory_func) {
+        co_await _oversized_pre_wait_memory_func();
+    }
+
    auto permit = co_await std::move(fut);
-    SCYLLA_ASSERT(_request_controller.available_units() == 0);
+    // #27992 - task reordering _can_ force the available units to negative. this is ok.
+    SCYLLA_ASSERT(_request_controller.available_units() <= 0);

    decltype(permit) fake_permit; // can't have allocate+sync release semaphore.
    bool failed = false;
@@ -1854,13 +1868,15 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
            }
        }
    }
-    SCYLLA_ASSERT(_request_controller.available_units() == 0);
+
+    auto avail = _request_controller.available_units();
+    SCYLLA_ASSERT(avail <= 0);
    SCYLLA_ASSERT(permit.count() == max_request_controller_units());
    auto nw = _request_controller.waiters();
    permit.return_all();
    // #20633 cannot guarantee controller avail is now full, since we could have had waiters when doing
    // return all -> now will be less avail
-    SCYLLA_ASSERT(nw > 0 || _request_controller.available_units() == ssize_t(max_request_controller_units()));
+    SCYLLA_ASSERT(nw > 0 || _request_controller.available_units() == (avail + ssize_t(max_request_controller_units())));

    if (!failed) {
        clogger.trace("Oversized allocation succeeded.");
@@ -1969,13 +1985,13 @@ future<> db::commitlog::segment_manager::replenish_reserve() {
            }
            continue;
        } catch (shutdown_marker&) {
-            _reserve_segments.abort(std::current_exception());
            break;
        } catch (...) {
            clogger.warn("Exception in segment reservation: {}", std::current_exception());
        }
        co_await sleep(100ms);
    }
+    _reserve_segments.abort(std::make_exception_ptr(shutdown_marker()));
 }

 future<std::vector<db::commitlog::descriptor>>
@@ -2576,20 +2592,24 @@ struct fmt::formatter<db::commitlog::segment::cf_mark> {
 void db::commitlog::segment_manager::discard_unused_segments() noexcept {
    clogger.trace("Checking for unused segments ({} active)", _segments.size());

-    std::erase_if(_segments, [=](sseg_ptr s) {
-        if (s->can_delete()) {
-            clogger.debug("Segment {} is unused", *s);
-            return true;
-        }
-        if (s->is_still_allocating()) {
-            clogger.debug("Not safe to delete segment {}; still allocating.", *s);
-        } else if (!s->is_clean()) {
-            clogger.debug("Not safe to delete segment {}; dirty is {}", *s, segment::cf_mark {*s});
-        } else {
-            clogger.debug("Not safe to delete segment {}; disk ops pending", *s);
-        }
-        return false;
-    });
+    // #25709 ensure we don't free any segment until after prune.
+    {
+        auto tmp = _segments; 
+        std::erase_if(_segments, [=](sseg_ptr s) {
+            if (s->can_delete()) {
+                clogger.debug("Segment {} is unused", *s);
+                return true;
+            }
+            if (s->is_still_allocating()) {
+                clogger.debug("Not safe to delete segment {}; still allocating.", *s);
+            } else if (!s->is_clean()) {
+                clogger.debug("Not safe to delete segment {}; dirty is {}", *s, segment::cf_mark {*s});
+            } else {
+                clogger.debug("Not safe to delete segment {}; disk ops pending", *s);
+            }
+            return false;
+        });
+    }

    // launch in background, but guard with gate so this deletion is
    // sure to finish in shutdown, because at least through this path,
@@ -2878,7 +2898,10 @@ future<> db::commitlog::segment_manager::do_pending_deletes() {
 }

 future<> db::commitlog::segment_manager::orphan_all() {
-    _segments.clear();
+    // #25709. the actual process of destroying the elements here
+    // might cause a call into discard_unused_segments.
+    // ensure the target vector is empty when we get to destructors
+    auto tmp = std::exchange(_segments, {});
    return clear_reserve_segments();
 }

@@ -3255,9 +3278,13 @@ const db::commitlog::config& db::commitlog::active_config() const {
    return _segment_manager->cfg;
 }

+db::commitlog::segment_data_corruption_error::segment_data_corruption_error(std::string_view msg, uint64_t s)
+    : _msg(fmt::format("Segment data corruption: {}", msg))
+    , _bytes(s)
+{}

-db::commitlog::segment_truncation::segment_truncation(uint64_t pos) 
-    : _msg(fmt::format("Segment truncation at {}", pos))
+db::commitlog::segment_truncation::segment_truncation(std::string_view reason, uint64_t pos)
+    : _msg(fmt::format("Segment truncation at {}. Reason: {}", pos, reason))
    , _pos(pos)
 {}

@@ -3447,7 +3474,8 @@ db::commitlog::read_log_file(const replay_state& state, sstring filename, sstrin

            while (rem < size) {
                if (eof) {
-                    throw segment_truncation(block_boundry);
+                    auto reason = fmt::format("unexpected EOF, rem={}, size={}", rem, size);
+                    throw segment_truncation(std::move(reason), block_boundry);
                }

                auto block_size = alignment - initial.size_bytes();
@@ -3458,7 +3486,8 @@ db::commitlog::read_log_file(const replay_state& state, sstring filename, sstrin

                if (tmp.size_bytes() == 0) {
                    eof = true;
-                    throw segment_truncation(block_boundry);
+                    auto reason = fmt::format("read 0 bytes, while tried to read {}", block_size);
+                    throw segment_truncation(std::move(reason), block_boundry);
                }

                crc32_nbo crc;
@@ -3493,10 +3522,12 @@ db::commitlog::read_log_file(const replay_state& state, sstring filename, sstrin
                    auto checksum = crc.checksum();

                    if (check != checksum) {
-                        throw segment_data_corruption_error("Data corruption", alignment);
+                        auto reason = fmt::format("checksums do not match: {:x} vs. {:x}", check, checksum);
+                        throw segment_data_corruption_error(std::move(reason), alignment);
                    }
                    if (id != this->id) {
-                        throw segment_truncation(pos + rem);
+                        auto reason = fmt::format("IDs do not match: {} vs. {}", id, this->id);
+                        throw segment_truncation(std::move(reason), pos + rem);
                    }
                }
                tmp.remove_suffix(detail::sector_overhead_size);
@@ -3604,6 +3635,10 @@ db::commitlog::read_log_file(const replay_state& state, sstring filename, sstrin
            auto old = pos;
            pos = next_pos(off);
            clogger.trace("Pos {} -> {} ({})", old, pos, off);
+            // #24346 check eof status whenever we move file pos.
+            if (pos >= file_size) {
+                eof = true;
+            }
        }

        future<> read_entry() {
@@ -3771,7 +3806,8 @@ db::commitlog::read_log_file(const replay_state& state, sstring filename, sstrin
                    co_await read_chunk();
                }
                if (corrupt_size > 0) {
-                    throw segment_data_corruption_error("Data corruption", corrupt_size);
+                    auto reason = fmt::format("corrupted size while reading file: {}", corrupt_size);
+                    throw segment_data_corruption_error(std::move(reason), corrupt_size);
                }
            } catch (...) {
                p = std::current_exception();
@@ -3918,6 +3954,9 @@ void db::commitlog::update_max_data_lifetime(std::optional<uint64_t> commitlog_d
    _segment_manager->cfg.commitlog_data_max_lifetime_in_seconds = commitlog_data_max_lifetime_in_seconds;
 }

+void db::commitlog::set_oversized_pre_wait_memory_func(std::function<future<>()> f) {
+    _segment_manager->_oversized_pre_wait_memory_func = std::move(f);
+}

 future<std::vector<sstring>> db::commitlog::get_segments_to_replay() const {
    return _segment_manager->get_segments_to_replay();
--- a/db/commitlog/commitlog.hh
+++ b/db/commitlog/commitlog.hh
@@ -385,6 +385,9 @@ public:
    // (Re-)set data mix lifetime.
    void update_max_data_lifetime(std::optional<uint64_t> commitlog_data_max_lifetime_in_seconds);

+    // Whitebox testing. Do not use for production
+    void set_oversized_pre_wait_memory_func(std::function<future<>()>);
+
    using commit_load_reader_func = std::function<future<>(buffer_and_replay_position)>;

    class segment_error : public std::exception {};
@@ -392,9 +395,7 @@ public:
    class segment_data_corruption_error: public segment_error {
        std::string _msg;
    public:
-        segment_data_corruption_error(std::string msg, uint64_t s)
-                : _msg(std::move(msg)), _bytes(s) {
-        }
+        segment_data_corruption_error(std::string_view msg, uint64_t s);
        uint64_t bytes() const {
            return _bytes;
        }
@@ -425,7 +426,7 @@ public:
        std::string _msg;
        uint64_t _pos;
    public:
-        segment_truncation(uint64_t);
+        segment_truncation(std::string_view reason, uint64_t position);

        uint64_t position() const;
        const char* what() const noexcept override;
--- a/db/config.cc
+++ b/db/config.cc
@@ -32,11 +32,15 @@
 #include "db/tags/extension.hh"
 #include "config.hh"
 #include "extensions.hh"
+#include "compress.hh"
 #include "utils/log.hh"
 #include "service/tablet_allocator_fwd.hh"
 #include "utils/config_file_impl.hh"
+#include "exceptions/exceptions.hh"
 #include <seastar/core/metrics_api.hh>
 #include <seastar/core/relabel_config.hh>
+
+static logging::logger cfglogger("config");
 #include <seastar/util/file.hh>

 namespace utils {
@@ -86,6 +90,12 @@ object_storage_endpoints_to_json(const std::vector<db::object_storage_endpoint_p
    return value_to_json(m);
 }

+static
+json::json_return_type
+uuid_to_json(const db::config::UUID& uuid) {
+    return value_to_json(format("{}", uuid));
+}
+
 // Convert a value that can be printed with fmt::format, or a vector of
 // such values, to JSON. An example is enum_option<T>, because enum_option<T>
 // has a specialization for fmt::formatter.
@@ -111,6 +121,12 @@ error_injection_list_to_json(const std::vector<db::config::error_injection_at_st
    return value_to_json("error_injection_list");
 }

+static
+json::json_return_type
+compression_parameters_to_json(const compression_parameters& cp) {
+    return value_to_json(cp.get_options());
+}
+
 template <>
 bool
 config_from_string(std::string_view value) {
@@ -294,6 +310,18 @@ const config_type& config_type_for<std::vector<db::object_storage_endpoint_param
    return ct;
 }

+template <>
+const config_type& config_type_for<db::config::UUID>() {
+    static config_type ct("UUID", uuid_to_json);
+    return ct;
+}
+
+template <>
+const config_type& config_type_for<compression_parameters>() {
+    static config_type ct("compression parameters", compression_parameters_to_json);
+    return ct;
+}
+
 }

 namespace YAML {
@@ -491,6 +519,50 @@ struct convert<db::object_storage_endpoint_param> {
    }
 };

+template<>
+struct convert<utils::UUID> {
+    static bool decode(const Node& node, utils::UUID& uuid) {
+        std::string uuid_string;
+        if (!convert<std::string>::decode(node, uuid_string)) {
+            return false;
+        }
+        try {
+            std::istringstream(uuid_string) >> uuid;
+        } catch (boost::program_options::invalid_option_value&) {
+            return false;
+        }
+        return true;
+    }
+};
+
+template<>
+struct convert<compression_parameters> {
+    static bool decode(const Node& node, compression_parameters& cp) {
+        if (!node.IsMap()) {
+            return false;
+        }
+
+        std::map<sstring, sstring> options;
+        for (const auto& kv : node) {
+            options[kv.first.as<sstring>()] = kv.second.as<sstring>();
+        }
+
+        try {
+            cp = compression_parameters(options);
+            return true;
+        } catch (const exceptions::syntax_exception& e) {
+            cfglogger.error("Invalid compression parameters syntax: {}", e.what());
+            return false;
+        } catch (const exceptions::configuration_exception& e) {
+            cfglogger.error("Invalid compression parameters configuration: {}", e.what());
+            return false;
+        } catch (const std::runtime_error& e) {
+            cfglogger.error("Error parsing compression parameters: {}", e.what());
+            return false;
+        }
+    }
+};
+
 }

 #if defined(DEBUG)
@@ -819,7 +891,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , inter_dc_stream_throughput_outbound_megabits_per_sec(this, "inter_dc_stream_throughput_outbound_megabits_per_sec", value_status::Unused, 0,
        "Throttles all streaming file transfer between the data centers. This setting allows throttles streaming throughput betweens data centers in addition to throttling all network stream traffic as configured with stream_throughput_outbound_megabits_per_sec.")
    , stream_io_throughput_mb_per_sec(this, "stream_io_throughput_mb_per_sec", liveness::LiveUpdate, value_status::Used, 0,
-        "Throttles streaming I/O to the specified total throughput (in MiBs/s) across the entire system. Streaming I/O includes the one performed by repair and both RBNO and legacy topology operations such as adding or removing a node. Setting the value to 0 disables stream throttling.")
+        "Throttles streaming I/O to the specified total throughput (in MiBs/s) across the entire system. Streaming I/O includes the one performed by repair and both RBNO and legacy topology operations such as adding or removing a node. Setting the value to 0 disables stream throttling. It is recommended to set the value for this parameter to be 75% of network bandwidth")
    , stream_plan_ranges_fraction(this, "stream_plan_ranges_fraction", liveness::LiveUpdate, value_status::Used, 0.1,
        "Specify the fraction of ranges to stream in a single stream plan. Value is between 0 and 1.")
    , enable_file_stream(this, "enable_file_stream", liveness::LiveUpdate, value_status::Used, true, "Set true to use file based stream for tablet instead of mutation based stream")
@@ -942,6 +1014,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "The default timeout for other, miscellaneous operations.\n"
        "\n"
        "Related information: About hinted handoff writes")
+    , request_timeout_on_shutdown_in_seconds(this, "request_timeout_on_shutdown_in_seconds", value_status::Used, 30,
+        "Timeout for CQL server requests on shutdown. After this timeout the server will shutdown all connections.")
    , group0_raft_op_timeout_in_ms(this, "group0_raft_op_timeout_in_ms", liveness::LiveUpdate, value_status::Used, 60000,
            "The time in milliseconds that group0 allows a Raft operation to complete.")
    /**
@@ -1230,7 +1304,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , sstable_summary_ratio(this, "sstable_summary_ratio", value_status::Used, 0.0005, "Enforces that 1 byte of summary is written for every N (2000 by default)"
        "bytes written to data file. Value must be between 0 and 1.")
    , components_memory_reclaim_threshold(this, "components_memory_reclaim_threshold", liveness::LiveUpdate, value_status::Used, .2, "Ratio of available memory for all in-memory components of SSTables in a shard beyond which the memory will be reclaimed from components until it falls back under the threshold. Currently, this limit is only enforced for bloom filters.")
-    , large_memory_allocation_warning_threshold(this, "large_memory_allocation_warning_threshold", value_status::Used, (size_t(128) << 10) + 1, "Warn about memory allocations above this size; set to zero to disable.")
+    , large_memory_allocation_warning_threshold(this, "large_memory_allocation_warning_threshold", value_status::Used, size_t(1) << 20, "Warn about memory allocations above this size; set to zero to disable.")
    , enable_deprecated_partitioners(this, "enable_deprecated_partitioners", value_status::Used, false, "Enable the byteordered and random partitioners. These partitioners are deprecated and will be removed in a future version.")
    , enable_keyspace_column_family_metrics(this, "enable_keyspace_column_family_metrics", value_status::Used, false, "Enable per keyspace and per column family metrics reporting.")
    , enable_node_aggregated_table_metrics(this, "enable_node_aggregated_table_metrics", value_status::Used, true, "Enable aggregated per node, per keyspace and per table metrics reporting, applicable if enable_keyspace_column_family_metrics is false.")
@@ -1243,6 +1317,14 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , enable_sstables_mc_format(this, "enable_sstables_mc_format", value_status::Unused, true, "Enable SSTables 'mc' format to be used as the default file format.  Deprecated, please use \"sstable_format\" instead.")
    , enable_sstables_md_format(this, "enable_sstables_md_format", value_status::Unused, true, "Enable SSTables 'md' format to be used as the default file format.  Deprecated, please use \"sstable_format\" instead.")
    , sstable_format(this, "sstable_format", value_status::Used, "me", "Default sstable file format", {"md", "me"})
+    , sstable_compression_user_table_options(this, "sstable_compression_user_table_options", value_status::Used, compression_parameters{},
+        "Server-global user table compression options. If enabled, all user tables"
+        "will be compressed using the provided options, unless overridden"
+        "by compression options in the table schema. The available options are:\n"
+        "* sstable_compression: The compression algorithm to use. Supported values: LZ4Compressor (default), LZ4WithDictsCompressor, SnappyCompressor, DeflateCompressor, ZstdCompressor, ZstdWithDictsCompressor, '' (empty string; disables compression).\n"
+        "* chunk_length_in_kb: (Default: 4) The size of chunks to compress in kilobytes. Allowed values are powers of two between 1 and 128.\n"
+        "* crc_check_chance: (Default: 1.0) Not implemented (option value is ignored).\n"
+        "* compression_level: (Default: 3) Compression level for ZstdCompressor and ZstdWithDictsCompressor. Higher levels provide better compression ratios at the cost of speed. Allowed values are integers between 1 and 22.")
    , sstable_compression_dictionaries_allow_in_ddl(this, "sstable_compression_dictionaries_allow_in_ddl", liveness::LiveUpdate, value_status::Used, true,
        "Allows for configuring tables to use SSTable compression with shared dictionaries. "
        "If the option is disabled, Scylla will reject CREATE and ALTER statements which try to set dictionary-based sstable compressors.\n"
@@ -1399,7 +1481,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "The maximum fraction of cache memory permitted for use by index cache. Clamped to the [0.0; 1.0] range. Must be small enough to not deprive the row cache of memory, but should be big enough to fit a large fraction of the index. The default value 0.2 means that at least 80\% of cache memory is reserved for the row cache, while at most 20\% is usable by the index cache.")
    , consistent_cluster_management(this, "consistent_cluster_management", value_status::Deprecated, true, "Use RAFT for cluster management and DDL.")
    , force_gossip_topology_changes(this, "force_gossip_topology_changes", value_status::Used, false, "Force gossip-based topology operations in a fresh cluster. Only the first node in the cluster must use it. The rest will fall back to gossip-based operations anyway. This option should be used only for testing.  Note: gossip topology changes are incompatible with tablets.")
-    , recovery_leader(this, "recovery_leader", liveness::LiveUpdate, value_status::Used, "", "Host ID of the node restarted first while performing the Manual Raft-based Recovery Procedure. Warning: this option disables some guardrails for the needs of the Manual Raft-based Recovery Procedure. Make sure you unset it at the end of the procedure.")
+    , recovery_leader(this, "recovery_leader", liveness::LiveUpdate, value_status::Used, utils::null_uuid(), "Host ID of the node restarted first while performing the Manual Raft-based Recovery Procedure. Warning: this option disables some guardrails for the needs of the Manual Raft-based Recovery Procedure. Make sure you unset it at the end of the procedure.")
    , wasm_cache_memory_fraction(this, "wasm_cache_memory_fraction", value_status::Used, 0.01, "Maximum total size of all WASM instances stored in the cache as fraction of total shard memory.")
    , wasm_cache_timeout_in_ms(this, "wasm_cache_timeout_in_ms", value_status::Used, 5000, "Time after which an instance is evicted from the cache.")
    , wasm_cache_instance_size_limit(this, "wasm_cache_instance_size_limit", value_status::Used, 1024*1024, "Instances with size above this limit will not be stored in the cache.")
--- a/db/config.hh
+++ b/db/config.hh
@@ -25,6 +25,7 @@
 #include "utils/error_injection.hh"
 #include "utils/dict_trainer.hh"
 #include "utils/advanced_rpc_compressor.hh"
+#include "compress.hh"

 namespace boost::program_options {

@@ -207,6 +208,7 @@ public:
    using seed_provider_type = db::seed_provider_type;
    using hinted_handoff_enabled_type = db::hints::host_filter;
    using error_injection_at_startup = db::error_injection_at_startup;
+    using UUID = utils::UUID;

    /*
     * All values and documentation taken from
@@ -322,6 +324,7 @@ public:
    named_value<uint32_t> truncate_request_timeout_in_ms;
    named_value<uint32_t> write_request_timeout_in_ms;
    named_value<uint32_t> request_timeout_in_ms;
+    named_value<uint32_t> request_timeout_on_shutdown_in_seconds;
    named_value<uint32_t> group0_raft_op_timeout_in_ms;
    named_value<bool> cross_node_timeout;
    named_value<uint32_t> internode_send_buff_size_in_bytes;
@@ -436,6 +439,7 @@ public:
    named_value<bool> enable_sstables_mc_format;
    named_value<bool> enable_sstables_md_format;
    named_value<sstring> sstable_format;
+    named_value<compression_parameters> sstable_compression_user_table_options;
    named_value<bool> sstable_compression_dictionaries_allow_in_ddl;
    named_value<bool> sstable_compression_dictionaries_enable_writing;
    named_value<float> sstable_compression_dictionaries_memory_budget_fraction;
@@ -521,7 +525,7 @@ public:

    named_value<bool> consistent_cluster_management;
    named_value<bool> force_gossip_topology_changes;
-    named_value<sstring> recovery_leader;
+    named_value<UUID> recovery_leader;

    named_value<double> wasm_cache_memory_fraction;
    named_value<uint32_t> wasm_cache_timeout_in_ms;
--- a/db/hints/internal/hint_endpoint_manager.cc
+++ b/db/hints/internal/hint_endpoint_manager.cc
@@ -65,18 +65,18 @@ future<> hint_endpoint_manager::do_store_hint(schema_ptr s, lw_shared_ptr<const
        const replay_position rp = rh.release();
        if (_last_written_rp < rp) {
            _last_written_rp = rp;
-            manager_logger.debug("[{}] Updated last written replay position to {}", end_point_key(), rp);
+            manager_logger.trace("hint_endpoint_manager[{}]:do_store_hint: Updated last written replay position to {}", end_point_key(), rp);
        }

        ++shard_stats().written;

-        manager_logger.trace("Hint to {} was stored", end_point_key());
+        manager_logger.trace("hint_endpoint_manager[{}]:do_store_hint: Hint has been stored", end_point_key());
        tracing::trace(tr_state, "Hint to {} was stored", end_point_key());
    } catch (...) {
        ++shard_stats().errors;
        const auto eptr = std::current_exception();

-        manager_logger.debug("store_hint(): got the exception when storing a hint to {}: {}", end_point_key(), eptr);
+        manager_logger.debug("hint_endpoint_manager[{}]:do_store_hint: Exception when storing a hint: {}", end_point_key(), eptr);
        tracing::trace(tr_state, "Failed to store a hint to {}: {}", end_point_key(), eptr);
    }

@@ -92,7 +92,7 @@ bool hint_endpoint_manager::store_hint(schema_ptr s, lw_shared_ptr<const frozen_
            return do_store_hint(std::move(s), std::move(fm), tr_state);
        });
    } catch (...) {
-        manager_logger.trace("Failed to store a hint to {}: {}", end_point_key(), std::current_exception());
+        manager_logger.trace("hint_endpoint_manager[{}]:store_hint: Failed to store a hint: {}", end_point_key(), std::current_exception());
        tracing::trace(tr_state, "Failed to store a hint to {}: {}", end_point_key(), std::current_exception());

        ++shard_stats().dropped;
@@ -109,16 +109,23 @@ future<> hint_endpoint_manager::populate_segments_to_replay() {
 }

 void hint_endpoint_manager::start() {
+    manager_logger.debug("hint_endpoint_manager[{}]:start: Starting", end_point_key());
+
    clear_stopped();
    allow_hints();
    _sender.start();
+
+    manager_logger.debug("hint_endpoint_manager[{}]:start: Finished", end_point_key());
 }

 future<> hint_endpoint_manager::stop(drain should_drain) noexcept {
-    if(stopped()) {
+    if (stopped()) {
+        manager_logger.warn("hint_endpoint_manager[{}]:stop: Stop had already been called", end_point_key());
        return make_exception_future<>(std::logic_error(format("ep_manager[{}]: stop() is called twice", _key).c_str()));
    }

+    manager_logger.debug("hint_endpoint_manager[{}]:stop: Starting", end_point_key());
+
    return seastar::async([this, should_drain] {
        std::exception_ptr eptr;

@@ -139,10 +146,11 @@ future<> hint_endpoint_manager::stop(drain should_drain) noexcept {
        }).handle_exception([&eptr] (auto e) { eptr = std::move(e); }).get();

        if (eptr) {
-            manager_logger.error("ep_manager[{}]: exception: {}", _key, eptr);
+            manager_logger.error("hint_endpoint_manager[{}]:stop: Exception occurred: {}", _key, eptr);
        }

        set_stopped();
+        manager_logger.debug("hint_endpoint_manager[{}]:stop: Finished", end_point_key());
    });
 }

@@ -194,7 +202,7 @@ future<hints_store_ptr> hint_endpoint_manager::get_or_load() {
 }

 future<db::commitlog> hint_endpoint_manager::add_store() noexcept {
-    manager_logger.trace("Going to add a store to {}", _hints_dir.c_str());
+    manager_logger.debug("hint_endpoint_manager[{}]:add_store: Going to add a store: {}", end_point_key(), _hints_dir.native());

    return futurize_invoke([this] {
        return io_check([name = _hints_dir.c_str()] { return recursive_touch_directory(name); }).then([this] () {
@@ -240,7 +248,7 @@ future<db::commitlog> hint_endpoint_manager::add_store() noexcept {
            // which is larger than the segment ID of the RP of the last written hint.
            cfg.base_segment_id = _last_written_rp.base_id();

-            return commitlog::create_commitlog(std::move(cfg)).then([this] (commitlog l) -> future<commitlog> {
+            return commitlog::create_commitlog(std::move(cfg)).then([this] (this auto, commitlog l) -> future<commitlog> {
                // add_store() is triggered every time hint files are forcefully flushed to I/O (every hints_flush_period).
                // When this happens we want to refill _sender's segments only if it has finished with the segments he had before.
                if (_sender.have_segments()) {
@@ -289,6 +297,8 @@ future<db::commitlog> hint_endpoint_manager::add_store() noexcept {
                    _sender.add_segment(std::move(seg));
                }

+                manager_logger.debug("hint_endpoint_manager[{}]:add_store: Finished", end_point_key());
+
                co_return l;
            });
        });
--- a/db/hints/internal/hint_sender.cc
+++ b/db/hints/internal/hint_sender.cc
@@ -56,8 +56,8 @@ future<> hint_sender::flush_maybe() noexcept {
    if (current_time >= _next_flush_tp) {
        return _ep_manager.flush_current_hints().then([this, current_time] {
            _next_flush_tp = current_time + manager::hints_flush_period;
-        }).handle_exception([] (auto eptr) {
-            manager_logger.trace("flush_maybe() failed: {}", eptr);
+        }).handle_exception([this] (auto eptr) {
+            manager_logger.debug("hint_sender[{}]:flush_maybe: Failed with {}", _ep_key, eptr);
            return make_ready_future<>();
        });
    }
@@ -115,7 +115,7 @@ const column_mapping& hint_sender::get_column_mapping(lw_shared_ptr<send_one_fil
            throw no_column_mapping(fm.schema_version());
        }

-        manager_logger.debug("new schema version {}", fm.schema_version());
+        manager_logger.trace("hint_sender[{}]:get_column_mapping: new schema version {}", _ep_key, fm.schema_version());
        cm_it = ctx_ptr->schema_ver_to_column_mapping.emplace(fm.schema_version(), *hr.get_column_mapping()).first;
    }

@@ -175,23 +175,22 @@ future<> hint_sender::stop(drain should_drain) noexcept {
            //
            // The next call for send_hints_maybe() will send the last hints to the current end point and when it is
            // done there is going to be no more pending hints and the corresponding hints directory may be removed.
-            manager_logger.trace("Draining for {}: start", end_point_key());
+            manager_logger.trace("hint_sender[{}]:stop: Draining starts", end_point_key());
            set_draining();
            send_hints_maybe();
-            _ep_manager.flush_current_hints().handle_exception([] (auto e) {
-                manager_logger.error("Failed to flush pending hints: {}. Ignoring...", e);
+            _ep_manager.flush_current_hints().handle_exception([this] (auto e) {
+                manager_logger.error("hint_sender[{}]:stop: Failed to flush pending hints: {}. Ignoring", _ep_key, e);
            }).get();
            send_hints_maybe();
-            manager_logger.trace("Draining for {}: end", end_point_key());
+            manager_logger.trace("hint_sender[{}]:stop: Draining finished", end_point_key());
        }
-        // TODO: Change this log to match the class name, but first make sure no test
-        //       relies on the old one.
-        manager_logger.trace("ep_manager({})::sender: exiting", end_point_key());
+
+        manager_logger.debug("hint_sender[{}]:stop: Finished", end_point_key());
    });
 }

 void hint_sender::cancel_draining() {
-    manager_logger.info("Draining of {} has been marked as canceled", _ep_key);
+    manager_logger.info("hint_sender[{}]:cancel_draining: Marking as canceled", _ep_key);
    if (_state.contains(state::draining)) {
        _state.remove(state::draining);
    }
@@ -222,9 +221,8 @@ void hint_sender::start() {

    attr.sched_group = _hints_cpu_sched_group;
    _stopped = seastar::async(std::move(attr), [this] {
-        // TODO: Change this log to match the class name, but first make sure no test
-        //       relies on the old one.
-        manager_logger.trace("ep_manager({})::sender: started", end_point_key());
+        manager_logger.debug("hint_sender[{}]:start: Starting", end_point_key());
+
        while (!stopping()) {
            try {
                flush_maybe().get();
@@ -237,34 +235,36 @@ void hint_sender::start() {
                break;
            } catch (...) {
                // log and keep on spinning
-                // TODO: Change this log to match the class name, but first make sure no test
-                //       relies on the old one.
-                manager_logger.trace("sender: got the exception: {}", std::current_exception());
+                manager_logger.debug("hint_sender[{}]:start: Exception in the loop: {}", _ep_key, std::current_exception());
            }
        }
+
+        manager_logger.debug("hint_sender[{}]:start: Exited the loop", _ep_key);
    });
 }

 future<> hint_sender::send_one_mutation(frozen_mutation_and_schema m) {
    auto ermp = _db.find_column_family(m.s).get_effective_replication_map();
    auto token = dht::get_token(*m.s, m.fm.key());
-    host_id_vector_replica_set natural_endpoints = ermp->get_natural_replicas(std::move(token));
+    host_id_vector_replica_set natural_endpoints = ermp->get_natural_replicas(token);
+    host_id_vector_topology_change pending_endpoints  = ermp->get_pending_replicas(token);

-    return futurize_invoke([this, m = std::move(m), ermp = std::move(ermp), &natural_endpoints] () mutable -> future<> {
+    return futurize_invoke([this, m = std::move(m), ermp = std::move(ermp), &natural_endpoints, &pending_endpoints] () mutable -> future<> {
        // The fact that we send with CL::ALL in both cases below ensures that new hints are not going
        // to be generated as a result of hints sending.
        const auto& tm = ermp->get_token_metadata();
        const auto dst = end_point_key();

        if (std::ranges::contains(natural_endpoints, dst) && !tm.is_leaving(dst)) {
-            manager_logger.trace("Sending directly to {}", dst);
-            return _proxy.send_hint_to_endpoint(std::move(m), std::move(ermp), dst);
+            manager_logger.trace("hint_sender[{}]:send_one_mutation: Sending directly", dst);
+            // dst is not duplicated in pending_endpoints because it's in natural_endpoints
+            return _proxy.send_hint_to_endpoint(std::move(m), std::move(ermp), dst, std::move(pending_endpoints));
        } else {
            if (manager_logger.is_enabled(log_level::trace)) {
                if (tm.is_leaving(end_point_key())) {
-                    manager_logger.trace("The original target endpoint {} is leaving. Mutating from scratch...", dst);
+                    manager_logger.trace("hint_sender[{}]:send_one_mutation: Original target is leaving. Mutating from scratch", dst);
                } else {
-                    manager_logger.trace("Endpoints set has changed and {} is no longer a replica. Mutating from scratch...", dst);
+                    manager_logger.trace("hint_sender[{}]:send_one_mutation: Endpoint set has changed and original target is no longer a replica. Mutating from scratch", dst);
                }
            }
            return _proxy.send_hint_to_all_replicas(std::move(m));
@@ -288,9 +288,9 @@ future<> hint_sender::send_one_hint(lw_shared_ptr<send_one_file_ctx> ctx_ptr, fr
                // Files are aggregated for at most manager::hints_timer_period therefore the oldest hint there is
                // (last_modification - manager::hints_timer_period) old.
                if (const auto now = gc_clock::now().time_since_epoch(); now - secs_since_file_mod > gc_grace_sec - manager::hints_flush_period) {
-                    manager_logger.debug("send_hints(): the hint is too old, skipping it, "
+                    manager_logger.trace("hint_sender[{}]:send_hints: Hint is too old, skipping it, "
                        "secs since file last modification {}, gc_grace_sec {}, hints_flush_period {}",
-                        now - secs_since_file_mod, gc_grace_sec, manager::hints_flush_period);
+                        _ep_key, now - secs_since_file_mod, gc_grace_sec, manager::hints_flush_period);
                    return make_ready_future<>();
                }

@@ -299,24 +299,24 @@ future<> hint_sender::send_one_hint(lw_shared_ptr<send_one_file_ctx> ctx_ptr, fr
                    ++this->shard_stats().sent_total;
                    this->shard_stats().sent_hints_bytes_total += mutation_size;
                }).handle_exception([this, ctx_ptr] (auto eptr) {
-                    manager_logger.trace("send_one_hint(): failed to send to {}: {}", end_point_key(), eptr);
+                    manager_logger.trace("hint_sender[{}]:send_one_hint: Failed to send: {}", end_point_key(), eptr);
                    ++this->shard_stats().send_errors;
                    return make_exception_future<>(std::move(eptr));
                });

            // ignore these errors and move on - probably this hint is too old and the KS/CF has been deleted...
            } catch (replica::no_such_column_family& e) {
-                manager_logger.debug("send_hints(): no_such_column_family: {}", e.what());
+                manager_logger.debug("hint_sender[{}]:send_one_hint: no_such_column_family: {}", _ep_key, e.what());
                ++this->shard_stats().discarded;
            } catch (replica::no_such_keyspace& e) {
-                manager_logger.debug("send_hints(): no_such_keyspace: {}", e.what());
+                manager_logger.debug("hint_sender[{}]:send_one_hint: no_such_keyspace: {}", _ep_key, e.what());
                ++this->shard_stats().discarded;
            } catch (no_column_mapping& e) {
-                manager_logger.debug("send_hints(): {} at {}: {}", fname, rp, e.what());
+                manager_logger.debug("hint_sender[{}]:send_one_hint: no_column_mapping: {} at {}: {}", _ep_key, fname, rp, e.what());
                ++this->shard_stats().discarded;
            } catch (...) {
                auto eptr = std::current_exception();
-                manager_logger.debug("send_hints(): unexpected error in file {} at {}: {}", fname, rp, eptr);
+                manager_logger.debug("hint_sender[{}]:send_one_hint: Unexpected error in file {} at {}: {}", _ep_key, fname, rp, eptr);
                ++this->shard_stats().send_errors;
                return make_exception_future<>(std::move(eptr));
            }
@@ -338,21 +338,24 @@ future<> hint_sender::send_one_hint(lw_shared_ptr<send_one_file_ctx> ctx_ptr, fr
            }
            f.ignore_ready_future();
        });
-    }).handle_exception([ctx_ptr, rp] (auto eptr) {
-        manager_logger.trace("send_one_file(): Hmmm. Something bad had happened: {}", eptr);
+    }).handle_exception([this, ctx_ptr, rp] (auto eptr) {
+        manager_logger.trace("hint_sender[{}]:send_one_hint: Exception occurred: {}", _ep_key, eptr);
        ctx_ptr->on_hint_send_failure(rp);
    });
 }

 void hint_sender::notify_replay_waiters() noexcept {
    if (!_foreign_segments_to_replay.empty()) {
-        manager_logger.trace("[{}] notify_replay_waiters(): not notifying because there are still {} foreign segments to replay", end_point_key(), _foreign_segments_to_replay.size());
+        manager_logger.trace("hint_sender[{}]:notify_replay_waiters: Not notifying because there are still {} foreign segments to replay",
+                end_point_key(), _foreign_segments_to_replay.size());
        return;
    }

-    manager_logger.trace("[{}] notify_replay_waiters(): replay position upper bound was updated to {}", end_point_key(), _sent_upper_bound_rp);
+    manager_logger.trace("hint_sender[{}]:notify_replay_waiters: Replay position upper bound was updated to {}", end_point_key(), _sent_upper_bound_rp);
    while (!_replay_waiters.empty() && _replay_waiters.begin()->first < _sent_upper_bound_rp) {
-        manager_logger.trace("[{}] notify_replay_waiters(): notifying one ({} < {})", end_point_key(), _replay_waiters.begin()->first, _sent_upper_bound_rp);
+        manager_logger.trace("hint_sender[{}]:notify_replay_waiters: Notifying one ({} < {})",
+                end_point_key(), _replay_waiters.begin()->first, _sent_upper_bound_rp);
+
        auto ptr = _replay_waiters.begin()->second;
        (**ptr).set_value();
        (*ptr) = std::nullopt; // Prevent it from being resolved by abort source subscription
@@ -362,7 +365,7 @@ void hint_sender::notify_replay_waiters() noexcept {

 void hint_sender::dismiss_replay_waiters() noexcept {
    for (auto& p : _replay_waiters) {
-        manager_logger.debug("[{}] dismiss_replay_waiters(): dismissing one", end_point_key());
+        manager_logger.debug("hint_sender[{}]:dismiss_replay_waiters: Dismissing one", end_point_key());
        auto ptr = p.second;
        (**ptr).set_exception(std::runtime_error(format("Hints manager for {} is stopping", end_point_key())));
        (*ptr) = std::nullopt; // Prevent it from being resolved by abort source subscription
@@ -371,14 +374,15 @@ void hint_sender::dismiss_replay_waiters() noexcept {
 }

 future<> hint_sender::wait_until_hints_are_replayed_up_to(abort_source& as, db::replay_position up_to_rp) {
-    manager_logger.debug("[{}] wait_until_hints_are_replayed_up_to(): entering with target {}", end_point_key(), up_to_rp);
+    manager_logger.debug("hint_sender[{}]:wait_until_hints_are_replayed_up_to: Entering with target {}", end_point_key(), up_to_rp);
    if (_foreign_segments_to_replay.empty() && up_to_rp < _sent_upper_bound_rp) {
-        manager_logger.debug("[{}] wait_until_hints_are_replayed_up_to(): hints were already replayed above the point ({} < {})", end_point_key(), up_to_rp, _sent_upper_bound_rp);
+        manager_logger.debug("hint_sender[{}]:wait_until_hints_are_replayed_up_to: Hints were already replayed above the point ({} < {})",
+                end_point_key(), up_to_rp, _sent_upper_bound_rp);
        return make_ready_future<>();
    }

    if (as.abort_requested()) {
-        manager_logger.debug("[{}] wait_until_hints_are_replayed_up_to(): already aborted - stopping", end_point_key());
+        manager_logger.debug("hint_sender[{}]:wait_until_hints_are_replayed_up_to: Already aborted - stopping", end_point_key());
        return make_exception_future<>(abort_requested_exception());
    }

@@ -389,7 +393,7 @@ future<> hint_sender::wait_until_hints_are_replayed_up_to(abort_source& as, db::
            // The promise already was resolved by `notify_replay_waiters` and removed from the map
            return;
        }
-        manager_logger.debug("[{}] wait_until_hints_are_replayed_up_to(): abort requested - stopping", end_point_key());
+        manager_logger.debug("hint_sender[{}]:wait_until_hints_are_replayed_up_to: Abort requested - stopping", end_point_key());
        _replay_waiters.erase(it);
        (**ptr).set_exception(abort_requested_exception());
    });
@@ -398,7 +402,7 @@ future<> hint_sender::wait_until_hints_are_replayed_up_to(abort_source& as, db::
    // therefore we cannot capture `this`
    auto ep = end_point_key();
    return (**ptr).get_future().finally([sub = std::move(sub), ep] {
-        manager_logger.debug("[{}] wait_until_hints_are_replayed_up_to(): returning after the future was satisfied", ep);
+        manager_logger.debug("hint_sender[{}]:wait_until_hints_are_replayed_up_to: Returning after the future was satisfied", ep);
    });
 }

@@ -470,7 +474,7 @@ bool hint_sender::send_one_file(const sstring& fname) {
                }

                if (canceled_draining()) {
-                    manager_logger.debug("[{}] Exiting reading from commitlog because of canceled draining", _ep_key);
+                    manager_logger.debug("hint_sender[{}]:send_one_file: Exiting reading from commitlog because of canceled draining", _ep_key);
                    // We need to throw an exception here to cancel reading the segment.
                    throw canceled_draining_exception{};
                }
@@ -502,13 +506,15 @@ bool hint_sender::send_one_file(const sstring& fname) {
            };
        }, _last_not_complete_rp.pos, &_db.extensions()).get();
    } catch (db::commitlog::segment_error& ex) {
-        manager_logger.error("{}: {}. Dropping...", fname, ex.what());
+        manager_logger.error("hint_sender[{}]:send_one_file: Segment error in {}: {}. Last not complete position={}",
+                _ep_key, fname, ex.what(), _last_not_complete_rp);
        ctx_ptr->segment_replay_failed = false;
        ++this->shard_stats().corrupted_files;
    } catch  (const canceled_draining_exception&) {
-        manager_logger.debug("[{}] Loop in send_one_file finishes due to canceled draining", _ep_key);
+        manager_logger.debug("hint_sender[{}]:send_one_file: Loop in send_one_file finishes due to canceled draining", _ep_key);
    } catch (...) {
-        manager_logger.trace("sending of {} failed: {}", fname, std::current_exception());
+        manager_logger.debug("hint_sender[{}]:send_one_file: Sending of {} failed: {}. Last not complete position={}",
+                _ep_key, fname, std::current_exception(), _last_not_complete_rp);
        ctx_ptr->segment_replay_failed = true;
    }

@@ -523,7 +529,7 @@ bool hint_sender::send_one_file(const sstring& fname) {

    // If we are draining ignore failures and drop the segment even if we failed to send it.
    if (draining() && ctx_ptr->segment_replay_failed) {
-        manager_logger.trace("send_one_file(): we are draining so we are going to delete the segment anyway");
+        manager_logger.debug("hint_sender[{}]:send_one_file: We are draining, so we are going to delete the segment anyway", _ep_key);
        ctx_ptr->segment_replay_failed = false;
    }

@@ -533,7 +539,7 @@ bool hint_sender::send_one_file(const sstring& fname) {
        // If there was an error thrown by read_log_file function itself, we will retry sending from
        // the last hint that was successfully sent (last_succeeded_rp).
        _last_not_complete_rp = ctx_ptr->first_failed_rp.value_or(ctx_ptr->last_succeeded_rp.value_or(_last_not_complete_rp));
-        manager_logger.trace("send_one_file(): error while sending hints from {}, last RP is {}", fname, _last_not_complete_rp);
+        manager_logger.debug("hint_sender[{}]:send_one_file: Error while sending hints from {}, last RP is {}", _ep_key, fname, _last_not_complete_rp);
        return false;
    }

@@ -546,7 +552,7 @@ bool hint_sender::send_one_file(const sstring& fname) {
    // clear the replay position - we are going to send the next segment...
    _last_not_complete_rp = replay_position();
    _last_schema_ver_to_column_mapping.clear();
-    manager_logger.trace("send_one_file(): segment {} was sent in full and deleted", fname);
+    manager_logger.debug("hint_sender[{}]:send_one_file: Segment {} has been sent in full and deleted", _ep_key, fname);
    return true;
 }

@@ -572,14 +578,15 @@ void hint_sender::pop_current_segment() {
 // Runs in the seastar::async context
 void hint_sender::send_hints_maybe() noexcept {
    using namespace std::literals::chrono_literals;
-    manager_logger.trace("send_hints(): going to send hints to {}, we have {} segment to replay", end_point_key(), _segments_to_replay.size() + _foreign_segments_to_replay.size());
+    manager_logger.trace("hint_sender[{}]:send_hints_maybe: Going to send hints. We have {} segment to replay",
+            end_point_key(), _segments_to_replay.size() + _foreign_segments_to_replay.size());

    int replayed_segments_count = 0;

    try {
        while (true) {
            if (canceled_draining()) {
-                manager_logger.debug("[{}] Exiting loop in send_hints_maybe because of canceled draining", _ep_key);
+                manager_logger.debug("hint_sender[{}]:send_hints_maybe: Exiting loop in send_hints_maybe because of canceled draining", _ep_key);
                break;
            }
            const sstring* seg_name = name_of_current_segment();
@@ -598,7 +605,7 @@ void hint_sender::send_hints_maybe() noexcept {
    // Ignore exceptions, we will retry sending this file from where we left off the next time.
    // Exceptions are not expected here during the regular operation, so just log them.
    } catch (...) {
-        manager_logger.trace("send_hints(): got the exception: {}", std::current_exception());
+        manager_logger.debug("hint_sender[{}]:send_hints_maybe: Exception occurred while sending: {}", _ep_key, std::current_exception());
    }

    if (have_segments()) {
@@ -609,7 +616,7 @@ void hint_sender::send_hints_maybe() noexcept {
        _next_send_retry_tp = _next_flush_tp;
    }

-    manager_logger.trace("send_hints(): we handled {} segments", replayed_segments_count);
+    manager_logger.debug("hint_sender[{}]:send_hints_maybe: We handled {} segments", _ep_key, replayed_segments_count);
 }

 hint_stats& hint_sender::shard_stats() {
--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -505,20 +505,20 @@ bool manager::can_hint_for(endpoint_id ep) const noexcept {
    // hints where N is the total number nodes in the cluster.
    const auto hipf = hints_in_progress_for(ep);
    if (_stats.size_of_hints_in_progress > max_size_of_hints_in_progress() && hipf > 0) {
-        manager_logger.trace("size_of_hints_in_progress {} hints_in_progress_for({}) {}",
+        manager_logger.trace("can_hint_for: size_of_hints_in_progress {} hints_in_progress_for({}) {}",
                _stats.size_of_hints_in_progress, ep, hipf);
        return false;
    }

    // Check that the destination DC is "hintable".
    if (!check_dc_for(ep)) {
-        manager_logger.trace("{}'s DC is not hintable", ep);
+        manager_logger.trace("can_hint_for: {}'s DC is not hintable", ep);
        return false;
    }

    const bool node_is_alive = local_gossiper().get_endpoint_downtime(ep) <= _max_hint_window_us;
    if (!node_is_alive) {
-        manager_logger.trace("{} has been down for too long, not hinting", ep);
+        manager_logger.trace("can_hint_for: {} has been down for too long, not hinting", ep);
        return false;
    }

--- a/db/row_cache.cc
+++ b/db/row_cache.cc
@@ -850,7 +850,7 @@ mutation_reader row_cache::make_nonpopulating_reader(schema_ptr schema, reader_p
                    std::move(permit),
                    e.key(),
                    query::clustering_key_filter_ranges(slice.row_ranges(*schema, e.key().key())),
-                    e.partition().read(_tracker.region(), _tracker.memtable_cleaner(), nullptr, phase_of(pos)),
+                    e.partition().read(_tracker.region(), _tracker.memtable_cleaner(), &_tracker, phase_of(pos)),
                    false,
                    _tracker.region(),
                    _read_section,
--- a/db/system_keyspace.cc
+++ b/db/system_keyspace.cc
@@ -11,9 +11,11 @@
 #include <boost/functional/hash.hpp>
 #include <boost/icl/interval_map.hpp>
 #include <fmt/ranges.h>
+#include <ranges>

 #include <seastar/core/coroutine.hh>
 #include <seastar/coroutine/parallel_for_each.hh>
+#include <seastar/core/loop.hh>
 #include <seastar/core/on_internal_error.hh>
 #include "system_keyspace.hh"
 #include "cql3/untyped_result_set.hh"
@@ -51,6 +53,7 @@
 #include "utils/shared_dict.hh"
 #include "replica/database.hh"
 #include "db/compaction_history_entry.hh"
+#include "mutation/async_utils.hh"

 #include <unordered_map>

@@ -359,6 +362,7 @@ schema_ptr system_keyspace::raft() {

            .set_comment("Persisted RAFT log, votes and snapshot info")
            .with_hash_version()
+            .set_caching_options(caching_options::get_disabled_caching_options())
            .build();
    }();
    return schema;
@@ -1694,6 +1698,12 @@ future<> system_keyspace::peers_table_read_fixup() {
            continue;
        }
        const auto host_id = row.get_as<utils::UUID>("host_id");
+        if (!host_id) {
+            slogger.error("Peer {} has null host_id in system.{}, the record is broken, removing it",
+                peer, system_keyspace::PEERS);
+            co_await remove_endpoint(gms::inet_address{peer});
+            continue;
+        }
        const auto ts = row.get_as<int64_t>("ts");
        const auto it = map.find(host_id);
        if (it == map.end()) {
@@ -1757,8 +1767,15 @@ future<> system_keyspace::drop_truncation_rp_records() {
    auto rs = co_await execute_cql(req);

    bool any = false;
-    co_await coroutine::parallel_for_each(*rs, [&] (const cql3::untyped_result_set_row& row) -> future<> {
+    std::unordered_set<table_id> to_delete;
+    auto db = _qp.db();
+    auto max_concurrency = std::min(1024u, smp::count * 8);
+    co_await seastar::max_concurrent_for_each(*rs, max_concurrency, [&] (const cql3::untyped_result_set_row& row) -> future<> {
        auto table_uuid = table_id(row.get_as<utils::UUID>("table_uuid"));
+        if (!db.try_find_table(table_uuid)) {
+            to_delete.emplace(table_uuid);
+            co_return;
+        }
        auto shard = row.get_as<int32_t>("shard");
        auto segment_id = row.get_as<int64_t>("segment_id");

@@ -1768,11 +1785,26 @@ future<> system_keyspace::drop_truncation_rp_records() {
            co_await execute_cql(req);
        }
    });
+    if (!to_delete.empty()) {
+        // IN has a limit to how many values we can put into it.
+        for (auto&& chunk : to_delete | std::views::transform(&table_id::to_sstring) | std::views::chunk(100)) {
+            auto str = std::ranges::to<std::string>(chunk | std::views::join_with(','));
+            auto req = fmt::format("DELETE FROM system.{} WHERE table_uuid IN ({})", TRUNCATED, str);
+            co_await execute_cql(req);
+        }
+        any = true;
+    }
    if (any) {
        co_await force_blocking_flush(TRUNCATED);
    }
 }

+future<> system_keyspace::remove_truncation_records(table_id id) {
+    auto req = format("DELETE FROM system.{} WHERE table_uuid = {}", TRUNCATED, id);
+    co_await execute_cql(req);
+    co_await force_blocking_flush(TRUNCATED);
+}
+
 future<> system_keyspace::save_truncation_record(const replica::column_family& cf, db_clock::time_point truncated_at, db::replay_position rp) {
    sstring req = format("INSERT INTO system.{} (table_uuid, shard, position, segment_id, truncated_at) VALUES(?,?,?,?,?)", TRUNCATED);
    co_await _qp.execute_internal(req, {cf.schema()->id().uuid(), int32_t(rp.shard_id()), int32_t(rp.pos), int64_t(rp.base_id()), truncated_at}, cql3::query_processor::cache_internal::yes);
@@ -2155,7 +2187,59 @@ future<> system_keyspace::update_peer_info(gms::inet_address ep, locator::host_i

    slogger.debug("{}: values={}", query, values);

-    co_await _qp.execute_internal(query, db::consistency_level::ONE, values, cql3::query_processor::cache_internal::yes);
+    const auto guard = co_await get_units(_peers_cache_lock, 1);
+    try {
+        co_await _qp.execute_internal(query, db::consistency_level::ONE, values, cql3::query_processor::cache_internal::yes);
+        if (auto* cache = get_peers_cache()) {
+            cache->host_id_to_inet_ip[hid] = ep;
+            cache->inet_ip_to_host_id[ep] = hid;
+        }
+    } catch (...) {
+        _peers_cache = nullptr;
+        throw;
+    }
+}
+
+system_keyspace::peers_cache* system_keyspace::get_peers_cache() {
+    auto* cache = _peers_cache.get();
+    if (cache && (lowres_clock::now() > cache->expiration_time)) {
+        _peers_cache = nullptr;
+        return nullptr;
+    }
+    return cache;
+}
+
+future<lw_shared_ptr<const system_keyspace::peers_cache>> system_keyspace::get_or_load_peers_cache() {
+    const auto guard = co_await get_units(_peers_cache_lock, 1);
+    if (auto* cache = get_peers_cache()) {
+        co_return cache->shared_from_this();
+    }
+    auto cache = make_lw_shared<peers_cache>();
+    cache->inet_ip_to_host_id = co_await load_host_ids();
+    cache->host_id_to_inet_ip.reserve(cache->inet_ip_to_host_id.size());
+    for (const auto [ip, id]: cache->inet_ip_to_host_id) {
+        const auto [it, inserted] = cache->host_id_to_inet_ip.insert({id, ip});
+        if (!inserted) {
+            on_internal_error(slogger, ::format("duplicate IP for host_id {}, first IP {}, second IP {}",
+                id, it->second, ip));
+        }
+    }
+    cache->expiration_time = lowres_clock::now() + std::chrono::milliseconds(200);
+    _peers_cache = cache;
+    co_return std::move(cache);
+}
+
+future<std::optional<gms::inet_address>> system_keyspace::get_ip_from_peers_table(locator::host_id id) {
+    const auto cache = co_await get_or_load_peers_cache();
+    if (const auto it = cache->host_id_to_inet_ip.find(id); it != cache->host_id_to_inet_ip.end()) {
+        co_return it->second;
+    }
+    co_return std::nullopt;
+}
+
+future<system_keyspace::host_id_to_ip_map_t> system_keyspace::get_host_id_to_ip_map() {
+    const auto cache = co_await get_or_load_peers_cache();
+    co_return cache->host_id_to_inet_ip;
 }

 template <typename T>
@@ -2205,7 +2289,22 @@ future<> system_keyspace::update_schema_version(table_schema_version version) {
 future<> system_keyspace::remove_endpoint(gms::inet_address ep) {
    const sstring req = format("DELETE FROM system.{} WHERE peer = ?", PEERS);
    slogger.debug("DELETE FROM system.{} WHERE peer = {}", PEERS, ep);
-    co_await execute_cql(req, ep.addr()).discard_result();
+
+    const auto guard = co_await get_units(_peers_cache_lock, 1);
+    try {
+        co_await execute_cql(req, ep.addr()).discard_result();
+        if (auto* cache = get_peers_cache()) {
+            const auto it = cache->inet_ip_to_host_id.find(ep);
+            if (it != cache->inet_ip_to_host_id.end()) {
+                const auto id = it->second;
+                cache->inet_ip_to_host_id.erase(it);
+                cache->host_id_to_inet_ip.erase(id);
+            }
+        }
+    } catch (...) {
+        _peers_cache = nullptr;
+        throw;
+    }
 }

 future<> system_keyspace::update_tokens(const std::unordered_set<dht::token>& tokens) {
@@ -2930,7 +3029,9 @@ future<mutation> system_keyspace::get_group0_history(distributed<replica::databa
    SCYLLA_ASSERT(rs);
    auto& ps = rs->partitions();
    for (auto& p: ps) {
-        auto mut = p.mut().unfreeze(s);
+        // Note: we could decorate the frozen_mutation's key to check if it's the expected one
+        // but since this is a single partition table, we can just check after unfreezing the whole mutation.
+        auto mut = co_await unfreeze_gently(p.mut(), s);
        auto partition_key = value_cast<sstring>(utf8_type->deserialize(mut.key().get_component(*s, 0)));
        if (partition_key == GROUP0_HISTORY_KEY) {
            co_return mut;
@@ -3180,7 +3281,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
            supported_features = decode_features(deserialize_set_column(*topology(), row, "supported_features"));
        }

-        if (row.has("topology_request")) {
+        if (row.has("topology_request") && nstate != service::node_state::left) {
            auto req = service::topology_request_from_string(row.get_as<sstring>("topology_request"));
            ret.requests.emplace(host_id, req);
            switch(req) {
--- a/db/system_keyspace.hh
+++ b/db/system_keyspace.hh
@@ -143,6 +143,17 @@ class system_keyspace : public seastar::peering_sharded_service<system_keyspace>
    //  and this node crashes after adding a new IP but before removing the old one. The
    //  record with older timestamp is removed, the warning is written to the log.
    future<> peers_table_read_fixup();
+
+    struct peers_cache: public enable_lw_shared_from_this<peers_cache> {
+        std::unordered_map<gms::inet_address, locator::host_id> inet_ip_to_host_id;
+        std::unordered_map<locator::host_id, gms::inet_address> host_id_to_inet_ip;
+        lowres_clock::time_point expiration_time;
+    };
+    lw_shared_ptr<peers_cache> _peers_cache;
+    semaphore _peers_cache_lock{1};
+    peers_cache* get_peers_cache();
+    future<lw_shared_ptr<const peers_cache>> get_or_load_peers_cache();
+
 public:
    static schema_ptr size_estimates();
 public:
@@ -308,6 +319,12 @@ public:

    future<> update_peer_info(gms::inet_address ep, locator::host_id hid, const peer_info& info);

+    // Return ip of the peers table entry with given host id
+    future<std::optional<gms::inet_address>> get_ip_from_peers_table(locator::host_id id);
+
+    using host_id_to_ip_map_t = std::unordered_map<locator::host_id, gms::inet_address>;
+    future<host_id_to_ip_map_t> get_host_id_to_ip_map();
+
    future<> remove_endpoint(gms::inet_address ep);

    // Saves the key-value pair into system.scylla_local table.
@@ -418,6 +435,7 @@ public:
    future<> save_truncation_record(const replica::column_family&, db_clock::time_point truncated_at, db::replay_position);
    future<replay_positions> get_truncated_positions(table_id);
    future<> drop_truncation_rp_records();
+    future<> remove_truncation_records(table_id);

    // Converts a `dht::token_range` object to the left-open integer range (x,y] form.
    //
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -2680,9 +2680,16 @@ void view_builder::on_create_view(const sstring& ks_name, const sstring& view_na
            // threshold.
          return initialize_reader_at_current_token(step).then([this, view, &step] () mutable {
            return add_new_view(view, step).then_wrapped([this, view] (future<>&& f) {
-                if (f.failed()) {
-                    vlogger.error("Error setting up view for building {}.{}: {}", view->ks_name(), view->cf_name(), f.get_exception());
+                try {
+                    f.get();
+                } catch (abort_requested_exception&) {
+                    vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
+                } catch (raft::request_aborted&) {
+                    vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
+                } catch (...) {
+                    vlogger.error("Error setting up view for building {}.{}: {}", view->ks_name(), view->cf_name(), std::current_exception());
                }
+
                // Waited on indirectly in stop().
                (void)_build_step.trigger();
            });
@@ -3186,15 +3193,6 @@ public:
                          _step.base->schema()->cf_name(), _step.current_token(), view_names);
        }
        if (_step.reader.is_end_of_stream() && _step.reader.is_buffer_empty()) {
-            if (_step.current_key.key().is_empty()) {
-                // consumer got end-of-stream without consuming a single partition
-                vlogger.debug("Reader didn't produce anything, marking views as built");
-                while (!_step.build_status.empty()) {
-                    _built_views.views.push_back(std::move(_step.build_status.back()));
-                    _step.build_status.pop_back();
-                }
-            }
-
            // before going back to the minimum token, advance current_key to the end
            // and check for built views in that range.
            _step.current_key = { _step.prange.end().value_or(dht::ring_position::max()).value().token(), partition_key::make_empty()};
@@ -3213,6 +3211,7 @@ public:

 // Called in the context of a seastar::thread.
 void view_builder::execute(build_step& step, exponential_backoff_retry r) {
+    inject_failure("dont_start_build_step");
    gc_clock::time_point now = gc_clock::now();
    auto compaction_state = make_lw_shared<compact_for_query_state>(
            *step.reader.schema(),
@@ -3246,6 +3245,7 @@ void view_builder::execute(build_step& step, exponential_backoff_retry r) {
    seastar::when_all_succeed(bookkeeping_ops.begin(), bookkeeping_ops.end()).handle_exception([] (std::exception_ptr ep) {
        vlogger.warn("Failed to update materialized view bookkeeping ({}), continuing anyway.", ep);
    }).get();
+    utils::get_local_injector().inject("delay_finishing_build_step", utils::wait_for_message(60s)).get();
 }

 future<> view_builder::mark_as_built(view_ptr view) {
@@ -3449,6 +3449,7 @@ void delete_ghost_rows_visitor::accept_new_row(const clustering_key& ck, const q
    auto view_exploded_ck = ck.explode();
    std::vector<bytes> base_exploded_pk(_base_schema->partition_key_size());
    std::vector<bytes> base_exploded_ck(_base_schema->clustering_key_size());
+    std::map<const column_definition*, bytes> view_key_cols_not_in_base_key;
    for (const column_definition& view_cdef : _view->all_columns()) {
        const column_definition* base_cdef = _base_schema->get_column_definition(view_cdef.name());
        if (base_cdef) {
@@ -3457,6 +3458,8 @@ void delete_ghost_rows_visitor::accept_new_row(const clustering_key& ck, const q
                base_exploded_pk[base_cdef->id] = view_exploded_key[view_cdef.id];
            } else if (base_cdef->is_clustering_key()) {
                base_exploded_ck[base_cdef->id] = view_exploded_key[view_cdef.id];
+            } else if (!base_cdef->is_computed() && view_cdef.is_primary_key()) {
+                view_key_cols_not_in_base_key[base_cdef] = view_exploded_key[view_cdef.id];
            }
        }
    }
@@ -3464,22 +3467,44 @@ void delete_ghost_rows_visitor::accept_new_row(const clustering_key& ck, const q
    clustering_key base_ck = clustering_key::from_exploded(base_exploded_ck);

    dht::partition_range_vector partition_ranges({dht::partition_range::make_singular(dht::decorate_key(*_base_schema, base_pk))});
-    auto selection = cql3::selection::selection::for_columns(_base_schema, std::vector<const column_definition*>({&_base_schema->partition_key_columns().front()}));
+    auto view_key_cols_not_in_base_key_cdefs = view_key_cols_not_in_base_key | std::views::keys | std::ranges::to<std::vector<const column_definition*>>();
+    auto selection = cql3::selection::selection::for_columns(_base_schema,
+        view_key_cols_not_in_base_key.empty() ? std::vector<const column_definition*>({&_base_schema->partition_key_columns().front()}) : view_key_cols_not_in_base_key_cdefs);

    std::vector<query::clustering_range> bounds{query::clustering_range::make_singular(base_ck)};
-    query::partition_slice partition_slice(std::move(bounds), {},  {}, selection->get_query_options());
+    utils::small_vector<column_id, 8> view_key_col_ids;
+    for (const auto& [col_def, _] : view_key_cols_not_in_base_key) {
+        view_key_col_ids.push_back(col_def->id);
+    }
+    query::partition_slice partition_slice(std::move(bounds), {}, std::move(view_key_col_ids), selection->get_query_options());
    auto command = ::make_lw_shared<query::read_command>(_base_schema->id(), _base_schema->version(), partition_slice,
            _proxy.get_max_result_size(partition_slice), query::tombstone_limit(_proxy.get_tombstone_limit()));
    auto timeout = db::timeout_clock::now() + _timeout_duration;
    service::storage_proxy::coordinator_query_options opts{timeout, _state.get_permit(), _state.get_client_state(), _state.get_trace_state()};
    auto base_qr = _proxy.query(_base_schema, command, std::move(partition_ranges), db::consistency_level::ALL, opts).get();
    query::result& result = *base_qr.query_result;
-    if (result.row_count().value_or(0) == 0) {
+    auto delete_ghost_row = [&]() {
        mutation m(_view, *_view_pk);
        auto& row = m.partition().clustered_row(*_view, ck);
        row.apply(tombstone(api::new_timestamp(), gc_clock::now()));
        timeout = db::timeout_clock::now() + _timeout_duration;
        _proxy.mutate({m}, db::consistency_level::ALL, timeout, _state.get_trace_state(), empty_service_permit(), db::allow_per_partition_rate_limit::no).get();
+    };
+    if (result.row_count().value_or(0) == 0) {
+        delete_ghost_row();
+    } else if (!view_key_cols_not_in_base_key.empty()) {
+        if (result.row_count().value_or(0) != 1) {
+            on_internal_error(vlogger, format("Got multiple base rows corresponding to a single view row when pruning {}.{}", _view->ks_name(), _view->cf_name()));
+        }
+        auto results = query::result_set::from_raw_result(_base_schema, partition_slice, result);
+        auto& base_row = results.row(0);
+        for (const auto& [col_def, col_val] : view_key_cols_not_in_base_key) {
+            const data_value* base_val = base_row.get_data_value(col_def->name_as_text());
+            if (!base_val || base_val->is_null() || col_val != base_val->serialize_nonnull()) {
+                delete_ghost_row();
+                break;
+            }
+        }
    }
 }

--- a/db/view/view_update_generator.cc
+++ b/db/view/view_update_generator.cc
@@ -102,13 +102,13 @@ view_update_generator::view_update_generator(replica::database& db, sharded<serv
        , _early_abort_subscription(as.subscribe([this] () noexcept { do_abort(); }))
 {
    setup_metrics();
-    discover_staging_sstables();
    _db.plug_view_update_generator(*this);
 }

 view_update_generator::~view_update_generator() {}

 future<> view_update_generator::start() {
+    discover_staging_sstables();
    _started = seastar::async([this]() mutable {
        auto drop_sstable_references = defer([&] () noexcept {
            // Clear sstable references so sstables_manager::stop() doesn't hang.
--- a/dist/common/scripts/scylla_io_setup
+++ b/dist/common/scripts/scylla_io_setup
@@ -131,6 +131,28 @@ def configure_iotune_open_fd_limit(shards_count):
        logging.error(f"Required FDs count: {precalculated_fds_count}, default limit: {fd_limits}!")
        sys.exit(1)

+def force_random_request_size_of_4k():
+    """
+    It is a known bug that on i4i, i7i, i8g, i8ge instances, the disk controller reports the wrong
+    physical sector size as 512bytes, but the actual physical sector size is 4096bytes. This function
+    helps us work around that issue until AWS manages to get a fix for it. It returns 4096 if it
+    detect it's running on one of the affected instance types, otherwise it returns None and IOTune
+    will use the physical sector size reported by the disk.
+    """
+    path="/sys/devices/virtual/dmi/id/product_name"
+
+    try:
+        with open(path, "r") as f:
+            instance_type = f.read().strip()
+    except FileNotFoundError:
+        logging.warning(f"Couldn't find {path}. Falling back to IOTune using the physical sector size reported by disk.")
+        return
+
+    prefixes = ["i7i", "i4i", "i8g", "i8ge"]
+    if any(instance_type.startswith(p) for p in prefixes):
+        return 4096
+
+
 def run_iotune():
            if "SCYLLA_CONF" in os.environ:
                conf_dir = os.environ["SCYLLA_CONF"]
@@ -173,6 +195,8 @@ def run_iotune():

            configure_iotune_open_fd_limit(cpudata.nr_shards())

+            if (reqsize := force_random_request_size_of_4k()):
+                iotune_args += ["--random-write-io-buffer-size", f"{reqsize}"]
            try:
                subprocess.check_call([bindir() + "/iotune",
                                       "--format", "envfile",
--- a/dist/common/scripts/scylla_raid_setup
+++ b/dist/common/scripts/scylla_raid_setup
@@ -17,6 +17,7 @@ import stat
 import logging
 import pyudev
 import psutil
+import platform
 from pathlib import Path
 from scylla_util import *
 from subprocess import run, SubprocessError
@@ -102,6 +103,21 @@ def is_selinux_enabled():
                return True
    return False

+def is_kernel_version_at_least(major, minor):
+    """Check if the Linux kernel version is at least major.minor"""
+    try:
+        kernel_version = platform.release()
+        # Extract major.minor from version string like "5.15.0-56-generic"
+        version_parts = kernel_version.split('.')
+        if len(version_parts) >= 2:
+            kernel_major = int(version_parts[0])
+            kernel_minor = int(version_parts[1])
+            return (kernel_major, kernel_minor) >= (major, minor)
+    except (ValueError, IndexError):
+        # If we can't parse the version, assume older kernel for safety
+        pass
+    return False
+
 if __name__ == '__main__':
    if os.getuid() > 0:
        print('Requires root permission.')
@@ -231,8 +247,17 @@ if __name__ == '__main__':
    # see https://git.kernel.org/pub/scm/fs/xfs/xfsprogs-dev.git/tree/mkfs/xfs_mkfs.c .
    # and it also cannot be smaller than the sector size.
    block_size = max(1024, sector_size)
+
    run('udevadm settle', shell=True, check=True)
-    run(f'mkfs.xfs -b size={block_size} {fsdev} -K -m rmapbt=0 -m reflink=0', shell=True, check=True)
+
+    # On Linux 5.12+, sub-block overwrites are supported well, so keep the default block
+    # size, which will play better with the SSD.
+    if is_kernel_version_at_least(5, 12):
+        block_size_opt = ""
+    else:
+        block_size_opt = f"-b size={block_size}"
+
+    run(f'mkfs.xfs {block_size_opt} {fsdev} -K -m rmapbt=0 -m reflink=0', shell=True, check=True)
    run('udevadm settle', shell=True, check=True)

    if is_debian_variant():
--- a/dist/common/scripts/scylla_sysconfig_setup
+++ b/dist/common/scripts/scylla_sysconfig_setup
@@ -86,9 +86,9 @@ if __name__ == '__main__':
    ethpciid = ''
    if network_mode == 'dpdk':
        dpdk_status = out('/opt/scylladb/scripts/dpdk-devbind.py --status')
-        match = re.search('if={} drv=(\S+)'.format(ifname), dpdk_status, flags=re.MULTILINE)
+        match = re.search(r'if={} drv=(\S+)'.format(ifname), dpdk_status, flags=re.MULTILINE)
        ethdrv = match.group(1)
-        match = re.search('^(\\S+:\\S+:\\S+\.\\S+) [^\n]+ if={} '.format(ifname), dpdk_status, flags=re.MULTILINE)
+        match = re.search(r'^(\S+:\S+:\S+\.\S+) [^\n]+ if={} '.format(ifname), dpdk_status, flags=re.MULTILINE)
        ethpciid = match.group(1)

    if args.mode:
--- a/dist/debian/control.template
+++ b/dist/debian/control.template
@@ -18,7 +18,7 @@ Breaks: scylla-enterprise-conf (<< 2025.1.0~)

 Package: %{product}-server
 Architecture: any
-Depends: ${misc:Depends}, %{product}-conf (= ${binary:Version}), %{product}-python3 (= ${binary:Version})
+Depends: ${misc:Depends}, %{product}-conf (= ${binary:Version}), %{product}-python3 (= ${binary:Version}), procps
 Replaces: %{product}-tools (<<5.5), scylla-enterprise-tools (<< 2024.2.0~), scylla-enterprise-server (<< 2025.1.0~)
 Breaks: %{product}-tools (<<5.5), scylla-enterprise-tools (<< 2024.2.0~), scylla-enterprise-server (<< 2025.1.0~)
 Description: Scylla database server binaries
--- a/dist/docker/redhat/build_docker.sh
+++ b/dist/docker/redhat/build_docker.sh
@@ -14,6 +14,15 @@ product="$(<build/SCYLLA-PRODUCT-FILE)"
 version="$(sed 's/-/~/' <build/SCYLLA-VERSION-FILE)"
 release="$(<build/SCYLLA-RELEASE-FILE)"

+original_version="$(<build/SCYLLA-VERSION-FILE)"
+if [[ "$original_version" == *"-dev"* ]]; then
+    repo_file_url="https://downloads.scylladb.com/unstable/scylla/master/rpm/centos/latest/scylla.repo"
+else
+    # Remove the last dot-separated component
+    repo_version="${original_version%.*}"
+    repo_file_url="https://downloads.scylladb.com/rpm/centos/scylla-$repo_version.repo"
+fi
+
 mode="release"

 arch="$(uname -m)"
@@ -88,8 +97,8 @@ bcp LICENSE-ScyllaDB-Source-Available.md /licenses/

 run microdnf clean all
 run microdnf --setopt=tsflags=nodocs -y update
-run microdnf --setopt=tsflags=nodocs -y install hostname python3 python3-pip kmod
-run microdnf clean all
+run microdnf --setopt=tsflags=nodocs -y install hostname kmod procps-ng python3 python3-pip
+run curl -L --output /etc/yum.repos.d/scylla.repo ${repo_file_url}
 run pip3 install --no-cache-dir --prefix /usr supervisor
 run bash -ec "echo LANG=C.UTF-8 > /etc/locale.conf"
 run bash -ec "rpm -ivh packages/*.rpm"
--- a/dist/redhat/scylla.spec
+++ b/dist/redhat/scylla.spec
@@ -76,6 +76,7 @@ Group:          Applications/Databases
 Summary:        The Scylla database server
 Requires:       %{product}-conf = %{version}-%{release}
 Requires:       %{product}-python3 = %{version}-%{release}
+Requires:       procps-ng
 AutoReqProv:    no
 Provides:       %{product}-tools:%{_bindir}/nodetool
 Provides:       %{product}-tools:%{_sysconfigdir}/bash_completion.d/nodetool-completion
--- a/docs/_static/data/os-support.json
+++ b/docs/_static/data/os-support.json
@@ -1,16 +1,25 @@
 {
    "Linux Distributions": {
      "Ubuntu": ["22.04", "24.04"],
-      "Debian": ["11"],
-      "Rocky / CentOS / RHEL": ["8", "9"],
+      "Debian": ["11", "12"],
+      "Rocky / CentOS / RHEL": ["8", "9", "10"],
      "Amazon Linux": ["2023"]
    },
    "ScyllaDB Versions": [
+      {
+        "version": "ScyllaDB 2025.3",
+        "supported_OS": {
+          "Ubuntu": ["22.04", "24.04"],
+          "Debian": ["11", "12"],
+          "Rocky / CentOS / RHEL": ["8", "9", "10"],
+          "Amazon Linux": ["2023"]
+        }
+      },
      {
        "version": "ScyllaDB 2025.2",
        "supported_OS": {
          "Ubuntu": ["22.04", "24.04"],
-          "Debian": ["11"],
+          "Debian": ["11", "12"],
          "Rocky / CentOS / RHEL": ["8", "9"],
          "Amazon Linux": ["2023"]
        }
@@ -19,7 +28,7 @@
        "version": "ScyllaDB 2025.1",
        "supported_OS": {
          "Ubuntu": ["22.04", "24.04"],
-          "Debian": ["11"],
+          "Debian": ["11", "12"],
          "Rocky / CentOS / RHEL": ["8", "9"],
          "Amazon Linux": ["2023"]
        }
--- a/docs/_utils/redirects.yaml
+++ b/docs/_utils/redirects.yaml
@@ -1,6 +1,18 @@
 ### a dictionary of redirections
 #old path: new path

+# Move the diver information to another project
+
+/stable/using-scylla/drivers/index.html: https://docs.scylladb.com/stable/drivers/index.html
+/stable/using-scylla/drivers/dynamo-drivers/index.html: https://docs.scylladb.com/stable/drivers/dynamo-drivers.html
+/stable/using-scylla/drivers/cql-drivers/index.html: https://docs.scylladb.com/stable/drivers/cql-drivers.html
+/stable/using-scylla/drivers/cql-drivers/scylla-python-driver.html: https://docs.scylladb.com/stable/drivers/cql-drivers.html
+/stable/using-scylla/drivers/cql-drivers/scylla-java-driver.html: https://docs.scylladb.com/stable/drivers/cql-drivers.html
+/stable/using-scylla/drivers/cql-drivers/scylla-go-driver.html: https://docs.scylladb.com/stable/drivers/cql-drivers.html
+/stable/using-scylla/drivers/cql-drivers/scylla-gocqlx-driver.html: https://docs.scylladb.com/stable/drivers/cql-drivers.html
+/stable/using-scylla/drivers/cql-drivers/scylla-cpp-driver.html: https://docs.scylladb.com/stable/drivers/cql-drivers.html
+/stable/using-scylla/drivers/cql-drivers/scylla-rust-driver.html: https://docs.scylladb.com/stable/drivers/cql-drivers.html
+
 # Redirect 2025.1 upgrade guides that are not on master but were indexed by Google (404 reported)

 /master/upgrade/upgrade-guides/upgrade-guide-from-2024.x-to-2025.1/upgrade-guide-from-2024.x-to-2025.1.html: https://docs.scylladb.com/manual/stable/upgrade/index.html
@@ -172,4 +184,7 @@
 /stable/upgrade/upgrade-opensource/upgrade-guide-from-4.5-to-4.6/metric-update-4.5-to-4.6.html: /stable/upgrade/index.html

 # Divide API reference to smaller files
-# /stable/reference/api-reference.html: /stable/reference/api/index.html
+# /stable/reference/api-reference.html: /stable/reference/api/index.html
+
+# Fixed typo in the file name
+/stable/operating-scylla/nodetool-commands/enbleautocompaction.html: /stable/operating-scylla/nodetool-commands/enableautocompaction.html
--- a/docs/architecture/tablets.rst
+++ b/docs/architecture/tablets.rst
@@ -42,6 +42,10 @@ the administrator. The tablet load balancer decides where to migrate
 the tablets, either within the same node to balance the shards or across 
 the nodes to balance the global load in the cluster.

+The number of tablets the load balancer maintains on a node is directly
+proportional to the node's storage capacity. A node with twice
+the storage will have twice the number of tablets located on it.
+
 As a table grows, each tablet can split into two, creating a new tablet.
 The load balancer can migrate the split halves independently to different nodes
 or shards.
@@ -83,6 +87,53 @@ especially for data models that contain small cells.
 File-based streaming is used for tablet migration in all 
 :ref:`keyspaces created with tablets enabled <tablets>`.

+.. _absolute-number-of-tablets:
+
+Absolute number of tablets
+==========================
+
+ScyllaDB has a background process that periodically re-evaluates the number of tablets of each table.
+The computed number of tablets a table will have is based on several parameters and factors. These are:
+
+* Keyspace tablets option ``'initial'``. This option sets the initial number of tablets on the keyspace level.
+  See :ref:`The tablets property <tablets>` for details.
+* Table-level option ``'expected_data_size_in_gb'``. This option sets the minimal number of tablets for a table
+  based on the expected table size and the target tablet size. See
+  :ref:`Per-table tablet options <cql-per-table-tablet-options>` for details.
+* Table-level option ``'min_per_shard_tablet_count'``. Using this option results in the number of tablets being
+  computed based on the number of shards in a DC so that each shard has at least ``'min_per_shard_tablet_count'``
+  tablets on average. See :ref:`Per-table tablet options <cql-per-table-tablet-options>` for details.
+* Table-level option ``'min_tablet_count'``. This option sets the minimal number of tablets for the given table.
+  See :ref:`Per-table tablet options <cql-per-table-tablet-options>` for details.
+* Config option ``'tablets_initial_scale_factor'``. This option sets the minimal number of tablets per shard
+  per table globally. This option can be overridden by the table-level option: ``'min_per_shard_tablet_count'``.
+  ``'tablets_initial_scale_factor'`` is ignored if either the keyspace option ``'initial'`` or table-level
+  option ``'min_tablet_count'`` is set.
+
+Another factor that determines the absolute tablet count is the amount of data the table contains. If the
+amount of data in the table is such that the average tablet size is larger than double the target tablet size,
+the table will be split (the number of tablets will be doubled), and if the average tablet size is smaller than
+half the target tablet size, it will be merged (the number of tablets will be halved).
+
+Each of these factors is taken into consideration, and the one producing the largest number of tablets wins, and
+will be used as the number of tablets for the given table.
+
+As the last step, in order to avoid having too many tablets per shard, which could potentially lead to overload
+and performance degradation, ScyllaDB will run the following algorithm to respect the ``tablets_per_shard_goal``
+config option:
+
+* Compute average tablet count per-shard in each DC.
+* Determine if per-shard goal is exceeded in that DC.
+* Compute scale factor by which tablet count should be multiplied so that the goal is not exceeded in that DC.
+* Take the smallest scale factor among all DCs, which ensures that no DC is overloaded.
+* Each table's tablet count is aligned to the nearest power of 2 post-scaling.
+
+Please note that because of this alignment, the scaling may not be effective and in the worst case may be
+overshot by a factor of 2, and that the ``tablets_per_shard_goal`` is a soft limit and not a hard constraint.
+
+Finally, the computed tablet count is compared with the current tablet count for each table, and if there is
+a difference, a table resize (split or merge) is executed.
+
 .. _tablets-enable-tablets: 

 Enabling Tablets
--- a/docs/cql/ddl.rst
+++ b/docs/cql/ddl.rst
@@ -755,7 +755,8 @@ when replicas are slow or unresponsive.  The following are legal values (case-in
 ``XPERCENTILE``           90.5PERCENTILE   Coordinators record average per-table response times for all replicas.
                                            If a replica takes longer than ``X`` percent of this table's average
                                            response time, the coordinator queries an additional replica.
-                                            ``X`` must be between 0 and 100.
+                                            ``X`` must be between 0 and 100, including those values.
+                                            The value is rounded to the nearest 0.1 (1 decimal place).
 ``XP``                    90.5P            Synonym for ``XPERCENTILE``
 ``Yms``                   25ms             If a replica takes more than ``Y`` milliseconds to respond,
                                            the coordinator queries an additional replica.
--- a/docs/cql/types.rst
+++ b/docs/cql/types.rst
@@ -481,7 +481,8 @@ Creating a new user-defined type is done using a ``CREATE TYPE`` statement defin
   field_definition: `identifier` `cql_type`

 A UDT has a name (``udt_name``), which is used to declare columns of that type and is a set of named and typed fields. The ``udt_name`` can be any
-type, including collections or other UDTs. UDTs and collections inside collections must always be frozen (no matter which version of ScyllaDB you are using). 
+type, including collections or other UDTs.
+Similar to collections, a UDT can be frozen or non-frozen. A frozen UDT is immutable and can only be updated as a whole. Nested UDTs or UDTs used in keys must always be frozen.

 For example::

@@ -506,26 +507,15 @@ For example::

  CREATE TABLE superheroes (
       name frozen<full_name> PRIMARY KEY,
-       home frozen<address>
+       home address
  );

 .. note::

   - Attempting to create an already existing type will result in an error unless the ``IF NOT EXISTS`` option is used. If it is used, the statement will be a no-op if the type already exists.
   - A type is intrinsically bound to the keyspace in which it is created and can only be used in that keyspace. At creation, if the type name is prefixed by a keyspace name, it is created in that keyspace. Otherwise, it is created in the current keyspace.
-   - As of ScyllaDB Open Source 3.2, UDTs not inside collections do not have to be frozen, but in all versions prior to ScyllaDB Open Source 3.2, and in all ScyllaDB Enterprise versions, UDTs **must** be frozen. 


-A non-frozen UDT example with ScyllaDB Open Source 3.2 and higher::
-
-   CREATE TYPE ut (a int, b int);
-   CREATE TABLE cf (a int primary key, b ut);
-
-Same UDT in versions prior::
-
-   CREATE TYPE ut (a int, b int);
-   CREATE TABLE cf (a int primary key, b frozen<ut>);
-
 UDT literals
 ~~~~~~~~~~~~

--- a/docs/dev/service_levels.md
+++ b/docs/dev/service_levels.md
@@ -30,7 +30,7 @@ SELECT * FROM  system.role_attributes WHERE role='r' and attribute_name='service
 ### Service Level Configuration Table

 ```
-    CREATE TABLE system_distributed.service_levels (
+    CREATE TABLE system.service_levels_v2 (
    service_level text PRIMARY KEY,
    timeout duration,
    workload_type text,
@@ -45,14 +45,19 @@ The table column names meanings are:
 *shares* - a number that represents this service level priority in relation to other service levels.

 ```
-select * from system_distributed.service_levels ;
+select * from system.service_levels_v2 ;

- service_level | timeout | workload_type
---------------+---------+---------------
-            sl |   500ms |   interactive
+ service_level | shares | timeout | workload_type
+---------------+--------+---------+---------------
+            sl |    100 |   500ms |   interactive

 ```

+* Please note that `system.service_levels_v2` is used as service levels configuration table only with consistent topology and service levels on Raft,
+which is enabled by default but not mandatory yet.
+Otherwise, the old `system_distributed.service_levels` is used as configuration table.
+For more information see [service levels on Raft](#service-levels-on-raft) section. *
+
 ### Service Level Timeout

 Service level timeout can be used to assign a default timeout value for all operations for a particular service level.
@@ -189,3 +194,32 @@ The command displays a table with: option name, effective service level the valu
        workload_type |                     sl2 |       batch
              timeout |                     sl1 |          2s
 ```
+
+## Service levels on Raft
+
+Since ScyllaDB 6.0 and ScyllaDB Enterprise 2024.2, service levels metadata is managed by Raft group0 and stored in `system.service_levels_v2`.
+
+In existing clusters, service levels metadata is automatically copied from `system_distributed.service_levels` to `system.service_levels_v2`
+during the topology upgrade (see [upgrade to raft topology section](topology-over-raft.md#upgrade-from-legacy-topology-to-raft-based-topology)). Because of this, no service levels operations should be done during the topology upgrade.
+
+Before the service levels on Raft, service levels cache was updated in 10s intervals by polling `system_distributed.service_levels` table.
+Once the service levels are migrated to raft, the cache is updated on every group0 state apply fiber, if there are any mutations to
+`system.service_levels_v2`, `system.role_attributes` or ` system.role_members`.
+
+With service levels on Raft, the cache also has an additional layer, called `service level effective cache`. This layer combines service levels
+and auth information and stores effective service level for each role (see [effective service level section](#effective-service-level)).
+
+## Implementation
+### Integration with auth
+
+Service levels ultimately depend on the state of `auth`. Since `auth::service` is initialized long after
+`service_level_controller`, we register it separately once it's started, and unregister it right before
+it's stopped. For that, we wrap it in a struct called `auth_integration` that manages access to it.
+That ensures that `service_level_controller` will not try to reference it beyond its lifetime.
+
+It's important to note that there may still be attempts to fetch an effective service level for a role
+or indirectly access `auth::service` in some other way when `auth_integration` is absent. One important
+situation to have in mind is when the user connects to Scylla via the maintenance socket. It's possible
+early on, way before Scylla is fully initialized. Since we don't have access to `auth` yet, we need to
+ensure that the semantics of the operations performed on `service_level_controller` still make sense
+in that context.
--- a/docs/dev/testing.md
+++ b/docs/dev/testing.md
@@ -10,7 +10,7 @@ This is a manual for `test.py`.

 ## Installation

-To run `test.py`, Python 3.7 or higher is required.
+To run `test.py`, Python 3.11 or higher is required.
 `./install-dependencies.sh` should install all the required Python
 modules. If `install-dependencies.sh` does not support your distribution,
 please manually install all Python modules it lists with `pip`.
@@ -106,6 +106,19 @@ shed more light on this.
 Build artefacts, such as test output and harness output is stored
 in `./testlog`. Scylla data files are stored in `/tmp`.

+There are several test directories that are excluded from orchestration by `test.py`:
+
+- test/boost
+- test/raft
+- test/ldap
+- test/unit
+
+This means that `test.py` will not run tests directly, but will delegate all work to `pytest`.
+That's why all these directories do not have `suite.yaml` files.
+Additionally, these directories do not follow abstract naming suite/testname
+convention, and instead use the `pytest` naming convention, i.e. to run a test you need to provide the path to the file
+and optionally the test name, e.g. `test/boost/aggregate_fcts_test.cc::test_aggregate_avg`.
+
 ## How it works

 On start, `test.py` invokes `ninja` to find out configured build modes. Then
@@ -176,11 +189,11 @@ Scylla (possibly started in debugger) using `cqlsh`.

 The same unit test can be run in different seastar configurations, i.e. with
 different command line arguments. The custom arguments can be set in
-`custom_args` key of the `suite.yaml` file.
+`custom_args` key of the `test_config.yaml` file.

 Tests from boost suite are divided into test-cases. These are top-level
 functions wrapped by `BOOST_AUTO_TEST_CASE`, `SEASTAR_TEST_CASE` or alike.
-Boost tests support `suitename/testname::casename` selection described above.
+Boost tests support `path/to/file_name.cc::casename` selection described above.

 ### Debugging unit tests

@@ -328,6 +341,19 @@ as it was at the beginning of the test, is considered "dirty".
 Such clusters are not returned to the pool, but destroyed, and
 the pool is replenished with a new cluster instead.

+## Test metrics
+
+The parameter `--gather-metrics` is used to gather CPU/RAM usage during tests from the cgroup and system overall CPU/RAM
+usage.
+For that, SQLite database is used to store the metrics in `testlog/sqlite.db`.
+The database is created in the `testlog` directory and contains the following tables:
+
+- `tests` - contains the list of tests that were executed with information about the test name, directory, architecture,
+  and mode
+- `test_metrics` - contains the metrics for each test, such as memory peak usage, CPU usage, and duration
+- `system_resource_metrics` - contains system CPU and memory utilization in percents during the whole run
+- `cgroup_memory_metrics` - contains cgroup memory usage during the test run
+
 ## Automation, CI, and Jenkins

 If any of the tests fails, `test.py` returns a non-zero exit status.
--- a/docs/dev/topology-over-raft.md
+++ b/docs/dev/topology-over-raft.md
@@ -804,6 +804,9 @@ From the admin's point of view, the steps are as follows:
  or via observing the logs
 - After all nodes report `done` via the GET endpoint, the upgrade has fully finished

+Note that during the upgrade no service levels or auth operations should be done,
+as those services are performing migrations to raft metadata.
+
 The `upgrade_state` static column in `system.topology` serves the key role
 in coordinating the upgrade. It goes through the following states in the following
 order:
--- a/docs/features/backup-and-restore.rst
+++ b/docs/features/backup-and-restore.rst
@@ -0,0 +1,53 @@
+===========================
+Backup And Restore Overview
+===========================
+
+Backup and restore are critical components of data management, ensuring that your data is safe and can be recovered in case of loss or corruption. This document provides an overview of the backup and restore process, including best practices, tools, procedures, metrics and more
+
+
+Process Overview
+----------------
+**The Backup process** is managed by ScyllaDB Manager as a whole.
+The overview given here is for a single node, ScyllaDB Manager is responsible to orchestrate backups across the cluster.
+For backup, a snapshot is created, and then the data is copied to a remote location - normally an S3 bucket, Google Cloud Storage, or a similar service.
+
+**The Restore process** is also managed by ScyllaDB manager and it involves copying the data back from the remote location to an empty ScyllaDB node.
+Restoring to a live cluster is not yet supported.
+
+Backup Process
+--------------
+
+#. **Snapshot Creation**: A snapshot of the data is created on the ScyllaDB node.
+   This is a point-in-time copy of the data.
+#. **Upload Data**: The snapshot data is transferred to a remote storage location,
+   such as an S3 bucket or Google Cloud Storage. You can upload data in two ways:
+
+   * **rclone** - the tool responsible for the upload is the scylla manager agent
+     that runs on the node.
+
+          - It runs side by side with scylla and therefore may interfere
+            with ScyllaDB performance.
+          - It supports many cloud storage providers.
+
+   * **Native upload** - ScyllaDB itself is responsible for the upload.
+
+          - It takes into consideration ScyllaDB performance and does
+            not interfere with it.
+          - It supports only S3 compatible storage providers.
+
+      See the `ScyllaDB Manager backup documentation <https://manager.docs.scylladb.com/stable/backup/index.html>`_
+      for more details on how to configure the upload method.
+
+#. **Native Configuration**: 
+
+   * For `native` backup to work without interference to users' workload, it is
+     best to limit io-scheduling. See :ref:`stream_io_throughput_mb_per_sec <confprop_stream_io_throughput_mb_per_sec>` for details.     
+   * For `native` backup to work, ScyllaDB node must have access to the S3 bucket.
+     See :ref:`Configuring Object Storage <object-storage-configuration>` for details.
+
+Restore Process
+---------------
+The restore process is managed completely by ScyllaDB Manager.
+No special configuration is needed.
+Restore may be executed by rclone or natively, not depending on the backup method used.
+See `ScyllaDB Manager restore documentation <https://manager.docs.scylladb.com/stable/restore/index.html>`_ for more details on how to restore data.
--- a/docs/features/index.rst
+++ b/docs/features/index.rst
@@ -15,6 +15,7 @@ This document highlights ScyllaDB's key data modeling features.
   Change Data Capture </features/cdc/index>
   Workload Attributes </features/workload-attributes>
   Workload Prioritization </features/workload-prioritization>
+   Backup and Restore </features/backup-and-restore>

 .. panel-box::
  :title: ScyllaDB Features
@@ -36,3 +37,5 @@ This document highlights ScyllaDB's key data modeling features.
    state and the history of all changes made to tables in the database.
  * :doc:`Workload Attributes </features/workload-attributes>` assigned to your workloads
    specify how ScyllaDB will handle requests depending on the workload.
+  * :doc:`Backup and Restore </features/backup-and-restore>` allows you to create
+    backups of your data and restore it when needed.
--- a/docs/features/materialized-views.rst
+++ b/docs/features/materialized-views.rst
@@ -86,7 +86,7 @@ Compaction Strategies with Materialized Views
 Materialized views, just like regular tables, use one of the available :doc:`compaction strategies </architecture/compaction/compaction-strategies>`.
 When a materialized view is created, it does not inherit its base table compaction strategy settings, because the data model
 of a view does not necessarily have the same characteristics as the one from its base table.
-Instead, the default compaction strategy (SizeTieredCompactionStrategy) is used.
+Instead, the default compaction strategy (IncrementalCompactionStrategy) is used.

 A compaction strategy for a new materialized view can be explicitly set during its creation, using the following command:

--- a/docs/getting-started/_common/setup-after-install.rst
+++ b/docs/getting-started/_common/setup-after-install.rst
@@ -45,10 +45,3 @@ Run cqlsh:
     
     cqlsh

-Run cassandra-stress:
-
-.. code-block:: console
-     
-     cassandra-stress write -mode cql3 native 
-
-
--- a/docs/getting-started/cloud-instance-recommendations.rst
+++ b/docs/getting-started/cloud-instance-recommendations.rst
@@ -20,8 +20,7 @@ You can run your ScyllaDB workloads on AWS, GCE, and Azure using a ScyllaDB imag
 Amazon Web Services (AWS)
 -----------------------------

-* The recommended instance types are :ref:`i3en <system-requirements-i3en-instances>`, and :ref:`i4i <system-requirements-i4i-instances>`.
-* We recommend using enhanced networking that exposes the physical network cards to the VM.
+The recommended instance types are :ref:`i3en <system-requirements-i3en-instances>`, :ref:`i4i <system-requirements-i4i-instances>`, :ref:`i7i <system-requirements-i7i-instances>`, and :ref:`i7ie <system-requirements-i7ie-instances>`.

 .. note::

@@ -99,6 +98,102 @@ See  `Amazon EC2 I4i Instances <https://aws.amazon.com/ec2/instance-types/i4i/>`
 See `ScyllaDB on the New AWS EC2 I4i Instances: Twice the Throughput & Lower Latency <https://www.scylladb.com/2022/05/09/scylladb-on-the-new-aws-ec2-i4i-instances-twice-the-throughput-lower-latency/>`_ to 
 learn more about using ScyllaDB with i4i instances.

+.. _system-requirements-i7i-instances:
+
+i7i instances
+^^^^^^^^^^^^^^
+
+The following i7i instances are supported. Larger i7i instances will be
+supported in upcoming releases.
+
+.. list-table::
+   :widths: 30 20 20 30
+   :header-rows: 1
+
+   * - Model
+     - vCPU
+     - Mem (GiB)
+     - Storage (GB)
+   * - i7i.large
+     - 2
+     - 16
+     - 1 x 468 GB
+   * - i7i.xlarge
+     - 4
+     - 32
+     - 1 x 937.5 GB
+   * - i7i.2xlarge
+     - 8
+     - 64
+     - 1 x 1875 GB
+
+All i7i instances have the following specs:
+
+* 3.2 GHz all-core turbo Intel® Xeon® Scalable (Emerald Rapids) processors.
+* Up to 100 Gbps of networking bandwidth and 60 Gbps bandwidth to Amazon Elastic Block Store (EBS).
+* Advanced Matrix Extensions (AMX) for accelerating CPU-based machine learning.
+
+See `Amazon EC2 I7i Instances <https://aws.amazon.com/ec2/instance-types/i7i/>`_ for details.
+
+.. _system-requirements-i7ie-instances:
+
+i7ie instances
+^^^^^^^^^^^^^^
+
+The following i7ie instances are supported.
+
+.. list-table::
+   :widths: 30 20 20 30
+   :header-rows: 1
+
+   * - Model
+     - vCPU
+     - Mem (GiB)
+     - Storage (GB)
+   * - i7ie.large
+     - 2
+     - 16
+     - 1 x 1,250 GB
+   * - i7ie.xlarge
+     - 4
+     - 32
+     - 1 x 2,500 GB
+   * - i7ie.2xlarge
+     - 8
+     - 64
+     - 2 x 2,500 GB
+   * - i7ie.3xlarge
+     - 12
+     - 96
+     - 1 x 7,500 GB
+   * - i7ie.6xlarge
+     - 24
+     - 192
+     - 2 x 7,500 GB
+   * - i7ie.12xlarge
+     - 48
+     - 384
+     - 4 x 7,500 GB
+   * - i7ie.18xlarge
+     - 72
+     - 576
+     - 6 x 7,500 GB
+   * - i7ie.24xlarge
+     - 96
+     - 768
+     - 8 x 7,500 GB
+   * - i7ie.48xlarge
+     - 192
+     - 1,536
+     - 16 x 7,500 GB
+
+All i7i instances have the following specs:
+
+* 3.2 GHz all-core turbo Intel® Xeon® Scalable (Emerald Rapids) processors.
+* Up to 100 Gbps of networking bandwidth and 60 Gbps bandwidth to Amazon Elastic Block Store (EBS).
+* Advanced Matrix Extensions (AMX) for accelerating CPU-based machine learning.
+
+See `Amazon EC2 I7i Instances <https://aws.amazon.com/ec2/instance-types/i7i/>`_ for details.

 Im4gn and Is4gen instances
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -113,8 +208,8 @@ Pick a zone where Haswell CPUs are found. Local SSD performance offers, accordin
 Image with NVMe disk interface is recommended.
 (`More info <https://cloud.google.com/compute/docs/disks/local-ssd>`_)

-Recommended instances types are `z3-highmem-highlssd <https://cloud.google.com/compute/docs/storage-optimized-machines#z3_machine_types>`_,
-`n1-highmem <https://cloud.google.com/compute/docs/general-purpose-machines#n1_machines>`_, and `n2-highmem <https://cloud.google.com/compute/docs/general-purpose-machines#n2_machines>`_
+Recommended instances types are `z3-highmem-highlssd and z3-highmem-standardlssd <https://cloud.google.com/compute/docs/storage-optimized-machines#z3_machine_types>`_,
+`n1-highmem <https://cloud.google.com/compute/docs/general-purpose-machines#n1_machines>`_, and `n2-highmem <https://cloud.google.com/compute/docs/general-purpose-machines#n2_machines>`_.


 .. list-table::
@@ -145,6 +240,39 @@ Recommended instances types are `z3-highmem-highlssd <https://cloud.google.com/c
     - 44
     - 352
     - 18,000
+   * - z3-highmem-88-highlssd
+     - 88
+     - 704
+     - 36,000	  
+
+.. list-table::
+   :widths: 30 20 20 30
+   :header-rows: 1
+
+   * - Model
+     - vCPU
+     - Mem (GB)
+     - Storage (GB)
+   * - z3-highmem-14-standardlssd
+     - 14
+     - 112
+     - 3,000
+   * - z3-highmem-22-standardlssd
+     - 22
+     - 176
+     - 6,000
+   * - z3-highmem-44-standardlssd 
+     - 44
+     - 352
+     - 9,000
+   * - z3-highmem-88-standardlssd
+     - 88
+     - 704
+     - 18,000
+   * - z3-highmem-176-standardlssd
+     - 176
+     - 1,406
+     - 36,000 

 .. list-table::
   :widths: 30 20 20 30
--- a/docs/getting-started/index.rst
+++ b/docs/getting-started/index.rst
@@ -25,8 +25,7 @@ Getting Started
  :id: "getting-started"
  :class: my-panel

-  * `Install ScyllaDB (Binary Packages, Docker, or EC2) <https://www.scylladb.com/download/#core>`_ - Links to the ScyllaDB Download Center
-  
+  * :doc:`Install ScyllaDB </getting-started/install-scylla/index/>`
  * :doc:`Configure ScyllaDB </getting-started/system-configuration/>`
  * :doc:`Run ScyllaDB in a Shared Environment </getting-started/scylla-in-a-shared-environment>`
  * :doc:`Create a ScyllaDB Cluster - Single Data Center (DC) </operating-scylla/procedures/cluster-management/create-cluster/>`
@@ -37,7 +36,7 @@ Getting Started
  :id: "getting-started"
  :class: my-panel

-  * :doc:`ScyllaDB Drivers</using-scylla/drivers/index>`
+  * `ScyllaDB Drivers <https://docs.scylladb.com/stable/drivers/index.html>`_
  * `Get Started Lesson on ScyllaDB University <https://university.scylladb.com/courses/scylla-essentials-overview/lessons/quick-wins-install-and-run-scylla/>`_    
  * :doc:`CQL Reference </cql/index>`
  * :doc:`cqlsh - the CQL shell </cql/cqlsh/>`
@@ -60,4 +59,5 @@ Getting Started
  
  * `Build an IoT App with sensor simulator and a REST API <https://iot.scylladb.com/stable/>`_ - ScyllaDB Tutorial
  * `Implement CRUD operations with a TODO App <https://github.com/scylladb/scylla-cloud-getting-started/>`_ - ScyllaDB Cloud Tutorial
-  * `Build a machine learning (ML) feature store with ScyllaDB <https://feature-store.scylladb.com/stable/>`_ - ScyllaDB Cloud Tutorial  ` <>`_
+  * `Build a machine learning (ML) feature store with ScyllaDB <https://feature-store.scylladb.com/stable/>`_ - ScyllaDB Cloud Tutorial
+  
--- a/docs/getting-started/install-scylla/index.rst
+++ b/docs/getting-started/install-scylla/index.rst
@@ -10,7 +10,6 @@ Install ScyllaDB
   /getting-started/install-scylla/launch-on-azure
   /getting-started/installation-common/scylla-web-installer
   /getting-started/install-scylla/install-on-linux
-   /getting-started/installation-common/install-jmx
   /getting-started/install-scylla/run-in-docker
   /getting-started/installation-common/unified-installer
   /getting-started/installation-common/air-gapped-install
@@ -36,7 +35,6 @@ Keep your versions up-to-date. The two latest versions are supported. Also, alwa

  * :doc:`Install ScyllaDB with Web Installer (recommended) </getting-started/installation-common/scylla-web-installer>`
  * :doc:`Install ScyllaDB Linux Packages </getting-started/install-scylla/install-on-linux>`
-  * :doc:`Install scylla-jmx Package </getting-started/installation-common/install-jmx>`
  * :doc:`Install ScyllaDB Without root Privileges </getting-started/installation-common/unified-installer>`
  * :doc:`Air-gapped Server Installation </getting-started/installation-common/air-gapped-install>`
  * :doc:`ScyllaDB Developer Mode </getting-started/installation-common/dev-mod>`
--- a/docs/getting-started/install-scylla/install-on-linux.rst
+++ b/docs/getting-started/install-scylla/install-on-linux.rst
@@ -90,16 +90,6 @@ Install ScyllaDB
    
               apt-get install scylla{,-server,-tools,-tools-core,-kernel-conf,-node-exporter,-conf,-python3}=5.2.3-0.20230608.ea08d409f155-1

-
-        #. (Ubuntu only) Set Java 11.
-
-            .. code-block:: console
-    
-               sudo apt-get update
-               sudo apt-get install -y openjdk-11-jre-headless
-               sudo update-java-alternatives --jre-headless -s java-1.11.0-openjdk-amd64
-
-
   .. group-tab:: Centos/RHEL

        #. Install the EPEL repository.
@@ -153,14 +143,6 @@ Install ScyllaDB
    
               sudo yum install scylla-5.2.3

-(Optional) Install scylla-jmx
-------------------------------
-
-    scylla-jmx is an optional package and is not installed by default.
-    If you need JMX server, see :doc:`Install scylla-jmx Package </getting-started/installation-common/install-jmx>`.
-
-
-
 .. include:: /getting-started/_common/setup-after-install.rst

 Next Steps
--- a/docs/getting-started/install-scylla/launch-on-gcp.rst
+++ b/docs/getting-started/install-scylla/launch-on-gcp.rst
@@ -30,7 +30,7 @@ Launching ScyllaDB on GCP

   .. code-block:: console
      
-        gcloud compute instances create <name of new instance> --image <ScyllaDB image name> --image-project < ScyllaDB project name> --local-ssd interface=nvme --zone <GCP zone - optional> --machine-type=<machine type>
+        gcloud compute instances create <name of new instance> --image <ScyllaDB image name> --image-project < ScyllaDB project name> --local-ssd interface=nvme --zone=<GCP zone - optional> --machine-type=<machine type>
   
   For example:

--- a/docs/getting-started/installation-common/disable-housekeeping.rst
+++ b/docs/getting-started/installation-common/disable-housekeeping.rst
@@ -3,8 +3,7 @@
 ScyllaDB Housekeeping and how to disable it
 ============================================

-It is always recommended to run the latest version of ScyllaDB. 
-The latest stable release version is always available from the `Download Center <https://www.scylladb.com/download/>`_.
+It is always recommended to run the latest stable version of ScyllaDB. 

 When you install ScyllaDB, it installs by default two services: **scylla-housekeeping-restart** and **scylla-housekeeping-daily**. These services check for the latest ScyllaDB version and prompt the user if they are using a version that is older than what is publicly available.
 Information about your ScyllaDB deployment, including the ScyllaDB version currently used, as well as unique user and server identifiers, are collected by a centralized service.
--- a/docs/getting-started/installation-common/install-jmx.rst
+++ b/docs/getting-started/installation-common/install-jmx.rst
@@ -1,78 +0,0 @@
-
-======================================
-Install scylla-jmx Package
-======================================
-
-scylla-jmx is an optional package and is not installed by default.
-If you need JMX server, you can still install it from scylla-jmx GitHub page.
-
-.. tabs::
-
-   .. group-tab:: Debian/Ubuntu
-        #. Download .deb package from scylla-jmx page.
-
-            Access to https://github.com/scylladb/scylla-jmx, select latest
-            release from "releases", download a file end with ".deb".
-
-        #. (Optional) Transfer the downloaded package to the install node.
-
-            If the pc from which you downloaded the package is different from
-            the node where you install scylladb, you will need to transfer
-            the files to the node.
-
-        #. Install scylla-jmx package.
-
-            .. code-block:: console
-    
-               sudo apt install -y ./scylla-jmx_<version>_all.deb
-
-
-   .. group-tab:: Centos/RHEL
-
-        #. Download .rpm package from scylla-jmx page.
-
-            Access to https://github.com/scylladb/scylla-jmx, select latest
-            release from "releases", download a file end with ".rpm".
-
-        #. (Optional) Transfer the downloaded package to the install node.
-
-            If the pc from which you downloaded the package is different from
-            the node where you install scylladb, you will need to transfer
-            the files to the node.
-
-        #. Install scylla-jmx package.
-
-            .. code-block:: console
-    
-               sudo yum install -y ./scylla-jmx-<version>.noarch.rpm
-
-
-   .. group-tab:: Install without root privileges
-
-        #. Download .tar.gz package from scylla-jmx page.
-
-            Access to https://github.com/scylladb/scylla-jmx, select latest
-            release from "releases", download a file end with ".tar.gz".
-
-        #. (Optional) Transfer the downloaded package to the install node.
-
-            If the pc from which you downloaded the package is different from
-            the node where you install scylladb, you will need to transfer
-            the files to the node.
-
-        #. Install scylla-jmx package.
-
-            .. code:: console
-    
-                tar xpf scylla-jmx-<version>.noarch.tar.gz
-                cd scylla-jmx
-                ./install.sh --nonroot
-
-Next Steps
-----------
-
-* :doc:`Configure ScyllaDB </getting-started/system-configuration>`
-* Manage your clusters with `ScyllaDB Manager <https://manager.docs.scylladb.com/>`_
-* Monitor your cluster and data with `ScyllaDB Monitoring <https://monitoring.docs.scylladb.com/>`_
-* Get familiar with ScyllaDB’s :doc:`command line reference guide </operating-scylla/nodetool>`.
-* Learn about ScyllaDB at `ScyllaDB University <https://university.scylladb.com/>`_
--- a/docs/getting-started/installation-common/scylla-web-installer.rst
+++ b/docs/getting-started/installation-common/scylla-web-installer.rst
@@ -14,6 +14,7 @@ See :doc:`OS Support by Platform and Version </getting-started/os-support/>`.

 Install ScyllaDB with Web Installer
 ---------------------------------------
+
 To install ScyllaDB with Web Installer, run:

 .. code:: console
@@ -27,7 +28,13 @@ You can run the command with the ``-h`` or ``--help`` flag to print information
 Installing a Non-default Version
 ---------------------------------------

-You can install a version other than the default.
+You can install a version other than the default. To get the list of supported
+release versions, run:
+
+.. code:: console
+  
+  curl -sSf get.scylladb.com/server | sudo bash -s -- --list-active-releases
+

 Versions 2025.1 and Later
 ==============================
--- a/docs/getting-started/installation-common/unified-installer.rst
+++ b/docs/getting-started/installation-common/unified-installer.rst
@@ -14,44 +14,35 @@ Prerequisites
 Ensure your platform is supported by the ScyllaDB version you want to install. 
 See :doc:`OS Support </getting-started/os-support>` for information about supported Linux distributions and versions.

-Note that if you're on CentOS 7, only root offline installation is supported.
-
 Download and Install
 -----------------------

 #. Download the latest tar.gz file for ScyllaDB version (x86 or ARM) from ``https://downloads.scylladb.com/downloads/scylla/relocatable/scylladb-<version>/``.

-   Example for version 6.1: https://downloads.scylladb.com/downloads/scylla/relocatable/scylladb-6.1/
+   **Example** for version 2025.1:
+   
+   - Go to https://downloads.scylladb.com/downloads/scylla/relocatable/scylladb-2025.1/
+   - Download the ``scylla-unified`` file for the patch version you want to
+     install. For example, to install 2025.1.9 (x86), download
+     ``scylla-unified-2025.1.9-0.20251010.6c539463bbda.x86_64.tar.gz``.

 #. Uncompress the downloaded package.

-   The following example shows the package for ScyllaDB 6.1.1 (x86):
+   **Example** for version 2025.1.9 (x86) (downloaded in the previous step):

-   .. code:: console
+   .. code::

-    tar xvfz scylla-unified-6.1.1-0.20240814.8d90b817660a.x86_64.tar.gz
+    tar xvfz scylla-unified-2025.1.9-0.20251010.6c539463bbda.x86_64.tar.gz

-#. Install OpenJDK 8 or 11.
-
-   The following example shows Java installation on a CentOS-like system:
-
-   .. code:: console
-    
-    sudo yum install -y java-11-openjdk-headless
-
-   For root offline installation on Debian-like systems, two additional packages, ``xfsprogs`` 
-   and ``mdadm``, should be installed to be used in RAID setup.
+#. (Root offline installation only) For root offline installation on Debian-like
+   systems, two additional packages, ``xfsprogs`` and ``mdadm``, should be
+   installed to be used in RAID setup.

 #. Install ScyllaDB as a user with non-root privileges:

   .. code:: console

-    ./install.sh --nonroot --python3 ~/scylladb/python3/bin/python3
-
-#. (Optional) Install scylla-jmx
-
-    scylla-jmx is an optional package and is not installed by default.
-    If you need JMX server, see :doc:`Install scylla-jmx Package </getting-started/installation-common/install-jmx>`.
+    ./install.sh --nonroot

 Configure and Run ScyllaDB
 ----------------------------
@@ -81,19 +72,14 @@ Run nodetool:

 .. code:: console

-    ~/scylladb/share/cassandra/bin/nodetool status
+    ~/scylladb/bin/nodetool nodetool status

 Run cqlsh:

 .. code:: console

-    ~/scylladb/share/cassandra/bin/cqlsh 
+    ~/scylladb/bin/cqlsh 

-Run cassandra-stress:
-
-.. code:: console
-
-    ~/scylladb/share/cassandra/bin/cassandra-stress write

 .. note::

@@ -124,7 +110,7 @@ Nonroot install

    ./install.sh --upgrade --nonroot

-.. note:: The installation script does not upgrade scylla-jmx and scylla-tools. You will have to upgrade them separately. 
+.. note:: The installation script does not upgrade scylla-tools. You will have to upgrade them separately. 

 Uninstall
 ===========
@@ -154,4 +140,4 @@ Next Steps
 * Manage your clusters with `ScyllaDB Manager <https://manager.docs.scylladb.com/>`_
 * Monitor your cluster and data with `ScyllaDB Monitoring <https://monitoring.docs.scylladb.com/>`_
 * Get familiar with ScyllaDB’s :doc:`command line reference guide </operating-scylla/nodetool>`.
-* Learn about ScyllaDB at `ScyllaDB University <https://university.scylladb.com/>`_
+* Learn about ScyllaDB at `ScyllaDB University <https://university.scylladb.com/>`_
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -35,7 +35,7 @@ Documentation Highlights
 * :doc:`Cluster Management Procedures </operating-scylla/procedures/cluster-management/index>`
 * :doc:`Upgrade ScyllaDB </upgrade/index>`
 * :doc:`CQL Reference </cql/index>`
-* :doc:`ScyllaDB Drivers </using-scylla/drivers/index>`
+* `ScyllaDB Drivers <https://docs.scylladb.com/stable/drivers/index.html>`_
 * :doc:`Features </features/index>`

 ScyllaDB Support
--- a/docs/kb/consistency.rst
+++ b/docs/kb/consistency.rst
@@ -83,7 +83,7 @@ Additional References

 * `Jepsen and ScyllaDB: Putting Consistency to the Test blog post <https://www.scylladb.com/2020/12/23/jepsen-and-scylla-putting-consistency-to-the-test/>`_ 
 * `Nauto: Achieving Consistency in an Eventually Consistent Environment blog post <https://www.scylladb.com/2020/02/20/nauto-achieving-consistency-in-an-eventually-consistent-environment/>`_ 
-* `Consistency Levels documentation <https://docs.scylladb.com/stable/cql/consistency.html>`_ 
+* :doc:`Consistency Levels documentation </cql/consistency/>`
 * `High Availability lesson on ScyllaDB University <https://university.scylladb.com/courses/scylla-essentials-overview/lessons/high-availability/>`_ 
 * `Lightweight Transactions lesson on ScyllaDB University <https://university.scylladb.com/courses/data-modeling/lessons/lightweight-transactions/>`_ 
 * `Getting the Most out of Lightweight Transactions in ScyllaDB blog post <https://www.scylladb.com/2020/07/15/getting-the-most-out-of-lightweight-transactions-in-scylla/>`_ 
--- a/docs/kb/tombstones-flush.rst
+++ b/docs/kb/tombstones-flush.rst
@@ -38,7 +38,7 @@ Steps:

 4. Run compaction (this will remove big partitions with tombstones from specified table)

-.. note:: By default, major compaction runs on all the keyspaces and tables, so if we want to specyfy e.g. only one table, we should point at it using arguments: ``<keyspace>.<mytable>``. For more information, please refer to `this article <https://docs.scylladb.com/operating-scylla/nodetool-commands/compact/>`_.
+.. note:: By default, major compaction runs on all the keyspaces and tables, so if we want to specyfy e.g. only one table, we should point at it using arguments: ``<keyspace>.<mytable>``. For more information, please see :doc:`Nodetool compact </operating-scylla/nodetool-commands/compact/>`.

 .. code-block:: sh
   
--- a/docs/operating-scylla/admin.rst
+++ b/docs/operating-scylla/admin.rst
@@ -219,7 +219,7 @@ For example:
 * `ScyllaDB Java Driver <https://github.com/scylladb/java-driver/tree/3.7.1-scylla/manual/compression>`_
 * `Go Driver <https://godoc.org/github.com/gocql/gocql#Compressor>`_

-Refer to the :doc:`Drivers Page </using-scylla/drivers/index>` for more drivers.
+Refer to `ScyllaDB Drivers <https://docs.scylladb.com/stable/drivers/index.html>`_ for more drivers.

 .. _internode-compression:

--- a/docs/operating-scylla/diagnostics.rst
+++ b/docs/operating-scylla/diagnostics.rst
@@ -11,7 +11,7 @@ Logs

 The most obvious source of information to find out more about why ScyllaDB is misbehaving.
 On production systems, ScyllaDB logs to syslog; thus logs can usually be viewed via ``journalctl``.
-See `Logging </getting-started/logging/>`_ on more information on how to access the logs.
+See :doc:`Logging </getting-started/logging/>` on more information on how to access the logs.


 ScyllaDB has the following log levels: ``trace``, ``debug``, ``info``, ``warn``, ``error``.
@@ -64,21 +64,21 @@ Tracing
 Tracing allows you to retrieve the internal log of events happening in the context of a single query.
 Therefore, tracing is only useful to diagnose problems related to a certain query and cannot be used to diagnose generic problems.
 That said, when it comes to diagnosing problems with a certain query, tracing is an excellent tool, allowing you to have a peek at what happens when that query is processed, including the timestamp of each event.
-For more details, see `Tracing </using-scylla/tracing>`_.
+For more details, see :doc:`Tracing </using-scylla/tracing>`.

 Nodetool
 --------

 Although ``nodetool`` is primarily an administration tool, it has various commands that retrieve and display useful information about the state of a certain ScyllaDB node.
 Look for commands with "stats", "info", "describe", "get", "histogram" in their names.
-For a comprehensive list of all available nodetool commands, see the `Nodetool Reference </operating-scylla/nodetool>`_.
+For a comprehensive list of all available nodetool commands, see the :doc:`Nodetool Reference </operating-scylla/nodetool>`.

 REST API
 --------

 ScyllaDB has a REST API which is a superset of all ``nodetool`` commands, in the sense that it is the backend serving all of them.
 It has many more endpoints, many of which can supply valuable information about the internal state of ScyllaDB.
-For more information, see `REST API </operating-scylla/rest>`_.
+For more information, see :doc:`REST API </operating-scylla/rest>`.

 System Tables
 -------------
@@ -102,9 +102,9 @@ Other Tools
 ScyllaDB has various other tools, mainly to work with sstables.
 If you are diagnosing a problem that is related to sstables misbehaving or being corrupt, you may find these useful:

-* `sstabledump </operating-scylla/admin-tools/sstabledump/>`_
-* `ScyllaDB SStable </operating-scylla/admin-tools/scylla-sstable/>`_
-* `ScyllaDB Types </operating-scylla/admin-tools/scylla-types/>`_
+* :doc:`sstabledump </operating-scylla/admin-tools/sstabledump/>`
+* :doc:`ScyllaDB SStable </operating-scylla/admin-tools/scylla-sstable/>`
+* :doc:`ScyllaDB Types </operating-scylla/admin-tools/scylla-types/>`

 GDB
 ---
--- a/docs/operating-scylla/nodetool-commands/cleanup.rst
+++ b/docs/operating-scylla/nodetool-commands/cleanup.rst
@@ -20,6 +20,8 @@ To clean up the data of a specific node and specific keyspace, use this command:

   nodetool -h <host name> cleanup <keyspace>

+To clean up entire cluster see :doc:`nodetool cluster cleanup </operating-scylla/nodetool-commands/cluster/cleanup/>`
+
 .. warning::

   Make sure there are no topology changes before running cleanup. To validate, run ``nodetool status``, all nodes should be in status Up Normal (``UN``).
--- a/docs/operating-scylla/nodetool-commands/cluster/cleanup.rst
+++ b/docs/operating-scylla/nodetool-commands/cluster/cleanup.rst
@@ -0,0 +1,15 @@
+Nodetool cluster cleanup
+========================
+
+**cluster cleanup** - A process that runs in the background and removes data no longer owned by nodes. Used for non tablet (vnode-based) tables only.
+
+Running ``cluster cleanup`` on a **single node** cleans up all non tablet tables on all nodes in the cluster (tablet enabled tables are cleaned up automatically).
+
+
+  For example:
+
+  ::
+
+     nodetool cluster cleanup
+
+See also `ScyllaDB Manager <https://manager.docs.scylladb.com/>`_.
--- a/docs/operating-scylla/nodetool-commands/cluster/index.rst
+++ b/docs/operating-scylla/nodetool-commands/cluster/index.rst
@@ -5,6 +5,7 @@ Nodetool cluster
   :hidden:

   repair <repair>
+   cleanup <cleanup>

 **cluster** - Nodetool supercommand for running cluster operations.

@@ -12,3 +13,4 @@ Supported cluster suboperations
 -------------------------------

 * :doc:`repair </operating-scylla/nodetool-commands/cluster/repair>`  :code:`<keyspace>` :code:`<table>` - Repair one or more tablet tables.
+* :doc:`cleanup </operating-scylla/nodetool-commands/cluster/cleanup>`  - Clean up all non tablet (vnode-based) keyspaces in a cluster
--- a/docs/operating-scylla/nodetool-commands/enableautocompaction.rst
+++ b/docs/operating-scylla/nodetool-commands/enableautocompaction.rst
--- a/Show More
+++ b/Show More