Update pgo profiles - x86_64

Update pgo profiles - aarch64
2026-02-01 04:10:42 +02:00 · 2026-01-01 04:44:25 +02:00 · 2026-01-01 04:06:12 +02:00 · 2025-12-30 11:24:02 +01:00 · 2025-12-28 13:34:01 +02:00 · 2025-12-23 17:08:54 +02:00
482 changed files with 13473 additions and 4219 deletions
--- a/.github/scripts/auto-backport.py
+++ b/.github/scripts/auto-backport.py
@@ -52,7 +52,7 @@ def create_pull_request(repo, new_branch_name, base_branch_name, pr, backport_pr
        if is_draft:
            backport_pr.add_to_labels("conflicts")
            pr_comment = f"@{pr.user.login} - This PR was marked as draft because it has conflicts\n"
-            pr_comment += "Please resolve them and mark this PR as ready for review"
+            pr_comment += "Please resolve them and remove the 'conflicts' label. The PR will be made ready for review automatically."
            backport_pr.create_issue_comment(pr_comment)
        logging.info(f"Assigned PR to original author: {pr.user}")
        return backport_pr
@@ -112,29 +112,45 @@ def backport(repo, pr, version, commits, backport_base_branch, is_collaborator):
                    is_draft = True
                    repo_local.git.add(A=True)
                    repo_local.git.cherry_pick('--continue')
-            repo_local.git.push(fork_repo, new_branch_name, force=True)
-            create_pull_request(repo, new_branch_name, backport_base_branch, pr, backport_pr_title, commits,
-                                is_draft, is_collaborator)
-
+            # Check if the branch already exists in the remote fork
+            remote_refs = repo_local.git.ls_remote('--heads', fork_repo, new_branch_name)
+            if not remote_refs:
+                # Branch does not exist, create it with a regular push
+                repo_local.git.push(fork_repo, new_branch_name)
+                create_pull_request(repo, new_branch_name, backport_base_branch, pr, backport_pr_title, commits,
+                                    is_draft, is_collaborator)
+            else:
+                logging.info(f"Remote branch {new_branch_name} already exists in fork. Skipping push.")
        except GitCommandError as e:
            logging.warning(f"GitCommandError: {e}")


 def with_github_keyword_prefix(repo, pr):
-    pattern = rf"(?:fix(?:|es|ed))\s*:?\s*(?:(?:(?:{repo.full_name})?#)|https://github\.com/{repo.full_name}/issues/)(\d+)"
-    match = re.findall(pattern, pr.body, re.IGNORECASE)
-    if not match:
-        for commit in pr.get_commits():
-            match = re.findall(pattern, commit.commit.message, re.IGNORECASE)
-            if match:
-                print(f'{pr.number} has a valid close reference in commit message {commit.sha}')
-                break
-    if not match:
-        print(f'No valid close reference for {pr.number}')
-        return False
-    else:
+    """Return True if the PR body or one of its commit messages has a fix/fixes/fixed reference."""
+    # GitHub issue pattern: #123, scylladb/scylladb#123, or full GitHub URLs
+    github_pattern = rf"(?:fix(?:|es|ed))\s*:?\s*(?:(?:(?:{repo.full_name})?#)|https://github\.com/{repo.full_name}/issues/)(\d+)"
+    # JIRA issue pattern: PKG-92 or https://scylladb.atlassian.net/browse/PKG-92
+    jira_pattern = r"(?:fix(?:|es|ed))\s*:?\s*(?:(?:https://scylladb\.atlassian\.net/browse/)?([A-Z]+-\d+))"
+
+    # pr.body is None when the PR has no description; search "" instead so we
+    # fall through to the commit messages rather than raising TypeError.
+    body = pr.body or ""
+    github_match = re.findall(github_pattern, body, re.IGNORECASE)
+    jira_match = re.findall(jira_pattern, body, re.IGNORECASE)
+    match = github_match or jira_match
+
+    if match:
        return True

+    for commit in pr.get_commits():
+        github_match = re.findall(github_pattern, commit.commit.message, re.IGNORECASE)
+        jira_match = re.findall(jira_pattern, commit.commit.message, re.IGNORECASE)
+        if github_match or jira_match:
+            print(f'{pr.number} has a valid close reference in commit message {commit.sha}')
+            return True
+
+    print(f'No valid close reference for {pr.number}')
+    return False

 def main():
    args = parse_args()
--- a/.github/workflows/backport-pr-fixes-validation.yaml
+++ b/.github/workflows/backport-pr-fixes-validation.yaml
@@ -18,7 +18,7 @@ jobs:
            
            // Regular expression pattern to check for "Fixes" prefix
            // Adjusted to dynamically insert the repository full name
-            const pattern = `Fixes:? (?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)`;
+            const pattern = `Fixes:? ((?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)|([A-Z]+-\\d+))`;
            const regex = new RegExp(pattern);
            
            if (!regex.test(body)) {
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/2
+++ b/2
@@ -78,7 +78,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=2025.2.0-dev
+VERSION=2025.2.6

 if test -f version
 then
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -141,6 +141,50 @@ json::json_return_type make_streamed(rjson::value&& value) {
    return func;
 }

+// make_streamed_with_extra_array() is a variant of make_streamed() above, which
+// builds a response from a JSON object (rjson::value) but adds to it at the
+// end an additional array. The extra array is given a separate chunked_vector
+// to avoid putting it inside the rjson::value - because RapidJSON does
+// contiguous allocations for arrays which we want to avoid for potentially
+// long arrays in Query/Scan responses (see #23535).
+// If we ever fix RapidJSON to avoid contiguous allocations for arrays, or
+// replace it entirely (#24458), we can remove this function and the function
+// rjson::print_with_extra_array() which it calls.
+json::json_return_type make_streamed_with_extra_array(rjson::value&& value,
+    std::string array_name, utils::chunked_vector<rjson::value>&& array) {
+    // CMH. json::json_return_type uses std::function, not noncopyable_function.
+    // Need to make a copyable version of value. Gah.
+    auto rs = make_shared<rjson::value>(std::move(value));
+    auto ns = make_shared<std::string>(std::move(array_name));
+    auto as = make_shared<utils::chunked_vector<rjson::value>>(std::move(array));
+    std::function<future<>(output_stream<char>&&)> func = [rs, ns, as](output_stream<char>&& os) mutable -> future<> {
+        // move objects to coroutine frame.
+        auto los = std::move(os);
+        auto lrs = std::move(rs);
+        auto lns = std::move(ns);
+        auto las = std::move(as);
+        std::exception_ptr ex;
+        try {
+            co_await rjson::print_with_extra_array(*lrs, *lns, *las, los);
+        } catch (...) {
+            // at this point, we cannot really do anything. HTTP headers and return code are
+            // already written, and quite potentially a portion of the content data.
+            // just log + rethrow. It is probably better the HTTP server closes connection
+            // abruptly or something...
+            ex = std::current_exception();
+            elogger.error("Exception during streaming HTTP response: {}", ex);
+        }
+        co_await los.close();
+        co_await rjson::destroy_gently(std::move(*lrs));
+        // TODO: can/should we also destroy the array (*las) gently?
+        if (ex) {
+            co_await coroutine::return_exception_ptr(std::move(ex));
+        }
+        co_return;
+    };
+    return func;
+}
+
 json_string::json_string(std::string&& value)
    : _value(std::move(value))
 {}
@@ -1006,6 +1050,17 @@ void rmw_operation::set_default_write_isolation(std::string_view value) {
    default_write_isolation = parse_write_isolation(value);
 }

+// Alternator uses tags whose keys start with the "system:" prefix for
+// internal purposes. Those should not be readable by ListTagsOfResource,
+// nor writable with TagResource or UntagResource (see #24098).
+// Only a few specific system tags, currently only system:write_isolation,
+// are deliberately intended to be set and read by the user, so are not
+// considered "internal".
+static bool tag_key_is_internal(std::string_view tag_key) {
+    return tag_key.starts_with("system:") &&
+        tag_key != rmw_operation::WRITE_ISOLATION_TAG_KEY;
+}
+
 enum class update_tags_action { add_tags, delete_tags };
 static void update_tags_map(const rjson::value& tags, std::map<sstring, sstring>& tags_map, update_tags_action action) {
    if (action == update_tags_action::add_tags) {
@@ -1030,6 +1085,9 @@ static void update_tags_map(const rjson::value& tags, std::map<sstring, sstring>
            if (!validate_legal_tag_chars(tag_key)) {
                throw api_error::validation("A tag Key can only contain letters, spaces, and [+-=._:/]");
            }
+            if (tag_key_is_internal(tag_key)) {
+                throw api_error::validation(fmt::format("Tag key '{}' is reserved for internal use", tag_key));
+            }
            // Note tag values are limited similarly to tag keys, but have a
            // longer length limit, and *can* be empty.
            if (tag_value.size() > 256) {
@@ -1042,7 +1100,11 @@ static void update_tags_map(const rjson::value& tags, std::map<sstring, sstring>
        }
    } else if (action == update_tags_action::delete_tags) {
        for (auto it = tags.Begin(); it != tags.End(); ++it) {
-            tags_map.erase(sstring(it->GetString(), it->GetStringLength()));
+            auto tag_key = rjson::to_string_view(*it);
+            if (tag_key_is_internal(tag_key)) {
+                throw api_error::validation(fmt::format("Tag key '{}' is reserved for internal use", tag_key));
+            }
+            tags_map.erase(sstring(tag_key));
        }
    }

@@ -1117,6 +1179,9 @@ future<executor::request_return_type> executor::list_tags_of_resource(client_sta

    rjson::value& tags = ret["Tags"];
    for (auto& tag_entry : tags_map) {
+        if (tag_key_is_internal(tag_entry.first)) {
+            continue;
+        }
        rjson::value new_entry = rjson::empty_object();
        rjson::add(new_entry, "Key", rjson::from_string(tag_entry.first));
        rjson::add(new_entry, "Value", rjson::from_string(tag_entry.second));
@@ -3882,9 +3947,6 @@ update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::t
            }
        }
    }
-    if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
-        _return_attributes = std::move(*previous_item);
-    }
    if (_attribute_updates) {
        for (auto it = _attribute_updates->MemberBegin(); it != _attribute_updates->MemberEnd(); ++it) {
            // Note that it.key() is the name of the column, *it is the operation
@@ -3994,6 +4056,9 @@ update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::t
        // don't report the new item in the returned Attributes.
        _return_attributes = rjson::null_value();
    }
+    if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
+        _return_attributes = std::move(*previous_item);
+    }
    // ReturnValues=UPDATED_OLD/NEW never return an empty Attributes field,
    // even if a new item was created. Instead it should be missing entirely.
    if (_returnvalues == returnvalues::UPDATED_OLD || _returnvalues == returnvalues::UPDATED_NEW) {
@@ -4509,7 +4574,11 @@ class describe_items_visitor {
    const filter& _filter;
    typename columns_t::const_iterator _column_it;
    rjson::value _item;
-    rjson::value _items;
+    // _items is a chunked_vector<rjson::value> instead of a RapidJSON array
+    // (rjson::value) because unfortunately RapidJSON arrays are stored
+    // contiguously in memory, and cause large allocations when a Query/Scan
+    // returns a long list of short items (issue #23535).
+    utils::chunked_vector<rjson::value> _items;
    size_t _scanned_count;

 public:
@@ -4519,7 +4588,6 @@ public:
            , _filter(filter)
            , _column_it(columns.begin())
            , _item(rjson::empty_object())
-            , _items(rjson::empty_array())
            , _scanned_count(0)
    {
        // _filter.check() may need additional attributes not listed in
@@ -4598,13 +4666,13 @@ public:
                rjson::remove_member(_item, attr);
            }

-            rjson::push_back(_items, std::move(_item));
+            _items.push_back(std::move(_item));
        }
        _item = rjson::empty_object();
        ++_scanned_count;
    }

-    rjson::value get_items() && {
+    utils::chunked_vector<rjson::value> get_items() && {
        return std::move(_items);
    }

@@ -4613,13 +4681,25 @@ public:
    }
 };

-static future<std::tuple<rjson::value, size_t>> describe_items(const cql3::selection::selection& selection, std::unique_ptr<cql3::result_set> result_set, std::optional<attrs_to_get>&& attrs_to_get, filter&& filter) {
+// describe_items() returns a JSON object that includes members "Count"
+// and "ScannedCount", but *not* "Items" - that is returned separately
+// as a chunked_vector to avoid the large contiguous allocations that
+// RapidJSON makes for its arrays. The caller should add "Items" to the
+// returned JSON object if needed, or print it separately.
+// The returned chunked_vector (the items) is std::optional<>, because
+// the user may have requested only to count items, and not return any
+// items - which is different from returning an empty list of items.
+static future<std::tuple<rjson::value, std::optional<utils::chunked_vector<rjson::value>>, size_t>> describe_items(
+        const cql3::selection::selection& selection,
+        std::unique_ptr<cql3::result_set> result_set,
+        std::optional<attrs_to_get>&& attrs_to_get,
+        filter&& filter) {
    describe_items_visitor visitor(selection.get_columns(), attrs_to_get, filter);
    co_await result_set->visit_gently(visitor);
    auto scanned_count = visitor.get_scanned_count();
-    rjson::value items = std::move(visitor).get_items();
+    utils::chunked_vector<rjson::value> items = std::move(visitor).get_items();
    rjson::value items_descr = rjson::empty_object();
-    auto size = items.Size();
+    auto size = items.size();
    rjson::add(items_descr, "Count", rjson::value(size));
    rjson::add(items_descr, "ScannedCount", rjson::value(scanned_count));
    // If attrs_to_get && attrs_to_get->empty(), this means the user asked not
@@ -4629,10 +4709,11 @@ static future<std::tuple<rjson::value, size_t>> describe_items(const cql3::selec
    // In that case, we currently build a list of empty items and here drop
    // it. We could just count the items and not bother with the empty items.
    // (However, remember that when we do have a filter, we need the items).
+    std::optional<utils::chunked_vector<rjson::value>> opt_items;
    if (!attrs_to_get || !attrs_to_get->empty()) {
-        rjson::add(items_descr, "Items", std::move(items));
+        opt_items = std::move(items);
    }
-    co_return std::tuple<rjson::value, size_t>{std::move(items_descr), size};
+    co_return std::tuple(std::move(items_descr), std::move(opt_items), size);
 }

 static rjson::value encode_paging_state(const schema& schema, const service::pager::paging_state& paging_state) {
@@ -4670,6 +4751,12 @@ static rjson::value encode_paging_state(const schema& schema, const service::pag
    return last_evaluated_key;
 }

+// RapidJSON allocates arrays contiguously in memory, so we want to avoid
+// returning a large number of items as a single rapidjson array, and use
+// a chunked_vector instead. The following constant is an arbitrary cutoff
+// point for when to switch from a rapidjson array to a chunked_vector.
+static constexpr int max_items_for_rapidjson_array = 256;
+
 static future<executor::request_return_type> do_query(service::storage_proxy& proxy,
        schema_ptr table_schema,
        const rjson::value* exclusive_start_key,
@@ -4742,19 +4829,35 @@ static future<executor::request_return_type> do_query(service::storage_proxy& pr
    }
    auto paging_state = rs->get_metadata().paging_state();
    bool has_filter = filter;
-    auto [items, size] = co_await describe_items(*selection, std::move(rs), std::move(attrs_to_get), std::move(filter));
+    auto [items_descr, opt_items, size] = co_await describe_items(*selection, std::move(rs), std::move(attrs_to_get), std::move(filter));
    if (paging_state) {
-        rjson::add(items, "LastEvaluatedKey", encode_paging_state(*table_schema, *paging_state));
+        rjson::add(items_descr, "LastEvaluatedKey", encode_paging_state(*table_schema, *paging_state));
    }
    if (has_filter){
        cql_stats.filtered_rows_read_total += p->stats().rows_read_total;
        // update our "filtered_row_matched_total" for all the rows matched, despited the filter
        cql_stats.filtered_rows_matched_total += size;
    }
-    if (is_big(items)) {
-        co_return executor::request_return_type(make_streamed(std::move(items)));
+    if (opt_items) {
+        if (opt_items->size() >= max_items_for_rapidjson_array) {
+            // There are many items, better print the JSON and the array of
+            // items (opt_items) separately to avoid RapidJSON's contiguous
+            // allocation of arrays.
+            co_return executor::request_return_type(make_streamed_with_extra_array(std::move(items_descr), "Items", std::move(*opt_items)));
+        }
+        // There aren't many items in the chunked vector opt_items,
+        // let's just insert them into the JSON object and print the
+        // full JSON normally.
+        rjson::value items_json = rjson::empty_array();
+        for (auto& item : *opt_items) {
+            rjson::push_back(items_json, std::move(item));
+        }
+        rjson::add(items_descr, "Items", std::move(items_json));
    }
-    co_return executor::request_return_type(make_jsonable(std::move(items)));
+    if (is_big(items_descr)) {
+        co_return executor::request_return_type(make_streamed(std::move(items_descr)));
+    }
+    co_return executor::request_return_type(make_jsonable(std::move(items_descr)));
 }

 static dht::token token_for_segment(int segment, int total_segments) {
--- a/alternator/expressions.g
+++ b/alternator/expressions.g
@@ -91,6 +91,18 @@ options {
        throw expressions_syntax_error(format("{} at char {}", err,
            ex->get_charPositionInLine()));
    }
+
+    // ANTLR3 tries to recover from missing tokens - it tries to finish parsing
+    // and create valid objects, as if the missing token were there.
+    // But it has a bug and leaks these tokens.
+    // We override the offending method and take ownership of the abandoned pointers.
+    std::vector<std::unique_ptr<TokenType>> _missing_tokens;
+    TokenType* getMissingSymbol(IntStreamType* istream, ExceptionBaseType* e,
+                                ANTLR_UINT32 expectedTokenType, BitsetListType* follow) {
+        auto token = BaseType::getMissingSymbol(istream, e, expectedTokenType, follow);
+        _missing_tokens.emplace_back(token);
+        return token;
+    }
 }
@lexer::context {
    void displayRecognitionError(ANTLR_UINT8** token_names, ExceptionBaseType* ex) {
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -984,7 +984,7 @@
         ]
      },
      {
-         "path":"/storage_service/cleanup_all",
+         "path":"/storage_service/cleanup_all/",
         "operations":[
            {
               "method":"POST",
@@ -994,6 +994,30 @@
               "produces":[
                  "application/json"
               ],
+               "parameters":[
+                    {
+                     "name":"global",
+                     "description":"true if cleanup of entire cluster is requested",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"boolean",
+                     "paramType":"query"
+                  }
+               ]
+            }
+         ]
+      },
+      {
+         "path":"/storage_service/mark_node_as_clean",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Mark the node as clean. After that the node will not be considered as needing cleanup during automatic cleanup which is triggered by some topology operations",
+               "type":"void",
+               "nickname":"reset_cleanup_needed",
+               "produces":[
+                  "application/json"
+               ],
               "parameters":[]
            }
         ]
@@ -2144,6 +2168,31 @@
                     "allowMultiple":false,
                     "type":"string",
                     "paramType":"query"
+                  },
+                  {
+                     "name":"skip_cleanup",
+                     "description":"Don't cleanup keys from loaded sstables. Invalid if load_and_stream is true",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  },
+                  {
+                     "name":"skip_reshape",
+                     "description":"Don't reshape the loaded sstables. Invalid if load_and_stream is true",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  },
+                  {
+                     "name":"scope",
+                     "description":"Defines the set of nodes to which mutations can be streamed",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query",
+                     "enum": ["all", "dc", "rack", "node"]
                  }
               ]
            }
@@ -3136,6 +3185,22 @@
               ]
            }
         ]
+      },
+      {
+         "path":"/storage_service/raft_topology/cmd_rpc_status",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"Get information about currently running topology cmd rpc",
+               "type":"string",
+               "nickname":"raft_topology_get_cmd_status",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+               ]
+            }
+         ]
      }
   ],
   "models":{
@@ -3272,11 +3337,11 @@
         "properties":{
            "start_token":{
               "type":"string",
-               "description":"The range start token"
+               "description":"The range start token (exclusive)"
            },
            "end_token":{
               "type":"string",
-               "description":"The range start token"
+               "description":"The range end token (inclusive)"
            },
            "endpoints":{
               "type":"array",
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -20,6 +20,7 @@
 #include "utils/hash.hh"
 #include <optional>
 #include <sstream>
+#include <stdexcept>
 #include <time.h>
 #include <algorithm>
 #include <functional>
@@ -453,17 +454,26 @@ void set_sstables_loader(http_context& ctx, routes& r, sharded<sstables_loader>&
        auto cf = req->get_query_param("cf");
        auto stream = req->get_query_param("load_and_stream");
        auto primary_replica = req->get_query_param("primary_replica_only");
+        auto skip_cleanup_p = req->get_query_param("skip_cleanup");
        boost::algorithm::to_lower(stream);
        boost::algorithm::to_lower(primary_replica);
        bool load_and_stream = stream == "true" || stream == "1";
        bool primary_replica_only = primary_replica == "true" || primary_replica == "1";
+        bool skip_cleanup = skip_cleanup_p == "true" || skip_cleanup_p == "1";
+        auto scope = parse_stream_scope(req->get_query_param("scope"));
+        auto skip_reshape_p = req->get_query_param("skip_reshape");
+        auto skip_reshape = skip_reshape_p == "true" || skip_reshape_p == "1";
+
+        if (scope != sstables_loader::stream_scope::all && !load_and_stream) {
+            throw httpd::bad_param_exception("scope takes no effect without load-and-stream");
+        }
        // No need to add the keyspace, since all we want is to avoid always sending this to the same
        // CPU. Even then I am being overzealous here. This is not something that happens all the time.
        auto coordinator = std::hash<sstring>()(cf) % smp::count;
        return sst_loader.invoke_on(coordinator,
                [ks = std::move(ks), cf = std::move(cf),
-                load_and_stream, primary_replica_only] (sstables_loader& loader) {
-            return loader.load_new_sstables(ks, cf, load_and_stream, primary_replica_only, sstables_loader::stream_scope::all);
+                load_and_stream, primary_replica_only, skip_cleanup, skip_reshape, scope] (sstables_loader& loader) {
+            return loader.load_new_sstables(ks, cf, load_and_stream, primary_replica_only, skip_cleanup, skip_reshape, scope);
        }).then_wrapped([] (auto&& f) {
            if (f.failed()) {
                auto msg = fmt::format("Failed to load new sstables: {}", f.get_exception());
@@ -738,13 +748,7 @@ rest_force_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
            fmopt = flush_mode::skip;
        }
        auto task = co_await compaction_module.make_and_start_task<global_major_compaction_task_impl>({}, db, fmopt, consider_only_existing_data);
-        try {
-            co_await task->done();
-        } catch (...) {
-            apilog.error("force_compaction failed: {}", std::current_exception());
-            throw;
-        }
-
+        co_await task->done();
        co_return json_void();
 }

@@ -771,13 +775,7 @@ rest_force_keyspace_compaction(http_context& ctx, std::unique_ptr<http::request>
            fmopt = flush_mode::skip;
        }
        auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt, consider_only_existing_data);
-        try {
-            co_await task->done();
-        } catch (...) {
-            apilog.error("force_keyspace_compaction: keyspace={} tables={} failed: {}", task->get_status().keyspace, table_infos, std::current_exception());
-            throw;
-        }
-
+        co_await task->done();
        co_return json_void();
 }

@@ -802,21 +800,21 @@ rest_force_keyspace_cleanup(http_context& ctx, sharded<service::storage_service>
        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
        auto task = co_await compaction_module.make_and_start_task<cleanup_keyspace_compaction_task_impl>(
            {}, std::move(keyspace), db, table_infos, flush_mode::all_tables, tasks::is_user_task::yes);
-        try {
-            co_await task->done();
-        } catch (...) {
-            apilog.error("force_keyspace_cleanup: keyspace={} tables={} failed: {}", task->get_status().keyspace, table_infos, std::current_exception());
-            throw;
-        }
-
+        co_await task->done();
        co_return json::json_return_type(0);
 }

 static
 future<json::json_return_type>
 rest_cleanup_all(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
-        apilog.info("cleanup_all");
-        auto done = co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<bool> {
+        bool global = true;
+        if (auto global_param = req->get_query_param("global"); !global_param.empty()) {
+            global = validate_bool(global_param);
+        }
+
+        apilog.info("cleanup_all global={}", global);
+
+        auto done = !global ? false : co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<bool> {
            if (!ss.is_topology_coordinator_enabled()) {
                co_return false;
            }
@@ -826,19 +824,35 @@ rest_cleanup_all(http_context& ctx, sharded<service::storage_service>& ss, std::
        if (done) {
            co_return json::json_return_type(0);
        }
-        // fall back to the local global cleanup if topology coordinator is not enabled
+        // fall back to the local cleanup if topology coordinator is not enabled or local cleanup is requested
        auto& db = ctx.db;
        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
        auto task = co_await compaction_module.make_and_start_task<global_cleanup_compaction_task_impl>({}, db);
-        try {
-            co_await task->done();
-        } catch (...) {
-            apilog.error("cleanup_all failed: {}", std::current_exception());
-            throw;
-        }
+        co_await task->done();
+
+        // Mark this node as clean
+        co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<> {
+            if (ss.is_topology_coordinator_enabled()) {
+                co_await ss.reset_cleanup_needed();
+            }
+        });
+
        co_return json::json_return_type(0);
 }

+static
+future<json::json_return_type>
+rest_reset_cleanup_needed(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
+        apilog.info("reset_cleanup_needed");
+        co_await ss.invoke_on(0, [] (service::storage_service& ss) {
+            if (!ss.is_topology_coordinator_enabled()) {
+                throw std::runtime_error("mark_node_as_clean is only supported when topology over raft is enabled");
+            }
+            return ss.reset_cleanup_needed();
+        });
+        co_return json_void();
+}
+
 static
 future<json::json_return_type>
 rest_perform_keyspace_offstrategy_compaction(http_context& ctx, std::unique_ptr<http::request> req) {
@@ -847,13 +861,7 @@ rest_perform_keyspace_offstrategy_compaction(http_context& ctx, std::unique_ptr<
        bool res = false;
        auto& compaction_module = ctx.db.local().get_compaction_manager().get_task_manager_module();
        auto task = co_await compaction_module.make_and_start_task<offstrategy_keyspace_compaction_task_impl>({}, std::move(keyspace), ctx.db, table_infos, &res);
-        try {
-            co_await task->done();
-        } catch (...) {
-            apilog.error("perform_keyspace_offstrategy_compaction: keyspace={} tables={} failed: {}", task->get_status().keyspace, table_infos, std::current_exception());
-            throw;
-        }
-
+        co_await task->done();
        co_return json::json_return_type(res);
 }

@@ -868,13 +876,7 @@ rest_upgrade_sstables(http_context& ctx, std::unique_ptr<http::request> req) {

        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
        auto task = co_await compaction_module.make_and_start_task<upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, table_infos, exclude_current_version);
-        try {
-            co_await task->done();
-        } catch (...) {
-            apilog.error("upgrade_sstables: keyspace={} tables={} failed: {}", keyspace, table_infos, std::current_exception());
-            throw;
-        }
-
+        co_await task->done();
        co_return json::json_return_type(0);
 }

@@ -1667,6 +1669,18 @@ rest_raft_topology_upgrade_status(sharded<service::storage_service>& ss, std::un
        co_return sstring(format("{}", ustate));
 }

+static
+future<json::json_return_type>
+rest_raft_topology_get_cmd_status(sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
+        const auto status = co_await ss.invoke_on(0, [] (auto& ss) {
+            return ss.get_topology_cmd_status();
+        });
+        if (status.active_dst.empty()) {
+            co_return sstring("none");
+        }
+        co_return sstring(fmt::format("{}[{}]: {}", status.current, status.index, fmt::join(status.active_dst, ",")));
+}
+
 static
 future<json::json_return_type>
 rest_move_tablet(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
@@ -1841,6 +1855,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    ss::force_keyspace_compaction.set(r, rest_bind(rest_force_keyspace_compaction, ctx));
    ss::force_keyspace_cleanup.set(r, rest_bind(rest_force_keyspace_cleanup, ctx, ss));
    ss::cleanup_all.set(r, rest_bind(rest_cleanup_all, ctx, ss));
+    ss::reset_cleanup_needed.set(r, rest_bind(rest_reset_cleanup_needed, ctx, ss));
    ss::perform_keyspace_offstrategy_compaction.set(r, rest_bind(rest_perform_keyspace_offstrategy_compaction, ctx));
    ss::upgrade_sstables.set(r, rest_bind(rest_upgrade_sstables, ctx));
    ss::force_flush.set(r, rest_bind(rest_force_flush, ctx));
@@ -1898,6 +1913,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    ss::reload_raft_topology_state.set(r, rest_bind(rest_reload_raft_topology_state, ss, group0_client));
    ss::upgrade_to_raft_topology.set(r, rest_bind(rest_upgrade_to_raft_topology, ss));
    ss::raft_topology_upgrade_status.set(r, rest_bind(rest_raft_topology_upgrade_status, ss));
+    ss::raft_topology_get_cmd_status.set(r, rest_bind(rest_raft_topology_get_cmd_status, ss));
    ss::move_tablet.set(r, rest_bind(rest_move_tablet, ctx, ss));
    ss::add_tablet_replica.set(r, rest_bind(rest_add_tablet_replica, ctx, ss));
    ss::del_tablet_replica.set(r, rest_bind(rest_del_tablet_replica, ctx, ss));
@@ -1924,6 +1940,7 @@ void unset_storage_service(http_context& ctx, routes& r) {
    ss::force_keyspace_compaction.unset(r);
    ss::force_keyspace_cleanup.unset(r);
    ss::cleanup_all.unset(r);
+    ss::reset_cleanup_needed.unset(r);
    ss::perform_keyspace_offstrategy_compaction.unset(r);
    ss::upgrade_sstables.unset(r);
    ss::force_flush.unset(r);
@@ -1979,6 +1996,7 @@ void unset_storage_service(http_context& ctx, routes& r) {
    ss::reload_raft_topology_state.unset(r);
    ss::upgrade_to_raft_topology.unset(r);
    ss::raft_topology_upgrade_status.unset(r);
+    ss::raft_topology_get_cmd_status.unset(r);
    ss::move_tablet.unset(r);
    ss::add_tablet_replica.unset(r);
    ss::del_tablet_replica.unset(r);
--- a/api/token_metadata.cc
+++ b/api/token_metadata.cc
@@ -74,6 +74,9 @@ void set_token_metadata(http_context& ctx, routes& r, sharded<locator::shared_to
    });

    ss::get_host_id_map.set(r, [&tm, &g](const_req req) {
+        if (!g.local().is_enabled()) {
+            throw std::runtime_error("The gossiper is not ready yet");
+        }
        std::vector<ss::mapper> res;
        auto map = tm.local().get()->get_host_ids() |
            std::views::transform([&g] (locator::host_id id) { return std::make_pair(g.local().get_address_map().get(id), id); }) |
--- a/auth/allow_all_authenticator.cc
+++ b/auth/allow_all_authenticator.cc
@@ -9,6 +9,7 @@
 #include "auth/allow_all_authenticator.hh"

 #include "service/migration_manager.hh"
+#include "utils/alien_worker.hh"
 #include "utils/class_registrator.hh"

 namespace auth {
@@ -21,6 +22,7 @@ static const class_registrator<
        allow_all_authenticator,
        cql3::query_processor&,
        ::service::raft_group0_client&,
-        ::service::migration_manager&> registration("org.apache.cassandra.auth.AllowAllAuthenticator");
+        ::service::migration_manager&,
+        utils::alien_worker&> registration("org.apache.cassandra.auth.AllowAllAuthenticator");

 }
--- a/auth/allow_all_authenticator.hh
+++ b/auth/allow_all_authenticator.hh
@@ -13,6 +13,7 @@
 #include "auth/authenticated_user.hh"
 #include "auth/authenticator.hh"
 #include "auth/common.hh"
+#include "utils/alien_worker.hh"

 namespace cql3 {
 class query_processor;
@@ -28,7 +29,7 @@ extern const std::string_view allow_all_authenticator_name;

 class allow_all_authenticator final : public authenticator {
 public:
-    allow_all_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&) {
+    allow_all_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&) {
    }

    virtual future<> start() override {
--- a/auth/certificate_authenticator.cc
+++ b/auth/certificate_authenticator.cc
@@ -33,13 +33,14 @@ static const class_registrator<auth::authenticator
    , auth::certificate_authenticator
    , cql3::query_processor&
    , ::service::raft_group0_client&
-    , ::service::migration_manager&> cert_auth_reg(CERT_AUTH_NAME);
+    , ::service::migration_manager&
+    , utils::alien_worker&> cert_auth_reg(CERT_AUTH_NAME);

 enum class auth::certificate_authenticator::query_source {
    subject, altname
 };

-auth::certificate_authenticator::certificate_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&)
+auth::certificate_authenticator::certificate_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&)
    : _queries([&] {
        auto& conf = qp.db().get_config();
        auto queries = conf.auth_certificate_role_queries();
--- a/auth/certificate_authenticator.hh
+++ b/auth/certificate_authenticator.hh
@@ -10,6 +10,7 @@
 #pragma once

 #include "auth/authenticator.hh"
+#include "utils/alien_worker.hh"
 #include <boost/regex_fwd.hpp>  // IWYU pragma: keep

 namespace cql3 {
@@ -31,7 +32,7 @@ class certificate_authenticator : public authenticator {
    enum class query_source;
    std::vector<std::pair<query_source, boost::regex>> _queries;
 public:
-    certificate_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&);
+    certificate_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&);
    ~certificate_authenticator();

    future<> start() override;
--- a/auth/common.cc
+++ b/auth/common.cc
@@ -119,6 +119,11 @@ future<> create_legacy_metadata_table_if_missing(
    return qs;
 }

+::service::raft_timeout get_raft_timeout() noexcept {
+    const auto wait = internal_distributed_query_state().get_client_state().get_timeout_config().other_timeout; // `other_timeout` of the internal distributed query state
+    return ::service::raft_timeout{.value = lowres_clock::now() + wait}; // absolute deadline: now + wait
+}
+
 static future<> announce_mutations_with_guard(
        ::service::raft_group0_client& group0_client,
        std::vector<canonical_mutation> muts,
--- a/auth/common.hh
+++ b/auth/common.hh
@@ -17,6 +17,7 @@

 #include "types/types.hh"
 #include "service/raft/raft_group0_client.hh"
+#include "timeout_config.hh"

 using namespace std::chrono_literals;

@@ -77,6 +78,8 @@ future<> create_legacy_metadata_table_if_missing(
 ///
 ::service::query_state& internal_distributed_query_state() noexcept;

+::service::raft_timeout get_raft_timeout() noexcept;
+
 // Execute update query via group0 mechanism, mutations will be applied on all nodes.
 // Use this function when need to perform read before write on a single guard or if
 // you have more than one mutation and potentially exceed single command size limit.
--- a/auth/ldap_role_manager.cc
+++ b/auth/ldap_role_manager.cc
@@ -233,9 +233,9 @@ future<role_set> ldap_role_manager::query_granted(std::string_view grantee_name,
 }

 future<role_to_directly_granted_map>
-ldap_role_manager::query_all_directly_granted() {
+ldap_role_manager::query_all_directly_granted(::service::query_state& qs) {
    role_to_directly_granted_map result;
-    auto roles = co_await query_all();
+    auto roles = co_await query_all(qs);
    for (auto& role: roles) {
        auto granted_set = co_await query_granted(role, recursive_role_query::no);
        for (auto& granted: granted_set) {
@@ -247,8 +247,8 @@ ldap_role_manager::query_all_directly_granted() {
    co_return result;
 }

-future<role_set> ldap_role_manager::query_all() {
-    return _std_mgr.query_all();
+future<role_set> ldap_role_manager::query_all(::service::query_state& qs) {
+    return _std_mgr.query_all(qs);
 }

 future<> ldap_role_manager::create_role(std::string_view role_name) {
@@ -311,12 +311,12 @@ future<bool> ldap_role_manager::can_login(std::string_view role_name) {
 }

 future<std::optional<sstring>> ldap_role_manager::get_attribute(
-        std::string_view role_name, std::string_view attribute_name) {
-    return _std_mgr.get_attribute(role_name, attribute_name);
+        std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
+    return _std_mgr.get_attribute(role_name, attribute_name, qs);
 }

-future<role_manager::attribute_vals> ldap_role_manager::query_attribute_for_all(std::string_view attribute_name) {
-    return _std_mgr.query_attribute_for_all(attribute_name);
+future<role_manager::attribute_vals> ldap_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state& qs) {
+    return _std_mgr.query_attribute_for_all(attribute_name, qs);
 }

 future<> ldap_role_manager::set_attribute(
@@ -338,8 +338,7 @@ future<std::vector<cql3::description>> ldap_role_manager::describe_role_grants()
 }

 future<> ldap_role_manager::ensure_superuser_is_created() {
-    // ldap is responsible for users
-    co_return;
+    return _std_mgr.ensure_superuser_is_created();
 }

 } // namespace auth
--- a/auth/ldap_role_manager.hh
+++ b/auth/ldap_role_manager.hh
@@ -75,9 +75,9 @@ class ldap_role_manager : public role_manager {

    future<role_set> query_granted(std::string_view, recursive_role_query) override;

-    future<role_to_directly_granted_map> query_all_directly_granted() override;
+    future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state&) override;

-    future<role_set> query_all() override;
+    future<role_set> query_all(::service::query_state&) override;

    future<bool> exists(std::string_view) override;

@@ -85,9 +85,9 @@ class ldap_role_manager : public role_manager {

    future<bool> can_login(std::string_view) override;

-    future<std::optional<sstring>> get_attribute(std::string_view, std::string_view) override;
+    future<std::optional<sstring>> get_attribute(std::string_view, std::string_view, ::service::query_state&) override;

-    future<role_manager::attribute_vals> query_attribute_for_all(std::string_view) override;
+    future<role_manager::attribute_vals> query_attribute_for_all(std::string_view, ::service::query_state&) override;

    future<> set_attribute(std::string_view, std::string_view, std::string_view, ::service::group0_batch& mc) override;

--- a/auth/maintenance_socket_role_manager.cc
+++ b/auth/maintenance_socket_role_manager.cc
@@ -78,11 +78,11 @@ future<role_set> maintenance_socket_role_manager::query_granted(std::string_view
    return operation_not_supported_exception<role_set>("QUERY GRANTED");
 }

-future<role_to_directly_granted_map> maintenance_socket_role_manager::query_all_directly_granted() {
+future<role_to_directly_granted_map> maintenance_socket_role_manager::query_all_directly_granted(::service::query_state&) {
    return operation_not_supported_exception<role_to_directly_granted_map>("QUERY ALL DIRECTLY GRANTED");
 }

-future<role_set> maintenance_socket_role_manager::query_all() {
+future<role_set> maintenance_socket_role_manager::query_all(::service::query_state&) {
    return operation_not_supported_exception<role_set>("QUERY ALL");
 }

@@ -98,11 +98,11 @@ future<bool> maintenance_socket_role_manager::can_login(std::string_view role_na
    return make_ready_future<bool>(true);
 }

-future<std::optional<sstring>> maintenance_socket_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name) {
+future<std::optional<sstring>> maintenance_socket_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state&) {
    return operation_not_supported_exception<std::optional<sstring>>("GET ATTRIBUTE");
 }

-future<role_manager::attribute_vals> maintenance_socket_role_manager::query_attribute_for_all(std::string_view attribute_name) {
+future<role_manager::attribute_vals> maintenance_socket_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state&) {
    return operation_not_supported_exception<role_manager::attribute_vals>("QUERY ATTRIBUTE");
 }

--- a/auth/maintenance_socket_role_manager.hh
+++ b/auth/maintenance_socket_role_manager.hh
@@ -53,9 +53,9 @@ public:

    virtual future<role_set> query_granted(std::string_view grantee_name, recursive_role_query) override;

-    virtual future<role_to_directly_granted_map> query_all_directly_granted() override;
+    virtual future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state&) override;

-    virtual future<role_set> query_all() override;
+    virtual future<role_set> query_all(::service::query_state&) override;

    virtual future<bool> exists(std::string_view role_name) override;

@@ -63,9 +63,9 @@ public:

    virtual future<bool> can_login(std::string_view role_name) override;

-    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name) override;
+    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state&) override;

-    virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name) override;
+    virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name, ::service::query_state&) override;

    virtual future<> set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) override;

--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -48,14 +48,14 @@ static const class_registrator<
        password_authenticator,
        cql3::query_processor&,
        ::service::raft_group0_client&,
-        ::service::migration_manager&> password_auth_reg("org.apache.cassandra.auth.PasswordAuthenticator");
+        ::service::migration_manager&,
+        utils::alien_worker&> password_auth_reg("org.apache.cassandra.auth.PasswordAuthenticator");

 static thread_local auto rng_for_salt = std::default_random_engine(std::random_device{}());

 static std::string_view get_config_value(std::string_view value, std::string_view def) {
    return value.empty() ? def : value;
 }
-
 std::string password_authenticator::default_superuser(const db::config& cfg) {
    return std::string(get_config_value(cfg.auth_superuser_name(), DEFAULT_USER_NAME));
 }
@@ -63,12 +63,13 @@ std::string password_authenticator::default_superuser(const db::config& cfg) {
 password_authenticator::~password_authenticator() {
 }

-password_authenticator::password_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm)
+password_authenticator::password_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, utils::alien_worker& hashing_worker)
    : _qp(qp)
    , _group0_client(g0)
    , _migration_manager(mm)
    , _stopped(make_ready_future<>()) 
    , _superuser(default_superuser(qp.db().get_config()))
+    , _hashing_worker(hashing_worker)
 {}

 static bool has_salted_hash(const cql3::untyped_result_set_row& row) {
@@ -117,33 +118,95 @@ future<> password_authenticator::migrate_legacy_metadata() const {
    });
 }

-future<> password_authenticator::create_default_if_missing() {
+future<> password_authenticator::legacy_create_default_if_missing() {
+    SCYLLA_ASSERT(legacy_mode(_qp));
    const auto exists = co_await default_role_row_satisfies(_qp, &has_salted_hash, _superuser);
    if (exists) {
        co_return;
    }
    std::string salted_pwd(get_config_value(_qp.db().get_config().auth_superuser_salted_password(), ""));
    if (salted_pwd.empty()) {
-        salted_pwd = passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt);
+        salted_pwd = passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt, _scheme);
    }
    const auto query = update_row_query();
-    if (legacy_mode(_qp)) {
-        co_await _qp.execute_internal(
+    co_await _qp.execute_internal(
            query,
            db::consistency_level::QUORUM,
            internal_distributed_query_state(),
            {salted_pwd, _superuser},
            cql3::query_processor::cache_internal::no);
-        plogger.info("Created default superuser authentication record.");
-    } else {
-        co_await announce_mutations(_qp, _group0_client, query,
-            {salted_pwd, _superuser}, _as, ::service::raft_timeout{});
-        plogger.info("Created default superuser authentication record.");
+    plogger.info("Created default superuser authentication record.");
+}
+
+// Sets the default superuser's password through a group0 batch when that role exists and no superuser has a password.
+future<> password_authenticator::maybe_create_default_password() {
+    auto needs_password = [this] () -> future<bool> {
+        const sstring query = seastar::format("SELECT * FROM {}.{} WHERE is_superuser = true ALLOW FILTERING", get_auth_ks_name(_qp), meta::roles_table::name);
+        auto results = co_await _qp.execute_internal(query,
+                db::consistency_level::LOCAL_ONE,
+                internal_distributed_query_state(), cql3::query_processor::cache_internal::yes);
+        // A default password is needed only when the default superuser row
+        // exists and none of the superuser rows has a salted hash; in every
+        // other case (no default superuser, or any superuser with a password) skip it.
+        bool has_default = false;
+        bool has_superuser_with_password = false;
+        for (auto& result : *results) {
+            if (result.get_as<sstring>(meta::roles_table::role_col_name) == _superuser) {
+                has_default = true;
+            }
+            if (has_salted_hash(result)) {
+                has_superuser_with_password = true;
+            }
+        }
+        co_return has_default && !has_superuser_with_password;
+    };
+    if (!co_await needs_password()) {
+        co_return;
+    }
+    // Start the group0 operation only now, so the common case avoids the quorum requirement.
+    ::service::group0_batch batch(
+            co_await _group0_client.start_operation(_as, get_raft_timeout()));
+    // Check again as the state may have changed before we took the guard (batch).
+    if (!co_await needs_password()) {
+        co_return;
+    }
+    // Use the salted password from the config if set; otherwise hash the default password.
+    std::string salted_pwd(get_config_value(_qp.db().get_config().auth_superuser_salted_password(), ""));
+    if (salted_pwd.empty()) {
+        salted_pwd = passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt, _scheme);
+    }
+    const auto update_query = update_row_query();
+    co_await collect_mutations(_qp, batch, update_query, {salted_pwd, _superuser});
+    co_await std::move(batch).commit(_group0_client, _as, get_raft_timeout());
+    plogger.info("Created default superuser authentication record.");
+}
+
+future<> password_authenticator::maybe_create_default_password_with_retries() {
+    // The retry budget for guard conflicts comes from the migration manager's concurrent-DDL retry count.
+    for (size_t attempts_left = _migration_manager.get_concurrent_ddl_retries();;) {
+        try {
+            co_return co_await maybe_create_default_password();
+        } catch (const ::service::group0_concurrent_modification&) {
+            const bool retry = attempts_left != 0;
+            plogger.warn("Failed to execute maybe_create_default_password due to guard conflict.{}.", retry ? " Retrying" : " Number of retries exceeded, giving up");
+            if (retry) { --attempts_left; continue; }
+            // Out of retries: report it, but let the node startup sequence carry on.
+            plogger.error("Failed to create default superuser password due to guard conflict.");
+            co_return;
+        } catch (const ::service::raft_operation_timeout_error& timeout) {
+            // A timed-out raft operation is likewise only logged.
+            plogger.error("Failed to create default superuser password due to exception: {}", timeout.what());
+            co_return;
+        }
    }
 }

 future<> password_authenticator::start() {
    return once_among_shards([this] {
+        // Verify that the configured hashing scheme is supported on this system.
+        passwords::detail::verify_scheme(_scheme);
+        plogger.info("Using password hashing scheme: {}", passwords::detail::prefix_for_scheme(_scheme));
+
        _stopped = do_after_system_ready(_as, [this] {
            return async([this] {
                if (legacy_mode(_qp)) {
@@ -164,11 +227,14 @@ future<> password_authenticator::start() {
                        migrate_legacy_metadata().get();
                        return;
                    }
+                    legacy_create_default_if_missing().get();
                }
                utils::get_local_injector().inject("password_authenticator_start_pause", utils::wait_for_message(5min)).get();
-                create_default_if_missing().get();
                if (!legacy_mode(_qp)) {
-                    _superuser_created_promise.set_value();
+                    maybe_create_default_password_with_retries().get();
+                    if (!_superuser_created_promise.available()) {
+                        _superuser_created_promise.set_value();
+                    }
                }
            });
        });
@@ -228,7 +294,13 @@ future<authenticated_user> password_authenticator::authenticate(

    try {
        const std::optional<sstring> salted_hash = co_await get_password_hash(username);
-        if (!salted_hash || !passwords::check(password, *salted_hash)) {
+        if (!salted_hash) {
+            throw exceptions::authentication_exception("Username and/or password are incorrect");
+        }
+        const bool password_match = co_await _hashing_worker.submit<bool>([password = std::move(password), salted_hash = std::move(salted_hash)]{
+            return passwords::check(password, *salted_hash);
+        });
+        if (!password_match) {
            throw exceptions::authentication_exception("Username and/or password are incorrect");
        }
        co_return username;
@@ -252,7 +324,7 @@ future<> password_authenticator::create(std::string_view role_name, const authen
    auto maybe_hash = options.credentials.transform([&] (const auto& creds) -> sstring {
        return std::visit(make_visitor(
                [&] (const password_option& opt) {
-                    return passwords::hash(opt.password, rng_for_salt);
+                    return passwords::hash(opt.password, rng_for_salt, _scheme);
                },
                [] (const hashed_password_option& opt) {
                    return opt.hashed_password;
@@ -295,11 +367,11 @@ future<> password_authenticator::alter(std::string_view role_name, const authent
                query,
                consistency_for_user(role_name),
                internal_distributed_query_state(),
-                {passwords::hash(password, rng_for_salt), sstring(role_name)},
+                {passwords::hash(password, rng_for_salt, _scheme), sstring(role_name)},
                cql3::query_processor::cache_internal::no).discard_result();
    } else {
        co_await collect_mutations(_qp, mc, query,
-                {passwords::hash(password, rng_for_salt), sstring(role_name)});
+                {passwords::hash(password, rng_for_salt, _scheme), sstring(role_name)});
    }
 }

--- a/auth/password_authenticator.hh
+++ b/auth/password_authenticator.hh
@@ -15,7 +15,9 @@

 #include "db/consistency_level_type.hh"
 #include "auth/authenticator.hh"
+#include "auth/passwords.hh"
 #include "service/raft/raft_group0_client.hh"
+#include "utils/alien_worker.hh"

 namespace db {
    class config;
@@ -41,14 +43,17 @@ class password_authenticator : public authenticator {
    ::service::migration_manager& _migration_manager;
    future<> _stopped;
    abort_source _as;
-    std::string _superuser;
+    std::string _superuser; // default superuser name from the config (may or may not be present in roles table)
    shared_promise<> _superuser_created_promise;
+    // Scheme for hashing new passwords. We used to also pick bcrypt, SHA-256, or MD5 here (ref. scylladb#24524).
+    constexpr static auth::passwords::scheme _scheme = passwords::scheme::sha_512;
+    utils::alien_worker& _hashing_worker;

 public:
    static db::consistency_level consistency_for_user(std::string_view role_name);
    static std::string default_superuser(const db::config&);

-    password_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&);
+    password_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&);

    ~password_authenticator();

@@ -89,7 +94,10 @@ private:

    future<> migrate_legacy_metadata() const;

-    future<> create_default_if_missing();
+    future<> legacy_create_default_if_missing();
+
+    future<> maybe_create_default_password();
+    future<> maybe_create_default_password_with_retries();

    sstring update_row_query() const;
 };
--- a/auth/passwords.cc
+++ b/auth/passwords.cc
@@ -21,18 +21,14 @@ static thread_local crypt_data tlcrypt = {};

 namespace detail {

-scheme identify_best_supported_scheme() {
-    const auto all_schemes = { scheme::bcrypt_y, scheme::bcrypt_a, scheme::sha_512, scheme::sha_256, scheme::md5 };
-    // "Random", for testing schemes.
+void verify_scheme(scheme scheme) {
    const sstring random_part_of_salt = "aaaabbbbccccdddd";

-    for (scheme c : all_schemes) {
-        const sstring salt = sstring(prefix_for_scheme(c)) + random_part_of_salt;
-        const char* e = crypt_r("fisk", salt.c_str(), &tlcrypt);
+    const sstring salt = sstring(prefix_for_scheme(scheme)) + random_part_of_salt;
+    const char* e = crypt_r("fisk", salt.c_str(), &tlcrypt);

-        if (e && (e[0] != '*')) {
-            return c;
-        }
+    if (e && (e[0] != '*')) {
+        return;
    }

    throw no_supported_schemes();
--- a/auth/passwords.hh
+++ b/auth/passwords.hh
@@ -21,10 +21,11 @@ class no_supported_schemes : public std::runtime_error {
 public:
    no_supported_schemes();
 };
-
 ///
-/// Apache Cassandra uses a library to provide the bcrypt scheme. Many Linux implementations do not support bcrypt, so
-/// we support alternatives. The cost is loss of direct compatibility with Apache Cassandra system tables.
+/// Apache Cassandra uses a library to provide the bcrypt scheme. In ScyllaDB, we use SHA-512
+/// instead of bcrypt for performance and for historical reasons (see scylladb#24524).
+/// Currently, SHA-512 is always chosen as the hashing scheme for new passwords, but the other
+/// algorithms remain supported for CREATE ROLE WITH HASHED PASSWORD and backward compatibility.
 ///
 enum class scheme {
    bcrypt_y,
@@ -51,11 +52,11 @@ sstring generate_random_salt_bytes(RandomNumberEngine& g) {
 }

 ///
-/// Test each allowed hashing scheme and report the best supported one on the current system.
+/// Test given hashing scheme on the current system.
 ///
-/// \throws \ref no_supported_schemes when none of the known schemes is supported.
+/// \throws \ref no_supported_schemes when scheme is unsupported.
 ///
-scheme identify_best_supported_scheme();
+void verify_scheme(scheme scheme);

 std::string_view prefix_for_scheme(scheme) noexcept;

@@ -67,8 +68,7 @@ std::string_view prefix_for_scheme(scheme) noexcept;
 /// \throws \ref no_supported_schemes when no known hashing schemes are supported on the system.
 ///
 template <typename RandomNumberEngine>
-sstring generate_salt(RandomNumberEngine& g) {
-    static const scheme scheme = identify_best_supported_scheme();
-    static const sstring prefix = sstring(prefix_for_scheme(scheme));
+sstring generate_salt(RandomNumberEngine& g, scheme scheme) {
+    const sstring prefix = sstring(prefix_for_scheme(scheme)); // not static: must follow the per-call `scheme`, a static would freeze the first one
     return prefix + generate_random_salt_bytes(g);
 }
@@ -93,8 +93,8 @@ sstring hash_with_salt(const sstring& pass, const sstring& salt);
 /// \throws \ref std::system_error when the implementation-specific implementation fails to hash the cleartext.
 ///
 template <typename RandomNumberEngine>
-sstring hash(const sstring& pass, RandomNumberEngine& g) {
-    return detail::hash_with_salt(pass, detail::generate_salt(g));
+sstring hash(const sstring& pass, RandomNumberEngine& g, scheme scheme) {
+    return detail::hash_with_salt(pass, detail::generate_salt(g, scheme));
 }

 ///
--- a/auth/role_manager.hh
+++ b/auth/role_manager.hh
@@ -17,12 +17,17 @@
 #include <seastar/core/format.hh>
 #include <seastar/core/sstring.hh>

+#include "auth/common.hh"
 #include "auth/resource.hh"
 #include "cql3/description.hh"
 #include "seastarx.hh"
 #include "exceptions/exceptions.hh"
 #include "service/raft/raft_group0_client.hh"

+namespace service {
+class query_state; // forward declaration
+}
+
 namespace auth {

 struct role_config final {
@@ -167,9 +172,9 @@ public:
    ///   (role2, role3)
    /// }
    ///  
-    virtual future<role_to_directly_granted_map> query_all_directly_granted() = 0;
+    virtual future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state& = internal_distributed_query_state()) = 0;

-    virtual future<role_set> query_all() = 0;
+    virtual future<role_set> query_all(::service::query_state& = internal_distributed_query_state()) = 0;

    virtual future<bool> exists(std::string_view role_name) = 0;

@@ -186,12 +191,12 @@ public:
    ///
    /// \returns the value of the named attribute, if one is set.
    ///
-    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name) = 0;
+    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& = internal_distributed_query_state()) = 0;

    ///
    /// \returns a mapping of each role's value for the named attribute, if one is set for the role.
    ///
-    virtual future<attribute_vals> query_attribute_for_all(std::string_view attribute_name) = 0;
+    virtual future<attribute_vals> query_attribute_for_all(std::string_view attribute_name, ::service::query_state& = internal_distributed_query_state()) = 0;

    /// Sets `attribute_name` with `attribute_value` for `role_name`.
    /// \returns an exceptional future with nonexistant_role if the role does not exist.
--- a/auth/saslauthd_authenticator.cc
+++ b/auth/saslauthd_authenticator.cc
@@ -34,9 +34,10 @@ static const class_registrator<
        saslauthd_authenticator,
        cql3::query_processor&,
        ::service::raft_group0_client&,
-        ::service::migration_manager&> saslauthd_auth_reg("com.scylladb.auth.SaslauthdAuthenticator");
+        ::service::migration_manager&,
+        utils::alien_worker&> saslauthd_auth_reg("com.scylladb.auth.SaslauthdAuthenticator");

-saslauthd_authenticator::saslauthd_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&)
+saslauthd_authenticator::saslauthd_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&)
    : _socket_path(qp.db().get_config().saslauthd_socket_path())
 {}

--- a/auth/saslauthd_authenticator.hh
+++ b/auth/saslauthd_authenticator.hh
@@ -11,6 +11,7 @@
 #pragma once

 #include "auth/authenticator.hh"
+#include "utils/alien_worker.hh"

 namespace cql3 {
 class query_processor;
@@ -28,7 +29,7 @@ namespace auth {
 class saslauthd_authenticator : public authenticator {
    sstring _socket_path; ///< Path to the domain socket on which saslauthd is listening.
 public:
-    saslauthd_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&);
+    saslauthd_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, utils::alien_worker&);

    future<> start() override;

--- a/auth/service.cc
+++ b/auth/service.cc
@@ -187,14 +187,15 @@ service::service(
        ::service::migration_notifier& mn,
        ::service::migration_manager& mm,
        const service_config& sc,
-        maintenance_socket_enabled used_by_maintenance_socket)
+        maintenance_socket_enabled used_by_maintenance_socket,
+        utils::alien_worker& hashing_worker)
            : service(
                      std::move(c),
                      qp,
                      g0,
                      mn,
                      create_object<authorizer>(sc.authorizer_java_name, qp, g0, mm),
-                      create_object<authenticator>(sc.authenticator_java_name, qp, g0, mm),
+                      create_object<authenticator>(sc.authenticator_java_name, qp, g0, mm, hashing_worker),
                      create_object<role_manager>(sc.role_manager_java_name, qp, g0, mm),
                      used_by_maintenance_socket) {
 }
@@ -240,6 +241,13 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s
        });
    }
    co_await _role_manager->start();
+    if (this_shard_id() == 0) {
+        // Role manager and password authenticator have this odd startup
+        // mechanism where they asynchronously create the superuser role
+        // in the background. Correct password creation depends on role
+        // creation therefore we need to wait here.
+        co_await _role_manager->ensure_superuser_is_created();
+    }
    co_await when_all_succeed(_authorizer->start(), _authenticator->start()).discard_result();
    _permissions_cache = std::make_unique<permissions_cache>(_loading_cache_config, *this, log);
    co_await once_among_shards([this] {
--- a/auth/service.hh
+++ b/auth/service.hh
@@ -26,6 +26,7 @@
 #include "cql3/description.hh"
 #include "seastarx.hh"
 #include "service/raft/raft_group0_client.hh"
+#include "utils/alien_worker.hh"
 #include "utils/observable.hh"
 #include "utils/serialized_action.hh"
 #include "service/maintenance_mode.hh"
@@ -126,7 +127,8 @@ public:
            ::service::migration_notifier&,
            ::service::migration_manager&,
            const service_config&,
-            maintenance_socket_enabled);
+            maintenance_socket_enabled,
+            utils::alien_worker&);

    future<> start(::service::migration_manager&, db::system_keyspace&);

--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -9,6 +9,7 @@
 #include "auth/standard_role_manager.hh"

 #include <optional>
+#include <stdexcept>
 #include <unordered_set>
 #include <vector>

@@ -28,6 +29,7 @@
 #include "cql3/util.hh"
 #include "db/consistency_level_type.hh"
 #include "exceptions/exceptions.hh"
+#include "utils/error_injection.hh"
 #include "utils/log.hh"
 #include <seastar/core/loop.hh>
 #include <seastar/coroutine/maybe_yield.hh>
@@ -178,7 +180,8 @@ future<> standard_role_manager::create_legacy_metadata_tables_if_missing() const
                    _migration_manager)).discard_result();
 }

-future<> standard_role_manager::create_default_role_if_missing() {
+future<> standard_role_manager::legacy_create_default_role_if_missing() {
+    SCYLLA_ASSERT(legacy_mode(_qp));
    try {
        const auto exists = co_await default_role_row_satisfies(_qp, &has_can_login, _superuser);
        if (exists) {
@@ -188,16 +191,12 @@ future<> standard_role_manager::create_default_role_if_missing() {
                get_auth_ks_name(_qp),
                meta::roles_table::name,
                meta::roles_table::role_col_name);
-        if (legacy_mode(_qp)) {
-            co_await _qp.execute_internal(
-                    query,
-                    db::consistency_level::QUORUM,
-                    internal_distributed_query_state(),
-                    {_superuser},
-                    cql3::query_processor::cache_internal::no).discard_result();
-        } else {
-            co_await announce_mutations(_qp, _group0_client, query, {_superuser}, _as, ::service::raft_timeout{});
-        }
+        co_await _qp.execute_internal(
+                query,
+                db::consistency_level::QUORUM,
+                internal_distributed_query_state(),
+                {_superuser},
+                cql3::query_processor::cache_internal::no).discard_result();
        log.info("Created default superuser role '{}'.", _superuser);
    } catch(const exceptions::unavailable_exception& e) {
        log.warn("Skipped default role setup: some nodes were not ready; will retry");
@@ -205,6 +204,60 @@ future<> standard_role_manager::create_default_role_if_missing() {
    }
 }

+future<> standard_role_manager::maybe_create_default_role() { // inserts the default superuser role through a group0 batch unless has_superuser() below already reports one
+    auto has_superuser = [this] () -> future<bool> { // true if any row with is_superuser = true passes has_can_login()
+        const sstring query = seastar::format("SELECT * FROM {}.{} WHERE is_superuser = true ALLOW FILTERING", get_auth_ks_name(_qp), meta::roles_table::name);
+        auto results = co_await _qp.execute_internal(query, db::consistency_level::LOCAL_ONE,
+                internal_distributed_query_state(), cql3::query_processor::cache_internal::yes);
+        for (const auto& result : *results) {
+            if (has_can_login(result)) {
+                co_return true;
+            }
+        }
+        co_return false;
+    };
+    if (co_await has_superuser()) {
+        co_return;
+    }
+    // Start the group0 operation only after the LOCAL_ONE check above, so that
+    // the common case (a superuser already exists) avoids the quorum requirement.
+    ::service::group0_batch batch(
+            co_await _group0_client.start_operation(_as, get_raft_timeout()));
+    // Check again: the state may have changed before we took the guard (batch).
+    if (co_await has_superuser()) {
+        co_return;
+    }
+    // There is no superuser that has the can_login field - create the default role.
+    // Note that we don't check whether can_login is set to true.
+    const sstring insert_query = seastar::format("INSERT INTO {}.{} ({}, is_superuser, can_login) VALUES (?, true, true)",
+            get_auth_ks_name(_qp),
+            meta::roles_table::name,
+            meta::roles_table::role_col_name);
+    co_await collect_mutations(_qp, batch, insert_query, {_superuser});
+    co_await std::move(batch).commit(_group0_client, _as, get_raft_timeout());
+    log.info("Created default superuser role '{}'.", _superuser);
+}
+
+future<> standard_role_manager::maybe_create_default_role_with_retries() { // runs maybe_create_default_role(); guard conflicts are retried, and both they and raft timeouts are logged rather than propagated
+    size_t retries = _migration_manager.get_concurrent_ddl_retries(); // retry budget comes from the migration manager
+    while (true)  {
+        try {
+            co_return co_await maybe_create_default_role();
+        } catch (const ::service::group0_concurrent_modification& ex) {
+            log.warn("Failed to execute maybe_create_default_role due to guard conflict.{}.", retries ? " Retrying" : " Number of retries exceeded, giving up");
+            if (retries--) { // post-decrement: consumes one retry while any remain
+                continue;
+            }
+            // Log error but don't crash the whole node startup sequence.
+            log.error("Failed to create default superuser role due to guard conflict.");
+            co_return;
+        } catch (const ::service::raft_operation_timeout_error& ex) { // timeouts are not retried
+            log.error("Failed to create default superuser role due to exception: {}", ex.what());
+            co_return;
+        }
+    }
+}
+
 static const sstring legacy_table_name{"users"};

 bool standard_role_manager::legacy_metadata_exists() {
@@ -266,10 +319,13 @@ future<> standard_role_manager::start() {
                    co_await migrate_legacy_metadata();
                    co_return;
                }
+                co_await legacy_create_default_role_if_missing();
            }
-            co_await create_default_role_if_missing();
            if (!legacy) {
-                _superuser_created_promise.set_value();
+                co_await maybe_create_default_role_with_retries();
+                if (!_superuser_created_promise.available()) {
+                    _superuser_created_promise.set_value();
+                }
            }
        };

@@ -596,21 +652,30 @@ future<role_set> standard_role_manager::query_granted(std::string_view grantee_n
    });
 }

-future<role_to_directly_granted_map> standard_role_manager::query_all_directly_granted() {
+future<role_to_directly_granted_map> standard_role_manager::query_all_directly_granted(::service::query_state& qs) { // selects every row of the role-members table using the caller-supplied query state
    const sstring query = seastar::format("SELECT * FROM {}.{}",
            get_auth_ks_name(_qp),
            meta::role_members_table::name);

+    const auto results = co_await _qp.execute_internal(
+            query,
+            db::consistency_level::ONE,
+            qs,
+            cql3::query_processor::cache_internal::yes);
+
    role_to_directly_granted_map roles_map;
-    co_await _qp.query_internal(query, [&roles_map] (const cql3::untyped_result_set_row& row) -> future<stop_iteration> {
-        roles_map.insert({row.get_as<sstring>("member"), row.get_as<sstring>("role")});
-        co_return stop_iteration::no;
-    });
+    std::transform( // one (member, role) pair is inserted per result row
+            results->begin(),
+            results->end(),
+            std::inserter(roles_map, roles_map.begin()),
+            [] (const cql3::untyped_result_set_row& row) {
+                return std::make_pair(row.get_as<sstring>("member"), row.get_as<sstring>("role")); }
+    );

    co_return roles_map;
 }

-future<role_set> standard_role_manager::query_all() {
+future<role_set> standard_role_manager::query_all(::service::query_state& qs) {
    const sstring query = seastar::format("SELECT {} FROM {}.{}",
            meta::roles_table::role_col_name,
            get_auth_ks_name(_qp),
@@ -619,10 +684,16 @@ future<role_set> standard_role_manager::query_all() {
    // To avoid many copies of a view.
    static const auto role_col_name_string = sstring(meta::roles_table::role_col_name);

+    if (utils::get_local_injector().enter("standard_role_manager_fail_legacy_query")) {
+        if (legacy_mode(_qp)) {
+            throw std::runtime_error("standard_role_manager::query_all: failed due to error injection");
+        }
+    }
+
    const auto results = co_await _qp.execute_internal(
            query,
            db::consistency_level::QUORUM,
-            internal_distributed_query_state(),
+            qs,
            cql3::query_processor::cache_internal::yes);

    role_set roles;
@@ -654,11 +725,11 @@ future<bool> standard_role_manager::can_login(std::string_view role_name) {
    });
 }

-future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name) {
+future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
    const sstring query = seastar::format("SELECT name, value FROM {}.{} WHERE role = ? AND name = ?",
            get_auth_ks_name(_qp),
            meta::role_attributes_table::name);
-    const auto result_set = co_await _qp.execute_internal(query, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes);
+    const auto result_set = co_await _qp.execute_internal(query, db::consistency_level::ONE, qs, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes);
    if (!result_set->empty()) {
        const cql3::untyped_result_set_row &row = result_set->one();
        co_return std::optional<sstring>(row.get_as<sstring>("value"));
@@ -666,11 +737,11 @@ future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_
    co_return std::optional<sstring>{};
 }

-future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all (std::string_view attribute_name) {
-    return query_all().then([this, attribute_name] (role_set roles) {
-        return do_with(attribute_vals{}, [this, attribute_name, roles = std::move(roles)] (attribute_vals &role_to_att_val) {
-            return parallel_for_each(roles.begin(), roles.end(), [this, &role_to_att_val, attribute_name] (sstring role) {
-                return get_attribute(role, attribute_name).then([&role_to_att_val, role] (std::optional<sstring> att_val) {
+future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all (std::string_view attribute_name, ::service::query_state& qs) {
+    return query_all(qs).then([this, attribute_name, &qs] (role_set roles) {
+        return do_with(attribute_vals{}, [this, attribute_name, roles = std::move(roles), &qs] (attribute_vals &role_to_att_val) {
+            return parallel_for_each(roles.begin(), roles.end(), [this, &role_to_att_val, attribute_name, &qs] (sstring role) {
+                return get_attribute(role, attribute_name, qs).then([&role_to_att_val, role] (std::optional<sstring> att_val) {
                    if (att_val) {
                        role_to_att_val.emplace(std::move(role), std::move(*att_val));
                    }
@@ -715,7 +786,7 @@ future<> standard_role_manager::remove_attribute(std::string_view role_name, std
 future<std::vector<cql3::description>> standard_role_manager::describe_role_grants() {
    std::vector<cql3::description> result{};

-    const auto grants = co_await query_all_directly_granted();
+    const auto grants = co_await query_all_directly_granted(internal_distributed_query_state());
    result.reserve(grants.size());

    for (const auto& [grantee_role, granted_role] : grants) {
--- a/auth/standard_role_manager.hh
+++ b/auth/standard_role_manager.hh
@@ -66,9 +66,9 @@ public:

    virtual future<role_set> query_granted(std::string_view grantee_name, recursive_role_query) override;

-    virtual future<role_to_directly_granted_map> query_all_directly_granted() override;
+    virtual future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state&) override;

-    virtual future<role_set> query_all() override;
+    virtual future<role_set> query_all(::service::query_state&) override;

    virtual future<bool> exists(std::string_view role_name) override;

@@ -76,9 +76,9 @@ public:

    virtual future<bool> can_login(std::string_view role_name) override;

-    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name) override;
+    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state&) override;

-    virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name) override;
+    virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name, ::service::query_state&) override;

    virtual future<> set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) override;

@@ -95,7 +95,10 @@ private:

    future<> migrate_legacy_metadata();

-    future<> create_default_role_if_missing();
+    future<> legacy_create_default_role_if_missing();
+
+    future<> maybe_create_default_role();
+    future<> maybe_create_default_role_with_retries();

    future<> create_or_replace(std::string_view role_name, const role_config&, ::service::group0_batch&);

--- a/auth/transitional.cc
+++ b/auth/transitional.cc
@@ -37,8 +37,8 @@ class transitional_authenticator : public authenticator {
 public:
    static const sstring PASSWORD_AUTHENTICATOR_NAME;

-    transitional_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm)
-            : transitional_authenticator(std::make_unique<password_authenticator>(qp, g0, mm)) {
+    transitional_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, utils::alien_worker& hashing_worker)
+            : transitional_authenticator(std::make_unique<password_authenticator>(qp, g0, mm, hashing_worker)) { // delegates to the unique_ptr constructor, wrapping a new password_authenticator
    }
    transitional_authenticator(std::unique_ptr<authenticator> a)
            : _authenticator(std::move(a)) {
@@ -239,7 +239,8 @@ static const class_registrator<
        auth::transitional_authenticator,
        cql3::query_processor&,
        ::service::raft_group0_client&,
-        ::service::migration_manager&> transitional_authenticator_reg(auth::PACKAGE_NAME + "TransitionalAuthenticator");
+        ::service::migration_manager&,
+        utils::alien_worker&> transitional_authenticator_reg(auth::PACKAGE_NAME + "TransitionalAuthenticator");

 static const class_registrator<
        auth::authorizer,
--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -960,8 +960,12 @@ public:
    // Given a reference to such a column from the base schema, this function sets the corresponding column
    // in the log to the given value for the given row.
    void set_value(const clustering_key& log_ck, const column_definition& base_cdef, const managed_bytes_view& value) {
-        auto& log_cdef = *_log_schema.get_column_definition(log_data_column_name_bytes(base_cdef.name()));
-        _log_mut.set_cell(log_ck, log_cdef, atomic_cell::make_live(*base_cdef.type, _ts, value, _ttl));
+        auto log_cdef_ptr = _log_schema.get_column_definition(log_data_column_name_bytes(base_cdef.name()));
+        if (!log_cdef_ptr) { // fail the request instead of dereferencing a null column definition
+            throw exceptions::invalid_request_exception(format("CDC log schema for {}.{} does not have base column {}",
+                _log_schema.ks_name(), _log_schema.cf_name(), base_cdef.name_as_text()));
+        }
+        _log_mut.set_cell(log_ck, *log_cdef_ptr, atomic_cell::make_live(*base_cdef.type, _ts, value, _ttl)); // the cell is built with the base column's type
    }

    // Each regular and static column in the base schema has a corresponding column in the log schema
@@ -969,7 +973,13 @@ public:
    // Given a reference to such a column from the base schema, this function sets the corresponding column
    // in the log to `true` for the given row. If not called, the column will be `null`.
    void set_deleted(const clustering_key& log_ck, const column_definition& base_cdef) {
-        _log_mut.set_cell(log_ck, log_data_column_deleted_name_bytes(base_cdef.name()), data_value(true), _ts, _ttl);
+        auto log_cdef_ptr = _log_schema.get_column_definition(log_data_column_deleted_name_bytes(base_cdef.name()));
+        if (!log_cdef_ptr) { // fail the request instead of dereferencing a null column definition
+            throw exceptions::invalid_request_exception(format("CDC log schema for {}.{} does not have base column {}",
+                _log_schema.ks_name(), _log_schema.cf_name(), base_cdef.name_as_text()));
+        }
+        auto& log_cdef = *log_cdef_ptr;
+        _log_mut.set_cell(log_ck, log_cdef, atomic_cell::make_live(*log_cdef.type, _ts, log_cdef.type->decompose(true), _ttl)); // `true`, serialized with the log column's own type
    }

    // Each regular and static non-atomic column in the base schema has a corresponding column in the log schema
@@ -978,7 +988,12 @@ public:
    // Given a reference to such a column from the base schema, this function sets the corresponding column
    // in the log to the given set of keys for the given row.
    void set_deleted_elements(const clustering_key& log_ck, const column_definition& base_cdef, const managed_bytes& deleted_elements) {
-        auto& log_cdef = *_log_schema.get_column_definition(log_data_column_deleted_elements_name_bytes(base_cdef.name()));
+        auto log_cdef_ptr = _log_schema.get_column_definition(log_data_column_deleted_elements_name_bytes(base_cdef.name()));
+        if (!log_cdef_ptr) { // fail the request instead of dereferencing a null column definition
+            throw exceptions::invalid_request_exception(format("CDC log schema for {}.{} does not have base column {}",
+                _log_schema.ks_name(), _log_schema.cf_name(), base_cdef.name_as_text()));
+        }
+        auto& log_cdef = *log_cdef_ptr;
        _log_mut.set_cell(log_ck, log_cdef, atomic_cell::make_live(*log_cdef.type, _ts, deleted_elements, _ttl));
    }

@@ -1865,5 +1880,10 @@ bool cdc::cdc_service::needs_cdc_augmentation(const std::vector<mutation>& mutat

 future<std::tuple<std::vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>
 cdc::cdc_service::augment_mutation_call(lowres_clock::time_point timeout, std::vector<mutation>&& mutations, tracing::trace_state_ptr tr_state, db::consistency_level write_cl) {
+    if (utils::get_local_injector().enter("sleep_before_cdc_augmentation")) { // error injection: sleep 100ms, then make the same _impl call as the normal path
+        return seastar::sleep(std::chrono::milliseconds(100)).then([this, timeout, mutations = std::move(mutations), tr_state = std::move(tr_state), write_cl] () mutable {
+            return _impl->augment_mutation_call(timeout, std::move(mutations), std::move(tr_state), write_cl);
+        });
+    }
    return _impl->augment_mutation_call(timeout, std::move(mutations), std::move(tr_state), write_cl);
 }
--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -1910,7 +1910,11 @@ static future<compaction_result> scrub_sstables_validate_mode(sstables::compacti
    using scrub = sstables::compaction_type_options::scrub;
    if (validation_errors != 0 && descriptor.options.as<scrub>().quarantine_sstables == scrub::quarantine_invalid_sstables::yes) {
        for (auto& sst : descriptor.sstables) {
-            co_await sst->change_state(sstables::sstable_state::quarantine);
+            try {
+                co_await sst->change_state(sstables::sstable_state::quarantine);
+            } catch (...) {
+                clogger.error("Moving {} to quarantine failed due to {}, continuing.", sst->get_filename(), std::current_exception());
+            }
        }
    }

--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -1126,16 +1126,16 @@ future<> compaction_manager::drain() {
        // Disable the state so that it can be enabled later if requested.
        _state = state::disabled;
    }
+    _compaction_submission_timer.cancel();
    // Stop ongoing compactions, if the request has not been sent already and wait for them to stop.
    co_await stop_ongoing_compactions("drain");
+    // Trigger a signal to properly exit from postponed_compactions_reevaluation() fiber
+    reevaluate_postponed_compactions();
    cmlog.info("Drained");
 }

 future<> compaction_manager::stop() {
    do_stop();
-    if (auto cm = std::exchange(_task_manager_module, nullptr)) {
-        co_await cm->stop();
-    }
    if (_stop_future) {
        co_await std::exchange(*_stop_future, make_ready_future());
    }
@@ -1146,14 +1146,15 @@ future<> compaction_manager::really_do_stop() noexcept {
    // Reset the metrics registry
    _metrics.clear();
    co_await stop_ongoing_compactions("shutdown");
-    if (!_tasks.empty()) {
-        on_fatal_internal_error(cmlog, format("{} tasks still exist after being stopped", _tasks.size()));
-    }
+    co_await _task_manager_module->stop();
    co_await coroutine::parallel_for_each(_compaction_state | std::views::values, [] (compaction_state& cs) -> future<> {
        if (!cs.gate.is_closed()) {
            co_await cs.gate.close();
        }
    });
+    if (!_tasks.empty()) {
+        on_fatal_internal_error(cmlog, format("{} tasks still exist after being stopped", _tasks.size()));
+    }
    reevaluate_postponed_compactions();
    co_await std::move(_waiting_reevalution);
    co_await _sys_ks.close();
@@ -1817,8 +1818,21 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_sst
    if (!gh) {
        co_return compaction_stats_opt{};
    }
-    // All sstables must be included, even the ones being compacted, such that everything in table is validated.
-    auto all_sstables = get_all_sstables(t);
+
+    // Collect and register all sstables as compacting while compaction is disabled, to avoid a race condition where
+    // regular compaction runs in between and picks the same files.
+    std::vector<sstables::shared_sstable> all_sstables;
+    compacting_sstable_registration compacting(*this, get_compaction_state(&t));
+    co_await run_with_compaction_disabled(t, [&all_sstables, &compacting, &t] () -> future<> {
+        // All sstables must be included.
+        all_sstables = get_all_sstables(t);
+        compacting.register_compacting(all_sstables);
+        return make_ready_future();
+    });
+    if (all_sstables.empty()) {
+        co_return compaction_stats_opt{};
+    }
+
    co_return co_await perform_compaction<validate_sstables_compaction_task_executor>(throw_if_stopping::no, info, &t, info.id, std::move(all_sstables), quarantine_sstables);
 }

--- a/compaction/compaction_manager.hh
+++ b/compaction/compaction_manager.hh
@@ -301,6 +301,11 @@ public:
    // unless it is moved back to enabled state.
    future<> drain();

+    // Check if compaction manager is running, i.e. its state is enabled or disabled (drain() can set the latter)
+    bool is_running() const noexcept {
+        return _state == state::enabled || _state == state::disabled;
+    }
+
    using compaction_history_consumer = noncopyable_function<future<>(const db::compaction_history_entry&)>;
    future<> get_compaction_history(compaction_history_consumer&& f);

--- a/compound.hh
+++ b/compound.hh
@@ -255,6 +255,9 @@ public:
    // Returns true iff given prefix has no missing components
    bool is_full(managed_bytes_view v) const {
        SCYLLA_ASSERT(AllowPrefixes == allow_prefixes::yes);
+        if (_types.size() == 0) { // no component types: full iff the value itself is empty
+            return v.empty();
+        }
        return std::distance(begin(v), end(v)) == (ssize_t)_types.size();
    }
    bool is_empty(managed_bytes_view v) const {
--- a/compress.cc
+++ b/compress.cc
@@ -15,6 +15,8 @@
 #include <seastar/core/metrics.hh>
 #include <seastar/core/sharded.hh>
 #include <seastar/core/weak_ptr.hh>
+#include <seastar/core/thread.hh>
+#include <seastar/core/reactor.hh>
 #include "utils/reusable_buffer.hh"
 #include "sstables/compress.hh"
 #include "sstables/exceptions.hh"
@@ -22,12 +24,13 @@
 #include "sstables/sstable_compressor_factory.hh"
 #include "compress.hh"
 #include "exceptions/exceptions.hh"
+#include "utils/config_file_impl.hh"
 #include "utils/class_registrator.hh"
 #include "gms/feature_service.hh"

 // SHA256
 using dict_id = std::array<std::byte, 32>;
-class sstable_compressor_factory_impl;
+class dictionary_holder;

 static seastar::logger compressor_factory_logger("sstable_compressor_factory");

@@ -41,11 +44,11 @@ template <> struct fmt::formatter<compression_parameters::algorithm> : fmt::form
 // raw dicts might be used (and kept alive) directly by compressors (in particular, lz4 decompressor)
 // or referenced by algorithm-specific dicts.
 class raw_dict : public enable_lw_shared_from_this<raw_dict> {
-    weak_ptr<sstable_compressor_factory_impl> _owner;
+    weak_ptr<dictionary_holder> _owner;
    dict_id _id;
    std::vector<std::byte> _dict;
 public:
-    raw_dict(sstable_compressor_factory_impl& owner, dict_id key, std::span<const std::byte> dict);
+    raw_dict(dictionary_holder& owner, dict_id key, std::span<const std::byte> dict);
    ~raw_dict();
    const std::span<const std::byte> raw() const { return _dict; }
    dict_id id() const { return _id; }
@@ -79,13 +82,13 @@ struct zstd_callback_allocator {
 // (which internally holds a pointer to the raw dictionary blob
 // and parsed entropy tables).
 class zstd_ddict : public enable_lw_shared_from_this<zstd_ddict> {
-    weak_ptr<sstable_compressor_factory_impl> _owner;
+    weak_ptr<dictionary_holder> _owner;
    lw_shared_ptr<const raw_dict> _raw;
    size_t _used_memory = 0;
    zstd_callback_allocator _alloc;
    std::unique_ptr<ZSTD_DDict, decltype(&ZSTD_freeDDict)> _dict;
 public:
-    zstd_ddict(sstable_compressor_factory_impl& owner, lw_shared_ptr<const raw_dict> raw);
+    zstd_ddict(dictionary_holder& owner, lw_shared_ptr<const raw_dict> raw);
    ~zstd_ddict();
    auto dict() const { return _dict.get(); }
    auto raw() const { return _raw->raw(); }
@@ -100,14 +103,14 @@ public:
 // so the level of compression is decided at the time of construction
 // of this dict.
 class zstd_cdict : public enable_lw_shared_from_this<zstd_cdict> {
-    weak_ptr<sstable_compressor_factory_impl> _owner;
+    weak_ptr<dictionary_holder> _owner;
    lw_shared_ptr<const raw_dict> _raw;
    int _level;
    size_t _used_memory = 0;
    zstd_callback_allocator _alloc;
    std::unique_ptr<ZSTD_CDict, decltype(&ZSTD_freeCDict)> _dict;
 public:
-    zstd_cdict(sstable_compressor_factory_impl& owner, lw_shared_ptr<const raw_dict> raw, int level);
+    zstd_cdict(dictionary_holder& owner, lw_shared_ptr<const raw_dict> raw, int level);
    ~zstd_cdict();
    auto dict() const { return _dict.get(); }
    auto raw() const { return _raw->raw(); }
@@ -119,11 +122,11 @@ public:
 // and a hash index over the substrings of the blob).
 //
 class lz4_cdict : public enable_lw_shared_from_this<lz4_cdict> {
-    weak_ptr<sstable_compressor_factory_impl> _owner;
+    weak_ptr<dictionary_holder> _owner;
    lw_shared_ptr<const raw_dict> _raw;
    std::unique_ptr<LZ4_stream_t, decltype(&LZ4_freeStream)> _dict;
 public:
-    lz4_cdict(sstable_compressor_factory_impl& owner, lw_shared_ptr<const raw_dict> raw);
+    lz4_cdict(dictionary_holder& owner, lw_shared_ptr<const raw_dict> raw);
    ~lz4_cdict();
    auto dict() const { return _dict.get(); }
    auto raw() const { return _raw->raw(); }
@@ -164,6 +167,7 @@ public:
    size_t compress_max_size(size_t input_len) const override;
    std::map<sstring, sstring> options() const override;
    algorithm get_algorithm() const override;
+    std::optional<unsigned> get_dict_owner_for_test() const override;
 };

 class snappy_processor: public compressor {
@@ -266,6 +270,7 @@ public:
    size_t compress_max_size(size_t input_len) const override;
    algorithm get_algorithm() const override;
    std::map<sstring, sstring> options() const override;
+    std::optional<unsigned> get_dict_owner_for_test() const override;
 };

 zstd_processor::zstd_processor(const compression_parameters& opts, cdict_ptr cdict, ddict_ptr ddict) {
@@ -323,6 +328,16 @@ auto zstd_processor::get_algorithm() const -> algorithm {
    return (_cdict || _ddict) ? algorithm::zstd_with_dicts : algorithm::zstd;
 }

+std::optional<unsigned> zstd_processor::get_dict_owner_for_test() const { // owner shard of whichever dictionary is held; _cdict is checked before _ddict
+    if (_cdict) {
+        return _cdict.get_owner_shard();
+    } else if (_ddict) {
+        return _ddict.get_owner_shard();
+    } else {
+        return std::nullopt; // no dictionary attached
+    }
+}
+
 const std::string_view DICTIONARY_OPTION = ".dictionary.";

 static std::map<sstring, sstring> dict_as_options(std::span<const std::byte> d) {
@@ -384,6 +399,10 @@ std::map<sstring, sstring> compressor::options() const {
    return {};
 }

+std::optional<unsigned> compressor::get_dict_owner_for_test() const { // base implementation; zstd_processor and lz4_processor override it
+    return std::nullopt;
+}
+
 std::string compressor::name() const {
    return compression_parameters::algorithm_to_qualified_name(get_algorithm());
 }
@@ -434,7 +453,7 @@ std::string_view compression_parameters::algorithm_to_name(algorithm alg) {
        case algorithm::snappy: return "SnappyCompressor";
        case algorithm::zstd: return "ZstdCompressor";
        case algorithm::zstd_with_dicts: return "ZstdWithDictsCompressor";
-        case algorithm::none: on_internal_error(compressor_factory_logger, "algorithm_to_name(): called with algorithm::none");
+        case algorithm::none: return "none"; // Name used only for logging purposes, can't be chosen by the user.
    }
    abort();
 }
@@ -470,6 +489,8 @@ compression_parameters::compression_parameters(const std::map<sstring, sstring>&

    if (auto v = get_option(SSTABLE_COMPRESSION)) {
        _algorithm = name_to_algorithm(*v);
+    } else if (!options.empty()) {
+        throw exceptions::configuration_exception(seastar::format("Missing compression option '{}'", SSTABLE_COMPRESSION));
    } else {
        _algorithm = algorithm::none;
    }
@@ -493,7 +514,7 @@ compression_parameters::compression_parameters(const std::map<sstring, sstring>&
        try {
            _crc_check_chance = std::stod(*v);
        } catch (const std::exception& e) {
-            throw exceptions::syntax_exception(sstring("Invalid double value ") + *v + "for " + CRC_CHECK_CHANCE);
+            throw exceptions::syntax_exception(sstring("Invalid double value ") + *v + " for " + CRC_CHECK_CHANCE);
        }
    }

@@ -518,13 +539,17 @@ compression_parameters::compression_parameters(const std::map<sstring, sstring>&
    }
 }

-void compression_parameters::validate(const gms::feature_service& fs) {
-    if (!fs.sstable_compression_dicts) {
-        if (_algorithm == algorithm::zstd_with_dicts || _algorithm == algorithm::lz4_with_dicts) {
+void compression_parameters::validate(dicts_feature_enabled dicts_enabled, dicts_usage_allowed dicts_allowed) const {
+    if (_algorithm == algorithm::zstd_with_dicts || _algorithm == algorithm::lz4_with_dicts) {
+        if (!dicts_enabled) {
            throw std::runtime_error(std::format("sstable_compression {} can't be used before "
                                                 "all nodes are upgraded to a versions which supports it",
                                                 algorithm_to_name(_algorithm)));
        }
+        if (!dicts_allowed) {
+            throw std::runtime_error(std::format("sstable_compression {} has been disabled by `sstable_compression_dictionaries_allow_in_ddl: false`",
+                                                 algorithm_to_name(_algorithm)));
+        }
    }
    if (_chunk_length) {
        auto chunk_length = _chunk_length.value();
@@ -571,6 +596,13 @@ std::map<sstring, sstring> compression_parameters::get_options() const {
    return opts;
 }

+std::istream& operator>>(std::istream& is, compression_parameters& cp) {
+    std::unordered_map<sstring, sstring> options_map;
+    is >> options_map;
+    cp = compression_parameters(options_map | std::ranges::to<std::map>());
+    return is;
+}
+
 lz4_processor::lz4_processor(cdict_ptr cdict, ddict_ptr ddict)
    : _cdict(std::move(cdict))
    , _ddict(std::move(ddict))
@@ -660,6 +692,16 @@ std::map<sstring, sstring> lz4_processor::options() const {
    }
 }

+std::optional<unsigned> lz4_processor::get_dict_owner_for_test() const {
+    if (_cdict) {
+        return _cdict.get_owner_shard();
+    } else if (_ddict) {
+        return _ddict.get_owner_shard();
+    } else {
+        return std::nullopt;
+    }
+}
+
 compressor_ptr make_lz4_sstable_compressor_for_tests() {
    return std::make_unique<lz4_processor>();
 }
@@ -751,21 +793,12 @@ size_t snappy_processor::compress_max_size(size_t input_len) const {
    return snappy_max_compressed_length(input_len);
 }

-// Constructs compressors and decompressors for SSTables,
-// making sure that the expensive identical parts (dictionaries) are shared
-// across nodes.
-//
 // Holds weak pointers to all live dictionaries
 // (so that they can be cheaply shared with new SSTables if an identical dict is requested),
 // and shared (lifetime-extending) pointers to the current writer ("recommended")
 // dict for each table (so that they can be shared with new SSTables without consulting
 // `system.dicts`).
 //
-// To make coordination work without resorting to std::mutex and such, dicts have owner shards,
-// (and are borrowed by foreign shared pointers) and all requests for a given dict ID go through its owner.
-// (Note: this shouldn't pose a performance problem because a dict is only requested once per an opening of an SSTable).
-// (Note: at the moment of this writing, one shard owns all. Later we can spread the ownership. (E.g. shard it by dict hash)).
-//
 // Whenever a dictionary dies (because its refcount reaches 0), its weak pointer
 // is removed from the factory.
 //
@@ -774,10 +807,10 @@ size_t snappy_processor::compress_max_size(size_t input_len) const {
 // Has a configurable memory budget for live dicts. If the budget is exceeded,
 // will return null dicts to new writers (to avoid making the memory usage even worse)
 // and print warnings.
-class sstable_compressor_factory_impl : public sstable_compressor_factory, public weakly_referencable<sstable_compressor_factory_impl> {
+class dictionary_holder : public weakly_referencable<dictionary_holder> {
    mutable logger::rate_limit budget_warning_rate_limit{std::chrono::minutes(10)};
-    shard_id _owner_shard;
-    config _cfg;
+    using config = default_sstable_compressor_factory::config;
+    const config& _cfg;
    uint64_t _total_live_dict_memory = 0;
    metrics::metric_groups _metrics;
    struct zstd_cdict_id {
@@ -789,7 +822,7 @@ class sstable_compressor_factory_impl : public sstable_compressor_factory, publi
    std::map<zstd_cdict_id, const zstd_cdict*> _zstd_cdicts;
    std::map<dict_id, const zstd_ddict*> _zstd_ddicts;
    std::map<dict_id, const lz4_cdict*> _lz4_cdicts;
-    std::map<table_id, lw_shared_ptr<const raw_dict>> _recommended;
+    std::map<table_id, lw_shared_ptr<foreign_ptr<lw_shared_ptr<const raw_dict>>>> _recommended;

    size_t memory_budget() const {
        return _cfg.memory_fraction_starting_at_which_we_stop_writing_dicts() * seastar::memory::stats().total_memory();
@@ -806,8 +839,11 @@ class sstable_compressor_factory_impl : public sstable_compressor_factory, publi
            memory_budget()
        );
    }
+public:
    lw_shared_ptr<const raw_dict> get_canonical_ptr(std::span<const std::byte> dict) {
-        SCYLLA_ASSERT(this_shard_id() == _owner_shard);
+        if (dict.empty()) {
+            return nullptr;
+        }
        auto id = get_sha256(dict);
        if (auto it = _raw_dicts.find(id); it != _raw_dicts.end()) {
            return it->second->shared_from_this();
@@ -819,7 +855,9 @@ class sstable_compressor_factory_impl : public sstable_compressor_factory, publi
    }
    using foreign_zstd_ddict = foreign_ptr<lw_shared_ptr<const zstd_ddict>>;
    foreign_zstd_ddict get_zstd_dict_for_reading(lw_shared_ptr<const raw_dict> raw, int level) {
-        SCYLLA_ASSERT(this_shard_id() == _owner_shard);
+        if (!raw) {
+            return nullptr;
+        }
        lw_shared_ptr<const zstd_ddict> ddict;
        // Fo reading, we must allocate a new dict, even if memory budget is exceeded. We have no other choice.
        // In any case, if the budget is exceeded after we print a rate-limited warning about it.
@@ -835,15 +873,11 @@ class sstable_compressor_factory_impl : public sstable_compressor_factory, publi
        }
        return make_foreign(std::move(ddict));
    }
-    future<foreign_zstd_ddict> get_zstd_dict_for_reading(std::span<const std::byte> dict, int level) {
-        return smp::submit_to(_owner_shard, [this, dict, level] -> foreign_zstd_ddict {
-            auto raw = get_canonical_ptr(dict);
-            return get_zstd_dict_for_reading(raw, level);
-        });
-    }
    using foreign_zstd_cdict = foreign_ptr<lw_shared_ptr<const zstd_cdict>>;
    foreign_zstd_cdict get_zstd_dict_for_writing(lw_shared_ptr<const raw_dict> raw, int level) {
-        SCYLLA_ASSERT(this_shard_id() == _owner_shard);
+        if (!_cfg.enable_writing_dictionaries() || !raw) {
+            return nullptr;
+        }
        lw_shared_ptr<const zstd_cdict> cdict;
        // If we can share an already-allocated dict, we do that regardless of memory budget.
        // If we would have to allocate a new dict for writing, we only do that if we haven't exceeded
@@ -859,19 +893,6 @@ class sstable_compressor_factory_impl : public sstable_compressor_factory, publi
        }
        return make_foreign(std::move(cdict));
    }
-    future<foreign_zstd_cdict> get_zstd_dict_for_writing(table_id t, int level) {
-        return smp::submit_to(_owner_shard, [this, t, level] -> foreign_zstd_cdict {
-            if (!_cfg.enable_writing_dictionaries()) {
-                return {};
-            }
-            auto rec_it = _recommended.find(t);
-            if (rec_it != _recommended.end()) {
-                return get_zstd_dict_for_writing(rec_it->second, level);
-            } else {
-                return {};
-            }
-        });
-    }
    using lz4_dicts = std::pair<
        foreign_ptr<lw_shared_ptr<const raw_dict>>,
        foreign_ptr<lw_shared_ptr<const lz4_cdict>>
@@ -879,18 +900,12 @@ class sstable_compressor_factory_impl : public sstable_compressor_factory, publi
    using foreign_lz4_ddict = foreign_ptr<lw_shared_ptr<const raw_dict>>;
    using foreign_lz4_cdict = foreign_ptr<lw_shared_ptr<const lz4_cdict>>;
    foreign_lz4_ddict get_lz4_dict_for_reading(lw_shared_ptr<const raw_dict> raw) {
-        SCYLLA_ASSERT(this_shard_id() == _owner_shard);
-        lw_shared_ptr<const raw_dict> ddict;
        return make_foreign(std::move(raw));
    }
-    future<foreign_lz4_ddict> get_lz4_dicts_for_reading(std::span<const std::byte> dict) {
-        return smp::submit_to(_owner_shard, [this, dict] -> foreign_lz4_ddict {
-            auto raw = get_canonical_ptr(dict);
-            return get_lz4_dict_for_reading(raw);
-        });
-    }
    foreign_lz4_cdict get_lz4_dict_for_writing(lw_shared_ptr<const raw_dict> raw) {
-        SCYLLA_ASSERT(this_shard_id() == _owner_shard);
+        if (!_cfg.enable_writing_dictionaries() || !raw) {
+            return nullptr;
+        }
        lw_shared_ptr<const lz4_cdict> cdict;
        // If we can share an already-allocated dict, we do that regardless of memory budget.
        // If we would have to allocate a new dict for writing, we only do that if we haven't exceeded
@@ -905,24 +920,10 @@ class sstable_compressor_factory_impl : public sstable_compressor_factory, publi
        }
        return make_foreign(std::move(cdict));
    }
-    future<foreign_lz4_cdict> get_lz4_dicts_for_writing(table_id t) {
-        return smp::submit_to(_owner_shard, [this, t] -> foreign_lz4_cdict {
-            if (!_cfg.enable_writing_dictionaries()) {
-                return {};
-            }
-            auto rec_it = _recommended.find(t);
-            if (rec_it != _recommended.end()) {
-                return get_lz4_dict_for_writing(rec_it->second);
-            } else {
-                return {};
-            }
-        });
-    }

 public:
-    sstable_compressor_factory_impl(config cfg)
-        : _owner_shard(this_shard_id())
-        , _cfg(std::move(cfg))
+    dictionary_holder(const config& cfg)
+        : _cfg(cfg)
    {
        if (_cfg.register_metrics) {
            namespace sm = seastar::metrics;
@@ -931,8 +932,8 @@ public:
            });
        }
    }
-    sstable_compressor_factory_impl(sstable_compressor_factory_impl&&) = delete;
-    ~sstable_compressor_factory_impl() {
+    dictionary_holder(dictionary_holder&&) = delete;
+    ~dictionary_holder() {
        // Note: `_recommended` might be the only thing keeping some dicts alive,
        // so clearing it will destroy them.
        //
@@ -948,39 +949,39 @@ public:
        _recommended.clear();
    }
    void forget_raw_dict(dict_id id) {
-        SCYLLA_ASSERT(this_shard_id() == _owner_shard);
        _raw_dicts.erase(id);
    }
    void forget_zstd_cdict(dict_id id, int level) {
-        SCYLLA_ASSERT(this_shard_id() == _owner_shard);
        _zstd_cdicts.erase({id, level});
    }
    void forget_zstd_ddict(dict_id id) {
-        SCYLLA_ASSERT(this_shard_id() == _owner_shard);
        _zstd_ddicts.erase(id);
    }
    void forget_lz4_cdict(dict_id id) {
-        SCYLLA_ASSERT(this_shard_id() == _owner_shard);
        _lz4_cdicts.erase(id);
    }
-    future<> set_recommended_dict(table_id t, std::span<const std::byte> dict) override {
-        return smp::submit_to(_owner_shard, [this, t, dict] {
-            _recommended.erase(t);
-            if (dict.size()) {
-                auto canonical_ptr = get_canonical_ptr(dict);
-                _recommended.emplace(t, canonical_ptr);
-                compressor_factory_logger.debug("set_recommended_dict: table={} size={} id={}",
-                    t, dict.size(), fmt_hex(canonical_ptr->id()));
-            } else {
-                compressor_factory_logger.debug("set_recommended_dict: table={} size=0", t);
-            }
-        });
+    void set_recommended_dict(table_id t, foreign_ptr<lw_shared_ptr<const raw_dict>> dict) {
+        _recommended.erase(t);
+        if (dict) {
+            compressor_factory_logger.debug("set_recommended_dict: table={} size={} id={}",
+                t, dict->raw().size(), fmt_hex(dict->id()));
+            _recommended.emplace(t, make_lw_shared(std::move(dict)));
+        } else {
+            compressor_factory_logger.debug("set_recommended_dict: table={} size=0", t);
+        }
+    }
+    future<foreign_ptr<lw_shared_ptr<const raw_dict>>> get_recommended_dict(table_id t) {
+        auto rec_it = _recommended.find(t);
+        if (rec_it == _recommended.end()) {
+            co_return nullptr;
+        }
+        // Note that rec_it might be invalidated while we are doing the copy(),
+        // so we have to make a copy of the outer shared ptr first.
+        lw_shared_ptr<foreign_ptr<lw_shared_ptr<const raw_dict>>> ptr = rec_it->second;
+        co_return co_await ptr->copy();
    }
-    future<compressor_ptr> make_compressor_for_writing(schema_ptr) override;
-    future<compressor_ptr> make_compressor_for_reading(sstables::compression&) override;

    void account_memory_delta(ssize_t n) {
-        SCYLLA_ASSERT(this_shard_id() == _owner_shard);
        if (static_cast<ssize_t>(_total_live_dict_memory) + n < 0) {
            compressor_factory_logger.error(
                "Error in dictionary memory accounting: delta {} brings live memory {} below 0",
@@ -990,19 +991,85 @@ public:
    }
 };

+default_sstable_compressor_factory::default_sstable_compressor_factory(config cfg)
+    : _cfg(std::move(cfg))
+    , _holder(std::make_unique<dictionary_holder>(_cfg))
+{
+    for (shard_id i = 0; i < smp::count; ++i) {
+        auto numa_id = _cfg.numa_config[i];
+        _numa_groups.resize(std::max<size_t>(_numa_groups.size(), numa_id + 1));
+        _numa_groups[numa_id].push_back(i);
+    }
+}

-future<compressor_ptr> sstable_compressor_factory_impl::make_compressor_for_writing(schema_ptr s) {
-    const auto params = s->get_compressor_params();
+default_sstable_compressor_factory::~default_sstable_compressor_factory() {
+}
+
+std::vector<unsigned> default_sstable_compressor_factory_config::get_default_shard_to_numa_node_mapping() {
+    auto sp = local_engine->smp().shard_to_numa_node_mapping();
+    return std::vector<unsigned>(sp.begin(), sp.end());
+}
+
+unsigned default_sstable_compressor_factory::local_numa_id() {
+    return _cfg.numa_config[this_shard_id()];
+}
+
+shard_id default_sstable_compressor_factory::get_dict_owner(unsigned numa_id, const sha256_type& sha) {
+    auto hash = read_unaligned<uint64_t>(sha.data());
+    const auto& group = _numa_groups[numa_id];
+    if (group.empty()) {
+        on_internal_error(compressor_factory_logger, "get_dict_owner called on an empty NUMA group");
+    }
+    return group[hash % group.size()];
+}
+
+future<> default_sstable_compressor_factory::set_recommended_dict_local(table_id t, std::span<const std::byte> dict) {
+    if (_leader_shard != this_shard_id()) {
+        on_internal_error(compressor_factory_logger, fmt::format("set_recommended_dict_local called on wrong shard. Expected: {}, got {}", _leader_shard, this_shard_id()));
+    }
+    auto units = co_await get_units(_recommendation_setting_sem, 1);
+    auto sha = get_sha256(dict);
+    for (unsigned numa_id = 0; numa_id < _numa_groups.size(); ++numa_id) {
+        const auto& group = _numa_groups[numa_id];
+        if (group.empty()) {
+            continue;
+        }
+        auto r = get_dict_owner(numa_id, sha);
+        auto d = co_await container().invoke_on(r, [dict](self& local) {
+            return make_foreign(local._holder->get_canonical_ptr(dict));
+        });
+        auto local_coordinator = group[0];
+        co_await container().invoke_on(local_coordinator, coroutine::lambda([t, d = std::move(d)](self& local) mutable {
+            local._holder->set_recommended_dict(t, std::move(d));
+        }));
+    }
+}
+
+future<> default_sstable_compressor_factory::set_recommended_dict(table_id t, std::span<const std::byte> dict) {
+    return container().invoke_on(_leader_shard, &self::set_recommended_dict_local, t, dict);
+}
+
+future<foreign_ptr<lw_shared_ptr<const raw_dict>>> default_sstable_compressor_factory::get_recommended_dict(table_id t) {
+    const auto local_coordinator = _numa_groups[local_numa_id()][0];
+    return container().invoke_on(local_coordinator, [t](self& local) {
+        return local._holder->get_recommended_dict(t);
+    });
+}
+
+future<compressor_ptr> default_sstable_compressor_factory::make_compressor_for_writing_impl(const compression_parameters& params, table_id id) {
    using algorithm = compression_parameters::algorithm;
    const auto algo = params.get_algorithm();
-    compressor_factory_logger.debug("make_compressor_for_writing: table={} algo={}", s->id(), algo);
+    compressor_factory_logger.debug("make_compressor_for_writing: table={} algo={}", id, algo);
    switch (algo) {
    case algorithm::lz4:
        co_return std::make_unique<lz4_processor>(nullptr, nullptr);
    case algorithm::lz4_with_dicts: {
-        auto cdict = _cfg.enable_writing_dictionaries()
-            ? co_await get_lz4_dicts_for_writing(s->id())
-            : nullptr;
+        holder::foreign_lz4_cdict cdict;
+        if (auto recommended = co_await get_recommended_dict(id)) {
+            cdict = co_await container().invoke_on(recommended.get_owner_shard(), [recommended = std::move(recommended)] (self& local) mutable {
+                return local._holder->get_lz4_dict_for_writing(recommended.release());
+            });
+        }
        if (cdict) {
            compressor_factory_logger.debug("make_compressor_for_writing: using dict id={}", fmt_hex(cdict->id()));
        }
@@ -1015,9 +1082,13 @@ future<compressor_ptr> sstable_compressor_factory_impl::make_compressor_for_writ
    case algorithm::zstd:
        co_return std::make_unique<zstd_processor>(params, nullptr, nullptr);
    case algorithm::zstd_with_dicts: {
-        auto cdict = _cfg.enable_writing_dictionaries()
-            ? co_await get_zstd_dict_for_writing(s->id(), params.zstd_compression_level().value_or(ZSTD_defaultCLevel()))
-            : nullptr;
+        holder::foreign_zstd_cdict cdict;
+        if (auto recommended = co_await get_recommended_dict(id)) {
+            auto level = params.zstd_compression_level().value_or(ZSTD_defaultCLevel());
+            cdict = co_await container().invoke_on(recommended.get_owner_shard(), [level, recommended = std::move(recommended)] (self& local) mutable {
+                return local._holder->get_zstd_dict_for_writing(recommended.release(), level);
+            });
+        }
        if (cdict) {
            compressor_factory_logger.debug("make_compressor_for_writing: using dict id={}", fmt_hex(cdict->id()));
        }
@@ -1029,17 +1100,28 @@ future<compressor_ptr> sstable_compressor_factory_impl::make_compressor_for_writ
    abort();
 }

-future<compressor_ptr> sstable_compressor_factory_impl::make_compressor_for_reading(sstables::compression& c) {
-    const auto params = compression_parameters(sstables::options_from_compression(c));
+future<compressor_ptr> default_sstable_compressor_factory::make_compressor_for_writing(schema_ptr s) {
+    return make_compressor_for_writing_impl(s->get_compressor_params(), s->id());
+}
+
+future<compressor_ptr> default_sstable_compressor_factory::make_compressor_for_writing_for_tests(const compression_parameters& params, table_id id) {
+    return make_compressor_for_writing_impl(params, id);
+}
+
+future<compressor_ptr> default_sstable_compressor_factory::make_compressor_for_reading_impl(const compression_parameters& params, std::span<const std::byte> dict) {
    using algorithm = compression_parameters::algorithm;
    const auto algo = params.get_algorithm();
-    compressor_factory_logger.debug("make_compressor_for_reading: compression={} algo={}", fmt::ptr(&c), algo);
    switch (algo) {
    case algorithm::lz4:
        co_return std::make_unique<lz4_processor>(nullptr, nullptr);
    case algorithm::lz4_with_dicts: {
-        auto dict = dict_from_options(c);
-        auto ddict = co_await get_lz4_dicts_for_reading(std::as_bytes(std::span(*dict)));
+        auto dict_span = dict;
+        auto sha = get_sha256(dict_span);
+        auto dict_owner = get_dict_owner(local_numa_id(), sha);
+        auto ddict = co_await container().invoke_on(dict_owner, [dict_span] (self& local) mutable {
+            auto d = local._holder->get_canonical_ptr(dict_span);
+            return local._holder->get_lz4_dict_for_reading(std::move(d));
+        });
        if (ddict) {
            compressor_factory_logger.debug("make_compressor_for_reading: using dict id={}", fmt_hex(ddict->id()));
        }
@@ -1054,8 +1136,13 @@ future<compressor_ptr> sstable_compressor_factory_impl::make_compressor_for_read
    }
    case algorithm::zstd_with_dicts: {
        auto level = params.zstd_compression_level().value_or(ZSTD_defaultCLevel());
-        auto dict = dict_from_options(c);
-        auto ddict = co_await get_zstd_dict_for_reading(std::as_bytes(std::span(*dict)), level);
+        auto dict_span = dict;
+        auto sha = get_sha256(dict_span);
+        auto dict_owner = get_dict_owner(local_numa_id(), sha);
+        auto ddict = co_await container().invoke_on(dict_owner, [level, dict_span] (self& local) mutable {
+            auto d = local._holder->get_canonical_ptr(dict_span);
+            return local._holder->get_zstd_dict_for_reading(std::move(d), level);
+        });
        if (ddict) {
            compressor_factory_logger.debug("make_compressor_for_reading: using dict id={}", fmt_hex(ddict->id()));
        }
@@ -1067,7 +1154,19 @@ future<compressor_ptr> sstable_compressor_factory_impl::make_compressor_for_read
    abort();
 }

-raw_dict::raw_dict(sstable_compressor_factory_impl& owner, dict_id key, std::span<const std::byte> dict)
+future<compressor_ptr> default_sstable_compressor_factory::make_compressor_for_reading(sstables::compression& c) {
+    const auto params = compression_parameters(sstables::options_from_compression(c));
+    auto dict = dict_from_options(c);
+    const auto algo = params.get_algorithm();
+    compressor_factory_logger.debug("make_compressor_for_reading: compression={} algo={}", fmt::ptr(&c), algo);
+    co_return co_await make_compressor_for_reading_impl(params, std::as_bytes(std::span(*dict)));
+}
+
+future<compressor_ptr> default_sstable_compressor_factory::make_compressor_for_reading_for_tests(const compression_parameters& params, std::span<const std::byte> dict) {
+    return make_compressor_for_reading_impl(params, dict);
+}
+
+raw_dict::raw_dict(dictionary_holder& owner, dict_id key, std::span<const std::byte> dict)
    : _owner(owner.weak_from_this())
    , _id(key)
    , _dict(dict.begin(), dict.end())
@@ -1082,7 +1181,7 @@ raw_dict::~raw_dict() {
    }
 }

-zstd_cdict::zstd_cdict(sstable_compressor_factory_impl& owner, lw_shared_ptr<const raw_dict> raw, int level)
+zstd_cdict::zstd_cdict(dictionary_holder& owner, lw_shared_ptr<const raw_dict> raw, int level)
    : _owner(owner.weak_from_this())
    , _raw(raw)
    , _level(level)
@@ -1114,7 +1213,7 @@ zstd_cdict::~zstd_cdict() {
    }
 }

-zstd_ddict::zstd_ddict(sstable_compressor_factory_impl& owner, lw_shared_ptr<const raw_dict> raw)
+zstd_ddict::zstd_ddict(dictionary_holder& owner, lw_shared_ptr<const raw_dict> raw)
    : _owner(owner.weak_from_this())
    , _raw(raw)
    , _alloc([this] (ssize_t n) {
@@ -1143,7 +1242,7 @@ zstd_ddict::~zstd_ddict() {
    }
 }

-lz4_cdict::lz4_cdict(sstable_compressor_factory_impl& owner, lw_shared_ptr<const raw_dict> raw)
+lz4_cdict::lz4_cdict(dictionary_holder& owner, lw_shared_ptr<const raw_dict> raw)
    : _owner(owner.weak_from_this())
    , _raw(raw)
    , _dict(LZ4_createStream(), LZ4_freeStream)
@@ -1162,6 +1261,28 @@ lz4_cdict::~lz4_cdict() {
    }
 }

-std::unique_ptr<sstable_compressor_factory> make_sstable_compressor_factory(sstable_compressor_factory::config cfg) {
-    return std::make_unique<sstable_compressor_factory_impl>(std::move(cfg));
+std::unique_ptr<sstable_compressor_factory> make_sstable_compressor_factory_for_tests_in_thread() {
+    SCYLLA_ASSERT(thread::running_in_thread());
+    struct wrapper : sstable_compressor_factory {
+        using impl = default_sstable_compressor_factory;
+        sharded<impl> _impl;
+        future<compressor_ptr> make_compressor_for_writing(schema_ptr s) override {
+            return _impl.local().make_compressor_for_writing(s);
+        }
+        future<compressor_ptr> make_compressor_for_reading(sstables::compression& c) override {
+            return _impl.local().make_compressor_for_reading(c);
+        }
+        future<> set_recommended_dict(table_id t, std::span<const std::byte> d) override {
+            return _impl.local().set_recommended_dict(t, d);
+        };
+        wrapper(wrapper&&) = delete;
+        wrapper() {
+            _impl.start().get();
+        }
+        ~wrapper() {
+            _impl.stop().get();
+        }
+    };
+    return std::make_unique<wrapper>();
 }
+
--- a/compress.hh
+++ b/compress.hh
@@ -13,12 +13,9 @@

 #include <seastar/core/future.hh>
 #include <seastar/core/sstring.hh>
+#include <seastar/util/bool_class.hh>
 #include "seastarx.hh"

-namespace gms {
-class feature_service;
-} // namespace gms
-
 class compression_parameters;

 class compressor {
@@ -64,6 +61,8 @@ public:

    virtual algorithm get_algorithm() const = 0;

+    virtual std::optional<unsigned> get_dict_owner_for_test() const;
+
    using ptr_type = std::unique_ptr<compressor>;
 };

@@ -106,7 +105,10 @@ public:
    algorithm get_algorithm() const { return _algorithm; }
    std::optional<int> zstd_compression_level() const { return _zstd_compression_level; }

-    void validate(const gms::feature_service&);
+    using dicts_feature_enabled = bool_class<struct dicts_feature_enabled_tag>;
+    using dicts_usage_allowed = bool_class<struct dicts_usage_allowed_tag>;
+    void validate(dicts_feature_enabled, dicts_usage_allowed) const;
+
    std::map<sstring, sstring> get_options() const;

    bool compression_enabled() const { 
@@ -122,3 +124,13 @@ private:
    static void validate_options(const std::map<sstring, sstring>&);
    static algorithm name_to_algorithm(std::string_view name);
 };
+
+// Stream operator for boost::program_options support
+std::istream& operator>>(std::istream& is, compression_parameters& cp);
+
+template <>
+struct fmt::formatter<compression_parameters> : fmt::formatter<std::string_view> {
+    auto format(const compression_parameters& cp, fmt::format_context& ctx) const -> decltype(ctx.out()) {
+        return fmt::format_to(ctx.out(), "{}", cp.get_options());
+    }
+};
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -855,3 +855,18 @@ rf_rack_valid_keyspaces: false
 # Maximum number of items in single BatchWriteItem command. Default is 100.
 # Note: DynamoDB has a hard-coded limit of 25.
 # alternator_max_items_in_batch_write: 100
+
+# 
+# io-streaming rate limiting
+# When this value is set to non-zero, Scylla throttles disk throughput for
+# streaming (network) activities such as backup, repair, tablet migration and more.
+# This limit protects user queries by keeping the network interface from
+# being saturated by streaming activities.
+# The recommended value is 75% of the network bandwidth.
+# E.g. for i4i.8xlarge (https://github.com/scylladb/scylla-machine-image/tree/next/common/aws_net_params.json):
+# network: 18.75 Gib/s --> 18750 Mib/s --> 1875 MB/s (from network bits to network bytes: divide by 10, not 8)
+# Converted to disk bytes: 1875 * 1000 / 1024 = 1831 MB/s (disk wise)
+# 75% of disk bytes is: 0.75 * 1831 = 1373 megabytes/s
+# stream_io_throughput_mb_per_sec: 1373
+# 
+
--- a/configure.py
+++ b/configure.py
@@ -981,6 +981,7 @@ scylla_core = (['message/messaging_service.cc',
                'cql3/result_set.cc',
                'cql3/prepare_context.cc',
                'db/batchlog_manager.cc',
+                'db/corrupt_data_handler.cc',
                'db/commitlog/commitlog.cc',
                'db/commitlog/commitlog_entry.cc',
                'db/commitlog/commitlog_replayer.cc',
@@ -1034,6 +1035,7 @@ scylla_core = (['message/messaging_service.cc',
                'utils/multiprecision_int.cc',
                'utils/gz/crc_combine.cc',
                'utils/gz/crc_combine_table.cc',
+                'utils/http.cc',
                'utils/s3/aws_error.cc',
                'utils/s3/client.cc',
                'utils/s3/retryable_http_client.cc',
@@ -1338,6 +1340,7 @@ idls = ['idl/gossip_digest.idl.hh',
        'idl/replica_exception.idl.hh',
        'idl/per_partition_rate_limit_info.idl.hh',
        'idl/position_in_partition.idl.hh',
+        'idl/full_position.idl.hh',
        'idl/experimental/broadcast_tables_lang.idl.hh',
        'idl/storage_service.idl.hh',
        'idl/join_node.idl.hh',
@@ -1530,6 +1533,7 @@ deps['test/boost/combined_tests'] += [
    'test/boost/query_processor_test.cc',
    'test/boost/reader_concurrency_semaphore_test.cc',
    'test/boost/repair_test.cc',
+    'test/boost/replicator_test.cc',
    'test/boost/restrictions_test.cc',
    'test/boost/role_manager_test.cc',
    'test/boost/row_cache_test.cc',
@@ -1538,6 +1542,8 @@ deps['test/boost/combined_tests'] += [
    'test/boost/secondary_index_test.cc',
    'test/boost/sessions_test.cc',
    'test/boost/sstable_compaction_test.cc',
+    'test/boost/sstable_compressor_factory_test.cc',
+    'test/boost/sstable_compression_config_test.cc',
    'test/boost/sstable_directory_test.cc',
    'test/boost/sstable_set_test.cc',
    'test/boost/statement_restrictions_test.cc',
--- a/cql3/statements/alter_keyspace_statement.cc
+++ b/cql3/statements/alter_keyspace_statement.cc
@@ -267,33 +267,44 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
            muts.insert(muts.begin(), schema_mutations.begin(), schema_mutations.end());
        }

+        auto rs = locator::abstract_replication_strategy::create_replication_strategy(
+                ks_md_update->strategy_name(),
+                locator::replication_strategy_params(ks_md_update->strategy_options(), ks_md_update->initial_tablets()));
+
        // If `rf_rack_valid_keyspaces` is enabled, it's forbidden to perform a schema change that
        // would lead to an RF-rack-valid keyspace. Verify that this change does not.
        // For more context, see: scylladb/scylladb#23071.
-        if (qp.db().get_config().rf_rack_valid_keyspaces()) {
-            auto rs = locator::abstract_replication_strategy::create_replication_strategy(
-                    ks_md_update->strategy_name(),
-                    locator::replication_strategy_params(ks_md_update->strategy_options(), ks_md_update->initial_tablets()));
-
-            try {
-                // There are two things to note here:
-                // 1. We hold a group0_guard, so it's correct to check this here.
-                //    The topology or schema cannot change while we're performing this query.
-                // 2. The replication strategy we use here does NOT represent the actual state
-                //    we will arrive at after applying the schema change. For instance, if the user
-                //    did not specify the RF for some of the DCs, it's equal to 0 in the replication
-                //    strategy we pass to this function, while in reality that means that the RF
-                //    will NOT change. That is not a problem:
-                //    - RF=0 is valid for all DCs, so it won't trigger an exception on its own,
-                //    - the keyspace must've been RF-rack-valid before this change. We check that
-                //      condition for all keyspaces at startup.
-                //    The second hyphen is not really true because currently topological changes can
-                //    disturb it (see scylladb/scylladb#23345), but we ignore that.
-                locator::assert_rf_rack_valid_keyspace(_name, tmptr, *rs);
-            } catch (const std::exception& e) {
+        try {
+            // There are two things to note here:
+            // 1. We hold a group0_guard, so it's correct to check this here.
+            //    The topology or schema cannot change while we're performing this query.
+            // 2. The replication strategy we use here does NOT represent the actual state
+            //    we will arrive at after applying the schema change. For instance, if the user
+            //    did not specify the RF for some of the DCs, it's equal to 0 in the replication
+            //    strategy we pass to this function, while in reality that means that the RF
+            //    will NOT change. That is not a problem:
+            //    - RF=0 is valid for all DCs, so it won't trigger an exception on its own,
+            //    - the keyspace must've been RF-rack-valid before this change. We check that
+            //      condition for all keyspaces at startup.
+            //    The second hyphen is not really true because currently topological changes can
+            //    disturb it (see scylladb/scylladb#23345), but we ignore that.
+            locator::assert_rf_rack_valid_keyspace(_name, tmptr, *rs);
+        } catch (const std::exception& e) {
+            if (qp.db().get_config().rf_rack_valid_keyspaces()) {
                // There's no guarantee what the type of the exception will be, so we need to
                // wrap it manually here in a type that can be passed to the user.
                throw exceptions::invalid_request_exception(e.what());
+            } else {
+                // Even when the configuration option `rf_rack_valid_keyspaces` is set to false,
+                // we'd like to inform the user that the keyspace they're altering will not
+                // satisfy the restriction after the change--but just as a warning.
+                // For more context, see issue: scylladb/scylladb#23330.
+                warnings.push_back(seastar::format(
+                    "Keyspace '{}' is not RF-rack-valid: the replication factor doesn't match "
+                    "the rack count in at least one datacenter. A rack failure may reduce availability. "
+                    "For more context, see: "
+                    "https://docs.scylladb.com/manual/stable/reference/glossary.html#term-RF-rack-valid-keyspace.",
+                    _name));
            }
        }

--- a/cql3/statements/alter_table_statement.cc
+++ b/cql3/statements/alter_table_statement.cc
@@ -8,6 +8,7 @@
 * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
 */

+#include "cdc/log.hh"
 #include "utils/assert.hh"
 #include <seastar/core/coroutine.hh>
 #include "cql3/query_options.hh"
@@ -27,6 +28,7 @@
 #include "db/view/view.hh"
 #include "cql3/query_processor.hh"
 #include "cdc/cdc_extension.hh"
+#include "cdc/cdc_partitioner.hh"

 namespace cql3 {

@@ -290,6 +292,53 @@ std::pair<schema_ptr, std::vector<view_ptr>> alter_table_statement::prepare_sche
        throw exceptions::invalid_request_exception("Cannot use ALTER TABLE on Materialized View");
    }

+    const bool is_cdc_log_table = cdc::is_log_for_some_table(db.real_database(), s->ks_name(), s->cf_name());
+    // Only a CDC log table will have this partitioner name. User tables should
+    // not be able to set this. Note that we perform a similar check when trying to
+    // re-enable CDC for a table, when the log table has been replaced by a user table.
+    // For better visualization of the above, consider this
+    //
+    // cqlsh> CREATE TABLE ks.t (p int PRIMARY KEY, v int) WITH cdc = {'enabled': true};
+    // cqlsh> INSERT INTO ks.t (p, v) VALUES (1, 2);
+    // cqlsh> ALTER TABLE ks.t WITH cdc = {'enabled': false};
+    // cqlsh> DESC TABLE ks.t_scylla_cdc_log WITH INTERNALS; # Save this output!
+    // cqlsh> DROP TABLE ks.t_scylla_cdc_log;
+    // cqlsh> [Recreate the log table using the received statement]
+    // cqlsh> ALTER TABLE ks.t WITH cdc = {'enabled': true};
+    //
+    // InvalidRequest: Error from server: code=2200 [Invalid query] message="Cannot create CDC log
+    //                 table for table ks.t because a table of name ks.t_scylla_cdc_log already exists"
+    //
+    // See commit adda43edc75b901b2329bca8f3eb74596698d05f for more information on THAT case.
+    // We reuse the same technique here.
+    const bool was_cdc_log_table = s->get_partitioner().name() == cdc::cdc_partitioner::classname;
+
+    if (_column_changes.size() != 0 && is_cdc_log_table) {
+        throw exceptions::invalid_request_exception(
+                "You cannot modify the set of columns of a CDC log table directly. "
+                "Modify the base table instead.");
+    }
+    if (_column_changes.size() != 0 && was_cdc_log_table) {
+        throw exceptions::invalid_request_exception(
+                "You cannot modify the set of columns of a CDC log table directly. "
+                "Although the base table has deactivated CDC, this table will continue being "
+                "a CDC log table until it is dropped. If you want to modify the columns in it, "
+                "you can only do that by reenabling CDC on the base table, which will reattach "
+                "this log table. Then you will be able to modify the columns in the base table, "
+                "and that will have effect on the log table too. Modifying the columns of a CDC "
+                "log table directly is never allowed.");
+    }
+
+    if (_renames.size() != 0 && is_cdc_log_table) {
+        throw exceptions::invalid_request_exception("Cannot rename a column of a CDC log table.");
+    }
+    if (_renames.size() != 0 && was_cdc_log_table) {
+        throw exceptions::invalid_request_exception(
+                "You cannot rename a column of a CDC log table. Although the base table "
+                "has deactivated CDC, this table will continue being a CDC log table until it "
+                "is dropped.");
+    }
+
    auto cfm = schema_builder(s);

    if (_properties->get_id()) {
--- a/cql3/statements/cf_prop_defs.cc
+++ b/cql3/statements/cf_prop_defs.cc
@@ -23,6 +23,7 @@
 #include "db/per_partition_rate_limit_options.hh"
 #include "db/tablet_options.hh"
 #include "utils/bloom_calculations.hh"
+#include "db/config.hh"

 #include <boost/algorithm/string/predicate.hpp>

@@ -135,7 +136,9 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
            throw exceptions::configuration_exception(sstring("Missing sub-option '") + compression_parameters::SSTABLE_COMPRESSION + "' for the '" + KW_COMPRESSION + "' option.");
        }
        compression_parameters cp(*compression_options);
-        cp.validate(db.features());
+        cp.validate(
+            compression_parameters::dicts_feature_enabled(bool(db.features().sstable_compression_dicts)),
+            compression_parameters::dicts_usage_allowed(db.get_config().sstable_compression_dictionaries_allow_in_ddl()));
    }

    auto per_partition_rate_limit_options = get_per_partition_rate_limit_options(schema_extensions);
--- a/cql3/statements/create_keyspace_statement.cc
+++ b/cql3/statements/create_keyspace_statement.cc
@@ -113,10 +113,9 @@ future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, std::vector
        if (rs->uses_tablets()) {
            warnings.push_back(
                "Tables in this keyspace will be replicated using Tablets "
-                "and will not support CDC, LWT and counters features. "
-                "To use CDC, LWT or counters, drop this keyspace and re-create it "
-                "without tablets by adding AND TABLETS = {'enabled': false} "
-                "to the CREATE KEYSPACE statement.");
+                "and will not support Materialized Views, Secondary Indexes, CDC, LWT and counters features. "
+                "To use Materialized Views, Secondary Indexes, CDC, LWT or counters, drop this keyspace and re-create it "
+                "without tablets by adding AND TABLETS = {'enabled': false} to the CREATE KEYSPACE statement.");
            if (ksm->initial_tablets().value()) {
                warnings.push_back("Keyspace `initial` tablets option is deprecated.  Use per-table tablet options instead.");
            }
@@ -125,15 +124,26 @@ future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, std::vector
        // If `rf_rack_valid_keyspaces` is enabled, it's forbidden to create an RF-rack-invalid keyspace.
        // Verify that it's RF-rack-valid.
        // For more context, see: scylladb/scylladb#23071.
-        if (cfg.rf_rack_valid_keyspaces()) {
-            try {
-                // We hold a group0_guard, so it's correct to check this here.
-                // The topology or schema cannot change while we're performing this query.
-                locator::assert_rf_rack_valid_keyspace(_name, tmptr, *rs);
-            } catch (const std::exception& e) {
+        try {
+            // We hold a group0_guard, so it's correct to check this here.
+            // The topology or schema cannot change while we're performing this query.
+            locator::assert_rf_rack_valid_keyspace(_name, tmptr, *rs);
+        } catch (const std::exception& e) {
+            if (cfg.rf_rack_valid_keyspaces()) {
                // There's no guarantee what the type of the exception will be, so we need to
                // wrap it manually here in a type that can be passed to the user.
                throw exceptions::invalid_request_exception(e.what());
+            } else {
+                // Even when the configuration option `rf_rack_valid_keyspaces` is set to false,
+                // we'd like to inform the user that the keyspace they're creating does not
+                // satisfy the restriction--but just as a warning.
+                // For more context, see issue: scylladb/scylladb#23330.
+                warnings.push_back(seastar::format(
+                    "Keyspace '{}' is not RF-rack-valid: the replication factor doesn't match "
+                    "the rack count in at least one datacenter. A rack failure may reduce availability. "
+                    "For more context, see: "
+                    "https://docs.scylladb.com/manual/stable/reference/glossary.html#term-RF-rack-valid-keyspace.",
+                    _name));
            }
        }
    } catch (const exceptions::already_exists_exception& e) {
--- a/cql3/statements/create_table_statement.cc
+++ b/cql3/statements/create_table_statement.cc
@@ -31,6 +31,8 @@
 #include "db/config.hh"
 #include "compaction/time_window_compaction_strategy.hh"

+bool is_internal_keyspace(std::string_view name);
+
 namespace cql3 {

 namespace statements {
@@ -122,6 +124,10 @@ void create_table_statement::apply_properties_to(schema_builder& builder, const
        addColumnMetadataFromAliases(cfmd, Collections.singletonList(valueAlias), defaultValidator, ColumnDefinition.Kind.COMPACT_VALUE);
 #endif

+    if (!_properties->get_compression_options() && !is_internal_keyspace(keyspace())) {
+        builder.set_compressor_params(db.get_config().sstable_compression_user_table_options());
+    }
+
    _properties->apply_to_builder(builder, _properties->make_schema_extensions(db.extensions()), db, keyspace());
 }

--- a/db/CMakeLists.txt
+++ b/db/CMakeLists.txt
@@ -27,6 +27,7 @@ target_sources(db
    extensions.cc
    heat_load_balance.cc
    large_data_handler.cc
+    corrupt_data_handler.cc
    marshal/type_parser.cc
    batchlog_manager.cc
    tags/utils.cc
--- a/db/batchlog_manager.cc
+++ b/db/batchlog_manager.cc
@@ -18,6 +18,7 @@
 #include <seastar/core/sleep.hh>

 #include "batchlog_manager.hh"
+#include "data_dictionary/data_dictionary.hh"
 #include "mutation/canonical_mutation.hh"
 #include "service/storage_proxy.hh"
 #include "system_keyspace.hh"
@@ -36,7 +37,7 @@

 static logging::logger blogger("batchlog_manager");

-const uint32_t db::batchlog_manager::replay_interval;
+const std::chrono::seconds db::batchlog_manager::replay_interval;
 const uint32_t db::batchlog_manager::page_size;

 db::batchlog_manager::batchlog_manager(cql3::query_processor& qp, db::system_keyspace& sys_ks, batchlog_manager_config config)
@@ -116,7 +117,8 @@ future<> db::batchlog_manager::batchlog_replay_loop() {
        } catch (...) {
            blogger.error("Exception in batch replay: {}", std::current_exception());
        }
-        delay = std::chrono::milliseconds(replay_interval);
+        delay = utils::get_local_injector().is_enabled("short_batchlog_manager_replay_interval") ?
+                std::chrono::seconds(1) : replay_interval;
    }
 }

@@ -132,6 +134,8 @@ future<> db::batchlog_manager::drain() {
        _sem.broken();
    }

+    co_await _qp.proxy().abort_batch_writes();
+
    co_await std::move(_loop_done);
    blogger.info("Drained");
 }
@@ -173,6 +177,11 @@ future<> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cle
            return make_ready_future<stop_iteration>(stop_iteration::no);
        }

+        if (utils::get_local_injector().is_enabled("skip_batch_replay")) {
+            blogger.debug("Skipping batch replay due to skip_batch_replay injection");
+            return make_ready_future<stop_iteration>(stop_iteration::no);
+        }
+
        // check version of serialization format
        if (!row.has("version")) {
            blogger.warn("Skipping logged batch because of unknown version");
@@ -236,19 +245,16 @@ future<> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cle
            // FIXME: verify that the above is reasonably true.
            return limiter->reserve(size).then([this, mutations = std::move(mutations)] {
                _stats.write_attempts += mutations.size();
-                // #1222 - change cl level to ALL, emulating origins behaviour of sending/hinting
-                // to all natural end points.
-                // Note however that origin uses hints here, and actually allows for this
-                // send to partially or wholly fail in actually sending stuff. Since we don't
-                // have hints (yet), send with CL=ALL, and hope we can re-do this soon.
-                // See below, we use retry on write failure.
-                return _qp.proxy().mutate(mutations, db::consistency_level::ALL, db::no_timeout, nullptr, empty_service_permit(), db::allow_per_partition_rate_limit::no);
+                auto timeout = db::timeout_clock::now() + write_timeout;
+                return _qp.proxy().send_batchlog_replay_to_all_replicas(std::move(mutations), timeout);
            });
        }).then_wrapped([this, id](future<> batch_result) {
            try {
                batch_result.get();
            } catch (data_dictionary::no_such_keyspace& ex) {
                // should probably ignore and drop the batch
+            } catch (const data_dictionary::no_such_column_family&) {
+                // As above -- we should drop the batch if the table doesn't exist anymore.
            } catch (...) {
                blogger.warn("Replay failed (will retry): {}", std::current_exception());
                // timeout, overload etc.
--- a/db/batchlog_manager.hh
+++ b/db/batchlog_manager.hh
@@ -43,8 +43,9 @@ public:
    using post_replay_cleanup = bool_class<class post_replay_cleanup_tag>;

 private:
-    static constexpr uint32_t replay_interval = 60 * 1000; // milliseconds
+    static constexpr std::chrono::seconds replay_interval = std::chrono::seconds(60);
    static constexpr uint32_t page_size = 128; // same as HHOM, for now, w/out using any heuristics. TODO: set based on avg batch size.
+    static constexpr std::chrono::seconds write_timeout = std::chrono::seconds(300);

    using clock_type = lowres_clock;

--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -800,6 +800,8 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
    void end_flush() {
        _segment_manager->end_flush();
        if (can_delete()) {
+            // #25709 - do this early if possible
+            _extended_segments.clear();
            _segment_manager->discard_unused_segments();
        }
    }
@@ -875,6 +877,8 @@ public:
    void release_cf_count(const cf_id_type& cf) {
        mark_clean(cf, 1);
        if (can_delete()) {
+            // #25709 - do this early if possible
+            _extended_segments.clear();
            _segment_manager->discard_unused_segments();
        }
    }
@@ -2576,20 +2580,24 @@ struct fmt::formatter<db::commitlog::segment::cf_mark> {
 void db::commitlog::segment_manager::discard_unused_segments() noexcept {
    clogger.trace("Checking for unused segments ({} active)", _segments.size());

-    std::erase_if(_segments, [=](sseg_ptr s) {
-        if (s->can_delete()) {
-            clogger.debug("Segment {} is unused", *s);
-            return true;
-        }
-        if (s->is_still_allocating()) {
-            clogger.debug("Not safe to delete segment {}; still allocating.", *s);
-        } else if (!s->is_clean()) {
-            clogger.debug("Not safe to delete segment {}; dirty is {}", *s, segment::cf_mark {*s});
-        } else {
-            clogger.debug("Not safe to delete segment {}; disk ops pending", *s);
-        }
-        return false;
-    });
+    // #25709: keep a copy of the segment pointers (tmp) alive so no segment is freed until pruning is done.
+    {
+        auto tmp = _segments; 
+        std::erase_if(_segments, [=](sseg_ptr s) {
+            if (s->can_delete()) {
+                clogger.debug("Segment {} is unused", *s);
+                return true;
+            }
+            if (s->is_still_allocating()) {
+                clogger.debug("Not safe to delete segment {}; still allocating.", *s);
+            } else if (!s->is_clean()) {
+                clogger.debug("Not safe to delete segment {}; dirty is {}", *s, segment::cf_mark {*s});
+            } else {
+                clogger.debug("Not safe to delete segment {}; disk ops pending", *s);
+            }
+            return false;
+        });
+    }

    // launch in background, but guard with gate so this deletion is
    // sure to finish in shutdown, because at least through this path,
@@ -2878,7 +2886,10 @@ future<> db::commitlog::segment_manager::do_pending_deletes() {
 }

 future<> db::commitlog::segment_manager::orphan_all() {
-    _segments.clear();
+    // #25709: destroying the elements may call back into
+    // discard_unused_segments(). Move them into a local first, so that
+    // _segments is already empty by the time their destructors run.
+    auto tmp = std::exchange(_segments, {});
     return clear_reserve_segments();
 }

@@ -3255,9 +3266,13 @@ const db::commitlog::config& db::commitlog::active_config() const {
    return _segment_manager->cfg;
 }

+db::commitlog::segment_data_corruption_error::segment_data_corruption_error(std::string_view msg, uint64_t s)
+    : _msg(fmt::format("Segment data corruption: {}", msg))
+    , _bytes(s)
+{}

-db::commitlog::segment_truncation::segment_truncation(uint64_t pos) 
-    : _msg(fmt::format("Segment truncation at {}", pos))
+db::commitlog::segment_truncation::segment_truncation(std::string_view reason, uint64_t pos)
+    : _msg(fmt::format("Segment truncation at {}. Reason: {}", pos, reason))
    , _pos(pos)
 {}

@@ -3447,7 +3462,8 @@ db::commitlog::read_log_file(const replay_state& state, sstring filename, sstrin

            while (rem < size) {
                if (eof) {
-                    throw segment_truncation(block_boundry);
+                    auto reason = fmt::format("unexpected EOF, rem={}, size={}", rem, size);
+                    throw segment_truncation(std::move(reason), block_boundry);
                }

                auto block_size = alignment - initial.size_bytes();
@@ -3458,7 +3474,8 @@ db::commitlog::read_log_file(const replay_state& state, sstring filename, sstrin

                if (tmp.size_bytes() == 0) {
                    eof = true;
-                    throw segment_truncation(block_boundry);
+                    auto reason = fmt::format("read 0 bytes, while tried to read {}", block_size);
+                    throw segment_truncation(std::move(reason), block_boundry);
                }

                crc32_nbo crc;
@@ -3493,10 +3510,12 @@ db::commitlog::read_log_file(const replay_state& state, sstring filename, sstrin
                    auto checksum = crc.checksum();

                    if (check != checksum) {
-                        throw segment_data_corruption_error("Data corruption", alignment);
+                        auto reason = fmt::format("checksums do not match: {:x} vs. {:x}", check, checksum);
+                        throw segment_data_corruption_error(std::move(reason), alignment);
                    }
                    if (id != this->id) {
-                        throw segment_truncation(pos + rem);
+                        auto reason = fmt::format("IDs do not match: {} vs. {}", id, this->id);
+                        throw segment_truncation(std::move(reason), pos + rem);
                    }
                }
                tmp.remove_suffix(detail::sector_overhead_size);
@@ -3604,6 +3623,10 @@ db::commitlog::read_log_file(const replay_state& state, sstring filename, sstrin
            auto old = pos;
            pos = next_pos(off);
            clogger.trace("Pos {} -> {} ({})", old, pos, off);
+            // #24346 check eof status whenever we move file pos.
+            if (pos >= file_size) {
+                eof = true;
+            }
        }

        future<> read_entry() {
@@ -3771,7 +3794,8 @@ db::commitlog::read_log_file(const replay_state& state, sstring filename, sstrin
                    co_await read_chunk();
                }
                if (corrupt_size > 0) {
-                    throw segment_data_corruption_error("Data corruption", corrupt_size);
+                    auto reason = fmt::format("corrupted size while reading file: {}", corrupt_size);
+                    throw segment_data_corruption_error(std::move(reason), corrupt_size);
                }
            } catch (...) {
                p = std::current_exception();
--- a/db/commitlog/commitlog.hh
+++ b/db/commitlog/commitlog.hh
@@ -392,9 +392,7 @@ public:
    class segment_data_corruption_error: public segment_error {
        std::string _msg;
    public:
-        segment_data_corruption_error(std::string msg, uint64_t s)
-                : _msg(std::move(msg)), _bytes(s) {
-        }
+        segment_data_corruption_error(std::string_view msg, uint64_t s);
        uint64_t bytes() const {
            return _bytes;
        }
@@ -425,7 +423,7 @@ public:
        std::string _msg;
        uint64_t _pos;
    public:
-        segment_truncation(uint64_t);
+        segment_truncation(std::string_view reason, uint64_t position);

        uint64_t position() const;
        const char* what() const noexcept override;
--- a/db/config.cc
+++ b/db/config.cc
@@ -32,11 +32,15 @@
 #include "db/tags/extension.hh"
 #include "config.hh"
 #include "extensions.hh"
+#include "compress.hh"
 #include "utils/log.hh"
 #include "service/tablet_allocator_fwd.hh"
 #include "utils/config_file_impl.hh"
+#include "exceptions/exceptions.hh"
 #include <seastar/core/metrics_api.hh>
 #include <seastar/core/relabel_config.hh>
 #include <seastar/util/file.hh>
+
+static logging::logger cfglogger("config");

 namespace utils {
@@ -86,6 +90,12 @@ object_storage_endpoints_to_json(const std::vector<db::object_storage_endpoint_p
    return value_to_json(m);
 }

+static
+json::json_return_type
+uuid_to_json(const db::config::UUID& uuid) {
+    return value_to_json(format("{}", uuid));
+}
+
 // Convert a value that can be printed with fmt::format, or a vector of
 // such values, to JSON. An example is enum_option<T>, because enum_option<T>
 // has a specialization for fmt::formatter.
@@ -111,6 +121,12 @@ error_injection_list_to_json(const std::vector<db::config::error_injection_at_st
    return value_to_json("error_injection_list");
 }

+static
+json::json_return_type
+compression_parameters_to_json(const compression_parameters& cp) {
+    return value_to_json(cp.get_options());
+}
+
 template <>
 bool
 config_from_string(std::string_view value) {
@@ -294,6 +310,18 @@ const config_type& config_type_for<std::vector<db::object_storage_endpoint_param
    return ct;
 }

+template <>
+const config_type& config_type_for<db::config::UUID>() {
+    static config_type ct("UUID", uuid_to_json);
+    return ct;
+}
+
+template <>
+const config_type& config_type_for<compression_parameters>() {
+    static config_type ct("compression parameters", compression_parameters_to_json);
+    return ct;
+}
+
 }

 namespace YAML {
@@ -491,6 +519,50 @@ struct convert<db::object_storage_endpoint_param> {
    }
 };

+template<>
+struct convert<utils::UUID> {
+    static bool decode(const Node& node, utils::UUID& uuid) {
+        std::string uuid_string;
+        if (!convert<std::string>::decode(node, uuid_string)) {
+            return false;
+        }
+        try {
+            std::istringstream(uuid_string) >> uuid;
+        } catch (boost::program_options::invalid_option_value&) {
+            return false;
+        }
+        return true;
+    }
+};
+
+template<>
+struct convert<compression_parameters> {
+    static bool decode(const Node& node, compression_parameters& cp) {
+        if (!node.IsMap()) {
+            return false;
+        }
+
+        std::map<sstring, sstring> options;
+        for (const auto& kv : node) {
+            options[kv.first.as<sstring>()] = kv.second.as<sstring>();
+        }
+
+        try {
+            cp = compression_parameters(options);
+            return true;
+        } catch (const exceptions::syntax_exception& e) {
+            cfglogger.error("Invalid compression parameters syntax: {}", e.what());
+            return false;
+        } catch (const exceptions::configuration_exception& e) {
+            cfglogger.error("Invalid compression parameters configuration: {}", e.what());
+            return false;
+        } catch (const std::runtime_error& e) {
+            cfglogger.error("Error parsing compression parameters: {}", e.what());
+            return false;
+        }
+    }
+};
+
 }

 #if defined(DEBUG)
@@ -819,7 +891,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , inter_dc_stream_throughput_outbound_megabits_per_sec(this, "inter_dc_stream_throughput_outbound_megabits_per_sec", value_status::Unused, 0,
        "Throttles all streaming file transfer between the data centers. This setting allows throttles streaming throughput betweens data centers in addition to throttling all network stream traffic as configured with stream_throughput_outbound_megabits_per_sec.")
    , stream_io_throughput_mb_per_sec(this, "stream_io_throughput_mb_per_sec", liveness::LiveUpdate, value_status::Used, 0,
-        "Throttles streaming I/O to the specified total throughput (in MiBs/s) across the entire system. Streaming I/O includes the one performed by repair and both RBNO and legacy topology operations such as adding or removing a node. Setting the value to 0 disables stream throttling.")
+        "Throttles streaming I/O to the specified total throughput (in MiBs/s) across the entire system. Streaming I/O includes the one performed by repair and both RBNO and legacy topology operations such as adding or removing a node. Setting the value to 0 disables stream throttling. It is recommended to set the value for this parameter to be 75% of network bandwidth")
    , stream_plan_ranges_fraction(this, "stream_plan_ranges_fraction", liveness::LiveUpdate, value_status::Used, 0.1,
        "Specify the fraction of ranges to stream in a single stream plan. Value is between 0 and 1.")
    , enable_file_stream(this, "enable_file_stream", liveness::LiveUpdate, value_status::Used, true, "Set true to use file based stream for tablet instead of mutation based stream")
@@ -942,6 +1014,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "The default timeout for other, miscellaneous operations.\n"
        "\n"
        "Related information: About hinted handoff writes")
+    , request_timeout_on_shutdown_in_seconds(this, "request_timeout_on_shutdown_in_seconds", value_status::Used, 30,
+        "Timeout for CQL server requests on shutdown. After this timeout the server will shutdown all connections.")
    , group0_raft_op_timeout_in_ms(this, "group0_raft_op_timeout_in_ms", liveness::LiveUpdate, value_status::Used, 60000,
            "The time in milliseconds that group0 allows a Raft operation to complete.")
    /**
@@ -1230,7 +1304,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , sstable_summary_ratio(this, "sstable_summary_ratio", value_status::Used, 0.0005, "Enforces that 1 byte of summary is written for every N (2000 by default)"
        "bytes written to data file. Value must be between 0 and 1.")
    , components_memory_reclaim_threshold(this, "components_memory_reclaim_threshold", liveness::LiveUpdate, value_status::Used, .2, "Ratio of available memory for all in-memory components of SSTables in a shard beyond which the memory will be reclaimed from components until it falls back under the threshold. Currently, this limit is only enforced for bloom filters.")
-    , large_memory_allocation_warning_threshold(this, "large_memory_allocation_warning_threshold", value_status::Used, (size_t(128) << 10) + 1, "Warn about memory allocations above this size; set to zero to disable.")
+    , large_memory_allocation_warning_threshold(this, "large_memory_allocation_warning_threshold", value_status::Used, size_t(1) << 20, "Warn about memory allocations above this size; set to zero to disable.")
    , enable_deprecated_partitioners(this, "enable_deprecated_partitioners", value_status::Used, false, "Enable the byteordered and random partitioners. These partitioners are deprecated and will be removed in a future version.")
    , enable_keyspace_column_family_metrics(this, "enable_keyspace_column_family_metrics", value_status::Used, false, "Enable per keyspace and per column family metrics reporting.")
    , enable_node_aggregated_table_metrics(this, "enable_node_aggregated_table_metrics", value_status::Used, true, "Enable aggregated per node, per keyspace and per table metrics reporting, applicable if enable_keyspace_column_family_metrics is false.")
@@ -1243,6 +1317,21 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , enable_sstables_mc_format(this, "enable_sstables_mc_format", value_status::Unused, true, "Enable SSTables 'mc' format to be used as the default file format.  Deprecated, please use \"sstable_format\" instead.")
    , enable_sstables_md_format(this, "enable_sstables_md_format", value_status::Unused, true, "Enable SSTables 'md' format to be used as the default file format.  Deprecated, please use \"sstable_format\" instead.")
    , sstable_format(this, "sstable_format", value_status::Used, "me", "Default sstable file format", {"md", "me"})
+    , sstable_compression_user_table_options(this, "sstable_compression_user_table_options", value_status::Used, compression_parameters{},
+        "Server-global user table compression options. If enabled, all user tables "
+        "will be compressed using the provided options, unless overridden "
+        "by compression options in the table schema. The available options are:\n"
+        "* sstable_compression: The compression algorithm to use. Supported values: LZ4Compressor (default), LZ4WithDictsCompressor, SnappyCompressor, DeflateCompressor, ZstdCompressor, ZstdWithDictsCompressor, '' (empty string; disables compression).\n"
+        "* chunk_length_in_kb: (Default: 4) The size of chunks to compress in kilobytes. Allowed values are powers of two between 1 and 128.\n"
+        "* crc_check_chance: (Default: 1.0) Not implemented (option value is ignored).\n"
+        "* compression_level: (Default: 3) Compression level for ZstdCompressor and ZstdWithDictsCompressor. Higher levels provide better compression ratios at the cost of speed. Allowed values are integers between 1 and 22.")
+    , sstable_compression_dictionaries_allow_in_ddl(this, "sstable_compression_dictionaries_allow_in_ddl", liveness::LiveUpdate, value_status::Used, true,
+        "Allows for configuring tables to use SSTable compression with shared dictionaries. "
+        "If the option is disabled, Scylla will reject CREATE and ALTER statements which try to set dictionary-based sstable compressors.\n"
+        "This is only enforced when this node validates a new DDL statement; disabling the option won't disable dictionary-based compression "
+        "on tables which already have it configured, and won't do anything to existing sstables.\n"
+        "To affect existing tables, you can ALTER them to a non-dictionary compressor, or disable dictionary compression "
+        "for the whole node through `sstable_compression_dictionaries_enable_writing`.")
    , sstable_compression_dictionaries_enable_writing(this, "sstable_compression_dictionaries_enable_writing", liveness::LiveUpdate, value_status::Used, true,
        "Enables SSTable compression with shared dictionaries (for tables which opt in). If set to false, this node won't write any new SSTables using dictionary compression.\n"
        "Option meant not for regular usage, but for unforeseen problems that call for disabling dictionaries without modifying table schema.")
@@ -1392,7 +1481,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "The maximum fraction of cache memory permitted for use by index cache. Clamped to the [0.0; 1.0] range. Must be small enough to not deprive the row cache of memory, but should be big enough to fit a large fraction of the index. The default value 0.2 means that at least 80\% of cache memory is reserved for the row cache, while at most 20\% is usable by the index cache.")
    , consistent_cluster_management(this, "consistent_cluster_management", value_status::Deprecated, true, "Use RAFT for cluster management and DDL.")
    , force_gossip_topology_changes(this, "force_gossip_topology_changes", value_status::Used, false, "Force gossip-based topology operations in a fresh cluster. Only the first node in the cluster must use it. The rest will fall back to gossip-based operations anyway. This option should be used only for testing.  Note: gossip topology changes are incompatible with tablets.")
-    , recovery_leader(this, "recovery_leader", liveness::LiveUpdate, value_status::Used, "", "Host ID of the node restarted first while performing the Manual Raft-based Recovery Procedure. Warning: this option disables some guardrails for the needs of the Manual Raft-based Recovery Procedure. Make sure you unset it at the end of the procedure.")
+    , recovery_leader(this, "recovery_leader", liveness::LiveUpdate, value_status::Used, utils::null_uuid(), "Host ID of the node restarted first while performing the Manual Raft-based Recovery Procedure. Warning: this option disables some guardrails for the needs of the Manual Raft-based Recovery Procedure. Make sure you unset it at the end of the procedure.")
    , wasm_cache_memory_fraction(this, "wasm_cache_memory_fraction", value_status::Used, 0.01, "Maximum total size of all WASM instances stored in the cache as fraction of total shard memory.")
    , wasm_cache_timeout_in_ms(this, "wasm_cache_timeout_in_ms", value_status::Used, 5000, "Time after which an instance is evicted from the cache.")
    , wasm_cache_instance_size_limit(this, "wasm_cache_instance_size_limit", value_status::Used, 1024*1024, "Instances with size above this limit will not be stored in the cache.")
--- a/db/config.hh
+++ b/db/config.hh
@@ -25,6 +25,7 @@
 #include "utils/error_injection.hh"
 #include "utils/dict_trainer.hh"
 #include "utils/advanced_rpc_compressor.hh"
+#include "compress.hh"

 namespace boost::program_options {

@@ -207,6 +208,7 @@ public:
    using seed_provider_type = db::seed_provider_type;
    using hinted_handoff_enabled_type = db::hints::host_filter;
    using error_injection_at_startup = db::error_injection_at_startup;
+    using UUID = utils::UUID;

    /*
     * All values and documentation taken from
@@ -322,6 +324,7 @@ public:
    named_value<uint32_t> truncate_request_timeout_in_ms;
    named_value<uint32_t> write_request_timeout_in_ms;
    named_value<uint32_t> request_timeout_in_ms;
+    named_value<uint32_t> request_timeout_on_shutdown_in_seconds;
    named_value<uint32_t> group0_raft_op_timeout_in_ms;
    named_value<bool> cross_node_timeout;
    named_value<uint32_t> internode_send_buff_size_in_bytes;
@@ -436,6 +439,8 @@ public:
    named_value<bool> enable_sstables_mc_format;
    named_value<bool> enable_sstables_md_format;
    named_value<sstring> sstable_format;
+    named_value<compression_parameters> sstable_compression_user_table_options;
+    named_value<bool> sstable_compression_dictionaries_allow_in_ddl;
    named_value<bool> sstable_compression_dictionaries_enable_writing;
    named_value<float> sstable_compression_dictionaries_memory_budget_fraction;
    named_value<float> sstable_compression_dictionaries_retrain_period_in_seconds;
@@ -520,7 +525,7 @@ public:

    named_value<bool> consistent_cluster_management;
    named_value<bool> force_gossip_topology_changes;
-    named_value<sstring> recovery_leader;
+    named_value<UUID> recovery_leader;

    named_value<double> wasm_cache_memory_fraction;
    named_value<uint32_t> wasm_cache_timeout_in_ms;
--- a/db/corrupt_data_handler.cc
+++ b/db/corrupt_data_handler.cc
@@ -0,0 +1,134 @@
+/*
+ * Copyright (C) 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#include "db/corrupt_data_handler.hh"
+#include "reader_concurrency_semaphore.hh"
+#include "replica/database.hh"
+#include "utils/UUID_gen.hh"
+
+static logging::logger corrupt_data_logger("corrupt_data");
+
+namespace sm = seastar::metrics;
+
+namespace db {
+
+// Constructs the common handler state. When `rm` is set, the "entries_reported"
+// counter (backed by _stats.corrupt_data_reported) is registered in the
+// "corrupt_data" metric group; otherwise no metrics are registered.
+corrupt_data_handler::corrupt_data_handler(register_metrics rm) {
+    if (!rm) {
+        // The caller opted out of metric registration.
+        return;
+    }
+
+    _metrics.add_group("corrupt_data", {
+        sm::make_counter("entries_reported", _stats.corrupt_data_reported,
+                sm::description("Counts the number of corrupt data instances reported to the corrupt data handler. "
+                                "A non-zero value indicates that the database suffered data corruption."))
+    });
+}
+
+// Public entry point for reporting a corrupt clustering row.
+// Bumps the "reported" counters up front, delegates the actual storage to
+// do_record_corrupt_clustering_row() and bumps the "recorded" counters only
+// when the implementation hands back a non-null entry id.
+future<corrupt_data_handler::entry_id> corrupt_data_handler::record_corrupt_clustering_row(const schema& s, const partition_key& pk,
+        clustering_row cr, sstring origin, std::optional<sstring> sstable_name) {
+    ++_stats.corrupt_data_reported;
+    ++_stats.corrupt_clustering_rows_reported;
+
+    return do_record_corrupt_clustering_row(s, pk, std::move(cr), std::move(origin), std::move(sstable_name))
+            .then([this] (entry_id recorded_id) {
+                // A null id means the row was not recorded, so only the "reported" counters apply.
+                if (!recorded_id) {
+                    return recorded_id;
+                }
+                ++_stats.corrupt_data_recorded;
+                ++_stats.corrupt_clustering_rows_recorded;
+                return recorded_id;
+            });
+}
+
+// Forwards the metric-registration choice to the base class, remembers the
+// configured entry TTL and names the pluggable system keyspace slot.
+// The system keyspace itself is attached later via plug_system_keyspace().
+system_table_corrupt_data_handler::system_table_corrupt_data_handler(config cfg, register_metrics rm)
+        : corrupt_data_handler(rm), _entry_ttl(cfg.entry_ttl), _sys_ks("system_table_corrupt_data_handler::system_keyspace") {}
+
+// Defined in this translation unit because the header only forward-declares
+// reader_concurrency_semaphore, which the std::unique_ptr member must be able to delete.
+system_table_corrupt_data_handler::~system_table_corrupt_data_handler() = default;
+
+// Creates a tracking-only permit on the handler's private semaphore for the given schema.
+// Requires the semaphore to exist, i.e. plug_system_keyspace() must have been called.
+reader_permit system_table_corrupt_data_handler::make_fragment_permit(const schema& s) {
+    auto table_schema = s.shared_from_this();
+    return _fragment_semaphore->make_tracking_only_permit(
+            std::move(table_schema),
+            "system_table_corrupt_data_handler::make_fragment_permit",
+            db::no_timeout,
+            {});
+}
+
+// Writes one entry describing a corrupt mutation fragment to the corrupt-data system table.
+//
+// The entry is keyed by the user table's keyspace and table name (partition key) and a
+// freshly generated time-UUID (clustering key). All cells and the row marker are written
+// with the configured entry TTL.
+//
+// sys_ks            - permit giving access to the system keyspace for the duration of the write
+// user_table_schema - schema of the table the corrupt fragment belongs to
+// pk, ck            - keys of the corrupt fragment, stored in their serialized representation
+// kind              - fragment kind, stored as its formatted name
+// fmf               - the frozen fragment itself
+// origin            - free-form description of where the corruption was detected
+// sstable_name      - name of the affected sstable, if any; the cell is left unset otherwise
+//
+// Returns the id of the new entry once the mutation has been applied.
+future<corrupt_data_handler::entry_id> system_table_corrupt_data_handler::do_record_corrupt_mutation_fragment(
+        pluggable_system_keyspace::permit sys_ks,
+        const schema& user_table_schema,
+        const partition_key& pk,
+        const clustering_key& ck,
+        mutation_fragment_v2::kind kind,
+        frozen_mutation_fragment_v2 fmf,
+        sstring origin,
+        std::optional<sstring> sstable_name) {
+    const corrupt_data_handler::entry_id id{utils::UUID_gen::get_time_UUID()};
+
+    const auto corrupt_data_schema = sys_ks->local_db().find_column_family(system_keyspace::NAME, system_keyspace::CORRUPT_DATA).schema();
+
+    // Using the lower-level mutation API to avoid large allocation warnings when linearizing the frozen mutation fragment.
+    mutation entry_mutation(corrupt_data_schema, partition_key::from_exploded(*corrupt_data_schema, {serialized(user_table_schema.ks_name()), serialized(user_table_schema.cf_name())}));
+    auto& entry_row = entry_mutation.partition().clustered_row(*corrupt_data_schema, clustering_key::from_single_value(*corrupt_data_schema, serialized(timeuuid_native_type{id.uuid()})));
+
+    const auto timestamp = api::new_timestamp();
+
+    // Stores an already-serialized value into the named column of the entry row.
+    auto set_cell_raw = [this, &entry_row, &corrupt_data_schema, timestamp] (const char* cell_name, managed_bytes cell_value) {
+        auto cdef = corrupt_data_schema->get_column_definition(cell_name);
+        SCYLLA_ASSERT(cdef);
+
+        entry_row.cells().apply(*cdef, atomic_cell::make_live(*cdef->type, timestamp, cell_value, _entry_ttl));
+    };
+
+    // Serializes a (non-null) value and stores it into the named column of the entry row.
+    auto set_cell = [this, &entry_row, &corrupt_data_schema, timestamp] (const char* cell_name, data_value cell_value) {
+        auto cdef = corrupt_data_schema->get_column_definition(cell_name);
+        SCYLLA_ASSERT(cdef);
+
+        entry_row.cells().apply(*cdef, atomic_cell::make_live(*cdef->type, timestamp, cell_value.serialize_nonnull(), _entry_ttl));
+    };
+
+    entry_row.apply(row_marker(timestamp, _entry_ttl, gc_clock::now() + _entry_ttl));
+    set_cell("partition_key", data_value(to_bytes(pk.representation())));
+    set_cell("clustering_key", data_value(to_bytes(ck.representation())));
+    set_cell("mutation_fragment_kind", fmt::to_string(kind));
+    // FIXME: Exposing knowledge here that bytes are serialized by just storing the raw value.
+    // Need to replace with a fragmented-buffer serialize API call, which we don't have yet.
+    set_cell_raw("frozen_mutation_fragment", std::move(fmf).representation().to_managed_bytes());
+    set_cell("origin", origin);
+    // sstable_name is optional, while set_cell() goes through serialize_nonnull().
+    // Leave the cell unset when no sstable name was supplied instead of handing
+    // an empty optional to the non-null serialization path.
+    if (sstable_name) {
+        set_cell("sstable_name", *sstable_name);
+    }
+
+    return sys_ks->apply_mutation(std::move(entry_mutation)).then([id] {
+        return id;
+    });
+}
+
+// Records a corrupt clustering row as an entry in the system table.
+// When no system keyspace permit can be obtained, nothing is written
+// and a null entry id is returned.
+future<corrupt_data_handler::entry_id> system_table_corrupt_data_handler::do_record_corrupt_clustering_row(const schema& s, const partition_key& pk,
+        clustering_row cr, sstring origin, std::optional<sstring> sstable_name) {
+    auto keyspace_permit = _sys_ks.get_permit();
+    if (!keyspace_permit) {
+        co_return corrupt_data_handler::entry_id::create_null_id();
+    }
+
+    // Copy the key out first: the row itself is moved into the fragment on the next line.
+    const auto row_key = cr.key();
+    auto frozen_row = freeze(s, mutation_fragment_v2(s, make_fragment_permit(s), std::move(cr)));
+
+    auto recorded_id = co_await do_record_corrupt_mutation_fragment(
+            std::move(keyspace_permit), s, pk, row_key, mutation_fragment_v2::kind::clustering_row,
+            std::move(frozen_row), std::move(origin), std::move(sstable_name));
+    co_return recorded_id;
+}
+
+// Attaches the system keyspace to the handler and creates the private,
+// unlimited semaphore (without metrics) used by make_fragment_permit().
+void system_table_corrupt_data_handler::plug_system_keyspace(db::system_keyspace& sys_ks) noexcept {
+    using semaphore = reader_concurrency_semaphore;
+
+    _sys_ks.plug(sys_ks.shared_from_this());
+    _fragment_semaphore = std::make_unique<semaphore>(
+            semaphore::no_limits{},
+            "system_table_corrupt_data_handler",
+            semaphore::register_metrics::no);
+}
+
+// Detaches the handler from the system keyspace and stops the fragment semaphore.
+//
+// The semaphore only exists after plug_system_keyspace() has run, so it is
+// stopped only when present. This keeps the call from dereferencing a null
+// pointer when the keyspace was never plugged, and from stopping the same
+// semaphore twice when unplug is called again.
+future<> system_table_corrupt_data_handler::unplug_system_keyspace() noexcept {
+    co_await _sys_ks.unplug();
+    // Take ownership before suspending: the member is left null, and the semaphore
+    // being stopped stays alive for the whole stop() even if a new one is plugged meanwhile.
+    if (auto semaphore = std::move(_fragment_semaphore)) {
+        co_await semaphore->stop();
+    }
+}
+
+// No-op implementation: ignores all arguments and immediately resolves to a
+// null entry id, which callers interpret as "not recorded".
+future<corrupt_data_handler::entry_id> nop_corrupt_data_handler::do_record_corrupt_clustering_row(const schema& s, const partition_key& pk,
+        clustering_row cr, sstring origin, std::optional<sstring> sstable_name) {
+    const auto not_recorded = entry_id::create_null_id();
+    return make_ready_future<entry_id>(not_recorded);
+}
+
+} // namespace db
--- a/db/corrupt_data_handler.hh
+++ b/db/corrupt_data_handler.hh
@@ -0,0 +1,110 @@
+/*
+ * Copyright (C) 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#pragma once
+
+#include "db/system_keyspace.hh"
+#include "utils/UUID.hh"
+#include "utils/pluggable.hh"
+
+class reader_concurrency_semaphore;
+class reader_permit;
+
+namespace db {
+
+// Base class for components that receive reports of corrupt data.
+// It keeps the reported/recorded counters and optionally exposes a metric;
+// where (and whether) the corrupt data is stored is up to the derived class.
+class corrupt_data_handler {
+public:
+    // An ID identifying the corrupt data entry.
+    // To be interpreted in the context of the storage where it is recorded, see storage_name().
+    using entry_id = utils::tagged_uuid<struct corrupt_data_entry_tag>;
+
+    struct stats {
+        // Counters for the number of corrupt data entries reported.
+        uint64_t corrupt_data_reported = 0;
+        // Counters for the number of corrupt data entries recorded.
+        // Can be less than reported depending on the configuration or if entries failed to be recorded.
+        uint64_t corrupt_data_recorded = 0;
+
+        // Same as above, restricted to corrupt clustering rows.
+        uint64_t corrupt_clustering_rows_reported = 0;
+        uint64_t corrupt_clustering_rows_recorded = 0;
+    };
+
+private:
+    // Updated by record_corrupt_clustering_row(); read-only access via get_stats().
+    stats _stats;
+
+    // Holds the metric registration made by the constructor (if requested).
+    seastar::metrics::metric_groups _metrics;
+
+protected:
+    // Storage-specific part of record_corrupt_clustering_row().
+    // Implementations return a null id when the row was not recorded.
+    virtual future<entry_id> do_record_corrupt_clustering_row(const schema& s, const partition_key& pk, clustering_row cr, sstring origin, std::optional<sstring> sstable_name) = 0;
+
+public:
+    // Tells the constructor whether to register the handler's metrics.
+    using register_metrics = bool_class<struct corrupt_data_handler_register_metrics_tag>;
+    explicit corrupt_data_handler(register_metrics);
+    virtual ~corrupt_data_handler() = default;
+
+    // Read-only view of the counters maintained by this handler.
+    const stats& get_stats() const noexcept {
+        return _stats;
+    }
+
+    // The name of the storage where corrupt data is recorded.
+    // The storage-name and the entry-id together should allow the user to unambiguously locate the entry.
+    virtual sstring storage_name() const noexcept = 0;
+
+    // Record a corrupt clustering row.
+    // Updates the reported counters, delegates to do_record_corrupt_clustering_row()
+    // and updates the recorded counters when a non-null id comes back.
+    // If the returned id is null, the row was not recorded.
+    future<entry_id> record_corrupt_clustering_row(const schema& s, const partition_key& pk, clustering_row cr, sstring origin, std::optional<sstring> sstable_name);
+};
+
+// Stores corrupt data entries in the system.corrupt_data table.
+// The system keyspace is attached and detached at runtime through
+// plug_system_keyspace() / unplug_system_keyspace(); while no permit for it can
+// be obtained, reported rows are not recorded and a null entry id is returned.
+class system_table_corrupt_data_handler final : public corrupt_data_handler {
+public:
+    using pluggable_system_keyspace = utils::pluggable<db::system_keyspace>;
+
+    struct config {
+        // TTL applied to the row marker and to every cell of a recorded entry.
+        gc_clock::duration entry_ttl;
+    };
+
+private:
+    // Copied from config::entry_ttl at construction.
+    gc_clock::duration _entry_ttl;
+
+    // Access point to the system keyspace; recording requires a permit from it.
+    pluggable_system_keyspace _sys_ks;
+    // Created by plug_system_keyspace() and stopped by unplug_system_keyspace().
+    // Source of the permits handed out by make_fragment_permit().
+    std::unique_ptr<reader_concurrency_semaphore> _fragment_semaphore;
+
+private:
+    // Creates a tracking-only permit on _fragment_semaphore for the given schema.
+    reader_permit make_fragment_permit(const schema& s);
+
+    // Builds the system table entry for an already frozen fragment and applies it.
+    // Returns the id of the new entry.
+    future<entry_id> do_record_corrupt_mutation_fragment(pluggable_system_keyspace::permit sys_ks, const schema& user_table_schema, const partition_key& pk, const clustering_key& ck,
+            mutation_fragment_v2::kind kind, frozen_mutation_fragment_v2 mf, sstring origin, std::optional<sstring> sstable_name);
+
+    // Freezes the row and records it via do_record_corrupt_mutation_fragment().
+    virtual future<entry_id> do_record_corrupt_clustering_row(const schema& s, const partition_key& pk, clustering_row cr, sstring origin, std::optional<sstring> sstable_name) override;
+
+public:
+    explicit system_table_corrupt_data_handler(config, register_metrics);
+    ~system_table_corrupt_data_handler();
+
+    // "<system keyspace name>.<corrupt data table name>".
+    virtual sstring storage_name() const noexcept override {
+        return format("{}.{}", db::system_keyspace::NAME, db::system_keyspace::CORRUPT_DATA);
+    }
+
+    // Attaches the system keyspace and creates the fragment semaphore.
+    void plug_system_keyspace(db::system_keyspace& sys_ks) noexcept;
+    // Detaches the system keyspace, then stops the fragment semaphore.
+    future<> unplug_system_keyspace() noexcept;
+};
+
+// A no-op corrupt data handler that does not record any data.
+// Reports still go through the base class, so its counters are updated,
+// but every report resolves to a null entry id.
+class nop_corrupt_data_handler final : public corrupt_data_handler {
+    // Always returns a ready future holding a null entry id.
+    virtual future<entry_id> do_record_corrupt_clustering_row(const schema& s, const partition_key& pk, clustering_row cr, sstring origin, std::optional<sstring> sstable_name) override;
+
+public:
+    explicit nop_corrupt_data_handler(register_metrics rm)
+        : corrupt_data_handler(rm) {}
+    // Nothing is stored anywhere, hence the placeholder storage name.
+    virtual sstring storage_name() const noexcept override {
+        return "/dev/null";
+    }
+};
+
+} // namespace db
--- a/db/hints/internal/hint_endpoint_manager.cc
+++ b/db/hints/internal/hint_endpoint_manager.cc
@@ -65,18 +65,18 @@ future<> hint_endpoint_manager::do_store_hint(schema_ptr s, lw_shared_ptr<const
        const replay_position rp = rh.release();
        if (_last_written_rp < rp) {
            _last_written_rp = rp;
-            manager_logger.debug("[{}] Updated last written replay position to {}", end_point_key(), rp);
+            manager_logger.trace("hint_endpoint_manager[{}]:do_store_hint: Updated last written replay position to {}", end_point_key(), rp);
        }

        ++shard_stats().written;

-        manager_logger.trace("Hint to {} was stored", end_point_key());
+        manager_logger.trace("hint_endpoint_manager[{}]:do_store_hint: Hint has been stored", end_point_key());
        tracing::trace(tr_state, "Hint to {} was stored", end_point_key());
    } catch (...) {
        ++shard_stats().errors;
        const auto eptr = std::current_exception();

-        manager_logger.debug("store_hint(): got the exception when storing a hint to {}: {}", end_point_key(), eptr);
+        manager_logger.debug("hint_endpoint_manager[{}]:do_store_hint: Exception when storing a hint: {}", end_point_key(), eptr);
        tracing::trace(tr_state, "Failed to store a hint to {}: {}", end_point_key(), eptr);
    }

@@ -92,7 +92,7 @@ bool hint_endpoint_manager::store_hint(schema_ptr s, lw_shared_ptr<const frozen_
            return do_store_hint(std::move(s), std::move(fm), tr_state);
        });
    } catch (...) {
-        manager_logger.trace("Failed to store a hint to {}: {}", end_point_key(), std::current_exception());
+        manager_logger.trace("hint_endpoint_manager[{}]:store_hint: Failed to store a hint: {}", end_point_key(), std::current_exception());
        tracing::trace(tr_state, "Failed to store a hint to {}: {}", end_point_key(), std::current_exception());

        ++shard_stats().dropped;
@@ -109,16 +109,23 @@ future<> hint_endpoint_manager::populate_segments_to_replay() {
 }

 void hint_endpoint_manager::start() {
+    // Debug logs tagged with the endpoint key bracket the start sequence.
+    manager_logger.debug("hint_endpoint_manager[{}]:start: Starting", end_point_key());
+
    clear_stopped();
    allow_hints();
    _sender.start();
+
+    manager_logger.debug("hint_endpoint_manager[{}]:start: Finished", end_point_key());
 }

 future<> hint_endpoint_manager::stop(drain should_drain) noexcept {
-    if(stopped()) {
+    if (stopped()) {
+        manager_logger.warn("hint_endpoint_manager[{}]:stop: Stop had already been called", end_point_key());
        return make_exception_future<>(std::logic_error(format("ep_manager[{}]: stop() is called twice", _key).c_str()));
    }

+    manager_logger.debug("hint_endpoint_manager[{}]:stop: Starting", end_point_key());
+
    return seastar::async([this, should_drain] {
        std::exception_ptr eptr;

@@ -139,10 +146,11 @@ future<> hint_endpoint_manager::stop(drain should_drain) noexcept {
        }).handle_exception([&eptr] (auto e) { eptr = std::move(e); }).get();

        if (eptr) {
-            manager_logger.error("ep_manager[{}]: exception: {}", _key, eptr);
+            manager_logger.error("hint_endpoint_manager[{}]:stop: Exception occurred: {}", _key, eptr);
        }

        set_stopped();
+        manager_logger.debug("hint_endpoint_manager[{}]:stop: Finished", end_point_key());
    });
 }

@@ -194,7 +202,7 @@ future<hints_store_ptr> hint_endpoint_manager::get_or_load() {
 }

 future<db::commitlog> hint_endpoint_manager::add_store() noexcept {
-    manager_logger.trace("Going to add a store to {}", _hints_dir.c_str());
+    manager_logger.debug("hint_endpoint_manager[{}]:add_store: Going to add a store: {}", end_point_key(), _hints_dir.native());

    return futurize_invoke([this] {
        return io_check([name = _hints_dir.c_str()] { return recursive_touch_directory(name); }).then([this] () {
@@ -289,6 +297,8 @@ future<db::commitlog> hint_endpoint_manager::add_store() noexcept {
                    _sender.add_segment(std::move(seg));
                }

+                manager_logger.debug("hint_endpoint_manager[{}]:add_store: Finished", end_point_key());
+
                co_return l;
            });
        });
--- a/db/hints/internal/hint_sender.cc
+++ b/db/hints/internal/hint_sender.cc
@@ -56,8 +56,8 @@ future<> hint_sender::flush_maybe() noexcept {
    if (current_time >= _next_flush_tp) {
        return _ep_manager.flush_current_hints().then([this, current_time] {
            _next_flush_tp = current_time + manager::hints_flush_period;
-        }).handle_exception([] (auto eptr) {
-            manager_logger.trace("flush_maybe() failed: {}", eptr);
+        }).handle_exception([this] (auto eptr) {
+            manager_logger.debug("hint_sender[{}]:flush_maybe: Failed with {}", _ep_key, eptr);
            return make_ready_future<>();
        });
    }
@@ -115,7 +115,7 @@ const column_mapping& hint_sender::get_column_mapping(lw_shared_ptr<send_one_fil
            throw no_column_mapping(fm.schema_version());
        }

-        manager_logger.debug("new schema version {}", fm.schema_version());
+        manager_logger.trace("hint_sender[{}]:get_column_mapping: new schema version {}", _ep_key, fm.schema_version());
        cm_it = ctx_ptr->schema_ver_to_column_mapping.emplace(fm.schema_version(), *hr.get_column_mapping()).first;
    }

@@ -175,23 +175,22 @@ future<> hint_sender::stop(drain should_drain) noexcept {
            //
            // The next call for send_hints_maybe() will send the last hints to the current end point and when it is
            // done there is going to be no more pending hints and the corresponding hints directory may be removed.
-            manager_logger.trace("Draining for {}: start", end_point_key());
+            manager_logger.trace("hint_sender[{}]:stop: Draining starts", end_point_key());
            set_draining();
            send_hints_maybe();
-            _ep_manager.flush_current_hints().handle_exception([] (auto e) {
-                manager_logger.error("Failed to flush pending hints: {}. Ignoring...", e);
+            _ep_manager.flush_current_hints().handle_exception([this] (auto e) {
+                manager_logger.error("hint_sender[{}]:stop: Failed to flush pending hints: {}. Ignoring", _ep_key, e);
            }).get();
            send_hints_maybe();
-            manager_logger.trace("Draining for {}: end", end_point_key());
+            manager_logger.trace("hint_sender[{}]:stop: Draining finished", end_point_key());
        }
-        // TODO: Change this log to match the class name, but first make sure no test
-        //       relies on the old one.
-        manager_logger.trace("ep_manager({})::sender: exiting", end_point_key());
+
+        manager_logger.debug("hint_sender[{}]:stop: Finished", end_point_key());
    });
 }

 void hint_sender::cancel_draining() {
-    manager_logger.info("Draining of {} has been marked as canceled", _ep_key);
+    manager_logger.info("hint_sender[{}]:cancel_draining: Marking as canceled", _ep_key);
    if (_state.contains(state::draining)) {
        _state.remove(state::draining);
    }
@@ -222,9 +221,8 @@ void hint_sender::start() {

    attr.sched_group = _hints_cpu_sched_group;
    _stopped = seastar::async(std::move(attr), [this] {
-        // TODO: Change this log to match the class name, but first make sure no test
-        //       relies on the old one.
-        manager_logger.trace("ep_manager({})::sender: started", end_point_key());
+        manager_logger.debug("hint_sender[{}]:start: Starting", end_point_key());
+
        while (!stopping()) {
            try {
                flush_maybe().get();
@@ -237,34 +235,36 @@ void hint_sender::start() {
                break;
            } catch (...) {
                // log and keep on spinning
-                // TODO: Change this log to match the class name, but first make sure no test
-                //       relies on the old one.
-                manager_logger.trace("sender: got the exception: {}", std::current_exception());
+                manager_logger.debug("hint_sender[{}]:start: Exception in the loop: {}", _ep_key, std::current_exception());
            }
        }
+
+        manager_logger.debug("hint_sender[{}]:start: Exited the loop", _ep_key);
    });
 }

 future<> hint_sender::send_one_mutation(frozen_mutation_and_schema m) {
    auto ermp = _db.find_column_family(m.s).get_effective_replication_map();
    auto token = dht::get_token(*m.s, m.fm.key());
-    host_id_vector_replica_set natural_endpoints = ermp->get_natural_replicas(std::move(token));
+    host_id_vector_replica_set natural_endpoints = ermp->get_natural_replicas(token);
+    host_id_vector_topology_change pending_endpoints  = ermp->get_pending_replicas(token);

-    return futurize_invoke([this, m = std::move(m), ermp = std::move(ermp), &natural_endpoints] () mutable -> future<> {
+    return futurize_invoke([this, m = std::move(m), ermp = std::move(ermp), &natural_endpoints, &pending_endpoints] () mutable -> future<> {
        // The fact that we send with CL::ALL in both cases below ensures that new hints are not going
        // to be generated as a result of hints sending.
        const auto& tm = ermp->get_token_metadata();
        const auto dst = end_point_key();

        if (std::ranges::contains(natural_endpoints, dst) && !tm.is_leaving(dst)) {
-            manager_logger.trace("Sending directly to {}", dst);
-            return _proxy.send_hint_to_endpoint(std::move(m), std::move(ermp), dst);
+            manager_logger.trace("hint_sender[{}]:send_one_mutation: Sending directly", dst);
+            // dst is not duplicated in pending_endpoints because it's in natural_endpoints
+            return _proxy.send_hint_to_endpoint(std::move(m), std::move(ermp), dst, std::move(pending_endpoints));
        } else {
            if (manager_logger.is_enabled(log_level::trace)) {
                if (tm.is_leaving(end_point_key())) {
-                    manager_logger.trace("The original target endpoint {} is leaving. Mutating from scratch...", dst);
+                    manager_logger.trace("hint_sender[{}]:send_one_mutation: Original target is leaving. Mutating from scratch", dst);
                } else {
-                    manager_logger.trace("Endpoints set has changed and {} is no longer a replica. Mutating from scratch...", dst);
+                    manager_logger.trace("hint_sender[{}]:send_one_mutation: Endpoint set has changed and original target is no longer a replica. Mutating from scratch", dst);
                }
            }
            return _proxy.send_hint_to_all_replicas(std::move(m));
@@ -288,9 +288,9 @@ future<> hint_sender::send_one_hint(lw_shared_ptr<send_one_file_ctx> ctx_ptr, fr
                // Files are aggregated for at most manager::hints_timer_period therefore the oldest hint there is
                // (last_modification - manager::hints_timer_period) old.
                if (const auto now = gc_clock::now().time_since_epoch(); now - secs_since_file_mod > gc_grace_sec - manager::hints_flush_period) {
-                    manager_logger.debug("send_hints(): the hint is too old, skipping it, "
+                    manager_logger.trace("hint_sender[{}]:send_hints: Hint is too old, skipping it, "
                        "secs since file last modification {}, gc_grace_sec {}, hints_flush_period {}",
-                        now - secs_since_file_mod, gc_grace_sec, manager::hints_flush_period);
+                        _ep_key, now - secs_since_file_mod, gc_grace_sec, manager::hints_flush_period);
                    return make_ready_future<>();
                }

@@ -299,24 +299,24 @@ future<> hint_sender::send_one_hint(lw_shared_ptr<send_one_file_ctx> ctx_ptr, fr
                    ++this->shard_stats().sent_total;
                    this->shard_stats().sent_hints_bytes_total += mutation_size;
                }).handle_exception([this, ctx_ptr] (auto eptr) {
-                    manager_logger.trace("send_one_hint(): failed to send to {}: {}", end_point_key(), eptr);
+                    manager_logger.trace("hint_sender[{}]:send_one_hint: Failed to send: {}", end_point_key(), eptr);
                    ++this->shard_stats().send_errors;
                    return make_exception_future<>(std::move(eptr));
                });

            // ignore these errors and move on - probably this hint is too old and the KS/CF has been deleted...
            } catch (replica::no_such_column_family& e) {
-                manager_logger.debug("send_hints(): no_such_column_family: {}", e.what());
+                manager_logger.debug("hint_sender[{}]:send_one_hint: no_such_column_family: {}", _ep_key, e.what());
                ++this->shard_stats().discarded;
            } catch (replica::no_such_keyspace& e) {
-                manager_logger.debug("send_hints(): no_such_keyspace: {}", e.what());
+                manager_logger.debug("hint_sender[{}]:send_one_hint: no_such_keyspace: {}", _ep_key, e.what());
                ++this->shard_stats().discarded;
            } catch (no_column_mapping& e) {
-                manager_logger.debug("send_hints(): {} at {}: {}", fname, rp, e.what());
+                manager_logger.debug("hint_sender[{}]:send_one_hint: no_column_mapping: {} at {}: {}", _ep_key, fname, rp, e.what());
                ++this->shard_stats().discarded;
            } catch (...) {
                auto eptr = std::current_exception();
-                manager_logger.debug("send_hints(): unexpected error in file {} at {}: {}", fname, rp, eptr);
+                manager_logger.debug("hint_sender[{}]:send_one_hint: Unexpected error in file {} at {}: {}", _ep_key, fname, rp, eptr);
                ++this->shard_stats().send_errors;
                return make_exception_future<>(std::move(eptr));
            }
@@ -338,21 +338,24 @@ future<> hint_sender::send_one_hint(lw_shared_ptr<send_one_file_ctx> ctx_ptr, fr
            }
            f.ignore_ready_future();
        });
-    }).handle_exception([ctx_ptr, rp] (auto eptr) {
-        manager_logger.trace("send_one_file(): Hmmm. Something bad had happened: {}", eptr);
+    }).handle_exception([this, ctx_ptr, rp] (auto eptr) {
+        manager_logger.trace("hint_sender[{}]:send_one_hint: Exception occurred: {}", _ep_key, eptr);
        ctx_ptr->on_hint_send_failure(rp);
    });
 }

 void hint_sender::notify_replay_waiters() noexcept {
    if (!_foreign_segments_to_replay.empty()) {
-        manager_logger.trace("[{}] notify_replay_waiters(): not notifying because there are still {} foreign segments to replay", end_point_key(), _foreign_segments_to_replay.size());
+        manager_logger.trace("hint_sender[{}]:notify_replay_waiters: Not notifying because there are still {} foreign segments to replay",
+                end_point_key(), _foreign_segments_to_replay.size());
        return;
    }

-    manager_logger.trace("[{}] notify_replay_waiters(): replay position upper bound was updated to {}", end_point_key(), _sent_upper_bound_rp);
+    manager_logger.trace("hint_sender[{}]:notify_replay_waiters: Replay position upper bound was updated to {}", end_point_key(), _sent_upper_bound_rp);
    while (!_replay_waiters.empty() && _replay_waiters.begin()->first < _sent_upper_bound_rp) {
-        manager_logger.trace("[{}] notify_replay_waiters(): notifying one ({} < {})", end_point_key(), _replay_waiters.begin()->first, _sent_upper_bound_rp);
+        manager_logger.trace("hint_sender[{}]:notify_replay_waiters: Notifying one ({} < {})",
+                end_point_key(), _replay_waiters.begin()->first, _sent_upper_bound_rp);
+
        auto ptr = _replay_waiters.begin()->second;
        (**ptr).set_value();
        (*ptr) = std::nullopt; // Prevent it from being resolved by abort source subscription
@@ -362,7 +365,7 @@ void hint_sender::notify_replay_waiters() noexcept {

 void hint_sender::dismiss_replay_waiters() noexcept {
    for (auto& p : _replay_waiters) {
-        manager_logger.debug("[{}] dismiss_replay_waiters(): dismissing one", end_point_key());
+        manager_logger.debug("hint_sender[{}]:dismiss_replay_waiters: Dismissing one", end_point_key());
        auto ptr = p.second;
        (**ptr).set_exception(std::runtime_error(format("Hints manager for {} is stopping", end_point_key())));
        (*ptr) = std::nullopt; // Prevent it from being resolved by abort source subscription
@@ -371,14 +374,15 @@ void hint_sender::dismiss_replay_waiters() noexcept {
 }

 future<> hint_sender::wait_until_hints_are_replayed_up_to(abort_source& as, db::replay_position up_to_rp) {
-    manager_logger.debug("[{}] wait_until_hints_are_replayed_up_to(): entering with target {}", end_point_key(), up_to_rp);
+    manager_logger.debug("hint_sender[{}]:wait_until_hints_are_replayed_up_to: Entering with target {}", end_point_key(), up_to_rp);
    if (_foreign_segments_to_replay.empty() && up_to_rp < _sent_upper_bound_rp) {
-        manager_logger.debug("[{}] wait_until_hints_are_replayed_up_to(): hints were already replayed above the point ({} < {})", end_point_key(), up_to_rp, _sent_upper_bound_rp);
+        manager_logger.debug("hint_sender[{}]:wait_until_hints_are_replayed_up_to: Hints were already replayed above the point ({} < {})",
+                end_point_key(), up_to_rp, _sent_upper_bound_rp);
        return make_ready_future<>();
    }

    if (as.abort_requested()) {
-        manager_logger.debug("[{}] wait_until_hints_are_replayed_up_to(): already aborted - stopping", end_point_key());
+        manager_logger.debug("hint_sender[{}]:wait_until_hints_are_replayed_up_to: Already aborted - stopping", end_point_key());
        return make_exception_future<>(abort_requested_exception());
    }

@@ -389,7 +393,7 @@ future<> hint_sender::wait_until_hints_are_replayed_up_to(abort_source& as, db::
            // The promise already was resolved by `notify_replay_waiters` and removed from the map
            return;
        }
-        manager_logger.debug("[{}] wait_until_hints_are_replayed_up_to(): abort requested - stopping", end_point_key());
+        manager_logger.debug("hint_sender[{}]:wait_until_hints_are_replayed_up_to: Abort requested - stopping", end_point_key());
        _replay_waiters.erase(it);
        (**ptr).set_exception(abort_requested_exception());
    });
@@ -398,7 +402,7 @@ future<> hint_sender::wait_until_hints_are_replayed_up_to(abort_source& as, db::
    // therefore we cannot capture `this`
    auto ep = end_point_key();
    return (**ptr).get_future().finally([sub = std::move(sub), ep] {
-        manager_logger.debug("[{}] wait_until_hints_are_replayed_up_to(): returning after the future was satisfied", ep);
+        manager_logger.debug("hint_sender[{}]:wait_until_hints_are_replayed_up_to: Returning after the future was satisfied", ep);
    });
 }

@@ -470,7 +474,7 @@ bool hint_sender::send_one_file(const sstring& fname) {
                }

                if (canceled_draining()) {
-                    manager_logger.debug("[{}] Exiting reading from commitlog because of canceled draining", _ep_key);
+                    manager_logger.debug("hint_sender[{}]:send_one_file: Exiting reading from commitlog because of canceled draining", _ep_key);
                    // We need to throw an exception here to cancel reading the segment.
                    throw canceled_draining_exception{};
                }
@@ -502,13 +506,15 @@ bool hint_sender::send_one_file(const sstring& fname) {
            };
        }, _last_not_complete_rp.pos, &_db.extensions()).get();
    } catch (db::commitlog::segment_error& ex) {
-        manager_logger.error("{}: {}. Dropping...", fname, ex.what());
+        manager_logger.error("hint_sender[{}]:send_one_file: Segment error in {}: {}. Last not complete position={}",
+                _ep_key, fname, ex.what(), _last_not_complete_rp);
        ctx_ptr->segment_replay_failed = false;
        ++this->shard_stats().corrupted_files;
    } catch  (const canceled_draining_exception&) {
-        manager_logger.debug("[{}] Loop in send_one_file finishes due to canceled draining", _ep_key);
+        manager_logger.debug("hint_sender[{}]:send_one_file: Loop in send_one_file finishes due to canceled draining", _ep_key);
    } catch (...) {
-        manager_logger.trace("sending of {} failed: {}", fname, std::current_exception());
+        manager_logger.debug("hint_sender[{}]:send_one_file: Sending of {} failed: {}. Last not complete position={}",
+                _ep_key, fname, std::current_exception(), _last_not_complete_rp);
        ctx_ptr->segment_replay_failed = true;
    }

@@ -523,7 +529,7 @@ bool hint_sender::send_one_file(const sstring& fname) {

    // If we are draining ignore failures and drop the segment even if we failed to send it.
    if (draining() && ctx_ptr->segment_replay_failed) {
-        manager_logger.trace("send_one_file(): we are draining so we are going to delete the segment anyway");
+        manager_logger.debug("hint_sender[{}]:send_one_file: We are draining, so we are going to delete the segment anyway", _ep_key);
        ctx_ptr->segment_replay_failed = false;
    }

@@ -533,7 +539,7 @@ bool hint_sender::send_one_file(const sstring& fname) {
        // If there was an error thrown by read_log_file function itself, we will retry sending from
        // the last hint that was successfully sent (last_succeeded_rp).
        _last_not_complete_rp = ctx_ptr->first_failed_rp.value_or(ctx_ptr->last_succeeded_rp.value_or(_last_not_complete_rp));
-        manager_logger.trace("send_one_file(): error while sending hints from {}, last RP is {}", fname, _last_not_complete_rp);
+        manager_logger.debug("hint_sender[{}]:send_one_file: Error while sending hints from {}, last RP is {}", _ep_key, fname, _last_not_complete_rp);
        return false;
    }

@@ -546,7 +552,7 @@ bool hint_sender::send_one_file(const sstring& fname) {
    // clear the replay position - we are going to send the next segment...
    _last_not_complete_rp = replay_position();
    _last_schema_ver_to_column_mapping.clear();
-    manager_logger.trace("send_one_file(): segment {} was sent in full and deleted", fname);
+    manager_logger.debug("hint_sender[{}]:send_one_file: Segment {} has been sent in full and deleted", _ep_key, fname);
    return true;
 }

@@ -572,14 +578,15 @@ void hint_sender::pop_current_segment() {
 // Runs in the seastar::async context
 void hint_sender::send_hints_maybe() noexcept {
    using namespace std::literals::chrono_literals;
-    manager_logger.trace("send_hints(): going to send hints to {}, we have {} segment to replay", end_point_key(), _segments_to_replay.size() + _foreign_segments_to_replay.size());
+    manager_logger.trace("hint_sender[{}]:send_hints_maybe: Going to send hints. We have {} segment to replay",
+            end_point_key(), _segments_to_replay.size() + _foreign_segments_to_replay.size());

    int replayed_segments_count = 0;

    try {
        while (true) {
            if (canceled_draining()) {
-                manager_logger.debug("[{}] Exiting loop in send_hints_maybe because of canceled draining", _ep_key);
+                manager_logger.debug("hint_sender[{}]:send_hints_maybe: Exiting loop in send_hints_maybe because of canceled draining", _ep_key);
                break;
            }
            const sstring* seg_name = name_of_current_segment();
@@ -598,7 +605,7 @@ void hint_sender::send_hints_maybe() noexcept {
    // Ignore exceptions, we will retry sending this file from where we left off the next time.
    // Exceptions are not expected here during the regular operation, so just log them.
    } catch (...) {
-        manager_logger.trace("send_hints(): got the exception: {}", std::current_exception());
+        manager_logger.debug("hint_sender[{}]:send_hints_maybe: Exception occurred while sending: {}", _ep_key, std::current_exception());
    }

    if (have_segments()) {
@@ -609,7 +616,7 @@ void hint_sender::send_hints_maybe() noexcept {
        _next_send_retry_tp = _next_flush_tp;
    }

-    manager_logger.trace("send_hints(): we handled {} segments", replayed_segments_count);
+    manager_logger.debug("hint_sender[{}]:send_hints_maybe: We handled {} segments", _ep_key, replayed_segments_count);
 }

 hint_stats& hint_sender::shard_stats() {
--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -505,20 +505,20 @@ bool manager::can_hint_for(endpoint_id ep) const noexcept {
    // hints where N is the total number nodes in the cluster.
    const auto hipf = hints_in_progress_for(ep);
    if (_stats.size_of_hints_in_progress > max_size_of_hints_in_progress() && hipf > 0) {
-        manager_logger.trace("size_of_hints_in_progress {} hints_in_progress_for({}) {}",
+        manager_logger.trace("can_hint_for: size_of_hints_in_progress {} hints_in_progress_for({}) {}",
                _stats.size_of_hints_in_progress, ep, hipf);
        return false;
    }

    // Check that the destination DC is "hintable".
    if (!check_dc_for(ep)) {
-        manager_logger.trace("{}'s DC is not hintable", ep);
+        manager_logger.trace("can_hint_for: {}'s DC is not hintable", ep);
        return false;
    }

    const bool node_is_alive = local_gossiper().get_endpoint_downtime(ep) <= _max_hint_window_us;
    if (!node_is_alive) {
-        manager_logger.trace("{} has been down for too long, not hinting", ep);
+        manager_logger.trace("can_hint_for: {} has been down for too long, not hinting", ep);
        return false;
    }

--- a/db/large_data_handler.cc
+++ b/db/large_data_handler.cc
@@ -148,7 +148,7 @@ cql_table_large_data_handler::cql_table_large_data_handler(gms::feature_service&

 template <typename... Args>
 future<> cql_table_large_data_handler::try_record(std::string_view large_table, const sstables::sstable& sst,  const sstables::key& partition_key, int64_t size,
-        std::string_view desc, std::string_view extra_path, const std::vector<sstring> &extra_fields, Args&&... args) const {
+        std::string_view size_desc, std::string_view desc, std::string_view extra_path, const std::vector<sstring> &extra_fields, Args&&... args) const {
    auto sys_ks = _sys_ks.get_permit();
    if (!sys_ks) {
        co_return;
@@ -168,7 +168,7 @@ future<> cql_table_large_data_handler::try_record(std::string_view large_table,
    const auto sstable_name = large_data_handler::sst_filename(sst);
    std::string pk_str = key_to_str(partition_key.to_partition_key(s), s);
    auto timestamp = db_clock::now();
-    large_data_logger.warn("Writing large {} {}/{}: {} ({} bytes) to {}", desc, ks_name, cf_name, extra_path, size, sstable_name);
+    large_data_logger.warn("Writing large {} {}/{}: {} ({}) to {}", desc, ks_name, cf_name, extra_path, size_desc, sstable_name);
    co_await sys_ks->execute_cql(req, ks_name, cf_name, sstable_name, size, pk_str, timestamp, args...)
            .discard_result()
            .handle_exception([ks_name, cf_name, large_table, sstable_name] (std::exception_ptr ep) {
@@ -184,12 +184,14 @@ future<> cql_table_large_data_handler::record_large_partitions(const sstables::s

 future<> cql_table_large_data_handler::internal_record_large_partitions(const sstables::sstable& sst, const sstables::key& key,
        uint64_t partition_size, uint64_t rows) const {
-    return try_record("partition", sst, key, int64_t(partition_size), "partition", "", {"rows"}, data_value((int64_t)rows));
+    const sstring size_desc = seastar::format("{} bytes/{} rows", partition_size, rows);
+    return try_record("partition", sst, key, int64_t(partition_size), size_desc, "partition", "", {"rows"}, data_value((int64_t)rows));
 }

 future<> cql_table_large_data_handler::internal_record_large_partitions_all_data(const sstables::sstable& sst, const sstables::key& key,
        uint64_t partition_size, uint64_t rows, uint64_t range_tombstones, uint64_t dead_rows) const {
-    return try_record("partition", sst, key, int64_t(partition_size), "partition", "", {"rows", "range_tombstones", "dead_rows"},
+    const sstring size_desc = seastar::format("{} bytes/{} rows", partition_size, rows);
+    return try_record("partition", sst, key, int64_t(partition_size), size_desc, "partition", "", {"rows", "range_tombstones", "dead_rows"},
                data_value((int64_t)rows), data_value((int64_t)range_tombstones), data_value((int64_t)dead_rows));
 }

@@ -203,13 +205,14 @@ future<> cql_table_large_data_handler::internal_record_large_cells(const sstable
    auto column_name = cdef.name_as_text();
    std::string_view cell_type = cdef.is_atomic() ? "cell" : "collection";
    static const std::vector<sstring> extra_fields{"clustering_key", "column_name"};
+    const sstring size_desc = seastar::format("{} bytes", cell_size);
    if (clustering_key) {
        const schema &s = *sst.get_schema();
        auto ck_str = key_to_str(*clustering_key, s);
-        return try_record("cell", sst, partition_key, int64_t(cell_size), cell_type, column_name, extra_fields, ck_str, column_name);
+        return try_record("cell", sst, partition_key, int64_t(cell_size), size_desc, cell_type, column_name, extra_fields, ck_str, column_name);
    } else {
        auto desc = seastar::format("static {}", cell_type);
-        return try_record("cell", sst, partition_key, int64_t(cell_size), desc, column_name, extra_fields, data_value::make_null(utf8_type), column_name);
+        return try_record("cell", sst, partition_key, int64_t(cell_size), size_desc, desc, column_name, extra_fields, data_value::make_null(utf8_type), column_name);
    }
 }

@@ -217,26 +220,28 @@ future<> cql_table_large_data_handler::internal_record_large_cells_and_collectio
        const clustering_key_prefix* clustering_key, const column_definition& cdef, uint64_t cell_size, uint64_t collection_elements) const {
    auto column_name = cdef.name_as_text();
    std::string_view cell_type = cdef.is_atomic() ? "cell" : "collection";
+    const sstring size_desc = seastar::format("{} bytes", cell_size);
    static const std::vector<sstring> extra_fields{"clustering_key", "column_name", "collection_elements"};
    if (clustering_key) {
        const schema &s = *sst.get_schema();
        auto ck_str = key_to_str(*clustering_key, s);
-        return try_record("cell", sst, partition_key, int64_t(cell_size), cell_type, column_name, extra_fields, ck_str, column_name, data_value((int64_t)collection_elements));
+        return try_record("cell", sst, partition_key, int64_t(cell_size), size_desc, cell_type, column_name, extra_fields, ck_str, column_name, data_value((int64_t)collection_elements));
    } else {
        auto desc = seastar::format("static {}", cell_type);
-        return try_record("cell", sst, partition_key, int64_t(cell_size), desc, column_name, extra_fields, data_value::make_null(utf8_type), column_name, data_value((int64_t)collection_elements));
+        return try_record("cell", sst, partition_key, int64_t(cell_size), size_desc, desc, column_name, extra_fields, data_value::make_null(utf8_type), column_name, data_value((int64_t)collection_elements));
    }
 }

 future<> cql_table_large_data_handler::record_large_rows(const sstables::sstable& sst, const sstables::key& partition_key,
        const clustering_key_prefix* clustering_key, uint64_t row_size) const {
    static const std::vector<sstring> extra_fields{"clustering_key"};
+    const sstring size_desc = seastar::format("{} bytes", row_size);
    if (clustering_key) {
        const schema &s = *sst.get_schema();
        std::string ck_str = key_to_str(*clustering_key, s);
-        return try_record("row", sst, partition_key, int64_t(row_size), "row", "", extra_fields, ck_str);
+        return try_record("row", sst, partition_key, int64_t(row_size), size_desc, "row", "", extra_fields, ck_str);
    } else {
-        return try_record("row", sst, partition_key, int64_t(row_size), "static row", "", extra_fields, data_value::make_null(utf8_type));
+        return try_record("row", sst, partition_key, int64_t(row_size), size_desc, "static row", "", extra_fields, data_value::make_null(utf8_type));
    }
 }

--- a/db/large_data_handler.hh
+++ b/db/large_data_handler.hh
@@ -188,7 +188,7 @@ private:
 private:
    template <typename... Args>
    future<> try_record(std::string_view large_table, const sstables::sstable& sst,  const sstables::key& partition_key, int64_t size,
-            std::string_view desc, std::string_view extra_path, const std::vector<sstring> &extra_fields, Args&&... args) const;
+            std::string_view size_desc, std::string_view desc, std::string_view extra_path, const std::vector<sstring> &extra_fields, Args&&... args) const;
 };

 class nop_large_data_handler : public large_data_handler {
--- a/db/system_keyspace.cc
+++ b/db/system_keyspace.cc
@@ -11,9 +11,11 @@
 #include <boost/functional/hash.hpp>
 #include <boost/icl/interval_map.hpp>
 #include <fmt/ranges.h>
+#include <ranges>

 #include <seastar/core/coroutine.hh>
 #include <seastar/coroutine/parallel_for_each.hh>
+#include <seastar/core/loop.hh>
 #include <seastar/core/on_internal_error.hh>
 #include "system_keyspace.hh"
 #include "cql3/untyped_result_set.hh"
@@ -36,6 +38,7 @@
 #include "db/schema_tables.hh"
 #include "gms/generation-number.hh"
 #include "service/storage_service.hh"
+#include "service/storage_proxy.hh"
 #include "service/paxos/paxos_state.hh"
 #include "query-result-set.hh"
 #include "idl/frozen_mutation.dist.hh"
@@ -351,6 +354,7 @@ schema_ptr system_keyspace::raft() {

            .set_comment("Persisted RAFT log, votes and snapshot info")
            .with_hash_version()
+            .set_caching_options(caching_options::get_disabled_caching_options())
            .build();
    }();
    return schema;
@@ -763,6 +767,35 @@ schema_ptr system_keyspace::large_cells() {
    return large_cells;
 }

+schema_ptr system_keyspace::corrupt_data() { // schema of table CORRUPT_DATA in keyspace NAME; built once per thread (static thread_local) and reused
+    static thread_local auto corrupt_data = [] {
+        auto id = generate_legacy_id(NAME, CORRUPT_DATA);
+        return schema_builder(NAME, CORRUPT_DATA, id)
+                // partition key
+                .with_column("keyspace_name", utf8_type, column_kind::partition_key)
+                .with_column("table_name", utf8_type, column_kind::partition_key)
+                // clustering key
+                .with_column("id", timeuuid_type, column_kind::clustering_key)
+                // regular rows
+                // Storing keys as bytes: having a corrupt key might be the reason
+                // to record the row as corrupt, so we just dump what we have and
+                // leave interpreting to the lucky person investigating the disaster.
+                .with_column("partition_key", bytes_type)
+                .with_column("clustering_key", bytes_type)
+                // Note: mutation-fragment v2
+                .with_column("mutation_fragment_kind", utf8_type)
+                .with_column("frozen_mutation_fragment", bytes_type)
+                .with_column("origin", utf8_type)
+                .with_column("sstable_name", utf8_type)
+                // options
+                .set_comment("mutation-fragments found to be corrupted")
+                .set_gc_grace_seconds(0)
+                .with_hash_version()
+                .build();
+    }();
+    return corrupt_data;
+}
+
 static constexpr auto schema_gc_grace = std::chrono::duration_cast<std::chrono::seconds>(days(7)).count();

 /*static*/ schema_ptr system_keyspace::scylla_local() {
@@ -1649,6 +1682,12 @@ future<> system_keyspace::peers_table_read_fixup() {
            continue;
        }
        const auto host_id = row.get_as<utils::UUID>("host_id");
+        if (!host_id) {
+            slogger.error("Peer {} has null host_id in system.{}, the record is broken, removing it",
+                peer, system_keyspace::PEERS);
+            co_await remove_endpoint(gms::inet_address{peer});
+            continue;
+        }
        const auto ts = row.get_as<int64_t>("ts");
        const auto it = map.find(host_id);
        if (it == map.end()) {
@@ -1712,8 +1751,15 @@ future<> system_keyspace::drop_truncation_rp_records() {
    auto rs = co_await execute_cql(req);

    bool any = false;
-    co_await coroutine::parallel_for_each(*rs, [&] (const cql3::untyped_result_set_row& row) -> future<> {
+    std::unordered_set<table_id> to_delete;
+    auto db = _qp.db();
+    auto max_concurrency = std::min(1024u, smp::count * 8);
+    co_await seastar::max_concurrent_for_each(*rs, max_concurrency, [&] (const cql3::untyped_result_set_row& row) -> future<> {
        auto table_uuid = table_id(row.get_as<utils::UUID>("table_uuid"));
+        if (!db.try_find_table(table_uuid)) {
+            to_delete.emplace(table_uuid);
+            co_return;
+        }
        auto shard = row.get_as<int32_t>("shard");
        auto segment_id = row.get_as<int64_t>("segment_id");

@@ -1723,11 +1769,26 @@ future<> system_keyspace::drop_truncation_rp_records() {
            co_await execute_cql(req);
        }
    });
+    if (!to_delete.empty()) {
+        // IN has a limit to how many values we can put into it.
+        for (auto&& chunk : to_delete | std::views::transform(&table_id::to_sstring) | std::views::chunk(100)) {
+            auto str = std::ranges::to<std::string>(chunk | std::views::join_with(','));
+            auto req = fmt::format("DELETE FROM system.{} WHERE table_uuid IN ({})", TRUNCATED, str);
+            co_await execute_cql(req);
+        }
+        any = true;
+    }
    if (any) {
        co_await force_blocking_flush(TRUNCATED);
    }
 }

+future<> system_keyspace::remove_truncation_records(table_id id) {
+    // Delete every truncation record stored for `id`, then force a blocking flush of the table.
+    co_await execute_cql(format("DELETE FROM system.{} WHERE table_uuid = {}", TRUNCATED, id));
+    co_await force_blocking_flush(TRUNCATED);
+}
+
 future<> system_keyspace::save_truncation_record(const replica::column_family& cf, db_clock::time_point truncated_at, db::replay_position rp) {
    sstring req = format("INSERT INTO system.{} (table_uuid, shard, position, segment_id, truncated_at) VALUES(?,?,?,?,?)", TRUNCATED);
    co_await _qp.execute_internal(req, {cf.schema()->id().uuid(), int32_t(rp.shard_id()), int32_t(rp.pos), int64_t(rp.base_id()), truncated_at}, cql3::query_processor::cache_internal::yes);
@@ -2110,7 +2171,59 @@ future<> system_keyspace::update_peer_info(gms::inet_address ep, locator::host_i

    slogger.debug("{}: values={}", query, values);

-    co_await _qp.execute_internal(query, db::consistency_level::ONE, values, cql3::query_processor::cache_internal::yes);
+    const auto guard = co_await get_units(_peers_cache_lock, 1);
+    try {
+        co_await _qp.execute_internal(query, db::consistency_level::ONE, values, cql3::query_processor::cache_internal::yes);
+        if (auto* cache = get_peers_cache()) {
+            cache->host_id_to_inet_ip[hid] = ep;
+            cache->inet_ip_to_host_id[ep] = hid;
+        }
+    } catch (...) {
+        _peers_cache = nullptr;
+        throw;
+    }
+}
+
+system_keyspace::peers_cache* system_keyspace::get_peers_cache() {
+    // Hand out the cached peers snapshot, discarding it first if its
+    // expiration time has already passed (nullptr means "nothing cached").
+    if (_peers_cache && lowres_clock::now() > _peers_cache->expiration_time) {
+        _peers_cache = nullptr;
+    }
+    return _peers_cache.get();
+}
+
+future<lw_shared_ptr<const system_keyspace::peers_cache>> system_keyspace::get_or_load_peers_cache() { // returns the current peers cache, rebuilding it from load_host_ids() when none is cached or it has expired
+    const auto guard = co_await get_units(_peers_cache_lock, 1); // take one unit of _peers_cache_lock; held until this coroutine finishes
+    if (auto* cache = get_peers_cache()) {
+        co_return cache->shared_from_this();
+    }
+    auto cache = make_lw_shared<peers_cache>();
+    cache->inet_ip_to_host_id = co_await load_host_ids();
+    cache->host_id_to_inet_ip.reserve(cache->inet_ip_to_host_id.size());
+    for (const auto [ip, id]: cache->inet_ip_to_host_id) { // build the reverse (host_id -> IP) index
+        const auto [it, inserted] = cache->host_id_to_inet_ip.insert({id, ip});
+        if (!inserted) { // the same host_id was already inserted for another IP
+            on_internal_error(slogger, ::format("duplicate IP for host_id {}, first IP {}, second IP {}",
+                id, it->second, ip));
+        }
+    }
+    cache->expiration_time = lowres_clock::now() + std::chrono::milliseconds(200); // get_peers_cache() drops the cache once this point has passed
+    _peers_cache = cache;
+    co_return std::move(cache);
+}
+
+future<std::optional<gms::inet_address>> system_keyspace::get_ip_from_peers_table(locator::host_id id) {
+    // Resolve `id` through the peers cache; an unknown host id yields std::nullopt.
+    const auto peers = co_await get_or_load_peers_cache();
+    const auto& by_host_id = peers->host_id_to_inet_ip;
+    const auto pos = by_host_id.find(id);
+    co_return pos == by_host_id.end() ? std::nullopt : std::optional<gms::inet_address>(pos->second);
+}
+
+future<system_keyspace::host_id_to_ip_map_t> system_keyspace::get_host_id_to_ip_map() { // returns the host_id -> IP map held by the peers cache
+    const auto cache = co_await get_or_load_peers_cache();
+    co_return cache->host_id_to_inet_ip; // returned by value: the caller gets its own copy of the map
 }

 template <typename T>
@@ -2160,7 +2273,22 @@ future<> system_keyspace::update_schema_version(table_schema_version version) {
 future<> system_keyspace::remove_endpoint(gms::inet_address ep) {
    const sstring req = format("DELETE FROM system.{} WHERE peer = ?", PEERS);
    slogger.debug("DELETE FROM system.{} WHERE peer = {}", PEERS, ep);
-    co_await execute_cql(req, ep.addr()).discard_result();
+
+    const auto guard = co_await get_units(_peers_cache_lock, 1);
+    try {
+        co_await execute_cql(req, ep.addr()).discard_result();
+        if (auto* cache = get_peers_cache()) {
+            const auto it = cache->inet_ip_to_host_id.find(ep);
+            if (it != cache->inet_ip_to_host_id.end()) {
+                const auto id = it->second;
+                cache->inet_ip_to_host_id.erase(it);
+                cache->host_id_to_inet_ip.erase(id);
+            }
+        }
+    } catch (...) {
+        _peers_cache = nullptr;
+        throw;
+    }
 }

 future<> system_keyspace::update_tokens(const std::unordered_set<dht::token>& tokens) {
@@ -2312,6 +2440,7 @@ std::vector<schema_ptr> system_keyspace::all_tables(const db::config& cfg) {
                    peers(), peer_events(), range_xfers(),
                    compactions_in_progress(), compaction_history(),
                    sstable_activity(), size_estimates(), large_partitions(), large_rows(), large_cells(),
+                    corrupt_data(),
                    scylla_local(), db::schema_tables::scylla_table_schema_history(),
                    repair_history(),
                    v3::views_builds_in_progress(), v3::built_views(),
@@ -3573,4 +3702,12 @@ future<::shared_ptr<cql3::untyped_result_set>> system_keyspace::execute_cql(cons
    return _qp.execute_internal(query_string, values, cql3::query_processor::cache_internal::yes);
 }

+// Applies `m` through `_qp.proxy().mutate_locally()` with no timeout.
+future<> system_keyspace::apply_mutation(mutation m) {
+    if (m.schema()->ks_name() != NAME) { // only mutations for tables of keyspace `NAME` are accepted
+        on_internal_error(slogger, fmt::format("system_keyspace::apply_mutation(): attempted to apply mutation belonging to table {}.{}", m.schema()->ks_name(), m.schema()->cf_name()));
+    }
+    return _qp.proxy().mutate_locally(m, {}, db::commitlog::force_sync(m.schema()->static_props().wait_for_sync_to_commitlog), db::no_timeout); // force_sync follows the schema's wait_for_sync_to_commitlog property
+}
+
 } // namespace db
--- a/db/system_keyspace.hh
+++ b/db/system_keyspace.hh
@@ -142,6 +142,7 @@ class system_keyspace : public seastar::peering_sharded_service<system_keyspace>
    static schema_ptr large_partitions();
    static schema_ptr large_rows();
    static schema_ptr large_cells();
+    static schema_ptr corrupt_data();
    static schema_ptr scylla_local();
    future<> force_blocking_flush(sstring cfname);
    // This function is called when the system.peers table is read,
@@ -153,6 +154,17 @@ class system_keyspace : public seastar::peering_sharded_service<system_keyspace>
    //  and this node crashes after adding a new IP but before removing the old one. The
    //  record with older timestamp is removed, the warning is written to the log.
    future<> peers_table_read_fixup();
+
+    struct peers_cache: public enable_lw_shared_from_this<peers_cache> { // in-memory copy of the IP <-> host_id mappings with an expiry time
+        std::unordered_map<gms::inet_address, locator::host_id> inet_ip_to_host_id; // IP -> host_id
+        std::unordered_map<locator::host_id, gms::inet_address> host_id_to_inet_ip; // host_id -> IP (reverse of the map above)
+        lowres_clock::time_point expiration_time; // point in time (lowres_clock) after which the cache counts as expired
+    };
+    lw_shared_ptr<peers_cache> _peers_cache;
+    semaphore _peers_cache_lock{1};
+    peers_cache* get_peers_cache();
+    future<lw_shared_ptr<const peers_cache>> get_or_load_peers_cache();
+
 public:
    static schema_ptr size_estimates();
 public:
@@ -174,6 +186,7 @@ public:
    static constexpr auto LARGE_PARTITIONS = "large_partitions";
    static constexpr auto LARGE_ROWS = "large_rows";
    static constexpr auto LARGE_CELLS = "large_cells";
+    static constexpr auto CORRUPT_DATA = "corrupt_data";
    static constexpr auto SCYLLA_LOCAL = "scylla_local";
    static constexpr auto RAFT = "raft";
    static constexpr auto RAFT_SNAPSHOTS = "raft_snapshots";
@@ -317,6 +330,12 @@ public:

    future<> update_peer_info(gms::inet_address ep, locator::host_id hid, const peer_info& info);

+    // Return ip of the peers table entry with given host id
+    future<std::optional<gms::inet_address>> get_ip_from_peers_table(locator::host_id id);
+
+    using host_id_to_ip_map_t = std::unordered_map<locator::host_id, gms::inet_address>;
+    future<host_id_to_ip_map_t> get_host_id_to_ip_map();
+
    future<> remove_endpoint(gms::inet_address ep);

    // Saves the key-value pair into system.scylla_local table.
@@ -424,6 +443,7 @@ public:
    future<> save_truncation_record(const replica::column_family&, db_clock::time_point truncated_at, db::replay_position);
    future<replay_positions> get_truncated_positions(table_id);
    future<> drop_truncation_rp_records();
+    future<> remove_truncation_records(table_id);

    // Converts a `dht::token_range` object to the left-open integer range (x,y] form.
    //
@@ -692,6 +712,10 @@ public:
        return execute_cql(req, { data_value(std::forward<Args>(args))... });
    }

+    // Apply write as mutation to the system keyspace.
+    // Mutation has to belong to a table in the system keyspace.
+    future<> apply_mutation(mutation m);
+
    friend future<column_mapping> db::schema_tables::get_column_mapping(db::system_keyspace& sys_ks, ::table_id table_id, table_schema_version version);
    friend future<bool> db::schema_tables::column_mapping_exists(db::system_keyspace& sys_ks, table_id table_id, table_schema_version version);
    friend future<> db::schema_tables::drop_column_mapping(db::system_keyspace& sys_ks, table_id table_id, table_schema_version version);
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -2680,9 +2680,16 @@ void view_builder::on_create_view(const sstring& ks_name, const sstring& view_na
            // threshold.
          return initialize_reader_at_current_token(step).then([this, view, &step] () mutable {
            return add_new_view(view, step).then_wrapped([this, view] (future<>&& f) {
-                if (f.failed()) {
-                    vlogger.error("Error setting up view for building {}.{}: {}", view->ks_name(), view->cf_name(), f.get_exception());
+                try {
+                    f.get();
+                } catch (abort_requested_exception&) {
+                    vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
+                } catch (raft::request_aborted&) {
+                    vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
+                } catch (...) {
+                    vlogger.error("Error setting up view for building {}.{}: {}", view->ks_name(), view->cf_name(), std::current_exception());
                }
+
                // Waited on indirectly in stop().
                (void)_build_step.trigger();
            });
@@ -3449,6 +3456,7 @@ void delete_ghost_rows_visitor::accept_new_row(const clustering_key& ck, const q
    auto view_exploded_ck = ck.explode();
    std::vector<bytes> base_exploded_pk(_base_schema->partition_key_size());
    std::vector<bytes> base_exploded_ck(_base_schema->clustering_key_size());
+    std::map<const column_definition*, bytes> view_key_cols_not_in_base_key;
    for (const column_definition& view_cdef : _view->all_columns()) {
        const column_definition* base_cdef = _base_schema->get_column_definition(view_cdef.name());
        if (base_cdef) {
@@ -3457,6 +3465,8 @@ void delete_ghost_rows_visitor::accept_new_row(const clustering_key& ck, const q
                base_exploded_pk[base_cdef->id] = view_exploded_key[view_cdef.id];
            } else if (base_cdef->is_clustering_key()) {
                base_exploded_ck[base_cdef->id] = view_exploded_key[view_cdef.id];
+            } else if (!base_cdef->is_computed() && view_cdef.is_primary_key()) {
+                view_key_cols_not_in_base_key[base_cdef] = view_exploded_key[view_cdef.id];
            }
        }
    }
@@ -3464,22 +3474,44 @@ void delete_ghost_rows_visitor::accept_new_row(const clustering_key& ck, const q
    clustering_key base_ck = clustering_key::from_exploded(base_exploded_ck);

    dht::partition_range_vector partition_ranges({dht::partition_range::make_singular(dht::decorate_key(*_base_schema, base_pk))});
-    auto selection = cql3::selection::selection::for_columns(_base_schema, std::vector<const column_definition*>({&_base_schema->partition_key_columns().front()}));
+    auto view_key_cols_not_in_base_key_cdefs = view_key_cols_not_in_base_key | std::views::keys | std::ranges::to<std::vector<const column_definition*>>();
+    auto selection = cql3::selection::selection::for_columns(_base_schema,
+        view_key_cols_not_in_base_key.empty() ? std::vector<const column_definition*>({&_base_schema->partition_key_columns().front()}) : view_key_cols_not_in_base_key_cdefs);

    std::vector<query::clustering_range> bounds{query::clustering_range::make_singular(base_ck)};
-    query::partition_slice partition_slice(std::move(bounds), {},  {}, selection->get_query_options());
+    utils::small_vector<column_id, 8> view_key_col_ids;
+    for (const auto& [col_def, _] : view_key_cols_not_in_base_key) {
+        view_key_col_ids.push_back(col_def->id);
+    }
+    query::partition_slice partition_slice(std::move(bounds), {}, std::move(view_key_col_ids), selection->get_query_options());
    auto command = ::make_lw_shared<query::read_command>(_base_schema->id(), _base_schema->version(), partition_slice,
            _proxy.get_max_result_size(partition_slice), query::tombstone_limit(_proxy.get_tombstone_limit()));
    auto timeout = db::timeout_clock::now() + _timeout_duration;
    service::storage_proxy::coordinator_query_options opts{timeout, _state.get_permit(), _state.get_client_state(), _state.get_trace_state()};
    auto base_qr = _proxy.query(_base_schema, command, std::move(partition_ranges), db::consistency_level::ALL, opts).get();
    query::result& result = *base_qr.query_result;
-    if (result.row_count().value_or(0) == 0) {
+    auto delete_ghost_row = [&]() {
        mutation m(_view, *_view_pk);
        auto& row = m.partition().clustered_row(*_view, ck);
        row.apply(tombstone(api::new_timestamp(), gc_clock::now()));
        timeout = db::timeout_clock::now() + _timeout_duration;
        _proxy.mutate({m}, db::consistency_level::ALL, timeout, _state.get_trace_state(), empty_service_permit(), db::allow_per_partition_rate_limit::no).get();
+    };
+    if (result.row_count().value_or(0) == 0) {
+        delete_ghost_row();
+    } else if (!view_key_cols_not_in_base_key.empty()) {
+        if (result.row_count().value_or(0) != 1) {
+            on_internal_error(vlogger, format("Got multiple base rows corresponding to a single view row when pruning {}.{}", _view->ks_name(), _view->cf_name()));
+        }
+        auto results = query::result_set::from_raw_result(_base_schema, partition_slice, result);
+        auto& base_row = results.row(0);
+        for (const auto& [col_def, col_val] : view_key_cols_not_in_base_key) {
+            const data_value* base_val = base_row.get_data_value(col_def->name_as_text());
+            if (!base_val || base_val->is_null() || col_val != base_val->serialize_nonnull()) {
+                delete_ghost_row();
+                break;
+            }
+        }
    }
 }

--- a/db/virtual_tables.cc
+++ b/db/virtual_tables.cc
@@ -165,7 +165,7 @@ public:
        });
    }

-    future<> emit_ring(result_collector& result, const dht::decorated_key& dk, const sstring& table_name, std::vector<dht::token_range_endpoints> ranges) {
+    future<> emit_ring(result_collector& result, const dht::decorated_key& dk, const sstring& table_name, utils::chunked_vector<dht::token_range_endpoints> ranges) {

        co_await result.emit_partition_start(dk);
        std::ranges::sort(ranges, std::ranges::less(), std::mem_fn(&dht::token_range_endpoints::_start_token));
@@ -219,11 +219,11 @@ public:
                        co_return;
                    }
                    const auto& table_name = table->schema()->cf_name();
-                    std::vector<dht::token_range_endpoints> ranges = co_await _ss.describe_ring_for_table(e.name, table_name);
+                    utils::chunked_vector<dht::token_range_endpoints> ranges = co_await _ss.describe_ring_for_table(e.name, table_name);
                    co_await emit_ring(result, e.key, table_name, std::move(ranges));
                });
            } else {
-                std::vector<dht::token_range_endpoints> ranges = co_await _ss.describe_ring(e.name);
+                utils::chunked_vector<dht::token_range_endpoints> ranges = co_await _ss.describe_ring(e.name);
                co_await emit_ring(result, e.key, "<ALL>", std::move(ranges));
            }
        }
--- a/dht/i_partitioner_fwd.hh
+++ b/dht/i_partitioner_fwd.hh
@@ -10,6 +10,7 @@
 #pragma once
 #include <vector>
 #include "interval.hh"
+#include "utils/chunked_vector.hh"

 namespace sstables {

@@ -29,7 +30,7 @@ using partition_range = interval<ring_position>;
 using token_range = interval<token>;

 using partition_range_vector = std::vector<partition_range>;
-using token_range_vector = std::vector<token_range>;
+using token_range_vector = utils::chunked_vector<token_range>;

 class decorated_key;

--- a/dist/common/scripts/scylla_sysconfig_setup
+++ b/dist/common/scripts/scylla_sysconfig_setup
@@ -86,9 +86,9 @@ if __name__ == '__main__':
    ethpciid = ''
    if network_mode == 'dpdk':
        dpdk_status = out('/opt/scylladb/scripts/dpdk-devbind.py --status')
-        match = re.search('if={} drv=(\S+)'.format(ifname), dpdk_status, flags=re.MULTILINE)
+        match = re.search(r'if={} drv=(\S+)'.format(ifname), dpdk_status, flags=re.MULTILINE)
        ethdrv = match.group(1)
-        match = re.search('^(\\S+:\\S+:\\S+\.\\S+) [^\n]+ if={} '.format(ifname), dpdk_status, flags=re.MULTILINE)
+        match = re.search(r'^(\S+:\S+:\S+\.\S+) [^\n]+ if={} '.format(ifname), dpdk_status, flags=re.MULTILINE)
        ethpciid = match.group(1)

    if args.mode:
--- a/dist/debian/control.template
+++ b/dist/debian/control.template
@@ -18,7 +18,7 @@ Breaks: scylla-enterprise-conf (<< 2025.1.0~)

 Package: %{product}-server
 Architecture: any
-Depends: ${misc:Depends}, %{product}-conf (= ${binary:Version}), %{product}-python3 (= ${binary:Version})
+Depends: ${misc:Depends}, %{product}-conf (= ${binary:Version}), %{product}-python3 (= ${binary:Version}), procps
 Replaces: %{product}-tools (<<5.5), scylla-enterprise-tools (<< 2024.2.0~), scylla-enterprise-server (<< 2025.1.0~)
 Breaks: %{product}-tools (<<5.5), scylla-enterprise-tools (<< 2024.2.0~), scylla-enterprise-server (<< 2025.1.0~)
 Description: Scylla database server binaries
--- a/dist/docker/redhat/build_docker.sh
+++ b/dist/docker/redhat/build_docker.sh
@@ -14,6 +14,15 @@ product="$(<build/SCYLLA-PRODUCT-FILE)"
 version="$(sed 's/-/~/' <build/SCYLLA-VERSION-FILE)"
 release="$(<build/SCYLLA-RELEASE-FILE)"

+original_version="$(<build/SCYLLA-VERSION-FILE)"
+if [[ "$original_version" == *"-dev"* ]]; then
+    repo_file_url="https://downloads.scylladb.com/unstable/scylla/master/rpm/centos/latest/scylla.repo"
+else
+    # Remove the last dot-separated component
+    repo_version="${original_version%.*}"
+    repo_file_url="https://downloads.scylladb.com/rpm/centos/scylla-$repo_version.repo"
+fi
+
 mode="release"

 arch="$(uname -m)"
@@ -88,8 +97,8 @@ bcp LICENSE-ScyllaDB-Source-Available.md /licenses/

 run microdnf clean all
 run microdnf --setopt=tsflags=nodocs -y update
-run microdnf --setopt=tsflags=nodocs -y install hostname python3 python3-pip kmod
-run microdnf clean all
+run microdnf --setopt=tsflags=nodocs -y install hostname kmod procps-ng python3 python3-pip
+run curl -L --output /etc/yum.repos.d/scylla.repo ${repo_file_url}
 run pip3 install --no-cache-dir --prefix /usr supervisor
 run bash -ec "echo LANG=C.UTF-8 > /etc/locale.conf"
 run bash -ec "rpm -ivh packages/*.rpm"
--- a/dist/redhat/scylla.spec
+++ b/dist/redhat/scylla.spec
@@ -71,6 +71,7 @@ Group:          Applications/Databases
 Summary:        The Scylla database server
 Requires:       %{product}-conf = %{version}-%{release}
 Requires:       %{product}-python3 = %{version}-%{release}
+Requires:       procps-ng
 AutoReqProv:    no
 Provides:       %{product}-tools:%{_bindir}/nodetool
 Provides:       %{product}-tools:%{_sysconfigdir}/bash_completion.d/nodetool-completion
--- a/docs/_ext/utils.py
+++ b/docs/_ext/utils.py
@@ -22,6 +22,8 @@ def readable_desc_rst(description):

        cleaned_line = line.replace('\\n', '\n')

+        cleaned_line = cleaned_line.replace('\\t', '\n' + indent * 2)
+        
        if line.endswith('"'):
            cleaned_line = cleaned_line[:-1] + ' '

--- a/docs/_static/data/os-support.json
+++ b/docs/_static/data/os-support.json
@@ -1,16 +1,25 @@
 {
    "Linux Distributions": {
-      "Ubuntu": ["20.04", "22.04", "24.04"],
-      "Debian": ["11"],
+      "Ubuntu": ["20.04 (deprecated)", "22.04", "24.04"],
+      "Debian": ["11", "12"],
      "Rocky / CentOS / RHEL": ["8", "9"],
      "Amazon Linux": ["2023"]
    },
    "ScyllaDB Versions": [
      {
-        "version": "Enterprise 2025.1",
+        "version": "ScyllaDB 2025.2",
        "supported_OS": {
-          "Ubuntu": ["20.04", "22.04", "24.04"],
-          "Debian": ["11"],
+          "Ubuntu": ["20.04 (deprecated)", "22.04", "24.04"],
+          "Debian": ["11", "12"],
+          "Rocky / CentOS / RHEL": ["8", "9"],
+          "Amazon Linux": ["2023"]
+        }
+      },
+      {
+        "version": "ScyllaDB 2025.1",
+        "supported_OS": {
+          "Ubuntu": ["20.04 (deprecated)", "22.04", "24.04"],
+          "Debian": ["11", "12"],
          "Rocky / CentOS / RHEL": ["8", "9"],
          "Amazon Linux": ["2023"]
        }
@@ -18,7 +27,7 @@
      {
        "version": "Enterprise 2024.2",
        "supported_OS": {
-          "Ubuntu": ["20.04", "22.04", "24.04"],
+          "Ubuntu": ["20.04 (deprecated)", "22.04", "24.04"],
          "Debian": ["11"],
          "Rocky / CentOS / RHEL": ["8", "9"],
          "Amazon Linux": ["2023"]
@@ -27,20 +36,11 @@
      {
        "version": "Enterprise 2024.1",
        "supported_OS": {
-          "Ubuntu": ["20.04", "22.04", "24.04*"],
+          "Ubuntu": ["20.04 (deprecated)", "22.04", "24.04*"],
          "Debian": ["11"],
          "Rocky / CentOS / RHEL": ["8", "9"],
          "Amazon Linux": []
        }
-      },
-      {
-        "version": "Open Source 6.2",
-        "supported_OS": {
-          "Ubuntu": ["20.04", "22.04", "24.04"],
-          "Debian": ["11"],
-          "Rocky / CentOS / RHEL": ["8", "9"],
-          "Amazon Linux": ["2023"]
-        }
      }
    ]
  }
--- a/docs/_utils/redirects.yaml
+++ b/docs/_utils/redirects.yaml
@@ -1,6 +1,27 @@
 ### a dictionary of redirections
 #old path: new path

+# Move the driver information to another project
+
+/stable/using-scylla/drivers/index.html: https://docs.scylladb.com/stable/drivers/index.html
+/stable/using-scylla/drivers/dynamo-drivers/index.html: https://docs.scylladb.com/stable/drivers/dynamo-drivers.html
+/stable/using-scylla/drivers/cql-drivers/index.html: https://docs.scylladb.com/stable/drivers/cql-drivers.html
+/stable/using-scylla/drivers/cql-drivers/scylla-python-driver.html: https://docs.scylladb.com/stable/drivers/cql-drivers.html
+/stable/using-scylla/drivers/cql-drivers/scylla-java-driver.html: https://docs.scylladb.com/stable/drivers/cql-drivers.html
+/stable/using-scylla/drivers/cql-drivers/scylla-go-driver.html: https://docs.scylladb.com/stable/drivers/cql-drivers.html
+/stable/using-scylla/drivers/cql-drivers/scylla-gocqlx-driver.html: https://docs.scylladb.com/stable/drivers/cql-drivers.html
+/stable/using-scylla/drivers/cql-drivers/scylla-cpp-driver.html: https://docs.scylladb.com/stable/drivers/cql-drivers.html
+/stable/using-scylla/drivers/cql-drivers/scylla-rust-driver.html: https://docs.scylladb.com/stable/drivers/cql-drivers.html
+
+# Redirect 2025.1 upgrade guides that are not on master but were indexed by Google (404 reported)
+
+/master/upgrade/upgrade-guides/upgrade-guide-from-2024.x-to-2025.1/upgrade-guide-from-2024.x-to-2025.1.html: https://docs.scylladb.com/manual/stable/upgrade/index.html
+/master/upgrade/upgrade-guides/upgrade-guide-from-6.2-to-2025.1/index.html: https://docs.scylladb.com/manual/stable/upgrade/index.html
+
+# Remove redundant pages
+
+/stable/getting-started/tutorials: https://docs.scylladb.com/stable/get-started/develop-with-scylladb/tutorials-example-projects.html
+/stable/contribute: https://github.com/scylladb/scylladb/blob/master/CONTRIBUTING.md

 # Remove an oudated article

@@ -157,3 +178,5 @@
 /stable/upgrade/upgrade-opensource/upgrade-guide-from-4.5-to-4.6/upgrade-guide-from-4.5-to-4.6-debian-10.html: /stable/upgrade/index.html
 /stable/upgrade/upgrade-opensource/upgrade-guide-from-4.5-to-4.6/metric-update-4.5-to-4.6.html: /stable/upgrade/index.html

+# Fixed typo in the file name
+/stable/operating-scylla/nodetool-commands/enbleautocompaction.html: /stable/operating-scylla/nodetool-commands/enableautocompaction.html
--- a/docs/architecture/raft.rst
+++ b/docs/architecture/raft.rst
@@ -58,112 +58,12 @@ of nodes in the cluster is available. The following examples illustrate how Raft

 In summary, Raft makes schema changes safe, but it requires that a quorum of nodes in the cluster is available.

-.. _verify-raft-procedure:
-
-Verifying that the Raft upgrade procedure finished successfully
-========================================================================
-
-You may need to perform the following procedure as part of
-the :ref:`manual recovery procedure <recovery-procedure>`.
-
-The Raft upgrade procedure requires **full cluster availability** to correctly setup the Raft algorithm; after the setup finishes, Raft can proceed with only a majority of nodes, but this initial setup is an exception.
-An unlucky event, such as a hardware failure, may cause one of your nodes to fail. If this happens before the Raft upgrade procedure finishes, the procedure will get stuck and your intervention will be required.
-
-To verify that the procedure finishes, look at the log of every ScyllaDB node (using ``journalctl _COMM=scylla``). Search for the following patterns:
-
-* ``Starting internal upgrade-to-raft procedure`` denotes the start of the procedure,
-* ``Raft upgrade finished`` denotes the end.
-
-The following is an example of a log from a node which went through the procedure correctly. Some parts were truncated for brevity:
-
-.. code-block:: console
-
-    features - Feature SUPPORTS_RAFT_CLUSTER_MANAGEMENT is enabled
-    raft_group0 - finish_setup_after_join: SUPPORTS_RAFT feature enabled. Starting internal upgrade-to-raft procedure.
-    raft_group0_upgrade - starting in `use_pre_raft_procedures` state.
-    raft_group0_upgrade - Waiting until everyone is ready to start upgrade...
-    raft_group0_upgrade - Joining group 0...
-    raft_group0 - server 624fa080-8c0e-4e3d-acf6-10af473639ca joined group 0 with group id 8f8a1870-5c4e-11ed-bb13-fe59693a23c9
-    raft_group0_upgrade - Waiting until every peer has joined Raft group 0...
-    raft_group0_upgrade - Every peer is a member of Raft group 0.
-    raft_group0_upgrade - Waiting for schema to synchronize across all nodes in group 0...
-    raft_group0_upgrade - synchronize_schema: my version: a37a3b1e-5251-3632-b6b4-a9468a279834
-    raft_group0_upgrade - synchronize_schema: schema mismatches: {}. 3 nodes had a matching version.
-    raft_group0_upgrade - synchronize_schema: finished.
-    raft_group0_upgrade - Entering synchronize state.
-    raft_group0_upgrade - Schema changes are disabled in synchronize state. If a failure makes us unable to proceed, manual recovery will be required.
-    raft_group0_upgrade - Waiting for all peers to enter synchronize state...
-    raft_group0_upgrade - All peers in synchronize state. Waiting for schema to synchronize...
-    raft_group0_upgrade - synchronize_schema: collecting schema versions from group 0 members...
-    raft_group0_upgrade - synchronize_schema: collected remote schema versions.
-    raft_group0_upgrade - synchronize_schema: my version: a37a3b1e-5251-3632-b6b4-a9468a279834
-    raft_group0_upgrade - synchronize_schema: schema mismatches: {}. 3 nodes had a matching version.
-    raft_group0_upgrade - synchronize_schema: finished.
-    raft_group0_upgrade - Schema synchronized.
-    raft_group0_upgrade - Raft upgrade finished.
-
-In a functioning cluster with good network connectivity the procedure should take no more than a few seconds.
-Network issues may cause the procedure to take longer, but if all nodes are alive and the network is eventually functional (each pair of nodes is eventually connected), the procedure will eventually finish.
-
-Note the following message, which appears in the log presented above:
-
-.. code-block:: console
-
-    Schema changes are disabled in synchronize state. If a failure makes us unable to proceed, manual recovery will be required.
-
-During the procedure, there is a brief window while schema changes are disabled. This is when the schema change mechanism switches from the older unsafe algorithm to the safe Raft-based algorithm. If everything runs smoothly, this window will be unnoticeable; the procedure is designed to minimize that window's length. However, if the procedure gets stuck e.g. due to network connectivity problem, ScyllaDB will return the following error when trying to perform a schema change during this window:
-
-.. code-block:: console
-
-    Cannot perform schema or topology changes during this time; the cluster is currently upgrading to use Raft for schema operations.
-    If this error keeps happening, check the logs of your nodes to learn the state of upgrade. The upgrade procedure may get stuck
-    if there was a node failure.
-
-In the next example, one of the nodes had a power outage before the procedure could finish. The following shows a part of another node's logs:
-
-.. code-block:: console
-
-    raft_group0_upgrade - Entering synchronize state.
-    raft_group0_upgrade - Schema changes are disabled in synchronize state. If a failure makes us unable to proceed, manual recovery will be required.
-    raft_group0_upgrade - Waiting for all peers to enter synchronize state...
-    raft_group0_upgrade - wait_for_peers_to_enter_synchronize_state: node 127.90.69.3 not in synchronize state yet...
-    raft_group0_upgrade - wait_for_peers_to_enter_synchronize_state: node 127.90.69.1 not in synchronize state yet...
-    raft_group0_upgrade - wait_for_peers_to_enter_synchronize_state: retrying in a while...
-    raft_group0_upgrade - wait_for_peers_to_enter_synchronize_state: node 127.90.69.1 not in synchronize state yet...
-    raft_group0_upgrade - wait_for_peers_to_enter_synchronize_state: retrying in a while...
-    ...
-    raft_group0_upgrade - Raft upgrade procedure taking longer than expected. Please check if all nodes are live and the network is healthy. If the upgrade procedure does not progress even though the cluster is healthy, try performing a rolling restart of the cluster. If that doesn 't help or some nodes are dead and irrecoverable, manual recovery may be required. Consult the relevant documentation.
-    raft_group0_upgrade - wait_for_peers_to_enter_synchronize_state: node 127.90.69.1 not in synchronize state yet...
-    raft_group0_upgrade - wait_for_peers_to_enter_synchronize_state: retrying in a while...
-
-.. TODO: the 'Consult the relevant documentation' message must be updated to point to this doc.
-
-Note the following message:
-
-.. code-block:: console
-
-    raft_group0_upgrade - Raft upgrade procedure taking longer than expected. Please check if all nodes are live and the network is healthy. If the upgrade procedure does not progress even though the cluster is healthy, try performing a rolling restart of the cluster. If that doesn 't help or some nodes are dead and irrecoverable, manual recovery may be required. Consult the relevant documentation.
-
-If the Raft upgrade procedure is stuck, this message will appear periodically in each node's logs.
-
-The message suggests the initial course of action:
-
-* Check if all nodes are alive.
-* If a node is down but can be restarted, restart it.
-* If all nodes are alive, ensure that the network is healthy: that every node is reachable from every other node.
-* If all nodes are alive and the network is healthy, perform a :doc:`rolling restart </operating-scylla/procedures/config-change/rolling-restart/>` of the cluster.
-
-One of the reasons why the procedure may get stuck is a pre-existing problem in schema definitions which causes schema to be unable to synchronize in the cluster. The procedure cannot proceed unless it ensures that schema is synchronized.
-If **all nodes are alive and the network is healthy**, you performed a rolling restart, but the issue still persists, contact `ScyllaDB support <https://www.scylladb.com/product/support/>`_ for assistance.
-
-If some nodes are **dead and irrecoverable**, you'll need to perform a manual recovery procedure. Consult :ref:`the section about Raft recovery <recovery-procedure>`.
-
 .. _raft-topology-changes:

 Consistent Topology with Raft
 -----------------------------------------------------------------

-ScyllaDB can use Raft to manage cluster topology. With Raft-managed topology 
+ScyllaDB uses Raft to manage cluster topology. With Raft-managed topology 
 enabled, all topology operations are internally sequenced in a consistent 
 way. A centralized coordination process ensures that topology metadata is 
 synchronized across the nodes on each step of a topology change procedure. 
@@ -173,42 +73,18 @@ will safely drive all of them to completion. For example, multiple nodes can
 be bootstrapped concurrently, which couldn't be done with the old 
 gossip-based topology.

-The feature is automatically enabled in new clusters.
+.. note::

-Verifying that Raft is Enabled
----------------------------------
+    Enabling consistent topology changes is mandatory in versions 2025.2 and later. If consistent topology changes are
+    disabled in your cluster, you need to follow the instructions in
+    `Enable Consistent Topology Updates <https://docs.scylladb.com/manual/branch-2025.1/upgrade/upgrade-guides/upgrade-guide-from-2024.x-to-2025.1/enable-consistent-topology.html>`_.

-.. _schema-on-raft-enabled:
-
-**Schema on Raft**
-
-You can verify that Raft is enabled on your cluster by performing the following query on each node:
-
-.. code-block:: sql
-
-   cqlsh> SELECT * FROM system.scylla_local WHERE key = 'group0_upgrade_state';
-
-The query should return:
-
-   .. code-block:: console
-
-     key                  | value
-    ----------------------+--------------------------
-     group0_upgrade_state | use_post_raft_procedures
-
-    (1 rows)
-
-on every node.
-
-If the query returns 0 rows, or ``value`` is ``synchronize`` or ``use_pre_raft_procedures``, it means that the cluster is in the middle of the Raft upgrade procedure; consult the :ref:`relevant section <verify-raft-procedure>`.
-
-If ``value`` is ``recovery``, it means that the cluster is in the middle of the manual recovery procedure. The procedure must be finished. Consult :ref:`the section about Raft recovery <recovery-procedure>`.
-
-If ``value`` is anything else, it might mean data corruption or a mistake when performing the manual recovery procedure. The value will be treated as if it was equal to ``recovery`` when the node is restarted.
+    If you are uncertain whether consistent topology changes are enabled, refer to the guide below.

 .. _verifying-consistent-topology-changes-enabled:

-**Consistent topology changes**
+Verifying that consistent topology changes are enabled
+-----------------------------------------------------------------

 You can verify that consistent topology management is enabled on your cluster in two ways:

--- a/docs/architecture/tablets.rst
+++ b/docs/architecture/tablets.rst
@@ -42,6 +42,10 @@ the administrator. The tablet load balancer decides where to migrate
 the tablets, either within the same node to balance the shards or across 
 the nodes to balance the global load in the cluster.

+The number of tablets the load balancer maintains on a node is directly
+proportional to the node's storage capacity. A node with twice
+the storage will have twice the number of tablets located on it.
+
 As a table grows, each tablet can split into two, creating a new tablet.
 The load balancer can migrate the split halves independently to different nodes
 or shards.
@@ -83,6 +87,53 @@ especially for data models that contain small cells.
 File-based streaming is used for tablet migration in all 
 :ref:`keyspaces created with tablets enabled <tablets>`.

+.. _absolute-number-of-tablets:
+
+Absolute number of tablets
+==========================
+
+ScyllaDB has a background process that periodically re-evaluates the number of tablets of each table.
+The computed number of tablets a table will have is based on several parameters and factors. These are:
+
+* Keyspace tablets option ``'initial'``. This option sets the initial number of tablets on the keyspace level.
+  See :ref:`The tablets property <tablets>` for details.
+* Table-level option ``'expected_data_size_in_gb'``. This option sets the minimal number of tablets for a table
+  based on the expected table size and the target tablet size. See
+  :ref:`Per-table tablet options <cql-per-table-tablet-options>` for details.
+* Table-level option ``'min_per_shard_tablet_count'``. Using this option results in the number of tablets being
+  computed based on the number of shards in a DC so that each shard has at least ``'min_per_shard_tablet_count'``
+  tablets on average. See :ref:`Per-table tablet options <cql-per-table-tablet-options>` for details.
+* Table-level option ``'min_tablet_count'``. This option sets the minimal number of tablets for the given table.
+  See :ref:`Per-table tablet options <cql-per-table-tablet-options>` for details.
+* Config option ``'tablets_initial_scale_factor'``. This option sets the minimal number of tablets per shard
+  per table globally. This option can be overridden by the table-level option: ``'min_per_shard_tablet_count'``.
+  ``'tablets_initial_scale_factor'`` is ignored if either the keyspace option ``'initial'`` or table-level
+  option ``'min_tablet_count'`` is set.
+
+Another factor that determines the absolute tablet count is the amount of data the table contains. If the
+amount of data in the table is such that the average tablet size is larger than double the target tablet size,
+the table will be split (the number of tablets will be doubled), and if the average tablet size is smaller than
+half the target tablet size, it will be merged (the number of tablets will be halved).
+
+All of these factors are taken into consideration; the one producing the largest number of tablets wins and
+is used as the number of tablets for the given table.
+
+As the last step, in order to avoid having too many tablets per shard, which could potentially lead to overload
+and performance degradation, ScyllaDB will run the following algorithm to respect the ``tablets_per_shard_goal``
+config option:
+
+* Compute average tablet count per-shard in each DC.
+* Determine if per-shard goal is exceeded in that DC.
+* Compute scale factor by which tablet count should be multiplied so that the goal is not exceeded in that DC.
+* Take the smallest scale factor among all DCs, which ensures that no DC is overloaded.
+* Each table's tablet count is aligned to the nearest power of 2 post-scaling.
+
+Note that because of this alignment, the scaling may not be fully effective: in the worst case, the goal may be
+overshot by a factor of 2. The ``tablets_per_shard_goal`` option is therefore a soft limit, not a hard constraint.
+
+Finally, the computed tablet count is compared with the current tablet count for each table, and if there is
+a difference, a table resize (split or merge) is executed.
+
 .. _tablets-enable-tablets: 

 Enabling Tablets
@@ -147,24 +198,19 @@ Limitations and Unsupported Features
    performance problems, or other issues.

 The following ScyllaDB features are not supported if a keyspace has tablets
-enabled:
+enabled. If you plan to use any of the features listed below, CREATE your keyspace
+:ref:`with tablets disabled <tablets-enable-tablets>`.

 * Counters
 * Change Data Capture (CDC)
 * Lightweight Transactions (LWT)
 * Alternator (as it uses LWT)
+* Materialized Views (MV) ``*``
+* Secondary indexes (SI, as it depends on MV) ``*``

-If you plan to use any of the above features, CREATE your keyspace
-:ref:`with tablets disabled <tablets-enable-tablets>`.
-
-The following ScyllaDB features are disabled by default when used with a keyspace
-that has tablets enabled:
-
-* Materialized Views (MV)
-* Secondary indexes (SI, as it depends on MV)
-
-To enable MV and SI for tablet keyspaces, use the `--experimental-features=views-with-tablets`
-configuration option.  See :ref:`Views with tablets <admin-views-with-tablets>` for details.
+``*`` You can enable experimental support for MV and SI using
+the ``--experimental-features=views-with-tablets`` configuration option. 
+See :ref:`Views with tablets <admin-views-with-tablets>` for details.

 Resharding in keyspaces with tablets enabled has the following limitations:

--- a/docs/contribute.rst
+++ b/docs/contribute.rst
@@ -1,31 +0,0 @@
-Contribute to ScyllaDB
-=======================
-
-Thank you for your interest in making ScyllaDB better!
-We appreciate your help and look forward to welcoming you to the ScyllaDB Community.
-There are two ways you can contribute:
-
-* Send a patch to the ScyllaDB source code
-* Write documentation for ScyllaDB Docs
-
-
-Contribute to ScyllaDB's Source Code
------------------------------------
-ScyllaDB developers use patches and email to share and discuss changes.
-Setting up can take a little time, but once you have done it the first time, it’s easy.
-
-The basic steps are:
-
-* Join the ScyllaDB community
-* Create a Git branch to work on
-* Commit your work with clear commit messages and sign-offs.
-* Send a PR or use ``git format-patch`` and ``git send-email`` to send to the list
-
-
-The entire process is `documented here <https://github.com/scylladb/scylla/blob/master/CONTRIBUTING.md>`_.
-
-Contribute to ScyllaDB Docs
---------------------------
-
-Each ScyllaDB project has accompanying documentation. For information about contributing documentation to a specific ScyllaDB project, refer to the README file for the individual project.
-For general information or to contribute to the ScyllaDB Sphinx theme, read the `Contributor's Guide <https://sphinx-theme.scylladb.com/stable/contribute/>`_.
--- a/docs/cql/ddl.rst
+++ b/docs/cql/ddl.rst
@@ -60,11 +60,11 @@ Keyspace and table names are defined by the following grammar:
   keyspace_name: `name`
   table_name: [ `keyspace_name` '.' ] `name`
   name: `unquoted_name` | `quoted_name`
-   unquoted_name: re('[a-zA-Z_0-9]{1, 48}')
+   unquoted_name: re('[a-zA-Z_0-9]{1, 192}')
   quoted_name: '"' `unquoted_name` '"'

 Both keyspace and table names consist of only alphanumeric characters, cannot be empty, and are limited in
-size to 48 characters (that limit exists mostly to avoid filenames, which may include the keyspace and table name, to go
+size to 192 characters (that limit exists mostly to avoid filenames, which may include the keyspace and table name, to go
 over the limits of certain file systems). By default, keyspace and table names are case insensitive (``myTable`` is
 equivalent to ``mytable``), but case sensitivity can be forced by using double-quotes (``"myTable"`` is different from
 ``mytable``).
@@ -755,7 +755,8 @@ when replicas are slow or unresponsive.  The following are legal values (case-in
 ``XPERCENTILE``           90.5PERCENTILE   Coordinators record average per-table response times for all replicas.
                                            If a replica takes longer than ``X`` percent of this table's average
                                            response time, the coordinator queries an additional replica.
-                                            ``X`` must be between 0 and 100.
+                                            ``X`` must be between 0 and 100, including those values.
+                                            The value is rounded to the nearest 0.1 (1 decimal place).
 ``XP``                    90.5P            Synonym for ``XPERCENTILE``
 ``Yms``                   25ms             If a replica takes more than ``Y`` milliseconds to respond,
                                            the coordinator queries an additional replica.
--- a/docs/cql/types.rst
+++ b/docs/cql/types.rst
@@ -481,7 +481,8 @@ Creating a new user-defined type is done using a ``CREATE TYPE`` statement defin
   field_definition: `identifier` `cql_type`

 A UDT has a name (``udt_name``), which is used to declare columns of that type and is a set of named and typed fields. The ``udt_name`` can be any
-type, including collections or other UDTs. UDTs and collections inside collections must always be frozen (no matter which version of ScyllaDB you are using). 
+type, including collections or other UDTs.
+Similar to collections, a UDT can be frozen or non-frozen. A frozen UDT is immutable and can only be updated as a whole. Nested UDTs or UDTs used in keys must always be frozen.

 For example::

@@ -506,26 +507,15 @@ For example::

  CREATE TABLE superheroes (
       name frozen<full_name> PRIMARY KEY,
-       home frozen<address>
+       home address
  );

 .. note::

   - Attempting to create an already existing type will result in an error unless the ``IF NOT EXISTS`` option is used. If it is used, the statement will be a no-op if the type already exists.
   - A type is intrinsically bound to the keyspace in which it is created and can only be used in that keyspace. At creation, if the type name is prefixed by a keyspace name, it is created in that keyspace. Otherwise, it is created in the current keyspace.
-   - As of ScyllaDB Open Source 3.2, UDTs not inside collections do not have to be frozen, but in all versions prior to ScyllaDB Open Source 3.2, and in all ScyllaDB Enterprise versions, UDTs **must** be frozen. 


-A non-frozen UDT example with ScyllaDB Open Source 3.2 and higher::
-
-   CREATE TYPE ut (a int, b int);
-   CREATE TABLE cf (a int primary key, b ut);
-
-Same UDT in versions prior::
-
-   CREATE TYPE ut (a int, b int);
-   CREATE TABLE cf (a int primary key, b frozen<ut>);
-
 UDT literals
 ~~~~~~~~~~~~

--- a/docs/dev/service_levels.md
+++ b/docs/dev/service_levels.md
@@ -189,3 +189,18 @@ The command displays a table with: option name, effective service level the valu
        workload_type |                     sl2 |       batch
              timeout |                     sl1 |          2s
 ```
+
+## Implementation
+### Integration with auth
+
+Service levels ultimately depend on the state of `auth`. Since `auth::service` is initialized long after
+`service_level_controller`, we register it separately once it's started, and unregister it right before
+it's stopped. For that, we wrap it in a struct called `auth_integration` that manages access to it.
+That ensures that `service_level_controller` will not try to reference it beyond its lifetime.
+
+It's important to note that there may still be attempts to fetch an effective service level for a role
+or indirectly access `auth::service` in some other way when `auth_integration` is absent. One important
+situation to have in mind is when the user connects to Scylla via the maintenance socket. It's possible
+early on, way before Scylla is fully initialized. Since we don't have access to `auth` yet, we need to
+ensure that the semantics of the operations performed on `service_level_controller` still make sense
+in that context.
--- a/docs/dev/system_keyspace.md
+++ b/docs/dev/system_keyspace.md
@@ -121,6 +121,29 @@ SELECT * FROM system.large_cells;
 SELECT * FROM system.large_cells WHERE keyspace_name = 'ks1' and table_name = 'standard1';
 ~~~

+## system.corrupt\_data
+
+Stores data found to be corrupt during internal operations. This data cannot be written to sstables because then it will be spread around by repair and compaction. It will also possibly cause failures in sstable parsing.
+At the same time, the data should be kept around so that it can be inspected and possibly restored by the database operator.
+This table is used to store such data. Data is saved at the mutation-fragment level.
+
+Schema:
+```cql
+CREATE TABLE system.corrupt_data (
+    keyspace_name text,              # keyspace name of source table
+    table_name text,                 # table name of source table
+    id timeuuid,                     # id of the corrupt mutation fragment, assigned by the database when the corrupt data entry is created
+    partition_key blob,              # partition key of partition in the source table, can be incomplete or null due to corruption
+    clustering_key text,             # clustering key of mutation-fragment in the source table, can be null for some mutation-fragment kinds, can be incomplete or null due to corruption
+    mutation_fragment_kind text,     # kind of the mutation fragment, one of 'partition start', 'partition end', 'static row', 'clustering row', 'range tombstone change'; only the latter two can have clustering_key set
+    frozen_mutation_fragment blob,   # the serialized mutation fragment itself
+    origin text,                     # the name of the process that found the corruption, e.g. 'sstable-writer'
+    sstable_name text,               # the name of the sstable that contains the corrupt data, if known; sstable is not kept around, it could be compacted or deleted
+    PRIMARY KEY ((keyspace_name, table_name), id)
+) WITH CLUSTERING ORDER BY (id ASC)
+    AND gc_grace_seconds = 0;
+```
+
 ## system.raft

 Holds information about Raft
--- a/docs/features/cdc/cdc-intro.rst
+++ b/docs/features/cdc/cdc-intro.rst
@@ -67,9 +67,6 @@ You can enable CDC when creating or altering a table using the ``cdc`` option, f

    CREATE TABLE ks.t (pk int, ck int, v int, PRIMARY KEY (pk, ck, v)) WITH cdc = {'enabled':true};

-.. note::
-   If you enabled CDC and later decide to disable it, you need to **stop all writes** to the base table before issuing the ``ALTER TABLE ... WITH cdc = {'enabled':false};`` command.
-
 .. include:: /features/cdc/_common/cdc-params.rst

 Using CDC with Applications
--- a/docs/features/local-secondary-indexes.rst
+++ b/docs/features/local-secondary-indexes.rst
@@ -6,9 +6,9 @@ Local Secondary Indexes is an enhancement to :doc:`Global Secondary Indexes <sec
 which allows ScyllaDB to optimize workloads where the partition key of the base table and the index are the same key.

 .. note::
-   As of ScyllaDB Open Source 4.0, updates for local secondary indexes are performed **synchronously**. When updates are synchronous, the client acknowledges the write
+   Updates for local secondary indexes are performed **synchronously**. When updates are synchronous, the client acknowledges the write
   operation only **after both** the base table modification **and** the view update are written.
-   This is important to note because the process is no longer asynchronous and the modifications are immediately reflected in the index.
+   This is important to note because the process is no longer asynchronous, and the modifications are immediately reflected in the index.
   In addition, if the view update fails, the client receives a write error.

 Example:
--- a/docs/getting-started/cloud-instance-recommendations.rst
+++ b/docs/getting-started/cloud-instance-recommendations.rst
@@ -113,7 +113,38 @@ Pick a zone where Haswell CPUs are found. Local SSD performance offers, accordin
 Image with NVMe disk interface is recommended.
 (`More info <https://cloud.google.com/compute/docs/disks/local-ssd>`_)

-Recommended instances types are `n1-highmem <https://cloud.google.com/compute/docs/general-purpose-machines#n1_machines>`_ and `n2-highmem <https://cloud.google.com/compute/docs/general-purpose-machines#n2_machines>`_
+Recommended instance types are `z3-highmem-highlssd <https://cloud.google.com/compute/docs/storage-optimized-machines#z3_machine_types>`_,
+`n1-highmem <https://cloud.google.com/compute/docs/general-purpose-machines#n1_machines>`_, and `n2-highmem <https://cloud.google.com/compute/docs/general-purpose-machines#n2_machines>`_
+
+
+.. list-table::
+   :widths: 30 20 20 30
+   :header-rows: 1
+
+   * - Model
+     - vCPU
+     - Mem (GB)
+     - Storage (GB)
+   * - z3-highmem-8-highlssd
+     - 8
+     - 64
+     - 3,000
+   * - z3-highmem-16-highlssd
+     - 16
+     - 128
+     - 6,000
+   * - z3-highmem-22-highlssd	
+     - 22
+     - 176
+     - 9,000
+   * - z3-highmem-32-highlssd	
+     - 32
+     - 256
+     - 12,000
+   * - z3-highmem-44-highlssd	
+     - 44
+     - 352
+     - 18,000

 .. list-table::
   :widths: 30 20 20 30
--- a/docs/getting-started/index.rst
+++ b/docs/getting-started/index.rst
@@ -11,7 +11,6 @@ Getting Started
   requirements
   Migrate to ScyllaDB </using-scylla/migrate-scylla>
   Integration Solutions </using-scylla/integrations/index>
-   tutorials

 .. panel-box::
  :title: ScyllaDB Requirements
@@ -26,8 +25,7 @@ Getting Started
  :id: "getting-started"
  :class: my-panel

-  * `Install ScyllaDB (Binary Packages, Docker, or EC2) <https://www.scylladb.com/download/#core>`_ - Links to the ScyllaDB Download Center
-  
+  * :doc:`Install ScyllaDB </getting-started/install-scylla/index/>`
  * :doc:`Configure ScyllaDB </getting-started/system-configuration/>`
  * :doc:`Run ScyllaDB in a Shared Environment </getting-started/scylla-in-a-shared-environment>`
  * :doc:`Create a ScyllaDB Cluster - Single Data Center (DC) </operating-scylla/procedures/cluster-management/create-cluster/>`
@@ -38,7 +36,7 @@ Getting Started
  :id: "getting-started"
  :class: my-panel

-  * :doc:`ScyllaDB Drivers</using-scylla/drivers/index>`
+  * `ScyllaDB Drivers <https://docs.scylladb.com/stable/drivers/index.html>`_
  * `Get Started Lesson on ScyllaDB University <https://university.scylladb.com/courses/scylla-essentials-overview/lessons/quick-wins-install-and-run-scylla/>`_    
  * :doc:`CQL Reference </cql/index>`
  * :doc:`cqlsh - the CQL shell </cql/cqlsh/>`
--- a/docs/getting-started/install-scylla/launch-on-gcp.rst
+++ b/docs/getting-started/install-scylla/launch-on-gcp.rst
@@ -30,7 +30,7 @@ Launching ScyllaDB on GCP

   .. code-block:: console
      
-        gcloud compute instances create <name of new instance> --image <ScyllaDB image name> --image-project < ScyllaDB project name> --local-ssd interface=nvme --zone <GCP zone - optional> --machine-type=<machine type>
+        gcloud compute instances create <name of new instance> --image <ScyllaDB image name> --image-project <ScyllaDB project name> --local-ssd interface=nvme --zone=<GCP zone - optional> --machine-type=<machine type>
   
   For example:

--- a/docs/getting-started/installation-common/disable-housekeeping.rst
+++ b/docs/getting-started/installation-common/disable-housekeeping.rst
@@ -3,8 +3,7 @@
 ScyllaDB Housekeeping and how to disable it
 ============================================

-It is always recommended to run the latest version of ScyllaDB. 
-The latest stable release version is always available from the `Download Center <https://www.scylladb.com/download/>`_.
+It is always recommended to run the latest stable version of ScyllaDB. 

 When you install ScyllaDB, it installs by default two services: **scylla-housekeeping-restart** and **scylla-housekeeping-daily**. These services check for the latest ScyllaDB version and prompt the user if they are using a version that is older than what is publicly available.
 Information about your ScyllaDB deployment, including the ScyllaDB version currently used, as well as unique user and server identifiers, are collected by a centralized service.
--- a/docs/getting-started/installation-common/scylla-web-installer.rst
+++ b/docs/getting-started/installation-common/scylla-web-installer.rst
@@ -14,6 +14,7 @@ See :doc:`OS Support by Platform and Version </getting-started/os-support/>`.

 Install ScyllaDB with Web Installer
 ---------------------------------------
+
 To install ScyllaDB with Web Installer, run:

 .. code:: console
@@ -27,7 +28,13 @@ You can run the command with the ``-h`` or ``--help`` flag to print information
 Installing a Non-default Version
 ---------------------------------------

-You can install a version other than the default.
+You can install a version other than the default. To get the list of supported
+release versions, run:
+
+.. code:: console
+  
+  curl -sSf get.scylladb.com/server | sudo bash -s -- --list-active-releases
+

 Versions 2025.1 and Later
 ==============================
--- a/docs/getting-started/os-support.rst
+++ b/docs/getting-started/os-support.rst
@@ -4,6 +4,9 @@ OS Support by Linux Distributions and Version
 The following matrix shows which Linux distributions, containers, and images
 are :ref:`supported <os-support-definition>` with which versions of ScyllaDB.

+Note that support for Ubuntu 20.04 is deprecated and will be removed in
+a future release.
+
 .. datatemplate:json:: /_static/data/os-support.json
  :template: platforms.tmpl

--- a/docs/getting-started/tutorials.rst
+++ b/docs/getting-started/tutorials.rst
@@ -1,21 +0,0 @@
-============
-Tutorials
-============
-
-The tutorials will show you how to use ScyllaDB as a data source for an application.
-
-
-ScyllaDB Tutorial
-===================
-
-`Build an IoT App with sensor simulator and a REST API <https://iot.scylladb.com/stable/>`_
-
-ScyllaDB Cloud Tutorial
-=======================
-
-`Implement CRUD operations with a TODO App <https://github.com/scylladb/scylla-cloud-getting-started/>`_
-
-ScyllaDB Cloud Feature Store Tutorial
-=====================================
-
-`Build a machine learning (ML) feature store with ScyllaDB <https://feature-store.scylladb.com/stable/>`_
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -35,7 +35,7 @@ Documentation Highlights
 * :doc:`Cluster Management Procedures </operating-scylla/procedures/cluster-management/index>`
 * :doc:`Upgrade ScyllaDB </upgrade/index>`
 * :doc:`CQL Reference </cql/index>`
-* :doc:`ScyllaDB Drivers </using-scylla/drivers/index>`
+* `ScyllaDB Drivers <https://docs.scylladb.com/stable/drivers/index.html>`_
 * :doc:`Features </features/index>`

 ScyllaDB Support
@@ -73,6 +73,5 @@ In addition, you can read our `blog <https://www.scylladb.com/blog/>`_ and atten
  kb/index
  reference/index
  faq
-  Contribute to ScyllaDB <contribute>
  2024.2 and earlier documentation <https://enterprise.docs.scylladb.com/branch-2024.2/>

--- a/docs/kb/consistency.rst
+++ b/docs/kb/consistency.rst
@@ -83,7 +83,7 @@ Additional References

 * `Jepsen and ScyllaDB: Putting Consistency to the Test blog post <https://www.scylladb.com/2020/12/23/jepsen-and-scylla-putting-consistency-to-the-test/>`_ 
 * `Nauto: Achieving Consistency in an Eventually Consistent Environment blog post <https://www.scylladb.com/2020/02/20/nauto-achieving-consistency-in-an-eventually-consistent-environment/>`_ 
-* `Consistency Levels documentation <https://docs.scylladb.com/stable/cql/consistency.html>`_ 
+* `Consistency Levels documentation <https://docs.scylladb.com/manual/stable/cql/consistency.html>`_ 
 * `High Availability lesson on ScyllaDB University <https://university.scylladb.com/courses/scylla-essentials-overview/lessons/high-availability/>`_ 
 * `Lightweight Transactions lesson on ScyllaDB University <https://university.scylladb.com/courses/data-modeling/lessons/lightweight-transactions/>`_ 
 * `Getting the Most out of Lightweight Transactions in ScyllaDB blog post <https://www.scylladb.com/2020/07/15/getting-the-most-out-of-lightweight-transactions-in-scylla/>`_ 
--- a/docs/kb/rf-increase.rst
+++ b/docs/kb/rf-increase.rst
@@ -2,40 +2,65 @@
 How to Safely Increase the Replication Factor
 =======================================================

+A replication factor (RF) is configured per keyspace. You can change the RF
+using the :ref:`ALTER KEYSPACE <alter-keyspace-statement>` command. 

-**Topic: What can happen when you increase RF**
+To increase the RF safely, ensure you follow the guidelines below.
+The guidelines differ depending on whether your keyspace is tablets-based
+(the default) or has tablets disabled. See :doc:`Data Distribution with Tablets </architecture/tablets>`
+for more information about tablets.

+Increasing the RF in Tablets-based Keyspaces
+-------------------------------------------------

-**Audience: ScyllaDB administrators**
+If a keyspace has tablets enabled (the default), changing the RF does not
+impact data consistency in the cluster.

+However, due to limitations in the current protocol used to pass tablet data
+to drivers, drivers will not pick up new replicas after the RF is increased.
+As a result, drivers will not route requests to new replicas, causing imbalance.

-Issues
------
+To avoid this issue, restart the client applications after the ALTER statement
+that changes the RF completes successfully.

-When a Replication Factor (RF) is increased, using the :ref:`ALTER KEYSPACE <alter-keyspace-statement>` command, the data consistency is effectively dropped
-by the difference of the RF_new value and the RF_old value for all pre-existing data.
+Increasing the RF in Keyspaces with Tablets Disabled
+----------------------------------------------------------
+
+If you :ref:`opted out of tablets when creating a keyspace <tablets-enable-tablets>`,
+so your keyspace is vnodes-based, increasing the RF will impact data consistency.
+
+Data consistency in your cluster is effectively dropped by the difference
+between the RF_new value and the RF_old value for all pre-existing data.
 Consistency will only be restored after running a repair.

-Another issue occurs in keyspaces with tablets enabled and is driver-related. Due to limitations in the current protocol used to pass tablet data to drivers, drivers will not pick
-up new replicas after replication factor is increased. This will cause them to avoid routing requests to those replicas, causing imbalance.

 Resolution
----------
+========================

-When one increases an RF, one should consider that the pre-existing data will **not be streamed** to new replicas (a common misconception).
+When you increase the RF, you should be aware that the pre-existing data will
+**not be streamed** to new replicas (a common misconception).

-As a result, in order to make sure that you can keep on reading the old data with the same level of consistency, increase the read Consistency Level (CL) according to the following formula:
+As a result, in order to make sure that you can keep on reading the old data
+with the same level of consistency:

-``CL_new = CL_old + RF_new - RF_old``
+#. Increase the read Consistency Level (CL) according to the following formula:

-After you run a repair, you can decrease the CL. If RF has only been changed in a particular Data Center (DC) only the nodes in that DC have to be repaired.
+   .. code::
+
+      CL_new = CL_old + RF_new - RF_old
+
+#. Run repair.
+#. Decrease the CL.
+
+
+If RF has only been changed in a particular Datacenter (DC), only the nodes in
+that DC have to be repaired.

-To resolve the driver-related issue, restart the client applications after the ALTER statement that changes the RF completes successfully.

 Example
 =======

-In this example your five node cluster RF is 3 and your CL is TWO. You want to increase your RF from 3 to 5.
+In this example, your five-node cluster RF is 3 and your CL is TWO. You want to increase your RF from 3 to 5.

 #. Increase the read CL by a RF_new - RF_old value.
   Following the example the RF_new is 5 and the RF_old is 3 so, 5-3 =2. You need to increase the CL by 2.
@@ -45,9 +70,9 @@ In this example your five node cluster RF is 3 and your CL is TWO. You want to i
 #. Restore the reads CL to the originally intended value. For this example, QUORUM.


-If you do not follow the procedure above you may start reading stale or null data after increasing the RF.
+If you do not follow the procedure above, you may start reading stale or null data after increasing the RF.

-More Information
+References
 ----------------

 * :doc:`Fault Tolerance </architecture/architecture-fault-tolerance/>`
--- a/docs/operating-scylla/admin-tools/cassandra-stress.rst
+++ b/docs/operating-scylla/admin-tools/cassandra-stress.rst
@@ -5,4 +5,3 @@ The cassandra-stress tool is used for benchmarking and load testing both ScyllaD

 Cassandra Stress is not part of ScyllaDB and it is not distributed along side it anymore. It has it's own separate repository and release cycle. More information about it can be found on `GitHub <https://github.com/scylladb/cassandra-stress>`_ or on `DockerHub <https://hub.docker.com/r/scylladb/cassandra-stress>`_.

-.. include:: /rst_include/apache-copyrights.rst
--- a/docs/operating-scylla/admin.rst
+++ b/docs/operating-scylla/admin.rst
@@ -217,7 +217,7 @@ For example:
 * `ScyllaDB Java Driver <https://github.com/scylladb/java-driver/tree/3.7.1-scylla/manual/compression>`_
 * `Go Driver <https://godoc.org/github.com/gocql/gocql#Compressor>`_

-Refer to the :doc:`Drivers Page </using-scylla/drivers/index>` for more drivers.
+Refer to `ScyllaDB Drivers <https://docs.scylladb.com/stable/drivers/index.html>`_ for more drivers.

 .. _internode-compression:

--- a/docs/operating-scylla/nodetool-commands/backup.rst
+++ b/docs/operating-scylla/nodetool-commands/backup.rst
@@ -18,13 +18,14 @@ Syntax
               [--snapshot <snapshot>]
               --endpoint <endpoint> --bucket <bucket> --prefix <prefix>
               [--nowait]
+               [--move-files]

 Example
 -------

 .. code-block:: console

-    nodetool backup --endpoint s3.us-east-2.amazonaws.com  --bucket bucket-foo --prefix foo/bar/baz --keyspace ks --table table --snapshot ss
+    nodetool backup --endpoint s3.us-east-2.amazonaws.com  --bucket bucket-foo --prefix foo/bar/baz --keyspace ks --table table --snapshot ss --move-files

 Options
 -------
@@ -38,6 +39,7 @@ Options
 * ``--bucket`` - Name of the bucket to backup SSTables to
 * ``--prefix`` - Prefix to backup SSTables to
 * ``--nowait`` - Don't wait on the backup process
+* ``--move-files`` - Move files instead of copying them. This will delete the files from the local disk after they are uploaded to the object storage.

 See also

--- a/docs/operating-scylla/nodetool-commands/cleanup.rst
+++ b/docs/operating-scylla/nodetool-commands/cleanup.rst
@@ -1,3 +1,5 @@
+.. _nodetool-cleanup-cmd:
+
 Nodetool cleanup
 ================
 **cleanup** ``[<keyspace> <tablename ...>]``- triggers the immediate removal of data from node(s) that "lose" part of their token range due to a range movement operation (node addition or node replacement).
@@ -18,6 +20,8 @@ To clean up the data of a specific node and specific keyspace, use this command:

   nodetool -h <host name> cleanup <keyspace>

+To clean up the entire cluster, see :doc:`nodetool cluster cleanup </operating-scylla/nodetool-commands/cluster/cleanup/>`.
+
 .. warning::

   Make sure there are no topology changes before running cleanup. To validate, run ``nodetool status``, all nodes should be in status Up Normal (``UN``).
--- a/docs/operating-scylla/nodetool-commands/cluster/cleanup.rst
+++ b/docs/operating-scylla/nodetool-commands/cluster/cleanup.rst
@@ -0,0 +1,15 @@
+Nodetool cluster cleanup
+========================
+
+**cluster cleanup** - A process that runs in the background and removes data no longer owned by nodes. Used for non-tablet (vnode-based) tables only.
+
+Running ``cluster cleanup`` on a **single node** cleans up all non-tablet tables on all nodes in the cluster (tablet-enabled tables are cleaned up automatically).
+
+
+  For example:
+
+  ::
+
+     nodetool cluster cleanup
+
+See also `ScyllaDB Manager <https://manager.docs.scylladb.com/>`_.
--- a/docs/operating-scylla/nodetool-commands/cluster/index.rst
+++ b/docs/operating-scylla/nodetool-commands/cluster/index.rst
@@ -5,6 +5,7 @@ Nodetool cluster
   :hidden:

   repair <repair>
+   cleanup <cleanup>

 **cluster** - Nodetool supercommand for running cluster operations.

@@ -12,3 +13,4 @@ Supported cluster suboperations
 -------------------------------

 * :doc:`repair </operating-scylla/nodetool-commands/cluster/repair>`  :code:`<keyspace>` :code:`<table>` - Repair one or more tablet tables.
+* :doc:`cleanup </operating-scylla/nodetool-commands/cluster/cleanup>`  - Clean up all non-tablet (vnode-based) keyspaces in a cluster.
--- a/docs/operating-scylla/nodetool-commands/enableautocompaction.rst
+++ b/docs/operating-scylla/nodetool-commands/enableautocompaction.rst
--- a/docs/operating-scylla/nodetool-commands/refresh.rst
+++ b/docs/operating-scylla/nodetool-commands/refresh.rst
@@ -29,15 +29,63 @@ Load and Stream

 .. code::

-   nodetool refresh <my_keyspace> <my_table> [--load-and-stream | -las]
+   nodetool refresh <my_keyspace> <my_table> [(--load-and-stream | -las) [[(--primary-replica-only | -pro)] | [--scope <scope>]]]
+
+The Load and Stream feature extends nodetool refresh. 
+
+The ``--load-and-stream`` option loads arbitrary sstables into the cluster by reading the sstable data and streaming each partition to the replica(s) that own it. In addition, the ``--scope`` and ``--primary-replica-only`` options can be applied to filter the set of target replicas for each partition. For example, say the old cluster has 6 nodes and the new cluster has 3 nodes. One can copy the sstables from the old cluster to any of the new nodes and trigger refresh with load and stream.
+
+
+

-The Load and Stream feature extends nodetool refresh. The new ``-las`` option loads arbitrary sstables that do not belong to a node into the cluster. It loads the sstables from the disk and calculates the data's owning nodes, and streams automatically.
-For example, say the old cluster has 6 nodes and the new cluster has 3 nodes. We can copy the sstables from the old cluster to any of the new nodes and trigger the load and stream process.

 Load and Stream make restores and migrations much easier:

 * You can place sstable from every node to every node
 * No need to run nodetool cleanup to remove unused data

+With the ``--primary-replica-only`` (or ``-pro``) option, only the primary replica of each partition in an sstable will be used as the target.
+``--primary-replica-only`` must be applied together with ``--load-and-stream``.
+``--primary-replica-only`` cannot be used with ``--scope``; they are mutually exclusive.
+``--primary-replica-only`` requires repair to be run after the load and stream operation is completed.
+
+
+Scope
+-----
+
+The ``scope`` parameter describes the subset of cluster nodes where you want to load data:
+
+* ``node`` - On the local node.
+* ``rack`` - On the local rack.
+* ``dc`` - In the datacenter (DC) where the local node lives.
+* ``all`` (default) - Everywhere across the cluster.
+
+Scope supports a variety of options for filtering out the destination nodes.
+On one extreme, one node is given all SSTables with the scope ``all``; on the other extreme, all
+nodes are loading only their own SSTables with the scope ``node``. In between, you can choose
+a subset of nodes to load only SSTables that belong to the rack or DC.
+
+This option is only valid when using the ``--load-and-stream`` option.
+
+
+Skip cleanup
+---------------
+
+.. code::
+
+   nodetool refresh <my_keyspace> <my_table> [--skip-cleanup]
+
+When loading an SSTable, ScyllaDB will clean it up by removing keys that the node is not responsible for. To skip this step, use the ``--skip-cleanup`` option.
+See :ref:`nodetool cleanup <nodetool-cleanup-cmd>`.
+
+
+Skip reshape
+---------------
+
+.. code::
+
+   nodetool refresh <my_keyspace> <my_table> [--skip-reshape]
+
+When refreshing, the SSTables to load might be out of shape; if that's the case, ScyllaDB will attempt to reshape them. To skip this step, use the ``--skip-reshape`` option.

 .. include:: nodetool-index.rst
--- a/docs/operating-scylla/nodetool.rst
+++ b/docs/operating-scylla/nodetool.rst
@@ -14,9 +14,9 @@ Nodetool
   nodetool-commands/cleanup
   nodetool-commands/clearsnapshot
   nodetool-commands/cluster/index
+   nodetool-commands/compact
   nodetool-commands/compactionhistory
   nodetool-commands/compactionstats
-   nodetool-commands/compact
   nodetool-commands/decommission
   nodetool-commands/describecluster
   nodetool-commands/describering
@@ -25,13 +25,15 @@ Nodetool
   nodetool-commands/disablebinary
   nodetool-commands/disablegossip
   nodetool-commands/drain
-   nodetool-commands/enbleautocompaction   
+   nodetool-commands/enableautocompaction
   nodetool-commands/enablebackup
   nodetool-commands/enablebinary
   nodetool-commands/enablegossip
   nodetool-commands/flush
+   nodetool-commands/getcompactionthroughput
   nodetool-commands/getendpoints
   nodetool-commands/getsstables
+   nodetool-commands/getstreamthroughput
   nodetool-commands/gettraceprobability
   nodetool-commands/gossipinfo
   nodetool-commands/help
@@ -46,25 +48,23 @@ Nodetool
   nodetool-commands/restore
   nodetool-commands/ring
   nodetool-commands/scrub
-   nodetool-commands/settraceprobability
+   nodetool-commands/setcompactionthroughput
   nodetool-commands/setlogginglevel
+   nodetool-commands/setstreamthroughput
+   nodetool-commands/settraceprobability
   nodetool-commands/snapshot
   nodetool-commands/sstableinfo
+   nodetool-commands/status
   nodetool-commands/statusbackup
   nodetool-commands/statusbinary
   nodetool-commands/statusgossip
-   nodetool-commands/status
   Nodetool stop compaction <nodetool-commands/stop>
   nodetool-commands/tablestats
   nodetool-commands/tasks/index
   nodetool-commands/toppartitions
   nodetool-commands/upgradesstables
-   nodetool-commands/viewbuildstatus
   nodetool-commands/version
-   nodetool-commands/getcompactionthroughput
-   nodetool-commands/setcompactionthroughput
-   nodetool-commands/getstreamthroughput
-   nodetool-commands/setstreamthroughput
+   nodetool-commands/viewbuildstatus

 The ``nodetool`` utility provides a simple command-line interface to the following exposed operations and attributes.

@@ -87,9 +87,9 @@ Operations that are not listed below are currently not available.
 * :doc:`cleanup </operating-scylla/nodetool-commands/cleanup/>` - Triggers the immediate cleanup of keys no longer belonging to a node.
 * :doc:`clearsnapshot </operating-scylla/nodetool-commands/clearsnapshot/>` - This command removes snapshots.
 * :doc:`cluster <nodetool-commands/cluster/index>` - Run a cluster operation.
+* :doc:`compact </operating-scylla/nodetool-commands/compact/>`- Force a (major) compaction on one or more column families.
 * :doc:`compactionhistory </operating-scylla/nodetool-commands/compactionhistory/>` - Provides the history of compactions.
 * :doc:`compactionstats </operating-scylla/nodetool-commands/compactionstats/>`- Print statistics on compactions.
-* :doc:`compact </operating-scylla/nodetool-commands/compact/>`- Force a (major) compaction on one or more column families.
 * :doc:`decommission </operating-scylla/nodetool-commands/decommission/>` - Decommission the node.
 * :doc:`describecluster </operating-scylla/nodetool-commands/describecluster/>` - Print the name, snitch, partitioner and schema version of a cluster.
 * :doc:`describering </operating-scylla/nodetool-commands/describering/>` - :code:`<keyspace>`- Shows the partition ranges of a given keyspace.
@@ -98,14 +98,16 @@ Operations that are not listed below are currently not available.
 * :doc:`disablebinary </operating-scylla/nodetool-commands/disablebinary/>` - Disable native transport (binary protocol).
 * :doc:`disablegossip </operating-scylla/nodetool-commands/disablegossip/>` - Disable gossip (effectively marking the node down).
 * :doc:`drain </operating-scylla/nodetool-commands/drain/>` - Drain the node (stop accepting writes and flush all column families).
-* :doc:`enableautocompaction </operating-scylla/nodetool-commands/enbleautocompaction/>` - Enable automatic compaction of a keyspace or table.
+* :doc:`enableautocompaction </operating-scylla/nodetool-commands/enableautocompaction/>` - Enable automatic compaction of a keyspace or table.
 * :doc:`enablebackup </operating-scylla/nodetool-commands/enablebackup/>` - Enable incremental backup.
 * :doc:`enablebinary </operating-scylla/nodetool-commands/enablebinary/>` - Re-enable native transport (binary protocol).
 * :doc:`enablegossip </operating-scylla/nodetool-commands/enablegossip/>` - Re-enable gossip.
 * :doc:`flush </operating-scylla/nodetool-commands/flush/>` - Flush one or more column families.
+* :doc:`getcompactionthroughput </operating-scylla/nodetool-commands/getcompactionthroughput>` - Print the throughput cap for compaction in the system
 * :doc:`getendpoints <nodetool-commands/getendpoints/>` :code:`<keyspace>` :code:`<table>` :code:`<key>` - Print the endpoints that own the key.
 * **getlogginglevels** - Get the runtime logging levels.
 * :doc:`getsstables </operating-scylla/nodetool-commands/getsstables>` - Print the sstable filenames that own the key.
+* :doc:`getstreamthroughput </operating-scylla/nodetool-commands/getstreamthroughput>` - Print the throughput cap for SSTables streaming in the system
 * :doc:`gettraceprobability </operating-scylla/nodetool-commands/gettraceprobability>` - Displays the current trace probability value. 0 is disabled, 1 is enabled.
 * :doc:`gossipinfo </operating-scylla/nodetool-commands/gossipinfo/>` - Shows the gossip information for the cluster.
 * :doc:`help </operating-scylla/nodetool-commands/help/>` - Display list of available nodetool commands.
@@ -118,28 +120,26 @@ Operations that are not listed below are currently not available.
 * :doc:`refresh </operating-scylla/nodetool-commands/refresh/>`- Load newly placed SSTables to the system without restart
 * :doc:`removenode </operating-scylla/nodetool-commands/removenode/>`- Remove node with the provided ID
 * :doc:`repair <nodetool-commands/repair/>`  :code:`<keyspace>` :code:`<table>` - Repair one or more vnode tables.
-* :doc:`restore </operating-scylla/nodetool-commands/restore/>` - Load SSTables from a designated bucket in object store into a specified keyspace or table
 * :doc:`resetlocalschema </operating-scylla/nodetool-commands/resetlocalschema/>` - Reset the node's local schema.
+* :doc:`restore </operating-scylla/nodetool-commands/restore/>` - Load SSTables from a designated bucket in object store into a specified keyspace or table
 * :doc:`ring <nodetool-commands/ring/>` - The nodetool ring command displays token ring information.
 * :doc:`scrub </operating-scylla/nodetool-commands/scrub>` :code:`[-m mode] [--no-snapshot] <keyspace> [<table>...]` - Scrub the SSTable files in the specified keyspace or table(s)
+* :doc:`setcompactionthroughput </operating-scylla/nodetool-commands/setcompactionthroughput>` - Set the throughput cap for compaction in the system
 * :doc:`setlogginglevel </operating-scylla/nodetool-commands/setlogginglevel>` - Sets the logging level threshold for ScyllaDB classes.
+* :doc:`setstreamthroughput </operating-scylla/nodetool-commands/setstreamthroughput>` - Set the throughput cap for SSTables streaming in the system
 * :doc:`settraceprobability </operating-scylla/nodetool-commands/settraceprobability/>` ``<value>`` - Sets the probability for tracing a request (the trace probability value).
 * :doc:`snapshot </operating-scylla/nodetool-commands/snapshot>` :code:`[-t tag] [-cf column_family] <keyspace>`  - Take a snapshot of specified keyspaces or a snapshot of the specified table.
 * :doc:`sstableinfo </operating-scylla/nodetool-commands/sstableinfo>` - Get information about sstables per keyspace/table.
+* :doc:`status </operating-scylla/nodetool-commands/status/>` - Print cluster information.
 * :doc:`statusbackup </operating-scylla/nodetool-commands/statusbackup/>` - Status of incremental backup.
 * :doc:`statusbinary </operating-scylla/nodetool-commands/statusbinary/>` - Status of native transport (binary protocol).
 * :doc:`statusgossip </operating-scylla/nodetool-commands/statusgossip/>` - Status of gossip.
-* :doc:`status </operating-scylla/nodetool-commands/status/>` - Print cluster information.
 * :doc:`stop </operating-scylla/nodetool-commands/stop/>` - Stop compaction operation.
 * **tablehistograms** see :doc:`cfhistograms <nodetool-commands/cfhistograms/>`
 * :doc:`tablestats </operating-scylla/nodetool-commands/tablestats/>` - Provides in-depth diagnostics regarding tables.
 * :doc:`tasks </operating-scylla/nodetool-commands/tasks/index>` - Manage tasks manager tasks.
 * :doc:`toppartitions </operating-scylla/nodetool-commands/toppartitions/>` - Samples cluster writes and reads and reports the most active partitions in a specified table and time frame.
 * :doc:`upgradesstables </operating-scylla/nodetool-commands/upgradesstables>` - Upgrades each table that is not running the latest ScyllaDB version, by rewriting SSTables.
-* :doc:`viewbuildstatus </operating-scylla/nodetool-commands/viewbuildstatus/>` - Shows the progress of a materialized view build.
 * :doc:`version </operating-scylla/nodetool-commands/version>` - Print the DB version.
-* :doc:`getcompactionthroughput </operating-scylla/nodetool-commands/getcompactionthroughput>` - Print the throughput cap for compaction in the system
-* :doc:`setcompactionthroughput </operating-scylla/nodetool-commands/setcompactionthroughput>` - Set the throughput cap for compaction in the system
-* :doc:`getstreamthroughput </operating-scylla/nodetool-commands/getstreamthroughput>` - Print the throughput cap for SSTables streaming in the system
-* :doc:`setstreamthroughput </operating-scylla/nodetool-commands/setstreamthroughput>` - Set the throughput cap for SSTables streaming in the system
+* :doc:`viewbuildstatus </operating-scylla/nodetool-commands/viewbuildstatus/>` - Shows the progress of a materialized view build.

--- a/docs/operating-scylla/procedures/cluster-management/_common/membership-change-failures-note.rst
+++ b/docs/operating-scylla/procedures/cluster-management/_common/membership-change-failures-note.rst
@@ -1,10 +0,0 @@
-.. note::
-
-    This page only applies to clusters where consistent topology updates are not enabled.
-    Consistent topology updates are mandatory, so **this page serves troubleshooting purposes**.
-
-    The page does NOT apply if you:
-
-    * Created a cluster with ScyllaDB 6.0 or later (consistent topology updates are automatically enabled).
-    * `Manually enabled consistent topology updates <https://opensource.docs.scylladb.com/branch-6.0/upgrade/upgrade-opensource/upgrade-guide-from-5.4-to-6.0/enable-consistent-topology.html>`_
-      after upgrading to 6.0 or before upgrading to 6.1 (required).
--- a/Show More
+++ b/Show More