Merge 'doc: fix the installation section' from Anna Stuchlik

This PR fixes the Installation page: - Replaces `http` with `https` in the download command. - Replaces the Open Source example from the Installation section for CentOS (we overlooked this example before). Fixes https://github.com/scylladb/scylladb/issues/29087 This update affects all supported versions and should be backported as a bug fix. Closes scylladb/scylladb#29088 * github.com:scylladb/scylladb: doc: remove the Open Source Example from Installation doc: replace http with https in the installation instructions (cherry picked from commit e8b37d1a89) Closes scylladb/scylladb#29135 Closes scylladb/scylladb#29192 Closes scylladb/scylladb#29201
Update seastar submodule (iotune fixes)
2026-03-24 21:13:53 +02:00 · 2026-03-24 21:13:19 +02:00 · 2026-03-24 16:06:22 +02:00 · 2026-03-24 13:09:36 +01:00 · 2026-03-20 11:02:15 +02:00 · 2026-03-15 04:52:56 +02:00
709 changed files with 28000 additions and 11945 deletions
--- a/.github/scripts/auto-backport.py
+++ b/.github/scripts/auto-backport.py
@@ -50,7 +50,7 @@ def create_pull_request(repo, new_branch_name, base_branch_name, pr, backport_pr
        if is_draft:
            backport_pr.add_to_labels("conflicts")
            pr_comment = f"@{pr.user.login} - This PR was marked as draft because it has conflicts\n"
-            pr_comment += "Please resolve them and mark this PR as ready for review"
+            pr_comment += "Please resolve them and remove the 'conflicts' label. The PR will be made ready for review automatically."
            backport_pr.create_issue_comment(pr_comment)
        logging.info(f"Assigned PR to original author: {pr.user}")
        return backport_pr
@@ -121,27 +121,46 @@ def backport(repo, pr, version, commits, backport_base_branch):
                    is_draft = True
                    repo_local.git.add(A=True)
                    repo_local.git.cherry_pick('--continue')
-            repo_local.git.push(fork_repo, new_branch_name, force=True)
-            create_pull_request(repo, new_branch_name, backport_base_branch, pr, backport_pr_title, commits,
-                                is_draft=is_draft)

+            # Check if the branch already exists in the remote fork
+            remote_refs = repo_local.git.ls_remote('--heads', fork_repo, new_branch_name)
+            if not remote_refs:
+                # Branch does not exist, create it with a regular push
+                repo_local.git.push(fork_repo, new_branch_name)
+                create_pull_request(repo, new_branch_name, backport_base_branch, pr, backport_pr_title, commits,
+                                    is_draft)
+            else:
+                logging.info(f"Remote branch {new_branch_name} already exists in fork. Skipping push.")
        except GitCommandError as e:
            logging.warning(f"GitCommandError: {e}")


 def with_github_keyword_prefix(repo, pr):
-    pattern = rf"(?:fix(?:|es|ed))\s*:?\s*(?:(?:(?:{repo.full_name})?#)|https://github\.com/{repo.full_name}/issues/)(\d+)"
-    match = re.findall(pattern, pr.body, re.IGNORECASE)
-    if not match:
-        print(f'No valid close reference for {pr.number}')
-        comment = f':warning:  @{pr.user.login} PR body does not contain a Fixes reference to an issue '
-        comment += ' and can not be backported\n\n'
-        comment += 'The following labels were removed:\n'
-        create_pr_comment_and_remove_label(pr, comment)
-        return False
-    else:
+    # GitHub issue pattern: #123, scylladb/scylladb#123, or full GitHub URLs
+    github_pattern = rf"(?:fix(?:|es|ed))\s*:?\s*(?:(?:(?:{repo.full_name})?#)|https://github\.com/{repo.full_name}/issues/)(\d+)"
+    
+    # JIRA issue pattern: PKG-92 or https://scylladb.atlassian.net/browse/PKG-92
+    jira_pattern = r"(?:fix(?:|es|ed))\s*:?\s*(?:(?:https://scylladb\.atlassian\.net/browse/)?([A-Z]+-\d+))"
+    
+    # Check PR body for GitHub issues
+    github_match = re.findall(github_pattern, pr.body, re.IGNORECASE)
+    # Check PR body for JIRA issues
+    jira_match = re.findall(jira_pattern, pr.body, re.IGNORECASE)
+    
+    match = github_match or jira_match
+
+    if match:
        return True

+    for commit in pr.get_commits():
+        github_match = re.findall(github_pattern, commit.commit.message, re.IGNORECASE)
+        jira_match = re.findall(jira_pattern, commit.commit.message, re.IGNORECASE)
+        if github_match or jira_match:
+            print(f'{pr.number} has a valid close reference in commit message {commit.sha}')
+            return True
+
+    print(f'No valid close reference for {pr.number}')
+    return False

 def main():
    args = parse_args()
--- a/.github/workflows/backport-pr-fixes-validation.yaml
+++ b/.github/workflows/backport-pr-fixes-validation.yaml
@@ -18,7 +18,7 @@ jobs:
            
            // Regular expression pattern to check for "Fixes" prefix
            // Adjusted to dynamically insert the repository full name
-            const pattern = `Fixes:? (?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)`;
+            const pattern = `Fixes:? ((?:#|${repo.replace('/', '\\/')}#|https://github\\.com/${repo.replace('/', '\\/')}/issues/)(\\d+)|(?:https://scylladb\\.atlassian\\.net/browse/)?([A-Z]+-\\d+))`;
            const regex = new RegExp(pattern);
            
            if (!regex.test(body)) {
--- a/.github/workflows/call_backport_with_jira.yaml
+++ b/.github/workflows/call_backport_with_jira.yaml
@@ -0,0 +1,53 @@
+name: Backport with Jira Integration
+
+on:
+  push:
+    branches:
+      - master
+      - next-*.*
+      - branch-*.*
+  pull_request_target:
+    types: [labeled, closed]
+    branches: 
+      - master
+      - next
+      - next-*.*
+      - branch-*.*
+
+jobs:
+  backport-on-push:
+    if: github.event_name == 'push'
+    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
+    with:
+      event_type: 'push'
+      base_branch: ${{ github.ref }}
+      commits: ${{ github.event.before }}..${{ github.sha }}
+    secrets:
+      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
+      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
+
+  backport-on-label:
+    if: github.event_name == 'pull_request_target' && github.event.action == 'labeled'
+    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
+    with:
+      event_type: 'labeled'
+      base_branch: refs/heads/${{ github.event.pull_request.base.ref }}
+      pull_request_number: ${{ github.event.pull_request.number }}
+      head_commit: ${{ github.event.pull_request.base.sha }}
+      label_name: ${{ github.event.label.name }}
+      pr_state: ${{ github.event.pull_request.state }}
+    secrets:
+      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
+      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
+
+  backport-chain:
+    if: github.event_name == 'pull_request_target' && github.event.action == 'closed' && github.event.pull_request.merged == true
+    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
+    with:
+      event_type: 'chain'
+      base_branch: refs/heads/${{ github.event.pull_request.base.ref }}
+      pull_request_number: ${{ github.event.pull_request.number }}
+      pr_body: ${{ github.event.pull_request.body }}
+    secrets:
+      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
+      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
--- a/.github/workflows/make-pr-ready-for-review.yaml
+++ b/.github/workflows/make-pr-ready-for-review.yaml
@@ -0,0 +1,27 @@
+name: Mark PR as Ready When Conflicts Label is Removed
+
+on:
+  pull_request_target:
+    types:
+      - unlabeled
+
+env:
+  DEFAULT_BRANCH: 'master'
+
+jobs:
+  mark-ready:
+    if: github.event.label.name == 'conflicts'
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          repository: ${{ github.repository }}
+          ref: ${{ env.DEFAULT_BRANCH }}
+          token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
+          fetch-depth: 1
+      - name: Mark pull request as ready for review
+        run:  gh pr ready "${{ github.event.pull_request.number }}"
+        env:
+          GITHUB_TOKEN: ${{ secrets.AUTO_BACKPORT_TOKEN }}
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/2
+++ b/2
@@ -78,7 +78,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=2025.1.0-dev
+VERSION=2025.1.12

 if test -f version
 then
--- a/alternator/consumed_capacity.cc
+++ b/alternator/consumed_capacity.cc
@@ -24,7 +24,7 @@ static constexpr uint64_t KB = 1024ULL;
 static constexpr uint64_t RCU_BLOCK_SIZE_LENGTH = 4*KB;
 static constexpr uint64_t WCU_BLOCK_SIZE_LENGTH = 1*KB;

-static bool should_add_capacity(const rjson::value& request) {
+bool consumed_capacity_counter::should_add_capacity(const rjson::value& request) {
    const rjson::value* return_consumed = rjson::find(request, "ReturnConsumedCapacity");
    if (!return_consumed) {
        return false;
@@ -62,15 +62,22 @@ static uint64_t calculate_half_units(uint64_t unit_block_size, uint64_t total_by
 rcu_consumed_capacity_counter::rcu_consumed_capacity_counter(const rjson::value& request, bool is_quorum) :
        consumed_capacity_counter(should_add_capacity(request)),_is_quorum(is_quorum) {
 }
+uint64_t rcu_consumed_capacity_counter::get_half_units(uint64_t total_bytes, bool is_quorum) noexcept {
+    return calculate_half_units(RCU_BLOCK_SIZE_LENGTH, total_bytes, is_quorum);
+}

 uint64_t rcu_consumed_capacity_counter::get_half_units() const noexcept {
-    return calculate_half_units(RCU_BLOCK_SIZE_LENGTH, _total_bytes, _is_quorum);
+    return get_half_units(_total_bytes, _is_quorum);
 }

 uint64_t wcu_consumed_capacity_counter::get_half_units() const noexcept {
    return calculate_half_units(WCU_BLOCK_SIZE_LENGTH, _total_bytes, true);
 }

+uint64_t wcu_consumed_capacity_counter::get_units(uint64_t total_bytes) noexcept {
+    return calculate_half_units(WCU_BLOCK_SIZE_LENGTH, total_bytes, true) * HALF_UNIT_MULTIPLIER;
+}
+
 wcu_consumed_capacity_counter::wcu_consumed_capacity_counter(const rjson::value& request) :
        consumed_capacity_counter(should_add_capacity(request)) {
 }
--- a/alternator/consumed_capacity.hh
+++ b/alternator/consumed_capacity.hh
@@ -42,21 +42,25 @@ public:
     */
    virtual uint64_t get_half_units() const noexcept = 0;
    uint64_t _total_bytes = 0;
+    static bool should_add_capacity(const rjson::value& request);
 protected:
    bool _should_add_to_reponse = false;
 };

 class rcu_consumed_capacity_counter : public consumed_capacity_counter {
-    virtual uint64_t get_half_units() const noexcept;
    bool _is_quorum = false;
 public:
    rcu_consumed_capacity_counter(const rjson::value& request, bool is_quorum);
+    rcu_consumed_capacity_counter(): consumed_capacity_counter(false), _is_quorum(false){}
+    virtual uint64_t get_half_units() const noexcept;
+    static uint64_t get_half_units(uint64_t total_bytes, bool is_quorum) noexcept;
 };

 class wcu_consumed_capacity_counter : public consumed_capacity_counter {
    virtual uint64_t get_half_units() const noexcept;
 public:
    wcu_consumed_capacity_counter(const rjson::value& request);
+    static uint64_t get_units(uint64_t total_bytes) noexcept;
 };

 }
--- a/alternator/error.hh
+++ b/alternator/error.hh
@@ -88,6 +88,9 @@ public:
    static api_error table_not_found(std::string msg) {
        return api_error("TableNotFoundException", std::move(msg));
    }
+    static api_error limit_exceeded(std::string msg) {
+        return api_error("LimitExceededException", std::move(msg));
+    }
    static api_error internal(std::string msg) {
        return api_error("InternalServerError", std::move(msg), http::reply::status_type::internal_server_error);
    }
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -241,7 +241,8 @@ public:
        const query::partition_slice&& slice,
        shared_ptr<cql3::selection::selection> selection,
        foreign_ptr<lw_shared_ptr<query::result>> query_result,
-        shared_ptr<const std::optional<attrs_to_get>> attrs_to_get);
+        shared_ptr<const std::optional<attrs_to_get>> attrs_to_get,
+        uint64_t& rcu_half_units);

    static void describe_single_item(const cql3::selection::selection&,
        const std::vector<managed_bytes_opt>&,
--- a/alternator/expressions.g
+++ b/alternator/expressions.g
@@ -91,6 +91,18 @@ options {
        throw expressions_syntax_error(format("{} at char {}", err,
            ex->get_charPositionInLine()));
    }
+
+    // ANTLR3 tries to recover from missing tokens - it tries to finish parsing
+    // and create valid objects, as if the missing token were there.
+    // But it has a bug and leaks these tokens.
+    // We override the offending method and take ownership of the abandoned pointers.
+    std::vector<std::unique_ptr<TokenType>> _missing_tokens;
+    TokenType* getMissingSymbol(IntStreamType* istream, ExceptionBaseType* e,
+                                ANTLR_UINT32 expectedTokenType, BitsetListType* follow) {
+        auto token = BaseType::getMissingSymbol(istream, e, expectedTokenType, follow);
+        _missing_tokens.emplace_back(token);
+        return token;
+    }
 }
@lexer::context {
    void displayRecognitionError(ANTLR_UINT8** token_names, ExceptionBaseType* ex) {
--- a/alternator/extract_from_attrs.hh
+++ b/alternator/extract_from_attrs.hh
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2024-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#pragma once
+
+#include <string>
+#include <string_view>
+
+#include "utils/rjson.hh"
+#include "serialization.hh"
+#include "column_computation.hh"
+#include "db/view/regular_column_transformation.hh"
+
+namespace alternator {
+
+// An implementation of a "column_computation" which extracts a specific
+// non-key attribute from the big map (":attrs") of all non-key attributes,
+// and deserializes it if it has the desired type. GSI will use this computed
+// column as a materialized-view key when the view key attribute isn't a
+// full-fledged CQL column but rather stored in ":attrs".
+class extract_from_attrs_column_computation : public regular_column_transformation {
+    // The name of the CQL column holding the attribute map. It is a
+    // constant defined in executor.cc (as ":attrs"), so it doesn't need
+    // to be specified when constructing the column computation.
+    static const bytes MAP_NAME;
+    // The top-level attribute name to extract from the ":attrs" map.
+    bytes _attr_name;
+    // The type we expect for the value stored in the attribute. If the type
+    // matches the expected type, it is decoded (from the serialized format
+    // we store in the map's values) into the raw CQL type value that we use
+    // for keys, and returned by compute_value(). Only the types "S" (string),
+    // "B" (bytes) and "N" (number) are allowed as keys in DynamoDB, and
+    // therefore in _desired_type.
+    alternator_type _desired_type;
+public:
+    virtual column_computation_ptr clone() const override;
+    // TYPE_NAME is a unique string that distinguishes this class from other
+    // column_computation subclasses. column_computation::deserialize() will
+    // construct an object of this subclass if it sees a "type" TYPE_NAME.
+    static inline const std::string TYPE_NAME = "alternator_extract_from_attrs";
+    // Serialize the *definition* of this column computation into a JSON
+    // string with a unique "type" string - TYPE_NAME - which then causes
+    // column_computation::deserialize() to create an object from this class.
+    virtual bytes serialize() const override;
+    // Construct this object based on the previous output of serialize().
+    // Calls on_internal_error() if the string doesn't match the output format
+    // of serialize(). "type" is not checked, because column_computation::deserialize()
+    // won't call this constructor if "type" doesn't match.
+    extract_from_attrs_column_computation(const rjson::value &v);
+    extract_from_attrs_column_computation(bytes_view attr_name, alternator_type desired_type)
+        : _attr_name(attr_name), _desired_type(desired_type)
+        {}
+    // Implement regular_column_transformation's compute_value() that
+    // accepts the full row:
+    result compute_value(const schema& schema, const partition_key& key,
+        const db::view::clustering_or_static_row& row) const override;
+    // But do not implement column_computation's compute_value() that
+    // accepts only a partition key - that's not enough, so our implementation
+    // of this function calls on_internal_error().
+    bytes compute_value(const schema& schema, const partition_key& key) const override;
+    // This computed column does depend on a non-primary key column, so
+    // its result may change in the update and we need to compute it
+    // before and after the update.
+    virtual bool depends_on_non_primary_key_column() const override {
+        return true;
+    }
+};
+} // namespace alternator
--- a/alternator/serialization.cc
+++ b/alternator/serialization.cc
@@ -245,6 +245,27 @@ rjson::value deserialize_item(bytes_view bv) {
    return deserialized;
 }

+// This function takes a bytes_view created earlier by serialize_item(), and
+// if it has the type "expected_type", the function returns the value as a
+// raw Scylla type. If the type doesn't match, or if the view is empty,
+// returns an unset optional.
+// This function only supports the key types S (string), B (bytes) and N
+// (number) - serialize_item() serializes those types as a single-byte type
+// followed by the serialized raw Scylla type, so all this function needs to
+// do is remove the first byte. This makes it much more efficient than
+// deserialize_item() above because it avoids transformation to/from JSON.
+std::optional<bytes> serialized_value_if_type(bytes_view bv, alternator_type expected_type) {
+    if (bv.empty() || alternator_type(bv[0]) != expected_type) {
+        return std::nullopt;
+    }
+    // Currently, the output of serialize_item() for the types in
+    // alternator_type (notably S, B and N) is nothing more than Scylla's raw
+    // format for these types preceded by a type byte. So we just need to skip
+    // that byte and we are left with exactly what we need to return.
+    bv.remove_prefix(1);
+    return bytes(bv);
+}
+
 std::string type_to_string(data_type type) {
    static thread_local std::unordered_map<data_type, std::string> types = {
        {utf8_type, "S"},
--- a/alternator/serialization.hh
+++ b/alternator/serialization.hh
@@ -43,6 +43,7 @@ type_representation represent_type(alternator_type atype);

 bytes serialize_item(const rjson::value& item);
 rjson::value deserialize_item(bytes_view bv);
+std::optional<bytes> serialized_value_if_type(bytes_view bv, alternator_type expected_type);

 std::string type_to_string(data_type type);

--- a/alternator/stats.cc
+++ b/alternator/stats.cc
@@ -94,16 +94,16 @@ stats::stats() : api_operations{} {
                    seastar::metrics::description("number of rows read during filtering operations")),
            seastar::metrics::make_total_operations("filtered_rows_matched_total", cql_stats.filtered_rows_matched_total,
                    seastar::metrics::description("number of rows read and matched during filtering operations")),
-            seastar::metrics::make_counter("rcu_total", rcu_total,
-                    seastar::metrics::description("total number of consumed read units, counted as half units")).set_skip_when_empty(),
+            seastar::metrics::make_counter("rcu_total", [this]{return 0.5 * rcu_half_units_total;},
+                    seastar::metrics::description("total number of consumed read units")).set_skip_when_empty(),
            seastar::metrics::make_counter("wcu_total", wcu_total[wcu_types::PUT_ITEM],
-                    seastar::metrics::description("total number of consumed write units, counted as half units"),{op("PutItem")}).set_skip_when_empty(),
+                    seastar::metrics::description("total number of consumed write units"),{op("PutItem")}).set_skip_when_empty(),
            seastar::metrics::make_counter("wcu_total", wcu_total[wcu_types::DELETE_ITEM],
-                    seastar::metrics::description("total number of consumed write units, counted as half units"),{op("DeleteItem")}).set_skip_when_empty(),
+                    seastar::metrics::description("total number of consumed write units"),{op("DeleteItem")}).set_skip_when_empty(),
            seastar::metrics::make_counter("wcu_total", wcu_total[wcu_types::UPDATE_ITEM],
-                    seastar::metrics::description("total number of consumed write units, counted as half units"),{op("UpdateItem")}).set_skip_when_empty(),
+                    seastar::metrics::description("total number of consumed write units"),{op("UpdateItem")}).set_skip_when_empty(),
            seastar::metrics::make_counter("wcu_total", wcu_total[wcu_types::INDEX],
-                    seastar::metrics::description("total number of consumed write units, counted as half units"),{op("Index")}).set_skip_when_empty(),
+                    seastar::metrics::description("total number of consumed write units"),{op("Index")}).set_skip_when_empty(),
            seastar::metrics::make_total_operations("filtered_rows_dropped_total", [this] { return cql_stats.filtered_rows_read_total - cql_stats.filtered_rows_matched_total; },
                    seastar::metrics::description("number of rows read and dropped during filtering operations")),
            seastar::metrics::make_counter("batch_item_count", seastar::metrics::description("The total number of items processed across all batches"),{op("BatchWriteItem")},
--- a/alternator/stats.hh
+++ b/alternator/stats.hh
@@ -84,7 +84,7 @@ public:
    uint64_t shard_bounce_for_lwt = 0;
    uint64_t requests_blocked_memory = 0;
    uint64_t requests_shed = 0;
-    uint64_t rcu_total = 0;
+    uint64_t rcu_half_units_total = 0;
    // wcu can results from put, update, delete and index
    // Index related will be done on top of the operation it comes with
    enum wcu_types {
--- a/alternator/streams.cc
+++ b/alternator/streams.cc
@@ -808,6 +808,9 @@ future<executor::request_return_type> executor::get_records(client_state& client
    if (limit < 1) {
        throw api_error::validation("Limit must be 1 or more");
    }
+    if (limit > 1000) {
+        throw api_error::validation("Limit must be less than or equal to 1000");
+    }

    auto db = _proxy.data_dictionary();
    schema_ptr schema, base;
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -976,7 +976,7 @@
         ]
      },
      {
-         "path":"/storage_service/cleanup_all",
+         "path":"/storage_service/cleanup_all/",
         "operations":[
            {
               "method":"POST",
@@ -986,6 +986,30 @@
               "produces":[
                  "application/json"
               ],
+               "parameters":[
+                    {
+                     "name":"global",
+                     "description":"true if cleanup of entire cluster is requested",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"boolean",
+                     "paramType":"query"
+                  }
+               ]
+            }
+         ]
+      },
+      {
+         "path":"/storage_service/mark_node_as_clean",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Mark the node as clean. After that the node will not be considered as needing cleanup during automatic cleanup which is triggered by some topology operations",
+               "type":"void",
+               "nickname":"reset_cleanup_needed",
+               "produces":[
+                  "application/json"
+               ],
               "parameters":[]
            }
         ]
@@ -2836,7 +2860,7 @@
               "nickname":"repair_tablet",
               "method":"POST",
               "summary":"Repair a tablet",
-               "type":"void",
+               "type":"tablet_repair_result",
               "produces":[
                  "application/json"
               ],
@@ -2864,6 +2888,30 @@
                     "allowMultiple":false,
                     "type":"string",
                     "paramType":"query"
+                  },
+                  {
+                     "name":"hosts_filter",
+                     "description":"Repair replicas listed in the comma-separated host_id list.",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  },
+                  {
+                     "name":"dcs_filter",
+                     "description":"Repair replicas listed in the comma-separated DC list",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  },
+                  {
+                     "name":"await_completion",
+                     "description":"Set true to wait for the repair to complete. Set false to skip waiting for the repair to complete. When the option is not provided, it defaults to false.",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
                  }
               ]
            }
@@ -3037,6 +3085,22 @@
               ]
            }
         ]
+      },
+      {
+         "path":"/storage_service/raft_topology/cmd_rpc_status",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"Get information about currently running topology cmd rpc",
+               "type":"string",
+               "nickname":"raft_topology_get_cmd_status",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+               ]
+            }
+         ]
      }
   ],
   "models":{
@@ -3173,11 +3237,11 @@
         "properties":{
            "start_token":{
               "type":"string",
-               "description":"The range start token"
+               "description":"The range start token (exclusive)"
            },
            "end_token":{
               "type":"string",
-               "description":"The range start token"
+               "description":"The range end token (inclusive)"
            },
            "endpoints":{
               "type":"array",
@@ -3287,6 +3351,15 @@
                }
            }
        }
+      },
+      "tablet_repair_result":{
+        "id":"tablet_repair_result",
+        "description":"Tablet repair result",
+        "properties":{
+            "tablet_task_id":{
+                "type":"string"
+            }
+        }
      }
   }
 }
--- a/api/api-doc/task_manager.json
+++ b/api/api-doc/task_manager.json
@@ -253,6 +253,30 @@
               ]
            }
         ]
+      },
+      {
+         "path":"/task_manager/drain/{module}",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Drain finished local tasks",
+               "type":"void",
+               "nickname":"drain_tasks",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"module",
+                     "description":"The module to drain",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  }
+               ]
+            }
+         ]
      }
   ],
   "models":{
--- a/api/api-doc/tasks.json
+++ b/api/api-doc/tasks.json
@@ -42,6 +42,14 @@
                     "allowMultiple":false,
                     "type":"boolean",
                     "paramType":"query"
+                  },
+                  {
+                     "name":"consider_only_existing_data",
+                     "description":"Set to \"true\" to flush all memtables and force tombstone garbage collection to check only the sstables being compacted (false by default). The memtable, commitlog and other uncompacted sstables will not be checked during tombstone garbage collection.",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"boolean",
+                     "paramType":"query"
                  }
               ]
            }
--- a/api/cql_server_test.cc
+++ b/api/cql_server_test.cc
@@ -6,6 +6,8 @@
 * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
 */

+#include "build_mode.hh"
+
 #ifndef SCYLLA_BUILD_MODE_RELEASE

 #include <seastar/core/coroutine.hh>
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -12,11 +12,13 @@
 #include "api/api-doc/storage_service.json.hh"
 #include "api/api-doc/storage_proxy.json.hh"
 #include "api/scrub_status.hh"
+#include "api/tasks.hh"
 #include "db/config.hh"
 #include "db/schema_tables.hh"
 #include "utils/hash.hh"
 #include <optional>
 #include <sstream>
+#include <stdexcept>
 #include <time.h>
 #include <algorithm>
 #include <functional>
@@ -745,80 +747,33 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
            fmopt = flush_mode::skip;
        }
        auto task = co_await compaction_module.make_and_start_task<global_major_compaction_task_impl>({}, db, fmopt, consider_only_existing_data);
-        try {
-            co_await task->done();
-        } catch (...) {
-            apilog.error("force_compaction failed: {}", std::current_exception());
-            throw;
-        }
-
+        co_await task->done();
        co_return json_void();
    });

    ss::force_keyspace_compaction.set(r, [&ctx](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        auto& db = ctx.db;
-        auto params = req_params({
-            std::pair("keyspace", mandatory::yes),
-            std::pair("cf", mandatory::no),
-            std::pair("flush_memtables", mandatory::no),
-            std::pair("consider_only_existing_data", mandatory::no),
-        });
-        params.process(*req);
-        auto keyspace = validate_keyspace(ctx, *params.get("keyspace"));
-        auto table_infos = parse_table_infos(keyspace, ctx, params.get("cf").value_or(""));
-        auto flush = params.get_as<bool>("flush_memtables").value_or(true);
-        auto consider_only_existing_data = params.get_as<bool>("consider_only_existing_data").value_or(false);
-        apilog.info("force_keyspace_compaction: keyspace={} tables={}, flush={} consider_only_existing_data={}", keyspace, table_infos, flush, consider_only_existing_data);
-
-        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        std::optional<flush_mode> fmopt;
-        if (!flush && !consider_only_existing_data) {
-            fmopt = flush_mode::skip;
-        }
-        auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt, consider_only_existing_data);
-        try {
-            co_await task->done();
-        } catch (...) {
-            apilog.error("force_keyspace_compaction: keyspace={} tables={} failed: {}", task->get_status().keyspace, table_infos, std::current_exception());
-            throw;
-        }
-
+        auto task = co_await force_keyspace_compaction(ctx, std::move(req));
+        co_await task->done();
        co_return json_void();
    });

    ss::force_keyspace_cleanup.set(r, [&ctx, &ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        auto& db = ctx.db;
-        auto keyspace = validate_keyspace(ctx, req);
-        auto table_infos = parse_table_infos(keyspace, ctx, req->query_parameters, "cf");
-        const auto& rs = db.local().find_keyspace(keyspace).get_replication_strategy();
-        if (rs.get_type() == locator::replication_strategy_type::local || !rs.is_vnode_based()) {
-            auto reason = rs.get_type() == locator::replication_strategy_type::local ? "require" : "support";
-            apilog.info("Keyspace {} does not {} cleanup", keyspace, reason);
-            co_return json::json_return_type(0);
-        }
-        apilog.info("force_keyspace_cleanup: keyspace={} tables={}", keyspace, table_infos);
-        if (!co_await ss.local().is_cleanup_allowed(keyspace)) {
-            auto msg = "Can not perform cleanup operation when topology changes";
-            apilog.warn("force_keyspace_cleanup: keyspace={} tables={}: {}", keyspace, table_infos, msg);
-            co_await coroutine::return_exception(std::runtime_error(msg));
-        }
-
-        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        auto task = co_await compaction_module.make_and_start_task<cleanup_keyspace_compaction_task_impl>(
-            {}, std::move(keyspace), db, table_infos, flush_mode::all_tables, tasks::is_user_task::yes);
-        try {
+        auto task = co_await force_keyspace_cleanup(ctx, ss, std::move(req));
+        if (task) {
            co_await task->done();
-        } catch (...) {
-            apilog.error("force_keyspace_cleanup: keyspace={} tables={} failed: {}", task->get_status().keyspace, table_infos, std::current_exception());
-            throw;
        }
-
        co_return json::json_return_type(0);
    });

    ss::cleanup_all.set(r, [&ctx, &ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        apilog.info("cleanup_all");
-        auto done = co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<bool> {
+        bool global = true;
+        if (auto global_param = req->get_query_param("global"); !global_param.empty()) {
+            global = validate_bool(global_param);
+        }
+
+        apilog.info("cleanup_all global={}", global);
+
+        auto done = !global ? false : co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<bool> {
            if (!ss.is_topology_coordinator_enabled()) {
                co_return false;
            }
@@ -828,49 +783,45 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        if (done) {
            co_return json::json_return_type(0);
        }
-        // fall back to the local global cleanup if topology coordinator is not enabled
+        // fall back to the local cleanup if topology coordinator is not enabled or local cleanup is requested
        auto& db = ctx.db;
        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
        auto task = co_await compaction_module.make_and_start_task<global_cleanup_compaction_task_impl>({}, db);
-        try {
-            co_await task->done();
-        } catch (...) {
-            apilog.error("cleanup_all failed: {}", std::current_exception());
-            throw;
-        }
+        co_await task->done();
+
+        // Mark this node as clean
+        co_await ss.invoke_on(0, [] (service::storage_service& ss) -> future<> {
+            if (ss.is_topology_coordinator_enabled()) {
+                co_await ss.reset_cleanup_needed();
+            }
+        });
+
        co_return json::json_return_type(0);
    });

+    ss::reset_cleanup_needed.set(r, [&ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> { // REST handler: calls storage_service::reset_cleanup_needed() on shard 0 and replies with an empty JSON body
+        apilog.info("reset_cleanup_needed");
+        co_await ss.invoke_on(0, [] (service::storage_service& ss) { // both the precondition check and the reset run on shard 0
+            if (!ss.is_topology_coordinator_enabled()) {
+                throw std::runtime_error("reset_cleanup_needed is only supported when topology over raft is enabled"); // message names the operation the same way the endpoint and the log line do
+            }
+            return ss.reset_cleanup_needed();
+        });
+        co_return json_void();
+    });
+
    ss::perform_keyspace_offstrategy_compaction.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<http::request> req, sstring keyspace, std::vector<table_info> table_infos) -> future<json::json_return_type> {
        apilog.info("perform_keyspace_offstrategy_compaction: keyspace={} tables={}", keyspace, table_infos);
        bool res = false;
        auto& compaction_module = ctx.db.local().get_compaction_manager().get_task_manager_module();
        auto task = co_await compaction_module.make_and_start_task<offstrategy_keyspace_compaction_task_impl>({}, std::move(keyspace), ctx.db, table_infos, &res);
-        try {
-            co_await task->done();
-        } catch (...) {
-            apilog.error("perform_keyspace_offstrategy_compaction: keyspace={} tables={} failed: {}", task->get_status().keyspace, table_infos, std::current_exception());
-            throw;
-        }
-
+        co_await task->done();
        co_return json::json_return_type(res);
    }));

    ss::upgrade_sstables.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<http::request> req, sstring keyspace, std::vector<table_info> table_infos) -> future<json::json_return_type> {
-        auto& db = ctx.db;
-        bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);
-
-        apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, table_infos, exclude_current_version);
-
-        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        auto task = co_await compaction_module.make_and_start_task<upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, table_infos, exclude_current_version);
-        try {
-            co_await task->done();
-        } catch (...) {
-            apilog.error("upgrade_sstables: keyspace={} tables={} failed: {}", keyspace, table_infos, std::current_exception());
-            throw;
-        }
-
+        auto task = co_await upgrade_sstables(ctx, std::move(req), std::move(keyspace), std::move(table_infos));
+        co_await task->done();
        co_return json::json_return_type(0);
    }));

@@ -1475,6 +1426,16 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        co_return sstring(format("{}", ustate));
    });

+    ss::raft_topology_get_cmd_status.set(r, [&ss] (std::unique_ptr<http::request> req) -> future<json_return_type> { // REST handler: reports the result of get_topology_cmd_status() as a string
+        const auto status = co_await ss.invoke_on(0, [] (auto& ss) { // the status is read on shard 0
+            return ss.get_topology_cmd_status();
+        });
+        if (status.active_dst.empty()) { // an empty active_dst is reported as the literal string "none"
+            co_return sstring("none");
+        }
+        co_return sstring(fmt::format("{}[{}]: {}", status.current, status.index, fmt::join(status.active_dst, ","))); // layout: current[index]: comma-separated active_dst
+    });
+
    ss::move_tablet.set(r, [&ctx, &ss] (std::unique_ptr<http::request> req) -> future<json_return_type> {
        auto src_host_id = validate_host_id(req->get_query_param("src_host"));
        shard_id src_shard_id = validate_int(req->get_query_param("src_shard"));
@@ -1543,6 +1504,11 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        }
        auto ks = req->get_query_param("ks");
        auto table = req->get_query_param("table");
+        bool await_completion = false;
+        auto await = req->get_query_param("await_completion");
+        if (!await.empty()) {
+            await_completion = validate_bool(await);
+        }
        validate_table(ctx, ks, table);
        auto table_id = ctx.db.local().find_column_family(ks, table).schema()->id();
        std::variant<utils::chunked_vector<dht::token>, service::storage_service::all_tokens_tag> tokens_variant;
@@ -1551,8 +1517,22 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        } else {
            tokens_variant = tokens;
        }
+        auto hosts = req->get_query_param("hosts_filter");
+        auto dcs = req->get_query_param("dcs_filter");

-        auto res = co_await ss.local().add_repair_tablet_request(table_id, tokens_variant);
+        std::unordered_set<locator::host_id> hosts_filter;
+        if (!hosts.empty()) {
+            std::string delim = ",";
+            hosts_filter = std::ranges::views::split(hosts, delim) | std::views::transform([](auto&& h) {
+                try {
+                    return locator::host_id(utils::UUID(std::string_view{h}));
+                } catch (...) {
+                    throw httpd::bad_param_exception(fmt::format("Wrong host_id format {}", h));
+                }
+            }) | std::ranges::to<std::unordered_set>();
+        }
+        auto dcs_filter = locator::tablet_task_info::deserialize_repair_dcs_filter(dcs);
+        auto res = co_await ss.local().add_repair_tablet_request(table_id, tokens_variant, hosts_filter, dcs_filter, await_completion);
        co_return json::json_return_type(res);
    });

@@ -1598,6 +1578,7 @@ void unset_storage_service(http_context& ctx, routes& r) {
    ss::force_keyspace_compaction.unset(r);
    ss::force_keyspace_cleanup.unset(r);
    ss::cleanup_all.unset(r);
+    ss::reset_cleanup_needed.unset(r);
    ss::perform_keyspace_offstrategy_compaction.unset(r);
    ss::upgrade_sstables.unset(r);
    ss::force_flush.unset(r);
@@ -1653,6 +1634,7 @@ void unset_storage_service(http_context& ctx, routes& r) {
    ss::reload_raft_topology_state.unset(r);
    ss::upgrade_to_raft_topology.unset(r);
    ss::raft_topology_upgrade_status.unset(r);
+    ss::raft_topology_get_cmd_status.unset(r);
    ss::move_tablet.unset(r);
    ss::add_tablet_replica.unset(r);
    ss::del_tablet_replica.unset(r);
--- a/api/task_manager.cc
+++ b/api/task_manager.cc
@@ -232,6 +232,32 @@ void set_task_manager(http_context& ctx, routes& r, sharded<tasks::task_manager>
        uint32_t user_ttl = cfg.user_task_ttl_seconds();
        co_return json::json_return_type(user_ttl);
    });
+
+    tm::drain_tasks.set(r, [&tm] (std::unique_ptr<http::request> req) -> future<json::json_return_type> { // REST handler: on every shard, unregisters the completed local tasks of the module named by the "module" path parameter
+        co_await tm.invoke_on_all([&req] (tasks::task_manager& tm) -> future<> { // req is captured by reference and read on each shard
+            tasks::task_manager::module_ptr module;
+            try {
+                module = tm.find_module(req->get_path_param("module"));
+            } catch (...) {
+                throw bad_param_exception(fmt::format("{}", std::current_exception())); // any failure of the lookup is re-thrown as bad_param_exception carrying the original error text
+            }
+
+            const auto& local_tasks = module->get_local_tasks();
+            std::vector<tasks::task_id> ids; // ids are collected first; unregistering happens in the separate loop below
+            ids.reserve(local_tasks.size());
+            std::transform(begin(local_tasks), end(local_tasks), std::back_inserter(ids), [] (const auto& task) {
+                return task.second->is_complete() ? task.first : tasks::task_id::create_null_id(); // tasks that are not complete are recorded as a null id
+            });
+
+            for (auto&& id : ids) {
+                if (id) { // presumably false for the null ids recorded above, so incomplete tasks are left registered - confirm task_id's bool conversion
+                    module->unregister_task(id);
+                }
+                co_await maybe_yield();
+            }
+        });
+        co_return json_void();
+    });
 }

 void unset_task_manager(http_context& ctx, routes& r) {
@@ -243,6 +269,7 @@ void unset_task_manager(http_context& ctx, routes& r) {
    tm::get_task_status_recursively.unset(r);
    tm::get_and_update_ttl.unset(r);
    tm::get_ttl.unset(r);
+    tm::drain_tasks.unset(r);
 }

 }
--- a/api/task_manager_test.cc
+++ b/api/task_manager_test.cc
@@ -6,6 +6,9 @@
 * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
 */

+
+#include "build_mode.hh"
+
 #ifndef SCYLLA_BUILD_MODE_RELEASE

 #include <seastar/core/coroutine.hh>
--- a/api/tasks.cc
+++ b/api/tasks.cc
@@ -37,45 +37,74 @@ static auto wrap_ks_cf(http_context &ctx, ks_cf_func f) {
    };
 }

+future<tasks::task_manager::task_ptr> force_keyspace_compaction(http_context& ctx, std::unique_ptr<http::request> req) { // Parses the request and starts a major_keyspace_compaction_task_impl; returns the future of make_and_start_task and does not wait on task->done()
+    auto& db = ctx.db;
+    auto params = req_params({ // "keyspace" is the only mandatory parameter
+        std::pair("keyspace", mandatory::yes),
+        std::pair("cf", mandatory::no),
+        std::pair("flush_memtables", mandatory::no),
+        std::pair("consider_only_existing_data", mandatory::no),
+    });
+    params.process(*req);
+    auto keyspace = validate_keyspace(ctx, *params.get("keyspace"));
+    auto table_infos = parse_table_infos(keyspace, ctx, params.get("cf").value_or("")); // a missing "cf" is passed on as an empty string
+    auto flush = params.get_as<bool>("flush_memtables").value_or(true); // true when the parameter is absent
+    auto consider_only_existing_data = params.get_as<bool>("consider_only_existing_data").value_or(false); // false when the parameter is absent
+    apilog.info("force_keyspace_compaction: keyspace={} tables={}, flush={} consider_only_existing_data={}", keyspace, table_infos, flush, consider_only_existing_data);
+
+    auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
+    std::optional<compaction::flush_mode> fmopt; // left empty unless flushing is explicitly skipped below
+    if (!flush && !consider_only_existing_data) { // flush_mode::skip is chosen only when flush is off and consider_only_existing_data is not requested
+        fmopt = compaction::flush_mode::skip;
+    }
+    return compaction_module.make_and_start_task<compaction::major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt, consider_only_existing_data);
+}
+
+future<tasks::task_manager::task_ptr> upgrade_sstables(http_context& ctx, std::unique_ptr<http::request> req, sstring keyspace, std::vector<table_info> table_infos) { // Starts an upgrade_sstables_compaction_task_impl for the given keyspace/tables; returns the future of make_and_start_task and does not wait on task->done()
+    auto& db = ctx.db;
+    bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false); // false is passed as the fallback value for the parameter
+
+    apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, table_infos, exclude_current_version);
+
+    auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
+    return compaction_module.make_and_start_task<compaction::upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, table_infos, exclude_current_version);
+}
+
+future<tasks::task_manager::task_ptr> force_keyspace_cleanup(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) { // Validates the request and starts a cleanup_keyspace_compaction_task_impl; resolves to nullptr when no task is started, so callers must null-check the result
+    auto& db = ctx.db;
+    auto keyspace = validate_keyspace(ctx, req);
+    auto table_infos = parse_table_infos(keyspace, ctx, req->query_parameters, "cf");
+    const auto& rs = db.local().find_keyspace(keyspace).get_replication_strategy();
+    if (rs.get_type() == locator::replication_strategy_type::local || !rs.is_vnode_based()) { // no task for keyspaces with the local strategy or a strategy that is not vnode-based
+        auto reason = rs.get_type() == locator::replication_strategy_type::local ? "require" : "support"; // picks the verb for the log line: "does not require" vs "does not support"
+        apilog.info("Keyspace {} does not {} cleanup", keyspace, reason);
+        co_return nullptr;
+    }
+    apilog.info("force_keyspace_cleanup: keyspace={} tables={}", keyspace, table_infos);
+    if (!co_await ss.local().is_cleanup_allowed(keyspace)) { // when cleanup is not allowed the returned future fails with std::runtime_error
+        auto msg = "Can not perform cleanup operation when topology changes";
+        apilog.warn("force_keyspace_cleanup: keyspace={} tables={}: {}", keyspace, table_infos, msg);
+        co_await coroutine::return_exception(std::runtime_error(msg));
+    }
+
+    auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
+    co_return co_await compaction_module.make_and_start_task<compaction::cleanup_keyspace_compaction_task_impl>( // the task is started with flush_mode::all_tables and marked as a user task
+        {}, std::move(keyspace), db, table_infos, compaction::flush_mode::all_tables, tasks::is_user_task::yes);
+}
+
 void set_tasks_compaction_module(http_context& ctx, routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& snap_ctl) {
    t::force_keyspace_compaction_async.set(r, [&ctx](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        auto& db = ctx.db;
-        auto params = req_params({
-            std::pair("keyspace", mandatory::yes),
-            std::pair("cf", mandatory::no),
-            std::pair("flush_memtables", mandatory::no),
-        });
-        params.process(*req);
-        auto keyspace = validate_keyspace(ctx, *params.get("keyspace"));
-        auto table_infos = parse_table_infos(keyspace, ctx, params.get("cf").value_or(""));
-        auto flush = params.get_as<bool>("flush_memtables").value_or(true);
-        apilog.debug("force_keyspace_compaction_async: keyspace={} tables={}, flush={}", keyspace, table_infos, flush);
-
-        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        std::optional<flush_mode> fmopt;
-        if (!flush) {
-            fmopt = flush_mode::skip;
-        }
-        auto task = co_await compaction_module.make_and_start_task<major_keyspace_compaction_task_impl>({}, std::move(keyspace), tasks::task_id::create_null_id(), db, table_infos, fmopt);
-
+        auto task = co_await force_keyspace_compaction(ctx, std::move(req));
        co_return json::json_return_type(task->get_status().id.to_sstring());
    });

    t::force_keyspace_cleanup_async.set(r, [&ctx, &ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
-        auto& db = ctx.db;
-        auto keyspace = validate_keyspace(ctx, req);
-        auto table_infos = parse_table_infos(keyspace, ctx, req->query_parameters, "cf");
-        apilog.info("force_keyspace_cleanup_async: keyspace={} tables={}", keyspace, table_infos);
-        if (!co_await ss.local().is_cleanup_allowed(keyspace)) {
-            auto msg = "Can not perform cleanup operation when topology changes";
-            apilog.warn("force_keyspace_cleanup_async: keyspace={} tables={}: {}", keyspace, table_infos, msg);
-            co_await coroutine::return_exception(std::runtime_error(msg));
+        tasks::task_id id = tasks::task_id::create_null_id();
+        auto task = co_await force_keyspace_cleanup(ctx, ss, std::move(req));
+        if (task) {
+            id = task->get_status().id;
        }
-
-        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        auto task = co_await compaction_module.make_and_start_task<cleanup_keyspace_compaction_task_impl>({}, std::move(keyspace), db, table_infos, flush_mode::all_tables, tasks::is_user_task::yes);
-
-        co_return json::json_return_type(task->get_status().id.to_sstring());
+        co_return json::json_return_type(id.to_sstring());
    });

    t::perform_keyspace_offstrategy_compaction_async.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<http::request> req, sstring keyspace, std::vector<table_info> table_infos) -> future<json::json_return_type> {
@@ -87,14 +116,7 @@ void set_tasks_compaction_module(http_context& ctx, routes& r, sharded<service::
    }));

    t::upgrade_sstables_async.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<http::request> req, sstring keyspace, std::vector<table_info> table_infos) -> future<json::json_return_type> {
-        auto& db = ctx.db;
-        bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);
-
-        apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, table_infos, exclude_current_version);
-
-        auto& compaction_module = db.local().get_compaction_manager().get_task_manager_module();
-        auto task = co_await compaction_module.make_and_start_task<upgrade_sstables_compaction_task_impl>({}, std::move(keyspace), db, table_infos, exclude_current_version);
-
+        auto task = co_await upgrade_sstables(ctx, std::move(req), std::move(keyspace), std::move(table_infos));
        co_return json::json_return_type(task->get_status().id.to_sstring());
    }));

--- a/api/tasks.hh
+++ b/api/tasks.hh
@@ -15,6 +15,10 @@ namespace seastar::httpd {
 class routes;
 }

+namespace seastar::http {
+struct request;
+}
+
 namespace service {
 class storage_service;
 }
@@ -25,4 +29,8 @@ struct http_context;
 void set_tasks_compaction_module(http_context& ctx, httpd::routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& snap_ctl);
 void unset_tasks_compaction_module(http_context& ctx, httpd::routes& r);

+future<tasks::task_manager::task_ptr> force_keyspace_compaction(http_context& ctx, std::unique_ptr<http::request> req);
+future<tasks::task_manager::task_ptr> force_keyspace_cleanup(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req);
+future<tasks::task_manager::task_ptr> upgrade_sstables(http_context& ctx, std::unique_ptr<http::request> req, sstring keyspace, std::vector<table_info> table_infos);
+
 }
--- a/api/token_metadata.cc
+++ b/api/token_metadata.cc
@@ -74,6 +74,9 @@ void set_token_metadata(http_context& ctx, routes& r, sharded<locator::shared_to
    });

    ss::get_host_id_map.set(r, [&tm, &g](const_req req) {
+        if (!g.local().is_enabled()) {
+            throw std::runtime_error("The gossiper is not ready yet");
+        }
        std::vector<ss::mapper> res;
        auto map = tm.local().get()->get_host_ids() |
            std::views::transform([&g] (locator::host_id id) { return std::make_pair(g.local().get_address_map().get(id), id); }) |
--- a/audit/audit_syslog_storage_helper.cc
+++ b/audit/audit_syslog_storage_helper.cc
@@ -33,20 +33,6 @@ namespace audit {

 namespace {

-future<> syslog_send_helper(net::datagram_channel& sender,
-                            const socket_address& address,
-                            const sstring& msg) {
-    return sender.send(address, net::packet{msg.data(), msg.size()}).handle_exception([address](auto&& exception_ptr) {
-        auto error_msg = seastar::format(
-            "Syslog audit backend failed (sending a message to {} resulted in {}).",
-            address,
-            exception_ptr
-        );
-        logger.error("{}", error_msg);
-        throw audit_exception(std::move(error_msg));
-    });
-}
-
 static auto syslog_address_helper(const db::config& cfg)
 {
    return cfg.audit_unix_socket_path.is_set()
@@ -56,9 +42,26 @@ static auto syslog_address_helper(const db::config& cfg)

 }

+future<> audit_syslog_storage_helper::syslog_send_helper(const sstring& msg) { // Sends msg as one datagram to _syslog_address while holding a unit of _semaphore; std::exception failures are logged and re-thrown as audit_exception
+    try {
+        auto lock = co_await get_units(_semaphore, 1, std::chrono::hours(1)); // waits at most one hour for a unit; the unit is held until the end of this try block
+        co_await _sender.send(_syslog_address, net::packet{msg.data(), msg.size()});
+    }
+    catch (const std::exception& e) { // only exceptions derived from std::exception are translated; any other exception propagates unchanged
+        auto error_msg = seastar::format(
+            "Syslog audit backend failed (sending a message to {} resulted in {}).",
+            _syslog_address,
+            e
+        );
+        logger.error("{}", error_msg);
+        throw audit_exception(std::move(error_msg));
+    }
+}
+
 audit_syslog_storage_helper::audit_syslog_storage_helper(cql3::query_processor& qp, service::migration_manager&) :
    _syslog_address(syslog_address_helper(qp.db().get_config())),
-    _sender(make_unbound_datagram_channel(AF_UNIX)) {
+    _sender(make_unbound_datagram_channel(AF_UNIX)),
+    _semaphore(1) {
 }

 audit_syslog_storage_helper::~audit_syslog_storage_helper() {
@@ -73,10 +76,10 @@ audit_syslog_storage_helper::~audit_syslog_storage_helper() {
 */
 future<> audit_syslog_storage_helper::start(const db::config& cfg) {
    if (this_shard_id() != 0) {
-        return make_ready_future();
+        co_return;
    }

-    return syslog_send_helper(_sender, _syslog_address, "Initializing syslog audit backend.");
+    co_await syslog_send_helper("Initializing syslog audit backend.");
 }

 future<> audit_syslog_storage_helper::stop() {
@@ -106,7 +109,7 @@ future<> audit_syslog_storage_helper::write(const audit_info* audit_info,
                                    audit_info->table(),
                                    username);

-    return syslog_send_helper(_sender, _syslog_address, msg);
+    co_await syslog_send_helper(msg);
 }

 future<> audit_syslog_storage_helper::write_login(const sstring& username,
@@ -125,7 +128,7 @@ future<> audit_syslog_storage_helper::write_login(const sstring& username,
                                    username,
                                    (error ? "true" : "false"));

-    co_await syslog_send_helper(_sender, _syslog_address, msg.c_str());
+    co_await syslog_send_helper(msg.c_str());
 }

 using registry = class_registrator<storage_helper, audit_syslog_storage_helper, cql3::query_processor&, service::migration_manager&>;
--- a/audit/audit_syslog_storage_helper.hh
+++ b/audit/audit_syslog_storage_helper.hh
@@ -24,6 +24,9 @@ namespace audit {
 class audit_syslog_storage_helper : public storage_helper {
    socket_address _syslog_address;
    net::datagram_channel _sender;
+    seastar::semaphore _semaphore;
+
+    future<> syslog_send_helper(const sstring& msg);
 public:
    explicit audit_syslog_storage_helper(cql3::query_processor&, service::migration_manager&);
    virtual ~audit_syslog_storage_helper();
--- a/auth/common.cc
+++ b/auth/common.cc
@@ -119,6 +119,11 @@ future<> create_legacy_metadata_table_if_missing(
    return qs;
 }

+::service::raft_timeout get_raft_timeout() noexcept { // Builds a raft_timeout whose value is the current lowres_clock time plus the configured other_timeout
+    auto dur = internal_distributed_query_state().get_client_state().get_timeout_config().other_timeout; // duration comes from the timeout config of the internal distributed query state
+    return ::service::raft_timeout{.value = lowres_clock::now() + dur}; // value is an absolute time point, not a duration
+}
+
 static future<> announce_mutations_with_guard(
        ::service::raft_group0_client& group0_client,
        std::vector<canonical_mutation> muts,
--- a/auth/common.hh
+++ b/auth/common.hh
@@ -17,6 +17,7 @@

 #include "types/types.hh"
 #include "service/raft/raft_group0_client.hh"
+#include "timeout_config.hh"

 using namespace std::chrono_literals;

@@ -77,6 +78,8 @@ future<> create_legacy_metadata_table_if_missing(
 ///
 ::service::query_state& internal_distributed_query_state() noexcept;

+::service::raft_timeout get_raft_timeout() noexcept;
+
 // Execute update query via group0 mechanism, mutations will be applied on all nodes.
 // Use this function when need to perform read before write on a single guard or if
 // you have more than one mutation and potentially exceed single command size limit.
--- a/auth/ldap_role_manager.cc
+++ b/auth/ldap_role_manager.cc
@@ -233,9 +233,9 @@ future<role_set> ldap_role_manager::query_granted(std::string_view grantee_name,
 }

 future<role_to_directly_granted_map>
-ldap_role_manager::query_all_directly_granted() {
+ldap_role_manager::query_all_directly_granted(::service::query_state& qs) {
    role_to_directly_granted_map result;
-    auto roles = co_await query_all();
+    auto roles = co_await query_all(qs);
    for (auto& role: roles) {
        auto granted_set = co_await query_granted(role, recursive_role_query::no);
        for (auto& granted: granted_set) {
@@ -247,8 +247,8 @@ ldap_role_manager::query_all_directly_granted() {
    co_return result;
 }

-future<role_set> ldap_role_manager::query_all() {
-    return _std_mgr.query_all();
+future<role_set> ldap_role_manager::query_all(::service::query_state& qs) {
+    return _std_mgr.query_all(qs);
 }

 future<> ldap_role_manager::create_role(std::string_view role_name) {
@@ -311,12 +311,12 @@ future<bool> ldap_role_manager::can_login(std::string_view role_name) {
 }

 future<std::optional<sstring>> ldap_role_manager::get_attribute(
-        std::string_view role_name, std::string_view attribute_name) {
-    return _std_mgr.get_attribute(role_name, attribute_name);
+        std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
+    return _std_mgr.get_attribute(role_name, attribute_name, qs);
 }

-future<role_manager::attribute_vals> ldap_role_manager::query_attribute_for_all(std::string_view attribute_name) {
-    return _std_mgr.query_attribute_for_all(attribute_name);
+future<role_manager::attribute_vals> ldap_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state& qs) {
+    return _std_mgr.query_attribute_for_all(attribute_name, qs);
 }

 future<> ldap_role_manager::set_attribute(
@@ -338,8 +338,7 @@ future<std::vector<cql3::description>> ldap_role_manager::describe_role_grants()
 }

 future<> ldap_role_manager::ensure_superuser_is_created() {
-    // ldap is responsible for users
-    co_return;
+    return _std_mgr.ensure_superuser_is_created();
 }

 } // namespace auth
--- a/auth/ldap_role_manager.hh
+++ b/auth/ldap_role_manager.hh
@@ -77,9 +77,9 @@ class ldap_role_manager : public role_manager {

    future<role_set> query_granted(std::string_view, recursive_role_query) override;

-    future<role_to_directly_granted_map> query_all_directly_granted() override;
+    future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state&) override;

-    future<role_set> query_all() override;
+    future<role_set> query_all(::service::query_state&) override;

    future<bool> exists(std::string_view) override;

@@ -87,9 +87,9 @@ class ldap_role_manager : public role_manager {

    future<bool> can_login(std::string_view) override;

-    future<std::optional<sstring>> get_attribute(std::string_view, std::string_view) override;
+    future<std::optional<sstring>> get_attribute(std::string_view, std::string_view, ::service::query_state&) override;

-    future<role_manager::attribute_vals> query_attribute_for_all(std::string_view) override;
+    future<role_manager::attribute_vals> query_attribute_for_all(std::string_view, ::service::query_state&) override;

    future<> set_attribute(std::string_view, std::string_view, std::string_view, ::service::group0_batch& mc) override;

--- a/auth/maintenance_socket_role_manager.cc
+++ b/auth/maintenance_socket_role_manager.cc
@@ -78,11 +78,11 @@ future<role_set> maintenance_socket_role_manager::query_granted(std::string_view
    return operation_not_supported_exception<role_set>("QUERY GRANTED");
 }

-future<role_to_directly_granted_map> maintenance_socket_role_manager::query_all_directly_granted() {
+future<role_to_directly_granted_map> maintenance_socket_role_manager::query_all_directly_granted(::service::query_state&) {
    return operation_not_supported_exception<role_to_directly_granted_map>("QUERY ALL DIRECTLY GRANTED");
 }

-future<role_set> maintenance_socket_role_manager::query_all() {
+future<role_set> maintenance_socket_role_manager::query_all(::service::query_state&) {
    return operation_not_supported_exception<role_set>("QUERY ALL");
 }

@@ -98,11 +98,11 @@ future<bool> maintenance_socket_role_manager::can_login(std::string_view role_na
    return make_ready_future<bool>(true);
 }

-future<std::optional<sstring>> maintenance_socket_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name) {
+future<std::optional<sstring>> maintenance_socket_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state&) {
    return operation_not_supported_exception<std::optional<sstring>>("GET ATTRIBUTE");
 }

-future<role_manager::attribute_vals> maintenance_socket_role_manager::query_attribute_for_all(std::string_view attribute_name) {
+future<role_manager::attribute_vals> maintenance_socket_role_manager::query_attribute_for_all(std::string_view attribute_name, ::service::query_state&) {
    return operation_not_supported_exception<role_manager::attribute_vals>("QUERY ATTRIBUTE");
 }

--- a/auth/maintenance_socket_role_manager.hh
+++ b/auth/maintenance_socket_role_manager.hh
@@ -53,9 +53,9 @@ public:

    virtual future<role_set> query_granted(std::string_view grantee_name, recursive_role_query) override;

-    virtual future<role_to_directly_granted_map> query_all_directly_granted() override;
+    virtual future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state&) override;

-    virtual future<role_set> query_all() override;
+    virtual future<role_set> query_all(::service::query_state&) override;

    virtual future<bool> exists(std::string_view role_name) override;

@@ -63,9 +63,9 @@ public:

    virtual future<bool> can_login(std::string_view role_name) override;

-    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name) override;
+    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state&) override;

-    virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name) override;
+    virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name, ::service::query_state&) override;

    virtual future<> set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) override;

--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -117,7 +117,8 @@ future<> password_authenticator::migrate_legacy_metadata() const {
    });
 }

-future<> password_authenticator::create_default_if_missing() {
+future<> password_authenticator::legacy_create_default_if_missing() {
+    SCYLLA_ASSERT(legacy_mode(_qp));
    const auto exists = co_await default_role_row_satisfies(_qp, &has_salted_hash, _superuser);
    if (exists) {
        co_return;
@@ -127,18 +128,75 @@ future<> password_authenticator::create_default_if_missing() {
        salted_pwd = passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt);
    }
    const auto query = update_row_query();
-    if (legacy_mode(_qp)) {
-        co_await _qp.execute_internal(
+    co_await _qp.execute_internal(
            query,
            db::consistency_level::QUORUM,
            internal_distributed_query_state(),
            {salted_pwd, _superuser},
            cql3::query_processor::cache_internal::no);
-        plogger.info("Created default superuser authentication record.");
-    } else {
-        co_await announce_mutations(_qp, _group0_client, query,
-            {salted_pwd, _superuser}, _as, ::service::raft_timeout{});
-        plogger.info("Created default superuser authentication record.");
+    plogger.info("Created default superuser authentication record.");
+}
+
+future<> password_authenticator::maybe_create_default_password() {
+    auto needs_password = [this] () -> future<bool> {
+        const sstring query = seastar::format("SELECT * FROM {}.{} WHERE is_superuser = true ALLOW FILTERING", get_auth_ks_name(_qp), meta::roles_table::name);
+        auto results = co_await _qp.execute_internal(query,
+                db::consistency_level::LOCAL_ONE,
+                internal_distributed_query_state(), cql3::query_processor::cache_internal::yes);
+        // Don't add default password if
+        // - there is no default superuser
+        // - there is a superuser with a password.
+        bool has_default = false;
+        bool has_superuser_with_password = false;
+        for (auto& result : *results) {
+            if (result.get_as<sstring>(meta::roles_table::role_col_name) == _superuser) {
+                has_default = true;
+            }
+            if (has_salted_hash(result)) {
+                has_superuser_with_password = true;
+            }
+        }
+        co_return has_default && !has_superuser_with_password;
+    };
+    if (!co_await needs_password()) {
+        co_return;
+    }
+    // Start the group0 operation only now, so that the common case (nothing
+    // to create) does not require a quorum.
+    ::service::group0_batch batch(
+            co_await _group0_client.start_operation(_as, get_raft_timeout()));
+    // Check again as the state may have changed before we took the guard (batch).
+    if (!co_await needs_password()) {
+        co_return;
+    }
+    // Set default superuser's password.
+    std::string salted_pwd(get_config_value(_qp.db().get_config().auth_superuser_salted_password(), ""));
+    if (salted_pwd.empty()) {
+        salted_pwd = passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt);
+    }
+    const auto update_query = update_row_query();
+    co_await collect_mutations(_qp, batch, update_query, {salted_pwd, _superuser});
+    co_await std::move(batch).commit(_group0_client, _as, get_raft_timeout());
+    plogger.info("Created default superuser authentication record.");
+}
+
+future<> password_authenticator::maybe_create_default_password_with_retries() {
+    size_t retries = _migration_manager.get_concurrent_ddl_retries();
+    while (true)  {
+        try {
+            co_return co_await maybe_create_default_password();
+        } catch (const ::service::group0_concurrent_modification& ex) {
+            plogger.warn("Failed to execute maybe_create_default_password due to guard conflict.{}.", retries ? " Retrying" : " Number of retries exceeded, giving up");
+            if (retries--) {
+                continue;
+            }
+            // Log error but don't crash the whole node startup sequence.
+            plogger.error("Failed to create default superuser password due to guard conflict.");
+            co_return;
+        } catch (const ::service::raft_operation_timeout_error& ex) {
+            plogger.error("Failed to create default superuser password due to exception: {}", ex.what());
+            co_return;
+        }
    }
 }

@@ -161,8 +219,13 @@ future<> password_authenticator::start() {
                        migrate_legacy_metadata().get();
                        return;
                    }
+                    legacy_create_default_if_missing().get();
                }
-                create_default_if_missing().get();
+                // Legacy mode is fully handled by the branch above; the
+                // group0-based creation applies to non-legacy mode only.
+                if (!legacy_mode(_qp)) {
+                    maybe_create_default_password_with_retries().get();
+                }
            });
        });

--- a/auth/password_authenticator.hh
+++ b/auth/password_authenticator.hh
@@ -40,7 +40,7 @@ class password_authenticator : public authenticator {
    ::service::migration_manager& _migration_manager;
    future<> _stopped;
    abort_source _as;
-    std::string _superuser;
+    std::string _superuser; // default superuser name from the config (may or may not be present in roles table)

 public:
    static db::consistency_level consistency_for_user(std::string_view role_name);
@@ -85,7 +85,10 @@ private:

    future<> migrate_legacy_metadata() const;

-    future<> create_default_if_missing();
+    future<> legacy_create_default_if_missing();
+
+    future<> maybe_create_default_password();
+    future<> maybe_create_default_password_with_retries();

    sstring update_row_query() const;
 };
--- a/auth/role_manager.hh
+++ b/auth/role_manager.hh
@@ -17,12 +17,17 @@
 #include <seastar/core/format.hh>
 #include <seastar/core/sstring.hh>

+#include "auth/common.hh"
 #include "auth/resource.hh"
 #include "cql3/description.hh"
 #include "seastarx.hh"
 #include "exceptions/exceptions.hh"
 #include "service/raft/raft_group0_client.hh"

+namespace service {
+class query_state;
+}
+
 namespace auth {

 struct role_config final {
@@ -167,9 +172,9 @@ public:
    ///   (role2, role3)
    /// }
    ///  
-    virtual future<role_to_directly_granted_map> query_all_directly_granted() = 0;
+    virtual future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state& = internal_distributed_query_state()) = 0;

-    virtual future<role_set> query_all() = 0;
+    virtual future<role_set> query_all(::service::query_state& = internal_distributed_query_state()) = 0;

    virtual future<bool> exists(std::string_view role_name) = 0;

@@ -186,12 +191,12 @@ public:
    ///
    /// \returns the value of the named attribute, if one is set.
    ///
-    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name) = 0;
+    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& = internal_distributed_query_state()) = 0;

    ///
    /// \returns a mapping of each role's value for the named attribute, if one is set for the role.
    ///
-    virtual future<attribute_vals> query_attribute_for_all(std::string_view attribute_name) = 0;
+    virtual future<attribute_vals> query_attribute_for_all(std::string_view attribute_name, ::service::query_state& = internal_distributed_query_state()) = 0;

    /// Sets `attribute_name` with `attribute_value` for `role_name`.
    /// \returns an exceptional future with nonexistant_role if the role does not exist.
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -240,6 +240,13 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s
        });
    }
    co_await _role_manager->start();
+    if (this_shard_id() == 0) {
+        // Role manager and password authenticator have this odd startup
+        // mechanism where they asynchronously create the superuser role
+        // in the background. Correct password creation depends on role
+        // creation therefore we need to wait here.
+        co_await _role_manager->ensure_superuser_is_created();
+    }
    co_await when_all_succeed(_authorizer->start(), _authenticator->start()).discard_result();
    _permissions_cache = std::make_unique<permissions_cache>(_loading_cache_config, *this, log);
    co_await once_among_shards([this] {
--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -9,6 +9,7 @@
 #include "auth/standard_role_manager.hh"

 #include <optional>
+#include <stdexcept>
 #include <unordered_set>
 #include <vector>

@@ -28,6 +29,7 @@
 #include "cql3/util.hh"
 #include "db/consistency_level_type.hh"
 #include "exceptions/exceptions.hh"
+#include "utils/error_injection.hh"
 #include "utils/log.hh"
 #include <seastar/core/loop.hh>
 #include <seastar/coroutine/maybe_yield.hh>
@@ -178,7 +180,8 @@ future<> standard_role_manager::create_legacy_metadata_tables_if_missing() const
                    _migration_manager)).discard_result();
 }

-future<> standard_role_manager::create_default_role_if_missing() {
+future<> standard_role_manager::legacy_create_default_role_if_missing() {
+    SCYLLA_ASSERT(legacy_mode(_qp));
    try {
        const auto exists = co_await default_role_row_satisfies(_qp, &has_can_login, _superuser);
        if (exists) {
@@ -188,16 +191,12 @@ future<> standard_role_manager::create_default_role_if_missing() {
                get_auth_ks_name(_qp),
                meta::roles_table::name,
                meta::roles_table::role_col_name);
-        if (legacy_mode(_qp)) {
-            co_await _qp.execute_internal(
-                    query,
-                    db::consistency_level::QUORUM,
-                    internal_distributed_query_state(),
-                    {_superuser},
-                    cql3::query_processor::cache_internal::no).discard_result();
-        } else {
-            co_await announce_mutations(_qp, _group0_client, query, {_superuser}, _as, ::service::raft_timeout{});
-        }
+        co_await _qp.execute_internal(
+                query,
+                db::consistency_level::QUORUM,
+                internal_distributed_query_state(),
+                {_superuser},
+                cql3::query_processor::cache_internal::no).discard_result();
        log.info("Created default superuser role '{}'.", _superuser);
    } catch(const exceptions::unavailable_exception& e) {
        log.warn("Skipped default role setup: some nodes were not ready; will retry");
@@ -205,6 +204,60 @@ future<> standard_role_manager::create_default_role_if_missing() {
    }
 }

+future<> standard_role_manager::maybe_create_default_role() {
+    auto has_superuser = [this] () -> future<bool> {
+        const sstring query = seastar::format("SELECT * FROM {}.{} WHERE is_superuser = true ALLOW FILTERING", get_auth_ks_name(_qp), meta::roles_table::name);
+        auto results = co_await _qp.execute_internal(query, db::consistency_level::LOCAL_ONE,
+                internal_distributed_query_state(), cql3::query_processor::cache_internal::yes);
+        for (const auto& result : *results) {
+            if (has_can_login(result)) {
+                co_return true;
+            }
+        }
+        co_return false;
+    };
+    if (co_await has_superuser()) {
+        co_return;
+    }
+    // Start the group0 operation only now, so that the common case (nothing
+    // to create) does not require a quorum.
+    ::service::group0_batch batch(
+            co_await _group0_client.start_operation(_as, get_raft_timeout()));
+    // Check again as the state may have changed before we took the guard (batch).
+    if (co_await has_superuser()) {
+        co_return;
+    }
+    // There is no superuser which has can_login field - create default role.
+    // Note that we don't check if can_login is set to true.
+    const sstring insert_query = seastar::format("INSERT INTO {}.{} ({}, is_superuser, can_login) VALUES (?, true, true)",
+            get_auth_ks_name(_qp),
+            meta::roles_table::name,
+            meta::roles_table::role_col_name);
+    co_await collect_mutations(_qp, batch, insert_query, {_superuser});
+    co_await std::move(batch).commit(_group0_client, _as, get_raft_timeout());
+    log.info("Created default superuser role '{}'.", _superuser);
+}
+
+future<> standard_role_manager::maybe_create_default_role_with_retries() {
+    size_t retries = _migration_manager.get_concurrent_ddl_retries();
+    while (true)  {
+        try {
+            co_return co_await maybe_create_default_role();
+        } catch (const ::service::group0_concurrent_modification& ex) {
+            log.warn("Failed to execute maybe_create_default_role due to guard conflict.{}.", retries ? " Retrying" : " Number of retries exceeded, giving up");
+            if (retries--) {
+                continue;
+            }
+            // Log error but don't crash the whole node startup sequence.
+            log.error("Failed to create default superuser role due to guard conflict.");
+            co_return;
+        } catch (const ::service::raft_operation_timeout_error& ex) {
+            log.error("Failed to create default superuser role due to exception: {}", ex.what());
+            co_return;
+        }
+    }
+}
+
 static const sstring legacy_table_name{"users"};

 bool standard_role_manager::legacy_metadata_exists() {
@@ -266,10 +319,13 @@ future<> standard_role_manager::start() {
                    co_await migrate_legacy_metadata();
                    co_return;
                }
+                co_await legacy_create_default_role_if_missing();
            }
-            co_await create_default_role_if_missing();
            if (!legacy) {
-                _superuser_created_promise.set_value();
+                co_await maybe_create_default_role_with_retries();
+                if (!_superuser_created_promise.available()) {
+                    _superuser_created_promise.set_value();
+                }
            }
        };

@@ -596,21 +652,30 @@ future<role_set> standard_role_manager::query_granted(std::string_view grantee_n
    });
 }

-future<role_to_directly_granted_map> standard_role_manager::query_all_directly_granted() {
+future<role_to_directly_granted_map> standard_role_manager::query_all_directly_granted(::service::query_state& qs) {
    const sstring query = seastar::format("SELECT * FROM {}.{}",
            get_auth_ks_name(_qp),
            meta::role_members_table::name);

+    const auto results = co_await _qp.execute_internal(
+            query,
+            db::consistency_level::ONE,
+            qs,
+            cql3::query_processor::cache_internal::yes);
+
    role_to_directly_granted_map roles_map;
-    co_await _qp.query_internal(query, [&roles_map] (const cql3::untyped_result_set_row& row) -> future<stop_iteration> {
-        roles_map.insert({row.get_as<sstring>("member"), row.get_as<sstring>("role")});
-        co_return stop_iteration::no;
-    });
+    std::transform(
+            results->begin(),
+            results->end(),
+            std::inserter(roles_map, roles_map.begin()),
+            [] (const cql3::untyped_result_set_row& row) {
+                return std::make_pair(row.get_as<sstring>("member"), row.get_as<sstring>("role")); }
+    );

    co_return roles_map;
 }

-future<role_set> standard_role_manager::query_all() {
+future<role_set> standard_role_manager::query_all(::service::query_state& qs) {
    const sstring query = seastar::format("SELECT {} FROM {}.{}",
            meta::roles_table::role_col_name,
            get_auth_ks_name(_qp),
@@ -619,10 +684,16 @@ future<role_set> standard_role_manager::query_all() {
    // To avoid many copies of a view.
    static const auto role_col_name_string = sstring(meta::roles_table::role_col_name);

+    if (utils::get_local_injector().enter("standard_role_manager_fail_legacy_query")) {
+        if (legacy_mode(_qp)) {
+            throw std::runtime_error("standard_role_manager::query_all: failed due to error injection");
+        }
+    }
+
    const auto results = co_await _qp.execute_internal(
            query,
            db::consistency_level::QUORUM,
-            internal_distributed_query_state(),
+            qs,
            cql3::query_processor::cache_internal::yes);

    role_set roles;
@@ -654,11 +725,11 @@ future<bool> standard_role_manager::can_login(std::string_view role_name) {
    });
 }

-future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name) {
+future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
    const sstring query = seastar::format("SELECT name, value FROM {}.{} WHERE role = ? AND name = ?",
            get_auth_ks_name(_qp),
            meta::role_attributes_table::name);
-    const auto result_set = co_await _qp.execute_internal(query, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes);
+    const auto result_set = co_await _qp.execute_internal(query, db::consistency_level::ONE, qs, {sstring(role_name), sstring(attribute_name)}, cql3::query_processor::cache_internal::yes);
    if (!result_set->empty()) {
        const cql3::untyped_result_set_row &row = result_set->one();
        co_return std::optional<sstring>(row.get_as<sstring>("value"));
@@ -666,11 +737,11 @@ future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_
    co_return std::optional<sstring>{};
 }

-future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all (std::string_view attribute_name) {
-    return query_all().then([this, attribute_name] (role_set roles) {
-        return do_with(attribute_vals{}, [this, attribute_name, roles = std::move(roles)] (attribute_vals &role_to_att_val) {
-            return parallel_for_each(roles.begin(), roles.end(), [this, &role_to_att_val, attribute_name] (sstring role) {
-                return get_attribute(role, attribute_name).then([&role_to_att_val, role] (std::optional<sstring> att_val) {
+future<role_manager::attribute_vals> standard_role_manager::query_attribute_for_all (std::string_view attribute_name, ::service::query_state& qs) {
+    return query_all(qs).then([this, attribute_name, &qs] (role_set roles) {
+        return do_with(attribute_vals{}, [this, attribute_name, roles = std::move(roles), &qs] (attribute_vals &role_to_att_val) {
+            return parallel_for_each(roles.begin(), roles.end(), [this, &role_to_att_val, attribute_name, &qs] (sstring role) {
+                return get_attribute(role, attribute_name, qs).then([&role_to_att_val, role] (std::optional<sstring> att_val) {
                    if (att_val) {
                        role_to_att_val.emplace(std::move(role), std::move(*att_val));
                    }
@@ -715,7 +786,7 @@ future<> standard_role_manager::remove_attribute(std::string_view role_name, std
 future<std::vector<cql3::description>> standard_role_manager::describe_role_grants() {
    std::vector<cql3::description> result{};

-    const auto grants = co_await query_all_directly_granted();
+    const auto grants = co_await query_all_directly_granted(internal_distributed_query_state());
    result.reserve(grants.size());

    for (const auto& [grantee_role, granted_role] : grants) {
--- a/auth/standard_role_manager.hh
+++ b/auth/standard_role_manager.hh
@@ -66,9 +66,9 @@ public:

    virtual future<role_set> query_granted(std::string_view grantee_name, recursive_role_query) override;

-    virtual future<role_to_directly_granted_map> query_all_directly_granted() override;
+    virtual future<role_to_directly_granted_map> query_all_directly_granted(::service::query_state&) override;

-    virtual future<role_set> query_all() override;
+    virtual future<role_set> query_all(::service::query_state&) override;

    virtual future<bool> exists(std::string_view role_name) override;

@@ -76,9 +76,9 @@ public:

    virtual future<bool> can_login(std::string_view role_name) override;

-    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name) override;
+    virtual future<std::optional<sstring>> get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state&) override;

-    virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name) override;
+    virtual future<role_manager::attribute_vals> query_attribute_for_all(std::string_view attribute_name, ::service::query_state&) override;

    virtual future<> set_attribute(std::string_view role_name, std::string_view attribute_name, std::string_view attribute_value, ::service::group0_batch& mc) override;

@@ -95,7 +95,10 @@ private:

    future<> migrate_legacy_metadata();

-    future<> create_default_role_if_missing();
+    future<> legacy_create_default_role_if_missing();
+
+    future<> maybe_create_default_role();
+    future<> maybe_create_default_role_with_retries();

    future<> create_or_replace(std::string_view role_name, const role_config&, ::service::group0_batch&);

--- a/cache_mutation_reader.hh
+++ b/cache_mutation_reader.hh
@@ -123,6 +123,9 @@ class cache_mutation_reader final : public mutation_reader::impl {
    gc_clock::time_point _read_time;
    gc_clock::time_point _gc_before;

+    api::timestamp_type _max_purgeable_timestamp = api::missing_timestamp;
+    api::timestamp_type _max_purgeable_timestamp_shadowable = api::missing_timestamp;
+
    future<> do_fill_buffer();
    future<> ensure_underlying();
    void copy_from_cache_to_buffer();
@@ -207,6 +210,11 @@ class cache_mutation_reader final : public mutation_reader::impl {
        return gc_clock::time_point::min();
    }

+    bool can_gc(tombstone t, is_shadowable is) const {
+        const auto max_purgeable = is ? _max_purgeable_timestamp_shadowable : _max_purgeable_timestamp;
+        return t.timestamp < max_purgeable;
+    }
+
 public:
    cache_mutation_reader(schema_ptr s,
                               dht::decorated_key dk,
@@ -228,8 +236,19 @@ public:
        , _read_time(get_read_time())
        , _gc_before(get_gc_before(*_schema, dk, _read_time))
    {
-        clogger.trace("csm {}: table={}.{}, reversed={}, snap={}", fmt::ptr(this), _schema->ks_name(), _schema->cf_name(), _read_context.is_reversed(),
-                      fmt::ptr(&*_snp));
+        _max_purgeable_timestamp = ctx.get_max_purgeable(dk, is_shadowable::no);
+        _max_purgeable_timestamp_shadowable = ctx.get_max_purgeable(dk, is_shadowable::yes);
+
+        clogger.trace("csm {}: table={}.{}, dk={}, gc-before={}, max-purgeable-regular={}, max-purgeable-shadowable={}, reversed={}, snap={}",
+                fmt::ptr(this),
+                _schema->ks_name(),
+                _schema->cf_name(),
+                dk,
+                _gc_before,
+                _max_purgeable_timestamp,
+                _max_purgeable_timestamp_shadowable,
+                _read_context.is_reversed(),
+                fmt::ptr(&*_snp));
        push_mutation_fragment(*_schema, _permit, partition_start(std::move(dk), _snp->partition_tombstone()));
    }
    cache_mutation_reader(schema_ptr s,
@@ -787,12 +806,12 @@ void cache_mutation_reader::copy_from_cache_to_buffer() {
            t.apply(range_tomb);

            auto row_tomb_expired = [&](row_tombstone tomb) {
-                return (tomb && tomb.max_deletion_time() < _gc_before);
+                return (tomb && tomb.max_deletion_time() < _gc_before && can_gc(tomb.tomb(), tomb.is_shadowable()));
            };

            auto is_row_dead = [&](const deletable_row& row) {
                auto& m = row.marker();
-                return (!m.is_missing() && m.is_dead(_read_time) && m.deletion_time() < _gc_before);
+                return (!m.is_missing() && m.is_dead(_read_time) && m.deletion_time() < _gc_before && can_gc(tombstone(m.timestamp(), m.deletion_time()), is_shadowable::no));
            };

            if (row_tomb_expired(t) || is_row_dead(row)) {
@@ -800,9 +819,11 @@ void cache_mutation_reader::copy_from_cache_to_buffer() {

                _read_context.cache()._tracker.on_row_compacted();

+                auto mutation_can_gc = can_gc_fn([this] (tombstone t, is_shadowable is) { return can_gc(t, is); });
+
                with_allocator(_snp->region().allocator(), [&] {
                    deletable_row row_copy(row_schema, row);
-                    row_copy.compact_and_expire(row_schema, t.tomb(), _read_time, always_gc, _gc_before, nullptr);
+                    row_copy.compact_and_expire(row_schema, t.tomb(), _read_time, mutation_can_gc, _gc_before, nullptr);
                    std::swap(row, row_copy);
                });
                remove_row = row.empty();
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -365,6 +365,9 @@ cdc::topology_description make_new_generation_description(
        const noncopyable_function<std::pair<size_t, uint8_t>(dht::token)>& get_sharding_info,
        const locator::token_metadata_ptr tmptr) {
    const auto tokens = get_tokens(bootstrap_tokens, tmptr);
+    if (tokens.empty()) {
+        on_internal_error(cdc_log, "Attempted to create a CDC generation from an empty list of tokens");
+    }

    utils::chunked_vector<token_range_description> vnode_descriptions;
    vnode_descriptions.reserve(tokens.size());
@@ -1112,7 +1115,9 @@ future<bool> generation_service::legacy_do_handle_cdc_generation(cdc::generation
    auto sys_dist_ks = get_sys_dist_ks();
    auto gen = co_await retrieve_generation_data(gen_id, _sys_ks.local(), *sys_dist_ks, { _token_metadata.get()->count_normal_token_owners() });
    if (!gen) {
-        throw std::runtime_error(fmt::format(
+        // This may happen during raft upgrade when a node gossips about a generation that
+        // was propagated through raft and we didn't apply it yet.
+        throw generation_handling_nonfatal_exception(fmt::format(
            "Could not find CDC generation {} in distributed system tables (current time: {}),"
            " even though some node gossiped about it.",
            gen_id, db_clock::now()));
--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -56,8 +56,17 @@ using namespace std::chrono_literals;

 logging::logger cdc_log("cdc");

+namespace {
+
+// When dropping a column from a CDC log table, we set the drop timestamp
+// `column_drop_leeway` seconds into the future to ensure that for writes concurrent
+// with column drop, the write timestamp is before the column drop timestamp.
+constexpr auto column_drop_leeway = std::chrono::seconds(5);
+
+} // anonymous namespace
+
 namespace cdc {
-static schema_ptr create_log_schema(const schema&, std::optional<table_id> = {}, schema_ptr = nullptr);
+static schema_ptr create_log_schema(const schema&, api::timestamp_type, std::optional<table_id> = {}, schema_ptr = nullptr);
 }

 static constexpr auto cdc_group_name = "cdc";
@@ -167,7 +176,7 @@ public:
            ensure_that_table_uses_vnodes(ksm, schema);

            // in seastar thread
-            auto log_schema = create_log_schema(schema);
+            auto log_schema = create_log_schema(schema, timestamp);

            auto log_mut = db::schema_tables::make_create_table_mutations(log_schema, timestamp);

@@ -205,7 +214,7 @@ public:
            ensure_that_table_has_no_counter_columns(new_schema);
            ensure_that_table_uses_vnodes(*keyspace.metadata(), new_schema);

-            auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt, log_schema);
+            auto new_log_schema = create_log_schema(new_schema, timestamp, log_schema ? std::make_optional(log_schema->id()) : std::nullopt, log_schema);

            auto log_mut = log_schema 
                ? db::schema_tables::make_update_table_mutations(db, keyspace.metadata(), log_schema, new_log_schema, timestamp)
@@ -496,7 +505,7 @@ bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name) {
    return to_bytes(cdc_deleted_elements_column_prefix) + column_name;
 }

-static schema_ptr create_log_schema(const schema& s, std::optional<table_id> uuid, schema_ptr old) {
+static schema_ptr create_log_schema(const schema& s, api::timestamp_type timestamp, std::optional<table_id> uuid, schema_ptr old) {
    schema_builder b(s.ks_name(), log_name(s.cf_name()));
    b.with_partitioner(cdc::cdc_partitioner::classname);
    b.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
@@ -531,6 +540,28 @@ static schema_ptr create_log_schema(const schema& s, std::optional<table_id> uui
    b.with_column(log_meta_column_name_bytes("ttl"), long_type);
    b.with_column(log_meta_column_name_bytes("end_of_batch"), boolean_type);
    b.set_caching_options(caching_options::get_disabled_caching_options());
+
+    auto validate_new_column = [&] (const sstring& name) {
+        // When dropping a column from a CDC log table, we set the drop timestamp to be
+        // `column_drop_leeway` into the future (see the `without_column` call below).
+        // Therefore, when recreating a column with the same name, we need to validate
+        // that it's not recreated too soon and that the drop timestamp has passed.
+        if (old && old->dropped_columns().contains(name)) {
+            const auto& drop_info = old->dropped_columns().at(name);
+            auto create_time = api::timestamp_clock::time_point(api::timestamp_clock::duration(timestamp));
+            auto drop_time = api::timestamp_clock::time_point(api::timestamp_clock::duration(drop_info.timestamp));
+            if (drop_time > create_time) {
+                throw exceptions::invalid_request_exception(format("Cannot add column {} because a column with the same name was dropped too recently. Please retry after {} seconds",
+                        name, std::chrono::duration_cast<std::chrono::seconds>(drop_time - create_time).count() + 1));
+            }
+        }
+    };
+
+    auto add_column = [&] (sstring name, data_type type) {
+        validate_new_column(name);
+        b.with_column(to_bytes(name), type);
+    };
+
    auto add_columns = [&] (const schema::const_iterator_range_type& columns, bool is_data_col = false) {
        for (const auto& column : columns) {
            auto type = column.type;
@@ -552,9 +583,9 @@ static schema_ptr create_log_schema(const schema& s, std::optional<table_id> uui
                    }
                ));
            }
-            b.with_column(log_data_column_name_bytes(column.name()), type);
+            add_column(log_data_column_name(column.name_as_text()), type);
            if (is_data_col) {
-                b.with_column(log_data_column_deleted_name_bytes(column.name()), boolean_type);
+                add_column(log_data_column_deleted_name(column.name_as_text()), boolean_type);
            }
            if (column.type->is_multi_cell()) {
                auto dtype = visit(*type, make_visitor(
@@ -570,7 +601,7 @@ static schema_ptr create_log_schema(const schema& s, std::optional<table_id> uui
                        throw std::invalid_argument("Should not reach");
                    }
                ));
-                b.with_column(log_data_column_deleted_elements_name_bytes(column.name()), dtype);
+                add_column(log_data_column_deleted_elements_name(column.name_as_text()), dtype);
            }
        }
    };
@@ -592,7 +623,8 @@ static schema_ptr create_log_schema(const schema& s, std::optional<table_id> uui
        // not super efficient, but we don't do this often.
        for (auto& col : old->all_columns()) {
            if (!b.has_column({col.name(), col.name_as_text() })) {
-                b.without_column(col.name_as_text(), col.type, api::new_timestamp());
+                auto drop_ts = api::timestamp_clock::now() + column_drop_leeway;
+                b.without_column(col.name_as_text(), col.type, drop_ts.time_since_epoch().count());
            }
        }
    }
@@ -960,8 +992,12 @@ public:
    // Given a reference to such a column from the base schema, this function sets the corresponding column
    // in the log to the given value for the given row.
    void set_value(const clustering_key& log_ck, const column_definition& base_cdef, const managed_bytes_view& value) {
-        auto& log_cdef = *_log_schema.get_column_definition(log_data_column_name_bytes(base_cdef.name()));
-        _log_mut.set_cell(log_ck, log_cdef, atomic_cell::make_live(*base_cdef.type, _ts, value, _ttl));
+        auto log_cdef_ptr = _log_schema.get_column_definition(log_data_column_name_bytes(base_cdef.name()));
+        if (!log_cdef_ptr) {
+            throw exceptions::invalid_request_exception(format("CDC log schema for {}.{} does not have base column {}",
+                _log_schema.ks_name(), _log_schema.cf_name(), base_cdef.name_as_text()));
+        }
+        _log_mut.set_cell(log_ck, *log_cdef_ptr, atomic_cell::make_live(*base_cdef.type, _ts, value, _ttl));
    }

    // Each regular and static column in the base schema has a corresponding column in the log schema
@@ -969,7 +1005,13 @@ public:
    // Given a reference to such a column from the base schema, this function sets the corresponding column
    // in the log to `true` for the given row. If not called, the column will be `null`.
    void set_deleted(const clustering_key& log_ck, const column_definition& base_cdef) {
-        _log_mut.set_cell(log_ck, log_data_column_deleted_name_bytes(base_cdef.name()), data_value(true), _ts, _ttl);
+        auto log_cdef_ptr = _log_schema.get_column_definition(log_data_column_deleted_name_bytes(base_cdef.name()));
+        if (!log_cdef_ptr) {
+            throw exceptions::invalid_request_exception(format("CDC log schema for {}.{} does not have base column {}",
+                _log_schema.ks_name(), _log_schema.cf_name(), base_cdef.name_as_text()));
+        }
+        auto& log_cdef = *log_cdef_ptr;
+        _log_mut.set_cell(log_ck, *log_cdef_ptr, atomic_cell::make_live(*log_cdef.type, _ts, log_cdef.type->decompose(true), _ttl));
    }

    // Each regular and static non-atomic column in the base schema has a corresponding column in the log schema
@@ -978,7 +1020,12 @@ public:
    // Given a reference to such a column from the base schema, this function sets the corresponding column
    // in the log to the given set of keys for the given row.
    void set_deleted_elements(const clustering_key& log_ck, const column_definition& base_cdef, const managed_bytes& deleted_elements) {
-        auto& log_cdef = *_log_schema.get_column_definition(log_data_column_deleted_elements_name_bytes(base_cdef.name()));
+        auto log_cdef_ptr = _log_schema.get_column_definition(log_data_column_deleted_elements_name_bytes(base_cdef.name()));
+        if (!log_cdef_ptr) {
+            throw exceptions::invalid_request_exception(format("CDC log schema for {}.{} does not have base column {}",
+                _log_schema.ks_name(), _log_schema.cf_name(), base_cdef.name_as_text()));
+        }
+        auto& log_cdef = *log_cdef_ptr;
        _log_mut.set_cell(log_ck, log_cdef, atomic_cell::make_live(*log_cdef.type, _ts, deleted_elements, _ttl));
    }

@@ -1865,5 +1912,10 @@ bool cdc::cdc_service::needs_cdc_augmentation(const std::vector<mutation>& mutat

 future<std::tuple<std::vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>
 cdc::cdc_service::augment_mutation_call(lowres_clock::time_point timeout, std::vector<mutation>&& mutations, tracing::trace_state_ptr tr_state, db::consistency_level write_cl) {
+    if (utils::get_local_injector().enter("sleep_before_cdc_augmentation")) {
+        return seastar::sleep(std::chrono::milliseconds(100)).then([this, timeout, mutations = std::move(mutations), tr_state = std::move(tr_state), write_cl] () mutable {
+            return _impl->augment_mutation_call(timeout, std::move(mutations), std::move(tr_state), write_cl);
+        });
+    }
    return _impl->augment_mutation_call(timeout, std::move(mutations), std::move(tr_state), write_cl);
 }
--- a/cdc/metadata.cc
+++ b/cdc/metadata.cc
@@ -186,7 +186,7 @@ bool cdc::metadata::prepare(db_clock::time_point tp) {
    }

    auto ts = to_ts(tp);
-    auto emplaced = _gens.emplace(to_ts(tp), std::nullopt).second;
+    auto [it, emplaced] = _gens.emplace(to_ts(tp), std::nullopt);

    if (_last_stream_timestamp != api::missing_timestamp) {
        auto last_correct_gen = gen_used_at(_last_stream_timestamp);
@@ -201,5 +201,5 @@ bool cdc::metadata::prepare(db_clock::time_point tp) {
        }
    }

-    return emplaced;
+    return !it->second;
 }
--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -769,7 +769,7 @@ private:
    }

    virtual sstables::sstable_set make_sstable_set_for_input() const {
-        return _table_s.get_compaction_strategy().make_sstable_set(_schema);
+        return _table_s.get_compaction_strategy().make_sstable_set(_table_s);
    }

    const tombstone_gc_state& get_tombstone_gc_state() const {
@@ -1301,7 +1301,7 @@ public:
    }

    virtual sstables::sstable_set make_sstable_set_for_input() const override {
-        return sstables::make_partitioned_sstable_set(_schema, false);
+        return sstables::make_partitioned_sstable_set(_schema, _table_s.token_range());
    }

    // Unconditionally enable incremental compaction if the strategy specifies a max output size, e.g. LCS.
@@ -1910,7 +1910,11 @@ static future<compaction_result> scrub_sstables_validate_mode(sstables::compacti
    using scrub = sstables::compaction_type_options::scrub;
    if (validation_errors != 0 && descriptor.options.as<scrub>().quarantine_sstables == scrub::quarantine_invalid_sstables::yes) {
        for (auto& sst : descriptor.sstables) {
-            co_await sst->change_state(sstables::sstable_state::quarantine);
+            try {
+                co_await sst->change_state(sstables::sstable_state::quarantine);
+            } catch (...) {
+                clogger.error("Moving {} to quarantine failed due to {}, continuing.", sst->get_filename(), std::current_exception());
+            }
        }
    }

--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -15,6 +15,7 @@
 #include "sstables/sstables_manager.hh"
 #include <memory>
 #include <fmt/ranges.h>
+#include <seastar/core/future.hh>
 #include <seastar/core/metrics.hh>
 #include <seastar/core/coroutine.hh>
 #include <seastar/coroutine/switch_to.hh>
@@ -393,7 +394,7 @@ future<sstables::sstable_set> compaction_task_executor::sstable_set_for_tombston
    auto compound_set = t.sstable_set_for_tombstone_gc();
    // Compound set will be linearized into a single set, since compaction might add or remove sstables
    // to it for incremental compaction to work.
-    auto new_set = sstables::make_partitioned_sstable_set(t.schema(), false);
+    auto new_set = sstables::make_partitioned_sstable_set(t.schema(), t.token_range());
    co_await compound_set->for_each_sstable_gently([&] (const sstables::shared_sstable& sst) {
        auto inserted = new_set.insert(sst);
        if (!inserted) {
@@ -503,7 +504,7 @@ public:

    virtual ~sstables_task_executor() = default;

-    virtual void release_resources() noexcept override;
+    virtual future<> release_resources() noexcept override;

    virtual future<tasks::task_manager::task::progress> get_progress() const override {
        return compaction_task_impl::get_progress(_compaction_data, _progress_monitor);
@@ -788,9 +789,10 @@ compaction::compaction_state::~compaction_state() {
    compaction_done.broken();
 }

-void sstables_task_executor::release_resources() noexcept {
+future<> sstables_task_executor::release_resources() noexcept {
    _cm._stats.pending_tasks -= _sstables.size() - (_state == state::pending);
    _sstables = {};
+    return make_ready_future();
 }

 future<compaction_manager::compaction_stats_opt> compaction_task_executor::run_compaction() noexcept {
@@ -1125,16 +1127,16 @@ future<> compaction_manager::drain() {
        // Disable the state so that it can be enabled later if requested.
        _state = state::disabled;
    }
+    _compaction_submission_timer.cancel();
    // Stop ongoing compactions, if the request has not been sent already and wait for them to stop.
    co_await stop_ongoing_compactions("drain");
+    // Trigger a signal to properly exit from postponed_compactions_reevaluation() fiber
+    reevaluate_postponed_compactions();
    cmlog.info("Drained");
 }

 future<> compaction_manager::stop() {
    do_stop();
-    if (auto cm = std::exchange(_task_manager_module, nullptr)) {
-        co_await cm->stop();
-    }
    if (_stop_future) {
        co_await std::exchange(*_stop_future, make_ready_future());
    }
@@ -1145,14 +1147,15 @@ future<> compaction_manager::really_do_stop() noexcept {
    // Reset the metrics registry
    _metrics.clear();
    co_await stop_ongoing_compactions("shutdown");
-    if (!_tasks.empty()) {
-        on_fatal_internal_error(cmlog, format("{} tasks still exist after being stopped", _tasks.size()));
-    }
+    co_await _task_manager_module->stop();
    co_await coroutine::parallel_for_each(_compaction_state | std::views::values, [] (compaction_state& cs) -> future<> {
        if (!cs.gate.is_closed()) {
            co_await cs.gate.close();
        }
    });
+    if (!_tasks.empty()) {
+        on_fatal_internal_error(cmlog, format("{} tasks still exist after being stopped", _tasks.size()));
+    }
    reevaluate_postponed_compactions();
    co_await std::move(_waiting_reevalution);
    _weight_tracker.clear();
@@ -1565,10 +1568,10 @@ public:
        , _can_purge(can_purge)
    {}

-    virtual void release_resources() noexcept override {
+    virtual future<> release_resources() noexcept override {
        _compacting.release_all();
        _owned_ranges_ptr = nullptr;
-        sstables_task_executor::release_resources();
+        co_await sstables_task_executor::release_resources();
    }

 protected:
@@ -1815,8 +1818,21 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_sst
    if (!gh) {
        co_return compaction_stats_opt{};
    }
-    // All sstables must be included, even the ones being compacted, such that everything in table is validated.
-    auto all_sstables = get_all_sstables(t);
+
+    // Collect and register all sstables as compacting while compaction is disabled, to avoid a race condition where
+    // regular compaction runs in between and picks the same files.
+    std::vector<sstables::shared_sstable> all_sstables;
+    compacting_sstable_registration compacting(*this, get_compaction_state(&t));
+    co_await run_with_compaction_disabled(t, [&all_sstables, &compacting, &t] () -> future<> {
+        // All sstables must be included.
+        all_sstables = get_all_sstables(t);
+        compacting.register_compacting(all_sstables);
+        return make_ready_future<>();
+    });
+    if (all_sstables.empty()) {
+        co_return compaction_stats_opt{};
+    }
+
    co_return co_await perform_compaction<validate_sstables_compaction_task_executor>(throw_if_stopping::no, info, &t, info.id, std::move(all_sstables), quarantine_sstables);
 }

@@ -1846,11 +1862,12 @@ public:

    virtual ~cleanup_sstables_compaction_task_executor() = default;

-    virtual void release_resources() noexcept override {
+    virtual future<> release_resources() noexcept override {
        _cm._stats.pending_tasks -= _pending_cleanup_jobs.size();
        _pending_cleanup_jobs = {};
        _compacting.release_all();
        _owned_ranges_ptr = nullptr;
+        return make_ready_future();
    }

    virtual future<tasks::task_manager::task::progress> get_progress() const override {
--- a/compaction/compaction_manager.hh
+++ b/compaction/compaction_manager.hh
@@ -300,6 +300,11 @@ public:
    // unless it is moved back to enabled state.
    future<> drain();

+    // Check if compaction manager is running, i.e. it was enabled or drained
+    bool is_running() const noexcept {
+        return _state == state::enabled || _state == state::disabled;
+    }
+
    using compaction_history_consumer = noncopyable_function<future<>(const db::compaction_history_entry&)>;
    future<> get_compaction_history(compaction_history_consumer&& f);

--- a/compaction/compaction_strategy.cc
+++ b/compaction/compaction_strategy.cc
@@ -789,8 +789,8 @@ future<reshape_config> make_reshape_config(const sstables::storage& storage, res
    };
 }

-std::unique_ptr<sstable_set_impl> incremental_compaction_strategy::make_sstable_set(schema_ptr schema) const {
-    return std::make_unique<partitioned_sstable_set>(std::move(schema), false);
+std::unique_ptr<sstable_set_impl> incremental_compaction_strategy::make_sstable_set(const table_state& ts) const {
+    return std::make_unique<partitioned_sstable_set>(ts.schema(), ts.token_range());
 }

 }
--- a/compaction/compaction_strategy.hh
+++ b/compaction/compaction_strategy.hh
@@ -105,7 +105,7 @@ public:
        return name(type());
    }

-    sstable_set make_sstable_set(schema_ptr schema) const;
+    sstable_set make_sstable_set(const table_state& ts) const;

    compaction_backlog_tracker make_backlog_tracker() const;

--- a/compaction/compaction_strategy_impl.hh
+++ b/compaction/compaction_strategy_impl.hh
@@ -56,7 +56,7 @@ public:
        return true;
    }
    virtual int64_t estimated_pending_compactions(table_state& table_s) const = 0;
-    virtual std::unique_ptr<sstable_set_impl> make_sstable_set(schema_ptr schema) const;
+    virtual std::unique_ptr<sstable_set_impl> make_sstable_set(const table_state& ts) const;

    bool use_clustering_key_filter() const {
        return _use_clustering_key_filter;
--- a/compaction/incremental_compaction_strategy.hh
+++ b/compaction/incremental_compaction_strategy.hh
@@ -99,7 +99,7 @@ public:

    virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, reshape_config cfg) const override;

-    virtual std::unique_ptr<sstable_set_impl> make_sstable_set(schema_ptr schema) const override;
+    virtual std::unique_ptr<sstable_set_impl> make_sstable_set(const table_state& ts) const override;

    friend class ::incremental_backlog_tracker;
 };
--- a/compaction/leveled_compaction_strategy.hh
+++ b/compaction/leveled_compaction_strategy.hh
@@ -70,7 +70,7 @@ public:
    virtual compaction_strategy_type type() const override {
        return compaction_strategy_type::leveled;
    }
-    virtual std::unique_ptr<sstable_set_impl> make_sstable_set(schema_ptr schema) const override;
+    virtual std::unique_ptr<sstable_set_impl> make_sstable_set(const table_state& ts) const override;

    virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() const override;

--- a/compaction/table_state.hh
+++ b/compaction/table_state.hh
@@ -33,6 +33,7 @@ namespace compaction {
 class table_state {
 public:
    virtual ~table_state() {}
+    virtual dht::token_range token_range() const noexcept = 0;
    virtual const schema_ptr& schema() const noexcept = 0;
    // min threshold as defined by table.
    virtual unsigned min_compaction_threshold() const noexcept = 0;
--- a/compaction/time_window_compaction_strategy.hh
+++ b/compaction/time_window_compaction_strategy.hh
@@ -150,7 +150,7 @@ public:
        return compaction_strategy_type::time_window;
    }

-    virtual std::unique_ptr<sstable_set_impl> make_sstable_set(schema_ptr schema) const override;
+    virtual std::unique_ptr<sstable_set_impl> make_sstable_set(const table_state& ts) const override;

    virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() const override;

--- a/compound.hh
+++ b/compound.hh
@@ -255,6 +255,9 @@ public:
    // Returns true iff given prefix has no missing components
    bool is_full(managed_bytes_view v) const {
        SCYLLA_ASSERT(AllowPrefixes == allow_prefixes::yes);
+        if (_types.size() == 0) {
+            return v.empty();
+        }
        return std::distance(begin(v), end(v)) == (ssize_t)_types.size();
    }
    bool is_empty(managed_bytes_view v) const {
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -677,7 +677,9 @@ maintenance_socket: ignore
 # Guardrail to enable the deprecated feature of CREATE TABLE WITH COMPACT STORAGE.
 # enable_create_table_with_compact_storage: false

-# Enable tablets for new keyspaces.
+# Control tablets for new keyspaces.
+# Can be set to: disabled|enabled|enforced
+#
 # When enabled, newly created keyspaces will have tablets enabled by default.
 # That can be explicitly disabled in the CREATE KEYSPACE query
 # by using the `tablets = {'enabled': false}` replication option.
@@ -686,6 +688,15 @@ maintenance_socket: ignore
 # unless tablets are explicitly enabled in the CREATE KEYSPACE query
 # by using the `tablets = {'enabled': true}` replication option.
 #
+# When set to `enforced`, newly created keyspaces will always have tablets enabled.
+# This prevents explicitly disabling tablets in the CREATE KEYSPACE query
+# using the `tablets = {'enabled': false}` replication option.
+# It also mandates a replication strategy supporting tablets, like
+# NetworkTopologyStrategy.
+#
 # Note that creating keyspaces with tablets enabled or disabled is irreversible.
 # The `tablets` option cannot be changed using `ALTER KEYSPACE`.
-enable_tablets: true
+tablets_mode_for_new_keyspaces: enabled
+
+# Enforce RF-rack-valid keyspaces.
+rf_rack_valid_keyspaces: false
--- a/configure.py
+++ b/configure.py
@@ -813,6 +813,7 @@ scylla_core = (['message/messaging_service.cc',
                'utils/rjson.cc',
                'utils/human_readable.cc',
                'utils/histogram_metrics_helper.cc',
+                'utils/io-wrappers.cc',
                'utils/on_internal_error.cc',
                'utils/pretty_printers.cc',
                'utils/stream_compressor.cc',
@@ -1001,6 +1002,7 @@ scylla_core = (['message/messaging_service.cc',
                'db/extensions.cc',
                'db/heat_load_balance.cc',
                'db/large_data_handler.cc',
+                'db/corrupt_data_handler.cc',
                'db/marshal/type_parser.cc',
                'db/batchlog_manager.cc',
                'db/tags/utils.cc',
@@ -1027,6 +1029,7 @@ scylla_core = (['message/messaging_service.cc',
                'utils/multiprecision_int.cc',
                'utils/gz/crc_combine.cc',
                'utils/gz/crc_combine_table.cc',
+                'utils/http.cc',
                'utils/s3/aws_error.cc',
                'utils/s3/client.cc',
                'utils/s3/retry_strategy.cc',
@@ -1099,7 +1102,7 @@ scylla_core = (['message/messaging_service.cc',
                'utils/lister.cc',
                'repair/repair.cc',
                'repair/row_level.cc',
-                'repair/table_check.cc',
+                'streaming/table_check.cc',
                'exceptions/exceptions.cc',
                'auth/allow_all_authenticator.cc',
                'auth/allow_all_authorizer.cc',
@@ -1321,6 +1324,7 @@ idls = ['idl/gossip_digest.idl.hh',
        'idl/replica_exception.idl.hh',
        'idl/per_partition_rate_limit_info.idl.hh',
        'idl/position_in_partition.idl.hh',
+        'idl/full_position.idl.hh',
        'idl/experimental/broadcast_tables_lang.idl.hh',
        'idl/storage_service.idl.hh',
        'idl/join_node.idl.hh',
@@ -1338,6 +1342,7 @@ scylla_tests_generic_dependencies = [
    'test/lib/test_utils.cc',
    'test/lib/tmpdir.cc',
    'test/lib/sstable_run_based_compaction_strategy_for_tests.cc',
+    'test/lib/eventually.cc',
 ]

 scylla_tests_dependencies = scylla_core + alternator + idls + scylla_tests_generic_dependencies + [
@@ -1379,6 +1384,7 @@ scylla_perfs = ['test/perf/perf_alternator.cc',
                'test/lib/key_utils.cc',
                'test/lib/random_schema.cc',
                'test/lib/data_model.cc',
+                'test/lib/eventually.cc',
                'seastar/tests/perf/linux_perf_event.cc']

 deps = {
@@ -1564,7 +1570,7 @@ deps['test/boost/linearizing_input_stream_test'] = [
    "test/boost/linearizing_input_stream_test.cc",
    "test/lib/log.cc",
 ]
-deps['test/boost/expr_test'] = ['test/boost/expr_test.cc', 'test/lib/expr_test_utils.cc'] + scylla_core
+deps['test/boost/expr_test'] = ['test/boost/expr_test.cc', 'test/lib/expr_test_utils.cc'] + scylla_core + alternator
 deps['test/boost/rate_limiter_test'] = ['test/boost/rate_limiter_test.cc', 'db/rate_limiter.cc']
 deps['test/boost/exceptions_optimized_test'] = ['test/boost/exceptions_optimized_test.cc', 'utils/exceptions.cc']
 deps['test/boost/exceptions_fallback_test'] = ['test/boost/exceptions_fallback_test.cc', 'utils/exceptions.cc']
@@ -1573,16 +1579,16 @@ deps['test/boost/duration_test'] += ['test/lib/exception_utils.cc']
 deps['test/boost/schema_loader_test'] += ['tools/schema_loader.cc', 'tools/read_mutation.cc']
 deps['test/boost/rust_test'] += ['rust/inc/src/lib.rs']

-deps['test/raft/replication_test'] = ['test/raft/replication_test.cc', 'test/raft/replication.cc', 'test/raft/helpers.cc'] + scylla_raft_dependencies
-deps['test/raft/raft_server_test'] = ['test/raft/raft_server_test.cc', 'test/raft/replication.cc', 'test/raft/helpers.cc'] + scylla_raft_dependencies
+deps['test/raft/replication_test'] = ['test/raft/replication_test.cc', 'test/raft/replication.cc', 'test/raft/helpers.cc', 'test/lib/eventually.cc'] + scylla_raft_dependencies
+deps['test/raft/raft_server_test'] = ['test/raft/raft_server_test.cc', 'test/raft/replication.cc', 'test/raft/helpers.cc', 'test/lib/eventually.cc'] + scylla_raft_dependencies
 deps['test/raft/randomized_nemesis_test'] = ['test/raft/randomized_nemesis_test.cc', 'direct_failure_detector/failure_detector.cc', 'test/raft/helpers.cc'] + scylla_raft_dependencies
 deps['test/raft/failure_detector_test'] = ['test/raft/failure_detector_test.cc', 'direct_failure_detector/failure_detector.cc', 'test/raft/helpers.cc'] + scylla_raft_dependencies
-deps['test/raft/many_test'] = ['test/raft/many_test.cc', 'test/raft/replication.cc', 'test/raft/helpers.cc'] + scylla_raft_dependencies
+deps['test/raft/many_test'] = ['test/raft/many_test.cc', 'test/raft/replication.cc', 'test/raft/helpers.cc', 'test/lib/eventually.cc'] + scylla_raft_dependencies
 deps['test/raft/fsm_test'] =  ['test/raft/fsm_test.cc', 'test/raft/helpers.cc', 'test/lib/log.cc'] + scylla_raft_dependencies
 deps['test/raft/etcd_test'] =  ['test/raft/etcd_test.cc', 'test/raft/helpers.cc', 'test/lib/log.cc'] + scylla_raft_dependencies
 deps['test/raft/raft_sys_table_storage_test'] = ['test/raft/raft_sys_table_storage_test.cc'] + \
-    scylla_core + scylla_tests_generic_dependencies
-deps['test/boost/address_map_test'] = ['test/boost/address_map_test.cc'] + scylla_core
+    scylla_core + alternator + scylla_tests_generic_dependencies
+deps['test/boost/address_map_test'] = ['test/boost/address_map_test.cc'] + scylla_core + alternator
 deps['test/raft/discovery_test'] =  ['test/raft/discovery_test.cc',
                                     'test/raft/helpers.cc',
                                     'test/lib/log.cc',
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -709,17 +709,23 @@ batchStatement returns [std::unique_ptr<cql3::statements::raw::batch_statement>
    : K_BEGIN
      ( K_UNLOGGED { type = btype::UNLOGGED; } | K_COUNTER { type = btype::COUNTER; } )?
      K_BATCH ( usingClause[attrs] )?
-          ( s=batchStatementObjective ';'? { statements.push_back(std::move(s)); } )*
+          ( s=batchStatementObjective ';'?
+              {
+                  auto&& stmt = *$s.statement;
+                  stmt->add_raw(sstring{$s.text});
+                  statements.push_back(std::move(stmt));
+              } )*
      K_APPLY K_BATCH
      {
          $expr = std::make_unique<cql3::statements::raw::batch_statement>(type, std::move(attrs), std::move(statements));
      }
    ;

-batchStatementObjective returns [std::unique_ptr<cql3::statements::raw::modification_statement> statement]
-    : i=insertStatement  { $statement = std::move(i); }
-    | u=updateStatement  { $statement = std::move(u); }
-    | d=deleteStatement  { $statement = std::move(d); }
+batchStatementObjective returns [::lw_shared_ptr<std::unique_ptr<cql3::statements::raw::modification_statement>> statement]
+    @init { using original_ret_type = std::unique_ptr<cql3::statements::raw::modification_statement>; }
+    : i=insertStatement  { $statement = make_lw_shared<original_ret_type>(std::move(i)); }
+    | u=updateStatement  { $statement = make_lw_shared<original_ret_type>(std::move(u)); }
+    | d=deleteStatement  { $statement = make_lw_shared<original_ret_type>(std::move(d)); }
    ;

 dropAggregateStatement returns [std::unique_ptr<cql3::statements::drop_aggregate_statement> expr]
--- a/cql3/statements/alter_keyspace_statement.cc
+++ b/cql3/statements/alter_keyspace_statement.cc
@@ -13,6 +13,7 @@
 #include <seastar/core/on_internal_error.hh>
 #include <stdexcept>
 #include "alter_keyspace_statement.hh"
+#include "locator/tablets.hh"
 #include "prepared_statement.hh"
 #include "service/migration_manager.hh"
 #include "service/storage_proxy.hh"
@@ -25,6 +26,9 @@
 #include "create_keyspace_statement.hh"
 #include "gms/feature_service.hh"
 #include "replica/database.hh"
+#include "db/config.hh"
+
+using namespace std::string_literals;

 static logging::logger mylogger("alter_keyspace");

@@ -193,9 +197,9 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
        event::schema_change::target_type target_type = event::schema_change::target_type::KEYSPACE;
        auto ks = qp.db().find_keyspace(_name);
        auto ks_md = ks.metadata();
-        const auto& tm = *qp.proxy().get_token_metadata_ptr();
+        const auto tmptr = qp.proxy().get_token_metadata_ptr();
        const auto& feat = qp.proxy().features();
-        auto ks_md_update = _attrs->as_ks_metadata_update(ks_md, tm, feat);
+        auto ks_md_update = _attrs->as_ks_metadata_update(ks_md, *tmptr, feat);
        std::vector<mutation> muts;
        std::vector<sstring> warnings;
        bool include_tablet_options = _attrs->get_map(_attrs->KW_TABLETS).has_value();
@@ -206,6 +210,25 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
        auto ts = mc.write_timestamp();
        auto global_request_id = mc.new_group0_state_id();

+        // #22688 - filter out any dc*:0 entries - consider these
+        // null and void (removed). Migration planning will treat it
+        // as dc*=0 still.
+        std::erase_if(ks_options, [](const auto& i) {
+            static constexpr std::string replication_prefix = ks_prop_defs::KW_REPLICATION + ":"s;
+            // The map is flattened: replication entries start with "replication:".
+            // The only valid options are replication_factor, class and per-DC RFs. We want to
+            // filter out any dcN=0 entries.
+            auto& [key, val] = i;
+            if (key.starts_with(replication_prefix) && val == "0") {
+                std::string_view v(key);
+                v.remove_prefix(replication_prefix.size());
+                return v != ks_prop_defs::REPLICATION_FACTOR_KEY 
+                    && v != ks_prop_defs::REPLICATION_STRATEGY_CLASS_KEY
+                    ;
+            }
+            return false;
+        });
+
        // we only want to run the tablets path if there are actually any tablets changes, not only schema changes
        // TODO: the current `if (changes_tablets(qp))` is insufficient: someone may set the same RFs as before,
        //       and we'll unnecessarily trigger the processing path for ALTER tablets KS,
@@ -246,6 +269,47 @@ cql3::statements::alter_keyspace_statement::prepare_schema_mutations(query_proce
            muts.insert(muts.begin(), schema_mutations.begin(), schema_mutations.end());
        }

+        auto rs = locator::abstract_replication_strategy::create_replication_strategy(
+                ks_md_update->strategy_name(),
+                locator::replication_strategy_params(ks_md_update->strategy_options(), ks_md_update->initial_tablets()));
+
+        // If `rf_rack_valid_keyspaces` is enabled, it's forbidden to perform a schema change that
+        // would lead to an RF-rack-invalid keyspace. Verify that this change does not.
+        // For more context, see: scylladb/scylladb#23071.
+        try {
+            // There are two things to note here:
+            // 1. We hold a group0_guard, so it's correct to check this here.
+            //    The topology or schema cannot change while we're performing this query.
+            // 2. The replication strategy we use here does NOT represent the actual state
+            //    we will arrive at after applying the schema change. For instance, if the user
+            //    did not specify the RF for some of the DCs, it's equal to 0 in the replication
+            //    strategy we pass to this function, while in reality that means that the RF
+            //    will NOT change. That is not a problem:
+            //    - RF=0 is valid for all DCs, so it won't trigger an exception on its own,
+            //    - the keyspace must've been RF-rack-valid before this change. We check that
+            //      condition for all keyspaces at startup.
+            //    The second bullet point is not strictly true because currently topological changes
+            //    can disturb it (see scylladb/scylladb#23345), but we ignore that.
+            locator::assert_rf_rack_valid_keyspace(_name, tmptr, *rs);
+        } catch (const std::exception& e) {
+            if (qp.db().get_config().rf_rack_valid_keyspaces()) {
+                // There's no guarantee what the type of the exception will be, so we need to
+                // wrap it manually here in a type that can be passed to the user.
+                throw exceptions::invalid_request_exception(e.what());
+            } else {
+                // Even when the configuration option `rf_rack_valid_keyspaces` is set to false,
+                // we'd like to inform the user that the keyspace they're altering will not
+                // satisfy the restriction after the change--but just as a warning.
+                // For more context, see issue: scylladb/scylladb#23330.
+                warnings.push_back(seastar::format(
+                    "Keyspace '{}' is not RF-rack-valid: the replication factor doesn't match "
+                    "the rack count in at least one datacenter. A rack failure may reduce availability. "
+                    "For more context, see: "
+                    "https://docs.scylladb.com/manual/stable/reference/glossary.html#term-RF-rack-valid-keyspace.",
+                    _name));
+            }
+        }
+
        auto ret = ::make_shared<event::schema_change>(
                event::schema_change::change_type::UPDATED,
                target_type,
--- a/cql3/statements/alter_table_statement.cc
+++ b/cql3/statements/alter_table_statement.cc
@@ -8,6 +8,7 @@
 * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
 */

+#include "cdc/log.hh"
 #include "utils/assert.hh"
 #include <seastar/core/coroutine.hh>
 #include "cql3/query_options.hh"
@@ -27,6 +28,7 @@
 #include "db/view/view.hh"
 #include "cql3/query_processor.hh"
 #include "cdc/cdc_extension.hh"
+#include "cdc/cdc_partitioner.hh"

 namespace cql3 {

@@ -276,12 +278,59 @@ void alter_table_statement::drop_column(const query_options& options, const sche
    }
 }

-std::pair<schema_builder, std::vector<view_ptr>> alter_table_statement::prepare_schema_update(data_dictionary::database db, const query_options& options) const {
+std::pair<schema_ptr, std::vector<view_ptr>> alter_table_statement::prepare_schema_update(data_dictionary::database db, const query_options& options) const {
    auto s = validation::validate_column_family(db, keyspace(), column_family());
    if (s->is_view()) {
        throw exceptions::invalid_request_exception("Cannot use ALTER TABLE on Materialized View");
    }

+    const bool is_cdc_log_table = cdc::is_log_for_some_table(db.real_database(), s->ks_name(), s->cf_name());
+    // Only a CDC log table will have this partitioner name. User tables should
+    // not be able to set this. Note that we perform a similar check when trying to
+    // re-enable CDC for a table, when the log table has been replaced by a user table.
+    // For better visualization of the above, consider this
+    //
+    // cqlsh> CREATE TABLE ks.t (p int PRIMARY KEY, v int) WITH cdc = {'enabled': true};
+    // cqlsh> INSERT INTO ks.t (p, v) VALUES (1, 2);
+    // cqlsh> ALTER TABLE ks.t WITH cdc = {'enabled': false};
+    // cqlsh> DESC TABLE ks.t_scylla_cdc_log WITH INTERNALS; # Save this output!
+    // cqlsh> DROP TABLE ks.t_scylla_cdc_log;
+    // cqlsh> [Recreate the log table using the received statement]
+    // cqlsh> ALTER TABLE ks.t WITH cdc = {'enabled': true};
+    //
+    // InvalidRequest: Error from server: code=2200 [Invalid query] message="Cannot create CDC log
+    //                 table for table ks.t because a table of name ks.t_scylla_cdc_log already exists"
+    //
+    // See commit adda43edc75b901b2329bca8f3eb74596698d05f for more information on THAT case.
+    // We reuse the same technique here.
+    const bool was_cdc_log_table = s->get_partitioner().name() == cdc::cdc_partitioner::classname;
+
+    if (_column_changes.size() != 0 && is_cdc_log_table) {
+        throw exceptions::invalid_request_exception(
+                "You cannot modify the set of columns of a CDC log table directly. "
+                "Modify the base table instead.");
+    }
+    if (_column_changes.size() != 0 && was_cdc_log_table) {
+        throw exceptions::invalid_request_exception(
+                "You cannot modify the set of columns of a CDC log table directly. "
+                "Although the base table has deactivated CDC, this table will continue being "
+                "a CDC log table until it is dropped. If you want to modify the columns in it, "
+                "you can only do that by reenabling CDC on the base table, which will reattach "
+                "this log table. Then you will be able to modify the columns in the base table, "
+                "and that will have effect on the log table too. Modifying the columns of a CDC "
+                "log table directly is never allowed.");
+    }
+
+    if (_renames.size() != 0 && is_cdc_log_table) {
+        throw exceptions::invalid_request_exception("Cannot rename a column of a CDC log table.");
+    }
+    if (_renames.size() != 0 && was_cdc_log_table) {
+        throw exceptions::invalid_request_exception(
+                "You cannot rename a column of a CDC log table. Although the base table "
+                "has deactivated CDC, this table will continue being a CDC log table until it "
+                "is dropped.");
+    }
+
    auto cfm = schema_builder(s);

    if (_properties->get_id()) {
@@ -369,41 +418,45 @@ std::pair<schema_builder, std::vector<view_ptr>> alter_table_statement::prepare_

            validate_column_rename(db, *s, *from, *to);
            cfm.rename_column(from->name(), to->name());
-
-            // If the view includes a renamed column, it must be renamed in
-            // the view table and the definition.
-            for (auto&& view : cf.views()) {
+        }
+        // New view schemas contain the new column names, so we need to base them on the
+        // new base schema.
+        schema_ptr new_base_schema = cfm.build();
+        // If the view includes a renamed column, it must be renamed in
+        // the view table and the definition.
+        for (auto&& view : cf.views()) {
+            schema_builder builder(view);
+            std::vector<std::pair<::shared_ptr<column_identifier>, ::shared_ptr<column_identifier>>> view_renames;
+            for (auto&& entry : _renames) {
+                auto from = entry.first->prepare_column_identifier(*s);
                if (view->get_column_definition(from->name())) {
-                    schema_builder builder(view);
-
                    auto view_from = entry.first->prepare_column_identifier(*view);
                    auto view_to = entry.second->prepare_column_identifier(*view);
                    validate_column_rename(db, *view, *view_from, *view_to);
                    builder.rename_column(view_from->name(), view_to->name());
-
-                    auto new_where = util::rename_column_in_where_clause(
-                            view->view_info()->where_clause(),
-                            column_identifier::raw(view_from->text(), true),
-                            column_identifier::raw(view_to->text(), true),
-                            cql3::dialect{});
-                    builder.with_view_info(view->view_info()->base_id(), view->view_info()->base_name(),
-                            view->view_info()->include_all_columns(), std::move(new_where));
-
-                    view_updates.push_back(view_ptr(builder.build()));
+                    view_renames.emplace_back(view_from, view_to);
                }
            }
+            if (!view_renames.empty()) {
+                auto new_where = util::rename_columns_in_where_clause(
+                        view->view_info()->where_clause(),
+                        view_renames,
+                        cql3::dialect{});
+                builder.with_view_info(new_base_schema, view->view_info()->include_all_columns(), std::move(new_where));
+                view_updates.push_back(view_ptr(builder.build()));
+            }
        }
-        break;
+        return make_pair(std::move(new_base_schema), std::move(view_updates));
    }

-    return make_pair(std::move(cfm), std::move(view_updates));
+    return make_pair(cfm.build(), std::move(view_updates));
 }

 future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, std::vector<mutation>, cql3::cql_warnings_vec>>
 alter_table_statement::prepare_schema_mutations(query_processor& qp, const query_options& options, api::timestamp_type ts) const {
  data_dictionary::database db = qp.db();
-  auto [cfm, view_updates] = prepare_schema_update(db, options);
-  auto m = co_await service::prepare_column_family_update_announcement(qp.proxy(), cfm.build(), std::move(view_updates), ts);
+  auto [s, view_updates] = prepare_schema_update(db, options);
+  auto m = co_await service::prepare_column_family_update_announcement(qp.proxy(), std::move(s), std::move(view_updates), ts);

  using namespace cql_transport;
  auto ret = ::make_shared<event::schema_change>(
--- a/cql3/statements/alter_table_statement.hh
+++ b/cql3/statements/alter_table_statement.hh
@@ -69,7 +69,7 @@ private:
    void add_column(const query_options& options, const schema& schema, data_dictionary::table cf, schema_builder& cfm, std::vector<view_ptr>& view_updates, const column_identifier& column_name, const cql3_type validator, const column_definition* def, bool is_static) const;
    void alter_column(const query_options& options, const schema& schema, data_dictionary::table cf, schema_builder& cfm, std::vector<view_ptr>& view_updates, const column_identifier& column_name, const cql3_type validator, const column_definition* def, bool is_static) const;
    void drop_column(const query_options& options, const schema& schema, data_dictionary::table cf, schema_builder& cfm, std::vector<view_ptr>& view_updates, const column_identifier& column_name, const cql3_type validator, const column_definition* def, bool is_static) const;
-    std::pair<schema_builder, std::vector<view_ptr>> prepare_schema_update(data_dictionary::database db, const query_options& options) const;
+    std::pair<schema_ptr, std::vector<view_ptr>> prepare_schema_update(data_dictionary::database db, const query_options& options) const;
 };

 class alter_table_statement::raw_statement : public raw::cf_statement {
--- a/cql3/statements/create_index_statement.cc
+++ b/cql3/statements/create_index_statement.cc
@@ -87,6 +87,9 @@ std::vector<::shared_ptr<index_target>> create_index_statement::validate_while_e
                "Secondary indexes are not supported on COMPACT STORAGE tables that have clustering columns");
    }

+    if (!db.features().views_with_tablets && db.find_keyspace(keyspace()).get_replication_strategy().uses_tablets()) {
+        throw exceptions::invalid_request_exception(format("Secondary indexes are not supported on base tables with tablets (keyspace '{}')", keyspace()));
+    }
    validate_for_local_index(*schema);

    std::vector<::shared_ptr<index_target>> targets;
--- a/cql3/statements/create_keyspace_statement.cc
+++ b/cql3/statements/create_keyspace_statement.cc
@@ -11,6 +11,8 @@
 #include <seastar/core/coroutine.hh>
 #include "cql3/statements/create_keyspace_statement.hh"
 #include "cql3/statements/ks_prop_defs.hh"
+#include "exceptions/exceptions.hh"
+#include "locator/tablets.hh"
 #include "prepared_statement.hh"
 #include "data_dictionary/data_dictionary.hh"
 #include "data_dictionary/keyspace_metadata.hh"
@@ -90,14 +92,14 @@ void create_keyspace_statement::validate(query_processor& qp, const service::cli

 future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, std::vector<mutation>, cql3::cql_warnings_vec>> create_keyspace_statement::prepare_schema_mutations(query_processor& qp, const query_options&, api::timestamp_type ts) const {
    using namespace cql_transport;
-    const auto& tm = *qp.proxy().get_token_metadata_ptr();
+    const auto tmptr = qp.proxy().get_token_metadata_ptr();
    const auto& feat = qp.proxy().features();
    const auto& cfg = qp.db().get_config();
    std::vector<mutation> m;
    std::vector<sstring> warnings;

    try {
-        auto ksm = _attrs->as_ks_metadata(_name, tm, feat, cfg);
+        auto ksm = _attrs->as_ks_metadata(_name, *tmptr, feat, cfg);
        m = service::prepare_new_keyspace_announcement(qp.db().real_database(), ksm, ts);
        // If the new keyspace uses tablets, as long as there are features
        // which aren't supported by tablets we want to warn the user that
@@ -111,10 +113,35 @@ future<std::tuple<::shared_ptr<cql_transport::event::schema_change>, std::vector
        if (rs->uses_tablets()) {
            warnings.push_back(
                "Tables in this keyspace will be replicated using Tablets "
-                "and will not support CDC, LWT and counters features. "
-                "To use CDC, LWT or counters, drop this keyspace and re-create it "
-                "without tablets by adding AND TABLETS = {'enabled': false} "
-                "to the CREATE KEYSPACE statement.");
+                "and will not support Materialized Views, Secondary Indexes, CDC, LWT and counters features. "
+                "To use Materialized Views, Secondary Indexes, CDC, LWT or counters, drop this keyspace and re-create it "
+                "without tablets by adding AND TABLETS = {'enabled': false} to the CREATE KEYSPACE statement.");
+        }
+
+        // If `rf_rack_valid_keyspaces` is enabled, it's forbidden to create an RF-rack-invalid keyspace.
+        // Verify that it's RF-rack-valid.
+        // For more context, see: scylladb/scylladb#23071.
+        try {
+            // We hold a group0_guard, so it's correct to check this here.
+            // The topology or schema cannot change while we're performing this query.
+            locator::assert_rf_rack_valid_keyspace(_name, tmptr, *rs);
+        } catch (const std::exception& e) {
+            if (cfg.rf_rack_valid_keyspaces()) {
+                // There's no guarantee what the type of the exception will be, so we need to
+                // wrap it manually here in a type that can be passed to the user.
+                throw exceptions::invalid_request_exception(e.what());
+            } else {
+                // Even when the configuration option `rf_rack_valid_keyspaces` is set to false,
+                // we'd like to inform the user that the keyspace they're creating does not
+                // satisfy the restriction--but just as a warning.
+                // For more context, see issue: scylladb/scylladb#23330.
+                warnings.push_back(seastar::format(
+                    "Keyspace '{}' is not RF-rack-valid: the replication factor doesn't match "
+                    "the rack count in at least one datacenter. A rack failure may reduce availability. "
+                    "For more context, see: "
+                    "https://docs.scylladb.com/manual/stable/reference/glossary.html#term-RF-rack-valid-keyspace.",
+                    _name));
+            }
        }
    } catch (const exceptions::already_exists_exception& e) {
        if (!_if_not_exists) {
@@ -217,9 +244,6 @@ std::vector<sstring> check_against_restricted_replication_strategies(
    // We ignore errors (non-number, negative number, etc.) here,
    // these are checked and reported elsewhere.
    for (auto opt : attrs.get_replication_options()) {
-        if (opt.first == sstring("initial_tablets")) {
-            continue;
-        }
        try {
            auto rf = std::stol(opt.second);
            if (rf > 0) {
--- a/cql3/statements/create_view_statement.cc
+++ b/cql3/statements/create_view_statement.cc
@@ -140,6 +140,9 @@ std::pair<view_ptr, cql3::cql_warnings_vec> create_view_statement::prepare_view(

    schema_ptr schema = validation::validate_column_family(db, _base_name.get_keyspace(), _base_name.get_column_family());

+    if (!db.features().views_with_tablets && db.find_keyspace(keyspace()).get_replication_strategy().uses_tablets()) {
+        throw exceptions::invalid_request_exception(format("Materialized views are not supported on base tables with tablets"));
+    }
    if (schema->is_counter()) {
        throw exceptions::invalid_request_exception(format("Materialized views are not supported on counter tables"));
    }
@@ -364,7 +367,7 @@ std::pair<view_ptr, cql3::cql_warnings_vec> create_view_statement::prepare_view(
    }

    auto where_clause_text = util::relations_to_where_clause(_where_clause);
-    builder.with_view_info(schema->id(), schema->cf_name(), included.empty(), std::move(where_clause_text));
+    builder.with_view_info(schema, included.empty(), std::move(where_clause_text));

    return std::make_pair(view_ptr(builder.build()), std::move(warnings));
 }
--- a/cql3/statements/ks_prop_defs.cc
+++ b/cql3/statements/ks_prop_defs.cc
@@ -70,6 +70,16 @@ static std::map<sstring, sstring> prepare_options(
        }
    }

+    // #22688 / #20039 - the check for illegal, empty options (after the expansion above)
+    // was moved here. We want to be able to remove DCs once their RF is 0,
+    // in which case the options actually serialized in the resulting mutations
+    // may, in extreme cases, be empty -> this check cannot be done in
+    // verify_options. We only want to apply this constraint to the input
+    // provided by the user.
+    if (options.empty() && !tm.get_topology().get_datacenters().empty()) {
+        throw exceptions::configuration_exception("Configuration for at least one datacenter must be present");
+    }
+
    return options;
 }

@@ -140,7 +150,7 @@ data_dictionary::storage_options ks_prop_defs::get_storage_options() const {
    return opts;
 }

-std::optional<unsigned> ks_prop_defs::get_initial_tablets(std::optional<unsigned> default_value) const {
+std::optional<unsigned> ks_prop_defs::get_initial_tablets(std::optional<unsigned> default_value, bool enforce_tablets) const {
    auto tablets_options = get_map(KW_TABLETS);
    if (!tablets_options) {
        return default_value;
@@ -155,6 +165,9 @@ std::optional<unsigned> ks_prop_defs::get_initial_tablets(std::optional<unsigned
        if (enabled == "true") {
            // nothing
        } else if (enabled == "false") {
+            if (enforce_tablets) {
+                throw exceptions::configuration_exception("Cannot disable tablets for keyspace since tablets are enforced using the `tablets_mode_for_new_keyspaces: enforced` config option.");
+            }
            return std::nullopt;
        } else {
            throw exceptions::configuration_exception(sstring("Tablets enabled value must be true or false; found: ") + enabled);
@@ -189,8 +202,10 @@ bool ks_prop_defs::get_durable_writes() const {
 lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata(sstring ks_name, const locator::token_metadata& tm, const gms::feature_service& feat, const db::config& cfg) {
    auto sc = get_replication_strategy_class().value();
    // if tablets options have not been specified, but tablets are globally enabled, set the value to 0 for N.T.S. only
-    auto enable_tablets = feat.tablets && cfg.enable_tablets();
-    auto initial_tablets = get_initial_tablets(enable_tablets && locator::abstract_replication_strategy::to_qualified_class_name(sc) == "org.apache.cassandra.locator.NetworkTopologyStrategy" ? std::optional<unsigned>(0) : std::nullopt);
+    auto enable_tablets = feat.tablets && cfg.enable_tablets_by_default();
+    std::optional<unsigned> default_initial_tablets = enable_tablets && locator::abstract_replication_strategy::to_qualified_class_name(sc) == "org.apache.cassandra.locator.NetworkTopologyStrategy"
+            ? std::optional<unsigned>(0) : std::nullopt;
+    auto initial_tablets = get_initial_tablets(default_initial_tablets, cfg.enforce_tablets());
    auto options = prepare_options(sc, tm, get_replication_options());
    return data_dictionary::keyspace_metadata::new_keyspace(ks_name, sc,
            std::move(options), initial_tablets, get_boolean(KW_DURABLE_WRITES, true), get_storage_options());
--- a/cql3/statements/ks_prop_defs.hh
+++ b/cql3/statements/ks_prop_defs.hh
@@ -60,7 +60,7 @@ public:
    void validate();
    std::map<sstring, sstring> get_replication_options() const;
    std::optional<sstring> get_replication_strategy_class() const;
-    std::optional<unsigned> get_initial_tablets(std::optional<unsigned> default_value) const;
+    std::optional<unsigned> get_initial_tablets(std::optional<unsigned> default_value, bool enforce_tablets = false) const;
    data_dictionary::storage_options get_storage_options() const;
    bool get_durable_writes() const;
    lw_shared_ptr<data_dictionary::keyspace_metadata> as_ks_metadata(sstring ks_name, const locator::token_metadata&, const gms::feature_service&, const db::config&);
--- a/cql3/util.cc
+++ b/cql3/util.cc
@@ -144,26 +144,29 @@ expr::expression where_clause_to_relations(const std::string_view& where_clause,
    return do_with_parser(where_clause, d, std::mem_fn(&cql3_parser::CqlParser::whereClause));
 }

-sstring rename_column_in_where_clause(const std::string_view& where_clause, column_identifier::raw from, column_identifier::raw to, dialect d) {
+sstring rename_columns_in_where_clause(const std::string_view& where_clause, std::vector<std::pair<::shared_ptr<column_identifier>, ::shared_ptr<column_identifier>>> renames, dialect d) {
    std::vector<expr::expression> relations = boolean_factors(where_clause_to_relations(where_clause, d));
    std::vector<expr::expression> new_relations;
    new_relations.reserve(relations.size());

    for (const expr::expression& old_relation : relations) {
-        expr::expression new_relation = expr::search_and_replace(old_relation,
-            [&](const expr::expression& e) -> std::optional<expr::expression> {
-                if (auto ident = expr::as_if<expr::unresolved_identifier>(&e)) {
-                    if (*ident->ident == from) {
-                        return expr::unresolved_identifier{
-                            ::make_shared<column_identifier::raw>(to)
-                        };
+        new_relations.emplace_back(
+            expr::search_and_replace(old_relation,
+                [&](const expr::expression& e) -> std::optional<expr::expression> {
+                    for (const auto& [view_from, view_to] : renames) {
+                        if (auto ident = expr::as_if<expr::unresolved_identifier>(&e)) {
+                            auto from = column_identifier::raw(view_from->text(), true);
+                            if (*ident->ident == from) {
+                                return expr::unresolved_identifier{
+                                    ::make_shared<column_identifier::raw>(view_to->text(), true)
+                                };
+                            }
+                        }
                    }
+                    return std::nullopt;
                }
-                return std::nullopt;
-            }
+            )
        );
-
-        new_relations.emplace_back(std::move(new_relation));
    }

    return relations_to_where_clause(expr::conjunction{std::move(new_relations)});
--- a/cql3/util.hh
+++ b/cql3/util.hh
@@ -40,7 +40,7 @@ sstring relations_to_where_clause(const expr::expression& e);

 expr::expression where_clause_to_relations(const std::string_view& where_clause, dialect d);

-sstring rename_column_in_where_clause(const std::string_view& where_clause, column_identifier::raw from, column_identifier::raw to, dialect d);
+sstring rename_columns_in_where_clause(const std::string_view& where_clause, std::vector<std::pair<::shared_ptr<column_identifier>, ::shared_ptr<column_identifier>>> renames, dialect d);

 /// build a CQL "select" statement with the desired parameters.
 /// If select_all_columns==true, all columns are selected and the value of
--- a/db/CMakeLists.txt
+++ b/db/CMakeLists.txt
@@ -27,6 +27,7 @@ target_sources(db
    extensions.cc
    heat_load_balance.cc
    large_data_handler.cc
+    corrupt_data_handler.cc
    marshal/type_parser.cc
    batchlog_manager.cc
    tags/utils.cc
--- a/db/batchlog_manager.cc
+++ b/db/batchlog_manager.cc
@@ -20,6 +20,7 @@
 #include <boost/range/adaptor/sliced.hpp>

 #include "batchlog_manager.hh"
+#include "data_dictionary/data_dictionary.hh"
 #include "mutation/canonical_mutation.hh"
 #include "service/storage_proxy.hh"
 #include "system_keyspace.hh"
@@ -38,7 +39,7 @@

 static logging::logger blogger("batchlog_manager");

-const uint32_t db::batchlog_manager::replay_interval;
+const std::chrono::seconds db::batchlog_manager::replay_interval;
 const uint32_t db::batchlog_manager::page_size;

 db::batchlog_manager::batchlog_manager(cql3::query_processor& qp, db::system_keyspace& sys_ks, batchlog_manager_config config)
@@ -59,27 +60,31 @@ db::batchlog_manager::batchlog_manager(cql3::query_processor& qp, db::system_key
    });
 }

-future<> db::batchlog_manager::do_batch_log_replay(post_replay_cleanup cleanup) {
-    return container().invoke_on(0, [cleanup] (auto& bm) -> future<> {
+future<db::all_batches_replayed> db::batchlog_manager::do_batch_log_replay(post_replay_cleanup cleanup) {
+    return container().invoke_on(0, [cleanup] (auto& bm) -> future<db::all_batches_replayed> {
        auto gate_holder = bm._gate.hold();
        auto sem_units = co_await get_units(bm._sem, 1);

        auto dest = bm._cpu++ % smp::count;
        blogger.debug("Batchlog replay on shard {}: starts", dest);
        auto last_replay = gc_clock::now();
+        all_batches_replayed all_replayed = all_batches_replayed::yes;
        if (dest == 0) {
-            co_await bm.replay_all_failed_batches(cleanup);
+            all_replayed = co_await bm.replay_all_failed_batches(cleanup);
        } else {
-            co_await bm.container().invoke_on(dest, [cleanup] (auto& bm) {
+            all_replayed = co_await bm.container().invoke_on(dest, [cleanup] (auto& bm) {
                return with_gate(bm._gate, [&bm, cleanup] {
                    return bm.replay_all_failed_batches(cleanup);
                });
            });
        }
-        co_await bm.container().invoke_on_all([last_replay] (auto& bm) {
-            bm._last_replay = last_replay;
-        });
+        if (all_replayed == all_batches_replayed::yes) {
+            co_await bm.container().invoke_on_all([last_replay] (auto& bm) {
+                bm._last_replay = last_replay;
+            });
+        }
        blogger.debug("Batchlog replay on shard {}: done", dest);
+        co_return all_replayed;
    });
 }

@@ -117,7 +122,8 @@ future<> db::batchlog_manager::batchlog_replay_loop() {
        } catch (...) {
            blogger.error("Exception in batch replay: {}", std::current_exception());
        }
-        delay = std::chrono::milliseconds(replay_interval);
+        delay = utils::get_local_injector().is_enabled("short_batchlog_manager_replay_interval") ?
+                std::chrono::seconds(1) : replay_interval;
    }
 }

@@ -133,6 +139,8 @@ future<> db::batchlog_manager::drain() {
        _sem.broken();
    }

+    co_await _qp.proxy().abort_batch_writes();
+
    co_await std::move(_loop_done);
    blogger.info("Drained");
 }
@@ -156,116 +164,127 @@ db_clock::duration db::batchlog_manager::get_batch_log_timeout() const {
    return _write_request_timeout * 2;
 }

-future<> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cleanup) {
+future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cleanup) {
    typedef db_clock::rep clock_type;

+    db::all_batches_replayed all_replayed = all_batches_replayed::yes;
    // rate limit is in bytes per second. Uses Double.MAX_VALUE if disabled (set to 0 in cassandra.yaml).
    // max rate is scaled by the number of nodes in the cluster (same as for HHOM - see CASSANDRA-5272).
    auto throttle = _replay_rate / _qp.proxy().get_token_metadata_ptr()->count_normal_token_owners();
    auto limiter = make_lw_shared<utils::rate_limiter>(throttle);

-    auto batch = [this, limiter](const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
+    auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
+    auto delete_batch = [this, schema = std::move(schema)] (utils::UUID id) {
+        auto key = partition_key::from_singular(*schema, id);
+        mutation m(schema, key);
+        auto now = service::client_state(service::client_state::internal_tag()).get_timestamp();
+        m.partition().apply_delete(*schema, clustering_key_prefix::make_empty(), tombstone(now, gc_clock::now()));
+        return _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
+    };
+
+    auto batch = [this, limiter, delete_batch = std::move(delete_batch), &all_replayed](const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
        auto written_at = row.get_as<db_clock::time_point>("written_at");
        auto id = row.get_as<utils::UUID>("id");
        // enough time for the actual write + batchlog entry mutation delivery (two separate requests).
+        auto now = db_clock::now();
        auto timeout = get_batch_log_timeout();
-        if (db_clock::now() < written_at + timeout) {
-            blogger.debug("Skipping replay of {}, too fresh", id);
-            return make_ready_future<stop_iteration>(stop_iteration::no);
+
+        if (utils::get_local_injector().is_enabled("skip_batch_replay")) {
+            blogger.debug("Skipping batch replay due to skip_batch_replay injection");
+            all_replayed = all_batches_replayed::no;
+            co_return stop_iteration::no;
        }

        // check version of serialization format
        if (!row.has("version")) {
            blogger.warn("Skipping logged batch because of unknown version");
-            return make_ready_future<stop_iteration>(stop_iteration::no);
+            co_await delete_batch(id);
+            co_return stop_iteration::no;
        }

        auto version = row.get_as<int32_t>("version");
        if (version != netw::messaging_service::current_version) {
-            blogger.warn("Skipping logged batch because of incorrect version");
-            return make_ready_future<stop_iteration>(stop_iteration::no);
+            blogger.warn("Skipping logged batch because of incorrect version {}; current version = {}", version, netw::messaging_service::current_version);
+            co_await delete_batch(id);
+            co_return stop_iteration::no;
        }

        auto data = row.get_blob("data");

        blogger.debug("Replaying batch {}", id);

-        auto fms = make_lw_shared<std::deque<canonical_mutation>>();
-        auto in = ser::as_input_stream(data);
-        while (in.size()) {
-            fms->emplace_back(ser::deserialize(in, std::type_identity<canonical_mutation>()));
-        }
-
-        auto size = data.size();
-
-        return map_reduce(*fms, [this, written_at] (canonical_mutation& fm) {
-            const auto& cf = _qp.proxy().local_db().find_column_family(fm.column_family_id());
-            return make_ready_future<canonical_mutation*>(written_at > cf.get_truncation_time() ? &fm : nullptr);
-        },
-        std::vector<mutation>(),
-        [this] (std::vector<mutation> mutations, canonical_mutation* fm) {
-            if (fm) {
-                schema_ptr s = _qp.db().find_schema(fm->column_family_id());
-                mutations.emplace_back(fm->to_mutation(s));
+        try {
+            auto fms = make_lw_shared<std::deque<canonical_mutation>>();
+            auto in = ser::as_input_stream(data);
+            while (in.size()) {
+                fms->emplace_back(ser::deserialize(in, std::type_identity<canonical_mutation>()));
+                schema_ptr s = _qp.db().find_schema(fms->back().column_family_id());
+                timeout = std::min(timeout, std::chrono::duration_cast<db_clock::duration>(s->tombstone_gc_options().propagation_delay_in_seconds()));
            }
-            return mutations;
-        }).then([this, limiter, written_at, size, fms] (std::vector<mutation> mutations) {
-            if (mutations.empty()) {
-                return make_ready_future<>();
+
+            if (now < written_at + timeout) {
+                blogger.debug("Skipping replay of {}, too fresh", id);
+                co_return stop_iteration::no;
            }
-            const auto ttl = [written_at]() -> clock_type {
-                /*
-                 * Calculate ttl for the mutations' hints (and reduce ttl by the time the mutations spent in the batchlog).
-                 * This ensures that deletes aren't "undone" by an old batch replay.
-                 */
-                auto unadjusted_ttl = std::numeric_limits<gc_clock::rep>::max();
-                warn(unimplemented::cause::HINT);
-#if 0
-                for (auto& m : *mutations) {
-                    unadjustedTTL = Math.min(unadjustedTTL, HintedHandOffManager.calculateHintTTL(mutation));
+
+            auto size = data.size();
+
+            auto mutations = co_await map_reduce(*fms, [this, written_at] (canonical_mutation& fm) {
+                const auto& cf = _qp.proxy().local_db().find_column_family(fm.column_family_id());
+                return make_ready_future<canonical_mutation*>(written_at > cf.get_truncation_time() ? &fm : nullptr);
+            },
+            std::vector<mutation>(),
+            [this] (std::vector<mutation> mutations, canonical_mutation* fm) {
+                if (fm) {
+                    schema_ptr s = _qp.db().find_schema(fm->column_family_id());
+                    mutations.emplace_back(fm->to_mutation(s));
                }
-#endif
-                return unadjusted_ttl - std::chrono::duration_cast<gc_clock::duration>(db_clock::now() - written_at).count();
-            }();
-
-            if (ttl <= 0) {
-                return make_ready_future<>();
-            }
-            // Origin does the send manually, however I can't see a super great reason to do so.
-            // Our normal write path does not add much redundancy to the dispatch, and rate is handled after send
-            // in both cases.
-            // FIXME: verify that the above is reasonably true.
-            return limiter->reserve(size).then([this, mutations = std::move(mutations)] {
-                _stats.write_attempts += mutations.size();
-                // #1222 - change cl level to ALL, emulating origins behaviour of sending/hinting
-                // to all natural end points.
-                // Note however that origin uses hints here, and actually allows for this
-                // send to partially or wholly fail in actually sending stuff. Since we don't
-                // have hints (yet), send with CL=ALL, and hope we can re-do this soon.
-                // See below, we use retry on write failure.
-                return _qp.proxy().mutate(mutations, db::consistency_level::ALL, db::no_timeout, nullptr, empty_service_permit(), db::allow_per_partition_rate_limit::no);
+                return mutations;
            });
-        }).then_wrapped([this, id](future<> batch_result) {
-            try {
-                batch_result.get();
-            } catch (data_dictionary::no_such_keyspace& ex) {
-                // should probably ignore and drop the batch
-            } catch (...) {
-                blogger.warn("Replay failed (will retry): {}", std::current_exception());
-                // timeout, overload etc.
-                // Do _not_ remove the batch, assuning we got a node write error.
-                // Since we don't have hints (which origin is satisfied with),
-                // we have to resort to keeping this batch to next lap.
-                return make_ready_future<>();
+
+            if (!mutations.empty()) {
+                const auto ttl = [written_at]() -> clock_type {
+                    /*
+                    * Calculate ttl for the mutations' hints (and reduce ttl by the time the mutations spent in the batchlog).
+                    * This ensures that deletes aren't "undone" by an old batch replay.
+                    */
+                    auto unadjusted_ttl = std::numeric_limits<gc_clock::rep>::max();
+                    warn(unimplemented::cause::HINT);
+#if 0
+                    for (auto& m : *mutations) {
+                        unadjustedTTL = Math.min(unadjustedTTL, HintedHandOffManager.calculateHintTTL(mutation));
+                    }
+#endif
+                    return unadjusted_ttl - std::chrono::duration_cast<gc_clock::duration>(db_clock::now() - written_at).count();
+                }();
+
+                if (ttl > 0) {
+                    // Origin does the send manually, however I can't see a super great reason to do so.
+                    // Our normal write path does not add much redundancy to the dispatch, and rate is handled after send
+                    // in both cases. FIXME: verify that the above is reasonably true.
+                    // Reserve rate-limiter budget for the batch's data size, then send with a deadline of now + write_timeout.
+                    co_await limiter->reserve(size);
+                    _stats.write_attempts += mutations.size();
+                    const auto write_deadline = db::timeout_clock::now() + write_timeout;
+                    co_await _qp.proxy().send_batchlog_replay_to_all_replicas(std::move(mutations), write_deadline);
+                }
            }
-            // delete batch
-            auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
-            auto key = partition_key::from_singular(*schema, id);
-            mutation m(schema, key);
-            auto now = service::client_state(service::client_state::internal_tag()).get_timestamp();
-            m.partition().apply_delete(*schema, clustering_key_prefix::make_empty(), tombstone(now, gc_clock::now()));
-            return _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
-        }).then([] { return make_ready_future<stop_iteration>(stop_iteration::no); });
+        } catch (data_dictionary::no_such_keyspace& ex) {
+            // should probably ignore and drop the batch
+        } catch (const data_dictionary::no_such_column_family&) {
+            // As above -- we should drop the batch if the table doesn't exist anymore.
+        } catch (...) {
+            blogger.warn("Replay failed (will retry): {}", std::current_exception());
+            all_replayed = all_batches_replayed::no;
+            // timeout, overload etc.
+            // Do _not_ remove the batch, assuming we got a node write error.
+            // Since we don't have hints (which origin is satisfied with),
+            // we have to resort to keeping this batch to next lap.
+            co_return stop_iteration::no;
+        }
+        // delete batch
+        co_await delete_batch(id);
+        co_return stop_iteration::no;
    };

    co_await seastar::with_gate(_gate, [this, cleanup, batch = std::move(batch)] () mutable -> future<> {
@@ -287,4 +306,6 @@ future<> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cle
            blogger.debug("Finished replayAllFailedBatches");
        });
    });
+
+    co_return all_replayed;
 }
--- a/db/batchlog_manager.hh
+++ b/db/batchlog_manager.hh
@@ -31,6 +31,8 @@ namespace db {

 class system_keyspace;

+using all_batches_replayed = bool_class<struct all_batches_replayed_tag>;
+
 struct batchlog_manager_config {
    std::chrono::duration<double> write_request_timeout;
    uint64_t replay_rate = std::numeric_limits<uint64_t>::max();
@@ -43,8 +45,9 @@ public:
    using post_replay_cleanup = bool_class<class post_replay_cleanup_tag>;

 private:
-    static constexpr uint32_t replay_interval = 60 * 1000; // milliseconds
+    static constexpr std::chrono::seconds replay_interval = std::chrono::seconds(60);
    static constexpr uint32_t page_size = 128; // same as HHOM, for now, w/out using any heuristics. TODO: set based on avg batch size.
+    static constexpr std::chrono::seconds write_timeout = std::chrono::seconds(300);

    using clock_type = lowres_clock;

@@ -69,7 +72,7 @@ private:

    gc_clock::time_point _last_replay;

-    future<> replay_all_failed_batches(post_replay_cleanup cleanup);
+    future<all_batches_replayed> replay_all_failed_batches(post_replay_cleanup cleanup);
 public:
    // Takes a QP, not a distributes. Because this object is supposed
    // to be per shard and does no dispatching beyond delegating the the
@@ -80,7 +83,7 @@ public:
    future<> drain();
    future<> stop();

-    future<> do_batch_log_replay(post_replay_cleanup cleanup);
+    future<all_batches_replayed> do_batch_log_replay(post_replay_cleanup cleanup);

    future<size_t> count_all_batches() const;
    size_t get_total_batches_replayed() const {
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -500,6 +500,9 @@ public:
    void flush_segments(uint64_t size_to_remove);
    void check_no_data_older_than_allowed();

+    // whitebox testing
+    std::function<future<>()> _oversized_pre_wait_memory_func;
+
 private:
    class shutdown_marker{};

@@ -799,6 +802,8 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
    void end_flush() {
        _segment_manager->end_flush();
        if (can_delete()) {
+            // #25709 - do this early if possible
+            _extended_segments.clear();
            _segment_manager->discard_unused_segments();
        }
    }
@@ -874,6 +879,8 @@ public:
    void release_cf_count(const cf_id_type& cf) {
        mark_clean(cf, 1);
        if (can_delete()) {
+            // #25709 - do this early if possible
+            _extended_segments.clear();
            _segment_manager->discard_unused_segments();
        }
    }
@@ -1591,8 +1598,15 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ

    scope_increment_counter allocating(totals.active_allocations);

+    // #27992 - whitebox testing. signal we are trying to lock out 
+    // all allocators
+    if (_oversized_pre_wait_memory_func) {
+        co_await _oversized_pre_wait_memory_func();
+    }
+
    auto permit = co_await std::move(fut);
-    SCYLLA_ASSERT(_request_controller.available_units() == 0);
+    // #27992 - task reordering _can_ force the available units to negative. this is ok.
+    SCYLLA_ASSERT(_request_controller.available_units() <= 0);

    decltype(permit) fake_permit; // can't have allocate+sync release semaphore.
    bool failed = false;
@@ -1853,13 +1867,15 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
            }
        }
    }
-    SCYLLA_ASSERT(_request_controller.available_units() == 0);
+
+    auto avail = _request_controller.available_units();
+    SCYLLA_ASSERT(avail <= 0);
    SCYLLA_ASSERT(permit.count() == max_request_controller_units());
    auto nw = _request_controller.waiters();
    permit.return_all();
    // #20633 cannot guarantee controller avail is now full, since we could have had waiters when doing
    // return all -> now will be less avail
-    SCYLLA_ASSERT(nw > 0 || _request_controller.available_units() == ssize_t(max_request_controller_units()));
+    SCYLLA_ASSERT(nw > 0 || _request_controller.available_units() == (avail + ssize_t(max_request_controller_units())));

    if (!failed) {
        clogger.trace("Oversized allocation succeeded.");
@@ -1967,13 +1983,13 @@ future<> db::commitlog::segment_manager::replenish_reserve() {
            }
            continue;
        } catch (shutdown_marker&) {
-            _reserve_segments.abort(std::current_exception());
            break;
        } catch (...) {
            clogger.warn("Exception in segment reservation: {}", std::current_exception());
        }
        co_await sleep(100ms);
    }
+    _reserve_segments.abort(std::make_exception_ptr(shutdown_marker()));
 }

 future<std::vector<db::commitlog::descriptor>>
@@ -2574,20 +2590,24 @@ struct fmt::formatter<db::commitlog::segment::cf_mark> {
 void db::commitlog::segment_manager::discard_unused_segments() noexcept {
    clogger.trace("Checking for unused segments ({} active)", _segments.size());

-    std::erase_if(_segments, [=](sseg_ptr s) {
-        if (s->can_delete()) {
-            clogger.debug("Segment {} is unused", *s);
-            return true;
-        }
-        if (s->is_still_allocating()) {
-            clogger.debug("Not safe to delete segment {}; still allocating.", *s);
-        } else if (!s->is_clean()) {
-            clogger.debug("Not safe to delete segment {}; dirty is {}", *s, segment::cf_mark {*s});
-        } else {
-            clogger.debug("Not safe to delete segment {}; disk ops pending", *s);
-        }
-        return false;
-    });
+    // #25709 ensure we don't free any segment until after prune.
+    {
+        auto tmp = _segments; 
+        std::erase_if(_segments, [=](sseg_ptr s) {
+            if (s->can_delete()) {
+                clogger.debug("Segment {} is unused", *s);
+                return true;
+            }
+            if (s->is_still_allocating()) {
+                clogger.debug("Not safe to delete segment {}; still allocating.", *s);
+            } else if (!s->is_clean()) {
+                clogger.debug("Not safe to delete segment {}; dirty is {}", *s, segment::cf_mark {*s});
+            } else {
+                clogger.debug("Not safe to delete segment {}; disk ops pending", *s);
+            }
+            return false;
+        });
+    }

    // launch in background, but guard with gate so this deletion is
    // sure to finish in shutdown, because at least through this path,
@@ -2875,7 +2895,10 @@ future<> db::commitlog::segment_manager::do_pending_deletes() {
 }

 future<> db::commitlog::segment_manager::orphan_all() {
-    _segments.clear();
+    // #25709. the actual process of destroying the elements here
+    // might cause a call into discard_unused_segments.
+    // ensure the target vector is empty when we get to destructors
+    auto tmp = std::exchange(_segments, {});
    return clear_reserve_segments();
 }

@@ -3252,9 +3275,13 @@ const db::commitlog::config& db::commitlog::active_config() const {
    return _segment_manager->cfg;
 }

+db::commitlog::segment_data_corruption_error::segment_data_corruption_error(std::string_view msg, uint64_t s)
+    : _msg(fmt::format("Segment data corruption: {}", msg))
+    , _bytes(s)
+{}

-db::commitlog::segment_truncation::segment_truncation(uint64_t pos) 
-    : _msg(fmt::format("Segment truncation at {}", pos))
+db::commitlog::segment_truncation::segment_truncation(std::string_view reason, uint64_t pos)
+    : _msg(fmt::format("Segment truncation at {}. Reason: {}", pos, reason))
    , _pos(pos)
 {}

@@ -3444,7 +3471,8 @@ db::commitlog::read_log_file(const replay_state& state, sstring filename, sstrin

            while (rem < size) {
                if (eof) {
-                    throw segment_truncation(block_boundry);
+                    auto reason = fmt::format("unexpected EOF, rem={}, size={}", rem, size);
+                    throw segment_truncation(std::move(reason), block_boundry);
                }

                auto block_size = alignment - initial.size_bytes();
@@ -3455,7 +3483,8 @@ db::commitlog::read_log_file(const replay_state& state, sstring filename, sstrin

                if (tmp.size_bytes() == 0) {
                    eof = true;
-                    throw segment_truncation(block_boundry);
+                    auto reason = fmt::format("read 0 bytes, while tried to read {}", block_size);
+                    throw segment_truncation(std::move(reason), block_boundry);
                }

                crc32_nbo crc;
@@ -3490,10 +3519,12 @@ db::commitlog::read_log_file(const replay_state& state, sstring filename, sstrin
                    auto checksum = crc.checksum();

                    if (check != checksum) {
-                        throw segment_data_corruption_error("Data corruption", alignment);
+                        auto reason = fmt::format("checksums do not match: {:x} vs. {:x}", check, checksum);
+                        throw segment_data_corruption_error(std::move(reason), alignment);
                    }
                    if (id != this->id) {
-                        throw segment_truncation(pos + rem);
+                        auto reason = fmt::format("IDs do not match: {} vs. {}", id, this->id);
+                        throw segment_truncation(std::move(reason), pos + rem);
                    }
                }
                tmp.remove_suffix(detail::sector_overhead_size);
@@ -3601,6 +3632,10 @@ db::commitlog::read_log_file(const replay_state& state, sstring filename, sstrin
            auto old = pos;
            pos = next_pos(off);
            clogger.trace("Pos {} -> {} ({})", old, pos, off);
+            // #24346 check eof status whenever we move file pos.
+            if (pos >= file_size) {
+                eof = true;
+            }
        }

        future<> read_entry() {
@@ -3768,7 +3803,8 @@ db::commitlog::read_log_file(const replay_state& state, sstring filename, sstrin
                    co_await read_chunk();
                }
                if (corrupt_size > 0) {
-                    throw segment_data_corruption_error("Data corruption", corrupt_size);
+                    auto reason = fmt::format("corrupted size while reading file: {}", corrupt_size);
+                    throw segment_data_corruption_error(std::move(reason), corrupt_size);
                }
            } catch (...) {
                p = std::current_exception();
@@ -3915,6 +3951,9 @@ void db::commitlog::update_max_data_lifetime(std::optional<uint64_t> commitlog_d
    _segment_manager->cfg.commitlog_data_max_lifetime_in_seconds = commitlog_data_max_lifetime_in_seconds;
 }

+void db::commitlog::set_oversized_pre_wait_memory_func(std::function<future<>()> f) {
+    _segment_manager->_oversized_pre_wait_memory_func = std::move(f);
+}

 future<std::vector<sstring>> db::commitlog::get_segments_to_replay() const {
    return _segment_manager->get_segments_to_replay();
--- a/db/commitlog/commitlog.hh
+++ b/db/commitlog/commitlog.hh
@@ -385,6 +385,9 @@ public:
    // (Re-)set data mix lifetime.
    void update_max_data_lifetime(std::optional<uint64_t> commitlog_data_max_lifetime_in_seconds);

+    // Whitebox testing. Do not use for production
+    void set_oversized_pre_wait_memory_func(std::function<future<>()>);
+
    using commit_load_reader_func = std::function<future<>(buffer_and_replay_position)>;

    class segment_error : public std::exception {};
@@ -392,9 +395,7 @@ public:
    class segment_data_corruption_error: public segment_error {
        std::string _msg;
    public:
-        segment_data_corruption_error(std::string msg, uint64_t s)
-                : _msg(std::move(msg)), _bytes(s) {
-        }
+        segment_data_corruption_error(std::string_view msg, uint64_t s);
        uint64_t bytes() const {
            return _bytes;
        }
@@ -425,7 +426,7 @@ public:
        std::string _msg;
        uint64_t _pos;
    public:
-        segment_truncation(uint64_t);
+        segment_truncation(std::string_view reason, uint64_t position);

        uint64_t position() const;
        const char* what() const noexcept override;
--- a/db/config.cc
+++ b/db/config.cc
@@ -238,6 +238,13 @@ const config_type& config_type_for<enum_option<db::tri_mode_restriction_t>>() {
    return ct;
 }

+template <>
+const config_type& config_type_for<enum_option<db::tablets_mode_t>>() {
+    static config_type ct(
+        "tablets mode", printable_to_json<enum_option<db::tablets_mode_t>>);
+    return ct;
+}
+
 template <>
 const config_type& config_type_for<db::config::hinted_handoff_enabled_type>() {
    static config_type ct("hinted handoff enabled", hinted_handoff_enabled_to_json);
@@ -372,6 +379,23 @@ public:
    }
 };

+template <>
+class convert<enum_option<db::tablets_mode_t>> {
+public:
+    static bool decode(const Node& node, enum_option<db::tablets_mode_t>& rhs) {
+        std::string name;
+        if (!convert<std::string>::decode(node, name)) {
+            return false;
+        }
+        try {
+            std::istringstream(name) >> rhs;
+        } catch (boost::program_options::invalid_option_value&) {
+            return false;
+        }
+        return true;
+    }
+};
+
 template<>
 struct convert<db::config::error_injection_at_startup> {
    static bool decode(const Node& node, db::config::error_injection_at_startup& rhs) {
@@ -536,6 +560,9 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "The directory where the schema commit log is stored. This is a special commitlog instance used for schema and system tables. For optimal write performance, it is recommended the commit log be on a separate disk partition (ideally, a separate physical device) from the data file directories.")
    , data_file_directories(this, "data_file_directories", "datadir", value_status::Used, { },
        "The directory location where table data (SSTables) is stored.")
+    , data_file_capacity(this, "data_file_capacity", liveness::LiveUpdate, value_status::Used, 0,
+        "Total capacity in bytes for storing data files. Used by tablet load balancer to compute storage utilization."
+        " If not set, will use file system's capacity.")
    , hints_directory(this, "hints_directory", value_status::Used, "",
        "The directory where hints files are stored if hinted handoff is enabled.")
    , view_hints_directory(this, "view_hints_directory", value_status::Used, "",
@@ -879,6 +906,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "The default timeout for other, miscellaneous operations.\n"
        "\n"
        "Related information: About hinted handoff writes")
+    , request_timeout_on_shutdown_in_seconds(this, "request_timeout_on_shutdown_in_seconds", value_status::Used, 30,
+        "Timeout for CQL server requests on shutdown. After this timeout the server will shutdown all connections.")
    /**
    * @Group Inter-node settings
    */
@@ -1201,7 +1230,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
            "Start serializing reads after their collective memory consumption goes above $normal_limit * $multiplier.")
    , reader_concurrency_semaphore_kill_limit_multiplier(this, "reader_concurrency_semaphore_kill_limit_multiplier", liveness::LiveUpdate, value_status::Used, 4,
            "Start killing reads after their collective memory consumption goes above $normal_limit * $multiplier.")
-    , reader_concurrency_semaphore_cpu_concurrency(this, "reader_concurrency_semaphore_cpu_concurrency", liveness::LiveUpdate, value_status::Used, 1,
+    , reader_concurrency_semaphore_cpu_concurrency(this, "reader_concurrency_semaphore_cpu_concurrency", liveness::LiveUpdate, value_status::Used, 2,
            "Admit new reads while there are less than this number of requests that need CPU.")
    , view_update_reader_concurrency_semaphore_serialize_limit_multiplier(this, "view_update_reader_concurrency_semaphore_serialize_limit_multiplier", liveness::LiveUpdate, value_status::Used, 2,
            "Start serializing view update reads after their collective memory consumption goes above $normal_limit * $multiplier.")
@@ -1354,7 +1383,11 @@ db::config::config(std::shared_ptr<db::extensions> exts)

    , error_injections_at_startup(this, "error_injections_at_startup", error_injection_value_status, {}, "List of error injections that should be enabled on startup.")
    , topology_barrier_stall_detector_threshold_seconds(this, "topology_barrier_stall_detector_threshold_seconds", value_status::Used, 2, "Report sites blocking topology barrier if it takes longer than this.")
-    , enable_tablets(this, "enable_tablets", value_status::Used, false, "Enable tablets for newly created keyspaces.")
+    , enable_tablets(this, "enable_tablets", value_status::Used, false, "Enable tablets for newly created keyspaces. (deprecated)")
+    , tablets_mode_for_new_keyspaces(this, "tablets_mode_for_new_keyspaces", value_status::Used, tablets_mode_t::mode::unset, "Control tablets for new keyspaces.  Can be set to the following values:\n"
+            "\tdisabled: New keyspaces use vnodes by default, unless enabled by the tablets={'enabled':true} option\n"
+            "\tenabled:  New keyspaces use tablets by default, unless disabled by the tablets={'disabled':true} option\n"
+            "\tenforced: New keyspaces must use tablets. Tablets cannot be disabled using the CREATE KEYSPACE option")
    , view_flow_control_delay_limit_in_ms(this, "view_flow_control_delay_limit_in_ms", liveness::LiveUpdate, value_status::Used, 1000,
        "The maximal amount of time that materialized-view update flow control may delay responses "
        "to try to slow down the client and prevent buildup of unfinished view updates. "
@@ -1364,6 +1397,9 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , disk_space_monitor_high_polling_interval_in_seconds(this, "disk_space_monitor_high_polling_interval_in_seconds", value_status::Used, 1, "Disk-space polling interval at or above polling threshold")
    , disk_space_monitor_polling_interval_threshold(this, "disk_space_monitor_polling_interval_threshold", value_status::Used, 0.9, "Disk-space polling threshold. Polling interval is increased when disk utilization is greater than or equal to this threshold")
    , enable_create_table_with_compact_storage(this, "enable_create_table_with_compact_storage", liveness::LiveUpdate, value_status::Used, false, "Enable the deprecated feature of CREATE TABLE WITH COMPACT STORAGE.  This feature will eventually be removed in a future version.")
+    , rf_rack_valid_keyspaces(this, "rf_rack_valid_keyspaces", liveness::MustRestart, value_status::Used, false,
+        "Enforce RF-rack-valid keyspaces. Additionally, if there are existing RF-rack-invalid "
+        "keyspaces, attempting to start a node with this option ON will fail.")
    , default_log_level(this, "default_log_level", value_status::Used, seastar::log_level::info, "Default log level for log messages")
    , logger_log_level(this, "logger_log_level", value_status::Used, {}, "Map of logger name to log level. Valid log levels are 'error', 'warn', 'info', 'debug' and 'trace'")
    , log_to_stdout(this, "log_to_stdout", value_status::Used, true, "Send log output to stdout")
@@ -1579,6 +1615,16 @@ std::unordered_map<sstring, db::tri_mode_restriction_t::mode> db::tri_mode_restr
            {"warn", db::tri_mode_restriction_t::mode::WARN}};
 }

+std::unordered_map<sstring, db::tablets_mode_t::mode> db::tablets_mode_t::map() {
+    return {{"disabled", db::tablets_mode_t::mode::disabled},
+            {"0", db::tablets_mode_t::mode::disabled},
+            {"enabled", db::tablets_mode_t::mode::enabled},
+            {"1", db::tablets_mode_t::mode::enabled},
+            {"enforced", db::tablets_mode_t::mode::enforced},
+            {"2", db::tablets_mode_t::mode::enforced}
+            };
+}
+
 template struct utils::config_file::named_value<seastar::log_level>;

 namespace utils {
--- a/db/config.hh
+++ b/db/config.hh
@@ -130,6 +130,20 @@ struct replication_strategy_restriction_t {

 constexpr unsigned default_murmur3_partitioner_ignore_msb_bits = 12;

+// Value set of the `tablets_mode_for_new_keyspaces` configuration option.
+// config::enable_tablets_by_default() and config::enforce_tablets() translate
+// a mode into the behavior it selects.
+struct tablets_mode_t {
+    // The `unset` mode is used internally for backward compatibility
+    // with the legacy `enable_tablets` option.
+    // It is defined as -1 as existing test code associates the value
+    // 0 with `false` and 1 with `true` when read from system.config.
+    enum class mode : int8_t {
+        unset = -1,
+        disabled = 0,
+        enabled = 1,
+        enforced = 2
+    };
+    // Maps each accepted textual spelling of the option to its mode.
+    static std::unordered_map<sstring, mode> map(); // for enum_option<>
+};
+
 class config final : public utils::config_file {
 public:
    config();
@@ -183,6 +197,7 @@ public:
    named_value<sstring> commitlog_directory;
    named_value<sstring> schema_commitlog_directory;
    named_value<string_list> data_file_directories;
+    named_value<uint64_t> data_file_capacity;
    named_value<sstring> hints_directory;
    named_value<sstring> view_hints_directory;
    named_value<sstring> saved_caches_directory;
@@ -278,6 +293,7 @@ public:
    named_value<uint32_t> truncate_request_timeout_in_ms;
    named_value<uint32_t> write_request_timeout_in_ms;
    named_value<uint32_t> request_timeout_in_ms;
+    named_value<uint32_t> request_timeout_on_shutdown_in_seconds;
    named_value<bool> cross_node_timeout;
    named_value<uint32_t> internode_send_buff_size_in_bytes;
    named_value<uint32_t> internode_recv_buff_size_in_bytes;
@@ -527,6 +543,23 @@ public:
    named_value<std::vector<error_injection_at_startup>> error_injections_at_startup;
    named_value<double> topology_barrier_stall_detector_threshold_seconds;
    named_value<bool> enable_tablets;
+    named_value<enum_option<tablets_mode_t>> tablets_mode_for_new_keyspaces;
+
+    // Tells whether tablets are enabled by default, as selected by
+    // `tablets_mode_for_new_keyspaces`:
+    //   unset             - defer to the legacy `enable_tablets` option
+    //   disabled          - false
+    //   enabled, enforced - true
+    bool enable_tablets_by_default() const noexcept {
+        switch (tablets_mode_for_new_keyspaces()) {
+        case tablets_mode_t::mode::unset:
+            return enable_tablets();
+        case tablets_mode_t::mode::disabled:
+            return false;
+        case tablets_mode_t::mode::enabled:
+        case tablets_mode_t::mode::enforced:
+            return true;
+        }
+        // Every enumerator is handled above, so this is not expected to be reached.
+        // It keeps the function from flowing off its end (undefined behavior for a
+        // non-void function) should the option ever hold a value without a case.
+        return false;
+    }
+    // True only when `tablets_mode_for_new_keyspaces` is set to `enforced`.
+    bool enforce_tablets() const noexcept {
+        const tablets_mode_t::mode configured = tablets_mode_for_new_keyspaces();
+        return configured == tablets_mode_t::mode::enforced;
+    }
+
    named_value<uint32_t> view_flow_control_delay_limit_in_ms;

    named_value<int> disk_space_monitor_normal_polling_interval_in_seconds;
@@ -535,6 +568,8 @@ public:

    named_value<bool> enable_create_table_with_compact_storage;

+    named_value<bool> rf_rack_valid_keyspaces;
+
    static const sstring default_tls_priority;
 private:
    template<typename T>
--- a/db/corrupt_data_handler.cc
+++ b/db/corrupt_data_handler.cc
@@ -0,0 +1,139 @@
+/*
+ * Copyright (C) 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#include "db/corrupt_data_handler.hh"
+#include "reader_concurrency_semaphore.hh"
+#include "replica/database.hh"
+#include "utils/UUID_gen.hh"
+
+static logging::logger corrupt_data_logger("corrupt_data");
+
+namespace sm = seastar::metrics;
+
+namespace db {
+
+// Registers the "corrupt_data" metric group when `rm` is set; otherwise no metrics are exposed.
+// Only the total "reported" counter is exported as a metric, the remaining counters
+// are reachable through get_stats().
+corrupt_data_handler::corrupt_data_handler(register_metrics rm) {
+    if (!rm) {
+        return;
+    }
+
+    _metrics.add_group("corrupt_data", {
+            sm::make_counter("entries_reported", _stats.corrupt_data_reported,
+                    sm::description("Counts the number of corrupt data instances reported to the corrupt data handler. "
+                                    "A non-zero value indicates that the database suffered data corruption."))
+    });
+}
+
+// Public entry point for reporting a corrupt clustering row.
+// Bumps the "reported" counters right away, hands the row to the storage-specific
+// do_record_corrupt_clustering_row() hook and bumps the "recorded" counters only
+// if the hook resolves to a non-null entry id.
+future<corrupt_data_handler::entry_id> corrupt_data_handler::record_corrupt_clustering_row(const schema& s, const partition_key& pk,
+        clustering_row cr, sstring origin, std::optional<sstring> sstable_name) {
+    ++_stats.corrupt_data_reported;
+    ++_stats.corrupt_clustering_rows_reported;
+
+    auto account_recorded = [this] (entry_id recorded_id) {
+        if (!recorded_id) {
+            // Null id: the hook did not store the row.
+            return recorded_id;
+        }
+        ++_stats.corrupt_data_recorded;
+        ++_stats.corrupt_clustering_rows_recorded;
+        return recorded_id;
+    };
+
+    return do_record_corrupt_clustering_row(s, pk, std::move(cr), std::move(origin), std::move(sstable_name))
+            .then(std::move(account_recorded));
+}
+
+// Stores the entry TTL from `cfg` and forwards `rm` to the base class.
+// The system keyspace is attached later, via plug_system_keyspace().
+system_table_corrupt_data_handler::system_table_corrupt_data_handler(config cfg, register_metrics rm)
+    : corrupt_data_handler(rm)
+    , _entry_ttl(cfg.entry_ttl)
+{
+}
+
+// Defined out of line: the header only forward-declares reader_concurrency_semaphore,
+// while destroying the std::unique_ptr member needs the complete type.
+system_table_corrupt_data_handler::~system_table_corrupt_data_handler() {
+}
+
+// Creates a tracking-only permit on the handler's private semaphore, used to build the
+// mutation fragment that wraps a corrupt row.
+// Dereferences _fragment_semaphore, which only exists after plug_system_keyspace().
+reader_permit system_table_corrupt_data_handler::make_fragment_permit(const schema& s) {
+    auto table_schema = s.shared_from_this();
+    return _fragment_semaphore->make_tracking_only_permit(
+            std::move(table_schema),
+            "system_table_corrupt_data_handler::make_fragment_permit",
+            db::no_timeout,
+            {});
+}
+
+// Writes one entry describing a corrupt mutation fragment to the
+// system_keyspace::CORRUPT_DATA table and resolves to the id of that entry.
+//
+// The entry is stored in the partition (keyspace name, table name) of the user table and
+// is keyed by a freshly generated time-UUID, which doubles as the returned entry id.
+// The row marker and all cells are written with the configured entry TTL.
+//
+// permit            - gate holder; kept alive until the write completes, so that stop()
+//                     waits for it.
+// user_table_schema - schema of the table the corrupt fragment belongs to.
+// pk, ck            - keys of the corrupt fragment, stored in their serialized representation.
+// kind              - kind of the fragment, stored in its textual form.
+// fmf               - the frozen fragment itself.
+// origin            - caller-provided description of where the corruption was detected.
+// sstable_name      - optional sstable name; the column is left unset when absent.
+future<corrupt_data_handler::entry_id> system_table_corrupt_data_handler::do_record_corrupt_mutation_fragment(
+        gate::holder permit,
+        const schema& user_table_schema,
+        const partition_key& pk,
+        const clustering_key& ck,
+        mutation_fragment_v2::kind kind,
+        frozen_mutation_fragment_v2 fmf,
+        sstring origin,
+        std::optional<sstring> sstable_name) {
+    const corrupt_data_handler::entry_id id{utils::UUID_gen::get_time_UUID()};
+
+    const auto corrupt_data_schema = _sys_ks->local_db().find_column_family(system_keyspace::NAME, system_keyspace::CORRUPT_DATA).schema();
+
+    // Using the lower-level mutation API to avoid large allocation warnings when linearizing the frozen mutation fragment.
+    mutation entry_mutation(corrupt_data_schema, partition_key::from_exploded(*corrupt_data_schema, {serialized(user_table_schema.ks_name()), serialized(user_table_schema.cf_name())}));
+    auto& entry_row = entry_mutation.partition().clustered_row(*corrupt_data_schema, clustering_key::from_single_value(*corrupt_data_schema, serialized(timeuuid_native_type{id.uuid()})));
+
+    const auto timestamp = api::new_timestamp();
+
+    // Stores an already serialized value into the named column.
+    auto set_cell_raw = [this, &entry_row, &corrupt_data_schema, timestamp] (const char* cell_name, managed_bytes cell_value) {
+        auto cdef = corrupt_data_schema->get_column_definition(cell_name);
+        SCYLLA_ASSERT(cdef);
+
+        entry_row.cells().apply(*cdef, atomic_cell::make_live(*cdef->type, timestamp, cell_value, _entry_ttl));
+    };
+
+    // Serializes a non-null value and stores it into the named column.
+    auto set_cell = [this, &entry_row, &corrupt_data_schema, timestamp] (const char* cell_name, data_value cell_value) {
+        auto cdef = corrupt_data_schema->get_column_definition(cell_name);
+        SCYLLA_ASSERT(cdef);
+
+        entry_row.cells().apply(*cdef, atomic_cell::make_live(*cdef->type, timestamp, cell_value.serialize_nonnull(), _entry_ttl));
+    };
+
+    entry_row.apply(row_marker(timestamp, _entry_ttl, gc_clock::now() + _entry_ttl));
+    set_cell("partition_key", data_value(to_bytes(pk.representation())));
+    set_cell("clustering_key", data_value(to_bytes(ck.representation())));
+    set_cell("mutation_fragment_kind", fmt::to_string(kind));
+    // FIXME: Exposing knowledge here that bytes are serialized by just storing the raw value.
+    // Need to replace with a fragmented-buffer serialize API call, which we don't have yet.
+    set_cell_raw("frozen_mutation_fragment", std::move(fmf).representation().to_managed_bytes());
+    set_cell("origin", origin);
+    // An absent sstable name must not reach set_cell(): it would become a null data_value,
+    // which serialize_nonnull() is not meant to handle. Leaving the cell unset stores null.
+    if (sstable_name) {
+        set_cell("sstable_name", *sstable_name);
+    }
+
+    // Move the gate holder into the continuation so that the gate stays held until the
+    // write has completed, independently of when the by-value parameter is destroyed.
+    return _sys_ks->apply_mutation(std::move(entry_mutation)).then([id, permit = std::move(permit)] {
+        return id;
+    });
+}
+
+// Storage hook for clustering rows: freezes the row as a mutation fragment and records it
+// in the system table. Resolves to a null id, without recording anything, while no system
+// keyspace is plugged in.
+future<corrupt_data_handler::entry_id> system_table_corrupt_data_handler::do_record_corrupt_clustering_row(const schema& s, const partition_key& pk,
+        clustering_row cr, sstring origin, std::optional<sstring> sstable_name) {
+    if (!_sys_ks) {
+        co_return entry_id::create_null_id();
+    }
+
+    // Enter the gate so that stop() waits for this operation.
+    auto gate_holder = _gate.hold();
+
+    // Copy the key first: the row itself is moved into the fragment below.
+    const auto row_key = cr.key();
+    auto frozen_row = freeze(s, mutation_fragment_v2(s, make_fragment_permit(s), std::move(cr)));
+
+    const auto recorded_id = co_await do_record_corrupt_mutation_fragment(std::move(gate_holder), s, pk, row_key,
+            mutation_fragment_v2::kind::clustering_row, std::move(frozen_row), std::move(origin), std::move(sstable_name));
+    co_return recorded_id;
+}
+
+// Attaches the system keyspace that entries are written to and creates the unlimited,
+// metrics-less semaphore backing make_fragment_permit(). Nothing is recorded before this call.
+// NOTE(review): a repeated call replaces the existing semaphore without stopping it first -
+// confirm callers plug at most once.
+void system_table_corrupt_data_handler::plug_system_keyspace(db::system_keyspace& sys_ks) noexcept {
+    _fragment_semaphore = std::make_unique<reader_concurrency_semaphore>(
+            reader_concurrency_semaphore::no_limits{},
+            "system_table_corrupt_data_handler",
+            reader_concurrency_semaphore::register_metrics::no);
+    _sys_ks = sys_ks.shared_from_this();
+}
+
+// Drops the handler's reference to the system keyspace. Subsequent record requests
+// resolve to a null id. The fragment semaphore is left in place; stop() takes care of it.
+void system_table_corrupt_data_handler::unplug_system_keyspace() noexcept {
+    _sys_ks = nullptr;
+}
+
+// Closes the gate, which waits for in-flight record operations, and then stops the
+// fragment semaphore if one was ever created.
+future<> system_table_corrupt_data_handler::stop() noexcept {
+    co_await _gate.close();
+
+    if (!_fragment_semaphore) {
+        // The system keyspace was never plugged in, nothing else to stop.
+        co_return;
+    }
+    co_await _fragment_semaphore->stop();
+}
+
+// Discards the row: always resolves immediately to a null id, i.e. "not recorded".
+future<corrupt_data_handler::entry_id> nop_corrupt_data_handler::do_record_corrupt_clustering_row(const schema& s, const partition_key& pk,
+        clustering_row cr, sstring origin, std::optional<sstring> sstable_name) {
+    auto not_recorded = entry_id::create_null_id();
+    return make_ready_future<entry_id>(std::move(not_recorded));
+}
+
+} // namespace db
--- a/db/corrupt_data_handler.hh
+++ b/db/corrupt_data_handler.hh
@@ -0,0 +1,110 @@
+/*
+ * Copyright (C) 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#pragma once
+
+#include "db/system_keyspace.hh"
+#include "utils/UUID.hh"
+
+class reader_concurrency_semaphore;
+class reader_permit;
+
+namespace db {
+
+// Base class for sinks that receive reports about corrupt data.
+//
+// The public record_*() methods maintain the counters in `stats` and delegate the
+// actual storing to the do_record_*() hooks implemented by derived classes.
+class corrupt_data_handler {
+public:
+    // An ID identifying the corrupt data entry.
+    // To be interpreted in the context of the storage where it is recorded, see storage_name().
+    using entry_id = utils::tagged_uuid<struct corrupt_data_entry_tag>;
+
+    struct stats {
+        // Counters for the number of corrupt data entries reported.
+        uint64_t corrupt_data_reported = 0;
+        // Counters for the number of corrupt data entries recorded.
+        // Can be less than reported depending on the configuration or if entries failed to be recorded.
+        uint64_t corrupt_data_recorded = 0;
+
+        // Same as the two counters above, counting clustering rows only.
+        uint64_t corrupt_clustering_rows_reported = 0;
+        uint64_t corrupt_clustering_rows_recorded = 0;
+    };
+
+private:
+    stats _stats;
+
+    // Holds the "corrupt_data" metric group; stays empty when metrics registration is disabled.
+    seastar::metrics::metric_groups _metrics;
+
+protected:
+    // Storage-specific hook invoked by record_corrupt_clustering_row().
+    // Resolving to a null id means the row was not recorded.
+    virtual future<entry_id> do_record_corrupt_clustering_row(const schema& s, const partition_key& pk, clustering_row cr, sstring origin, std::optional<sstring> sstable_name) = 0;
+
+public:
+    // Selects whether the constructor registers the handler's metrics.
+    using register_metrics = bool_class<struct corrupt_data_handler_register_metrics_tag>;
+    explicit corrupt_data_handler(register_metrics);
+    virtual ~corrupt_data_handler() = default;
+
+    // Read-only access to the counters maintained by the record_*() methods.
+    const stats& get_stats() const noexcept {
+        return _stats;
+    }
+
+    // The name of the storage where corrupt data is recorded.
+    // The storage-name and the entry-id together should allow the user to unambiguously locate the entry.
+    virtual sstring storage_name() const noexcept = 0;
+
+    // Record a corrupt clustering row.
+    // If the returned id is null, the row was not recorded.
+    future<entry_id> record_corrupt_clustering_row(const schema& s, const partition_key& pk, clustering_row cr, sstring origin, std::optional<sstring> sstable_name);
+};
+
+// Stores corrupt data entries in the system.corrupt_data table.
+//
+// Recording only happens between plug_system_keyspace() and unplug_system_keyspace();
+// outside of that window record requests resolve to a null id.
+class system_table_corrupt_data_handler final : public corrupt_data_handler {
+public:
+    struct config {
+        // TTL applied to the row marker and to every cell of a recorded entry.
+        gc_clock::duration entry_ttl;
+    };
+
+private:
+    gc_clock::duration _entry_ttl;
+
+    // Entered by every record operation; closed by stop().
+    gate _gate;
+    // Null until plug_system_keyspace() and again after unplug_system_keyspace().
+    seastar::shared_ptr<db::system_keyspace> _sys_ks;
+    // Source of the permits returned by make_fragment_permit().
+    // Created in plug_system_keyspace(), stopped in stop().
+    std::unique_ptr<reader_concurrency_semaphore> _fragment_semaphore;
+
+private:
+    // Creates a tracking-only permit on _fragment_semaphore for the given schema.
+    reader_permit make_fragment_permit(const schema& s);
+
+    // Writes a single entry for the given frozen fragment to the system table and
+    // resolves to the id of the new entry.
+    future<entry_id> do_record_corrupt_mutation_fragment(gate::holder permit, const schema& user_table_schema, const partition_key& pk, const clustering_key& ck,
+            mutation_fragment_v2::kind kind, frozen_mutation_fragment_v2 mf, sstring origin, std::optional<sstring> sstable_name);
+
+    virtual future<entry_id> do_record_corrupt_clustering_row(const schema& s, const partition_key& pk, clustering_row cr, sstring origin, std::optional<sstring> sstable_name) override;
+
+public:
+    explicit system_table_corrupt_data_handler(config, register_metrics);
+    ~system_table_corrupt_data_handler();
+
+    // "<system keyspace name>.<corrupt data table name>".
+    virtual sstring storage_name() const noexcept override {
+        return format("{}.{}", db::system_keyspace::NAME, db::system_keyspace::CORRUPT_DATA);
+    }
+
+    // Attach / detach the system keyspace that entries are written to.
+    void plug_system_keyspace(db::system_keyspace& sys_ks) noexcept;
+    void unplug_system_keyspace() noexcept;
+
+    // Closes the gate and stops the fragment semaphore, if one was created.
+    future<> stop() noexcept;
+};
+
+// A no-op corrupt data handler that does not record any data.
+// Reports still go through the base class, so the "reported" counters keep counting.
+class nop_corrupt_data_handler final : public corrupt_data_handler {
+    // Always resolves to a null id.
+    virtual future<entry_id> do_record_corrupt_clustering_row(const schema& s, const partition_key& pk, clustering_row cr, sstring origin, std::optional<sstring> sstable_name) override;
+
+public:
+    explicit nop_corrupt_data_handler(register_metrics rm)
+        : corrupt_data_handler(rm) {}
+    // Nothing is stored anywhere, hence the placeholder name.
+    virtual sstring storage_name() const noexcept override {
+        return "/dev/null";
+    }
+};
+
+} // namespace db
--- a/db/hints/internal/hint_endpoint_manager.cc
+++ b/db/hints/internal/hint_endpoint_manager.cc
@@ -65,18 +65,18 @@ future<> hint_endpoint_manager::do_store_hint(schema_ptr s, lw_shared_ptr<const
        const replay_position rp = rh.release();
        if (_last_written_rp < rp) {
            _last_written_rp = rp;
-            manager_logger.debug("[{}] Updated last written replay position to {}", end_point_key(), rp);
+            manager_logger.trace("hint_endpoint_manager[{}]:do_store_hint: Updated last written replay position to {}", end_point_key(), rp);
        }

        ++shard_stats().written;

-        manager_logger.trace("Hint to {} was stored", end_point_key());
+        manager_logger.trace("hint_endpoint_manager[{}]:do_store_hint: Hint has been stored", end_point_key());
        tracing::trace(tr_state, "Hint to {} was stored", end_point_key());
    } catch (...) {
        ++shard_stats().errors;
        const auto eptr = std::current_exception();

-        manager_logger.debug("store_hint(): got the exception when storing a hint to {}: {}", end_point_key(), eptr);
+        manager_logger.debug("hint_endpoint_manager[{}]:do_store_hint: Exception when storing a hint: {}", end_point_key(), eptr);
        tracing::trace(tr_state, "Failed to store a hint to {}: {}", end_point_key(), eptr);
    }

@@ -92,7 +92,7 @@ bool hint_endpoint_manager::store_hint(schema_ptr s, lw_shared_ptr<const frozen_
            return do_store_hint(std::move(s), std::move(fm), tr_state);
        });
    } catch (...) {
-        manager_logger.trace("Failed to store a hint to {}: {}", end_point_key(), std::current_exception());
+        manager_logger.trace("hint_endpoint_manager[{}]:store_hint: Failed to store a hint: {}", end_point_key(), std::current_exception());
        tracing::trace(tr_state, "Failed to store a hint to {}: {}", end_point_key(), std::current_exception());

        ++shard_stats().dropped;
@@ -109,16 +109,23 @@ future<> hint_endpoint_manager::populate_segments_to_replay() {
 }

+// Calls clear_stopped() and allow_hints(), then starts the sender. Entry and exit are logged at debug level.
 void hint_endpoint_manager::start() {
+    manager_logger.debug("hint_endpoint_manager[{}]:start: Starting", end_point_key());
+
    clear_stopped();
    allow_hints();
    _sender.start();
+
+    manager_logger.debug("hint_endpoint_manager[{}]:start: Finished", end_point_key());
 }

 future<> hint_endpoint_manager::stop(drain should_drain) noexcept {
-    if(stopped()) {
+    if (stopped()) {
+        manager_logger.warn("hint_endpoint_manager[{}]:stop: Stop had already been called", end_point_key());
        return make_exception_future<>(std::logic_error(format("ep_manager[{}]: stop() is called twice", _key).c_str()));
    }

+    manager_logger.debug("hint_endpoint_manager[{}]:stop: Starting", end_point_key());
+
    return seastar::async([this, should_drain] {
        std::exception_ptr eptr;

@@ -139,13 +146,18 @@ future<> hint_endpoint_manager::stop(drain should_drain) noexcept {
        }).handle_exception([&eptr] (auto e) { eptr = std::move(e); }).get();

        if (eptr) {
-            manager_logger.error("ep_manager[{}]: exception: {}", _key, eptr);
+            manager_logger.error("hint_endpoint_manager[{}]:stop: Exception occurred: {}", _key, eptr);
        }

        set_stopped();
+        manager_logger.debug("hint_endpoint_manager[{}]:stop: Finished", end_point_key());
    });
 }

+// Forwards the cancellation request to this endpoint's hint sender.
+void hint_endpoint_manager::cancel_draining() noexcept {
+    _sender.cancel_draining();
+}
+
 hint_endpoint_manager::hint_endpoint_manager(const endpoint_id& key, fs::path hint_directory, manager& shard_manager)
    : _key(key)
    , _shard_manager(shard_manager)
@@ -188,7 +200,7 @@ future<hints_store_ptr> hint_endpoint_manager::get_or_load() {
 }

 future<db::commitlog> hint_endpoint_manager::add_store() noexcept {
-    manager_logger.trace("Going to add a store to {}", _hints_dir.c_str());
+    manager_logger.debug("hint_endpoint_manager[{}]:add_store: Going to add a store: {}", end_point_key(), _hints_dir.native());

    return futurize_invoke([this] {
        return io_check([name = _hints_dir.c_str()] { return recursive_touch_directory(name); }).then([this] () {
@@ -234,7 +246,7 @@ future<db::commitlog> hint_endpoint_manager::add_store() noexcept {
            // which is larger than the segment ID of the RP of the last written hint.
            cfg.base_segment_id = _last_written_rp.base_id();

-            return commitlog::create_commitlog(std::move(cfg)).then([this] (commitlog l) -> future<commitlog> {
+            return commitlog::create_commitlog(std::move(cfg)).then([this] (this auto, commitlog l) -> future<commitlog> {
                // add_store() is triggered every time hint files are forcefully flushed to I/O (every hints_flush_period).
                // When this happens we want to refill _sender's segments only if it has finished with the segments he had before.
                if (_sender.have_segments()) {
@@ -283,6 +295,8 @@ future<db::commitlog> hint_endpoint_manager::add_store() noexcept {
                    _sender.add_segment(std::move(seg));
                }

+                manager_logger.debug("hint_endpoint_manager[{}]:add_store: Finished", end_point_key());
+
                co_return l;
            });
        });
--- a/db/hints/internal/hint_endpoint_manager.hh
+++ b/db/hints/internal/hint_endpoint_manager.hh
@@ -102,6 +102,8 @@ public:
    /// \return Ready future when all operations are complete
    future<> stop(drain should_drain = drain::no) noexcept;

+    /// \brief Cancel draining of hints towards this endpoint; the request is forwarded to the sender.
+    void cancel_draining() noexcept;
+
    /// \brief Start the timer.
    void start();

@@ -144,6 +146,10 @@ public:
        return _state.contains(state::stopped);
    }

+    /// \brief Tells whether draining has been canceled, as reported by the sender.
+    bool canceled_draining() const noexcept {
+        return _sender.canceled_draining();
+    }
+
    /// \brief Returns replay position of the most recently written hint.
    ///
    /// If there weren't any hints written during this endpoint manager's lifetime, a zero replay_position is returned.
--- a/db/hints/internal/hint_sender.cc
+++ b/db/hints/internal/hint_sender.cc
@@ -10,6 +10,7 @@
 #include "db/hints/internal/hint_sender.hh"

 // Seastar features.
+#include <chrono>
 #include <exception>
 #include <seastar/core/abort_source.hh>
 #include <seastar/core/coroutine.hh>
@@ -58,8 +59,8 @@ future<> hint_sender::flush_maybe() noexcept {
    if (current_time >= _next_flush_tp) {
        return _ep_manager.flush_current_hints().then([this, current_time] {
            _next_flush_tp = current_time + manager::hints_flush_period;
-        }).handle_exception([] (auto eptr) {
-            manager_logger.trace("flush_maybe() failed: {}", eptr);
+        }).handle_exception([this] (auto eptr) {
+            manager_logger.debug("hint_sender[{}]:flush_maybe: Failed with {}", _ep_key, eptr);
            return make_ready_future<>();
        });
    }
@@ -117,7 +118,7 @@ const column_mapping& hint_sender::get_column_mapping(lw_shared_ptr<send_one_fil
            throw no_column_mapping(fm.schema_version());
        }

-        manager_logger.debug("new schema version {}", fm.schema_version());
+        manager_logger.trace("hint_sender[{}]:get_column_mapping: new schema version {}", _ep_key, fm.schema_version());
        cm_it = ctx_ptr->schema_ver_to_column_mapping.emplace(fm.schema_version(), *hr.get_column_mapping()).first;
    }

@@ -177,21 +178,28 @@ future<> hint_sender::stop(drain should_drain) noexcept {
            //
            // The next call for send_hints_maybe() will send the last hints to the current end point and when it is
            // done there is going to be no more pending hints and the corresponding hints directory may be removed.
-            manager_logger.trace("Draining for {}: start", end_point_key());
+            manager_logger.trace("hint_sender[{}]:stop: Draining starts", end_point_key());
            set_draining();
            send_hints_maybe();
-            _ep_manager.flush_current_hints().handle_exception([] (auto e) {
-                manager_logger.error("Failed to flush pending hints: {}. Ignoring...", e);
+            _ep_manager.flush_current_hints().handle_exception([this] (auto e) {
+                manager_logger.error("hint_sender[{}]:stop: Failed to flush pending hints: {}. Ignoring", _ep_key, e);
            }).get();
            send_hints_maybe();
-            manager_logger.trace("Draining for {}: end", end_point_key());
+            manager_logger.trace("hint_sender[{}]:stop: Draining finished", end_point_key());
        }
-        // TODO: Change this log to match the class name, but first make sure no test
-        //       relies on the old one.
-        manager_logger.trace("ep_manager({})::sender: exiting", end_point_key());
+
+        manager_logger.debug("hint_sender[{}]:stop: Finished", end_point_key());
    });
 }

+// Marks draining as canceled for this sender: the `draining` state flag is cleared
+// if it was set, and the `canceled_draining` flag is set.
+void hint_sender::cancel_draining() {
+    manager_logger.info("hint_sender[{}]:cancel_draining: Marking as canceled", _ep_key);
+
+    const bool was_draining = _state.contains(state::draining);
+    if (was_draining) {
+        _state.remove(state::draining);
+    }
+
+    _state.set(state::canceled_draining);
+}
+
 void hint_sender::add_segment(sstring seg_name) {
    _segments_to_replay.emplace_back(std::move(seg_name));
 }
@@ -216,9 +224,8 @@ void hint_sender::start() {

    attr.sched_group = _hints_cpu_sched_group;
    _stopped = seastar::async(std::move(attr), [this] {
-        // TODO: Change this log to match the class name, but first make sure no test
-        //       relies on the old one.
-        manager_logger.trace("ep_manager({})::sender: started", end_point_key());
+        manager_logger.debug("hint_sender[{}]:start: Starting", end_point_key());
+
        while (!stopping()) {
            try {
                flush_maybe().get();
@@ -231,34 +238,36 @@ void hint_sender::start() {
                break;
            } catch (...) {
                // log and keep on spinning
-                // TODO: Change this log to match the class name, but first make sure no test
-                //       relies on the old one.
-                manager_logger.trace("sender: got the exception: {}", std::current_exception());
+                manager_logger.debug("hint_sender[{}]:start: Exception in the loop: {}", _ep_key, std::current_exception());
            }
        }
+
+        manager_logger.debug("hint_sender[{}]:start: Exited the loop", _ep_key);
    });
 }

 future<> hint_sender::send_one_mutation(frozen_mutation_and_schema m) {
    auto ermp = _db.find_column_family(m.s).get_effective_replication_map();
    auto token = dht::get_token(*m.s, m.fm.key());
-    host_id_vector_replica_set natural_endpoints = ermp->get_natural_replicas(std::move(token));
+    host_id_vector_replica_set natural_endpoints = ermp->get_natural_replicas(token);
+    host_id_vector_topology_change pending_endpoints  = ermp->get_pending_replicas(token);

-    return futurize_invoke([this, m = std::move(m), ermp = std::move(ermp), &natural_endpoints] () mutable -> future<> {
+    return futurize_invoke([this, m = std::move(m), ermp = std::move(ermp), &natural_endpoints, &pending_endpoints] () mutable -> future<> {
        // The fact that we send with CL::ALL in both cases below ensures that new hints are not going
        // to be generated as a result of hints sending.
        const auto& tm = ermp->get_token_metadata();
        const auto dst = end_point_key();

        if (std::ranges::contains(natural_endpoints, dst) && !tm.is_leaving(dst)) {
-            manager_logger.trace("Sending directly to {}", dst);
-            return _proxy.send_hint_to_endpoint(std::move(m), std::move(ermp), dst);
+            manager_logger.trace("hint_sender[{}]:send_one_mutation: Sending directly", dst);
+            // dst is not duplicated in pending_endpoints because it's in natural_endpoints
+            return _proxy.send_hint_to_endpoint(std::move(m), std::move(ermp), dst, std::move(pending_endpoints));
        } else {
            if (manager_logger.is_enabled(log_level::trace)) {
                if (tm.is_leaving(end_point_key())) {
-                    manager_logger.trace("The original target endpoint {} is leaving. Mutating from scratch...", dst);
+                    manager_logger.trace("hint_sender[{}]:send_one_mutation: Original target is leaving. Mutating from scratch", dst);
                } else {
-                    manager_logger.trace("Endpoints set has changed and {} is no longer a replica. Mutating from scratch...", dst);
+                    manager_logger.trace("hint_sender[{}]:send_one_mutation: Endpoint set has changed and original target is no longer a replica. Mutating from scratch", dst);
                }
            }
            return _proxy.send_hint_to_all_replicas(std::move(m));
@@ -282,9 +291,9 @@ future<> hint_sender::send_one_hint(lw_shared_ptr<send_one_file_ctx> ctx_ptr, fr
                // Files are aggregated for at most manager::hints_timer_period therefore the oldest hint there is
                // (last_modification - manager::hints_timer_period) old.
                if (const auto now = gc_clock::now().time_since_epoch(); now - secs_since_file_mod > gc_grace_sec - manager::hints_flush_period) {
-                    manager_logger.debug("send_hints(): the hint is too old, skipping it, "
+                    manager_logger.trace("hint_sender[{}]:send_hints: Hint is too old, skipping it, "
                        "secs since file last modification {}, gc_grace_sec {}, hints_flush_period {}",
-                        now - secs_since_file_mod, gc_grace_sec, manager::hints_flush_period);
+                        _ep_key, now - secs_since_file_mod, gc_grace_sec, manager::hints_flush_period);
                    return make_ready_future<>();
                }

@@ -293,24 +302,24 @@ future<> hint_sender::send_one_hint(lw_shared_ptr<send_one_file_ctx> ctx_ptr, fr
                    ++this->shard_stats().sent_total;
                    this->shard_stats().sent_hints_bytes_total += mutation_size;
                }).handle_exception([this, ctx_ptr] (auto eptr) {
-                    manager_logger.trace("send_one_hint(): failed to send to {}: {}", end_point_key(), eptr);
+                    manager_logger.trace("hint_sender[{}]:send_one_hint: Failed to send: {}", end_point_key(), eptr);
                    ++this->shard_stats().send_errors;
                    return make_exception_future<>(std::move(eptr));
                });

            // ignore these errors and move on - probably this hint is too old and the KS/CF has been deleted...
            } catch (replica::no_such_column_family& e) {
-                manager_logger.debug("send_hints(): no_such_column_family: {}", e.what());
+                manager_logger.debug("hint_sender[{}]:send_one_hint: no_such_column_family: {}", _ep_key, e.what());
                ++this->shard_stats().discarded;
            } catch (replica::no_such_keyspace& e) {
-                manager_logger.debug("send_hints(): no_such_keyspace: {}", e.what());
+                manager_logger.debug("hint_sender[{}]:send_one_hint: no_such_keyspace: {}", _ep_key, e.what());
                ++this->shard_stats().discarded;
            } catch (no_column_mapping& e) {
-                manager_logger.debug("send_hints(): {} at {}: {}", fname, rp, e.what());
+                manager_logger.debug("hint_sender[{}]:send_one_hint: no_column_mapping: {} at {}: {}", _ep_key, fname, rp, e.what());
                ++this->shard_stats().discarded;
            } catch (...) {
                auto eptr = std::current_exception();
-                manager_logger.debug("send_hints(): unexpected error in file {} at {}: {}", fname, rp, eptr);
+                manager_logger.debug("hint_sender[{}]:send_one_hint: Unexpected error in file {} at {}: {}", _ep_key, fname, rp, eptr);
                ++this->shard_stats().send_errors;
                return make_exception_future<>(std::move(eptr));
            }
@@ -332,21 +341,24 @@ future<> hint_sender::send_one_hint(lw_shared_ptr<send_one_file_ctx> ctx_ptr, fr
            }
            f.ignore_ready_future();
        });
-    }).handle_exception([ctx_ptr, rp] (auto eptr) {
-        manager_logger.trace("send_one_file(): Hmmm. Something bad had happened: {}", eptr);
+    }).handle_exception([this, ctx_ptr, rp] (auto eptr) {
+        manager_logger.trace("hint_sender[{}]:send_one_hint: Exception occurred: {}", _ep_key, eptr);
        ctx_ptr->on_hint_send_failure(rp);
    });
 }

 void hint_sender::notify_replay_waiters() noexcept {
    if (!_foreign_segments_to_replay.empty()) {
-        manager_logger.trace("[{}] notify_replay_waiters(): not notifying because there are still {} foreign segments to replay", end_point_key(), _foreign_segments_to_replay.size());
+        manager_logger.trace("hint_sender[{}]:notify_replay_waiters: Not notifying because there are still {} foreign segments to replay",
+                end_point_key(), _foreign_segments_to_replay.size());
        return;
    }

-    manager_logger.trace("[{}] notify_replay_waiters(): replay position upper bound was updated to {}", end_point_key(), _sent_upper_bound_rp);
+    manager_logger.trace("hint_sender[{}]:notify_replay_waiters: Replay position upper bound was updated to {}", end_point_key(), _sent_upper_bound_rp);
    while (!_replay_waiters.empty() && _replay_waiters.begin()->first < _sent_upper_bound_rp) {
-        manager_logger.trace("[{}] notify_replay_waiters(): notifying one ({} < {})", end_point_key(), _replay_waiters.begin()->first, _sent_upper_bound_rp);
+        manager_logger.trace("hint_sender[{}]:notify_replay_waiters: Notifying one ({} < {})",
+                end_point_key(), _replay_waiters.begin()->first, _sent_upper_bound_rp);
+
        auto ptr = _replay_waiters.begin()->second;
        (**ptr).set_value();
        (*ptr) = std::nullopt; // Prevent it from being resolved by abort source subscription
@@ -356,7 +368,7 @@ void hint_sender::notify_replay_waiters() noexcept {

 void hint_sender::dismiss_replay_waiters() noexcept {
    for (auto& p : _replay_waiters) {
-        manager_logger.debug("[{}] dismiss_replay_waiters(): dismissing one", end_point_key());
+        manager_logger.debug("hint_sender[{}]:dismiss_replay_waiters: Dismissing one", end_point_key());
        auto ptr = p.second;
        (**ptr).set_exception(std::runtime_error(format("Hints manager for {} is stopping", end_point_key())));
        (*ptr) = std::nullopt; // Prevent it from being resolved by abort source subscription
@@ -365,14 +377,15 @@ void hint_sender::dismiss_replay_waiters() noexcept {
 }

 future<> hint_sender::wait_until_hints_are_replayed_up_to(abort_source& as, db::replay_position up_to_rp) {
-    manager_logger.debug("[{}] wait_until_hints_are_replayed_up_to(): entering with target {}", end_point_key(), up_to_rp);
+    manager_logger.debug("hint_sender[{}]:wait_until_hints_are_replayed_up_to: Entering with target {}", end_point_key(), up_to_rp);
    if (_foreign_segments_to_replay.empty() && up_to_rp < _sent_upper_bound_rp) {
-        manager_logger.debug("[{}] wait_until_hints_are_replayed_up_to(): hints were already replayed above the point ({} < {})", end_point_key(), up_to_rp, _sent_upper_bound_rp);
+        manager_logger.debug("hint_sender[{}]:wait_until_hints_are_replayed_up_to: Hints were already replayed above the point ({} < {})",
+                end_point_key(), up_to_rp, _sent_upper_bound_rp);
        return make_ready_future<>();
    }

    if (as.abort_requested()) {
-        manager_logger.debug("[{}] wait_until_hints_are_replayed_up_to(): already aborted - stopping", end_point_key());
+        manager_logger.debug("hint_sender[{}]:wait_until_hints_are_replayed_up_to: Already aborted - stopping", end_point_key());
        return make_exception_future<>(abort_requested_exception());
    }

@@ -383,7 +396,7 @@ future<> hint_sender::wait_until_hints_are_replayed_up_to(abort_source& as, db::
            // The promise already was resolved by `notify_replay_waiters` and removed from the map
            return;
        }
-        manager_logger.debug("[{}] wait_until_hints_are_replayed_up_to(): abort requested - stopping", end_point_key());
+        manager_logger.debug("hint_sender[{}]:wait_until_hints_are_replayed_up_to: Abort requested - stopping", end_point_key());
        _replay_waiters.erase(it);
        (**ptr).set_exception(abort_requested_exception());
    });
@@ -392,7 +405,7 @@ future<> hint_sender::wait_until_hints_are_replayed_up_to(abort_source& as, db::
    // therefore we cannot capture `this`
    auto ep = end_point_key();
    return (**ptr).get_future().finally([sub = std::move(sub), ep] {
-        manager_logger.debug("[{}] wait_until_hints_are_replayed_up_to(): returning after the future was satisfied", ep);
+        manager_logger.debug("hint_sender[{}]:wait_until_hints_are_replayed_up_to: Returning after the future was satisfied", ep);
    });
 }

@@ -449,6 +462,8 @@ bool hint_sender::send_one_file(const sstring& fname) {
    gc_clock::duration secs_since_file_mod = std::chrono::seconds(last_mod.tv_sec);
    lw_shared_ptr<send_one_file_ctx> ctx_ptr = make_lw_shared<send_one_file_ctx>(_last_schema_ver_to_column_mapping);

+    struct canceled_draining_exception {};
+
    try {
        commitlog::read_log_file(fname, manager::FILENAME_PREFIX, [this, secs_since_file_mod, &fname, ctx_ptr] (commitlog::buffer_and_replay_position buf_rp) -> future<> {
            auto& buf = buf_rp.buffer;
@@ -461,6 +476,12 @@ bool hint_sender::send_one_file(const sstring& fname) {
                    co_return;
                }

+                if (canceled_draining()) {
+                    manager_logger.debug("hint_sender[{}]:send_one_file: Exiting reading from commitlog because of canceled draining", _ep_key);
+                    // We need to throw an exception here to cancel reading the segment.
+                    throw canceled_draining_exception{};
+                }
+
                // Break early if stop() was called or the destination node went down.
                if (!can_send()) {
                    ctx_ptr->segment_replay_failed = true;
@@ -488,20 +509,30 @@ bool hint_sender::send_one_file(const sstring& fname) {
            };
        }, _last_not_complete_rp.pos, &_db.extensions()).get();
    } catch (db::commitlog::segment_error& ex) {
-        manager_logger.error("{}: {}. Dropping...", fname, ex.what());
+        manager_logger.error("hint_sender[{}]:send_one_file: Segment error in {}: {}. Last not complete position={}",
+                _ep_key, fname, ex.what(), _last_not_complete_rp);
        ctx_ptr->segment_replay_failed = false;
        ++this->shard_stats().corrupted_files;
+    } catch  (const canceled_draining_exception&) {
+        manager_logger.debug("hint_sender[{}]:send_one_file: Loop in send_one_file finishes due to canceled draining", _ep_key);
    } catch (...) {
-        manager_logger.trace("sending of {} failed: {}", fname, std::current_exception());
+        manager_logger.debug("hint_sender[{}]:send_one_file: Sending of {} failed: {}. Last not complete position={}",
+                _ep_key, fname, std::current_exception(), _last_not_complete_rp);
        ctx_ptr->segment_replay_failed = true;
    }

    // wait till all background hints sending is complete
    ctx_ptr->file_send_gate.close().get();

+    // If draining was canceled, we can't say anything about the segment's state,
+    // so return immediately. For the same reason, we return false here.
+    if (canceled_draining()) {
+        return false;
+    }
+
    // If we are draining ignore failures and drop the segment even if we failed to send it.
    if (draining() && ctx_ptr->segment_replay_failed) {
-        manager_logger.trace("send_one_file(): we are draining so we are going to delete the segment anyway");
+        manager_logger.debug("hint_sender[{}]:send_one_file: We are draining, so we are going to delete the segment anyway", _ep_key);
        ctx_ptr->segment_replay_failed = false;
    }

@@ -511,7 +542,7 @@ bool hint_sender::send_one_file(const sstring& fname) {
        // If there was an error thrown by read_log_file function itself, we will retry sending from
        // the last hint that was successfully sent (last_succeeded_rp).
        _last_not_complete_rp = ctx_ptr->first_failed_rp.value_or(ctx_ptr->last_succeeded_rp.value_or(_last_not_complete_rp));
-        manager_logger.trace("send_one_file(): error while sending hints from {}, last RP is {}", fname, _last_not_complete_rp);
+        manager_logger.debug("hint_sender[{}]:send_one_file: Error while sending hints from {}, last RP is {}", _ep_key, fname, _last_not_complete_rp);
        return false;
    }

@@ -524,7 +555,7 @@ bool hint_sender::send_one_file(const sstring& fname) {
    // clear the replay position - we are going to send the next segment...
    _last_not_complete_rp = replay_position();
    _last_schema_ver_to_column_mapping.clear();
-    manager_logger.trace("send_one_file(): segment {} was sent in full and deleted", fname);
+    manager_logger.debug("hint_sender[{}]:send_one_file: Segment {} has been sent in full and deleted", _ep_key, fname);
    return true;
 }

@@ -550,12 +581,17 @@ void hint_sender::pop_current_segment() {
 // Runs in the seastar::async context
 void hint_sender::send_hints_maybe() noexcept {
    using namespace std::literals::chrono_literals;
-    manager_logger.trace("send_hints(): going to send hints to {}, we have {} segment to replay", end_point_key(), _segments_to_replay.size() + _foreign_segments_to_replay.size());
+    manager_logger.trace("hint_sender[{}]:send_hints_maybe: Going to send hints. We have {} segment to replay",
+            end_point_key(), _segments_to_replay.size() + _foreign_segments_to_replay.size());

    int replayed_segments_count = 0;

    try {
        while (true) {
+            if (canceled_draining()) {
+                manager_logger.debug("hint_sender[{}]:send_hints_maybe: Exiting loop in send_hints_maybe because of canceled draining", _ep_key);
+                break;
+            }
            const sstring* seg_name = name_of_current_segment();
            if (!seg_name || !replay_allowed() || !can_send()) {
                break;
@@ -572,7 +608,7 @@ void hint_sender::send_hints_maybe() noexcept {
    // Ignore exceptions, we will retry sending this file from where we left off the next time.
    // Exceptions are not expected here during the regular operation, so just log them.
    } catch (...) {
-        manager_logger.trace("send_hints(): got the exception: {}", std::current_exception());
+        manager_logger.debug("hint_sender[{}]:send_hints_maybe: Exception occurred while sending: {}", _ep_key, std::current_exception());
    }

    if (have_segments()) {
@@ -583,7 +619,7 @@ void hint_sender::send_hints_maybe() noexcept {
        _next_send_retry_tp = _next_flush_tp;
    }

-    manager_logger.trace("send_hints(): we handled {} segments", replayed_segments_count);
+    manager_logger.debug("hint_sender[{}]:send_hints_maybe: We handled {} segments", _ep_key, replayed_segments_count);
 }

 hint_stats& hint_sender::shard_stats() {
--- a/db/hints/internal/hint_sender.hh
+++ b/db/hints/internal/hint_sender.hh
@@ -66,12 +66,14 @@ class hint_sender {
        stopping,               // stop() was called
        ep_state_left_the_ring, // destination Node is not a part of the ring anymore - usually means that it has been decommissioned
        draining,               // try to send everything out and ignore errors
+        canceled_draining,      // draining was started, but it got canceled
    };

    using state_set = enum_set<super_enum<state,
        state::stopping,
        state::ep_state_left_the_ring,
-        state::draining>>;
+        state::draining,
+        state::canceled_draining>>;

    struct send_one_file_ctx {
        send_one_file_ctx(std::unordered_map<table_schema_version, column_mapping>& last_schema_ver_to_column_mapping)
@@ -140,6 +142,12 @@ public:
    /// \param should_drain if is drain::yes - drain all pending hints
    future<> stop(drain should_drain) noexcept;

+    void cancel_draining();
+
+    bool canceled_draining() const noexcept {
+        return _state.contains(state::canceled_draining);
+    }
+
    /// \brief Add a new segment ready for sending.
    void add_segment(sstring seg_name);

--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -220,11 +220,24 @@ future<> manager::stop() {

    set_stopping();

-    return _migrating_done.finally([this] {
+    const auto& node = *_proxy.get_token_metadata_ptr()->get_topology().this_node();
+    const bool leaving = node.is_leaving() || node.left();
+
+    return _migrating_done.finally([this, leaving] {
+        // We want to stop the manager as soon as possible if it's not leaving the cluster.
+        // Because of that, we need to cancel all ongoing drains (since that can take quite a bit of time),
+        // but we also need to ensure that no new drains will be started in the meantime.
+        if (!leaving) {
+            for (auto& [_, ep_man] : _ep_managers) {
+                ep_man.cancel_draining();
+            }
+        }
        return _draining_eps_gate.close();
+        // At this point, all endpoint managers that were being previously drained have been deleted from the map.
+        // In other words, the next lambda is safe to run, i.e. we won't call `hint_endpoint_manager::stop()` twice.
    }).finally([this] {
        return parallel_for_each(_ep_managers | std::views::values, [] (hint_endpoint_manager& ep_man) {
-            return ep_man.stop();
+            return ep_man.stop(drain::no);
        }).finally([this] {
            _ep_managers.clear();
            _hint_directory_manager.clear();
@@ -537,20 +550,20 @@ bool manager::can_hint_for(endpoint_id ep) const noexcept {
    // hints where N is the total number nodes in the cluster.
    const auto hipf = hints_in_progress_for(ep);
    if (_stats.size_of_hints_in_progress > max_size_of_hints_in_progress() && hipf > 0) {
-        manager_logger.trace("size_of_hints_in_progress {} hints_in_progress_for({}) {}",
+        manager_logger.trace("can_hint_for: size_of_hints_in_progress {} hints_in_progress_for({}) {}",
                _stats.size_of_hints_in_progress, ep, hipf);
        return false;
    }

    // Check that the destination DC is "hintable".
    if (!check_dc_for(ep)) {
-        manager_logger.trace("{}'s DC is not hintable", ep);
+        manager_logger.trace("can_hint_for: {}'s DC is not hintable", ep);
        return false;
    }

    const bool node_is_alive = endpoint_downtime_not_bigger_than(local_gossiper(), ep, _max_hint_window_us);
    if (!node_is_alive) {
-        manager_logger.trace("{} has been down for too long, not hinting", ep);
+        manager_logger.trace("can_hint_for: {} has been down for too long, not hinting", ep);
        return false;
    }

@@ -667,7 +680,7 @@ future<> manager::drain_for(endpoint_id host_id, gms::inet_address ip) noexcept
        co_return;
    }

-    manager_logger.trace("on_leave_cluster: {} is removed/decommissioned", host_id);
+    manager_logger.trace("Draining starts for {}", host_id);

    const auto holder = seastar::gate::holder{_draining_eps_gate};
    // As long as we hold on to this lock, no migration of hinted handoff to host IDs
@@ -677,9 +690,24 @@ future<> manager::drain_for(endpoint_id host_id, gms::inet_address ip) noexcept

    // After an endpoint has been drained, we remove its directory with all of its contents.
    auto drain_ep_manager = [] (hint_endpoint_manager& ep_man) -> future<> {
-        return ep_man.stop(drain::yes).finally([&] {
-            return ep_man.with_file_update_mutex([&ep_man] {
-                return remove_file(ep_man.hints_dir().native());
+        // Prevent a drain if the endpoint manager was marked to cancel it.
+        if (ep_man.canceled_draining()) {
+            return make_ready_future();
+        }
+        return ep_man.stop(drain::yes).finally([&ep_man] {
+            // If draining was canceled, we can't remove the hint directory yet
+            // because there might still be some hints that we should send.
+            // We'll do that when the node starts again.
+            // Note that canceling draining can ONLY occur when the node is simply stopping.
+            // That cannot happen when decommissioning the node.
+            if (ep_man.canceled_draining()) {
+                return make_ready_future();
+            }
+
+            return ep_man.with_file_update_mutex([&ep_man] -> future<> {
+                return remove_file(ep_man.hints_dir().native()).then([&ep_man] {
+                    manager_logger.debug("Removed hint directory for {}", ep_man.end_point_key());
+                });
            });
        });
    };
@@ -986,4 +1014,18 @@ future<> manager::perform_migration() {
    manager_logger.info("Migration of hinted handoff to host ID has finished successfully");
 }

+// Technical note: This function obviously doesn't need to be a coroutine. However, it's better to impose
+//                 this constraint early on with possible future refactors in mind. It should be easier
+//                 to modify the function this way.
+future<> manager::drain_left_nodes() {
+    for (const auto& [host_id, ep_man] : _ep_managers) {
+        if (!_proxy.get_token_metadata_ptr()->is_normal_token_owner(host_id)) {
+            // It's safe to discard this future. It's awaited in `manager::stop()`.
+            (void) drain_for(host_id, {});
+        }
+    }
+
+    co_return;
+}
+
 } // namespace db::hints
--- a/db/hints/manager.hh
+++ b/db/hints/manager.hh
@@ -382,6 +382,12 @@ private:
    /// ALL requested sync points will be canceled, i.e. an exception will be issued
    /// in the corresponding futures.
    future<> perform_migration();
+
+public:
+    /// Performs draining for all nodes that have already left the cluster.
+    /// This should only be called when the hint endpoint managers have been initialized
+    /// and the hint manager has started.
+    future<> drain_left_nodes();
 };

 } // namespace db::hints
--- a/db/hints/resource_manager.cc
+++ b/db/hints/resource_manager.cc
@@ -239,6 +239,15 @@ future<> resource_manager::stop() noexcept {
    });
 }

+future<> resource_manager::drain_hints_for_left_nodes() {
+    for (manager& m : _shard_managers) {
+        // It's safe to discard the future here. It's awaited in `manager::stop()`.
+        (void) m.drain_left_nodes();
+    }
+
+    co_return;
+}
+
 future<> resource_manager::register_manager(manager& m) {
    return with_semaphore(_operation_lock, 1, [this, &m] () {
        return with_semaphore(_space_watchdog.update_lock(), 1, [this, &m] {
--- a/db/hints/resource_manager.hh
+++ b/db/hints/resource_manager.hh
@@ -188,6 +188,8 @@ public:
    /// \brief Allows replaying hints for managers which are registered now or will be in the future.
    void allow_replaying() noexcept;

+    future<> drain_hints_for_left_nodes();
+
    /// \brief Registers the hints::manager in resource_manager, and starts it, if resource_manager is already running.
    ///
    /// The hints::managers can be added either before or after resource_manager starts.
--- a/db/large_data_handler.cc
+++ b/db/large_data_handler.cc
@@ -146,7 +146,7 @@ cql_table_large_data_handler::cql_table_large_data_handler(gms::feature_service&

 template <typename... Args>
 future<> cql_table_large_data_handler::try_record(std::string_view large_table, const sstables::sstable& sst,  const sstables::key& partition_key, int64_t size,
-        std::string_view desc, std::string_view extra_path, const std::vector<sstring> &extra_fields, Args&&... args) const {
+        std::string_view size_desc, std::string_view desc, std::string_view extra_path, const std::vector<sstring> &extra_fields, Args&&... args) const {
    if (!_sys_ks) {
        return make_ready_future<>();
    }
@@ -165,7 +165,7 @@ future<> cql_table_large_data_handler::try_record(std::string_view large_table,
    const auto sstable_name = large_data_handler::sst_filename(sst);
    std::string pk_str = key_to_str(partition_key.to_partition_key(s), s);
    auto timestamp = db_clock::now();
-    large_data_logger.warn("Writing large {} {}/{}: {} ({} bytes) to {}", desc, ks_name, cf_name, extra_path, size, sstable_name);
+    large_data_logger.warn("Writing large {} {}/{}: {} ({}) to {}", desc, ks_name, cf_name, extra_path, size_desc, sstable_name);
    return _sys_ks->execute_cql(req, ks_name, cf_name, sstable_name, size, pk_str, timestamp, args...)
            .discard_result()
            .handle_exception([ks_name, cf_name, large_table, sstable_name] (std::exception_ptr ep) {
@@ -182,12 +182,14 @@ future<> cql_table_large_data_handler::record_large_partitions(const sstables::s

 future<> cql_table_large_data_handler::internal_record_large_partitions(const sstables::sstable& sst, const sstables::key& key,
        uint64_t partition_size, uint64_t rows) const {
-    return try_record("partition", sst, key, int64_t(partition_size), "partition", "", {"rows"}, data_value((int64_t)rows));
+    const sstring size_desc = seastar::format("{} bytes/{} rows", partition_size, rows);
+    return try_record("partition", sst, key, int64_t(partition_size), size_desc, "partition", "", {"rows"}, data_value((int64_t)rows));
 }

 future<> cql_table_large_data_handler::internal_record_large_partitions_all_data(const sstables::sstable& sst, const sstables::key& key,
        uint64_t partition_size, uint64_t rows, uint64_t range_tombstones, uint64_t dead_rows) const {
-    return try_record("partition", sst, key, int64_t(partition_size), "partition", "", {"rows", "range_tombstones", "dead_rows"},
+    const sstring size_desc = seastar::format("{} bytes/{} rows", partition_size, rows);
+    return try_record("partition", sst, key, int64_t(partition_size), size_desc, "partition", "", {"rows", "range_tombstones", "dead_rows"},
                data_value((int64_t)rows), data_value((int64_t)range_tombstones), data_value((int64_t)dead_rows));
 }

@@ -201,13 +203,14 @@ future<> cql_table_large_data_handler::internal_record_large_cells(const sstable
    auto column_name = cdef.name_as_text();
    std::string_view cell_type = cdef.is_atomic() ? "cell" : "collection";
    static const std::vector<sstring> extra_fields{"clustering_key", "column_name"};
+    const sstring size_desc = seastar::format("{} bytes", cell_size);
    if (clustering_key) {
        const schema &s = *sst.get_schema();
        auto ck_str = key_to_str(*clustering_key, s);
-        return try_record("cell", sst, partition_key, int64_t(cell_size), cell_type, column_name, extra_fields, ck_str, column_name);
+        return try_record("cell", sst, partition_key, int64_t(cell_size), size_desc, cell_type, column_name, extra_fields, ck_str, column_name);
    } else {
        auto desc = seastar::format("static {}", cell_type);
-        return try_record("cell", sst, partition_key, int64_t(cell_size), desc, column_name, extra_fields, data_value::make_null(utf8_type), column_name);
+        return try_record("cell", sst, partition_key, int64_t(cell_size), size_desc, desc, column_name, extra_fields, data_value::make_null(utf8_type), column_name);
    }
 }

@@ -215,26 +218,28 @@ future<> cql_table_large_data_handler::internal_record_large_cells_and_collectio
        const clustering_key_prefix* clustering_key, const column_definition& cdef, uint64_t cell_size, uint64_t collection_elements) const {
    auto column_name = cdef.name_as_text();
    std::string_view cell_type = cdef.is_atomic() ? "cell" : "collection";
+    const sstring size_desc = seastar::format("{} bytes", cell_size);
    static const std::vector<sstring> extra_fields{"clustering_key", "column_name", "collection_elements"};
    if (clustering_key) {
        const schema &s = *sst.get_schema();
        auto ck_str = key_to_str(*clustering_key, s);
-        return try_record("cell", sst, partition_key, int64_t(cell_size), cell_type, column_name, extra_fields, ck_str, column_name, data_value((int64_t)collection_elements));
+        return try_record("cell", sst, partition_key, int64_t(cell_size), size_desc, cell_type, column_name, extra_fields, ck_str, column_name, data_value((int64_t)collection_elements));
    } else {
        auto desc = seastar::format("static {}", cell_type);
-        return try_record("cell", sst, partition_key, int64_t(cell_size), desc, column_name, extra_fields, data_value::make_null(utf8_type), column_name, data_value((int64_t)collection_elements));
+        return try_record("cell", sst, partition_key, int64_t(cell_size), size_desc, desc, column_name, extra_fields, data_value::make_null(utf8_type), column_name, data_value((int64_t)collection_elements));
    }
 }

 future<> cql_table_large_data_handler::record_large_rows(const sstables::sstable& sst, const sstables::key& partition_key,
        const clustering_key_prefix* clustering_key, uint64_t row_size) const {
    static const std::vector<sstring> extra_fields{"clustering_key"};
+    const sstring size_desc = seastar::format("{} bytes", row_size);
    if (clustering_key) {
        const schema &s = *sst.get_schema();
        std::string ck_str = key_to_str(*clustering_key, s);
-        return try_record("row", sst, partition_key, int64_t(row_size), "row", "", extra_fields, ck_str);
+        return try_record("row", sst, partition_key, int64_t(row_size), size_desc, "row", "", extra_fields, ck_str);
    } else {
-        return try_record("row", sst, partition_key, int64_t(row_size), "static row", "", extra_fields, data_value::make_null(utf8_type));
+        return try_record("row", sst, partition_key, int64_t(row_size), size_desc, "static row", "", extra_fields, data_value::make_null(utf8_type));
    }
 }

--- a/db/large_data_handler.hh
+++ b/db/large_data_handler.hh
@@ -187,7 +187,7 @@ private:
 private:
    template <typename... Args>
    future<> try_record(std::string_view large_table, const sstables::sstable& sst,  const sstables::key& partition_key, int64_t size,
-            std::string_view desc, std::string_view extra_path, const std::vector<sstring> &extra_fields, Args&&... args) const;
+            std::string_view size_desc, std::string_view desc, std::string_view extra_path, const std::vector<sstring> &extra_fields, Args&&... args) const;
 };

 class nop_large_data_handler : public large_data_handler {
--- a/db/schema_applier.cc
+++ b/db/schema_applier.cc
@@ -579,19 +579,23 @@ static future<> merge_tables_and_views(distributed<service::storage_proxy>& prox
        // 2. The table was just created - the table is guaranteed to be published with the view in that case.
        // 3. The view itself was altered - in that case we already know the base table so we can take it from
        //    the database object.
-        view_ptr vp = create_view_from_mutations(proxy, std::move(sm));
+        query::result_set rs(sm.columnfamilies_mutation());
+        const query::result_set_row& view_row = rs.row(0);
+        auto ks_name = view_row.get_nonnull<sstring>("keyspace_name");
+        auto base_name = view_row.get_nonnull<sstring>("base_table_name");
+
        schema_ptr base_schema;
        for (auto&& altered : tables_diff.altered) {
            // Chose the appropriate version of the base table schema: old -> old, new -> new.
            schema_ptr s = side == schema_diff_side::left ? altered.old_schema : altered.new_schema;
-            if (s->ks_name() == vp->ks_name() && s->cf_name() == vp->view_info()->base_name() ) {
+            if (s->ks_name() == ks_name && s->cf_name() == base_name) {
                base_schema = s;
                break;
            }
        }
        if (!base_schema) {
            for (auto&& s : tables_diff.created) {
-                if (s.get()->ks_name() == vp->ks_name() && s.get()->cf_name() == vp->view_info()->base_name() ) {
+                if (s.get()->ks_name() == ks_name && s.get()->cf_name() == base_name) {
                    base_schema = s;
                    break;
                }
@@ -599,14 +603,14 @@ static future<> merge_tables_and_views(distributed<service::storage_proxy>& prox
        }

        if (!base_schema) {
-            base_schema = proxy.local().local_db().find_schema(vp->ks_name(), vp->view_info()->base_name());
+            base_schema = proxy.local().local_db().find_schema(ks_name, base_name);
        }
+        view_ptr vp = create_view_from_mutations(proxy, std::move(sm), base_schema);

        // Now when we have a referenced base - sanity check that we're not registering an old view
        // (this could happen when we skip multiple major versions in upgrade, which is unsupported.)
        check_no_legacy_secondary_index_mv_schema(proxy.local().get_db().local(), vp, base_schema);

-        vp->view_info()->set_base_info(vp->view_info()->make_base_dependent_view_info(*base_schema));
        return vp;
    });

--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -2430,13 +2430,9 @@ static index_metadata create_index_from_index_row(const query::result_set_row& r
    return index_metadata{index_name, options, kind, is_local};
 }

-/*
- * View metadata serialization/deserialization.
- */
-
-view_ptr create_view_from_mutations(const schema_ctxt& ctxt, schema_mutations sm, std::optional<table_schema_version> version)  {
-    auto table_rs = query::result_set(sm.columnfamilies_mutation());
-    query::result_set_row row = table_rs.row(0);
+static schema_builder prepare_view_schema_builder_from_mutations(const schema_ctxt& ctxt, const schema_mutations& sm, std::optional<table_schema_version> version,
+                                                                const query::result_set& table_rs) {
+    const query::result_set_row& row = table_rs.row(0);

    auto ks_name = row.get_nonnull<sstring>("keyspace_name");
    auto cf_name = row.get_nonnull<sstring>("view_name");
@@ -2462,13 +2458,47 @@ view_ptr create_view_from_mutations(const schema_ctxt& ctxt, schema_mutations sm
    } else {
        builder.with_version(sm.digest(ctxt.features().cluster_schema_features()));
    }
+    return builder;
+}

-    auto base_id = table_id(row.get_nonnull<utils::UUID>("base_table_id"));
+/*
+ * View metadata serialization/deserialization.
+ * For the overload taking an optional base_dependent_view_info: if the base info is not provided, the schema
+ * context must have a reference to the database, and the most up-to-date base schema will be pulled from there.
+ */
+view_ptr create_view_from_mutations(const schema_ctxt& ctxt, schema_mutations sm, schema_ptr base_schema, std::optional<table_schema_version> version)  {
+    auto table_rs = query::result_set(sm.columnfamilies_mutation());
+    auto builder = prepare_view_schema_builder_from_mutations(ctxt, sm, version, table_rs);
+    const query::result_set_row& row = table_rs.row(0);
+    auto include_all_columns = row.get_nonnull<bool>("include_all_columns");
+    auto where_clause = row.get_nonnull<sstring>("where_clause");
+
+    builder.with_view_info(std::move(base_schema), include_all_columns, std::move(where_clause));
+    return view_ptr(builder.build());
+}
+
+view_ptr create_view_from_mutations(const schema_ctxt& ctxt, schema_mutations sm, std::optional<db::view::base_dependent_view_info> base_info, std::optional<table_schema_version> version)  {
+    auto table_rs = query::result_set(sm.columnfamilies_mutation());
+    auto builder = prepare_view_schema_builder_from_mutations(ctxt, sm, version, table_rs);
+    const query::result_set_row& row = table_rs.row(0);
+    auto id = table_id(row.get_nonnull<utils::UUID>("base_table_id"));
    auto base_name = row.get_nonnull<sstring>("base_table_name");
    auto include_all_columns = row.get_nonnull<bool>("include_all_columns");
    auto where_clause = row.get_nonnull<sstring>("where_clause");

-    builder.with_view_info(std::move(base_id), std::move(base_name), include_all_columns, std::move(where_clause));
+    if (!base_info) {
+        if (!ctxt.get_db()) {
+            auto ks_name = row.get_nonnull<sstring>("keyspace_name");
+            auto cf_name = row.get_nonnull<sstring>("view_name");
+            on_internal_error(slogger, format("No database reference with missing base schema when creating view {}.{} from mutations",
+                ks_name, cf_name));
+        }
+        auto base_id = table_id(row.get_nonnull<utils::UUID>("base_table_id"));
+        auto base_schema = ctxt.get_db()->find_schema(base_id);
+        builder.with_view_info(base_schema, include_all_columns, std::move(where_clause));
+    } else {
+        builder.with_view_info(id, base_name, include_all_columns, std::move(where_clause), *base_info);
+    }
    return view_ptr(builder.build());
 }

--- a/db/schema_tables.hh
+++ b/db/schema_tables.hh
@@ -17,6 +17,7 @@
 #include "schema_mutations.hh"
 #include "types/map.hh"
 #include "query-result-set.hh"
+#include "db/view/base_info.hh"

 #include <seastar/core/distributed.hh>

@@ -287,7 +288,8 @@ std::vector<mutation> make_drop_table_mutations(lw_shared_ptr<keyspace_metadata>

 schema_ptr create_table_from_mutations(const schema_ctxt&, schema_mutations, std::optional<table_schema_version> version = {});

-view_ptr create_view_from_mutations(const schema_ctxt&, schema_mutations, std::optional<table_schema_version> version = {});
+view_ptr create_view_from_mutations(const schema_ctxt&, schema_mutations, schema_ptr, std::optional<table_schema_version> version = {});
+view_ptr create_view_from_mutations(const schema_ctxt&, schema_mutations, std::optional<view::base_dependent_view_info> = {}, std::optional<table_schema_version> version = {});

 future<std::vector<view_ptr>> create_views_from_schema_partition(distributed<service::storage_proxy>& proxy, const schema_result::mapped_type& result);

--- a/db/system_keyspace.cc
+++ b/db/system_keyspace.cc
@@ -11,9 +11,11 @@
 #include <boost/functional/hash.hpp>
 #include <boost/icl/interval_map.hpp>
 #include <fmt/ranges.h>
+#include <ranges>

 #include <seastar/core/coroutine.hh>
 #include <seastar/coroutine/parallel_for_each.hh>
+#include <seastar/core/loop.hh>
 #include <seastar/core/on_internal_error.hh>
 #include "system_keyspace.hh"
 #include "cql3/untyped_result_set.hh"
@@ -23,6 +25,7 @@
 #include "gms/feature_service.hh"
 #include "system_keyspace_view_types.hh"
 #include "schema/schema_builder.hh"
+#include "timestamp.hh"
 #include "utils/assert.hh"
 #include "utils/hashers.hh"
 #include "utils/log.hh"
@@ -35,6 +38,7 @@
 #include "db/schema_tables.hh"
 #include "gms/generation-number.hh"
 #include "service/storage_service.hh"
+#include "service/storage_proxy.hh"
 #include "service/paxos/paxos_state.hh"
 #include "query-result-set.hh"
 #include "idl/frozen_mutation.dist.hh"
@@ -350,6 +354,7 @@ schema_ptr system_keyspace::raft() {

            .set_comment("Persisted RAFT log, votes and snapshot info")
            .with_hash_version()
+            .set_caching_options(caching_options::get_disabled_caching_options())
            .build();
    }();
    return schema;
@@ -762,6 +767,35 @@ schema_ptr system_keyspace::large_cells() {
    return large_cells;
 }

+schema_ptr system_keyspace::corrupt_data() {
+    static thread_local auto corrupt_data = [] { // schema is built once per thread, on first call
+        auto id = generate_legacy_id(NAME, CORRUPT_DATA);
+        return schema_builder(NAME, CORRUPT_DATA, id)
+                // partition key (composite: keyspace_name, table_name)
+                .with_column("keyspace_name", utf8_type, column_kind::partition_key)
+                .with_column("table_name", utf8_type, column_kind::partition_key)
+                // clustering key
+                .with_column("id", timeuuid_type, column_kind::clustering_key)
+                // regular columns
+                // Storing keys as bytes: having a corrupt key might be the reason
+                // to record the row as corrupt, so we just dump what we have and
+                // leave interpreting to the lucky person investigating the disaster.
+                .with_column("partition_key", bytes_type)
+                .with_column("clustering_key", bytes_type)
+                // Note: mutation-fragment v2
+                .with_column("mutation_fragment_kind", utf8_type)
+                .with_column("frozen_mutation_fragment", bytes_type)
+                .with_column("origin", utf8_type)
+                .with_column("sstable_name", utf8_type)
+                // options
+                .set_comment("mutation-fragments found to be corrupted")
+                .set_gc_grace_seconds(0)
+                .with_hash_version()
+                .build();
+    }();
+    return corrupt_data;
+}
+
 static constexpr auto schema_gc_grace = std::chrono::duration_cast<std::chrono::seconds>(days(7)).count();

 /*static*/ schema_ptr system_keyspace::scylla_local() {
@@ -1705,8 +1739,15 @@ future<> system_keyspace::drop_truncation_rp_records() {
    auto rs = co_await execute_cql(req);

    bool any = false;
-    co_await coroutine::parallel_for_each(*rs, [&] (const cql3::untyped_result_set_row& row) -> future<> {
+    std::unordered_set<table_id> to_delete;
+    auto db = _qp.db();
+    auto max_concurrency = std::min(1024u, smp::count * 8);
+    co_await seastar::max_concurrent_for_each(*rs, max_concurrency, [&] (const cql3::untyped_result_set_row& row) -> future<> {
        auto table_uuid = table_id(row.get_as<utils::UUID>("table_uuid"));
+        if (!db.try_find_table(table_uuid)) {
+            to_delete.emplace(table_uuid);
+            co_return;
+        }
        auto shard = row.get_as<int32_t>("shard");
        auto segment_id = row.get_as<int64_t>("segment_id");

@@ -1716,11 +1757,26 @@ future<> system_keyspace::drop_truncation_rp_records() {
            co_await execute_cql(req);
        }
    });
+    if (!to_delete.empty()) {
+        // IN has a limit to how many values we can put into it.
+        for (auto&& chunk : to_delete | std::views::transform(&table_id::to_sstring) | std::views::chunk(100)) {
+            auto str = std::ranges::to<std::string>(chunk | std::views::join_with(','));
+            auto req = fmt::format("DELETE FROM system.{} WHERE table_uuid IN ({})", TRUNCATED, str);
+            co_await execute_cql(req);
+        }
+        any = true;
+    }
    if (any) {
        co_await force_blocking_flush(TRUNCATED);
    }
 }

+future<> system_keyspace::remove_truncation_records(table_id id) {
+    // Delete every truncation record of the given table, then force a blocking flush of the table.
+    co_await execute_cql(format("DELETE FROM system.{} WHERE table_uuid = {}", TRUNCATED, id));
+    co_await force_blocking_flush(TRUNCATED);
+}
+
 future<> system_keyspace::save_truncation_record(const replica::column_family& cf, db_clock::time_point truncated_at, db::replay_position rp) {
    sstring req = format("INSERT INTO system.{} (table_uuid, shard, position, segment_id, truncated_at) VALUES(?,?,?,?,?)", TRUNCATED);
    co_await _qp.execute_internal(req, {cf.schema()->id().uuid(), int32_t(rp.shard_id()), int32_t(rp.pos), int64_t(rp.base_id()), truncated_at}, cql3::query_processor::cache_internal::yes);
@@ -2103,7 +2159,59 @@ future<> system_keyspace::update_peer_info(gms::inet_address ep, locator::host_i

    slogger.debug("{}: values={}", query, values);

-    co_await _qp.execute_internal(query, db::consistency_level::ONE, values, cql3::query_processor::cache_internal::yes);
+    const auto guard = co_await get_units(_peers_cache_lock, 1);
+    try {
+        co_await _qp.execute_internal(query, db::consistency_level::ONE, values, cql3::query_processor::cache_internal::yes);
+        if (auto* cache = get_peers_cache()) {
+            cache->host_id_to_inet_ip[hid] = ep;
+            cache->inet_ip_to_host_id[ep] = hid;
+        }
+    } catch (...) {
+        _peers_cache = nullptr;
+        throw;
+    }
+}
+
+system_keyspace::peers_cache* system_keyspace::get_peers_cache() {
+    // Returns the cached peers snapshot, or nullptr when nothing is cached or it has expired.
+    // An expired snapshot is dropped here.
+    if (_peers_cache && lowres_clock::now() > _peers_cache->expiration_time) {
+        _peers_cache = nullptr;
+    }
+    return _peers_cache.get();
+}
+
+future<lw_shared_ptr<const system_keyspace::peers_cache>> system_keyspace::get_or_load_peers_cache() {
+    // Returns the cached peers maps, loading them via load_host_ids() when there is no unexpired cache.
+    const auto guard = co_await get_units(_peers_cache_lock, 1);
+    if (auto* cache = get_peers_cache()) {
+        co_return cache->shared_from_this();
+    }
+    auto cache = make_lw_shared<peers_cache>();
+    cache->inet_ip_to_host_id = co_await load_host_ids();
+    cache->host_id_to_inet_ip.reserve(cache->inet_ip_to_host_id.size());
+    for (const auto& [ip, id] : cache->inet_ip_to_host_id) { // bind by reference: no per-entry copy
+        const auto [it, inserted] = cache->host_id_to_inet_ip.insert({id, ip});
+        if (!inserted) {
+            on_internal_error(slogger, ::format("duplicate IP for host_id {}, first IP {}, second IP {}", id, it->second, ip));
+        }
+    }
+    cache->expiration_time = lowres_clock::now() + std::chrono::milliseconds(200);
+    _peers_cache = cache;
+    co_return std::move(cache);
+}
+
+future<std::optional<gms::inet_address>> system_keyspace::get_ip_from_peers_table(locator::host_id id) {
+    // Look the host id up in the peers cache; an unknown id yields std::nullopt.
+    const auto peers = co_await get_or_load_peers_cache();
+    const auto& id_to_ip = peers->host_id_to_inet_ip;
+    const auto pos = id_to_ip.find(id);
+    co_return (pos == id_to_ip.end()) ? std::nullopt : std::optional<gms::inet_address>(pos->second);
+}
+
+future<system_keyspace::host_id_to_ip_map_t> system_keyspace::get_host_id_to_ip_map() {
+    // Hands out a copy of the host_id -> ip map held by the peers cache.
+    co_return (co_await get_or_load_peers_cache())->host_id_to_inet_ip;
 }

 template <typename T>
@@ -2153,7 +2261,22 @@ future<> system_keyspace::update_schema_version(table_schema_version version) {
 future<> system_keyspace::remove_endpoint(gms::inet_address ep) {
    const sstring req = format("DELETE FROM system.{} WHERE peer = ?", PEERS);
    slogger.debug("DELETE FROM system.{} WHERE peer = {}", PEERS, ep);
-    co_await execute_cql(req, ep.addr()).discard_result();
+
+    const auto guard = co_await get_units(_peers_cache_lock, 1);
+    try {
+        co_await execute_cql(req, ep.addr()).discard_result();
+        if (auto* cache = get_peers_cache()) {
+            const auto it = cache->inet_ip_to_host_id.find(ep);
+            if (it != cache->inet_ip_to_host_id.end()) {
+                const auto id = it->second;
+                cache->inet_ip_to_host_id.erase(it);
+                cache->host_id_to_inet_ip.erase(id);
+            }
+        }
+    } catch (...) {
+        _peers_cache = nullptr;
+        throw;
+    }
 }

 future<> system_keyspace::update_tokens(const std::unordered_set<dht::token>& tokens) {
@@ -2305,6 +2428,7 @@ std::vector<schema_ptr> system_keyspace::all_tables(const db::config& cfg) {
                    peers(), peer_events(), range_xfers(),
                    compactions_in_progress(), compaction_history(),
                    sstable_activity(), size_estimates(), large_partitions(), large_rows(), large_cells(),
+                    corrupt_data(),
                    scylla_local(), db::schema_tables::scylla_table_schema_history(),
                    repair_history(),
                    v3::views_builds_in_progress(), v3::built_views(),
@@ -2931,9 +3055,8 @@ future<std::optional<mutation>> system_keyspace::get_service_levels_version_muta
    return get_scylla_local_mutation(_db, SERVICE_LEVELS_VERSION_KEY);
 }

-future<mutation> system_keyspace::make_service_levels_version_mutation(int8_t version, const service::group0_guard& guard) {
+future<mutation> system_keyspace::make_service_levels_version_mutation(int8_t version, api::timestamp_type timestamp) {
    static sstring query = format("INSERT INTO {}.{} (key, value) VALUES (?, ?);", db::system_keyspace::NAME, db::system_keyspace::SCYLLA_LOCAL);
-    auto timestamp = guard.write_timestamp();
    auto muts = co_await _qp.get_mutations_internal(query, internal_system_query_state(), timestamp, {SERVICE_LEVELS_VERSION_KEY, format("{}", version)});

    if (muts.size() != 1) {
@@ -3071,7 +3194,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
            supported_features = decode_features(deserialize_set_column(*topology(), row, "supported_features"));
        }

-        if (row.has("topology_request")) {
+        if (row.has("topology_request") && nstate != service::node_state::left) {
            auto req = service::topology_request_from_string(row.get_as<sstring>("topology_request"));
            ret.requests.emplace(host_id, req);
            switch(req) {
@@ -3528,4 +3651,12 @@ future<::shared_ptr<cql3::untyped_result_set>> system_keyspace::execute_cql(cons
    return _qp.execute_internal(query_string, values, cql3::query_processor::cache_internal::yes);
 }

+future<> system_keyspace::apply_mutation(mutation m) {
+    if (m.schema()->ks_name() != NAME) {
+        on_internal_error(slogger, fmt::format("system_keyspace::apply_mutation(): attempted to apply mutation belonging to table {}.{}", m.schema()->ks_name(), m.schema()->cf_name()));
+    }
+    // Local write only; commitlog sync is forced when the table's static props request it.
+    return _qp.proxy().mutate_locally(m, {}, db::commitlog::force_sync(m.schema()->static_props().wait_for_sync_to_commitlog), db::no_timeout);
+}
+
 } // namespace db
--- a/db/system_keyspace.hh
+++ b/db/system_keyspace.hh
@@ -141,6 +141,7 @@ class system_keyspace : public seastar::peering_sharded_service<system_keyspace>
    static schema_ptr large_partitions();
    static schema_ptr large_rows();
    static schema_ptr large_cells();
+    static schema_ptr corrupt_data();
    static schema_ptr scylla_local();
    future<> force_blocking_flush(sstring cfname);
    // This function is called when the system.peers table is read,
@@ -152,6 +153,17 @@ class system_keyspace : public seastar::peering_sharded_service<system_keyspace>
    //  and this node crashes after adding a new IP but before removing the old one. The
    //  record with older timestamp is removed, the warning is written to the log.
    future<> peers_table_read_fixup();
+
+    struct peers_cache: public enable_lw_shared_from_this<peers_cache> {
+        std::unordered_map<gms::inet_address, locator::host_id> inet_ip_to_host_id; // peer ip -> host id
+        std::unordered_map<locator::host_id, gms::inet_address> host_id_to_inet_ip; // reverse index of the map above
+        lowres_clock::time_point expiration_time; // get_peers_cache() drops the cache once this has passed
+    };
+    lw_shared_ptr<peers_cache> _peers_cache;
+    semaphore _peers_cache_lock{1};
+    peers_cache* get_peers_cache();
+    future<lw_shared_ptr<const peers_cache>> get_or_load_peers_cache();
+
 public:
    static schema_ptr size_estimates();
 public:
@@ -173,6 +185,7 @@ public:
    static constexpr auto LARGE_PARTITIONS = "large_partitions";
    static constexpr auto LARGE_ROWS = "large_rows";
    static constexpr auto LARGE_CELLS = "large_cells";
+    static constexpr auto CORRUPT_DATA = "corrupt_data";
    static constexpr auto SCYLLA_LOCAL = "scylla_local";
    static constexpr auto RAFT = "raft";
    static constexpr auto RAFT_SNAPSHOTS = "raft_snapshots";
@@ -316,6 +329,12 @@ public:

    future<> update_peer_info(gms::inet_address ep, locator::host_id hid, const peer_info& info);

+    // Return ip of the peers table entry with given host id
+    future<std::optional<gms::inet_address>> get_ip_from_peers_table(locator::host_id id);
+
+    using host_id_to_ip_map_t = std::unordered_map<locator::host_id, gms::inet_address>;
+    future<host_id_to_ip_map_t> get_host_id_to_ip_map();
+
    future<> remove_endpoint(gms::inet_address ep);

    // Saves the key-value pair into system.scylla_local table.
@@ -423,6 +442,7 @@ public:
    future<> save_truncation_record(const replica::column_family&, db_clock::time_point truncated_at, db::replay_position);
    future<replay_positions> get_truncated_positions(table_id);
    future<> drop_truncation_rp_records();
+    future<> remove_truncation_records(table_id);

    // Converts a `dht::token_range` object to the left-open integer range (x,y] form.
    //
@@ -654,7 +674,7 @@ public:
 public:
    future<std::optional<int8_t>> get_service_levels_version();
    
-    future<mutation> make_service_levels_version_mutation(int8_t version, const service::group0_guard& guard);
+    future<mutation> make_service_levels_version_mutation(int8_t version, api::timestamp_type timestamp);
    future<std::optional<mutation>> get_service_levels_version_mutation();

    // Publishes a new compression dictionary to `dicts`,
@@ -685,6 +705,10 @@ public:
        return execute_cql(req, { data_value(std::forward<Args>(args))... });
    }

+    // Apply write as mutation to the system keyspace.
+    // Mutation has to belong to a table in the system keyspace.
+    future<> apply_mutation(mutation m);
+
    friend future<column_mapping> db::schema_tables::get_column_mapping(db::system_keyspace& sys_ks, ::table_id table_id, table_schema_version version);
    friend future<bool> db::schema_tables::column_mapping_exists(db::system_keyspace& sys_ks, table_id table_id, table_schema_version version);
    friend future<> db::schema_tables::drop_column_mapping(db::system_keyspace& sys_ks, table_id table_id, table_schema_version version);
--- a/db/tags/utils.cc
+++ b/db/tags/utils.cc
@@ -49,29 +49,39 @@ std::optional<std::string> find_tag(const schema& s, const sstring& tag) {
 future<> modify_tags(service::migration_manager& mm, sstring ks, sstring cf,
                     std::function<void(std::map<sstring, sstring>&)> modify) {
    co_await mm.container().invoke_on(0, [ks = std::move(ks), cf = std::move(cf), modify = std::move(modify)] (service::migration_manager& mm) -> future<> {
-        // FIXME: the following needs to be in a loop. If mm.announce() below
-        // fails, we need to retry the whole thing.
-        auto group0_guard = co_await mm.start_group0_operation();
-        // After getting the schema-modification lock, we need to read the
-        // table's *current* schema - it might have changed before we got
-        // the lock, by some concurrent modification. If the table is gone,
-        // this will throw no_such_column_family.
-        schema_ptr s = mm.get_storage_proxy().data_dictionary().find_schema(ks, cf);
-        const std::map<sstring, sstring>* tags_ptr = get_tags_of_table(s);
-        std::map<sstring, sstring> tags;
-        if (tags_ptr) {
-            // tags_ptr is a constant pointer to schema data. To allow func()
-            // to modify the tags, we must make a copy.
-            tags = *tags_ptr;
-        }
-        modify(tags);
-        schema_builder builder(s);
-        builder.add_extension(tags_extension::NAME, ::make_shared<tags_extension>(tags));
+        size_t retries = mm.get_concurrent_ddl_retries();
+        for (;;) {
+            auto group0_guard = co_await mm.start_group0_operation();
+            // After getting the schema-modification lock, we need to read the
+            // table's *current* schema - it might have changed before we got
+            // the lock, by some concurrent modification. If the table is gone,
+            // this will throw no_such_column_family.
+            schema_ptr s = mm.get_storage_proxy().data_dictionary().find_schema(ks, cf);
+            const std::map<sstring, sstring>* tags_ptr = get_tags_of_table(s);
+            std::map<sstring, sstring> tags;
+            if (tags_ptr) {
+                // tags_ptr is a constant pointer to schema data. To allow func()
+                // to modify the tags, we must make a copy.
+                tags = *tags_ptr;
+            }
+            modify(tags);
+            schema_builder builder(s);
+            builder.add_extension(tags_extension::NAME, ::make_shared<tags_extension>(tags));

-        auto m = co_await service::prepare_column_family_update_announcement(mm.get_storage_proxy(),
+            auto m = co_await service::prepare_column_family_update_announcement(mm.get_storage_proxy(),
                builder.build(), std::vector<view_ptr>(), group0_guard.write_timestamp());
-
-        co_await mm.announce(std::move(m), std::move(group0_guard), format("Modify tags for {} table", cf));
+            try {
+                co_await mm.announce(std::move(m), std::move(group0_guard), format("Modify tags for {} table", cf));
+                break;
+            }  catch (const service::group0_concurrent_modification& ex) {
+                tlogger.info("Failed to modify tags for table {} due to concurrent schema modifications. {}.",
+                    cf, retries ? "Retrying" : "Number of retries exceeded, giving up");
+                if (retries--) {
+                    continue;
+                }
+                throw;
+            }
+        }
    });
 }

--- a/db/view/base_info.hh
+++ b/db/view/base_info.hh
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#pragma once
+
+#include <optional>
+#include "bytes_fwd.hh"
+#include "schema/schema_fwd.hh"
+
+namespace db {
+
+namespace view {
+
+// Part of the view description which depends on the base schema.
+struct base_dependent_view_info {
+    // True if a computed view primary key column depends on a base non-primary-key column.
+    bool has_computed_column_depending_on_base_non_primary_key;
+
+    // True if the partition key columns of the view are the same as the
+    // partition key columns of the base, maybe in a different order.
+    bool is_partition_key_permutation_of_base_partition_key;
+
+    // Indicates if the view has pk columns which are not part of the base
+    // pk, it seems that !base_non_pk_columns_in_view_pk.empty() is the same,
+    // but actually there are cases where we can compute this boolean without
+    // succeeding to reliably build the former.
+    bool has_base_non_pk_columns_in_view_pk;
+
+    // Initializes the three flags above from the given, already computed, values.
+    base_dependent_view_info(bool has_computed_column_depending_on_base_non_primary_key,
+            bool is_partition_key_permutation_of_base_partition_key,
+            bool has_base_non_pk_columns_in_view_pk);
+};
+
+}
+
+}
--- a/db/view/regular_column_transformation.hh
+++ b/db/view/regular_column_transformation.hh
@@ -0,0 +1,127 @@
+/*
+ * Copyright (C) 2024-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#pragma once
+
+#include "column_computation.hh"
+#include "mutation/atomic_cell.hh"
+#include "timestamp.hh"
+#include <type_traits>
+
+class row_marker;
+
+// In a basic column_computation defined in column_computation.hh, the
+// compute_value() method is only based on the partition key, and it must
+// return a value. That API has very limited applications - basically the
+// only thing we can implement with it is token_column_computation which
+// we used to create the token column in secondary indexes.
+// The regular_column_transformation base class here is more powerful, but
+// still is not a completely general computation: Its compute_value() virtual
+// method can transform the value read from a single cell of a regular column
+// into a new cell stored in a structure regular_column_transformation::result.
+//
+// In more detail, the assumptions of regular_column_transformation are:
+// 1. compute_value() computes the value based on a *single* column in a
+//    row passed to compute_value().
+//    This assumption means that the value or deletion of the value always
+//    has a single known timestamp (and the value can't be half-missing)
+//    and single TTL information. That would not have been possible if we
+//    allowed the computation to depend on multiple columns.
+// 2. compute_value() computes the value based on a *regular* column in the
+//    base table. This means that an update can modify this value (unlike a
+//    base-table key column that can't change in an update), so the view
+//    update code needs to compute the value before and after the update,
+//    and potentially delete and create view rows.
+// 3. compute_value() returns a regular_column_transformation::result which includes
+//    a value and its liveness information (timestamp and ttl/expiry) or
+//    is missing a value.
+
+class regular_column_transformation : public column_computation {
+public:
+    struct result {
+        // We can use "bytes" instead of "managed_bytes" here because we know
+        // that a column_computation is only used for generating a key value,
+        // and that is limited to 64K. This limitation is enforced below -
+        // we never linearize a cell's value if its size is more than 64K.
+        std::optional<bytes> _value;
+
+        // _ttl and _expiry are only defined if _value is set.
+        // The default values below are used when the source cell does not
+        // expire, and are the same values that row_marker uses for a non-
+        // expiring marker. This is useful when creating a row_marker from
+        // get_ttl() and get_expiry().
+        gc_clock::duration _ttl { 0 };
+        gc_clock::time_point _expiry { gc_clock::duration(0) };
+
+        // _ts may be set even if _value is missing, which can remember the
+        // timestamp of a tombstone. Note that the current view-update code
+        // that uses this class doesn't use _ts when _value is missing.
+        api::timestamp_type _ts = api::missing_timestamp;
+
+        api::timestamp_type get_ts() const {
+            return _ts;
+        }
+
+        bool has_value() const {
+            return _value.has_value();
+        }
+
+        // Should only be called if has_value() is true:
+        const bytes& get_value() const {
+            return *_value;
+        }
+        gc_clock::duration get_ttl() const {
+            return _ttl;
+        }
+        gc_clock::time_point get_expiry() const {
+            return _expiry;
+        }
+
+        // A missing computation result
+        result() { }
+
+        // Construct a computation result by copying a given atomic_cell -
+        // including its value, timestamp, and ttl - or deletion timestamp.
+        // The second parameter is an optional transformation function f -
+        // taking a bytes and returning an optional<bytes> - that transforms
+        // the value of the cell but keeps its other liveness information.
+        // If f returns nullopt, the result has no value and the view row should be deleted.
+        template<typename Func=std::identity>
+        requires std::invocable<Func, bytes> && std::convertible_to<std::invoke_result_t<Func, bytes>, std::optional<bytes>>
+        result(atomic_cell_view cell, Func f = {}) {
+            _ts = cell.timestamp();
+            if (cell.is_live()) {
+                // If the cell is larger than what a key can hold (64KB),
+                // return a missing value. This lets us skip this item during
+                // view building and avoid hanging the view build as described
+                // in #8627. But it doesn't prevent later inserting such an item
+                // to the base table, nor does it implement front-end specific
+                // limits (such as Alternator's 1K or 2K limits - see #10347).
+                // Those stricter limits should be validated in the base-table
+                // write code, not here - deep inside the view update code.
+                // Note also we assume that f() doesn't grow the value further.
+                if (cell.value().size() >= 65536) {
+                    return;
+                }
+                _value = f(to_bytes(cell.value()));
+                if (_value) {
+                    if (cell.is_live_and_has_ttl()) {
+                        _ttl = cell.ttl();
+                        _expiry = cell.expiry();
+                    }
+                }
+            }
+        }
+    };
+
+    virtual ~regular_column_transformation() = default;
+    virtual result compute_value(
+        const schema& schema,
+        const partition_key& key,
+        const db::view::clustering_or_static_row& row) const = 0;
+ };
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -24,6 +24,7 @@
 #include <seastar/core/coroutine.hh>
 #include <seastar/coroutine/maybe_yield.hh>

+#include "db/view/base_info.hh"
 #include "replica/database.hh"
 #include "clustering_bounds_comparator.hh"
 #include "cql3/statements/select_statement.hh"
@@ -36,6 +37,7 @@
 #include "db/view/view_builder.hh"
 #include "db/view/view_updating_consumer.hh"
 #include "db/view/view_update_generator.hh"
+#include "db/view/regular_column_transformation.hh"
 #include "db/system_keyspace_view_types.hh"
 #include "db/system_keyspace.hh"
 #include "db/system_distributed_keyspace.hh"
@@ -77,10 +79,16 @@ static inline void inject_failure(std::string_view operation) {
            [operation] { throw std::runtime_error(std::string(operation)); });
 }

-view_info::view_info(const schema& schema, const raw_view_info& raw_view_info)
+view_info::view_info(const schema& schema, const raw_view_info& raw_view_info, schema_ptr base_schema)
        : _schema(schema)
        , _raw(raw_view_info)
-        , _has_computed_column_depending_on_base_non_primary_key(false)
+        , _base_info(make_base_dependent_view_info(*base_schema))
+{ }
+
+view_info::view_info(const schema& schema, const raw_view_info& raw_view_info, db::view::base_dependent_view_info base_info)
+        : _schema(schema)
+        , _raw(raw_view_info)
+        , _base_info(std::move(base_info))
 { }

 cql3::statements::select_statement& view_info::select_statement(data_dictionary::database db) const {
@@ -131,117 +139,63 @@ const column_definition* view_info::view_column(const column_definition& base_de
    return _schema.get_column_definition(base_def.name());
 }

-void view_info::set_base_info(db::view::base_info_ptr base_info) {
-    _base_info = std::move(base_info);
+void view_info::reset_view_info() {
    // Forget the cached objects which may refer to the base schema.
    _select_statement = nullptr;
    _partition_slice = std::nullopt;
 }

 // A constructor for a base info that can facilitate reads and writes from the materialized view.
-db::view::base_dependent_view_info::base_dependent_view_info(schema_ptr base_schema,
-        std::vector<column_id>&& base_regular_columns_in_view_pk,
-        std::vector<column_id>&& base_static_columns_in_view_pk)
-        : _base_schema{std::move(base_schema)}
-        , _base_regular_columns_in_view_pk{std::move(base_regular_columns_in_view_pk)}
-        , _base_static_columns_in_view_pk{std::move(base_static_columns_in_view_pk)}
-        , has_base_non_pk_columns_in_view_pk{!_base_regular_columns_in_view_pk.empty() || !_base_static_columns_in_view_pk.empty()}
-        , use_only_for_reads{false} {
-
-}
-
-// A constructor for a base info that can facilitate only reads from the materialized view.
-db::view::base_dependent_view_info::base_dependent_view_info(bool has_base_non_pk_columns_in_view_pk, std::optional<bytes>&& column_missing_in_base)
-        : _base_schema{nullptr}
-        , _column_missing_in_base{std::move(column_missing_in_base)}
+db::view::base_dependent_view_info::base_dependent_view_info(bool has_computed_column_depending_on_base_non_primary_key,
+        bool is_partition_key_permutation_of_base_partition_key,
+        bool has_base_non_pk_columns_in_view_pk)
+        : has_computed_column_depending_on_base_non_primary_key{has_computed_column_depending_on_base_non_primary_key}
+        , is_partition_key_permutation_of_base_partition_key{is_partition_key_permutation_of_base_partition_key}
        , has_base_non_pk_columns_in_view_pk{has_base_non_pk_columns_in_view_pk}
-        , use_only_for_reads{true} {
-}
+{ }

-const std::vector<column_id>& db::view::base_dependent_view_info::base_regular_columns_in_view_pk() const {
-    if (use_only_for_reads) {
-        on_internal_error(vlogger,
-                seastar::format("base_regular_columns_in_view_pk(): operation unsupported when initialized only for view reads. "
-                "Missing column in the base table: {}", to_string_view(_column_missing_in_base.value_or(bytes()))));
-    }
-    return _base_regular_columns_in_view_pk;
-}
-
-const std::vector<column_id>& db::view::base_dependent_view_info::base_static_columns_in_view_pk() const {
-    if (use_only_for_reads) {
-        on_internal_error(vlogger,
-                seastar::format("base_static_columns_in_view_pk(): operation unsupported when initialized only for view reads. "
-                "Missing column in the base table: {}", to_string_view(_column_missing_in_base.value_or(bytes()))));
-    }
-    return _base_static_columns_in_view_pk;
-}
-
-const schema_ptr& db::view::base_dependent_view_info::base_schema() const {
-    if (use_only_for_reads) {
-        on_internal_error(vlogger,
-                seastar::format("base_schema(): operation unsupported when initialized only for view reads. "
-                "Missing column in the base table: {}", to_string_view(_column_missing_in_base.value_or(bytes()))));
-    }
-    return _base_schema;
-}
-
-db::view::base_info_ptr view_info::make_base_dependent_view_info(const schema& base) const {
-    std::vector<column_id> base_regular_columns_in_view_pk;
-    std::vector<column_id> base_static_columns_in_view_pk;
-
-    _is_partition_key_permutation_of_base_partition_key =
+db::view::base_dependent_view_info view_info::make_base_dependent_view_info(const schema& base) const {
+    bool is_partition_key_permutation_of_base_partition_key =
        std::ranges::all_of(_schema.partition_key_columns(), [&base] (const column_definition& view_col) {
            const column_definition* base_col = base.get_column_definition(view_col.name());
            return base_col && base_col->is_partition_key();
            })
        && _schema.partition_key_size() == base.partition_key_size();

+    bool has_computed_column_depending_on_base_non_primary_key = false;
+    bool has_base_non_pk_columns_in_view_pk = false;
    for (auto&& view_col : _schema.primary_key_columns()) {
        if (view_col.is_computed()) {
            // we are not going to find it in the base table...
            if (view_col.get_computation().depends_on_non_primary_key_column()) {
-                _has_computed_column_depending_on_base_non_primary_key = true;
+                has_computed_column_depending_on_base_non_primary_key = true;
            }
            continue;
        }
        const bytes& view_col_name = view_col.name();
        auto* base_col = base.get_column_definition(view_col_name);
        if (base_col && base_col->is_regular()) {
-            base_regular_columns_in_view_pk.push_back(base_col->id);
+            has_base_non_pk_columns_in_view_pk = true;
        } else if (base_col && base_col->is_static()) {
-            base_static_columns_in_view_pk.push_back(base_col->id);
+            has_base_non_pk_columns_in_view_pk = true;
        } else if (!base_col) {
-            vlogger.error("Column {} in view {}.{} was not found in the base table {}.{}",
-                    to_string_view(view_col_name), _schema.ks_name(), _schema.cf_name(), base.ks_name(), base.cf_name());
-            if (to_string_view(view_col_name) == "idx_token") {
-                vlogger.warn("Missing idx_token column is caused by an incorrect upgrade of a secondary index. "
-                        "Please recreate index {}.{} to avoid future issues.", _schema.ks_name(), _schema.cf_name());
-            }
-            // If we didn't find the column in the base column then it must have been deleted
-            // or not yet added (by alter command), this means it is for sure not a pk column
-            // in the base table. This can happen if the version of the base schema is not the
-            // one that the view was created with. Setting this schema as the base can't harm since
-            // if we got to such a situation then it means it is only going to be used for reading
-            // (computation of shadowable tombstones) and in that case the existence of such a column
-            // is the only thing that is of interest to us.
-            return make_lw_shared<db::view::base_dependent_view_info>(true, view_col_name);
+            has_base_non_pk_columns_in_view_pk = true;
        }
    }
-
-    return make_lw_shared<db::view::base_dependent_view_info>(base.shared_from_this(), std::move(base_regular_columns_in_view_pk), std::move(base_static_columns_in_view_pk));
+    return db::view::base_dependent_view_info(has_computed_column_depending_on_base_non_primary_key,
+        is_partition_key_permutation_of_base_partition_key, has_base_non_pk_columns_in_view_pk);
 }

 bool view_info::has_base_non_pk_columns_in_view_pk() const {
-    // The base info is not always available, this is because
-    // the base info initialization is separate from the view
-    // info construction. If we are trying to get this info without
-    // initializing the base information it means that we have a
-    // schema integrity problem as the creator of owning view schema
-    // didn't make sure to initialize it with base information.
-    if (!_base_info) {
-        on_internal_error(vlogger, "Tried to perform a view query which is base info dependent without initializing it");
-    }
-    return _base_info->has_base_non_pk_columns_in_view_pk;
+    return _base_info.has_base_non_pk_columns_in_view_pk;
+}
+
+bool view_info::has_computed_column_depending_on_base_non_primary_key() const {
+    return _base_info.has_computed_column_depending_on_base_non_primary_key; // flag stored in _base_info at construction
+}
+
+bool view_info::is_partition_key_permutation_of_base_partition_key() const {
+    return _base_info.is_partition_key_permutation_of_base_partition_key; // flag stored in _base_info at construction
 }

 clustering_row db::view::clustering_or_static_row::as_clustering_row(const schema& s) const {
@@ -342,11 +296,11 @@ bool may_be_affected_by(data_dictionary::database db, const schema& base, const
 }

 static bool update_requires_read_before_write(data_dictionary::database db, const schema& base,
-        const std::vector<view_and_base>& views,
+        const std::vector<view_ptr>& views,
        const dht::decorated_key& key,
        const rows_entry& update) {
    for (auto&& v : views) {
-        view_info& vf = *v.view->view_info();
+        view_info& vf = *v->view_info();
        if (may_be_affected_by(db, base, vf, key, update)) {
            return true;
        }
@@ -483,6 +437,30 @@ bool matches_view_filter(data_dictionary::database db, const schema& base, const
            && visitor.matches_view_filter();
 }

+view_updates::view_updates(view_ptr v, schema_ptr base)
+    : _view(std::move(v))
+    , _view_info(*_view->view_info())
+    , _base(std::move(base))
+    , _base_info(_view_info.base_info())
+    , _updates(8, partition_key::hashing(*_view), partition_key::equality(*_view))
+{
+    for (auto&& view_col : _view->primary_key_columns()) {
+        if (view_col.is_computed()) {
+            continue;
+        }
+        const bytes& view_col_name = view_col.name();
+        auto* base_col = _base->get_column_definition(view_col_name);
+        if (base_col && base_col->is_regular()) {
+            _base_regular_columns_in_view_pk.push_back(base_col->id);
+        } else if (base_col && base_col->is_static()) {
+            _base_static_columns_in_view_pk.push_back(base_col->id);
+        } else if (!base_col) {
+            on_internal_error(vlogger, format("Column {} in view {}.{} was not found in the base table {}.{}",
+                    view_col_name, _view->ks_name(), _view->cf_name(), _base->ks_name(), _base->cf_name()));
+        }
+    }
+}
+
 future<> view_updates::move_to(utils::chunked_vector<frozen_mutation_and_schema>& mutations) {
    mutations.reserve(mutations.size() + _updates.size());
    for (auto it = _updates.begin(); it != _updates.end(); it = _updates.erase(it)) {
@@ -506,79 +484,6 @@ size_t view_updates::op_count() const {
    return _op_count;
 }

-row_marker view_updates::compute_row_marker(const clustering_or_static_row& base_row) const {
-    /*
-     * We need to compute both the timestamp and expiration for view rows.
-     *
-     * Below there are several distinct cases depending on how many new key
-     * columns the view has - i.e., how many of the view's key columns were
-     * regular columns in the base. base_regular_columns_in_view_pk.size():
-     *
-     * Zero new key columns:
-     *     The view rows key is composed only from base key columns, and those
-     *     cannot be changed in an update, so the view row remains alive as
-     *     long as the base row is alive. We need to return the same row
-     *     marker as the base for the view - to keep an empty view row alive
-    *      for as long as an empty base row exists.
-     *     Note that in this case, if there are *unselected* base columns, we
-     *     may need to keep an empty view row alive even without a row marker
-     *     because the base row (which has additional columns) is still alive.
-     *     For that we have the "virtual columns" feature: In the zero new
-     *     key columns case, we put unselected columns in the view as empty
-     *     columns, to keep the view row alive.
-     *
-     * One new key column:
-     *     In this case, there is a regular base column that is part of the
-     *     view key. This regular column can be added or deleted in an update,
-     *     or its expiration be set, and those can cause the view row -
-     *     including its row marker - to need to appear or disappear as well.
-     *     So the liveness of cell of this one column determines the liveness
-     *     of the view row and the row marker that we return.
-     *
-     * Two or more new key columns:
-     *     This case is explicitly NOT supported in CQL - one cannot create a
-     *     view with more than one base-regular columns in its key. In general
-     *     picking one liveness (timestamp and expiration) is not possible
-     *     if there are multiple regular base columns in the view key, as
-     *     those can have different liveness.
-     *     However, we do allow this case for Alternator - we need to allow
-     *     the case of two (but not more) because the DynamoDB API allows
-     *     creating a GSI whose two key columns (hash and range key) were
-     *     regular columns.
-     *     We can support this case in Alternator because it doesn't use
-     *     expiration (the "TTL" it does support is different), and doesn't
-     *     support user-defined timestamps. But, the two columns can still
-     *     have different timestamps - this happens if an update modifies
-     *     just one of them. In this case the timestamp of the view update
-     *     (and that of the row marker we return) is the later of these two
-     *     updated columns.
-     */
-    const auto& col_ids = base_row.is_clustering_row()
-            ? _base_info->base_regular_columns_in_view_pk()
-            : _base_info->base_static_columns_in_view_pk();
-    if (!col_ids.empty()) {
-        auto& def = _base->column_at(base_row.column_kind(), col_ids[0]);
-        // Note: multi-cell columns can't be part of the primary key.
-        auto cell = base_row.cells().cell_at(col_ids[0]).as_atomic_cell(def);
-        auto ts = cell.timestamp();
-        if (col_ids.size() > 1){
-            // As explained above, this case only happens in Alternator,
-            // and we may need to pick a higher ts:
-            auto& second_def = _base->column_at(base_row.column_kind(), col_ids[1]);
-            auto second_cell = base_row.cells().cell_at(col_ids[1]).as_atomic_cell(second_def);
-            auto second_ts = second_cell.timestamp();
-            ts = std::max(ts, second_ts);
-            // Alternator isn't supposed to have TTL or more than two col_ids!
-            if (col_ids.size() != 2 || cell.is_live_and_has_ttl() || second_cell.is_live_and_has_ttl()) [[unlikely]] {
-                utils::on_internal_error(format("Unexpected col_ids length {} or has TTL", col_ids.size()));
-            }
-        }
-        return cell.is_live_and_has_ttl() ? row_marker(ts, cell.ttl(), cell.expiry()) : row_marker(ts);
-    }
-
-    return base_row.marker();
-}
-
 namespace {
 // The following struct is identical to view_key_with_action, except the key
 // is stored as a managed_bytes_view instead of bytes.
@@ -654,8 +559,8 @@ public:
            return {_update.key()->get_component(_base, base_col->position())};
        default:
            if (base_col->kind != _update.column_kind()) {
-                on_internal_error(vlogger, format("Tried to get a {} column from a {} row update, which is impossible",
-                        to_sstring(base_col->kind), _update.is_clustering_row() ? "clustering" : "static"));
+                on_internal_error(vlogger, format("Tried to get a {} column {} from a {} row update, which is impossible",
+                        to_sstring(base_col->kind), base_col->name_as_text(), _update.is_clustering_row() ? "clustering" : "static"));
            }
            auto& c = _update.cells().cell_at(base_col->id);
            auto value_view = base_col->is_atomic() ? c.as_atomic_cell(cdef).value() : c.as_collection_mutation().data;
@@ -676,6 +581,22 @@ private:
            return handle_collection_column_computation(collection_computation);
        }

+        // TODO: we already calculated this computation in updatable_view_key_cols,
+        // so perhaps we should pass it here and not re-compute it. But this will
+        // mean computed columns will only work for view key columns (currently
+        // we assume that anyway)
+        if (auto* c = dynamic_cast<const regular_column_transformation*>(&computation)) {
+            regular_column_transformation::result after =
+                c->compute_value(_base, _base_key, _update);
+            if (after.has_value()) {
+                return {managed_bytes_view(linearized_values.emplace_back(after.get_value()))};
+            }
+            // We only get to this function when we know the _update row
+            // exists and call it to read its key columns, so we don't expect
+            // to see a missing value for any of those columns
+            on_internal_error(vlogger, fmt::format("unexpected call to handle_computed_column {} missing in update", cdef.name_as_text()));
+        }
+
        auto computed_value = computation.compute_value(_base, _base_key);
        return {managed_bytes_view(linearized_values.emplace_back(std::move(computed_value)))};
    }
@@ -727,7 +648,6 @@ view_updates::get_view_rows(const partition_key& base_key, const clustering_or_s
        if (partition.partition_tombstone() && partition.partition_tombstone() == row_delete_tomb.tomb()) {
            return;
        }
-
        ret.push_back({&partition.clustered_row(*_view, std::move(ckey)), action});
    };

@@ -934,13 +854,12 @@ static void add_cells_to_view(const schema& base, const schema& view, column_kin
 * Creates a view entry corresponding to the provided base row.
 * This method checks that the base row does match the view filter before applying anything.
 */
-void view_updates::create_entry(data_dictionary::database db, const partition_key& base_key, const clustering_or_static_row& update, gc_clock::time_point now) {
+void view_updates::create_entry(data_dictionary::database db, const partition_key& base_key, const clustering_or_static_row& update, gc_clock::time_point now, row_marker update_marker) {
    if (!matches_view_filter(db, *_base, _view_info, base_key, update, now)) {
        return;
    }

    auto view_rows = get_view_rows(base_key, update, std::nullopt, {});
-    auto update_marker = compute_row_marker(update);
    const auto kind = update.column_kind();
    for (const auto& [r, action]: view_rows) {
        if (auto rm = std::get_if<row_marker>(&action)) {
@@ -958,48 +877,28 @@ void view_updates::create_entry(data_dictionary::database db, const partition_ke
 * Deletes the view entry corresponding to the provided base row.
 * This method checks that the base row does match the view filter before bothering.
 */
-void view_updates::delete_old_entry(data_dictionary::database db, const partition_key& base_key, const clustering_or_static_row& existing, const clustering_or_static_row& update, gc_clock::time_point now) {
+void view_updates::delete_old_entry(data_dictionary::database db, const partition_key& base_key, const clustering_or_static_row& existing, const clustering_or_static_row& update, gc_clock::time_point now, api::timestamp_type deletion_ts) {
    // Before deleting an old entry, make sure it was matching the view filter
    // (otherwise there is nothing to delete)
    if (matches_view_filter(db, *_base, _view_info, base_key, existing, now)) {
-        do_delete_old_entry(base_key, existing, update, now);
+        do_delete_old_entry(base_key, existing, update, now, deletion_ts);
    }
 }

-void view_updates::do_delete_old_entry(const partition_key& base_key, const clustering_or_static_row& existing, const clustering_or_static_row& update, gc_clock::time_point now) {
+void view_updates::do_delete_old_entry(const partition_key& base_key, const clustering_or_static_row& existing, const clustering_or_static_row& update, gc_clock::time_point now, api::timestamp_type deletion_ts) {
    auto view_rows = get_view_rows(base_key, existing, std::nullopt, update.tomb());
    const auto kind = existing.column_kind();
    for (const auto& [r, action] : view_rows) {
        const auto& col_ids = existing.is_clustering_row()
-                ? _base_info->base_regular_columns_in_view_pk()
-                : _base_info->base_static_columns_in_view_pk();
-        if (_view_info.has_computed_column_depending_on_base_non_primary_key()) {
-            if (auto ts_tag = std::get_if<view_key_and_action::shadowable_tombstone_tag>(&action)) {
-                r->apply(ts_tag->into_shadowable_tombstone(now));
-            }
-        } else if (!col_ids.empty()) {
-            // We delete the old row using a shadowable row tombstone, making sure that
-            // the tombstone deletes everything in the row (or it might still show up).
-            // Note: multi-cell columns can't be part of the primary key.
-            auto& def = _base->column_at(kind, col_ids[0]);
-            auto cell = existing.cells().cell_at(col_ids[0]).as_atomic_cell(def);
-            auto ts = cell.timestamp();
-            if (col_ids.size() > 1) {
-                // This is the Alternator-only support for two regular base
-                // columns that become view key columns. See explanation in
-                // view_updates::compute_row_marker().
-                auto& second_def = _base->column_at(kind, col_ids[1]);
-                auto second_cell = existing.cells().cell_at(col_ids[1]).as_atomic_cell(second_def);
-                auto second_ts = second_cell.timestamp();
-                ts = std::max(ts, second_ts);
-                // Alternator isn't supposed to have more than two col_ids!
-                if (col_ids.size() != 2) [[unlikely]] {
-                    utils::on_internal_error(format("Unexpected col_ids length {}", col_ids.size()));
-                }
-            }
-            if (cell.is_live()) {
-                r->apply(shadowable_tombstone(ts, now));
-            }
+                ? _base_regular_columns_in_view_pk
+                : _base_static_columns_in_view_pk;
+        if (!col_ids.empty() || _view_info.has_computed_column_depending_on_base_non_primary_key()) {
+            // The view key could have been modified because it contains or
+            // depends on a non-primary-key column. The fact that this function was
+            // called instead of update_entry() means the caller knows it
+            // wants to delete the old row (with the given deletion_ts) and
+            // will create a different one. So let's honor this.
+            r->apply(shadowable_tombstone(deletion_ts, now));
        } else {
            // "update" caused the base row to have been deleted, and !col_id
            // means view row is the same - so it needs to be deleted as well
@@ -1046,7 +945,7 @@ bool view_updates::can_skip_view_updates(const clustering_or_static_row& update,
        // as part of its PK, there are NO virtual columns corresponding to the unselected columns in the view.
        // Because of that, we don't generate view updates when the value in an unselected column is created
        // or changes.
-        if (!column_is_selected && _base_info->has_base_non_pk_columns_in_view_pk) {
+        if (!column_is_selected && _base_info.has_base_non_pk_columns_in_view_pk) {
            return true;
        }

@@ -1100,15 +999,15 @@ bool view_updates::can_skip_view_updates(const clustering_or_static_row& update,
 * This method checks that the base row (before and after) matches the view filter before
 * applying anything.
 */
-void view_updates::update_entry(data_dictionary::database db, const partition_key& base_key, const clustering_or_static_row& update, const clustering_or_static_row& existing, gc_clock::time_point now) {
+void view_updates::update_entry(data_dictionary::database db, const partition_key& base_key, const clustering_or_static_row& update, const clustering_or_static_row& existing, gc_clock::time_point now, row_marker update_marker) {
    // While we know update and existing correspond to the same view entry,
    // they may not match the view filter.
    if (!matches_view_filter(db, *_base, _view_info, base_key, existing, now)) {
-        create_entry(db, base_key, update, now);
+        create_entry(db, base_key, update, now, update_marker);
        return;
    }
    if (!matches_view_filter(db, *_base, _view_info, base_key, update, now)) {
-        do_delete_old_entry(base_key, existing, update, now);
+        do_delete_old_entry(base_key, existing, update, now, update_marker.timestamp());
        return;
    }

@@ -1117,7 +1016,7 @@ void view_updates::update_entry(data_dictionary::database db, const partition_ke
    }

    auto view_rows = get_view_rows(base_key, update, std::nullopt, {});
-    auto update_marker = compute_row_marker(update);
+
    const auto kind = update.column_kind();
    for (const auto& [r, action] : view_rows) {
        if (auto rm = std::get_if<row_marker>(&action)) {
@@ -1133,6 +1032,8 @@ void view_updates::update_entry(data_dictionary::database db, const partition_ke
    _op_count += view_rows.size();
 }

+// Note: despite the general-sounding name of this function, it is used
+// just for the case of collection indexing.
 void view_updates::update_entry_for_computed_column(
        const partition_key& base_key,
        const clustering_or_static_row& update,
@@ -1155,30 +1056,72 @@ void view_updates::update_entry_for_computed_column(
    }
 }

+// view_updates::generate_update() is the main function for taking an update
+// to a base table row - consisting of existing and updated versions of row -
+// and creating from it zero or more updates to a given materialized view.
+// These view updates may consist of updating an existing view row, deleting
+// an old view row, and/or creating a new view row.
+// There are several distinct cases depending on how many of the view's key
+// columns are "new key columns", i.e., were regular columns in the base
+// or are a computed column based on a regular column (these computed columns
+// are used by, for example, Alternator's GSI):
+//
+// Zero new key columns:
+//   The view row's key is composed only from base key columns, and those can't
+//   be changed in an update, so the view row remains alive as long as the
+//   base row is alive. The row marker for the view needs to be set to the
+//   same row marker in the base - to keep an empty view row alive for as long
+//   as an empty base row exists.
+//   Note that in this case, if there are *unselected* base columns, we may
+//   need to keep an empty view row alive even without a row marker because
+//   the base row (which has additional columns) is still alive. For that we
+//   have the "virtual columns" feature: In the zero new key columns case, we
+//   put unselected columns in the view as empty columns, to keep the view
+//   row alive.
+//
+// One new key column:
+//   In this case, there is a regular base column that is part of the view
+//   key. This regular column can be added or deleted in an update, or its
+//   expiration be set, and those can cause the view row - including its row
+//   marker - to need to appear or disappear as well. So the liveness of cell
+//   of this one column determines the liveness of the view row and the row
+//   marker that we set for it.
+//
+// Two or more new key columns:
+//   This case is explicitly NOT supported in CQL - one cannot create a view
+//   with more than one base-regular columns in its key. In general picking
+//   one liveness (timestamp and expiration) is not possible if there are
+//   multiple regular base columns in the view key, as those can have different
+//   liveness.
+//   However, we do allow this case for Alternator - we need to allow the case
+//   of two (but not more) because the DynamoDB API allows creating a GSI
+//   whose two key columns (hash and range key) were regular columns. We can
+//   support this case in Alternator because it doesn't use expiration (the
+//   "TTL" it does support is different), and doesn't support user-defined
+//   timestamps. But, the two columns can still have different timestamps -
+//   this happens if an update modifies just one of them. In this case the
+//   timestamp of the view update (and that of the row marker) is the later
+//   of these two updated columns.
 void view_updates::generate_update(
        data_dictionary::database db,
        const partition_key& base_key,
        const clustering_or_static_row& update,
        const std::optional<clustering_or_static_row>& existing,
        gc_clock::time_point now) {
-
-    // Note that the base PK columns in update and existing are the same, since we're intrinsically dealing
-    // with the same base row. So we have to check 3 things:
-    //   1) that the clustering key doesn't have a null, which can happen for compact tables. If that's the case,
-    //      there is no corresponding entries.
-    //   2) if there is a column not part of the base PK in the view PK, whether it is changed by the update.
-    //   3) whether the update actually matches the view SELECT filter
-
+    // FIXME: The following if() is old code which may be related to COMPACT
+    // STORAGE. If this is a real case, refer to a test that demonstrates it.
+    // If it's not a real case, remove this if().
    if (update.is_clustering_row()) {
        if (!update.key()->is_full(*_base)) {
            return;
        }
    }
-
-    if (_view_info.has_computed_column_depending_on_base_non_primary_key()) {
-        return update_entry_for_computed_column(base_key, update, existing, now);
-    }
-    if (!_base_info->has_base_non_pk_columns_in_view_pk) {
+    // If the view key depends on any regular column in the base, the update
+    // may change the view key and may require deleting an old view row and
+    // inserting a new row. The other case, which we'll handle here first,
+    // is easier and requires just modifying one view row.
+    if (!_base_info.has_base_non_pk_columns_in_view_pk &&
+        !_view_info.has_computed_column_depending_on_base_non_primary_key()) {
        if (update.is_static_row()) {
            // TODO: support static rows in views with pk only including columns from base pk
            return;
@@ -1186,85 +1129,186 @@ void view_updates::generate_update(
        // The view key is necessarily the same pre and post update.
        if (existing && existing->is_live(*_base)) {
            if (update.is_live(*_base)) {
-                update_entry(db, base_key, update, *existing, now);
+                update_entry(db, base_key, update, *existing, now, update.marker());
            } else {
-                delete_old_entry(db, base_key, *existing, update, now);
+                delete_old_entry(db, base_key, *existing, update, now, api::missing_timestamp);
            }
        } else if (update.is_live(*_base)) {
-            create_entry(db, base_key, update, now);
+            create_entry(db, base_key, update, now, update.marker());
        }
        return;
    }

-    const auto& col_ids = update.is_clustering_row()
-            ? _base_info->base_regular_columns_in_view_pk()
-            : _base_info->base_static_columns_in_view_pk();
-
-    // The view has a non-primary-key column from the base table as its primary key.
-    // That means it's either a regular or static column. If we are currently
-    // processing an update which does not correspond to the column's kind,
-    // just stop here.
-    if (col_ids.empty()) {
+    // Find the view key columns that may be changed by an update.
+    // This case is interesting because a change to the view key means that
+    // we may need to delete an old view row and/or create a new view row.
+    // The columns we look for are view key columns that are neither base key
+    // columns nor computed columns based just on key columns. In other words,
+    // we look here for columns which were regular columns or static columns
+    // in the base table, or computed columns based on regular columns.
+    struct updatable_view_key_col {
+        column_id view_col_id;
+        regular_column_transformation::result before;
+        regular_column_transformation::result after;
+    };
+    std::vector<updatable_view_key_col> updatable_view_key_cols;
+    for (const column_definition& view_col : _view->primary_key_columns()) {
+        if (view_col.is_computed()) {
+            const column_computation& computation = view_col.get_computation();
+            if (computation.depends_on_non_primary_key_column()) {
+                // Column is a computed column that does not depend just on
+                // the base key, so it may change in the update.
+                if (auto* c = dynamic_cast<const regular_column_transformation*>(&computation)) {
+                    updatable_view_key_cols.emplace_back(view_col.id,
+                        existing ? c->compute_value(*_base, base_key, *existing) : regular_column_transformation::result(),
+                        c->compute_value(*_base, base_key, update));
+                } else {
+                    // The only other column_computation we have which has
+                    // depends_on_non_primary_key_column is
+                    // collection_column_computation, and we have a special
+                    // function to handle that case:
+                    return update_entry_for_computed_column(base_key, update, existing, now);
+                }
+            }
+        } else {
+            const column_definition* base_col = _base->get_column_definition(view_col.name());
+            if (!base_col) {
+                on_internal_error(vlogger, fmt::format("Column {} in view {}.{} was not found in the base table {}.{}",
+                    view_col.name(), _view->ks_name(), _view->cf_name(), _base->ks_name(), _base->cf_name()));
+            }
+            // If the view key column was also a base primary key column, then
+            // it can't possibly change in this update. But if the column was
+            // not a primary key column - i.e., a regular column or static
+            // column, the update might have changed it and we need to list it
+            // on updatable_view_key_cols.
+            // We check base_col->kind == update.column_kind() instead of just
+            // !base_col->is_primary_key() because when update is a static row
+            // we know it can't possibly update a regular column (and vice
+            // versa).
+            if (base_col->kind == update.column_kind()) {
+                // This is view key, so we know it is atomic
+                std::optional<atomic_cell_view> after;
+                auto afterp = update.cells().find_cell(base_col->id);
+                if (afterp) {
+                    after = afterp->as_atomic_cell(*base_col);
+                }
+                std::optional<atomic_cell_view> before;
+                if (existing) {
+                    auto beforep = existing->cells().find_cell(base_col->id);
+                    if (beforep) {
+                        before = beforep->as_atomic_cell(*base_col);
+                    }
+                }
+                updatable_view_key_cols.emplace_back(view_col.id,
+                    before ? regular_column_transformation::result(*before) : regular_column_transformation::result(),
+                    after ? regular_column_transformation::result(*after) : regular_column_transformation::result());
+            }
+        }
+    }
+    // If we reached here, the view has a non-primary-key column from the base
+    // table as its primary key. That means it's either a regular or static
+    // column. If we are currently processing an update which does not
+    // correspond to the column's kind, updatable_view_key_cols will be empty
+    // and we can just stop here.
+    if (updatable_view_key_cols.empty()) {
        return;
    }

-    const auto kind = update.column_kind();
-
-    // If one of the key columns is missing, set has_new_row = false
-    // meaning that after the update there will be no view row.
-    // If one of the key columns is missing in the existing value,
-    // set has_old_row = false meaning we don't have an old row to
-    // delete.
+    // Use updatable_view_key_cols - the before and after values of the
+    // view key columns that may have changed - to determine if the update
+    // changes an existing view row, deletes an old row or creates a new row.
    bool has_old_row = true;
    bool has_new_row = true;
-    bool same_row = true;
-    for (auto col_id : col_ids) {
-        auto* after = update.cells().find_cell(col_id);
-        auto& cdef = _base->column_at(kind, col_id);
-        if (existing) {
-            auto* before = existing->cells().find_cell(col_id);
-            // Note that this cell is necessarily atomic, because col_ids are
-            // view key columns, and keys must be atomic.
-            if (before && before->as_atomic_cell(cdef).is_live()) {
-                if (after && after->as_atomic_cell(cdef).is_live()) {
-                    // We need to compare just the values of the keys, not
-                    // metadata like the timestamp. This is because below,
-                    // if the old and new view row have the same key, we need
-                    // to be sure to reach the update_entry() case.
-                    auto cmp = compare_unsigned(before->as_atomic_cell(cdef).value(), after->as_atomic_cell(cdef).value());
-                    if (cmp != 0) {
-                        same_row = false;
-                    }
+    bool same_row = true; // undefined if either has_old_row or has_new_row are false
+    for (const auto& u : updatable_view_key_cols) {
+        if (u.before.has_value()) {
+            if (u.after.has_value()) {
+                if (compare_unsigned(u.before.get_value(), u.after.get_value()) != 0) {
+                    same_row = false;
                }
            } else {
-                has_old_row = false;
+                has_new_row = false;
            }
        } else {
            has_old_row = false;
-        }
-        if (!after || !after->as_atomic_cell(cdef).is_live()) {
-            has_new_row = false;
+            if (!u.after.has_value()) {
+                has_new_row = false;
+            }
        }
    }
+
+    // If has_new_row, calculate a row marker for this view row - i.e., a
+    // timestamp and ttl - based on those of the updatable view key column
+    // (or, in an Alternator-only extension, more than one).
+    row_marker new_row_rm; // only set if has_new_row
+    if (has_new_row) {
+        // Note:
+        // 1. By reaching here we know that updatable_view_key_cols has at
+        //    least one member (in CQL, it's always one, in Alternator it
+        //    may be two).
+        // 2. Because has_new_row, we know all elements in that array have
+        //    after.has_value() true, so we can use after.get_ts() et al.
+        api::timestamp_type new_row_ts = updatable_view_key_cols[0].after.get_ts();
+        // This is the Alternator-only support for *two* regular base columns
+        // that become view key columns. The timestamp we use is the *maximum*
+        // of the two key columns, as explained in pull-request #17172.
+        if (updatable_view_key_cols.size() > 1) {
+            auto second_ts = updatable_view_key_cols[1].after.get_ts();
+            new_row_ts = std::max(new_row_ts, second_ts);
+            // Alternator isn't supposed to have more than two updatable view key columns!
+            if (updatable_view_key_cols.size() != 2) [[unlikely]] {
+                utils::on_internal_error(format("Unexpected updatable_view_key_col length {}", updatable_view_key_cols.size()));
+            }
+        }
+        // We assume that either updatable_view_key_cols has just one column
+        // (the only situation allowed in CQL) or if there is more than one
+        // they have the same expiry information (in Alternator, there is
+        // never a CQL TTL set).
+        new_row_rm =  row_marker(new_row_ts, updatable_view_key_cols[0].after.get_ttl(), updatable_view_key_cols[0].after.get_expiry());
+    }
+
    if (has_old_row) {
+        // As explained in #19977, when there is one updatable_view_key_cols
+        // (the only case allowed in CQL) the deletion timestamp is before's
+        // timestamp. As explained in #17119, if there are two of them (only
+        // possible in Alternator), we take the maximum.
+        // Note:
+        // 1. By reaching here we know that updatable_view_key_cols has at
+        //    least one member (in CQL, it's always one, in Alternator it
+        //    may be two).
+        // 2. Because has_old_row, we know all elements in that array have
+        //    before.has_value() true, so we can use before.get_ts().
+        auto old_row_ts = updatable_view_key_cols[0].before.get_ts();
+        if (updatable_view_key_cols.size() > 1) {
+            // This is the Alternator-only support for two regular base
+            // columns that become view key columns. See the explanation in
+            // the comment above view_updates::generate_update().
+            auto second_ts = updatable_view_key_cols[1].before.get_ts();
+            old_row_ts = std::max(old_row_ts, second_ts);
+            // Alternator isn't supposed to have more than two updatable view key columns!
+            if (updatable_view_key_cols.size() != 2) [[unlikely]] {
+                utils::on_internal_error(format("Unexpected updatable_view_key_col length {}", updatable_view_key_cols.size()));
+            }
+        }
        if (has_new_row) {
            if (same_row) {
-                update_entry(db, base_key, update, *existing, now);
+                update_entry(db, base_key, update, *existing, now, new_row_rm);
            } else {
-                // This code doesn't work if the old and new view row have the
-                // same key, because if they do we get both data and tombstone
-                // for the same timestamp (now) and the tombstone wins. This
-                // is why we need the "same_row" case above - it's not just a
-                // performance optimization.
-                delete_old_entry(db, base_key, *existing, update, now);
-                create_entry(db, base_key, update, now);
+                // The following code doesn't work if the old and new view row
+                // have the same key, because if they do we can get both data
+                // and tombstone for the same timestamp and the tombstone
+                // wins. This is why we need the "same_row" case above - it's
+                // not just a performance optimization.
+                delete_old_entry(db, base_key, *existing, update, now, old_row_ts);
+                create_entry(db, base_key, update, now, new_row_rm);
            }
        } else {
-            delete_old_entry(db, base_key, *existing, update, now);
+            delete_old_entry(db, base_key, *existing, update, now, old_row_ts);
        }
    } else if (has_new_row) {
-        create_entry(db, base_key, update, now);
+        create_entry(db, base_key, update, now, new_row_rm);
    }
+
 }

 bool view_updates::is_partition_key_permutation_of_base_partition_key() const {
@@ -1605,17 +1649,12 @@ view_update_builder make_view_update_builder(
        data_dictionary::database db,
        const replica::table& base_table,
        const schema_ptr& base,
-        std::vector<view_and_base>&& views_to_update,
+        std::vector<view_ptr>&& views_to_update,
        mutation_reader&& updates,
        mutation_reader_opt&& existings,
        gc_clock::time_point now) {
-    auto vs = views_to_update | std::views::transform([&] (view_and_base v) {
-        if (base->version() != v.base->base_schema()->version()) {
-            on_internal_error(vlogger, format("Schema version used for view updates ({}) does not match the current"
-                                              " base schema version of the view ({}) for view {}.{} of {}.{}",
-                base->version(), v.base->base_schema()->version(), v.view->ks_name(), v.view->cf_name(), base->ks_name(), base->cf_name()));
-        }
-        return view_updates(std::move(v));
+    auto vs = views_to_update | std::views::transform([&] (view_ptr v) {
+        return view_updates(std::move(v), base);
    }) | std::ranges::to<std::vector<view_updates>>();
    return view_update_builder(std::move(db), base_table, base, std::move(vs), std::move(updates), std::move(existings), now);
 }
@@ -1624,18 +1663,18 @@ future<query::clustering_row_ranges> calculate_affected_clustering_ranges(data_d
        const schema& base,
        const dht::decorated_key& key,
        const mutation_partition& mp,
-        const std::vector<view_and_base>& views) {
+        const std::vector<view_ptr>& views) {
    utils::chunked_vector<interval<clustering_key_prefix_view>> row_ranges;
    utils::chunked_vector<interval<clustering_key_prefix_view>> view_row_ranges;
    clustering_key_prefix_view::tri_compare cmp(base);
    if (mp.partition_tombstone() || !mp.row_tombstones().empty()) {
        for (auto&& v : views) {
            // FIXME: #2371
-            if (v.view->view_info()->select_statement(db).get_restrictions()->has_unrestricted_clustering_columns()) {
+            if (v->view_info()->select_statement(db).get_restrictions()->has_unrestricted_clustering_columns()) {
                view_row_ranges.push_back(interval<clustering_key_prefix_view>::make_open_ended_both_sides());
                break;
            }
-            for (auto&& r : v.view->view_info()->partition_slice(db).default_row_ranges()) {
+            for (auto&& r : v->view_info()->partition_slice(db).default_row_ranges()) {
                view_row_ranges.push_back(r.transform(std::mem_fn(&clustering_key_prefix::view)));
                co_await coroutine::maybe_yield();
            }
@@ -1683,7 +1722,7 @@ future<query::clustering_row_ranges> calculate_affected_clustering_ranges(data_d
    co_return result_ranges;
 }

-bool needs_static_row(const mutation_partition& mp, const std::vector<view_and_base>& views) {
+bool needs_static_row(const mutation_partition& mp, const std::vector<view_ptr>& views) {
    // TODO: We could also check whether any of the views need static rows
    // and return false if none of them do
    return mp.partition_tombstone() || !mp.static_row().empty();
@@ -2481,9 +2520,16 @@ void view_builder::on_create_view(const sstring& ks_name, const sstring& view_na
            // threshold.
          return initialize_reader_at_current_token(step).then([this, view, &step] () mutable {
            return add_new_view(view, step).then_wrapped([this, view] (future<>&& f) {
-                if (f.failed()) {
-                    vlogger.error("Error setting up view for building {}.{}: {}", view->ks_name(), view->cf_name(), f.get_exception());
+                try {
+                    f.get();
+                } catch (abort_requested_exception&) {
+                    vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
+                } catch (raft::request_aborted&) {
+                    vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
+                } catch (...) {
+                    vlogger.error("Error setting up view for building {}.{}: {}", view->ks_name(), view->cf_name(), std::current_exception());
                }
+
                // Waited on indirectly in stop().
                (void)_build_step.trigger();
            });
@@ -2954,13 +3000,12 @@ public:
        if (!_fragments.empty()) {
            _fragments.emplace_front(*_step.reader.schema(), _builder._permit, partition_start(_step.current_key, tombstone()));
            auto base_schema = _step.base->schema();
-            auto views = with_base_info_snapshot(_views_to_build);
            auto reader = make_mutation_reader_from_fragments(_step.reader.schema(), _builder._permit, std::move(_fragments));
            auto close_reader = defer([&reader] { reader.close().get(); });
            reader.upgrade_schema(base_schema);
            _gen->populate_views(
                    *_step.base,
-                    std::move(views),
+                    _views_to_build,
                    _step.current_token(),
                    std::move(reader),
                    _now).get();
@@ -2972,6 +3017,7 @@ public:

    stop_iteration consume_end_of_partition() {
        inject_failure("view_builder_consume_end_of_partition");
+        utils::get_local_injector().inject("view_builder_consume_end_of_partition_delay", utils::wait_for_message(std::chrono::seconds(60))).get();
        flush_fragments();
        return stop_iteration(_step.build_status.empty());
    }
@@ -2987,14 +3033,11 @@ public:
                          _step.base->schema()->cf_name(), _step.current_token(), view_names);
        }
        if (_step.reader.is_end_of_stream() && _step.reader.is_buffer_empty()) {
-            if (_step.current_key.key().is_empty()) {
-                // consumer got end-of-stream without consuming a single partition
-                vlogger.debug("Reader didn't produce anything, marking views as built");
-                while (!_step.build_status.empty()) {
-                    _built_views.views.push_back(std::move(_step.build_status.back()));
-                    _step.build_status.pop_back();
-                }
-            }
+            // Before going back to the minimum token, advance current_key to the end
+            // of the range and check for built views in that range.
+            _step.current_key = {_step.prange.end().value_or(dht::ring_position::max()).value().token(), partition_key::make_empty()};
+            check_for_built_views();
+
            _step.current_key = {dht::minimum_token(), partition_key::make_empty()};
            for (auto&& vs : _step.build_status) {
                vs.next_token = dht::minimum_token();
@@ -3008,6 +3051,7 @@ public:

 // Called in the context of a seastar::thread.
 void view_builder::execute(build_step& step, exponential_backoff_retry r) {
+    inject_failure("dont_start_build_step");
    gc_clock::time_point now = gc_clock::now();
    auto compaction_state = make_lw_shared<compact_for_query_state_v2>(
            *step.reader.schema(),
@@ -3041,6 +3085,7 @@ void view_builder::execute(build_step& step, exponential_backoff_retry r) {
    seastar::when_all_succeed(bookkeeping_ops.begin(), bookkeeping_ops.end()).handle_exception([] (std::exception_ptr ep) {
        vlogger.warn("Failed to update materialized view bookkeeping ({}), continuing anyway.", ep);
    }).get();
+    utils::get_local_injector().inject("delay_finishing_build_step", utils::wait_for_message(60s)).get();
 }

 future<> view_builder::mark_as_built(view_ptr view) {
@@ -3223,12 +3268,6 @@ view_updating_consumer::view_updating_consumer(view_update_generator& gen, schem
    })
 { }

-std::vector<db::view::view_and_base> with_base_info_snapshot(std::vector<view_ptr> vs) {
-    return vs | std::views::transform([] (const view_ptr& v) {
-        return db::view::view_and_base{v, v->view_info()->base_info()};
-    }) | std::ranges::to<std::vector>();
-}
-
 delete_ghost_rows_visitor::delete_ghost_rows_visitor(service::storage_proxy& proxy, service::query_state& state, view_ptr view, db::timeout_clock::duration timeout_duration)
        : _proxy(proxy)
        , _state(state)
@@ -3250,6 +3289,7 @@ void delete_ghost_rows_visitor::accept_new_row(const clustering_key& ck, const q
    auto view_exploded_ck = ck.explode();
    std::vector<bytes> base_exploded_pk(_base_schema->partition_key_size());
    std::vector<bytes> base_exploded_ck(_base_schema->clustering_key_size());
+    std::map<const column_definition*, bytes> view_key_cols_not_in_base_key;
    for (const column_definition& view_cdef : _view->all_columns()) {
        const column_definition* base_cdef = _base_schema->get_column_definition(view_cdef.name());
        if (base_cdef) {
@@ -3258,6 +3298,8 @@ void delete_ghost_rows_visitor::accept_new_row(const clustering_key& ck, const q
                base_exploded_pk[base_cdef->id] = view_exploded_key[view_cdef.id];
            } else if (base_cdef->is_clustering_key()) {
                base_exploded_ck[base_cdef->id] = view_exploded_key[view_cdef.id];
+            } else if (!base_cdef->is_computed() && view_cdef.is_primary_key()) {
+                view_key_cols_not_in_base_key[base_cdef] = view_exploded_key[view_cdef.id];
            }
        }
    }
@@ -3265,22 +3307,44 @@ void delete_ghost_rows_visitor::accept_new_row(const clustering_key& ck, const q
    clustering_key base_ck = clustering_key::from_exploded(base_exploded_ck);

    dht::partition_range_vector partition_ranges({dht::partition_range::make_singular(dht::decorate_key(*_base_schema, base_pk))});
-    auto selection = cql3::selection::selection::for_columns(_base_schema, std::vector<const column_definition*>({&_base_schema->partition_key_columns().front()}));
+    auto view_key_cols_not_in_base_key_cdefs = view_key_cols_not_in_base_key | std::views::keys | std::ranges::to<std::vector<const column_definition*>>();
+    auto selection = cql3::selection::selection::for_columns(_base_schema,
+        view_key_cols_not_in_base_key.empty() ? std::vector<const column_definition*>({&_base_schema->partition_key_columns().front()}) : view_key_cols_not_in_base_key_cdefs);

    std::vector<query::clustering_range> bounds{query::clustering_range::make_singular(base_ck)};
-    query::partition_slice partition_slice(std::move(bounds), {},  {}, selection->get_query_options());
+    utils::small_vector<column_id, 8> view_key_col_ids;
+    for (const auto& [col_def, _] : view_key_cols_not_in_base_key) {
+        view_key_col_ids.push_back(col_def->id);
+    }
+    query::partition_slice partition_slice(std::move(bounds), {}, std::move(view_key_col_ids), selection->get_query_options());
    auto command = ::make_lw_shared<query::read_command>(_base_schema->id(), _base_schema->version(), partition_slice,
            _proxy.get_max_result_size(partition_slice), query::tombstone_limit(_proxy.get_tombstone_limit()));
    auto timeout = db::timeout_clock::now() + _timeout_duration;
    service::storage_proxy::coordinator_query_options opts{timeout, _state.get_permit(), _state.get_client_state(), _state.get_trace_state()};
    auto base_qr = _proxy.query(_base_schema, command, std::move(partition_ranges), db::consistency_level::ALL, opts).get();
    query::result& result = *base_qr.query_result;
-    if (result.row_count().value_or(0) == 0) {
+    auto delete_ghost_row = [&]() {
        mutation m(_view, *_view_pk);
        auto& row = m.partition().clustered_row(*_view, ck);
        row.apply(tombstone(api::new_timestamp(), gc_clock::now()));
        timeout = db::timeout_clock::now() + _timeout_duration;
        _proxy.mutate({m}, db::consistency_level::ALL, timeout, _state.get_trace_state(), empty_service_permit(), db::allow_per_partition_rate_limit::no).get();
+    };
+    if (result.row_count().value_or(0) == 0) {
+        delete_ghost_row();
+    } else if (!view_key_cols_not_in_base_key.empty()) {
+        if (result.row_count().value_or(0) != 1) {
+            on_internal_error(vlogger, format("Got multiple base rows corresponding to a single view row when pruning {}.{}", _view->ks_name(), _view->cf_name()));
+        }
+        auto results = query::result_set::from_raw_result(_base_schema, partition_slice, result);
+        auto& base_row = results.row(0);
+        for (const auto& [col_def, col_val] : view_key_cols_not_in_base_key) {
+            const data_value* base_val = base_row.get_data_value(col_def->name_as_text());
+            if (!base_val || base_val->is_null() || col_val != base_val->serialize_nonnull()) {
+                delete_ghost_row();
+                break;
+            }
+        }
    }
 }

--- a/db/view/view.hh
+++ b/db/view/view.hh
@@ -15,6 +15,7 @@
 #include "mutation/frozen_mutation.hh"
 #include "data_dictionary/data_dictionary.hh"
 #include "locator/abstract_replication_strategy.hh"
+#include "db/view/base_info.hh"

 class frozen_mutation_and_schema;

@@ -28,54 +29,6 @@ namespace view {

 class stats;

-// Part of the view description which depends on the base schema version.
-//
-// This structure may change even though the view schema doesn't change, so
-// it needs to live outside view_ptr.
-struct base_dependent_view_info {
-private:
-    schema_ptr _base_schema;
-    // Id of a regular base table column included in the view's PK, if any.
-    // Scylla views only allow one such column, alternator can have up to two.
-    std::vector<column_id> _base_regular_columns_in_view_pk;
-    std::vector<column_id> _base_static_columns_in_view_pk;
-    // For tracing purposes, if the view is out of sync with its base table
-    // and there exists a column which is not in base, its name is stored
-    // and added to debug messages.
-    std::optional<bytes> _column_missing_in_base = {};
-public:
-    const std::vector<column_id>& base_regular_columns_in_view_pk() const;
-    const std::vector<column_id>& base_static_columns_in_view_pk() const;
-    const schema_ptr& base_schema() const;
-
-    // Indicates if the view hase pk columns which are not part of the base
-    // pk, it seems that !base_non_pk_columns_in_view_pk.empty() is the same,
-    // but actually there are cases where we can compute this boolean without
-    // succeeding to reliably build the former.
-    const bool has_base_non_pk_columns_in_view_pk;
-
-    // If base_non_pk_columns_in_view_pk couldn't reliably be built, this base
-    // info can't be used for computing view updates, only for reading the materialized
-    // view.
-    const bool use_only_for_reads;
-
-    // A constructor for a base info that can facilitate reads and writes from the materialized view.
-    base_dependent_view_info(schema_ptr base_schema,
-            std::vector<column_id>&& base_regular_columns_in_view_pk,
-            std::vector<column_id>&& base_static_columns_in_view_pk);
-    // A constructor for a base info that can facilitate only reads from the materialized view.
-    base_dependent_view_info(bool has_base_non_pk_columns_in_view_pk, std::optional<bytes>&& column_missing_in_base);
-};
-
-// Immutable snapshot of view's base-schema-dependent part.
-using base_info_ptr = lw_shared_ptr<const base_dependent_view_info>;
-
-// Snapshot of the view schema and its base-schema-dependent part.
-struct view_and_base {
-    view_ptr view;
-    base_info_ptr base;
-};
-
 // An immutable representation of a clustering or static row of the base table.
 struct clustering_or_static_row {
 private:
@@ -207,18 +160,15 @@ class view_updates final {
    view_ptr _view;
    const view_info& _view_info;
    schema_ptr _base;
-    base_info_ptr _base_info;
+    const base_dependent_view_info& _base_info;
+    // Id of a regular base table column included in the view's PK, if any.
+    // Scylla views only allow one such column, alternator can have up to two.
+    std::vector<column_id> _base_regular_columns_in_view_pk;
+    std::vector<column_id> _base_static_columns_in_view_pk;
    std::unordered_map<partition_key, mutation_partition, partition_key::hashing, partition_key::equality> _updates;
    size_t _op_count = 0;
 public:
-    explicit view_updates(view_and_base vab)
-            : _view(std::move(vab.view))
-            , _view_info(*_view->view_info())
-            , _base(vab.base->base_schema())
-            , _base_info(vab.base)
-            , _updates(8, partition_key::hashing(*_view), partition_key::equality(*_view))
-    {
-    }
+    explicit view_updates(view_ptr v, schema_ptr b);

    future<> move_to(utils::chunked_vector<frozen_mutation_and_schema>& mutations);

@@ -240,10 +190,10 @@ private:
    };
    std::vector<view_row_entry> get_view_rows(const partition_key& base_key, const clustering_or_static_row& update, const std::optional<clustering_or_static_row>& existing, row_tombstone update_tomb);
    bool can_skip_view_updates(const clustering_or_static_row& update, const clustering_or_static_row& existing) const;
-    void create_entry(data_dictionary::database db, const partition_key& base_key, const clustering_or_static_row& update, gc_clock::time_point now);
-    void delete_old_entry(data_dictionary::database db, const partition_key& base_key, const clustering_or_static_row& existing, const clustering_or_static_row& update, gc_clock::time_point now);
-    void do_delete_old_entry(const partition_key& base_key, const clustering_or_static_row& existing, const clustering_or_static_row& update, gc_clock::time_point now);
-    void update_entry(data_dictionary::database db, const partition_key& base_key, const clustering_or_static_row& update, const clustering_or_static_row& existing, gc_clock::time_point now);
+    void create_entry(data_dictionary::database db, const partition_key& base_key, const clustering_or_static_row& update, gc_clock::time_point now, row_marker update_marker);
+    void delete_old_entry(data_dictionary::database db, const partition_key& base_key, const clustering_or_static_row& existing, const clustering_or_static_row& update, gc_clock::time_point now, api::timestamp_type deletion_ts);
+    void do_delete_old_entry(const partition_key& base_key, const clustering_or_static_row& existing, const clustering_or_static_row& update, gc_clock::time_point now, api::timestamp_type deletion_ts);
+    void update_entry(data_dictionary::database db, const partition_key& base_key, const clustering_or_static_row& update, const clustering_or_static_row& existing, gc_clock::time_point now, row_marker update_marker);
    void update_entry_for_computed_column(const partition_key& base_key, const clustering_or_static_row& update, const std::optional<clustering_or_static_row>& existing, gc_clock::time_point now);
 };

@@ -308,7 +258,7 @@ view_update_builder make_view_update_builder(
        data_dictionary::database db,
        const replica::table& base_table,
        const schema_ptr& base_schema,
-        std::vector<view_and_base>&& views_to_update,
+        std::vector<view_ptr>&& views_to_update,
        mutation_reader&& updates,
        mutation_reader_opt&& existings,
        gc_clock::time_point now);
@@ -318,9 +268,9 @@ future<query::clustering_row_ranges> calculate_affected_clustering_ranges(
        const schema& base,
        const dht::decorated_key& key,
        const mutation_partition& mp,
-        const std::vector<view_and_base>& views);
+        const std::vector<view_ptr>& views);

-bool needs_static_row(const mutation_partition& mp, const std::vector<view_and_base>& views);
+bool needs_static_row(const mutation_partition& mp, const std::vector<view_ptr>& views);

 // Whether this node and shard should generate and send view updates for the given token.
 // Checks that the node is one of the replicas (not a pending replicas), and is ready for reads.
@@ -343,13 +293,6 @@ size_t memory_usage_of(const frozen_mutation_and_schema& mut);
 */
 void create_virtual_column(schema_builder& builder, const bytes& name, const data_type& type);

-/**
- * Converts a collection of view schema snapshots into a collection of
- * view_and_base objects, which are snapshots of both the view schema
- * and the base-schema-dependent part of view description.
- */
-std::vector<view_and_base> with_base_info_snapshot(std::vector<view_ptr>);
-
 }

 }
--- a/db/view/view_update_generator.cc
+++ b/db/view/view_update_generator.cc
@@ -143,7 +143,11 @@ future<> view_update_generator::start() {
                    // Exploit the fact that sstables in the staging directory
                    // are usually non-overlapping and use a partitioned set for
                    // the read.
-                    auto ssts = make_lw_shared<sstables::sstable_set>(sstables::make_partitioned_sstable_set(s, false));
+                    // With tablets, it doesn't matter that the full token range is fed into the
+                    // partitioned set, since there will usually be one sstable to process per
+                    // tablet, and sstables of different tablets are disjoint.
+                    auto token_range = dht::token_range::make(dht::first_token(), dht::last_token());
+                    auto ssts = make_lw_shared<sstables::sstable_set>(sstables::make_partitioned_sstable_set(s, std::move(token_range)));
                    for (auto& sst : sstables) {
                        ssts->insert(sst);
                        input_size += sst->data_size();
@@ -331,7 +335,7 @@ static size_t memory_usage_of(const utils::chunked_vector<frozen_mutation_and_sc
 * @return a future that resolves when the updates have been acknowledged by the view replicas
 */
 future<> view_update_generator::populate_views(const replica::table& table,
-        std::vector<view_and_base> views,
+        std::vector<view_ptr> views,
        dht::token base_token,
        mutation_reader&& reader,
        gc_clock::time_point now) {
@@ -404,7 +408,7 @@ struct view_update_generation_timeout_exception : public seastar::timed_out_erro
 future<> view_update_generator::generate_and_propagate_view_updates(const replica::table& table,
        const schema_ptr& base,
        reader_permit permit,
-        std::vector<view_and_base>&& views,
+        std::vector<view_ptr>&& views,
        mutation&& m,
        mutation_reader_opt existings,
        tracing::trace_state_ptr tr_state,
--- a/Show More
+++ b/Show More