Fix token kind comparison in decorated_key::tri_compare

When comparing decorated_key with ring_position, we need to account for the token kind. decorated_key tokens are always token_kind::key, but ring_position tokens can be before_all_keys or after_all_keys. The previous version incorrectly compared only _data fields, which would produce wrong results when the ring_position token had a different kind.

Co-authored-by: tgrabiec <283695+tgrabiec@users.noreply.github.com>
Replace dht::token with int64_t in decorated_key
2026-04-20 08:30:35 +00:00 · 2026-01-29 13:58:03 +00:00 · 2026-01-29 13:54:46 +00:00 · 2026-01-29 13:49:15 +00:00 · 2026-01-29 08:25:17 +02:00 · 2026-01-28 20:49:19 +01:00
289 changed files with 3606 additions and 3892 deletions
--- a/.github/workflows/call_backport_with_jira.yaml
+++ b/.github/workflows/call_backport_with_jira.yaml
@@ -1,53 +0,0 @@
-name: Backport with Jira Integration
-
-on:
-  push:
-    branches:
-      - master
-      - next-*.*
-      - branch-*.*
-  pull_request_target:
-    types: [labeled, closed]
-    branches: 
-      - master
-      - next
-      - next-*.*
-      - branch-*.*
-
-jobs:
-  backport-on-push:
-    if: github.event_name == 'push'
-    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
-    with:
-      event_type: 'push'
-      base_branch: ${{ github.ref }}
-      commits: ${{ github.event.before }}..${{ github.sha }}
-    secrets:
-      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
-      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
-
-  backport-on-label:
-    if: github.event_name == 'pull_request_target' && github.event.action == 'labeled'
-    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
-    with:
-      event_type: 'labeled'
-      base_branch: refs/heads/${{ github.event.pull_request.base.ref }}
-      pull_request_number: ${{ github.event.pull_request.number }}
-      head_commit: ${{ github.event.pull_request.base.sha }}
-      label_name: ${{ github.event.label.name }}
-      pr_state: ${{ github.event.pull_request.state }}
-    secrets:
-      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
-      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
-
-  backport-chain:
-    if: github.event_name == 'pull_request_target' && github.event.action == 'closed' && github.event.pull_request.merged == true
-    uses: scylladb/github-automation/.github/workflows/backport-with-jira.yaml@main
-    with:
-      event_type: 'chain'
-      base_branch: refs/heads/${{ github.event.pull_request.base.ref }}
-      pull_request_number: ${{ github.event.pull_request.number }}
-      pr_body: ${{ github.event.pull_request.body }}
-    secrets:
-      gh_token: ${{ secrets.AUTO_BACKPORT_TOKEN }}
-      jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
--- a/.github/workflows/iwyu.yaml
+++ b/.github/workflows/iwyu.yaml
@@ -14,7 +14,8 @@ env:
  CLEANER_DIRS: test/unit exceptions alternator api auth cdc compaction db dht gms index lang message mutation mutation_writer node_ops raft redis replica service
  SEASTAR_BAD_INCLUDE_OUTPUT_PATH: build/seastar-bad-include.log

-permissions: {}
+permissions:
+  contents: read

 # cancel the in-progress run upon a repush
 concurrency:
--- a/.github/workflows/trigger-scylla-ci.yaml
+++ b/.github/workflows/trigger-scylla-ci.yaml
@@ -9,53 +9,16 @@ on:

 jobs:
  trigger-jenkins:
-    if: (github.event_name == 'issue_comment' && github.event.comment.user.login != 'scylladbbot') || github.event.label.name == 'conflicts'
+    if: (github.event.comment.user.login != 'scylladbbot' && contains(github.event.comment.body, '@scylladbbot') && contains(github.event.comment.body, 'trigger-ci')) || github.event.label.name == 'conflicts'
    runs-on: ubuntu-latest
    steps:
-      - name: Verify Org Membership
-        id: verify_author
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        shell: bash
-        run: |
-          if [[ "${{ github.event_name }}" == "pull_request_target" ]]; then
-            AUTHOR="${{ github.event.pull_request.user.login }}"
-          else
-            AUTHOR="${{ github.event.comment.user.login }}"
-          fi
-          ORG="scylladb"
-          if gh api "/orgs/${ORG}/members/${AUTHOR}" --silent 2>/dev/null; then
-            echo "member=true" >> $GITHUB_OUTPUT
-          else
-            echo "::warning::${AUTHOR} is not a member of ${ORG}; skipping CI trigger."
-            echo "member=false" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Validate Comment Trigger
-        if: github.event_name == 'issue_comment'
-        id: verify_comment
-        shell: bash
-        run: |
-          BODY=$(cat << 'EOF'
-          ${{ github.event.comment.body }}
-          EOF
-          )
-          CLEAN_BODY=$(echo "$BODY" | grep -v '^[[:space:]]*>')
-
-          if echo "$CLEAN_BODY" | grep -qi '@scylladbbot' && echo "$CLEAN_BODY" | grep -qi 'trigger-ci'; then
-            echo "trigger=true" >> $GITHUB_OUTPUT
-          else
-            echo "trigger=false" >> $GITHUB_OUTPUT
-          fi
-
      - name: Trigger Scylla-CI-Route Jenkins Job
-        if: steps.verify_author.outputs.member == 'true' && (github.event_name == 'pull_request_target' || steps.verify_comment.outputs.trigger == 'true')
        env:
          JENKINS_USER: ${{ secrets.JENKINS_USERNAME }}
          JENKINS_API_TOKEN: ${{ secrets.JENKINS_TOKEN }}
          JENKINS_URL: "https://jenkins.scylladb.com"
-          PR_NUMBER: "${{ github.event.issue.number || github.event.pull_request.number }}"
-          PR_REPO_NAME: "${{ github.event.repository.full_name }}"
        run: |
+          PR_NUMBER=${{ github.event.issue.number }}
+          PR_REPO_NAME=${{ github.event.repository.full_name }}
          curl -X POST "$JENKINS_URL/job/releng/job/Scylla-CI-Route/buildWithParameters?PR_NUMBER=$PR_NUMBER&PR_REPO_NAME=$PR_REPO_NAME" \
-            --user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail
+          --user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail -i -v
--- a/2
+++ b/2
@@ -78,7 +78,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=2026.1.0
+VERSION=2026.2.0-dev

 if test -f version
 then
--- a/alternator/streams.cc
+++ b/alternator/streams.cc
@@ -491,7 +491,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl

    if (!opts.enabled()) {
        rjson::add(ret, "StreamDescription", std::move(stream_desc));
-        return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
+        co_return rjson::print(std::move(ret));
    }

    // TODO: label
@@ -502,123 +502,121 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
    // filter out cdc generations older than the table or now() - cdc::ttl (typically dynamodb_streams_max_window - 24h)
    auto low_ts = std::max(as_timepoint(schema->id()), db_clock::now() - ttl);

-    return _sdks.cdc_get_versioned_streams(low_ts, { normal_token_owners }).then([db, shard_start, limit, ret = std::move(ret), stream_desc = std::move(stream_desc)] (std::map<db_clock::time_point, cdc::streams_version> topologies) mutable {
+    std::map<db_clock::time_point, cdc::streams_version> topologies = co_await _sdks.cdc_get_versioned_streams(low_ts, { normal_token_owners });
+    auto e = topologies.end();
+    auto prev = e;
+    auto shards = rjson::empty_array();

-        auto e = topologies.end();
-        auto prev = e;
-        auto shards = rjson::empty_array();
+    std::optional<shard_id> last;

-        std::optional<shard_id> last;
+    auto i = topologies.begin();
+    // if we're a paged query, skip to the generation where we left off.
+    if (shard_start) {
+        i = topologies.find(shard_start->time);
+    }

-        auto i = topologies.begin();
-        // if we're a paged query, skip to the generation where we left of.
-        if (shard_start) {
-            i = topologies.find(shard_start->time);
-        }
+    // for parent-child stuff we need id:s to be sorted by token
+    // (see explanation above) since we want to find closest
+    // token boundary when determining parent.
+    // #7346 - we processed and searched children/parents in
+    // stored order, which is not necessarily token order,
+    // so the finding of "closest" token boundary (using upper bound)
+    // could give somewhat weird results.
+    static auto token_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
+        return id1.token() < id2.token();
+    };

-        // for parent-child stuff we need id:s to be sorted by token
-        // (see explanation above) since we want to find closest
-        // token boundary when determining parent.
-        // #7346 - we processed and searched children/parents in
-        // stored order, which is not necessarily token order,
-        // so the finding of "closest" token boundary (using upper bound)
-        // could give somewhat weird results.
-        static auto token_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
-            return id1.token() < id2.token();
-        };
+    // #7409 - shards must be returned in lexicographical order,
+    // normal bytes compare is string_traits<int8_t>::compare.
+    // thus bytes 0x8000 is less than 0x0000. By doing unsigned
+    // compare instead we inadvertently will sort in string lexical.
+    static auto id_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
+        return compare_unsigned(id1.to_bytes(), id2.to_bytes()) < 0;
+    };
+
+    // need a prev even if we are skipping stuff
+    if (i != topologies.begin()) {
+        prev = std::prev(i);
+    }
+
+    for (; limit > 0 && i != e; prev = i, ++i) {
+        auto& [ts, sv] = *i;
+
+        last = std::nullopt;
+
+        auto lo = sv.streams.begin();
+        auto end = sv.streams.end();

        // #7409 - shards must be returned in lexicographical order,
-        // normal bytes compare is string_traits<int8_t>::compare.
-        // thus bytes 0x8000 is less than 0x0000. By doing unsigned
-        // compare instead we inadvertently will sort in string lexical.
-        static auto id_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
-            return compare_unsigned(id1.to_bytes(), id2.to_bytes()) < 0;
-        };
+        std::sort(lo, end, id_cmp);

-        // need a prev even if we are skipping stuff
-        if (i != topologies.begin()) {
-            prev = std::prev(i);
+        if (shard_start) {
+            // find next shard position
+            lo = std::upper_bound(lo, end, shard_start->id, id_cmp);
+            shard_start = std::nullopt;
        }

-        for (; limit > 0 && i != e; prev = i, ++i) {
-            auto& [ts, sv] = *i;
+        if (lo != end && prev != e) {
+            // We want older stuff sorted in token order so we can find matching
+            // token range when determining parent shard.
+            std::stable_sort(prev->second.streams.begin(), prev->second.streams.end(), token_cmp);
+        }
+
+        auto expired = [&]() -> std::optional<db_clock::time_point> {
+            auto j = std::next(i);
+            if (j == e) {
+                return std::nullopt;
+            }
+            // add this so we sort of match potential 
+            // sequence numbers in get_records result.
+            return j->first + confidence_interval(db);
+        }();
+
+        while (lo != end) {
+            auto& id = *lo++;
+
+            auto shard = rjson::empty_object();
+
+            if (prev != e) {
+                auto& pids = prev->second.streams;
+                auto pid = std::upper_bound(pids.begin(), pids.end(), id.token(), [](const dht::token& t, const cdc::stream_id& id) {
+                    return t < id.token();
+                });
+                if (pid != pids.begin()) {
+                    pid = std::prev(pid);
+                }
+                if (pid != pids.end()) {
+                    rjson::add(shard, "ParentShardId", shard_id(prev->first, *pid));
+                }
+            }
+
+            last.emplace(ts, id);
+            rjson::add(shard, "ShardId", *last);
+            auto range = rjson::empty_object();
+            rjson::add(range, "StartingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(ts.time_since_epoch())));
+            if (expired) {
+                rjson::add(range, "EndingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(expired->time_since_epoch())));
+            }
+
+            rjson::add(shard, "SequenceNumberRange", std::move(range));
+            rjson::push_back(shards, std::move(shard));
+            
+            if (--limit == 0) {
+                break;
+            }

            last = std::nullopt;
-
-            auto lo = sv.streams.begin();
-            auto end = sv.streams.end();
-
-            // #7409 - shards must be returned in lexicographical order,
-            std::sort(lo, end, id_cmp);
-
-            if (shard_start) {
-                // find next shard position
-                lo = std::upper_bound(lo, end, shard_start->id, id_cmp);
-                shard_start = std::nullopt;
-            }
-
-            if (lo != end && prev != e) {
-                // We want older stuff sorted in token order so we can find matching
-                // token range when determining parent shard.
-                std::stable_sort(prev->second.streams.begin(), prev->second.streams.end(), token_cmp);
-            }
-
-            auto expired = [&]() -> std::optional<db_clock::time_point> {
-                auto j = std::next(i);
-                if (j == e) {
-                    return std::nullopt;
-                }
-                // add this so we sort of match potential 
-                // sequence numbers in get_records result.
-                return j->first + confidence_interval(db);
-            }();
-
-            while (lo != end) {
-                auto& id = *lo++;
-
-                auto shard = rjson::empty_object();
-
-                if (prev != e) {
-                    auto& pids = prev->second.streams;
-                    auto pid = std::upper_bound(pids.begin(), pids.end(), id.token(), [](const dht::token& t, const cdc::stream_id& id) {
-                        return t < id.token();
-                    });
-                    if (pid != pids.begin()) {
-                        pid = std::prev(pid);
-                    }
-                    if (pid != pids.end()) {
-                        rjson::add(shard, "ParentShardId", shard_id(prev->first, *pid));
-                    }
-                }
-
-                last.emplace(ts, id);
-                rjson::add(shard, "ShardId", *last);
-                auto range = rjson::empty_object();
-                rjson::add(range, "StartingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(ts.time_since_epoch())));
-                if (expired) {
-                    rjson::add(range, "EndingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(expired->time_since_epoch())));
-                }
-
-                rjson::add(shard, "SequenceNumberRange", std::move(range));
-                rjson::push_back(shards, std::move(shard));
-                
-                if (--limit == 0) {
-                    break;
-                }
-
-                last = std::nullopt;
-            }
        }
+    }

-        if (last) {
-            rjson::add(stream_desc, "LastEvaluatedShardId", *last);
-        }
+    if (last) {
+        rjson::add(stream_desc, "LastEvaluatedShardId", *last);
+    }

-        rjson::add(stream_desc, "Shards", std::move(shards));
-        rjson::add(ret, "StreamDescription", std::move(stream_desc));
-            
-        return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
-    });
+    rjson::add(stream_desc, "Shards", std::move(shards));
+    rjson::add(ret, "StreamDescription", std::move(stream_desc));
+        
+    co_return rjson::print(std::move(ret));
 }

 enum class shard_iterator_type {
@@ -898,172 +896,169 @@ future<executor::request_return_type> executor::get_records(client_state& client
    auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice),
            query::tombstone_limit(_proxy.get_tombstone_limit()), query::row_limit(limit * mul));

-    co_return co_await _proxy.query(schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state)).then(
-            [this, schema, partition_slice = std::move(partition_slice), selection = std::move(selection), start_time = std::move(start_time), limit, key_names = std::move(key_names), attr_names = std::move(attr_names), type, iter, high_ts] (service::storage_proxy::coordinator_query_result qr) mutable {       
-        cql3::selection::result_set_builder builder(*selection, gc_clock::now());
-        query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));
+    service::storage_proxy::coordinator_query_result qr = co_await _proxy.query(schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state));
+    cql3::selection::result_set_builder builder(*selection, gc_clock::now());
+    query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));

-        auto result_set = builder.build();
-        auto records = rjson::empty_array();
+    auto result_set = builder.build();
+    auto records = rjson::empty_array();

-        auto& metadata = result_set->get_metadata();
+    auto& metadata = result_set->get_metadata();

-        auto op_index = std::distance(metadata.get_names().begin(), 
-            std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
-                return cdef->name->name() == op_column_name;
-            })
-        );
-        auto ts_index = std::distance(metadata.get_names().begin(), 
-            std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
-                return cdef->name->name() == timestamp_column_name;
-            })
-        );
-        auto eor_index = std::distance(metadata.get_names().begin(), 
-            std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
-                return cdef->name->name() == eor_column_name;
-            })
-        );
+    auto op_index = std::distance(metadata.get_names().begin(), 
+        std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
+            return cdef->name->name() == op_column_name;
+        })
+    );
+    auto ts_index = std::distance(metadata.get_names().begin(), 
+        std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
+            return cdef->name->name() == timestamp_column_name;
+        })
+    );
+    auto eor_index = std::distance(metadata.get_names().begin(), 
+        std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
+            return cdef->name->name() == eor_column_name;
+        })
+    );

-        std::optional<utils::UUID> timestamp;
-        auto dynamodb = rjson::empty_object();
-        auto record = rjson::empty_object();
-        const auto dc_name = _proxy.get_token_metadata_ptr()->get_topology().get_datacenter();
+    std::optional<utils::UUID> timestamp;
+    auto dynamodb = rjson::empty_object();
+    auto record = rjson::empty_object();
+    const auto dc_name = _proxy.get_token_metadata_ptr()->get_topology().get_datacenter();

-        using op_utype = std::underlying_type_t<cdc::operation>;
+    using op_utype = std::underlying_type_t<cdc::operation>;

-        auto maybe_add_record = [&] {
-            if (!dynamodb.ObjectEmpty()) {
-                rjson::add(record, "dynamodb", std::move(dynamodb));
-                dynamodb = rjson::empty_object();
-            }
-            if (!record.ObjectEmpty()) {
-                rjson::add(record, "awsRegion", rjson::from_string(dc_name));
-                rjson::add(record, "eventID", event_id(iter.shard.id, *timestamp));
-                rjson::add(record, "eventSource", "scylladb:alternator");
-                rjson::add(record, "eventVersion", "1.1");
-                rjson::push_back(records, std::move(record));
-                record = rjson::empty_object();
-                --limit;
-            }
-        };
+    auto maybe_add_record = [&] {
+        if (!dynamodb.ObjectEmpty()) {
+            rjson::add(record, "dynamodb", std::move(dynamodb));
+            dynamodb = rjson::empty_object();
+        }
+        if (!record.ObjectEmpty()) {
+            rjson::add(record, "awsRegion", rjson::from_string(dc_name));
+            rjson::add(record, "eventID", event_id(iter.shard.id, *timestamp));
+            rjson::add(record, "eventSource", "scylladb:alternator");
+            rjson::add(record, "eventVersion", "1.1");
+            rjson::push_back(records, std::move(record));
+            record = rjson::empty_object();
+            --limit;
+        }
+    };

-        for (auto& row : result_set->rows()) {
-            auto op = static_cast<cdc::operation>(value_cast<op_utype>(data_type_for<op_utype>()->deserialize(*row[op_index])));
-            auto ts = value_cast<utils::UUID>(data_type_for<utils::UUID>()->deserialize(*row[ts_index]));
-            auto eor = row[eor_index].has_value() ? value_cast<bool>(boolean_type->deserialize(*row[eor_index])) : false;
+    for (auto& row : result_set->rows()) {
+        auto op = static_cast<cdc::operation>(value_cast<op_utype>(data_type_for<op_utype>()->deserialize(*row[op_index])));
+        auto ts = value_cast<utils::UUID>(data_type_for<utils::UUID>()->deserialize(*row[ts_index]));
+        auto eor = row[eor_index].has_value() ? value_cast<bool>(boolean_type->deserialize(*row[eor_index])) : false;

-            if (!dynamodb.HasMember("Keys")) {
-                auto keys = rjson::empty_object();
-                describe_single_item(*selection, row, key_names, keys);
-                rjson::add(dynamodb, "Keys", std::move(keys));
-                rjson::add(dynamodb, "ApproximateCreationDateTime", utils::UUID_gen::unix_timestamp_in_sec(ts).count());
-                rjson::add(dynamodb, "SequenceNumber", sequence_number(ts));
-                rjson::add(dynamodb, "StreamViewType", type);
-                // TODO: SizeBytes
-            }
-
-            /**
-             * We merge rows with same timestamp into a single event.
-             * This is pretty much needed, because a CDC row typically
-             * encodes ~half the info of an alternator write. 
-             * 
-             * A big, big downside to how alternator records are written
-             * (i.e. CQL), is that the distinction between INSERT and UPDATE
-             * is somewhat lost/unmappable to actual eventName. 
-             * A write (currently) always looks like an insert+modify
-             * regardless whether we wrote existing record or not. 
-             * 
-             * Maybe RMW ops could be done slightly differently so 
-             * we can distinguish them here...
-             * 
-             * For now, all writes will become MODIFY.
-             * 
-             * Note: we do not check the current pre/post
-             * flags on CDC log, instead we use data to 
-             * drive what is returned. This is (afaict)
-             * consistent with dynamo streams
-             */
-            switch (op) {
-            case cdc::operation::pre_image:
-            case cdc::operation::post_image:
-            {
-                auto item = rjson::empty_object();
-                describe_single_item(*selection, row, attr_names, item, nullptr, true);
-                describe_single_item(*selection, row, key_names, item);
-                rjson::add(dynamodb, op == cdc::operation::pre_image ? "OldImage" : "NewImage", std::move(item));
-                break;
-            }
-            case cdc::operation::update:
-                rjson::add(record, "eventName", "MODIFY");
-                break;
-            case cdc::operation::insert:
-                rjson::add(record, "eventName", "INSERT");
-                break;
-            case cdc::operation::service_row_delete:
-            case cdc::operation::service_partition_delete:
-            {
-                auto user_identity = rjson::empty_object();
-                rjson::add(user_identity, "Type", "Service");
-                rjson::add(user_identity, "PrincipalId", "dynamodb.amazonaws.com");
-                rjson::add(record, "userIdentity", std::move(user_identity));
-                rjson::add(record, "eventName", "REMOVE");
-                break;
-            }
-            default:
-                rjson::add(record, "eventName", "REMOVE");
-                break;
-            }
-            if (eor) {
-                maybe_add_record();
-                timestamp = ts;
-                if (limit == 0) {
-                    break;
-                }
-            }
+        if (!dynamodb.HasMember("Keys")) {
+            auto keys = rjson::empty_object();
+            describe_single_item(*selection, row, key_names, keys);
+            rjson::add(dynamodb, "Keys", std::move(keys));
+            rjson::add(dynamodb, "ApproximateCreationDateTime", utils::UUID_gen::unix_timestamp_in_sec(ts).count());
+            rjson::add(dynamodb, "SequenceNumber", sequence_number(ts));
+            rjson::add(dynamodb, "StreamViewType", type);
+            // TODO: SizeBytes
        }

-        auto ret = rjson::empty_object();
-        auto nrecords = records.Size();
-        rjson::add(ret, "Records", std::move(records));
-
-        if (nrecords != 0) {
-            // #9642. Set next iterators threshold to > last
-            shard_iterator next_iter(iter.table, iter.shard, *timestamp, false);
-            // Note that here we unconditionally return NextShardIterator,
-            // without checking if maybe we reached the end-of-shard. If the
-            // shard did end, then the next read will have nrecords == 0 and
-            // will notice end end of shard and not return NextShardIterator.
-            rjson::add(ret, "NextShardIterator", next_iter);
-            _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
-            return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
+        /**
+         * We merge rows with same timestamp into a single event.
+         * This is pretty much needed, because a CDC row typically
+         * encodes ~half the info of an alternator write. 
+         * 
+         * A big, big downside to how alternator records are written
+         * (i.e. CQL), is that the distinction between INSERT and UPDATE
+         * is somewhat lost/unmappable to actual eventName. 
+         * A write (currently) always looks like an insert+modify
+         * regardless whether we wrote existing record or not. 
+         * 
+         * Maybe RMW ops could be done slightly differently so 
+         * we can distinguish them here...
+         * 
+         * For now, all writes will become MODIFY.
+         * 
+         * Note: we do not check the current pre/post
+         * flags on CDC log, instead we use data to 
+         * drive what is returned. This is (afaict)
+         * consistent with dynamo streams
+         */
+        switch (op) {
+        case cdc::operation::pre_image:
+        case cdc::operation::post_image:
+        {
+            auto item = rjson::empty_object();
+            describe_single_item(*selection, row, attr_names, item, nullptr, true);
+            describe_single_item(*selection, row, key_names, item);
+            rjson::add(dynamodb, op == cdc::operation::pre_image ? "OldImage" : "NewImage", std::move(item));
+            break;
        }
-
-        // ugh. figure out if we are and end-of-shard
-        auto normal_token_owners = _proxy.get_token_metadata_ptr()->count_normal_token_owners();
-
-        return _sdks.cdc_current_generation_timestamp({ normal_token_owners }).then([this, iter, high_ts, start_time, ret = std::move(ret)](db_clock::time_point ts) mutable {
-            auto& shard = iter.shard;            
-
-            if (shard.time < ts && ts < high_ts) {
-                // The DynamoDB documentation states that when a shard is
-                // closed, reading it until the end has NextShardIterator
-                // "set to null". Our test test_streams_closed_read
-                // confirms that by "null" they meant not set at all.
-            } else {
-                // We could have return the same iterator again, but we did
-                // a search from it until high_ts and found nothing, so we
-                // can also start the next search from high_ts.
-                // TODO: but why? It's simpler just to leave the iterator be.
-                shard_iterator next_iter(iter.table, iter.shard, utils::UUID_gen::min_time_UUID(high_ts.time_since_epoch()), true);
-                rjson::add(ret, "NextShardIterator", iter);
+        case cdc::operation::update:
+            rjson::add(record, "eventName", "MODIFY");
+            break;
+        case cdc::operation::insert:
+            rjson::add(record, "eventName", "INSERT");
+            break;
+        case cdc::operation::service_row_delete:
+        case cdc::operation::service_partition_delete:
+        {
+            auto user_identity = rjson::empty_object();
+            rjson::add(user_identity, "Type", "Service");
+            rjson::add(user_identity, "PrincipalId", "dynamodb.amazonaws.com");
+            rjson::add(record, "userIdentity", std::move(user_identity));
+            rjson::add(record, "eventName", "REMOVE");
+            break;
+        }
+        default:
+            rjson::add(record, "eventName", "REMOVE");
+            break;
+        }
+        if (eor) {
+            maybe_add_record();
+            timestamp = ts;
+            if (limit == 0) {
+                break;
            }
-            _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
-            if (is_big(ret)) {
-                return make_ready_future<executor::request_return_type>(make_streamed(std::move(ret)));
-            }
-            return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
-        });
-    });
+        }
+    }
+
+    auto ret = rjson::empty_object();
+    auto nrecords = records.Size();
+    rjson::add(ret, "Records", std::move(records));
+
+    if (nrecords != 0) {
+        // #9642. Set next iterators threshold to > last
+        shard_iterator next_iter(iter.table, iter.shard, *timestamp, false);
+        // Note that here we unconditionally return NextShardIterator,
+        // without checking if maybe we reached the end-of-shard. If the
+        // shard did end, then the next read will have nrecords == 0 and
+        // will notice the end of shard and not return NextShardIterator.
+        rjson::add(ret, "NextShardIterator", next_iter);
+        _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
+        co_return rjson::print(std::move(ret));
+    }
+
+    // ugh. figure out if we are at end-of-shard
+    auto normal_token_owners = _proxy.get_token_metadata_ptr()->count_normal_token_owners();
+
+    db_clock::time_point ts = co_await _sdks.cdc_current_generation_timestamp({ normal_token_owners });
+    auto& shard = iter.shard;
+
+    if (shard.time < ts && ts < high_ts) {
+        // The DynamoDB documentation states that when a shard is
+        // closed, reading it until the end has NextShardIterator
+        // "set to null". Our test test_streams_closed_read
+        // confirms that by "null" they meant not set at all.
+    } else {
+        // We could have returned the same iterator again, but we did
+        // a search from it until high_ts and found nothing, so we
+        // can also start the next search from high_ts.
+        // TODO: but why? It's simpler just to leave the iterator be.
+        shard_iterator next_iter(iter.table, iter.shard, utils::UUID_gen::min_time_UUID(high_ts.time_since_epoch()), true);
+        rjson::add(ret, "NextShardIterator", iter);
+    }
+    _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
+    if (is_big(ret)) {
+        co_return make_streamed(std::move(ret));
+    }
+    co_return rjson::print(std::move(ret));
 }

 bool executor::add_stream_options(const rjson::value& stream_specification, schema_builder& builder, service::storage_proxy& sp) {
--- a/alternator/ttl.cc
+++ b/alternator/ttl.cc
@@ -767,7 +767,7 @@ static future<bool> scan_table(
                // by tasking another node to take over scanning of the dead node's primary
                // ranges. What we do here is that this node will also check expiration
                // on its *secondary* ranges - but only those whose primary owner is down.
-                auto tablet_secondary_replica = tablet_map.get_secondary_replica(*tablet, erm->get_topology()); // throws if no secondary replica
+                auto tablet_secondary_replica = tablet_map.get_secondary_replica(*tablet); // throws if no secondary replica
                if (tablet_secondary_replica.host == my_host_id && tablet_secondary_replica.shard == this_shard_id()) {
                    if (!gossiper.is_alive(tablet_primary_replica.host)) {
                        co_await scan_tablet(*tablet, proxy, abort_source, page_sem, expiration_stats, scan_ctx, tablet_map);
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -515,15 +515,6 @@ void set_sstables_loader(http_context& ctx, routes& r, sharded<sstables_loader>&
        auto sstables = parsed.GetArray() |
            std::views::transform([] (const auto& s) { return sstring(rjson::to_string_view(s)); }) |
            std::ranges::to<std::vector>();
-        apilog.info("Restore invoked with following parameters: keyspace={}, table={}, endpoint={}, bucket={}, prefix={}, sstables_count={}, scope={}, primary_replica_only={}",
-                    keyspace,
-                    table,
-                    endpoint,
-                    bucket,
-                    prefix,
-                    sstables.size(),
-                    scope,
-                    primary_replica_only);
        auto task_id = co_await sst_loader.local().download_new_sstables(keyspace, table, prefix, std::move(sstables), endpoint, bucket, scope, primary_replica_only);
        co_return json::json_return_type(fmt::to_string(task_id));
    });
--- a/audit/audit.cc
+++ b/audit/audit.cc
@@ -209,15 +209,11 @@ future<> audit::stop_audit() {
    });
 }

-audit_info_ptr audit::create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table) {
+audit_info_ptr audit::create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table, bool batch) {
    if (!audit_instance().local_is_initialized()) {
        return nullptr;
    }
-    return std::make_unique<audit_info>(cat, keyspace, table);
-}
-
-audit_info_ptr audit::create_no_audit_info() {
-    return audit_info_ptr();
+    return std::make_unique<audit_info>(cat, keyspace, table, batch);
 }

 future<> audit::start(const db::config& cfg) {
@@ -267,18 +263,21 @@ future<> audit::log_login(const sstring& username, socket_address client_ip, boo
 }

 future<> inspect(shared_ptr<cql3::cql_statement> statement, service::query_state& query_state, const cql3::query_options& options, bool error) {
-    cql3::statements::batch_statement* batch = dynamic_cast<cql3::statements::batch_statement*>(statement.get());
-    if (batch != nullptr) {
+    auto audit_info = statement->get_audit_info();
+    if (!audit_info) {
+        return make_ready_future<>();
+    }
+    if (audit_info->batch()) {
+        cql3::statements::batch_statement* batch = static_cast<cql3::statements::batch_statement*>(statement.get());
        return do_for_each(batch->statements().begin(), batch->statements().end(), [&query_state, &options, error] (auto&& m) {
            return inspect(m.statement, query_state, options, error);
        });
    } else {
-        auto audit_info = statement->get_audit_info();
-        if (bool(audit_info) && audit::local_audit_instance().should_log(audit_info)) {
+        if (audit::local_audit_instance().should_log(audit_info)) {
            return audit::local_audit_instance().log(audit_info, query_state, options, error);
        }
+        return make_ready_future<>();
    }
-    return make_ready_future<>();
 }

 future<> inspect_login(const sstring& username, socket_address client_ip, bool error) {
--- a/audit/audit.hh
+++ b/audit/audit.hh
@@ -75,11 +75,13 @@ class audit_info final {
    sstring _keyspace;
    sstring _table;
    sstring _query;
+    bool _batch;
 public:
-    audit_info(statement_category cat, sstring keyspace, sstring table)
+    audit_info(statement_category cat, sstring keyspace, sstring table, bool batch)
        : _category(cat)
        , _keyspace(std::move(keyspace))
        , _table(std::move(table))
+        , _batch(batch)
    { }
    void set_query_string(const std::string_view& query_string) {
        _query = sstring(query_string);
@@ -89,6 +91,7 @@ public:
    const sstring& query() const { return _query; }
    sstring category_string() const;
    statement_category category() const { return _category; }
+    bool batch() const { return _batch; }
 };

 using audit_info_ptr = std::unique_ptr<audit_info>;
@@ -126,8 +129,7 @@ public:
    }
    static future<> start_audit(const db::config& cfg, sharded<locator::shared_token_metadata>& stm, sharded<cql3::query_processor>& qp, sharded<service::migration_manager>& mm);
    static future<> stop_audit();
-    static audit_info_ptr create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table);
-    static audit_info_ptr create_no_audit_info();
+    static audit_info_ptr create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table, bool batch = false);
    audit(locator::shared_token_metadata& stm,
          cql3::query_processor& qp,
          service::migration_manager& mm,
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -778,7 +778,6 @@ compaction_manager::get_incremental_repair_read_lock(compaction::compaction_grou
        cmlog.debug("Get get_incremental_repair_read_lock for {} started", reason);
    }
    compaction::compaction_state& cs = get_compaction_state(&t);
-    auto gh = cs.gate.hold();
    auto ret = co_await cs.incremental_repair_lock.hold_read_lock();
    if (!reason.empty()) {
        cmlog.debug("Get get_incremental_repair_read_lock for {} done", reason);
@@ -792,7 +791,6 @@ compaction_manager::get_incremental_repair_write_lock(compaction::compaction_gro
        cmlog.debug("Get get_incremental_repair_write_lock for {} started", reason);
    }
    compaction::compaction_state& cs = get_compaction_state(&t);
-    auto gh = cs.gate.hold();
    auto ret = co_await cs.incremental_repair_lock.hold_write_lock();
    if (!reason.empty()) {
        cmlog.debug("Get get_incremental_repair_write_lock for {} done", reason);
@@ -1521,9 +1519,7 @@ future<> compaction_manager::maybe_wait_for_sstable_count_reduction(compaction_g
            | std::views::transform(std::mem_fn(&sstables::sstable::run_identifier))
            | std::ranges::to<std::unordered_set>());
    };
-    const auto threshold = utils::get_local_injector().inject_parameter<size_t>("set_sstable_count_reduction_threshold")
-        .value_or(size_t(std::max(schema->max_compaction_threshold(), 32)));
-
+    const auto threshold = size_t(std::max(schema->max_compaction_threshold(), 32));
    auto count = co_await num_runs_for_compaction();
    if (count <= threshold) {
        cmlog.trace("No need to wait for sstable count reduction in {}: {} <= {}",
@@ -1538,7 +1534,9 @@ future<> compaction_manager::maybe_wait_for_sstable_count_reduction(compaction_g
    auto& cstate = get_compaction_state(&t);
    try {
        while (can_perform_regular_compaction(t) && co_await num_runs_for_compaction() > threshold) {
-            co_await cstate.compaction_done.wait();
+            co_await cstate.compaction_done.wait([this, &t] {
+                return !can_perform_regular_compaction(t);
+            });
        }
    } catch (const broken_condition_variable&) {
        co_return;
@@ -2389,8 +2387,6 @@ future<> compaction_manager::remove(compaction_group_view& t, sstring reason) no
    if (!c_state.gate.is_closed()) {
        auto close_gate = c_state.gate.close();
        co_await stop_ongoing_compactions(reason, &t);
-        // Wait for users of incremental repair lock (can be either repair itself or maintenance compactions).
-        co_await c_state.incremental_repair_lock.write_lock();
        co_await std::move(close_gate);
    }

--- a/compaction/leveled_manifest.hh
+++ b/compaction/leveled_manifest.hh
@@ -329,13 +329,13 @@ public:
        auto it = candidates.begin();
        auto& first_sstable = *it;
        it++;
-        dht::token first = first_sstable->get_first_decorated_key()._token;
-        dht::token last = first_sstable->get_last_decorated_key()._token;
+        dht::token first = first_sstable->get_first_decorated_key().token();
+        dht::token last = first_sstable->get_last_decorated_key().token();
        while (it != candidates.end()) {
            auto& candidate_sstable = *it;
            it++;
-            dht::token first_candidate = candidate_sstable->get_first_decorated_key()._token;
-            dht::token last_candidate = candidate_sstable->get_last_decorated_key()._token;
+            dht::token first_candidate = candidate_sstable->get_first_decorated_key().token();
+            dht::token last_candidate = candidate_sstable->get_last_decorated_key().token();

            first = first <= first_candidate? first : first_candidate;
            last = last >= last_candidate ? last : last_candidate;
@@ -345,7 +345,7 @@ public:

    template <typename T>
    static std::vector<sstables::shared_sstable> overlapping(const schema& s, const sstables::shared_sstable& sstable, const T& others) {
-        return overlapping(s, sstable->get_first_decorated_key()._token, sstable->get_last_decorated_key()._token, others);
+        return overlapping(s, sstable->get_first_decorated_key().token(), sstable->get_last_decorated_key().token(), others);
    }

    /**
@@ -359,7 +359,7 @@ public:
        auto range = ::wrapping_interval<dht::token>::make(start, end);

        for (auto& candidate : sstables) {
-            auto candidate_range = ::wrapping_interval<dht::token>::make(candidate->get_first_decorated_key()._token, candidate->get_last_decorated_key()._token);
+            auto candidate_range = ::wrapping_interval<dht::token>::make(candidate->get_first_decorated_key().token(), candidate->get_last_decorated_key().token());

            if (range.overlaps(candidate_range, dht::token_comparator())) {
                overlapped.push_back(candidate);
--- a/configure.py
+++ b/configure.py
@@ -730,6 +730,28 @@ vector_search_tests = set([
    'test/vector_search/rescoring_test'
 ])

+vector_search_validator_bin = 'vector-search-validator/bin/vector-search-validator'
+vector_search_validator_deps = set([
+    'test/vector_search_validator/build-validator',
+    'test/vector_search_validator/Cargo.toml',
+    'test/vector_search_validator/crates/validator/Cargo.toml',
+    'test/vector_search_validator/crates/validator/src/main.rs',
+    'test/vector_search_validator/crates/validator-scylla/Cargo.toml',
+    'test/vector_search_validator/crates/validator-scylla/src/lib.rs',
+    'test/vector_search_validator/crates/validator-scylla/src/cql.rs',
+])
+
+vector_store_bin = 'vector-search-validator/bin/vector-store'
+vector_store_deps = set([
+    'test/vector_search_validator/build-env',
+    'test/vector_search_validator/build-vector-store',
+])
+
+vector_search_validator_bins = set([
+    vector_search_validator_bin,
+    vector_store_bin,
+])
+
 wasms = set([
    'wasm/return_input.wat',
    'wasm/test_complex_null_values.wat',
@@ -763,7 +785,7 @@ other = set([
    'iotune',
 ])

-all_artifacts = apps | cpp_apps | tests | other | wasms
+all_artifacts = apps | cpp_apps | tests | other | wasms | vector_search_validator_bins

 arg_parser = argparse.ArgumentParser('Configure scylla', add_help=False, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 arg_parser.add_argument('--out', dest='buildfile', action='store', default='build.ninja',
@@ -795,6 +817,9 @@ arg_parser.add_argument('--c-compiler', action='store', dest='cc', default='clan
                        help='C compiler path')
 arg_parser.add_argument('--compiler-cache', action='store', dest='compiler_cache', default='auto',
                        help='Compiler cache to use: auto (default, prefers sccache), sccache, ccache, none, or a path to a binary')
+# Workaround for https://github.com/mozilla/sccache/issues/2575
+arg_parser.add_argument('--sccache-rust', action=argparse.BooleanOptionalAction, default=False,
+                        help='Use sccache for rust code (if sccache is selected as compiler cache). Doesn\'t work with distributed builds.')
 add_tristate(arg_parser, name='dpdk', dest='dpdk', default=False,
                        help='Use dpdk (from seastar dpdk sources)')
 arg_parser.add_argument('--dpdk-target', action='store', dest='dpdk_target', default='',
@@ -925,8 +950,7 @@ scylla_core = (['message/messaging_service.cc',
                'utils/crypt_sha512.cc',
                'utils/logalloc.cc',
                'utils/large_bitset.cc',
-                'utils/buffer_input_stream.cc',
-                'utils/limiting_data_source.cc',
+                'test/lib/limiting_data_source.cc',
                'utils/updateable_value.cc',
                'message/dictionary_service.cc',
                'utils/directories.cc',
@@ -1535,6 +1559,7 @@ scylla_perfs = ['test/perf/perf_alternator.cc',
                'test/perf/perf_fast_forward.cc',
                'test/perf/perf_row_cache_update.cc',
                'test/perf/perf_simple_query.cc',
+                'test/perf/perf_cql_raw.cc',
                'test/perf/perf_sstable.cc',
                'test/perf/perf_tablets.cc',
                'test/perf/tablet_load_balancing.cc',
@@ -2383,7 +2408,7 @@ def write_build_file(f,
    # If compiler cache is available, prefix the compiler with it
    cxx_with_cache = f'{compiler_cache} {args.cxx}' if compiler_cache else args.cxx
    # For Rust, sccache is used via RUSTC_WRAPPER environment variable
-    rustc_wrapper = f'RUSTC_WRAPPER={compiler_cache} ' if compiler_cache and 'sccache' in compiler_cache else ''
+    rustc_wrapper = f'RUSTC_WRAPPER={compiler_cache} ' if compiler_cache and 'sccache' in compiler_cache and args.sccache_rust else ''
    f.write(textwrap.dedent('''\
        configure_args = {configure_args}
        builddir = {outdir}
@@ -2560,10 +2585,11 @@ def write_build_file(f,
              description = RUST_LIB $out
            ''').format(mode=mode, antlr3_exec=args.antlr3_exec, fmt_lib=fmt_lib, test_repeat=args.test_repeat, test_timeout=args.test_timeout, rustc_wrapper=rustc_wrapper, **modeval))
        f.write(
-            'build {mode}-build: phony {artifacts} {wasms}\n'.format(
+            'build {mode}-build: phony {artifacts} {wasms} {vector_search_validator_bins}\n'.format(
                mode=mode,
-                artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms)]),
+                artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms - vector_search_validator_bins)]),
                wasms = str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & wasms)]),
+                vector_search_validator_bins=str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & vector_search_validator_bins)]),
            )
        )
        if profile_recipe := modes[mode].get('profile_recipe'):
@@ -2593,7 +2619,7 @@ def write_build_file(f,
                continue
            profile_dep = modes[mode].get('profile_target', "")

-            if binary in other or binary in wasms:
+            if binary in other or binary in wasms or binary in vector_search_validator_bins:
                continue
            srcs = deps[binary]
            # 'scylla'
@@ -2704,10 +2730,11 @@ def write_build_file(f,
        )

        f.write(
-            'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms}\n'.format(
+            'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms} {vector_search_validator_bins} \n'.format(
                mode=mode,
                test_executables=' '.join(['$builddir/{}/{}'.format(mode, binary) for binary in sorted(tests)]),
                wasms=' '.join([f'$builddir/{binary}' for binary in sorted(wasms)]),
+                vector_search_validator_bins=' '.join([f'$builddir/{binary}' for binary in sorted(vector_search_validator_bins)]),
            )
        )
        f.write(
@@ -2875,6 +2902,19 @@ def write_build_file(f,
            'build compiler-training: phony {}\n'.format(' '.join(['{mode}-compiler-training'.format(mode=mode) for mode in default_modes]))
    )

+    f.write(textwrap.dedent(f'''\
+        rule build-vector-search-validator
+            command = test/vector_search_validator/build-validator $builddir
+        rule build-vector-store
+            command = test/vector_search_validator/build-vector-store $builddir
+        '''))
+    f.write(
+            'build $builddir/{vector_search_validator_bin}: build-vector-search-validator {}\n'.format(' '.join([dep for dep in sorted(vector_search_validator_deps)]), vector_search_validator_bin=vector_search_validator_bin)
+    )
+    f.write(
+            'build $builddir/{vector_store_bin}: build-vector-store {}\n'.format(' '.join([dep for dep in sorted(vector_store_deps)]), vector_store_bin=vector_store_bin)
+    )
+
    f.write(textwrap.dedent(f'''\
        build dist-unified-tar: phony {' '.join([f'$builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz' for mode in default_modes])}
        build dist-unified: phony dist-unified-tar
@@ -3112,7 +3152,7 @@ def configure_using_cmake(args):
        settings['CMAKE_CXX_COMPILER_LAUNCHER'] = compiler_cache
        settings['CMAKE_C_COMPILER_LAUNCHER'] = compiler_cache
        # For Rust, sccache is used via RUSTC_WRAPPER
-        if 'sccache' in compiler_cache:
+        if 'sccache' in compiler_cache and args.sccache_rust:
            settings['Scylla_RUSTC_WRAPPER'] = compiler_cache

    if args.date_stamp:
--- a/cql3/functions/vector_similarity_fcts.cc
+++ b/cql3/functions/vector_similarity_fcts.cc
@@ -10,41 +10,9 @@
 #include "types/types.hh"
 #include "types/vector.hh"
 #include "exceptions/exceptions.hh"
-#include <span>
-#include <bit>

 namespace cql3 {
 namespace functions {
-
-namespace detail {
-
-std::vector<float> extract_float_vector(const bytes_opt& param, size_t dimension) {
-    if (!param) {
-        throw exceptions::invalid_request_exception("Cannot extract float vector from null parameter");
-    }
-
-    const size_t expected_size = dimension * sizeof(float);
-    if (param->size() != expected_size) {
-        throw exceptions::invalid_request_exception(
-            fmt::format("Invalid vector size: expected {} bytes for {} floats, got {} bytes",
-                       expected_size, dimension, param->size()));
-    }
-
-    std::vector<float> result;
-    result.reserve(dimension);
-
-    bytes_view view(*param);
-    for (size_t i = 0; i < dimension; ++i) {
-        // read_simple handles network byte order (big-endian) conversion
-        uint32_t raw = read_simple<uint32_t>(view);
-        result.push_back(std::bit_cast<float>(raw));
-    }
-
-    return result;
-}
-
-} // namespace detail
-
 namespace {

 // The computations of similarity scores match the exact formulas of Cassandra's (jVector's) implementation to ensure compatibility.
@@ -54,14 +22,14 @@ namespace {

 // You should only use this function if you need to preserve the original vectors and cannot normalize
 // them in advance.
-float compute_cosine_similarity(std::span<const float> v1, std::span<const float> v2) {
+float compute_cosine_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
    double dot_product = 0.0;
    double squared_norm_a = 0.0;
    double squared_norm_b = 0.0;

    for (size_t i = 0; i < v1.size(); ++i) {
-        double a = v1[i];
-        double b = v2[i];
+        double a = value_cast<float>(v1[i]);
+        double b = value_cast<float>(v2[i]);

        dot_product += a * b;
        squared_norm_a += a * a;
@@ -69,7 +37,7 @@ float compute_cosine_similarity(std::span<const float> v1, std::span<const float
    }

    if (squared_norm_a == 0 || squared_norm_b == 0) {
-        return std::numeric_limits<float>::quiet_NaN();
+        throw exceptions::invalid_request_exception("Function system.similarity_cosine doesn't support all-zero vectors");
    }

    // The cosine similarity is in the range [-1, 1].
@@ -78,12 +46,12 @@ float compute_cosine_similarity(std::span<const float> v1, std::span<const float
    return (1 + (dot_product / (std::sqrt(squared_norm_a * squared_norm_b)))) / 2;
 }

-float compute_euclidean_similarity(std::span<const float> v1, std::span<const float> v2) {
+float compute_euclidean_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
    double sum = 0.0;

    for (size_t i = 0; i < v1.size(); ++i) {
-        double a = v1[i];
-        double b = v2[i];
+        double a = value_cast<float>(v1[i]);
+        double b = value_cast<float>(v2[i]);

        double diff = a - b;
        sum += diff * diff;
@@ -97,12 +65,12 @@ float compute_euclidean_similarity(std::span<const float> v1, std::span<const fl

 // Assumes that both vectors are L2-normalized.
 // This similarity is intended as an optimized way to perform cosine similarity calculation.
-float compute_dot_product_similarity(std::span<const float> v1, std::span<const float> v2) {
+float compute_dot_product_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
    double dot_product = 0.0;

    for (size_t i = 0; i < v1.size(); ++i) {
-        double a = v1[i];
-        double b = v2[i];
+        double a = value_cast<float>(v1[i]);
+        double b = value_cast<float>(v2[i]);
        dot_product += a * b;
    }

@@ -168,15 +136,13 @@ bytes_opt vector_similarity_fct::execute(std::span<const bytes_opt> parameters)
        return std::nullopt;
    }

-    // Extract dimension from the vector type
-    const auto& type = static_cast<const vector_type_impl&>(*arg_types()[0]);
-    size_t dimension = type.get_dimension();
+    const auto& type = arg_types()[0];
+    data_value v1 = type->deserialize(*parameters[0]);
+    data_value v2 = type->deserialize(*parameters[1]);
+    const auto& v1_elements = value_cast<std::vector<data_value>>(v1);
+    const auto& v2_elements = value_cast<std::vector<data_value>>(v2);

-    // Optimized path: extract floats directly from bytes, bypassing data_value overhead
-    std::vector<float> v1 = detail::extract_float_vector(parameters[0], dimension);
-    std::vector<float> v2 = detail::extract_float_vector(parameters[1], dimension);
-
-    float result = SIMILARITY_FUNCTIONS.at(_name)(v1, v2);
+    float result = SIMILARITY_FUNCTIONS.at(_name)(v1_elements, v2_elements);
    return float_type->decompose(result);
 }

--- a/cql3/functions/vector_similarity_fcts.hh
+++ b/cql3/functions/vector_similarity_fcts.hh
@@ -11,7 +11,6 @@
 #include "native_scalar_function.hh"
 #include "cql3/assignment_testable.hh"
 #include "cql3/functions/function_name.hh"
-#include <span>

 namespace cql3 {
 namespace functions {
@@ -20,7 +19,7 @@ static const function_name SIMILARITY_COSINE_FUNCTION_NAME = function_name::nati
 static const function_name SIMILARITY_EUCLIDEAN_FUNCTION_NAME = function_name::native_function("similarity_euclidean");
 static const function_name SIMILARITY_DOT_PRODUCT_FUNCTION_NAME = function_name::native_function("similarity_dot_product");

-using similarity_function_t = float (*)(std::span<const float>, std::span<const float>);
+using similarity_function_t = float (*)(const std::vector<data_value>&, const std::vector<data_value>&);
 extern thread_local const std::unordered_map<function_name, similarity_function_t> SIMILARITY_FUNCTIONS;

 std::vector<data_type> retrieve_vector_arg_types(const function_name& name, const std::vector<shared_ptr<assignment_testable>>& provided_args);
@@ -34,14 +33,5 @@ public:
    virtual bytes_opt execute(std::span<const bytes_opt> parameters) override;
 };

-namespace detail {
-
-// Extract float vector directly from serialized bytes, bypassing data_value overhead.
-// This is an internal API exposed for testing purposes.
-// Vector<float, N> wire format: N floats as big-endian uint32_t values, 4 bytes each.
-std::vector<float> extract_float_vector(const bytes_opt& param, size_t dimension);
-
-} // namespace detail
-
 } // namespace functions
 } // namespace cql3
--- a/cql3/query_result_printer.hh
+++ b/cql3/query_result_printer.hh
@@ -1,20 +0,0 @@
-/*
- * Copyright 2025-present ScyllaDB
- */
-
-/*
- * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
- */
-
-#pragma once
-
-#include <ostream>
-
-namespace cql3 {
-
-class result;
-
-void print_query_results_text(std::ostream& os, const result& result);
-void print_query_results_json(std::ostream& os, const result& result);
-
-} // namespace cql3
--- a/cql3/result_set.cc
+++ b/cql3/result_set.cc
@@ -9,10 +9,8 @@
 */

 #include <cstdint>
-#include "types/json_utils.hh"
 #include "utils/assert.hh"
 #include "utils/hashers.hh"
-#include "utils/rjson.hh"
 #include "cql3/result_set.hh"

 namespace cql3 {
@@ -197,85 +195,4 @@ make_empty_metadata() {
    return empty_metadata_cache;
 }

-void print_query_results_text(std::ostream& os, const cql3::result& result) {
-    const auto& metadata = result.get_metadata();
-    const auto& column_metadata = metadata.get_names();
-
-    struct column_values {
-        size_t max_size{0};
-        sstring header_format;
-        sstring row_format;
-        std::vector<sstring> values;
-
-        void add(sstring value) {
-            max_size = std::max(max_size, value.size());
-            values.push_back(std::move(value));
-        }
-    };
-
-    std::vector<column_values> columns;
-    columns.resize(column_metadata.size());
-
-    for (size_t i = 0; i < column_metadata.size(); ++i) {
-        columns[i].add(column_metadata[i]->name->text());
-    }
-
-    for (const auto& row : result.result_set().rows()) {
-        for (size_t i = 0; i < row.size(); ++i) {
-            if (row[i]) {
-                columns[i].add(column_metadata[i]->type->to_string(linearized(managed_bytes_view(*row[i]))));
-            } else {
-                columns[i].add("");
-            }
-        }
-    }
-
-    std::vector<sstring> separators(columns.size(), sstring());
-    for (size_t i = 0; i < columns.size(); ++i) {
-        auto& col_values = columns[i];
-        col_values.header_format = seastar::format(" {{:<{}}} ", col_values.max_size);
-        col_values.row_format = seastar::format(" {{:>{}}} ", col_values.max_size);
-        for (size_t c = 0; c < col_values.max_size; ++c) {
-            separators[i] += "-";
-        }
-    }
-
-    for (size_t r = 0; r < result.result_set().rows().size() + 1; ++r) {
-        std::vector<sstring> row;
-        row.reserve(columns.size());
-        for (size_t i = 0; i < columns.size(); ++i) {
-            const auto& format = r == 0 ? columns[i].header_format : columns[i].row_format;
-            row.push_back(fmt::format(fmt::runtime(std::string_view(format)), columns[i].values[r]));
-        }
-        fmt::print(os, "{}\n", fmt::join(row, "|"));
-        if (!r) {
-            fmt::print(os, "-{}-\n", fmt::join(separators, "-+-"));
-        }
-    }
-}
-
-void print_query_results_json(std::ostream& os, const cql3::result& result) {
-    const auto& metadata = result.get_metadata();
-    const auto& column_metadata = metadata.get_names();
-
-    rjson::streaming_writer writer(os);
-
-    writer.StartArray();
-    for (const auto& row : result.result_set().rows()) {
-        writer.StartObject();
-        for (size_t i = 0; i < row.size(); ++i) {
-            writer.Key(column_metadata[i]->name->text());
-            if (!row[i] || row[i]->empty()) {
-                writer.Null();
-                continue;
-            }
-            const auto value = to_json_string(*column_metadata[i]->type, *row[i]);
-            const auto type = to_json_type(*column_metadata[i]->type, *row[i]);
-            writer.RawValue(value, type);
-        }
-        writer.EndObject();
-    }
-    writer.EndArray();
-}
-
 }
--- a/cql3/statements/describe_statement.cc
+++ b/cql3/statements/describe_statement.cc
@@ -23,7 +23,6 @@
 #include "index/vector_index.hh"
 #include "schema/schema.hh"
 #include "service/client_state.hh"
-#include "service/paxos/paxos_state.hh"
 #include "types/types.hh"
 #include "cql3/query_processor.hh"
 #include "cql3/cql_statement.hh"
@@ -330,19 +329,6 @@ future<std::vector<description>> table(const data_dictionary::database& db, cons
                "*/",
                *table_desc.create_statement);

-        table_desc.create_statement = std::move(os).to_managed_string();
-    } else if (service::paxos::paxos_store::try_get_base_table(name)) {
-        // Paxos state table is internally managed by Scylla and it shouldn't be exposed to the user.
-        // The table is allowed to be described as a comment to ease administrative work but it's hidden from all listings.
-        fragmented_ostringstream os{};
-
-        fmt::format_to(os.to_iter(),
-                "/* Do NOT execute this statement! It's only for informational purposes.\n"
-                "   A paxos state table is created automatically when enabling LWT on a base table.\n"
-                "\n{}\n"
-                "*/",
-                *table_desc.create_statement);
-
        table_desc.create_statement = std::move(os).to_managed_string();
    }
    result.push_back(std::move(table_desc));
@@ -378,7 +364,7 @@ future<std::vector<description>> table(const data_dictionary::database& db, cons
 future<std::vector<description>> tables(const data_dictionary::database& db, const lw_shared_ptr<keyspace_metadata>& ks, std::optional<bool> with_internals = std::nullopt) {
    auto& replica_db = db.real_database();
    auto tables = ks->tables() | std::views::filter([&replica_db] (const schema_ptr& s) {
-        return !cdc::is_log_for_some_table(replica_db, s->ks_name(), s->cf_name()) && !service::paxos::paxos_store::try_get_base_table(s->cf_name());
+        return !cdc::is_log_for_some_table(replica_db, s->ks_name(), s->cf_name());
    }) | std::ranges::to<std::vector<schema_ptr>>();
    std::ranges::sort(tables, std::ranges::less(), std::mem_fn(&schema::cf_name));

--- a/cql3/statements/raw/batch_statement.hh
+++ b/cql3/statements/raw/batch_statement.hh
@@ -50,8 +50,8 @@ public:
 protected:
    virtual audit::statement_category category() const override;
    virtual audit::audit_info_ptr audit_info() const override {
-        // We don't audit batch statements. Instead we audit statements that are inside the batch.
-        return audit::audit::create_no_audit_info();
+        constexpr bool batch = true;
+        return audit::audit::create_audit_info(category(), sstring(), sstring(), batch);
    }
 };

--- a/db/batchlog_manager.cc
+++ b/db/batchlog_manager.cc
@@ -55,21 +55,8 @@ int32_t batchlog_shard_of(db_clock::time_point written_at) {
    return hash & ((1ULL << batchlog_shard_bits) - 1);
 }

-bool is_batchlog_v1(const schema& schema) {
-    return schema.cf_name() == system_keyspace::BATCHLOG;
-}
-
 std::pair<partition_key, clustering_key>
 get_batchlog_key(const schema& schema, int32_t version, db::batchlog_stage stage, int32_t batchlog_shard, db_clock::time_point written_at, std::optional<utils::UUID> id) {
-    if (is_batchlog_v1(schema)) {
-        if (!id) {
-            on_internal_error(blogger, "get_batchlog_key(): key for batchlog v1 requires batchlog id");
-        }
-        auto pkey = partition_key::from_single_value(schema, {serialized(*id)});
-        auto ckey = clustering_key::make_empty();
-        return std::pair(std::move(pkey), std::move(ckey));
-    }
-
    auto pkey = partition_key::from_exploded(schema, {serialized(version), serialized(int8_t(stage)), serialized(batchlog_shard)});

    std::vector<bytes> ckey_components;
@@ -98,14 +85,6 @@ mutation get_batchlog_mutation_for(schema_ptr schema, managed_bytes data, int32_
    auto cdef_data = schema->get_column_definition(to_bytes("data"));
    m.set_cell(ckey, *cdef_data, atomic_cell::make_live(*cdef_data->type, timestamp, std::move(data)));

-    if (is_batchlog_v1(*schema)) {
-        auto cdef_version = schema->get_column_definition(to_bytes("version"));
-        m.set_cell(ckey, *cdef_version, atomic_cell::make_live(*cdef_version->type, timestamp, serialized(version)));
-
-        auto cdef_written_at = schema->get_column_definition(to_bytes("written_at"));
-        m.set_cell(ckey, *cdef_written_at, atomic_cell::make_live(*cdef_written_at->type, timestamp, serialized(now)));
-    }
-
    return m;
 }

@@ -143,10 +122,9 @@ mutation get_batchlog_delete_mutation(schema_ptr schema, int32_t version, db_clo
 const std::chrono::seconds db::batchlog_manager::replay_interval;
 const uint32_t db::batchlog_manager::page_size;

-db::batchlog_manager::batchlog_manager(cql3::query_processor& qp, db::system_keyspace& sys_ks, gms::feature_service& fs, batchlog_manager_config config)
+db::batchlog_manager::batchlog_manager(cql3::query_processor& qp, db::system_keyspace& sys_ks, batchlog_manager_config config)
        : _qp(qp)
        , _sys_ks(sys_ks)
-        , _fs(fs)
        , _replay_timeout(config.replay_timeout)
        , _replay_rate(config.replay_rate)
        , _delay(config.delay)
@@ -322,206 +300,149 @@ future<> db::batchlog_manager::maybe_migrate_v1_to_v2() {
    });
 }

-namespace {
-
-using clock_type = db_clock::rep;
-
-struct replay_stats {
-    std::optional<db_clock::time_point> min_too_fresh;
-    bool need_cleanup = false;
-};
-
-} // anonymous namespace
-
-static future<db::all_batches_replayed> process_batch(
-        cql3::query_processor& qp,
-        db::batchlog_manager::stats& stats,
-        db::batchlog_manager::post_replay_cleanup cleanup,
-        utils::rate_limiter& limiter,
-        schema_ptr schema,
-        std::unordered_map<int32_t, replay_stats>& replay_stats_per_shard,
-        const db_clock::time_point now,
-        db_clock::duration replay_timeout,
-        std::chrono::seconds write_timeout,
-        const cql3::untyped_result_set::row& row) {
-    const bool is_v1 = db::is_batchlog_v1(*schema);
-    const auto stage = is_v1 ? db::batchlog_stage::initial : static_cast<db::batchlog_stage>(row.get_as<int8_t>("stage"));
-    const auto batch_shard = is_v1 ? 0 : row.get_as<int32_t>("shard");
-    auto written_at = row.get_as<db_clock::time_point>("written_at");
-    auto id = row.get_as<utils::UUID>("id");
-    // enough time for the actual write + batchlog entry mutation delivery (two separate requests).
-    auto timeout = replay_timeout;
-
-    if (utils::get_local_injector().is_enabled("skip_batch_replay")) {
-        blogger.debug("Skipping batch replay due to skip_batch_replay injection");
-        co_return db::all_batches_replayed::no;
-    }
-
-    auto data = row.get_blob_unfragmented("data");
-
-    blogger.debug("Replaying batch {} from stage {} and batch shard {}", id, int32_t(stage), batch_shard);
-
-    utils::chunked_vector<mutation> mutations;
-    bool send_failed = false;
-
-    auto& shard_written_at = replay_stats_per_shard.try_emplace(batch_shard, replay_stats{}).first->second;
-
-    try {
-        utils::chunked_vector<std::pair<canonical_mutation, schema_ptr>> fms;
-        auto in = ser::as_input_stream(data);
-        while (in.size()) {
-            auto fm = ser::deserialize(in, std::type_identity<canonical_mutation>());
-            const auto tbl = qp.db().try_find_table(fm.column_family_id());
-            if (!tbl) {
-                continue;
-            }
-            if (written_at <= tbl->get_truncation_time()) {
-                continue;
-            }
-            schema_ptr s = tbl->schema();
-            if (s->tombstone_gc_options().mode() == tombstone_gc_mode::repair) {
-                timeout = std::min(timeout, std::chrono::duration_cast<db_clock::duration>(s->tombstone_gc_options().propagation_delay_in_seconds()));
-            }
-            fms.emplace_back(std::move(fm), std::move(s));
-        }
-
-        if (now < written_at + timeout) {
-            blogger.debug("Skipping replay of {}, too fresh", id);
-
-            shard_written_at.min_too_fresh = std::min(shard_written_at.min_too_fresh.value_or(written_at), written_at);
-
-            co_return db::all_batches_replayed::no;
-        }
-
-        auto size = data.size();
-
-        for (const auto& [fm, s] : fms) {
-            mutations.emplace_back(fm.to_mutation(s));
-            co_await coroutine::maybe_yield();
-        }
-
-        if (!mutations.empty()) {
-            const auto ttl = [written_at]() -> clock_type {
-                /*
-                * Calculate ttl for the mutations' hints (and reduce ttl by the time the mutations spent in the batchlog).
-                * This ensures that deletes aren't "undone" by an old batch replay.
-                */
-                auto unadjusted_ttl = std::numeric_limits<gc_clock::rep>::max();
-                warn(unimplemented::cause::HINT);
-#if 0
-                for (auto& m : *mutations) {
-                    unadjustedTTL = Math.min(unadjustedTTL, HintedHandOffManager.calculateHintTTL(mutation));
-                }
-#endif
-                return unadjusted_ttl - std::chrono::duration_cast<gc_clock::duration>(db_clock::now() - written_at).count();
-            }();
-
-            if (ttl > 0) {
-                // Origin does the send manually, however I can't see a super great reason to do so.
-                // Our normal write path does not add much redundancy to the dispatch, and rate is handled after send
-                // in both cases.
-                // FIXME: verify that the above is reasonably true.
-                co_await limiter.reserve(size);
-                stats.write_attempts += mutations.size();
-                auto timeout = db::timeout_clock::now() + write_timeout;
-                if (cleanup) {
-                    co_await qp.proxy().send_batchlog_replay_to_all_replicas(mutations, timeout);
-                } else {
-                    co_await qp.proxy().send_batchlog_replay_to_all_replicas(std::move(mutations), timeout);
-                }
-            }
-        }
-    } catch (data_dictionary::no_such_keyspace& ex) {
-        // should probably ignore and drop the batch
-    } catch (const data_dictionary::no_such_column_family&) {
-        // As above -- we should drop the batch if the table doesn't exist anymore.
-    } catch (...) {
-        blogger.warn("Replay failed (will retry): {}", std::current_exception());
-        // timeout, overload etc.
-        // Do _not_ remove the batch, assuning we got a node write error.
-        // Since we don't have hints (which origin is satisfied with),
-        // we have to resort to keeping this batch to next lap.
-        if (is_v1 || !cleanup || stage == db::batchlog_stage::failed_replay) {
-            co_return db::all_batches_replayed::no;
-        }
-        send_failed = true;
-    }
-
-    auto& sp = qp.proxy();
-
-    if (send_failed) {
-        blogger.debug("Moving batch {} to stage failed_replay", id);
-        auto m = get_batchlog_mutation_for(schema, mutations, netw::messaging_service::current_version, db::batchlog_stage::failed_replay, written_at, id);
-        co_await sp.mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
-    }
-
-    // delete batch
-    auto m = get_batchlog_delete_mutation(schema, netw::messaging_service::current_version, stage, written_at, id);
-    co_await qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
-
-    shard_written_at.need_cleanup = true;
-
-    co_return db::all_batches_replayed(!send_failed);
-}
-
-future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches_v1(post_replay_cleanup) {
-    db::all_batches_replayed all_replayed = all_batches_replayed::yes;
-    // rate limit is in bytes per second. Uses Double.MAX_VALUE if disabled (set to 0 in cassandra.yaml).
-    // max rate is scaled by the number of nodes in the cluster (same as for HHOM - see CASSANDRA-5272).
-    auto throttle = _replay_rate / _qp.proxy().get_token_metadata_ptr()->count_normal_token_owners();
-    utils::rate_limiter limiter(throttle);
-
-    auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
-
-    std::unordered_map<int32_t, replay_stats> replay_stats_per_shard;
-
-    // Use a stable `now` across all batches, so skip/replay decisions are the
-    // same across a while prefix of written_at (across all ids).
-    const auto now = db_clock::now();
-
-    auto batch = [this, &limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) mutable -> future<stop_iteration> {
-        all_replayed = all_replayed && co_await process_batch(_qp, _stats, post_replay_cleanup::no, limiter, schema, replay_stats_per_shard, now, _replay_timeout, write_timeout, row);
-        co_return stop_iteration::no;
-    };
-
-    co_await with_gate(_gate, [this, &all_replayed, batch = std::move(batch)] () mutable -> future<> {
-        blogger.debug("Started replayAllFailedBatches");
-        co_await utils::get_local_injector().inject("add_delay_to_batch_replay", std::chrono::milliseconds(1000));
-
-        auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
-
-        co_await _qp.query_internal(
-                format("SELECT * FROM {}.{} BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG),
-                db::consistency_level::ONE,
-                {},
-                page_size,
-                batch);
-
-        blogger.debug("Finished replayAllFailedBatches with all_replayed: {}", all_replayed);
-    });
-
-    co_return all_replayed;
-}
-
-future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches_v2(post_replay_cleanup cleanup) {
+future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cleanup) {
    co_await maybe_migrate_v1_to_v2();

+    typedef db_clock::rep clock_type;
+
    db::all_batches_replayed all_replayed = all_batches_replayed::yes;
    // rate limit is in bytes per second. Uses Double.MAX_VALUE if disabled (set to 0 in cassandra.yaml).
    // max rate is scaled by the number of nodes in the cluster (same as for HHOM - see CASSANDRA-5272).
    auto throttle = _replay_rate / _qp.proxy().get_token_metadata_ptr()->count_normal_token_owners();
-    utils::rate_limiter limiter(throttle);
+    auto limiter = make_lw_shared<utils::rate_limiter>(throttle);

    auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG_V2);

+    struct replay_stats {
+        std::optional<db_clock::time_point> min_too_fresh;
+        bool need_cleanup = false;
+    };
+
    std::unordered_map<int32_t, replay_stats> replay_stats_per_shard;

    // Use a stable `now` across all batches, so skip/replay decisions are the
    // same across a while prefix of written_at (across all ids).
    const auto now = db_clock::now();

-    auto batch = [this, cleanup, &limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) mutable -> future<stop_iteration> {
-        all_replayed = all_replayed && co_await process_batch(_qp, _stats, cleanup, limiter, schema, replay_stats_per_shard, now, _replay_timeout, write_timeout, row);
+    auto batch = [this, cleanup, limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
+        const auto stage = static_cast<batchlog_stage>(row.get_as<int8_t>("stage"));
+        const auto batch_shard = row.get_as<int32_t>("shard");
+        auto written_at = row.get_as<db_clock::time_point>("written_at");
+        auto id = row.get_as<utils::UUID>("id");
+        // enough time for the actual write + batchlog entry mutation delivery (two separate requests).
+        auto timeout = _replay_timeout;
+
+        if (utils::get_local_injector().is_enabled("skip_batch_replay")) {
+            blogger.debug("Skipping batch replay due to skip_batch_replay injection");
+            all_replayed = all_batches_replayed::no;
+            co_return stop_iteration::no;
+        }
+
+        auto data = row.get_blob_unfragmented("data");
+
+        blogger.debug("Replaying batch {} from stage {} and batch shard {}", id, int32_t(stage), batch_shard);
+
+        utils::chunked_vector<mutation> mutations;
+        bool send_failed = false;
+
+        auto& shard_written_at = replay_stats_per_shard.try_emplace(batch_shard, replay_stats{}).first->second;
+
+        try {
+            utils::chunked_vector<std::pair<canonical_mutation, schema_ptr>> fms;
+            auto in = ser::as_input_stream(data);
+            while (in.size()) {
+                auto fm = ser::deserialize(in, std::type_identity<canonical_mutation>());
+                const auto tbl = _qp.db().try_find_table(fm.column_family_id());
+                if (!tbl) {
+                    continue;
+                }
+                if (written_at <= tbl->get_truncation_time()) {
+                    continue;
+                }
+                schema_ptr s = tbl->schema();
+                if (s->tombstone_gc_options().mode() == tombstone_gc_mode::repair) {
+                    timeout = std::min(timeout, std::chrono::duration_cast<db_clock::duration>(s->tombstone_gc_options().propagation_delay_in_seconds()));
+                }
+                fms.emplace_back(std::move(fm), std::move(s));
+            }
+
+            if (now < written_at + timeout) {
+                blogger.debug("Skipping replay of {}, too fresh", id);
+
+                shard_written_at.min_too_fresh = std::min(shard_written_at.min_too_fresh.value_or(written_at), written_at);
+
+                co_return stop_iteration::no;
+            }
+
+            auto size = data.size();
+
+            for (const auto& [fm, s] : fms) {
+                mutations.emplace_back(fm.to_mutation(s));
+                co_await coroutine::maybe_yield();
+            }
+
+            if (!mutations.empty()) {
+                const auto ttl = [written_at]() -> clock_type {
+                    /*
+                    * Calculate ttl for the mutations' hints (and reduce ttl by the time the mutations spent in the batchlog).
+                    * This ensures that deletes aren't "undone" by an old batch replay.
+                    */
+                    auto unadjusted_ttl = std::numeric_limits<gc_clock::rep>::max();
+                    warn(unimplemented::cause::HINT);
+#if 0
+                    for (auto& m : *mutations) {
+                        unadjustedTTL = Math.min(unadjustedTTL, HintedHandOffManager.calculateHintTTL(mutation));
+                    }
+#endif
+                    return unadjusted_ttl - std::chrono::duration_cast<gc_clock::duration>(db_clock::now() - written_at).count();
+                }();
+
+                if (ttl > 0) {
+                    // Origin does the send manually, however I can't see a super great reason to do so.
+                    // Our normal write path does not add much redundancy to the dispatch, and rate is handled after send
+                    // in both cases.
+                    // FIXME: verify that the above is reasonably true.
+                    co_await limiter->reserve(size);
+                    _stats.write_attempts += mutations.size();
+                    auto timeout = db::timeout_clock::now() + write_timeout;
+                    if (cleanup) {
+                        co_await _qp.proxy().send_batchlog_replay_to_all_replicas(mutations, timeout);
+                    } else {
+                        co_await _qp.proxy().send_batchlog_replay_to_all_replicas(std::move(mutations), timeout);
+                    }
+                }
+            }
+        } catch (data_dictionary::no_such_keyspace& ex) {
+            // should probably ignore and drop the batch
+        } catch (const data_dictionary::no_such_column_family&) {
+            // As above -- we should drop the batch if the table doesn't exist anymore.
+        } catch (...) {
+            blogger.warn("Replay failed (will retry): {}", std::current_exception());
+            all_replayed = all_batches_replayed::no;
+            // timeout, overload etc.
+            // Do _not_ remove the batch, assuming we got a node write error.
+            // Since we don't have hints (which origin is satisfied with),
+            // we have to resort to keeping this batch to next lap.
+            if (!cleanup || stage == batchlog_stage::failed_replay) {
+                co_return stop_iteration::no;
+            }
+            send_failed = true;
+        }
+
+        auto& sp = _qp.proxy();
+
+        if (send_failed) {
+            blogger.debug("Moving batch {} to stage failed_replay", id);
+            auto m = get_batchlog_mutation_for(schema, mutations, netw::messaging_service::current_version, batchlog_stage::failed_replay, written_at, id);
+            co_await sp.mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
+        }
+
+        // delete batch
+        auto m = get_batchlog_delete_mutation(schema, netw::messaging_service::current_version, stage, written_at, id);
+        co_await _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
+
+        shard_written_at.need_cleanup = true;
+
        co_return stop_iteration::no;
    };

@@ -580,10 +501,3 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches

    co_return all_replayed;
 }
-
-future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cleanup) {
-    if (_fs.batchlog_v2) {
-        return replay_all_failed_batches_v2(cleanup);
-    }
-    return replay_all_failed_batches_v1(cleanup);
-}
--- a/db/batchlog_manager.hh
+++ b/db/batchlog_manager.hh
@@ -27,12 +27,6 @@ class query_processor;

 } // namespace cql3

-namespace gms {
-
-class feature_service;
-
-} // namespace gms
-
 namespace db {

 class system_keyspace;
@@ -55,11 +49,6 @@ class batchlog_manager : public peering_sharded_service<batchlog_manager> {
 public:
    using post_replay_cleanup = bool_class<class post_replay_cleanup_tag>;

-    struct stats {
-        uint64_t write_attempts = 0;
-    };
-
-
 private:
    static constexpr std::chrono::seconds replay_interval = std::chrono::seconds(60);
    static constexpr uint32_t page_size = 128; // same as HHOM, for now, w/out using any heuristics. TODO: set based on avg batch size.
@@ -67,13 +56,14 @@ private:

    using clock_type = lowres_clock;

-    stats _stats;
+    struct stats {
+        uint64_t write_attempts = 0;
+    } _stats;

    seastar::metrics::metric_groups _metrics;

    cql3::query_processor& _qp;
    db::system_keyspace& _sys_ks;
-    gms::feature_service& _fs;
    db_clock::duration _replay_timeout;
    uint64_t _replay_rate;
    std::chrono::milliseconds _delay;
@@ -94,14 +84,12 @@ private:

    future<> maybe_migrate_v1_to_v2();

-    future<all_batches_replayed> replay_all_failed_batches_v1(post_replay_cleanup cleanup);
-    future<all_batches_replayed> replay_all_failed_batches_v2(post_replay_cleanup cleanup);
    future<all_batches_replayed> replay_all_failed_batches(post_replay_cleanup cleanup);
 public:
    // Takes a QP, not a distributes. Because this object is supposed
    // to be per shard and does no dispatching beyond delegating the the
    // shard qp (which is what you feed here).
-    batchlog_manager(cql3::query_processor&, db::system_keyspace& sys_ks, gms::feature_service& fs, batchlog_manager_config config);
+    batchlog_manager(cql3::query_processor&, db::system_keyspace& sys_ks, batchlog_manager_config config);

    // abort the replay loop and return its future.
    future<> drain();
@@ -114,7 +102,7 @@ public:
        return _last_replay;
    }

-    const stats& get_stats() const {
+    const stats& stats() const {
        return _stats;
    }
 private:
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -1986,13 +1986,13 @@ future<> db::commitlog::segment_manager::replenish_reserve() {
            }
            continue;
        } catch (shutdown_marker&) {
+            _reserve_segments.abort(std::current_exception());
            break;
        } catch (...) {
            clogger.warn("Exception in segment reservation: {}", std::current_exception());
        }
        co_await sleep(100ms);
    }
-    _reserve_segments.abort(std::make_exception_ptr(shutdown_marker()));
 }

 future<std::vector<db::commitlog::descriptor>>
--- a/db/config.cc
+++ b/db/config.cc
@@ -1291,7 +1291,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , ignore_dead_nodes_for_replace(this, "ignore_dead_nodes_for_replace", value_status::Used, "", "List dead nodes to ignore for replace operation using a comma-separated list of host IDs. E.g., scylla --ignore-dead-nodes-for-replace 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c,125ed9f4-7777-1dbn-mac8-43fddce9123e")
    , override_decommission(this, "override_decommission", value_status::Deprecated, false, "Set true to force a decommissioned node to join the cluster (cannot be set if consistent-cluster-management is enabled).")
    , enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, true, "Set true to use enable repair based node operations instead of streaming based.")
-    , allowed_repair_based_node_ops(this, "allowed_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, "replace,removenode,rebuild", "A comma separated list of node operations which are allowed to enable repair based node operations. The operations can be bootstrap, replace, removenode, decommission and rebuild.")
+    , allowed_repair_based_node_ops(this, "allowed_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, "replace,removenode,rebuild,bootstrap,decommission", "A comma separated list of node operations which are allowed to enable repair based node operations. The operations can be bootstrap, replace, removenode, decommission and rebuild.")
    , enable_compacting_data_for_streaming_and_repair(this, "enable_compacting_data_for_streaming_and_repair", liveness::LiveUpdate, value_status::Used, true, "Enable the compacting reader, which compacts the data for streaming and repair (load'n'stream included) before sending it to, or synchronizing it with peers. Can reduce the amount of data to be processed by removing dead data, but adds CPU overhead.")
    , enable_tombstone_gc_for_streaming_and_repair(this, "enable_tombstone_gc_for_streaming_and_repair", liveness::LiveUpdate, value_status::Used, false,
            "If the compacting reader is enabled for streaming and repair (see enable_compacting_data_for_streaming_and_repair), allow it to garbage-collect tombstones."
--- a/db/hints/internal/hint_endpoint_manager.cc
+++ b/db/hints/internal/hint_endpoint_manager.cc
@@ -158,7 +158,7 @@ void hint_endpoint_manager::cancel_draining() noexcept {
    _sender.cancel_draining();
 }

-hint_endpoint_manager::hint_endpoint_manager(const endpoint_id& key, fs::path hint_directory, manager& shard_manager)
+hint_endpoint_manager::hint_endpoint_manager(const endpoint_id& key, fs::path hint_directory, manager& shard_manager, scheduling_group send_sg)
    : _key(key)
    , _shard_manager(shard_manager)
    , _store_gate("hint_endpoint_manager")
@@ -169,7 +169,7 @@ hint_endpoint_manager::hint_endpoint_manager(const endpoint_id& key, fs::path hi
    // Approximate the position of the last written hint by using the same formula as for segment id calculation in commitlog
    // TODO: Should this logic be deduplicated with what is in the commitlog?
    , _last_written_rp(this_shard_id(), std::chrono::duration_cast<std::chrono::milliseconds>(runtime::get_boot_time().time_since_epoch()).count())
-    , _sender(*this, _shard_manager.local_storage_proxy(), _shard_manager.local_db(), _shard_manager.local_gossiper())
+    , _sender(*this, _shard_manager.local_storage_proxy(), _shard_manager.local_db(), _shard_manager.local_gossiper(), send_sg)
 {}

 hint_endpoint_manager::hint_endpoint_manager(hint_endpoint_manager&& other)
--- a/db/hints/internal/hint_endpoint_manager.hh
+++ b/db/hints/internal/hint_endpoint_manager.hh
@@ -63,7 +63,7 @@ private:
    hint_sender _sender;

 public:
-    hint_endpoint_manager(const endpoint_id& key, std::filesystem::path hint_directory, manager& shard_manager);
+    hint_endpoint_manager(const endpoint_id& key, std::filesystem::path hint_directory, manager& shard_manager, scheduling_group send_sg);
    hint_endpoint_manager(hint_endpoint_manager&&);
    ~hint_endpoint_manager();

--- a/db/hints/internal/hint_sender.cc
+++ b/db/hints/internal/hint_sender.cc
@@ -122,7 +122,7 @@ const column_mapping& hint_sender::get_column_mapping(lw_shared_ptr<send_one_fil
    return cm_it->second;
 }

-hint_sender::hint_sender(hint_endpoint_manager& parent, service::storage_proxy& local_storage_proxy,replica::database& local_db, const gms::gossiper& local_gossiper) noexcept
+hint_sender::hint_sender(hint_endpoint_manager& parent, service::storage_proxy& local_storage_proxy,replica::database& local_db, const gms::gossiper& local_gossiper, scheduling_group sg) noexcept
    : _stopped(make_ready_future<>())
    , _ep_key(parent.end_point_key())
    , _ep_manager(parent)
@@ -130,7 +130,7 @@ hint_sender::hint_sender(hint_endpoint_manager& parent, service::storage_proxy&
    , _resource_manager(_shard_manager._resource_manager)
    , _proxy(local_storage_proxy)
    , _db(local_db)
-    , _hints_cpu_sched_group(_db.get_streaming_scheduling_group())
+    , _hints_cpu_sched_group(sg)
    , _gossiper(local_gossiper)
    , _file_update_mutex(_ep_manager.file_update_mutex())
 {}
--- a/db/hints/internal/hint_sender.hh
+++ b/db/hints/internal/hint_sender.hh
@@ -120,7 +120,7 @@ private:
    std::multimap<db::replay_position, lw_shared_ptr<std::optional<promise<>>>> _replay_waiters;

 public:
-    hint_sender(hint_endpoint_manager& parent, service::storage_proxy& local_storage_proxy, replica::database& local_db, const gms::gossiper& local_gossiper) noexcept;
+    hint_sender(hint_endpoint_manager& parent, service::storage_proxy& local_storage_proxy, replica::database& local_db, const gms::gossiper& local_gossiper, scheduling_group sg) noexcept;
    ~hint_sender();

    /// \brief A constructor that should be called from the copy/move-constructor of hint_endpoint_manager.
--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -142,7 +142,7 @@ future<> directory_initializer::ensure_rebalanced() {
 }

 manager::manager(service::storage_proxy& proxy, sstring hints_directory, host_filter filter, int64_t max_hint_window_ms,
-        resource_manager& res_manager, sharded<replica::database>& db)
+        resource_manager& res_manager, sharded<replica::database>& db, scheduling_group sg)
    : _hints_dir(fs::path(hints_directory) / fmt::to_string(this_shard_id()))
    , _host_filter(std::move(filter))
    , _proxy(proxy)
@@ -150,6 +150,7 @@ manager::manager(service::storage_proxy& proxy, sstring hints_directory, host_fi
    , _local_db(db.local())
    , _draining_eps_gate(seastar::format("hints::manager::{}", _hints_dir.native()))
    , _resource_manager(res_manager)
+    , _hints_sending_sched_group(sg)
 {
    if (utils::get_local_injector().enter("decrease_hints_flush_period")) {
        hints_flush_period = std::chrono::seconds{1};
@@ -415,7 +416,7 @@ hint_endpoint_manager& manager::get_ep_manager(const endpoint_id& host_id, const

    try {
        std::filesystem::path hint_directory = hints_dir() / (_uses_host_id ? fmt::to_string(host_id) : fmt::to_string(ip));
-        auto [it, _] = _ep_managers.emplace(host_id, hint_endpoint_manager{host_id, std::move(hint_directory), *this});
+        auto [it, _] = _ep_managers.emplace(host_id, hint_endpoint_manager{host_id, std::move(hint_directory), *this, _hints_sending_sched_group});
        hint_endpoint_manager& ep_man = it->second;

        manager_logger.trace("Created an endpoint manager for {}", host_id);
--- a/db/hints/manager.hh
+++ b/db/hints/manager.hh
@@ -133,6 +133,7 @@ private:

    hint_stats _stats;
    seastar::metrics::metric_groups _metrics;
+    scheduling_group _hints_sending_sched_group;

    // We need to keep a variant here. Before migrating hinted handoff to using host ID, hint directories will
    // still represent IP addresses. But after the migration, they will start representing host IDs.
@@ -155,7 +156,7 @@ private:

 public:
    manager(service::storage_proxy& proxy, sstring hints_directory, host_filter filter,
-            int64_t max_hint_window_ms, resource_manager& res_manager, sharded<replica::database>& db);
+            int64_t max_hint_window_ms, resource_manager& res_manager, sharded<replica::database>& db, scheduling_group sg);

    manager(const manager&) = delete;
    manager& operator=(const manager&) = delete;
--- a/db/schema_applier.cc
+++ b/db/schema_applier.cc
@@ -1139,17 +1139,14 @@ future<> schema_applier::finalize_tables_and_views() {
    // was already dropped (see https://github.com/scylladb/scylla/issues/5614)
    for (auto& dropped_view : diff.tables_and_views.local().views.dropped) {
        auto s = dropped_view.get();
-        co_await _ss.local().on_cleanup_for_drop_table(s->id());
        co_await replica::database::cleanup_drop_table_on_all_shards(sharded_db, _sys_ks, true, diff.table_shards[s->id()]);
    }
    for (auto& dropped_table : diff.tables_and_views.local().tables.dropped) {
        auto s = dropped_table.get();
-        co_await _ss.local().on_cleanup_for_drop_table(s->id());
        co_await replica::database::cleanup_drop_table_on_all_shards(sharded_db, _sys_ks, true, diff.table_shards[s->id()]);
    }
    for (auto& dropped_cdc : diff.tables_and_views.local().cdc.dropped) {
        auto s = dropped_cdc.get();
-        co_await _ss.local().on_cleanup_for_drop_table(s->id());
        co_await replica::database::cleanup_drop_table_on_all_shards(sharded_db, _sys_ks, true, diff.table_shards[s->id()]);
    }

--- a/db/system_keyspace.cc
+++ b/db/system_keyspace.cc
@@ -1714,9 +1714,7 @@ std::unordered_set<dht::token> decode_tokens(const set_type_impl::native_type& t
    std::unordered_set<dht::token> tset;
    for (auto& t: tokens) {
        auto str = value_cast<sstring>(t);
-        if (str != dht::token::from_sstring(str).to_sstring()) {
-            on_internal_error(slogger, format("decode_tokens: invalid token string '{}'", str));
-        }
+        SCYLLA_ASSERT(str == dht::token::from_sstring(str).to_sstring());
        tset.insert(dht::token::from_sstring(str));
    }
    return tset;
@@ -3193,7 +3191,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
                    };
                }
            } else if (must_have_tokens(nstate)) {
-                on_internal_error(slogger, format(
+                on_fatal_internal_error(slogger, format(
                        "load_topology_state: node {} in {} state but missing ring slice", host_id, nstate));
            }
        }
@@ -3275,7 +3273,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
            // Currently, at most one node at a time can be in transitioning state.
            if (!map->empty()) {
                const auto& [other_id, other_rs] = *map->begin();
-                on_internal_error(slogger, format(
+                on_fatal_internal_error(slogger, format(
                    "load_topology_state: found two nodes in transitioning state: {} in {} state and {} in {} state",
                    other_id, other_rs.state, host_id, nstate));
            }
@@ -3333,7 +3331,8 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
                format("SELECT count(range_end) as cnt FROM {}.{} WHERE key = '{}' AND id = ?",
                        NAME, CDC_GENERATIONS_V3, cdc::CDC_GENERATIONS_V3_KEY),
                gen_id.id);
-            if (!gen_rows || gen_rows->empty()) {
+            SCYLLA_ASSERT(gen_rows);
+            if (gen_rows->empty()) {
                on_internal_error(slogger, format(
                    "load_topology_state: last committed CDC generation time UUID ({}) present, but data missing", gen_id.id));
            }
--- a/db/system_keyspace.hh
+++ b/db/system_keyspace.hh
@@ -215,8 +215,6 @@ public:
    static constexpr auto BUILT_VIEWS = "built_views";
    static constexpr auto SCYLLA_VIEWS_BUILDS_IN_PROGRESS = "scylla_views_builds_in_progress";
    static constexpr auto CDC_LOCAL = "cdc_local";
-    static constexpr auto CDC_TIMESTAMPS = "cdc_timestamps";
-    static constexpr auto CDC_STREAMS = "cdc_streams";

    // auth
    static constexpr auto ROLES = "roles";
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -23,6 +23,7 @@

 #include <seastar/core/future-util.hh>
 #include <seastar/core/coroutine.hh>
+#include <seastar/coroutine/all.hh>
 #include <seastar/coroutine/maybe_yield.hh>
 #include <flat_map>

@@ -65,6 +66,7 @@
 #include "mutation/timestamp.hh"
 #include "utils/assert.hh"
 #include "utils/small_vector.hh"
+#include "view_builder.hh"
 #include "view_info.hh"
 #include "view_update_checks.hh"
 #include "types/list.hh"
@@ -2238,12 +2240,20 @@ void view_builder::setup_metrics() {
 }

 future<> view_builder::start_in_background(service::migration_manager& mm, utils::cross_shard_barrier barrier) {
+    auto step_fiber = make_ready_future<>();
    try {
        view_builder_init_state vbi;
        auto fail = defer([&barrier] mutable { barrier.abort(); });
-        // Guard the whole startup routine with a semaphore,
-        // so that it's not intercepted by `on_drop_view`, `on_create_view`
-        // or `on_update_view` events.
+        // Semaphore usage invariants:
+        // - One unit of _sem serializes all per-shard bookkeeping that mutates view-builder state
+        //   (_base_to_build_step, _built_views, build_status, reader resets).
+        // - The unit is held for the whole operation, including the async chain, until the state
+        //   is stable for the next operation on that shard.
+        // - Cross-shard operations acquire _sem on shard 0 for the duration of the broadcast.
+        //   Other shards acquire their own _sem only around their local handling; shard 0 skips
+        //   the local acquire because it already holds the unit from the dispatcher.
+        // Guard the whole startup routine with a semaphore so that it's not intercepted by
+        // `on_drop_view`, `on_create_view`, or `on_update_view` events.
        auto units = co_await get_units(_sem, view_builder_semaphore_units);
        // Wait for schema agreement even if we're a seed node.
        co_await mm.wait_for_schema_agreement(_db, db::timeout_clock::time_point::max(), &_as);
@@ -2264,8 +2274,10 @@ future<> view_builder::start_in_background(service::migration_manager& mm, utils
        _mnotifier.register_listener(this);
        co_await calculate_shard_build_step(vbi);
        _current_step = _base_to_build_step.begin();
-        // Waited on indirectly in stop().
-        (void)_build_step.trigger();
+
+        // If the preparation above fails, run_in_background() is never invoked;
+        // start_in_background() just logs the failure and resolves.
+        step_fiber = run_in_background();
    } catch (...) {
        auto ex = std::current_exception();
        auto ll = log_level::error;
@@ -2280,10 +2292,12 @@ future<> view_builder::start_in_background(service::migration_manager& mm, utils
        }
        vlogger.log(ll, "start aborted: {}", ex);
    }
+
+    co_await std::move(step_fiber);
 }

 future<> view_builder::start(service::migration_manager& mm, utils::cross_shard_barrier barrier) {
-    _started = start_in_background(mm, std::move(barrier));
+    _step_fiber = start_in_background(mm, std::move(barrier));
    return make_ready_future<>();
 }

@@ -2293,12 +2307,12 @@ future<> view_builder::drain() {
    }
    vlogger.info("Draining view builder");
    _as.request_abort();
-    co_await std::move(_started);
    co_await _mnotifier.unregister_listener(this);
    co_await _vug.drain();
    co_await _sem.wait();
    _sem.broken();
-    co_await _build_step.join();
+    _build_step.broken();
+    co_await std::move(_step_fiber);
    co_await coroutine::parallel_for_each(_base_to_build_step, [] (std::pair<const table_id, build_step>& p) {
        return p.second.reader.close();
    });
@@ -2667,63 +2681,59 @@ static bool should_ignore_tablet_keyspace(const replica::database& db, const sst
    return db.features().view_building_coordinator && db.has_keyspace(ks_name) && db.find_keyspace(ks_name).uses_tablets();
 }

-future<> view_builder::dispatch_create_view(sstring ks_name, sstring view_name) {
-    if (should_ignore_tablet_keyspace(_db, ks_name)) {
-        return make_ready_future<>();
-    }
-    return with_semaphore(_sem, view_builder_semaphore_units, [this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
-        // This runs on shard 0 only; seed the global rows before broadcasting.
-        return handle_seed_view_build_progress(ks_name, view_name).then([this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
-            return container().invoke_on_all([ks_name = std::move(ks_name), view_name = std::move(view_name)] (view_builder& vb) mutable {
-                return vb.handle_create_view_local(std::move(ks_name), std::move(view_name));
-            });
-        });
-    });
+future<view_builder::view_builder_units> view_builder::get_or_adopt_view_builder_lock(view_builder_units_opt units) {
+    co_return units ? std::move(*units) : co_await get_units(_sem, view_builder_semaphore_units);
 }

-future<> view_builder::handle_seed_view_build_progress(sstring ks_name, sstring view_name) {
+future<> view_builder::dispatch_create_view(sstring ks_name, sstring view_name) {
+    if (should_ignore_tablet_keyspace(_db, ks_name)) {
+        co_return;
+    }
+
+    auto units = co_await get_or_adopt_view_builder_lock(std::nullopt);
+    co_await handle_seed_view_build_progress(ks_name, view_name);
+
+    co_await coroutine::all(
+        [this, ks_name, view_name, units = std::move(units)] mutable -> future<> {
+            co_await handle_create_view_local(ks_name, view_name, std::move(units)); },
+        [this, ks_name, view_name] mutable -> future<> {
+            co_await container().invoke_on_others([ks_name = std::move(ks_name), view_name = std::move(view_name)] (view_builder& vb) mutable -> future<> {
+                return vb.handle_create_view_local(ks_name, view_name, std::nullopt); }); });
+}
+
+future<> view_builder::handle_seed_view_build_progress(const sstring& ks_name, const sstring& view_name) {
    auto view = view_ptr(_db.find_schema(ks_name, view_name));
    auto& step = get_or_create_build_step(view->view_info()->base_id());
    return _sys_ks.register_view_for_building_for_all_shards(view->ks_name(), view->cf_name(), step.current_token());
 }

-future<> view_builder::handle_create_view_local(sstring ks_name, sstring view_name){
-    if (this_shard_id() == 0) { 
-        return handle_create_view_local_impl(std::move(ks_name), std::move(view_name));
-    } else {
-        return with_semaphore(_sem, view_builder_semaphore_units, [this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
-            return handle_create_view_local_impl(std::move(ks_name), std::move(view_name));
-        });
-    }
-}
-
-future<> view_builder::handle_create_view_local_impl(sstring ks_name, sstring view_name) {
+future<> view_builder::handle_create_view_local(const sstring& ks_name, const sstring& view_name, view_builder_units_opt units) {
+    [[maybe_unused]] auto sem_units = co_await get_or_adopt_view_builder_lock(std::move(units));
    auto view = view_ptr(_db.find_schema(ks_name, view_name));
    auto& step = get_or_create_build_step(view->view_info()->base_id());
-    return when_all(step.base->await_pending_writes(), step.base->await_pending_streams()).discard_result().then([this, &step] {
-        return flush_base(step.base, _as);
-    }).then([this, view, &step] () {
+    try {
+        co_await coroutine::all(
+            [&step] -> future<> {
+                co_await step.base->await_pending_writes(); },
+            [&step] -> future<> {
+                co_await step.base->await_pending_streams(); });
+        co_await flush_base(step.base, _as);
+    
        // This resets the build step to the current token. It may result in views currently
        // being built to receive duplicate updates, but it simplifies things as we don't have
        // to keep around a list of new views to build the next time the reader crosses a token
        // threshold.
-        return initialize_reader_at_current_token(step).then([this, view, &step] () mutable {
-            return add_new_view(view, step);
-        }).then_wrapped([this, view] (future<>&& f) {
-            try {
-                f.get();
-            } catch (abort_requested_exception&) {
-                vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
-            } catch (raft::request_aborted&) {
-                vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
-            } catch (...) {
-                vlogger.error("Error setting up view for building {}.{}: {}", view->ks_name(), view->cf_name(), std::current_exception());
-            }
+        co_await initialize_reader_at_current_token(step);
+        co_await add_new_view(view, step);
+    } catch (abort_requested_exception&) {
+        vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
+    } catch (raft::request_aborted&) {
+        vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
+    } catch (...) {
+        vlogger.error("Error setting up view for building {}.{}: {}", view->ks_name(), view->cf_name(), std::current_exception());
+    }

-            // Waited on indirectly in stop().
-            static_cast<void>(_build_step.trigger());
-        });
-    });
+    _build_step.signal();
 }

 void view_builder::on_create_view(const sstring& ks_name, const sstring& view_name) {
@@ -2760,62 +2770,55 @@ void view_builder::on_update_view(const sstring& ks_name, const sstring& view_na

 future<> view_builder::dispatch_drop_view(sstring ks_name, sstring view_name) {
    if (should_ignore_tablet_keyspace(_db, ks_name)) {
-        return make_ready_future<>();
+        co_return;
    }

-    return with_semaphore(_sem, view_builder_semaphore_units, [this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
-        // This runs on shard 0 only; broadcast local cleanup before global cleanup.
-        return container().invoke_on_all([ks_name, view_name] (view_builder& vb) mutable {
-            return vb.handle_drop_view_local(std::move(ks_name), std::move(view_name));
-        }).then([this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
-            return handle_drop_view_global_cleanup(std::move(ks_name), std::move(view_name));
-        });
-    });
+    auto units = co_await get_or_adopt_view_builder_lock(std::nullopt);
+
+    co_await coroutine::all(
+        [this, ks_name, view_name, units = std::move(units)] mutable -> future<> {
+            co_await handle_drop_view_local(ks_name, view_name, std::move(units)); },
+        [this, ks_name, view_name] mutable -> future<> {
+            co_await container().invoke_on_others([ks_name = std::move(ks_name), view_name = std::move(view_name)] (view_builder& vb) mutable -> future<> {
+                return vb.handle_drop_view_local(ks_name, view_name, std::nullopt); });});
+    co_await handle_drop_view_global_cleanup(ks_name, view_name);
 }

-future<> view_builder::handle_drop_view_local(sstring ks_name, sstring view_name) {
-    if (this_shard_id() == 0) { 
-        return handle_drop_view_local_impl(std::move(ks_name), std::move(view_name));
-    } else {
-        return with_semaphore(_sem, view_builder_semaphore_units, [this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
-            return handle_drop_view_local_impl(std::move(ks_name), std::move(view_name));
-        });
-    }
-}
-
-future<> view_builder::handle_drop_view_local_impl(sstring ks_name, sstring view_name) {
+future<> view_builder::handle_drop_view_local(const sstring& ks_name, const sstring& view_name, view_builder_units_opt units) {
+    [[maybe_unused]] auto sem_units = co_await get_or_adopt_view_builder_lock(std::move(units));
    vlogger.info0("Stopping to build view {}.{}", ks_name, view_name);
-    // The view is absent from the database at this point, so find it by brute force.
-    ([&, this] {
-        for (auto& [_, step] : _base_to_build_step) {
-            if (step.build_status.empty() || step.build_status.front().view->ks_name() != ks_name) {
-                continue;
-            }
-            for (auto it = step.build_status.begin(); it != step.build_status.end(); ++it) {
-                if (it->view->cf_name() == view_name) {
-                    _built_views.erase(it->view->id());
-                    step.build_status.erase(it);
-                    return;
-                }
+
+    for (auto& [_, step] : _base_to_build_step) {
+        if (step.build_status.empty() || step.build_status.front().view->ks_name() != ks_name) {
+            continue;
+        }
+        for (auto it = step.build_status.begin(); it != step.build_status.end(); ++it) {
+            if (it->view->cf_name() == view_name) {
+                _built_views.erase(it->view->id());
+                step.build_status.erase(it);
+                co_return;
            }
        }
-    })();
-    return make_ready_future<>();  
+    }
 }

-future<> view_builder::handle_drop_view_global_cleanup(sstring ks_name, sstring view_name) {
+future<> view_builder::handle_drop_view_global_cleanup(const sstring& ks_name, const sstring& view_name) {
    if (this_shard_id() != 0) {
-        return make_ready_future<>();
+        co_return;
    }
    vlogger.info0("Starting view global cleanup {}.{}", ks_name, view_name);
-    return when_all_succeed(
-                _sys_ks.remove_view_build_progress_across_all_shards(ks_name, view_name),
-                _sys_ks.remove_built_view(ks_name, view_name),
-                remove_view_build_status(ks_name, view_name))
-                    .discard_result()
-                    .handle_exception([ks_name, view_name] (std::exception_ptr ep) {
-        vlogger.warn("Failed to cleanup view {}.{}: {}", ks_name, view_name, ep);
-    });
+    
+    try {
+        co_await coroutine::all(
+            [this, &ks_name, &view_name] -> future<>  {
+                co_await _sys_ks.remove_view_build_progress_across_all_shards(ks_name, view_name); },
+            [this, &ks_name, &view_name] -> future<>  {
+                co_await _sys_ks.remove_built_view(ks_name, view_name); },
+            [this, &ks_name, &view_name] -> future<>  {
+                co_await remove_view_build_status(ks_name, view_name); });
+    } catch (...) {
+        vlogger.warn("Failed to cleanup view {}.{}: {}", ks_name, view_name, std::current_exception());
+    }
 }

 void view_builder::on_drop_view(const sstring& ks_name, const sstring& view_name) {
@@ -2829,14 +2832,15 @@ void view_builder::on_drop_view(const sstring& ks_name, const sstring& view_name
    }));
 }

-future<> view_builder::do_build_step() {
-    // Run the view building in the streaming scheduling group
-    // so that it doesn't impact other tasks with higher priority.
-    seastar::thread_attributes attr;
-    attr.sched_group = _db.get_streaming_scheduling_group();
-    return seastar::async(std::move(attr), [this] {
+future<> view_builder::run_in_background() {
+    return seastar::async([this] {
        exponential_backoff_retry r(1s, 1min);
-        while (!_base_to_build_step.empty() && !_as.abort_requested()) {
+        while (!_as.abort_requested()) {
+            try {
+                _build_step.wait([this] { return !_base_to_build_step.empty(); }).get();
+            } catch (const seastar::broken_condition_variable&) {
+                return;
+            }
            auto units = get_units(_sem, view_builder_semaphore_units).get();
            ++_stats.steps_performed;
            try {
--- a/db/view/view_builder.hh
+++ b/db/view/view_builder.hh
@@ -11,13 +11,13 @@
 #include "query/query-request.hh"
 #include "service/migration_listener.hh"
 #include "service/raft/raft_group0_client.hh"
-#include "utils/serialized_action.hh"
 #include "utils/cross-shard-barrier.hh"
 #include "replica/database.hh"

 #include <seastar/core/abort_source.hh>
 #include <seastar/core/future.hh>
 #include <seastar/core/semaphore.hh>
+#include <seastar/core/condition-variable.hh>
 #include <seastar/core/sharded.hh>
 #include <seastar/core/shared_future.hh>
 #include <seastar/core/shared_ptr.hh>
@@ -104,6 +104,12 @@ class view_update_generator;
 *            redo the missing step, for simplicity.
 */
 class view_builder final : public service::migration_listener::only_view_notifications, public seastar::peering_sharded_service<view_builder> {
+    // Alias for the semaphore units type used throughout the class.
+    using view_builder_units = semaphore_units<named_semaphore_exception_factory>;
+
+    // Alias for optional semaphore units (empty when no units are being passed along).
+    using view_builder_units_opt = std::optional<view_builder_units>;
+
    /**
     * Keeps track of the build progress for a particular view.
     * When the view is built, next_token == first_token.
@@ -168,14 +174,24 @@ class view_builder final : public service::migration_listener::only_view_notific
    reader_permit _permit;
    base_to_build_step_type _base_to_build_step;
    base_to_build_step_type::iterator _current_step = _base_to_build_step.end();
-    serialized_action _build_step{std::bind(&view_builder::do_build_step, this)};
+    condition_variable _build_step;
    static constexpr size_t view_builder_semaphore_units = 1;
    // Ensures bookkeeping operations are serialized, meaning that while we execute
    // a build step we don't consider newly added or removed views. This simplifies
    // the algorithms. Also synchronizes an operation wrt. a call to stop().
+    // Semaphore usage invariants:
+    // - One unit of _sem serializes all per-shard bookkeeping that mutates view-builder state
+    //   (_base_to_build_step, _built_views, build_status, reader resets).
+    // - The unit is held for the whole operation, including the async chain, until the state
+    //   is stable for the next operation on that shard.
+    // - Cross-shard operations acquire _sem on shard 0 for the duration of the broadcast.
+    //   Other shards acquire their own _sem only around their local handling; shard 0 skips
+    //   the local acquire because it already holds the unit from the dispatcher.
+    // The whole startup routine also holds this semaphore so that it's not intercepted by
+    // `on_drop_view`, `on_create_view`, or `on_update_view` events.
    seastar::named_semaphore _sem{view_builder_semaphore_units, named_semaphore_exception_factory{"view builder"}};
    seastar::abort_source _as;
-    future<> _started = make_ready_future<>();
+    future<> _step_fiber = make_ready_future<>();
    // Used to coordinate between shards the conclusion of the build process for a particular view.
    std::unordered_set<table_id> _built_views;
    // Used for testing.
@@ -262,19 +278,18 @@ private:
    void setup_shard_build_step(view_builder_init_state& vbi, std::vector<system_keyspace_view_name>, std::vector<system_keyspace_view_build_progress>);
    future<> calculate_shard_build_step(view_builder_init_state& vbi);
    future<> add_new_view(view_ptr, build_step&);
-    future<> do_build_step();
+    future<> run_in_background();
    void execute(build_step&, exponential_backoff_retry);
    future<> maybe_mark_view_as_built(view_ptr, dht::token);
    future<> mark_as_built(view_ptr);
    void setup_metrics();
    future<> dispatch_create_view(sstring ks_name, sstring view_name);
    future<> dispatch_drop_view(sstring ks_name, sstring view_name);
-    future<> handle_seed_view_build_progress(sstring ks_name, sstring view_name);
-    future<> handle_create_view_local(sstring ks_name, sstring view_name);
-    future<> handle_drop_view_local(sstring ks_name, sstring view_name);
-    future<> handle_create_view_local_impl(sstring ks_name, sstring view_name);
-    future<> handle_drop_view_local_impl(sstring ks_name, sstring view_name);
-    future<> handle_drop_view_global_cleanup(sstring ks_name, sstring view_name);
+    future<> handle_seed_view_build_progress(const sstring& ks_name, const sstring& view_name);
+    future<> handle_create_view_local(const sstring& ks_name, const sstring& view_name, view_builder_units_opt units);
+    future<> handle_drop_view_local(const sstring& ks_name, const sstring& view_name, view_builder_units_opt units);
+    future<> handle_drop_view_global_cleanup(const sstring& ks_name, const sstring& view_name);
+    future<view_builder_units> get_or_adopt_view_builder_lock(view_builder_units_opt units);

    template <typename Func1, typename Func2>
    future<> write_view_build_status(Func1&& fn_group0, Func2&& fn_sys_dist) {
--- a/db/view/view_building_worker.cc
+++ b/db/view/view_building_worker.cc
@@ -242,7 +242,7 @@ future<> view_building_worker::create_staging_sstable_tasks() {
                utils::UUID_gen::get_time_UUID(), view_building_task::task_type::process_staging, false,
                table_id, ::table_id{}, {my_host_id, sst_info.shard}, sst_info.last_token
            };
-            auto mut = co_await _group0.client().sys_ks().make_view_building_task_mutation(guard.write_timestamp(), task);
+            auto mut = co_await _sys_ks.make_view_building_task_mutation(guard.write_timestamp(), task);
            cmuts.emplace_back(std::move(mut));
        }
    }
@@ -386,7 +386,6 @@ future<> view_building_worker::update_built_views() {
        auto schema = _db.find_schema(table_id);
        return std::make_pair(schema->ks_name(), schema->cf_name());
    };
-    auto& sys_ks = _group0.client().sys_ks();

    std::set<std::pair<sstring, sstring>> built_views;
    for (auto& [id, statuses]: _vb_state_machine.views_state.status_map) {
@@ -395,22 +394,22 @@ future<> view_building_worker::update_built_views() {
        }
    }

-    auto local_built = co_await sys_ks.load_built_views() | std::views::filter([&] (auto& v) {
+    auto local_built = co_await _sys_ks.load_built_views() | std::views::filter([&] (auto& v) {
        return !_db.has_keyspace(v.first) || _db.find_keyspace(v.first).uses_tablets();
    }) | std::ranges::to<std::set>();

    // Remove dead entries
    for (auto& view: local_built) {
        if (!built_views.contains(view)) {
-            co_await sys_ks.remove_built_view(view.first, view.second);
+            co_await _sys_ks.remove_built_view(view.first, view.second);
        }
    }

    // Add new entries
    for (auto& view: built_views) {
        if (!local_built.contains(view)) {
-            co_await sys_ks.mark_view_as_built(view.first, view.second);
-            co_await sys_ks.remove_view_build_progress_across_all_shards(view.first, view.second);
+            co_await _sys_ks.mark_view_as_built(view.first, view.second);
+            co_await _sys_ks.remove_view_build_progress_across_all_shards(view.first, view.second);
        }
    }
 }
--- a/db/virtual_tables.cc
+++ b/db/virtual_tables.cc
@@ -1345,8 +1345,8 @@ public:

 private:
    static schema_ptr build_schema() {
-        auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::CDC_TIMESTAMPS);
-        return schema_builder(system_keyspace::NAME, system_keyspace::CDC_TIMESTAMPS, std::make_optional(id))
+        auto id = generate_legacy_id(system_keyspace::NAME, "cdc_timestamps");
+        return schema_builder(system_keyspace::NAME, "cdc_timestamps", std::make_optional(id))
            .with_column("keyspace_name", utf8_type, column_kind::partition_key)
            .with_column("table_name", utf8_type, column_kind::partition_key)
            .with_column("timestamp", reversed_type_impl::get_instance(timestamp_type), column_kind::clustering_key)
@@ -1428,8 +1428,8 @@ public:
    }
 private:
    static schema_ptr build_schema() {
-        auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::CDC_STREAMS);
-        return schema_builder(system_keyspace::NAME, system_keyspace::CDC_STREAMS, std::make_optional(id))
+        auto id = generate_legacy_id(system_keyspace::NAME, "cdc_streams");
+        return schema_builder(system_keyspace::NAME, "cdc_streams", std::make_optional(id))
            .with_column("keyspace_name", utf8_type, column_kind::partition_key)
            .with_column("table_name", utf8_type, column_kind::partition_key)
            .with_column("timestamp", timestamp_type, column_kind::clustering_key)
--- a/debug.cc
+++ b/debug.cc
@@ -11,5 +11,6 @@
 namespace debug {

 seastar::sharded<replica::database>* volatile the_database = nullptr;
+seastar::scheduling_group streaming_scheduling_group;

 }
--- a/debug.hh
+++ b/debug.hh
@@ -17,7 +17,7 @@ class database;
 namespace debug {

 extern seastar::sharded<replica::database>* volatile the_database;
-
+extern seastar::scheduling_group streaming_scheduling_group;

 }

--- a/dht/decorated_key.hh
+++ b/dht/decorated_key.hh
@@ -30,11 +30,13 @@ namespace dht {
 // Total ordering defined by comparators is compatible with Origin's ordering.
 class decorated_key {
 public:
-    dht::token _token;
+    // Store only the token data as int64_t to avoid the bloat of storing
+    // token_kind, which is always token_kind::key for decorated_key.
+    int64_t _token_data;
    partition_key _key;

    decorated_key(dht::token t, partition_key k)
-        : _token(std::move(t))
+        : _token_data(t._data)
        , _key(std::move(k)) {
    }

@@ -56,8 +58,8 @@ public:
    std::strong_ordering tri_compare(const schema& s, const decorated_key& other) const;
    std::strong_ordering tri_compare(const schema& s, const ring_position& other) const;

-    const dht::token& token() const noexcept {
-        return _token;
+    dht::token token() const noexcept {
+        return dht::token(_token_data);
    }

    const partition_key& key() const {
@@ -65,7 +67,7 @@ public:
    }

    size_t external_memory_usage() const {
-        return _key.external_memory_usage() + _token.external_memory_usage();
+        return _key.external_memory_usage();
    }

    size_t memory_usage() const {
@@ -102,6 +104,6 @@ template <> struct fmt::formatter<dht::decorated_key> {
    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
    template <typename FormatContext>
    auto format(const dht::decorated_key& dk, FormatContext& ctx) const {
-        return fmt::format_to(ctx.out(), "{{key: {}, token: {}}}", dk._key, dk._token);
+        return fmt::format_to(ctx.out(), "{{key: {}, token: {}}}", dk._key, dk.token());
    }
 };
--- a/dht/i_partitioner.cc
+++ b/dht/i_partitioner.cc
@@ -95,7 +95,7 @@ std::unique_ptr<dht::i_partitioner> make_partitioner(sstring partitioner_name) {

 bool
 decorated_key::equal(const schema& s, const decorated_key& other) const {
-    if (_token == other._token) {
+    if (_token_data == other._token_data) {
        return _key.legacy_equal(s, other._key);
    }
    return false;
@@ -103,7 +103,7 @@ decorated_key::equal(const schema& s, const decorated_key& other) const {

 std::strong_ordering
 decorated_key::tri_compare(const schema& s, const decorated_key& other) const {
-    auto r = _token <=> other._token;
+    auto r = _token_data <=> other._token_data;
    if (r != 0) {
        return r;
    } else {
@@ -113,13 +113,24 @@ decorated_key::tri_compare(const schema& s, const decorated_key& other) const {

 std::strong_ordering
 decorated_key::tri_compare(const schema& s, const ring_position& other) const {
-    auto r = _token <=> other.token();
-    if (r != 0) {
-        return r;
-    } else if (other.has_key()) {
-        return _key.legacy_tri_compare(s, *other.key());
+    // decorated_key tokens are always of token_kind::key, so we need to
+    // account for ring_position tokens that might be before_all_keys or after_all_keys.
+    const auto& other_token = other.token();
+    if (other_token._kind == token_kind::key) [[likely]] {
+        auto r = _token_data <=> other_token._data;
+        if (r != 0) {
+            return r;
+        } else if (other.has_key()) {
+            return _key.legacy_tri_compare(s, *other.key());
+        }
+        return 0 <=> other.relation_to_keys();
+    } else if (other_token._kind == token_kind::before_all_keys) {
+        // decorated_key (token_kind::key) > before_all_keys
+        return std::strong_ordering::greater;
+    } else {
+        // decorated_key (token_kind::key) < after_all_keys
+        return std::strong_ordering::less;
    }
-    return 0 <=> other.relation_to_keys();
 }

 bool
--- a/dht/ring_position.hh
+++ b/dht/ring_position.hh
@@ -93,12 +93,12 @@ public:
    { }

    ring_position(const dht::decorated_key& dk)
-        : _token(dk._token)
+        : _token(dk.token())
        , _key(std::make_optional(dk._key))
    { }

    ring_position(dht::decorated_key&& dk)
-        : _token(std::move(dk._token))
+        : _token(dk.token())
        , _key(std::make_optional(std::move(dk._key)))
    { }

--- a/dist/docker/redhat/build_docker.sh
+++ b/dist/docker/redhat/build_docker.sh
@@ -97,9 +97,7 @@ bcp LICENSE-ScyllaDB-Source-Available.md /licenses/

 run microdnf clean all
 run microdnf --setopt=tsflags=nodocs -y update
-run microdnf --setopt=tsflags=nodocs -y install hostname kmod procps-ng python3 python3-pip cpio
-# Extract only systemctl binary from systemd package to avoid installing the whole systemd in the container.
-run bash -rc "microdnf download systemd && rpm2cpio systemd-*.rpm | cpio -idmv ./usr/bin/systemctl && rm -rf systemd-*.rpm"
+run microdnf --setopt=tsflags=nodocs -y install hostname kmod procps-ng python3 python3-pip
 run curl -L --output /etc/yum.repos.d/scylla.repo ${repo_file_url}
 run pip3 install --no-cache-dir --prefix /usr supervisor
 run bash -ec "echo LANG=C.UTF-8 > /etc/locale.conf"
@@ -108,8 +106,6 @@ run bash -ec "cat /scylla_bashrc >> /etc/bash.bashrc"
 run mkdir -p /var/log/scylla
 run chown -R scylla:scylla /var/lib/scylla
 run sed -i -e 's/^SCYLLA_ARGS=".*"$/SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --network-stack posix"/' /etc/sysconfig/scylla-server
-# Cleanup packages not needed in the final image and clean package manager cache to reduce image size.
-run bash -rc "microdnf remove -y cpio && microdnf clean all"

 run mkdir -p /opt/scylladb/supervisor
 run touch /opt/scylladb/SCYLLA-CONTAINER-FILE
--- a/docs/alternator/alternator.md
+++ b/docs/alternator/alternator.md
@@ -142,6 +142,10 @@ want modify a non-top-level attribute directly (e.g., a.b[3].c) need RMW:
 Alternator implements such requests by reading the entire top-level
 attribute a, modifying only a.b[3].c, and then writing back a.

+Currently, Alternator doesn't use Tablets. That's because Alternator relies
+on LWT (lightweight transactions), and LWT is not supported in keyspaces
+with Tablets enabled.
+
 ```{eval-rst}
 .. toctree::
    :maxdepth: 2
--- a/docs/architecture/tablets.rst
+++ b/docs/architecture/tablets.rst
@@ -187,23 +187,6 @@ You can create a keyspace with tablets enabled with the ``tablets = {'enabled':
    the keyspace schema with ``tablets = { 'enabled': false }`` or 
    ``tablets = { 'enabled': true }``.

-.. _keyspace-rf-rack-valid-to-enforce-rack-list:
-
-Enforcing Rack-List Replication for Tablet Keyspaces
------------------------------------------------------------------
-
-The ``rf_rack_valid_keyspaces`` is a legacy option that ensures that all keyspaces with tablets enabled are
-:term:`RF-rack-valid <RF-rack-valid keyspace>`.
-
-Requiring every tablet keyspace to use the rack list replication factor exclusively is enough to guarantee the keyspace is
-:term:`RF-rack-valid <RF-rack-valid keyspace>`. It reduces restrictions and provides stronger guarantees compared
-to ``rf_rack_valid_keyspaces`` option.
-
-To enforce rack list in tablet keyspaces, use ``enforce_rack_list`` option. It can be set only if all tablet keyspaces use
-rack list. To ensure that, follow a procedure of :ref:`conversion to rack list replication factor <conversion-to-rack-list-rf>`.
-After that restart all nodes in the cluster, with ``enforce_rack_list`` enabled and ``rf_rack_valid_keyspaces`` disabled. Make
-sure to avoid setting or updating replication factor (with CREATE KEYSPACE or ALTER KEYSPACE) while nodes are being restarted.
-
 .. _tablets-limitations:

 Limitations and Unsupported Features
--- a/docs/cql/ddl.rst
+++ b/docs/cql/ddl.rst
@@ -200,6 +200,8 @@ for two cases. One is setting replication factor to 0, in which case the number
 The other is when the numeric replication factor is equal to the current number of replicas
 for a given datacanter, in which case the current rack list is preserved.

+Altering from a numeric replication factor to a rack list is not supported yet.
+
 Note that when ``ALTER`` ing keyspaces and supplying ``replication_factor``,
 auto-expansion will only *add* new datacenters for safety, it will not alter
 existing datacenters or remove any even if they are no longer in the cluster.
@@ -422,21 +424,6 @@ Altering from a rack list to a numeric replication factor is not supported.

 Keyspaces which use rack lists are :term:`RF-rack-valid <RF-rack-valid keyspace>` if each rack in the rack list contains at least one node (excluding :doc:`zero-token nodes </architecture/zero-token-nodes>`).

-.. _conversion-to-rack-list-rf:
-
-Conversion to rack-list replication factor
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-To migrate a keyspace from a numeric replication factor to a rack-list replication factor, provide the rack-list replication factor explicitly in ALTER KEYSPACE statement. The number of racks in the list must be equal to the numeric replication factor. The replication factor can be converted in any number of DCs at once. In a statement that converts replication factor, no replication factor updates (increase or decrease) are allowed in any DC.
-
-.. code-block:: cql
-
-  CREATE KEYSPACE Excelsior
-   WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : 3, 'dc2' : 1} AND tablets = { 'enabled': true };
-
-  ALTER KEYSPACE Excelsior
-   WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : ['RAC1', 'RAC2', 'RAC3'], 'dc2' : ['RAC4']} AND tablets = { 'enabled': true };
-
 .. _drop-keyspace-statement:

 DROP KEYSPACE
@@ -1039,7 +1026,29 @@ You can enable the after-repair tombstone GC by setting the ``repair`` mode usin

    ALTER TABLE ks.cf WITH tombstone_gc = {'mode':'repair'} ;

-The following modes are available:
+To support writes arriving out-of-order -- either due to natural delays, or user provided timestamps -- the repair mode has a propagation delay.
+Out-of-order writes present a problem for repair mode tombstone gc. Consider the following example sequence of events:
+
+1) Write ``DELETE FROM table WHERE key = K1`` arrives at the node.
+2) Repair is run.
+3) Compaction runs and garbage collects the tombstone for ``key = K1``.
+4) Write ``INSERT INTO table (key, ...) VALUES (K1, ...)`` arrives at the node with timestamp smaller than that of the delete. The tombstone for ``key = K1`` should apply to this write, but it is already garbage collected, so this data is resurrected.
+
+Propagation delay solves this problem by establishing a window before repair, where tombstones are not yet garbage collectible: a tombstone is garbage collectible if it was written before the last repair by at least the propagation delay.
+
+The value of the propagation delay can be set via the ``propagation_delay_in_seconds`` parameter:
+
+.. code-block:: cql
+
+    CREATE TABLE ks.cf (key blob PRIMARY KEY, val blob) WITH tombstone_gc = {'mode':'repair', 'propagation_delay_in_seconds': 120};
+
+.. code-block:: cql
+
+    ALTER TABLE ks.cf WITH tombstone_gc = {'mode':'repair', 'propagation_delay_in_seconds': 120};
+
+The default value of the propagation delay is 1 hour. This parameter should only be changed if your application uses user provided timestamps and writes and deletes can arrive out-of-order by more than the default 1 hour.
+
+The following tombstone gc modes are available:

 .. list-table::
   :widths: 20 80
--- a/docs/cql/dml/select.rst
+++ b/docs/cql/dml/select.rst
@@ -281,8 +281,7 @@ For example::
      ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;


-Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key
-or columns provided in a definition of the index.
+Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key.

 For example::

--- a/docs/cql/secondary-indexes.rst
+++ b/docs/cql/secondary-indexes.rst
@@ -140,83 +140,17 @@ Vector Index :label-note:`ScyllaDB Cloud`
   `ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/stable/vector-search/>`_.

 ScyllaDB supports creating vector indexes on tables, allowing queries on the table to use those indexes for efficient
-similarity search on vector data. Vector indexes can be a global index for indexing vectors per table or a local
-index for indexing vectors per partition.
+similarity search on vector data. 

 The vector index is the only custom type index supported in ScyllaDB. It is created using
-the ``CUSTOM`` keyword and specifying the index type as ``vector_index``. It is also possible to
-add additional columns to the index for filtering the search results. The partition column
-specified in the global vector index definition must be the vector column, and any subsequent
-columns are treated as filtering columns. The local vector index requires that the partition key
-of the base table is also the partition key of the index and the vector column is the first one
-from the following columns.
-
-Example of a simple index:
+the ``CUSTOM`` keyword and specifying the index type as ``vector_index``. Example:

 .. code-block:: cql

-      CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings (embedding)
+      CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings (embedding) 
      USING 'vector_index' 
      WITH OPTIONS = {'similarity_function': 'COSINE', 'maximum_node_connections': '16'};

-The vector column (``embedding``) is indexed to enable similarity search using
-a global vector index. Additional filtering can be performed on the primary key
-columns of the base table.
-
-Example of a global vector index with additional filtering:
-
-.. code-block:: cql
-
-      CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings (embedding, category, info)
-      USING 'vector_index' 
-      WITH OPTIONS = {'similarity_function': 'COSINE', 'maximum_node_connections': '16'};
-
-The vector column (``embedding``) is indexed to enable similarity search using
-a global index. Additional columns are added for filtering the search results.
-The filtering is possible on ``category``, ``info`` and all primary key columns
-of the base table.
-
-Example of a local vector index:
-
-.. code-block:: cql
-
-      CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings ((id, created_at), embedding, category, info)
-      USING 'vector_index' 
-      WITH OPTIONS = {'similarity_function': 'COSINE', 'maximum_node_connections': '16'};
-
-The vector column (``embedding``) is indexed for similarity search (a local
-index) and additional columns are added for filtering the search results. The
-filtering is possible on ``category``, ``info`` and all primary key columns of
-the base table. The columns ``id`` and ``created_at`` must be the partition key
-of the base table.
-
-Vector indexes support additional filtering columns of native data types
-(excluding counter and duration). The indexed column itself must be a vector
-column, while the extra columns can be used to filter search results.
-
-The supported types are:
-
-* ``ascii``
-* ``bigint``
-* ``blob``
-* ``boolean``
-* ``date``
-* ``decimal``
-* ``double``
-* ``float``
-* ``inet``
-* ``int``
-* ``smallint``
-* ``text``
-* ``varchar``
-* ``time``
-* ``timestamp``
-* ``timeuuid``
-* ``tinyint``
-* ``uuid``
-* ``varint``
-
-
 The following options are supported for vector indexes. All of them are optional.

 +------------------------------+----------------------------------------------------------------------------------------------------------+---------------+
--- a/docs/features/automatic-repair.rst
+++ b/docs/features/automatic-repair.rst
@@ -3,9 +3,9 @@
 Automatic Repair
 ================

-Traditionally, launching :doc:`repairs </operating-scylla/procedures/maintenance/repair>` in a ScyllaDB cluster is left to an external process, typically done via `Scylla Manager <https://manager.docs.scylladb.com/stable/repair/index.html>`_.
+Traditionally, launching :doc:`repairs </operating-scylla/procedures/maintenance/repair>` in a ScyllaDB cluster is left to an external process, typically done via `Scylla Manager <https://manager.docs.scylladb.com/stable/repair/index.html>`_.

-Automatic repair offers built-in scheduling in ScyllaDB itself. If the time since the last repair is greater than the configured repair interval, ScyllaDB will start a repair for the :doc:`tablet table </architecture/tablets>` automatically.
+Automatic repair offers built-in scheduling in ScyllaDB itself. If the time since the last repair is greater than the configured repair interval, ScyllaDB will start a repair for the :doc:`tablet table </architecture/tablets>` automatically.
 Repairs are spread over time and among nodes and shards, to avoid load spikes or any adverse effects on user workloads.

 To enable automatic repair, add this to the configuration (``scylla.yaml``):
@@ -20,4 +20,4 @@ More featureful configuration methods will be implemented in the future.

 To disable, set ``auto_repair_enabled_default: false``.

-Automatic repair relies on :doc:`Incremental Repair </features/incremental-repair>` and as such it only works with :doc:`tablet </architecture/tablets>` tables.
+Automatic repair relies on :doc:`Incremental Repair </features/incremental-repair>` and as such it only works with :doc:`tablet </architecture/tablets>` tables.
--- a/docs/features/incremental-repair.rst
+++ b/docs/features/incremental-repair.rst
@@ -3,7 +3,7 @@
 Incremental Repair
 ==================

-ScyllaDB's standard :doc:`repair </operating-scylla/procedures/maintenance/repair>` process scans and processes all the data on a node, regardless of whether it has changed since the last repair. This operation can be resource-intensive and time-consuming. The Incremental Repair feature provides a much more efficient and lightweight alternative for maintaining data consistency.
+ScyllaDB's standard :doc:`repair </operating-scylla/procedures/maintenance/repair>` process scans and processes all the data on a node, regardless of whether it has changed since the last repair. This operation can be resource-intensive and time-consuming. The Incremental Repair feature provides a much more efficient and lightweight alternative for maintaining data consistency.

 The core idea of incremental repair is to repair only the data that has been written or changed since the last repair was run. It intelligently skips data that has already been verified, dramatically reducing the time, I/O, and CPU resources required for the repair operation.

@@ -51,7 +51,7 @@ Benefits of Incremental Repair
 *   **Reduced Resource Usage:** Consumes significantly less CPU, I/O, and network bandwidth compared to a full repair.
 *   **More Frequent Repairs:** The efficiency of incremental repair allows you to run it more frequently, ensuring a higher level of data consistency across your cluster at all times.

-Tables using Incremental Repair can schedule repairs in ScyllaDB itself, with :doc:`Automatic Repair </features/automatic-repair>`.
+Tables using Incremental Repair can schedule repairs in ScyllaDB itself, with :doc:`Automatic Repair </features/automatic-repair>`.

 Notes
 -----
--- a/docs/getting-started/install-scylla/index.rst
+++ b/docs/getting-started/install-scylla/index.rst
@@ -10,6 +10,7 @@ Install ScyllaDB |CURRENT_VERSION|
   /getting-started/install-scylla/launch-on-azure
   /getting-started/installation-common/scylla-web-installer
   /getting-started/install-scylla/install-on-linux
+   /getting-started/installation-common/install-jmx
   /getting-started/install-scylla/run-in-docker
   /getting-started/installation-common/unified-installer
   /getting-started/installation-common/air-gapped-install
@@ -23,9 +24,9 @@ Keep your versions up-to-date. The two latest versions are supported. Also, alwa
  :id: "getting-started"
  :class: my-panel

-  * :doc:`Launch ScyllaDB on AWS </getting-started/install-scylla/launch-on-aws>`
-  * :doc:`Launch ScyllaDB on GCP </getting-started/install-scylla/launch-on-gcp>`
-  * :doc:`Launch ScyllaDB on Azure </getting-started/install-scylla/launch-on-azure>`
+  * :doc:`Launch ScyllaDB |CURRENT_VERSION| on AWS </getting-started/install-scylla/launch-on-aws>`
+  * :doc:`Launch ScyllaDB |CURRENT_VERSION| on GCP </getting-started/install-scylla/launch-on-gcp>`
+  * :doc:`Launch ScyllaDB |CURRENT_VERSION| on Azure </getting-started/install-scylla/launch-on-azure>`


 .. panel-box::
@@ -34,7 +35,8 @@ Keep your versions up-to-date. The two latest versions are supported. Also, alwa
  :class: my-panel

  * :doc:`Install ScyllaDB with Web Installer (recommended) </getting-started/installation-common/scylla-web-installer>`
-  * :doc:`Install ScyllaDB Linux Packages </getting-started/install-scylla/install-on-linux>`
+  * :doc:`Install ScyllaDB |CURRENT_VERSION| Linux Packages </getting-started/install-scylla/install-on-linux>`
+  * :doc:`Install scylla-jmx Package </getting-started/installation-common/install-jmx>`
  * :doc:`Install ScyllaDB Without root Privileges </getting-started/installation-common/unified-installer>`
  * :doc:`Air-gapped Server Installation </getting-started/installation-common/air-gapped-install>`
  * :doc:`ScyllaDB Developer Mode </getting-started/installation-common/dev-mod>`
--- a/docs/getting-started/install-scylla/install-on-linux.rst
+++ b/docs/getting-started/install-scylla/install-on-linux.rst
@@ -94,6 +94,16 @@ Install ScyllaDB

               apt-get install scylla{,-server,-kernel-conf,-node-exporter,-conf,-python3,-cqlsh}=2025.3.1-0.20250907.2bbf3cf669bb-1

+
+        #. (Ubuntu only) Set Java 11.
+
+            .. code-block:: console
+    
+               sudo apt-get update
+               sudo apt-get install -y openjdk-11-jre-headless
+               sudo update-java-alternatives --jre-headless -s java-1.11.0-openjdk-amd64
+
+
   .. group-tab:: Centos/RHEL

        #. Install the EPEL repository.
@@ -147,6 +157,14 @@ Install ScyllaDB
    
               sudo yum install scylla-5.2.3

+(Optional) Install scylla-jmx
+-------------------------------
+
+    scylla-jmx is an optional package and is not installed by default.
+    If you need JMX server, see :doc:`Install scylla-jmx Package </getting-started/installation-common/install-jmx>`.
+
+
+
 .. include:: /getting-started/_common/setup-after-install.rst

 Next Steps
--- a/docs/getting-started/installation-common/install-jmx.rst
+++ b/docs/getting-started/installation-common/install-jmx.rst
@@ -0,0 +1,78 @@
+
+======================================
+Install scylla-jmx Package
+======================================
+
+scylla-jmx is an optional package and is not installed by default.
+If you need the JMX server, you can still install it from the scylla-jmx GitHub page.
+
+.. tabs::
+
+   .. group-tab:: Debian/Ubuntu
+        #. Download .deb package from scylla-jmx page.
+
+            Go to https://github.com/scylladb/scylla-jmx, select the latest
+            release under "Releases", and download the file ending with ".deb".
+
+        #. (Optional) Transfer the downloaded package to the install node.
+
+            If the pc from which you downloaded the package is different from
+            the node where you install scylladb, you will need to transfer
+            the files to the node.
+
+        #. Install scylla-jmx package.
+
+            .. code-block:: console
+    
+               sudo apt install -y ./scylla-jmx_<version>_all.deb
+
+
+   .. group-tab:: Centos/RHEL
+
+        #. Download .rpm package from scylla-jmx page.
+
+            Go to https://github.com/scylladb/scylla-jmx, select the latest
+            release under "Releases", and download the file ending with ".rpm".
+
+        #. (Optional) Transfer the downloaded package to the install node.
+
+            If the pc from which you downloaded the package is different from
+            the node where you install scylladb, you will need to transfer
+            the files to the node.
+
+        #. Install scylla-jmx package.
+
+            .. code-block:: console
+    
+               sudo yum install -y ./scylla-jmx-<version>.noarch.rpm
+
+
+   .. group-tab:: Install without root privileges
+
+        #. Download .tar.gz package from scylla-jmx page.
+
+            Go to https://github.com/scylladb/scylla-jmx, select the latest
+            release under "Releases", and download the file ending with ".tar.gz".
+
+        #. (Optional) Transfer the downloaded package to the install node.
+
+            If the pc from which you downloaded the package is different from
+            the node where you install scylladb, you will need to transfer
+            the files to the node.
+
+        #. Install scylla-jmx package.
+
+            .. code:: console
+    
+                tar xpf scylla-jmx-<version>.noarch.tar.gz
+                cd scylla-jmx
+                ./install.sh --nonroot
+
+Next Steps
+-----------
+
+* :doc:`Configure ScyllaDB </getting-started/system-configuration>`
+* Manage your clusters with `ScyllaDB Manager <https://manager.docs.scylladb.com/>`_
+* Monitor your cluster and data with `ScyllaDB Monitoring <https://monitoring.docs.scylladb.com/>`_
+* Get familiar with ScyllaDB’s :doc:`command line reference guide </operating-scylla/nodetool>`.
+* Learn about ScyllaDB at `ScyllaDB University <https://university.scylladb.com/>`_
--- a/docs/getting-started/installation-common/unified-installer.rst
+++ b/docs/getting-started/installation-common/unified-installer.rst
@@ -14,35 +14,44 @@ Prerequisites
 Ensure your platform is supported by the ScyllaDB version you want to install. 
 See :doc:`OS Support </getting-started/os-support>` for information about supported Linux distributions and versions.

+Note that if you're on CentOS 7, only root offline installation is supported.
+
 Download and Install
 -----------------------

 #. Download the latest tar.gz file for ScyllaDB version (x86 or ARM) from ``https://downloads.scylladb.com/downloads/scylla/relocatable/scylladb-<version>/``.

-   **Example** for version 2025.1:
-   
-   - Go to https://downloads.scylladb.com/downloads/scylla/relocatable/scylladb-2025.1/
-   - Download the ``scylla-unified`` file for the patch version you want to
-     install. For example, to install 2025.1.9 (x86), download
-     ``scylla-unified-2025.1.9-0.20251010.6c539463bbda.x86_64.tar.gz``.
+   Example for version 6.1: https://downloads.scylladb.com/downloads/scylla/relocatable/scylladb-6.1/

 #. Uncompress the downloaded package.

-   **Example** for version 2025.1.9 (x86) (downloaded in the previous step):
+   The following example shows the package for ScyllaDB 6.1.1 (x86):

-   .. code::
+   .. code:: console

-    tar xvfz scylla-unified-2025.1.9-0.20251010.6c539463bbda.x86_64.tar.gz
+    tar xvfz scylla-unified-6.1.1-0.20240814.8d90b817660a.x86_64.tar.gz

-#. (Root offline installation only) For root offline installation on Debian-like
-   systems, two additional packages, ``xfsprogs`` and ``mdadm``, should be
-   installed to be used in RAID setup.
+#. Install OpenJDK 8 or 11.
+
+   The following example shows Java installation on a CentOS-like system:
+
+   .. code:: console
+    
+    sudo yum install -y java-11-openjdk-headless
+
+   For root offline installation on Debian-like systems, two additional packages, ``xfsprogs`` 
+   and ``mdadm``, should be installed to be used in RAID setup.

 #. Install ScyllaDB as a user with non-root privileges:

   .. code:: console

-    ./install.sh --nonroot
+    ./install.sh --nonroot --python3 ~/scylladb/python3/bin/python3
+
+#. (Optional) Install scylla-jmx
+
+    scylla-jmx is an optional package and is not installed by default.
+    If you need JMX server, see :doc:`Install scylla-jmx Package </getting-started/installation-common/install-jmx>`.

 Configure and Run ScyllaDB
 ----------------------------
@@ -72,14 +81,19 @@ Run nodetool:

 .. code:: console

-    ~/scylladb/bin/nodetool nodetool status
+    ~/scylladb/share/cassandra/bin/nodetool status

 Run cqlsh:

 .. code:: console

-    ~/scylladb/bin/cqlsh 
+    ~/scylladb/share/cassandra/bin/cqlsh 

+Run cassandra-stress:
+
+.. code:: console
+
+    ~/scylladb/share/cassandra/bin/cassandra-stress write

 .. note::

@@ -110,7 +124,7 @@ Nonroot install

    ./install.sh --upgrade --nonroot

-.. note:: The installation script does not upgrade scylla-tools. You will have to upgrade them separately. 
+.. note:: The installation script does not upgrade scylla-jmx and scylla-tools. You will have to upgrade them separately. 

 Uninstall
 ===========
@@ -140,4 +154,4 @@ Next Steps
 * Manage your clusters with `ScyllaDB Manager <https://manager.docs.scylladb.com/>`_
 * Monitor your cluster and data with `ScyllaDB Monitoring <https://monitoring.docs.scylladb.com/>`_
 * Get familiar with ScyllaDB’s :doc:`command line reference guide </operating-scylla/nodetool>`.
-* Learn about ScyllaDB at `ScyllaDB University <https://university.scylladb.com/>`_
+* Learn about ScyllaDB at `ScyllaDB University <https://university.scylladb.com/>`_
--- a/docs/operating-scylla/admin-tools/scylla-sstable.rst
+++ b/docs/operating-scylla/admin-tools/scylla-sstable.rst
@@ -601,11 +601,7 @@ Scrub has several modes:
 * **segregate** - Fixes partition/row/mutation-fragment out-of-order errors by segregating the output into as many SStables as required so that the content of each output SStable is properly ordered.
 * **validate** - Validates the content of the SStable, reporting any corruptions found. Writes no output SStables. In this mode, scrub has the same outcome as the `validate operation <scylla-sstable-validate-operation_>`_ - and the validate operation is recommended over scrub.

-Output SStables are written to the directory specified via ``--output-directory``. They will be written with the ``BIG`` format and the highest supported SStable format, with generations chosen by scylla-sstable. Generations are chosen such
-that they are unique among the SStables written by the current scrub.
-
-The output directory must be empty; otherwise, scylla-sstable will abort scrub. You can allow writing to a non-empty directory by setting the ``--unsafe-accept-nonempty-output-dir`` command line flag.
-Note that scrub will be aborted if an SStable cannot be written because its generation clashes with a pre-existing SStable in the output directory.
+Output SStables are written to the directory specified via ``--output-dir``. They will be written with the ``BIG`` format and the highest supported SStable format, with random generation.

 validate-checksums
 ^^^^^^^^^^^^^^^^^^
@@ -870,7 +866,7 @@ The SSTable version to be used can be overridden with the ``--version`` flag, al
 SSTables which are already on the designated version are skipped. To force rewriting *all* SSTables, use the ``--all`` flag. 

 Output SSTables are written to the path provided by the ``--output-dir`` flag, or to the current directory if not specified.
-This directory is expected to exist and be empty. If not empty the tool will refuse to run. This can be overridden with the ``--unsafe-accept-nonempty-output-dir`` flag.
+This directory is expected to exist.

 It is strongly recommended to use the system schema tables as the schema source for this command, see the :ref:`schema options <scylla-sstable-schema>` for more details.
 A schema which is good enough to read the SSTable and dump its content, may not be good enough to write its content back verbatim.
@@ -882,6 +878,25 @@ But even an altered schema which changed only the table options can lead to data

 The mapping of input SSTables to output SSTables is printed to ``stdout``.

+filter
+^^^^^^
+
+Filter the SSTable(s), including/excluding specified partitions.
+
+Similar to ``scylla sstable dump-data --partition|--partitions-file``, with some notable differences:
+
+* Instead of dumping the content to stdout, the filtered content is written back to SSTable(s) on disk.
+* Also supports negative filters (keep all partitions except those specified).
+
+The partition list can be provided either via the ``--partition`` command line argument, or via a file path passed to the ``--partitions-file`` argument. The file should contain one partition key per line.
+Partition keys should be provided in the hex format, as produced by `scylla types serialize </operating-scylla/admin-tools/scylla-types/>`_.
+
+With ``--include``, only the specified partitions are kept from the input SSTable(s). With ``--exclude``, the specified partitions are discarded and won't be written to the output SSTable(s).
+It is possible that certain input SSTable(s) won't have any content left after the filtering. These input SSTable(s) will not have a matching output SSTable.
+
+By default, each input sstable is filtered individually. Use ``--merge`` to filter the combined content of all input sstables, producing a single output SSTable.
+
+Output sstables use the latest supported sstable format (can be changed with ``--sstable-version``).

 Examples
 --------
--- a/docs/operating-scylla/nodetool-commands/rebuild.rst
+++ b/docs/operating-scylla/nodetool-commands/rebuild.rst
@@ -25,8 +25,4 @@ For Example:

   nodetool rebuild <source-dc-name>

-``nodetool rebuild`` command works only for vnode keyspaces. For tablet keyspaces, use ``nodetool cluster repair`` instead.
-
-See :doc:`Data Distribution with Tablets </architecture/tablets/>`.
-
 .. include:: nodetool-index.rst
--- a/docs/operating-scylla/procedures/cluster-management/add-dc-to-existing-dc.rst
+++ b/docs/operating-scylla/procedures/cluster-management/add-dc-to-existing-dc.rst
@@ -155,6 +155,7 @@ Add New DC
      UN   54.235.9.159    109.75 KB       256     ?               39798227-9f6f-4868-8193-08570856c09a    RACK1
      UN   54.146.228.25   128.33 KB       256     ?               7a4957a1-9590-4434-9746-9c8a6f796a0c    RACK1

+.. TODO possibly provide additional information WRT how ALTER works with tablets

 #. When all nodes are up and running ``ALTER`` the following Keyspaces in the new nodes:

@@ -170,68 +171,26 @@ Add New DC

      DESCRIBE KEYSPACE mykeyspace;

-      CREATE KEYSPACE mykeyspace WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3};
+      CREATE KEYSPACE mykeyspace WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3};

   ALTER Command

   .. code-block:: cql

-      ALTER KEYSPACE mykeyspace WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
-      ALTER KEYSPACE system_distributed WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
-      ALTER KEYSPACE system_traces WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
+      ALTER KEYSPACE mykeyspace WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
+      ALTER KEYSPACE system_distributed WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
+      ALTER KEYSPACE system_traces WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};

   After

   .. code-block:: cql

      DESCRIBE KEYSPACE mykeyspace;
-      CREATE KEYSPACE mykeyspace WITH REPLICATION = {'class': 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
-      CREATE KEYSPACE system_distributed WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
-      CREATE KEYSPACE system_traces WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
+      CREATE KEYSPACE mykeyspace WITH REPLICATION = {'class': 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
+      CREATE KEYSPACE system_distributed WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
+      CREATE KEYSPACE system_traces WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};

-   For tablet keyspaces, update the replication factor one by one:
-
-   .. code-block:: cql
-
-      DESCRIBE KEYSPACE mykeyspace2;
-
-      CREATE KEYSPACE mykeyspace2 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3} AND tablets = { 'enabled': true };
-
-   .. code-block:: cql
-
-      ALTER KEYSPACE mykeyspace2 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 1} AND tablets = { 'enabled': true };
-      ALTER KEYSPACE mykeyspace2 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 2} AND tablets = { 'enabled': true };
-      ALTER KEYSPACE mykeyspace2 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3} AND tablets = { 'enabled': true };
-
-   .. note::
-         If ``rf_rack_valid_keyspaces`` option is set, a tablet keyspace needs to use rack list replication factor, so that a new DC (rack) can be added. See :ref:`the conversion procedure <conversion-to-rack-list-rf>`. In this case, to add a datacenter:
-
-         Before
-
-         .. code-block:: cql
-
-            DESCRIBE KEYSPACE mykeyspace3;
-
-            CREATE KEYSPACE mykeyspace3 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>']} AND tablets = { 'enabled': true };
-
-         Add all the nodes to the new datacenter and then alter the keyspace one by one:
-
-         .. code-block:: cql
-
-            ALTER KEYSPACE mykeyspace3 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>']} AND tablets = { 'enabled': true };
-            ALTER KEYSPACE mykeyspace3 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>']} AND tablets = { 'enabled': true };
-            ALTER KEYSPACE mykeyspace3 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>', '<new_rack3>']} AND tablets = { 'enabled': true };
-
-         After
-
-         .. code-block:: cql
-
-            DESCRIBE KEYSPACE mykeyspace3;
-            CREATE KEYSPACE mykeyspace3 WITH REPLICATION = {'class': 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>', '<new_rack3>']} AND tablets = { 'enabled': true };
-
-         Consider :ref:`upgrading rf_rack_valid_keyspaces option to enforce_rack_list option <keyspace-rf-rack-valid-to-enforce-rack-list>` to ensure all tablet keyspaces use rack lists.
-
-#. If any vnode keyspace was altered, run ``nodetool rebuild`` on each node in the new datacenter, specifying the existing datacenter name in the rebuild command.
+#. Run ``nodetool rebuild`` on each node in the new datacenter, specifying the existing datacenter name in the rebuild command.

   For example:

@@ -239,7 +198,7 @@ Add New DC

   The rebuild ensures that the new nodes that were just added to the cluster will recognize the existing datacenters in the cluster.

-#. If any vnode keyspace was altered, run a full cluster repair, using :doc:`nodetool repair -pr </operating-scylla/nodetool-commands/repair>` on each node, or using `ScyllaDB Manager ad-hoc repair <https://manager.docs.scylladb.com/stable/repair>`_
+#. Run a full cluster repair, using :doc:`nodetool repair -pr </operating-scylla/nodetool-commands/repair>` on each node, or using `ScyllaDB Manager ad-hoc repair <https://manager.docs.scylladb.com/stable/repair>`_

 #. If you are using ScyllaDB Monitoring, update the `monitoring stack <https://monitoring.docs.scylladb.com/stable/install/monitoring_stack.html#configure-scylla-nodes-from-files>`_ to monitor it. If you are using ScyllaDB Manager, make sure you install the `Manager Agent <https://manager.docs.scylladb.com/stable/install-scylla-manager-agent.html>`_ and Manager can access the new DC.

--- a/docs/operating-scylla/procedures/cluster-management/decommissioning-data-center.rst
+++ b/docs/operating-scylla/procedures/cluster-management/decommissioning-data-center.rst
@@ -40,14 +40,12 @@ Prerequisites
 Procedure
 ---------

-#. If there are vnode keyspaces in this DC, run the ``nodetool repair -pr`` command on each node in the data-center that is going to be decommissioned. This will verify that all the data is in sync between the decommissioned data-center and the other data-centers in the cluster.
+#. Run the ``nodetool repair -pr`` command on each node in the data-center that is going to be decommissioned. This will verify that all the data is in sync between the decommissioned data-center and the other data-centers in the cluster.

   For example:

   If the ASIA-DC cluster is to be removed, then, run the ``nodetool repair -pr`` command on all the nodes in the ASIA-DC

-#. If there are tablet keyspaces in this DC, run the ``nodetool cluster repair`` on an arbitrary node. The reason for running repair is to ensure that any updates stored only on the about-to-be-decommissioned replicas are propagated to the other replicas, before the replicas on the decommissioned datacenter are dropped.
-
 #. ALTER every cluster KEYSPACE, so that the keyspaces will no longer replicate data to the decommissioned data-center.

   For example:
@@ -75,33 +73,6 @@ Procedure

      cqlsh> ALTER KEYSPACE nba WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 0, 'EUROPE-DC' : 3};

-   For tablet keyspaces, update the replication factor one by one:
-
-   .. code-block:: shell
-
-      cqlsh> DESCRIBE nba2
-      cqlsh> CREATE KEYSPACE nba2 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 2, 'EUROPE-DC' : 3} AND tablets = { 'enabled': true };
-
-   .. code-block:: shell
-
-      cqlsh> ALTER KEYSPACE nba2 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 1, 'EUROPE-DC' : 3} AND tablets = { 'enabled': true };
-      cqlsh> ALTER KEYSPACE nba2 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 0, 'EUROPE-DC' : 3} AND tablets = { 'enabled': true };
-
-   .. note::
-         If ``rf_rack_valid_keyspaces`` option is set, a tablet keyspace needs to use rack list replication factor, so that the DC can be removed. See :ref:`the conversion procedure <conversion-to-rack-list-rf>`. In this case, to remove a datacenter:
-
-         .. code-block:: shell
-
-            cqlsh> DESCRIBE nba3
-            cqlsh> CREATE KEYSPACE nba3 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : ['RAC4', 'RAC5'], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets = { 'enabled': true };
-
-         .. code-block:: shell
-
-            cqlsh> ALTER KEYSPACE nba3 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : ['RAC4'], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets = { 'enabled': true };
-            cqlsh> ALTER KEYSPACE nba3 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : [], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets = { 'enabled': true };
-
-         Consider :ref:`upgrading rf_rack_valid_keyspaces option to enforce_rack_list option <keyspace-rf-rack-valid-to-enforce-rack-list>` to ensure all tablet keyspaces use rack lists.
-
 #. Run :doc:`nodetool decommission </operating-scylla/nodetool-commands/decommission>` on every node in the data center that is to be removed.
   Refer to :doc:`Remove a Node from a ScyllaDB Cluster - Down Scale </operating-scylla/procedures/cluster-management/remove-node>` for further information.

--- a/docs/operating-scylla/procedures/maintenance/repair.rst
+++ b/docs/operating-scylla/procedures/maintenance/repair.rst
@@ -61,9 +61,9 @@ See also
 Incremental Repair
 ------------------

-Built on top of :ref:`Row-level Repair <row-level-repair>` and :doc:`Tablets </architecture/tablets>`, Incremental Repair enables frequent and quick repairs. For more details, see :doc:`Incremental Repair </features/incremental-repair>`.
+Built on top of `Row-level Repair <row-level-repair_>`_ and `Tablets </architecture/tablets>`_, Incremental Repair enables frequent and quick repairs. For more details, see `Incremental Repair </features/incremental-repair>`_.

 Automatic Repair
 ----------------

-Built on top of :doc:`Incremental Repair </features/incremental-repair>`, :doc:`Automatic Repair </features/automatic-repair>` offers repair scheduling and execution directly in ScyllaDB, without external processes.
+Built on top of `Incremental Repair </features/incremental-repair>`_, `Automatic Repair </features/automatic-repair>`_ offers repair scheduling and execution directly in ScyllaDB, without external processes.
--- a/docs/upgrade/about-upgrade.rst
+++ b/docs/upgrade/about-upgrade.rst
@@ -11,13 +11,9 @@ ScyllaDB. This means that:

 * You should follow the upgrade policy:

-   * Starting with version **2025.4**, upgrades can **skip minor versions** if:
-
-       * They remain within the same major version (for example, upgrading
-         directly from *2025.1 → 2025.4* is supported).
-       * You upgrade to the next major version (for example, upgrading
-         directly from *2025.3 → 2026.1* is supported).
-
+   * Starting with version **2025.4**, upgrades can skip minor versions as long
+     as they remain within the same major version (for example, upgrading directly
+     from 2025.1 → 2025.4 is supported).
   * For versions **prior to 2025.4**, upgrades must be performed consecutively—
     each successive X.Y version must be installed in order, **without skipping
     any major or minor version** (for example, upgrading directly from 2025.1 → 2025.3
--- a/docs/upgrade/upgrade-guides/index.rst
+++ b/docs/upgrade/upgrade-guides/index.rst
@@ -4,7 +4,8 @@ Upgrade ScyllaDB

 .. toctree::
   
-   ScyllaDB 2025.x to ScyllaDB 2026.1 <upgrade-guide-from-2025.x-to-2026.1/index>
+   ScyllaDB 2025.x to ScyllaDB 2025.4 <upgrade-guide-from-2025.x-to-2025.4/index>
+   ScyllaDB 2025.4 Patch Upgrades <upgrade-guide-from-2025.4.x-to-2025.4.y>
   ScyllaDB Image <ami-upgrade>


--- a/docs/upgrade/upgrade-guides/upgrade-guide-from-2025.4.x-to-2025.4.y.rst
+++ b/docs/upgrade/upgrade-guides/upgrade-guide-from-2025.4.x-to-2025.4.y.rst
@@ -0,0 +1,266 @@
+.. |SCYLLA_NAME| replace:: ScyllaDB
+
+.. |SRC_VERSION| replace:: 2025.4.x
+.. |NEW_VERSION| replace:: 2025.4.y
+
+==========================================================================
+Upgrade - |SCYLLA_NAME| |SRC_VERSION| to |NEW_VERSION| (Patch Upgrades)
+==========================================================================
+
+This document describes a step-by-step procedure for upgrading from
+|SCYLLA_NAME| |SRC_VERSION|  to |SCYLLA_NAME| |NEW_VERSION| (where "y" is
+the latest available version), and rolling back to version |SRC_VERSION|
+if necessary.
+
+This guide covers upgrading ScyllaDB on Red Hat Enterprise Linux (RHEL),
+CentOS, Debian, and Ubuntu.
+See :doc:`OS Support by Platform and Version </getting-started/os-support>`
+for information about supported versions.
+
+It also applies to the ScyllaDB official image on EC2, GCP, or Azure.
+
+Upgrade Procedure
+=================
+
+.. note::
+   Apply the following procedure **serially** on each node. Do not move to the next
+   node before validating that the node is up and running the new version.
+
+A ScyllaDB upgrade is a rolling procedure that does **not** require a full cluster
+shutdown. For each of the nodes in the cluster, you will:
+
+#. Drain the node and back up the data.
+#. Back up the configuration file.
+#. Stop ScyllaDB.
+#. Download and install new ScyllaDB packages.
+#. Start ScyllaDB.
+#. Validate that the upgrade was successful.
+
+**Before** upgrading, check which version you are running now using
+``scylla --version``. Note the current version in case you want to roll back
+the upgrade.
+
+**During** the rolling upgrade it is highly recommended:
+
+* Not to use new |NEW_VERSION| features.
+* Not to run administration functions, such as repair, refresh, rebuild, or
+  adding or removing nodes. See
+  `sctool <https://manager.docs.scylladb.com/stable/sctool/>`_ for suspending
+  ScyllaDB Manager's scheduled or running repairs.
+* Not to apply schema changes.
+
+Upgrade Steps
+=============
+
+Back up the data
+------------------------------
+
+Back up all the data to an external device. We recommend using
+`ScyllaDB Manager <https://manager.docs.scylladb.com/stable/backup/index.html>`_
+to create backups.
+
+Alternatively, you can use the ``nodetool snapshot`` command.
+For **each** node in the cluster, run the following:
+
+.. code:: sh
+
+   nodetool drain
+   nodetool snapshot
+
+Take note of the directory name that nodetool gives you, and copy all
+the directories with this name under ``/var/lib/scylla`` to a backup device.
+
+When the upgrade is completed on all nodes, remove the snapshot with the 
+``nodetool clearsnapshot -t <snapshot>`` command to prevent running out of
+space.
+
+Back up the configuration file
+------------------------------
+
+Back up the ``scylla.yaml`` configuration file and the ScyllaDB packages
+in case you need to roll back the upgrade.
+
+.. tabs::
+
+   .. group-tab:: Debian/Ubuntu
+
+      .. code:: sh
+
+         sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
+         sudo cp /etc/apt/sources.list.d/scylla.list ~/scylla.list-backup
+
+   .. group-tab:: RHEL/CentOS
+
+      .. code:: sh 
+         
+         sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
+         sudo cp /etc/yum.repos.d/scylla.repo ~/scylla.repo-backup
+
+Gracefully stop the node
+------------------------
+
+.. code:: sh
+
+   sudo service scylla-server stop
+
+Download and install the new release
+------------------------------------
+
+You don’t need to update the ScyllaDB DEB or RPM repo when you upgrade to
+a patch release.
+
+.. tabs::
+
+   .. group-tab:: Debian/Ubuntu
+
+        To install a patch version on Debian or Ubuntu, run:
+
+        .. code:: sh
+            
+            sudo apt-get clean all
+            sudo apt-get update
+            sudo apt-get dist-upgrade scylla
+
+        Answer ‘y’ to the first two questions.
+
+   .. group-tab:: RHEL/CentOS
+
+        To install a patch version on RHEL or CentOS, run:
+
+        .. code:: sh
+            
+            sudo yum clean all
+            sudo yum update scylla\* -y
+        
+   .. group-tab:: EC2/GCP/Azure Ubuntu Image
+
+        If you're using the ScyllaDB official image (recommended), see 
+        the **Debian/Ubuntu** tab for upgrade instructions.
+
+        If you're using your own image and have installed ScyllaDB packages for 
+        Ubuntu or Debian, you need to apply an extended upgrade procedure:
+
+        #. Install the new ScyllaDB version with the additional
+           ``scylla-machine-image`` package:
+
+            .. code-block:: console
+
+               sudo apt-get clean all
+               sudo apt-get update
+               sudo apt-get dist-upgrade scylla
+               sudo apt-get dist-upgrade scylla-machine-image
+        #. Run ``scylla_setup`` without running ``io_setup``.
+        #. Run ``sudo /opt/scylladb/scylla-machine-image/scylla_cloud_io_setup``.
+
+Start the node
+--------------
+
+.. code:: sh
+
+   sudo service scylla-server start
+
+Validate
+--------
+#. Check cluster status with ``nodetool status`` and make sure **all** nodes,
+   including the one you just upgraded, are in UN status.
+#. Use ``curl -X GET "http://localhost:10000/storage_service/scylla_release_version"``
+   to check the ScyllaDB version.
+#. Use ``journalctl _COMM=scylla`` to check there are no new errors in the log.
+#. Check again after 2 minutes to validate that no new issues are introduced.
+
+Once you are sure the node upgrade is successful, move to the next node in
+the cluster.
+
+Rollback Procedure
+==================
+
+The following procedure describes a rollback from ScyllaDB release
+|NEW_VERSION| to |SRC_VERSION|. Apply this procedure if an upgrade from
+|SRC_VERSION| to |NEW_VERSION| failed before completing on all nodes. 
+
+* Use this procedure only on nodes you upgraded to |NEW_VERSION|.
+* Execute the following commands one node at a time, moving to the next node only
+  after the rollback procedure is completed successfully.
+
+ScyllaDB rollback is a rolling procedure that does **not** require a full
+cluster shutdown. For each of the nodes to roll back to |SRC_VERSION|, you will:
+
+#. Drain the node and stop ScyllaDB.
+#. Downgrade to the previous release.
+#. Restore the configuration file.
+#. Restart ScyllaDB.
+#. Validate the rollback success.
+
+Rollback Steps
+==============
+
+Gracefully shut down ScyllaDB
+-----------------------------
+
+.. code:: sh
+    
+   nodetool drain
+   sudo service scylla-server stop
+
+Downgrade to the previous release
+----------------------------------
+
+.. tabs::
+
+   .. group-tab:: Debian/Ubuntu
+
+        To downgrade to |SRC_VERSION| on Debian or Ubuntu, run:
+
+        .. code-block:: console
+            :substitutions:
+
+            sudo apt-get install scylla=|SRC_VERSION|\* scylla-server=|SRC_VERSION|\* scylla-tools=|SRC_VERSION|\* scylla-tools-core=|SRC_VERSION|\* scylla-kernel-conf=|SRC_VERSION|\* scylla-conf=|SRC_VERSION|\*
+        
+        Answer ‘y’ to the first two questions.
+
+   .. group-tab:: RHEL/CentOS
+
+        To downgrade to |SRC_VERSION| on RHEL or CentOS, run:
+
+        .. code-block:: console
+            :substitutions:
+
+            sudo yum downgrade scylla\*-|SRC_VERSION|-\* -y
+
+   .. group-tab:: EC2/GCP/Azure Ubuntu Image
+
+        If you’re using the ScyllaDB official image (recommended), see
+        the **Debian/Ubuntu** tab for downgrade instructions.
+
+        If you’re using your own image and have installed ScyllaDB packages for
+        Ubuntu or Debian, you need to additionally downgrade
+        the ``scylla-machine-image`` package.
+
+        .. code-block:: console
+            :substitutions:
+
+            sudo apt-get install scylla=|SRC_VERSION|\* scylla-server=|SRC_VERSION|\* scylla-tools=|SRC_VERSION|\* scylla-tools-core=|SRC_VERSION|\* scylla-kernel-conf=|SRC_VERSION|\* scylla-conf=|SRC_VERSION|\*
+            sudo apt-get install scylla-machine-image=|SRC_VERSION|\*
+        
+        Answer ‘y’ to the first two questions.
+
+
+Restore the configuration file
+------------------------------
+
+.. code:: sh
+   
+   sudo rm -rf /etc/scylla/scylla.yaml
+   sudo cp -a /etc/scylla/scylla.yaml.backup /etc/scylla/scylla.yaml
+
+Start the node
+--------------
+
+.. code:: sh
+
+   sudo service scylla-server start
+
+Validate
+--------
+Follow the validation steps in the upgrade procedure above. Once you are sure the node
+rollback is successful, move to the next node in the cluster.
--- a/docs/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2025.4/index.rst
+++ b/docs/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2025.4/index.rst
@@ -0,0 +1,13 @@
+==========================================================
+Upgrade - ScyllaDB 2025.x to ScyllaDB 2025.4
+==========================================================
+
+.. toctree::
+   :maxdepth: 2
+   :hidden:
+
+   Upgrade ScyllaDB <upgrade-guide-from-2025.x-to-2025.4>
+   Metrics Update <metric-update-2025.x-to-2025.4>
+
+* :doc:`Upgrade from ScyllaDB 2025.x to ScyllaDB 2025.4 <upgrade-guide-from-2025.x-to-2025.4>`
+* :doc:`Metrics Update Between 2025.x and 2025.4 <metric-update-2025.x-to-2025.4>`
--- a/docs/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2025.4/metric-update-2025.x-to-2025.4.rst
+++ b/docs/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2025.4/metric-update-2025.x-to-2025.4.rst
@@ -0,0 +1,68 @@
+.. |SRC_VERSION| replace:: 2025.x
+.. |NEW_VERSION| replace:: 2025.4
+.. |PRECEDING_VERSION| replace:: 2025.3
+
+================================================================
+Metrics Update Between |SRC_VERSION| and |NEW_VERSION|
+================================================================
+
+.. toctree::
+   :maxdepth: 2
+   :hidden:
+
+ScyllaDB |NEW_VERSION| Dashboards are available as part of the latest |mon_root|.
+
+
+New Metrics in |NEW_VERSION|
+--------------------------------------
+
+The following metrics are new in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.
+
+.. list-table::
+   :widths: 25 150
+   :header-rows: 1
+
+   * - Metric
+     - Description
+   * - scylla_database_total_view_updates_due_to_replica_count_mismatch	
+     - The total number of view updates for which there were more view replicas
+       than base replicas and we had to generate an extra view update because
+       the additional view replica wouldn't get paired with any base replica.
+       It should only increase during the Replication Factor (RF) change. It
+       should stop increasing shortly after finishing the RF change.
+   * - scylla_database_total_writes_rejected_due_to_out_of_space_prevention
+     - Counts write operations that were rejected because writes to user
+       tables were disabled.
+   * - scylla_index_query_latencies
+     - Index query latencies.
+   * - scylla_reactor_aio_retries
+     - The total number of IOCB-s re-submitted via thread-pool.
+   * - scylla_reactor_io_threaded_fallbacks
+     - The total number of io-threaded-fallbacks operations.
+   * - scylla_repair_inc_sst_read_bytes
+     - The total number of bytes read from SStables for incremental repair
+       on this shard.
+   * - scylla_repair_inc_sst_skipped_bytes
+     - The total number of bytes skipped from SStables for incremental repair
+       on this shard.
+   * - scylla_repair_tablet_time_ms
+     - The time spent on tablet repair on this shard (in milliseconds).
+   * - scylla_s3_downloads_blocked_on_memory
+     - Counts the number of times the S3 client downloads were delayed due to
+       insufficient memory availability.
+   * - scylla_s3_memory_usage
+     - The total number of bytes consumed by the S3 client.
+   * - scylla_s3_total_read_prefetch_bytes
+     - The total number of bytes requested from object.
+   * - scylla_storage_proxy_replica_fenced_out_requests
+     - The number of requests that resulted in a stale_topology_exception.
+   * - scylla_vector_store_dns_refreshes	
+     - The number of DNS refreshes.
+
+New and Updated Metrics in Previous 2025.x Releases
+-------------------------------------------------------
+
+* `Metrics Update Between 2025.2 and 2025.3 <https://docs.scylladb.com/manual/branch-2025.3/upgrade/upgrade-guides/upgrade-guide-from-2025.2-to-2025.3/metric-update-2025.2-to-2025.3.html>`_
+* `Metrics Update Between 2025.1 and 2025.2 <https://docs.scylladb.com/manual/branch-2025.2/upgrade/upgrade-guides/upgrade-guide-from-2025.1-to-2025.2/metric-update-2025.1-to-2025.2.html>`_
+
+
--- a/docs/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2025.4/upgrade-guide-from-2025.x-to-2025.4.rst
+++ b/docs/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2025.4/upgrade-guide-from-2025.x-to-2025.4.rst
@@ -1,13 +1,13 @@
 .. |SCYLLA_NAME| replace:: ScyllaDB

 .. |SRC_VERSION| replace:: 2025.x
-.. |NEW_VERSION| replace:: 2026.1
+.. |NEW_VERSION| replace:: 2025.4

 .. |ROLLBACK| replace:: rollback
 .. _ROLLBACK: ./#rollback-procedure

-.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - ScyllaDB 2025.x to 2026.1
-.. _SCYLLA_METRICS: ../metric-update-2025.x-to-2026.1
+.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - ScyllaDB 2025.x to 2025.4
+.. _SCYLLA_METRICS: ../metric-update-2025.x-to-2025.4

 =======================================================================================
 Upgrade from |SCYLLA_NAME| |SRC_VERSION| to |SCYLLA_NAME| |NEW_VERSION|
@@ -17,12 +17,10 @@ This document describes a step-by-step procedure for upgrading from |SCYLLA_NAME
 to |SCYLLA_NAME| |NEW_VERSION| and rollback to version |SRC_VERSION| if necessary.

 This guide covers upgrading ScyllaDB on Red Hat Enterprise Linux (RHEL), CentOS, Debian, 
-and Ubuntu.
-See `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_ 
-for information about supported versions. It also applies when using
-the ScyllaDB official image on EC2, GCP, or Azure.
+and Ubuntu. See :doc:`OS Support by Platform and Version </getting-started/os-support>` 
+for information about supported versions.

-See :doc:`About Upgrade </upgrade/about-upgrade/>` for the ScyllaDB upgrade policy.
+It also applies when using the ScyllaDB official image on EC2, GCP, or Azure.

 Before You Upgrade ScyllaDB
 ==============================
@@ -151,9 +149,8 @@ You should take note of the current version in case you want to |ROLLBACK|_ the
        #. Update the ScyllaDB deb repo to |NEW_VERSION|.

            .. code-block:: console
-               :substitutions:

-               sudo wget -O /etc/apt/sources.list.d/scylla.list https://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|
+               sudo wget -O /etc/apt/sources.list.d/scylla.list https://downloads.scylladb.com/deb/debian/scylla-2025.4.list

        #. Install the new ScyllaDB version:

@@ -170,9 +167,8 @@ You should take note of the current version in case you want to |ROLLBACK|_ the
        #. Update the ScyllaDB rpm repo to |NEW_VERSION|.

            .. code-block:: console
-               :substitutions:

-               sudo curl -o /etc/yum.repos.d/scylla.repo -L https://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|
+               sudo curl -o /etc/yum.repos.d/scylla.repo -L https://downloads.scylladb.com/rpm/centos/scylla-2025.4.repo

        #. Install the new ScyllaDB version:

@@ -202,6 +198,11 @@ You should take note of the current version in case you want to |ROLLBACK|_ the
      #. Run ``scylla_setup`` without ``running io_setup``.
      #. Run ``sudo /opt/scylladb/scylla-machine-image/scylla_cloud_io_setup``.

+
+If you need the JMX server, see
+:doc:`Install scylla-jmx Package </getting-started/installation-common/install-jmx>`
+and install the new version.
+
 Start the node
 --------------

--- a/docs/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2026.1/index.rst
+++ b/docs/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2026.1/index.rst
@@ -1,13 +0,0 @@
-==========================================================
-Upgrade - ScyllaDB 2025.x to ScyllaDB 2026.1
-==========================================================
-
-.. toctree::
-   :maxdepth: 2
-   :hidden:
-
-   Upgrade ScyllaDB <upgrade-guide-from-2025.x-to-2026.1>
-   Metrics Update <metric-update-2025.x-to-2026.1>
-
-* :doc:`Upgrade from ScyllaDB 2025.x to ScyllaDB 2026.1 <upgrade-guide-from-2025.x-to-2026.1>`
-* :doc:`Metrics Update Between 2025.x and 2026.1 <metric-update-2025.x-to-2026.1>`
--- a/docs/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2026.1/metric-update-2025.x-to-2026.1.rst
+++ b/docs/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2026.1/metric-update-2025.x-to-2026.1.rst
@@ -1,82 +0,0 @@
-.. |SRC_VERSION| replace:: 2025.x
-.. |NEW_VERSION| replace:: 2026.1
-.. |PRECEDING_VERSION| replace:: 2025.4
-
-================================================================
-Metrics Update Between |SRC_VERSION| and |NEW_VERSION|
-================================================================
-
-.. toctree::
-   :maxdepth: 2
-   :hidden:
-
-ScyllaDB |NEW_VERSION| Dashboards are available as part of the latest |mon_root|.
-
-
-New Metrics in |NEW_VERSION|
--------------------------------------
-
-The following metrics are new in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.
-
-.. list-table::
-   :widths: 25 150
-   :header-rows: 1
-
-   * - Metric
-     - Description
-   * - scylla_alternator_operation_size_kb
-     - Histogram of item sizes involved in a request.
-   * - scylla_column_family_total_disk_space_before_compression
-     - Hypothetical total disk space used if data files weren't compressed
-   * - scylla_group_name_auto_repair_enabled_nr
-     - Number of tablets with auto repair enabled.
-   * - scylla_group_name_auto_repair_needs_repair_nr
-     - Number of tablets with auto repair enabled that currently need repair.
-   * - scylla_lsa_compact_time_ms
-     - Total time spent on segment compaction that was not accounted under ``reclaim_time_ms``.
-   * - scylla_lsa_evict_time_ms
-     - Total time spent on evicting objects that was not accounted under ``reclaim_time_ms``,
-   * - scylla_lsa_reclaim_time_ms
-     - Total time spent in reclaiming LSA memory back to std allocator.
-   * - scylla_object_storage_memory_usage
-     - Total number of bytes consumed by the object storage client.
-   * - scylla_tablet_ops_failed
-     - Number of failed tablet auto repair attempts.
-   * - scylla_tablet_ops_succeeded
-     - Number of successful tablet auto repair attempts.
-   
-Renamed Metrics in |NEW_VERSION|
--------------------------------------
-
-The following metrics are renamed in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.
-
-.. list-table::
-   :widths: 25 150
-   :header-rows: 1
-
-   * - Metric Name in |PRECEDING_VERSION|
-     - Metric Name in |NEW_VERSION|
-   * - scylla_s3_memory_usage
-     - scylla_object_storage_memory_usage
-
-Removed Metrics in |NEW_VERSION|
--------------------------------------
-
-The following metrics are removed in ScyllaDB |NEW_VERSION|.
-
-* scylla_redis_current_connections
-* scylla_redis_op_latency
-* scylla_redis_operation
-* scylla_redis_operation
-* scylla_redis_requests_latency
-* scylla_redis_requests_served
-* scylla_redis_requests_serving
-
-New and Updated Metrics in Previous Releases
-------------------------------------------------------
-
-* `Metrics Update Between 2025.3 and 2025.4 <https://docs.scylladb.com/manual/branch-2025.4/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2025.4/metric-update-2025.x-to-2025.4.html>`_
-* `Metrics Update Between 2025.2 and 2025.3 <https://docs.scylladb.com/manual/branch-2025.3/upgrade/upgrade-guides/upgrade-guide-from-2025.2-to-2025.3/metric-update-2025.2-to-2025.3.html>`_
-* `Metrics Update Between 2025.1 and 2025.2 <https://docs.scylladb.com/manual/branch-2025.2/upgrade/upgrade-guides/upgrade-guide-from-2025.1-to-2025.2/metric-update-2025.1-to-2025.2.html>`_
-
-
--- a/ent/encryption/gcp_host.cc
+++ b/ent/encryption/gcp_host.cc
@@ -284,7 +284,6 @@ future<rjson::value> encryption::gcp_host::impl::gcp_auth_post_with_retry(std::s
                }
                [[fallthrough]];
            case httpclient::reply_status::request_timeout:
-            case httpclient::reply_status::too_many_requests:
                if (retry < max_retries) {
                    // service unavailable etc -> backoff + retry
                    do_backoff = true;
--- a/gms/feature_service.hh
+++ b/gms/feature_service.hh
@@ -182,7 +182,7 @@ public:
    gms::feature removenode_with_left_token_ring { *this, "REMOVENODE_WITH_LEFT_TOKEN_RING"sv };
    gms::feature size_based_load_balancing { *this, "SIZE_BASED_LOAD_BALANCING"sv };
    gms::feature topology_noop_request { *this, "TOPOLOGY_NOOP_REQUEST"sv };
-    gms::feature batchlog_v2 { *this, "BATCHLOG_V2"sv };
+    gms::feature tablets_intermediate_fallback_cleanup { *this, "TABLETS_INTERMEDIATE_FALLBACK_CLEANUP"sv };
 public:

    const std::unordered_map<sstring, std::reference_wrapper<feature>>& registered_features() const;
--- a/index/vector_index.cc
+++ b/index/vector_index.cc
@@ -17,11 +17,11 @@
 #include "index/secondary_index.hh"
 #include "index/secondary_index_manager.hh"
 #include "types/concrete_types.hh"
-#include "types/types.hh"
 #include "utils/managed_string.hh"
 #include <seastar/core/sstring.hh>
 #include <boost/algorithm/string.hpp>

+
 namespace secondary_index {

 static void validate_positive_option(int max, const sstring& value_name, const sstring& value) {
@@ -147,88 +147,17 @@ std::optional<cql3::description> vector_index::describe(const index_metadata& im
 }

 void vector_index::check_target(const schema& schema, const std::vector<::shared_ptr<cql3::statements::index_target>>& targets) const {
-
-    struct validate_visitor {
-        const class schema& schema;
-        bool& is_vector;
-
-        /// Vector indexes support filtering on native types that can be used as primary key columns.
-        /// There is no counter (it cannot be used with vector columns)
-        /// and no duration (it cannot be used as a primary key or in secondary indexes).
-        static bool is_supported_filtering_column(abstract_type const & kind_type) {
-            switch (kind_type.get_kind()) {
-                case abstract_type::kind::ascii:
-                case abstract_type::kind::boolean:
-                case abstract_type::kind::byte:
-                case abstract_type::kind::bytes:
-                case abstract_type::kind::date:
-                case abstract_type::kind::decimal:
-                case abstract_type::kind::double_kind:
-                case abstract_type::kind::float_kind:
-                case abstract_type::kind::inet:
-                case abstract_type::kind::int32:
-                case abstract_type::kind::long_kind:
-                case abstract_type::kind::short_kind:
-                case abstract_type::kind::simple_date:
-                case abstract_type::kind::time:
-                case abstract_type::kind::timestamp:
-                case abstract_type::kind::timeuuid:
-                case abstract_type::kind::utf8:
-                case abstract_type::kind::uuid:
-                case abstract_type::kind::varint:
-                    return true;
-                default:
-                    break;
-            }
-            return false;
-        }
-
-        void validate(cql3::column_identifier const& column, bool is_vector) const {
-            auto const& c_name = column.to_string();
-            auto const* c_def = schema.get_column_definition(column.name());
-            if (c_def == nullptr) {
-                throw exceptions::invalid_request_exception(format("Column {} not found in schema", c_name));
-            }
-
-            auto type = c_def->type;
-
-            if (is_vector) {
-                auto const* vector_type = dynamic_cast<const vector_type_impl*>(type.get());
-                if (vector_type == nullptr) {
-                    throw exceptions::invalid_request_exception("Vector indexes are only supported on columns of vectors of floats");
-                }
-
-                auto elements_type = vector_type->get_elements_type();
-                if (elements_type->get_kind() != abstract_type::kind::float_kind) {
-                    throw exceptions::invalid_request_exception("Vector indexes are only supported on columns of vectors of floats");
-                }
-                return;
-            }
-
-            if (!is_supported_filtering_column(*type)) {
-                throw exceptions::invalid_request_exception(format("Unsupported vector index filtering column {} type", c_name));
-            }
-        }
-
-        void operator()(const std::vector<::shared_ptr<cql3::column_identifier>>& columns) const {
-            for (const auto& column : columns) {
-                // CQL restricts the secondary local index to have multiple columns with partition key only.
-                // Vectors shouldn't be partition key columns and they aren't supported as a filtering column,
-                // so we can assume here that these are non-vectors filtering columns.
-                validate(*column, false);
-            }
-        }
-
-        void operator()(const ::shared_ptr<cql3::column_identifier>& column) {
-            validate(*column, is_vector);
-            // The first column is the vector column, the rest mustn't be vectors.
-            is_vector = false;
-        }
-    };
-
-    bool is_vector = true;
-    for (const auto& target : targets) {
-        std::visit(validate_visitor{.schema = schema, .is_vector = is_vector}, target->value);
+    if (targets.size() != 1) {
+        throw exceptions::invalid_request_exception("Vector index can only be created on a single column");
+    }
+    auto target = targets[0];
+    auto c_def = schema.get_column_definition(to_bytes(target->column_name()));
+    if (!c_def) {
+        throw exceptions::invalid_request_exception(format("Column {} not found in schema", target->column_name()));
+    }
+    auto type = c_def->type;
+    if (!type->is_vector() || static_cast<const vector_type_impl*>(type.get())->get_elements_type()->get_kind() != abstract_type::kind::float_kind) {
+        throw exceptions::invalid_request_exception(format("Vector indexes are only supported on columns of vectors of floats", target->column_name()));
    }
 }

--- a/install.sh
+++ b/install.sh
@@ -347,8 +347,8 @@ install -d -m755 "$retc"/scylla.d
 scylla_yaml_dir=$(mktemp -d)
 scylla_yaml=$scylla_yaml_dir/scylla.yaml
 grep -v api_ui_dir conf/scylla.yaml | grep -v api_doc_dir > $scylla_yaml
-echo "api_ui_dir: $prefix/swagger-ui/dist/" >> $scylla_yaml
-echo "api_doc_dir: $prefix/api/api-doc/" >> $scylla_yaml
+echo "api_ui_dir: /opt/scylladb/swagger-ui/dist/" >> $scylla_yaml
+echo "api_doc_dir: /opt/scylladb/api/api-doc/" >> $scylla_yaml
 installconfig 644 $scylla_yaml "$retc"/scylla
 rm -rf $scylla_yaml_dir

--- a/locator/tablets.cc
+++ b/locator/tablets.cc
@@ -50,6 +50,8 @@ write_replica_set_selector get_selector_for_writes(tablet_transition_stage stage
            return write_replica_set_selector::previous;
        case tablet_transition_stage::write_both_read_old:
            return write_replica_set_selector::both;
+        case tablet_transition_stage::write_both_read_old_fallback_cleanup:
+            return write_replica_set_selector::both;
        case tablet_transition_stage::streaming:
            return write_replica_set_selector::both;
        case tablet_transition_stage::rebuild_repair:
@@ -81,6 +83,8 @@ read_replica_set_selector get_selector_for_reads(tablet_transition_stage stage)
            return read_replica_set_selector::previous;
        case tablet_transition_stage::write_both_read_old:
            return read_replica_set_selector::previous;
+        case tablet_transition_stage::write_both_read_old_fallback_cleanup:
+            return read_replica_set_selector::previous;
        case tablet_transition_stage::streaming:
            return read_replica_set_selector::previous;
        case tablet_transition_stage::rebuild_repair:
@@ -612,16 +616,12 @@ tablet_replica tablet_map::get_primary_replica(tablet_id id, const locator::topo
    return maybe_get_primary_replica(id, replicas, topo, [&] (const auto& _) { return true; }).value();
 }

-tablet_replica tablet_map::get_secondary_replica(tablet_id id, const locator::topology& topo) const {
-    const auto& orig_replicas = get_tablet_info(id).replicas;
-    if (orig_replicas.size() < 2) {
+tablet_replica tablet_map::get_secondary_replica(tablet_id id) const {
+    const auto& replicas = get_tablet_info(id).replicas;
+    if (replicas.size() < 2) {
        throw std::runtime_error(format("No secondary replica for tablet id {}", id));
    }
-    tablet_replica_set replicas = orig_replicas;
-    std::ranges::sort(replicas, tablet_replica_comparator(topo));
-    // This formula must match the one in get_primary_replica(),
-    // just with + 1.
-    return replicas.at((size_t(id) + size_t(id) / replicas.size() + 1) % replicas.size());
+    return replicas.at((size_t(id)+1) % replicas.size());
 }

 std::optional<tablet_replica> tablet_map::maybe_get_selected_replica(tablet_id id, const topology& topo, const tablet_task_info& tablet_task_info) const {
@@ -745,6 +745,7 @@ void tablet_map::set_tablet_raft_info(tablet_id id, tablet_raft_info raft_info)
 static const std::unordered_map<tablet_transition_stage, sstring> tablet_transition_stage_to_name = {
    {tablet_transition_stage::allow_write_both_read_old, "allow_write_both_read_old"},
    {tablet_transition_stage::write_both_read_old, "write_both_read_old"},
+    {tablet_transition_stage::write_both_read_old_fallback_cleanup, "write_both_read_old_fallback_cleanup"},
    {tablet_transition_stage::write_both_read_new, "write_both_read_new"},
    {tablet_transition_stage::streaming, "streaming"},
    {tablet_transition_stage::rebuild_repair, "rebuild_repair"},
--- a/locator/tablets.hh
+++ b/locator/tablets.hh
@@ -277,6 +277,7 @@ std::optional<tablet_info> merge_tablet_info(tablet_info a, tablet_info b);
 enum class tablet_transition_stage {
    allow_write_both_read_old,
    write_both_read_old,
+    write_both_read_old_fallback_cleanup,
    streaming,
    rebuild_repair,
    write_both_read_new,
@@ -647,10 +648,9 @@ public:
    /// Returns the primary replica for the tablet
    tablet_replica get_primary_replica(tablet_id id, const locator::topology& topo) const;

-    /// Returns the secondary replica for the tablet: the replica that immediately follows the primary
-    /// replica in the topology-sorted replica list.
+    /// Returns the secondary replica for the tablet, which is assumed to be directly following the primary replica in the replicas vector
    /// \throws std::runtime_error if the tablet has less than 2 replicas.
-    tablet_replica get_secondary_replica(tablet_id id, const locator::topology& topo) const;
+    tablet_replica get_secondary_replica(tablet_id id) const;

    // Returns the replica that matches hosts and dcs filters for tablet_task_info.
    std::optional<tablet_replica> maybe_get_selected_replica(tablet_id id, const topology& topo, const tablet_task_info& tablet_task_info) const;
--- a/main.cc
+++ b/main.cc
@@ -571,7 +571,7 @@ sharded<service::storage_proxy> *the_storage_proxy;
 // This is used by perf-alternator to allow running scylla together with the tool
 // in a single process. So that it's easier to measure internals. It's not added
 // to main_func_type to not complicate common flow as no other tool needs such logic.
-std::function<void(lw_shared_ptr<db::config>)> after_init_func;
+std::function<future<>(lw_shared_ptr<db::config>, sharded<abort_source>&)> after_init_func;

 static locator::host_id initialize_local_info_thread(sharded<db::system_keyspace>& sys_ks,
        sharded<locator::snitch_ptr>& snitch,
@@ -906,6 +906,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl

            auto background_reclaim_scheduling_group = create_scheduling_group("background_reclaim", "bgre", 50).get();
            auto maintenance_scheduling_group = create_scheduling_group("streaming", "strm", 200).get();
+            debug::streaming_scheduling_group = maintenance_scheduling_group;

            smp::invoke_on_all([&cfg, background_reclaim_scheduling_group] {
                logalloc::tracker::config st_cfg;
@@ -1306,6 +1307,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            checkpoint(stop_signal, "starting storage proxy");
            service::storage_proxy::config spcfg {
                .hints_directory_initializer = hints_dir_initializer,
+                .hints_sched_group = maintenance_scheduling_group,
            };
            spcfg.hinted_handoff_enabled = hinted_handoff_enabled;
            spcfg.available_memory = memory::stats().total_memory();
@@ -1677,7 +1679,9 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
                    gossiper.local(), feature_service.local(), sys_ks.local(), group0_client, dbcfg.gossip_scheduling_group};

            checkpoint(stop_signal, "starting tablet allocator");
-            service::tablet_allocator::config tacfg;
+            service::tablet_allocator::config tacfg {
+                .background_sg = maintenance_scheduling_group,
+            };
            sharded<service::tablet_allocator> tablet_allocator;
            tablet_allocator.start(tacfg, std::ref(mm_notifier), std::ref(db)).get();
            auto stop_tablet_allocator = defer_verbose_shutdown("tablet allocator", [&tablet_allocator] {
@@ -2417,7 +2421,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            bm_cfg.delay = std::chrono::milliseconds(cfg->ring_delay_ms());
            bm_cfg.replay_cleanup_after_replays = cfg->batchlog_replay_cleanup_after_replays();

-            bm.start(std::ref(qp), std::ref(sys_ks), std::ref(feature_service), bm_cfg).get();
+            bm.start(std::ref(qp), std::ref(sys_ks), bm_cfg).get();
            auto stop_batchlog_manager = defer_verbose_shutdown("batchlog manager", [&bm] {
                bm.stop().get();
            });
@@ -2490,7 +2494,9 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl

            if (cfg->view_building()) {
                checkpoint(stop_signal, "starting view builders");
-                view_builder.invoke_on_all(&db::view::view_builder::start, std::ref(mm), utils::cross_shard_barrier()).get();
+                with_scheduling_group(maintenance_scheduling_group, [&mm] {
+                    return view_builder.invoke_on_all(&db::view::view_builder::start, std::ref(mm), utils::cross_shard_barrier());
+                }).get();
            }
            auto drain_view_builder = defer_verbose_shutdown("draining view builders", [&] {
                view_builder.invoke_on_all(&db::view::view_builder::drain).get();
@@ -2576,11 +2582,13 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            supervisor::notify("serving");

            startlog.info("Scylla version {} initialization completed.", scylla_version());
+            future<> after_init_fut = make_ready_future<>();
            if (after_init_func) {
-                after_init_func(cfg);
+                after_init_fut = after_init_func(cfg, stop_signal.as_sharded_abort_source());
            }
            stop_signal.wait().get();
            startlog.info("Signal received; shutting down");
+            std::move(after_init_fut).get();
 	    // At this point, all objects destructors and all shutdown hooks registered with defer() are executed
          } catch (const sleep_aborted&) {
            startlog.info("Startup interrupted");
@@ -2650,7 +2658,8 @@ int main(int ac, char** av) {
        {"perf-load-balancing", perf::scylla_tablet_load_balancing_main, "run tablet load balancer tests"},
        {"perf-simple-query", perf::scylla_simple_query_main, "run performance tests by sending simple queries to this server"},
        {"perf-sstable", perf::scylla_sstable_main, "run performance tests by exercising sstable related operations on this server"},
-        {"perf-alternator", perf::alternator(scylla_main, &after_init_func), "run performance tests on full alternator stack"}
+        {"perf-alternator", perf::alternator(scylla_main, &after_init_func), "run performance tests on full alternator stack"},
+        {"perf-cql-raw", perf::perf_cql_raw(scylla_main, &after_init_func), "run performance tests using raw CQL protocol frames"}
    };

    main_func_type main_func;
--- a/mutation/mutation.cc
+++ b/mutation/mutation.cc
@@ -316,7 +316,7 @@ auto fmt::formatter<mutation>::format(const mutation& m, fmt::format_context& ct
        ++column_iterator;
    }

-    return fmt::format_to(out, "token: {}}}, {}\n}}", dk._token, mutation_partition::printer(s, m.partition()));
+    return fmt::format_to(out, "token: {}}}, {}\n}}", dk.token(), mutation_partition::printer(s, m.partition()));
 }

 namespace mutation_json {
--- a/mutation/mutation.hh
+++ b/mutation/mutation.hh
@@ -126,7 +126,7 @@ public:
    const partition_key& key() const { return _ptr->_dk._key; };
    const dht::decorated_key& decorated_key() const { return _ptr->_dk; };
    dht::ring_position ring_position() const { return { decorated_key() }; }
-    const dht::token& token() const { return _ptr->_dk._token; }
+    dht::token token() const { return _ptr->_dk.token(); }
    const schema_ptr& schema() const { return _ptr->_schema; }
    const mutation_partition& partition() const { return _ptr->_p; }
    mutation_partition& partition() { return _ptr->_p; }
--- a/pgo/profiles/aarch64/profile.profdata.xz
+++ b/pgo/profiles/aarch64/profile.profdata.xz
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:52c9772c9ac334650d8b179b591c47769ee38d34fad784b61c682e11c03f2506
-size 6530196
+oid sha256:a4710f1f0b0bb329721c21d133618e811e820f2e70553b0aca28fb278bff89c9
+size 6492280
--- a/pgo/profiles/x86_64/profile.profdata.xz
+++ b/pgo/profiles/x86_64/profile.profdata.xz
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d1a869ebfe4e90d9681499246eb86bb032ae402c350357e19d97b989037a5bd3
-size 6528308
+oid sha256:2433f7a1fc5cda0dd990ab59587eb6046dca0fe1ae48d599953d1936fe014ed9
+size 6492176
--- a/repair/row_level.cc
+++ b/repair/row_level.cc
@@ -1211,7 +1211,6 @@ private:
        }

        co_await utils::get_local_injector().inject("incremental_repair_prepare_wait", utils::wait_for_message(60s));
-        rlogger.debug("Disabling compaction for range={} for incremental repair", _range);
        auto reenablers_and_holders = co_await table.get_compaction_reenablers_and_lock_holders_for_repair(_db.local(), _frozen_topology_guard, _range);
        for (auto& lock_holder : reenablers_and_holders.lock_holders) {
            _rs._repair_compaction_locks[gid].push_back(std::move(lock_holder));
@@ -1241,8 +1240,6 @@ private:
        // compaction.
        reenablers_and_holders.cres.clear();
        rlogger.info("Re-enabled compaction for range={} for incremental repair", _range);
-
-        co_await utils::get_local_injector().inject("wait_after_prepare_sstables_for_incremental_repair", utils::wait_for_message(5min));
    }

    // Read rows from sstable until the size of rows exceeds _max_row_buf_size  - current_size
@@ -2636,7 +2633,7 @@ future<repair_flush_hints_batchlog_response> repair_service::repair_flush_hints_
                        all_replayed = co_await _bm.local().do_batch_log_replay(db::batchlog_manager::post_replay_cleanup::no);
                        utils::get_local_injector().set_parameter("repair_flush_hints_batchlog_handler", "issue_flush", fmt::to_string(flush_time));
                    }
-                    rlogger.info("repair[{}]: Finished to flush batchlog for repair_flush_hints_batchlog_request from node={}, flushed={} all_replayed={}", req.repair_uuid, from, issue_flush, all_replayed);
+                    rlogger.info("repair[{}]: Finished to flush batchlog for repair_flush_hints_batchlog_request from node={}, flushed={}", req.repair_uuid, from, issue_flush);
                }
            );
            if (!all_replayed) {
@@ -3956,19 +3953,3 @@ future<std::optional<repair_task_progress>> repair_service::get_tablet_repair_ta
            task_uuid, tid, requested, finished, progress.progress(), finished_nomerge);
    co_return progress;
 }
-
-void repair_service::on_cleanup_for_drop_table(const table_id& id) {
-    // Prevent repair lock from being leaked in repair_service when table is dropped midway.
-    // The RPC verb that removes the lock on success path will not be called by coordinator after table was dropped.
-    // We also cannot move the lock from repair_service to repair_meta, since the lock must outlive the latter.
-    // Since tablet metadata has been erased at this point, we can simply erase all instances for the dropped table.
-    rlogger.debug("Cleaning up state for dropped table {}", id);
-    for (auto it = _repair_compaction_locks.begin(); it != _repair_compaction_locks.end();) {
-        auto& [global_tid, _] = *it;
-        if (global_tid.table == id) {
-            it = _repair_compaction_locks.erase(it);
-        } else {
-            it++;
-        }
-    }
-}
--- a/repair/row_level.hh
+++ b/repair/row_level.hh
@@ -318,8 +318,6 @@ public:

    future<uint32_t> get_next_repair_meta_id();

-    void on_cleanup_for_drop_table(const table_id& id);
-
    friend class repair::user_requested_repair_task_impl;
    friend class repair::data_sync_repair_task_impl;
    friend class repair::tablet_repair_task_impl;
--- a/replica/compaction_group.hh
+++ b/replica/compaction_group.hh
@@ -448,7 +448,6 @@ public:
    virtual future<> maybe_split_compaction_group_of(size_t idx) = 0;
    virtual future<std::vector<sstables::shared_sstable>> maybe_split_new_sstable(const sstables::shared_sstable& sst) = 0;
    virtual dht::token_range get_token_range_after_split(const dht::token&) const noexcept = 0;
-    virtual future<> wait_for_background_tablet_resize_work() = 0;

    virtual lw_shared_ptr<sstables::sstable_set> make_sstable_set() const = 0;
 };
--- a/replica/database.hh
+++ b/replica/database.hh
@@ -1368,6 +1368,8 @@ public:
    future<compaction_reenablers_and_lock_holders> get_compaction_reenablers_and_lock_holders_for_repair(replica::database& db,
            const service::frozen_topology_guard& guard, dht::token_range range);
    future<uint64_t> estimated_partitions_in_range(dht::token_range tr) const;
+private:
+    future<std::vector<compaction::compaction_group_view*>> get_compaction_group_views_for_repair(dht::token_range range);
 };

 lw_shared_ptr<sstables::sstable_set> make_tablet_sstable_set(schema_ptr, const storage_group_manager& sgm, const locator::tablet_map&);
--- a/replica/table.cc
+++ b/replica/table.cc
@@ -7,6 +7,7 @@
 */

 #include <seastar/core/seastar.hh>
+#include <seastar/core/shard_id.hh>
 #include <seastar/core/coroutine.hh>
 #include <seastar/core/with_scheduling_group.hh>
 #include <seastar/coroutine/maybe_yield.hh>
@@ -23,7 +24,6 @@
 #include "replica/data_dictionary_impl.hh"
 #include "replica/compaction_group.hh"
 #include "replica/query_state.hh"
-#include "seastar/core/shard_id.hh"
 #include "sstables/shared_sstable.hh"
 #include "sstables/sstable_set.hh"
 #include "sstables/sstables.hh"
@@ -750,7 +750,6 @@ public:
        return make_ready_future<std::vector<sstables::shared_sstable>>(std::vector<sstables::shared_sstable>{sst});
    }
    dht::token_range get_token_range_after_split(const dht::token&) const noexcept override { return dht::token_range(); }
-    future<> wait_for_background_tablet_resize_work() override { return make_ready_future<>(); }

    lw_shared_ptr<sstables::sstable_set> make_sstable_set() const override {
        return get_compaction_group().make_sstable_set();
@@ -769,13 +768,6 @@ class tablet_storage_group_manager final : public storage_group_manager {
    locator::resize_decision::seq_number_t _split_ready_seq_number = std::numeric_limits<locator::resize_decision::seq_number_t>::min();
    future<> _merge_completion_fiber;
    condition_variable _merge_completion_event;
-    // Ensures that processes such as incremental repair will wait for pending work from
-    // merge fiber before proceeding. This guarantees stability on the compaction groups.
-    // NOTE: it's important that we don't await on the barrier with any compaction group
-    // gate held, since merge fiber will stop groups that in turn await on gate,
-    // potentially causing an ABBA deadlock.
-    utils::phased_barrier _merge_fiber_barrier;
-    std::optional<utils::phased_barrier::operation> _pending_merge_fiber_work;
    // Holds compaction reenabler which disables compaction temporarily during tablet merge
    std::vector<compaction::compaction_reenabler> _compaction_reenablers_for_merging;
 private:
@@ -864,7 +856,6 @@ public:
        , _my_host_id(erm.get_token_metadata().get_my_id())
        , _tablet_map(&erm.get_token_metadata().tablets().get_tablet_map(schema()->id()))
        , _merge_completion_fiber(merge_completion_fiber())
-        , _merge_fiber_barrier(format("[table {}.{}] merge_fiber_barrier", _t.schema()->ks_name(), _t.schema()->cf_name()))
    {
        storage_group_map ret;

@@ -917,10 +908,6 @@ public:
    dht::token_range get_token_range_after_split(const dht::token& token) const noexcept override {
        return tablet_map().get_token_range_after_split(token);
    }
-    future<> wait_for_background_tablet_resize_work() override {
-        co_await _merge_fiber_barrier.advance_and_await();
-        co_return;
-    }

    lw_shared_ptr<sstables::sstable_set> make_sstable_set() const override {
        // FIXME: avoid recreation of compound_set for groups which had no change. usually, only one group will be changed at a time.
@@ -2133,31 +2120,33 @@ compaction_group::update_repaired_at_for_merge() {
    });
 }

+future<std::vector<compaction::compaction_group_view*>> table::get_compaction_group_views_for_repair(dht::token_range range) {
+    // Gather a pointer to the unrepaired-data view of each compaction group in the
+    // storage groups selected for `range`; may yield before each storage group.
+    std::vector<compaction::compaction_group_view*> views;
+    auto groups = storage_groups_for_token_range(range);
+    for (auto& group : groups) {
+        co_await coroutine::maybe_yield();
+        group->for_each_compaction_group([&views] (const compaction_group_ptr& cg) { views.push_back(&cg->view_for_unrepaired_data()); });
+    }
+    co_return views;
+}
+
 future<compaction_reenablers_and_lock_holders> table::get_compaction_reenablers_and_lock_holders_for_repair(replica::database& db,
        const service::frozen_topology_guard& guard, dht::token_range range) {
    auto ret = compaction_reenablers_and_lock_holders();
-    // Waits for background tablet resize work like merge that might destroy compaction groups,
-    // providing stability. Essentially, serializes tablet merge completion handling with
-    // the start of incremental repair, from the replica side.
-    co_await _sg_manager->wait_for_background_tablet_resize_work();
-
-    for (auto sg : storage_groups_for_token_range(range)) {
-      // FIXME: indentation
-      auto cgs = sg->compaction_groups_immediate();
-      for (auto& cg : cgs) {
-        auto gate_holder = cg->async_gate().hold();
-        auto& view = cg->view_for_unrepaired_data();
-        auto cre = co_await db.get_compaction_manager().await_and_disable_compaction(view);
+    auto views = co_await get_compaction_group_views_for_repair(range);
+    for (auto view : views) {
+        auto cre = co_await db.get_compaction_manager().await_and_disable_compaction(*view);
        tlogger.info("Disabled compaction for range={} session_id={} for incremental repair", range, guard);
        ret.cres.push_back(std::make_unique<compaction::compaction_reenabler>(std::move(cre)));

        // This lock prevents the unrepaired compaction started by major compaction to run in parallel with repair.
        // The unrepaired compaction started by minor compaction does not need to take the lock since it ignores
        // sstables being repaired, so it can run in parallel with repair.
-        auto lock_holder = co_await db.get_compaction_manager().get_incremental_repair_write_lock(view, "row_level_repair");
+        auto lock_holder = co_await db.get_compaction_manager().get_incremental_repair_write_lock(*view, "row_level_repair");
        tlogger.info("Got unrepaired compaction and repair lock for range={} session_id={} for incremental repair", range, guard);
        ret.lock_holders.push_back(std::move(lock_holder));
-      }
    }
    co_return ret;
 }
@@ -3029,7 +3018,7 @@ future<> tablet_storage_group_manager::merge_completion_fiber() {

    while (!_t.async_gate().is_closed()) {
        try {
-            co_await utils::get_local_injector().inject("merge_completion_fiber", utils::wait_for_message(5min));
+            co_await utils::get_local_injector().inject("merge_completion_fiber", utils::wait_for_message(60s));
            auto ks_name = schema()->ks_name();
            auto cf_name = schema()->cf_name();
            // Enable compaction after merge is done.
@@ -3063,7 +3052,6 @@ future<> tablet_storage_group_manager::merge_completion_fiber() {
        utils::get_local_injector().inject("replica_merge_completion_wait", [] () {
            tlogger.info("Merge completion fiber finished, about to sleep");
        });
-        _pending_merge_fiber_work.reset();
        co_await _merge_completion_event.wait();
        tlogger.debug("Merge completion fiber woke up for {}.{}", schema()->ks_name(), schema()->cf_name());
    }
@@ -3122,7 +3110,6 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(const locator:
        new_storage_groups[new_tid] = std::move(new_sg);
    }
    _storage_groups = std::move(new_storage_groups);
-    _pending_merge_fiber_work = _merge_fiber_barrier.start();
    _merge_completion_event.signal();
 }

@@ -3139,9 +3126,6 @@ void tablet_storage_group_manager::update_effective_replication_map(const locato
    } else if (new_tablet_count < old_tablet_count) {
        tlogger.info0("Detected tablet merge for table {}.{}, decreasing from {} to {} tablets",
                      schema()->ks_name(), schema()->cf_name(), old_tablet_count, new_tablet_count);
-        if (utils::get_local_injector().is_enabled("tablet_force_tablet_count_decrease_once")) {
-            utils::get_local_injector().disable("tablet_force_tablet_count_decrease");
-        }
        handle_tablet_merge_completion(*old_tablet_map, *new_tablet_map);
    }

--- a/service/client_state.cc
+++ b/service/client_state.cc
@@ -98,16 +98,6 @@ future<> service::client_state::has_column_family_access(const sstring& ks,
    co_return co_await has_access(ks, {p, r, t}, is_vector_indexed);
 }

-future<> service::client_state::has_schema_access(const schema& s, auth::permission p) const {
-    auth::resource r = auth::make_data_resource(s.ks_name(), s.cf_name());
-    co_return co_await has_access(s.ks_name(), {p, r});
-}
-
-future<> service::client_state::has_schema_access(const sstring& ks_name, const sstring& cf_name, auth::permission p) const {
-    auth::resource r = auth::make_data_resource(ks_name, cf_name);
-    co_return co_await has_access(ks_name, {p, r});
-}
-
 future<> service::client_state::check_internal_table_permissions(std::string_view ks, std::string_view table_name, const auth::command_desc& cmd) const {
    // 1. CDC and $paxos tables are managed internally by Scylla. Users are prohibited
    //    from running ALTER or DROP commands on them.
@@ -227,8 +217,6 @@ future<> service::client_state::has_access(const sstring& ks, auth::command_desc
    static const std::unordered_set<auth::resource> vector_search_system_resources = {
        auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::GROUP0_HISTORY),
        auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::VERSIONS),
-        auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::CDC_STREAMS),
-        auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::CDC_TIMESTAMPS),
    };

    if ((cmd.resource.kind() == auth::resource_kind::data && cmd.permission == auth::permission::SELECT && is_vector_indexed.has_value() && is_vector_indexed.value()) ||
@@ -365,4 +353,4 @@ future<> service::client_state::set_client_options(
        });
        _client_options.emplace_back(std::move(cached_key), std::move(cached_value));
    }
-}
+}
--- a/service/client_state.hh
+++ b/service/client_state.hh
@@ -359,8 +359,6 @@ public:
    future<> has_keyspace_access(const sstring&, auth::permission) const;
    future<> has_column_family_access(const sstring&, const sstring&, auth::permission,
                                      auth::command_desc::type = auth::command_desc::type::OTHER, std::optional<bool> is_vector_indexed = std::nullopt) const;
-    future<> has_schema_access(const schema& s, auth::permission p) const;
-    future<> has_schema_access(const sstring&, const sstring&, auth::permission p) const;

    future<> has_functions_access(auth::permission p) const;
    future<> has_functions_access(const sstring& ks, auth::permission p) const;
--- a/service/qos/service_level_controller.cc
+++ b/service/qos/service_level_controller.cc
@@ -948,6 +948,10 @@ future<> service_level_controller::migrate_to_v2(size_t nodes_count, db::system_
        qs,
        {},
        cql3::query_processor::cache_internal::no);
+    if (rows->empty()) {
+        co_return;
+    }
+    

    auto col_names = schema->all_columns() | std::views::transform([] (const auto& col) {return col.name_as_cql_string(); }) | std::ranges::to<std::vector<sstring>>();
    auto col_names_str = fmt::to_string(fmt::join(col_names, ", "));
--- a/service/raft/group0_state_id_handler.cc
+++ b/service/raft/group0_state_id_handler.cc
@@ -72,7 +72,7 @@ void group0_state_id_handler::refresh() {
    const auto min_state_id = std::ranges::min(group0_members_state_ids, [](auto a, auto b) {
        if (!a || !b) {
            // This should never happen, but if it does, it's a bug.
-            on_internal_error(slogger, "unexpected empty state_id");
+            on_fatal_internal_error(slogger, "unexpected empty state_id");
        }
        return utils::timeuuid_tri_compare(a, b) < 0;
    });
--- a/service/raft/group0_state_machine.cc
+++ b/service/raft/group0_state_machine.cc
@@ -338,7 +338,7 @@ future<> group0_state_machine::merge_and_apply(group0_state_machine_merger& merg
 }

 #ifndef SCYLLA_BUILD_MODE_RELEASE
-static void ensure_group0_schema(const group0_command& cmd, const replica::database& db) {
+static void ensure_group0_schema(const group0_command& cmd, data_dictionary::database db) {
    auto validate_schema = [&db](const utils::chunked_vector<canonical_mutation>& mutations) {
        for (const auto& mut : mutations) {
            // Get the schema for the column family
@@ -382,7 +382,7 @@ future<> group0_state_machine::apply(std::vector<raft::command_cref> command) {

    // max_mutation_size = 1/2 of commitlog segment size, thus max_command_size is set 1/3 of commitlog segment size to leave space for metadata.
    size_t max_command_size = _sp.data_dictionary().get_config().commitlog_segment_size_in_mb() * 1024 * 1024 / 3;
-    group0_state_machine_merger m(co_await _client.sys_ks().get_last_group0_state_id(), std::move(read_apply_mutex_holder),
+    group0_state_machine_merger m(co_await _client.get_last_group0_state_id(), std::move(read_apply_mutex_holder),
                                  max_command_size, _sp.data_dictionary());

    for (auto&& c : command) {
@@ -392,7 +392,7 @@ future<> group0_state_machine::apply(std::vector<raft::command_cref> command) {
 #ifndef SCYLLA_BUILD_MODE_RELEASE
        // Ensure that the schema of the mutations is a group0 schema.
        // This validation is supposed to be only performed in tests, so it is skipped in the release mode.
-        ensure_group0_schema(cmd, _client.sys_ks().local_db());
+        ensure_group0_schema(cmd, _sp.data_dictionary());
 #endif

        slogger.trace("cmd: prev_state_id: {}, new_state_id: {}, creator_addr: {}, creator_id: {}",
--- a/service/raft/group0_voter_handler.cc
+++ b/service/raft/group0_voter_handler.cc
@@ -149,31 +149,19 @@ public:
        const auto& node = nodes_info.at(voter_id);

        if (node.is_alive) {
-            if (_alive_nodes_remaining == 0) {
-                on_internal_error(rvlogger,
-                        format("rack_info: no alive nodes remaining, but node {} is alive", voter_id));
-            }
+            SCYLLA_ASSERT(_alive_nodes_remaining > 0);
            --_alive_nodes_remaining;
            if (node.is_leader) {
-                if (!_owns_alive_leader) {
-                    on_internal_error(rvlogger,
-                            format("rack_info: rack doesn't own a live leader, but leader {} is alive", voter_id));
-                }
+                SCYLLA_ASSERT(_owns_alive_leader);
                _owns_alive_leader = false;
            }
        }
        if (node.is_voter) {
            if (node.is_alive) {
-                if (_existing_alive_voters_remaining == 0) {
-                    on_internal_error(rvlogger,
-                            format("rack_info: no live voters remaining, but voter {} is alive", voter_id));
-                }
+                SCYLLA_ASSERT(_existing_alive_voters_remaining > 0);
                --_existing_alive_voters_remaining;
            } else {
-                if (_existing_dead_voters_remaining == 0) {
-                    on_internal_error(rvlogger,
-                            format("rack_info: no dead voters remaining, but voter {} is dead", voter_id));
-                }
+                SCYLLA_ASSERT(_existing_dead_voters_remaining > 0);
                --_existing_dead_voters_remaining;
            }
        }
@@ -291,25 +279,16 @@ public:

            if (node.is_alive) {
                if (node.is_voter) {
-                    if (_existing_alive_voters_remaining == 0) {
-                        on_internal_error(rvlogger,
-                                format("datacenter_info: no live voters remaining, but voter {} is alive", *voter_id));
-                    }
+                    SCYLLA_ASSERT(_existing_alive_voters_remaining > 0);
                    --_existing_alive_voters_remaining;
                }
                if (node.is_leader) {
-                    if (!_owns_alive_leader) {
-                        on_internal_error(rvlogger,
-                                format("datacenter_info: DC doesn't own a live leader, but leader {} is alive", *voter_id));
-                    }
+                    SCYLLA_ASSERT(_owns_alive_leader);
                    _owns_alive_leader = false;
                }
            }

-            if (_nodes_remaining == 0) {
-                on_internal_error(rvlogger,
-                        format("datacenter_info: no nodes remaining, but voter {} belongs to this DC", *voter_id));
-            }
+            SCYLLA_ASSERT(_nodes_remaining > 0);

            --_nodes_remaining;
            ++_assigned_voters_count;
--- a/service/raft/raft_group0_client.cc
+++ b/service/raft/raft_group0_client.cc
@@ -245,6 +245,10 @@ utils::UUID raft_group0_client::generate_group0_state_id(utils::UUID prev_state_
    return utils::UUID_gen::get_random_time_UUID_from_micros(std::chrono::microseconds{ts});
 }

+future<utils::UUID> raft_group0_client::get_last_group0_state_id() {
+    return _sys_ks.get_last_group0_state_id();
+}
+
 future<group0_guard> raft_group0_client::start_operation(seastar::abort_source& as, std::optional<raft_timeout> timeout) {
    if (this_shard_id() != 0) {
        on_internal_error(logger, "start_group0_operation: must run on shard 0");
@@ -282,7 +286,7 @@ future<group0_guard> raft_group0_client::start_operation(seastar::abort_source&
            // Read barrier may wait for `group0_state_machine::apply` which also takes this mutex.
            auto read_apply_holder = co_await hold_read_apply_mutex(as);

-            auto observed_group0_state_id = co_await _sys_ks.get_last_group0_state_id();
+            auto observed_group0_state_id = co_await get_last_group0_state_id();
            auto new_group0_state_id = generate_group0_state_id(observed_group0_state_id);

            co_return group0_guard {
@@ -467,10 +471,6 @@ future<semaphore_units<>> raft_group0_client::hold_read_apply_mutex(abort_source
    return get_units(_read_apply_mutex, 1, as);
 }

-db::system_keyspace& raft_group0_client::sys_ks() {
-    return _sys_ks;
-}
-
 bool raft_group0_client::in_recovery() const {
    return _upgrade_state == group0_upgrade_state::recovery;
 }
--- a/service/raft/raft_group0_client.hh
+++ b/service/raft/raft_group0_client.hh
@@ -200,8 +200,6 @@ public:

    future<semaphore_units<>> hold_read_apply_mutex(abort_source&);

-    db::system_keyspace& sys_ks();
-
    bool in_recovery() const;

    gc_clock::duration get_history_gc_duration() const;
@@ -212,6 +210,7 @@ public:
    query_result_guard create_result_guard(utils::UUID query_id);
    void set_query_result(utils::UUID query_id, service::broadcast_tables::query_result qr);
    static utils::UUID generate_group0_state_id(utils::UUID prev_state_id);
+    future<utils::UUID> get_last_group0_state_id();
 };

 using mutations_generator = coroutine::experimental::generator<mutation>;
--- a/service/storage_proxy.cc
+++ b/service/storage_proxy.cc
@@ -123,7 +123,12 @@ utils::small_vector<locator::host_id, N> addr_vector_to_id(const gms::gossiper&
 // Check the effective replication map consistency:
 // we have an inconsistent effective replication map in case we the number of
 // read replicas is higher than the replication factor.
-[[maybe_unused]] void validate_read_replicas(const locator::effective_replication_map& erm, const host_id_vector_replica_set& read_replicas) {
+void validate_read_replicas(const locator::effective_replication_map& erm, const host_id_vector_replica_set& read_replicas) {
+    // Skip for non-debug builds.
+    if constexpr (!tools::build_info::is_debug_build()) {
+        return;
+    }
+
    const sstring error = erm.get_replication_strategy().sanity_check_read_replicas(erm, read_replicas);
    if (!error.empty()) {
        on_internal_error(slogger, error);
@@ -3216,9 +3221,9 @@ storage_proxy::storage_proxy(sharded<replica::database>& db, storage_proxy::conf
    , _write_ack_smp_service_group(cfg.write_ack_smp_service_group)
    , _next_response_id(std::chrono::system_clock::now().time_since_epoch()/1ms)
    , _hints_resource_manager(*this, cfg.available_memory / 10, _db.local().get_config().max_hinted_handoff_concurrency)
-    , _hints_manager(*this, _db.local().get_config().hints_directory(), cfg.hinted_handoff_enabled, _db.local().get_config().max_hint_window_in_ms(), _hints_resource_manager, _db)
+    , _hints_manager(*this, _db.local().get_config().hints_directory(), cfg.hinted_handoff_enabled, _db.local().get_config().max_hint_window_in_ms(), _hints_resource_manager, _db, cfg.hints_sched_group)
    , _hints_directory_initializer(std::move(cfg.hints_directory_initializer))
-    , _hints_for_views_manager(*this, _db.local().get_config().view_hints_directory(), {}, _db.local().get_config().max_hint_window_in_ms(), _hints_resource_manager, _db)
+    , _hints_for_views_manager(*this, _db.local().get_config().view_hints_directory(), {}, _db.local().get_config().max_hint_window_in_ms(), _hints_resource_manager, _db, cfg.hints_sched_group)
    , _stats_key(stats_key)
    , _features(feat)
    , _background_write_throttle_threahsold(cfg.available_memory / 10)
@@ -4286,7 +4291,7 @@ storage_proxy::mutate_atomically_result(utils::chunked_vector<mutation> mutation
    public:
        context(storage_proxy & p, utils::chunked_vector<mutation>&& mutations, lw_shared_ptr<cdc::operation_result_tracker>&& cdc_tracker, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, coordinator_mutate_options options)
                : _p(p)
-                , _schema(_p.local_db().find_schema(db::system_keyspace::NAME, _p.features().batchlog_v2 ? db::system_keyspace::BATCHLOG_V2 : db::system_keyspace::BATCHLOG))
+                , _schema(_p.local_db().find_schema(db::system_keyspace::NAME, db::system_keyspace::BATCHLOG_V2))
                , _ermp(_p.local_db().find_column_family(_schema->id()).get_effective_replication_map())
                , _mutations(std::move(mutations))
                , _cdc_tracker(std::move(cdc_tracker))
@@ -6967,12 +6972,7 @@ host_id_vector_replica_set storage_proxy::get_endpoints_for_reading(const schema
        return host_id_vector_replica_set{my_host_id(erm)};
    }
    auto endpoints = erm.get_replicas_for_reading(token);
-    // Skip for non-debug builds and maintenance mode.
-    if constexpr (tools::build_info::is_debug_build()) {
-        if (!_db.local().get_config().maintenance_mode()) {
-            validate_read_replicas(erm, endpoints);
-        }
-    }
+    validate_read_replicas(erm, endpoints);
    auto it = std::ranges::remove_if(endpoints, std::not_fn(std::bind_front(&storage_proxy::is_alive, this, std::cref(erm)))).begin();
    endpoints.erase(it, endpoints.end());
    sort_endpoints_by_proximity(erm, endpoints);
--- a/service/storage_proxy.hh
+++ b/service/storage_proxy.hh
@@ -195,6 +195,7 @@ public:
        // they need a separate smp_service_group to prevent an ABBA deadlock
        // with writes.
        smp_service_group write_ack_smp_service_group = default_smp_service_group();
+        scheduling_group hints_sched_group;
    };
 private:

--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -532,16 +532,9 @@ future<> storage_service::raft_topology_update_ip(locator::host_id id, gms::inet
    co_await when_all_succeed(sys_ks_futures.begin(), sys_ks_futures.end()).discard_result();
 }

-static std::unordered_set<locator::host_id> get_released_nodes(const service::topology& topology, const locator::token_metadata& tm) {
-    return boost::join(topology.left_nodes, topology.ignored_nodes)
-            | std::views::transform([] (const auto& raft_id) { return locator::host_id(raft_id.uuid()); })
-            | std::views::filter([&] (const auto& h) { return !tm.get_topology().has_node(h); })
-            | std::ranges::to<std::unordered_set<locator::host_id>>();
-}
-
 // Synchronizes the local node state (token_metadata, system.peers/system.local tables,
 // gossiper) to align it with the other raft topology nodes.
-future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_topology_nodes(mutable_token_metadata_ptr tmptr, std::unordered_set<raft::server_id> prev_normal, std::optional<std::unordered_set<locator::host_id>> prev_released) {
+future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_topology_nodes(mutable_token_metadata_ptr tmptr, std::unordered_set<raft::server_id> prev_normal) {
    nodes_to_notify_after_sync nodes_to_notify;

    rtlogger.trace("Start sync_raft_topology_nodes");
@@ -632,9 +625,7 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
            co_await update_topology_change_info(tmptr, ::format("{} {}/{}", rs.state, id, ip));
            break;
        case node_state::replacing: {
-            if (!_topology_state_machine._topology.req_param.contains(id)) {
-                on_internal_error(rtlogger, format("No request parameters for replacing node {}", id));
-            }
+            SCYLLA_ASSERT(_topology_state_machine._topology.req_param.contains(id));
            auto replaced_id = std::get<replace_param>(_topology_state_machine._topology.req_param[id]).replaced_id;
            auto existing_ip = _address_map.find(locator::host_id{replaced_id.uuid()});
            const auto replaced_host_id = locator::host_id(replaced_id.uuid());
@@ -651,7 +642,7 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
            co_await process_normal_node(id, host_id, ip, rs);
            break;
        default:
-            on_internal_error(rtlogger, ::format("Unexpected state {} for node {}", rs.state, id));
+            on_fatal_internal_error(rtlogger, ::format("Unexpected state {} for node {}", rs.state, id));
        }
    };

@@ -697,10 +688,13 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
        }
    }

-    if (prev_released) {
-        auto nodes_to_release = get_released_nodes(t, *tmptr);
-        std::erase_if(nodes_to_release, [&] (const auto& host_id) { return prev_released->contains(host_id); });
-        std::copy(nodes_to_release.begin(), nodes_to_release.end(), std::back_inserter(nodes_to_notify.released));
+    auto nodes_to_release = t.left_nodes;
+    nodes_to_release.insert(t.ignored_nodes.begin(), t.ignored_nodes.end());
+    for (const auto& id: nodes_to_release) {
+        auto host_id = locator::host_id(id.uuid());
+        if (!tmptr->get_topology().find_node(host_id)) {
+            nodes_to_notify.released.push_back(host_id);
+        }
    }

    co_await when_all_succeed(sys_ks_futures.begin(), sys_ks_futures.end()).discard_result();
@@ -738,10 +732,6 @@ future<> storage_service::topology_state_load(state_change_hint hint) {

    rtlogger.debug("reload raft topology state");
    std::unordered_set<raft::server_id> prev_normal = _topology_state_machine._topology.normal_nodes | std::views::keys | std::ranges::to<std::unordered_set>();
-    std::optional<std::unordered_set<locator::host_id>> prev_released;
-    if (!_topology_state_machine._topology.is_empty()) {
-        prev_released = get_released_nodes(_topology_state_machine._topology, get_token_metadata());
-    }

    std::unordered_set<locator::host_id> tablet_hosts = co_await replica::read_required_hosts(_qp);

@@ -842,7 +832,7 @@ future<> storage_service::topology_state_load(state_change_hint hint) {
        }, topology.tstate);
        tmptr->set_read_new(read_new);

-        auto nodes_to_notify = co_await sync_raft_topology_nodes(tmptr, std::move(prev_normal), std::move(prev_released));
+        auto nodes_to_notify = co_await sync_raft_topology_nodes(tmptr, std::move(prev_normal));

        std::optional<locator::tablet_metadata> tablets;
        if (hint.tablets_hint) {
@@ -6286,11 +6276,7 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
            break;
            case raft_topology_cmd::command::stream_ranges: {
                co_await with_scheduling_group(_db.local().get_streaming_scheduling_group(), coroutine::lambda([&] () -> future<> {
-                    const auto* server_rs = _topology_state_machine._topology.find(id);
-                    if (!server_rs) {
-                        on_internal_error(rtlogger, format("Got {} request for node {} not found in topology", cmd.cmd, id));
-                    }
-                    const auto rs = server_rs->second;
+                    const auto rs = _topology_state_machine._topology.find(id)->second;
                    auto tstate = _topology_state_machine._topology.tstate;
                    auto session = _topology_state_machine._topology.session;
                    if (!rs.ring || rs.ring->tokens.empty()) {
@@ -7342,15 +7328,11 @@ future<locator::load_stats> storage_service::load_stats_for_tablet_based_tables(

    const locator::host_id this_host = _db.local().get_token_metadata().get_my_id();

-    // Align to 64 bytes to avoid cache line ping-pong when updating size in map_reduce0() below
-    struct alignas(64) aligned_tablet_size {
-        uint64_t size = 0;
-    };
-    std::vector<aligned_tablet_size> tablet_sizes_per_shard(smp::count);
+    uint64_t sum_tablet_sizes = 0;

    // Each node combines a per-table load map from all of its shards and returns it to the coordinator.
    // So if there are 1k nodes, there will be 1k RPCs in total.
-    auto load_stats = co_await _db.map_reduce0([&table_ids, &this_host, &tablet_sizes_per_shard] (replica::database& db) -> future<locator::load_stats> {
+    auto load_stats = co_await _db.map_reduce0([&table_ids, &this_host, &sum_tablet_sizes] (replica::database& db) -> future<locator::load_stats> {
        locator::load_stats load_stats{};
        auto& tables_metadata = db.get_tables_metadata();

@@ -7388,7 +7370,7 @@ future<locator::load_stats> storage_service::load_stats_for_tablet_based_tables(

            locator::combined_load_stats combined_ls { table->table_load_stats(tablet_filter) };
            load_stats.tables.emplace(id, std::move(combined_ls.table_ls));
-            tablet_sizes_per_shard[this_shard_id()].size += load_stats.tablet_stats[this_host].add_tablet_sizes(combined_ls.tablet_ls);
+            sum_tablet_sizes += load_stats.tablet_stats[this_host].add_tablet_sizes(combined_ls.tablet_ls);

            co_await coroutine::maybe_yield();
        }
@@ -7407,10 +7389,6 @@ future<locator::load_stats> storage_service::load_stats_for_tablet_based_tables(
    if (config_capacity != 0) {
        tls.effective_capacity = config_capacity;
    } else {
-        uint64_t sum_tablet_sizes = 0;
-        for (const auto& ts : tablet_sizes_per_shard) {
-            sum_tablet_sizes += ts.size;
-        }
        tls.effective_capacity = si.available + sum_tablet_sizes;
    }

@@ -8453,7 +8431,6 @@ future<> storage_service::start_maintenance_mode() {
    set_mode(mode::MAINTENANCE);

    return mutate_token_metadata([this] (mutable_token_metadata_ptr token_metadata) -> future<> {
-        token_metadata->update_topology(my_host_id(), _snitch.local()->get_location(), locator::node::state::normal, smp::count);
        return token_metadata->update_normal_tokens({ dht::token{} }, my_host_id());
    }, acquire_merge_lock::yes);
 }
@@ -8626,13 +8603,4 @@ future<> storage_service::query_cdc_streams(table_id table, noncopyable_function
    return _cdc_gens.local().query_cdc_streams(table, std::move(f));
 }

-future<> storage_service::on_cleanup_for_drop_table(const table_id& id) {
-    co_await container().invoke_on_all([id] (storage_service& ss) {
-        if (ss._repair.local_is_initialized()) {
-            ss._repair.local().on_cleanup_for_drop_table(id);
-        }
-    });
-    co_return;
-}
-
 } // namespace service
--- a/service/storage_service.hh
+++ b/service/storage_service.hh
@@ -617,8 +617,6 @@ public:
    virtual void on_drop_function(const sstring& ks_name, const sstring& function_name) override {}
    virtual void on_drop_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
    virtual void on_drop_view(const sstring& ks_name, const sstring& view_name) override {}
-
-    future<> on_cleanup_for_drop_table(const table_id& id);
 private:
    std::optional<db::system_keyspace::peer_info> get_peer_info_for_update(locator::host_id endpoint);
    // return an engaged value iff app_state_map has changes to the peer info
@@ -1117,7 +1115,7 @@ private:
    // gossiper) to align it with the other raft topology nodes.
    // Optional target_node can be provided to restrict the synchronization to the specified node.
    // Returns a structure that describes which notifications to trigger after token metadata is updated.
-    future<nodes_to_notify_after_sync> sync_raft_topology_nodes(mutable_token_metadata_ptr tmptr, std::unordered_set<raft::server_id> prev_normal, std::optional<std::unordered_set<locator::host_id>> prev_released);
+    future<nodes_to_notify_after_sync> sync_raft_topology_nodes(mutable_token_metadata_ptr tmptr, std::unordered_set<raft::server_id> prev_normal);
    // Triggers notifications (on_joined, on_left) based on the recent changes to token metadata, as described by the passed in structure.
    // This function should be called on the result of `sync_raft_topology_nodes`, after the global token metadata is updated.
    future<> notify_nodes_after_sync(nodes_to_notify_after_sync&& nodes_to_notify);
--- a/service/tablet_allocator.cc
+++ b/service/tablet_allocator.cc
@@ -951,6 +951,8 @@ private:
                return true;
            case tablet_transition_stage::write_both_read_old:
                return true;
+            case tablet_transition_stage::write_both_read_old_fallback_cleanup:
+                return false;
            case tablet_transition_stage::streaming:
                return true;
            case tablet_transition_stage::rebuild_repair:
@@ -1107,11 +1109,6 @@ public:
        if (!is_auto_repair_enabled(config)) {
            co_return false;
        }
-        auto size = info.replicas.size();
-        if (size <= 1) {
-            lblogger.debug("Skipped auto repair for tablet={} replicas={}", gid, size);
-            co_return false;
-        }
        auto threshold = _db.get_config().auto_repair_threshold_default_in_seconds();
        auto repair_time_threshold = std::chrono::seconds(threshold);
        auto& last_repair_time = info.repair_time;
@@ -3865,6 +3862,7 @@ class tablet_allocator_impl : public tablet_allocator::impl
    service::migration_notifier& _migration_notifier;
    replica::database& _db;
    load_balancer_stats_manager _load_balancer_stats;
+    scheduling_group _background;
    bool _stopped = false;
    bool _use_tablet_aware_balancing = true;
    locator::load_stats_ptr _load_stats;
@@ -3886,7 +3884,9 @@ public:
    tablet_allocator_impl(tablet_allocator::config cfg, service::migration_notifier& mn, replica::database& db)
            : _migration_notifier(mn)
            , _db(db)
-            , _load_balancer_stats("load_balancer") {
+            , _load_balancer_stats("load_balancer")
+            , _background(cfg.background_sg)
+    {
        _migration_notifier.register_listener(this);
    }

@@ -3903,7 +3903,7 @@ public:

    future<migration_plan> balance_tablets(token_metadata_ptr tm, service::topology* topology, db::system_keyspace* sys_ks, locator::load_stats_ptr table_load_stats, std::unordered_set<host_id> skiplist) {
        auto lb = make_load_balancer(tm, topology, sys_ks, table_load_stats ? table_load_stats : _load_stats, std::move(skiplist));
-        co_await coroutine::switch_to(_db.get_streaming_scheduling_group());
+        co_await coroutine::switch_to(_background);
        co_return co_await lb.make_plan();
    }

--- a/service/tablet_allocator.hh
+++ b/service/tablet_allocator.hh
@@ -257,6 +257,7 @@ class migration_notifier;
 class tablet_allocator {
 public:
    struct config {
+        scheduling_group background_sg;
    };
    class impl {
    public:
--- a/service/topology_coordinator.cc
+++ b/service/topology_coordinator.cc
@@ -331,17 +331,12 @@ class topology_coordinator : public endpoint_lifecycle_subscriber

        auto [id, req] = *next_req;

-        auto* server_rs = topo.find(id);
-        if (!server_rs) {
-            on_internal_error(rtlogger, format("Node {} has a pending {} request but is not found in topology", id, req));
-        }
-
        if (cleanup_needed && (req == topology_request::remove || req == topology_request::leave)) {
            // If the highest prio request is removenode or decommission we need to start cleanup if one is needed
            return start_vnodes_cleanup(std::move(guard), req, id);
        }

-        return node_to_work_on(std::move(guard), &topo, id, &server_rs->second, req, get_request_param(id));
+        return node_to_work_on(std::move(guard), &topo, id, &topo.find(id)->second, req, get_request_param(id));
    };

    node_to_work_on get_node_to_work_on(group0_guard guard) const {
@@ -378,9 +373,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
        auto& topo = _topo_sm._topology;

        auto it = topo.find(id);
-        if (!it) {
-            on_internal_error(rtlogger, format("retake_node: node {} not found in topology", id));
-        }
+        SCYLLA_ASSERT(it);

        std::optional<topology_request> req;
        auto rit = topo.requests.find(id);
@@ -1546,7 +1539,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
                case locator::tablet_transition_stage::allow_write_both_read_old:
                    if (action_failed(tablet_state.barriers[trinfo.stage])) {
                        if (check_excluded_replicas()) {
-                            transition_to_with_barrier(locator::tablet_transition_stage::cleanup_target);
+                            transition_to(locator::tablet_transition_stage::cleanup_target);
                            break;
                        }
                    }
@@ -1567,7 +1560,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
                case locator::tablet_transition_stage::write_both_read_old:
                    if (action_failed(tablet_state.barriers[trinfo.stage])) {
                        if (check_excluded_replicas()) {
-                            transition_to_with_barrier(locator::tablet_transition_stage::cleanup_target);
+                            transition_to(locator::tablet_transition_stage::cleanup_target);
                            break;
                        }
                    }
@@ -1577,17 +1570,18 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
                        transition_to_with_barrier(locator::tablet_transition_stage::streaming);
                    }
                    break;
+                case locator::tablet_transition_stage::write_both_read_old_fallback_cleanup:
+                    transition_to_with_barrier(locator::tablet_transition_stage::cleanup_target);
+                    break;
                case locator::tablet_transition_stage::rebuild_repair: {
                    if (action_failed(tablet_state.rebuild_repair)) {
                        bool fail = utils::get_local_injector().enter("rebuild_repair_stage_fail");
                        if (fail || check_excluded_replicas()) {
-                            if (do_barrier()) {
-                                rtlogger.debug("Will set tablet {} stage to {}", gid, locator::tablet_transition_stage::cleanup_target);
-                                updates.emplace_back(get_mutation_builder()
-                                        .set_stage(last_token, locator::tablet_transition_stage::cleanup_target)
-                                        .del_session(last_token)
-                                        .build());
-                            }
+                            rtlogger.debug("Will set tablet {} stage to {}", gid, locator::tablet_transition_stage::cleanup_target);
+                            updates.emplace_back(get_mutation_builder()
+                                    .set_stage(last_token, locator::tablet_transition_stage::cleanup_target)
+                                    .del_session(last_token)
+                                    .build());
                            break;
                        }
                    }
@@ -1660,13 +1654,11 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
                        }

                        if (rollback) {
-                            if (do_barrier()) {
-                                rtlogger.debug("Will set tablet {} stage to {}: {}", gid, locator::tablet_transition_stage::cleanup_target, *rollback);
-                                updates.emplace_back(get_mutation_builder()
-                                        .set_stage(last_token, locator::tablet_transition_stage::cleanup_target)
-                                        .del_session(last_token)
-                                        .build());
-                            }
+                            rtlogger.debug("Will set tablet {} stage to {}: {}", gid, locator::tablet_transition_stage::cleanup_target, *rollback);
+                            updates.emplace_back(get_mutation_builder()
+                                .set_stage(last_token, locator::tablet_transition_stage::cleanup_target)
+                                .del_session(last_token)
+                                .build());
                            break;
                        }
                    }
@@ -1703,7 +1695,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
                        _exit(1);
                    });

-                    auto next_stage = locator::tablet_transition_stage::use_new;
                    if (action_failed(tablet_state.barriers[trinfo.stage])) {
                        auto& tinfo = tmap.get_tablet_info(gid.tablet);
                        unsigned excluded_old = 0;
@@ -1725,10 +1716,15 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
                        // than excluded_old for intra-node migration.
                        if (excluded_new > excluded_old && trinfo.transition != locator::tablet_transition_kind::intranode_migration) {
                            rtlogger.debug("During {} stage of {} {} new nodes and {} old nodes were excluded", trinfo.stage, gid, excluded_new, excluded_old);
-                            next_stage = locator::tablet_transition_stage::cleanup_target;
+                            if (_feature_service.tablets_intermediate_fallback_cleanup) {
+                                transition_to(locator::tablet_transition_stage::write_both_read_old_fallback_cleanup);
+                            } else {
+                                transition_to_with_barrier(locator::tablet_transition_stage::cleanup_target);
+                            }
+                            break;
                        }
                    }
-                    transition_to_with_barrier(next_stage);
+                    transition_to_with_barrier(locator::tablet_transition_stage::use_new);
                }
                    break;
                case locator::tablet_transition_stage::use_new:
@@ -2499,9 +2495,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber

                switch (node.rs->state) {
                    case node_state::bootstrapping: {
-                        if (node.rs->ring) {
-                            on_internal_error(rtlogger, format("Bootstrapping node {} owns tokens", node.id));
-                        }
+                        SCYLLA_ASSERT(!node.rs->ring);
                        auto num_tokens = std::get<join_param>(node.req_param.value()).num_tokens;
                        auto tokens_string = std::get<join_param>(node.req_param.value()).tokens_string;

@@ -2557,23 +2551,11 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
                    }
                        break;
                    case node_state::replacing: {
-                        if (node.rs->ring) {
-                            on_internal_error(rtlogger, format("Replacing node {} owns tokens", node.id));
-                        }
+                        SCYLLA_ASSERT(!node.rs->ring);
                        auto replaced_id = std::get<replace_param>(node.req_param.value()).replaced_id;
                        auto it = _topo_sm._topology.normal_nodes.find(replaced_id);
-                        if (it == _topo_sm._topology.normal_nodes.end()) {
-                            on_internal_error(rtlogger,
-                                    format("Node {} being replaced by {} not found in normal nodes", replaced_id, node.id));
-                        }
-                        if (!it->second.ring) {
-                            on_internal_error(rtlogger,
-                                    format("Node {} being replaced by {} is missing tokens", replaced_id, node.id));
-                        }
-                        if (it->second.state != node_state::normal) {
-                            on_internal_error(rtlogger,
-                                    format("Node {} being replaced by {} is not in normal state", replaced_id, node.id));
-                        }
+                        SCYLLA_ASSERT(it != _topo_sm._topology.normal_nodes.end());
+                        SCYLLA_ASSERT(it->second.ring && it->second.state == node_state::normal);

                        topology_mutation_builder builder(node.guard.write_timestamp());

@@ -2972,7 +2954,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
                    }
                    break;
                default:
-                    on_internal_error(rtlogger, ::format(
+                    on_fatal_internal_error(rtlogger, ::format(
                            "Ring state on node {} is write_both_read_new while the node is in state {}",
                            node.id, node.rs->state));
                }
@@ -3289,9 +3271,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
                rtbuilder.set("start_time", db_clock::now());
                switch (node.request.value()) {
                    case topology_request::join: {
-                        if (node.rs->ring) {
-                            on_internal_error(rtlogger, ::format("Joining node {} owns tokens", node.id));
-                        }
+                        SCYLLA_ASSERT(!node.rs->ring);
                        // Write chosen tokens through raft.
                        builder.set_transition_state(topology::transition_state::join_group0)
                               .with_node(node.id)
@@ -3303,9 +3283,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
                        break;
                        }
                    case topology_request::leave: {
-                        if (!node.rs->ring) {
-                            on_internal_error(rtlogger, ::format("Leaving node {} doesn't own tokens", node.id));
-                        }
+                        SCYLLA_ASSERT(node.rs->ring);

                        auto validation_result = validate_removing_node(_db, to_host_id(node.id));
                        if (std::holds_alternative<node_validation_failure>(validation_result)) {
@@ -3336,9 +3314,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
                        break;
                        }
                    case topology_request::remove: {
-                        if (!node.rs->ring) {
-                            on_internal_error(rtlogger, ::format("Node {} being removed doesn't own tokens", node.id));
-                        }
+                        SCYLLA_ASSERT(node.rs->ring);

                        auto validation_result = validate_removing_node(_db, to_host_id(node.id));
                        if (std::holds_alternative<node_validation_failure>(validation_result)) {
@@ -3366,9 +3342,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
                        break;
                        }
                    case topology_request::replace: {
-                        if (node.rs->ring) {
-                            on_internal_error(rtlogger, ::format("Replacing node {} owns tokens", node.id));
-                        }
+                        SCYLLA_ASSERT(!node.rs->ring);

                        builder.set_transition_state(topology::transition_state::join_group0)
                               .with_node(node.id)
@@ -3425,12 +3399,12 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
            case node_state::removing:
            case node_state::replacing:
                // Should not get here
-                on_internal_error(rtlogger, ::format(
+                on_fatal_internal_error(rtlogger, ::format(
                    "Found node {} in state {} but there is no ongoing topology transition",
                    node.id, node.rs->state));
            case node_state::left:
                // Should not get here
-                on_internal_error(rtlogger, ::format(
+                on_fatal_internal_error(rtlogger, ::format(
                        "Topology coordinator is called for node {} in state 'left'", node.id));
                break;
        }
@@ -3492,9 +3466,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber

        auto id = node.id;

-        if (_topo_sm._topology.transition_nodes.empty()) {
-            on_internal_error(rtlogger, format("transition nodes are empty while accepting node {}", node.id));
-        }
+        SCYLLA_ASSERT(!_topo_sm._topology.transition_nodes.empty());

        release_node(std::move(node));

@@ -3904,9 +3876,6 @@ future<> topology_coordinator::refresh_tablet_load_stats() {
        for (auto& [table_id, table_stats] : dc_stats.tables) {
            co_await coroutine::maybe_yield();

-            if (!_db.column_family_exists(table_id)) {
-                continue;
-            }
            auto& t = _db.find_column_family(table_id);
            auto& rs = t.get_effective_replication_map()->get_replication_strategy();
            if (!rs.uses_tablets()) {
@@ -3930,9 +3899,6 @@ future<> topology_coordinator::refresh_tablet_load_stats() {
    }

    for (auto& [table_id, table_load_stats] : stats.tables) {
-        if (!total_replicas.contains(table_id)) {
-            continue;
-        }
        auto table_total_replicas = total_replicas.at(table_id);
        if (table_total_replicas == 0) {
            continue;
--- a/service/topology_mutation.cc
+++ b/service/topology_mutation.cc
@@ -20,8 +20,6 @@ namespace db {

 namespace service {

-extern logging::logger rtlogger;
-
 topology_mutation_builder::topology_mutation_builder(api::timestamp_type ts) :
        _s(db::system_keyspace::topology()),
        _m(_s, partition_key::from_singular(*_s, db::system_keyspace::TOPOLOGY)),
@@ -37,9 +35,7 @@ topology_node_mutation_builder::topology_node_mutation_builder(topology_mutation
 template<typename Builder>
 Builder& topology_mutation_builder_base<Builder>::apply_atomic(const char* cell, const data_value& value) {
    const column_definition* cdef = self().schema().get_column_definition(cell);
-    if (!cdef) {
-        on_internal_error(rtlogger, format("column {} not found in the topology table", cell));
-    }
+    SCYLLA_ASSERT(cdef);
    self().row().apply(*cdef, atomic_cell::make_live(*cdef->type, self().timestamp(), cdef->type->decompose(value), self().ttl()));
    return self();
 }
@@ -49,9 +45,7 @@ template<std::ranges::range C>
 requires std::convertible_to<std::ranges::range_value_t<C>, data_value>
 Builder& topology_mutation_builder_base<Builder>::apply_set(const char* cell, collection_apply_mode apply_mode, const C& c) {
    const column_definition* cdef = self().schema().get_column_definition(cell);
-    if (!cdef) {
-        on_internal_error(rtlogger, format("column {} not found in the topology table", cell));
-    }
+    SCYLLA_ASSERT(cdef);
    auto vtype = static_pointer_cast<const set_type_impl>(cdef->type)->get_elements_type();

    std::set<bytes, serialized_compare> cset(vtype->as_less_comparator());
@@ -76,9 +70,7 @@ Builder& topology_mutation_builder_base<Builder>::apply_set(const char* cell, co
 template<typename Builder>
 Builder& topology_mutation_builder_base<Builder>::del(const char* cell) {
    auto cdef = self().schema().get_column_definition(cell);
-    if (!cdef) {
-        on_internal_error(rtlogger, format("column {} not found in the topology table", cell));
-    }
+    SCYLLA_ASSERT(cdef);
    if (!cdef->type->is_multi_cell()) {
        self().row().apply(*cdef, atomic_cell::make_dead(self().timestamp(), gc_clock::now()));
    } else {
--- a/sstables/mx/partition_reversing_data_source.cc
+++ b/sstables/mx/partition_reversing_data_source.cc
@@ -8,6 +8,7 @@

 #include <seastar/core/coroutine.hh>
 #include <seastar/core/iostream.hh>
+#include <seastar/util/memory-data-source.hh>
 #include "partition_reversing_data_source.hh"
 #include "reader_permit.hh"
 #include "sstables/consumer.hh"
@@ -15,7 +16,6 @@
 #include "sstables/shared_sstable.hh"
 #include "sstables/sstables.hh"
 #include "sstables/types.hh"
-#include "utils/buffer_input_stream.hh"
 #include "utils/to_string.hh"

 namespace sstables {
@@ -417,7 +417,7 @@ private:
            }
            _current_read_size = std::min(max_read_size, _current_read_size * 2);
        }
-        co_return make_buffer_input_stream(_cached_read.share(_cached_read.size() - row_size, row_size));
+        co_return seastar::util::as_input_stream(_cached_read.share(_cached_read.size() - row_size, row_size));
    }
    temporary_buffer<char> last_row(size_t row_size) {
        auto tmp = _cached_read.share(_cached_read.size() - row_size, row_size);
--- a/sstables/storage.cc
+++ b/sstables/storage.cc
@@ -95,6 +95,7 @@ public:
    virtual future<file> open_component(const sstable& sst, component_type type, open_flags flags, file_open_options options, bool check_integrity) override;
    virtual future<data_sink> make_data_or_index_sink(sstable& sst, component_type type) override;
    future<data_source> make_data_or_index_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options opt) const override;
+    future<data_source> make_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options opt) const override;
    virtual future<data_sink> make_component_sink(sstable& sst, component_type type, open_flags oflags, file_output_stream_options options) override;
    virtual future<> destroy(const sstable& sst) override { return make_ready_future<>(); }
    virtual future<atomic_delete_context> atomic_delete_prepare(const std::vector<shared_sstable>&) const override;
@@ -132,8 +133,12 @@ future<data_sink> filesystem_storage::make_data_or_index_sink(sstable& sst, comp
    }
 }

-future<data_source> filesystem_storage::make_data_or_index_source(sstable&, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options opt) const {
+future<data_source> filesystem_storage::make_data_or_index_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options opt) const {
    SCYLLA_ASSERT(type == component_type::Data || type == component_type::Index);
+    co_return co_await make_source(sst, type, std::move(f), offset, len, std::move(opt));
+}
+
+future<data_source> filesystem_storage::make_source(sstable&, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options opt) const {
    co_return make_file_data_source(std::move(f), offset, len, std::move(opt));
 }

@@ -615,6 +620,7 @@ public:
    future<file> open_component(const sstable& sst, component_type type, open_flags flags, file_open_options options, bool check_integrity) override;
    future<data_sink> make_data_or_index_sink(sstable& sst, component_type type) override;
    future<data_source> make_data_or_index_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options opt) const override;
+    future<data_source> make_source(sstable& sst, component_type type, file, uint64_t offset, uint64_t len, file_input_stream_options) const override;

    future<data_sink> make_component_sink(sstable& sst, component_type type, open_flags oflags, file_output_stream_options options) override;
    future<> destroy(const sstable& sst) override {
@@ -657,6 +663,7 @@ public:
    {}

    future<data_source> make_data_or_index_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options opt) const override;
+    future<data_source> make_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options opt) const override;
 };

 object_name object_storage_base::make_object_name(const sstable& sst, component_type type) const {
@@ -742,13 +749,23 @@ future<data_sink> object_storage_base::make_data_or_index_sink(sstable& sst, com

 future<data_source>
 object_storage_base::make_data_or_index_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options options) const {
+    co_return co_await make_source(sst, type, f, offset, len, options);
+}
+
+future<data_source>
+object_storage_base::make_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options) const {
    co_return co_await maybe_wrap_source(sst, type, _client->make_download_source(make_object_name(sst, type), abort_source()), offset, len);
 }

 future<data_source>
 s3_storage::make_data_or_index_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options options) const {
+    co_return co_await make_source(sst, type, std::move(f), offset, len, std::move(options));
+}
+
+future<data_source>
+s3_storage::make_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options options) const {
    if (offset == 0) {
-        co_return co_await object_storage_base::make_data_or_index_source(sst, type, std::move(f), offset, len, std::move(options));
+        co_return co_await object_storage_base::make_source(sst, type, std::move(f), offset, len, std::move(options));
    }
    co_return make_file_data_source(
        co_await maybe_wrap_file(sst, type, open_flags::ro, _client->make_readable_file(make_object_name(sst, type), abort_source())), offset, len, std::move(options));
--- a/sstables/storage.hh
+++ b/sstables/storage.hh
@@ -107,6 +107,7 @@ public:
    virtual future<file> open_component(const sstable& sst, component_type type, open_flags flags, file_open_options options, bool check_integrity) = 0;
    virtual future<data_sink> make_data_or_index_sink(sstable& sst, component_type type) = 0;
    virtual future<data_source> make_data_or_index_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options opt) const = 0;
+    virtual future<data_source> make_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options opt) const = 0;
    virtual future<data_sink> make_component_sink(sstable& sst, component_type type, open_flags oflags, file_output_stream_options options) = 0;
    virtual future<> destroy(const sstable& sst) = 0;
    virtual future<atomic_delete_context> atomic_delete_prepare(const std::vector<shared_sstable>&) const = 0;
--- a/Show More
+++ b/Show More