Mirror of https://github.com/scylladb/scylladb.git, synced 2026-05-13 11:22:01 +00:00.
Compare commits: 138 commits, scylla-202 ... copilot/co
.github/workflows/iwyu.yaml (vendored; 5 lines changed)
@@ -14,7 +14,8 @@ env:
   CLEANER_DIRS: test/unit exceptions alternator api auth cdc compaction db dht gms index lang message mutation mutation_writer node_ops raft redis replica service
   SEASTAR_BAD_INCLUDE_OUTPUT_PATH: build/seastar-bad-include.log

-permissions: {}
+permissions:
+  contents: read

 # cancel the in-progress run upon a repush
 concurrency:
@@ -34,8 +35,6 @@ jobs:
       - uses: actions/checkout@v4
         with:
           submodules: true
-      - run: |
-          sudo dnf -y install clang-tools-extra
       - name: Generate compilation database
         run: |
           cmake \
.github/workflows/trigger-scylla-ci.yaml (vendored; 22 lines changed)
@@ -9,34 +9,16 @@ on:

 jobs:
   trigger-jenkins:
-    if: (github.event_name == 'issue_comment' && github.event.comment.user.login != 'scylladbbot') || github.event.label.name == 'conflicts'
+    if: (github.event.comment.user.login != 'scylladbbot' && contains(github.event.comment.body, '@scylladbbot') && contains(github.event.comment.body, 'trigger-ci')) || github.event.label.name == 'conflicts'
     runs-on: ubuntu-latest
     steps:
-      - name: Validate Comment Trigger
-        if: github.event_name == 'issue_comment'
-        id: verify_comment
-        shell: bash
-        run: |
-          BODY=$(cat << 'EOF'
-          ${{ github.event.comment.body }}
-          EOF
-          )
-          CLEAN_BODY=$(echo "$BODY" | grep -v '^[[:space:]]*>')
-
-          if echo "$CLEAN_BODY" | grep -qi '@scylladbbot' && echo "$CLEAN_BODY" | grep -qi 'trigger-ci'; then
-            echo "trigger=true" >> $GITHUB_OUTPUT
-          else
-            echo "trigger=false" >> $GITHUB_OUTPUT
-          fi
-
       - name: Trigger Scylla-CI-Route Jenkins Job
-        if: github.event_name == 'pull_request_target' || steps.verify_comment.outputs.trigger == 'true'
         env:
           JENKINS_USER: ${{ secrets.JENKINS_USERNAME }}
           JENKINS_API_TOKEN: ${{ secrets.JENKINS_TOKEN }}
           JENKINS_URL: "https://jenkins.scylladb.com"
         run: |
-          PR_NUMBER=${{ github.event.issue.number || github.event.pull_request.number }}
+          PR_NUMBER=${{ github.event.issue.number }}
          PR_REPO_NAME=${{ github.event.repository.full_name }}
          curl -X POST "$JENKINS_URL/job/releng/job/Scylla-CI-Route/buildWithParameters?PR_NUMBER=$PR_NUMBER&PR_REPO_NAME=$PR_REPO_NAME" \
            --user "$JENKINS_USER:$JENKINS_API_TOKEN" --fail -i -v
@@ -78,7 +78,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=2026.1.0-rc2
+VERSION=2026.2.0-dev

 if test -f version
 then
@@ -244,7 +244,10 @@ static bool is_set_of(const rjson::value& type1, const rjson::value& type2) {

 // Check if two JSON-encoded values match with the CONTAINS relation
 bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2, bool v1_from_query, bool v2_from_query) {
-    if (!v1) {
+    if (!v1 || !v1->IsObject() || v1->MemberCount() == 0) {
+        return false;
+    }
+    if (!v2.IsObject() || v2.MemberCount() == 0) {
         return false;
     }
     const auto& kv1 = *v1->MemberBegin();
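Reviewer note: the extra checks matter because the very next statement dereferences `v1->MemberBegin()`, which is undefined for an empty object. A minimal standalone sketch of the same guard (a hypothetical `attr_value` stand-in based on `std::map`, not the rjson types):

```cpp
#include <cassert>
#include <map>
#include <optional>
#include <string>
#include <utility>

// Hypothetical stand-in for a JSON-encoded attribute value: a single
// {type, payload} entry, e.g. {"S", "hello"} for a string.
using attr_value = std::map<std::string, std::string>;

// Returns the single {type, payload} member, or nullopt if the value is
// missing or malformed (empty object) - mirroring the added null/empty checks.
std::optional<std::pair<std::string, std::string>> first_member(const attr_value* v) {
    if (!v || v->empty()) {      // guard before dereferencing begin()
        return std::nullopt;
    }
    return *v->begin();
}

int main() {
    attr_value s{{"S", "hello"}};
    attr_value empty{};
    assert(first_member(&s));
    assert(!first_member(&empty));   // dereferencing begin() here would be UB
    assert(!first_member(nullptr));
    return 0;
}
```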
@@ -45,7 +45,7 @@ bool consumed_capacity_counter::should_add_capacity(const rjson::value& request)
 }

 void consumed_capacity_counter::add_consumed_capacity_to_response_if_needed(rjson::value& response) const noexcept {
-    if (_should_add_to_reponse) {
+    if (_should_add_to_response) {
         auto consumption = rjson::empty_object();
         rjson::add(consumption, "CapacityUnits", get_consumed_capacity_units());
         rjson::add(response, "ConsumedCapacity", std::move(consumption));
@@ -53,7 +53,9 @@ void consumed_capacity_counter::add_consumed_capacity_to_response_if_needed(rjso
 }

 static uint64_t calculate_half_units(uint64_t unit_block_size, uint64_t total_bytes, bool is_quorum) {
-    uint64_t half_units = (total_bytes + unit_block_size - 1) / unit_block_size; // divide by unit_block_size and round up
+    // Avoid potential integer overflow when total_bytes is close to UINT64_MAX
+    // by using division with modulo instead of addition before division
+    uint64_t half_units = total_bytes / unit_block_size + (total_bytes % unit_block_size != 0 ? 1 : 0);

     if (is_quorum) {
         half_units *= 2;
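Reviewer note: worth spelling out why the rewrite is safer. A self-contained sketch (plain C++, not the Scylla code) contrasting the naive round-up with the overflow-safe form:

```cpp
#include <cassert>
#include <cstdint>
#include <limits>

// Naive ceiling division: overflows when x + block - 1 wraps past UINT64_MAX.
static uint64_t ceil_div_naive(uint64_t x, uint64_t block) {
    return (x + block - 1) / block;
}

// Overflow-safe variant, as in the patched calculate_half_units():
// divide first, then add 1 only if there is a remainder.
static uint64_t ceil_div_safe(uint64_t x, uint64_t block) {
    return x / block + (x % block != 0 ? 1 : 0);
}

int main() {
    constexpr uint64_t block = 4096;
    // Ordinary inputs: both forms agree.
    assert(ceil_div_naive(1, block) == 1 && ceil_div_safe(1, block) == 1);
    assert(ceil_div_naive(8192, block) == 2 && ceil_div_safe(8192, block) == 2);
    // Near UINT64_MAX the naive form wraps around and yields a tiny result.
    const uint64_t huge = std::numeric_limits<uint64_t>::max() - 10;
    assert(ceil_div_naive(huge, block) != ceil_div_safe(huge, block));
    assert(ceil_div_safe(huge, block) == huge / block + 1);
    return 0;
}
```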
@@ -28,9 +28,9 @@ namespace alternator {
 class consumed_capacity_counter {
 public:
     consumed_capacity_counter() = default;
-    consumed_capacity_counter(bool should_add_to_reponse) : _should_add_to_reponse(should_add_to_reponse){}
+    consumed_capacity_counter(bool should_add_to_response) : _should_add_to_response(should_add_to_response){}
     bool operator()() const noexcept {
-        return _should_add_to_reponse;
+        return _should_add_to_response;
     }

     consumed_capacity_counter& operator +=(uint64_t bytes);
@@ -44,7 +44,7 @@ public:
     uint64_t _total_bytes = 0;
     static bool should_add_capacity(const rjson::value& request);
 protected:
-    bool _should_add_to_reponse = false;
+    bool _should_add_to_response = false;
 };

 class rcu_consumed_capacity_counter : public consumed_capacity_counter {
@@ -834,11 +834,13 @@ future<> executor::fill_table_size(rjson::value &table_description, schema_ptr s
         total_size = co_await _ss.estimate_total_sstable_volume(schema->id(), service::storage_service::ignore_errors::yes);
         const auto expiry = std::chrono::seconds{ _proxy.data_dictionary().get_config().alternator_describe_table_info_cache_validity_in_seconds() };
-        // Note: we don't care when the notification of other shards will finish, as long as it will be done
-        // it's possible to get into race condition (next DescribeTable comes to other shard, that new shard doesn't have
-        // the size yet, so it will calculate it again) - this is not a problem, because it will call cache_newly_calculated_size_on_all_shards
-        // with expiry, which is extremely unlikely to be exactly the same as the previous one, all shards will keep the size coming with expiry that is further into the future.
-        // In case of the same expiry, some shards will have different size, which means DescribeTable will return different values depending on the shard
-        // which is also fine, as the specification doesn't give precision guarantees of any kind.
+        // A race condition is possible: if a DescribeTable request arrives on a different shard before
+        // that shard receives the cached size, it will recalculate independently. This is acceptable because:
+        // 1. Both calculations will cache their results with an expiry time
+        // 2. Expiry times are unlikely to be identical, so eventually all shards converge to the most recent value
+        // 3. Even if expiry times match, different shards may briefly return different table sizes
+        // 4. This temporary inconsistency is acceptable per DynamoDB specification, which doesn't guarantee
+        //    exact precision for DescribeTable size information
         co_await cache_newly_calculated_size_on_all_shards(schema, total_size, expiry);
     }
 }
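Reviewer note: the "latest expiry wins" convergence argument in the new comment can be illustrated with a tiny sketch (hypothetical `shard_cache` type, not the executor's actual cache):

```cpp
#include <cassert>
#include <chrono>
#include <cstdint>
#include <optional>

using sclock = std::chrono::steady_clock;

struct cached_size {
    uint64_t bytes;
    sclock::time_point expiry;
};

struct shard_cache {
    std::optional<cached_size> value;

    // Accept a newly calculated size only if it expires later than what we
    // already hold; ties keep the existing value (shards may briefly disagree,
    // which DescribeTable tolerates).
    void offer(uint64_t bytes, sclock::time_point expiry) {
        if (!value || expiry > value->expiry) {
            value = cached_size{bytes, expiry};
        }
    }
};

int main() {
    shard_cache c;
    auto now = sclock::now();
    c.offer(100, now + std::chrono::seconds(10));
    c.offer(200, now + std::chrono::seconds(30));  // later expiry wins
    c.offer(150, now + std::chrono::seconds(20));  // older expiry is ignored
    assert(c.value->bytes == 200);
    return 0;
}
```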
@@ -491,7 +491,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl

     if (!opts.enabled()) {
         rjson::add(ret, "StreamDescription", std::move(stream_desc));
-        return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
+        co_return rjson::print(std::move(ret));
     }

     // TODO: label
@@ -502,123 +502,121 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
     // filter out cdc generations older than the table or now() - cdc::ttl (typically dynamodb_streams_max_window - 24h)
     auto low_ts = std::max(as_timepoint(schema->id()), db_clock::now() - ttl);

-    return _sdks.cdc_get_versioned_streams(low_ts, { normal_token_owners }).then([db, shard_start, limit, ret = std::move(ret), stream_desc = std::move(stream_desc)] (std::map<db_clock::time_point, cdc::streams_version> topologies) mutable {
+    std::map<db_clock::time_point, cdc::streams_version> topologies = co_await _sdks.cdc_get_versioned_streams(low_ts, { normal_token_owners });

     auto e = topologies.end();
     auto prev = e;
     auto shards = rjson::empty_array();
     std::optional<shard_id> last;

     auto i = topologies.begin();
     // if we're a paged query, skip to the generation where we left of.
     if (shard_start) {
         i = topologies.find(shard_start->time);
     }

     // for parent-child stuff we need id:s to be sorted by token
     // (see explanation above) since we want to find closest
     // token boundary when determining parent.
     // #7346 - we processed and searched children/parents in
     // stored order, which is not necessarily token order,
     // so the finding of "closest" token boundary (using upper bound)
     // could give somewhat weird results.
     static auto token_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
         return id1.token() < id2.token();
     };
     // #7409 - shards must be returned in lexicographical order,
     // normal bytes compare is string_traits<int8_t>::compare.
     // thus bytes 0x8000 is less than 0x0000. By doing unsigned
     // compare instead we inadvertently will sort in string lexical.
     static auto id_cmp = [](const cdc::stream_id& id1, const cdc::stream_id& id2) {
         return compare_unsigned(id1.to_bytes(), id2.to_bytes()) < 0;
     };

     // need a prev even if we are skipping stuff
     if (i != topologies.begin()) {
         prev = std::prev(i);
     }

     for (; limit > 0 && i != e; prev = i, ++i) {
         auto& [ts, sv] = *i;

         last = std::nullopt;

         auto lo = sv.streams.begin();
         auto end = sv.streams.end();

         std::sort(lo, end, id_cmp);

         if (shard_start) {
             // find next shard position
             lo = std::upper_bound(lo, end, shard_start->id, id_cmp);
             shard_start = std::nullopt;
         }

         if (lo != end && prev != e) {
             // We want older stuff sorted in token order so we can find matching
             // token range when determining parent shard.
             std::stable_sort(prev->second.streams.begin(), prev->second.streams.end(), token_cmp);
         }

         auto expired = [&]() -> std::optional<db_clock::time_point> {
             auto j = std::next(i);
             if (j == e) {
                 return std::nullopt;
             }
             // add this so we sort of match potential
             // sequence numbers in get_records result.
             return j->first + confidence_interval(db);
         }();

         while (lo != end) {
             auto& id = *lo++;

             auto shard = rjson::empty_object();

             if (prev != e) {
                 auto& pids = prev->second.streams;
                 auto pid = std::upper_bound(pids.begin(), pids.end(), id.token(), [](const dht::token& t, const cdc::stream_id& id) {
                     return t < id.token();
                 });
                 if (pid != pids.begin()) {
                     pid = std::prev(pid);
                 }
                 if (pid != pids.end()) {
                     rjson::add(shard, "ParentShardId", shard_id(prev->first, *pid));
                 }
             }

             last.emplace(ts, id);
             rjson::add(shard, "ShardId", *last);
             auto range = rjson::empty_object();
             rjson::add(range, "StartingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(ts.time_since_epoch())));
             if (expired) {
                 rjson::add(range, "EndingSequenceNumber", sequence_number(utils::UUID_gen::min_time_UUID(expired->time_since_epoch())));
             }

             rjson::add(shard, "SequenceNumberRange", std::move(range));
             rjson::push_back(shards, std::move(shard));

             if (--limit == 0) {
                 break;
             }

             last = std::nullopt;
         }
     }

     if (last) {
         rjson::add(stream_desc, "LastEvaluatedShardId", *last);
     }

     rjson::add(stream_desc, "Shards", std::move(shards));
     rjson::add(ret, "StreamDescription", std::move(stream_desc));

-        return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
-    });
+    co_return rjson::print(std::move(ret));
 }

 enum class shard_iterator_type {
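Reviewer note on the #7409 comment above: signed versus unsigned byte comparison really does flip the order of ids whose first byte has the high bit set. A standalone sketch (hand-rolled comparators; `compare_unsigned` in the tree is the real analogue):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Compare two byte strings as signed bytes (how an int8_t-based compare behaves).
static int compare_signed(const std::vector<uint8_t>& a, const std::vector<uint8_t>& b) {
    for (size_t i = 0; i < a.size() && i < b.size(); ++i) {
        int8_t x = static_cast<int8_t>(a[i]), y = static_cast<int8_t>(b[i]);
        if (x != y) return x < y ? -1 : 1;
    }
    return a.size() == b.size() ? 0 : (a.size() < b.size() ? -1 : 1);
}

// Compare as unsigned bytes - this matches the lexicographic order of the
// hex-encoded shard ids that streams clients see.
static int compare_unsigned_bytes(const std::vector<uint8_t>& a, const std::vector<uint8_t>& b) {
    for (size_t i = 0; i < a.size() && i < b.size(); ++i) {
        if (a[i] != b[i]) return a[i] < b[i] ? -1 : 1;
    }
    return a.size() == b.size() ? 0 : (a.size() < b.size() ? -1 : 1);
}

int main() {
    std::vector<uint8_t> hi{0x80, 0x00};  // hex "8000"
    std::vector<uint8_t> lo{0x00, 0x00};  // hex "0000"
    // Signed compare puts 0x8000 first (0x80 reads as -128)...
    assert(compare_signed(hi, lo) < 0);
    // ...while unsigned compare sorts "0000" < "8000", i.e. string-lexical order.
    assert(compare_unsigned_bytes(lo, hi) < 0);
    return 0;
}
```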
@@ -898,172 +896,169 @@ future<executor::request_return_type> executor::get_records(client_state& client
     auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice),
             query::tombstone_limit(_proxy.get_tombstone_limit()), query::row_limit(limit * mul));

-    co_return co_await _proxy.query(schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state)).then(
-            [this, schema, partition_slice = std::move(partition_slice), selection = std::move(selection), start_time = std::move(start_time), limit, key_names = std::move(key_names), attr_names = std::move(attr_names), type, iter, high_ts] (service::storage_proxy::coordinator_query_result qr) mutable {
+    service::storage_proxy::coordinator_query_result qr = co_await _proxy.query(schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state));
     cql3::selection::result_set_builder builder(*selection, gc_clock::now());
     query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));

     auto result_set = builder.build();
     auto records = rjson::empty_array();

     auto& metadata = result_set->get_metadata();

     auto op_index = std::distance(metadata.get_names().begin(),
             std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
                 return cdef->name->name() == op_column_name;
             })
     );
     auto ts_index = std::distance(metadata.get_names().begin(),
             std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
                 return cdef->name->name() == timestamp_column_name;
             })
     );
     auto eor_index = std::distance(metadata.get_names().begin(),
             std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
                 return cdef->name->name() == eor_column_name;
             })
     );

     std::optional<utils::UUID> timestamp;
     auto dynamodb = rjson::empty_object();
     auto record = rjson::empty_object();
     const auto dc_name = _proxy.get_token_metadata_ptr()->get_topology().get_datacenter();

     using op_utype = std::underlying_type_t<cdc::operation>;

     auto maybe_add_record = [&] {
         if (!dynamodb.ObjectEmpty()) {
             rjson::add(record, "dynamodb", std::move(dynamodb));
             dynamodb = rjson::empty_object();
         }
         if (!record.ObjectEmpty()) {
             rjson::add(record, "awsRegion", rjson::from_string(dc_name));
             rjson::add(record, "eventID", event_id(iter.shard.id, *timestamp));
             rjson::add(record, "eventSource", "scylladb:alternator");
             rjson::add(record, "eventVersion", "1.1");
             rjson::push_back(records, std::move(record));
             record = rjson::empty_object();
             --limit;
         }
     };

     for (auto& row : result_set->rows()) {
         auto op = static_cast<cdc::operation>(value_cast<op_utype>(data_type_for<op_utype>()->deserialize(*row[op_index])));
         auto ts = value_cast<utils::UUID>(data_type_for<utils::UUID>()->deserialize(*row[ts_index]));
         auto eor = row[eor_index].has_value() ? value_cast<bool>(boolean_type->deserialize(*row[eor_index])) : false;

         if (!dynamodb.HasMember("Keys")) {
             auto keys = rjson::empty_object();
             describe_single_item(*selection, row, key_names, keys);
             rjson::add(dynamodb, "Keys", std::move(keys));
             rjson::add(dynamodb, "ApproximateCreationDateTime", utils::UUID_gen::unix_timestamp_in_sec(ts).count());
             rjson::add(dynamodb, "SequenceNumber", sequence_number(ts));
             rjson::add(dynamodb, "StreamViewType", type);
             // TODO: SizeBytes
         }

         /**
          * We merge rows with same timestamp into a single event.
          * This is pretty much needed, because a CDC row typically
          * encodes ~half the info of an alternator write.
          *
          * A big, big downside to how alternator records are written
          * (i.e. CQL), is that the distinction between INSERT and UPDATE
          * is somewhat lost/unmappable to actual eventName.
          * A write (currently) always looks like an insert+modify
          * regardless whether we wrote existing record or not.
          *
          * Maybe RMW ops could be done slightly differently so
          * we can distinguish them here...
          *
          * For now, all writes will become MODIFY.
          *
          * Note: we do not check the current pre/post
          * flags on CDC log, instead we use data to
          * drive what is returned. This is (afaict)
          * consistent with dynamo streams
          */
         switch (op) {
         case cdc::operation::pre_image:
         case cdc::operation::post_image:
         {
             auto item = rjson::empty_object();
             describe_single_item(*selection, row, attr_names, item, nullptr, true);
             describe_single_item(*selection, row, key_names, item);
             rjson::add(dynamodb, op == cdc::operation::pre_image ? "OldImage" : "NewImage", std::move(item));
             break;
         }
         case cdc::operation::update:
             rjson::add(record, "eventName", "MODIFY");
             break;
         case cdc::operation::insert:
             rjson::add(record, "eventName", "INSERT");
             break;
         case cdc::operation::service_row_delete:
         case cdc::operation::service_partition_delete:
         {
             auto user_identity = rjson::empty_object();
             rjson::add(user_identity, "Type", "Service");
             rjson::add(user_identity, "PrincipalId", "dynamodb.amazonaws.com");
             rjson::add(record, "userIdentity", std::move(user_identity));
             rjson::add(record, "eventName", "REMOVE");
             break;
         }
         default:
             rjson::add(record, "eventName", "REMOVE");
             break;
         }
         if (eor) {
             maybe_add_record();
             timestamp = ts;
             if (limit == 0) {
                 break;
             }
         }
     }

     auto ret = rjson::empty_object();
     auto nrecords = records.Size();
     rjson::add(ret, "Records", std::move(records));

     if (nrecords != 0) {
         // #9642. Set next iterators threshold to > last
         shard_iterator next_iter(iter.table, iter.shard, *timestamp, false);
         // Note that here we unconditionally return NextShardIterator,
         // without checking if maybe we reached the end-of-shard. If the
         // shard did end, then the next read will have nrecords == 0 and
         // will notice end end of shard and not return NextShardIterator.
         rjson::add(ret, "NextShardIterator", next_iter);
         _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
-        return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
+        co_return rjson::print(std::move(ret));
     }

     // ugh. figure out if we are and end-of-shard
     auto normal_token_owners = _proxy.get_token_metadata_ptr()->count_normal_token_owners();

-    return _sdks.cdc_current_generation_timestamp({ normal_token_owners }).then([this, iter, high_ts, start_time, ret = std::move(ret)](db_clock::time_point ts) mutable {
+    db_clock::time_point ts = co_await _sdks.cdc_current_generation_timestamp({ normal_token_owners });
     auto& shard = iter.shard;

     if (shard.time < ts && ts < high_ts) {
         // The DynamoDB documentation states that when a shard is
         // closed, reading it until the end has NextShardIterator
         // "set to null". Our test test_streams_closed_read
         // confirms that by "null" they meant not set at all.
     } else {
         // We could have return the same iterator again, but we did
         // a search from it until high_ts and found nothing, so we
         // can also start the next search from high_ts.
         // TODO: but why? It's simpler just to leave the iterator be.
         shard_iterator next_iter(iter.table, iter.shard, utils::UUID_gen::min_time_UUID(high_ts.time_since_epoch()), true);
         rjson::add(ret, "NextShardIterator", iter);
     }
     _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
     if (is_big(ret)) {
-        return make_ready_future<executor::request_return_type>(make_streamed(std::move(ret)));
+        co_return make_streamed(std::move(ret));
     }
-    return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
-    });
-    });
+    co_return rjson::print(std::move(ret));
 }

 bool executor::add_stream_options(const rjson::value& stream_specification, schema_builder& builder, service::storage_proxy& sp) {
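Reviewer note: both converted functions follow the same mechanical pattern — each `.then([captures] (result) { ... })` becomes a plain `co_await`, captured state becomes ordinary locals, and `return make_ready_future<T>(x)` becomes `co_return x`. A minimal compilable sketch of that shape (toy `task` type standing in for Seastar's `future`; the names are illustrative, not Seastar's API):

```cpp
#include <coroutine>
#include <iostream>
#include <string>
#include <utility>

// Tiny eager task so the sketch is self-contained; Seastar's future<T> is
// the real analogue. The coroutine runs immediately, parks at its end, and
// the caller reads the value before destroying the frame.
template <typename T>
struct task {
    struct promise_type {
        T value{};
        task get_return_object() {
            return task(std::coroutine_handle<promise_type>::from_promise(*this));
        }
        std::suspend_never initial_suspend() { return {}; }
        std::suspend_always final_suspend() noexcept { return {}; }
        void return_value(T v) { value = std::move(v); }
        void unhandled_exception() { std::terminate(); }
    };

    explicit task(std::coroutine_handle<promise_type> hh) : h(hh) {}
    task(task&& o) noexcept : h(std::exchange(o.h, {})) {}
    ~task() { if (h) h.destroy(); }

    // Awaiter interface: the value is always ready in this toy model.
    bool await_ready() const { return h.done(); }
    void await_suspend(std::coroutine_handle<>) const {}
    T await_resume() const { return std::move(h.promise().value); }
    T get() const { return h.promise().value; }

    std::coroutine_handle<promise_type> h;
};

// Stands in for an asynchronous call such as _proxy.query(...).
task<int> fetch_row_count() { co_return 42; }

// Old shape (continuation):
//   return fetch_row_count().then([](int n) {
//       return make_ready_future<std::string>(std::to_string(n));
//   });
// New shape (coroutine): the lambda body becomes straight-line code.
task<std::string> render() {
    int n = co_await fetch_row_count();  // was: .then([](int n) { ... })
    co_return std::to_string(n);         // was: return make_ready_future<...>(...)
}

int main() {
    std::cout << render().get() << '\n';  // prints 42
    return 0;
}
```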
@@ -515,15 +515,6 @@ void set_sstables_loader(http_context& ctx, routes& r, sharded<sstables_loader>&
         auto sstables = parsed.GetArray() |
                 std::views::transform([] (const auto& s) { return sstring(rjson::to_string_view(s)); }) |
                 std::ranges::to<std::vector>();
-        apilog.info("Restore invoked with following parameters: keyspace={}, table={}, endpoint={}, bucket={}, prefix={}, sstables_count={}, scope={}, primary_replica_only={}",
-                keyspace,
-                table,
-                endpoint,
-                bucket,
-                prefix,
-                sstables.size(),
-                scope,
-                primary_replica_only);
         auto task_id = co_await sst_loader.local().download_new_sstables(keyspace, table, prefix, std::move(sstables), endpoint, bucket, scope, primary_replica_only);
         co_return json::json_return_type(fmt::to_string(task_id));
     });
@@ -209,15 +209,11 @@ future<> audit::stop_audit() {
     });
 }

-audit_info_ptr audit::create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table) {
+audit_info_ptr audit::create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table, bool batch) {
     if (!audit_instance().local_is_initialized()) {
         return nullptr;
     }
-    return std::make_unique<audit_info>(cat, keyspace, table);
-}
-
-audit_info_ptr audit::create_no_audit_info() {
-    return audit_info_ptr();
+    return std::make_unique<audit_info>(cat, keyspace, table, batch);
 }

 future<> audit::start(const db::config& cfg) {
@@ -267,18 +263,21 @@ future<> audit::log_login(const sstring& username, socket_address client_ip, boo
 }

 future<> inspect(shared_ptr<cql3::cql_statement> statement, service::query_state& query_state, const cql3::query_options& options, bool error) {
-    cql3::statements::batch_statement* batch = dynamic_cast<cql3::statements::batch_statement*>(statement.get());
-    if (batch != nullptr) {
+    auto audit_info = statement->get_audit_info();
+    if (!audit_info) {
+        return make_ready_future<>();
+    }
+    if (audit_info->batch()) {
+        cql3::statements::batch_statement* batch = static_cast<cql3::statements::batch_statement*>(statement.get());
         return do_for_each(batch->statements().begin(), batch->statements().end(), [&query_state, &options, error] (auto&& m) {
             return inspect(m.statement, query_state, options, error);
         });
-    } else {
-        auto audit_info = statement->get_audit_info();
-        if (bool(audit_info) && audit::local_audit_instance().should_log(audit_info)) {
-            return audit::local_audit_instance().log(audit_info, query_state, options, error);
-        }
-        return make_ready_future<>();
     }
+    if (audit::local_audit_instance().should_log(audit_info)) {
+        return audit::local_audit_instance().log(audit_info, query_state, options, error);
+    }
     return make_ready_future<>();
 }

 future<> inspect_login(const sstring& username, socket_address client_ip, bool error) {
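Reviewer note: carrying an explicit `batch` flag in `audit_info` lets the hot path skip the RTTI probe, and the `static_cast` is safe because only batch statements construct their info with `batch == true`. A minimal sketch of that pattern (hypothetical `statement` hierarchy, not the cql3 classes):

```cpp
#include <cassert>
#include <memory>
#include <vector>

struct statement {
    explicit statement(bool is_batch) : _batch(is_batch) {}
    virtual ~statement() = default;
    bool batch() const { return _batch; }  // flag fixed at construction
private:
    bool _batch;
};

struct batch_statement : statement {
    std::vector<std::shared_ptr<statement>> inner;
    batch_statement() : statement(true) {}
};

// Count the leaf statements an auditor would log, recursing into batches.
// Because only batch_statement constructs with batch == true, the
// static_cast below is safe without RTTI (mirrors the audit::inspect change).
int count_auditable(const std::shared_ptr<statement>& s) {
    if (s->batch()) {
        auto* b = static_cast<batch_statement*>(s.get());
        int n = 0;
        for (auto& m : b->inner) {
            n += count_auditable(m);
        }
        return n;
    }
    return 1;
}

int main() {
    auto b = std::make_shared<batch_statement>();
    b->inner.push_back(std::make_shared<statement>(false));
    b->inner.push_back(std::make_shared<statement>(false));
    assert(count_auditable(b) == 2);
    assert(count_auditable(std::make_shared<statement>(false)) == 1);
    return 0;
}
```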
@@ -75,11 +75,13 @@ class audit_info final {
     sstring _keyspace;
     sstring _table;
     sstring _query;
+    bool _batch;
 public:
-    audit_info(statement_category cat, sstring keyspace, sstring table)
+    audit_info(statement_category cat, sstring keyspace, sstring table, bool batch)
         : _category(cat)
         , _keyspace(std::move(keyspace))
         , _table(std::move(table))
+        , _batch(batch)
     { }
     void set_query_string(const std::string_view& query_string) {
         _query = sstring(query_string);
@@ -89,6 +91,7 @@ public:
     const sstring& query() const { return _query; }
     sstring category_string() const;
     statement_category category() const { return _category; }
+    bool batch() const { return _batch; }
 };

 using audit_info_ptr = std::unique_ptr<audit_info>;
@@ -126,8 +129,7 @@ public:
     }
     static future<> start_audit(const db::config& cfg, sharded<locator::shared_token_metadata>& stm, sharded<cql3::query_processor>& qp, sharded<service::migration_manager>& mm);
     static future<> stop_audit();
-    static audit_info_ptr create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table);
-    static audit_info_ptr create_no_audit_info();
+    static audit_info_ptr create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table, bool batch = false);
     audit(locator::shared_token_metadata& stm,
         cql3::query_processor& qp,
         service::migration_manager& mm,
@@ -52,13 +52,6 @@ static const class_registrator<
     ::service::migration_manager&,
     cache&> registration("org.apache.cassandra.auth.CassandraRoleManager");

-struct record final {
-    sstring name;
-    bool is_superuser;
-    bool can_login;
-    role_set member_of;
-};
-
 static db::consistency_level consistency_for_role(std::string_view role_name) noexcept {
     if (role_name == meta::DEFAULT_SUPERUSER_NAME) {
         return db::consistency_level::QUORUM;
@@ -67,13 +60,13 @@ static db::consistency_level consistency_for_role(std::string_view role_name) no
     return db::consistency_level::LOCAL_ONE;
 }

-static future<std::optional<record>> find_record(cql3::query_processor& qp, std::string_view role_name) {
+future<std::optional<standard_role_manager::record>> standard_role_manager::legacy_find_record(std::string_view role_name) {
     const sstring query = seastar::format("SELECT * FROM {}.{} WHERE {} = ?",
-            get_auth_ks_name(qp),
+            get_auth_ks_name(_qp),
             meta::roles_table::name,
             meta::roles_table::role_col_name);

-    const auto results = co_await qp.execute_internal(
+    const auto results = co_await _qp.execute_internal(
             query,
             consistency_for_role(role_name),
             internal_distributed_query_state(),
@@ -93,8 +86,25 @@ static future<std::optional<record>> find_record(cql3::query_processor& qp, std:
             : role_set())});
 }

-static future<record> require_record(cql3::query_processor& qp, std::string_view role_name) {
-    return find_record(qp, role_name).then([role_name](std::optional<record> mr) {
+future<std::optional<standard_role_manager::record>> standard_role_manager::find_record(std::string_view role_name) {
+    if (legacy_mode(_qp)) {
+        return legacy_find_record(role_name);
+    }
+    auto name = sstring(role_name);
+    auto role = _cache.get(name);
+    if (!role) {
+        return make_ready_future<std::optional<record>>(std::nullopt);
+    }
+    return make_ready_future<std::optional<record>>(std::make_optional(record{
+        .name = std::move(name),
+        .is_superuser = role->is_superuser,
+        .can_login = role->can_login,
+        .member_of = role->member_of
+    }));
+}
+
+future<standard_role_manager::record> standard_role_manager::require_record(std::string_view role_name) {
+    return find_record(role_name).then([role_name](std::optional<record> mr) {
         if (!mr) {
             throw nonexistant_role(role_name);
         }
@@ -386,7 +396,7 @@ standard_role_manager::alter(std::string_view role_name, const role_config_updat
         return fmt::to_string(fmt::join(assignments, ", "));
     };

-    return require_record(_qp, role_name).then([this, role_name, &u, &mc](record) {
+    return require_record(role_name).then([this, role_name, &u, &mc](record) {
         if (!u.is_superuser && !u.can_login) {
             return make_ready_future<>();
         }
@@ -620,18 +630,17 @@ standard_role_manager::revoke(std::string_view revokee_name, std::string_view ro
     });
 }

-static future<> collect_roles(
-        cql3::query_processor& qp,
+future<> standard_role_manager::collect_roles(
         std::string_view grantee_name,
         bool recurse,
         role_set& roles) {
-    return require_record(qp, grantee_name).then([&qp, &roles, recurse](record r) {
-        return do_with(std::move(r.member_of), [&qp, &roles, recurse](const role_set& memberships) {
-            return do_for_each(memberships.begin(), memberships.end(), [&qp, &roles, recurse](const sstring& role_name) {
+    return require_record(grantee_name).then([this, &roles, recurse](standard_role_manager::record r) {
+        return do_with(std::move(r.member_of), [this, &roles, recurse](const role_set& memberships) {
+            return do_for_each(memberships.begin(), memberships.end(), [this, &roles, recurse](const sstring& role_name) {
                 roles.insert(role_name);

                 if (recurse) {
-                    return collect_roles(qp, role_name, true, roles);
+                    return collect_roles(role_name, true, roles);
                 }

                 return make_ready_future<>();
@@ -646,7 +655,7 @@ future<role_set> standard_role_manager::query_granted(std::string_view grantee_n
     return do_with(
             role_set{sstring(grantee_name)},
             [this, grantee_name, recurse](role_set& roles) {
-                return collect_roles(_qp, grantee_name, recurse, roles).then([&roles] { return roles; });
+                return collect_roles(grantee_name, recurse, roles).then([&roles] { return roles; });
             });
 }

@@ -706,27 +715,21 @@ future<role_set> standard_role_manager::query_all(::service::query_state& qs) {
 }

 future<bool> standard_role_manager::exists(std::string_view role_name) {
-    return find_record(_qp, role_name).then([](std::optional<record> mr) {
+    return find_record(role_name).then([](std::optional<record> mr) {
         return static_cast<bool>(mr);
     });
 }

 future<bool> standard_role_manager::is_superuser(std::string_view role_name) {
-    return require_record(_qp, role_name).then([](record r) {
+    return require_record(role_name).then([](record r) {
         return r.is_superuser;
     });
 }

 future<bool> standard_role_manager::can_login(std::string_view role_name) {
-    if (legacy_mode(_qp)) {
-        const auto r = co_await require_record(_qp, role_name);
-        co_return r.can_login;
-    }
-    auto role = _cache.get(sstring(role_name));
-    if (!role) {
-        throw nonexistant_role(role_name);
-    }
-    co_return role->can_login;
+    return require_record(role_name).then([](record r) {
+        return r.can_login;
+    });
 }

 future<std::optional<sstring>> standard_role_manager::get_attribute(std::string_view role_name, std::string_view attribute_name, ::service::query_state& qs) {
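Reviewer note: the net effect is that `find_record` is now cache-first, with the CQL query kept only behind `legacy_mode`. A minimal synchronous sketch of that lookup shape (hypothetical types; the real methods return Seastar futures):

```cpp
#include <cassert>
#include <optional>
#include <string>
#include <unordered_map>

struct record {
    std::string name;
    bool is_superuser = false;
    bool can_login = false;
};

struct role_manager {
    bool legacy_mode = false;
    std::unordered_map<std::string, record> cache;

    // Stands in for the CQL roles-table query used in legacy mode.
    std::optional<record> legacy_find_record(const std::string& name) const {
        if (name == "cassandra") {
            return record{name, true, true};
        }
        return std::nullopt;
    }

    // New shape: serve from the in-memory cache unless we are in legacy mode.
    std::optional<record> find_record(const std::string& name) const {
        if (legacy_mode) {
            return legacy_find_record(name);
        }
        auto it = cache.find(name);
        if (it == cache.end()) {
            return std::nullopt;
        }
        return it->second;
    }
};

int main() {
    role_manager rm;
    rm.cache.emplace("alice", record{"alice", false, true});
    assert(rm.find_record("alice"));        // served from cache
    assert(!rm.find_record("cassandra"));   // not cached, no legacy fallback
    rm.legacy_mode = true;
    assert(rm.find_record("cassandra"));    // legacy path queries the table
    return 0;
}
```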
@@ -90,6 +90,12 @@ public:

 private:
     enum class membership_change { add, remove };
+    struct record final {
+        sstring name;
+        bool is_superuser;
+        bool can_login;
+        role_set member_of;
+    };

     future<> create_legacy_metadata_tables_if_missing() const;

@@ -107,6 +113,14 @@ private:
     future<> legacy_modify_membership(std::string_view role_name, std::string_view grantee_name, membership_change);

     future<> modify_membership(std::string_view role_name, std::string_view grantee_name, membership_change, ::service::group0_batch& mc);

+    future<std::optional<record>> legacy_find_record(std::string_view role_name);
+    future<std::optional<record>> find_record(std::string_view role_name);
+    future<record> require_record(std::string_view role_name);
+    future<> collect_roles(
+            std::string_view grantee_name,
+            bool recurse,
+            role_set& roles);
 };

 } // namespace auth
configure.py (58 lines changed)
@@ -730,6 +730,28 @@ vector_search_tests = set([
     'test/vector_search/rescoring_test'
 ])

+vector_search_validator_bin = 'vector-search-validator/bin/vector-search-validator'
+vector_search_validator_deps = set([
+    'test/vector_search_validator/build-validator',
+    'test/vector_search_validator/Cargo.toml',
+    'test/vector_search_validator/crates/validator/Cargo.toml',
+    'test/vector_search_validator/crates/validator/src/main.rs',
+    'test/vector_search_validator/crates/validator-scylla/Cargo.toml',
+    'test/vector_search_validator/crates/validator-scylla/src/lib.rs',
+    'test/vector_search_validator/crates/validator-scylla/src/cql.rs',
+])
+
+vector_store_bin = 'vector-search-validator/bin/vector-store'
+vector_store_deps = set([
+    'test/vector_search_validator/build-env',
+    'test/vector_search_validator/build-vector-store',
+])
+
+vector_search_validator_bins = set([
+    vector_search_validator_bin,
+    vector_store_bin,
+])
+
 wasms = set([
     'wasm/return_input.wat',
     'wasm/test_complex_null_values.wat',
@@ -763,7 +785,7 @@ other = set([
     'iotune',
 ])

-all_artifacts = apps | cpp_apps | tests | other | wasms
+all_artifacts = apps | cpp_apps | tests | other | wasms | vector_search_validator_bins

 arg_parser = argparse.ArgumentParser('Configure scylla', add_help=False, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 arg_parser.add_argument('--out', dest='buildfile', action='store', default='build.ninja',
@@ -795,6 +817,9 @@ arg_parser.add_argument('--c-compiler', action='store', dest='cc', default='clan
                         help='C compiler path')
 arg_parser.add_argument('--compiler-cache', action='store', dest='compiler_cache', default='auto',
                         help='Compiler cache to use: auto (default, prefers sccache), sccache, ccache, none, or a path to a binary')
+# Workaround for https://github.com/mozilla/sccache/issues/2575
+arg_parser.add_argument('--sccache-rust', action=argparse.BooleanOptionalAction, default=False,
+                        help='Use sccache for rust code (if sccache is selected as compiler cache). Doesn\'t work with distributed builds.')
 add_tristate(arg_parser, name='dpdk', dest='dpdk', default=False,
              help='Use dpdk (from seastar dpdk sources)')
 arg_parser.add_argument('--dpdk-target', action='store', dest='dpdk_target', default='',
@@ -925,8 +950,7 @@ scylla_core = (['message/messaging_service.cc',
                 'utils/crypt_sha512.cc',
                 'utils/logalloc.cc',
                 'utils/large_bitset.cc',
                 'utils/buffer_input_stream.cc',
-                'utils/limiting_data_source.cc',
+                'test/lib/limiting_data_source.cc',
                 'utils/updateable_value.cc',
                 'message/dictionary_service.cc',
                 'utils/directories.cc',
@@ -1535,6 +1559,7 @@ scylla_perfs = ['test/perf/perf_alternator.cc',
                 'test/perf/perf_fast_forward.cc',
                 'test/perf/perf_row_cache_update.cc',
                 'test/perf/perf_simple_query.cc',
+                'test/perf/perf_cql_raw.cc',
                 'test/perf/perf_sstable.cc',
                 'test/perf/perf_tablets.cc',
                 'test/perf/tablet_load_balancing.cc',
@@ -2383,7 +2408,7 @@ def write_build_file(f,
     # If compiler cache is available, prefix the compiler with it
     cxx_with_cache = f'{compiler_cache} {args.cxx}' if compiler_cache else args.cxx
     # For Rust, sccache is used via RUSTC_WRAPPER environment variable
-    rustc_wrapper = f'RUSTC_WRAPPER={compiler_cache} ' if compiler_cache and 'sccache' in compiler_cache else ''
+    rustc_wrapper = f'RUSTC_WRAPPER={compiler_cache} ' if compiler_cache and 'sccache' in compiler_cache and args.sccache_rust else ''
     f.write(textwrap.dedent('''\
         configure_args = {configure_args}
         builddir = {outdir}
@@ -2560,10 +2585,11 @@ def write_build_file(f,
           description = RUST_LIB $out
         ''').format(mode=mode, antlr3_exec=args.antlr3_exec, fmt_lib=fmt_lib, test_repeat=args.test_repeat, test_timeout=args.test_timeout, rustc_wrapper=rustc_wrapper, **modeval))
     f.write(
-        'build {mode}-build: phony {artifacts} {wasms}\n'.format(
+        'build {mode}-build: phony {artifacts} {wasms} {vector_search_validator_bins}\n'.format(
             mode=mode,
-            artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms)]),
+            artifacts=str.join(' ', ['$builddir/' + mode + '/' + x for x in sorted(build_artifacts - wasms - vector_search_validator_bins)]),
             wasms = str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & wasms)]),
+            vector_search_validator_bins=str.join(' ', ['$builddir/' + x for x in sorted(build_artifacts & vector_search_validator_bins)]),
         )
     )
     if profile_recipe := modes[mode].get('profile_recipe'):
@@ -2593,7 +2619,7 @@ def write_build_file(f,
             continue
         profile_dep = modes[mode].get('profile_target', "")

-        if binary in other or binary in wasms:
+        if binary in other or binary in wasms or binary in vector_search_validator_bins:
             continue
         srcs = deps[binary]
         # 'scylla'
@@ -2704,10 +2730,11 @@ def write_build_file(f,
     )

     f.write(
-        'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms}\n'.format(
+        'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/scylla {wasms} {vector_search_validator_bins} \n'.format(
             mode=mode,
             test_executables=' '.join(['$builddir/{}/{}'.format(mode, binary) for binary in sorted(tests)]),
             wasms=' '.join([f'$builddir/{binary}' for binary in sorted(wasms)]),
+            vector_search_validator_bins=' '.join([f'$builddir/{binary}' for binary in sorted(vector_search_validator_bins)]),
         )
     )
     f.write(
@@ -2875,6 +2902,19 @@ def write_build_file(f,
         'build compiler-training: phony {}\n'.format(' '.join(['{mode}-compiler-training'.format(mode=mode) for mode in default_modes]))
     )

+    f.write(textwrap.dedent(f'''\
+        rule build-vector-search-validator
+          command = test/vector_search_validator/build-validator $builddir
+        rule build-vector-store
+          command = test/vector_search_validator/build-vector-store $builddir
+        '''))
+    f.write(
+        'build $builddir/{vector_search_validator_bin}: build-vector-search-validator {}\n'.format(' '.join([dep for dep in sorted(vector_search_validator_deps)]), vector_search_validator_bin=vector_search_validator_bin)
+    )
+    f.write(
+        'build $builddir/{vector_store_bin}: build-vector-store {}\n'.format(' '.join([dep for dep in sorted(vector_store_deps)]), vector_store_bin=vector_store_bin)
+    )
+
     f.write(textwrap.dedent(f'''\
         build dist-unified-tar: phony {' '.join([f'$builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz' for mode in default_modes])}
         build dist-unified: phony dist-unified-tar
@@ -3112,7 +3152,7 @@ def configure_using_cmake(args):
         settings['CMAKE_CXX_COMPILER_LAUNCHER'] = compiler_cache
         settings['CMAKE_C_COMPILER_LAUNCHER'] = compiler_cache
         # For Rust, sccache is used via RUSTC_WRAPPER
-        if 'sccache' in compiler_cache:
+        if 'sccache' in compiler_cache and args.sccache_rust:
             settings['Scylla_RUSTC_WRAPPER'] = compiler_cache

     if args.date_stamp:
@@ -23,7 +23,6 @@
 #include "index/vector_index.hh"
 #include "schema/schema.hh"
 #include "service/client_state.hh"
-#include "service/paxos/paxos_state.hh"
 #include "types/types.hh"
 #include "cql3/query_processor.hh"
 #include "cql3/cql_statement.hh"
@@ -330,19 +329,6 @@ future<std::vector<description>> table(const data_dictionary::database& db, cons
                 "*/",
                 *table_desc.create_statement);

         table_desc.create_statement = std::move(os).to_managed_string();
-    } else if (service::paxos::paxos_store::try_get_base_table(name)) {
-        // Paxos state table is internally managed by Scylla and it shouldn't be exposed to the user.
-        // The table is allowed to be described as a comment to ease administrative work but it's hidden from all listings.
-        fragmented_ostringstream os{};
-
-        fmt::format_to(os.to_iter(),
-                "/* Do NOT execute this statement! It's only for informational purposes.\n"
-                " A paxos state table is created automatically when enabling LWT on a base table.\n"
-                "\n{}\n"
-                "*/",
-                *table_desc.create_statement);
-
-        table_desc.create_statement = std::move(os).to_managed_string();
     }
     result.push_back(std::move(table_desc));
@@ -378,7 +364,7 @@ future<std::vector<description>> table(const data_dictionary::database& db, cons
 future<std::vector<description>> tables(const data_dictionary::database& db, const lw_shared_ptr<keyspace_metadata>& ks, std::optional<bool> with_internals = std::nullopt) {
     auto& replica_db = db.real_database();
     auto tables = ks->tables() | std::views::filter([&replica_db] (const schema_ptr& s) {
-        return !cdc::is_log_for_some_table(replica_db, s->ks_name(), s->cf_name()) && !service::paxos::paxos_store::try_get_base_table(s->cf_name());
+        return !cdc::is_log_for_some_table(replica_db, s->ks_name(), s->cf_name());
     }) | std::ranges::to<std::vector<schema_ptr>>();
     std::ranges::sort(tables, std::ranges::less(), std::mem_fn(&schema::cf_name));
@@ -50,8 +50,8 @@ public:
protected:
virtual audit::statement_category category() const override;
virtual audit::audit_info_ptr audit_info() const override {
// We don't audit batch statements. Instead we audit statements that are inside the batch.
return audit::audit::create_no_audit_info();
constexpr bool batch = true;
return audit::audit::create_audit_info(category(), sstring(), sstring(), batch);
}
};

@@ -1986,13 +1986,13 @@ future<> db::commitlog::segment_manager::replenish_reserve() {
}
continue;
} catch (shutdown_marker&) {
_reserve_segments.abort(std::current_exception());
break;
} catch (...) {
clogger.warn("Exception in segment reservation: {}", std::current_exception());
}
co_await sleep(100ms);
}
_reserve_segments.abort(std::make_exception_ptr(shutdown_marker()));
}

future<std::vector<db::commitlog::descriptor>>

db/config.cc
@@ -621,25 +621,6 @@ db::config::config(std::shared_ptr<db::extensions> exts)
* @GroupDescription: Provides an overview of the group.
*/
/**
* @Group Ungrouped properties
*/
, background_writer_scheduling_quota(this, "background_writer_scheduling_quota", value_status::Deprecated, 1.0,
"max cpu usage ratio (between 0 and 1) for compaction process. Not intended for setting in normal operations. Setting it to 1 or higher will disable it, recommended operational setting is 0.5.")
, auto_adjust_flush_quota(this, "auto_adjust_flush_quota", value_status::Deprecated, false,
"true: auto-adjust memtable shares for flush processes")
, memtable_flush_static_shares(this, "memtable_flush_static_shares", liveness::LiveUpdate, value_status::Used, 0,
"If set to higher than 0, ignore the controller's output and set the memtable shares statically. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
, compaction_static_shares(this, "compaction_static_shares", liveness::LiveUpdate, value_status::Used, 0,
"If set to higher than 0, ignore the controller's output and set the compaction shares statically. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
, compaction_max_shares(this, "compaction_max_shares", liveness::LiveUpdate, value_status::Used, default_compaction_maximum_shares,
"Set the maximum shares of regular compaction to the specific value. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
, compaction_enforce_min_threshold(this, "compaction_enforce_min_threshold", liveness::LiveUpdate, value_status::Used, false,
"If set to true, enforce the min_threshold option for compactions strictly. If false (default), Scylla may decide to compact even if below min_threshold.")
, compaction_flush_all_tables_before_major_seconds(this, "compaction_flush_all_tables_before_major_seconds", value_status::Used, 86400,
"Set the minimum interval in seconds between flushing all tables before each major compaction (default is 86400)."
"This option is useful for maximizing tombstone garbage collection by releasing all active commitlog segments."
"Set to 0 to disable automatic flushing all tables before major compaction.")
/**
* @Group Initialization properties
* @GroupDescription The minimal properties needed for configuring a cluster.
*/
@@ -1394,6 +1375,10 @@ db::config::config(std::shared_ptr<db::extensions> exts)
"Start killing reads after their collective memory consumption goes above $normal_limit * $multiplier.")
, reader_concurrency_semaphore_cpu_concurrency(this, "reader_concurrency_semaphore_cpu_concurrency", liveness::LiveUpdate, value_status::Used, 2,
"Admit new reads while there are less than this number of requests that need CPU.")
, reader_concurrency_semaphore_preemptive_abort_factor(this, "reader_concurrency_semaphore_preemptive_abort_factor", liveness::LiveUpdate, value_status::Used, 0.3,
"Admit new reads while their remaining time is more than this factor times their timeout when they arrived at the semaphore. Its value means\n"
"* <= 0.0 means new reads will never get rejected during admission\n"
"* >= 1.0 means new reads will always get rejected during admission\n")
, view_update_reader_concurrency_semaphore_serialize_limit_multiplier(this, "view_update_reader_concurrency_semaphore_serialize_limit_multiplier", liveness::LiveUpdate, value_status::Used, 2,
"Start serializing view update reads after their collective memory consumption goes above $normal_limit * $multiplier.")
, view_update_reader_concurrency_semaphore_kill_limit_multiplier(this, "view_update_reader_concurrency_semaphore_kill_limit_multiplier", liveness::LiveUpdate, value_status::Used, 4,
@@ -1602,6 +1587,25 @@ db::config::config(std::shared_ptr<db::extensions> exts)
"Sets the maximum difference in percentages between the most loaded and least loaded nodes, below which the load balancer considers nodes balanced.")
, minimal_tablet_size_for_balancing(this, "minimal_tablet_size_for_balancing", liveness::LiveUpdate, value_status::Used, service::default_target_tablet_size / 100,
"Sets the minimal tablet size for the load balancer. For any tablet smaller than this, the balancer will use this size instead of the actual tablet size.")
/**
* @Group Ungrouped properties
*/
, background_writer_scheduling_quota(this, "background_writer_scheduling_quota", value_status::Deprecated, 1.0,
"max cpu usage ratio (between 0 and 1) for compaction process. Not intended for setting in normal operations. Setting it to 1 or higher will disable it, recommended operational setting is 0.5.")
, auto_adjust_flush_quota(this, "auto_adjust_flush_quota", value_status::Deprecated, false,
"true: auto-adjust memtable shares for flush processes")
, memtable_flush_static_shares(this, "memtable_flush_static_shares", liveness::LiveUpdate, value_status::Used, 0,
"If set to higher than 0, ignore the controller's output and set the memtable shares statically. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
, compaction_static_shares(this, "compaction_static_shares", liveness::LiveUpdate, value_status::Used, 0,
"If set to higher than 0, ignore the controller's output and set the compaction shares statically. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
, compaction_max_shares(this, "compaction_max_shares", liveness::LiveUpdate, value_status::Used, default_compaction_maximum_shares,
"Set the maximum shares of regular compaction to the specific value. Do not set this unless you know what you are doing and suspect a problem in the controller. This option will be retired when the controller reaches more maturity.")
, compaction_enforce_min_threshold(this, "compaction_enforce_min_threshold", liveness::LiveUpdate, value_status::Used, false,
"If set to true, enforce the min_threshold option for compactions strictly. If false (default), Scylla may decide to compact even if below min_threshold.")
, compaction_flush_all_tables_before_major_seconds(this, "compaction_flush_all_tables_before_major_seconds", value_status::Used, 86400,
"Set the minimum interval in seconds between flushing all tables before each major compaction (default is 86400)."
"This option is useful for maximizing tombstone garbage collection by releasing all active commitlog segments."
"Set to 0 to disable automatic flushing all tables before major compaction.")
, default_log_level(this, "default_log_level", value_status::Used, seastar::log_level::info, "Default log level for log messages")
, logger_log_level(this, "logger_log_level", value_status::Used, {}, "Map of logger name to log level. Valid log levels are 'error', 'warn', 'info', 'debug' and 'trace'")
, log_to_stdout(this, "log_to_stdout", value_status::Used, true, "Send log output to stdout")

db/config.hh
@@ -185,13 +185,6 @@ public:
* All values and documentation taken from
* http://docs.datastax.com/en/cassandra/2.1/cassandra/configuration/configCassandra_yaml_r.html
*/
named_value<double> background_writer_scheduling_quota;
named_value<bool> auto_adjust_flush_quota;
named_value<float> memtable_flush_static_shares;
named_value<float> compaction_static_shares;
named_value<float> compaction_max_shares;
named_value<bool> compaction_enforce_min_threshold;
named_value<uint32_t> compaction_flush_all_tables_before_major_seconds;
named_value<sstring> cluster_name;
named_value<sstring> listen_address;
named_value<sstring> listen_interface;
@@ -446,6 +439,7 @@ public:
named_value<uint32_t> reader_concurrency_semaphore_serialize_limit_multiplier;
named_value<uint32_t> reader_concurrency_semaphore_kill_limit_multiplier;
named_value<uint32_t> reader_concurrency_semaphore_cpu_concurrency;
named_value<float> reader_concurrency_semaphore_preemptive_abort_factor;
named_value<uint32_t> view_update_reader_concurrency_semaphore_serialize_limit_multiplier;
named_value<uint32_t> view_update_reader_concurrency_semaphore_kill_limit_multiplier;
named_value<uint32_t> view_update_reader_concurrency_semaphore_cpu_concurrency;
@@ -612,6 +606,14 @@ public:
named_value<float> size_based_balance_threshold_percentage;
named_value<uint64_t> minimal_tablet_size_for_balancing;

named_value<double> background_writer_scheduling_quota;
named_value<bool> auto_adjust_flush_quota;
named_value<float> memtable_flush_static_shares;
named_value<float> compaction_static_shares;
named_value<float> compaction_max_shares;
named_value<bool> compaction_enforce_min_threshold;
named_value<uint32_t> compaction_flush_all_tables_before_major_seconds;

static const sstring default_tls_priority;
private:
template<typename T>

@@ -158,7 +158,7 @@ void hint_endpoint_manager::cancel_draining() noexcept {
_sender.cancel_draining();
}

hint_endpoint_manager::hint_endpoint_manager(const endpoint_id& key, fs::path hint_directory, manager& shard_manager)
hint_endpoint_manager::hint_endpoint_manager(const endpoint_id& key, fs::path hint_directory, manager& shard_manager, scheduling_group send_sg)
: _key(key)
, _shard_manager(shard_manager)
, _store_gate("hint_endpoint_manager")
@@ -169,7 +169,7 @@ hint_endpoint_manager::hint_endpoint_manager(const endpoint_id& key, fs::path hi
// Approximate the position of the last written hint by using the same formula as for segment id calculation in commitlog
// TODO: Should this logic be deduplicated with what is in the commitlog?
, _last_written_rp(this_shard_id(), std::chrono::duration_cast<std::chrono::milliseconds>(runtime::get_boot_time().time_since_epoch()).count())
, _sender(*this, _shard_manager.local_storage_proxy(), _shard_manager.local_db(), _shard_manager.local_gossiper())
, _sender(*this, _shard_manager.local_storage_proxy(), _shard_manager.local_db(), _shard_manager.local_gossiper(), send_sg)
{}

hint_endpoint_manager::hint_endpoint_manager(hint_endpoint_manager&& other)

@@ -63,7 +63,7 @@ private:
hint_sender _sender;

public:
hint_endpoint_manager(const endpoint_id& key, std::filesystem::path hint_directory, manager& shard_manager);
hint_endpoint_manager(const endpoint_id& key, std::filesystem::path hint_directory, manager& shard_manager, scheduling_group send_sg);
hint_endpoint_manager(hint_endpoint_manager&&);
~hint_endpoint_manager();

@@ -122,7 +122,7 @@ const column_mapping& hint_sender::get_column_mapping(lw_shared_ptr<send_one_fil
return cm_it->second;
}

hint_sender::hint_sender(hint_endpoint_manager& parent, service::storage_proxy& local_storage_proxy,replica::database& local_db, const gms::gossiper& local_gossiper) noexcept
hint_sender::hint_sender(hint_endpoint_manager& parent, service::storage_proxy& local_storage_proxy,replica::database& local_db, const gms::gossiper& local_gossiper, scheduling_group sg) noexcept
: _stopped(make_ready_future<>())
, _ep_key(parent.end_point_key())
, _ep_manager(parent)
@@ -130,7 +130,7 @@ hint_sender::hint_sender(hint_endpoint_manager& parent, service::storage_proxy&
, _resource_manager(_shard_manager._resource_manager)
, _proxy(local_storage_proxy)
, _db(local_db)
, _hints_cpu_sched_group(_db.get_streaming_scheduling_group())
, _hints_cpu_sched_group(sg)
, _gossiper(local_gossiper)
, _file_update_mutex(_ep_manager.file_update_mutex())
{}

@@ -120,7 +120,7 @@ private:
std::multimap<db::replay_position, lw_shared_ptr<std::optional<promise<>>>> _replay_waiters;

public:
hint_sender(hint_endpoint_manager& parent, service::storage_proxy& local_storage_proxy, replica::database& local_db, const gms::gossiper& local_gossiper) noexcept;
hint_sender(hint_endpoint_manager& parent, service::storage_proxy& local_storage_proxy, replica::database& local_db, const gms::gossiper& local_gossiper, scheduling_group sg) noexcept;
~hint_sender();

/// \brief A constructor that should be called from the copy/move-constructor of hint_endpoint_manager.

@@ -142,7 +142,7 @@ future<> directory_initializer::ensure_rebalanced() {
}

manager::manager(service::storage_proxy& proxy, sstring hints_directory, host_filter filter, int64_t max_hint_window_ms,
resource_manager& res_manager, sharded<replica::database>& db)
resource_manager& res_manager, sharded<replica::database>& db, scheduling_group sg)
: _hints_dir(fs::path(hints_directory) / fmt::to_string(this_shard_id()))
, _host_filter(std::move(filter))
, _proxy(proxy)
@@ -150,6 +150,7 @@ manager::manager(service::storage_proxy& proxy, sstring hints_directory, host_fi
, _local_db(db.local())
, _draining_eps_gate(seastar::format("hints::manager::{}", _hints_dir.native()))
, _resource_manager(res_manager)
, _hints_sending_sched_group(sg)
{
if (utils::get_local_injector().enter("decrease_hints_flush_period")) {
hints_flush_period = std::chrono::seconds{1};
@@ -415,7 +416,7 @@ hint_endpoint_manager& manager::get_ep_manager(const endpoint_id& host_id, const

try {
std::filesystem::path hint_directory = hints_dir() / (_uses_host_id ? fmt::to_string(host_id) : fmt::to_string(ip));
auto [it, _] = _ep_managers.emplace(host_id, hint_endpoint_manager{host_id, std::move(hint_directory), *this});
auto [it, _] = _ep_managers.emplace(host_id, hint_endpoint_manager{host_id, std::move(hint_directory), *this, _hints_sending_sched_group});
hint_endpoint_manager& ep_man = it->second;

manager_logger.trace("Created an endpoint manager for {}", host_id);

@@ -133,6 +133,7 @@ private:

hint_stats _stats;
seastar::metrics::metric_groups _metrics;
scheduling_group _hints_sending_sched_group;

// We need to keep a variant here. Before migrating hinted handoff to using host ID, hint directories will
// still represent IP addresses. But after the migration, they will start representing host IDs.
@@ -155,7 +156,7 @@ private:

public:
manager(service::storage_proxy& proxy, sstring hints_directory, host_filter filter,
int64_t max_hint_window_ms, resource_manager& res_manager, sharded<replica::database>& db);
int64_t max_hint_window_ms, resource_manager& res_manager, sharded<replica::database>& db, scheduling_group sg);

manager(const manager&) = delete;
manager& operator=(const manager&) = delete;

@@ -24,7 +24,7 @@
#include "readers/forwardable.hh"
#include "readers/nonforwardable.hh"
#include "cache_mutation_reader.hh"
#include "partition_snapshot_reader.hh"
#include "replica/partition_snapshot_reader.hh"
#include "keys/clustering_key_filter.hh"
#include "utils/assert.hh"
#include "utils/updateable_value.hh"
@@ -845,7 +845,7 @@ mutation_reader row_cache::make_nonpopulating_reader(schema_ptr schema, reader_p
cache_entry& e = *i;
upgrade_entry(e);
tracing::trace(ts, "Reading partition {} from cache", pos);
return make_partition_snapshot_flat_reader<false, dummy_accounter>(
return replica::make_partition_snapshot_reader<false, dummy_accounter>(
schema,
std::move(permit),
e.key(),

@@ -215,8 +215,6 @@ public:
static constexpr auto BUILT_VIEWS = "built_views";
static constexpr auto SCYLLA_VIEWS_BUILDS_IN_PROGRESS = "scylla_views_builds_in_progress";
static constexpr auto CDC_LOCAL = "cdc_local";
static constexpr auto CDC_TIMESTAMPS = "cdc_timestamps";
static constexpr auto CDC_STREAMS = "cdc_streams";

// auth
static constexpr auto ROLES = "roles";

db/view/view.cc
@@ -23,6 +23,7 @@

#include <seastar/core/future-util.hh>
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/all.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include <flat_map>

@@ -65,6 +66,7 @@
#include "mutation/timestamp.hh"
#include "utils/assert.hh"
#include "utils/small_vector.hh"
#include "view_builder.hh"
#include "view_info.hh"
#include "view_update_checks.hh"
#include "types/list.hh"
@@ -2238,12 +2240,20 @@ void view_builder::setup_metrics() {
}

future<> view_builder::start_in_background(service::migration_manager& mm, utils::cross_shard_barrier barrier) {
auto step_fiber = make_ready_future<>();
try {
view_builder_init_state vbi;
auto fail = defer([&barrier] mutable { barrier.abort(); });
// Guard the whole startup routine with a semaphore,
// so that it's not intercepted by `on_drop_view`, `on_create_view`
// or `on_update_view` events.
// Semaphore usage invariants:
// - One unit of _sem serializes all per-shard bookkeeping that mutates view-builder state
//   (_base_to_build_step, _built_views, build_status, reader resets).
// - The unit is held for the whole operation, including the async chain, until the state
//   is stable for the next operation on that shard.
// - Cross-shard operations acquire _sem on shard 0 for the duration of the broadcast.
//   Other shards acquire their own _sem only around their local handling; shard 0 skips
//   the local acquire because it already holds the unit from the dispatcher.
// Guard the whole startup routine with a semaphore so that it's not intercepted by
// `on_drop_view`, `on_create_view`, or `on_update_view` events.
auto units = co_await get_units(_sem, view_builder_semaphore_units);
// Wait for schema agreement even if we're a seed node.
co_await mm.wait_for_schema_agreement(_db, db::timeout_clock::time_point::max(), &_as);
@@ -2264,8 +2274,10 @@ future<> view_builder::start_in_background(service::migration_manager& mm, utils
_mnotifier.register_listener(this);
co_await calculate_shard_build_step(vbi);
_current_step = _base_to_build_step.begin();
// Waited on indirectly in stop().
(void)_build_step.trigger();

// If preparation above fails, run_in_background() is not invoked;
// start_in_background() just emits a warning into the logs and resolves.
step_fiber = run_in_background();
} catch (...) {
auto ex = std::current_exception();
auto ll = log_level::error;
@@ -2280,10 +2292,12 @@ future<> view_builder::start_in_background(service::migration_manager& mm, utils
}
vlogger.log(ll, "start aborted: {}", ex);
}

co_await std::move(step_fiber);
}

future<> view_builder::start(service::migration_manager& mm, utils::cross_shard_barrier barrier) {
_started = start_in_background(mm, std::move(barrier));
_step_fiber = start_in_background(mm, std::move(barrier));
return make_ready_future<>();
}

@@ -2293,12 +2307,12 @@ future<> view_builder::drain() {
}
vlogger.info("Draining view builder");
_as.request_abort();
co_await std::move(_started);
co_await _mnotifier.unregister_listener(this);
co_await _vug.drain();
co_await _sem.wait();
_sem.broken();
co_await _build_step.join();
_build_step.broken();
co_await std::move(_step_fiber);
co_await coroutine::parallel_for_each(_base_to_build_step, [] (std::pair<const table_id, build_step>& p) {
return p.second.reader.close();
});
@@ -2667,63 +2681,59 @@ static bool should_ignore_tablet_keyspace(const replica::database& db, const sst
return db.features().view_building_coordinator && db.has_keyspace(ks_name) && db.find_keyspace(ks_name).uses_tablets();
}

future<> view_builder::dispatch_create_view(sstring ks_name, sstring view_name) {
if (should_ignore_tablet_keyspace(_db, ks_name)) {
return make_ready_future<>();
}
return with_semaphore(_sem, view_builder_semaphore_units, [this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
// This runs on shard 0 only; seed the global rows before broadcasting.
return handle_seed_view_build_progress(ks_name, view_name).then([this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
return container().invoke_on_all([ks_name = std::move(ks_name), view_name = std::move(view_name)] (view_builder& vb) mutable {
return vb.handle_create_view_local(std::move(ks_name), std::move(view_name));
});
});
});
future<view_builder::view_builder_units> view_builder::get_or_adopt_view_builder_lock(view_builder_units_opt units) {
co_return units ? std::move(*units) : co_await get_units(_sem, view_builder_semaphore_units);
}

future<> view_builder::handle_seed_view_build_progress(sstring ks_name, sstring view_name) {
future<> view_builder::dispatch_create_view(sstring ks_name, sstring view_name) {
if (should_ignore_tablet_keyspace(_db, ks_name)) {
co_return;
}

auto units = co_await get_or_adopt_view_builder_lock(std::nullopt);
co_await handle_seed_view_build_progress(ks_name, view_name);

co_await coroutine::all(
[this, ks_name, view_name, units = std::move(units)] mutable -> future<> {
co_await handle_create_view_local(ks_name, view_name, std::move(units)); },
[this, ks_name, view_name] mutable -> future<> {
co_await container().invoke_on_others([ks_name = std::move(ks_name), view_name = std::move(view_name)] (view_builder& vb) mutable -> future<> {
return vb.handle_create_view_local(ks_name, view_name, std::nullopt); }); });
}

future<> view_builder::handle_seed_view_build_progress(const sstring& ks_name, const sstring& view_name) {
auto view = view_ptr(_db.find_schema(ks_name, view_name));
auto& step = get_or_create_build_step(view->view_info()->base_id());
return _sys_ks.register_view_for_building_for_all_shards(view->ks_name(), view->cf_name(), step.current_token());
}

future<> view_builder::handle_create_view_local(sstring ks_name, sstring view_name){
if (this_shard_id() == 0) {
return handle_create_view_local_impl(std::move(ks_name), std::move(view_name));
} else {
return with_semaphore(_sem, view_builder_semaphore_units, [this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
return handle_create_view_local_impl(std::move(ks_name), std::move(view_name));
});
}
}

future<> view_builder::handle_create_view_local_impl(sstring ks_name, sstring view_name) {
future<> view_builder::handle_create_view_local(const sstring& ks_name, const sstring& view_name, view_builder_units_opt units) {
[[maybe_unused]] auto sem_units = co_await get_or_adopt_view_builder_lock(std::move(units));
auto view = view_ptr(_db.find_schema(ks_name, view_name));
auto& step = get_or_create_build_step(view->view_info()->base_id());
return when_all(step.base->await_pending_writes(), step.base->await_pending_streams()).discard_result().then([this, &step] {
return flush_base(step.base, _as);
}).then([this, view, &step] () {
try {
co_await coroutine::all(
[&step] -> future<> {
co_await step.base->await_pending_writes(); },
[&step] -> future<> {
co_await step.base->await_pending_streams(); });
co_await flush_base(step.base, _as);

// This resets the build step to the current token. It may result in views currently
// being built to receive duplicate updates, but it simplifies things as we don't have
// to keep around a list of new views to build the next time the reader crosses a token
// threshold.
return initialize_reader_at_current_token(step).then([this, view, &step] () mutable {
return add_new_view(view, step);
}).then_wrapped([this, view] (future<>&& f) {
try {
f.get();
} catch (abort_requested_exception&) {
vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
} catch (raft::request_aborted&) {
vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
} catch (...) {
vlogger.error("Error setting up view for building {}.{}: {}", view->ks_name(), view->cf_name(), std::current_exception());
}
co_await initialize_reader_at_current_token(step);
co_await add_new_view(view, step);
} catch (abort_requested_exception&) {
vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
} catch (raft::request_aborted&) {
vlogger.debug("Aborted while setting up view for building {}.{}", view->ks_name(), view->cf_name());
} catch (...) {
vlogger.error("Error setting up view for building {}.{}: {}", view->ks_name(), view->cf_name(), std::current_exception());
}

// Waited on indirectly in stop().
static_cast<void>(_build_step.trigger());
});
});
_build_step.signal();
}

void view_builder::on_create_view(const sstring& ks_name, const sstring& view_name) {
@@ -2760,62 +2770,55 @@ void view_builder::on_update_view(const sstring& ks_name, const sstring& view_na

future<> view_builder::dispatch_drop_view(sstring ks_name, sstring view_name) {
if (should_ignore_tablet_keyspace(_db, ks_name)) {
return make_ready_future<>();
co_return;
}

return with_semaphore(_sem, view_builder_semaphore_units, [this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
// This runs on shard 0 only; broadcast local cleanup before global cleanup.
return container().invoke_on_all([ks_name, view_name] (view_builder& vb) mutable {
return vb.handle_drop_view_local(std::move(ks_name), std::move(view_name));
}).then([this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
return handle_drop_view_global_cleanup(std::move(ks_name), std::move(view_name));
});
});
auto units = co_await get_or_adopt_view_builder_lock(std::nullopt);

co_await coroutine::all(
[this, ks_name, view_name, units = std::move(units)] mutable -> future<> {
co_await handle_drop_view_local(ks_name, view_name, std::move(units)); },
[this, ks_name, view_name] mutable -> future<> {
co_await container().invoke_on_others([ks_name = std::move(ks_name), view_name = std::move(view_name)] (view_builder& vb) mutable -> future<> {
return vb.handle_drop_view_local(ks_name, view_name, std::nullopt); });});
co_await handle_drop_view_global_cleanup(ks_name, view_name);
}

future<> view_builder::handle_drop_view_local(sstring ks_name, sstring view_name) {
if (this_shard_id() == 0) {
return handle_drop_view_local_impl(std::move(ks_name), std::move(view_name));
} else {
return with_semaphore(_sem, view_builder_semaphore_units, [this, ks_name = std::move(ks_name), view_name = std::move(view_name)] () mutable {
return handle_drop_view_local_impl(std::move(ks_name), std::move(view_name));
});
}
}

future<> view_builder::handle_drop_view_local_impl(sstring ks_name, sstring view_name) {
future<> view_builder::handle_drop_view_local(const sstring& ks_name, const sstring& view_name, view_builder_units_opt units) {
[[maybe_unused]] auto sem_units = co_await get_or_adopt_view_builder_lock(std::move(units));
vlogger.info0("Stopping to build view {}.{}", ks_name, view_name);
// The view is absent from the database at this point, so find it by brute force.
([&, this] {
for (auto& [_, step] : _base_to_build_step) {
if (step.build_status.empty() || step.build_status.front().view->ks_name() != ks_name) {
continue;
}
for (auto it = step.build_status.begin(); it != step.build_status.end(); ++it) {
if (it->view->cf_name() == view_name) {
_built_views.erase(it->view->id());
step.build_status.erase(it);
return;
}

for (auto& [_, step] : _base_to_build_step) {
if (step.build_status.empty() || step.build_status.front().view->ks_name() != ks_name) {
continue;
}
for (auto it = step.build_status.begin(); it != step.build_status.end(); ++it) {
if (it->view->cf_name() == view_name) {
_built_views.erase(it->view->id());
step.build_status.erase(it);
co_return;
}
}
})();
return make_ready_future<>();
}
}

future<> view_builder::handle_drop_view_global_cleanup(sstring ks_name, sstring view_name) {
future<> view_builder::handle_drop_view_global_cleanup(const sstring& ks_name, const sstring& view_name) {
if (this_shard_id() != 0) {
return make_ready_future<>();
co_return;
}
vlogger.info0("Starting view global cleanup {}.{}", ks_name, view_name);
return when_all_succeed(
_sys_ks.remove_view_build_progress_across_all_shards(ks_name, view_name),
_sys_ks.remove_built_view(ks_name, view_name),
remove_view_build_status(ks_name, view_name))
.discard_result()
.handle_exception([ks_name, view_name] (std::exception_ptr ep) {
vlogger.warn("Failed to cleanup view {}.{}: {}", ks_name, view_name, ep);
});

try {
co_await coroutine::all(
[this, &ks_name, &view_name] -> future<> {
co_await _sys_ks.remove_view_build_progress_across_all_shards(ks_name, view_name); },
[this, &ks_name, &view_name] -> future<> {
co_await _sys_ks.remove_built_view(ks_name, view_name); },
[this, &ks_name, &view_name] -> future<> {
co_await remove_view_build_status(ks_name, view_name); });
} catch (...) {
vlogger.warn("Failed to cleanup view {}.{}: {}", ks_name, view_name, std::current_exception());
}
}

void view_builder::on_drop_view(const sstring& ks_name, const sstring& view_name) {
@@ -2829,14 +2832,15 @@ void view_builder::on_drop_view(const sstring& ks_name, const sstring& view_name
}));
}

future<> view_builder::do_build_step() {
// Run the view building in the streaming scheduling group
// so that it doesn't impact other tasks with higher priority.
seastar::thread_attributes attr;
attr.sched_group = _db.get_streaming_scheduling_group();
return seastar::async(std::move(attr), [this] {
future<> view_builder::run_in_background() {
return seastar::async([this] {
exponential_backoff_retry r(1s, 1min);
while (!_base_to_build_step.empty() && !_as.abort_requested()) {
while (!_as.abort_requested()) {
try {
_build_step.wait([this] { return !_base_to_build_step.empty(); }).get();
} catch (const seastar::broken_condition_variable&) {
return;
}
auto units = get_units(_sem, view_builder_semaphore_units).get();
++_stats.steps_performed;
try {

@@ -11,13 +11,13 @@
#include "query/query-request.hh"
#include "service/migration_listener.hh"
#include "service/raft/raft_group0_client.hh"
#include "utils/serialized_action.hh"
#include "utils/cross-shard-barrier.hh"
#include "replica/database.hh"

#include <seastar/core/abort_source.hh>
#include <seastar/core/future.hh>
#include <seastar/core/semaphore.hh>
#include <seastar/core/condition-variable.hh>
#include <seastar/core/sharded.hh>
#include <seastar/core/shared_future.hh>
#include <seastar/core/shared_ptr.hh>
@@ -104,6 +104,12 @@ class view_update_generator;
* redo the missing step, for simplicity.
*/
class view_builder final : public service::migration_listener::only_view_notifications, public seastar::peering_sharded_service<view_builder> {
// aliasing for semaphore units that will be used throughout the class
using view_builder_units = semaphore_units<named_semaphore_exception_factory>;

// aliasing for optional semaphore units that will be used throughout the class
using view_builder_units_opt = std::optional<view_builder_units>;

/**
* Keeps track of the build progress for a particular view.
* When the view is built, next_token == first_token.
@@ -168,14 +174,24 @@ class view_builder final : public service::migration_listener::only_view_notific
reader_permit _permit;
base_to_build_step_type _base_to_build_step;
base_to_build_step_type::iterator _current_step = _base_to_build_step.end();
serialized_action _build_step{std::bind(&view_builder::do_build_step, this)};
condition_variable _build_step;
static constexpr size_t view_builder_semaphore_units = 1;
// Ensures bookkeeping operations are serialized, meaning that while we execute
// a build step we don't consider newly added or removed views. This simplifies
// the algorithms. Also synchronizes an operation wrt. a call to stop().
// Semaphore usage invariants:
// - One unit of _sem serializes all per-shard bookkeeping that mutates view-builder state
//   (_base_to_build_step, _built_views, build_status, reader resets).
// - The unit is held for the whole operation, including the async chain, until the state
//   is stable for the next operation on that shard.
// - Cross-shard operations acquire _sem on shard 0 for the duration of the broadcast.
//   Other shards acquire their own _sem only around their local handling; shard 0 skips
//   the local acquire because it already holds the unit from the dispatcher.
// Guard the whole startup routine with a semaphore so that it's not intercepted by
// `on_drop_view`, `on_create_view`, or `on_update_view` events.
seastar::named_semaphore _sem{view_builder_semaphore_units, named_semaphore_exception_factory{"view builder"}};
seastar::abort_source _as;
future<> _started = make_ready_future<>();
future<> _step_fiber = make_ready_future<>();
// Used to coordinate between shards the conclusion of the build process for a particular view.
std::unordered_set<table_id> _built_views;
// Used for testing.
@@ -262,19 +278,18 @@ private:
void setup_shard_build_step(view_builder_init_state& vbi, std::vector<system_keyspace_view_name>, std::vector<system_keyspace_view_build_progress>);
future<> calculate_shard_build_step(view_builder_init_state& vbi);
future<> add_new_view(view_ptr, build_step&);
future<> do_build_step();
future<> run_in_background();
void execute(build_step&, exponential_backoff_retry);
future<> maybe_mark_view_as_built(view_ptr, dht::token);
future<> mark_as_built(view_ptr);
void setup_metrics();
future<> dispatch_create_view(sstring ks_name, sstring view_name);
future<> dispatch_drop_view(sstring ks_name, sstring view_name);
future<> handle_seed_view_build_progress(sstring ks_name, sstring view_name);
future<> handle_create_view_local(sstring ks_name, sstring view_name);
future<> handle_drop_view_local(sstring ks_name, sstring view_name);
future<> handle_create_view_local_impl(sstring ks_name, sstring view_name);
future<> handle_drop_view_local_impl(sstring ks_name, sstring view_name);
future<> handle_drop_view_global_cleanup(sstring ks_name, sstring view_name);
future<> handle_seed_view_build_progress(const sstring& ks_name, const sstring& view_name);
future<> handle_create_view_local(const sstring& ks_name, const sstring& view_name, view_builder_units_opt units);
future<> handle_drop_view_local(const sstring& ks_name, const sstring& view_name, view_builder_units_opt units);
future<> handle_drop_view_global_cleanup(const sstring& ks_name, const sstring& view_name);
future<view_builder_units> get_or_adopt_view_builder_lock(view_builder_units_opt units);

template <typename Func1, typename Func2>
future<> write_view_build_status(Func1&& fn_group0, Func2&& fn_sys_dist) {

@@ -242,7 +242,7 @@ future<> view_building_worker::create_staging_sstable_tasks() {
utils::UUID_gen::get_time_UUID(), view_building_task::task_type::process_staging, false,
table_id, ::table_id{}, {my_host_id, sst_info.shard}, sst_info.last_token
};
auto mut = co_await _group0.client().sys_ks().make_view_building_task_mutation(guard.write_timestamp(), task);
auto mut = co_await _sys_ks.make_view_building_task_mutation(guard.write_timestamp(), task);
cmuts.emplace_back(std::move(mut));
}
}
@@ -386,7 +386,6 @@ future<> view_building_worker::update_built_views() {
auto schema = _db.find_schema(table_id);
return std::make_pair(schema->ks_name(), schema->cf_name());
};
auto& sys_ks = _group0.client().sys_ks();

std::set<std::pair<sstring, sstring>> built_views;
for (auto& [id, statuses]: _vb_state_machine.views_state.status_map) {
@@ -395,22 +394,22 @@ future<> view_building_worker::update_built_views() {
}
}

auto local_built = co_await sys_ks.load_built_views() | std::views::filter([&] (auto& v) {
auto local_built = co_await _sys_ks.load_built_views() | std::views::filter([&] (auto& v) {
return !_db.has_keyspace(v.first) || _db.find_keyspace(v.first).uses_tablets();
}) | std::ranges::to<std::set>();

// Remove dead entries
for (auto& view: local_built) {
if (!built_views.contains(view)) {
co_await sys_ks.remove_built_view(view.first, view.second);
co_await _sys_ks.remove_built_view(view.first, view.second);
}
}

// Add new entries
for (auto& view: built_views) {
if (!local_built.contains(view)) {
co_await sys_ks.mark_view_as_built(view.first, view.second);
co_await sys_ks.remove_view_build_progress_across_all_shards(view.first, view.second);
co_await _sys_ks.mark_view_as_built(view.first, view.second);
co_await _sys_ks.remove_view_build_progress_across_all_shards(view.first, view.second);
}
}
}

@@ -1345,8 +1345,8 @@ public:

private:
static schema_ptr build_schema() {
auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::CDC_TIMESTAMPS);
return schema_builder(system_keyspace::NAME, system_keyspace::CDC_TIMESTAMPS, std::make_optional(id))
auto id = generate_legacy_id(system_keyspace::NAME, "cdc_timestamps");
return schema_builder(system_keyspace::NAME, "cdc_timestamps", std::make_optional(id))
.with_column("keyspace_name", utf8_type, column_kind::partition_key)
.with_column("table_name", utf8_type, column_kind::partition_key)
.with_column("timestamp", reversed_type_impl::get_instance(timestamp_type), column_kind::clustering_key)
@@ -1428,8 +1428,8 @@ public:
}
private:
static schema_ptr build_schema() {
auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::CDC_STREAMS);
return schema_builder(system_keyspace::NAME, system_keyspace::CDC_STREAMS, std::make_optional(id))
auto id = generate_legacy_id(system_keyspace::NAME, "cdc_streams");
return schema_builder(system_keyspace::NAME, "cdc_streams", std::make_optional(id))
.with_column("keyspace_name", utf8_type, column_kind::partition_key)
.with_column("table_name", utf8_type, column_kind::partition_key)
.with_column("timestamp", timestamp_type, column_kind::clustering_key)

debug.cc
@@ -11,5 +11,6 @@
namespace debug {

seastar::sharded<replica::database>* volatile the_database = nullptr;
seastar::scheduling_group streaming_scheduling_group;

}

debug.hh
@@ -17,7 +17,7 @@ class database;
namespace debug {

extern seastar::sharded<replica::database>* volatile the_database;

extern seastar::scheduling_group streaming_scheduling_group;

}

@@ -1026,7 +1026,29 @@ You can enable the after-repair tombstone GC by setting the ``repair`` mode usin

   ALTER TABLE ks.cf WITH tombstone_gc = {'mode':'repair'};

The following modes are available:
To support writes arriving out-of-order -- either due to natural delays, or user-provided timestamps -- the repair mode has a propagation delay.
Out-of-order writes present a problem for repair-mode tombstone GC. Consider the following example sequence of events:

1) Write ``DELETE FROM table WHERE key = K1`` arrives at the node.
2) Repair is run.
3) Compaction runs and garbage collects the tombstone for ``key = K1``.
4) Write ``INSERT INTO table (key, ...) VALUES (K1, ...)`` arrives at the node with a timestamp smaller than that of the delete. The tombstone for ``key = K1`` should apply to this write, but it has already been garbage collected, so this data is resurrected.

Propagation delay solves this problem by establishing a window before repair in which tombstones are not yet garbage collectible: a tombstone is garbage collectible only if it was written before the last repair by at least the propagation delay.

The value of the propagation delay can be set via the ``propagation_delay_in_seconds`` parameter:

.. code-block:: cql

   CREATE TABLE ks.cf (key blob PRIMARY KEY, val blob) WITH tombstone_gc = {'mode':'repair', 'propagation_delay_in_seconds': 120};

.. code-block:: cql

   ALTER TABLE ks.cf WITH tombstone_gc = {'mode':'repair', 'propagation_delay_in_seconds': 120};

The default value of the propagation delay is 1 hour. For example, with the default value, a tombstone written at 09:00 only becomes garbage collectible after a repair that ran at 10:00 or later. This parameter should only be changed if your application uses user-provided timestamps and writes and deletes can arrive out-of-order by more than the default 1 hour.

The following tombstone GC modes are available:

.. list-table::
   :widths: 20 80

@@ -281,8 +281,7 @@ For example::

       ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;


Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key
or columns provided in a definition of the index.
Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key.

For example::
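
   -- An illustrative sketch only (not taken verbatim from the page): it
   -- assumes a table ImageEmbeddings whose partition key is id and whose
   -- vector column is embedding, as in the index examples in these docs.
   SELECT * FROM ImageEmbeddings
   WHERE id = 1
   ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;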
@@ -140,83 +140,17 @@ Vector Index :label-note:`ScyllaDB Cloud`
`ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/stable/vector-search/>`_.

ScyllaDB supports creating vector indexes on tables, allowing queries on the table to use those indexes for efficient
similarity search on vector data. Vector indexes can be a global index for indexing vectors per table or a local
index for indexing vectors per partition.
similarity search on vector data.

The vector index is the only custom type index supported in ScyllaDB. It is created using
the ``CUSTOM`` keyword and specifying the index type as ``vector_index``. It is also possible to
add additional columns to the index for filtering the search results. The partition column
specified in the global vector index definition must be the vector column, and any subsequent
columns are treated as filtering columns. The local vector index requires that the partition key
of the base table is also the partition key of the index and the vector column is the first one
from the following columns.

Example of a simple index:
the ``CUSTOM`` keyword and specifying the index type as ``vector_index``. Example:

.. code-block:: cql

   CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings (embedding)
   USING 'vector_index'
   WITH OPTIONS = {'similarity_function': 'COSINE', 'maximum_node_connections': '16'};

The vector column (``embedding``) is indexed to enable similarity search using
a global vector index. Additional filtering can be performed on the primary key
columns of the base table.

Example of a global vector index with additional filtering:

.. code-block:: cql

   CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings (embedding, category, info)
   USING 'vector_index'
   WITH OPTIONS = {'similarity_function': 'COSINE', 'maximum_node_connections': '16'};

The vector column (``embedding``) is indexed to enable similarity search using
a global index. Additional columns are added for filtering the search results.
The filtering is possible on ``category``, ``info`` and all primary key columns
of the base table.

Example of a local vector index:

.. code-block:: cql

   CREATE CUSTOM INDEX vectorIndex ON ImageEmbeddings ((id, created_at), embedding, category, info)
   USING 'vector_index'
   WITH OPTIONS = {'similarity_function': 'COSINE', 'maximum_node_connections': '16'};

The vector column (``embedding``) is indexed for similarity search (a local
index) and additional columns are added for filtering the search results. The
filtering is possible on ``category``, ``info`` and all primary key columns of
the base table. The columns ``id`` and ``created_at`` must be the partition key
of the base table.
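
A sketch of querying through such a local index (the literal values are
illustrative, not from the docs; it restricts the partition-key columns
``id`` and ``created_at``, matching the index definition above):

.. code-block:: cql

   SELECT * FROM ImageEmbeddings
   WHERE id = 1 AND created_at = '2024-01-01 00:00:00'
   ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;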

Vector indexes support additional filtering columns of native data types
(excluding counter and duration). The indexed column itself must be a vector
column, while the extra columns can be used to filter search results.

The supported types are:

* ``ascii``
* ``bigint``
* ``blob``
* ``boolean``
* ``date``
* ``decimal``
* ``double``
* ``float``
* ``inet``
* ``int``
* ``smallint``
* ``text``
* ``varchar``
* ``time``
* ``timestamp``
* ``timeuuid``
* ``tinyint``
* ``uuid``
* ``varint``
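
As a sketch of using the filtering columns from the global-index example
above (the literal value is illustrative, not from the docs):

.. code-block:: cql

   SELECT * FROM ImageEmbeddings
   WHERE category = 'animals'
   ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;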

The following options are supported for vector indexes. All of them are optional.

+------------------------------+----------------------------------------------------------------------------------------------------------+---------------+
@@ -78,6 +78,7 @@ Permits are in one of the following states:
* `active/await` - a previously `active/need_cpu` permit, which needs something other than CPU to proceed; it is waiting on I/O or a remote shard. Other permits can be admitted while the permit is in this state, pending resource availability;
* `inactive` - the permit was marked inactive; it can be evicted to make room for admitting more permits if needed;
* `evicted` - a former inactive permit which was evicted; the permit has to undergo admission again for the read to resume;
* `preemptive_aborted` - the permit timed out or was rejected during admission because it was detected that the read might time out later during execution;

Note that some older releases will have different names for some of these states or lack some of the states altogether:

@@ -124,6 +124,7 @@ There are several test directories that are excluded from orchestration by `test
- test/cql
- test/cqlpy
- test/rest_api
- test/scylla_gdb

This means that `test.py` will not run tests directly, but will delegate all work to `pytest`.
That's why all these directories do not have `suite.yaml` files.

@@ -3,9 +3,9 @@
Automatic Repair
================

Traditionally, launching :doc:`repairs </operating-scylla/procedures/maintenance/repair>` in a ScyllaDB cluster is left to an external process, typically done via `Scylla Manager <https://manager.docs.scylladb.com/stable/repair/index.html>`_.
Traditionally, launching `repairs </operating-scylla/procedures/maintenance/repair>`_ in a ScyllaDB cluster is left to an external process, typically done via `Scylla Manager <https://manager.docs.scylladb.com/stable/repair/index.html>`_.

Automatic repair offers built-in scheduling in ScyllaDB itself. If the time since the last repair is greater than the configured repair interval, ScyllaDB will start a repair for the :doc:`tablet table </architecture/tablets>` automatically.
Automatic repair offers built-in scheduling in ScyllaDB itself. If the time since the last repair is greater than the configured repair interval, ScyllaDB will start a repair for the `tablet </architecture/tablets>`_ table automatically.
Repairs are spread over time and among nodes and shards, to avoid load spikes or any adverse effects on user workloads.

To enable automatic repair, add this to the configuration (``scylla.yaml``):
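
A minimal sketch of that setting, assuming ``auto_repair_enabled_default``
(the option used below to disable the feature) is the switch in question:

.. code-block:: yaml

   auto_repair_enabled_default: true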

@@ -20,4 +20,4 @@ More featureful configuration methods will be implemented in the future.

To disable, set ``auto_repair_enabled_default: false``.

Automatic repair relies on :doc:`Incremental Repair </features/incremental-repair>` and as such it only works with :doc:`tablet </architecture/tablets>` tables.
Automatic repair relies on `Incremental Repair </features/incremental-repair>`_ and as such it only works with `tablet </architecture/tablets>`_ tables.

@@ -3,7 +3,7 @@
Incremental Repair
==================

ScyllaDB's standard :doc:`repair </operating-scylla/procedures/maintenance/repair>` process scans and processes all the data on a node, regardless of whether it has changed since the last repair. This operation can be resource-intensive and time-consuming. The Incremental Repair feature provides a much more efficient and lightweight alternative for maintaining data consistency.
ScyllaDB's standard `repair </operating-scylla/procedures/maintenance/repair>`_ process scans and processes all the data on a node, regardless of whether it has changed since the last repair. This operation can be resource-intensive and time-consuming. The Incremental Repair feature provides a much more efficient and lightweight alternative for maintaining data consistency.

The core idea of incremental repair is to repair only the data that has been written or changed since the last repair was run. It intelligently skips data that has already been verified, dramatically reducing the time, I/O, and CPU resources required for the repair operation.

@@ -51,7 +51,7 @@ Benefits of Incremental Repair
* **Reduced Resource Usage:** Consumes significantly less CPU, I/O, and network bandwidth compared to a full repair.
* **More Frequent Repairs:** The efficiency of incremental repair allows you to run it more frequently, ensuring a higher level of data consistency across your cluster at all times.

Tables using Incremental Repair can schedule repairs in ScyllaDB itself, with :doc:`Automatic Repair </features/automatic-repair>`.
Tables using Incremental Repair can schedule repairs in ScyllaDB itself, with `Automatic Repair </features/automatic-repair>`_.

Notes
-----

@@ -601,11 +601,7 @@ Scrub has several modes:
* **segregate** - Fixes partition/row/mutation-fragment out-of-order errors by segregating the output into as many SStables as required so that the content of each output SStable is properly ordered.
* **validate** - Validates the content of the SStable, reporting any corruptions found. Writes no output SStables. In this mode, scrub has the same outcome as the `validate operation <scylla-sstable-validate-operation_>`_ - and the validate operation is recommended over scrub.

Output SStables are written to the directory specified via ``--output-directory``. They will be written with the ``BIG`` format and the highest supported SStable format, with generations chosen by scylla-sstable. Generations are chosen such
that they are unique among the SStables written by the current scrub.

The output directory must be empty; otherwise, scylla-sstable will abort scrub. You can allow writing to a non-empty directory by setting the ``--unsafe-accept-nonempty-output-dir`` command line flag.
Note that scrub will be aborted if an SStable cannot be written because its generation clashes with a pre-existing SStable in the output directory.
Output SStables are written to the directory specified via ``--output-dir``. They will be written with the ``BIG`` format and the highest supported SStable format, with random generation.

validate-checksums
^^^^^^^^^^^^^^^^^^
@@ -870,7 +866,7 @@ The SSTable version to be used can be overridden with the ``--version`` flag, al
SSTables which are already on the designated version are skipped. To force rewriting *all* SSTables, use the ``--all`` flag.

Output SSTables are written to the path provided by the ``--output-dir`` flag, or to the current directory if not specified.
This directory is expected to exist and be empty. If not empty the tool will refuse to run. This can be overridden with the ``--unsafe-accept-nonempty-output-dir`` flag.
This directory is expected to exist.

It is strongly recommended to use the system schema tables as the schema source for this command; see the :ref:`schema options <scylla-sstable-schema>` for more details.
A schema which is good enough to read the SSTable and dump its content may not be good enough to write its content back verbatim.
@@ -882,6 +878,25 @@ But even an altered schema which changed only the table options can lead to data

The mapping of input SSTables to output SSTables is printed to ``stdout``.

filter
^^^^^^

Filter the SSTable(s), including/excluding specified partitions.

Similar to ``scylla sstable dump-data --partition|--partition-file``, with some notable differences:

* Instead of dumping the content to stdout, the filtered content is written back to SSTable(s) on disk.
* Also supports negative filters (keep all partitions except those specified).

The partition list can be provided either via the ``--partition`` command line argument, or via a file path passed to the ``--partitions-file`` argument. The file should contain one partition key per line.
Partition keys should be provided in the hex format, as produced by `scylla types serialize </operating-scylla/admin-tools/scylla-types/>`_.

With ``--include``, only the specified partitions are kept from the input SSTable(s). With ``--exclude``, the specified partitions are discarded and won't be written to the output SSTable(s).
It is possible that certain input SSTable(s) won't have any content left after the filtering. These input SSTable(s) will not have a matching output SSTable.

By default, each input sstable is filtered individually. Use ``--merge`` to filter the combined content of all input sstables, producing a single output SSTable.

Output sstables use the latest supported sstable format (can be changed with ``--sstable-version``).

Examples
--------

@@ -61,9 +61,9 @@ See also

Incremental Repair
------------------

-Built on top of :ref:`Row-level Repair <row-level-repair>` and :doc:`Tablets </architecture/tablets>`, Incremental Repair enables frequent and quick repairs. For more details, see :doc:`Incremental Repair </features/incremental-repair>`.
+Built on top of `Row-level Repair <row-level-repair_>`_ and `Tablets </architecture/tablets>`_, Incremental Repair enables frequent and quick repairs. For more details, see `Incremental Repair </features/incremental-repair>`_.

Automatic Repair
----------------

-Built on top of :doc:`Incremental Repair </features/incremental-repair>`, :doc:`Automatic Repair </features/automatic-repair>` offers repair scheduling and execution directly in ScyllaDB, without external processes.
+Built on top of `Incremental Repair </features/incremental-repair>`_, `Automatic Repair </features/automatic-repair>`_ offers repair scheduling and execution directly in ScyllaDB, without external processes.
@@ -182,6 +182,7 @@ public:
    gms::feature removenode_with_left_token_ring { *this, "REMOVENODE_WITH_LEFT_TOKEN_RING"sv };
    gms::feature size_based_load_balancing { *this, "SIZE_BASED_LOAD_BALANCING"sv };
    gms::feature topology_noop_request { *this, "TOPOLOGY_NOOP_REQUEST"sv };
+   gms::feature tablets_intermediate_fallback_cleanup { *this, "TABLETS_INTERMEDIATE_FALLBACK_CLEANUP"sv };
public:

    const std::unordered_map<sstring, std::reference_wrapper<feature>>& registered_features() const;
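New cluster features follow a common pattern: callers consult the feature before emitting anything that older nodes cannot parse. A hedged sketch of such a call site (hypothetical, not from the tree; assumes gms::feature is contextually convertible to bool once the whole cluster advertises support):

    // Hypothetical call site: gate new behavior on the cluster feature
    // declared above. The feature reads as true only after every node
    // advertises TABLETS_INTERMEDIATE_FALLBACK_CLEANUP.
    bool can_use_fallback_cleanup(const auto& features) {
        return bool(features.tablets_intermediate_fallback_cleanup);
    }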
@@ -17,11 +17,11 @@

#include "index/secondary_index.hh"
#include "index/secondary_index_manager.hh"
#include "types/concrete_types.hh"
#include "types/types.hh"
#include "utils/managed_string.hh"
#include <seastar/core/sstring.hh>
#include <boost/algorithm/string.hpp>

namespace secondary_index {

static void validate_positive_option(int max, const sstring& value_name, const sstring& value) {
@@ -147,88 +147,17 @@ std::optional<cql3::description> vector_index::describe(const index_metadata& im
}

void vector_index::check_target(const schema& schema, const std::vector<::shared_ptr<cql3::statements::index_target>>& targets) const {

-    struct validate_visitor {
-        const class schema& schema;
-        bool& is_vector;
-
-        /// Vector indexes support filtering on native types that can be used as primary key columns.
-        /// There is no counter (it cannot be used with vector columns)
-        /// and no duration (it cannot be used as a primary key or in secondary indexes).
-        static bool is_supported_filtering_column(abstract_type const & kind_type) {
-            switch (kind_type.get_kind()) {
-            case abstract_type::kind::ascii:
-            case abstract_type::kind::boolean:
-            case abstract_type::kind::byte:
-            case abstract_type::kind::bytes:
-            case abstract_type::kind::date:
-            case abstract_type::kind::decimal:
-            case abstract_type::kind::double_kind:
-            case abstract_type::kind::float_kind:
-            case abstract_type::kind::inet:
-            case abstract_type::kind::int32:
-            case abstract_type::kind::long_kind:
-            case abstract_type::kind::short_kind:
-            case abstract_type::kind::simple_date:
-            case abstract_type::kind::time:
-            case abstract_type::kind::timestamp:
-            case abstract_type::kind::timeuuid:
-            case abstract_type::kind::utf8:
-            case abstract_type::kind::uuid:
-            case abstract_type::kind::varint:
-                return true;
-            default:
-                break;
-            }
-            return false;
-        }
-
-        void validate(cql3::column_identifier const& column, bool is_vector) const {
-            auto const& c_name = column.to_string();
-            auto const* c_def = schema.get_column_definition(column.name());
-            if (c_def == nullptr) {
-                throw exceptions::invalid_request_exception(format("Column {} not found in schema", c_name));
-            }
-
-            auto type = c_def->type;
-
-            if (is_vector) {
-                auto const* vector_type = dynamic_cast<const vector_type_impl*>(type.get());
-                if (vector_type == nullptr) {
-                    throw exceptions::invalid_request_exception("Vector indexes are only supported on columns of vectors of floats");
-                }
-
-                auto elements_type = vector_type->get_elements_type();
-                if (elements_type->get_kind() != abstract_type::kind::float_kind) {
-                    throw exceptions::invalid_request_exception("Vector indexes are only supported on columns of vectors of floats");
-                }
-                return;
-            }
-
-            if (!is_supported_filtering_column(*type)) {
-                throw exceptions::invalid_request_exception(format("Unsupported vector index filtering column {} type", c_name));
-            }
-        }
-
-        void operator()(const std::vector<::shared_ptr<cql3::column_identifier>>& columns) const {
-            for (const auto& column : columns) {
-                // CQL restricts the secondary local index to have multiple columns with partition key only.
-                // Vectors shouldn't be partition key columns and they aren't supported as a filtering column,
-                // so we can assume here that these are non-vectors filtering columns.
-                validate(*column, false);
-            }
-        }
-
-        void operator()(const ::shared_ptr<cql3::column_identifier>& column) {
-            validate(*column, is_vector);
-            // The first column is the vector column, the rest mustn't be vectors.
-            is_vector = false;
-        }
-    };
-
-    bool is_vector = true;
-    for (const auto& target : targets) {
-        std::visit(validate_visitor{.schema = schema, .is_vector = is_vector}, target->value);
-    }
+    if (targets.size() != 1) {
+        throw exceptions::invalid_request_exception("Vector index can only be created on a single column");
+    }
+    auto target = targets[0];
+    auto c_def = schema.get_column_definition(to_bytes(target->column_name()));
+    if (!c_def) {
+        throw exceptions::invalid_request_exception(format("Column {} not found in schema", target->column_name()));
+    }
+    auto type = c_def->type;
+    if (!type->is_vector() || static_cast<const vector_type_impl*>(type.get())->get_elements_type()->get_kind() != abstract_type::kind::float_kind) {
+        throw exceptions::invalid_request_exception("Vector indexes are only supported on columns of vectors of floats");
+    }
}
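The replacement collapses the visitor into a single-target check: exactly one column, and its type must be a vector of floats. Distilled into a hedged, stand-alone sketch (using the abstract_type/vector_type_impl interfaces as they appear in the function above):

    // Minimal sketch of the vector-of-floats test performed above.
    static bool is_float_vector(const abstract_type& t) {
        if (!t.is_vector()) {
            return false; // not a vector<...> column at all
        }
        auto& vt = static_cast<const vector_type_impl&>(t);
        return vt.get_elements_type()->get_kind() == abstract_type::kind::float_kind;
    }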

@@ -31,6 +31,7 @@ fi

debian_base_packages=(
    clang
+   clang-tools
    gdb
    cargo
    wabt

@@ -72,6 +73,7 @@ debian_base_packages=(

fedora_packages=(
    clang
+   clang-tools-extra
    compiler-rt
    libasan
    libubsan
@@ -50,6 +50,8 @@ write_replica_set_selector get_selector_for_writes(tablet_transition_stage stage
        return write_replica_set_selector::previous;
    case tablet_transition_stage::write_both_read_old:
        return write_replica_set_selector::both;
+   case tablet_transition_stage::write_both_read_old_fallback_cleanup:
+       return write_replica_set_selector::both;
    case tablet_transition_stage::streaming:
        return write_replica_set_selector::both;
    case tablet_transition_stage::rebuild_repair:

@@ -81,6 +83,8 @@ read_replica_set_selector get_selector_for_reads(tablet_transition_stage stage)
        return read_replica_set_selector::previous;
    case tablet_transition_stage::write_both_read_old:
        return read_replica_set_selector::previous;
+   case tablet_transition_stage::write_both_read_old_fallback_cleanup:
+       return read_replica_set_selector::previous;
    case tablet_transition_stage::streaming:
        return read_replica_set_selector::previous;
    case tablet_transition_stage::rebuild_repair:

@@ -741,6 +745,7 @@ void tablet_map::set_tablet_raft_info(tablet_id id, tablet_raft_info raft_info)
static const std::unordered_map<tablet_transition_stage, sstring> tablet_transition_stage_to_name = {
    {tablet_transition_stage::allow_write_both_read_old, "allow_write_both_read_old"},
    {tablet_transition_stage::write_both_read_old, "write_both_read_old"},
+   {tablet_transition_stage::write_both_read_old_fallback_cleanup, "write_both_read_old_fallback_cleanup"},
    {tablet_transition_stage::write_both_read_new, "write_both_read_new"},
    {tablet_transition_stage::streaming, "streaming"},
    {tablet_transition_stage::rebuild_repair, "rebuild_repair"},

@@ -277,6 +277,7 @@ std::optional<tablet_info> merge_tablet_info(tablet_info a, tablet_info b);
enum class tablet_transition_stage {
    allow_write_both_read_old,
    write_both_read_old,
    write_both_read_old_fallback_cleanup,
    streaming,
    rebuild_repair,
    write_both_read_new,
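For routing purposes the new stage mirrors write_both_read_old: writes go to both replica sets while reads stay on the previous one, and every switch or map keyed on the enum gains an entry. A hedged sketch of that routing rule (hypothetical helper, not from the tree):

    // Hypothetical illustration of the routing added above: during the
    // fallback-cleanup stage, writes are duplicated to both replica sets.
    bool writes_to_both(tablet_transition_stage s) {
        switch (s) {
        case tablet_transition_stage::write_both_read_old:
        case tablet_transition_stage::write_both_read_old_fallback_cleanup:
        case tablet_transition_stage::streaming:
            return true;
        default:
            return false;
        }
    }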
main.cc
@@ -571,7 +571,7 @@ sharded<service::storage_proxy> *the_storage_proxy;

// This is used by perf-alternator to allow running scylla together with the tool
// in a single process, so that it's easier to measure internals. It's not added
// to main_func_type to not complicate the common flow, as no other tool needs such logic.
-std::function<void(lw_shared_ptr<db::config>)> after_init_func;
+std::function<future<>(lw_shared_ptr<db::config>, sharded<abort_source>&)> after_init_func;

static locator::host_id initialize_local_info_thread(sharded<db::system_keyspace>& sys_ks,
    sharded<locator::snitch_ptr>& snitch,
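The hook thus becomes asynchronous and abort-aware. A minimal sketch of how a tool might register it under the new signature (hypothetical body, not from the tree):

    // Hypothetical registration, illustrating the new signature only:
    // the hook returns a future<> that main keeps until shutdown, and it
    // receives the sharded abort_source so the tool can stop early on
    // SIGINT/SIGTERM instead of blocking shutdown.
    after_init_func = [] (lw_shared_ptr<db::config> cfg, sharded<abort_source>& as) -> future<> {
        co_await run_benchmark(cfg, as.local()); // run_benchmark is hypothetical
    };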
@@ -906,6 +906,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl

auto background_reclaim_scheduling_group = create_scheduling_group("background_reclaim", "bgre", 50).get();
auto maintenance_scheduling_group = create_scheduling_group("streaming", "strm", 200).get();
+debug::streaming_scheduling_group = maintenance_scheduling_group;

smp::invoke_on_all([&cfg, background_reclaim_scheduling_group] {
    logalloc::tracker::config st_cfg;
@@ -1306,6 +1307,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl

checkpoint(stop_signal, "starting storage proxy");
service::storage_proxy::config spcfg {
    .hints_directory_initializer = hints_dir_initializer,
+   .hints_sched_group = maintenance_scheduling_group,
};
spcfg.hinted_handoff_enabled = hinted_handoff_enabled;
spcfg.available_memory = memory::stats().total_memory();
@@ -1677,7 +1679,9 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
    gossiper.local(), feature_service.local(), sys_ks.local(), group0_client, dbcfg.gossip_scheduling_group};

checkpoint(stop_signal, "starting tablet allocator");
-service::tablet_allocator::config tacfg;
+service::tablet_allocator::config tacfg {
+   .background_sg = maintenance_scheduling_group,
+};
sharded<service::tablet_allocator> tablet_allocator;
tablet_allocator.start(tacfg, std::ref(mm_notifier), std::ref(db)).get();
auto stop_tablet_allocator = defer_verbose_shutdown("tablet allocator", [&tablet_allocator] {
@@ -2490,7 +2494,9 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl

if (cfg->view_building()) {
    checkpoint(stop_signal, "starting view builders");
-   view_builder.invoke_on_all(&db::view::view_builder::start, std::ref(mm), utils::cross_shard_barrier()).get();
+   with_scheduling_group(maintenance_scheduling_group, [&mm] {
+       return view_builder.invoke_on_all(&db::view::view_builder::start, std::ref(mm), utils::cross_shard_barrier());
+   }).get();
}
auto drain_view_builder = defer_verbose_shutdown("draining view builders", [&] {
    view_builder.invoke_on_all(&db::view::view_builder::drain).get();
@@ -2576,11 +2582,13 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
supervisor::notify("serving");

startlog.info("Scylla version {} initialization completed.", scylla_version());
+future<> after_init_fut = make_ready_future<>();
if (after_init_func) {
-   after_init_func(cfg);
+   after_init_fut = after_init_func(cfg, stop_signal.as_sharded_abort_source());
}
stop_signal.wait().get();
startlog.info("Signal received; shutting down");
+std::move(after_init_fut).get();
// At this point, all object destructors and all shutdown hooks registered with defer() are executed
} catch (const sleep_aborted&) {
    startlog.info("Startup interrupted");
@@ -2650,7 +2658,8 @@ int main(int ac, char** av) {
    {"perf-load-balancing", perf::scylla_tablet_load_balancing_main, "run tablet load balancer tests"},
    {"perf-simple-query", perf::scylla_simple_query_main, "run performance tests by sending simple queries to this server"},
    {"perf-sstable", perf::scylla_sstable_main, "run performance tests by exercising sstable related operations on this server"},
-   {"perf-alternator", perf::alternator(scylla_main, &after_init_func), "run performance tests on full alternator stack"}
+   {"perf-alternator", perf::alternator(scylla_main, &after_init_func), "run performance tests on full alternator stack"},
+   {"perf-cql-raw", perf::perf_cql_raw(scylla_main, &after_init_func), "run performance tests using raw CQL protocol frames"}
};

main_func_type main_func;
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:8a470d16aee004a06f22af49e0f9bcc3ee845c5dcabfb803b2fca96ab27c7908
-size 6526676
+oid sha256:a4710f1f0b0bb329721c21d133618e811e820f2e70553b0aca28fb278bff89c9
+size 6492280

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:f0303b6705733d1d236700c3e36652c97eb02e8e78b2e04e8008dffd23804759
-size 6526408
+oid sha256:2433f7a1fc5cda0dd990ab59587eb6046dca0fe1ae48d599953d1936fe014ed9
+size 6492176
@@ -148,6 +148,7 @@ public:
};

private:
+   const db::timeout_clock::time_point _created;
    reader_concurrency_semaphore& _semaphore;
    schema_ptr _schema;
@@ -237,17 +238,25 @@ private:
        break;
    case state::inactive:
        _semaphore.evict(*this, reader_concurrency_semaphore::evict_reason::time);
-       break;
+       // Return here on purpose. The evicted permit is destroyed when closing a reader.
+       // As a consequence, any member access beyond this point is invalid.
+       return;
    case state::evicted:
+   case state::preemptive_aborted:
        break;
    }

+   // The function call not only sets state to reader_permit::state::preemptive_aborted
+   // but also correctly decreases the statistics i.e. need_cpu_permits and awaits_permits.
+   on_permit_inactive(reader_permit::state::preemptive_aborted);
}

public:
    struct value_tag {};

    impl(reader_concurrency_semaphore& semaphore, schema_ptr schema, const std::string_view& op_name, reader_resources base_resources, db::timeout_clock::time_point timeout, tracing::trace_state_ptr trace_ptr)
-       : _semaphore(semaphore)
+       : _created(db::timeout_clock::now())
+       , _semaphore(semaphore)
        , _schema(std::move(schema))
        , _op_name_view(op_name)
        , _base_resources(base_resources)
@@ -258,7 +267,8 @@ public:
        _semaphore.on_permit_created(*this);
    }
    impl(reader_concurrency_semaphore& semaphore, schema_ptr schema, sstring&& op_name, reader_resources base_resources, db::timeout_clock::time_point timeout, tracing::trace_state_ptr trace_ptr)
-       : _semaphore(semaphore)
+       : _created(db::timeout_clock::now())
+       , _semaphore(semaphore)
        , _schema(std::move(schema))
        , _op_name(std::move(op_name))
        , _op_name_view(_op_name)
@@ -360,6 +370,17 @@ public:
        on_permit_active();
    }

+   void on_preemptive_aborted() {
+       if (_state != reader_permit::state::waiting_for_admission && _state != reader_permit::state::waiting_for_memory) {
+           on_internal_error(rcslog, format("on_preemptive_aborted(): permit in invalid state {}", _state));
+       }
+
+       _ttl_timer.cancel();
+       _state = reader_permit::state::preemptive_aborted;
+       _aux_data.pr.set_exception(named_semaphore_aborted(_semaphore._name));
+       _semaphore.on_permit_preemptive_aborted();
+   }
+
    void on_register_as_inactive() {
        SCYLLA_ASSERT(_state == reader_permit::state::active || _state == reader_permit::state::active_need_cpu || _state == reader_permit::state::waiting_for_memory);
        on_permit_inactive(reader_permit::state::inactive);

@@ -467,6 +488,10 @@ public:
        return _semaphore.do_wait_admission(*this);
    }

+   db::timeout_clock::time_point created() const noexcept {
+       return _created;
+   }
+
    db::timeout_clock::time_point timeout() const noexcept {
        return _ttl_timer.armed() ? _ttl_timer.get_timeout() : db::no_timeout;
    }
@@ -689,6 +714,9 @@ auto fmt::formatter<reader_permit::state>::format(reader_permit::state s, fmt::f
    case reader_permit::state::evicted:
        name = "evicted";
        break;
+   case reader_permit::state::preemptive_aborted:
+       name = "preemptive_aborted";
+       break;
    }
    return formatter<string_view>::format(name, ctx);
}
@@ -1038,6 +1066,7 @@ reader_concurrency_semaphore::reader_concurrency_semaphore(
    utils::updateable_value<uint32_t> serialize_limit_multiplier,
    utils::updateable_value<uint32_t> kill_limit_multiplier,
    utils::updateable_value<uint32_t> cpu_concurrency,
+   utils::updateable_value<float> preemptive_abort_factor,
    register_metrics metrics)
    : _initial_resources(count, memory)
    , _resources(count, memory)

@@ -1047,6 +1076,7 @@ reader_concurrency_semaphore::reader_concurrency_semaphore(
    , _serialize_limit_multiplier(std::move(serialize_limit_multiplier))
    , _kill_limit_multiplier(std::move(kill_limit_multiplier))
    , _cpu_concurrency(cpu_concurrency)
+   , _preemptive_abort_factor(preemptive_abort_factor)
    , _close_readers_gate(format("[reader_concurrency_semaphore {}] close_readers", _name))
    , _permit_gate(format("[reader_concurrency_semaphore {}] permit", _name))
{

@@ -1114,6 +1144,7 @@ reader_concurrency_semaphore::reader_concurrency_semaphore(no_limits, sstring na
    utils::updateable_value(std::numeric_limits<uint32_t>::max()),
    utils::updateable_value(std::numeric_limits<uint32_t>::max()),
    utils::updateable_value(uint32_t(1)),
+   utils::updateable_value(float(0.0)),
    metrics) {}

reader_concurrency_semaphore::~reader_concurrency_semaphore() {
@@ -1489,6 +1520,25 @@ void reader_concurrency_semaphore::maybe_admit_waiters() noexcept {
    auto& permit = _wait_list.front();
    dequeue_permit(permit);
    try {
+       // Do not admit the read as it is unlikely to finish before its timeout. The condition is:
+       // permit's remaining time <= preemptive_abort_factor * permit's time budget
+       //
+       // The additional check for remaining_time > 0 is to avoid preemptively aborting reads
+       // that already timed out but are still in the wait list due to scheduling delays.
+       // It also effectively disables preemptive aborting when the factor is set to 0.
+       const auto time_budget = permit.timeout() - permit.created();
+       const auto remaining_time = permit.timeout() - db::timeout_clock::now();
+       if (remaining_time > db::timeout_clock::duration::zero() && remaining_time <= _preemptive_abort_factor() * time_budget) {
+           permit.on_preemptive_aborted();
+           using ms = std::chrono::milliseconds;
+           tracing::trace(permit.trace_state(), "[reader concurrency semaphore {}] read shed as unlikely to finish (elapsed: {}, timeout: {}, preemptive_factor: {})",
+               _name,
+               std::chrono::duration_cast<ms>(time_budget - remaining_time),
+               std::chrono::duration_cast<ms>(time_budget),
+               _preemptive_abort_factor());
+           continue;
+       }

        if (permit.get_state() == reader_permit::state::waiting_for_memory) {
            _blessed_permit = &permit;
            permit.on_granted_memory();
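The shedding rule is easiest to see with numbers. A hedged, self-contained sketch of the same arithmetic (standard <chrono> only; values invented for illustration):

    #include <chrono>
    #include <cassert>

    int main() {
        using namespace std::chrono_literals;
        const auto time_budget = 1000ms;   // timeout() - created()
        const float factor = 0.2f;         // preemptive_abort_factor
        // A permit reaching the head of the queue with 150ms left out of a
        // 1000ms budget is shed, because 150ms <= 0.2 * 1000ms = 200ms.
        const auto remaining = 150ms;
        assert(remaining > 0ms && remaining.count() <= factor * time_budget.count());
        // A permit that already timed out (remaining <= 0) is not shed here;
        // it fails with a regular timeout instead.
    }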
@@ -1549,7 +1599,11 @@ void reader_concurrency_semaphore::dequeue_permit(reader_permit::impl& permit) {
    case reader_permit::state::waiting_for_admission:
    case reader_permit::state::waiting_for_memory:
    case reader_permit::state::waiting_for_execution:
-       --_stats.waiters;
+       if (_stats.waiters > 0) {
+           --_stats.waiters;
+       } else {
+           on_internal_error_noexcept(rcslog, "reader_concurrency_semaphore::dequeue_permit(): invalid state: no waiters yet dequeueing a waiting permit");
+       }
        break;
    case reader_permit::state::inactive:
    case reader_permit::state::evicted:
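The guarded decrement turns a silent counter underflow into a loud internal error. A hedged, generic sketch of the pattern (names hypothetical):

    // Hypothetical illustration: decrementing an unsigned gauge defensively.
    // An unchecked --counter on 0 wraps to UINT64_MAX and poisons every
    // metric derived from it; reporting keeps the process alive but makes
    // the bug visible.
    void dec_gauge(uint64_t& counter, const char* what) {
        if (counter > 0) {
            --counter;
        } else {
            report_internal_error(what); // hypothetical reporting hook
        }
    }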
@@ -1558,12 +1612,17 @@ void reader_concurrency_semaphore::dequeue_permit(reader_permit::impl& permit) {
    case reader_permit::state::active:
    case reader_permit::state::active_need_cpu:
    case reader_permit::state::active_await:
+   case reader_permit::state::preemptive_aborted:
        on_internal_error_noexcept(rcslog, format("reader_concurrency_semaphore::dequeue_permit(): unrecognized queued state: {}", permit.get_state()));
    }
    permit.unlink();
    _permit_list.push_back(permit);
}

+void reader_concurrency_semaphore::on_permit_preemptive_aborted() noexcept {
+   ++_stats.total_reads_shed_due_to_overload;
+}
+
void reader_concurrency_semaphore::on_permit_created(reader_permit::impl& permit) {
    _permit_gate.enter();
    _permit_list.push_back(permit);
@@ -42,7 +42,7 @@ using mutation_reader_opt = optimized_optional<mutation_reader>;
/// number of waiting readers becomes equal or greater than
/// `max_queue_length` (upon calling `obtain_permit()`) an exception of
/// type `std::runtime_error` is thrown. Optionally, some additional
/// code can be executed just before throwing (`prethrow_action`
/// constructor parameter).
///
/// The semaphore has 3 layers of defense against consuming more memory
@@ -89,6 +89,7 @@ public:
    // Total number of failed reads executed through this semaphore.
    uint64_t total_failed_reads = 0;
    // Total number of reads rejected because the admission queue reached its max capacity
+   // or rejected due to a high probability of not getting finalized on time.
    uint64_t total_reads_shed_due_to_overload = 0;
    // Total number of reads killed due to the memory consumption reaching the kill limit.
    uint64_t total_reads_killed_due_to_kill_limit = 0;
@@ -192,6 +193,8 @@ private:
    utils::updateable_value<uint32_t> _serialize_limit_multiplier;
    utils::updateable_value<uint32_t> _kill_limit_multiplier;
    utils::updateable_value<uint32_t> _cpu_concurrency;
+   utils::updateable_value<float> _preemptive_abort_factor;

    stats _stats;
    std::optional<seastar::metrics::metric_groups> _metrics;
    bool _stopped = false;

@@ -250,6 +253,8 @@ private:
    void on_permit_created(reader_permit::impl&);
    void on_permit_destroyed(reader_permit::impl&) noexcept;

+   void on_permit_preemptive_aborted() noexcept;
+
    void on_permit_need_cpu() noexcept;
    void on_permit_not_need_cpu() noexcept;
@@ -287,6 +292,7 @@ public:
    utils::updateable_value<uint32_t> serialize_limit_multiplier,
    utils::updateable_value<uint32_t> kill_limit_multiplier,
    utils::updateable_value<uint32_t> cpu_concurrency,
+   utils::updateable_value<float> preemptive_abort_factor,
    register_metrics metrics);

reader_concurrency_semaphore(

@@ -296,9 +302,12 @@ public:
    size_t max_queue_length,
    utils::updateable_value<uint32_t> serialize_limit_multiplier,
    utils::updateable_value<uint32_t> kill_limit_multiplier,
    utils::updateable_value<uint32_t> cpu_concurrency,
+   utils::updateable_value<float> preemptive_abort_factor,
    register_metrics metrics)
    : reader_concurrency_semaphore(utils::updateable_value(count), memory, std::move(name), max_queue_length,
-       std::move(serialize_limit_multiplier), std::move(kill_limit_multiplier), utils::updateable_value<uint32_t>(1), metrics)
+       std::move(serialize_limit_multiplier), std::move(kill_limit_multiplier), std::move(cpu_concurrency),
+       std::move(preemptive_abort_factor), metrics)
{ }

/// Create a semaphore with practically unlimited count and memory.

@@ -318,9 +327,10 @@ public:
    utils::updateable_value<uint32_t> serialize_limit_multipler = utils::updateable_value(std::numeric_limits<uint32_t>::max()),
    utils::updateable_value<uint32_t> kill_limit_multipler = utils::updateable_value(std::numeric_limits<uint32_t>::max()),
    utils::updateable_value<uint32_t> cpu_concurrency = utils::updateable_value<uint32_t>(1),
+   utils::updateable_value<float> preemptive_abort_factor = utils::updateable_value<float>(0.0f),
    register_metrics metrics = register_metrics::no)
    : reader_concurrency_semaphore(utils::updateable_value(count), memory, std::move(name), max_queue_length, std::move(serialize_limit_multipler),
-       std::move(kill_limit_multipler), std::move(cpu_concurrency), metrics)
+       std::move(kill_limit_multipler), std::move(cpu_concurrency), std::move(preemptive_abort_factor), metrics)
    {}

virtual ~reader_concurrency_semaphore();
@@ -70,7 +70,8 @@ reader_concurrency_semaphore& reader_concurrency_semaphore_group::add_or_update(
    _max_queue_length,
    _serialize_limit_multiplier,
    _kill_limit_multiplier,
-   _cpu_concurrency
+   _cpu_concurrency,
+   _preemptive_abort_factor
);
auto&& it = result.first;
// since we serialize all group changes, this change will be queued and no further operations
@@ -26,6 +26,7 @@ class reader_concurrency_semaphore_group {
    utils::updateable_value<uint32_t> _serialize_limit_multiplier;
    utils::updateable_value<uint32_t> _kill_limit_multiplier;
    utils::updateable_value<uint32_t> _cpu_concurrency;
+   utils::updateable_value<float> _preemptive_abort_factor;

    friend class database_test_wrapper;

@@ -36,11 +37,12 @@ class reader_concurrency_semaphore_group {
    weighted_reader_concurrency_semaphore(size_t shares, int count, sstring name, size_t max_queue_length,
        utils::updateable_value<uint32_t> serialize_limit_multiplier,
        utils::updateable_value<uint32_t> kill_limit_multiplier,
-       utils::updateable_value<uint32_t> cpu_concurrency)
+       utils::updateable_value<uint32_t> cpu_concurrency,
+       utils::updateable_value<float> preemptive_abort_factor)
        : weight(shares)
        , memory_share(0)
        , sem(utils::updateable_value(count), 0, name, max_queue_length, std::move(serialize_limit_multiplier), std::move(kill_limit_multiplier),
-           std::move(cpu_concurrency), reader_concurrency_semaphore::register_metrics::yes) {}
+           std::move(cpu_concurrency), std::move(preemptive_abort_factor), reader_concurrency_semaphore::register_metrics::yes) {}
};

std::unordered_map<scheduling_group, weighted_reader_concurrency_semaphore> _semaphores;

@@ -54,6 +56,7 @@ public:
    utils::updateable_value<uint32_t> serialize_limit_multiplier,
    utils::updateable_value<uint32_t> kill_limit_multiplier,
    utils::updateable_value<uint32_t> cpu_concurrency,
+   utils::updateable_value<float> preemptive_abort_factor,
    std::optional<sstring> name_prefix = std::nullopt)
    : _total_memory(memory)
    , _total_weight(0)

@@ -62,6 +65,7 @@ public:
    , _serialize_limit_multiplier(std::move(serialize_limit_multiplier))
    , _kill_limit_multiplier(std::move(kill_limit_multiplier))
    , _cpu_concurrency(std::move(cpu_concurrency))
+   , _preemptive_abort_factor(std::move(preemptive_abort_factor))
    , _operations_serializer(1)
    , _name_prefix(std::move(name_prefix)) { }
@@ -92,6 +92,7 @@ public:
    active_await,
    inactive,
    evicted,
+   preemptive_aborted,
};

class impl;
@@ -412,6 +412,7 @@ database::database(const db::config& cfg, database_config dbcfg, service::migrat
    utils::updateable_value(std::numeric_limits<uint32_t>::max()),
    utils::updateable_value(std::numeric_limits<uint32_t>::max()),
    utils::updateable_value(uint32_t(1)),
+   utils::updateable_value(0.0f),
    reader_concurrency_semaphore::register_metrics::yes)
// No limits, just for accounting.
, _compaction_concurrency_sem(reader_concurrency_semaphore::no_limits{}, "compaction", reader_concurrency_semaphore::register_metrics::no)

@@ -423,6 +424,8 @@ database::database(const db::config& cfg, database_config dbcfg, service::migrat
    std::numeric_limits<size_t>::max(),
    utils::updateable_value(std::numeric_limits<uint32_t>::max()),
    utils::updateable_value(std::numeric_limits<uint32_t>::max()),
+   utils::updateable_value(uint32_t(1)),
+   utils::updateable_value(0.0f),
    reader_concurrency_semaphore::register_metrics::yes)
, _view_update_read_concurrency_semaphores_group(
    max_memory_concurrent_view_update_reads(),

@@ -431,6 +434,7 @@ database::database(const db::config& cfg, database_config dbcfg, service::migrat
    _cfg.view_update_reader_concurrency_semaphore_serialize_limit_multiplier,
    _cfg.view_update_reader_concurrency_semaphore_kill_limit_multiplier,
    _cfg.view_update_reader_concurrency_semaphore_cpu_concurrency,
+   utils::updateable_value(0.0f),
    "view_update")
, _row_cache_tracker(_cfg.index_cache_fraction.operator utils::updateable_value<double>(), cache_tracker::register_metrics::yes)
, _apply_stage("db_apply", &database::do_apply)

@@ -460,7 +464,8 @@ database::database(const db::config& cfg, database_config dbcfg, service::migrat
, _reader_concurrency_semaphores_group(max_memory_concurrent_reads(), max_count_concurrent_reads, max_inactive_queue_length(),
    _cfg.reader_concurrency_semaphore_serialize_limit_multiplier,
    _cfg.reader_concurrency_semaphore_kill_limit_multiplier,
-   _cfg.reader_concurrency_semaphore_cpu_concurrency)
+   _cfg.reader_concurrency_semaphore_cpu_concurrency,
+   _cfg.reader_concurrency_semaphore_preemptive_abort_factor)
, _stop_barrier(std::move(barrier))
, _update_memtable_flush_static_shares_action([this, &cfg] { return _memtable_controller.update_static_shares(cfg.memtable_flush_static_shares()); })
, _memtable_flush_static_shares_observer(cfg.memtable_flush_static_shares.observe(_update_memtable_flush_static_shares_action.make_observer()))
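The factor arrives as a utils::updateable_value, so it tracks the live configuration option. A hedged sketch of why that matters (simplified stand-in type, not the real utils implementation):

    // Simplified stand-in for utils::updateable_value<T>, illustrating why
    // the factor is read via operator() on every admission pass: the handle
    // always yields the current value of the config option, so
    // reader_concurrency_semaphore_preemptive_abort_factor can be changed
    // at runtime without rebuilding the semaphore.
    template <typename T>
    struct updateable_value_sketch {
        const T* current; // points at storage owned by the config system
        T operator()() const { return *current; }
    };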
@@ -10,7 +10,7 @@
#include "memtable.hh"
#include "replica/database.hh"
#include "mutation/frozen_mutation.hh"
-#include "partition_snapshot_reader.hh"
+#include "replica/partition_snapshot_reader.hh"
#include "partition_builder.hh"
#include "mutation/mutation_partition_view.hh"
#include "readers/empty.hh"
@@ -19,7 +19,7 @@

namespace replica {

-static mutation_reader make_partition_snapshot_flat_reader_from_snp_schema(
+static mutation_reader make_partition_snapshot_reader_from_snp_schema(
    bool is_reversed,
    reader_permit permit,
    dht::decorated_key dk,

@@ -482,7 +482,7 @@ public:
    auto cr = query::clustering_key_filter_ranges::get_ranges(*schema(), _slice, key_and_snp->first.key());
    bool digest_requested = _slice.options.contains<query::partition_slice::option::with_digest>();
    bool is_reversed = _slice.is_reversed();
-   _delegate = make_partition_snapshot_flat_reader_from_snp_schema(is_reversed, _permit, std::move(key_and_snp->first), std::move(cr), std::move(key_and_snp->second), digest_requested, region(), read_section(), mtbl(), streamed_mutation::forwarding::no, *mtbl());
+   _delegate = make_partition_snapshot_reader_from_snp_schema(is_reversed, _permit, std::move(key_and_snp->first), std::move(cr), std::move(key_and_snp->second), digest_requested, region(), read_section(), mtbl(), streamed_mutation::forwarding::no, *mtbl());
    _delegate->upgrade_schema(schema());
} else {
    _end_of_stream = true;

@@ -604,7 +604,7 @@ public:
    }
};

-static mutation_reader make_partition_snapshot_flat_reader_from_snp_schema(
+static mutation_reader make_partition_snapshot_reader_from_snp_schema(
    bool is_reversed,
    reader_permit permit,
    dht::decorated_key dk,

@@ -617,10 +617,10 @@ static mutation_reader make_partition_snapshot_flat_reader_from_snp_schema(
    streamed_mutation::forwarding fwd, memtable& memtable) {
    if (is_reversed) {
        schema_ptr rev_snp_schema = snp->schema()->make_reversed();
-       return make_partition_snapshot_flat_reader<true, partition_snapshot_read_accounter>(std::move(rev_snp_schema), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
+       return make_partition_snapshot_reader<true, partition_snapshot_read_accounter>(std::move(rev_snp_schema), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
    } else {
        schema_ptr snp_schema = snp->schema();
-       return make_partition_snapshot_flat_reader<false, partition_snapshot_read_accounter>(std::move(snp_schema), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
+       return make_partition_snapshot_reader<false, partition_snapshot_read_accounter>(std::move(snp_schema), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
    }
}

@@ -660,7 +660,7 @@ private:
    update_last(key_and_snp->first);
    auto cr = query::clustering_key_filter_ranges::get_ranges(*schema(), schema()->full_slice(), key_and_snp->first.key());
    auto snp_schema = key_and_snp->second->schema();
-   _partition_reader = make_partition_snapshot_flat_reader<false, partition_snapshot_flush_accounter>(snp_schema, _permit, std::move(key_and_snp->first), std::move(cr),
+   _partition_reader = make_partition_snapshot_reader<false, partition_snapshot_flush_accounter>(snp_schema, _permit, std::move(key_and_snp->first), std::move(cr),
        std::move(key_and_snp->second), false, region(), read_section(), mtbl(), streamed_mutation::forwarding::no, *snp_schema, _flushed_memory);
    _partition_reader->upgrade_schema(schema());
}

@@ -737,7 +737,7 @@ memtable::make_mutation_reader_opt(schema_ptr query_schema,
    auto dk = pos.as_decorated_key();
    auto cr = query::clustering_key_filter_ranges::get_ranges(*query_schema, slice, dk.key());
    bool digest_requested = slice.options.contains<query::partition_slice::option::with_digest>();
-   auto rd = make_partition_snapshot_flat_reader_from_snp_schema(is_reversed, std::move(permit), std::move(dk), std::move(cr), std::move(snp), digest_requested, *this, _table_shared_data.read_section, shared_from_this(), fwd, *this);
+   auto rd = make_partition_snapshot_reader_from_snp_schema(is_reversed, std::move(permit), std::move(dk), std::move(cr), std::move(snp), digest_requested, *this, _table_shared_data.read_section, shared_from_this(), fwd, *this);
    rd.upgrade_schema(query_schema);
    return rd;
} else {
@@ -9,9 +9,7 @@

#pragma once

#include "mutation/partition_version.hh"
-#include "readers/mutation_reader_fwd.hh"
#include "readers/mutation_reader.hh"
#include "readers/range_tombstone_change_merger.hh"
#include "keys/clustering_key_filter.hh"
#include "query/query-request.hh"
#include "db/partition_snapshot_row_cursor.hh"

@@ -19,8 +17,10 @@

extern seastar::logger mplog;

+namespace replica {
+
template <bool Reversing, typename Accounter>
-class partition_snapshot_flat_reader : public mutation_reader::impl, public Accounter {
+class partition_snapshot_reader : public mutation_reader::impl, public Accounter {
    struct row_info {
        mutation_fragment_v2 row;
        tombstone rt_for_row;

@@ -232,7 +232,7 @@ private:
    }
public:
    template <typename... Args>
-   partition_snapshot_flat_reader(schema_ptr s, reader_permit permit, dht::decorated_key dk, partition_snapshot_ptr snp,
+   partition_snapshot_reader(schema_ptr s, reader_permit permit, dht::decorated_key dk, partition_snapshot_ptr snp,
        query::clustering_key_filter_ranges crr, bool digest_requested,
        logalloc::region& region, logalloc::allocating_section& read_section,
        std::any pointer_to_container, Args&&... args)

@@ -285,7 +285,7 @@ public:

template <bool Reversing, typename Accounter, typename... Args>
inline mutation_reader
-make_partition_snapshot_flat_reader(schema_ptr s,
+make_partition_snapshot_reader(schema_ptr s,
    reader_permit permit,
    dht::decorated_key dk,
    query::clustering_key_filter_ranges crr,

@@ -297,7 +297,7 @@ make_partition_snapshot_flat_reader(schema_ptr s,
    streamed_mutation::forwarding fwd,
    Args&&... args)
{
-   auto res = make_mutation_reader<partition_snapshot_flat_reader<Reversing, Accounter>>(std::move(s), std::move(permit), std::move(dk),
+   auto res = make_mutation_reader<partition_snapshot_reader<Reversing, Accounter>>(std::move(s), std::move(permit), std::move(dk),
        snp, std::move(crr), digest_requested, region, read_section, std::move(pointer_to_container), std::forward<Args>(args)...);
    if (fwd) {
        return make_forwardable(std::move(res)); // FIXME: optimize

@@ -305,3 +305,5 @@ make_partition_snapshot_flat_reader(schema_ptr s,
    return res;
}
}

+} // namespace replica
replica/table.cc
@@ -7,6 +7,7 @@
*/

#include <seastar/core/seastar.hh>
+#include <seastar/core/shard_id.hh>
#include <seastar/core/coroutine.hh>
#include <seastar/core/with_scheduling_group.hh>
#include <seastar/coroutine/maybe_yield.hh>

@@ -23,7 +24,6 @@
#include "replica/data_dictionary_impl.hh"
#include "replica/compaction_group.hh"
#include "replica/query_state.hh"
-#include "seastar/core/shard_id.hh"
#include "sstables/shared_sstable.hh"
#include "sstables/sstable_set.hh"
#include "sstables/sstables.hh"
@@ -1746,100 +1746,97 @@ table::seal_active_memtable(compaction_group& cg, flush_permit&& flush_permit) n
}

future<>
-table::try_flush_memtable_to_sstable(compaction_group& cg, lw_shared_ptr<memtable> old, sstable_write_permit&& permit) {
+table::try_flush_memtable_to_sstable(compaction_group& cg, lw_shared_ptr<memtable> old, sstable_write_permit&& permit_) {
    co_await utils::get_local_injector().inject("flush_memtable_to_sstable_wait", utils::wait_for_message(60s));

-   auto try_flush = [this, old = std::move(old), permit = make_lw_shared(std::move(permit)), &cg] () mutable -> future<> {
+   auto permit = make_lw_shared(std::move(permit_));
+   co_await coroutine::switch_to(_config.memtable_scheduling_group);
    // Note that due to our sharded architecture, it is possible that
    // in the face of a value change some shards will backup sstables
    // while others won't.
    //
    // This is, in theory, possible to mitigate through a rwlock.
    // However, this doesn't differ from the situation where all tables
    // are coming from a single shard and the toggle happens in the
    // middle of them.
    //
    // The code as is guarantees that we'll never partially backup a
    // single sstable, so that is enough of a guarantee.

    auto newtabs = std::vector<sstables::shared_sstable>();
    auto metadata = mutation_source_metadata{};
    metadata.min_timestamp = old->get_min_timestamp();
    metadata.max_timestamp = old->get_max_timestamp();
    auto estimated_partitions = _compaction_strategy.adjust_partition_estimate(metadata, old->partition_count(), _schema);

    if (!cg.async_gate().is_closed()) {
        co_await _compaction_manager.maybe_wait_for_sstable_count_reduction(cg.view_for_unrepaired_data());
    }

    auto consumer = _compaction_strategy.make_interposer_consumer(metadata, [this, old, permit, &newtabs, estimated_partitions, &cg] (mutation_reader reader) mutable -> future<> {
        std::exception_ptr ex;
        try {
            sstables::sstable_writer_config cfg = get_sstables_manager().configure_writer("memtable");
            cfg.backup = incremental_backups_enabled();

            auto newtab = make_sstable();
            newtabs.push_back(newtab);
            tlogger.debug("Flushing to {}", newtab->get_filename());

            auto monitor = database_sstable_write_monitor(permit, newtab, cg,
                old->get_max_timestamp());

            co_return co_await write_memtable_to_sstable(std::move(reader), *old, newtab, estimated_partitions, monitor, cfg);
        } catch (...) {
            ex = std::current_exception();
        }
        co_await reader.close();
        co_await coroutine::return_exception_ptr(std::move(ex));
    });

    auto f = consumer(old->make_flush_reader(
        old->schema(),
        compaction_concurrency_semaphore().make_tracking_only_permit(old->schema(), "try_flush_memtable_to_sstable()", db::no_timeout, {})));

    // Switch back to default scheduling group for post-flush actions, to avoid them being starved by the memtable flush
    // controller. Cache update does not affect the input of the memtable cpu controller, so it can be subject to
    // priority inversion.
-   co_await coroutine::switch_to(default_scheduling_group());
+   auto post_flush = [this, old = std::move(old), &newtabs, f = std::move(f), &cg] () mutable -> future<> {
    try {
        co_await std::move(f);
        co_await coroutine::parallel_for_each(newtabs, [] (auto& newtab) -> future<> {
            co_await newtab->open_data();
            tlogger.debug("Flushing to {} done", newtab->get_filename());
        });

        co_await with_scheduling_group(_config.memtable_to_cache_scheduling_group, [this, old, &newtabs, &cg] {
            return update_cache(cg, old, newtabs);
        });

        co_await utils::get_local_injector().inject("replica_post_flush_after_update_cache", [this] (auto& handler) -> future<> {
            const auto this_table_name = format("{}.{}", _schema->ks_name(), _schema->cf_name());
            if (this_table_name == handler.get("table_name")) {
                tlogger.info("error injection handler replica_post_flush_after_update_cache: suspending flush for table {}", this_table_name);
                handler.set("suspended", true);
                co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::minutes{5});
                tlogger.info("error injection handler replica_post_flush_after_update_cache: resuming flush for table {}", this_table_name);
            }
        });

        cg.memtables()->erase(old);
        tlogger.debug("Memtable for {}.{} replaced, into {} sstables", old->schema()->ks_name(), old->schema()->cf_name(), newtabs.size());
        co_return;
    } catch (const std::exception& e) {
        for (auto& newtab : newtabs) {
            newtab->mark_for_deletion();
            tlogger.error("failed to write sstable {}: {}", newtab->get_filename(), e);
        }
        _config.cf_stats->failed_memtables_flushes_count++;
        // If we failed this write we will try the write again and that will create a new flush reader
        // that will decrease dirty memory again. So we need to reset the accounting.
        old->revert_flushed_memory();
        throw;
    }
    };
-   co_return co_await with_scheduling_group(_config.memtable_scheduling_group, std::ref(try_flush));
+   co_return co_await with_scheduling_group(default_scheduling_group(), std::ref(post_flush));
}

void
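The restructure swaps where the lambda sits: the flush path enters the memtable group with a plain switch_to, and only the post-flush tail runs as a unit under the default group. A hedged seastar-style sketch of the same shape (do_flush and do_post_flush are hypothetical):

    // Hypothetical shape, assuming seastar's coroutine::switch_to and
    // with_scheduling_group as used in the hunk above.
    seastar::future<> flush_like(seastar::scheduling_group flush_sg) {
        // Everything from here on is billed to flush_sg (the memtable group).
        co_await seastar::coroutine::switch_to(flush_sg);
        co_await do_flush();
        // The tail runs under the default group, so the memtable flush
        // controller cannot starve the post-flush bookkeeping.
        auto post_flush = [] () -> seastar::future<> {
            co_await do_post_flush();
        };
        co_return co_await seastar::with_scheduling_group(seastar::default_scheduling_group(), std::ref(post_flush));
    }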
@@ -98,16 +98,6 @@ future<> service::client_state::has_column_family_access(const sstring& ks,
    co_return co_await has_access(ks, {p, r, t}, is_vector_indexed);
}

-future<> service::client_state::has_schema_access(const schema& s, auth::permission p) const {
-    auth::resource r = auth::make_data_resource(s.ks_name(), s.cf_name());
-    co_return co_await has_access(s.ks_name(), {p, r});
-}
-
-future<> service::client_state::has_schema_access(const sstring& ks_name, const sstring& cf_name, auth::permission p) const {
-    auth::resource r = auth::make_data_resource(ks_name, cf_name);
-    co_return co_await has_access(ks_name, {p, r});
-}
-
future<> service::client_state::check_internal_table_permissions(std::string_view ks, std::string_view table_name, const auth::command_desc& cmd) const {
    // 1. CDC and $paxos tables are managed internally by Scylla. Users are prohibited
    // from running ALTER or DROP commands on them.

@@ -227,8 +217,6 @@ future<> service::client_state::has_access(const sstring& ks, auth::command_desc
static const std::unordered_set<auth::resource> vector_search_system_resources = {
    auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::GROUP0_HISTORY),
    auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::VERSIONS),
-   auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::CDC_STREAMS),
-   auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::CDC_TIMESTAMPS),
};

if ((cmd.resource.kind() == auth::resource_kind::data && cmd.permission == auth::permission::SELECT && is_vector_indexed.has_value() && is_vector_indexed.value()) ||

@@ -365,4 +353,4 @@ future<> service::client_state::set_client_options(
    });
    _client_options.emplace_back(std::move(cached_key), std::move(cached_value));
}
}
}
@@ -359,8 +359,6 @@ public:
    future<> has_keyspace_access(const sstring&, auth::permission) const;
    future<> has_column_family_access(const sstring&, const sstring&, auth::permission,
        auth::command_desc::type = auth::command_desc::type::OTHER, std::optional<bool> is_vector_indexed = std::nullopt) const;
-   future<> has_schema_access(const schema& s, auth::permission p) const;
-   future<> has_schema_access(const sstring&, const sstring&, auth::permission p) const;

    future<> has_functions_access(auth::permission p) const;
    future<> has_functions_access(const sstring& ks, auth::permission p) const;
@@ -338,7 +338,7 @@ future<> group0_state_machine::merge_and_apply(group0_state_machine_merger& merg
}

#ifndef SCYLLA_BUILD_MODE_RELEASE
-static void ensure_group0_schema(const group0_command& cmd, const replica::database& db) {
+static void ensure_group0_schema(const group0_command& cmd, data_dictionary::database db) {
    auto validate_schema = [&db](const utils::chunked_vector<canonical_mutation>& mutations) {
        for (const auto& mut : mutations) {
            // Get the schema for the column family

@@ -382,7 +382,7 @@ future<> group0_state_machine::apply(std::vector<raft::command_cref> command) {

    // max_mutation_size = 1/2 of commitlog segment size, thus max_command_size is set to 1/3 of commitlog segment size to leave space for metadata.
    size_t max_command_size = _sp.data_dictionary().get_config().commitlog_segment_size_in_mb() * 1024 * 1024 / 3;
-   group0_state_machine_merger m(co_await _client.sys_ks().get_last_group0_state_id(), std::move(read_apply_mutex_holder),
+   group0_state_machine_merger m(co_await _client.get_last_group0_state_id(), std::move(read_apply_mutex_holder),
        max_command_size, _sp.data_dictionary());

    for (auto&& c : command) {

@@ -392,7 +392,7 @@ future<> group0_state_machine::apply(std::vector<raft::command_cref> command) {
#ifndef SCYLLA_BUILD_MODE_RELEASE
    // Ensure that the schema of the mutations is a group0 schema.
    // This validation is supposed to be only performed in tests, so it is skipped in release mode.
-   ensure_group0_schema(cmd, _client.sys_ks().local_db());
+   ensure_group0_schema(cmd, _sp.data_dictionary());
#endif

    slogger.trace("cmd: prev_state_id: {}, new_state_id: {}, creator_addr: {}, creator_id: {}",
@@ -245,6 +245,10 @@ utils::UUID raft_group0_client::generate_group0_state_id(utils::UUID prev_state_
|
||||
return utils::UUID_gen::get_random_time_UUID_from_micros(std::chrono::microseconds{ts});
|
||||
}
|
||||
|
||||
future<utils::UUID> raft_group0_client::get_last_group0_state_id() {
|
||||
return _sys_ks.get_last_group0_state_id();
|
||||
}
|
||||
|
||||
future<group0_guard> raft_group0_client::start_operation(seastar::abort_source& as, std::optional<raft_timeout> timeout) {
|
||||
if (this_shard_id() != 0) {
|
||||
on_internal_error(logger, "start_group0_operation: must run on shard 0");
|
||||
@@ -282,7 +286,7 @@ future<group0_guard> raft_group0_client::start_operation(seastar::abort_source&
|
||||
// Read barrier may wait for `group0_state_machine::apply` which also takes this mutex.
|
||||
auto read_apply_holder = co_await hold_read_apply_mutex(as);
|
||||
|
||||
auto observed_group0_state_id = co_await _sys_ks.get_last_group0_state_id();
|
||||
auto observed_group0_state_id = co_await get_last_group0_state_id();
|
||||
auto new_group0_state_id = generate_group0_state_id(observed_group0_state_id);
|
||||
|
||||
co_return group0_guard {
|
||||
@@ -467,10 +471,6 @@ future<semaphore_units<>> raft_group0_client::hold_read_apply_mutex(abort_source
|
||||
return get_units(_read_apply_mutex, 1, as);
|
||||
}
|
||||
|
||||
db::system_keyspace& raft_group0_client::sys_ks() {
|
||||
return _sys_ks;
|
||||
}
|
||||
|
||||
bool raft_group0_client::in_recovery() const {
|
||||
return _upgrade_state == group0_upgrade_state::recovery;
|
||||
}
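hold_read_apply_mutex above is a thin wrapper around Seastar's semaphore units; here is a minimal sketch of the pattern, assuming only public Seastar APIs (the class and names are illustrative):

    #include <seastar/core/abort_source.hh>
    #include <seastar/core/coroutine.hh>
    #include <seastar/core/semaphore.hh>

    class guarded_state {
        seastar::semaphore _mutex{1}; // one unit => mutual exclusion
    public:
        seastar::future<> with_lock(seastar::abort_source& as) {
            // get_units() waits for the unit and releases it when `units`
            // goes out of scope; the abort_source lets shutdown cancel the wait.
            auto units = co_await seastar::get_units(_mutex, 1, as);
            // ... exclusive section ...
        }
    };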
@@ -200,8 +200,6 @@ public:

future<semaphore_units<>> hold_read_apply_mutex(abort_source&);

- db::system_keyspace& sys_ks();

bool in_recovery() const;

gc_clock::duration get_history_gc_duration() const;
@@ -212,6 +210,7 @@ public:
query_result_guard create_result_guard(utils::UUID query_id);
void set_query_result(utils::UUID query_id, service::broadcast_tables::query_result qr);
static utils::UUID generate_group0_state_id(utils::UUID prev_state_id);
+ future<utils::UUID> get_last_group0_state_id();
};

using mutations_generator = coroutine::experimental::generator<mutation>;

@@ -123,7 +123,12 @@ utils::small_vector<locator::host_id, N> addr_vector_to_id(const gms::gossiper&
// Check the effective replication map consistency:
// we have an inconsistent effective replication map in case the number of
// read replicas is higher than the replication factor.
- [[maybe_unused]] void validate_read_replicas(const locator::effective_replication_map& erm, const host_id_vector_replica_set& read_replicas) {
+ void validate_read_replicas(const locator::effective_replication_map& erm, const host_id_vector_replica_set& read_replicas) {
+ // Skip for non-debug builds.
+ if constexpr (!tools::build_info::is_debug_build()) {
+ return;
+ }

const sstring error = erm.get_replication_strategy().sanity_check_read_replicas(erm, read_replicas);
if (!error.empty()) {
on_internal_error(slogger, error);
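The `if constexpr` guard above compiles the whole check out of non-debug builds. A self-contained sketch of the pattern, with a stand-in for tools::build_info::is_debug_build():

    #include <cstdio>

    constexpr bool is_debug_build() { // stand-in for tools::build_info::is_debug_build()
    #ifdef NDEBUG
        return false;
    #else
        return true;
    #endif
    }

    void validate_read_replicas(int read_replicas, int replication_factor) {
        if constexpr (!is_debug_build()) {
            return; // release builds: the branch below is discarded at compile time
        }
        if (read_replicas > replication_factor) {
            std::fprintf(stderr, "inconsistent ERM: %d read replicas > RF %d\n",
                         read_replicas, replication_factor);
        }
    }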
@@ -3216,9 +3221,9 @@ storage_proxy::storage_proxy(sharded<replica::database>& db, storage_proxy::conf
, _write_ack_smp_service_group(cfg.write_ack_smp_service_group)
, _next_response_id(std::chrono::system_clock::now().time_since_epoch()/1ms)
, _hints_resource_manager(*this, cfg.available_memory / 10, _db.local().get_config().max_hinted_handoff_concurrency)
- , _hints_manager(*this, _db.local().get_config().hints_directory(), cfg.hinted_handoff_enabled, _db.local().get_config().max_hint_window_in_ms(), _hints_resource_manager, _db)
+ , _hints_manager(*this, _db.local().get_config().hints_directory(), cfg.hinted_handoff_enabled, _db.local().get_config().max_hint_window_in_ms(), _hints_resource_manager, _db, cfg.hints_sched_group)
, _hints_directory_initializer(std::move(cfg.hints_directory_initializer))
- , _hints_for_views_manager(*this, _db.local().get_config().view_hints_directory(), {}, _db.local().get_config().max_hint_window_in_ms(), _hints_resource_manager, _db)
+ , _hints_for_views_manager(*this, _db.local().get_config().view_hints_directory(), {}, _db.local().get_config().max_hint_window_in_ms(), _hints_resource_manager, _db, cfg.hints_sched_group)
, _stats_key(stats_key)
, _features(feat)
, _background_write_throttle_threahsold(cfg.available_memory / 10)
@@ -6967,12 +6972,7 @@ host_id_vector_replica_set storage_proxy::get_endpoints_for_reading(const schema
return host_id_vector_replica_set{my_host_id(erm)};
}
auto endpoints = erm.get_replicas_for_reading(token);
- // Skip for non-debug builds and maintenance mode.
- if constexpr (tools::build_info::is_debug_build()) {
- if (!_db.local().get_config().maintenance_mode()) {
- validate_read_replicas(erm, endpoints);
- }
- }
+ validate_read_replicas(erm, endpoints);
auto it = std::ranges::remove_if(endpoints, std::not_fn(std::bind_front(&storage_proxy::is_alive, this, std::cref(erm)))).begin();
endpoints.erase(it, endpoints.end());
sort_endpoints_by_proximity(erm, endpoints);
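The two lines before the sort are the C++20 ranges flavor of the erase-remove idiom; a minimal sketch of the same shape, with a stand-in liveness predicate:

    #include <algorithm>
    #include <functional>
    #include <vector>

    struct proxy {
        bool is_alive(int endpoint) const { return endpoint % 2 == 0; } // stand-in predicate
        void drop_dead(std::vector<int>& endpoints) const {
            // not_fn inverts the predicate; bind_front fixes `this` as the object
            auto it = std::ranges::remove_if(
                endpoints, std::not_fn(std::bind_front(&proxy::is_alive, this))).begin();
            endpoints.erase(it, endpoints.end());
        }
    };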
@@ -195,6 +195,7 @@ public:
// they need a separate smp_service_group to prevent an ABBA deadlock
// with writes.
smp_service_group write_ack_smp_service_group = default_smp_service_group();
+ scheduling_group hints_sched_group;
};
private:
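The new config member threads a dedicated Seastar scheduling group into the hints managers. A hedged sketch of the plumbing, assuming only public Seastar APIs (the names and share count are illustrative):

    #include <seastar/core/coroutine.hh>
    #include <seastar/core/future.hh>
    #include <seastar/core/scheduling.hh>

    struct proxy_cfg {
        seastar::scheduling_group hints_sched_group; // default-constructed == current group
    };

    seastar::future<proxy_cfg> make_proxy_cfg() {
        // Shares weigh this group against others when the CPU is contended.
        auto sg = co_await seastar::create_scheduling_group("hints", 100);
        co_return proxy_cfg{.hints_sched_group = sg};
    }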
@@ -532,16 +532,9 @@ future<> storage_service::raft_topology_update_ip(locator::host_id id, gms::inet
co_await when_all_succeed(sys_ks_futures.begin(), sys_ks_futures.end()).discard_result();
}

- static std::unordered_set<locator::host_id> get_released_nodes(const service::topology& topology, const locator::token_metadata& tm) {
- return boost::join(topology.left_nodes, topology.ignored_nodes)
- | std::views::transform([] (const auto& raft_id) { return locator::host_id(raft_id.uuid()); })
- | std::views::filter([&] (const auto& h) { return !tm.get_topology().has_node(h); })
- | std::ranges::to<std::unordered_set<locator::host_id>>();
- }

// Synchronizes the local node state (token_metadata, system.peers/system.local tables,
// gossiper) to align it with the other raft topology nodes.
- future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_topology_nodes(mutable_token_metadata_ptr tmptr, std::unordered_set<raft::server_id> prev_normal, std::optional<std::unordered_set<locator::host_id>> prev_released) {
+ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_topology_nodes(mutable_token_metadata_ptr tmptr, std::unordered_set<raft::server_id> prev_normal) {
nodes_to_notify_after_sync nodes_to_notify;

rtlogger.trace("Start sync_raft_topology_nodes");
@@ -695,10 +688,13 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
}
}

- if (prev_released) {
- auto nodes_to_release = get_released_nodes(t, *tmptr);
- std::erase_if(nodes_to_release, [&] (const auto& host_id) { return prev_released->contains(host_id); });
- std::copy(nodes_to_release.begin(), nodes_to_release.end(), std::back_inserter(nodes_to_notify.released));
+ auto nodes_to_release = t.left_nodes;
+ nodes_to_release.insert(t.ignored_nodes.begin(), t.ignored_nodes.end());
+ for (const auto& id: nodes_to_release) {
+ auto host_id = locator::host_id(id.uuid());
+ if (!tmptr->get_topology().find_node(host_id)) {
+ nodes_to_notify.released.push_back(host_id);
+ }
+ }

co_await when_all_succeed(sys_ks_futures.begin(), sys_ks_futures.end()).discard_result();
@@ -736,10 +732,6 @@ future<> storage_service::topology_state_load(state_change_hint hint) {

rtlogger.debug("reload raft topology state");
std::unordered_set<raft::server_id> prev_normal = _topology_state_machine._topology.normal_nodes | std::views::keys | std::ranges::to<std::unordered_set>();
- std::optional<std::unordered_set<locator::host_id>> prev_released;
- if (!_topology_state_machine._topology.is_empty()) {
- prev_released = get_released_nodes(_topology_state_machine._topology, get_token_metadata());
- }

std::unordered_set<locator::host_id> tablet_hosts = co_await replica::read_required_hosts(_qp);
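The prev_normal snapshot above uses a views-plus-ranges::to pipeline; the same shape in a tiny self-contained form (C++23):

    #include <map>
    #include <ranges>
    #include <string>
    #include <unordered_set>

    // Project the keys of a map and materialize them into a set, exactly as
    // the normal_nodes snapshot does before the topology state is reloaded.
    std::unordered_set<std::string> snapshot_keys(const std::map<std::string, int>& nodes) {
        return nodes | std::views::keys | std::ranges::to<std::unordered_set>();
    }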
@@ -840,7 +832,7 @@ future<> storage_service::topology_state_load(state_change_hint hint) {
}, topology.tstate);
tmptr->set_read_new(read_new);

- auto nodes_to_notify = co_await sync_raft_topology_nodes(tmptr, std::move(prev_normal), std::move(prev_released));
+ auto nodes_to_notify = co_await sync_raft_topology_nodes(tmptr, std::move(prev_normal));

std::optional<locator::tablet_metadata> tablets;
if (hint.tablets_hint) {
@@ -8439,7 +8431,6 @@ future<> storage_service::start_maintenance_mode() {
set_mode(mode::MAINTENANCE);

return mutate_token_metadata([this] (mutable_token_metadata_ptr token_metadata) -> future<> {
token_metadata->update_topology(my_host_id(), _snitch.local()->get_location(), locator::node::state::normal, smp::count);
return token_metadata->update_normal_tokens({ dht::token{} }, my_host_id());
}, acquire_merge_lock::yes);
}

@@ -1115,7 +1115,7 @@ private:
// gossiper) to align it with the other raft topology nodes.
// Optional target_node can be provided to restrict the synchronization to the specified node.
// Returns a structure that describes which notifications to trigger after token metadata is updated.
- future<nodes_to_notify_after_sync> sync_raft_topology_nodes(mutable_token_metadata_ptr tmptr, std::unordered_set<raft::server_id> prev_normal, std::optional<std::unordered_set<locator::host_id>> prev_released);
+ future<nodes_to_notify_after_sync> sync_raft_topology_nodes(mutable_token_metadata_ptr tmptr, std::unordered_set<raft::server_id> prev_normal);
// Triggers notifications (on_joined, on_left) based on the recent changes to token metadata, as described by the passed in structure.
// This function should be called on the result of `sync_raft_topology_nodes`, after the global token metadata is updated.
future<> notify_nodes_after_sync(nodes_to_notify_after_sync&& nodes_to_notify);

@@ -951,6 +951,8 @@ private:
return true;
case tablet_transition_stage::write_both_read_old:
return true;
+ case tablet_transition_stage::write_both_read_old_fallback_cleanup:
+ return false;
case tablet_transition_stage::streaming:
return true;
case tablet_transition_stage::rebuild_repair:
@@ -3860,6 +3862,7 @@ class tablet_allocator_impl : public tablet_allocator::impl
service::migration_notifier& _migration_notifier;
replica::database& _db;
load_balancer_stats_manager _load_balancer_stats;
+ scheduling_group _background;
bool _stopped = false;
bool _use_tablet_aware_balancing = true;
locator::load_stats_ptr _load_stats;
@@ -3881,7 +3884,9 @@ public:
tablet_allocator_impl(tablet_allocator::config cfg, service::migration_notifier& mn, replica::database& db)
: _migration_notifier(mn)
, _db(db)
- , _load_balancer_stats("load_balancer") {
+ , _load_balancer_stats("load_balancer")
+ , _background(cfg.background_sg)
+ {
_migration_notifier.register_listener(this);
}

@@ -3898,7 +3903,7 @@ public:

future<migration_plan> balance_tablets(token_metadata_ptr tm, service::topology* topology, db::system_keyspace* sys_ks, locator::load_stats_ptr table_load_stats, std::unordered_set<host_id> skiplist) {
auto lb = make_load_balancer(tm, topology, sys_ks, table_load_stats ? table_load_stats : _load_stats, std::move(skiplist));
- co_await coroutine::switch_to(_db.get_streaming_scheduling_group());
+ co_await coroutine::switch_to(_background);
co_return co_await lb.make_plan();
}
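balance_tablets now switches to the configured background group instead of the streaming group. A minimal sketch of seastar::coroutine::switch_to, which the diff itself relies on:

    #include <seastar/core/coroutine.hh>
    #include <seastar/core/scheduling.hh>
    #include <seastar/coroutine/switch_to.hh>

    seastar::future<int> plan_in_background(seastar::scheduling_group background) {
        // Everything after this co_await is accounted to `background`, so the
        // planning work no longer competes inside the caller's group.
        co_await seastar::coroutine::switch_to(background);
        int plan = 42; // placeholder for the actual planning work
        co_return plan;
    }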
@@ -257,6 +257,7 @@ class migration_notifier;
class tablet_allocator {
public:
struct config {
+ scheduling_group background_sg;
};
class impl {
public:

@@ -1539,7 +1539,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
case locator::tablet_transition_stage::allow_write_both_read_old:
if (action_failed(tablet_state.barriers[trinfo.stage])) {
if (check_excluded_replicas()) {
- transition_to_with_barrier(locator::tablet_transition_stage::cleanup_target);
+ transition_to(locator::tablet_transition_stage::cleanup_target);
break;
}
}
@@ -1560,7 +1560,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
case locator::tablet_transition_stage::write_both_read_old:
if (action_failed(tablet_state.barriers[trinfo.stage])) {
if (check_excluded_replicas()) {
- transition_to_with_barrier(locator::tablet_transition_stage::cleanup_target);
+ transition_to(locator::tablet_transition_stage::cleanup_target);
break;
}
}
@@ -1570,17 +1570,18 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
transition_to_with_barrier(locator::tablet_transition_stage::streaming);
}
break;
+ case locator::tablet_transition_stage::write_both_read_old_fallback_cleanup:
+ transition_to_with_barrier(locator::tablet_transition_stage::cleanup_target);
+ break;
case locator::tablet_transition_stage::rebuild_repair: {
if (action_failed(tablet_state.rebuild_repair)) {
bool fail = utils::get_local_injector().enter("rebuild_repair_stage_fail");
if (fail || check_excluded_replicas()) {
- if (do_barrier()) {
- rtlogger.debug("Will set tablet {} stage to {}", gid, locator::tablet_transition_stage::cleanup_target);
- updates.emplace_back(get_mutation_builder()
- .set_stage(last_token, locator::tablet_transition_stage::cleanup_target)
- .del_session(last_token)
- .build());
- }
+ rtlogger.debug("Will set tablet {} stage to {}", gid, locator::tablet_transition_stage::cleanup_target);
+ updates.emplace_back(get_mutation_builder()
+ .set_stage(last_token, locator::tablet_transition_stage::cleanup_target)
+ .del_session(last_token)
+ .build());
break;
}
}
@@ -1653,13 +1654,11 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
}

if (rollback) {
- if (do_barrier()) {
- rtlogger.debug("Will set tablet {} stage to {}: {}", gid, locator::tablet_transition_stage::cleanup_target, *rollback);
- updates.emplace_back(get_mutation_builder()
- .set_stage(last_token, locator::tablet_transition_stage::cleanup_target)
- .del_session(last_token)
- .build());
- }
+ rtlogger.debug("Will set tablet {} stage to {}: {}", gid, locator::tablet_transition_stage::cleanup_target, *rollback);
+ updates.emplace_back(get_mutation_builder()
+ .set_stage(last_token, locator::tablet_transition_stage::cleanup_target)
+ .del_session(last_token)
+ .build());
break;
}
}
@@ -1696,7 +1695,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
_exit(1);
});

- auto next_stage = locator::tablet_transition_stage::use_new;
if (action_failed(tablet_state.barriers[trinfo.stage])) {
auto& tinfo = tmap.get_tablet_info(gid.tablet);
unsigned excluded_old = 0;
@@ -1718,10 +1716,15 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
// than excluded_old for intra-node migration.
if (excluded_new > excluded_old && trinfo.transition != locator::tablet_transition_kind::intranode_migration) {
rtlogger.debug("During {} stage of {} {} new nodes and {} old nodes were excluded", trinfo.stage, gid, excluded_new, excluded_old);
- next_stage = locator::tablet_transition_stage::cleanup_target;
+ if (_feature_service.tablets_intermediate_fallback_cleanup) {
+ transition_to(locator::tablet_transition_stage::write_both_read_old_fallback_cleanup);
+ } else {
+ transition_to_with_barrier(locator::tablet_transition_stage::cleanup_target);
+ }
break;
}
}
- transition_to_with_barrier(next_stage);
+ transition_to_with_barrier(locator::tablet_transition_stage::use_new);
}
break;
case locator::tablet_transition_stage::use_new:
@@ -3873,9 +3876,6 @@ future<> topology_coordinator::refresh_tablet_load_stats() {
for (auto& [table_id, table_stats] : dc_stats.tables) {
co_await coroutine::maybe_yield();

- if (!_db.column_family_exists(table_id)) {
- continue;
- }
auto& t = _db.find_column_family(table_id);
auto& rs = t.get_effective_replication_map()->get_replication_strategy();
if (!rs.uses_tablets()) {
@@ -3899,9 +3899,6 @@ }
}

for (auto& [table_id, table_load_stats] : stats.tables) {
- if (!total_replicas.contains(table_id)) {
- continue;
- }
auto table_total_replicas = total_replicas.at(table_id);
if (table_total_replicas == 0) {
continue;

@@ -8,6 +8,7 @@

#include <seastar/core/coroutine.hh>
#include <seastar/core/iostream.hh>
+ #include <seastar/util/memory-data-source.hh>
#include "partition_reversing_data_source.hh"
#include "reader_permit.hh"
#include "sstables/consumer.hh"
@@ -15,7 +16,6 @@
#include "sstables/shared_sstable.hh"
#include "sstables/sstables.hh"
#include "sstables/types.hh"
- #include "utils/buffer_input_stream.hh"
#include "utils/to_string.hh"

namespace sstables {
@@ -417,7 +417,7 @@ private:
}
_current_read_size = std::min(max_read_size, _current_read_size * 2);
}
- co_return make_buffer_input_stream(_cached_read.share(_cached_read.size() - row_size, row_size));
+ co_return seastar::util::as_input_stream(_cached_read.share(_cached_read.size() - row_size, row_size));
}
temporary_buffer<char> last_row(size_t row_size) {
auto tmp = _cached_read.share(_cached_read.size() - row_size, row_size);
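The change above swaps a custom buffer stream for the stock seastar::util helper. A hedged sketch of the replacement, assuming the same header the diff adds:

    #include <seastar/core/iostream.hh>
    #include <seastar/core/temporary_buffer.hh>
    #include <seastar/util/memory-data-source.hh>

    seastar::input_stream<char> stream_over(seastar::temporary_buffer<char> buf) {
        // The buffer's refcount keeps the bytes alive for the stream's
        // lifetime, which is why share() is safe in the caller above.
        return seastar::util::as_input_stream(std::move(buf));
    }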
@@ -45,6 +45,8 @@ sstables_manager::sstables_manager(
std::numeric_limits<size_t>::max(),
utils::updateable_value(std::numeric_limits<uint32_t>::max()),
utils::updateable_value(std::numeric_limits<uint32_t>::max()),
utils::updateable_value(uint32_t(1)),
utils::updateable_value(0.0f),
reader_concurrency_semaphore::register_metrics::no)
, _dir_semaphore(dir_sem)
, _resolve_host_id(std::move(resolve_host_id))

@@ -95,6 +95,7 @@ public:
virtual future<file> open_component(const sstable& sst, component_type type, open_flags flags, file_open_options options, bool check_integrity) override;
virtual future<data_sink> make_data_or_index_sink(sstable& sst, component_type type) override;
future<data_source> make_data_or_index_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options opt) const override;
+ future<data_source> make_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options opt) const override;
virtual future<data_sink> make_component_sink(sstable& sst, component_type type, open_flags oflags, file_output_stream_options options) override;
virtual future<> destroy(const sstable& sst) override { return make_ready_future<>(); }
virtual future<atomic_delete_context> atomic_delete_prepare(const std::vector<shared_sstable>&) const override;
@@ -132,8 +133,12 @@ future<data_sink> filesystem_storage::make_data_or_index_sink(sstable& sst, comp
}
}

- future<data_source> filesystem_storage::make_data_or_index_source(sstable&, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options opt) const {
+ future<data_source> filesystem_storage::make_data_or_index_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options opt) const {
SCYLLA_ASSERT(type == component_type::Data || type == component_type::Index);
co_return co_await make_source(sst, type, std::move(f), offset, len, std::move(opt));
}

future<data_source> filesystem_storage::make_source(sstable&, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options opt) const {
co_return make_file_data_source(std::move(f), offset, len, std::move(opt));
}

@@ -615,6 +620,7 @@ public:
future<file> open_component(const sstable& sst, component_type type, open_flags flags, file_open_options options, bool check_integrity) override;
future<data_sink> make_data_or_index_sink(sstable& sst, component_type type) override;
future<data_source> make_data_or_index_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options opt) const override;
+ future<data_source> make_source(sstable& sst, component_type type, file, uint64_t offset, uint64_t len, file_input_stream_options) const override;

future<data_sink> make_component_sink(sstable& sst, component_type type, open_flags oflags, file_output_stream_options options) override;
future<> destroy(const sstable& sst) override {
@@ -657,6 +663,7 @@ public:
{}

future<data_source> make_data_or_index_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options opt) const override;
+ future<data_source> make_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options opt) const override;
};

object_name object_storage_base::make_object_name(const sstable& sst, component_type type) const {
@@ -742,13 +749,23 @@ future<data_sink> object_storage_base::make_data_or_index_sink(sstable& sst, com

future<data_source>
object_storage_base::make_data_or_index_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options options) const {
co_return co_await make_source(sst, type, f, offset, len, options);
}

future<data_source>
object_storage_base::make_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options) const {
co_return co_await maybe_wrap_source(sst, type, _client->make_download_source(make_object_name(sst, type), abort_source()), offset, len);
}

future<data_source>
s3_storage::make_data_or_index_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options options) const {
co_return co_await make_source(sst, type, std::move(f), offset, len, std::move(options));
}

future<data_source>
s3_storage::make_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options options) const {
if (offset == 0) {
- co_return co_await object_storage_base::make_data_or_index_source(sst, type, std::move(f), offset, len, std::move(options));
+ co_return co_await object_storage_base::make_source(sst, type, std::move(f), offset, len, std::move(options));
}
co_return make_file_data_source(
co_await maybe_wrap_file(sst, type, open_flags::ro, _client->make_readable_file(make_object_name(sst, type), abort_source())), offset, len, std::move(options));

@@ -107,6 +107,7 @@ public:
virtual future<file> open_component(const sstable& sst, component_type type, open_flags flags, file_open_options options, bool check_integrity) = 0;
virtual future<data_sink> make_data_or_index_sink(sstable& sst, component_type type) = 0;
virtual future<data_source> make_data_or_index_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options opt) const = 0;
+ virtual future<data_source> make_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options opt) const = 0;
virtual future<data_sink> make_component_sink(sstable& sst, component_type type, open_flags oflags, file_output_stream_options options) = 0;
virtual future<> destroy(const sstable& sst) = 0;
virtual future<atomic_delete_context> atomic_delete_prepare(const std::vector<shared_sstable>&) const = 0;

@@ -11,11 +11,13 @@
#include <seastar/core/map_reduce.hh>
#include <seastar/core/shared_ptr.hh>
#include <seastar/core/shared_mutex.hh>
#include <seastar/core/units.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include <seastar/coroutine/switch_to.hh>
#include <seastar/coroutine/parallel_for_each.hh>
#include <seastar/rpc/rpc.hh>
#include "sstables_loader.hh"
#include "dht/auto_refreshing_sharder.hh"
#include "replica/distributed_loader.hh"
#include "replica/database.hh"
#include "sstables/sstables_manager.hh"
@@ -178,11 +180,13 @@ private:
};

class tablet_sstable_streamer : public sstable_streamer {
+ sharded<replica::database>& _db;
const locator::tablet_map& _tablet_map;
public:
- tablet_sstable_streamer(netw::messaging_service& ms, replica::database& db, ::table_id table_id, locator::effective_replication_map_ptr erm,
+ tablet_sstable_streamer(netw::messaging_service& ms, sharded<replica::database>& db, ::table_id table_id, locator::effective_replication_map_ptr erm,
std::vector<sstables::shared_sstable> sstables, primary_replica_only primary, unlink_sstables unlink, stream_scope scope)
- : sstable_streamer(ms, db, table_id, std::move(erm), std::move(sstables), primary, unlink, scope)
+ : sstable_streamer(ms, db.local(), table_id, std::move(erm), std::move(sstables), primary, unlink, scope)
+ , _db(db)
, _tablet_map(_erm->get_token_metadata().tablets().get_tablet_map(table_id)) {
}

@@ -199,9 +203,139 @@ private:
return result;
}

- future<> stream_fully_contained_sstables(const dht::partition_range& pr, std::vector<sstables::shared_sstable> sstables, shared_ptr<stream_progress> progress) {
- // FIXME: fully contained sstables can be optimized.
- return stream_sstables(pr, std::move(sstables), std::move(progress));
+ struct minimal_sst_info {
+ sstables::generation_type _generation;
+ sstables::sstable_version_types _version;
+ sstables::sstable_format_types _format;
+ };
+ using sst_classification_info = std::vector<std::vector<minimal_sst_info>>;

+ future<> attach_sstable(shard_id from_shard, const sstring& ks, const sstring& cf, const minimal_sst_info& min_info) const {
+ llog.debug("Adding downloaded SSTables to the table {} on shard {}, submitted from shard {}", _table.schema()->cf_name(), this_shard_id(), from_shard);
+ auto& db = _db.local();
+ auto& table = db.find_column_family(ks, cf);
+ auto& sst_manager = table.get_sstables_manager();
+ auto sst = sst_manager.make_sstable(
+ table.schema(), table.get_storage_options(), min_info._generation, sstables::sstable_state::normal, min_info._version, min_info._format);
+ sst->set_sstable_level(0);
+ auto units = co_await sst_manager.dir_semaphore().get_units(1);
+ co_await sst->load(table.get_effective_replication_map()->get_sharder(*table.schema()));
+ co_await table.add_sstable_and_update_cache(sst);
+ }

+ future<>
+ stream_fully_contained_sstables(const dht::partition_range& pr, std::vector<sstables::shared_sstable> sstables, shared_ptr<stream_progress> progress) {
+ if (_stream_scope != stream_scope::node) {
+ co_return co_await stream_sstables(pr, std::move(sstables), std::move(progress));
+ }
+ llog.debug("Directly downloading {} fully contained SSTables to local node from object storage.", sstables.size());
+ auto downloaded_ssts = co_await download_fully_contained_sstables(std::move(sstables));

+ co_await smp::invoke_on_all(
+ [this, &downloaded_ssts, from = this_shard_id(), ks = _table.schema()->ks_name(), cf = _table.schema()->cf_name()] -> future<> {
+ auto shard_ssts = std::move(downloaded_ssts[this_shard_id()]);
+ for (const auto& min_info : shard_ssts) {
+ co_await attach_sstable(from, ks, cf, min_info);
+ }
+ });
+ if (progress) {
+ progress->advance(std::accumulate(downloaded_ssts.cbegin(), downloaded_ssts.cend(), 0., [](float acc, const auto& v) { return acc + v.size(); }));
+ }
+ }

+ future<sst_classification_info> download_fully_contained_sstables(std::vector<sstables::shared_sstable> sstables) const {
+ constexpr auto foptions = file_open_options{.extent_allocation_size_hint = 32_MiB, .sloppy_size = true};
+ constexpr auto stream_options = file_output_stream_options{.buffer_size = 128_KiB, .write_behind = 10};
+ sst_classification_info downloaded_sstables(smp::count);
+ for (const auto& sstable : sstables) {
+ auto components = sstable->all_components();

+ // Move the TOC to the front to be processed first since `sstables::create_stream_sink` takes care
+ // of creating a TemporaryTOC behind the scenes instead of the usual one. This ensures that in case of failure
+ // the partially created SSTable will be cleaned up properly at some point.
+ auto toc_it = std::ranges::find_if(components, [](const auto& component) { return component.first == component_type::TOC; });
+ if (toc_it != components.begin()) {
+ swap(*toc_it, components.front());
+ }

+ // Ensure the Scylla component is processed second.
+ //
+ // The sstable_sink->output() call for each component may invoke load_metadata()
+ // and save_metadata(), but these functions only operate correctly if the Scylla
+ // component file already exists on disk. If the Scylla component is written first,
+ // load_metadata()/save_metadata() become no-ops, leaving the original Scylla
+ // component (with outdated metadata) untouched.
+ //
+ // By placing the Scylla component second, we guarantee that:
+ // 1) The first component (TOC) is written and the Scylla component file already
+ //    exists on disk when subsequent output() calls happen.
+ // 2) Later output() calls will overwrite the Scylla component with the correct,
+ //    updated metadata.
+ //
+ // In short: Scylla must be written second so that all following output() calls
+ // can properly update its metadata instead of silently skipping it.
+ auto scylla_it = std::ranges::find_if(components, [](const auto& component) { return component.first == component_type::Scylla; });
+ if (scylla_it != std::next(components.begin())) {
+ swap(*scylla_it, *std::next(components.begin()));
+ }

+ auto gen = _table.get_sstable_generation_generator()();
+ auto files = co_await sstable->readable_file_for_all_components();
+ for (auto it = components.cbegin(); it != components.cend(); ++it) {
+ try {
+ auto descriptor = sstable->get_descriptor(it->first);
+ auto sstable_sink = sstables::create_stream_sink(
+ _table.schema(),
+ _table.get_sstables_manager(),
+ _table.get_storage_options(),
+ sstables::sstable_state::normal,
+ sstables::sstable::component_basename(
+ _table.schema()->ks_name(), _table.schema()->cf_name(), descriptor.version, gen, descriptor.format, it->first),
+ sstables::sstable_stream_sink_cfg{.last_component = std::next(it) == components.cend()});
+ auto out = co_await sstable_sink->output(foptions, stream_options);

+ input_stream src(co_await [this, &it, sstable, f = files.at(it->first)]() -> future<input_stream<char>> {
+ const auto fis_options = file_input_stream_options{.buffer_size = 128_KiB, .read_ahead = 2};

+ if (it->first != sstables::component_type::Data) {
+ co_return input_stream<char>(
+ co_await sstable->get_storage().make_source(*sstable, it->first, f, 0, std::numeric_limits<size_t>::max(), fis_options));
+ }
+ auto permit = co_await _db.local().obtain_reader_permit(_table, "download_fully_contained_sstables", db::no_timeout, {});
+ co_return co_await (
+ sstable->get_compression()
+ ? sstable->data_stream(0, sstable->ondisk_data_size(), std::move(permit), nullptr, nullptr, sstables::sstable::raw_stream::yes)
+ : sstable->data_stream(0, sstable->data_size(), std::move(permit), nullptr, nullptr, sstables::sstable::raw_stream::no));
+ }());

+ std::exception_ptr eptr;
+ try {
+ co_await seastar::copy(src, out);
+ } catch (...) {
+ eptr = std::current_exception();
+ llog.info("Error downloading SSTable component {}. Reason: {}", it->first, eptr);
+ }
+ co_await src.close();
+ co_await out.close();
+ if (eptr) {
+ co_await sstable_sink->abort();
+ std::rethrow_exception(eptr);
+ }
+ if (auto sst = co_await sstable_sink->close()) {
+ const auto& shards = sstable->get_shards_for_this_sstable();
+ if (shards.size() != 1) {
+ on_internal_error(llog, "Fully-contained sstable must belong to one shard only");
+ }
+ llog.debug("SSTable shards {}", fmt::join(shards, ", "));
+ downloaded_sstables[shards.front()].emplace_back(gen, descriptor.version, descriptor.format);
+ }
+ } catch (...) {
+ llog.info("Error downloading SSTable component {}. Reason: {}", it->first, std::current_exception());
+ throw;
+ }
+ }
+ }
+ co_return downloaded_sstables;
+ }

bool tablet_in_scope(locator::tablet_id) const;
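The download loop above uses a capture-close-rethrow shape so both streams are always closed before a failure propagates. A minimal self-contained sketch of that pattern, assuming seastar::copy from seastar/core/iostream.hh:

    #include <exception>
    #include <seastar/core/coroutine.hh>
    #include <seastar/core/iostream.hh>

    seastar::future<> copy_then_close(seastar::input_stream<char> src,
                                      seastar::output_stream<char> out) {
        std::exception_ptr eptr;
        try {
            co_await seastar::copy(src, out);
        } catch (...) {
            eptr = std::current_exception(); // remember, but keep going to close
        }
        co_await src.close();
        co_await out.close(); // also flushes buffered data on the success path
        if (eptr) {
            std::rethrow_exception(eptr); // surface the failure only after cleanup
        }
    }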
@@ -400,8 +534,14 @@ future<> tablet_sstable_streamer::stream(shared_ptr<stream_progress> progress) {
progress,
sstables_fully_contained.size() + sstables_partially_contained.size());
auto tablet_pr = dht::to_partition_range(tablet_range);
- co_await stream_sstables(tablet_pr, std::move(sstables_partially_contained), per_tablet_progress);
- co_await stream_fully_contained_sstables(tablet_pr, std::move(sstables_fully_contained), per_tablet_progress);
+ if (!sstables_partially_contained.empty()) {
+ llog.debug("Streaming {} partially contained SSTables.", sstables_partially_contained.size());
+ co_await stream_sstables(tablet_pr, std::move(sstables_partially_contained), per_tablet_progress);
+ }
+ if (!sstables_fully_contained.empty()) {
+ llog.debug("Streaming {} fully contained SSTables.", sstables_fully_contained.size());
+ co_await stream_fully_contained_sstables(tablet_pr, std::move(sstables_fully_contained), per_tablet_progress);
+ }
}
}
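The fully-contained path above fans per-shard work out with smp::invoke_on_all, each shard draining its own bucket. A hedged sketch of that fan-out shape with placeholder item types:

    #include <seastar/core/coroutine.hh>
    #include <seastar/core/future.hh>
    #include <seastar/core/smp.hh>
    #include <seastar/coroutine/maybe_yield.hh>
    #include <vector>

    // `buckets` must stay alive until the returned future resolves, as it does
    // in the download path above, which co_awaits the invoke_on_all directly.
    seastar::future<> attach_per_shard(std::vector<std::vector<int>>& buckets) {
        return seastar::smp::invoke_on_all([&buckets] () -> seastar::future<> {
            for (int item : buckets[seastar::this_shard_id()]) {
                (void)item; // attach/process `item` on this shard
                co_await seastar::coroutine::maybe_yield();
            }
        });
    }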
@@ -536,14 +676,6 @@ future<> sstable_streamer::stream_sstable_mutations(streaming::plan_id ops_uuid,
}
}

- template <typename... Args>
- static std::unique_ptr<sstable_streamer> make_sstable_streamer(bool uses_tablets, Args&&... args) {
- if (uses_tablets) {
- return std::make_unique<tablet_sstable_streamer>(std::forward<Args>(args)...);
- }
- return std::make_unique<sstable_streamer>(std::forward<Args>(args)...);
- }

future<locator::effective_replication_map_ptr> sstables_loader::await_topology_quiesced_and_get_erm(::table_id table_id) {
// By waiting for topology to quiesce, we guarantee load-and-stream will not start in the middle
// of a topology operation that changes the token range boundaries, e.g. split or merge.
@@ -581,9 +713,14 @@ future<> sstables_loader::load_and_stream(sstring ks_name, sstring cf_name,
// throughout its lifetime.
auto erm = co_await await_topology_quiesced_and_get_erm(table_id);

- auto streamer = make_sstable_streamer(_db.local().find_column_family(table_id).uses_tablets(),
- _messaging, _db.local(), table_id, std::move(erm), std::move(sstables),
- primary, unlink_sstables(unlink), scope);
+ std::unique_ptr<sstable_streamer> streamer;
+ if (_db.local().find_column_family(table_id).uses_tablets()) {
+ streamer =
+ std::make_unique<tablet_sstable_streamer>(_messaging, _db, table_id, std::move(erm), std::move(sstables), primary, unlink_sstables(unlink), scope);
+ } else {
+ streamer =
+ std::make_unique<sstable_streamer>(_messaging, _db.local(), table_id, std::move(erm), std::move(sstables), primary, unlink_sstables(unlink), scope);
+ }

co_await streamer->stream(progress);
}

@@ -18,6 +18,7 @@
#include "db/view/view_update_checks.hh"
#include "sstables/sstables.hh"
#include "sstables/sstables_manager.hh"
+ #include "debug.hh"

namespace streaming {

@@ -33,7 +34,7 @@ mutation_reader_consumer make_streaming_consumer(sstring origin,
return [&db, &vb = vb.container(), &vbw, estimated_partitions, reason, offstrategy, origin = std::move(origin), frozen_guard, on_sstable_written] (mutation_reader reader) -> future<> {
std::exception_ptr ex;
try {
- if (current_scheduling_group() != db.local().get_streaming_scheduling_group()) {
+ if (current_scheduling_group() != debug::streaming_scheduling_group) {
on_internal_error(sstables::sstlog, format("The stream consumer is not running in streaming group current_scheduling_group={}",
current_scheduling_group().name()));
}
test.py
@@ -61,11 +61,14 @@ PYTEST_RUNNER_DIRECTORIES = [
TEST_DIR / 'raft',
TEST_DIR / 'unit',
TEST_DIR / 'vector_search',
TEST_DIR / 'vector_search_validator',
TEST_DIR / 'alternator',
TEST_DIR / 'broadcast_tables',
TEST_DIR / 'cql',
TEST_DIR / 'cqlpy',
TEST_DIR / 'rest_api',
TEST_DIR / 'nodetool',
TEST_DIR / 'scylla_gdb',
]

launch_time = time.monotonic()
@@ -251,8 +254,6 @@ def parse_cmd_line() -> argparse.Namespace:
default=None, dest="pytest_arg",
help="Additional command line arguments to pass to pytest, for example ./test.py --pytest-arg=\"-v -x\"")
scylla_additional_options = parser.add_argument_group('Additional options for Scylla tests')
- scylla_additional_options.add_argument('--x-log2-compaction-groups', action="store", default="0", type=int,
- help="Controls number of compaction groups to be used by Scylla tests. Value of 3 implies 8 groups.")
scylla_additional_options.add_argument('--extra-scylla-cmdline-options', action="store", default="", type=str,
help="Passing extra scylla cmdline options for all tests. Options should be space separated:"
"'--logger-log-level raft=trace --default-log-level error'")
@@ -363,8 +364,6 @@ def run_pytest(options: argparse.Namespace) -> tuple[int, list[SimpleNamespace]]
args.extend(shlex.split(options.pytest_arg))
if options.random_seed:
args.append(f'--random-seed={options.random_seed}')
- if options.x_log2_compaction_groups:
- args.append(f'--x-log2-compaction-groups={options.x_log2_compaction_groups}')
if options.gather_metrics:
args.append('--gather-metrics')
if options.timeout:
@@ -103,6 +103,7 @@ if(BUILD_TESTING)
add_subdirectory(raft)
add_subdirectory(resource/wasm)
add_subdirectory(vector_search)
+ add_subdirectory(vector_search_validator)

if(CMAKE_CONFIGURATION_TYPES)
foreach(config ${CMAKE_CONFIGURATION_TYPES})

@@ -451,24 +451,6 @@ def test_update_table_non_existent(dynamodb, test_table):
with pytest.raises(ClientError, match='ResourceNotFoundException'):
client.update_table(TableName=random_string(20), BillingMode='PAY_PER_REQUEST')

- # Consistent schema change feature is optionally enabled and
- # some tests are expected to fail on Scylla without this
- # option enabled, and pass with it enabled (and also pass on Cassandra).
- # These tests should use the "fails_without_consistent_cluster_management"
- # fixture. When consistent mode becomes the default, this fixture can be removed.
- @pytest.fixture(scope="module")
- def check_pre_consistent_cluster_management(dynamodb):
- # If not running on Scylla, return false.
- if is_aws(dynamodb):
- return False
- consistent = scylla_config_read(dynamodb, 'consistent_cluster_management')
- return consistent is None or consistent == 'false'

- @pytest.fixture(scope="function")
- def fails_without_consistent_cluster_management(request, check_pre_consistent_cluster_management):
- if check_pre_consistent_cluster_management:
- request.node.add_marker(pytest.mark.xfail(reason='Test expected to fail without consistent cluster management feature on'))

# Test for reproducing issues #6391 and #9868 - where CreateTable did not
# *atomically* perform all the schema modifications - creating a keyspace,
# a table, secondary indexes and tags - and instead it created the different
@@ -526,7 +508,7 @@ def fails_without_consistent_cl
'Tags': [{'Key': 'k1', 'Value': 'v1'}]
}
])
- def test_concurrent_create_and_delete_table(dynamodb, table_def, fails_without_consistent_cluster_management):
+ def test_concurrent_create_and_delete_table(dynamodb, table_def):
# According to boto3 documentation, "Unlike Resources and Sessions,
# clients are generally thread-safe.". So because we have two threads
# in this test, we must not use "dynamodb" (containing the boto3
@@ -10,7 +10,7 @@
#include "sstables/consumer.hh"

#include "bytes.hh"
- #include "utils/buffer_input_stream.hh"
+ #include "test/lib/limiting_data_source.hh"
#include "test/lib/reader_concurrency_semaphore.hh"
#include "test/lib/random_utils.hh"
#include "schema/schema.hh"
@@ -20,12 +20,18 @@
#include <seastar/core/iostream.hh>
#include <seastar/core/temporary_buffer.hh>
#include <seastar/core/thread.hh>
+ #include <seastar/util/memory-data-source.hh>
#include "test/lib/scylla_test_case.hh"
#include <seastar/testing/thread_test_case.hh>
#include <random>

+ namespace {

+ input_stream<char> make_buffer_input_stream(temporary_buffer<char>&& buf, size_t limit) {
+ auto res = data_source{std::make_unique<seastar::util::temporary_buffer_data_source>(std::move(buf))};
+ return input_stream<char>{make_limiting_data_source(std::move(res), limit)};
+ }

class test_consumer final : public data_consumer::continuous_data_consumer<test_consumer> {
static const int MULTIPLIER = 10;
uint64_t _tested_value;
@@ -48,7 +54,7 @@ class test_consumer final : public data_consumer::continuous_data_consumer<test_
for (int i = 0; i < MULTIPLIER; ++i) {
pos += unsigned_vint::serialize(tested_value, out + pos);
}
- return make_buffer_input_stream(std::move(buf), [] {return 1;});
+ return make_buffer_input_stream(std::move(buf), 1);
}

public:
@@ -121,7 +127,7 @@ class skipping_consumer final : public data_consumer::continuous_data_consumer<s
std::memset(buf.get_write(), 'a', initial_data_size);
std::memset(buf.get_write() + initial_data_size, 'b', to_skip);
std::memset(buf.get_write() + initial_data_size + to_skip, 'a', next_data_size);
- return make_buffer_input_stream(std::move(buf), [] {return 1;});
+ return make_buffer_input_stream(std::move(buf), 1);
}
static size_t prepare_initial_consumer_length(int initial_data_size, int to_skip) {
// some bytes that we want to skip may end up even after the initial consumer range
@@ -391,31 +391,21 @@ SEASTAR_TEST_CASE(select_from_vector_search_system_table) {
return do_with_cql_env_thread(
[](auto&& env) {
create_user_if_not_exists(env, bob);

- // All tables in vector_search_system_resources from client_state.cc
- const std::vector<sstring> vector_search_system_tables = {
- "system.group0_history",
- "system.versions",
- "system.cdc_streams",
- "system.cdc_timestamps",
- };

- // Without VECTOR_SEARCH_INDEXING permission, bob cannot select from these tables
- for (const auto& table : vector_search_system_tables) {
- with_user(env, bob, [&env, &table] {
- BOOST_REQUIRE_EXCEPTION(env.execute_cql(format("SELECT * FROM {}", table)).get(), exceptions::unauthorized_exception,
- exception_predicate::message_contains("User bob has none of the permissions (VECTOR_SEARCH_INDEXING, SELECT) on"));
- });
- }

+ with_user(env, bob, [&env] {
+ BOOST_REQUIRE_EXCEPTION(env.execute_cql("SELECT * FROM system.group0_history").get(), exceptions::unauthorized_exception,
+ exception_predicate::message_contains("User bob has none of the permissions (VECTOR_SEARCH_INDEXING, SELECT) on"));
+ });
+ with_user(env, bob, [&env] {
+ BOOST_REQUIRE_EXCEPTION(env.execute_cql("SELECT * FROM system.versions").get(), exceptions::unauthorized_exception,
+ exception_predicate::message_contains("User bob has none of the permissions (VECTOR_SEARCH_INDEXING, SELECT) on"));
+ });
cquery_nofail(env, "GRANT VECTOR_SEARCH_INDEXING ON ALL KEYSPACES TO bob");

- // With VECTOR_SEARCH_INDEXING permission, bob can select from these tables
- for (const auto& table : vector_search_system_tables) {
- with_user(env, bob, [&env, &table] {
- cquery_nofail(env, format("SELECT * FROM {}", table));
- });
- }
+ with_user(env, bob, [&env] {
+ cquery_nofail(env, "SELECT * FROM system.group0_history");
+ });
+ with_user(env, bob, [&env] {
+ cquery_nofail(env, "SELECT * FROM system.versions");
+ });
},
db_config_with_auth());
}

@@ -7,9 +7,9 @@
*/

- #include "seastar/core/shard_id.hh"
#include <boost/test/tools/old/interface.hpp>
#include <seastar/core/seastar.hh>
+ #include <seastar/core/shard_id.hh>
#include <seastar/core/smp.hh>
#include <seastar/core/thread.hh>
#include <seastar/core/coroutine.hh>
||||
@@ -97,29 +97,12 @@ static future<> apply_mutation(sharded<replica::database>& sharded_db, table_id
|
||||
});
|
||||
}
|
||||
|
||||
future<> do_with_cql_env_and_compaction_groups_cgs(unsigned cgs, std::function<void(cql_test_env&)> func, cql_test_config cfg = {}, thread_attributes thread_attr = {}) {
|
||||
// clean the dir before running
|
||||
if (cfg.db_config->data_file_directories.is_set() && cfg.clean_data_dir_before_test) {
|
||||
co_await recursive_remove_directory(fs::path(cfg.db_config->data_file_directories()[0]));
|
||||
co_await recursive_touch_directory(cfg.db_config->data_file_directories()[0]);
|
||||
}
|
||||
// TODO: perhaps map log2_compaction_groups into initial_tablets when creating the testing keyspace.
|
||||
co_await do_with_cql_env_thread(func, cfg, thread_attr);
|
||||
}
|
||||
|
||||
future<> do_with_cql_env_and_compaction_groups(std::function<void(cql_test_env&)> func, cql_test_config cfg = {}, thread_attributes thread_attr = {}) {
|
||||
std::vector<unsigned> x_log2_compaction_group_values = { 0 /* 1 CG */ };
|
||||
for (auto x_log2_compaction_groups : x_log2_compaction_group_values) {
|
||||
co_await do_with_cql_env_and_compaction_groups_cgs(x_log2_compaction_groups, func, cfg, thread_attr);
|
||||
}
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE(database_test)
|
||||
|
||||
SEASTAR_TEST_CASE(test_safety_after_truncate) {
|
||||
auto cfg = make_shared<db::config>();
|
||||
cfg->auto_snapshot.set(false);
|
||||
return do_with_cql_env_and_compaction_groups([](cql_test_env& e) {
|
||||
return do_with_cql_env_thread([](cql_test_env& e) {
|
||||
e.execute_cql("create table ks.cf (k text, v int, primary key (k));").get();
|
||||
auto& db = e.local_db();
|
||||
sstring ks_name = "ks";
|
||||
@@ -180,7 +163,7 @@ SEASTAR_TEST_CASE(test_safety_after_truncate) {
|
||||
SEASTAR_TEST_CASE(test_truncate_without_snapshot_during_writes) {
|
||||
auto cfg = make_shared<db::config>();
|
||||
cfg->auto_snapshot.set(false);
|
||||
return do_with_cql_env_and_compaction_groups([] (cql_test_env& e) {
|
||||
return do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
sstring ks_name = "ks";
|
||||
sstring cf_name = "cf";
|
||||
e.execute_cql(fmt::format("create table {}.{} (k text, v int, primary key (k));", ks_name, cf_name)).get();
|
||||
@@ -218,7 +201,7 @@ SEASTAR_TEST_CASE(test_truncate_without_snapshot_during_writes) {
|
||||
SEASTAR_TEST_CASE(test_truncate_saves_replay_position) {
|
||||
auto cfg = make_shared<db::config>();
|
||||
cfg->auto_snapshot.set(false);
|
||||
return do_with_cql_env_and_compaction_groups([] (cql_test_env& e) {
|
||||
return do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
BOOST_REQUIRE_GT(smp::count, 1);
|
||||
const sstring ks_name = "ks";
|
||||
const sstring cf_name = "cf";
|
||||
@@ -236,7 +219,7 @@ SEASTAR_TEST_CASE(test_truncate_saves_replay_position) {
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_querying_with_limits) {
|
||||
return do_with_cql_env_and_compaction_groups([](cql_test_env& e) {
|
||||
return do_with_cql_env_thread([](cql_test_env& e) {
|
||||
// FIXME: restore indent.
|
||||
e.execute_cql("create table ks.cf (k text, v int, primary key (k));").get();
|
||||
auto& db = e.local_db();
|
||||
@@ -304,8 +287,8 @@ SEASTAR_TEST_CASE(test_querying_with_limits) {
|
||||
});
|
||||
}
|
||||
|
||||
static void test_database(void (*run_tests)(populate_fn_ex, bool), unsigned cgs) {
|
||||
do_with_cql_env_and_compaction_groups_cgs(cgs, [run_tests] (cql_test_env& e) {
|
||||
static void test_database(void (*run_tests)(populate_fn_ex, bool)) {
|
||||
do_with_cql_env_thread([run_tests] (cql_test_env& e) {
|
||||
run_tests([&] (schema_ptr s, const utils::chunked_vector<mutation>& partitions, gc_clock::time_point) -> mutation_source {
|
||||
auto& mm = e.migration_manager().local();
|
||||
try {
|
||||
@@ -339,72 +322,36 @@ static void test_database(void (*run_tests)(populate_fn_ex, bool), unsigned cgs)
|
||||
}).get();
|
||||
}
|
||||
|
||||
// plain cg0
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source_plain_basic_cg0) {
|
||||
test_database(run_mutation_source_tests_plain_basic, 0);
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source_plain_basic) {
|
||||
test_database(run_mutation_source_tests_plain_basic);
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source_plain_reader_conversion_cg0) {
|
||||
test_database(run_mutation_source_tests_plain_reader_conversion, 0);
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source_plain_reader_conversion) {
|
||||
test_database(run_mutation_source_tests_plain_reader_conversion);
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source_plain_fragments_monotonic_cg0) {
|
||||
test_database(run_mutation_source_tests_plain_fragments_monotonic, 0);
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source_plain_fragments_monotonic) {
|
||||
test_database(run_mutation_source_tests_plain_fragments_monotonic);
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source_plain_read_back_cg0) {
|
||||
test_database(run_mutation_source_tests_plain_read_back, 0);
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source_plain_read_back) {
|
||||
test_database(run_mutation_source_tests_plain_read_back);
|
||||
}
|
||||
|
||||
// plain cg1
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source_plain_basic_cg1) {
|
||||
test_database(run_mutation_source_tests_plain_basic, 1);
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source_reverse_basic) {
|
||||
test_database(run_mutation_source_tests_reverse_basic);
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source_plain_reader_conversion_cg1) {
|
||||
test_database(run_mutation_source_tests_plain_reader_conversion, 1);
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source_reverse_reader_conversion) {
|
||||
test_database(run_mutation_source_tests_reverse_reader_conversion);
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source_plain_fragments_monotonic_cg1) {
|
||||
test_database(run_mutation_source_tests_plain_fragments_monotonic, 1);
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source_reverse_fragments_monotonic) {
|
||||
test_database(run_mutation_source_tests_reverse_fragments_monotonic);
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source_plain_read_back_cg1) {
|
||||
test_database(run_mutation_source_tests_plain_read_back, 1);
|
||||
}
|
||||
|
||||
// reverse cg0
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source_reverse_basic_cg0) {
|
||||
test_database(run_mutation_source_tests_reverse_basic, 0);
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source_reverse_reader_conversion_cg0) {
|
||||
test_database(run_mutation_source_tests_reverse_reader_conversion, 0);
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source_reverse_fragments_monotonic_cg0) {
|
||||
test_database(run_mutation_source_tests_reverse_fragments_monotonic, 0);
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source_reverse_read_back_cg0) {
|
||||
test_database(run_mutation_source_tests_reverse_read_back, 0);
|
||||
}
|
||||
|
||||
// reverse cg1
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source_reverse_basic_cg1) {
|
||||
test_database(run_mutation_source_tests_reverse_basic, 1);
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source_reverse_reader_conversion_cg1) {
|
||||
test_database(run_mutation_source_tests_reverse_reader_conversion, 1);
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source_reverse_fragments_monotonic_cg1) {
|
||||
test_database(run_mutation_source_tests_reverse_fragments_monotonic, 1);
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source_reverse_read_back_cg1) {
|
||||
test_database(run_mutation_source_tests_reverse_read_back, 1);
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source_reverse_read_back) {
|
||||
test_database(run_mutation_source_tests_reverse_read_back);
|
||||
}
|
||||
|
||||
static void require_exist(const sstring& filename, bool should) {
@@ -455,7 +402,7 @@ SEASTAR_THREAD_TEST_CASE(test_distributed_loader_with_incomplete_sstables) {

    auto test_config = cql_test_config(db_cfg_ptr);
    test_config.clean_data_dir_before_test = false;
    do_with_cql_env_and_compaction_groups([&sst_dir, &ks, &cf, &temp_sst_dir_2, &temp_sst_dir_3] (cql_test_env& e) {
    do_with_cql_env_thread([&sst_dir, &ks, &cf, &temp_sst_dir_2, &temp_sst_dir_3] (cql_test_env& e) {
        require_exist(temp_sst_dir_2, false);
        require_exist(temp_sst_dir_3, false);

@@ -544,7 +491,7 @@ SEASTAR_THREAD_TEST_CASE(test_distributed_loader_with_pending_delete) {

    auto test_config = cql_test_config(db_cfg_ptr);
    test_config.clean_data_dir_before_test = false;
    do_with_cql_env_and_compaction_groups([&] (cql_test_env& e) {
    do_with_cql_env_thread([&] (cql_test_env& e) {
        // Empty log file
        // Empty temporary log file
        require_exist(pending_delete_dir + "/sstables-1-1.log.tmp", false);
@@ -582,7 +529,7 @@ future<> do_with_some_data_in_thread(std::vector<sstring> cf_names, std::functio
        db_cfg_ptr = make_shared<db::config>();
        db_cfg_ptr->data_file_directories(std::vector<sstring>({ tmpdir_for_data->path().string() }));
    }
    do_with_cql_env_and_compaction_groups([cf_names = std::move(cf_names), func = std::move(func), create_mvs, num_keys] (cql_test_env& e) {
    do_with_cql_env_thread([cf_names = std::move(cf_names), func = std::move(func), create_mvs, num_keys] (cql_test_env& e) {
        for (const auto& cf_name : cf_names) {
            e.create_table([&cf_name] (std::string_view ks_name) {
                return *schema_builder(ks_name, cf_name)
@@ -1113,7 +1060,7 @@ SEASTAR_TEST_CASE(test_snapshot_ctl_true_snapshots_size) {

// toppartitions_query caused a lw_shared_ptr to cross shards when moving results, #5104
SEASTAR_TEST_CASE(toppartitions_cross_shard_schema_ptr) {
    return do_with_cql_env_and_compaction_groups([] (cql_test_env& e) {
    return do_with_cql_env_thread([] (cql_test_env& e) {
        e.execute_cql("CREATE TABLE ks.tab (id int PRIMARY KEY)").get();
        db::toppartitions_query tq(e.db(), {{"ks", "tab"}}, {}, 1s, 100, 100);
        tq.scatter().get();
@@ -1128,7 +1075,7 @@ SEASTAR_TEST_CASE(toppartitions_cross_shard_schema_ptr) {
}

SEASTAR_THREAD_TEST_CASE(read_max_size) {
    do_with_cql_env_and_compaction_groups([] (cql_test_env& e) {
    do_with_cql_env_thread([] (cql_test_env& e) {
        e.execute_cql("CREATE TABLE test (pk text, ck int, v text, PRIMARY KEY (pk, ck));").get();
        auto id = e.prepare("INSERT INTO test (pk, ck, v) VALUES (?, ?, ?);").get();

@@ -1217,7 +1164,7 @@ SEASTAR_THREAD_TEST_CASE(unpaged_mutation_read_global_limit) {
    // configured based on the available memory, so give a small amount to
    // the "node", so we don't have to work with large amounts of data.
    cfg.dbcfg->available_memory = 2 * 1024 * 1024;
    do_with_cql_env_and_compaction_groups([] (cql_test_env& e) {
    do_with_cql_env_thread([] (cql_test_env& e) {
        e.execute_cql("CREATE TABLE test (pk text, ck int, v text, PRIMARY KEY (pk, ck));").get();
        auto id = e.prepare("INSERT INTO test (pk, ck, v) VALUES (?, ?, ?);").get();

@@ -1301,7 +1248,7 @@ SEASTAR_THREAD_TEST_CASE(reader_concurrency_semaphore_selection_test) {
    scheduling_group_and_expected_semaphore.emplace_back(sched_groups.gossip_scheduling_group, system_semaphore);
    scheduling_group_and_expected_semaphore.emplace_back(unknown_scheduling_group, user_semaphore);

    do_with_cql_env_and_compaction_groups([&scheduling_group_and_expected_semaphore] (cql_test_env& e) {
    do_with_cql_env_thread([&scheduling_group_and_expected_semaphore] (cql_test_env& e) {
        auto& db = e.local_db();
        database_test_wrapper tdb(db);
        for (const auto& [sched_group, expected_sem_getter] : scheduling_group_and_expected_semaphore) {
@@ -1352,7 +1299,7 @@ SEASTAR_THREAD_TEST_CASE(max_result_size_for_query_selection_test) {
    scheduling_group_and_expected_max_result_size.emplace_back(sched_groups.gossip_scheduling_group, system_max_result_size);
    scheduling_group_and_expected_max_result_size.emplace_back(unknown_scheduling_group, user_max_result_size);

    do_with_cql_env_and_compaction_groups([&scheduling_group_and_expected_max_result_size] (cql_test_env& e) {
    do_with_cql_env_thread([&scheduling_group_and_expected_max_result_size] (cql_test_env& e) {
        auto& db = e.local_db();
        database_test_wrapper tdb(db);
        for (const auto& [sched_group, expected_max_size] : scheduling_group_and_expected_max_result_size) {
@@ -1433,7 +1380,7 @@ SEASTAR_TEST_CASE(multipage_range_scan_semaphore_mismatch) {
// Test `upgrade_sstables` on all keyspaces (including the system keyspace).
// Refs: #9494 (https://github.com/scylladb/scylla/issues/9494)
SEASTAR_TEST_CASE(upgrade_sstables) {
    return do_with_cql_env_and_compaction_groups([] (cql_test_env& e) {
    return do_with_cql_env_thread([] (cql_test_env& e) {
        e.db().invoke_on_all([] (replica::database& db) -> future<> {
            auto& cm = db.get_compaction_manager();
            for (auto& [ks_name, ks] : db.get_keyspaces()) {
@@ -1645,7 +1592,7 @@ SEASTAR_TEST_CASE(snapshot_with_quarantine_works) {
}

SEASTAR_TEST_CASE(database_drop_column_family_clears_querier_cache) {
    return do_with_cql_env_and_compaction_groups([] (cql_test_env& e) {
    return do_with_cql_env_thread([] (cql_test_env& e) {
        e.execute_cql("create table ks.cf (k text, v int, primary key (k));").get();
        auto& db = e.local_db();
        auto& tbl = db.find_column_family("ks", "cf");
@@ -2205,6 +2152,7 @@ struct scoped_execption_log_level {

SEASTAR_TEST_CASE(replica_read_timeout_no_exception) {
    cql_test_config cfg;
    cfg.db_config->reader_concurrency_semaphore_preemptive_abort_factor.set(0.0);
    const auto read_timeout = 10ms;
    const auto write_timeout = 10s;
    cfg.query_timeout.emplace(timeout_config{

@@ -113,9 +113,6 @@ static future<> test_provider(const test_provider_args& args) {
    auto cfg = seastar::make_shared<db::config>(ext);
    cfg->data_file_directories({args.tmp.path().string()});

    // Currently the test fails with consistent_cluster_management = true. See #2995.
    cfg->consistent_cluster_management(false);

    {
        boost::program_options::options_description desc;
        boost::program_options::options_description_easy_init init(&desc);

@@ -199,9 +196,6 @@ static auto make_commitlog_config(const test_provider_args& args, const std::uno
    cfg->data_file_directories({args.tmp.path().string()});
    cfg->commitlog_sync("batch"); // just to make sure files are written

    // Currently the test fails with consistent_cluster_management = true. See #2995.
    cfg->consistent_cluster_management(false);

    boost::program_options::options_description desc;
    boost::program_options::options_description_easy_init init(&desc);
    configurable::append_all(*cfg, init);

@@ -7,6 +7,7 @@
 */

#include <seastar/core/thread.hh>
#include <seastar/util/memory-data-source.hh>
#include "test/lib/scylla_test_case.hh"

#include "utils/assert.hh"

@@ -342,29 +343,6 @@ SEASTAR_THREAD_TEST_CASE(test_read_bytes_view) {
    }
}

namespace {

class memory_data_source final : public data_source_impl {
private:
    using vector_type = std::vector<temporary_buffer<char>>;
    vector_type _buffers;
    vector_type::iterator _position;
public:
    explicit memory_data_source(std::vector<temporary_buffer<char>> buffers)
        : _buffers(std::move(buffers))
        , _position(_buffers.begin())
    { }

    virtual future<temporary_buffer<char>> get() override {
        if (_position == _buffers.end()) {
            return make_ready_future<temporary_buffer<char>>();
        }
        return make_ready_future<temporary_buffer<char>>(std::move(*_position++));
    }
};

}

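The removed hunk above drops the test's hand-rolled memory_data_source in favour of the helper behind the newly added include, seastar::util::as_input_stream. A minimal sketch of the replacement, assuming the Seastar in this tree ships <seastar/util/memory-data-source.hh> with the vector-of-buffers overload that the later call sites in this diff use:

#include <seastar/core/iostream.hh>
#include <seastar/core/temporary_buffer.hh>
#include <seastar/util/memory-data-source.hh>
#include <vector>

using namespace seastar;

// Builds an input_stream over in-memory fragments; reads yield the
// buffers back one by one, then EOF, which is exactly what the deleted
// memory_data_source::get() did by hand.
input_stream<char> stream_over(std::vector<temporary_buffer<char>> bufs) {
    return util::as_input_stream(std::move(bufs));
}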
SEASTAR_THREAD_TEST_CASE(test_read_fragmented_buffer) {
    using tuple_type = std::tuple<std::vector<temporary_buffer<char>>,
                                  bytes,
@@ -412,7 +390,7 @@ SEASTAR_THREAD_TEST_CASE(test_read_fragmented_buffer) {
        auto size = expected_data.size();
        auto suffix_size = expected_suffix.size();

        auto in = input_stream<char>(data_source(std::make_unique<memory_data_source>(std::move(buffers))));
        auto in = seastar::util::as_input_stream(std::move(buffers));

        auto prefix = in.read_exactly(prefix_size).get();
        BOOST_CHECK_EQUAL(prefix.size(), prefix_size);
@@ -491,8 +469,7 @@ static void do_test_read_exactly_eof(size_t input_size) {
    if (input_size) {
        data.push_back(temporary_buffer<char>(input_size));
    }
    auto ds = data_source(std::make_unique<memory_data_source>(std::move(data)));
    auto is = input_stream<char>(std::move(ds));
    auto is = seastar::util::as_input_stream(std::move(data));
    auto reader = fragmented_temporary_buffer::reader();
    auto result = reader.read_exactly(is, input_size + 1).get();
    BOOST_CHECK_EQUAL(result.size_bytes(), size_t(0));

@@ -113,23 +113,15 @@ static future<> compare_object_data(const local_gcs_wrapper& env, std::string_vi
    BOOST_REQUIRE_EQUAL(read, total);
}

using namespace std::string_literals;
static constexpr auto prefix = "bork/ninja/"s;

// #28398 include a prefix in all names.
static std::string make_name() {
    return fmt::format("{}{}", prefix, utils::UUID_gen::get_time_UUID());
}

static future<> test_read_write_helper(const local_gcs_wrapper& env, size_t dest_size, std::optional<size_t> specific_buffer_size = std::nullopt) {
    auto& c = env.client();
    auto name = make_name();
    auto uuid = fmt::format("{}", utils::UUID_gen::get_time_UUID());
    std::vector<temporary_buffer<char>> written;

    // ensure we remove the object
    env.objects_to_delete.emplace_back(name);
    co_await create_object_of_size(c, env.bucket, name, dest_size, &written, specific_buffer_size);
    co_await compare_object_data(env, name, std::move(written));
    env.objects_to_delete.emplace_back(uuid);
    co_await create_object_of_size(c, env.bucket, uuid, dest_size, &written, specific_buffer_size);
    co_await compare_object_data(env, uuid, std::move(written));
}

BOOST_AUTO_TEST_SUITE(gcs_tests, *seastar::testing::async_fixture<gcs_fixture>())
@@ -155,28 +147,21 @@ SEASTAR_FIXTURE_TEST_CASE(test_gcp_storage_list_objects, local_gcs_wrapper, *che
    auto& c = env.client();
    std::unordered_map<std::string, uint64_t> names;
    for (size_t i = 0; i < 10; ++i) {
        auto name = make_name();
        auto name = fmt::format("{}", utils::UUID_gen::get_time_UUID());
        auto size = tests::random::get_int(size_t(1), size_t(2*1024*1024));
        env.objects_to_delete.emplace_back(name);
        co_await create_object_of_size(c, env.bucket, name, size);
        names.emplace(name, size);
    }

    utils::gcp::storage::bucket_paging paging;
    auto infos = co_await c.list_objects(env.bucket);
    size_t n_found = 0;

    for (;;) {
        auto infos = co_await c.list_objects(env.bucket, "", paging);

    for (auto& info : infos) {
        auto i = names.find(info.name);
        if (i != names.end()) {
            BOOST_REQUIRE_EQUAL(info.size, i->second);
            ++n_found;
        }
    }
        if (infos.empty()) {
            break;
        for (auto& info : infos) {
            auto i = names.find(info.name);
            if (i != names.end()) {
                BOOST_REQUIRE_EQUAL(info.size, i->second);
                ++n_found;
            }
        }
    BOOST_REQUIRE_EQUAL(n_found, names.size());
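Two changes are visible in this hunk: object naming goes back to bare time-UUIDs, and listing becomes paged, driven until an empty page comes back. Since list_objects also takes a prefix argument (passed as "" above), a prefixed naming scheme like the removed make_name() would let a test enumerate only its own objects. A hedged sketch under those assumptions; the client's type is left generic because only its call shape is known from this diff, and the surrounding test file's includes are assumed:

// Counts objects under a prefix by paging until an empty batch is returned,
// mirroring the loop in the test above. Error handling omitted.
template <typename Client>  // assumed: same interface as env.client() above
future<size_t> count_objects_under(Client& c, const std::string& bucket, const std::string& prefix) {
    utils::gcp::storage::bucket_paging paging;
    size_t n = 0;
    for (;;) {
        auto infos = co_await c.list_objects(bucket, prefix, paging);
        if (infos.empty()) {
            co_return n;
        }
        n += infos.size();
    }
}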
@@ -185,7 +170,7 @@ SEASTAR_FIXTURE_TEST_CASE(test_gcp_storage_list_objects, local_gcs_wrapper, *che
SEASTAR_FIXTURE_TEST_CASE(test_gcp_storage_delete_object, local_gcs_wrapper, *check_gcp_storage_test_enabled()) {
    auto& env = *this;
    auto& c = env.client();
    auto name = make_name();
    auto name = fmt::format("{}", utils::UUID_gen::get_time_UUID());
    env.objects_to_delete.emplace_back(name);
    co_await create_object_of_size(c, env.bucket, name, 128);
    {
@@ -205,7 +190,7 @@ SEASTAR_FIXTURE_TEST_CASE(test_gcp_storage_delete_object, local_gcs_wrapper, *ch
SEASTAR_FIXTURE_TEST_CASE(test_gcp_storage_skip_read, local_gcs_wrapper, *check_gcp_storage_test_enabled()) {
    auto& env = *this;
    auto& c = env.client();
    auto name = make_name();
    auto name = fmt::format("{}", utils::UUID_gen::get_time_UUID());
    std::vector<temporary_buffer<char>> bufs;
    constexpr size_t file_size = 12*1024*1024 + 384*7 + 31;

@@ -258,7 +243,7 @@ SEASTAR_FIXTURE_TEST_CASE(test_merge_objects, local_gcs_wrapper, *check_gcp_stor

    size_t total = 0;
    for (size_t i = 0; i < 32; ++i) {
        auto name = make_name();
        auto name = fmt::format("{}", utils::UUID_gen::get_time_UUID());
        auto size = tests::random::get_int(size_t(1), size_t(2*1024*1024));
        env.objects_to_delete.emplace_back(name);
        co_await create_object_of_size(c, env.bucket, name, size, &bufs);
@@ -266,7 +251,7 @@ SEASTAR_FIXTURE_TEST_CASE(test_merge_objects, local_gcs_wrapper, *check_gcp_stor
        total += size;
    }

    auto name = make_name();
    auto name = fmt::format("{}", utils::UUID_gen::get_time_UUID());
    env.objects_to_delete.emplace_back(name);

    auto info = co_await c.merge_objects(env.bucket, name, names);

@@ -7,10 +7,10 @@
 */

#include "utils/assert.hh"
#include "utils/limiting_data_source.hh"

#include <boost/test/unit_test.hpp>
#include "test/lib/scylla_test_case.hh"
#include "test/lib/limiting_data_source.hh"
#include <seastar/testing/thread_test_case.hh>
#include <seastar/core/iostream.hh>
#include <seastar/core/temporary_buffer.hh>
@@ -53,7 +53,7 @@ data_source create_test_data_source() {

void test_get(unsigned limit) {
    auto src = create_test_data_source();
    auto tested = make_limiting_data_source(std::move(src), [limit] { return limit; });
    auto tested = make_limiting_data_source(std::move(src), limit);
    char expected = 0;
    auto test_get = [&] {
        auto buf = tested.get().get();
@@ -69,7 +69,7 @@ void test_get(unsigned limit) {

data_source prepare_test_skip() {
    auto src = create_test_data_source();
    auto tested = make_limiting_data_source(std::move(src), [] { return 1; });
    auto tested = make_limiting_data_source(std::move(src), 1);
    auto buf = tested.get().get();
    BOOST_REQUIRE_EQUAL(1, buf.size());
    BOOST_REQUIRE_EQUAL(0, buf[0]);

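In this hunk the call sites stop wrapping a constant limit in a lambda and pass the value directly, which reads cleaner when the limit never changes. One way such an API can offer both forms is a thin value overload that forwards to the functor overload; a hedged sketch of that shape (not necessarily what test/lib/limiting_data_source.hh actually does, and with a stand-in type so the sketch is self-contained):

#include <cstddef>
#include <functional>
#include <utility>

struct data_source { /* stand-in for seastar::data_source */ };

// Functor form: the limit is re-evaluated on every read.
data_source make_limiting_data_source(data_source src, std::function<std::size_t()> get_limit) {
    (void)get_limit;  // a real implementation would wrap src with the limiter
    return src;
}

// Value form: a fixed limit, forwarded as a constant-returning functor.
data_source make_limiting_data_source(data_source src, std::size_t limit) {
    return make_limiting_data_source(std::move(src), [limit] { return limit; });
}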
@@ -1185,6 +1185,13 @@ SEASTAR_TEST_CASE(failed_flush_prevents_writes) {
}

SEASTAR_TEST_CASE(flushing_rate_is_reduced_if_compaction_doesnt_keep_up) {
#ifdef DEBUG
    // This test was observed to take multiple minutes to run in debug mode on CI machines.
    // This test checks that a certain behaviour is triggered when compaction falls behind.
    // Not critical to run in debug mode. Both compaction and memtable have their own
    // correctness tests, which do run in debug mode.
    return make_ready_future<>();
#else
    BOOST_ASSERT(smp::count == 2);
    // The test simulates a situation where 2 threads issue flushes to 2
    // tables. Both issue small flushes, but one has injected reactor stalls.
@@ -1259,6 +1266,7 @@ SEASTAR_TEST_CASE(flushing_rate_is_reduced_if_compaction_doesnt_keep_up) {
            sleep_ms *= 2;
        }
    });
#endif
}

static future<> exceptions_in_flush_helper(std::unique_ptr<sstables::file_io_extension> mep, bool& should_fail, const bool& did_fail, const schema*& schema_filter, bool expect_isolate) {

@@ -14,7 +14,6 @@

#include "mutation/partition_version.hh"
#include "db/partition_snapshot_row_cursor.hh"
#include "partition_snapshot_reader.hh"
#include "keys/clustering_interval_set.hh"

#include "test/lib/scylla_test_case.hh"

@@ -517,6 +517,38 @@ SEASTAR_TEST_CASE(reader_concurrency_semaphore_timeout) {
    });
}

SEASTAR_THREAD_TEST_CASE(reader_concurrency_semaphore_abort) {
    const auto preemptive_abort_factor = 0.5f;
    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), 1, replica::new_reader_base_cost,
            100, utils::updateable_value(std::numeric_limits<uint32_t>::max()), utils::updateable_value(std::numeric_limits<uint32_t>::max()),
            utils::updateable_value<uint32_t>(1), utils::updateable_value<float>(preemptive_abort_factor));
    auto stop_sem = deferred_stop(semaphore);

    {
        BOOST_REQUIRE(semaphore.get_stats().total_reads_shed_due_to_overload == 0);

        auto timeout = db::timeout_clock::now() + 500ms;

        reader_permit_opt permit1 = semaphore.obtain_permit(nullptr, "permit1", replica::new_reader_base_cost, timeout, {}).get();

        auto permit2_fut = semaphore.obtain_permit(nullptr, "permit2", replica::new_reader_base_cost, timeout, {});
        BOOST_REQUIRE_EQUAL(semaphore.get_stats().waiters, 1);

        // A permit is rejected when its remaining time drops below half of the timeout it arrived at the semaphore with.
        // Hence, sleep 300ms so the permit in the waitlist gets rejected during admission.
        seastar::sleep(300ms).get();

        permit1 = {};
        const auto futures_failed = eventually_true([&] { return permit2_fut.failed(); });
        BOOST_CHECK(futures_failed);
        BOOST_CHECK_THROW(std::rethrow_exception(permit2_fut.get_exception()), semaphore_aborted);
        BOOST_CHECK(semaphore.get_stats().total_reads_shed_due_to_overload > 0);
    }

    // All units should have been deposited back.
    REQUIRE_EVENTUALLY_EQUAL<ssize_t>([&] { return semaphore.available_resources().memory; }, replica::new_reader_base_cost);
}

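Spelled out, the timing this new test leans on: both permits arrive with a 500ms timeout and the factor is 0.5, so the shed threshold is 250ms of remaining time; after the 300ms sleep only about 200ms remain, which is below the threshold, so once permit1 is released and admission is retried, the queued permit2 fails with semaphore_aborted. A small self-contained restatement of that arithmetic (the comparison mirrors what the test implies about the semaphore's internal check; it is an illustration, not the semaphore's code):

#include <chrono>
using namespace std::chrono_literals;

int main() {
    const std::chrono::duration<double, std::milli> timeout = 500ms; // permit timeout
    const double preemptive_abort_factor = 0.5;
    const auto shed_threshold = timeout * preemptive_abort_factor;   // 250ms remaining
    const auto remaining = timeout - 300ms;                          // ~200ms after the sleep
    return remaining < shed_threshold;  // true: the waiting permit gets shed
}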
SEASTAR_TEST_CASE(reader_concurrency_semaphore_max_queue_length) {
    return async([&] () {
        reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), 1, replica::new_reader_base_cost, 2);
@@ -597,7 +629,8 @@ SEASTAR_THREAD_TEST_CASE(reader_concurrency_semaphore_dump_reader_diganostics) {

            permit.resources = permit.permit->consume_resources(reader_resources(tests::random::get_int<unsigned>(0, 1), tests::random::get_int<unsigned>(1024, 16 * 1024 * 1024)));
        } else {
            const auto timeout_seconds = tests::random::get_int<unsigned>(0, 3);
            // Ensure timeout_seconds > 0 to avoid permits being rejected during admission; otherwise the test becomes flaky.
            const auto timeout_seconds = tests::random::get_int<unsigned>(1, 4);

            permit.permit_fut = semaphore.obtain_permit(
                schema,

@@ -1226,11 +1259,14 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_group) {
    auto serialize_multiplier = utils::updateable_value_source<uint32_t>(2);
    auto kill_multiplier = utils::updateable_value_source<uint32_t>(3);
    auto cpu_concurrency = utils::updateable_value_source<uint32_t>(1);
    auto preemptive_abort_factor = utils::updateable_value_source<float>(0.0f);

    reader_concurrency_semaphore_group sem_group(initial_resources.memory, initial_resources.count, 1000,
            utils::updateable_value(serialize_multiplier),
            utils::updateable_value(kill_multiplier),
            utils::updateable_value(cpu_concurrency));
            utils::updateable_value(cpu_concurrency),
            utils::updateable_value(preemptive_abort_factor));

    auto stop_sem = deferred_stop(sem_group);

    circular_buffer<scheduling_group> recycle_bin;
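The hunk above threads one more live-updateable knob, preemptive_abort_factor, through the group constructor. Throughout these tests the pattern is the same: a utils::updateable_value_source owns the value, and utils::updateable_value is the observer view handed to the semaphore, so a later update on the source is seen by the already-running semaphore. A hedged sketch of that relationship, with the shape inferred from the usage in this diff rather than from the exact utils:: API:

// Source owns the value; updating it propagates to all observing views.
utils::updateable_value_source<float> preemptive_abort_factor(0.0f);

// View is cheap to copy and is what the constructors above take by value.
utils::updateable_value<float> view(preemptive_abort_factor);

preemptive_abort_factor.set(0.5f);  // a semaphore holding `view` now observes 0.5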
@@ -1472,8 +1508,8 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_memory_limit_no_leaks
    const auto initial_resources = reader_concurrency_semaphore::resources{4, 4 * 1024};
    const auto serialize_multiplier = 2;
    const auto kill_multiplier = 3;
    reader_concurrency_semaphore semaphore(initial_resources.count, initial_resources.memory, get_name(), 100,
            utils::updateable_value<uint32_t>(serialize_multiplier), utils::updateable_value<uint32_t>(kill_multiplier), reader_concurrency_semaphore::register_metrics::no);
    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count, initial_resources.memory, 100,
            utils::updateable_value<uint32_t>(serialize_multiplier), utils::updateable_value<uint32_t>(kill_multiplier));
    auto stop_sem = deferred_stop(semaphore);

    const size_t reader_count_target = 6;
@@ -1726,9 +1762,8 @@ SEASTAR_TEST_CASE(test_reader_concurrency_semaphore_memory_limit_engages) {
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_request_memory_preserves_state) {
    const auto initial_resources = reader_concurrency_semaphore::resources{2, 2 * 1024};
    const auto serialize_multiplier = 2;
    const auto kill_multiplier = std::numeric_limits<uint32_t>::max(); // we don't want this to interfere with our test
    reader_concurrency_semaphore semaphore(initial_resources.count, initial_resources.memory, get_name(), 100,
            utils::updateable_value<uint32_t>(serialize_multiplier), utils::updateable_value<uint32_t>(kill_multiplier), reader_concurrency_semaphore::register_metrics::no);
    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count,
            initial_resources.memory, 100, utils::updateable_value<uint32_t>(serialize_multiplier));
    auto stop_sem = deferred_stop(semaphore);

    auto sponge_permit = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {}).get();
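These hunks repeatedly swap a long positional constructor for one taking a reader_concurrency_semaphore::for_tests{} tag that fills in test-friendly defaults; note how register_metrics::no and, in most cases, the kill multiplier disappear from the call sites. A minimal, self-contained sketch of the tag-dispatch idiom itself; the type and parameter names below are illustrative, not Scylla's exact signatures:

#include <cstdint>
#include <limits>
#include <string>
#include <utility>

struct semaphore {
    struct for_tests {};  // tag type selecting test defaults

    // "Production" constructor: everything spelled out.
    semaphore(int count, long memory, std::string name, std::size_t max_queue,
              std::uint32_t kill_multiplier, bool register_metrics) {}

    // Test constructor: the tag fills in defaults for the noisy tail.
    semaphore(for_tests, std::string name, int count, long memory, std::size_t max_queue)
        : semaphore(count, memory, std::move(name), max_queue,
                    std::numeric_limits<std::uint32_t>::max(), /*register_metrics=*/false) {}
};

int main() {
    semaphore s(semaphore::for_tests{}, "t", 1, 16 * 1024, 100);  // call sites stay short
}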
@@ -1789,9 +1824,8 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_request_memory_preser
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_blessed_read_goes_inactive) {
    const auto initial_resources = reader_concurrency_semaphore::resources{2, 2 * 1024};
    const auto serialize_multiplier = 2;
    const auto kill_multiplier = std::numeric_limits<uint32_t>::max(); // we don't want this to interfere with our test
    reader_concurrency_semaphore semaphore(initial_resources.count, initial_resources.memory, get_name(), 100,
            utils::updateable_value<uint32_t>(serialize_multiplier), utils::updateable_value<uint32_t>(kill_multiplier), reader_concurrency_semaphore::register_metrics::no);
    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count,
            initial_resources.memory, 100, utils::updateable_value<uint32_t>(serialize_multiplier));
    auto stop_sem = deferred_stop(semaphore);

    simple_schema ss;
@@ -1851,9 +1885,8 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_stop_with_inactive_re
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_permit_waiting_for_memory_goes_inactive) {
    const auto initial_resources = reader_concurrency_semaphore::resources{2, 2 * 1024};
    const auto serialize_multiplier = 2;
    const auto kill_multiplier = std::numeric_limits<uint32_t>::max(); // we don't want this to interfere with our test
    reader_concurrency_semaphore semaphore(initial_resources.count, initial_resources.memory, get_name(), 100,
            utils::updateable_value<uint32_t>(serialize_multiplier), utils::updateable_value<uint32_t>(kill_multiplier), reader_concurrency_semaphore::register_metrics::no);
    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count,
            initial_resources.memory, 100, utils::updateable_value<uint32_t>(serialize_multiplier));
    auto stop_sem = deferred_stop(semaphore);

    auto permit1 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {}).get();
@@ -1897,10 +1930,7 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_permit_waiting_for_me
// This test covers all the cases where eviction should **not** happen.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_no_unnecessary_evicting) {
    const auto initial_resources = reader_concurrency_semaphore::resources{2, 4 * 1024};
    const auto serialize_multiplier = std::numeric_limits<uint32_t>::max();
    const auto kill_multiplier = std::numeric_limits<uint32_t>::max();
    reader_concurrency_semaphore semaphore(initial_resources.count, initial_resources.memory, get_name(), 100,
            utils::updateable_value<uint32_t>(serialize_multiplier), utils::updateable_value<uint32_t>(kill_multiplier), reader_concurrency_semaphore::register_metrics::no);
    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count, initial_resources.memory, 100);
    auto stop_sem = deferred_stop(semaphore);

    simple_schema ss;
@@ -1990,10 +2020,7 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_no_unnecessary_evicti
// Check that inactive reads are evicted when they are blocking admission
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_necessary_evicting) {
    const auto initial_resources = reader_concurrency_semaphore::resources{2, 4 * 1024};
    const auto serialize_multiplier = std::numeric_limits<uint32_t>::max();
    const auto kill_multiplier = std::numeric_limits<uint32_t>::max();
    reader_concurrency_semaphore semaphore(initial_resources.count, initial_resources.memory, get_name(), 100,
            utils::updateable_value<uint32_t>(serialize_multiplier), utils::updateable_value<uint32_t>(kill_multiplier), reader_concurrency_semaphore::register_metrics::no);
    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count, initial_resources.memory, 100);
    auto stop_sem = deferred_stop(semaphore);

    simple_schema ss;
@@ -2147,10 +2174,7 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_necessary_evicting) {
// resources.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_execution_stage_wakeup) {
    const auto initial_resources = reader_concurrency_semaphore::resources{2, 4 * 1024};
    const auto serialize_multiplier = std::numeric_limits<uint32_t>::max();
    const auto kill_multiplier = std::numeric_limits<uint32_t>::max();
    reader_concurrency_semaphore semaphore(initial_resources.count, initial_resources.memory, get_name(), 100,
            utils::updateable_value<uint32_t>(serialize_multiplier), utils::updateable_value<uint32_t>(kill_multiplier), reader_concurrency_semaphore::register_metrics::no);
    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count, initial_resources.memory, 100);
    auto stop_sem = deferred_stop(semaphore);

    auto permit1 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout, {}).get();
@@ -2186,6 +2210,8 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_live_update_count) {
    const uint32_t initial_memory = 4 * 1024;
    const auto serialize_multiplier = std::numeric_limits<uint32_t>::max();
    const auto kill_multiplier = std::numeric_limits<uint32_t>::max();
    const auto cpu_concurrency = 1;
    const auto preemptive_abort_factor = 0.0f;

    reader_concurrency_semaphore semaphore(
            utils::updateable_value(count),
@@ -2194,7 +2220,8 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_live_update_count) {
            100,
            utils::updateable_value<uint32_t>(serialize_multiplier),
            utils::updateable_value<uint32_t>(kill_multiplier),
            utils::updateable_value<uint32_t>(1),
            utils::updateable_value<uint32_t>(cpu_concurrency),
            utils::updateable_value<float>(preemptive_abort_factor),
            reader_concurrency_semaphore::register_metrics::no);
    auto stop_sem = deferred_stop(semaphore);

@@ -2214,6 +2241,7 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_live_update_cpu_concu
    const uint32_t initial_memory = 4 * 1024;
    const auto serialize_multiplier = std::numeric_limits<uint32_t>::max();
    const auto kill_multiplier = std::numeric_limits<uint32_t>::max();
    const auto preemptive_abort_factor = 0.0f;

    reader_concurrency_semaphore semaphore(
            utils::updateable_value<int>(initial_count),
@@ -2223,6 +2251,7 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_live_update_cpu_concu
            utils::updateable_value<uint32_t>(serialize_multiplier),
            utils::updateable_value<uint32_t>(kill_multiplier),
            utils::updateable_value(cpu_concurrency),
            utils::updateable_value<float>(preemptive_abort_factor),
            reader_concurrency_semaphore::register_metrics::no);
    auto stop_sem = deferred_stop(semaphore);

@@ -2275,6 +2304,7 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_wait_queue_overload_c
            utils::updateable_value<uint32_t>(2),
            utils::updateable_value<uint32_t>(4),
            utils::updateable_value<uint32_t>(1),
            utils::updateable_value<float>(0.0f),
            reader_concurrency_semaphore::register_metrics::no);
    auto stop_sem = deferred_stop(semaphore);

@@ -2328,6 +2358,7 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_double_permit_abort)
            utils::updateable_value<uint32_t>(2),
            utils::updateable_value<uint32_t>(400),
            utils::updateable_value<uint32_t>(2),
            utils::updateable_value<float>(0.0f),
            reader_concurrency_semaphore::register_metrics::no);
    auto stop_sem = deferred_stop(semaphore);

@@ -2392,6 +2423,7 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_always_admit_one_perm
            utils::updateable_value<uint32_t>(200),
            utils::updateable_value<uint32_t>(400),
            utils::updateable_value<uint32_t>(1),
            utils::updateable_value<float>(0.0f),
            reader_concurrency_semaphore::register_metrics::no);
    auto stop_sem = deferred_stop(semaphore);

@@ -2433,6 +2465,7 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_release_base_resource
            utils::updateable_value<uint32_t>(200),
            utils::updateable_value<uint32_t>(400),
            utils::updateable_value<uint32_t>(1),
            utils::updateable_value<float>(0.0f),
            reader_concurrency_semaphore::register_metrics::no);
    auto stop_sem = deferred_stop(semaphore);

@@ -4837,8 +4837,8 @@ SEASTAR_TEST_CASE(test_compact_range_tombstones_on_read) {
// of course doesn't necessarily help release pressure on the semaphore.
SEASTAR_THREAD_TEST_CASE(test_cache_reader_semaphore_oom_kill) {
    simple_schema s;
    reader_concurrency_semaphore semaphore(100, 1, get_name(), std::numeric_limits<size_t>::max(), utils::updateable_value<uint32_t>(1),
            utils::updateable_value<uint32_t>(1), reader_concurrency_semaphore::register_metrics::no);
    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), 100, 1, std::numeric_limits<size_t>::max(),
            utils::updateable_value<uint32_t>(1), utils::updateable_value<uint32_t>(1));
    auto stop_semaphore = deferred_stop(semaphore);

    cache_tracker tracker;

@@ -9,7 +9,6 @@ import logging
import time

from cassandra.cluster import NoHostAvailable
from test.cluster.conftest import skip_mode
from test.pylib.manager_client import ManagerClient, ServerUpState
from test.pylib.util import wait_for
from test.cluster.auth_cluster import extra_scylla_config_options as auth_config

@@ -14,7 +14,6 @@ from test.cluster.util import trigger_snapshot, wait_until_topology_upgrade_fini
    delete_raft_topology_state, delete_raft_data_and_upgrade_state, wait_until_upgrade_finishes, \
    wait_for_token_ring_and_group0_consistency, wait_until_driver_service_level_created, get_topology_coordinator, \
    find_server_by_host_id
from test.cluster.conftest import skip_mode
from test.cqlpy.test_service_levels import MAX_USER_SERVICE_LEVELS
from cassandra import ConsistencyLevel
from cassandra.query import SimpleStatement

@@ -268,7 +268,7 @@ async def manager(request: pytest.FixtureRequest,
        # Save scylladb logs for failed tests in a separate directory and copy the XML report to the same directory,
        # so that all related logs end up in one dir.
        # Then add a property to the XML report with the path to that directory, so it is visible in Jenkins.
        failed_test_dir_path = testpy_test.suite.log_dir / "failed_test" / test_case_name
        failed_test_dir_path = testpy_test.suite.log_dir / "failed_test" / test_case_name.translate(str.maketrans('[]', '()'))
        failed_test_dir_path.mkdir(parents=True, exist_ok=True)
        await manager_client.gather_related_logs(
            failed_test_dir_path,
@@ -356,27 +356,6 @@ async def random_tables(request, manager):
    if not failed and not await manager.is_dirty():
        tables.drop_all()

skipped_funcs = {}
# Can be used to mark a test to be skipped for a specific mode=[release, dev, debug].
# The reason to skip a test should be specified; it is used as a comment only.
# Additionally, platform_key can be specified to limit the scope of the attribute
# to the specified platform. Example platform_key-s: [aarch64, x86_64]
@warnings.deprecated('Please use pytest.mark.skip_mode instead')
def skip_mode(mode: str, reason: str, platform_key: str | None = None):
    """DEPRECATED. Please use pytest.mark.skip_mode instead"""
    def wrap(func):
        skipped_funcs.setdefault((func, mode), []).append((reason, platform_key))
        return func
    return wrap

@pytest.fixture(scope="function", autouse=True)
@warnings.deprecated('Please use pytest.mark.skip_mode instead')
def skip_mode_fixture(request, build_mode):
    for reason, platform_key in skipped_funcs.get((request.function, build_mode), []):
        if platform_key is None or platform_key in platform.platform():
            pytest.skip(f'{request.node.name} skipped, reason: {reason}')


@pytest.fixture(scope="function", autouse=True)
async def prepare_3_nodes_cluster(request, manager):
    if request.node.get_closest_marker("prepare_3_nodes_cluster"):

@@ -152,7 +152,7 @@ class TestHelper(Tester):
        table_path = self.get_table_path(cf)

        with tempfile.TemporaryDirectory() as tmp_dir:
            node1.run_scylla_sstable("scrub", additional_args=["--scrub-mode", "abort", "--output-dir", tmp_dir, "--logger-log-level", "scylla-sstable=debug", "--unsafe-accept-nonempty-output-dir"], keyspace=ks, column_families=[cf])
            node1.run_scylla_sstable("scrub", additional_args=["--scrub-mode", "abort", "--output-dir", tmp_dir, "--logger-log-level", "scylla-sstable=debug"], keyspace=ks, column_families=[cf])
            # Replace the table's sstables with the scrubbed ones, just like online scrub would do.
            shutil.rmtree(table_path)
            shutil.copytree(tmp_dir, table_path)

@@ -9,7 +9,6 @@ import logging
import random

import pytest
from test.cluster.conftest import skip_mode
from test.cluster.lwt.lwt_common import (
    BaseLWTTester,
    get_token_for_pk,

@@ -9,7 +9,6 @@ import logging
import random

import pytest
from test.cluster.conftest import skip_mode
from test.cluster.lwt.lwt_common import (
    BaseLWTTester,
    wait_for_tablet_count,

@@ -9,7 +9,6 @@ import time
from typing import Dict

import pytest
from test.cluster.conftest import skip_mode
from test.cluster.lwt.lwt_common import (
    BaseLWTTester,
    DEFAULT_WORKERS,

@@ -11,7 +11,6 @@ from test.pylib.rest_client import read_barrier
from test.pylib.tablets import get_tablet_replicas, get_tablet_count
from test.pylib.util import wait_for_cql_and_get_hosts
from test.pylib.internal_types import ServerInfo
from test.cluster.conftest import skip_mode
from test.cluster.util import new_test_keyspace

from test.cluster.test_alternator import get_alternator, alternator_config, full_query

@@ -13,7 +13,6 @@ from cassandra.cluster import ConnectionException, NoHostAvailable # type: igno

from test.pylib.scylla_cluster import ReplaceConfig
from test.pylib.manager_client import ManagerClient
from test.cluster.conftest import skip_mode
from test.cluster.util import new_test_keyspace

Some files were not shown because too many files have changed in this diff.