test: add cross-partition static column assertion to testStaticColumnsWithSecondaryIndex

Add a second partition (k=1) with a different static value (s=99) and verify that a secondary index query returns the correct static column values across partitions. This covers the gap identified in dtest cql_static_columns_tests.py, allowing its removal. Refs: SCYLLADB-1922
Merge 'Don't use database.get_config() to fetch calculate_view_update_throttling_delay option' from Pavel Emelyanov
2026-05-13 11:22:01 +00:00 · 2026-05-11 18:32:24 +03:00 · 2026-05-11 10:30:24 +03:00 · 2026-05-11 10:11:20 +03:00 · 2026-05-11 09:12:40 +03:00 · 2026-05-11 08:55:33 +03:00
335 changed files with 9865 additions and 4229 deletions
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -32,8 +32,8 @@ counters* @nuivall
 tests/counter_test* @nuivall

 # DOCS
-docs/* @annastuchlik @tzach
-docs/alternator @annastuchlik @tzach @nyh
+/docs/ @annastuchlik @tzach
+/docs/alternator/ @annastuchlik @tzach @nyh

 # GOSSIP
 gms/* @tgrabiec @asias @kbr-scylla
--- a/.gitignore
+++ b/.gitignore
@@ -36,4 +36,6 @@ compile_commands.json
 clang_build
 .idea/
 nuke
-rust/target
+rust/**/target
+rust/**/Cargo.lock
+test/resource/wasm/rust/target
--- a/2
+++ b/2
@@ -78,7 +78,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=2026.2.0-dev
+VERSION=2026.3.0-dev

 if test -f version
 then
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -681,7 +681,7 @@ static bool calculate_primitive_condition(const parsed::primitive_condition& con
    case parsed::primitive_condition::type::VALUE:
        if (calculated_values.size() != 1) {
            // Shouldn't happen unless we have a bug in the parser
-            throw std::logic_error(format("Unexpected values in primitive_condition", cond._values.size()));
+            throw std::logic_error(format("Unexpected values {} in primitive_condition", cond._values.size()));
        }
        // Unwrap the boolean wrapped as the value (if it is a boolean)
        if (calculated_values[0].IsObject() && calculated_values[0].MemberCount() == 1) {
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -1362,6 +1362,33 @@ static int get_dimensions(const rjson::value& vector_attribute, std::string_view
    return dimensions_v->GetInt();
 }

+// As noted in issue #5052, in Alternator the CreateTable and UpdateTable are
+// currently synchronous - they return only after the operation is complete.
+// After announce() of the new schema finished, the schema change is committed
+// and a majority of nodes know it - but it's possible that some live nodes
+// have not yet applied the new schema. If we return to the user now, and the
+// user sends such a node a request that relies on the new schema, it might fail.
+// So before returning, we must verify that *all* nodes have applied the new
+// schema. This is what wait_for_schema_agreement_after_ddl() does.
+//
+// Note that wait_for_schema_agreement_after_ddl() has a timeout (currently
+// hard-coded to 30 seconds). If the timeout is reached an InternalServerError
+// is returned. The user, who doesn't know if the CreateTable succeeded or not,
+// can retry the request and will get a ResourceInUseException and know the
+// table already exists. So a CreateTable that returns a ResourceInUseException
+// should also call wait_for_schema_agreement_after_ddl().
+//
+// When issue #5052 is resolved, this function can be removed - we will need
+// to check if we reached schema agreement, but not to *wait* for it.
+static future<> wait_for_schema_agreement_after_ddl(service::migration_manager& mm, const replica::database& db) {
+    static constexpr auto schema_agreement_seconds = 30;
+    try {
+        co_await mm.wait_for_schema_agreement(db, db::timeout_clock::now() + std::chrono::seconds(schema_agreement_seconds), nullptr);
+    } catch (const service::migration_manager::schema_agreement_timeout&) { // only the timeout is translated; any other exception propagates unchanged
+        throw api_error::internal(fmt::format("The operation was successful, but unable to confirm cluster-wide schema agreement after {} seconds. Please retry the operation, and wait for the retry to report an error since the operation was already done.", schema_agreement_seconds));
+    }
+}
+
 future<executor::request_return_type> executor::create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request, bool enforce_authorization, bool warn_authorization,
            const db::tablets_mode_t::mode tablets_mode, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
    throwing_assert(this_shard_id() == 0);
@@ -1695,13 +1722,26 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
                }
            }
        }
+        bool table_already_exists = false;
        try {
            schema_mutations = service::prepare_new_keyspace_announcement(_proxy.local_db(), ksm, ts);
        } catch (exceptions::already_exists_exception&) {
            if (_proxy.data_dictionary().has_schema(keyspace_name, table_name)) {
-                co_return api_error::resource_in_use(fmt::format("Table {} already exists", table_name));
+                table_already_exists = true;
            }
        }
+        if (table_already_exists) {
+            // The user may have retried a CreateTable operation after it timed
+            // out in wait_for_schema_agreement_after_ddl(). So before we may
+            // return ResourceInUseException (which can lead the user to start
+            // using the table which it now knows exists), we need to wait for
+            // schema agreement, just like the original CreateTable did. Again
+            // we fail with InternalServerError if schema agreement still cannot
+            // be reached. We can release group0_guard before waiting.
+            release_guard(std::move(group0_guard));
+            co_await wait_for_schema_agreement_after_ddl(_mm, _proxy.local_db());
+            co_return api_error::resource_in_use(fmt::format("Table {} already exists", table_name));
+        }
        if (_proxy.data_dictionary().try_find_table(schema->id())) {
            // This should never happen, the ID is supposed to be unique
            co_return api_error::internal(format("Table with ID {} already exists", schema->id()));
@@ -1750,7 +1790,7 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
        }
    }

-    co_await _mm.wait_for_schema_agreement(_proxy.local_db(), db::timeout_clock::now() + 10s, nullptr);
+    co_await wait_for_schema_agreement_after_ddl(_mm, _proxy.local_db());
    rjson::value status = rjson::empty_object();
    executor::supplement_table_info(request, *schema, _proxy);
    rjson::add(status, "TableDescription", std::move(request));
@@ -1860,7 +1900,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
            rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
            if (stream_specification && stream_specification->IsObject()) {
                empty_request = false;
-                if (add_stream_options(*stream_specification, builder, p.local())) {
+                if (add_stream_options(*stream_specification, builder, p.local(), tab->cdc_options())) {
                    validate_cdc_log_name_length(builder.cf_name());
                    // On tablet tables, defer stream enablement and block
                    // tablet merges (see defer_enabling_streams_block_tablet_merges).
@@ -1875,6 +1915,23 @@ future<executor::request_return_type> executor::update_table(client_state& clien
                        if (tab->cdc_options().enabled() || tab->cdc_options().enable_requested()) {
                            co_return api_error::validation("Table already has an enabled stream: TableName: " + tab->cf_name());
                        }
+                        // When re-enabling streams on an Alternator table, drop the old
+                        // CDC log table first as a separate schema change, so that the
+                        // enable step (re-run after the `continue` below) creates a
+                        // fresh log table with a new UUID (= new StreamArn). See #7239.
+                        auto logname = cdc::log_name(tab->cf_name());
+                        auto& local_db = p.local().local_db();
+                        if (local_db.has_schema(tab->ks_name(), logname)
+                                && cdc::is_log_schema(*local_db.find_schema(tab->ks_name(), logname))) {
+                            auto drop_m = co_await service::prepare_column_family_drop_announcement(
+                                p.local(), tab->ks_name(), logname,
+                                group0_guard.write_timestamp());
+                            co_await mm.announce(std::move(drop_m), std::move(group0_guard),
+                                format("alternator-executor: drop old CDC log for {}", tab->cf_name()));
+                            co_await mm.wait_for_schema_agreement(
+                                p.local().local_db(), db::timeout_clock::now() + 10s, nullptr);
+                            continue;
+                        }
                    }
                    else if (!tab->cdc_options().enabled() && !tab->cdc_options().enable_requested()) {
                        co_return api_error::validation("Table has no stream to disable: TableName: " + tab->cf_name());
@@ -1892,7 +1949,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
                }
                if (vector_index_updates->Size() > 1) {
                    // VectorIndexUpdates mirrors GlobalSecondaryIndexUpdates.
-                    // Since DynamoDB artifically limits the latter to just a
+                    // Since DynamoDB artificially limits the latter to just a
                    // single operation (one Create or one Delete), we also
                    // place the same artificial limit on VectorIndexUpdates,
                    // and throw the same LimitExceeded error if the client
@@ -2189,7 +2246,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
                throw;
            }
        }
-        co_await mm.wait_for_schema_agreement(p.local().local_db(), db::timeout_clock::now() + 10s, nullptr);
+        co_await wait_for_schema_agreement_after_ddl(mm, p.local().local_db());

        rjson::value status = rjson::empty_object();
        supplement_table_info(request, *schema, p.local());
--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -30,6 +30,7 @@
 #include "utils/updateable_value.hh"

 #include "tracing/trace_state.hh"
+#include "cdc/cdc_options.hh"


 namespace db {
@@ -199,7 +200,7 @@ private:
        tracing::trace_state_ptr trace_state, service_permit permit);

 public:
-    static bool add_stream_options(const rjson::value& stream_spec, schema_builder&, service::storage_proxy& sp);
+    static bool add_stream_options(const rjson::value& stream_spec, schema_builder&, service::storage_proxy& sp, const cdc::options& existing_cdc_opts = {});
    static void supplement_table_info(rjson::value& descr, const schema& schema, service::storage_proxy& sp);
    static void supplement_table_stream_info(rjson::value& descr, const schema& schema, const service::storage_proxy& sp);
 };
--- a/alternator/executor_read.cc
+++ b/alternator/executor_read.cc
@@ -1354,7 +1354,7 @@ static future<executor::request_return_type> query_vector(
    std::unordered_set<std::string> used_attribute_values;
    // Parse the Select parameter and determine which attributes to return.
    // For a vector index, the default Select is ALL_ATTRIBUTES (full items).
-    // ALL_PROJECTED_ATTRIBUTES is significantly more efficent because it
+    // ALL_PROJECTED_ATTRIBUTES is significantly more efficient because it
    // returns what the vector store returned without looking up additional
    // base-table data. Currently only the primary key attributes are projected
    // but in the future we'll implement projecting additional attributes into
--- a/alternator/streams.cc
+++ b/alternator/streams.cc
@@ -167,46 +167,8 @@ static schema_ptr get_schema_from_arn(service::storage_proxy& proxy, const strea
    }
 }

-// ShardId. Must be between 28 and 65 characters inclusive.
-// UUID is 36 bytes as string (including dashes). 
-// Prepend a version/type marker (`S`) -> 37
-class stream_shard_id : public utils::UUID {
-public:
-    using UUID = utils::UUID;
-    static constexpr char marker = 'S';
-
-    stream_shard_id() = default;
-    stream_shard_id(const UUID& uuid)
-        : UUID(uuid)
-    {}
-    stream_shard_id(const table_id& tid)
-        : UUID(tid.uuid())
-    {}
-    stream_shard_id(std::string_view v)
-        : UUID(v.substr(1))
-    {
-        if (v[0] != marker) {
-            throw std::invalid_argument(std::string(v));
-        }
-    }
-    friend std::ostream& operator<<(std::ostream& os, const stream_shard_id& arn) {
-        const UUID& uuid = arn;
-        return os << marker << uuid;
-    }
-    friend std::istream& operator>>(std::istream& is, stream_shard_id& arn) {
-        std::string s;
-        is >> s;
-        arn = stream_shard_id(s);
-        return is;
-    }
-};
-
 } // namespace alternator

-template<typename ValueType>
-struct rapidjson::internal::TypeHelper<ValueType, alternator::stream_shard_id>
-    : public from_string_helper<ValueType, alternator::stream_shard_id>
-{};
 template<typename ValueType>
 struct rapidjson::internal::TypeHelper<ValueType, alternator::stream_arn>
    : public from_string_helper<ValueType, alternator::stream_arn>
@@ -218,7 +180,8 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
    _stats.api_operations.list_streams++;

    auto limit = rjson::get_opt<int>(request, "Limit").value_or(100);
-    auto streams_start = rjson::get_opt<stream_shard_id>(request, "ExclusiveStartStreamArn");
+    auto streams_start = rjson::get_opt<stream_arn>(request, "ExclusiveStartStreamArn");
+
    auto table = find_table(_proxy, request);
    auto db = _proxy.data_dictionary();

@@ -244,34 +207,34 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
        cfs = db.get_tables();
    }

-    // # 12601 (maybe?) - sort the set of tables on ID. This should ensure we never
-    // generate duplicates in a paged listing here. Can obviously miss things if they 
-    // are added between paged calls and end up with a "smaller" UUID/ARN, but that 
-    // is to be expected.
+    // We need to sort the tables to ensure a stable order for paging.
+    // We sort by keyspace and table name, which will also allow us to skip to
+    // the right position by ExclusiveStartStreamArn.
+    auto cmp = [](std::string_view ks1, std::string_view cf1, std::string_view ks2, std::string_view cf2) {
+        return ks1 == ks2 ? cf1 < cf2 : ks1 < ks2;
+    };
    if (std::cmp_less(limit, cfs.size()) || streams_start) {
-        std::sort(cfs.begin(), cfs.end(), [](const data_dictionary::table& t1, const data_dictionary::table& t2) {
-            return t1.schema()->id().uuid() < t2.schema()->id().uuid();
-        });
+        std::sort(cfs.begin(), cfs.end(),
+            [&cmp](const data_dictionary::table& t1, const data_dictionary::table& t2) {
+                return cmp(t1.schema()->ks_name(), t1.schema()->cf_name(),
+                           t2.schema()->ks_name(), t2.schema()->cf_name());
+            });
    }

    auto i = cfs.begin();
    auto e = cfs.end();

    if (streams_start) {
-        i = std::find_if(i, e, [&](const data_dictionary::table& t) {
-            return t.schema()->id().uuid() == streams_start
-                && cdc::get_base_table(db.real_database(), *t.schema())
-                && is_alternator_keyspace(t.schema()->ks_name())
-                ;
-        });
-        if (i != e) {
-            ++i;
-        }
+        i = std::upper_bound(i, e, *streams_start,
+            [&cmp](const stream_arn& arn, const data_dictionary::table& t) {
+                return cmp(arn.keyspace_name(), arn.table_name(),
+                           t.schema()->ks_name(), t.schema()->cf_name());
+            });
    }

    auto ret = rjson::empty_object();
    auto streams = rjson::empty_array();
-    std::optional<stream_shard_id> last;
+    std::optional<stream_arn> last;

    for (;limit > 0 && i != e; ++i) {
        auto s = i->schema();
@@ -280,21 +243,29 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
        if (!is_alternator_keyspace(ks_name)) {
            continue;
        }
-        if (cdc::is_log_for_some_table(db.real_database(), ks_name, cf_name)) {
+        // Use get_base_table instead of is_log_for_some_table because the
+        // latter requires CDC to be enabled, but we want to list streams
+        // that have been disabled but whose log table still exists (#7239).
+        if (cdc::get_base_table(db.real_database(), ks_name, cf_name)) {
            rjson::value new_entry = rjson::empty_object();
-            last = i->schema()->id();
+
            auto arn = stream_arn{ i->schema(), cdc::get_base_table(db.real_database(), *i->schema()) };
            rjson::add(new_entry, "StreamArn", arn);
            rjson::add(new_entry, "StreamLabel", rjson::from_string(stream_label(*s)));
            rjson::add(new_entry, "TableName", rjson::from_string(cdc::base_name(s->cf_name())));
            rjson::push_back(streams, std::move(new_entry));
+            last = std::move(arn);
            --limit;
        }
    }

    rjson::add(ret, "Streams", std::move(streams));

-    if (last) {
+    // Only emit LastEvaluatedStreamArn when we stopped because we hit the
+    // limit (limit == 0), meaning there may be more streams to list.
+    // If we exhausted all tables naturally (limit > 0), there are no more
+    // streams, so we must not emit a cookie.
+    if (last && limit == 0) {
        rjson::add(ret, "LastEvaluatedStreamArn", *last);
    }
    return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
@@ -424,7 +395,7 @@ std::istream& operator>>(std::istream& is, stream_view_type& type) {
    return is;
 }

-static stream_view_type cdc_options_to_steam_view_type(const cdc::options& opts) {
+static stream_view_type cdc_options_to_stream_view_type(const cdc::options& opts) {
    stream_view_type type = stream_view_type::KEYS_ONLY;
    if (opts.preimage() && opts.postimage()) {
        type = stream_view_type::NEW_AND_OLD_IMAGES;
@@ -614,7 +585,7 @@ void stream_id_range::prepare_for_iterating()
 // the function returns `stream_id_range` that will allow iteration over children Streams shards for the Streams shard `parent`
 // a child Streams shard is defined as a Streams shard that touches token range that was previously covered by `parent` Streams shard
 // Streams shard contains a token, that represents end of the token range for that Streams shard (inclusive)
-// begginning of the token range is defined by previous Streams shard's token + 1
+// beginning of the token range is defined by previous Streams shard's token + 1
 // NOTE: With vnodes, ranges of Streams' shards wrap, while with tablets the biggest allowed token number is always a range end.
 // NOTE: both streams generation are guaranteed to cover whole range and be non-empty
 // NOTE: it's possible to get more than one stream shard with the same token value (thus some of those stream shards will be empty) -
@@ -870,6 +841,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
    auto& opts = bs->cdc_options();

    auto status = "DISABLED";
+    bool stream_disabled = !opts.enabled();

    if (opts.enabled()) {
        if (!_cdc_metadata.streams_available()) {
@@ -885,7 +857,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl

    rjson::add(stream_desc, "StreamStatus", rjson::from_string(status));

-    stream_view_type type = cdc_options_to_steam_view_type(opts);
+    stream_view_type type = cdc_options_to_stream_view_type(opts);

    rjson::add(stream_desc, "StreamArn", stream_arn);
    rjson::add(stream_desc, "StreamViewType", type);
@@ -893,10 +865,9 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl

    describe_key_schema(stream_desc, *bs);

-    if (!opts.enabled()) {
-        rjson::add(ret, "StreamDescription", std::move(stream_desc));
-        co_return rjson::print(std::move(ret));
-    }
+    // For disabled streams, we still fall through to enumerate shards
+    // below. All shards will have EndingSequenceNumber set, indicating
+    // they are closed. See issue #7239.

    // TODO: label
    // TODO: creation time
@@ -979,6 +950,12 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
        auto expired = [&]() -> std::optional<db_clock::time_point> {
            auto j = std::next(i);
            if (j == e) {
+                // For a disabled stream, all shards are closed (#7239).
+                // Use "now" as the ending sequence number for the last
+                // generation's shards.
+                if (stream_disabled) {
+                    return db_clock::now();
+                }
                return std::nullopt;
            }
            // add this so we sort of match potential 
@@ -1329,7 +1306,7 @@ future<executor::request_return_type> executor::get_records(client_state& client
        | std::ranges::to<query::column_id_vector>()
    ;

-    stream_view_type type = cdc_options_to_steam_view_type(base->cdc_options());
+    stream_view_type type = cdc_options_to_stream_view_type(base->cdc_options());

    auto selection = cql3::selection::selection::for_columns(schema, std::move(columns));
    auto partition_slice = query::partition_slice(
@@ -1513,17 +1490,17 @@ future<executor::request_return_type> executor::get_records(client_state& client

    auto& shard = iter.shard;

-    if (shard.time < ts && ts < high_ts) {
+    if (!base->cdc_options().enabled()) {
+        // Stream is disabled -- all shards are closed (#7239).
+        // Don't return NextShardIterator.
+    } else if (shard.time < ts && ts < high_ts) {
        // The DynamoDB documentation states that when a shard is
        // closed, reading it until the end has NextShardIterator
        // "set to null". Our test test_streams_closed_read
        // confirms that by "null" they meant not set at all.
    } else {
-        // We could have return the same iterator again, but we did
-        // a search from it until high_ts and found nothing, so we
-        // can also start the next search from high_ts.
-        // TODO: but why? It's simpler just to leave the iterator be.
-        shard_iterator next_iter(iter.table, iter.shard, utils::UUID_gen::min_time_UUID(high_ts.time_since_epoch()), true);
+        // Shard is still open with no records in the scanned window.
+        // Return the original iterator so the client can poll again.
        rjson::add(ret, "NextShardIterator", iter);
    }
    _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
@@ -1533,17 +1510,13 @@ future<executor::request_return_type> executor::get_records(client_state& client
    co_return rjson::print(std::move(ret));
 }

-bool executor::add_stream_options(const rjson::value& stream_specification, schema_builder& builder, service::storage_proxy& sp) {
+bool executor::add_stream_options(const rjson::value& stream_specification, schema_builder& builder, service::storage_proxy& sp, const cdc::options& existing_cdc_opts) {
    auto stream_enabled = rjson::find(stream_specification, "StreamEnabled");
    if (!stream_enabled || !stream_enabled->IsBool()) {
        throw api_error::validation("StreamSpecification needs boolean StreamEnabled");
    }

    if (stream_enabled->GetBool()) {
-        if (!sp.features().alternator_streams) {
-            throw api_error::validation("StreamSpecification: alternator streams feature not enabled in cluster.");
-        }
-
        cdc::options opts;
        opts.enabled(true);
        opts.tablet_merge_blocked(true);
@@ -1569,8 +1542,13 @@ bool executor::add_stream_options(const rjson::value& stream_specification, sche
        builder.with_cdc_options(opts);
        return true;
    } else {
-        cdc::options opts;
+        // When disabling, preserve the existing CDC options (preimage,
+        // postimage, ttl, etc.) so that DescribeStream can still report
+        // the correct StreamViewType on a disabled stream.
+        cdc::options opts = existing_cdc_opts;
        opts.enabled(false);
+        opts.enable_requested(false);
+        opts.tablet_merge_blocked(false);
        builder.with_cdc_options(opts);
        return false;
    }
@@ -1578,33 +1556,36 @@ bool executor::add_stream_options(const rjson::value& stream_specification, sche

 void executor::supplement_table_stream_info(rjson::value& descr, const schema& schema, const service::storage_proxy& sp) {
    auto& opts = schema.cdc_options();
-    if (opts.enabled()) {
-        auto db = sp.data_dictionary();
-        auto cf = db.find_table(schema.ks_name(), cdc::log_name(schema.cf_name()));
-        stream_arn arn(cf.schema(), cdc::get_base_table(db.real_database(), *cf.schema()));
+    // Report stream info when:
+    //   1. Log table exists (covers both enabled and disabled-but-readable).
+    //   2. enable_requested (ENABLING state, log not yet created).
+    auto db = sp.data_dictionary();
+    auto log_name = cdc::log_name(schema.cf_name());
+    auto log_cf = db.try_find_table(schema.ks_name(), log_name);
+    if (log_cf) {
+        auto log_schema = log_cf->schema();
+        stream_arn arn(log_schema, cdc::get_base_table(db.real_database(), *log_schema));
        rjson::add(descr, "LatestStreamArn", arn);
-        rjson::add(descr, "LatestStreamLabel", rjson::from_string(stream_label(*cf.schema())));
-    } else if (!opts.enable_requested()) {
-        return;
-    }
-    // For both enabled() and enable_requested():
-    // DynamoDB returns StreamEnabled=true in StreamSpecification even when
-    // the stream status is ENABLING (not yet fully active). We mirror this
-    // behavior: enable_requested means the user asked for streams but CDC
-    // is not yet finalized, so we still report StreamEnabled=true.
-    auto stream_desc = rjson::empty_object();
-    rjson::add(stream_desc, "StreamEnabled", true);
+        rjson::add(descr, "LatestStreamLabel", rjson::from_string(stream_label(*log_schema)));

-    auto mode = stream_view_type::KEYS_ONLY;
-    if (opts.preimage() && opts.postimage()) {
-        mode = stream_view_type::NEW_AND_OLD_IMAGES;
-    } else if (opts.preimage()) {
-        mode = stream_view_type::OLD_IMAGE;
-    } else if (opts.postimage()) {
-        mode = stream_view_type::NEW_IMAGE;
+        auto stream_desc = rjson::empty_object();
+        rjson::add(stream_desc, "StreamEnabled", opts.enabled());
+
+        stream_view_type mode = cdc_options_to_stream_view_type(opts);
+        rjson::add(stream_desc, "StreamViewType", mode);
+        rjson::add(descr, "StreamSpecification", std::move(stream_desc));
+    } else if (opts.enable_requested()) {
+        // DynamoDB returns StreamEnabled=true in StreamSpecification even when
+        // the stream status is ENABLING (not yet fully active). We mirror this
+        // behavior: enable_requested means the user asked for streams but CDC
+        // is not yet finalized, so we still report StreamEnabled=true.
+        auto stream_desc = rjson::empty_object();
+        rjson::add(stream_desc, "StreamEnabled", true);
+
+        stream_view_type mode = cdc_options_to_stream_view_type(opts);
+        rjson::add(stream_desc, "StreamViewType", mode);
+        rjson::add(descr, "StreamSpecification", std::move(stream_desc));
    }
-    rjson::add(stream_desc, "StreamViewType", mode);
-    rjson::add(descr, "StreamSpecification", std::move(stream_desc));
 }

 } // namespace alternator
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -856,7 +856,9 @@ rest_exclude_node(sharded<service::storage_service>& ss, std::unique_ptr<http::r
    }

    apilog.info("exclude_node: hosts={}", hosts);
-    co_await ss.local().mark_excluded(hosts);
+    co_await ss.local().run_with_no_api_lock([hosts = std::move(hosts)] (service::storage_service& ss) {
+        return ss.mark_excluded(hosts);
+    });
    co_return json_void();
 }

@@ -1731,7 +1733,9 @@ rest_create_vnode_tablet_migration(http_context& ctx, sharded<service::storage_s
        throw std::runtime_error("vnodes-to-tablets migration requires all nodes to support the VNODES_TO_TABLETS_MIGRATIONS cluster feature");
    }
    auto keyspace = validate_keyspace(ctx, req);
-    co_await ss.local().prepare_for_tablets_migration(keyspace);
+    co_await ss.local().run_with_no_api_lock([keyspace] (service::storage_service& ss) {
+        return ss.prepare_for_tablets_migration(keyspace);
+    });
    co_return json_void();
 }

@@ -1743,7 +1747,9 @@ rest_get_vnode_tablet_migration(http_context& ctx, sharded<service::storage_serv
        throw std::runtime_error("vnodes-to-tablets migration requires all nodes to support the VNODES_TO_TABLETS_MIGRATIONS cluster feature");
    }
    auto keyspace = validate_keyspace(ctx, req);
-    auto status = co_await ss.local().get_tablets_migration_status_with_node_details(keyspace);
+    auto status = co_await ss.local().run_with_no_api_lock([keyspace] (service::storage_service& ss) {
+        return ss.get_tablets_migration_status_with_node_details(keyspace);
+    });

    ss::vnode_tablet_migration_status result;
    result.keyspace = status.keyspace;
@@ -1768,7 +1774,9 @@ rest_set_vnode_tablet_migration_node_storage_mode(http_context& ctx, sharded<ser
    }
    auto mode_str = req->get_query_param("intended_mode");
    auto mode = service::intended_storage_mode_from_string(mode_str);
-    co_await ss.local().set_node_intended_storage_mode(mode);
+    co_await ss.local().run_with_no_api_lock([mode] (service::storage_service& ss) {
+        return ss.set_node_intended_storage_mode(mode);
+    });
    co_return json_void();
 }

@@ -1782,7 +1790,9 @@ rest_finalize_vnode_tablet_migration(http_context& ctx, sharded<service::storage
    auto keyspace = validate_keyspace(ctx, req);
    validate_keyspace(ctx, keyspace);

-    co_await ss.local().finalize_tablets_migration(keyspace);
+    co_await ss.local().run_with_no_api_lock([keyspace] (service::storage_service& ss) {
+        return ss.finalize_tablets_migration(keyspace);
+    });
    co_return json_void();
 }

@@ -1859,90 +1869,106 @@ rest_bind(FuncType func, BindArgs&... args) {
    return std::bind_front(func, std::ref(args)...);
 }

+// Wraps an async REST handler so that every call holds the shard-local
+// storage_service async gate until the handler's future resolves, so stop()
+// drains in-flight requests before teardown. Synchronous handlers don't yield and need no gate.
+static seastar::httpd::future_json_function
+gated(sharded<service::storage_service>& ss, seastar::httpd::future_json_function fn) {
+    return [fn = std::move(fn), &ss](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
+        auto holder = ss.local().hold_async_gate(); // stays alive until the co_await below completes
+        co_return co_await fn(std::move(req));
+    };
+}
+
+static seastar::httpd::json_request_function
+gated(sharded<service::storage_service>&, seastar::httpd::json_request_function fn) {
+    return fn; // synchronous handler: passed through unchanged, no gate hold is taken
+}
+
 void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, sharded<db::snapshot_ctl>& ssc, service::raft_group0_client& group0_client) {
-    ss::get_token_endpoint.set(r, rest_bind(rest_get_token_endpoint, ctx, ss));
-    ss::get_release_version.set(r, rest_bind(rest_get_release_version, ss));
-    ss::get_scylla_release_version.set(r, rest_bind(rest_get_scylla_release_version, ss));
-    ss::get_schema_version.set(r, rest_bind(rest_get_schema_version, ss));
-    ss::get_range_to_endpoint_map.set(r, rest_bind(rest_get_range_to_endpoint_map, ctx, ss));
-    ss::get_pending_range_to_endpoint_map.set(r, rest_bind(rest_get_pending_range_to_endpoint_map, ctx));
-    ss::describe_ring.set(r, rest_bind(rest_describe_ring, ctx, ss));
-    ss::get_current_generation_number.set(r, rest_bind(rest_get_current_generation_number, ss));
-    ss::get_natural_endpoints.set(r, rest_bind(rest_get_natural_endpoints, ctx, ss));
-    ss::get_natural_endpoints_v2.set(r, rest_bind(rest_get_natural_endpoints_v2, ctx, ss));
-    ss::cdc_streams_check_and_repair.set(r, rest_bind(rest_cdc_streams_check_and_repair, ss));
-    ss::cleanup_all.set(r, rest_bind(rest_cleanup_all, ctx, ss));
-    ss::reset_cleanup_needed.set(r, rest_bind(rest_reset_cleanup_needed, ctx, ss));
-    ss::force_flush.set(r, rest_bind(rest_force_flush, ctx));
-    ss::force_keyspace_flush.set(r, rest_bind(rest_force_keyspace_flush, ctx));
-    ss::decommission.set(r, rest_bind(rest_decommission, ss, ssc));
-    ss::logstor_compaction.set(r, rest_bind(rest_logstor_compaction, ctx));
-    ss::logstor_flush.set(r, rest_bind(rest_logstor_flush, ctx));
-    ss::move.set(r, rest_bind(rest_move, ss));
-    ss::remove_node.set(r, rest_bind(rest_remove_node, ss));
-    ss::exclude_node.set(r, rest_bind(rest_exclude_node, ss));
-    ss::get_removal_status.set(r, rest_bind(rest_get_removal_status, ss));
-    ss::force_remove_completion.set(r, rest_bind(rest_force_remove_completion, ss));
-    ss::set_logging_level.set(r, rest_bind(rest_set_logging_level));
-    ss::get_logging_levels.set(r, rest_bind(rest_get_logging_levels));
-    ss::get_operation_mode.set(r, rest_bind(rest_get_operation_mode, ss));
-    ss::is_starting.set(r, rest_bind(rest_is_starting, ss));
-    ss::get_drain_progress.set(r, rest_bind(rest_get_drain_progress, ss));
-    ss::drain.set(r, rest_bind(rest_drain, ss));
-    ss::stop_gossiping.set(r, rest_bind(rest_stop_gossiping, ss));
-    ss::start_gossiping.set(r, rest_bind(rest_start_gossiping, ss));
-    ss::is_gossip_running.set(r, rest_bind(rest_is_gossip_running, ss));
-    ss::stop_daemon.set(r, rest_bind(rest_stop_daemon));
-    ss::is_initialized.set(r, rest_bind(rest_is_initialized, ss));
-    ss::join_ring.set(r, rest_bind(rest_join_ring));
-    ss::is_joined.set(r, rest_bind(rest_is_joined, ss));
-    ss::is_incremental_backups_enabled.set(r, rest_bind(rest_is_incremental_backups_enabled, ctx));
-    ss::set_incremental_backups_enabled.set(r, rest_bind(rest_set_incremental_backups_enabled, ctx));
-    ss::rebuild.set(r, rest_bind(rest_rebuild, ss));
-    ss::bulk_load.set(r, rest_bind(rest_bulk_load));
-    ss::bulk_load_async.set(r, rest_bind(rest_bulk_load_async));
-    ss::reschedule_failed_deletions.set(r, rest_bind(rest_reschedule_failed_deletions));
-    ss::sample_key_range.set(r, rest_bind(rest_sample_key_range));
-    ss::reset_local_schema.set(r, rest_bind(rest_reset_local_schema, ss));
-    ss::set_trace_probability.set(r, rest_bind(rest_set_trace_probability));
-    ss::get_trace_probability.set(r, rest_bind(rest_get_trace_probability));
-    ss::get_slow_query_info.set(r, rest_bind(rest_get_slow_query_info));
-    ss::set_slow_query.set(r, rest_bind(rest_set_slow_query));
-    ss::deliver_hints.set(r, rest_bind(rest_deliver_hints));
-    ss::get_cluster_name.set(r, rest_bind(rest_get_cluster_name, ss));
-    ss::get_partitioner_name.set(r, rest_bind(rest_get_partitioner_name, ss));
-    ss::get_tombstone_warn_threshold.set(r, rest_bind(rest_get_tombstone_warn_threshold));
-    ss::set_tombstone_warn_threshold.set(r, rest_bind(rest_set_tombstone_warn_threshold));
-    ss::get_tombstone_failure_threshold.set(r, rest_bind(rest_get_tombstone_failure_threshold));
-    ss::set_tombstone_failure_threshold.set(r, rest_bind(rest_set_tombstone_failure_threshold));
-    ss::get_batch_size_failure_threshold.set(r, rest_bind(rest_get_batch_size_failure_threshold));
-    ss::set_batch_size_failure_threshold.set(r, rest_bind(rest_set_batch_size_failure_threshold));
-    ss::set_hinted_handoff_throttle_in_kb.set(r, rest_bind(rest_set_hinted_handoff_throttle_in_kb));
-    ss::get_exceptions.set(r, rest_bind(rest_get_exceptions, ss));
-    ss::get_total_hints_in_progress.set(r, rest_bind(rest_get_total_hints_in_progress));
-    ss::get_total_hints.set(r, rest_bind(rest_get_total_hints));
-    ss::get_ownership.set(r, rest_bind(rest_get_ownership, ctx, ss));
-    ss::get_effective_ownership.set(r, rest_bind(rest_get_effective_ownership, ctx, ss));
-    ss::retrain_dict.set(r, rest_bind(rest_retrain_dict, ctx, ss, group0_client));
-    ss::estimate_compression_ratios.set(r, rest_bind(rest_estimate_compression_ratios, ctx, ss));
-    ss::sstable_info.set(r, rest_bind(rest_sstable_info, ctx));
-    ss::logstor_info.set(r, rest_bind(rest_logstor_info, ctx));
-    ss::reload_raft_topology_state.set(r, rest_bind(rest_reload_raft_topology_state, ss, group0_client));
-    ss::upgrade_to_raft_topology.set(r, rest_bind(rest_upgrade_to_raft_topology, ss));
-    ss::raft_topology_upgrade_status.set(r, rest_bind(rest_raft_topology_upgrade_status, ss));
-    ss::raft_topology_get_cmd_status.set(r, rest_bind(rest_raft_topology_get_cmd_status, ss));
-    ss::move_tablet.set(r, rest_bind(rest_move_tablet, ctx, ss));
-    ss::add_tablet_replica.set(r, rest_bind(rest_add_tablet_replica, ctx, ss));
-    ss::del_tablet_replica.set(r, rest_bind(rest_del_tablet_replica, ctx, ss));
-    ss::repair_tablet.set(r, rest_bind(rest_repair_tablet, ctx, ss));
-    ss::tablet_balancing_enable.set(r, rest_bind(rest_tablet_balancing_enable, ss));
-    ss::create_vnode_tablet_migration.set(r, rest_bind(rest_create_vnode_tablet_migration, ctx, ss));
-    ss::get_vnode_tablet_migration.set(r, rest_bind(rest_get_vnode_tablet_migration, ctx, ss));
-    ss::set_vnode_tablet_migration_node_storage_mode.set(r, rest_bind(rest_set_vnode_tablet_migration_node_storage_mode, ctx, ss));
-    ss::finalize_vnode_tablet_migration.set(r, rest_bind(rest_finalize_vnode_tablet_migration, ctx, ss));
-    ss::quiesce_topology.set(r, rest_bind(rest_quiesce_topology, ss));
-    sp::get_schema_versions.set(r, rest_bind(rest_get_schema_versions, ss));
-    ss::drop_quarantined_sstables.set(r, rest_bind(rest_drop_quarantined_sstables, ctx, ss));
+    ss::get_token_endpoint.set(r, gated(ss, rest_bind(rest_get_token_endpoint, ctx, ss)));
+    ss::get_release_version.set(r, gated(ss, rest_bind(rest_get_release_version, ss)));
+    ss::get_scylla_release_version.set(r, gated(ss, rest_bind(rest_get_scylla_release_version, ss)));
+    ss::get_schema_version.set(r, gated(ss, rest_bind(rest_get_schema_version, ss)));
+    ss::get_range_to_endpoint_map.set(r, gated(ss, rest_bind(rest_get_range_to_endpoint_map, ctx, ss)));
+    ss::get_pending_range_to_endpoint_map.set(r, gated(ss, rest_bind(rest_get_pending_range_to_endpoint_map, ctx)));
+    ss::describe_ring.set(r, gated(ss, rest_bind(rest_describe_ring, ctx, ss)));
+    ss::get_current_generation_number.set(r, gated(ss, rest_bind(rest_get_current_generation_number, ss)));
+    ss::get_natural_endpoints.set(r, gated(ss, rest_bind(rest_get_natural_endpoints, ctx, ss)));
+    ss::get_natural_endpoints_v2.set(r, gated(ss, rest_bind(rest_get_natural_endpoints_v2, ctx, ss)));
+    ss::cdc_streams_check_and_repair.set(r, gated(ss, rest_bind(rest_cdc_streams_check_and_repair, ss)));
+    ss::cleanup_all.set(r, gated(ss, rest_bind(rest_cleanup_all, ctx, ss)));
+    ss::reset_cleanup_needed.set(r, gated(ss, rest_bind(rest_reset_cleanup_needed, ctx, ss)));
+    ss::force_flush.set(r, gated(ss, rest_bind(rest_force_flush, ctx)));
+    ss::force_keyspace_flush.set(r, gated(ss, rest_bind(rest_force_keyspace_flush, ctx)));
+    ss::decommission.set(r, gated(ss, rest_bind(rest_decommission, ss, ssc)));
+    ss::logstor_compaction.set(r, gated(ss, rest_bind(rest_logstor_compaction, ctx)));
+    ss::logstor_flush.set(r, gated(ss, rest_bind(rest_logstor_flush, ctx)));
+    ss::move.set(r, gated(ss, rest_bind(rest_move, ss)));
+    ss::remove_node.set(r, gated(ss, rest_bind(rest_remove_node, ss)));
+    ss::exclude_node.set(r, gated(ss, rest_bind(rest_exclude_node, ss)));
+    ss::get_removal_status.set(r, gated(ss, rest_bind(rest_get_removal_status, ss)));
+    ss::force_remove_completion.set(r, gated(ss, rest_bind(rest_force_remove_completion, ss)));
+    ss::set_logging_level.set(r, gated(ss, rest_bind(rest_set_logging_level)));
+    ss::get_logging_levels.set(r, gated(ss, rest_bind(rest_get_logging_levels)));
+    ss::get_operation_mode.set(r, gated(ss, rest_bind(rest_get_operation_mode, ss)));
+    ss::is_starting.set(r, gated(ss, rest_bind(rest_is_starting, ss)));
+    ss::get_drain_progress.set(r, gated(ss, rest_bind(rest_get_drain_progress, ss)));
+    ss::drain.set(r, gated(ss, rest_bind(rest_drain, ss)));
+    ss::stop_gossiping.set(r, gated(ss, rest_bind(rest_stop_gossiping, ss)));
+    ss::start_gossiping.set(r, gated(ss, rest_bind(rest_start_gossiping, ss)));
+    ss::is_gossip_running.set(r, gated(ss, rest_bind(rest_is_gossip_running, ss)));
+    ss::stop_daemon.set(r, gated(ss, rest_bind(rest_stop_daemon)));
+    ss::is_initialized.set(r, gated(ss, rest_bind(rest_is_initialized, ss)));
+    ss::join_ring.set(r, gated(ss, rest_bind(rest_join_ring)));
+    ss::is_joined.set(r, gated(ss, rest_bind(rest_is_joined, ss)));
+    ss::is_incremental_backups_enabled.set(r, gated(ss, rest_bind(rest_is_incremental_backups_enabled, ctx)));
+    ss::set_incremental_backups_enabled.set(r, gated(ss, rest_bind(rest_set_incremental_backups_enabled, ctx)));
+    ss::rebuild.set(r, gated(ss, rest_bind(rest_rebuild, ss)));
+    ss::bulk_load.set(r, gated(ss, rest_bind(rest_bulk_load)));
+    ss::bulk_load_async.set(r, gated(ss, rest_bind(rest_bulk_load_async)));
+    ss::reschedule_failed_deletions.set(r, gated(ss, rest_bind(rest_reschedule_failed_deletions)));
+    ss::sample_key_range.set(r, gated(ss, rest_bind(rest_sample_key_range)));
+    ss::reset_local_schema.set(r, gated(ss, rest_bind(rest_reset_local_schema, ss)));
+    ss::set_trace_probability.set(r, gated(ss, rest_bind(rest_set_trace_probability)));
+    ss::get_trace_probability.set(r, gated(ss, rest_bind(rest_get_trace_probability)));
+    ss::get_slow_query_info.set(r, gated(ss, rest_bind(rest_get_slow_query_info)));
+    ss::set_slow_query.set(r, gated(ss, rest_bind(rest_set_slow_query)));
+    ss::deliver_hints.set(r, gated(ss, rest_bind(rest_deliver_hints)));
+    ss::get_cluster_name.set(r, gated(ss, rest_bind(rest_get_cluster_name, ss)));
+    ss::get_partitioner_name.set(r, gated(ss, rest_bind(rest_get_partitioner_name, ss)));
+    ss::get_tombstone_warn_threshold.set(r, gated(ss, rest_bind(rest_get_tombstone_warn_threshold)));
+    ss::set_tombstone_warn_threshold.set(r, gated(ss, rest_bind(rest_set_tombstone_warn_threshold)));
+    ss::get_tombstone_failure_threshold.set(r, gated(ss, rest_bind(rest_get_tombstone_failure_threshold)));
+    ss::set_tombstone_failure_threshold.set(r, gated(ss, rest_bind(rest_set_tombstone_failure_threshold)));
+    ss::get_batch_size_failure_threshold.set(r, gated(ss, rest_bind(rest_get_batch_size_failure_threshold)));
+    ss::set_batch_size_failure_threshold.set(r, gated(ss, rest_bind(rest_set_batch_size_failure_threshold)));
+    ss::set_hinted_handoff_throttle_in_kb.set(r, gated(ss, rest_bind(rest_set_hinted_handoff_throttle_in_kb)));
+    ss::get_exceptions.set(r, gated(ss, rest_bind(rest_get_exceptions, ss)));
+    ss::get_total_hints_in_progress.set(r, gated(ss, rest_bind(rest_get_total_hints_in_progress)));
+    ss::get_total_hints.set(r, gated(ss, rest_bind(rest_get_total_hints)));
+    ss::get_ownership.set(r, gated(ss, rest_bind(rest_get_ownership, ctx, ss)));
+    ss::get_effective_ownership.set(r, gated(ss, rest_bind(rest_get_effective_ownership, ctx, ss)));
+    ss::retrain_dict.set(r, gated(ss, rest_bind(rest_retrain_dict, ctx, ss, group0_client)));
+    ss::estimate_compression_ratios.set(r, gated(ss, rest_bind(rest_estimate_compression_ratios, ctx, ss)));
+    ss::sstable_info.set(r, gated(ss, rest_bind(rest_sstable_info, ctx)));
+    ss::logstor_info.set(r, gated(ss, rest_bind(rest_logstor_info, ctx)));
+    ss::reload_raft_topology_state.set(r, gated(ss, rest_bind(rest_reload_raft_topology_state, ss, group0_client)));
+    ss::upgrade_to_raft_topology.set(r, gated(ss, rest_bind(rest_upgrade_to_raft_topology, ss)));
+    ss::raft_topology_upgrade_status.set(r, gated(ss, rest_bind(rest_raft_topology_upgrade_status, ss)));
+    ss::raft_topology_get_cmd_status.set(r, gated(ss, rest_bind(rest_raft_topology_get_cmd_status, ss)));
+    ss::move_tablet.set(r, gated(ss, rest_bind(rest_move_tablet, ctx, ss)));
+    ss::add_tablet_replica.set(r, gated(ss, rest_bind(rest_add_tablet_replica, ctx, ss)));
+    ss::del_tablet_replica.set(r, gated(ss, rest_bind(rest_del_tablet_replica, ctx, ss)));
+    ss::repair_tablet.set(r, gated(ss, rest_bind(rest_repair_tablet, ctx, ss)));
+    ss::tablet_balancing_enable.set(r, gated(ss, rest_bind(rest_tablet_balancing_enable, ss)));
+    ss::create_vnode_tablet_migration.set(r, gated(ss, rest_bind(rest_create_vnode_tablet_migration, ctx, ss)));
+    ss::get_vnode_tablet_migration.set(r, gated(ss, rest_bind(rest_get_vnode_tablet_migration, ctx, ss)));
+    ss::set_vnode_tablet_migration_node_storage_mode.set(r, gated(ss, rest_bind(rest_set_vnode_tablet_migration_node_storage_mode, ctx, ss)));
+    ss::finalize_vnode_tablet_migration.set(r, gated(ss, rest_bind(rest_finalize_vnode_tablet_migration, ctx, ss)));
+    ss::quiesce_topology.set(r, gated(ss, rest_bind(rest_quiesce_topology, ss)));
+    sp::get_schema_versions.set(r, gated(ss, rest_bind(rest_get_schema_versions, ss)));
+    ss::drop_quarantined_sstables.set(r, gated(ss, rest_bind(rest_drop_quarantined_sstables, ctx, ss)));
 }

 void unset_storage_service(http_context& ctx, routes& r) {
--- a/audit/audit.cc
+++ b/audit/audit.cc
@@ -113,8 +113,8 @@ static category_set parse_audit_categories(const sstring& data) {
    return result;
 }

-static std::map<sstring, std::set<sstring>> parse_audit_tables(const sstring& data) {
-    std::map<sstring, std::set<sstring>> result;
+static audit::audited_tables_t parse_audit_tables(const sstring& data) {
+    audit::audited_tables_t result;
    if (!data.empty()) {
        std::vector<sstring> tokens;
        boost::split(tokens, data, boost::is_any_of(","));
@@ -139,8 +139,8 @@ static std::map<sstring, std::set<sstring>> parse_audit_tables(const sstring& da
    return result;
 }

-static std::set<sstring> parse_audit_keyspaces(const sstring& data) {
-    std::set<sstring> result;
+static audit::audited_keyspaces_t parse_audit_keyspaces(const sstring& data) {
+    audit::audited_keyspaces_t result;
    if (!data.empty()) {
        std::vector<sstring> tokens;
        boost::split(tokens, data, boost::is_any_of(","));
@@ -156,8 +156,8 @@ audit::audit(locator::shared_token_metadata& token_metadata,
             cql3::query_processor& qp,
             service::migration_manager& mm,
             std::set<sstring>&& audit_modes,
-             std::set<sstring>&& audited_keyspaces,
-             std::map<sstring, std::set<sstring>>&& audited_tables,
+             audited_keyspaces_t&& audited_keyspaces,
+             audited_tables_t&& audited_tables,
             category_set&& audited_categories,
             const db::config& cfg)
    : _token_metadata(token_metadata)
@@ -165,8 +165,8 @@ audit::audit(locator::shared_token_metadata& token_metadata,
    , _audited_tables(std::move(audited_tables))
    , _audited_categories(std::move(audited_categories))
    , _cfg(cfg)
-    , _cfg_keyspaces_observer(cfg.audit_keyspaces.observe([this] (sstring const& new_value){ update_config<std::set<sstring>>(new_value, parse_audit_keyspaces, _audited_keyspaces); }))
-    , _cfg_tables_observer(cfg.audit_tables.observe([this] (sstring const& new_value){ update_config<std::map<sstring, std::set<sstring>>>(new_value, parse_audit_tables, _audited_tables); }))
+    , _cfg_keyspaces_observer(cfg.audit_keyspaces.observe([this] (sstring const& new_value){ update_config<audited_keyspaces_t>(new_value, parse_audit_keyspaces, _audited_keyspaces); }))
+    , _cfg_tables_observer(cfg.audit_tables.observe([this] (sstring const& new_value){ update_config<audited_tables_t>(new_value, parse_audit_tables, _audited_tables); }))
    , _cfg_categories_observer(cfg.audit_categories.observe([this] (sstring const& new_value){ update_config<category_set>(new_value, parse_audit_categories, _audited_categories); }))
 {
    _storage_helper_ptr = create_storage_helper(std::move(audit_modes), qp, mm);
@@ -181,8 +181,8 @@ future<> audit::start_audit(const db::config& cfg, sharded<locator::shared_token
        return make_ready_future<>();
    }
    category_set audited_categories = parse_audit_categories(cfg.audit_categories());
-    std::map<sstring, std::set<sstring>> audited_tables = parse_audit_tables(cfg.audit_tables());
-    std::set<sstring> audited_keyspaces = parse_audit_keyspaces(cfg.audit_keyspaces());
+    audit::audited_tables_t audited_tables = parse_audit_tables(cfg.audit_tables());
+    audit::audited_keyspaces_t audited_keyspaces = parse_audit_keyspaces(cfg.audit_keyspaces());

    logger.info("Audit is enabled. Auditing to: \"{}\", with the following categories: \"{}\", keyspaces: \"{}\", and tables: \"{}\"",
                cfg.audit(), cfg.audit_categories(), cfg.audit_keyspaces(), cfg.audit_tables());
@@ -194,22 +194,36 @@ future<> audit::start_audit(const db::config& cfg, sharded<locator::shared_token
                                  std::move(audited_keyspaces),
                                  std::move(audited_tables),
                                  std::move(audited_categories),
-                                  std::cref(cfg))
-    .then([&cfg] {
-        if (!audit_instance().local_is_initialized()) {
-            return make_ready_future<>();
-        }
-        return audit_instance().invoke_on_all([&cfg] (audit& local_audit) {
-            return local_audit.start(cfg);
+                                  std::cref(cfg));
+}
+
+future<> audit::start_storage(const db::config& cfg) { // starts the storage helper of every audit instance
+    if (!audit_instance().local_is_initialized()) {
+        return make_ready_future<>(); // audit instance not initialized here: nothing to start
+    }
+    return audit_instance().invoke_on_all([&cfg] (audit& local_audit) {
+        return local_audit._storage_helper_ptr->start(cfg).then([&local_audit] {
+            local_audit._storage_running = true; // set only once the helper's start() future has resolved
         });
     });
 }

+future<> audit::stop_storage() { // stops the storage helper of every audit instance
+    if (!audit_instance().local_is_initialized()) {
+        return make_ready_future<>(); // audit instance not initialized here: nothing to stop
+    }
+    return audit_instance().invoke_on_all([] (audit& local_audit) {
+        local_audit._storage_running = false; // cleared before the helper's stop() is called
+        return local_audit._storage_helper_ptr->stop();
+    });
+}
+
 future<> audit::stop_audit() {
    if (!audit_instance().local_is_initialized()) {
        return make_ready_future<>();
    }
    return audit::audit::audit_instance().invoke_on_all([] (auto& local_audit) {
+        SCYLLA_ASSERT(!local_audit._storage_running);
        return local_audit.shutdown();
    }).then([] {
        return audit::audit::audit_instance().stop();
@@ -223,14 +237,6 @@ audit_info_ptr audit::create_audit_info(statement_category cat, const sstring& k
    return std::make_unique<audit_info>(cat, keyspace, table, batch);
 }

-future<> audit::start(const db::config& cfg) {
-    return _storage_helper_ptr->start(cfg);
-}
-
-future<> audit::stop() {
-    return _storage_helper_ptr->stop();
-}
-
 future<> audit::shutdown() {
    return make_ready_future<>();
 }
@@ -241,6 +247,12 @@ future<> audit::log(const audit_info& audit_info, const service::client_state& c
    const sstring& username = client_state.user() ? client_state.user()->name.value_or(anonymous_username) : no_username;
    socket_address client_ip = client_state.get_client_address().addr();
    socket_address node_ip = _token_metadata.get()->get_topology().my_address().addr();
+    if (!_storage_running) {
+        on_internal_error_noexcept(logger, fmt::format("Audit log dropped (storage not ready): node_ip {} category {} cl {} error {} keyspace {} query '{}' client_ip {} table {} username {}",
+            node_ip, audit_info.category_string(), cl, error, audit_info.keyspace(),
+            audit_info.query(), client_ip, audit_info.table(), username));
+        return make_ready_future<>();
+    }
    if (logger.is_enabled(logging::log_level::debug)) {
        logger.debug("Log written: node_ip {} category {} cl {} error {} keyspace {} query '{}' client_ip {} table {} username {}",
            node_ip, audit_info.category_string(), cl, error, audit_info.keyspace(),
@@ -286,6 +298,11 @@ future<> inspect(const audit_info_alternator& ai, const service::client_state& c

 future<> audit::log_login(const sstring& username, socket_address client_ip, bool error) noexcept {
    socket_address node_ip = _token_metadata.get()->get_topology().my_address().addr();
+    if (!_storage_running) {
+        on_internal_error_noexcept(logger, fmt::format("Audit login log dropped (storage not ready): node_ip {} client_ip {} username {} error {}",
+            node_ip, client_ip, username, error ? "true" : "false"));
+        return make_ready_future<>();
+    }
    if (logger.is_enabled(logging::log_level::debug)) {
        logger.debug("Login log written: node_ip {}, client_ip {}, username {}, error {}",
            node_ip, client_ip, username, error ? "true" : "false");
@@ -304,7 +321,7 @@ future<> inspect_login(const sstring& username, socket_address client_ip, bool e
    return audit::local_audit_instance().log_login(username, client_ip, error);
 }

-bool audit::should_log_table(const sstring& keyspace, const sstring& name) const {
+bool audit::should_log_table(std::string_view keyspace, std::string_view name) const { // string_view lookups use the transparent comparators of audited_tables_t
    auto keyspace_it = _audited_tables.find(keyspace);
    return keyspace_it != _audited_tables.cend() && keyspace_it->second.find(name) != keyspace_it->second.cend();
 }
@@ -319,8 +336,8 @@ bool audit::will_log(statement_category cat, std::string_view keyspace, std::str
    // so it is logged whenever the category matches.
    return _audited_categories.contains(cat)
           && (keyspace.empty()
-                         || _audited_keyspaces.find(sstring(keyspace)) != _audited_keyspaces.cend()
-                         || should_log_table(sstring(keyspace), sstring(table))
+                         || _audited_keyspaces.find(keyspace) != _audited_keyspaces.cend()
+                         || should_log_table(keyspace, table)
                         || cat == statement_category::AUTH
                         || cat == statement_category::ADMIN
                         || cat == statement_category::DCL);
--- a/audit/audit.hh
+++ b/audit/audit.hh
@@ -129,13 +129,19 @@ public:
 class storage_helper;

 class audit final : public seastar::async_sharded_service<audit> {
+public:
+    // Transparent comparator (std::less<>) enables heterogeneous lookup with
+    // string_view keys.
+    using audited_keyspaces_t = std::set<sstring, std::less<>>;
+    using audited_tables_t = std::map<sstring, std::set<sstring, std::less<>>, std::less<>>;
+private:
    locator::shared_token_metadata& _token_metadata;
-    std::set<sstring> _audited_keyspaces;
-    // Maps keyspace name to set of table names in that keyspace
-    std::map<sstring, std::set<sstring>> _audited_tables;
+    audited_keyspaces_t _audited_keyspaces;
+    audited_tables_t _audited_tables;
    category_set _audited_categories;

    std::unique_ptr<storage_helper> _storage_helper_ptr;
+    bool _storage_running = false;

    const db::config& _cfg;
    utils::observer<sstring> _cfg_keyspaces_observer;
@@ -145,7 +151,7 @@ class audit final : public seastar::async_sharded_service<audit> {
    template<class T>
    void update_config(const sstring & new_value, std::function<T(const sstring&)> parse_func, T& cfg_parameter);

-    bool should_log_table(const sstring& keyspace, const sstring& name) const;
+    bool should_log_table(std::string_view keyspace, std::string_view name) const;
 public:
    static seastar::sharded<audit>& audit_instance() {
        // FIXME: leaked intentionally to avoid shutdown problems, see #293
@@ -158,19 +164,19 @@ public:
        return audit_instance().local();
    }
    static future<> start_audit(const db::config& cfg, sharded<locator::shared_token_metadata>& stm, sharded<cql3::query_processor>& qp, sharded<service::migration_manager>& mm);
+    static future<> start_storage(const db::config& cfg);
+    static future<> stop_storage();
    static future<> stop_audit();
    static audit_info_ptr create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table, bool batch = false);
    audit(locator::shared_token_metadata& stm,
          cql3::query_processor& qp,
          service::migration_manager& mm,
          std::set<sstring>&& audit_modes,
-          std::set<sstring>&& audited_keyspaces,
-          std::map<sstring, std::set<sstring>>&& audited_tables,
+          audited_keyspaces_t&& audited_keyspaces,
+          audited_tables_t&& audited_tables,
          category_set&& audited_categories,
          const db::config& cfg);
    ~audit();
-    future<> start(const db::config& cfg);
-    future<> stop();
    future<> shutdown();
    bool should_log(const audit_info& audit_info) const;
    bool will_log(statement_category cat, std::string_view keyspace = {}, std::string_view table = {}) const;
--- a/auth/cache.cc
+++ b/auth/cache.cc
@@ -185,24 +185,14 @@ future<lw_shared_ptr<cache::role_record>> cache::fetch_role(const role_name_t& r
        static const sstring q = format("SELECT role, name, value FROM {}.{} WHERE role = ?", db::system_keyspace::NAME, ROLE_ATTRIBUTES_CF);
        auto rs = co_await fetch(q);
        for (const auto& r : *rs) {
+            if (!r.has("value")) {
+                continue;
+            }
            rec->attributes[r.get_as<sstring>("name")] =
                    r.get_as<sstring>("value");
            co_await coroutine::maybe_yield();
        }
    }
-    // permissions
-    {
-        static const sstring q = format("SELECT role, resource, permissions FROM {}.{} WHERE role = ?", db::system_keyspace::NAME, PERMISSIONS_CF);
-        auto rs = co_await fetch(q);
-        for (const auto& r : *rs) {
-            auto resource = r.get_as<sstring>("resource");
-            auto perms_strings = r.get_set<sstring>("permissions");
-            std::unordered_set<sstring> perms_set(perms_strings.begin(), perms_strings.end());
-            auto pset = permissions::from_strings(perms_set);
-            rec->permissions[std::move(resource)] = std::move(pset);
-            co_await coroutine::maybe_yield();
-        }
-    }
    co_return rec;
 }

--- a/auth/cache.hh
+++ b/auth/cache.hh
@@ -44,7 +44,6 @@ public:
        std::unordered_set<role_name_t> members;
        sstring salted_hash;
        std::unordered_map<sstring, sstring, sstring_hash, sstring_eq> attributes;
-        std::unordered_map<sstring, permission_set, sstring_hash, sstring_eq> permissions;
    private:
        friend cache;
        // cached permissions include effects of role's inheritance
--- a/auth/default_authorizer.cc
+++ b/auth/default_authorizer.cc
@@ -76,7 +76,11 @@ default_authorizer::authorize(const role_or_anonymous& maybe_role, const resourc
    if (results->empty()) {
        co_return permissions::NONE;
    }
-    co_return permissions::from_strings(results->one().get_set<sstring>(PERMISSIONS_NAME));
+    const auto& row = results->one();
+    if (!row.has(PERMISSIONS_NAME)) {
+        co_return permissions::NONE;
+    }
+    co_return permissions::from_strings(row.get_set<sstring>(PERMISSIONS_NAME));
 }

 future<>
--- a/auth/ldap_role_manager.cc
+++ b/auth/ldap_role_manager.cc
@@ -258,13 +258,11 @@ future<> ldap_role_manager::start() {
            } catch (const seastar::sleep_aborted&) {
                co_return; // ignore
            }
-            co_await _cache.container().invoke_on_all([] (cache& c) -> future<> {
-                try {
-                    co_await c.reload_all_permissions();
-                } catch (...) {
-                    mylog.warn("Cache reload all permissions failed: {}", std::current_exception());
-                }
-            });
+            try {
+                co_await _cache.reload_all_permissions();
+            } catch (...) {
+                mylog.warn("Cache reload all permissions failed: {}", std::current_exception());
+            }
        }
    });
    return _std_mgr.start();
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -157,15 +157,12 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s
            return create_legacy_keyspace_if_missing(mm);
        });
    }
-    co_await _role_manager->start();
-    if (this_shard_id() == 0) {
-        // Role manager and password authenticator have this odd startup
-        // mechanism where they asynchronously create the superuser role
-        // in the background. Correct password creation depends on role
-        // creation therefore we need to wait here.
-        co_await _role_manager->ensure_superuser_is_created();
-    }
-    co_await when_all_succeed(_authorizer->start(), _authenticator->start()).discard_result();
+    // Authorizer must be started before the permission loader is set,
+    // because the loader calls _authorizer->authorize().
+    // The loader must be set before starting the role manager, because
+    // LDAP role manager starts a pruner fiber that calls
+    // reload_all_permissions() which asserts _permission_loader is set.
+    co_await _authorizer->start();
    if (!_used_by_maintenance_socket) {
        // Maintenance socket mode can't cache permissions because it has
        // different authorizer. We can't mix cached permissions, they could be
@@ -174,12 +171,27 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s
                &service::get_uncached_permissions,
                this, std::placeholders::_1, std::placeholders::_2));
    }
+    co_await _role_manager->start();
+    if (this_shard_id() == 0) {
+        // Role manager and password authenticator have this odd startup
+        // mechanism where they asynchronously create the superuser role
+        // in the background. Correct password creation depends on role
+        // creation therefore we need to wait here.
+        co_await _role_manager->ensure_superuser_is_created();
+    }
+    // Authenticator must be started after ensure_superuser_is_created()
+    // because password_authenticator queries system.roles for the
+    // superuser entry created by the role manager.
+    co_await _authenticator->start();
 }

 future<> service::stop() {
    _as.request_abort();
+    // Reverse of start() order.
+    co_await _authenticator->stop();
+    co_await _role_manager->stop();
    _cache.set_permission_loader(nullptr);
-    return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop()).discard_result();
+    co_await _authorizer->stop();
 }

 future<> service::ensure_superuser_is_created() {
--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -1625,7 +1625,7 @@ struct process_change_visitor {
        if (_enable_updating_state) {
            if (_request_options.alternator && _alternator_schema_has_no_clustering_key && _clustering_row_states.empty()) {
                // Alternator's table can be with or without clustering key. If the clustering key exists,
-                // delete request will be `clustered_row_delete` and will be hanlded there.
+                // delete request will be `clustered_row_delete` and will be handled there.
                // If the clustering key doesn't exist, delete request will be `partition_delete` and will be handled here.
                // The no-clustering-key case is slightly tricky, because insert of such item is handled by `clustered_row_cells`
                // and has some value as clustering_key (the value currently seems to be empty bytes object).
@@ -1933,7 +1933,7 @@ public:
        if (_options.alternator && !_alternator_clustering_keys_to_ignore.empty()) {
            // we filter mutations for Alternator's changes here.
            // We do it per mutation object (user might submit a batch of those in one go
-            // and some might be splitted because of different timestamps),
+            // and some might be split because of different timestamps),
            // ignore key set is cleared afterwards.
            // If single mutation object contains two separate changes to the same row
            // and at least one of them is ignored, all of them will be ignored.
--- a/cdc/split.cc
+++ b/cdc/split.cc
@@ -267,7 +267,7 @@ struct extract_row_visitor {
            visit_collection(v);
        },
        [&] (const abstract_type& o) {
-            throw std::runtime_error(format("extract_changes: unknown collection type:", o.name()));
+            throw std::runtime_error(format("extract_changes: unknown collection type: {}", o.name()));
        }
        ));
    }
--- a/cmake/mode.common.cmake
+++ b/cmake/mode.common.cmake
@@ -137,6 +137,24 @@ endfunction()

 option(Scylla_WITH_DEBUG_INFO "Enable debug info" OFF)

+# Time trace profiling: adds -ftime-trace to all C++ compilations (Clang only).
+# Each .o produces a companion .json file in the build directory that can be
+# analyzed with ClangBuildAnalyzer or loaded in chrome://tracing.
+#
+# Usage:
+#   cmake -DScylla_TIME_TRACE=ON ...
+#   ninja
+#   # Analyze results (requires ClangBuildAnalyzer):
+#   ClangBuildAnalyzer --all <build-dir> capture.bin
+#   ClangBuildAnalyzer --analyze capture.bin
+option(Scylla_TIME_TRACE "Enable Clang -ftime-trace for build profiling" OFF)
+if(Scylla_TIME_TRACE)
+  if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+    message(FATAL_ERROR "Scylla_TIME_TRACE requires Clang (found ${CMAKE_CXX_COMPILER_ID})")
+  endif()
+  add_compile_options(-ftime-trace)
+endif()
+
 macro(update_build_flags config)
  cmake_parse_arguments (
    parsed_args
--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -240,7 +240,7 @@ static max_purgeable get_max_purgeable_timestamp(const compaction_group_view& ta
    // and if the memtable also contains the key we're calculating max purgeable timestamp for.
    // First condition helps to not penalize the common scenario where memtable only contains
    // newer data.
-    if (memtable_min_timestamp <= compacting_max_timestamp && table_s.memtable_has_key(dk)) {
+    if (!table_s.skip_memtable_for_tombstone_gc() && memtable_min_timestamp <= compacting_max_timestamp && table_s.memtable_has_key(dk)) {
        timestamp = memtable_min_timestamp;
        source = max_purgeable::timestamp_source::memtable_possibly_shadowing_data;
    }
--- a/compaction/compaction_group_view.hh
+++ b/compaction/compaction_group_view.hh
@@ -39,6 +39,9 @@ public:
    virtual future<lw_shared_ptr<const sstables::sstable_set>> main_sstable_set() const = 0;
    virtual future<lw_shared_ptr<const sstables::sstable_set>> maintenance_sstable_set() const = 0;
    virtual lw_shared_ptr<const sstables::sstable_set> sstable_set_for_tombstone_gc() const = 0;
+    // Returns true when tombstone GC considers only the repaired sstable set, meaning the
+    // memtable does not need to be consulted (its data is always newer than any GC-eligible tombstone).
+    virtual bool skip_memtable_for_tombstone_gc() const noexcept = 0;
    virtual std::unordered_set<sstables::shared_sstable> fully_expired_sstables(const std::vector<sstables::shared_sstable>& sstables, gc_clock::time_point compaction_time) const = 0;
    virtual const std::vector<sstables::shared_sstable>& compacted_undeleted_sstables() const noexcept = 0;
    virtual compaction_strategy& get_compaction_strategy() const noexcept = 0;
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -1088,7 +1088,7 @@ void compaction_manager::register_metrics() {
        sm::make_gauge("normalized_backlog", [this] { return _last_backlog / available_memory(); },
                       sm::description("Holds the sum of normalized compaction backlog for all tables in the system. Backlog is normalized by dividing backlog by shard's available memory.")),
        sm::make_counter("validation_errors", [this] { return _validation_errors; },
-                       sm::description("Holds the number of encountered validation errors.")),
+                       sm::description("Holds the number of encountered validation errors.")).set_skip_when_empty(),
    });
 }

--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -406,7 +406,11 @@ commitlog_total_space_in_mb: -1
 # In short, `ms` needs more CPU during sstable writes,
 # but should behave better during reads,
 # although it might behave worse for very long clustering keys.
+#
+# `ms` sstable format works even better with `column_index_size_in_kb` set to 1,
+# so keep those two settings in sync (either both set, or both unset).
 sstable_format: ms
+column_index_size_in_kb: 1

 # Auto-scaling of the promoted index prevents running out of memory
 # when the promoted index grows too large (due to partitions with many rows
--- a/configure.py
+++ b/configure.py
@@ -285,8 +285,12 @@ def generate_compdb(compdb, ninja, buildfile, modes):
                os.symlink(compdb_target, compdb)
            except FileExistsError:
                # if there is already a valid compile_commands.json link in the
-                # source root, we are done.
-                pass
+                # source root, we are done. if it's a stale link, update it.
+                if os.path.islink(compdb):
+                    current_target = os.readlink(compdb)
+                    if not os.path.exists(current_target):
+                        os.unlink(compdb)
+                        os.symlink(compdb_target, compdb)
            return


@@ -593,6 +597,7 @@ scylla_tests = set([
    'test/boost/linearizing_input_stream_test',
    'test/boost/lister_test',
    'test/boost/locator_topology_test',
+    'test/boost/lock_tables_metadata_test',
    'test/boost/log_heap_test',
    'test/boost/logalloc_standard_allocator_segment_pool_backend_test',
    'test/boost/logalloc_test',
@@ -853,6 +858,10 @@ arg_parser.add_argument('--coverage', action = 'store_true', help = 'Compile scy
 arg_parser.add_argument('--build-dir', action='store', default='build',
                        help='Build directory path')
 arg_parser.add_argument('--disable-precompiled-header', action='store_true', default=False, help='Disable precompiled header for scylla binary')
+arg_parser.add_argument('--time-trace', action='store_true', default=False,
+                        help='Enable Clang -ftime-trace for build profiling. '
+                             'Each .o produces a .json file analyzable with '
+                             'ClangBuildAnalyzer or chrome://tracing')
 arg_parser.add_argument('-h', '--help', action='store_true', help='show this help message and exit')
 args = arg_parser.parse_args()
 if args.help:
@@ -1659,6 +1668,7 @@ deps['test/boost/combined_tests'] += [
    'test/boost/auth_cache_test.cc',
    'test/boost/auth_test.cc',
    'test/boost/batchlog_manager_test.cc',
+    'test/boost/table_helper_test.cc',
    'test/boost/cache_algorithm_test.cc',
    'test/boost/castas_fcts_test.cc',
    'test/boost/cdc_test.cc',
@@ -1710,7 +1720,7 @@ deps['test/boost/combined_tests'] += [
    'test/boost/sstable_compression_config_test.cc',
    'test/boost/sstable_directory_test.cc',
    'test/boost/sstable_set_test.cc',
-    'test/boost/sstable_tablet_streaming.cc',
+    'test/boost/sstable_tablet_streaming_test.cc',
    'test/boost/statement_restrictions_test.cc',
    'test/boost/storage_proxy_test.cc',
    'test/boost/tablets_test.cc',
@@ -1965,6 +1975,9 @@ user_cflags += ' -fextend-variable-liveness=none'
 if args.target != '':
    user_cflags += ' -march=' + args.target

+if args.time_trace:
+    user_cflags += ' -ftime-trace'
+
 for mode in modes:
    # Those flags are passed not only to Scylla objects, but also to libraries
    # that we compile ourselves.
@@ -2457,6 +2470,9 @@ def write_build_file(f,
            command = reloc/build_deb.sh --reloc-pkg $in --builddir $out
        rule unified
            command = unified/build_unified.sh --build-dir $builddir/$mode --unified-pkg $out
+        rule collect_pkgs
+            command = rm -rf $out && mkdir -p $out && cp $pkgs $out/
+            description = COLLECT $out
        rule rust_header
            command = cxxbridge --include rust/cxx.h --header $in > $out
            description = RUST_HEADER $out
@@ -2942,6 +2958,8 @@ def write_build_file(f,
        build dist-tar: phony dist-unified-tar dist-server-tar dist-python3-tar dist-cqlsh-tar

        build dist: phony dist-unified dist-server dist-python3 dist-cqlsh
+
+        build collect-dist: phony {' '.join([f'collect-dist-{mode}' for mode in default_modes])}
        '''))

    f.write(textwrap.dedent(f'''\
@@ -2949,7 +2967,28 @@ def write_build_file(f,
        rule dist-check
          command = ./tools/testing/dist-check/dist-check.sh --mode $mode
        '''))
+    deb_arch = {'x86_64': 'amd64', 'aarch64': 'arm64'}[arch]
+    deb_ver = f'{scylla_version}-{scylla_release}-1'
+    rpm_ver = f'{scylla_version}-{scylla_release}'
    for mode in build_modes:
+        server_rpms_dir = f'$builddir/dist/{mode}/redhat/RPMS/{arch}'
+        server_rpms = [f'{server_rpms_dir}/{scylla_product}{suffix}-{rpm_ver}.{arch}.rpm'
+                       for suffix in ['', '-server', '-server-debuginfo', '-conf', '-kernel-conf', '-node-exporter']]
+        cqlsh_rpms = [f'tools/cqlsh/build/redhat/RPMS/{arch}/{scylla_product}-cqlsh-{rpm_ver}.{arch}.rpm']
+        python3_rpms = [f'tools/python3/build/redhat/RPMS/{arch}/{scylla_product}-python3-{rpm_ver}.{arch}.rpm']
+        all_rpms = server_rpms + cqlsh_rpms + python3_rpms
+
+        server_deb_dir = f'$builddir/dist/{mode}/debian'
+        server_debs = [f'{server_deb_dir}/{scylla_product}{suffix}_{deb_ver}_{deb_arch}.deb'
+                       for suffix in ['', '-server', '-server-dbg', '-conf', '-kernel-conf', '-node-exporter']]
+        server_debs += [f'{server_deb_dir}/scylla-enterprise{suffix}_{deb_ver}_all.deb'
+                        for suffix in ['', '-server', '-conf', '-kernel-conf', '-node-exporter']]
+        cqlsh_debs = [f'tools/cqlsh/build/debian/{scylla_product}-cqlsh_{deb_ver}_{deb_arch}.deb',
+                      f'tools/cqlsh/build/debian/scylla-enterprise-cqlsh_{deb_ver}_all.deb']
+        python3_debs = [f'tools/python3/build/debian/{scylla_product}-python3_{deb_ver}_{deb_arch}.deb',
+                        f'tools/python3/build/debian/scylla-enterprise-python3_{deb_ver}_all.deb']
+        all_debs = server_debs + cqlsh_debs + python3_debs
+
        f.write(textwrap.dedent(f'''\
        build $builddir/{mode}/dist/tar/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz: copy tools/python3/build/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz
        build $builddir/{mode}/dist/tar/{scylla_product}-python3-package.tar.gz: copy tools/python3/build/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz
@@ -2957,6 +2996,11 @@ def write_build_file(f,
        build $builddir/{mode}/dist/tar/{scylla_product}-cqlsh-{scylla_version}-{scylla_release}.{arch}.tar.gz: copy tools/cqlsh/build/{scylla_product}-cqlsh-{scylla_version}-{scylla_release}.{arch}.tar.gz
        build $builddir/{mode}/dist/tar/{scylla_product}-cqlsh-package.tar.gz: copy tools/cqlsh/build/{scylla_product}-cqlsh-{scylla_version}-{scylla_release}.{arch}.tar.gz

+        build $builddir/{mode}/dist/rpm: collect_pkgs | {' '.join(all_rpms)} $builddir/dist/{mode}/redhat dist-cqlsh-rpm dist-python3-rpm
+          pkgs = {' '.join(all_rpms)}
+        build $builddir/{mode}/dist/deb: collect_pkgs | {' '.join(all_debs)} $builddir/dist/{mode}/debian dist-cqlsh-deb dist-python3-deb
+          pkgs = {' '.join(all_debs)}
+        build collect-dist-{mode}: phony $builddir/{mode}/dist/rpm $builddir/{mode}/dist/deb
        build {mode}-dist: phony dist-server-{mode} dist-server-debuginfo-{mode} dist-python3-{mode} dist-unified-{mode} dist-cqlsh-{mode}
        build dist-{mode}: phony {mode}-dist
        build dist-check-{mode}: dist-check
--- a/cql3/authorized_prepared_statements_cache.hh
+++ b/cql3/authorized_prepared_statements_cache.hh
@@ -136,9 +136,9 @@ public:
    {}

    future<> insert(auth::authenticated_user user, cql3::prepared_cache_key_type prep_cache_key, value_type v) noexcept {
-        return _cache.get_ptr(key_type(std::move(user), std::move(prep_cache_key)), [v = std::move(v)] (const cache_key_type&) mutable {
+        return _cache.insert(key_type(std::move(user), std::move(prep_cache_key)), [v = std::move(v)] (const cache_key_type&) mutable {
            return make_ready_future<value_type>(std::move(v));
-        }).discard_result();
+        });
    }

    value_ptr find(const auth::authenticated_user& user, const cql3::prepared_cache_key_type& prep_cache_key) {
--- a/cql3/expr/prepare_expr.cc
+++ b/cql3/expr/prepare_expr.cc
@@ -1070,7 +1070,7 @@ try_prepare_count_rows(const expr::function_call& fc, data_dictionary::database
                                .args = {},
                            };
                        } else {
-                            throw exceptions::invalid_request_exception(format("count() expects a column or the literal 1 as an argument", fc.args[0]));
+                            throw exceptions::invalid_request_exception(format("count() expects a column or the literal 1 as an argument, got {}", fc.args[0]));
                        }
                    }
                }
--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
--- a/cql3/restrictions/statement_restrictions.hh
+++ b/cql3/restrictions/statement_restrictions.hh
@@ -23,15 +23,113 @@ namespace cql3 {

 namespace restrictions {

+/// A set of discrete values.
+using value_list = std::vector<managed_bytes>; // Sorted and deduped using value comparator.
+
+/// General set of values.  Empty set and single-element sets are always value_list.  interval is
+/// never singular and never has start > end.  Universal set is an interval with both bounds null.
+using value_set = std::variant<value_list, interval<managed_bytes>>;
+
+// For some boolean expression (say, (X = 3) = TRUE), this represents a function that solves for X.
+// (here, it would return 3). The expression is obtained by equating some factors of the WHERE
+// clause to TRUE.
+using solve_for_t = std::function<value_set (const query_options&)>;
+
+struct on_row {
+    bool operator==(const on_row&) const = default;
+};
+
+struct on_column {
+    const column_definition* column;
+
+    bool operator==(const on_column&) const = default;
+};
+
+// Placeholder type indicating we're solving for the partition key token.
+struct on_partition_key_token {
+    const ::schema* schema;
+
+    bool operator==(const on_partition_key_token&) const = default;
+};
+
+struct on_clustering_key_prefix {
+    std::vector<const column_definition*> columns;
+
+    bool operator==(const on_clustering_key_prefix&) const = default;
+};
+
+// A predicate on a column or a combination of columns. The WHERE clause analyzer
+// will attempt to convert predicates (that return true or false for a particular row)
+// to solvers (that return the set of column values that satisfy the predicate) when possible.
+struct predicate {
+    // A function that returns the set of values that satisfy the filter. Can be unset,
+    // in which case the filter must be interpreted.
+    solve_for_t solve_for;
+    // The original filter for this column.
+    expr::expression filter;
+    // What column the predicate can be solved for
+    std::variant<
+            on_row,                        // cannot determine, so predicate is on entire row
+            on_column,                     // solving for a single column: e.g. c1 = 3
+            on_partition_key_token,        // solving for the token, e.g. token(pk1, pk2) >= :var
+            on_clustering_key_prefix       // solving for a clustering key prefix: e.g. (ck1, ck2) >= (3, 4)
+    > on;
+    // Whether the returned value_set will resolve to a single value.
+    bool is_singleton = false;
+    // Whether the returned value_set follows CQL comparison semantics
+    bool comparable = true;
+    bool is_multi_column = false;
+    bool is_not_null_single_column = false;
+    bool equality = false;        // operator is EQ
+    bool is_in = false;           // operator is IN
+    bool is_slice = false;        // operator is LT/LTE/GT/GTE
+    bool is_upper_bound = false;  // operator is LT/LTE
+    bool is_lower_bound = false;  // operator is GT/GTE
+    expr::comparison_order order = expr::comparison_order::cql;
+    std::optional<expr::oper_t> op;  // the binary operator, if any
+    bool is_subscript = false;       // whether the LHS is a subscript (map element access)
+};
+
 ///In some cases checking if columns have indexes is undesired of even
 ///impossible, because e.g. the query runs on a pseudo-table, which does not
 ///have an index-manager, or even a table object.
 using check_indexes = bool_class<class check_indexes_tag>;

+// A function that returns the partition key ranges for a query. It is the solver of
+// WHERE clause fragments such as WHERE token(pk) > 1 or WHERE pk1 IN :list1 AND pk2 IN :list2.
+using get_partition_key_ranges_fn_t = std::function<dht::partition_range_vector (const query_options&)>;
+
+// A function that returns the clustering key ranges for a query. It is the solver of
+// WHERE clause fragments such as WHERE ck > 1 or WHERE (ck1, ck2) > (1, 2).
+using get_clustering_bounds_fn_t = std::function<std::vector<query::clustering_range> (const query_options& options)>;
+
+// A function that returns a singleton value, usable for a key (e.g. bytes_opt)
+using get_singleton_value_fn_t = std::function<bytes_opt (const query_options&)>;
+
+struct no_partition_range_restrictions {
+};
+
+struct token_range_restrictions {
+    predicate token_restrictions;
+};
+
+struct single_column_partition_range_restrictions {
+    std::vector<predicate> per_column_restrictions;
+};
+
+using partition_range_restrictions = std::variant<
+        no_partition_range_restrictions,
+        token_range_restrictions,
+        single_column_partition_range_restrictions>;
+
+// A map of per-column predicate vectors, ordered by schema position.
+using single_column_predicate_vectors = std::map<const column_definition*, std::vector<predicate>, expr::schema_pos_column_definition_comparator>;
+
 /**
 * The restrictions corresponding to the relations specified on the where-clause of CQL query.
 */
 class statement_restrictions {
+    struct private_tag {}; // Tag for private constructor
 private:
    schema_ptr _schema;

@@ -81,7 +179,7 @@ private:
    bool _has_queriable_regular_index = false, _has_queriable_pk_index = false, _has_queriable_ck_index = false;
    bool _has_multi_column; ///< True iff _clustering_columns_restrictions has a multi-column restriction.

-    std::optional<expr::expression> _where; ///< The entire WHERE clause.
+    std::vector<expr::expression> _where; ///< The entire WHERE clause (factorized).

    /// Parts of _where defining the clustering slice.
    ///
@@ -96,7 +194,7 @@ private:
    ///   4.4 elements other than the last have only EQ or IN atoms
    ///   4.5 the last element has only EQ, IN, or is_slice() atoms
    /// 5. if multi-column, then each element is a binary_operator
-    std::vector<expr::expression> _clustering_prefix_restrictions;
+    std::vector<predicate> _clustering_prefix_restrictions;

    /// Like _clustering_prefix_restrictions, but for the indexing table (if this is an index-reading statement).
    /// Recall that the index-table CK is (token, PK, CK) of the base table for a global index and (indexed column,
@@ -105,7 +203,7 @@ private:
    /// Elements are conjunctions of single-column binary operators with the same LHS.
    /// Element order follows the indexing-table clustering key.
    /// In case of a global index the first element's (token restriction) RHS is a dummy value, it is filled later.
-    std::optional<std::vector<expr::expression>> _idx_tbl_ck_prefix;
+    std::optional<std::vector<predicate>> _idx_tbl_ck_prefix;

    /// Parts of _where defining the partition range.
    ///
@@ -113,16 +211,25 @@ private:
    /// binary_operators on token.  If single-column restrictions define the partition range, each element holds
    /// restrictions for one partition column.  Each partition column has a corresponding element, but the elements
    /// are in arbitrary order.
-    std::vector<expr::expression> _partition_range_restrictions;
+    partition_range_restrictions _partition_range_restrictions;

    bool _partition_range_is_simple; ///< False iff _partition_range_restrictions imply a Cartesian product.


    check_indexes _check_indexes = check_indexes::yes;
+    /// Columns that appear on the LHS of an EQ restriction (not IN).
+    /// For multi-column EQ like (ck1, ck2) = (1, 2), all columns in the tuple are included.
+    std::unordered_set<const column_definition*> _columns_with_eq;
    std::vector<const column_definition*> _column_defs_for_filtering;
    schema_ptr _view_schema;
    std::optional<secondary_index::index> _idx_opt;
    expr::expression _idx_restrictions = expr::conjunction({});
+    get_partition_key_ranges_fn_t _get_partition_key_ranges_fn;
+    get_clustering_bounds_fn_t _get_clustering_bounds_fn;
+    get_clustering_bounds_fn_t _get_global_index_clustering_ranges_fn;
+    get_clustering_bounds_fn_t _get_global_index_token_clustering_ranges_fn;
+    get_clustering_bounds_fn_t _get_local_index_clustering_ranges_fn;
+    get_singleton_value_fn_t _value_for_index_partition_key_fn;
 public:
    /**
     * Creates a new empty <code>StatementRestrictions</code>.
@@ -130,9 +237,10 @@ public:
     * @param cfm the column family meta data
     * @return a new empty <code>StatementRestrictions</code>.
     */
-    statement_restrictions(schema_ptr schema, bool allow_filtering);
+    statement_restrictions(private_tag, schema_ptr schema, bool allow_filtering);

-    friend statement_restrictions analyze_statement_restrictions(
+public:
+    friend shared_ptr<const statement_restrictions> analyze_statement_restrictions(
        data_dictionary::database db,
        schema_ptr schema,
        statements::statement_type type,
@@ -142,9 +250,15 @@ public:
        bool for_view,
        bool allow_filtering,
        check_indexes do_check_indexes);
+    friend shared_ptr<const statement_restrictions> make_trivial_statement_restrictions(
+        schema_ptr schema,
+        bool allow_filtering);

-private:
-    statement_restrictions(data_dictionary::database db,
+    // Important: objects of this class capture `this` extensively and so must remain non-copyable.
+    statement_restrictions(const statement_restrictions&) = delete;
+    statement_restrictions& operator=(const statement_restrictions&) = delete;
+    statement_restrictions(private_tag,
+        data_dictionary::database db,
        schema_ptr schema,
        statements::statement_type type,
        const expr::expression& where_clause,
@@ -211,10 +325,7 @@ public:

    bool has_token_restrictions() const;

-    // Checks whether the given column has an EQ restriction.
-    // EQ restriction is `col = ...` or `(col, col2) = ...`
-    // IN restriction is NOT an EQ restriction, this function will not look for IN restrictions.
-    // Uses column_defintion::operator== for comparison, columns with the same name but different schema will not be equal.
+    // Checks whether the given column has an EQ restriction (not IN).
    bool has_eq_restriction_on_column(const column_definition&) const;

    /**
@@ -224,12 +335,6 @@ public:
     */
    std::vector<const column_definition*> get_column_defs_for_filtering(data_dictionary::database db) const;

-    /**
-     * Gives a score that the index has - index with the highest score will be chosen
-     * in find_idx()
-     */
-    int score(const secondary_index::index& index) const;
-
    /**
     * Determines the index to be used with the restriction.
     * @param db - the data_dictionary::database context (for extracting index manager)
@@ -250,18 +355,8 @@ public:

    size_t partition_key_restrictions_size() const;

-    bool parition_key_restrictions_have_supporting_index(const secondary_index::secondary_index_manager& index_manager, expr::allow_local_index allow_local) const;
-
    size_t clustering_columns_restrictions_size() const;

-    bool clustering_columns_restrictions_have_supporting_index(
-        const secondary_index::secondary_index_manager& index_manager,
-        expr::allow_local_index allow_local) const;
-
-    bool multi_column_clustering_restrictions_are_supported_by(const secondary_index::index& index) const;
-
-    bounds_slice get_clustering_slice() const;
-
    /**
     * Checks if the clustering key has some unrestricted components.
     * @return <code>true</code> if the clustering key has some unrestricted components, <code>false</code> otherwise.
@@ -279,15 +374,6 @@ public:

    schema_ptr get_view_schema() const { return _view_schema; }
 private:
-    std::pair<std::optional<secondary_index::index>, expr::expression> do_find_idx(const secondary_index::secondary_index_manager& sim) const;
-    void add_restriction(const expr::binary_operator& restr, schema_ptr schema, bool allow_filtering, bool for_view);
-    void add_is_not_restriction(const expr::binary_operator& restr, schema_ptr schema, bool for_view);
-    void add_single_column_parition_key_restriction(const expr::binary_operator& restr, schema_ptr schema, bool allow_filtering, bool for_view);
-    void add_token_partition_key_restriction(const expr::binary_operator& restr);
-    void add_single_column_clustering_key_restriction(const expr::binary_operator& restr, schema_ptr schema, bool allow_filtering);
-    void add_multi_column_clustering_key_restriction(const expr::binary_operator& restr);
-    void add_single_column_nonprimary_key_restriction(const expr::binary_operator& restr);
-
    void process_partition_key_restrictions(bool for_view, bool allow_filtering, statements::statement_type type);

    /**
@@ -315,7 +401,17 @@ private:
    void add_clustering_restrictions_to_idx_ck_prefix(const schema& idx_tbl_schema);

    unsigned int num_clustering_prefix_columns_that_need_not_be_filtered() const;
-    void calculate_column_defs_for_filtering_and_erase_restrictions_used_for_index(data_dictionary::database db);
+    void calculate_column_defs_for_filtering_and_erase_restrictions_used_for_index(
+            data_dictionary::database db,
+            const single_column_predicate_vectors& sc_pk_pred_vectors,
+            const single_column_predicate_vectors& sc_ck_pred_vectors,
+            const single_column_predicate_vectors& sc_nonpk_pred_vectors);
+    get_partition_key_ranges_fn_t build_partition_key_ranges_fn() const;
+    get_clustering_bounds_fn_t build_get_clustering_bounds_fn() const;
+    get_clustering_bounds_fn_t build_get_global_index_clustering_ranges_fn() const;
+    get_clustering_bounds_fn_t build_get_global_index_token_clustering_ranges_fn() const;
+    get_clustering_bounds_fn_t build_get_local_index_clustering_ranges_fn() const;
+    get_singleton_value_fn_t build_value_for_index_partition_key_fn() const;
 public:
    /**
     * Returns the specified range of the partition key.
@@ -389,7 +485,10 @@ public:
 private:
    /// Prepares internal data for evaluating index-table queries.  Must be called before
    /// get_local_index_clustering_ranges().
-    void prepare_indexed_local(const schema& idx_tbl_schema);
+    void prepare_indexed_local(const schema& idx_tbl_schema,
+            const single_column_predicate_vectors& sc_pk_pred_vectors,
+            const single_column_predicate_vectors& sc_ck_pred_vectors,
+            const single_column_predicate_vectors& sc_nonpk_pred_vectors);

    /// Prepares internal data for evaluating index-table queries.  Must be called before
    /// get_global_index_clustering_ranges() or get_global_index_token_clustering_ranges().
@@ -398,15 +497,18 @@ private:
 public:
    /// Calculates clustering ranges for querying a global-index table.
    std::vector<query::clustering_range> get_global_index_clustering_ranges(
-            const query_options& options, const schema& idx_tbl_schema) const;
+            const query_options& options) const;

    /// Calculates clustering ranges for querying a global-index table for queries with token restrictions present.
    std::vector<query::clustering_range> get_global_index_token_clustering_ranges(
-            const query_options& options, const schema& idx_tbl_schema) const;
+            const query_options& options) const;

    /// Calculates clustering ranges for querying a local-index table.
    std::vector<query::clustering_range> get_local_index_clustering_ranges(
-            const query_options& options, const schema& idx_tbl_schema) const;
+            const query_options& options) const;
+
+    /// Finds the value of partition key of the index table
+    bytes_opt value_for_index_partition_key(const query_options&) const;

    sstring to_string() const;

@@ -416,7 +518,7 @@ public:
    bool is_empty() const;
 };

-statement_restrictions analyze_statement_restrictions(
+shared_ptr<const statement_restrictions> analyze_statement_restrictions(
        data_dictionary::database db,
        schema_ptr schema,
        statements::statement_type type,
@@ -427,23 +529,14 @@ statement_restrictions analyze_statement_restrictions(
        bool allow_filtering,
        check_indexes do_check_indexes);

-
-// Extracts all binary operators which have the given column on their left hand side.
-// Extracts only single-column restrictions.
-// Does not include multi-column restrictions.
-// Does not include token() restrictions.
-// Does not include boolean constant restrictions.
-// For example "WHERE c = 1 AND (a, c) = (2, 1) AND token(p) < 2 AND FALSE" will return {"c = 1"}.
-std::vector<expr::expression> extract_single_column_restrictions_for_column(const expr::expression&, const column_definition&);
+shared_ptr<const statement_restrictions> make_trivial_statement_restrictions(
+        schema_ptr schema,
+        bool allow_filtering);


 // Checks whether this expression is empty - doesn't restrict anything
 bool is_empty_restriction(const expr::expression&);

-// Finds the value of the given column in the expression
-// In case of multpiple possible values calls on_internal_error
-bytes_opt value_for(const column_definition&, const expr::expression&, const query_options&);
-
 }

 }
--- a/cql3/statements/alter_keyspace_statement.cc
+++ b/cql3/statements/alter_keyspace_statement.cc
@@ -90,6 +90,20 @@ void cql3::statements::alter_keyspace_statement::validate(query_processor& qp, c
                auto& current_rf_per_dc = ks.metadata()->strategy_options();
                auto new_rf_per_dc = _attrs->get_replication_options();
                new_rf_per_dc.erase(ks_prop_defs::REPLICATION_STRATEGY_CLASS_KEY);
+                // Check if multi-RF change is allowed: all DC changes must be 0->N or N->0.
+                auto all_changes_are_0_N = [&] {
+                    for (const auto& [dc, new_rf] : new_rf_per_dc) {
+                        auto old_rf_val = size_t(0);
+                        if (auto it = current_rf_per_dc.find(dc); it != current_rf_per_dc.end()) {
+                            old_rf_val = locator::get_replication_factor(it->second);
+                        }
+                        auto new_rf_val = locator::get_replication_factor(new_rf);
+                        if (old_rf_val != new_rf_val && old_rf_val != 0 && new_rf_val != 0) {
+                            return false;
+                        }
+                    }
+                    return true;
+                };
                unsigned total_abs_rfs_diff = 0;
                for (const auto& [new_dc, new_rf] : new_rf_per_dc) {
                    auto old_rf = locator::replication_strategy_config_option(sstring("0"));
@@ -103,7 +117,9 @@ void cql3::statements::alter_keyspace_statement::validate(query_processor& qp, c
                        // first we need to report non-existing DCs, then if RFs aren't changed by too much.
                        continue;
                    }
-                    if (total_abs_rfs_diff += get_abs_rf_diff(old_rf, new_rf); total_abs_rfs_diff >= 2) {
+                    if (total_abs_rfs_diff += get_abs_rf_diff(old_rf, new_rf); total_abs_rfs_diff >= 2 &&
+                            !(qp.proxy().features().keyspace_multi_rf_change && locator::uses_rack_list_exclusively(current_rf_per_dc)
+                            && locator::uses_rack_list_exclusively(new_ks->strategy_options()) && all_changes_are_0_N())) {
                        throw exceptions::invalid_request_exception("Only one DC's RF can be changed at a time and not by more than 1");
                    }
                }
--- a/cql3/statements/batch_statement.hh
+++ b/cql3/statements/batch_statement.hh
@@ -89,6 +89,10 @@ public:

    const std::vector<single_statement>& statements() const { return _statements; }

+    audit::audit_info_ptr audit_info() const {
+        return audit::audit::create_audit_info(audit::statement_category::DML, sstring(), sstring(), true);
+    }
+
    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual uint32_t get_bound_terms() const override;
--- a/cql3/statements/ks_prop_defs.cc
+++ b/cql3/statements/ks_prop_defs.cc
@@ -411,10 +411,10 @@ bool ks_prop_defs::get_durable_writes() const {

 lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata(sstring ks_name, const locator::token_metadata& tm, const gms::feature_service& feat, const db::config& cfg) {
    auto sc = get_replication_strategy_class().value();
-    // if tablets options have not been specified, but tablets are globally enabled, set the value to 0 for N.T.S. only
+    // if tablets options have not been specified, but tablets are globally enabled, set the value to 0. The strategy will
+    // validate it and throw an error if it does not support tablets.
    auto enable_tablets = feat.tablets && cfg.enable_tablets_by_default();
-    std::optional<unsigned> default_initial_tablets = enable_tablets && locator::abstract_replication_strategy::to_qualified_class_name(sc) == "org.apache.cassandra.locator.NetworkTopologyStrategy"
-            ? std::optional<unsigned>(0) : std::nullopt;
+    std::optional<unsigned> default_initial_tablets = enable_tablets ? std::optional<unsigned>(0) : std::nullopt;
    auto initial_tablets = get_initial_tablets(default_initial_tablets, cfg.enforce_tablets());
    bool uses_tablets = initial_tablets.has_value();
    bool rack_list_enabled = utils::get_local_injector().enter("create_with_numeric") ? false : feat.rack_list_rf;
@@ -440,7 +440,7 @@ lw_shared_ptr<data_dictionary::keyspace_metadata> ks_prop_defs::as_ks_metadata_u
        sc = old->strategy_name();
        options = old_options;
    }
-    return data_dictionary::keyspace_metadata::new_keyspace(old->name(), *sc, options, initial_tablets, get_consistency_option(), get_boolean(KW_DURABLE_WRITES, true), get_storage_options());
+    return data_dictionary::keyspace_metadata::new_keyspace(old->name(), *sc, options, initial_tablets, get_consistency_option(), get_boolean(KW_DURABLE_WRITES, true), get_storage_options(), {}, old->next_strategy_options_opt());
 }

 namespace {
--- a/cql3/statements/modification_statement.cc
+++ b/cql3/statements/modification_statement.cc
@@ -626,7 +626,7 @@ modification_statement::prepare(data_dictionary::database db, prepare_context& c
    // Since this cache is only meaningful for LWT queries, just clear the ids
    // if it's not a conditional statement so that the AST nodes don't
    // participate in the caching mechanism later.
-    if (!prepared_stmt->has_conditions() && prepared_stmt->_restrictions.has_value()) {
+    if (!prepared_stmt->has_conditions() && prepared_stmt->_restrictions) {
        ctx.clear_pk_function_calls_cache();
    }
    prepared_stmt->_may_use_token_aware_routing = ctx.get_partition_key_bind_indexes(*schema).size() != 0;
--- a/cql3/statements/modification_statement.hh
+++ b/cql3/statements/modification_statement.hh
@@ -94,7 +94,7 @@ private:
    std::optional<bool> _is_raw_counter_shard_write;

 protected:
-    std::optional<restrictions::statement_restrictions> _restrictions;
+    shared_ptr<const restrictions::statement_restrictions> _restrictions;
 public:
    typedef std::optional<std::unordered_map<sstring, bytes_opt>> json_cache_opt;

--- a/cql3/statements/prune_materialized_view_statement.hh
+++ b/cql3/statements/prune_materialized_view_statement.hh
@@ -19,7 +19,7 @@ public:
                     uint32_t bound_terms,
                     lw_shared_ptr<const parameters> parameters,
                     ::shared_ptr<selection::selection> selection,
-                     ::shared_ptr<restrictions::statement_restrictions> restrictions,
+                     ::shared_ptr<const restrictions::statement_restrictions> restrictions,
                     ::shared_ptr<std::vector<size_t>> group_by_cell_indices,
                     bool is_reversed,
                     ordering_comparator_type ordering_comparator,
--- a/cql3/statements/raw/select_statement.hh
+++ b/cql3/statements/raw/select_statement.hh
@@ -109,7 +109,7 @@ public:
    std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats, const cql_config& cfg, bool for_view);
 private:
    std::vector<selection::prepared_selector> maybe_jsonize_select_clause(std::vector<selection::prepared_selector> select, data_dictionary::database db, schema_ptr schema);
-    ::shared_ptr<restrictions::statement_restrictions> prepare_restrictions(
+    ::shared_ptr<const restrictions::statement_restrictions> prepare_restrictions(
        data_dictionary::database db,
        schema_ptr schema,
        prepare_context& ctx,
--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -1027,7 +1027,7 @@ view_indexed_table_select_statement::prepare(data_dictionary::database db,
                                        uint32_t bound_terms,
                                        lw_shared_ptr<const parameters> parameters,
                                        ::shared_ptr<selection::selection> selection,
-                                        ::shared_ptr<restrictions::statement_restrictions> restrictions,
+                                        ::shared_ptr<const restrictions::statement_restrictions> restrictions,
                                        ::shared_ptr<std::vector<size_t>> group_by_cell_indices,
                                        bool is_reversed,
                                        ordering_comparator_type ordering_comparator,
@@ -1139,7 +1139,7 @@ lw_shared_ptr<const service::pager::paging_state> view_indexed_table_select_stat
    auto& last_base_pk = last_pos.partition;
    auto* last_base_ck = last_pos.position.has_key() ? &last_pos.position.key() : nullptr;

-    bytes_opt indexed_column_value = restrictions::value_for(*cdef, _used_index_restrictions, options);
+    bytes_opt indexed_column_value = _restrictions->value_for_index_partition_key(options);

    auto index_pk = [&]() {
        if (_index.metadata().local()) {
@@ -1350,12 +1350,7 @@ dht::partition_range_vector view_indexed_table_select_statement::get_partition_r
 dht::partition_range_vector view_indexed_table_select_statement::get_partition_ranges_for_global_index_posting_list(const query_options& options) const {
    dht::partition_range_vector partition_ranges;

-    const column_definition* cdef = _schema->get_column_definition(to_bytes(_index.target_column()));
-    if (!cdef) {
-        throw exceptions::invalid_request_exception("Indexed column not found in schema");
-    }
-
-    bytes_opt value = restrictions::value_for(*cdef, _used_index_restrictions, options);
+    bytes_opt value = _restrictions->value_for_index_partition_key(options);
    if (value) {
        auto pk = partition_key::from_single_value(*_view_schema, *value);
        auto dk = dht::decorate_key(*_view_schema, pk);
@@ -1374,11 +1369,11 @@ query::partition_slice view_indexed_table_select_statement::get_partition_slice_
        // Only EQ restrictions on base partition key can be used in an index view query
        if (pk_restrictions_is_single && _restrictions->partition_key_restrictions_is_all_eq()) {
            partition_slice_builder.with_ranges(
-                    _restrictions->get_global_index_clustering_ranges(options, *_view_schema));
+                    _restrictions->get_global_index_clustering_ranges(options));
        } else if (_restrictions->has_token_restrictions()) {
            // Restrictions like token(p1, p2) < 0 have all partition key components restricted, but require special handling.
            partition_slice_builder.with_ranges(
-                    _restrictions->get_global_index_token_clustering_ranges(options, *_view_schema));
+                    _restrictions->get_global_index_token_clustering_ranges(options));
        }
    }

@@ -1389,7 +1384,7 @@ query::partition_slice view_indexed_table_select_statement::get_partition_slice_
    partition_slice_builder partition_slice_builder{*_view_schema};

    partition_slice_builder.with_ranges(
-        _restrictions->get_local_index_clustering_ranges(options, *_view_schema));
+        _restrictions->get_local_index_clustering_ranges(options));

    return partition_slice_builder.build();
 }
@@ -1607,7 +1602,7 @@ public:
        uint32_t bound_terms,
        lw_shared_ptr<const parameters> parameters,
        ::shared_ptr<selection::selection> selection,
-        ::shared_ptr<restrictions::statement_restrictions> restrictions,
+        ::shared_ptr<const restrictions::statement_restrictions> restrictions,
        ::shared_ptr<std::vector<size_t>> group_by_cell_indices,
        bool is_reversed,
        ordering_comparator_type ordering_comparator,
@@ -1645,7 +1640,7 @@ private:
    uint32_t bound_terms,
    lw_shared_ptr<const select_statement::parameters> parameters,
    ::shared_ptr<selection::selection> selection,
-    ::shared_ptr<restrictions::statement_restrictions> restrictions,
+    ::shared_ptr<const restrictions::statement_restrictions> restrictions,
    ::shared_ptr<std::vector<size_t>> group_by_cell_indices,
    bool is_reversed,
    parallelized_select_statement::ordering_comparator_type ordering_comparator,
@@ -2076,7 +2071,7 @@ static select_statement::ordering_comparator_type get_similarity_ordering_compar

 ::shared_ptr<cql3::statements::select_statement> vector_indexed_table_select_statement::prepare(data_dictionary::database db, schema_ptr schema,
        uint32_t bound_terms, lw_shared_ptr<const parameters> parameters, ::shared_ptr<selection::selection> selection,
-        ::shared_ptr<restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
+        ::shared_ptr<const restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
        ordering_comparator_type ordering_comparator, prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit,
        std::optional<expr::expression> per_partition_limit, cql_stats& stats, const secondary_index::index& index, std::unique_ptr<attributes> attrs) {

@@ -2589,7 +2584,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(data_dictionary::d
    return make_unique<prepared_statement>(audit_info(), std::move(stmt), ctx, std::move(partition_key_bind_indices), std::move(warnings));
 }

-::shared_ptr<restrictions::statement_restrictions>
+::shared_ptr<const restrictions::statement_restrictions>
 select_statement::prepare_restrictions(data_dictionary::database db,
                                       schema_ptr schema,
                                       prepare_context& ctx,
@@ -2599,8 +2594,8 @@ select_statement::prepare_restrictions(data_dictionary::database db,
                                       restrictions::check_indexes do_check_indexes)
 {
    try {
-        return ::make_shared<restrictions::statement_restrictions>(restrictions::analyze_statement_restrictions(db, schema, statement_type::SELECT, _where_clause, ctx,
-            selection->contains_only_static_columns(), for_view, allow_filtering, do_check_indexes));
+        return restrictions::analyze_statement_restrictions(db, schema, statement_type::SELECT, _where_clause, ctx,
+            selection->contains_only_static_columns(), for_view, allow_filtering, do_check_indexes);
    } catch (const exceptions::unrecognized_entity_exception& e) {
        if (contains_alias(e.entity)) {
            throw exceptions::invalid_request_exception(format("Aliases aren't allowed in the WHERE clause (name: '{}')", e.entity));
--- a/cql3/statements/select_statement.hh
+++ b/cql3/statements/select_statement.hh
@@ -200,7 +200,7 @@ public:
                                                                    uint32_t bound_terms,
                                                                    lw_shared_ptr<const parameters> parameters,
                                                                    ::shared_ptr<selection::selection> selection,
-                                                                    ::shared_ptr<restrictions::statement_restrictions> restrictions,
+                                                                    ::shared_ptr<const restrictions::statement_restrictions> restrictions,
                                                                    ::shared_ptr<std::vector<size_t>> group_by_cell_indices,
                                                                    bool is_reversed,
                                                                    ordering_comparator_type ordering_comparator,
@@ -372,7 +372,7 @@ public:

    static ::shared_ptr<cql3::statements::select_statement> prepare(data_dictionary::database db, schema_ptr schema, uint32_t bound_terms,
            lw_shared_ptr<const parameters> parameters, ::shared_ptr<selection::selection> selection,
-            ::shared_ptr<restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
+            ::shared_ptr<const restrictions::statement_restrictions> restrictions, ::shared_ptr<std::vector<size_t>> group_by_cell_indices, bool is_reversed,
            ordering_comparator_type ordering_comparator, prepared_ann_ordering_type prepared_ann_ordering, std::optional<expr::expression> limit,
            std::optional<expr::expression> per_partition_limit, cql_stats& stats, const secondary_index::index& index, std::unique_ptr<cql3::attributes> attrs);

--- a/cql3/statements/update_statement.hh
+++ b/cql3/statements/update_statement.hh
@@ -66,7 +66,7 @@ public:
        : update_statement(std::move(audit_info), statement_type::INSERT, bound_terms, s, std::move(attrs), stats)
        , _value(std::move(v))
        , _default_unset(default_unset) {
-        _restrictions = restrictions::statement_restrictions(s, false);
+        _restrictions = cql3::restrictions::make_trivial_statement_restrictions(s, false);
    }
 private:
    virtual void execute_operations_for_key(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params, const json_cache_opt& json_cache) const override;
--- a/data_dictionary/data_dictionary.cc
+++ b/data_dictionary/data_dictionary.cc
@@ -224,10 +224,12 @@ keyspace_metadata::keyspace_metadata(std::string_view name,
             bool durable_writes,
             std::vector<schema_ptr> cf_defs,
             user_types_metadata user_types,
-             storage_options storage_opts)
+             storage_options storage_opts,
+             std::optional<locator::replication_strategy_config_options> next_options)
    : _name{name}
    , _strategy_name{locator::abstract_replication_strategy::to_qualified_class_name(strategy_name.empty() ? "NetworkTopologyStrategy" : strategy_name)}
    , _strategy_options{std::move(strategy_options)}
+    , _next_strategy_options{std::move(next_options)}
    , _initial_tablets(initial_tablets)
    , _durable_writes{durable_writes}
    , _user_types{std::move(user_types)}
@@ -273,14 +275,15 @@ keyspace_metadata::new_keyspace(std::string_view name,
                                std::optional<consistency_config_option> consistency_option,
                                bool durables_writes,
                                storage_options storage_opts,
-                                std::vector<schema_ptr> cf_defs)
+                                std::vector<schema_ptr> cf_defs,
+                                std::optional<locator::replication_strategy_config_options> next_options)
 {
-    return ::make_lw_shared<keyspace_metadata>(name, strategy_name, options, initial_tablets, consistency_option, durables_writes, cf_defs, user_types_metadata{}, storage_opts);
+    return ::make_lw_shared<keyspace_metadata>(name, strategy_name, options, initial_tablets, consistency_option, durables_writes, cf_defs, user_types_metadata{}, storage_opts, next_options);
 }

 lw_shared_ptr<keyspace_metadata>
 keyspace_metadata::new_keyspace(const keyspace_metadata& ksm) {
-    return new_keyspace(ksm.name(), ksm.strategy_name(), ksm.strategy_options(), ksm.initial_tablets(), ksm.consistency_option(), ksm.durable_writes(), ksm.get_storage_options());
+    return new_keyspace(ksm.name(), ksm.strategy_name(), ksm.strategy_options(), ksm.initial_tablets(), ksm.consistency_option(), ksm.durable_writes(), ksm.get_storage_options(), {}, ksm.next_strategy_options_opt());
 }

 void keyspace_metadata::add_user_type(const user_type ut) {
@@ -336,7 +339,7 @@ static storage_options::object_storage object_storage_from_map(std::string_view
    }
    if (values.size() > allowed_options.size()) {
        throw std::runtime_error(fmt::format("Extraneous options for {}: {}; allowed: {}",
-            fmt::join(values | std::views::keys, ","), type,
+            type, fmt::join(values | std::views::keys, ","),
            fmt::join(allowed_options | std::views::keys, ",")));
    }
    options.type = std::string(type);
@@ -649,8 +652,8 @@ struct fmt::formatter<data_dictionary::user_types_metadata> {
 };

 auto fmt::formatter<data_dictionary::keyspace_metadata>::format(const data_dictionary::keyspace_metadata& m, fmt::format_context& ctx) const -> decltype(ctx.out()) {
-    fmt::format_to(ctx.out(), "KSMetaData{{name={}, strategyClass={}, strategyOptions={}, cfMetaData={}, durable_writes={}, tablets=",
-            m.name(), m.strategy_name(), m.strategy_options(), m.cf_meta_data(), m.durable_writes());
+    fmt::format_to(ctx.out(), "KSMetaData{{name={}, strategyClass={}, strategyOptions={}, nextStrategyOptions={}, cfMetaData={}, durable_writes={}, tablets=",
+            m.name(), m.strategy_name(), m.strategy_options(), m.next_strategy_options_opt(), m.cf_meta_data(), m.durable_writes());
    if (m.initial_tablets()) {
        if (auto initial_tablets = m.initial_tablets().value()) {
            fmt::format_to(ctx.out(), "{{\"initial\":{}}}", initial_tablets);
--- a/data_dictionary/keyspace_metadata.hh
+++ b/data_dictionary/keyspace_metadata.hh
@@ -28,7 +28,9 @@ namespace data_dictionary {
 class keyspace_metadata final {
    sstring _name;
    sstring _strategy_name;
+    // If _next_strategy_options has a value, there is an ongoing RF change of this keyspace.
    locator::replication_strategy_config_options _strategy_options;
+    std::optional<locator::replication_strategy_config_options> _next_strategy_options;
    std::optional<unsigned> _initial_tablets;
    std::unordered_map<sstring, schema_ptr> _cf_meta_data;
    bool _durable_writes;
@@ -44,7 +46,8 @@ public:
                 bool durable_writes,
                 std::vector<schema_ptr> cf_defs = std::vector<schema_ptr>{},
                 user_types_metadata user_types = user_types_metadata{},
-                 storage_options storage_opts = storage_options{});
+                 storage_options storage_opts = storage_options{},
+                 std::optional<locator::replication_strategy_config_options> next_options = std::nullopt);
    static lw_shared_ptr<keyspace_metadata>
    new_keyspace(std::string_view name,
                 std::string_view strategy_name,
@@ -53,7 +56,8 @@ public:
                 std::optional<consistency_config_option> consistency_option,
                 bool durables_writes = true,
                 storage_options storage_opts = {},
-                 std::vector<schema_ptr> cf_defs = {});
+                 std::vector<schema_ptr> cf_defs = {},
+                 std::optional<locator::replication_strategy_config_options> next_options = std::nullopt);
    static lw_shared_ptr<keyspace_metadata>
    new_keyspace(const keyspace_metadata& ksm);
    void validate(const gms::feature_service&, const locator::topology&) const;
@@ -66,6 +70,18 @@ public:
    const locator::replication_strategy_config_options& strategy_options() const {
        return _strategy_options;
    }
+    void set_strategy_options(const locator::replication_strategy_config_options& options) {
+        _strategy_options = options;
+    }
+    const std::optional<locator::replication_strategy_config_options>& next_strategy_options_opt() const {
+        return _next_strategy_options;
+    }
+    void set_next_strategy_options(const locator::replication_strategy_config_options& options) {
+        _next_strategy_options = options;
+    }
+    void clear_next_strategy_options() {
+        _next_strategy_options = std::nullopt;
+    }
    locator::replication_strategy_config_options strategy_options_v1() const;
    std::optional<unsigned> initial_tablets() const {
        return _initial_tablets;
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -776,7 +776,7 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
    friend std::ostream& operator<<(std::ostream&, const segment&);
    friend class segment_manager;

-    size_t sector_overhead(size_t size) const {
+    constexpr size_t sector_overhead(size_t size) const {
        return (size / (_alignment - detail::sector_overhead_size)) * detail::sector_overhead_size;
    }

@@ -1028,18 +1028,21 @@ public:
        co_return me;
    }

-    /**
-     * Allocate a new buffer
-     */
-    void new_buffer(size_t s) {
-        SCYLLA_ASSERT(_buffer.empty());
-
+    std::tuple<size_t, size_t> buffer_usage_size(size_t s) const {
        auto overhead = segment_overhead_size;
        if (_file_pos == 0) {
            overhead += descriptor_header_size;
        }

-        s += overhead;
+        return {s + overhead, overhead};
+    }
+
+    /**
+     * Allocate a new buffer
+     */
+    void new_buffer(size_t size_in) {
+        SCYLLA_ASSERT(_buffer.empty());
+        auto [s, overhead] = buffer_usage_size(size_in);
        // add bookkeep data reqs. 
        auto a = align_up(s + sector_overhead(s), _alignment);
        auto k = std::max(a, default_size);
@@ -1427,6 +1430,9 @@ public:

    position_type next_position(size_t size) const {
        auto used = _buffer_ostream_size - _buffer_ostream.size();
+        if (used == 0) { // new chunk/segment
+            std::tie(size, std::ignore) = buffer_usage_size(size);
+        }
        used += size;
        return _file_pos + used + sector_overhead(used);
    }
@@ -1570,7 +1576,6 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
    clogger.debug("Attempting oversized alloc of {} entry writer", writer.num_entries);

    auto size = writer.size();
-    auto max_file_size = cfg.commitlog_segment_size_in_mb * 1024 * 1024;

    // check if this cannot be written at all...
    if (!cfg.allow_going_over_size_limit) {
@@ -1579,11 +1584,11 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
        // more worst case
        auto size_with_meta_overhead = size_with_sector_overhead
            + (1 + size_with_sector_overhead/max_mutation_size) * (segment::entry_overhead_size + segment::fragmented_entry_overhead_size + segment::segment_overhead_size)
-            * (1 + size_with_sector_overhead/max_file_size) * segment::descriptor_header_size
+            * (1 + size_with_sector_overhead/max_size) * segment::descriptor_header_size
            ;
        // this is not really true. We could have some space in current segment,
        // but again, lets be conservative.
-        auto max_file_size_avail = max_disk_size - max_file_size;
+        auto max_file_size_avail = max_disk_size - max_size;

        if (size_with_meta_overhead > max_file_size_avail) {
            throw std::invalid_argument(fmt::format("Mutation of {} bytes is too large for potentially available disk space of {}", size, max_file_size_avail));
@@ -1770,11 +1775,13 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
                    co_await s->close();
                    s = co_await get_segment();
                }
-                // bytes not counting overhead                
-                auto buf_rem = std::min(max_size - s->position(), s->_buffer_ostream.size());
+                // bytes left before max_size (0 if already at/past it), not counting overhead
+                auto pos = s->position();
+                auto max = std::max<size_t>(pos, max_size);
+                auto buf_rem = std::min(max - pos, s->_buffer_ostream.size());

                size_t avail;
-                if (buf_rem > align) {
+                if (buf_rem >= align) {
                    auto rem2 = buf_rem - (1 + buf_rem/sector_size) * detail::sector_overhead_size;
                    avail = std::min(rem2, max_mutation_size)
                        - segment::entry_overhead_size
@@ -1784,7 +1791,7 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
                } else {
                    co_await s->cycle();
                    auto pos = s->position();
-                    auto max = std::max<size_t>(pos, max_file_size);
+                    auto max = std::max<size_t>(pos, max_size);
                    auto file_rem = max - pos;

                    if (file_rem < align) {
--- a/db/commitlog/commitlog_replayer.cc
+++ b/db/commitlog/commitlog_replayer.cc
@@ -217,7 +217,7 @@ future<> db::commitlog_replayer::impl::process(stats* s, commitlog::buffer_and_r
        if (cm_it == local_cm.end()) {
            if (!cer.get_column_mapping()) {
                rlogger.debug("replaying at {} v={} at {}", fm.column_family_id(), fm.schema_version(), rp);
-                throw std::runtime_error(format("unknown schema version {}, table=", fm.schema_version(), fm.column_family_id()));
+                throw std::runtime_error(format("unknown schema version {}, table={}", fm.schema_version(), fm.column_family_id()));
            }
            rlogger.debug("new schema version {} in entry {}", fm.schema_version(), rp);
            cm_it = local_cm.emplace(fm.schema_version(), *cer.get_column_mapping()).first;
--- a/db/config.cc
+++ b/db/config.cc
@@ -1921,7 +1921,7 @@ std::map<sstring, db::experimental_features_t::feature> db::experimental_feature
        {"lwt", feature::UNUSED},
        {"udf", feature::UDF},
        {"cdc", feature::UNUSED},
-        {"alternator-streams", feature::ALTERNATOR_STREAMS},
+        {"alternator-streams", feature::UNUSED},
        {"alternator-ttl", feature::UNUSED },
        {"consistent-topology-changes", feature::UNUSED},
        {"broadcast-tables", feature::BROADCAST_TABLES},
--- a/db/config.hh
+++ b/db/config.hh
@@ -115,7 +115,6 @@ struct experimental_features_t {
    enum class feature {
        UNUSED,
        UDF,
-        ALTERNATOR_STREAMS,
        BROADCAST_TABLES,
        KEYSPACE_STORAGE_OPTIONS,
        STRONGLY_CONSISTENT_TABLES,
--- a/db/consistency_level.cc
+++ b/db/consistency_level.cc
@@ -277,7 +277,7 @@ filter_for_query(consistency_level cl,

    host_id_vector_replica_set selected_endpoints;

-    // Pre-select endpoints based on client preference. If the endpoints
+    // Preselect endpoints based on client preference. If the endpoints
    // selected this way aren't enough to satisfy CL requirements select the
    // remaining ones according to the load-balancing strategy as before.
    if (!preferred_endpoints.empty()) {
--- a/db/heat_load_balance.cc
+++ b/db/heat_load_balance.cc
@@ -327,7 +327,7 @@ redistribute(const std::vector<float>& p, unsigned me, unsigned k) {
                }
            }

-            hr_logger.trace("     pp after1=", pp);
+            hr_logger.trace("     pp after1={}", pp);
            if (d.first == me) {
                // We only care what "me" sends, and only the elements in
                // the sorted list earlier than me could have forced it to
--- a/db/schema_features.hh
+++ b/db/schema_features.hh
@@ -33,6 +33,11 @@ enum class schema_feature {

    // Per-table tablet options
    TABLET_OPTIONS,
+
+    // When enabled, `system_schema.keyspaces` will keep three replication values:
+    // the initial, the current, and the target replication factor,
+    // which reflect the phases of the multi RF change.
+    KEYSPACE_MULTI_RF_CHANGE,
 };

 using schema_features = enum_set<super_enum<schema_feature,
@@ -43,7 +48,8 @@ using schema_features = enum_set<super_enum<schema_feature,
    schema_feature::TABLE_DIGEST_INSENSITIVE_TO_EXPIRY,
    schema_feature::GROUP0_SCHEMA_VERSIONING,
    schema_feature::IN_MEMORY_TABLES,
-    schema_feature::TABLET_OPTIONS
+    schema_feature::TABLET_OPTIONS,
+    schema_feature::KEYSPACE_MULTI_RF_CHANGE
    >>;

 }
--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -216,6 +216,7 @@ schema_ptr keyspaces() {
            {"durable_writes", boolean_type},
            {"replication", map_type_impl::get_instance(utf8_type, utf8_type, false)},
            {"replication_v2", map_type_impl::get_instance(utf8_type, utf8_type, false)}, // with rack list RF
+            {"next_replication", map_type_impl::get_instance(utf8_type, utf8_type, false)}, // target rack list RF for this RF change
        },
        // static columns
        {},
@@ -1178,6 +1179,14 @@ utils::chunked_vector<mutation> make_create_keyspace_mutations(schema_features f
        // If the maps are different, the upgrade must be already done.
        store_map(m, ckey, "replication_v2", timestamp, cql3::statements::to_flattened_map(map));
    }
+    if (features.contains<schema_feature::KEYSPACE_MULTI_RF_CHANGE>()) {
+        const auto& next_map_opt = keyspace->next_strategy_options_opt();
+        if (next_map_opt) {
+            auto next_map = *next_map_opt;
+            next_map["class"] = keyspace->strategy_name();
+            store_map(m, ckey, "next_replication", timestamp, cql3::statements::to_flattened_map(next_map));
+        }
+    }

    if (features.contains<schema_feature::SCYLLA_KEYSPACES>()) {
        schema_ptr scylla_keyspaces_s = scylla_keyspaces();
@@ -1251,6 +1260,7 @@ future<lw_shared_ptr<keyspace_metadata>> create_keyspace_metadata(
    // (or screw up shared pointers)
    const auto& replication = row.get_nonnull<map_type_impl::native_type>("replication");
    const auto& replication_v2 = row.get<map_type_impl::native_type>("replication_v2");
+    const auto& next_replication = row.get<map_type_impl::native_type>("next_replication");

    cql3::statements::property_definitions::map_type flat_strategy_options;
    for (auto& p : replication_v2 ? *replication_v2 : replication) {
@@ -1259,6 +1269,17 @@ future<lw_shared_ptr<keyspace_metadata>> create_keyspace_metadata(
    auto strategy_options = cql3::statements::from_flattened_map(flat_strategy_options);
    auto strategy_name = std::get<sstring>(strategy_options["class"]);
    strategy_options.erase("class");
+
+    std::optional<cql3::statements::property_definitions::extended_map_type> next_strategy_options = std::nullopt;
+    if (next_replication) {
+        cql3::statements::property_definitions::map_type flat_next_replication;
+        for (auto& p : *next_replication) {
+            flat_next_replication.emplace(value_cast<sstring>(p.first), value_cast<sstring>(p.second));
+        }
+        next_strategy_options = cql3::statements::from_flattened_map(flat_next_replication);
+        next_strategy_options->erase("class");
+    }
+
    bool durable_writes = row.get_nonnull<bool>("durable_writes");

    data_dictionary::storage_options storage_opts;
@@ -1284,7 +1305,7 @@ future<lw_shared_ptr<keyspace_metadata>> create_keyspace_metadata(
            }
        }
    }
-    co_return keyspace_metadata::new_keyspace(keyspace_name, strategy_name, strategy_options, initial_tablets, consistency, durable_writes, storage_opts);
+    co_return keyspace_metadata::new_keyspace(keyspace_name, strategy_name, strategy_options, initial_tablets, consistency, durable_writes, storage_opts, {}, next_strategy_options);
 }

 template<typename V>
--- a/db/system_distributed_keyspace.cc
+++ b/db/system_distributed_keyspace.cc
@@ -13,7 +13,6 @@
 #include "replica/database.hh"
 #include "db/consistency_level_type.hh"
 #include "db/system_keyspace.hh"
-#include "db/config.hh"
 #include "schema/schema_builder.hh"
 #include "timeout_config.hh"
 #include "types/types.hh"
@@ -22,8 +21,6 @@
 #include "cdc/generation.hh"
 #include "cql3/query_processor.hh"
 #include "service/storage_proxy.hh"
-#include "gms/feature_service.hh"
-
 #include "service/migration_manager.hh"
 #include "locator/host_id.hh"

@@ -41,27 +38,10 @@ static logging::logger dlogger("system_distributed_keyspace");
 extern logging::logger cdc_log;

 namespace db {
-namespace {
-    const auto set_wait_for_sync_to_commitlog = schema_builder::register_schema_initializer([](schema_builder& builder) {
-        if ((builder.ks_name() == system_distributed_keyspace::NAME_EVERYWHERE && builder.cf_name() == system_distributed_keyspace::CDC_GENERATIONS_V2) ||
-            (builder.ks_name() == system_distributed_keyspace::NAME && builder.cf_name() == system_distributed_keyspace::CDC_TOPOLOGY_DESCRIPTION))
-        {
-            builder.set_wait_for_sync_to_commitlog(true);
-        }
-    });
-}

 extern thread_local data_type cdc_streams_set_type;
 thread_local data_type cdc_streams_set_type = set_type_impl::get_instance(bytes_type, false);

-/* See `token_range_description` struct */
-thread_local data_type cdc_streams_list_type = list_type_impl::get_instance(bytes_type, false);
-thread_local data_type cdc_token_range_description_type = tuple_type_impl::get_instance(
-        { long_type             // dht::token token_range_end;
-        , cdc_streams_list_type // std::vector<stream_id> streams;
-        , byte_type             // uint8_t sharding_ignore_msb;
-        });
-thread_local data_type cdc_generation_description_type = list_type_impl::get_instance(cdc_token_range_description_type, false);

 schema_ptr view_build_status() {
    static thread_local auto schema = [] {
@@ -77,42 +57,6 @@ schema_ptr view_build_status() {
    return schema;
 }

-/* An internal table used by nodes to exchange CDC generation data. */
-schema_ptr cdc_generations_v2() {
-    thread_local auto schema = [] {
-        auto id = generate_legacy_id(system_distributed_keyspace::NAME_EVERYWHERE, system_distributed_keyspace::CDC_GENERATIONS_V2);
-        return schema_builder(system_distributed_keyspace::NAME_EVERYWHERE, system_distributed_keyspace::CDC_GENERATIONS_V2, {id})
-                /* The unique identifier of this generation. */
-                .with_column("id", uuid_type, column_kind::partition_key)
-                /* The generation describes a mapping from all tokens in the token ring to a set of stream IDs.
-                 * This mapping is built from a bunch of smaller mappings, each describing how tokens in a subrange
-                 * of the token ring are mapped to stream IDs; these subranges together cover the entire token ring.
-                 * Each such range-local mapping is represented by a row of this table.
-                 * The clustering key of the row is the end of the range being described by this row.
-                 * The start of this range is the range_end of the previous row (in the clustering order, which is the integer order)
-                 * or of the last row of this partition if this is the first the first row. */
-                .with_column("range_end", long_type, column_kind::clustering_key)
-                /* The set of streams mapped to in this range.
-                 * The number of streams mapped to a single range in a CDC generation is bounded from above by the number
-                 * of shards on the owner of that range in the token ring.
-                 * In other words, the number of elements of this set is bounded by the maximum of the number of shards
-                 * over all nodes. The serialized size is obtained by counting about 20B for each stream.
-                 * For example, if all nodes in the cluster have at most 128 shards,
-                 * the serialized size of this set will be bounded by ~2.5 KB. */
-                .with_column("streams", cdc_streams_set_type)
-                /* The value of the `ignore_msb` sharding parameter of the node which was the owner of this token range
-                 * when the generation was first created. Together with the set of streams above it fully describes
-                 * the mapping for this particular range. */
-                .with_column("ignore_msb", byte_type)
-                /* Column used for sanity checking.
-                 * For a given generation it's equal to the number of ranges in this generation;
-                 * thus, after the generation is fully inserted, it must be equal to the number of rows in the partition. */
-                .with_column("num_ranges", int32_type, column_kind::static_column)
-                .with_hash_version()
-                .build();
-    }();
-    return schema;
-}

 /* A user-facing table providing identifiers of the streams used in CDC generations. */
 schema_ptr cdc_desc() {
@@ -152,23 +96,6 @@ schema_ptr cdc_timestamps() {

 static const sstring CDC_TIMESTAMPS_KEY = "timestamps";

-schema_ptr service_levels() {
-    static thread_local auto schema = [] {
-        auto id = generate_legacy_id(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS);
-        auto builder = schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS, std::make_optional(id))
-                .with_column("service_level", utf8_type, column_kind::partition_key)
-                .with_column("shares", int32_type);
-        if (utils::get_local_injector().is_enabled("service_levels_v1_table_without_shares")) {
-            builder.remove_column("shares");
-        }
-
-        return builder
-                .with_hash_version()
-                .build();
-    }();
-    return schema;
-}
-
 // This is the set of tables which this node ensures to exist in the cluster.
 // It does that by announcing the creation of these schemas on initialization
 // of the `system_distributed_keyspace` service (see `start()`), unless it first
@@ -182,19 +109,13 @@ schema_ptr service_levels() {
 static std::vector<schema_ptr> ensured_tables() {
    return {
        view_build_status(),
-        cdc_generations_v2(),
        cdc_desc(),
        cdc_timestamps(),
-        service_levels(),
    };
 }

 std::vector<schema_ptr> system_distributed_keyspace::all_distributed_tables() {
-    return {view_build_status(), cdc_desc(), cdc_timestamps(), service_levels()};
-}
-
-std::vector<schema_ptr> system_distributed_keyspace::all_everywhere_tables() {
-    return {cdc_generations_v2()};
+    return {view_build_status(), cdc_desc(), cdc_timestamps()};
 }

 system_distributed_keyspace::system_distributed_keyspace(cql3::query_processor& qp, service::migration_manager& mm, service::storage_proxy& sp)
@@ -203,36 +124,6 @@ system_distributed_keyspace::system_distributed_keyspace(cql3::query_processor&
        , _sp(sp) {
 }

-static std::vector<std::pair<std::string_view, data_type>> new_service_levels_columns(bool workload_prioritization_enabled) {
-    std::vector<std::pair<std::string_view, data_type>> new_columns {{"timeout", duration_type}, {"workload_type", utf8_type}};
-    if (workload_prioritization_enabled) {
-        new_columns.push_back({"shares", int32_type});
-    }
-    return new_columns;
-};
-
-static schema_ptr get_current_service_levels(data_dictionary::database db) {
-    return db.has_schema(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS)
-            ? db.find_schema(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS)
-            : service_levels();
-}
-
-static schema_ptr get_updated_service_levels(data_dictionary::database db, bool workload_prioritization_enabled) {
-    SCYLLA_ASSERT(this_shard_id() == 0);
-    auto schema = get_current_service_levels(db);
-    schema_builder b(schema);
-    for (const auto& col : new_service_levels_columns(workload_prioritization_enabled)) {
-        auto& [col_name, col_type] = col;
-        bytes options_name = to_bytes(col_name.data());
-        if (schema->get_column_definition(options_name)) {
-            continue;
-        }
-        b.with_column(options_name, col_type, column_kind::regular_column);
-    }
-    b.with_hash_version();
-    return b.build();
-}
-
 future<> system_distributed_keyspace::create_tables(std::vector<schema_ptr> tables) {
    if (this_shard_id() != 0) {
        _started = true;
@@ -243,11 +134,9 @@ future<> system_distributed_keyspace::create_tables(std::vector<schema_ptr> tabl

    while (true) {
        // Check if there is any work to do before taking the group 0 guard.
-        bool workload_prioritization_enabled = _sp.features().workload_prioritization;
-        bool keyspaces_setup = db.has_keyspace(NAME) && db.has_keyspace(NAME_EVERYWHERE);
+        bool keyspaces_setup = db.has_keyspace(NAME);
        bool tables_setup = std::all_of(tables.begin(), tables.end(), [db] (schema_ptr t) { return db.has_schema(t->ks_name(), t->cf_name()); } );
-        bool service_levels_up_to_date = get_current_service_levels(db)->equal_columns(*get_updated_service_levels(db, workload_prioritization_enabled));
-        if (keyspaces_setup && tables_setup && service_levels_up_to_date) {
-            dlogger.info("system_distributed(_everywhere) keyspaces and tables are up-to-date. Not creating");
+        if (keyspaces_setup && tables_setup) {
+            dlogger.info("system_distributed keyspace and tables are up-to-date. Not creating");
            _started = true;
            co_return;
@@ -258,51 +147,25 @@ future<> system_distributed_keyspace::create_tables(std::vector<schema_ptr> tabl
        utils::chunked_vector<mutation> mutations;
        sstring description;

-        auto sd_ksm = keyspace_metadata::new_keyspace(
+        auto ksm = keyspace_metadata::new_keyspace(
                NAME,
                "org.apache.cassandra.locator.SimpleStrategy",
                {{"replication_factor", "3"}},
                std::nullopt, std::nullopt);
        if (!db.has_keyspace(NAME)) {
-            mutations = service::prepare_new_keyspace_announcement(db.real_database(), sd_ksm, ts);
+            mutations = service::prepare_new_keyspace_announcement(db.real_database(), ksm, ts);
            description += format(" create {} keyspace;", NAME);
        } else {
            dlogger.info("{} keyspace is already present. Not creating", NAME);
        }

-        auto sde_ksm = keyspace_metadata::new_keyspace(
-                NAME_EVERYWHERE,
-                "org.apache.cassandra.locator.EverywhereStrategy",
-                {},
-                std::nullopt, std::nullopt);
-        if (!db.has_keyspace(NAME_EVERYWHERE)) {
-            auto sde_mutations = service::prepare_new_keyspace_announcement(db.real_database(), sde_ksm, ts);
-            std::move(sde_mutations.begin(), sde_mutations.end(), std::back_inserter(mutations));
-            description += format(" create {} keyspace;", NAME_EVERYWHERE);
-        } else {
-            dlogger.info("{} keyspace is already present. Not creating", NAME_EVERYWHERE);
-        }
-
-        // Get mutations for creating and updating tables.
+        // Get mutations for creating tables.
        auto num_keyspace_mutations = mutations.size();
        co_await coroutine::parallel_for_each(ensured_tables(),
-                [this, &mutations, db, ts, sd_ksm, sde_ksm, workload_prioritization_enabled] (auto&& table) -> future<> {
-            auto ksm = table->ks_name() == NAME ? sd_ksm : sde_ksm;
-
-            // Ensure that the service_levels table contains new columns.
-            if (table->cf_name() == SERVICE_LEVELS) {
-                table = get_updated_service_levels(db, workload_prioritization_enabled);
-            }
-
+                [this, &mutations, db, ts, ksm] (auto&& table) -> future<> {
            if (!db.has_schema(table->ks_name(), table->cf_name())) {
                co_return co_await service::prepare_new_column_family_announcement(mutations, _sp, *ksm, std::move(table), ts);
            }
-
-            // The service_levels table exists. Update it if it lacks new columns.
-            if (table->cf_name() == SERVICE_LEVELS && !get_current_service_levels(db)->equal_columns(*table)) {
-                auto update_mutations = co_await service::prepare_column_family_update_announcement(_sp, table, std::vector<view_ptr>(), ts);
-                std::move(update_mutations.begin(), update_mutations.end(), std::back_inserter(mutations));
-            }
        });
        if (mutations.size() > num_keyspace_mutations) {
-            description += " create and update system_distributed(_everywhere) tables";
+            description += " create system_distributed tables";
@@ -324,15 +187,6 @@ future<> system_distributed_keyspace::create_tables(std::vector<schema_ptr> tabl
    }
 }

- future<> system_distributed_keyspace::start_workload_prioritization() {
-    if (this_shard_id() != 0) {
-        co_return;
-    }
-    if (_qp.db().features().workload_prioritization) {
-       co_await create_tables({get_updated_service_levels(_qp.db(), true)});
-    }
-}
-
 future<> system_distributed_keyspace::start() {
    if (this_shard_id() != 0) {
        _started = true;
@@ -375,90 +229,6 @@ static db::consistency_level quorum_if_many(size_t num_token_owners) {
    return num_token_owners > 1 ? db::consistency_level::QUORUM : db::consistency_level::ONE;
 }

-future<>
-system_distributed_keyspace::insert_cdc_generation(
-        utils::UUID id,
-        const cdc::topology_description& desc,
-        context ctx) {
-    using namespace std::chrono_literals;
-
-    const size_t concurrency = 10;
-    const size_t num_replicas = ctx.num_token_owners;
-
-    // To insert the data quickly and efficiently we send it in batches of multiple rows
-    // (each batch represented by a single mutation). We also send multiple such batches concurrently.
-    // However, we need to limit the memory consumption of the operation.
-    // I assume that the memory consumption grows linearly with the number of replicas
-    // (we send to all replicas ``at the same time''), with the batch size (the data must
-    // be copied for each replica?) and with concurrency. These assumptions may be too conservative
-    // but that won't hurt in a significant way (it may hurt the efficiency of the operation a little).
-    // Thus, if we want to limit the memory consumption to L, it should be true that
-    // mutation_size * num_replicas * concurrency <= L, hence
-    // mutation_size <= L / (num_replicas * concurrency).
-    // For example, say L = 10MB, concurrency = 10, num_replicas = 100; we get
-    // mutation_size <= 10MB / 1000 = 10KB.
-    // On the other hand we must have mutation_size >= size of a single row,
-    // so we will use mutation_size <= max(size of single row, L/(num_replicas*concurrency)).
-
-    // It has been tested that sending 1MB batches to 3 replicas with concurrency 20 works OK,
-    // which would correspond to L ~= 60MB. Hence that's the limit we use here.
-    const size_t L = 60'000'000;
-    const auto mutation_size_threshold = std::max(size_t(1), L / (num_replicas * concurrency));
-
-    auto s = _qp.db().real_database().find_schema(
-        system_distributed_keyspace::NAME_EVERYWHERE, system_distributed_keyspace::CDC_GENERATIONS_V2);
-    auto ms = co_await cdc::get_cdc_generation_mutations_v2(s, id, desc, mutation_size_threshold, api::new_timestamp());
-    co_await max_concurrent_for_each(ms, concurrency, [&] (mutation& m) -> future<> {
-        co_await _sp.mutate(
-            { std::move(m) },
-            db::consistency_level::ALL,
-            db::timeout_clock::now() + 60s,
-            nullptr, // trace_state
-            empty_service_permit(),
-            db::allow_per_partition_rate_limit::no,
-            false // raw_counters
-        );
-    });
-}
-
-future<std::optional<cdc::topology_description>>
-system_distributed_keyspace::read_cdc_generation(utils::UUID id) {
-    utils::chunked_vector<cdc::token_range_description> entries;
-    size_t num_ranges = 0;
-    co_await _qp.query_internal(
-            // This should be a local read so 20s should be more than enough
-            format("SELECT range_end, streams, ignore_msb, num_ranges FROM {}.{} WHERE id = ? USING TIMEOUT 20s", NAME_EVERYWHERE, CDC_GENERATIONS_V2),
-            db::consistency_level::ONE, // we wrote the generation with ALL so ONE must see it (or there's something really wrong)
-            { id },
-            1000, // for ~1KB rows, ~1MB page size
-            [&] (const cql3::untyped_result_set_row& row) {
-
-        std::vector<cdc::stream_id> streams;
-        row.get_list_data<bytes>("streams", std::back_inserter(streams));
-        entries.push_back(cdc::token_range_description{
-                dht::token::from_int64(row.get_as<int64_t>("range_end")),
-                std::move(streams),
-                uint8_t(row.get_as<int8_t>("ignore_msb"))});
-        num_ranges = row.get_as<int32_t>("num_ranges");
-        return make_ready_future<stop_iteration>(stop_iteration::no);
-    });
-
-    if (entries.empty()) {
-        co_return std::nullopt;
-    }
-
-    // Paranoic sanity check. Partial reads should not happen since generations should be retrieved only after they
-    // were written successfully with CL=ALL. But nobody uses EverywhereStrategy tables so they weren't ever properly
-    // tested, so just in case...
-    if (entries.size() != num_ranges) {
-        throw std::runtime_error(format(
-                "read_cdc_generation: wrong number of rows. The `num_ranges` column claimed {} rows,"
-                " but reading the partition returned {}.", num_ranges, entries.size()));
-    }
-
-    co_return std::optional{cdc::topology_description(std::move(entries))};
-}
-
 static future<utils::chunked_vector<mutation>> get_cdc_streams_descriptions_v2_mutation(
        const replica::database& db,
        db_clock::time_point time,
@@ -630,65 +400,4 @@ system_distributed_keyspace::cdc_current_generation_timestamp(context ctx) {
    co_return timestamp_cql->one().get_as<db_clock::time_point>("time");
 }

-future<qos::service_levels_info> system_distributed_keyspace::get_service_levels(qos::query_context ctx) const {
-    return qos::get_service_levels(_qp, NAME, SERVICE_LEVELS, db::consistency_level::ONE, ctx);
-}
-
-future<qos::service_levels_info> system_distributed_keyspace::get_service_level(sstring service_level_name) const {
-    return qos::get_service_level(_qp, NAME, SERVICE_LEVELS, service_level_name, db::consistency_level::ONE);
-}
-
-future<> system_distributed_keyspace::set_service_level(sstring service_level_name, qos::service_level_options slo) const {
-    static sstring prepared_query = format("INSERT INTO {}.{} (service_level) VALUES (?);", NAME, SERVICE_LEVELS);
-    co_await _qp.execute_internal(prepared_query, db::consistency_level::ONE, internal_distributed_query_state(), {service_level_name}, cql3::query_processor::cache_internal::no);
-    auto to_data_value = [&] (const qos::service_level_options::timeout_type& tv) {
-        return std::visit(overloaded_functor {
-            [&] (const qos::service_level_options::unset_marker&) {
-                return data_value::make_null(duration_type);
-            },
-            [&] (const qos::service_level_options::delete_marker&) {
-                return data_value::make_null(duration_type);
-            },
-            [&] (const lowres_clock::duration& d) {
-                return data_value(cql_duration(months_counter{0},
-                        days_counter{0},
-                        nanoseconds_counter{std::chrono::duration_cast<std::chrono::nanoseconds>(d).count()}));
-            },
-        }, tv);
-    };
-    auto to_data_value_g = [&] <typename T> (const std::variant<qos::service_level_options::unset_marker, qos::service_level_options::delete_marker, T>& v) {
-        return std::visit(overloaded_functor {
-            [&] (const qos::service_level_options::unset_marker&) {
-                return data_value::make_null(data_type_for<T>());
-            },
-            [&] (const qos::service_level_options::delete_marker&) {
-                return data_value::make_null(data_type_for<T>());
-            },
-            [&] (const T& v) {
-                return data_value(v);
-            },
-        }, v);
-    };
-    data_value workload = slo.workload == qos::service_level_options::workload_type::unspecified
-            ? data_value::make_null(utf8_type)
-            : data_value(qos::service_level_options::to_string(slo.workload));
-    co_await _qp.execute_internal(format("UPDATE {}.{} SET timeout = ?, workload_type = ? WHERE service_level = ?;", NAME, SERVICE_LEVELS),
-                db::consistency_level::ONE,
-                internal_distributed_query_state(),
-                {to_data_value(slo.timeout),
-                    workload,
-                    service_level_name},
-                cql3::query_processor::cache_internal::no);
-    co_await _qp.execute_internal(format("UPDATE {}.{} SET shares = ? WHERE service_level = ?;", NAME, SERVICE_LEVELS),
-                db::consistency_level::ONE,
-                internal_distributed_query_state(),
-                {to_data_value_g(slo.shares), service_level_name},
-                cql3::query_processor::cache_internal::no);
-}
-
-future<> system_distributed_keyspace::drop_service_level(sstring service_level_name) const {
-    static sstring prepared_query = format("DELETE FROM {}.{} WHERE service_level= ?;", NAME, SERVICE_LEVELS);
-    return _qp.execute_internal(prepared_query, db::consistency_level::ONE, internal_distributed_query_state(), {service_level_name}, cql3::query_processor::cache_internal::no).discard_result();
-}
-
 }
--- a/db/system_distributed_keyspace.hh
+++ b/db/system_distributed_keyspace.hh
@@ -9,9 +9,6 @@
 #pragma once

 #include "schema/schema_fwd.hh"
-#include "service/qos/qos_common.hh"
-#include "utils/UUID.hh"
-#include "cdc/generation_id.hh"
 #include "locator/host_id.hh"

 #include <seastar/core/future.hh>
@@ -24,7 +21,6 @@ class query_processor;
 }

 namespace cdc {
-    class stream_id;
    class topology_description;
    class streams_version;
 } // namespace cdc
@@ -39,17 +35,8 @@ namespace db {
 class system_distributed_keyspace {
 public:
    static constexpr auto NAME = "system_distributed";
-    static constexpr auto NAME_EVERYWHERE = "system_distributed_everywhere";

    static constexpr auto VIEW_BUILD_STATUS = "view_build_status";
-    static constexpr auto SERVICE_LEVELS = "service_levels";
-
-    /* Nodes use this table to communicate new CDC stream generations to other nodes. */
-    static constexpr auto CDC_TOPOLOGY_DESCRIPTION = "cdc_generation_descriptions";
-
-    /* Nodes use this table to communicate new CDC stream generations to other nodes.
-     * Resides in system_distributed_everywhere. */
-    static constexpr auto CDC_GENERATIONS_V2 = "cdc_generation_descriptions_v2";

    /* This table is used by CDC clients to learn about available CDC streams. */
    static constexpr auto CDC_DESC_V2 = "cdc_streams_descriptions_v2";
@@ -77,19 +64,14 @@ private:

 public:
    static std::vector<schema_ptr> all_distributed_tables();
-    static std::vector<schema_ptr> all_everywhere_tables();

    system_distributed_keyspace(cql3::query_processor&, service::migration_manager&, service::storage_proxy&);

    future<> start();
-    future<> start_workload_prioritization();
    future<> stop();

    bool started() const { return _started; }

-    future<> insert_cdc_generation(utils::UUID, const cdc::topology_description&, context);
-    future<std::optional<cdc::topology_description>> read_cdc_generation(utils::UUID);
-
    future<> create_cdc_desc(db_clock::time_point, const cdc::topology_description&, context);
    future<bool> cdc_desc_exists(db_clock::time_point, context);

@@ -105,11 +87,6 @@ public:
    // NOTE: currently used only by alternator
    future<db_clock::time_point> cdc_current_generation_timestamp(context);

-    future<qos::service_levels_info> get_service_levels(qos::query_context ctx) const;
-    future<qos::service_levels_info> get_service_level(sstring service_level_name) const;
-    future<> set_service_level(sstring service_level_name, qos::service_level_options slo) const;
-    future<> drop_service_level(sstring service_level_name) const;
-
 private:
    future<> create_tables(std::vector<schema_ptr> tables);
 };
--- a/db/system_keyspace.cc
+++ b/db/system_keyspace.cc
@@ -300,6 +300,7 @@ schema_ptr system_keyspace::topology() {
            .with_column("upgrade_state", utf8_type, column_kind::static_column)
            .with_column("global_requests", set_type_impl::get_instance(timeuuid_type, true), column_kind::static_column)
            .with_column("paused_rf_change_requests", set_type_impl::get_instance(timeuuid_type, true), column_kind::static_column)
+            .with_column("ongoing_rf_changes", set_type_impl::get_instance(timeuuid_type, true), column_kind::static_column)
            .set_comment("Current state of topology change machine")
            .with_hash_version()
            .build();
@@ -3350,6 +3351,12 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
            }
        }

+        if (some_row.has("ongoing_rf_changes")) {
+            for (auto&& v : deserialize_set_column(*topology(), some_row, "ongoing_rf_changes")) {
+                ret.ongoing_rf_changes.insert(value_cast<utils::UUID>(v));
+            }
+        }
+
        if (some_row.has("enabled_features")) {
            ret.enabled_features = decode_features(deserialize_set_column(*topology(), some_row, "enabled_features"));
        }
--- a/db/view/node_view_update_backlog.hh
+++ b/db/view/node_view_update_backlog.hh
@@ -10,6 +10,7 @@

 #include "db/view/view_update_backlog.hh"
 #include "utils/error_injection.hh"
+#include "utils/updateable_value.hh"

 #include <seastar/core/cacheline.hh>
 #include <seastar/core/future.hh>
@@ -41,13 +42,16 @@ class node_update_backlog {
    std::chrono::milliseconds _interval;
    std::atomic<clock::time_point> _last_update;
    std::atomic<update_backlog> _max;
+    utils::updateable_value<uint32_t> _view_flow_control_delay_limit_in_ms;

 public:
-    explicit node_update_backlog(size_t shards, std::chrono::milliseconds interval)
+    explicit node_update_backlog(size_t shards, std::chrono::milliseconds interval,
+            utils::updateable_value<uint32_t> view_flow_control_delay_limit_in_ms = utils::updateable_value<uint32_t>(1000))
            : _backlogs(shards)
            , _interval(interval)
            , _last_update(clock::now() - _interval)
-            , _max(update_backlog::no_backlog()) {
+            , _max(update_backlog::no_backlog())
+            , _view_flow_control_delay_limit_in_ms(std::move(view_flow_control_delay_limit_in_ms)) {
        if (utils::get_local_injector().enter("update_backlog_immediately")) {
            _interval = std::chrono::milliseconds(0);
            _last_update = clock::now();
@@ -59,6 +63,9 @@ public:
    update_backlog fetch_shard(unsigned shard);
    seastar::future<std::optional<update_backlog>> fetch_if_changed();

+    std::chrono::microseconds calculate_throttling_delay(update_backlog backlog,
+            db::timeout_clock::time_point timeout) const;
+
    // Exposed for testing only.
    update_backlog load() const {
        return _max.load(std::memory_order_relaxed);
--- a/db/view/row_locking.cc
+++ b/db/view/row_locking.cc
@@ -150,14 +150,14 @@ row_locker::unlock(const dht::decorated_key* pk, bool partition_exclusive,
        auto pli = _two_level_locks.find(*pk);
        if (pli == _two_level_locks.end()) {
            // This shouldn't happen... We can't unlock this lock if we can't find it...
-            mylog.error("column_family::local_base_lock_holder::~local_base_lock_holder() can't find lock for partition", *pk);
+            mylog.error("column_family::local_base_lock_holder::~local_base_lock_holder() can't find lock for partition {}", *pk);
            return;
        }
        SCYLLA_ASSERT(&pli->first == pk);
        if (cpk) {
            auto rli = pli->second._row_locks.find(*cpk);
            if (rli == pli->second._row_locks.end()) {
-                mylog.error("column_family::local_base_lock_holder::~local_base_lock_holder() can't find lock for row", *cpk);
+                mylog.error("column_family::local_base_lock_holder::~local_base_lock_holder() can't find lock for row {}", *cpk);
                return;
            }
            SCYLLA_ASSERT(&rli->first == cpk);
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -45,6 +45,7 @@
 #include "db/view/view_builder.hh"
 #include "db/view/view_updating_consumer.hh"
 #include "db/view/view_update_generator.hh"
+#include "db/view/node_view_update_backlog.hh"
 #include "db/view/regular_column_transformation.hh"
 #include "db/system_keyspace_view_types.hh"
 #include "db/system_keyspace.hh"
@@ -1584,9 +1585,11 @@ future<stop_iteration> view_update_builder::on_results() {

    auto tombstone = std::max(_update_partition_tombstone, _update_current_tombstone);
    if (tombstone && _existing && !_existing->is_end_of_partition()) {
-        // We don't care if it's a range tombstone, as we're only looking for existing entries that get deleted
-        if (_existing->is_clustering_row()) {
+        if (_existing->is_range_tombstone_change()) {
+            _existing_current_tombstone = _existing->as_range_tombstone_change().tombstone();
+        } else if (_existing->is_clustering_row()) {
            auto existing = clustering_row(*_schema, _existing->as_clustering_row());
+            existing.apply(std::max(_existing_partition_tombstone, _existing_current_tombstone));
            auto update = clustering_row(existing.key(), row_tombstone(std::move(tombstone)), row_marker(), ::row());
            generate_update(std::move(update), { std::move(existing) });
        } else if (_existing->is_static_row()) {
@@ -1597,9 +1600,10 @@ future<stop_iteration> view_update_builder::on_results() {
        return should_stop_updates() ? stop() : advance_existings();
    }

-    // If we have updates and it's a range tombstone, it removes nothing pre-exisiting, so we can ignore it
    if (_update && !_update->is_end_of_partition()) {
-        if (_update->is_clustering_row()) {
+        if (_update->is_range_tombstone_change()) {
+            _update_current_tombstone = _update->as_range_tombstone_change().tombstone();
+        } else if (_update->is_clustering_row()) {
            _update->mutate_as_clustering_row(*_schema, [&] (clustering_row& cr) mutable {
                cr.apply(std::max(_update_partition_tombstone, _update_current_tombstone));
            });
@@ -3489,18 +3493,27 @@ future<> delete_ghost_rows_visitor::do_accept_new_row(partition_key pk, clusteri
    }
 }

-std::chrono::microseconds calculate_view_update_throttling_delay(db::view::update_backlog backlog,
-                                                                 db::timeout_clock::time_point timeout,
-                                                                 uint32_t view_flow_control_delay_limit_in_ms) {
+// View updates are asynchronous, and because of this limiting their concurrency requires
+// a special approach. The current algorithm places all of the pending view updates in the backlog
+// and artificially slows down new responses to coordinator requests based on how full the backlog is.
+// This function calculates how much a request should be slowed down based on the backlog's fullness.
+// The equation is basically: delay = view_fullness_ratio^3 * view_flow_control_delay_limit_in_ms
+// The more full the backlog gets the more aggressively the requests are slowed down.
+// The delay is limited to the amount of time left until timeout.
+// After the timeout the request fails, so there's no point in waiting longer than that.
+// The second argument defines this timeout point - we can't delay the request more than this time point.
+// See: https://www.scylladb.com/2018/12/04/worry-free-ingestion-flow-control/
+std::chrono::microseconds node_update_backlog::calculate_throttling_delay(update_backlog backlog,
+                                                                         db::timeout_clock::time_point timeout) const {
    auto adjust = [] (float x) { return x * x * x; };
-    auto budget = std::max(service::storage_proxy::clock_type::duration(0),
-        timeout - service::storage_proxy::clock_type::now());
-    std::chrono::microseconds ret(uint32_t(adjust(backlog.relative_size()) * view_flow_control_delay_limit_in_ms * 1000));
+    auto budget = std::max(db::timeout_clock::duration(0),
+        timeout - db::timeout_clock::now());
+    std::chrono::microseconds ret(uint32_t(adjust(backlog.relative_size()) * _view_flow_control_delay_limit_in_ms() * 1000));
    // "budget" has millisecond resolution and can potentially be long
    // in the future so converting it to microseconds may overflow.
    // So to compare buget and ret we need to convert both to the lower
    // resolution.
-    if (std::chrono::duration_cast<service::storage_proxy::clock_type::duration>(ret) < budget) {
+    if (std::chrono::duration_cast<db::timeout_clock::duration>(ret) < budget) {
        return ret;
    } else {
        // budget is small (< ret) so can be converted to microseconds
--- a/db/view/view_building_worker.cc
+++ b/db/view/view_building_worker.cc
@@ -715,7 +715,7 @@ future<> view_building_worker::do_build_range(table_id base_id, std::vector<tabl
            vbw_logger.info("Building range {} for base table {} and views {} was aborted.", range, base_id, views_ids);
        } catch (...) {
            eptr = std::current_exception();
-            vbw_logger.warn("Error during processing range {} for base table {} and views {}: ", range, base_id, views_ids, eptr);
+            vbw_logger.warn("Error during processing range {} for base table {} and views {}: {}", range, base_id, views_ids, eptr);
        }
        reader.close().get();

--- a/db/view/view_update_backlog.hh
+++ b/db/view/view_update_backlog.hh
@@ -43,7 +43,7 @@ public:
    // Returns the number of bytes in the backlog divided by the maximum number of bytes
    // that the backlog can hold before employing admission control. While the backlog
    // is below the threshold, the coordinator will slow down the view updates up to
-    // calculate_view_update_throttling_delay()::delay_limit_us. Above the threshold,
+    // the limit applied by node_update_backlog::calculate_throttling_delay(). Above the threshold,
    // the coordinator will reject the writes that would increase the backlog. On the
    // replica, the writes will start failing only after reaching the hard limit '_max'.
    float relative_size() const {
@@ -70,18 +70,4 @@ public:
    }
 };

-// View updates are asynchronous, and because of this limiting their concurrency requires
-// a special approach. The current algorithm places all of the pending view updates in the backlog
-// and artificially slows down new responses to coordinator requests based on how full the backlog is.
-// This function calculates how much a request should be slowed down based on the backlog's fullness.
-// The equation is basically: delay(in seconds) = view_fullness_ratio^3
-// The more full the backlog gets the more aggressively the requests are slowed down.
-// The delay is limited to the amount of time left until timeout.
-// After the timeout the request fails, so there's no point in waiting longer than that.
-// The second argument defines this timeout point - we can't delay the request more than this time point.
-// See: https://www.scylladb.com/2018/12/04/worry-free-ingestion-flow-control/
-std::chrono::microseconds calculate_view_update_throttling_delay(
-    update_backlog backlog,
-    db::timeout_clock::time_point timeout,
-    uint32_t view_flow_control_delay_limit_in_ms);
 }
--- a/db/view/view_update_generator.cc
+++ b/db/view/view_update_generator.cc
@@ -7,6 +7,7 @@
 */

 #include "db/view/view_update_backlog.hh"
+#include "db/view/node_view_update_backlog.hh"
 #include <seastar/core/timed_out_error.hh>
 #include "gms/inet_address.hh"
 #include <seastar/util/defer.hh>
@@ -95,9 +96,10 @@ public:
    }
 };

-view_update_generator::view_update_generator(replica::database& db, sharded<service::storage_proxy>& proxy, abort_source& as)
+view_update_generator::view_update_generator(replica::database& db, sharded<service::storage_proxy>& proxy, node_update_backlog& node_backlog, abort_source& as)
        : _db(db)
        , _proxy(proxy)
+        , _node_update_backlog(node_backlog)
        , _progress_tracker(std::make_unique<progress_tracker>())
        , _early_abort_subscription(as.subscribe([this] () noexcept { do_abort(); }))
 {
@@ -112,7 +114,7 @@ future<> view_update_generator::start() {
    _started = seastar::async([this]() mutable {
        auto drop_sstable_references = defer([&] () noexcept {
            // Clear sstable references so sstables_manager::stop() doesn't hang.
-            vug_logger.info("leaving {} unstaged sstables unprocessed",
+            vug_logger.info("leaving {} unstaged sstables and {} sstables with tables unprocessed",
                    _sstables_to_move.size(), _sstables_with_tables.size());
            _sstables_to_move.clear();
            _sstables_with_tables.clear();
@@ -240,6 +242,9 @@ future<> view_update_generator::process_staging_sstables(lw_shared_ptr<replica::
            _progress_tracker->on_sstable_registration(sst);
        }

+        utils::get_local_injector().inject("view_update_generator_pause_before_processing",
+                utils::wait_for_message(std::chrono::minutes(5))).get();
+
        // Generate view updates from staging sstables
        auto start_time = db_clock::now();
        auto [result, input_size] = generate_updates_from_staging_sstables(table, sstables);
@@ -495,7 +500,7 @@ future<> view_update_generator::generate_and_propagate_view_updates(const replic
        // the one which limits the number of incoming client requests by delaying the response to the client.
        if (batch_num > 0) {
            update_backlog local_backlog = _db.get_view_update_backlog();
-            std::chrono::microseconds throttle_delay =  calculate_view_update_throttling_delay(local_backlog, timeout, _db.get_config().view_flow_control_delay_limit_in_ms());
+            std::chrono::microseconds throttle_delay =  _node_update_backlog.calculate_throttling_delay(local_backlog, timeout);

            co_await seastar::sleep(throttle_delay);

--- a/db/view/view_update_generator.hh
+++ b/db/view/view_update_generator.hh
@@ -52,6 +52,7 @@ using allow_hints = bool_class<allow_hints_tag>;

 namespace db::view {

+class node_update_backlog;
 class stats;
 struct wait_for_all_updates_tag {};
 using wait_for_all_updates = bool_class<wait_for_all_updates_tag>;
@@ -63,6 +64,7 @@ public:
 private:
    replica::database& _db;
    sharded<service::storage_proxy>& _proxy;
+    node_update_backlog& _node_update_backlog;
    seastar::abort_source _as;
    future<> _started = make_ready_future<>();
    seastar::condition_variable _pending_sstables;
@@ -75,7 +77,7 @@ private:
    optimized_optional<abort_source::subscription> _early_abort_subscription;
    void do_abort() noexcept;
 public:
-    view_update_generator(replica::database& db, sharded<service::storage_proxy>& proxy, abort_source& as);
+    view_update_generator(replica::database& db, sharded<service::storage_proxy>& proxy, node_update_backlog& node_backlog, abort_source& as);
    ~view_update_generator();

    future<> start();
--- a/dist/CMakeLists.txt
+++ b/dist/CMakeLists.txt
@@ -141,4 +141,72 @@ add_dependencies(dist
  dist-python3
  dist-server)

+set(dist_rpm_dir "${CMAKE_BINARY_DIR}/$<CONFIG>/dist/rpm")
+set(dist_deb_dir "${CMAKE_BINARY_DIR}/$<CONFIG>/dist/deb")
+
+# Map system processor to Debian architecture names
+if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+  set(deb_arch "amd64")
+elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
+  set(deb_arch "arm64")
+else()
+  message(FATAL_ERROR "Unsupported architecture: ${CMAKE_SYSTEM_PROCESSOR}")
+endif()
+
+set(rpm_ver "${Scylla_VERSION}-${Scylla_RELEASE}")
+set(deb_ver "${Scylla_VERSION}-${Scylla_RELEASE}-1")
+set(rpm_arch "${CMAKE_SYSTEM_PROCESSOR}")
+
+set(server_rpms_dir "${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG>/redhat/RPMS/${rpm_arch}")
+set(server_rpms
+  "${server_rpms_dir}/${Scylla_PRODUCT}-${rpm_ver}.${rpm_arch}.rpm"
+  "${server_rpms_dir}/${Scylla_PRODUCT}-server-${rpm_ver}.${rpm_arch}.rpm"
+  "${server_rpms_dir}/${Scylla_PRODUCT}-server-debuginfo-${rpm_ver}.${rpm_arch}.rpm"
+  "${server_rpms_dir}/${Scylla_PRODUCT}-conf-${rpm_ver}.${rpm_arch}.rpm"
+  "${server_rpms_dir}/${Scylla_PRODUCT}-kernel-conf-${rpm_ver}.${rpm_arch}.rpm"
+  "${server_rpms_dir}/${Scylla_PRODUCT}-node-exporter-${rpm_ver}.${rpm_arch}.rpm")
+set(cqlsh_rpms
+  "${CMAKE_SOURCE_DIR}/tools/cqlsh/build/redhat/RPMS/${rpm_arch}/${Scylla_PRODUCT}-cqlsh-${rpm_ver}.${rpm_arch}.rpm")
+set(python3_rpms
+  "${CMAKE_SOURCE_DIR}/tools/python3/build/redhat/RPMS/${rpm_arch}/${Scylla_PRODUCT}-python3-${rpm_ver}.${rpm_arch}.rpm")
+
+set(server_debs_dir "${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG>/debian")
+set(server_debs
+  "${server_debs_dir}/${Scylla_PRODUCT}_${deb_ver}_${deb_arch}.deb"
+  "${server_debs_dir}/${Scylla_PRODUCT}-server_${deb_ver}_${deb_arch}.deb"
+  "${server_debs_dir}/${Scylla_PRODUCT}-server-dbg_${deb_ver}_${deb_arch}.deb"
+  "${server_debs_dir}/${Scylla_PRODUCT}-conf_${deb_ver}_${deb_arch}.deb"
+  "${server_debs_dir}/${Scylla_PRODUCT}-kernel-conf_${deb_ver}_${deb_arch}.deb"
+  "${server_debs_dir}/${Scylla_PRODUCT}-node-exporter_${deb_ver}_${deb_arch}.deb"
+  "${server_debs_dir}/scylla-enterprise_${deb_ver}_all.deb"
+  "${server_debs_dir}/scylla-enterprise-server_${deb_ver}_all.deb"
+  "${server_debs_dir}/scylla-enterprise-conf_${deb_ver}_all.deb"
+  "${server_debs_dir}/scylla-enterprise-kernel-conf_${deb_ver}_all.deb"
+  "${server_debs_dir}/scylla-enterprise-node-exporter_${deb_ver}_all.deb")
+set(cqlsh_debs
+  "${CMAKE_SOURCE_DIR}/tools/cqlsh/build/debian/${Scylla_PRODUCT}-cqlsh_${deb_ver}_${deb_arch}.deb"
+  "${CMAKE_SOURCE_DIR}/tools/cqlsh/build/debian/scylla-enterprise-cqlsh_${deb_ver}_all.deb")
+set(python3_debs
+  "${CMAKE_SOURCE_DIR}/tools/python3/build/debian/${Scylla_PRODUCT}-python3_${deb_ver}_${deb_arch}.deb"
+  "${CMAKE_SOURCE_DIR}/tools/python3/build/debian/scylla-enterprise-python3_${deb_ver}_all.deb")
+
+add_custom_target(collect-dist-rpm
+  COMMAND ${CMAKE_COMMAND} -E rm -rf ${dist_rpm_dir}
+  COMMAND ${CMAKE_COMMAND} -E make_directory ${dist_rpm_dir}
+  COMMAND ${CMAKE_COMMAND} -E copy ${server_rpms} ${cqlsh_rpms} ${python3_rpms} ${dist_rpm_dir}/
+  DEPENDS dist
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+  COMMENT "Collecting RPMs into ${dist_rpm_dir}")
+
+add_custom_target(collect-dist-deb
+  COMMAND ${CMAKE_COMMAND} -E rm -rf ${dist_deb_dir}
+  COMMAND ${CMAKE_COMMAND} -E make_directory ${dist_deb_dir}
+  COMMAND ${CMAKE_COMMAND} -E copy ${server_debs} ${cqlsh_debs} ${python3_debs} ${dist_deb_dir}/
+  DEPENDS dist
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+  COMMENT "Collecting DEBs into ${dist_deb_dir}")
+
+add_custom_target(collect-dist
+  DEPENDS collect-dist-rpm collect-dist-deb)
+
 add_subdirectory(debuginfo)
--- a/docs/alternator/compatibility.md
+++ b/docs/alternator/compatibility.md
@@ -324,6 +324,13 @@ experimental:
    stream events. Without this option, such no-op operations may still
    generate spurious stream events.
    <https://github.com/scylladb/scylladb/issues/28368>
+  * When a stream is disabled, no new records are written but the existing
+    stream data is preserved and remains readable through its original
+    StreamArn. The data expires via TTL after 24 hours. Re-enabling the
+    stream purges the old data immediately and produces a new StreamArn.
+    In contrast, DynamoDB keeps the old stream and its data readable for
+    24 hours through the old StreamArn even after re-enabling.
+    <https://scylladb.atlassian.net/browse/SCYLLADB-1873>

 ## Unimplemented API features

--- a/docs/cql/ddl.rst
+++ b/docs/cql/ddl.rst
@@ -415,7 +415,7 @@ An empty list is allowed, and it's equivalent to numeric replication factor of 0
 .. code-block:: cql

  ALTER KEYSPACE Excelsior
-   WITH replication = { 'class' : 'NetworkTopologyStrategy', dc2' : []};
+   WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc2' : []};


 Altering from a rack list to a numeric replication factor is not supported.
@@ -1017,11 +1017,11 @@ For example:

    CREATE TABLE customer_data (
        cust_id uuid,
-        cust_first-name text,
-        cust_last-name text,
+        "cust_first-name" text,
+        "cust_last-name" text,
        cust_phone text,
-        cust_get-sms text,
-        PRIMARY KEY (customer_id)
+        "cust_get-sms" text,
+        PRIMARY KEY (cust_id)
    ) WITH cdc = { 'enabled' : 'true', 'preimage' : 'true' };

 .. _cql-caching-options:
--- a/docs/cql/dml/insert.rst
+++ b/docs/cql/dml/insert.rst
@@ -24,7 +24,8 @@ For example:

    INSERT INTO NerdMovies (movie, director, main_actor, year)
          VALUES ('Serenity', 'Joss Whedon', 'Nathan Fillion', 2005)
-          USING TTL 86400 IF NOT EXISTS;
+          IF NOT EXISTS
+          USING TTL 86400;

 The ``INSERT`` statement writes one or more columns for a given row in a table. Note that since a row is identified by
 its ``PRIMARY KEY``, at least the columns composing it must be specified. The list of columns to insert to must be
--- a/docs/cql/types.rst
+++ b/docs/cql/types.rst
@@ -507,7 +507,7 @@ For example::

  CREATE TABLE superheroes (
       name frozen<full_name> PRIMARY KEY,
-       home address
+       home frozen<address>
  );

 .. note::
--- a/docs/dev/object_storage.md
+++ b/docs/dev/object_storage.md
@@ -271,7 +271,7 @@ The json structure is as follows:
 }

 The `manifest` member contains the following attributes:
- `version` - respresenting the version of the manifest itself. It is incremented when members are added or removed from the manifest.
+- `version` - representing the version of the manifest itself. It is incremented when members are added or removed from the manifest.
 - `scope` - the scope of metadata stored in this manifest file.  The following scopes are supported:
    - `node` - the manifest describes all SSTables owned by this node in this snapshot.

--- a/docs/dev/system_schema_keyspace.md
+++ b/docs/dev/system_schema_keyspace.md
@@ -12,7 +12,9 @@ Schema:
 CREATE TABLE system_schema.keyspaces (
    keyspace_name text PRIMARY KEY,
    durable_writes boolean,
-    replication frozen<map<text, text>>
+    replication frozen<map<text, text>>,
+    replication_v2 frozen<map<text, text>>,
+    next_replication frozen<map<text, text>>
 )
 ```

@@ -31,6 +33,8 @@ Columns:
   stored as a flattened map of the extended options map (see below).

   For `SimpleStrategy` there is a single option `"replication_factor"` specifying the replication factor.
+* `next_replication` - the target replication factor for the keyspace during a replication factor (RF) change.
+   If there is no ongoing RF change, the `next_replication` value is not set.

 Extended options map used by NetworkTopologyStrategy is a map where values can be either strings or lists of strings.

--- a/docs/operating-scylla/admin.rst
+++ b/docs/operating-scylla/admin.rst
@@ -146,6 +146,25 @@ AWS Security Token Service (STS) or the EC2 Instance Metadata Service.
   - When set, these values are used by the S3 client to sign requests.
   - If not set, requests are sent unsigned, which may not be accepted by all servers.

+.. _admin-oci-object-storage:
+
+Using Oracle OCI Object Storage
+=================================
+
+Oracle Cloud Infrastructure (OCI) Object Storage is compatible with the Amazon
+S3 API, so it works with ScyllaDB without any OCI-specific configuration.
+
+To use OCI Object Storage, follow the same configuration as for AWS S3, and
+specify your OCI S3-compatible endpoint.
+
+Example:
+
+.. code:: yaml
+
+   object_storage_endpoints:
+     - name: https://idedxcgnkfkt.compat.objectstorage.us-ashburn-1.oci.customer-oci.com:443
+       aws_region: us-ashburn-1
+
 .. _admin-compression:

 Compression
--- a/docs/operating-scylla/procedures/cluster-management/add-dc-to-existing-dc.rst
+++ b/docs/operating-scylla/procedures/cluster-management/add-dc-to-existing-dc.rst
@@ -231,6 +231,46 @@ Add New DC

         Consider :ref:`upgrading rf_rack_valid_keyspaces option to enforce_rack_list option <keyspace-rf-rack-valid-to-enforce-rack-list>` to ensure all tablet keyspaces use rack lists.

+   If the keyspace uses rack list replication, update the replication factor in one ``ALTER KEYSPACE`` statement, under the following rules:
+      * Existing datacenters must keep their current replication factor.
+      * A new datacenter can be assigned a replication factor (**0 to N**).
+      * An existing datacenter can be removed (**N to 0**).
+
+   .. warning::
+
+      While adding a new datacenter and altering keyspaces, do **not** perform any reads or writes that involve the new datacenter.
+      In particular, avoid using global consistency levels (such as ``ALL``, ``EACH_QUORUM``) that would include the new datacenter in the operation.
+      Use ``LOCAL_*`` consistency levels (e.g., ``LOCAL_QUORUM``, ``LOCAL_ONE``) until the new datacenter is fully operational.
+
+   Before
+
+   .. code-block:: cql
+
+      DESCRIBE KEYSPACE mykeyspace4;
+
+      CREATE KEYSPACE mykeyspace4 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>']} AND tablets = { 'enabled': true };
+
+   The following is **not** allowed because it changes the replication factor of ``<existing_dc>`` (adds ``<existing_rack4>``) and adds ``<new_dc>`` in the same statement:
+
+   .. code-block:: cql
+
+      ALTER KEYSPACE mykeyspace4 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>', '<existing_rack4>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>', '<new_rack3>']} AND tablets = { 'enabled': true };
+
+   Add all the nodes to the new datacenter and then:
+
+   .. code-block:: cql
+
+      ALTER KEYSPACE mykeyspace4 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>', '<new_rack3>']} AND tablets = { 'enabled': true };
+
+   After
+
+   .. code-block:: cql
+
+      DESCRIBE KEYSPACE mykeyspace4;
+      CREATE KEYSPACE mykeyspace4 WITH REPLICATION = {'class': 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>', '<new_rack3>']} AND tablets = { 'enabled': true };
+
+   You can abort the keyspace alteration using :doc:`Task manager </operating-scylla/admin-tools/task-manager>`.
+
 #. If any vnode keyspace was altered, run ``nodetool rebuild`` on each node in the new datacenter, specifying the existing datacenter name in the rebuild command.

   For example:
--- a/docs/operating-scylla/procedures/cluster-management/decommissioning-data-center.rst
+++ b/docs/operating-scylla/procedures/cluster-management/decommissioning-data-center.rst
@@ -102,6 +102,34 @@ Procedure

         Consider :ref:`upgrading rf_rack_valid_keyspaces option to enforce_rack_list option <keyspace-rf-rack-valid-to-enforce-rack-list>` to ensure all tablet keyspaces use rack lists.

+   If the keyspace uses rack list replication, update the replication factor in one ``ALTER KEYSPACE`` statement, under the following rules:
+      * Existing datacenters must keep their current replication factor.
+      * An existing datacenter can be removed (**N to 0**).
+      * A new datacenter can be assigned a replication factor (**0 to N**).
+
+   .. warning::
+
+      While removing a datacenter and altering keyspaces, do **not** perform any reads or writes that involve the datacenter being removed.
+      In particular, avoid using global consistency levels (such as ``ALL``, ``EACH_QUORUM``) that would include the decommissioned datacenter in the operation.
+      Use ``LOCAL_*`` consistency levels (e.g., ``LOCAL_QUORUM``, ``LOCAL_ONE``) until the datacenter is fully decommissioned.
+
+   .. code-block:: shell
+
+      cqlsh> DESCRIBE nba4
+      cqlsh> CREATE KEYSPACE nba4 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : ['RAC4', 'RAC5'], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets = { 'enabled': true };
+
+   The following is **not** allowed because it changes the replication factor of ``EUROPE-DC`` (adds ``RAC9``) and removes ``ASIA-DC`` in the same statement:
+
+   .. code-block:: shell
+
+      cqlsh> ALTER KEYSPACE nba4 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : [], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8', 'RAC9']} AND tablets = { 'enabled': true };
+
+   Remove all replicas from the decommissioned datacenter:
+
+   .. code-block:: shell
+
+      cqlsh> ALTER KEYSPACE nba4 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : [], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets = { 'enabled': true };
+
   .. note::

      If table audit is enabled, the ``audit`` keyspace is automatically created with ``NetworkTopologyStrategy``.
@@ -113,6 +141,10 @@ Procedure

      Failure to do so will result in decommission errors such as "zero replica after the removal".

+   .. warning::
+
+         Removal of replicas from a datacenter cannot be aborted. To restore the previous replication settings, wait until the ``ALTER KEYSPACE`` statement finishes, and then add the replicas back by running another ``ALTER KEYSPACE`` statement.
+
 #. Run :doc:`nodetool decommission </operating-scylla/nodetool-commands/decommission>` on every node in the data center that is to be removed.
   Refer to :doc:`Remove a Node from a ScyllaDB Cluster - Down Scale </operating-scylla/procedures/cluster-management/remove-node>` for further information.

--- a/docs/upgrade/upgrade-guides/index.rst
+++ b/docs/upgrade/upgrade-guides/index.rst
@@ -4,7 +4,7 @@ Upgrade ScyllaDB

 .. toctree::
   
-   ScyllaDB 2025.x to ScyllaDB 2026.1 <upgrade-guide-from-2025.x-to-2026.1/index>
+   ScyllaDB 2026.1 to ScyllaDB 2026.2 <upgrade-guide-from-2026.1-to-2026.2/index>
   ScyllaDB 2026.x Patch Upgrades <upgrade-guide-from-2026.x.y-to-2026.x.z>
   ScyllaDB Image <ami-upgrade>

--- a/docs/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2026.1/index.rst
+++ b/docs/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2026.1/index.rst
@@ -1,13 +0,0 @@
-==========================================================
-Upgrade - ScyllaDB 2025.x to ScyllaDB 2026.1
-==========================================================
-
-.. toctree::
-   :maxdepth: 2
-   :hidden:
-
-   Upgrade ScyllaDB <upgrade-guide-from-2025.x-to-2026.1>
-   Metrics Update <metric-update-2025.x-to-2026.1>
-
-* :doc:`Upgrade from ScyllaDB 2025.x to ScyllaDB 2026.1 <upgrade-guide-from-2025.x-to-2026.1>`
-* :doc:`Metrics Update Between 2025.x and 2026.1 <metric-update-2025.x-to-2026.1>`
--- a/docs/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2026.1/metric-update-2025.x-to-2026.1.rst
+++ b/docs/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2026.1/metric-update-2025.x-to-2026.1.rst
@@ -1,82 +0,0 @@
-.. |SRC_VERSION| replace:: 2025.x
-.. |NEW_VERSION| replace:: 2026.1
-.. |PRECEDING_VERSION| replace:: 2025.4
-
-================================================================
-Metrics Update Between |SRC_VERSION| and |NEW_VERSION|
-================================================================
-
-.. toctree::
-   :maxdepth: 2
-   :hidden:
-
-ScyllaDB |NEW_VERSION| Dashboards are available as part of the latest |mon_root|.
-
-
-New Metrics in |NEW_VERSION|
--------------------------------------
-
-The following metrics are new in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.
-
-.. list-table::
-   :widths: 25 150
-   :header-rows: 1
-
-   * - Metric
-     - Description
-   * - scylla_alternator_operation_size_kb
-     - Histogram of item sizes involved in a request.
-   * - scylla_column_family_total_disk_space_before_compression
-     - Hypothetical total disk space used if data files weren't compressed
-   * - scylla_group_name_auto_repair_enabled_nr
-     - Number of tablets with auto repair enabled.
-   * - scylla_group_name_auto_repair_needs_repair_nr
-     - Number of tablets with auto repair enabled that currently need repair.
-   * - scylla_lsa_compact_time_ms
-     - Total time spent on segment compaction that was not accounted under ``reclaim_time_ms``.
-   * - scylla_lsa_evict_time_ms
-     - Total time spent on evicting objects that was not accounted under ``reclaim_time_ms``,
-   * - scylla_lsa_reclaim_time_ms
-     - Total time spent in reclaiming LSA memory back to std allocator.
-   * - scylla_object_storage_memory_usage
-     - Total number of bytes consumed by the object storage client.
-   * - scylla_tablet_ops_failed
-     - Number of failed tablet auto repair attempts.
-   * - scylla_tablet_ops_succeeded
-     - Number of successful tablet auto repair attempts.
-   
-Renamed Metrics in |NEW_VERSION|
--------------------------------------
-
-The following metrics are renamed in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.
-
-.. list-table::
-   :widths: 25 150
-   :header-rows: 1
-
-   * - Metric Name in |PRECEDING_VERSION|
-     - Metric Name in |NEW_VERSION|
-   * - scylla_s3_memory_usage
-     - scylla_object_storage_memory_usage
-
-Removed Metrics in |NEW_VERSION|
--------------------------------------
-
-The following metrics are removed in ScyllaDB |NEW_VERSION|.
-
-* scylla_redis_current_connections
-* scylla_redis_op_latency
-* scylla_redis_operation
-* scylla_redis_operation
-* scylla_redis_requests_latency
-* scylla_redis_requests_served
-* scylla_redis_requests_serving
-
-New and Updated Metrics in Previous Releases
-------------------------------------------------------
-
-* `Metrics Update Between 2025.3 and 2025.4 <https://docs.scylladb.com/manual/branch-2025.4/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2025.4/metric-update-2025.x-to-2025.4.html>`_
-* `Metrics Update Between 2025.2 and 2025.3 <https://docs.scylladb.com/manual/branch-2025.3/upgrade/upgrade-guides/upgrade-guide-from-2025.2-to-2025.3/metric-update-2025.2-to-2025.3.html>`_
-* `Metrics Update Between 2025.1 and 2025.2 <https://docs.scylladb.com/manual/branch-2025.2/upgrade/upgrade-guides/upgrade-guide-from-2025.1-to-2025.2/metric-update-2025.1-to-2025.2.html>`_
-
-
--- a/docs/upgrade/upgrade-guides/upgrade-guide-from-2026.1-to-2026.2/index.rst
+++ b/docs/upgrade/upgrade-guides/upgrade-guide-from-2026.1-to-2026.2/index.rst
@@ -0,0 +1,13 @@
+==========================================================
+Upgrade - ScyllaDB 2026.1 to ScyllaDB 2026.2
+==========================================================
+
+.. toctree::
+   :maxdepth: 2
+   :hidden:
+
+   Upgrade ScyllaDB <upgrade-guide-from-2026.1-to-2026.2>
+   Metrics Update <metric-update-2026.1-to-2026.2>
+
+* :doc:`Upgrade from ScyllaDB 2026.1 to ScyllaDB 2026.2 <upgrade-guide-from-2026.1-to-2026.2>`
+* :doc:`Metrics Update Between 2026.1 and 2026.2 <metric-update-2026.1-to-2026.2>`
--- a/docs/upgrade/upgrade-guides/upgrade-guide-from-2026.1-to-2026.2/metric-update-2026.1-to-2026.2.rst
+++ b/docs/upgrade/upgrade-guides/upgrade-guide-from-2026.1-to-2026.2/metric-update-2026.1-to-2026.2.rst
@@ -0,0 +1,126 @@
+.. |SRC_VERSION| replace:: 2026.1
+.. |NEW_VERSION| replace:: 2026.2
+.. |PRECEDING_VERSION| replace:: 2026.1
+
+================================================================
+Metrics Update Between |SRC_VERSION| and |NEW_VERSION|
+================================================================
+
+.. toctree::
+   :maxdepth: 2
+   :hidden:
+
+ScyllaDB |NEW_VERSION| Dashboards are available as part of the latest |mon_root|.
+
+
+New Metrics in |NEW_VERSION|
+--------------------------------------
+
+The following metrics are new in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.
+
+.. list-table::
+   :widths: 25 150
+   :header-rows: 1
+
+   * - Metric
+     - Description
+   * - scylla_auth_cache_permissions
+     - Total number of permission sets currently cached across all roles.
+   * - scylla_auth_cache_roles
+     - Number of roles currently cached.
+   * - scylla_cql_forwarded_requests
+     - Counts the total number of attempts to forward CQL requests to other nodes.
+       One request may be forwarded multiple times, particularly when a write is
+       handled by a non-replica node.
+   * - scylla_cql_write_consistency_levels_disallowed_violations
+     - Counts the number of write_consistency_levels_disallowed guardrail violations,
+       i.e. attempts to write with a forbidden consistency level.
+   * - scylla_cql_write_consistency_levels_warned_violations
+     - Counts the number of write_consistency_levels_warned guardrail violations,
+       i.e. attempts to write with a discouraged consistency level.
+   * - scylla_cql_writes_per_consistency_level
+     - Counts the number of writes for each consistency level.
+   * - scylla_io_queue_integrated_disk_queue_length
+     - Length of the integrated disk queue.
+   * - scylla_io_queue_integrated_queue_length
+     - Length of the integrated queue.
+   * - scylla_logstor_sm_bytes_freed
+     - Counts the number of data bytes freed.
+   * - scylla_logstor_sm_bytes_read
+     - Counts the number of bytes read from the disk.
+   * - scylla_logstor_sm_bytes_written
+     - Counts the number of bytes written to the disk.
+   * - scylla_logstor_sm_compaction_bytes_written
+     - Counts the number of bytes written to the disk by compaction.
+   * - scylla_logstor_sm_compaction_data_bytes_written
+     - Counts the number of data bytes written to the disk by compaction.
+   * - scylla_logstor_sm_compaction_records_rewritten
+     - Counts the number of records rewritten during compaction.
+   * - scylla_logstor_sm_compaction_records_skipped
+     - Counts the number of records skipped during compaction.
+   * - scylla_logstor_sm_compaction_segments_freed
+     - Counts the number of segments freed by compaction.
+   * - scylla_logstor_sm_disk_usage
+     - Total disk usage.
+   * - scylla_logstor_sm_free_segments
+     - Counts the number of free segments currently available.
+   * - scylla_logstor_sm_segment_pool_compaction_segments_get
+     - Counts the number of segments taken from the segment pool for compaction.
+   * - scylla_logstor_sm_segment_pool_normal_segments_get
+     - Counts the number of segments taken from the segment pool for normal writes.
+   * - scylla_logstor_sm_segment_pool_normal_segments_wait
+     - Counts the number of times normal writes had to wait for a segment to become
+       available in the segment pool.
+   * - scylla_logstor_sm_segment_pool_segments_put
+     - Counts the number of segments returned to the segment pool.
+   * - scylla_logstor_sm_segment_pool_separator_segments_get
+     - Counts the number of segments taken from the segment pool for separator writes.
+   * - scylla_logstor_sm_segment_pool_size
+     - Counts the number of segments in the segment pool.
+   * - scylla_logstor_sm_segments_allocated
+     - Counts the number of segments allocated.
+   * - scylla_logstor_sm_segments_compacted
+     - Counts the number of segments compacted.
+   * - scylla_logstor_sm_segments_freed
+     - Counts the number of segments freed.
+   * - scylla_logstor_sm_segments_in_use
+     - Counts the number of segments currently in use.
+   * - scylla_logstor_sm_separator_buffer_flushed
+     - Counts the number of times the separator buffer has been flushed.
+   * - scylla_logstor_sm_separator_bytes_written
+     - Counts the number of bytes written to the separator.
+   * - scylla_logstor_sm_separator_data_bytes_written
+     - Counts the number of data bytes written to the separator.
+   * - scylla_logstor_sm_separator_flow_control_delay
+     - Current delay, in microseconds, applied to writes to control separator debt.
+   * - scylla_logstor_sm_separator_segments_freed
+     - Counts the number of segments freed by the separator.
+   * - scylla_transport_cql_pending_response_memory
+     - Holds the total memory in bytes consumed by responses waiting to be sent.
+   * - scylla_transport_cql_request_histogram_bytes
+     - A histogram of received bytes in CQL messages of a specific kind and
+       specific scheduling group.
+   * - scylla_transport_cql_requests_serving
+     - Holds the number of requests that are being processed right now.
+   * - scylla_transport_cql_response_histogram_bytes
+     - A histogram of sent bytes in CQL messages of a specific kind and
+       specific scheduling group.
+   * - scylla_transport_requests_forwarded_failed
+     - Counts the number of requests that were forwarded to another replica
+       but failed to execute there.
+   * - scylla_transport_requests_forwarded_prepared_not_found
+     - Counts the number of requests that were forwarded to another replica
+       but failed there because the statement was not prepared on the target.
+       When this happens, the coordinator performs an additional remote call
+       to prepare the statement on the replica and retries the EXECUTE request
+       afterwards.
+   * - scylla_transport_requests_forwarded_redirected
+     - Counts the number of requests that were forwarded to another replica
+       but that replica responded with a redirect to another node. This can
+       happen when the replica has stale information about the cluster topology or
+       when the request is handled by a node that is not a replica for the data
+       being accessed by the request.
+   * - scylla_transport_requests_forwarded_successfully
+     - Counts the number of requests that were forwarded to another replica
+       and executed successfully there.
+
--- a/docs/upgrade/upgrade-guides/upgrade-guide-from-2026.1-to-2026.2/upgrade-guide-from-2026.1-to-2026.2.rst
+++ b/docs/upgrade/upgrade-guides/upgrade-guide-from-2026.1-to-2026.2/upgrade-guide-from-2026.1-to-2026.2.rst
@@ -1,13 +1,13 @@
 .. |SCYLLA_NAME| replace:: ScyllaDB

-.. |SRC_VERSION| replace:: 2025.x
-.. |NEW_VERSION| replace:: 2026.1
+.. |SRC_VERSION| replace:: 2026.1
+.. |NEW_VERSION| replace:: 2026.2

 .. |ROLLBACK| replace:: rollback
 .. _ROLLBACK: ./#rollback-procedure

-.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - ScyllaDB 2025.x to 2026.1
-.. _SCYLLA_METRICS: ../metric-update-2025.x-to-2026.1
+.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - ScyllaDB 2026.1 to 2026.2
+.. _SCYLLA_METRICS: ../metric-update-2026.1-to-2026.2

 =======================================================================================
 Upgrade from |SCYLLA_NAME| |SRC_VERSION| to |SCYLLA_NAME| |NEW_VERSION|
--- a/ent/encryption/kmip_host.cc
+++ b/ent/encryption/kmip_host.cc
@@ -598,7 +598,7 @@ future<int> kmip_host::impl::do_cmd(KMIP_CMD* cmd, con_ptr cp, Func& f, bool ret

 template<typename Func>
 future<kmip_host::impl::kmip_cmd> kmip_host::impl::do_cmd(kmip_cmd cmd_in, Func && f) {
-    kmip_log.trace("{}: begin do_cmd", *this, cmd_in);
+    kmip_log.trace("{}: begin do_cmd {}", *this, cmd_in);
    KMIP_CMD* cmd = cmd_in;

    // #998 Need to do retry loop, because we can have either timed out connection,
--- a/ent/encryption/kms_host.cc
+++ b/ent/encryption/kms_host.cc
@@ -616,7 +616,7 @@ future<rjson::value> encryption::kms_host::impl::do_post(std::string_view target
            static auto get_xml_node = [](node_type* node, const char* what) {
                auto res = node->first_node(what);
                if (!res) {
-                    throw malformed_response_error(fmt::format("XML parse error", what));
+                    throw malformed_response_error(fmt::format("XML parse error: {}", what));
                }
                return res;
            };
--- a/gms/feature_service.cc
+++ b/gms/feature_service.cc
@@ -7,6 +7,7 @@
 #include <seastar/core/sstring.hh>
 #include <seastar/core/seastar.hh>
 #include <seastar/core/smp.hh>
+#include "db/schema_features.hh"
 #include "utils/log.hh"
 #include "gms/feature.hh"
 #include "gms/feature_service.hh"
@@ -108,6 +109,7 @@ std::set<std::string_view> feature_service::supported_feature_set() const {
        "UUID_SSTABLE_IDENTIFIERS"sv,
        "GROUP0_SCHEMA_VERSIONING"sv,
        "VIEW_BUILD_STATUS_ON_GROUP0"sv,
+        "CDC_GENERATIONS_V2"sv,
    };

    if (is_test_only_feature_deprecated()) {
@@ -179,6 +181,7 @@ db::schema_features feature_service::cluster_schema_features() const {
    f.set<db::schema_feature::GROUP0_SCHEMA_VERSIONING>();
    f.set_if<db::schema_feature::IN_MEMORY_TABLES>(bool(in_memory_tables));
    f.set_if<db::schema_feature::TABLET_OPTIONS>(bool(tablet_options));
+    f.set_if<db::schema_feature::KEYSPACE_MULTI_RF_CHANGE>(bool(keyspace_multi_rf_change));
    return f;
 }

--- a/gms/feature_service.hh
+++ b/gms/feature_service.hh
@@ -83,7 +83,6 @@ public:
    gms::feature alternator_ttl { *this, "ALTERNATOR_TTL"sv };
    gms::feature cql_row_ttl { *this, "CQL_ROW_TTL"sv };
    gms::feature range_scan_data_variant { *this, "RANGE_SCAN_DATA_VARIANT"sv };
-    gms::feature cdc_generations_v2 { *this, "CDC_GENERATIONS_V2"sv };
    gms::feature user_defined_aggregates { *this, "UDA"sv };
    // Historically max_result_size contained only two fields: soft_limit and
    // hard_limit. It was somehow obscure because for normal paged queries both
@@ -182,6 +181,7 @@ public:
    gms::feature writetime_ttl_individual_element { *this, "WRITETIME_TTL_INDIVIDUAL_ELEMENT"sv };
    gms::feature arbitrary_tablet_boundaries { *this, "ARBITRARY_TABLET_BOUNDARIES"sv };
    gms::feature large_data_virtual_tables { *this, "LARGE_DATA_VIRTUAL_TABLES"sv };
+    gms::feature keyspace_multi_rf_change { *this, "KEYSPACE_MULTI_RF_CHANGE"sv };
 public:

    const std::unordered_map<sstring, std::reference_wrapper<feature>>& registered_features() const;
--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -399,9 +399,10 @@ future<> gossiper::do_send_ack2_msg(locator::host_id from, utils::chunked_vector
        }
    }
    gms::gossip_digest_ack2 ack2_msg(std::move(delta_ep_state_map));
-    logger.debug("Calling do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg);
+    auto ack2_msg_str = fmt::format("{}", ack2_msg);
+    logger.debug("Calling do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg_str);
    co_await ser::gossip_rpc_verbs::send_gossip_digest_ack2(&_messaging, from, std::move(ack2_msg));
-    logger.debug("finished do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg);
+    logger.debug("finished do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg_str);
 }

 // Depends on
@@ -964,8 +965,7 @@ future<> gossiper::failure_detector_loop_for_node(locator::host_id host_id, gene
        diff = now - last;
        if (!failed) {
            last = now;
-        }
-        if (diff > max_duration) {
+        } else if (diff > max_duration) {
            logger.info("failure_detector_loop: Mark node {}/{} as DOWN", host_id, node);
            co_await container().invoke_on(0, [host_id] (gms::gossiper& g) {
                return g.convict(host_id);
--- a/init.cc
+++ b/init.cc
@@ -87,9 +87,6 @@ std::set<sstring> get_disabled_features_from_db_config(const db::config& cfg, st
        }
    }

-    if (!cfg.check_experimental(db::experimental_features_t::feature::ALTERNATOR_STREAMS)) {
-        disabled.insert("ALTERNATOR_STREAMS"s);
-    }
    if (!cfg.check_experimental(db::experimental_features_t::feature::KEYSPACE_STORAGE_OPTIONS)) {
        disabled.insert("KEYSPACE_STORAGE_OPTIONS"s);
    }
--- a/locator/load_sketch.hh
+++ b/locator/load_sketch.hh
@@ -381,6 +381,10 @@ public:
        return _nodes.at(node)._du.capacity;
    }

+    bool has_node(host_id node) const {
+        return _nodes.contains(node);
+    }
+
    shard_id get_shard_count(host_id node) const {
        if (!_nodes.contains(node)) {
            return 0;
--- a/locator/tablets.hh
+++ b/locator/tablets.hh
@@ -153,19 +153,27 @@ struct hash<locator::range_based_tablet_id> {

 namespace locator {

-/// Creates a new replica set with old_replica replaced by new_replica.
-/// If there is no old_replica, the set is returned unchanged.
+/// Returns a copy of the replica set with the following modifications:
+/// - If both old_replica and new_replica are set, old_replica is substituted
+///   with new_replica. If old_replica is not found in rs, the set is returned as-is.
+/// - If only old_replica is set, it is removed from the result.
+/// - If only new_replica is set, it is appended to the result.
 inline
-tablet_replica_set replace_replica(const tablet_replica_set& rs, tablet_replica old_replica, tablet_replica new_replica) {
+tablet_replica_set replace_replica(const tablet_replica_set& rs, std::optional<tablet_replica> old_replica, std::optional<tablet_replica> new_replica) {
    tablet_replica_set result;
    result.reserve(rs.size());
    for (auto&& r : rs) {
-        if (r == old_replica) {
-            result.push_back(new_replica);
+        if (old_replica.has_value() && r == old_replica.value()) {
+            if (new_replica.has_value()) {
+                result.push_back(new_replica.value());
+            }
        } else {
            result.push_back(r);
        }
    }
+    if (!old_replica.has_value() && new_replica.has_value()) {
+        result.push_back(new_replica.value());
+    }
    return result;
 }

@@ -383,8 +391,8 @@ bool is_post_cleanup(tablet_replica replica, const tablet_info& tinfo, const tab
 struct tablet_migration_info {
    locator::tablet_transition_kind kind;
    locator::global_tablet_id tablet;
-    locator::tablet_replica src;
-    locator::tablet_replica dst;
+    std::optional<locator::tablet_replica> src;
+    std::optional<locator::tablet_replica> dst;
 };

 class tablet_map;
--- a/main.cc
+++ b/main.cc
@@ -942,7 +942,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl

            auto background_reclaim_scheduling_group = create_scheduling_group("background_reclaim", "bgre", 50).get();

-            // Maintenance supergroup -- the collection of background low-prio activites
+            // Maintenance supergroup -- the collection of background low-prio activities
            auto maintenance_supergroup = create_scheduling_supergroup(200).get();
            auto bandwidth_updater = io_throughput_updater("maintenance supergroup", maintenance_supergroup,
                    cfg->maintenance_io_throughput_mb_per_sec.is_set() ? cfg->maintenance_io_throughput_mb_per_sec : cfg->stream_io_throughput_mb_per_sec);
@@ -1358,6 +1358,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            };
            spcfg.hinted_handoff_enabled = hinted_handoff_enabled;
            spcfg.available_memory = memory::stats().total_memory();
+            spcfg.maintenance_mode = maintenance_mode_enabled{cfg->maintenance_mode()};
            smp_service_group_config storage_proxy_smp_service_group_config;
            // Assuming less than 1kB per queued request, this limits storage_proxy submit_to() queues to 5MB or less
            storage_proxy_smp_service_group_config.max_nonlocal_requests = 5000;
@@ -1366,7 +1367,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            spcfg.write_mv_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get();
            spcfg.hints_write_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get();
            spcfg.write_ack_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get();
-            static db::view::node_update_backlog node_backlog(smp::count, 10ms);
+            static db::view::node_update_backlog node_backlog(smp::count, 10ms, cfg->view_flow_control_delay_limit_in_ms);
            scheduling_group_key_config storage_proxy_stats_cfg =
                    make_scheduling_group_key_config<service::storage_proxy_stats::stats>();
            storage_proxy_stats_cfg.constructor = [plain_constructor = storage_proxy_stats_cfg.constructor] (void* ptr) {
@@ -1810,6 +1811,18 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            utils::get_local_injector().inject("stop_after_starting_migration_manager",
                [] { std::raise(SIGSTOP); });

+            // Audit must be constructed before the maintenance socket so
+            // that on shutdown (reverse destruction order) the audit service
+            // outlives the maintenance socket and in-flight queries can
+            // still reach audit::inspect() safely.
+            checkpoint(stop_signal, "starting audit service");
+            audit::audit::start_audit(*cfg, token_metadata, qp, mm).handle_exception([&] (auto&& e) {
+                startlog.error("audit start failed: {}", e);
+            }).get();
+            auto audit_stop = defer([] {
+                audit::audit::stop_audit().get();
+            });
+
            // XXX: stop_raft has to happen before query_processor and migration_manager
            // is stopped, since some groups keep using the query
            // processor until are stopped inside stop_raft.
@@ -1841,7 +1854,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            });

            checkpoint(stop_signal, "starting view update generator");
-            view_update_generator.start(std::ref(db), std::ref(proxy), std::ref(stop_signal.as_sharded_abort_source())).get();
+            view_update_generator.start(std::ref(db), std::ref(proxy), std::ref(node_backlog), std::ref(stop_signal.as_sharded_abort_source())).get();
            auto stop_view_update_generator = defer_verbose_shutdown("view update generator", [] {
                view_update_generator.stop().get();
            });
@@ -2287,10 +2300,12 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
               ss.local().wait_for_group0_stop().get();
            });

-            // Setup group0 early in case the node is bootstrapped already and the group exists.
-            // Need to do it before allowing incoming messaging service connections since
-            // storage proxy's and migration manager's verbs may access group0.
-            group0_service.setup_group0_if_exist(sys_ks.local(), ss.local(), qp.local(), mm.local()).get();
+            if (!group0_service.maintenance_mode() && sys_ks.local().bootstrap_complete()) {
+                // Setup group0 early in case the node is bootstrapped already and the group exists.
+                // Need to do it before allowing incoming messaging service connections since
+                // storage proxy's and migration manager's verbs may access group0.
+                group0_service.setup_group0_if_exist(sys_ks.local(), ss.local(), qp.local(), mm.local()).get();
+            }

            // The call to setup_group0_if_exists() above guarantees that, if group0 is
            // created and started, the locally persisted group0 state has been applied
@@ -2340,15 +2355,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            }).get();
            stop_signal.ready(false);

-            if (cfg->maintenance_socket() != "ignore") {
-                // Enable role operations now that node joined the cluster
-                maintenance_auth_service.invoke_on_all([](auth::service& svc) {
-                    return auth::ensure_role_operations_are_enabled(svc);
-                }).get();
-
-                start_cql(*cql_maintenance_server_ctl, stop_maintenance_cql, "maintenance native server");
-            }
-
            // At this point, `locator::topology` should be stable, i.e. we should have complete information
            // about the layout of the cluster (= list of nodes along with the racks/DCs).
            startlog.info("Verifying that all of the keyspaces are RF-rack-valid");
@@ -2357,16 +2363,23 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            startlog.info("Verifying that all of the tablet keyspaces use rack list replication factors");
            db.local().check_rack_list_everywhere(cfg->enforce_rack_list());

-            // Start audit service after join_cluster so that the table-based audit backend
-            // can properly create its keyspace and table.
-            checkpoint(stop_signal, "starting audit service");
-            audit::audit::start_audit(*cfg, token_metadata, qp, mm).handle_exception([&] (auto&& e) {
-                startlog.error("audit start failed: {}", e);
-            }).get();
-            auto audit_stop = defer([] {
-                audit::audit::stop_audit().get();
+            // The table-based audit backend needs Raft (via join_cluster)
+            // to create its keyspace and table.
+            checkpoint(stop_signal, "starting audit storage");
+            audit::audit::start_storage(*cfg).get();
+            auto audit_storage_stop = defer([] {
+                audit::audit::stop_storage().get();
            });

+            if (cfg->maintenance_socket() != "ignore") {
+                // Enable role operations now that node joined the cluster
+                maintenance_auth_service.invoke_on_all([](auth::service& svc) {
+                    return auth::ensure_role_operations_are_enabled(svc);
+                }).get();
+
+                start_cql(*cql_maintenance_server_ctl, stop_maintenance_cql, "maintenance native server");
+            }
+
            // Semantic validation of sstable compression parameters from config.
            // Adding here (i.e., after `join_cluster`) to ensure that the
            // required SSTABLE_COMPRESSION_DICTS cluster feature has been negotiated.
--- a/mutation/atomic_cell.hh
+++ b/mutation/atomic_cell.hh
@@ -48,8 +48,8 @@ static void set_field(atomic_cell_value& out, unsigned offset, T val) {
 }

 template <FragmentRange Buffer>
-static void set_value(managed_bytes& b, unsigned value_offset, const Buffer& value) {
-    auto v = managed_bytes_mutable_view(b).substr(value_offset, value.size_bytes());
+static void set_value(atomic_cell_value_mutable_view b, unsigned value_offset, const Buffer& value) {
+    auto v = b.substr(value_offset, value.size_bytes());
    for (auto frag : value) {
        write_fragmented(v, single_fragmented_view(frag));
    }
@@ -141,20 +141,36 @@ public:
        SCYLLA_ASSERT(is_live_and_has_ttl(cell));
        return gc_clock::duration(get_field<int32_t>(cell, ttl_offset));
    }
-    static managed_bytes make_dead(api::timestamp_type timestamp, gc_clock::time_point deletion_time) {
-        managed_bytes b(managed_bytes::initialized_later(), flags_size + timestamp_size + deletion_time_size);
+    static size_t dead_serialized_size() {
+        return flags_size + timestamp_size + deletion_time_size;
+    }
+    static size_t live_serialized_size(size_t value_size) {
+        return flags_size + timestamp_size + value_size;
+    }
+    static size_t live_expiring_serialized_size(size_t value_size) {
+        return flags_size + timestamp_size + expiry_size + ttl_size + value_size;
+    }
+    static void write_dead(atomic_cell_value_mutable_view b, api::timestamp_type timestamp, gc_clock::time_point deletion_time) {
        b[0] = 0;
        set_field(b, timestamp_offset, timestamp);
        set_field(b, deletion_time_offset, static_cast<int64_t>(deletion_time.time_since_epoch().count()));
+    }
+    static managed_bytes make_dead(api::timestamp_type timestamp, gc_clock::time_point deletion_time) {
+        managed_bytes b(managed_bytes::initialized_later(), dead_serialized_size());
+        write_dead(b, timestamp, deletion_time);
        return b;
    }
    template <FragmentRange Buffer>
-    static managed_bytes make_live(api::timestamp_type timestamp, const Buffer& value) {
+    static void write_live(atomic_cell_value_mutable_view b, api::timestamp_type timestamp, const Buffer& value) {
        auto value_offset = flags_size + timestamp_size;
-        managed_bytes b(managed_bytes::initialized_later(), value_offset + value.size_bytes());
        b[0] = LIVE_FLAG;
        set_field(b, timestamp_offset, timestamp);
        set_value(b, value_offset, value);
+    }
+    template <FragmentRange Buffer>
+    static managed_bytes make_live(api::timestamp_type timestamp, const Buffer& value) {
+        managed_bytes b(managed_bytes::initialized_later(), live_serialized_size(value.size_bytes()));
+        write_live(b, timestamp, value);
        return b;
    }
    static managed_bytes make_live_counter_update(api::timestamp_type timestamp, int64_t value) {
@@ -166,14 +182,18 @@ public:
        return b;
    }
    template <FragmentRange Buffer>
-    static managed_bytes make_live(api::timestamp_type timestamp, const Buffer& value, gc_clock::time_point expiry, gc_clock::duration ttl) {
+    static void write_live(atomic_cell_value_mutable_view b, api::timestamp_type timestamp, const Buffer& value, gc_clock::time_point expiry, gc_clock::duration ttl) {
        auto value_offset = flags_size + timestamp_size + expiry_size + ttl_size;
-        managed_bytes b(managed_bytes::initialized_later(), value_offset + value.size_bytes());
        b[0] = EXPIRY_FLAG | LIVE_FLAG;
        set_field(b, timestamp_offset, timestamp);
        set_field(b, expiry_offset, static_cast<int64_t>(expiry.time_since_epoch().count()));
        set_field(b, ttl_offset, static_cast<int32_t>(ttl.count()));
        set_value(b, value_offset, value);
+    }
+    template <FragmentRange Buffer>
+    static managed_bytes make_live(api::timestamp_type timestamp, const Buffer& value, gc_clock::time_point expiry, gc_clock::duration ttl) {
+        managed_bytes b(managed_bytes::initialized_later(), live_expiring_serialized_size(value.size_bytes()));
+        write_live(b, timestamp, value, expiry, ttl);
        return b;
    }
    static managed_bytes make_live_uninitialized(api::timestamp_type timestamp, size_t size) {
--- a/mutation/canonical_mutation.cc
+++ b/mutation/canonical_mutation.cc
@@ -113,10 +113,10 @@ auto fmt::formatter<canonical_mutation>::format(const canonical_mutation& cm, fm
            auto&& entry = _cm.static_column_at(id);
            _os = fmt::format_to(_os, "static column {} {}", bytes_to_text(entry.name()), atomic_cell::printer(*entry.type(), ac));
        }
-        virtual void accept_static_cell(column_id id, collection_mutation_view cmv) override {
+        virtual void accept_static_cell(column_id id, collection_mutation cm) override {
            print_separator();
            auto&& entry = _cm.static_column_at(id);
-            _os = fmt::format_to(_os, "static column {} {}", bytes_to_text(entry.name()), collection_mutation_view::printer(*entry.type(), cmv));
+            _os = fmt::format_to(_os, "static column {} {}", bytes_to_text(entry.name()), collection_mutation_view::printer(*entry.type(), cm));
        }
        virtual stop_iteration accept_row_tombstone(range_tombstone rt) override {
            print_separator();
@@ -137,10 +137,10 @@ auto fmt::formatter<canonical_mutation>::format(const canonical_mutation& cm, fm
            auto&& entry = _cm.regular_column_at(id);
            _os = fmt::format_to(_os, "column {} {}", bytes_to_text(entry.name()), atomic_cell::printer(*entry.type(), ac));
        }
-        virtual void accept_row_cell(column_id id, collection_mutation_view cmv) override {
+        virtual void accept_row_cell(column_id id, collection_mutation cm) override {
            print_separator();
            auto&& entry = _cm.regular_column_at(id);
-            _os = fmt::format_to(_os, "column {} {}", bytes_to_text(entry.name()), collection_mutation_view::printer(*entry.type(), cmv));
+            _os = fmt::format_to(_os, "column {} {}", bytes_to_text(entry.name()), collection_mutation_view::printer(*entry.type(), cm));
        }
        out_t finalize() {
            if (_in_row) {
--- a/mutation/collection_mutation.cc
+++ b/mutation/collection_mutation.cc
@@ -7,12 +7,14 @@
 */

 #include "utils/assert.hh"
+#include "utils/on_internal_error.hh"
 #include "types/collection.hh"
 #include "types/user.hh"
 #include "types/concrete_types.hh"
 #include "mutation/mutation_partition.hh"
 #include "compaction/compaction_garbage_collector.hh"
 #include "combine.hh"
+#include "idl/mutation.dist.impl.hh"

 #include "collection_mutation.hh"

@@ -224,13 +226,26 @@ compact_and_expire_result collection_mutation_description::compact_and_expire(co
    return res;
 }

-template <typename Iterator>
+/// A CollectionMutationAdaptor is a static interface that adapts a collection
+/// element (an iterator value type) to the serialization requirements of
+/// serialize_collection_mutation(). It provides static methods to measure the
+/// serialized sizes and to write the key and value of each element into a buffer.
+template <typename Adaptor, typename Element>
+concept CollectionMutationAdaptor = requires(const Element& e, managed_bytes_mutable_view& out) {
+    { Adaptor::key_size(e) } -> std::convertible_to<size_t>;
+    { Adaptor::value_size(e) } -> std::convertible_to<size_t>;
+    { Adaptor::write_key(e, out) };
+    { Adaptor::write_value(e, out) };
+};
+
+template <typename Adaptor, typename Iterator>
+    requires CollectionMutationAdaptor<Adaptor, std::iter_value_t<Iterator>>
 static collection_mutation serialize_collection_mutation(
        const abstract_type& type,
        const tombstone& tomb,
        std::ranges::subrange<Iterator> cells) {
    auto element_size = [] (size_t c, auto&& e) -> size_t {
-        return c + 8 + e.first.size() + e.second.serialize().size();
+        return c + 8 + Adaptor::key_size(e) + Adaptor::value_size(e);
    };
    auto size = std::ranges::fold_left(cells, (size_t)4, element_size);
    size += 1;
@@ -244,32 +259,112 @@ static collection_mutation serialize_collection_mutation(
        write<int64_t>(out, tomb.timestamp);
        write<int64_t>(out, tomb.deletion_time.time_since_epoch().count());
    }
-    auto writek = [&out] (bytes_view v) {
-        write<int32_t>(out, v.size());
-        write_fragmented(out, single_fragmented_view(v));
+    auto writek = [&out] (auto& kv) {
+        write<int32_t>(out, Adaptor::key_size(kv));
+        Adaptor::write_key(kv, out);
    };
-    auto writev = [&out] (managed_bytes_view v) {
-        write<int32_t>(out, v.size());
-        write_fragmented(out, v);
+    auto writev = [&out] (auto& kv) {
+        write<int32_t>(out, Adaptor::value_size(kv));
+        Adaptor::write_value(kv, out);
    };
    // FIXME: overflow?
    write<int32_t>(out, std::ranges::distance(cells));
    for (auto&& kv : cells) {
-        auto&& k = kv.first;
-        auto&& v = kv.second;
-        writek(k);
-
-        writev(v.serialize());
+        writek(kv);
+        writev(kv);
    }
    return collection_mutation(type, std::move(ret));
 }

+namespace {
+
+/// Models a (key, cell) pair: `first` is a sized, bytes-like key and `second` is an
+/// atomic_cell-like object whose serialize() result converts to managed_bytes_view.
+template <typename Pair>
+concept AtomicCellKV = requires(const Pair& entry) {
+    { entry.first.size() } -> std::convertible_to<size_t>;
+    { entry.second.serialize() } -> std::convertible_to<managed_bytes_view>;
+};
+
+struct atomic_cell_adaptor {
+    static size_t key_size(const AtomicCellKV auto& entry) { return entry.first.size(); }
+    static size_t value_size(const AtomicCellKV auto& entry) { return entry.second.serialize().size(); }
+
+    static void write_key(const AtomicCellKV auto& entry, managed_bytes_mutable_view& out) {
+        write_fragmented(out, single_fragmented_view(entry.first));
+    }
+    static void write_value(const AtomicCellKV auto& entry, managed_bytes_mutable_view& out) {
+        write_fragmented(out, entry.second.serialize());
+    }
+};
+
+}
+
 collection_mutation collection_mutation_description::serialize(const abstract_type& type) const {
-    return serialize_collection_mutation(type, tomb, std::ranges::subrange(cells.begin(), cells.end()));
+    return serialize_collection_mutation<atomic_cell_adaptor>(type, tomb, std::ranges::subrange(cells.begin(), cells.end()));
 }

 collection_mutation collection_mutation_view_description::serialize(const abstract_type& type) const {
-    return serialize_collection_mutation(type, tomb, std::ranges::subrange(cells.begin(), cells.end()));
+    return serialize_collection_mutation<atomic_cell_adaptor>(type, tomb, std::ranges::subrange(cells.begin(), cells.end()));
+}
+
+namespace {
+
+/// Adapts ser::collection_element_view, the element type of an IDL-serialized
+/// collection, to the CollectionMutationAdaptor interface. Counter cells and
+/// unknown variants are rejected through utils::on_internal_error().
+struct serialized_cell_adaptor {
+    static size_t key_size(const ser::collection_element_view& v) { return v.key().view().size_bytes(); }
+
+    /// Serialized size of the element's cell, computed per cell kind with the atomic_cell_type size helpers.
+    static size_t value_size(const ser::collection_element_view& v) {
+        struct collection_cell_visitor {
+            size_t operator()(const ser::live_cell_view& lcv) const { return atomic_cell_type::live_serialized_size(lcv.value().view().size_bytes()); }
+            size_t operator()(const ser::expiring_cell_view& ecv) const { return atomic_cell_type::live_expiring_serialized_size(ecv.c().value().view().size_bytes()); }
+            size_t operator()(const ser::dead_cell_view&) const { return atomic_cell_type::dead_serialized_size(); }
+            size_t operator()(const ser::counter_cell_view&) const { utils::on_internal_error("Trying to deserialize counter cell from collection"); }
+            size_t operator()(const ser::unknown_variant_type&) const { utils::on_internal_error("Trying to deserialize cell in unknown state"); }
+        };
+        return boost::apply_visitor(collection_cell_visitor{}, v.value());
+    }
+
+    static void write_key(const ser::collection_element_view& v, managed_bytes_mutable_view& out) {
+        write_fragmented(out, v.key().view());
+    }
+
+    /// Writes the cell with atomic_cell_type::write_*() and then removes the cell's
+    /// serialized size (the same size value_size() reports) from the front of `out`.
+    static void write_value(const ser::collection_element_view& v, managed_bytes_mutable_view& out) {
+        struct collection_cell_visitor {
+            managed_bytes_mutable_view& out;
+
+            void operator()(const ser::live_cell_view& lcv) const {
+                const auto value = lcv.value().view();
+                atomic_cell_type::write_live(out, lcv.created_at(), value);
+                out.remove_prefix(atomic_cell_type::live_serialized_size(value.size_bytes()));
+            }
+            void operator()(const ser::expiring_cell_view& ecv) const {
+                const auto value = ecv.c().value().view();
+                atomic_cell_type::write_live(out, ecv.c().created_at(), value, ecv.expiry(), ecv.ttl());
+                out.remove_prefix(atomic_cell_type::live_expiring_serialized_size(value.size_bytes()));
+            }
+            void operator()(const ser::dead_cell_view& dcv) const {
+                atomic_cell_type::write_dead(out, dcv.tomb().timestamp(), dcv.tomb().deletion_time());
+                out.remove_prefix(atomic_cell_type::dead_serialized_size());
+            }
+            void operator()(const ser::counter_cell_view&) const { utils::on_internal_error("Trying to deserialize counter cell from collection"); }
+            void operator()(const ser::unknown_variant_type&) const { utils::on_internal_error("Trying to deserialize cell in unknown state"); }
+        };
+        boost::apply_visitor(collection_cell_visitor{out}, v.value());
+    }
+};
+
+}
+
+collection_mutation read_from_collection_cell_view(const abstract_type& type, const ser::collection_cell_view& collection) {
+    auto tomb = collection.tomb();
+    auto cells = collection.elements();
+    return serialize_collection_mutation<serialized_cell_adaptor>(type, tomb, std::ranges::subrange(cells.begin(), cells.end()));
 }

 template <typename C>
--- a/mutation/collection_mutation.hh
+++ b/mutation/collection_mutation.hh
@@ -23,6 +23,10 @@ class row_tombstone;

 class collection_mutation;

+namespace ser {
+class collection_cell_view;
+}
+
 // An auxiliary struct used to (de)construct collection_mutations.
 // Unlike collection_mutation which is a serialized blob, this struct allows to inspect logical units of information
 // (tombstone and cells) inside the mutation easily.
@@ -130,6 +134,12 @@ collection_mutation merge(const abstract_type&, collection_mutation_view, collec

 collection_mutation difference(const abstract_type&, collection_mutation_view, collection_mutation_view);

+// Transcode a collection from the IDL representation directly into the
+// collection_mutation serialization format, without using any intermediary representation.
+// Only the final collection-mutation blob is allocated; no intermediate allocations are needed.
+// Safe to use in LSA: it won't produce garbage.
+collection_mutation read_from_collection_cell_view(const abstract_type&, const ser::collection_cell_view&);
+
 // Serializes the given collection of cells to a sequence of bytes ready to be sent over the CQL protocol.
 bytes_ostream serialize_for_cql(const abstract_type&, collection_mutation_view);

--- a/mutation/frozen_mutation.hh
+++ b/mutation/frozen_mutation.hh
@@ -97,9 +97,9 @@ public:
        r.append_cell(id, atomic_cell_or_collection(std::move(cell)));
    }

-    virtual void accept_static_cell(column_id id, collection_mutation_view collection) override {
+    virtual void accept_static_cell(column_id id, collection_mutation collection) override {
        row& r = _static_row.maybe_create();
-        r.append_cell(id, collection_mutation(*_schema.static_column_at(id).type, std::move(collection)));
+        r.append_cell(id, std::move(collection));
    }

    virtual stop_iteration accept_row_tombstone(range_tombstone rt) override {
@@ -125,9 +125,9 @@ public:
        r.append_cell(id, std::move(cell));
    }

-    virtual void accept_row_cell(column_id id, collection_mutation_view collection) override {
+    virtual void accept_row_cell(column_id id, collection_mutation collection) override {
        row& r = _current_row->cells();
-        r.append_cell(id, collection_mutation(*_schema.regular_column_at(id).type, std::move(collection)));
+        r.append_cell(id, std::move(collection));
    }

    auto on_end_of_partition() {
--- a/mutation/mutation_partition.hh
+++ b/mutation/mutation_partition.hh
@@ -707,9 +707,10 @@ struct fmt::formatter<shadowable_tombstone> : fmt::formatter<string_view> {
    template <typename FormatContext>
    auto format(const shadowable_tombstone& t, FormatContext& ctx) const {
        if (t) {
+            auto& tomb = t.tomb();
            return fmt::format_to(ctx.out(),
                                  "{{shadowable tombstone: timestamp={}, deletion_time={}}}",
-                                  t.tomb().timestamp, t.tomb(), t.tomb().deletion_time.time_since_epoch().count());
+                                  tomb.timestamp, tomb.deletion_time.time_since_epoch().count());
        } else {
            return fmt::format_to(ctx.out(),
                                  "{{shadowable tombstone: none}}");
--- a/mutation/mutation_partition_view.cc
+++ b/mutation/mutation_partition_view.cc
@@ -86,37 +86,6 @@ atomic_cell read_atomic_cell(const abstract_type& type, atomic_cell_variant cv,
    return boost::apply_visitor(atomic_cell_visitor(type, cm), cv);
 }

-collection_mutation read_collection_cell(const abstract_type& type, ser::collection_cell_view cv)
-{
-    collection_mutation_description mut;
-    mut.tomb = cv.tomb();
-    auto&& elements = cv.elements();
-    mut.cells.reserve(elements.size());
-
-    visit(type, make_visitor(
-        [&] (const collection_type_impl& ctype) {
-            auto& value_type = *ctype.value_comparator();
-            for (auto&& e : elements) {
-                mut.cells.emplace_back(e.key(), read_atomic_cell(value_type, e.value(), atomic_cell::collection_member::yes));
-            }
-        },
-        [&] (const user_type_impl& utype) {
-            for (auto&& e : elements) {
-                bytes key = e.key();
-                auto idx = deserialize_field_index(key);
-                SCYLLA_ASSERT(idx < utype.size());
-
-                mut.cells.emplace_back(key, read_atomic_cell(*utype.type(idx), e.value(), atomic_cell::collection_member::yes));
-            }
-        },
-        [&] (const abstract_type& o) {
-            throw std::runtime_error(format("attempted to read a collection cell with type: {}", o.name()));
-        }
-    ));
-
-    return mut.serialize(type);
-}
-
 template<typename Visitor>
 void read_and_visit_row(ser::row_view rv, const column_mapping& cm, column_kind kind, Visitor&& visitor)
 {
@@ -142,14 +111,7 @@ void read_and_visit_row(ser::row_view rv, const column_mapping& cm, column_kind
                if (_col.is_atomic()) {
                    throw std::runtime_error("An atomic cell expected, got a collection");
                }
-                // FIXME: Pass view to cell to avoid copy
-                auto&& outer = current_allocator();
-                with_allocator(standard_allocator(), [&] {
-                    auto cell = read_collection_cell(*_col.type(), ccv);
-                    with_allocator(outer, [&] {
-                        _visitor.accept_collection(_id, cell);
-                    });
-                });
+                _visitor.accept_collection(_id, read_from_collection_cell_view(*_col.type(), ccv));
            }
            void operator()(ser::unknown_variant_type&) const {
                throw std::runtime_error("Trying to deserialize unknown cell type");
@@ -198,8 +160,8 @@ void mutation_partition_view::do_accept(const column_mapping& cm, Visitor& visit
        void accept_atomic_cell(column_id id, atomic_cell ac) const {
           _visitor.accept_static_cell(id, std::move(ac));
        }
-        void accept_collection(column_id id, const collection_mutation& cm) const {
-           _visitor.accept_static_cell(id, cm);
+        void accept_collection(column_id id, collection_mutation cm) const {
+           _visitor.accept_static_cell(id, std::move(cm));
        }
    };
    read_and_visit_row(mpv.static_row(), cm, column_kind::static_column, static_row_cell_visitor{visitor});
@@ -218,8 +180,8 @@ void mutation_partition_view::do_accept(const column_mapping& cm, Visitor& visit
            void accept_atomic_cell(column_id id, atomic_cell ac) const {
               _visitor.accept_row_cell(id, std::move(ac));
            }
-            void accept_collection(column_id id, const collection_mutation& cm) const {
-               _visitor.accept_row_cell(id, cm);
+            void accept_collection(column_id id, collection_mutation cm) const {
+               _visitor.accept_row_cell(id, std::move(cm));
            }
        };
        read_and_visit_row(cr.cells(), cm, column_kind::regular_column, cell_visitor{visitor});
@@ -240,8 +202,8 @@ future<> mutation_partition_view::do_accept_gently(const column_mapping& cm, Vis
        void accept_atomic_cell(column_id id, atomic_cell ac) const {
           _visitor.accept_static_cell(id, std::move(ac));
        }
-        void accept_collection(column_id id, const collection_mutation& cm) const {
-           _visitor.accept_static_cell(id, cm);
+        void accept_collection(column_id id, collection_mutation cm) const {
+           _visitor.accept_static_cell(id, std::move(cm));
        }
    };
    read_and_visit_row(mpv.static_row(), cm, column_kind::static_column, static_row_cell_visitor{visitor});
@@ -263,8 +225,8 @@ future<> mutation_partition_view::do_accept_gently(const column_mapping& cm, Vis
            void accept_atomic_cell(column_id id, atomic_cell ac) const {
               _visitor.accept_row_cell(id, std::move(ac));
            }
-            void accept_collection(column_id id, const collection_mutation& cm) const {
-               _visitor.accept_row_cell(id, cm);
+            void accept_collection(column_id id, collection_mutation cm) const {
+               _visitor.accept_row_cell(id, std::move(cm));
            }
        };
        read_and_visit_row(cr.cells(), cm, column_kind::regular_column, cell_visitor{visitor});
@@ -286,8 +248,8 @@ future<> mutation_partition_view::do_accept_gently(const column_mapping& cm, Asy
        void accept_atomic_cell(column_id id, atomic_cell ac) const {
           _visitor.accept_static_cell(id, std::move(ac));
        }
-        void accept_collection(column_id id, const collection_mutation& cm) const {
-           _visitor.accept_static_cell(id, cm);
+        void accept_collection(column_id id, collection_mutation cm) const {
+           _visitor.accept_static_cell(id, std::move(cm));
        }
    };
    read_and_visit_row(mpv.static_row(), cm, column_kind::static_column, static_row_cell_visitor{visitor});
@@ -308,8 +270,8 @@ future<> mutation_partition_view::do_accept_gently(const column_mapping& cm, Asy
            void accept_atomic_cell(column_id id, atomic_cell ac) const {
               _visitor.accept_row_cell(id, std::move(ac));
            }
-            void accept_collection(column_id id, const collection_mutation& cm) const {
-               _visitor.accept_row_cell(id, cm);
+            void accept_collection(column_id id, collection_mutation cm) const {
+               _visitor.accept_row_cell(id, std::move(cm));
            }
        };
        read_and_visit_row(cr.cells(), cm, column_kind::regular_column, cell_visitor{visitor});
@@ -337,8 +299,8 @@ mutation_partition_view::accept_ordered_result mutation_partition_view::do_accep
            void accept_atomic_cell(column_id id, atomic_cell ac) const {
                _visitor.accept_static_cell(id, std::move(ac));
            }
-            void accept_collection(column_id id, const collection_mutation& cm) const {
-                _visitor.accept_static_cell(id, cm);
+            void accept_collection(column_id id, collection_mutation cm) const {
+                _visitor.accept_static_cell(id, std::move(cm));
            }
        };
        read_and_visit_row(mpv.static_row(), cm, column_kind::static_column, static_row_cell_visitor{visitor});
@@ -376,8 +338,8 @@ mutation_partition_view::accept_ordered_result mutation_partition_view::do_accep
            void accept_atomic_cell(column_id id, atomic_cell ac) const {
                _visitor.accept_row_cell(id, std::move(ac));
            }
-            void accept_collection(column_id id, const collection_mutation& cm) const {
-                _visitor.accept_row_cell(id, cm);
+            void accept_collection(column_id id, collection_mutation cm) const {
+                _visitor.accept_row_cell(id, std::move(cm));
            }
        };
        read_and_visit_row(cr.cells(), cm, column_kind::regular_column, cell_visitor{visitor});
@@ -501,44 +463,40 @@ mutation_partition_view mutation_partition_view::from_view(ser::mutation_partiti

 clustering_row read_clustered_row(const schema& s, ser::clustering_row_view crv) {
    class clustering_row_builder {
-        const schema& _s;
        clustering_row _row;
    public:
-        clustering_row_builder(const schema& s, clustering_key key, row_tombstone t, row_marker m)
-            : _s(s), _row(std::move(key), std::move(t), std::move(m), row()) { }
+        clustering_row_builder(clustering_key key, row_tombstone t, row_marker m)
+            : _row(std::move(key), std::move(t), std::move(m), row()) { }
        void accept_atomic_cell(column_id id, atomic_cell ac) {
            _row.cells().append_cell(id, std::move(ac));
        }
-        void accept_collection(column_id id, const collection_mutation& cm) {
-            _row.cells().append_cell(id, collection_mutation(*_s.regular_column_at(id).type, cm));
+        void accept_collection(column_id id, collection_mutation cm) {
+            _row.cells().append_cell(id, std::move(cm));
        }
        clustering_row get() && { return std::move(_row); }
    };

    auto cr = crv.row();
    auto t = row_tombstone(cr.deleted_at(), shadowable_tombstone(cr.shadowable_deleted_at()));
-    clustering_row_builder builder(s, cr.key(), std::move(t), read_row_marker(cr.marker()));
+    clustering_row_builder builder(cr.key(), std::move(t), read_row_marker(cr.marker()));
    read_and_visit_row(cr.cells(), s.get_column_mapping(), column_kind::regular_column, builder);
    return std::move(builder).get();
 }

 static_row read_static_row(const schema& s, ser::static_row_view sr) {
    class static_row_builder {
-        const schema& _s;
        static_row _row;
    public:
-        explicit static_row_builder(const schema& s)
-            : _s(s) { }
        void accept_atomic_cell(column_id id, atomic_cell ac) {
            _row.cells().append_cell(id, std::move(ac));
        }
-        void accept_collection(column_id id, const collection_mutation& cm) {
-            _row.cells().append_cell(id, collection_mutation(*_s.static_column_at(id).type, cm));
+        void accept_collection(column_id id, collection_mutation cm) {
+            _row.cells().append_cell(id, std::move(cm));
        }
        static_row get() && { return std::move(_row); }
    };

-    static_row_builder builder(s);
+    static_row_builder builder;
    read_and_visit_row(sr.cells(), s.get_column_mapping(), column_kind::static_column, builder);
    return std::move(builder).get();
 }
--- a/mutation/mutation_partition_view.hh
+++ b/mutation/mutation_partition_view.hh
@@ -23,31 +23,31 @@ class converting_mutation_partition_applier;

 template<typename T>
 concept MutationViewVisitor = requires (T& visitor, tombstone t, atomic_cell ac,
-                                             collection_mutation_view cmv, range_tombstone rt,
+                                             collection_mutation cm, range_tombstone rt,
                                             position_in_partition_view pipv, row_tombstone row_tomb,
                                             row_marker rm) {
    visitor.accept_partition_tombstone(t);
    visitor.accept_static_cell(column_id(), std::move(ac));
-    visitor.accept_static_cell(column_id(), cmv);
+    visitor.accept_static_cell(column_id(), std::move(cm));
    visitor.accept_row_tombstone(rt);
    visitor.accept_row(pipv, row_tomb, rm,
            is_dummy::no, is_continuous::yes);
    visitor.accept_row_cell(column_id(), std::move(ac));
-    visitor.accept_row_cell(column_id(), cmv);
+    visitor.accept_row_cell(column_id(), std::move(cm));
 };

 template<typename T>
 concept AsyncMutationViewVisitor = requires (T& visitor, tombstone t, atomic_cell ac,
-                                             collection_mutation_view cmv, range_tombstone rt,
+                                             collection_mutation cm, range_tombstone rt,
                                             position_in_partition_view pipv, row_tombstone row_tomb,
                                             row_marker rm) {
    { visitor.accept_partition_tombstone(t) } -> std::same_as<void>;
    { visitor.accept_static_cell(column_id(), std::move(ac)) } -> std::same_as<void>;
-    { visitor.accept_static_cell(column_id(), cmv) } -> std::same_as<void>;
+    { visitor.accept_static_cell(column_id(), std::move(cm)) } -> std::same_as<void>;
    { visitor.accept_row_tombstone(rt) } -> std::same_as<future<>>;
    { visitor.accept_row(pipv, row_tomb, rm, is_dummy::no, is_continuous::yes) } -> std::same_as<future<>>;
    { visitor.accept_row_cell(column_id(), std::move(ac)) } -> std::same_as<void>;
-    { visitor.accept_row_cell(column_id(), cmv) } -> std::same_as<void>;
+    { visitor.accept_row_cell(column_id(), std::move(cm)) } -> std::same_as<void>;
    { visitor.accept_end_of_partition() } -> std::same_as<future<>>;
 };

@@ -56,11 +56,11 @@ public:
    virtual ~mutation_partition_view_virtual_visitor();
    virtual void accept_partition_tombstone(tombstone t) = 0;
    virtual void accept_static_cell(column_id, atomic_cell ac) = 0;
-    virtual void accept_static_cell(column_id, collection_mutation_view cmv) = 0;
+    virtual void accept_static_cell(column_id, collection_mutation cm) = 0;
    virtual stop_iteration accept_row_tombstone(range_tombstone rt) = 0;
    virtual stop_iteration accept_row(position_in_partition_view pipv, row_tombstone rt, row_marker rm, is_dummy, is_continuous) = 0;
    virtual void accept_row_cell(column_id, atomic_cell ac) = 0;
-    virtual void accept_row_cell(column_id, collection_mutation_view cmv) = 0;
+    virtual void accept_row_cell(column_id, collection_mutation cm) = 0;
 };

 // View on serialized mutation partition. See mutation_partition_serializer.
--- a/partition_builder.hh
+++ b/partition_builder.hh
@@ -46,8 +46,12 @@ public:
    }

    virtual void accept_static_cell(column_id id, collection_mutation_view collection) override {
+        accept_static_cell(id, collection_mutation(*_schema.static_column_at(id).type, std::move(collection)));
+    }
+
+    void accept_static_cell(column_id id, collection_mutation collection) {
         row& r = _partition.static_row().maybe_create();
-        r.append_cell(id, collection_mutation(*_schema.static_column_at(id).type, std::move(collection)));
+        r.append_cell(id, std::move(collection)); // already a collection_mutation: moved in, not rebuilt from a view
     }

    virtual void accept_row_tombstone(const range_tombstone& rt) override {
@@ -72,8 +76,12 @@ public:
    }

    virtual void accept_row_cell(column_id id, collection_mutation_view collection) override {
+        accept_row_cell(id, collection_mutation(*_schema.regular_column_at(id).type, std::move(collection)));
+    }
+
+    void accept_row_cell(column_id id, collection_mutation collection) {
        row& r = _current_row->cells();
-        r.append_cell(id, collection_mutation(*_schema.regular_column_at(id).type, std::move(collection)));
+        r.append_cell(id, std::move(collection));
    }
 };

--- a/pgo/exec_cql.py
+++ b/pgo/exec_cql.py
@@ -16,6 +16,7 @@ Usage:
 import argparse, os, sys
 from typing import Sequence

+
 def read_statements(path: str) -> list[tuple[int, str]]:
    stms: list[tuple[int, str]] = []
    with open(path, 'r', encoding='utf-8') as f:
--- a/pgo/profiles/aarch64/profile.profdata.xz
+++ b/pgo/profiles/aarch64/profile.profdata.xz
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:524c54493b72c5e1b783f14dfa49d733e21b24cc2ec776e9c6e578095073162d
-size 6646304
+oid sha256:8b22f9a548a03c88250d31e97ea3e8f77b4d90c502bcf74336c24056557f947f
+size 6698412
--- a/pgo/profiles/x86_64/profile.profdata.xz
+++ b/pgo/profiles/x86_64/profile.profdata.xz
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fec2bb253d43139da954cee3441fc8bc74824246b080f23bf1f824714d0adc45
-size 6646576
+oid sha256:31e515a62f006649b0dc4671b51b2643fba9a70884c09b90fbc2237044954254
+size 6707108
--- a/raft/server.cc
+++ b/raft/server.cc
@@ -239,7 +239,10 @@ private:

    // Drop waiter that we lost track of, can happen due to a snapshot transfer,
    // or a leader removed from cluster while some entries added on it are uncommitted.
-    void drop_waiters(std::optional<index_t> idx = {});
+    // When `snp` is provided (snapshot transfer case), only waiters with index <= snp->idx are dropped.
+    // Of those, waiters whose term matches the snapshot term are resolved successfully, since by the
+    // Log Matching Property they were committed and included in the snapshot; the rest get commit_status_unknown.
+    void drop_waiters(const snapshot_descriptor* snp = nullptr);

    // Wake up all waiter that wait for entries with idx smaller of equal to the one provided
    // to be applied.
@@ -556,12 +559,10 @@ future<> server_impl::wait_for_entry(entry_id eid, wait_type type, seastar::abor
                auto snap_term = _fsm->log_term_for(snap_idx);
                SCYLLA_ASSERT(snap_term);
                SCYLLA_ASSERT(snap_idx >= eid.idx);
-                if (type == wait_type::committed && snap_term == eid.term) {
+                if (snap_term == eid.term) {
                    logger.trace("[{}] wait_for_entry {}.{}: entry got truncated away, but has the snapshot's term"
                                 " (snapshot index: {})", id(), eid.term, eid.idx, snap_idx);
                    co_return;
-
-                    // We don't do this for `wait_type::applied` - see below why.
                }

                logger.trace("[{}] wait_for_entry {}.{}: entry got truncated away", id(), eid.term, eid.idx);
@@ -572,20 +573,6 @@ future<> server_impl::wait_for_entry(entry_id eid, wait_type type, seastar::abor
                throw dropped_entry();
            }

-            if (type == wait_type::applied && _fsm->log_last_snapshot_idx() >= eid.idx) {
-                // We know the entry was committed but the wait type is `applied`
-                // and we don't know if the entry was applied with `state_machine::apply`
-                // (we may've loaded a snapshot before we managed to apply the entry).
-                // As specified by `add_entry`, throw `commit_status_unknown` in this case.
-                //
-                // FIXME: replace this with a different exception type - `commit_status_unknown`
-                // gives too much uncertainty while we know that the entry was committed
-                // and had to be applied on at least one server. Some callers of `add_entry`
-                // need to know only that the current state includes that entry, whether it was done
-                // through `apply` on this server or through receiving a snapshot.
-                throw commit_status_unknown();
-            }
-
            co_return;
        }
    }
@@ -760,6 +747,8 @@ future<> server_impl::add_entry(command command, wait_type type, seastar::abort_
            throw not_a_leader{leader};
        }
        auto eid = co_await add_entry_on_leader(std::move(command), as);
+        co_await utils::get_local_injector().inject("block_raft_add_entry_before_wait_for_entry",
+                utils::wait_for_message(std::chrono::minutes(5)));
        co_return co_await wait_for_entry(eid, type, as);
    }

@@ -995,17 +984,24 @@ void server_impl::notify_waiters(std::map<index_t, op_status>& waiters,
    }
 }

-void server_impl::drop_waiters(std::optional<index_t> idx) {
+void server_impl::drop_waiters(const snapshot_descriptor* snp) {
    auto drop = [&] (std::map<index_t, op_status>& waiters) {
        while (waiters.size() != 0) {
            auto it = waiters.begin();
-            if (idx && it->first > *idx) {
+            if (snp && it->first > snp->idx) {
                break;
            }
            auto [entry_idx, status] = std::move(*it);
            waiters.erase(it);
-            status.done.set_exception(commit_status_unknown());
-            _stats.waiters_dropped++;
+            if (snp && status.term == snp->term) {
+                // entry_idx <= snapshot index and the entry's term matches the snapshot term.
+                // By the Log Matching Property the entry was committed and included in the snapshot.
+                status.done.set_value();
+                _stats.waiters_awoken++;
+            } else {
+                status.done.set_exception(commit_status_unknown());
+                _stats.waiters_dropped++;
+            }
        }
    };
    drop(_awaited_commits);
@@ -1431,7 +1427,7 @@ future<> server_impl::applier_fiber() {
                // Apply snapshot it to the state machine
                logger.trace("[{}] apply_fiber applying snapshot {}", _id, snp.id);
                co_await _state_machine->load_snapshot(snp.id);
-                drop_waiters(snp.idx);
+                drop_waiters(&snp);
                _applied_idx = snp.idx;
                _applied_index_changed.broadcast();
                _stats.sm_load_snapshot++;
@@ -1940,7 +1936,7 @@ std::unique_ptr<server> create_server(server_id uuid, std::unique_ptr<rpc> rpc,
 }

 std::ostream& operator<<(std::ostream& os, const server_impl& s) {
-    fmt::print(os, "[id: {}, fsm ()]\n", s._id, *s._fsm);
+    fmt::print(os, "[id: {}, fsm ({})]\n", s._id, *s._fsm);
    return os;
 }

--- a/raft/server.hh
+++ b/raft/server.hh
@@ -79,18 +79,18 @@ public:
    // The caller may pass a pointer to an abort_source to make the operation abortable.
    // If it passes nullptr, the operation is unabortable.
    //
-    // Successful `add_entry` with `wait_type::committed` does not guarantee that `state_machine::apply` will be called
-    // locally for this entry. Between the commit and the application we may receive a snapshot containing this entry,
-    // so the state machine's state 'jumps' forward in time, skipping the entry application.
-    // However, for `wait_type::applied`, we guarantee that the entry will be applied locally with `state_machine::apply`.
-    // If a snapshot causes the state machine to jump over the entry, `add_entry` will return `commit_status_unknown`
-    // (even if the snapshot included that entry).
+    // Successful `add_entry` does not guarantee that `state_machine::apply` will be called
+    // locally for this entry. Between the commit and the application we may load a snapshot
+    // containing this entry, so the state machine's state 'jumps' forward in time, skipping
+    // the local entry application. For `wait_type::applied` this should be fine, because
+    // state machine implementations shouldn't care whether an entry was applied via
+    // `state_machine::apply` or via a snapshot load.
    //
    // Exceptions:
    // raft::commit_status_unknown
    //     Thrown if the leader has changed and the log entry has either
    //     been replaced by the new leader or the server has lost track of it.
-    //     It may also be thrown in case of a transport error while forwarding add_entry to the leader.L
+    //     It may also be thrown in case of a transport error while forwarding add_entry to the leader.
    // raft::dropped_entry
    //     Thrown if the entry was replaced because of a leader change.
    // raft::request_aborted
--- a/replica/compaction_group.hh
+++ b/replica/compaction_group.hh
@@ -269,6 +269,10 @@ public:
    // Gets the view a sstable currently belongs to.
    compaction::compaction_group_view& view_for_sstable(const sstables::shared_sstable& sst) const;
    utils::small_vector<compaction::compaction_group_view*, 3> all_views() const;
+    // Returns true iff v is the repaired view of this compaction group.
+    bool is_repaired_view(const compaction::compaction_group_view* v) const noexcept;
+    // Returns an sstable set containing only the sstables classified as repaired.
+    lw_shared_ptr<sstables::sstable_set> make_repaired_sstable_set() const;

    seastar::condition_variable& get_staging_done_condition() noexcept {
        return _staging_done_condition;
@@ -404,6 +408,8 @@ public:

    // Make an sstable set spanning all sstables in the storage_group
    lw_shared_ptr<const sstables::sstable_set> make_sstable_set() const;
+    // Like make_sstable_set(), but contains only the repaired sstables of all compaction groups.
+    lw_shared_ptr<const sstables::sstable_set> make_repaired_sstable_set() const;

    future<utils::chunked_vector<logstor::segment_snapshot>> take_logstor_snapshot() const;

--- a/replica/database.cc
+++ b/replica/database.cc
@@ -1006,7 +1006,7 @@ future<database::keyspace_change_per_shard> database::prepare_update_keyspace_on
    co_await modify_keyspace_on_all_shards(sharded_db, [&] (replica::database& db) -> future<> {
        auto& ks = db.find_keyspace(ksm.name());
        auto new_ksm = ::make_lw_shared<keyspace_metadata>(ksm.name(), ksm.strategy_name(), ksm.strategy_options(), ksm.initial_tablets(), ksm.consistency_option(), ksm.durable_writes(),
-                ks.metadata()->cf_meta_data() | std::views::values | std::ranges::to<std::vector>(), ks.metadata()->user_types(), ksm.get_storage_options());
+                ks.metadata()->cf_meta_data() | std::views::values | std::ranges::to<std::vector>(), ks.metadata()->user_types(), ksm.get_storage_options(), ksm.next_strategy_options_opt());

        auto change = co_await db.prepare_update_keyspace(ks, new_ksm, pending_token_metadata.local());
        changes[this_shard_id()] = make_foreign(std::make_unique<keyspace_change>(std::move(change)));
@@ -1022,8 +1022,7 @@ void database::drop_keyspace(const sstring& name) {
 static bool is_system_table(const schema& s) {
    auto& k = s.ks_name();
    return k == db::system_keyspace::NAME ||
-        k == db::system_distributed_keyspace::NAME ||
-        k == db::system_distributed_keyspace::NAME_EVERYWHERE;
+        k == db::system_distributed_keyspace::NAME;
 }

 sstables::sstables_manager& database::get_sstables_manager(const schema& s) const {
@@ -1142,7 +1141,7 @@ future<> database::create_local_system_table(
        cfg.memtable_scheduling_group = default_scheduling_group();
        cfg.memtable_to_cache_scheduling_group = default_scheduling_group();
    }
-    auto lock = get_tables_metadata().hold_write_lock();
+    auto lock = co_await get_tables_metadata().hold_write_lock();
    std::exception_ptr ex;
    try {
        add_column_family(ks, table, std::move(cfg), replica::database::is_new_cf::no);
@@ -1328,9 +1327,27 @@ future<global_table_ptr> get_table_on_all_shards(sharded<database>& sharded_db,

 future<tables_metadata_lock_on_all_shards> database::lock_tables_metadata(sharded<database>& sharded_db) {
    tables_metadata_lock_on_all_shards locks;
-    co_await sharded_db.invoke_on_all([&] (auto& db) -> future<> {
+    // Acquire write lock on shard 0 first, and then on the remaining shards.
+    //
+    // Parallel acquisition on all shards could deadlock when two
+    // fibers call lock_tables_metadata() concurrently: parallel_for_each
+    // sends SMP messages to all shards even when the local shard's lock
+    // attempt blocks.  If task reordering (SEASTAR_SHUFFLE_TASK_QUEUE in
+    // debug/sanitize builds) causes fiber A to win on shard X while
+    // fiber B wins on shard Y, neither can make progress — classic
+    // cross-shard lock-ordering deadlock.
+    //
+    // Taking shard 0's write lock first, and only then the remaining shards'
+    // locks, eliminates this: a caller of this function never contends for
+    // another shard's lock unless it already holds shard 0's, so whichever fiber
+    // wins shard 0 acquires all other locks before a competitor gets past shard 0.
+    co_await sharded_db.invoke_on(0, [&locks, &sharded_db] (auto& db) -> future<> {
        locks.assign_lock(co_await db.get_tables_metadata().hold_write_lock());
+        co_await sharded_db.invoke_on_others([&locks] (auto& db) -> future<> {
+            locks.assign_lock(co_await db.get_tables_metadata().hold_write_lock());
+        });
    });
+
    co_return locks;
 }

--- a/replica/database.hh
+++ b/replica/database.hh
@@ -757,6 +757,10 @@ private:
    // groups during tablet split with overlapping token range, and we need to include them all in a single
    // sstable set to allow safe tombstone gc.
    lw_shared_ptr<const sstables::sstable_set> sstable_set_for_tombstone_gc(const compaction_group&) const;
+    // Like sstable_set_for_tombstone_gc(), but contains only the repaired sstables of all compaction
+    // groups of the same tablet (storage group).  Used by the tombstone_gc=repair optimization to avoid
+    // scanning unrepaired sstables when looking for GC-blocking shadows.
+    lw_shared_ptr<const sstables::sstable_set> make_repaired_sstable_set_for_tombstone_gc(const compaction_group&) const;

    bool cache_enabled() const {
        return _config.enable_cache && _schema->caching_options().enabled();
--- a/Show More
+++ b/Show More