test: assert ALTER TYPE RENAME rejected on frozen PK UDTs

Add assertion that ALTER TYPE RENAME is rejected when the UDT is used as a frozen partition key column. The existing test only covered ALTER TYPE ADD. This closes the coverage gap from dtest udtencoding_test.py::test_udt_change_in_partition_key, enabling its removal. Refs: SCYLLADB-1929
Merge 'Don't use database.get_config() to fetch calculate_view_update_throttling_delay option' from Pavel Emelyanov
2026-05-13 03:12:13 +00:00 · 2026-05-11 18:49:01 +03:00 · 2026-05-11 10:30:24 +03:00 · 2026-05-11 10:11:20 +03:00 · 2026-05-11 09:12:40 +03:00 · 2026-05-11 08:55:33 +03:00
185 changed files with 3321 additions and 1770 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -36,4 +36,6 @@ compile_commands.json
 clang_build
 .idea/
 nuke
-rust/target
+rust/**/target
+rust/**/Cargo.lock
+test/resource/wasm/rust/target
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -681,7 +681,7 @@ static bool calculate_primitive_condition(const parsed::primitive_condition& con
    case parsed::primitive_condition::type::VALUE:
        if (calculated_values.size() != 1) {
            // Shouldn't happen unless we have a bug in the parser
-            throw std::logic_error(format("Unexpected values in primitive_condition", cond._values.size()));
+            throw std::logic_error(format("Unexpected values {} in primitive_condition", cond._values.size()));
        }
        // Unwrap the boolean wrapped as the value (if it is a boolean)
        if (calculated_values[0].IsObject() && calculated_values[0].MemberCount() == 1) {
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -1362,6 +1362,33 @@ static int get_dimensions(const rjson::value& vector_attribute, std::string_view
    return dimensions_v->GetInt();
 }

+// As noted in issue #5052, in Alternator the CreateTable and UpdateTable are
+// currently synchronous - they return only after the operation is complete.
+// After announce() of the new schema finished, the schema change is committed
+// and a majority of nodes know it - but it's possible that some live nodes
+// have not yet applied the new schema. If we return to the user now, and the
+// user sends such a node a request that relies on the new schema, it might fail.
+// So before returning, we must verify that *all* nodes have applied the new
+// schema. This is what wait_for_schema_agreement_after_ddl() does.
+//
+// Note that wait_for_schema_agreement_after_ddl() has a timeout (currently
+// hard-coded to 30 seconds). If the timeout is reached an InternalServerError
+// is returned. The user, who doesn't know if the CreateTable succeeded or not,
+// can retry the request and will get a ResourceInUseException and know the
+// table already exists. So a CreateTable that returns a ResourceInUseException
+// should also call wait_for_schema_agreement_after_ddl().
+//
+// When issue #5052 is resolved, this function can be removed - we will need
+// to check if we reached schema agreement, but not to *wait* for it.
+static future<> wait_for_schema_agreement_after_ddl(service::migration_manager& mm, const replica::database& db) {
+    static constexpr auto schema_agreement_seconds = 30;
+    try {
+        co_await mm.wait_for_schema_agreement(db, db::timeout_clock::now() + std::chrono::seconds(schema_agreement_seconds), nullptr);
+    } catch (const service::migration_manager::schema_agreement_timeout&) {
+        throw api_error::internal(fmt::format("The operation was successful, but unable to confirm cluster-wide schema agreement after {} seconds. Please retry the operation, and wait for the retry to report an error since the operation was already done.", schema_agreement_seconds));
+    }
+}
+
 future<executor::request_return_type> executor::create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request, bool enforce_authorization, bool warn_authorization,
            const db::tablets_mode_t::mode tablets_mode, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
    throwing_assert(this_shard_id() == 0);
@@ -1695,13 +1722,26 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
                }
            }
        }
+        bool table_already_exists = false;
        try {
            schema_mutations = service::prepare_new_keyspace_announcement(_proxy.local_db(), ksm, ts);
        } catch (exceptions::already_exists_exception&) {
            if (_proxy.data_dictionary().has_schema(keyspace_name, table_name)) {
-                co_return api_error::resource_in_use(fmt::format("Table {} already exists", table_name));
+                table_already_exists = true;
            }
        }
+        if (table_already_exists) {
+            // The user may have retried a CreateTable operation after it timed
+            // out in wait_for_schema_agreement_after_ddl(). So before we may
+            // return ResourceInUseException (which can lead the user to start
+            // using the table which it now knows exists), we need to wait for
+            // schema agreement, just like the original CreateTable did. Again
+            // we fail with InternalServerError if schema agreement still cannot
+            // be reached. We can release group0_guard before waiting.
+            release_guard(std::move(group0_guard));
+            co_await wait_for_schema_agreement_after_ddl(_mm, _proxy.local_db());
+            co_return api_error::resource_in_use(fmt::format("Table {} already exists", table_name));
+        }
        if (_proxy.data_dictionary().try_find_table(schema->id())) {
            // This should never happen, the ID is supposed to be unique
            co_return api_error::internal(format("Table with ID {} already exists", schema->id()));
@@ -1750,7 +1790,7 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
        }
    }

-    co_await _mm.wait_for_schema_agreement(_proxy.local_db(), db::timeout_clock::now() + 10s, nullptr);
+    co_await wait_for_schema_agreement_after_ddl(_mm, _proxy.local_db());
    rjson::value status = rjson::empty_object();
    executor::supplement_table_info(request, *schema, _proxy);
    rjson::add(status, "TableDescription", std::move(request));
@@ -1860,7 +1900,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
            rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
            if (stream_specification && stream_specification->IsObject()) {
                empty_request = false;
-                if (add_stream_options(*stream_specification, builder, p.local())) {
+                if (add_stream_options(*stream_specification, builder, p.local(), tab->cdc_options())) {
                    validate_cdc_log_name_length(builder.cf_name());
                    // On tablet tables, defer stream enablement and block
                    // tablet merges (see defer_enabling_streams_block_tablet_merges).
@@ -1875,6 +1915,23 @@ future<executor::request_return_type> executor::update_table(client_state& clien
                        if (tab->cdc_options().enabled() || tab->cdc_options().enable_requested()) {
                            co_return api_error::validation("Table already has an enabled stream: TableName: " + tab->cf_name());
                        }
+                        // When re-enabling streams on an Alternator table, drop the old
+                        // CDC log table first as a separate schema change, so the
+                        // subsequent UpdateTable creates a fresh one with a new UUID
+                        // (= new StreamArn). See #7239.
+                        auto logname = cdc::log_name(tab->cf_name());
+                        auto& local_db = p.local().local_db();
+                        if (local_db.has_schema(tab->ks_name(), logname)
+                                && cdc::is_log_schema(*local_db.find_schema(tab->ks_name(), logname))) {
+                            auto drop_m = co_await service::prepare_column_family_drop_announcement(
+                                p.local(), tab->ks_name(), logname,
+                                group0_guard.write_timestamp());
+                            co_await mm.announce(std::move(drop_m), std::move(group0_guard),
+                                format("alternator-executor: drop old CDC log for {}", tab->cf_name()));
+                            co_await mm.wait_for_schema_agreement(
+                                p.local().local_db(), db::timeout_clock::now() + 10s, nullptr);
+                            continue;
+                        }
                    }
                    else if (!tab->cdc_options().enabled() && !tab->cdc_options().enable_requested()) {
                        co_return api_error::validation("Table has no stream to disable: TableName: " + tab->cf_name());
@@ -2189,7 +2246,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
                throw;
            }
        }
-        co_await mm.wait_for_schema_agreement(p.local().local_db(), db::timeout_clock::now() + 10s, nullptr);
+        co_await wait_for_schema_agreement_after_ddl(mm, p.local().local_db());

        rjson::value status = rjson::empty_object();
        supplement_table_info(request, *schema, p.local());
--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -30,6 +30,7 @@
 #include "utils/updateable_value.hh"

 #include "tracing/trace_state.hh"
+#include "cdc/cdc_options.hh"


 namespace db {
@@ -199,7 +200,7 @@ private:
        tracing::trace_state_ptr trace_state, service_permit permit);

 public:
-    static bool add_stream_options(const rjson::value& stream_spec, schema_builder&, service::storage_proxy& sp);
+    static bool add_stream_options(const rjson::value& stream_spec, schema_builder&, service::storage_proxy& sp, const cdc::options& existing_cdc_opts = {});
    static void supplement_table_info(rjson::value& descr, const schema& schema, service::storage_proxy& sp);
    static void supplement_table_stream_info(rjson::value& descr, const schema& schema, const service::storage_proxy& sp);
 };
--- a/alternator/streams.cc
+++ b/alternator/streams.cc
@@ -243,7 +243,10 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
        if (!is_alternator_keyspace(ks_name)) {
            continue;
        }
-        if (cdc::is_log_for_some_table(db.real_database(), ks_name, cf_name)) {
+        // Use get_base_table instead of is_log_for_some_table because the
+        // latter requires CDC to be enabled, but we want to list streams
+        // that have been disabled but whose log table still exists (#7239).
+        if (cdc::get_base_table(db.real_database(), ks_name, cf_name)) {
            rjson::value new_entry = rjson::empty_object();

            auto arn = stream_arn{ i->schema(), cdc::get_base_table(db.real_database(), *i->schema()) };
@@ -392,7 +395,7 @@ std::istream& operator>>(std::istream& is, stream_view_type& type) {
    return is;
 }

-static stream_view_type cdc_options_to_steam_view_type(const cdc::options& opts) {
+static stream_view_type cdc_options_to_stream_view_type(const cdc::options& opts) {
    stream_view_type type = stream_view_type::KEYS_ONLY;
    if (opts.preimage() && opts.postimage()) {
        type = stream_view_type::NEW_AND_OLD_IMAGES;
@@ -838,6 +841,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
    auto& opts = bs->cdc_options();

    auto status = "DISABLED";
+    bool stream_disabled = !opts.enabled();

    if (opts.enabled()) {
        if (!_cdc_metadata.streams_available()) {
@@ -853,7 +857,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl

    rjson::add(stream_desc, "StreamStatus", rjson::from_string(status));

-    stream_view_type type = cdc_options_to_steam_view_type(opts);
+    stream_view_type type = cdc_options_to_stream_view_type(opts);

    rjson::add(stream_desc, "StreamArn", stream_arn);
    rjson::add(stream_desc, "StreamViewType", type);
@@ -861,10 +865,9 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl

    describe_key_schema(stream_desc, *bs);

-    if (!opts.enabled()) {
-        rjson::add(ret, "StreamDescription", std::move(stream_desc));
-        co_return rjson::print(std::move(ret));
-    }
+    // For disabled streams, we still fall through to enumerate shards
+    // below. All shards will have EndingSequenceNumber set, indicating
+    // they are closed. See issue #7239.

    // TODO: label
    // TODO: creation time
@@ -947,6 +950,12 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
        auto expired = [&]() -> std::optional<db_clock::time_point> {
            auto j = std::next(i);
            if (j == e) {
+                // For a disabled stream, all shards are closed (#7239).
+                // Use "now" as the ending sequence number for the last
+                // generation's shards.
+                if (stream_disabled) {
+                    return db_clock::now();
+                }
                return std::nullopt;
            }
            // add this so we sort of match potential 
@@ -1297,7 +1306,7 @@ future<executor::request_return_type> executor::get_records(client_state& client
        | std::ranges::to<query::column_id_vector>()
    ;

-    stream_view_type type = cdc_options_to_steam_view_type(base->cdc_options());
+    stream_view_type type = cdc_options_to_stream_view_type(base->cdc_options());

    auto selection = cql3::selection::selection::for_columns(schema, std::move(columns));
    auto partition_slice = query::partition_slice(
@@ -1481,17 +1490,17 @@ future<executor::request_return_type> executor::get_records(client_state& client

    auto& shard = iter.shard;

-    if (shard.time < ts && ts < high_ts) {
+    if (!base->cdc_options().enabled()) {
+        // Stream is disabled -- all shards are closed (#7239).
+        // Don't return NextShardIterator.
+    } else if (shard.time < ts && ts < high_ts) {
        // The DynamoDB documentation states that when a shard is
        // closed, reading it until the end has NextShardIterator
        // "set to null". Our test test_streams_closed_read
        // confirms that by "null" they meant not set at all.
    } else {
-        // We could have return the same iterator again, but we did
-        // a search from it until high_ts and found nothing, so we
-        // can also start the next search from high_ts.
-        // TODO: but why? It's simpler just to leave the iterator be.
-        shard_iterator next_iter(iter.table, iter.shard, utils::UUID_gen::min_time_UUID(high_ts.time_since_epoch()), true);
+        // Shard is still open with no records in the scanned window.
+        // Return the original iterator so the client can poll again.
        rjson::add(ret, "NextShardIterator", iter);
    }
    _stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
@@ -1501,17 +1510,13 @@ future<executor::request_return_type> executor::get_records(client_state& client
    co_return rjson::print(std::move(ret));
 }

-bool executor::add_stream_options(const rjson::value& stream_specification, schema_builder& builder, service::storage_proxy& sp) {
+bool executor::add_stream_options(const rjson::value& stream_specification, schema_builder& builder, service::storage_proxy& sp, const cdc::options& existing_cdc_opts) {
    auto stream_enabled = rjson::find(stream_specification, "StreamEnabled");
    if (!stream_enabled || !stream_enabled->IsBool()) {
        throw api_error::validation("StreamSpecification needs boolean StreamEnabled");
    }

    if (stream_enabled->GetBool()) {
-        if (!sp.features().alternator_streams) {
-            throw api_error::validation("StreamSpecification: alternator streams feature not enabled in cluster.");
-        }
-
        cdc::options opts;
        opts.enabled(true);
        opts.tablet_merge_blocked(true);
@@ -1537,8 +1542,13 @@ bool executor::add_stream_options(const rjson::value& stream_specification, sche
        builder.with_cdc_options(opts);
        return true;
    } else {
-        cdc::options opts;
+        // When disabling, preserve the existing CDC options (preimage,
+        // postimage, ttl, etc.) so that DescribeStream can still report
+        // the correct StreamViewType on a disabled stream.
+        cdc::options opts = existing_cdc_opts;
        opts.enabled(false);
+        opts.enable_requested(false);
+        opts.tablet_merge_blocked(false);
        builder.with_cdc_options(opts);
        return false;
    }
@@ -1546,33 +1556,36 @@ bool executor::add_stream_options(const rjson::value& stream_specification, sche

 void executor::supplement_table_stream_info(rjson::value& descr, const schema& schema, const service::storage_proxy& sp) {
    auto& opts = schema.cdc_options();
-    if (opts.enabled()) {
-        auto db = sp.data_dictionary();
-        auto cf = db.find_table(schema.ks_name(), cdc::log_name(schema.cf_name()));
-        stream_arn arn(cf.schema(), cdc::get_base_table(db.real_database(), *cf.schema()));
+    // Report stream info when:
+    //   1. Log table exists (covers both enabled and disabled-but-readable).
+    //   2. enable_requested (ENABLING state, log not yet created).
+    auto db = sp.data_dictionary();
+    auto log_name = cdc::log_name(schema.cf_name());
+    auto log_cf = db.try_find_table(schema.ks_name(), log_name);
+    if (log_cf) {
+        auto log_schema = log_cf->schema();
+        stream_arn arn(log_schema, cdc::get_base_table(db.real_database(), *log_schema));
        rjson::add(descr, "LatestStreamArn", arn);
-        rjson::add(descr, "LatestStreamLabel", rjson::from_string(stream_label(*cf.schema())));
-    } else if (!opts.enable_requested()) {
-        return;
-    }
-    // For both enabled() and enable_requested():
-    // DynamoDB returns StreamEnabled=true in StreamSpecification even when
-    // the stream status is ENABLING (not yet fully active). We mirror this
-    // behavior: enable_requested means the user asked for streams but CDC
-    // is not yet finalized, so we still report StreamEnabled=true.
-    auto stream_desc = rjson::empty_object();
-    rjson::add(stream_desc, "StreamEnabled", true);
+        rjson::add(descr, "LatestStreamLabel", rjson::from_string(stream_label(*log_schema)));

-    auto mode = stream_view_type::KEYS_ONLY;
-    if (opts.preimage() && opts.postimage()) {
-        mode = stream_view_type::NEW_AND_OLD_IMAGES;
-    } else if (opts.preimage()) {
-        mode = stream_view_type::OLD_IMAGE;
-    } else if (opts.postimage()) {
-        mode = stream_view_type::NEW_IMAGE;
+        auto stream_desc = rjson::empty_object();
+        rjson::add(stream_desc, "StreamEnabled", opts.enabled());
+
+        stream_view_type mode = cdc_options_to_stream_view_type(opts);
+        rjson::add(stream_desc, "StreamViewType", mode);
+        rjson::add(descr, "StreamSpecification", std::move(stream_desc));
+    } else if (opts.enable_requested()) {
+        // DynamoDB returns StreamEnabled=true in StreamSpecification even when
+        // the stream status is ENABLING (not yet fully active). We mirror this
+        // behavior: enable_requested means the user asked for streams but CDC
+        // is not yet finalized, so we still report StreamEnabled=true.
+        auto stream_desc = rjson::empty_object();
+        rjson::add(stream_desc, "StreamEnabled", true);
+
+        stream_view_type mode = cdc_options_to_stream_view_type(opts);
+        rjson::add(stream_desc, "StreamViewType", mode);
+        rjson::add(descr, "StreamSpecification", std::move(stream_desc));
    }
-    rjson::add(stream_desc, "StreamViewType", mode);
-    rjson::add(descr, "StreamSpecification", std::move(stream_desc));
 }

 } // namespace alternator
--- a/audit/audit.cc
+++ b/audit/audit.cc
@@ -194,22 +194,36 @@ future<> audit::start_audit(const db::config& cfg, sharded<locator::shared_token
                                  std::move(audited_keyspaces),
                                  std::move(audited_tables),
                                  std::move(audited_categories),
-                                  std::cref(cfg))
-    .then([&cfg] {
-        if (!audit_instance().local_is_initialized()) {
-            return make_ready_future<>();
-        }
-        return audit_instance().invoke_on_all([&cfg] (audit& local_audit) {
-            return local_audit.start(cfg);
+                                  std::cref(cfg));
+}
+
+future<> audit::start_storage(const db::config& cfg) {
+    if (!audit_instance().local_is_initialized()) {
+        return make_ready_future<>();
+    }
+    return audit_instance().invoke_on_all([&cfg] (audit& local_audit) {
+        return local_audit._storage_helper_ptr->start(cfg).then([&local_audit] {
+            local_audit._storage_running = true;
        });
    });
 }

+future<> audit::stop_storage() {
+    if (!audit_instance().local_is_initialized()) {
+        return make_ready_future<>();
+    }
+    return audit_instance().invoke_on_all([] (audit& local_audit) {
+        local_audit._storage_running = false;
+        return local_audit._storage_helper_ptr->stop();
+    });
+}
+
 future<> audit::stop_audit() {
    if (!audit_instance().local_is_initialized()) {
        return make_ready_future<>();
    }
    return audit::audit::audit_instance().invoke_on_all([] (auto& local_audit) {
+        SCYLLA_ASSERT(!local_audit._storage_running);
        return local_audit.shutdown();
    }).then([] {
        return audit::audit::audit_instance().stop();
@@ -223,14 +237,6 @@ audit_info_ptr audit::create_audit_info(statement_category cat, const sstring& k
    return std::make_unique<audit_info>(cat, keyspace, table, batch);
 }

-future<> audit::start(const db::config& cfg) {
-    return _storage_helper_ptr->start(cfg);
-}
-
-future<> audit::stop() {
-    return _storage_helper_ptr->stop();
-}
-
 future<> audit::shutdown() {
    return make_ready_future<>();
 }
@@ -241,6 +247,12 @@ future<> audit::log(const audit_info& audit_info, const service::client_state& c
    const sstring& username = client_state.user() ? client_state.user()->name.value_or(anonymous_username) : no_username;
    socket_address client_ip = client_state.get_client_address().addr();
    socket_address node_ip = _token_metadata.get()->get_topology().my_address().addr();
+    if (!_storage_running) {
+        on_internal_error_noexcept(logger, fmt::format("Audit log dropped (storage not ready): node_ip {} category {} cl {} error {} keyspace {} query '{}' client_ip {} table {} username {}",
+            node_ip, audit_info.category_string(), cl, error, audit_info.keyspace(),
+            audit_info.query(), client_ip, audit_info.table(), username));
+        return make_ready_future<>();
+    }
    if (logger.is_enabled(logging::log_level::debug)) {
        logger.debug("Log written: node_ip {} category {} cl {} error {} keyspace {} query '{}' client_ip {} table {} username {}",
            node_ip, audit_info.category_string(), cl, error, audit_info.keyspace(),
@@ -286,6 +298,11 @@ future<> inspect(const audit_info_alternator& ai, const service::client_state& c

 future<> audit::log_login(const sstring& username, socket_address client_ip, bool error) noexcept {
    socket_address node_ip = _token_metadata.get()->get_topology().my_address().addr();
+    if (!_storage_running) {
+        on_internal_error_noexcept(logger, fmt::format("Audit login log dropped (storage not ready): node_ip {} client_ip {} username {} error {}",
+            node_ip, client_ip, username, error ? "true" : "false"));
+        return make_ready_future<>();
+    }
    if (logger.is_enabled(logging::log_level::debug)) {
        logger.debug("Login log written: node_ip {}, client_ip {}, username {}, error {}",
            node_ip, client_ip, username, error ? "true" : "false");
--- a/audit/audit.hh
+++ b/audit/audit.hh
@@ -141,6 +141,7 @@ private:
    category_set _audited_categories;

    std::unique_ptr<storage_helper> _storage_helper_ptr;
+    bool _storage_running = false;

    const db::config& _cfg;
    utils::observer<sstring> _cfg_keyspaces_observer;
@@ -163,6 +164,8 @@ public:
        return audit_instance().local();
    }
    static future<> start_audit(const db::config& cfg, sharded<locator::shared_token_metadata>& stm, sharded<cql3::query_processor>& qp, sharded<service::migration_manager>& mm);
+    static future<> start_storage(const db::config& cfg);
+    static future<> stop_storage();
    static future<> stop_audit();
    static audit_info_ptr create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table, bool batch = false);
    audit(locator::shared_token_metadata& stm,
@@ -174,8 +177,6 @@ public:
          category_set&& audited_categories,
          const db::config& cfg);
    ~audit();
-    future<> start(const db::config& cfg);
-    future<> stop();
    future<> shutdown();
    bool should_log(const audit_info& audit_info) const;
    bool will_log(statement_category cat, std::string_view keyspace = {}, std::string_view table = {}) const;
--- a/auth/cache.cc
+++ b/auth/cache.cc
@@ -185,24 +185,14 @@ future<lw_shared_ptr<cache::role_record>> cache::fetch_role(const role_name_t& r
        static const sstring q = format("SELECT role, name, value FROM {}.{} WHERE role = ?", db::system_keyspace::NAME, ROLE_ATTRIBUTES_CF);
        auto rs = co_await fetch(q);
        for (const auto& r : *rs) {
+            if (!r.has("value")) {
+                continue;
+            }
            rec->attributes[r.get_as<sstring>("name")] =
                    r.get_as<sstring>("value");
            co_await coroutine::maybe_yield();
        }
    }
-    // permissions
-    {
-        static const sstring q = format("SELECT role, resource, permissions FROM {}.{} WHERE role = ?", db::system_keyspace::NAME, PERMISSIONS_CF);
-        auto rs = co_await fetch(q);
-        for (const auto& r : *rs) {
-            auto resource = r.get_as<sstring>("resource");
-            auto perms_strings = r.get_set<sstring>("permissions");
-            std::unordered_set<sstring> perms_set(perms_strings.begin(), perms_strings.end());
-            auto pset = permissions::from_strings(perms_set);
-            rec->permissions[std::move(resource)] = std::move(pset);
-            co_await coroutine::maybe_yield();
-        }
-    }
    co_return rec;
 }

--- a/auth/cache.hh
+++ b/auth/cache.hh
@@ -44,7 +44,6 @@ public:
        std::unordered_set<role_name_t> members;
        sstring salted_hash;
        std::unordered_map<sstring, sstring, sstring_hash, sstring_eq> attributes;
-        std::unordered_map<sstring, permission_set, sstring_hash, sstring_eq> permissions;
    private:
        friend cache;
        // cached permissions include effects of role's inheritance
--- a/auth/default_authorizer.cc
+++ b/auth/default_authorizer.cc
@@ -76,7 +76,11 @@ default_authorizer::authorize(const role_or_anonymous& maybe_role, const resourc
    if (results->empty()) {
        co_return permissions::NONE;
    }
-    co_return permissions::from_strings(results->one().get_set<sstring>(PERMISSIONS_NAME));
+    const auto& row = results->one();
+    if (!row.has(PERMISSIONS_NAME)) {
+        co_return permissions::NONE;
+    }
+    co_return permissions::from_strings(row.get_set<sstring>(PERMISSIONS_NAME));
 }

 future<>
--- a/auth/ldap_role_manager.cc
+++ b/auth/ldap_role_manager.cc
@@ -258,13 +258,11 @@ future<> ldap_role_manager::start() {
            } catch (const seastar::sleep_aborted&) {
                co_return; // ignore
            }
-            co_await _cache.container().invoke_on_all([] (cache& c) -> future<> {
-                try {
-                    co_await c.reload_all_permissions();
-                } catch (...) {
-                    mylog.warn("Cache reload all permissions failed: {}", std::current_exception());
-                }
-            });
+            try {
+                co_await _cache.reload_all_permissions();
+            } catch (...) {
+                mylog.warn("Cache reload all permissions failed: {}", std::current_exception());
+            }
        }
    });
    return _std_mgr.start();
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -157,15 +157,12 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s
            return create_legacy_keyspace_if_missing(mm);
        });
    }
-    co_await _role_manager->start();
-    if (this_shard_id() == 0) {
-        // Role manager and password authenticator have this odd startup
-        // mechanism where they asynchronously create the superuser role
-        // in the background. Correct password creation depends on role
-        // creation therefore we need to wait here.
-        co_await _role_manager->ensure_superuser_is_created();
-    }
-    co_await when_all_succeed(_authorizer->start(), _authenticator->start()).discard_result();
+    // Authorizer must be started before the permission loader is set,
+    // because the loader calls _authorizer->authorize().
+    // The loader must be set before starting the role manager, because
+    // LDAP role manager starts a pruner fiber that calls
+    // reload_all_permissions() which asserts _permission_loader is set.
+    co_await _authorizer->start();
    if (!_used_by_maintenance_socket) {
        // Maintenance socket mode can't cache permissions because it has
        // different authorizer. We can't mix cached permissions, they could be
@@ -174,12 +171,27 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s
                &service::get_uncached_permissions,
                this, std::placeholders::_1, std::placeholders::_2));
    }
+    co_await _role_manager->start();
+    if (this_shard_id() == 0) {
+        // Role manager and password authenticator have this odd startup
+        // mechanism where they asynchronously create the superuser role
+        // in the background. Correct password creation depends on role
+        // creation therefore we need to wait here.
+        co_await _role_manager->ensure_superuser_is_created();
+    }
+    // Authenticator must be started after ensure_superuser_is_created()
+    // because password_authenticator queries system.roles for the
+    // superuser entry created by the role manager.
+    co_await _authenticator->start();
 }

 future<> service::stop() {
    _as.request_abort();
+    // Reverse of start() order.
+    co_await _authenticator->stop();
+    co_await _role_manager->stop();
    _cache.set_permission_loader(nullptr);
-    return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop()).discard_result();
+    co_await _authorizer->stop();
 }

 future<> service::ensure_superuser_is_created() {
--- a/cdc/split.cc
+++ b/cdc/split.cc
@@ -267,7 +267,7 @@ struct extract_row_visitor {
            visit_collection(v);
        },
        [&] (const abstract_type& o) {
-            throw std::runtime_error(format("extract_changes: unknown collection type:", o.name()));
+            throw std::runtime_error(format("extract_changes: unknown collection type: {}", o.name()));
        }
        ));
    }
--- a/cmake/mode.common.cmake
+++ b/cmake/mode.common.cmake
@@ -137,6 +137,24 @@ endfunction()

 option(Scylla_WITH_DEBUG_INFO "Enable debug info" OFF)

+# Time trace profiling: adds -ftime-trace to all C++ compilations (Clang only).
+# Each .o produces a companion .json file in the build directory that can be
+# analyzed with ClangBuildAnalyzer or loaded in chrome://tracing.
+#
+# Usage:
+#   cmake -DScylla_TIME_TRACE=ON ...
+#   ninja
+#   # Analyze results (requires ClangBuildAnalyzer):
+#   ClangBuildAnalyzer --all <build-dir> capture.bin
+#   ClangBuildAnalyzer --analyze capture.bin
+option(Scylla_TIME_TRACE "Enable Clang -ftime-trace for build profiling" OFF)
+if(Scylla_TIME_TRACE)
+  if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+    message(FATAL_ERROR "Scylla_TIME_TRACE requires Clang (found ${CMAKE_CXX_COMPILER_ID})")
+  endif()
+  add_compile_options(-ftime-trace)
+endif()
+
 macro(update_build_flags config)
  cmake_parse_arguments (
    parsed_args
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -1088,7 +1088,7 @@ void compaction_manager::register_metrics() {
        sm::make_gauge("normalized_backlog", [this] { return _last_backlog / available_memory(); },
                       sm::description("Holds the sum of normalized compaction backlog for all tables in the system. Backlog is normalized by dividing backlog by shard's available memory.")),
        sm::make_counter("validation_errors", [this] { return _validation_errors; },
-                       sm::description("Holds the number of encountered validation errors.")),
+                       sm::description("Holds the number of encountered validation errors.")).set_skip_when_empty(),
    });
 }

--- a/configure.py
+++ b/configure.py
@@ -285,8 +285,12 @@ def generate_compdb(compdb, ninja, buildfile, modes):
                os.symlink(compdb_target, compdb)
            except FileExistsError:
                # if there is already a valid compile_commands.json link in the
-                # source root, we are done.
-                pass
+                # source root, we are done. if it's a stale link, update it.
+                if os.path.islink(compdb):
+                    current_target = os.readlink(compdb)
+                    if not os.path.exists(current_target):
+                        os.unlink(compdb)
+                        os.symlink(compdb_target, compdb)
            return


@@ -593,6 +597,7 @@ scylla_tests = set([
    'test/boost/linearizing_input_stream_test',
    'test/boost/lister_test',
    'test/boost/locator_topology_test',
+    'test/boost/lock_tables_metadata_test',
    'test/boost/log_heap_test',
    'test/boost/logalloc_standard_allocator_segment_pool_backend_test',
    'test/boost/logalloc_test',
@@ -853,6 +858,10 @@ arg_parser.add_argument('--coverage', action = 'store_true', help = 'Compile scy
 arg_parser.add_argument('--build-dir', action='store', default='build',
                        help='Build directory path')
 arg_parser.add_argument('--disable-precompiled-header', action='store_true', default=False, help='Disable precompiled header for scylla binary')
+arg_parser.add_argument('--time-trace', action='store_true', default=False,
+                        help='Enable Clang -ftime-trace for build profiling. '
+                             'Each .o produces a .json file analyzable with '
+                             'ClangBuildAnalyzer or chrome://tracing')
 arg_parser.add_argument('-h', '--help', action='store_true', help='show this help message and exit')
 args = arg_parser.parse_args()
 if args.help:
@@ -1659,6 +1668,7 @@ deps['test/boost/combined_tests'] += [
    'test/boost/auth_cache_test.cc',
    'test/boost/auth_test.cc',
    'test/boost/batchlog_manager_test.cc',
+    'test/boost/table_helper_test.cc',
    'test/boost/cache_algorithm_test.cc',
    'test/boost/castas_fcts_test.cc',
    'test/boost/cdc_test.cc',
@@ -1710,7 +1720,7 @@ deps['test/boost/combined_tests'] += [
    'test/boost/sstable_compression_config_test.cc',
    'test/boost/sstable_directory_test.cc',
    'test/boost/sstable_set_test.cc',
-    'test/boost/sstable_tablet_streaming.cc',
+    'test/boost/sstable_tablet_streaming_test.cc',
    'test/boost/statement_restrictions_test.cc',
    'test/boost/storage_proxy_test.cc',
    'test/boost/tablets_test.cc',
@@ -1965,6 +1975,9 @@ user_cflags += ' -fextend-variable-liveness=none'
 if args.target != '':
    user_cflags += ' -march=' + args.target

+if args.time_trace:
+    user_cflags += ' -ftime-trace'
+
 for mode in modes:
    # Those flags are passed not only to Scylla objects, but also to libraries
    # that we compile ourselves.
@@ -2457,6 +2470,9 @@ def write_build_file(f,
            command = reloc/build_deb.sh --reloc-pkg $in --builddir $out
        rule unified
            command = unified/build_unified.sh --build-dir $builddir/$mode --unified-pkg $out
+        rule collect_pkgs
+            command = rm -rf $out && mkdir -p $out && cp $pkgs $out/
+            description = COLLECT $out
        rule rust_header
            command = cxxbridge --include rust/cxx.h --header $in > $out
            description = RUST_HEADER $out
@@ -2942,6 +2958,8 @@ def write_build_file(f,
        build dist-tar: phony dist-unified-tar dist-server-tar dist-python3-tar dist-cqlsh-tar

        build dist: phony dist-unified dist-server dist-python3 dist-cqlsh
+
+        build collect-dist: phony {' '.join([f'collect-dist-{mode}' for mode in default_modes])}
        '''))

    f.write(textwrap.dedent(f'''\
@@ -2949,7 +2967,28 @@ def write_build_file(f,
        rule dist-check
          command = ./tools/testing/dist-check/dist-check.sh --mode $mode
        '''))
+    deb_arch = {'x86_64': 'amd64', 'aarch64': 'arm64'}[arch]
+    deb_ver = f'{scylla_version}-{scylla_release}-1'
+    rpm_ver = f'{scylla_version}-{scylla_release}'
    for mode in build_modes:
+        server_rpms_dir = f'$builddir/dist/{mode}/redhat/RPMS/{arch}'
+        server_rpms = [f'{server_rpms_dir}/{scylla_product}{suffix}-{rpm_ver}.{arch}.rpm'
+                       for suffix in ['', '-server', '-server-debuginfo', '-conf', '-kernel-conf', '-node-exporter']]
+        cqlsh_rpms = [f'tools/cqlsh/build/redhat/RPMS/{arch}/{scylla_product}-cqlsh-{rpm_ver}.{arch}.rpm']
+        python3_rpms = [f'tools/python3/build/redhat/RPMS/{arch}/{scylla_product}-python3-{rpm_ver}.{arch}.rpm']
+        all_rpms = server_rpms + cqlsh_rpms + python3_rpms
+
+        server_deb_dir = f'$builddir/dist/{mode}/debian'
+        server_debs = [f'{server_deb_dir}/{scylla_product}{suffix}_{deb_ver}_{deb_arch}.deb'
+                       for suffix in ['', '-server', '-server-dbg', '-conf', '-kernel-conf', '-node-exporter']]
+        server_debs += [f'{server_deb_dir}/scylla-enterprise{suffix}_{deb_ver}_all.deb'
+                        for suffix in ['', '-server', '-conf', '-kernel-conf', '-node-exporter']]
+        cqlsh_debs = [f'tools/cqlsh/build/debian/{scylla_product}-cqlsh_{deb_ver}_{deb_arch}.deb',
+                      f'tools/cqlsh/build/debian/scylla-enterprise-cqlsh_{deb_ver}_all.deb']
+        python3_debs = [f'tools/python3/build/debian/{scylla_product}-python3_{deb_ver}_{deb_arch}.deb',
+                        f'tools/python3/build/debian/scylla-enterprise-python3_{deb_ver}_all.deb']
+        all_debs = server_debs + cqlsh_debs + python3_debs
+
        f.write(textwrap.dedent(f'''\
        build $builddir/{mode}/dist/tar/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz: copy tools/python3/build/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz
        build $builddir/{mode}/dist/tar/{scylla_product}-python3-package.tar.gz: copy tools/python3/build/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz
@@ -2957,6 +2996,11 @@ def write_build_file(f,
        build $builddir/{mode}/dist/tar/{scylla_product}-cqlsh-{scylla_version}-{scylla_release}.{arch}.tar.gz: copy tools/cqlsh/build/{scylla_product}-cqlsh-{scylla_version}-{scylla_release}.{arch}.tar.gz
        build $builddir/{mode}/dist/tar/{scylla_product}-cqlsh-package.tar.gz: copy tools/cqlsh/build/{scylla_product}-cqlsh-{scylla_version}-{scylla_release}.{arch}.tar.gz

+        build $builddir/{mode}/dist/rpm: collect_pkgs | {' '.join(all_rpms)} $builddir/dist/{mode}/redhat dist-cqlsh-rpm dist-python3-rpm
+          pkgs = {' '.join(all_rpms)}
+        build $builddir/{mode}/dist/deb: collect_pkgs | {' '.join(all_debs)} $builddir/dist/{mode}/debian dist-cqlsh-deb dist-python3-deb
+          pkgs = {' '.join(all_debs)}
+        build collect-dist-{mode}: phony $builddir/{mode}/dist/rpm $builddir/{mode}/dist/deb
        build {mode}-dist: phony dist-server-{mode} dist-server-debuginfo-{mode} dist-python3-{mode} dist-unified-{mode} dist-cqlsh-{mode}
        build dist-{mode}: phony {mode}-dist
        build dist-check-{mode}: dist-check
--- a/cql3/authorized_prepared_statements_cache.hh
+++ b/cql3/authorized_prepared_statements_cache.hh
@@ -136,9 +136,9 @@ public:
    {}

    future<> insert(auth::authenticated_user user, cql3::prepared_cache_key_type prep_cache_key, value_type v) noexcept {
-        return _cache.get_ptr(key_type(std::move(user), std::move(prep_cache_key)), [v = std::move(v)] (const cache_key_type&) mutable {
+        return _cache.insert(key_type(std::move(user), std::move(prep_cache_key)), [v = std::move(v)] (const cache_key_type&) mutable {
            return make_ready_future<value_type>(std::move(v));
-        }).discard_result();
+        });
    }

    value_ptr find(const auth::authenticated_user& user, const cql3::prepared_cache_key_type& prep_cache_key) {
--- a/cql3/expr/prepare_expr.cc
+++ b/cql3/expr/prepare_expr.cc
@@ -1070,7 +1070,7 @@ try_prepare_count_rows(const expr::function_call& fc, data_dictionary::database
                                .args = {},
                            };
                        } else {
-                            throw exceptions::invalid_request_exception(format("count() expects a column or the literal 1 as an argument", fc.args[0]));
+                            throw exceptions::invalid_request_exception(format("count() expects a column or the literal 1 as an argument, got {}", fc.args[0]));
                        }
                    }
                }
--- a/data_dictionary/data_dictionary.cc
+++ b/data_dictionary/data_dictionary.cc
@@ -339,7 +339,7 @@ static storage_options::object_storage object_storage_from_map(std::string_view
    }
    if (values.size() > allowed_options.size()) {
        throw std::runtime_error(fmt::format("Extraneous options for {}: {}; allowed: {}",
-            fmt::join(values | std::views::keys, ","), type,
+            type, fmt::join(values | std::views::keys, ","),
            fmt::join(allowed_options | std::views::keys, ",")));
    }
    options.type = std::string(type);
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -776,7 +776,7 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
    friend std::ostream& operator<<(std::ostream&, const segment&);
    friend class segment_manager;

-    size_t sector_overhead(size_t size) const {
+    constexpr size_t sector_overhead(size_t size) const {
        return (size / (_alignment - detail::sector_overhead_size)) * detail::sector_overhead_size;
    }

@@ -1028,18 +1028,21 @@ public:
        co_return me;
    }

-    /**
-     * Allocate a new buffer
-     */
-    void new_buffer(size_t s) {
-        SCYLLA_ASSERT(_buffer.empty());
-
+    std::tuple<size_t, size_t> buffer_usage_size(size_t s) const {
        auto overhead = segment_overhead_size;
        if (_file_pos == 0) {
            overhead += descriptor_header_size;
        }

-        s += overhead;
+        return {s + overhead, overhead};
+    }
+
+    /**
+     * Allocate a new buffer
+     */
+    void new_buffer(size_t size_in) {
+        SCYLLA_ASSERT(_buffer.empty());
+        auto [s, overhead] = buffer_usage_size(size_in);
        // add bookkeep data reqs. 
        auto a = align_up(s + sector_overhead(s), _alignment);
        auto k = std::max(a, default_size);
@@ -1427,6 +1430,9 @@ public:

    position_type next_position(size_t size) const {
        auto used = _buffer_ostream_size - _buffer_ostream.size();
+        if (used == 0) { // new chunk/segment
+            std::tie(size, std::ignore) = buffer_usage_size(size);
+        }
        used += size;
        return _file_pos + used + sector_overhead(used);
    }
@@ -1570,7 +1576,6 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
    clogger.debug("Attempting oversized alloc of {} entry writer", writer.num_entries);

    auto size = writer.size();
-    auto max_file_size = cfg.commitlog_segment_size_in_mb * 1024 * 1024;

    // check if this cannot be written at all...
    if (!cfg.allow_going_over_size_limit) {
@@ -1579,11 +1584,11 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
        // more worst case
        auto size_with_meta_overhead = size_with_sector_overhead
            + (1 + size_with_sector_overhead/max_mutation_size) * (segment::entry_overhead_size + segment::fragmented_entry_overhead_size + segment::segment_overhead_size)
-            * (1 + size_with_sector_overhead/max_file_size) * segment::descriptor_header_size
+            * (1 + size_with_sector_overhead/max_size) * segment::descriptor_header_size
            ;
        // this is not really true. We could have some space in current segment,
        // but again, lets be conservative.
-        auto max_file_size_avail = max_disk_size - max_file_size;
+        auto max_file_size_avail = max_disk_size - max_size;

        if (size_with_meta_overhead > max_file_size_avail) {
            throw std::invalid_argument(fmt::format("Mutation of {} bytes is too large for potentially available disk space of {}", size, max_file_size_avail));
@@ -1770,11 +1775,13 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
                    co_await s->close();
                    s = co_await get_segment();
                }
-                // bytes not counting overhead                
-                auto buf_rem = std::min(max_size - s->position(), s->_buffer_ostream.size());
+                // bytes left before max_size (zero once pos has passed it), not counting overhead
+                auto pos = s->position();
+                auto max = std::max<size_t>(pos, max_size);
+                auto buf_rem = std::min(max - pos, s->_buffer_ostream.size());

                size_t avail;
-                if (buf_rem > align) {
+                if (buf_rem >= align) {
                    auto rem2 = buf_rem - (1 + buf_rem/sector_size) * detail::sector_overhead_size;
                    avail = std::min(rem2, max_mutation_size)
                        - segment::entry_overhead_size
@@ -1784,7 +1791,7 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
                } else {
                    co_await s->cycle();
                    auto pos = s->position();
-                    auto max = std::max<size_t>(pos, max_file_size);
+                    auto max = std::max<size_t>(pos, max_size);
                    auto file_rem = max - pos;

                    if (file_rem < align) {
--- a/db/commitlog/commitlog_replayer.cc
+++ b/db/commitlog/commitlog_replayer.cc
@@ -217,7 +217,7 @@ future<> db::commitlog_replayer::impl::process(stats* s, commitlog::buffer_and_r
        if (cm_it == local_cm.end()) {
            if (!cer.get_column_mapping()) {
                rlogger.debug("replaying at {} v={} at {}", fm.column_family_id(), fm.schema_version(), rp);
-                throw std::runtime_error(format("unknown schema version {}, table=", fm.schema_version(), fm.column_family_id()));
+                throw std::runtime_error(format("unknown schema version {}, table={}", fm.schema_version(), fm.column_family_id()));
            }
            rlogger.debug("new schema version {} in entry {}", fm.schema_version(), rp);
            cm_it = local_cm.emplace(fm.schema_version(), *cer.get_column_mapping()).first;
--- a/db/config.cc
+++ b/db/config.cc
@@ -1921,7 +1921,7 @@ std::map<sstring, db::experimental_features_t::feature> db::experimental_feature
        {"lwt", feature::UNUSED},
        {"udf", feature::UDF},
        {"cdc", feature::UNUSED},
-        {"alternator-streams", feature::ALTERNATOR_STREAMS},
+        {"alternator-streams", feature::UNUSED},
        {"alternator-ttl", feature::UNUSED },
        {"consistent-topology-changes", feature::UNUSED},
        {"broadcast-tables", feature::BROADCAST_TABLES},
--- a/db/config.hh
+++ b/db/config.hh
@@ -115,7 +115,6 @@ struct experimental_features_t {
    enum class feature {
        UNUSED,
        UDF,
-        ALTERNATOR_STREAMS,
        BROADCAST_TABLES,
        KEYSPACE_STORAGE_OPTIONS,
        STRONGLY_CONSISTENT_TABLES,
--- a/db/heat_load_balance.cc
+++ b/db/heat_load_balance.cc
@@ -327,7 +327,7 @@ redistribute(const std::vector<float>& p, unsigned me, unsigned k) {
                }
            }

-            hr_logger.trace("     pp after1=", pp);
+            hr_logger.trace("     pp after1={}", pp);
            if (d.first == me) {
                // We only care what "me" sends, and only the elements in
                // the sorted list earlier than me could have forced it to
--- a/db/system_distributed_keyspace.cc
+++ b/db/system_distributed_keyspace.cc
@@ -13,7 +13,6 @@
 #include "replica/database.hh"
 #include "db/consistency_level_type.hh"
 #include "db/system_keyspace.hh"
-#include "db/config.hh"
 #include "schema/schema_builder.hh"
 #include "timeout_config.hh"
 #include "types/types.hh"
@@ -22,8 +21,6 @@
 #include "cdc/generation.hh"
 #include "cql3/query_processor.hh"
 #include "service/storage_proxy.hh"
-#include "gms/feature_service.hh"
-
 #include "service/migration_manager.hh"
 #include "locator/host_id.hh"

@@ -41,27 +38,10 @@ static logging::logger dlogger("system_distributed_keyspace");
 extern logging::logger cdc_log;

 namespace db {
-namespace {
-    const auto set_wait_for_sync_to_commitlog = schema_builder::register_schema_initializer([](schema_builder& builder) {
-        if ((builder.ks_name() == system_distributed_keyspace::NAME_EVERYWHERE && builder.cf_name() == system_distributed_keyspace::CDC_GENERATIONS_V2) ||
-            (builder.ks_name() == system_distributed_keyspace::NAME && builder.cf_name() == system_distributed_keyspace::CDC_TOPOLOGY_DESCRIPTION))
-        {
-            builder.set_wait_for_sync_to_commitlog(true);
-        }
-    });
-}

 extern thread_local data_type cdc_streams_set_type;
 thread_local data_type cdc_streams_set_type = set_type_impl::get_instance(bytes_type, false);

-/* See `token_range_description` struct */
-thread_local data_type cdc_streams_list_type = list_type_impl::get_instance(bytes_type, false);
-thread_local data_type cdc_token_range_description_type = tuple_type_impl::get_instance(
-        { long_type             // dht::token token_range_end;
-        , cdc_streams_list_type // std::vector<stream_id> streams;
-        , byte_type             // uint8_t sharding_ignore_msb;
-        });
-thread_local data_type cdc_generation_description_type = list_type_impl::get_instance(cdc_token_range_description_type, false);

 schema_ptr view_build_status() {
    static thread_local auto schema = [] {
@@ -77,42 +57,6 @@ schema_ptr view_build_status() {
    return schema;
 }

-/* An internal table used by nodes to exchange CDC generation data. */
-schema_ptr cdc_generations_v2() {
-    thread_local auto schema = [] {
-        auto id = generate_legacy_id(system_distributed_keyspace::NAME_EVERYWHERE, system_distributed_keyspace::CDC_GENERATIONS_V2);
-        return schema_builder(system_distributed_keyspace::NAME_EVERYWHERE, system_distributed_keyspace::CDC_GENERATIONS_V2, {id})
-                /* The unique identifier of this generation. */
-                .with_column("id", uuid_type, column_kind::partition_key)
-                /* The generation describes a mapping from all tokens in the token ring to a set of stream IDs.
-                 * This mapping is built from a bunch of smaller mappings, each describing how tokens in a subrange
-                 * of the token ring are mapped to stream IDs; these subranges together cover the entire token ring.
-                 * Each such range-local mapping is represented by a row of this table.
-                 * The clustering key of the row is the end of the range being described by this row.
-                 * The start of this range is the range_end of the previous row (in the clustering order, which is the integer order)
-                 * or of the last row of this partition if this is the first the first row. */
-                .with_column("range_end", long_type, column_kind::clustering_key)
-                /* The set of streams mapped to in this range.
-                 * The number of streams mapped to a single range in a CDC generation is bounded from above by the number
-                 * of shards on the owner of that range in the token ring.
-                 * In other words, the number of elements of this set is bounded by the maximum of the number of shards
-                 * over all nodes. The serialized size is obtained by counting about 20B for each stream.
-                 * For example, if all nodes in the cluster have at most 128 shards,
-                 * the serialized size of this set will be bounded by ~2.5 KB. */
-                .with_column("streams", cdc_streams_set_type)
-                /* The value of the `ignore_msb` sharding parameter of the node which was the owner of this token range
-                 * when the generation was first created. Together with the set of streams above it fully describes
-                 * the mapping for this particular range. */
-                .with_column("ignore_msb", byte_type)
-                /* Column used for sanity checking.
-                 * For a given generation it's equal to the number of ranges in this generation;
-                 * thus, after the generation is fully inserted, it must be equal to the number of rows in the partition. */
-                .with_column("num_ranges", int32_type, column_kind::static_column)
-                .with_hash_version()
-                .build();
-    }();
-    return schema;
-}

 /* A user-facing table providing identifiers of the streams used in CDC generations. */
 schema_ptr cdc_desc() {
@@ -152,23 +96,6 @@ schema_ptr cdc_timestamps() {

 static const sstring CDC_TIMESTAMPS_KEY = "timestamps";

-schema_ptr service_levels() {
-    static thread_local auto schema = [] {
-        auto id = generate_legacy_id(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS);
-        auto builder = schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS, std::make_optional(id))
-                .with_column("service_level", utf8_type, column_kind::partition_key)
-                .with_column("shares", int32_type);
-        if (utils::get_local_injector().is_enabled("service_levels_v1_table_without_shares")) {
-            builder.remove_column("shares");
-        }
-
-        return builder
-                .with_hash_version()
-                .build();
-    }();
-    return schema;
-}
-
 // This is the set of tables which this node ensures to exist in the cluster.
 // It does that by announcing the creation of these schemas on initialization
 // of the `system_distributed_keyspace` service (see `start()`), unless it first
@@ -182,19 +109,13 @@ schema_ptr service_levels() {
 static std::vector<schema_ptr> ensured_tables() {
    return {
        view_build_status(),
-        cdc_generations_v2(),
        cdc_desc(),
        cdc_timestamps(),
-        service_levels(),
    };
 }

 std::vector<schema_ptr> system_distributed_keyspace::all_distributed_tables() {
-    return {view_build_status(), cdc_desc(), cdc_timestamps(), service_levels()};
-}
-
-std::vector<schema_ptr> system_distributed_keyspace::all_everywhere_tables() {
-    return {cdc_generations_v2()};
+    return {view_build_status(), cdc_desc(), cdc_timestamps()};
 }

 system_distributed_keyspace::system_distributed_keyspace(cql3::query_processor& qp, service::migration_manager& mm, service::storage_proxy& sp)
@@ -203,36 +124,6 @@ system_distributed_keyspace::system_distributed_keyspace(cql3::query_processor&
        , _sp(sp) {
 }

-static std::vector<std::pair<std::string_view, data_type>> new_service_levels_columns(bool workload_prioritization_enabled) {
-    std::vector<std::pair<std::string_view, data_type>> new_columns {{"timeout", duration_type}, {"workload_type", utf8_type}};
-    if (workload_prioritization_enabled) {
-        new_columns.push_back({"shares", int32_type});
-    }
-    return new_columns;
-};
-
-static schema_ptr get_current_service_levels(data_dictionary::database db) {
-    return db.has_schema(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS)
-            ? db.find_schema(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS)
-            : service_levels();
-}
-
-static schema_ptr get_updated_service_levels(data_dictionary::database db, bool workload_prioritization_enabled) {
-    SCYLLA_ASSERT(this_shard_id() == 0);
-    auto schema = get_current_service_levels(db);
-    schema_builder b(schema);
-    for (const auto& col : new_service_levels_columns(workload_prioritization_enabled)) {
-        auto& [col_name, col_type] = col;
-        bytes options_name = to_bytes(col_name.data());
-        if (schema->get_column_definition(options_name)) {
-            continue;
-        }
-        b.with_column(options_name, col_type, column_kind::regular_column);
-    }
-    b.with_hash_version();
-    return b.build();
-}
-
 future<> system_distributed_keyspace::create_tables(std::vector<schema_ptr> tables) {
    if (this_shard_id() != 0) {
        _started = true;
@@ -243,11 +134,9 @@ future<> system_distributed_keyspace::create_tables(std::vector<schema_ptr> tabl

    while (true) {
        // Check if there is any work to do before taking the group 0 guard.
-        bool workload_prioritization_enabled = _sp.features().workload_prioritization;
-        bool keyspaces_setup = db.has_keyspace(NAME) && db.has_keyspace(NAME_EVERYWHERE);
+        bool keyspaces_setup = db.has_keyspace(NAME);
        bool tables_setup = std::all_of(tables.begin(), tables.end(), [db] (schema_ptr t) { return db.has_schema(t->ks_name(), t->cf_name()); } );
-        bool service_levels_up_to_date = get_current_service_levels(db)->equal_columns(*get_updated_service_levels(db, workload_prioritization_enabled));
-        if (keyspaces_setup && tables_setup && service_levels_up_to_date) {
+        if (keyspaces_setup && tables_setup) {
            dlogger.info("system_distributed(_everywhere) keyspaces and tables are up-to-date. Not creating");
            _started = true;
            co_return;
@@ -258,51 +147,25 @@ future<> system_distributed_keyspace::create_tables(std::vector<schema_ptr> tabl
        utils::chunked_vector<mutation> mutations;
        sstring description;

-        auto sd_ksm = keyspace_metadata::new_keyspace(
+        auto ksm = keyspace_metadata::new_keyspace(
                NAME,
                "org.apache.cassandra.locator.SimpleStrategy",
                {{"replication_factor", "3"}},
                std::nullopt, std::nullopt);
        if (!db.has_keyspace(NAME)) {
-            mutations = service::prepare_new_keyspace_announcement(db.real_database(), sd_ksm, ts);
+            mutations = service::prepare_new_keyspace_announcement(db.real_database(), ksm, ts);
            description += format(" create {} keyspace;", NAME);
        } else {
            dlogger.info("{} keyspace is already present. Not creating", NAME);
        }

-        auto sde_ksm = keyspace_metadata::new_keyspace(
-                NAME_EVERYWHERE,
-                "org.apache.cassandra.locator.EverywhereStrategy",
-                {},
-                std::nullopt, std::nullopt);
-        if (!db.has_keyspace(NAME_EVERYWHERE)) {
-            auto sde_mutations = service::prepare_new_keyspace_announcement(db.real_database(), sde_ksm, ts);
-            std::move(sde_mutations.begin(), sde_mutations.end(), std::back_inserter(mutations));
-            description += format(" create {} keyspace;", NAME_EVERYWHERE);
-        } else {
-            dlogger.info("{} keyspace is already present. Not creating", NAME_EVERYWHERE);
-        }
-
-        // Get mutations for creating and updating tables.
+        // Get mutations for creating tables.
        auto num_keyspace_mutations = mutations.size();
        co_await coroutine::parallel_for_each(ensured_tables(),
-                [this, &mutations, db, ts, sd_ksm, sde_ksm, workload_prioritization_enabled] (auto&& table) -> future<> {
-            auto ksm = table->ks_name() == NAME ? sd_ksm : sde_ksm;
-
-            // Ensure that the service_levels table contains new columns.
-            if (table->cf_name() == SERVICE_LEVELS) {
-                table = get_updated_service_levels(db, workload_prioritization_enabled);
-            }
-
+                [this, &mutations, db, ts, ksm] (auto&& table) -> future<> {
            if (!db.has_schema(table->ks_name(), table->cf_name())) {
                co_return co_await service::prepare_new_column_family_announcement(mutations, _sp, *ksm, std::move(table), ts);
            }
-
-            // The service_levels table exists. Update it if it lacks new columns.
-            if (table->cf_name() == SERVICE_LEVELS && !get_current_service_levels(db)->equal_columns(*table)) {
-                auto update_mutations = co_await service::prepare_column_family_update_announcement(_sp, table, std::vector<view_ptr>(), ts);
-                std::move(update_mutations.begin(), update_mutations.end(), std::back_inserter(mutations));
-            }
        });
        if (mutations.size() > num_keyspace_mutations) {
            description += " create and update system_distributed(_everywhere) tables";
@@ -324,15 +187,6 @@ future<> system_distributed_keyspace::create_tables(std::vector<schema_ptr> tabl
    }
 }

- future<> system_distributed_keyspace::start_workload_prioritization() {
-    if (this_shard_id() != 0) {
-        co_return;
-    }
-    if (_qp.db().features().workload_prioritization) {
-       co_await create_tables({get_updated_service_levels(_qp.db(), true)});
-    }
-}
-
 future<> system_distributed_keyspace::start() {
    if (this_shard_id() != 0) {
        _started = true;
@@ -375,90 +229,6 @@ static db::consistency_level quorum_if_many(size_t num_token_owners) {
    return num_token_owners > 1 ? db::consistency_level::QUORUM : db::consistency_level::ONE;
 }

-future<>
-system_distributed_keyspace::insert_cdc_generation(
-        utils::UUID id,
-        const cdc::topology_description& desc,
-        context ctx) {
-    using namespace std::chrono_literals;
-
-    const size_t concurrency = 10;
-    const size_t num_replicas = ctx.num_token_owners;
-
-    // To insert the data quickly and efficiently we send it in batches of multiple rows
-    // (each batch represented by a single mutation). We also send multiple such batches concurrently.
-    // However, we need to limit the memory consumption of the operation.
-    // I assume that the memory consumption grows linearly with the number of replicas
-    // (we send to all replicas ``at the same time''), with the batch size (the data must
-    // be copied for each replica?) and with concurrency. These assumptions may be too conservative
-    // but that won't hurt in a significant way (it may hurt the efficiency of the operation a little).
-    // Thus, if we want to limit the memory consumption to L, it should be true that
-    // mutation_size * num_replicas * concurrency <= L, hence
-    // mutation_size <= L / (num_replicas * concurrency).
-    // For example, say L = 10MB, concurrency = 10, num_replicas = 100; we get
-    // mutation_size <= 10MB / 1000 = 10KB.
-    // On the other hand we must have mutation_size >= size of a single row,
-    // so we will use mutation_size <= max(size of single row, L/(num_replicas*concurrency)).
-
-    // It has been tested that sending 1MB batches to 3 replicas with concurrency 20 works OK,
-    // which would correspond to L ~= 60MB. Hence that's the limit we use here.
-    const size_t L = 60'000'000;
-    const auto mutation_size_threshold = std::max(size_t(1), L / (num_replicas * concurrency));
-
-    auto s = _qp.db().real_database().find_schema(
-        system_distributed_keyspace::NAME_EVERYWHERE, system_distributed_keyspace::CDC_GENERATIONS_V2);
-    auto ms = co_await cdc::get_cdc_generation_mutations_v2(s, id, desc, mutation_size_threshold, api::new_timestamp());
-    co_await max_concurrent_for_each(ms, concurrency, [&] (mutation& m) -> future<> {
-        co_await _sp.mutate(
-            { std::move(m) },
-            db::consistency_level::ALL,
-            db::timeout_clock::now() + 60s,
-            nullptr, // trace_state
-            empty_service_permit(),
-            db::allow_per_partition_rate_limit::no,
-            false // raw_counters
-        );
-    });
-}
-
-future<std::optional<cdc::topology_description>>
-system_distributed_keyspace::read_cdc_generation(utils::UUID id) {
-    utils::chunked_vector<cdc::token_range_description> entries;
-    size_t num_ranges = 0;
-    co_await _qp.query_internal(
-            // This should be a local read so 20s should be more than enough
-            format("SELECT range_end, streams, ignore_msb, num_ranges FROM {}.{} WHERE id = ? USING TIMEOUT 20s", NAME_EVERYWHERE, CDC_GENERATIONS_V2),
-            db::consistency_level::ONE, // we wrote the generation with ALL so ONE must see it (or there's something really wrong)
-            { id },
-            1000, // for ~1KB rows, ~1MB page size
-            [&] (const cql3::untyped_result_set_row& row) {
-
-        std::vector<cdc::stream_id> streams;
-        row.get_list_data<bytes>("streams", std::back_inserter(streams));
-        entries.push_back(cdc::token_range_description{
-                dht::token::from_int64(row.get_as<int64_t>("range_end")),
-                std::move(streams),
-                uint8_t(row.get_as<int8_t>("ignore_msb"))});
-        num_ranges = row.get_as<int32_t>("num_ranges");
-        return make_ready_future<stop_iteration>(stop_iteration::no);
-    });
-
-    if (entries.empty()) {
-        co_return std::nullopt;
-    }
-
-    // Paranoic sanity check. Partial reads should not happen since generations should be retrieved only after they
-    // were written successfully with CL=ALL. But nobody uses EverywhereStrategy tables so they weren't ever properly
-    // tested, so just in case...
-    if (entries.size() != num_ranges) {
-        throw std::runtime_error(format(
-                "read_cdc_generation: wrong number of rows. The `num_ranges` column claimed {} rows,"
-                " but reading the partition returned {}.", num_ranges, entries.size()));
-    }
-
-    co_return std::optional{cdc::topology_description(std::move(entries))};
-}
-
 static future<utils::chunked_vector<mutation>> get_cdc_streams_descriptions_v2_mutation(
        const replica::database& db,
        db_clock::time_point time,
@@ -630,65 +400,4 @@ system_distributed_keyspace::cdc_current_generation_timestamp(context ctx) {
    co_return timestamp_cql->one().get_as<db_clock::time_point>("time");
 }

-future<qos::service_levels_info> system_distributed_keyspace::get_service_levels(qos::query_context ctx) const {
-    return qos::get_service_levels(_qp, NAME, SERVICE_LEVELS, db::consistency_level::ONE, ctx);
-}
-
-future<qos::service_levels_info> system_distributed_keyspace::get_service_level(sstring service_level_name) const {
-    return qos::get_service_level(_qp, NAME, SERVICE_LEVELS, service_level_name, db::consistency_level::ONE);
-}
-
-future<> system_distributed_keyspace::set_service_level(sstring service_level_name, qos::service_level_options slo) const {
-    static sstring prepared_query = format("INSERT INTO {}.{} (service_level) VALUES (?);", NAME, SERVICE_LEVELS);
-    co_await _qp.execute_internal(prepared_query, db::consistency_level::ONE, internal_distributed_query_state(), {service_level_name}, cql3::query_processor::cache_internal::no);
-    auto to_data_value = [&] (const qos::service_level_options::timeout_type& tv) {
-        return std::visit(overloaded_functor {
-            [&] (const qos::service_level_options::unset_marker&) {
-                return data_value::make_null(duration_type);
-            },
-            [&] (const qos::service_level_options::delete_marker&) {
-                return data_value::make_null(duration_type);
-            },
-            [&] (const lowres_clock::duration& d) {
-                return data_value(cql_duration(months_counter{0},
-                        days_counter{0},
-                        nanoseconds_counter{std::chrono::duration_cast<std::chrono::nanoseconds>(d).count()}));
-            },
-        }, tv);
-    };
-    auto to_data_value_g = [&] <typename T> (const std::variant<qos::service_level_options::unset_marker, qos::service_level_options::delete_marker, T>& v) {
-        return std::visit(overloaded_functor {
-            [&] (const qos::service_level_options::unset_marker&) {
-                return data_value::make_null(data_type_for<T>());
-            },
-            [&] (const qos::service_level_options::delete_marker&) {
-                return data_value::make_null(data_type_for<T>());
-            },
-            [&] (const T& v) {
-                return data_value(v);
-            },
-        }, v);
-    };
-    data_value workload = slo.workload == qos::service_level_options::workload_type::unspecified
-            ? data_value::make_null(utf8_type)
-            : data_value(qos::service_level_options::to_string(slo.workload));
-    co_await _qp.execute_internal(format("UPDATE {}.{} SET timeout = ?, workload_type = ? WHERE service_level = ?;", NAME, SERVICE_LEVELS),
-                db::consistency_level::ONE,
-                internal_distributed_query_state(),
-                {to_data_value(slo.timeout),
-                    workload,
-                    service_level_name},
-                cql3::query_processor::cache_internal::no);
-    co_await _qp.execute_internal(format("UPDATE {}.{} SET shares = ? WHERE service_level = ?;", NAME, SERVICE_LEVELS),
-                db::consistency_level::ONE,
-                internal_distributed_query_state(),
-                {to_data_value_g(slo.shares), service_level_name},
-                cql3::query_processor::cache_internal::no);
-}
-
-future<> system_distributed_keyspace::drop_service_level(sstring service_level_name) const {
-    static sstring prepared_query = format("DELETE FROM {}.{} WHERE service_level= ?;", NAME, SERVICE_LEVELS);
-    return _qp.execute_internal(prepared_query, db::consistency_level::ONE, internal_distributed_query_state(), {service_level_name}, cql3::query_processor::cache_internal::no).discard_result();
-}
-
 }
--- a/db/system_distributed_keyspace.hh
+++ b/db/system_distributed_keyspace.hh
@@ -9,9 +9,6 @@
 #pragma once

 #include "schema/schema_fwd.hh"
-#include "service/qos/qos_common.hh"
-#include "utils/UUID.hh"
-#include "cdc/generation_id.hh"
 #include "locator/host_id.hh"

 #include <seastar/core/future.hh>
@@ -24,7 +21,6 @@ class query_processor;
 }

 namespace cdc {
-    class stream_id;
    class topology_description;
    class streams_version;
 } // namespace cdc
@@ -39,17 +35,8 @@ namespace db {
 class system_distributed_keyspace {
 public:
    static constexpr auto NAME = "system_distributed";
-    static constexpr auto NAME_EVERYWHERE = "system_distributed_everywhere";

    static constexpr auto VIEW_BUILD_STATUS = "view_build_status";
-    static constexpr auto SERVICE_LEVELS = "service_levels";
-
-    /* Nodes use this table to communicate new CDC stream generations to other nodes. */
-    static constexpr auto CDC_TOPOLOGY_DESCRIPTION = "cdc_generation_descriptions";
-
-    /* Nodes use this table to communicate new CDC stream generations to other nodes.
-     * Resides in system_distributed_everywhere. */
-    static constexpr auto CDC_GENERATIONS_V2 = "cdc_generation_descriptions_v2";

    /* This table is used by CDC clients to learn about available CDC streams. */
    static constexpr auto CDC_DESC_V2 = "cdc_streams_descriptions_v2";
@@ -77,19 +64,14 @@ private:

 public:
    static std::vector<schema_ptr> all_distributed_tables();
-    static std::vector<schema_ptr> all_everywhere_tables();

    system_distributed_keyspace(cql3::query_processor&, service::migration_manager&, service::storage_proxy&);

    future<> start();
-    future<> start_workload_prioritization();
    future<> stop();

    bool started() const { return _started; }

-    future<> insert_cdc_generation(utils::UUID, const cdc::topology_description&, context);
-    future<std::optional<cdc::topology_description>> read_cdc_generation(utils::UUID);
-
    future<> create_cdc_desc(db_clock::time_point, const cdc::topology_description&, context);
    future<bool> cdc_desc_exists(db_clock::time_point, context);

@@ -105,11 +87,6 @@ public:
    // NOTE: currently used only by alternator
    future<db_clock::time_point> cdc_current_generation_timestamp(context);

-    future<qos::service_levels_info> get_service_levels(qos::query_context ctx) const;
-    future<qos::service_levels_info> get_service_level(sstring service_level_name) const;
-    future<> set_service_level(sstring service_level_name, qos::service_level_options slo) const;
-    future<> drop_service_level(sstring service_level_name) const;
-
 private:
    future<> create_tables(std::vector<schema_ptr> tables);
 };
--- a/db/view/node_view_update_backlog.hh
+++ b/db/view/node_view_update_backlog.hh
@@ -10,6 +10,7 @@

 #include "db/view/view_update_backlog.hh"
 #include "utils/error_injection.hh"
+#include "utils/updateable_value.hh"

 #include <seastar/core/cacheline.hh>
 #include <seastar/core/future.hh>
@@ -41,13 +42,16 @@ class node_update_backlog {
    std::chrono::milliseconds _interval;
    std::atomic<clock::time_point> _last_update;
    std::atomic<update_backlog> _max;
+    utils::updateable_value<uint32_t> _view_flow_control_delay_limit_in_ms;

 public:
-    explicit node_update_backlog(size_t shards, std::chrono::milliseconds interval)
+    explicit node_update_backlog(size_t shards, std::chrono::milliseconds interval,
+            utils::updateable_value<uint32_t> view_flow_control_delay_limit_in_ms = utils::updateable_value<uint32_t>(1000))
            : _backlogs(shards)
            , _interval(interval)
            , _last_update(clock::now() - _interval)
-            , _max(update_backlog::no_backlog()) {
+            , _max(update_backlog::no_backlog())
+            , _view_flow_control_delay_limit_in_ms(std::move(view_flow_control_delay_limit_in_ms)) {
        if (utils::get_local_injector().enter("update_backlog_immediately")) {
            _interval = std::chrono::milliseconds(0);
            _last_update = clock::now();
@@ -59,6 +63,9 @@ public:
    update_backlog fetch_shard(unsigned shard);
    seastar::future<std::optional<update_backlog>> fetch_if_changed();

+    std::chrono::microseconds calculate_throttling_delay(update_backlog backlog,
+            db::timeout_clock::time_point timeout) const;
+
    // Exposed for testing only.
    update_backlog load() const {
        return _max.load(std::memory_order_relaxed);
--- a/db/view/row_locking.cc
+++ b/db/view/row_locking.cc
@@ -150,14 +150,14 @@ row_locker::unlock(const dht::decorated_key* pk, bool partition_exclusive,
        auto pli = _two_level_locks.find(*pk);
        if (pli == _two_level_locks.end()) {
            // This shouldn't happen... We can't unlock this lock if we can't find it...
-            mylog.error("column_family::local_base_lock_holder::~local_base_lock_holder() can't find lock for partition", *pk);
+            mylog.error("column_family::local_base_lock_holder::~local_base_lock_holder() can't find lock for partition {}", *pk);
            return;
        }
        SCYLLA_ASSERT(&pli->first == pk);
        if (cpk) {
            auto rli = pli->second._row_locks.find(*cpk);
            if (rli == pli->second._row_locks.end()) {
-                mylog.error("column_family::local_base_lock_holder::~local_base_lock_holder() can't find lock for row", *cpk);
+                mylog.error("column_family::local_base_lock_holder::~local_base_lock_holder() can't find lock for row {}", *cpk);
                return;
            }
            SCYLLA_ASSERT(&rli->first == cpk);
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -45,6 +45,7 @@
 #include "db/view/view_builder.hh"
 #include "db/view/view_updating_consumer.hh"
 #include "db/view/view_update_generator.hh"
+#include "db/view/node_view_update_backlog.hh"
 #include "db/view/regular_column_transformation.hh"
 #include "db/system_keyspace_view_types.hh"
 #include "db/system_keyspace.hh"
@@ -3492,18 +3493,27 @@ future<> delete_ghost_rows_visitor::do_accept_new_row(partition_key pk, clusteri
    }
 }

-std::chrono::microseconds calculate_view_update_throttling_delay(db::view::update_backlog backlog,
-                                                                 db::timeout_clock::time_point timeout,
-                                                                 uint32_t view_flow_control_delay_limit_in_ms) {
+// View updates are asynchronous, and because of this limiting their concurrency requires
+// a special approach. The current algorithm places all of the pending view updates in the backlog
+// and artificially slows down new responses to coordinator requests based on how full the backlog is.
+// This function calculates how much a request should be slowed down based on the backlog's fullness.
+// The equation is basically: delay = view_fullness_ratio^3 * view_flow_control_delay_limit_in_ms
+// The more full the backlog gets the more aggressively the requests are slowed down.
+// The delay is limited to the amount of time left until timeout.
+// After the timeout the request fails, so there's no point in waiting longer than that.
+// The second argument defines this timeout point - we can't delay the request more than this time point.
+// See: https://www.scylladb.com/2018/12/04/worry-free-ingestion-flow-control/
+std::chrono::microseconds node_update_backlog::calculate_throttling_delay(update_backlog backlog,
+                                                                         db::timeout_clock::time_point timeout) const {
    auto adjust = [] (float x) { return x * x * x; };
-    auto budget = std::max(service::storage_proxy::clock_type::duration(0),
-        timeout - service::storage_proxy::clock_type::now());
-    std::chrono::microseconds ret(uint32_t(adjust(backlog.relative_size()) * view_flow_control_delay_limit_in_ms * 1000));
+    auto budget = std::max(db::timeout_clock::duration(0),
+        timeout - db::timeout_clock::now());
+    std::chrono::microseconds ret(uint32_t(adjust(backlog.relative_size()) * _view_flow_control_delay_limit_in_ms() * 1000));
    // "budget" has millisecond resolution and can potentially be long
    // in the future so converting it to microseconds may overflow.
    // So to compare buget and ret we need to convert both to the lower
    // resolution.
-    if (std::chrono::duration_cast<service::storage_proxy::clock_type::duration>(ret) < budget) {
+    if (std::chrono::duration_cast<db::timeout_clock::duration>(ret) < budget) {
        return ret;
    } else {
        // budget is small (< ret) so can be converted to microseconds
--- a/db/view/view_building_worker.cc
+++ b/db/view/view_building_worker.cc
@@ -715,7 +715,7 @@ future<> view_building_worker::do_build_range(table_id base_id, std::vector<tabl
            vbw_logger.info("Building range {} for base table {} and views {} was aborted.", range, base_id, views_ids);
        } catch (...) {
            eptr = std::current_exception();
-            vbw_logger.warn("Error during processing range {} for base table {} and views {}: ", range, base_id, views_ids, eptr);
+            vbw_logger.warn("Error during processing range {} for base table {} and views {}: {}", range, base_id, views_ids, eptr);
        }
        reader.close().get();

--- a/db/view/view_update_backlog.hh
+++ b/db/view/view_update_backlog.hh
@@ -43,7 +43,7 @@ public:
    // Returns the number of bytes in the backlog divided by the maximum number of bytes
    // that the backlog can hold before employing admission control. While the backlog
    // is below the threshold, the coordinator will slow down the view updates up to
-    // calculate_view_update_throttling_delay()::delay_limit_us. Above the threshold,
+    // the delay limit applied by node_update_backlog::calculate_throttling_delay(). Above the threshold,
    // the coordinator will reject the writes that would increase the backlog. On the
    // replica, the writes will start failing only after reaching the hard limit '_max'.
    float relative_size() const {
@@ -70,18 +70,4 @@ public:
    }
 };

-// View updates are asynchronous, and because of this limiting their concurrency requires
-// a special approach. The current algorithm places all of the pending view updates in the backlog
-// and artificially slows down new responses to coordinator requests based on how full the backlog is.
-// This function calculates how much a request should be slowed down based on the backlog's fullness.
-// The equation is basically: delay(in seconds) = view_fullness_ratio^3
-// The more full the backlog gets the more aggressively the requests are slowed down.
-// The delay is limited to the amount of time left until timeout.
-// After the timeout the request fails, so there's no point in waiting longer than that.
-// The second argument defines this timeout point - we can't delay the request more than this time point.
-// See: https://www.scylladb.com/2018/12/04/worry-free-ingestion-flow-control/
-std::chrono::microseconds calculate_view_update_throttling_delay(
-    update_backlog backlog,
-    db::timeout_clock::time_point timeout,
-    uint32_t view_flow_control_delay_limit_in_ms);
 }
--- a/db/view/view_update_generator.cc
+++ b/db/view/view_update_generator.cc
@@ -7,6 +7,7 @@
 */

 #include "db/view/view_update_backlog.hh"
+#include "db/view/node_view_update_backlog.hh"
 #include <seastar/core/timed_out_error.hh>
 #include "gms/inet_address.hh"
 #include <seastar/util/defer.hh>
@@ -95,9 +96,10 @@ public:
    }
 };

-view_update_generator::view_update_generator(replica::database& db, sharded<service::storage_proxy>& proxy, abort_source& as)
+view_update_generator::view_update_generator(replica::database& db, sharded<service::storage_proxy>& proxy, node_update_backlog& node_backlog, abort_source& as)
        : _db(db)
        , _proxy(proxy)
+        , _node_update_backlog(node_backlog)
        , _progress_tracker(std::make_unique<progress_tracker>())
        , _early_abort_subscription(as.subscribe([this] () noexcept { do_abort(); }))
 {
@@ -112,7 +114,7 @@ future<> view_update_generator::start() {
    _started = seastar::async([this]() mutable {
        auto drop_sstable_references = defer([&] () noexcept {
            // Clear sstable references so sstables_manager::stop() doesn't hang.
-            vug_logger.info("leaving {} unstaged sstables unprocessed",
+            vug_logger.info("leaving {} unstaged sstables and {} sstables with tables unprocessed",
                    _sstables_to_move.size(), _sstables_with_tables.size());
            _sstables_to_move.clear();
            _sstables_with_tables.clear();
@@ -498,7 +500,7 @@ future<> view_update_generator::generate_and_propagate_view_updates(const replic
        // the one which limits the number of incoming client requests by delaying the response to the client.
        if (batch_num > 0) {
            update_backlog local_backlog = _db.get_view_update_backlog();
-            std::chrono::microseconds throttle_delay =  calculate_view_update_throttling_delay(local_backlog, timeout, _db.get_config().view_flow_control_delay_limit_in_ms());
+            std::chrono::microseconds throttle_delay =  _node_update_backlog.calculate_throttling_delay(local_backlog, timeout);

            co_await seastar::sleep(throttle_delay);

--- a/db/view/view_update_generator.hh
+++ b/db/view/view_update_generator.hh
@@ -52,6 +52,7 @@ using allow_hints = bool_class<allow_hints_tag>;

 namespace db::view {

+class node_update_backlog;
 class stats;
 struct wait_for_all_updates_tag {};
 using wait_for_all_updates = bool_class<wait_for_all_updates_tag>;
@@ -63,6 +64,7 @@ public:
 private:
    replica::database& _db;
    sharded<service::storage_proxy>& _proxy;
+    node_update_backlog& _node_update_backlog;
    seastar::abort_source _as;
    future<> _started = make_ready_future<>();
    seastar::condition_variable _pending_sstables;
@@ -75,7 +77,7 @@ private:
    optimized_optional<abort_source::subscription> _early_abort_subscription;
    void do_abort() noexcept;
 public:
-    view_update_generator(replica::database& db, sharded<service::storage_proxy>& proxy, abort_source& as);
+    view_update_generator(replica::database& db, sharded<service::storage_proxy>& proxy, node_update_backlog& node_backlog, abort_source& as);
    ~view_update_generator();

    future<> start();
--- a/dist/CMakeLists.txt
+++ b/dist/CMakeLists.txt
@@ -141,4 +141,72 @@ add_dependencies(dist
  dist-python3
  dist-server)

+set(dist_rpm_dir "${CMAKE_BINARY_DIR}/$<CONFIG>/dist/rpm")  # per-configuration directory filled by collect-dist-rpm
+set(dist_deb_dir "${CMAKE_BINARY_DIR}/$<CONFIG>/dist/deb")  # per-configuration directory filled by collect-dist-deb
+
+# Map system processor to Debian architecture names.
+# Any processor other than x86_64/aarch64 stops configuration with a fatal error.
+if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+  set(deb_arch "amd64")
+elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
+  set(deb_arch "arm64")
+else()
+  message(FATAL_ERROR "Unsupported architecture: ${CMAKE_SYSTEM_PROCESSOR}")
+endif()
+# Version strings as they appear in package file names; the DEB form carries an extra "-1" suffix.
+set(rpm_ver "${Scylla_VERSION}-${Scylla_RELEASE}")
+set(deb_ver "${Scylla_VERSION}-${Scylla_RELEASE}-1")
+set(rpm_arch "${CMAKE_SYSTEM_PROCESSOR}")
+# RPM files copied by collect-dist-rpm: server packages from this build tree, cqlsh/python3 from tools/*/build in the source tree.
+set(server_rpms_dir "${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG>/redhat/RPMS/${rpm_arch}")
+set(server_rpms
+  "${server_rpms_dir}/${Scylla_PRODUCT}-${rpm_ver}.${rpm_arch}.rpm"
+  "${server_rpms_dir}/${Scylla_PRODUCT}-server-${rpm_ver}.${rpm_arch}.rpm"
+  "${server_rpms_dir}/${Scylla_PRODUCT}-server-debuginfo-${rpm_ver}.${rpm_arch}.rpm"
+  "${server_rpms_dir}/${Scylla_PRODUCT}-conf-${rpm_ver}.${rpm_arch}.rpm"
+  "${server_rpms_dir}/${Scylla_PRODUCT}-kernel-conf-${rpm_ver}.${rpm_arch}.rpm"
+  "${server_rpms_dir}/${Scylla_PRODUCT}-node-exporter-${rpm_ver}.${rpm_arch}.rpm")
+set(cqlsh_rpms
+  "${CMAKE_SOURCE_DIR}/tools/cqlsh/build/redhat/RPMS/${rpm_arch}/${Scylla_PRODUCT}-cqlsh-${rpm_ver}.${rpm_arch}.rpm")
+set(python3_rpms
+  "${CMAKE_SOURCE_DIR}/tools/python3/build/redhat/RPMS/${rpm_arch}/${Scylla_PRODUCT}-python3-${rpm_ver}.${rpm_arch}.rpm")
+# DEB files copied by collect-dist-deb; the scylla-enterprise-* entries are listed with the "all" architecture.
+set(server_debs_dir "${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG>/debian")
+set(server_debs
+  "${server_debs_dir}/${Scylla_PRODUCT}_${deb_ver}_${deb_arch}.deb"
+  "${server_debs_dir}/${Scylla_PRODUCT}-server_${deb_ver}_${deb_arch}.deb"
+  "${server_debs_dir}/${Scylla_PRODUCT}-server-dbg_${deb_ver}_${deb_arch}.deb"
+  "${server_debs_dir}/${Scylla_PRODUCT}-conf_${deb_ver}_${deb_arch}.deb"
+  "${server_debs_dir}/${Scylla_PRODUCT}-kernel-conf_${deb_ver}_${deb_arch}.deb"
+  "${server_debs_dir}/${Scylla_PRODUCT}-node-exporter_${deb_ver}_${deb_arch}.deb"
+  "${server_debs_dir}/scylla-enterprise_${deb_ver}_all.deb"
+  "${server_debs_dir}/scylla-enterprise-server_${deb_ver}_all.deb"
+  "${server_debs_dir}/scylla-enterprise-conf_${deb_ver}_all.deb"
+  "${server_debs_dir}/scylla-enterprise-kernel-conf_${deb_ver}_all.deb"
+  "${server_debs_dir}/scylla-enterprise-node-exporter_${deb_ver}_all.deb")
+set(cqlsh_debs
+  "${CMAKE_SOURCE_DIR}/tools/cqlsh/build/debian/${Scylla_PRODUCT}-cqlsh_${deb_ver}_${deb_arch}.deb"
+  "${CMAKE_SOURCE_DIR}/tools/cqlsh/build/debian/scylla-enterprise-cqlsh_${deb_ver}_all.deb")
+set(python3_debs
+  "${CMAKE_SOURCE_DIR}/tools/python3/build/debian/${Scylla_PRODUCT}-python3_${deb_ver}_${deb_arch}.deb"
+  "${CMAKE_SOURCE_DIR}/tools/python3/build/debian/scylla-enterprise-python3_${deb_ver}_all.deb")
+# collect-dist-rpm: delete and recreate the RPM output directory, then copy every listed RPM into it.
+add_custom_target(collect-dist-rpm
+  COMMAND ${CMAKE_COMMAND} -E rm -rf ${dist_rpm_dir}
+  COMMAND ${CMAKE_COMMAND} -E make_directory ${dist_rpm_dir}
+  COMMAND ${CMAKE_COMMAND} -E copy ${server_rpms} ${cqlsh_rpms} ${python3_rpms} ${dist_rpm_dir}/
+  DEPENDS dist
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+  COMMENT "Collecting RPMs into ${dist_rpm_dir}")
+# collect-dist-deb: same sequence as collect-dist-rpm, applied to the DEB lists and the DEB output directory.
+add_custom_target(collect-dist-deb
+  COMMAND ${CMAKE_COMMAND} -E rm -rf ${dist_deb_dir}
+  COMMAND ${CMAKE_COMMAND} -E make_directory ${dist_deb_dir}
+  COMMAND ${CMAKE_COMMAND} -E copy ${server_debs} ${cqlsh_debs} ${python3_debs} ${dist_deb_dir}/
+  DEPENDS dist
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+  COMMENT "Collecting DEBs into ${dist_deb_dir}")
+# collect-dist: umbrella target that runs no commands of its own and names both collection targets in DEPENDS.
+add_custom_target(collect-dist
+  DEPENDS collect-dist-rpm collect-dist-deb)
+
 add_subdirectory(debuginfo)
--- a/docs/alternator/compatibility.md
+++ b/docs/alternator/compatibility.md
@@ -324,6 +324,13 @@ experimental:
    stream events. Without this option, such no-op operations may still
    generate spurious stream events.
    <https://github.com/scylladb/scylladb/issues/28368>
+  * When a stream is disabled, no new records are written but the existing
+    stream data is preserved and remains readable through its original
+    StreamArn. The data expires via TTL after 24 hours. Re-enabling the
+    stream purges the old data immediately and produces a new StreamArn.
+    In contrast, DynamoDB keeps the old stream and its data readable for
+    24 hours through the old StreamArn even after re-enabling.
+    <https://scylladb.atlassian.net/browse/SCYLLADB-1873>

 ## Unimplemented API features

--- a/docs/cql/ddl.rst
+++ b/docs/cql/ddl.rst
@@ -415,7 +415,7 @@ An empty list is allowed, and it's equivalent to numeric replication factor of 0
 .. code-block:: cql

  ALTER KEYSPACE Excelsior
-   WITH replication = { 'class' : 'NetworkTopologyStrategy', dc2' : []};
+   WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc2' : []};


 Altering from a rack list to a numeric replication factor is not supported.
@@ -1017,11 +1017,11 @@ For example:

    CREATE TABLE customer_data (
        cust_id uuid,
-        cust_first-name text,
-        cust_last-name text,
+        "cust_first-name" text,
+        "cust_last-name" text,
        cust_phone text,
-        cust_get-sms text,
-        PRIMARY KEY (customer_id)
+        "cust_get-sms" text,
+        PRIMARY KEY (cust_id)
    ) WITH cdc = { 'enabled' : 'true', 'preimage' : 'true' };

 .. _cql-caching-options:
--- a/docs/cql/dml/insert.rst
+++ b/docs/cql/dml/insert.rst
@@ -24,7 +24,8 @@ For example:

    INSERT INTO NerdMovies (movie, director, main_actor, year)
          VALUES ('Serenity', 'Joss Whedon', 'Nathan Fillion', 2005)
-          USING TTL 86400 IF NOT EXISTS;
+          IF NOT EXISTS
+          USING TTL 86400;

 The ``INSERT`` statement writes one or more columns for a given row in a table. Note that since a row is identified by
 its ``PRIMARY KEY``, at least the columns composing it must be specified. The list of columns to insert to must be
--- a/docs/cql/types.rst
+++ b/docs/cql/types.rst
@@ -507,7 +507,7 @@ For example::

  CREATE TABLE superheroes (
       name frozen<full_name> PRIMARY KEY,
-       home address
+       home frozen<address>
  );

 .. note::
--- a/docs/operating-scylla/nodetool-commands/removenode.rst
+++ b/docs/operating-scylla/nodetool-commands/removenode.rst
@@ -45,7 +45,7 @@ Example:

 .. code-block:: console

-    nodetool removenode 675ed9f4-6564-6dbd-ca08-43fddce952de
+    nodetool removenode 675ed9f4-6564-6dbd-ca08-43fddce952de

 To only mark the node as permanently down without doing actual removal, use :doc:`nodetool excludenode </operating-scylla/nodetool-commands/excludenode>`:

@@ -79,6 +79,6 @@ Example:

 .. code-block:: console

-    nodetool removenode --ignore-dead-nodes 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c,125ed9f4-7777-1db0-aac8-43fddce9123e 675ed9f4-6564-6dbd-ca08-43fddce952de   
+    nodetool removenode --ignore-dead-nodes 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c,125ed9f4-7777-1db0-aac8-43fddce9123e 675ed9f4-6564-6dbd-ca08-43fddce952de

 .. include:: nodetool-index.rst
--- a/docs/operating-scylla/procedures/cluster-management/add-node-to-cluster.rst
+++ b/docs/operating-scylla/procedures/cluster-management/add-node-to-cluster.rst
@@ -74,7 +74,7 @@ Procedure
       --  Address        Load       Tokens  Owns (effective)                         Host ID         Rack
       UN  192.168.1.201  112.82 KB  256     32.7%             8d5ed9f4-7764-4dbd-bad8-43fddce94b7c   B1
       UN  192.168.1.202  91.11 KB   256     32.9%             125ed9f4-7777-1dbn-mac8-43fddce9123e   B1
-       UJ  192.168.1.203  124.42 KB  256     32.6%             675ed9f4-6564-6dbd-ca08-43fddce952de   B1
+       UJ  192.168.1.203  124.42 KB  256     32.6%             675ed9f4-6564-6dbd-can8-43fddce952gy   B1

   Nodes in the cluster finished streaming data to the new node:

@@ -86,7 +86,7 @@ Procedure
        --  Address        Load       Tokens  Owns (effective)                         Host ID         Rack
        UN  192.168.1.201  112.82 KB  256     32.7%             8d5ed9f4-7764-4dbd-bad8-43fddce94b7c   B1
        UN  192.168.1.202  91.11 KB   256     32.9%             125ed9f4-7777-1dbn-mac8-43fddce9123e   B1
-        UN  192.168.1.203  124.42 KB  256     32.6%             675ed9f4-6564-6dbd-ca08-43fddce952de   B1
+        UN  192.168.1.203  124.42 KB  256     32.6%             675ed9f4-6564-6dbd-can8-43fddce952gy   B1

 #. When the new node status is Up Normal (UN), run the :doc:`nodetool cleanup </operating-scylla/nodetool-commands/cleanup>` command on all nodes in the cluster except for the new node that has just been added. Cleanup removes keys that were streamed to the newly added node and are no longer owned by the node.

--- a/docs/operating-scylla/procedures/cluster-management/cluster-platform-migration.rst
+++ b/docs/operating-scylla/procedures/cluster-management/cluster-platform-migration.rst
@@ -192,7 +192,7 @@ Adding new nodes
      --  Address        Load       Tokens  Owns   Host ID                               Rack
      UN  192.168.1.10   500 MB     256     33.3%  8d5ed9f4-7764-4dbd-bad8-43fddce94b7c  RACK0
      UN  192.168.1.11   500 MB     256     33.3%  125ed9f4-7777-1dbn-mac8-43fddce9123e  RACK1
-      UN  192.168.1.12   500 MB     256     33.3%  675ed9f4-6564-6dbd-ca08-43fddce952de  RACK2
+      UN  192.168.1.12   500 MB     256     33.3%  675ed9f4-6564-6dbd-can8-43fddce952gy  RACK2
      UJ  192.168.2.10   250 MB     256     ?      a1b2c3d4-5678-90ab-cdef-112233445566  RACK0

   **Example output after bootstrap completes:**
@@ -205,7 +205,7 @@ Adding new nodes
      --  Address        Load       Tokens  Owns   Host ID                               Rack
      UN  192.168.1.10   400 MB     256     25.0%  8d5ed9f4-7764-4dbd-bad8-43fddce94b7c  RACK0
      UN  192.168.1.11   400 MB     256     25.0%  125ed9f4-7777-1dbn-mac8-43fddce9123e  RACK1
-      UN  192.168.1.12   400 MB     256     25.0%  675ed9f4-6564-6dbd-ca08-43fddce952de  RACK2
+      UN  192.168.1.12   400 MB     256     25.0%  675ed9f4-6564-6dbd-can8-43fddce952gy  RACK2
      UN  192.168.2.10   400 MB     256     25.0%  a1b2c3d4-5678-90ab-cdef-112233445566  RACK0

 #. For tablets-enabled clusters, wait for tablet load balancing to complete.
--- a/docs/operating-scylla/procedures/cluster-management/create-cluster.rst
+++ b/docs/operating-scylla/procedures/cluster-management/create-cluster.rst
@@ -163,5 +163,5 @@ This example shows how to install and configure a three-node cluster using Gossi
   --  Address        Load       Tokens  Owns (effective)                         Host ID         Rack
   UN  192.168.1.201  112.82 KB  256     32.7%             8d5ed9f4-7764-4dbd-bad8-43fddce94b7c   43
   UN  192.168.1.202  91.11 KB   256     32.9%             125ed9f4-7777-1dbn-mac8-43fddce9123e   44
-   UN  192.168.1.203  124.42 KB  256     32.6%             675ed9f4-6564-6dbd-ca08-43fddce952de   45
+   UN  192.168.1.203  124.42 KB  256     32.6%             675ed9f4-6564-6dbd-can8-43fddce952gy   45

--- a/docs/operating-scylla/procedures/cluster-management/decommissioning-data-center.rst
+++ b/docs/operating-scylla/procedures/cluster-management/decommissioning-data-center.rst
@@ -19,7 +19,7 @@ Prerequisites
   --  Address        Load       Tokens  Owns (effective)                         Host ID         Rack
   UN  192.168.1.201  112.82 KB  256     32.7%             8d5ed9f4-7764-4dbd-bad8-43fddce94b7c   B1
   UN  192.168.1.202  91.11 KB   256     32.9%             125ed9f4-7777-1dbn-lac8-23fddce9123e   B1
-   UN  192.168.1.203  124.42 KB  256     32.6%             675ed9f4-6564-6dbd-ca08-43fddce952de   B1
+   UN  192.168.1.203  124.42 KB  256     32.6%             675ed9f4-6564-6dbd-can8-43fddce952gy   B1

   Datacenter: ASIA-DC
   Status=Up/Down
@@ -165,7 +165,7 @@ Procedure
      --  Address        Load       Tokens  Owns (effective)                         Host ID         Rack
      UN  192.168.1.201  112.82 KB  256     32.7%             8d5ed9f4-7764-4dbd-bad8-43fddce94b7c   B1
      UN  192.168.1.202  91.11 KB   256     32.9%             125ed9f4-7777-1dbn-mac8-43fddce9123e   B1
-      UN  192.168.1.203  124.42 KB  256     32.6%             675ed9f4-6564-6dbd-ca08-43fddce952de   B1
+      UN  192.168.1.203  124.42 KB  256     32.6%             675ed9f4-6564-6dbd-can8-43fddce952gy   B1

      Datacenter: EUROPE-DC
      Status=Up/Down
--- a/docs/operating-scylla/procedures/cluster-management/remove-node.rst
+++ b/docs/operating-scylla/procedures/cluster-management/remove-node.rst
@@ -18,7 +18,7 @@ Removing a Running Node
         --  Address        Load       Tokens  Owns (effective)                         Host ID         Rack
         UN  192.168.1.201  112.82 KB  256     32.7%             8d5ed9f4-7764-4dbd-bad8-43fddce94b7c   B1
         UN  192.168.1.202  91.11 KB   256     32.9%             125ed9f4-7777-1dbn-mac8-43fddce9123e   B1
-         UN  192.168.1.203  124.42 KB  256     32.6%             675ed9f4-6564-6dbd-ca08-43fddce952de   B1
+         UN  192.168.1.203  124.42 KB  256     32.6%             675ed9f4-6564-6dbd-can8-43fddce952gy   B1

 #. If the node status is **Up Normal (UN)**, run the :doc:`nodetool decommission </operating-scylla/nodetool-commands/decommission>` command
   to remove the node you are connected to. Using ``nodetool decommission`` is the recommended method for cluster scale-down operations. It prevents data loss
@@ -75,7 +75,7 @@ command providing the Host ID of the node you are removing. See :doc:`nodetool r

 .. code-block:: console
   
-   nodetool removenode 675ed9f4-6564-6dbd-ca08-43fddce952de
+   nodetool removenode 675ed9f4-6564-6dbd-can8-43fddce952gy

 The ``nodetool removenode`` command notifies other nodes that the token range it owns needs to be moved and
 the nodes should redistribute the data using streaming. Using the command does not guarantee the consistency of the rebalanced data if
--- a/docs/operating-scylla/procedures/cluster-management/replace-dead-node-or-more.rst
+++ b/docs/operating-scylla/procedures/cluster-management/replace-dead-node-or-more.rst
@@ -23,7 +23,7 @@ Prerequisites
   --  Address        Load       Tokens  Owns (effective)                         Host ID         Rack
   UN  192.168.1.201  112.82 KB  256     32.7%             8d5ed9f4-7764-4dbd-bad8-43fddce94b7c   B1
   DN  192.168.1.202  91.11 KB   256     32.9%             125ed9f4-7777-1dbn-mac8-43fddce9123e   B1
-   DN  192.168.1.203  124.42 KB  256     32.6%             675ed9f4-6564-6dbd-ca08-43fddce952de   B1
+   DN  192.168.1.203  124.42 KB  256     32.6%             675ed9f4-6564-6dbd-can8-43fddce952gy   B1

 Login to one of the nodes in the cluster with (UN) status, collect the following info from the node:

--- a/docs/operating-scylla/procedures/cluster-management/replace-dead-node.rst
+++ b/docs/operating-scylla/procedures/cluster-management/replace-dead-node.rst
@@ -29,7 +29,7 @@ Down (DN), and the node can be replaced.
   --  Address        Load       Tokens  Owns (effective)                         Host ID         Rack
   UN  192.168.1.201  112.82 KB  256     32.7%             8d5ed9f4-7764-4dbd-bad8-43fddce94b7c   B1
   UN  192.168.1.202  91.11 KB   256     32.9%             125ed9f4-7777-1dbn-mac8-43fddce9123e   B1
-   DN  192.168.1.203  124.42 KB  256     32.6%             675ed9f4-6564-6dbd-ca08-43fddce952de   B1
+   DN  192.168.1.203  124.42 KB  256     32.6%             675ed9f4-6564-6dbd-can8-43fddce952gy   B1

 Remove the Data
 ==================
@@ -72,7 +72,7 @@ Procedure

   For example (using the Host ID of the failed node from above):

-   ``replace_node_first_boot: 675ed9f4-6564-6dbd-ca08-43fddce952de``
+   ``replace_node_first_boot: 675ed9f4-6564-6dbd-can8-43fddce952gy``

 #. Start the new node.

@@ -90,7 +90,7 @@ Procedure
       --  Address        Load       Tokens  Owns (effective)                         Host ID         Rack
       UN  192.168.1.201  112.82 KB  256     32.7%             8d5ed9f4-7764-4dbd-bad8-43fddce94b7c   B1
       UN  192.168.1.202  91.11 KB   256     32.9%             125ed9f4-7777-1dbn-mac8-43fddce9123e   B1
-       DN  192.168.1.203  124.42 KB  256     32.6%             675ed9f4-6564-6dbd-ca08-43fddce952de   B1
+       DN  192.168.1.203  124.42 KB  256     32.6%             675ed9f4-6564-6dbd-can8-43fddce952gy   B1
    
    ``192.168.1.203`` is the dead node.
    
@@ -121,7 +121,7 @@ Procedure
       /192.168.1.203
         generation:1553759866
         heartbeat:2147483647
-         HOST_ID:675ed9f4-6564-6dbd-ca08-43fddce952de
+         HOST_ID:675ed9f4-6564-6dbd-can8-43fddce952gy
         STATUS:shutdown,true
         RELEASE_VERSION:3.0.8
         X3:3
@@ -178,7 +178,7 @@ In this case, the node's data will be cleaned after restart. To remedy this, you

   .. code-block:: none

-      echo 'replace_node_first_boot: 675ed9f4-6564-6dbd-ca08-43fddce952de' | sudo tee --append /etc/scylla/scylla.yaml
+      echo 'replace_node_first_boot: 675ed9f4-6564-6dbd-can8-43fddce952gy' | sudo tee --append /etc/scylla/scylla.yaml

 #. Run the following command to re-setup RAID

--- a/docs/upgrade/upgrade-guides/index.rst
+++ b/docs/upgrade/upgrade-guides/index.rst
@@ -4,7 +4,7 @@ Upgrade ScyllaDB

 .. toctree::
   
-   ScyllaDB 2025.x to ScyllaDB 2026.1 <upgrade-guide-from-2025.x-to-2026.1/index>
+   ScyllaDB 2026.1 to ScyllaDB 2026.2 <upgrade-guide-from-2026.1-to-2026.2/index>
   ScyllaDB 2026.x Patch Upgrades <upgrade-guide-from-2026.x.y-to-2026.x.z>
   ScyllaDB Image <ami-upgrade>

--- a/docs/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2026.1/index.rst
+++ b/docs/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2026.1/index.rst
@@ -1,13 +0,0 @@
-==========================================================
-Upgrade - ScyllaDB 2025.x to ScyllaDB 2026.1
-==========================================================
-
-.. toctree::
-   :maxdepth: 2
-   :hidden:
-
-   Upgrade ScyllaDB <upgrade-guide-from-2025.x-to-2026.1>
-   Metrics Update <metric-update-2025.x-to-2026.1>
-
-* :doc:`Upgrade from ScyllaDB 2025.x to ScyllaDB 2026.1 <upgrade-guide-from-2025.x-to-2026.1>`
-* :doc:`Metrics Update Between 2025.x and 2026.1 <metric-update-2025.x-to-2026.1>`
--- a/docs/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2026.1/metric-update-2025.x-to-2026.1.rst
+++ b/docs/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2026.1/metric-update-2025.x-to-2026.1.rst
@@ -1,82 +0,0 @@
-.. |SRC_VERSION| replace:: 2025.x
-.. |NEW_VERSION| replace:: 2026.1
-.. |PRECEDING_VERSION| replace:: 2025.4
-
-================================================================
-Metrics Update Between |SRC_VERSION| and |NEW_VERSION|
-================================================================
-
-.. toctree::
-   :maxdepth: 2
-   :hidden:
-
-ScyllaDB |NEW_VERSION| Dashboards are available as part of the latest |mon_root|.
-
-
-New Metrics in |NEW_VERSION|
--------------------------------------
-
-The following metrics are new in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.
-
-.. list-table::
-   :widths: 25 150
-   :header-rows: 1
-
-   * - Metric
-     - Description
-   * - scylla_alternator_operation_size_kb
-     - Histogram of item sizes involved in a request.
-   * - scylla_column_family_total_disk_space_before_compression
-     - Hypothetical total disk space used if data files weren't compressed
-   * - scylla_group_name_auto_repair_enabled_nr
-     - Number of tablets with auto repair enabled.
-   * - scylla_group_name_auto_repair_needs_repair_nr
-     - Number of tablets with auto repair enabled that currently need repair.
-   * - scylla_lsa_compact_time_ms
-     - Total time spent on segment compaction that was not accounted under ``reclaim_time_ms``.
-   * - scylla_lsa_evict_time_ms
-     - Total time spent on evicting objects that was not accounted under ``reclaim_time_ms``,
-   * - scylla_lsa_reclaim_time_ms
-     - Total time spent in reclaiming LSA memory back to std allocator.
-   * - scylla_object_storage_memory_usage
-     - Total number of bytes consumed by the object storage client.
-   * - scylla_tablet_ops_failed
-     - Number of failed tablet auto repair attempts.
-   * - scylla_tablet_ops_succeeded
-     - Number of successful tablet auto repair attempts.
-   
-Renamed Metrics in |NEW_VERSION|
--------------------------------------
-
-The following metrics are renamed in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.
-
-.. list-table::
-   :widths: 25 150
-   :header-rows: 1
-
-   * - Metric Name in |PRECEDING_VERSION|
-     - Metric Name in |NEW_VERSION|
-   * - scylla_s3_memory_usage
-     - scylla_object_storage_memory_usage
-
-Removed Metrics in |NEW_VERSION|
--------------------------------------
-
-The following metrics are removed in ScyllaDB |NEW_VERSION|.
-
-* scylla_redis_current_connections
-* scylla_redis_op_latency
-* scylla_redis_operation
-* scylla_redis_operation
-* scylla_redis_requests_latency
-* scylla_redis_requests_served
-* scylla_redis_requests_serving
-
-New and Updated Metrics in Previous Releases
-------------------------------------------------------
-
-* `Metrics Update Between 2025.3 and 2025.4 <https://docs.scylladb.com/manual/branch-2025.4/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2025.4/metric-update-2025.x-to-2025.4.html>`_
-* `Metrics Update Between 2025.2 and 2025.3 <https://docs.scylladb.com/manual/branch-2025.3/upgrade/upgrade-guides/upgrade-guide-from-2025.2-to-2025.3/metric-update-2025.2-to-2025.3.html>`_
-* `Metrics Update Between 2025.1 and 2025.2 <https://docs.scylladb.com/manual/branch-2025.2/upgrade/upgrade-guides/upgrade-guide-from-2025.1-to-2025.2/metric-update-2025.1-to-2025.2.html>`_
-
-
--- a/docs/upgrade/upgrade-guides/upgrade-guide-from-2026.1-to-2026.2/index.rst
+++ b/docs/upgrade/upgrade-guides/upgrade-guide-from-2026.1-to-2026.2/index.rst
@@ -0,0 +1,13 @@
+==========================================================
+Upgrade - ScyllaDB 2026.1 to ScyllaDB 2026.2
+==========================================================
+
+.. toctree::
+   :maxdepth: 2
+   :hidden:
+
+   Upgrade ScyllaDB <upgrade-guide-from-2026.1-to-2026.2>
+   Metrics Update <metric-update-2026.1-to-2026.2>
+
+* :doc:`Upgrade from ScyllaDB 2026.1 to ScyllaDB 2026.2 <upgrade-guide-from-2026.1-to-2026.2>`
+* :doc:`Metrics Update Between 2026.1 and 2026.2 <metric-update-2026.1-to-2026.2>`
--- a/docs/upgrade/upgrade-guides/upgrade-guide-from-2026.1-to-2026.2/metric-update-2026.1-to-2026.2.rst
+++ b/docs/upgrade/upgrade-guides/upgrade-guide-from-2026.1-to-2026.2/metric-update-2026.1-to-2026.2.rst
@@ -0,0 +1,126 @@
+.. |SRC_VERSION| replace:: 2026.1
+.. |NEW_VERSION| replace:: 2026.2
+.. |PRECEDING_VERSION| replace:: 2026.1
+
+================================================================
+Metrics Update Between |SRC_VERSION| and |NEW_VERSION|
+================================================================
+
+.. toctree::
+   :maxdepth: 2
+   :hidden:
+
+ScyllaDB |NEW_VERSION| Dashboards are available as part of the latest |mon_root|.
+
+
+New Metrics in |NEW_VERSION|
+--------------------------------------
+
+The following metrics are new in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.
+
+.. list-table::
+   :widths: 25 150
+   :header-rows: 1
+
+   * - Metric
+     - Description
+   * - scylla_auth_cache_permissions
+     - Total number of permission sets currently cached across all roles.
+   * - scylla_auth_cache_roles
+     - Number of roles currently cached.
+   * - scylla_cql_forwarded_requests
+     - Counts the total number of attempts to forward CQL requests to other nodes.
+       One request may be forwarded multiple times, particularly when a write is
+       handled by a non-replica node.
+   * - scylla_cql_write_consistency_levels_disallowed_violations
+     - Counts the number of write_consistency_levels_disallowed guardrail violations,
+       i.e. attempts to write with a forbidden consistency level.
+   * - scylla_cql_write_consistency_levels_warned_violations
+     - Counts the number of write_consistency_levels_warned guardrail violations,
+       i.e. attempts to write with a discouraged consistency level.
+   * - scylla_cql_writes_per_consistency_level
+     - Counts the number of writes for each consistency level.
+   * - scylla_io_queue_integrated_disk_queue_length
+     - Length of the integrated disk queue.
+   * - scylla_io_queue_integrated_queue_length
+     - Length of the integrated queue.
+   * - scylla_logstor_sm_bytes_freed
+     - Counts the number of data bytes freed.
+   * - scylla_logstor_sm_bytes_read
+     - Counts the number of bytes read from the disk.
+   * - scylla_logstor_sm_bytes_written
+     - Counts the number of bytes written to the disk.
+   * - scylla_logstor_sm_compaction_bytes_written
+     - Counts the number of bytes written to the disk by compaction.
+   * - scylla_logstor_sm_compaction_data_bytes_written
+     - Counts the number of data bytes written to the disk by compaction.
+   * - scylla_logstor_sm_compaction_records_rewritten
+     - Counts the number of records rewritten during compaction.
+   * - scylla_logstor_sm_compaction_records_skipped
+     - Counts the number of records skipped during compaction.
+   * - scylla_logstor_sm_compaction_segments_freed
+     - Counts the number of segments freed by compaction.
+   * - scylla_logstor_sm_disk_usage
+     - Total disk usage.
+   * - scylla_logstor_sm_free_segments
+     - Counts the number of free segments currently available.
+   * - scylla_logstor_sm_segment_pool_compaction_segments_get
+     - Counts the number of segments taken from the segment pool for compaction.
+   * - scylla_logstor_sm_segment_pool_normal_segments_get
+     - Counts the number of segments taken from the segment pool for normal writes.
+   * - scylla_logstor_sm_segment_pool_normal_segments_wait
+     - Counts the number of times normal writes had to wait for a segment to become
+       available in the segment pool.
+   * - scylla_logstor_sm_segment_pool_segments_put
+     - Counts the number of segments returned to the segment pool.
+   * - scylla_logstor_sm_segment_pool_separator_segments_get
+     - Counts the number of segments taken from the segment pool for separator writes.
+   * - scylla_logstor_sm_segment_pool_size
+     - Counts the number of segments in the segment pool.
+   * - scylla_logstor_sm_segments_allocated
+     - Counts the number of segments allocated.
+   * - scylla_logstor_sm_segments_compacted
+     - Counts the number of segments compacted.
+   * - scylla_logstor_sm_segments_freed
+     - Counts the number of segments freed.
+   * - scylla_logstor_sm_segments_in_use
+     - Counts the number of segments currently in use.
+   * - scylla_logstor_sm_separator_buffer_flushed
+     - Counts the number of times the separator buffer has been flushed.
+   * - scylla_logstor_sm_separator_bytes_written
+     - Counts the number of bytes written to the separator.
+   * - scylla_logstor_sm_separator_data_bytes_written
+     - Counts the number of data bytes written to the separator.
+   * - scylla_logstor_sm_separator_flow_control_delay
+     - Current delay applied to writes to control separator debt in microseconds.
+   * - scylla_logstor_sm_separator_segments_freed
+     - Counts the number of segments freed by the separator.
+   * - scylla_transport_cql_pending_response_memory
+     - Holds the total memory in bytes consumed by responses waiting to be sent.
+   * - scylla_transport_cql_request_histogram_bytes
+     - A histogram of received bytes in CQL messages of a specific kind and
+       specific scheduling group.
+   * - scylla_transport_cql_requests_serving
+     - Holds the number of requests that are being processed right now.
+   * - scylla_transport_cql_response_histogram_bytes
+     - A histogram of sent bytes in CQL messages of a specific kind and
+       specific scheduling group.
+   * - scylla_transport_requests_forwarded_failed
+     - Counts the number of requests that were forwarded to another replica
+       but failed to execute there.
+   * - scylla_transport_requests_forwarded_prepared_not_found
+     - Counts the number of requests that were forwarded to another replica
+       but failed there because the statement was not prepared on the target.
+       When this happens, the coordinator performs an additional remote call
+       to prepare the statement on the replica and retries the EXECUTE request
+       afterwards.
+   * - scylla_transport_requests_forwarded_redirected
+     - Counts the number of requests that were forwarded to another replica
+       but that replica responded with a redirect to another node. This can
+       happen when the replica has stale information about the cluster topology or
+       when the request is handled by a node that is not a replica for the data
+       being accessed by the request.
+   * - scylla_transport_requests_forwarded_successfully
+     - Counts the number of requests that were forwarded to another replica
+       and executed successfully there.
+
--- a/docs/upgrade/upgrade-guides/upgrade-guide-from-2026.1-to-2026.2/upgrade-guide-from-2026.1-to-2026.2.rst
+++ b/docs/upgrade/upgrade-guides/upgrade-guide-from-2026.1-to-2026.2/upgrade-guide-from-2026.1-to-2026.2.rst
@@ -1,13 +1,13 @@
 .. |SCYLLA_NAME| replace:: ScyllaDB

-.. |SRC_VERSION| replace:: 2025.x
-.. |NEW_VERSION| replace:: 2026.1
+.. |SRC_VERSION| replace:: 2026.1
+.. |NEW_VERSION| replace:: 2026.2

 .. |ROLLBACK| replace:: rollback
 .. _ROLLBACK: ./#rollback-procedure

-.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - ScyllaDB 2025.x to 2026.1
-.. _SCYLLA_METRICS: ../metric-update-2025.x-to-2026.1
+.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - ScyllaDB 2026.1 to 2026.2
+.. _SCYLLA_METRICS: ../metric-update-2026.1-to-2026.2

 =======================================================================================
 Upgrade from |SCYLLA_NAME| |SRC_VERSION| to |SCYLLA_NAME| |NEW_VERSION|
--- a/ent/encryption/kmip_host.cc
+++ b/ent/encryption/kmip_host.cc
@@ -598,7 +598,7 @@ future<int> kmip_host::impl::do_cmd(KMIP_CMD* cmd, con_ptr cp, Func& f, bool ret

 template<typename Func>
 future<kmip_host::impl::kmip_cmd> kmip_host::impl::do_cmd(kmip_cmd cmd_in, Func && f) {
-    kmip_log.trace("{}: begin do_cmd", *this, cmd_in);
+    kmip_log.trace("{}: begin do_cmd {}", *this, cmd_in);
    KMIP_CMD* cmd = cmd_in;

    // #998 Need to do retry loop, because we can have either timed out connection,
--- a/ent/encryption/kms_host.cc
+++ b/ent/encryption/kms_host.cc
@@ -616,7 +616,7 @@ future<rjson::value> encryption::kms_host::impl::do_post(std::string_view target
            static auto get_xml_node = [](node_type* node, const char* what) {
                auto res = node->first_node(what);
                if (!res) {
-                    throw malformed_response_error(fmt::format("XML parse error", what));
+                    throw malformed_response_error(fmt::format("XML parse error: {}", what));
                }
                return res;
            };
--- a/gms/feature_service.cc
+++ b/gms/feature_service.cc
@@ -109,6 +109,7 @@ std::set<std::string_view> feature_service::supported_feature_set() const {
        "UUID_SSTABLE_IDENTIFIERS"sv,
        "GROUP0_SCHEMA_VERSIONING"sv,
        "VIEW_BUILD_STATUS_ON_GROUP0"sv,
+        "CDC_GENERATIONS_V2"sv,
    };

    if (is_test_only_feature_deprecated()) {
--- a/gms/feature_service.hh
+++ b/gms/feature_service.hh
@@ -83,7 +83,6 @@ public:
    gms::feature alternator_ttl { *this, "ALTERNATOR_TTL"sv };
    gms::feature cql_row_ttl { *this, "CQL_ROW_TTL"sv };
    gms::feature range_scan_data_variant { *this, "RANGE_SCAN_DATA_VARIANT"sv };
-    gms::feature cdc_generations_v2 { *this, "CDC_GENERATIONS_V2"sv };
    gms::feature user_defined_aggregates { *this, "UDA"sv };
    // Historically max_result_size contained only two fields: soft_limit and
    // hard_limit. It was somehow obscure because for normal paged queries both
--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -399,9 +399,10 @@ future<> gossiper::do_send_ack2_msg(locator::host_id from, utils::chunked_vector
        }
    }
    gms::gossip_digest_ack2 ack2_msg(std::move(delta_ep_state_map));
-    logger.debug("Calling do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg);
+    auto ack2_msg_str = fmt::format("{}", ack2_msg);
+    logger.debug("Calling do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg_str);
    co_await ser::gossip_rpc_verbs::send_gossip_digest_ack2(&_messaging, from, std::move(ack2_msg));
-    logger.debug("finished do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg);
+    logger.debug("finished do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg_str);
 }

 // Depends on
@@ -964,8 +965,7 @@ future<> gossiper::failure_detector_loop_for_node(locator::host_id host_id, gene
        diff = now - last;
        if (!failed) {
            last = now;
-        }
-        if (diff > max_duration) {
+        } else if (diff > max_duration) {
            logger.info("failure_detector_loop: Mark node {}/{} as DOWN", host_id, node);
            co_await container().invoke_on(0, [host_id] (gms::gossiper& g) {
                return g.convict(host_id);
--- a/init.cc
+++ b/init.cc
@@ -87,9 +87,6 @@ std::set<sstring> get_disabled_features_from_db_config(const db::config& cfg, st
        }
    }

-    if (!cfg.check_experimental(db::experimental_features_t::feature::ALTERNATOR_STREAMS)) {
-        disabled.insert("ALTERNATOR_STREAMS"s);
-    }
    if (!cfg.check_experimental(db::experimental_features_t::feature::KEYSPACE_STORAGE_OPTIONS)) {
        disabled.insert("KEYSPACE_STORAGE_OPTIONS"s);
    }
--- a/main.cc
+++ b/main.cc
@@ -1358,6 +1358,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            };
            spcfg.hinted_handoff_enabled = hinted_handoff_enabled;
            spcfg.available_memory = memory::stats().total_memory();
+            spcfg.maintenance_mode = maintenance_mode_enabled{cfg->maintenance_mode()};
            smp_service_group_config storage_proxy_smp_service_group_config;
            // Assuming less than 1kB per queued request, this limits storage_proxy submit_to() queues to 5MB or less
            storage_proxy_smp_service_group_config.max_nonlocal_requests = 5000;
@@ -1366,7 +1367,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            spcfg.write_mv_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get();
            spcfg.hints_write_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get();
            spcfg.write_ack_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get();
-            static db::view::node_update_backlog node_backlog(smp::count, 10ms);
+            static db::view::node_update_backlog node_backlog(smp::count, 10ms, cfg->view_flow_control_delay_limit_in_ms);
            scheduling_group_key_config storage_proxy_stats_cfg =
                    make_scheduling_group_key_config<service::storage_proxy_stats::stats>();
            storage_proxy_stats_cfg.constructor = [plain_constructor = storage_proxy_stats_cfg.constructor] (void* ptr) {
@@ -1810,6 +1811,18 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            utils::get_local_injector().inject("stop_after_starting_migration_manager",
                [] { std::raise(SIGSTOP); });

+            // Audit must be constructed before the maintenance socket so
+            // that on shutdown (reverse destruction order) the audit service
+            // outlives the maintenance socket and in-flight queries can
+            // still reach audit::inspect() safely.
+            checkpoint(stop_signal, "starting audit service");
+            audit::audit::start_audit(*cfg, token_metadata, qp, mm).handle_exception([&] (auto&& e) {
+                startlog.error("audit start failed: {}", e);
+            }).get();
+            auto audit_stop = defer([] {
+                audit::audit::stop_audit().get();
+            });
+
            // XXX: stop_raft has to happen before query_processor and migration_manager
            // is stopped, since some groups keep using the query
            // processor until are stopped inside stop_raft.
@@ -1841,7 +1854,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            });

            checkpoint(stop_signal, "starting view update generator");
-            view_update_generator.start(std::ref(db), std::ref(proxy), std::ref(stop_signal.as_sharded_abort_source())).get();
+            view_update_generator.start(std::ref(db), std::ref(proxy), std::ref(node_backlog), std::ref(stop_signal.as_sharded_abort_source())).get();
            auto stop_view_update_generator = defer_verbose_shutdown("view update generator", [] {
                view_update_generator.stop().get();
            });
@@ -2287,10 +2300,12 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
               ss.local().wait_for_group0_stop().get();
            });

-            // Setup group0 early in case the node is bootstrapped already and the group exists.
-            // Need to do it before allowing incoming messaging service connections since
-            // storage proxy's and migration manager's verbs may access group0.
-            group0_service.setup_group0_if_exist(sys_ks.local(), ss.local(), qp.local(), mm.local()).get();
+            if (!group0_service.maintenance_mode() && sys_ks.local().bootstrap_complete()) {
+                // Setup group0 early in case the node is bootstrapped already and the group exists.
+                // Need to do it before allowing incoming messaging service connections since
+                // storage proxy's and migration manager's verbs may access group0.
+                group0_service.setup_group0_if_exist(sys_ks.local(), ss.local(), qp.local(), mm.local()).get();
+            }

            // The call to setup_group0_if_exists() above guarantees that, if group0 is
            // created and started, the locally persisted group0 state has been applied
@@ -2340,15 +2355,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            }).get();
            stop_signal.ready(false);

-            if (cfg->maintenance_socket() != "ignore") {
-                // Enable role operations now that node joined the cluster
-                maintenance_auth_service.invoke_on_all([](auth::service& svc) {
-                    return auth::ensure_role_operations_are_enabled(svc);
-                }).get();
-
-                start_cql(*cql_maintenance_server_ctl, stop_maintenance_cql, "maintenance native server");
-            }
-
            // At this point, `locator::topology` should be stable, i.e. we should have complete information
            // about the layout of the cluster (= list of nodes along with the racks/DCs).
            startlog.info("Verifying that all of the keyspaces are RF-rack-valid");
@@ -2357,16 +2363,23 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            startlog.info("Verifying that all of the tablet keyspaces use rack list replication factors");
            db.local().check_rack_list_everywhere(cfg->enforce_rack_list());

-            // Start audit service after join_cluster so that the table-based audit backend
-            // can properly create its keyspace and table.
-            checkpoint(stop_signal, "starting audit service");
-            audit::audit::start_audit(*cfg, token_metadata, qp, mm).handle_exception([&] (auto&& e) {
-                startlog.error("audit start failed: {}", e);
-            }).get();
-            auto audit_stop = defer([] {
-                audit::audit::stop_audit().get();
+            // The table-based audit backend needs Raft (via join_cluster)
+            // to create its keyspace and table.
+            checkpoint(stop_signal, "starting audit storage");
+            audit::audit::start_storage(*cfg).get();
+            auto audit_storage_stop = defer([] {
+                audit::audit::stop_storage().get();
            });

+            if (cfg->maintenance_socket() != "ignore") {
+                // Enable role operations now that node joined the cluster
+                maintenance_auth_service.invoke_on_all([](auth::service& svc) {
+                    return auth::ensure_role_operations_are_enabled(svc);
+                }).get();
+
+                start_cql(*cql_maintenance_server_ctl, stop_maintenance_cql, "maintenance native server");
+            }
+
            // Semantic validation of sstable compression parameters from config.
            // Adding here (i.e., after `join_cluster`) to ensure that the
            // required SSTABLE_COMPRESSION_DICTS cluster feature has been negotiated.
--- a/mutation/atomic_cell.hh
+++ b/mutation/atomic_cell.hh
@@ -48,8 +48,8 @@ static void set_field(atomic_cell_value& out, unsigned offset, T val) {
 }

 template <FragmentRange Buffer>
-static void set_value(managed_bytes& b, unsigned value_offset, const Buffer& value) {
-    auto v = managed_bytes_mutable_view(b).substr(value_offset, value.size_bytes());
+static void set_value(atomic_cell_value_mutable_view b, unsigned value_offset, const Buffer& value) {
+    auto v = b.substr(value_offset, value.size_bytes());
    for (auto frag : value) {
        write_fragmented(v, single_fragmented_view(frag));
    }
@@ -141,20 +141,36 @@ public:
        SCYLLA_ASSERT(is_live_and_has_ttl(cell));
        return gc_clock::duration(get_field<int32_t>(cell, ttl_offset));
    }
-    static managed_bytes make_dead(api::timestamp_type timestamp, gc_clock::time_point deletion_time) {
-        managed_bytes b(managed_bytes::initialized_later(), flags_size + timestamp_size + deletion_time_size);
+    static size_t dead_serialized_size() {
+        return flags_size + timestamp_size + deletion_time_size;
+    }
+    static size_t live_serialized_size(size_t value_size) {
+        return flags_size + timestamp_size + value_size;
+    }
+    static size_t live_expiring_serialized_size(size_t value_size) {
+        return flags_size + timestamp_size + expiry_size + ttl_size + value_size;
+    }
+    static void write_dead(atomic_cell_value_mutable_view b, api::timestamp_type timestamp, gc_clock::time_point deletion_time) {
        b[0] = 0;
        set_field(b, timestamp_offset, timestamp);
        set_field(b, deletion_time_offset, static_cast<int64_t>(deletion_time.time_since_epoch().count()));
+    }
+    static managed_bytes make_dead(api::timestamp_type timestamp, gc_clock::time_point deletion_time) {
+        managed_bytes b(managed_bytes::initialized_later(), dead_serialized_size());
+        write_dead(b, timestamp, deletion_time);
        return b;
    }
    template <FragmentRange Buffer>
-    static managed_bytes make_live(api::timestamp_type timestamp, const Buffer& value) {
+    static void write_live(atomic_cell_value_mutable_view b, api::timestamp_type timestamp, const Buffer& value) {
        auto value_offset = flags_size + timestamp_size;
-        managed_bytes b(managed_bytes::initialized_later(), value_offset + value.size_bytes());
        b[0] = LIVE_FLAG;
        set_field(b, timestamp_offset, timestamp);
        set_value(b, value_offset, value);
+    }
+    template <FragmentRange Buffer>
+    static managed_bytes make_live(api::timestamp_type timestamp, const Buffer& value) {
+        managed_bytes b(managed_bytes::initialized_later(), live_serialized_size(value.size_bytes()));
+        write_live(b, timestamp, value);
        return b;
    }
    static managed_bytes make_live_counter_update(api::timestamp_type timestamp, int64_t value) {
@@ -166,14 +182,18 @@ public:
        return b;
    }
    template <FragmentRange Buffer>
-    static managed_bytes make_live(api::timestamp_type timestamp, const Buffer& value, gc_clock::time_point expiry, gc_clock::duration ttl) {
+    static void write_live(atomic_cell_value_mutable_view b, api::timestamp_type timestamp, const Buffer& value, gc_clock::time_point expiry, gc_clock::duration ttl) {
        auto value_offset = flags_size + timestamp_size + expiry_size + ttl_size;
-        managed_bytes b(managed_bytes::initialized_later(), value_offset + value.size_bytes());
        b[0] = EXPIRY_FLAG | LIVE_FLAG;
        set_field(b, timestamp_offset, timestamp);
        set_field(b, expiry_offset, static_cast<int64_t>(expiry.time_since_epoch().count()));
        set_field(b, ttl_offset, static_cast<int32_t>(ttl.count()));
        set_value(b, value_offset, value);
+    }
+    template <FragmentRange Buffer>
+    static managed_bytes make_live(api::timestamp_type timestamp, const Buffer& value, gc_clock::time_point expiry, gc_clock::duration ttl) {
+        managed_bytes b(managed_bytes::initialized_later(), live_expiring_serialized_size(value.size_bytes()));
+        write_live(b, timestamp, value, expiry, ttl);
        return b;
    }
    static managed_bytes make_live_uninitialized(api::timestamp_type timestamp, size_t size) {
--- a/mutation/canonical_mutation.cc
+++ b/mutation/canonical_mutation.cc
@@ -113,10 +113,10 @@ auto fmt::formatter<canonical_mutation>::format(const canonical_mutation& cm, fm
            auto&& entry = _cm.static_column_at(id);
            _os = fmt::format_to(_os, "static column {} {}", bytes_to_text(entry.name()), atomic_cell::printer(*entry.type(), ac));
        }
-        virtual void accept_static_cell(column_id id, collection_mutation_view cmv) override {
+        virtual void accept_static_cell(column_id id, collection_mutation cm) override {
            print_separator();
            auto&& entry = _cm.static_column_at(id);
-            _os = fmt::format_to(_os, "static column {} {}", bytes_to_text(entry.name()), collection_mutation_view::printer(*entry.type(), cmv));
+            _os = fmt::format_to(_os, "static column {} {}", bytes_to_text(entry.name()), collection_mutation_view::printer(*entry.type(), cm));
        }
        virtual stop_iteration accept_row_tombstone(range_tombstone rt) override {
            print_separator();
@@ -137,10 +137,10 @@ auto fmt::formatter<canonical_mutation>::format(const canonical_mutation& cm, fm
            auto&& entry = _cm.regular_column_at(id);
            _os = fmt::format_to(_os, "column {} {}", bytes_to_text(entry.name()), atomic_cell::printer(*entry.type(), ac));
        }
-        virtual void accept_row_cell(column_id id, collection_mutation_view cmv) override {
+        virtual void accept_row_cell(column_id id, collection_mutation cm) override {
            print_separator();
            auto&& entry = _cm.regular_column_at(id);
-            _os = fmt::format_to(_os, "column {} {}", bytes_to_text(entry.name()), collection_mutation_view::printer(*entry.type(), cmv));
+            _os = fmt::format_to(_os, "column {} {}", bytes_to_text(entry.name()), collection_mutation_view::printer(*entry.type(), cm));
        }
        out_t finalize() {
            if (_in_row) {
--- a/mutation/collection_mutation.cc
+++ b/mutation/collection_mutation.cc
@@ -7,12 +7,14 @@
 */

 #include "utils/assert.hh"
+#include "utils/on_internal_error.hh"
 #include "types/collection.hh"
 #include "types/user.hh"
 #include "types/concrete_types.hh"
 #include "mutation/mutation_partition.hh"
 #include "compaction/compaction_garbage_collector.hh"
 #include "combine.hh"
+#include "idl/mutation.dist.impl.hh"

 #include "collection_mutation.hh"

@@ -224,13 +226,26 @@ compact_and_expire_result collection_mutation_description::compact_and_expire(co
    return res;
 }

-template <typename Iterator>
+/// A CollectionMutationAdaptor is a static interface that adapts a collection
+/// element (an iterator value type) to the serialization requirements of
+/// serialize_collection_mutation(). It provides static methods to measure the
+/// serialized sizes and to write the key and value of each element into a buffer.
+template <typename Adaptor, typename Element>
+concept CollectionMutationAdaptor = requires(const Element& e, managed_bytes_mutable_view& out) {
+    { Adaptor::key_size(e) } -> std::convertible_to<size_t>;
+    { Adaptor::value_size(e) } -> std::convertible_to<size_t>;
+    { Adaptor::write_key(e, out) };
+    { Adaptor::write_value(e, out) };
+};
+
+template <typename Adaptor, typename Iterator>
+    requires CollectionMutationAdaptor<Adaptor, std::iter_value_t<Iterator>>
 static collection_mutation serialize_collection_mutation(
        const abstract_type& type,
        const tombstone& tomb,
        std::ranges::subrange<Iterator> cells) {
    auto element_size = [] (size_t c, auto&& e) -> size_t {
-        return c + 8 + e.first.size() + e.second.serialize().size();
+        return c + 8 + Adaptor::key_size(e) + Adaptor::value_size(e);
    };
    auto size = std::ranges::fold_left(cells, (size_t)4, element_size);
    size += 1;
@@ -244,32 +259,112 @@ static collection_mutation serialize_collection_mutation(
        write<int64_t>(out, tomb.timestamp);
        write<int64_t>(out, tomb.deletion_time.time_since_epoch().count());
    }
-    auto writek = [&out] (bytes_view v) {
-        write<int32_t>(out, v.size());
-        write_fragmented(out, single_fragmented_view(v));
+    auto writek = [&out] (auto& kv) {
+        write<int32_t>(out, Adaptor::key_size(kv));
+        Adaptor::write_key(kv, out);
    };
-    auto writev = [&out] (managed_bytes_view v) {
-        write<int32_t>(out, v.size());
-        write_fragmented(out, v);
+    auto writev = [&out] (auto& kv) {
+        write<int32_t>(out, Adaptor::value_size(kv));
+        Adaptor::write_value(kv, out);
    };
    // FIXME: overflow?
    write<int32_t>(out, std::ranges::distance(cells));
    for (auto&& kv : cells) {
-        auto&& k = kv.first;
-        auto&& v = kv.second;
-        writek(k);
-
-        writev(v.serialize());
+        writek(kv);
+        writev(kv);
    }
    return collection_mutation(type, std::move(ret));
 }

+namespace {
+
+/// A key-value pair whose key is bytes-like (exposes size()) and whose value is an atomic_cell-like
+/// type with a serialize() method whose result is convertible to managed_bytes_view.
+template <typename T>
+concept AtomicCellKV = requires(const T& kv) {
+    { kv.first.size() } -> std::convertible_to<size_t>;
+    { kv.second.serialize() } -> std::convertible_to<managed_bytes_view>;
+};
+
+struct atomic_cell_adaptor {
+    static size_t key_size(const AtomicCellKV auto& v) { return v.first.size(); }
+    static size_t value_size(const AtomicCellKV auto& v) { return v.second.serialize().size(); }
+
+    static void write_key(const AtomicCellKV auto& v, managed_bytes_mutable_view& out) {
+        write_fragmented(out, single_fragmented_view(v.first));
+    }
+    static void write_value(const AtomicCellKV auto& v, managed_bytes_mutable_view& out) {
+        write_fragmented(out, v.second.serialize());
+    }
+};
+
+}
+
 collection_mutation collection_mutation_description::serialize(const abstract_type& type) const {
-    return serialize_collection_mutation(type, tomb, std::ranges::subrange(cells.begin(), cells.end()));
+    return serialize_collection_mutation<atomic_cell_adaptor>(type, tomb, std::ranges::subrange(cells.begin(), cells.end()));
 }

 collection_mutation collection_mutation_view_description::serialize(const abstract_type& type) const {
-    return serialize_collection_mutation(type, tomb, std::ranges::subrange(cells.begin(), cells.end()));
+    return serialize_collection_mutation<atomic_cell_adaptor>(type, tomb, std::ranges::subrange(cells.begin(), cells.end()));
+}
+
+namespace {
+
+struct serialized_cell_adaptor {
+    static size_t key_size(const ser::collection_element_view& v) {
+        return v.key().view().size_bytes();
+    }
+
+    static size_t value_size(const ser::collection_element_view& v) {
+        struct collection_cell_visitor {
+            size_t operator()(const ser::live_cell_view& lcv) const { return atomic_cell_type::live_serialized_size(lcv.value().view().size_bytes()); }
+            size_t operator()(const ser::expiring_cell_view& ecv) const { return atomic_cell_type::live_expiring_serialized_size(ecv.c().value().view().size_bytes()); }
+            size_t operator()(const ser::dead_cell_view& dcv) const { return atomic_cell_type::dead_serialized_size(); }
+            size_t operator()(const ser::counter_cell_view& ccv) const { utils::on_internal_error("Trying to deserialize counter cell from collection"); }
+            size_t operator()(const ser::unknown_variant_type&) const { utils::on_internal_error("Trying to deserialize cell in unknown state"); };
+        };
+        return boost::apply_visitor(collection_cell_visitor{}, v.value());
+    }
+
+    static void write_key(const ser::collection_element_view& v, managed_bytes_mutable_view& out) {
+        write_fragmented(out, v.key().view());
+    }
+
+    static void write_value(const ser::collection_element_view& v, managed_bytes_mutable_view& out) {
+        struct collection_cell_visitor {
+            managed_bytes_mutable_view& out;
+
+            void operator()(const ser::live_cell_view& lcv) const {
+                const auto v = lcv.value().view();
+                atomic_cell_type::write_live(out, lcv.created_at(), v);
+                out.remove_prefix(atomic_cell_type::live_serialized_size(v.size_bytes()));
+            }
+            void operator()(const ser::expiring_cell_view& ecv) const {
+                const auto v = ecv.c().value().view();
+                atomic_cell_type::write_live(out, ecv.c().created_at(), v, ecv.expiry(), ecv.ttl());
+                out.remove_prefix(atomic_cell_type::live_expiring_serialized_size(v.size_bytes()));
+            }
+            void operator()(const ser::dead_cell_view& dcv) const {
+                atomic_cell_type::write_dead(out, dcv.tomb().timestamp(), dcv.tomb().deletion_time());
+                out.remove_prefix(atomic_cell_type::dead_serialized_size());
+            }
+            void operator()(const ser::counter_cell_view& ccv) const {
+                utils::on_internal_error("Trying to deserialize counter cell from collection");
+            }
+            void operator()(const ser::unknown_variant_type&) const {
+                utils::on_internal_error("Trying to deserialize cell in unknown state");
+            }
+        };
+        boost::apply_visitor(collection_cell_visitor{out}, v.value());
+    }
+};
+
+}
+
+collection_mutation read_from_collection_cell_view(const abstract_type& type, const ser::collection_cell_view& collection) {
+    auto tomb = collection.tomb();
+    auto cells = collection.elements();
+    return serialize_collection_mutation<serialized_cell_adaptor>(type, tomb, std::ranges::subrange(cells.begin(), cells.end()));
 }

 template <typename C>
--- a/mutation/collection_mutation.hh
+++ b/mutation/collection_mutation.hh
@@ -23,6 +23,10 @@ class row_tombstone;

 class collection_mutation;

+namespace ser {
+class collection_cell_view;
+}
+
 // An auxiliary struct used to (de)construct collection_mutations.
 // Unlike collection_mutation which is a serialized blob, this struct allows to inspect logical units of information
 // (tombstone and cells) inside the mutation easily.
@@ -130,6 +134,12 @@ collection_mutation merge(const abstract_type&, collection_mutation_view, collec

 collection_mutation difference(const abstract_type&, collection_mutation_view, collection_mutation_view);

+// Transcode a collection from the IDL representation directly into the
+// collection_mutation serialization format, without using any intermediary representation.
+// Only the final collection_mutation blob is allocated; no intermediate allocations are made.
+// Safe to use in LSA: it won't produce garbage.
+collection_mutation read_from_collection_cell_view(const abstract_type&, const ser::collection_cell_view&);
+
 // Serializes the given collection of cells to a sequence of bytes ready to be sent over the CQL protocol.
 bytes_ostream serialize_for_cql(const abstract_type&, collection_mutation_view);

--- a/mutation/frozen_mutation.hh
+++ b/mutation/frozen_mutation.hh
@@ -97,9 +97,9 @@ public:
        r.append_cell(id, atomic_cell_or_collection(std::move(cell)));
    }

-    virtual void accept_static_cell(column_id id, collection_mutation_view collection) override {
+    virtual void accept_static_cell(column_id id, collection_mutation collection) override {
        row& r = _static_row.maybe_create();
-        r.append_cell(id, collection_mutation(*_schema.static_column_at(id).type, std::move(collection)));
+        r.append_cell(id, std::move(collection));
    }

    virtual stop_iteration accept_row_tombstone(range_tombstone rt) override {
@@ -125,9 +125,9 @@ public:
        r.append_cell(id, std::move(cell));
    }

-    virtual void accept_row_cell(column_id id, collection_mutation_view collection) override {
+    virtual void accept_row_cell(column_id id, collection_mutation collection) override {
        row& r = _current_row->cells();
-        r.append_cell(id, collection_mutation(*_schema.regular_column_at(id).type, std::move(collection)));
+        r.append_cell(id, std::move(collection));
    }

    auto on_end_of_partition() {
--- a/mutation/mutation_partition.hh
+++ b/mutation/mutation_partition.hh
@@ -707,9 +707,10 @@ struct fmt::formatter<shadowable_tombstone> : fmt::formatter<string_view> {
    template <typename FormatContext>
    auto format(const shadowable_tombstone& t, FormatContext& ctx) const {
        if (t) {
+            auto& tomb = t.tomb();
            return fmt::format_to(ctx.out(),
                                  "{{shadowable tombstone: timestamp={}, deletion_time={}}}",
-                                  t.tomb().timestamp, t.tomb(), t.tomb().deletion_time.time_since_epoch().count());
+                                  tomb.timestamp, tomb.deletion_time.time_since_epoch().count());
        } else {
            return fmt::format_to(ctx.out(),
                                  "{{shadowable tombstone: none}}");
--- a/mutation/mutation_partition_view.cc
+++ b/mutation/mutation_partition_view.cc
@@ -86,37 +86,6 @@ atomic_cell read_atomic_cell(const abstract_type& type, atomic_cell_variant cv,
    return boost::apply_visitor(atomic_cell_visitor(type, cm), cv);
 }

-collection_mutation read_collection_cell(const abstract_type& type, ser::collection_cell_view cv)
-{
-    collection_mutation_description mut;
-    mut.tomb = cv.tomb();
-    auto&& elements = cv.elements();
-    mut.cells.reserve(elements.size());
-
-    visit(type, make_visitor(
-        [&] (const collection_type_impl& ctype) {
-            auto& value_type = *ctype.value_comparator();
-            for (auto&& e : elements) {
-                mut.cells.emplace_back(e.key(), read_atomic_cell(value_type, e.value(), atomic_cell::collection_member::yes));
-            }
-        },
-        [&] (const user_type_impl& utype) {
-            for (auto&& e : elements) {
-                bytes key = e.key();
-                auto idx = deserialize_field_index(key);
-                SCYLLA_ASSERT(idx < utype.size());
-
-                mut.cells.emplace_back(key, read_atomic_cell(*utype.type(idx), e.value(), atomic_cell::collection_member::yes));
-            }
-        },
-        [&] (const abstract_type& o) {
-            throw std::runtime_error(format("attempted to read a collection cell with type: {}", o.name()));
-        }
-    ));
-
-    return mut.serialize(type);
-}
-
 template<typename Visitor>
 void read_and_visit_row(ser::row_view rv, const column_mapping& cm, column_kind kind, Visitor&& visitor)
 {
@@ -142,14 +111,7 @@ void read_and_visit_row(ser::row_view rv, const column_mapping& cm, column_kind
                if (_col.is_atomic()) {
                    throw std::runtime_error("An atomic cell expected, got a collection");
                }
-                // FIXME: Pass view to cell to avoid copy
-                auto&& outer = current_allocator();
-                with_allocator(standard_allocator(), [&] {
-                    auto cell = read_collection_cell(*_col.type(), ccv);
-                    with_allocator(outer, [&] {
-                        _visitor.accept_collection(_id, cell);
-                    });
-                });
+                _visitor.accept_collection(_id, read_from_collection_cell_view(*_col.type(), ccv));
            }
            void operator()(ser::unknown_variant_type&) const {
                throw std::runtime_error("Trying to deserialize unknown cell type");
@@ -198,8 +160,8 @@ void mutation_partition_view::do_accept(const column_mapping& cm, Visitor& visit
        void accept_atomic_cell(column_id id, atomic_cell ac) const {
           _visitor.accept_static_cell(id, std::move(ac));
        }
-        void accept_collection(column_id id, const collection_mutation& cm) const {
-           _visitor.accept_static_cell(id, cm);
+        void accept_collection(column_id id, collection_mutation cm) const {
+           _visitor.accept_static_cell(id, std::move(cm));
        }
    };
    read_and_visit_row(mpv.static_row(), cm, column_kind::static_column, static_row_cell_visitor{visitor});
@@ -218,8 +180,8 @@ void mutation_partition_view::do_accept(const column_mapping& cm, Visitor& visit
            void accept_atomic_cell(column_id id, atomic_cell ac) const {
               _visitor.accept_row_cell(id, std::move(ac));
            }
-            void accept_collection(column_id id, const collection_mutation& cm) const {
-               _visitor.accept_row_cell(id, cm);
+            void accept_collection(column_id id, collection_mutation cm) const {
+               _visitor.accept_row_cell(id, std::move(cm));
            }
        };
        read_and_visit_row(cr.cells(), cm, column_kind::regular_column, cell_visitor{visitor});
@@ -240,8 +202,8 @@ future<> mutation_partition_view::do_accept_gently(const column_mapping& cm, Vis
        void accept_atomic_cell(column_id id, atomic_cell ac) const {
           _visitor.accept_static_cell(id, std::move(ac));
        }
-        void accept_collection(column_id id, const collection_mutation& cm) const {
-           _visitor.accept_static_cell(id, cm);
+        void accept_collection(column_id id, collection_mutation cm) const {
+           _visitor.accept_static_cell(id, std::move(cm));
        }
    };
    read_and_visit_row(mpv.static_row(), cm, column_kind::static_column, static_row_cell_visitor{visitor});
@@ -263,8 +225,8 @@ future<> mutation_partition_view::do_accept_gently(const column_mapping& cm, Vis
            void accept_atomic_cell(column_id id, atomic_cell ac) const {
               _visitor.accept_row_cell(id, std::move(ac));
            }
-            void accept_collection(column_id id, const collection_mutation& cm) const {
-               _visitor.accept_row_cell(id, cm);
+            void accept_collection(column_id id, collection_mutation cm) const {
+               _visitor.accept_row_cell(id, std::move(cm));
            }
        };
        read_and_visit_row(cr.cells(), cm, column_kind::regular_column, cell_visitor{visitor});
@@ -286,8 +248,8 @@ future<> mutation_partition_view::do_accept_gently(const column_mapping& cm, Asy
        void accept_atomic_cell(column_id id, atomic_cell ac) const {
           _visitor.accept_static_cell(id, std::move(ac));
        }
-        void accept_collection(column_id id, const collection_mutation& cm) const {
-           _visitor.accept_static_cell(id, cm);
+        void accept_collection(column_id id, collection_mutation cm) const {
+           _visitor.accept_static_cell(id, std::move(cm));
        }
    };
    read_and_visit_row(mpv.static_row(), cm, column_kind::static_column, static_row_cell_visitor{visitor});
@@ -308,8 +270,8 @@ future<> mutation_partition_view::do_accept_gently(const column_mapping& cm, Asy
            void accept_atomic_cell(column_id id, atomic_cell ac) const {
               _visitor.accept_row_cell(id, std::move(ac));
            }
-            void accept_collection(column_id id, const collection_mutation& cm) const {
-               _visitor.accept_row_cell(id, cm);
+            void accept_collection(column_id id, collection_mutation cm) const {
+               _visitor.accept_row_cell(id, std::move(cm));
            }
        };
        read_and_visit_row(cr.cells(), cm, column_kind::regular_column, cell_visitor{visitor});
@@ -337,8 +299,8 @@ mutation_partition_view::accept_ordered_result mutation_partition_view::do_accep
            void accept_atomic_cell(column_id id, atomic_cell ac) const {
                _visitor.accept_static_cell(id, std::move(ac));
            }
-            void accept_collection(column_id id, const collection_mutation& cm) const {
-                _visitor.accept_static_cell(id, cm);
+            void accept_collection(column_id id, collection_mutation cm) const {
+                _visitor.accept_static_cell(id, std::move(cm));
            }
        };
        read_and_visit_row(mpv.static_row(), cm, column_kind::static_column, static_row_cell_visitor{visitor});
@@ -376,8 +338,8 @@ mutation_partition_view::accept_ordered_result mutation_partition_view::do_accep
            void accept_atomic_cell(column_id id, atomic_cell ac) const {
                _visitor.accept_row_cell(id, std::move(ac));
            }
-            void accept_collection(column_id id, const collection_mutation& cm) const {
-                _visitor.accept_row_cell(id, cm);
+            void accept_collection(column_id id, collection_mutation cm) const {
+                _visitor.accept_row_cell(id, std::move(cm));
            }
        };
        read_and_visit_row(cr.cells(), cm, column_kind::regular_column, cell_visitor{visitor});
@@ -501,44 +463,40 @@ mutation_partition_view mutation_partition_view::from_view(ser::mutation_partiti

 clustering_row read_clustered_row(const schema& s, ser::clustering_row_view crv) {
    class clustering_row_builder {
-        const schema& _s;
        clustering_row _row;
    public:
-        clustering_row_builder(const schema& s, clustering_key key, row_tombstone t, row_marker m)
-            : _s(s), _row(std::move(key), std::move(t), std::move(m), row()) { }
+        clustering_row_builder(clustering_key key, row_tombstone t, row_marker m)
+            : _row(std::move(key), std::move(t), std::move(m), row()) { }
        void accept_atomic_cell(column_id id, atomic_cell ac) {
            _row.cells().append_cell(id, std::move(ac));
        }
-        void accept_collection(column_id id, const collection_mutation& cm) {
-            _row.cells().append_cell(id, collection_mutation(*_s.regular_column_at(id).type, cm));
+        void accept_collection(column_id id, collection_mutation cm) {
+            _row.cells().append_cell(id, std::move(cm));
        }
        clustering_row get() && { return std::move(_row); }
    };

    auto cr = crv.row();
    auto t = row_tombstone(cr.deleted_at(), shadowable_tombstone(cr.shadowable_deleted_at()));
-    clustering_row_builder builder(s, cr.key(), std::move(t), read_row_marker(cr.marker()));
+    clustering_row_builder builder(cr.key(), std::move(t), read_row_marker(cr.marker()));
    read_and_visit_row(cr.cells(), s.get_column_mapping(), column_kind::regular_column, builder);
    return std::move(builder).get();
 }

 static_row read_static_row(const schema& s, ser::static_row_view sr) {
    class static_row_builder {
-        const schema& _s;
        static_row _row;
    public:
-        explicit static_row_builder(const schema& s)
-            : _s(s) { }
        void accept_atomic_cell(column_id id, atomic_cell ac) {
            _row.cells().append_cell(id, std::move(ac));
        }
-        void accept_collection(column_id id, const collection_mutation& cm) {
-            _row.cells().append_cell(id, collection_mutation(*_s.static_column_at(id).type, cm));
+        void accept_collection(column_id id, collection_mutation cm) {
+            _row.cells().append_cell(id, std::move(cm));
        }
        static_row get() && { return std::move(_row); }
    };

-    static_row_builder builder(s);
+    static_row_builder builder;
    read_and_visit_row(sr.cells(), s.get_column_mapping(), column_kind::static_column, builder);
    return std::move(builder).get();
 }
--- a/mutation/mutation_partition_view.hh
+++ b/mutation/mutation_partition_view.hh
@@ -23,31 +23,31 @@ class converting_mutation_partition_applier;

 template<typename T>
 concept MutationViewVisitor = requires (T& visitor, tombstone t, atomic_cell ac,
-                                             collection_mutation_view cmv, range_tombstone rt,
+                                             collection_mutation cm, range_tombstone rt,
                                             position_in_partition_view pipv, row_tombstone row_tomb,
                                             row_marker rm) {
    visitor.accept_partition_tombstone(t);
    visitor.accept_static_cell(column_id(), std::move(ac));
-    visitor.accept_static_cell(column_id(), cmv);
+    visitor.accept_static_cell(column_id(), std::move(cm));
    visitor.accept_row_tombstone(rt);
    visitor.accept_row(pipv, row_tomb, rm,
            is_dummy::no, is_continuous::yes);
    visitor.accept_row_cell(column_id(), std::move(ac));
-    visitor.accept_row_cell(column_id(), cmv);
+    visitor.accept_row_cell(column_id(), std::move(cm));
 };

 template<typename T>
 concept AsyncMutationViewVisitor = requires (T& visitor, tombstone t, atomic_cell ac,
-                                             collection_mutation_view cmv, range_tombstone rt,
+                                             collection_mutation cm, range_tombstone rt,
                                             position_in_partition_view pipv, row_tombstone row_tomb,
                                             row_marker rm) {
    { visitor.accept_partition_tombstone(t) } -> std::same_as<void>;
    { visitor.accept_static_cell(column_id(), std::move(ac)) } -> std::same_as<void>;
-    { visitor.accept_static_cell(column_id(), cmv) } -> std::same_as<void>;
+    { visitor.accept_static_cell(column_id(), std::move(cm)) } -> std::same_as<void>;
    { visitor.accept_row_tombstone(rt) } -> std::same_as<future<>>;
    { visitor.accept_row(pipv, row_tomb, rm, is_dummy::no, is_continuous::yes) } -> std::same_as<future<>>;
    { visitor.accept_row_cell(column_id(), std::move(ac)) } -> std::same_as<void>;
-    { visitor.accept_row_cell(column_id(), cmv) } -> std::same_as<void>;
+    { visitor.accept_row_cell(column_id(), std::move(cm)) } -> std::same_as<void>;
    { visitor.accept_end_of_partition() } -> std::same_as<future<>>;
 };

@@ -56,11 +56,11 @@ public:
    virtual ~mutation_partition_view_virtual_visitor();
    virtual void accept_partition_tombstone(tombstone t) = 0;
    virtual void accept_static_cell(column_id, atomic_cell ac) = 0;
-    virtual void accept_static_cell(column_id, collection_mutation_view cmv) = 0;
+    virtual void accept_static_cell(column_id, collection_mutation cm) = 0;
    virtual stop_iteration accept_row_tombstone(range_tombstone rt) = 0;
    virtual stop_iteration accept_row(position_in_partition_view pipv, row_tombstone rt, row_marker rm, is_dummy, is_continuous) = 0;
    virtual void accept_row_cell(column_id, atomic_cell ac) = 0;
-    virtual void accept_row_cell(column_id, collection_mutation_view cmv) = 0;
+    virtual void accept_row_cell(column_id, collection_mutation cm) = 0;
 };

 // View on serialized mutation partition. See mutation_partition_serializer.
--- a/partition_builder.hh
+++ b/partition_builder.hh
@@ -46,8 +46,12 @@ public:
    }

    virtual void accept_static_cell(column_id id, collection_mutation_view collection) override {
+        accept_static_cell(id, collection_mutation(*_schema.static_column_at(id).type, std::move(collection)));
+    }
+
+    void accept_static_cell(column_id id, collection_mutation&& collection) {
        row& r = _partition.static_row().maybe_create();
-        r.append_cell(id, collection_mutation(*_schema.static_column_at(id).type, std::move(collection)));
+        r.append_cell(id, std::move(collection));
    }

    virtual void accept_row_tombstone(const range_tombstone& rt) override {
@@ -72,8 +76,12 @@ public:
    }

    virtual void accept_row_cell(column_id id, collection_mutation_view collection) override {
+        accept_row_cell(id, collection_mutation(*_schema.regular_column_at(id).type, std::move(collection)));
+    }
+
+    void accept_row_cell(column_id id, collection_mutation collection) {
        row& r = _current_row->cells();
-        r.append_cell(id, collection_mutation(*_schema.regular_column_at(id).type, std::move(collection)));
+        r.append_cell(id, std::move(collection));
    }
 };

--- a/pgo/exec_cql.py
+++ b/pgo/exec_cql.py
@@ -16,7 +16,6 @@ Usage:
 import argparse, os, sys
 from typing import Sequence

-from test.pylib.driver_utils import safe_driver_shutdown

 def read_statements(path: str) -> list[tuple[int, str]]:
    stms: list[tuple[int, str]] = []
@@ -58,7 +57,7 @@ def exec_statements(statements: list[tuple[int, str]], socket_path: str, timeout
                print(f"ERROR executing statement from file line {lineno}: {s}\n{e}", file=sys.stderr)
                return 1
    finally:
-        safe_driver_shutdown(cluster)
+        cluster.shutdown()
    return 0

 def main(argv: Sequence[str]) -> int:
--- a/pgo/profiles/aarch64/profile.profdata.xz
+++ b/pgo/profiles/aarch64/profile.profdata.xz
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:524c54493b72c5e1b783f14dfa49d733e21b24cc2ec776e9c6e578095073162d
-size 6646304
+oid sha256:8b22f9a548a03c88250d31e97ea3e8f77b4d90c502bcf74336c24056557f947f
+size 6698412
--- a/pgo/profiles/x86_64/profile.profdata.xz
+++ b/pgo/profiles/x86_64/profile.profdata.xz
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fec2bb253d43139da954cee3441fc8bc74824246b080f23bf1f824714d0adc45
-size 6646576
+oid sha256:31e515a62f006649b0dc4671b51b2643fba9a70884c09b90fbc2237044954254
+size 6707108
--- a/raft/server.cc
+++ b/raft/server.cc
@@ -239,7 +239,10 @@ private:

    // Drop waiter that we lost track of, can happen due to a snapshot transfer,
    // or a leader removed from cluster while some entries added on it are uncommitted.
-    void drop_waiters(std::optional<index_t> idx = {});
+    // When `snp` is provided (snapshot transfer case), waiters whose term matches
+    // the snapshot term are resolved successfully, since the snapshot-term match proves
+    // they were committed and included in the snapshot (by the Log Matching Property).
+    void drop_waiters(const snapshot_descriptor* snp = nullptr);

    // Wake up all waiter that wait for entries with idx smaller of equal to the one provided
    // to be applied.
@@ -556,12 +559,10 @@ future<> server_impl::wait_for_entry(entry_id eid, wait_type type, seastar::abor
                auto snap_term = _fsm->log_term_for(snap_idx);
                SCYLLA_ASSERT(snap_term);
                SCYLLA_ASSERT(snap_idx >= eid.idx);
-                if (type == wait_type::committed && snap_term == eid.term) {
+                if (snap_term == eid.term) {
                    logger.trace("[{}] wait_for_entry {}.{}: entry got truncated away, but has the snapshot's term"
                                 " (snapshot index: {})", id(), eid.term, eid.idx, snap_idx);
                    co_return;
-
-                    // We don't do this for `wait_type::applied` - see below why.
                }

                logger.trace("[{}] wait_for_entry {}.{}: entry got truncated away", id(), eid.term, eid.idx);
@@ -572,20 +573,6 @@ future<> server_impl::wait_for_entry(entry_id eid, wait_type type, seastar::abor
                throw dropped_entry();
            }

-            if (type == wait_type::applied && _fsm->log_last_snapshot_idx() >= eid.idx) {
-                // We know the entry was committed but the wait type is `applied`
-                // and we don't know if the entry was applied with `state_machine::apply`
-                // (we may've loaded a snapshot before we managed to apply the entry).
-                // As specified by `add_entry`, throw `commit_status_unknown` in this case.
-                //
-                // FIXME: replace this with a different exception type - `commit_status_unknown`
-                // gives too much uncertainty while we know that the entry was committed
-                // and had to be applied on at least one server. Some callers of `add_entry`
-                // need to know only that the current state includes that entry, whether it was done
-                // through `apply` on this server or through receiving a snapshot.
-                throw commit_status_unknown();
-            }
-
            co_return;
        }
    }
@@ -760,6 +747,8 @@ future<> server_impl::add_entry(command command, wait_type type, seastar::abort_
            throw not_a_leader{leader};
        }
        auto eid = co_await add_entry_on_leader(std::move(command), as);
+        co_await utils::get_local_injector().inject("block_raft_add_entry_before_wait_for_entry",
+                utils::wait_for_message(std::chrono::minutes(5)));
        co_return co_await wait_for_entry(eid, type, as);
    }

@@ -995,17 +984,24 @@ void server_impl::notify_waiters(std::map<index_t, op_status>& waiters,
    }
 }

-void server_impl::drop_waiters(std::optional<index_t> idx) {
+void server_impl::drop_waiters(const snapshot_descriptor* snp) {
    auto drop = [&] (std::map<index_t, op_status>& waiters) {
        while (waiters.size() != 0) {
            auto it = waiters.begin();
-            if (idx && it->first > *idx) {
+            if (snp && it->first > snp->idx) {
                break;
            }
            auto [entry_idx, status] = std::move(*it);
            waiters.erase(it);
-            status.done.set_exception(commit_status_unknown());
-            _stats.waiters_dropped++;
+            if (snp && status.term == snp->term) {
+                // entry_idx <= snapshot index and the entry's term matches the snapshot term.
+                // By the Log Matching Property the entry was committed and included in the snapshot.
+                status.done.set_value();
+                _stats.waiters_awoken++;
+            } else {
+                status.done.set_exception(commit_status_unknown());
+                _stats.waiters_dropped++;
+            }
        }
    };
    drop(_awaited_commits);
@@ -1431,7 +1427,7 @@ future<> server_impl::applier_fiber() {
                // Apply snapshot it to the state machine
                logger.trace("[{}] apply_fiber applying snapshot {}", _id, snp.id);
                co_await _state_machine->load_snapshot(snp.id);
-                drop_waiters(snp.idx);
+                drop_waiters(&snp);
                _applied_idx = snp.idx;
                _applied_index_changed.broadcast();
                _stats.sm_load_snapshot++;
@@ -1940,7 +1936,7 @@ std::unique_ptr<server> create_server(server_id uuid, std::unique_ptr<rpc> rpc,
 }

 std::ostream& operator<<(std::ostream& os, const server_impl& s) {
-    fmt::print(os, "[id: {}, fsm ()]\n", s._id, *s._fsm);
+    fmt::print(os, "[id: {}, fsm ({})]\n", s._id, *s._fsm);
    return os;
 }

--- a/raft/server.hh
+++ b/raft/server.hh
@@ -79,18 +79,18 @@ public:
    // The caller may pass a pointer to an abort_source to make the operation abortable.
    // If it passes nullptr, the operation is unabortable.
    //
-    // Successful `add_entry` with `wait_type::committed` does not guarantee that `state_machine::apply` will be called
-    // locally for this entry. Between the commit and the application we may receive a snapshot containing this entry,
-    // so the state machine's state 'jumps' forward in time, skipping the entry application.
-    // However, for `wait_type::applied`, we guarantee that the entry will be applied locally with `state_machine::apply`.
-    // If a snapshot causes the state machine to jump over the entry, `add_entry` will return `commit_status_unknown`
-    // (even if the snapshot included that entry).
+    // Successful `add_entry` does not guarantee that `state_machine::apply` will be called
+    // locally for this entry. Between the commit and the application we may load a snapshot
+    // containing this entry, so the state machine's state 'jumps' forward in time, skipping
+    // the local entry application. For `wait_type::applied` this should be fine, because
+    // state machine implementations shouldn't care whether an entry was applied via
+    // `state_machine::apply` or via a snapshot load.
    //
    // Exceptions:
    // raft::commit_status_unknown
    //     Thrown if the leader has changed and the log entry has either
    //     been replaced by the new leader or the server has lost track of it.
-    //     It may also be thrown in case of a transport error while forwarding add_entry to the leader.L
+    //     It may also be thrown in case of a transport error while forwarding add_entry to the leader.
    // raft::dropped_entry
    //     Thrown if the entry was replaced because of a leader change.
    // raft::request_aborted
--- a/replica/database.cc
+++ b/replica/database.cc
@@ -1022,8 +1022,7 @@ void database::drop_keyspace(const sstring& name) {
 static bool is_system_table(const schema& s) {
    auto& k = s.ks_name();
    return k == db::system_keyspace::NAME ||
-        k == db::system_distributed_keyspace::NAME ||
-        k == db::system_distributed_keyspace::NAME_EVERYWHERE;
+        k == db::system_distributed_keyspace::NAME;
 }

 sstables::sstables_manager& database::get_sstables_manager(const schema& s) const {
@@ -1142,7 +1141,7 @@ future<> database::create_local_system_table(
        cfg.memtable_scheduling_group = default_scheduling_group();
        cfg.memtable_to_cache_scheduling_group = default_scheduling_group();
    }
-    auto lock = get_tables_metadata().hold_write_lock();
+    auto lock = co_await get_tables_metadata().hold_write_lock();
    std::exception_ptr ex;
    try {
        add_column_family(ks, table, std::move(cfg), replica::database::is_new_cf::no);
@@ -1328,9 +1327,27 @@ future<global_table_ptr> get_table_on_all_shards(sharded<database>& sharded_db,

 future<tables_metadata_lock_on_all_shards> database::lock_tables_metadata(sharded<database>& sharded_db) {
    tables_metadata_lock_on_all_shards locks;
-    co_await sharded_db.invoke_on_all([&] (auto& db) -> future<> {
+    // Acquire write lock on shard 0 first, and then on the remaining shards.
+    //
+    // Parallel acquisition on all shards could deadlock when two
+    // fibers call lock_tables_metadata() concurrently: parallel_for_each
+    // sends SMP messages to all shards even when the local shard's lock
+    // attempt blocks.  If task reordering (SEASTAR_SHUFFLE_TASK_QUEUE in
+    // debug/sanitize builds) causes fiber A to win on shard X while
+    // fiber B wins on shard Y, neither can make progress — classic
+    // cross-shard lock-ordering deadlock.
+    //
+    // Acquiring the write lock on shard 0 first, and then on the remaining
+    // shards, eliminates this: whichever fiber acquires shard 0 first is
+    // guaranteed to acquire locks on all other shards before the other fiber
+    // can acquire the lock on shard 0.
+    co_await sharded_db.invoke_on(0, [&locks, &sharded_db] (auto& db) -> future<> {
        locks.assign_lock(co_await db.get_tables_metadata().hold_write_lock());
+        co_await sharded_db.invoke_on_others([&locks] (auto& db) -> future<> {
+            locks.assign_lock(co_await db.get_tables_metadata().hold_write_lock());
+        });
    });
+
    co_return locks;
 }

--- a/replica/distributed_loader.cc
+++ b/replica/distributed_loader.cc
@@ -48,7 +48,6 @@ bool is_system_keyspace(std::string_view name) {

 static const std::unordered_set<std::string_view> internal_keyspaces = {
        db::system_distributed_keyspace::NAME,
-        db::system_distributed_keyspace::NAME_EVERYWHERE,
        db::system_keyspace::NAME,
        db::schema_tables::NAME,
        auth::meta::legacy::AUTH_KS,
--- a/replica/table.cc
+++ b/replica/table.cc
@@ -4624,7 +4624,7 @@ future<db::replay_position> table::discard_sstables(db_clock::time_point truncat
        sstables::shared_sstable sst;
        replica::enable_backlog_tracker enable_backlog_tracker;
    };
-    std::vector<removed_sstable> remove;
+    std::unordered_map<size_t, std::vector<removed_sstable>> per_cg_remove;

    _stats.pending_sstable_deletions++;
    auto undo_stats = defer([this] {
@@ -4633,7 +4633,7 @@ future<db::replay_position> table::discard_sstables(db_clock::time_point truncat

    auto permit = co_await get_sstable_list_permit();

-    co_await _cache.invalidate(row_cache::external_updater([this, &rp, &remove, truncated_at] {
+    co_await _cache.invalidate(row_cache::external_updater([this, &rp, &per_cg_remove, truncated_at] {
        // FIXME: the following isn't exception safe.
        for_each_compaction_group([&] (compaction_group& cg) {

@@ -4648,7 +4648,7 @@ future<db::replay_position> table::discard_sstables(db_clock::time_point truncat
                        if (p->originated_on_this_node().value_or(false) && p->get_stats_metadata().position.shard_id() == this_shard_id()) {
                            rp = std::max(p->get_stats_metadata().position, rp);
                        }
-                        remove.emplace_back(removed_sstable{cg, p, enable_backlog_tracker});
+                        per_cg_remove[cg.group_id()].emplace_back(removed_sstable{cg, p, enable_backlog_tracker});
                        return;
                    }
                    pruned->insert(p);
@@ -4665,16 +4665,19 @@ future<db::replay_position> table::discard_sstables(db_clock::time_point truncat
    }));
    rebuild_statistics();

-    std::vector<sstables::shared_sstable> del;
-    del.reserve(remove.size());
-    for (auto& r : remove) {
-        if (r.enable_backlog_tracker) {
-            remove_sstable_from_backlog_tracker(r.cg.get_backlog_tracker(), r.sst);
+    co_await coroutine::parallel_for_each(per_cg_remove, [&] (auto& entry) {
+        auto& removed = entry.second;
+        std::vector<sstables::shared_sstable> del;
+        del.reserve(removed.size());
+        for (auto& r : removed) {
+            if (r.enable_backlog_tracker) {
+                remove_sstable_from_backlog_tracker(r.cg.get_backlog_tracker(), r.sst);
+            }
+            erase_sstable_cleanup_state(r.sst);
+            del.emplace_back(std::move(r.sst));
        }
-        erase_sstable_cleanup_state(r.sst);
-        del.emplace_back(r.sst);
-    };
-    co_await delete_sstables_atomically(permit, std::move(del));
+        return delete_sstables_atomically(permit, std::move(del));
+    });
    co_return rp;
 }

@@ -5609,7 +5612,7 @@ future<> compaction_group::cleanup() {
    auto updater = row_cache::external_updater(std::make_unique<compaction_group_cleaner>(*this));

    auto p_range = to_partition_range(token_range());
-    tlogger.debug("Invalidating range {} for compaction group {} of table {} during cleanup.",
+    tlogger.debug("Invalidating range {} for compaction group {} of table {}.{} during cleanup.",
                  p_range, group_id(), _t.schema()->ks_name(), _t.schema()->cf_name());
    // Since permit is still held, all actions below will be executed atomically:
    co_await _t._cache.invalidate(std::move(updater), p_range);
--- a/scylla-gdb.py
+++ b/scylla-gdb.py
@@ -1328,8 +1328,14 @@ class interval_printer(gdb.printing.PrettyPrinter):
    def __init__(self, val):
        self.val = val['_interval']

-    def inspect_bound(self, bound_opt):
-        bound = std_optional(bound_opt)
+    def inspect_bound(self, bound_name):
+        if f'_{bound_name}_exists' in self.val.type:  # membership is tested on the gdb.Type (keyed by field name), not on the gdb.Value
+            if not self.val[f'_{bound_name}_exists']:
+                return False, False, None
+
+            return True, bool(self.val[f'_{bound_name}_inclusive']), self.val[f'_{bound_name}_value']
+
+        bound = std_optional(self.val[f'_{bound_name}'])
        if not bound:
            return False, False, None

@@ -1338,8 +1344,8 @@ class interval_printer(gdb.printing.PrettyPrinter):
        return True, bool(bound['_inclusive']), bound['_value']

    def to_string(self):
-        has_start, start_inclusive, start_value = self.inspect_bound(self.val['_start'])
-        has_end, end_inclusive, end_value = self.inspect_bound(self.val['_end'])
+        has_start, start_inclusive, start_value = self.inspect_bound('start')
+        has_end, end_inclusive, end_value = self.inspect_bound('end')

        if self.val['_singular']:
            return '{{{}}}'.format(str(start_value))
@@ -5466,10 +5472,9 @@ class scylla_compaction_tasks(gdb.Command):
        try:
            task_list = list(intrusive_list(cm['_tasks']))
        except gdb.error: # 6.2 compatibility
-            task_list = list(std_list(cm['_tasks']))
+            task_list = [seastar_shared_ptr(t).get().dereference() for t in std_list(cm['_tasks'])]

        for task in task_list:
-            task = seastar_shared_ptr(task).get().dereference()
            schema = schema_ptr(task['_compacting_table'].dereference()['_schema'])
            key = 'type={}, state={:5}, {}'.format(task['_type'], str(task['_state']), schema.table_name())
            task_hist.add(key)
--- a/service/client_state.cc
+++ b/service/client_state.cc
@@ -122,11 +122,9 @@ future<> service::client_state::check_internal_table_permissions(std::string_vie
                    auth::permission::ALTER, auth::permission::DROP>();

    if (forbidden_permissions.contains(cmd.permission)) {
-        if ((ks == db::system_distributed_keyspace::NAME || ks == db::system_distributed_keyspace::NAME_EVERYWHERE)
+        if (ks == db::system_distributed_keyspace::NAME
                && (table_name == db::system_distributed_keyspace::CDC_DESC_V2
-                || table_name == db::system_distributed_keyspace::CDC_TOPOLOGY_DESCRIPTION
-                || table_name == db::system_distributed_keyspace::CDC_TIMESTAMPS
-                || table_name == db::system_distributed_keyspace::CDC_GENERATIONS_V2)) {
+                || table_name == db::system_distributed_keyspace::CDC_TIMESTAMPS)) {
            return make_exception_future(exceptions::unauthorized_exception(
                    format("Cannot {} {}", auth::permissions::to_string(cmd.permission), cmd.resource)));
        }
--- a/service/migration_manager.cc
+++ b/service/migration_manager.cc
@@ -239,7 +239,7 @@ future<> migration_manager::wait_for_schema_agreement(const replica::database& d
            as->check();
        }
        if (db::timeout_clock::now() > deadline) {
-            throw std::runtime_error("Unable to reach schema agreement");
+            throw schema_agreement_timeout();
        }
        co_await (as ? sleep_abortable(std::chrono::milliseconds(500), *as) : sleep(std::chrono::milliseconds(500)));
    }
--- a/service/migration_manager.hh
+++ b/service/migration_manager.hh
@@ -14,6 +14,7 @@
 #include "gms/endpoint_state.hh"
 #include <seastar/core/abort_source.hh>
 #include <seastar/core/gate.hh>
+#include <seastar/core/timed_out_error.hh>
 #include "gms/inet_address.hh"
 #include "gms/feature.hh"
 #include "gms/i_endpoint_state_change_subscriber.hh"
@@ -133,6 +134,19 @@ public:
     * Known peers in the cluster have the same schema version as us.
     */
    bool have_schema_agreement();
+    // Thrown by wait_for_schema_agreement() when the deadline is reached.
+    struct schema_agreement_timeout : public seastar::timed_out_error {
+        const char* what() const noexcept override {
+            return "Unable to reach schema agreement";
+        }
+    };
+    /**
+     * Waits until all known live peers have the same schema version as this
+     * node. Returns normally once agreement is reached, or throws
+     * schema_agreement_timeout if the deadline is reached before agreement.
+     * If as != nullptr, can also throw abort_requested_exception if the abort
+     * source fires.
+     */
    future<> wait_for_schema_agreement(const replica::database& db, db::timeout_clock::time_point deadline, seastar::abort_source* as);

    // Maximum number of retries one should attempt when trying to perform
--- a/service/paxos/paxos_state.cc
+++ b/service/paxos/paxos_state.cc
@@ -438,9 +438,10 @@ static future<cql3::untyped_result_set> do_execute_cql_with_timeout(sstring req,

    const auto cache_key = qp.compute_id(req, "", cql3::internal_dialect());
    auto ps_ptr = qp.get_prepared(cache_key);
+    shared_ptr<cql_transport::messages::result_message::prepared> prepared_msg;
    if (!ps_ptr) {
-        const auto msg_ptr = co_await qp.prepare(req, qs, cql3::internal_dialect());
-        ps_ptr = msg_ptr->get_prepared();
+        prepared_msg = co_await qp.prepare(req, qs, cql3::internal_dialect());
+        ps_ptr = prepared_msg->get_prepared();
        if (!ps_ptr) {
            on_internal_error(paxos_state::logger, "prepared statement is null");
        }
@@ -449,8 +450,8 @@ static future<cql3::untyped_result_set> do_execute_cql_with_timeout(sstring req,
        -1, service::node_local_only::yes);
    const auto st = ps_ptr->statement;

-    const auto msg_ptr = co_await st->execute(qp, qs, qo, std::nullopt);
-    co_return cql3::untyped_result_set(msg_ptr);
+    const auto result_ptr = co_await st->execute(qp, qs, qo, std::nullopt);
+    co_return cql3::untyped_result_set(result_ptr);
 }

 template <typename... Args>
--- a/service/qos/service_level_controller.cc
+++ b/service/qos/service_level_controller.cc
@@ -26,7 +26,6 @@
 #include <seastar/coroutine/maybe_yield.hh>
 #include "service/qos/raft_service_level_distributed_data_accessor.hh"
 #include "service_level_controller.hh"
-#include "db/system_distributed_keyspace.hh"
 #include "cql3/query_processor.hh"
 #include "service/storage_service.hh"
 #include "service/topology_state_machine.hh"
--- a/service/qos/service_level_controller.hh
+++ b/service/qos/service_level_controller.hh
@@ -31,7 +31,6 @@

 namespace db {
    class system_keyspace;
-    class system_distributed_keyspace;
 }
 namespace cql3 {
    class query_processor;
--- a/service/raft/group0_state_machine.cc
+++ b/service/raft/group0_state_machine.cc
@@ -434,6 +434,8 @@ future<> group0_state_machine::load_snapshot(raft::snapshot_id id) {
 }

 future<> group0_state_machine::enable_in_memory_state_machine() {
+    co_await utils::get_local_injector().inject("group0_state_machine_enable_in_memory_fail",
+            [] { return std::make_exception_ptr(std::runtime_error("injected failure in enable_in_memory_state_machine")); });
    auto read_apply_mutex_holder = co_await _client.hold_read_apply_mutex(_abort_source);
    if (!_in_memory_state_machine_enabled) {
        _in_memory_state_machine_enabled = true;
--- a/service/raft/raft_group0.cc
+++ b/service/raft/raft_group0.cc
@@ -452,14 +452,16 @@ future<> raft_group0::start_server_for_group0(raft::group_id group0_id, service:
    auto srv_for_group0 = create_server_for_group0(group0_id, my_id, ss, qp, mm);
    auto& persistence = srv_for_group0.persistence;
    auto& server = *srv_for_group0.server;
-    co_await with_scheduling_group(_sg, [this, &srv_for_group0] (this auto self) -> future<> {
+    co_await with_scheduling_group(_sg, [this, &srv_for_group0, group0_id] (this auto self) -> future<> {
        auto& state_machine = dynamic_cast<group0_state_machine&>(srv_for_group0.state_machine);
        co_await _raft_gr.start_server_for_group(std::move(srv_for_group0));
+        // Set _group0 immediately after the server is registered in _raft_gr._servers.
+        // This ensures abort_and_drain()/destroy() can find and clean up the server
+        // even if enable_in_memory_state_machine() or later steps throw.
+        _group0.emplace<raft::group_id>(group0_id);
        co_await state_machine.enable_in_memory_state_machine();
    });

-    _group0.emplace<raft::group_id>(group0_id);
-
    // Fix for scylladb/scylladb#16683:
    // If the snapshot index is 0, trigger creation of a new snapshot
    // so bootstrapping nodes will receive a snapshot transfer.
@@ -681,16 +683,6 @@ bool raft_group0::maintenance_mode() {
 }

 future<> raft_group0::setup_group0_if_exist(db::system_keyspace& sys_ks, service::storage_service& ss, cql3::query_processor& qp, service::migration_manager& mm) {
-    if (maintenance_mode()) {
-        co_return;
-    }
-
-    if (!sys_ks.bootstrap_complete()) {
-        // If bootstrap did not complete yet, there is no group 0 to setup at this point
-        // -- it will be done after we start gossiping, in `setup_group0`.
-        co_return;
-    }
-
    auto group0_id = raft::group_id{co_await sys_ks.get_raft_group0_id()};
    if (group0_id) {
        // Group 0 ID is present => we've already joined group 0 earlier.
@@ -711,15 +703,6 @@ future<> raft_group0::setup_group0(
        db::system_keyspace& sys_ks, const std::unordered_set<gms::inet_address>& initial_contact_nodes, shared_ptr<group0_handshaker> handshaker,
        service::storage_service& ss, cql3::query_processor& qp, service::migration_manager& mm,
        const join_node_request_params& params) {
-    if (maintenance_mode()) {
-        // The node is in maintenance mode.
-        co_return;
-    }
-
-    if (joined_group0()) {
-        // Group 0 is already set up, there is nothing to do.
-        co_return;
-    }
    // Reaching this point is possible only in two cases:
    // - the node is bootstrapping,
    // - the node is restarting in the Raft-based recovery procedure and has not joined the new group 0 yet.
@@ -1036,7 +1019,7 @@ with_timeout(abort_source& as, db::timeout_clock::duration d, F&& fun) {
        } catch (...) {
            // There should be no other exceptions, but just in case, catch and discard.
            // we want to propagate exceptions from `f`, not from sleep.
-            group0_log.error("unexpected exception from sleep_and_abort", std::current_exception());
+            group0_log.error("unexpected exception from sleep_and_abort: {}", std::current_exception());
        }

        // Translate aborts caused by timeout to `timed_out_error`.
--- a/service/raft/raft_group0.hh
+++ b/service/raft/raft_group0.hh
@@ -271,6 +271,10 @@ public:
    seastar::scheduling_group get_scheduling_group() {
        return _sg;
    }
+
+    // Returns true if in maintenance mode
+    bool maintenance_mode();
+
 private:
    static void init_rpc_verbs(raft_group0& shard0_this);
    static future<> uninit_rpc_verbs(netw::messaging_service& ms);
@@ -332,9 +336,6 @@ private:
    // Does not affect non-members. This behavior is only guaranteed if no concurrent membership changes are happening.
    future<> modify_raft_voter_status(const std::unordered_set<raft::server_id>& voters_add, const std::unordered_set<raft::server_id>& voters_del,
            abort_source& as, std::optional<raft_timeout> timeout = std::nullopt);
-
-    // Returns true if in maintenance mode
-    bool maintenance_mode();
 };

 } // end of namespace service
--- a/service/session.cc
+++ b/service/session.cc
@@ -9,6 +9,7 @@
 #include "service/session.hh"
 #include "utils/log.hh"
 #include <seastar/core/coroutine.hh>
+#include <seastar/core/timer.hh>

 namespace service {

@@ -58,18 +59,35 @@ void session_manager::initiate_close_of_sessions_except(const std::unordered_set
 }

 future<> session_manager::drain_closing_sessions() {
+    slogger.info("drain_closing_sessions: waiting for lock");
+    seastar::timer<lowres_clock> lock_timer([this] {
+        slogger.warn("drain_closing_sessions: still waiting for lock, available units {}",
+                     _session_drain_sem.available_units());
+    });
+    lock_timer.arm_periodic(std::chrono::minutes(5));
    auto lock = co_await get_units(_session_drain_sem, 1);
+    lock_timer.cancel();
+    auto n = std::distance(_closing_sessions.begin(), _closing_sessions.end());
+    slogger.info("drain_closing_sessions: acquired lock, {} sessions to drain", n);
    auto i = _closing_sessions.begin();
    while (i != _closing_sessions.end()) {
        session& s = *i;
        ++i;
        auto id = s.id();
-        slogger.debug("draining session {}", id);
+        slogger.info("drain_closing_sessions: waiting for session {} to close, gate count {}", id, s.gate_count());
+        std::optional<seastar::timer<lowres_clock>> warn_timer;
+        warn_timer.emplace([&s, id] {
+            slogger.warn("drain_closing_sessions: session {} still not closed, gate count {}",
+                         id, s.gate_count());
+        });
+        warn_timer->arm_periodic(std::chrono::minutes(5));
        co_await s.close();
+        warn_timer.reset();
        if (_sessions.erase(id)) {
-            slogger.debug("session {} closed", id);
+            slogger.info("drain_closing_sessions: session {} closed", id);
        }
    }
+    slogger.info("drain_closing_sessions: done");
 }

 } // namespace service
--- a/service/session.hh
+++ b/service/session.hh
@@ -95,6 +95,10 @@ public:
        return _id;
    }

+    size_t gate_count() const {
+        return _gate.get_count();
+    }
+
    /// Post-condition of successfully resolved future: There are no guards alive for this session, and
    /// and it's impossible to create more such guards later.
    /// Can be called concurrently.
--- a/service/storage_proxy.cc
+++ b/service/storage_proxy.cc
@@ -1940,7 +1940,7 @@ public:
    // Calculates how much to delay completing the request. The delay adds to the request's inherent latency.
    template<typename Func>
    void delay(tracing::trace_state_ptr trace, Func&& on_resume) {
-        auto delay = db::view::calculate_view_update_throttling_delay(_view_backlog, _expire_timer.get_timeout(), _proxy->data_dictionary().get_config().view_flow_control_delay_limit_in_ms());
+        auto delay = _proxy->_max_view_update_backlog.calculate_throttling_delay(_view_backlog, _expire_timer.get_timeout());
        stats().last_mv_flow_control_delay = delay;
        stats().mv_flow_control_delay += delay.count();
        if (delay.count() == 0) {
@@ -3337,6 +3337,7 @@ storage_proxy::storage_proxy(sharded<replica::database>& db, storage_proxy::conf
    , _hints_for_views_manager(*this, _db.local().get_config().view_hints_directory(), {}, _db.local().get_config().max_hint_window_in_ms(), _hints_resource_manager, _db, cfg.hints_sched_group)
    , _stats_key(stats_key)
    , _features(feat)
+    , _maintenance_mode(cfg.maintenance_mode)
    , _background_write_throttle_threahsold(cfg.available_memory / 10)
    , _mutate_stage{"storage_proxy_mutate", &storage_proxy::do_mutate}
    , _max_view_update_backlog(max_view_update_backlog)
@@ -7103,7 +7104,7 @@ host_id_vector_replica_set storage_proxy::get_endpoints_for_reading(const schema
    auto endpoints = erm.get_replicas_for_reading(token);
    // Skip for non-debug builds and maintenance mode.
    if constexpr (tools::build_info::is_debug_build()) {
-        if (!_db.local().get_config().maintenance_mode()) {
+        if (!_maintenance_mode) {
            validate_read_replicas(erm, endpoints);
        }
    }
--- a/service/storage_proxy.hh
+++ b/service/storage_proxy.hh
@@ -40,6 +40,7 @@
 #include "dht/token_range_endpoints.hh"
 #include "service/storage_service.hh"
 #include "service/cas_shard.hh"
+#include "service/maintenance_mode.hh"
 #include "service/storage_proxy_fwd.hh"

 class reconcilable_result;
@@ -197,6 +198,7 @@ public:
        // with writes.
        smp_service_group write_ack_smp_service_group = default_smp_service_group();
        scheduling_group hints_sched_group;
+        maintenance_mode_enabled maintenance_mode = maintenance_mode_enabled::no;
    };
 private:

@@ -294,6 +296,7 @@ private:
    scheduling_group_key _stats_key;
    storage_proxy_stats::global_stats _global_stats;
    gms::feature_service& _features;
+    maintenance_mode_enabled _maintenance_mode;

    class remote;
    std::unique_ptr<remote> _remote;
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -496,7 +496,7 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
    };

    auto process_normal_node = [&] (raft::server_id id, locator::host_id host_id, std::optional<gms::inet_address> ip, const replica_state& rs) -> future<> {
-        rtlogger.trace("loading topology: raft id={} ip={} node state={} dc={} rack={} tokens state={} tokens={} shards={}",
+        rtlogger.trace("loading topology: raft id={} ip={} node state={} dc={} rack={} tokens state={} tokens={} shards={} cleanup={}",
                      id, ip, rs.state, rs.datacenter, rs.rack, _topology_state_machine._topology.tstate, rs.ring.value().tokens, rs.shard_count, rs.cleanup);
        // Save tokens, not needed for raft topology management, but needed by legacy
        // Also ip -> id mapping is needed for address map recreation on reboot
@@ -1614,44 +1614,43 @@ future<> storage_service::join_topology(sharded<service::storage_proxy>& proxy,

    SCYLLA_ASSERT(_group0);

-    join_node_request_params join_params {
-        .host_id = _group0->load_my_id(),
-        .cluster_name = _db.local().get_config().cluster_name(),
-        .snitch_name = _db.local().get_snitch_name(),
-        .datacenter = _snitch.local()->get_datacenter(),
-        .rack = _snitch.local()->get_rack(),
-        .release_version = version::release(),
-        .num_tokens = _db.local().get_config().join_ring() ? _db.local().get_config().num_tokens() : 0,
-        .tokens_string = _db.local().get_config().join_ring() ? _db.local().get_config().initial_token() : sstring(),
-        .shard_count = smp::count,
-        .ignore_msb =  _db.local().get_config().murmur3_partitioner_ignore_msb_bits(),
-        .supported_features = _feature_service.supported_feature_set() | std::ranges::to<std::vector<sstring>>(),
-        .request_id = utils::UUID_gen::get_time_UUID(),
-    };
+    auto request_id = utils::UUID_gen::get_time_UUID();
+    if (!_group0->maintenance_mode() && !_group0->joined_group0()) {
+        join_node_request_params join_params {
+            .host_id = _group0->load_my_id(),
+            .cluster_name = _db.local().get_config().cluster_name(),
+            .snitch_name = _db.local().get_snitch_name(),
+            .datacenter = _snitch.local()->get_datacenter(),
+            .rack = _snitch.local()->get_rack(),
+            .release_version = version::release(),
+            .num_tokens = _db.local().get_config().join_ring() ? _db.local().get_config().num_tokens() : 0,
+            .tokens_string = _db.local().get_config().join_ring() ? _db.local().get_config().initial_token() : sstring(),
+            .shard_count = smp::count,
+            .ignore_msb =  _db.local().get_config().murmur3_partitioner_ignore_msb_bits(),
+            .supported_features = _feature_service.supported_feature_set() | std::ranges::to<std::vector<sstring>>(),
+            .request_id = request_id,
+        };

-    if (raft_replace_info) {
-        join_params.replaced_id = raft_replace_info->raft_id;
-        join_params.ignore_nodes = utils::split_comma_separated_list(_db.local().get_config().ignore_dead_nodes_for_replace());
-        if (!locator::check_host_ids_contain_only_uuid(join_params.ignore_nodes)) {
-            slogger.warn("Warning: Using IP addresses for '--ignore-dead-nodes-for-replace' is deprecated and will"
-                         " be disabled in a future release. Please use host IDs instead. Provided values: {}",
-                         _db.local().get_config().ignore_dead_nodes_for_replace());
+        if (raft_replace_info) {
+            join_params.replaced_id = raft_replace_info->raft_id;
+            join_params.ignore_nodes = utils::split_comma_separated_list(_db.local().get_config().ignore_dead_nodes_for_replace());
+            if (!locator::check_host_ids_contain_only_uuid(join_params.ignore_nodes)) {
+                slogger.warn("Warning: Using IP addresses for '--ignore-dead-nodes-for-replace' is deprecated and will"
+                            " be disabled in a future release. Please use host IDs instead. Provided values: {}",
+                            _db.local().get_config().ignore_dead_nodes_for_replace());
+            }
        }
-    }

-    // setup_group0 will do nothing if the node has already set up group 0 in setup_group0_if_exist in main.cc, which
-    // happens when the node is restarting and not joining the new group 0 in the Raft-based recovery procedure.
-    // It does not matter which handshaker we choose in this case since it will not be used.
-    //
-    // We use the legacy handshaker in the Raft-based recovery procedure to join the new group 0 without involving
-    // the topology coordinator. We can assume this node has already been accepted by the topology coordinator once
-    // and joined topology.
-    ::shared_ptr<group0_handshaker> handshaker =
-            !_db.local().get_config().recovery_leader.is_set()
-            ? ::make_shared<join_node_rpc_handshaker>(*this, join_params)
-            : _group0->make_legacy_handshaker(raft::is_voter::no);
-    co_await _group0->setup_group0(_sys_ks.local(), initial_contact_nodes, std::move(handshaker),
-            *this, _qp, _migration_manager.local(), join_params);
+        // We use the legacy handshaker in the Raft-based recovery procedure to join the new group 0 without involving
+        // the topology coordinator. We can assume this node has already been accepted by the topology coordinator once
+        // and joined topology.
+        ::shared_ptr<group0_handshaker> handshaker =
+                !_db.local().get_config().recovery_leader.is_set()
+                ? ::make_shared<join_node_rpc_handshaker>(*this, join_params)
+                : _group0->make_legacy_handshaker(raft::is_voter::no);
+        co_await _group0->setup_group0(_sys_ks.local(), initial_contact_nodes, std::move(handshaker),
+                *this, _qp, _migration_manager.local(), join_params);
+    }

    raft::server& raft_server = _group0->group0_server();

@@ -1700,7 +1699,7 @@ future<> storage_service::join_topology(sharded<service::storage_proxy>& proxy,
            throw std::runtime_error("Crashed in crash_before_topology_request_completion");
        });

-        auto err = co_await wait_for_topology_request_completion(join_params.request_id);
+        auto err = co_await wait_for_topology_request_completion(request_id);
        if (!err.empty()) {
            throw std::runtime_error(fmt::format("{} failed. See earlier errors ({})", raft_replace_info ? "Replace" : "Bootstrap", err));
        }
@@ -4494,10 +4493,20 @@ future<> storage_service::local_topology_barrier() {
                             version, current_version)));
        }

-        co_await ss._shared_token_metadata.stale_versions_in_use();
+        rtlogger.info("raft_topology_cmd::barrier_and_drain version {}: waiting for stale token metadata versions to be released", version);
+        {
+            seastar::timer<lowres_clock> warn_timer([&ss, version] {
+                rtlogger.warn("raft_topology_cmd::barrier_and_drain version {}: still waiting for stale versions, "
+                              "stale versions (version: use_count): {}",
+                              version, ss._shared_token_metadata.describe_stale_versions());
+            });
+            warn_timer.arm_periodic(std::chrono::minutes(5));
+            co_await ss._shared_token_metadata.stale_versions_in_use();
+        }
+        rtlogger.info("raft_topology_cmd::barrier_and_drain version {}: stale versions released, draining closing sessions", version);
        co_await get_topology_session_manager().drain_closing_sessions();

-        rtlogger.info("raft_topology_cmd::barrier_and_drain done");
+        rtlogger.info("raft_topology_cmd::barrier_and_drain version {}: done", version);
    });
 }

@@ -4509,7 +4518,9 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
        auto& raft_server = _group0->group0_server();
        auto group0_holder = _group0->hold_group0_gate();
        // do barrier to make sure we always see the latest topology
+        rtlogger.info("topology cmd rpc {} index={}: starting read_barrier, term={}", cmd.cmd, cmd_index, term);
        co_await raft_server.read_barrier(&_group0_as);
+        rtlogger.info("topology cmd rpc {} index={}: read_barrier completed", cmd.cmd, cmd_index);
        if (raft_server.get_current_term() != term) {
           // Return an error since the command is from outdated leader
           co_return result;
@@ -5949,18 +5960,12 @@ future<join_node_request_result> storage_service::join_node_request_handler(join
        if (const auto *p = _topology_state_machine._topology.find(params.host_id)) {
            const auto& rs = p->second;
            if (rs.state == node_state::left) {
-                rtlogger.warn("the node {} attempted to join",
-                        " but it was removed from the cluster. Rejecting"
-                        " the node",
-                        params.host_id);
+                rtlogger.warn("the node {} attempted to join but it was removed from the cluster. Rejecting the node", params.host_id);
                result.result = join_node_request_result::rejected{
                    .reason = "The node has already been removed from the cluster",
                };
            } else {
-                rtlogger.warn("the node {} attempted to join",
-                        " again after an unfinished attempt but it is no longer"
-                        " allowed to do so. Rejecting the node",
-                        params.host_id);
+                rtlogger.warn("the node {} attempted to join again after an unfinished attempt but it is no longer allowed to do so. Rejecting the node", params.host_id);
                result.result = join_node_request_result::rejected{
                    .reason = "The node requested to join before but didn't finish the procedure. "
                              "Please clear the data directory and restart.",
--- a/service/tablet_allocator.cc
+++ b/service/tablet_allocator.cc
@@ -2117,10 +2117,14 @@ public:
        co_return std::move(plan);
    }

+    // Returns the schema and tablet-aware replication strategy for a given table.
+    // Returns {nullptr, nullptr} if the table has been dropped concurrently (race between
+    // the token metadata snapshot and the live schema).
    std::tuple<schema_ptr, const tablet_aware_replication_strategy*> get_schema_and_rs(table_id table) {
        auto t = _db.get_tables_metadata().get_table_if_exists(table);
        if (!t) {
-            on_internal_error(lblogger, format("Table {} does not exist", table));
+            lblogger.debug("Table {} no longer exists, skipping", table);
+            return {nullptr, nullptr};
        }

        auto s = t->schema();
@@ -2135,6 +2139,8 @@ public:
        return {s, rs};
    }

+    // Returns the tablet-aware replication strategy for a given table, or nullptr
+    // if the table has been dropped concurrently.
    const tablet_aware_replication_strategy* get_rs(table_id id) {
        auto [s, rs] = get_schema_and_rs(id);
        return rs;
@@ -2158,6 +2164,7 @@ public:
        sstring target_tablet_count_reason; // Winning rule for target_tablet_count value.
        std::optional<uint64_t> avg_tablet_size; // nullopt when stats not yet available.
        bool pow2_count; // Whether tablet count for the table should be a power of two.
+        bool tablet_merges_allowed; // Whether merges are allowed for the table.

        // Final tablet count.
        // It's target_tablet_count aligned to power of 2 if pow2_count == true.
@@ -2312,6 +2319,17 @@ public:
            table_plan.current_tablet_count = tablet_count;
            table_plan.pow2_count = tablet_options.pow2_count.value_or(
                    _db.features().arbitrary_tablet_boundaries ? db::tablet_options::default_pow2_count : true);
+            table_plan.tablet_merges_allowed = !s->tablet_merges_forbidden();
+            if (!table_plan.tablet_merges_allowed) {
+                // Block merge decisions for Alternator tablet tables whose
+                // stream configuration forbids merges. Tablet merges produce
+                // 2 parents per child which is incompatible with the DynamoDB
+                // Streams API. If a merge is already in progress on the tmap,
+                // suppressing new_resize_decision here causes the existing
+                // revocation logic in tables_being_resized to cancel the merge.
+                lblogger.debug("Table {} ({}.{}): suppressing new merge decision because tablet merges are forbidden",
+                            table, s->ks_name(), s->cf_name());
+            }

            rs_by_table[table] = rs;

@@ -2419,6 +2437,9 @@ public:
            }
            const auto& tmap = _tm->tablets().get_tablet_map(table);
            auto [s, rs] = get_schema_and_rs(table);
+            if (s == nullptr || rs == nullptr) {
+                continue;
+            }
            auto tablet_options = combine_tablet_options(
                    tables | std::views::transform([&] (table_id table) { return _db.get_tables_metadata().get_table_if_exists(table); })
                           | std::views::filter([] (auto t) { return t != nullptr; })
@@ -2551,7 +2572,7 @@ public:
            } else if (table_plan.target_tablet_count_aligned < table_plan.current_tablet_count) {
                // Needed to avoid oscillations, because we reduce the count by a factor of 2.
                // FIXME: Once we have a way to split individual tablets, we can achieve exactly the desired tablet count.
-                if (div_ceil(table_plan.current_tablet_count, 2) >= table_plan.target_tablet_count_aligned) {
+                if (table_plan.tablet_merges_allowed && div_ceil(table_plan.current_tablet_count, 2) >= table_plan.target_tablet_count_aligned) {
                    auto& tmap = _tm->tablets().get_tablet_map(table);
                    auto cur_decision = tmap.resize_decision();
                    if (cur_decision.is_merge()) {
@@ -2601,21 +2622,6 @@ public:
            resize_decision new_resize_decision;
            new_resize_decision.way = table_plan.resize_decision;

-            // Block merge decisions for Alternator tablet tables whose
-            // stream configuration forbids merges. Tablet merges produce
-            // 2 parents per child which is incompatible with the DynamoDB
-            // Streams API. If a merge is already in progress on the tmap,
-            // suppressing new_resize_decision here causes the existing
-            // revocation logic in tables_being_resized to cancel the merge.
-            if (new_resize_decision.is_merge()) {
-                auto [s, rs] = get_schema_and_rs(table);
-                if (s->tablet_merges_forbidden()) {
-                    lblogger.debug("Table {} ({}.{}): suppressing new merge decision because tablet merges are forbidden",
-                                   table, s->ks_name(), s->cf_name());
-                    new_resize_decision = {};
-                }
-            }
-
            table_size_desc size_desc {
                .avg_tablet_size = *table_plan.avg_tablet_size,
                .resize_decision = tmap.resize_decision(),
@@ -3287,6 +3293,10 @@ public:
        std::unordered_map<sstring, int> rack_load;

        auto rs = get_rs(tablet.table);
+        if (rs == nullptr) {
+            // Table was dropped concurrently. Skip this tablet.
+            return skip_info{};
+        }

        auto get_viable_targets = [&] () {
            std::unordered_set<host_id> viable_targets;
--- a/service/topology_coordinator.cc
+++ b/service/topology_coordinator.cc
@@ -4237,6 +4237,7 @@ public:
        , _topology_cmd_rpc_tracker(topology_cmd_rpc_tracker)
        , _async_gate("topology_coordinator")
    {
+        _lifecycle_notifier.register_subscriber(this);
        _db.get_notifier().register_listener(this);
        // When the delay_cdc_stream_finalization error injection is disabled
        // (test releases it), wake the topology coordinator so it retries
@@ -4400,6 +4401,7 @@ future<bool> topology_coordinator::maybe_retry_failed_rf_change_tablet_rebuilds(
 }

 future<> topology_coordinator::refresh_tablet_load_stats() {
+    co_await utils::get_local_injector().inject("refresh_tablet_load_stats_pause", utils::wait_for_message(5min));
    auto tm = get_token_metadata_ptr();

    locator::load_stats stats;
@@ -4723,7 +4725,6 @@ future<> topology_coordinator::run() {

    co_await _async_gate.close();
    co_await std::move(tablet_load_stats_refresher);
-    co_await _tablet_load_stats_refresh.join();
    co_await std::move(cdc_generation_publisher);
    co_await std::move(cdc_streams_gc);
    co_await std::move(gossiper_orphan_remover);
@@ -4736,6 +4737,8 @@ future<> topology_coordinator::stop() {
    co_await _db.get_notifier().unregister_listener(this);
    utils::get_local_injector().unregister_on_disable("delay_cdc_stream_finalization");
    _topo_sm.on_tablet_split_ready = nullptr;
+    co_await _lifecycle_notifier.unregister_subscriber(this);
+    co_await _tablet_load_stats_refresh.join();

    // if topology_coordinator::run() is aborted either because we are not a
    // leader anymore, or we are shutting down as a leader, we have to handle
@@ -4797,7 +4800,6 @@ future<> run_topology_coordinator(
            topology_cmd_rpc_tracker};

    std::exception_ptr ex;
-    lifecycle_notifier.register_subscriber(&coordinator);
    try {
        rtlogger.info("start topology coordinator fiber");
        co_await with_scheduling_group(group0.get_scheduling_group(), [&] {
@@ -4818,7 +4820,7 @@ future<> run_topology_coordinator(
        }
        on_fatal_internal_error(rtlogger, format("unhandled exception in topology_coordinator::run: {}", ex));
    }
-    co_await lifecycle_notifier.unregister_subscriber(&coordinator);
+    co_await utils::get_local_injector().inject("topology_coordinator_pause_before_stop", utils::wait_for_message(5min));
    co_await coordinator.stop();
 }

--- a/sstables/mx/partition_reversing_data_source.cc
+++ b/sstables/mx/partition_reversing_data_source.cc
@@ -502,7 +502,7 @@ public:
                }
                if (_row_start != _partition_end) {
                    on_internal_error(sstlog, format(
-                        "partition_reversing_data_source: invariant broken: _row_start == _row_end({}), but"
+                        "partition_reversing_data_source: invariant broken: _row_start({}) == _row_end({}), but"
                        " != _partition_end({})", _row_start, _row_end, _partition_end));
                }
                look_in_last_block = true;
--- a/sstables/mx/reader.cc
+++ b/sstables/mx/reader.cc
@@ -505,7 +505,7 @@ public:
            return consume_range_tombstone_boundary(std::move(pos), end_tombstone, start_tombstone);
        }
        default:
-            on_parse_error(format("Invalid boundary type", static_cast<std::underlying_type<sstables::bound_kind_m>::type>(kind)), _sst->get_filename());
+            on_parse_error(format("Invalid boundary type {}", static_cast<std::underlying_type<sstables::bound_kind_m>::type>(kind)), _sst->get_filename());
        }
    }

@@ -2221,7 +2221,7 @@ public:
        case bound_kind_m::excl_end_incl_start:
            return consume_range_tombstone(ecp, bound_kind::incl_start, start_tombstone);
        default:
-            on_parse_error(format("Invalid boundary type", static_cast<std::underlying_type_t<bound_kind_m>>(kind)), {});
+            on_parse_error(format("Invalid boundary type {}", static_cast<std::underlying_type_t<bound_kind_m>>(kind)), {});
        }
    }

--- a/sstables/storage.cc
+++ b/sstables/storage.cc
@@ -543,11 +543,16 @@ future<> filesystem_storage::wipe(const sstable& sst, sync_dir sync) noexcept {
            // during SSTable writing and removed before sealing.  If the write
            // failed before sealing, the file may still be on disk and must be
            // cleaned up explicitly.
+            // The component is only defined for the `ms` sstable format; for
+            // older formats it is absent from the component map and looking up
+            // its filename would throw std::out_of_range.
            // Use file_exists() to avoid a C++ exception on the common path
            // where the file was already removed before sealing.
-            auto temp_hashes = filename(sst, dir_name.native(), sst._generation, component_type::TemporaryHashes);
-            if (co_await file_exists(temp_hashes)) {
-                co_await sst.sstable_write_io_check(remove_file, std::move(temp_hashes));
+            if (sstable_version_constants::get_component_map(sst.get_version()).contains(component_type::TemporaryHashes)) {
+                auto temp_hashes = filename(sst, dir_name.native(), sst._generation, component_type::TemporaryHashes);
+                if (co_await file_exists(temp_hashes)) {
+                    co_await sst.sstable_write_io_check(remove_file, std::move(temp_hashes));
+                }
            }
            if (sync) {
                co_await sst.sstable_write_io_check(sync_directory, dir_name.native());
--- a/sstables/trie/trie_writer.hh
+++ b/sstables/trie/trie_writer.hh
@@ -32,6 +32,7 @@

 #pragma once

+#include <seastar/core/thread.hh>
 #include <seastar/util/log.hh>
 #include <map>
 #include <set>
@@ -254,6 +255,7 @@ inline void trie_writer<Output>::lay_out_children(ptr<writer_node> x) {
    }

    while (unwritten_children.size()) {
+        seastar::thread::maybe_yield();
        // Find the smallest child which doesn't fit.
        // (If all fit, then this will be the past-the-end iterator).
        // Its predecessor will be the biggest child which does fit.
@@ -350,6 +352,7 @@ template <trie_writer_sink Output>
 inline void trie_writer<Output>::complete_until_depth(size_t depth) {
    expensive_log("writer_node::complete_until_depth: start,_stack={}, depth={}, _current_depth={}", _stack.size(), depth, _current_depth);
    while (_current_depth > depth) {
+        seastar::thread::maybe_yield();
        // Every node must be smaller than a page, and the transition chain
        // must be short enough to ensure that.
        //
--- a/table_helper.cc
+++ b/table_helper.cc
@@ -9,6 +9,7 @@

 #include "cql3/statements/property_definitions.hh"
 #include "utils/assert.hh"
+#include "utils/error_injection.hh"
 #include <seastar/core/coroutine.hh>
 #include <seastar/coroutine/parallel_for_each.hh>
 #include "table_helper.hh"
@@ -135,10 +136,32 @@ future<> table_helper::cache_table_info(cql3::query_processor& qp, service::migr
 }

 future<> table_helper::insert(cql3::query_processor& qp, service::migration_manager& mm, service::query_state& qs, noncopyable_function<cql3::query_options ()> opt_maker) {
-    co_await cache_table_info(qp, mm, qs);
+    // _prepared_stmt is a checked_weak_ptr into the prepared statements
+    // cache and can be invalidated by a concurrent purge (e.g. on a schema
+    // change). cache_table_info() (re-)prepares and assigns _prepared_stmt,
+    // but the pin protecting the entry is dropped when try_prepare()
+    // returns. In release the chain of ready-future co_awaits back to here
+    // resumes synchronously, but debug builds preempt on every co_await
+    // even for ready futures, opening a window for a purge to drop the
+    // entry and leave _prepared_stmt null. Loop until a synchronous
+    // post-resume check finds _prepared_stmt valid; nothing can run between
+    // that check and the dereference below. _insert_stmt is a strong
+    // shared_ptr and is not affected by cache invalidation.
+    while (true) {
+        co_await cache_table_info(qp, mm, qs);
+        if (_prepared_stmt) {
+            break;
+        }
+    }
+    // Pin a strong ref locally: while we suspend in execute(), a concurrent
+    // insert() on this shard may reset _insert_stmt to nullptr if the
+    // prepared_statements_cache entry gets invalidated, freeing the object.
+    auto stmt = _insert_stmt;
    auto opts = opt_maker();
    opts.prepare(_prepared_stmt->bound_names);
-    co_await _insert_stmt->execute(qp, qs, opts, std::nullopt);
+    co_await utils::get_local_injector().inject("table_helper_insert_before_execute",
+            utils::wait_for_message(std::chrono::seconds{30}));
+    co_await stmt->execute(qp, qs, opts, std::nullopt);
 }

 future<> table_helper::setup_keyspace(cql3::query_processor& qp, service::migration_manager& mm, std::string_view keyspace_name, sstring replication_strategy_name,
--- a/test.py
+++ b/test.py
@@ -11,9 +11,11 @@ from __future__ import annotations

 import argparse
 import asyncio
+import dataclasses
 import math
 import shlex
 import textwrap
+from bisect import insort
 from random import randint

 import pytest
@@ -183,6 +185,8 @@ def parse_cmd_line() -> argparse.Namespace:
                        help="Specific byte limit for failure injection (random by default)")
    parser.add_argument('--skip-internet-dependent-tests', action="store_true",
                        help="Skip tests which depend on artifacts from the internet.")
+    parser.add_argument('--keep-duplicates', action='store_true', default=False,
+                        help="Do not deduplicate test arguments.")
    parser.add_argument("--pytest-arg", action='store', type=str,
                        default=None, dest="pytest_arg",
                        help="Additional command line arguments to pass to pytest, for example ./test.py --pytest-arg=\"-v -x\"")
@@ -241,6 +245,73 @@ def parse_cmd_line() -> argparse.Namespace:
    return args


+# TODO: Remove _CollectionArgument and _deduplicate_test_args once we update
+# to pytest 9.x, which fixes argument deduplication:
+# https://github.com/pytest-dev/pytest/issues/12083
+@dataclasses.dataclass(frozen=True, order=True)
+class _CollectionArgument:
+    """Resolved collection argument for deduplication.
+
+    A version-independent subset of pytest's CollectionArgument that
+    includes the fields needed for normalization (parametrization and
+    original_index were added in pytest 9.0).
+
+    ``a in b`` means ``b`` subsumes (contains) ``a``.  Adapted from
+    pytest 9.0.3 ``_pytest.main.is_collection_argument_subsumed_by``.
+    """
+    path: pathlib.Path
+    parts: tuple[str, ...]
+    parametrization: str
+    original_index: int
+
+    def __contains__(self, other: _CollectionArgument) -> bool:
+        if self.path != other.path:
+            return not self.parts and other.path.is_relative_to(self.path)
+        if len(self.parts) > len(other.parts) or other.parts[:len(self.parts)] != self.parts:
+            return False
+        return not self.parametrization or self.parametrization == other.parametrization
+
+
+def _deduplicate_test_args(args: list[str]) -> list[str]:
+    """Remove duplicate and subsumed test arguments.
+
+    Resolves and normalizes CLI test arguments, then applies the normalization
+    algorithm from pytest 9.0.3 to remove exact duplicates and arguments whose
+    paths are contained within another argument's path.
+    For example, ``["test/cql", "test/cql/lua_test.cql"]`` becomes ``["test/cql"]``.
+    """
+    if not args:
+        return args
+    invocation_path = pathlib.Path.cwd()
+    resolved_sorted: list[_CollectionArgument] = []
+    unresolved_indices: set[int] = set()
+    for i, arg in enumerate(args):
+        # Adapted from pytest 9.0.3 _pytest.main.resolve_collection_argument.
+        base, squacket, rest = arg.partition("[")
+        strpath, *parts = base.split("::")
+        fspath = pathlib.Path(os.path.abspath(invocation_path / strpath))
+        if not fspath.exists():
+            # Keep unresolved args — let pytest report the error.
+            unresolved_indices.add(i)
+            continue
+        insort(resolved_sorted, _CollectionArgument(
+            path=fspath,
+            parts=tuple(parts),
+            parametrization=squacket + rest,
+            original_index=i,
+        ))
+
+    # Normalize: remove duplicates and subsumed arguments using an O(n log n)
+    # sort-based algorithm adapted from pytest 9.0.3.
+    normalized = resolved_sorted[:1]
+    for ca in resolved_sorted[1:]:
+        if ca not in normalized[-1]:
+            normalized.append(ca)
+
+    kept_indices = {ca.original_index for ca in normalized} | unresolved_indices
+    return [arg for i, arg in enumerate(args) if i in kept_indices]
+
+
 def run_pytest(options: argparse.Namespace) -> int:
    # When tests are executed in parallel on different hosts, we need to distinguish results from them.
    # So HOST_ID needed to not overwrite results from different hosts during Jenkins will copy to one directory.
@@ -249,7 +320,8 @@ def run_pytest(options: argparse.Namespace) -> int:

    report_dir =  temp_dir / 'report'
    junit_output_file = report_dir / f'pytest_cpp_{HOST_ID}.xml'
-    files_to_run = options.name or [str(TOP_SRC_DIR / 'test/')]
+    files_to_run = options.name if options.keep_duplicates else _deduplicate_test_args(options.name)
+    files_to_run = files_to_run or [str(TOP_SRC_DIR / 'test/')]
    args = [
        '--color=yes',
        f'--repeat={options.repeat}',
@@ -269,6 +341,8 @@ def run_pytest(options: argparse.Namespace) -> int:
        ])
    if options.verbose:
        args.append('-v')
+    if options.keep_duplicates:
+        args.append('--keep-duplicates')
    if options.quiet:
        args.append('--quiet')
        args.extend(['-p','no:sugar'])
--- a/test/alternator/run
+++ b/test/alternator/run
@@ -70,11 +70,6 @@ def run_alternator_cmd(pid, dir):
        # now that this parameter is used also by CQL's per-row TTL.
        #'--alternator-ttl-period-in-seconds', '0.5',
        '--alternator-allow-system-table-write=1',
-        # Allow testing experimental features. Following issue #9467, we need
-        # to add here specific experimental features as they are introduced.
-        # We only list here Alternator-specific experimental features - CQL
-        # ones are listed in test/cqlpy/run.py.
-        '--experimental-features=alternator-streams',
        # this is required by test_streams.py test_parent_filtering and test_get_records_with_alternating_tablets_count
        # setting the value using scylla_config_temporary won't work, because the value is read
        # at the start and then periodically with `tablet-load-stats-refresh-interval-in-seconds`
--- a/Show More
+++ b/Show More