Compare commits

..

2 Commits

Author SHA1 Message Date
copilot-swe-agent[bot]
93fbc0a683 docs: fix typo in materialized views docs - "columns are" instead of "is"
Agent-Logs-Url: https://github.com/scylladb/scylladb/sessions/bcc29e46-1902-4ac6-9a16-4b7e3d03421a

Co-authored-by: annastuchlik <37244380+annastuchlik@users.noreply.github.com>
2026-04-27 14:19:39 +00:00
copilot-swe-agent[bot]
520466b407 Initial plan 2026-04-27 14:18:58 +00:00
178 changed files with 1757 additions and 3308 deletions

4
.gitignore vendored
View File

@@ -36,6 +36,4 @@ compile_commands.json
clang_build
.idea/
nuke
rust/**/target
rust/**/Cargo.lock
test/resource/wasm/rust/target
rust/target

View File

@@ -681,7 +681,7 @@ static bool calculate_primitive_condition(const parsed::primitive_condition& con
case parsed::primitive_condition::type::VALUE:
if (calculated_values.size() != 1) {
// Shouldn't happen unless we have a bug in the parser
throw std::logic_error(format("Unexpected values {} in primitive_condition", cond._values.size()));
throw std::logic_error(format("Unexpected values in primitive_condition", cond._values.size()));
}
// Unwrap the boolean wrapped as the value (if it is a boolean)
if (calculated_values[0].IsObject() && calculated_values[0].MemberCount() == 1) {

View File

@@ -1362,33 +1362,6 @@ static int get_dimensions(const rjson::value& vector_attribute, std::string_view
return dimensions_v->GetInt();
}
// As noted in issue #5052, in Alternator the CreateTable and UpdateTable are
// currently synchronous - they return only after the operation is complete.
// After announce() of the new schema finished, the schema change is committed
// and a majority of nodes know it - but it's possible that some live nodes
// have not yet applied the new schema. If we return to the user now, and the
// user sends a new request that relies on the new schema, it might fail.
// So before returning, we must verify that *all* nodes have applied the new
// schema. This is what wait_for_schema_agreement_after_ddl() does.
//
// Note that wait_for_schema_agreement_after_ddl() has a timeout (currently
// hard-coded to 30 seconds). If the timeout is reached an InternalServerError
// is returned. The user, who doesn't know if the CreateTable succeeded or not,
// can retry the request and will get a ResourceInUseException and know the
// table already exists. So a CreateTable that returns a ResourceInUseException
// should also call wait_for_schema_agreement_after_ddl().
//
// When issue #5052 is resolved, this function can be removed - we will need
// to check if we reached schema agreement, but not to *wait* for it.
static future<> wait_for_schema_agreement_after_ddl(service::migration_manager& mm, const replica::database& db) {
static constexpr auto schema_agreement_seconds = 30;
try {
co_await mm.wait_for_schema_agreement(db, db::timeout_clock::now() + std::chrono::seconds(schema_agreement_seconds), nullptr);
} catch (const service::migration_manager::schema_agreement_timeout&) {
throw api_error::internal(fmt::format("The operation was successful, but unable to confirm cluster-wide schema agreement after {} seconds. Please retry the operation, and wait for the retry to report an error since the operation was already done.", schema_agreement_seconds));
}
}
future<executor::request_return_type> executor::create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request, bool enforce_authorization, bool warn_authorization,
const db::tablets_mode_t::mode tablets_mode, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
throwing_assert(this_shard_id() == 0);
@@ -1722,26 +1695,13 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
}
}
}
bool table_already_exists = false;
try {
schema_mutations = service::prepare_new_keyspace_announcement(_proxy.local_db(), ksm, ts);
} catch (exceptions::already_exists_exception&) {
if (_proxy.data_dictionary().has_schema(keyspace_name, table_name)) {
table_already_exists = true;
co_return api_error::resource_in_use(fmt::format("Table {} already exists", table_name));
}
}
if (table_already_exists) {
// The user may have retried a CreateTable operation after it timed
// out in wait_for_schema_agreement_after_ddl(). So before we may
// return ResourceInUseException (which can lead the user to start
// using the table which it now knows exists), we need to wait for
// schema agreement, just like the original CreateTable did. Again
// we fail with InternalServerError if schema agreement still cannot
// be reached. We can release group0_guard before waiting.
release_guard(std::move(group0_guard));
co_await wait_for_schema_agreement_after_ddl(_mm, _proxy.local_db());
co_return api_error::resource_in_use(fmt::format("Table {} already exists", table_name));
}
if (_proxy.data_dictionary().try_find_table(schema->id())) {
// This should never happen, the ID is supposed to be unique
co_return api_error::internal(format("Table with ID {} already exists", schema->id()));
@@ -1790,7 +1750,7 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
}
}
co_await wait_for_schema_agreement_after_ddl(_mm, _proxy.local_db());
co_await _mm.wait_for_schema_agreement(_proxy.local_db(), db::timeout_clock::now() + 10s, nullptr);
rjson::value status = rjson::empty_object();
executor::supplement_table_info(request, *schema, _proxy);
rjson::add(status, "TableDescription", std::move(request));
@@ -1900,7 +1860,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
if (stream_specification && stream_specification->IsObject()) {
empty_request = false;
if (add_stream_options(*stream_specification, builder, p.local(), tab->cdc_options())) {
if (add_stream_options(*stream_specification, builder, p.local())) {
validate_cdc_log_name_length(builder.cf_name());
// On tablet tables, defer stream enablement and block
// tablet merges (see defer_enabling_streams_block_tablet_merges).
@@ -1915,23 +1875,6 @@ future<executor::request_return_type> executor::update_table(client_state& clien
if (tab->cdc_options().enabled() || tab->cdc_options().enable_requested()) {
co_return api_error::validation("Table already has an enabled stream: TableName: " + tab->cf_name());
}
// When re-enabling streams on an Alternator table, drop the old
// CDC log table first as a separate schema change, so the
// subsequent UpdateTable creates a fresh one with a new UUID
// (= new StreamArn). See #7239.
auto logname = cdc::log_name(tab->cf_name());
auto& local_db = p.local().local_db();
if (local_db.has_schema(tab->ks_name(), logname)
&& cdc::is_log_schema(*local_db.find_schema(tab->ks_name(), logname))) {
auto drop_m = co_await service::prepare_column_family_drop_announcement(
p.local(), tab->ks_name(), logname,
group0_guard.write_timestamp());
co_await mm.announce(std::move(drop_m), std::move(group0_guard),
format("alternator-executor: drop old CDC log for {}", tab->cf_name()));
co_await mm.wait_for_schema_agreement(
p.local().local_db(), db::timeout_clock::now() + 10s, nullptr);
continue;
}
}
else if (!tab->cdc_options().enabled() && !tab->cdc_options().enable_requested()) {
co_return api_error::validation("Table has no stream to disable: TableName: " + tab->cf_name());
@@ -2246,7 +2189,7 @@ future<executor::request_return_type> executor::update_table(client_state& clien
throw;
}
}
co_await wait_for_schema_agreement_after_ddl(mm, p.local().local_db());
co_await mm.wait_for_schema_agreement(p.local().local_db(), db::timeout_clock::now() + 10s, nullptr);
rjson::value status = rjson::empty_object();
supplement_table_info(request, *schema, p.local());

View File

@@ -30,7 +30,6 @@
#include "utils/updateable_value.hh"
#include "tracing/trace_state.hh"
#include "cdc/cdc_options.hh"
namespace db {
@@ -200,7 +199,7 @@ private:
tracing::trace_state_ptr trace_state, service_permit permit);
public:
static bool add_stream_options(const rjson::value& stream_spec, schema_builder&, service::storage_proxy& sp, const cdc::options& existing_cdc_opts = {});
static bool add_stream_options(const rjson::value& stream_spec, schema_builder&, service::storage_proxy& sp);
static void supplement_table_info(rjson::value& descr, const schema& schema, service::storage_proxy& sp);
static void supplement_table_stream_info(rjson::value& descr, const schema& schema, const service::storage_proxy& sp);
};

View File

@@ -243,10 +243,7 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
if (!is_alternator_keyspace(ks_name)) {
continue;
}
// Use get_base_table instead of is_log_for_some_table because the
// latter requires CDC to be enabled, but we want to list streams
// that have been disabled but whose log table still exists (#7239).
if (cdc::get_base_table(db.real_database(), ks_name, cf_name)) {
if (cdc::is_log_for_some_table(db.real_database(), ks_name, cf_name)) {
rjson::value new_entry = rjson::empty_object();
auto arn = stream_arn{ i->schema(), cdc::get_base_table(db.real_database(), *i->schema()) };
@@ -395,7 +392,7 @@ std::istream& operator>>(std::istream& is, stream_view_type& type) {
return is;
}
static stream_view_type cdc_options_to_stream_view_type(const cdc::options& opts) {
static stream_view_type cdc_options_to_steam_view_type(const cdc::options& opts) {
stream_view_type type = stream_view_type::KEYS_ONLY;
if (opts.preimage() && opts.postimage()) {
type = stream_view_type::NEW_AND_OLD_IMAGES;
@@ -841,7 +838,6 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
auto& opts = bs->cdc_options();
auto status = "DISABLED";
bool stream_disabled = !opts.enabled();
if (opts.enabled()) {
if (!_cdc_metadata.streams_available()) {
@@ -857,7 +853,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
rjson::add(stream_desc, "StreamStatus", rjson::from_string(status));
stream_view_type type = cdc_options_to_stream_view_type(opts);
stream_view_type type = cdc_options_to_steam_view_type(opts);
rjson::add(stream_desc, "StreamArn", stream_arn);
rjson::add(stream_desc, "StreamViewType", type);
@@ -865,9 +861,10 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
describe_key_schema(stream_desc, *bs);
// For disabled streams, we still fall through to enumerate shards
// below. All shards will have EndingSequenceNumber set, indicating
// they are closed. See issue #7239.
if (!opts.enabled()) {
rjson::add(ret, "StreamDescription", std::move(stream_desc));
co_return rjson::print(std::move(ret));
}
// TODO: label
// TODO: creation time
@@ -950,12 +947,6 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
auto expired = [&]() -> std::optional<db_clock::time_point> {
auto j = std::next(i);
if (j == e) {
// For a disabled stream, all shards are closed (#7239).
// Use "now" as the ending sequence number for the last
// generation's shards.
if (stream_disabled) {
return db_clock::now();
}
return std::nullopt;
}
// add this so we sort of match potential
@@ -1306,7 +1297,7 @@ future<executor::request_return_type> executor::get_records(client_state& client
| std::ranges::to<query::column_id_vector>()
;
stream_view_type type = cdc_options_to_stream_view_type(base->cdc_options());
stream_view_type type = cdc_options_to_steam_view_type(base->cdc_options());
auto selection = cql3::selection::selection::for_columns(schema, std::move(columns));
auto partition_slice = query::partition_slice(
@@ -1490,17 +1481,17 @@ future<executor::request_return_type> executor::get_records(client_state& client
auto& shard = iter.shard;
if (!base->cdc_options().enabled()) {
// Stream is disabled -- all shards are closed (#7239).
// Don't return NextShardIterator.
} else if (shard.time < ts && ts < high_ts) {
if (shard.time < ts && ts < high_ts) {
// The DynamoDB documentation states that when a shard is
// closed, reading it until the end has NextShardIterator
// "set to null". Our test test_streams_closed_read
// confirms that by "null" they meant not set at all.
} else {
// Shard is still open with no records in the scanned window.
// Return the original iterator so the client can poll again.
// We could have returned the same iterator again, but we did
// a search from it until high_ts and found nothing, so we
// can also start the next search from high_ts.
// TODO: but why? It's simpler just to leave the iterator be.
shard_iterator next_iter(iter.table, iter.shard, utils::UUID_gen::min_time_UUID(high_ts.time_since_epoch()), true);
rjson::add(ret, "NextShardIterator", iter);
}
_stats.api_operations.get_records_latency.mark(std::chrono::steady_clock::now() - start_time);
@@ -1510,13 +1501,17 @@ future<executor::request_return_type> executor::get_records(client_state& client
co_return rjson::print(std::move(ret));
}
bool executor::add_stream_options(const rjson::value& stream_specification, schema_builder& builder, service::storage_proxy& sp, const cdc::options& existing_cdc_opts) {
bool executor::add_stream_options(const rjson::value& stream_specification, schema_builder& builder, service::storage_proxy& sp) {
auto stream_enabled = rjson::find(stream_specification, "StreamEnabled");
if (!stream_enabled || !stream_enabled->IsBool()) {
throw api_error::validation("StreamSpecification needs boolean StreamEnabled");
}
if (stream_enabled->GetBool()) {
if (!sp.features().alternator_streams) {
throw api_error::validation("StreamSpecification: alternator streams feature not enabled in cluster.");
}
cdc::options opts;
opts.enabled(true);
opts.tablet_merge_blocked(true);
@@ -1542,13 +1537,8 @@ bool executor::add_stream_options(const rjson::value& stream_specification, sche
builder.with_cdc_options(opts);
return true;
} else {
// When disabling, preserve the existing CDC options (preimage,
// postimage, ttl, etc.) so that DescribeStream can still report
// the correct StreamViewType on a disabled stream.
cdc::options opts = existing_cdc_opts;
cdc::options opts;
opts.enabled(false);
opts.enable_requested(false);
opts.tablet_merge_blocked(false);
builder.with_cdc_options(opts);
return false;
}
@@ -1556,36 +1546,33 @@ bool executor::add_stream_options(const rjson::value& stream_specification, sche
void executor::supplement_table_stream_info(rjson::value& descr, const schema& schema, const service::storage_proxy& sp) {
auto& opts = schema.cdc_options();
// Report stream info when:
// 1. Log table exists (covers both enabled and disabled-but-readable).
// 2. enable_requested (ENABLING state, log not yet created).
auto db = sp.data_dictionary();
auto log_name = cdc::log_name(schema.cf_name());
auto log_cf = db.try_find_table(schema.ks_name(), log_name);
if (log_cf) {
auto log_schema = log_cf->schema();
stream_arn arn(log_schema, cdc::get_base_table(db.real_database(), *log_schema));
if (opts.enabled()) {
auto db = sp.data_dictionary();
auto cf = db.find_table(schema.ks_name(), cdc::log_name(schema.cf_name()));
stream_arn arn(cf.schema(), cdc::get_base_table(db.real_database(), *cf.schema()));
rjson::add(descr, "LatestStreamArn", arn);
rjson::add(descr, "LatestStreamLabel", rjson::from_string(stream_label(*log_schema)));
auto stream_desc = rjson::empty_object();
rjson::add(stream_desc, "StreamEnabled", opts.enabled());
stream_view_type mode = cdc_options_to_stream_view_type(opts);
rjson::add(stream_desc, "StreamViewType", mode);
rjson::add(descr, "StreamSpecification", std::move(stream_desc));
} else if (opts.enable_requested()) {
// DynamoDB returns StreamEnabled=true in StreamSpecification even when
// the stream status is ENABLING (not yet fully active). We mirror this
// behavior: enable_requested means the user asked for streams but CDC
// is not yet finalized, so we still report StreamEnabled=true.
auto stream_desc = rjson::empty_object();
rjson::add(stream_desc, "StreamEnabled", true);
stream_view_type mode = cdc_options_to_stream_view_type(opts);
rjson::add(stream_desc, "StreamViewType", mode);
rjson::add(descr, "StreamSpecification", std::move(stream_desc));
rjson::add(descr, "LatestStreamLabel", rjson::from_string(stream_label(*cf.schema())));
} else if (!opts.enable_requested()) {
return;
}
// For both enabled() and enable_requested():
// DynamoDB returns StreamEnabled=true in StreamSpecification even when
// the stream status is ENABLING (not yet fully active). We mirror this
// behavior: enable_requested means the user asked for streams but CDC
// is not yet finalized, so we still report StreamEnabled=true.
auto stream_desc = rjson::empty_object();
rjson::add(stream_desc, "StreamEnabled", true);
auto mode = stream_view_type::KEYS_ONLY;
if (opts.preimage() && opts.postimage()) {
mode = stream_view_type::NEW_AND_OLD_IMAGES;
} else if (opts.preimage()) {
mode = stream_view_type::OLD_IMAGE;
} else if (opts.postimage()) {
mode = stream_view_type::NEW_IMAGE;
}
rjson::add(stream_desc, "StreamViewType", mode);
rjson::add(descr, "StreamSpecification", std::move(stream_desc));
}
} // namespace alternator

View File

@@ -194,36 +194,22 @@ future<> audit::start_audit(const db::config& cfg, sharded<locator::shared_token
std::move(audited_keyspaces),
std::move(audited_tables),
std::move(audited_categories),
std::cref(cfg));
}
future<> audit::start_storage(const db::config& cfg) {
if (!audit_instance().local_is_initialized()) {
return make_ready_future<>();
}
return audit_instance().invoke_on_all([&cfg] (audit& local_audit) {
return local_audit._storage_helper_ptr->start(cfg).then([&local_audit] {
local_audit._storage_running = true;
std::cref(cfg))
.then([&cfg] {
if (!audit_instance().local_is_initialized()) {
return make_ready_future<>();
}
return audit_instance().invoke_on_all([&cfg] (audit& local_audit) {
return local_audit.start(cfg);
});
});
}
future<> audit::stop_storage() {
if (!audit_instance().local_is_initialized()) {
return make_ready_future<>();
}
return audit_instance().invoke_on_all([] (audit& local_audit) {
local_audit._storage_running = false;
return local_audit._storage_helper_ptr->stop();
});
}
future<> audit::stop_audit() {
if (!audit_instance().local_is_initialized()) {
return make_ready_future<>();
}
return audit::audit::audit_instance().invoke_on_all([] (auto& local_audit) {
SCYLLA_ASSERT(!local_audit._storage_running);
return local_audit.shutdown();
}).then([] {
return audit::audit::audit_instance().stop();
@@ -237,6 +223,14 @@ audit_info_ptr audit::create_audit_info(statement_category cat, const sstring& k
return std::make_unique<audit_info>(cat, keyspace, table, batch);
}
future<> audit::start(const db::config& cfg) {
return _storage_helper_ptr->start(cfg);
}
future<> audit::stop() {
return _storage_helper_ptr->stop();
}
future<> audit::shutdown() {
return make_ready_future<>();
}
@@ -247,12 +241,6 @@ future<> audit::log(const audit_info& audit_info, const service::client_state& c
const sstring& username = client_state.user() ? client_state.user()->name.value_or(anonymous_username) : no_username;
socket_address client_ip = client_state.get_client_address().addr();
socket_address node_ip = _token_metadata.get()->get_topology().my_address().addr();
if (!_storage_running) {
on_internal_error_noexcept(logger, fmt::format("Audit log dropped (storage not ready): node_ip {} category {} cl {} error {} keyspace {} query '{}' client_ip {} table {} username {}",
node_ip, audit_info.category_string(), cl, error, audit_info.keyspace(),
audit_info.query(), client_ip, audit_info.table(), username));
return make_ready_future<>();
}
if (logger.is_enabled(logging::log_level::debug)) {
logger.debug("Log written: node_ip {} category {} cl {} error {} keyspace {} query '{}' client_ip {} table {} username {}",
node_ip, audit_info.category_string(), cl, error, audit_info.keyspace(),
@@ -298,11 +286,6 @@ future<> inspect(const audit_info_alternator& ai, const service::client_state& c
future<> audit::log_login(const sstring& username, socket_address client_ip, bool error) noexcept {
socket_address node_ip = _token_metadata.get()->get_topology().my_address().addr();
if (!_storage_running) {
on_internal_error_noexcept(logger, fmt::format("Audit login log dropped (storage not ready): node_ip {} client_ip {} username {} error {}",
node_ip, client_ip, username, error ? "true" : "false"));
return make_ready_future<>();
}
if (logger.is_enabled(logging::log_level::debug)) {
logger.debug("Login log written: node_ip {}, client_ip {}, username {}, error {}",
node_ip, client_ip, username, error ? "true" : "false");

View File

@@ -141,7 +141,6 @@ private:
category_set _audited_categories;
std::unique_ptr<storage_helper> _storage_helper_ptr;
bool _storage_running = false;
const db::config& _cfg;
utils::observer<sstring> _cfg_keyspaces_observer;
@@ -164,8 +163,6 @@ public:
return audit_instance().local();
}
static future<> start_audit(const db::config& cfg, sharded<locator::shared_token_metadata>& stm, sharded<cql3::query_processor>& qp, sharded<service::migration_manager>& mm);
static future<> start_storage(const db::config& cfg);
static future<> stop_storage();
static future<> stop_audit();
static audit_info_ptr create_audit_info(statement_category cat, const sstring& keyspace, const sstring& table, bool batch = false);
audit(locator::shared_token_metadata& stm,
@@ -177,6 +174,8 @@ public:
category_set&& audited_categories,
const db::config& cfg);
~audit();
future<> start(const db::config& cfg);
future<> stop();
future<> shutdown();
bool should_log(const audit_info& audit_info) const;
bool will_log(statement_category cat, std::string_view keyspace = {}, std::string_view table = {}) const;

View File

@@ -185,14 +185,24 @@ future<lw_shared_ptr<cache::role_record>> cache::fetch_role(const role_name_t& r
static const sstring q = format("SELECT role, name, value FROM {}.{} WHERE role = ?", db::system_keyspace::NAME, ROLE_ATTRIBUTES_CF);
auto rs = co_await fetch(q);
for (const auto& r : *rs) {
if (!r.has("value")) {
continue;
}
rec->attributes[r.get_as<sstring>("name")] =
r.get_as<sstring>("value");
co_await coroutine::maybe_yield();
}
}
// permissions
{
static const sstring q = format("SELECT role, resource, permissions FROM {}.{} WHERE role = ?", db::system_keyspace::NAME, PERMISSIONS_CF);
auto rs = co_await fetch(q);
for (const auto& r : *rs) {
auto resource = r.get_as<sstring>("resource");
auto perms_strings = r.get_set<sstring>("permissions");
std::unordered_set<sstring> perms_set(perms_strings.begin(), perms_strings.end());
auto pset = permissions::from_strings(perms_set);
rec->permissions[std::move(resource)] = std::move(pset);
co_await coroutine::maybe_yield();
}
}
co_return rec;
}

View File

@@ -44,6 +44,7 @@ public:
std::unordered_set<role_name_t> members;
sstring salted_hash;
std::unordered_map<sstring, sstring, sstring_hash, sstring_eq> attributes;
std::unordered_map<sstring, permission_set, sstring_hash, sstring_eq> permissions;
private:
friend cache;
// cached permissions include effects of role's inheritance

View File

@@ -76,11 +76,7 @@ default_authorizer::authorize(const role_or_anonymous& maybe_role, const resourc
if (results->empty()) {
co_return permissions::NONE;
}
const auto& row = results->one();
if (!row.has(PERMISSIONS_NAME)) {
co_return permissions::NONE;
}
co_return permissions::from_strings(row.get_set<sstring>(PERMISSIONS_NAME));
co_return permissions::from_strings(results->one().get_set<sstring>(PERMISSIONS_NAME));
}
future<>

View File

@@ -258,11 +258,13 @@ future<> ldap_role_manager::start() {
} catch (const seastar::sleep_aborted&) {
co_return; // ignore
}
try {
co_await _cache.reload_all_permissions();
} catch (...) {
mylog.warn("Cache reload all permissions failed: {}", std::current_exception());
}
co_await _cache.container().invoke_on_all([] (cache& c) -> future<> {
try {
co_await c.reload_all_permissions();
} catch (...) {
mylog.warn("Cache reload all permissions failed: {}", std::current_exception());
}
});
}
});
return _std_mgr.start();

View File

@@ -157,20 +157,6 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s
return create_legacy_keyspace_if_missing(mm);
});
}
// Authorizer must be started before the permission loader is set,
// because the loader calls _authorizer->authorize().
// The loader must be set before starting the role manager, because
// LDAP role manager starts a pruner fiber that calls
// reload_all_permissions() which asserts _permission_loader is set.
co_await _authorizer->start();
if (!_used_by_maintenance_socket) {
// Maintenance socket mode can't cache permissions because it has
// different authorizer. We can't mix cached permissions, they could be
// different in normal mode.
_cache.set_permission_loader(std::bind(
&service::get_uncached_permissions,
this, std::placeholders::_1, std::placeholders::_2));
}
co_await _role_manager->start();
if (this_shard_id() == 0) {
// Role manager and password authenticator have this odd startup
@@ -179,19 +165,21 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s
// creation therefore we need to wait here.
co_await _role_manager->ensure_superuser_is_created();
}
// Authenticator must be started after ensure_superuser_is_created()
// because password_authenticator queries system.roles for the
// superuser entry created by the role manager.
co_await _authenticator->start();
co_await when_all_succeed(_authorizer->start(), _authenticator->start()).discard_result();
if (!_used_by_maintenance_socket) {
// Maintenance socket mode can't cache permissions because it has
// different authorizer. We can't mix cached permissions, they could be
// different in normal mode.
_cache.set_permission_loader(std::bind(
&service::get_uncached_permissions,
this, std::placeholders::_1, std::placeholders::_2));
}
}
future<> service::stop() {
_as.request_abort();
// Reverse of start() order.
co_await _authenticator->stop();
co_await _role_manager->stop();
_cache.set_permission_loader(nullptr);
co_await _authorizer->stop();
return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop()).discard_result();
}
future<> service::ensure_superuser_is_created() {

View File

@@ -267,7 +267,7 @@ struct extract_row_visitor {
visit_collection(v);
},
[&] (const abstract_type& o) {
throw std::runtime_error(format("extract_changes: unknown collection type: {}", o.name()));
throw std::runtime_error(format("extract_changes: unknown collection type:", o.name()));
}
));
}

View File

@@ -137,24 +137,6 @@ endfunction()
option(Scylla_WITH_DEBUG_INFO "Enable debug info" OFF)
# Time trace profiling: adds -ftime-trace to all C++ compilations (Clang only).
# Each .o produces a companion .json file in the build directory that can be
# analyzed with ClangBuildAnalyzer or loaded in chrome://tracing.
#
# Usage:
# cmake -DScylla_TIME_TRACE=ON ...
# ninja
# # Analyze results (requires ClangBuildAnalyzer):
# ClangBuildAnalyzer --all <build-dir> capture.bin
# ClangBuildAnalyzer --analyze capture.bin
option(Scylla_TIME_TRACE "Enable Clang -ftime-trace for build profiling" OFF)
if(Scylla_TIME_TRACE)
if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
message(FATAL_ERROR "Scylla_TIME_TRACE requires Clang (found ${CMAKE_CXX_COMPILER_ID})")
endif()
add_compile_options(-ftime-trace)
endif()
macro(update_build_flags config)
cmake_parse_arguments (
parsed_args

View File

@@ -1088,7 +1088,7 @@ void compaction_manager::register_metrics() {
sm::make_gauge("normalized_backlog", [this] { return _last_backlog / available_memory(); },
sm::description("Holds the sum of normalized compaction backlog for all tables in the system. Backlog is normalized by dividing backlog by shard's available memory.")),
sm::make_counter("validation_errors", [this] { return _validation_errors; },
sm::description("Holds the number of encountered validation errors.")).set_skip_when_empty(),
sm::description("Holds the number of encountered validation errors.")),
});
}

View File

@@ -285,12 +285,8 @@ def generate_compdb(compdb, ninja, buildfile, modes):
os.symlink(compdb_target, compdb)
except FileExistsError:
# if there is already a valid compile_commands.json link in the
# source root, we are done. if it's a stale link, update it.
if os.path.islink(compdb):
current_target = os.readlink(compdb)
if not os.path.exists(current_target):
os.unlink(compdb)
os.symlink(compdb_target, compdb)
# source root, we are done.
pass
return
@@ -597,7 +593,6 @@ scylla_tests = set([
'test/boost/linearizing_input_stream_test',
'test/boost/lister_test',
'test/boost/locator_topology_test',
'test/boost/lock_tables_metadata_test',
'test/boost/log_heap_test',
'test/boost/logalloc_standard_allocator_segment_pool_backend_test',
'test/boost/logalloc_test',
@@ -858,10 +853,6 @@ arg_parser.add_argument('--coverage', action = 'store_true', help = 'Compile scy
arg_parser.add_argument('--build-dir', action='store', default='build',
help='Build directory path')
arg_parser.add_argument('--disable-precompiled-header', action='store_true', default=False, help='Disable precompiled header for scylla binary')
arg_parser.add_argument('--time-trace', action='store_true', default=False,
help='Enable Clang -ftime-trace for build profiling. '
'Each .o produces a .json file analyzable with '
'ClangBuildAnalyzer or chrome://tracing')
arg_parser.add_argument('-h', '--help', action='store_true', help='show this help message and exit')
args = arg_parser.parse_args()
if args.help:
@@ -1668,7 +1659,6 @@ deps['test/boost/combined_tests'] += [
'test/boost/auth_cache_test.cc',
'test/boost/auth_test.cc',
'test/boost/batchlog_manager_test.cc',
'test/boost/table_helper_test.cc',
'test/boost/cache_algorithm_test.cc',
'test/boost/castas_fcts_test.cc',
'test/boost/cdc_test.cc',
@@ -1720,7 +1710,7 @@ deps['test/boost/combined_tests'] += [
'test/boost/sstable_compression_config_test.cc',
'test/boost/sstable_directory_test.cc',
'test/boost/sstable_set_test.cc',
'test/boost/sstable_tablet_streaming_test.cc',
'test/boost/sstable_tablet_streaming.cc',
'test/boost/statement_restrictions_test.cc',
'test/boost/storage_proxy_test.cc',
'test/boost/tablets_test.cc',
@@ -1975,9 +1965,6 @@ user_cflags += ' -fextend-variable-liveness=none'
if args.target != '':
user_cflags += ' -march=' + args.target
if args.time_trace:
user_cflags += ' -ftime-trace'
for mode in modes:
# Those flags are passed not only to Scylla objects, but also to libraries
# that we compile ourselves.
@@ -2470,9 +2457,6 @@ def write_build_file(f,
command = reloc/build_deb.sh --reloc-pkg $in --builddir $out
rule unified
command = unified/build_unified.sh --build-dir $builddir/$mode --unified-pkg $out
rule collect_pkgs
command = rm -rf $out && mkdir -p $out && cp $pkgs $out/
description = COLLECT $out
rule rust_header
command = cxxbridge --include rust/cxx.h --header $in > $out
description = RUST_HEADER $out
@@ -2958,8 +2942,6 @@ def write_build_file(f,
build dist-tar: phony dist-unified-tar dist-server-tar dist-python3-tar dist-cqlsh-tar
build dist: phony dist-unified dist-server dist-python3 dist-cqlsh
build collect-dist: phony {' '.join([f'collect-dist-{mode}' for mode in default_modes])}
'''))
f.write(textwrap.dedent(f'''\
@@ -2967,28 +2949,7 @@ def write_build_file(f,
rule dist-check
command = ./tools/testing/dist-check/dist-check.sh --mode $mode
'''))
deb_arch = {'x86_64': 'amd64', 'aarch64': 'arm64'}[arch]
deb_ver = f'{scylla_version}-{scylla_release}-1'
rpm_ver = f'{scylla_version}-{scylla_release}'
for mode in build_modes:
server_rpms_dir = f'$builddir/dist/{mode}/redhat/RPMS/{arch}'
server_rpms = [f'{server_rpms_dir}/{scylla_product}{suffix}-{rpm_ver}.{arch}.rpm'
for suffix in ['', '-server', '-server-debuginfo', '-conf', '-kernel-conf', '-node-exporter']]
cqlsh_rpms = [f'tools/cqlsh/build/redhat/RPMS/{arch}/{scylla_product}-cqlsh-{rpm_ver}.{arch}.rpm']
python3_rpms = [f'tools/python3/build/redhat/RPMS/{arch}/{scylla_product}-python3-{rpm_ver}.{arch}.rpm']
all_rpms = server_rpms + cqlsh_rpms + python3_rpms
server_deb_dir = f'$builddir/dist/{mode}/debian'
server_debs = [f'{server_deb_dir}/{scylla_product}{suffix}_{deb_ver}_{deb_arch}.deb'
for suffix in ['', '-server', '-server-dbg', '-conf', '-kernel-conf', '-node-exporter']]
server_debs += [f'{server_deb_dir}/scylla-enterprise{suffix}_{deb_ver}_all.deb'
for suffix in ['', '-server', '-conf', '-kernel-conf', '-node-exporter']]
cqlsh_debs = [f'tools/cqlsh/build/debian/{scylla_product}-cqlsh_{deb_ver}_{deb_arch}.deb',
f'tools/cqlsh/build/debian/scylla-enterprise-cqlsh_{deb_ver}_all.deb']
python3_debs = [f'tools/python3/build/debian/{scylla_product}-python3_{deb_ver}_{deb_arch}.deb',
f'tools/python3/build/debian/scylla-enterprise-python3_{deb_ver}_all.deb']
all_debs = server_debs + cqlsh_debs + python3_debs
f.write(textwrap.dedent(f'''\
build $builddir/{mode}/dist/tar/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz: copy tools/python3/build/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz
build $builddir/{mode}/dist/tar/{scylla_product}-python3-package.tar.gz: copy tools/python3/build/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz
@@ -2996,11 +2957,6 @@ def write_build_file(f,
build $builddir/{mode}/dist/tar/{scylla_product}-cqlsh-{scylla_version}-{scylla_release}.{arch}.tar.gz: copy tools/cqlsh/build/{scylla_product}-cqlsh-{scylla_version}-{scylla_release}.{arch}.tar.gz
build $builddir/{mode}/dist/tar/{scylla_product}-cqlsh-package.tar.gz: copy tools/cqlsh/build/{scylla_product}-cqlsh-{scylla_version}-{scylla_release}.{arch}.tar.gz
build $builddir/{mode}/dist/rpm: collect_pkgs | {' '.join(all_rpms)} $builddir/dist/{mode}/redhat dist-cqlsh-rpm dist-python3-rpm
pkgs = {' '.join(all_rpms)}
build $builddir/{mode}/dist/deb: collect_pkgs | {' '.join(all_debs)} $builddir/dist/{mode}/debian dist-cqlsh-deb dist-python3-deb
pkgs = {' '.join(all_debs)}
build collect-dist-{mode}: phony $builddir/{mode}/dist/rpm $builddir/{mode}/dist/deb
build {mode}-dist: phony dist-server-{mode} dist-server-debuginfo-{mode} dist-python3-{mode} dist-unified-{mode} dist-cqlsh-{mode}
build dist-{mode}: phony {mode}-dist
build dist-check-{mode}: dist-check

View File

@@ -136,9 +136,9 @@ public:
{}
future<> insert(auth::authenticated_user user, cql3::prepared_cache_key_type prep_cache_key, value_type v) noexcept {
return _cache.insert(key_type(std::move(user), std::move(prep_cache_key)), [v = std::move(v)] (const cache_key_type&) mutable {
return _cache.get_ptr(key_type(std::move(user), std::move(prep_cache_key)), [v = std::move(v)] (const cache_key_type&) mutable {
return make_ready_future<value_type>(std::move(v));
});
}).discard_result();
}
value_ptr find(const auth::authenticated_user& user, const cql3::prepared_cache_key_type& prep_cache_key) {

View File

@@ -1070,7 +1070,7 @@ try_prepare_count_rows(const expr::function_call& fc, data_dictionary::database
.args = {},
};
} else {
throw exceptions::invalid_request_exception(format("count() expects a column or the literal 1 as an argument, got {}", fc.args[0]));
throw exceptions::invalid_request_exception(format("count() expects a column or the literal 1 as an argument", fc.args[0]));
}
}
}

View File

@@ -339,7 +339,7 @@ static storage_options::object_storage object_storage_from_map(std::string_view
}
if (values.size() > allowed_options.size()) {
throw std::runtime_error(fmt::format("Extraneous options for {}: {}; allowed: {}",
type, fmt::join(values | std::views::keys, ","),
fmt::join(values | std::views::keys, ","), type,
fmt::join(allowed_options | std::views::keys, ",")));
}
options.type = std::string(type);

View File

@@ -776,7 +776,7 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
friend std::ostream& operator<<(std::ostream&, const segment&);
friend class segment_manager;
constexpr size_t sector_overhead(size_t size) const {
size_t sector_overhead(size_t size) const {
return (size / (_alignment - detail::sector_overhead_size)) * detail::sector_overhead_size;
}
@@ -1028,21 +1028,18 @@ public:
co_return me;
}
std::tuple<size_t, size_t> buffer_usage_size(size_t s) const {
/**
* Allocate a new buffer
*/
void new_buffer(size_t s) {
SCYLLA_ASSERT(_buffer.empty());
auto overhead = segment_overhead_size;
if (_file_pos == 0) {
overhead += descriptor_header_size;
}
return {s + overhead, overhead};
}
/**
* Allocate a new buffer
*/
void new_buffer(size_t size_in) {
SCYLLA_ASSERT(_buffer.empty());
auto [s, overhead] = buffer_usage_size(size_in);
s += overhead;
// add bookkeep data reqs.
auto a = align_up(s + sector_overhead(s), _alignment);
auto k = std::max(a, default_size);
@@ -1430,9 +1427,6 @@ public:
position_type next_position(size_t size) const {
auto used = _buffer_ostream_size - _buffer_ostream.size();
if (used == 0) { // new chunk/segment
std::tie(size, std::ignore) = buffer_usage_size(size);
}
used += size;
return _file_pos + used + sector_overhead(used);
}
@@ -1576,6 +1570,7 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
clogger.debug("Attempting oversized alloc of {} entry writer", writer.num_entries);
auto size = writer.size();
auto max_file_size = cfg.commitlog_segment_size_in_mb * 1024 * 1024;
// check if this cannot be written at all...
if (!cfg.allow_going_over_size_limit) {
@@ -1584,11 +1579,11 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
// more worst case
auto size_with_meta_overhead = size_with_sector_overhead
+ (1 + size_with_sector_overhead/max_mutation_size) * (segment::entry_overhead_size + segment::fragmented_entry_overhead_size + segment::segment_overhead_size)
* (1 + size_with_sector_overhead/max_size) * segment::descriptor_header_size
* (1 + size_with_sector_overhead/max_file_size) * segment::descriptor_header_size
;
// this is not really true. We could have some space in current segment,
// but again, lets be conservative.
auto max_file_size_avail = max_disk_size - max_size;
auto max_file_size_avail = max_disk_size - max_file_size;
if (size_with_meta_overhead > max_file_size_avail) {
throw std::invalid_argument(fmt::format("Mutation of {} bytes is too large for potentially available disk space of {}", size, max_file_size_avail));
@@ -1775,13 +1770,11 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
co_await s->close();
s = co_await get_segment();
}
// bytes not counting overhead
auto pos = s->position();
auto max = std::max<size_t>(pos, max_size);
auto buf_rem = std::min(max_size - max, s->_buffer_ostream.size());
// bytes not counting overhead
auto buf_rem = std::min(max_size - s->position(), s->_buffer_ostream.size());
size_t avail;
if (buf_rem >= align) {
if (buf_rem > align) {
auto rem2 = buf_rem - (1 + buf_rem/sector_size) * detail::sector_overhead_size;
avail = std::min(rem2, max_mutation_size)
- segment::entry_overhead_size
@@ -1791,7 +1784,7 @@ future<> db::commitlog::segment_manager::oversized_allocation(entry_writer& writ
} else {
co_await s->cycle();
auto pos = s->position();
auto max = std::max<size_t>(pos, max_size);
auto max = std::max<size_t>(pos, max_file_size);
auto file_rem = max - pos;
if (file_rem < align) {

View File

@@ -217,7 +217,7 @@ future<> db::commitlog_replayer::impl::process(stats* s, commitlog::buffer_and_r
if (cm_it == local_cm.end()) {
if (!cer.get_column_mapping()) {
rlogger.debug("replaying at {} v={} at {}", fm.column_family_id(), fm.schema_version(), rp);
throw std::runtime_error(format("unknown schema version {}, table={}", fm.schema_version(), fm.column_family_id()));
throw std::runtime_error(format("unknown schema version {}, table=", fm.schema_version(), fm.column_family_id()));
}
rlogger.debug("new schema version {} in entry {}", fm.schema_version(), rp);
cm_it = local_cm.emplace(fm.schema_version(), *cer.get_column_mapping()).first;

View File

@@ -1921,7 +1921,7 @@ std::map<sstring, db::experimental_features_t::feature> db::experimental_feature
{"lwt", feature::UNUSED},
{"udf", feature::UDF},
{"cdc", feature::UNUSED},
{"alternator-streams", feature::UNUSED},
{"alternator-streams", feature::ALTERNATOR_STREAMS},
{"alternator-ttl", feature::UNUSED },
{"consistent-topology-changes", feature::UNUSED},
{"broadcast-tables", feature::BROADCAST_TABLES},

View File

@@ -115,6 +115,7 @@ struct experimental_features_t {
enum class feature {
UNUSED,
UDF,
ALTERNATOR_STREAMS,
BROADCAST_TABLES,
KEYSPACE_STORAGE_OPTIONS,
STRONGLY_CONSISTENT_TABLES,

View File

@@ -327,7 +327,7 @@ redistribute(const std::vector<float>& p, unsigned me, unsigned k) {
}
}
hr_logger.trace(" pp after1={}", pp);
hr_logger.trace(" pp after1=", pp);
if (d.first == me) {
// We only care what "me" sends, and only the elements in
// the sorted list earlier than me could have forced it to

View File

@@ -13,6 +13,7 @@
#include "replica/database.hh"
#include "db/consistency_level_type.hh"
#include "db/system_keyspace.hh"
#include "db/config.hh"
#include "schema/schema_builder.hh"
#include "timeout_config.hh"
#include "types/types.hh"
@@ -21,6 +22,8 @@
#include "cdc/generation.hh"
#include "cql3/query_processor.hh"
#include "service/storage_proxy.hh"
#include "gms/feature_service.hh"
#include "service/migration_manager.hh"
#include "locator/host_id.hh"
@@ -38,10 +41,27 @@ static logging::logger dlogger("system_distributed_keyspace");
extern logging::logger cdc_log;
namespace db {
namespace {
const auto set_wait_for_sync_to_commitlog = schema_builder::register_schema_initializer([](schema_builder& builder) {
if ((builder.ks_name() == system_distributed_keyspace::NAME_EVERYWHERE && builder.cf_name() == system_distributed_keyspace::CDC_GENERATIONS_V2) ||
(builder.ks_name() == system_distributed_keyspace::NAME && builder.cf_name() == system_distributed_keyspace::CDC_TOPOLOGY_DESCRIPTION))
{
builder.set_wait_for_sync_to_commitlog(true);
}
});
}
extern thread_local data_type cdc_streams_set_type;
thread_local data_type cdc_streams_set_type = set_type_impl::get_instance(bytes_type, false);
/* See `token_range_description` struct */
thread_local data_type cdc_streams_list_type = list_type_impl::get_instance(bytes_type, false);
thread_local data_type cdc_token_range_description_type = tuple_type_impl::get_instance(
{ long_type // dht::token token_range_end;
, cdc_streams_list_type // std::vector<stream_id> streams;
, byte_type // uint8_t sharding_ignore_msb;
});
thread_local data_type cdc_generation_description_type = list_type_impl::get_instance(cdc_token_range_description_type, false);
schema_ptr view_build_status() {
static thread_local auto schema = [] {
@@ -57,6 +77,42 @@ schema_ptr view_build_status() {
return schema;
}
/* An internal table used by nodes to exchange CDC generation data. */
schema_ptr cdc_generations_v2() {
thread_local auto schema = [] {
auto id = generate_legacy_id(system_distributed_keyspace::NAME_EVERYWHERE, system_distributed_keyspace::CDC_GENERATIONS_V2);
return schema_builder(system_distributed_keyspace::NAME_EVERYWHERE, system_distributed_keyspace::CDC_GENERATIONS_V2, {id})
/* The unique identifier of this generation. */
.with_column("id", uuid_type, column_kind::partition_key)
/* The generation describes a mapping from all tokens in the token ring to a set of stream IDs.
* This mapping is built from a bunch of smaller mappings, each describing how tokens in a subrange
* of the token ring are mapped to stream IDs; these subranges together cover the entire token ring.
* Each such range-local mapping is represented by a row of this table.
* The clustering key of the row is the end of the range being described by this row.
* The start of this range is the range_end of the previous row (in the clustering order, which is the integer order)
* or of the last row of this partition if this is the first the first row. */
.with_column("range_end", long_type, column_kind::clustering_key)
/* The set of streams mapped to in this range.
* The number of streams mapped to a single range in a CDC generation is bounded from above by the number
* of shards on the owner of that range in the token ring.
* In other words, the number of elements of this set is bounded by the maximum of the number of shards
* over all nodes. The serialized size is obtained by counting about 20B for each stream.
* For example, if all nodes in the cluster have at most 128 shards,
* the serialized size of this set will be bounded by ~2.5 KB. */
.with_column("streams", cdc_streams_set_type)
/* The value of the `ignore_msb` sharding parameter of the node which was the owner of this token range
* when the generation was first created. Together with the set of streams above it fully describes
* the mapping for this particular range. */
.with_column("ignore_msb", byte_type)
/* Column used for sanity checking.
* For a given generation it's equal to the number of ranges in this generation;
* thus, after the generation is fully inserted, it must be equal to the number of rows in the partition. */
.with_column("num_ranges", int32_type, column_kind::static_column)
.with_hash_version()
.build();
}();
return schema;
}
/* A user-facing table providing identifiers of the streams used in CDC generations. */
schema_ptr cdc_desc() {
@@ -96,6 +152,23 @@ schema_ptr cdc_timestamps() {
static const sstring CDC_TIMESTAMPS_KEY = "timestamps";
schema_ptr service_levels() {
static thread_local auto schema = [] {
auto id = generate_legacy_id(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS);
auto builder = schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS, std::make_optional(id))
.with_column("service_level", utf8_type, column_kind::partition_key)
.with_column("shares", int32_type);
if (utils::get_local_injector().is_enabled("service_levels_v1_table_without_shares")) {
builder.remove_column("shares");
}
return builder
.with_hash_version()
.build();
}();
return schema;
}
// This is the set of tables which this node ensures to exist in the cluster.
// It does that by announcing the creation of these schemas on initialization
// of the `system_distributed_keyspace` service (see `start()`), unless it first
@@ -109,13 +182,19 @@ static const sstring CDC_TIMESTAMPS_KEY = "timestamps";
static std::vector<schema_ptr> ensured_tables() {
return {
view_build_status(),
cdc_generations_v2(),
cdc_desc(),
cdc_timestamps(),
service_levels(),
};
}
std::vector<schema_ptr> system_distributed_keyspace::all_distributed_tables() {
return {view_build_status(), cdc_desc(), cdc_timestamps()};
return {view_build_status(), cdc_desc(), cdc_timestamps(), service_levels()};
}
std::vector<schema_ptr> system_distributed_keyspace::all_everywhere_tables() {
return {cdc_generations_v2()};
}
system_distributed_keyspace::system_distributed_keyspace(cql3::query_processor& qp, service::migration_manager& mm, service::storage_proxy& sp)
@@ -124,6 +203,36 @@ system_distributed_keyspace::system_distributed_keyspace(cql3::query_processor&
, _sp(sp) {
}
static std::vector<std::pair<std::string_view, data_type>> new_service_levels_columns(bool workload_prioritization_enabled) {
std::vector<std::pair<std::string_view, data_type>> new_columns {{"timeout", duration_type}, {"workload_type", utf8_type}};
if (workload_prioritization_enabled) {
new_columns.push_back({"shares", int32_type});
}
return new_columns;
};
static schema_ptr get_current_service_levels(data_dictionary::database db) {
return db.has_schema(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS)
? db.find_schema(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS)
: service_levels();
}
static schema_ptr get_updated_service_levels(data_dictionary::database db, bool workload_prioritization_enabled) {
SCYLLA_ASSERT(this_shard_id() == 0);
auto schema = get_current_service_levels(db);
schema_builder b(schema);
for (const auto& col : new_service_levels_columns(workload_prioritization_enabled)) {
auto& [col_name, col_type] = col;
bytes options_name = to_bytes(col_name.data());
if (schema->get_column_definition(options_name)) {
continue;
}
b.with_column(options_name, col_type, column_kind::regular_column);
}
b.with_hash_version();
return b.build();
}
future<> system_distributed_keyspace::create_tables(std::vector<schema_ptr> tables) {
if (this_shard_id() != 0) {
_started = true;
@@ -134,9 +243,11 @@ future<> system_distributed_keyspace::create_tables(std::vector<schema_ptr> tabl
while (true) {
// Check if there is any work to do before taking the group 0 guard.
bool keyspaces_setup = db.has_keyspace(NAME);
bool workload_prioritization_enabled = _sp.features().workload_prioritization;
bool keyspaces_setup = db.has_keyspace(NAME) && db.has_keyspace(NAME_EVERYWHERE);
bool tables_setup = std::all_of(tables.begin(), tables.end(), [db] (schema_ptr t) { return db.has_schema(t->ks_name(), t->cf_name()); } );
if (keyspaces_setup && tables_setup) {
bool service_levels_up_to_date = get_current_service_levels(db)->equal_columns(*get_updated_service_levels(db, workload_prioritization_enabled));
if (keyspaces_setup && tables_setup && service_levels_up_to_date) {
dlogger.info("system_distributed(_everywhere) keyspaces and tables are up-to-date. Not creating");
_started = true;
co_return;
@@ -147,25 +258,51 @@ future<> system_distributed_keyspace::create_tables(std::vector<schema_ptr> tabl
utils::chunked_vector<mutation> mutations;
sstring description;
auto ksm = keyspace_metadata::new_keyspace(
auto sd_ksm = keyspace_metadata::new_keyspace(
NAME,
"org.apache.cassandra.locator.SimpleStrategy",
{{"replication_factor", "3"}},
std::nullopt, std::nullopt);
if (!db.has_keyspace(NAME)) {
mutations = service::prepare_new_keyspace_announcement(db.real_database(), ksm, ts);
mutations = service::prepare_new_keyspace_announcement(db.real_database(), sd_ksm, ts);
description += format(" create {} keyspace;", NAME);
} else {
dlogger.info("{} keyspace is already present. Not creating", NAME);
}
// Get mutations for creating tables.
auto sde_ksm = keyspace_metadata::new_keyspace(
NAME_EVERYWHERE,
"org.apache.cassandra.locator.EverywhereStrategy",
{},
std::nullopt, std::nullopt);
if (!db.has_keyspace(NAME_EVERYWHERE)) {
auto sde_mutations = service::prepare_new_keyspace_announcement(db.real_database(), sde_ksm, ts);
std::move(sde_mutations.begin(), sde_mutations.end(), std::back_inserter(mutations));
description += format(" create {} keyspace;", NAME_EVERYWHERE);
} else {
dlogger.info("{} keyspace is already present. Not creating", NAME_EVERYWHERE);
}
// Get mutations for creating and updating tables.
auto num_keyspace_mutations = mutations.size();
co_await coroutine::parallel_for_each(ensured_tables(),
[this, &mutations, db, ts, ksm] (auto&& table) -> future<> {
[this, &mutations, db, ts, sd_ksm, sde_ksm, workload_prioritization_enabled] (auto&& table) -> future<> {
auto ksm = table->ks_name() == NAME ? sd_ksm : sde_ksm;
// Ensure that the service_levels table contains new columns.
if (table->cf_name() == SERVICE_LEVELS) {
table = get_updated_service_levels(db, workload_prioritization_enabled);
}
if (!db.has_schema(table->ks_name(), table->cf_name())) {
co_return co_await service::prepare_new_column_family_announcement(mutations, _sp, *ksm, std::move(table), ts);
}
// The service_levels table exists. Update it if it lacks new columns.
if (table->cf_name() == SERVICE_LEVELS && !get_current_service_levels(db)->equal_columns(*table)) {
auto update_mutations = co_await service::prepare_column_family_update_announcement(_sp, table, std::vector<view_ptr>(), ts);
std::move(update_mutations.begin(), update_mutations.end(), std::back_inserter(mutations));
}
});
if (mutations.size() > num_keyspace_mutations) {
description += " create and update system_distributed(_everywhere) tables";
@@ -187,6 +324,15 @@ future<> system_distributed_keyspace::create_tables(std::vector<schema_ptr> tabl
}
}
future<> system_distributed_keyspace::start_workload_prioritization() {
if (this_shard_id() != 0) {
co_return;
}
if (_qp.db().features().workload_prioritization) {
co_await create_tables({get_updated_service_levels(_qp.db(), true)});
}
}
future<> system_distributed_keyspace::start() {
if (this_shard_id() != 0) {
_started = true;
@@ -229,6 +375,90 @@ static db::consistency_level quorum_if_many(size_t num_token_owners) {
return num_token_owners > 1 ? db::consistency_level::QUORUM : db::consistency_level::ONE;
}
future<>
system_distributed_keyspace::insert_cdc_generation(
utils::UUID id,
const cdc::topology_description& desc,
context ctx) {
using namespace std::chrono_literals;
const size_t concurrency = 10;
const size_t num_replicas = ctx.num_token_owners;
// To insert the data quickly and efficiently we send it in batches of multiple rows
// (each batch represented by a single mutation). We also send multiple such batches concurrently.
// However, we need to limit the memory consumption of the operation.
// I assume that the memory consumption grows linearly with the number of replicas
// (we send to all replicas ``at the same time''), with the batch size (the data must
// be copied for each replica?) and with concurrency. These assumptions may be too conservative
// but that won't hurt in a significant way (it may hurt the efficiency of the operation a little).
// Thus, if we want to limit the memory consumption to L, it should be true that
// mutation_size * num_replicas * concurrency <= L, hence
// mutation_size <= L / (num_replicas * concurrency).
// For example, say L = 10MB, concurrency = 10, num_replicas = 100; we get
// mutation_size <= 10MB / 1000 = 10KB.
// On the other hand we must have mutation_size >= size of a single row,
// so we will use mutation_size <= max(size of single row, L/(num_replicas*concurrency)).
// It has been tested that sending 1MB batches to 3 replicas with concurrency 20 works OK,
// which would correspond to L ~= 60MB. Hence that's the limit we use here.
const size_t L = 60'000'000;
const auto mutation_size_threshold = std::max(size_t(1), L / (num_replicas * concurrency));
auto s = _qp.db().real_database().find_schema(
system_distributed_keyspace::NAME_EVERYWHERE, system_distributed_keyspace::CDC_GENERATIONS_V2);
auto ms = co_await cdc::get_cdc_generation_mutations_v2(s, id, desc, mutation_size_threshold, api::new_timestamp());
co_await max_concurrent_for_each(ms, concurrency, [&] (mutation& m) -> future<> {
co_await _sp.mutate(
{ std::move(m) },
db::consistency_level::ALL,
db::timeout_clock::now() + 60s,
nullptr, // trace_state
empty_service_permit(),
db::allow_per_partition_rate_limit::no,
false // raw_counters
);
});
}
future<std::optional<cdc::topology_description>>
system_distributed_keyspace::read_cdc_generation(utils::UUID id) {
utils::chunked_vector<cdc::token_range_description> entries;
size_t num_ranges = 0;
co_await _qp.query_internal(
// This should be a local read so 20s should be more than enough
format("SELECT range_end, streams, ignore_msb, num_ranges FROM {}.{} WHERE id = ? USING TIMEOUT 20s", NAME_EVERYWHERE, CDC_GENERATIONS_V2),
db::consistency_level::ONE, // we wrote the generation with ALL so ONE must see it (or there's something really wrong)
{ id },
1000, // for ~1KB rows, ~1MB page size
[&] (const cql3::untyped_result_set_row& row) {
std::vector<cdc::stream_id> streams;
row.get_list_data<bytes>("streams", std::back_inserter(streams));
entries.push_back(cdc::token_range_description{
dht::token::from_int64(row.get_as<int64_t>("range_end")),
std::move(streams),
uint8_t(row.get_as<int8_t>("ignore_msb"))});
num_ranges = row.get_as<int32_t>("num_ranges");
return make_ready_future<stop_iteration>(stop_iteration::no);
});
if (entries.empty()) {
co_return std::nullopt;
}
// Paranoic sanity check. Partial reads should not happen since generations should be retrieved only after they
// were written successfully with CL=ALL. But nobody uses EverywhereStrategy tables so they weren't ever properly
// tested, so just in case...
if (entries.size() != num_ranges) {
throw std::runtime_error(format(
"read_cdc_generation: wrong number of rows. The `num_ranges` column claimed {} rows,"
" but reading the partition returned {}.", num_ranges, entries.size()));
}
co_return std::optional{cdc::topology_description(std::move(entries))};
}
static future<utils::chunked_vector<mutation>> get_cdc_streams_descriptions_v2_mutation(
const replica::database& db,
db_clock::time_point time,
@@ -400,4 +630,65 @@ system_distributed_keyspace::cdc_current_generation_timestamp(context ctx) {
co_return timestamp_cql->one().get_as<db_clock::time_point>("time");
}
future<qos::service_levels_info> system_distributed_keyspace::get_service_levels(qos::query_context ctx) const {
return qos::get_service_levels(_qp, NAME, SERVICE_LEVELS, db::consistency_level::ONE, ctx);
}
future<qos::service_levels_info> system_distributed_keyspace::get_service_level(sstring service_level_name) const {
return qos::get_service_level(_qp, NAME, SERVICE_LEVELS, service_level_name, db::consistency_level::ONE);
}
future<> system_distributed_keyspace::set_service_level(sstring service_level_name, qos::service_level_options slo) const {
static sstring prepared_query = format("INSERT INTO {}.{} (service_level) VALUES (?);", NAME, SERVICE_LEVELS);
co_await _qp.execute_internal(prepared_query, db::consistency_level::ONE, internal_distributed_query_state(), {service_level_name}, cql3::query_processor::cache_internal::no);
auto to_data_value = [&] (const qos::service_level_options::timeout_type& tv) {
return std::visit(overloaded_functor {
[&] (const qos::service_level_options::unset_marker&) {
return data_value::make_null(duration_type);
},
[&] (const qos::service_level_options::delete_marker&) {
return data_value::make_null(duration_type);
},
[&] (const lowres_clock::duration& d) {
return data_value(cql_duration(months_counter{0},
days_counter{0},
nanoseconds_counter{std::chrono::duration_cast<std::chrono::nanoseconds>(d).count()}));
},
}, tv);
};
auto to_data_value_g = [&] <typename T> (const std::variant<qos::service_level_options::unset_marker, qos::service_level_options::delete_marker, T>& v) {
return std::visit(overloaded_functor {
[&] (const qos::service_level_options::unset_marker&) {
return data_value::make_null(data_type_for<T>());
},
[&] (const qos::service_level_options::delete_marker&) {
return data_value::make_null(data_type_for<T>());
},
[&] (const T& v) {
return data_value(v);
},
}, v);
};
data_value workload = slo.workload == qos::service_level_options::workload_type::unspecified
? data_value::make_null(utf8_type)
: data_value(qos::service_level_options::to_string(slo.workload));
co_await _qp.execute_internal(format("UPDATE {}.{} SET timeout = ?, workload_type = ? WHERE service_level = ?;", NAME, SERVICE_LEVELS),
db::consistency_level::ONE,
internal_distributed_query_state(),
{to_data_value(slo.timeout),
workload,
service_level_name},
cql3::query_processor::cache_internal::no);
co_await _qp.execute_internal(format("UPDATE {}.{} SET shares = ? WHERE service_level = ?;", NAME, SERVICE_LEVELS),
db::consistency_level::ONE,
internal_distributed_query_state(),
{to_data_value_g(slo.shares), service_level_name},
cql3::query_processor::cache_internal::no);
}
future<> system_distributed_keyspace::drop_service_level(sstring service_level_name) const {
static sstring prepared_query = format("DELETE FROM {}.{} WHERE service_level= ?;", NAME, SERVICE_LEVELS);
return _qp.execute_internal(prepared_query, db::consistency_level::ONE, internal_distributed_query_state(), {service_level_name}, cql3::query_processor::cache_internal::no).discard_result();
}
}

View File

@@ -9,6 +9,9 @@
#pragma once
#include "schema/schema_fwd.hh"
#include "service/qos/qos_common.hh"
#include "utils/UUID.hh"
#include "cdc/generation_id.hh"
#include "locator/host_id.hh"
#include <seastar/core/future.hh>
@@ -21,6 +24,7 @@ class query_processor;
}
namespace cdc {
class stream_id;
class topology_description;
class streams_version;
} // namespace cdc
@@ -35,8 +39,17 @@ namespace db {
class system_distributed_keyspace {
public:
static constexpr auto NAME = "system_distributed";
static constexpr auto NAME_EVERYWHERE = "system_distributed_everywhere";
static constexpr auto VIEW_BUILD_STATUS = "view_build_status";
static constexpr auto SERVICE_LEVELS = "service_levels";
/* Nodes use this table to communicate new CDC stream generations to other nodes. */
static constexpr auto CDC_TOPOLOGY_DESCRIPTION = "cdc_generation_descriptions";
/* Nodes use this table to communicate new CDC stream generations to other nodes.
* Resides in system_distributed_everywhere. */
static constexpr auto CDC_GENERATIONS_V2 = "cdc_generation_descriptions_v2";
/* This table is used by CDC clients to learn about available CDC streams. */
static constexpr auto CDC_DESC_V2 = "cdc_streams_descriptions_v2";
@@ -64,14 +77,19 @@ private:
public:
static std::vector<schema_ptr> all_distributed_tables();
static std::vector<schema_ptr> all_everywhere_tables();
system_distributed_keyspace(cql3::query_processor&, service::migration_manager&, service::storage_proxy&);
future<> start();
future<> start_workload_prioritization();
future<> stop();
bool started() const { return _started; }
future<> insert_cdc_generation(utils::UUID, const cdc::topology_description&, context);
future<std::optional<cdc::topology_description>> read_cdc_generation(utils::UUID);
future<> create_cdc_desc(db_clock::time_point, const cdc::topology_description&, context);
future<bool> cdc_desc_exists(db_clock::time_point, context);
@@ -87,6 +105,11 @@ public:
// NOTE: currently used only by alternator
future<db_clock::time_point> cdc_current_generation_timestamp(context);
future<qos::service_levels_info> get_service_levels(qos::query_context ctx) const;
future<qos::service_levels_info> get_service_level(sstring service_level_name) const;
future<> set_service_level(sstring service_level_name, qos::service_level_options slo) const;
future<> drop_service_level(sstring service_level_name) const;
private:
future<> create_tables(std::vector<schema_ptr> tables);
};

View File

@@ -10,7 +10,6 @@
#include "db/view/view_update_backlog.hh"
#include "utils/error_injection.hh"
#include "utils/updateable_value.hh"
#include <seastar/core/cacheline.hh>
#include <seastar/core/future.hh>
@@ -42,16 +41,13 @@ class node_update_backlog {
std::chrono::milliseconds _interval;
std::atomic<clock::time_point> _last_update;
std::atomic<update_backlog> _max;
utils::updateable_value<uint32_t> _view_flow_control_delay_limit_in_ms;
public:
explicit node_update_backlog(size_t shards, std::chrono::milliseconds interval,
utils::updateable_value<uint32_t> view_flow_control_delay_limit_in_ms = utils::updateable_value<uint32_t>(1000))
explicit node_update_backlog(size_t shards, std::chrono::milliseconds interval)
: _backlogs(shards)
, _interval(interval)
, _last_update(clock::now() - _interval)
, _max(update_backlog::no_backlog())
, _view_flow_control_delay_limit_in_ms(std::move(view_flow_control_delay_limit_in_ms)) {
, _max(update_backlog::no_backlog()) {
if (utils::get_local_injector().enter("update_backlog_immediately")) {
_interval = std::chrono::milliseconds(0);
_last_update = clock::now();
@@ -63,9 +59,6 @@ public:
update_backlog fetch_shard(unsigned shard);
seastar::future<std::optional<update_backlog>> fetch_if_changed();
std::chrono::microseconds calculate_throttling_delay(update_backlog backlog,
db::timeout_clock::time_point timeout) const;
// Exposed for testing only.
update_backlog load() const {
return _max.load(std::memory_order_relaxed);

View File

@@ -150,14 +150,14 @@ row_locker::unlock(const dht::decorated_key* pk, bool partition_exclusive,
auto pli = _two_level_locks.find(*pk);
if (pli == _two_level_locks.end()) {
// This shouldn't happen... We can't unlock this lock if we can't find it...
mylog.error("column_family::local_base_lock_holder::~local_base_lock_holder() can't find lock for partition {}", *pk);
mylog.error("column_family::local_base_lock_holder::~local_base_lock_holder() can't find lock for partition", *pk);
return;
}
SCYLLA_ASSERT(&pli->first == pk);
if (cpk) {
auto rli = pli->second._row_locks.find(*cpk);
if (rli == pli->second._row_locks.end()) {
mylog.error("column_family::local_base_lock_holder::~local_base_lock_holder() can't find lock for row {}", *cpk);
mylog.error("column_family::local_base_lock_holder::~local_base_lock_holder() can't find lock for row", *cpk);
return;
}
SCYLLA_ASSERT(&rli->first == cpk);

View File

@@ -45,7 +45,6 @@
#include "db/view/view_builder.hh"
#include "db/view/view_updating_consumer.hh"
#include "db/view/view_update_generator.hh"
#include "db/view/node_view_update_backlog.hh"
#include "db/view/regular_column_transformation.hh"
#include "db/system_keyspace_view_types.hh"
#include "db/system_keyspace.hh"
@@ -3493,27 +3492,18 @@ future<> delete_ghost_rows_visitor::do_accept_new_row(partition_key pk, clusteri
}
}
// View updates are asynchronous, and because of this limiting their concurrency requires
// a special approach. The current algorithm places all of the pending view updates in the backlog
// and artificially slows down new responses to coordinator requests based on how full the backlog is.
// This function calculates how much a request should be slowed down based on the backlog's fullness.
// The equation is basically: delay(in seconds) = view_fullness_ratio^3
// The more full the backlog gets the more aggressively the requests are slowed down.
// The delay is limited to the amount of time left until timeout.
// After the timeout the request fails, so there's no point in waiting longer than that.
// The second argument defines this timeout point - we can't delay the request more than this time point.
// See: https://www.scylladb.com/2018/12/04/worry-free-ingestion-flow-control/
std::chrono::microseconds node_update_backlog::calculate_throttling_delay(update_backlog backlog,
db::timeout_clock::time_point timeout) const {
std::chrono::microseconds calculate_view_update_throttling_delay(db::view::update_backlog backlog,
db::timeout_clock::time_point timeout,
uint32_t view_flow_control_delay_limit_in_ms) {
auto adjust = [] (float x) { return x * x * x; };
auto budget = std::max(db::timeout_clock::duration(0),
timeout - db::timeout_clock::now());
std::chrono::microseconds ret(uint32_t(adjust(backlog.relative_size()) * _view_flow_control_delay_limit_in_ms() * 1000));
auto budget = std::max(service::storage_proxy::clock_type::duration(0),
timeout - service::storage_proxy::clock_type::now());
std::chrono::microseconds ret(uint32_t(adjust(backlog.relative_size()) * view_flow_control_delay_limit_in_ms * 1000));
// "budget" has millisecond resolution and can potentially be long
// in the future so converting it to microseconds may overflow.
// So to compare buget and ret we need to convert both to the lower
// resolution.
if (std::chrono::duration_cast<db::timeout_clock::duration>(ret) < budget) {
if (std::chrono::duration_cast<service::storage_proxy::clock_type::duration>(ret) < budget) {
return ret;
} else {
// budget is small (< ret) so can be converted to microseconds

View File

@@ -715,7 +715,7 @@ future<> view_building_worker::do_build_range(table_id base_id, std::vector<tabl
vbw_logger.info("Building range {} for base table {} and views {} was aborted.", range, base_id, views_ids);
} catch (...) {
eptr = std::current_exception();
vbw_logger.warn("Error during processing range {} for base table {} and views {}: {}", range, base_id, views_ids, eptr);
vbw_logger.warn("Error during processing range {} for base table {} and views {}: ", range, base_id, views_ids, eptr);
}
reader.close().get();

View File

@@ -43,7 +43,7 @@ public:
// Returns the number of bytes in the backlog divided by the maximum number of bytes
// that the backlog can hold before employing admission control. While the backlog
// is below the threshold, the coordinator will slow down the view updates up to
// node_update_backlog::calculate_throttling_delay()::delay_limit_us. Above the threshold,
// calculate_view_update_throttling_delay()::delay_limit_us. Above the threshold,
// the coordinator will reject the writes that would increase the backlog. On the
// replica, the writes will start failing only after reaching the hard limit '_max'.
float relative_size() const {
@@ -70,4 +70,18 @@ public:
}
};
// View updates are asynchronous, and because of this limiting their concurrency requires
// a special approach. The current algorithm places all of the pending view updates in the backlog
// and artificially slows down new responses to coordinator requests based on how full the backlog is.
// This function calculates how much a request should be slowed down based on the backlog's fullness.
// The equation is basically: delay(in seconds) = view_fullness_ratio^3
// The more full the backlog gets the more aggressively the requests are slowed down.
// The delay is limited to the amount of time left until timeout.
// After the timeout the request fails, so there's no point in waiting longer than that.
// The second argument defines this timeout point - we can't delay the request more than this time point.
// See: https://www.scylladb.com/2018/12/04/worry-free-ingestion-flow-control/
std::chrono::microseconds calculate_view_update_throttling_delay(
update_backlog backlog,
db::timeout_clock::time_point timeout,
uint32_t view_flow_control_delay_limit_in_ms);
}

View File

@@ -7,7 +7,6 @@
*/
#include "db/view/view_update_backlog.hh"
#include "db/view/node_view_update_backlog.hh"
#include <seastar/core/timed_out_error.hh>
#include "gms/inet_address.hh"
#include <seastar/util/defer.hh>
@@ -96,10 +95,9 @@ public:
}
};
view_update_generator::view_update_generator(replica::database& db, sharded<service::storage_proxy>& proxy, node_update_backlog& node_backlog, abort_source& as)
view_update_generator::view_update_generator(replica::database& db, sharded<service::storage_proxy>& proxy, abort_source& as)
: _db(db)
, _proxy(proxy)
, _node_update_backlog(node_backlog)
, _progress_tracker(std::make_unique<progress_tracker>())
, _early_abort_subscription(as.subscribe([this] () noexcept { do_abort(); }))
{
@@ -114,7 +112,7 @@ future<> view_update_generator::start() {
_started = seastar::async([this]() mutable {
auto drop_sstable_references = defer([&] () noexcept {
// Clear sstable references so sstables_manager::stop() doesn't hang.
vug_logger.info("leaving {} unstaged sstables and {} sstables with tables unprocessed",
vug_logger.info("leaving {} unstaged sstables unprocessed",
_sstables_to_move.size(), _sstables_with_tables.size());
_sstables_to_move.clear();
_sstables_with_tables.clear();
@@ -500,7 +498,7 @@ future<> view_update_generator::generate_and_propagate_view_updates(const replic
// the one which limits the number of incoming client requests by delaying the response to the client.
if (batch_num > 0) {
update_backlog local_backlog = _db.get_view_update_backlog();
std::chrono::microseconds throttle_delay = _node_update_backlog.calculate_throttling_delay(local_backlog, timeout);
std::chrono::microseconds throttle_delay = calculate_view_update_throttling_delay(local_backlog, timeout, _db.get_config().view_flow_control_delay_limit_in_ms());
co_await seastar::sleep(throttle_delay);

View File

@@ -52,7 +52,6 @@ using allow_hints = bool_class<allow_hints_tag>;
namespace db::view {
class node_update_backlog;
class stats;
struct wait_for_all_updates_tag {};
using wait_for_all_updates = bool_class<wait_for_all_updates_tag>;
@@ -64,7 +63,6 @@ public:
private:
replica::database& _db;
sharded<service::storage_proxy>& _proxy;
node_update_backlog& _node_update_backlog;
seastar::abort_source _as;
future<> _started = make_ready_future<>();
seastar::condition_variable _pending_sstables;
@@ -77,7 +75,7 @@ private:
optimized_optional<abort_source::subscription> _early_abort_subscription;
void do_abort() noexcept;
public:
view_update_generator(replica::database& db, sharded<service::storage_proxy>& proxy, node_update_backlog& node_backlog, abort_source& as);
view_update_generator(replica::database& db, sharded<service::storage_proxy>& proxy, abort_source& as);
~view_update_generator();
future<> start();

68
dist/CMakeLists.txt vendored
View File

@@ -141,72 +141,4 @@ add_dependencies(dist
dist-python3
dist-server)
set(dist_rpm_dir "${CMAKE_BINARY_DIR}/$<CONFIG>/dist/rpm")
set(dist_deb_dir "${CMAKE_BINARY_DIR}/$<CONFIG>/dist/deb")
# Map system processor to Debian architecture names
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
set(deb_arch "amd64")
elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
set(deb_arch "arm64")
else()
message(FATAL_ERROR "Unsupported architecture: ${CMAKE_SYSTEM_PROCESSOR}")
endif()
set(rpm_ver "${Scylla_VERSION}-${Scylla_RELEASE}")
set(deb_ver "${Scylla_VERSION}-${Scylla_RELEASE}-1")
set(rpm_arch "${CMAKE_SYSTEM_PROCESSOR}")
set(server_rpms_dir "${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG>/redhat/RPMS/${rpm_arch}")
set(server_rpms
"${server_rpms_dir}/${Scylla_PRODUCT}-${rpm_ver}.${rpm_arch}.rpm"
"${server_rpms_dir}/${Scylla_PRODUCT}-server-${rpm_ver}.${rpm_arch}.rpm"
"${server_rpms_dir}/${Scylla_PRODUCT}-server-debuginfo-${rpm_ver}.${rpm_arch}.rpm"
"${server_rpms_dir}/${Scylla_PRODUCT}-conf-${rpm_ver}.${rpm_arch}.rpm"
"${server_rpms_dir}/${Scylla_PRODUCT}-kernel-conf-${rpm_ver}.${rpm_arch}.rpm"
"${server_rpms_dir}/${Scylla_PRODUCT}-node-exporter-${rpm_ver}.${rpm_arch}.rpm")
set(cqlsh_rpms
"${CMAKE_SOURCE_DIR}/tools/cqlsh/build/redhat/RPMS/${rpm_arch}/${Scylla_PRODUCT}-cqlsh-${rpm_ver}.${rpm_arch}.rpm")
set(python3_rpms
"${CMAKE_SOURCE_DIR}/tools/python3/build/redhat/RPMS/${rpm_arch}/${Scylla_PRODUCT}-python3-${rpm_ver}.${rpm_arch}.rpm")
set(server_debs_dir "${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG>/debian")
set(server_debs
"${server_debs_dir}/${Scylla_PRODUCT}_${deb_ver}_${deb_arch}.deb"
"${server_debs_dir}/${Scylla_PRODUCT}-server_${deb_ver}_${deb_arch}.deb"
"${server_debs_dir}/${Scylla_PRODUCT}-server-dbg_${deb_ver}_${deb_arch}.deb"
"${server_debs_dir}/${Scylla_PRODUCT}-conf_${deb_ver}_${deb_arch}.deb"
"${server_debs_dir}/${Scylla_PRODUCT}-kernel-conf_${deb_ver}_${deb_arch}.deb"
"${server_debs_dir}/${Scylla_PRODUCT}-node-exporter_${deb_ver}_${deb_arch}.deb"
"${server_debs_dir}/scylla-enterprise_${deb_ver}_all.deb"
"${server_debs_dir}/scylla-enterprise-server_${deb_ver}_all.deb"
"${server_debs_dir}/scylla-enterprise-conf_${deb_ver}_all.deb"
"${server_debs_dir}/scylla-enterprise-kernel-conf_${deb_ver}_all.deb"
"${server_debs_dir}/scylla-enterprise-node-exporter_${deb_ver}_all.deb")
set(cqlsh_debs
"${CMAKE_SOURCE_DIR}/tools/cqlsh/build/debian/${Scylla_PRODUCT}-cqlsh_${deb_ver}_${deb_arch}.deb"
"${CMAKE_SOURCE_DIR}/tools/cqlsh/build/debian/scylla-enterprise-cqlsh_${deb_ver}_all.deb")
set(python3_debs
"${CMAKE_SOURCE_DIR}/tools/python3/build/debian/${Scylla_PRODUCT}-python3_${deb_ver}_${deb_arch}.deb"
"${CMAKE_SOURCE_DIR}/tools/python3/build/debian/scylla-enterprise-python3_${deb_ver}_all.deb")
add_custom_target(collect-dist-rpm
COMMAND ${CMAKE_COMMAND} -E rm -rf ${dist_rpm_dir}
COMMAND ${CMAKE_COMMAND} -E make_directory ${dist_rpm_dir}
COMMAND ${CMAKE_COMMAND} -E copy ${server_rpms} ${cqlsh_rpms} ${python3_rpms} ${dist_rpm_dir}/
DEPENDS dist
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
COMMENT "Collecting RPMs into ${dist_rpm_dir}")
add_custom_target(collect-dist-deb
COMMAND ${CMAKE_COMMAND} -E rm -rf ${dist_deb_dir}
COMMAND ${CMAKE_COMMAND} -E make_directory ${dist_deb_dir}
COMMAND ${CMAKE_COMMAND} -E copy ${server_debs} ${cqlsh_debs} ${python3_debs} ${dist_deb_dir}/
DEPENDS dist
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
COMMENT "Collecting DEBs into ${dist_deb_dir}")
add_custom_target(collect-dist
DEPENDS collect-dist-rpm collect-dist-deb)
add_subdirectory(debuginfo)

View File

@@ -324,13 +324,6 @@ experimental:
stream events. Without this option, such no-op operations may still
generate spurious stream events.
<https://github.com/scylladb/scylladb/issues/28368>
* When a stream is disabled, no new records are written but the existing
stream data is preserved and remains readable through its original
StreamArn. The data expires via TTL after 24 hours. Re-enabling the
stream purges the old data immediately and produces a new StreamArn.
In contrast, DynamoDB keeps the old stream and its data readable for
24 hours through the old StreamArn even after re-enabling.
<https://scylladb.atlassian.net/browse/SCYLLADB-1873>
## Unimplemented API features

View File

@@ -415,7 +415,7 @@ An empty list is allowed, and it's equivalent to numeric replication factor of 0
.. code-block:: cql
ALTER KEYSPACE Excelsior
WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc2' : []};
WITH replication = { 'class' : 'NetworkTopologyStrategy', dc2' : []};
Altering from a rack list to a numeric replication factor is not supported.
@@ -1017,11 +1017,11 @@ For example:
CREATE TABLE customer_data (
cust_id uuid,
"cust_first-name" text,
"cust_last-name" text,
cust_first-name text,
cust_last-name text,
cust_phone text,
"cust_get-sms" text,
PRIMARY KEY (cust_id)
cust_get-sms text,
PRIMARY KEY (customer_id)
) WITH cdc = { 'enabled' : 'true', 'preimage' : 'true' };
.. _cql-caching-options:

View File

@@ -24,8 +24,7 @@ For example:
INSERT INTO NerdMovies (movie, director, main_actor, year)
VALUES ('Serenity', 'Joss Whedon', 'Nathan Fillion', 2005)
IF NOT EXISTS
USING TTL 86400;
USING TTL 86400 IF NOT EXISTS;
The ``INSERT`` statement writes one or more columns for a given row in a table. Note that since a row is identified by
its ``PRIMARY KEY``, at least the columns composing it must be specified. The list of columns to insert to must be

View File

@@ -71,7 +71,7 @@ used. If it is used, the statement will be a no-op if the materialized view alre
MV Select Statement
...................
The select statement of a materialized view creation defines which of the base table is included in the view. That
The select statement of a materialized view creation defines which of the base table columns are included in the view. That
statement is limited in a number of ways:
- The :ref:`selection <selection-clause>` is limited to those that only select columns of the base table. In other

View File

@@ -507,7 +507,7 @@ For example::
CREATE TABLE superheroes (
name frozen<full_name> PRIMARY KEY,
home frozen<address>
home address
);
.. note::

View File

@@ -4,7 +4,7 @@ Upgrade ScyllaDB
.. toctree::
ScyllaDB 2026.1 to ScyllaDB 2026.2 <upgrade-guide-from-2026.1-to-2026.2/index>
ScyllaDB 2025.x to ScyllaDB 2026.1 <upgrade-guide-from-2025.x-to-2026.1/index>
ScyllaDB 2026.x Patch Upgrades <upgrade-guide-from-2026.x.y-to-2026.x.z>
ScyllaDB Image <ami-upgrade>

View File

@@ -0,0 +1,13 @@
==========================================================
Upgrade - ScyllaDB 2025.x to ScyllaDB 2026.1
==========================================================
.. toctree::
:maxdepth: 2
:hidden:
Upgrade ScyllaDB <upgrade-guide-from-2025.x-to-2026.1>
Metrics Update <metric-update-2025.x-to-2026.1>
* :doc:`Upgrade from ScyllaDB 2025.x to ScyllaDB 2026.1 <upgrade-guide-from-2025.x-to-2026.1>`
* :doc:`Metrics Update Between 2025.x and 2026.1 <metric-update-2025.x-to-2026.1>`

View File

@@ -0,0 +1,82 @@
.. |SRC_VERSION| replace:: 2025.x
.. |NEW_VERSION| replace:: 2026.1
.. |PRECEDING_VERSION| replace:: 2025.4
================================================================
Metrics Update Between |SRC_VERSION| and |NEW_VERSION|
================================================================
.. toctree::
:maxdepth: 2
:hidden:
ScyllaDB |NEW_VERSION| Dashboards are available as part of the latest |mon_root|.
New Metrics in |NEW_VERSION|
--------------------------------------
The following metrics are new in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.
.. list-table::
:widths: 25 150
:header-rows: 1
* - Metric
- Description
* - scylla_alternator_operation_size_kb
- Histogram of item sizes involved in a request.
* - scylla_column_family_total_disk_space_before_compression
- Hypothetical total disk space used if data files weren't compressed
* - scylla_group_name_auto_repair_enabled_nr
- Number of tablets with auto repair enabled.
* - scylla_group_name_auto_repair_needs_repair_nr
- Number of tablets with auto repair enabled that currently need repair.
* - scylla_lsa_compact_time_ms
- Total time spent on segment compaction that was not accounted under ``reclaim_time_ms``.
* - scylla_lsa_evict_time_ms
- Total time spent on evicting objects that was not accounted under ``reclaim_time_ms``,
* - scylla_lsa_reclaim_time_ms
- Total time spent in reclaiming LSA memory back to std allocator.
* - scylla_object_storage_memory_usage
- Total number of bytes consumed by the object storage client.
* - scylla_tablet_ops_failed
- Number of failed tablet auto repair attempts.
* - scylla_tablet_ops_succeeded
- Number of successful tablet auto repair attempts.
Renamed Metrics in |NEW_VERSION|
--------------------------------------
The following metrics are renamed in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.
.. list-table::
:widths: 25 150
:header-rows: 1
* - Metric Name in |PRECEDING_VERSION|
- Metric Name in |NEW_VERSION|
* - scylla_s3_memory_usage
- scylla_object_storage_memory_usage
Removed Metrics in |NEW_VERSION|
--------------------------------------
The following metrics are removed in ScyllaDB |NEW_VERSION|.
* scylla_redis_current_connections
* scylla_redis_op_latency
* scylla_redis_operation
* scylla_redis_operation
* scylla_redis_requests_latency
* scylla_redis_requests_served
* scylla_redis_requests_serving
New and Updated Metrics in Previous Releases
-------------------------------------------------------
* `Metrics Update Between 2025.3 and 2025.4 <https://docs.scylladb.com/manual/branch-2025.4/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2025.4/metric-update-2025.x-to-2025.4.html>`_
* `Metrics Update Between 2025.2 and 2025.3 <https://docs.scylladb.com/manual/branch-2025.3/upgrade/upgrade-guides/upgrade-guide-from-2025.2-to-2025.3/metric-update-2025.2-to-2025.3.html>`_
* `Metrics Update Between 2025.1 and 2025.2 <https://docs.scylladb.com/manual/branch-2025.2/upgrade/upgrade-guides/upgrade-guide-from-2025.1-to-2025.2/metric-update-2025.1-to-2025.2.html>`_

View File

@@ -1,13 +1,13 @@
.. |SCYLLA_NAME| replace:: ScyllaDB
.. |SRC_VERSION| replace:: 2026.1
.. |NEW_VERSION| replace:: 2026.2
.. |SRC_VERSION| replace:: 2025.x
.. |NEW_VERSION| replace:: 2026.1
.. |ROLLBACK| replace:: rollback
.. _ROLLBACK: ./#rollback-procedure
.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - ScyllaDB 2026.1 to 2026.2
.. _SCYLLA_METRICS: ../metric-update-2026.1-to-2026.2
.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - ScyllaDB 2025.x to 2026.1
.. _SCYLLA_METRICS: ../metric-update-2025.x-to-2026.1
=======================================================================================
Upgrade from |SCYLLA_NAME| |SRC_VERSION| to |SCYLLA_NAME| |NEW_VERSION|

View File

@@ -1,13 +0,0 @@
==========================================================
Upgrade - ScyllaDB 2026.1 to ScyllaDB 2026.2
==========================================================
.. toctree::
:maxdepth: 2
:hidden:
Upgrade ScyllaDB <upgrade-guide-from-2026.1-to-2026.2>
Metrics Update <metric-update-2026.1-to-2026.2>
* :doc:`Upgrade from ScyllaDB 2026.1 to ScyllaDB 2026.2 <upgrade-guide-from-2026.1-to-2026.2>`
* :doc:`Metrics Update Between 2026.1 and 2026.2 <metric-update-2026.1-to-2026.2>`

View File

@@ -1,126 +0,0 @@
.. |SRC_VERSION| replace:: 2026.1
.. |NEW_VERSION| replace:: 2026.2
.. |PRECEDING_VERSION| replace:: 2026.1
================================================================
Metrics Update Between |SRC_VERSION| and |NEW_VERSION|
================================================================
.. toctree::
:maxdepth: 2
:hidden:
ScyllaDB |NEW_VERSION| Dashboards are available as part of the latest |mon_root|.
New Metrics in |NEW_VERSION|
--------------------------------------
The following metrics are new in ScyllaDB |NEW_VERSION| compared to |PRECEDING_VERSION|.
.. list-table::
:widths: 25 150
:header-rows: 1
* - Metric
- Description
* - scylla_auth_cache_permissions
- Total number of permission sets currently cached across all roles.
* - scylla_auth_cache_roles
- Number of roles currently cached.
* - scylla_cql_forwarded_requests
- Counts the total number of attempts to forward CQL requests to other nodes.
One request may be forwarded multiple times, particularly when a write is
handled by a non-replica node.
* - scylla_cql_write_consistency_levels_disallowed_violations
- Counts the number of write_consistency_levels_disallowed guardrail violations,
i.e. attempts to write with a forbidden consistency level.
* - scylla_cql_write_consistency_levels_warned_violations
- Counts the number of write_consistency_levels_warned guardrail violations,
i.e. attempts to write with a discouraged consistency level.
* - scylla_cql_writes_per_consistency_level
- Counts the number of writes for each consistency level.
* - scylla_io_queue_integrated_disk_queue_length
- Length of the integrated disk queue.
* - scylla_io_queue_integrated_queue_length
- Length of the integrated queue.
* - scylla_logstor_sm_bytes_freed
- Counts the number of data bytes freed.
* - scylla_logstor_sm_bytes_read
- Counts the number of bytes read from the disk.
* - scylla_logstor_sm_bytes_written
- Counts the number of bytes written to the disk.
* - scylla_logstor_sm_compaction_bytes_written
- Counts the number of bytes written to the disk by compaction.
* - scylla_logstor_sm_compaction_data_bytes_written
- Counts the number of data bytes written to the disk by compaction.
* - scylla_logstor_sm_compaction_records_rewritten
- Counts the number of records rewritten during compaction.
* - scylla_logstor_sm_compaction_records_skipped
- Counts the number of records skipped during compaction.
* - scylla_logstor_sm_compaction_segments_freed
- Counts the number of data bytes written to the disk.
* - scylla_logstor_sm_disk_usage
- Total disk usage.
* - scylla_logstor_sm_free_segments
- Counts the number of free segments currently available.
* - scylla_logstor_sm_segment_pool_compaction_segments_get
- Counts the number of segments taken from the segment pool for compaction.
* - scylla_logstor_sm_segment_pool_normal_segments_get
- Counts the number of segments taken from the segment pool for normal writes.
* - scylla_logstor_sm_segment_pool_normal_segments_wait
- Counts the number of times normal writes had to wait for a segment to become
available in the segment pool.
* - scylla_logstor_sm_segment_pool_segments_put
- Counts the number of segments returned to the segment pool.
* - scylla_logstor_sm_segment_pool_separator_segments_get
- Counts the number of segments taken from the segment pool for separator writes.
* - scylla_logstor_sm_segment_pool_size
- Counts the number of segments in the segment pool.
* - scylla_logstor_sm_segments_allocated
- Counts the number of segments allocated.
* - scylla_logstor_sm_segments_compacted
- Counts the number of segments compacted.
* - scylla_logstor_sm_segments_freed
- Counts the number of segments freed.
* - scylla_logstor_sm_segments_in_use
- Counts the number of segments currently in use.
* - scylla_logstor_sm_separator_buffer_flushed
- Counts the number of times the separator buffer has been flushed.
* - scylla_logstor_sm_separator_bytes_written
- Counts the number of bytes written to the separator.
* - scylla_logstor_sm_separator_data_bytes_written
- Counts the number of data bytes written to the separator.
* - scylla_logstor_sm_separator_flow_control_delay
- Current delay applied to writes to control separator debt in microseconds.
* - scylla_logstor_sm_separator_segments_freed
- Counts the number of segments freed by the separator.
* - scylla_transport_cql_pending_response_memory
- Holds the total memory in bytes consumed by responses waiting to be sent.
* - scylla_transport_cql_request_histogram_bytes
- A histogram of received bytes in CQL messages of a specific kind and
specific scheduling group.
* - scylla_transport_cql_requests_serving
- Holds the number of requests that are being processed right now.
* - scylla_transport_cql_response_histogram_bytes
- A histogram of received bytes in CQL messages of a specific kind and
specific scheduling group.
* - scylla_transport_requests_forwarded_failed
- Counts the number of requests that were forwarded to another replica
but failed to execute there.
* - scylla_transport_requests_forwarded_prepared_not_found
- Counts the number of requests that were forwarded to another replica
but failed there because the statement was not prepared on the target.
When this happens, the coordinator performs an additional remote call
to prepare the statement on the replica and retries the EXECUTE request
afterwards.
* - scylla_transport_requests_forwarded_redirected
- Counts the number of requests that were forwarded to another replica
but that replica responded with a redirect to another node. This can
happen when replica has stale information about the cluster topology or
when the request is handled by a node that is not a replica for the data
being accessed by the request.
* - scylla_transport_requests_forwarded_successfully
- Counts the number of requests that were forwarded to another replica
and executed successfully there.

View File

@@ -598,7 +598,7 @@ future<int> kmip_host::impl::do_cmd(KMIP_CMD* cmd, con_ptr cp, Func& f, bool ret
template<typename Func>
future<kmip_host::impl::kmip_cmd> kmip_host::impl::do_cmd(kmip_cmd cmd_in, Func && f) {
kmip_log.trace("{}: begin do_cmd {}", *this, cmd_in);
kmip_log.trace("{}: begin do_cmd", *this, cmd_in);
KMIP_CMD* cmd = cmd_in;
// #998 Need to do retry loop, because we can have either timed out connection,

View File

@@ -616,7 +616,7 @@ future<rjson::value> encryption::kms_host::impl::do_post(std::string_view target
static auto get_xml_node = [](node_type* node, const char* what) {
auto res = node->first_node(what);
if (!res) {
throw malformed_response_error(fmt::format("XML parse error: {}", what));
throw malformed_response_error(fmt::format("XML parse error", what));
}
return res;
};

View File

@@ -109,7 +109,6 @@ std::set<std::string_view> feature_service::supported_feature_set() const {
"UUID_SSTABLE_IDENTIFIERS"sv,
"GROUP0_SCHEMA_VERSIONING"sv,
"VIEW_BUILD_STATUS_ON_GROUP0"sv,
"CDC_GENERATIONS_V2"sv,
};
if (is_test_only_feature_deprecated()) {

View File

@@ -83,6 +83,7 @@ public:
gms::feature alternator_ttl { *this, "ALTERNATOR_TTL"sv };
gms::feature cql_row_ttl { *this, "CQL_ROW_TTL"sv };
gms::feature range_scan_data_variant { *this, "RANGE_SCAN_DATA_VARIANT"sv };
gms::feature cdc_generations_v2 { *this, "CDC_GENERATIONS_V2"sv };
gms::feature user_defined_aggregates { *this, "UDA"sv };
// Historically max_result_size contained only two fields: soft_limit and
// hard_limit. It was somehow obscure because for normal paged queries both

View File

@@ -399,10 +399,9 @@ future<> gossiper::do_send_ack2_msg(locator::host_id from, utils::chunked_vector
}
}
gms::gossip_digest_ack2 ack2_msg(std::move(delta_ep_state_map));
auto ack2_msg_str = fmt::format("{}", ack2_msg);
logger.debug("Calling do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg_str);
logger.debug("Calling do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg);
co_await ser::gossip_rpc_verbs::send_gossip_digest_ack2(&_messaging, from, std::move(ack2_msg));
logger.debug("finished do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg_str);
logger.debug("finished do_send_ack2_msg to node {}, ack_msg_digest={}, ack2_msg={}", from, ack_msg_digest, ack2_msg);
}
// Depends on
@@ -965,7 +964,8 @@ future<> gossiper::failure_detector_loop_for_node(locator::host_id host_id, gene
diff = now - last;
if (!failed) {
last = now;
} else if (diff > max_duration) {
}
if (diff > max_duration) {
logger.info("failure_detector_loop: Mark node {}/{} as DOWN", host_id, node);
co_await container().invoke_on(0, [host_id] (gms::gossiper& g) {
return g.convict(host_id);

View File

@@ -87,6 +87,9 @@ std::set<sstring> get_disabled_features_from_db_config(const db::config& cfg, st
}
}
if (!cfg.check_experimental(db::experimental_features_t::feature::ALTERNATOR_STREAMS)) {
disabled.insert("ALTERNATOR_STREAMS"s);
}
if (!cfg.check_experimental(db::experimental_features_t::feature::KEYSPACE_STORAGE_OPTIONS)) {
disabled.insert("KEYSPACE_STORAGE_OPTIONS"s);
}

61
main.cc
View File

@@ -1358,7 +1358,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
};
spcfg.hinted_handoff_enabled = hinted_handoff_enabled;
spcfg.available_memory = memory::stats().total_memory();
spcfg.maintenance_mode = maintenance_mode_enabled{cfg->maintenance_mode()};
smp_service_group_config storage_proxy_smp_service_group_config;
// Assuming less than 1kB per queued request, this limits storage_proxy submit_to() queues to 5MB or less
storage_proxy_smp_service_group_config.max_nonlocal_requests = 5000;
@@ -1367,7 +1366,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
spcfg.write_mv_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get();
spcfg.hints_write_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get();
spcfg.write_ack_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get();
static db::view::node_update_backlog node_backlog(smp::count, 10ms, cfg->view_flow_control_delay_limit_in_ms);
static db::view::node_update_backlog node_backlog(smp::count, 10ms);
scheduling_group_key_config storage_proxy_stats_cfg =
make_scheduling_group_key_config<service::storage_proxy_stats::stats>();
storage_proxy_stats_cfg.constructor = [plain_constructor = storage_proxy_stats_cfg.constructor] (void* ptr) {
@@ -1811,18 +1810,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
utils::get_local_injector().inject("stop_after_starting_migration_manager",
[] { std::raise(SIGSTOP); });
// Audit must be constructed before the maintenance socket so
// that on shutdown (reverse destruction order) the audit service
// outlives the maintenance socket and in-flight queries can
// still reach audit::inspect() safely.
checkpoint(stop_signal, "starting audit service");
audit::audit::start_audit(*cfg, token_metadata, qp, mm).handle_exception([&] (auto&& e) {
startlog.error("audit start failed: {}", e);
}).get();
auto audit_stop = defer([] {
audit::audit::stop_audit().get();
});
// XXX: stop_raft has to happen before query_processor and migration_manager
// is stopped, since some groups keep using the query
// processor until are stopped inside stop_raft.
@@ -1854,7 +1841,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
});
checkpoint(stop_signal, "starting view update generator");
view_update_generator.start(std::ref(db), std::ref(proxy), std::ref(node_backlog), std::ref(stop_signal.as_sharded_abort_source())).get();
view_update_generator.start(std::ref(db), std::ref(proxy), std::ref(stop_signal.as_sharded_abort_source())).get();
auto stop_view_update_generator = defer_verbose_shutdown("view update generator", [] {
view_update_generator.stop().get();
});
@@ -2300,12 +2287,10 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
ss.local().wait_for_group0_stop().get();
});
if (!group0_service.maintenance_mode() && sys_ks.local().bootstrap_complete()) {
// Setup group0 early in case the node is bootstrapped already and the group exists.
// Need to do it before allowing incoming messaging service connections since
// storage proxy's and migration manager's verbs may access group0.
group0_service.setup_group0_if_exist(sys_ks.local(), ss.local(), qp.local(), mm.local()).get();
}
// Setup group0 early in case the node is bootstrapped already and the group exists.
// Need to do it before allowing incoming messaging service connections since
// storage proxy's and migration manager's verbs may access group0.
group0_service.setup_group0_if_exist(sys_ks.local(), ss.local(), qp.local(), mm.local()).get();
// The call to setup_group0_if_exists() above guarantees that, if group0 is
// created and started, the locally persisted group0 state has been applied
@@ -2355,22 +2340,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
}).get();
stop_signal.ready(false);
// At this point, `locator::topology` should be stable, i.e. we should have complete information
// about the layout of the cluster (= list of nodes along with the racks/DCs).
startlog.info("Verifying that all of the keyspaces are RF-rack-valid");
db.local().check_rf_rack_validity(token_metadata.local().get());
startlog.info("Verifying that all of the tablet keyspaces use rack list replication factors");
db.local().check_rack_list_everywhere(cfg->enforce_rack_list());
// The table-based audit backend needs Raft (via join_cluster)
// to create its keyspace and table.
checkpoint(stop_signal, "starting audit storage");
audit::audit::start_storage(*cfg).get();
auto audit_storage_stop = defer([] {
audit::audit::stop_storage().get();
});
if (cfg->maintenance_socket() != "ignore") {
// Enable role operations now that node joined the cluster
maintenance_auth_service.invoke_on_all([](auth::service& svc) {
@@ -2380,6 +2349,24 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
start_cql(*cql_maintenance_server_ctl, stop_maintenance_cql, "maintenance native server");
}
// At this point, `locator::topology` should be stable, i.e. we should have complete information
// about the layout of the cluster (= list of nodes along with the racks/DCs).
startlog.info("Verifying that all of the keyspaces are RF-rack-valid");
db.local().check_rf_rack_validity(token_metadata.local().get());
startlog.info("Verifying that all of the tablet keyspaces use rack list replication factors");
db.local().check_rack_list_everywhere(cfg->enforce_rack_list());
// Start audit service after join_cluster so that the table-based audit backend
// can properly create its keyspace and table.
checkpoint(stop_signal, "starting audit service");
audit::audit::start_audit(*cfg, token_metadata, qp, mm).handle_exception([&] (auto&& e) {
startlog.error("audit start failed: {}", e);
}).get();
auto audit_stop = defer([] {
audit::audit::stop_audit().get();
});
// Semantic validation of sstable compression parameters from config.
// Adding here (i.e., after `join_cluster`) to ensure that the
// required SSTABLE_COMPRESSION_DICTS cluster feature has been negotiated.

View File

@@ -48,8 +48,8 @@ static void set_field(atomic_cell_value& out, unsigned offset, T val) {
}
template <FragmentRange Buffer>
static void set_value(atomic_cell_value_mutable_view b, unsigned value_offset, const Buffer& value) {
auto v = b.substr(value_offset, value.size_bytes());
static void set_value(managed_bytes& b, unsigned value_offset, const Buffer& value) {
auto v = managed_bytes_mutable_view(b).substr(value_offset, value.size_bytes());
for (auto frag : value) {
write_fragmented(v, single_fragmented_view(frag));
}
@@ -141,36 +141,20 @@ public:
SCYLLA_ASSERT(is_live_and_has_ttl(cell));
return gc_clock::duration(get_field<int32_t>(cell, ttl_offset));
}
static size_t dead_serialized_size() {
return flags_size + timestamp_size + deletion_time_size;
}
static size_t live_serialized_size(size_t value_size) {
return flags_size + timestamp_size + value_size;
}
static size_t live_expiring_serialized_size(size_t value_size) {
return flags_size + timestamp_size + expiry_size + ttl_size + value_size;
}
static void write_dead(atomic_cell_value_mutable_view b, api::timestamp_type timestamp, gc_clock::time_point deletion_time) {
static managed_bytes make_dead(api::timestamp_type timestamp, gc_clock::time_point deletion_time) {
managed_bytes b(managed_bytes::initialized_later(), flags_size + timestamp_size + deletion_time_size);
b[0] = 0;
set_field(b, timestamp_offset, timestamp);
set_field(b, deletion_time_offset, static_cast<int64_t>(deletion_time.time_since_epoch().count()));
}
static managed_bytes make_dead(api::timestamp_type timestamp, gc_clock::time_point deletion_time) {
managed_bytes b(managed_bytes::initialized_later(), dead_serialized_size());
write_dead(b, timestamp, deletion_time);
return b;
}
template <FragmentRange Buffer>
static void write_live(atomic_cell_value_mutable_view b, api::timestamp_type timestamp, const Buffer& value) {
static managed_bytes make_live(api::timestamp_type timestamp, const Buffer& value) {
auto value_offset = flags_size + timestamp_size;
managed_bytes b(managed_bytes::initialized_later(), value_offset + value.size_bytes());
b[0] = LIVE_FLAG;
set_field(b, timestamp_offset, timestamp);
set_value(b, value_offset, value);
}
template <FragmentRange Buffer>
static managed_bytes make_live(api::timestamp_type timestamp, const Buffer& value) {
managed_bytes b(managed_bytes::initialized_later(), live_serialized_size(value.size_bytes()));
write_live(b, timestamp, value);
return b;
}
static managed_bytes make_live_counter_update(api::timestamp_type timestamp, int64_t value) {
@@ -182,18 +166,14 @@ public:
return b;
}
template <FragmentRange Buffer>
static void write_live(atomic_cell_value_mutable_view b, api::timestamp_type timestamp, const Buffer& value, gc_clock::time_point expiry, gc_clock::duration ttl) {
static managed_bytes make_live(api::timestamp_type timestamp, const Buffer& value, gc_clock::time_point expiry, gc_clock::duration ttl) {
auto value_offset = flags_size + timestamp_size + expiry_size + ttl_size;
managed_bytes b(managed_bytes::initialized_later(), value_offset + value.size_bytes());
b[0] = EXPIRY_FLAG | LIVE_FLAG;
set_field(b, timestamp_offset, timestamp);
set_field(b, expiry_offset, static_cast<int64_t>(expiry.time_since_epoch().count()));
set_field(b, ttl_offset, static_cast<int32_t>(ttl.count()));
set_value(b, value_offset, value);
}
template <FragmentRange Buffer>
static managed_bytes make_live(api::timestamp_type timestamp, const Buffer& value, gc_clock::time_point expiry, gc_clock::duration ttl) {
managed_bytes b(managed_bytes::initialized_later(), live_expiring_serialized_size(value.size_bytes()));
write_live(b, timestamp, value, expiry, ttl);
return b;
}
static managed_bytes make_live_uninitialized(api::timestamp_type timestamp, size_t size) {

View File

@@ -113,10 +113,10 @@ auto fmt::formatter<canonical_mutation>::format(const canonical_mutation& cm, fm
auto&& entry = _cm.static_column_at(id);
_os = fmt::format_to(_os, "static column {} {}", bytes_to_text(entry.name()), atomic_cell::printer(*entry.type(), ac));
}
virtual void accept_static_cell(column_id id, collection_mutation cm) override {
virtual void accept_static_cell(column_id id, collection_mutation_view cmv) override {
print_separator();
auto&& entry = _cm.static_column_at(id);
_os = fmt::format_to(_os, "static column {} {}", bytes_to_text(entry.name()), collection_mutation_view::printer(*entry.type(), cm));
_os = fmt::format_to(_os, "static column {} {}", bytes_to_text(entry.name()), collection_mutation_view::printer(*entry.type(), cmv));
}
virtual stop_iteration accept_row_tombstone(range_tombstone rt) override {
print_separator();
@@ -137,10 +137,10 @@ auto fmt::formatter<canonical_mutation>::format(const canonical_mutation& cm, fm
auto&& entry = _cm.regular_column_at(id);
_os = fmt::format_to(_os, "column {} {}", bytes_to_text(entry.name()), atomic_cell::printer(*entry.type(), ac));
}
virtual void accept_row_cell(column_id id, collection_mutation cm) override {
virtual void accept_row_cell(column_id id, collection_mutation_view cmv) override {
print_separator();
auto&& entry = _cm.regular_column_at(id);
_os = fmt::format_to(_os, "column {} {}", bytes_to_text(entry.name()), collection_mutation_view::printer(*entry.type(), cm));
_os = fmt::format_to(_os, "column {} {}", bytes_to_text(entry.name()), collection_mutation_view::printer(*entry.type(), cmv));
}
out_t finalize() {
if (_in_row) {

View File

@@ -7,14 +7,12 @@
*/
#include "utils/assert.hh"
#include "utils/on_internal_error.hh"
#include "types/collection.hh"
#include "types/user.hh"
#include "types/concrete_types.hh"
#include "mutation/mutation_partition.hh"
#include "compaction/compaction_garbage_collector.hh"
#include "combine.hh"
#include "idl/mutation.dist.impl.hh"
#include "collection_mutation.hh"
@@ -226,26 +224,13 @@ compact_and_expire_result collection_mutation_description::compact_and_expire(co
return res;
}
/// A CollectionMutationAdaptor is a static interface that adapts a collection
/// element (an iterator value type) to the serialization requirements of
/// serialize_collection_mutation(). It provides static methods to measure the
/// serialized sizes and to write the key and value of each element into a buffer.
template <typename Adaptor, typename Element>
concept CollectionMutationAdaptor = requires(const Element& e, managed_bytes_mutable_view& out) {
{ Adaptor::key_size(e) } -> std::convertible_to<size_t>;
{ Adaptor::value_size(e) } -> std::convertible_to<size_t>;
{ Adaptor::write_key(e, out) };
{ Adaptor::write_value(e, out) };
};
template <typename Adaptor, typename Iterator>
requires CollectionMutationAdaptor<Adaptor, std::iter_value_t<Iterator>>
template <typename Iterator>
static collection_mutation serialize_collection_mutation(
const abstract_type& type,
const tombstone& tomb,
std::ranges::subrange<Iterator> cells) {
auto element_size = [] (size_t c, auto&& e) -> size_t {
return c + 8 + Adaptor::key_size(e) + Adaptor::value_size(e);
return c + 8 + e.first.size() + e.second.serialize().size();
};
auto size = std::ranges::fold_left(cells, (size_t)4, element_size);
size += 1;
@@ -259,112 +244,32 @@ static collection_mutation serialize_collection_mutation(
write<int64_t>(out, tomb.timestamp);
write<int64_t>(out, tomb.deletion_time.time_since_epoch().count());
}
auto writek = [&out] (auto& kv) {
write<int32_t>(out, Adaptor::key_size(kv));
Adaptor::write_key(kv, out);
auto writek = [&out] (bytes_view v) {
write<int32_t>(out, v.size());
write_fragmented(out, single_fragmented_view(v));
};
auto writev = [&out] (auto& kv) {
write<int32_t>(out, Adaptor::value_size(kv));
Adaptor::write_value(kv, out);
auto writev = [&out] (managed_bytes_view v) {
write<int32_t>(out, v.size());
write_fragmented(out, v);
};
// FIXME: overflow?
write<int32_t>(out, std::ranges::distance(cells));
for (auto&& kv : cells) {
writek(kv);
writev(kv);
auto&& k = kv.first;
auto&& v = kv.second;
writek(k);
writev(v.serialize());
}
return collection_mutation(type, std::move(ret));
}
namespace {
/// A key-value pair where the key is bytes-like and the value is an atomic_cell-like type
/// with a serialize() method returning managed_bytes_view.
template <typename T>
concept AtomicCellKV = requires(const T& kv) {
{ kv.first.size() } -> std::convertible_to<size_t>;
{ kv.second.serialize() } -> std::convertible_to<managed_bytes_view>;
};
struct atomic_cell_adaptor {
static size_t key_size(const AtomicCellKV auto& v) { return v.first.size(); }
static size_t value_size(const AtomicCellKV auto& v) { return v.second.serialize().size(); }
static void write_key(const AtomicCellKV auto& v, managed_bytes_mutable_view& out) {
write_fragmented(out, single_fragmented_view(v.first));
}
static void write_value(const AtomicCellKV auto& v, managed_bytes_mutable_view& out) {
write_fragmented(out, v.second.serialize());
}
};
}
collection_mutation collection_mutation_description::serialize(const abstract_type& type) const {
return serialize_collection_mutation<atomic_cell_adaptor>(type, tomb, std::ranges::subrange(cells.begin(), cells.end()));
return serialize_collection_mutation(type, tomb, std::ranges::subrange(cells.begin(), cells.end()));
}
collection_mutation collection_mutation_view_description::serialize(const abstract_type& type) const {
return serialize_collection_mutation<atomic_cell_adaptor>(type, tomb, std::ranges::subrange(cells.begin(), cells.end()));
}
namespace {
struct serialized_cell_adaptor {
static size_t key_size(const ser::collection_element_view& v) {
return v.key().view().size_bytes();
}
static size_t value_size(const ser::collection_element_view& v) {
struct collection_cell_visitor {
size_t operator()(const ser::live_cell_view& lcv) const { return atomic_cell_type::live_serialized_size(lcv.value().view().size_bytes()); }
size_t operator()(const ser::expiring_cell_view& ecv) const { return atomic_cell_type::live_expiring_serialized_size(ecv.c().value().view().size_bytes()); }
size_t operator()(const ser::dead_cell_view& dcv) const { return atomic_cell_type::dead_serialized_size(); }
size_t operator()(const ser::counter_cell_view& ccv) const { utils::on_internal_error("Trying to deserialize counter cell from collection"); }
size_t operator()(const ser::unknown_variant_type&) const { utils::on_internal_error("Trying to deserialize cell in unknown state"); };
};
return boost::apply_visitor(collection_cell_visitor{}, v.value());
}
static void write_key(const ser::collection_element_view& v, managed_bytes_mutable_view& out) {
write_fragmented(out, v.key().view());
}
static void write_value(const ser::collection_element_view& v, managed_bytes_mutable_view& out) {
struct collection_cell_visitor {
managed_bytes_mutable_view& out;
void operator()(const ser::live_cell_view& lcv) const {
const auto v = lcv.value().view();
atomic_cell_type::write_live(out, lcv.created_at(), v);
out.remove_prefix(atomic_cell_type::live_serialized_size(v.size_bytes()));
}
void operator()(const ser::expiring_cell_view& ecv) const {
const auto v = ecv.c().value().view();
atomic_cell_type::write_live(out, ecv.c().created_at(), v, ecv.expiry(), ecv.ttl());
out.remove_prefix(atomic_cell_type::live_expiring_serialized_size(v.size_bytes()));
}
void operator()(const ser::dead_cell_view& dcv) const {
atomic_cell_type::write_dead(out, dcv.tomb().timestamp(), dcv.tomb().deletion_time());
out.remove_prefix(atomic_cell_type::dead_serialized_size());
}
void operator()(const ser::counter_cell_view& ccv) const {
utils::on_internal_error("Trying to deserialize counter cell from collection");
}
void operator()(const ser::unknown_variant_type&) const {
utils::on_internal_error("Trying to deserialize cell in unknown state");
}
};
boost::apply_visitor(collection_cell_visitor{out}, v.value());
}
};
}
collection_mutation read_from_collection_cell_view(const abstract_type& type, const ser::collection_cell_view& collection) {
auto tomb = collection.tomb();
auto cells = collection.elements();
return serialize_collection_mutation<serialized_cell_adaptor>(type, tomb, std::ranges::subrange(cells.begin(), cells.end()));
return serialize_collection_mutation(type, tomb, std::ranges::subrange(cells.begin(), cells.end()));
}
template <typename C>

View File

@@ -23,10 +23,6 @@ class row_tombstone;
class collection_mutation;
namespace ser {
class collection_cell_view;
}
// An auxiliary struct used to (de)construct collection_mutations.
// Unlike collection_mutation which is a serialized blob, this struct allows to inspect logical units of information
// (tombstone and cells) inside the mutation easily.
@@ -134,12 +130,6 @@ collection_mutation merge(const abstract_type&, collection_mutation_view, collec
collection_mutation difference(const abstract_type&, collection_mutation_view, collection_mutation_view);
// Transcode a collection from the IDL representation directly into the
// collection_mutation serialization format, without using any intermediary representation.
// Only the final collection-mutation blob is allocated, no intermediate allocations needed.
// Safe to use in LSA, it won't produce garbage.
collection_mutation read_from_collection_cell_view(const abstract_type&, const ser::collection_cell_view&);
// Serializes the given collection of cells to a sequence of bytes ready to be sent over the CQL protocol.
bytes_ostream serialize_for_cql(const abstract_type&, collection_mutation_view);

View File

@@ -97,9 +97,9 @@ public:
r.append_cell(id, atomic_cell_or_collection(std::move(cell)));
}
virtual void accept_static_cell(column_id id, collection_mutation collection) override {
virtual void accept_static_cell(column_id id, collection_mutation_view collection) override {
row& r = _static_row.maybe_create();
r.append_cell(id, std::move(collection));
r.append_cell(id, collection_mutation(*_schema.static_column_at(id).type, std::move(collection)));
}
virtual stop_iteration accept_row_tombstone(range_tombstone rt) override {
@@ -125,9 +125,9 @@ public:
r.append_cell(id, std::move(cell));
}
virtual void accept_row_cell(column_id id, collection_mutation collection) override {
virtual void accept_row_cell(column_id id, collection_mutation_view collection) override {
row& r = _current_row->cells();
r.append_cell(id, std::move(collection));
r.append_cell(id, collection_mutation(*_schema.regular_column_at(id).type, std::move(collection)));
}
auto on_end_of_partition() {

View File

@@ -707,10 +707,9 @@ struct fmt::formatter<shadowable_tombstone> : fmt::formatter<string_view> {
template <typename FormatContext>
auto format(const shadowable_tombstone& t, FormatContext& ctx) const {
if (t) {
auto& tomb = t.tomb();
return fmt::format_to(ctx.out(),
"{{shadowable tombstone: timestamp={}, deletion_time={}}}",
tomb.timestamp, tomb.deletion_time.time_since_epoch().count());
t.tomb().timestamp, t.tomb(), t.tomb().deletion_time.time_since_epoch().count());
} else {
return fmt::format_to(ctx.out(),
"{{shadowable tombstone: none}}");

View File

@@ -86,6 +86,37 @@ atomic_cell read_atomic_cell(const abstract_type& type, atomic_cell_variant cv,
return boost::apply_visitor(atomic_cell_visitor(type, cm), cv);
}
collection_mutation read_collection_cell(const abstract_type& type, ser::collection_cell_view cv)
{
collection_mutation_description mut;
mut.tomb = cv.tomb();
auto&& elements = cv.elements();
mut.cells.reserve(elements.size());
visit(type, make_visitor(
[&] (const collection_type_impl& ctype) {
auto& value_type = *ctype.value_comparator();
for (auto&& e : elements) {
mut.cells.emplace_back(e.key(), read_atomic_cell(value_type, e.value(), atomic_cell::collection_member::yes));
}
},
[&] (const user_type_impl& utype) {
for (auto&& e : elements) {
bytes key = e.key();
auto idx = deserialize_field_index(key);
SCYLLA_ASSERT(idx < utype.size());
mut.cells.emplace_back(key, read_atomic_cell(*utype.type(idx), e.value(), atomic_cell::collection_member::yes));
}
},
[&] (const abstract_type& o) {
throw std::runtime_error(format("attempted to read a collection cell with type: {}", o.name()));
}
));
return mut.serialize(type);
}
template<typename Visitor>
void read_and_visit_row(ser::row_view rv, const column_mapping& cm, column_kind kind, Visitor&& visitor)
{
@@ -111,7 +142,14 @@ void read_and_visit_row(ser::row_view rv, const column_mapping& cm, column_kind
if (_col.is_atomic()) {
throw std::runtime_error("An atomic cell expected, got a collection");
}
_visitor.accept_collection(_id, read_from_collection_cell_view(*_col.type(), ccv));
// FIXME: Pass view to cell to avoid copy
auto&& outer = current_allocator();
with_allocator(standard_allocator(), [&] {
auto cell = read_collection_cell(*_col.type(), ccv);
with_allocator(outer, [&] {
_visitor.accept_collection(_id, cell);
});
});
}
void operator()(ser::unknown_variant_type&) const {
throw std::runtime_error("Trying to deserialize unknown cell type");
@@ -160,8 +198,8 @@ void mutation_partition_view::do_accept(const column_mapping& cm, Visitor& visit
void accept_atomic_cell(column_id id, atomic_cell ac) const {
_visitor.accept_static_cell(id, std::move(ac));
}
void accept_collection(column_id id, collection_mutation cm) const {
_visitor.accept_static_cell(id, std::move(cm));
void accept_collection(column_id id, const collection_mutation& cm) const {
_visitor.accept_static_cell(id, cm);
}
};
read_and_visit_row(mpv.static_row(), cm, column_kind::static_column, static_row_cell_visitor{visitor});
@@ -180,8 +218,8 @@ void mutation_partition_view::do_accept(const column_mapping& cm, Visitor& visit
void accept_atomic_cell(column_id id, atomic_cell ac) const {
_visitor.accept_row_cell(id, std::move(ac));
}
void accept_collection(column_id id, collection_mutation cm) const {
_visitor.accept_row_cell(id, std::move(cm));
void accept_collection(column_id id, const collection_mutation& cm) const {
_visitor.accept_row_cell(id, cm);
}
};
read_and_visit_row(cr.cells(), cm, column_kind::regular_column, cell_visitor{visitor});
@@ -202,8 +240,8 @@ future<> mutation_partition_view::do_accept_gently(const column_mapping& cm, Vis
void accept_atomic_cell(column_id id, atomic_cell ac) const {
_visitor.accept_static_cell(id, std::move(ac));
}
void accept_collection(column_id id, collection_mutation cm) const {
_visitor.accept_static_cell(id, std::move(cm));
void accept_collection(column_id id, const collection_mutation& cm) const {
_visitor.accept_static_cell(id, cm);
}
};
read_and_visit_row(mpv.static_row(), cm, column_kind::static_column, static_row_cell_visitor{visitor});
@@ -225,8 +263,8 @@ future<> mutation_partition_view::do_accept_gently(const column_mapping& cm, Vis
void accept_atomic_cell(column_id id, atomic_cell ac) const {
_visitor.accept_row_cell(id, std::move(ac));
}
void accept_collection(column_id id, collection_mutation cm) const {
_visitor.accept_row_cell(id, std::move(cm));
void accept_collection(column_id id, const collection_mutation& cm) const {
_visitor.accept_row_cell(id, cm);
}
};
read_and_visit_row(cr.cells(), cm, column_kind::regular_column, cell_visitor{visitor});
@@ -248,8 +286,8 @@ future<> mutation_partition_view::do_accept_gently(const column_mapping& cm, Asy
void accept_atomic_cell(column_id id, atomic_cell ac) const {
_visitor.accept_static_cell(id, std::move(ac));
}
void accept_collection(column_id id, collection_mutation cm) const {
_visitor.accept_static_cell(id, std::move(cm));
void accept_collection(column_id id, const collection_mutation& cm) const {
_visitor.accept_static_cell(id, cm);
}
};
read_and_visit_row(mpv.static_row(), cm, column_kind::static_column, static_row_cell_visitor{visitor});
@@ -270,8 +308,8 @@ future<> mutation_partition_view::do_accept_gently(const column_mapping& cm, Asy
void accept_atomic_cell(column_id id, atomic_cell ac) const {
_visitor.accept_row_cell(id, std::move(ac));
}
void accept_collection(column_id id, collection_mutation cm) const {
_visitor.accept_row_cell(id, std::move(cm));
void accept_collection(column_id id, const collection_mutation& cm) const {
_visitor.accept_row_cell(id, cm);
}
};
read_and_visit_row(cr.cells(), cm, column_kind::regular_column, cell_visitor{visitor});
@@ -299,8 +337,8 @@ mutation_partition_view::accept_ordered_result mutation_partition_view::do_accep
void accept_atomic_cell(column_id id, atomic_cell ac) const {
_visitor.accept_static_cell(id, std::move(ac));
}
void accept_collection(column_id id, collection_mutation cm) const {
_visitor.accept_static_cell(id, std::move(cm));
void accept_collection(column_id id, const collection_mutation& cm) const {
_visitor.accept_static_cell(id, cm);
}
};
read_and_visit_row(mpv.static_row(), cm, column_kind::static_column, static_row_cell_visitor{visitor});
@@ -338,8 +376,8 @@ mutation_partition_view::accept_ordered_result mutation_partition_view::do_accep
void accept_atomic_cell(column_id id, atomic_cell ac) const {
_visitor.accept_row_cell(id, std::move(ac));
}
void accept_collection(column_id id, collection_mutation cm) const {
_visitor.accept_row_cell(id, std::move(cm));
void accept_collection(column_id id, const collection_mutation& cm) const {
_visitor.accept_row_cell(id, cm);
}
};
read_and_visit_row(cr.cells(), cm, column_kind::regular_column, cell_visitor{visitor});
@@ -463,40 +501,44 @@ mutation_partition_view mutation_partition_view::from_view(ser::mutation_partiti
clustering_row read_clustered_row(const schema& s, ser::clustering_row_view crv) {
class clustering_row_builder {
const schema& _s;
clustering_row _row;
public:
clustering_row_builder(clustering_key key, row_tombstone t, row_marker m)
: _row(std::move(key), std::move(t), std::move(m), row()) { }
clustering_row_builder(const schema& s, clustering_key key, row_tombstone t, row_marker m)
: _s(s), _row(std::move(key), std::move(t), std::move(m), row()) { }
void accept_atomic_cell(column_id id, atomic_cell ac) {
_row.cells().append_cell(id, std::move(ac));
}
void accept_collection(column_id id, collection_mutation cm) {
_row.cells().append_cell(id, std::move(cm));
void accept_collection(column_id id, const collection_mutation& cm) {
_row.cells().append_cell(id, collection_mutation(*_s.regular_column_at(id).type, cm));
}
clustering_row get() && { return std::move(_row); }
};
auto cr = crv.row();
auto t = row_tombstone(cr.deleted_at(), shadowable_tombstone(cr.shadowable_deleted_at()));
clustering_row_builder builder(cr.key(), std::move(t), read_row_marker(cr.marker()));
clustering_row_builder builder(s, cr.key(), std::move(t), read_row_marker(cr.marker()));
read_and_visit_row(cr.cells(), s.get_column_mapping(), column_kind::regular_column, builder);
return std::move(builder).get();
}
static_row read_static_row(const schema& s, ser::static_row_view sr) {
class static_row_builder {
const schema& _s;
static_row _row;
public:
explicit static_row_builder(const schema& s)
: _s(s) { }
void accept_atomic_cell(column_id id, atomic_cell ac) {
_row.cells().append_cell(id, std::move(ac));
}
void accept_collection(column_id id, collection_mutation cm) {
_row.cells().append_cell(id, std::move(cm));
void accept_collection(column_id id, const collection_mutation& cm) {
_row.cells().append_cell(id, collection_mutation(*_s.static_column_at(id).type, cm));
}
static_row get() && { return std::move(_row); }
};
static_row_builder builder;
static_row_builder builder(s);
read_and_visit_row(sr.cells(), s.get_column_mapping(), column_kind::static_column, builder);
return std::move(builder).get();
}

View File

@@ -23,31 +23,31 @@ class converting_mutation_partition_applier;
template<typename T>
concept MutationViewVisitor = requires (T& visitor, tombstone t, atomic_cell ac,
collection_mutation cm, range_tombstone rt,
collection_mutation_view cmv, range_tombstone rt,
position_in_partition_view pipv, row_tombstone row_tomb,
row_marker rm) {
visitor.accept_partition_tombstone(t);
visitor.accept_static_cell(column_id(), std::move(ac));
visitor.accept_static_cell(column_id(), std::move(cm));
visitor.accept_static_cell(column_id(), cmv);
visitor.accept_row_tombstone(rt);
visitor.accept_row(pipv, row_tomb, rm,
is_dummy::no, is_continuous::yes);
visitor.accept_row_cell(column_id(), std::move(ac));
visitor.accept_row_cell(column_id(), std::move(cm));
visitor.accept_row_cell(column_id(), cmv);
};
template<typename T>
concept AsyncMutationViewVisitor = requires (T& visitor, tombstone t, atomic_cell ac,
collection_mutation cm, range_tombstone rt,
collection_mutation_view cmv, range_tombstone rt,
position_in_partition_view pipv, row_tombstone row_tomb,
row_marker rm) {
{ visitor.accept_partition_tombstone(t) } -> std::same_as<void>;
{ visitor.accept_static_cell(column_id(), std::move(ac)) } -> std::same_as<void>;
{ visitor.accept_static_cell(column_id(), std::move(cm)) } -> std::same_as<void>;
{ visitor.accept_static_cell(column_id(), cmv) } -> std::same_as<void>;
{ visitor.accept_row_tombstone(rt) } -> std::same_as<future<>>;
{ visitor.accept_row(pipv, row_tomb, rm, is_dummy::no, is_continuous::yes) } -> std::same_as<future<>>;
{ visitor.accept_row_cell(column_id(), std::move(ac)) } -> std::same_as<void>;
{ visitor.accept_row_cell(column_id(), std::move(cm)) } -> std::same_as<void>;
{ visitor.accept_row_cell(column_id(), cmv) } -> std::same_as<void>;
{ visitor.accept_end_of_partition() } -> std::same_as<future<>>;
};
@@ -56,11 +56,11 @@ public:
virtual ~mutation_partition_view_virtual_visitor();
virtual void accept_partition_tombstone(tombstone t) = 0;
virtual void accept_static_cell(column_id, atomic_cell ac) = 0;
virtual void accept_static_cell(column_id, collection_mutation cm) = 0;
virtual void accept_static_cell(column_id, collection_mutation_view cmv) = 0;
virtual stop_iteration accept_row_tombstone(range_tombstone rt) = 0;
virtual stop_iteration accept_row(position_in_partition_view pipv, row_tombstone rt, row_marker rm, is_dummy, is_continuous) = 0;
virtual void accept_row_cell(column_id, atomic_cell ac) = 0;
virtual void accept_row_cell(column_id, collection_mutation cm) = 0;
virtual void accept_row_cell(column_id, collection_mutation_view cmv) = 0;
};
// View on serialized mutation partition. See mutation_partition_serializer.

View File

@@ -46,12 +46,8 @@ public:
}
virtual void accept_static_cell(column_id id, collection_mutation_view collection) override {
accept_static_cell(id, collection_mutation(*_schema.static_column_at(id).type, std::move(collection)));
}
void accept_static_cell(column_id id, collection_mutation&& collection) {
row& r = _partition.static_row().maybe_create();
r.append_cell(id, std::move(collection));
r.append_cell(id, collection_mutation(*_schema.static_column_at(id).type, std::move(collection)));
}
virtual void accept_row_tombstone(const range_tombstone& rt) override {
@@ -76,12 +72,8 @@ public:
}
virtual void accept_row_cell(column_id id, collection_mutation_view collection) override {
accept_row_cell(id, collection_mutation(*_schema.regular_column_at(id).type, std::move(collection)));
}
void accept_row_cell(column_id id, collection_mutation collection) {
row& r = _current_row->cells();
r.append_cell(id, std::move(collection));
r.append_cell(id, collection_mutation(*_schema.regular_column_at(id).type, std::move(collection)));
}
};

View File

@@ -16,6 +16,7 @@ Usage:
import argparse, os, sys
from typing import Sequence
from test.pylib.driver_utils import safe_driver_shutdown
def read_statements(path: str) -> list[tuple[int, str]]:
stms: list[tuple[int, str]] = []
@@ -57,7 +58,7 @@ def exec_statements(statements: list[tuple[int, str]], socket_path: str, timeout
print(f"ERROR executing statement from file line {lineno}: {s}\n{e}", file=sys.stderr)
return 1
finally:
cluster.shutdown()
safe_driver_shutdown(cluster)
return 0
def main(argv: Sequence[str]) -> int:

View File

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8b22f9a548a03c88250d31e97ea3e8f77b4d90c502bcf74336c24056557f947f
size 6698412
oid sha256:524c54493b72c5e1b783f14dfa49d733e21b24cc2ec776e9c6e578095073162d
size 6646304

View File

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:31e515a62f006649b0dc4671b51b2643fba9a70884c09b90fbc2237044954254
size 6707108
oid sha256:fec2bb253d43139da954cee3441fc8bc74824246b080f23bf1f824714d0adc45
size 6646576

View File

@@ -239,10 +239,7 @@ private:
// Drop waiter that we lost track of, can happen due to a snapshot transfer,
// or a leader removed from cluster while some entries added on it are uncommitted.
// When `snp` is provided (snapshot transfer case), waiters whose term matches
// the snapshot term are resolved successfully, since the snapshot-term match proves
// they were committed and included in the snapshot (by the Log Matching Property).
void drop_waiters(const snapshot_descriptor* snp = nullptr);
void drop_waiters(std::optional<index_t> idx = {});
// Wake up all waiter that wait for entries with idx smaller of equal to the one provided
// to be applied.
@@ -559,10 +556,12 @@ future<> server_impl::wait_for_entry(entry_id eid, wait_type type, seastar::abor
auto snap_term = _fsm->log_term_for(snap_idx);
SCYLLA_ASSERT(snap_term);
SCYLLA_ASSERT(snap_idx >= eid.idx);
if (snap_term == eid.term) {
if (type == wait_type::committed && snap_term == eid.term) {
logger.trace("[{}] wait_for_entry {}.{}: entry got truncated away, but has the snapshot's term"
" (snapshot index: {})", id(), eid.term, eid.idx, snap_idx);
co_return;
// We don't do this for `wait_type::applied` - see below why.
}
logger.trace("[{}] wait_for_entry {}.{}: entry got truncated away", id(), eid.term, eid.idx);
@@ -573,6 +572,20 @@ future<> server_impl::wait_for_entry(entry_id eid, wait_type type, seastar::abor
throw dropped_entry();
}
if (type == wait_type::applied && _fsm->log_last_snapshot_idx() >= eid.idx) {
// We know the entry was committed but the wait type is `applied`
// and we don't know if the entry was applied with `state_machine::apply`
// (we may've loaded a snapshot before we managed to apply the entry).
// As specified by `add_entry`, throw `commit_status_unknown` in this case.
//
// FIXME: replace this with a different exception type - `commit_status_unknown`
// gives too much uncertainty while we know that the entry was committed
// and had to be applied on at least one server. Some callers of `add_entry`
// need to know only that the current state includes that entry, whether it was done
// through `apply` on this server or through receiving a snapshot.
throw commit_status_unknown();
}
co_return;
}
}
@@ -747,8 +760,6 @@ future<> server_impl::add_entry(command command, wait_type type, seastar::abort_
throw not_a_leader{leader};
}
auto eid = co_await add_entry_on_leader(std::move(command), as);
co_await utils::get_local_injector().inject("block_raft_add_entry_before_wait_for_entry",
utils::wait_for_message(std::chrono::minutes(5)));
co_return co_await wait_for_entry(eid, type, as);
}
@@ -984,24 +995,17 @@ void server_impl::notify_waiters(std::map<index_t, op_status>& waiters,
}
}
void server_impl::drop_waiters(const snapshot_descriptor* snp) {
void server_impl::drop_waiters(std::optional<index_t> idx) {
auto drop = [&] (std::map<index_t, op_status>& waiters) {
while (waiters.size() != 0) {
auto it = waiters.begin();
if (snp && it->first > snp->idx) {
if (idx && it->first > *idx) {
break;
}
auto [entry_idx, status] = std::move(*it);
waiters.erase(it);
if (snp && status.term == snp->term) {
// entry_idx <= snapshot index and the entry's term matches the snapshot term.
// By the Log Matching Property the entry was committed and included in the snapshot.
status.done.set_value();
_stats.waiters_awoken++;
} else {
status.done.set_exception(commit_status_unknown());
_stats.waiters_dropped++;
}
status.done.set_exception(commit_status_unknown());
_stats.waiters_dropped++;
}
};
drop(_awaited_commits);
@@ -1427,7 +1431,7 @@ future<> server_impl::applier_fiber() {
// Apply snapshot it to the state machine
logger.trace("[{}] apply_fiber applying snapshot {}", _id, snp.id);
co_await _state_machine->load_snapshot(snp.id);
drop_waiters(&snp);
drop_waiters(snp.idx);
_applied_idx = snp.idx;
_applied_index_changed.broadcast();
_stats.sm_load_snapshot++;
@@ -1936,7 +1940,7 @@ std::unique_ptr<server> create_server(server_id uuid, std::unique_ptr<rpc> rpc,
}
std::ostream& operator<<(std::ostream& os, const server_impl& s) {
fmt::print(os, "[id: {}, fsm ({})]\n", s._id, *s._fsm);
fmt::print(os, "[id: {}, fsm ()]\n", s._id, *s._fsm);
return os;
}

View File

@@ -79,18 +79,18 @@ public:
// The caller may pass a pointer to an abort_source to make the operation abortable.
// If it passes nullptr, the operation is unabortable.
//
// Successful `add_entry` does not guarantee that `state_machine::apply` will be called
// locally for this entry. Between the commit and the application we may load a snapshot
// containing this entry, so the state machine's state 'jumps' forward in time, skipping
// the local entry application. For `wait_type::applied` this should be fine, because
// state machine implementations shouldn't care whether an entry was applied via
// `state_machine::apply` or via a snapshot load.
// Successful `add_entry` with `wait_type::committed` does not guarantee that `state_machine::apply` will be called
// locally for this entry. Between the commit and the application we may receive a snapshot containing this entry,
// so the state machine's state 'jumps' forward in time, skipping the entry application.
// However, for `wait_type::applied`, we guarantee that the entry will be applied locally with `state_machine::apply`.
// If a snapshot causes the state machine to jump over the entry, `add_entry` will return `commit_status_unknown`
// (even if the snapshot included that entry).
//
// Exceptions:
// raft::commit_status_unknown
// Thrown if the leader has changed and the log entry has either
// been replaced by the new leader or the server has lost track of it.
// It may also be thrown in case of a transport error while forwarding add_entry to the leader.
// It may also be thrown in case of a transport error while forwarding add_entry to the leader.L
// raft::dropped_entry
// Thrown if the entry was replaced because of a leader change.
// raft::request_aborted

View File

@@ -1022,7 +1022,8 @@ void database::drop_keyspace(const sstring& name) {
static bool is_system_table(const schema& s) {
auto& k = s.ks_name();
return k == db::system_keyspace::NAME ||
k == db::system_distributed_keyspace::NAME;
k == db::system_distributed_keyspace::NAME ||
k == db::system_distributed_keyspace::NAME_EVERYWHERE;
}
sstables::sstables_manager& database::get_sstables_manager(const schema& s) const {
@@ -1141,7 +1142,7 @@ future<> database::create_local_system_table(
cfg.memtable_scheduling_group = default_scheduling_group();
cfg.memtable_to_cache_scheduling_group = default_scheduling_group();
}
auto lock = co_await get_tables_metadata().hold_write_lock();
auto lock = get_tables_metadata().hold_write_lock();
std::exception_ptr ex;
try {
add_column_family(ks, table, std::move(cfg), replica::database::is_new_cf::no);
@@ -1327,27 +1328,9 @@ future<global_table_ptr> get_table_on_all_shards(sharded<database>& sharded_db,
future<tables_metadata_lock_on_all_shards> database::lock_tables_metadata(sharded<database>& sharded_db) {
tables_metadata_lock_on_all_shards locks;
// Acquire write lock on shard 0 first, and then on the remaining shards.
//
// Parallel acquisition on all shards could deadlock when two
// fibers call lock_tables_metadata() concurrently: parallel_for_each
// sends SMP messages to all shards even when the local shard's lock
// attempt blocks. If task reordering (SEASTAR_SHUFFLE_TASK_QUEUE in
// debug/sanitize builds) causes fiber A to win on shard X while
// fiber B wins on shard Y, neither can make progress — classic
// cross-shard lock-ordering deadlock.
//
// Acquiring the write lock on shard 0 first, and then on the remaining
// shards, eliminates this: whichever fiber acquires shard 0 first is
// guaranteed to acquire locks on all other shards before the other fiber
// can acquire the lock on shard 0.
co_await sharded_db.invoke_on(0, [&locks, &sharded_db] (auto& db) -> future<> {
co_await sharded_db.invoke_on_all([&] (auto& db) -> future<> {
locks.assign_lock(co_await db.get_tables_metadata().hold_write_lock());
co_await sharded_db.invoke_on_others([&locks] (auto& db) -> future<> {
locks.assign_lock(co_await db.get_tables_metadata().hold_write_lock());
});
});
co_return locks;
}

View File

@@ -48,6 +48,7 @@ bool is_system_keyspace(std::string_view name) {
static const std::unordered_set<std::string_view> internal_keyspaces = {
db::system_distributed_keyspace::NAME,
db::system_distributed_keyspace::NAME_EVERYWHERE,
db::system_keyspace::NAME,
db::schema_tables::NAME,
auth::meta::legacy::AUTH_KS,

View File

@@ -4624,7 +4624,7 @@ future<db::replay_position> table::discard_sstables(db_clock::time_point truncat
sstables::shared_sstable sst;
replica::enable_backlog_tracker enable_backlog_tracker;
};
std::unordered_map<size_t, std::vector<removed_sstable>> per_cg_remove;
std::vector<removed_sstable> remove;
_stats.pending_sstable_deletions++;
auto undo_stats = defer([this] {
@@ -4633,7 +4633,7 @@ future<db::replay_position> table::discard_sstables(db_clock::time_point truncat
auto permit = co_await get_sstable_list_permit();
co_await _cache.invalidate(row_cache::external_updater([this, &rp, &per_cg_remove, truncated_at] {
co_await _cache.invalidate(row_cache::external_updater([this, &rp, &remove, truncated_at] {
// FIXME: the following isn't exception safe.
for_each_compaction_group([&] (compaction_group& cg) {
@@ -4648,7 +4648,7 @@ future<db::replay_position> table::discard_sstables(db_clock::time_point truncat
if (p->originated_on_this_node().value_or(false) && p->get_stats_metadata().position.shard_id() == this_shard_id()) {
rp = std::max(p->get_stats_metadata().position, rp);
}
per_cg_remove[cg.group_id()].emplace_back(removed_sstable{cg, p, enable_backlog_tracker});
remove.emplace_back(removed_sstable{cg, p, enable_backlog_tracker});
return;
}
pruned->insert(p);
@@ -4665,19 +4665,16 @@ future<db::replay_position> table::discard_sstables(db_clock::time_point truncat
}));
rebuild_statistics();
co_await coroutine::parallel_for_each(per_cg_remove, [&] (auto& entry) {
auto& removed = entry.second;
std::vector<sstables::shared_sstable> del;
del.reserve(removed.size());
for (auto& r : removed) {
if (r.enable_backlog_tracker) {
remove_sstable_from_backlog_tracker(r.cg.get_backlog_tracker(), r.sst);
}
erase_sstable_cleanup_state(r.sst);
del.emplace_back(std::move(r.sst));
std::vector<sstables::shared_sstable> del;
del.reserve(remove.size());
for (auto& r : remove) {
if (r.enable_backlog_tracker) {
remove_sstable_from_backlog_tracker(r.cg.get_backlog_tracker(), r.sst);
}
return delete_sstables_atomically(permit, std::move(del));
});
erase_sstable_cleanup_state(r.sst);
del.emplace_back(r.sst);
};
co_await delete_sstables_atomically(permit, std::move(del));
co_return rp;
}
@@ -5612,7 +5609,7 @@ future<> compaction_group::cleanup() {
auto updater = row_cache::external_updater(std::make_unique<compaction_group_cleaner>(*this));
auto p_range = to_partition_range(token_range());
tlogger.debug("Invalidating range {} for compaction group {} of table {}.{} during cleanup.",
tlogger.debug("Invalidating range {} for compaction group {} of table {} during cleanup.",
p_range, group_id(), _t.schema()->ks_name(), _t.schema()->cf_name());
// Since permit is still held, all actions below will be executed atomically:
co_await _t._cache.invalidate(std::move(updater), p_range);

View File

@@ -1328,14 +1328,8 @@ class interval_printer(gdb.printing.PrettyPrinter):
def __init__(self, val):
self.val = val['_interval']
def inspect_bound(self, bound_name):
if f'_{bound_name}_exists' in self.val:
if not self.val[f'_{bound_name}_exists']:
return False, False, None
return True, bool(self.val[f'_{bound_name}_inclusive']), self.val[f'_{bound_name}_value']
bound = std_optional(self.val[f'_{bound_name}'])
def inspect_bound(self, bound_opt):
bound = std_optional(bound_opt)
if not bound:
return False, False, None
@@ -1344,8 +1338,8 @@ class interval_printer(gdb.printing.PrettyPrinter):
return True, bool(bound['_inclusive']), bound['_value']
def to_string(self):
has_start, start_inclusive, start_value = self.inspect_bound('start')
has_end, end_inclusive, end_value = self.inspect_bound('end')
has_start, start_inclusive, start_value = self.inspect_bound(self.val['_start'])
has_end, end_inclusive, end_value = self.inspect_bound(self.val['_end'])
if self.val['_singular']:
return '{{{}}}'.format(str(start_value))
@@ -5472,9 +5466,10 @@ class scylla_compaction_tasks(gdb.Command):
try:
task_list = list(intrusive_list(cm['_tasks']))
except gdb.error: # 6.2 compatibility
task_list = [seastar_shared_ptr(t).get().dereference() for t in std_list(cm['_tasks'])]
task_list = list(std_list(cm['_tasks']))
for task in task_list:
task = seastar_shared_ptr(task).get().dereference()
schema = schema_ptr(task['_compacting_table'].dereference()['_schema'])
key = 'type={}, state={:5}, {}'.format(task['_type'], str(task['_state']), schema.table_name())
task_hist.add(key)

View File

@@ -122,9 +122,11 @@ future<> service::client_state::check_internal_table_permissions(std::string_vie
auth::permission::ALTER, auth::permission::DROP>();
if (forbidden_permissions.contains(cmd.permission)) {
if (ks == db::system_distributed_keyspace::NAME
if ((ks == db::system_distributed_keyspace::NAME || ks == db::system_distributed_keyspace::NAME_EVERYWHERE)
&& (table_name == db::system_distributed_keyspace::CDC_DESC_V2
|| table_name == db::system_distributed_keyspace::CDC_TIMESTAMPS)) {
|| table_name == db::system_distributed_keyspace::CDC_TOPOLOGY_DESCRIPTION
|| table_name == db::system_distributed_keyspace::CDC_TIMESTAMPS
|| table_name == db::system_distributed_keyspace::CDC_GENERATIONS_V2)) {
return make_exception_future(exceptions::unauthorized_exception(
format("Cannot {} {}", auth::permissions::to_string(cmd.permission), cmd.resource)));
}

View File

@@ -239,7 +239,7 @@ future<> migration_manager::wait_for_schema_agreement(const replica::database& d
as->check();
}
if (db::timeout_clock::now() > deadline) {
throw schema_agreement_timeout();
throw std::runtime_error("Unable to reach schema agreement");
}
co_await (as ? sleep_abortable(std::chrono::milliseconds(500), *as) : sleep(std::chrono::milliseconds(500)));
}

View File

@@ -14,7 +14,6 @@
#include "gms/endpoint_state.hh"
#include <seastar/core/abort_source.hh>
#include <seastar/core/gate.hh>
#include <seastar/core/timed_out_error.hh>
#include "gms/inet_address.hh"
#include "gms/feature.hh"
#include "gms/i_endpoint_state_change_subscriber.hh"
@@ -134,19 +133,6 @@ public:
* Known peers in the cluster have the same schema version as us.
*/
bool have_schema_agreement();
// Thrown by wait_for_schema_agreement() when the deadline is reached.
struct schema_agreement_timeout : public seastar::timed_out_error {
const char* what() const noexcept override {
return "Unable to reach schema agreement";
}
};
/**
* Waits until all known live peers have the same schema version as this
* node. Returns normally once agreement is reached, or throws
* schema_agreement_timeout if the deadline is reached before agreement.
* If as != nullptr, can also throw abort_requested_exception if the abort
* source fires.
*/
future<> wait_for_schema_agreement(const replica::database& db, db::timeout_clock::time_point deadline, seastar::abort_source* as);
// Maximum number of retries one should attempt when trying to perform

View File

@@ -438,10 +438,9 @@ static future<cql3::untyped_result_set> do_execute_cql_with_timeout(sstring req,
const auto cache_key = qp.compute_id(req, "", cql3::internal_dialect());
auto ps_ptr = qp.get_prepared(cache_key);
shared_ptr<cql_transport::messages::result_message::prepared> prepared_msg;
if (!ps_ptr) {
prepared_msg = co_await qp.prepare(req, qs, cql3::internal_dialect());
ps_ptr = prepared_msg->get_prepared();
const auto msg_ptr = co_await qp.prepare(req, qs, cql3::internal_dialect());
ps_ptr = msg_ptr->get_prepared();
if (!ps_ptr) {
on_internal_error(paxos_state::logger, "prepared statement is null");
}
@@ -450,8 +449,8 @@ static future<cql3::untyped_result_set> do_execute_cql_with_timeout(sstring req,
-1, service::node_local_only::yes);
const auto st = ps_ptr->statement;
const auto result_ptr = co_await st->execute(qp, qs, qo, std::nullopt);
co_return cql3::untyped_result_set(result_ptr);
const auto msg_ptr = co_await st->execute(qp, qs, qo, std::nullopt);
co_return cql3::untyped_result_set(msg_ptr);
}
template <typename... Args>

View File

@@ -26,6 +26,7 @@
#include <seastar/coroutine/maybe_yield.hh>
#include "service/qos/raft_service_level_distributed_data_accessor.hh"
#include "service_level_controller.hh"
#include "db/system_distributed_keyspace.hh"
#include "cql3/query_processor.hh"
#include "service/storage_service.hh"
#include "service/topology_state_machine.hh"

View File

@@ -31,6 +31,7 @@
namespace db {
class system_keyspace;
class system_distributed_keyspace;
}
namespace cql3 {
class query_processor;

View File

@@ -434,8 +434,6 @@ future<> group0_state_machine::load_snapshot(raft::snapshot_id id) {
}
future<> group0_state_machine::enable_in_memory_state_machine() {
co_await utils::get_local_injector().inject("group0_state_machine_enable_in_memory_fail",
[] { return std::make_exception_ptr(std::runtime_error("injected failure in enable_in_memory_state_machine")); });
auto read_apply_mutex_holder = co_await _client.hold_read_apply_mutex(_abort_source);
if (!_in_memory_state_machine_enabled) {
_in_memory_state_machine_enabled = true;

View File

@@ -452,16 +452,14 @@ future<> raft_group0::start_server_for_group0(raft::group_id group0_id, service:
auto srv_for_group0 = create_server_for_group0(group0_id, my_id, ss, qp, mm);
auto& persistence = srv_for_group0.persistence;
auto& server = *srv_for_group0.server;
co_await with_scheduling_group(_sg, [this, &srv_for_group0, group0_id] (this auto self) -> future<> {
co_await with_scheduling_group(_sg, [this, &srv_for_group0] (this auto self) -> future<> {
auto& state_machine = dynamic_cast<group0_state_machine&>(srv_for_group0.state_machine);
co_await _raft_gr.start_server_for_group(std::move(srv_for_group0));
// Set _group0 immediately after the server is registered in _raft_gr._servers.
// This ensures abort_and_drain()/destroy() can find and clean up the server
// even if enable_in_memory_state_machine() or later steps throw.
_group0.emplace<raft::group_id>(group0_id);
co_await state_machine.enable_in_memory_state_machine();
});
_group0.emplace<raft::group_id>(group0_id);
// Fix for scylladb/scylladb#16683:
// If the snapshot index is 0, trigger creation of a new snapshot
// so bootstrapping nodes will receive a snapshot transfer.
@@ -683,6 +681,16 @@ bool raft_group0::maintenance_mode() {
}
future<> raft_group0::setup_group0_if_exist(db::system_keyspace& sys_ks, service::storage_service& ss, cql3::query_processor& qp, service::migration_manager& mm) {
if (maintenance_mode()) {
co_return;
}
if (!sys_ks.bootstrap_complete()) {
// If bootstrap did not complete yet, there is no group 0 to setup at this point
// -- it will be done after we start gossiping, in `setup_group0`.
co_return;
}
auto group0_id = raft::group_id{co_await sys_ks.get_raft_group0_id()};
if (group0_id) {
// Group 0 ID is present => we've already joined group 0 earlier.
@@ -703,6 +711,15 @@ future<> raft_group0::setup_group0(
db::system_keyspace& sys_ks, const std::unordered_set<gms::inet_address>& initial_contact_nodes, shared_ptr<group0_handshaker> handshaker,
service::storage_service& ss, cql3::query_processor& qp, service::migration_manager& mm,
const join_node_request_params& params) {
if (maintenance_mode()) {
// The node is in maintenance mode.
co_return;
}
if (joined_group0()) {
// Group 0 is already set up, there is nothing to do.
co_return;
}
// Reaching this point is possible only in two cases:
// - the node is bootstrapping,
// - the node is restarting in the Raft-based recovery procedure and has not joined the new group 0 yet.
@@ -1019,7 +1036,7 @@ with_timeout(abort_source& as, db::timeout_clock::duration d, F&& fun) {
} catch (...) {
// There should be no other exceptions, but just in case, catch and discard.
// we want to propagate exceptions from `f`, not from sleep.
group0_log.error("unexpected exception from sleep_and_abort: {}", std::current_exception());
group0_log.error("unexpected exception from sleep_and_abort", std::current_exception());
}
// Translate aborts caused by timeout to `timed_out_error`.

View File

@@ -271,10 +271,6 @@ public:
seastar::scheduling_group get_scheduling_group() {
return _sg;
}
// Returns true if in maintenance mode
bool maintenance_mode();
private:
static void init_rpc_verbs(raft_group0& shard0_this);
static future<> uninit_rpc_verbs(netw::messaging_service& ms);
@@ -336,6 +332,9 @@ private:
// Does not affect non-members. This behavior is only guaranteed if no concurrent membership changes are happening.
future<> modify_raft_voter_status(const std::unordered_set<raft::server_id>& voters_add, const std::unordered_set<raft::server_id>& voters_del,
abort_source& as, std::optional<raft_timeout> timeout = std::nullopt);
// Returns true if in maintenance mode
bool maintenance_mode();
};
} // end of namespace service

View File

@@ -9,7 +9,6 @@
#include "service/session.hh"
#include "utils/log.hh"
#include <seastar/core/coroutine.hh>
#include <seastar/core/timer.hh>
namespace service {
@@ -59,35 +58,18 @@ void session_manager::initiate_close_of_sessions_except(const std::unordered_set
}
future<> session_manager::drain_closing_sessions() {
slogger.info("drain_closing_sessions: waiting for lock");
seastar::timer<lowres_clock> lock_timer([this] {
slogger.warn("drain_closing_sessions: still waiting for lock, available units {}",
_session_drain_sem.available_units());
});
lock_timer.arm_periodic(std::chrono::minutes(5));
auto lock = co_await get_units(_session_drain_sem, 1);
lock_timer.cancel();
auto n = std::distance(_closing_sessions.begin(), _closing_sessions.end());
slogger.info("drain_closing_sessions: acquired lock, {} sessions to drain", n);
auto i = _closing_sessions.begin();
while (i != _closing_sessions.end()) {
session& s = *i;
++i;
auto id = s.id();
slogger.info("drain_closing_sessions: waiting for session {} to close, gate count {}", id, s.gate_count());
std::optional<seastar::timer<lowres_clock>> warn_timer;
warn_timer.emplace([&s, id] {
slogger.warn("drain_closing_sessions: session {} still not closed, gate count {}",
id, s.gate_count());
});
warn_timer->arm_periodic(std::chrono::minutes(5));
slogger.debug("draining session {}", id);
co_await s.close();
warn_timer.reset();
if (_sessions.erase(id)) {
slogger.info("drain_closing_sessions: session {} closed", id);
slogger.debug("session {} closed", id);
}
}
slogger.info("drain_closing_sessions: done");
}
} // namespace service

View File

@@ -95,10 +95,6 @@ public:
return _id;
}
size_t gate_count() const {
return _gate.get_count();
}
/// Post-condition of successfully resolved future: There are no guards alive for this session, and
/// and it's impossible to create more such guards later.
/// Can be called concurrently.

View File

@@ -1940,7 +1940,7 @@ public:
// Calculates how much to delay completing the request. The delay adds to the request's inherent latency.
template<typename Func>
void delay(tracing::trace_state_ptr trace, Func&& on_resume) {
auto delay = _proxy->_max_view_update_backlog.calculate_throttling_delay(_view_backlog, _expire_timer.get_timeout());
auto delay = db::view::calculate_view_update_throttling_delay(_view_backlog, _expire_timer.get_timeout(), _proxy->data_dictionary().get_config().view_flow_control_delay_limit_in_ms());
stats().last_mv_flow_control_delay = delay;
stats().mv_flow_control_delay += delay.count();
if (delay.count() == 0) {
@@ -3337,7 +3337,6 @@ storage_proxy::storage_proxy(sharded<replica::database>& db, storage_proxy::conf
, _hints_for_views_manager(*this, _db.local().get_config().view_hints_directory(), {}, _db.local().get_config().max_hint_window_in_ms(), _hints_resource_manager, _db, cfg.hints_sched_group)
, _stats_key(stats_key)
, _features(feat)
, _maintenance_mode(cfg.maintenance_mode)
, _background_write_throttle_threahsold(cfg.available_memory / 10)
, _mutate_stage{"storage_proxy_mutate", &storage_proxy::do_mutate}
, _max_view_update_backlog(max_view_update_backlog)
@@ -7104,7 +7103,7 @@ host_id_vector_replica_set storage_proxy::get_endpoints_for_reading(const schema
auto endpoints = erm.get_replicas_for_reading(token);
// Skip for non-debug builds and maintenance mode.
if constexpr (tools::build_info::is_debug_build()) {
if (!_maintenance_mode) {
if (!_db.local().get_config().maintenance_mode()) {
validate_read_replicas(erm, endpoints);
}
}

View File

@@ -40,7 +40,6 @@
#include "dht/token_range_endpoints.hh"
#include "service/storage_service.hh"
#include "service/cas_shard.hh"
#include "service/maintenance_mode.hh"
#include "service/storage_proxy_fwd.hh"
class reconcilable_result;
@@ -198,7 +197,6 @@ public:
// with writes.
smp_service_group write_ack_smp_service_group = default_smp_service_group();
scheduling_group hints_sched_group;
maintenance_mode_enabled maintenance_mode = maintenance_mode_enabled::no;
};
private:
@@ -296,7 +294,6 @@ private:
scheduling_group_key _stats_key;
storage_proxy_stats::global_stats _global_stats;
gms::feature_service& _features;
maintenance_mode_enabled _maintenance_mode;
class remote;
std::unique_ptr<remote> _remote;

View File

@@ -496,7 +496,7 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
};
auto process_normal_node = [&] (raft::server_id id, locator::host_id host_id, std::optional<gms::inet_address> ip, const replica_state& rs) -> future<> {
rtlogger.trace("loading topology: raft id={} ip={} node state={} dc={} rack={} tokens state={} tokens={} shards={} cleanup={}",
rtlogger.trace("loading topology: raft id={} ip={} node state={} dc={} rack={} tokens state={} tokens={} shards={}",
id, ip, rs.state, rs.datacenter, rs.rack, _topology_state_machine._topology.tstate, rs.ring.value().tokens, rs.shard_count, rs.cleanup);
// Save tokens, not needed for raft topology management, but needed by legacy
// Also ip -> id mapping is needed for address map recreation on reboot
@@ -1614,44 +1614,45 @@ future<> storage_service::join_topology(sharded<service::storage_proxy>& proxy,
SCYLLA_ASSERT(_group0);
auto request_id = utils::UUID_gen::get_time_UUID();
if (!_group0->maintenance_mode() && !_group0->joined_group0()) {
join_node_request_params join_params {
.host_id = _group0->load_my_id(),
.cluster_name = _db.local().get_config().cluster_name(),
.snitch_name = _db.local().get_snitch_name(),
.datacenter = _snitch.local()->get_datacenter(),
.rack = _snitch.local()->get_rack(),
.release_version = version::release(),
.num_tokens = _db.local().get_config().join_ring() ? _db.local().get_config().num_tokens() : 0,
.tokens_string = _db.local().get_config().join_ring() ? _db.local().get_config().initial_token() : sstring(),
.shard_count = smp::count,
.ignore_msb = _db.local().get_config().murmur3_partitioner_ignore_msb_bits(),
.supported_features = _feature_service.supported_feature_set() | std::ranges::to<std::vector<sstring>>(),
.request_id = request_id,
};
join_node_request_params join_params {
.host_id = _group0->load_my_id(),
.cluster_name = _db.local().get_config().cluster_name(),
.snitch_name = _db.local().get_snitch_name(),
.datacenter = _snitch.local()->get_datacenter(),
.rack = _snitch.local()->get_rack(),
.release_version = version::release(),
.num_tokens = _db.local().get_config().join_ring() ? _db.local().get_config().num_tokens() : 0,
.tokens_string = _db.local().get_config().join_ring() ? _db.local().get_config().initial_token() : sstring(),
.shard_count = smp::count,
.ignore_msb = _db.local().get_config().murmur3_partitioner_ignore_msb_bits(),
.supported_features = _feature_service.supported_feature_set() | std::ranges::to<std::vector<sstring>>(),
.request_id = utils::UUID_gen::get_time_UUID(),
};
if (raft_replace_info) {
join_params.replaced_id = raft_replace_info->raft_id;
join_params.ignore_nodes = utils::split_comma_separated_list(_db.local().get_config().ignore_dead_nodes_for_replace());
if (!locator::check_host_ids_contain_only_uuid(join_params.ignore_nodes)) {
slogger.warn("Warning: Using IP addresses for '--ignore-dead-nodes-for-replace' is deprecated and will"
" be disabled in a future release. Please use host IDs instead. Provided values: {}",
_db.local().get_config().ignore_dead_nodes_for_replace());
}
if (raft_replace_info) {
join_params.replaced_id = raft_replace_info->raft_id;
join_params.ignore_nodes = utils::split_comma_separated_list(_db.local().get_config().ignore_dead_nodes_for_replace());
if (!locator::check_host_ids_contain_only_uuid(join_params.ignore_nodes)) {
slogger.warn("Warning: Using IP addresses for '--ignore-dead-nodes-for-replace' is deprecated and will"
" be disabled in a future release. Please use host IDs instead. Provided values: {}",
_db.local().get_config().ignore_dead_nodes_for_replace());
}
// We use the legacy handshaker in the Raft-based recovery procedure to join the new group 0 without involving
// the topology coordinator. We can assume this node has already been accepted by the topology coordinator once
// and joined topology.
::shared_ptr<group0_handshaker> handshaker =
!_db.local().get_config().recovery_leader.is_set()
? ::make_shared<join_node_rpc_handshaker>(*this, join_params)
: _group0->make_legacy_handshaker(raft::is_voter::no);
co_await _group0->setup_group0(_sys_ks.local(), initial_contact_nodes, std::move(handshaker),
*this, _qp, _migration_manager.local(), join_params);
}
// setup_group0 will do nothing if the node has already set up group 0 in setup_group0_if_exist in main.cc, which
// happens when the node is restarting and not joining the new group 0 in the Raft-based recovery procedure.
// It does not matter which handshaker we choose in this case since it will not be used.
//
// We use the legacy handshaker in the Raft-based recovery procedure to join the new group 0 without involving
// the topology coordinator. We can assume this node has already been accepted by the topology coordinator once
// and joined topology.
::shared_ptr<group0_handshaker> handshaker =
!_db.local().get_config().recovery_leader.is_set()
? ::make_shared<join_node_rpc_handshaker>(*this, join_params)
: _group0->make_legacy_handshaker(raft::is_voter::no);
co_await _group0->setup_group0(_sys_ks.local(), initial_contact_nodes, std::move(handshaker),
*this, _qp, _migration_manager.local(), join_params);
raft::server& raft_server = _group0->group0_server();
// This is the moment when the locator::topology has gathered information about other nodes
@@ -1699,7 +1700,7 @@ future<> storage_service::join_topology(sharded<service::storage_proxy>& proxy,
throw std::runtime_error("Crashed in crash_before_topology_request_completion");
});
auto err = co_await wait_for_topology_request_completion(request_id);
auto err = co_await wait_for_topology_request_completion(join_params.request_id);
if (!err.empty()) {
throw std::runtime_error(fmt::format("{} failed. See earlier errors ({})", raft_replace_info ? "Replace" : "Bootstrap", err));
}
@@ -4493,20 +4494,10 @@ future<> storage_service::local_topology_barrier() {
version, current_version)));
}
rtlogger.info("raft_topology_cmd::barrier_and_drain version {}: waiting for stale token metadata versions to be released", version);
{
seastar::timer<lowres_clock> warn_timer([&ss, version] {
rtlogger.warn("raft_topology_cmd::barrier_and_drain version {}: still waiting for stale versions, "
"stale versions (version: use_count): {}",
version, ss._shared_token_metadata.describe_stale_versions());
});
warn_timer.arm_periodic(std::chrono::minutes(5));
co_await ss._shared_token_metadata.stale_versions_in_use();
}
rtlogger.info("raft_topology_cmd::barrier_and_drain version {}: stale versions released, draining closing sessions", version);
co_await ss._shared_token_metadata.stale_versions_in_use();
co_await get_topology_session_manager().drain_closing_sessions();
rtlogger.info("raft_topology_cmd::barrier_and_drain version {}: done", version);
rtlogger.info("raft_topology_cmd::barrier_and_drain done");
});
}
@@ -4518,9 +4509,7 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
auto& raft_server = _group0->group0_server();
auto group0_holder = _group0->hold_group0_gate();
// do barrier to make sure we always see the latest topology
rtlogger.info("topology cmd rpc {} index={}: starting read_barrier, term={}", cmd.cmd, cmd_index, term);
co_await raft_server.read_barrier(&_group0_as);
rtlogger.info("topology cmd rpc {} index={}: read_barrier completed", cmd.cmd, cmd_index);
if (raft_server.get_current_term() != term) {
// Return an error since the command is from outdated leader
co_return result;
@@ -5960,12 +5949,18 @@ future<join_node_request_result> storage_service::join_node_request_handler(join
if (const auto *p = _topology_state_machine._topology.find(params.host_id)) {
const auto& rs = p->second;
if (rs.state == node_state::left) {
rtlogger.warn("the node {} attempted to join but it was removed from the cluster. Rejecting the node", params.host_id);
rtlogger.warn("the node {} attempted to join",
" but it was removed from the cluster. Rejecting"
" the node",
params.host_id);
result.result = join_node_request_result::rejected{
.reason = "The node has already been removed from the cluster",
};
} else {
rtlogger.warn("the node {} attempted to join again after an unfinished attempt but it is no longer allowed to do so. Rejecting the node", params.host_id);
rtlogger.warn("the node {} attempted to join",
" again after an unfinished attempt but it is no longer"
" allowed to do so. Rejecting the node",
params.host_id);
result.result = join_node_request_result::rejected{
.reason = "The node requested to join before but didn't finish the procedure. "
"Please clear the data directory and restart.",

View File

@@ -2117,14 +2117,10 @@ public:
co_return std::move(plan);
}
// Returns the schema and tablet-aware replication strategy for a given table.
// Returns {nullptr, nullptr} if the table has been dropped concurrently (race between
// the token metadata snapshot and the live schema).
std::tuple<schema_ptr, const tablet_aware_replication_strategy*> get_schema_and_rs(table_id table) {
auto t = _db.get_tables_metadata().get_table_if_exists(table);
if (!t) {
lblogger.debug("Table {} no longer exists, skipping", table);
return {nullptr, nullptr};
on_internal_error(lblogger, format("Table {} does not exist", table));
}
auto s = t->schema();
@@ -2139,8 +2135,6 @@ public:
return {s, rs};
}
// Returns the tablet-aware replication strategy for a given table, or nullptr
// if the table has been dropped concurrently.
const tablet_aware_replication_strategy* get_rs(table_id id) {
auto [s, rs] = get_schema_and_rs(id);
return rs;
@@ -2164,7 +2158,6 @@ public:
sstring target_tablet_count_reason; // Winning rule for target_tablet_count value.
std::optional<uint64_t> avg_tablet_size; // nullopt when stats not yet available.
bool pow2_count; // Whether tablet count for the table should be a power of two.
bool tablet_merges_allowed; // Whether merges are allowed for the table.
// Final tablet count.
// It's target_tablet_count aligned to power of 2 if pow2_count == true.
@@ -2319,17 +2312,6 @@ public:
table_plan.current_tablet_count = tablet_count;
table_plan.pow2_count = tablet_options.pow2_count.value_or(
_db.features().arbitrary_tablet_boundaries ? db::tablet_options::default_pow2_count : true);
table_plan.tablet_merges_allowed = !s->tablet_merges_forbidden();
if (!table_plan.tablet_merges_allowed) {
// Block merge decisions for Alternator tablet tables whose
// stream configuration forbids merges. Tablet merges produce
// 2 parents per child which is incompatible with the DynamoDB
// Streams API. If a merge is already in progress on the tmap,
// suppressing new_resize_decision here causes the existing
// revocation logic in tables_being_resized to cancel the merge.
lblogger.debug("Table {} ({}.{}): suppressing new merge decision because tablet merges are forbidden",
table, s->ks_name(), s->cf_name());
}
rs_by_table[table] = rs;
@@ -2437,9 +2419,6 @@ public:
}
const auto& tmap = _tm->tablets().get_tablet_map(table);
auto [s, rs] = get_schema_and_rs(table);
if (s == nullptr || rs == nullptr) {
continue;
}
auto tablet_options = combine_tablet_options(
tables | std::views::transform([&] (table_id table) { return _db.get_tables_metadata().get_table_if_exists(table); })
| std::views::filter([] (auto t) { return t != nullptr; })
@@ -2572,7 +2551,7 @@ public:
} else if (table_plan.target_tablet_count_aligned < table_plan.current_tablet_count) {
// Needed to avoid oscillations, because we reduce the count by a factor of 2.
// FIXME: Once we have a way to split individual tablets, we can achieve exactly the desired tablet count.
if (table_plan.tablet_merges_allowed && div_ceil(table_plan.current_tablet_count, 2) >= table_plan.target_tablet_count_aligned) {
if (div_ceil(table_plan.current_tablet_count, 2) >= table_plan.target_tablet_count_aligned) {
auto& tmap = _tm->tablets().get_tablet_map(table);
auto cur_decision = tmap.resize_decision();
if (cur_decision.is_merge()) {
@@ -2622,6 +2601,21 @@ public:
resize_decision new_resize_decision;
new_resize_decision.way = table_plan.resize_decision;
// Block merge decisions for Alternator tablet tables whose
// stream configuration forbids merges. Tablet merges produce
// 2 parents per child which is incompatible with the DynamoDB
// Streams API. If a merge is already in progress on the tmap,
// suppressing new_resize_decision here causes the existing
// revocation logic in tables_being_resized to cancel the merge.
if (new_resize_decision.is_merge()) {
auto [s, rs] = get_schema_and_rs(table);
if (s->tablet_merges_forbidden()) {
lblogger.debug("Table {} ({}.{}): suppressing new merge decision because tablet merges are forbidden",
table, s->ks_name(), s->cf_name());
new_resize_decision = {};
}
}
table_size_desc size_desc {
.avg_tablet_size = *table_plan.avg_tablet_size,
.resize_decision = tmap.resize_decision(),
@@ -3293,10 +3287,6 @@ public:
std::unordered_map<sstring, int> rack_load;
auto rs = get_rs(tablet.table);
if (rs == nullptr) {
// Table was dropped concurrently. Skip this tablet.
return skip_info{};
}
auto get_viable_targets = [&] () {
std::unordered_set<host_id> viable_targets;

View File

@@ -4237,7 +4237,6 @@ public:
, _topology_cmd_rpc_tracker(topology_cmd_rpc_tracker)
, _async_gate("topology_coordinator")
{
_lifecycle_notifier.register_subscriber(this);
_db.get_notifier().register_listener(this);
// When the delay_cdc_stream_finalization error injection is disabled
// (test releases it), wake the topology coordinator so it retries
@@ -4401,7 +4400,6 @@ future<bool> topology_coordinator::maybe_retry_failed_rf_change_tablet_rebuilds(
}
future<> topology_coordinator::refresh_tablet_load_stats() {
co_await utils::get_local_injector().inject("refresh_tablet_load_stats_pause", utils::wait_for_message(5min));
auto tm = get_token_metadata_ptr();
locator::load_stats stats;
@@ -4725,6 +4723,7 @@ future<> topology_coordinator::run() {
co_await _async_gate.close();
co_await std::move(tablet_load_stats_refresher);
co_await _tablet_load_stats_refresh.join();
co_await std::move(cdc_generation_publisher);
co_await std::move(cdc_streams_gc);
co_await std::move(gossiper_orphan_remover);
@@ -4737,8 +4736,6 @@ future<> topology_coordinator::stop() {
co_await _db.get_notifier().unregister_listener(this);
utils::get_local_injector().unregister_on_disable("delay_cdc_stream_finalization");
_topo_sm.on_tablet_split_ready = nullptr;
co_await _lifecycle_notifier.unregister_subscriber(this);
co_await _tablet_load_stats_refresh.join();
// if topology_coordinator::run() is aborted either because we are not a
// leader anymore, or we are shutting down as a leader, we have to handle
@@ -4800,6 +4797,7 @@ future<> run_topology_coordinator(
topology_cmd_rpc_tracker};
std::exception_ptr ex;
lifecycle_notifier.register_subscriber(&coordinator);
try {
rtlogger.info("start topology coordinator fiber");
co_await with_scheduling_group(group0.get_scheduling_group(), [&] {
@@ -4820,7 +4818,7 @@ future<> run_topology_coordinator(
}
on_fatal_internal_error(rtlogger, format("unhandled exception in topology_coordinator::run: {}", ex));
}
co_await utils::get_local_injector().inject("topology_coordinator_pause_before_stop", utils::wait_for_message(5min));
co_await lifecycle_notifier.unregister_subscriber(&coordinator);
co_await coordinator.stop();
}

View File

@@ -502,7 +502,7 @@ public:
}
if (_row_start != _partition_end) {
on_internal_error(sstlog, format(
"partition_reversing_data_source: invariant broken: _row_start({}) == _row_end({}), but"
"partition_reversing_data_source: invariant broken: _row_start == _row_end({}), but"
" != _partition_end({})", _row_start, _row_end, _partition_end));
}
look_in_last_block = true;

View File

@@ -505,7 +505,7 @@ public:
return consume_range_tombstone_boundary(std::move(pos), end_tombstone, start_tombstone);
}
default:
on_parse_error(format("Invalid boundary type {}", static_cast<std::underlying_type<sstables::bound_kind_m>::type>(kind)), _sst->get_filename());
on_parse_error(format("Invalid boundary type", static_cast<std::underlying_type<sstables::bound_kind_m>::type>(kind)), _sst->get_filename());
}
}
@@ -2221,7 +2221,7 @@ public:
case bound_kind_m::excl_end_incl_start:
return consume_range_tombstone(ecp, bound_kind::incl_start, start_tombstone);
default:
on_parse_error(format("Invalid boundary type {}", static_cast<std::underlying_type_t<bound_kind_m>>(kind)), {});
on_parse_error(format("Invalid boundary type", static_cast<std::underlying_type_t<bound_kind_m>>(kind)), {});
}
}

View File

@@ -543,16 +543,11 @@ future<> filesystem_storage::wipe(const sstable& sst, sync_dir sync) noexcept {
// during SSTable writing and removed before sealing. If the write
// failed before sealing, the file may still be on disk and must be
// cleaned up explicitly.
// The component is only defined for the `ms` sstable format; for
// older formats it is absent from the component map and looking up
// its filename would throw std::out_of_range.
// Use file_exists() to avoid a C++ exception on the common path
// where the file was already removed before sealing.
if (sstable_version_constants::get_component_map(sst.get_version()).contains(component_type::TemporaryHashes)) {
auto temp_hashes = filename(sst, dir_name.native(), sst._generation, component_type::TemporaryHashes);
if (co_await file_exists(temp_hashes)) {
co_await sst.sstable_write_io_check(remove_file, std::move(temp_hashes));
}
auto temp_hashes = filename(sst, dir_name.native(), sst._generation, component_type::TemporaryHashes);
if (co_await file_exists(temp_hashes)) {
co_await sst.sstable_write_io_check(remove_file, std::move(temp_hashes));
}
if (sync) {
co_await sst.sstable_write_io_check(sync_directory, dir_name.native());

View File

@@ -32,7 +32,6 @@
#pragma once
#include <seastar/core/thread.hh>
#include <seastar/util/log.hh>
#include <map>
#include <set>
@@ -255,7 +254,6 @@ inline void trie_writer<Output>::lay_out_children(ptr<writer_node> x) {
}
while (unwritten_children.size()) {
seastar::thread::maybe_yield();
// Find the smallest child which doesn't fit.
// (If all fit, then this will be the past-the-end iterator).
// Its predecessor will be the biggest child which does fit.
@@ -352,7 +350,6 @@ template <trie_writer_sink Output>
inline void trie_writer<Output>::complete_until_depth(size_t depth) {
expensive_log("writer_node::complete_until_depth: start,_stack={}, depth={}, _current_depth={}", _stack.size(), depth, _current_depth);
while (_current_depth > depth) {
seastar::thread::maybe_yield();
// Every node must be smaller than a page, and the transition chain
// must be short enough to ensure that.
//

View File

@@ -9,7 +9,6 @@
#include "cql3/statements/property_definitions.hh"
#include "utils/assert.hh"
#include "utils/error_injection.hh"
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/parallel_for_each.hh>
#include "table_helper.hh"
@@ -136,32 +135,10 @@ future<> table_helper::cache_table_info(cql3::query_processor& qp, service::migr
}
future<> table_helper::insert(cql3::query_processor& qp, service::migration_manager& mm, service::query_state& qs, noncopyable_function<cql3::query_options ()> opt_maker) {
// _prepared_stmt is a checked_weak_ptr into the prepared statements
// cache and can be invalidated by a concurrent purge (e.g. on a schema
// change). cache_table_info() (re-)prepares and assigns _prepared_stmt,
// but the pin protecting the entry is dropped when try_prepare()
// returns. In release the chain of ready-future co_awaits back to here
// resumes synchronously, but debug builds preempt on every co_await
// even for ready futures, opening a window for a purge to drop the
// entry and leave _prepared_stmt null. Loop until a synchronous
// post-resume check finds _prepared_stmt valid; nothing can run between
// that check and the dereference below. _insert_stmt is a strong
// shared_ptr and is not affected by cache invalidation.
while (true) {
co_await cache_table_info(qp, mm, qs);
if (_prepared_stmt) {
break;
}
}
// Pin a strong ref locally: while we suspend in execute(), a concurrent
// insert() on this shard may reset _insert_stmt to nullptr if the
// prepared_statements_cache entry gets invalidated, freeing the object.
auto stmt = _insert_stmt;
co_await cache_table_info(qp, mm, qs);
auto opts = opt_maker();
opts.prepare(_prepared_stmt->bound_names);
co_await utils::get_local_injector().inject("table_helper_insert_before_execute",
utils::wait_for_message(std::chrono::seconds{30}));
co_await stmt->execute(qp, qs, opts, std::nullopt);
co_await _insert_stmt->execute(qp, qs, opts, std::nullopt);
}
future<> table_helper::setup_keyspace(cql3::query_processor& qp, service::migration_manager& mm, std::string_view keyspace_name, sstring replication_strategy_name,

76
test.py
View File

@@ -11,11 +11,9 @@ from __future__ import annotations
import argparse
import asyncio
import dataclasses
import math
import shlex
import textwrap
from bisect import insort
from random import randint
import pytest
@@ -185,8 +183,6 @@ def parse_cmd_line() -> argparse.Namespace:
help="Specific byte limit for failure injection (random by default)")
parser.add_argument('--skip-internet-dependent-tests', action="store_true",
help="Skip tests which depend on artifacts from the internet.")
parser.add_argument('--keep-duplicates', action='store_true', default=False,
help="Do not deduplicate test arguments.")
parser.add_argument("--pytest-arg", action='store', type=str,
default=None, dest="pytest_arg",
help="Additional command line arguments to pass to pytest, for example ./test.py --pytest-arg=\"-v -x\"")
@@ -245,73 +241,6 @@ def parse_cmd_line() -> argparse.Namespace:
return args
# TODO: Remove _CollectionArgument and _deduplicate_test_args once we update
# to pytest 9.x, which fixes argument deduplication:
# https://github.com/pytest-dev/pytest/issues/12083
@dataclasses.dataclass(frozen=True, order=True)
class _CollectionArgument:
"""Resolved collection argument for deduplication.
A version-independent subset of pytest's CollectionArgument that
includes the fields needed for normalization (parametrization and
original_index were added in pytest 9.0).
``a in b`` means ``b`` subsumes (contains) ``a``. Adapted from
pytest 9.0.3 ``_pytest.main.is_collection_argument_subsumed_by``.
"""
path: pathlib.Path
parts: tuple[str, ...]
parametrization: str
original_index: int
def __contains__(self, other: _CollectionArgument) -> bool:
if self.path != other.path:
return not self.parts and other.path.is_relative_to(self.path)
if len(self.parts) > len(other.parts) or other.parts[:len(self.parts)] != self.parts:
return False
return not self.parametrization or self.parametrization == other.parametrization
def _deduplicate_test_args(args: list[str]) -> list[str]:
"""Remove duplicate and subsumed test arguments.
Resolves and normalizes CLI test arguments, then applies the normalization
algorithm from pytest 9.0.3 to remove exact duplicates and arguments whose
paths are contained within another argument's path.
For example, ``["test/cql", "test/cql/lua_test.cql"]`` becomes ``["test/cql"]``.
"""
if not args:
return args
invocation_path = pathlib.Path.cwd()
resolved_sorted: list[_CollectionArgument] = []
unresolved_indices: set[int] = set()
for i, arg in enumerate(args):
# Adapted from pytest 9.0.3 _pytest.main.resolve_collection_argument.
base, squacket, rest = arg.partition("[")
strpath, *parts = base.split("::")
fspath = pathlib.Path(os.path.abspath(invocation_path / strpath))
if not fspath.exists():
# Keep unresolved args — let pytest report the error.
unresolved_indices.add(i)
continue
insort(resolved_sorted, _CollectionArgument(
path=fspath,
parts=tuple(parts),
parametrization=squacket + rest,
original_index=i,
))
# Normalize: remove duplicates and subsumed arguments using an O(n log n)
# sort-based algorithm adapted from pytest 9.0.3.
normalized = resolved_sorted[:1]
for ca in resolved_sorted[1:]:
if ca not in normalized[-1]:
normalized.append(ca)
kept_indices = {ca.original_index for ca in normalized} | unresolved_indices
return [arg for i, arg in enumerate(args) if i in kept_indices]
def run_pytest(options: argparse.Namespace) -> int:
# When tests are executed in parallel on different hosts, we need to distinguish results from them.
# So HOST_ID needed to not overwrite results from different hosts during Jenkins will copy to one directory.
@@ -320,8 +249,7 @@ def run_pytest(options: argparse.Namespace) -> int:
report_dir = temp_dir / 'report'
junit_output_file = report_dir / f'pytest_cpp_{HOST_ID}.xml'
files_to_run = options.name if options.keep_duplicates else _deduplicate_test_args(options.name)
files_to_run = files_to_run or [str(TOP_SRC_DIR / 'test/')]
files_to_run = options.name or [str(TOP_SRC_DIR / 'test/')]
args = [
'--color=yes',
f'--repeat={options.repeat}',
@@ -341,8 +269,6 @@ def run_pytest(options: argparse.Namespace) -> int:
])
if options.verbose:
args.append('-v')
if options.keep_duplicates:
args.append('--keep-duplicates')
if options.quiet:
args.append('--quiet')
args.extend(['-p','no:sugar'])

View File

@@ -70,6 +70,11 @@ def run_alternator_cmd(pid, dir):
# now that this parameter is used also by CQL's per-row TTL.
#'--alternator-ttl-period-in-seconds', '0.5',
'--alternator-allow-system-table-write=1',
# Allow testing experimental features. Following issue #9467, we need
# to add here specific experimental features as they are introduced.
# We only list here Alternator-specific experimental features - CQL
# ones are listed in test/cqlpy/run.py.
'--experimental-features=alternator-streams',
# this is required by test_streams.py test_parent_filtering and test_get_records_with_alternating_tablets_count
# setting the value using scylla_config_temporary won't work, because the value is read
# at the start and then periodically with `tablet-load-stats-refresh-interval-in-seconds`

View File

@@ -476,7 +476,8 @@ def test_audit_query_table_operations(dynamodb, cql, alternator_audit_enabled):
# table is pipe-separated "base_table|cdc_table". CL=LOCAL_QUORUM.
# Produces 5 audit entries.
def test_audit_streams_operations(dynamodb, dynamodbstreams, cql, alternator_audit_enabled):
with new_test_table(dynamodb, StreamSpecification={"StreamEnabled": True, "StreamViewType": "NEW_AND_OLD_IMAGES"}, **HASH_ONLY_SCHEMA) as table:
# With #23838 open, we will explicitly ask for a table with vnodes.
with new_test_table(dynamodb, StreamSpecification={"StreamEnabled": True, "StreamViewType": "NEW_AND_OLD_IMAGES"}, Tags=[{'Key': 'system:initial_tablets', 'Value': 'none'}], **HASH_ONLY_SCHEMA) as table:
ks_name = f"alternator_{table.name}"
client = table.meta.client
# Write data so that stream records exist.

View File

@@ -15,6 +15,7 @@ extra_scylla_config_options:
{
experimental_features: [
udf,
alternator-streams,
keyspace-storage-options
],
alternator_port: 8000,

View File

@@ -48,8 +48,8 @@ def disable_stream(dynamodbstreams, table):
# Wait for the stream to really be disabled. A table may have multiple
# historic streams - we need all of them to become DISABLED. One of
# them (the current one) may remain DISABLING for some time.
exp = time.time() + 60
while time.time() < exp:
exp = time.process_time() + 60
while time.process_time() < exp:
streams = dynamodbstreams.list_streams(TableName=table.name)
disabled = True
for stream in streams['Streams']:
@@ -60,7 +60,7 @@ def disable_stream(dynamodbstreams, table):
if disabled:
print('disabled stream on {}'.format(table.name))
return
time.sleep(0.1)
time.sleep(0.5)
pytest.fail("timed out")
# Cannot use fixtures. Because real dynamodb cannot _remove_ a stream
@@ -105,8 +105,8 @@ def create_stream_test_table(dynamodb, StreamViewType=None, Tags=None):
raise
def wait_for_active_stream(dynamodbstreams, table, timeout=60):
exp = time.time() + timeout
while time.time() < exp:
exp = time.process_time() + timeout
while time.process_time() < exp:
streams = dynamodbstreams.list_streams(TableName=table.name)
for stream in streams['Streams']:
arn = stream['StreamArn']
@@ -2205,6 +2205,7 @@ def test_stream_specification(test_table_stream_with_result, dynamodbstreams):
# be missing? Or a "null" JSON type? Or an empty string? This test verifies
# that the right answer is that NextShardIterator should be *missing*
# (reproduces issue #7237).
@pytest.mark.xfail(reason="disabled stream is deleted - issue #7239")
def test_streams_closed_read(dynamodb, dynamodbstreams):
# This test can't use the shared table test_table_ss_keys_only,
# because it wants to disable streaming, so let's create a new table:
@@ -2257,6 +2258,7 @@ def test_streams_closed_read(dynamodb, dynamodbstreams):
# listed for the table, this ARN should continue to work, listing the
# stream's shards should give an indication that they are all closed - but
# all these shards should still be readable.
@pytest.mark.xfail(reason="disabled stream is deleted - issue #7239")
def test_streams_disabled_stream(dynamodb, dynamodbstreams):
# This test can't use the shared table test_table_ss_keys_only,
# because it wants to disable streaming, so let's create a new table:
@@ -2578,70 +2580,3 @@ def test_stream_shard_filtering_missing_shard_id(test_table_ss_keys_only, dynamo
# TODO: Can we test shard splitting? (shard splitting
# requires the user to - periodically or following shards ending - to call
# DescribeStream again. We don't do this in any of our tests.
# Count the total number of records currently visible on a stream by reading
# all shards from the beginning (TRIM_HORIZON).
def _count_stream_records(dynamodbstreams, arn):
shards = []
last_shard_id = None
while True:
kwargs = {'StreamArn': arn}
if last_shard_id:
kwargs['ExclusiveStartShardId'] = last_shard_id
desc = dynamodbstreams.describe_stream(**kwargs)['StreamDescription']
shards.extend(desc['Shards'])
last_shard_id = desc.get('LastEvaluatedShardId')
if not last_shard_id:
break
nrecords = 0
for shard in shards:
it = dynamodbstreams.get_shard_iterator(StreamArn=arn,
ShardId=shard['ShardId'], ShardIteratorType='TRIM_HORIZON')['ShardIterator']
while it:
response = dynamodbstreams.get_records(ShardIterator=it)
nrecords += len(response.get('Records', []))
it = response.get('NextShardIterator')
if not response.get('Records'):
break
return nrecords
def _wait_for_stream_records(dynamodbstreams, arn, timeout=15):
"""Poll until at least one stream record is visible."""
deadline = time.time() + timeout
while time.time() < deadline:
if _count_stream_records(dynamodbstreams, arn) > 0:
return
time.sleep(0.1)
pytest.fail(f"Timed out waiting for stream records on {arn}")
# Test that after disabling and re-enabling a stream on a table, the old
# stream data remains readable through the old ARN. In DynamoDB, it
# remains readable for 24 hours. In Scylla, it is currently purged upon
# re-enabling.
@pytest.mark.xfail(reason="Scylla purges old stream data on re-enable "
"instead of keeping it readable for 24h - SCYLLADB-1873")
def test_streams_reenable(dynamodb, dynamodbstreams):
with create_stream_test_table(dynamodb, StreamViewType='KEYS_ONLY') as table:
(arn1, label1) = wait_for_active_stream(dynamodbstreams, table)
# Write some data while the first stream is active
p = random_string()
table.update_item(Key={'p': p, 'c': random_string()},
UpdateExpression='SET x = :val1', ExpressionAttributeValues={':val1': 5})
_wait_for_stream_records(dynamodbstreams, arn1)
disable_stream(dynamodbstreams, table)
# Re-enable the stream
table.update(StreamSpecification={'StreamEnabled': True, 'StreamViewType': 'KEYS_ONLY'})
(arn2, label2) = wait_for_active_stream(dynamodbstreams, table)
# The new ARN must differ from the old one
assert arn1 != arn2
# The new stream should have no old data.
assert _count_stream_records(dynamodbstreams, arn2) == 0
# The old stream data should still be readable via the old ARN.
assert _count_stream_records(dynamodbstreams, arn1) > 0

View File

@@ -656,6 +656,12 @@ def test_ttl_expiration_lsi_key(dynamodb, waits_for_expiration):
# content), and a special userIdentity flag saying that this is not a regular
# REMOVE but an expiration. Reproduces issue #11523.
def test_ttl_expiration_streams(dynamodb, dynamodbstreams, waits_for_expiration):
# Alternator Streams currently doesn't work with tablets, so until
# #23838 is solved, skip this test on tablets.
for tag in TAGS:
if tag['Key'] == 'system:initial_tablets' and tag['Value'].isdigit():
skip_bug("Streams test skipped on tablets due to #23838")
# In my experiments, a 30-minute (1800 seconds) is the typical
# expiration delay in this test. If the test doesn't finish within
# max_duration, we report a failure.

View File

@@ -150,8 +150,6 @@ add_scylla_test(lister_test
KIND SEASTAR)
add_scylla_test(locator_topology_test
KIND SEASTAR)
add_scylla_test(lock_tables_metadata_test
KIND SEASTAR)
add_scylla_test(log_heap_test
KIND BOOST)
add_scylla_test(logalloc_standard_allocator_segment_pool_backend_test
@@ -325,7 +323,6 @@ add_scylla_test(combined_tests
auth_cache_test.cc
auth_test.cc
batchlog_manager_test.cc
table_helper_test.cc
cache_algorithm_test.cc
castas_fcts_test.cc
cdc_test.cc
@@ -377,7 +374,7 @@ add_scylla_test(combined_tests
sstable_compression_config_test.cc
sstable_directory_test.cc
sstable_set_test.cc
sstable_tablet_streaming_test.cc
sstable_tablet_streaming.cc
statement_restrictions_test.cc
storage_proxy_test.cc
tablets_test.cc

View File

@@ -122,7 +122,7 @@ SEASTAR_TEST_CASE(test_reclaimed_bloom_filter_deletion_from_disk) {
auto mut1 = mutation(s, pks[0]);
mut1.partition().apply_insert(*s, ss.make_ckey(0), ss.new_timestamp());
auto sst = make_sstable_containing(env.make_sstable(s), {std::move(mut1)}).get();
auto sst = make_sstable_containing(env.make_sstable(s), {std::move(mut1)});
auto sst_test = sstables::test(sst);
const auto filter_path = (env.tempdir().path() / sst_test.filename(component_type::Filter)).native();
@@ -269,7 +269,7 @@ SEASTAR_TEST_CASE(test_bloom_filter_reload_after_unlink) {
mut.partition().apply_insert(*schema, ss.make_ckey(1), ss.new_timestamp());
// bloom filter will be reclaimed automatically due to low memory
auto sst = make_sstable_containing(env.make_sstable(schema), {mut}).get();
auto sst = make_sstable_containing(env.make_sstable(schema), {mut});
auto& sst_mgr = env.manager();
BOOST_REQUIRE_EQUAL(sst->filter_memory_size(), 0);
@@ -325,7 +325,7 @@ SEASTAR_TEST_CASE(test_bloom_filter_reclaim_after_unlink) {
}
// create one sst; there is sufficient memory for the bloom filter, so it won't be reclaimed
auto sst1 = make_sstable_containing(env.make_sstable(schema), mutations).get();
auto sst1 = make_sstable_containing(env.make_sstable(schema), mutations);
auto& sst_mgr = env.manager();
auto sst1_filename = sst1->get_filename();
BOOST_REQUIRE(sst1->filter_memory_size() != 0);
@@ -358,7 +358,7 @@ SEASTAR_TEST_CASE(test_bloom_filter_reclaim_after_unlink) {
// create another sst and unlink it to trigger reload of components.
// the reload should not attempt to load sst'1 bloom filter into memory depsite its presence in the _active list.
auto sst2 = make_sstable_containing(env.make_sstable(schema), {mutations[0]}).get();
auto sst2 = make_sstable_containing(env.make_sstable(schema), {mutations[0]});
sst2->unlink().get();
sst2.release();

View File

@@ -297,10 +297,11 @@ SEASTAR_THREAD_TEST_CASE(test_permissions_of_cdc_description) {
BOOST_REQUIRE_THROW(e.execute_cql(stmt).get(), exceptions::unauthorized_exception);
};
const std::string generations_v2 = "system_distributed_everywhere.cdc_generation_descriptions_v2";
const std::string streams = "system_distributed.cdc_streams_descriptions_v2";
const std::string timestamps = "system_distributed.cdc_generation_timestamps";
for (auto& t : {streams, timestamps}) {
for (auto& t : {generations_v2, streams, timestamps}) {
auto dot_pos = t.find_first_of('.');
SCYLLA_ASSERT(dot_pos != std::string_view::npos && dot_pos != 0 && dot_pos != t.size() - 1);
BOOST_REQUIRE(e.local_db().has_schema(t.substr(0, dot_pos), t.substr(dot_pos + 1)));
@@ -316,15 +317,18 @@ SEASTAR_THREAD_TEST_CASE(test_permissions_of_cdc_description) {
for (auto& t : {streams}) {
assert_unauthorized(seastar::format("ALTER TABLE {} ALTER time TYPE blob", t));
}
assert_unauthorized(seastar::format("ALTER TABLE {} ALTER id TYPE blob", generations_v2));
assert_unauthorized(seastar::format("ALTER TABLE {} ALTER key TYPE blob", timestamps));
// Allow DELETE
for (auto& t : {streams}) {
e.execute_cql(seastar::format("DELETE FROM {} WHERE time = toTimeStamp(now())", t)).get();
}
e.execute_cql(seastar::format("DELETE FROM {} WHERE id = uuid()", generations_v2)).get();
e.execute_cql(seastar::format("DELETE FROM {} WHERE key = 'timestamps'", timestamps)).get();
// Allow UPDATE, INSERT
e.execute_cql(seastar::format("INSERT INTO {} (id, range_end) VALUES (uuid(), 0)", generations_v2)).get();
e.execute_cql(seastar::format("INSERT INTO {} (time, range_end) VALUES (toTimeStamp(now()), 0)", streams)).get();
e.execute_cql(seastar::format("UPDATE {} SET expired = toTimeStamp(now()) WHERE key = 'timestamps' AND time = toTimeStamp(now())", timestamps)).get();
}).get();

Some files were not shown because too many files have changed in this diff Show More