This series fixes a recurring source of flaky tests in the cluster test suite.
When a test configures Scylla to listen on non-default ports (e.g. a custom Alternator port, proxy-protocol port or shard-aware port), server_add() and server_start() would declare the server ready by polling the hardcoded standard CQL and Alternator ports. Those ports can become available slightly before the custom ports finish binding, so the test could start using the custom port before it was open — causing intermittent failures.
The fix for each affected test was to pass `expected_server_up_state=ServerUpState.SERVING` explicitly, which waits for Scylla's sd_notify("STATUS=serving") signal instead. That signal is sent only after all configured listeners are fully open, so it is always the right readiness signal regardless of the port configuration. This workaround was applied again in PR #29737 and will keep being needed for every new test that uses a non-default port.
This series makes ServerUpState.SERVING the default at every level of the server start/add call stack so no test needs to remember it:
* Make server_add(), servers_add(), server_start() et al. all default to ServerUpState.SERVING.
* Document that server_add/server_start wait for all ports to be ready, so future test authors understand what the functions guarantee.
* Remove now-redundant expected_server_up_state=SERVING from existing tests.
* A small optimization: Fix check_serving_notification() returning False on first completion. When the sd_notify future completed, the function correctly updated _received_serving but still returned False, wasting one 100ms polling interval. Return self._received_serving directly.
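A minimal sketch of the fixed helper, with a hypothetical skeleton class standing in for the real pylib server wrapper:

```python
import asyncio

class ServerSketch:
    """Hypothetical stand-in for the pylib server wrapper."""
    def __init__(self, serving_future: asyncio.Future):
        # Completes when Scylla sends sd_notify("STATUS=serving").
        self._serving_future = serving_future
        self._received_serving = False

    def check_serving_notification(self) -> bool:
        if not self._received_serving and self._serving_future.done():
            self._received_serving = True
        # Before the fix this returned a stale False on the very poll that
        # flipped the flag, wasting one 100ms polling interval.
        return self._received_serving
```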
Closes scylladb/scylladb#29758
* github.com:scylladb/scylladb:
test/pylib: fix missing protocol_version=4 on control_cluster
scylla_cluster: guard poll_status() set_result() calls against cancelled future
test/cluster: avoid repeated CQL checks and leaks while waiting for SERVING
test/cluster: fix check_serving_notification() inefficiency
test/cluster: remove now-redundant expected_server_up_state=SERVING
test/cluster: document that add/start waits for all ports to be ready
test/cluster: update remaining CQL_ALTERNATOR_QUERIED defaults to SERVING
test/cluster: fix server_add/server_start hanging when starting in maintenance mode
main: notify "entering maintenance mode" after the maintenance CQL server is ready
test/cluster: make server_start() default to ServerUpState.SERVING
test/cluster: make server_add() default to ServerUpState.SERVING
When creating a strongly consistent table, wait for the table's raft
servers to start and be ready to serve queries before completing the
operation. We want the create table operation to absorb the delay of
starting the raft groups instead of the first queries.
The create table coordinator commits and applies the schema statement,
then it waits for all hosts that have a tablet replica to create and
start the raft groups for the table's tablets. It does this by sending
an RPC to all the relevant hosts that executes a group0 barrier, in
order to ensure the table and raft groups are created, then waits for
all raft groups on the host to finish starting and be ready.
Fixes SCYLLADB-807
no backport - strong consistency is still experimental
Closes scylladb/scylladb#28843
* github.com:scylladb/scylladb:
strong_consistency: wait for leader when starting a group
strong_consistency: change wait for groups to start on startup
strong_consistency: optimize wait_for_groups_to_start
strong_consistency: wait for raft servers to start in create table
Strongly consistent requests take a different path than EC (eventually consistent) requests, and consistency levels don't map well between the two.
We should limit the available consistency levels in SC requests to avoid silently ignoring them, which may confuse users.
For writes, there is only one option:
- QUORUM/LOCAL_QUORUM (multi-DC is not supported yet, so both of those CLs have the same effect) - we need a quorum of replicas to successfully commit new mutations to the Raft log.
For reads, there are 2 options:
- QUORUM/LOCAL_QUORUM - if the user wants to be sure they see the latest data; the query executes `read_barrier()`, which requires a quorum of replicas
- ONE/LOCAL_ONE - if the user just wants to read data from one replica without synchronization
All tests were updated to use LOCAL_QUORUM for both reads and writes.
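For illustration, this is how the allowed levels look from the Python driver (keyspace and table names are made up; `sc_table` is assumed to be a strongly consistent table):

```python
from cassandra.cluster import Cluster
from cassandra.query import SimpleStatement
from cassandra import ConsistencyLevel

session = Cluster(["127.0.0.1"]).connect("ks")

# Writes: only QUORUM/LOCAL_QUORUM are accepted.
session.execute(SimpleStatement(
    "INSERT INTO sc_table (pk, v) VALUES (1, 'x')",
    consistency_level=ConsistencyLevel.LOCAL_QUORUM))

# Reads: LOCAL_QUORUM runs a read barrier first; LOCAL_ONE reads a single
# replica without synchronization.
session.execute(SimpleStatement(
    "SELECT v FROM sc_table WHERE pk = 1",
    consistency_level=ConsistencyLevel.LOCAL_ONE))
```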
Fixes SCYLLADB-1766
SC is in an experimental phase and this patch is an improvement; no backport needed.
Closes scylladb/scylladb#29691
* github.com:scylladb/scylladb:
strong_consistency: allow QUORUM/LOCAL_QUORUM and ONE/LOCAL_ONE for reads
strong_consistency: allow only QUORUM/LOCAL_QUORUM CL for writes
`system.view_building_tasks` is a single partition table, so it makes more sense to use a mutation builder and generate 1 mutation per group0 command instead of generating multiple mutations.
This PR removes all `make_..._mutation()` system keyspace functions related to view building tasks and replaces them with the mutation builder.
Refs https://github.com/scylladb/scylladb/issues/25929
This patch doesn't fix any bug; it only reduces the number of generated mutations, so no need to backport it.
Closes scylladb/scylladb#26557
* github.com:scylladb/scylladb:
db/system_keyspace: replace `make_remove_view_building_task_mutation()` with mutation builder
db/view/view_building_task_mutation_builder: make uuid generator optional
db/system_keyspace: replace `make_view_building_task_mutation()` with mutation builder
db/view/view_building_task_mutation_builder: add helper method
- Fix intranode shard balancing to respect the size-based balance threshold, preventing unnecessary migrations when load difference between shards is negligible
- Add a regression test that verifies the threshold is respected for intranode balancing
The intranode shard balancing loop only stopped when the algorithm exhausted the migration candidates or when a migration would go against convergence (it would increase imbalance instead of decreasing it). This caused unnecessary tablet migrations for negligible imbalances (e.g., a 0.78% difference between shards).
The inter-node balancer already uses `is_balanced()` to stop when the relative load difference is within the configured `size_based_balance_threshold`, but this check was missing from the intranode path.
Apply the same `is_balanced()` threshold check that is already used for inter-node balancing to the intranode convergence loop. When the relative load difference between the most-loaded and least-loaded shards on a node is within the threshold, the balancer now stops without issuing further migrations.
The test creates a single node with 2 shards and 512 tablets:
1. **Balanced scenario** (257 vs 255 tablets, same size): relative diff = 0.78% < 1% threshold → verifies no intranode migration is emitted
2. **Unbalanced scenario** (307 vs 205 tablets, same size): relative diff = 33% >> 1% threshold → verifies intranode migration IS emitted
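The threshold arithmetic behind both scenarios, as a self-contained sketch (the real `is_balanced()` in tablet_allocator.cc may differ in detail):

```python
def is_balanced(max_load: float, min_load: float, threshold: float = 0.01) -> bool:
    # Relative-difference check between the most- and least-loaded shard.
    return max_load == 0 or (max_load - min_load) / max_load <= threshold

# Scenario 1: 257 vs 255 tablets -> 2/257 ~ 0.78% < 1% -> no migration
assert is_balanced(257, 255)
# Scenario 2: 307 vs 205 tablets -> 102/307 ~ 33% >> 1% -> migrate
assert not is_balanced(307, 205)
```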
Fixes: SCYLLADB-1775
This is a performance improvement which reduces the number of intranode migrations issued, and needs to be backported to versions with size-based load balancing: 2026.1 and 2026.2
Closes scylladb/scylladb#29756
* github.com:scylladb/scylladb:
test: add test for intranode balance threshold in size-based mode
tablet_allocator: apply balance threshold to intranode shard balancing
Avoid concurrent topology changes in the tombstone GC repair setup, where debug-mode nodes running hinted handoff and materialized view startup work can time out while applying Raft entries before the test starts.
Keep the sequential path opt-in so unrelated repair tests still exercise concurrent bootstrap behavior.
Closes scylladb/scylladb#29829
The test/cqlpy/run-cassandra script makes it quite easy to run test/cqlpy
tests against Cassandra, which is important for checking compatibility.
Unfortunately, because modern Linux distributions like Fedora do not have
either Cassandra or the old version of Java that it needs, the user needs
to download those manually. This is fairly easy, and explained in detail
in test/cqlpy/README.md, but nevertheless is a non-trivial manual step.
So this patch adds an even simpler alternative, the "--docker" option
which tells the script to run the official Cassandra docker image,
complete with the version of Java that it prefers - the user does not
need to download or install Cassandra or Java. The image is efficiently
cached by Docker, so running run-cassandra again doesn't need to
download it again. Moreover, trying several different versions of
Cassandra only needs to download and store the shared parts (base image
and Java) once.
test/cqlpy/run-cassandra --docker test_file.py::test_function
By default it runs the latest Cassandra 5 release. You can also use
"--docker=4" to get the latest Cassandra 4 release, "--docker=3.11"
to get the latest Cassandra 3.11 patch release, or "--docker=3.11.1"
to get a specific patch release.
In addition to the "--docker" option, this patch also introduces a
second option, "--java-docker", which takes *only* Java from docker,
but runs your locally installed Cassandra (to which you should point
with the CASSANDRA environment variable, as before). This option can
be useful if your host does not have a suitable version of Java, but
you want to run a locally-installed or locally-modified version of
Cassandra. The "--java-docker" option defaults to getting Java 11,
to use other versions you can use for example "--java-docker=17".
Fixes #25826.
Closes scylladb/scylladb#29860
The function does nothing useful now.
No backport needed. Removes code.
Closes scylladb/scylladb#29828
* https://github.com/scylladb/scylladb:
raft_group0: remove finish_setup_after_join function
raft_group0: fix indentation after the last change
raft_group: drop unneeded checks
After scylladb/scylladb#28929, `task_uuid_generator` became a necessary
dependency of `view_building_task_mutation_builder`.
However, to create the generator we need `view_building_state`, which in
some parts of the code (schema_tables.cc, migration_manager.cc) requires
the remote proxy to be obtained.
But sometimes we need the mutation builder just to remove some view
building task. In those cases we don't need the uuid generator, and the
remote proxy requirement is unnecessary.
When starting the raft server for a group, wait for the leader before
completing the start operation. We want the group to be ready to accept
writes by the time the start is reported to be completed without the
additional latency of waiting for the leader.
On startup, groups_manager::start() was previously called and waited for
the groups to start. We change it to instead start the raft servers
in the background without waiting for them to be fully started, and wait
for the servers to start explicitly at a later stage of startup, after
starting the messaging service.
The reason is that for the servers to be fully started they may require
communication that depends on the messaging service. Currently it is not
required, but that will change in the next commit.
Instead of iterating over all raft groups in wait_for_groups_to_start
and checking whether we need to wait for them, maintain a list of only
the raft groups that are starting and need to be waited on.
Update the hardcoded 2025.1.0 binary URL to the latest 2025.1.12
release for upgrade tests.
The 2025.1.12 binary now supports and enforces the
rf_rack_valid_keyspaces option which the test harness enables by
default. Since test_sstable_compression_dictionaries_upgrade creates
a 2-node cluster in a single rack with RF=2, it violates the
constraint. Disable the option explicitly for this test.
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
Closes scylladb/scylladb#29714
We can execute strongly consistent read queries in 2 ways:
- with QUORUM/LOCAL_QUORUM CL - this path executes `read_barrier()`
before reading the data, which synchronizes Raft log with the leader.
But to execute it, we need a quorum of replicas
- with ONE/LOCAL_ONE CL - this path just reads data from one replica
without any synchronization (not implemented yet)
To successfully write data to a strongly consistent table, a quorum of
replicas needs to be used to save the data to the Raft log.
So the only reasonable consistency level is QUORUM/LOCAL_QUORUM
(currently SC doesn't support multi-DC).
In the test we perform 2 consecutive writes where the first write
is supposed to increase the view update backlog above the mv
admission control threshold and the second one is expected to be
rejected because of that.
On each node/shard we have 2 types of view update backlogs:
1. for deciding whether we should admit writes
2. for propagating the backlog information to other nodes/shards.
For the second write to be rejected, it must be performed on a node
and shard which updated its backlog of type 1.
The view update backlog of type 2. is immediately increased on the
base table replica. For this backlog to be registered as a backlog
of type 1., it needs to be either carried by gossip (happening once
every second) or by attaching it to a replica write response. We
don't want to increase the runtime of tests unnecessarily, so we don't
wait and we rely on the second mechanism. The response to the first
base table write (the one causing increase in the backlog) carries
the increased backlog to the coordinator of this write. So for the
second write to observe the increased backlog, it needs to be coordinated
on the same node+shard as the first write.
We make sure that both writes are coordinated on the same node+shard by
using prepared statements combined with setting the host in `run_async`.
Both writes target the same partition and with prepared statements we
route them directly to the correct shard.
That was the idea, at least. In practice, for the driver to learn the
correct shard, it first needs to learn the token->shard mapping from
the server. For vnodes it can compute the expected shard from the token
of the affected partition, but for tablets it had no opportunity to
learn the tablet->shard mapping, so the first write may route to any shard.
Additionally, we aren't guaranteed that the driver established connections
to all shards on all nodes at the point of any write. So if a connection
finishes establishing between the two writes, this may also cause us to
coordinate these 2 writes on different shards, leading to a missed view
backlog growth and not-rejected second write.
We fix this in this patch by running the test using one shard on each node.
This way, as long as we perform both writes on the same node, they'll also
be coordinated on the same shard. This also makes the prepared statement and
BoundStatement unnecessary — we can use SimpleStatement with
FallthroughRetryPolicy directly.
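A sketch of the resulting test shape; recent Python driver versions accept a `host` argument to `execute()`, which mirrors pylib's `run_async(..., host=...)` (keyspace, table, and addresses are made up):

```python
from cassandra.cluster import Cluster
from cassandra.query import SimpleStatement
from cassandra.policies import FallthroughRetryPolicy

session = Cluster(["127.0.0.1"]).connect()
write = SimpleStatement("INSERT INTO ks.t (pk, v) VALUES (0, 'payload')",
                        retry_policy=FallthroughRetryPolicy())
# Pin both writes to the same coordinator node; with one shard per node,
# the same node implies the same shard.
host = next(iter(session.cluster.metadata.all_hosts()))
session.execute(write, host=host)  # grows the view-update backlog
session.execute(write, host=host)  # expected to be rejected
```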
Fixes: SCYLLADB-1901
Closes scylladb/scylladb#29862
Add per-shard metrics for strong consistency coordinator operations (latency, timeouts, bounces, status unknown) under the `"strong_consistency_coordinator"` category. These are analogous to the eventual consistency metrics in `storage_proxy_stats`, enabling direct performance comparison between the two consistency modes.
The metrics are simplified compared to `storage_proxy_stats` — no breakdown by table, tablet, scheduling group, or DC, only per-shard.
Fixes SCYLLADB-1343
Strong consistency is still in experimental phase, no need to backport.
Closes scylladb/scylladb#29318
* github.com:scylladb/scylladb:
test/strong_consistency: verify metrics
strong_consistency: wire up metrics to operations
strong_consistency: add stats struct and metrics registration
The mechanics of the restore are as follows:
- A /storage_service/tablets/restore API is called with (keyspace, table, endpoint, bucket, manifests) parameters
- First, it populates the system_distributed.snapshot_sstables table with the data read from the manifests
- Then it emplaces a bunch of tablet transitions (of a new "restore" kind), one for each tablet
- The topology coordinator handles the "restore" transition by calling a new RESTORE_TABLET RPC against all the current tablet replicas
- Each replica handles the RPC verb by
- Reading the snapshot_sstables table
- Filtering the read sstable infos against current node and tablet being handled
- Downloading and attaching the filtered sstables
This PR includes system_distributed.snapshot_sstables table from @robertbindar and preparation work from @kreuzerkrieg that extracts raw sstables downloading and attaching from existing generic sstables loading code.
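A hypothetical way to kick the endpoint from Python; the parameter encoding below is an assumption for illustration, not the actual API contract:

```python
import requests

resp = requests.post(
    "http://127.0.0.1:10000/storage_service/tablets/restore",
    params={
        "keyspace": "ks",
        "table": "t",
        "endpoint": "http://object-store.local:9000",
        "bucket": "my-backup-bucket",
        "manifests": "manifest-1.json,manifest-2.json",
    })
resp.raise_for_status()
task_id = resp.json()  # asynchronous: returns a task to wait on
```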
This is a first step towards SCYLLADB-197 and lacks many things. In particular:
- the API only works for single-DC cluster
- the caller needs to "lock" tablet boundaries with min/max tablet count
- not abortable
- no progress tracking
- sub-optimal (re-kicking API on restore will re-download everything again)
- not re-attachable (if the API node dies, restoration proceeds, but the caller cannot "wait" for it to complete via another node)
- nodes download sstables in the maintenance/streaming scheduling group (should be moved to maintenance/backup)
Other follow-up items:
- have an actual swagger object specification for `backup_location`
Closes #28436
Closes #28657
Closes #28773
Closes scylladb/scylladb#28763
* github.com:scylladb/scylladb:
docs: Update topology_over_raft.md with `restore` transition kind
test: Add test for backup vs migration race
test: Restore resilience test
sstables_loader: Fail tablet-restore task if not all sstables were downloaded
sstables_loader: mark sstables as downloaded after attaching
sstables_loader: return shared_sstable from attach_sstable
db: add update_sstable_download_status method
db: add downloaded column to snapshot_sstables
db: extract snapshot_sstables TTL into class constant
test: Add a test for tablet-aware restore
tablets: Implement tablet-aware cluster-wide restore
messaging: Add RESTORE_TABLET RPC verb
sstables_loader: Add method to download and attach sstables for a tablet
tablets: Add restore_config to tablet_transition_info
sstables_loader: Add restore_tablets task skeleton
test: Add rest_client helper to kick newly introduced API endpoint
api: Add /storage_service/tablets/restore endpoint skeleton
sstables_loader: Add keyspace and table arguments to manifest loading helper
sstables_loader_helpers: just reformat the code
sstables_loader_helpers: generalize argument and variable names
sstables_loader_helpers: generalize get_sstables_for_tablet
sstables_loader_helpers: add token getters for tablet filtering
sstables_loader_helpers: remove underscores from struct members
sstables_loader: move download_sstable and get_sstables_for_tablet
sstables_loader: extract single-tablet SST filtering
sstables_loader: make download_sstable static
sstables_loader: fix formatting of the new `download_sstable` function
sstables_loader: extract single SST download into a function
sstables_loader: add shard_id to minimal_sst_info
sstables_loader: add function for parsing backup manifests
split utility functions for creating test data from database_test
export make_storage_options_config from lib/test_services
rjson: Add helpers for conversions to dht::token and sstable_id
Add system_distributed_keyspace.snapshot_sstables
add get_system_distributed_keyspace to cql_test_env
code: Add system_distributed_keyspace dependency to sstables_loader
storage_service: Export handle_raft_rpc() helper
storage_service: Export do_tablet_operation()
storage_service: Split transit_tablet() into two
tablets: Add braces around tablet_transition_kind::repair switch
The container image inherits kernel.core_pattern from the host. When
the host pipes core dumps to a handler (e.g. Ubuntu's apport), that
handler does not exist or work correctly inside the container, so core
dumps are silently lost.
Override any pipe-based core_pattern with a file-based pattern that
writes directly to /var/lib/scylla/coredump/. The override is attempted
both from the entrypoint (scyllasetup.coredumpSetup) and from
scylla-server.sh when running as root; it succeeds only when the
container has write access to /proc/sys/kernel/core_pattern and is
silently skipped otherwise.
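The override logic, sketched in Python for clarity (the real code lives in scyllasetup.coredumpSetup and scylla-server.sh; names and the exact pattern string are illustrative):

```python
CORE_PATTERN = "/proc/sys/kernel/core_pattern"

def coredump_setup() -> None:
    try:
        with open(CORE_PATTERN) as f:
            current = f.read().strip()
        if current.startswith("|"):  # piped to a host handler, e.g. apport
            with open(CORE_PATTERN, "w") as f:
                f.write("/var/lib/scylla/coredump/%e-%p-%t.core")
    except OSError:
        # No write access to /proc/sys/kernel/core_pattern in this
        # container: skip silently, as described above.
        pass
```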
Fixes: SCYLLADB-1366
Closes scylladb/scylladb#29337
Issue #8627 is fixed, so test_too_large_indexed_value_build now passes and should run normally instead of XPASSing under strict xfail.
Fixes: SCYLLADB-1938
Closes scylladb/scylladb#29853
When `force_capacity_based_balancing` is enabled and a node is being drained/excluded, the tablet allocator incorrectly aborts balancing due to incomplete tablet stats - even though capacity-based balancing doesn't depend on tablet sizes.
The tablet allocator normally waits for complete load stats before balancing. An exception exists for drained+excluded nodes (they're unreachable and won't return stats). However, when forced capacity-based balancing is active, this exception was not being applied, causing the balancer to reject the drain plan.
Adjust the condition in `tablet_allocator.cc` so that the "ignore missing data for drained nodes" logic applies regardless of whether capacity-based balancing is forced.
Added a Boost unit test that forces capacity-based balancing and verifies a drained/excluded node gets its tablets migrated even when tablet size stats are missing.
This bug was introduced in 2026.1, so this needs to be backported to 2026.1 and 2026.2
Fixes: SCYLLADB-1803
Closes scylladb/scylladb#29791
* github.com:scylladb/scylladb:
test: boost: add drain test for forced capacity-based balancing
service: allow draining with forced capacity-based balancing
The test samples sl:default runtime before and after setup writes to
prove that it measures the scheduling group used by regular CQL writes.
The metric is exported in milliseconds, so a single 200-row batch may
not be visible immediately, or may be too small in some environments.
Keep the original 200-row table size, but wait up to 30 seconds for the
metric to advance. If it does not, retry the same writes before TTL is
enabled. The retries update the same keys, so the expiration part of the
test still waits for exactly the original number of rows.
In a local 100-run with N=200 rows, the observed delta of
`ms_statement_before - ms_statement_before_write` was: min=4.0,
max=16.0, mean=8.13, and median=8.0. Therefore, it looks possible that
in a rare corner case the delta drops even to 0.
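A sketch of the waiting logic, with hypothetical callables standing in for the real test helpers:

```python
import time

def wait_for_metric_advance(get_metric_ms, baseline: float,
                            retry_writes, timeout: float = 30.0) -> None:
    # Poll the runtime counter until it moves past the baseline; if it
    # never does, re-issue the same writes (same keys, so the row count
    # the expiration phase waits on is unchanged).
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if get_metric_ms() > baseline:
            return
        time.sleep(1)
    retry_writes()
```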
Fixes SCYLLADB-1869
Closes scylladb/scylladb#29797
The first patch improves the input validation of the CONTAINS operator. I believe this is not a critical fix, because RapidJSON already has exception-throwing RAPIDJSON_ASSERT() checks for unexpected JSON structure (like something we expect to be a list not actually being a list), but it's cleaner to do these checks explicitly.
The second patch just removes an unnecessary call to format() on a constant string.
Closes scylladb/scylladb#28506
* github.com:scylladb/scylladb:
alternator: remove unneeded call to format()
alternator: improve CONTAINS operator's validity checking
When a table's compaction is disabled via 'enabled': 'false', the DESCRIBE
output incorrectly showed NullCompactionStrategy instead of the actual strategy.
This happened because schema_properties() called compaction_strategy(), which
returns compaction_strategy_type::null when compaction is disabled. Fix it by
using configured_compaction_strategy(), which always returns the real strategy
type - consistent with how schema_tables.cc serializes it to disk.
Fixes SCYLLADB-1353
Closes scylladb/scylladb#29804
`system.view_building_tasks` is a single-partition Raft group0 table (pk = `"view_building"`, CK = timeuuid). When `clean_finished_tasks()` deletes hundreds of finished tasks, the physical rows remain in SSTables until compaction. Any subsequent read of the partition counts every column of every tombstoned row
as a dead cell, triggering `tombstone_warn_threshold` warnings in large clusters.
Two-part fix:
**1. Range tombstones instead of row tombstones (commits 2–3)**
Instead of one row tombstone per finished task, find the minimum alive task UUID (`min_alive_uuid`) and emit a single range tombstone `[before_all, min_alive_uuid)` covering all tasks below that boundary. This reduces the tombstone count significantly and also benefits future compaction.
**2. Bounded scan with `min_task_id` (commits 4–6)**
Even with range tombstones, physical rows remain until compaction and still count as dead cells during reads. The only way to avoid them is to not read them at all.
- Add a `min_task_id timeuuid` static column to `system.view_building_tasks`.
- On every GC, write `min_task_id = min_alive_uuid` atomically with the range tombstone (same Raft batch).
- On reload, read `min_task_id` first using a **static-only partition slice** (empty `_row_ranges` + `always_return_static_content`): the SSTable reader stops immediately after the static row before processing any clustering tombstones — zero dead cells counted.
- Use `AND id >= min_task_id` as a lower bound for the main task scan, skipping all tombstoned rows.
The static-only read and the bounded scan are gated on the `VIEW_BUILDING_TASKS_MIN_TASK_ID` cluster feature so mixed-version clusters fall back to the full scan.
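The logical equivalent of the two reads, expressed as driver-level CQL for illustration (the implementation builds C++ partition slices directly; the `key` and `id` column names are assumptions based on the PK/CK description above):

```python
from cassandra.cluster import Cluster

session = Cluster(["127.0.0.1"]).connect()

# 1. Static-only read: the reader stops right after the static row, so
#    clustering tombstones are never counted as dead cells.
row = session.execute(
    "SELECT min_task_id FROM system.view_building_tasks "
    "WHERE key = 'view_building'").one()

# 2. Bounded scan: start at the boundary, skipping all tombstoned rows.
tasks = session.execute(
    "SELECT * FROM system.view_building_tasks "
    "WHERE key = 'view_building' AND id >= %s", (row.min_task_id,))
```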
The issue is not critical, so the fix shouldn't be backported.
Fixes SCYLLADB-657
Closes scylladb/scylladb#28929
* github.com:scylladb/scylladb:
test/cluster/test_view_building_coordinator: add reproducer for tombstone threshold warning
docs: document tombstone avoidance in view_building_tasks
view_building: add `task_uuid_generator` to `view_building_task_mutation_builder`
view_building: introduce `task_uuid_generator`
view_building: store `min_alive_uuid` in view building state
view_building: set min_task_id when GC-ing finished tasks
view_building: add min_task_id support to view_building_task_mutation_builder
view_building: add min_task_id static column and bounded scan to system_keyspace
view_building: use range tombstone when GC-ing finished tasks
view_building: add range tombstone support to view_building_task_mutation_builder
view_building: introduce VIEW_BUILDING_TASKS_MIN_TASK_ID cluster feature
Seastar deprecated default-constructing input_stream and output_stream
(they are useless in that state), and also deprecated move-assigning
them after the fact.
Fix by wrapping both fields in std::optional, and using emplace() to
construct them in-place once the connected socket is available.
It would be nicer to make connect() a static method that returns
a connection, but that's a larger change.
Closes scylladb/scylladb#29627
When a malformed sstable error occurs, it is usually caused by actual sstable corruption — a cosmic ray, a bad disk write, etc. However, it can also be caused by memory corruption, where a data structure in memory happens to be read as sstable data. In the latter case, having a coredump of the process at the moment of the error is invaluable for post-mortem debugging, since the exception throwing/catching machinery destroys the stack frames that would point to the corruption site.
This patch series introduces `--abort-on-malformed-sstable-error`, a new command-line option (with `LiveUpdate` support) that, when set, causes the server to call `std::abort()` instead of throwing an exception whenever any sstable parse error is detected. This covers all code paths:
- Direct `throw malformed_sstable_exception(...)` sites (migrated to `throw_malformed_sstable_exception()`)
- Direct `throw bufsize_mismatch_exception(...)` sites (migrated to `throw_bufsize_mismatch_exception()`)
- `parse_assert()` failures (via `on_parse_error()`)
- BTI parse errors (via `on_bti_parse_error()`)
The implementation places the flag and helper functions in `sstables/sstables.cc`, next to the existing `on_parse_error()` / `on_bti_parse_error()` infrastructure.
The flag defaults to `false`, preserving current behaviour. It is intended to be enabled temporarily when investigating suspected memory corruption.
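Since the option is `LiveUpdate`, it can presumably also be flipped at runtime through the live-updatable config table; a sketch, assuming the config name mirrors the command-line flag:

```python
from cassandra.cluster import Cluster

session = Cluster(["127.0.0.1"]).connect()
# Assumption: the config name mirrors the command-line flag.
session.execute(
    "UPDATE system.config SET value = 'true' "
    "WHERE name = 'abort_on_malformed_sstable_error'")
```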
**Commit breakdown:**
1. Infrastructure: flag, getter/setter, and throw helpers in `sstables/sstables.cc`; config option wired up in `main.cc`
2. `on_parse_error()` and `on_bti_parse_error()` check the new flag
3. All ~50 `throw malformed_sstable_exception(...)` sites migrated
4. Both `throw bufsize_mismatch_exception(...)` sites migrated
Refs: SCYLLADB-1087
Backport: new feature, no backport
Closes scylladb/scylladb#29324
* github.com:scylladb/scylladb:
sstables: migrate all bufsize_mismatch_exception throw sites to throw_bufsize_mismatch_exception()
sstables: migrate all malformed_sstable_exception throw sites to throw_malformed_sstable_exception()
sstables: make on_parse_error() and on_bti_parse_error() respect --abort-on-malformed-sstable-error
sstables: disable abort-on-malformed-sstable-error in tests that corrupt sstables on purpose
sstables: introduce --abort-on-malformed-sstable-error infrastructure
sstables: refactor parse_path() to return std::expected<> instead of throwing
This series adds per-test bucket isolation to all S3 and GCS object storage tests. Previously, every test shared a single pre-created bucket, which meant tests could interfere with each other through leftover objects and could not run concurrently across multiple `test.py` processes without risking collisions.
New `create_bucket`, `delete_bucket`, and `delete_bucket_with_objects` methods on `s3::client`, following the existing `make_request` pattern. `create_bucket` handles the `BUCKET_ALREADY_OWNED_BY_YOU` error gracefully.
A new `s3_test_fixture` RAII class for C++ Boost tests that creates a uniquely-named bucket on construction (derived from the Boost test name and pid) and tears down everything — objects, bucket, client — on destruction. All S3 tests in `s3_test.cc` are migrated to use it, removing manual `deferred_delete_object` and `deferred_close` boilerplate. The minio server policy is broadened to allow dynamic bucket creation/deletion.
A `client::make` overload that accepts a custom `retry_strategy`, used in tests with a fast 1ms retry delay instead of exponential backoff, significantly reducing test runtime for transient errors during bucket lifecycle operations.
Python-side (`test/cluster/object_store`): each pytest fixture (`object_storage`, `s3_storage`, `s3_server`) now creates a unique bucket per test function via `create_test_bucket()` and destroys it on teardown. Bucket names are sanitized from the pytest node name with a short UUID suffix for uniqueness.
Object storage helpers (`S3Server`, `MinioWrapper`, `GSFront`, `GSServerImpl`, factory functions, CQL helpers, `s3_server` fixture) are extracted from `test/cluster/object_store/conftest.py` into a shared `test/pylib/object_storage.py` module, eliminating duplication across test suites. The conftest becomes a thin re-export wrapper. Old class names are preserved as aliases for backward compatibility.
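A condensed sketch of the fixture shape (fixture and method names mirror the description above; the Python-side client API is an assumption):

```python
import uuid
import pytest

def make_bucket_name(node_name: str) -> str:
    # Sanitize the pytest node name and append a short UUID suffix.
    base = "".join(c if c.isalnum() else "-" for c in node_name).strip("-").lower()
    return f"{base[:40]}-{uuid.uuid4().hex[:8]}"

@pytest.fixture
def s3_storage(s3_client, request):
    # Per-test isolation: a fresh bucket on setup, full cleanup on teardown.
    bucket = make_bucket_name(request.node.name)
    s3_client.create_bucket(bucket)
    yield bucket
    s3_client.delete_bucket_with_objects(bucket)
```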
| Test Name | New test-specific retry strategy (ms) | Original (ms) | Δ (ms) | Speedup |
|--------------------------------------------------------------|----------------:|-------------:|---------:|--------:|
| test_client_upload_file_multi_part_with_remainder_proxy | 19,261 | 61,395 | −42,134 | **3.2×** |
| test_client_upload_file_multi_part_without_remainder_proxy | 16,901 | 53,688 | −36,787 | **3.2×** |
| test_client_upload_file_single_part_proxy | 3,478 | 6,789 | −3,311 | **2.0×** |
| test_client_multipart_copy_upload_proxy | 1,303 | 1,619 | −316 | 1.2× |
| test_client_put_get_object_proxy | 150 | 365 | −215 | **2.4×** |
| test_client_readable_file_stream_proxy | 125 | 327 | −202 | **2.6×** |
| test_small_object_copy_proxy | 205 | 389 | −184 | 1.9× |
| test_client_put_get_tagging_proxy | 181 | 350 | −169 | 1.9× |
| test_client_multipart_upload_proxy | 1,252 | 1,416 | −164 | 1.1× |
| test_client_list_objects_proxy | 729 | 881 | −152 | 1.2× |
| test_chunked_download_data_source_with_delays_proxy | 830 | 960 | −130 | 1.2× |
| test_client_readable_file_proxy | 148 | 279 | −131 | 1.9× |
| test_client_upload_file_multi_part_with_remainder_minio | 3,358 | 3,170 | +188 | 0.9× |
| test_client_upload_file_multi_part_without_remainder_minio | 3,131 | 2,929 | +202 | 0.9× |
| test_client_upload_file_single_part_minio | 519 | 421 | +98 | 0.8× |
| test_download_data_source_proxy | 180 | 237 | −57 | 1.3× |
| test_client_list_objects_incomplete_proxy | 590 | 641 | −51 | 1.1× |
| test_large_object_copy_proxy | 952 | 991 | −39 | 1.0× |
| test_client_multipart_upload_fallback_proxy | 148 | 185 | −37 | 1.3× |
| test_client_multipart_copy_upload_minio | 641 | 674 | −33 | 1.1× |
No backport needed — this is a test infrastructure improvement with no production code impact beyond the new `s3::client` methods.
Closes scylladb/scylladb#29508
* github.com:scylladb/scylladb:
test: extract object storage helpers to test/pylib/object_storage.py
test: add per-test bucket isolation to object_store fixtures
s3: add client::make overload with custom retry strategy
test: add s3_test_fixture and migrate tests to per-bucket isolation
s3: add create_bucket and delete_bucket to client
s3_storage::make_source previously ignored its file f parameter and
constructed a fresh s3::client::readable_file per call. The new
file's _stats cache was empty, so the first dma_read_bulk issued a
HEAD via maybe_update_stats just to learn the object size before
the ranged GET -- one ~50 ms RTT per uncached read.
The file f passed in by the two callers (sstable::data_stream for
Data.db reads and index_reader::make_context for Index.db reads)
already wraps the sstable's _data_file or _index_file. Those file
objects had their stats populated at sstable open time by
update_info_for_opened_data, and they were wrapped with the
configured file_io_extensions when opened via open_component. Reusing
them is exactly what filesystem_storage::make_source does (one-line
make_file_data_source over f), so the s3 path simply matches it.
readable_file::size() is also updated to route through
maybe_update_stats(), so a .size() call populates the _stats cache
the same way .stat() does -- preventing a redundant HEAD on the
first subsequent read of components opened with .size() (Index,
Partitions, Rows in update_info_for_opened_data).
Closes scylladb/scylladb#29766
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Verify that the load balancer does not issue intranode migrations when
the load difference between shards is within the size_based_balance_threshold,
and that it does issue migrations when the difference exceeds the threshold.
The intranode shard balancing loop only stopped when the most-loaded
and least-loaded shard were the same (src == dst), meaning it would
keep issuing migrations until the load difference reached exactly 0.
This caused unnecessary migrations for negligible imbalances.
Apply the same is_balanced() threshold check that is already used for
inter-node balancing, so that intranode migrations stop when the
relative load difference between shards is within the configured
size_based_balance_threshold (default 1%).
Add some text about how the new transition works. It doesn't include a
full feature description; it just concentrates on the new transition and
the way it interacts with the rest of the topology coordinator machinery.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The test starts a regular backup+restore on a smaller cluster, but prior
to that spawns a tablet migration from one node to another and locks it in
the middle with the help of the block_tablet_streaming injection (even
though the tablets have no data and there's nothing to stream, the
injection is located early enough to work).
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The test checks that losing one of the nodes from the cluster while
restore is running is handled. In particular:
- losing the API node makes the task-waiting API throw (apparently)
- losing a coordinator or replica node makes the API call fail, because
some tablets should fail to get restored. If the coordinator is lost,
it triggers coordinator re-election, and the new coordinator still
notices that a tablet that was replicated to the "old" coordinator
failed to get restored and fails the restore anyway
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
When storage_service::restore_tablets() resolves, it only means that
tablet transitions are done, including restore transitions, but not
necessarily that they succeeded. So before resolving the restoration
task with success, we need to check whether all sstables were downloaded
and, if not, resolve the task with an exception.
Test included. It uses fault injection to abort the download of a single
sstable early, then checks that the error was properly propagated back
to the task-waiting API.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
After each SSTable is successfully attached to the local table in
download_tablet_sstables(), update its downloaded status in
system_distributed.snapshot_sstables to true. This enables tracking
restore progress by counting how many SSTables have been downloaded.
Change attach_sstable() return type from future<> to
future<sstables::shared_sstable>, returning the SSTable that was
attached. This will be used to extract the SSTable identifier and
first token for updating the download status.
Add a method to update the downloaded status of a specific SSTable
entry in system_distributed.snapshot_sstables. This will be used
by the tablet restore process to mark SSTables as downloaded after
they have been successfully attached to the local table.
Add a 'downloaded' boolean column to the snapshot_sstables table
schema and the corresponding field to the snapshot_sstable_entry
struct. Update insert_snapshot_sstable() and get_snapshot_sstables()
to write and read this column.
This column will be used to track which SSTables have been
successfully downloaded during a tablet restore operation.
Co-authored-by: Pavel Emelyanov <xemul@scylladb.com>
Move the TTL value used for snapshot_sstables rows from a local
variable in insert_snapshot_sstable() to a class-level constant
SNAPSHOT_SSTABLES_TTL_SECONDS, making it reusable by other methods.
The test is derived from the test_restore_with_streaming_scopes() one,
with the exception that it doesn't check streaming directions, doesn't
check mutations right after creation, and doesn't loop over scoped
sub-tests, because there's no scope concept here.
Also, it verifies just two topologies, which seems to be enough. The
scopes test has many topologies because of the nature of the scoped
restore; with cluster-wide restore such flexibility is not required.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This patch adds
- Changes in sstables_loader::restore_tablets() method
It populates the system_distributed_keyspace.snapshot_sstables table
with the information read from the manifest
- Implementation of tablet_restore_task_impl::run() method
It emplaces a bunch of tablet migrations with "restore" kind
- Topology coordinator handling of tablet_transition_stage::restore
When seen, the coordinator calls RESTORE_TABLET RPC against all tablet
replicas
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The topology coordinator will need to call this verb against existing
tablet replicas to ask them to restore tablet sstables. Here's the RPC
verb to do it.
It now returns an empty restore_result to make it "synchronous" -- the
co_await send_restore_tablets() won't resolve until the remote call
finishes.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Extracts the data from the snapshot_sstables table and filters only the
sstables belonging to the current node and the tablet in question, then
starts downloading the matched sstables.
Extracted from Ernest's PR #28701 and piggy-backs on the refactoring from
another of Ernest's PRs, #28773. Will be used by the next patches.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
When doing a cluster-wide restore using the topology coordinator, the
coordinator will need to serve a new tablet transition kind --
restore. For that, it will need to know where to perform the restore
from -- the endpoint and bucket pair. This data can come from nowhere
but the tablet transition itself, so add the "restore_config" member
with this data.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The new cluster-wide tablets restore API is going to be asynchronous,
just like the existing node-local one. For that, task_manager tasks
will be used.
This patch adds a skeleton for tablets-restore task with empty run
method. Next patches will populate it.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Withdrawn from #28701. The endpoint implementation from the PR is going
to be reworked, but the swagger description and set/unset placeholders
are very useful.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Ernest Zaslavsky <ernest.zaslavsky@scylladb.com>
When restoring a backup into a keyspace under a different name than the
one at which it existed during backup, the snapshot_sstables table must
be populated with the _new_ keyspace name, not the one taken from the
manifest. The same is true for the table name.
This patch makes it possible to override the keyspace/table loaded from
the manifest file with the provided values. In the future it would also
be good to check that, if those values are not provided by the user, the
values read from different manifest files are the same.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Rename arguments and local variables in get_sstables_for_tablet to avoid
references to SSTable-specific terminology. This makes the function more
generic and better suited for reuse with different range types.
Generalize get_sstables_for_tablet by templating the return type so it
produces vectors matching the input range's value type. This makes the
function more flexible and prepares it for reuse in tablet-aware
restore.
Add getters for the first and last tokens in get_sstables_for_tablet to
make the function more generic and suitable for future use in the
tablet-aware restore code.
Move the download_sstable and get_sstables_for_tablet static functions
from sstables_loader into a new file to make them reusable by the
tablet-aware restore code.
Extract single-tablet range filtering into a new
get_sstables_for_tablet function, taken from the existing
get_sstables_for_tablets. This will later be reused in the
tablet-aware restore code.
Extract the logic for downloading a single SST into a dedicated
function and reuse it in download_fully_contained_sstables. This
supports upcoming changes that consolidate common code.
Add a shard_id member to the minimal_sst_info struct as part of the
tablet-aware restore refactoring. This will support upcoming changes
that extract common code.
This change adds functionality for parsing backup manifests
and populating system_distributed.snapshot_sstables with
the content of the manifests.
This change is useful for tablet-aware restore. The function
introduced here will be called by the coordinator node
when restore starts to populate the snapshot_sstables table
with the data that workers need to execute the restore process.
Signed-off-by: Robert Bindar <robert.bindar@scylladb.com>
Co-authored-by: Pavel Emelyanov <xemul@scylladb.com>
This patch adds the snapshot_sstables table with the following
schema:
```cql
CREATE TABLE system_distributed.snapshot_sstables (
snapshot_name text,
keyspace text, table text,
datacenter text, rack text,
id uuid,
first_token bigint, last_token bigint,
toc_name text, prefix text)
PRIMARY KEY ((snapshot_name, keyspace, table, datacenter, rack), first_token, id);
```
The table will be populated by the coordinator node during the restore
phase (and later on during the backup phase to accommodate live-restore).
The content of this table is meant to be consumed by the restore worker
nodes, which will use this data to filter sstables and download them via
the file-based path.
Fixes SCYLLADB-263
Signed-off-by: Robert Bindar <robert.bindar@scylladb.com>
The loader will need to populate and read data from the
system_distributed.snapshot_sstables table added recently, so this
dependency is truly needed.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Next patches will introduce an RPC handler to restore a tablet on
replica. The handler will be registered by sstables_loader, and it will
have to call that helper from storage_service which thus needs to be
moved to public scope.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The goal of the split is to have try_transit_tablet() that
- doesn't throw if the tablet is in transition, but reports it back
- doesn't wait for the submitted transition to finish
The user will be the tablet-aware restore: it will call this new trying
helper in parallel, then wait for all transitions to finish.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
- Add `large_rows_exceeding_threshold`, `large_cell_exceeding_threshold`, and `large_collection_exceeding_threshold` metrics to complement the existing `large_partition_exceeding_threshold`
- Add unit tests verifying stats counters increment correctly during SSTable writes
Backport is not needed
Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1095
Closes scylladb/scylladb#29722
* github.com:scylladb/scylladb:
test/boost: add tests for large data stats counters
db: add large data metrics for rows, cells, and collections
Replace the local buffer_data_sink_impl and buffer_data_source_impl
classes in create_memory_sink() and create_memory_source() with
seastar::util::memory_data_sink and seastar::util::memory_data_source
respectively, which are now available upstream.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Closes scylladb/scylladb#29616
Add test_large_data_stats_large_rows, test_large_data_stats_large_cells,
and test_large_data_stats_large_collections to verify that the
large_data_handler stats counters are correctly incremented during
SSTable writes and that unrelated counters remain at zero.
Previously only large_partition_exceeding_threshold was exposed as a
metric. Add three new counters to large_data_handler::stats and register
corresponding Prometheus metrics:
- large_rows_exceeding_threshold
- large_cell_exceeding_threshold
- large_collection_exceeding_threshold
The counters are incremented in maybe_record_large_rows() and
maybe_record_large_cells() following the same pattern used by the
existing partition metric.
The procedure to migrate a vnodes-based keyspace to a tablets-based
keyspace has been labeled as experimental.
Fixes SCYLLADB-1932
Closes scylladb/scylladb#29834
Fixes: SCYLLADB-1693
In case we abort a decommission operation, the snapshot/backup
mechanism needs to remain available.
This change moves it to after raft_decommission.
In the case of a cluster snapshot, our node's ownership
(or not) of tables will be serialized by raft anyway, so it
should remain consistent. In that case we at worst coordinate
from a node in "leave" status.
In the case of a local snapshot, ownership matters less;
only the sstables on disk matter, and those should not change.
In the case of backup, this operates on a snapshot, whose state
is not affected.
Adds an injection point for testing.
v2:
- Added injection point to ensure test can abort decommission
Closes scylladb/scylladb#29667
6165124fcc has left statement_restrictions.cc scarred and
deformed. Restore it to standard 4-space indentation. This patch
contains only whitespace changes.
Closes scylladb/scylladb#29598
When a topology command (e.g., rebuild) fails on a target node, the
exception message was being swallowed at multiple levels:
1. raft_topology_cmd_handler caught exceptions and returned a bare
fail status with no error details.
2. exec_direct_command_helper saw the fail status and threw a generic
"failed status returned from {id}" message.
3. The rebuilding handler caught that and stored a hardcoded
"streaming failed" message.
This meant users only saw "rebuild failed: streaming failed" instead
of the actionable error from the safety check (e.g., "it is unsafe
to use source_dc=dc2 to rebuild keyspace=...").
Fix by:
- Adding an error_message field to raft_topology_cmd_result (with
[[version 2026.2]] for wire compatibility).
- Populating error_message with the exception text in the handler's
catch blocks.
- Including error_message in the exception thrown by
exec_direct_command_helper.
- Passing the actual error through to rtbuilder.done() instead of
the hardcoded "streaming failed".
A follow-up test is in https://github.com/scylladb/scylladb/pull/29363
Fixes: SCYLLADB-1404
Closes scylladb/scylladb#29362
Add explicit empty permissions block (permissions: {}) since this
workflow only syncs milestones to Jira using its own secrets and needs
no GITHUB_TOKEN permissions. Fixes code scanning alert #171.
Closes scylladb/scylladb#29184
When finalizing a tablet split, all data must have been moved into
split-ready compaction groups before the storage groups can be remapped
to the new tablet count. If split-unready groups still hold data at that
point, handle_tablet_split_completion() calls on_internal_error(), which
previously only reported the tablet and table IDs — giving no insight
into why the split-unready groups were not empty.
Add fmt::formatter specializations for compaction_group and storage_group
so the full state of the offending storage_group is included in the error
message. The storage_group formatter emits:
main=<cg>, merging=[<cg>...], split_ready=[<cg>...]
Each compaction_group formatter emits:
[sstables=[<sstable_desc>...], memtable_empty=<bool>, sstable_add_gate=<count>]
where sstable_desc includes filename, origin, identifier and originating
host, memtable_empty reflects whether all memtables have been flushed,
and sstable_add_gate count reveals whether an in-flight sstable add is
holding data in the group.
Supporting changes:
- compaction_group: add memtable_empty() const noexcept (delegates to
memtable_list::empty()) and a const overload of sstable_add_gate()
so both are accessible from a const compaction_group reference inside
the formatter.
- Promote sstable_desc from a local lambda in compaction_group_for_sstable
to a static free function so it is reusable by the formatter.
Refs https://scylladb.atlassian.net/browse/SCYLLADB-1019.
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Closes scylladb/scylladb#29178
For ms-format (trie-based) sstables, the traditional summary structure
is not populated. Instead, read equivalent metadata from the
_partitions_db_footer field: first_key, last_key, partition_count,
and trie_root_position.
This is a follow-up to the crash fix for SCYLLADB-1180, replacing the
informational-only message with actually useful output.
Refs: SCYLLADB-1180
Closes scylladb/scylladb#29164
When a partition key or clustering key value exceeds the 64 KiB limit
(65535 bytes serialized), Scylla used to raise a generic
std::runtime_error "Key size too large: N > M" from the low-level
compound-key serializer. That error surfaced to clients as a CQL
server error (code 0x0000, "NoHostAvailable"-looking), which is both
ugly and incompatible with Cassandra - Cassandra returns a clean
InvalidRequest with the message "Key length of N is longer than
maximum of M".
Fix this at the single chokepoint: compound_type::serialize_value in
keys/compound.hh. The serializer is on every path that materializes a
key - INSERT/UPDATE/DELETE/BATCH build mutations through it, and
SELECT builds partition and clustering ranges through it - so a single
throw replacement produces a clean InvalidRequest consistently across
all paths and all key shapes (single, compound PK, composite CK).
The previous approach on this PR branch patched three call sites in
cql3/restrictions/statement_restrictions.cc, which only covered
SELECT, duplicated the check, and placed it mid-restrictions code
(flagged in review). Dropping those changes in favour of the
root-cause fix here.
Un-xfail the tests this fixes:
- test/cqlpy/test_key_length.py: test_insert_65k_pk, test_insert_65k_ck,
test_where_65k_pk, test_where_65k_ck, test_insert_65k_ck_composite,
test_insert_total_compound_pk_err, test_insert_total_composite_ck_err.
- test/cqlpy/cassandra_tests/.../insert_test.py: testPKInsertWithValueOver64K,
testCKInsertWithValueOver64K.
- test/cqlpy/cassandra_tests/.../select_test.py: testPKQueryWithValueOver64K.
test_insert_65k_pk_compound stays xfail: its oversized value gets
rejected by the Python driver's CQL wire-protocol encoder (see
CASSANDRA-19270) before reaching the server, so the fix can't apply.
Updated its reason. testCKQueryWithValueOver64K stays xfail with an
updated reason: Cassandra silently returns empty for an oversized
clustering key in WHERE, while Scylla now throws InvalidRequest - a
deliberate choice mirroring the partition-key case, documented in
the discussion on #10366.
Add three tight-boundary tests (addressing review feedback on the
previous revision) that pin MAX+1 behaviour for SELECT and INSERT of
both partition and clustering keys.
Update test/cluster/dtest/limits_test.py to match the new message
("Key length of \\d+ is longer than maximum of 65535").
Fixes #10366
Fixes #12247
Co-authored-by: Alexander Turetskiy <someone.tur@gmail.com>
Closes scylladb/scylladb#23433
When a multi-column IN restriction contains tuples with a different
number of elements than the number of restricted columns (e.g.
`(b, c, d) IN ((1, 2), (2, 1, 4))`), Scylla would either produce an
inconsistent error message or, for over-sized tuples, an internal
type-mismatch error referencing the list literal representation.
Validate each tuple's arity against the number of restricted columns
while building the IN restriction and raise a clear
"Expected N elements in value tuple, but got M" error in both the
under- and over-sized cases.
Fixes#13241
Co-authored-by: Alexander Turetskiy <someone.tur@gmail.com>
Closes scylladb/scylladb#18407
This PR adds the `fulltext_index` custom index class, laying the groundwork for full-text search in ScyllaDB. It focuses on the CQL-facing layer - schema validation, option parsing, and metadata - without implementing the search backend itself.
Users can now write:
```cql
CREATE CUSTOM INDEX ON t(content) USING 'fulltext_index'
WITH OPTIONS = {'analyzer': 'english', 'positions': 'false'};
```
The implementation follows the same custom index pattern established by vector search: a `custom_index` subclass registered in the factory map, with no backing materialized view. This keeps the door open for a CDC-based indexing pipeline similar to the one vector search uses.
As part of this work, the option validation helpers (`validate_enumerated_option`, `validate_positive_option`, `validate_factor_option`) were extracted from `vector_index.cc` into a shared header so both index types can reuse them. The `custom_index` base class also gained a virtual `index_type_name()` method, giving each subclass a self-describing name for error messages without hardcoding strings in shared code.
The PR is split into three commits:
1. Extract shared validation utilities and add `index_type_name()` to `custom_index`
2. Implement `fulltext_index` with column type and option validation
3. Integration tests covering creation, validation, describe, and metadata
Fixes: SCYLLADB-1517
Fixes: SCYLLADB-1510
References: SCYLLADB-1516
Closes scylladb/scylladb#29658
* github.com:scylladb/scylladb:
test/cqlpy: add integration tests for `fulltext_index`
index: unify custom index description
index: add `fulltext_index` custom index implementation
index: extract option validation helpers
selection::used_functions() pushed the UDA, its SFUNC and its FINALFUNC,
but never the REDUCEFUNC. The reducefunc is invoked by the distributed
aggregation path in service::mapreduce_service, so a user could cause it
to run server-side without holding EXECUTE on it as long as the query
took the mapreduce path.
Also push agg.state_reduction_function so select_statement::check_access
requires EXECUTE on it too.
Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1756
Backport: no, it's a minor fix and UDFs are an experimental feature in Scylla
Closes scylladb/scylladb#29717
* github.com:scylladb/scylladb:
test/cqlpy: add test for EXECUTE permission on UDA sub-functions
cql3/selection: require EXECUTE on UDA REDUCEFUNC at SELECT time
The only behavioral change is that a bootstrapping node no longer becomes
a voter in case the cluster does not support the limited voters feature.
But that feature was introduced in 2025.2, and a direct upgrade from
2025.1 to a version newer than 2026.1 is not supported. Even if such an
upgrade is done, the removed code has an effect only during bootstrap,
not during a regular boot.
Also remove the upgrade test, since after this patch suppressing the
feature on the first boot will no longer behave correctly.
After `load_and_stream` (e.g. via `nodetool refresh --load-and-stream`)
returns success, source sstable files in the `upload/` directory may
still be on disk. `mark_for_deletion()` only sets an in-memory flag; the
actual file deletion runs lazily when the last `shared_sstable`
reference drops.
This leaves a window between API success and physical deletion where a
follow-up scan of the upload directory can detect sstables that will be deleted soon.
This might cause failures, because an sstable can already have been wiped by the time it is processed.
The fix:
Force unlink to complete before `stream()` returns, so the upload
directory is in a consistent state by the time the API reports success.
For tablet streaming, partially-contained sstables participate in
multiple per-tablet batches; eagerly unlinking after each batch would
break the next batch that still needs to read the file. A
`defer_unlinking` flag on the streamer postpones the explicit unlink
until after all batches complete (called once at the end of
`tablet_sstable_streamer::stream()`). Vnode streaming unlinks eagerly at the end of
`stream_sstable_mutations`.
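Roughly, the control flow looks like this (names are illustrative, not the actual streamer API; only sstable::unlink(), made idempotent by this series, is real):
```cpp
#include <seastar/core/coroutine.hh>
#include <seastar/core/future.hh>

// do_stream() is an assumed stand-in for the per-sstable streaming step.
// Tablet streaming passes defer_unlinking=true and unlinks once at the
// end of tablet_sstable_streamer::stream(); vnode streaming passes false.
seastar::future<> stream_one(sstables::shared_sstable sst, bool defer_unlinking) {
    co_await do_stream(sst);
    if (!defer_unlinking) {
        co_await sst->unlink();   // completes before stream() returns
    }
    co_return;
}
```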
Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1647
Backport is required, as this fixes a bug introduced in 517a4dc4df.
Closesscylladb/scylladb#29599
* github.com:scylladb/scylladb:
sstables_loader: synchronously unlink streamed sstables before returning
sstables: make sstable::unlink() idempotent
When a user calls the repair API with identical startToken and endToken
values, the code creates a wrapping interval (T, T]. This causes
unwrap() to split it into (-inf, T] and (T, +inf), covering the entire
token ring and triggering a full repair.
Reject such requests early with an error message matching
Cassandra's behavior: "Start and end tokens must be different."
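A minimal sketch of the early check, with illustrative names:
```cpp
#include <cstdint>
#include <stdexcept>

// Reject start == end before any interval is built, so unwrap() never sees
// the wrapping interval (T, T]. int64_t stands in for the real token type.
void validate_repair_range(int64_t start_token, int64_t end_token) {
    if (start_token == end_token) {
        throw std::invalid_argument("Start and end tokens must be different.");
    }
}
```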
Fixes: https://scylladb.atlassian.net/browse/CUSTOMER-358
Closesscylladb/scylladb#29821
Add a node_owner column (locator::host_id) to system.sstables and make it part of the partition key, so the primary key becomes PRIMARY KEY ((table_id, node_owner), generation).
This is the first step toward moving the sstables registry into system_distributed: once distributed, each node's startup scan must read only the rows it owns, which requires the owning node to be part of the partition key. Partitioning by (table_id, node_owner) turns that scan into a single-partition read of exactly the local node's rows.
Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1562
No need to backport this; keyspace over object storage is an experimental feature
Closesscylladb/scylladb#29659
* github.com:scylladb/scylladb:
db, sstables: add node_owner to sstables registry primary key
db, sstables: rename sstables registry column owner to table_id
Replace the two remaining direct 'throw bufsize_mismatch_exception(...)'
call sites with the new throw_bufsize_mismatch_exception() helper, which
routes through throw_malformed_sstable_exception() and thus also respects
the --abort-on-malformed-sstable-error flag.
Affected files:
- sstables/sstables.cc (1 site, in check_buf_size())
- sstables/m_format_read_helpers.cc (1 site, in check_buf_size())
Replace all direct 'throw malformed_sstable_exception(...)' call sites
with the new throw_malformed_sstable_exception() helper, which respects
the --abort-on-malformed-sstable-error flag.
Both functions now check abort_on_malformed_sstable_error() first. If
set, they log the error and call std::abort() directly, generating a
coredump. Otherwise they fall through to the existing on_internal_error()
path, which is in turn controlled by --abort-on-internal-error.
Add scoped_no_abort_on_malformed_sstable_error RAII guard (modeled after
seastar::testing::scoped_no_abort_on_internal_error) and use it in all
tests that intentionally corrupt sstables and expect
malformed_sstable_exception to be thrown rather than the process aborting.
Add the --abort-on-malformed-sstable-error command-line option and the
supporting infrastructure. When set, any malformed sstable error will
abort the process and generate a coredump instead of throwing an
exception. This is useful for debugging memory corruption that may
manifest as apparent sstable corruption.
The implementation introduces:
- throw_malformed_sstable_exception() and throw_bufsize_mismatch_exception()
helper functions in sstables/sstables.cc, which check the new flag and
either abort (with logging) or throw the appropriate exception.
- set_abort_on_malformed_sstable_error() / abort_on_malformed_sstable_error()
to control the per-process atomic flag.
- abort_on_malformed_sstable_error config option (LiveUpdate, default false)
wired up in main.cc alongside abort_on_internal_error.
Call-site migration will follow in subsequent commits.
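A hedged sketch of the helper's control flow (signatures simplified; the flag accessor and exception type are declared as stand-ins for the real ones named above):
```cpp
#include <cstdio>
#include <cstdlib>
#include <stdexcept>
#include <string>
#include <utility>

// Stand-ins for the real flag accessor and exception type from the series.
bool abort_on_malformed_sstable_error();
struct malformed_sstable_exception : std::runtime_error {
    using std::runtime_error::runtime_error;
};

// Check the per-process flag first: abort (with a log line) to produce a
// coredump; otherwise fall through to the existing exception path.
[[noreturn]] void throw_malformed_sstable_exception(std::string msg) {
    if (abort_on_malformed_sstable_error()) {
        std::fprintf(stderr, "malformed sstable: %s, aborting\n", msg.c_str());
        std::abort();
    }
    throw malformed_sstable_exception(std::move(msg));
}
```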
make_entry_descriptor() and the two overloads of parse_path() used to signal
parse failures by throwing malformed_sstable_exception, which made parse_path()
expensive to use as a probe (e.g. to classify directory entries).
Change make_entry_descriptor() and both parse_path() overloads to return
std::expected<T, sstring>, where the sstring carries the error message on
failure, eliminating the exception overhead at probe call sites.
Call sites that previously caught malformed_sstable_exception to treat the
path as a non-SSTable file (utils/directories.cc, db/snapshot/backup_task.cc,
tools/scylla-sstable.cc) now check the expected result directly.
Call sites where a parse failure is a genuine error (sstable_directory.cc,
sstables.cc, tools/schema_loader.cc, tools/scylla-sstable.cc) re-throw
explicitly as malformed_sstable_exception using the error string, preserving
the existing error propagation behaviour.
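In use, the new shape looks roughly like this (entry_descriptor and the call sites are simplified stand-ins):
```cpp
#include <expected>
#include <string>

struct entry_descriptor { std::string component; };  // simplified stand-in

// Declared only: returns a descriptor, or the error message on failure.
std::expected<entry_descriptor, std::string> parse_path(const std::string& path);

void classify(const std::string& path) {
    auto desc = parse_path(path);
    if (!desc) {
        // Probe call site: treat as "not an sstable file" and skip, with
        // no exception overhead. Genuine-error call sites instead re-throw:
        //   throw malformed_sstable_exception(desc.error());
        return;
    }
    // ... use desc->component ...
}
```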
Verify that SELECT of a UDA requires EXECUTE on its SFUNC, FINALFUNC,
and REDUCEFUNC individually. If any one permission is missing, the
query must be rejected at planning time (even on an empty table).
The test is parameterized over the three sub-functions and uses
Lua on Scylla or Java on Cassandra, so it runs on both backends.
The REDUCEFUNC case is skipped on Cassandra since REDUCEFUNC is a
Scylla extension.
Refs SCYLLADB-1756
The MV Select Statement description was missing the word "columns" and
used incorrect verb agreement, making the sentence grammatically broken
and ambiguous.
docs/cql/mv.rst: "which of the base table is included" →
"which of the base table columns are included"
Fixes #29662
Closes #29663
Co-authored-by: annastuchlik <37244380+annastuchlik@users.noreply.github.com>
The timeout_config (more exactly -- updateable_timeout_config) is used by alternator/controller and transport/controller. Both create a local copy of that object by constructing one out of db::config. Also, some options from this config are needed by storage_proxy, but since it doesn't have access to any timeout_config-s, it just uses db::config by getting it from the database.
This PR introduces a top-level sharded<updateable_timeout_config>, initializes it from db::config values and makes the existing users plus storage_proxy use it where required. Motivation -- remove more replica::database::get_config() users. A side effect -- timeout_config is no longer duplicated by the transport and alternator controllers.
Components' dependencies cleanup, not backporting.
Closesscylladb/scylladb#29636
* github.com:scylladb/scylladb:
storage_proxy: Use shared updateable_timeout_config for CAS contention timeout
alternator: Use shared updateable_timeout_config by reference
cql_transport: Use shared updateable_timeout_config by reference
storage_proxy: Use shared updateable_timeout_config by reference
main: Introduce sharded<updateable_timeout_config>
storage_proxy: Keep own updateable_timeout_config
This option is used in two places -- proxy and view-update-generator both need it to compute the calculate_view_update_throttling_delay() value. This PR moves the option onto the view_update_backlog top-level service, makes the calculating helper a method of that class and patches the callers to use it. This eliminates more places that abuse database as a db::config accessor.
Code dependencies refactoring, not backporting
Closesscylladb/scylladb#29635
* github.com:scylladb/scylladb:
view: Turn calculate_view_update_throttling_delay into node_update_backlog member
view: Place view_flow_control_delay_limit_in_ms on node_update_backlog
view: Add node_update_backlog reference to view_update_generator
Stop reading maintenance_mode through replica::database's db::config.
Add a properly typed maintenance_mode_enabled field to
storage_proxy::config, populate it in main.cc from cfg->maintenance_mode()
(same as messaging_service::config), and use a cached member in
storage_proxy instead of db.local().get_config().maintenance_mode().
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Closesscylladb/scylladb#29637
Add .set_skip_when_empty() to compaction_manager::validation_errors.
This metric only increments when scrubbing encounters out-of-order or
invalid mutation fragments in SSTables, indicating data corruption.
It is almost always zero and creates unnecessary reporting overhead.
AI-Assisted: yes
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
Closesscylladb/scylladb#29349
Add a --time-trace flag to configure.py and a Scylla_TIME_TRACE CMake
option that enable Clang's -ftime-trace on all C++ compilations. When
enabled, each .o file produces a companion .json trace that can be
analyzed with ClangBuildAnalyzer or loaded in chrome://tracing to
identify slow headers and costly template instantiations.
This is the first step toward data-driven build speed improvements.
Refs #1
Usage:
configure.py: ./configure.py --time-trace --mode dev
CMake: cmake -DScylla_TIME_TRACE=ON -DCMAKE_BUILD_TYPE=Dev ..
Closesscylladb/scylladb#29462
rust/**/target and Cargo.lock files under rust/inc/ and
rust/wasmtime_bindings/ were not ignored, nor was
test/resource/wasm/rust/target/.
Closesscylladb/scylladb#28943
Fix 28 format string bugs plus 5 related format argument bugs across 14 modules
where `{}` placeholders were missing or arguments were wrong, causing arguments to
be silently dropped or misleading output from the `{fmt}` library.
Inspired by https://github.com/scylladb/scylladb/pull/29143 (which fixed a single
instance in `replica/table.cc`), a comprehensive audit of the entire codebase was
performed to find all similar issues.
- **Missing `{}` placeholder** (21 instances): the format string simply lacks a `{}` for a
passed argument, e.g. `format("msg for table {}", group_id, table_id)` -- the single
`{}` consumes `group_id` and `table_id` is silently dropped
- **Spurious comma breaking C++ string literal concatenation** (2 instances): a comma
after a string literal prevents adjacent-literal concatenation, turning the
continuation into a format argument instead of part of the format string
- **Printf-style `%s` in fmtlib context** (4 instances): `%s` has no meaning in fmtlib
and appears as literal text while the argument is silently ignored
- **Extra spurious argument** (1 instance): an extraneous `t.tomb()` argument inserted
between correct arguments, causing wrong values in the wrong slots
- **Wrong variable in error message** (4 instances in `types/map.hh`): error messages
for oversized map keys/values reported `map_size` (total entry count) instead of the
actual `elem.first.size()` or `elem.second.size()` that exceeded the limit
- **Swapped argument order** (1 instance in `data_dictionary/data_dictionary.cc`):
format string says `"Extraneous options for {type}: {values}"` but the values and
type arguments were passed in reverse order
| Module | Bugs Fixed | Files |
|--------|:---------:|-------|
| `replica/` | 1 | `table.cc` |
| `service/` | 4 | `raft_group0.cc`, `storage_service.cc` |
| `db/` | 6 | `heat_load_balance.cc`, `commitlog_replayer.cc`, `view_update_generator.cc`, `view_building_worker.cc`, `row_locking.cc` |
| `cql3/` | 2 | `prepare_expr.cc`, `statement_restrictions.cc` |
| `transport/` | 4 | `event_notifier.cc` |
| `sstables/` | 3 | `partition_reversing_data_source.cc`, `reader.cc` |
| `alternator/` | 1 | `conditions.cc` |
| `cdc/` | 1 | `split.cc` |
| `raft/` | 1 | `server.cc` |
| `utils/` | 2 | `gcp/object_storage.cc`, `s3/client.cc` |
| `mutation/` | 1 | `mutation_partition.hh` |
| `ent/` | 2 | `kmip_host.cc`, `kms_host.cc` |
| `types/` | 4 | `map.hh` |
| `data_dictionary/` | 1 | `data_dictionary.cc` |
The `{fmt}` library's compile-time checker validates that each `{}` placeholder
references a valid argument, but does **not** verify the reverse -- that every
argument has a corresponding placeholder. Extra arguments are silently ignored
at both compile time and runtime.
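The asymmetry is easy to demonstrate in a self-contained program using fmt directly:
```cpp
#include <fmt/core.h>

int main() {
    int group_id = 7;
    int table_id = 42;
    // Compiles cleanly: the single {} is a valid reference (it consumes
    // group_id), but nothing verifies that table_id has a placeholder.
    // Prints "msg for table 7": the wrong value lands in the slot and
    // table_id is silently dropped.
    fmt::print("msg for table {}\n", group_id, table_id);
}
```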
Build verified with `dbuild ninja build/dev/scylla` -- compiles cleanly.
---
**Note:** Commits were amended to fix the author name from "Yaniv Michael Kaul" to "Yaniv Kaul".
Closesscylladb/scylladb#29448
* github.com:scylladb/scylladb:
data_dictionary: fix swapped arguments in extraneous options error
types: fix wrong variable in map key/value size error messages
ent: fix missing format placeholders in encryption error/log messages
mutation: fix spurious argument in shadowable_tombstone formatter
utils: fix missing format placeholders in object storage log messages
raft: fix missing format placeholder in server ostream operator
cdc: fix missing format placeholder in error message
alternator: fix missing format placeholder in error message
sstables: fix missing format placeholders in error messages
transport: fix printf-style format specifiers in fmtlib log calls
cql3: fix missing format placeholders in error messages
db: fix missing format placeholders in log and error messages
service: fix missing format placeholders in log messages
replica: fix missing format placeholder in cleanup log message
Build artifacts are currently scattered across
build/dist/$mode/redhat/, tools/python3/build/, tools/cqlsh/build/, etc. with unpredictable names. Add a new 'collect-dist' ninja target that
gathers all distributable artifacts into a well-known structure:
build/$mode/dist/rpm/ -- all binary RPMs (no SRPMs)
build/$mode/dist/deb/ -- all .deb packages
build/$mode/dist/tar/ -- relocatable tarballs (already here)
The collection is done via a reusable 'collect_pkgs' ninja rule defined
directly in configure.py, which knows all the source paths. No external
script is needed.
Fixes: SCYLLADB-75
Closesscylladb/scylladb#29475
configure.py creates compile_commands.json in the root directory as a
symbolic link to the file in one of the build directories. If the file
already exists it does nothing.
However, it may happen that the file exists but the target file does not:
for example, if the build directory is removed and the tree is then built
with a different mode, the file remains as a stale symbolic link.
To address this, when the file exists, also check that it is a valid
symbolic link. If not, recreate it with a valid target.
Closesscylladb/scylladb#29680
Alternator Streams now supports tablets, so stop skipping the TTL Streams test in tablet mode and stop forcing vnodes in the Streams audit test.
Refs SCYLLADB-463
Closesscylladb/scylladb#29697
As a final step for https://scylladb.atlassian.net/browse/SCYLLADB-461 we need to graduate Alternator Streams from experimental.
So let's remove `--experimental-features=alternator-streams` and map the obsolete config string to `UNUSED` for backward compatibility. Also, remove the related gating of the feature.
Finally, stop providing the config flag in test configs.
Fixes SCYLLADB-1680
Fixes#16367
The documentation work tracked by https://scylladb.atlassian.net/browse/SCYLLADB-462 still remains.
This PR needs to hit 2026.2, so (only) if it branches before the PR is merged to `master`, we'd need to backport.
Closesscylladb/scylladb#29604
* github.com:scylladb/scylladb:
test: Stop providing alternator-streams experimental flag
alternator: Graduate Alternator Streams from experimental
Migrate vector search (ANN ordered select query) CQL tests from C++/Boost suite to pytest.
This migration includes:
- New pytest tests in `test/cqlpy/test_vector_search_with_vector_store_mock.py`
- VectorStoreMock server as pytest fixture to simulate vector store responses
The benefits of this migration are:
- Extended test coverage to verify CQL protocol serialization and driver
- Reduced overall test time (no compilation required for pytest)
Fixes SCYLLADB-695
No backport needed as this is a refactoring.
Closesscylladb/scylladb#29593
* github.com:scylladb/scylladb:
vector_search: test: migrate paging warnings tests to Python
vector_search: test: migrate local_vector_index to Python
vector_search: test: migrate vector_index_with_additional_filtering_column to Python
vector_search: test: migrate cql_error_contains_http_error_description to Python
vector_search: test: migrate pk in restriction test to Python
- Handle dropped tables gracefully in the tablet load balancer's `get_schema_and_rs()` instead of aborting with `on_internal_error`
- The load balancer operates on a token metadata snapshot but accesses the live schema for table lookups. A DROP TABLE applied by another fiber between coroutine yield points can remove a table from the live schema while it still exists in the snapshot, causing an abort.
`get_schema_and_rs()` now returns `std::optional` and logs a message at debug level instead of aborting when a table is missing. All callers skip dropped tables:
- `make_sizing_plan`: skips to next table
- `make_resize_plan`: skips to next table (merge suppression is moot)
- `check_constraints`: returns `skip_info{}` with empty viable targets
- `get_rs`: returns `nullptr`, checked by `check_constraints`
The call chain is: `make_plan` → `make_internode_plan` → `check_constraints` → `get_rs` → `get_schema_and_rs`. The `make_internode_plan` coroutine has multiple `co_await` yield points (`maybe_yield`, `pick_candidate`) between building the candidate tablet list and checking replication constraints. A DROP TABLE schema mutation applied during any of these yields removes the table from `_db.get_tables_metadata()` while the candidate list still references it.
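A hedged sketch of the new shape (the tables-metadata lookup used here is an assumed accessor, not the real API):
```cpp
// A dropped table now yields nullopt, which every caller above treats as
// "skip this table" instead of tripping on_internal_error.
std::optional<schema_and_rs> get_schema_and_rs(table_id id) {
    auto* t = _db.get_tables_metadata().get_table_if_exists(id); // assumed accessor
    if (!t) {
        lblogger.debug("table {} was dropped during balancing, skipping", id);
        return std::nullopt;
    }
    return schema_and_rs{t->schema(), t->get_effective_replication_map()};
}
```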
Added `test_load_balancing_with_dropped_table` which simulates the race by capturing a token metadata snapshot, dropping the table, then calling `balance_tablets` with the stale snapshot.
Fixes: SCYLLADB-1664
This fix needs to be backported to versions: 2025.4, 2026.1
Closesscylladb/scylladb#29585
* github.com:scylladb/scylladb:
test: verify load balancer handles dropped tables gracefully
tablet_allocator: handle dropped tables gracefully in get_schema_and_rs
When an Alternator stream is disabled, the data should continue to be accessible so that consumers can finish reading. When the stream is later re-enabled, a new StreamArn is produced and only then the old data is purged.
On disable, the existing CDC options (including preimage and postimage) are preserved so that DescribeStream can still report StreamViewType. All stream APIs continue to work on the disabled stream, with all shards reported as closed (EndingSequenceNumber set). No new CDC records are written; existing data expires via TTL after 24 hours.
On re-enable, the old CDC log table is dropped as a separate Raft group0 schema change and a fresh one is created with a new UUID, giving a new StreamArn. This is Alternator-specific — CQL CDC keeps reusing the log table. Re-enabling is the only way to immediately purge old stream data.
Old stream data is removed immediately upon re-enable (a discrepancy with DynamoDB, which keeps it readable for 24 hours through the old StreamArn).
Tests updated to cover the new disable and re-enable behavior.
Fixes#7239
Fixes SCYLLADB-523
Closesscylladb/scylladb#29413
* github.com:scylladb/scylladb:
alternator/streams: remove dead next_iter in get_records
test/alternator: fix stream wait timeouts to use wall-clock time
docs/alternator: document stream disable/re-enable behavior
alternator/streams: keep disabled streams usable and purge on re-enable
Introduce `read_from_collection_cell_view()` which reads a `collection_mutation` directly from the IDL representation of a collection (`ser::collection_cell_view`). This cuts down the number of allocations required drastically compared to the current method of:
IDL -> collection_mutation_description -> collection_mutation
Reduces the number of allocations to unfreeze a collection from O(collection_cell_count) -> O(1) (actually, due to buffer fragmentation, it is O(collection_size)).
The new method is used when unfreezing frozen mutations and frozen mutation fragments. This is on the hot path: all writes with collections benefit.
Add a `--collection` flag to `perf-simple-query` to allow measuring the performance improvement of this PR.
With `dbuild -it -- build/release/scylla perf-simple-query --collection=16 -c1 -m2G --default-log-level=error --write` the number of allocations drops from ~123 to 102, a significant amount of allocations shaved off.
Refs: https://github.com/scylladb/scylladb/issues/3602 (solves one use-case out of the many listed therein)
Fixes: SCYLLADB-1046
Fixes: SCYLLADB-1077
Backport: this is an optimization so normally not a backport candidate, but we may have to backport to relieve certain customers
Closesscylladb/scylladb#29033
* github.com:scylladb/scylladb:
test/perf/perf_simple_query: add --collection=N
test/boost/frozen_mutation_test: add freeze/unfreeze test for large collections
mutation/mutation_partition_view: use read_from_collection_cell_view() to read collections
mutation/collection_mutation: introduce read_from_collection_cell_view()
mutation/atomic_cell: atomic_cell_type: add write*() and *serialized_size()
mutation/collection_mutation: generalize serialize_collection_mutation
mutation/mutation_partition_view: avoid copying collection
mutation/mutation_partition_view: accept collection_mutation in the consume API
partition_builder: add move variant of accept_*_cell() collection overloads
Copilot, which reviewed the implementation of the CONTAINS operator,
complained that in some places we assume without checking that the
user-provided parameter to CONTAINS has the expected structure.
Not doing all the checks explicitly is actually not terrible in
RapidJSON, because its methods like MemberBegin() always validate the
type before trying to follow a pointer, throwing an exception if
the JSON value doesn't have the right type. But it's still cleaner
to do these checks explicitly, and throw a clean SerializationError
instead of some internal server error. So this is what this patch does.
If the malformed object doesn't come from the query but rather comes
from the data, we just silently return false. This is our usual
convention - we don't expect malformed data in our database, but if
we do have some (see issue #8070) we shouldn't tell the user that
there was an error in his completely valid query.
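The pattern, sketched with RapidJSON's real iteration API (the exception type is a stand-in for Alternator's SerializationError):
```cpp
#include <rapidjson/document.h>
#include <stdexcept>

// Check the value's type explicitly before walking its members, so a
// malformed query parameter yields a clean client-facing error instead
// of an internal server error.
void for_each_member(const rapidjson::Value& v) {
    if (!v.IsObject()) {
        throw std::runtime_error("SerializationError: expected a JSON object");
    }
    for (auto it = v.MemberBegin(); it != v.MemberEnd(); ++it) {
        // ... inspect it->name and it->value, validating types as we go ...
    }
}
```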
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
The format string says "Extraneous options for {type}: {values}"
but the arguments were passed in the wrong order (values first, type
second), producing misleading error messages like
"Extraneous options for bucket,endpoint: S3" instead of
"Extraneous options for S3: bucket,endpoint".
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
Four error messages for oversized map keys/values reported map_size
(the total number of entries) instead of the actual key or value size
that exceeded the limit. The condition checks elem.first.size() or
elem.second.size(), but the error message printed map_size. This
affects both the bytes and managed_bytes serialization overloads.
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
Fix two format string bugs:
- kmip_host.cc: cmd_in was passed as an argument to a trace log but
had no {} placeholder, so the command was silently dropped.
- kms_host.cc: the XML node name (what) was passed to the error
message but had no {} placeholder, so the error never showed which
XML node was missing.
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
The formatter for shadowable_tombstone had a spurious t.tomb()
argument between the timestamp and deletion_time arguments. This
caused t.tomb() (the whole tombstone) to be formatted into the
deletion_time={} slot, while the actual deletion_time count was
silently dropped. Remove the extra argument.
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
Fix two format string bugs:
- gcp/object_storage.cc: _session_path was passed but the format
string had empty parentheses () instead of ({}), so the session
path was silently dropped from the debug output.
- s3/client.cc: part_number was passed as an argument but had no {}
placeholder. The upload_id ended up in the etag slot and was
silently dropped. Add {} for all three values.
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
The FSM state was passed as an argument but the format string had
empty parentheses () instead of ({}), causing the FSM state to be
silently dropped from the output.
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
The collection type name was passed as an argument but the format
string only had a trailing colon without a {} placeholder, so the
type name was silently dropped from the error message.
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
The values count was passed as an argument but had no {} placeholder,
so it was silently dropped. The analogous BETWEEN check on the line
above correctly uses {} -- apply the same pattern here.
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
Fix three format string bugs:
- partition_reversing_data_source.cc: _row_start was passed as an
argument but had no {} placeholder in the invariant error message.
Add {} for all three values to show the full diagnostic.
- reader.cc: two "Invalid boundary type" error messages passed the
type value as an argument but had no {} placeholder, so the actual
invalid type was never shown.
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
Four logger calls used %s (printf-style) instead of {} (fmtlib-style),
causing __func__ to be silently ignored and the literal text "%s" to
appear in the log output. The same file already uses {} correctly in
the on_create_function and on_create_aggregate handlers.
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
Fix two format string bugs where arguments were silently dropped:
- prepare_expr.cc: the bad argument to count() was passed but had no
{} placeholder, so users never saw what was actually passed.
- statement_restrictions.cc: the unsupported multi-column relation was
passed but the trailing colon had no {} placeholder.
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
Fix six format string bugs where arguments were silently dropped:
- heat_load_balance.cc: pp value was passed but had no {} placeholder.
- commitlog_replayer.cc: column_family_id was passed but table= had
no {} placeholder.
- view_update_generator.cc: _sstables_with_tables.size() was passed
but had no {} placeholder.
- view_building_worker.cc: exception pointer was passed but the
trailing colon had no {} placeholder.
- row_locking.cc: partition key and clustering key were passed in
error messages but had no {} placeholders.
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
Fix four format string bugs:
- raft_group0.cc: the exception from sleep_and_abort was passed as an
argument but had no {} placeholder, so it was silently dropped.
- storage_service.cc: loading topology trace was missing a placeholder
for the cleanup field (9 args but only 8 placeholders).
- storage_service.cc: two join-rejection warnings had a spurious comma
after the first string literal, breaking C++ string concatenation.
This caused the continuation string to be treated as a separate
format argument instead of being part of the format string, and
params.host_id was silently dropped.
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
The log message for tablet cleanup invalidation was missing a {}
placeholder for the table name (cf_name), causing it to be silently
dropped from the output. Add {}.{} to show both keyspace and table
name, consistent with the convention used elsewhere in the file.
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
The function database::create_local_system_table calls
get_tables_metadata().hold_write_lock(), but does not co_await the
returned future. Effectively, this code does not guarantee mutual
exclusion because it does not wait for the lock to be acquired and does
not guarantee that the lock is held long enough.
Fix this by adding the co_await that was missing.
Found by manual inspection. This code is not known to have caused any
problems so far, but it's clearly wrong - hence the fix.
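A minimal analog of the bug, using seastar::rwlock for illustration (the real code uses the tables-metadata write lock):
```cpp
#include <seastar/core/future.hh>
#include <seastar/core/rwlock.hh>

seastar::future<> critical_section(seastar::rwlock& lock) {
    // BUG (before): lock.hold_write_lock();
    // The returned future was discarded, so the lock was never acquired
    // and nothing kept it held across the critical section.
    auto holder = co_await lock.hold_write_lock(); // FIX: await the future;
                                                   // the lock stays held while
                                                   // 'holder' is alive
    // ... mutate shared state under the lock ...
    co_return;
}
```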
Closesscylladb/scylladb#29806
Drop the creation code for the `service_levels` and `cdc_generation_descriptions_v2` tables, since they are no longer needed. Old clusters will still have them because they were created earlier. Also the series contains a small improvement around group0 creation.
No backport needed since this removes functionality.
Closesscylladb/scylladb#29482
* github.com:scylladb/scylladb:
db/system_distributed_keyspace: remove system_distributed_everywhere since it is unused
db/system_distributed_keyspace: drop CDC_TOPOLOGY_DESCRIPTION and CDC_GENERATIONS_V2
db/system_distributed_keyspace: remove unused code
db/system_distributed_keyspace: drop old cdc_generation_descriptions_v2 table
db/system_distributed_keyspace: drop old service_levels table
fix indent after the previous patch
group0: call setup_group0 only when needed
interval switched from std::optional<> to union + bools for bound
storage in 42d7ae1082.
Update the printer to work with the new layout. Keep the code
backwards compatible, 2025.1 still uses optionals and is still
supported.
Closesscylladb/scylladb#29738
Fix 4 genuine CQL syntax errors in documentation examples, found by automated extraction and execution of doc code blocks against a live ScyllaDB instance.
- **insert.rst**: `USING TTL 86400 IF NOT EXISTS` → `IF NOT EXISTS USING TTL 86400` (wrong clause order produces syntax error)
- **ddl.rst**: Missing opening quote in ALTER KEYSPACE example (`dc2'` → `'dc2'`)
- **ddl.rst**: Hyphenated column names need double-quoting; also fix PRIMARY KEY referencing non-existent `customer_id` instead of `cust_id`
- **types.rst**: UDT `address` contains nested collections, so it must be `frozen<address>` when used as a column type
Built a CQL extractor that parses `.. code-block:: cql` blocks from RST docs, then executed all 194 extracted statements against ScyllaDB 2026.2.0-rc0. These 4 are confirmed syntax/semantic errors in the documentation.
Closesscylladb/scylladb#29765
* github.com:scylladb/scylladb:
test/cqlpy: add tests for hyphenated column names
docs/cql: fix UDT example to use frozen<address>
docs/cql: fix CREATE TABLE example with hyphenated column names
docs/cql: fix missing opening quote in ALTER KEYSPACE example
docs/cql: fix INSERT example clause order (IF NOT EXISTS before USING)
Prepare for truncate of tables on object storage, where we want to
limit the atomic deletion batches to produce smaller batch mutations.
This is safe since truncate does not really need to delete all sstables
in the table atomically — it is already non-atomic since each node and
each shard deletes its own sstables. The atomic deletion mechanism is
used for convenience.
Previously, discard_sstables collected all sstables from all compaction
groups on the shard into a single vector and issued one atomic delete
for all of them. Change to track removed sstables per compaction group
and issue separate atomic deletes per group using
coroutine::parallel_for_each, allowing concurrent deletion across
groups.
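Sketched with Seastar's coroutine helper (the container and per-group helpers are assumed stand-ins):
```cpp
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/parallel_for_each.hh>

// One atomic delete per compaction group, issued concurrently, instead of
// a single shard-wide batch. take_removed_sstables() and delete_atomically()
// are assumed names for the per-group tracking and the deletion call.
seastar::future<> discard_sstables_sketch(std::vector<compaction_group*>& groups) {
    co_await seastar::coroutine::parallel_for_each(groups,
            [] (compaction_group* cg) -> seastar::future<> {
        co_await delete_atomically(cg->take_removed_sstables());
    });
}
```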
Closesscylladb/scylladb#29789
The BTI partition index trie writer flushes all buffered nodes at the
end of each SSTable via complete_until_depth(0), called from
bti_partition_index_writer_impl::finish(). This is a tight synchronous
loop that writes trie nodes through file_writer::write(), which uses a
buffered output_stream: individual writes that fit in the buffer are
plain memcpy operations returning a ready future, so .get() never
yields. As a result the reactor can stall for several milliseconds on
large SSTables.
The entire call chain runs inside seastar::async() (via
sstable::write_components()), so seastar::thread::maybe_yield() is
safe to call here. Add it at the top of both tight loops:
- complete_until_depth(), which iterates over trie depth
- lay_out_children(), which iterates over child branches per node
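The pattern, in sketch form (the loop body is simplified to stand-in helpers):
```cpp
#include <seastar/core/thread.hh>

// Inside seastar::async(), thread::maybe_yield() at the top of each
// iteration lets the reactor run pending tasks even though every write
// below is a buffered memcpy that completes immediately.
void complete_until_depth_sketch(size_t target_depth) {
    while (current_depth() > target_depth) {   // assumed helper
        seastar::thread::maybe_yield();        // breaks up the stall
        flush_one_node();                      // assumed: writes one trie node
    }
}
```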
Fixes SCYLLADB-1885
Closesscylladb/scylladb#29798
Previously, when a snapshot load subsumed a committed entry before apply()
was called locally, add_entry would throw commit_status_unknown -- even
though the entry was known to be committed and included in the snapshot.
This was overly pessimistic. Normal state machine implementations
shouldn't care whether an entry was applied via apply() or via a snapshot load.
Unnecessary commit_status_unknown caused flakiness of
test_frequent_snapshotting and unnecessary retries in group0. Raft groups
from strongly consistent tables couldn't hit unnecessary
commit_status_unknown's because they use wait_type::committed and
`enable_forwarding == false`.
Three sites are changed:
1. wait_for_entry (truncation case): the snapshot-term match optimization
that proved the entry was committed now applies to both wait_type::committed
and wait_type::applied, not just committed.
2. wait_for_entry (snapshot covers entry): instead of throwing
commit_status_unknown when the snapshot index >= entry index, return
successfully. The entry's effects are included in the state machine's
state via the snapshot.
3. drop_waiters: when called from load_snapshot, pass the snapshot term.
Waiters whose term matches the snapshot term are resolved successfully
(set_value) instead of failing with commit_status_unknown, since the
Log Matching Property guarantees they were committed and included (see
the sketch after this list).
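A hedged sketch of the drop_waiters change from item 3 (the waiter container and promise plumbing are simplified stand-ins):
```cpp
// Waiters covered by the snapshot whose term matches are proven committed
// by the Log Matching Property and resolve successfully; covered waiters
// with a different term still fail with commit_status_unknown.
void drop_waiters_sketch(const snapshot_descriptor* snp) {
    for (auto& [idx, w] : _waiters) {          // assumed waiter map
        if (!snp || idx > snp->idx) {
            continue;                          // not covered by the snapshot
        }
        if (w.term == snp->term) {
            w.promise.set_value();             // included in the snapshot
        } else {
            w.promise.set_exception(commit_status_unknown{});
        }
        // ... erase the handled waiter ...
    }
}
```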
This deflakes test_frequent_snapshotting: the test uses aggressive
snapshot settings (snapshot_threshold=1) causing wait_for_entry to
occasionally find the snapshot covering its entry. Previously this
threw commit_status_unknown, failing the test. With this fix,
wait_for_entry returns success. Note that apply() is never actually
skipped in this test -- the leader always applies entries locally
before taking a snapshot.
The nemesis test is updated to handle the new behavior:
call() detects when add_entry succeeded but the output channel was
not written (apply() skipped locally) and returns apply_skipped instead
of hanging. The linearizability checker in basic_generator_test counts
skipped applies separately from failures. basic_generator_test
exercises this path: skipped_applies > 0 occurs in some runs.
Fixes: SCYLLADB-1264
No backport: the changes are quite risky and the test being fixed
fails very rarely.
Closesscylladb/scylladb#29685
* github.com:scylladb/scylladb:
test/raft: fix duplicate check in connected::operator()
test/raft: add tests for add_entry snapshot interactions
raft: do not throw commit_status_unknown from add_entry when possible
raft: change drop_waiters parameter from index to snapshot descriptor
raft: server: fix a typo
Add `test_fulltext_index.py` covering the `fulltext_index` custom index:
- Creation on text, varchar, and ascii columns
- Rejection of non-text types (int, blob, vector)
- Validation of analyzer and positions options
- Rejection of unsupported option keys
- Case-insensitive class name lookup
- DESCRIBE INDEX output with and without options
- No backing materialized view in `system_schema.views`
- IF NOT EXISTS idempotent behavior
- Metadata correctness in `system_schema.indexes`
Move common description logic into a protected helper
`describe_with_target` on `custom_index`, so subclasses can delegate
to it when implementing the `describe()` virtual method.
Introduce `fulltext_index`, a new `custom_index` subclass
for full-text search (FTS).
The index validates that the target column is a text type
(text, varchar, or ascii) and supports two WITH OPTIONS keys:
- 'analyzer': one of standard, english, german, french, spanish,
italian, portuguese, russian, chinese, japanese, korean, simple,
whitespace
- 'positions': boolean controlling whether term positions are stored
`view_should_exist()` returns false — no backing materialized view is
created, matching the CDC-backed pattern used by `vector_index`.
Fixes: SCYLLADB-1517
Move `validate_enumerated_option`, `validate_positive_option`,
and `validate_factor_option` into shared index option utilities
under the `secondary_index::util` namespace. These functions were
previously defined as file-local statics in `vector_index.cc` with
hardcoded index names in error messages.
The shared versions take `index_type_name` as a parameter, allowing
each `custom_index` subclass to pass its own name via the virtual
`index_type_name()` method at the call site. The options maps use
`std::bind_front` to bind config params (supported values, limits),
leaving `index_name` as the first unbound argument passed by
`check_index_options()`.
Add `index_type_name()` as a pure virtual method on `custom_index`.
Move the shared utility implementations into `index_option_utils.cc`
and update `vector_index.cc` to use them.
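A sketch of the wiring, under assumed signatures (the real validators take more parameters):
```cpp
#include <functional>
#include <set>
#include <stdexcept>
#include <string>

// Config parameters are bound up front; index_name stays as the first
// unbound argument, supplied later by check_index_options() via the
// virtual index_type_name().
void validate_enumerated_option(const std::set<std::string>& allowed,
                                const std::string& index_name,
                                const std::string& value) {
    if (!allowed.contains(value)) {
        throw std::invalid_argument(index_name + ": unsupported value '" + value + "'");
    }
}

const auto analyzer_validator = std::bind_front(validate_enumerated_option,
        std::set<std::string>{"standard", "english", "simple", "whitespace"});
// later: analyzer_validator(index_type_name(), options.at("analyzer"));
```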
The operator had a copy-paste bug: it checked
disconnected.contains({id1, id2}) twice instead of checking both
directions ({id1, id2} and {id2, id1}).
Reduce the operator to a single directional check: {id1, id2}. It works
for all current callers, and checking both directions correctly would
break the new block_receive() function.
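The bug and the fix, reduced to their essence (types simplified):
```cpp
#include <set>
#include <utility>

using edge_set = std::set<std::pair<int, int>>;

bool connected_before(const edge_set& disconnected, int id1, int id2) {
    // Copy-paste bug: both operands test the same direction.
    return !disconnected.contains({id1, id2}) && !disconnected.contains({id1, id2});
}

bool connected_after(const edge_set& disconnected, int id1, int id2) {
    // Single directional check: enough for all current callers, and
    // checking both directions would break the new block_receive().
    return !disconnected.contains({id1, id2});
}
```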
Add six tests covering add_entry with wait_type::applied and
wait_type::committed for three snapshot scenarios affected in the
previous commit:
1. Snapshot at the entry's index (wait_for_entry, term_for returns
snapshot term).
2. Snapshot past the entry's index (wait_for_entry, term_for returns
nullopt).
3. Follower's waiter is resolved via drop_waiters when a snapshot
is loaded.
Without the fix in the previous commit, 4 of 6 tests fail:
all 3 wait_type::applied tests and the wait_type::committed
drop_waiters test. The remaining two tests pass because the changes
don't affect them.
We don't write tests covering the scenarios when add_entry should
still throw commit_status_unknown (that is when the entry's term
doesn't match the snapshot's term) because:
- these tests would be very complicated,
- a bug that would make these tests fail should also make the
nemesis tests fail, as there would be an issue with linearizability.
Change drop_waiters(std::optional<index_t> idx) to
drop_waiters(const snapshot_descriptor* snp). The only caller that passes
an index is load_snapshot, which already has the full snapshot descriptor.
Using it directly makes the parameter self-documenting and prepares for the
following commit which will also need the snapshot term (a field of
snapshot_descriptor).
Several sstable_compaction_test cases run prohibitively slowly on S3 and GCS backends — some taking 4+ minutes — because they create hundreds of SSTables sequentially over high-latency HTTP connections and perform redundant validation (checksumming) round-trips on every one. The twcs_reshape_with_disjoint_set S3 variant was even disabled entirely because of this.
The changes apply three complementary optimizations, per-test:
**Skip SSTable validation on remote storage.** The compaction tests verify strategy logic, not data integrity. SSTable validation triggers additional read-back I/O which is cheap on local disk but expensive over HTTP. A `do_validate` flag now conditionally skips validation when the storage backend is not local.
**Parallelize SSTable creation with async coroutines.** A new `make_sstable_containing_async` coroutine overload is added alongside the existing synchronous `make_sstable_containing`. Sequential creation loops are replaced with `parallel_for_each` using coroutine lambdas that call the async overload directly, overlapping S3/GCS uploads without spawning a dedicated Seastar thread per SSTable. The async validation path performs the same content checks as the synchronous version (mutation merging and `is_equal_to_compacted` assertions). Operations that depend on the created SSTables (e.g. `add_sstable_and_update_cache`, `owned_token_ranges` population) remain sequential.
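In sketch form (the mutation factory and environment types are stand-ins; make_sstable_containing_async is the new overload named above):
```cpp
#include <ranges>
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/parallel_for_each.hh>

// Replace the sequential creation loop: uploads overlap, and since all
// lambdas run on one shard their co_awaits interleave safely, so the
// plain push_back needs no synchronization.
seastar::future<> create_sstables_sketch(test_env& env, int count,
        std::vector<sstables::shared_sstable>& out) {
    co_await seastar::coroutine::parallel_for_each(std::views::iota(0, count),
            [&] (int i) -> seastar::future<> {
        auto mut = make_test_mutation(env, i);   // assumed helper
        out.push_back(co_await make_sstable_containing_async(env, {std::move(mut)}));
    });
}
```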
**Reduce SSTable count for remote variants.** Tests like twcs_reshape_with_disjoint_set and stcs_reshape_overlapping used a hardcoded count of 256. The count is now a function parameter (default 256 for local, 64 for S3/GCS), which is sufficient to exercise the compaction strategy logic while avoiding excessive remote I/O.
Infrastructure changes: S3 endpoint max_connections raised from the default to 32 to support the higher upload concurrency, and trace-level logging added for s3, gcp_storage, http, and default_http_retry_strategy to aid future debugging.
The previously disabled twcs_reshape_with_disjoint_set_s3_test is re-enabled with these optimizations.
Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1428
Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1843
No backport needed — this is a test-only performance improvement.
Closesscylladb/scylladb#29416
* github.com:scylladb/scylladb:
test: optimize compaction_strategy_cleanup_method for remote storage
test: optimize stcs_reshape_overlapping for remote storage
test: optimize twcs_reshape_with_disjoint_set for remote storage
test: parallelize SSTable creation in cleanup_during_offstrategy_incremental
test: parallelize SSTable creation in run_incremental_compaction_test
test: parallelize SSTable creation in offstrategy_sstable_compaction
test: parallelize SSTable creation in twcs_partition_estimate
test: add trace-level logging for S3 and HTTP in compaction tests
test: make sstable test utilities natively async

The original make_memtable used seastar::thread::yield() for preemption, which required all callers to run inside a seastar::thread context. This prevented the utilities from being used directly in coroutines or parallel_for_each lambdas. Make the primary functions — make_memtable, make_sstable_containing, and verify_mutation — return future<> directly. Callers now .get() explicitly when in seastar::thread context, or co_await when in a coroutine. make_memtable now uses coroutine::maybe_yield() instead of seastar::thread::yield(). verify_mutation is converted to coroutines as well. Requested in: https://github.com/scylladb/scylladb/pull/29416#pullrequestreview-4112296282
test: move make_memtable out of external_updater in row_cache_test
test: increase S3 max connections for compaction tests
The variable was constructed but never used — the original iterator
is returned instead. Fix the misleading comment to explain the
open-shard semantics of returning the original iterator.
Both disable_stream and wait_for_active_stream used time.process_time()
for their timeouts, but process_time measures CPU time, not wall-clock
time. Since these loops spend most of their time sleeping and waiting on
API calls, the timeouts could last far longer than intended. Use
time.time() instead to enforce actual wall-clock deadlines.
Previously, disabling Alternator Streams would create a blank
cdc::options with only enabled=false, which also meant losing access
to the stream's stored data (including preimage and postimage).
Now, when a stream is disabled:
- The existing CDC options are preserved (only 'enabled' is flipped to
false), so StreamViewType remains available.
- DescribeStream enumerates all shards with EndingSequenceNumber set,
indicating they are closed.
- GetRecords omits NextShardIterator for disabled streams.
- DescribeTable (supplement_table_stream_info) reports the stream ARN
and StreamEnabled: false when the CDC log table still exists.
- ListStreams uses get_base_table instead of is_log_for_some_table so
that disabled streams whose log table still exists are listed.
When a stream is re-enabled on an Alternator table that has an existing
(disabled) CDC log table, the old log table is dropped and a fresh one
is created with a new UUID, producing a new StreamArn. This is
Alternator-specific behavior; CQL CDC tables continue to reuse the
existing log table.
The old stream data is lost immediately upon re-enable. DynamoDB keeps
it readable for 24 hours.
Tests:
- test_streams_closed_read, test_streams_disabled_stream: remove xfail
now that disabled streams are usable.
- test_streams_reenable: new test verifying that re-enabling produces
a new ARN and the old data is still readable via the old ARN (xfail
because Scylla currently purges old data on re-enable).
Fixesscylladb/scylladb#7239
Add a Boost unit test that forces capacity-based balancing through
configuration and verifies that a drained and excluded node will be
drained of its tablets when tablet size stats are missing.
The test covers the regression where the allocator rejected the plan due
to incomplete tablet stats, even though forced capacity-based balancing
does not depend on tablet sizes.
When force_capacity_based_balancing is enabled, the tablet allocator
balances by node and shard capacity rather than by tablet sizes.
When the data needed for load balancing is incomplete, the balancer
fails and waits until load_stats is available and correct for all the
nodes. An exception to this is when a node is being drained and
excluded: it is unreachable, and will not return. In this case
the balancer has to do its best and ignore the missing data.
This patch fixes a bug where forcing capacity based balancing made the
balancer not ignore missing data in these cases, and instead abort the
balancing.
In the test_delete_partition_rows_from_table_with_mv case we perform
a deletion of a large partition to verify that the deletion will
self-throttle when generating many view updates.
Before the deletion, we first build the materialized view, which causes
the view update backlog to grow. The backlog should be back to empty
when the view building finishes, and we do wait for that to happen, but
the information about the backlog drop may not be propagated to the
delete coordinator in time - the gossip interval is 1s and we perform
no other writes between the nodes in the meantime, so we don't make use
of the "piggyback" mechanism of propagating view backlog either. If the
coordinator thinks that the backlog is high on the replica, it may reject
the delete, failing this test.
We change this in this patch - after the view is built, we perform an
extra write from the coordinator. When the write finishes, the coordinator
will have the up-to-date view backlog and can proceed with the DELETE.
Additionally, we enable the "update_backlog_immediately" injection, which
makes the node backlog (the highest backlog across shards) update immediately
after each change.
Fixes: SCYLLADB-1795
Closesscylladb/scylladb#29775
The test used a real-time sleep to move the queued permit into the
preemptive-abort window. If the reactor did not get CPU for long
enough, admission could run only after the permit's timeout had
expired, making the expected abort path flaky.
The test also exhausted memory together with count resources, so the
queued permit could wait for memory. Preemptive abort is intentionally
not applied to permits waiting for memory, so keep enough memory
available and assert that the permit is queued only on count.
Use an immediate preemptive-abort threshold and a long finite timeout
to exercise admission-time abort without relying on scheduler timing.
Fixes: SCYLLADB-1796
Closesscylladb/scylladb#29736
CreateTable and UpdateTable call wait_for_schema_agreement() after announcing the schema change, to ensure all live nodes have applied the new schema before returning to the user. This wait has a hard-coded 10 second timeout, and on some overloaded test machines we saw it not completing in time, causing tests to become flaky.
This patch increases this timeout from 10 seconds to 30 seconds. It's still hard-coded and not configurable via alternator_timeout_in_ms because it is unlikely any user will want to change it - it just needs to be long.
The patch also improves the behavior of a schema-agreement timeout, when it happens:
1. Provide an InternalServerError with more descriptive text.
2. This InternalServerError tells the user that the result of the operation is unknown, so the user will repeat the CreateTable and will get a ResourceInUseException because the table exists. In that case too, we need to wait for schema agreement. So we added this missing wait.
Fixes SCYLLADB-1804
Refs #5052 (claiming CreateTable shouldn't wait at all)
This patch is only important to improve test stability in extremely slow test machines where schema agreement sometimes (very rarely) takes over 10 seconds. It's not important to backport it to branches that don't run CI very often on slow machines.
Closesscylladb/scylladb#29744
* https://github.com/scylladb/scylladb:
alternator: improve CreateTable/UpdateTable schema agreement timeout
migration_manager: unique timeout exception for wait_for_schema_agreement()
In debug mode, this test can timeout during tablets merge. While the
test already decreases the number of tables in debug mode (20 tables,
instead of 200 for dev mode), this is not enough, and the test can still
timeout during merge. This change reduces the number of tables from 20
to 5 in debug mode.
It also drops the log level for load_balancer to debug. This should make
any potential future problems with this test easier to investigate.
Fixes: SCYLLADB-1717
Closesscylladb/scylladb#29682
Replace the naive host.is_up check with wait_for_cql_and_get_hosts() which
actually executes a query against each host, ensuring the driver's connection
pool is fully re-established before proceeding to stop the last server.
The is_up flag is set asynchronously via gossip and doesn't guarantee the
connection pool has live TCP connections. After a server restart, the flag
may be True while the pool still holds stale connections. When the pool
monitor later discovers them dead it briefly marks the host DOWN, causing
NoHostAvailable if another server is being stopped concurrently.
Fixes SCYLLADB-1840
Closesscylladb/scylladb#29769
This PR includes two changes that make gossiper much less likely to mark
nodes as down in tests unexpectedly, and cause test flakiness in issues
like SCYLLADB-864:
- fixing false node conviction when echo succeeds,
- increasing the failure_detector_timeout fixture.
Fixes: SCYLLADB-864
No need for backport: related CI failures are rare, and merging #29522
made them even more unlikely (I haven't seen one since then, but it's
still possible to reproduce locally on dev machines).
Closesscylladb/scylladb#29755
* github.com:scylladb/scylladb:
test/cluster: increase failure_detector_timeout
gossiper: fix false node conviction when echo succeeds
The `vector_store_client_test_dns_resolving_repeated` test was intermittently
timing out on CI. The exact root cause is not fully understood, but the
hypothesis is that a single trigger signal can be lost somewhere (not exactly
known where). This is not an issue for the production code because the
refresh trigger will be called multiple times whenever all configured
nodes are unreachable.
Fixes SCYLLADB-1794
Backport to 2026.1 and 2026.2, as the same CI flakiness can occur on these branches.
Closesscylladb/scylladb#29752
* github.com:scylladb/scylladb:
vector_search: test: default timeout in test_dns_resolving_repeated
vector_search: test: fix flaky test_dns_resolving_repeated
Fixes: SCYLLADB-1815
If we're in a brand new chunk (no buffer yet allocated), we would miscalculate the actual size of an entry to write, possibly causing segment size overshoot. Break out some logic to share between this calc and new_buffer. Also remove redundant (and possibly wrong) constant in oversized allocation.
As for the test:
Checking segment sizes should not use a size filter that rounds (up) sizes.
More importantly, the estimate for what is an acceptable limit for commitlog disk usage should be aligned. Simplified the calc, and also made logging more useful in case of failure.
Closesscylladb/scylladb#29753
* github.com:scylladb/scylladb:
commitlog_test.py: Fix size check aliasing, and threshold calc.
commitlog: Fix segment/chunk overhead maybe not included in next_position calculation
The auth cache crashes when it encounters rows in role_permissions that have a live row marker but no permissions column. These “ghost rows” were created by the now-removed auth v2 migration, which used INSERT (creating row markers) instead of UPDATE.
When permissions were later revoked, the row marker remained while the permissions column became null. An empty collection appears as null, since its lifetime is based only on its elements' cells.
As a result, when the cache reloads and expects the permissions column to exist, it hits a missing_column exception.
The series removes dead code that was the primary crash site, adds has() guards to the remaining access paths, and includes a test reproducer.
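A hedged sketch of the guard (the row API is simplified; parse_permissions is an assumed stand-in):
```cpp
// A live row marker with a null permissions collection no longer throws
// missing_column: has() is checked before reading the column.
permission_set load_permissions(const cql3::untyped_result_set_row& row) {
    if (!row.has("permissions")) {
        return permission_set{};   // ghost row: treat as no permissions granted
    }
    return parse_permissions(row.get_set<sstring>("permissions"));
}
```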
Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1816
Backport: all supported versions 2026.1, 2025.4, 2025.1
Closesscylladb/scylladb#29757
* github.com:scylladb/scylladb:
test: add reproducer for auth cache crash on missing permissions column
auth: tolerate missing permissions column in authorize()
auth: add defensive has() guard for role_attributes value column
auth: remove unused permissions field from cache role_record
Commit cf237e060a introduced 'from test.pylib.driver_utils import
safe_driver_shutdown' in pgo/exec_cql.py. This module runs during PGO
profile training (a build step) where the test package is not on the
Python path, causing an immediate ModuleNotFoundError on both x86 and
ARM. Revert to plain cluster.shutdown() which is sufficient for the
single-use PGO training scenario.
Fixes: SCYLLADB-1792
Closes scylladb/scylladb#29746
Verify that double-quoted column names with hyphens (e.g. "my-col")
work correctly for CREATE TABLE, INSERT, and SELECT. Also verify that
unquoted hyphenated names are rejected with a syntax error.
The 'address' UDT contains a nested collection (map<text, frozen<phone>>),
so it must be frozen when used as a column type. Non-frozen UDTs with
nested non-frozen collections are not supported.
Column names containing hyphens must be double-quoted. Also fix
the PRIMARY KEY reference from 'customer_id' (non-existent) to
'cust_id' (the actual column).
The grammar requires IF NOT EXISTS to appear before USING TTL,
not after. The example had 'USING TTL 86400 IF NOT EXISTS' which
produces a syntax error.
get_cql_up_state() creates two Cluster instances: a short-lived one
used to probe CQL readiness, and a persistent control_cluster kept
alive for the lifetime of the server. The probe cluster was created
with protocol_version=4 (the highest version Scylla supports), but
the control_cluster was not, causing the driver to do a superfluous
version-negotiation round-trip on every server start.
Fix by extracting the shared constructor arguments into a cluster_kwargs
dict and using **cluster_kwargs for both calls, so the two Cluster
instances are created with identical parameters. This deduplication also
helps prevent recurrences of this bug, where someone modifies the options
in one call but forgets to update the other.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
The poll_status() background thread resolves `serving_signal` by
scheduling `f.set_result(...)` on the event loop via
`call_soon_threadsafe`. In parallel, `_cleanup_notify_socket()` can
cancel `serving_signal` at any time - for example when a server fails
to start and `stop()` -> `shutdown_control_connection()` is called while
the thread is still blocked in `recv()` (the socket close unblocks the
`recv()` with an exception, sending it down the error path).
When that race fires the scheduled `f.set_result(...)` callback runs
after `cancel()` has already put the future into the *cancelled* state,
raising `asyncio.InvalidStateError: Result is not allowed in cancelled
state`.
This bug predates the SERVING work, but the original
CQL_ALTERNATOR_QUERIED default meant the notify socket was torn down
quickly most of the time, making the window very narrow. Now that
SERVING is the default the socket stays open throughout the full startup
wait, widening the race significantly.
Fix: replace every bare `f.set_result(v)` call with
`lambda: f.done() or f.set_result(v)`, which is a no-op when the
future is already done (cancelled, or resolved by another path).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
With ServerUpState.SERVING now the default, server_add() and server_start()
wait for sd_notify readiness after CQL is already up. During that window
the startup polling loop was calling get_cql_alternator_up_state() on every
iteration (every 100ms). Each successful call recreated self.control_cluster
and self.control_connection without closing the previous ones, leaking driver
connections and adding unnecessary CQL load to a node that was already known
to be queryable.
Fix in two places:
- Startup loop: skip the get_cql_alternator_up_state() call once
server_up_state has reached CQL_ALTERNATOR_QUERIED. After that point only
the cheap non-blocking check_serving_notification() is needed.
- get_cql_up_state(): guard control_cluster/control_connection creation with
`if self.control_connection is None` so the persistent driver connection is
only established once, even if the function is called multiple times.
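Both guards, in a self-contained sketch; the enum subset and the probe
bodies are stand-ins, only the guard logic mirrors the fix:
```
import asyncio
from enum import IntEnum

class ServerUpState(IntEnum):
    PROCESS_STARTED = 1            # illustrative subset of the enum
    CQL_ALTERNATOR_QUERIED = 2
    SERVING = 3

class ServerSketch:
    def __init__(self) -> None:
        self.server_up_state = ServerUpState.PROCESS_STARTED
        self.control_connection = None
        self.cql_probes = 0
        self._polls = 0

    def check_serving_notification(self) -> bool:
        # Stand-in for the cheap, non-blocking sd_notify check; pretend
        # the SERVING notification arrives on the fifth poll.
        self._polls += 1
        return self._polls >= 5

    async def get_cql_alternator_up_state(self) -> ServerUpState:
        # Create the persistent control connection exactly once, even if
        # this probe runs repeatedly - no leaked driver sessions.
        if self.control_connection is None:
            self.control_connection = object()
        self.cql_probes += 1
        return ServerUpState.CQL_ALTERNATOR_QUERIED

    async def wait_until_serving(self) -> None:
        while not self.check_serving_notification():
            if self.server_up_state < ServerUpState.CQL_ALTERNATOR_QUERIED:
                # Probe CQL only until it first succeeds; afterwards the
                # sd_notify check above is all that is still needed.
                self.server_up_state = await self.get_cql_alternator_up_state()
            await asyncio.sleep(0.01)

server = ServerSketch()
asyncio.run(server.wait_until_serving())
assert server.cql_probes == 1  # probed once, not once per polling tick
```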
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Move the local vector index test from C++ vector_store_client_test to
Python test_vector_search_with_vector_store_mock.
The test creates a local vector index on ((pk1, pk2), embedding) and
verifies that SELECT with partition key restriction and ANN ordering
works correctly.
Move the SCYLLADB-635 regression test from C++ vector_store_client_test
to Python test_vector_search_with_vector_store_mock.
The test creates a vector index on (embedding, ck1) and verifies that
SELECT with ANN ordering works correctly when additional filtering
columns are included in the index definition.
Move the test that verifies HTTP error descriptions from the vector
store are propagated through CQL InvalidRequest messages from the C++
vector_store_client_test to the Python test_vector_search_with_vector_store_mock.
The test configures the mock to return HTTP 404 with 'index does not
exist' and asserts the CQL SELECT raises InvalidRequest containing '404'.
Move vector search (ANN ordered select query) with IN restrictions on
partition key from C++/Boost test suite to pytest (cqlpy).
Add VectorStoreMock server as pytest fixture to simulate vector store
responses.
When the sd_notify future completed, check_serving_notification() correctly
updated _received_serving to True but still returned False on that same call.
The SERVING state was only recognized on the next polling iteration, 100ms
later, for no reason.
Return self._received_serving instead of False after updating it.
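The whole fix fits in a few lines; here it is as a self-contained sketch
(attribute and future names follow the commit, the surrounding class is
assumed):
```
import asyncio

class NotifySketch:
    def __init__(self, loop: asyncio.AbstractEventLoop) -> None:
        self._received_serving = False
        self.serving_signal: asyncio.Future = loop.create_future()

    def check_serving_notification(self) -> bool:
        if self.serving_signal.done() and not self.serving_signal.cancelled():
            self._received_serving = self.serving_signal.result()
        # The old code returned False here unconditionally, so SERVING was
        # only observed on the *next* 100ms polling iteration.
        return self._received_serving

async def main() -> None:
    sketch = NotifySketch(asyncio.get_running_loop())
    sketch.serving_signal.set_result(True)
    assert sketch.check_serving_notification()  # recognized immediately

asyncio.run(main())
```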
ServerUpState.SERVING is now the default for server_add() and
server_start(), so the explicit argument in various tests is no longer
needed. Remove it along with the unused ServerUpState imports and the
docstring comments that explained why it was there.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Add docstrings to server_add(), server_start(), and servers_add() explaining
that they wait for ServerUpState.SERVING before returning, which means Scylla
has finished listening on all configured ports (including non-default ones).
Note that server_add() and server_start() accept expected_server_up_state to
return earlier if needed, while servers_add() always waits for SERVING.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
ScyllaServer.install_and_start() and ScyllaServer.start() still had
ServerUpState.CQL_ALTERNATOR_QUERIED as their default for
expected_server_up_state. In practice these defaults are never reached -
both call sites in ScyllaCluster always pass the value explicitly,
forwarding it from the higher-level add_server() and server_start()
whose defaults were already fixed.
Update them to SERVING anyway for consistency, so that the low-level
methods agree with the policy established at the higher layers and won't
silently revert to the wrong behavior if a new call site is added without
an explicit argument.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
When Scylla starts in maintenance mode it sends sd_notify("STATUS=entering
maintenance mode") instead of sd_notify("STATUS=serving"), and does not
open the standard CQL port. This caused two independent bugs after the
default was changed to ServerUpState.SERVING:
1. poll_status() resolved serving_signal to False on the maintenance
notification, so check_serving_notification() would never return True,
and start() would time out waiting for SERVING.
2. The readiness check in start() was guarded by
`server_up_state >= CQL_ALTERNATOR_QUERIED`, which is never reached in
maintenance mode (the standard CQL port is not open). Even if bug 1
were fixed, SERVING would never be recognized.
Fix both:
- Treat STATUS=entering maintenance mode as a successful readiness signal
in poll_status(), resolving serving_signal to True just like
STATUS=serving. Both mean "all configured ports are now open".
- Remove the CQL_ALTERNATOR_QUERIED precondition from the
check_serving_notification() call in start(). The sd_notify signal is
authoritative: Scylla sends it only when fully ready, regardless of
which ports it opened. No CQL precondition is needed.
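A hedged sketch of the resulting status dispatch; the actual message
parsing in poll_status() is assumed, not copied:
```
def handle_notify_status(line: str, resolve) -> None:
    if line == "STATUS=serving":
        resolve(True)   # all configured ports are open
    elif line == "STATUS=entering maintenance mode":
        # Maintenance mode never sends STATUS=serving and never opens the
        # standard CQL port, but all of its configured listeners are open
        # once this status arrives - an equally valid readiness signal.
        resolve(True)
    elif line.startswith("STATUS="):
        resolve(False)

results: list[bool] = []
handle_notify_status("STATUS=entering maintenance mode", results.append)
assert results == [True]
```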
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
The sd_notify "entering maintenance mode" status was emitted before
start_cql() was called, so clients that waited for this notification
could attempt to connect to the maintenance socket before it was
actually accepting connections.
Move the checkpoint() call to after start_cql(), matching how the
normal startup path emits "serving" only after all configured listeners
are open.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Replace explicit 1-second timeouts in repeat_until() with the default
STANDARD_WAIT (10s). The 1-second timeout could be too aggressive for
loaded CI environments where lowres_clock granularity (~10ms) combined
with OS scheduling delays and resource contention (-c2 -m2G) could cause
the loop to expire before the DNS refresh task completes its cycle.
This also unifies test timeouts across test cases.
Move trigger_dns_resolver() inside the repeat_until loop instead of
calling it once before the loop.
The test was intermittently timing out on CI. The exact root cause is not
fully understood, but the hypothesis is that a single trigger signal can
be lost somewhere (not exactly known where). This is not an issue for the
production code because the refresh trigger is called multiple times -
on every query where all configured nodes are unreachable.
By triggering inside the loop, we ensure the signal is re-sent on
each iteration until the resolver actually performs the refresh and
picks up the new (failing) DNS resolution. This makes the test
resilient to timing-dependent signal loss without changing production
code.
Fixes: SCYLLADB-1794
For the same reason server_add() was changed to default to SERVING
(see previous commit), server_start() had the same bug: after restarting
a node that listens on non-default ports, the polling of the hardcoded
CQL/Alternator ports could succeed before the custom ports were ready,
causing intermittent failures.
Apply the same fix to server_start() in manager_client.py,
ScyllaCluster.server_start(), and the _cluster_server_start HTTP handler.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
server_add() was defaulting to ServerUpState.CQL_ALTERNATOR_QUERIED,
which polls the standard CQL and Alternator ports to determine when the
server is ready. This is wrong when a test configures Scylla to listen
on non-default ports: the polling succeeds on the default ports while the
custom ports may not yet be ready, making such tests intermittently flaky.
The correct behavior is ServerUpState.SERVING, which waits for Scylla's
sd_notify("READY=1") signal. This signal is sent only after all
configured listeners — including custom ports — are fully open, so it
is the right readiness signal regardless of the port configuration.
Up to now, the fix for each affected test was to pass
expected_server_up_state=ServerUpState.SERVING explicitly once the
flakiness was noticed (e.g. #29737). Change the default so that all
future tests get the correct behavior automatically.
Changed in manager_client.server_add(), ScyllaCluster.add_server(), and
the _cluster_server_add HTTP handler. The multi-server servers_add() path
already inherits the new default through add_server().
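The shape of the change, sketched; the parameter list is abridged and
illustrative, only the default is the point:
```
from enum import IntEnum

class ServerUpState(IntEnum):      # illustrative subset
    CQL_ALTERNATOR_QUERIED = 1
    SERVING = 2

async def server_add(
    cmdline: list[str] | None = None,   # abridged stand-in parameters
    config: dict | None = None,
    # Default changed from CQL_ALTERNATOR_QUERIED to SERVING; tests that
    # want the earlier readiness point can still pass it explicitly.
    expected_server_up_state: ServerUpState = ServerUpState.SERVING,
):
    ...
```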
Fixes SCYLLADB-1822
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Ghost rows in role_permissions with a live row marker but no permissions
column can occur when permissions created via INSERT (e.g. by the removed
auth v2 migration) are later revoked. The row marker survives the revoke,
leaving a row visible to queries but with permissions=null.
Add a has() guard before accessing the permissions column, matching the
pattern already used in list_all(). Return NONE permissions for such
ghost rows instead of crashing.
Add a has() check before accessing the value column in role_attributes
to tolerate ghost rows with missing regular columns. In practice this
is unlikely to be a problem since attributes are not typically revoked,
but the guard is added for consistency and defensive programming.
The permissions field in role_record was populated by fetch_role() but
never read. Authorization uses cached_permissions instead, which is
loaded via the permission_loader callback. Remove the dead field and
its fetch code.
The removed code also did not check for missing columns before accessing
the permissions set, which could crash on ghost rows left by the removed
auth v2 migration. The migration used INSERT (creating row markers),
and when permissions were later revoked, the row marker survived while
the permissions column became null.
When `permissions_validity_in_ms` is set to 0, executing a prepared statement under authentication crashes with:
```
Assertion `caching_enabled()' failed.
at utils/loading_cache.hh:319
in authorized_prepared_statements_cache::insert
```
`loading_cache::get_ptr()` asserts when caching is disabled (expiry == 0), but `authorized_prepared_statements_cache::insert()` was using it purely for its side effect of populating the cache, which is meaningless when caching is off.
Add a new `loading_cache::insert(k, load)` method that is a no-op when caching is disabled and otherwise forwards to `get_ptr()`. Switch `authorized_prepared_statements_cache::insert()` to use it. This
completes the disabled-mode safety contract of the cache for the write side, mirroring the fallback that `get()` already provides for the read side.
Includes a regression test in `test/boost/loading_cache_test.cc` plus a positive test for the new `insert()` overload.
Fixes SCYLLADB-1699
The crash was introduced a long time ago and is present on all live versions, from 2025.1 onward. No client tickets, but it should be backported.
Closes scylladb/scylladb#29638
* github.com:scylladb/scylladb:
test: boost: regression test for loading_cache::insert with caching disabled
utils: loading_cache: add insert() that is a no-op when caching is disabled
Scaling the timeout by build mode (#29522) turned out not to be sufficient.
Nodes can still be unexpectedly marked as down, even with a 4s timeout in
dev mode. I managed to reproduce SCYLLADB-864 in such conditions.
Increasing failure_detector_timeout will proportionally slow down tests
that use it. That's bad, but currently these tests' flakiness is a much
bigger problem than the tests' slowness. Also, not many tests use this
fixture, and we hope to make it unneeded eventually (see #28495).
failure_detector_loop_for_node() could falsely convict a healthy node
even when the echo succeeded. The code computed diff = now - last
(time since last successful echo) and checked diff > max_duration
unconditionally, regardless of whether the current echo failed or
succeeded.
This caused flakiness in tests that decrease the failure detector
timeout. We currently run #CPUs tests concurrently, and since cluster
tests start multiple nodes with 2 shards, multiple shards contend for
one CPU. As a result, some tasks can become abnormally slow and block
the failure detector loop execution for a few seconds.
Fix by only checking diff > max_duration when the echo actually
failed.
Note that we send echo with the timeout equal to `max_duration` anyway,
so the receiver will be marked as down if it really doesn't respond.
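The corrected conviction rule, as a hedged sketch with illustrative
names:
```
def should_convict(echo_ok: bool, now: float, last_ok: float,
                   max_duration: float) -> bool:
    # A slow-but-successful echo must never convict the node; the "time
    # since last good echo" test only applies when the echo failed.
    if echo_ok:
        return False
    return now - last_ok > max_duration
```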
Add more logging to barrier and drain rpc to try and pinpoint https://github.com/scylladb/scylladb/issues/26281
Backport since we want to have it if this happens in the field.
Fixes: SCYLLADB-1821
Refs: #26281
Closes scylladb/scylladb#29735
* https://github.com/scylladb/scylladb:
session, raft_topology: add periodic warnings for hung drain and stale version waits
session: add info-level logging to drain_closing_sessions
raft_topology: log sub-step progress in local_topology_barrier
raft_topology: log read_barrier progress in topology cmd handler
Fixes: SCYLLADB-1815
Checking segment sizes should not use a size filter that rounds
(up) sizes.
More importantly, the estimate for what is an acceptable limit for
commitlog disk usage should be aligned. Simplified the calc, and
also made logging more useful in case of failure.
CreateTable and UpdateTable call wait_for_schema_agreement() after
announcing the schema change, to ensure all live nodes have applied
the new schema before returning to the user. This wait has a hard-
coded 10 second timeout, and on some overloaded test machines we
saw it not completing in time, and causing tests to become flaky.
This patch increases this timeout from 10 seconds to 30 seconds.
It's still hard-coded and not configurable via alternator_timeout_in_ms
because it is unlikely any user will want to change it - it just needs
to be long.
The patch also improves the behavior of a schema-agreement timeout,
when it happens:
1. Provide an InternalServerError with more descriptive text.
2. This InternalServerError tells the user that the result of the
operation is unknown, so the user will repeat the CreateTable, and
will get a ResourceInUseException because the table exists. In that
case too, we need to wait for schema agreement. So we added this
missing wait.
Fixes SCYLLADB-1804
Refs #5052 (claiming CreateTable shouldn't wait at all)
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Refs: SCYLLADB-1757
Refs: SCYLLADB-1815
If we're in a brand new chunk (no buffer yet allocated), we would miscalculate the
actual size of an entry to write, possibly causing segment size overshoot.
Break out some logic to share between this calc and new_buffer. Also remove redundant
(and possibly wrong) constant in oversized allocation.
The test uses a 10ms read timeout to exercise code paths that handle
timed-out reads without throwing C++ exceptions. As part of setup, it
inserts rows and flushes them to two SSTables, then runs a warm-up
SELECT to populate internal caches (e.g. the auth cache) before the
real test begins.
The reason for this warm-up read was the possibility that the first
read does additional operations (such as reading and caching
authentication) that might throw exceptions internally. I couldn't
verify that such exceptions actually happen in today's code, but
they might (re)appear in the future, so we should keep the warm-up
SELECT.
On slow CI machines (aarch64, debug build), that warm-up SELECT can
take longer than 10ms to read from the two SSTables. When it does, the
read times out: the coordinator receives 0 responses from the local
replica within the deadline and propagates a read_timeout_exception.
Since the exception is not caught, it escapes the test lambda, is
logged as "cql env callback failed", and causes Boost.Test to report a
C++ failure at the do_with_cql_env_thread call site. This matches the
CI failure seen in SCYLLADB-1774:
ERROR ... replica_read_timeout_no_exception: cql env callback failed,
error: exceptions::read_timeout_exception (Operation timed out for
replica_read_timeout_no_exception.tbl - received only 0 responses
from 1 CL=ONE.)
The CI log also shows that only 12 reads were admitted (the warm-up
read plus the 11 reads from the two prepare() calls and CREATE/INSERT
statements made earlier), and the current permit was stuck in
need_cpu state -- the reactor hadn't had a chance to schedule the read
before the 10ms window elapsed.
The fix catches read_timeout_exception from the warm-up SELECT and
retries until the read succeeds. The warm-up is required for
correctness: some lazy-init code paths (e.g. auth cache population)
use C++ exceptions for control flow internally. Those exceptions must
be absorbed before the cxx_exceptions baseline is sampled inside
execute_test(); otherwise they would appear in the delta and cause a
false test failure. Simply ignoring a timed-out warm-up is not safe,
because the lazy-init exceptions would then fire during the 1000 test
reads, inflating cxx_exceptions_after relative to
cxx_exceptions_before.
No other calls in setup are susceptible to the 10ms read timeout:
- CREATE KEYSPACE, CREATE TABLE, INSERT, and flush use the write
timeout (10s) and are not reads.
- e.prepare() goes through the query processor without reading table
data, so it is not subject to the read timeout.
- The semaphore manipulation in Test 2 is internal and has no timeout.
- All 1000 reads in execute_test() are expected to fail, so a timeout
there is the happy path, not a failure.
The 10ms timeout itself is fine for the test's purpose: it is
deliberately aggressive so that reads reliably time out on the hot path
being tested. The problem was only that the pre-test warm-up was not
guarded against the same timeout.
Fixes: SCYLLADB-1774
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes scylladb/scylladb#29731
server_add() defaults to CQL_ALTERNATOR_QUERIED. That proves the regular CQL driver path is queryable (and the regular Alternator port, if one is listed in the YAML config). It does not prove that every custom listener the test will connect to is already accepting raw TCP connections.
test_proxy_protocol_ssl_shard_aware connects directly to the shard-aware TLS proxy-protocol CQL port immediately after server startup. Wait for ServerUpState.SERVING in the fixture so the custom proxy-protocol listener is registered before opening raw sockets.
test_uninitialized_conns_semaphore opens a raw TCP connection to native_shard_aware_transport_port immediately after startup. The default readiness check can succeed through native_transport_port while the shard-aware listener is still being started, because CQL listeners are registered independently. Wait for ServerUpState.SERVING before opening raw sockets.
test_perf_alternator_remote now asks server_add() to wait for SERVING and uses the returned server address directly. This removes the redundant running_servers() plus get_ready_cql() sequence noted in review.
Fixes: SCYLLADB-1797
No backport as of now, only appeared on master.
Closes scylladb/scylladb#29737
* github.com:scylladb/scylladb:
test/cluster: avoid redundant perf alternator CQL wait
test/cluster: wait for shard-aware CQL listener
test/cluster: wait for proxy protocol ports to serve
Before this patch, if wait_for_schema_agreement() times out, it threw
a generic std::runtime_error, making it inconvenient for callers to
catch this error only. So in this patch we create and use a new exception
type, schema_agreement_timeout, based on seastar::timed_out_error.
Although wait_for_schema_agreement(), added in commit a429018a8a, was
a utility function used in a dozen places, it became less interesting
after we introduced schema changes over Raft, and over the years most
of its callers were removed, except one in view.cc, which uses an
infinite timeout and so doesn't care about the timeout exception type.
In the next patch we want to add a new caller which *does* care about
the timeout exception type - hence this patch.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
insert() held no local strong ref to the prepared modification_statement
across the suspension in execute(). On a single shard:
1. Fiber A suspends inside _insert_stmt->execute().
2. DROP TABLE / DROP KEYSPACE on the target, or LRU eviction, removes
the prepared_statements_cache entry, releasing its strong ref.
3. Fiber B re-enters cache_table_info(), sees _prepared_stmt
(checked_weak_ptr) invalidated, and runs _insert_stmt = nullptr,
releasing the last strong ref. The modification_statement is freed.
4. Fiber A resumes inside execute() and touches freed *this.
Pin strong ref to _insert_stmt locally before the suspension.
Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1667
Backport: all supported branches, it's memory corruption bug, long present
Closes scylladb/scylladb#29588
* github.com:scylladb/scylladb:
test/boost: add dummy case to table_helper_test for non-injection modes
test/boost: add regression test for table_helper insert() UAF
utils/error_injection: add waiters() API
table_helper: fix use-after-free on prepared-statement invalidation
Symptom: the rest_api_mock subprocess exits with status 1 during fixture
setup, e.g.:
subprocess.CalledProcessError: Command '[..., 'rest_api_mock.py',
'127.29.88.1', '34093']' returned non-zero exit status 1
Root cause: aiohttp's TCPSite.start() raises OSError(EADDRINUSE) and the
process exits 1. The bind fails because of how the (ip, port) pair is
chosen across modules within one test.py process:
* Each test module leases a 127.x.y.z IP from the host registry. The
registry recycles released IPs, so the same IP is shared across
modules sequentially.
* The original code picked the port via random.randint(10000, 65535).
A previous module on the same IP could have left that port in
TIME_WAIT (or worse, still actively in use) when a later module
happened to pick the same port.
SCYLLADB-1275 (PR 29314) tried to fix this by binding a probe socket to
(ip, 0) to obtain an OS-assigned free port, closing the probe, then
launching the mock server which would bind to that port. Two issues
remained:
1. TOCTOU: between probe close and mock-server bind, any other process
on the host could grab the just-freed port.
2. TIME_WAIT could still bite if the host registry recycled an IP and
the OS reused the same port number for the probe.
Fix: drop port discovery entirely. Use a fixed port (12345, matching the
unshare-namespace path already in this fixture) on the unique IP from
the host registry. Because IPs are unique per test module within one
test.py process, the (ip, 12345) pair is unique to each module, so no
port-collision dance is needed.
reuse_address=True on TCPSite handles the residual TIME_WAIT case when
the host registry recycles an IP within the same test.py process and
the previous mock server's socket has not finished TIME_WAIT yet.
reuse_port=True is dropped, as it was only useful while attempting to
have multiple processes share a single port.
This mirrors the design used in test/cqlpy/run.py: pick a unique IP,
keep the port fixed.
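The resulting bind, sketched with aiohttp (the mock app's handlers are
omitted):
```
import asyncio
from aiohttp import web

MOCK_PORT = 12345  # fixed; uniqueness comes from the per-module IP lease

async def start_mock(ip: str) -> web.TCPSite:
    runner = web.AppRunner(web.Application())
    await runner.setup()
    # reuse_address tolerates a recycled IP whose previous mock socket is
    # still in TIME_WAIT; reuse_port is deliberately not set.
    site = web.TCPSite(runner, ip, MOCK_PORT, reuse_address=True)
    await site.start()
    return site
```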
Fixes: SCYLLADB-1718
Closes scylladb/scylladb#29656
Add periodic warning timers (every 5 minutes) to help diagnose hangs in
barrier_and_drain:
- drain_closing_sessions(): warn if semaphore acquisition or session gate
close is taking too long, reporting the gate count to show how many
guards are still alive.
- local_topology_barrier(): warn if stale_versions_in_use() is taking
too long, reporting the current stale version trackers.
- session::gate_count(): new public accessor for diagnostic purposes.
These warnings help distinguish between the two possible hang points
in barrier_and_drain (stale versions vs session drain) and provide
ongoing visibility into what's blocking progress.
drain_closing_sessions() is called as part of the barrier_and_drain
topology command and can block on two things: acquiring the drain
semaphore (if another drain is in progress) and waiting for individual
sessions to close (which blocks until all session guards are released).
Previously, all logging in this function was at debug level, making it
invisible in production logs. When barrier_and_drain hangs, there is no
way to tell whether the function is waiting for the semaphore, waiting
for a specific session to close, or was never called.
Promote logging to info level and add messages at each blocking point:
before/after semaphore acquisition (with count of sessions to drain),
before/after each individual session close (with session id), and at
function completion. This makes it possible to identify the exact
session blocking a topology operation from the node log alone.
When a node processes a barrier_and_drain topology command, it performs
two potentially long-running operations inside local_topology_barrier():
waiting for stale token metadata versions to be released
(stale_versions_in_use) and draining closing sessions
(drain_closing_sessions). Either of these can hang indefinitely -- for
example, stale_versions_in_use blocks until all references to previous
token metadata versions are released, which depends on in-flight
requests completing.
Previously, the only logging was a single 'done' message at the end,
making it impossible to determine which sub-step was blocking when a
barrier_and_drain RPC appeared stuck on a node. In a recent CI failure,
a node never responded to barrier_and_drain during a removenode
operation, and the logs showed the RPC was received but nothing about
what it was waiting on internally.
Add info-level logging before each blocking sub-step, including the
topology version for correlation. This allows diagnosing hangs by
showing whether the node is stuck waiting for stale metadata versions,
stuck draining sessions, or never reached these steps at all.
server_add() already waits for the requested server-up state. For the remote perf-alternator test, request SERVING from server_add() and use the returned server address directly instead of asking for running servers and then calling get_ready_cql() again.
This keeps the listener-readiness intent explicit while removing the redundant CQL readiness probe noted in review.
server_add() defaults to CQL_ALTERNATOR_QUERIED. That proves the regular CQL driver path is queryable (and the regular Alternator port, if one is listed in the YAML config). It does not prove that every CQL listener configured for the process is already accepting raw TCP connections.
test_uninitialized_conns_semaphore opens a raw TCP connection to native_shard_aware_transport_port immediately after startup. The default readiness check can succeed through native_transport_port while the shard-aware listener is still being started, because CQL listeners are registered independently.
Wait for ServerUpState.SERVING before opening raw sockets. Scylla sends that notification only after protocol servers are registered, so this closes the startup window without adding sleeps or local retry loops.
Fixes: SCYLLADB-1797
test_create_role_mixed_cluster calls servers_add(2) to bootstrap two old
nodes concurrently, then adds a new node before issuing CREATE ROLE. The
concurrent bootstraps trigger the well-known Python driver bug
(scylladb/python-driver#317): two on_add notifications race in
update_created_pools, causing a second pool to be created for a host whose
pool was already established. If CREATE ROLE is in-flight on the old pool
when it is closed, the driver retries on the new pool, executing the
statement twice. The second execution fails with "Role ... already exists",
making the test flaky.
Fix by using CREATE ROLE IF NOT EXISTS. This is safe because unique_name()
generates a timestamp+random suffix that is guaranteed to be unique; the
role can "already exist" only due to the driver double-execution bug, never
due to a real conflict.
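A hedged sketch of the pattern; unique_name() here is an illustrative
stand-in for the suite's helper, and the session is a plain python-driver
Session:
```
import random
import time

def unique_name() -> str:
    # Illustrative stand-in: timestamp plus random suffix, unique by
    # construction.
    return f"role_{int(time.time() * 1000)}_{random.randrange(10**6)}"

def create_test_role(cql_session) -> str:
    role = unique_name()
    # IF NOT EXISTS makes a driver-level double execution harmless: the
    # name cannot collide for real, so "already exists" can only mean the
    # same statement ran twice.
    cql_session.execute(f"CREATE ROLE IF NOT EXISTS {role}")
    return role
```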
This is the same workaround that has been applied many times elsewhere in
our test suite for exactly the same root cause:
- CREATE KEYSPACE was changed to CREATE KEYSPACE IF NOT EXISTS (scylladb#18368,
later generalised in scylladb#22399 via new_test_keyspace helpers)
- DROP KEYSPACE was changed to DROP KEYSPACE IF EXISTS (scylladb#29487)
Fixes: SCYLLADB-1742
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Closes scylladb/scylladb#29732
If start_server_for_group0() successfully registers a server in
_raft_gr._servers but a subsequent step (e.g. enable_in_memory_state_machine())
throws, the server is never destroyed because abort_and_drain()/destroy()
check std::get_if<raft::group_id>(&_group0) which was only set after the
entire with_scheduling_group block completed.
Move _group0.emplace<raft::group_id>() inside the lambda, immediately after
start_server_for_group() succeeds, so that cleanup paths can always find
and destroy the registered server.
This fixes the assertion:
"raft_group_registry - stop(): server for group ... is not destroyed"
which manifests during shutdown after an upgrade where topology_state_load()
fails due to netw::unknown_address.
Backport: Yes, to 2026.1, 2026.2, as it causes a crash on upgrades
Refs: SCYLLADB-1217
Refs: CUSTOMER-340
Refs: CUSTOMER-335
Fixes: SCYLLADB-1801
Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
AI-assisted: Yes, Opencode/Opus 4.6
Closes scylladb/scylladb#29702
server_add()'s default readiness only waits until CQL can be queried, but these tests immediately connect to custom proxy protocol listeners. Wait for SERVING so the shard-aware TLS proxy port is accepting connections before the test starts, matching the Alternator proxy protocol readiness fix.
Fix the persistent flakiness in `test_incremental_repair_race_window_promotes_unrepaired_data` (SCYLLADB-1478, reopened).
After restarting servers[1], the topology coordinator can initiate a **residual re-repair** when it sees tablets stuck in the `repair` stage. This re-repair flushes memtables on all replicas and marks post-repair data as repaired, contaminating the test state and masking the compaction-merge bug the test is designed to detect. The assertion then fails on the *next* retry because the previous attempt's re-repair left behind repaired sstables containing post-repair keys.
Two earlier attempts did not fully fix this:
1. **Propagating `current_key` through the exception** — correctly advanced the key counter on retry, but the contaminated tablet metadata from the prior re-repair (repaired sstables with post-repair keys) was still present, causing assertion failures on the next attempt.
2. **DROP TABLE + CREATE TABLE between retries** — the tablet metadata (sstables_repaired_at, repair stage) is tied to the tablet identity, and recreating the table in the same keyspace still showed residual state issues.
Instead of trying to clean up contaminated state, each retry creates a **completely fresh keyspace** (unique name via `create_new_test_keyspace`). This gives entirely new tablets with no residual repair metadata from prior attempts. Combined with broader detection of coordinator changes and residual re-repairs, the test reliably retries before any contamination can cause false failures.
The detection is now comprehensive:
- **Broadened coordinator check**: any coordinator change (`new_coord != coord`), not just migration to servers[1]
- **Re-repair detection** at three points: post-restart, during the compaction poll, and after injection release — grep for `"Initiating tablet repair host="` in the coordinator log
The series is split into two commits:
1. **`test: extract _setup_table_for_race_window helper`** — pure code-movement refactor that extracts keyspace+table+data+repair1+data+flush into a reusable helper. Easily verifiable as a no-op behavioral change.
2. **`test: fix race window test flakiness from residual re-repair`** — the actual fix: broadened detection logic + re-repair grep at 3 points + fresh-keyspace retry on exception.
Passed 1000 consecutive runs with the fix applied. Without the fix, about 2% flakiness was observed in debug mode.
Fixes: SCYLLADB-1478
So far, we haven't observed flakiness of this test on branches, so not backporting yet. Will backport if seen.
Closes scylladb/scylladb#29721
* github.com:scylladb/scylladb:
test: fix race window test flakiness from residual re-repair
test: extract _setup_table_for_race_window helper for race window test
When a raft topology command (e.g. barrier_and_drain) is received by a
node, the handler first performs a raft read_barrier to ensure it sees
the latest topology state. This read_barrier can hang indefinitely if
raft cannot achieve quorum, but there was no logging around it, making
it impossible to tell whether the handler was stuck at this step or
somewhere else.
Add info-level logging before and after the read_barrier call in
raft_topology_cmd_handler, including the command type, index, and term.
This allows diagnosing hangs by showing whether the node entered the
read_barrier and whether it completed, narrowing down the root cause
when a topology command RPC appears stuck on the receiver side.
The test hardcoded the expected number of coordinator elections
(2, 3, 4, 5) for each phase. If a prior phase triggered an extra
election, subsequent phases would wait for a count that was already
reached or would never match.
Fix by reading the current election count before each operation and
expecting exactly one more, making each phase independent of prior
history.
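The pattern, as a hedged sketch with the collaborators injected; only
wait_new_coordinator_elected() is named in this commit, the rest are
stand-ins:
```
async def crash_and_expect_one_election(count_elections, crash_coordinator,
                                        wait_new_coordinator_elected):
    # Read the current count and expect exactly one more, rather than a
    # hardcoded absolute count that breaks when history adds an election.
    before = await count_elections()
    await crash_coordinator()
    await wait_new_coordinator_elected(expected_count=before + 1)
```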
Also add wait_for_no_pending_topology_transition() calls after each
coordinator election to ensure the topology state machine has fully
settled before proceeding with restarts and further operations.
Decrease the failure detector timeout (failure_detector_timeout_in_ms)
to 2000 ms on all test nodes so that coordinator crashes are detected
faster, reducing test wallclock time and timeout-related flakiness.
Enable raft_topology=trace logging on all test nodes to aid
post-failure diagnosis. Add diagnostic logging in
wait_new_coordinator_elected().
Fixes: SCYLLADB-1089
Closes scylladb/scylladb#29284
The test_incremental_repair_race_window_promotes_unrepaired_data test
was still flaky because:
1. Only coordinator changes TO servers[1] were detected, but ANY
coordinator change can trigger a residual re-repair that flushes
memtables on all replicas and marks post-repair data as repaired.
2. Even without a coordinator change, the topology coordinator can
initiate a residual re-repair when it sees tablets stuck in the
repair stage after the servers[1] restart. This re-repair
contaminates the repaired set with post-repair data, masking the
compaction-merge bug the test detects.
Fix by:
- Broadening the coordinator check from == servers[1] to != coord
- Adding re-repair detection (grep for 'Initiating tablet repair
host=') at three points: post-restart, during the compaction poll,
and after injection release
- On retry, creating a completely fresh keyspace+table via
_setup_table_for_race_window() so the new attempt starts with
clean tablet metadata uncontaminated by prior re-repairs
Fixes: SCYLLADB-1478
Move the keyspace+table setup logic for
test_incremental_repair_race_window_promotes_unrepaired_data into a
dedicated helper function _setup_table_for_race_window(). The helper
creates a fresh keyspace (unique name via create_new_test_keyspace),
the table, configures STCS min_threshold=2, inserts baseline keys,
runs repair 1, inserts keys for repair 2, and flushes.
This is a pure refactor with no behavioral change: the test function
now calls the helper once instead of inlining the setup. The
extraction enables a subsequent commit to call the helper again on
retry when a leadership transfer is detected.
Add two test cases for the new loading_cache::insert() method:
* test_loading_cache_insert verifies that insert() populates the cache
and invokes the loader exactly once per key when caching is enabled.
* test_loading_cache_insert_caching_disabled is a regression test for
SCYLLADB-1699: when the cache is constructed with expiry == 0
(caching disabled), insert() must be a no-op rather than asserting
in loading_cache::get_ptr() via caching_enabled(). The loader must
not be invoked and the cache must remain empty.
Refs SCYLLADB-1699
When the cache is constructed with expiry == 0 the underlying storage is
never instantiated and get_ptr() asserts via caching_enabled(). This is
fine for callers that need a handle into the cache, but it makes get_ptr()
unusable for write-only insertions on caches whose expiry is configurable
at runtime (e.g. caches driven by a LiveUpdate config option that the
operator may set to 0).
Add a new insert(k, load) method on loading_cache that returns a future<>
and is a no-op when caching is disabled, otherwise forwards to
get_ptr(k, load) and discards the resulting handle. This completes the
disabled-mode safety contract of the cache for the write side, mirroring
the fallback that get() already provides for the read side.
Switch authorized_prepared_statements_cache::insert() from
get_ptr().discard_result() to the new insert(), which fixes the crash
'Assertion caching_enabled() failed' in
authorized_prepared_statements_cache::insert() that occurs when
permissions_validity_in_ms is set to 0 and a prepared statement is
executed under authentication.
Fixes SCYLLADB-1699
The only test requires SCYLLA_ENABLE_ERROR_INJECTION. In modes without it
(e.g. release) the suite was empty, so pytest exited with code 5
("no tests collected") and CI failed. Add a no-op case in that branch
so collection always yields at least one test.
Deterministic reproducer using an error injection point placed in
table_helper::insert() between cache_table_info() and execute(). The
test parks fiber A at the injection, drops the target table (evicting
the prepared_statements_cache entry), runs fiber B which nulls
_insert_stmt, then releases fiber A. Without the fix this crashes in
execute(); with the fix fiber A holds a local strong ref and proceeds.
Uses the new waiters() API to synchronize with fiber A's entry into
the injection.
Returns the number of fibers currently suspended in wait_for_message()
for a named injection. Lets tests synchronize precisely with code parked
on an injection point.
insert() held no local strong ref to the prepared modification_statement
across the suspension in execute(). On a single shard:
1. Fiber A suspends inside _insert_stmt->execute().
2. DROP TABLE / DROP KEYSPACE on the target, or LRU eviction, removes
the prepared_statements_cache entry, releasing its strong ref.
3. Fiber B re-enters cache_table_info(), sees _prepared_stmt
(checked_weak_ptr) invalidated, and runs _insert_stmt = nullptr,
releasing the last strong ref. The modification_statement is freed.
4. Fiber A resumes inside execute() and touches freed *this.
Pin strong ref to _insert_stmt locally before the suspension.
- Revert the previous "test.py: fix test collection bug" commit (92c09d10) which worked around broken deduplication by filtering items without `BUILD_MODE` in `pytest_collection_modifyitems`. This approach masked the root cause and is superseded by the proper fixes below.
- Backport pytest 9.0.3's argument normalization algorithm into `test.py` to work around broken deduplication in pytest 8.3.5 ([pytest-dev/pytest#12083](https://github.com/pytest-dev/pytest/issues/12083)). Duplicate or subsumed test paths (e.g. `test/cql` and `test/cql/lua_test.cql`) are collapsed before invoking pytest. Revert when upgrading to pytest 9.x.
- Return a `DisabledFile` collector instead of an empty list in `pytest_collect_file` when all modes are disabled for a file, fixing a bug where subsequent files would not get their stash items set (`REPEATING_FILES`). Restructure `pytest_collect_file` to use a walrus operator (`if repeats := ...`) with a single `remove(file_path)` and `return collectors` at the end, eliminating the early return.
- Add `--keep-duplicates` CLI argument to bypass deduplication and forward to pytest.
- Move `RUN_ID` assignment from `pytest_collect_file` to `modify_pytest_item`. A shared `run_ids` cache (`defaultdict[tuple[str, str], count]`) is created in `pytest_collection_modifyitems` and passed to `modify_pytest_item`, keyed by `(build_mode, nodeid)` so each mode gets independent counters. This ensures unique run IDs even when `--keep-duplicates` causes the same file to be collected multiple times.
- Fix `--repeat` option default from string `"1"` to int `1` — argparse only applies `type=` to CLI-parsed values, not defaults.
pytest normally deduplicates overlapping test arguments — e.g. `test/cql test/cql/lua_test.cql` collects `lua_test.cql` only once. The original `test.py` never performed this deduplication, and the pytest version in the toolchain image (8.3.5) has a bug that breaks it ([pytest-dev/pytest#12083](https://github.com/pytest-dev/pytest/issues/12083)).
Since we are moving to bare pytest, `test.py` should match pytest's default behavior: deduplicate. Because we cannot easily upgrade pytest, commit 2 backports the deduplication logic from pytest 9.0.3.
To match pytest's interface, `--keep-duplicates` is added as an opt-out. This lets a user intentionally run overlapping paths — e.g. `./test.py test/blah test/blah/test_foo.py --keep-duplicates` runs `test_foo.py` twice. The flag is forwarded to pytest and also skips the backported deduplication in `test.py`.
- Revert 92c09d10 which filtered items without `BUILD_MODE` in `pytest_collection_modifyitems` and added an early return in `CppFile.collect()`. This workaround is superseded by the proper deduplication and `DisabledFile` fixes.
- Add `_CollectionArgument` dataclass (`order=True`, `__contains__` for subsumption) and `_deduplicate_test_args()` function, adapted from pytest 9.0.3; the subsumption rule is sketched after this list. Marked with a TODO to remove once we update to pytest 9.x.
- Call `_deduplicate_test_args()` on `options.name` before passing to pytest.
- Add `DisabledFile(pytest.File)` that skips collection with an informative message instead of returning an empty list.
- Restructure `pytest_collect_file` to use walrus operator: `if repeats := ...:` / `else:` — single `remove(file_path)` at end, no early return.
- Add `--keep-duplicates` argument that skips deduplication and is forwarded to pytest.
- Create a shared `run_ids` cache in `pytest_collection_modifyitems` and pass it to `modify_pytest_item`, which assigns unique sequential RUN_IDs via `itertools.count`. The cache is keyed by `(build_mode, nodeid)` so each mode gets independent counters.
- Remove `RUN_ID` from `_STASH_KEYS_TO_COPY` — it is no longer set on collectors.
- Remove `CppFile.run_id` cached_property. `CppTestCase` now reads `RUN_ID` from its own item stash.
- Fix `--repeat` option default from `"1"` to `1` and drop redundant `int()` cast.
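A hedged sketch of the subsumption rule behind _deduplicate_test_args();
the real code, adapted from pytest 9.0.3, also normalizes absolute paths
and handles `::` node-id suffixes:
```
from pathlib import Path

def deduplicate_test_args(args: list[str]) -> list[str]:
    kept: list[Path] = []
    for p in (Path(a) for a in args):
        if any(k == p or k in p.parents for k in kept):
            continue  # duplicate of, or subsumed by, an earlier argument
        # A new argument may also subsume earlier, more specific ones.
        kept = [k for k in kept if p != k and p not in k.parents] + [p]
    return [str(p) for p in kept]

assert deduplicate_test_args(["test/cql", "test/cql/lua_test.cql"]) == ["test/cql"]
```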
Closes SCYLLADB-1730
Closes scylladb/scylladb#29665
* github.com:scylladb/scylladb:
test: add --keep-duplicates and assign RUN_ID via shared cache
test/pylib/runner: fix disabled file collection
test.py: deduplicate CLI test arguments before passing to pytest
Revert "test.py: fix test collection bug"
The second logger.debug() call accesses ack2_msg after it was moved
via std::move() in the co_await send_gossip_digest_ack2 call.
This is undefined behavior.
Fix by formatting ack2_msg to a string before the move, then using
that cached string in both debug log calls.
FIXES: https://scylladb.atlassian.net/browse/SCYLLADB-1778
Closes scylladb/scylladb#29227
lock_tables_metadata() acquires a write lock on tables_metadata._cf_lock
on every shard. It used invoke_on_all(), which dispatches lock
acquisitions to all shards in parallel via parallel_for_each +
smp::submit_to.
When two fibers call lock_tables_metadata() concurrently, this can
deadlock. parallel_for_each starts all iterations unconditionally:
even when the local shard's lock attempt blocks (because the other
fiber already holds it), SMP messages are still sent to remote shards.
Both fibers' lock-acquisition messages land in the per-shard SMP
queues. The SMP queue itself is FIFO, but process_incoming() drains
it and schedules each item as a reactor task via add_task(), which —
in debug and sanitize builds with SEASTAR_SHUFFLE_TASK_QUEUE — shuffles
each newly added task against all pending tasks in the same scheduling
group's reactor task queue. This means fiber A's lock acquisition can
be reordered past fiber B's (and past unrelated tasks) on a given shard.
If fiber A wins the lock on shard X while fiber B wins on shard Y, this
creates a classic cross-shard lock-ordering deadlock (circular wait).
In production builds without SEASTAR_SHUFFLE_TASK_QUEUE, the reactor
task queue is FIFO. Still, even in release builds the SMP queues can
reorder messages, so the deadlock remains possible, just much less
likely. In debug and sanitize builds, the task-queue shuffle
makes the deadlock very likely whenever both fibers' lock-acquisition
tasks are pending simultaneously in the reactor task queue on any shard.
This deadlock was exposed by ce00d61917 ("db: implement large_data
virtual tables with feature flag gating", merged as 88a8324e68),
which introduced legacy_drop_table_on_all_shards as a second caller
of lock_tables_metadata(). When LARGE_DATA_VIRTUAL_TABLES is enabled
during topology_state_load (via feature_service::enable), two fibers
can race:
1. activate_large_data_virtual_tables() — calls
legacy_drop_table_on_all_shards() which calls
lock_tables_metadata() synchronously via .get()
2. reload_schema_in_bg() — fires as a background fiber from
TABLE_DIGEST_INSENSITIVE_TO_EXPIRY, eventually reaches
schema_applier::commit() which also calls lock_tables_metadata()
If both reach lock_tables_metadata() while the lock is free on all
shards, the parallel acquisition creates the deadlock opportunity.
The deadlock blocks topology_state_load() from completing, which
prevents the bootstrapping node from finishing its topology state
transitions. The coordinator's topology coordinator then waits for
the node to reach the expected state, but the node is stuck, so
eventually the read_barrier times out after 300 seconds.
Fix by acquiring the shard 0 lock first before attempting to
acquire any other lock. Whichever fiber wins shard 0 is
guaranteed to acquire all remaining shards before the other fiber
can proceed past shard 0, eliminating the circular-wait condition.
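The ordering idea translates directly to a hedged asyncio analogy, with
plain Locks standing in for the per-shard locks and two tasks for the two
fibers:
```
import asyncio

async def lock_all(locks: list[asyncio.Lock]) -> None:
    # Serialize on lock 0 first: whichever task wins it is guaranteed to
    # take every remaining lock before a competitor gets past lock 0, so
    # no circular wait can form (losers hold nothing while waiting).
    await locks[0].acquire()
    await asyncio.gather(*(lock.acquire() for lock in locks[1:]))

def unlock_all(locks: list[asyncio.Lock]) -> None:
    for lock in locks:
        lock.release()

async def main() -> None:
    locks = [asyncio.Lock() for _ in range(4)]  # one per "shard"

    async def fiber() -> None:
        await lock_all(locks)
        unlock_all(locks)

    await asyncio.gather(fiber(), fiber())  # concurrent callers, no deadlock

asyncio.run(main())
```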
Tested manually with 2 approaches:
1. causing different shard locks to be acquired by different
lock_tables_metadata() calls by adding different sleeps depending
on the lock_tables_metadata() call and target shard - this reproduced
the issue consistently
2. matching the time point at which both fibers reach lock_tables_metadata()
adding a single sleep to one of the fibers - this heavily depends on
the machine so we can't create a universal reproducer this way, but
it did result in the observed failure on my machine after finding the
right sleep time
Also added a unit test for concurrent lock_tables_metadata() calls.
Fixes: SCYLLADB-1694
Fixes: SCYLLADB-1644
Fixes: SCYLLADB-1684
Closes scylladb/scylladb#29678
In do_execute_cql_with_timeout(), when the prepared statement was not found in the cache, we called qp.prepare() and stored the returned result_message::prepared in a local variable scoped to the 'if' block. We then extracted ps_ptr (a checked_weak_ptr to the prepared statement) from the message, let the message go out of scope at the end of the 'if', and used ps_ptr after a co_await on st->execute().
Since 3ac4e258e8 ("transport/messages: hold pinned prepared entry in PREPARE result"), result_message::prepared owns a strong pinned reference to the prepared cache entry. While qp.prepare() runs it also holds its own pin on the entry, so on return the entry has at least the pin owned by the returned message. As long as that message is alive, the cache entry cannot be purged and the weak handle inside ps_ptr remains promotable.
The lifetime gap manifested only in debug builds. qp.prepare() returns a ready future on the cache-miss path, so in release builds the co_await resumes synchronously: control flows from the assignment of ps_ptr straight into st->execute() with no opportunity for any other task (in particular, prepared cache invalidation triggered by a concurrent schema change) to run in between. Debug builds, however, force a reactor preemption point on every co_await even when the awaited future is ready. With prepared_msg already destroyed at the end of the 'if' block, the only remaining handle on the cache entry was the weak ps_ptr, and the preemption gave a concurrent cache purge
- triggered, for example, by Raft schema changes received during a node restart - the chance to drop the entry. The subsequent execute() then failed when promoting the weak pointer with
checked_ptr_is_null_exception.
The exception propagated out of the Paxos prepare path as a generic std::exception with no type information in the log, surfacing on the coordinator as:
WriteFailure: Failed to prepare ballot ... Replica errors:
host_id ... -> seastar::rpc::remote_verb_error (std::exception)
Hoist the result_message::prepared into the outer scope so the pinned cache entry stays alive across co_await st->execute(...), closing the window in which a concurrent cache purge could invalidate the weak handle.
Fixes SCYLLADB-1173
backport: the patch is simple, we can backport it to all versions with "LWT over tablets" feature. Note that the problem is only in test runs in debug configuration, production is not affected.
Closes scylladb/scylladb#29675
* https://github.com/scylladb/scylladb:
table_helper: retry insert prepare on concurrent cache invalidation
paxos_state: keep prepared message alive across statement execution
Since commit e942c074f2 changed _tasks from std::list<shared_ptr<...>>
to a boost::intrusive_list, iterating yields raw compaction_task_executor
objects rather than shared_ptr wrappers. The GDB script was updated to
use intrusive_list() but still wrapped elements in seastar_shared_ptr(),
causing 'gdb.error: There is no member or method named _p' when
compaction tasks are active.
Move the seastar_shared_ptr unwrapping to the 6.2 compatibility
fallback path only, since the intrusive list path yields objects
directly.
Fixes: SCYLLADB-1762
Closes scylladb/scylladb#29690
This series addresses three problems in the audit startup/shutdown
sequence:
1. [BUG] Shutdown SIGABRT. During graceful shutdown, deferred stops run in reverse order of construction. With the audit service constructed after the maintenance socket, audit was destroyed first, and in-flight queries on the maintenance socket could hit the destroyed audit service (assertion failure in sharded::local()).
2. [BUG] Startup audit bypass. The maintenance socket opened before audit storage was initialized, allowing queries (e.g. creating a superuser) to bypass auditing in that window.
3. [PROBLEM] Blocks SCYLLADB-1430. The existing order prevents audit configuration from being driven by group0 state, because audit started before group0.
The series is organized as: a test-helper refactor, a test for the audited maintenance-socket flow, a startup-phase split, the construction-order fix and its shutdown-race test, and finally the storage-before-socket fix and its startup-window test.
Fixes SCYLLADB-1615
No backport, bugs don't seem severe enough to justify backporting.
Closes scylladb/scylladb#29539
* github.com:scylladb/scylladb:
audit: assert storage ordering invariants at runtime
audit: start maintenance socket after audit storage
audit: move audit construction before maintenance socket
audit: split startup into construction and storage phases
test: audit: verify maintenance socket operations are audited
test: audit: parameterize source address in audit assertions
Commit 8d34127684 ("sstables: clean up TemporaryHashes file in wipe()")
unconditionally calls filename(..., component_type::TemporaryHashes)
inside filesystem_storage::wipe(). However, the TemporaryHashes
component is only registered in the component map of the 'ms' sstable
format. For older formats (ka, la, mc, md, me) the lookup goes through
sstable_version_constants::get_component_map(version).at(...) and throws
std::out_of_range.
The exception is then swallowed by the outer catch(...) in wipe(), which
just logs and ignores. As a side effect, the subsequent
remove_file(new_toc_name) is never reached and the TemporaryTOC
('*-TOC.txt.tmp') file is left as an orphan on disk after every unlink()
of a non-'ms' sstable.
Guard the lookup with get_component_map(version).contains() so the
cleanup is only attempted for formats that actually define the
component.
Add a regression test in test/boost/sstable_directory_test.cc that
creates an 'me'-format sstable, unlinks it and asserts that the sstable
directory is left empty. Without the fix the test fails with a leftover
'me-...-TOC.txt.tmp' file.
Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1697
Closes scylladb/scylladb#29620
`ScyllaCluster.nodelist()` creates new `ScyllaNode` objects on every call,
so per-node state set via `set_smp()`, `set_log_level()`, and
`_adjust_smp_and_memory()` was lost. This meant `set_smp()` had no effect
when `cluster.start()` was called after it, since `start_nodes()` calls
`nodelist()` internally which creates fresh nodes with default values.
- Add debug logging for smp/memory in ScyllaNode
- Store per-node settings (smp, memory, log levels) in a
`ScyllaCluster._node_resources` dict keyed by server_id, so they survive
`nodelist()` reconstruction. `ScyllaNode` restores its state from this dict
on construction and saves it back whenever `set_smp()`, `set_log_level()`,
or `_adjust_smp_and_memory()` modifies it (see the sketch after this list).
- Add a reproducer test verifying `set_smp()` takes effect on restart
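A hedged sketch of the persistence mechanism; only the parts needed to
survive nodelist() reconstruction are shown:
```
class ScyllaClusterSketch:
    def __init__(self) -> None:
        self._node_resources: dict[str, dict] = {}  # keyed by server_id

class ScyllaNodeSketch:
    def __init__(self, cluster: ScyllaClusterSketch, server_id: str) -> None:
        self.cluster = cluster
        self.server_id = server_id
        # Restore whatever a previous ScyllaNode instance saved, so the
        # settings survive nodelist() creating fresh objects.
        saved = cluster._node_resources.get(server_id, {})
        self.smp = saved.get("smp")
        self.log_levels = saved.get("log_levels", {})

    def set_smp(self, smp: int) -> None:
        self.smp = smp
        self.cluster._node_resources.setdefault(
            self.server_id, {})["smp"] = smp
```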
Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1629
--
No backport needed: this only fixes dtest infrastructure, no production code
is affected.
Closes scylladb/scylladb#29549
* github.com:scylladb/scylladb:
test/cluster/dtest: add test for node.set_smp() persistence
test/cluster/dtest: cache ScyllaNode instances in ScyllaCluster
test/cluster/dtest/ccmlib/scylla_node: add debug logging
Add --keep-duplicates CLI argument to bypass deduplication and forward
to pytest, allowing duplicate test file arguments to be collected
multiple times.
Move RUN_ID assignment from pytest_collect_file to modify_pytest_item.
All File collectors for the same source file share a single run_ids
dict (via RUN_ID_CACHE stash key), so items from duplicate collection
arguments (e.g. with --keep-duplicates) automatically get unique IDs.
Remove CppFile.run_id cached_property — CppTestCase now reads RUN_ID
from its own item stash, which is set during modify_pytest_item.
Fix --repeat option default from string "1" to int 1 — argparse only
applies type= to CLI-parsed values, not defaults.
Co-Authored-By: Claude Opus 4.6 (200K context) <noreply@anthropic.com>
Return a DisabledFile collector instead of an empty list when all modes
are disabled for a file. Returning an empty list caused subsequent
files to not get their stash items set because file_path was never
removed from REPEATING_FILES.
Co-Authored-By: Claude Opus 4.6 (200K context) <noreply@anthropic.com>
Backport the argument normalization algorithm from pytest 9.0.3 to
work around broken deduplication in pytest 8.3.5
(https://github.com/pytest-dev/pytest/issues/12083).
Duplicate or subsumed test paths (e.g. 'test/cql' and
'test/cql/lua_test.cql') are now collapsed before invoking pytest.
Revert this commit when upgrading to pytest 9.x.
Co-Authored-By: Claude Opus 4.6 (200K context) <noreply@anthropic.com>
Abort if audit storage fails to start rather than silently
running with an unaudited maintenance socket. Also assert
that storage is already stopped when the audit service is
destroyed, documenting the defer-stack ordering requirement.
Refs SCYLLADB-1615
Refs SCYLLADB-1695
Without this, there is a window after startup where queries on
the maintenance socket bypass auditing because audit storage
is not yet initialized.
Fixes SCYLLADB-1615
During graceful shutdown, deferred stops run in reverse order of
construction. When the audit service was constructed after the
maintenance socket, audit was destroyed first. A DML query
still in-flight on the maintenance socket could then bypass
auditing entirely.
Move construction as early as possible so the audit service
outlives the maintenance socket on the defer stack, and to
maximise the window in which attempts to use audit before
storage is ready are caught with on_internal_error_noexcept.
Refs SCYLLADB-1615
The table-based audit backend needs Raft to create its keyspace,
but the audit service must exist earlier so that CQL paths don't
silently skip auditing.
Split startup into two phases: construction and storage
initialization. Queries arriving between the two phases are
logged as errors.
This is a refactoring commit and the split sections will be
moved later in this patch series.
Refs SCYLLADB-1615
User creation via the maintenance socket should produce audit
entries, as this is the recommended flow for creating the
initial superuser when default credentials are disabled.
The test is parametrized by audit backend (table and syslog).
The maintenance socket source address is "::" because Seastar
returns a zero-initialised in6_addr for AF_UNIX sockets.
Test time in dev: 0.6s
Refs SCYLLADB-1615
Some tests in test_tablets.py read system_schema.keyspaces from an arbitrary node that may not have applied the latest schema change yet. Pin the read to a specific node and issue a read barrier before querying, ensuring the node has up-to-date data.
Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1700
Test fix; no backport
Closes scylladb/scylladb#29655
* github.com:scylladb/scylladb:
test: fix flaky rack list conversion tests by using read barrier
test: fix flaky test_enforce_rack_list_option by using read barrier
table_helper::insert() retrieves the prepared statement via
cache_table_info() and then dereferences _prepared_stmt to read
bound_names. _prepared_stmt is a checked_weak_ptr into the prepared
statements cache and can be invalidated at any time by a concurrent
purge (for example, on a schema change).
cache_table_info() (re-)prepares the statement and assigns
_prepared_stmt before returning, and the strong pin held by the
result_message::prepared returned from qp.prepare() keeps the cache
entry alive only for the duration of try_prepare(). After try_prepare()
returns, the pin is gone and _prepared_stmt is the only remaining
handle on the entry.
In release builds this is fine: the chain of ready-future co_awaits
between try_prepare() finishing and _prepared_stmt->bound_names being
read resumes synchronously, so no other task -- in particular, no
cache purge -- can run in that window. In debug builds, however,
Seastar inserts a reactor preemption point on every co_await even when
the awaited future is ready. That preemption window is wide enough for
a concurrent invalidation to drop the freshly installed cache entry,
turning _prepared_stmt into a null weak handle and crashing the
subsequent dereference with checked_ptr_is_null_exception.
Wrap the cache_table_info() call in a loop that re-attempts the
preparation until a synchronous post-resume check finds _prepared_stmt
still valid. The check runs in the same task immediately after the
co_await resumes, with no co_await between the check and the
dereference, so a purge cannot slip in. _insert_stmt is a strong
shared_ptr to the statement object and is not affected by cache
invalidation, so it remains safe to use across the final co_await on
execute().
The other caller of cache_table_info(),
trace_keyspace_helper::apply_events_mutation(), accesses only the
strong _insert_stmt via insert_stmt() and never dereferences the weak
_prepared_stmt, so it is unaffected.
Refs SCYLLADB-1173
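As a rough standalone model of the retry described above, with std::weak_ptr standing in for checked_weak_ptr and a single shared_ptr slot standing in for the prepared-statements cache:

```cpp
#include <memory>
#include <cassert>

struct prepared_statement { int bound_names = 3; };

struct table_helper_sketch {
    std::shared_ptr<prepared_statement> _cache_entry;   // the cache's own pin
    std::weak_ptr<prepared_statement> _prepared_stmt;   // the weak handle

    void cache_table_info() {
        // (Re-)prepare and install the entry. In the real code a co_await
        // follows; in debug builds a concurrent purge may run in that
        // preemption window and drop the entry again.
        _cache_entry = std::make_shared<prepared_statement>();
        _prepared_stmt = _cache_entry;
    }

    void insert() {
        std::shared_ptr<prepared_statement> pinned;
        do {
            cache_table_info();
            // The validity check runs synchronously right after resume...
            pinned = _prepared_stmt.lock();
        } while (!pinned);   // ...and we re-prepare if a purge slipped in.
        // No suspension between the check and this dereference.
        assert(pinned->bound_names >= 0);
    }
};

int main() { table_helper_sketch{}.insert(); }
```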
Parallelize SSTable creation using parallel_for_each. The file
count is made a parameter with a default of 64, allowing future
S3/GCS variants to use a smaller count if needed.
Parallelize SSTable creation using parallel_for_each and reduce
the SSTable count from 256 to 64 for S3/GCS variants. The local
test variant retains the original 256 count.
Parallelize SSTable creation across all sub-tests using
parallel_for_each and reduce the SSTable count from 256 to 64 for
S3/GCS variants.
Re-enable the S3 test variant that was previously disabled due to
taking 4+ minutes. With parallel creation and reduced count, the
test now completes in a reasonable time.
Pre-extract mutation pairs and use parallel_for_each with
make_sstable_containing_async to create SSTables concurrently
instead of sequentially. The post-creation loop still runs serially
to collect token ranges and generations.
Pre-extract mutation pairs and use parallel_for_each with
make_sstable_containing_async to create SSTables concurrently
instead of sequentially. The post-creation loop still runs serially
to collect token ranges and generations that depend on SSTable order.
Use parallel_for_each with make_sstable_containing_async to create
SSTables concurrently instead of sequentially, reducing wall-clock
time on remote storage backends (S3/GCS).
Use parallel_for_each with make_sstable_containing_async to create
SSTables concurrently instead of sequentially, reducing wall-clock
time on remote storage backends (S3/GCS).
Raise log levels for s3 and gcp_storage from debug to trace, and add
trace-level logging for http and default_http_retry_strategy modules.
This provides better visibility into storage backend interactions
when debugging slow or failing compaction tests on remote storage.
The original make_memtable used seastar::thread::yield() for
preemption, which required all callers to run inside a
seastar::thread context. This prevented the utilities from being
used directly in coroutines or parallel_for_each lambdas.
Make the primary functions — make_memtable, make_sstable_containing,
and verify_mutation — return future<> directly. Callers now .get()
explicitly when in seastar::thread context, or co_await when in
a coroutine.
make_memtable now uses coroutine::maybe_yield() instead of
seastar::thread::yield(). verify_mutation is converted to
coroutines as well.
Requested in:
https://github.com/scylladb/scylladb/pull/29416#pullrequestreview-4112296282
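A sketch of the resulting calling convention, assuming Seastar; memtable_stub and make_memtable_async here are illustrative stand-ins, not the real test utilities:

```cpp
#include <seastar/core/future.hh>
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include <vector>

struct memtable_stub { std::vector<int> rows; };

// Before: a plain function calling seastar::thread::yield(), usable only
// inside a seastar::thread. After: a coroutine any context can co_await.
seastar::future<memtable_stub> make_memtable_async(std::vector<int> muts) {
    memtable_stub mt;
    for (int m : muts) {
        mt.rows.push_back(m);
        co_await seastar::coroutine::maybe_yield();  // preempt without a thread
    }
    co_return mt;
}

// Thread-context callers now block explicitly:
//     auto mt = make_memtable_async(muts).get();
// Coroutine callers simply:
//     auto mt = co_await make_memtable_async(muts);
```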
test_exception_safety_of_update_from_memtable called make_memtable
inside the row_cache::external_updater callback. external_updater
runs as a synchronous execute() call that must not yield, but
make_memtable calls seastar::thread::yield() every 10th mutation.
The bug was latent because the test only inserted 5 mutations, so
the yield was never reached. Move the call before the callback.
Prerequisite for the next patch, which changes make_memtable to
call make_memtable_async().get() -- that would yield on every
mutation via coroutine::maybe_yield(), making this bug visible.
Increase max_connections from the default to 32 for the S3 endpoint
used in tests. This allows more concurrent HTTP connections to the S3
backend, which is needed to benefit from parallel SSTable creation
that will be introduced in subsequent commits.
mark_for_deletion() only set an in-memory flag; the actual file
deletion ran lazily when the last shared_sstable reference dropped,
leaving a window in which a follow-up scan of the upload directory
(e.g. a second 'nodetool refresh --load-and-stream') could observe a
partially-deleted sstable and fail with malformed_sstable_exception.
Force the unlink to complete before stream() returns. For tablet
streaming, partially-contained sstables span multiple per-tablet
batches, so a defer_unlinking flag postpones the unlink until after
all sstables are streamed; with vnodes, fully-contained sstables are streamed
only once and can be removed just after being streamed.
Add a FIXME on object_storage_base::wipe and strengthen the documentation on
storage::wipe to make the never-fails contract explicit.
Commit 2b7aa32 (topology_coordinator: Refresh load stats after
table is created or altered) registered topology_coordinator as a
schema change listener and added on_create_column_family which
fire-and-forgets _tablet_load_stats_refresh.trigger(). The
triggered task runs on the gossip scheduling group via
with_scheduling_group and accesses the topology_coordinator via
'this'.
stop() unregisters the listener but does not wait for any
in-flight refresh task. If a notification fires between
_tablet_load_stats_refresh.join() in run() and unregister_listener
in stop(), the scheduled task can outlive the topology_coordinator
and access freed memory after run_topology_coordinator's coroutine
frame is destroyed.
Wait for the refresh to complete in stop() after unregistering the
listener, ensuring no task can fire after destruction.
Fixes SCYLLADB-1728
Backport to 2026.1 and 2026.2, because the issue was introduced in 2b7aa32
Closes scylladb/scylladb#29653
* https://github.com/scylladb/scylladb:
test: tablet_stats: reproduce shutdown refresh race
topology_coordinator: join tablet load stats refresh in stop()
Add a test that reproduces SCYLLADB-1629: set_smp() had no effect
because nodelist() created new ScyllaNode objects on every call,
losing the _smp_set_during_test value. The test fails without the
fix in the previous patch.
ScyllaCluster.nodelist() was creating new ScyllaNode objects on every
call, so per-node state set via set_smp(), set_log_level(), and
_adjust_smp_and_memory() was lost between calls.
Fix by caching ScyllaNode instances in a list populated by
_add_nodes() using the list returned by servers_add() in populate().
Nodes are assigned monotonically increasing names (node1, node2, ...).
nodelist() simply returns the cached list.
The LDAP role manager's `_cache_pruner` background fiber periodically calls cache::reload_all_permissions(). Two races cause it to hit SCYLLA_ASSERT(_permission_loader):
- Cross-shard race: The pruner used `_cache.container().invoke_on_all()` to reload permissions on every shard. Since both `service::start()` and `sharded<service>::stop()` execute per-shard in parallel, the pruner on one shard could call reload_all_permissions() on another shard before that shard set its loader (startup) or after it cleared its loader (shutdown). Each shard runs its own pruner instance, so reloading locally is sufficient; this also removes the redundant N² reload calls.
- Intra-shard race: `service::stop()` cleared the permission loader and stopped the role manager concurrently (via when_all_succeed). A mid-reload pruner could yield and then call the now-null loader. Fixed by stopping the role manager first, so the pruner is fully drained before the loader is cleared; see the sketch after the commit list below.
Fixes SCYLLADB-1679
Backport to 2026.2, introduced in 7eedf50c12
Closes scylladb/scylladb#29605
* github.com:scylladb/scylladb:
auth: make shutdown the exact reverse of startup
test: ldap: add test for pruner crash during shutdown
auth: start authorizer and set permission loader before role manager
auth: stop role manager before clearing permission loader
auth: reload LDAP permission cache on local shard only
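A minimal Seastar-flavoured sketch of both fixes; auth_service_stub and its members are hypothetical stand-ins for the auth::service internals named above:

```cpp
#include <seastar/core/future.hh>
#include <seastar/core/coroutine.hh>
#include <functional>

struct cache_stub {
    seastar::future<> reload_all_permissions() { return seastar::make_ready_future<>(); }
};
struct role_manager_stub {
    seastar::future<> stop() { return seastar::make_ready_future<>(); }
};

struct auth_service_stub {
    cache_stub _cache;
    role_manager_stub _role_manager;
    std::function<void()> _permission_loader = [] {};

    seastar::future<> prune_once() {
        // Fix 1: reload on the local shard only. Each shard runs its own
        // pruner, so the old container().invoke_on_all() was both redundant
        // (N^2 reloads) and racy against other shards' start()/stop().
        co_await _cache.reload_all_permissions();
    }

    seastar::future<> stop() {
        // Fix 2: drain the pruner (by stopping the role manager) before
        // clearing the loader, instead of doing both via when_all_succeed().
        co_await _role_manager.stop();   // pruner fully drained here
        _permission_loader = nullptr;    // now safe: no reload in flight
    }
};
```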
In certain circumstances the current way of collecting tests can be error-prone: collection can stop when the first file is skipped in a mode, leaving the rest of the files given on the CLI uncollected.
Another issue is that if a file is specified twice, via its directory and explicitly, it produces an incorrect CppFile in the stash, causing a KeyError.
Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1714
No backport, test framework bug fix only.
Closes scylladb/scylladb#29634
* github.com:scylladb/scylladb:
test.py: fix framework test
test.py: fix test collection bug
In do_execute_cql_with_timeout(), when the prepared statement was not
found in the cache, we called qp.prepare() and stored the returned
result_message::prepared in a local variable scoped to the 'if' block.
We then extracted ps_ptr (a checked_weak_ptr to the prepared statement)
from the message, let the message go out of scope at the end of the
'if', and used ps_ptr after a co_await on st->execute().
Since 3ac4e258e8 ("transport/messages: hold pinned prepared entry in
PREPARE result"), result_message::prepared owns a strong pinned
reference to the prepared cache entry. While qp.prepare() runs it also
holds its own pin on the entry, so on return the entry has at least
the pin owned by the returned message. As long as that message is
alive, the cache entry cannot be purged and the weak handle inside
ps_ptr remains promotable.
The lifetime gap manifested only in debug builds. qp.prepare() returns
a ready future on the cache-miss path, so in release builds the
co_await resumes synchronously: control flows from the assignment of
ps_ptr straight into st->execute() with no opportunity for any other
task (in particular, prepared cache invalidation triggered by a
concurrent schema change) to run in between. Debug builds, however,
force a reactor preemption point on every co_await even when the
awaited future is ready. With prepared_msg already destroyed at the
end of the 'if' block, the only remaining handle on the cache entry
was the weak ps_ptr, and the preemption gave a concurrent cache purge
- triggered, for example, by Raft schema changes received during a
node restart - the chance to drop the entry. The subsequent execute()
then failed when promoting the weak pointer with
checked_ptr_is_null_exception.
The exception propagated out of the Paxos prepare path as a generic
std::exception with no type information in the log, surfacing on the
coordinator as:
WriteFailure: Failed to prepare ballot ... Replica errors:
host_id ... -> seastar::rpc::remote_verb_error (std::exception)
Hoist the result_message::prepared into the outer scope so the pinned
cache entry stays alive across co_await st->execute(...), closing the
window in which a concurrent cache purge could invalidate the weak
handle.
Fixes SCYLLADB-1173
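A rough coroutine sketch of the hoisting, with stand-in types modeled on the description above rather than the real ScyllaDB declarations (the cache-hit path is elided):

```cpp
#include <seastar/core/future.hh>
#include <seastar/core/coroutine.hh>
#include <memory>

struct prepared_entry { /* the cached prepared statement */ };

struct prepared_msg_stub {                  // models result_message::prepared:
    std::shared_ptr<prepared_entry> pin;    // owns a strong pin on the entry
};

seastar::future<prepared_msg_stub> prepare_stub() {
    return seastar::make_ready_future<prepared_msg_stub>(
        prepared_msg_stub{std::make_shared<prepared_entry>()});
}
seastar::future<> execute_stub(prepared_entry&) { return seastar::make_ready_future<>(); }

seastar::future<> do_execute_sketch() {
    // Hoisted to the outer scope: the pinned message now outlives every
    // co_await below. Before the fix it was scoped to the 'if' block, so
    // after the block only the weak handle remained, and a debug-build
    // preemption point let a concurrent purge drop the entry.
    prepared_msg_stub prepared_msg;
    std::weak_ptr<prepared_entry> ps_ptr;   // stand-in for checked_weak_ptr
    bool cache_miss = true;                 // cache-hit path elided
    if (cache_miss) {
        prepared_msg = co_await prepare_stub();
        ps_ptr = prepared_msg.pin;
    }
    auto entry = ps_ptr.lock();             // still promotable: the pin is alive
    co_await execute_stub(*entry);
}
```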
- Ensure servers[1] is not the topology coordinator before restarting it, preventing the leader death + re-election + re-repair sequence that masked the compaction-merge bug
- Add a retry loop that detects post-restart leadership transfer to servers[1] via direct coordinator query, retrying up to 5 times
Fixes: SCYLLADB-1478
Backporting to 2026.2, which sees the failure regularly.
Closes scylladb/scylladb#29671
* github.com:scylladb/scylladb:
test/cluster/test_incremental_repair: add retry for residual leadership race
test/cluster/test_incremental_repair: fix flaky coordinator-change scenario
The coordinator can receive a schema-change notification after run()
finishes but before stop() unregisters listeners. The test pins that
window with error injections and verifies stop() waits for the refresh
instead of letting it outlive the coordinator.
Test time in dev: 9.51s
Refs SCYLLADB-1728
Commit 2b7aa3211d made schema changes trigger tablet load stats
refreshes in the background. A notification can still arrive after
run() stops the periodic refresher and before the coordinator object
is destroyed.
Move lifecycle subscription cleanup to stop() and join the serialized
refresh there after unregistering refresh trigger sources. This keeps
the coordinator alive until notification-triggered refresh work has
completed.
Fixes SCYLLADB-1728
There is a small race window where Raft leadership could transfer back
to servers[1] between the ensure_group0_leader_on() check and the
actual restart. If this happens, the new coordinator re-initiates
repair and masks the compaction-merge bug.
Extract the core test logic into _do_race_window_promotes_unrepaired_data()
which directly checks get_topology_coordinator() after restart and raises
_LeadershipTransferred if servers[1] became coordinator. The test
function calls this helper in a retry loop (up to 5 attempts).
Refs: SCYLLADB-1478
The test_incremental_repair_race_window_promotes_unrepaired_data test
was flaky because it hardcodes servers[1] as the restart target but did
not ensure servers[1] was NOT the topology coordinator.
When servers[1] happened to be the Raft group0 leader (topology
coordinator), restarting it killed the leader, forced a new election,
and the new coordinator re-initiated tablet repair. This re-repair
flushes memtables on all replicas via take_storage_snapshot() and marks
the resulting sstables as repaired -- causing post-repair keys to appear
in repaired sstables on servers[0] and servers[2]. The test then hit
the wrong assertion (servers[0]/[2] contaminated).
Fix: before starting the repair, check whether servers[1] is the
topology coordinator. If so, move leadership to another server via
ensure_group0_leader_on() so that restarting servers[1] only kills a
follower -- which does not trigger an election or coordinator change.
Reproducibility was confirmed by forcing leadership to servers[1] via
ensure_group0_leader_on() and observing deterministic failure with all
three servers showing post-repair keys in repaired sstables (confirming
the re-repair scenario), then verifying the fix passes reliably.
Fixes: SCYLLADB-1478
test_numeric_rf_to_rack_list_conversion and
test_numeric_rf_to_rack_list_conversion_abort were reading
system_schema.keyspaces from an arbitrary node that may not have
applied the latest schema change yet. Pin the read to a specific node
and issue a read barrier before querying, ensuring the node has
up-to-date data.
The test was reading system_schema.keyspaces from an arbitrary node
that may not have applied the latest schema change yet. Pin the read
to a specific node and issue a read barrier before querying, ensuring
the node has up-to-date data.
Add test_load_balancing_with_dropped_table that simulates the race between
DROP TABLE and the load balancer by capturing a token metadata snapshot
before dropping the table, then passing the stale snapshot to
balance_tablets(). Verifies it completes without aborting and produces no
migrations for the dropped table.
The load balancer's get_schema_and_rs() would trigger on_internal_error when
a table present in the token metadata snapshot had been concurrently dropped
from the live schema. This race is possible because the balancer coroutine
yields between building the candidate list and checking replication
constraints, allowing a DROP TABLE schema mutation to be applied by another
fiber in the meantime.
Change get_schema_and_rs() to return {nullptr, nullptr} for dropped tables
instead of aborting. Update all callers to skip dropped tables:
- make_sizing_plan: continue to next table
- make_resize_plan: continue to next table (merge suppression is moot)
- check_constraints: return skip_info with empty viable targets
- get_rs: return nullptr, checked by check_constraints
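A standalone sketch of the guard and a caller-side skip; the names follow the commit message but the types are stand-ins:

```cpp
#include <map>
#include <memory>
#include <utility>
#include <iostream>

struct schema_stub {};
struct replication_strategy_stub {};
using table_id = int;

// Stand-in for the live schema; the token metadata snapshot may be stale.
std::map<table_id, std::pair<std::shared_ptr<schema_stub>,
                             std::shared_ptr<replication_strategy_stub>>> live_schema;

// Before: on_internal_error() when the snapshot referenced a dropped table.
// After: return {nullptr, nullptr} and let the callers skip it.
auto get_schema_and_rs(table_id id) {
    auto it = live_schema.find(id);
    if (it == live_schema.end()) {
        return std::pair<std::shared_ptr<schema_stub>,
                         std::shared_ptr<replication_strategy_stub>>{nullptr, nullptr};
    }
    return it->second;
}

int main() {
    // The snapshot still lists table 7, but it was dropped while the
    // balancer coroutine yielded between building the candidate list and
    // checking replication constraints.
    auto [schema, rs] = get_schema_and_rs(7);
    (void)rs;
    if (!schema) {
        std::cout << "skip dropped table\n";   // e.g. make_sizing_plan: continue
    }
}
```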
In certain circumstances the current way of collecting tests can be
error-prone: collection can stop when the first file is skipped in a
mode, leaving the rest of the files given on the CLI uncollected.
Another issue is that if a file is specified twice, via its directory
and explicitly, it produces an incorrect CppFile in the stash, causing
a KeyError.
Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1714
Add a node_owner column (locator::host_id) to system.sstables and
make it part of the partition key, so the primary key becomes
PRIMARY KEY ((table_id, node_owner), generation).
This is the first step toward moving the sstables registry into
system_distributed: once distributed, each node's startup scan
must read only the rows it owns, which requires the owning node
to be part of the partition key. Partitioning by (table_id,
node_owner) turns that scan into a single-partition read of
exactly the local node's rows.
The new column is populated via sstables_manager::get_local_host_id().
No backward compatibility is preserved; the feature is experimental
and gated by keyspace-storage-options.
The partition-key column in system.sstables named 'owner' actually
holds a table_id. Rename the CQL column and the matching C++
parameter and member names so the identifier describes what it
stores. No behavior change.
This prepares the schema for an upcoming node_owner partition-key
column (the local host id), which needs a free name.
The cas_contention_timeout_in_ms option is already exposed via the
shared updateable_timeout_config as cas_timeout_in_ms. Read it from
there instead of going through db::config, dropping another use of
database as a db::config proxy.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Pass sharded<updateable_timeout_config>& into alternator::controller
and through to alternator::server, which now stores a reference
instead of constructing its own updateable_timeout_config from
proxy.data_dictionary().get_config(). This removes the last
creator of a per-owner updateable_timeout_config copy and completes
the consolidation onto the single sharded<updateable_timeout_config>
instance built in main.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Pass sharded<updateable_timeout_config>& into cql_transport::controller,
which feeds the shard-local instance as a reference into
cql_server_config::timeout_config. This drops the per-shard local
updateable_timeout_config constructed from db::config inside the
controller's sharded_parameter lambda, replacing it with a reference
into the shared sharded instance.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Drop storage_proxy's own updateable_timeout_config member built from
db::config and take a reference to the shared sharded instance
introduced by the previous patch. Both main and cql_test_env pass
std::ref(timeout_cfg) into storage_proxy::start so each shard's
storage_proxy references its shard-local updateable_timeout_config.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Build a single sharded updateable_timeout_config from db::config in
both main and cql_test_env, sitting next to sharded<cql_config>.
Subsequent patches migrate storage_proxy, the CQL transport controller
and alternator server from their per-owner updateable_timeout_config
copies to references into this shared instance.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
The previous parallel stop of the authenticator and authorizer
was a micro-optimization that obscured the lifecycle invariant
that shutdown should reverse startup.
Refs SCYLLADB-1679
Verify that service::stop() drains the LDAP pruner before
clearing the permission loader. The test installs a slow
permission loader and confirms the pruner is actively
reloading when teardown begins.
Refs SCYLLADB-1679
LDAP role manager starts a pruner fiber that calls
reload_all_permissions() which asserts _permission_loader is set.
The permission loader calls _authorizer->authorize(), so the
authorizer must be started before the loader is set.
Start authorizer, then set the permission loader, then start the
role manager, ensuring both dependencies are satisfied before the
pruner can fire.
Fixes SCYLLADB-1679
service::stop() cleared the permission loader and stopped
the role manager concurrently (via when_all_succeed). The
LDAP pruner could be mid-reload at a yield point when the
loader was set to null, causing it to call a null function.
Stop the role manager first so the pruner is fully drained
before the loader is cleared.
Fixes SCYLLADB-1679
Storage_proxy was reading read_request_timeout_in_ms and
write_request_timeout_in_ms directly from db::config via
database::get_config() at four call sites. Give storage_proxy its own
updateable_timeout_config member (built from db::config the same way
cql transport controller and alternator server do) and use its
read_timeout_in_ms / write_timeout_in_ms observers instead.
Storage_proxy no longer needs database::get_config() for coordinator
timeout values. A later refactor may turn these per-owner copies into
references to a single shared updateable_timeout_config.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
The LDAP role manager's _cache_pruner fiber used
invoke_on_all() to reload permissions on every shard.
Since auth::service::start() runs on all shards in
parallel via invoke_on_all(), the pruner on shard X
could call reload_all_permissions() on shard Y before
shard Y finished start() and set its permission loader,
hitting SCYLLA_ASSERT(_permission_loader). The same
cross-shard race existed during shutdown.
Each shard runs its own pruner instance, so reloading
locally is sufficient — all shards are still covered.
This also removes redundant N-squared reload calls.
Refs SCYLLADB-1679
The free function calculate_view_update_throttling_delay() took the
view_flow_control_delay_limit_in_ms as a parameter, which forced its
two callers (storage_proxy and view_update_generator) to fish the
option out of db::config via database::get_config(). Now that the
option lives on node_update_backlog, make the throttling calculation a
member of node_update_backlog and have the callers invoke it on their
node_update_backlog reference.
This removes two database::get_config() call sites.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Store the view_flow_control_delay_limit_in_ms config option as an
updateable_value on node_update_backlog. The value is threaded from
main.cc into the backlog object at construction time. Existing call
sites (tests) that construct node_update_backlog without the option
continue to work via a default argument.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Pass node_update_backlog explicitly to view_update_generator via its
constructor and start() call. This is plumbing only; no behavior change.
A subsequent patch will use this reference to compute view update
throttling delays without going through database::get_config().
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Maintenance socket connections report a different source address
than regular CQL connections. Make the source field configurable
in the audit test helpers so that upcoming maintenance socket
tests can verify the correct address.
Also fix the syslog backend address parser to handle IPv6
addresses formatted as [ip]:port.
Refs SCYLLADB-1615
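The address-parsing fix boils down to handling the bracketed IPv6 form. A hypothetical standalone helper illustrating the split (not the real parser):

```cpp
#include <string>
#include <utility>
#include <stdexcept>
#include <iostream>

// Splits "ip:port" (IPv4) or "[ip]:port" (IPv6) into host and port.
std::pair<std::string, std::string> split_host_port(const std::string& s) {
    if (!s.empty() && s.front() == '[') {            // IPv6: [::1]:514
        auto close = s.find("]:");
        if (close == std::string::npos) throw std::invalid_argument(s);
        return {s.substr(1, close - 1), s.substr(close + 2)};
    }
    auto colon = s.rfind(':');                       // IPv4: 127.0.0.1:514
    if (colon == std::string::npos) throw std::invalid_argument(s);
    return {s.substr(0, colon), s.substr(colon + 1)};
}

int main() {
    auto [host, port] = split_host_port("[::]:514");
    std::cout << host << ' ' << port << '\n';        // "::" 514, the AF_UNIX case
}
```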
Alternator Streams were experimental until 2026.2, when they became GA.
Stop requiring `--experimental-features=alternator-streams` by:
- Removing ALTERNATOR_STREAMS from the experimental feature enum
- Mapping "alternator-streams" to UNUSED for backward compatibility
- Removing the gating that disabled the ALTERNATOR_STREAMS gossip
feature when the experimental flag was absent
- Removing the runtime guard that rejected StreamSpecification requests
without the feature flag
- Updating config_test to reflect the new UNUSED mapping
The gms::feature alternator_streams is kept for rolling upgrade
compatibility with older nodes.
Fixes SCYLLADB-1680
With the new `min_alive_uuid` saved in the group0 table,
we need to make sure that all new tasks are created with a time uuid
greater than the value saved in `min_alive_uuid`.
This patch introduces the `task_uuid_generator`, which ensures that
when we generate multiple tasks in one group0 command, each task
gets a unique time uuid and each time uuid is greater than
`min_alive_uuid`.
Because we now limit the range we read from the view building
tasks table, we need to make sure that new tasks are created with a
uuid larger than `min_alive_uuid`.
In order to do that, we need to be able to see the current `min_alive_uuid`
while creating new tasks.
When VIEW_BUILDING_TASKS_MIN_TASK_ID feature is active, write min_task_id
alongside the range tombstone in the same Raft batch. min_task_id is set
to min_alive_uuid so subsequent get_view_building_tasks() scans start
exactly at the first alive row, skipping all tombstoned rows.
When all tasks are deleted, min_task_id is set to a freshly generated UUID
to ensure future tasks (which will have larger timeuuids) are not skipped.
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Add set_min_task_id(id) which writes the min_task_id static cell to the main
"view_building" partition. The static cell is written as part of the same
mutation as the range tombstone, keeping everything in one Raft batch.
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Add a min_task_id timeuuid static column to system.view_building_tasks.
When VIEW_BUILDING_TASKS_MIN_TASK_ID feature is active, get_view_building_tasks()
reads min_task_id first using a static-only partition slice (empty _row_ranges +
always_return_static_content). This makes the SSTable reader stop immediately
after the static row before processing any clustering tombstones, so the read
never triggers tombstone_warn_threshold warnings.
min_task_id is then used as AND id >= ? lower bound for the main task scan,
skipping all tombstoned rows below the boundary.
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Instead of issuing one row tombstone per finished task, collect all tasks
to delete, find the smallest timeuuid among alive tasks (min_alive_uuid),
then emit a single range tombstone [before_all, min_alive_uuid) covering
all tasks below that boundary. Tasks above the boundary (rare: finished
task interleaved with alive tasks) still get individual row tombstones.
When no alive tasks remain, del_all_tasks() covers the entire partition
with a single range tombstone.
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Add del_tasks_before(id) which emits a range tombstone [before_all, id)
and del_all_tasks() which covers the entire clustering range. These will
be used by the coordinator to delete finished tasks in bulk instead of
issuing one row tombstone per task.
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
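Conceptually, the bulk deletion replaces per-row tombstones with one range erase. A standalone sketch using an ordered map as a stand-in for the partition's clustering rows:

```cpp
#include <map>
#include <cstdint>
#include <iostream>

using timeuuid = uint64_t;          // stand-in for the real timeuuid type
std::map<timeuuid, bool> tasks;     // task id -> alive?

// Stand-in for del_tasks_before(id): one range tombstone [before_all, id).
void del_tasks_before(timeuuid id) {
    tasks.erase(tasks.begin(), tasks.lower_bound(id));
}

int main() {
    tasks = {{1, false}, {2, false}, {3, true}, {5, false}, {6, true}};
    timeuuid min_alive_uuid = 3;     // smallest timeuuid among alive tasks
    del_tasks_before(min_alive_uuid); // wipes 1 and 2 with one "range tombstone"
    tasks.erase(5);                   // rare interleaved finished task: row tombstone
    for (auto& [id, alive] : tasks) {
        std::cout << id << '\n';      // 3 and 6 remain
    }
}
```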
This feature will be used to gate the use of min_task_id static column
in system.view_building_tasks, which will be added in a subsequent commit.
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Track write and read latency using latency_counter in
coordinator::mutate() and coordinator::query().
Count commit_status_unknown errors in coordinator::mutate().
Count node and shard bounces in redirect_statement(), passing the
coordinator's stats from both modification_statement and
select_statement.
Introduce per-shard metrics infrastructure for strong consistency
operations under the "strong_consistency_coordinator" metrics category.
The stats struct contains latency histograms/summaries for reads and
writes (using timed_rate_moving_average_summary_and_histogram, same as
storage_proxy uses for eventual consistency), and uint64_t counters for
write_status_unknown, node bounces, and shard bounces.
Metrics are registered in the coordinator constructor but are not yet
wired to actual operations — all counters remain at zero.
Avoid duplicate work when unlink() is called more than once on the
same sstable. This happens when a caller invokes unlink() explicitly
on an sstable that is also marked for deletion: the destructor's
close_files() path would otherwise call unlink() again, re-firing
_on_delete, double-counting _stats.on_delete() and double-invoking
_manager.on_unlink().
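A minimal sketch of the idempotence guard, with stand-in names for the sstable internals:

```cpp
#include <iostream>

struct sstable_stub {
    bool _unlinked = false;

    void unlink() {
        if (_unlinked) {
            return;      // second call (e.g. from the destructor's
        }                // close_files() path) is now a no-op
        _unlinked = true;
        std::cout << "fire _on_delete, count stats, notify manager once\n";
    }

    ~sstable_stub() { unlink(); }   // safe even after an explicit unlink()
};

int main() {
    sstable_stub sst;
    sst.unlink();        // explicit unlink by the caller
}                        // destructor path no longer double-counts
```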
Move S3/GCS server classes (S3Server, MinioWrapper, GSFront, GSServer),
factory functions (create_s3_server, create_gs_server), CQL helpers
(format_tuples, keyspace_options), bucket naming (_make_bucket_name),
and the s3_server fixture from test/cluster/object_store/conftest.py
into a shared module at test/pylib/object_storage.py.
The conftest.py is now a thin wrapper that re-exports symbols and
defines only the fixtures specific to the object_store suite
(object_storage, s3_storage). All external importers are updated.
Old class names (S3_Server, GSServer) are kept as aliases for
backward compatibility.
Create a unique S3/GCS bucket for each test function using the pytest
test name (from request.node.name), sanitized into a valid bucket name.
This ensures tests do not share state through a common bucket and makes
bucket names meaningful for debugging (e.g. test-basic-s3-a1b2c3d4).
Each fixture now calls create_test_bucket() on setup and
destroy_test_bucket() on teardown.
Add a client::make overload that accepts a custom retry strategy,
allowing callers to override the default exponential backoff.
Use this in s3_test.cc with a test_retry_strategy that sleeps only
1ms between retries instead of exponential backoff, significantly
reducing test runtime for tests that encounter transient errors
during bucket creation/deletion.
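A standalone sketch of the overload pattern; the real s3::client and retry-strategy interfaces differ, so every name here is a stand-in:

```cpp
#include <algorithm>
#include <chrono>
#include <memory>

struct retry_strategy {
    virtual std::chrono::milliseconds delay(unsigned attempt) const = 0;
    virtual ~retry_strategy() = default;
};

struct exponential_backoff final : retry_strategy {
    std::chrono::milliseconds delay(unsigned attempt) const override {
        return std::chrono::milliseconds(100u << std::min(attempt, 6u));
    }
};

struct test_retry_strategy final : retry_strategy {   // the s3_test.cc idea
    std::chrono::milliseconds delay(unsigned) const override {
        return std::chrono::milliseconds(1);          // retry almost immediately
    }
};

struct client {
    std::shared_ptr<retry_strategy> _retry;
    // Existing factory keeps the default backoff...
    static client make() { return client{std::make_shared<exponential_backoff>()}; }
    // ...and the new overload lets callers substitute their own strategy.
    static client make(std::shared_ptr<retry_strategy> rs) { return client{std::move(rs)}; }
};

int main() {
    auto c = client::make(std::make_shared<test_retry_strategy>());
    (void)c;
}
```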
Add s3_test_fixture, an RAII class that creates a unique S3 bucket
on construction and tears down everything (delete all objects, delete
bucket, close client) on destruction. Bucket names are derived from
the Boost test name, pid, and a counter to guarantee uniqueness
across concurrent test processes. Names are sanitized to comply with
S3 bucket naming rules (lowercase, hyphens, 3-63 chars).
Migrate all S3 tests that create objects to use the fixture, removing
manual bucket name construction, deferred_delete_object cleanup, and
per-test deferred_close calls. The fixture owns the client lifecycle.
Tests with special semaphore requirements (broken semaphore for
fallback test, small semaphore for abort test, 1MiB for memory
test) create the fixture with a separate normal-sized semaphore and
use their own constrained client for the test operation.
The upload_file tests are converted from SEASTAR_TEST_CASE
(coroutine) to SEASTAR_THREAD_TEST_CASE since the fixture requires
thread context for .get() calls.
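The RAII shape, sketched standalone with the S3 calls simulated by prints; sanitize() and the naming scheme follow the description above:

```cpp
#include <cctype>
#include <cstdio>
#include <string>
#include <unistd.h>

struct s3_fixture_sketch {
    std::string bucket;

    explicit s3_fixture_sketch(const std::string& test_name)
        // test name + pid keeps names unique across concurrent processes
        // (the real fixture also appends a counter)
        : bucket(sanitize(test_name) + "-" + std::to_string(::getpid())) {
        std::printf("create bucket %s\n", bucket.c_str());
    }

    ~s3_fixture_sketch() {
        // teardown always runs, even if the test body throws:
        std::printf("delete all objects, delete bucket, close client\n");
    }

    static std::string sanitize(std::string s) {   // S3 rules: lowercase, '-'
        for (char& c : s) {
            c = std::isalnum(static_cast<unsigned char>(c))
                ? static_cast<char>(std::tolower(static_cast<unsigned char>(c)))
                : '-';
        }
        return s;
    }
};

int main() {
    s3_fixture_sketch fx("Test_Basic_S3");
    // ... test body uses fx.bucket ...
}   // fixture destructor tears everything down
```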
Broaden the minio policy to allow the test user to create and delete
arbitrary buckets (s3:CreateBucket, s3:DeleteBucket, s3:ListAllMyBuckets
on arn:aws:s3:::*), and operate on objects in any bucket.
Add create_bucket (PUT /<bucket>) and delete_bucket (DELETE /<bucket>)
methods to s3::client, following the same make_request pattern used by
existing object operations.
These will be used by the test infrastructure to create per-test
isolated buckets.
They are used only to prevent permission changes, but since the tables are
unused even if they exist, there is no problem changing their permissions,
so there is no point keeping the definitions just for that.
setup_group0 and setup_group0_if_exist have a hidden condition inside that
makes them a no-op. It is not clear at the call site that these functions may
do nothing. Change the code to check the conditions at the call site
instead.
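A before/after sketch of moving the condition to the call site (the function names and the condition itself are hypothetical):

```cpp
#include <iostream>

bool group0_enabled = false;

// Before: the guard hidden inside made the call a silent no-op.
void setup_group0_old() {
    if (!group0_enabled) return;   // invisible at the call site
    std::cout << "setting up group0\n";
}

// After: the function always does its job...
void setup_group0() { std::cout << "setting up group0\n"; }

int main() {
    setup_group0_old();            // silently does nothing; surprising
    if (group0_enabled) {          // ..."may do nothing" is now explicit here
        setup_group0();
    }
}
```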
Defaults to 0. When N > 0, adds a map<blob, blob> collection column to
the schema. Each row will have a collection cell with N elements.
Allows benchmarking collection handling.
This cuts back on the number of allocations required for deserializing
collections, from O(num_cells) to O(1).
The visitor now receives an rvalue, so update all callers of
read_and_visit_row(), patching their visitors to take advantage of this
and move the serialized collection instead of copying it.
Reads a collection_mutation directly from the IDL representation of a
collection. This cuts down the number of allocations required
drastically compared to the current method of:
IDL -> collection_mutation_description -> collection_mutation
Intended to be used in frozen_mutation::unfreeze() and similar use-cases.
atomic_cell_type has various static make_*() methods which create a
serialized cell based on the parameters. This patch adds write_*()
methods which mirror the existing make_*() ones, with the exception that
the write methods write into caller-provided buffer. The make methods
are refactored to call the appropriate write overload.
*_serialized_size() methods are added as well, to calculate how many
bytes the serialized data will take after the appropriate write call.
This allows code to write cells directly into a pre-arranged buffer,
perhaps even multiple cells into the same buffer.
Since the intended use-case this patch prepares for is serializing an
entire collection directly into a single buffer, only make variants
which are legal in collections are handled. I.e. counters are not.
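A standalone sketch of the make/write split; the cell layout here is invented, and only the size-then-write-into-caller-buffer pattern mirrors the commit:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

using bytes = std::vector<uint8_t>;

// *_serialized_size(): how many bytes the matching write call will emit.
size_t live_cell_serialized_size(size_t value_size) {
    return sizeof(int64_t) + value_size;             // timestamp + value
}

// write_*(): serialize into a caller-provided buffer, return the end position.
uint8_t* write_live_cell(uint8_t* out, int64_t ts, const bytes& v) {
    std::memcpy(out, &ts, sizeof(ts));
    std::memcpy(out + sizeof(ts), v.data(), v.size());
    return out + sizeof(ts) + v.size();
}

// make_*(): refactored to allocate and delegate to the write overload.
bytes make_live_cell(int64_t ts, const bytes& v) {
    bytes buf(live_cell_serialized_size(v.size()));
    write_live_cell(buf.data(), ts, v);
    return buf;
}

int main() {
    // Several cells into one pre-arranged buffer (the collection use-case):
    bytes a{1, 2}, b{3};
    bytes buf(live_cell_serialized_size(a.size()) + live_cell_serialized_size(b.size()));
    uint8_t* p = write_live_cell(buf.data(), 10, a);
    write_live_cell(p, 11, b);
}
```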
This is already a template on Iterator, but generalize it further by
adding an Adaptor template which adapts the Iterator::value_type to the
requirements of the method. This allows passing Iterators with
value_type other than atomic_cell[_view].
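A standalone sketch of the Adaptor generalization with stand-in types:

```cpp
#include <iostream>
#include <utility>
#include <vector>

struct atomic_cell_stub { int v; };

struct identity_adaptor {     // value_type is already a cell
    const atomic_cell_stub& operator()(const atomic_cell_stub& c) const { return c; }
};
struct pair_adaptor {         // adapts pair<key, cell> iterators to plain cells
    const atomic_cell_stub& operator()(const std::pair<int, atomic_cell_stub>& p) const {
        return p.second;
    }
};

template <typename Iterator, typename Adaptor>
int sum_cells(Iterator b, Iterator e, Adaptor adapt) {
    int total = 0;
    for (; b != e; ++b) {
        total += adapt(*b).v;   // value_type no longer has to be a cell itself
    }
    return total;
}

int main() {
    std::vector<atomic_cell_stub> plain{{10}, {32}};
    std::vector<std::pair<int, atomic_cell_stub>> kv{{1, {10}}, {2, {32}}};
    std::cout << sum_cells(plain.begin(), plain.end(), identity_adaptor{}) << '\n'; // 42
    std::cout << sum_cells(kv.begin(), kv.end(), pair_adaptor{}) << '\n';           // 42
}
```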
Instead of collection_mutation_view. This follows suit with the atomic_cell
overloads, which already accept a value, allowing the caller to move the
value along. The current interface forces collections to be copied.
throw api_error::internal(fmt::format("The operation was successful, but unable to confirm cluster-wide schema agreement after {} seconds. Please retry the operation, and wait for the retry to report an error since the operation was already done.", schema_agreement_seconds));
"summary":"Starts copying SSTables from a designated bucket in object storage to a specified keyspace",
"type":"string",
"nickname":"tablet_aware_restore",
"produces":[
"application/json"
],
"parameters":[
{
"name":"keyspace",
"description":"Name of a keyspace to copy SSTables to",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"table",
"description":"Name of a table to copy SSTables to",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"snapshot",
"description":"Name of the snapshot to restore from",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"backup_location",
"description":"JSON array of backup location objects. Each object must contain: 'datacenter' (string), 'endpoint' (string), 'bucket' (string), and 'manifests' (array of strings). Currently, the array must contain exactly one entry.",
sm::description("Holds the sum of normalized compaction backlog for all tables in the system. Backlog is normalized by dividing backlog by shard's available memory.")),
, enable_shard_aware_drivers(this, "enable_shard_aware_drivers", value_status::Used, true, "Enable native transport drivers to use connection-per-shard for better performance.")
, abort_on_internal_error(this, "abort_on_internal_error", liveness::LiveUpdate, value_status::Used, false, "Abort the server instead of throwing exception when internal invariants are violated.")
"Abort the server and generate a coredump instead of throwing an exception when any sstable parse error is detected (malformed_sstable_exception, bufsize_mismatch_exception, parse_assert() failures, or BTI parse errors). Intended for debugging memory corruption that may manifest as sstable corruption. Defaults to true in debug and dev builds.")
static const sstring update_query = format("UPDATE {}.{} USING TTL {} SET downloaded = ? WHERE snapshot_name = ? AND \"keyspace\" = ? AND \"table\" = ? AND "
"datacenter = ? AND rack = ? AND first_token = ? AND sstable_id = ?",
on_internal_error(slogger, fmt::format("view_id is not set for build_range task with id: {}", task.id));
}
view_id = data_value(task.view_id->uuid());
if (min_task_id) {
static const sstring bounded_query = format("SELECT id, type, aborted, base_id, view_id, last_token, host_id, shard FROM {}.{} WHERE key = '{}' AND id >= ?",
There are a few system tables that object storage related code needs to touch in order to operate.
* [system_distributed.snapshot_sstables](docs/dev/snapshot_sstables.md) - Used during restore by worker nodes to get the list of SSTables that need to be downloaded from object storage and restored locally.
* [system.sstables](docs/dev/system_keyspace.md#systemsstables) - Used to keep track of SSTables on object storage when a keyspace is created with object storage storage_options.
# Manipulating S3 data
This section intends to give an overview of where, when and how we store data in S3 and provide a quick set of commands
To only mark the node as permanently down without doing actual removal, use :doc:`nodetool excludenode </operating-scylla/nodetool-commands/excludenode>`:
Nodes in the cluster finished streaming data to the new node:
-- Address Load Tokens Owns (effective) Host ID Rack
UN 192.168.1.201 112.82 KB 256 32.7% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c B1
UN 192.168.1.202 91.11 KB 256 32.9% 125ed9f4-7777-1dbn-mac8-43fddce9123e B1
UN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-ca08-43fddce952de B1
#. When the new node status is Up Normal (UN), run the :doc:`nodetool cleanup </operating-scylla/nodetool-commands/cleanup>` command on all nodes in the cluster except for the new node that has just been added. Cleanup removes keys that were streamed to the newly added node and are no longer owned by the node.
-- Address Load Tokens Owns (effective) Host ID Rack
UN 192.168.1.201 112.82 KB 256 32.7% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c B1
UN 192.168.1.202 91.11 KB 256 32.9% 125ed9f4-7777-1dbn-mac8-43fddce9123e B1
UN 192.168.1.203 124.42 KB 256 32.6% 675ed9f4-6564-6dbd-ca08-43fddce952de B1
#. If the node status is **Up Normal (UN)**, run the :doc:`nodetool decommission </operating-scylla/nodetool-commands/decommission>` command
to remove the node you are connected to. Using ``nodetool decommission`` is the recommended method for cluster scale-down operations. It prevents data loss
* `Metrics Update Between 2025.3 and 2025.4 <https://docs.scylladb.com/manual/branch-2025.4/upgrade/upgrade-guides/upgrade-guide-from-2025.x-to-2025.4/metric-update-2025.x-to-2025.4.html>`_
* `Metrics Update Between 2025.2 and 2025.3 <https://docs.scylladb.com/manual/branch-2025.3/upgrade/upgrade-guides/upgrade-guide-from-2025.2-to-2025.3/metric-update-2025.2-to-2025.3.html>`_
* `Metrics Update Between 2025.1 and 2025.2 <https://docs.scylladb.com/manual/branch-2025.2/upgrade/upgrade-guides/upgrade-guide-from-2025.1-to-2025.2/metric-update-2025.1-to-2025.2.html>`_