Store active transitions count in load_balancer instance

Move _active_transitions from migration_plan to load_balancer class as a member variable. This is cleaner since there are no concurrent make_plan() calls on the same load_balancer instance. Changes: - Removed _active_transitions field from migration_plan - Added _active_transitions member to load_balancer class - Updated make_plan() to reset counter at the start - Log messages now reference _active_transitions directly Co-authored-by: tgrabiec <283695+tgrabiec@users.noreply.github.com>
Add active tablet transition count to load balancer logs
2026-01-08 13:38:27 +00:00 · 2026-01-07 18:51:18 +00:00 · 2026-01-07 18:12:46 +00:00 · 2026-01-07 12:31:21 +01:00 · 2026-01-07 11:49:01 +01:00 · 2026-01-06 17:47:09 +02:00
265 changed files with 9513 additions and 1927 deletions
--- a/.github/workflows/call_jira_status_in_progress.yml
+++ b/.github/workflows/call_jira_status_in_progress.yml
@@ -1,12 +0,0 @@
-name: Call Jira Status In Progress
-
-on:
-  pull_request_target:
-    types: [opened]
-
-jobs:
-  call-jira-status-in-progress:
-    uses: scylladb/github-automation/.github/workflows/main_update_jira_status_to_in_progress.yml@main
-    secrets:
-      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
-
--- a/.github/workflows/call_jira_status_in_review.yml
+++ b/.github/workflows/call_jira_status_in_review.yml
@@ -1,12 +0,0 @@
-name: Call Jira Status In Review
-
-on:
-  pull_request_target:
-    types: [ready_for_review, review_requested]
-
-jobs:
-  call-jira-status-in-review:
-    uses: scylladb/github-automation/.github/workflows/main_update_jira_status_to_in_review.yml@main
-    secrets:
-      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
-
--- a/.github/workflows/call_jira_status_ready_for_merge.yml
+++ b/.github/workflows/call_jira_status_ready_for_merge.yml
@@ -1,12 +0,0 @@
-name: Call Jira Status Ready For Merge
-
-on:
-  pull_request_target:
-    types: [labeled]
-
-jobs:
-  call-jira-status-update:
-    uses: scylladb/github-automation/.github/workflows/main_update_jira_status_to_ready_for_merge.yml@main
-    secrets:
-      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
-
--- a/.github/workflows/call_jira_sync.yml
+++ b/.github/workflows/call_jira_sync.yml
@@ -0,0 +1,41 @@
+name: Sync Jira Based on PR Events
+
+on:
+  pull_request_target:
+    types: [opened, ready_for_review, review_requested, labeled, unlabeled, closed]
+
+permissions:
+  contents: read
+  pull-requests: write
+  issues: write
+
+jobs:
+  jira-sync-pr-opened:
+    if: github.event.action == 'opened'
+    uses: scylladb/github-automation/.github/workflows/main_jira_sync_pr_opened.yml@main
+    secrets:
+      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
+
+  jira-sync-in-review:
+    if: github.event.action == 'ready_for_review' || github.event.action == 'review_requested'
+    uses: scylladb/github-automation/.github/workflows/main_jira_sync_in_review.yml@main
+    secrets:
+      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
+
+  jira-sync-add-label:
+    if: github.event.action == 'labeled'
+    uses: scylladb/github-automation/.github/workflows/main_jira_sync_add_label.yml@main
+    secrets:
+      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
+
+  jira-status-remove-label:
+    if: github.event.action == 'unlabeled'
+    uses: scylladb/github-automation/.github/workflows/main_jira_sync_remove_label.yml@main
+    secrets:
+      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
+
+  jira-status-pr-closed:
+    if: github.event.action == 'closed' 
+    uses: scylladb/github-automation/.github/workflows/main_jira_sync_pr_closed.yml@main
+    secrets:
+      caller_jira_auth: ${{ secrets.USER_AND_KEY_FOR_JIRA_AUTOMATION }}
--- a/.github/workflows/call_validate_pr_author_email.yml
+++ b/.github/workflows/call_validate_pr_author_email.yml
@@ -0,0 +1,13 @@
+name: validate_pr_author_email
+
+on:
+  pull_request_target:
+    types:
+      - opened
+      - synchronize
+      - reopened
+
+jobs:
+  validate_pr_author_email:
+    uses: scylladb/github-automation/.github/workflows/validate_pr_author_email.yml@main
+
--- a/.github/workflows/codespell.yaml
+++ b/.github/workflows/codespell.yaml
@@ -13,5 +13,5 @@ jobs:
      - uses: codespell-project/actions-codespell@master
        with:
          only_warn: 1
-          ignore_words_list: "ans,datas,fo,ser,ue,crate,nd,reenable,strat,stap,te,raison"
+          ignore_words_list: "ans,datas,fo,ser,ue,crate,nd,reenable,strat,stap,te,raison,iif,tread"
          skip: "./.git,./build,./tools,*.js,*.lock,./test,./licenses,./redis/lolwut.cc,*.svg"
--- a/alternator/CMakeLists.txt
+++ b/alternator/CMakeLists.txt
@@ -18,6 +18,7 @@ target_sources(alternator
    consumed_capacity.cc
    ttl.cc
    parsed_expression_cache.cc
+    http_compression.cc
    ${cql_grammar_srcs})
 target_include_directories(alternator
  PUBLIC
--- a/alternator/controller.cc
+++ b/alternator/controller.cc
@@ -28,6 +28,7 @@ static logging::logger logger("alternator_controller");
 controller::controller(
        sharded<gms::gossiper>& gossiper,
        sharded<service::storage_proxy>& proxy,
+        sharded<service::storage_service>& ss,
        sharded<service::migration_manager>& mm,
        sharded<db::system_distributed_keyspace>& sys_dist_ks,
        sharded<cdc::generation_service>& cdc_gen_svc,
@@ -39,6 +40,7 @@ controller::controller(
    : protocol_server(sg)
    , _gossiper(gossiper)
    , _proxy(proxy)
+    , _ss(ss)
    , _mm(mm)
    , _sys_dist_ks(sys_dist_ks)
    , _cdc_gen_svc(cdc_gen_svc)
@@ -89,7 +91,7 @@ future<> controller::start_server() {
        auto get_timeout_in_ms = [] (const db::config& cfg) -> utils::updateable_value<uint32_t> {
            return cfg.alternator_timeout_in_ms;
        };
-        _executor.start(std::ref(_gossiper), std::ref(_proxy), std::ref(_mm), std::ref(_sys_dist_ks),
+        _executor.start(std::ref(_gossiper), std::ref(_proxy), std::ref(_ss), std::ref(_mm), std::ref(_sys_dist_ks),
                        sharded_parameter(get_cdc_metadata, std::ref(_cdc_gen_svc)), _ssg.value(),
                        sharded_parameter(get_timeout_in_ms, std::ref(_config))).get();
        _server.start(std::ref(_executor), std::ref(_proxy), std::ref(_gossiper), std::ref(_auth_service), std::ref(_sl_controller)).get();
@@ -169,7 +171,7 @@ future<> controller::request_stop_server() {
    });
 }

-future<utils::chunked_vector<client_data>> controller::get_client_data() {
+future<utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>> controller::get_client_data() {
    return _server.local().get_client_data();
 }

--- a/alternator/controller.hh
+++ b/alternator/controller.hh
@@ -15,6 +15,7 @@

 namespace service {
 class storage_proxy;
+class storage_service;
 class migration_manager;
 class memory_limiter;
 }
@@ -57,6 +58,7 @@ class server;
 class controller : public protocol_server {
    sharded<gms::gossiper>& _gossiper;
    sharded<service::storage_proxy>& _proxy;
+    sharded<service::storage_service>& _ss;
    sharded<service::migration_manager>& _mm;
    sharded<db::system_distributed_keyspace>& _sys_dist_ks;
    sharded<cdc::generation_service>& _cdc_gen_svc;
@@ -74,6 +76,7 @@ public:
    controller(
        sharded<gms::gossiper>& gossiper,
        sharded<service::storage_proxy>& proxy,
+        sharded<service::storage_service>& ss,
        sharded<service::migration_manager>& mm,
        sharded<db::system_distributed_keyspace>& sys_dist_ks,
        sharded<cdc::generation_service>& cdc_gen_svc,
@@ -93,7 +96,7 @@ public:
    // This virtual function is called (on each shard separately) when the
    // virtual table "system.clients" is read. It is expected to generate a
    // list of clients connected to this server (on this shard).
-    virtual future<utils::chunked_vector<client_data>> get_client_data() override;
+    virtual future<utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>> get_client_data() override;
 };

 }
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -67,6 +67,14 @@ using namespace std::chrono_literals;

 logging::logger elogger("alternator-executor");

+namespace std {
+    template <> struct hash<std::pair<sstring, sstring>> {
+        size_t operator () (const std::pair<sstring, sstring>& p) const {
+            return std::hash<sstring>()(p.first) * 1009 + std::hash<sstring>()(p.second) * 3;
+        }
+    };
+}
+
 namespace alternator {

 // Alternator-specific table properties stored as hidden table tags:
@@ -248,14 +256,66 @@ static const rjson::value::Member& get_single_member(const rjson::value& v, cons
    return *(v.MemberBegin());
 }

+class executor::describe_table_info_manager : public service::migration_listener::empty_listener {
+    executor &_executor;
+
+    struct table_info {
+        utils::simple_value_with_expiry<std::uint64_t> size_in_bytes;
+    };
+    std::unordered_map<std::pair<sstring, sstring>, table_info> info_for_tables;
+    bool active = false;
+
+public:
+    describe_table_info_manager(executor& executor) : _executor(executor) {
+        _executor._proxy.data_dictionary().real_database_ptr()->get_notifier().register_listener(this);
+        active = true;
+    }
+    describe_table_info_manager(const describe_table_info_manager &) = delete;
+    describe_table_info_manager(describe_table_info_manager&&) = delete;
+    ~describe_table_info_manager() {
+        if (active) {
+            on_fatal_internal_error(elogger, "describe_table_info_manager was not stopped before destruction");
+        }
+    }
+
+    describe_table_info_manager &operator = (const describe_table_info_manager &) = delete;
+    describe_table_info_manager &operator = (describe_table_info_manager&&) = delete;
+
+    static std::chrono::high_resolution_clock::time_point now() {
+        return std::chrono::high_resolution_clock::now();
+    }
+
+    std::optional<std::uint64_t> get_cached_size_in_bytes(const sstring &ks_name, const sstring &cf_name) const {
+        auto it = info_for_tables.find({ks_name, cf_name});
+        if (it != info_for_tables.end()) {
+            return it->second.size_in_bytes.get();
+        }
+        return std::nullopt;
+    }
+    void cache_size_in_bytes(sstring ks_name, sstring cf_name, std::uint64_t size_in_bytes, std::chrono::high_resolution_clock::time_point expiry) {
+        info_for_tables[{std::move(ks_name), std::move(cf_name)}].size_in_bytes.set_if_longer_expiry(size_in_bytes, expiry);
+    }
+    future<> stop() {
+        co_await _executor._proxy.data_dictionary().real_database_ptr()->get_notifier().unregister_listener(this);
+        active = false;
+        co_return;
+    }
+    void on_drop_column_family(const sstring& ks_name, const sstring& cf_name) override {
+        if (!ks_name.starts_with(executor::KEYSPACE_NAME_PREFIX)) return;
+        info_for_tables.erase({ks_name, cf_name});
+    }
+};
+
 executor::executor(gms::gossiper& gossiper,
         service::storage_proxy& proxy,
+         service::storage_service& ss,
         service::migration_manager& mm,
         db::system_distributed_keyspace& sdks,
         cdc::metadata& cdc_metadata,
         smp_service_group ssg,
         utils::updateable_value<uint32_t> default_timeout_in_ms)
    : _gossiper(gossiper),
+      _ss(ss),
      _proxy(proxy),
      _mm(mm),
      _sdks(sdks),
@@ -268,6 +328,7 @@ executor::executor(gms::gossiper& gossiper,
        _stats))
 {
    s_default_timeout_in_ms = std::move(default_timeout_in_ms);
+    _describe_table_info_manager = std::make_unique<describe_table_info_manager>(*this);
    register_metrics(_metrics, _stats);
 }

@@ -752,12 +813,44 @@ static future<bool> is_view_built(

 }

-static future<rjson::value> fill_table_description(schema_ptr schema, table_status tbl_status, service::storage_proxy& proxy, service::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit)
+future<> executor::cache_newly_calculated_size_on_all_shards(schema_ptr schema, std::uint64_t size_in_bytes, std::chrono::nanoseconds ttl) {
+    auto expiry = describe_table_info_manager::now() + ttl;
+    return container().invoke_on_all(
+        [schema, size_in_bytes, expiry] (executor& exec) {
+            exec._describe_table_info_manager->cache_size_in_bytes(schema->ks_name(), schema->cf_name(), size_in_bytes, expiry);
+        });
+}
+
+future<> executor::fill_table_size(rjson::value &table_description, schema_ptr schema, bool deleting) {
+    auto cached_size = _describe_table_info_manager->get_cached_size_in_bytes(schema->ks_name(), schema->cf_name());
+    std::uint64_t total_size = 0;
+    if (cached_size) {
+        total_size = *cached_size;
+    } else {
+        // There is no point in estimating the size of a table that is being deleted: more often than not, other nodes
+        // will move forward with the deletion before we finish calculating the size.
+        if (!deleting) {
+            total_size = co_await _ss.estimate_total_sstable_volume(schema->id(), service::storage_service::ignore_errors::yes);
+            const auto expiry = std::chrono::seconds{ _proxy.data_dictionary().get_config().alternator_describe_table_info_cache_validity_in_seconds() };
+            // Note: we don't care when the notification of the other shards finishes, as long as it eventually happens.
+            // A race is possible: the next DescribeTable may arrive on another shard that doesn't have the size yet,
+            // so that shard will calculate it again. This is not a problem, because it will in turn call cache_newly_calculated_size_on_all_shards
+            // with its own expiry, which is extremely unlikely to be exactly the same as the previous one; all shards will keep the size whose expiry is further in the future.
+            // If the expiries are the same, some shards will hold a different size, so DescribeTable will return different values depending on the shard,
+            // which is also fine, as the specification doesn't give precision guarantees of any kind.
+            co_await cache_newly_calculated_size_on_all_shards(schema, total_size, expiry);
+        }
+    }
+    rjson::add(table_description, "TableSizeBytes", total_size);
+}
+
+future<rjson::value> executor::fill_table_description(schema_ptr schema, table_status tbl_status, service::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit)
 {
    rjson::value table_description = rjson::empty_object();
    auto tags_ptr = db::get_tags_of_table(schema);

    rjson::add(table_description, "TableName", rjson::from_string(schema->cf_name()));
+    co_await fill_table_size(table_description, schema, tbl_status == table_status::deleting);

    auto creation_timestamp = get_table_creation_time(*schema);

@@ -801,9 +894,7 @@ static future<rjson::value> fill_table_description(schema_ptr schema, table_stat
    rjson::add(table_description["ProvisionedThroughput"], "WriteCapacityUnits", wcu);
    rjson::add(table_description["ProvisionedThroughput"], "NumberOfDecreasesToday", 0);

-
-
-    data_dictionary::table t = proxy.data_dictionary().find_column_family(schema);
+    data_dictionary::table t = _proxy.data_dictionary().find_column_family(schema);

    if (tbl_status != table_status::deleting) {
        rjson::add(table_description, "CreationDateTime", rjson::value(creation_timestamp));
@@ -840,7 +931,7 @@ static future<rjson::value> fill_table_description(schema_ptr schema, table_stat
                // (for a built view) or CREATING+Backfilling (if view building
                // is in progress).
                if (!is_lsi) {
-                    if (co_await is_view_built(vptr, proxy, client_state, trace_state, permit)) {
+                    if (co_await is_view_built(vptr, _proxy, client_state, trace_state, permit)) {
                        rjson::add(view_entry, "IndexStatus", "ACTIVE");
                    } else {
                        rjson::add(view_entry, "IndexStatus", "CREATING");
@@ -868,9 +959,8 @@ static future<rjson::value> fill_table_description(schema_ptr schema, table_stat
        }
        rjson::add(table_description, "AttributeDefinitions", std::move(attribute_definitions));
    }
-    executor::supplement_table_stream_info(table_description, *schema, proxy);
+    executor::supplement_table_stream_info(table_description, *schema, _proxy);

-    // FIXME: still missing some response fields (issue #5026)
    co_return table_description;
 }

@@ -890,7 +980,7 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
    get_stats_from_schema(_proxy, *schema)->api_operations.describe_table++;
    tracing::add_alternator_table_name(trace_state, schema->cf_name());

-    rjson::value table_description = co_await fill_table_description(schema, table_status::active, _proxy, client_state, trace_state, permit);
+    rjson::value table_description = co_await fill_table_description(schema, table_status::active, client_state, trace_state, permit);
    rjson::value response = rjson::empty_object();
    rjson::add(response, "Table", std::move(table_description));
    elogger.trace("returning {}", response);
@@ -993,7 +1083,7 @@ future<executor::request_return_type> executor::delete_table(client_state& clien
    auto& p = _proxy.container();

    schema_ptr schema = get_table(_proxy, request);
-    rjson::value table_description = co_await fill_table_description(schema, table_status::deleting, _proxy, client_state, trace_state, permit);
+    rjson::value table_description = co_await fill_table_description(schema, table_status::deleting, client_state, trace_state, permit);
    co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::DROP, _stats);
    co_await _mm.container().invoke_on(0, [&, cs = client_state.move_to_other_shard()] (service::migration_manager& mm) -> future<> {
        size_t retries = mm.get_concurrent_ddl_retries();
@@ -1557,8 +1647,7 @@ static future<> mark_view_schemas_as_built(utils::chunked_vector<mutation>& out,
    }
 }

-static future<executor::request_return_type> create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request,
-            service::storage_proxy& sp, service::migration_manager& mm, gms::gossiper& gossiper, bool enforce_authorization, bool warn_authorization, stats& stats, const db::tablets_mode_t::mode tablets_mode) {
+future<executor::request_return_type> executor::create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request, bool enforce_authorization, bool warn_authorization, const db::tablets_mode_t::mode tablets_mode) {
    SCYLLA_ASSERT(this_shard_id() == 0);

    // We begin by parsing and validating the content of the CreateTable
@@ -1745,7 +1834,7 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli

    rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
    if (stream_specification && stream_specification->IsObject()) {
-        if (executor::add_stream_options(*stream_specification, builder, sp)) {
+        if (executor::add_stream_options(*stream_specification, builder, _proxy)) {
            validate_cdc_log_name_length(builder.cf_name());
        }
    }
@@ -1764,7 +1853,7 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
    set_table_creation_time(tags_map, db_clock::now());
    builder.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>(tags_map));

-    co_await verify_create_permission(enforce_authorization, warn_authorization, client_state, stats);
+    co_await verify_create_permission(enforce_authorization, warn_authorization, client_state, _stats);

    schema_ptr schema = builder.build();
    for (auto& view_builder : view_builders) {
@@ -1780,18 +1869,18 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
        view_builder.with_view_info(schema, include_all_columns, ""/*where clause*/);
    }

-    size_t retries = mm.get_concurrent_ddl_retries();
+    size_t retries = _mm.get_concurrent_ddl_retries();
    for (;;) {
-        auto group0_guard = co_await mm.start_group0_operation();
+        auto group0_guard = co_await _mm.start_group0_operation();
        auto ts = group0_guard.write_timestamp();
        utils::chunked_vector<mutation> schema_mutations;
-        auto ksm = create_keyspace_metadata(keyspace_name, sp, gossiper, ts, tags_map, sp.features(), tablets_mode);
+        auto ksm = create_keyspace_metadata(keyspace_name, _proxy, _gossiper, ts, tags_map, _proxy.features(), tablets_mode);
        // Alternator Streams doesn't yet work when the table uses tablets (#23838)
        if (stream_specification && stream_specification->IsObject()) {
            auto stream_enabled = rjson::find(*stream_specification, "StreamEnabled");
            if (stream_enabled && stream_enabled->IsBool() && stream_enabled->GetBool()) {
                locator::replication_strategy_params params(ksm->strategy_options(), ksm->initial_tablets(), ksm->consistency_option());
-                const auto& topo = sp.local_db().get_token_metadata().get_topology();
+                const auto& topo = _proxy.local_db().get_token_metadata().get_topology();
                auto rs = locator::abstract_replication_strategy::create_replication_strategy(ksm->strategy_name(), params, topo);
                if (rs->uses_tablets()) {
                    co_return api_error::validation("Streams not yet supported on a table using tablets (issue #23838). "
@@ -1801,17 +1890,17 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
        }
        // Creating an index in tablets mode requires the rf_rack_valid_keyspaces option to be enabled.
        // GSI and LSI indexes are based on materialized views which require this option to avoid consistency issues.
-        if (!view_builders.empty() && ksm->uses_tablets() && !sp.data_dictionary().get_config().rf_rack_valid_keyspaces()) {
+        if (!view_builders.empty() && ksm->uses_tablets() && !_proxy.data_dictionary().get_config().rf_rack_valid_keyspaces()) {
            co_return api_error::validation("GlobalSecondaryIndexes and LocalSecondaryIndexes with tablets require the rf_rack_valid_keyspaces option to be enabled.");
        }
        try {
-            schema_mutations = service::prepare_new_keyspace_announcement(sp.local_db(), ksm, ts);
+            schema_mutations = service::prepare_new_keyspace_announcement(_proxy.local_db(), ksm, ts);
        } catch (exceptions::already_exists_exception&) {
-            if (sp.data_dictionary().has_schema(keyspace_name, table_name)) {
+            if (_proxy.data_dictionary().has_schema(keyspace_name, table_name)) {
                co_return api_error::resource_in_use(fmt::format("Table {} already exists", table_name));
            }
        }
-        if (sp.data_dictionary().try_find_table(schema->id())) {
+        if (_proxy.data_dictionary().try_find_table(schema->id())) {
            // This should never happen, the ID is supposed to be unique
            co_return api_error::internal(format("Table with ID {} already exists", schema->id()));
        }
@@ -1820,9 +1909,9 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
        for (schema_builder& view_builder : view_builders) {
            schemas.push_back(view_builder.build());
        }
-        co_await service::prepare_new_column_families_announcement(schema_mutations, sp, *ksm, schemas, ts);
+        co_await service::prepare_new_column_families_announcement(schema_mutations, _proxy, *ksm, schemas, ts);
        if (ksm->uses_tablets()) {
-            co_await mark_view_schemas_as_built(schema_mutations, schemas, ts, sp);
+            co_await mark_view_schemas_as_built(schema_mutations, schemas, ts, _proxy);
        }

        // If a role is allowed to create a table, we must give it permissions to
@@ -1847,7 +1936,7 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
        }
        std::tie(schema_mutations, group0_guard) = co_await std::move(mc).extract();
        try {
-            co_await mm.announce(std::move(schema_mutations), std::move(group0_guard), fmt::format("alternator-executor: create {} table", table_name));
+            co_await _mm.announce(std::move(schema_mutations), std::move(group0_guard), fmt::format("alternator-executor: create {} table", table_name));
            break;
        }  catch (const service::group0_concurrent_modification& ex) {
            elogger.info("Failed to execute CreateTable {} due to concurrent schema modifications. {}.",
@@ -1859,9 +1948,9 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
        }
    }

-    co_await mm.wait_for_schema_agreement(sp.local_db(), db::timeout_clock::now() + 10s, nullptr);
+    co_await _mm.wait_for_schema_agreement(_proxy.local_db(), db::timeout_clock::now() + 10s, nullptr);
    rjson::value status = rjson::empty_object();
-    executor::supplement_table_info(request, *schema, sp);
+    executor::supplement_table_info(request, *schema, _proxy);
    rjson::add(status, "TableDescription", std::move(request));
    co_return rjson::print(std::move(status));
 }
@@ -1870,10 +1959,11 @@ future<executor::request_return_type> executor::create_table(client_state& clien
    _stats.api_operations.create_table++;
    elogger.trace("Creating table {}", request);

-    co_return co_await _mm.container().invoke_on(0, [&, tr = tracing::global_trace_state_ptr(trace_state), request = std::move(request), &sp = _proxy.container(), &g = _gossiper.container(), &e = this->container(), client_state_other_shard = client_state.move_to_other_shard(), enforce_authorization = bool(_enforce_authorization), warn_authorization = bool(_warn_authorization)]
+    co_return co_await _mm.container().invoke_on(0, [&, tr = tracing::global_trace_state_ptr(trace_state), request = std::move(request), &e = this->container(), client_state_other_shard = client_state.move_to_other_shard(), enforce_authorization = bool(_enforce_authorization), warn_authorization = bool(_warn_authorization)]
                                        (service::migration_manager& mm) mutable -> future<executor::request_return_type> {
        const db::tablets_mode_t::mode tablets_mode = _proxy.data_dictionary().get_config().tablets_mode_for_new_keyspaces(); // type cast
-        co_return co_await create_table_on_shard0(client_state_other_shard.get(), tr, std::move(request), sp.local(), mm, g.local(), enforce_authorization, warn_authorization, e.local()._stats, std::move(tablets_mode));
+        // `invoke_on` hopped us to shard 0, but `this` still points to the `executor` of the original shard, so we need to hop to shard 0's `executor` too.
+        co_return co_await e.local().create_table_on_shard0(client_state_other_shard.get(), tr, std::move(request), enforce_authorization, warn_authorization, std::move(tablets_mode));
    });
 }

@@ -6087,9 +6177,10 @@ future<> executor::start() {
 }

 future<> executor::stop() {
+    co_await _describe_table_info_manager->stop();
    // disconnect from the value source, but keep the value unchanged.
    s_default_timeout_in_ms = utils::updateable_value<uint32_t>{s_default_timeout_in_ms()};
-    return _parsed_expression_cache->stop();
+    co_await _parsed_expression_cache->stop();
 }

 } // namespace alternator
--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -17,11 +17,13 @@
 #include "service/client_state.hh"
 #include "service_permit.hh"
 #include "db/timeout_clock.hh"
+#include "db/config.hh"

 #include "alternator/error.hh"
 #include "stats.hh"
 #include "utils/rjson.hh"
 #include "utils/updateable_value.hh"
+#include "utils/simple_value_with_expiry.hh"

 #include "tracing/trace_state.hh"

@@ -41,6 +43,7 @@ namespace cql3::selection {
 namespace service {
    class storage_proxy;
    class cas_shard;
+    class storage_service;
 }

 namespace cdc {
@@ -57,6 +60,7 @@ class schema_builder;

 namespace alternator {

+enum class table_status;
 class rmw_operation;
 class put_or_delete_item;

@@ -136,6 +140,7 @@ class expression_cache;

 class executor : public peering_sharded_service<executor> {
    gms::gossiper& _gossiper;
+    service::storage_service& _ss;
    service::storage_proxy& _proxy;
    service::migration_manager& _mm;
    db::system_distributed_keyspace& _sdks;
@@ -148,6 +153,11 @@ class executor : public peering_sharded_service<executor> {

    std::unique_ptr<parsed::expression_cache> _parsed_expression_cache;

+    struct describe_table_info_manager;
+    std::unique_ptr<describe_table_info_manager> _describe_table_info_manager;
+
+    future<> cache_newly_calculated_size_on_all_shards(schema_ptr schema, std::uint64_t size_in_bytes, std::chrono::nanoseconds ttl);
+    future<> fill_table_size(rjson::value &table_description, schema_ptr schema, bool deleting);
 public:
    using client_state = service::client_state;
    // request_return_type is the return type of the executor methods, which
@@ -173,6 +183,7 @@ public:

    executor(gms::gossiper& gossiper,
             service::storage_proxy& proxy,
+             service::storage_service& ss,
             service::migration_manager& mm,
             db::system_distributed_keyspace& sdks,
             cdc::metadata& cdc_metadata,
@@ -220,6 +231,8 @@ private:
    friend class rmw_operation;

    static void describe_key_schema(rjson::value& parent, const schema&, std::unordered_map<std::string,std::string> * = nullptr, const std::map<sstring, sstring> *tags = nullptr);
+    future<rjson::value> fill_table_description(schema_ptr schema, table_status tbl_status, service::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit);
+    future<executor::request_return_type> create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request, bool enforce_authorization, bool warn_authorization, const db::tablets_mode_t::mode tablets_mode);

    future<> do_batch_write(
        std::vector<std::pair<schema_ptr, put_or_delete_item>> mutation_builders,
--- a/alternator/http_compression.cc
+++ b/alternator/http_compression.cc
@@ -0,0 +1,301 @@
+/*
+ * Copyright 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#include "alternator/http_compression.hh"
+#include "alternator/server.hh"
+#include <seastar/coroutine/maybe_yield.hh>
+#include <zlib.h>
+
+static logging::logger slogger("alternator-http-compression");
+
+namespace alternator {
+
+
+static constexpr size_t compressed_buffer_size = 1024; // maximum size, in bytes, of each output buffer zlib_compressor allocates
+// Streaming zlib compressor; output goes to write_func in buffers of at most compressed_buffer_size bytes.
+class zlib_compressor {
+    z_stream _zs;
+    temporary_buffer<char> _output_buf;
+    noncopyable_function<future<>(temporary_buffer<char>&&)> _write_func;
+public:
+    // gzip selects the gzip wrapper (windowBits + 16); otherwise the zlib wrapper is used.
+    zlib_compressor(bool gzip, int compression_level, noncopyable_function<future<>(temporary_buffer<char>&&)> write_func)
+     : _write_func(std::move(write_func)) {
+        memset(&_zs, 0, sizeof(_zs));
+        if (deflateInit2(&_zs, std::clamp(compression_level, Z_NO_COMPRESSION, Z_BEST_COMPRESSION), Z_DEFLATED,
+                (gzip ? 16 : 0) + MAX_WBITS, 8, Z_DEFAULT_STRATEGY) != Z_OK) {
+            // Should only happen if memory allocation fails
+            throw std::bad_alloc();
+        }
+    }
+    ~zlib_compressor() { deflateEnd(&_zs); }
+    // Finishes the stream (Z_FINISH) and hands the remaining output to write_func.
+    future<> close() { return compress(nullptr, 0, true); }
+    // Feeds len bytes at buf to deflate(); is_last_chunk additionally finishes the stream.
+    // Throws api_error::internal when deflate() reports an error.
+    future<> compress(const char* buf, size_t len, bool is_last_chunk = false) {
+        _zs.next_in = reinterpret_cast<unsigned char*>(const_cast<char*>(buf));
+        _zs.avail_in = static_cast<uInt>(len);
+        int mode = is_last_chunk ? Z_FINISH : Z_NO_FLUSH;
+        while(_zs.avail_in > 0 || is_last_chunk) {
+            co_await coroutine::maybe_yield();
+            if (_output_buf.empty()) {
+                size_t buf_size = compressed_buffer_size;
+                if (is_last_chunk) {
+                    // Only an estimate mid-stream; if it is too small the loop allocates again.
+                    uint32_t max_buffer_size = 0;
+                    deflatePending(&_zs, &max_buffer_size, nullptr);
+                    max_buffer_size += deflateBound(&_zs, _zs.avail_in) + 1;
+                    buf_size = std::min(buf_size, static_cast<size_t>(max_buffer_size));
+                }
+                _output_buf = temporary_buffer<char>(buf_size);
+                _zs.next_out = reinterpret_cast<unsigned char*>(_output_buf.get_write());
+                // Must equal the real allocation, otherwise deflate() could write past its end.
+                _zs.avail_out = _output_buf.size();
+            }
+            int e = deflate(&_zs, mode);
+            if (e < Z_OK) {
+                throw api_error::internal("Error during compression of response body");
+            }
+            if (e == Z_STREAM_END || _zs.avail_out < compressed_buffer_size / 4) {
+                _output_buf.trim(_output_buf.size() - _zs.avail_out);
+                co_await _write_func(std::move(_output_buf));
+                if (e == Z_STREAM_END) { break; }
+            }
+        }
+    }
+};
+
+// Helper string_view functions for parsing Accept-Encoding header.
+// Case-insensitive equality; chars are taken as unsigned char because tolower() of a negative value is undefined.
+struct case_insensitive_cmp_sv {
+    bool operator()(std::string_view s1, std::string_view s2) const {
+        return std::equal(s1.begin(), s1.end(), s2.begin(), s2.end(), [](unsigned char a, unsigned char b) { return ::tolower(a) == ::tolower(b); });
+    }
+};
+static inline std::string_view trim_left(std::string_view sv) { // drops leading whitespace
+    size_t skip = 0;
+    while (skip < sv.size() && std::isspace(static_cast<unsigned char>(sv[skip]))) { ++skip; }
+    return sv.substr(skip);
+}
+static inline std::string_view trim_right(std::string_view sv) { // drops trailing whitespace
+    size_t keep = sv.size();
+    while (keep > 0 && std::isspace(static_cast<unsigned char>(sv[keep - 1]))) { --keep; }
+    return sv.substr(0, keep);
+}
+static inline std::string_view trim(std::string_view sv) { // drops whitespace from both ends
+    return trim_left(trim_right(sv));
+}
+
+// Cuts text at every separator; empty text gives no pieces, empty pieces in between are kept.
+inline std::vector<std::string_view> split(std::string_view text, char separator) {
+    std::vector<std::string_view> pieces;
+    if (text.empty()) {
+        return pieces;
+    }
+    for (;;) {
+        const auto cut = text.find(separator);
+        if (cut == std::string_view::npos) {
+            break;
+        }
+        pieces.push_back(text.substr(0, cut));
+        text.remove_prefix(cut + 1);
+    }
+    // Whatever follows the last separator (possibly empty) is the final piece.
+    pieces.push_back(text);
+    return pieces;
+}
+
+// Case-insensitive lookup of encoding in compression_names; the matching index is the
+// compression_type, and a name that is not listed yields compression_type::unknown.
+constexpr response_compressor::compression_type response_compressor::get_compression_type(std::string_view encoding) {
+    constexpr size_t known = static_cast<size_t>(compression_type::count);
+    size_t idx = 0;
+    while (idx < known && !case_insensitive_cmp_sv{}(encoding, compression_names[idx])) { ++idx; }
+    return idx < known ? static_cast<compression_type>(idx) : compression_type::unknown;
+}
+
+response_compressor::compression_type response_compressor::find_compression(std::string_view accept_encoding, size_t response_size) { // picks the encoding for a response of response_size bytes from an Accept-Encoding value
+    std::optional<float> ct_q[static_cast<size_t>(compression_type::count)]; // quality per type; empty = not listed, or skipped, so far
+    ct_q[static_cast<size_t>(compression_type::none)] = std::numeric_limits<float>::min(); // enabled, but lowest priority (smallest positive normalized float)
+    compression_type selected_ct = compression_type::none;
+
+    std::vector<std::string_view> entries = split(accept_encoding, ',');
+    for (auto& e : entries) {
+        std::vector<std::string_view> params = split(e, ';'); // params[0] is the encoding name, the rest are its parameters
+        if (params.size() == 0) { // split() returns no tokens for an empty entry
+            continue;
+        }
+        compression_type ct = get_compression_type(trim(params[0]));
+        if (ct == compression_type::unknown) {
+            continue; // ignore unknown encoding types
+        }
+        if (ct_q[static_cast<size_t>(ct)].has_value() && ct_q[static_cast<size_t>(ct)] != 0.0f) {
+            continue; // already processed this encoding (an earlier entry with quality 0 is processed again)
+        }
+        if (response_size < _threshold[static_cast<size_t>(ct)]) {
+            continue; // below threshold treat as unknown
+        }
+        for (size_t i = 1; i < params.size(); ++i) { // find "q=" parameter
+            auto pos = params[i].find("q=");
+            if (pos == std::string_view::npos) {
+                continue;
+            }
+            std::string_view param = params[i].substr(pos + 2);
+            param = trim(param);
+            // parse quality value
+            float q_value = 1.0f;
+            auto [ptr, ec] = std::from_chars(param.data(), param.data() + param.size(), q_value);
+            if (ec != std::errc() || ptr != param.data() + param.size()) { // not a number, or trailing characters after it
+                continue;
+            }
+            if (q_value < 0.0) { // clamp the quality into [0, 1]
+                q_value = 0.0;
+            } else if (q_value > 1.0) {
+                q_value = 1.0;
+            }
+            ct_q[static_cast<size_t>(ct)] = q_value;
+            break; // we parsed quality value
+        }
+        if (!ct_q[static_cast<size_t>(ct)].has_value()) {
+            ct_q[static_cast<size_t>(ct)] = 1.0f; // default quality value
+        }
+        // keep the highest quality: a tie keeps the earlier entry, except that a later entry replaces '*' on a tie
+        if (selected_ct == compression_type::any) {
+            if (ct_q[static_cast<size_t>(ct)] >= ct_q[static_cast<size_t>(selected_ct)]) {
+                selected_ct = ct;
+            }
+        } else {
+            if (ct_q[static_cast<size_t>(ct)] > ct_q[static_cast<size_t>(selected_ct)]) {
+                selected_ct = ct;
+            }
+        }
+    }
+    if (selected_ct == compression_type::any) {
+        // '*' won: return the first compression with no recorded quality, else the one with the highest quality (or none)
+        selected_ct = compression_type::none;
+        for (size_t i = 0; i < static_cast<size_t>(compression_type::compressions_count); ++i) {
+            if (!ct_q[i].has_value()) {
+                return static_cast<compression_type>(i);
+            }
+            if (ct_q[i] > ct_q[static_cast<size_t>(selected_ct)]) {
+                selected_ct = static_cast<compression_type>(i);
+            }
+        }
+    }
+    return selected_ct;
+}
+
+// Compresses all of str in one pass (gzip wrapper unless ct is deflate) and returns the output chunks.
+static future<chunked_content> compress(response_compressor::compression_type ct, const db::config& cfg, std::string str) {
+    chunked_content chunks;
+    zlib_compressor zc(ct != response_compressor::compression_type::deflate, cfg.alternator_response_gzip_compression_level(),
+        [&chunks](temporary_buffer<char>&& piece) -> future<> {
+            chunks.push_back(std::move(piece));
+            return make_ready_future<>();
+        });
+    co_await zc.compress(str.data(), str.size(), true);
+    co_return chunks;
+}
+
+// Concatenates every chunk of cc, in order, into one contiguous sstring.
+static sstring flatten(chunked_content&& cc) {
+    size_t bytes = 0;
+    for (const auto& part : cc) {
+        bytes += part.size();
+    }
+    auto flat = sstring{ sstring::initialized_later{}, bytes };
+    auto dst = flat.begin();
+    for (const auto& part : cc) {
+        dst = std::copy(part.begin(), part.end(), dst);
+    }
+    return flat;
+}
+
+// Stores response_body in rep, compressed when find_compression() picks an encoding for accept_encoding.
+future<std::unique_ptr<http::reply>> response_compressor::generate_reply(std::unique_ptr<http::reply> rep, sstring accept_encoding, const char* content_type, std::string&& response_body) {
+    const compression_type ct = find_compression(accept_encoding, response_body.size());
+    if (ct == compression_type::none) {
+        // Despite the move this is a copy: response_body is a std::string
+        // while rep->_content is an sstring.
+        rep->_content = std::move(response_body);
+        rep->set_content_type(content_type);
+        return make_ready_future<std::unique_ptr<http::reply>>(std::move(rep));
+    }
+    rep->add_header("Content-Encoding", get_encoding_name(ct));
+    rep->set_content_type(content_type);
+    return compress(ct, cfg, std::move(response_body)).then([rep = std::move(rep)] (chunked_content compressed) mutable {
+        rep->_content = flatten(std::move(compressed));
+        return make_ready_future<std::unique_ptr<http::reply>>(std::move(rep));
+    });
+}
+
+// data_sink_impl that passes every buffer written to it through a Compressor, whose
+// output is written to the wrapped output_stream.
+template<typename Compressor>
+class compressed_data_sink_impl : public data_sink_impl {
+    output_stream<char> _out;
+    Compressor _compressor;
+public:
+    // args are forwarded to the Compressor constructor, followed by the write callback.
+    template<typename... Args>
+    compressed_data_sink_impl(output_stream<char>&& out, Args&&... args)
+     : _out(std::move(out))
+     , _compressor(std::forward<Args>(args)..., [this](temporary_buffer<char>&& chunk) { return _out.write(std::move(chunk)); }) { }
+
+    future<> put(std::span<temporary_buffer<char>> data) override {
+        return data_sink_impl::fallback_put(data, [this] (temporary_buffer<char>&& piece) { return do_put(std::move(piece)); });
+    }
+private:
+    // buf is taken by value, so it stays alive in the coroutine frame while compress() reads it.
+    future<> do_put(temporary_buffer<char> buf) {
+        co_await _compressor.compress(buf.get(), buf.size());
+    }
+    // Finishes the compressed stream first, then closes the wrapped stream.
+    future<> close() override {
+        return _compressor.close().then([this] {
+            return _out.close();
+        });
+    }
+};
+
+// Returns a body_writer that runs bw against a compressing stream layered over the
+// real output stream. Only gzip and deflate are accepted; every other ct is reported
+// through on_internal_error().
+executor::body_writer compress(response_compressor::compression_type ct, const db::config& cfg, executor::body_writer&& bw) {
+    return [bw = std::move(bw), ct, level = cfg.alternator_response_gzip_compression_level()](output_stream<char>&& out) mutable -> future<> {
+        using ctype = response_compressor::compression_type;
+        // These values do not name a concrete compression.
+        if (ct == ctype::none || ct == ctype::any || ct == ctype::unknown) {
+            on_internal_error(slogger,"Compression not selected");
+        }
+        // Anything else that is neither gzip nor deflate has no data sink.
+        if (ct != ctype::gzip && ct != ctype::deflate) {
+            on_internal_error(slogger, "Unsupported compression type for data sink");
+        }
+        output_stream_options opts;
+        opts.trim_to_size = true;
+        // gzip and deflate differ only in the compressor's gzip flag.
+        std::unique_ptr<data_sink_impl> sink =
+            std::make_unique<compressed_data_sink_impl<zlib_compressor>>(std::move(out), ct == ctype::gzip, level);
+        return bw(output_stream<char>(data_sink(std::move(sink)), compressed_buffer_size, opts));
+    };
+}
+
+// body_writer variant: no body size is passed in, so find_compression() is given size_t max as the size.
+future<std::unique_ptr<http::reply>> response_compressor::generate_reply(std::unique_ptr<http::reply> rep, sstring accept_encoding, const char* content_type, executor::body_writer&& body_writer) {
+    const compression_type ct = find_compression(accept_encoding, std::numeric_limits<size_t>::max());
+    const bool compressing = ct != compression_type::none;
+    if (compressing) {
+        rep->add_header("Content-Encoding", get_encoding_name(ct));
+    }
+    rep->write_body(content_type, compressing ? compress(ct, cfg, std::move(body_writer)) : std::move(body_writer));
+    return make_ready_future<std::unique_ptr<http::reply>>(std::move(rep));
+}
+
+} // namespace alternator
--- a/alternator/http_compression.hh
+++ b/alternator/http_compression.hh
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#pragma once
+
+#include "alternator/executor.hh"
+#include <seastar/http/httpd.hh>
+#include "db/config.hh"
+
+namespace alternator {
+
+class response_compressor { // selects a response encoding from Accept-Encoding and builds the (possibly compressed) reply
+public:
+    enum class compression_type {
+        gzip, // enumerator values double as indexes into compression_names and _threshold
+        deflate,
+        compressions_count, // number of concrete compressions listed above
+        any = compressions_count, // "*" in compression_names
+        none, // "identity" in compression_names
+        count,
+        unknown = count // returned by get_compression_type() for names not in compression_names
+    };
+    static constexpr std::string_view compression_names[] = {
+        "gzip",
+        "deflate",
+        "*",
+        "identity"
+    };
+
+    static sstring get_encoding_name(compression_type ct) { // ct must index into compression_names (no bounds check here)
+        return sstring(compression_names[static_cast<size_t>(ct)]);
+    }
+    static constexpr compression_type get_compression_type(std::string_view encoding);
+
+    sstring get_accepted_encoding(const http::request& req) {
+        if (get_threshold() == 0) { // NOTE(review): 0 is treated as "off" here while update_threshold() uses uint32 max when level <= 0 - confirm both are intended
+            return "";
+        }
+        return req.get_header("Accept-Encoding");
+    }
+    compression_type find_compression(std::string_view accept_encoding, size_t response_size);
+
+    response_compressor(const db::config& cfg)
+        : cfg(cfg)
+        ,_gzip_level_observer(
+            cfg.alternator_response_gzip_compression_level.observe([this](int v) { // recompute thresholds when the level option changes
+                    update_threshold();
+                }))
+        ,_gzip_threshold_observer(
+            cfg.alternator_response_compression_threshold_in_bytes.observe([this](uint32_t v) { // recompute thresholds when the threshold option changes
+                    update_threshold();
+                }))
+    {
+        update_threshold();
+    }
+    response_compressor(const response_compressor& rhs) : response_compressor(rhs.cfg) {} // a copy registers its own observers on the same config
+
+private:
+    const db::config& cfg;
+    utils::observable<int>::observer _gzip_level_observer;
+    utils::observable<uint32_t>::observer _gzip_threshold_observer;
+    uint32_t _threshold[static_cast<size_t>(compression_type::count)]; // per-type minimum response size, compared against in find_compression()
+
+    size_t get_threshold() { return _threshold[static_cast<size_t>(compression_type::any)]; } // lowest threshold among the concrete compressions
+    void update_threshold() {
+        _threshold[static_cast<size_t>(compression_type::none)] = std::numeric_limits<uint32_t>::max();
+        _threshold[static_cast<size_t>(compression_type::any)] = std::numeric_limits<uint32_t>::max(); // lowered to the minimum by the loop below
+        uint32_t gzip = cfg.alternator_response_gzip_compression_level() <= 0 ? std::numeric_limits<uint32_t>::max() // level <= 0 raises the threshold to uint32 max
+            : cfg.alternator_response_compression_threshold_in_bytes();
+        _threshold[static_cast<size_t>(compression_type::gzip)] = gzip;
+        _threshold[static_cast<size_t>(compression_type::deflate)] = gzip; // deflate shares the gzip setting
+        for (size_t i = 0; i < static_cast<size_t>(compression_type::compressions_count); ++i) {
+            if (_threshold[i] < _threshold[static_cast<size_t>(compression_type::any)]) {
+                _threshold[static_cast<size_t>(compression_type::any)] = _threshold[i];
+            }
+        }
+    }
+
+public:
+    future<std::unique_ptr<http::reply>> generate_reply(std::unique_ptr<http::reply> rep,
+         sstring accept_encoding, const char* content_type, std::string&& response_body); // whole body available as a string
+    future<std::unique_ptr<http::reply>> generate_reply(std::unique_ptr<http::reply> rep,
+         sstring accept_encoding, const char* content_type, executor::body_writer&& body_writer); // body produced by a body_writer
+};
+
+}
--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -34,6 +34,7 @@
 #include "client_data.hh"
 #include "utils/updateable_value.hh"
 #include <zlib.h>
+#include "alternator/http_compression.hh"

 static logging::logger slogger("alternator-server");

@@ -111,9 +112,12 @@ class api_handler : public handler_base {
    // type applies to all replies, both success and error.
    static constexpr const char* REPLY_CONTENT_TYPE = "application/x-amz-json-1.0";
 public:
-    api_handler(const std::function<future<executor::request_return_type>(std::unique_ptr<request> req)>& _handle) : _f_handle(
+    api_handler(const std::function<future<executor::request_return_type>(std::unique_ptr<request> req)>& _handle,
+                const db::config& config) : _response_compressor(config), _f_handle(
         [this, _handle](std::unique_ptr<request> req, std::unique_ptr<reply> rep) {
-         return seastar::futurize_invoke(_handle, std::move(req)).then_wrapped([this, rep = std::move(rep)](future<executor::request_return_type> resf) mutable {
+         sstring accept_encoding = _response_compressor.get_accepted_encoding(*req);
+         return seastar::futurize_invoke(_handle, std::move(req)).then_wrapped(
+            [this, rep = std::move(rep), accept_encoding=std::move(accept_encoding)](future<executor::request_return_type> resf) mutable {
             if (resf.failed()) {
                 // Exceptions of type api_error are wrapped as JSON and
                 // returned to the client as expected. Other types of
@@ -133,22 +137,20 @@ public:
                 return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
             }
             auto res = resf.get();
-             std::visit(overloaded_functor {
+             return std::visit(overloaded_functor {
                [&] (std::string&& str) {
-                    // Note that despite the move, there is a copy here -
-                    // as str is std::string and rep->_content is sstring.
-                    rep->_content = std::move(str);
-                    rep->set_content_type(REPLY_CONTENT_TYPE);
+                    return _response_compressor.generate_reply(std::move(rep), std::move(accept_encoding),
+                                                               REPLY_CONTENT_TYPE, std::move(str));
                },
                [&] (executor::body_writer&& body_writer) {
-                    rep->write_body(REPLY_CONTENT_TYPE, std::move(body_writer));
+                    return _response_compressor.generate_reply(std::move(rep), std::move(accept_encoding),
+                                                               REPLY_CONTENT_TYPE, std::move(body_writer));
                },
                [&] (const api_error& err) {
                    generate_error_reply(*rep, err);
+                    return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
                }
             }, std::move(res));
-
-             return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
         });
    }) { }

@@ -177,6 +179,7 @@ protected:
        slogger.trace("api_handler error case: {}", rep._content);
    }

+    response_compressor _response_compressor;
    future_handler_function _f_handle;
 };

@@ -708,8 +711,12 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
    // As long as the system_clients_entry object is alive, this request will
    // be visible in the "system.clients" virtual table. When requested, this
    // entry will be formatted by server::ongoing_request::make_client_data().
+    auto user_agent_header = co_await _connection_options_keys_and_values.get_or_load(req->get_header("User-Agent"), [] (const client_options_cache_key_type&) {
+        return make_ready_future<options_cache_value_type>(options_cache_value_type{});
+    });
+
    auto system_clients_entry = _ongoing_requests.emplace(
-        req->get_client_address(), req->get_header("User-Agent"),
+        req->get_client_address(), std::move(user_agent_header),
        username, current_scheduling_group(),
        req->get_protocol_name() == "https");

@@ -754,7 +761,7 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
 void server::set_routes(routes& r) {
    api_handler* req_handler = new api_handler([this] (std::unique_ptr<request> req) mutable {
        return handle_api_request(std::move(req));
-    });
+    }, _proxy.data_dictionary().get_config());

    r.put(operation_type::POST, "/", req_handler);
    r.put(operation_type::GET, "/", new health_handler(_pending_requests));
@@ -985,10 +992,10 @@ client_data server::ongoing_request::make_client_data() const {
    return cd;
 }

-future<utils::chunked_vector<client_data>> server::get_client_data() {
-    utils::chunked_vector<client_data> ret;
+future<utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>> server::get_client_data() {
+    utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>> ret;
    co_await _ongoing_requests.for_each_gently([&ret] (const ongoing_request& r) {
-        ret.emplace_back(r.make_client_data());
+        ret.emplace_back(make_foreign(std::make_unique<client_data>(r.make_client_data())));
    });
    co_return ret;
 }
--- a/alternator/server.hh
+++ b/alternator/server.hh
@@ -55,6 +55,7 @@ class server : public peering_sharded_service<server> {
    // though it isn't really relevant for Alternator which defines its own
    // timeouts separately. We can create this object only once.
    updateable_timeout_config _timeout_config;
+    client_options_cache_type _connection_options_keys_and_values;

    alternator_callbacks_map _callbacks;

@@ -88,7 +89,7 @@ class server : public peering_sharded_service<server> {
    // is called when reading the "system.clients" virtual table.
    struct ongoing_request {
        socket_address _client_address;
-        sstring _user_agent;
+        client_options_cache_entry_type _user_agent;
        sstring _username;
        scheduling_group _scheduling_group;
        bool _is_https;
@@ -107,7 +108,7 @@ public:
    // table "system.clients" is read. It is expected to generate a list of
    // clients connected to this server (on this shard). This function is
    // called by alternator::controller::get_client_data().
-    future<utils::chunked_vector<client_data>> get_client_data();
+    future<utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>> get_client_data();
 private:
    void set_routes(seastar::httpd::routes& r);
    // If verification succeeds, returns the authenticated user's username
--- a/api/client_routes.cc
+++ b/api/client_routes.cc
@@ -100,9 +100,8 @@ rest_set_client_routes(http_context& ctx, sharded<service::client_routes_service
    rapidjson::Document root;
    auto content = co_await util::read_entire_stream_contiguous(*req->content_stream);
    root.Parse(content.c_str());
-    const auto route_entries = parse_set_client_array(root);

-    co_await cr.local().set_client_routes(route_entries);
+    co_await cr.local().set_client_routes(parse_set_client_array(root));
    co_return seastar::json::json_void();
 }

@@ -132,8 +131,7 @@ rest_delete_client_routes(http_context& ctx, sharded<service::client_routes_serv
    auto content = co_await util::read_entire_stream_contiguous(*req->content_stream);
    root.Parse(content.c_str());

-    const auto route_keys = parse_delete_client_array(root);
-    co_await cr.local().delete_client_routes(route_keys);
+    co_await cr.local().delete_client_routes(parse_delete_client_array(root));
    co_return seastar::json::json_void();
 }

--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -547,17 +547,13 @@ void set_view_builder(http_context& ctx, routes& r, sharded<db::view::view_build
                vp.insert(b.second);
            }
        }
-        std::vector<sstring> res;
        replica::database& db = vb.local().get_db();
        auto uuid = validate_table(db, ks, cf_name);
        replica::column_family& cf = db.find_column_family(uuid);
-        res.reserve(cf.get_index_manager().list_indexes().size());
-        for (auto&& i : cf.get_index_manager().list_indexes()) {
-            if (vp.contains(secondary_index::index_table_name(i.metadata().name()))) {
-                res.emplace_back(i.metadata().name());
-            }
-        }
-        co_return res;
+        co_return cf.get_index_manager().list_indexes()
+                | std::views::transform([] (const auto& i) { return i.metadata().name(); })
+                | std::views::filter([&vp] (const auto& n) { return vp.contains(secondary_index::index_table_name(n)); })
+                | std::ranges::to<std::vector>();
    });

 }
--- a/auth/cache.cc
+++ b/auth/cache.cc
@@ -15,6 +15,7 @@
 #include "db/system_keyspace.hh"
 #include "schema/schema.hh"
 #include <iterator>
+#include <seastar/core/abort_source.hh>
 #include <seastar/coroutine/maybe_yield.hh>
 #include <seastar/core/format.hh>

@@ -22,9 +23,11 @@ namespace auth {

 logging::logger logger("auth-cache");

-cache::cache(cql3::query_processor& qp) noexcept
+cache::cache(cql3::query_processor& qp, abort_source& as) noexcept
    : _current_version(0)
-    , _qp(qp) {
+    , _qp(qp)
+    , _loading_sem(1)
+    , _as(as) {
 }

 lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) const noexcept {
@@ -116,6 +119,8 @@ future<> cache::load_all() {
        co_return;
    }
    SCYLLA_ASSERT(this_shard_id() == 0);
+    auto units = co_await get_units(_loading_sem, 1, _as);
+
    ++_current_version;

    logger.info("Loading all roles");
@@ -146,6 +151,9 @@ future<> cache::load_roles(std::unordered_set<role_name_t> roles) {
    if (legacy_mode(_qp)) {
        co_return;
    }
+    SCYLLA_ASSERT(this_shard_id() == 0);
+    auto units = co_await get_units(_loading_sem, 1, _as);
+
    for (const auto& name : roles) {
        logger.info("Loading role {}", name);
        auto role = co_await fetch_role(name);
--- a/auth/cache.hh
+++ b/auth/cache.hh
@@ -8,6 +8,7 @@

 #pragma once

+#include <seastar/core/abort_source.hh>
 #include <unordered_set>
 #include <unordered_map>

@@ -15,6 +16,7 @@
 #include <seastar/core/future.hh>
 #include <seastar/core/sharded.hh>
 #include <seastar/core/shared_ptr.hh>
+#include <seastar/core/semaphore.hh>

 #include <absl/container/flat_hash_map.h>

@@ -41,7 +43,7 @@ public:
        version_tag_t version; // used for seamless cache reloads
    };

-    explicit cache(cql3::query_processor& qp) noexcept;
+    explicit cache(cql3::query_processor& qp, abort_source& as) noexcept;
    lw_shared_ptr<const role_record> get(const role_name_t& role) const noexcept;
    future<> load_all();
    future<> load_roles(std::unordered_set<role_name_t> roles);
@@ -52,6 +54,8 @@ private:
    roles_map _roles;
    version_tag_t _current_version;
    cql3::query_processor& _qp;
+    semaphore _loading_sem;
+    abort_source& _as;

    future<lw_shared_ptr<role_record>> fetch_role(const role_name_t& role) const;
    future<> prune_all() noexcept;
--- a/client_data.hh
+++ b/client_data.hh
@@ -10,7 +10,9 @@
 #include <seastar/net/inet_address.hh>
 #include <seastar/core/sstring.hh>
 #include "seastarx.hh"
+#include "utils/loading_shared_values.hh"

+#include <list>
 #include <optional>

 enum class client_type {
@@ -27,6 +29,20 @@ enum class client_connection_stage {
    ready,
 };

+// We implement a keys cache using a map-like utils::loading_shared_values container by storing empty values.
+struct options_cache_value_type {};
+using client_options_cache_type = utils::loading_shared_values<sstring, options_cache_value_type>;
+using client_options_cache_entry_type = client_options_cache_type::entry_ptr;
+using client_options_cache_key_type = client_options_cache_type::key_type;
+
+// This struct represents a single OPTION key-value pair from the client's connection options.
+// Both key and value are represented by corresponding "references" to their cached values.
+// Each "reference" is effectively a lw_shared_ptr value.
+struct client_option_key_value_cached_entry {
+    client_options_cache_entry_type key;
+    client_options_cache_entry_type value;
+};
+
 sstring to_string(client_connection_stage ct);

 // Representation of a row in `system.clients'. std::optionals are for nullable cells.
@@ -37,8 +53,8 @@ struct client_data {
    client_connection_stage connection_stage = client_connection_stage::established;
    int32_t shard_id;  /// ID of server-side shard which is processing the connection.

-    std::optional<sstring> driver_name;
-    std::optional<sstring> driver_version;
+    std::optional<client_options_cache_entry_type> driver_name;
+    std::optional<client_options_cache_entry_type> driver_version;
    std::optional<sstring> hostname;
    std::optional<int32_t> protocol_version;
    std::optional<sstring> ssl_cipher_suite;
@@ -46,6 +62,7 @@ struct client_data {
    std::optional<sstring> ssl_protocol;
    std::optional<sstring> username;
    std::optional<sstring> scheduling_group_name;
+    std::list<client_option_key_value_cached_entry> client_options;

    sstring stage_str() const { return to_string(connection_stage); }
    sstring client_type_str() const { return to_string(ct); }
--- a/cmake/mode.common.cmake
+++ b/cmake/mode.common.cmake
@@ -125,10 +125,6 @@ if(target_arch)
  add_compile_options("-march=${target_arch}")
 endif()

-if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-  add_compile_options("SHELL:-Xclang -fexperimental-assignment-tracking=disabled")
-endif()
-
 function(maybe_limit_stack_usage_in_KB stack_usage_threshold_in_KB config)
  math(EXPR _stack_usage_threshold_in_bytes "${stack_usage_threshold_in_KB} * 1024")
  set(_stack_usage_threshold_flag "-Wstack-usage=${_stack_usage_threshold_in_bytes}")
--- a/compaction/compaction_group_view.hh
+++ b/compaction/compaction_group_view.hh
@@ -12,6 +12,7 @@
 #include <seastar/core/condition-variable.hh>

 #include "schema/schema_fwd.hh"
+#include "sstables/open_info.hh"
 #include "compaction_descriptor.hh"

 class reader_permit;
@@ -44,7 +45,7 @@ public:
    virtual compaction_strategy_state& get_compaction_strategy_state() noexcept = 0;
    virtual reader_permit make_compaction_reader_permit() const = 0;
    virtual sstables::sstables_manager& get_sstables_manager() noexcept = 0;
-    virtual sstables::shared_sstable make_sstable() const = 0;
+    virtual sstables::shared_sstable make_sstable(sstables::sstable_state) const = 0;
    virtual sstables::sstable_writer_config configure_writer(sstring origin) const = 0;
    virtual api::timestamp_type min_memtable_timestamp() const = 0;
    virtual api::timestamp_type min_memtable_live_timestamp() const = 0;
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -416,7 +416,9 @@ future<compaction_result> compaction_task_executor::compact_sstables(compaction_
        descriptor.enable_garbage_collection(co_await sstable_set_for_tombstone_gc(t));
    }
    descriptor.creator = [&t] (shard_id) {
-        return t.make_sstable();
+        // All compaction types going through this path will work on normal input sstables only.
+        // Off-strategy, for example, waits until the sstables move out of staging state.
+        return t.make_sstable(sstables::sstable_state::normal);
    };
    descriptor.replacer = [this, &t, &on_replace, offstrategy] (compaction_completion_desc desc) {
        t.get_compaction_strategy().notify_completion(t, desc.old_sstables, desc.new_sstables);
@@ -1847,6 +1849,10 @@ protected:
                throw make_compaction_stopped_exception();
            }
        }, false);
+        if (utils::get_local_injector().is_enabled("split_sstable_force_stop_exception")) {
+            throw make_compaction_stopped_exception();
+        }
+
        co_return co_await do_rewrite_sstable(std::move(sst));
    }
 };
@@ -2284,12 +2290,16 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_spl
 }

 future<std::vector<sstables::shared_sstable>>
-compaction_manager::maybe_split_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt) {
+compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt) {
    if (!split_compaction_task_executor::sstable_needs_split(sst, opt)) {
        co_return std::vector<sstables::shared_sstable>{sst};
    }
-    if (!can_proceed(&t)) {
-        co_return std::vector<sstables::shared_sstable>{sst};
+    // Throw an error if split cannot be performed due to e.g. out of space prevention.
+    // We don't want to prevent split because compaction is temporarily disabled on a view only for synchronization,
+    // which is unneeded against new sstables that aren't part of any set yet, so never use can_proceed(&t) here.
+    if (is_disabled()) {
+        co_return coroutine::exception(std::make_exception_ptr(std::runtime_error(format("Cannot split {} because manager has compaction disabled, " \
+                                                                                         "reason might be out of space prevention", sst->get_filename()))));
    }
    std::vector<sstables::shared_sstable> ret;

@@ -2297,8 +2307,11 @@ compaction_manager::maybe_split_sstable(sstables::shared_sstable sst, compaction
    compaction_progress_monitor monitor;
    compaction_data info = create_compaction_data();
    compaction_descriptor desc = split_compaction_task_executor::make_descriptor(sst, opt);
-    desc.creator = [&t] (shard_id _) {
-        return t.make_sstable();
+    desc.creator = [&t, sst] (shard_id _) {
+        // NOTE: preserves the sstable state, since we want the output to be in the same state as the original.
+        // For example, if the base table has views, it's important that an sstable produced by repair stays
+        // in the staging state.
+        return t.make_sstable(sst->state());
    };
    desc.replacer = [&] (compaction_completion_desc d) {
        std::move(d.new_sstables.begin(), d.new_sstables.end(), std::back_inserter(ret));
--- a/compaction/compaction_manager.hh
+++ b/compaction/compaction_manager.hh
@@ -376,7 +376,8 @@ public:
    // Splits a single SSTable by segregating all its data according to the classifier.
    // If SSTable doesn't need split, the same input SSTable is returned as output.
    // If SSTable needs split, then output SSTables are returned and the input SSTable is deleted.
-    future<std::vector<sstables::shared_sstable>> maybe_split_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt);
+    // An exception is thrown if the input sstable cannot be split due to e.g. out of space prevention.
+    future<std::vector<sstables::shared_sstable>> maybe_split_new_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt);

    // Run a custom job for a given table, defined by a function
    // it completes when future returned by job is ready or returns immediately
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -571,10 +571,10 @@ commitlog_total_space_in_mb: -1
 #   - "none": auditing is disabled (default)
 #   - "table": save audited events in audit.audit_log column family
 #   - "syslog": send audited events via syslog (depends on OS, but usually to /dev/log)
-# audit: "none"
+audit: "table"
 #
 # List of statement categories that should be audited.
-# audit_categories: "DCL,DDL,AUTH"
+audit_categories: "DCL,DDL,AUTH,ADMIN"
 #
 # List of tables that should be audited.
 # audit_tables: "<keyspace_name>.<table_name>,<keyspace_name>.<table_name>"
--- a/configure.py
+++ b/configure.py
@@ -368,6 +368,87 @@ def find_ninja():
    sys.exit(1)


+def find_compiler(name):
+    """
+    Find a compiler by name, skipping ccache wrappers to avoid double-caching.
+    A wrapper is an entry of a known ccache directory or a symlink resolving to ccache.
+
+    Args:
+        name: The compiler name (e.g., 'clang++', 'clang', 'gcc')
+
+    Returns:
+        Path to the first non-wrapper match on PATH, or None if not found.
+    """
+    ccache_dirs = {'/usr/lib/ccache', '/usr/lib64/ccache'}
+    for path_dir in os.environ.get('PATH', '').split(os.pathsep):
+        candidate = os.path.join(path_dir, name)
+        # Also follow the candidate itself: wrappers may live outside the known directories
+        in_ccache_dir = path_dir in ccache_dirs or os.path.realpath(path_dir) in ccache_dirs
+        if in_ccache_dir or os.path.basename(os.path.realpath(candidate)) == 'ccache':
+            continue
+        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
+            return candidate
+    return None
+
+
+def resolve_compilers_for_compiler_cache(args, compiler_cache):
+    """
+    Point args.cxx and args.cc at real compilers when a compiler cache is in use.
+
+    Compilers given as non-absolute paths are looked up with find_compiler(),
+    so that ccache symlinks in PATH do not cause double-caching.
+
+    Args:
+        args: The argument namespace with cc and cxx attributes.
+        compiler_cache: Path to the compiler cache binary, or None.
+    """
+    if not compiler_cache:
+        return
+    for attr in ('cxx', 'cc'):
+        compiler = getattr(args, attr)
+        if os.path.isabs(compiler):
+            continue
+        resolved = find_compiler(compiler)
+        if resolved:
+            setattr(args, attr, resolved)
+
+
+def find_compiler_cache(preference):
+    """
+    Find a compiler cache based on the preference.
+
+    Args:
+        preference: One of 'auto', 'sccache', 'ccache', 'none', or a path to a binary.
+
+    Returns:
+        Path to the compiler cache binary (absolute when given as a path), or None if not found/disabled.
+    """
+    if preference == 'none':
+        return None
+
+    if preference == 'auto':
+        # Prefer sccache over ccache
+        for cache in ['sccache', 'ccache']:
+            path = which(cache)
+            if path:
+                return path
+        return None
+
+    if preference in ('sccache', 'ccache'):
+        path = which(preference)
+        if path:
+            return path
+        print(f"Warning: {preference} not found on PATH, disabling compiler cache")
+        return None
+
+    # Assume it's a path to a binary. Return it in absolute form, since the
+    # launcher is also handed to cmake runs that use another working directory.
+    if os.path.isfile(preference) and os.access(preference, os.X_OK):
+        return os.path.abspath(preference)
+    print(f"Warning: compiler cache '{preference}' not found or not executable, disabling compiler cache")
+    return None
+
+
 modes = {
    'debug': {
        'cxxflags': '-DDEBUG -DSANITIZE -DDEBUG_LSA_SANITIZER -DSCYLLA_ENABLE_ERROR_INJECTION',
@@ -732,6 +813,8 @@ arg_parser.add_argument('--compiler', action='store', dest='cxx', default='clang
                        help='C++ compiler path')
 arg_parser.add_argument('--c-compiler', action='store', dest='cc', default='clang',
                        help='C compiler path')
+arg_parser.add_argument('--compiler-cache', action='store', dest='compiler_cache', default='auto',
+                        help='Compiler cache to use: auto (default, prefers sccache), sccache, ccache, none, or a path to a binary')
 add_tristate(arg_parser, name='dpdk', dest='dpdk', default=False,
                        help='Use dpdk (from seastar dpdk sources)')
 arg_parser.add_argument('--dpdk-target', action='store', dest='dpdk_target', default='',
@@ -951,6 +1034,7 @@ scylla_core = (['message/messaging_service.cc',
                'cql3/functions/aggregate_fcts.cc',
                'cql3/functions/castas_fcts.cc',
                'cql3/functions/error_injection_fcts.cc',
+                'cql3/functions/vector_similarity_fcts.cc',
                'cql3/statements/cf_prop_defs.cc',
                'cql3/statements/cf_statement.cc',
                'cql3/statements/authentication_statement.cc',
@@ -1370,6 +1454,7 @@ alternator = [
       'alternator/auth.cc',
       'alternator/streams.cc',
       'alternator/ttl.cc',
+       'alternator/http_compression.cc'
 ]

 idls = ['idl/gossip_digest.idl.hh',
@@ -1615,6 +1700,7 @@ deps['test/boost/combined_tests'] += [
    'test/boost/schema_registry_test.cc',
    'test/boost/secondary_index_test.cc',
    'test/boost/sessions_test.cc',
+    'test/boost/simple_value_with_expiry_test.cc',
    'test/boost/sstable_compaction_test.cc',
    'test/boost/sstable_compressor_factory_test.cc',
    'test/boost/sstable_compression_config_test.cc',
@@ -1698,6 +1784,18 @@ deps['test/vector_search/vector_store_client_test'] =  ['test/vector_search/vect
 deps['test/vector_search/load_balancer_test'] = ['test/vector_search/load_balancer_test.cc'] + scylla_tests_dependencies
 deps['test/vector_search/client_test'] = ['test/vector_search/client_test.cc'] + scylla_tests_dependencies

+boost_tests_prefixes = ["test/boost/", "test/vector_search/", "test/raft/", "test/manual/", "test/ldap/"]
+
+# Every Boost test must be able to run `--list_json_content`, which yields
+# output similar to `--list_content={HRF,DOT}`. The code implementing that
+# option is not pulled in automatically, so the sources providing it are
+# appended by hand to the dependencies of each test under the prefixes
+# listed above.
+for test_name in deps.keys():
+    is_boost_test = any(test_name.startswith(prefix) for prefix in boost_tests_prefixes)
+    if is_boost_test:
+        deps[test_name] += ["test/lib/boost_tree_lister_injector.cc", "test/lib/boost_test_tree_lister.cc"]
+
 wasm_deps = {}

 wasm_deps['wasm/return_input.wat'] = 'test/resource/wasm/rust/return_input.rs'
@@ -2002,7 +2100,7 @@ def semicolon_separated(*flags):
 def real_relpath(path, start):
    return os.path.relpath(os.path.realpath(path), os.path.realpath(start))

-def configure_seastar(build_dir, mode, mode_config):
+def configure_seastar(build_dir, mode, mode_config, compiler_cache=None):
    seastar_cxx_ld_flags = mode_config['cxx_ld_flags']
    # We want to "undo" coverage for seastar if we have it enabled.
    if args.coverage:
@@ -2049,6 +2147,10 @@ def configure_seastar(build_dir, mode, mode_config):
        '-DSeastar_IO_URING=ON',
    ]

+    if compiler_cache:
+        seastar_cmake_args += [f'-DCMAKE_CXX_COMPILER_LAUNCHER={compiler_cache}',
+                               f'-DCMAKE_C_COMPILER_LAUNCHER={compiler_cache}']
+
    if args.stack_guards is not None:
        stack_guards = 'ON' if args.stack_guards else 'OFF'
        seastar_cmake_args += ['-DSeastar_STACK_GUARDS={}'.format(stack_guards)]
@@ -2080,7 +2182,7 @@ def configure_seastar(build_dir, mode, mode_config):
    subprocess.check_call(seastar_cmd, shell=False, cwd=cmake_dir)


-def configure_abseil(build_dir, mode, mode_config):
+def configure_abseil(build_dir, mode, mode_config, compiler_cache=None):
    abseil_cflags = mode_config['lib_cflags']
    cxx_flags = mode_config['cxxflags']
    if '-DSANITIZE' in cxx_flags:
@@ -2106,6 +2208,10 @@ def configure_abseil(build_dir, mode, mode_config):
        '-DABSL_PROPAGATE_CXX_STD=ON',
    ]

+    if compiler_cache:
+        abseil_cmake_args += [f'-DCMAKE_CXX_COMPILER_LAUNCHER={compiler_cache}',
+                              f'-DCMAKE_C_COMPILER_LAUNCHER={compiler_cache}']
+
    cmake_args = abseil_cmake_args[:]
    abseil_build_dir = os.path.join(build_dir, mode, 'abseil')
    abseil_cmd = ['cmake', '-G', 'Ninja', real_relpath('abseil', abseil_build_dir)] + cmake_args
@@ -2251,15 +2357,6 @@ def get_extra_cxxflags(mode, mode_config, cxx, debuginfo):
    if debuginfo and mode_config['can_have_debug_info']:
        cxxflags += ['-g', '-gz']

-    if 'clang' in cxx:
-        # Since AssignmentTracking was enabled by default in clang
-        # (llvm/llvm-project@de6da6ad55d3ca945195d1cb109cb8efdf40a52a)
-        # coroutine frame debugging info (`coro_frame_ty`) is broken.
-        #
-        # It seems that we aren't losing much by disabling AssigmentTracking,
-        # so for now we choose to disable it to get `coro_frame_ty` back.
-        cxxflags.append('-Xclang -fexperimental-assignment-tracking=disabled')
-
    return cxxflags


@@ -2287,10 +2384,15 @@ def write_build_file(f,
                     scylla_product,
                     scylla_version,
                     scylla_release,
+                     compiler_cache,
                     args):
    use_precompiled_header = not args.disable_precompiled_header
    warnings = get_warning_options(args.cxx)
    rustc_target = pick_rustc_target('wasm32-wasi', 'wasm32-wasip1')
+    # If compiler cache is available, prefix the compiler with it
+    cxx_with_cache = f'{compiler_cache} {args.cxx}' if compiler_cache else args.cxx
+    # For Rust, sccache is used via RUSTC_WRAPPER environment variable
+    rustc_wrapper = f'RUSTC_WRAPPER={compiler_cache} ' if compiler_cache and 'sccache' in compiler_cache else ''
    f.write(textwrap.dedent('''\
        configure_args = {configure_args}
        builddir = {outdir}
@@ -2353,7 +2455,7 @@ def write_build_file(f,
            command = clang --target=wasm32 --no-standard-libraries -Wl,--export-all -Wl,--no-entry $in -o $out
            description = C2WASM $out
        rule rust2wasm
-            command = cargo build --target={rustc_target} --example=$example --locked --manifest-path=test/resource/wasm/rust/Cargo.toml --target-dir=$builddir/wasm/ $
+            command = {rustc_wrapper}cargo build --target={rustc_target} --example=$example --locked --manifest-path=test/resource/wasm/rust/Cargo.toml --target-dir=$builddir/wasm/ $
                && wasm-opt -Oz $builddir/wasm/{rustc_target}/debug/examples/$example.wasm -o $builddir/wasm/$example.wasm $
                && wasm-strip $builddir/wasm/$example.wasm
            description = RUST2WASM $out
@@ -2369,7 +2471,7 @@ def write_build_file(f,
          command = llvm-profdata merge $in -output=$out
        ''').format(configure_args=configure_args,
                    outdir=outdir,
-                    cxx=args.cxx,
+                    cxx=cxx_with_cache,
                    user_cflags=user_cflags,
                    warnings=warnings,
                    defines=defines,
@@ -2377,6 +2479,7 @@ def write_build_file(f,
                    user_ldflags=user_ldflags,
                    libs=libs,
                    rustc_target=rustc_target,
+                    rustc_wrapper=rustc_wrapper,
                    link_pool_depth=link_pool_depth,
                    seastar_path=args.seastar_path,
                    ninja=ninja,
@@ -2461,10 +2564,10 @@ def write_build_file(f,
              description = TEST {mode}
            # This rule is unused for PGO stages. They use the rust lib from the parent mode.
            rule rust_lib.{mode}
-              command = CARGO_BUILD_DEP_INFO_BASEDIR='.' cargo build --locked --manifest-path=rust/Cargo.toml --target-dir=$builddir/{mode} --profile=rust-{mode} $
+              command = CARGO_BUILD_DEP_INFO_BASEDIR='.' {rustc_wrapper}cargo build --locked --manifest-path=rust/Cargo.toml --target-dir=$builddir/{mode} --profile=rust-{mode} $
                        && touch $out
              description = RUST_LIB $out
-            ''').format(mode=mode, antlr3_exec=args.antlr3_exec, fmt_lib=fmt_lib, test_repeat=args.test_repeat, test_timeout=args.test_timeout, **modeval))
+            ''').format(mode=mode, antlr3_exec=args.antlr3_exec, fmt_lib=fmt_lib, test_repeat=args.test_repeat, test_timeout=args.test_timeout, rustc_wrapper=rustc_wrapper, **modeval))
        f.write(
            'build {mode}-build: phony {artifacts} {wasms} {vector_search_validator_bins}\n'.format(
                mode=mode,
@@ -2528,7 +2631,7 @@ def write_build_file(f,
                # In debug/sanitize modes, we compile with fsanitizers,
                # so must use the same options during the link:
                if '-DSANITIZE' in modes[mode]['cxxflags']:
-                    f.write('   libs = -fsanitize=address -fsanitize=undefined\n')
+                    f.write('   libs = -fsanitize=address -fsanitize=undefined -lubsan\n')
                else:
                    f.write('   libs =\n')
                f.write(f'build $builddir/{mode}/{binary}.stripped: strip $builddir/{mode}/{binary}\n')
@@ -2924,6 +3027,9 @@ def create_build_system(args):

    os.makedirs(outdir, exist_ok=True)

+    compiler_cache = find_compiler_cache(args.compiler_cache)
+    resolve_compilers_for_compiler_cache(args, compiler_cache)
+
    scylla_product, scylla_version, scylla_release = generate_version(args.date_stamp)

    for mode, mode_config in build_modes.items():
@@ -2940,8 +3046,8 @@ def create_build_system(args):
        # {outdir}/{mode}/seastar/build.ninja, and
        # {outdir}/{mode}/seastar/seastar.pc is queried for building flags
        for mode, mode_config in build_modes.items():
-            configure_seastar(outdir, mode, mode_config)
-            configure_abseil(outdir, mode, mode_config)
+            configure_seastar(outdir, mode, mode_config, compiler_cache)
+            configure_abseil(outdir, mode, mode_config, compiler_cache)
        user_cflags += ' -isystem abseil'

    for mode, mode_config in build_modes.items():
@@ -2964,6 +3070,7 @@ def create_build_system(args):
                         scylla_product,
                         scylla_version,
                         scylla_release,
+                         compiler_cache,
                         args)
    generate_compdb('compile_commands.json', ninja, args.buildfile, selected_modes)

@@ -3006,6 +3113,10 @@ def configure_using_cmake(args):
    selected_modes = args.selected_modes or default_modes
    selected_configs = ';'.join(build_modes[mode].cmake_build_type for mode
                                in selected_modes)
+
+    compiler_cache = find_compiler_cache(args.compiler_cache)
+    resolve_compilers_for_compiler_cache(args, compiler_cache)
+
    settings = {
        'CMAKE_CONFIGURATION_TYPES': selected_configs,
        'CMAKE_CROSS_CONFIGS': selected_configs,
@@ -3023,6 +3134,14 @@ def configure_using_cmake(args):
        'Scylla_WITH_DEBUG_INFO' : 'ON' if args.debuginfo else 'OFF',
        'Scylla_USE_PRECOMPILED_HEADER': 'OFF' if args.disable_precompiled_header else 'ON',
    }
+
+    if compiler_cache:
+        settings['CMAKE_CXX_COMPILER_LAUNCHER'] = compiler_cache
+        settings['CMAKE_C_COMPILER_LAUNCHER'] = compiler_cache
+        # For Rust, sccache is used via RUSTC_WRAPPER
+        if 'sccache' in compiler_cache:
+            settings['Scylla_RUSTC_WRAPPER'] = compiler_cache
+
    if args.date_stamp:
        settings['Scylla_DATE_STAMP'] = args.date_stamp
    if args.staticboost:
@@ -3054,7 +3173,7 @@ def configure_using_cmake(args):

    if not args.dist_only:
        for mode in selected_modes:
-            configure_seastar(build_dir, build_modes[mode].cmake_build_type, modes[mode])
+            configure_seastar(build_dir, build_modes[mode].cmake_build_type, modes[mode], compiler_cache)

    cmake_command = ['cmake']
    cmake_command += [f'-D{var}={value}' for var, value in settings.items()]
--- a/cql3/CMakeLists.txt
+++ b/cql3/CMakeLists.txt
@@ -47,6 +47,7 @@ target_sources(cql3
    functions/aggregate_fcts.cc
    functions/castas_fcts.cc
    functions/error_injection_fcts.cc
+    functions/vector_similarity_fcts.cc
    statements/cf_prop_defs.cc
    statements/cf_statement.cc
    statements/authentication_statement.cc
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -431,6 +431,7 @@ unaliasedSelector returns [uexpression tmp]
       | K_TTL       '(' c=cident ')'              { tmp = column_mutation_attribute{column_mutation_attribute::attribute_kind::ttl,
                                                                                              unresolved_identifier{std::move(c)}}; }
       | f=functionName args=selectionFunctionArgs { tmp = function_call{std::move(f), std::move(args)}; }
+       | f=similarityFunctionName args=vectorSimilarityArgs            { tmp = function_call{std::move(f), std::move(args)}; }
       | K_CAST      '(' arg=unaliasedSelector K_AS t=native_type ')'  { tmp = cast{.style = cast::cast_style::sql, .arg = std::move(arg), .type = std::move(t)}; }
       )
       ( '.' fi=cident { tmp = field_selection{std::move(tmp), std::move(fi)}; }
@@ -445,6 +446,18 @@ selectionFunctionArgs returns [std::vector<expression> a]
      ')'
    ;

+vectorSimilarityArgs returns [std::vector<expression> a]
+    : '(' ')'
+    | '(' v1=vectorSimilarityArg { a.push_back(std::move(v1)); }
+          ( ',' vn=vectorSimilarityArg { a.push_back(std::move(vn)); } )*
+      ')'
+    ;
+
+vectorSimilarityArg returns [uexpression a]
+    : s=unaliasedSelector { a = std::move(s); }
+    | v=value             { a = std::move(v); }
+    ;
+
 countArgument
    : '*'
    | i=INTEGER { if (i->getText() != "1") {
@@ -1683,6 +1696,10 @@ functionName returns [cql3::functions::function_name s]
    : (ks=keyspaceName '.')? f=allowedFunctionName   { $s.keyspace = std::move(ks); $s.name = std::move(f); }
    ;

+similarityFunctionName returns [cql3::functions::function_name s]
+    : f=allowedSimilarityFunctionName { $s = cql3::functions::function_name::native_function(std::move(f)); }
+    ;
+
 allowedFunctionName returns [sstring s]
    : f=IDENT                       { $s = $f.text; std::transform(s.begin(), s.end(), s.begin(), ::tolower); }
    | f=QUOTED_NAME                 { $s = $f.text; }
@@ -1691,6 +1708,11 @@ allowedFunctionName returns [sstring s]
    | K_COUNT                       { $s = "count"; }
    ;

+allowedSimilarityFunctionName returns [sstring s]
+    : f=(K_SIMILARITY_COSINE | K_SIMILARITY_EUCLIDEAN | K_SIMILARITY_DOT_PRODUCT)
+      { $s = $f.text; std::transform(s.begin(), s.end(), s.begin(), ::tolower); }
+    ;
+
 functionArgs returns [std::vector<expression> a]
    : '(' ')'
    | '(' t1=term { a.push_back(std::move(t1)); }
@@ -2387,6 +2409,10 @@ K_MUTATION_FRAGMENTS:    M U T A T I O N '_' F R A G M E N T S;

 K_VECTOR_SEARCH_INDEXING: V E C T O R '_' S E A R C H '_' I N D E X I N G;

+K_SIMILARITY_EUCLIDEAN:     S I M I L A R I T Y '_' E U C L I D E A N;
+K_SIMILARITY_COSINE:        S I M I L A R I T Y '_' C O S I N E;
+K_SIMILARITY_DOT_PRODUCT:   S I M I L A R I T Y '_' D O T '_' P R O D U C T;
+
 // Case-insensitive alpha characters
 fragment A: ('a'|'A');
 fragment B: ('b'|'B');
--- a/cql3/assignment_testable.hh
+++ b/cql3/assignment_testable.hh
@@ -25,6 +25,11 @@ public:
        NOT_ASSIGNABLE,
    };

+    struct vector_test_result {
+        test_result result;
+        std::optional<size_t> dimension_opt;
+    };
+
    static bool is_assignable(test_result tr) {
        return tr != test_result::NOT_ASSIGNABLE;
    }
@@ -44,6 +49,8 @@ public:
     */
    virtual test_result test_assignment(data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, const column_specification& receiver) const = 0;

+    virtual vector_test_result test_assignment_any_size_float_vector() const = 0;
+
    virtual std::optional<data_type> assignment_testable_type_opt() const = 0;

    // for error reporting
--- a/cql3/expr/prepare_expr.cc
+++ b/cql3/expr/prepare_expr.cc
@@ -1434,6 +1434,112 @@ test_assignment(const expression& expr, data_dictionary::database db, const sstr
    }, expr);
 }

+// Tests whether `expr` can be used as a vector of any dimension whose elements are of one of `Kinds`.
+// dimension_opt of the result is set only when the dimension is known from `expr` itself.
+template <cql3_type::kind... Kinds>
+assignment_testable::vector_test_result
+test_assignment_any_size_float_vector(const expression& expr) {
+    using test_result = assignment_testable::vector_test_result;
+    const test_result NOT_ASSIGNABLE = {assignment_testable::test_result::NOT_ASSIGNABLE, std::nullopt};
+    const test_result WEAKLY_ASSIGNABLE = {assignment_testable::test_result::WEAKLY_ASSIGNABLE, std::nullopt};
+    // Accepts bind markers, floating-point/integer literals and constants typed as one of `Kinds`.
+    auto is_float_or_bind = [] (const expression& e) {
+        return expr::visit(overloaded_functor{
+            [] (const bind_variable&) { return true; },
+            [] (const untyped_constant& uc) {
+                return uc.partial_type == untyped_constant::type_class::floating_point
+                    || uc.partial_type == untyped_constant::type_class::integer;
+            },
+            [] (const constant& value) {
+                auto kind = value.type->as_cql3_type().get_kind();
+                return cql3_type::kind_enum_set::frozen<Kinds...>().contains(kind);
+            },
+            [] (const auto&) { return false; },
+        }, e);
+    };
+    // A typed expression qualifies only if its underlying type is a vector of `Kinds` elements.
+    auto validate_assignment = [&] (const data_type& dt) -> test_result {
+        auto vt = dynamic_pointer_cast<const vector_type_impl>(dt->underlying_type());
+        if (!vt) {
+            return NOT_ASSIGNABLE;
+        }
+        auto elem_kind = vt->get_elements_type()->as_cql3_type().get_kind();
+        if (cql3_type::kind_enum_set::frozen<Kinds...>().contains(elem_kind)) {
+            return {assignment_testable::test_result::WEAKLY_ASSIGNABLE, vt->get_dimension()};
+        }
+        return NOT_ASSIGNABLE;
+    };
+    return expr::visit(overloaded_functor{
+        [&] (const constant& value) -> test_result {
+            return validate_assignment(value.type);
+        },
+        [&] (const binary_operator&) -> test_result {
+            return NOT_ASSIGNABLE;
+        },
+        [&] (const conjunction&) -> test_result {
+            return NOT_ASSIGNABLE;
+        },
+        [&] (const column_value& col_val) -> test_result {
+            return validate_assignment(col_val.col->type);
+        },
+        [&] (const subscript&) -> test_result {
+            return NOT_ASSIGNABLE;
+        },
+        [&] (const unresolved_identifier&) -> test_result {
+            return NOT_ASSIGNABLE;
+        },
+        [&] (const column_mutation_attribute&) -> test_result {
+            return NOT_ASSIGNABLE;
+        },
+        [&] (const function_call&) -> test_result {
+            return NOT_ASSIGNABLE;
+        },
+        [&] (const cast&) -> test_result {
+            return NOT_ASSIGNABLE;
+        },
+        [&] (const field_selection&) -> test_result {
+            return NOT_ASSIGNABLE;
+        },
+        [&] (const bind_variable&) -> test_result {
+            return WEAKLY_ASSIGNABLE;
+        },
+        [&] (const untyped_constant& uc) -> test_result {
+            return uc.partial_type == untyped_constant::type_class::null
+                ? WEAKLY_ASSIGNABLE
+                : NOT_ASSIGNABLE;
+        },
+        [&] (const tuple_constructor&) -> test_result {
+            return NOT_ASSIGNABLE;
+        },
+        [&] (const collection_constructor& c) -> test_result {
+            switch (c.style) {
+            case collection_constructor::style_type::list_or_vector: {
+                if (std::ranges::all_of(c.elements, is_float_or_bind)) {
+                    return {assignment_testable::test_result::WEAKLY_ASSIGNABLE, c.elements.size()};
+                }
+                return NOT_ASSIGNABLE;
+            }
+            case collection_constructor::style_type::set: return NOT_ASSIGNABLE;
+            case collection_constructor::style_type::map: return NOT_ASSIGNABLE;
+            case collection_constructor::style_type::vector:
+                on_internal_error(expr_logger, "vector style type found in test_assignment, should have been introduced post-prepare");
+            }
+            on_internal_error(expr_logger, fmt::format("unexpected collection_constructor style {}", static_cast<unsigned>(c.style)));
+        },
+        [&] (const usertype_constructor&) -> test_result {
+            return NOT_ASSIGNABLE;
+        },
+        [&] (const temporary&) -> test_result {
+            return NOT_ASSIGNABLE;
+        },
+    }, expr);
+}
+
+assignment_testable::vector_test_result
+test_assignment_any_size_float_vector(const expression& expr) {
+    return test_assignment_any_size_float_vector<cql3_type::kind::FLOAT, cql3_type::kind::DOUBLE>(expr);
+}
+
 expression
 prepare_expression(const expression& expr, data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, lw_shared_ptr<column_specification> receiver) {
    auto e_opt = try_prepare_expression(expr, db, keyspace, schema_opt, std::move(receiver));
@@ -1467,6 +1573,9 @@ public:
    virtual test_result test_assignment(data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, const column_specification& receiver) const override {
        return expr::test_assignment(_e, db, keyspace, schema_opt, receiver);
    }
+    virtual vector_test_result test_assignment_any_size_float_vector() const override {
+        return expr::test_assignment_any_size_float_vector(_e);
+    }
    virtual sstring assignment_testable_source_context() const override {
        return fmt::format("{}", _e);
    }
--- a/cql3/functions/functions.cc
+++ b/cql3/functions/functions.cc
@@ -16,6 +16,7 @@
 #include "cql3/functions/user_function.hh"
 #include "cql3/functions/user_aggregate.hh"
 #include "cql3/functions/uuid_fcts.hh"
+#include "cql3/functions/vector_similarity_fcts.hh"
 #include "data_dictionary/data_dictionary.hh"
 #include "as_json_function.hh"
 #include "cql3/prepare_context.hh"
@@ -398,6 +399,14 @@ functions::get(data_dictionary::database db,
        }
    });

+    const auto func_name = name.has_keyspace() ? name : name.as_native_function();
+    if (SIMILARITY_FUNCTIONS.contains(func_name)) {
+        auto arg_types = retrieve_vector_arg_types(func_name, provided_args);
+        auto fun = ::make_shared<vector_similarity_fct>(func_name.name, arg_types);
+        validate_types(db, keyspace, schema.get(), fun, provided_args, receiver_ks, receiver_cf);
+        return fun;
+    }
+
    if (name.has_keyspace()
                ? name == TOKEN_FUNCTION_NAME
                : name.name == TOKEN_FUNCTION_NAME.name) {
--- a/cql3/functions/vector_similarity_fcts.cc
+++ b/cql3/functions/vector_similarity_fcts.cc
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#include "vector_similarity_fcts.hh"
+#include "types/types.hh"
+#include "types/vector.hh"
+#include "exceptions/exceptions.hh"
+
+namespace cql3 {
+namespace functions {
+namespace {
+
+// The computations of similarity scores match the exact formulas of Cassandra's (jVector's) implementation to ensure compatibility.
+// There exist tests checking the compliance of the results.
+// Reference:
+// https://github.com/datastax/jvector/blob/f967f1c9249035b63b55a566fac7d4dc38380349/jvector-base/src/main/java/io/github/jbellis/jvector/vector/VectorSimilarityFunction.java#L36-L69
+
+// Cosine similarity of v1 and v2, rescaled to a score in [0, 1]. Use this only when the original vectors must be
+// preserved and cannot be normalized in advance. Assumes v2 holds at least v1.size() float elements (TODO confirm).
+float compute_cosine_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
+    double dot_product = 0.0;
+    double squared_norm_a = 0.0;
+    double squared_norm_b = 0.0;
+
+    for (size_t i = 0; i < v1.size(); ++i) {
+        double a = value_cast<float>(v1[i]); // elements are read as float and accumulated in double
+        double b = value_cast<float>(v2[i]);
+
+        dot_product += a * b;
+        squared_norm_a += a * a;
+        squared_norm_b += b * b;
+    }
+
+    if (squared_norm_a == 0 || squared_norm_b == 0) { // a zero norm would make the division below 0/0
+        throw exceptions::invalid_request_exception("Function system.similarity_cosine doesn't support all-zero vectors");
+    }
+
+    // The cosine similarity is in the range [-1, 1].
+    // It is mapped to a similarity score in the range [0, 1] (-1 -> 0, 1 -> 1)
+    // for consistency with other similarity functions.
+    return (1 + (dot_product / (std::sqrt(squared_norm_a * squared_norm_b)))) / 2;
+}
+
+float compute_euclidean_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) { // score = 1 / (1 + squared L2 distance)
+    double sum = 0.0; // running sum of squared per-element differences
+
+    for (size_t i = 0; i < v1.size(); ++i) { // iterates over v1's length; assumes v2 is at least as long (TODO confirm)
+        double a = value_cast<float>(v1[i]);
+        double b = value_cast<float>(v2[i]);
+
+        double diff = a - b;
+        sum += diff * diff;
+    }
+
+    // The squared Euclidean (L2) distance is of range [0, inf).
+    // It is mapped to a similarity score in the range (0, 1] (0 -> 1, inf -> 0)
+    // for consistency with other similarity functions.
+    return (1 / (1 + sum));
+}
+
+// Dot product of v1 and v2 rescaled as (1 + dot) / 2. Assumes that both vectors are L2-normalized (not checked here).
+// This similarity is intended as an optimized way to perform cosine similarity calculation.
+float compute_dot_product_similarity(const std::vector<data_value>& v1, const std::vector<data_value>& v2) {
+    double dot_product = 0.0;
+
+    for (size_t i = 0; i < v1.size(); ++i) { // iterates over v1's length; assumes v2 is at least as long (TODO confirm)
+        double a = value_cast<float>(v1[i]);
+        double b = value_cast<float>(v2[i]);
+        dot_product += a * b;
+    }
+
+    // The dot product is in the range [-1, 1] for L2-normalized vectors.
+    // It is mapped to a similarity score in the range [0, 1] (-1 -> 0, 1 -> 1)
+    // for consistency with other similarity functions.
+    return ((1 + dot_product) / 2);
+}
+
+} // namespace
+
+thread_local const std::unordered_map<function_name, similarity_function_t> SIMILARITY_FUNCTIONS = { // function name -> scoring routine; consulted by functions::get() and vector_similarity_fct::execute()
+        {SIMILARITY_COSINE_FUNCTION_NAME, compute_cosine_similarity},
+        {SIMILARITY_EUCLIDEAN_FUNCTION_NAME, compute_euclidean_similarity},
+        {SIMILARITY_DOT_PRODUCT_FUNCTION_NAME, compute_dot_product_similarity},
+};
+
+std::vector<data_type> retrieve_vector_arg_types(const function_name& name, const std::vector<shared_ptr<assignment_testable>>& provided_args) { // Resolves the common vector<float, n> type of the two arguments; throws invalid_request_exception otherwise
+    if (provided_args.size() != 2) {
+        throw exceptions::invalid_request_exception(fmt::format("Invalid number of arguments for function {}(vector<float, n>, vector<float, n>)", name));
+    }
+
+    auto [first_result, first_dim_opt] = provided_args[0]->test_assignment_any_size_float_vector(); // (assignability result, optional vector dimension)
+    auto [second_result, second_dim_opt] = provided_args[1]->test_assignment_any_size_float_vector();
+
+    auto invalid_type_error_message = [&name](const shared_ptr<assignment_testable>& arg) { // builds the "requires a float vector" error text for arg
+        auto type = arg->assignment_testable_type_opt();
+        const auto& source_context = arg->assignment_testable_source_context();
+        if (type) { // mention the argument's type only when it is known
+            return fmt::format("Function {} requires a float vector argument, but found {} of type {}", name, source_context, type.value()->cql3_type_name());
+        } else {
+            return fmt::format("Function {} requires a float vector argument, but found {}", name, source_context);
+        }
+    };
+
+    if (!is_assignable(first_result)) {
+        throw exceptions::invalid_request_exception(invalid_type_error_message(provided_args[0]));
+    }
+    if (!is_assignable(second_result)) {
+        throw exceptions::invalid_request_exception(invalid_type_error_message(provided_args[1]));
+    }
+
+    if (!first_dim_opt && !second_dim_opt) { // neither argument reported a dimension, so none can be chosen
+        throw exceptions::invalid_request_exception(fmt::format("Cannot infer type of argument {} for function {}(vector<float, n>, vector<float, n>)",
+                provided_args[0]->assignment_testable_source_context(), name));
+    }
+    if (first_dim_opt && second_dim_opt) { // both dimensions known: they must agree
+        if (*first_dim_opt != *second_dim_opt) {
+            throw exceptions::invalid_request_exception(fmt::format(
+                    "All arguments must have the same vector dimensions, but found vector<float, {}> and vector<float, {}>", *first_dim_opt, *second_dim_opt));
+        }
+    }
+
+    size_t dimension = first_dim_opt ? *first_dim_opt : *second_dim_opt; // at least one is set here; the known one fixes the dimension for both
+    auto type = vector_type_impl::get_instance(float_type, dimension);
+    return {type, type}; // the same vector type is returned for both argument positions
+}
+
+bytes_opt vector_similarity_fct::execute(std::span<const bytes_opt> parameters) { // Deserializes both vector arguments and returns the serialized float similarity score
+    if (std::any_of(parameters.begin(), parameters.end(), [](const auto& param) {
+            return !param;
+        })) {
+        return std::nullopt; // any unset argument makes the result unset as well
+    }
+
+    const auto& type = arg_types()[0]; // the first argument's type is used to deserialize both parameters
+    data_value v1 = type->deserialize(*parameters[0]);
+    data_value v2 = type->deserialize(*parameters[1]);
+    const auto& v1_elements = value_cast<std::vector<data_value>>(v1);
+    const auto& v2_elements = value_cast<std::vector<data_value>>(v2);
+
+    float result = SIMILARITY_FUNCTIONS.at(_name)(v1_elements, v2_elements); // at() throws std::out_of_range if _name is not a registered similarity function
+    return float_type->decompose(result);
+}
+
+} // namespace functions
+} // namespace cql3
--- a/cql3/functions/vector_similarity_fcts.hh
+++ b/cql3/functions/vector_similarity_fcts.hh
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#pragma once
+
+#include "native_scalar_function.hh"
+#include "cql3/assignment_testable.hh"
+#include "cql3/functions/function_name.hh"
+
+namespace cql3 {
+namespace functions {
+
+static const function_name SIMILARITY_COSINE_FUNCTION_NAME = function_name::native_function("similarity_cosine");
+static const function_name SIMILARITY_EUCLIDEAN_FUNCTION_NAME = function_name::native_function("similarity_euclidean");
+static const function_name SIMILARITY_DOT_PRODUCT_FUNCTION_NAME = function_name::native_function("similarity_dot_product");
+
+using similarity_function_t = float (*)(const std::vector<data_value>&, const std::vector<data_value>&);
+extern thread_local const std::unordered_map<function_name, similarity_function_t> SIMILARITY_FUNCTIONS;
+
+std::vector<data_type> retrieve_vector_arg_types(const function_name& name, const std::vector<shared_ptr<assignment_testable>>& provided_args);
+
+class vector_similarity_fct : public native_scalar_function {
+public:
+    vector_similarity_fct(const sstring& name, const std::vector<data_type>& arg_types)
+        : native_scalar_function(name, float_type, arg_types) {
+    }
+
+    virtual bytes_opt execute(std::span<const bytes_opt> parameters) override;
+};
+
+} // namespace functions
+} // namespace cql3
--- a/cql3/selection/selectable.cc
+++ b/cql3/selection/selectable.cc
@@ -32,7 +32,7 @@ bool
 selectable_processes_selection(const expr::expression& selectable) {
    return expr::visit(overloaded_functor{
        [&] (const expr::constant&) -> bool {
-            on_internal_error(slogger, "no way to express SELECT constant in the grammar yet");
+            return true;
        },
        [&] (const expr::conjunction& conj) -> bool {
            on_internal_error(slogger, "no way to express 'SELECT a AND b' in the grammar yet");
--- a/cql3/statements/batch_statement.cc
+++ b/cql3/statements/batch_statement.cc
@@ -190,7 +190,7 @@ future<utils::chunked_vector<mutation>> batch_statement::get_mutations(query_pro
    co_return vresult;
 }

-void batch_statement::verify_batch_size(query_processor& qp, const utils::chunked_vector<mutation>& mutations) {
+void batch_statement::verify_batch_size(query_processor& qp, const utils::chunked_vector<mutation>& mutations) const {
    if (mutations.size() <= 1) {
        return;     // We only warn for batch spanning multiple mutations
    }
@@ -209,8 +209,9 @@ void batch_statement::verify_batch_size(query_processor& qp, const utils::chunke
            for (auto&& m : mutations) {
                ks_cf_pairs.insert(m.schema()->ks_name() + "." + m.schema()->cf_name());
            }
-            return seastar::format("Batch modifying {:d} partitions in {} is of size {:d} bytes, exceeding specified {} threshold of {:d} by {:d}.",
-                    mutations.size(), fmt::join(ks_cf_pairs, ", "), size, type, threshold, size - threshold);
+            const auto batch_type = _type == type::LOGGED ? "Logged" : "Unlogged";
+            return seastar::format("{} batch modifying {:d} partitions in {} is of size {:d} bytes, exceeding specified {} threshold of {:d} by {:d}.",
+                    batch_type, mutations.size(), fmt::join(ks_cf_pairs, ", "), size, type, threshold, size - threshold);
        };
        if (size > fail_threshold) {
            _logger.error("{}", error("FAIL", fail_threshold).c_str());
--- a/cql3/statements/batch_statement.hh
+++ b/cql3/statements/batch_statement.hh
@@ -116,7 +116,7 @@ public:
     * Checks batch size to ensure threshold is met. If not, a warning is logged.
     * @param cfs ColumnFamilies that will store the batch's mutations.
     */
-    static void verify_batch_size(query_processor& qp, const utils::chunked_vector<mutation>& mutations);
+    void verify_batch_size(query_processor& qp, const utils::chunked_vector<mutation>& mutations) const;

    virtual future<shared_ptr<cql_transport::messages::result_message>> execute(
            query_processor& qp, service::query_state& state, const query_options& options, std::optional<service::group0_guard> guard) const override;
--- a/cql3/statements/describe_statement.cc
+++ b/cql3/statements/describe_statement.cc
@@ -710,11 +710,12 @@ std::vector<lw_shared_ptr<column_specification>> listing_describe_statement::get

 future<std::vector<std::vector<managed_bytes_opt>>> listing_describe_statement::describe(cql3::query_processor& qp, const service::client_state& client_state) const {
    auto db = qp.db();
-    auto raw_ks = client_state.get_raw_keyspace();
-
    std::vector<sstring> keyspaces;
-    if (!raw_ks.empty()) {
-        keyspaces.push_back(raw_ks);
+    // For most describe statements we should limit the results to the USEd
+    // keyspace (client_state.get_raw_keyspace()), if any. However for DESC
+    // KEYSPACES we must list all keyspaces, not just the USEd one.
+    if (_element != element_type::keyspace && !client_state.get_raw_keyspace().empty()) {
+        keyspaces.push_back(client_state.get_raw_keyspace());
    } else {
        keyspaces = db.get_all_keyspaces();
        std::ranges::sort(keyspaces);
--- a/db/batchlog_manager.cc
+++ b/db/batchlog_manager.cc
@@ -17,6 +17,7 @@
 #include <seastar/core/metrics.hh>
 #include <seastar/core/coroutine.hh>
 #include <seastar/core/sleep.hh>
+#include <seastar/coroutine/parallel_for_each.hh>

 #include "batchlog_manager.hh"
 #include "batchlog.hh"
@@ -318,8 +319,8 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches

    std::unordered_map<int32_t, replay_stats> replay_stats_per_shard;

-    // Use a stable `now` accross all batches, so skip/replay decisions are the
-    // same accross a while prefix of written_at (accross all ids).
+    // Use a stable `now` across all batches, so skip/replay decisions are the
+    // same across a whole prefix of written_at (across all ids).
    const auto now = db_clock::now();

    auto batch = [this, cleanup, limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
--- a/db/config.cc
+++ b/db/config.cc
@@ -1105,6 +1105,14 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "Like native_transport_port, but clients-side port number (modulo smp) is used to route the connection to the specific shard.")
    , native_shard_aware_transport_port_ssl(this, "native_shard_aware_transport_port_ssl", value_status::Used, 19142,
        "Like native_transport_port_ssl, but clients-side port number (modulo smp) is used to route the connection to the specific shard.")
+    , native_transport_port_proxy_protocol(this, "native_transport_port_proxy_protocol", value_status::Used, 0,
+        "Port on which the CQL native transport listens for clients using proxy protocol v2. Disabled (0) by default.")
+    , native_transport_port_ssl_proxy_protocol(this, "native_transport_port_ssl_proxy_protocol", value_status::Used, 0,
+        "Port on which the CQL TLS native transport listens for clients using proxy protocol v2. Disabled (0) by default.")
+    , native_shard_aware_transport_port_proxy_protocol(this, "native_shard_aware_transport_port_proxy_protocol", value_status::Used, 0,
+        "Like native_transport_port_proxy_protocol, but clients-side port number (modulo smp) is used to route the connection to the specific shard.")
+    , native_shard_aware_transport_port_ssl_proxy_protocol(this, "native_shard_aware_transport_port_ssl_proxy_protocol", value_status::Used, 0,
+        "Like native_transport_port_ssl_proxy_protocol, but clients-side port number (modulo smp) is used to route the connection to the specific shard.")
    , native_transport_max_threads(this, "native_transport_max_threads", value_status::Invalid, 128,
        "The maximum number of thread handling requests. The meaning is the same as rpc_max_threads.\n"
        "Default is different (128 versus unlimited).\n"
@@ -1470,6 +1478,15 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , alternator_max_expression_cache_entries_per_shard(this, "alternator_max_expression_cache_entries_per_shard", liveness::LiveUpdate, value_status::Used, 2000, "Maximum number of cached parsed request expressions, per shard.")
    , alternator_max_users_query_size_in_trace_output(this, "alternator_max_users_query_size_in_trace_output", liveness::LiveUpdate, value_status::Used, uint64_t(4096),
            "Maximum size of user's command in trace output (`alternator_op` entry). Larger traces will be truncated and have `<truncated>` message appended - which doesn't count to the maximum limit.")
+    , alternator_describe_table_info_cache_validity_in_seconds(this, "alternator_describe_table_info_cache_validity_in_seconds", liveness::LiveUpdate, value_status::Used, 60 * 60 * 6,
+        "The validity of DescribeTable information - table size in bytes. This is how long calculated value will be reused before recalculation.")
+    , alternator_response_gzip_compression_level(this, "alternator_response_gzip_compression_level", liveness::LiveUpdate, value_status::Used, int8_t(6),
+            "Controls gzip and deflate compression level for Alternator response bodies (if the client requests it via Accept-Encoding header) Default of 6 is a compromise between speed and compression.\n"
+            "Valid values:\n"
+            "\t0 : No compression (disables gzip/deflate)\n"
+            "\t1-9: Compression levels (1 = fastest, 9 = best compression)")
+    , alternator_response_compression_threshold_in_bytes(this, "alternator_response_compression_threshold_in_bytes", liveness::LiveUpdate, value_status::Used, uint64_t(4096),
+            "When the compression is enabled, this value indicates the minimum size of data to compress. Smaller responses will not be compressed.")
    , abort_on_ebadf(this, "abort_on_ebadf", value_status::Used, true, "Abort the server on incorrect file descriptor access. Throws exception when disabled.")
    , sanitizer_report_backtrace(this, "sanitizer_report_backtrace", value_status::Used, false,
            "In debug mode, report log-structured allocator sanitizer violations with a backtrace. Slow.")
@@ -1566,6 +1583,12 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    // Bigger tables will take longer to be resized. similar-sized tables can be batched into same iteration.
    , tablet_load_stats_refresh_interval_in_seconds(this, "tablet_load_stats_refresh_interval_in_seconds", liveness::LiveUpdate, value_status::Used, 60,
        "Tablet load stats refresh rate in seconds.")
+    , force_capacity_based_balancing(this, "force_capacity_based_balancing", liveness::LiveUpdate, value_status::Used, false,
+        "Forces the load balancer to perform capacity based balancing, instead of size based balancing.")
+    , size_based_balance_threshold_percentage(this, "size_based_balance_threshold_percentage", liveness::LiveUpdate, value_status::Used, 1.0,
+        "Sets the maximum difference in percentages between the most loaded and least loaded nodes, below which the load balancer considers nodes balanced.")
+    , minimal_tablet_size_for_balancing(this, "minimal_tablet_size_for_balancing", liveness::LiveUpdate, value_status::Used, service::default_target_tablet_size / 100,
+        "Sets the minimal tablet size for the load balancer. For any tablet smaller than this, the balancer will use this size instead of the actual tablet size.")
    , default_log_level(this, "default_log_level", value_status::Used, seastar::log_level::info, "Default log level for log messages")
    , logger_log_level(this, "logger_log_level", value_status::Used, {}, "Map of logger name to log level. Valid log levels are 'error', 'warn', 'info', 'debug' and 'trace'")
    , log_to_stdout(this, "log_to_stdout", value_status::Used, true, "Send log output to stdout")
--- a/db/config.hh
+++ b/db/config.hh
@@ -324,6 +324,10 @@ public:
    named_value<uint16_t> native_transport_port_ssl;
    named_value<uint16_t> native_shard_aware_transport_port;
    named_value<uint16_t> native_shard_aware_transport_port_ssl;
+    named_value<uint16_t> native_transport_port_proxy_protocol;
+    named_value<uint16_t> native_transport_port_ssl_proxy_protocol;
+    named_value<uint16_t> native_shard_aware_transport_port_proxy_protocol;
+    named_value<uint16_t> native_shard_aware_transport_port_ssl_proxy_protocol;
    named_value<uint32_t> native_transport_max_threads;
    named_value<uint32_t> native_transport_max_frame_size_in_mb;
    named_value<sstring> broadcast_rpc_address;
@@ -473,6 +477,9 @@ public:
    named_value<bool> alternator_allow_system_table_write;
    named_value<uint32_t> alternator_max_expression_cache_entries_per_shard;
    named_value<uint64_t> alternator_max_users_query_size_in_trace_output;
+    named_value<uint32_t> alternator_describe_table_info_cache_validity_in_seconds;
+    named_value<int> alternator_response_gzip_compression_level;
+    named_value<uint32_t> alternator_response_compression_threshold_in_bytes;

    named_value<bool> abort_on_ebadf;

@@ -590,6 +597,9 @@ public:
    named_value<bool> rf_rack_valid_keyspaces;

    named_value<uint32_t> tablet_load_stats_refresh_interval_in_seconds;
+    named_value<bool> force_capacity_based_balancing;
+    named_value<float> size_based_balance_threshold_percentage;
+    named_value<uint64_t> minimal_tablet_size_for_balancing;

    static const sstring default_tls_priority;
 private:
--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -26,6 +26,7 @@
 #include <seastar/core/smp.hh>
 #include <seastar/coroutine/exception.hh>
 #include <seastar/coroutine/parallel_for_each.hh>
+#include <seastar/util/file.hh>

 // Boost features.

@@ -643,6 +644,12 @@ future<> manager::drain_for(endpoint_id host_id, gms::inet_address ip) noexcept
        co_return;
    }

+    if (!replay_allowed()) { // documented precondition of drain_for(): hint replay must be allowed
+        auto reason = seastar::format("Precondition violated while trying to drain {} / {}: "
+                "hint replay is not allowed", host_id, ip);
+        on_internal_error(manager_logger, std::move(reason));
+    }
+
    manager_logger.info("Draining starts for {}", host_id);

    const auto holder = seastar::gate::holder{_draining_eps_gate};
@@ -899,7 +906,7 @@ future<> manager::migrate_ip_directories() {
    co_await coroutine::parallel_for_each(dirs_to_remove, [] (auto& directory) -> future<> {
        try {
            manager_logger.warn("Removing hint directory {}", directory.native());
-            co_await lister::rmdir(directory);
+            co_await seastar::recursive_remove_directory(directory);
        } catch (...) {
            on_internal_error(manager_logger,
                    seastar::format("Removing a hint directory has failed. Reason: {}", std::current_exception()));
--- a/db/hints/manager.hh
+++ b/db/hints/manager.hh
@@ -318,6 +318,10 @@ public:
    /// In both cases - removes the corresponding hints' directories after all hints have been drained and erases the
    /// corresponding hint_endpoint_manager objects.
    ///
+    /// Preconditions:
+    /// * Hint replay must be allowed (i.e. `replay_allowed()` must be true) throughout
+    ///   the execution of this function.
+    ///
    /// \param host_id host ID of the node that left the cluster
    /// \param ip the IP of the node that left the cluster
    future<> drain_for(endpoint_id host_id, gms::inet_address ip) noexcept;
@@ -342,15 +346,15 @@ public:
        return _state.contains(state::started);
    }

+    bool replay_allowed() const noexcept {
+        return _state.contains(state::replay_allowed);
+    }
+
 private:
    void set_started() noexcept {
        _state.set(state::started);
    }

-    bool replay_allowed() const noexcept {
-        return _state.contains(state::replay_allowed);
-    }
-
    void set_draining_all() noexcept {
        _state.set(state::draining_all);
    }
--- a/db/snapshot/backup_task.cc
+++ b/db/snapshot/backup_task.cc
@@ -152,7 +152,8 @@ future<> backup_task_impl::do_backup() {
 }

 future<> backup_task_impl::process_snapshot_dir() {
-    auto snapshot_dir_lister = directory_lister(_snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>());
+    auto directory = co_await io_check(open_directory, _snapshot_dir.native());
+    auto snapshot_dir_lister = directory_lister(directory, _snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>());
    size_t num_sstable_comps = 0;

    try {
@@ -161,7 +162,7 @@ future<> backup_task_impl::process_snapshot_dir() {
        while (auto component_ent = co_await snapshot_dir_lister.get()) {
            const auto& name = component_ent->name;
            auto file_path = _snapshot_dir / name;
-            auto st = co_await file_stat(file_path.native());
+            auto st = co_await file_stat(directory, name);
            total += st.size;
            try {
                auto desc = sstables::parse_path(file_path, "", "");
--- a/db/system_keyspace.cc
+++ b/db/system_keyspace.cc
@@ -55,6 +55,7 @@
 #include "message/shared_dict.hh"
 #include "replica/database.hh"
 #include "db/compaction_history_entry.hh"
+#include "mutation/async_utils.hh"

 #include <unordered_map>

@@ -2999,7 +3000,9 @@ future<mutation> system_keyspace::get_group0_history(sharded<replica::database>&
    SCYLLA_ASSERT(rs);
    auto& ps = rs->partitions();
    for (auto& p: ps) {
-        auto mut = p.mut().unfreeze(s);
+        // Note: we could decorate the frozen_mutation's key to check if it's the expected one
+        // but since this is a single partition table, we can just check after unfreezing the whole mutation.
+        auto mut = co_await unfreeze_gently(p.mut(), s);
        auto partition_key = value_cast<sstring>(utf8_type->deserialize(mut.key().get_component(*s, 0)));
        if (partition_key == GROUP0_HISTORY_KEY) {
            co_return mut;
@@ -3157,7 +3160,10 @@ static bool must_have_tokens(service::node_state nst) {
    // A decommissioning node doesn't have tokens at the end, they are
    // removed during transition to the left_token_ring state.
    case service::node_state::decommissioning: return false;
-    case service::node_state::removing: return true;
+    // A removing node might or might not have tokens depending on whether
+    // REMOVENODE_WITH_LEFT_TOKEN_RING feature is enabled. To support both
+    // cases, we allow removing nodes to not have tokens.
+    case service::node_state::removing: return false;
    case service::node_state::rebuilding: return true;
    case service::node_state::normal: return true;
    case service::node_state::left: return false;
--- a/db/system_keyspace.hh
+++ b/db/system_keyspace.hh
@@ -200,6 +200,7 @@ public:
    static constexpr auto DICTS = "dicts";
    static constexpr auto VIEW_BUILDING_TASKS = "view_building_tasks";
    static constexpr auto CLIENT_ROUTES = "client_routes";
+    static constexpr auto VERSIONS = "versions";

    // auth
    static constexpr auto ROLES = "roles";
--- a/db/view/view_building_worker.cc
+++ b/db/view/view_building_worker.cc
@@ -198,6 +198,7 @@ future<> view_building_worker::register_staging_sstable_tasks(std::vector<sstabl

 future<> view_building_worker::run_staging_sstables_registrator() {
    while (!_as.abort_requested()) {
+        bool sleep = false;
        try {
            auto lock = co_await get_units(_staging_sstables_mutex, 1, _as);
            co_await create_staging_sstable_tasks();
@@ -214,6 +215,14 @@ future<> view_building_worker::run_staging_sstables_registrator() {
            vbw_logger.warn("Got group0_concurrent_modification while creating staging sstable tasks");
        } catch (raft::request_aborted&) {
            vbw_logger.warn("Got raft::request_aborted while creating staging sstable tasks");
+        } catch (...) {
+            vbw_logger.error("Exception while creating staging sstable tasks: {}", std::current_exception());
+            sleep = true;
+        }
+
+        if (sleep) {
+            vbw_logger.debug("Sleeping after exception.");
+            co_await seastar::sleep_abortable(1s, _as).handle_exception([] (auto x) { return make_ready_future<>(); });
        }
    }
 }
@@ -417,9 +426,12 @@ future<> view_building_worker::check_for_aborted_tasks() {

        auto my_host_id = vbw._db.get_token_metadata().get_topology().my_host_id();
        auto my_replica = locator::tablet_replica{my_host_id, this_shard_id()};
-        auto tasks_map = vbw._state._batch->tasks; // Potentially, we'll remove elements from the map, so we need a copy to iterate over it
-        for (auto& [id, t]: tasks_map) {
-            auto task_opt = building_state.get_task(t.base_id, my_replica, id);
+        auto it = vbw._state._batch->tasks.begin();
+        while (it != vbw._state._batch->tasks.end()) {
+            auto id = it->first;
+            auto task_opt = building_state.get_task(it->second.base_id, my_replica, id);
+
+            ++it; // Advance the iterator before potentially removing the entry from the map.
            if (!task_opt || task_opt->get().aborted) {
                co_await vbw._state._batch->abort_task(id);
            }
@@ -449,7 +461,7 @@ static std::unordered_set<table_id> get_ids_of_all_views(replica::database& db,
    }) | std::ranges::to<std::unordered_set>();;
 }

-// If `state::processing_base_table` is diffrent that the `view_building_state::currently_processed_base_table`,
+// If `state::processing_base_table` is different from the `view_building_state::currently_processed_base_table`,
 // clear the state, save and flush new base table
 future<> view_building_worker::state::update_processing_base_table(replica::database& db, const view_building_state& building_state, abort_source& as) {
    if (processing_base_table != building_state.currently_processed_base_table) {
@@ -571,8 +583,6 @@ future<> view_building_worker::batch::do_work() {
            break;
        }
    }
-
-    _vbw.local()._vb_state_machine.event.broadcast();
 }

 future<> view_building_worker::do_build_range(table_id base_id, std::vector<table_id> views_ids, dht::token last_token, abort_source& as) {
@@ -774,13 +784,15 @@ future<std::vector<utils::UUID>> view_building_worker::work_on_tasks(raft::term_
            tasks.insert({id, *task_opt});
        }
 #ifdef SEASTAR_DEBUG
-        auto& some_task = tasks.begin()->second;
-        for (auto& [_, t]: tasks) {
-            SCYLLA_ASSERT(t.base_id == some_task.base_id);
-            SCYLLA_ASSERT(t.last_token == some_task.last_token);
-            SCYLLA_ASSERT(t.replica == some_task.replica);
-            SCYLLA_ASSERT(t.type == some_task.type);
-            SCYLLA_ASSERT(t.replica.shard == this_shard_id());
+        {
+            auto& some_task = tasks.begin()->second;
+            for (auto& [_, t]: tasks) {
+                SCYLLA_ASSERT(t.base_id == some_task.base_id);
+                SCYLLA_ASSERT(t.last_token == some_task.last_token);
+                SCYLLA_ASSERT(t.replica == some_task.replica);
+                SCYLLA_ASSERT(t.type == some_task.type);
+                SCYLLA_ASSERT(t.replica.shard == this_shard_id());
+            }
        }
 #endif

@@ -811,25 +823,6 @@ future<std::vector<utils::UUID>> view_building_worker::work_on_tasks(raft::term_
    co_return collect_completed_tasks();
 }

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
 }

 }
--- a/db/virtual_tables.cc
+++ b/db/virtual_tables.cc
@@ -605,8 +605,8 @@ public:
    }

    static schema_ptr build_schema() {
-        auto id = generate_legacy_id(system_keyspace::NAME, "versions");
-        return schema_builder(system_keyspace::NAME, "versions", std::make_optional(id))
+        auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::VERSIONS);
+        return schema_builder(system_keyspace::NAME, system_keyspace::VERSIONS, std::make_optional(id))
            .with_column("key", utf8_type, column_kind::partition_key)
            .with_column("version", utf8_type)
            .with_column("build_mode", utf8_type)
@@ -749,6 +749,7 @@ class clients_table : public streaming_virtual_table {
            .with_column("ssl_protocol", utf8_type)
            .with_column("username", utf8_type)
            .with_column("scheduling_group", utf8_type)
+            .with_column("client_options", map_type_impl::get_instance(utf8_type, utf8_type, false))
            .with_hash_version()
            .build();
    }
@@ -766,7 +767,7 @@ class clients_table : public streaming_virtual_table {

    future<> execute(reader_permit permit, result_collector& result, const query_restrictions& qr) override {
        // Collect
-        using client_data_vec = utils::chunked_vector<client_data>;
+        using client_data_vec = utils::chunked_vector<foreign_ptr<std::unique_ptr<client_data>>>;
        using shard_client_data = std::vector<client_data_vec>;
        std::vector<foreign_ptr<std::unique_ptr<shard_client_data>>> cd_vec;
        cd_vec.resize(smp::count);
@@ -806,13 +807,13 @@ class clients_table : public streaming_virtual_table {
        for (unsigned i = 0; i < smp::count; i++) {
            for (auto&& ps_cdc : *cd_vec[i]) {
                for (auto&& cd : ps_cdc) {
-                    if (cd_map.contains(cd.ip)) {
-                        cd_map[cd.ip].emplace_back(std::move(cd));
+                    if (cd_map.contains(cd->ip)) {
+                        cd_map[cd->ip].emplace_back(std::move(cd));
                    } else {
-                        dht::decorated_key key = make_partition_key(cd.ip);
+                        dht::decorated_key key = make_partition_key(cd->ip);
                        if (this_shard_owns(key) && contains_key(qr.partition_range(), key)) {
-                            ips.insert(decorated_ip{std::move(key), cd.ip});
-                            cd_map[cd.ip].emplace_back(std::move(cd));
+                            ips.insert(decorated_ip{std::move(key), cd->ip});
+                            cd_map[cd->ip].emplace_back(std::move(cd));
                        }
                    }
                    co_await coroutine::maybe_yield();
@@ -825,39 +826,58 @@ class clients_table : public streaming_virtual_table {
            co_await result.emit_partition_start(dip.key);
            auto& clients = cd_map[dip.ip];

-            std::ranges::sort(clients, [] (const client_data& a, const client_data& b) {
-                return a.port < b.port || a.client_type_str() < b.client_type_str();
+            std::ranges::sort(clients, [] (const foreign_ptr<std::unique_ptr<client_data>>& a, const foreign_ptr<std::unique_ptr<client_data>>& b) {
+                return a->port < b->port || (a->port == b->port && a->client_type_str() < b->client_type_str()); // lexicographic (port, client type): a strict weak ordering, as sort requires
+            });

            for (const auto& cd : clients) {
-                clustering_row cr(make_clustering_key(cd.port, cd.client_type_str()));
-                set_cell(cr.cells(), "shard_id", cd.shard_id);
-                set_cell(cr.cells(), "connection_stage", cd.stage_str());
-                if (cd.driver_name) {
-                    set_cell(cr.cells(), "driver_name", *cd.driver_name);
+                clustering_row cr(make_clustering_key(cd->port, cd->client_type_str()));
+                set_cell(cr.cells(), "shard_id", cd->shard_id);
+                set_cell(cr.cells(), "connection_stage", cd->stage_str());
+                if (cd->driver_name) {
+                    set_cell(cr.cells(), "driver_name", cd->driver_name->key());
                }
-                if (cd.driver_version) {
-                    set_cell(cr.cells(), "driver_version", *cd.driver_version);
+                if (cd->driver_version) {
+                    set_cell(cr.cells(), "driver_version", cd->driver_version->key());
                }
-                if (cd.hostname) {
-                    set_cell(cr.cells(), "hostname", *cd.hostname);
+                if (cd->hostname) {
+                    set_cell(cr.cells(), "hostname", *cd->hostname);
                }
-                if (cd.protocol_version) {
-                    set_cell(cr.cells(), "protocol_version", *cd.protocol_version);
+                if (cd->protocol_version) {
+                    set_cell(cr.cells(), "protocol_version", *cd->protocol_version);
                }
-                if (cd.ssl_cipher_suite) {
-                    set_cell(cr.cells(), "ssl_cipher_suite", *cd.ssl_cipher_suite);
+                if (cd->ssl_cipher_suite) {
+                    set_cell(cr.cells(), "ssl_cipher_suite", *cd->ssl_cipher_suite);
                }
-                if (cd.ssl_enabled) {
-                    set_cell(cr.cells(), "ssl_enabled", *cd.ssl_enabled);
+                if (cd->ssl_enabled) {
+                    set_cell(cr.cells(), "ssl_enabled", *cd->ssl_enabled);
                }
-                if (cd.ssl_protocol) {
-                    set_cell(cr.cells(), "ssl_protocol", *cd.ssl_protocol);
+                if (cd->ssl_protocol) {
+                    set_cell(cr.cells(), "ssl_protocol", *cd->ssl_protocol);
                }
-                set_cell(cr.cells(), "username", cd.username ? *cd.username : sstring("anonymous"));
-                if (cd.scheduling_group_name) {
-                    set_cell(cr.cells(), "scheduling_group", *cd.scheduling_group_name);
+                set_cell(cr.cells(), "username", cd->username ? *cd->username : sstring("anonymous"));
+                if (cd->scheduling_group_name) {
+                    set_cell(cr.cells(), "scheduling_group", *cd->scheduling_group_name);
                }
+
+                auto map_type = map_type_impl::get_instance(
+                    utf8_type,
+                    utf8_type,
+                    false
+                );
+
+                auto prepare_client_options = [] (const auto& client_options) {
+                    map_type_impl::native_type tmp;
+                    for (auto& co: client_options) {
+                        auto map_element = std::make_pair(data_value(co.key.key()), data_value(co.value.key()));
+                        tmp.push_back(std::move(map_element));
+                    }
+                    return tmp;
+                };
+
+                set_cell(cr.cells(), "client_options",
+                    make_map_value(map_type, prepare_client_options(cd->client_options)));
+
                co_await result.emit_row(std::move(cr));
            }
            co_await result.emit_partition_end();
@@ -1100,9 +1120,10 @@ public:
        }

        auto tm = _db.local().get_token_metadata_ptr();
-        auto target_tablet_size = _db.local().get_config().target_tablet_size_in_bytes();

-        locator::load_sketch load(tm);
+        const uint64_t default_tablet_size = _db.local().get_config().target_tablet_size_in_bytes();
+
+        locator::load_sketch load(tm, stats, default_tablet_size);
        co_await load.populate();

        tm->get_topology().for_each_node([&] (const auto& node) {
@@ -1116,18 +1137,23 @@ public:
            if (auto ip = _gossiper.local().get_address_map().find(host)) {
                set_cell(r.cells(), "ip", data_value(inet_address(*ip)));
            }
-            set_cell(r.cells(), "tablets_allocated", load.get_load(host));
-            set_cell(r.cells(), "tablets_allocated_per_shard", data_value(double(load.get_real_avg_shard_load(host))));
-            set_cell(r.cells(), "storage_allocated_load", data_value(int64_t(load.get_load(host) * target_tablet_size)));
+            set_cell(r.cells(), "tablets_allocated", int64_t(load.get_tablet_count(host)));
+            set_cell(r.cells(), "tablets_allocated_per_shard", data_value(double(load.get_real_avg_tablet_count(host))));
+            set_cell(r.cells(), "storage_allocated_load", data_value(int64_t(load.get_tablet_count(host) * default_tablet_size)));

            if (stats && stats->capacity.contains(host)) {
                auto capacity = stats->capacity.at(host);
                set_cell(r.cells(), "storage_capacity", data_value(int64_t(capacity)));

-                auto utilization = load.get_allocated_utilization(host, *stats, target_tablet_size);
-                if (utilization) {
+                if (auto utilization = load.get_allocated_utilization(host)) {
                    set_cell(r.cells(), "storage_allocated_utilization", data_value(double(*utilization)));
                }
+                if (load.has_complete_data(host)) {
+                    if (auto utilization = load.get_storage_utilization(host)) {
+                        set_cell(r.cells(), "storage_utilization", data_value(double(*utilization)));
+                    }
+                    set_cell(r.cells(), "storage_load", data_value(int64_t(load.get_disk_used(host))));
+                }
            }
            mutation_sink(m);
        });
@@ -1147,6 +1173,8 @@ private:
            .with_column("storage_capacity", long_type)
            .with_column("storage_allocated_load", long_type)
            .with_column("storage_allocated_utilization", double_type)
+            .with_column("storage_load", long_type)
+            .with_column("storage_utilization", double_type)
            .with_sharder(1, 0) // shard0-only
            .with_hash_version()
            .build();
--- a/docs/alternator/compatibility.md
+++ b/docs/alternator/compatibility.md
@@ -271,6 +271,12 @@ is different, or can be configured in Alternator:
  So for example, if you create a table whose name is 192 characters, you
  can't create a GSI whose name is longer than 29 characters.

+* DynamoDB's DescribeTable will return information about the table. According to
+  AWS documentation, fields TableSizeBytes, IndexSizeBytes and ItemCount can
+  lag behind by up to 6 hours.
+  The `alternator_describe_table_info_cache_validity_in_seconds` parameter allows
+  users to change this timeout - the default value in seconds is set to 21600 (6 hours).
+
 ## Experimental API features

 Some DynamoDB API features are supported by Alternator, but considered
@@ -290,6 +296,14 @@ experimental:
  considered experimental so needs to be enabled explicitly with the
  `--experimental-features=alternator-streams` configuration option.

+  In this version, Alternator Streams is only supported if the base table
+  uses vnodes instead of tablets. However, by default new tables use tablets
+  so to create a table that can be used with Streams, you must set the tag
+  `system:initial_tablets` to `none` during CreateTable - so that the
+  new table will use vnodes. Streams cannot be enabled on an already-existing
+  table that uses tablets.
+  See <https://github.com/scylladb/scylla/issues/23838>.
+
  Alternator streams also differ in some respects from DynamoDB Streams:
  * The number of separate "shards" in Alternator's streams is significantly
    larger than is typical on DynamoDB.
@@ -375,11 +389,11 @@ they should be easy to detect. Here is a list of these unimplemented features:
  another cache in front of the it. We wrote more about this here:
  <https://www.scylladb.com/2017/07/31/database-caches-not-good/>

-* The DescribeTable is missing information about size estimates, and 
-  also part of the information about indexes enabled on the table.
+* The DescribeTable is missing some information about size estimates
+  (IndexSizeBytes and ItemCount - TableSizeBytes is available), and also
+  part of the information about indexes enabled on the table.
  <https://github.com/scylladb/scylla/issues/5320>
  <https://github.com/scylladb/scylla/issues/7550>
-  <https://github.com/scylladb/scylla/issues/7551>

 * The PartiQL syntax (SQL-like SELECT/UPDATE/INSERT/DELETE expressions)
  and the operations ExecuteStatement, BatchExecuteStatement and
--- a/docs/cql/ddl.rst
+++ b/docs/cql/ddl.rst
@@ -365,7 +365,7 @@ Modifying a keyspace with tablets enabled is possible and doesn't require any sp

 - The replication factor (RF) can be increased or decreased by at most 1 at a time. To reach the desired RF value, modify the RF repeatedly.
 - The ``ALTER`` statement rejects the ``replication_factor`` tag. List the DCs explicitly when altering a keyspace. See :ref:`NetworkTopologyStrategy <replication-strategy>`.
- If there's any other ongoing global topology operation, executing the ``ALTER`` statement will fail (with an explicit and specific error) and needs to be repeated.
+- An RF change cannot be requested while another RF change is pending for the same keyspace. Attempting to execute an ``ALTER`` statement in this scenario will fail with an explicit error. Wait for the ongoing RF change to complete before issuing another ``ALTER`` statement.
 - The ``ALTER`` statement may take longer than the regular query timeout, and even if it times out, it will continue to execute in the background.
 - The replication strategy cannot be modified, as keyspaces with tablets only support ``NetworkTopologyStrategy``.
 - The ``ALTER`` statement will fail if it would make the keyspace :term:`RF-rack-invalid <RF-rack-valid keyspace>`.
@@ -1043,6 +1043,8 @@ The following modes are available:
   * - ``immediate``
     - Tombstone GC is immediately performed. There is no wait time or repair requirement. This mode is useful for a table that uses the TWCS compaction strategy with no user deletes. After data is expired after TTL, ScyllaDB can perform compaction to drop the expired data immediately.

+.. warning:: The ``repair`` mode is not supported for :term:`Colocated Tables <Colocated Table>` in this version.
+
 .. _cql-per-table-tablet-options:

 Per-table tablet options
--- a/docs/cql/dml/select.rst
+++ b/docs/cql/dml/select.rst
@@ -272,9 +272,17 @@ For example::
 This query returns up to 5 rows with the closest distance of ``embedding`` vector to the provided query vector,
 in this case ``[0.1, 0.2, 0.3, 0.4]``.

+It is also possible to return the similarity score along with the results by using the :ref:`similarity functions <vector-similarity-functions>`.
+
+For example::
+
+    SELECT image_id, similarity_cosine(embedding, [0.1, 0.2, 0.3, 0.4])
+      FROM ImageEmbeddings
+      ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;
+
 .. warning:: 

-  Currently, vector queries do not support filtering with ``WHERE`` clause, returning similarity distances,
+  Currently, vector queries do not support filtering with ``WHERE`` clause,
  grouping with ``GROUP BY`` and paging. This will be added in the future releases.


--- a/docs/cql/functions.rst
+++ b/docs/cql/functions.rst
@@ -227,6 +227,39 @@ A number of functions are provided to “convert” the native types into binary
 takes a 64-bit ``blob`` argument and converts it to a ``bigint`` value. For example, ``bigintAsBlob(3)`` is
 ``0x0000000000000003`` and ``blobAsBigint(0x0000000000000003)`` is ``3``.

+
+.. _vector-similarity-functions:
+
+Vector similarity functions
+```````````````````````````
+
+To obtain the similarity of the given vectors, use a ``SELECT`` query::
+
+    SELECT comment, similarity_cosine(comment_vector, [0.1, 0.15, 0.3, 0.12, 0.05])
+        FROM cycling.comments_vs;
+
+The supported functions for this type of query are:
+
+- ``similarity_cosine``
+- ``similarity_euclidean``
+- ``similarity_dot_product``
+
+with the parameters of (``<vector>``, ``<vector>``).
+
+The ``vector`` is either the name of the float vector column or :ref:`vector of floats <vectors>`.
+Both arguments must be of the same dimension.
+
+These functions return a ``float`` value representing the similarity between the given vectors for each row.
+The similarity value is a floating-point number in a range of [0, 1] that describes how similar two vectors are.
+Values closer to 1 indicate greater similarity.
+The ``similarity_euclidean`` and ``similarity_dot_product`` functions do not perform vector normalization prior to computing similarity.
+
+.. note::
+    The ``similarity_dot_product`` function assumes that all input vectors are L2-normalized.
+    Supplying non-normalized vectors will result in dot product values that do not represent cosine similarity and therefore are not meaningful for similarity comparison.
+    If the input vectors are not normalized, consider using the ``similarity_cosine`` function instead.
+
+
 .. _udfs:

 User-defined functions :label-caution:`Experimental`
--- a/docs/dev/audit.md
+++ b/docs/dev/audit.md
@@ -1,6 +1,6 @@
 # Introduction

-Similar to the approach described in CASSANDRA-14471, we add the
+Similar to the approach described in CASSANDRA-12151, we add the
 concept of an audit specification.  An audit has a target (syslog or a
 table) and a set of events/actions that it wants recorded.  We
 introduce new CQL syntax for Scylla users to describe and manipulate
--- a/docs/dev/docker-hub.md
+++ b/docs/dev/docker-hub.md
@@ -2,8 +2,11 @@

 ## What is ScyllaDB?

-ScyllaDB is a high-performance NoSQL database system, fully compatible with Apache Cassandra.
-ScyllaDB is released under the GNU Affero General Public License version 3 and the Apache License, ScyllaDB is free and open-source software.
+ScyllaDB is a high-performance NoSQL database optimized for speed and scalability.
+It is designed to efficiently handle large volumes of data with minimal latency,
+making it ideal for data-intensive applications.
+
+ScyllaDB is distributed under the [ScyllaDB Source Available License](https://github.com/scylladb/scylladb/blob/master/LICENSE-ScyllaDB-Source-Available.md).

 > [ScyllaDB](http://www.scylladb.com/)

--- a/docs/dev/protocol-extensions.md
+++ b/docs/dev/protocol-extensions.md
@@ -74,6 +74,8 @@ The keys and values are:
    as an indicator to which shard client wants to connect. The desired shard number
    is calculated as: `desired_shard_no = client_port % SCYLLA_NR_SHARDS`.
    Its value is a decimal representation of type `uint16_t`, by default `19142`.
+  - `CLIENT_OPTIONS` is a string containing a JSON object representation that
+    contains CQL Driver configuration, e.g. load balancing policy, retry policy, timeouts, etc.

 Currently, one `SCYLLA_SHARDING_ALGORITHM` is defined,
 `biased-token-round-robin`. To apply the algorithm,
--- a/docs/dev/system_keyspace.md
+++ b/docs/dev/system_keyspace.md
@@ -372,6 +372,8 @@ Columns:
 * `storage_allocated_load` - Disk space allocated for tablets, assuming each tablet has a fixed size (target_tablet_size).
 * `storage_allocated_utilization` - Fraction of node's disk capacity taken for `storage_allocated_load`, where 1.0 means full utilization.
 * `storage_capacity` - Total disk capacity in bytes. Used to compute `storage_allocated_utilization`. By default equal to file system's capacity.
+* `storage_load` - Disk space allocated for tablets, computed with actual tablet sizes. Can be null if some of the tablet sizes are not known.
+* `storage_utilization` - Fraction of node's disk capacity taken for `storage_load` (with actual tablet sizes), where 1.0 means full utilization. Can be null if some of the tablet sizes are not known.
 * `tablets_allocated` - Number of tablet replicas on the node. Migrating tablets are accounted as if migration already finished.
 * `tablets_allocated_per_shard` - `tablets_allocated` divided by shard count on the node.

--- a/docs/dev/topology-over-raft.md
+++ b/docs/dev/topology-over-raft.md
@@ -86,6 +86,7 @@ stateDiagram-v2
        de_left_token_ring --> [*]
    }
    state removing {
+        re_left_token_ring : left_token_ring
        re_tablet_draining : tablet_draining
        re_tablet_migration : tablet_migration
        re_write_both_read_old : write_both_read_old
@@ -98,7 +99,8 @@ stateDiagram-v2
        re_tablet_draining --> re_write_both_read_old
        re_write_both_read_old --> re_write_both_read_new: streaming completed
        re_write_both_read_old --> re_rollback_to_normal: rollback
-        re_write_both_read_new --> [*]
+        re_write_both_read_new --> re_left_token_ring
+        re_left_token_ring --> [*]
    }
    rebuilding --> normal: streaming completed
    decommissioning --> left: operation succeeded
@@ -122,9 +124,10 @@ Note that these are not all states, as there are other states specific to tablet
    Writes to vnodes-based tables are going to both new and old replicas (new replicas means calculated according
    to modified token ring), reads are using old replicas.
 - `write_both_read_new` - as above, but reads are using new replicas.
- `left_token_ring` - the decommissioning node left the token ring, but we still need to wait until other
-    nodes observe it and stop sending writes to this node. Then, we tell the node to shut down and remove
-    it from group 0. We also use this state to rollback a failed bootstrap or decommission.
+- `left_token_ring` - the decommissioning or removing node left the token ring, but we still need to wait until other
+    nodes observe it and stop sending writes to this node. For decommission, we tell the node to shut down,
+    then remove it from group 0. For removenode, the node is already down, so we skip the shutdown step.
+    We also use this state to rollback a failed bootstrap or decommission.
 - `rollback_to_normal` - the decommission or removenode operation failed. Rollback the operation by
    moving the node we tried to decommission/remove back to the normal state.
 - `lock` - the topology stays in this state until externally changed (to null state), preventing topology
@@ -141,7 +144,9 @@ reads that started before this point exist in the system. Finally we remove the
 transitioning state.

 Decommission, removenode and replace work similarly, except they don't go through
-`commit_cdc_generation`.
+`commit_cdc_generation`. Both decommission and removenode go through the
+`left_token_ring` state to run a global barrier ensuring all nodes are aware
+of the topology change before the operation completes.

 The state machine may also go only through the `commit_cdc_generation` state
 after getting a request from the user to create a new CDC generation if the
--- a/docs/dev/view-building-coordinator.md
+++ b/docs/dev/view-building-coordinator.md
@@ -41,12 +41,12 @@ Unless the task was aborted, the worker will eventually reply that the task was
 it temporarily saves list of ids of finished tasks and removes those tasks from group0 state (pernamently marking them as finished) in 200ms intervals. (*)
 This batching of removing finished tasks is done in order to reduce number of generated group0 operations.

-On the other hand, view buildind tasks can can also be aborted due to 2 main reasons:
+On the other hand, view building tasks can also be aborted due to 2 main reasons:
 - a keyspace/view was dropped
 - tablet operations (see [tablet operations section](#tablet-operations))
 In the first case we simply delete relevant view building tasks as they are no longer needed.
-But if a task needs to be aborted due to tablet operation, we're firstly setting the `aborted` flag to true. We need to do this because we need the task informations
-to created a new adjusted tasks (if the operation succeeded) or rollback them (if the operation failed).
+But if a task needs to be aborted due to tablet operation, we're firstly setting the `aborted` flag to true. We need to do this because we need the task information
+to create new adjusted tasks (if the operation succeeded) or roll them back (if the operation failed).
 Once a task is aborted by setting the flag, this cannot be revoked, so rolling back a task means creating its duplicate and removing the original task.

 (*) - Because there is a time gap between when the coordinator learns that a task is finished (from the RPC response) and when the task is marked as completed,
--- a/docs/features/index.rst
+++ b/docs/features/index.rst
@@ -17,6 +17,7 @@ This document highlights ScyllaDB's key data modeling features.
   Workload Prioritization </features/workload-prioritization>
   Backup and Restore </features/backup-and-restore>
   Incremental Repair </features/incremental-repair/>
+   Vector Search </features/vector-search/>

 .. panel-box::
  :title: ScyllaDB Features
@@ -43,3 +44,5 @@ This document highlights ScyllaDB's key data modeling features.
  * :doc:`Incremental Repair </features/incremental-repair/>` provides a much more
    efficient and lightweight approach to maintaining data consistency by
    repairing only the data that has changed since the last repair.
+  * :doc:`Vector Search in ScyllaDB </features/vector-search/>` enables
+    similarity-based queries on vector embeddings.
--- a/docs/features/vector-search.rst
+++ b/docs/features/vector-search.rst
@@ -0,0 +1,55 @@
+=================================
+Vector Search in ScyllaDB
+=================================
+
+.. note::
+
+    This feature is currently available only in `ScyllaDB Cloud <https://cloud.docs.scylladb.com/>`_.
+
+What Is Vector Search
+-------------------------
+
+Vector Search enables similarity-based queries over high-dimensional data,
+such as text, images, audio, or user behavior. Instead of searching for exact
+matches, it allows applications to find items that are semantically similar to
+a given input.
+
+To do this, Vector Search works on vector embeddings, which are numerical
+representations of data that capture semantic meaning. This enables queries
+such as:
+
+* “Find documents similar to this paragraph”
+* “Find products similar to what the user just viewed”
+* “Find previous tickets related to this support request”
+
+Rather than relying on exact values or keywords, Vector Search returns results
+based on distance or similarity between vectors. This capability is
+increasingly used in modern workloads such as AI-powered search, recommendation
+systems, and retrieval-augmented generation (RAG).
+
+Why Vector Search Matters
+------------------------------------
+
+Many applications already rely on ScyllaDB for high throughput, low and
+predictable latency, and large-scale data storage.
+
+Vector Search complements these strengths by enabling new classes of workloads,
+including:
+
+* Semantic search over text or documents
+* Recommendations based on user or item similarity
+* AI and ML applications, including RAG pipelines
+* Anomaly and pattern detection
+
+With Vector Search, ScyllaDB can serve as the similarity search backend for
+AI-driven applications.
+
+Availability
+--------------
+
+Vector Search is currently available only in ScyllaDB Cloud, the fully managed
+ScyllaDB service.
+
+
+👉 For details on using Vector Search, refer to the
+`ScyllaDB Cloud documentation <https://cloud.docs.scylladb.com/stable/vector-search/index.html>`_.
--- a/docs/getting-started/_common/setup-after-install.rst
+++ b/docs/getting-started/_common/setup-after-install.rst
@@ -45,10 +45,3 @@ Run cqlsh:
     
     cqlsh

-Run cassandra-stress:
-
-.. code-block:: console
-     
-     cassandra-stress write -mode cql3 native 
-
-
--- a/docs/getting-started/cloud-instance-recommendations.rst
+++ b/docs/getting-started/cloud-instance-recommendations.rst
@@ -20,7 +20,10 @@ You can run your ScyllaDB workloads on AWS, GCE, and Azure using a ScyllaDB imag
 Amazon Web Services (AWS)
 -----------------------------

-The recommended instance types are :ref:`i3en <system-requirements-i3en-instances>`, :ref:`i4i <system-requirements-i4i-instances>`, :ref:`i7i <system-requirements-i7i-instances>`, and :ref:`i7ie <system-requirements-i7ie-instances>`.
+The recommended instance types are :ref:`i3en <system-requirements-i3en-instances>`,
+:ref:`i4i <system-requirements-i4i-instances>`, :ref:`i7i <system-requirements-i7i-instances>`,
+:ref:`i7ie <system-requirements-i7ie-instances>`, :ref:`i8g <system-requirements-i8g-instances>`,
+and :ref:`i8ge <system-requirements-i8ge-instances>`.

 .. note::

@@ -195,6 +198,118 @@ All i7i instances have the following specs:

 See `Amazon EC2 I7i Instances <https://aws.amazon.com/ec2/instance-types/i7i/>`_ for details.

+
+.. _system-requirements-i8g-instances:
+
+i8g instances
+^^^^^^^^^^^^^^
+
+The following i8g instances are supported.
+
+.. list-table::
+   :widths: 30 20 20 30
+   :header-rows: 1
+
+   * - Model
+     - vCPU
+     - Mem (GiB)
+     - Storage (GB)
+   * - i8g.large
+     - 2
+     - 16
+     - 1 x 468 GB
+   * - i8g.xlarge
+     - 4
+     - 32
+     - 1 x 937 GB
+   * - i8g.2xlarge
+     - 8
+     - 64
+     - 1 x 1,875 GB
+   * - i8g.4xlarge
+     - 16
+     - 128
+     - 1 x 3,750 GB
+   * - i8g.8xlarge
+     - 32
+     - 256
+     - 2 x 3,750 GB
+   * - i8g.12xlarge
+     - 48
+     - 384
+     - 3 x 3,750 GB
+   * - i8g.16xlarge
+     - 64
+     - 512
+     - 4 x 3,750 GB
+
+All i8g instances have the following specs:
+
+* Powered by AWS Graviton4 processors
+* 3rd generation AWS Nitro SSD storage
+* DDR5-5600 memory for improved throughput
+* Up to 100 Gbps of networking bandwidth and up to 60 Gbps of bandwidth to
+  Amazon Elastic Block Store (EBS)
+* Instance sizes offer up to 45 TB of total local NVMe instance storage
+
+See `Amazon EC2 I8g Instances <https://aws.amazon.com/ec2/instance-types/i8g/>`_ for details.
+
+.. _system-requirements-i8ge-instances:
+
+i8ge instances
+^^^^^^^^^^^^^^
+
+The following i8ge instances are supported.
+
+.. list-table::
+   :widths: 30 20 20 30
+   :header-rows: 1
+
+   * - Model
+     - vCPU
+     - Mem (GiB)
+     - Storage (GB)
+   * - i8ge.large
+     - 2
+     - 16
+     - 1 x 1,250 GB
+   * - i8ge.xlarge
+     - 4
+     - 32
+     - 1 x 2,500 GB
+   * - i8ge.2xlarge
+     - 8
+     - 64
+     - 2 x 2,500 GB
+   * - i8ge.3xlarge
+     - 12
+     - 96
+     - 1 x 7,500 GB
+   * - i8ge.6xlarge
+     - 24
+     - 192
+     - 2 x 7,500 GB
+   * - i8ge.12xlarge
+     - 48
+     - 384
+     - 4 x 7,500 GB
+   * - i8ge.18xlarge
+     - 72
+     - 576
+     - 6 x 7,500 GB
+
+All i8ge instances have the following specs:
+
+* Powered by AWS Graviton4 processors
+* 3rd generation AWS Nitro SSD storage
+* DDR5-5600 memory for improved throughput
+* Up to 300 Gbps of networking bandwidth and up to 60 Gbps of bandwidth to
+  Amazon Elastic Block Store (EBS)
+* Instance sizes offer up to 120 TB of total local NVMe instance storage
+
+See `Amazon EC2 I8ge Instances <https://aws.amazon.com/ec2/instance-types/i8ge/>`_ for details.
+
+
 Im4gn and Is4gen instances
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 ScyllaDB supports Arm-based Im4gn and Is4gen instances. See  `Amazon EC2 Im4gn and Is4gen instances <https://aws.amazon.com/ec2/instance-types/i4g/>`_ for specification details. 
--- a/docs/getting-started/index.rst
+++ b/docs/getting-started/index.rst
@@ -25,8 +25,7 @@ Getting Started
  :id: "getting-started"
  :class: my-panel

-  * `Install ScyllaDB (Binary Packages, Docker, or EC2) <https://www.scylladb.com/download/#core>`_ - Links to the ScyllaDB Download Center
-  
+  * :doc:`Install ScyllaDB </getting-started/install-scylla/index/>`
  * :doc:`Configure ScyllaDB </getting-started/system-configuration/>`
  * :doc:`Run ScyllaDB in a Shared Environment </getting-started/scylla-in-a-shared-environment>`
  * :doc:`Create a ScyllaDB Cluster - Single Data Center (DC) </operating-scylla/procedures/cluster-management/create-cluster/>`
@@ -60,4 +59,5 @@ Getting Started
  
  * `Build an IoT App with sensor simulator and a REST API <https://iot.scylladb.com/stable/>`_ - ScyllaDB Tutorial
  * `Implement CRUD operations with a TODO App <https://github.com/scylladb/scylla-cloud-getting-started/>`_ - ScyllaDB Cloud Tutorial
-  * `Build a machine learning (ML) feature store with ScyllaDB <https://feature-store.scylladb.com/stable/>`_ - ScyllaDB Cloud Tutorial  ` <>`_
+  * `Build a machine learning (ML) feature store with ScyllaDB <https://feature-store.scylladb.com/stable/>`_ - ScyllaDB Cloud Tutorial
+  
--- a/docs/getting-started/installation-common/disable-housekeeping.rst
+++ b/docs/getting-started/installation-common/disable-housekeeping.rst
@@ -3,8 +3,7 @@
 ScyllaDB Housekeeping and how to disable it
 ============================================

-It is always recommended to run the latest version of ScyllaDB. 
-The latest stable release version is always available from the `Download Center <https://www.scylladb.com/download/>`_.
+It is always recommended to run the latest stable version of ScyllaDB. 

 When you install ScyllaDB, it installs by default two services: **scylla-housekeeping-restart** and **scylla-housekeeping-daily**. These services check for the latest ScyllaDB version and prompt the user if they are using a version that is older than what is publicly available.
 Information about your ScyllaDB deployment, including the ScyllaDB version currently used, as well as unique user and server identifiers, are collected by a centralized service.
--- a/docs/kb/consistency.rst
+++ b/docs/kb/consistency.rst
@@ -83,7 +83,7 @@ Additional References

 * `Jepsen and ScyllaDB: Putting Consistency to the Test blog post <https://www.scylladb.com/2020/12/23/jepsen-and-scylla-putting-consistency-to-the-test/>`_ 
 * `Nauto: Achieving Consistency in an Eventually Consistent Environment blog post <https://www.scylladb.com/2020/02/20/nauto-achieving-consistency-in-an-eventually-consistent-environment/>`_ 
-* `Consistency Levels documentation <https://docs.scylladb.com/manual/stable/cql/consistency.html>`_ 
+* :doc:`Consistency Levels documentation </cql/consistency/>`
 * `High Availability lesson on ScyllaDB University <https://university.scylladb.com/courses/scylla-essentials-overview/lessons/high-availability/>`_ 
 * `Lightweight Transactions lesson on ScyllaDB University <https://university.scylladb.com/courses/data-modeling/lessons/lightweight-transactions/>`_ 
 * `Getting the Most out of Lightweight Transactions in ScyllaDB blog post <https://www.scylladb.com/2020/07/15/getting-the-most-out-of-lightweight-transactions-in-scylla/>`_ 
--- a/docs/kb/tombstones-flush.rst
+++ b/docs/kb/tombstones-flush.rst
@@ -38,7 +38,7 @@ Steps:

 4. Run compaction (this will remove big partitions with tombstones from specified table)

-.. note:: By default, major compaction runs on all the keyspaces and tables, so if we want to specyfy e.g. only one table, we should point at it using arguments: ``<keyspace>.<mytable>``. For more information, please refer to `this article <https://docs.scylladb.com/operating-scylla/nodetool-commands/compact/>`_.
+.. note:: By default, major compaction runs on all the keyspaces and tables, so if we want to specify e.g. only one table, we should point at it using arguments: ``<keyspace>.<mytable>``. For more information, please see :doc:`Nodetool compact </operating-scylla/nodetool-commands/compact/>`.

 .. code-block:: sh
   
--- a/docs/operating-scylla/admin-tools/index.rst
+++ b/docs/operating-scylla/admin-tools/index.rst
@@ -10,6 +10,7 @@ Admin Tools
   Admin REST API </operating-scylla/rest>
   Tracing </using-scylla/tracing>
   ScyllaDB SStable </operating-scylla/admin-tools/scylla-sstable/>
+   ScyllaDB SStable Script API </operating-scylla/admin-tools/scylla-sstable-script-api/>
   ScyllaDB Types </operating-scylla/admin-tools/scylla-types/>
   sstableloader
   cassandra-stress </operating-scylla/admin-tools/cassandra-stress/>
--- a/docs/operating-scylla/admin-tools/scylla-sstable-script-api.rst
+++ b/docs/operating-scylla/admin-tools/scylla-sstable-script-api.rst
@@ -0,0 +1,530 @@
+ScyllaDB SStable Script API
+---------------------------
+
+The script API consists of two parts:
+
+* `ScyllaDB Consume API <scylla-consume-api_>`_ - Hook methods implemented by the script to consume a :ref:`mutation fragment stream <scylla-sstable-sstable-content>`;
+* `ScyllaDB Lua API <scylla-script-lua-api_>`_ - types and methods exposed to the script to work with ScyllaDB types and values.
+
+.. _scylla-consume-api:
+
+ScyllaDB Consume API
+~~~~~~~~~~~~~~~~~~~~~~
+
+These methods represent the glue code between scylla-sstable's C++ code and the Lua script.
+Conceptually a script is an implementation of a consumer interface. The script has to implement only the methods it is interested in. Each method has a default implementation in the interface, which simply drops the respective :ref:`mutation fragment <scylla-sstable-sstable-content>`.
+For example, a script only interested in partitions can define only :ref:`consume_partition_start() <scylla-consume-partition-start-method>` and nothing else.
+Therefore a completely empty script is also valid, although not very useful.
+Below you will find the listing of the API methods.
+These methods (if provided by the script) will be called by the scylla-sstable runtime for the appropriate events and fragment types.
+
+.. _scylla-consume-stream-start-method:
+
+consume_stream_start(args)
+""""""""""""""""""""""""""
+
+* Part of the Consume API. Called on the very start of the stream.
+* Parameter is a Lua table containing command line arguments for the script, passed via ``--script-arg``.
+* Can be used to initialize global state.
+
+.. _scylla-consume-sstable-start-method:
+
+consume_sstable_start(sst)
+""""""""""""""""""""""""""
+
+* Part of the Consume API.
+* Called on the start of each SStable.
+* The parameter is of type `ScyllaDB.sstable <scylla-sstable-type_>`_. 
+* When SStables are merged (``--merge``), the parameter is ``nil``.
+
+Returns whether to stop. If ``true``, `consume_sstable_end() <scylla-consume-sstable-end-method_>`_ is called, skipping the content of the sstable (or that of the entire stream if ``--merge`` is used). If ``false``, consumption follows with the content of the sstable.
+
+.. _scylla-consume-partition-start-method:
+
+consume_partition_start(ps)
+"""""""""""""""""""""""""""
+
+* Part of the Consume API. Called on the start of each partition. 
+* The parameter is of type `ScyllaDB.partition_start <scylla-partition-start-type_>`_.
+* Returns whether to stop. If ``true``, `consume_partition_end() <scylla-consume-partition-end-method_>`_ is called, skipping the content of the partition. If ``false``, consumption follows with the content of the partition.
+
+consume_static_row(sr)
+""""""""""""""""""""""
+
+* Part of the Consume API. 
+* Called if the partition has a static row. 
+* The parameter is of type `ScyllaDB.static_row <scylla-static-row-type_>`_.
+* Returns whether to stop. If ``true``, `consume_partition_end() <scylla-consume-partition-end-method_>`_ is called, and the remaining content of the partition is skipped. If ``false``, consumption follows with the remaining content of the partition.
+
+consume_clustering_row(cr)
+""""""""""""""""""""""""""
+
+* Part of the Consume API. 
+* Called for each clustering row. 
+* The parameter is of type `ScyllaDB.clustering_row <scylla-clustering-row-type_>`_.
+* Returns whether to stop. If ``true``, `consume_partition_end() <scylla-consume-partition-end-method_>`_ is called, and the remaining content of the partition is skipped. If ``false``, consumption follows with the remaining content of the partition.
+
+consume_range_tombstone_change(crt)
+"""""""""""""""""""""""""""""""""""
+
+* Part of the Consume API.
+* Called for each range tombstone change. 
+* The parameter is of type `ScyllaDB.range_tombstone_change <scylla-range-tombstone-change-type_>`_.
+* Returns whether to stop. If ``true``, `consume_partition_end() <scylla-consume-partition-end-method_>`_ is called, and the remaining content of the partition is skipped. If ``false``, consumption follows with the remaining content of the partition.
+
+.. _scylla-consume-partition-end-method:
+
+consume_partition_end()
+"""""""""""""""""""""""
+
+* Part of the Consume API.
+* Called at the end of the partition.
+* Returns whether to stop. If ``true``, `consume_sstable_end() <scylla-consume-sstable-end-method_>`_ is called, and the remaining content of the SStable is skipped. If ``false``, consumption follows with the remaining content of the SStable.
+
+.. _scylla-consume-sstable-end-method:
+
+consume_sstable_end()
+"""""""""""""""""""""
+
+* Part of the Consume API.
+* Called at the end of the SStable.
+* Returns whether to stop. If ``true``, `consume_stream_end() <scylla-consume-stream-end-method_>`_ is called, and the remaining content of the stream is skipped. If ``false``, consumption follows with the remaining content of the stream.
+
+.. _scylla-consume-stream-end-method:
+
+consume_stream_end()
+""""""""""""""""""""
+
+* Part of the Consume API. 
+* Called at the very end of the stream.
+
+.. _scylla-script-lua-api:
+
+ScyllaDB Lua API
+~~~~~~~~~~~~~~~~
+
+In addition to the `ScyllaDB Consume API <scylla-consume-api_>`_, the Lua bindings expose various types and methods that allow you to work with ScyllaDB types and values.
+The listing uses the following terminology:
+
+* Attribute - a simple attribute accessible via ``obj.attribute_name``;
+* Method - a method operating on an instance of said type, invocable as ``obj:method()``;
+* Magic method - magic methods defined in the metatable which define behaviour of these objects w.r.t. `Lua operators and more <http://www.lua.org/manual/5.4/manual.html#2.4>`_;
+
+The format of an attribute description is the following:
+
+.. code-block:: none
+    :class: hide-copy-button
+
+    attribute_name (type) - description
+
+and that of a method:
+
+.. code-block:: none
+    :class: hide-copy-button
+
+    method_name(arg1_type, arg2_type...) (return_type) - description
+
+Magic methods have their signature defined by Lua and so that is not described here (these methods are not used directly anyway).
+
+.. _scylla-atomic-cell-type:
+
+ScyllaDB.atomic_cell
+""""""""""""""""""""
+
+Attributes:
+
+* timestamp (integer)
+* is_live (boolean) - is the cell live?
+* type (string) - one of: ``regular``, ``counter-update``, ``counter-shards``, ``frozen-collection`` or ``collection``.
+* has_ttl (boolean) - is the cell expiring?
+* ttl (integer) - time to live in seconds, ``nil`` if cell is not expiring.
+* expiry (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - time at which cell expires, ``nil`` if cell is not expiring.
+* deletion_time (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - time at which cell was deleted, ``nil`` unless cell is dead or expiring.
+* value:
+
+    - ``nil`` if cell is dead.
+    - appropriate Lua native type if type == ``regular``.
+    - integer if type == ``counter-update``.
+    - `ScyllaDB.counter_shards_value <scylla-counter-shards-value-type_>`_ if type == ``counter-shards``.
+
+A counter-shard table has the following keys:
+
+* id (string)
+* value (integer)
+* clock (integer)
+
+.. _scylla-clustering-key-type:
+
+ScyllaDB.clustering_key
+"""""""""""""""""""""""
+
+Attributes:
+
+* components (table) - the column values (`ScyllaDB.data_value <scylla-data-value-type_>`_) making up the composite clustering key.
+
+Methods:
+
+* to_hex - convert the key to its serialized format, encoded in hex.
+
+Magic methods:
+
+* __tostring - can be converted to string with tostring(), uses the built-in operator<< in ScyllaDB.
+
+.. _scylla-clustering-row-type:
+
+ScyllaDB.clustering_row
+"""""""""""""""""""""""
+
+Attributes:
+
+* key ($TYPE) - the clustering key's value as the appropriate Lua native type.
+* tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - row tombstone, ``nil`` if no tombstone.
+* shadowable_tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - shadowable tombstone of the row tombstone, ``nil`` if no tombstone.
+* marker (`ScyllaDB.row_marker <scylla-row-marker-type_>`_) - the row marker, ``nil`` if row doesn't have one.
+* cells (table) - table of cells, where keys are the column names and the values are either of type `ScyllaDB.atomic_cell <scylla-atomic-cell-type_>`_ or `ScyllaDB.collection <scylla-collection-type_>`_.
+
+See also:
+
+* `ScyllaDB.unserialize_clustering_key() <scylla-unserialize-clustering-key-method_>`_.
+
+.. _scylla-collection-type:
+
+ScyllaDB.collection
+"""""""""""""""""""
+
+Attributes:
+
+* type (string) - always ``collection`` for collection.
+* tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - ``nil`` if no tombstone.
+* cells (table) - the collection cells, each collection cell is a table, with a ``key`` and ``value`` attribute. The key entry is the key of the collection cell for actual collections (list, set and map) and is of type `ScyllaDB.data_value <scylla-data-value-type_>`_. For tuples and UDT this is just an empty string. The value entry is the value of the collection cell and is of type `ScyllaDB.atomic_cell <scylla-atomic-cell-type_>`_.
+
+.. _scylla-collection-cell-value-type:
+
+ScyllaDB.collection_cell_value
+""""""""""""""""""""""""""""""
+
+Attributes:
+
+* key (string) - collection cell key in human readable form.
+* value (`ScyllaDB.atomic_cell <scylla-atomic-cell-type_>`_) - collection cell value.
+
+.. _scylla-column-definition-type:
+
+ScyllaDB.column_definition
+""""""""""""""""""""""""""
+
+Attributes:
+
+* id (integer) - the id of the column.
+* name (string) - the name of the column.
+* kind (string) - the kind of the column, one of ``partition_key``, ``clustering_key``, ``static_column`` or ``regular_column``.
+
+.. _scylla-counter-shards-value-type:
+
+ScyllaDB.counter_shards_value
+"""""""""""""""""""""""""""""
+
+Attributes:
+
+* value (integer) - the total value of the counter (the sum of all the shards).
+* shards (table) - the shards making up this counter, a Lua list containing tables, representing shards, with the following key/values:
+
+    - id (string) - the shard's id (UUID).
+    - value (integer) - the shard's value.
+    - clock (integer) - the shard's logical clock.
+
+Magic methods:
+
+* __tostring - can be converted to string with tostring().
+
+.. _scylla-data-value-type:
+
+ScyllaDB.data_value
+"""""""""""""""""""
+
+Attributes:
+
+* value - the value represented as the appropriate Lua type
+
+Magic methods:
+
+* __tostring - can be converted to string with tostring().
+
+.. _scylla-gc-clock-time-point-type:
+
+ScyllaDB.gc_clock_time_point
+""""""""""""""""""""""""""""
+
+A time point belonging to the gc_clock, in UTC.
+
+Attributes:
+
+* year (integer) - [1900, +inf).
+* month (integer) - [1, 12].
+* day (integer) - [1, 31].
+* hour (integer) - [0, 23].
+* min (integer) - [0, 59].
+* sec (integer) - [0, 59].
+
+Magic methods:
+
+* __eq - can be equal compared.
+* __lt - can be less compared.
+* __le - can be less-or-equal compared.
+* __tostring - can be converted to string with tostring().
+
+See also:
+
+* `ScyllaDB.now() <scylla-now-method_>`_.
+* `ScyllaDB.time_point_from_string() <scylla-time-point-from-string-method_>`_.
+
+.. _scylla-json-writer-type:
+
+ScyllaDB.json_writer
+""""""""""""""""""""
+
+A JSON writer object, with both low-level and high-level APIs.
+The low-level API allows you to write custom JSON and it loosely follows the API of `rapidjson::Writer <https://rapidjson.org/classrapidjson_1_1_writer.html>`_ (upon which it is implemented).
+The high-level API is for writing :ref:`mutation fragments <scylla-sstable-sstable-content>` as JSON directly, using the built-in JSON conversion logic that is used by :ref:`dump-data <scylla-sstable-dump-data-operation>` operation.
+
+Low level API Methods:
+
+* null() - write a null json value.
+* bool(boolean) - write a bool json value.
+* int(integer) - write an integer json value.
+* double(number) - write a double json value.
+* string(string) - write a string json value.
+* start_object() - start a json object.
+* key(string) - write the key of a json object.
+* end_object() - write the end of a json object.
+* start_array() - write the start of a json array.
+* end_array() - write the end of a json array.
+
+High level API Methods:
+
+* start_stream() - start the stream, call at the very beginning.
+* start_sstable() - start an sstable.
+* start_partition() - start a partition.
+* static_row() - write a static row to the stream.
+* clustering_row() - write a clustering row to the stream.
+* range_tombstone_change() - write a range tombstone change to the stream.
+* end_partition() - end the current partition.
+* end_sstable() - end the current sstable.
+* end_stream() - end the stream, call at the very end.
+
+.. _scylla-new-json-writer-method:
+
+ScyllaDB.new_json_writer()
+""""""""""""""""""""""""""
+
+Create a `ScyllaDB.json_writer <scylla-json-writer-type_>`_ instance.
+
+.. _scylla-new-position-in-partition-method:
+
+ScyllaDB.new_position_in_partition()
+""""""""""""""""""""""""""""""""""""
+
+Creates a `ScyllaDB.position_in_partition <scylla-position-in-partition-type_>`_ instance.
+
+Arguments:
+
+* weight (integer) - the weight of the key.
+* key (`ScyllaDB.clustering_key <scylla-clustering-key-type_>`_) - the clustering key, optional.
+
+.. _scylla-new-ring-position-method:
+
+ScyllaDB.new_ring_position()
+""""""""""""""""""""""""""""
+
+Creates a `ScyllaDB.ring_position <scylla-ring-position-type_>`_ instance.
+
+Has several overloads:
+
+* ``ScyllaDB.new_ring_position(weight, key)``.
+* ``ScyllaDB.new_ring_position(weight, token)``.
+* ``ScyllaDB.new_ring_position(weight, key, token)``.
+
+Where:
+
+* weight (integer) - the weight of the key.
+* key (`ScyllaDB.partition_key <scylla-partition-key-type_>`_) - the partition key.
+* token (integer) - the token (of the key if a key is provided).
+
+.. _scylla-now-method:
+
+ScyllaDB.now()
+""""""""""""""
+
+Create a `ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_ instance, representing the current time.
+
+.. _scylla-partition-key-type:
+
+ScyllaDB.partition_key
+""""""""""""""""""""""
+
+Attributes:
+
+* components (table) - the column values (`ScyllaDB.data_value <scylla-data-value-type_>`_) making up the composite partition key.
+
+Methods:
+
+* to_hex - convert the key to its serialized format, encoded in hex.
+
+Magic methods:
+
+* __tostring - can be converted to string with tostring(), uses the built-in operator<< in ScyllaDB.
+
+See also:
+
+* :ref:`ScyllaDB.unserialize_partition_key() <scylla-unserialize-partition-key-method>`.
+* :ref:`ScyllaDB.token_of() <scylla-token-of-method>`.
+
+.. _scylla-partition-start-type:
+
+ScyllaDB.partition_start
+""""""""""""""""""""""""
+
+Attributes:
+
+* key - the partition key's value as the appropriate Lua native type.
+* token (integer) - the partition key's token.
+* tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - the partition tombstone, ``nil`` if no tombstone.
+
+.. _scylla-position-in-partition-type:
+
+ScyllaDB.position_in_partition
+""""""""""""""""""""""""""""""
+
+Currently used only for clustering positions.
+
+Attributes:
+
+* key (`ScyllaDB.clustering_key <scylla-clustering-key-type_>`_) - the clustering key, ``nil`` if the position in partition represents the min or max clustering positions.
+* weight (integer) - weight of the position, either -1 (before key), 0 (at key) or 1 (after key). If key attribute is ``nil``, the weight is never 0.
+
+Methods:
+
+* tri_cmp - compare this position in partition to another position in partition, returns -1 (``<``), 0 (``==``) or 1 (``>``).
+
+See also:
+
+* `ScyllaDB.new_position_in_partition() <scylla-new-position-in-partition-method_>`_.
+
+.. _scylla-range-tombstone-change-type:
+
+ScyllaDB.range_tombstone_change
+"""""""""""""""""""""""""""""""
+
+Attributes:
+
+* key ($TYPE) - the clustering key's value as the appropriate Lua native type.
+* key_weight (integer) - weight of the position, either -1 (before key), 0 (at key) or 1 (after key).
+* tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - tombstone, ``nil`` if no tombstone.
+
+.. _scylla-ring-position-type:
+
+ScyllaDB.ring_position
+""""""""""""""""""""""
+
+Attributes:
+
+* token (integer) - the token, ``nil`` if the ring position represents the min or max ring positions.
+* key (`ScyllaDB.partition_key <scylla-partition-key-type_>`_) - the partition key, ``nil`` if the ring position represents a position before/after a token.
+* weight (integer) - weight of the position, either -1 (before key/token), 0 (at key) or 1 (after key/token). If key attribute is ``nil``, the weight is never 0.
+
+Methods:
+
+* tri_cmp - compare this ring position to another ring position, returns -1 (``<``), 0 (``==``) or 1 (``>``).
+
+See also:
+
+* `ScyllaDB.new_ring_position() <scylla-new-ring-position-method_>`_.
+
+.. _scylla-row-marker-type:
+
+ScyllaDB.row_marker
+"""""""""""""""""""
+
+Attributes:
+
+* timestamp (integer).
+* is_live (boolean) - is the marker live?
+* has_ttl (boolean) - is the marker expiring?
+* ttl (integer) - time to live in seconds, ``nil`` if marker is not expiring.
+* expiry (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - time at which marker expires, ``nil`` if marker is not expiring.
+* deletion_time (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - time at which marker was deleted, ``nil`` unless marker is dead or expiring.
+
+.. _scylla-schema-type:
+
+ScyllaDB.schema
+"""""""""""""""
+
+Attributes:
+
+* partition_key_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of the key columns making up the partition key.
+* clustering_key_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of the key columns making up the clustering key.
+* static_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of the static columns.
+* regular_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of the regular columns.
+* all_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of all columns.
+
+.. _scylla-sstable-type:
+
+ScyllaDB.sstable
+""""""""""""""""
+
+Attributes:
+
+* filename (string) - the full path of the sstable Data component file;
+
+.. _scylla-static-row-type:
+
+ScyllaDB.static_row
+"""""""""""""""""""
+
+Attributes:
+
+* cells (table) - table of cells, where keys are the column names and the values are either of type `ScyllaDB.atomic_cell <scylla-atomic-cell-type_>`_ or `ScyllaDB.collection <scylla-collection-type_>`_.
+
+.. _scylla-time-point-from-string-method:
+
+ScyllaDB.time_point_from_string()
+"""""""""""""""""""""""""""""""""
+
+Create a `ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_ instance from the passed in string.
+The argument is a string, using the same format as the CQL timestamp type, see https://en.wikipedia.org/wiki/ISO_8601.
+
+.. _scylla-token-of-method:
+
+ScyllaDB.token_of()
+"""""""""""""""""""
+
+Compute and return the token (integer) for a `ScyllaDB.partition_key <scylla-partition-key-type_>`_.
+
+.. _scylla-tombstone-type:
+
+ScyllaDB.tombstone
+""""""""""""""""""
+
+Attributes:
+
+* timestamp (integer)
+* deletion_time (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - the point in time at which the tombstone was deleted.
+
+.. _scylla-unserialize-clustering-key-method:
+
+ScyllaDB.unserialize_clustering_key()
+"""""""""""""""""""""""""""""""""""""
+
+Create a `ScyllaDB.clustering_key <scylla-clustering-key-type_>`_ instance.
+
+Argument is a string representing serialized clustering key in hex format.
+
+.. _scylla-unserialize-partition-key-method:
+
+ScyllaDB.unserialize_partition_key()
+""""""""""""""""""""""""""""""""""""
+
+Create a `ScyllaDB.partition_key <scylla-partition-key-type_>`_ instance.
+
+Argument is a string representing serialized partition key in hex format.
+
--- a/docs/operating-scylla/admin-tools/scylla-sstable.rst
+++ b/docs/operating-scylla/admin-tools/scylla-sstable.rst
@@ -667,7 +667,7 @@ write
 Writes an SStable based on a description of the content.
 The description can be provided in two formats: ``CQL`` and ``JSON``.
 The input format can be selected with the ``--input-format`` flag. Default is ``cql``.
-In both cases the input is expected to be provided via the file whoose path is passed to ``--input-file``.
+In both cases the input is expected to be provided via the file whose path is passed to ``--input-file``.

 CQL input format
 ~~~~~~~~~~~~~~~~
@@ -858,527 +858,9 @@ Alternatively, you can provide each key-value pair via a separate ``--script-arg

    --script-arg $key1=$value1 --script-arg $key2=$value2

-Command line arguments will be received by the `consume_stream_start() <scylla-consume-stream-start-method_>`_ API method.
+Command line arguments will be received by the :ref:`consume_stream_start() <scylla-consume-stream-start-method>` API method.

-.. _scylla-consume-api:
-
-ScyllaDB Consume API
-~~~~~~~~~~~~~~~~~~~~~~
-
-These methods represent the glue code between scylla-sstable's C++ code and the Lua script.
-Conceptually a script is an implementation of a consumer interface. The script has to implement only the methods it is interested in. Each method has a default implementation in the interface, which simply drops the respective `mutation fragment <scylla-sstable-sstable-content_>`_.
-For example, a script only interested in partitions can define only `consume_partition_start() <scylla-consume-partition-start-method_>`_ and nothing else.
-Therefore a completely empty script is also valid, although not very useful.
-Below you will find the listing of the API methods.
-These methods (if provided by the script) will be called by the scylla-sstable runtime for the appropriate events and fragment types.
-
-.. _scylla-consume-stream-start-method:
-
-consume_stream_start(args)
-""""""""""""""""""""""""""
-
-* Part of the Consume API. Called on the very start of the stream.
-* Parameter is a Lua table containing command line arguments for the script, passed via ``--script-arg``.
-* Can be used to initialize global state.
-
-.. _scylla-consume-sstable-start-method:
-
-consume_sstable_start(sst)
-""""""""""""""""""""""""""
-
-* Part of the Consume API.
-* Called on the start of each stable. 
-* The parameter is of type `ScyllaDB.sstable <scylla-sstable-type_>`_. 
-* When SStables are merged (``--merge``), the parameter is ``nil``.
-
-Returns whether to stop. If ``true``, `consume_sstable_end() <scylla-consume-sstable-end-method_>`_ is called, skipping the content of the sstable (or that of the entire stream if ``--merge`` is used). If ``false``, consumption follows with the content of the sstable.
-
-.. _scylla-consume-partition-start-method:
-
-consume_partition_start(ps)
-"""""""""""""""""""""""""""
-
-* Part of the Consume API. Called on the start of each partition. 
-* The parameter is of type `ScyllaDB.partition_start <scylla-partition-start-type_>`_.
-* Returns whether to stop. If ``true``, `consume_partition_end() <scylla-consume-partition-end-method_>`_ is called, skipping the content of the partition. If ``false``, consumption follows with the content of the partition.
-
-consume_static_row(sr)
-""""""""""""""""""""""
-
-* Part of the Consume API. 
-* Called if the partition has a static row. 
-* The parameter is of type `ScyllaDB.static_row <scylla-static-row-type_>`_.
-* Returns whether to stop. If ``true``, `consume_partition_end() <scylla-consume-partition-end-method_>`_ is called, and the remaining content of the partition is skipped. If ``false``, consumption follows with the remaining content of the partition.
-
-consume_clustering_row(cr)
-""""""""""""""""""""""""""
-
-* Part of the Consume API. 
-* Called for each clustering row. 
-* The parameter is of type `ScyllaDB.clustering_row <scylla-clustering-row-type_>`_.
-* Returns whether to stop. If ``true``, `consume_partition_end() <scylla-consume-partition-end-method_>`_ is called, the remaining content of the partition is skipped. If ``false``, consumption follows with the remaining content of the partition.
-
-consume_range_tombstone_change(crt)
-"""""""""""""""""""""""""""""""""""
-
-* Part of the Consume API.
-* Called for each range tombstone change. 
-* The parameter is of type `ScyllaDB.range_tombstone_change <scylla-range-tombstone-change-type_>`_.
-* Returns whether to stop. If ``true``, `consume_partition_end() <scylla-consume-partition-end-method_>`_ is called, the remaining content of the partition is skipped. If ``false``, consumption follows with the remaining content of the partition.
-
-.. _scylla-consume-partition-end-method:
-
-consume_partition_end()
-"""""""""""""""""""""""
-
-* Part of the Consume API.
-* Called at the end of the partition.
-* Returns whether to stop. If ``true``, `consume_sstable_end() <scylla-consume-sstable-end-method_>`_ is called,  the remaining content of the SStable is skipped. If ``false``, consumption follows with the remaining content of the SStable.
-
-.. _scylla-consume-sstable-end-method:
-
-consume_sstable_end()
-"""""""""""""""""""""
-
-* Part of the Consume API.
-* Called at the end of the SStable.
-* Returns whether to stop. If true, `consume_stream_end() <scylla-consume-stream-end-method_>`_ is called, the remaining content of the stream is skipped. If false, consumption follows with the remaining content of the stream.
-
-.. _scylla-consume-stream-end-method:
-
-consume_stream_end()
-""""""""""""""""""""
-
-* Part of the Consume API. 
-* Called at the very end of the stream.
-
-ScyllaDB LUA API
-~~~~~~~~~~~~~~~~
-
-In addition to the `ScyllaDB Consume API <scylla-consume-api_>`_, the Lua bindings expose various types and methods that allow you to work with ScyllaDB types and values.
-The listing uses the following terminology:
-
-* Attribute - a simple attribute accessible via ``obj.attribute_name``;
-* Method - a method operating on an instance of said type, invocable as ``obj:method()``;
-* Magic method - magic methods defined in the metatable which define behaviour of these objects w.r.t. `Lua operators and more <http://www.lua.org/manual/5.4/manual.html#2.4>`_;
-
-The format of an attribute description is the following:
-
-.. code-block:: none
-    :class: hide-copy-button
-
-    attribute_name (type) - description
-
-and that of a method:
-
-.. code-block:: none
-    :class: hide-copy-button
-
-    method_name(arg1_type, arg2_type...) (return_type) - description
-
-Magic methods have their signature defined by Lua and so that is not described here (these methods are not used directly anyway).
-
-.. _scylla-atomic-cell-type:
-
-ScyllaDB.atomic_cell
-""""""""""""""""""""
-
-Attributes:
-
-* timestamp (integer)
-* is_live (boolean) - is the cell live?
-* type (string) - one of: ``regular``, ``counter-update``, ``counter-shards``, ``frozen-collection`` or ``collection``.
-* has_ttl (boolean) - is the cell expiring?
-* ttl (integer) - time to live in seconds, ``nil`` if cell is not expiring.
-* expiry (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - time at which cell expires, ``nil`` if cell is not expiring.
-* deletion_time (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - time at which cell was deleted, ``nil`` unless cell is dead or expiring.
-* value:
-
-    - ``nil`` if cell is dead.
-    - appropriate Lua native type if type == ``regular``.
-    - integer if type == ``counter-update``.
-    - `ScyllaDB.counter_shards_value <scylla-counter-shards-value-type_>`_ if type == ``counter-shards``.
-
-A counter-shard table has the following keys:
-
-* id (string)
-* value (integer)
-* clock (integer)
-
-.. _scylla-clustering-key-type:
-
-ScyllaDB.clustering_key
-"""""""""""""""""""""""
-
-Attributes:
-
-* components (table) - the column values (`ScyllaDB.data_value <scylla-data-value-type_>`_) making up the composite clustering key.
-
-Methods:
-
-* to_hex - convert the key to its serialized format, encoded in hex.
-
-Magic methods:
-
-* __tostring - can be converted to string with tostring(), uses the built-in operator<< in ScyllaDB.
-
-.. _scylla-clustering-row-type:
-
-ScyllaDB.clustering_row
-"""""""""""""""""""""""
-
-Attributes:
-
-* key ($TYPE) - the clustering key's value as the appropriate Lua native type.
-* tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - row tombstone, ``nil`` if no tombstone.
-* shadowable_tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - shadowable tombstone of the row tombstone, ``nil`` if no tombstone.
-* marker (`ScyllaDB.row_marker <scylla-row-marker-type_>`_) - the row marker, ``nil`` if row doesn't have one.
-* cells (table) - table of cells, where keys are the column names and the values are either of type `ScyllaDB.atomic_cell <scylla-atomic-cell-type_>`_ or `ScyllaDB.collection <scylla-collection-type_>`_.
-
-See also:
-
-* `ScyllaDB.unserialize_clustering_key() <scylla-unserialize-clustering-key-method_>`_.
-
-.. _scylla-collection-type:
-
-ScyllaDB.collection
-"""""""""""""""""""
-
-Attributes:
-
-* type (string) - always ``collection`` for collection.
-* tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - ``nil`` if no tombstone.
-* cells (table) - the collection cells, each collection cell is a table, with a ``key`` and ``value`` attribute. The key entry is the key of the collection cell for actual collections (list, set and map) and is of type `ScyllaDB.data-value <scylla-data-value-type_>`_. For tuples and UDT this is just an empty string. The value entry is the value of the collection cell and is of type `ScyllaDB.atomic-cell <scylla-atomic-cell-type_>`_. 
-
-.. _scylla-collection-cell-value-type:
-
-ScyllaDB.collection_cell_value
-""""""""""""""""""""""""""""""
-
-Attributes:
-
-* key (sstring) - collection cell key in human readable form.
-* value (`ScyllaDB.atomic_cell <scylla-atomic-cell-type_>`_) - collection cell value.
-
-.. _scylla-column-definition-type:
-
-ScyllaDB.column_definition
-""""""""""""""""""""""""""
-
-Attributes:
-
-* id (integer) - the id of the column.
-* name (string) - the name of the column.
-* kind (string) - the kind of the column, one of ``partition_key``, ``clustering_key``, ``static_column`` or ``regular_column``.
-
-.. _scylla-counter-shards-value-type:
-
-ScyllaDB.counter_shards_value
-"""""""""""""""""""""""""""""
-
-Attributes:
-
-* value (integer) - the total value of the counter (the sum of all the shards).
-* shards (table) - the shards making up this counter, a lua list containing tables, representing shards, with the following key/values:
-
-    - id (string) - the shard's id (UUID).
-    - value (integer) - the shard's value.
-    - clock (integer) - the shard's logical clock.
-
-Magic methods:
-
-* __tostring - can be converted to string with tostring().
-
-.. _scylla-data-value-type:
-
-ScyllaDB.data_value
-"""""""""""""""""""
-
-Attributes:
-
-* value - the value represented as the appropriate Lua type
-
-Magic methods:
-
-* __tostring - can be converted to string with tostring().
-
-.. _scylla-gc-clock-time-point-type:
-
-ScyllaDB.gc_clock_time_point
-""""""""""""""""""""""""""""
-
-A time point belonging to the gc_clock, in UTC.
-
-Attributes:
-
-* year (integer) - [1900, +inf).
-* month (integer) - [1, 12].
-* day (integer) - [1, 31].
-* hour (integer) - [0, 23].
-* min (integer) - [0, 59].
-* sec (integer) - [0, 59].
-
-Magic methods:
-
-* __eq - can be equal compared.
-* __lt - can be less compared.
-* __le - can be less-or-equal compared.
-* __tostring - can be converted to string with tostring().
-
-See also:
-
-* `ScyllaDB.now() <scylla-now-method_>`_.
-* `ScyllaDB.time_point_from_string() <scylla-time-point-from-string-method_>`_.
-
-.. _scylla-json-writer-type:
-
-ScyllaDB.json_writer
-""""""""""""""""""""
-
-A JSON writer object, with both low-level and high-level APIs.
-The low-level API allows you to write custom JSON and it loosely follows the API of `rapidjson::Writer <https://rapidjson.org/classrapidjson_1_1_writer.html>`_ (upon which it is implemented).
-The high-level API is for writing `mutation fragments <scylla-sstable-sstable-content_>`_ as JSON directly, using the built-in JSON conversion logic that is used by `dump-data <dump-data_>`_ operation.
-
-Low level API Methods:
-
-* null() - write a null json value.
-* bool(boolean) - write a bool json value.
-* int(integer) - write an integer json value.
-* double(number) - write a double json value.
-* string(string) - write a string json value.
-* start_object() - start a json object.
-* key(string) - write the key of a json object.
-* end_object() - write the end of a json object.
-* start_array() - write the start of a json array.
-* end_array() - write the end of a json array.
-
-High level API Methods:
-
-* start_stream() - start the stream, call at the very beginning.
-* start_sstable() - start an sstable.
-* start_partition() - start a partition.
-* static_row() - write a static row to the stream.
-* clustering_row() - write a clustering row to the stream.
-* range_tombstone_change() - write a range tombstone change to the stream.
-* end_partition() - end the current partition.
-* end_sstable() - end the current sstable.
-* end_stream() - end the stream, call at the very end.
-
-.. _scylla-new-json-writer-method:
-
-ScyllaDB.new_json_writer()
-""""""""""""""""""""""""""
-
-Create a `ScyllaDB.json_writer <scylla-json-writer-type_>`_ instance.
-
-.. _scylla-new-position-in-partition-method:
-
-ScyllaDB.new_position_in_partition()
-""""""""""""""""""""""""""""""""""""
-
-Creates a `ScyllaDB.position_in_partition <scylla-position-in-partition-type_>`_ instance.
-
-Arguments:
-
-* weight (integer) - the weight of the key.
-* key (`ScyllaDB.clustering_key <scylla-clustering-key-type_>`_) - the clustering key, optional.
-
-.. _scylla-new-ring-position-method:
-
-ScyllaDB.new_ring_position()
-""""""""""""""""""""""""""""
-
-Creates a `ScyllaDB.ring_position <scylla-ring-position-type_>`_ instance.
-
-Has several overloads:
-
-* ``ScyllaDB.new_ring_position(weight, key)``.
-* ``ScyllaDB.new_ring_position(weight, token)``.
-* ``ScyllaDB.new_ring_position(weight, key, token)``.
-
-Where:
-
-* weight (integer) - the weight of the key.
-* key (`ScyllaDB.partition_key <scylla-partition-key-type_>`_) - the partition key.
-* token (integer) - the token (of the key if a key is provided).
-
-.. _scylla-now-method:
-
-ScyllaDB.now()
-""""""""""""""
-
-Create a `ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_ instance, representing the current time.
-
-.. _scylla-partition-key-type:
-
-ScyllaDB.partition_key
-""""""""""""""""""""""
-
-Attributes:
-
-* components (table) - the column values (`ScyllaDB.data_value <scylla-data-value-type_>`_) making up the composite partition key.
-
-Methods:
-
-* to_hex - convert the key to its serialized format, encoded in hex.
-
-Magic methods:
-
-* __tostring - can be converted to string with tostring(), uses the built-in operator<< in ScyllaDB.
-
-See also:
-
-* :ref:`ScyllaDB.unserialize_partition_key() <scylla-unserialize-partition-key-method>`.
-* :ref:`ScyllaDB.token_of() <scylla-token-of-method>`.
-
-.. _scylla-partition-start-type:
-
-ScyllaDB.partition_start
-""""""""""""""""""""""""
-
-Attributes:
-
-* key - the partition key's value as the appropriate Lua native type.
-* token (integer) - the partition key's token.
-* tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - the partition tombstone, ``nil`` if no tombstone.
-
-.. _scylla-position-in-partition-type:
-
-ScyllaDB.position_in_partition
-""""""""""""""""""""""""""""""
-
-Currently used only for clustering positions.
-
-Attributes:
-
-* key (`ScyllaDB.clustering_key <scylla-clustering-key-type_>`_) - the clustering key, ``nil`` if the position in partition represents the min or max clustering positions.
-* weight (integer) - weight of the position, either -1 (before key), 0 (at key) or 1 (after key). If key attribute is ``nil``, the weight is never 0.
-
-Methods:
-
-* tri_cmp - compare this position in partition to another position in partition, returns -1 (``<``), 0 (``==``) or 1 (``>``).
-
-See also:
-
-* `ScyllaDB.new_position_in_partition() <scylla-new-position-in-partition-method_>`_.
-
-.. _scylla-range-tombstone-change-type:
-
-ScyllaDB.range_tombstone_change
-"""""""""""""""""""""""""""""""
-
-Attributes:
-
-* key ($TYPE) - the clustering key's value as the appropriate Lua native type.
-* key_weight (integer) - weight of the position, either -1 (before key), 0 (at key) or 1 (after key).
-* tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - tombstone, ``nil`` if no tombstone.
-
-.. _scylla-ring-position-type:
-
-ScyllaDB.ring_position
-""""""""""""""""""""""
-
-Attributes:
-
-* token (integer) - the token, ``nil`` if the ring position represents the min or max ring positions.
-* key (`ScyllaDB.partition_key <scylla-partition-key-type_>`_) - the partition key, ``nil`` if the ring position represents a position before/after a token.
-* weight (integer) - weight of the position, either -1 (before key/token), 0 (at key) or 1 (after key/token). If key attribute is ``nil``, the weight is never 0.
-
-Methods:
-
-* tri_cmp - compare this ring position to another ring position, returns -1 (``<``), 0 (``==``) or 1 (``>``).
-
-See also:
-
-* `ScyllaDB.new_ring_position() <scylla-new-ring-position-method_>`_.
-
-.. _scylla-row-marker-type:
-
-ScyllaDB.row_marker
-"""""""""""""""""""
-
-Attributes:
-
-* timestamp (integer).
-* is_live (boolean) - is the marker live?
-* has_ttl (boolean) - is the marker expiring?
-* ttl (integer) - time to live in seconds, ``nil`` if marker is not expiring.
-* expiry (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - time at which marker expires, ``nil`` if marker is not expiring.
-* deletion_time (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - time at which marker was deleted, ``nil`` unless marker is dead or expiring.
-
-.. _scylla-schema-type:
-
-ScyllaDB.schema
-"""""""""""""""
-
-Attributes:
-
-* partition_key_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of the key columns making up the partition key.
-* clustering_key_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of the key columns making up the clustering key.
-* static_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of the static columns.
-* regular_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of the regular columns.
-* all_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of all columns.
-
-.. _scylla-sstable-type:
-
-ScyllaDB.sstable
-""""""""""""""""
-
-Attributes:
-
-* filename (string) - the full path of the sstable Data component file;
-
-.. _scylla-static-row-type:
-
-ScyllaDB.static_row
-"""""""""""""""""""
-
-Attributes:
-
-* cells (table) - table of cells, where keys are the column names and the values are either of type `ScyllaDB.atomic_cell <scylla-atomic-cell-type_>`_ or `ScyllaDB.collection <scylla-collection-type_>`_.
-
-.. _scylla-time-point-from-string-method:
-
-ScyllaDB.time_point_from_string()
-"""""""""""""""""""""""""""""""""
-
-Create a `ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_ instance from the passed in string.
-Argument is string, using the same format as the CQL timestamp type, see https://en.wikipedia.org/wiki/ISO_8601.
-
-.. _scylla-token-of-method:
-
-ScyllaDB.token_of()
-"""""""""""""""""""
-
-Compute and return the token (integer) for a `ScyllaDB.partition_key <scylla-partition-key-type_>`_.
-
-.. _scylla-tombstone-type:
-
-ScyllaDB.tombstone
-""""""""""""""""""
-
-Attributes:
-
-* timestamp (integer)
-* deletion_time (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - the point in time at which the tombstone was deleted.
-
-.. _scylla-unserialize-clustering-key-method:
-
-ScyllaDB.unserialize_clustering_key()
-"""""""""""""""""""""""""""""""""""""
-
-Create a `ScyllaDB.clustering_key <scylla-clustering-key-type_>`_ instance.
-
-Argument is a string representing serialized clustering key in hex format.
-
-.. _scylla-unserialize-partition-key-method:
-
-ScyllaDB.unserialize_partition_key()
-""""""""""""""""""""""""""""""""""""
-
-Create a `ScyllaDB.partition_key <scylla-partition-key-type_>`_ instance.
-
-Argument is a string representing serialized partition key in hex format.
+See the :doc:`scripting API </operating-scylla/admin-tools/scylla-sstable-script-api/>` for more details.

 Examples
 ~~~~~~~~
@@ -1388,7 +870,7 @@ You can find example scripts at https://github.com/scylladb/scylladb/tree/master
 upgrade
 ^^^^^^^

-Offline, scylla-sstable variant of `nodetool upgradesstables </operating-scylla/nodetool-commands/upgradesstables/>`_.
+Offline, scylla-sstable variant of :doc:`nodetool upgradesstables </operating-scylla/nodetool-commands/upgradesstables/>`.
 Rewrites the input SSTable(s) to the latest supported version and latest schema version.
 The SSTable version to be used can be overridden with the ``--version`` flag, allowing for switching sstables between all versions supported for writing (some SSTable versions are supported for reading only).

@@ -1397,7 +879,7 @@ SSTables which are already on the designated version are skipped. To force rewri
 Output SSTables are written to the path provided by the ``--output-dir`` flag, or to the current directory if not specified.
 This directory is expected to exist and be empty. If not empty the tool will refuse to run. This can be overridden with the ``--unsafe-accept-nonempty-output-dir`` flag.

-It is strongly recommended to use the system schema tables as the schema source for this command, see the `schema options <scylla-sstable-schema_>`_ for more details.
+It is strongly recommended to use the system schema tables as the schema source for this command, see the :ref:`schema options <scylla-sstable-schema>` for more details.
 A schema which is good enough to read the SSTable and dump its content, may not be good enough to write its content back verbatim.
 An incomplete or incorrect schema can lead to the tool crashing or even data loss.

--- a/docs/operating-scylla/diagnostics.rst
+++ b/docs/operating-scylla/diagnostics.rst
@@ -11,7 +11,7 @@ Logs

 The most obvious source of information to find out more about why ScyllaDB is misbehaving.
 On production systems, ScyllaDB logs to syslog; thus logs can usually be viewed via ``journalctl``.
-See `Logging </getting-started/logging/>`_ on more information on how to access the logs.
+See :doc:`Logging </getting-started/logging/>` for more information on how to access the logs.


 ScyllaDB has the following log levels: ``trace``, ``debug``, ``info``, ``warn``, ``error``.
@@ -64,21 +64,21 @@ Tracing
 Tracing allows you to retrieve the internal log of events happening in the context of a single query.
 Therefore, tracing is only useful to diagnose problems related to a certain query and cannot be used to diagnose generic problems.
 That said, when it comes to diagnosing problems with a certain query, tracing is an excellent tool, allowing you to have a peek at what happens when that query is processed, including the timestamp of each event.
-For more details, see `Tracing </using-scylla/tracing>`_.
+For more details, see :doc:`Tracing </using-scylla/tracing>`.

 Nodetool
 --------

 Although ``nodetool`` is primarily an administration tool, it has various commands that retrieve and display useful information about the state of a certain ScyllaDB node.
 Look for commands with "stats", "info", "describe", "get", "histogram" in their names.
-For a comprehensive list of all available nodetool commands, see the `Nodetool Reference </operating-scylla/nodetool>`_.
+For a comprehensive list of all available nodetool commands, see the :doc:`Nodetool Reference </operating-scylla/nodetool>`.

 REST API
 --------

 ScyllaDB has a REST API which is a superset of all ``nodetool`` commands, in the sense that it is the backend serving all of them.
 It has many more endpoints, many of which can supply valuable information about the internal state of ScyllaDB.
-For more information, see `REST API </operating-scylla/rest>`_.
+For more information, see :doc:`REST API </operating-scylla/rest>`.

 System Tables
 -------------
@@ -102,9 +102,9 @@ Other Tools
 ScyllaDB has various other tools, mainly to work with sstables.
 If you are diagnosing a problem that is related to sstables misbehaving or being corrupt, you may find these useful:

-* `sstabledump </operating-scylla/admin-tools/sstabledump/>`_
-* `ScyllaDB SStable </operating-scylla/admin-tools/scylla-sstable/>`_
-* `ScyllaDB Types </operating-scylla/admin-tools/scylla-types/>`_
+* :doc:`sstabledump </operating-scylla/admin-tools/sstabledump/>`
+* :doc:`ScyllaDB SStable </operating-scylla/admin-tools/scylla-sstable/>`
+* :doc:`ScyllaDB Types </operating-scylla/admin-tools/scylla-types/>`

 GDB
 ---
--- a/docs/operating-scylla/nodetool-commands/cluster/repair.rst
+++ b/docs/operating-scylla/nodetool-commands/cluster/repair.rst
@@ -9,6 +9,8 @@ Running ``cluster repair`` on a **single node** synchronizes all data on all nod
 To synchronize all data in clusters that have both tablets-based and vnodes-based keyspaces, run :doc:`nodetool repair -pr </operating-scylla/nodetool-commands/repair/>` on **all**
 of the nodes in the cluster, and :doc:`nodetool cluster repair </operating-scylla/nodetool-commands/cluster/repair/>` on  **any** of the nodes in the cluster.

+.. warning:: :term:`Colocated Tables <Colocated Table>` cannot be synchronized using cluster repair in this version.
+
 To check if a keyspace enables tablets, use:

 .. code-block:: cql
--- a/docs/operating-scylla/security/auditing.rst
+++ b/docs/operating-scylla/security/auditing.rst
@@ -14,12 +14,13 @@ Enable ScyllaDB :doc:`Authentication </operating-scylla/security/authentication>
 Enabling Audit
 ---------------

-By default, auditing is **disabled**. Enabling auditing is controlled by the ``audit:`` parameter in the ``scylla.yaml`` file.
+By default, auditing is **enabled**. Enabling auditing is controlled by the ``audit:`` parameter in the ``scylla.yaml`` file.
 You can set the following options:

 * ``none`` - Audit is disabled (default).
 * ``table`` - Audit is enabled, and messages are stored in a Scylla table.
 * ``syslog`` - Audit is enabled, and messages are sent to Syslog.
+* ``syslog,table`` - Audit is enabled, and messages are stored in a Scylla table and sent to Syslog.

 Configuring any other value results in an error at Scylla startup.

--- a/docs/reference/glossary.rst
+++ b/docs/reference/glossary.rst
@@ -202,3 +202,7 @@ Glossary
       The name comes from two basic operations, multiply (MU) and rotate (R), used in its inner loop.
       The MurmurHash3 version used in ScyllaDB originated from `Apache Cassandra <https://commons.apache.org/proper/commons-codec/apidocs/org/apache/commons/codec/digest/MurmurHash3.html>`_, and is **not** identical to the `official MurmurHash3 calculation <https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/utils/MurmurHash.java#L31-L33>`_. More `here <https://github.com/russss/murmur3-cassandra>`_.

+    Colocated Table
+       An internal table of a special type in a :doc:`tablets </architecture/tablets>`-enabled keyspace that is colocated with another base table, meaning it always has the same tablet replicas as the base table.
+       Current types of colocated tables include CDC log tables, local indexes, and materialized views that have the same partition key as their base table.
+
--- a/gms/feature_service.hh
+++ b/gms/feature_service.hh
@@ -177,6 +177,8 @@ public:
    gms::feature driver_service_level { *this, "DRIVER_SERVICE_LEVEL"sv };
    gms::feature strongly_consistent_tables { *this, "STRONGLY_CONSISTENT_TABLES"sv };
    gms::feature client_routes { *this, "CLIENT_ROUTES"sv };
+    gms::feature removenode_with_left_token_ring { *this, "REMOVENODE_WITH_LEFT_TOKEN_RING"sv };
+    gms::feature size_based_load_balancing { *this, "SIZE_BASED_LOAD_BALANCING"sv };
 public:

    const std::unordered_map<sstring, std::reference_wrapper<feature>>& registered_features() const;
--- a/locator/load_sketch.hh
+++ b/locator/load_sketch.hh
@@ -8,76 +8,120 @@

 #pragma once

+#include "service/tablet_allocator_fwd.hh"
 #include "locator/topology.hh"
 #include "locator/token_metadata.hh"
 #include "locator/tablets.hh"
 #include "utils/stall_free.hh"
 #include "utils/extremum_tracking.hh"
 #include "utils/div_ceil.hh"
+#include "utils/pretty_printers.hh"

 #include <absl/container/btree_set.h>
+#include <seastar/util/defer.hh>

 #include <optional>
 #include <vector>

 namespace locator {

+struct disk_usage {
+    using load_type = double; // Disk usage factor (0.0 to 1.0)
+
+    uint64_t capacity = 0;
+    uint64_t used = 0;
+
+    load_type get_load() const {
+        if (capacity == 0) {
+            return 0;
+        }
+        return load_type(used) / capacity;
+    }
+};
+
 /// A data structure which keeps track of load associated with data ownership
 /// on shards of the whole cluster.
 class load_sketch {
    using shard_id = seastar::shard_id;
-    using load_type = ssize_t; // In tablets.
+    using load_type = disk_usage::load_type;

    struct shard_load {
        shard_id id;
-        load_type load;
+        disk_usage du;
+        size_t tablet_count = 0;
+
+        // Returns storage utilization for the shard
+        load_type get_load() const {
+            return du.get_load();
+        }
    };

    // Less-comparator which orders by load first (ascending), and then by shard id (ascending).
    struct shard_load_cmp {
-        bool operator()(const shard_load& a, const shard_load& b) const {
-            return a.load == b.load ? a.id < b.id : a.load < b.load;
+        bool operator()(const shard_load& shard_a, const shard_load& shard_b) const {
+            auto load_a = shard_a.get_load();
+            auto load_b = shard_b.get_load();
+            return load_a == load_b ? shard_a.id < shard_b.id : load_a < load_b;
        }
    };

    struct node_load {
+        std::vector<shard_load> _shards;
        absl::btree_set<shard_load, shard_load_cmp> _shards_by_load;
-        std::vector<load_type> _shards;
-        load_type _load = 0;
+        disk_usage _du;
+        size_t _tablet_count = 0;

-        node_load(size_t shard_count) : _shards(shard_count) {
+        // These can be false only when _load_stats != nullptr
+        bool _has_valid_disk_capacity = true;
+        bool _has_all_tablet_sizes = true;
+
+        node_load(size_t shard_count, uint64_t capacity)
+                : _shards(shard_count) {
+            _du.capacity = capacity;
+            uint64_t shard_capacity = capacity / shard_count;
            for (shard_id i = 0; i < shard_count; ++i) {
-                _shards[i] = 0;
+                _shards[i].id = i;
+                _shards[i].du.capacity = shard_capacity;
            }
        }

-        void update_shard_load(shard_id shard, load_type load_delta) {
-            _load += load_delta;
-
-            auto old_load = _shards[shard];
-            auto new_load = old_load + load_delta;
-            _shards_by_load.erase(shard_load{shard, old_load});
-            _shards[shard] = new_load;
-            _shards_by_load.insert(shard_load{shard, new_load});
+        void update_shard_load(shard_id shard, ssize_t tablet_count_delta, int64_t tablet_size_delta) {
+            _shards_by_load.erase(_shards[shard]);
+            _shards[shard].tablet_count += tablet_count_delta;
+            _shards[shard].du.used += tablet_size_delta;
+            _shards_by_load.insert(_shards[shard]);
+            _du.used += tablet_size_delta;
+            _tablet_count += tablet_count_delta;
        }

        void populate_shards_by_load() {
            _shards_by_load.clear();
+            _shards_by_load.insert(_shards.begin(), _shards.end());
+        }
+
+        void normalize(load_type factor) {
+            _du.used /= factor;
            for (shard_id i = 0; i < _shards.size(); ++i) {
-                _shards_by_load.insert(shard_load{i, _shards[i]});
+                _shards[i].du.used /= factor;
            }
+            populate_shards_by_load();
        }

-        load_type& load() noexcept {
-            return _load;
-        }
-
-        const load_type& load() const noexcept {
-            return _load;
+        // Returns storage utilization for the node
+        load_type get_load() const noexcept {
+            return _du.get_load();
        }
    };
    std::unordered_map<host_id, node_load> _nodes;
    token_metadata_ptr _tm;
+    load_stats_ptr _load_stats;
+    uint64_t _default_tablet_size = service::default_target_tablet_size;
+    uint64_t _minimal_tablet_size = 0;
+
+    // When set to true, it will use gross disk capacity instead of effective_capacity and
+    // treat all tablets as having the same size: _default_tablet_size
+    bool _force_capacity_based_load = false;
+
 private:
    tablet_replica_set get_replicas_for_tablet_load(const tablet_info& ti, const tablet_transition_info* trinfo) const {
        // We reflect migrations in the load as if they already happened,
@@ -85,10 +129,34 @@ private:
        return trinfo ? trinfo->next : ti.replicas;
    }

-    future<> populate_table(const tablet_map& tmap, std::optional<host_id> host, std::optional<sstring> only_dc) {
+    std::optional<uint64_t> get_disk_capacity_for_node(host_id node) {
+        if (_load_stats) {
+            if (_load_stats->tablet_stats.contains(node) && !_force_capacity_based_load) {
+                return _load_stats->tablet_stats.at(node).effective_capacity;
+            } else if (_load_stats->capacity.contains(node)) {
+                return _load_stats->capacity.at(node);
+            }
+        }
+        return std::nullopt;
+    }
+
+    std::optional<uint64_t> get_tablet_size(host_id host, const range_based_tablet_id& rb_tid, const tablet_info& ti, const tablet_transition_info* trinfo) const {
+        if (_force_capacity_based_load) {
+            return _default_tablet_size;
+        }
+
+        std::optional<uint64_t> tablet_size_opt;
+        if (_load_stats) {
+            tablet_size_opt = _load_stats->get_tablet_size_in_transition(host, rb_tid, ti, trinfo);
+        }
+        return tablet_size_opt;
+    }
+
+    future<> populate_table(table_id table, const tablet_map& tmap, std::optional<host_id> host, std::optional<sstring> only_dc) {
        const topology& topo = _tm->get_topology();
        co_await tmap.for_each_tablet([&] (tablet_id tid, const tablet_info& ti) -> future<> {
-            for (auto&& replica : get_replicas_for_tablet_load(ti, tmap.get_tablet_transition_info(tid))) {
+            auto trinfo = tmap.get_tablet_transition_info(tid);
+            for (auto&& replica : get_replicas_for_tablet_load(ti, trinfo)) {
                if (host && *host != replica.host) {
                    continue;
                }
@@ -97,28 +165,50 @@ private:
                    if (only_dc && node->dc_rack().dc != *only_dc) {
                        continue;
                    }
-                    _nodes.emplace(replica.host, node_load{node->get_shard_count()});
+                    auto disk_capacity_opt = get_disk_capacity_for_node(replica.host);
+                    auto [i, _] = _nodes.emplace(replica.host, node_load{node->get_shard_count(), disk_capacity_opt.value_or(_default_tablet_size)});
+                    if (!disk_capacity_opt && _load_stats) {
+                        i->second._has_valid_disk_capacity = false;
+                    }
                }
                node_load& n = _nodes.at(replica.host);
                if (replica.shard < n._shards.size()) {
-                    n.load() += 1;
-                    n._shards[replica.shard] += 1;
+                    const range_based_tablet_id rb_tid {table, tmap.get_token_range(tid)};
+                    auto tablet_size_opt = get_tablet_size(replica.host, rb_tid, ti, trinfo);
+                    if (!tablet_size_opt && _load_stats) {
+                        n._has_all_tablet_sizes = false;
+                    }
+                    const uint64_t tablet_size = std::max(tablet_size_opt.value_or(_default_tablet_size), _minimal_tablet_size);
+                    n._du.used += tablet_size;
+                    n._tablet_count++;
+                    n._shards[replica.shard].du.used += tablet_size;
+                    n._shards[replica.shard].tablet_count++;
                    // Note: as an optimization, _shards_by_load is populated later in populate_shards_by_load()
                }
            }
            return make_ready_future<>();
        });
    }
+
+    void throw_on_incomplete_data(host_id host, bool only_check_disk_capacity = false) const {
+        if (!has_complete_data(host, only_check_disk_capacity)) {
+            throw std::runtime_error(format("Can't provide accurate load computation with incomplete load_stats for host: {}", host));
+        }
+    }
 public:
-    load_sketch(token_metadata_ptr tm)
-        : _tm(std::move(tm)) {
+    load_sketch(token_metadata_ptr tm, load_stats_ptr load_stats = {}, uint64_t default_tablet_size = service::default_target_tablet_size)
+        : _tm(std::move(tm))
+        , _load_stats(std::move(load_stats))
+        , _default_tablet_size(default_tablet_size) {
+    }
+
+    future<> clear() {
+        return utils::clear_gently(_nodes);
    }

    future<> populate(std::optional<host_id> host = std::nullopt,
                      std::optional<table_id> only_table = std::nullopt,
                      std::optional<sstring> only_dc = std::nullopt) {
-        co_await utils::clear_gently(_nodes);
-
        if (host) {
            ensure_node(*host);
        } else {
@@ -132,11 +222,11 @@ public:
        if (only_table) {
            if (_tm->tablets().has_tablet_map(*only_table)) {
                auto& tmap = _tm->tablets().get_tablet_map(*only_table);
-                co_await populate_table(tmap, host, only_dc);
+                co_await populate_table(*only_table, tmap, host, only_dc);
            }
        } else {
            for (const auto& [table, tmap] : _tm->tablets().all_tables_ungrouped()) {
-                co_await populate_table(*tmap, host, only_dc);
+                co_await populate_table(table, *tmap, host, only_dc);
            }
        }

@@ -149,12 +239,52 @@ public:
        return populate(std::nullopt, std::nullopt, dc);
    }

-    shard_id next_shard(host_id node) {
+
+    future<> populate_with_normalized_load() {
+        co_await populate();
+
+        min_max_tracker<load_type> minmax;
+        minmax.update(1);
+        for (auto&& id : _nodes | std::views::keys) {
+            minmax.update(get_shard_minmax(id).max());
+        }
+
+        for (auto&& n : _nodes | std::views::values) {
+            n.normalize(minmax.max());
+        }
+    }
+
+    shard_id next_shard(host_id node, size_t tablet_count, uint64_t tablet_size_sum) {
        auto shard = get_least_loaded_shard(node);
-        pick(node, shard);
+        pick(node, shard, tablet_count, tablet_size_sum);
        return shard;
    }

+    bool has_complete_data(host_id node, bool only_check_disk_capacity = false) const {
+        if (!_nodes.contains(node)) {
+            return false;
+        }
+        auto& n = _nodes.at(node);
+        return n._has_valid_disk_capacity && (only_check_disk_capacity || n._has_all_tablet_sizes);
+    }
+
+    void ignore_incomplete_data(host_id node) {
+        if (!_nodes.contains(node)) {
+            return;
+        }
+        auto& n = _nodes.at(node);
+        n._has_valid_disk_capacity = true;
+        n._has_all_tablet_sizes = true;
+    }
+
+    void set_minimal_tablet_size(uint64_t min_ts) {
+        _minimal_tablet_size = min_ts;
+    }
+
+    void set_force_capacity_based_load(bool force_capacity_based_load) {
+        _force_capacity_based_load = force_capacity_based_load;
+    }
+
    node_load& ensure_node(host_id node) {
        if (!_nodes.contains(node)) {
            const topology& topo = _tm->get_topology();
@@ -162,55 +292,69 @@ public:
            if (shard_count == 0) {
                throw std::runtime_error(format("Shard count not known for node {}", node));
            }
-            auto [i, _] = _nodes.emplace(node, node_load{shard_count});
+            auto disk_capacity_opt = get_disk_capacity_for_node(node);
+            auto [i, _] = _nodes.emplace(node, node_load{shard_count, disk_capacity_opt.value_or(_default_tablet_size)});
            i->second.populate_shards_by_load();
+            if (!disk_capacity_opt && _load_stats) {
+                i->second._has_valid_disk_capacity = false;
+            }
        }
        return _nodes.at(node);
    }

    shard_id get_least_loaded_shard(host_id node) {
        auto& n = ensure_node(node);
-        const shard_load& s = *n._shards_by_load.begin();
-        return s.id;
+        throw_on_incomplete_data(node);
+        return n._shards_by_load.begin()->id;
    }

    shard_id get_most_loaded_shard(host_id node) {
        auto& n = ensure_node(node);
-        const shard_load& s = *std::prev(n._shards_by_load.end());
-        return s.id;
+        throw_on_incomplete_data(node);
+        return std::prev(n._shards_by_load.end())->id;
    }

-    void unload(host_id node, shard_id shard) {
+    void unload(host_id node, shard_id shard, size_t tablet_count_delta, uint64_t tablet_sizes_delta) {
+        throw_on_incomplete_data(node);
        auto& n = _nodes.at(node);
-        n.update_shard_load(shard, -1);
+        n.update_shard_load(shard, -ssize_t(tablet_count_delta), -int64_t(tablet_sizes_delta));
    }

-    void pick(host_id node, shard_id shard) {
+    void pick(host_id node, shard_id shard, size_t tablet_count_delta, uint64_t tablet_sizes_delta) {
+        throw_on_incomplete_data(node);
        auto& n = _nodes.at(node);
-        n.update_shard_load(shard, 1);
+        n.update_shard_load(shard, tablet_count_delta, tablet_sizes_delta);
    }

    load_type get_load(host_id node) const {
        if (!_nodes.contains(node)) {
            return 0;
        }
-        return _nodes.at(node).load();
+        throw_on_incomplete_data(node);
+        return _nodes.at(node).get_load();
    }

-    load_type total_load() const {
-        load_type total = 0;
-        for (auto&& n : _nodes) {
-            total += n.second.load();
+    uint64_t get_tablet_count(host_id node) const {
+        if (!_nodes.contains(node)) {
+            return 0;
        }
-        return total;
+        return _nodes.at(node)._tablet_count;
    }

-    load_type get_avg_shard_load(host_id node) const {
+    uint64_t get_avg_tablet_count(host_id node) const {
        if (!_nodes.contains(node)) {
            return 0;
        }
        auto& n = _nodes.at(node);
-        return div_ceil(n.load(), n._shards.size());
+        return div_ceil(n._tablet_count, n._shards.size());
+    }
+
+    double get_real_avg_tablet_count(host_id node) const {
+        if (!_nodes.contains(node)) {
+            return 0;
+        }
+        auto& n = _nodes.at(node);
+        return double(n._tablet_count) / n._shards.size();
    }

    double get_real_avg_shard_load(host_id node) const {
@@ -218,7 +362,23 @@ public:
            return 0;
        }
        auto& n = _nodes.at(node);
-        return double(n.load()) / n._shards.size();
+        return double(n.get_load()) / n._shards.size();
+    }
+
+    // Returns the node's _du.used, or 0 for an untracked node; tracked nodes go through throw_on_incomplete_data() first.
+    uint64_t get_disk_used(host_id node) const {
+        const auto it = _nodes.find(node);
+        if (it == _nodes.end()) { return 0; }
+        throw_on_incomplete_data(node);
+        return it->second._du.used;
+    }
+
+    uint64_t get_capacity(host_id node) const {
+        if (!_nodes.contains(node)) {
+            return 0;
+        }
+        throw_on_incomplete_data(node, true);
+        return _nodes.at(node)._du.capacity;
    }

    shard_id get_shard_count(host_id node) const {
@@ -231,17 +391,18 @@ public:
    // Returns the difference in tablet count between highest-loaded shard and lowest-loaded shard.
    // Returns 0 when shards are perfectly balanced.
    // Returns 1 when shards are imbalanced, but it's not possible to balance them.
-    load_type get_shard_imbalance(host_id node) const {
-        auto minmax = get_shard_minmax(node);
-        return minmax.max() - minmax.max();
+    size_t get_shard_tablet_count_imbalance(host_id node) const {
+        auto minmax = get_shard_minmax_tablet_count(node);
+        return minmax.max() - minmax.min();
    }

    min_max_tracker<load_type> get_shard_minmax(host_id node) const {
        min_max_tracker<load_type> minmax;
        if (_nodes.contains(node)) {
+            throw_on_incomplete_data(node);
            auto& n = _nodes.at(node);
-            for (auto&& load: n._shards) {
-                minmax.update(load);
+            for (auto&& shard: n._shards) {
+                minmax.update(shard.get_load());
            }
        } else {
            minmax.update(0);
@@ -249,18 +410,44 @@ public:
        return minmax;
    }

-    // Returns nullopt if capacity is not known.
-    std::optional<double> get_allocated_utilization(host_id node, const locator::load_stats& stats, uint64_t target_tablet_size) const {
-        if (!_nodes.contains(node)) {
+    // Min/max tablet_count over the node's shards; for an unknown node the tracker is updated with 0 only.
+    min_max_tracker<size_t> get_shard_minmax_tablet_count(host_id node) const {
+        min_max_tracker<size_t> counts;
+        if (const auto it = _nodes.find(node); it != _nodes.end()) {
+            for (const auto& s : it->second._shards) {
+                counts.update(s.tablet_count);
+            }
+        } else {
+            counts.update(0);
+        }
+        return counts;
+    }
+
+    // Returns nullopt if node is not known, or we don't have valid disk capacity.
+    std::optional<load_type> get_allocated_utilization(host_id node) const {
+        if (!_nodes.contains(node) || !has_complete_data(node, true)) {
            return std::nullopt;
        }
-        auto& n = _nodes.at(node);
-        if (!stats.capacity.contains(node)) {
+        const node_load& n = _nodes.at(node);
+        return load_type(n._tablet_count * _default_tablet_size) / n._du.capacity;
+    }
+
+    // Returns nullopt if node is not known, or we don't have tablet sizes or valid disk capacity.
+    std::optional<load_type> get_storage_utilization(host_id node) const {
+        if (!_nodes.contains(node) || !has_complete_data(node)) {
            return std::nullopt;
        }
-        auto capacity = stats.capacity.at(node);
-        return capacity > 0 ? double(n.load() * target_tablet_size) / capacity : 0;
+        return _nodes.at(node).get_load();
    }
 };

 } // namespace locator
+
+template<>
+struct fmt::formatter<locator::disk_usage> : fmt::formatter<string_view> {
+    template <typename FormatContext>
+    auto format(const locator::disk_usage& du, FormatContext& ctx) const {
+        return fmt::format_to(ctx.out(), "cap: {:i} used: {:i} load: {}",
+                              utils::pretty_printed_data_size(du.capacity), utils::pretty_printed_data_size(du.used), du.get_load());
+    }
+};
--- a/locator/network_topology_strategy.cc
+++ b/locator/network_topology_strategy.cc
@@ -317,6 +317,7 @@ future<tablet_map> network_topology_strategy::allocate_tablets_for_new_table(sch
 future<tablet_map> network_topology_strategy::reallocate_tablets(schema_ptr s, token_metadata_ptr tm, tablet_map tablets) const {
    natural_endpoints_tracker::check_enough_endpoints(*tm, _dc_rep_factor);
    load_sketch load(tm);
+    co_await load.populate_with_normalized_load();
    co_await load.populate(std::nullopt, s->id());

    tablet_logger.debug("Allocating tablets for {}.{} ({}): dc_rep_factor={} tablet_count={}", s->ks_name(), s->cf_name(), s->id(), _dc_rep_factor, tablets.tablet_count());
@@ -403,7 +404,7 @@ tablet_replica_set network_topology_strategy::drop_tablets_in_racks(schema_ptr s
        if (node.dc_rack().dc == dc && is_rack_to_drop(node.dc_rack().rack)) {
            tablet_logger.debug("drop_tablets_in_rack {}.{} tablet_id={} dc={} rack={} removing replica: {}",
                            s->ks_name(), s->cf_name(), tb, node.dc_rack().dc, node.dc_rack().rack, tr);
-            load.unload(tr.host, tr.shard);
+            load.unload(tr.host, tr.shard, 1, service::default_target_tablet_size);
        } else {
            filtered.emplace_back(tr);
        }
@@ -445,7 +446,7 @@ tablet_replica_set network_topology_strategy::add_tablets_in_racks(schema_ptr s,
                    fmt::format("No candidate node in rack {}.{} to allocate tablet replica", dc, rack));
        }

-        auto new_replica = tablet_replica{min_node, load.next_shard(min_node)};
+        auto new_replica = tablet_replica{min_node, load.next_shard(min_node, 1, service::default_target_tablet_size)};
        new_replicas.push_back(new_replica);

        tablet_logger.trace("add_tablet_in_rack {}.{} tablet_id={} dc={} rack={} load={} new_replica={}",
@@ -468,10 +469,10 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
    // Track all nodes with no replicas on them for this tablet, per rack.
    struct node_load {
        locator::host_id host;
-        uint64_t load;
+        double load;
    };
    // for sorting in descending load order
-    // (in terms of number of replicas)
+    // (in terms of load)
    auto node_load_cmp = [] (const node_load& a, const node_load& b) {
        return a.load > b.load;
    };
@@ -484,7 +485,7 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
    candidates_list existing_racks;

    // We use this list to start allocating from an
-    // unpoplated rack.
+    // unpopulated rack.
    candidates_list new_racks;

    for (const auto& [rack, nodes] : all_dc_racks) {
@@ -502,7 +503,7 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
            const auto& host_id = node.get().host_id();
            if (!existing.contains(host_id)) {
                // FIXME: https://github.com/scylladb/scylladb/issues/26366
-                candidate.nodes.emplace_back(host_id, load.get_avg_shard_load(host_id));
+                candidate.nodes.emplace_back(host_id, load.get_real_avg_shard_load(host_id));
            }
        }
        if (candidate.nodes.empty()) {
@@ -552,7 +553,7 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
                            s->ks_name(), s->cf_name(), tb.id, rack, dc, dc_node_count, dc_rf));
        }
        auto host_id = nodes.back().host;
-        auto replica = tablet_replica{host_id, load.next_shard(host_id)};
+        auto replica = tablet_replica{host_id, load.next_shard(host_id, 1, service::default_target_tablet_size)};
        const auto& node = tm->get_topology().get_node(host_id);
        auto inserted = replicas_per_rack[node.dc_rack().rack].insert(host_id).second;
        // Sanity check that a node is not used more than once
@@ -614,7 +615,7 @@ tablet_replica_set network_topology_strategy::drop_tablets_in_dc(schema_ptr s, c
        if (topo.get_node(tr.host).dc_rack().dc != dc || ++nodes_in_dc <= dc_rf) {
            filtered.emplace_back(tr);
        } else {
-            load.unload(tr.host, tr.shard);
+            load.unload(tr.host, tr.shard, 1, service::default_target_tablet_size);
        }
    }
    return filtered;
--- a/locator/tablets.cc
+++ b/locator/tablets.cc
@@ -927,6 +927,56 @@ std::optional<uint64_t> load_stats::get_tablet_size(host_id host, const range_ba
    return std::nullopt;
 }

+std::optional<uint64_t> load_stats::get_tablet_size_in_transition(host_id host, const range_based_tablet_id& rb_tid, const tablet_info& ti, const tablet_transition_info* trinfo) const {
+    // The size recorded for the requested host always wins.
+    if (auto own_size = get_tablet_size(host, rb_tid)) {
+        return own_size;
+    }
+    // No transition info means there is no other replica to fall back on.
+    if (!trinfo) {
+        return std::nullopt;
+    }
+    std::optional<uint64_t> fallback_size;
+    switch (trinfo->transition) {
+        case tablet_transition_kind::migration: {
+            // Only the pending replica's host borrows the size from the leaving replica.
+            if (!(trinfo->pending_replica && trinfo->pending_replica->host == host)) {
+                break;
+            }
+            const auto leaving = get_leaving_replica(ti, *trinfo);
+            if (!leaving) {
+                on_internal_error_noexcept(tablet_logger, ::format("No leaving replica for tablet migration in table {}. ti.replicas: {} trinfo->next: {}",
+                                        rb_tid.table, ti.replicas, trinfo->next));
+                break;
+            }
+            fallback_size = get_tablet_size(leaving->host, rb_tid);
+            break;
+        }
+        case tablet_transition_kind::rebuild:
+            [[fallthrough]];
+        case tablet_transition_kind::rebuild_v2: {
+            // Average the sizes reported by whichever replicas have one.
+            uint64_t total_size = 0;
+            size_t reporting_replicas = 0;
+            for (const auto& r : ti.replicas) {
+                if (const auto reported = get_tablet_size(r.host, rb_tid)) {
+                    total_size += *reported;
+                    ++reporting_replicas;
+                }
+            }
+            if (reporting_replicas != 0) {
+                fallback_size = total_size / reporting_replicas;
+            }
+            break;
+        }
+        case tablet_transition_kind::intranode_migration:
+            [[fallthrough]];
+        case tablet_transition_kind::repair:
+            break;
+    }
+    return fallback_size;
+}
+
 lw_shared_ptr<load_stats> load_stats::reconcile_tablets_resize(const std::unordered_set<table_id>& tables, const token_metadata& old_tm, const token_metadata& new_tm) const {
    lw_shared_ptr<load_stats> reconciled_stats { make_lw_shared<load_stats>(*this) };
    load_stats& new_stats = *reconciled_stats;
--- a/locator/tablets.hh
+++ b/locator/tablets.hh
@@ -489,6 +489,12 @@ struct load_stats {

    std::optional<uint64_t> get_tablet_size(host_id host, const range_based_tablet_id& rb_tid) const;

+    // Returns the tablet size on the given host. If the tablet size is not found on the host, we will search for it on
+    // other hosts based on the tablet transition info:
+    // - if the tablet is in migration, and the given host is pending, the tablet size will be searched on the leaving replica
+    // - if the tablet is being rebuilt, we will return the average tablet size of all the replicas
+    std::optional<uint64_t> get_tablet_size_in_transition(host_id host, const range_based_tablet_id& rb_tid, const tablet_info& ti, const tablet_transition_info* trinfo) const;
+
    // Modifies the tablet sizes in load_stats for the given table after a split or merge. The old_tm argument has
    // to contain the token_metadata pre-resize. The function returns load_stats with tablet token ranges
    // corresponding to the post-resize tablet_map.
--- a/main.cc
+++ b/main.cc
@@ -959,7 +959,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
                auto ip = utils::resolve(cfg->prometheus_address || cfg->listen_address, family, preferred).get();

                prometheus::config pctx;
-                pctx.metric_help = "Scylla server statistics";
                pctx.prefix = cfg->prometheus_prefix();
                pctx.allow_protobuf = cfg->prometheus_allow_protobuf();
                prometheus::start(prometheus_server, pctx).get();
@@ -1791,7 +1790,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            });

            checkpoint(stop_signal, "starting auth cache");
-            auth_cache.start(std::ref(qp)).get();
+            auth_cache.start(std::ref(qp), std::ref(stop_signal.as_sharded_abort_source())).get();
            auto stop_auth_cache = defer_verbose_shutdown("auth cache", [&] {
                auth_cache.stop().get();
            });
@@ -2527,7 +2526,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl

            api::set_server_service_levels(ctx, cql_server_ctl, qp).get();

-            alternator::controller alternator_ctl(gossiper, proxy, mm, sys_dist_ks, cdc_generation_service, service_memory_limiter, auth_service, sl_controller, *cfg, dbcfg.statement_scheduling_group);
+            alternator::controller alternator_ctl(gossiper, proxy, ss, mm, sys_dist_ks, cdc_generation_service, service_memory_limiter, auth_service, sl_controller, *cfg, dbcfg.statement_scheduling_group);

            // Register at_exit last, so that storage_service::drain_on_shutdown will be called first
            auto do_drain = defer_verbose_shutdown("local storage", [&ss] {
--- a/message/messaging_service.cc
+++ b/message/messaging_service.cc
@@ -1292,7 +1292,7 @@ future<std::tuple<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation
 messaging_service::make_sink_and_source_for_stream_mutation_fragments(table_schema_version schema_id, streaming::plan_id plan_id, table_id cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, service::session_id session, locator::host_id id) {
    using value_type = std::tuple<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>>;
    if (is_shutting_down()) {
-        return make_exception_future<value_type>(rpc::closed_error());
+        return make_exception_future<value_type>(rpc::closed_error("local node is shutting down"));
    }
    auto rpc_client = get_rpc_client(messaging_verb::STREAM_MUTATION_FRAGMENTS, addr_for_host_id(id), id);
    return rpc_client->make_stream_sink<netw::serializer, frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>().then([this, session, plan_id, schema_id, cf_id, estimated_partitions, reason, rpc_client] (rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd> sink) mutable {
@@ -1321,7 +1321,7 @@ rpc::sink<streaming::stream_blob_cmd_data> messaging_service::make_sink_for_stre
 future<std::tuple<rpc::sink<streaming::stream_blob_cmd_data>, rpc::source<streaming::stream_blob_cmd_data>>>
 messaging_service::make_sink_and_source_for_stream_blob(streaming::stream_blob_meta meta, locator::host_id id) {
    if (is_shutting_down()) {
-        co_await coroutine::return_exception(rpc::closed_error());
+        co_await coroutine::return_exception(rpc::closed_error("local node is shutting down"));
    }
    auto rpc_client = get_rpc_client(messaging_verb::STREAM_BLOB, addr_for_host_id(id), id);
    auto sink = co_await rpc_client->make_stream_sink<netw::serializer, streaming::stream_blob_cmd_data>();
@@ -1370,7 +1370,7 @@ future<std::tuple<rpc::sink<repair_hash_with_cmd>, rpc::source<repair_row_on_wir
 messaging_service::make_sink_and_source_for_repair_get_row_diff_with_rpc_stream(uint32_t repair_meta_id, shard_id dst_cpu_id, locator::host_id id) {
    auto verb = messaging_verb::REPAIR_GET_ROW_DIFF_WITH_RPC_STREAM;
    if (is_shutting_down()) {
-        return make_exception_future<std::tuple<rpc::sink<repair_hash_with_cmd>, rpc::source<repair_row_on_wire_with_cmd>>>(rpc::closed_error());
+        return make_exception_future<std::tuple<rpc::sink<repair_hash_with_cmd>, rpc::source<repair_row_on_wire_with_cmd>>>(rpc::closed_error("local node is shutting down"));
    }
    auto rpc_client = get_rpc_client(verb, addr_for_host_id(id), id);
    return do_make_sink_source<repair_hash_with_cmd, repair_row_on_wire_with_cmd>(verb, repair_meta_id, dst_cpu_id, std::move(rpc_client), rpc());
@@ -1392,7 +1392,7 @@ future<std::tuple<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_str
 messaging_service::make_sink_and_source_for_repair_put_row_diff_with_rpc_stream(uint32_t repair_meta_id, shard_id dst_cpu_id, locator::host_id id) {
    auto verb = messaging_verb::REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM;
    if (is_shutting_down()) {
-        return make_exception_future<std::tuple<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_stream_cmd>>>(rpc::closed_error());
+        return make_exception_future<std::tuple<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_stream_cmd>>>(rpc::closed_error("local node is shutting down"));
    }
    auto rpc_client = get_rpc_client(verb, addr_for_host_id(id), id);
    return do_make_sink_source<repair_row_on_wire_with_cmd, repair_stream_cmd>(verb, repair_meta_id, dst_cpu_id, std::move(rpc_client), rpc());
@@ -1414,7 +1414,7 @@ future<std::tuple<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd
 messaging_service::make_sink_and_source_for_repair_get_full_row_hashes_with_rpc_stream(uint32_t repair_meta_id, shard_id dst_cpu_id, locator::host_id id) {
    auto verb = messaging_verb::REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM;
    if (is_shutting_down()) {
-        return make_exception_future<std::tuple<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd>>>(rpc::closed_error());
+        return make_exception_future<std::tuple<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd>>>(rpc::closed_error("local node is shutting down"));
    }
    auto rpc_client = get_rpc_client(verb, addr_for_host_id(id), id);
    return do_make_sink_source<repair_stream_cmd, repair_hash_with_cmd>(verb, repair_meta_id, dst_cpu_id, std::move(rpc_client), rpc());
--- a/message/rpc_protocol_impl.hh
+++ b/message/rpc_protocol_impl.hh
@@ -127,20 +127,21 @@ auto send_message(messaging_service* ms, messaging_verb verb, std::optional<loca
    auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
    using futurator = futurize<std::invoke_result_t<decltype(rpc_handler), rpc_protocol::client&, MsgOut...>>;
    if (ms->is_shutting_down()) {
-        return futurator::make_exception_future(rpc::closed_error());
+        return futurator::make_exception_future(rpc::closed_error("local node is shutting down"));
    }
    auto rpc_client_ptr = ms->get_rpc_client(verb, id, host_id);
    auto& rpc_client = *rpc_client_ptr;
    return rpc_handler(rpc_client, std::forward<MsgOut>(msg)...).handle_exception([ms = ms->shared_from_this(), id, host_id, verb, rpc_client_ptr = std::move(rpc_client_ptr)] (std::exception_ptr&& eptr) {
        ms->increment_dropped_messages(verb);
-        if (try_catch<rpc::closed_error>(eptr)) {
+        if (const auto* exp = try_catch<rpc::closed_error>(eptr)) {
            // This is a transport error
            if (host_id) {
                ms->remove_error_rpc_client(verb, *host_id);
            } else {
                ms->remove_error_rpc_client(verb, id);
            }
-            return futurator::make_exception_future(std::move(eptr));
+            return futurator::make_exception_future(rpc::closed_error(fmt::format("got error from node {}/{}: {}",
+                    host_id.value_or(locator::host_id{}), id.addr, exp->what())));
        } else {
            // This is expected to be a rpc server error, e.g., the rpc handler throws a std::runtime_error.
            return futurator::make_exception_future(std::move(eptr));
@@ -165,20 +166,21 @@ auto send_message_timeout(messaging_service* ms, messaging_verb verb, std::optio
    auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
    using futurator = futurize<std::invoke_result_t<decltype(rpc_handler), rpc_protocol::client&, MsgOut...>>;
    if (ms->is_shutting_down()) {
-        return futurator::make_exception_future(rpc::closed_error());
+        return futurator::make_exception_future(rpc::closed_error("local node is shutting down"));
    }
    auto rpc_client_ptr = ms->get_rpc_client(verb, id, host_id);
    auto& rpc_client = *rpc_client_ptr;
    return rpc_handler(rpc_client, timeout, std::forward<MsgOut>(msg)...).handle_exception([ms = ms->shared_from_this(), id, host_id, verb, rpc_client_ptr = std::move(rpc_client_ptr)] (std::exception_ptr&& eptr) {
        ms->increment_dropped_messages(verb);
-        if (try_catch<rpc::closed_error>(eptr)) {
+        if (const auto* exp = try_catch<rpc::closed_error>(eptr)) {
            // This is a transport error
            if (host_id) {
                ms->remove_error_rpc_client(verb, *host_id);
            } else {
                ms->remove_error_rpc_client(verb, id);
            }
-            return futurator::make_exception_future(std::move(eptr));
+            return futurator::make_exception_future(rpc::closed_error(fmt::format("got error from node {}/{}: {}",
+                    host_id.value_or(locator::host_id{}), id.addr, exp->what())));
        } else {
            // This is expected to be a rpc server error, e.g., the rpc handler throws a std::runtime_error.
            return futurator::make_exception_future(std::move(eptr));
@@ -206,7 +208,7 @@ auto send_message_cancellable(messaging_service* ms, messaging_verb verb, std::o
    auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
    using futurator = futurize<std::invoke_result_t<decltype(rpc_handler), rpc_protocol::client&, MsgOut...>>;
    if (ms->is_shutting_down()) {
-        return futurator::make_exception_future(rpc::closed_error());
+        return futurator::make_exception_future(rpc::closed_error("local node is shutting down"));
    }
    auto rpc_client_ptr = ms->get_rpc_client(verb, id, host_id);
    auto& rpc_client = *rpc_client_ptr;
@@ -222,14 +224,15 @@ auto send_message_cancellable(messaging_service* ms, messaging_verb verb, std::o

    return rpc_handler(rpc_client, c_ref, std::forward<MsgOut>(msg)...).handle_exception([ms = ms->shared_from_this(), id, host_id, verb, rpc_client_ptr = std::move(rpc_client_ptr), sub = std::move(sub)] (std::exception_ptr&& eptr) {
        ms->increment_dropped_messages(verb);
-        if (try_catch<rpc::closed_error>(eptr)) {
+        if (const auto* exp = try_catch<rpc::closed_error>(eptr)) {
            // This is a transport error
            if (host_id) {
                ms->remove_error_rpc_client(verb, *host_id);
            } else {
                ms->remove_error_rpc_client(verb, id);
            }
-            return futurator::make_exception_future(std::move(eptr));
+            return futurator::make_exception_future(rpc::closed_error(fmt::format("got error from node {}/{}: {}",
+                    host_id.value_or(locator::host_id{}), id.addr, exp->what())));
        } else if (try_catch<rpc::canceled_error>(eptr)) {
            // Translate low-level canceled_error into high-level abort_requested_exception.
            return futurator::make_exception_future(abort_requested_exception{});
@@ -255,9 +258,10 @@ auto send_message_timeout_cancellable(messaging_service* ms, messaging_verb verb
    auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
    using futurator = futurize<std::invoke_result_t<decltype(rpc_handler), rpc_protocol::client&, MsgOut...>>;
    if (ms->is_shutting_down()) {
-        return futurator::make_exception_future(rpc::closed_error());
+        return futurator::make_exception_future(rpc::closed_error("local node is shutting down"));
    }
-    auto rpc_client_ptr = ms->get_rpc_client(verb, ms->addr_for_host_id(host_id), host_id);
+    auto address = ms->addr_for_host_id(host_id);
+    auto rpc_client_ptr = ms->get_rpc_client(verb, address, host_id);
    auto& rpc_client = *rpc_client_ptr;

    auto c = std::make_unique<seastar::rpc::cancellable>();
@@ -269,12 +273,13 @@ auto send_message_timeout_cancellable(messaging_service* ms, messaging_verb verb
        return futurator::make_exception_future(abort_requested_exception{});
    }

-    return rpc_handler(rpc_client, timeout, c_ref, std::forward<MsgOut>(msg)...).handle_exception([ms = ms->shared_from_this(), host_id, verb, rpc_client_ptr = std::move(rpc_client_ptr), sub = std::move(sub)] (std::exception_ptr&& eptr) {
+    return rpc_handler(rpc_client, timeout, c_ref, std::forward<MsgOut>(msg)...).handle_exception([ms = ms->shared_from_this(), host_id, address, verb, rpc_client_ptr = std::move(rpc_client_ptr), sub = std::move(sub)] (std::exception_ptr&& eptr) {
        ms->increment_dropped_messages(verb);
-        if (try_catch<rpc::closed_error>(eptr)) {
+        if (const auto* exp = try_catch<rpc::closed_error>(eptr)) {
            // This is a transport error
            ms->remove_error_rpc_client(verb, host_id);
-            return futurator::make_exception_future(std::move(eptr));
+            return futurator::make_exception_future(rpc::closed_error(fmt::format("got error from node {}/{}: {}",
+                    host_id, address.addr, exp->what())));
        } else if (try_catch<rpc::canceled_error>(eptr)) {
            // Translate low-level canceled_error into high-level abort_requested_exception.
            return futurator::make_exception_future(abort_requested_exception{});
--- a/pgo/profiles/aarch64/profile.profdata.xz
+++ b/pgo/profiles/aarch64/profile.profdata.xz
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3cbe2dd05945f8fb76ebce2ea70864063d2b282c4d5080af1f290ead43321ab3
-size 6444732
+oid sha256:9d387b5ff44094e9b6c587d3e0cb2e7098ea68924f3f9947ff7574be3c378a4e
+size 6475784
--- a/pgo/profiles/x86_64/profile.profdata.xz
+++ b/pgo/profiles/x86_64/profile.profdata.xz
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ad1705d5c37cc6b6cd24354b83fee8da64a14f918351d357f21cf771a650ad3d
-size 6452816
+oid sha256:3b35c1ed982e025b4c3d079c2d14873a115ff8e8d364a19633bf83052e52a059
+size 6473408
--- a/raft/fsm.cc
+++ b/raft/fsm.cc
@@ -176,7 +176,7 @@ void fsm::become_leader() {

    _last_election_time = _clock.now();
    _ping_leader = false;
-    // a new leader needs to commit at lease one entry to make sure that
+    // a new leader needs to commit at least one entry to make sure that
    // all existing entries in its log are committed as well. Also it should
    // send append entries RPC as soon as possible to establish its leadership
    // (3.4). Do both of those by committing a dummy entry.
--- a/reader_concurrency_semaphore.cc
+++ b/reader_concurrency_semaphore.cc
@@ -1020,6 +1020,13 @@ void reader_concurrency_semaphore::consume(reader_permit::impl& permit, resource

 void reader_concurrency_semaphore::signal(const resources& r) noexcept {
    _resources += r;
+    if (_resources.count > _initial_resources.count || _resources.memory > _initial_resources.memory) [[unlikely]] {
+        on_internal_error_noexcept(rcslog,
+                format("reader_concurrency_semaphore::signal(): semaphore {} detected resource leak, available {} exceeds initial {}", _name, _resources, _initial_resources));
+        // Clamp to the initial budget; std::max would keep the excess and also inflate the dimension that did not overflow.
+        _resources.count = std::min(_resources.count, _initial_resources.count);
+        _resources.memory = std::min(_resources.memory, _initial_resources.memory);
+    }
    maybe_wake_execution_loop();
 }

--- a/repair/row_level.cc
+++ b/repair/row_level.cc
@@ -1179,6 +1179,7 @@ private:
        bool full = is_incremental_repair_using_all_sstables();
        auto& tinfo = tmap.get_tablet_info(id);
        auto sstables_repaired_at = tinfo.sstables_repaired_at;
+        auto gid = locator::global_tablet_id{tid, id};
        // Consider this:
        // 1) n1 is the topology coordinator
        // 2) n1 schedules and executes a tablet repair with session id s1 for a tablet on n3 an n4.
@@ -1190,14 +1191,16 @@ private:
        // To avoid the deadlock, we can throw in step 7 so that n2 will
        // proceed to the end_repair stage and release the lock. After that,
        // the scheduler could schedule the tablet repair again.
-        if (_rs._repair_compaction_locks.contains(_frozen_topology_guard)) {
+        if (_rs._repair_compaction_locks.contains(gid)) {
            auto msg = fmt::format("Tablet repair session={} table={} is in progress", _frozen_topology_guard, tid);
            rlogger.info("{}", msg);
            throw std::runtime_error(msg);
        }
+
+        co_await utils::get_local_injector().inject("incremental_repair_prepare_wait", utils::wait_for_message(60s));
        auto reenablers_and_holders = co_await table.get_compaction_reenablers_and_lock_holders_for_repair(_db.local(), _frozen_topology_guard, _range);
        for (auto& lock_holder : reenablers_and_holders.lock_holders) {
-            _rs._repair_compaction_locks[_frozen_topology_guard].push_back(std::move(lock_holder));
+            _rs._repair_compaction_locks[gid].push_back(std::move(lock_holder));
        }
        auto sstables = co_await table.take_storage_snapshot(_range);
        _incremental_repair_meta.sst_set = make_lw_shared<sstables::sstable_set>(sstables::make_partitioned_sstable_set(_schema, _range));
@@ -2836,9 +2839,20 @@ future<> repair_service::init_ms_handlers() {
            auto& table = local_repair.get_db().local().find_column_family(gid.table);
            auto erm = table.get_effective_replication_map();
            auto& tmap = erm->get_token_metadata_ptr()->tablets().get_tablet_map(gid.table);
+            auto* trinfo = tmap.get_tablet_transition_info(gid.tablet);
+            if (!trinfo) {
+                auto msg = fmt::format("Skipped repair_update_compaction_ctrl gid={} session_id={} since tablet is not in transition", gid, topo_guard);
+                rlogger.warn("{}", msg);
+                throw std::runtime_error(msg);
+            }
+            if (trinfo->stage != locator::tablet_transition_stage::end_repair) {
+                auto msg = fmt::format("Skipped repair_update_compaction_ctrl gid={} session_id={} since tablet is not in tablet_transition_stage::end_repair", gid, topo_guard);
+                rlogger.warn("{}", msg);
+                throw std::runtime_error(msg);
+            }
            auto range = tmap.get_token_range(gid.tablet);
            co_await table.clear_being_repaired_for_range(range);
-            auto removed = local_repair._repair_compaction_locks.erase(topo_guard);
+            auto removed = local_repair._repair_compaction_locks.erase(gid);
            rlogger.info("Got repair_update_compaction_ctrl gid={} session_id={} removed={}", gid, topo_guard, removed);
        });
    });
--- a/repair/row_level.hh
+++ b/repair/row_level.hh
@@ -154,7 +154,7 @@ class repair_service : public seastar::peering_sharded_service<repair_service> {
            std::unordered_set<locator::host_id> ignore_nodes);

 public:
-    std::unordered_map<service::session_id, std::vector<seastar::rwlock::holder>> _repair_compaction_locks;
+    std::unordered_map<locator::global_tablet_id, std::vector<seastar::rwlock::holder>> _repair_compaction_locks;

 public:
    repair_service(sharded<service::topology_state_machine>& tsm,
--- a/replica/compaction_group.hh
+++ b/replica/compaction_group.hh
@@ -84,6 +84,10 @@ class compaction_group {
    seastar::named_gate _async_gate;
    // Gates flushes.
    seastar::named_gate _flush_gate;
+    // Gates sstable being added to the group.
+    // This prevents the group from being considered empty when sstables are being added.
+    // Crucial for tablet split which ACKs split for a table when all pre-split groups are empty.
+    seastar::named_gate _sstable_add_gate;
    bool _tombstone_gc_enabled = true;
    std::optional<compaction::compaction_backlog_tracker> _backlog_tracker;
    repair_classifier_func _repair_sstable_classifier;
@@ -248,6 +252,10 @@ public:
        return _flush_gate;
    }

+    seastar::named_gate& sstable_add_gate() noexcept {
+        return _sstable_add_gate;
+    }
+
    compaction::compaction_manager& get_compaction_manager() noexcept;
    const compaction::compaction_manager& get_compaction_manager() const noexcept;

@@ -306,8 +314,8 @@ public:
    uint64_t live_disk_space_used() const;

    void for_each_compaction_group(std::function<void(const compaction_group_ptr&)> action) const;
-    utils::small_vector<compaction_group_ptr, 3> compaction_groups();
-    utils::small_vector<const_compaction_group_ptr, 3> compaction_groups() const;
+    utils::small_vector<compaction_group_ptr, 3> compaction_groups_immediate();
+    utils::small_vector<const_compaction_group_ptr, 3> compaction_groups_immediate() const;

    utils::small_vector<compaction_group_ptr, 3> split_unready_groups() const;
    bool split_unready_groups_are_empty() const;
@@ -434,7 +442,7 @@ public:
    virtual bool all_storage_groups_split() = 0;
    virtual future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) = 0;
    virtual future<> maybe_split_compaction_group_of(size_t idx) = 0;
-    virtual future<std::vector<sstables::shared_sstable>> maybe_split_sstable(const sstables::shared_sstable& sst) = 0;
+    virtual future<std::vector<sstables::shared_sstable>> maybe_split_new_sstable(const sstables::shared_sstable& sst) = 0;
    virtual dht::token_range get_token_range_after_split(const dht::token&) const noexcept = 0;

    virtual lw_shared_ptr<sstables::sstable_set> make_sstable_set() const = 0;
--- a/replica/database.cc
+++ b/replica/database.cc
@@ -2793,6 +2793,7 @@ future<> database::flush_all_tables() {
    });
    _all_tables_flushed_at = db_clock::now();
    co_await _commitlog->wait_for_pending_deletes();
+    dblog.info("Forcing new commitlog segment and flushing all tables complete");
 }

 future<db_clock::time_point> database::get_all_tables_flushed_at(sharded<database>& sharded_db) {
@@ -2815,7 +2816,7 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, t
        co_await flush_table_on_all_shards(sharded_db, uuid);
    }
    auto table_shards = co_await get_table_on_all_shards(sharded_db, uuid);
-    co_await table::snapshot_on_all_shards(sharded_db, table_shards, tag);
+    co_await snapshot_table_on_all_shards(sharded_db, table_shards, tag);
 }

 future<> database::snapshot_tables_on_all_shards(sharded<database>& sharded_db, std::string_view ks_name, std::vector<sstring> table_names, sstring tag, bool skip_flush) {
@@ -2951,7 +2952,7 @@ future<> database::truncate_table_on_all_shards(sharded<database>& sharded_db, s
        auto truncated_at = truncated_at_opt.value_or(db_clock::now());
        auto name = snapshot_name_opt.value_or(
            format("{:d}-{}", truncated_at.time_since_epoch().count(), cf.schema()->cf_name()));
-        co_await table::snapshot_on_all_shards(sharded_db, table_shards, name);
+        co_await snapshot_table_on_all_shards(sharded_db, table_shards, name);
    }

    co_await sharded_db.invoke_on_all([&] (database& db) {
--- a/replica/database.hh
+++ b/replica/database.hh
@@ -604,9 +604,28 @@ public:

    data_dictionary::table as_data_dictionary() const;

+    // The use of these functions is restricted to preexisting sstables that aren't being
+    // moved anywhere, so they should never be used in the context of file streaming or
+    // intra-node migration. The only user today is the distributed loader, which populates
+    // the sstables for each column family on boot.
    future<> add_sstable_and_update_cache(sstables::shared_sstable sst,
                                          sstables::offstrategy offstrategy = sstables::offstrategy::no);
    future<> add_sstables_and_update_cache(const std::vector<sstables::shared_sstable>& ssts);
+
+    // Restricted to new sstables produced by external processes such as repair.
+    // The sstable might undergo split if table is in split mode.
+    // If no need for split, the input sstable will only be attached to the sstable set.
+    // If split happens, the output sstables will be attached and the input sstable unlinked.
+    // On failure, the input sstable is unlinked and exception propagated to the caller.
+    // The on_add callback will be called on all sstables to be added into the set.
+    [[nodiscard]] future<std::vector<sstables::shared_sstable>>
+    add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
+                                     std::function<future<>(sstables::shared_sstable)> on_add,
+                                     sstables::offstrategy offstrategy = sstables::offstrategy::no);
+    [[nodiscard]] future<std::vector<sstables::shared_sstable>>
+    add_new_sstables_and_update_cache(std::vector<sstables::shared_sstable> new_ssts,
+                                      std::function<future<>(sstables::shared_sstable)> on_add);
+
    future<> move_sstables_from_staging(std::vector<sstables::shared_sstable>);
    sstables::shared_sstable make_sstable();
    void set_truncation_time(db_clock::time_point truncated_at) noexcept {
@@ -724,7 +743,9 @@ private:
        return _config.enable_cache && _schema->caching_options().enabled();
    }
    void update_stats_for_new_sstable(const sstables::shared_sstable& sst) noexcept;
-    future<> do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_sstable sst, sstables::offstrategy, bool trigger_compaction);
+    // This function can throw even if the sstable was added into the set. When the sstable was successfully
+    // added, the sstable ptr @sst will be set to nullptr, allowing the caller to optionally discard the sstable.
+    future<> do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_sstable& sst, sstables::offstrategy, bool trigger_compaction);
    future<> do_add_sstable_and_update_cache(sstables::shared_sstable sst, sstables::offstrategy offstrategy, bool trigger_compaction);
    // Helpers which add sstable on behalf of a compaction group and refreshes compound set.
    void add_sstable(compaction_group& cg, sstables::shared_sstable sstable);
@@ -1037,37 +1058,11 @@ public:
    db::replay_position set_low_replay_position_mark();
    db::replay_position highest_flushed_replay_position() const;

-private:
-    using snapshot_file_set = foreign_ptr<std::unique_ptr<std::unordered_set<sstring>>>;
-
-    future<snapshot_file_set> take_snapshot(sstring jsondir);
-    // Writes the table schema and the manifest of all files in the snapshot directory.
-    future<> finalize_snapshot(const global_table_ptr& table_shards, sstring jsondir, std::vector<snapshot_file_set> file_sets);
-    static future<> seal_snapshot(sstring jsondir, std::vector<snapshot_file_set> file_sets);
-public:
-    static future<> snapshot_on_all_shards(sharded<database>& sharded_db, const global_table_ptr& table_shards, sstring name);
+    future<std::pair<std::vector<sstables::shared_sstable>, sstable_list_permit>> snapshot_sstables();

    future<std::unordered_map<sstring, snapshot_details>> get_snapshot_details();
    static future<snapshot_details> get_snapshot_details(std::filesystem::path snapshot_dir, std::filesystem::path datadir);

-    /*!
-     * \brief write the schema to a 'schema.cql' file at the given directory.
-     *
-     * When doing a snapshot, the snapshot directory contains a 'schema.cql' file
-     * with a CQL command that can be used to generate the schema.
-     * The content is is similar to the result of the CQL DESCRIBE command of the table.
-     *
-     * When a schema has indexes, local indexes or views, those indexes and views
-     * are represented by their own schemas.
-     * In those cases, the method would write the relevant information for each of the schemas:
-     *
-     * The schema of the base table would output a file with the CREATE TABLE command
-     * and the schema of the view that is used for the index would output a file with the
-     * CREATE INDEX command.
-     * The same is true for local index and MATERIALIZED VIEW.
-     */
-    future<> write_schema_as_cql(const global_table_ptr& table_shards, sstring dir) const;
-
    bool incremental_backups_enabled() const {
        return _config.enable_incremental_backups;
    }
@@ -1358,7 +1353,8 @@ public:

    // Clones storage of a given tablet. Memtable is flushed first to guarantee that the
    // snapshot (list of sstables) will include all the data written up to the time it was taken.
-    future<utils::chunked_vector<sstables::entry_descriptor>> clone_tablet_storage(locator::tablet_id tid);
+    // If leave_unsealed is set, all the destination sstables will be left unsealed.
+    future<utils::chunked_vector<sstables::entry_descriptor>> clone_tablet_storage(locator::tablet_id tid, bool leave_unsealed);

    friend class compaction_group;
    friend class compaction::compaction_task_impl;
@@ -2019,6 +2015,7 @@ private:
    keyspace::config make_keyspace_config(const keyspace_metadata& ksm, system_keyspace is_system);
    struct table_truncate_state;

+    static future<> snapshot_table_on_all_shards(sharded<database>& sharded_db, const global_table_ptr& table_shards, sstring name);
    static future<> truncate_table_on_all_shards(sharded<database>& db, sharded<db::system_keyspace>& sys_ks, const global_table_ptr&, std::optional<db_clock::time_point> truncated_at_opt, bool with_snapshot, std::optional<sstring> snapshot_name_opt);
    future<> truncate(db::system_keyspace& sys_ks, column_family& cf, std::vector<lw_shared_ptr<replica::table>>& views, const table_truncate_state&);
 public:
--- a/replica/table.cc
+++ b/replica/table.cc
@@ -210,9 +210,9 @@ table::add_memtables_to_reader_list(std::vector<mutation_reader>& readers,
    auto sgs = storage_groups_for_token_range(token_range);
    reserve_fn(std::ranges::fold_left(sgs | std::views::transform(std::mem_fn(&storage_group::memtable_count)), uint64_t(0), std::plus{}));
    for (auto& sg : sgs) {
-        for (auto& cg : sg->compaction_groups()) {
+        sg->for_each_compaction_group([&] (const compaction_group_ptr &cg) {
            add_memtables_from_cg(*cg);
-        }
+        });
    }
 }

@@ -423,15 +423,27 @@ bool compaction_group::memtable_has_key(const dht::decorated_key& key) const {
 }

 api::timestamp_type storage_group::min_memtable_timestamp() const {
-    return std::ranges::min(compaction_groups() | std::views::transform(std::mem_fn(&compaction_group::min_memtable_timestamp)));
+    api::timestamp_type min_timestamp = api::max_timestamp;
+    for_each_compaction_group([&min_timestamp] (const compaction_group_ptr& cg) {
+        min_timestamp = std::min(min_timestamp, cg->min_memtable_timestamp());
+    });
+    return min_timestamp;
 }

 api::timestamp_type storage_group::min_memtable_live_timestamp() const {
-    return std::ranges::min(compaction_groups() | std::views::transform(std::mem_fn(&compaction_group::min_memtable_live_timestamp)));
+    api::timestamp_type min_timestamp = api::max_timestamp;
+    for_each_compaction_group([&min_timestamp] (const compaction_group_ptr& cg) {
+        min_timestamp = std::min(min_timestamp, cg->min_memtable_live_timestamp());
+    });
+    return min_timestamp;
 }

 api::timestamp_type storage_group::min_memtable_live_row_marker_timestamp() const {
-    return std::ranges::min(compaction_groups() | std::views::transform(std::mem_fn(&compaction_group::min_memtable_live_row_marker_timestamp)));
+    api::timestamp_type min_timestamp = api::max_timestamp;
+    for_each_compaction_group([&min_timestamp] (const compaction_group_ptr& cg) {
+        min_timestamp = std::min(min_timestamp, cg->min_memtable_live_row_marker_timestamp());
+    });
+    return min_timestamp;
 }

 api::timestamp_type table::min_memtable_timestamp() const {
@@ -721,7 +733,7 @@ public:
    bool all_storage_groups_split() override { return true; }
    future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) override { return make_ready_future(); }
    future<> maybe_split_compaction_group_of(size_t idx) override { return make_ready_future(); }
-    future<std::vector<sstables::shared_sstable>> maybe_split_sstable(const sstables::shared_sstable& sst) override {
+    future<std::vector<sstables::shared_sstable>> maybe_split_new_sstable(const sstables::shared_sstable& sst) override {
        return make_ready_future<std::vector<sstables::shared_sstable>>(std::vector<sstables::shared_sstable>{sst});
    }
    dht::token_range get_token_range_after_split(const dht::token&) const noexcept override { return dht::token_range(); }
@@ -879,7 +891,7 @@ public:
    bool all_storage_groups_split() override;
    future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) override;
    future<> maybe_split_compaction_group_of(size_t idx) override;
-    future<std::vector<sstables::shared_sstable>> maybe_split_sstable(const sstables::shared_sstable& sst) override;
+    future<std::vector<sstables::shared_sstable>> maybe_split_new_sstable(const sstables::shared_sstable& sst) override;
    dht::token_range get_token_range_after_split(const dht::token& token) const noexcept override {
        return tablet_map().get_token_range_after_split(token);
    }
@@ -933,7 +945,7 @@ void storage_group::for_each_compaction_group(std::function<void(const compactio
    }
 }

-utils::small_vector<compaction_group_ptr, 3> storage_group::compaction_groups() {
+utils::small_vector<compaction_group_ptr, 3> storage_group::compaction_groups_immediate() {
    utils::small_vector<compaction_group_ptr, 3> cgs;
    for_each_compaction_group([&cgs] (const compaction_group_ptr& cg) {
        cgs.push_back(cg);
@@ -941,7 +953,7 @@ utils::small_vector<compaction_group_ptr, 3> storage_group::compaction_groups()
    return cgs;
 }

-utils::small_vector<const_compaction_group_ptr, 3> storage_group::compaction_groups() const {
+utils::small_vector<const_compaction_group_ptr, 3> storage_group::compaction_groups_immediate() const {
    utils::small_vector<const_compaction_group_ptr, 3> cgs;
    for_each_compaction_group([&cgs] (const compaction_group_ptr& cg) {
        cgs.push_back(cg);
@@ -1130,7 +1142,8 @@ future<> tablet_storage_group_manager::maybe_split_compaction_group_of(size_t id
 }

 future<std::vector<sstables::shared_sstable>>
-tablet_storage_group_manager::maybe_split_sstable(const sstables::shared_sstable& sst) {
+tablet_storage_group_manager::maybe_split_new_sstable(const sstables::shared_sstable& sst) {
+    co_await utils::get_local_injector().inject("maybe_split_new_sstable_wait", utils::wait_for_message(120s));
    if (!tablet_map().needs_split()) {
        co_return std::vector<sstables::shared_sstable>{sst};
    }
@@ -1138,8 +1151,7 @@ tablet_storage_group_manager::maybe_split_sstable(const sstables::shared_sstable
    auto& cg = compaction_group_for_sstable(sst);
    auto holder = cg.async_gate().hold();
    auto& view = cg.view_for_sstable(sst);
-    auto lock_holder = co_await _t.get_compaction_manager().get_incremental_repair_read_lock(view, "maybe_split_sstable");
-    co_return co_await _t.get_compaction_manager().maybe_split_sstable(sst, view, co_await split_compaction_options());
+    co_return co_await _t.get_compaction_manager().maybe_split_new_sstable(sst, view, co_await split_compaction_options());
 }

 future<> table::maybe_split_compaction_group_of(locator::tablet_id tablet_id) {
@@ -1149,7 +1161,7 @@ future<> table::maybe_split_compaction_group_of(locator::tablet_id tablet_id) {

 future<std::vector<sstables::shared_sstable>> table::maybe_split_new_sstable(const sstables::shared_sstable& sst) {
    auto holder = async_gate().hold();
-    co_return co_await _sg_manager->maybe_split_sstable(sst);
+    co_return co_await _sg_manager->maybe_split_new_sstable(sst);
 }

 dht::token_range table::get_token_range_after_split(const dht::token& token) const noexcept {
@@ -1257,7 +1269,7 @@ future<> table::parallel_foreach_compaction_group(std::function<future<>(compact
            tlogger.info("foreach_compaction_group_wait: released");
        });

-        co_await coroutine::parallel_for_each(sg.compaction_groups(), [&] (compaction_group_ptr cg) -> future<> {
+        co_await coroutine::parallel_for_each(sg.compaction_groups_immediate(), [&] (compaction_group_ptr cg) -> future<> {
            if (auto holder = try_hold_gate(cg->async_gate())) {
                co_await action(*cg);
            }
@@ -1330,7 +1342,7 @@ future<utils::chunked_vector<sstables::shared_sstable>> table::take_sstable_set_
 }

 future<utils::chunked_vector<sstables::entry_descriptor>>
-table::clone_tablet_storage(locator::tablet_id tid) {
+table::clone_tablet_storage(locator::tablet_id tid, bool leave_unsealed) {
    utils::chunked_vector<sstables::entry_descriptor> ret;
    auto holder = async_gate().hold();

@@ -1342,7 +1354,7 @@ table::clone_tablet_storage(locator::tablet_id tid) {
    // by compaction while we are waiting for the lock.
    auto deletion_guard = co_await get_sstable_list_permit();
    co_await sg.make_sstable_set()->for_each_sstable_gently([&] (const sstables::shared_sstable& sst) -> future<> {
-        ret.push_back(co_await sst->clone(calculate_generation_for_new_table()));
+        ret.push_back(co_await sst->clone(calculate_generation_for_new_table(), leave_unsealed));
    });
    co_return ret;
 }
@@ -1354,10 +1366,10 @@ void table::update_stats_for_new_sstable(const sstables::shared_sstable& sst) no
 }

 future<>
-table::do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_sstable sst, sstables::offstrategy offstrategy,
+table::do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_sstable& sst, sstables::offstrategy offstrategy,
                                       bool trigger_compaction) {
    auto permit = co_await seastar::get_units(_sstable_set_mutation_sem, 1);
-    co_return co_await get_row_cache().invalidate(row_cache::external_updater([&] () noexcept {
+    co_return co_await get_row_cache().invalidate(row_cache::external_updater([&] () mutable noexcept {
        // FIXME: this is not really noexcept, but we need to provide strong exception guarantees.
        // atomically load all opened sstables into column family.
        if (!offstrategy) {
@@ -1369,6 +1381,8 @@ table::do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_ss
        if (trigger_compaction) {
            try_trigger_compaction(cg);
        }
+        // Resetting sstable ptr to inform the caller the sstable has been loaded successfully.
+        sst = nullptr;
    }), dht::partition_range::make({sst->get_first_decorated_key(), true}, {sst->get_last_decorated_key(), true}), [sst, schema = _schema] (const dht::decorated_key& key) {
        return sst->filter_has_key(sstables::key::from_partition_key(*schema, key.key()));
    });
@@ -1376,12 +1390,10 @@ table::do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_ss

 future<>
 table::do_add_sstable_and_update_cache(sstables::shared_sstable new_sst, sstables::offstrategy offstrategy, bool trigger_compaction) {
-    for (auto sst : co_await maybe_split_new_sstable(new_sst)) {
-        auto& cg = compaction_group_for_sstable(sst);
-        // Hold gate to make share compaction group is alive.
-        auto holder = cg.async_gate().hold();
-        co_await do_add_sstable_and_update_cache(cg, std::move(sst), offstrategy, trigger_compaction);
-    }
+    auto& cg = compaction_group_for_sstable(new_sst);
+    // Hold gate to make sure the compaction group is alive.
+    auto holder = cg.async_gate().hold();
+    co_await do_add_sstable_and_update_cache(cg, new_sst, offstrategy, trigger_compaction);
 }

 future<>
@@ -1399,6 +1411,85 @@ table::add_sstables_and_update_cache(const std::vector<sstables::shared_sstable>
    trigger_compaction();
 }

+future<std::vector<sstables::shared_sstable>>
+table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
+                                        std::function<future<>(sstables::shared_sstable)> on_add,
+                                        sstables::offstrategy offstrategy) {
+    std::vector<sstables::shared_sstable> ret, ssts;
+    std::exception_ptr ex;
+    try {
+        bool trigger_compaction = offstrategy == sstables::offstrategy::no;
+        auto& cg = compaction_group_for_sstable(new_sst);
+        // This prevents compaction group from being considered empty until the holder is released.
+        // Helpful for tablet split, where split is acked for a table when all pre-split groups are empty.
+        auto sstable_add_holder = cg.sstable_add_gate().hold();
+
+        ret = ssts = co_await maybe_split_new_sstable(new_sst);
+        // on successful split, input sstable is unlinked.
+        new_sst = nullptr;
+        for (auto& sst : ssts) {
+            auto& cg = compaction_group_for_sstable(sst);
+            // Hold gate to make sure compaction group is alive.
+            auto holder = cg.async_gate().hold();
+            co_await on_add(sst);
+            // If do_add_sstable_and_update_cache() throws after sstable has been loaded, the pointer
+            // sst passed by reference will be set to nullptr, so it won't be unlinked in the exception
+            // handler below.
+            co_await do_add_sstable_and_update_cache(cg, sst, offstrategy, trigger_compaction);
+            sst = nullptr;
+        }
+    } catch (...) {
+        ex = std::current_exception();
+    }
+
+    if (ex) {
+        // on failed split, input sstable is unlinked here.
+        if (new_sst) {
+            tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", new_sst->get_filename(), new_sst->get_origin(), ex);
+            co_await new_sst->unlink();
+        }
+        // on failure after successful split, sstables not attached yet will be unlinked
+        co_await coroutine::parallel_for_each(ssts, [&ex] (sstables::shared_sstable sst) -> future<> {
+            if (sst) {
+                tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
+                co_await sst->unlink();
+            }
+        });
+        co_await coroutine::return_exception_ptr(std::move(ex));
+    }
+    co_return std::move(ret);
+}
+
+future<std::vector<sstables::shared_sstable>>
+table::add_new_sstables_and_update_cache(std::vector<sstables::shared_sstable> new_ssts,
+                                         std::function<future<>(sstables::shared_sstable)> on_add) {
+    std::exception_ptr ex;
+    std::vector<sstables::shared_sstable> ret;
+
+    // We rely on add_new_sstable_and_update_cache() to unlink the sstable fed into it,
+    // so the exception handling below will only have to unlink sstables not processed yet.
+    try {
+        for (auto& sst: new_ssts) {
+            auto ssts = co_await add_new_sstable_and_update_cache(std::exchange(sst, nullptr), on_add);
+            std::ranges::move(ssts, std::back_inserter(ret));
+
+        }
+    } catch (...) {
+        ex = std::current_exception();
+    }
+
+    if (ex) {
+        co_await coroutine::parallel_for_each(new_ssts, [&ex] (sstables::shared_sstable sst) -> future<> {
+            if (sst) {
+                tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
+                co_await sst->unlink();
+            }
+        });
+        co_await coroutine::return_exception_ptr(std::move(ex));
+    }
+    co_return std::move(ret);
+}
+
 future<>
 table::update_cache(compaction_group& cg, lw_shared_ptr<memtable> m, std::vector<sstables::shared_sstable> ssts) {
    auto permit = co_await seastar::get_units(_sstable_set_mutation_sem, 1);
@@ -1892,7 +1983,7 @@ sstables::file_size_stats compaction_group::live_disk_space_used_full_stats() co
 }

 uint64_t storage_group::live_disk_space_used() const {
-    auto cgs = const_cast<storage_group&>(*this).compaction_groups();
+    auto cgs = const_cast<storage_group&>(*this).compaction_groups_immediate();
    return std::ranges::fold_left(cgs | std::views::transform(std::mem_fn(&compaction_group::live_disk_space_used)), uint64_t(0), std::plus{});
 }

@@ -2019,10 +2110,9 @@ future<std::vector<compaction::compaction_group_view*>> table::get_compaction_gr
    auto sgs = storage_groups_for_token_range(range);
    for (auto& sg : sgs) {
        co_await coroutine::maybe_yield();
-        auto cgs = sg->compaction_groups();
-        for (auto& cg : cgs) {
+        sg->for_each_compaction_group([&ret] (const compaction_group_ptr& cg) {
            ret.push_back(&cg->view_for_unrepaired_data());
-        }
+        });
    }
    co_return ret;
 }
@@ -2049,7 +2139,7 @@ future<compaction_reenablers_and_lock_holders> table::get_compaction_reenablers_
 future<> table::clear_being_repaired_for_range(dht::token_range range) {
    auto sgs = storage_groups_for_token_range(range);
    for (auto& sg : sgs) {
-        auto cgs = sg->compaction_groups();
+        auto cgs = sg->compaction_groups_immediate();
        for (auto& cg : cgs) {
            auto sstables = cg->all_sstables();
            co_await coroutine::maybe_yield();
@@ -2491,9 +2581,11 @@ future<> table::drop_quarantined_sstables() {
 }

 bool storage_group::no_compacted_sstable_undeleted() const {
-    return std::ranges::all_of(compaction_groups(), [] (const_compaction_group_ptr& cg) {
-        return cg->compacted_undeleted_sstables().empty();
+    auto ret = true;
+    for_each_compaction_group([&ret] (const compaction_group_ptr& cg) {
+        ret &= cg->compacted_undeleted_sstables().empty();
    });
+    return ret;
 }

 // Gets the list of all sstables in the column family, including ones that are
@@ -2612,8 +2704,8 @@ public:
    sstables::sstables_manager& get_sstables_manager() noexcept override {
        return _t.get_sstables_manager();
    }
-    sstables::shared_sstable make_sstable() const override {
-        return _t.make_sstable();
+    sstables::shared_sstable make_sstable(sstables::sstable_state state) const override {
+        return _t.make_sstable(state);
    }
    sstables::sstable_writer_config configure_writer(sstring origin) const override {
        auto cfg = _t.get_sstables_manager().configure_writer(std::move(origin));
@@ -2731,6 +2823,7 @@ future<> compaction_group::stop(sstring reason) noexcept {
    auto flush_future = co_await seastar::coroutine::as_future(flush());

    co_await _flush_gate.close();
+    co_await _sstable_add_gate.close();
  // FIXME: indentation
  _compaction_disabler_for_views.clear();
  co_await utils::get_local_injector().inject("compaction_group_stop_wait", utils::wait_for_message(60s));
@@ -2744,7 +2837,7 @@ future<> compaction_group::stop(sstring reason) noexcept {
 }

 bool compaction_group::empty() const noexcept {
-    return _memtables->empty() && live_sstable_count() == 0;
+    return _memtables->empty() && live_sstable_count() == 0 && _sstable_add_gate.get_count() == 0;
 }

 const schema_ptr& compaction_group::schema() const {
@@ -2757,9 +2850,9 @@ void compaction_group::clear_sstables() {
 }

 void storage_group::clear_sstables() {
-    for (auto cg : compaction_groups()) {
+    for_each_compaction_group([] (const compaction_group_ptr& cg) {
        cg->clear_sstables();
-    }
+    });
 }

 table::table(schema_ptr schema, config config, lw_shared_ptr<const storage_options> sopts, compaction::compaction_manager& compaction_manager,
@@ -3086,7 +3179,7 @@ future<> table::update_repaired_at_for_merge() {
    for (auto& x : sgs) {
        auto sg = x.second;
        if (sg) {
-            auto cgs = sg->compaction_groups();
+            auto cgs = sg->compaction_groups_immediate();
            for (auto& cg : cgs) {
                auto cre = co_await cg->get_compaction_manager().stop_and_disable_compaction("update_repaired_at_for_merge", cg->view_for_unrepaired_data());
                co_await cg->update_repaired_at_for_merge();
@@ -3200,7 +3293,7 @@ db::replay_position table::highest_flushed_replay_position() const {
 }

 struct manifest_json : public json::json_base {
-    json::json_chunked_list<sstring> files;
+    json::json_chunked_list<std::string_view> files;

    manifest_json() {
        register_params();
@@ -3219,22 +3312,25 @@ private:
    }
 };

-future<>
-table::seal_snapshot(sstring jsondir, std::vector<snapshot_file_set> file_sets) {
+class snapshot_writer {
+public:
+    virtual future<> init() = 0;
+    virtual future<> sync() = 0;
+    virtual future<output_stream<char>> stream_for(sstring component) = 0;
+    virtual ~snapshot_writer() = default;
+};
+
+using snapshot_file_set = foreign_ptr<std::unique_ptr<std::unordered_set<sstring>>>;
+
+static future<> write_manifest(snapshot_writer& writer, std::vector<snapshot_file_set> file_sets) {
    manifest_json manifest;
    for (const auto& fsp : file_sets) {
        for (auto& rf : *fsp) {
-            manifest.files.push(std::move(rf));
+            manifest.files.push(std::string_view(rf));
        }
    }
    auto streamer = json::stream_object(std::move(manifest));
-    auto jsonfile = jsondir + "/manifest.json";
-
-    tlogger.debug("Storing manifest {}", jsonfile);
-
-    co_await io_check([jsondir] { return recursive_touch_directory(jsondir); });
-    auto f = co_await open_checked_file_dma(general_disk_error_handler, jsonfile, open_flags::wo | open_flags::create | open_flags::truncate);
-    auto out = co_await make_file_output_stream(std::move(f));
+    auto out = co_await writer.stream_for("manifest.json");
    std::exception_ptr ex;
    try {
        co_await streamer(std::move(out));
@@ -3245,19 +3341,27 @@ table::seal_snapshot(sstring jsondir, std::vector<snapshot_file_set> file_sets)
    if (ex) {
        co_await coroutine::return_exception_ptr(std::move(ex));
    }
-
-    co_await io_check(sync_directory, std::move(jsondir));
 }

-future<> table::write_schema_as_cql(const global_table_ptr& table_shards, sstring dir) const {
-    auto schema_desc = schema()->describe(
-            replica::make_schema_describe_helper(table_shards),
-            cql3::describe_option::STMTS);
-
+/*!
+ * \brief write the schema to a 'schema.cql' file through the given snapshot writer.
+ *
+ * When doing a snapshot, the snapshot directory contains a 'schema.cql' file
+ * with a CQL command that can be used to generate the schema.
+ * The content is similar to the result of the CQL DESCRIBE command of the table.
+ *
+ * When a schema has indexes, local indexes or views, those indexes and views
+ * are represented by their own schemas.
+ * In those cases, the method would write the relevant information for each of the schemas:
+ *
+ * The schema of the base table would output a file with the CREATE TABLE command
+ * and the schema of the view that is used for the index would output a file with the
+ * CREATE INDEX command.
+ * The same is true for local index and MATERIALIZED VIEW.
+ */
+static future<> write_schema_as_cql(snapshot_writer& writer, cql3::description schema_desc) {
    auto schema_description = std::move(*schema_desc.create_statement);
-    auto schema_file_name = dir + "/schema.cql";
-    auto f = co_await open_checked_file_dma(general_disk_error_handler, schema_file_name, open_flags::wo | open_flags::create | open_flags::truncate);
-    auto out = co_await make_file_output_stream(std::move(f));
+    auto out = co_await writer.stream_for("schema.cql");
    std::exception_ptr ex;

    auto view = managed_bytes_view(schema_description.as_managed_bytes());
@@ -3278,73 +3382,87 @@ future<> table::write_schema_as_cql(const global_table_ptr& table_shards, sstrin
    }
 }

-// Runs the orchestration code on an arbitrary shard to balance the load.
-future<> table::snapshot_on_all_shards(sharded<database>& sharded_db, const global_table_ptr& table_shards, sstring name) {
-    auto* so = std::get_if<storage_options::local>(&table_shards->get_storage_options().value);
-    if (so == nullptr) {
-        throw std::runtime_error("Snapshotting non-local tables is not implemented");
+class local_snapshot_writer : public snapshot_writer {
+    std::filesystem::path _dir;
+
+public:
+    local_snapshot_writer(std::filesystem::path dir, sstring name)
+            : _dir(dir / sstables::snapshots_dir / name)
+    {}
+    future<> init() override {
+        co_await io_check([this] { return recursive_touch_directory(_dir.native()); });
    }
-    if (so->dir.empty()) { // virtual tables don't have initialized local storage
+    future<> sync() override {
+        co_await io_check([this] { return sync_directory(_dir.native()); });
+    }
+    future<output_stream<char>> stream_for(sstring component) override {
+        auto file_name = (_dir / component).native();
+        auto f = co_await open_checked_file_dma(general_disk_error_handler, file_name, open_flags::wo | open_flags::create | open_flags::truncate);
+        co_return co_await make_file_output_stream(std::move(f));
+    }
+};
+
+// Runs the orchestration code on an arbitrary shard to balance the load.
+future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, const global_table_ptr& table_shards, sstring name) {
+    auto writer = std::visit(overloaded_functor{
+        [&name] (const data_dictionary::storage_options::local& loc) -> std::unique_ptr<snapshot_writer> {
+            if (loc.dir.empty()) {
+                // virtual tables don't have initialized local storage
+                return nullptr;
+            }
+
+            return std::make_unique<local_snapshot_writer>(loc.dir, name);
+        },
+        [] (const data_dictionary::storage_options::s3&) -> std::unique_ptr<snapshot_writer> {
+            throw std::runtime_error("Snapshotting non-local tables is not implemented");
+        }
+    }, table_shards->get_storage_options().value);
+    if (!writer) {
        co_return;
    }

-    auto jsondir = (so->dir / sstables::snapshots_dir / name).native();
-    auto orchestrator = std::hash<sstring>()(jsondir) % smp::count;
-
+    auto orchestrator = std::hash<sstring>()(name) % smp::count;
    co_await smp::submit_to(orchestrator, [&] () -> future<> {
        auto& t = *table_shards;
        auto s = t.schema();
-        tlogger.debug("Taking snapshot of {}.{}: directory={}", s->ks_name(), s->cf_name(), jsondir);
+        tlogger.debug("Taking snapshot of {}.{}: name={}", s->ks_name(), s->cf_name(), name);

-        std::vector<table::snapshot_file_set> file_sets;
-        file_sets.reserve(smp::count);
+        std::vector<snapshot_file_set> file_sets(smp::count);

-        co_await io_check([&jsondir] { return recursive_touch_directory(jsondir); });
-        co_await coroutine::parallel_for_each(smp::all_cpus(), [&] (unsigned shard) -> future<> {
-            file_sets.emplace_back(co_await smp::submit_to(shard, [&] {
-                return table_shards->take_snapshot(jsondir);
-            }));
+        co_await writer->init();
+        co_await smp::invoke_on_all([&] -> future<> {
+            auto& t = *table_shards;
+            auto [tables, permit] = co_await t.snapshot_sstables();
+            auto table_names = co_await t.get_sstables_manager().take_snapshot(std::move(tables), name);
+            file_sets[this_shard_id()] = make_foreign(std::make_unique<std::unordered_set<sstring>>(std::move(table_names)));
        });
-        co_await io_check(sync_directory, jsondir);
+        co_await writer->sync();

-        co_await t.finalize_snapshot(table_shards, std::move(jsondir), std::move(file_sets));
+        std::exception_ptr ex;
+
+        tlogger.debug("snapshot {}: writing schema.cql", name);
+        auto schema_desc = s->describe(replica::make_schema_describe_helper(table_shards), cql3::describe_option::STMTS);
+        co_await write_schema_as_cql(*writer, std::move(schema_desc)).handle_exception([&] (std::exception_ptr ptr) {
+            tlogger.error("Failed writing schema file in snapshot in {} with exception {}", name, ptr);
+            ex = std::move(ptr);
+        });
+        tlogger.debug("snapshot {}: seal_snapshot", name);
+        co_await write_manifest(*writer, std::move(file_sets)).handle_exception([&] (std::exception_ptr ptr) {
+            tlogger.error("Failed to seal snapshot in {}: {}.", name, ptr);
+            ex = std::move(ptr);
+        });
+        if (ex) {
+            co_await coroutine::return_exception_ptr(std::move(ex));
+        }
+
+        co_await writer->sync();
    });
 }

-future<table::snapshot_file_set> table::take_snapshot(sstring jsondir) {
-    tlogger.trace("take_snapshot {}", jsondir);
-
-    auto sstable_deletion_guard = co_await get_sstable_list_permit();
-
+future<std::pair<std::vector<sstables::shared_sstable>, table::sstable_list_permit>> table::snapshot_sstables() {
+    auto permit = co_await get_sstable_list_permit();
    auto tables = *_sstables->all() | std::ranges::to<std::vector<sstables::shared_sstable>>();
-    auto table_names = std::make_unique<std::unordered_set<sstring>>();
-
-    co_await _sstables_manager.dir_semaphore().parallel_for_each(tables, [&jsondir, &table_names] (sstables::shared_sstable sstable) {
-        table_names->insert(sstable->component_basename(sstables::component_type::Data));
-        return io_check([sstable, &dir = jsondir] {
-            return sstable->snapshot(dir);
-        });
-    });
-    co_return make_foreign(std::move(table_names));
-}
-
-future<> table::finalize_snapshot(const global_table_ptr& table_shards, sstring jsondir, std::vector<snapshot_file_set> file_sets) {
-    std::exception_ptr ex;
-
-    tlogger.debug("snapshot {}: writing schema.cql", jsondir);
-    co_await write_schema_as_cql(table_shards, jsondir).handle_exception([&] (std::exception_ptr ptr) {
-        tlogger.error("Failed writing schema file in snapshot in {} with exception {}", jsondir, ptr);
-        ex = std::move(ptr);
-    });
-    tlogger.debug("snapshot {}: seal_snapshot", jsondir);
-    co_await seal_snapshot(jsondir, std::move(file_sets)).handle_exception([&] (std::exception_ptr ptr) {
-        tlogger.error("Failed to seal snapshot in {}: {}.", jsondir, ptr);
-        ex = std::move(ptr);
-    });
-
-    if (ex) {
-        co_await coroutine::return_exception_ptr(std::move(ex));
-    }
+    co_return std::make_pair(std::move(tables), std::move(permit));
 }

 future<bool> table::snapshot_exists(sstring tag) {
@@ -3356,6 +3474,7 @@ future<bool> table::snapshot_exists(sstring tag) {
    sstring jsondir = (so->dir / sstables::snapshots_dir / tag).native();
    bool exists = false;
    try {
+        future<stat_data> (&file_stat)(std::string_view, follow_symlink) noexcept = seastar::file_stat;
        auto sd = co_await io_check(file_stat, jsondir, follow_symlink::no);
        if (sd.type != directory_entry_type::directory) {
            throw std::error_code(ENOTDIR, std::system_category());
@@ -3385,16 +3504,15 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
                continue;
            }

-            lister::scan_dir(snapshots_dir,  lister::dir_entry_types::of<directory_entry_type::directory>(), [datadir, &all_snapshots] (fs::path snapshots_dir, directory_entry de) {
-                auto snapshot_name = de.name;
+            auto lister = directory_lister(snapshots_dir, lister::dir_entry_types::of<directory_entry_type::directory>());
+            while (auto de = lister.get().get()) {
+                auto snapshot_name = de->name;
                all_snapshots.emplace(snapshot_name, snapshot_details());
-                return get_snapshot_details(snapshots_dir / fs::path(snapshot_name), datadir).then([&all_snapshots, snapshot_name] (auto details) {
-                    auto& sd = all_snapshots.at(snapshot_name);
-                    sd.total += details.total;
-                    sd.live += details.live;
-                    return make_ready_future<>();
-                });
-            }).get();
+                auto details = get_snapshot_details(snapshots_dir / fs::path(snapshot_name), datadir).get();
+                auto& sd = all_snapshots.at(snapshot_name);
+                sd.total += details.total;
+                sd.live += details.live;
+            }
        }
        return all_snapshots;
    });
@@ -3402,38 +3520,65 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot

 future<table::snapshot_details> table::get_snapshot_details(fs::path snapshot_dir, fs::path datadir) {
    table::snapshot_details details{};
+    file snapshot_directory = co_await io_check(open_directory, snapshot_dir.native());
+    file data_directory = co_await io_check(open_directory, datadir.native());
+    file staging_directory;
+    std::optional<fs::path> staging_dir = datadir / sstables::staging_dir;
+    if (!co_await file_exists(staging_dir->native())) {
+        staging_dir.reset();
+    } else {
+        staging_directory = co_await io_check(open_directory, staging_dir->native());
+    }

-    co_await lister::scan_dir(snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>(), [datadir, &details] (fs::path snapshot_dir, directory_entry de) -> future<> {
-        auto sd = co_await io_check(file_stat, (snapshot_dir / de.name).native(), follow_symlink::no);
+    auto lister = directory_lister(snapshot_directory, snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>());
+    while (auto de = co_await lister.get()) {
+        const auto& name = de->name;
+        future<stat_data> (&file_stat)(file& directory, std::string_view name, follow_symlink) noexcept = seastar::file_stat;
+        auto sd = co_await io_check(file_stat, snapshot_directory, name, follow_symlink::no);
        auto size = sd.allocated_size;

        // The manifest and schema.sql files are the only files expected to be in this directory not belonging to the SSTable.
        //
        // All the others should just generate an exception: there is something wrong, so don't blindly
        // add it to the size.
-        if (de.name != "manifest.json" && de.name != "schema.cql") {
+        if (name != "manifest.json" && name != "schema.cql") {
            details.total += size;
+            if (sd.number_of_links == 1) {
+                // File exists only in the snapshot directory.
+                details.live += size;
+                continue;
+            }
+            // If the number of links is greater than 1, the file may still be linked only from another snapshot,
+            // so check the staging dir and the datadir for the file too.
        } else {
-            size = 0;
+            continue;
        }

-        try {
+        auto exists_in_dir = [&] (file& dir, const fs::path& path, std::string_view name) -> future<bool> {
+          try {
            // File exists in the main SSTable directory. Snapshots are not contributing to size
-            auto psd = co_await io_check(file_stat, (datadir / de.name).native(), follow_symlink::no);
+            auto psd = co_await io_check(file_stat, dir, name, follow_symlink::no);
            // File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
            if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
                dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
-                        (datadir / de.name).native(), psd.device_id, psd.inode_number, psd.size,
-                        (snapshot_dir / de.name).native(), sd.device_id, sd.inode_number, sd.size);
-                details.live += size;
+                        (path / name).native(), psd.device_id, psd.inode_number, psd.size,
+                        (snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
+                co_return false;
            }
-        } catch (std::system_error& e) {
+            co_return true;
+          } catch (std::system_error& e) {
            if (e.code() != std::error_code(ENOENT, std::system_category())) {
                throw;
            }
+            co_return false;
+          }
+        };
+        // Check staging dir first, as files might be moved from there to the datadir concurrently with this check
+        if ((!staging_dir || !co_await exists_in_dir(staging_directory, *staging_dir, name)) &&
+                !co_await exists_in_dir(data_directory, datadir, name)) {
            details.live += size;
        }
-    });
+    }

    co_return details;
 }
@@ -3447,7 +3592,7 @@ future<> compaction_group::flush() noexcept {
 }

 future<> storage_group::flush() noexcept {
-    for (auto& cg : compaction_groups()) {
+    for (auto& cg : compaction_groups_immediate()) {
        co_await cg->flush();
    }
 }
@@ -3465,7 +3610,11 @@ size_t compaction_group::memtable_count() const noexcept {
 }

 size_t storage_group::memtable_count() const {
-    return std::ranges::fold_left(compaction_groups() | std::views::transform(std::mem_fn(&compaction_group::memtable_count)), size_t(0), std::plus{});
+    size_t count = 0;
+    for_each_compaction_group([&count] (const compaction_group_ptr& cg) {
+        count += cg->memtable_count();
+    });
+    return count;
 }

 future<> table::flush(std::optional<db::replay_position> pos) {
@@ -3483,7 +3632,7 @@ future<> table::flush(std::optional<db::replay_position> pos) {
 }

 bool storage_group::can_flush() const {
-    return std::ranges::any_of(compaction_groups(), std::mem_fn(&compaction_group::can_flush));
+    return std::ranges::any_of(compaction_groups_immediate(), std::mem_fn(&compaction_group::can_flush));
 }

 bool table::can_flush() const {
@@ -3514,9 +3663,11 @@ bool storage_group::compaction_disabled() const {
    // Compaction group that has been stopped will be excluded, since the group will not be available for a caller
    // to disable compaction explicitly on it, e.g. on truncate, and the caller might want to perform a check
    // that compaction was disabled on all groups. Stopping a group is equivalent to disabling compaction on it.
-    return std::ranges::all_of(compaction_groups()
-            | std::views::filter(std::not_fn(&compaction_group::stopped)), [] (const_compaction_group_ptr& cg) {
-        return cg->compaction_disabled(); });
+    bool all_disabled = true;
+    for_each_compaction_group([&all_disabled] (const compaction_group_ptr& cg) {
+        all_disabled &= cg->stopped() || cg->compaction_disabled();
+    });
+    return all_disabled;
 }

 // NOTE: does not need to be futurized, but might eventually, depending on
@@ -4301,11 +4452,11 @@ std::vector<mutation_source> table::select_memtables_as_mutation_sources(dht::to
    auto& sg = storage_group_for_token(token);
    std::vector<mutation_source> mss;
    mss.reserve(sg.memtable_count());
-    for (auto& cg : sg.compaction_groups()) {
+    sg.for_each_compaction_group([&mss] (const compaction_group_ptr &cg) {
        for (auto& mt : *cg->memtables()) {
            mss.emplace_back(mt->as_data_source());
        }
-    }
+    });
    return mss;
 }

@@ -4465,7 +4616,7 @@ future<> compaction_group::cleanup() {
 }

 future<> table::clear_inactive_reads_for_tablet(database& db, storage_group& sg) {
-    for (auto& cg_ptr : sg.compaction_groups()) {
+    for (auto& cg_ptr : sg.compaction_groups_immediate()) {
        co_await db.clear_inactive_reads_for_tablet(_schema->id(), cg_ptr->token_range());
    }
 }
@@ -4506,13 +4657,13 @@ future<> table::stop_compaction_groups(storage_group& sg) {
 }

 future<> table::flush_compaction_groups(storage_group& sg) {
-    for (auto& cg_ptr : sg.compaction_groups()) {
+    for (auto& cg_ptr : sg.compaction_groups_immediate()) {
        co_await cg_ptr->flush();
    }
 }

 future<> table::cleanup_compaction_groups(database& db, db::system_keyspace& sys_ks, locator::tablet_id tid, storage_group& sg) {
-    for (auto& cg_ptr : sg.compaction_groups()) {
+    for (auto& cg_ptr : sg.compaction_groups_immediate()) {
        co_await cg_ptr->cleanup();
        // FIXME: at this point _highest_rp might be greater than the replay_position of the last cleaned mutation,
        // and can cover some mutations which weren't cleaned, causing them to be lost during replay.
--- a/rust/CMakeLists.txt
+++ b/rust/CMakeLists.txt
@@ -1,6 +1,13 @@
 find_program(CARGO cargo
  REQUIRED)

+# Set up RUSTC_WRAPPER for sccache support if configured
+if(Scylla_RUSTC_WRAPPER)
+  set(RUSTC_WRAPPER_ENV "RUSTC_WRAPPER=${Scylla_RUSTC_WRAPPER}")
+else()
+  set(RUSTC_WRAPPER_ENV "")
+endif()
+
 function(add_rust_library name)
  # used for profiles defined in Cargo.toml
  if(CMAKE_CONFIGURATION_TYPES)
@@ -16,7 +23,7 @@ function(add_rust_library name)
  set(library ${target_dir}/lib${name}.a)
  add_custom_command(
    OUTPUT ${library}
-    COMMAND ${CMAKE_COMMAND} -E env CARGO_BUILD_DEP_INFO_BASEDIR=. ${CARGO} build --locked --target-dir=${target_dir} --profile=${profile}
+    COMMAND ${CMAKE_COMMAND} -E env CARGO_BUILD_DEP_INFO_BASEDIR=. ${RUSTC_WRAPPER_ENV} ${CARGO} build --locked --target-dir=${target_dir} --profile=${profile}
    COMMAND ${CMAKE_COMMAND} -E copy ${target_dir}/${profile}/lib${name}.a ${library}
    DEPENDS Cargo.lock
    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
--- a/scripts/tablet-mon.py
+++ b/scripts/tablet-mon.py
@@ -390,9 +390,11 @@ dark_green = (195, 215, 195)
 light_red = (255, 200, 200)
 light_green = (200, 255, 200)
 light_gray = (240, 240, 240)
+scylla_blue = (87, 209, 229)

 tablet_colors = {
    (Tablet.STATE_NORMAL, None): GRAY,
+    (Tablet.STATE_NORMAL, 'repair'): scylla_blue,
    (Tablet.STATE_JOINING, 'allow_write_both_read_old'): dark_green,
    (Tablet.STATE_LEAVING, 'allow_write_both_read_old'): dark_red,
    (Tablet.STATE_JOINING, 'write_both_read_old'): dark_green,
@@ -532,6 +534,8 @@ def update_from_cql(initial=False):
                state = (Tablet.STATE_JOINING, tablet.stage)
            elif replica in leaving:
                state = (Tablet.STATE_LEAVING, tablet.stage)
+            elif tablet.stage == 'repair':
+                state = (Tablet.STATE_NORMAL, tablet.stage)
            else:
                state = (Tablet.STATE_NORMAL, None)

--- a/scylla-gdb.py
+++ b/scylla-gdb.py
@@ -4109,6 +4109,16 @@ class scylla_fiber(gdb.Command):
                    return res
            return None

+        # Coroutines need special handling as they allocate the future object on their frame.
+        if name.strip().endswith('[clone .resume]'):
+            self._maybe_log(f"Current task is a coroutine, trying to find the promise in the coroutine frame: 0x{ptr_meta.ptr:x}+{ptr_meta.size}\n", verbose)
+            # Skip the first two pointers, these are the coroutine resume and destroy function pointers.
+            for maybe_tptr in range(ptr_meta.ptr + 2 * _vptr_type().sizeof, ptr_meta.ptr + ptr_meta.size, _vptr_type().sizeof):
+                res = self._probe_pointer(maybe_tptr, scanned_region_size, using_seastar_allocator, verbose)
+                if res is not None:
+                    return res
+            return None
+
        if name.startswith('vtable for seastar::internal::when_all_state'):
            when_all_state_base_ptr_type = gdb.lookup_type('seastar::internal::when_all_state_base').pointer()
            when_all_state_base = gdb.Value(int(ptr_meta.ptr)).reinterpret_cast(when_all_state_base_ptr_type)
@@ -4195,6 +4205,9 @@ class scylla_fiber(gdb.Command):
        parser.add_argument("--force-fallback-mode", action="store_true", default=False,
                help="Force fallback mode to be used, that is, scan a fixed-size region of memory"
                " (configurable via --scanned-region-size), instead of relying on `scylla ptr` for determining the size of the task objects.")
+        parser.add_argument("--direction", action="store", choices=['forward', 'backward', 'both'], default='both',
+                help="Direction in which to walk the continuation chain. 'forward' walks futures waiting on the given task,"
+                " 'backward' walks futures the given task is waiting on, 'both' does both.")
        parser.add_argument("task", action="store", help="An expression that evaluates to a valid `seastar::task*` value. Cannot contain white-space.")

        try:
@@ -4224,14 +4237,20 @@ class scylla_fiber(gdb.Command):
                gdb.write("Provided pointer 0x{:016x} is not an object managed by seastar or not a task pointer\n".format(initial_task_ptr))
                return

-            backwards_fiber = self._walk(self._walk_backward, this_task[0], this_task[2], args.max_depth, args.scanned_region_size, using_seastar_allocator, args.verbose)
+            if (args.direction == 'backward' or args.direction == 'both'):
+                backwards_fiber = self._walk(self._walk_backward, this_task[0], this_task[2], args.max_depth, args.scanned_region_size, using_seastar_allocator, args.verbose)
+            else:
+                backwards_fiber = []

            for i, task_info in enumerate(reversed(backwards_fiber)):
                format_task_line(i - len(backwards_fiber), task_info)

            format_task_line(0, this_task)

-            forward_fiber = self._walk(self._walk_forward, this_task[0], this_task[2], args.max_depth, args.scanned_region_size, using_seastar_allocator, args.verbose)
+            if (args.direction == 'forward' or args.direction == 'both'):
+                forward_fiber = self._walk(self._walk_forward, this_task[0], this_task[2], args.max_depth, args.scanned_region_size, using_seastar_allocator, args.verbose)
+            else:
+                forward_fiber = []

            for i, task_info in enumerate(forward_fiber):
                format_task_line(i + 1, task_info)
@@ -5104,10 +5123,15 @@ class scylla_small_objects(gdb.Command):
            span_end = int(span_start + span.size() * self._page_size)

            # span's free list
-            span_next_free = span.page['freelist']
-            while span_next_free:
-                self._free_in_span.add(int(span_next_free))
-                span_next_free = span_next_free['next']
+            try:
+                span_next_free = span.page['freelist']
+                while span_next_free:
+                    self._free_in_span.add(int(span_next_free))
+                    span_next_free = span_next_free['next']
+            except gdb.error:
+                # This loop sometimes steps on "Cannot access memory at address", causing CI instability.
+                # Catch the exception and break the freelist traversal loop gracefully.
+                gdb.write(f"Warning: error traversing freelist of span [0x{span_start:x}, 0x{span_end:x}), some of the listed objects in this span may be free objects.\n")

            return span_start, span_end

@@ -5850,6 +5874,18 @@ class scylla_read_stats(gdb.Command):
    def __init__(self):
        gdb.Command.__init__(self, 'scylla read-stats', gdb.COMMAND_USER, gdb.COMPLETE_COMMAND)

+    @staticmethod
+    def foreach_permit(semaphore, fn):
+        """Mirror of reader_concurrency_semaphore::foreach_permit()"""
+        for permit_list in (
+                semaphore['_permit_list'],
+                semaphore['_wait_list']['_admission_queue'],
+                semaphore['_wait_list']['_memory_queue'],
+                semaphore['_ready_list'],
+                semaphore['_inactive_reads']):
+            for permit in intrusive_list(permit_list):
+                fn(permit)
+
    @staticmethod
    def dump_reads_from_semaphore(semaphore):
        try:
@@ -5864,7 +5900,7 @@ class scylla_read_stats(gdb.Command):
        permit_summaries = defaultdict(permit_stats)
        total = permit_stats()

-        for permit in intrusive_list(permit_list):
+        def summarize_permit(permit):
            schema_name = "*.*"
            schema = permit['_schema']
            try:
@@ -5884,6 +5920,8 @@ class scylla_read_stats(gdb.Command):
            permit_summaries[(schema_name, description, state)].add(summary)
            total.add(summary)

+        scylla_read_stats.foreach_permit(semaphore, summarize_permit)
+
        if not permit_summaries:
            return

@@ -5893,7 +5931,9 @@ class scylla_read_stats(gdb.Command):
        inactive_read_count = len(intrusive_list(semaphore['_inactive_reads']))
        waiters = int(semaphore["_stats"]["waiters"])

-        gdb.write("Semaphore {} with: {}/{} count and {}/{} memory resources, queued: {}, inactive={}\n".format(
+        gdb.write("Semaphore ({}*) 0x{:x} {} with: {}/{} count and {}/{} memory resources, queued: {}, inactive={}\n".format(
+                semaphore.type.name,
+                int(semaphore.address),
                semaphore_name,
                initial_count - int(semaphore['_resources']['count']), initial_count,
                initial_memory - int(semaphore['_resources']['memory']), initial_memory,
--- a/2
+++ b/2
--- a/service/client_routes.cc
+++ b/service/client_routes.cc
@@ -82,7 +82,7 @@ seastar::future<> service::client_routes_service::set_client_routes_inner(const
    auto guard = co_await _group0_client.start_operation(_abort_source, service::raft_timeout{});
    utils::chunked_vector<canonical_mutation> cmuts;

-    for (auto& entry : route_entries) {
+    for (const auto& entry : route_entries) {
        auto mut = co_await make_update_client_route_mutation(guard.write_timestamp(), entry);
        cmuts.emplace_back(std::move(mut));
    }
@@ -103,24 +103,24 @@ seastar::future<> service::client_routes_service::delete_client_routes_inner(con
    co_await _group0_client.add_entry(std::move(cmd), std::move(guard), _abort_source);
 }

-seastar::future<> service::client_routes_service::set_client_routes(const std::vector<service::client_routes_service::client_route_entry>& route_entries) {
-    return container().invoke_on(0, [route_entries = std::move(route_entries)] (service::client_routes_service& cr) -> future<> {
-        return cr.with_retry([&] {
+seastar::future<> service::client_routes_service::set_client_routes(std::vector<service::client_routes_service::client_route_entry> route_entries) {
+    return container().invoke_on(0, [route_entries = std::move(route_entries)] (service::client_routes_service& cr) mutable -> future<> {
+        return cr.with_retry([&cr, route_entries = std::move(route_entries)]  {
            return cr.set_client_routes_inner(route_entries);
        });
    });
 }

-seastar::future<> service::client_routes_service::delete_client_routes(const std::vector<service::client_routes_service::client_route_key>& route_keys) {
-    return container().invoke_on(0, [route_keys = std::move(route_keys)] (service::client_routes_service& cr) -> future<> {
-        return cr.with_retry([&] {
+seastar::future<> service::client_routes_service::delete_client_routes(std::vector<service::client_routes_service::client_route_key> route_keys) {
+    return container().invoke_on(0, [route_keys = std::move(route_keys)] (service::client_routes_service& cr) mutable -> future<> {
+        return cr.with_retry([&cr, route_keys = std::move(route_keys)]  {
            return cr.delete_client_routes_inner(route_keys);
        });
    });
 }

 template <typename Func>
-seastar::future<> service::client_routes_service::with_retry(Func&& func) const {
+seastar::future<> service::client_routes_service::with_retry(Func func) const {
    int retries = 10;
    while (true) {
        try {
--- a/service/client_routes.hh
+++ b/service/client_routes.hh
@@ -66,8 +66,8 @@ public:
    future<mutation> make_remove_client_route_mutation(api::timestamp_type ts, const service::client_routes_service::client_route_key& key);
    future<mutation> make_update_client_route_mutation(api::timestamp_type ts, const client_route_entry& entry);
    future<std::vector<client_route_entry>> get_client_routes() const;
-    seastar::future<> set_client_routes(const std::vector<service::client_routes_service::client_route_entry>& route_entries);
-    seastar::future<> delete_client_routes(const std::vector<service::client_routes_service::client_route_key>& route_keys);
+    seastar::future<> set_client_routes(std::vector<service::client_routes_service::client_route_entry> route_entries);
+    seastar::future<> delete_client_routes(std::vector<service::client_routes_service::client_route_key> route_keys);


    // notifications
@@ -76,7 +76,7 @@ private:
    seastar::future<> set_client_routes_inner(const std::vector<service::client_routes_service::client_route_entry>& route_entries);
    seastar::future<> delete_client_routes_inner(const std::vector<service::client_routes_service::client_route_key>& route_keys);
    template <typename Func>
-    seastar::future<> with_retry(Func&& func) const;
+    seastar::future<> with_retry(Func func) const;

    abort_source& _abort_source;
    gms::feature_service& _feature_service;
--- a/service/client_state.cc
+++ b/service/client_state.cc
@@ -224,7 +224,13 @@ future<> service::client_state::has_access(const sstring& ks, auth::command_desc
                ks + " can be granted only SELECT or DESCRIBE permissions to a non-superuser.");
    }

-    if (cmd.resource.kind() == auth::resource_kind::data && cmd.permission == auth::permission::SELECT && is_vector_indexed.has_value() && is_vector_indexed.value()) {
+    static const std::unordered_set<auth::resource> vector_search_system_resources = {
+        auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::GROUP0_HISTORY),
+        auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::VERSIONS),
+    };
+
+    if ((cmd.resource.kind() == auth::resource_kind::data && cmd.permission == auth::permission::SELECT && is_vector_indexed.has_value() && is_vector_indexed.value()) ||
+        (cmd.permission == auth::permission::SELECT && vector_search_system_resources.contains(cmd.resource))) {

        co_return co_await ensure_has_permission<auth::command_desc_with_permission_set>({auth::permission_set::of<auth::permission::SELECT, auth::permission::VECTOR_SEARCH_INDEXING>(), cmd.resource});

@@ -344,3 +350,17 @@ void service::client_state::update_per_service_level_params(qos::service_level_o

    _workload_type = slo.workload;
 }
+
+future<> service::client_state::set_client_options(
+        client_options_cache_type& keys_and_values_cache,
+        const std::unordered_map<sstring, sstring>& client_options) {
+    for (const auto& [key, value] : client_options) {
+        auto cached_key = co_await keys_and_values_cache.get_or_load(key, [] (const client_options_cache_key_type&) {
+            return make_ready_future<options_cache_value_type>(options_cache_value_type{});
+        });
+        auto cached_value = co_await keys_and_values_cache.get_or_load(value, [] (const client_options_cache_key_type&) {
+            return make_ready_future<options_cache_value_type>(options_cache_value_type{});
+        });
+        _client_options.emplace_back(std::move(cached_key), std::move(cached_value));
+    }
+}
--- a/service/client_state.hh
+++ b/service/client_state.hh
@@ -18,6 +18,7 @@
 #include "auth/authenticated_user.hh"
 #include "auth/authenticator.hh"
 #include "auth/permission.hh"
+#include "client_data.hh"

 #include "transport/cql_protocol_extension.hh"
 #include "service/qos/service_level_controller.hh"
@@ -102,7 +103,8 @@ private:
    private volatile String keyspace;
 #endif
    std::optional<auth::authenticated_user> _user;
-    std::optional<sstring> _driver_name, _driver_version;
+    std::optional<client_options_cache_entry_type> _driver_name, _driver_version;
+	std::list<client_option_key_value_cached_entry> _client_options;

    auth_state _auth_state = auth_state::UNINITIALIZED;
    bool _control_connection = false;
@@ -151,18 +153,33 @@ public:
        return _control_connection = true;
    }

-    std::optional<sstring> get_driver_name() const {
+    std::optional<client_options_cache_entry_type> get_driver_name() const {
        return _driver_name;
    }
-    void set_driver_name(sstring driver_name) {
-        _driver_name = std::move(driver_name);
+    future<> set_driver_name(client_options_cache_type& keys_and_values_cache, const sstring& driver_name) {
+        _driver_name = co_await keys_and_values_cache.get_or_load(driver_name, [] (const client_options_cache_key_type&) {
+            return make_ready_future<options_cache_value_type>(options_cache_value_type{});
+        });
    }

-    std::optional<sstring> get_driver_version() const {
+    const auto& get_client_options() const {
+        return _client_options;
+    }
+
+    future<> set_client_options(
+        client_options_cache_type& keys_and_values_cache,
+        const std::unordered_map<sstring, sstring>& client_options);
+
+    std::optional<client_options_cache_entry_type> get_driver_version() const {
        return _driver_version;
    }
-    void set_driver_version(sstring driver_version) {
-        _driver_version = std::move(driver_version);
+    future<> set_driver_version(
+        client_options_cache_type& keys_and_values_cache,
+        const sstring& driver_version)
+    {
+        _driver_version = co_await keys_and_values_cache.get_or_load(driver_version, [] (const client_options_cache_key_type&) {
+            return make_ready_future<options_cache_value_type>(options_cache_value_type{});
+        });
    }

    client_state(external_tag,
--- a/service/raft/group0_state_machine.cc
+++ b/service/raft/group0_state_machine.cc
@@ -79,7 +79,8 @@ group0_state_machine::group0_state_machine(raft_group0_client& client, migration
        // the node won't try to fetch a topology snapshot if the other
        // node doesn't support it yet.
        _topology_change_enabled = true;
-    })) {
+    }))
+    , _in_memory_state_machine_enabled(utils::get_local_injector().is_enabled("group0_enable_sm_immediately")) {
    _state_id_handler.run();
 }

@@ -154,6 +155,27 @@ static future<> notify_client_route_change_if_needed(storage_service& storage_se
    }
 }

+// Meant to be used only in error injections.
+static future<> maybe_partially_apply_cdc_generation_deletion_then_get_stuck(
+        std::function<future<>(utils::chunked_vector<frozen_mutation_and_schema>)> mutate,
+        const utils::chunked_vector<frozen_mutation_and_schema>& mutations) {
+
+    auto is_cdc_generation_data_clearing_mutation = [] (const frozen_mutation_and_schema& fm_s) {
+        return fm_s.s->id() == db::system_keyspace::cdc_generations_v3()->id()
+                && !fm_s.fm.unfreeze(fm_s.s).partition().row_tombstones().empty();
+    };
+
+    if (std::any_of(mutations.begin(), mutations.end(), is_cdc_generation_data_clearing_mutation)) {
+        utils::chunked_vector<frozen_mutation_and_schema> filtered_mutations;
+        std::copy_if(mutations.begin(), mutations.end(), std::back_inserter(filtered_mutations), is_cdc_generation_data_clearing_mutation);
+        co_await mutate(std::move(filtered_mutations));
+        while (true) {
+            slogger.info("group0 has hung on error injection, waiting for the process to be killed");
+            co_await seastar::sleep(std::chrono::seconds(1));
+        }
+    }
+}
+
 future<> write_mutations_to_database(storage_service& storage_service, storage_proxy& proxy, gms::inet_address from, utils::chunked_vector<canonical_mutation> cms) {
    utils::chunked_vector<frozen_mutation_and_schema> mutations;
    client_routes_service::client_route_keys client_routes_update;
@@ -178,7 +200,13 @@ future<> write_mutations_to_database(storage_service& storage_service, storage_p
        throw std::runtime_error(::format("Error while applying mutations: {}", e));
    }

-    co_await proxy.mutate_locally(std::move(mutations), tracing::trace_state_ptr(), db::commitlog::force_sync::no);
+    auto mutate = [&proxy] (utils::chunked_vector<frozen_mutation_and_schema> mutations) {
+        return proxy.mutate_locally(std::move(mutations), tracing::trace_state_ptr(), db::commitlog::force_sync::no);
+    };
+    if (utils::get_local_injector().is_enabled("group0_simulate_partial_application_of_cdc_generation_deletion")) {
+        co_await maybe_partially_apply_cdc_generation_deletion_then_get_stuck(mutate, mutations);
+    }
+    co_await mutate(std::move(mutations));

    if (need_system_topology_flush) {
        slogger.trace("write_mutations_to_database: flushing {}.{}", db::system_keyspace::NAME, db::system_keyspace::TOPOLOGY);
@@ -271,42 +299,41 @@ future<> group0_state_machine::merge_and_apply(group0_state_machine_merger& merg
    // If we crash before appending the state ID, when we reapply the command after restart, the change will be applied because
    // the state ID was not yet appended so the above check will pass.

-    // TODO: reapplication of a command after a crash may require contacting a quorum (we need to learn that the command
-    // is committed from a leader). But we may want to ensure that group 0 state is consistent after restart even without
-    // access to quorum, which means we cannot allow partially applied commands. We need to ensure that either the entire
-    // change is applied and the state ID is updated or none of this happens.
-    // E.g. use a write-ahead-entry which contains all this information and make sure it's replayed during restarts.
+    std::optional<storage_service::state_change_hint> topology_state_change_hint;
+    modules_to_reload modules_to_reload;

    co_await std::visit(make_visitor(
    [&] (schema_change& chng) -> future<> {
-        auto modules_to_reload = get_modules_to_reload(chng.mutations);
+        modules_to_reload = get_modules_to_reload(chng.mutations);
        co_await _mm.merge_schema_from(locator::host_id{cmd.creator_id.uuid()}, std::move(chng.mutations));
-        co_await reload_modules(std::move(modules_to_reload));
    },
    [&] (broadcast_table_query& query) -> future<> {
        auto result = co_await service::broadcast_tables::execute_broadcast_table_query(_sp, query.query, cmd.new_state_id);
        _client.set_query_result(cmd.new_state_id, std::move(result));
    },
    [&] (topology_change& chng) -> future<> {
-        auto modules_to_reload = get_modules_to_reload(chng.mutations);
-        auto tablet_keys = replica::get_tablet_metadata_change_hint(chng.mutations);
+        modules_to_reload = get_modules_to_reload(chng.mutations);
+        topology_state_change_hint = {.tablets_hint = replica::get_tablet_metadata_change_hint(chng.mutations)};
        co_await write_mutations_to_database(_ss, _sp, cmd.creator_addr, std::move(chng.mutations));
-        co_await _ss.topology_transition({.tablets_hint = std::move(tablet_keys)});
-        co_await reload_modules(std::move(modules_to_reload));
    },
    [&] (mixed_change& chng) -> future<> {
-        auto modules_to_reload = get_modules_to_reload(chng.mutations);
+        modules_to_reload = get_modules_to_reload(chng.mutations);
+        topology_state_change_hint.emplace();
        co_await _mm.merge_schema_from(locator::host_id{cmd.creator_id.uuid()}, std::move(chng.mutations));
-        co_await _ss.topology_transition();
-        co_await reload_modules(std::move(modules_to_reload));
    },
    [&] (write_mutations& muts) -> future<> {
-        auto modules_to_reload = get_modules_to_reload(muts.mutations);
+        modules_to_reload = get_modules_to_reload(muts.mutations);
        co_await write_mutations_to_database(_ss, _sp, cmd.creator_addr, std::move(muts.mutations));
-        co_await reload_modules(std::move(modules_to_reload));
    }
    ), cmd.change);

+    if (_in_memory_state_machine_enabled) {
+        if (topology_state_change_hint) {
+            co_await _ss.topology_transition(std::move(*topology_state_change_hint));
+        }
+        co_await reload_modules(std::move(modules_to_reload));
+    }
+
    co_await _sp.mutate_locally({std::move(history)}, nullptr);
 }

@@ -413,9 +440,23 @@ void group0_state_machine::drop_snapshot(raft::snapshot_id id) {
 }

 future<> group0_state_machine::load_snapshot(raft::snapshot_id id) {
-    // topology_state_load applies persisted state machine state into
-    // memory and thus needs to be protected with apply mutex
    auto read_apply_mutex_holder = co_await _client.hold_read_apply_mutex(_abort_source);
+    if (_in_memory_state_machine_enabled) {
+        co_await reload_state();
+    }
+}
+
+future<> group0_state_machine::enable_in_memory_state_machine() {
+    auto read_apply_mutex_holder = co_await _client.hold_read_apply_mutex(_abort_source);
+    if (!_in_memory_state_machine_enabled) {
+        _in_memory_state_machine_enabled = true;
+        co_await reload_state();
+    }
+}
+
+future<> group0_state_machine::reload_state() {
+    // We assume that the apply mutex is held: topology_state_load applies the
+    // persisted state machine state into memory, so it must be protected by it.
    co_await _ss.topology_state_load();
    co_await _ss.view_building_state_load();
    if (_feature_service.compression_dicts) {
--- a/service/raft/group0_state_machine.hh
+++ b/service/raft/group0_state_machine.hh
@@ -113,9 +113,33 @@ class group0_state_machine : public raft_state_machine {
    gms::feature_service& _feature_service;
    gms::feature::listener_registration _topology_on_raft_support_listener;

+    // This boolean controls whether the in-memory data structures should be updated
+    // after snapshot transfer / command application.
+    //
+    // The reason for the flag is to protect from reading a partially applied state.
+    // A group0 command may consist of multiple mutations that are not applied
+    // in a single, atomic operation, but rather separately. A node can crash
+    // in the middle of applying such a command, leaving group0 in an inconsistent
+    // state. Thanks to the idempotency of mutations, applying the group0 command
+    // again, in full, will make the state consistent again. Therefore, we use this
+    // flag to control when the in-memory state machine should be updated from the
+    // on-disk state - we can only do that if we know that the group0 table state
+    // is consistent.
+    //
+    // The only exception to the above rule is the schema - the schema state is
+    // loaded into memory before group0 is initialized, and the in-memory state
+    // is reloaded even if _in_memory_state_machine_enabled is set to false.
+    // Resolving this exception should be possible, but would require considerable
+    // effort in refactoring the migration manager code. In the meantime, we are
+    // fine with this exception because the migration manager applies all schema
+    // mutations of a single command atomically, in a single commitlog entry -
+    // therefore, we should not observe broken invariants in the schema module.
+    bool _in_memory_state_machine_enabled;
+
    modules_to_reload get_modules_to_reload(const utils::chunked_vector<canonical_mutation>& mutations);
    future<> reload_modules(modules_to_reload modules);
    future<> merge_and_apply(group0_state_machine_merger& merger);
+    future<> reload_state();
 public:
    group0_state_machine(raft_group0_client& client, migration_manager& mm, storage_proxy& sp, storage_service& ss,
            gms::gossiper& gossiper, gms::feature_service& feat, bool topology_change_enabled);
@@ -125,6 +149,7 @@ public:
    future<> load_snapshot(raft::snapshot_id id) override;
    future<> transfer_snapshot(raft::server_id from_id, raft::snapshot_descriptor snp) override;
    future<> abort() override;
+    future<> enable_in_memory_state_machine();
 };

 bool should_flush_system_topology_after_applying(const mutation& mut, const data_dictionary::database db);
--- a/Show More
+++ b/Show More