sstables: use chunked_managed_vector for promoted indexes in partition_index_page

Switch _promoted_indexes storage in partition_index_page from managed_vector to chunked_managed_vector to avoid large contiguous allocations. Avoid allocation failure (or crashes with --abort-on-internal-error) when large partitions have enough promoted index entries to trigger a large allocation with managed_vector. Fixes: SCYLLADB-1315 Closes scylladb/scylladb#29283 (cherry picked from commit 2d2ff4fbda) Closes scylladb/scylladb#29304
raft_group0: join_group0: fix join hang when node joins group 0 before post_server_start
2026-04-19 16:15:07 +00:00 · 2026-04-01 16:47:46 +03:00 · 2026-04-01 09:58:20 +02:00 · 2026-04-01 04:22:01 +03:00 · 2026-03-27 18:04:33 +01:00 · 2026-03-25 10:04:39 +01:00
84 changed files with 2206 additions and 798 deletions
--- a/.github/workflows/trigger-scylla-ci.yaml
+++ b/.github/workflows/trigger-scylla-ci.yaml
@@ -15,13 +15,19 @@ jobs:
      - name: Verify Org Membership
        id: verify_author
        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          EVENT_NAME: ${{ github.event_name }}
+          PR_AUTHOR: ${{ github.event.pull_request.user.login }}
+          PR_ASSOCIATION: ${{ github.event.pull_request.author_association }}
+          COMMENT_AUTHOR: ${{ github.event.comment.user.login }}
+          COMMENT_ASSOCIATION: ${{ github.event.comment.author_association }}
        shell: bash
        run: |
-          if [[ "${{ github.event_name }}" == "pull_request_target" ]]; then
-            AUTHOR="${{ github.event.pull_request.user.login }}"
+          if [[ "$EVENT_NAME" == "pull_request_target" ]]; then
+            AUTHOR="$PR_AUTHOR"
+            ASSOCIATION="$PR_ASSOCIATION"
          else
-            AUTHOR="${{ github.event.comment.user.login }}"
+            AUTHOR="$COMMENT_AUTHOR"
+            ASSOCIATION="$COMMENT_ASSOCIATION"
          fi
          ORG="scylladb"
          if gh api "/orgs/${ORG}/members/${AUTHOR}" --silent 2>/dev/null; then
@@ -34,13 +40,11 @@ jobs:
      - name: Validate Comment Trigger
        if: github.event_name == 'issue_comment'
        id: verify_comment
+        env:
+          COMMENT_BODY: ${{ github.event.comment.body }}
        shell: bash
        run: |
-          BODY=$(cat << 'EOF'
-          ${{ github.event.comment.body }}
-          EOF
-          )
-          CLEAN_BODY=$(echo "$BODY" | grep -v '^[[:space:]]*>')
+          CLEAN_BODY=$(echo "$COMMENT_BODY" | grep -v '^[[:space:]]*>')

          if echo "$CLEAN_BODY" | grep -qi '@scylladbbot' && echo "$CLEAN_BODY" | grep -qi 'trigger-ci'; then
            echo "trigger=true" >> $GITHUB_OUTPUT
--- a/2
+++ b/2
@@ -78,7 +78,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=2026.1.0
+VERSION=2026.1.1

 if test -f version
 then
--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -48,6 +48,7 @@
 #include "mutation/mutation_fragment_stream_validator.hh"
 #include "utils/assert.hh"
 #include "utils/error_injection.hh"
+#include "utils/chunked_vector.hh"
 #include "utils/pretty_printers.hh"
 #include "readers/multi_range.hh"
 #include "readers/compacting.hh"
@@ -611,23 +612,23 @@ private:
    }

    // Called in a seastar thread
-    dht::partition_range_vector
+    utils::chunked_vector<dht::partition_range>
    get_ranges_for_invalidation(const std::vector<sstables::shared_sstable>& sstables) {
        // If owned ranges is disengaged, it means no cleanup work was done and
        // so nothing needs to be invalidated.
        if (!_owned_ranges) {
-            return dht::partition_range_vector{};
+            return {};
        }
-        auto owned_ranges = dht::to_partition_ranges(*_owned_ranges, utils::can_yield::yes);
+        auto owned_ranges = dht::to_partition_ranges_chunked(*_owned_ranges).get();

        auto non_owned_ranges = sstables
                | std::views::transform([] (const sstables::shared_sstable& sst) {
            seastar::thread::maybe_yield();
            return dht::partition_range::make({sst->get_first_decorated_key(), true},
                                              {sst->get_last_decorated_key(), true});
-        })      | std::ranges::to<dht::partition_range_vector>();
+        })      | std::ranges::to<utils::chunked_vector<dht::partition_range>>();

-        return dht::subtract_ranges(*_schema, non_owned_ranges, std::move(owned_ranges)).get();
+        return dht::subtract_ranges(*_schema, std::move(non_owned_ranges), std::move(owned_ranges)).get();
    }
 protected:
    compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor, use_backlog_tracker use_backlog_tracker)
@@ -718,8 +719,8 @@ protected:

    compaction_completion_desc
    get_compaction_completion_desc(std::vector<sstables::shared_sstable> input_sstables, std::vector<sstables::shared_sstable> output_sstables) {
-        auto ranges_for_for_invalidation = get_ranges_for_invalidation(input_sstables);
-        return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables), std::move(ranges_for_for_invalidation)};
+        auto ranges = get_ranges_for_invalidation(input_sstables);
+        return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables), std::move(ranges)};
    }

    // Tombstone expiration is enabled based on the presence of sstable set.
--- a/compaction/compaction_descriptor.hh
+++ b/compaction/compaction_descriptor.hh
@@ -16,6 +16,7 @@
 #include "sstables/sstable_set.hh"
 #include "compaction_fwd.hh"
 #include "mutation_writer/token_group_based_splitting_writer.hh"
+#include "utils/chunked_vector.hh"

 namespace compaction {

@@ -38,7 +39,7 @@ struct compaction_completion_desc {
    // New, fresh SSTables that should be added to SSTable set, replacing the old ones.
    std::vector<sstables::shared_sstable> new_sstables;
    // Set of compacted partition ranges that should be invalidated in the cache.
-    dht::partition_range_vector ranges_for_cache_invalidation;
+    utils::chunked_vector<dht::partition_range> ranges_for_cache_invalidation;
 };

 // creates a new SSTable for a given shard
--- a/cql3/prepared_statements_cache.hh
+++ b/cql3/prepared_statements_cache.hh
@@ -105,6 +105,7 @@ public:
    static const std::chrono::minutes entry_expiry;

    using key_type = prepared_cache_key_type;
+    using pinned_value_type = cache_value_ptr;
    using value_type = checked_weak_ptr;
    using statement_is_too_big = typename cache_type::entry_is_too_big;

@@ -116,9 +117,14 @@ public:
        : _cache(size, entry_expiry, logger)
    {}

+    template <typename LoadFunc>
+    future<pinned_value_type> get_pinned(const key_type& key, LoadFunc&& load) { // like get(), but returns the cache's own cache_value_ptr instead of a checked_weak_ptr
+        return _cache.get_ptr(key.key(), [load = std::forward<LoadFunc>(load)] (const cache_key_type&) { return load(); }); // the cache key argument is ignored; load is invoked with no arguments
+    }
+
    template <typename LoadFunc>
    future<value_type> get(const key_type& key, LoadFunc&& load) {
-        return _cache.get_ptr(key.key(), [load = std::forward<LoadFunc>(load)] (const cache_key_type&) { return load(); }).then([] (cache_value_ptr v_ptr) {
+        return get_pinned(key, std::forward<LoadFunc>(load)).then([] (cache_value_ptr v_ptr) {
            return make_ready_future<value_type>((*v_ptr)->checked_weak_from_this());
        });
    }
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -697,7 +697,7 @@ future<::shared_ptr<cql_transport::messages::result_message::prepared>>
 query_processor::prepare(sstring query_string, const service::client_state& client_state, cql3::dialect d) {
    try {
        auto key = compute_id(query_string, client_state.get_raw_keyspace(), d);
-        auto prep_ptr = co_await _prepared_cache.get(key, [this, &query_string, &client_state, d] {
+        auto prep_entry = co_await _prepared_cache.get_pinned(key, [this, &query_string, &client_state, d] {
                auto prepared = get_statement(query_string, client_state, d);
                prepared->calculate_metadata_id();
                auto bound_terms = prepared->statement->get_bound_terms();
@@ -711,13 +711,13 @@ query_processor::prepare(sstring query_string, const service::client_state& clie
                return make_ready_future<std::unique_ptr<statements::prepared_statement>>(std::move(prepared));
            });

-        const auto& warnings = prep_ptr->warnings;
-        const auto msg = ::make_shared<result_message::prepared::cql>(prepared_cache_key_type::cql_id(key), std::move(prep_ptr),
+        co_await utils::get_local_injector().inject(
+                "query_processor_prepare_wait_after_cache_get",
+                utils::wait_for_message(std::chrono::seconds(60)));
+  
+        auto msg = ::make_shared<result_message::prepared::cql>(prepared_cache_key_type::cql_id(key), std::move(prep_entry),
                    client_state.is_protocol_extension_set(cql_transport::cql_protocol_extension::LWT_ADD_METADATA_MARK));
-        for (const auto& w : warnings) {
-            msg->add_warning(w);
-        }
-        co_return ::shared_ptr<cql_transport::messages::result_message::prepared>(std::move(msg));
+        co_return std::move(msg);
    } catch(typename prepared_statements_cache::statement_is_too_big&) {
        throw prepared_statement_is_too_big(query_string);
    }
--- a/db/row_cache.cc
+++ b/db/row_cache.cc
@@ -29,6 +29,7 @@
 #include "utils/assert.hh"
 #include "utils/updateable_value.hh"
 #include "utils/labels.hh"
+#include "utils/chunked_vector.hh"

 namespace cache {

@@ -1215,10 +1216,10 @@ future<> row_cache::invalidate(external_updater eu, const dht::decorated_key& dk
 }

 future<> row_cache::invalidate(external_updater eu, const dht::partition_range& range, cache_invalidation_filter filter) {
-    return invalidate(std::move(eu), dht::partition_range_vector({range}), std::move(filter));
+    return invalidate(std::move(eu), utils::chunked_vector<dht::partition_range>({range}), std::move(filter));
 }

-future<> row_cache::invalidate(external_updater eu, dht::partition_range_vector&& ranges, cache_invalidation_filter filter) {
+future<> row_cache::invalidate(external_updater eu, utils::chunked_vector<dht::partition_range>&& ranges, cache_invalidation_filter filter) {
    return do_update(std::move(eu), [this, ranges = std::move(ranges), filter = std::move(filter)] mutable {
        return seastar::async([this, ranges = std::move(ranges), filter = std::move(filter)] {
            auto on_failure = defer([this] () noexcept {
--- a/db/row_cache.hh
+++ b/db/row_cache.hh
@@ -17,6 +17,7 @@
 #include "utils/histogram.hh"
 #include "mutation/partition_version.hh"
 #include "utils/double-decker.hh"
+#include "utils/chunked_vector.hh"
 #include "db/cache_tracker.hh"
 #include "readers/empty.hh"
 #include "readers/mutation_source.hh"
@@ -457,7 +458,7 @@ public:
    // mutation source made prior to the call to invalidate().
    future<> invalidate(external_updater, const dht::decorated_key&);
    future<> invalidate(external_updater, const dht::partition_range& = query::full_partition_range, cache_invalidation_filter filter = [] (const auto&) { return true; });
-    future<> invalidate(external_updater, dht::partition_range_vector&&, cache_invalidation_filter filter = [] (const auto&) { return true; });
+    future<> invalidate(external_updater, utils::chunked_vector<dht::partition_range>&&, cache_invalidation_filter filter = [] (const auto&) { return true; });

    // Evicts entries from cache.
    //
--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -105,7 +105,7 @@ namespace {
        schema_builder::register_schema_initializer([](schema_builder& builder) {
            if (builder.ks_name() == schema_tables::NAME) {
                // all schema tables are group0 tables
-                builder.set_is_group0_table(true);
+                builder.set_is_group0_table();
            }
        });
 }
--- a/db/system_keyspace.cc
+++ b/db/system_keyspace.cc
@@ -87,31 +87,15 @@ namespace {
        static const std::unordered_set<sstring> tables = {
            schema_tables::SCYLLA_TABLE_SCHEMA_HISTORY,
            system_keyspace::BROADCAST_KV_STORE,
-            system_keyspace::CDC_GENERATIONS_V3,
            system_keyspace::RAFT,
            system_keyspace::RAFT_SNAPSHOTS,
            system_keyspace::RAFT_SNAPSHOT_CONFIG,
            system_keyspace::GROUP0_HISTORY,
            system_keyspace::DISCOVERY,
-            system_keyspace::TABLETS,
-            system_keyspace::TOPOLOGY,
-            system_keyspace::TOPOLOGY_REQUESTS,
            system_keyspace::LOCAL,
            system_keyspace::PEERS,
-            system_keyspace::SCYLLA_LOCAL,
            system_keyspace::COMMITLOG_CLEANUPS,
-            system_keyspace::SERVICE_LEVELS_V2,
-            system_keyspace::VIEW_BUILD_STATUS_V2,
-            system_keyspace::CDC_STREAMS_STATE,
-            system_keyspace::CDC_STREAMS_HISTORY,
-            system_keyspace::ROLES,
-            system_keyspace::ROLE_MEMBERS,
-            system_keyspace::ROLE_ATTRIBUTES,
-            system_keyspace::ROLE_PERMISSIONS,
            system_keyspace::CDC_LOCAL,
-            system_keyspace::DICTS,
-            system_keyspace::VIEW_BUILDING_TASKS,
-            system_keyspace::CLIENT_ROUTES,
        };
        if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) {
            builder.enable_schema_commitlog();
@@ -143,7 +127,7 @@ namespace {
                system_keyspace::REPAIR_TASKS,
            };
            if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) {
-                builder.set_is_group0_table(true);
+                builder.set_is_group0_table();
            }
        });
 }
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -930,8 +930,7 @@ bool view_updates::can_skip_view_updates(const clustering_or_static_row& update,
    const row& existing_row = existing.cells();
    const row& updated_row = update.cells();

-    const bool base_has_nonexpiring_marker = update.marker().is_live() && !update.marker().is_expiring();
-    return std::ranges::all_of(_base->regular_columns(), [this, &updated_row, &existing_row, base_has_nonexpiring_marker] (const column_definition& cdef) {
+    return std::ranges::all_of(_base->regular_columns(), [this, &updated_row, &existing_row] (const column_definition& cdef) {
        const auto view_it = _view->columns_by_name().find(cdef.name());
        const bool column_is_selected = view_it != _view->columns_by_name().end();

@@ -939,49 +938,29 @@ bool view_updates::can_skip_view_updates(const clustering_or_static_row& update,
        // as part of its PK, there are NO virtual columns corresponding to the unselected columns in the view.
        // Because of that, we don't generate view updates when the value in an unselected column is created
        // or changes.
-        if (!column_is_selected && _base_info.has_base_non_pk_columns_in_view_pk) {
+        if (!column_is_selected) {
            return true;
        }

-        //TODO(sarna): Optimize collections case - currently they do not go under optimization
-        if (!cdef.is_atomic()) {
-            return false;
-        }
-
-        // We cannot skip if the value was created or deleted, unless we have a non-expiring marker
+        // We cannot skip if the value was created or deleted
        const auto* existing_cell = existing_row.find_cell(cdef.id);
        const auto* updated_cell = updated_row.find_cell(cdef.id);
        if (existing_cell == nullptr || updated_cell == nullptr) {
-            return existing_cell == updated_cell || (!column_is_selected && base_has_nonexpiring_marker);
+            return existing_cell == updated_cell;
        }
+
+        if (!cdef.is_atomic()) {
+            return existing_cell->as_collection_mutation().data == updated_cell->as_collection_mutation().data;
+        }
+
        atomic_cell_view existing_cell_view = existing_cell->as_atomic_cell(cdef);
        atomic_cell_view updated_cell_view = updated_cell->as_atomic_cell(cdef);

        // We cannot skip when a selected column is changed
-        if (column_is_selected) {
-            if (view_it->second->is_view_virtual()) {
-                return atomic_cells_liveness_equal(existing_cell_view, updated_cell_view);
-            }
-            return compare_atomic_cell_for_merge(existing_cell_view, updated_cell_view) == 0;
+        if (view_it->second->is_view_virtual()) {
+            return atomic_cells_liveness_equal(existing_cell_view, updated_cell_view);
        }
-
-        // With non-expiring row marker, liveness checks below are not relevant
-        if (base_has_nonexpiring_marker) {
-            return true;
-        }
-
-        if (existing_cell_view.is_live() != updated_cell_view.is_live()) {
-            return false;
-        }
-
-        // We cannot skip if the change updates TTL
-        const bool existing_has_ttl = existing_cell_view.is_live_and_has_ttl();
-        const bool updated_has_ttl = updated_cell_view.is_live_and_has_ttl();
-        if (existing_has_ttl || updated_has_ttl) {
-            return existing_has_ttl == updated_has_ttl && existing_cell_view.expiry() == updated_cell_view.expiry();
-        }
-
-        return true;
+        return compare_atomic_cell_for_merge(existing_cell_view, updated_cell_view) == 0;
    });
 }

@@ -1749,7 +1728,7 @@ static endpoints_to_update get_view_natural_endpoint_vnodes(
        std::vector<std::reference_wrapper<const locator::node>> base_nodes,
        std::vector<std::reference_wrapper<const locator::node>> view_nodes,
        locator::endpoint_dc_rack my_location,
-        const locator::network_topology_strategy* network_topology,
+        const bool network_topology,
        replica::cf_stats& cf_stats) {
    using node_vector = std::vector<std::reference_wrapper<const locator::node>>;
    node_vector base_endpoints, view_endpoints;
@@ -1902,7 +1881,7 @@ endpoints_to_update get_view_natural_endpoint(
        locator::host_id me,
        const locator::effective_replication_map_ptr& base_erm,
        const locator::effective_replication_map_ptr& view_erm,
-        const locator::abstract_replication_strategy& replication_strategy,
+        const bool network_topology,
        const dht::token& base_token,
        const dht::token& view_token,
        bool use_tablets,
@@ -1910,7 +1889,6 @@ endpoints_to_update get_view_natural_endpoint(
    auto& topology = base_erm->get_token_metadata_ptr()->get_topology();
    auto& view_topology = view_erm->get_token_metadata_ptr()->get_topology();
    auto& my_location = topology.get_location(me);
-    auto* network_topology = dynamic_cast<const locator::network_topology_strategy*>(&replication_strategy);

    auto resolve = [&] (const locator::topology& topology, const locator::host_id& ep, bool is_view) -> const locator::node& {
        if (auto* np = topology.find_node(ep)) {
@@ -1944,7 +1922,7 @@ endpoints_to_update get_view_natural_endpoint(
                // view pairing as the leaving base replica.
                // note that the recursive call will not recurse again because leaving_base is in base_nodes.
                auto leaving_base = it->get().host_id();
-                return get_view_natural_endpoint(leaving_base, base_erm, view_erm, replication_strategy, base_token,
+                return get_view_natural_endpoint(leaving_base, base_erm, view_erm, network_topology, base_token,
                        view_token, use_tablets, cf_stats);
            }
        }
@@ -2040,7 +2018,9 @@ future<> view_update_generator::mutate_MV(
        wait_for_all_updates wait_for_all)
 {
    auto& ks = _db.find_keyspace(base->ks_name());
-    auto& replication = ks.get_replication_strategy();
+    const bool uses_tablets = ks.uses_tablets();
+    const bool uses_nts = dynamic_cast<const locator::network_topology_strategy*>(&ks.get_replication_strategy()) != nullptr;
+    // The object referenced by `ks` may disappear after preemption. It must not be touched again after this comment.
    std::unordered_map<table_id, locator::effective_replication_map_ptr> erms;
    auto get_erm = [&] (table_id id) {
        auto it = erms.find(id);
@@ -2059,8 +2039,8 @@ future<> view_update_generator::mutate_MV(
    co_await max_concurrent_for_each(view_updates, max_concurrent_updates, [&] (frozen_mutation_and_schema mut) mutable -> future<> {
        auto view_token = dht::get_token(*mut.s, mut.fm.key());
        auto view_ermp = erms.at(mut.s->id());
-        auto [target_endpoint, no_pairing_endpoint] = get_view_natural_endpoint(me, base_ermp, view_ermp, replication, base_token, view_token,
-                ks.uses_tablets(), cf_stats);
+        auto [target_endpoint, no_pairing_endpoint] = get_view_natural_endpoint(me, base_ermp, view_ermp, uses_nts, base_token, view_token,
+                uses_tablets, cf_stats);
        auto remote_endpoints = view_ermp->get_pending_replicas(view_token);
        auto memory_units = seastar::make_lw_shared<db::timeout_semaphore_units>(pending_view_update_memory_units.split(memory_usage_of(mut)));
        if (no_pairing_endpoint) {
--- a/db/view/view.hh
+++ b/db/view/view.hh
@@ -303,7 +303,7 @@ endpoints_to_update get_view_natural_endpoint(
    locator::host_id node,
    const locator::effective_replication_map_ptr& base_erm,
    const locator::effective_replication_map_ptr& view_erm,
-    const locator::abstract_replication_strategy& replication_strategy,
+    const bool network_topology,
    const dht::token& base_token,
    const dht::token& view_token,
    bool use_tablets,
--- a/dht/i_partitioner.cc
+++ b/dht/i_partitioner.cc
@@ -352,6 +352,16 @@ dht::partition_range_vector to_partition_ranges(const dht::token_range_vector& r
    return prs;
 }

+future<utils::chunked_vector<dht::partition_range>> to_partition_ranges_chunked(const dht::token_range_vector& ranges) { // coroutine: converts every token range with to_partition_range() and returns them in a chunked_vector
+    utils::chunked_vector<dht::partition_range> prs;
+    prs.reserve(ranges.size()); // exactly one output entry per input range
+    for (auto& range : ranges) {
+        prs.push_back(dht::to_partition_range(range));
+        co_await coroutine::maybe_yield(); // yield check after each converted range
+    }
+    co_return prs;
+}
+
 std::map<unsigned, dht::partition_range_vector>
 split_range_to_shards(dht::partition_range pr, const schema& s, const sharder& raw_sharder) {
    std::map<unsigned, dht::partition_range_vector> ret;
@@ -364,11 +374,11 @@ split_range_to_shards(dht::partition_range pr, const schema& s, const sharder& r
    return ret;
 }

-future<dht::partition_range_vector> subtract_ranges(const schema& schema, const dht::partition_range_vector& source_ranges, dht::partition_range_vector ranges_to_subtract) {
+future<utils::chunked_vector<dht::partition_range>> subtract_ranges(const schema& schema, utils::chunked_vector<dht::partition_range> source_ranges, utils::chunked_vector<dht::partition_range> ranges_to_subtract) {
    auto cmp = dht::ring_position_comparator(schema);
    // optimize set of potentially overlapping ranges by deoverlapping them.
-    auto ranges = dht::partition_range::deoverlap(source_ranges, cmp);
-    dht::partition_range_vector res;
+    auto ranges = dht::partition_range::deoverlap(std::move(source_ranges), cmp);
+    utils::chunked_vector<dht::partition_range> res;
    res.reserve(ranges.size() * 2);

    auto range = ranges.begin();
--- a/dht/i_partitioner.hh
+++ b/dht/i_partitioner.hh
@@ -91,6 +91,7 @@ inline token get_token(const schema& s, partition_key_view key) {

 dht::partition_range to_partition_range(dht::token_range);
 dht::partition_range_vector to_partition_ranges(const dht::token_range_vector& ranges, utils::can_yield can_yield = utils::can_yield::no);
+future<utils::chunked_vector<dht::partition_range>> to_partition_ranges_chunked(const dht::token_range_vector& ranges);

 // Each shard gets a sorted, disjoint vector of ranges
 std::map<unsigned, dht::partition_range_vector>
@@ -105,7 +106,7 @@ std::unique_ptr<dht::i_partitioner> make_partitioner(sstring name);
 // Returns a sorted and deoverlapped list of ranges that are
 // the result of subtracting all ranges from ranges_to_subtract.
 // ranges_to_subtract must be sorted and deoverlapped.
-future<dht::partition_range_vector> subtract_ranges(const schema& schema, const dht::partition_range_vector& ranges, dht::partition_range_vector ranges_to_subtract);
+future<utils::chunked_vector<dht::partition_range>> subtract_ranges(const schema& schema, utils::chunked_vector<dht::partition_range> ranges, utils::chunked_vector<dht::partition_range> ranges_to_subtract);

 // Returns a token_range vector split based on the given number of most-significant bits
 dht::token_range_vector split_token_range_msb(unsigned most_significant_bits);
--- a/dht/token.hh
+++ b/dht/token.hh
@@ -30,6 +30,31 @@ enum class token_kind {
    after_all_keys,
 };

+// Represents a token for partition keys as a bare int64_t value.
+// Has a disengaged state (value == INT64_MIN), which sorts before all engaged states.
+struct raw_token {
+    int64_t value; // INT64_MIN is reserved as the disengaged marker
+
+    /// Constructs a disengaged token.
+    raw_token() : value(std::numeric_limits<int64_t>::min()) {}
+
+    /// Constructs an engaged token from a full token.
+    /// The token must be of token_kind::key kind.
+    explicit raw_token(const token&);
+
+    explicit raw_token(int64_t v) : value(v) {}; // stores v verbatim; passing INT64_MIN yields a disengaged token
+
+    std::strong_ordering operator<=>(const raw_token& o) const noexcept = default; // member-wise, i.e. ordered by `value`
+    std::strong_ordering operator<=>(const token& o) const noexcept; // ordering against a full token
+
+    /// Returns true iff engaged.
+    explicit operator bool() const noexcept {
+        return value != std::numeric_limits<int64_t>::min();
+    }
+};
+
+using raw_token_opt = seastar::optimized_optional<raw_token>;
+
 class token {
    // INT64_MIN is not a legal token, but a special value used to represent
    // infinity in token intervals.
@@ -52,6 +77,10 @@ public:

    constexpr explicit token(int64_t d) noexcept : token(kind::key, normalize(d)) {}

+    token(raw_token raw) noexcept
+        : token(raw ? kind::key : kind::before_all_keys, raw.value) // disengaged raw_token -> before_all_keys; engaged -> key kind with the same raw value
+    { }
+
    // This constructor seems redundant with the bytes_view constructor, but
    // it's necessary for IDL, which passes a deserialized_bytes_proxy here.
    // (deserialized_bytes_proxy is convertible to bytes&&, but not bytes_view.)
@@ -223,6 +252,29 @@ public:
    }
 };

+inline
+raw_token::raw_token(const token& t)
+    : value(t.raw()) // keeps only the raw value; the token's kind is not stored
+{
+#ifdef DEBUG
+    assert(t._kind == token::kind::key); // the key-kind precondition is checked in DEBUG builds only
+#endif
+}
+
+inline
+std::strong_ordering raw_token::operator<=>(const token& o) const noexcept {
+    switch (o._kind) {
+        case token::kind::after_all_keys:
+            return std::strong_ordering::less; // every raw_token, engaged or not, sorts before after_all_keys
+        case token::kind::before_all_keys:
+            // before_all_keys has the same raw value as a disengaged raw_token, and sorts before all keys.
+            // So the two can be ordered by just comparing raw values.
+            [[fallthrough]];
+        case token::kind::key:
+            return value <=> o._data;
+    } // NOTE(review): no return after the switch; assumes the three cases above cover every token::kind value — confirm
+}
+
 inline constexpr std::strong_ordering tri_compare_raw(const int64_t l1, const int64_t l2) noexcept {
    if (l1 == l2) {
        return std::strong_ordering::equal;
@@ -329,6 +381,17 @@ struct fmt::formatter<dht::token> : fmt::formatter<string_view> {
    }
 };

+template <>
+struct fmt::formatter<dht::raw_token> : fmt::formatter<string_view> { // prints a raw_token as its integer value, or as the text null when disengaged
+    template <typename FormatContext>
+    auto format(const dht::raw_token& t, FormatContext& ctx) const {
+        if (!t) { // raw_token's operator bool is false for a disengaged token
+            return fmt::format_to(ctx.out(), "null");
+        }
+        return fmt::format_to(ctx.out(), "{}", t.value);
+    }
+};
+
 namespace std {

 template<>
--- a/docs/cql/dml/select.rst
+++ b/docs/cql/dml/select.rst
@@ -281,8 +281,8 @@ For example::
      ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;


-Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key
-or columns provided in a definition of the index.
+Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key.
+See :ref:`WHERE <where-clause>`.

 For example::

@@ -290,10 +290,6 @@ For example::
      WHERE user_id = 'user123'
      ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;

-The supported operations are equal relations (``=`` and ``IN``) with restrictions as in regular ``WHERE`` clauses. See :ref:`WHERE <where-clause>`.
-
-Other filtering scenarios are currently not supported.
-
 .. note::

   Vector indexes are supported in ScyllaDB Cloud only in clusters that have the Vector Search feature enabled.
--- a/docs/getting-started/install-scylla/install-on-linux.rst
+++ b/docs/getting-started/install-scylla/install-on-linux.rst
@@ -52,7 +52,7 @@ Install ScyllaDB
            .. code-block:: console
               :substitutions:
    
-               sudo wget -O /etc/apt/sources.list.d/scylla.list http://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|
+               sudo wget -O /etc/apt/sources.list.d/scylla.list https://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|


        #. Install ScyllaDB packages.
@@ -125,7 +125,7 @@ Install ScyllaDB
            .. code-block:: console
               :substitutions:
    
-               sudo curl -o /etc/yum.repos.d/scylla.repo -L http://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|
+               sudo curl -o /etc/yum.repos.d/scylla.repo -L https://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|

        #. Install ScyllaDB packages.

@@ -133,19 +133,19 @@ Install ScyllaDB
    
               sudo yum install scylla

-            Running the command installs the latest official version of ScyllaDB Open Source.
-            Alternatively, you can to install a specific patch version:
+            Running the command installs the latest official version of ScyllaDB.
+            Alternatively, you can install a specific patch version:

            .. code-block:: console
    
               sudo yum install scylla-<your patch version>

-            Example: The following example shows the command to install ScyllaDB 5.2.3.
+            Example: The following command installs ScyllaDB 2025.3.1.

            .. code-block:: console
               :class: hide-copy-button
    
-               sudo yum install scylla-5.2.3
+               sudo yum install scylla-2025.3.1

 .. include:: /getting-started/_common/setup-after-install.rst

--- a/docs/getting-started/installation-common/scylla-web-installer.rst
+++ b/docs/getting-started/installation-common/scylla-web-installer.rst
@@ -36,11 +36,8 @@ release versions, run:
  curl -sSf get.scylladb.com/server | sudo bash -s -- --list-active-releases


-Versions 2025.1 and Later
-==============================
-
-Run the command with the ``--scylla-version`` option to specify the version
-you want to install.
+To install a non-default version, run the command with the ``--scylla-version``
+option to specify the version you want to install.

 **Example**

@@ -50,20 +47,4 @@ you want to install.
  curl -sSf get.scylladb.com/server | sudo bash -s -- --scylla-version |CURRENT_VERSION|


-Versions Earlier than 2025.1
-================================
-
-To install a supported version of *ScyllaDB Enterprise*, run the command with:
-
-* ``--scylla-product scylla-enterprise`` to specify that you want to install
-  ScyllaDB Entrprise.
-* ``--scylla-version`` to specify the version you want to install.
-
-For example:
-
-.. code:: console
-  
-  curl -sSf get.scylladb.com/server | sudo bash -s -- --scylla-product scylla-enterprise --scylla-version 2024.1
-
-
 .. include:: /getting-started/_common/setup-after-install.rst
--- a/docs/operating-scylla/procedures/cluster-management/cluster-platform-migration.rst
+++ b/docs/operating-scylla/procedures/cluster-management/cluster-platform-migration.rst
@@ -0,0 +1,492 @@
+=================================================
+Cluster Platform Migration Using Node Cycling
+=================================================
+
+This procedure describes how to migrate a ScyllaDB cluster to new instance types
+using the add-and-replace approach, which is commonly used for:
+
+* Migrating from one CPU architecture to another (e.g., x86_64 to ARM/Graviton)
+* Upgrading to newer instance types with better performance
+* Changing instance families within the same cloud provider
+
+The add-and-replace approach maintains data replication throughout the migration
+and ensures zero downtime for client applications.
+
+.. note::
+
+   This procedure does **not** change the ScyllaDB software version. All nodes
+   (both existing and new) must run the same ScyllaDB version. For software
+   version upgrades, see :doc:`Upgrade </upgrade/index>`.
+
+Overview
+--------
+
+The add-and-replace migration follows these steps:
+
+#. Add new nodes (on target instance type) to the existing cluster
+#. Wait for data to stream to the new nodes
+#. Decommission old nodes (on source instance type)
+
+This approach keeps the cluster operational throughout the migration while
+maintaining the configured replication factor.
+
+Key characteristics
+===================
+
+* **Zero downtime**: Client applications continue to operate during migration
+* **Data safety**: Replication factor is maintained throughout the process
+* **Flexible**: Works with both vnodes and tablets-enabled clusters
+* **Multi-DC support**: Can migrate nodes across multiple datacenters
+
+.. warning::
+
+   Ensure your cluster has sufficient capacity during the migration. At the peak
+   of the process, your cluster will temporarily have double the number of nodes.
+
+Prerequisites
+-------------
+
+Check cluster health
+====================
+
+Before starting the migration, verify that your cluster is healthy:
+
+#. Check that all nodes are in Up Normal (UN) status:
+
+   .. code-block:: shell
+
+      nodetool status
+
+   All nodes should show ``UN`` status. Do not proceed if any nodes are down.
+
+#. Ensure no streaming or repair operations are in progress:
+
+   .. code-block:: shell
+
+      nodetool netstats
+      nodetool compactionstats
+
+Plan the migration
+==================
+
+Before provisioning new instances, plan the following:
+
+**Instance type mapping**: Identify the source and target instance types.
+If your cluster uses vnodes (not tablets), consider that mismatched shard
+counts between source and target instance types can cause slower repairs.
+With tablets enabled, shard count mismatch is fully supported.
+
+**Rack assignment planning**: Each new node must be assigned to the same rack
+as the node it will replace. This maintains rack-aware topology for:
+
+* Rack-aware replication (NetworkTopologyStrategy)
+* Proper data distribution across failure domains
+* Minimizing data movement during decommission
+
+Example mapping for a 3-node cluster:
+
+.. code-block:: none
+
+   Source nodes (to be decommissioned):     Target nodes (to be added):
+   192.168.1.10 - RACK0                 →   192.168.2.10 - RACK0
+   192.168.1.11 - RACK1                 →   192.168.2.11 - RACK1
+   192.168.1.12 - RACK2                 →   192.168.2.12 - RACK2
+
+Create a backup
+===============
+
+Back up the data before starting the migration. One of the following
+methods can be used:
+
+* **ScyllaDB Manager** (recommended): Use ScyllaDB Manager to perform a
+  cluster-wide backup. See the
+  `ScyllaDB Manager documentation <https://manager.docs.scylladb.com/stable/backup/>`_
+  for details.
+
+* **Snapshots**: On each node in the cluster, create a snapshot:
+
+  .. code-block:: shell
+
+     nodetool snapshot -t pre_migration_backup
+     nodetool listsnapshots
+
+  .. note::
+
+     Snapshots are local to each node and do not protect against node or disk
+     failure. For full disaster recovery, use ScyllaDB Manager backup.
+
+
+Procedure
+---------
+
+Adding new nodes
+================
+
+#. Provision new instances with the target instance type. Ensure:
+
+   * The same ScyllaDB version as existing nodes
+   * Same network configuration and security groups
+   * Appropriate storage configuration
+
+#. On each new node, configure ``/etc/scylla/scylla.yaml`` to join the existing
+   cluster:
+
+   * **cluster_name**: Must match the existing cluster name
+   * **seeds**: IP address of an existing node in the cluster (used to discover cluster topology on join)
+   * **endpoint_snitch**: Must match the existing cluster configuration
+   * **listen_address**: IP address of the new node
+   * **rpc_address**: IP address of the new node
+
+   All other cluster-wide settings (tablets configuration, encryption settings,
+   experimental features, etc.) must match the existing nodes.
+
+   .. caution::
+
+      Make sure that the ScyllaDB version on the new node is identical to the
+      version on the other nodes in the cluster. Running nodes with different
+      versions is not supported.
+
+#. If using ``GossipingPropertyFileSnitch``, configure
+   ``/etc/scylla/cassandra-rackdc.properties`` with the correct datacenter
+   and rack assignment for this node:
+
+   .. code-block:: none
+
+      dc = <datacenter-name>
+      rack = <rack-name>
+      prefer_local = true
+
+   .. warning::
+
+      Each node must have the correct rack assignment. Using the same rack for
+      all new nodes breaks rack-aware replication topology.
+
+#. Start ScyllaDB on the new node:
+
+   .. code-block:: shell
+
+      sudo systemctl start scylla-server
+
+   For Docker deployments:
+
+   .. code-block:: shell
+
+      docker exec -it <container-name> supervisorctl start scylla
+
+#. Monitor the bootstrap process from an existing node:
+
+   .. code-block:: shell
+
+      nodetool status
+
+   The new node will appear with ``UJ`` (Up, Joining) status while streaming
+   data from existing nodes. Wait until it transitions to ``UN`` (Up, Normal).
+
+   **Example output during bootstrap:**
+
+   .. code-block:: shell
+
+      Datacenter: dc1
+      Status=Up/Down
+      State=Normal/Leaving/Joining/Moving
+      --  Address        Load       Tokens  Owns   Host ID                               Rack
+      UN  192.168.1.10   500 MB     256     33.3%  8d5ed9f4-7764-4dbd-bad8-43fddce94b7c  RACK0
+      UN  192.168.1.11   500 MB     256     33.3%  125ed9f4-7777-1dbn-mac8-43fddce9123e  RACK1
+      UN  192.168.1.12   500 MB     256     33.3%  675ed9f4-6564-6dbd-can8-43fddce952gy  RACK2
+      UJ  192.168.2.10   250 MB     256     ?      a1b2c3d4-5678-90ab-cdef-112233445566  RACK0
+
+   **Example output after bootstrap completes:**
+
+   .. code-block:: shell
+
+      Datacenter: dc1
+      Status=Up/Down
+      State=Normal/Leaving/Joining/Moving
+      --  Address        Load       Tokens  Owns   Host ID                               Rack
+      UN  192.168.1.10   400 MB     256     25.0%  8d5ed9f4-7764-4dbd-bad8-43fddce94b7c  RACK0
+      UN  192.168.1.11   400 MB     256     25.0%  125ed9f4-7777-1dbn-mac8-43fddce9123e  RACK1
+      UN  192.168.1.12   400 MB     256     25.0%  675ed9f4-6564-6dbd-can8-43fddce952gy  RACK2
+      UN  192.168.2.10   400 MB     256     25.0%  a1b2c3d4-5678-90ab-cdef-112233445566  RACK0
+
+#. For tablets-enabled clusters, wait for tablet load balancing to complete.
+   After the node reaches ``UN`` status, verify no streaming is in progress:
+
+   .. code-block:: shell
+
+      nodetool netstats
+
+   Wait until the output shows "Not sending any streams" and no active receiving streams are listed.
+
+#. Repeat steps 1-6 for each new node to be added.
+
+.. note::
+
+   You can add multiple nodes in parallel if they are in different datacenters.
+   Within a single datacenter, add nodes one at a time for best results.
+
+
+Updating seed node configuration
+================================
+
+If any of your original nodes are configured as seed nodes, you must update
+the seed configuration before decommissioning them.
+
+#. Check the current seed configuration on any node:
+
+   .. code-block:: shell
+
+      grep -A 4 "seed_provider" /etc/scylla/scylla.yaml
+
+#. If the seeds include nodes you plan to decommission, update ``scylla.yaml``
+   on **all new nodes** to use the new node IPs as seeds:
+
+   .. code-block:: yaml
+
+      seed_provider:
+        - class_name: org.apache.cassandra.locator.SimpleSeedProvider
+          parameters:
+            - seeds: "192.168.2.10,192.168.2.11,192.168.2.12"
+
+   .. note::
+
+      Updating seed configuration on the **old nodes** (that will be
+      decommissioned) is optional. Seeds are only used during node startup
+      to discover the cluster. If you don't plan to restart the old nodes
+      before decommissioning them, their seed configuration doesn't matter.
+      However, updating all nodes is recommended for safety in case an old
+      node unexpectedly restarts during the migration.
+
+#. Restart ScyllaDB on each new node (one at a time) to apply the new seed
+   configuration:
+
+   .. code-block:: shell
+
+      sudo systemctl restart scylla-server
+
+   Wait for the node to fully start before restarting the next node.
+
+#. After restarting the new nodes, verify the cluster is healthy:
+
+   .. code-block:: shell
+
+      nodetool status
+      nodetool describecluster
+
+.. warning::
+
+   Complete this seed list update on **all new nodes** before decommissioning
+   any old nodes. This ensures the new nodes can re-form the cluster after
+   the old nodes are removed.
+
+
+Decommissioning old nodes
+=========================
+
+After all new nodes are added and healthy, decommission the old nodes one
+at a time.
+
+#. Verify all nodes are healthy before starting decommission:
+
+   .. code-block:: shell
+
+      nodetool status
+
+   All nodes should show ``UN`` status.
+
+#. On the node to be decommissioned, run:
+
+   .. code-block:: shell
+
+      nodetool decommission
+
+   This command blocks until the decommission is complete. The node will
+   stream its data to the remaining nodes.
+
+#. Monitor the decommission progress from another node:
+
+   .. code-block:: shell
+
+      nodetool status
+
+   The decommissioning node will transition from ``UN`` → ``UL`` (Up, Leaving)
+   → removed from the cluster.
+
+   You can also monitor streaming progress:
+
+   .. code-block:: shell
+
+      nodetool netstats
+
+#. After decommission completes, verify the node is no longer in the cluster:
+
+   .. code-block:: shell
+
+      nodetool status
+
+   The decommissioned node should no longer appear in the output.
+
+#. Run ``nodetool cleanup`` on the remaining nodes to remove data that
+   no longer belongs to them after the topology change:
+
+   .. code-block:: shell
+
+      nodetool cleanup
+
+   .. note::
+
+      ``nodetool cleanup`` can be resource-intensive. Run it on one node at a
+      time during low-traffic periods.
+
+#. Wait for the cluster to stabilize before decommissioning the next node.
+   Ensure no streaming operations are in progress.
+
+#. Repeat steps 1-6 for each old node to be decommissioned.
+
+
+Post-migration verification
+---------------------------
+
+After all old nodes are decommissioned, verify the migration was successful.
+
+Verify cluster topology
+=======================
+
+.. code-block:: shell
+
+   nodetool status
+
+Confirm:
+
+* All nodes show ``UN`` (Up, Normal) status
+* Only the new instance type nodes are present
+* Nodes are balanced across racks
+
+Verify schema agreement
+=======================
+
+.. code-block:: shell
+
+   nodetool describecluster
+
+All nodes should report the same schema version.
+
+Verify data connectivity
+========================
+
+Connect to the cluster and run a test query:
+
+.. code-block:: shell
+
+   cqlsh <node-ip> -e "SELECT count(*) FROM system_schema.keyspaces;"
+
+.. note::
+
+   If ScyllaDB is configured with ``listen_interface``, you must use the
+   node's interface IP address (not localhost) for cqlsh connections.
+
+Verify ScyllaDB version
+=======================
+
+Confirm all nodes are running the same ScyllaDB version:
+
+.. code-block:: shell
+
+   scylla --version
+
+Verify data integrity (optional)
+================================
+
+Run data validation on each keyspace to verify sstable integrity:
+
+.. code-block:: shell
+
+   nodetool scrub --mode=VALIDATE <keyspace_name>
+
+Rollback
+--------
+
+If issues occur during the migration, you can roll back by reversing the
+procedure.
+
+During add phase
+================
+
+If a new node fails to bootstrap:
+
+#. Stop ScyllaDB on the new node:
+
+   .. code-block:: shell
+
+      sudo systemctl stop scylla-server
+
+#. From an existing node, remove the failed node:
+
+   .. code-block:: shell
+
+      nodetool removenode <host-id-of-failed-node>
+
+During decommission phase
+=========================
+
+If a decommission operation gets stuck:
+
+#. If the node is still reachable, try stopping and restarting ScyllaDB.
+#. If the node is unresponsive, remove it by running the following command from another node:
+
+   .. code-block:: shell
+
+      nodetool removenode <host-id>
+
+   See :doc:`Remove a Node from a ScyllaDB Cluster </operating-scylla/procedures/cluster-management/remove-node>`
+   for more details.
+
+Full rollback
+=============
+
+To roll back after the migration is complete (all nodes on new instance type),
+apply the same add-and-replace procedure in reverse:
+
+#. Add new nodes on the original instance type
+#. Wait for data streaming to complete
+#. Decommission the nodes on the new instance type
+
+
+Troubleshooting
+---------------
+
+Node stuck in Joining (UJ) state
+================================
+
+If a new node remains in ``UJ`` state for an extended period:
+
+* Check ScyllaDB logs for streaming errors: ``journalctl -u scylla-server``
+* Verify network connectivity between nodes
+* Ensure sufficient disk space on all nodes
+* Check for any ongoing operations that may be blocking
+
+Decommission taking too long
+============================
+
+Decommission duration depends on data size. If it appears stuck:
+
+* Check streaming progress: ``nodetool netstats``
+* Look for errors in ScyllaDB logs
+* Verify network bandwidth between nodes
+
+Schema disagreement
+===================
+
+If nodes report different schema versions:
+
+* Wait a few minutes for schema to propagate
+* If disagreement persists, restart the nodes one by one
+* Run ``nodetool describecluster`` to verify agreement
+
+
+Additional resources
+--------------------
+
+* :doc:`Adding a New Node Into an Existing ScyllaDB Cluster </operating-scylla/procedures/cluster-management/add-node-to-cluster>`
+* :doc:`Remove a Node from a ScyllaDB Cluster </operating-scylla/procedures/cluster-management/remove-node>`
+* :doc:`Replace a Running Node in a ScyllaDB Cluster </operating-scylla/procedures/cluster-management/replace-running-node>`
+* :doc:`Upgrade </upgrade/index>`
--- a/docs/operating-scylla/procedures/cluster-management/index.rst
+++ b/docs/operating-scylla/procedures/cluster-management/index.rst
@@ -26,6 +26,7 @@ Cluster Management Procedures
   Safely Restart Your Cluster <safe-start>
   repair-based-node-operation
   Prevent Quorum Loss in Symmetrical Multi-DC Clusters <arbiter-dc>
+   Cluster Platform Migration <cluster-platform-migration>


 .. panel-box::
@@ -85,6 +86,8 @@ Cluster Management Procedures

  * :doc:`Preventing Quorum Loss in Symmetrical Multi-DC Clusters <arbiter-dc>`

+  * :doc:`Cluster Platform Migration Using Node Cycling </operating-scylla/procedures/cluster-management/cluster-platform-migration>`
+
 .. panel-box::
  :title: Topology Changes
  :id: "getting-started"
--- a/docs/operating-scylla/procedures/config-change/advanced-internode-compression.rst
+++ b/docs/operating-scylla/procedures/config-change/advanced-internode-compression.rst
@@ -57,12 +57,11 @@ To enable shared dictionaries:
    internode_compression_enable_advanced: true
    rpc_dict_training_when: when_leader

-.. warning:: Enabling shared dictionary training might leak unencrypted data to disk.
+.. note::

-             Trained dictionaries contain randomly chosen samples of data transferred between
-             nodes. The data samples are persisted in the Raft log, which is not encrypted.
-             As a result, some data from otherwise encrypted tables might be stored on disk
-             unencrypted.
+   Some dictionary training data may be encrypted using storage-level encryption
+   (if enabled) instead of database-level encryption, meaning protection is
+   applied at the storage layer rather than within the database itself.


 Reference
--- a/locator/everywhere_replication_strategy.cc
+++ b/locator/everywhere_replication_strategy.cc
@@ -42,7 +42,14 @@ void everywhere_replication_strategy::validate_options(const gms::feature_servic

 sstring everywhere_replication_strategy::sanity_check_read_replicas(const effective_replication_map& erm, const host_id_vector_replica_set& read_replicas) const {
    const auto replication_factor = erm.get_replication_factor();
-    if (read_replicas.size() > replication_factor) {
+    if (const auto& topo_info = erm.get_token_metadata().get_topology_change_info(); topo_info && topo_info->read_new) {
+        if (read_replicas.size() > replication_factor + 1) {
+            return seastar::format(
+                    "everywhere_replication_strategy: the number of replicas for everywhere_replication_strategy is {}, "
+                    "cannot be higher than replication factor {} + 1 during the 'read from new replicas' stage of a topology change",
+                    read_replicas.size(), replication_factor);
+        }
+    } else if (read_replicas.size() > replication_factor) {
        return seastar::format("everywhere_replication_strategy: the number of replicas for everywhere_replication_strategy is {}, cannot be higher than replication factor {}", read_replicas.size(), replication_factor);
    }
    return {};
--- a/mutation/collection_mutation.cc
+++ b/mutation/collection_mutation.cc
@@ -261,7 +261,7 @@ static collection_mutation serialize_collection_mutation(

        writev(v.serialize());
    }
-    return collection_mutation(type, ret);
+    return collection_mutation(type, std::move(ret));
 }

 collection_mutation collection_mutation_description::serialize(const abstract_type& type) const {
--- a/node_ops/task_manager_module.cc
+++ b/node_ops/task_manager_module.cc
@@ -103,7 +103,7 @@ future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status(task
        .entity = stats.entity,
        .progress_units = "",
        .progress = tasks::task_manager::task::progress{},
-        .children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()))
+        .children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr())
    };
 }

--- a/pgo/profiles/aarch64/profile.profdata.xz
+++ b/pgo/profiles/aarch64/profile.profdata.xz
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:52c9772c9ac334650d8b179b591c47769ee38d34fad784b61c682e11c03f2506
-size 6530196
+oid sha256:088a9d7e165d33436eb3029ab092582cbae61f0e17486c226d8947ff44658c78
+size 6535832
--- a/pgo/profiles/x86_64/profile.profdata.xz
+++ b/pgo/profiles/x86_64/profile.profdata.xz
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d1a869ebfe4e90d9681499246eb86bb032ae402c350357e19d97b989037a5bd3
-size 6528308
+oid sha256:5f0c0709f9724cd3a545ebcc50ed587f28b2424d55e2334ac2db5d917903bcaf
+size 6536892
--- a/reader_concurrency_semaphore.cc
+++ b/reader_concurrency_semaphore.cc
@@ -1021,8 +1021,8 @@ void reader_concurrency_semaphore::signal(const resources& r) noexcept {
        on_internal_error_noexcept(rcslog,
                format("reader_concurrency_semaphore::signal(): semaphore {} detected resource leak, available {} exceeds initial {}", _name,
                        _resources, _initial_resources));
-        _resources.count = std::max(_resources.count, _initial_resources.count);
-        _resources.memory = std::max(_resources.memory, _initial_resources.memory);
+        _resources.count = std::min(_resources.count, _initial_resources.count);
+        _resources.memory = std::min(_resources.memory, _initial_resources.memory);
    }
    maybe_wake_execution_loop();
 }
--- a/replica/compaction_group.hh
+++ b/replica/compaction_group.hh
@@ -432,7 +432,9 @@ public:
    // refresh_mutation_source must be called when there are changes to data source
    // structures but logical state of data is not changed (e.g. when state for a
    // new tablet replica is allocated).
-    virtual void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) = 0;
+    virtual void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
+                                                  const locator::effective_replication_map& erm,
+                                                  noncopyable_function<void()> refresh_mutation_source) = 0;

    virtual compaction_group& compaction_group_for_token(dht::token token) const = 0;
    virtual compaction_group& compaction_group_for_key(partition_key_view key, const schema_ptr& s) const = 0;
@@ -442,7 +444,7 @@ public:
    virtual storage_group& storage_group_for_token(dht::token) const = 0;
    virtual utils::chunked_vector<storage_group_ptr> storage_groups_for_token_range(dht::token_range tr) const = 0;

-    virtual locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const = 0;
+    virtual locator::combined_load_stats table_load_stats() const = 0;
    virtual bool all_storage_groups_split() = 0;
    virtual future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) = 0;
    virtual future<> maybe_split_compaction_group_of(size_t idx) = 0;
--- a/replica/database.cc
+++ b/replica/database.cc
@@ -1697,7 +1697,7 @@ static db::rate_limiter::can_proceed account_singular_ranges_to_rate_limit(
        if (!range.is_singular()) {
            continue;
        }
-        auto token = dht::token::to_int64(ranges.front().start()->value().token());
+        auto token = dht::token::to_int64(range.start()->value().token());
        if (limiter.account_operation(read_label, token, table_limit, rate_limit_info) == db::rate_limiter::can_proceed::no) {
            // Don't return immediately - account all ranges first
            ret = can_proceed::no;
--- a/replica/database.hh
+++ b/replica/database.hh
@@ -1129,9 +1129,7 @@ public:
        return _stats;
    }

-    // The tablet filter is used to not double account migrating tablets, so it's important that
-    // only one of pending or leaving replica is accounted based on current migration stage.
-    locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const;
+    locator::combined_load_stats table_load_stats() const;

    const db::view::stats& get_view_stats() const {
        return _view_stats;
--- a/replica/table.cc
+++ b/replica/table.cc
@@ -711,7 +711,9 @@ public:
        return make_ready_future<>();
    }

-    void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) override {}
+    void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
+                                          const locator::effective_replication_map& erm,
+                                          noncopyable_function<void()> refresh_mutation_source) override {}

    compaction_group& compaction_group_for_token(dht::token token) const override {
        return get_compaction_group();
@@ -734,7 +736,7 @@ public:
        return *_single_sg;
    }

-    locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)>) const override {
+    locator::combined_load_stats table_load_stats() const override {
        return locator::combined_load_stats{
            .table_ls = locator::table_load_stats{
                            .size_in_bytes = _single_sg->live_disk_space_used(),
@@ -757,6 +759,11 @@ public:
    }
 };

+struct background_merge_guard { // one entry per view whose compaction was stopped in handle_tablet_merge_completion()
+    compaction::compaction_reenabler compaction_guard; // result of stop_and_disable_compaction_no_wait() for that view
+    locator::effective_replication_map_ptr erm_guard; // copy of the pre-merge (old) ERM pointer; presumably keeps it alive until the background merge is done - TODO confirm
+};
+
 class tablet_storage_group_manager final : public storage_group_manager {
    replica::table& _t;
    locator::host_id _my_host_id;
@@ -777,7 +784,7 @@ class tablet_storage_group_manager final : public storage_group_manager {
    utils::phased_barrier _merge_fiber_barrier;
    std::optional<utils::phased_barrier::operation> _pending_merge_fiber_work;
    // Holds compaction reenabler which disables compaction temporarily during tablet merge
-    std::vector<compaction::compaction_reenabler> _compaction_reenablers_for_merging;
+    std::vector<background_merge_guard> _compaction_reenablers_for_merging;
 private:
    const schema_ptr& schema() const {
        return _t.schema();
@@ -801,7 +808,8 @@ private:
    // Called when coordinator executes tablet merge. Tablet ids X and X+1 are merged into
    // the new tablet id (X >> 1). In practice, that means storage groups for X and X+1
    // are merged into a new storage group with id (X >> 1).
-    void handle_tablet_merge_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap);
+    void handle_tablet_merge_completion(locator::effective_replication_map_ptr old_erm,
+                                        const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap);

    // When merge completes, compaction groups of sibling tablets are added to same storage
    // group, but they're not merged yet into one, since the merge completion handler happens
@@ -895,7 +903,9 @@ public:
                std::exchange(_stop_fut, make_ready_future())).discard_result();
    }

-    void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) override;
+    void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
+                                          const locator::effective_replication_map& erm,
+                                          noncopyable_function<void()> refresh_mutation_source) override;

    compaction_group& compaction_group_for_token(dht::token token) const override;
    utils::chunked_vector<storage_group_ptr> storage_groups_for_token_range(dht::token_range tr) const override;
@@ -909,7 +919,7 @@ public:
        return storage_group_for_id(storage_group_of(token).first);
    }

-    locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const override;
+    locator::combined_load_stats table_load_stats() const override;
    bool all_storage_groups_split() override;
    future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) override;
    future<> maybe_split_compaction_group_of(size_t idx) override;
@@ -2933,17 +2943,108 @@ void table::on_flush_timer() {
    });
 }

-locator::combined_load_stats tablet_storage_group_manager::table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const {
+// The following functions return true if we should return the tablet size of a tablet in
+// migration depending on its transition stage and whether it is a leaving or pending replica
+bool has_size_on_leaving (locator::tablet_transition_stage stage) { // consulted when this node is the leaving replica of a migrating tablet
+    switch (stage) { // no default label: each stage is listed explicitly below
+        case locator::tablet_transition_stage::allow_write_both_read_old:               [[fallthrough]];
+        case locator::tablet_transition_stage::write_both_read_old:                     [[fallthrough]];
+        case locator::tablet_transition_stage::streaming:                               [[fallthrough]];
+        case locator::tablet_transition_stage::write_both_read_new:                     [[fallthrough]];
+        case locator::tablet_transition_stage::use_new:                                 [[fallthrough]];
+        case locator::tablet_transition_stage::cleanup_target:                          [[fallthrough]];
+        case locator::tablet_transition_stage::revert_migration:                        [[fallthrough]];
+        case locator::tablet_transition_stage::rebuild_repair:                          [[fallthrough]];
+        case locator::tablet_transition_stage::repair:                                  [[fallthrough]];
+        case locator::tablet_transition_stage::end_repair:
+            return true; // the leaving replica still reports the tablet size in all of the stages above
+        case locator::tablet_transition_stage::cleanup:                                 [[fallthrough]];
+        case locator::tablet_transition_stage::end_migration:
+            return false; // size is no longer reported from the leaving replica in cleanup / end_migration
+    }
+} // NOTE(review): nothing is returned after the switch; assumes the cases cover every enumerator - confirm
+
+bool has_size_on_pending (locator::tablet_transition_stage stage) { // consulted when this node is the pending replica of a migrating tablet
+    switch (stage) { // no default label: each stage is listed explicitly below
+        case locator::tablet_transition_stage::allow_write_both_read_old:               [[fallthrough]];
+        case locator::tablet_transition_stage::write_both_read_old:                     [[fallthrough]];
+        case locator::tablet_transition_stage::streaming:                               [[fallthrough]];
+        case locator::tablet_transition_stage::cleanup_target:                          [[fallthrough]];
+        case locator::tablet_transition_stage::revert_migration:                        [[fallthrough]];
+        case locator::tablet_transition_stage::rebuild_repair:
+            return false; // the pending replica does not report the tablet size in these stages
+        case locator::tablet_transition_stage::write_both_read_new:                     [[fallthrough]];
+        case locator::tablet_transition_stage::use_new:                                 [[fallthrough]];
+        case locator::tablet_transition_stage::cleanup:                                 [[fallthrough]];
+        case locator::tablet_transition_stage::end_migration:                           [[fallthrough]];
+        case locator::tablet_transition_stage::repair:                                  [[fallthrough]];
+        case locator::tablet_transition_stage::end_repair:
+            return true; // from write_both_read_new onwards (and during repair stages) the pending replica reports the size
+    }
+} // NOTE(review): nothing is returned after the switch; assumes the cases cover every enumerator - confirm
+
+locator::combined_load_stats tablet_storage_group_manager::table_load_stats() const {
    locator::table_load_stats table_stats;
    table_stats.split_ready_seq_number = _split_ready_seq_number;

    locator::tablet_load_stats tablet_stats;

    for_each_storage_group([&] (size_t id, storage_group& sg) {
-        locator::global_tablet_id gid { _t.schema()->id(), locator::tablet_id(id) };
-        if (tablet_filter(*_tablet_map, gid)) {
-            const uint64_t tablet_size = sg.live_disk_space_used();
+        auto tid = locator::tablet_id(id);
+        locator::global_tablet_id gid { _t.schema()->id(), tid };
+        locator::tablet_replica me { _my_host_id, this_shard_id() };
+        const uint64_t tablet_size = sg.live_disk_space_used();
+
+        auto transition = _tablet_map->get_tablet_transition_info(tid);
+        auto& info = _tablet_map->get_tablet_info(tid);
+        bool is_pending = transition && transition->pending_replica == me;
+        bool is_leaving = transition && locator::get_leaving_replica(info, *transition) == me;
+
+        // It's important to tackle the anomaly in reported size, since both leaving and
+        // pending replicas could otherwise be accounted during tablet migration.
+        // If transition hasn't reached write_both_read_new stage, then leaving replicas are accounted.
+        // Otherwise, pending replicas are accounted.
+        // This helps to reduce the discrepancy window.
+        auto table_size_filter = [&] () {
+            // if tablet is not in transit, it's filtered in.
+            if (!transition) {
+                return true;
+            }
+
+            auto s = transition->reads; // read selector
+
+            return (!is_pending && !is_leaving)
+                    || (is_leaving && s == locator::read_replica_set_selector::previous)
+                    || (is_pending && s == locator::read_replica_set_selector::next);
+        };
+
+        // When a tablet is in migration, we want to send its size during any migration stage when
+        // we still know the tablet's size. This way the balancer will have better information about
+        // tablet sizes, and we reduce the chance that the node will be ignored during balancing
+        // due to missing tablet size. On the leaving replica we include tablets until the use_new
+        // stage (inclusive), and on the pending we include tablets after the streaming stage.
+        // There is an overlap in tablet sizes (we report sizes on both the leaving and pending
+        // replicas for some stages), but that should not be a problem.
+        auto tablet_size_filter = [&] () {
+            // if tablet is not in transit, it's filtered in.
+            if (!transition) {
+                return true;
+            }
+
+            if (is_leaving) {
+                return has_size_on_leaving(transition->stage);
+            } else if (is_pending) {
+                return has_size_on_pending(transition->stage);
+            }
+
+            return true;
+        };
+
+        if (table_size_filter()) {
            table_stats.size_in_bytes += tablet_size;
+        }
+
+        if (tablet_size_filter()) {
            const dht::token_range trange = _tablet_map->get_token_range(gid.tablet);
            // Make sure the token range is in the form (a, b]
            SCYLLA_ASSERT(!trange.start()->is_inclusive() && trange.end()->is_inclusive());
@@ -2956,8 +3057,8 @@ locator::combined_load_stats tablet_storage_group_manager::table_load_stats(std:
    };
 }

-locator::combined_load_stats table::table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const {
-    return _sg_manager->table_load_stats(std::move(tablet_filter));
+locator::combined_load_stats table::table_load_stats() const {
+    return _sg_manager->table_load_stats();
 }

 void tablet_storage_group_manager::handle_tablet_split_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap) {
@@ -3069,7 +3170,9 @@ future<> tablet_storage_group_manager::merge_completion_fiber() {
    }
 }

-void tablet_storage_group_manager::handle_tablet_merge_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap) {
+void tablet_storage_group_manager::handle_tablet_merge_completion(locator::effective_replication_map_ptr old_erm,
+                                                                  const locator::tablet_map& old_tmap,
+                                                                  const locator::tablet_map& new_tmap) {
    auto table_id = schema()->id();
    size_t old_tablet_count = old_tmap.tablet_count();
    size_t new_tablet_count = new_tmap.tablet_count();
@@ -3093,7 +3196,7 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(const locator:
        auto new_cg = make_lw_shared<compaction_group>(_t, new_tid, new_range, make_repair_sstable_classifier_func());
        for (auto& view : new_cg->all_views()) {
            auto cre = _t.get_compaction_manager().stop_and_disable_compaction_no_wait(*view, "tablet merging");
-            _compaction_reenablers_for_merging.push_back(std::move(cre));
+            _compaction_reenablers_for_merging.push_back(background_merge_guard{std::move(cre), old_erm});
        }
        auto new_sg = make_lw_shared<storage_group>(std::move(new_cg));

@@ -3126,7 +3229,11 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(const locator:
    _merge_completion_event.signal();
 }

-void tablet_storage_group_manager::update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) {
+void tablet_storage_group_manager::update_effective_replication_map(
+        const locator::effective_replication_map_ptr& old_erm,
+        const locator::effective_replication_map& erm,
+        noncopyable_function<void()> refresh_mutation_source)
+{
    auto* new_tablet_map = &erm.get_token_metadata().tablets().get_tablet_map(schema()->id());
    auto* old_tablet_map = std::exchange(_tablet_map, new_tablet_map);

@@ -3142,7 +3249,7 @@ void tablet_storage_group_manager::update_effective_replication_map(const locato
        if (utils::get_local_injector().is_enabled("tablet_force_tablet_count_decrease_once")) {
            utils::get_local_injector().disable("tablet_force_tablet_count_decrease");
        }
-        handle_tablet_merge_completion(*old_tablet_map, *new_tablet_map);
+        handle_tablet_merge_completion(old_erm, *old_tablet_map, *new_tablet_map);
    }

    // Allocate storage group if tablet is migrating in, or deallocate if it's migrating out.
@@ -3228,7 +3335,7 @@ void table::update_effective_replication_map(locator::effective_replication_map_
    };

    if (uses_tablets()) {
-        _sg_manager->update_effective_replication_map(*_erm, refresh_mutation_source);
+        _sg_manager->update_effective_replication_map(old_erm, *_erm, refresh_mutation_source);
    }
    if (old_erm) {
        old_erm->invalidate();
@@ -3690,7 +3797,6 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, c
        tlogger.debug("Taking snapshot of {}.{}: name={}", s->ks_name(), s->cf_name(), name);

        std::vector<snapshot_sstable_set> sstable_sets(smp::count);
-        std::vector<int64_t> tablet_counts(smp::count);

        co_await writer->init();
        co_await smp::invoke_on_all([&] -> future<> {
@@ -3698,7 +3804,6 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, c
            auto [tables, permit] = co_await t.snapshot_sstables();
            auto sstables_metadata = co_await t.get_sstables_manager().take_snapshot(std::move(tables), name);
            sstable_sets[this_shard_id()] = make_foreign(std::make_unique<utils::chunked_vector<sstables::sstable_snapshot_metadata>>(std::move(sstables_metadata)));
-            tablet_counts[this_shard_id()] = t.calculate_tablet_count();
        });
        co_await writer->sync();

@@ -3712,12 +3817,13 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, c
        });
        tlogger.debug("snapshot {}: seal_snapshot", name);
        const auto& topology = sharded_db.local().get_token_metadata().get_topology();
-        std::optional<int64_t> min_tablet_count;
+        std::optional<int64_t> tablet_count;
        if (t.uses_tablets()) {
-            SCYLLA_ASSERT(!tablet_counts.empty());
-            min_tablet_count = *std::ranges::min_element(tablet_counts);
+            auto erm = t.get_effective_replication_map();
+            auto& tm = erm->get_token_metadata().tablets().get_tablet_map(s->id());
+            tablet_count = tm.tablet_count();
        }
-        co_await write_manifest(topology, *writer, std::move(sstable_sets), name, std::move(opts), s, min_tablet_count).handle_exception([&] (std::exception_ptr ptr) {
+        co_await write_manifest(topology, *writer, std::move(sstable_sets), name, std::move(opts), s, tablet_count).handle_exception([&] (std::exception_ptr ptr) {
            tlogger.error("Failed to seal snapshot in {}: {}.", name, ptr);
            ex = std::move(ptr);
        });
@@ -3775,6 +3881,7 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
            }

            auto lister = directory_lister(snapshots_dir, lister::dir_entry_types::of<directory_entry_type::directory>());
+            auto close_lister = deferred_close(lister);
            while (auto de = lister.get().get()) {
                auto snapshot_name = de->name;
                all_snapshots.emplace(snapshot_name, snapshot_details());
@@ -3782,6 +3889,9 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
                auto& sd = all_snapshots.at(snapshot_name);
                sd.total += details.total;
                sd.live += details.live;
+                utils::get_local_injector().inject("get_snapshot_details", [&] (auto& handler) -> future<> {
+                    throw std::runtime_error("Injected exception in get_snapshot_details");
+                }).get();
            }
        }
        return all_snapshots;
@@ -3801,53 +3911,66 @@ future<table::snapshot_details> table::get_snapshot_details(fs::path snapshot_di
    }

    auto lister = directory_lister(snapshot_directory, snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>());
-    while (auto de = co_await lister.get()) {
-        const auto& name = de->name;
-        future<stat_data> (&file_stat)(file& directory, std::string_view name, follow_symlink) noexcept = seastar::file_stat;
-        auto sd = co_await io_check(file_stat, snapshot_directory, name, follow_symlink::no);
-        auto size = sd.allocated_size;
+    std::exception_ptr ex;
+    try {
+        while (auto de = co_await lister.get()) {
+            const auto& name = de->name;
+            future<stat_data> (&file_stat)(file& directory, std::string_view name, follow_symlink) noexcept = seastar::file_stat;
+            auto sd = co_await io_check(file_stat, snapshot_directory, name, follow_symlink::no);
+            auto size = sd.allocated_size;

-        // The manifest and schema.sql files are the only files expected to be in this directory not belonging to the SSTable.
-        //
-        // All the others should just generate an exception: there is something wrong, so don't blindly
-        // add it to the size.
-        if (name != "manifest.json" && name != "schema.cql") {
-            details.total += size;
-            if (sd.number_of_links == 1) {
-                // File exists only in the snapshot directory.
-                details.live += size;
+            utils::get_local_injector().inject("per-snapshot-get_snapshot_details", [&] (auto& handler) -> future<> {
+                throw std::runtime_error("Injected exception in per-snapshot-get_snapshot_details");
+            }).get();
+
+            // The manifest and schema.cql files are the only files expected to be in this directory not belonging to the SSTable.
+            //
+            // All the others should just generate an exception: there is something wrong, so don't blindly
+            // add it to the size.
+            if (name != "manifest.json" && name != "schema.cql") {
+                details.total += size;
+                if (sd.number_of_links == 1) {
+                    // File exists only in the snapshot directory.
+                    details.live += size;
+                    continue;
+                }
+                // If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
+                // So check the datadir for the file too.
+            } else {
                continue;
            }
-            // If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
-            // So check the datadir for the file too.
-        } else {
-            continue;
-        }

-        auto exists_in_dir = [&] (file& dir, const fs::path& path, std::string_view name) -> future<bool> {
-          try {
-            // File exists in the main SSTable directory. Snapshots are not contributing to size
-            auto psd = co_await io_check(file_stat, dir, name, follow_symlink::no);
-            // File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
-            if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
-                dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
-                        (path / name).native(), psd.device_id, psd.inode_number, psd.size,
-                        (snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
+            auto exists_in_dir = [&] (file& dir, const fs::path& path, std::string_view name) -> future<bool> {
+              try {
+                // File exists in the main SSTable directory. Snapshots are not contributing to size
+                auto psd = co_await io_check(file_stat, dir, name, follow_symlink::no);
+                // File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
+                if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
+                    dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
+                            (path / name).native(), psd.device_id, psd.inode_number, psd.size,
+                            (snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
+                    co_return false;
+                }
+                co_return true;
+              } catch (std::system_error& e) {
+                if (e.code() != std::error_code(ENOENT, std::system_category())) {
+                    throw;
+                }
                co_return false;
+              }
+            };
+            // Check staging dir first, as files might be moved from there to the datadir concurrently to this check
+            if ((!staging_dir || !co_await exists_in_dir(staging_directory, *staging_dir, name)) &&
+                    !co_await exists_in_dir(data_directory, datadir, name)) {
+                details.live += size;
            }
-            co_return true;
-          } catch (std::system_error& e) {
-            if (e.code() != std::error_code(ENOENT, std::system_category())) {
-                throw;
-            }
-            co_return false;
-          }
-        };
-        // Check staging dir first, as files might be moved from there to the datadir concurrently to this check
-        if ((!staging_dir || !co_await exists_in_dir(staging_directory, *staging_dir, name)) &&
-                !co_await exists_in_dir(data_directory, datadir, name)) {
-            details.live += size;
        }
+    } catch (...) {
+        ex = std::current_exception();
+    }
+    co_await lister.close();
+    if (ex) {
+        co_await coroutine::return_exception_ptr(std::move(ex));
    }

    co_return details;
--- a/schema/schema_builder.hh
+++ b/schema/schema_builder.hh
@@ -263,8 +263,9 @@ public:
    void enable_schema_commitlog() {
        _static_props.enable_schema_commitlog();
    }
-    void set_is_group0_table(bool enabled = true) {
-        _static_props.is_group0_table = enabled;
+    void set_is_group0_table() {
+        _static_props.is_group0_table = true;
+        enable_schema_commitlog(); // group0 tables are required to use the schema commitlog
    }

    class default_names {
--- a/service/paxos/paxos_state.cc
+++ b/service/paxos/paxos_state.cc
@@ -454,7 +454,7 @@ static future<cql3::untyped_result_set> do_execute_cql_with_timeout(sstring req,
    auto ps_ptr = qp.get_prepared(cache_key);
    if (!ps_ptr) {
        const auto msg_ptr = co_await qp.prepare(req, qs, cql3::internal_dialect());
-        ps_ptr = std::move(msg_ptr->get_prepared());
+        ps_ptr = msg_ptr->get_prepared();
        if (!ps_ptr) {
            on_internal_error(paxos_state::logger, "prepared statement is null");
        }
--- a/service/raft/group0_state_machine.cc
+++ b/service/raft/group0_state_machine.cc
@@ -350,6 +350,10 @@ static void ensure_group0_schema(const group0_command& cmd, const replica::datab
            if (!schema->static_props().is_group0_table) {
                on_internal_error(slogger, fmt::format("ensure_group0_schema: schema is not group0: {}", schema->cf_name()));
            }
+
+            if (!schema->static_props().use_schema_commitlog) {
+                on_internal_error(slogger, fmt::format("ensure_group0_schema: group0 table {} does not use schema commitlog", schema->cf_name()));
+            }
        }
    };

--- a/service/raft/raft_group0.cc
+++ b/service/raft/raft_group0.cc
@@ -559,6 +559,7 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
        group0_id = g0_info.group0_id;
        raft::server_address my_addr{my_id, {}};

+        bool starting_server_as_follower = false;
        if (server == nullptr) {
            // This is the first time discovery is run. Create and start a Raft server for group 0 on this node.
            raft::configuration initial_configuration;
@@ -586,6 +587,7 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
                // trigger an empty snapshot transfer.
                nontrivial_snapshot = true;
            } else {
+                starting_server_as_follower = true;
                co_await handshaker->pre_server_start(g0_info);
            }

@@ -614,7 +616,9 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
        }

        SCYLLA_ASSERT(server);
-        if (server->get_configuration().contains(my_id)) {
+        co_await utils::get_local_injector().inject("join_group0_pause_before_config_check",
+                utils::wait_for_message(std::chrono::minutes{5}));
+        if (!starting_server_as_follower && server->get_configuration().contains(my_id)) {
            // True if we started a new group or completed a configuration change initiated earlier.
            group0_log.info("server {} already in group 0 (id {}) as {}", my_id, group0_id,
                    server->get_configuration().can_vote(my_id)? "voter" : "non-voter");
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -6156,6 +6156,57 @@ future<> storage_service::snitch_reconfigured() {
    }
 }

+future<> storage_service::local_topology_barrier() {
+    if (this_shard_id() != 0) { // not on shard 0: forward the call there and return
+        co_await container().invoke_on(0, [] (storage_service& ss) {
+            return ss.local_topology_barrier();
+        });
+        co_return;
+    }
+    // Read the version before this coroutine first yields: the topology may be updated while the barrier runs.
+    auto version = _topology_state_machine._topology.version;
+
+    utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail_before", [] {
+        throw std::runtime_error("raft_topology_barrier_and_drain_fail_before injected exception");
+    });
+
+    co_await utils::get_local_injector().inject("pause_before_barrier_and_drain", utils::wait_for_message(std::chrono::minutes(5)));
+    if (_topology_state_machine._topology.tstate == topology::transition_state::write_both_read_old) {
+        for (auto& n : _topology_state_machine._topology.transition_nodes) {
+            if (!_address_map.find(locator::host_id{n.first.uuid()})) {
+                rtlogger.error("The topology transition is in a double write state but the IP of the node in transition is not known");
+                break; // only logged (once); the barrier still proceeds
+            }
+        }
+    }
+    // On every shard: fail if the shard's token metadata version differs from the one read above, then await the drains.
+    co_await container().invoke_on_all([version] (storage_service& ss) -> future<> {
+        const auto current_version = ss._shared_token_metadata.get()->get_version();
+        rtlogger.info("Got raft_topology_cmd::barrier_and_drain, version {}, current version {}",
+                      version, current_version);
+
+        // This shouldn't happen under normal operation, it's only plausible
+        // if the topology change coordinator has
+        // moved to another node and managed to update the topology
+        // parallel to this method. The previous coordinator
+        // should be inactive now, so it won't observe this
+        // exception. By returning exception we aim
+        // to reveal any other conditions where this may arise.
+        if (current_version != version) {
+            co_await coroutine::return_exception(std::runtime_error(
+                    ::format("raft topology: command::barrier_and_drain, the version has changed, "
+                             "version {}, current_version {}, the topology change coordinator "
+                             " had probably migrated to another node",
+                             version, current_version)));
+        }
+
+        co_await ss._shared_token_metadata.stale_versions_in_use();
+        co_await get_topology_session_manager().drain_closing_sessions();
+
+        rtlogger.info("raft_topology_cmd::barrier_and_drain done");
+    });
+}
+
 future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft::term_t term, uint64_t cmd_index, const raft_topology_cmd& cmd) {
    raft_topology_cmd_result result;
    rtlogger.info("topology cmd rpc {} is called index={}", cmd.cmd, cmd_index);
@@ -6183,12 +6234,6 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
            state.last_index = cmd_index;
        }

-        // We capture the topology version right after the checks
-        // above, before any yields. This is crucial since _topology_state_machine._topology
-        // might be altered concurrently while this method is running,
-        // which can cause the fence command to apply an invalid fence version.
-        const auto version = _topology_state_machine._topology.version;
-
        switch (cmd.cmd) {
            case raft_topology_cmd::command::barrier: {
                utils::get_local_injector().inject("raft_topology_barrier_fail",
@@ -6227,43 +6272,7 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
            }
            break;
            case raft_topology_cmd::command::barrier_and_drain: {
-                utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail_before", [] {
-                    throw std::runtime_error("raft_topology_barrier_and_drain_fail_before injected exception");
-                });
-                co_await utils::get_local_injector().inject("pause_before_barrier_and_drain", utils::wait_for_message(std::chrono::minutes(5)));
-                if (_topology_state_machine._topology.tstate == topology::transition_state::write_both_read_old) {
-                    for (auto& n : _topology_state_machine._topology.transition_nodes) {
-                        if (!_address_map.find(locator::host_id{n.first.uuid()})) {
-                            rtlogger.error("The topology transition is in a double write state but the IP of the node in transition is not known");
-                            break;
-                        }
-                    }
-                }
-                co_await container().invoke_on_all([version] (storage_service& ss) -> future<> {
-                    const auto current_version = ss._shared_token_metadata.get()->get_version();
-                    rtlogger.info("Got raft_topology_cmd::barrier_and_drain, version {}, current version {}",
-                        version, current_version);
-
-                    // This shouldn't happen under normal operation, it's only plausible
-                    // if the topology change coordinator has
-                    // moved to another node and managed to update the topology
-                    // parallel to this method. The previous coordinator
-                    // should be inactive now, so it won't observe this
-                    // exception. By returning exception we aim
-                    // to reveal any other conditions where this may arise.
-                    if (current_version != version) {
-                        co_await coroutine::return_exception(std::runtime_error(
-                            ::format("raft topology: command::barrier_and_drain, the version has changed, "
-                                     "version {}, current_version {}, the topology change coordinator "
-                                     " had probably migrated to another node",
-                                version, current_version)));
-                    }
-
-                    co_await ss._shared_token_metadata.stale_versions_in_use();
-                    co_await get_topology_session_manager().drain_closing_sessions();
-
-                    rtlogger.info("raft_topology_cmd::barrier_and_drain done");
-                });
+                co_await local_topology_barrier();

                co_await utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail", [this] (auto& handler) -> future<> {
                    auto ks = handler.get("keyspace");
@@ -7359,34 +7368,8 @@ future<locator::load_stats> storage_service::load_stats_for_tablet_based_tables(
            if (!table) {
                continue;
            }
-            auto erm = table->get_effective_replication_map();
-            auto& token_metadata = erm->get_token_metadata();
-            auto me = locator::tablet_replica { token_metadata.get_my_id(), this_shard_id() };

-            // It's important to tackle the anomaly in reported size, since both leaving and
-            // pending replicas could otherwise be accounted during tablet migration.
-            // If transition hasn't reached cleanup stage, then leaving replicas are accounted.
-            // If transition is past cleanup stage, then pending replicas are accounted.
-            // This helps to reduce the discrepancy window.
-            auto tablet_filter = [&me] (const locator::tablet_map& tmap, locator::global_tablet_id id) {
-                auto transition = tmap.get_tablet_transition_info(id.tablet);
-                auto& info = tmap.get_tablet_info(id.tablet);
-
-                // if tablet is not in transit, it's filtered in.
-                if (!transition) {
-                    return true;
-                }
-
-                bool is_pending = transition->pending_replica == me;
-                bool is_leaving = locator::get_leaving_replica(info, *transition) == me;
-                auto s = transition->reads; // read selector
-
-                return (!is_pending && !is_leaving)
-                       || (is_leaving && s == locator::read_replica_set_selector::previous)
-                       || (is_pending && s == locator::read_replica_set_selector::next);
-            };
-
-            locator::combined_load_stats combined_ls { table->table_load_stats(tablet_filter) };
+            locator::combined_load_stats combined_ls { table->table_load_stats() };
            load_stats.tables.emplace(id, std::move(combined_ls.table_ls));
            tablet_sizes_per_shard[this_shard_id()].size += load_stats.tablet_stats[this_host].add_tablet_sizes(combined_ls.tablet_ls);

--- a/service/storage_service.hh
+++ b/service/storage_service.hh
@@ -944,6 +944,9 @@ public:
    future<bool> ongoing_rf_change(const group0_guard& guard, sstring ks) const;
    future<> raft_initialize_discovery_leader(const join_node_request_params& params);
    future<> initialize_done_topology_upgrade_state();
+    // Does the local part of global_token_metadata_barrier(), without a raft group0 barrier.
+    // In particular, waits for non-latest local erms to die.
+    future<> local_topology_barrier();
 private:
     // State machine that is responsible for topology change
    topology_state_machine& _topology_state_machine;
--- a/service/task_manager_module.cc
+++ b/service/task_manager_module.cc
@@ -21,7 +21,6 @@ namespace service {

 struct status_helper {
    tasks::task_status status;
-    utils::chunked_vector<locator::tablet_id> tablets;
    std::optional<locator::tablet_replica> pending_replica;
 };

@@ -148,18 +147,40 @@ future<std::optional<tasks::task_status>> tablet_virtual_task::wait(tasks::task_
    }

    tasks::tmlogger.info("tablet_virtual_task: wait until tablet operation is finished");
-    co_await _ss._topology_state_machine.event.wait([&] {
-        auto& tmap = _ss.get_token_metadata().tablets().get_tablet_map(table);
-        if (is_resize_task(task_type)) {    // Resize task.
-            return tmap.resize_task_info().tablet_task_id.uuid() != id.uuid();
-        } else if (tablet_id_opt.has_value()) {    // Migration task.
-            return tmap.get_tablet_info(tablet_id_opt.value()).migration_task_info.tablet_task_id.uuid() != id.uuid();
-        } else {    // Repair task.
-            return std::all_of(res->tablets.begin(), res->tablets.end(), [&] (const locator::tablet_id& tablet) {
-                return tmap.get_tablet_info(tablet).repair_task_info.tablet_task_id.uuid() != id.uuid();
-            });
+    co_await utils::get_local_injector().inject("tablet_virtual_task_wait", utils::wait_for_message(60s));
+    while (true) {
+        co_await _ss._topology_state_machine.event.wait([&] {
+            if (!_ss.get_token_metadata().tablets().has_tablet_map(table)) {
+                return true;
+            }
+            auto& tmap = _ss.get_token_metadata().tablets().get_tablet_map(table);
+            if (is_resize_task(task_type)) {    // Resize task.
+                return tmap.resize_task_info().tablet_task_id.uuid() != id.uuid();
+            } else if (tablet_id_opt.has_value()) {    // Migration task.
+                return tmap.get_tablet_info(tablet_id_opt.value()).migration_task_info.tablet_task_id.uuid() != id.uuid();
+            } else {    // Repair task.
+                return true;
+            }
+        });
+
+        if (!is_repair_task(task_type)) {
+            break;
        }
-    });
+
+        auto tmptr = _ss.get_token_metadata_ptr();
+        if (!_ss.get_token_metadata().tablets().has_tablet_map(table)) {
+            break;
+        }
+        auto& tmap = tmptr->tablets().get_tablet_map(table);
+        bool repair_still_running = false;
+        co_await tmap.for_each_tablet([&] (locator::tablet_id tid, const locator::tablet_info& info) {
+            repair_still_running = repair_still_running || (info.repair_task_info.is_valid() && info.repair_task_info.tablet_task_id.uuid() == id.uuid());
+            return make_ready_future();
+        });
+        if (!repair_still_running) {
+            break;
+        }
+    }

    res->status.state = tasks::task_manager::task_state::done; // Failed repair task is retried.
    if (is_migration_task(task_type)) {
@@ -169,9 +190,9 @@ future<std::optional<tasks::task_status>> tablet_virtual_task::wait(tasks::task_
    } else if (is_resize_task(task_type)) {
        auto new_tablet_count = _ss.get_token_metadata().tablets().get_tablet_map(table).tablet_count();
        res->status.state = new_tablet_count == tablet_count ? tasks::task_manager::task_state::suspended : tasks::task_manager::task_state::done;
-        res->status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{};
+        res->status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, _ss.get_token_metadata_ptr()) : utils::chunked_vector<tasks::task_identity>{};
    } else {
-        res->status.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()));
+        res->status.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr());
    }
    res->status.end_time = db_clock::now(); // FIXME: Get precise end time.
    co_return res->status;
@@ -257,6 +278,7 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
    auto& tmap = tmptr->tablets().get_tablet_map(table);
    bool repair_task_finished = false;
    bool repair_task_pending = false;
+    bool no_tablets_processed = true;
    if (is_repair_task(task_type)) {
        auto progress = co_await _ss._repair.local().get_tablet_repair_task_progress(id);
        if (progress) {
@@ -273,37 +295,37 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
            auto& task_info = info.repair_task_info;
            if (task_info.tablet_task_id.uuid() == id.uuid()) {
                update_status(task_info, res.status, sched_nr);
-                res.tablets.push_back(tid);
+                no_tablets_processed = false;
            }
            return make_ready_future();
        });
-        res.status.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()));
+        res.status.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr());
    } else if (is_migration_task(task_type)) {    // Migration task.
        auto tablet_id = hint.get_tablet_id();
        res.pending_replica = tmap.get_tablet_transition_info(tablet_id)->pending_replica;
        auto& task_info = tmap.get_tablet_info(tablet_id).migration_task_info;
        if (task_info.tablet_task_id.uuid() == id.uuid()) {
            update_status(task_info, res.status, sched_nr);
-            res.tablets.push_back(tablet_id);
+            no_tablets_processed = false;
        }
    } else {    // Resize task.
        auto& task_info = tmap.resize_task_info();
        if (task_info.tablet_task_id.uuid() == id.uuid()) {
            update_status(task_info, res.status, sched_nr);
            res.status.state = tasks::task_manager::task_state::running;
-            res.status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{};
+            res.status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, _ss.get_token_metadata_ptr()) : utils::chunked_vector<tasks::task_identity>{};
            co_return res;
        }
    }

-    if (!res.tablets.empty()) {
+    if (!no_tablets_processed) {
        res.status.state = sched_nr == 0 ? tasks::task_manager::task_state::created : tasks::task_manager::task_state::running;
        co_return res;
    }

    if (repair_task_pending) {
        // When repair_task_pending is true, the res.tablets will be empty iff the request is aborted by user.
-        res.status.state = res.tablets.empty() ? tasks::task_manager::task_state::failed : tasks::task_manager::task_state::running;
+        res.status.state = no_tablets_processed ? tasks::task_manager::task_state::failed : tasks::task_manager::task_state::running;
        co_return res;
    }
    if (repair_task_finished) {
--- a/service/topology_coordinator.cc
+++ b/service/topology_coordinator.cc
@@ -2193,6 +2193,19 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
                _tablet_allocator.set_load_stats(reconciled_stats);
            }
        }
+
+        // Wait for the background storage group merge to finish before releasing the state machine.
+        // Background merge holds the old erm, so a successful barrier joins with it.
+        // This guarantees that the background merge doesn't run concurrently with the next merge.
+        // Replica-side storage group merge takes compaction locks on the tablet's main compaction group, released
+        // by the background merge. If the next merge starts before the background merge finishes, it can cause a deadlock.
+        // The background merge fiber will try to stop a compaction group which is locked, and the lock is held
+        // by the background merge fiber.
+        tm = nullptr;
+        if (!guard) {
+            guard = co_await start_operation();
+        }
+        co_await global_tablet_token_metadata_barrier(std::move(guard));
    }

    future<> handle_truncate_table(group0_guard guard) {
--- a/sstables/index_entry.hh
+++ b/sstables/index_entry.hh
@@ -201,95 +201,47 @@ public:
    virtual future<std::optional<entry_info>> next_entry() = 0;
 };

-// Allocated inside LSA.
-class promoted_index {
-    deletion_time _del_time;
-    uint64_t _promoted_index_start;
-    uint32_t _promoted_index_size;
-    uint32_t _num_blocks;
-public:
-    promoted_index(const schema& s,
-        deletion_time del_time,
-        uint64_t promoted_index_start,
-        uint32_t promoted_index_size,
-        uint32_t num_blocks)
-            : _del_time{del_time}
-            , _promoted_index_start(promoted_index_start)
-            , _promoted_index_size(promoted_index_size)
-            , _num_blocks(num_blocks)
-    { }
-
-    [[nodiscard]] deletion_time get_deletion_time() const { return _del_time; }
-    [[nodiscard]] uint32_t get_promoted_index_size() const { return _promoted_index_size; }
-
-    // Call under allocating_section.
-    // For sstable versions >= mc the returned cursor will be of type `bsearch_clustered_cursor`.
-    std::unique_ptr<clustered_index_cursor> make_cursor(shared_sstable,
-        reader_permit,
-        tracing::trace_state_ptr,
-        file_input_stream_options,
-        use_caching);
+// Promoted index information produced by the parser.
+struct parsed_promoted_index_entry {
+    deletion_time del_time;
+    uint64_t promoted_index_start;
+    uint32_t promoted_index_size;
+    uint32_t num_blocks;
 };

+using promoted_index = parsed_promoted_index_entry;
+
 // A partition index element.
 // Allocated inside LSA.
-class index_entry {
-private:
-    managed_bytes _key;
-    mutable std::optional<dht::token> _token;
-    uint64_t _position;
-    managed_ref<promoted_index> _index;
+struct [[gnu::packed]] index_entry {
+    mutable int64_t raw_token;
+    uint64_t data_file_offset;
+    uint32_t key_offset;

-public:
-
-    key_view get_key() const {
-        return key_view{_key};
-    }
-
-    // May allocate so must be called under allocating_section.
-    decorated_key_view get_decorated_key(const schema& s) const {
-        if (!_token) {
-            _token.emplace(s.get_partitioner().get_token(get_key()));
-        }
-        return decorated_key_view(*_token, get_key());
-    }
-
-    uint64_t position() const { return _position; };
-
-    std::optional<deletion_time> get_deletion_time() const {
-        if (_index) {
-            return _index->get_deletion_time();
-        }
-
-        return {};
-    }
-
-    index_entry(managed_bytes&& key, uint64_t position, managed_ref<promoted_index>&& index)
-        : _key(std::move(key))
-        , _position(position)
-        , _index(std::move(index))
-    {}
-
-    index_entry(index_entry&&) = default;
-    index_entry& operator=(index_entry&&) = default;
-
-    // Can be nullptr
-    const managed_ref<promoted_index>& get_promoted_index() const { return _index; }
-    managed_ref<promoted_index>& get_promoted_index() { return _index; }
-    uint32_t get_promoted_index_size() const { return _index ? _index->get_promoted_index_size() : 0; }
-
-    size_t external_memory_usage() const {
-        return _key.external_memory_usage() + _index.external_memory_usage();
-    }
+    uint64_t position() const { return data_file_offset; }
+    dht::raw_token token() const { return dht::raw_token(raw_token); }
 };

+// Required for optimized LSA migration of storage of managed_vector.
+static_assert(std::is_trivially_move_assignable_v<index_entry>);
+static_assert(std::is_trivially_move_assignable_v<parsed_promoted_index_entry>);
+
 // A partition index page.
 //
 // Allocated in the standard allocator space but with an LSA allocator as the current allocator.
 // So the shallow part is in the standard allocator but all indirect objects are inside LSA.
 class partition_index_page {
 public:
-    lsa::chunked_managed_vector<managed_ref<index_entry>> _entries;
+    lsa::chunked_managed_vector<index_entry> _entries;
+    managed_bytes _key_storage;
+
+    // Stores promoted index information of index entries.
+    // The i-th element corresponds to the i-th entry in _entries.
+    // Can be smaller than _entries. If _entries[i] doesn't have a matching element in _promoted_indexes then
+    // that entry doesn't have a promoted index.
+    // Kept separately to avoid paying for storage cost in pages where no entry has a promoted index,
+    // which is typical in workloads with small partitions.
+    lsa::chunked_managed_vector<promoted_index> _promoted_indexes;
 public:
    partition_index_page() = default;
    partition_index_page(partition_index_page&&) noexcept = default;
@@ -298,15 +250,68 @@ public:
    bool empty() const { return _entries.empty(); }
    size_t size() const { return _entries.size(); }

+    stop_iteration clear_gently() {
+        // Vectors have trivial storage, so are fast to destroy.
+        return stop_iteration::yes;
+    }
+
    void clear_one_entry() {
        _entries.pop_back();
    }

+    /// Returns true if the i-th entry has a promoted index, i.e. i is within the bounds of
+    /// _promoted_indexes and the element stored there has a non-zero promoted_index_size.
+    /// Safe to call for any i; out-of-range indexes simply yield false.
+    bool has_promoted_index(size_t i) const {
+        return i < _promoted_indexes.size() && _promoted_indexes[i].promoted_index_size > 0;
+    }
+
+    /// Get promoted index for the i-th entry.
+    /// Call only when has_promoted_index(i) is true.
+    const promoted_index& get_promoted_index(size_t i) const {
+        return _promoted_indexes[i];
+    }
+
+    /// Get promoted index for the i-th entry.
+    /// Call only when has_promoted_index(i) is true.
+    promoted_index& get_promoted_index(size_t i) {
+        return _promoted_indexes[i];
+    }
+
+    /// Get promoted index size for the i-th entry.
+    uint32_t get_promoted_index_size(size_t i) const {
+        return has_promoted_index(i) ? get_promoted_index(i).promoted_index_size : 0;
+    }
+
+    /// Get deletion_time for partition represented by the i-th entry.
+    /// Returns disengaged optional if the entry doesn't have a promoted index, so we don't know the deletion_time.
+    /// It has to be read from the data file.
+    std::optional<deletion_time> get_deletion_time(size_t i) const {
+        if (has_promoted_index(i)) {
+            return get_promoted_index(i).del_time;
+        }
+        return {};
+    }
+
+    /// Returns a view of the key of the i-th entry.
+    /// The key bytes live in _key_storage: the key of entry i starts at _entries[i].key_offset
+    /// and ends where the key of entry i + 1 starts, or at the end of _key_storage for the
+    /// last entry.
+    key_view get_key(size_t i) const {
+        auto start = _entries[i].key_offset;
+        auto end = i + 1 < _entries.size() ? _entries[i + 1].key_offset : _key_storage.size();
+        // Cut the view down to [start, end): first drop everything past `end`, then the head.
+        auto v = managed_bytes_view(_key_storage).prefix(end);
+        v.remove_prefix(start);
+        return key_view(v);
+    }
+
+    /// Returns the decorated key (token + key view) of the i-th entry.
+    /// When the token stored in the entry converts to false, the token is computed with the
+    /// schema's partitioner and written back into the entry's mutable raw_token field, so
+    /// later calls for the same entry reuse the stored value.
+    decorated_key_view get_decorated_key(const schema& s, size_t i) const {
+        auto key = get_key(i);
+        auto t = _entries[i].token();
+        // NOTE(review): presumably a false-y raw_token means "not computed yet" - confirm.
+        if (!t) {
+            t = dht::raw_token(s.get_partitioner().get_token(key));
+            _entries[i].raw_token = t.value;
+        }
+        return decorated_key_view(dht::token(t), key);
+    }
+
    size_t external_memory_usage() const {
        size_t size = _entries.external_memory_usage();
-        for (auto&& e : _entries) {
-            size += sizeof(index_entry) + e->external_memory_usage();
-        }
+        size += _promoted_indexes.external_memory_usage();
+        size += _key_storage.external_memory_usage();
        return size;
    }
 };
--- a/sstables/index_reader.hh
+++ b/sstables/index_reader.hh
@@ -25,14 +25,6 @@ namespace sstables {
 extern seastar::logger sstlog;
 extern thread_local mc::cached_promoted_index::metrics promoted_index_cache_metrics;

-// Promoted index information produced by the parser.
-struct parsed_promoted_index_entry {
-    deletion_time del_time;
-    uint64_t promoted_index_start;
-    uint32_t promoted_index_size;
-    uint32_t num_blocks;
-};
-
 // Partition index entry information produced by the parser.
 struct parsed_partition_index_entry {
    temporary_buffer<char> key;
@@ -53,9 +45,10 @@ class index_consumer {
    schema_ptr _s;
    logalloc::allocating_section _alloc_section;
    logalloc::region& _region;
+    utils::chunked_vector<parsed_partition_index_entry> _parsed_entries;
+    size_t _max_promoted_index_entry_plus_one = 0; // Highest index +1 in _parsed_entries which has a promoted index.
+    size_t _key_storage_size = 0;
 public:
-    index_list indexes;
-
    index_consumer(logalloc::region& r, schema_ptr s)
        : _s(s)
        , _alloc_section(abstract_formatter([s] (fmt::format_context& ctx) {
@@ -64,36 +57,63 @@ public:
        , _region(r)
    { }

-    ~index_consumer() {
-        with_allocator(_region.allocator(), [&] {
-            indexes._entries.clear_and_release();
-        });
+    /// Buffers one parsed partition index entry until finalize() is called.
+    /// Accumulates the total size of all keys seen so far and remembers the position
+    /// (plus one) of the last buffered entry which carries a promoted index.
+    void consume_entry(parsed_partition_index_entry&& e) {
+        _key_storage_size += e.key.size();
+        // Sample the flag before `e` is moved from, so that no moved-from state is inspected.
+        const bool has_promoted_index = static_cast<bool>(e.promoted_index);
+        _parsed_entries.emplace_back(std::move(e));
+        if (has_promoted_index) {
+            _max_promoted_index_entry_plus_one = std::max(_max_promoted_index_entry_plus_one, _parsed_entries.size());
+        }
    }

-    void consume_entry(parsed_partition_index_entry&& e) {
-        _alloc_section(_region, [&] {
+    /// Converts the entries buffered by consume_entry() into an index_list.
+    /// Storage of the result (entries, promoted indexes and the concatenated key bytes) is
+    /// allocated with the region's allocator inside the allocating section. Entries are
+    /// processed in batches: a batch ends when need_preempt() is true, after which the
+    /// coroutine may yield before continuing. On success the buffered entries are cleared.
+    future<index_list> finalize() {
+        index_list result;
+        // In case of exception, need to deallocate under region allocator.
+        auto delete_result = seastar::defer([&] {
            with_allocator(_region.allocator(), [&] {
-                managed_ref<promoted_index> pi;
-                if (e.promoted_index) {
-                    pi = make_managed<promoted_index>(*_s,
-                            e.promoted_index->del_time,
-                            e.promoted_index->promoted_index_start,
-                            e.promoted_index->promoted_index_size,
-                            e.promoted_index->num_blocks);
-                }
-                auto key = managed_bytes(reinterpret_cast<const bytes::value_type*>(e.key.get()), e.key.size());
-                indexes._entries.emplace_back(make_managed<index_entry>(std::move(key), e.data_file_offset, std::move(pi)));
+                result._entries = {};
+                result._promoted_indexes = {};
+                result._key_storage = {};
            });
        });
+        auto i = _parsed_entries.begin();
+        size_t key_offset = 0;
+        while (i != _parsed_entries.end()) {
+            _alloc_section(_region, [&] {
+                with_allocator(_region.allocator(), [&] {
+                    result._entries.reserve(_parsed_entries.size());
+                    result._promoted_indexes.resize(_max_promoted_index_entry_plus_one);
+                    if (result._key_storage.empty()) {
+                        result._key_storage = managed_bytes(managed_bytes::initialized_later(), _key_storage_size);
+                    }
+                    managed_bytes_mutable_view key_out(result._key_storage);
+                    key_out.remove_prefix(key_offset);
+                    while (i != _parsed_entries.end()) {
+                        parsed_partition_index_entry& e = *i;
+                        if (e.promoted_index) {
+                            result._promoted_indexes[result._entries.size()] = *e.promoted_index;
+                        }
+                        write_fragmented(key_out, std::string_view(e.key.begin(), e.key.size()));
+                        result._entries.emplace_back(index_entry{dht::raw_token().value, e.data_file_offset, key_offset});
+                        ++i;
+                        key_offset += e.key.size();
+                        // Leave the allocating section so that the coroutine can yield below.
+                        if (need_preempt()) {
+                            break;
+                        }
+                    }
+                });
+            });
+            co_await coroutine::maybe_yield();
+        }
+        delete_result.cancel();
+        _parsed_entries.clear();
+        co_return std::move(result);
    }

    void prepare(uint64_t size) {
-        _alloc_section = logalloc::allocating_section();
-        _alloc_section(_region, [&] {
-            with_allocator(_region.allocator(), [&] {
-                indexes._entries.reserve(size);
-            });
-        });
+        _max_promoted_index_entry_plus_one = 0;
+        _key_storage_size = 0;
+        _parsed_entries.clear();
+        _parsed_entries.reserve(size);
    }
 };

@@ -198,10 +218,14 @@ public:

        switch (_state) {
        // START comes first, to make the handling of the 0-quantity case simpler
+            state_START:
        case state::START:
            sstlog.trace("{}: pos {} state {} - data.size()={}", fmt::ptr(this), current_pos(), state::START, data.size());
            _state = state::KEY_SIZE;
-            break;
+            if (data.size() == 0) {
+                break;
+            }
+            [[fallthrough]];
        case state::KEY_SIZE:
            sstlog.trace("{}: pos {} state {}", fmt::ptr(this), current_pos(), state::KEY_SIZE);
            _entry_offset = current_pos();
@@ -227,7 +251,16 @@ public:
        case state::PROMOTED_SIZE:
            sstlog.trace("{}: pos {} state {}", fmt::ptr(this), current_pos(), state::PROMOTED_SIZE);
            _position = this->_u64;
-            if (read_vint_or_uint32(data) != continuous_data_consumer::read_status::ready) {
+            if (is_mc_format() && data.size() && *data.begin() == 0) { // promoted_index_size == 0
+                data.trim_front(1);
+                _consumer.consume_entry(parsed_partition_index_entry{
+                    .key = std::move(_key),
+                    .data_file_offset = _position,
+                    .index_offset = _entry_offset,
+                    .promoted_index = std::nullopt
+                });
+                goto state_START;
+            } else if (read_vint_or_uint32(data) != continuous_data_consumer::read_status::ready) {
                _state = state::PARTITION_HEADER_LENGTH_1;
                break;
            }
@@ -339,33 +372,6 @@ inline file make_tracked_index_file(sstable& sst, reader_permit permit, tracing:
    return tracing::make_traced_file(std::move(f), std::move(trace_state), format("{}:", sst.index_filename()));
 }

-inline
-std::unique_ptr<clustered_index_cursor> promoted_index::make_cursor(shared_sstable sst,
-    reader_permit permit,
-    tracing::trace_state_ptr trace_state,
-    file_input_stream_options options,
-    use_caching caching)
-{
-    if (sst->get_version() >= sstable_version_types::mc) [[likely]] {
-        seastar::shared_ptr<cached_file> cached_file_ptr = caching
-                ? sst->_cached_index_file
-                : seastar::make_shared<cached_file>(make_tracked_index_file(*sst, permit, trace_state, caching),
-                                                    sst->manager().get_cache_tracker().get_index_cached_file_stats(),
-                                                    sst->manager().get_cache_tracker().get_lru(),
-                                                    sst->manager().get_cache_tracker().region(),
-                                                    sst->_index_file_size);
-        return std::make_unique<mc::bsearch_clustered_cursor>(*sst->get_schema(),
-            _promoted_index_start, _promoted_index_size,
-            promoted_index_cache_metrics, permit,
-            sst->get_column_translation(), cached_file_ptr, _num_blocks, trace_state, sst->features());
-    }
-
-    auto file = make_tracked_index_file(*sst, permit, std::move(trace_state), caching);
-    auto promoted_index_stream = make_file_input_stream(std::move(file), _promoted_index_start, _promoted_index_size,options);
-    return std::make_unique<scanning_clustered_index_cursor>(*sst->get_schema(), permit,
-        std::move(promoted_index_stream), _promoted_index_size, _num_blocks, std::nullopt);
-}
-
 // Less-comparator for lookups in the partition index.
 class index_comparator {
    dht::ring_position_comparator_for_sstables _tri_cmp;
@@ -376,27 +382,17 @@ public:
        return _tri_cmp(e.get_decorated_key(), rp) < 0;
    }

-    bool operator()(const index_entry& e, dht::ring_position_view rp) const {
-        return _tri_cmp(e.get_decorated_key(_tri_cmp.s), rp) < 0;
-    }
-
-    bool operator()(const managed_ref<index_entry>& e, dht::ring_position_view rp) const {
-        return operator()(*e, rp);
-    }
-
-    bool operator()(dht::ring_position_view rp, const managed_ref<index_entry>& e) const {
-        return operator()(rp, *e);
-    }
-
    bool operator()(dht::ring_position_view rp, const summary_entry& e) const {
        return _tri_cmp(e.get_decorated_key(), rp) > 0;
    }
-
-    bool operator()(dht::ring_position_view rp, const index_entry& e) const {
-        return _tri_cmp(e.get_decorated_key(_tri_cmp.s), rp) > 0;
-    }
 };

+// Three-way compares the decorated key of the idx-th entry of `page` with `rp`,
+// using a dht::ring_position_comparator_for_sstables built for schema `s`.
+// NOTE(review): get_decorated_key() may compute and store the entry's token on first use;
+// the callers visible in this file invoke this function under an allocating section.
+inline
+std::strong_ordering index_entry_tri_cmp(const schema& s, partition_index_page& page, size_t idx, dht::ring_position_view rp) {
+    dht::ring_position_comparator_for_sstables tri_cmp(s);
+    return tri_cmp(page.get_decorated_key(s, idx), rp);
+}
+
 // Contains information about index_reader position in the index file
 struct index_bound {
    index_bound() = default;
@@ -537,7 +533,7 @@ private:
                    if (ex) {
                        return make_exception_future<index_list>(std::move(ex));
                    }
-                    return make_ready_future<index_list>(std::move(bound.consumer->indexes));
+                    return bound.consumer->finalize();
                });
            });
        };
@@ -550,17 +546,18 @@ private:
            if (bound.current_list->empty()) {
                throw malformed_sstable_exception(format("missing index entry for summary index {} (bound {})", summary_idx, fmt::ptr(&bound)), _sstable->index_filename());
            }
-            bound.data_file_position = bound.current_list->_entries[0]->position();
+            bound.data_file_position = bound.current_list->_entries[0].position();
            bound.element = indexable_element::partition;
            bound.end_open_marker.reset();

            if (sstlog.is_enabled(seastar::log_level::trace)) {
                sstlog.trace("index {} bound {}: page:", fmt::ptr(this), fmt::ptr(&bound));
                logalloc::reclaim_lock rl(_region);
-                for (auto&& e : bound.current_list->_entries) {
+                for (size_t i = 0; i < bound.current_list->_entries.size(); ++i) {
+                    auto& e = bound.current_list->_entries[i];
                    auto dk = dht::decorate_key(*_sstable->_schema,
-                        e->get_key().to_partition_key(*_sstable->_schema));
-                    sstlog.trace("  {} -> {}", dk, e->position());
+                        bound.current_list->get_key(i).to_partition_key(*_sstable->_schema));
+                    sstlog.trace("  {} -> {}", dk, e.position());
                }
            }

@@ -604,7 +601,13 @@ private:
    // Valid if partition_data_ready(bound)
    index_entry& current_partition_entry(index_bound& bound) {
        parse_assert(bool(bound.current_list), _sstable->index_filename());
-        return *bound.current_list->_entries[bound.current_index_idx];
+        return bound.current_list->_entries[bound.current_index_idx];
+    }
+
+    // Valid if partition_data_ready(bound)
+    partition_index_page& current_page(index_bound& bound) {
+        parse_assert(bool(bound.current_list), _sstable->index_filename());
+        return *bound.current_list;
    }

    future<> advance_to_next_partition(index_bound& bound) {
@@ -617,7 +620,7 @@ private:
        if (bound.current_index_idx + 1 < bound.current_list->size()) {
            ++bound.current_index_idx;
            bound.current_pi_idx = 0;
-            bound.data_file_position = bound.current_list->_entries[bound.current_index_idx]->position();
+            bound.data_file_position = bound.current_list->_entries[bound.current_index_idx].position();
            bound.element = indexable_element::partition;
            bound.end_open_marker.reset();
            return reset_clustered_cursor(bound);
@@ -680,9 +683,13 @@ private:
        return advance_to_page(bound, summary_idx).then([this, &bound, pos, summary_idx] {
            sstlog.trace("index {}: old page index = {}", fmt::ptr(this), bound.current_index_idx);
            auto i = _alloc_section(_region, [&] {
-                auto& entries = bound.current_list->_entries;
-                return std::lower_bound(std::begin(entries) + bound.current_index_idx, std::end(entries), pos,
-                    index_comparator(*_sstable->_schema));
+                auto& page = *bound.current_list;
+                auto& s = *_sstable->_schema;
+                auto r = std::views::iota(bound.current_index_idx, page._entries.size());
+                auto it = std::ranges::partition_point(r, [&] (int idx) {
+                    return index_entry_tri_cmp(s, page, idx, pos) < 0;
+                });
+                return page._entries.begin() + bound.current_index_idx + std::ranges::distance(r.begin(), it);
            });
            // i is valid until next allocation point
            auto& entries = bound.current_list->_entries;
@@ -697,7 +704,7 @@ private:
            }
            bound.current_index_idx = std::distance(std::begin(entries), i);
            bound.current_pi_idx = 0;
-            bound.data_file_position = (*i)->position();
+            bound.data_file_position = (*i).position();
            bound.element = indexable_element::partition;
            bound.end_open_marker.reset();
            sstlog.trace("index {}: new page index = {}, pos={}", fmt::ptr(this), bound.current_index_idx, bound.data_file_position);
@@ -800,6 +807,34 @@ public:
        }
    }

+    // Creates a cursor over the promoted index described by `pi`.
+    // For sstable versions >= mc the returned cursor is a `mc::bsearch_clustered_cursor`;
+    // with `caching` enabled it reads through the sstable's shared cached index file,
+    // otherwise through a cached_file created for this cursor.
+    // For older versions the returned cursor is a `scanning_clustered_index_cursor` which
+    // reads the promoted index range from a new input stream over the index file.
+    // NOTE(review): the caller in this file invokes it under an allocating section - confirm
+    // whether that is a requirement.
+    static
+    std::unique_ptr<clustered_index_cursor> make_cursor(const parsed_promoted_index_entry& pi,
+        shared_sstable sst,
+        reader_permit permit,
+        tracing::trace_state_ptr trace_state,
+        file_input_stream_options options,
+        use_caching caching)
+    {
+        if (sst->get_version() >= sstable_version_types::mc) [[likely]] {
+            seastar::shared_ptr<cached_file> cached_file_ptr = caching
+                    ? sst->_cached_index_file
+                    : seastar::make_shared<cached_file>(make_tracked_index_file(*sst, permit, trace_state, caching),
+                                                        sst->manager().get_cache_tracker().get_index_cached_file_stats(),
+                                                        sst->manager().get_cache_tracker().get_lru(),
+                                                        sst->manager().get_cache_tracker().region(),
+                                                        sst->_index_file_size);
+            return std::make_unique<mc::bsearch_clustered_cursor>(*sst->get_schema(),
+                pi.promoted_index_start, pi.promoted_index_size,
+                promoted_index_cache_metrics, permit,
+                sst->get_column_translation(), cached_file_ptr, pi.num_blocks, trace_state, sst->features());
+        }
+
+        auto file = make_tracked_index_file(*sst, permit, std::move(trace_state), caching);
+        auto promoted_index_stream = make_file_input_stream(std::move(file), pi.promoted_index_start, pi.promoted_index_size,options);
+        return std::make_unique<scanning_clustered_index_cursor>(*sst->get_schema(), permit,
+            std::move(promoted_index_stream), pi.promoted_index_size, pi.num_blocks, std::nullopt);
+    }
+
    // Ensures that partition_data_ready() returns true.
    // Can be called only when !eof()
    future<> read_partition_data() override {
@@ -835,10 +870,10 @@ public:
    clustered_index_cursor* current_clustered_cursor(index_bound& bound) {
        if (!bound.clustered_cursor) {
            _alloc_section(_region, [&] {
-                index_entry& e = current_partition_entry(bound);
-                promoted_index* pi = e.get_promoted_index().get();
-                if (pi) {
-                    bound.clustered_cursor = pi->make_cursor(_sstable, _permit, _trace_state,
+                partition_index_page& page = current_page(bound);
+                if (page.has_promoted_index(bound.current_index_idx)) {
+                    promoted_index& pi = page.get_promoted_index(bound.current_index_idx);
+                    bound.clustered_cursor = make_cursor(pi, _sstable, _permit, _trace_state,
                        get_file_input_stream_options(), _use_caching);
                }
            });
@@ -861,15 +896,15 @@ public:
    // It may be unavailable for old sstables for which this information was not generated.
    // Can be called only when partition_data_ready().
    std::optional<sstables::deletion_time> partition_tombstone() override {
-        return current_partition_entry(_lower_bound).get_deletion_time();
+        return current_page(_lower_bound).get_deletion_time(_lower_bound.current_index_idx);
    }

    // Returns the key for current partition.
    // Can be called only when partition_data_ready().
    std::optional<partition_key> get_partition_key() override {
        return _alloc_section(_region, [this] {
-            index_entry& e = current_partition_entry(_lower_bound);
-            return e.get_key().to_partition_key(*_sstable->_schema);
+            return current_page(_lower_bound).get_key(_lower_bound.current_index_idx)
+                .to_partition_key(*_sstable->_schema);
        });
    }

@@ -883,8 +918,8 @@ public:
    // Returns the number of promoted index entries for the current partition.
    // Can be called only when partition_data_ready().
    uint64_t get_promoted_index_size() {
-        index_entry& e = current_partition_entry(_lower_bound);
-        return e.get_promoted_index_size();
+        partition_index_page& page = current_page(_lower_bound);
+        return page.get_promoted_index_size(_lower_bound.current_index_idx);
    }

    bool partition_data_ready() const override {
@@ -975,9 +1010,9 @@ public:
                return make_ready_future<bool>(false);
            }
            return read_partition_data().then([this, key] {
-                index_comparator cmp(*_sstable->_schema);
                bool found = _alloc_section(_region, [&] {
-                    return cmp(key, current_partition_entry(_lower_bound)) == 0;
+                    auto& page = current_page(_lower_bound);
+                    return index_entry_tri_cmp(*_sstable->_schema, page, _lower_bound.current_index_idx, key) == 0;
                });
                return make_ready_future<bool>(found);
            });
--- a/sstables/partition_index_cache.hh
+++ b/sstables/partition_index_cache.hh
@@ -257,14 +257,11 @@ public:
        while (partial_page || i != _cache.end()) {
            if (partial_page) {
                auto preempted = with_allocator(_region.allocator(), [&] {
-                    while (!partial_page->empty()) {
-                        partial_page->clear_one_entry();
-                        if (need_preempt()) {
-                            return true;
-                        }
+                    while (partial_page->clear_gently() != stop_iteration::yes) {
+                        return true;
                    }
                    partial_page.reset();
-                    return false;
+                    return need_preempt();
                });
                if (preempted) {
                    auto key = (i != _cache.end()) ? std::optional(i->key()) : std::nullopt;
--- a/sstables/sstables.hh
+++ b/sstables/sstables.hh
@@ -1094,7 +1094,6 @@ public:

    friend class mc::writer;
    friend class index_reader;
-    friend class promoted_index;
    friend class sstables_manager;
    template <typename DataConsumeRowsContext>
    friend future<std::unique_ptr<DataConsumeRowsContext>>
--- a/streaming/stream_blob.cc
+++ b/streaming/stream_blob.cc
@@ -436,7 +436,10 @@ tablet_stream_files(netw::messaging_service& ms, std::list<stream_blob_info> sou
    stream_options.buffer_size = file_stream_buffer_size;
    stream_options.read_ahead = file_stream_read_ahead;

-    for (auto& info : sources) {
+    for (auto&& source_info : sources) {
+        // Keep stream_blob_info alive only for the duration of its own streaming, so that the file
+        // descriptor of the sstable component can be released right after it has been streamed.
+        auto info = std::exchange(source_info, {});
        auto& filename = info.filename;
        std::optional<input_stream<char>> fstream;
        bool fstream_closed = false;
@@ -617,6 +620,7 @@ tablet_stream_files(netw::messaging_service& ms, std::list<stream_blob_info> sou
                    ops_id, filename, targets, total_size, get_bw(total_size, start_time));
        }
    }
+    co_await utils::get_local_injector().inject("tablet_stream_files_end_wait", utils::wait_for_message(std::chrono::seconds(60)));
    if (error) {
        blogger.warn("fstream[{}] Master failed sending files_nr={} files={} targets={} send_size={} bw={} error={}",
                ops_id, sources.size(), sources, targets, ops_total_size, get_bw(ops_total_size, ops_start_time), error);
@@ -680,15 +684,20 @@ future<stream_files_response> tablet_stream_files_handler(replica::database& db,
    if (files.empty()) {
        co_return resp;
    }
+    auto sstable_nr = sstables.size();
+    // Release reference to sstables to be streamed here. Since one sstable is streamed at a time,
+    // a sstable - that has been compacted - can have its space released from disk right after
+    // that sstable's content has been fully streamed.
+    sstables.clear();
    blogger.debug("stream_sstables[{}] Started sending sstable_nr={} files_nr={} files={} range={}",
-            req.ops_id, sstables.size(), files.size(), files, req.range);
+            req.ops_id, sstable_nr, files.size(), files, req.range);
    auto ops_start_time = std::chrono::steady_clock::now();
    auto files_nr = files.size();
    size_t stream_bytes = co_await tablet_stream_files(ms, std::move(files), req.targets, req.table, req.ops_id, req.topo_guard);
    resp.stream_bytes = stream_bytes;
    auto duration = std::chrono::steady_clock::now() - ops_start_time;
    blogger.info("stream_sstables[{}] Finished sending sstable_nr={} files_nr={} range={} stream_bytes={} stream_time={} stream_bw={}",
-            req.ops_id, sstables.size(), files_nr, req.range, stream_bytes, duration, get_bw(stream_bytes, ops_start_time));
+            req.ops_id, sstable_nr, files_nr, req.range, stream_bytes, duration, get_bw(stream_bytes, ops_start_time));
    co_return resp;
 }

--- a/table_helper.cc
+++ b/table_helper.cc
@@ -75,7 +75,7 @@ future<bool> table_helper::try_prepare(bool fallback, cql3::query_processor& qp,
    auto& stmt = fallback ? _insert_cql_fallback.value() : _insert_cql;
    try {
        shared_ptr<cql_transport::messages::result_message::prepared> msg_ptr = co_await qp.prepare(stmt, qs.get_client_state(), dialect);
-        _prepared_stmt = std::move(msg_ptr->get_prepared());
+        _prepared_stmt = msg_ptr->get_prepared();
        shared_ptr<cql3::cql_statement> cql_stmt = _prepared_stmt->statement;
        _insert_stmt = dynamic_pointer_cast<cql3::statements::modification_statement>(cql_stmt);
        _is_fallback_stmt = fallback;
--- a/tasks/task_manager.cc
+++ b/tasks/task_manager.cc
@@ -400,7 +400,7 @@ task_manager::virtual_task::impl::impl(module_ptr module) noexcept
    : _module(std::move(module))
 {}

-future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::get_children(module_ptr module, task_id parent_id, std::function<bool(locator::host_id)> is_host_alive) {
+future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::get_children(module_ptr module, task_id parent_id, locator::token_metadata_ptr tmptr) {
    auto ms = module->get_task_manager()._messaging;
    if (!ms) {
        auto ids = co_await module->get_task_manager().get_virtual_task_children(parent_id);
@@ -417,19 +417,18 @@ future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::g
        tmlogger.info("tasks_vt_get_children: waiting");
        co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::seconds{10});
    });
-    co_return co_await map_reduce(nodes, [ms, parent_id, is_host_alive = std::move(is_host_alive)] (auto host_id) -> future<utils::chunked_vector<task_identity>> {
-        if (is_host_alive(host_id)) {
-            return ser::tasks_rpc_verbs::send_tasks_get_children(ms, host_id, parent_id).then([host_id] (auto resp) {
-                return resp | std::views::transform([host_id] (auto id) {
-                    return task_identity{
-                        .host_id = host_id,
-                        .task_id = id
-                    };
-                }) | std::ranges::to<utils::chunked_vector<task_identity>>();
-            });
-        } else {
-            return make_ready_future<utils::chunked_vector<task_identity>>();
-        }
+    co_return co_await map_reduce(nodes, [ms, parent_id] (auto host_id) -> future<utils::chunked_vector<task_identity>> {
+        return ser::tasks_rpc_verbs::send_tasks_get_children(ms, host_id, parent_id).then([host_id] (auto resp) {
+            return resp | std::views::transform([host_id] (auto id) {
+                return task_identity{
+                    .host_id = host_id,
+                    .task_id = id
+                };
+            }) | std::ranges::to<utils::chunked_vector<task_identity>>();
+        }).handle_exception_type([host_id, parent_id] (const rpc::closed_error& ex) {
+            tmlogger.warn("Failed to get children of virtual task with id={} from node {}: {}", parent_id, host_id, ex);
+            return utils::chunked_vector<task_identity>{};
+        });
    }, utils::chunked_vector<task_identity>{}, [] (auto a, auto&& b) {
        std::move(b.begin(), b.end(), std::back_inserter(a));
        return a;
--- a/tasks/task_manager.hh
+++ b/tasks/task_manager.hh
@@ -19,6 +19,7 @@
 #include "db_clock.hh"
 #include "utils/log.hh"
 #include "locator/host_id.hh"
+#include "locator/token_metadata_fwd.hh"
 #include "schema/schema_fwd.hh"
 #include "tasks/types.hh"
 #include "utils/chunked_vector.hh"
@@ -282,7 +283,7 @@ public:
            impl& operator=(impl&&) = delete;
            virtual ~impl() = default;
        protected:
-            static future<utils::chunked_vector<task_identity>> get_children(module_ptr module, task_id parent_id, std::function<bool(locator::host_id)> is_host_alive);
+            static future<utils::chunked_vector<task_identity>> get_children(module_ptr module, task_id parent_id, locator::token_metadata_ptr tmptr);
        public:
            virtual task_group get_group() const noexcept = 0;
            // Returns std::nullopt if an operation with task_id isn't tracked by this virtual_task.
--- a/test/boost/cache_algorithm_test.cc
+++ b/test/boost/cache_algorithm_test.cc
@@ -62,7 +62,11 @@ SEASTAR_TEST_CASE(test_index_doesnt_flood_cache_in_small_partition_workload) {
    // cfg.db_config->index_cache_fraction.set(1.0);
    return do_with_cql_env_thread([] (cql_test_env& e) {
        // We disable compactions because they cause confusing cache mispopulations.
-        e.execute_cql("CREATE TABLE ks.t(pk blob PRIMARY KEY) WITH compaction = { 'class' : 'NullCompactionStrategy' };").get();
+        // We disable compression because the sstable writer targets a specific
+        // (*compressed* data file size : summary file size) ratio,
+        // so the number of keys per index page becomes hard to control,
+        // and might be arbitrarily large.
+        e.execute_cql("CREATE TABLE ks.t(pk blob PRIMARY KEY) WITH compaction = { 'class' : 'NullCompactionStrategy' } AND compression = {'sstable_compression': ''};").get();
        auto insert_query = e.prepare("INSERT INTO ks.t(pk) VALUES (?)").get();
        auto select_query = e.prepare("SELECT * FROM t WHERE pk = ?").get();

@@ -154,7 +158,11 @@ SEASTAR_TEST_CASE(test_index_is_cached_in_big_partition_workload) {
    // cfg.db_config->index_cache_fraction.set(0.0);
    return do_with_cql_env_thread([] (cql_test_env& e) {
        // We disable compactions because they cause confusing cache mispopulations.
-        e.execute_cql("CREATE TABLE ks.t(pk bigint, ck bigint, v blob, primary key (pk, ck)) WITH compaction = { 'class' : 'NullCompactionStrategy' };").get();
+        // We disable compression because the sstable writer targets a specific
+        // (*compressed* data file size : summary file size) ratio,
+        // so the number of keys per index page becomes hard to control,
+        // and might be arbitrarily large.
+        e.execute_cql("CREATE TABLE ks.t(pk bigint, ck bigint, v blob, primary key (pk, ck)) WITH compaction = { 'class' : 'NullCompactionStrategy' } AND compression = {'sstable_compression': ''};").get();
        auto insert_query = e.prepare("INSERT INTO ks.t(pk, ck, v) VALUES (?, ?, ?)").get();
        auto select_query = e.prepare("SELECT * FROM t WHERE pk = ? AND ck = ?").get();

--- a/test/boost/database_test.cc
+++ b/test/boost/database_test.cc
@@ -1111,6 +1111,30 @@ SEASTAR_TEST_CASE(test_snapshot_ctl_true_snapshots_size) {
    });
 }

+SEASTAR_TEST_CASE(test_snapshot_ctl_details_exception_handling) {
+#ifndef SCYLLA_ENABLE_ERROR_INJECTION
+    testlog.debug("Skipping test as it depends on error injection. Please run in mode where it's enabled (debug,dev).\n");
+    return make_ready_future();
+#endif
+    return do_with_some_data_in_thread({"cf"}, [] (cql_test_env& e) {
+        sharded<db::snapshot_ctl> sc;
+        sc.start(std::ref(e.db()), std::ref(e.get_task_manager()), std::ref(e.get_sstorage_manager()), db::snapshot_ctl::config{}).get();
+        auto stop_sc = deferred_stop(sc);
+
+        auto& cf = e.local_db().find_column_family("ks", "cf");
+        take_snapshot(e).get();
+
+        utils::get_local_injector().enable("get_snapshot_details", true);
+        BOOST_REQUIRE_THROW(cf.get_snapshot_details().get(), std::runtime_error);
+
+        utils::get_local_injector().enable("per-snapshot-get_snapshot_details", true);
+        BOOST_REQUIRE_THROW(cf.get_snapshot_details().get(), std::runtime_error);
+
+        auto details = cf.get_snapshot_details().get();
+        BOOST_REQUIRE_EQUAL(details.size(), 1);
+    });
+}
+
 // toppartitions_query caused a lw_shared_ptr to cross shards when moving results, #5104
 SEASTAR_TEST_CASE(toppartitions_cross_shard_schema_ptr) {
    return do_with_cql_env_and_compaction_groups([] (cql_test_env& e) {
@@ -1857,7 +1881,7 @@ SEASTAR_THREAD_TEST_CASE(test_tombstone_gc_state_snapshot) {

    schema_builder::register_schema_initializer([] (schema_builder& builder) {
        if (builder.ks_name() == "test" && builder.cf_name() == "table_gc_mode_group0") {
-            builder.set_is_group0_table(true);
+            builder.set_is_group0_table();
        }
    });
    auto table_gc_mode_group0 = schema_builder("test", "table_gc_mode_group0")
--- a/test/boost/group0_test.cc
+++ b/test/boost/group0_test.cc
@@ -252,7 +252,7 @@ SEASTAR_TEST_CASE(test_group0_batch) {
        // (group0 mutations are not allowed on non-group0 tables)
        schema_builder::register_schema_initializer([](schema_builder& builder) {
            if (builder.cf_name() == "test_group0_batch") {
-                builder.set_is_group0_table(true);
+                builder.set_is_group0_table();
            }
        });

@@ -345,4 +345,29 @@ SEASTAR_TEST_CASE(test_group0_batch) {
    });
 }

+SEASTAR_TEST_CASE(test_group0_tables_use_schema_commitlog) {
+    return do_with_cql_env([] (cql_test_env& e) {
+        schema_builder::register_schema_initializer([](schema_builder& builder) {
+            if (builder.cf_name() == "test_group0_tables_use_schema_commitlog1") {
+                builder.set_is_group0_table();
+            }
+        });
+
+        auto test_group0_tables_use_schema_commitlog1 = schema_builder("test", "test_group0_tables_use_schema_commitlog1")
+            .with_column("pk", utf8_type, column_kind::partition_key)
+            .build();
+
+        auto test_group0_tables_use_schema_commitlog2 = schema_builder("test", "test_group0_tables_use_schema_commitlog2")
+            .with_column("pk", utf8_type, column_kind::partition_key)
+            .build();
+
+        BOOST_REQUIRE(test_group0_tables_use_schema_commitlog1->static_props().is_group0_table);
+        BOOST_REQUIRE(test_group0_tables_use_schema_commitlog1->static_props().use_schema_commitlog);
+        BOOST_REQUIRE(!test_group0_tables_use_schema_commitlog2->static_props().is_group0_table);
+        BOOST_REQUIRE(!test_group0_tables_use_schema_commitlog2->static_props().use_schema_commitlog);
+
+        return make_ready_future();
+    });
+}
+
 BOOST_AUTO_TEST_SUITE_END()
--- a/test/boost/network_topology_strategy_test.cc
+++ b/test/boost/network_topology_strategy_test.cc
@@ -1499,7 +1499,7 @@ SEASTAR_THREAD_TEST_CASE(tablets_simple_rack_aware_view_pairing_test) {
            base_host,
            base_erm,
            view_erm,
-            *ars_ptr,
+            true, // uses NTS
            base_token,
            view_token,
            use_tablets,
--- a/test/boost/partitioner_test.cc
+++ b/test/boost/partitioner_test.cc
@@ -719,7 +719,7 @@ SEASTAR_THREAD_TEST_CASE(test_dht_subtract_ranges) {

    auto get_random_ranges = [&] (size_t max_count) {
        auto count = tests::random::get_int<size_t>(1, max_count);
-        dht::partition_range_vector ranges;
+        utils::chunked_vector<dht::partition_range> ranges;
        ranges.reserve(count);

        for (size_t i = 0; i < count; i++) {
--- a/test/boost/sstable_partition_index_cache_test.cc
+++ b/test/boost/sstable_partition_index_cache_test.cc
@@ -20,16 +20,24 @@ static void add_entry(logalloc::region& r,
      const schema& s,
      partition_index_page& page,
      const partition_key& key,
-      uint64_t position)
+      uint64_t position,
+      std::optional<parsed_promoted_index_entry> promoted_index = std::nullopt)
 {
    logalloc::allocating_section as;
    as(r, [&] {
        with_allocator(r.allocator(), [&] {
            sstables::key sst_key = sstables::key::from_partition_key(s, key);
-            page._entries.push_back(make_managed<index_entry>(
-                    managed_bytes(sst_key.get_bytes()),
-                    position,
-                    managed_ref<promoted_index>()));
+            auto key_offset = page._key_storage.size();
+            auto old_storage = std::move(page._key_storage);
+            page._key_storage = managed_bytes(managed_bytes::initialized_later(), key_offset + sst_key.get_bytes().size());
+            auto out = managed_bytes_mutable_view(page._key_storage);
+            write_fragmented(out, managed_bytes_view(old_storage));
+            write_fragmented(out, single_fragmented_view(bytes_view(sst_key)));
+            page._entries.push_back(index_entry{dht::raw_token_opt()->value, position, key_offset});
+            if (promoted_index) {
+                page._promoted_indexes.resize(page._entries.size());
+                page._promoted_indexes[page._entries.size() - 1] = *promoted_index;
+            }
        });
    });
 }
@@ -54,10 +62,10 @@ static partition_index_page make_page0(logalloc::region& r, simple_schema& s) {
 static void has_page0(partition_index_cache::entry_ptr ptr) {
    BOOST_REQUIRE(!ptr->empty());
    BOOST_REQUIRE_EQUAL(ptr->_entries.size(), 4);
-    BOOST_REQUIRE_EQUAL(ptr->_entries[0]->position(), 0);
-    BOOST_REQUIRE_EQUAL(ptr->_entries[1]->position(), 1);
-    BOOST_REQUIRE_EQUAL(ptr->_entries[2]->position(), 2);
-    BOOST_REQUIRE_EQUAL(ptr->_entries[3]->position(), 3);
+    BOOST_REQUIRE_EQUAL(ptr->_entries[0].position(), 0);
+    BOOST_REQUIRE_EQUAL(ptr->_entries[1].position(), 1);
+    BOOST_REQUIRE_EQUAL(ptr->_entries[2].position(), 2);
+    BOOST_REQUIRE_EQUAL(ptr->_entries[3].position(), 3);
 };

 SEASTAR_THREAD_TEST_CASE(test_caching) {
@@ -139,6 +147,59 @@ SEASTAR_THREAD_TEST_CASE(test_caching) {
    }
 }

+SEASTAR_THREAD_TEST_CASE(test_sparse_promoted_index) {
+    ::lru lru;
+    simple_schema s;
+    logalloc::region r;
+    partition_index_cache_stats stats;
+    partition_index_cache cache(lru, r, stats);
+
+    auto page0_loader = [&] (partition_index_cache::key_type k) -> future<partition_index_page> {
+        partition_index_page page;
+        auto destroy_page = defer([&] {
+            with_allocator(r.allocator(), [&] {
+                auto p = std::move(page);
+            });
+        });
+
+        add_entry(r, *s.schema(), page, s.make_pkey(0).key(), 0);
+        add_entry(r, *s.schema(), page, s.make_pkey(1).key(), 1, parsed_promoted_index_entry{
+            .promoted_index_start = 1,
+            .promoted_index_size = 10,
+            .num_blocks = 3
+        });
+        add_entry(r, *s.schema(), page, s.make_pkey(2).key(), 2);
+        add_entry(r, *s.schema(), page, s.make_pkey(3).key(), 3, parsed_promoted_index_entry{
+            .promoted_index_start = 2,
+            .promoted_index_size = 13,
+            .num_blocks = 1
+        });
+        add_entry(r, *s.schema(), page, s.make_pkey(4).key(), 4);
+        destroy_page.cancel();
+        co_return std::move(page);
+    };
+
+    auto page = cache.get_or_load(0, page0_loader).get();
+
+    BOOST_REQUIRE_EQUAL(page->has_promoted_index(0), false);
+    BOOST_REQUIRE_EQUAL(page->has_promoted_index(1), true);
+    BOOST_REQUIRE_EQUAL(page->has_promoted_index(2), false);
+    BOOST_REQUIRE_EQUAL(page->has_promoted_index(3), true);
+    BOOST_REQUIRE_EQUAL(page->has_promoted_index(4), false);
+
+    BOOST_REQUIRE_EQUAL(page->get_promoted_index(1).promoted_index_start, 1);
+    BOOST_REQUIRE_EQUAL(page->get_promoted_index(1).promoted_index_size, 10);
+    BOOST_REQUIRE_EQUAL(page->get_promoted_index(1).num_blocks, 3);
+
+    BOOST_REQUIRE_EQUAL(page->get_promoted_index(3).promoted_index_start, 2);
+    BOOST_REQUIRE_EQUAL(page->get_promoted_index(3).promoted_index_size, 13);
+    BOOST_REQUIRE_EQUAL(page->get_promoted_index(3).num_blocks, 1);
+
+    with_allocator(r.allocator(), [&] {
+        lru.evict_all();
+    });
+}
+
 template <typename T>
 static future<> ignore_result(future<T>&& f) {
    return f.then_wrapped([] (auto&& f) {
--- a/test/boost/tablets_test.cc
+++ b/test/boost/tablets_test.cc
@@ -1607,6 +1607,29 @@ future<> apply_resize_plan(token_metadata& tm, const migration_plan& plan) {
    }
 }

+static
+future<group0_guard> save_token_metadata(cql_test_env& e, group0_guard guard) {
+    auto& stm = e.local_db().get_shared_token_metadata();
+    auto tm = stm.get();
+
+    e.get_topology_state_machine().local()._topology.version = tm->get_version();
+
+    co_await save_tablet_metadata(e.local_db(), tm->tablets(), guard.write_timestamp());
+    utils::chunked_vector<frozen_mutation> muts;
+    muts.push_back(freeze(topology_mutation_builder(guard.write_timestamp())
+                                  .set_version(tm->get_version())
+                                  .build().to_mutation(db::system_keyspace::topology())));
+    co_await e.local_db().apply(muts, db::no_timeout);
+    co_await e.get_storage_service().local().update_tablet_metadata({});
+
+    // Need a new guard to make sure later changes use later timestamp.
+    // Also, so that the table layer processes the changes we persisted, which is important for splits.
+    // Before we can finalize a split, the storage group needs to process the split by creating split-ready compaction groups.
+    release_guard(std::move(guard));
+    abort_source as;
+    co_return co_await e.get_raft_group0_client().start_operation(as);
+}
+
 static
 future<> handle_resize_finalize(cql_test_env& e, group0_guard& guard, const migration_plan& plan, shared_load_stats* load_stats) {
    auto& talloc = e.get_tablet_allocator().local();
@@ -1626,19 +1649,14 @@ future<> handle_resize_finalize(cql_test_env& e, group0_guard& guard, const migr
        co_await stm.mutate_token_metadata([table_id, &new_tmap, &changed] (token_metadata& tm) {
            changed = true;
            tm.tablets().set_tablet_map(table_id, std::move(new_tmap));
+            tm.set_version(tm.get_version() + 1);
            return make_ready_future<>();
        });
    }

    if (changed) {
        // Need to reload on each resize because table object expects tablet count to change by a factor of 2.
-        co_await save_tablet_metadata(e.local_db(), stm.get()->tablets(), guard.write_timestamp());
-        co_await e.get_storage_service().local().update_tablet_metadata({});
-
-        // Need a new guard to make sure later changes use later timestamp.
-        release_guard(std::move(guard));
-        abort_source as;
-        guard = co_await e.get_raft_group0_client().start_operation(as);
+        guard = co_await save_token_metadata(e, std::move(guard));

        if (load_stats) {
            auto new_tm = stm.get();
@@ -1647,6 +1665,11 @@ future<> handle_resize_finalize(cql_test_env& e, group0_guard& guard, const migr
                load_stats->stats = *reconciled_stats;
            }
        }
+
+        testlog.debug("Calling local_topology_barrier()");
+        old_tm = nullptr;
+        co_await e.get_storage_service().local().local_topology_barrier();
+        testlog.debug("Finished local_topology_barrier()");
    }
 }

@@ -1750,13 +1773,22 @@ void do_rebalance_tablets(cql_test_env& e,
        }).get();

        if (auto_split && load_stats) {
+            bool reload = false;
            auto& tm = *stm.get();
            for (const auto& [table, tmap]: tm.tablets().all_tables_ungrouped()) {
                if (std::holds_alternative<resize_decision::split>(tmap->resize_decision().way)) {
-                    testlog.debug("set_split_ready_seq_number({}, {})", table, tmap->resize_decision().sequence_number);
-                    load_stats->set_split_ready_seq_number(table, tmap->resize_decision().sequence_number);
+                    if (load_stats->stats.tables[table].split_ready_seq_number != tmap->resize_decision().sequence_number) {
+                        testlog.debug("set_split_ready_seq_number({}, {})", table, tmap->resize_decision().sequence_number);
+                        load_stats->set_split_ready_seq_number(table, tmap->resize_decision().sequence_number);
+                        reload = true;
+                    }
                }
            }
+
+            // Need to order split-ack before split finalization, storage_group assumes that.
+            if (reload) {
+                guard = save_token_metadata(e, std::move(guard)).get();
+            }
        }

        handle_resize_finalize(e, guard, plan, load_stats).get();
--- a/test/boost/token_metadata_test.cc
+++ b/test/boost/token_metadata_test.cc
@@ -331,4 +331,28 @@ SEASTAR_THREAD_TEST_CASE(test_stale_version_notification) {
    std::cerr.rdbuf(oldCerr);

    BOOST_TEST(my_stream.str().find("topology version 0 held for") != std::string::npos);
-}
+}
+
+SEASTAR_THREAD_TEST_CASE(test_raw_token) {
+    const auto t1 = dht::token::from_int64(1);
+    const auto t2 = dht::token::from_int64(2);
+
+    dht::raw_token_opt rt_opt;
+    BOOST_REQUIRE(!rt_opt);
+    rt_opt = dht::raw_token(t1);
+    BOOST_REQUIRE(*rt_opt == t1);
+
+    BOOST_REQUIRE(dht::raw_token() == dht::minimum_token());
+    BOOST_REQUIRE(dht::raw_token() < dht::raw_token(dht::first_token()));
+    BOOST_REQUIRE(dht::raw_token() < dht::first_token());
+    BOOST_REQUIRE(dht::raw_token() < dht::maximum_token());
+
+    auto rt1 = dht::raw_token(t1);
+    BOOST_REQUIRE(bool(rt1));
+    BOOST_REQUIRE(rt1 > dht::raw_token());
+    BOOST_REQUIRE(rt1 > dht::minimum_token());
+    BOOST_REQUIRE_EQUAL(rt1, t1);
+    BOOST_REQUIRE(rt1 == t1);
+    BOOST_REQUIRE(rt1 < t2);
+    BOOST_REQUIRE(rt1 < dht::maximum_token());
+}
--- a/test/boost/view_schema_test.cc
+++ b/test/boost/view_schema_test.cc
@@ -3221,6 +3221,87 @@ SEASTAR_TEST_CASE(test_view_update_generating_writetime) {
    });
 }

+// Usually if only an unselected column in the base table is modified, we expect an optimization that a view
+// update is not done, but we had a bug (https://scylladb.atlassian.net/browse/SCYLLADB-808) where the existence
+// of a collection selected in the view caused us to skip this optimization, even when it was not modified.
+// This test reproduces this bug.
+SEASTAR_TEST_CASE(test_view_update_unmodified_collection) {
+    // In this test we verify that we correctly skip (or not) view updates to a view that selects
+    // a collection column. We use two MVs, similarly to the test above.
+    return do_with_cql_env_thread([] (cql_test_env& e) {
+
+        auto f1 = e.local_view_builder().wait_until_built("ks", "mv1");
+        auto f2 = e.local_view_builder().wait_until_built("ks", "mv2");
+
+        e.execute_cql("CREATE TABLE t (k int, c int, a int, b list<int>, g int, primary key(k, c))").get();
+        e.execute_cql("CREATE MATERIALIZED VIEW mv1 AS SELECT k,c,a,b FROM t "
+                         "WHERE k IS NOT NULL AND c IS NOT NULL PRIMARY KEY (c, k)").get();
+        e.execute_cql("CREATE MATERIALIZED VIEW mv2 AS SELECT k,c,a,b FROM t "
+                         "WHERE k IS NOT NULL AND c IS NOT NULL AND a IS NOT NULL PRIMARY KEY (c, k, a)").get();
+
+        f1.get();
+        f2.get();
+
+        auto total_t_view_updates = [&] {
+            return e.db().map_reduce0([] (replica::database& local_db) {
+                const db::view::stats& local_stats = local_db.find_column_family("ks", "t").get_view_stats();
+                return local_stats.view_updates_pushed_local + local_stats.view_updates_pushed_remote;
+            }, 0, std::plus<int64_t>()).get();
+        };
+
+        auto total_mv1_updates = [&] {
+            return e.db().map_reduce0([] (replica::database& local_db) {
+                return local_db.find_column_family("ks", "mv1").get_stats().writes.hist.count;
+            }, 0, std::plus<int64_t>()).get();
+        };
+
+        auto total_mv2_updates = [&] {
+            return e.db().map_reduce0([] (replica::database& local_db) {
+                return local_db.find_column_family("ks", "mv2").get_stats().writes.hist.count;
+            }, 0, std::plus<int64_t>()).get();
+        };
+
+        ::shared_ptr<cql_transport::messages::result_message> msg;
+
+        e.execute_cql("INSERT INTO t (k, c, a) VALUES (1, 1, 1)").get();
+        eventually([&] {
+            const update_counter results{total_mv1_updates(), total_mv2_updates(), total_t_view_updates()};
+            const update_counter expected{1, 1, 2};
+
+            BOOST_REQUIRE_EQUAL(results, expected);
+        });
+
+        // We update an unselected column and the collection remains NULL, so we should generate an
+        // update to the virtual column in mv1 but not to mv2.
+        e.execute_cql("UPDATE t SET g=1 WHERE k=1 AND c=1;").get();
+        eventually([&] {
+            const update_counter results{total_mv1_updates(), total_mv2_updates(), total_t_view_updates()};
+            const update_counter expected{2, 1, 3};
+
+            BOOST_REQUIRE_EQUAL(results, expected);
+        });
+
+        // We update the collection with an initial value
+        e.execute_cql("UPDATE t SET b=[1] WHERE k=1 AND c=1;").get();
+        eventually([&] {
+            const update_counter results{total_mv1_updates(), total_mv2_updates(), total_t_view_updates()};
+            const update_counter expected{3, 2, 5};
+
+            BOOST_REQUIRE_EQUAL(results, expected);
+        });
+
+        // We update an unselected column again with a non-NULL selected collection. Because the liveness of the updated column is unchanged
+        // and no other selected column is updated (in particular, the collection column), we should generate no view updates.
+        e.execute_cql("UPDATE t SET g=2 WHERE k=1 AND c=1;").get();
+        eventually([&] {
+            const update_counter results{total_mv1_updates(), total_mv2_updates(), total_t_view_updates()};
+            const update_counter expected{3, 2, 5};
+
+            BOOST_REQUIRE_EQUAL(results, expected);
+        });
+    });
+}
+
 SEASTAR_TEST_CASE(test_conflicting_batch) {
    return do_with_cql_env_thread([] (cql_test_env& e) {

--- a/test/cluster/tasks/test_node_ops_tasks.py
+++ b/test/cluster/tasks/test_node_ops_tasks.py
@@ -254,27 +254,3 @@ async def test_node_ops_task_wait(manager: ManagerClient):

    await decommission_task
    await waiting_task
-
-@pytest.mark.asyncio
-async def test_get_children(manager: ManagerClient):
-    module_name = "node_ops"
-    tm = TaskManagerClient(manager.api)
-    servers = [await manager.server_add(cmdline=cmdline) for _ in range(2)]
-
-    injection = "tasks_vt_get_children"
-    handler = await inject_error_one_shot(manager.api, servers[0].ip_addr, injection)
-
-    log = await manager.server_open_log(servers[0].server_id)
-    mark = await log.mark()
-
-    bootstrap_task = [task for task in await tm.list_tasks(servers[0].ip_addr, module_name) if task.kind == "cluster"][0]
-
-    async def _decommission():
-        await log.wait_for('tasks_vt_get_children: waiting', from_mark=mark)
-        await manager.decommission_node(servers[1].server_id)
-        await handler.message()
-
-    async def _get_status():
-        await tm.get_task_status(servers[0].ip_addr, bootstrap_task.task_id)
-
-    await asyncio.gather(*(_decommission(), _get_status()))
--- a/test/cluster/tasks/test_tablet_tasks.py
+++ b/test/cluster/tasks/test_tablet_tasks.py
@@ -12,9 +12,11 @@ import pytest
 from test.pylib.internal_types import ServerInfo
 from test.pylib.manager_client import ManagerClient
 from test.pylib.repair import create_table_insert_data_for_repair, get_tablet_task_id
+from test.pylib.rest_client import read_barrier
 from test.pylib.tablets import get_all_tablet_replicas
 from test.cluster.conftest import skip_mode
-from test.cluster.util import create_new_test_keyspace, new_test_keyspace
+from test.cluster.util import create_new_test_keyspace, new_test_keyspace, get_topology_coordinator, find_server_by_host_id
+from test.cluster.test_incremental_repair import trigger_tablet_merge
 from test.cluster.test_tablets2 import inject_error_on
 from test.cluster.tasks.task_manager_client import TaskManagerClient
 from test.cluster.tasks.task_manager_types import TaskStatus, TaskStats
@@ -151,6 +153,45 @@ async def test_tablet_repair_task_list(manager: ManagerClient):

    await asyncio.gather(run_repair(0, "test"), run_repair(1, "test2"), run_repair(2, "test3"), check_repair_task_list(tm, servers, module_name, ks))

+@pytest.mark.asyncio
+@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
+async def test_tablet_repair_wait(manager: ManagerClient):
+    module_name = "tablets"
+    tm = TaskManagerClient(manager.api)
+
+    stop_repair_injection = "repair_tablet_repair_task_impl_run"
+    servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager)
+    assert module_name in await tm.list_modules(servers[0].ip_addr), "tablets module wasn't registered"
+
+    await inject_error_on(manager, stop_repair_injection, servers)
+    await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", "all", await_completion=False)
+
+    repair_tasks = await wait_tasks_created(tm, servers[0], module_name, 1, "user_repair", keyspace=ks)
+    task = repair_tasks[0]
+
+    log = await manager.server_open_log(servers[0].server_id)
+    mark = await log.mark()
+
+    async def wait_for_task():
+        await enable_injection(manager, servers, "tablet_virtual_task_wait")
+        status_wait = await tm.wait_for_task(servers[0].ip_addr, task.task_id)
+
+    async def merge_tablets():
+        await log.wait_for('tablet_virtual_task: wait until tablet operation is finished', from_mark=mark)
+
+        # Resume repair.
+        await message_injection(manager, servers, stop_repair_injection)
+
+        # Merge tablets.
+        coord = await find_server_by_host_id(manager, servers, await get_topology_coordinator(manager))
+        log2 = await manager.server_open_log(coord.server_id)
+        await trigger_tablet_merge(manager, servers, [log2])
+
+        await read_barrier(manager.api, servers[0].ip_addr)
+        await message_injection(manager, servers, "tablet_virtual_task_wait")
+
+    await asyncio.gather(wait_for_task(), merge_tablets())
+
 @pytest.mark.asyncio
 @pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
 async def test_tablet_repair_task_children(manager: ManagerClient):
--- a/test/cluster/test_bootstrap_with_quick_group0_join.py
+++ b/test/cluster/test_bootstrap_with_quick_group0_join.py
@@ -0,0 +1,70 @@
+#
+# Copyright (C) 2026-present ScyllaDB
+#
+# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+#
+import logging
+import asyncio
+import time
+
+import pytest
+
+from test.cluster.util import get_current_group0_config
+from test.pylib.manager_client import ManagerClient
+from test.pylib.rest_client import read_barrier
+from test.pylib.util import wait_for
+
+
+logger = logging.getLogger(__name__)
+
+
+@pytest.mark.asyncio
+@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
+async def test_bootstrap_with_quick_group0_join(manager: ManagerClient):
+    """Regression test for https://scylladb.atlassian.net/browse/SCYLLADB-959.
+
+    The bug was that when the bootstrapping node joined group0 before reaching
+    post_server_start, it skipped post_server_start and thus hung forever.
+
+    The test simulates the scenario by starting the second node with the
+    join_group0_pause_before_config_check injection. Without the fix, the
+    startup times out.
+    """
+    logger.info("Adding first server")
+    s1 = await manager.server_add()
+
+    logger.info("Adding second server with join_group0_pause_before_config_check enabled")
+    s2 = await manager.server_add(start=False, config={
+        'error_injections_at_startup': ['join_group0_pause_before_config_check']
+    })
+
+    logger.info(f"Starting {s2}")
+    start_task = asyncio.create_task(manager.server_start(s2.server_id))
+
+    s2_log = await manager.server_open_log(s2.server_id)
+
+    await s2_log.wait_for("join_group0_pause_before_config_check: waiting for message", timeout=60)
+
+    s1_host_id = await manager.get_host_id(s1.server_id)
+    s2_host_id = await manager.get_host_id(s2.server_id)
+
+    async def s2_in_group0_config_on_s1():
+        config = await get_current_group0_config(manager, s1)
+        ids = {m[0] for m in config}
+        assert s1_host_id in ids  # sanity check
+        return True if s2_host_id in ids else None
+
+    # Note: we would like to wait for s2 to see itself in the group0 config, but we can't execute
+    # get_current_group0_config for s2, as s2 doesn't handle CQL requests at this point. As a workaround, we wait for s1
+    # to see s2 and then perform a read barrier on s2.
+    logger.info(f"Waiting for {s1} to see {s2} in the group0 config")
+    await wait_for(s2_in_group0_config_on_s1, deadline=time.time() + 60, period=0.1)
+
+    logger.info(f"Performing read barrier on {s2} to make sure it sees itself in the group0 config")
+    await read_barrier(manager.api, s2.ip_addr)
+
+    logger.info(f"Unblocking {s2}")
+    await manager.api.message_injection(s2.ip_addr, 'join_group0_pause_before_config_check')
+
+    logger.info(f"Waiting for {s2} to complete bootstrap")
+    await asyncio.wait_for(start_task, timeout=60)
--- a/test/cluster/test_encryption.py
+++ b/test/cluster/test_encryption.py
@@ -433,7 +433,8 @@ async def test_non_existant_table_master_key(manager: ManagerClient, tmpdir):

 async def test_system_auth_encryption(manager: ManagerClient, tmpdir):
    cfg = {"authenticator": "org.apache.cassandra.auth.PasswordAuthenticator", 
-               "authorizer": "org.apache.cassandra.auth.CassandraAuthorizer"}
+               "authorizer": "org.apache.cassandra.auth.CassandraAuthorizer",
+                "commitlog_sync": "batch" }

    servers: list[ServerInfo] = await manager.servers_add(servers_num = 1, config=cfg, 
                                                          driver_connect_opts={'auth_provider': PlainTextAuthProvider(username='cassandra', password='cassandra')})
@@ -450,11 +451,14 @@ async def test_system_auth_encryption(manager: ManagerClient, tmpdir):
            file_paths = [f for f in file_paths if os.path.isfile(f) and not os.path.islink(f)]

            for file_path in file_paths:
-                with open(file_path, 'rb') as f:
-                    data = f.read()
-                    if pbytes in data:
-                        pattern_found_counter += 1
-                        logger.debug("Pattern '%s' found in %s", pattern, file_path)
+                try:
+                    with open(file_path, 'rb') as f:
+                        data = f.read()
+                        if pbytes in data:
+                            pattern_found_counter += 1
+                            logger.debug("Pattern '%s' found in %s", pattern, file_path)
+                except FileNotFoundError:
+                    pass # assume just compacted away

        if expect:
            assert pattern_found_counter > 0
@@ -462,15 +466,15 @@ async def test_system_auth_encryption(manager: ManagerClient, tmpdir):
            assert pattern_found_counter == 0

    async def verify_system_info(expect: bool):
-        user = f"user_{str(uuid.uuid4())}"
+        user = f"user_{str(uuid.uuid4())}".replace('-','_')
        pwd = f"pwd_{str(uuid.uuid4())}"
        cql.execute(f"CREATE USER {user} WITH PASSWORD '{pwd}' NOSUPERUSER")
        assert_one(cql, f"LIST ROLES of {user}", [user, False, True, {}])

        logger.debug("Verify PART 1: check commitlogs -------------")

-        grep_database_files(pwd, "commitlog", "**/*.log", expect)
-        grep_database_files(user, "commitlog", "**/*.log", True)
+        await grep_database_files(pwd, "commitlog", "**/*.log", False)
+        await grep_database_files(user, "commitlog", "**/*.log", expect)

        salted_hash = None
        system_auth = None
@@ -487,39 +491,38 @@ async def test_system_auth_encryption(manager: ManagerClient, tmpdir):

        assert salted_hash is not None
        assert system_auth is not None
-        grep_database_files(salted_hash, "commitlog", "**/*.log", expect)
+        await grep_database_files(salted_hash, "commitlog", "**/*.log", expect)

        rand_comment = f"comment_{str(uuid.uuid4())}"

        async with await create_ks(manager) as ks:
-            async with await new_test_table(cql, ks, "key text PRIMARY KEY, c1 text, c2 text") as table:
+            async with new_test_table(manager, ks, "key text PRIMARY KEY, c1 text, c2 text") as table:
                cql.execute(f"ALTER TABLE {table} WITH comment = '{rand_comment}'")
-                grep_database_files(rand_comment, "commitlog/schema", "**/*.log", expect)
-                nodetool.flush_all(cql)
+                await grep_database_files(rand_comment, "commitlog/schema", "**/*.log", expect)
+                # Note: the original test also grepped in sstables. This no longer works
+                # since all system tables are compressed, and thus binary grepping will
+                # not work. We could do scylla sstable dump-data and grep in the json,
+                # but this is somewhat pointless as this would, if it handles it, just
+                # decrypt the info from the sstable, thus we can't really verify anything.
+                # We could maybe check that the expected system tables are in fact encrypted,
+                # though this is more a promise than a guarantee... Also, the only tables
+                # encrypted are paxos and batchlog -> pointless

-                logger.debug("Verify PART 2: check sstable files -------------\n`system_info_encryption` won't encrypt sstable files on disk")
-                logger.debug("GREP_DB_FILES: Check PM key user in sstable file ....")
-                grep_database_files(user, f"data/{system_auth}/", "**/*-Data.db", expect=True)
-                logger.debug("GREP_DB_FILES: Check original password in commitlogs .... Original password should never be saved")
-                grep_database_files(pwd, f"data/{system_auth}/", "**/*-Data.db", expect=False)
-                logger.debug("GREP_DB_FILES: Check salted_hash of password in sstable file ....")
-                grep_database_files(salted_hash, f"data/{system_auth}/", "**/*-Data.db", expect=False)
-                logger.debug("GREP_DB_FILES: Check table comment in sstable file ....")
-                grep_database_files(rand_comment, "data/system_schema/", "**/*-Data.db", expect=True)
-
-    verify_system_info(True) # not encrypted
+    await verify_system_info(True) # not encrypted

    cfg = {"system_info_encryption": {
        "enabled": True, 
-        "key_provider": "LocalFileSystemKeyProviderFactory"}
+        "key_provider": "LocalFileSystemKeyProviderFactory"},
+        "system_key_directory": os.path.join(tmpdir, "resources/system_keys")
        }

    for server in servers:
-        manager.server_update_config(server.server_id, config_options=cfg)
+        await manager.server_update_config(server.server_id, config_options=cfg)
+        await manager.server_restart(server.server_id)

    await manager.rolling_restart(servers)

-    verify_system_info(False) # should not see stuff now
+    await verify_system_info(False) # should not see stuff now


 async def test_system_encryption_reboot(manager: ManagerClient, tmpdir):
--- a/test/cluster/test_incremental_repair.py
+++ b/test/cluster/test_incremental_repair.py
@@ -609,14 +609,19 @@ async def do_test_tablet_incremental_repair_merge_error(manager, error):

    scylla_path = get_scylla_path(cql)

+    coord = await get_topology_coordinator(manager)
+    coord_serv = await find_server_by_host_id(manager, servers, coord)
+    coord_log = await manager.server_open_log(coord_serv.server_id)
+
    # Trigger merge and error in merge
-    s1_mark = await logs[0].mark()
-    await inject_error_on(manager, error, servers[:1])
+    mark = await coord_log.mark()
+    await inject_error_on(manager, error, [coord_serv])
    await inject_error_on(manager, "tablet_force_tablet_count_decrease", servers)
-    await logs[0].wait_for(f'Got {error}', from_mark=s1_mark)
+    await inject_error_on(manager, "tablet_force_tablet_count_decrease_once", servers)
+    await coord_log.wait_for(f'Got {error}', from_mark=mark)
    await inject_error_off(manager, "tablet_force_tablet_count_decrease", servers)
-    await manager.server_stop(servers[0].server_id)
-    await manager.server_start(servers[0].server_id)
+    await manager.server_stop(coord_serv.server_id)
+    await manager.server_start(coord_serv.server_id)

    for server in servers:
        await manager.server_stop_gracefully(server.server_id)
@@ -862,50 +867,6 @@ async def test_repair_sigsegv_with_diff_shard_count(manager: ManagerClient, use_
            logger.info("Starting vnode repair")
            await manager.api.repair(servers[1].ip_addr, ks, "test")

-# Reproducer for https://github.com/scylladb/scylladb/issues/27365
-# Incremental repair vs tablet merge
-@pytest.mark.asyncio
-@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
-async def test_tablet_incremental_repair_tablet_merge_compaction_group_gone(manager: ManagerClient):
-    cmdline = ['--logger-log-level', 'repair=debug']
-    servers, cql, hosts, ks, table_id, logs, _, _, _, _ = await preapre_cluster_for_incremental_repair(manager, cmdline=cmdline)
-
-    coord = await get_topology_coordinator(manager)
-    coord_serv = await find_server_by_host_id(manager, servers, coord)
-    coord_log = await manager.server_open_log(coord_serv.server_id)
-
-    # Trigger merge and wait until the merge fiber starts
-    s1_mark = await coord_log.mark()
-    await inject_error_on(manager, "merge_completion_fiber", servers)
-    await inject_error_on(manager, "tablet_force_tablet_count_decrease_once", servers)
-    await inject_error_on(manager, "tablet_force_tablet_count_decrease", servers)
-    await coord_log.wait_for(f'Detected tablet merge for table', from_mark=s1_mark)
-    await inject_error_off(manager, "tablet_force_tablet_count_decrease", servers)
-    await coord_log.wait_for(f'merge_completion_fiber: waiting for message', from_mark=s1_mark)
-
-    # Trigger repair and wait for the inc repair prepare preparation to start
-    s1_mark = await coord_log.mark()
-    await inject_error_on(manager, "wait_after_prepare_sstables_for_incremental_repair", servers)
-    await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token=-1, await_completion=False, incremental_mode='incremental')
-    # Wait for preparation to start.
-    await coord_log.wait_for('Disabling compaction for range', from_mark=s1_mark)
-    # Without the serialization, sleep to increase chances of preparation finishing before merge fiber.
-    # With the serialization, preparation will wait for merge fiber to finish.
-    await asyncio.sleep(0.1)
-
-    # Continue to execute the merge fiber so that the compaction group is removed
-    await inject_error_on(manager, "replica_merge_completion_wait", servers)
-    for s in servers:
-        await manager.api.message_injection(s.ip_addr, "merge_completion_fiber")
-
-    await coord_log.wait_for(f'Merge completion fiber finished', from_mark=s1_mark)
-
-    # Continue the repair to trigger use-after-free
-    for s in servers:
-        await manager.api.message_injection(s.ip_addr, "wait_after_prepare_sstables_for_incremental_repair")
-
-    await coord_log.wait_for(f'Finished tablet repair', from_mark=s1_mark)
-
 # Reproducer for https://github.com/scylladb/scylladb/issues/27365
 # Incremental repair vs table drop
@pytest.mark.asyncio
--- a/test/cluster/test_internode_compression.py
+++ b/test/cluster/test_internode_compression.py
@@ -162,7 +162,12 @@ async def do_test_internode_compression_between_datacenters(manager: ManagerClie

    await asyncio.gather(*[manager.server_stop(s.server_id) for s,_ in servers])
    await asyncio.gather(*[p.stop() for p in proxies])
-
+    # these will all raise, because we just stopped them above
+    for coro in proxy_futs:
+        try:
+            await coro
+        except:
+            pass

 async def test_internode_compression_compress_packets_between_nodes(request, manager: ManagerClient) -> None:
    def check_expected(msg_size, node1_proxy, node2_proxy, node3_proxy):
--- a/test/cluster/test_prepare_race.py
+++ b/test/cluster/test_prepare_race.py
@@ -0,0 +1,65 @@
+#
+# Copyright (C) 2026-present ScyllaDB
+#
+# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+#
+
+import asyncio
+import pytest
+
+from test.cluster.util import new_test_keyspace, new_test_table
+from test.pylib.manager_client import ManagerClient
+from test.pylib.rest_client import inject_error_one_shot
+
+
+@pytest.mark.asyncio
+@pytest.mark.skip_mode(mode="release", reason="error injections are not supported in release mode")
+async def test_prepare_fails_if_cached_statement_is_invalidated_mid_prepare(manager: ManagerClient):
+    server = await manager.server_add()
+    cql = manager.get_cql()
+    log = await manager.server_open_log(server.server_id)
+    
+    async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1};") as ks:
+        async with new_test_table(manager, ks, "pk int PRIMARY KEY") as table:
+            query = f"SELECT * FROM {table} WHERE pk = ?"
+            loop = asyncio.get_running_loop()
+            await cql.run_async(f"INSERT INTO {table} (pk) VALUES (7)")
+            await cql.run_async(f"INSERT INTO {table} (pk) VALUES (8)")
+
+            handler = await inject_error_one_shot(manager.api, server.ip_addr, "query_processor_prepare_wait_after_cache_get")
+            mark = await log.mark()
+            prepare_future = loop.run_in_executor(None, lambda: cql.prepare(query))
+            await log.wait_for("query_processor_prepare_wait_after_cache_get: waiting for message", from_mark=mark, timeout=60)
+
+            # Trigger table schema update (metadata-only) to invalidate prepared statements while PREPARE is paused.
+            await cql.run_async(f"ALTER TABLE {table} WITH comment = 'invalidate-prepared-race'")
+
+            await handler.message()
+            done, _ = await asyncio.wait({prepare_future}, timeout=15)
+            if not done:
+                pytest.fail("Timed out waiting for PREPARE to complete after signaling injection")
+
+            result = done.pop().result()
+            print(f"PREPARE succeeded as expected: {result!r}")
+
+            rows = cql.execute(result, [7])
+            row = rows.one()
+            assert row is not None and row.pk == 7
+
+            # Invalidate prepared statements again, then execute the same prepared object.
+            # The driver should transparently re-prepare and re-request execution.
+            await cql.run_async(f"ALTER TABLE {table} WITH comment = 'invalidate-prepared-race-again'")
+
+            reprepare_handler = await inject_error_one_shot(manager.api, server.ip_addr, "query_processor_prepare_wait_after_cache_get")
+            reprepare_mark = await log.mark()
+            execute_future = loop.run_in_executor(None, lambda: cql.execute(result, [8]))
+            await log.wait_for("query_processor_prepare_wait_after_cache_get: waiting for message", from_mark=reprepare_mark, timeout=60)
+
+            await reprepare_handler.message()
+            execute_done, _ = await asyncio.wait({execute_future}, timeout=15)
+            if not execute_done:
+                pytest.fail("Timed out waiting for driver execute to finish after re-prepare signaling")
+
+            retried_rows = execute_done.pop().result()
+            retried_row = retried_rows.one()
+            assert retried_row is not None and retried_row.pk == 8
--- a/test/cluster/test_proxy_protocol.py
+++ b/test/cluster/test_proxy_protocol.py
@@ -16,8 +16,10 @@ import pytest
 import socket
 import ssl
 import struct
+import time

 from test.pylib.manager_client import ManagerClient
+from test.pylib.util import wait_for

 logger = logging.getLogger(__name__)

@@ -269,6 +271,28 @@ async def send_cql_with_proxy_header_tls(
            sock.close()


+async def wait_for_results(cql, query: str, expected_count: int, timeout: float = 30.0, filter_fn=None):
+    """
+    Polls `query` until at least `expected_count` rows satisfy `filter_fn` (all rows if no filter is given).
+    On timeout, logs the full result set from the last poll to aid debugging.
+    """
+    last_rows: list = []
+
+    async def check_resultset():
+        nonlocal last_rows
+        last_rows = list(await cql.run_async(query))
+        matching = filter_fn(last_rows) if filter_fn is not None else last_rows
+        if len(matching) >= expected_count:
+            return matching
+        return None
+
+    try:
+        return await wait_for(check_resultset, time.time() + timeout, period=0.1)
+    except Exception:
+        logger.error('Timed out waiting for %d matching rows in system.clients. Last poll returned %d total rows:\n%s',
+                     expected_count, len(last_rows),'\n'.join(str(r) for r in last_rows))
+        raise
+
 # Shared server configuration for all tests
 # We configure explicit SSL ports to keep the standard ports unencrypted
 # so the Python driver can connect without TLS.
@@ -368,9 +392,12 @@ async def test_proxy_protocol_shard_aware(proxy_server):
            await do_cql_handshake(reader, writer)

        # Now query system.clients to verify shard assignments
-        rows = list(cql.execute(
-            f"SELECT address, port, shard_id FROM system.clients WHERE address = '{fake_src_addr}' ALLOW FILTERING"
-        ))
+        rows = await wait_for_results(
+            cql,
+            'SELECT address, port, shard_id FROM system.clients',
+            expected_count=num_shards,
+            filter_fn=lambda all_rows: [r for r in all_rows if str(r.address) == fake_src_addr],
+        )

        # Build a map of port -> shard_id from the results
        port_to_shard = {row.port: row.shard_id for row in rows}
@@ -446,9 +473,12 @@ async def test_proxy_protocol_port_preserved_in_system_clients(proxy_server):

        # Now query system.clients using the driver to see our connection
        cql = manager.get_cql()
-        rows = list(cql.execute(
-            f"SELECT address, port FROM system.clients WHERE address = '{fake_src_addr}' ALLOW FILTERING"
-        ))
+        rows = await wait_for_results(
+            cql,
+            'SELECT address, port FROM system.clients',
+            expected_count=1,
+            filter_fn=lambda all_rows: [r for r in all_rows if str(r.address) == fake_src_addr],
+        )

        # We should find our connection with the fake source address and port
        assert len(rows) > 0, f"Expected to find connection from {fake_src_addr} in system.clients"
@@ -569,9 +599,12 @@ async def test_proxy_protocol_ssl_shard_aware(proxy_server):
                ssl_sock.recv(4096)

        # Now query system.clients to verify shard assignments
-        rows = list(cql.execute(
-            f"SELECT address, port, shard_id, ssl_enabled FROM system.clients WHERE address = '{fake_src_addr}' ALLOW FILTERING"
-        ))
+        rows = await wait_for_results(
+            cql,
+            'SELECT address, port, shard_id, ssl_enabled FROM system.clients',
+            expected_count=num_shards,
+            filter_fn=lambda all_rows: [r for r in all_rows if str(r.address) == fake_src_addr],
+        )

        # Build a map of port -> (shard_id, ssl_enabled) from the results
        port_to_info = {row.port: (row.shard_id, row.ssl_enabled) for row in rows}
@@ -656,9 +689,12 @@ async def test_proxy_protocol_ssl_port_preserved(proxy_server):

        # Now query system.clients using the driver to see our connection
        cql = manager.get_cql()
-        rows = list(cql.execute(
-            f"SELECT address, port, ssl_enabled FROM system.clients WHERE address = '{fake_src_addr}' ALLOW FILTERING"
-        ))
+        rows = await wait_for_results(
+            cql,
+            'SELECT address, port, ssl_enabled FROM system.clients',
+            expected_count=1,
+            filter_fn=lambda all_rows: [r for r in all_rows if str(r.address) == fake_src_addr],
+        )

        # We should find our connection
        assert len(rows) > 0, f"Expected to find connection from {fake_src_addr} in system.clients"
--- a/test/cluster/test_raft_no_quorum.py
+++ b/test/cluster/test_raft_no_quorum.py
@@ -7,6 +7,7 @@ import logging

 import pytest
 import asyncio
+from test.pylib.internal_types import ServerNum
 from test.pylib.manager_client import ManagerClient
 from test.cluster.conftest import skip_mode
 from test.pylib.rest_client import inject_error_one_shot, InjectionHandler, read_barrier
@@ -20,6 +21,20 @@ def fixture_raft_op_timeout(build_mode):
    return 10000 if build_mode == 'debug' else 1000


+async def update_group0_raft_op_timeout(server_id: ServerNum, manager: ManagerClient, timeout: int) -> None:
+    logger.info(f"Updating group0_raft_op_timeout_in_ms on server {server_id} to {timeout}")
+    running_ids = [srv.server_id for srv in await manager.running_servers()]
+    if server_id in running_ids:
+        # If the node is alive, server_update_config only sends the SIGHUP signal to the Scylla process, so awaiting it
+        # doesn't guarantee that the new config file is active. Work around this by looking at the logs.
+        log_file = await manager.server_open_log(server_id)
+        mark = await log_file.mark()
+        await manager.server_update_config(server_id, 'group0_raft_op_timeout_in_ms', timeout)
+        await log_file.wait_for("completed re-reading configuration file", from_mark=mark, timeout=60)
+    else:
+        await manager.server_update_config(server_id, 'group0_raft_op_timeout_in_ms', timeout)
+
+
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
@pytest.mark.skip_mode(mode='debug', reason='aarch64/debug is unpredictably slow', platform_key='aarch64')
@@ -42,7 +57,6 @@ async def test_cannot_add_new_node(manager: ManagerClient, raft_op_timeout: int)

    config = {
        'direct_failure_detector_ping_timeout_in_ms': 300,
-        'group0_raft_op_timeout_in_ms': raft_op_timeout,
        'error_injections_at_startup': [
            {
                'name': 'raft-group-registry-fd-threshold-in-ms',
@@ -64,6 +78,10 @@ async def test_cannot_add_new_node(manager: ManagerClient, raft_op_timeout: int)
                         manager.server_stop_gracefully(servers[3].server_id),
                         manager.server_stop_gracefully(servers[4].server_id))

+    # Do it here to prevent unexpected timeouts before quorum loss.
+    await asyncio.gather(*(update_group0_raft_op_timeout(srv.server_id, manager, raft_op_timeout)
+                           for srv in servers[:2]))
+
    logger.info("starting a sixth node with no quorum")
    await manager.server_add(expected_error="raft operation \\[read_barrier\\] timed out, there is no raft quorum",
                             timeout=60)
@@ -76,7 +94,6 @@ async def test_cannot_add_new_node(manager: ManagerClient, raft_op_timeout: int)
@pytest.mark.skip_mode(mode='debug', reason='aarch64/debug is unpredictably slow', platform_key='aarch64')
 async def test_quorum_lost_during_node_join(manager: ManagerClient, raft_op_timeout: int) -> None:
    config = {
-        'group0_raft_op_timeout_in_ms': raft_op_timeout,
        'error_injections_at_startup': [
            {
                'name': 'raft-group-registry-fd-threshold-in-ms',
@@ -107,6 +124,9 @@ async def test_quorum_lost_during_node_join(manager: ManagerClient, raft_op_time
    await asyncio.gather(manager.server_stop_gracefully(servers[1].server_id),
                         manager.server_stop_gracefully(servers[2].server_id))

+    # Do it here to prevent unexpected timeouts before quorum loss.
+    await update_group0_raft_op_timeout(servers[0].server_id, manager, raft_op_timeout)
+
    logger.info("release join-node-before-add-entry injection")
    await injection_handler.message()

@@ -126,7 +146,6 @@ async def test_quorum_lost_during_node_join_response_handler(manager: ManagerCli

    logger.info("adding a fourth node")
    servers += [await manager.server_add(config={
-        'group0_raft_op_timeout_in_ms': raft_op_timeout,
        'error_injections_at_startup': [
            {
                'name': 'raft-group-registry-fd-threshold-in-ms',
@@ -153,6 +172,9 @@ async def test_quorum_lost_during_node_join_response_handler(manager: ManagerCli
    await asyncio.gather(manager.server_stop_gracefully(servers[1].server_id),
                         manager.server_stop_gracefully(servers[2].server_id))

+    # Do it here to prevent unexpected timeouts before quorum loss.
+    await update_group0_raft_op_timeout(servers[3].server_id, manager, raft_op_timeout)
+
    logger.info("release join-node-response_handler-before-read-barrier injection")
    injection_handler = InjectionHandler(manager.api,
                                         'join-node-response_handler-before-read-barrier',
@@ -169,7 +191,6 @@ async def test_quorum_lost_during_node_join_response_handler(manager: ManagerCli
 async def test_cannot_run_operations(manager: ManagerClient, raft_op_timeout: int) -> None:
    logger.info("starting a first node (the leader)")
    servers = [await manager.server_add(config={
-        'group0_raft_op_timeout_in_ms': raft_op_timeout,
        'error_injections_at_startup': [
            {
                'name': 'raft-group-registry-fd-threshold-in-ms',
@@ -189,6 +210,9 @@ async def test_cannot_run_operations(manager: ManagerClient, raft_op_timeout: in
    await asyncio.gather(manager.server_stop_gracefully(servers[1].server_id),
                         manager.server_stop_gracefully(servers[2].server_id))

+    # Do it here to prevent unexpected timeouts before quorum loss.
+    await update_group0_raft_op_timeout(servers[0].server_id, manager, raft_op_timeout)
+
    logger.info("attempting removenode for the second node")
    await manager.remove_node(servers[0].server_id, servers[1].server_id,
                            expected_error="raft operation [read_barrier] timed out, there is no raft quorum",
@@ -232,9 +256,7 @@ async def test_can_restart(manager: ManagerClient, raft_op_timeout: int) -> None
    await asyncio.gather(*(manager.server_stop(srv.server_id) for srv in servers))

    # This ensures the read barriers below fail quickly without group 0 quorum.
-    logger.info(f"Decreasing group0_raft_op_timeout_in_ms on {servers}")
-    await asyncio.gather(*(manager.server_update_config(srv.server_id, 'group0_raft_op_timeout_in_ms', raft_op_timeout)
-                           for srv in servers))
+    await asyncio.gather(*(update_group0_raft_op_timeout(srv.server_id, manager, raft_op_timeout) for srv in servers))

    logger.info(f"Restarting {servers[:2]} with no group 0 quorum")
    for idx, srv in enumerate(servers[:2]):
@@ -246,8 +268,7 @@ async def test_can_restart(manager: ManagerClient, raft_op_timeout: int) -> None

    # Increase the timeout back to 300s to ensure the new group 0 leader is elected before the first read barrier below
    # times out.
-    await asyncio.gather(*(manager.server_update_config(srv.server_id, 'group0_raft_op_timeout_in_ms', 300000)
-                           for srv in servers))
+    await asyncio.gather(*(update_group0_raft_op_timeout(srv.server_id, manager, 300000) for srv in servers))

    logger.info(f"Restarting {servers[2:]} with group 0 quorum")
    for srv in servers[2:]:
--- a/test/cluster/test_tablets_lwt.py
+++ b/test/cluster/test_tablets_lwt.py
@@ -978,7 +978,7 @@ async def test_tablets_merge_waits_for_lwt(manager: ManagerClient):
        await wait_for_tablet_count(manager, s0, ks, 'test', lambda c: c == 1, 1, timeout_s=15)

        logger.info("Ensure the guard decided to retain the erm")
-        await log0.wait_for("tablet_metadata_guard::check: retain the erm and abort the guard",
+        m, _ = await log0.wait_for("tablet_metadata_guard::check: retain the erm and abort the guard",
                            from_mark=m, timeout=10)

        tablets = await get_all_tablet_replicas(manager, s0, ks, 'test')
@@ -986,7 +986,11 @@ async def test_tablets_merge_waits_for_lwt(manager: ManagerClient):
        tablet = tablets[0]
        assert tablet.replicas == [(s0_host_id, 0)]

-        m = await log0.mark()
+        # Since merge now waits for erms before releasing the state machine,
+        # the migration initiated below will not start until paxos releases the erm.
+        # The barrier which is blocked is the one in merge finalization.
+        # The tablet movement is kept as a guard against regressions in case the behavior changes.
+
        migration_task = asyncio.create_task(manager.api.move_tablet(s0.ip_addr, ks, "test",
                                                                     s0_host_id, 0,
                                                                     s0_host_id, 1,
--- a/test/cluster/test_tablets_merge.py
+++ b/test/cluster/test_tablets_merge.py
@@ -441,84 +441,6 @@ async def test_tablet_split_merge_with_many_tables(build_mode: str, manager: Man

    await check_logs("after merge completion")

-# Reproduces use-after-free when migration right after merge, but concurrently to background
-# merge completion handler.
-# See: https://github.com/scylladb/scylladb/issues/24045
-@pytest.mark.asyncio
-@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
-async def test_migration_running_concurrently_to_merge_completion_handling(manager: ManagerClient):
-    cmdline = []
-    # Size based balancing can attempt to migrate the merged tablet as soon as the merge is complete
-    # because of a lower transient effective_capacity on the node with the merged tablet.
-    # This migration will timeout on cleanup because the compaction group still has an active task,
-    # which is held by the merge_completion_fiber injection, so the tablet's compaction group gate
-    # can not be closed, resulting in cleanup getting stuck. We force capacity based balancing to
-    # avoid this problem.
-    cfg = {'force_capacity_based_balancing': True}
-    servers = [await manager.server_add(cmdline=cmdline, config=cfg)]
-
-    await manager.disable_tablet_balancing()
-
-    cql = manager.get_cql()
-
-    async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 2}") as ks:
-        await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);")
-
-        tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
-        assert tablet_count == 2
-
-        old_tablet_count = tablet_count
-
-        keys = range(100)
-        await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys])
-
-        await cql.run_async(f"ALTER KEYSPACE {ks} WITH tablets = {{'initial': 1}};")
-
-        s0_log = await manager.server_open_log(servers[0].server_id)
-        s0_mark = await s0_log.mark()
-
-        await manager.api.enable_injection(servers[0].ip_addr, "merge_completion_fiber", one_shot=True)
-        await manager.api.enable_injection(servers[0].ip_addr, "replica_merge_completion_wait", one_shot=True)
-        await manager.enable_tablet_balancing()
-
-        servers.append(await manager.server_add(cmdline=cmdline, config=cfg))
-        s1_host_id = await manager.get_host_id(servers[1].server_id)
-
-        async def finished_merging():
-            tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
-            return tablet_count < old_tablet_count or None
-
-        await wait_for(finished_merging, time.time() + 120)
-
-        await manager.disable_tablet_balancing()
-        await manager.api.enable_injection(servers[0].ip_addr, "take_storage_snapshot", one_shot=True)
-
-        await s0_log.wait_for(f"merge_completion_fiber: waiting", from_mark=s0_mark)
-
-        tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
-        assert tablet_count == 1
-
-        tablet_token = 0 # Doesn't matter since there is one tablet
-        replica = await get_tablet_replica(manager, servers[0], ks, 'test', tablet_token)
-
-        s0_host_id = await manager.get_host_id(servers[0].server_id)
-        src_shard = replica[1]
-        dst_shard = src_shard
-
-        migration = asyncio.create_task(manager.api.move_tablet(servers[0].ip_addr, ks, "test", replica[0], src_shard, s1_host_id, dst_shard, tablet_token))
-
-        await s0_log.wait_for(f"take_storage_snapshot: waiting", from_mark=s0_mark)
-
-        await manager.api.message_injection(servers[0].ip_addr, "merge_completion_fiber")
-        await s0_log.wait_for(f"Merge completion fiber finished", from_mark=s0_mark)
-
-        await manager.api.message_injection(servers[0].ip_addr, "take_storage_snapshot")
-
-        await migration
-
-        rows = await cql.run_async(f"SELECT * FROM {ks}.test;")
-        assert len(rows) == len(keys)
-
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
 async def test_missing_data(manager: ManagerClient):
@@ -655,3 +577,77 @@ async def test_merge_with_drop(manager: ManagerClient):
        await asyncio.sleep(0.1)
        await manager.api.message_injection(server.ip_addr, "compaction_group_stop_wait")
        await drop_table_fut
+
+
+@pytest.mark.asyncio
+@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
+async def test_background_merge_deadlock(manager: ManagerClient):
+    """
+    Reproducer for https://scylladb.atlassian.net/browse/SCYLLADB-928
+
+    Reproduces a deadlock in the background merge completion handler that can happen when multiple merges accumulate.
+    If we accumulate more than 1 merge cycle for the fiber, deadlock occurs due to compaction lock taken
+    on the main group (post-merge). The lock is held until compaction groups are processed by the background merge
+    fiber.
+
+    Example:
+
+    Initial state:
+
+      cg0: main,
+      cg1: main
+      cg2: main
+      cg3: main
+
+    After 1st merge:
+
+      cg0': main [locked], merging_groups=[cg0.main, cg1.main]
+      cg1': main [locked], merging_groups=[cg2.main, cg3.main]
+
+    After 2nd merge:
+
+      cg0'': main [locked], merging_groups=[cg0'.main [locked], cg0.main, cg1.main, cg1'.main [locked], cg2.main, cg3.main]
+
+    The test reproduces this by doing a tablet merge from 8 tablets to 1 (8 -> 4 -> 2 -> 1). The background merge fiber
+    is blocked until after the first merge (to 4), so that there is a higher chance of two merges queueing in the fiber.
+
+    If deadlock occurs, node shutdown will hang waiting for the background merge fiber. That's why the test
+    tries to stop the node at the end.
+    """
+
+    cmdline = [
+        '--logger-log-level', 'load_balancer=debug',
+        '--logger-log-level', 'raft_topology=debug',
+    ]
+
+    servers = [await manager.server_add(cmdline=cmdline)]
+    cql, _ = await manager.get_ready_cql(servers)
+
+    ks = await create_new_test_keyspace(cql, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}")
+
+    # Create a table which will go through 3 merge cycles.
+    await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int) with tablets = {{'min_tablet_count': 8}};")
+
+    await manager.api.enable_injection(servers[0].ip_addr, "merge_completion_fiber", one_shot=True)
+    log = await manager.server_open_log(servers[0].server_id)
+    mark = await log.mark()
+
+    # Trigger tablet merging
+    await cql.run_async(f"ALTER TABLE {ks}.test WITH tablets = {{'min_tablet_count': 1}};")
+
+    async def produced_one_merge():
+        tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
+        return tablet_count == 4 or None
+    await wait_for(produced_one_merge, time.time() + 120)
+
+    mark, _ = await log.wait_for(f"merge_completion_fiber: waiting", from_mark=mark)
+    await manager.api.message_injection(servers[0].ip_addr, "merge_completion_fiber")
+    mark, _ = await log.wait_for(f"merge_completion_fiber: message received", from_mark=mark)
+
+    async def finished_merge():
+        tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
+        return tablet_count == 1 or None
+
+    await wait_for(finished_merge, time.time() + 120)
+
+    await manager.server_stop(servers[0].server_id)
--- a/test/cluster/test_topology_remove_garbage_group0.py
+++ b/test/cluster/test_topology_remove_garbage_group0.py
@@ -94,6 +94,8 @@ async def test_remove_garbage_group0_members(manager: ManagerClient):
    logging.info(f'stop {servers[1]}')
    await manager.server_stop_gracefully(servers[1].server_id)

+    await wait_for_token_ring_and_group0_consistency(manager, time.time() + 60)
+
    logging.info(f'removenode {servers[1]} using {servers[2]}')
    await manager.remove_node(servers[2].server_id, servers[1].server_id)

--- a/test/lib/cql_test_env.cc
+++ b/test/lib/cql_test_env.cc
@@ -559,6 +559,9 @@ private:
            cfg->ring_delay_ms.set(500);
            cfg->shutdown_announce_in_ms.set(0);
            cfg->broadcast_to_all_shards().get();
+            smp::invoke_on_all([&] {
+                sstables::global_cache_index_pages = cfg->cache_index_pages.operator utils::updateable_value<bool>();
+            }).get();
            create_directories((data_dir_path + "/system").c_str());
            create_directories(cfg->commitlog_directory().c_str());
            create_directories(cfg->schema_commitlog_directory().c_str());
--- a/test/nodetool/test_cluster_repair.py
+++ b/test/nodetool/test_cluster_repair.py
@@ -449,3 +449,68 @@ def test_repair_incremenatal_repair(nodetool, mode):
 Starting repair with task_id={id1} keyspace=ks table=table1
 Repair with task_id={id1} finished
 """
+
+def test_cluster_repair_table_dropped(nodetool):
+    id1 = "ef1b7a61-66c8-494c-bb03-6f65724e6eee"
+    res = nodetool("cluster", "repair", "ks", expected_requests=[
+        expected_request("GET", "/storage_service/keyspaces", response=["ks"]),
+        expected_request("GET", "/storage_service/keyspaces", params={"replication": "tablets"}, response=["ks"]),
+        expected_request("GET", "/column_family", response=[{"ks": "ks", "cf": "table1"}, {"ks": "ks", "cf": "table2"}]),
+        expected_request(
+            "POST",
+            "/storage_service/tablets/repair",
+            params={
+                "ks": "ks",
+                "table": "table1",
+                "tokens": "all"},
+            response={"message": "Can't find a column family table1 in keyspace ks", "code": 400}, response_status=400),
+        expected_request(
+            "POST",
+            "/storage_service/tablets/repair",
+            params={
+                "ks": "ks",
+                "table": "table2",
+                "tokens": "all"},
+            response={"tablet_task_id": id1}),
+        expected_request(
+            "GET",
+            f"/task_manager/wait_task/{id1}",
+            response={"state": "done"}),
+        ])
+
+    assert _remove_log_timestamp(res.stdout) == f"""\
+Starting repair with task_id={id1} keyspace=ks table=table2
+Repair with task_id={id1} finished
+"""
+
+def test_cluster_repair_specified_table_dropped(nodetool):
+    id1 = "ef1b7a61-66c8-494c-bb03-6f65724e6eee"
+    check_nodetool_fails_with_error_contains(
+            nodetool,
+            ("cluster", "repair", "ks", "table1", "table2"),
+            {"expected_requests": [
+                expected_request("GET", "/storage_service/keyspaces", response=["ks"]),
+                expected_request("GET", "/storage_service/keyspaces", params={"replication": "tablets"}, response=["ks"]),
+                expected_request(
+                    "POST",
+                    "/storage_service/tablets/repair",
+                    params={
+                        "ks": "ks",
+                        "table": "table1",
+                        "tokens": "all"},
+                    response={"message": "Can't find a column family table1 in keyspace ks", "code": 400}, response_status=400),
+                expected_request(
+                    "POST",
+                    "/storage_service/tablets/repair",
+                    params={
+                        "ks": "ks",
+                        "table": "table2",
+                        "tokens": "all"},
+                    response={"tablet_task_id": id1}),
+                expected_request(
+                    "GET",
+                    f"/task_manager/wait_task/{id1}",
+                    response={"state": "done"}),
+                ]
+            },
+            [f"Can't find a column family table1 in keyspace ks"])
--- a/test/perf/perf_alternator.cc
+++ b/test/perf/perf_alternator.cc
@@ -10,6 +10,7 @@
 #include <memory>
 #include <signal.h>
 #include <seastar/core/future.hh>
+#include <seastar/core/sleep.hh>
 #include <seastar/core/thread.hh>
 #include <seastar/core/app-template.hh>
 #include <seastar/http/client.hh>
@@ -78,6 +79,23 @@ static future<> make_request(http::experimental::client& cli, sstring operation,
    });
 }

+static void wait_for_alternator(const test_config& c) {
+    for (int attempt = 0; attempt < 3000; ++attempt) {
+        try {
+            auto cli = get_client(c);
+            auto close = defer([&] { cli.close().get(); });
+            make_request(cli, "ListTables", "{}").get();
+            return;
+        } catch (...) {
+        }
+        seastar::sleep(std::chrono::milliseconds(100)).get();
+        if (attempt >= 100 && attempt % 10 == 0) {
+            std::cout << fmt::format("Retrying connect to alternator port (attempt {})", attempt + 1) << std::endl;
+        }
+    }
+    throw std::runtime_error("Timed out waiting for alternator port to become ready");
+}
+
 static void delete_alternator_table(http::experimental::client& cli) {
    try {
        make_request(cli, "DeleteTable", R"({"TableName": "workloads_test"})").get();
@@ -373,6 +391,8 @@ auto make_client_pool(const test_config& c) {
 void workload_main(const test_config& c) {
    std::cout << "Running test with config: " << c << std::endl;

+    wait_for_alternator(c);
+
    auto cli = get_client(c);
    auto finally = defer([&] {
        delete_alternator_table(cli);
--- a/test/perf/perf_simple_query.cc
+++ b/test/perf/perf_simple_query.cc
@@ -330,10 +330,13 @@ int scylla_simple_query_main(int argc, char** argv) {
        ("counters", "test counters")
        ("tablets", "use tablets")
        ("initial-tablets", bpo::value<unsigned>()->default_value(128), "initial number of tablets")
+        ("sstable-summary-ratio", bpo::value<double>(), "Generate summary entry, so that summary file size / data file size ~= this ratio")
+        ("sstable-format", bpo::value<std::string>(), "SSTable format name to use")
        ("flush", "flush memtables before test")
        ("memtable-partitions", bpo::value<unsigned>(), "apply this number of partitions to memtable, then flush")
        ("json-result", bpo::value<std::string>(), "name of the json result file")
        ("enable-cache", bpo::value<bool>()->default_value(true), "enable row cache")
+        ("enable-index-cache", bpo::value<bool>()->default_value(true), "enable partition index cache")
        ("stop-on-error", bpo::value<bool>()->default_value(true), "stop after encountering the first error")
        ("timeout", bpo::value<std::string>()->default_value(""), "use timeout")
        ("bypass-cache", "use bypass cache when querying")
@@ -357,8 +360,19 @@ int scylla_simple_query_main(int argc, char** argv) {
            auto db_cfg = ::make_shared<db::config>(ext);

            const auto enable_cache = app.configuration()["enable-cache"].as<bool>();
+            const auto enable_index_cache = app.configuration()["enable-index-cache"].as<bool>();
            std::cout << "enable-cache=" << enable_cache << '\n';
+            std::cout << "enable-index-cache=" << enable_index_cache << '\n';
            db_cfg->enable_cache(enable_cache);
+            db_cfg->cache_index_pages(enable_index_cache);
+            if (app.configuration().contains("sstable-summary-ratio")) {
+                db_cfg->sstable_summary_ratio(app.configuration()["sstable-summary-ratio"].as<double>());
+            }
+            std::cout << "sstable-summary-ratio=" << db_cfg->sstable_summary_ratio() << '\n';
+            if (app.configuration().contains("sstable-format")) {
+                db_cfg->sstable_format(app.configuration()["sstable-format"].as<std::string>());
+            }
+            std::cout << "sstable-format=" << db_cfg->sstable_format() << '\n';
            cql_test_config cfg(db_cfg);
            if (app.configuration().contains("tablets")) {
                cfg.db_config->tablets_mode_for_new_keyspaces.set(db::tablets_mode_t::mode::enabled);
--- a/test/storage/test_out_of_space_prevention.py
+++ b/test/storage/test_out_of_space_prevention.py
@@ -15,9 +15,8 @@ from cassandra.cluster import ConsistencyLevel
 from cassandra.query import SimpleStatement
 from typing import Callable

-from test.cluster.conftest import skip_mode
-from test.cluster.util import get_topology_coordinator, find_server_by_host_id, new_test_keyspace, new_test_table
-from test.pylib.manager_client import ManagerClient
+from test.cluster.util import get_topology_coordinator, find_server_by_host_id, new_test_keyspace, new_test_table, reconnect_driver
+from test.pylib.manager_client import ManagerClient, wait_for_cql_and_get_hosts
 from test.pylib.tablets import get_tablet_count
 from test.pylib.util import Host
 from test.storage.conftest import space_limited_servers
@@ -81,6 +80,7 @@ async def test_user_writes_rejection(manager: ManagerClient, volumes_factory: Ca
                logger.info("Create a big file on the target node to reach critical disk utilization level")
                disk_info = psutil.disk_usage(workdir)
                with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
+                    mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
                    for _ in range(2):
                        mark, _ = await log.wait_for("database - Set critical disk utilization mode: true", from_mark=mark)

@@ -91,8 +91,9 @@ async def test_user_writes_rejection(manager: ManagerClient, volumes_factory: Ca
                    logger.info("Restart the node")
                    mark = await log.mark()
                    await manager.server_restart(servers[0].server_id)
-                    await manager.driver_connect()
-                    cql = manager.get_cql()
+                    mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
+                    cql = await reconnect_driver(manager)
+                    await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
                    for _ in range(2):
                        mark, _ = await log.wait_for("database - Set critical disk utilization mode: true", from_mark=mark)

@@ -104,6 +105,7 @@ async def test_user_writes_rejection(manager: ManagerClient, volumes_factory: Ca
                    await validate_data_existence(cql, hosts[1:], [hosts[0]], cf, 1)

                logger.info("With blob file removed, wait for DB to drop below the critical disk utilization level")
+                mark, _ = await log.wait_for("Dropped below the critical disk utilization level", from_mark=mark)
                for _ in range(2):
                    mark, _ = await log.wait_for("database - Set critical disk utilization mode: false", from_mark=mark)

@@ -112,7 +114,7 @@ async def test_user_writes_rejection(manager: ManagerClient, volumes_factory: Ca


@pytest.mark.asyncio
-async def test_autotoogle_compaction(manager: ManagerClient, volumes_factory: Callable) -> None:
+async def test_autotoggle_compaction(manager: ManagerClient, volumes_factory: Callable) -> None:
    cmdline = [*global_cmdline,
               "--logger-log-level", "compaction=debug"]
    async with space_limited_servers(manager, volumes_factory, ["100M"]*3, cmdline=cmdline) as servers:
@@ -136,15 +138,20 @@ async def test_autotoogle_compaction(manager: ManagerClient, volumes_factory: Ca
                logger.info("Create a big file on the target node to reach critical disk utilization level")
                disk_info = psutil.disk_usage(workdir)
                with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
+                    mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
                    for _ in range(2):
                        mark, _ = await log.wait_for("compaction_manager - Drained", from_mark=mark)

                    logger.info("Restart the node")
+                    mark = await log.mark()
                    await manager.server_restart(servers[0].server_id)
+                    await reconnect_driver(manager)
+                    mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
                    for _ in range(2):
                        mark, _ = await log.wait_for("compaction_manager - Drained", from_mark=mark)

                logger.info("With blob file removed, wait for DB to drop below the critical disk utilization level")
+                mark, _ = await log.wait_for("Dropped below the critical disk utilization level", from_mark=mark)
                for _ in range(2):
                    mark, _ = await log.wait_for("compaction_manager - Enabled", from_mark=mark)

@@ -235,7 +242,8 @@ async def test_reject_split_compaction(manager: ManagerClient, volumes_factory:
                logger.info("Create a big file on the target node to reach critical disk utilization level")
                disk_info = psutil.disk_usage(workdir)
                with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
-                    await log.wait_for(f"Split task .* for table {cf} .* stopped, reason: Compaction for {cf} was stopped due to: drain")
+                    mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
+                    await log.wait_for(f"Split task .* for table {cf} .* stopped, reason: Compaction for {cf} was stopped due to: drain", from_mark=mark)


@pytest.mark.asyncio
@@ -260,6 +268,7 @@ async def test_split_compaction_not_triggered(manager: ManagerClient, volumes_fa
                logger.info("Create a big file on the target node to reach critical disk utilization level")
                disk_info = psutil.disk_usage(workdir)
                with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
+                    s1_mark, _ = await s1_log.wait_for("Reached the critical disk utilization level", from_mark=s1_mark)
                    for _ in range(2):
                        s1_mark, _ = await s1_log.wait_for("compaction_manager - Drained", from_mark=s1_mark)

@@ -294,10 +303,13 @@ async def test_tablet_repair(manager: ManagerClient, volumes_factory: Callable)
                await manager.server_stop_gracefully(servers[0].server_id)
                await manager.server_wipe_sstables(servers[0].server_id, ks, table)
                await manager.server_start(servers[0].server_id)
+                cql = await reconnect_driver(manager)
+                await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)

                logger.info("Create a big file on the target node to reach critical disk utilization level")
                disk_info = psutil.disk_usage(workdir)
                with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
+                    mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
                    for _ in range(2):
                        mark, _ = await log.wait_for("repair - Drained", from_mark=mark)

@@ -328,16 +340,18 @@ async def test_tablet_repair(manager: ManagerClient, volumes_factory: Callable)
                    logger.info("Restart the node")
                    mark = await log.mark()
                    await manager.server_restart(servers[0].server_id, wait_others=2)
-                    await manager.driver_connect()
+                    await reconnect_driver(manager)
+                    mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
                    for _ in range(2):
                        mark, _ = await log.wait_for("repair - Drained", from_mark=mark)

                logger.info("With blob file removed, wait for the tablet repair to succeed")
+                mark, _ = await log.wait_for("Dropped below the critical disk utilization level", from_mark=mark)
                await manager.api.wait_task(servers[0].ip_addr, task_id)


@pytest.mark.asyncio
-async def test_autotoogle_reject_incoming_migrations(manager: ManagerClient, volumes_factory: Callable) -> None:
+async def test_autotoggle_reject_incoming_migrations(manager: ManagerClient, volumes_factory: Callable) -> None:
    cfg = {
        'tablet_load_stats_refresh_interval_in_seconds': 1,
        }
@@ -377,6 +391,7 @@ async def test_autotoogle_reject_incoming_migrations(manager: ManagerClient, vol

                disk_info = psutil.disk_usage(workdir)
                with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
+                    mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
                    for _ in range(2):
                        mark, _ = await log.wait_for("database - Set critical disk utilization mode: true", from_mark=mark)

@@ -387,6 +402,7 @@ async def test_autotoogle_reject_incoming_migrations(manager: ManagerClient, vol
                    mark, _ = await log.wait_for("Streaming for tablet migration .* failed", from_mark=mark)

                logger.info("With blob file removed, wait for DB to drop below the critical disk utilization level")
+                mark, _ = await log.wait_for("Dropped below the critical disk utilization level", from_mark=mark)
                for _ in range(2):
                    mark, _ = await log.wait_for("database - Set critical disk utilization mode: false", from_mark=mark)

@@ -435,6 +451,7 @@ async def test_node_restart_while_tablet_split(manager: ManagerClient, volumes_f
                logger.info("Create a big file on the target node to reach critical disk utilization level")
                disk_info = psutil.disk_usage(workdir)
                with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
+                    mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
                    for _ in range(2):
                        mark, _ = await log.wait_for("compaction_manager - Drained", from_mark=mark)

@@ -447,7 +464,11 @@ async def test_node_restart_while_tablet_split(manager: ManagerClient, volumes_f
                    await cql.run_async(f"ALTER TABLE {cf} WITH tablets = {{'min_tablet_count': 2}};")
                    await coord_log.wait_for(f"Generating resize decision for table {table_id} of type split")

+                    mark = await log.mark()
                    await manager.server_restart(servers[0].server_id, wait_others=2)
+                    cql = await reconnect_driver(manager)
+                    await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
+                    mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)

                    logger.info("Check if tablet split happened")
                    await assert_resize_task_info(table_id, lambda response: len(response) == 1 and response[0].resize_task_info is not None)
@@ -456,6 +477,7 @@ async def test_node_restart_while_tablet_split(manager: ManagerClient, volumes_f
                    assert await log.grep(f"compaction.*Split {cf}", from_mark=mark) == []

                logger.info("With blob file removed, wait for DB to drop below the critical disk utilization level")
+                mark, _ = await log.wait_for("Dropped below the critical disk utilization level", from_mark=mark)
                for _ in range(2):
                    mark, _ = await log.wait_for("compaction_manager - Enabled", from_mark=mark)
                mark, _ = await log.wait_for(f"Detected tablet split for table {cf}, increasing from 1 to 2 tablets", from_mark=mark)
@@ -521,6 +543,7 @@ async def test_repair_failure_on_split_rejection(manager: ManagerClient, volumes
                logger.info("Create a big file on the target node to reach critical disk utilization level")
                disk_info = psutil.disk_usage(workdir)
                with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
+                    mark, _ = await log.wait_for("Reached the critical disk utilization level", from_mark=mark)
                    for _ in range(2):
                        mark, _ = await log.wait_for("compaction_manager - Drained", from_mark=mark)

@@ -533,9 +556,100 @@ async def test_repair_failure_on_split_rejection(manager: ManagerClient, volumes
                    assert await log.grep(f"compaction.*Split {cf}", from_mark=mark) == []

                logger.info("With blob file removed, wait for DB to drop below the critical disk utilization level")
+                mark, _ = await log.wait_for("Dropped below the critical disk utilization level", from_mark=mark)
                for _ in range(2):
                    mark, _ = await log.wait_for("compaction_manager - Enabled", from_mark=mark)

                await repair_task

                mark, _ = await log.wait_for(f"Detected tablet split for table {cf}", from_mark=mark)
+
+# Since we create 20M volumes, we need to reduce the commitlog segment size,
+# otherwise we run out of space.
+global_cmdline_with_disabled_monitor = [
+    "--disk-space-monitor-normal-polling-interval-in-seconds", "1",
+    "--critical-disk-utilization-level", "1.0",
+    "--commitlog-segment-size-in-mb", "2",
+    "--schema-commitlog-segment-size-in-mb", "4",
+    "--tablet-load-stats-refresh-interval-in-seconds", "1",
+]
+@pytest.mark.asyncio
+@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
+async def test_sstables_incrementally_released_during_streaming(manager: ManagerClient, volumes_factory: Callable) -> None:
+    """
+    Test that the source node will not run out of space if major compaction rewrites the sstables being streamed.
+    Expects that both file streaming and major compaction release sstables incrementally, reducing the chance of 2x
+    space amplification.
+
+    Scenario:
+      - Create a 2-node cluster with limited disk space.
+      - Create a table with 2 tablets, one in each node
+      - Write 20% of node capacity to each tablet.
+      - Start decommissioning one node.
+      - During streaming, create a large file on the source node to push it over 85%
+      - Run major compaction, expecting that file streaming has released the sstables incrementally. Had it not, the source node would run out of space.
+      - Unblock streaming
+      - Verify that the decommission operation succeeds.
+    """
+    cmdline = [*global_cmdline_with_disabled_monitor,
+               "--logger-log-level", "load_balancer=debug",
+               "--logger-log-level", "debug_error_injection=debug"
+               ]
+    # the coordinator needs more space, so creating a 40M volume for it.
+    async with space_limited_servers(manager, volumes_factory, ["40M", "20M"], cmdline=cmdline,
+                                     property_file=[{"dc": "dc1", "rack": "r1"}]*2) as servers:
+        cql, _ = await manager.get_ready_cql(servers)
+
+        workdir = await manager.server_get_workdir(servers[1].server_id)
+        log = await manager.server_open_log(servers[1].server_id)
+
+        async with new_test_keyspace(manager, f"WITH replication = {{'class': 'NetworkTopologyStrategy', 'dc1': ['{servers[1].rack}'] }}"
+                                              " AND tablets = {'initial': 2}") as ks:
+            await manager.disable_tablet_balancing()
+
+            # Needs 1mb fragments in order to stress incremental release in file streaming
+            extra_table_param = "WITH compaction = {'class' : 'IncrementalCompactionStrategy', 'sstable_size_in_mb' : '1'} and compression = {}"
+            async with new_test_table(manager, ks, "pk int PRIMARY KEY, t text", extra_table_param) as cf:
+                before_disk_info = psutil.disk_usage(workdir)
+                # About 4mb per tablet
+                await asyncio.gather(*[cql.run_async(query) for query in write_generator(cf, 8000)])
+
+                # split data into 1mb fragments
+                await manager.api.keyspace_flush(servers[1].ip_addr, ks)
+                await manager.api.keyspace_compaction(servers[1].ip_addr, ks)
+
+                after_disk_info = psutil.disk_usage(workdir)
+                percent_by_writes = after_disk_info.percent - before_disk_info.percent
+                logger.info(f"Percent taken by writes {percent_by_writes}")
+
+                # Assert that sstable data content accounts for more than 20% of the node's storage.
+                assert percent_by_writes > 20
+
+                # We want to trap only migrations which happened during decommission
+                await manager.api.quiesce_topology(servers[0].ip_addr)
+
+                await manager.api.enable_injection(servers[1].ip_addr, "tablet_stream_files_end_wait", one_shot=True)
+                mark = await log.mark()
+
+                logger.info(f"Workdir {workdir}")
+
+                decomm_task = asyncio.create_task(manager.decommission_node(servers[1].server_id))
+                await manager.enable_tablet_balancing()
+                mark, _ = await log.wait_for("tablet_stream_files_end_wait: waiting", from_mark=mark)
+
+                disk_info = psutil.disk_usage(workdir)
+                with random_content_file(workdir, int(disk_info.total*0.85) - disk_info.used):
+                    disk_info = psutil.disk_usage(workdir)
+                    logger.info(f"Percent used before major {disk_info.percent}")
+
+                    # Run major in order to try to reproduce 2x space amplification if files aren't released
+                    # incrementally by streamer.
+                    await manager.api.keyspace_compaction(servers[1].ip_addr, ks)
+                    await asyncio.gather(*[cql.run_async(query) for query in write_generator(cf, 100)])
+
+                    disk_info = psutil.disk_usage(workdir)
+                    logger.info(f"Percent used after major {disk_info.percent}")
+
+                    await manager.api.message_injection(servers[1].ip_addr, "tablet_stream_files_end_wait")
+
+                    await decomm_task
--- a/test/vector_search/vector_store_client_test.cc
+++ b/test/vector_search/vector_store_client_test.cc
@@ -1102,7 +1102,7 @@ SEASTAR_TEST_CASE(vector_store_client_https_wrong_hostname) {
            }));
 }

-SEASTAR_TEST_CASE(vector_store_client_https_different_ca_cert_verification_error) {
+SEASTAR_TEST_CASE(vector_store_client_https_wrong_cacert_verification_error) {
    auto broken_cert = co_await seastar::make_tmp_file();
    certificates certs;
    auto server = co_await make_vs_mock_server(co_await make_server_credentials(certs));
@@ -1129,6 +1129,33 @@ SEASTAR_TEST_CASE(vector_store_client_https_different_ca_cert_verification_error
            }));
 }

+SEASTAR_TEST_CASE(vector_store_client_https_wrong_cacert_verification_error_host_is_ip) {
+    auto broken_cert = co_await seastar::make_tmp_file();
+    certificates certs;
+    auto server = co_await make_vs_mock_server(co_await make_server_credentials(certs));
+    auto cfg = make_config();
+    cfg.db_config->vector_store_primary_uri.set(format("https://{}:{}", server->host(), server->port()));
+    cfg.db_config->vector_store_encryption_options.set({{"truststore", broken_cert.get_path().string()}});
+    co_await do_with_cql_env(
+            [&](cql_test_env& env) -> future<> {
+                auto as = abort_source_timeout();
+                auto schema = co_await create_test_table(env, "ks", "idx");
+                auto& vs = env.local_qp().vector_store_client();
+                configure(vs).with_dns({{server->host(), std::vector<std::string>{server->host()}}});
+                vs.start_background_tasks();
+
+                auto keys = co_await vs.ann("ks", "idx", schema, std::vector<float>{0.1, 0.2, 0.3}, 2, rjson::empty_object(), as.reset());
+
+                BOOST_REQUIRE(!keys);
+                BOOST_CHECK(std::holds_alternative<vector_store_client::service_unavailable>(keys.error()));
+            },
+            cfg)
+            .finally(seastar::coroutine::lambda([&] -> future<> {
+                co_await server->stop();
+                co_await remove(broken_cert);
+            }));
+}
+
 SEASTAR_TEST_CASE(vector_store_client_high_availability_unreachable) {
    auto server = co_await make_vs_mock_server();
    auto unreachable = co_await make_unreachable_socket();
--- a/tools/scylla-nodetool.cc
+++ b/tools/scylla-nodetool.cc
@@ -690,6 +690,9 @@ void cluster_repair_operation(scylla_rest_client& client, const bpo::variables_m
                        // will repair also their colocated tables.
                        continue;
                    }
+                    if (tables.empty() && std::string(ex.what()).contains("Can't find a column family")) {
+                        continue;
+                    }
                    log("ERROR: Repair request for keyspace={} table={} failed with {}", keyspace, table, ex);
                    exit_code = EXIT_FAILURE;
                }
--- a/transport/messages/result_message.cc
+++ b/transport/messages/result_message.cc
@@ -67,14 +67,17 @@ void result_message::visitor_base::visit(const result_message::exception& ex) {
    ex.throw_me();
 }

-result_message::prepared::prepared(cql3::statements::prepared_statement::checked_weak_ptr prepared, bool support_lwt_opt)
-        : _prepared(std::move(prepared))
+result_message::prepared::prepared(cql3::prepared_statements_cache::pinned_value_type prepared_entry, bool support_lwt_opt)
+        : _prepared_entry(std::move(prepared_entry)) // NOTE(review): presumably holding the pinned entry keeps the cached statement alive for this message - confirm
        , _metadata(
-            _prepared->bound_names,
-            _prepared->partition_key_bind_indices,
-            support_lwt_opt ? _prepared->statement->is_conditional() : false)
-        , _result_metadata{extract_result_metadata(_prepared->statement)}
+            (*_prepared_entry)->bound_names,
+            (*_prepared_entry)->partition_key_bind_indices,
+            support_lwt_opt ? (*_prepared_entry)->statement->is_conditional() : false) // the conditional flag is only reported when support_lwt_opt is set
+        , _result_metadata{extract_result_metadata((*_prepared_entry)->statement)}
 {
+    for (const auto& w : (*_prepared_entry)->warnings){ // forward every warning stored on the prepared statement to this message
+        add_warning(w);
+    }
 }

 ::shared_ptr<const cql3::metadata> result_message::prepared::extract_result_metadata(::shared_ptr<cql3::cql_statement> statement) {
--- a/transport/messages/result_message.hh
+++ b/transport/messages/result_message.hh
@@ -13,6 +13,7 @@
 #include <concepts>

 #include "cql3/result_set.hh"
+#include "cql3/prepared_statements_cache.hh"
 #include "cql3/statements/prepared_statement.hh"
 #include "cql3/query_options.hh"

@@ -30,14 +31,14 @@ namespace messages {

 class result_message::prepared : public result_message {
 private:
-    cql3::statements::prepared_statement::checked_weak_ptr _prepared;
+    cql3::prepared_statements_cache::pinned_value_type _prepared_entry;
    cql3::prepared_metadata _metadata;
    ::shared_ptr<const cql3::metadata> _result_metadata;
 protected:
-    prepared(cql3::statements::prepared_statement::checked_weak_ptr prepared, bool support_lwt_opt);
+    prepared(cql3::prepared_statements_cache::pinned_value_type prepared_entry, bool support_lwt_opt);
 public:
-    cql3::statements::prepared_statement::checked_weak_ptr& get_prepared() {
-        return _prepared;
+    cql3::statements::prepared_statement::checked_weak_ptr get_prepared() { // now returns by value instead of a reference to a member
+        return (*_prepared_entry)->checked_weak_from_this(); // weak pointer is obtained from the pinned entry on every call
    }

    const cql3::prepared_metadata& metadata() const {
@@ -49,7 +50,7 @@ public:
    }

    cql3::cql_metadata_id_type get_metadata_id() const {
-        return _prepared->get_metadata_id();
+        return (*_prepared_entry)->get_metadata_id();
    }

    class cql;
@@ -166,8 +167,8 @@ std::ostream& operator<<(std::ostream& os, const result_message::set_keyspace& m
 class result_message::prepared::cql : public result_message::prepared {
    bytes _id;
 public:
-    cql(const bytes& id, cql3::statements::prepared_statement::checked_weak_ptr p, bool support_lwt_opt)
-        : result_message::prepared(std::move(p), support_lwt_opt)
+    cql(const bytes& id, cql3::prepared_statements_cache::pinned_value_type prepared_entry, bool support_lwt_opt)
+        : result_message::prepared(std::move(prepared_entry), support_lwt_opt)
        , _id{id}
    { }

--- a/types/types.cc
+++ b/types/types.cc
@@ -715,15 +715,6 @@ void write_collection_value(bytes_ostream& out, atomic_cell_value_view val) {
    }
 }

-void write_fragmented(managed_bytes_mutable_view& out, std::string_view val) {
-    while (val.size() > 0) {
-        size_t current_n = std::min(val.size(), out.current_fragment().size());
-        memcpy(out.current_fragment().data(), val.data(), current_n);
-        val.remove_prefix(current_n);
-        out.remove_prefix(current_n);
-    }
-}
-
 template<std::integral T>
 void write_simple(managed_bytes_mutable_view& out, std::type_identity_t<T> val) {
    val = net::hton(val);
--- a/utils/managed_bytes.hh
+++ b/utils/managed_bytes.hh
@@ -566,6 +566,16 @@ inline managed_bytes::managed_bytes(const managed_bytes& o) {
    }
 }

+inline
+void write_fragmented(managed_bytes_mutable_view& out, std::string_view val) { // copies `val` into `out` fragment by fragment, consuming both views
+    while (val.size() > 0) { // NOTE(review): if `out` is exhausted before `val`, current_n stays 0 and this never terminates - presumably callers guarantee enough room; confirm
+        size_t current_n = std::min(val.size(), out.current_fragment().size()); // never write past the end of the current fragment
+        memcpy(out.current_fragment().data(), val.data(), current_n);
+        val.remove_prefix(current_n);
+        out.remove_prefix(current_n); // drop the bytes just written from the front of `out`
+    }
+}
+
 template<>
 struct appending_hash<managed_bytes_view> {
    template<Hasher Hasher>
--- a/utils/managed_vector.hh
+++ b/utils/managed_vector.hh
@@ -10,6 +10,7 @@

 #include <array>
 #include <type_traits>
+#include <algorithm>

 #include "utils/allocation_strategy.hh"

@@ -27,10 +28,8 @@ private:
        T _data[0];

        external(external&& other) noexcept : _backref(other._backref) {
-            for (unsigned i = 0; i < _backref->size(); i++) {
-                new (_data + i) T(std::move(other._data[i]));
-                other._data[i].~T();
-            }
+            std::uninitialized_move(other._data, other._data + other._backref->_size, _data); // move-construct all _size elements into this object's storage...
+            std::destroy(other._data, other._data + other._backref->_size); // ...then run the destructors of the moved-from originals
            _backref->_data = _data;
        }
        size_t storage_size() const noexcept {
--- a/vector_search/client.cc
+++ b/vector_search/client.cc
@@ -21,6 +21,7 @@
 #include <chrono>
 #include <fmt/format.h>
 #include <netinet/tcp.h>
+#include <seastar/net/inet_address.hh>

 using namespace seastar;
 using namespace std::chrono_literals;
@@ -28,6 +29,10 @@ using namespace std::chrono_literals;
 namespace vector_search {
 namespace {

+bool is_ip_address(const sstring& host) { // true when net::inet_address::parse_numerical() accepts `host`
+    return net::inet_address::parse_numerical(host).has_value();
+}
+
 class client_connection_factory : public http::experimental::connection_factory {
    client::endpoint_type _endpoint;
    shared_ptr<tls::certificate_credentials> _creds;
@@ -55,7 +60,11 @@ private:
    future<connected_socket> connect() {
        auto addr = socket_address(_endpoint.ip, _endpoint.port);
        if (_creds) {
-            auto socket = co_await tls::connect(_creds, addr, tls::tls_options{.server_name = _endpoint.host});
+            tls::tls_options opts;
+            if (!is_ip_address(_endpoint.host)) {
+                opts.server_name = _endpoint.host;
+            }
+            auto socket = co_await tls::connect(_creds, addr, std::move(opts));
            // tls::connect() only performs the TCP handshake — the TLS handshake is deferred until the first I/O operation.
            // Force the TLS handshake to happen here so that the connection timeout applies to it.
            co_await tls::check_session_is_resumed(socket);
@@ -124,7 +133,7 @@ seastar::future<client::request_result> client::request(
            co_return std::unexpected{aborted_error{}};
        }
        if (is_server_problem(err)) {
-            handle_server_unavailable();
+            handle_server_unavailable(err);
        }
        co_return std::unexpected{co_await map_err(err)};
    }
@@ -165,8 +174,9 @@ seastar::future<> client::close() {
    co_await _http_client.close();
 }

-void client::handle_server_unavailable() {
+void client::handle_server_unavailable(std::exception_ptr err) { // err: the failure that triggered this call; used only in the log message
    if (!is_checking_status_in_progress()) {
+        _logger.warn("Request to vector store {} {}:{} failed: {}", _endpoint.host, _endpoint.ip, _endpoint.port, err); // logged only when a new status check is started, not while one is already in progress
        _checking_status_future = run_checking_status();
    }
 }
--- a/vector_search/client.hh
+++ b/vector_search/client.hh
@@ -12,6 +12,7 @@
 #include "utils/log.hh"
 #include "utils/updateable_value.hh"
 #include <chrono>
+#include <exception>
 #include <seastar/core/future.hh>
 #include <seastar/core/sstring.hh>
 #include <seastar/core/abort_source.hh>
@@ -60,7 +61,7 @@ private:
    seastar::future<response> request_impl(seastar::httpd::operation_type method, seastar::sstring path, std::optional<seastar::sstring> content,
            std::optional<seastar::http::reply::status_type>&& expected, seastar::abort_source& as);
    seastar::future<bool> check_status();
-    void handle_server_unavailable();
+    void handle_server_unavailable(std::exception_ptr err);
    seastar::future<> run_checking_status();
    bool is_checking_status_in_progress() const;
    std::chrono::milliseconds backoff_retry_max() const;
--- a/vint-serialization.cc
+++ b/vint-serialization.cc
@@ -18,15 +18,6 @@

 static_assert(-1 == ~0, "Not a twos-complement architecture");

-// Accounts for the case that all bits are zero.
-static vint_size_type count_leading_zero_bits(uint64_t n) noexcept {
-    if (n == 0) {
-        return vint_size_type(std::numeric_limits<uint64_t>::digits);
-    }
-
-    return vint_size_type(count_leading_zeros(n));
-}
-
 static constexpr uint64_t encode_zigzag(int64_t n) noexcept {
    // The right shift has to be arithmetic and not logical.
    return (static_cast<uint64_t>(n) << 1) ^ static_cast<uint64_t>(n >> 63);
@@ -55,16 +46,9 @@ int64_t signed_vint::deserialize(bytes_view v) {
    return decode_zigzag(un);
 }

-vint_size_type signed_vint::serialized_size_from_first_byte(bytes::value_type first_byte) {
-    return unsigned_vint::serialized_size_from_first_byte(first_byte);
-}
-
 // The number of additional bytes that we need to read.
 static vint_size_type count_extra_bytes(int8_t first_byte) {
-    // Sign extension.
-    const int64_t v(first_byte);
-
-    return count_leading_zero_bits(static_cast<uint64_t>(~v)) - vint_size_type(64 - 8);
+    return std::countl_zero(static_cast<uint8_t>(~first_byte)); // leading 1-bits of first_byte, counted as leading 0-bits of its 8-bit complement
 }

 static void encode(uint64_t value, vint_size_type size, bytes::iterator out) {
@@ -139,8 +123,3 @@ uint64_t unsigned_vint::deserialize(bytes_view v) {
 #endif
    return result;
 }
-
-vint_size_type unsigned_vint::serialized_size_from_first_byte(bytes::value_type first_byte) {
-    int8_t first_byte_casted = first_byte;
-    return 1 + (first_byte_casted >= 0 ? 0 : count_extra_bytes(first_byte_casted));
-}
--- a/vint-serialization.hh
+++ b/vint-serialization.hh
@@ -35,6 +35,7 @@
 #include "bytes.hh"

 #include <cstdint>
+#include <bit>

 using vint_size_type = bytes::size_type;

@@ -49,7 +50,9 @@ struct unsigned_vint final {

    static value_type deserialize(bytes_view v);

-    static vint_size_type serialized_size_from_first_byte(bytes::value_type first_byte);
+    static vint_size_type serialized_size_from_first_byte(bytes::value_type first_byte) {
+        return 1 + std::countl_zero(static_cast<uint8_t>(~first_byte)); // 1 for the first byte itself, plus one per leading 1-bit in it
+    }
 };

 struct signed_vint final {
@@ -61,5 +64,7 @@ struct signed_vint final {

    static value_type deserialize(bytes_view v);

-    static vint_size_type serialized_size_from_first_byte(bytes::value_type first_byte);
+    static vint_size_type serialized_size_from_first_byte(bytes::value_type first_byte) {
+        return unsigned_vint::serialized_size_from_first_byte(first_byte); // delegates: the size is computed exactly as for unsigned_vint
+    }
 };