Compare commits

...

85 Commits

Author SHA1 Message Date
Botond Dénes
cb9c65af43 Merge 'auth: sanitize {USER} substitution in LDAP URL template' from Piotr Smaron
`LDAPRoleManager` interpolated usernames directly into `ldap_url_template`,
allowing LDAP filter injection and URL structure manipulation via crafted
usernames.

This PR adds two layers of encoding when substituting `{USER}`:
1. **RFC 4515 filter escaping** — neutralises `*`, `(`, `)`, `\`, NUL
2. **URL percent-encoding** — prevents `%`, `?`, `#` from breaking
   `ldap_url_parse`'s component splitting or undoing the filter escaping

It also adds `validate_query_template()` at startup to reject templates
that place `{USER}` outside the filter component (e.g. in the host or
base DN), where filter escaping would be the wrong defense.
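For illustration, the two encoding layers can be sketched in Python (a minimal model only; the actual fix is C++ code in Scylla's auth module, and all function names here are hypothetical):

```python
# Hypothetical sketch of the two encoding layers; the real fix is C++
# code in Scylla's auth module.

RFC4515_ESCAPES = {'*': r'\2a', '(': r'\28', ')': r'\29',
                   '\\': r'\5c', '\0': r'\00'}

def escape_filter_value(user):
    """Layer 1 (RFC 4515): neutralise LDAP filter metacharacters."""
    return ''.join(RFC4515_ESCAPES.get(ch, ch) for ch in user)

def percent_encode(value):
    """Layer 2: stop '%', '?' and '#' from breaking URL component
    splitting, or a raw '%' from undoing the \\XX filter escapes."""
    return ''.join('%{:02X}'.format(ord(ch)) if ch in '%?#' else ch
                   for ch in value)

def substitute_user(template, user):
    """Apply both layers when substituting {USER}."""
    return template.replace('{USER}',
                            percent_encode(escape_filter_value(user)))
```

With both layers applied, a crafted username such as `a*)(cn=*` stays confined to a single filter attribute value instead of rewriting the filter structure.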

Compatibility note:
Templates with `{USER}` in the host, base DN, attributes, or extensions
were previously silently accepted. They are now rejected at startup with
a descriptive error. Only templates with `{USER}` in the filter component
(after the third `?`) are valid.
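A rough Python model of the startup validation (the real `validate_query_template()` is C++; this sketch assumes the RFC 2255 LDAP URL layout `ldap://host/dn?attrs?scope?filter?extensions`, in which the filter component sits between the third and fourth `?`):

```python
def validate_query_template(template):
    """Reject templates that place {USER} outside the filter component.
    Hypothetical model of the startup check described above."""
    pos = template.find('{USER}')
    if pos == -1:
        return
    # RFC 2255: ldap://host/dn?attrs?scope?filter?extensions --
    # the filter is the segment after the third '?'.
    if template.count('?', 0, pos) != 3:
        raise ValueError('{USER} must appear in the filter component '
                         "(after the third '?') of the LDAP URL template")
```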

Fixes: SCYLLADB-1309

Due to its severity, this should be backported to all maintained versions.

Closes scylladb/scylladb#29388

* github.com:scylladb/scylladb:
  auth: sanitize {USER} substitution in LDAP URL templates
  test/ldap: add LDAP filter-injection reproducers

(cherry picked from commit aecb6b1d76)

Closes scylladb/scylladb#29495
2026-04-17 11:45:50 +02:00
Nikos Dragazis
351ed72f5f test/cqlpy: Harden mutation_fragments tests against background flushes
Several tests in test_select_from_mutation_fragments.py assume that all
mutations end up in a single SSTable. This assumption can be violated
by background memtable flushes triggered by commitlog disk pressure.

Since the Scylla node is taken from a pool, it may carry unflushed data
from prior tests that prevents closed segments from being recycled,
thereby increasing the commitlog disk usage. A main source of such
pressure is keyspace-level flushes from earlier tests in this module,
which rotate commitlog segments without flushing system tables (e.g.,
`system.compaction_history`), leaving closed segments dirty.
Additionally, prior tests in the same module may have left unflushed
data on the shared test table (`test_table` fixture), keeping commitlog
segments dirty on its behalf as well. When commitlog disk usage exceeds
its threshold, the system flushes the test table to reclaim those
segments, potentially splitting a running test's mutations across
multiple SSTables.

This was observed in CI, where test_paging failed because its data was
split across two SSTables, resulting in more mutation fragments than the
hardcoded expected count.

This patch fixes the affected tests in two ways:

1. Where possible, tests are reworked to not assume a single SSTable:
   - test_paging
   - test_slicing_rows
   - test_many_partition_scan

2. Where rework is impractical, major compaction is added after writes
   and before validation to ensure that only one SSTable will exist:
   - test_smoke
   - test_count
   - test_metadata_and_value
   - test_slicing_range_tombstone_changes

Fixes SCYLLADB-1375.

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>

Closes scylladb/scylladb#29389

(cherry picked from commit d38f44208a)

Closes scylladb/scylladb#29502
2026-04-16 21:12:09 +03:00
Botond Dénes
7a080130cf Merge 'Fix object storage lister entries walking loop' from Pavel Emelyanov
Two issues were found in the lister returned by `gs_client_wrapper::make_object_lister()`:
the lister can report EOF too early when a filter is active, and there is a potential vector out-of-bounds access.

Fixes #29058

The code appeared in 2026.1, so it is worth fixing there as well.

Closes scylladb/scylladb#29059

* github.com:scylladb/scylladb:
  sstables: Fix object storage lister not resetting position in batch vector
  sstables: Fix object storage lister skipping entries when filter is active

(cherry picked from commit 97430e2df5)

Closes scylladb/scylladb#29145
2026-04-16 21:09:35 +03:00
Botond Dénes
164364ed3e test/cluster/test_encryption.py: use single-partition reads in read_verify_workload()
Replace the range scan in read_verify_workload() with individual
single-partition queries, using the keys returned by
prepare_write_workload() instead of hard-coding them.

The range scan was previously observed to time out in debug mode after
a hard cluster restart. Single-partition reads are lighter on the
cluster and less likely to time out under load.

The new verification is also stricter: instead of merely checking that
the expected number of rows is returned, it verifies that each written
key is individually readable, catching any data-loss or key-identity
mismatch that the old count-only check would have missed.

This is the second attempt at stabilizing this test, after the recent
854c374ebf. That fix made sure that the
cluster has converged on topology and nodes see each other before running
the verify workload.

Fixes: SCYLLADB-1331

Closes scylladb/scylladb#29313

(cherry picked from commit 9770a4c081)

Closes scylladb/scylladb#29441
2026-04-16 21:09:09 +03:00
Botond Dénes
dd9df62617 test/cluster/test_data_resurrection_in_memtable.py: use explicit CL
The test has expectations w.r.t. which writes make it to which nodes:
* inserts make it to all nodes
* the delete makes it to all but one node (QUORUM)

However, this was not expressed with a CL, and the default CL=ONE allowed
some nodes to miss the writes, violating the test's expectations about
what data is present on which nodes. This resulted in the test being
flaky and failing its data checks.

Use explicit CL for the ingestion to prevent this.

The improvements to the test introduced in
a8dd13731f were of great help in
investigating this: traces are now available and the check happens after
the data was dumped to logs.

Fixes: SCYLLADB-870
Fixes: SCYLLADB-812
Fixes: SCYLLADB-1102

Closes scylladb/scylladb#29128

(cherry picked from commit 89388510a0)

Closes scylladb/scylladb#29141
2026-04-16 18:08:32 +03:00
Calle Wilund
df56f6bdc2 memtable_test::memtable_flush_period: Change sleep to use injection signal instead
Fixes: SCYLLADB-942

Adds an injection signal _from_ table::seal_active_memtable to allow us to
reliably wait for flushing. And does so.

Closes scylladb/scylladb#29070

(cherry picked from commit 0013f22374)

Closes scylladb/scylladb#29116
2026-04-16 16:29:24 +02:00
Asias He
92f8f2c2db test: Stabilize tablet incremental repair error test
Use async tablet repair task flow to avoid a race where client timeout
returns while server-side repair continues after injections are
disabled.

Start repair with await_completion=false, assert it does not complete
within timeout under injection, abort/wait the task, then verify
sstables_repaired_at is unchanged.

Fixes SCYLLADB-1184

Closes scylladb/scylladb#29452

(cherry picked from commit 4137a4229c)

Closes scylladb/scylladb#29500
2026-04-16 10:40:39 +03:00
Tomasz Grabiec
e992d76489 Merge 'table: don't create new split compaction groups if main compaction group is disabled' from Ferenc Szili
Fixes a race condition where tablet split can crash the server during truncation.

`truncate_table_on_all_shards()` disables compaction on all existing compaction groups, then later calls `discard_sstables()` which asserts that compaction is disabled. Between these two points, tablet split can call `set_split_mode()`, which creates new compaction groups via `make_empty_group()` — these start with `compaction_disabled_counter == 0`. When `discard_sstables()` checks its assertion, it finds these new groups and fires `on_internal_error`, aborting the server.

In `storage_group::set_split_mode()`, before creating new compaction groups, check whether the main compaction group has compaction disabled. If it does, bail out early and return `false` (not ready). This is safe because the split will be retried once truncation completes and re-enables compaction.

A new regression test `test_split_emitted_during_truncate` reproduces the
exact interleaving using two error injection points:

- **`database_truncate_wait`** — pauses truncation after compaction is disabled but before `discard_sstables()` runs.
- **`tablet_split_monitor_wait`** (new, in `service/storage_service.cc`) — pauses the split monitor at the start of `process_tablet_split_candidate()`.

The test creates a single-tablet table, triggers both operations, uses the injection points to force the problematic ordering, then verifies that truncation completes successfully and the split finishes afterward.

Fixes: SCYLLADB-1035

This needs to be backported to all currently supported versions.

Closes scylladb/scylladb#29250

* github.com:scylladb/scylladb:
  test: add test_split_emitted_during_truncate
  table: fix race between tablet split and truncate

(cherry picked from commit 7fe4ae16f0)

Closes scylladb/scylladb#29478
2026-04-16 10:39:29 +03:00
Avi Kivity
196db8931e partition_snapshot_row_cursor: fix reversed maybe_refresh() losing latest version entry
In partition_snapshot_row_cursor::maybe_refresh(), the !is_in_latest_version()
path calls lower_bound(_position) on the latest version's rows to find the
cursor's position in that version. When lower_bound returns null (the cursor
is positioned above all entries in the latest version in table order), the code
unconditionally sets _background_continuity = true and allows the subsequent
if(!it) block to erase the latest version's entry from the heap.

This is correct for forward traversal: null means there are no more entries
ahead, so removing the version from the heap is safe.

However, in reversed mode, null from lower_bound means the cursor is above
all entries in table order -- those entries are BELOW the cursor in query
order and will be visited LATER during reversed traversal. Erasing the heap
entry permanently loses them, causing live rows to be skipped.

The fix mirrors what prepare_heap() already does correctly: when lower_bound
returns null in reversed mode, use std::prev(rows.end()) to keep the last
entry in the heap instead of erasing it.
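The shape of the fix can be modeled in Python with a sorted row list (an illustrative abstraction of the C++ cursor, not the actual code; `bisect_left` plays the role of `lower_bound`):

```python
import bisect

def latest_version_entry(rows, pos, reverse):
    """Model of refreshing the latest version's heap entry.
    rows is sorted in table order; pos is the cursor position."""
    i = bisect.bisect_left(rows, pos)
    if i < len(rows):
        return rows[i]        # lower_bound found an entry at/after pos
    if reverse and rows:
        # lower_bound returned "null": every entry is below pos in table
        # order, i.e. still ahead in reversed query order -- keep the
        # last entry (std::prev(rows.end())) instead of erasing the
        # version from the heap.
        return rows[-1]
    return None               # forward mode: nothing ahead, safe to drop
```

The buggy path returned `None` in both modes, which is what permanently dropped the latest version's entries from a reversed scan.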

Add test_reversed_maybe_refresh_keeps_latest_version_entry to mvcc_test,
alongside the existing reversed cursor tests. The test creates a two-version
partition snapshot (v0 with range tombstones, v1 with a live row positioned
below all v0 entries in table order), and
traverses in reverse calling maybe_refresh() at each step -- directly
exercising the buggy code path. The test fails without the fix.

The bug was introduced by 6b7473be53 ("Handle non-evictable snapshots",
2022-11-21), which added null-iterator handling for non-evictable snapshots
(memtable snapshots lack the trailing dummy entry that evictable snapshots
have). prepare_heap() got correct reversed-mode handling at that time, but
maybe_refresh() received only forward-mode logic.

The bug is intermittent because multiple mechanisms cause iterators_valid()
to return false, forcing maybe_refresh() to take the full rebuild path via
prepare_heap() (which handles reversed mode correctly):
  - Mutation cleaner merging versions in the background (changes change_mark)
  - LSA segment compaction during reserve() (invalidates references)
  - B-tree rebalancing on partition insertion (invalidates references)
  - Debug mode's always-true need_preempt() creating many multi-version
    partitions via preempted apply_monotonically()

A dtest reproducer confirmed the same root cause: with 100K overlapping range
tombstones creating a massively multi-version memtable partition (287K preemption
events), the reversed scan's latest_iterator was observed jumping discontinuously
during a version transition -- the latest version's heap entry was erased --
causing the query to walk the entire partition without finding the live row.

Fixes: SCYLLADB-1253

Closes scylladb/scylladb#29368

(cherry picked from commit 21d9f54a9a)

Closes scylladb/scylladb#29480
2026-04-15 18:54:57 +03:00
Nadav Har'El
e436db01e3 Merge 'cql3: fix authorization bypass via BATCH prepared cache poisoning' from Marcin Maliszkiewicz
execute_batch_without_checking_exception_message() inserted entries
into the authorized prepared cache before verifying that
check_access() succeeded. A failed BATCH therefore left behind
cached 'authorized' entries that later let a direct EXECUTE of the
same prepared statement skip the authorization check entirely.

Move the cache insertion after the access check so that entries are
only cached on success. This matches the pattern already used by
do_execute_prepared() for individual EXECUTE requests.
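The ordering fix, reduced to a Python sketch (all names hypothetical; the real change is in `execute_batch_without_checking_exception_message()`):

```python
class Unauthorized(Exception):
    pass

def execute_batch(user, stmt, cache, check_access, run):
    """Insert into the authorized-prepared cache only after the access
    check succeeds; a failed BATCH must leave no 'authorized' entry
    behind for a later direct EXECUTE to reuse."""
    check_access(user, stmt)       # raises Unauthorized on failure
    cache.add((user, stmt))        # moved to after the check (the fix)
    return run(stmt)
```

With the buggy ordering (cache insertion before the check), the failure path would still have populated the cache, which is exactly the poisoning described above.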

Introduced in 98f5e49ea8

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1221

Backport: all supported versions

Closes scylladb/scylladb#29432

* github.com:scylladb/scylladb:
  test/cqlpy: add reproducer for BATCH prepared auth cache bypass
  cql3: fix authorization bypass via BATCH prepared cache poisoning

(cherry picked from commit 986167a416)

Closes scylladb/scylladb#29479
2026-04-15 17:14:20 +02:00
Jenkins Promoter
9020288c79 Update pgo profiles - aarch64 2026-04-15 05:12:16 +03:00
Jenkins Promoter
7467dcd30f Update pgo profiles - x86_64 2026-04-15 04:22:12 +03:00
Botond Dénes
f9be6f4a83 Merge 'transport: improve memory accounting for big responses and slow network' from Marcin Maliszkiewicz
After obtaining the CQL response, check if its actual size exceeds the initially acquired memory permit. If so, acquire additional semaphore units and adopt them into the permit, ensuring accurate memory accounting for large responses.

Additionally, move the permit into a .then() continuation so that the semaphore units are kept alive until write_message finishes, preventing premature release of memory permit. This is especially important with slow networks and big responses when buffers can accumulate and deplete a node's memory.

Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1306
Related https://scylladb.atlassian.net/browse/SCYLLADB-740

Backport: all supported versions

Closes scylladb/scylladb#29288

* github.com:scylladb/scylladb:
  transport: add per-service-level pending response memory metric
  transport: hold memory permit until response write completes
  transport: account for response size exceeding initial memory estimate

(cherry picked from commit 86417d49de)

Closes scylladb/scylladb#29410
2026-04-12 14:40:27 +03:00
Michał Hudobski
d46ff9b405 vector_search: forward non-primary key restrictions to Vector Store service
Include non-primary key restrictions (e.g. regular column filters) in
the filter JSON sent to the Vector Store service. Previously only
partition key and clustering column restrictions were forwarded, so
filtering on regular columns was silently ignored.

Add get_nonprimary_key_restrictions() getter to statement_restrictions.

Add unit tests for non-primary key equality, range, and bind marker
restrictions in filter_test.

Fixes: SCYLLADB-970

Closes scylladb/scylladb#29019

(cherry picked from commit 7d648961ed)

Closes scylladb/scylladb#29437
2026-04-12 14:39:48 +03:00
Anna Stuchlik
5ca0bc2019 doc: add the 2026.x patch release upgrade guide-from-2025
This commit adds the upgrade guide for all patch releases within the 2026.x major release.

In addition, it fixes the link to Upgrade Policy in the 2025.x-to-2026.1 upgrade guide.

Fixes SCYLLADB-1247

Closes scylladb/scylladb#29307

(cherry picked from commit 176f6fb59e)

Closes scylladb/scylladb#29382
2026-04-12 14:35:07 +03:00
Michał Jadwiszczak
e5d82bf857 test: fix flaky test_create_index_synchronous_updates trace event race
The test_create_index_synchronous_updates test in test_secondary_index_properties.py
was intermittently failing with 'assert found_wanted_trace' because the expected
trace event 'Forcing ... view update to be synchronous' was missing from the
trace events returned by get_query_trace().

Root cause: trace events are written asynchronously to system_traces.events.
The Python driver's populate() method considers a trace complete once the
session row in system_traces.sessions has duration IS NOT NULL, then reads
events exactly once. Since the session row and event rows are written as
separate mutations with no transactional guarantee, the driver can read an
incomplete set of events.

Evidence from the failed CI run logs:
- The entire test (CREATE TABLE through DROP TABLE) completed in ~300ms
  (01:38:54,859 - 01:38:55,157)
- The INSERT with tracing happened in a ~50ms window between the second
  CREATE INDEX completing (01:38:55,108) and DROP TABLE starting
  (01:38:55,157)
- The 'Forcing ... synchronous' trace message is generated during the
  INSERT write path (db/view/view.cc:2061), so it was produced, but
  not yet flushed to system_traces.events when the driver read them
- This matches the known limitation documented in test/alternator/
  test_tracing.py: 'we have no way to know whether the tracing events
  returned is the entire trace'

Fix: replace the single-shot trace.events read with a retry loop that
directly queries system_traces.events until the expected event appears
(with a 30s timeout). Use ConsistencyLevel.ONE since system_traces has
RF=2 and cqlpy tests run on a single-node cluster.

The same race condition pattern exists in test_mv_synchronous_updates in
test_materialized_view.py (which this test was modeled after), so the
same fix is proactively applied there as well.
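The retry pattern the fix uses looks roughly like this (a generic sketch; the actual test queries `system_traces.events` through the CQL session):

```python
import time

def wait_for_trace_event(fetch_events, wanted, timeout=30.0, period=0.1):
    """Poll until a trace event containing `wanted` appears, instead of
    reading the (possibly incomplete) event set once.  fetch_events is
    assumed to re-query system_traces.events on every call."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if any(wanted in event for event in fetch_events()):
            return True
        time.sleep(period)
    return False
```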

Fixes SCYLLADB-1314

Closes scylladb/scylladb#29374

(cherry picked from commit 568f20396a)

Closes scylladb/scylladb#29395
2026-04-12 14:32:31 +03:00
Marcin Maliszkiewicz
fac9795325 Merge 'ldap: fix double-free of LDAPMessage in poll_results()' from Andrzej Jackowski
In the unregistered-ID branch, ldap_msgfree() was called on a result
already owned by an RAII ldap_msg_ptr, causing a double-free on scope
exit. Remove the redundant manual free.

Fixes: SCYLLADB-1344

Backport: 2026.1, 2025.4, 2025.1 - it's a memory corruption, with a one-line fix, so better backport it everywhere.

Closes scylladb/scylladb#29302

* github.com:scylladb/scylladb:
  test: ldap: add regression test for double-free on unregistered message ID
  ldap: fix double-free of LDAPMessage in poll_results()

(cherry picked from commit 895fdb6d29)

Closes scylladb/scylladb#29393
2026-04-12 14:31:00 +03:00
Pavel Emelyanov
3a1d7d2b09 table: Add formatter for group_id argument in tablet merge exception message
Fixes: SCYLLADB-1432

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29143

(cherry picked from commit 78f5bab7cf)

Closes scylladb/scylladb#29412
2026-04-12 14:28:33 +03:00
Botond Dénes
fb81acb7aa Merge 'cql3: fix null handling in data_value formatting' from Dario Mirovic
`data_value::to_parsable_string()` crashes with a null pointer dereference when called on a `null` data_value. Return `"null"` instead.

Added tests after the fix. Manually checked that tests fail without the fix.

Fixes SCYLLADB-1350

This fix prevents a formatting crash. There is no known occurrence in production, but a backport is desirable.

Closes scylladb/scylladb#29262

* github.com:scylladb/scylladb:
  test: boost: test null data value to_parsable_string
  cql3: fix null handling in data_value formatting

(cherry picked from commit 816f2bf163)

Closes scylladb/scylladb#29384
2026-04-10 13:09:02 +02:00
Anna Stuchlik
56bf4c8f0e doc: Fix malformed markdown link in alternator network docs
Fixes https://github.com/scylladb/scylladb/issues/29400

Closes scylladb/scylladb#29402

(cherry picked from commit c6587c6a70)

Closes scylladb/scylladb#29420
2026-04-10 10:19:51 +03:00
Piotr Dulikowski
bf1f5ee796 Merge 'db/view/view_building_worker: lock staging sstables mutex for all necessary shards when creating tasks' from Michał Jadwiszczak
To create `process_staging` view building tasks, we first need to collect information about them on shard 0, create the necessary mutations, commit them to group0, and move the staging sstable objects to their original shards.

But there is a possible race after committing the group0 command and before moving the staging sstables to their shards. Between those two events, the coordinator may schedule the freshly created tasks and dispatch them to the worker, but the worker won't have the sstable objects because they haven't been moved yet.

This patch fixes the race by holding the `_staging_sstables_mutex` locks of all necessary shards while executing `create_staging_sstable_tasks()`. With this, even if a task is scheduled and dispatched quickly, the worker will wait to execute it until the sstable objects are moved and the locks are released.

Fixes SCYLLADB-816

This PR should be backported to all versions containing view building coordinator (2025.4 and newer).

Closes scylladb/scylladb#29174

* github.com:scylladb/scylladb:
  db/view/view_building_worker: fix indentation
  db/view/view_building_worker: lock staging sstables mutex for necessary shards when creating tasks

(cherry picked from commit ec0231c36c)

Closes scylladb/scylladb#29394
2026-04-09 15:11:17 +02:00
Michał Chojnowski
da53b8798f test: add a missing reconnect_driver in test_sstable_compression_dictionaries_upgrade.py
Need to work around https://github.com/scylladb/python-driver/issues/295,
lest a CQL query fail spuriously after the cluster restart.

Fixes: SCYLLADB-1114

Closes scylladb/scylladb#29118

(cherry picked from commit 6b18d95dec)

Closes scylladb/scylladb#29146
2026-04-06 22:07:57 +03:00
Botond Dénes
3d167dd36e Merge 'Alternator: add per-table batch latency metrics and test coverage' from Amnon Heiman
This series fixes a metrics visibility gap in Alternator and adds regression coverage.

Until now, BatchGetItem and BatchWriteItem updated global latency histograms but did not consistently update per-table latency histograms. As a result, table-level latency dashboards could miss batch traffic.

It updates the batch read/write paths to compute request duration once and record it in both global and per-table latency metrics.

Add the missing tests, including a metric-agnostic helper and a dedicated per-table latency test that verifies latency counters increase for item and batch operations.

This change is metrics-only (no API/behavior change for requests) and improves observability consistency between global and per-table views.

Fixes #28721

**We assume the Alternator per-table metrics exist, but the batch ones were not being updated**

Closes scylladb/scylladb#28732

* github.com:scylladb/scylladb:
  test(alternator): add per-table latency coverage for item and batch ops
  alternator: track per-table latency for batch get/write operations

(cherry picked from commit 035aa90d4b)

Closes scylladb/scylladb#29067
2026-04-06 22:03:00 +03:00
Pavel Emelyanov
c42799fb01 s3: Don't rearm credential timers when credentials are not refreshed
update_credentials_and_rearm() may get "empty" credentials from
_creds_provider_chain.get_aws_credentials() -- it doesn't throw, but
returns a default-initialized value. In that case expires_at will be
set to time_point::min, and it's probably not a good idea to arm the
refresh timer with that, and an even worse idea to subtract 1h from it.

Fixes #29056

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29057

(cherry picked from commit 961fc9e041)

Closes scylladb/scylladb#29158
2026-04-06 19:28:57 +03:00
Botond Dénes
c93c037d39 Merge 'service: tasks: return successful status if a table was dropped' from Aleksandra Martyniuk
tablet_virtual_task::wait throws if a table on which a tablet operation
was working is dropped.

Treat the tablet operation as successful if a table is dropped.

Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-494

Needs backport to all live releases

Closes scylladb/scylladb#28933

* github.com:scylladb/scylladb:
  test: add test_tablet_repair_wait_with_table_drop
  service: tasks: return successful status if a table was dropped

(cherry picked from commit 1e41db5948)

Closes scylladb/scylladb#28965
2026-04-06 14:25:13 +03:00
Piotr Dulikowski
3107d9083e Merge 'vector_search: fix race condition on connection timeout' from Karol Nowacki
When a `with_connect` operation timed out, the underlying connection
attempt continued to run in the reactor. This could lead to a crash
if the connection was established/rejected after the client object had
already been destroyed. This issue was observed during the teardown
phase of an upcoming high-availability test case.

This commit fixes the race condition by ensuring the connection attempt
is properly canceled on timeout.

Additionally, the explicit TLS handshake previously forced during the
connection is now deferred to the first I/O operation, which is the
default and preferred behavior.

Fixes: SCYLLADB-832

Backports to 2026.1 and 2025.4 are required, as this issue also exists on those branches and is causing CI flakiness.

Closes scylladb/scylladb#29031

* github.com:scylladb/scylladb:
  vector_search: test: fix flaky test
  vector_search: fix race condition on connection timeout

(cherry picked from commit cc695bc3f7)

Closes scylladb/scylladb#29157
2026-04-06 14:24:39 +03:00
Pavel Emelyanov
04e5fa6c3e object_store: Don't leave dangling objects by iterating moved-from names vector
The code in upload_file std::move()-s the vector of names into the
merge_objects() method, then iterates over this vector to delete the
objects. The iteration is a no-op on the moved-from vector.

The fix is to make the merge_objects() helper take the vector of names
by const reference -- the method doesn't modify the names collection,
and the caller keeps it in stable storage.

Fixes #29060

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29061

(cherry picked from commit c4a0f6f2e6)

Closes scylladb/scylladb#29159
2026-04-06 14:23:26 +03:00
Pavel Emelyanov
70b9ae04ff Merge 'encryption: fix deadlock in encrypted_data_source::get()' from Ernest Zaslavsky
When encrypted_data_source::get() caches a trailing block in _next, the next call takes it directly — bypassing input_stream::read(), which checks _eof. It then calls input_stream::read_exactly() on the already-drained stream. Unlike read(), read_up_to(), and consume(), read_exactly() does not check _eof when the buffer is empty, so it calls _fd.get() on a source that already returned EOS.

In production this manifested as stuck encrypted SSTable component downloads during tablet restore: the underlying chunked_download_source hung forever on the post-EOS get(), causing 4 tablets to never complete. The stuck files were always block-aligned sizes (8k, 12k) where _next gets populated and the source is fully consumed in the same call.

Fix by checking _input.eof() before calling read_exactly(). When the stream already reached EOF, buf2 is known to be empty, so the call is skipped entirely.

A comprehensive test is added that uses a strict_memory_source which fails on post-EOS get(), reproducing the exact code path that caused the production deadlock.
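The essence of the guard, in a toy Python model (illustrative only; the real code is Scylla's C++ `encrypted_data_source` over Seastar's `input_stream`):

```python
class ToyStream:
    """Models an input stream whose read_exactly(), like the one
    described above, does not check EOF before pulling from the source."""
    def __init__(self, chunks):
        self._chunks = list(chunks)
        self.eof = False

    def read(self):
        if not self._chunks:
            self.eof = True          # read() notices end-of-stream
            return b''
        return self._chunks.pop(0)

    def read_exactly(self, n):
        if not self._chunks:
            # Models calling get() on a source that already returned EOS;
            # in the real bug this call hung forever.
            raise RuntimeError('get() after EOS')
        return self._chunks.pop(0)[:n]

def read_trailing_block(stream, n):
    # The fix: once the stream has hit EOF, the trailing buffer is known
    # to be empty, so skip read_exactly() entirely.
    if stream.eof:
        return b''
    return stream.read_exactly(n)
```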

Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1128

Backport to 2025.3/4 and 2026.1 is needed since it fixes a bug that may bite us in production, to be on the safe side

Closes scylladb/scylladb#29110

* github.com:scylladb/scylladb:
  encryption: fix deadlock in encrypted_data_source::get()
  test_lib: mark `limiting_data_source_impl` as not `final`
  Fix formatting after previous patch
  Fix indentation after previous patch
  test_lib: make limiting_data_source_impl available to tests

(cherry picked from commit 3b9398dfc8)

Closes scylladb/scylladb#29198
2026-04-06 14:23:05 +03:00
Pavel Emelyanov
eaae2bf0af raft: Fix split mutations freeze
Commit faa0ee9844 accidentally broke the way the split snapshot mutation
was frozen -- instead of appending the sub-mutation `m`, the commit kept
the old variable name `mut`, which in the new code refers to the old,
non-split mutation.

Fixes #29051

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29052

(cherry picked from commit f112e42ddd)

Closes scylladb/scylladb#29214
2026-04-06 14:22:44 +03:00
Botond Dénes
abfa4d0272 Merge 'test: cluster: Deflake test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces' from Dawid Mędrek
The test was flaky. The scenario looked like this:

1. Stop server 1.
2. Set its rf_rack_valid_keyspaces configuration option to true.
3. Create an RF-rack-invalid keyspace.
4. Start server 1 and expect a failure during start-up.

This was wrong. We cannot predict when the Raft mutation corresponding
to the newly created keyspace will arrive at the node or when it will be
processed. If the RF-rack-valid keyspace check we perform at start-up
runs before that happens, it won't include the keyspace, leading to a
test failure.

Unfortunately, it's not feasible to perform a read barrier during
start-up. What's more, although it would help the test, it wouldn't be
useful otherwise. Because of that, we simply fix the test, at least for
now.

The new scenario looks like this:

1. Disable the rf_rack_valid_keyspaces configuration option on server 1.
2. Start the server.
3. Create an RF-rack-invalid keyspace.
4. Perform a read barrier on server 1. This will ensure that it has
   observed all Raft mutations, and we won't run into the same problem.
5. Stop the node.
6. Set its rf_rack_valid_keyspaces configuration option to true.
7. Try to start the node and observe a failure.

This will make the test perform consistently.

---

I ran the test (in dev mode, on my local machine) three times before
these changes, and three times with them. I include the time results
below.

Before:
```
real    0m47.570s
user    0m41.631s
sys     0m8.634s

real    0m50.495s
user    0m42.499s
sys     0m8.607s

real    0m50.375s
user    0m41.832s
sys     0m8.789s
```

After:
```
real    0m50.509s
user    0m43.535s
sys     0m9.715s

real    0m50.857s
user    0m44.185s
sys     0m9.811s

real    0m50.873s
user    0m44.289s
sys     0m9.737s
```

Fixes SCYLLADB-1137

Backport: The test is present on all supported branches, and so we
          should backport these changes to them.

Closes scylladb/scylladb#29218

* github.com:scylladb/scylladb:
  test: cluster: Deflake test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces
  test: cluster: Mark test with @pytest.mark.asyncio in test_multidc.py

(cherry picked from commit d52fbf7ada)

Closes scylladb/scylladb#29247
2026-04-06 14:22:08 +03:00
Avi Kivity
8bdc97924e Merge 'test: fix race condition in test_crashed_node_substitution' from Sergey Zolotukhin
`test_crashed_node_substitution` intermittently failed:
```python
   assert len(gossiper_eps) == (len(server_eps) + 1)
```
The test crashed the node right after a single ACK2 handshake (`finished do_send_ack2_msg`), assuming the node state was visible to all peers. However, since gossip is eventually consistent, the update may not have propagated yet, so some nodes did not see the failed node.

This change waits until the gossiper state is visible on peers before continuing the test and asserting.

Fixes: [SCYLLADB-921](https://scylladb.atlassian.net/browse/SCYLLADB-921).

backport: this issue may affect CI for all branches, so should be backported to all versions.

Closes scylladb/scylladb#29254

* github.com:scylladb/scylladb:
  test: test_crashed_node_substitution: add docstring and fix whitespace
  test: fix race condition in test_crashed_node_substitution

(cherry picked from commit b708e5d7c9)

Closes scylladb/scylladb#29258
2026-04-06 14:21:46 +03:00
Botond Dénes
253fa9519f test/encryption: wait for topology convergence after abrupt restart
test_reboot uses a custom restart function that SIGKILLs and restarts
nodes sequentially. After all nodes are back up, the test proceeded
directly to reads after wait_for_cql_and_get_hosts(), which only
confirms CQL reachability.

While a node is restarted, other nodes might execute global token
metadata barriers, which advance the topology fence version. The
restarted node has to learn about the new version before it can send
reads/writes to the other nodes. The test issues reads as soon as the
CQL port is opened, which might happen before the last restarted node
learns of the latest topology version. If this node acts as a
coordinator for reads/write before this happens, these will fail as the
other nodes will reject the ops with the outdated topology fence
version.

Fix this by replacing wait_for_cql_and_get_hosts() on the abrupt-restart
path with the more robust get_ready_cql(), which makes sure servers see
each other before refreshing the CQL connection. This should ensure that
nodes have exchanged gossip and converged on topology state before any
reads are executed. The rolling_restart() path is unaffected as it
handles this internally.

Fixes: SCYLLADB-557

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Closes scylladb/scylladb#29211

(cherry picked from commit 854c374ebf)

Closes scylladb/scylladb#29260
2026-04-06 14:21:26 +03:00
Emil Maskovsky
666d0440f1 raft: abort stale snapshot transfers when term changes
**The Bug**

Assertion failure: `SCYLLA_ASSERT(res.second)` in `raft/server.cc`
when creating a snapshot transfer for a destination that already had a
stale in-flight transfer.

**Root Cause**

If a node loses leadership and later becomes leader again before the next
`io_fiber` iteration, the old transfer from the previous term can remain
in `_snapshot_transfers` while `become_leader()` resets progress state.
When the new term emits `install_snapshot(dst)`, `send_snapshot(dst)`
tries to create a new entry for the same destination and can hit the
assertion.

**The Fix**

Abort all in-flight snapshot transfers in `process_fsm_output()` when
`term_and_vote` is persisted. A term/vote change marks existing transfers
as stale, so we clean them up before dispatching messages from that batch
and before any new snapshot transfer is started.

With cross-term cleanup moved to the term-change path, `send_snapshot()`
now asserts the within-term invariant that there is at most one in-flight
transfer per destination.
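
The bookkeeping invariant can be sketched in Python (hypothetical names, not the actual raft/server.cc code):

```python
class SnapshotTransferTracker:
    """Sketch of the fix: transfers from an old term are aborted on a term
    change, so within a single term there is at most one in-flight
    transfer per destination."""

    def __init__(self):
        self._term = 0
        self._transfers = {}  # destination id -> term the transfer started in

    def on_term_change(self, new_term):
        # A term/vote change marks every in-flight transfer as stale:
        # drop them all before dispatching any new snapshot transfer.
        if new_term != self._term:
            self._transfers.clear()
            self._term = new_term

    def send_snapshot(self, dst):
        # Within-term invariant: at most one in-flight transfer per destination.
        assert dst not in self._transfers, "duplicate in-flight transfer"
        self._transfers[dst] = self._term
```

Before the fix, the lose-and-regain-leadership sequence corresponds to calling `send_snapshot` twice for the same destination without the intervening cleanup, which trips the assertion.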

Fixes: SCYLLADB-862

Backport: The issue is reproducible in master, but is present in all
active branches.

Closes scylladb/scylladb#29092

(cherry picked from commit 9dad68e58d)

Closes scylladb/scylladb#29264
2026-04-06 14:21:05 +03:00
Botond Dénes
70b7652e64 test/cluster: fix flaky test_cleanup_stop by using asyncio.sleep
The test was using time.sleep(1) (a blocking call) to wait after
scheduling the stop_compaction task, intending to let it register on
the server before releasing the sstable_cleanup_wait injection point.

However, time.sleep() blocks the asyncio event loop entirely, so the
asyncio.create_task(stop_compaction) task never gets to run during the
sleep. After the sleep, the directly-awaited message_injection() runs
first, releasing the injection point before stop_compaction is even
sent. By the time stop_compaction reaches Scylla, the cleanup has
already completed successfully -- no exception is raised and the test
fails.

Fix by replacing time.sleep(1) with await asyncio.sleep(1), which
yields control to the event loop and allows the stop_compaction task
to actually send its HTTP request before message_injection is called.
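
The difference between the two sleeps can be reproduced in a few lines (illustrative only, not the test's actual code):

```python
import asyncio
import time

async def demo(use_async_sleep: bool) -> bool:
    # Minimal reproduction of the bug described above.
    ran = asyncio.Event()

    async def background_task():
        ran.set()  # stands in for stop_compaction sending its HTTP request

    task = asyncio.create_task(background_task())
    if use_async_sleep:
        await asyncio.sleep(0.05)  # yields to the event loop: the task runs
    else:
        time.sleep(0.05)           # blocks the event loop: the task is starved
    return ran.is_set()
```

With `time.sleep()` the scheduled task never runs before the function returns, exactly as in the flaky test; `await asyncio.sleep()` lets it run first.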

Fixes: SCYLLADB-834

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Closes scylladb/scylladb#29202

(cherry picked from commit 068a7894aa)

Closes scylladb/scylladb#29277
2026-04-06 14:20:46 +03:00
Andrzej Jackowski
27604deebb test: use exclusive driver connection in test_limited_concurrency_of_writes
Use get_cql_exclusive(node1) so the driver only connects to node1 and
never attempts to contact the stopped node2. The test was flaky because
the driver received `Host has been marked down or removed` from node2.

Fixes: SCYLLADB-1227

Closes scylladb/scylladb#29268

(cherry picked from commit ab43420d30)

Closes scylladb/scylladb#29278
2026-04-06 14:20:25 +03:00
Tomasz Grabiec
cd7baebc8b tests: address_map_test: Fix flakiness in debug mode due to task reordering
Debug mode shuffles task position in the queue. So the following is possible:
 1) shard 1 calls manual_clock::advance(). This expires timers on shard 1 and queues a background smp call to shard 0 which will expire timers there
 2) the smp::submit_to(0, ...) from shard 1 called by the test submits the call
 3) shard 0 creates tasks for both calls, but (2) is run first, and preempts the reactor
 4) shard 1 sees the completion, completes m_svc.invoke_on(1, ..)
 5) shard 0 inserts the completion from (4) before task from (1)
 6) the check on shard 0: m.find(id1) fails because the timer is not expired yet

To fix that, wait for timer expiration on shard 0, so that the test
doesn't depend on task execution order.

Note: I was not able to reproduce the problem locally using test.py --mode
debug --repeat 1000.

It happens in Jenkins very rarely, which is expected, as the scenario
leading to it is quite unlikely.

Fixes SCYLLADB-1265

Closes scylladb/scylladb#29290

(cherry picked from commit 2ec47a8a21)

Closes scylladb/scylladb#29309
2026-04-06 14:18:39 +03:00
Andrzej Jackowski
c5f57815a5 test: protect populate_range in row_cache_test from bad_alloc
When test_exception_safety_of_update_from_memtable was converted from
manual fail_after()/catch to with_allocation_failures() in 74db08165d,
the populate_range() call ended up inside the failure injection scope
without a scoped_critical_alloc_section guard. The other two tests
converted in the same commit (test_exception_safety_of_transitioning...
and test_exception_safety_of_partition_scan) were correctly guarded.

Without the guard, the allocation failure injector can sometimes
target an allocation point inside the cleanup path of populate_range().
In a rare corner case, this triggers a bad_alloc in a noexcept context
(reader_concurrency_semaphore::stop()), causing std::terminate.

Fixes SCYLLADB-1346

Closes scylladb/scylladb#29321

(cherry picked from commit 8c0920202b)

Closes scylladb/scylladb#29331
2026-04-06 14:17:12 +03:00
Nikos Dragazis
5eabf35824 scylla_swap_setup: Remove Before=swap.target dependency from swap unit
When a Scylla node starts, the scylla-image-setup.service invokes the
`scylla_swap_setup` script to provision swap. This script allocates a
swap file and creates a swap systemd unit to delegate control to
systemd. By default, systemd injects a Before=swap.target dependency
into every swap unit, allowing other services to use swap.target to wait
for swap to be enabled.

On Azure, this doesn't work so well because we store the swap file on
the ephemeral disk [1] which has network dependencies (`_netdev` mount
option, configured by cloud-init [2]). This makes the swap.target
indirectly depend on the network, leading to dependency cycles such as:

swap.target -> mnt-swapfile.swap -> mnt.mount -> network-online.target
-> network.target -> systemd-resolved.service -> tmp.mount -> swap.target

This patch breaks the cycle by removing the swap unit from swap.target
using DefaultDependencies=no. The swap unit will still be activated via
WantedBy=multi-user.target, just not during early boot.
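
The resulting swap unit then looks roughly like this (illustrative fragment, not the exact unit the script generates):

```ini
# /etc/systemd/system/mnt-swapfile.swap (illustrative)
[Unit]
# Opt out of the implicit Before=swap.target dependency that systemd
# injects into swap units by default, breaking the boot-time cycle.
DefaultDependencies=no

[Swap]
What=/mnt/swapfile

[Install]
# Still activated during normal boot, just not via early-boot swap.target.
WantedBy=multi-user.target
```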

Although this problem is specific to Azure, this patch applies the fix
to all clouds to keep the code simple.

Fixes #26519.
Fixes SCYLLADB-1257

[1] https://github.com/scylladb/scylla-machine-image/pull/426
[2] https://github.com/canonical/cloud-init/pull/1213#issuecomment-1026065501

Signed-off-by: Nikos Dragazis <nikolaos.dragazis@scylladb.com>

Closes scylladb/scylladb#28504

(cherry picked from commit 6d50e67bd2)

Closes scylladb/scylladb#29339
2026-04-06 14:16:22 +03:00
Avi Kivity
95e422db48 Merge 'service_levels: mark v2 migration complete on empty legacy table' from Alex Dathskovsky
During raft-topology upgrade in 2026.1, service_level_controller::migrate_to_v2() returns early when system_distributed.service_levels is empty. This skips the service_level_version = 2 write, so the cluster is never marked as upgraded to service levels v2 even though there is no data to migrate. Subsequent upgrades may then fail the startup check which requires service_level_version == 2.
Remove the early return and let the migration commit the version marker even when there are no legacy service levels rows to copy.
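
The shape of the corrected flow, as a hedged Python sketch (hypothetical names, not the C++ service_level_controller code):

```python
def migrate_to_v2(legacy_rows, commit_row, commit_version_marker):
    """Sketch: even when the legacy table is empty, the version marker
    must still be written so the cluster is recorded as upgraded to
    service levels v2."""
    for row in legacy_rows:       # no early return on an empty table
        commit_row(row)
    commit_version_marker(2)      # always mark the migration complete
```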

Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1198

backport: should be backported to all versions that can be upgraded to 2026.2

Closes scylladb/scylladb#29333

* github.com:scylladb/scylladb:
  test/auth_cluster: cover empty legacy table in service level upgrade
  service_levels: mark v2 migration complete on empty legacy table
2026-04-06 14:07:48 +03:00
Jenkins Promoter
b033bbc560 Update pgo profiles - aarch64 2026-04-05 18:38:39 +03:00
Jenkins Promoter
faf8ad69f0 Update ScyllaDB version to: 2026.1.2 2026-04-05 17:57:49 +03:00
Jenkins Promoter
dc7829a9b5 Update pgo profiles - x86_64 2026-04-05 17:43:53 +03:00
Botond Dénes
f2111c011f Merge 'Demote log level on split failure during shutdown' from Raphael Raph Carvalho
Since commit 509f2af8db, gate_closed_exception can be triggered for ongoing split during shutdown. The commit is correct, but it causes split failure on shutdown to log an error, which causes CI instability. Previously, aborted_exception would be triggered instead which is logged as warning. Let's do the same.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-951.
Fixes https://github.com/scylladb/scylladb/issues/24850.

Only 2026.1 is affected.

Closes scylladb/scylladb#29032

* github.com:scylladb/scylladb:
  replica: Demote log level on split failure during shutdown
  service: Demote log level on split failure during shutdown

(cherry picked from commit ae17596c2a)

Closes scylladb/scylladb#29115
2026-04-05 14:30:55 +03:00
Piotr Dulikowski
d2b12329ab Merge 'Return HTTP error description in Vector Store client' from Szymon Wasik
The `service_error` struct: 6dc2c42f8b/service/vector_store_client.hh (L64)
currently stores just the error status code, so whenever an HTTP error occurs, only that code can be forwarded to the client. For example see here: 6dc2c42f8b/service/vector_store_client.cc (L580)
As a result, the full error description is missing from the drivers' output, which forces the user to look into the Scylla server logs.

The objective of this PR is to extend the support for HTTP errors in Vector Store client to handle messages as well.

Moreover, it removes the quadratic reallocation in response_content_to_sstring() helper function that is used for getting the response in case of error.
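
The reallocation issue is language-agnostic; a minimal Python illustration of the same pattern (not the actual C++ helper):

```python
def collect_quadratic(chunks):
    # Anti-pattern: each `+=` may reallocate and copy the whole buffer,
    # so total work can grow quadratically with the response size.
    out = ""
    for chunk in chunks:
        out += chunk
    return out

def collect_linear(chunks):
    # Gather the pieces and join once at the end: O(total length).
    return "".join(chunks)
```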

Fixes: VECTOR-189

Closes scylladb/scylladb#26139

* github.com:scylladb/scylladb:
  vector_search: Avoid quadratic reallocation in response_content_to_sstring
  vector_store_client: Return HTTP error description, not just code

(cherry picked from commit 38a2829f69)

Closes scylladb/scylladb#29312
2026-04-03 17:53:40 +02:00
Botond Dénes
b638170a4e sstables: use chunked_managed_vector for promoted indexes in partition_index_page
Switch _promoted_indexes storage in partition_index_page from
managed_vector to chunked_managed_vector to avoid large contiguous
allocations.

Avoid allocation failure (or crashes with --abort-on-internal-error)
when large partitions have enough promoted index entries to trigger a
large allocation with managed_vector.

Fixes: SCYLLADB-1315

Closes scylladb/scylladb#29283

(cherry picked from commit 2d2ff4fbda)

Closes scylladb/scylladb#29304
2026-04-01 16:47:46 +03:00
Patryk Jędrzejczak
d5c7f29734 raft_group0: join_group0: fix join hang when node joins group 0 before post_server_start
A joining node hung forever if the topology coordinator added it to the
group 0 configuration before the node reached `post_server_start`. In
that case, `server->get_configuration().contains(my_id)` returned true
and the node broke out of the join loop early, skipping
`post_server_start`. `_join_node_group0_started` was therefore never set,
so the node's `join_node_response` RPC handler blocked indefinitely.
Meanwhile the topology coordinator's `respond_to_joining_node` call
(which has no timeout) hung forever waiting for the reply that never came.

Fix by only taking the early-break path when not starting as a follower
(i.e. when the node is the discovery leader or is restarting). A joining
node must always reach `post_server_start`.

We also provide a regression test. It takes 6s in dev mode.

Fixes SCYLLADB-959

Closes scylladb/scylladb#29266

(cherry picked from commit b9f82f6f23)

Closes scylladb/scylladb#29291
2026-04-01 09:58:20 +02:00
Jenkins Promoter
a5dd529475 Update pgo profiles - x86_64 2026-04-01 04:22:01 +03:00
Patryk Jędrzejczak
b176591488 locator: everywhere_replication_strategy: fix sanity_check_read_replicas when read_new is true
ERMs created in `calculate_vnode_effective_replication_map` have RF computed based
on the old token metadata during a topology change. The reading replicas, however,
are computed based on the new token metadata (`target_token_metadata`) when
`read_new` is true. That can create a mismatch for EverywhereStrategy during some
topology changes - RF can be equal to the number of reading replicas +-1. During
bootstrap, this can cause the
`everywhere_replication_strategy::sanity_check_read_replicas` check to fail in
debug mode.

We fix the check in this commit by allowing one more reading replica when
`read_new` is true.
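
The relaxed check can be sketched like this (hypothetical signature, not the C++ one):

```python
def sanity_check_read_replicas(read_replicas, rf, read_new):
    # RF is computed from the old token metadata while, with read_new,
    # the reading replicas come from the new metadata, so the counts may
    # differ by one during a topology change such as bootstrap.
    allowed = rf + (1 if read_new else 0)
    if len(read_replicas) > allowed:
        raise AssertionError("more read replicas than the replication factor allows")
```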

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1147

Closes scylladb/scylladb#29150

(cherry picked from commit 503a6e2d7e)

Closes scylladb/scylladb#29248
2026-03-27 18:04:33 +01:00
Pavel Emelyanov
233da83dd9 Merge 'Fix directory lister leak in table::get_snapshot_details: ' from Benny Halevy
As reported in SCYLLADB-1013, the directory lister must be closed also when an exception is thrown.

For example, see backtrace below:
```
seastar::on_internal_error(seastar::logger&, std::basic_string_view<char, std::char_traits<char>>) at ./build/release/seastar/./seastar/src/core/on_internal_error.cc:57
directory_lister::~directory_lister() at ./utils/lister.cc:77
replica::table::get_snapshot_details(std::filesystem::__cxx11::path, std::filesystem::__cxx11::path) (.resume) at ./replica/table.cc:4081
std::__n4861::coroutine_handle<seastar::internal::coroutine_traits_base<db::snapshot_ctl::table_snapshot_details>::promise_type>::resume() const at /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/coroutine:247
 (inlined by) seastar::internal::coroutine_traits_base<db::snapshot_ctl::table_snapshot_details>::promise_type::run_and_dispose() at ././seastar/include/seastar/core/coroutine.hh:129
seastar::reactor::task_queue::run_tasks() at ./build/release/seastar/./seastar/src/core/reactor.cc:2695
 (inlined by) seastar::reactor::task_queue_group::run_tasks() at ./build/release/seastar/./seastar/src/core/reactor.cc:3201
seastar::reactor::task_queue_group::run_some_tasks() at ./build/release/seastar/./seastar/src/core/reactor.cc:3185
 (inlined by) seastar::reactor::do_run() at ./build/release/seastar/./seastar/src/core/reactor.cc:3353
seastar::reactor::run() at ./build/release/seastar/./seastar/src/core/reactor.cc:3245
seastar::app_template::run_deprecated(int, char**, std::function<void ()>&&) at ./build/release/seastar/./seastar/src/core/app-template.cc:266
seastar::app_template::run(int, char**, std::function<seastar::future<int> ()>&&) at ./build/release/seastar/./seastar/src/core/app-template.cc:160
scylla_main(int, char**) at ./main.cc:756
```
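
The fix pattern, illustrated in Python (the actual code uses deferred_close and try/catch in C++):

```python
import os
from contextlib import contextmanager

@contextmanager
def directory_lister(path):
    """Close the lister on every exit path: an exception raised while
    processing entries can no longer leak the open directory handle."""
    it = os.scandir(path)
    try:
        yield it
    finally:
        it.close()  # runs on normal exit *and* when an exception propagates
```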

Fixes: [SCYLLADB-1013](https://scylladb.atlassian.net/browse/SCYLLADB-1013)

* Requires backport to 2026.1 since the leak exists since 004c08f525

[SCYLLADB-1013]: https://scylladb.atlassian.net/browse/SCYLLADB-1013?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ

Closes scylladb/scylladb#29084

* github.com:scylladb/scylladb:
  test/boost/database_test: add test_snapshot_ctl_details_exception_handling
  table: get_snapshot_details: fix indentation inside try block
  table: per-snapshot get_snapshot_details: fix typo in comment
  table: per-snapshot get_snapshot_details: always close lister using try/catch
  table: get_snapshot_details: always close lister using deferred_close

(cherry picked from commit f27dc12b7c)
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes scylladb/scylladb#29125
2026-03-25 10:04:39 +01:00
Robert Bindar
9b81939a93 replica/table: calculate manifest tablet_count from tablet map
During tests I noticed that when the number of tablets is very small,
say 2, and the number of nodes is 3 (2 shards per node), counting
storage groups per shard breaks down: one shard may end up holding 0
groups while the other holds 1, and on some nodes both shards hold 0
groups.
Taking the minimum among shards therefore reported a tablet count of 0
in the manifests for all 3 nodes, which is incorrect.
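
A toy model of the counting change (illustrative, not the actual replica code):

```python
def manifest_tablet_count_buggy(groups_per_shard):
    # Old behaviour: the minimum across shards reports 0 whenever any
    # shard happens to hold no storage group.
    return min(groups_per_shard)

def manifest_tablet_count_fixed(tablet_replica_nodes, node):
    # Fixed behaviour sketch: count this node's tablet replicas directly
    # from the tablet map, independent of how they land on shards.
    return sum(1 for replica_node in tablet_replica_nodes if replica_node == node)
```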

Signed-off-by: Robert Bindar <robert.bindar@scylladb.com>

Closes scylladb/scylladb#28978

(cherry picked from commit 29619e48d7)

Closes scylladb/scylladb#29101
2026-03-24 16:37:28 +01:00
Michał Chojnowski
804842e95c test/boost/cache_algorithm_test: disable sstable compression to avoid giant index pages
The test intentionally creates huge index pages.
But since 5e7fb08bf3,
the index reader allocates a block of memory for a whole index page,
instead of incrementally allocating small pieces during index parsing.
This giant allocation causes the test to fail spuriously in CI sometimes.

Fix this by disabling sstable compression on the test table,
which puts a hard cap of 2000 keys per index page.

Fixes: SCYLLADB-1152

Closes scylladb/scylladb#29152

(cherry picked from commit f29525f3a6)

Closes scylladb/scylladb#29172
2026-03-24 16:02:48 +02:00
Botond Dénes
4f77cb621f Merge 'tablets: Fix deadlock in background storage group merge fiber' from Tomasz Grabiec
When it deadlocks, groups stop merging and compaction group merge
backlog will run away.

Also, graceful shutdown will be blocked on it.

Found by flaky unit test
test_merge_chooses_best_replica_with_odd_count, which timed out in 1
in 100 runs.

Reason for deadlock:

When storage groups are merged, the main compaction group of the new
storage group takes a compaction lock, which is appended to
_compaction_reenablers_for_merging, and released when the merge
completion fiber is done with the whole batch.

If we accumulate more than 1 merge cycle for the fiber, deadlock
occurs. Lock order will be this

Initial state:

 cg0: main
 cg1: main
 cg2: main
 cg3: main

After 1st merge:

 cg0': main [locked], merging_groups=[cg0.main, cg1.main]
 cg1': main [locked], merging_groups=[cg2.main, cg3.main]

After 2nd merge:

 cg0'': main [locked], merging_groups=[cg0'.main [locked], cg0.main, cg1.main, cg1'.main [locked], cg2.main, cg3.main]

merge completion fiber will try to stop cg0'.main, which will be
blocked on compaction lock. which is held by the reenabler in
_compaction_reenablers_for_merging, hence deadlock.

The fix is to wait for background merge to finish before we start the
next merge. It's achieved by holding old erm in the background merge,
and doing a topology barrier from the merge finalizing transition.

Background merge is supposed to be a relatively quick operation: it
stops compaction groups, so it may wait for active requests, but it
shouldn't prolong the barrier indefinitely.

Tablet tests which trigger merge need to be adjusted to call the
barrier, otherwise they will be vulnerable to the deadlock.

Fixes SCYLLADB-928

Backport to >= 2025.4 because it's the earliest vulnerable due to f9021777d8.

Closes scylladb/scylladb#29007

* github.com:scylladb/scylladb:
  tablets: Fix deadlock in background storage group merge fiber
  replica: table: Propagate old erm to storage group merge
  test: boost: tablets_test: Save tablet metadata when ACKing split resize decision
  storage_service: Extract local_topology_barrier()

(cherry picked from commit 5573c3b18e)

Closes scylladb/scylladb#29144
2026-03-21 01:37:30 +01:00
Raphael S. Carvalho
eb6c333e1b streaming: Release space incrementally during file streaming
File streaming only releases the file descriptors of a tablet being
streamed at the very end of streaming. This means that if the tablet
being streamed finishes a largest-tier compaction after streaming
started, there will always be ~2x space amplification for that single
tablet. Since there can be up to 4 tablets being migrated away, this
can add up to a significant amount, as nodes are pushed to substantial
usage of the available space (~90%).

We want to optimize this by dropping reference to a sstable after
it was fully streamed. This way, we reduce the chances of hitting
2x space amplification for a given tablet.
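
A hedged sketch of the incremental-release idea (hypothetical interfaces):

```python
def stream_tablet(sstables, send, release):
    # Drop the reference to each sstable as soon as it is fully streamed,
    # instead of holding all of them until the end of the tablet migration;
    # compacted-away files can then be deleted mid-stream.
    for sst in list(sstables):
        send(sst)
        release(sst)
```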

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-790.
Closes scylladb/scylladb#28505

(cherry picked from commit 5b550e94a6)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes scylladb/scylladb#28769
2026-03-20 15:50:46 +02:00
Calle Wilund
8d21636a81 test_internode_compression: Add await for "run" coro:s
Fixes: SCYLLADB-907

Closes scylladb/scylladb#28885

(cherry picked from commit 35aab75256)

Closes scylladb/scylladb#28905
2026-03-20 10:32:31 +02:00
Yaron Kaikov
7f236baf61 .github/workflows/trigger-scylla-ci: fix heredoc injection in trigger-scylla-ci workflow
Move all ${{ }} expression interpolations into env: blocks so they are
passed as environment variables instead of being expanded directly into
shell scripts. This prevents an attacker from escaping the heredoc in
the Validate Comment Trigger step and executing arbitrary commands on
the runner.

The Verify Org Membership step is hardened in the same way for
defense-in-depth.

Refs: GHSA-9pmq-v59g-8fxp
Fixes: SCYLLADB-954

Closes scylladb/scylladb#28935

(cherry picked from commit 977bdd6260)

Closes scylladb/scylladb#28946
2026-03-20 10:32:06 +02:00
Calle Wilund
4da8641d83 test_encryption: Fix test_system_auth_encryption
Fixes: SCYLLADB-915

The test was quite broken: it was not awaiting coro:s, and a bunch of
its checks were no longer even close to valid (this is a ported dtest,
and not a very good one).

Closes scylladb/scylladb#28887

(cherry picked from commit ef795eda5b)

Closes scylladb/scylladb#28966
2026-03-20 10:30:48 +02:00
Łukasz Paszkowski
3ab789e1ca test/storage: harden out-of-space prevention tests around restart and disk-utilization transitions
The tests in test_out_of_space_prevention.py are flaky. Three issues contribute:

1. After creating/removing the blob file that simulates disk pressure,
   the tests immediately checked derived state (e.g., "compaction_manager
   - Drained") without first confirming the disk space monitor had detected
   the utilization change. Fix: explicitly wait for "Reached/Dropped below
   critical disk utilization level" right after creating/removing the blob
   file, before checking downstream effects.

2. Several tests called `manager.driver_connect()` or omitted reconnection
   entirely after `server_restart()` / `server_start()`. The pre-existing
   driver session can silently reconnect multiple times, causing subsequent
   CQL queries to fail. Fix: call `reconnect_driver()` after every node restart.
   Additionally, call `wait_for_cql_and_get_hosts()` where CQL is used afterward,
   to ensure all connection pools are established.

3. Some log assertions used marks captured before a restart, so they could
   match pre-restart messages or miss messages emitted in the correct post-restart
   window. Fix: refresh marks at the right points.

Apart from that, the patch fixes a typo: autotoogle -> autotoggle.

Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-655

Closes scylladb/scylladb#28626

(cherry picked from commit 826fd5d6c3)

Closes scylladb/scylladb#28967
2026-03-20 10:30:23 +02:00
Asias He
25a17282bd test: Fix coordinator assumption in do_test_tablet_incremental_repair_merge_error
The first node in the cluster is not guaranteed to be the coordinator
node. Hardcoding node 0 as the coordinator causes test flakiness.  This
patch dynamically finds the actual coordinator node and targets it for
error injection, log checking, and restarts.

Additionally, inject `tablet_force_tablet_count_decrease_once` across
all servers to force the tablet merge process to trigger once.

Fixes SCYLLADB-865

Closes scylladb/scylladb#28945

(cherry picked from commit e0483f6001)

Closes scylladb/scylladb#28969
2026-03-20 10:30:00 +02:00
Botond Dénes
7afcc56128 db,compaction: use utils::chunked_vector for cache invalidation ranges
Instead of dht::partition_ranges_vector, which is an std::vector<> and
has been seen to cause large allocations when calculating the ranges to
be invalidated after compaction:

    seastar_memory - oversized allocation: 147456 bytes. This is non-fatal, but could lead to latency and/or fragmentation issues. Please report: at
    [Backtrace #0]
    void seastar::backtrace<seastar::current_backtrace_tasklocal()::$_0>(seastar::current_backtrace_tasklocal()::$_0&&, bool) at ./build/release/seastar/./seastar/include/seastar/util/backtrace.hh:89
     (inlined by) seastar::current_backtrace_tasklocal() at ./build/release/seastar/./seastar/src/util/backtrace.cc:99
    seastar::current_tasktrace() at ./build/release/seastar/./seastar/src/util/backtrace.cc:136
    seastar::current_backtrace() at ./build/release/seastar/./seastar/src/util/backtrace.cc:169
    seastar::memory::cpu_pages::warn_large_allocation(unsigned long) at ./build/release/seastar/./seastar/src/core/memory.cc:840
    seastar::memory::cpu_pages::check_large_allocation(unsigned long) at ./build/release/seastar/./seastar/src/core/memory.cc:903
     (inlined by) seastar::memory::cpu_pages::allocate_large(unsigned int, bool) at ./build/release/seastar/./seastar/src/core/memory.cc:910
     (inlined by) seastar::memory::allocate_large(unsigned long, bool) at ./build/release/seastar/./seastar/src/core/memory.cc:1533
     (inlined by) seastar::memory::allocate_slowpath(unsigned long) at ./build/release/seastar/./seastar/src/core/memory.cc:1679
    seastar::memory::allocate(unsigned long) at ././seastar/src/core/memory.cc:1698
     (inlined by) operator new(unsigned long) at ././seastar/src/core/memory.cc:2440
     (inlined by) std::__new_allocator<interval<dht::ring_position>>::allocate(unsigned long, void const*) at /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/bits/new_allocator.h:151
     (inlined by) std::allocator<interval<dht::ring_position>>::allocate(unsigned long) at /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/bits/allocator.h:203
     (inlined by) std::allocator_traits<std::allocator<interval<dht::ring_position>>>::allocate(std::allocator<interval<dht::ring_position>>&, unsigned long) at /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/bits/alloc_traits.h:614
     (inlined by) std::_Vector_base<interval<dht::ring_position>, std::allocator<interval<dht::ring_position>>>::_M_allocate(unsigned long) at /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/bits/stl_vector.h:387
     (inlined by) std::vector<interval<dht::ring_position>, std::allocator<interval<dht::ring_position>>>::reserve(unsigned long) at /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/bits/vector.tcc:79
    dht::to_partition_ranges(utils::chunked_vector<interval<dht::token>, 131072ul> const&, seastar::bool_class<utils::can_yield_tag>) at ./dht/i_partitioner.cc:347
    compaction::compaction::get_ranges_for_invalidation(std::vector<seastar::lw_shared_ptr<sstables::sstable>, std::allocator<seastar::lw_shared_ptr<sstables::sstable>>> const&) at ./compaction/compaction.cc:619
     (inlined by) compaction::compaction::get_compaction_completion_desc(std::vector<seastar::lw_shared_ptr<sstables::sstable>, std::allocator<seastar::lw_shared_ptr<sstables::sstable>>>, std::vector<seastar::lw_shared_ptr<sstables::sstable>, std::allocator<seastar::lw_shared_ptr<sstables::sstable>>>) at ./compaction/compaction.cc:719
     (inlined by) compaction::regular_compaction::replace_remaining_exhausted_sstables() at ./compaction/compaction.cc:1362
    compaction::compaction::finish(std::chrono::time_point<db_clock, std::chrono::duration<long, std::ratio<1l, 1000l>>>, std::chrono::time_point<db_clock, std::chrono::duration<long, std::ratio<1l, 1000l>>>) at ./compaction/compaction.cc:1021
    compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0::operator()() at ./compaction/compaction.cc:1960
     (inlined by) compaction::compaction_result std::__invoke_impl<compaction::compaction_result, compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0>(std::__invoke_other, compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0&&) at /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/bits/invoke.h:63
     (inlined by) std::__invoke_result<compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0>::type std::__invoke<compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0>(compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0&&) at /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/bits/invoke.h:98
     (inlined by) decltype(auto) std::__apply_impl<compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0, std::tuple<>>(compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0&&, std::tuple<>&&, std::integer_sequence<unsigned long, ...>) at /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/tuple:2920
     (inlined by) decltype(auto) std::apply<compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0, std::tuple<>>(compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0&&, std::tuple<>&&) at /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/tuple:2935
     (inlined by) seastar::future<compaction::compaction_result> seastar::futurize<compaction::compaction_result>::apply<compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0>(compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0&&, std::tuple<>&&) at ././seastar/include/seastar/core/future.hh:1930
     (inlined by) seastar::futurize<std::invoke_result<compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0>::type>::type seastar::async<compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0>(seastar::thread_attributes, compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0&&)::'lambda'()::operator()() const at ././seastar/include/seastar/core/thread.hh:267
     (inlined by) seastar::noncopyable_function<void ()>::direct_vtable_for<seastar::futurize<std::invoke_result<compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0>::type>::type seastar::async<compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0>(seastar::thread_attributes, compaction::compaction::run(std::unique_ptr<compaction::compaction, std::default_delete<compaction::compaction>>)::$_0&&)::'lambda'()>::call(seastar::noncopyable_function<void ()> const*) at ././seastar/include/seastar/util/noncopyable_function.hh:138
    seastar::noncopyable_function<void ()>::operator()() const at ./build/release/seastar/./seastar/include/seastar/util/noncopyable_function.hh:224
     (inlined by) seastar::thread_context::main() at ./build/release/seastar/./seastar/src/core/thread.cc:318

dht::partition_ranges_vector is used on the hot path, so just convert
the problematic user -- cache invalidation -- to use
utils::chunked_vector<dht::partition_range> instead.

Fixes: SCYLLADB-121

Closes scylladb/scylladb#28855

(cherry picked from commit 13ff9c4394)

Closes scylladb/scylladb#28975
2026-03-20 10:29:23 +02:00
Andrzej Jackowski
32443ed6f7 reader_concurrency_semaphore: fix leak workaround
`e4da0afb8d5491bf995cbd1d7a7efb966c79ac34` introduces a protection
against resources that are "made up" out of thin air to
`reader_concurrency_semaphore`. If there are more `_resources` than
`_initial_resources`, it means there is a negative leak, and
`on_internal_error_noexcept` is called. In addition,
`_resources` is set to `std::max(_resources, _initial_resources)`.

However, the commit message of `e4da0afb8d5491bf995cbd1d7a7efb966c79ac34`
states the opposite: "The detection also clamps the
_resources to _initial_resources, to prevent any damage".

Before this commit, the protection mechanism didn't clamp
`_resources` to `_initial_resources` but instead kept `_resources` high,
possibly even growing indefinitely. This commit changes `std::max` to
`std::min` to make the code behave as intended.

Fixes: SCYLLADB-1014
Refs: SCYLLADB-163

Closes scylladb/scylladb#28982

(cherry picked from commit 9247dff8c2)

Closes scylladb/scylladb#28988
2026-03-20 10:28:45 +02:00
Botond Dénes
3e9b984020 Merge 'service: tasks: scan all tablets in tablet_virtual_task::wait' from Aleksandra Martyniuk
Currently, for repair tasks tablet_virtual_task::wait gathers the
ids of tablets that are to be repaired. The gathered set is later
used to check if the repair is still ongoing.

However, if the tablets are resized (split or merged), the gathered
set becomes irrelevant. Thus, we may end up with an invalid tablet id
error being thrown.

Wait until repair is done for all tablets in the table.

Fixes: https://github.com/scylladb/scylladb/issues/28202

Backport to 2026.1 needed as it contains the change that introduced the issue (d51b1fea94)

Closes scylladb/scylladb#28323

* github.com:scylladb/scylladb:
  service: fix indentation
  test: add test_tablet_repair_wait
  service: remove status_helper::tablets
  service: tasks: scan all tablets in tablet_virtual_task::wait

(cherry picked from commit 3fed6f9eff)

Closes scylladb/scylladb#28991
2026-03-20 10:28:03 +02:00
Dmitriy Kruglov
2d199fb609 docs: add cluster platform migration procedure
Document how to migrate a ScyllaDB cluster to different instance
types using the add-and-replace node cycling approach.

Closes: QAINFRA-42

Closes scylladb/scylladb#28458

(cherry picked from commit cee44716db)

Closes scylladb/scylladb#28995
2026-03-20 10:27:29 +02:00
Piotr Dulikowski
35cd7f9239 Merge 'cql3: pin prepared cache entry in prepare() to avoid invalid weak handle race' from Alex Dathskovsky
query_processor::prepare() could race with prepared statement invalidation: after loading from the prepared cache, we converted the cached object to a checked weak pointer and then continued asynchronous work (including error-injection waitpoints). If invalidation happened in that window, the weak handle could no longer be promoted and the prepare path could fail nondeterministically.

This change keeps a strong cache entry reference alive across the whole critical section in prepare() by using a pinned cache accessor (get_pinned()), and only deriving the weak handle while the entry is pinned. This removes the lifetime gap without adding retry loops.

  Test coverage was extended in test/cluster/test_prepare_race.py:

  - reproduces the invalidation-during-prepare window with injection,
  - verifies prepare completes successfully,
  - then invalidates again and executes the same stale client prepared object,
  - confirms the driver transparently re-requests/re-prepares and execution succeeds.

  This change introduces:

  - no behavior change for normal prepare flow besides stronger lifetime guarantees,
  - no new protocol semantics,
  - preserves existing cache invalidation logic,
  - adds explicit cluster-level regression coverage for both the race and driver reprepare path.
  - pushes the re-prepare operation towards the driver: the server will return an unprepared error the first time, and the driver will have to re-prepare during the execution stage

Fixes: https://github.com/scylladb/scylladb/issues/27657

Backport to active branches recommended: No node crash, but user-visible PREPARE failures under rare schema-invalidation race; low-risk timeout-bounded retry improves robustness.

Closes scylladb/scylladb#28952

* github.com:scylladb/scylladb:
  transport/messages: hold pinned prepared entry in PREPARE result
  cql3: pin prepared cache entry in prepare() to avoid invalid weak handle race

(cherry picked from commit d9a277453e)

Closes scylladb/scylladb#29001
2026-03-20 10:27:04 +02:00
Pavel Emelyanov
32ce43d4b1 database: Rate limit all tokens from a range
The limiter scans ranges to decide whether or not to rate-limit the
query. However, when considering each range, only the front token is
accounted for. This looks like a misprint.

The limiter was introduced in cc9a2ad41f

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes scylladb/scylladb#29050

(cherry picked from commit 8b1ca6dcd6)

Closes scylladb/scylladb#29107
2026-03-20 10:25:32 +02:00
Botond Dénes
fef7750eb6 Merge 'tasks: do not fail the wait request if rpc fails' from Aleksandra Martyniuk
During decommission, we first mark a topology request as done, then shut
down a node, and in the following steps we remove the node from the topology.
Thus, a finished request does not imply that the node has been removed from
the topology.

Due to that, in node_ops_virtual_task::wait, while gathering children
from the whole cluster, we may hit the connection exception - because
a node is still in topology, even though it is down.

Modify the get_children method to ignore the exception and warn
about the failure instead.

Keep token_metadata_ptr in get_children to prevent topology from changing.

Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-867

Needs backports to all versions

Closes scylladb/scylladb#29035

* github.com:scylladb/scylladb:
  tasks: fix indentation
  tasks: do not fail the wait request if rpc fails
  tasks: pass token_metadata_ptr to task_manager::virtual_task::impl::get_children

(cherry picked from commit 2e47fd9f56)

Closes scylladb/scylladb#29126
2026-03-20 10:24:30 +02:00
Botond Dénes
213442227d Merge 'doc: fix the installation section' from Anna Stuchlik
This PR fixes the Installation page:

- Replaces `http` with `https` in the download command.
- Replaces the Open Source example from the Installation section for CentOS (we overlooked this example before).

Fixes https://github.com/scylladb/scylladb/issues/29087

This update affects all supported versions and should be backported as a bug fix.

Closes scylladb/scylladb#29088

* github.com:scylladb/scylladb:
  doc: remove the Open Source Example from Installation
  doc: replace http with https in the installation instructions

(cherry picked from commit e8b37d1a89)

Closes scylladb/scylladb#29135
2026-03-20 10:23:55 +02:00
Patryk Jędrzejczak
1398a55d16 test: test_remove_garbage_group0_members: wait for token ring and group0 consistency before removenode
The removenode initiator could have an outdated token ring (still considering
the node removed by the previous removenode to be a token owner) and could
unexpectedly reject the operation.

Fix that by waiting for token ring and group0 consistency before removenode.
Note that the test already checks that consistency, but only for one node,
which is different from the removenode initiator.

This test has been removed in master together with the code being tested
(the gossip-based topology). Hence, the fix is submitted directly to 2026.1.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1103

Backport to all supported branches (other than 2026.1), as the test can fail
there.

Closes scylladb/scylladb#29108
2026-03-20 10:22:40 +02:00
Anna Stuchlik
a0a2a67634 doc: remove the instructions to install old versions from Web Installer
The Web Installer page includes instructions to install the old pre-2025.1 Enterprise versions,
which are no longer supported (since we released 2026.1).

This commit removes those redundant and misleading instructions.

Fixes https://github.com/scylladb/scylladb/issues/29099

Closes scylladb/scylladb#29103

(cherry picked from commit 6b1df5202c)

Closes scylladb/scylladb#29137
2026-03-20 10:22:08 +02:00
Avi Kivity
d4e454b5bc Merge 'Fix bad performance for densely populated partition index pages' from Tomasz Grabiec
This applies to small partition workload where index pages have high partition count, and the index doesn't fit in cache. It was observed that the count can be in the order of hundreds. In such a workload pages undergo constant population, LSA compaction, and LSA eviction, which has severe impact on CPU utilization.

Refs https://scylladb.atlassian.net/browse/SCYLLADB-620

This PR reduces the impact by several changes:

  - reducing memory footprint in the partition index. Assuming partition key size is 16 bytes, the cost dropped from 96 bytes to 36 bytes per partition.

  - flattening the object graph and amortizing storage. Storing entries directly in the vector. Storing all key values in a single managed_bytes. Making index_entry a trivial struct.

  - index entries and key storage are now trivially moveable, and batched inside vector storage
    so LSA migration can use memcpy(), which amortizes the cost per key. This reduces the cost of LSA segment compaction.

 - LSA eviction is now pretty much constant time for the whole page
   regardless of the number of entries, because elements are trivial and batched inside vectors.
   Page eviction cost dropped from 50 us to 1 us.

Performance evaluated with:

   scylla perf-simple-query -c1 -m200M --partitions=1000000

Before:

```
7774.96 tps (166.0 allocs/op, 521.7 logallocs/op,  54.0 tasks/op,  802428 insns/op,  430457 cycles/op,        0 errors)
7511.08 tps (166.1 allocs/op, 527.2 logallocs/op,  54.0 tasks/op,  804185 insns/op,  430752 cycles/op,        0 errors)
7740.44 tps (166.3 allocs/op, 526.2 logallocs/op,  54.2 tasks/op,  805347 insns/op,  432117 cycles/op,        0 errors)
7818.72 tps (165.2 allocs/op, 517.6 logallocs/op,  53.7 tasks/op,  794965 insns/op,  427751 cycles/op,        0 errors)
7865.49 tps (165.1 allocs/op, 513.3 logallocs/op,  53.6 tasks/op,  788898 insns/op,  425171 cycles/op,        0 errors)
```

After (+318%):

```
32492.40 tps (130.7 allocs/op,  12.8 logallocs/op,  36.1 tasks/op,  109236 insns/op,  103203 cycles/op,        0 errors)
32591.99 tps (130.4 allocs/op,  12.8 logallocs/op,  36.0 tasks/op,  108947 insns/op,  102889 cycles/op,        0 errors)
32514.52 tps (130.6 allocs/op,  12.8 logallocs/op,  36.0 tasks/op,  109118 insns/op,  103219 cycles/op,        0 errors)
32491.14 tps (130.6 allocs/op,  12.8 logallocs/op,  36.0 tasks/op,  109349 insns/op,  103272 cycles/op,        0 errors)
32582.90 tps (130.5 allocs/op,  12.8 logallocs/op,  36.0 tasks/op,  109269 insns/op,  102872 cycles/op,        0 errors)
32479.43 tps (130.6 allocs/op,  12.8 logallocs/op,  36.0 tasks/op,  109313 insns/op,  103242 cycles/op,        0 errors)
32418.48 tps (130.7 allocs/op,  12.8 logallocs/op,  36.1 tasks/op,  109201 insns/op,  103301 cycles/op,        0 errors)
31394.14 tps (130.7 allocs/op,  12.8 logallocs/op,  36.1 tasks/op,  109267 insns/op,  103301 cycles/op,        0 errors)
32298.55 tps (130.7 allocs/op,  12.8 logallocs/op,  36.1 tasks/op,  109323 insns/op,  103551 cycles/op,        0 errors)
```

When the workload is miss-only, with both row cache and index cache disabled (no cache maintenance cost):

  perf-simple-query -c1 -m200M --duration 6000 --partitions=100000 --enable-index-cache=0 --enable-cache=0

Before:

```
9124.57 tps (146.2 allocs/op, 789.0 logallocs/op,  45.3 tasks/op,  889320 insns/op,  357937 cycles/op,        0 errors)
9437.23 tps (146.1 allocs/op, 789.3 logallocs/op,  45.3 tasks/op,  889613 insns/op,  357782 cycles/op,        0 errors)
9455.65 tps (146.0 allocs/op, 787.4 logallocs/op,  45.2 tasks/op,  887606 insns/op,  357167 cycles/op,        0 errors)
9451.22 tps (146.0 allocs/op, 787.4 logallocs/op,  45.3 tasks/op,  887627 insns/op,  357357 cycles/op,        0 errors)
9429.50 tps (146.0 allocs/op, 787.4 logallocs/op,  45.3 tasks/op,  887761 insns/op,  358148 cycles/op,        0 errors)
9430.29 tps (146.1 allocs/op, 788.2 logallocs/op,  45.3 tasks/op,  888501 insns/op,  357679 cycles/op,        0 errors)
9454.08 tps (146.0 allocs/op, 787.3 logallocs/op,  45.3 tasks/op,  887545 insns/op,  357132 cycles/op,        0 errors)
```

After (+55%):

```
14484.84 tps (150.7 allocs/op,   6.5 logallocs/op,  44.7 tasks/op,  396164 insns/op,  229490 cycles/op,        0 errors)
14526.21 tps (150.8 allocs/op,   6.5 logallocs/op,  44.8 tasks/op,  396401 insns/op,  228824 cycles/op,        0 errors)
14567.53 tps (150.7 allocs/op,   6.5 logallocs/op,  44.7 tasks/op,  396319 insns/op,  228701 cycles/op,        0 errors)
14545.63 tps (150.6 allocs/op,   6.5 logallocs/op,  44.7 tasks/op,  395889 insns/op,  228493 cycles/op,        0 errors)
14626.06 tps (150.5 allocs/op,   6.5 logallocs/op,  44.7 tasks/op,  395254 insns/op,  227891 cycles/op,        0 errors)
14593.74 tps (150.5 allocs/op,   6.5 logallocs/op,  44.7 tasks/op,  395480 insns/op,  227993 cycles/op,        0 errors)
14538.10 tps (150.8 allocs/op,   6.5 logallocs/op,  44.8 tasks/op,  397035 insns/op,  228831 cycles/op,        0 errors)
14527.18 tps (150.8 allocs/op,   6.5 logallocs/op,  44.8 tasks/op,  396992 insns/op,  228839 cycles/op,        0 errors)
```

Same as above, but with summary ratio increased from 0.0005 to 0.005 (smaller pages):

Before:

```
33906.70 tps (146.1 allocs/op,  83.6 logallocs/op,  45.1 tasks/op,  170553 insns/op,   98104 cycles/op,        0 errors)
32696.16 tps (146.0 allocs/op,  83.5 logallocs/op,  45.1 tasks/op,  170369 insns/op,   98405 cycles/op,        0 errors)
33889.05 tps (146.1 allocs/op,  83.6 logallocs/op,  45.1 tasks/op,  170551 insns/op,   98135 cycles/op,        0 errors)
33893.24 tps (146.1 allocs/op,  83.5 logallocs/op,  45.1 tasks/op,  170488 insns/op,   98168 cycles/op,        0 errors)
33836.73 tps (146.1 allocs/op,  83.6 logallocs/op,  45.1 tasks/op,  170528 insns/op,   98226 cycles/op,        0 errors)
33897.61 tps (146.0 allocs/op,  83.5 logallocs/op,  45.1 tasks/op,  170428 insns/op,   98081 cycles/op,        0 errors)
33834.73 tps (146.1 allocs/op,  83.5 logallocs/op,  45.1 tasks/op,  170438 insns/op,   98178 cycles/op,        0 errors)
33776.31 tps (146.3 allocs/op,  83.9 logallocs/op,  45.2 tasks/op,  170958 insns/op,   98418 cycles/op,        0 errors)
33808.08 tps (146.3 allocs/op,  83.9 logallocs/op,  45.2 tasks/op,  170940 insns/op,   98388 cycles/op,        0 errors)
```

After (+18%):

```
40081.51 tps (148.2 allocs/op,   4.4 logallocs/op,  45.0 tasks/op,  121047 insns/op,   82231 cycles/op,        0 errors)
40005.85 tps (148.6 allocs/op,   4.4 logallocs/op,  45.2 tasks/op,  121327 insns/op,   82545 cycles/op,        0 errors)
39816.75 tps (148.3 allocs/op,   4.4 logallocs/op,  45.1 tasks/op,  121067 insns/op,   82419 cycles/op,        0 errors)
39953.11 tps (148.1 allocs/op,   4.4 logallocs/op,  45.0 tasks/op,  121027 insns/op,   82258 cycles/op,        0 errors)
40073.96 tps (148.2 allocs/op,   4.4 logallocs/op,  45.0 tasks/op,  121006 insns/op,   82313 cycles/op,        0 errors)
39882.25 tps (148.2 allocs/op,   4.4 logallocs/op,  45.0 tasks/op,  120925 insns/op,   82320 cycles/op,        0 errors)
39916.08 tps (148.3 allocs/op,   4.4 logallocs/op,  45.1 tasks/op,  121054 insns/op,   82393 cycles/op,        0 errors)
39786.30 tps (148.2 allocs/op,   4.4 logallocs/op,  45.0 tasks/op,  121027 insns/op,   82465 cycles/op,        0 errors)
38662.45 tps (148.3 allocs/op,   4.4 logallocs/op,  45.0 tasks/op,  121108 insns/op,   82312 cycles/op,        0 errors)
39849.42 tps (148.3 allocs/op,   4.4 logallocs/op,  45.1 tasks/op,  121098 insns/op,   82447 cycles/op,        0 errors)
```

Closes scylladb/scylladb#28603

* github.com:scylladb/scylladb:
  sstables: mx: index_reader: Optimize parsing for no promoted index case
  vint: Use std::countl_zero()
  test: sstable_partition_index_cache_test: Validate scenario of pages with sparse promoted index placement
  sstables: mx: index_reader: Amoritze partition key storage
  managed_bytes: Hoist write_fragmented() to common header
  utils: managed_vector: Use std::uninitialized_move() to move objects
  sstables: mx: index_reader: Keep promoted_index info next to index_entry
  sstables: mx: index_reader: Extract partition_index_page::clear_gently()
  sstables: mx: index_reader: Shave-off 16 bytes from index_entry by using raw_token
  sstables: mx: index_reader: Reduce allocation_section overhead during index page parsing by batching allocation
  sstables: mx: index_reader: Keep index_entry directly in the vector
  dht: Introduce raw_token
  test: perf_simple_query: Add 'sstable-format' command-line option
  test: perf_simple_query: Add 'sstable-summary-ratio' command-line option
  test: perf-simple-query: Add option to disable index cache
  test: cql_test_env: Respect enable-index-cache config

(cherry picked from commit 5e7fb08bf3)

Closes scylladb/scylladb#29136
2026-03-20 01:21:49 +01:00
Anna Stuchlik
825a36c97a doc: update the warning about shared dictionary training
This commit updates the inadequate warning on the Advanced Internode (RPC) Compression page.

The warning is replaced with a note about how training data is encrypted.

Fixes https://github.com/scylladb/scylladb/issues/29109

Closes scylladb/scylladb#29111

(cherry picked from commit 88b98fac3a)

Closes scylladb/scylladb#29119
2026-03-19 16:39:44 +02:00
Tomasz Grabiec
45413e99a5 Merge 'load_stats: improve tablet filtering for load stats' from Ferenc Szili
When computing table sizes via load_stats to determine if a split/merge is needed, we are filtering tablets which are being migrated, in order to avoid counting them twice (both on leaving and pending replica) in the total table size. The tablets are filtered so that they are counted on the leaving replica until the streaming stage, and on the pending replica after the streaming stage.

Currently, the procedure for collecting tablet sizes for load balancing also uses this same filter. This should be changed, because the load balancer needs to have as much information about tablet sizes as possible, and could ignore a node due to missing tablet sizes for tablets in the `write_both_read_new` and `use_new` stages.

For tablet size collection, we should include all the tablets which are currently taking up disk space. This means:
- on leaving replica, include all tablets until the `cleanup` stage
- on pending replica, include all tablets starting with the `write_both_read_new` and later stages

While this is an improvement, it causes problems with some of the tests, and therefore needs to be backported to 2026.1

Fixes: SCYLLADB-829

Closes scylladb/scylladb#28587

* github.com:scylladb/scylladb:
  load_stats: add filtering for tablet sizes
  load_stats: move tablet filtering for table size computation
  load_stats: bring the comment and code in sync

(cherry picked from commit 518470e89e)

Closes scylladb/scylladb#29034
2026-03-19 14:03:40 +01:00
Michał Hudobski
c93a935564 docs: update vector search filtering to reflect primary key support only
Remove outdated references to filtering on columns provided in the
index definition, and remove the note about equal relations (= and IN)
being the only supported operations. Vector search filtering currently
supports WHERE clauses on primary key columns only.

Closes scylladb/scylladb#28949

(cherry picked from commit 40d180a7ef)

Closes scylladb/scylladb#29069
2026-03-19 11:19:53 +01:00
Botond Dénes
69f78ce74a Merge 'perf-alternator: wait for alternator port before running workload' from Marcin Maliszkiewicz
This patch is mostly for the purpose of running pgo CI job.

We may receive a connection error if the asyncio.sleep(5) in
pgo.py is not a sufficient waiting time.

In pgo.py we do wait for a port, but only for CQL; in any case
it is better to have a high-level check than to wait for the
alternator port there.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1071
Backport: 2026.1 - it failed on CI for that build

Closes scylladb/scylladb#29063

* github.com:scylladb/scylladb:
  perf: add abort_source support to wait-for-port loops
  perf-alternator: wait for alternator port before running workload

(cherry picked from commit 172c786079)

Closes scylladb/scylladb#29098
2026-03-18 13:13:01 +01:00
Patryk Jędrzejczak
3513ce6069 test: test_raft_no_quorum: decrease group0_raft_op_timeout_in_ms after quorum loss
`test_raft_no_quorum.py::test_cannot_add_new_node` is currently flaky in dev
mode. The bootstrap of the first node can fail due to `add_entry()` timing
out (with the 1s timeout set by the test case).

Other test cases in this test file could fail in the same way as well, so we
need a general fix. We don't want to increase the timeout in dev mode, as it
would slow down the test. The solution is to keep the timeout unchanged, but
set it only after quorum is lost. This prevents unexpected timeouts of group0
operations with almost no impact on the test running time.

A note about the new `update_group0_raft_op_timeout` function: waiting for
the log seems to be necessary only for
`test_quorum_lost_during_node_join_response_handler`, but let's do it
for all test cases just in case (including `test_can_restart` that shouldn't
be flaky currently).

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-913

Closes scylladb/scylladb#28998

(cherry picked from commit 526e5986fe)

Closes scylladb/scylladb#29068
2026-03-17 18:04:27 +01:00
Piotr Dulikowski
0ca7253315 Merge 'vector_search: fix TLS server name with IP' from Karol Nowacki
SNI works only with DNS hostnames. Adding an IP address causes warnings
on the server side.
This change adds SNI only if it is not an IP address.

This change has no unit tests, as this behavior is not critical,
since it causes a warning on the server side.
The critical part, that the server name is verified, is already covered.

This PR also adds warning logs to improve future troubleshooting of connections to the vector-store nodes.

Fixes: VECTOR-528

Backports to 2025.04 and 2026.01 are required, as these branches are also affected.

Closes scylladb/scylladb#28637

* github.com:scylladb/scylladb:
  vector_search: fix TLS server name with IP
  vector_search: add warn log for failed ann requests

(cherry picked from commit 23ed0d4df8)

Closes scylladb/scylladb#28964
2026-03-17 15:30:07 +01:00
Piotr Dulikowski
c7ac3b5394 db: view: mutate_MV: don't hold keyspace ref across preemption
Currently, the view_update_generator::mutate_MV function acquires a
reference to the keyspace relevant to the operation, then it calls
max_concurrent_for_each and uses that reference inside the lambda passed
to that function. max_concurrent_for_each can preempt and there is no
mechanism that makes sure that the keyspace is alive until the view
updates are generated, so it is possible that the keyspace is freed by
the time the reference is used.

Fix the issue by precomputing the necessary information based on the
keyspace reference right away, and then passing that information by
value to the other parts of the code. It turns out that we only need to
know whether the keyspace uses tablets and whether it uses a network
topology strategy.

Fixes: scylladb/scylladb#28925

Closes scylladb/scylladb#28928

(cherry picked from commit 42d70baad3)

Closes scylladb/scylladb#28968
2026-03-17 13:35:22 +01:00
Piotr Dulikowski
d6ed05efc1 Merge '[Backport 2026.1] mv: allow skipping view updates when a collection is unmodified' from Scylladb[bot]
mv: allow skipping view updates when a collection is unmodified
When we generate view updates, we check whether we can skip the
entire view update if all columns selected by the view are unmodified.
However, for collection columns, we only check if they were unset
before and after the update.
In this patch we add a check for the actual collection contents.
We perform this check for both virtual and non-virtual selections.
When the column is only a virtual column in the view, it would be
enough to check the liveness of each collection cell, however for
that we'd need to deserialize the entire collection anyway, which
should be effectively as expensive as comparing all of its bytes.

Fixes: SCYLLADB-996

- (cherry picked from commit 01ddc17ab9)

Parent PR: #28839

Closes scylladb/scylladb#28977

* github.com:scylladb/scylladb:
  Merge 'mv: allow skipping view updates when a collection is unmodified' from Wojciech Mitros
  mv: remove dead code in view_updates::can_skip_view_updates
2026-03-17 13:34:28 +01:00
Jenkins Promoter
39fcc83e75 Update ScyllaDB version to: 2026.1.1 2026-03-17 13:21:54 +02:00
Jenkins Promoter
6250f1e967 Update pgo profiles - aarch64 2026-03-15 05:07:46 +02:00
Aleksandra Martyniuk
b307c9301d nodetool: cluster repair: do not fail if a table was dropped
nodetool cluster repair without additional params repairs all tablet
keyspaces in a cluster. Currently, if a table is dropped while
the command is running, all tables are repaired but the command finishes
with a failure.

Modify nodetool cluster repair. If a table wasn't specified
(i.e. all tables are repaired), the command finishes successfully
even if a table was dropped.

If a table was specified and it does not exist (e.g. because it was
dropped before the repair was requested), then the behavior remains
unchanged.

Fixes: SCYLLADB-568.

Closes scylladb/scylladb#28739

(cherry picked from commit 2e68f48068)

Closes scylladb/scylladb#29006
2026-03-14 22:28:00 +02:00
Botond Dénes
f26af8cd30 mutation/collection_mutation: don't copy the serialized collection
serialize_collection_mutation() copies the serialized collection into
the returned collection_mutation object. Change to move to avoid the
copy.

Fixes: SCYLLADB-1041

Closes scylladb/scylladb#29010

(cherry picked from commit 15cfa5beeb)

Closes scylladb/scylladb#29024
2026-03-12 23:34:58 +02:00
Marcin Maliszkiewicz
2bd10bff5e Merge 'test_proxy_protocol: fix flaky system.clients visibility checks' from Piotr Smaron
`test_proxy_protocol_port_preserved_in_system_clients` failed because it
didn't see the just created connection in system.clients immediately. The
last lines of the stacktrace are:
```
            # Complete CQL handshake
            await do_cql_handshake(reader, writer)

            # Now query system.clients using the driver to see our connection
            cql = manager.get_cql()
            rows = list(cql.execute(
                f"SELECT address, port FROM system.clients WHERE address = '{fake_src_addr}' ALLOW FILTERING"
            ))

            # We should find our connection with the fake source address and port
>           assert len(rows) > 0, f"Expected to find connection from {fake_src_addr} in system.clients"
E           AssertionError: Expected to find connection from 203.0.113.200 in system.clients
E           assert 0 > 0
E            +  where 0 = len([])
```
Explanation: we first await for the hand-made connection to be completed,
then, via another connection, we're querying system.clients, and we don't
get this hand-made connection in the resultset.
The solution is to replace the bare cql.execute() calls with await
wait_for_results(), a helper that polls via cql.run_async() until the
expected row count is reached (30 s timeout, 100 ms period).

Fixes: SCYLLADB-819

The flaky test is present on master and in previous release, so backporting only there.

Closes scylladb/scylladb#28849

* github.com:scylladb/scylladb:
  test_proxy_protocol: introduce extra logging to aid debugging
  test_proxy_protocol: fix flaky system.clients visibility checks

(cherry picked from commit 4150c62f29)

Closes scylladb/scylladb#28951
2026-03-10 22:48:14 +02:00
Avi Kivity
1105d83893 Merge 'mv: allow skipping view updates when a collection is unmodified' from Wojciech Mitros
When we generate view updates, we check whether we can skip the
entire view update if all columns selected by the view are unmodified.
However, for collection columns, we only check if they were unset
before and after the update.
In this patch we add a check for the actual collection contents.
We perform this check for both virtual and non-virtual selections.
When the column is only a virtual column in the view, it would be
enough to check the liveness of each collection cell, however for
that we'd need to deserialize the entire collection anyway, which
should be effectively as expensive as comparing all of its bytes.

Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-808

Closes scylladb/scylladb#28839

* github.com:scylladb/scylladb:
  mv: allow skipping view updates when a collection is unmodified
  mv: allow skipping view updates if an empty collection remains unset

(cherry picked from commit 01ddc17ab9)
2026-03-10 21:27:23 +01:00
Wojciech Mitros
9b9d5cee8a mv: remove dead code in view_updates::can_skip_view_updates
When we create a materialized view, we consider 2 cases:
1. the view's primary key contains a column that is not
in the primary key of the base table
2. the view's primary key doesn't contain such a column

In the 2nd case, we add all columns from the base table
to the schema of the view (as virtual columns). As a result,
all of these columns are effectively "selected" in
view_updates::can_skip_view_updates. Same thing happens when
we add new columns to the base table using ALTER.
Because of this, we can never have !column_is_selected and
!has_base_non_pk_columns_in_view_pk at the same time. And
thus, the check (!column_is_selected
&& _base_info.has_base_non_pk_columns_in_view_pk) is always
the same as (!column_is_selected).
Because we immediately return after this check, the tail of
this function is also never reached - all checks after the
(column_is_selected) are affected by this. Also, the condition
(!column_is_selected && base_has_nonexpiring_marker) is always
false at the point it is called. And this in turn makes the
`base_has_nonexpiring_marker` unused, so we delete it as well.

It's worth considering, why did we even have
`base_has_nonexpiring_marker` if it's effectively unused. We
initially introduced it in bd52e05ae2 and we (incorrectly)
used it to allow skipping view updates even if the liveness of
virtual columns changed. Soon after, in 5f85a7a821, we
started categorizing virtual columns as column_is_selected == true
and we moved the liveness checks for virtual columns to the
`if (column_is_selected)` clause, before the `base_has_nonexpiring_marker`
check. We changed this because even if we have a nonexpiring marker
right now, it may be changed in the future, in which case the liveness
of the view row will depend on liveness of the virtual columns and
we'll need to have the view updates from the time the row marker was
nonexpiring.

(cherry picked from commit ca1c8ff209)
2026-03-10 21:24:53 +01:00
Tomasz Grabiec
a8fd9936a3 Merge 'service: assert that tables updated via group0 use schema commitlog' from Aleksandra Martyniuk
Set enable_schema_commitlog for each group0 tables.

Assert that group0 tables use schema commitlog in ensure_group0_schema
(per each command).

Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-914.

Needs backport to all live releases as all are vulnerable

Closes scylladb/scylladb#28876

* github.com:scylladb/scylladb:
  test: add test_group0_tables_use_schema_commitlog
  db: service: remove group0 tables from schema commitlog schema initializer
  service: ensure that tables updated via group0 use schema commitlog
  db: schema: remove set_is_group0_table param

(cherry picked from commit b90fe19a42)

Closes scylladb/scylladb#28916
2026-03-10 11:58:03 +01:00
140 changed files with 4017 additions and 1021 deletions

View File

@@ -15,13 +15,19 @@ jobs:
- name: Verify Org Membership
id: verify_author
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
EVENT_NAME: ${{ github.event_name }}
PR_AUTHOR: ${{ github.event.pull_request.user.login }}
PR_ASSOCIATION: ${{ github.event.pull_request.author_association }}
COMMENT_AUTHOR: ${{ github.event.comment.user.login }}
COMMENT_ASSOCIATION: ${{ github.event.comment.author_association }}
shell: bash
run: |
if [[ "${{ github.event_name }}" == "pull_request_target" ]]; then
AUTHOR="${{ github.event.pull_request.user.login }}"
if [[ "$EVENT_NAME" == "pull_request_target" ]]; then
AUTHOR="$PR_AUTHOR"
ASSOCIATION="$PR_ASSOCIATION"
else
AUTHOR="${{ github.event.comment.user.login }}"
AUTHOR="$COMMENT_AUTHOR"
ASSOCIATION="$COMMENT_ASSOCIATION"
fi
ORG="scylladb"
if gh api "/orgs/${ORG}/members/${AUTHOR}" --silent 2>/dev/null; then
@@ -34,13 +40,11 @@ jobs:
- name: Validate Comment Trigger
if: github.event_name == 'issue_comment'
id: verify_comment
env:
COMMENT_BODY: ${{ github.event.comment.body }}
shell: bash
run: |
BODY=$(cat << 'EOF'
${{ github.event.comment.body }}
EOF
)
CLEAN_BODY=$(echo "$BODY" | grep -v '^[[:space:]]*>')
CLEAN_BODY=$(echo "$COMMENT_BODY" | grep -v '^[[:space:]]*>')
if echo "$CLEAN_BODY" | grep -qi '@scylladbbot' && echo "$CLEAN_BODY" | grep -qi 'trigger-ci'; then
echo "trigger=true" >> $GITHUB_OUTPUT

View File

@@ -78,7 +78,7 @@ fi
# Default scylla product/version tags
PRODUCT=scylla
VERSION=2026.1.0
VERSION=2026.1.2
if test -f version
then

View File

@@ -3464,7 +3464,11 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
if (should_add_wcu) {
rjson::add(ret, "ConsumedCapacity", std::move(consumed_capacity));
}
_stats.api_operations.batch_write_item_latency.mark(std::chrono::steady_clock::now() - start_time);
auto duration = std::chrono::steady_clock::now() - start_time;
_stats.api_operations.batch_write_item_latency.mark(duration);
for (const auto& w : per_table_wcu) {
w.first->api_operations.batch_write_item_latency.mark(duration);
}
co_return rjson::print(std::move(ret));
}
@@ -4975,7 +4979,12 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
if (!some_succeeded && eptr) {
co_await coroutine::return_exception_ptr(std::move(eptr));
}
_stats.api_operations.batch_get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
auto duration = std::chrono::steady_clock::now() - start_time;
_stats.api_operations.batch_get_item_latency.mark(duration);
for (const table_requests& rs : requests) {
lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *rs.schema);
per_table_stats->api_operations.batch_get_item_latency.mark(duration);
}
if (is_big(response)) {
co_return make_streamed(std::move(response));
} else {

View File

@@ -32,6 +32,8 @@ namespace {
logger mylog{"ldap_role_manager"}; // `log` is taken by math.
constexpr std::string_view user_placeholder = "{USER}";
struct url_desc_deleter {
void operator()(LDAPURLDesc *p) {
ldap_free_urldesc(p);
@@ -40,9 +42,141 @@ struct url_desc_deleter {
using url_desc_ptr = std::unique_ptr<LDAPURLDesc, url_desc_deleter>;
url_desc_ptr parse_url(std::string_view url) {
/// Escapes LDAP filter assertion value per RFC 4515 Section 3.
/// The characters *, (, ), \, and NUL must be backslash-hex-escaped
/// to prevent filter injection when interpolating untrusted input.
sstring escape_filter_value(std::string_view value) {
size_t escapable_chars = 0;
for (unsigned char ch : value) {
switch (ch) {
case '*':
case '(':
case ')':
case '\\':
case '\0':
++escapable_chars;
break;
default:
break;
}
}
if (escapable_chars == 0) {
return sstring(value);
}
sstring escaped(value.size() + escapable_chars * 2, 0);
size_t pos = 0;
for (unsigned char ch : value) {
switch (ch) {
case '*':
escaped[pos++] = '\\';
escaped[pos++] = '2';
escaped[pos++] = 'a';
break;
case '(':
escaped[pos++] = '\\';
escaped[pos++] = '2';
escaped[pos++] = '8';
break;
case ')':
escaped[pos++] = '\\';
escaped[pos++] = '2';
escaped[pos++] = '9';
break;
case '\\':
escaped[pos++] = '\\';
escaped[pos++] = '5';
escaped[pos++] = 'c';
break;
case '\0':
escaped[pos++] = '\\';
escaped[pos++] = '0';
escaped[pos++] = '0';
break;
default:
escaped[pos++] = static_cast<char>(ch);
break;
}
}
return escaped;
}
/// Percent-encodes characters that are not RFC 3986 "unreserved"
/// (ALPHA / DIGIT / '-' / '.' / '_' / '~').
///
/// Uses explicit ASCII range checks instead of std::isalnum() because
/// the latter is locale-dependent and could pass non-ASCII characters
/// through unencoded under certain locale settings.
///
/// This is applied AFTER RFC 4515 filter escaping when the value is
/// substituted into an LDAP URL. It serves two purposes:
/// 1. Prevents URL-level metacharacters ('?', '#') from breaking
/// the URL structure parsed by ldap_url_parse.
/// 2. Prevents percent-decoding (which ldap_url_parse performs on
/// each component) from undoing the filter escaping, e.g. a
/// literal "%2a" in the username would otherwise decode to '*'.
sstring percent_encode_for_url(std::string_view value) {
static constexpr char hex[] = "0123456789ABCDEF";
size_t chars_to_encode = 0;
for (unsigned char ch : value) {
if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9')
|| ch == '-' || ch == '.' || ch == '_' || ch == '~')) {
++chars_to_encode;
}
}
if (chars_to_encode == 0) {
return sstring(value);
}
sstring encoded(value.size() + chars_to_encode * 2, 0);
size_t pos = 0;
for (unsigned char ch : value) {
if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9')
|| ch == '-' || ch == '.' || ch == '_' || ch == '~') {
encoded[pos++] = static_cast<char>(ch);
} else {
encoded[pos++] = '%';
encoded[pos++] = hex[ch >> 4];
encoded[pos++] = hex[ch & 0x0F];
}
}
return encoded;
}
/// Checks whether \p sentinel appears in any parsed URL component
/// other than the filter (host, DN, attributes, extensions).
bool sentinel_outside_filter(const LDAPURLDesc& desc, std::string_view sentinel) {
auto contains = [&](const char* field) {
return field && std::string_view(field).find(sentinel) != std::string_view::npos;
};
if (contains(desc.lud_host) || contains(desc.lud_dn)) {
return true;
}
if (desc.lud_attrs) {
for (int i = 0; desc.lud_attrs[i]; ++i) {
if (contains(desc.lud_attrs[i])) {
return true;
}
}
}
if (desc.lud_exts) {
for (int i = 0; desc.lud_exts[i]; ++i) {
if (contains(desc.lud_exts[i])) {
return true;
}
}
}
return false;
}
url_desc_ptr parse_url(const sstring& url) {
LDAPURLDesc *desc = nullptr;
if (ldap_url_parse(url.data(), &desc)) {
if (ldap_url_parse(url.c_str(), &desc)) {
mylog.error("error in ldap_url_parse({})", url);
}
return url_desc_ptr(desc);
@@ -115,6 +249,7 @@ const resource_set& ldap_role_manager::protected_resources() const {
}
future<> ldap_role_manager::start() {
validate_query_template();
if (!parse_url(get_url("dummy-user"))) { // Just need host and port -- any user should do.
return make_exception_future(
std::runtime_error(fmt::format("error getting LDAP server address from template {}", _query_template)));
@@ -199,7 +334,7 @@ future<> ldap_role_manager::revoke(std::string_view, std::string_view, ::service
}
future<role_set> ldap_role_manager::query_granted(std::string_view grantee_name, recursive_role_query) {
const auto url = get_url(grantee_name.data());
const auto url = get_url(grantee_name);
auto desc = parse_url(url);
if (!desc) {
return make_exception_future<role_set>(std::runtime_error(format("Error parsing URL {}", url)));
@@ -331,7 +466,46 @@ future<> ldap_role_manager::remove_attribute(std::string_view role_name, std::st
}
sstring ldap_role_manager::get_url(std::string_view user) const {
return boost::replace_all_copy(_query_template, "{USER}", user);
// Two-layer encoding protects against injection:
// 1. RFC 4515 filter escaping neutralizes filter metacharacters (*, (, ), \, NUL)
// 2. URL percent-encoding prevents URL structure injection (?, #) and blocks
// ldap_url_parse's percent-decoding from undoing the filter escaping (%2a -> *)
return boost::replace_all_copy(_query_template, user_placeholder,
percent_encode_for_url(escape_filter_value(user)));
}
void ldap_role_manager::validate_query_template() const {
if (_query_template.find(user_placeholder) == sstring::npos) {
return;
}
// Substitute {USER} with a sentinel and let ldap_url_parse tell us
// which URL component it landed in. The sentinel is purely
// alphanumeric so it cannot affect URL parsing.
static constexpr std::string_view sentinel = "XLDAPSENTINELX";
sstring test_url = boost::replace_all_copy(_query_template, user_placeholder, sentinel);
auto desc = parse_url(test_url);
if (!desc) {
throw url_error(format("LDAP URL template is not a valid URL when {{USER}} is substituted: {}", _query_template));
}
// The sentinel must appear in the filter ...
if (!desc->lud_filter
|| std::string_view(desc->lud_filter).find(sentinel) == std::string_view::npos) {
throw url_error(format(
"LDAP URL template places {{USER}} outside the filter component. "
"RFC 4515 filter escaping only protects the filter; other components "
"(e.g. the base DN) require different escaping and are not supported. "
"Template: {}", _query_template));
}
// ... and nowhere else (host, DN, attributes, extensions).
if (sentinel_outside_filter(*desc, sentinel)) {
throw url_error(format(
"LDAP URL template places {{USER}} outside the filter component. "
"RFC 4515 filter escaping only protects the filter; other components "
"(e.g. the host) require different escaping and are not supported. "
"Template: {}", _query_template));
}
}
future<std::vector<cql3::description>> ldap_role_manager::describe_role_grants() {

View File

@@ -107,6 +107,9 @@ class ldap_role_manager : public role_manager {
/// Macro-expands _query_template, returning the result.
sstring get_url(std::string_view user) const;
/// Validates that {USER}, if present, is used only in the LDAP filter component.
void validate_query_template() const;
/// Used to auto-create roles returned by ldap.
future<> create_role(std::string_view role_name);

View File

@@ -48,6 +48,7 @@
#include "mutation/mutation_fragment_stream_validator.hh"
#include "utils/assert.hh"
#include "utils/error_injection.hh"
#include "utils/chunked_vector.hh"
#include "utils/pretty_printers.hh"
#include "readers/multi_range.hh"
#include "readers/compacting.hh"
@@ -611,23 +612,23 @@ private:
}
// Called in a seastar thread
dht::partition_range_vector
utils::chunked_vector<dht::partition_range>
get_ranges_for_invalidation(const std::vector<sstables::shared_sstable>& sstables) {
// If owned ranges is disengaged, it means no cleanup work was done and
// so nothing needs to be invalidated.
if (!_owned_ranges) {
return dht::partition_range_vector{};
return {};
}
auto owned_ranges = dht::to_partition_ranges(*_owned_ranges, utils::can_yield::yes);
auto owned_ranges = dht::to_partition_ranges_chunked(*_owned_ranges).get();
auto non_owned_ranges = sstables
| std::views::transform([] (const sstables::shared_sstable& sst) {
seastar::thread::maybe_yield();
return dht::partition_range::make({sst->get_first_decorated_key(), true},
{sst->get_last_decorated_key(), true});
}) | std::ranges::to<dht::partition_range_vector>();
}) | std::ranges::to<utils::chunked_vector<dht::partition_range>>();
return dht::subtract_ranges(*_schema, non_owned_ranges, std::move(owned_ranges)).get();
return dht::subtract_ranges(*_schema, std::move(non_owned_ranges), std::move(owned_ranges)).get();
}
protected:
compaction(compaction_group_view& table_s, compaction_descriptor descriptor, compaction_data& cdata, compaction_progress_monitor& progress_monitor, use_backlog_tracker use_backlog_tracker)
@@ -718,8 +719,8 @@ protected:
compaction_completion_desc
get_compaction_completion_desc(std::vector<sstables::shared_sstable> input_sstables, std::vector<sstables::shared_sstable> output_sstables) {
auto ranges_for_for_invalidation = get_ranges_for_invalidation(input_sstables);
return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables), std::move(ranges_for_for_invalidation)};
auto ranges = get_ranges_for_invalidation(input_sstables);
return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables), std::move(ranges)};
}
// Tombstone expiration is enabled based on the presence of sstable set.

View File

@@ -16,6 +16,7 @@
#include "sstables/sstable_set.hh"
#include "compaction_fwd.hh"
#include "mutation_writer/token_group_based_splitting_writer.hh"
#include "utils/chunked_vector.hh"
namespace compaction {
@@ -38,7 +39,7 @@ struct compaction_completion_desc {
// New, fresh SSTables that should be added to SSTable set, replacing the old ones.
std::vector<sstables::shared_sstable> new_sstables;
// Set of compacted partition ranges that should be invalidated in the cache.
dht::partition_range_vector ranges_for_cache_invalidation;
utils::chunked_vector<dht::partition_range> ranges_for_cache_invalidation;
};
// creates a new SSTable for a given shard

View File

@@ -1268,9 +1268,15 @@ future<> compaction_manager::start(const db::config& cfg, utils::disk_space_moni
if (dsm && (this_shard_id() == 0)) {
_out_of_space_subscription = dsm->subscribe(cfg.critical_disk_utilization_level, [this] (auto threshold_reached) {
if (threshold_reached) {
return container().invoke_on_all([] (compaction_manager& cm) { return cm.drain(); });
return container().invoke_on_all([] (compaction_manager& cm) {
cm._in_critical_disk_utilization_mode = true;
return cm.drain();
});
}
return container().invoke_on_all([] (compaction_manager& cm) { cm.enable(); });
return container().invoke_on_all([] (compaction_manager& cm) {
cm._in_critical_disk_utilization_mode = false;
cm.enable();
});
});
}
@@ -2291,6 +2297,16 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_spl
return perform_task_on_all_files<split_compaction_task_executor>("split", info, t, std::move(options), std::move(owned_ranges_ptr), std::move(get_sstables), throw_if_stopping::no);
}
std::exception_ptr compaction_manager::make_disabled_exception(compaction::compaction_group_view& cg) {
std::exception_ptr ex;
if (_in_critical_disk_utilization_mode) {
ex = std::make_exception_ptr(std::runtime_error("critical disk utilization"));
} else {
ex = std::make_exception_ptr(compaction_stopped_exception(cg.schema()->ks_name(), cg.schema()->cf_name(), "compaction disabled"));
}
return ex;
}
future<std::vector<sstables::shared_sstable>>
compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt) {
if (!split_compaction_task_executor::sstable_needs_split(sst, opt)) {
@@ -2300,8 +2316,7 @@ compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compac
// We don't want to prevent split because compaction is temporarily disabled on a view only for synchronization,
// which is unneeded against new sstables that aren't part of any set yet, so never use can_proceed(&t) here.
if (is_disabled()) {
co_return coroutine::exception(std::make_exception_ptr(std::runtime_error(format("Cannot split {} because manager has compaction disabled, " \
"reason might be out of space prevention", sst->get_filename()))));
co_return coroutine::exception(make_disabled_exception(t));
}
std::vector<sstables::shared_sstable> ret;

View File

@@ -114,6 +114,8 @@ private:
uint32_t _disabled_state_count = 0;
bool is_disabled() const { return _state != state::running || _disabled_state_count > 0; }
// precondition: is_disabled() is true.
std::exception_ptr make_disabled_exception(compaction::compaction_group_view& cg);
std::optional<future<>> _stop_future;
@@ -173,6 +175,7 @@ private:
tombstone_gc_state _tombstone_gc_state;
utils::disk_space_monitor::subscription _out_of_space_subscription;
bool _in_critical_disk_utilization_mode = false;
private:
// Requires task->_compaction_state.gate to be held and task to be registered in _tasks.
future<compaction_stats_opt> perform_task(shared_ptr<compaction::compaction_task_executor> task, throw_if_stopping do_throw_if_stopping);

View File

@@ -105,6 +105,7 @@ public:
static const std::chrono::minutes entry_expiry;
using key_type = prepared_cache_key_type;
using pinned_value_type = cache_value_ptr;
using value_type = checked_weak_ptr;
using statement_is_too_big = typename cache_type::entry_is_too_big;
@@ -116,9 +117,14 @@ public:
: _cache(size, entry_expiry, logger)
{}
template <typename LoadFunc>
future<pinned_value_type> get_pinned(const key_type& key, LoadFunc&& load) {
return _cache.get_ptr(key.key(), [load = std::forward<LoadFunc>(load)] (const cache_key_type&) { return load(); });
}
template <typename LoadFunc>
future<value_type> get(const key_type& key, LoadFunc&& load) {
return _cache.get_ptr(key.key(), [load = std::forward<LoadFunc>(load)] (const cache_key_type&) { return load(); }).then([] (cache_value_ptr v_ptr) {
return get_pinned(key, std::forward<LoadFunc>(load)).then([] (cache_value_ptr v_ptr) {
return make_ready_future<value_type>((*v_ptr)->checked_weak_from_this());
});
}

View File

@@ -697,7 +697,7 @@ future<::shared_ptr<cql_transport::messages::result_message::prepared>>
query_processor::prepare(sstring query_string, const service::client_state& client_state, cql3::dialect d) {
try {
auto key = compute_id(query_string, client_state.get_raw_keyspace(), d);
auto prep_ptr = co_await _prepared_cache.get(key, [this, &query_string, &client_state, d] {
auto prep_entry = co_await _prepared_cache.get_pinned(key, [this, &query_string, &client_state, d] {
auto prepared = get_statement(query_string, client_state, d);
prepared->calculate_metadata_id();
auto bound_terms = prepared->statement->get_bound_terms();
@@ -711,13 +711,13 @@ query_processor::prepare(sstring query_string, const service::client_state& clie
return make_ready_future<std::unique_ptr<statements::prepared_statement>>(std::move(prepared));
});
const auto& warnings = prep_ptr->warnings;
const auto msg = ::make_shared<result_message::prepared::cql>(prepared_cache_key_type::cql_id(key), std::move(prep_ptr),
co_await utils::get_local_injector().inject(
"query_processor_prepare_wait_after_cache_get",
utils::wait_for_message(std::chrono::seconds(60)));
auto msg = ::make_shared<result_message::prepared::cql>(prepared_cache_key_type::cql_id(key), std::move(prep_entry),
client_state.is_protocol_extension_set(cql_transport::cql_protocol_extension::LWT_ADD_METADATA_MARK));
for (const auto& w : warnings) {
msg->add_warning(w);
}
co_return ::shared_ptr<cql_transport::messages::result_message::prepared>(std::move(msg));
co_return std::move(msg);
} catch(typename prepared_statements_cache::statement_is_too_big&) {
throw prepared_statement_is_too_big(query_string);
}
@@ -1029,6 +1029,11 @@ query_processor::execute_batch_without_checking_exception_message(
query_options& options,
std::unordered_map<prepared_cache_key_type, authorized_prepared_statements_cache::value_type> pending_authorization_entries) {
auto access_future = co_await coroutine::as_future(batch->check_access(*this, query_state.get_client_state()));
bool failed = access_future.failed();
co_await audit::inspect(batch, query_state, options, failed);
if (failed) {
std::rethrow_exception(access_future.get_exception());
}
co_await coroutine::parallel_for_each(pending_authorization_entries, [this, &query_state] (auto& e) -> future<> {
try {
co_await _authorized_prepared_cache.insert(*query_state.get_client_state().user(), e.first, std::move(e.second));
@@ -1036,11 +1041,6 @@ query_processor::execute_batch_without_checking_exception_message(
log.error("failed to cache the entry: {}", std::current_exception());
}
});
bool failed = access_future.failed();
co_await audit::inspect(batch, query_state, options, failed);
if (access_future.failed()) {
std::rethrow_exception(access_future.get_exception());
}
batch->validate();
batch->validate(*this, query_state.get_client_state());
_stats.queries_by_cl[size_t(options.get_consistency())] += batch->get_statements().size();

View File

@@ -201,6 +201,10 @@ public:
return _clustering_columns_restrictions;
}
const expr::expression& get_nonprimary_key_restrictions() const {
return _nonprimary_key_restrictions;
}
// Get a set of columns restricted by the IS NOT NULL restriction.
// IS NOT NULL is a special case that is handled separately from other restrictions.
const std::unordered_set<const column_definition*> get_not_null_columns() const;

View File

@@ -461,7 +461,17 @@ public:
}
}
} else {
_background_continuity = true; // Default continuity
if (_reversed) [[unlikely]] {
if (!rows.empty()) {
it = std::prev(rows.end());
cont = is_continuous::yes;
rt = {};
} else {
_background_continuity = true;
}
} else {
_background_continuity = true;
}
}
if (!it) {

View File

@@ -29,6 +29,7 @@
#include "utils/assert.hh"
#include "utils/updateable_value.hh"
#include "utils/labels.hh"
#include "utils/chunked_vector.hh"
namespace cache {
@@ -1215,10 +1216,10 @@ future<> row_cache::invalidate(external_updater eu, const dht::decorated_key& dk
}
future<> row_cache::invalidate(external_updater eu, const dht::partition_range& range, cache_invalidation_filter filter) {
return invalidate(std::move(eu), dht::partition_range_vector({range}), std::move(filter));
return invalidate(std::move(eu), utils::chunked_vector<dht::partition_range>({range}), std::move(filter));
}
future<> row_cache::invalidate(external_updater eu, dht::partition_range_vector&& ranges, cache_invalidation_filter filter) {
future<> row_cache::invalidate(external_updater eu, utils::chunked_vector<dht::partition_range>&& ranges, cache_invalidation_filter filter) {
return do_update(std::move(eu), [this, ranges = std::move(ranges), filter = std::move(filter)] mutable {
return seastar::async([this, ranges = std::move(ranges), filter = std::move(filter)] {
auto on_failure = defer([this] () noexcept {

View File

@@ -17,6 +17,7 @@
#include "utils/histogram.hh"
#include "mutation/partition_version.hh"
#include "utils/double-decker.hh"
#include "utils/chunked_vector.hh"
#include "db/cache_tracker.hh"
#include "readers/empty.hh"
#include "readers/mutation_source.hh"
@@ -457,7 +458,7 @@ public:
// mutation source made prior to the call to invalidate().
future<> invalidate(external_updater, const dht::decorated_key&);
future<> invalidate(external_updater, const dht::partition_range& = query::full_partition_range, cache_invalidation_filter filter = [] (const auto&) { return true; });
future<> invalidate(external_updater, dht::partition_range_vector&&, cache_invalidation_filter filter = [] (const auto&) { return true; });
future<> invalidate(external_updater, utils::chunked_vector<dht::partition_range>&&, cache_invalidation_filter filter = [] (const auto&) { return true; });
// Evicts entries from cache.
//

View File

@@ -105,7 +105,7 @@ namespace {
schema_builder::register_schema_initializer([](schema_builder& builder) {
if (builder.ks_name() == schema_tables::NAME) {
// all schema tables are group0 tables
builder.set_is_group0_table(true);
builder.set_is_group0_table();
}
});
}

View File

@@ -87,31 +87,15 @@ namespace {
static const std::unordered_set<sstring> tables = {
schema_tables::SCYLLA_TABLE_SCHEMA_HISTORY,
system_keyspace::BROADCAST_KV_STORE,
system_keyspace::CDC_GENERATIONS_V3,
system_keyspace::RAFT,
system_keyspace::RAFT_SNAPSHOTS,
system_keyspace::RAFT_SNAPSHOT_CONFIG,
system_keyspace::GROUP0_HISTORY,
system_keyspace::DISCOVERY,
system_keyspace::TABLETS,
system_keyspace::TOPOLOGY,
system_keyspace::TOPOLOGY_REQUESTS,
system_keyspace::LOCAL,
system_keyspace::PEERS,
system_keyspace::SCYLLA_LOCAL,
system_keyspace::COMMITLOG_CLEANUPS,
system_keyspace::SERVICE_LEVELS_V2,
system_keyspace::VIEW_BUILD_STATUS_V2,
system_keyspace::CDC_STREAMS_STATE,
system_keyspace::CDC_STREAMS_HISTORY,
system_keyspace::ROLES,
system_keyspace::ROLE_MEMBERS,
system_keyspace::ROLE_ATTRIBUTES,
system_keyspace::ROLE_PERMISSIONS,
system_keyspace::CDC_LOCAL,
system_keyspace::DICTS,
system_keyspace::VIEW_BUILDING_TASKS,
system_keyspace::CLIENT_ROUTES,
};
if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) {
builder.enable_schema_commitlog();
@@ -143,7 +127,7 @@ namespace {
system_keyspace::REPAIR_TASKS,
};
if (builder.ks_name() == system_keyspace::NAME && tables.contains(builder.cf_name())) {
builder.set_is_group0_table(true);
builder.set_is_group0_table();
}
});
}

View File

@@ -930,8 +930,7 @@ bool view_updates::can_skip_view_updates(const clustering_or_static_row& update,
const row& existing_row = existing.cells();
const row& updated_row = update.cells();
const bool base_has_nonexpiring_marker = update.marker().is_live() && !update.marker().is_expiring();
return std::ranges::all_of(_base->regular_columns(), [this, &updated_row, &existing_row, base_has_nonexpiring_marker] (const column_definition& cdef) {
return std::ranges::all_of(_base->regular_columns(), [this, &updated_row, &existing_row] (const column_definition& cdef) {
const auto view_it = _view->columns_by_name().find(cdef.name());
const bool column_is_selected = view_it != _view->columns_by_name().end();
@@ -939,49 +938,29 @@ bool view_updates::can_skip_view_updates(const clustering_or_static_row& update,
// as part of its PK, there are NO virtual columns corresponding to the unselected columns in the view.
// Because of that, we don't generate view updates when the value in an unselected column is created
// or changes.
if (!column_is_selected && _base_info.has_base_non_pk_columns_in_view_pk) {
if (!column_is_selected) {
return true;
}
//TODO(sarna): Optimize collections case - currently they do not go under optimization
if (!cdef.is_atomic()) {
return false;
}
// We cannot skip if the value was created or deleted, unless we have a non-expiring marker
// We cannot skip if the value was created or deleted
const auto* existing_cell = existing_row.find_cell(cdef.id);
const auto* updated_cell = updated_row.find_cell(cdef.id);
if (existing_cell == nullptr || updated_cell == nullptr) {
return existing_cell == updated_cell || (!column_is_selected && base_has_nonexpiring_marker);
return existing_cell == updated_cell;
}
if (!cdef.is_atomic()) {
return existing_cell->as_collection_mutation().data == updated_cell->as_collection_mutation().data;
}
atomic_cell_view existing_cell_view = existing_cell->as_atomic_cell(cdef);
atomic_cell_view updated_cell_view = updated_cell->as_atomic_cell(cdef);
// We cannot skip when a selected column is changed
if (column_is_selected) {
if (view_it->second->is_view_virtual()) {
return atomic_cells_liveness_equal(existing_cell_view, updated_cell_view);
}
return compare_atomic_cell_for_merge(existing_cell_view, updated_cell_view) == 0;
if (view_it->second->is_view_virtual()) {
return atomic_cells_liveness_equal(existing_cell_view, updated_cell_view);
}
// With non-expiring row marker, liveness checks below are not relevant
if (base_has_nonexpiring_marker) {
return true;
}
if (existing_cell_view.is_live() != updated_cell_view.is_live()) {
return false;
}
// We cannot skip if the change updates TTL
const bool existing_has_ttl = existing_cell_view.is_live_and_has_ttl();
const bool updated_has_ttl = updated_cell_view.is_live_and_has_ttl();
if (existing_has_ttl || updated_has_ttl) {
return existing_has_ttl == updated_has_ttl && existing_cell_view.expiry() == updated_cell_view.expiry();
}
return true;
return compare_atomic_cell_for_merge(existing_cell_view, updated_cell_view) == 0;
});
}
@@ -1749,7 +1728,7 @@ static endpoints_to_update get_view_natural_endpoint_vnodes(
std::vector<std::reference_wrapper<const locator::node>> base_nodes,
std::vector<std::reference_wrapper<const locator::node>> view_nodes,
locator::endpoint_dc_rack my_location,
const locator::network_topology_strategy* network_topology,
const bool network_topology,
replica::cf_stats& cf_stats) {
using node_vector = std::vector<std::reference_wrapper<const locator::node>>;
node_vector base_endpoints, view_endpoints;
@@ -1902,7 +1881,7 @@ endpoints_to_update get_view_natural_endpoint(
locator::host_id me,
const locator::effective_replication_map_ptr& base_erm,
const locator::effective_replication_map_ptr& view_erm,
const locator::abstract_replication_strategy& replication_strategy,
const bool network_topology,
const dht::token& base_token,
const dht::token& view_token,
bool use_tablets,
@@ -1910,7 +1889,6 @@ endpoints_to_update get_view_natural_endpoint(
auto& topology = base_erm->get_token_metadata_ptr()->get_topology();
auto& view_topology = view_erm->get_token_metadata_ptr()->get_topology();
auto& my_location = topology.get_location(me);
auto* network_topology = dynamic_cast<const locator::network_topology_strategy*>(&replication_strategy);
auto resolve = [&] (const locator::topology& topology, const locator::host_id& ep, bool is_view) -> const locator::node& {
if (auto* np = topology.find_node(ep)) {
@@ -1944,7 +1922,7 @@ endpoints_to_update get_view_natural_endpoint(
// view pairing as the leaving base replica.
// note that the recursive call will not recurse again because leaving_base is in base_nodes.
auto leaving_base = it->get().host_id();
return get_view_natural_endpoint(leaving_base, base_erm, view_erm, replication_strategy, base_token,
return get_view_natural_endpoint(leaving_base, base_erm, view_erm, network_topology, base_token,
view_token, use_tablets, cf_stats);
}
}
@@ -2040,7 +2018,9 @@ future<> view_update_generator::mutate_MV(
wait_for_all_updates wait_for_all)
{
auto& ks = _db.find_keyspace(base->ks_name());
auto& replication = ks.get_replication_strategy();
const bool uses_tablets = ks.uses_tablets();
const bool uses_nts = dynamic_cast<const locator::network_topology_strategy*>(&ks.get_replication_strategy()) != nullptr;
// The object pointed to by `ks` may disappear after preemption. It should not be touched again after this comment.
std::unordered_map<table_id, locator::effective_replication_map_ptr> erms;
auto get_erm = [&] (table_id id) {
auto it = erms.find(id);
@@ -2059,8 +2039,8 @@ future<> view_update_generator::mutate_MV(
co_await max_concurrent_for_each(view_updates, max_concurrent_updates, [&] (frozen_mutation_and_schema mut) mutable -> future<> {
auto view_token = dht::get_token(*mut.s, mut.fm.key());
auto view_ermp = erms.at(mut.s->id());
auto [target_endpoint, no_pairing_endpoint] = get_view_natural_endpoint(me, base_ermp, view_ermp, replication, base_token, view_token,
ks.uses_tablets(), cf_stats);
auto [target_endpoint, no_pairing_endpoint] = get_view_natural_endpoint(me, base_ermp, view_ermp, uses_nts, base_token, view_token,
uses_tablets, cf_stats);
auto remote_endpoints = view_ermp->get_pending_replicas(view_token);
auto memory_units = seastar::make_lw_shared<db::timeout_semaphore_units>(pending_view_update_memory_units.split(memory_usage_of(mut)));
if (no_pairing_endpoint) {

View File

@@ -303,7 +303,7 @@ endpoints_to_update get_view_natural_endpoint(
locator::host_id node,
const locator::effective_replication_map_ptr& base_erm,
const locator::effective_replication_map_ptr& view_erm,
const locator::abstract_replication_strategy& replication_strategy,
const bool network_topology,
const dht::token& base_token,
const dht::token& view_token,
bool use_tablets,

View File

@@ -200,9 +200,7 @@ future<> view_building_worker::run_staging_sstables_registrator() {
while (!_as.abort_requested()) {
bool sleep = false;
try {
auto lock = co_await get_units(_staging_sstables_mutex, 1, _as);
co_await create_staging_sstable_tasks();
lock.return_all();
_as.check();
co_await _sstables_to_register_event.when();
} catch (semaphore_aborted&) {
@@ -227,13 +225,45 @@ future<> view_building_worker::run_staging_sstables_registrator() {
}
}
future<std::vector<foreign_ptr<semaphore_units<>>>> view_building_worker::lock_staging_mutex_on_multiple_shards(std::flat_set<shard_id> shards) {
SCYLLA_ASSERT(this_shard_id() == 0);
// Collect `_staging_sstables_mutex` locks from multiple shards,
// so other shards won't interact with their `_staging_sstables` map
// until the caller releases them.
std::vector<foreign_ptr<semaphore_units<>>> locks;
locks.resize(smp::count);
// Locks are acquired from multiple shards in parallel.
// This is the only place where multiple-shard locks are acquired at once
// and the method is called only once at a time (from `create_staging_sstable_tasks()`
// on shard 0), so no deadlock may occur.
co_await coroutine::parallel_for_each(shards, [&locks, &sharded_vbw = container()] (auto shard_id) -> future<> {
auto lock_ptr = co_await smp::submit_to(shard_id, [&sharded_vbw] () -> future<foreign_ptr<semaphore_units<>>> {
auto& vbw = sharded_vbw.local();
auto lock = co_await get_units(vbw._staging_sstables_mutex, 1, vbw._as);
co_return make_foreign(std::move(lock));
});
locks[shard_id] = std::move(lock_ptr);
});
co_return std::move(locks);
}
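The multi-shard locking pattern above can be sketched with a small asyncio analogue (illustrative names, not the Seastar API): per-shard locks are acquired in parallel, and deadlock is ruled out only by the single-caller guarantee the comment describes.

```python
import asyncio

async def lock_on_all_shards(locks, shard_ids):
    """Acquire one lock per shard in parallel (simplified analogue of
    lock_staging_mutex_on_multiple_shards). Safe against lock-order
    deadlocks only because a single caller runs this at a time."""
    async def acquire(shard):
        await locks[shard].acquire()
        return shard
    # Parallel acquisition: ordering does not matter with one caller.
    return await asyncio.gather(*(acquire(s) for s in shard_ids))

async def demo():
    locks = {s: asyncio.Lock() for s in range(4)}
    held = await lock_on_all_shards(locks, {1, 2, 3})
    states = [locks[s].locked() for s in range(4)]
    for s in held:
        locks[s].release()
    return states

states = asyncio.run(demo())  # shard 0 stays unlocked, as in the C++ code
```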
future<> view_building_worker::create_staging_sstable_tasks() {
// Explicitly lock shard0 beforehand to prevent other shards from modifying `_sstables_to_register` from `register_staging_sstable_tasks()`
auto lock0 = co_await get_units(_staging_sstables_mutex, 1, _as);
if (_sstables_to_register.empty()) {
co_return;
}
utils::chunked_vector<canonical_mutation> cmuts;
auto shards = _sstables_to_register
| std::views::values
| std::views::join
| std::views::transform([] (const auto& sst_info) { return sst_info.shard; })
| std::ranges::to<std::flat_set<shard_id>>();
shards.erase(0); // We're already holding shard0 lock
auto locks = co_await lock_staging_mutex_on_multiple_shards(std::move(shards));
utils::chunked_vector<canonical_mutation> cmuts;
auto guard = co_await _group0.client().start_operation(_as);
auto my_host_id = _db.get_token_metadata().get_topology().my_host_id();
for (auto& [table_id, sst_infos]: _sstables_to_register) {
@@ -672,24 +702,34 @@ future<> view_building_worker::do_build_range(table_id base_id, std::vector<tabl
}
future<> view_building_worker::do_process_staging(table_id table_id, dht::token last_token) {
if (_staging_sstables[table_id].empty()) {
auto table = _db.get_tables_metadata().get_table(table_id).shared_from_this();
std::vector<sstables::shared_sstable> sstables_to_process;
try {
// Acquire `_staging_sstables_mutex` to prevent `create_staging_sstable_tasks()` from
// concurrently modifying `_staging_sstables` (moving entries from `_sstables_to_register`)
// while we read them.
auto lock = co_await get_units(_staging_sstables_mutex, 1, _as);
auto& tablet_map = table->get_effective_replication_map()->get_token_metadata().tablets().get_tablet_map(table_id);
auto tid = tablet_map.get_tablet_id(last_token);
auto tablet_range = tablet_map.get_token_range(tid);
// Select sstables belonging to the tablet (identified by `last_token`)
for (auto& sst: _staging_sstables[table_id]) {
auto sst_last_token = sst->get_last_decorated_key().token();
if (tablet_range.contains(sst_last_token, dht::token_comparator())) {
sstables_to_process.push_back(sst);
}
}
lock.return_all();
} catch (semaphore_aborted&) {
vbw_logger.warn("Semaphore was aborted while waiting to remove processed sstables for table {}", table_id);
co_return;
}
auto table = _db.get_tables_metadata().get_table(table_id).shared_from_this();
auto& tablet_map = table->get_effective_replication_map()->get_token_metadata().tablets().get_tablet_map(table_id);
auto tid = tablet_map.get_tablet_id(last_token);
auto tablet_range = tablet_map.get_token_range(tid);
// Select sstables belonging to the tablet (identified by `last_token`)
std::vector<sstables::shared_sstable> sstables_to_process;
for (auto& sst: _staging_sstables[table_id]) {
auto sst_last_token = sst->get_last_decorated_key().token();
if (tablet_range.contains(sst_last_token, dht::token_comparator())) {
sstables_to_process.push_back(sst);
}
if (sstables_to_process.empty()) {
co_return;
}
co_await _vug.process_staging_sstables(std::move(table), sstables_to_process);
try {
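The sstable-selection step in `do_process_staging` can be sketched on plain integers, treating a tablet's token range as a half-open `(start, end]` interval (a simplification of `dht::token_range` and the Scylla types):

```python
def select_sstables_for_tablet(staging, tablet_range):
    """Pick sstables whose last token falls inside the tablet's token
    range. `staging` maps sstable name -> last token; `tablet_range` is
    a half-open (start, end] interval. Sketch only."""
    start, end = tablet_range
    return sorted(name for name, last_token in staging.items()
                  if start < last_token <= end)

# Example: a tablet owning tokens (100, 200]
picked = select_sstables_for_tablet(
    {"sst1": 50, "sst2": 150, "sst3": 200, "sst4": 250}, (100, 200))
```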

View File

@@ -14,6 +14,7 @@
#include <seastar/core/shared_future.hh>
#include <unordered_map>
#include <unordered_set>
#include <flat_set>
#include "locator/abstract_replication_strategy.hh"
#include "locator/tablets.hh"
#include "raft/raft.hh"
@@ -169,10 +170,15 @@ private:
future<> do_process_staging(table_id base_id, dht::token last_token);
future<> run_staging_sstables_registrator();
// Caller must hold units from `_staging_sstables_mutex`
// Acquires `_staging_sstables_mutex` on all shards internally,
// so callers must not hold `_staging_sstables_mutex` when invoking it.
future<> create_staging_sstable_tasks();
future<> discover_existing_staging_sstables();
std::unordered_map<table_id, std::vector<staging_sstable_task_info>> discover_local_staging_sstables(building_tasks building_tasks);
// Acquire `_staging_sstables_mutex` on multiple shards in parallel.
// Must be called only from shard 0.
// Must be called ONLY by `create_staging_sstable_tasks()` and only once at a time to avoid deadlock.
future<std::vector<foreign_ptr<semaphore_units<>>>> lock_staging_mutex_on_multiple_shards(std::flat_set<shard_id> shards);
void init_messaging_service();
future<> uninit_messaging_service();

View File

@@ -352,6 +352,16 @@ dht::partition_range_vector to_partition_ranges(const dht::token_range_vector& r
return prs;
}
future<utils::chunked_vector<dht::partition_range>> to_partition_ranges_chunked(const dht::token_range_vector& ranges) {
utils::chunked_vector<dht::partition_range> prs;
prs.reserve(ranges.size());
for (auto& range : ranges) {
prs.push_back(dht::to_partition_range(range));
co_await coroutine::maybe_yield();
}
co_return prs;
}
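The cooperative-yield pattern in `to_partition_ranges_chunked` has a straightforward asyncio analogue (`convert` stands in for `dht::to_partition_range`; a sketch, not the Scylla code):

```python
import asyncio

async def to_ranges_chunked(token_ranges, convert, yield_every=1):
    """Convert a potentially long list of ranges, periodically yielding
    to the scheduler, analogous to coroutine::maybe_yield() in the C++
    version so a long loop cannot monopolize the reactor."""
    out = []
    for i, r in enumerate(token_ranges):
        out.append(convert(r))
        if (i + 1) % yield_every == 0:
            await asyncio.sleep(0)  # let other tasks run
    return out

result = asyncio.run(to_ranges_chunked(range(5), lambda r: (r, r + 1)))
```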
std::map<unsigned, dht::partition_range_vector>
split_range_to_shards(dht::partition_range pr, const schema& s, const sharder& raw_sharder) {
std::map<unsigned, dht::partition_range_vector> ret;
@@ -364,11 +374,11 @@ split_range_to_shards(dht::partition_range pr, const schema& s, const sharder& r
return ret;
}
future<dht::partition_range_vector> subtract_ranges(const schema& schema, const dht::partition_range_vector& source_ranges, dht::partition_range_vector ranges_to_subtract) {
future<utils::chunked_vector<dht::partition_range>> subtract_ranges(const schema& schema, utils::chunked_vector<dht::partition_range> source_ranges, utils::chunked_vector<dht::partition_range> ranges_to_subtract) {
auto cmp = dht::ring_position_comparator(schema);
// optimize set of potentially overlapping ranges by deoverlapping them.
auto ranges = dht::partition_range::deoverlap(source_ranges, cmp);
dht::partition_range_vector res;
auto ranges = dht::partition_range::deoverlap(std::move(source_ranges), cmp);
utils::chunked_vector<dht::partition_range> res;
res.reserve(ranges.size() * 2);
auto range = ranges.begin();

View File

@@ -91,6 +91,7 @@ inline token get_token(const schema& s, partition_key_view key) {
dht::partition_range to_partition_range(dht::token_range);
dht::partition_range_vector to_partition_ranges(const dht::token_range_vector& ranges, utils::can_yield can_yield = utils::can_yield::no);
future<utils::chunked_vector<dht::partition_range>> to_partition_ranges_chunked(const dht::token_range_vector& ranges);
// Each shard gets a sorted, disjoint vector of ranges
std::map<unsigned, dht::partition_range_vector>
@@ -105,7 +106,7 @@ std::unique_ptr<dht::i_partitioner> make_partitioner(sstring name);
// Returns a sorted and deoverlapped list of ranges that are
// the result of subtracting all ranges from ranges_to_subtract.
// ranges_to_subtract must be sorted and deoverlapped.
future<dht::partition_range_vector> subtract_ranges(const schema& schema, const dht::partition_range_vector& ranges, dht::partition_range_vector ranges_to_subtract);
future<utils::chunked_vector<dht::partition_range>> subtract_ranges(const schema& schema, utils::chunked_vector<dht::partition_range> ranges, utils::chunked_vector<dht::partition_range> ranges_to_subtract);
// Returns a token_range vector split based on the given number of most-significant bits
dht::token_range_vector split_token_range_msb(unsigned most_significant_bits);
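The deoverlap-then-subtract contract of `subtract_ranges` can be illustrated on plain half-open integer intervals (a sketch; the real code operates on `dht::partition_range` with a ring-position comparator):

```python
def deoverlap(ranges):
    """Merge overlapping or adjacent half-open [start, end) intervals."""
    out = []
    for start, end in sorted(ranges):
        if out and start <= out[-1][1]:
            out[-1] = (out[-1][0], max(out[-1][1], end))
        else:
            out.append((start, end))
    return out

def subtract_ranges(ranges, to_subtract):
    """Subtract a sorted, deoverlapped interval list from `ranges`,
    mirroring the documented contract (integer intervals, sketch only)."""
    result = []
    for start, end in deoverlap(ranges):
        cur = start
        for s, e in to_subtract:
            if e <= cur or s >= end:
                continue  # no overlap with the remaining piece
            if s > cur:
                result.append((cur, s))
            cur = max(cur, e)
        if cur < end:
            result.append((cur, end))
    return result

left = subtract_ranges([(0, 10), (5, 20)], [(3, 7), (15, 25)])
```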

View File

@@ -30,6 +30,31 @@ enum class token_kind {
after_all_keys,
};
// Represents a token for partition keys.
// Has a disengaged state, which sorts before all engaged states.
struct raw_token {
int64_t value;
/// Constructs a disengaged token.
raw_token() : value(std::numeric_limits<int64_t>::min()) {}
/// Constructs an engaged token.
/// The token must be of token_kind::key kind.
explicit raw_token(const token&);
explicit raw_token(int64_t v) : value(v) {};
std::strong_ordering operator<=>(const raw_token& o) const noexcept = default;
std::strong_ordering operator<=>(const token& o) const noexcept;
/// Returns true iff engaged.
explicit operator bool() const noexcept {
return value != std::numeric_limits<int64_t>::min();
}
};
using raw_token_opt = seastar::optimized_optional<raw_token>;
class token {
// INT64_MIN is not a legal token, but a special value used to represent
// infinity in token intervals.
@@ -52,6 +77,10 @@ public:
constexpr explicit token(int64_t d) noexcept : token(kind::key, normalize(d)) {}
token(raw_token raw) noexcept
: token(raw ? kind::key : kind::before_all_keys, raw.value)
{ }
// This constructor seems redundant with the bytes_view constructor, but
// it's necessary for IDL, which passes a deserialized_bytes_proxy here.
// (deserialized_bytes_proxy is convertible to bytes&&, but not bytes_view.)
@@ -223,6 +252,29 @@ public:
}
};
inline
raw_token::raw_token(const token& t)
: value(t.raw())
{
#ifdef DEBUG
assert(t._kind == token::kind::key);
#endif
}
inline
std::strong_ordering raw_token::operator<=>(const token& o) const noexcept {
switch (o._kind) {
case token::kind::after_all_keys:
return std::strong_ordering::less;
case token::kind::before_all_keys:
// before_all_keys shares the same raw value as a disengaged raw_token,
// and both sort before all keys, so comparing raw values orders them correctly.
[[fallthrough]];
case token::kind::key:
return value <=> o._data;
}
}
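The sentinel trick behind `raw_token` can be sketched in a few lines (Python sketch; token kinds as strings rather than the C++ enum):

```python
INT64_MIN = -2**63

class RawToken:
    """Sketch of dht::raw_token: INT64_MIN doubles as the disengaged
    state, which sorts before every engaged token and coincides with
    the raw value of before_all_keys."""
    def __init__(self, value=INT64_MIN):
        self.value = value

    def __bool__(self):
        # Engaged iff not the sentinel value.
        return self.value != INT64_MIN

    def compare_to_token(self, kind, data):
        """Three-way compare against a token: -1, 0, or 1."""
        if kind == "after_all_keys":
            return -1  # after_all_keys sorts after everything
        # before_all_keys shares the sentinel raw value, so a plain
        # value comparison orders it correctly in both cases.
        return (self.value > data) - (self.value < data)

disengaged = RawToken()
engaged = RawToken(42)
```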
inline constexpr std::strong_ordering tri_compare_raw(const int64_t l1, const int64_t l2) noexcept {
if (l1 == l2) {
return std::strong_ordering::equal;
@@ -329,6 +381,17 @@ struct fmt::formatter<dht::token> : fmt::formatter<string_view> {
}
};
template <>
struct fmt::formatter<dht::raw_token> : fmt::formatter<string_view> {
template <typename FormatContext>
auto format(const dht::raw_token& t, FormatContext& ctx) const {
if (!t) {
return fmt::format_to(ctx.out(), "null");
}
return fmt::format_to(ctx.out(), "{}", t.value);
}
};
namespace std {
template<>

View File

@@ -9,6 +9,7 @@
import os
import sys
import shlex
import argparse
import psutil
from pathlib import Path
@@ -103,16 +104,41 @@ if __name__ == '__main__':
run('dd if=/dev/zero of={} bs=1M count={}'.format(swapfile, swapsize_mb), shell=True, check=True)
swapfile.chmod(0o600)
run('mkswap -f {}'.format(swapfile), shell=True, check=True)
mount_point = find_mount_point(swap_directory)
mount_unit = out(f'systemd-escape -p --suffix=mount {shlex.quote(str(mount_point))}')
# Add DefaultDependencies=no to the swap unit to avoid getting the default
# Before=swap.target dependency. We apply this to all clouds, but the
# requirement came from Azure:
#
# On Azure, the swap directory is on the Azure ephemeral disk (mounted on /mnt).
# However, cloud-init makes this mount (i.e., the mnt.mount unit) depend on
# the network (After=network-online.target). By extension, this means that
# the swap unit depends on the network. If we didn't use DefaultDependencies=no,
# then the swap unit would be part of the swap.target which other services
# assume to be a local boot target, so we would end up with dependency cycles
# such as:
#
# swap.target -> mnt-swapfile.swap -> mnt.mount -> network-online.target -> network.target -> systemd-resolved.service -> tmp.mount -> swap.target
#
# By removing the automatic Before=swap.target, the swap unit is no longer
# part of swap.target, avoiding such cycles. The swap will still be
# activated via WantedBy=multi-user.target.
unit_data = '''
[Unit]
Description=swapfile
DefaultDependencies=no
After={}
Conflicts=umount.target
Before=umount.target
[Swap]
What={}
[Install]
WantedBy=multi-user.target
'''[1:-1].format(swapfile)
'''[1:-1].format(mount_unit, swapfile)
with swapunit.open('w') as f:
f.write(unit_data)
systemd_unit.reload()
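Deriving the mount unit name from the mount point, as the script does with `systemd-escape -p --suffix=mount`, can be approximated for plain ASCII paths like this (a sketch that omits systemd's C-style character escaping):

```python
def path_to_mount_unit(path):
    """Simplified systemd-escape -p --suffix=mount: strip slashes,
    map '/' separators to '-'; the root path '/' becomes '-'.
    Real systemd-escape also escapes special characters, which this
    sketch omits."""
    trimmed = path.strip("/")
    return (trimmed.replace("/", "-") if trimmed else "-") + ".mount"

unit = path_to_mount_unit("/mnt")  # the Azure ephemeral-disk case
```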

View File

@@ -31,7 +31,7 @@ was used. Alternator currently supports two compression algorithms, `gzip`
and `deflate`, both standardized in [RFC 9110](https://www.rfc-editor.org/rfc/rfc9110.html).
Other standard compression types which are listed in
[IANA's HTTP Content Coding Registry](https://www.iana.org/assignments/http-parameters/http-parameters.xhtml#content-coding),
including `zstd` ([RFC 8878][https://www.rfc-editor.org/rfc/rfc8878.html]),
including `zstd` ([RFC 8878](https://www.rfc-editor.org/rfc/rfc8878.html)),
are not yet supported by Alternator.
Note that HTTP's compression only compresses the request's _body_ - not the

View File

@@ -281,8 +281,8 @@ For example::
ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;
Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key
or columns provided in a definition of the index.
Vector queries also support filtering with ``WHERE`` clauses on columns that are part of the primary key.
See :ref:`WHERE <where-clause>`.
For example::
@@ -290,10 +290,6 @@ For example::
WHERE user_id = 'user123'
ORDER BY embedding ANN OF [0.1, 0.2, 0.3, 0.4] LIMIT 5;
The supported operations are equal relations (``=`` and ``IN``) with restrictions as in regular ``WHERE`` clauses. See :ref:`WHERE <where-clause>`.
Other filtering scenarios are currently not supported.
.. note::
Vector indexes are supported in ScyllaDB Cloud only in clusters that have the Vector Search feature enabled.

View File

@@ -52,7 +52,7 @@ Install ScyllaDB
.. code-block:: console
:substitutions:
sudo wget -O /etc/apt/sources.list.d/scylla.list http://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|
sudo wget -O /etc/apt/sources.list.d/scylla.list https://downloads.scylladb.com/deb/debian/|UBUNTU_SCYLLADB_LIST|
#. Install ScyllaDB packages.
@@ -125,7 +125,7 @@ Install ScyllaDB
.. code-block:: console
:substitutions:
sudo curl -o /etc/yum.repos.d/scylla.repo -L http://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|
sudo curl -o /etc/yum.repos.d/scylla.repo -L https://downloads.scylladb.com/rpm/centos/|CENTOS_SCYLLADB_REPO|
#. Install ScyllaDB packages.
@@ -133,19 +133,19 @@ Install ScyllaDB
sudo yum install scylla
Running the command installs the latest official version of ScyllaDB Open Source.
Alternatively, you can to install a specific patch version:
Running the command installs the latest official version of ScyllaDB.
Alternatively, you can install a specific patch version:
.. code-block:: console
sudo yum install scylla-<your patch version>
Example: The following example shows the command to install ScyllaDB 5.2.3.
Example: The following example shows installing ScyllaDB 2025.3.1.
.. code-block:: console
:class: hide-copy-button
sudo yum install scylla-5.2.3
sudo yum install scylla-2025.3.1
.. include:: /getting-started/_common/setup-after-install.rst

View File

@@ -36,11 +36,8 @@ release versions, run:
curl -sSf get.scylladb.com/server | sudo bash -s -- --list-active-releases
Versions 2025.1 and Later
==============================
Run the command with the ``--scylla-version`` option to specify the version
you want to install.
To install a non-default version, run the command with the ``--scylla-version``
option to specify the version you want to install.
**Example**
@@ -50,20 +47,4 @@ you want to install.
curl -sSf get.scylladb.com/server | sudo bash -s -- --scylla-version |CURRENT_VERSION|
Versions Earlier than 2025.1
================================
To install a supported version of *ScyllaDB Enterprise*, run the command with:
* ``--scylla-product scylla-enterprise`` to specify that you want to install
ScyllaDB Entrprise.
* ``--scylla-version`` to specify the version you want to install.
For example:
.. code:: console
curl -sSf get.scylladb.com/server | sudo bash -s -- --scylla-product scylla-enterprise --scylla-version 2024.1
.. include:: /getting-started/_common/setup-after-install.rst

View File

@@ -0,0 +1,492 @@
=================================================
Cluster Platform Migration Using Node Cycling
=================================================
This procedure describes how to migrate a ScyllaDB cluster to new instance types
using the add-and-replace approach, which is commonly used for:
* Migrating from one CPU architecture to another (e.g., x86_64 to ARM/Graviton)
* Upgrading to newer instance types with better performance
* Changing instance families within the same cloud provider
The add-and-replace approach maintains data replication throughout the migration
and ensures zero downtime for client applications.
.. note::
This procedure does **not** change the ScyllaDB software version. All nodes
(both existing and new) must run the same ScyllaDB version. For software
version upgrades, see :doc:`Upgrade </upgrade/index>`.
Overview
--------
The add-and-replace migration follows these steps:
#. Add new nodes (on target instance type) to the existing cluster
#. Wait for data to stream to the new nodes
#. Decommission old nodes (on source instance type)
This approach keeps the cluster operational throughout the migration while
maintaining the configured replication factor.
Key characteristics
===================
* **Zero downtime**: Client applications continue to operate during migration
* **Data safety**: Replication factor is maintained throughout the process
* **Flexible**: Works with both vnodes and tablets-enabled clusters
* **Multi-DC support**: Can migrate nodes across multiple datacenters
.. warning::
Ensure your cluster has sufficient capacity during the migration. At the peak
of the process, your cluster will temporarily have double the number of nodes.
Prerequisites
-------------
Check cluster health
====================
Before starting the migration, verify that your cluster is healthy:
#. Check that all nodes are in Up Normal (UN) status:
.. code-block:: shell
nodetool status
All nodes should show ``UN`` status. Do not proceed if any nodes are down.
#. Ensure no streaming or repair operations are in progress:
.. code-block:: shell
nodetool netstats
nodetool compactionstats
Plan the migration
==================
Before provisioning new instances, plan the following:
**Instance type mapping**: Identify the source and target instance types.
If your cluster uses vnodes (not tablets), consider that mismatched shard
counts between source and target instance types can cause slower repairs.
With tablets enabled, shard count mismatch is fully supported.
**Rack assignment planning**: Each new node must be assigned to the same rack
as the node it will replace. This maintains rack-aware topology for:
* Rack-aware replication (NetworkTopologyStrategy)
* Proper data distribution across failure domains
* Minimizing data movement during decommission
Example mapping for a 3-node cluster:
.. code-block:: none
Source nodes (to be decommissioned): Target nodes (to be added):
192.168.1.10 - RACK0 → 192.168.2.10 - RACK0
192.168.1.11 - RACK1 → 192.168.2.11 - RACK1
192.168.1.12 - RACK2 → 192.168.2.12 - RACK2
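A planned mapping like the one above can be sanity-checked with a few lines of Python before provisioning (hypothetical helper; node-to-rack dicts keyed by IP, old and new nodes paired by sort order):

```python
def validate_rack_mapping(source, target):
    """Verify each replacement node lands in the same rack as the node
    it replaces; returns the list of mismatched pairs (sketch only)."""
    mismatches = []
    for (old_ip, old_rack), (new_ip, new_rack) in zip(
            sorted(source.items()), sorted(target.items())):
        if old_rack != new_rack:
            mismatches.append((old_ip, old_rack, new_ip, new_rack))
    return mismatches

bad = validate_rack_mapping(
    {"192.168.1.10": "RACK0", "192.168.1.11": "RACK1"},
    {"192.168.2.10": "RACK0", "192.168.2.11": "RACK2"})
```

A non-empty result means the plan would break rack-aware topology and should be corrected before adding nodes.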
Create a backup
===============
Back up the data before starting the migration. One of the following
methods can be used:
* **ScyllaDB Manager** (recommended): Use ScyllaDB Manager to perform a
cluster-wide backup. See the
`ScyllaDB Manager documentation <https://manager.docs.scylladb.com/stable/backup/>`_
for details.
* **Snapshots**: On each node in the cluster, create a snapshot:
.. code-block:: shell
nodetool snapshot -t pre_migration_backup
nodetool listsnapshots
.. note::
Snapshots are local to each node and do not protect against node or disk
failure. For full disaster recovery, use ScyllaDB Manager backup.
Procedure
---------
Adding new nodes
================
#. Provision new instances with the target instance type. Ensure:
* The same ScyllaDB version as existing nodes
* Same network configuration and security groups
* Appropriate storage configuration
#. On each new node, configure ``/etc/scylla/scylla.yaml`` to join the existing
cluster:
* **cluster_name**: Must match the existing cluster name
* **seeds**: IP address of an existing node in the cluster (used to discover cluster topology on join)
* **endpoint_snitch**: Must match the existing cluster configuration
* **listen_address**: IP address of the new node
* **rpc_address**: IP address of the new node
All other cluster-wide settings (tablets configuration, encryption settings,
experimental features, etc.) must match the existing nodes.
.. caution::
Make sure that the ScyllaDB version on the new node is identical to the
version on the other nodes in the cluster. Running nodes with different
versions is not supported.
#. If using ``GossipingPropertyFileSnitch``, configure
``/etc/scylla/cassandra-rackdc.properties`` with the correct datacenter
and rack assignment for this node:
.. code-block:: none
dc = <datacenter-name>
rack = <rack-name>
prefer_local = true
.. warning::
Each node must have the correct rack assignment. Using the same rack for
all new nodes breaks rack-aware replication topology.
#. Start ScyllaDB on the new node:
.. code-block:: shell
sudo systemctl start scylla-server
For Docker deployments:
.. code-block:: shell
docker exec -it <container-name> supervisorctl start scylla
#. Monitor the bootstrap process from an existing node:
.. code-block:: shell
nodetool status
The new node will appear with ``UJ`` (Up, Joining) status while streaming
data from existing nodes. Wait until it transitions to ``UN`` (Up, Normal).
**Example output during bootstrap:**
.. code-block:: shell
Datacenter: dc1
Status=Up/Down
State=Normal/Leaving/Joining/Moving
-- Address Load Tokens Owns Host ID Rack
UN 192.168.1.10 500 MB 256 33.3% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c RACK0
UN 192.168.1.11 500 MB 256 33.3% 125ed9f4-7777-1dbn-mac8-43fddce9123e RACK1
UN 192.168.1.12 500 MB 256 33.3% 675ed9f4-6564-6dbd-can8-43fddce952gy RACK2
UJ 192.168.2.10 250 MB 256 ? a1b2c3d4-5678-90ab-cdef-112233445566 RACK0
**Example output after bootstrap completes:**
.. code-block:: shell
Datacenter: dc1
Status=Up/Down
State=Normal/Leaving/Joining/Moving
-- Address Load Tokens Owns Host ID Rack
UN 192.168.1.10 400 MB 256 25.0% 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c RACK0
UN 192.168.1.11 400 MB 256 25.0% 125ed9f4-7777-1dbn-mac8-43fddce9123e RACK1
UN 192.168.1.12 400 MB 256 25.0% 675ed9f4-6564-6dbd-can8-43fddce952gy RACK2
UN 192.168.2.10 400 MB 256 25.0% a1b2c3d4-5678-90ab-cdef-112233445566 RACK0
#. For tablets-enabled clusters, wait for tablet load balancing to complete.
After the node reaches ``UN`` status, verify no streaming is in progress:
.. code-block:: shell
nodetool netstats
Wait until output shows "Not sending any streams" and no active receiving streams.
#. Repeat steps 1-6 for each new node to be added.
.. note::
You can add multiple nodes in parallel if they are in different datacenters.
Within a single datacenter, add nodes one at a time for best results.
Updating seed node configuration
================================
If any of your original nodes are configured as seed nodes, you must update
the seed configuration before decommissioning them.
#. Check the current seed configuration on any node:
.. code-block:: shell
grep -A 4 "seed_provider" /etc/scylla/scylla.yaml
#. If the seeds include nodes you plan to decommission, update ``scylla.yaml``
on **all new nodes** to use the new node IPs as seeds:
.. code-block:: yaml
seed_provider:
- class_name: org.apache.cassandra.locator.SimpleSeedProvider
parameters:
- seeds: "192.168.2.10,192.168.2.11,192.168.2.12"
.. note::
Updating seed configuration on the **old nodes** (that will be
decommissioned) is optional. Seeds are only used during node startup
to discover the cluster. If you don't plan to restart the old nodes
before decommissioning them, their seed configuration doesn't matter.
However, updating all nodes is recommended for safety in case an old
node unexpectedly restarts during the migration.
#. Restart ScyllaDB on each new node (one at a time) to apply the new seed
configuration:
.. code-block:: shell
sudo systemctl restart scylla-server
Wait for the node to fully start before restarting the next node.
#. After restarting the new nodes, verify the cluster is healthy:
.. code-block:: shell
nodetool status
nodetool describecluster
.. warning::
Complete this seed list update on **all new nodes** before decommissioning
any old nodes. This ensures the new nodes can reform the cluster after
the old nodes are removed.
Decommissioning old nodes
=========================
After all new nodes are added and healthy, decommission the old nodes one
at a time.
#. Verify all nodes are healthy before starting decommission:
.. code-block:: shell
nodetool status
All nodes should show ``UN`` status.
#. On the node to be decommissioned, run:
.. code-block:: shell
nodetool decommission
This command blocks until the decommission is complete. The node will
stream its data to the remaining nodes.
#. Monitor the decommission progress from another node:
.. code-block:: shell
nodetool status
The decommissioning node will transition from ``UN`` → ``UL`` (Up, Leaving)
→ removed from the cluster.
You can also monitor streaming progress:
.. code-block:: shell
nodetool netstats
#. After decommission completes, verify the node is no longer in the cluster:
.. code-block:: shell
nodetool status
The decommissioned node should no longer appear in the output.
#. Run ``nodetool cleanup`` on the remaining nodes to remove data that
no longer belongs to them after the topology change:
.. code-block:: shell
nodetool cleanup
.. note::
``nodetool cleanup`` can be resource-intensive. Run it on one node at a
time during low-traffic periods.
#. Wait for the cluster to stabilize before decommissioning the next node.
Ensure no streaming operations are in progress.
#. Repeat steps 1-7 for each old node to be decommissioned.
Post-migration verification
---------------------------
After all old nodes are decommissioned, verify the migration was successful.
Verify cluster topology
=======================
.. code-block:: shell
nodetool status
Confirm:
* All nodes show ``UN`` (Up, Normal) status
* Only the new instance type nodes are present
* Nodes are balanced across racks
Verify schema agreement
=======================
.. code-block:: shell
nodetool describecluster
All nodes should report the same schema version.
Verify data connectivity
========================
Connect to the cluster and run a test query:
.. code-block:: shell
cqlsh <node-ip> -e "SELECT count(*) FROM system_schema.keyspaces;"
.. note::
If ScyllaDB is configured with ``listen_interface``, you must use the
node's interface IP address (not localhost) for cqlsh connections.
Verify ScyllaDB version
=======================
Confirm all nodes are running the same ScyllaDB version:
.. code-block:: shell
scylla --version
Verify data integrity (optional)
================================
Run data validation on each keyspace to verify sstable integrity:
.. code-block:: shell
nodetool scrub --mode=VALIDATE <keyspace_name>
Rollback
--------
If issues occur during the migration, you can roll back by reversing the
procedure.
During add phase
================
If a new node fails to bootstrap:
#. Stop ScyllaDB on the new node:
.. code-block:: shell
sudo systemctl stop scylla-server
#. From an existing node, remove the failed node:
.. code-block:: shell
nodetool removenode <host-id-of-failed-node>
During decommission phase
=========================
If a decommission operation gets stuck:
#. If the node is still reachable, try stopping and restarting ScyllaDB
#. If the node is unresponsive, from another node:
.. code-block:: shell
nodetool removenode <host-id>
See :doc:`Remove a Node from a ScyllaDB Cluster </operating-scylla/procedures/cluster-management/remove-node>`
for more details.
Full rollback
=============
To roll back after the migration is complete (all nodes on new instance type),
apply the same add-and-replace procedure in reverse:
#. Add new nodes on the original instance type
#. Wait for data streaming to complete
#. Decommission the nodes on the new instance type
Troubleshooting
---------------
Node stuck in Joining (UJ) state
================================
If a new node remains in ``UJ`` state for an extended period:
* Check ScyllaDB logs for streaming errors: ``journalctl -u scylla-server``
* Verify network connectivity between nodes
* Ensure sufficient disk space on all nodes
* Check for any ongoing operations that may be blocking
Decommission taking too long
============================
Decommission duration depends on data size. If it appears stuck:
* Check streaming progress: ``nodetool netstats``
* Look for errors in ScyllaDB logs
* Verify network bandwidth between nodes
Schema disagreement
===================
If nodes report different schema versions:
* Wait a few minutes for schema to propagate
* If disagreement persists, restart the nodes one by one
* Run ``nodetool describecluster`` to verify agreement
Additional resources
--------------------
* :doc:`Adding a New Node Into an Existing ScyllaDB Cluster </operating-scylla/procedures/cluster-management/add-node-to-cluster>`
* :doc:`Remove a Node from a ScyllaDB Cluster </operating-scylla/procedures/cluster-management/remove-node>`
* :doc:`Replace a Running Node in a ScyllaDB Cluster </operating-scylla/procedures/cluster-management/replace-running-node>`
* :doc:`Upgrade </upgrade/index>`

View File

@@ -26,6 +26,7 @@ Cluster Management Procedures
Safely Restart Your Cluster <safe-start>
repair-based-node-operation
Prevent Quorum Loss in Symmetrical Multi-DC Clusters <arbiter-dc>
Cluster Platform Migration <cluster-platform-migration>
.. panel-box::
@@ -85,6 +86,8 @@ Cluster Management Procedures
* :doc:`Preventing Quorum Loss in Symmetrical Multi-DC Clusters <arbiter-dc>`
* :doc:`Cluster Platform Migration Using Node Cycling </operating-scylla/procedures/cluster-management/cluster-platform-migration>`
.. panel-box::
:title: Topology Changes
:id: "getting-started"

View File

@@ -57,12 +57,11 @@ To enable shared dictionaries:
internode_compression_enable_advanced: true
rpc_dict_training_when: when_leader
.. warning:: Enabling shared dictionary training might leak unencrypted data to disk.
.. note::
Trained dictionaries contain randomly chosen samples of data transferred between
nodes. The data samples are persisted in the Raft log, which is not encrypted.
As a result, some data from otherwise encrypted tables might be stored on disk
unencrypted.
Some dictionary training data may be encrypted using storage-level encryption
(if enabled) instead of database-level encryption, meaning protection is
applied at the storage layer rather than within the database itself.
Reference

View File

@@ -27,6 +27,16 @@ This configuration takes the form of a query template which is defined in the sc
The value of ``ldap_url_template`` parameter should contain a valid LDAP URL (e.g., as returned by the ldapurl utility from OpenLDAP) representing an LDAP query that returns entries for all the user's roles.
Scylla will replace the text ``{USER}`` in the URL with the user's Scylla username before querying LDAP.
.. note:: Usernames substituted into ``{USER}`` are automatically escaped
using RFC 4515 filter escaping and URL percent-encoding, so LDAP filter
metacharacters (``*``, ``(``, ``)``, ``\``, NUL) and URL metacharacters
(``%``, ``?``, ``#``) in usernames are handled safely.
``{USER}`` must appear only in the **filter** component of the LDAP URL
(the part after the third ``?``). Templates that place ``{USER}`` in the
host, base DN, attributes, or extensions are rejected at startup, because
filter escaping is not the correct encoding for those components.
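The two encoding layers and the startup template check can be illustrated with a short Python sketch (an approximation for illustration only; Scylla's actual implementation is C++, and these helper names are hypothetical):

```python
from urllib.parse import quote

def escape_rfc4515(user: str) -> str:
    # RFC 4515: '*', '(', ')', '\' and NUL become backslash-hex escapes.
    return ''.join('\\%02x' % ord(c) if c in '*()\\\x00' else c
                   for c in user)

def substitute_user(template: str, user: str) -> str:
    # Percent-encode after filter escaping so '%', '?' and '#' cannot
    # break URL component splitting or undo the filter escaping.
    return template.replace('{USER}', quote(escape_rfc4515(user), safe=''))

def validate_query_template(template: str) -> None:
    # {USER} may appear only in the filter component, i.e. after the
    # third '?' of the LDAP URL (dn ? attributes ? scope ? filter).
    if '{USER}' not in template:
        return
    parts = template.split('?', 3)
    if len(parts) < 4 or any('{USER}' in p for p in parts[:3]):
        raise ValueError('{USER} must appear only in the filter component')
```

A username such as ``*)(cn=*)`` comes out with every metacharacter escaped, and a template like ``ldap://{USER}.example.com/...`` is rejected up front.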
Workflow
--------

View File

@@ -5,6 +5,7 @@ Upgrade ScyllaDB
.. toctree::
ScyllaDB 2025.x to ScyllaDB 2026.1 <upgrade-guide-from-2025.x-to-2026.1/index>
ScyllaDB 2026.x Patch Upgrades <upgrade-guide-from-2026.x.y-to-2026.x.z>
ScyllaDB Image <ami-upgrade>

View File

@@ -0,0 +1,268 @@
.. |SCYLLA_NAME| replace:: ScyllaDB
.. |SRC_VERSION| replace:: 2026.x.y
.. |NEW_VERSION| replace:: 2026.x.z
==========================================================================
Upgrade - |SCYLLA_NAME| |SRC_VERSION| to |NEW_VERSION| (Patch Upgrades)
==========================================================================
This document describes a step-by-step procedure for upgrading from
|SCYLLA_NAME| |SRC_VERSION| to |SCYLLA_NAME| |NEW_VERSION| (where "z" is
the latest available version), and rolling back to version |SRC_VERSION|
if necessary.
This guide covers upgrading ScyllaDB on Red Hat Enterprise Linux (RHEL),
CentOS, Debian, and Ubuntu.
See `OS Support by Platform and Version <https://docs.scylladb.com/stable/versioning/os-support-per-version.html>`_
for information about supported versions.
It also applies to the ScyllaDB official image on EC2, GCP, or Azure.
See `Upgrade Policy <https://docs.scylladb.com/stable/versioning/upgrade-policy.html>`_ for the ScyllaDB upgrade policy.
Upgrade Procedure
=================
.. note::
Apply the following procedure **serially** on each node. Do not move to the next
node before validating that the node is up and running the new version.
A ScyllaDB upgrade is a rolling procedure that does **not** require a full cluster
shutdown. For each of the nodes in the cluster, you will:
#. Drain the node and back up the data.
#. Back up the configuration file.
#. Stop ScyllaDB.
#. Download and install new ScyllaDB packages.
#. Start ScyllaDB.
#. Validate that the upgrade was successful.
**Before** upgrading, check which version you are running now using
``scylla --version``. Note the current version in case you want to roll back
the upgrade.
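For reference, a patch upgrade changes only the last version component; a minimal Python check of the two recorded versions (illustrative only, not an official tool):

```python
def is_patch_upgrade(src: str, new: str) -> bool:
    # Patch upgrades (e.g. 2026.1.2 -> 2026.1.3) keep the first two
    # version components and change only the last one.
    s, n = src.split('.'), new.split('.')
    return len(s) == len(n) == 3 and s[:2] == n[:2] and s[2] != n[2]
```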
**During** the rolling upgrade it is highly recommended:
* Not to use new |NEW_VERSION| features.
* Not to run administration functions, such as repair, refresh, rebuild, or adding
or removing nodes. See
`sctool <https://manager.docs.scylladb.com/stable/sctool/>`_ for suspending
ScyllaDB Manager's scheduled or running repairs.
* Not to apply schema changes.
Upgrade Steps
=============
Back up the data
------------------------------
Back up all the data to an external device. We recommend using
`ScyllaDB Manager <https://manager.docs.scylladb.com/stable/backup/index.html>`_
to create backups.
Alternatively, you can use the ``nodetool snapshot`` command.
For **each** node in the cluster, run the following:
.. code:: sh
nodetool drain
nodetool snapshot
Take note of the directory name that nodetool gives you, and copy all
the directories with this name under ``/var/lib/scylla`` to a backup device.
When the upgrade is completed on all nodes, remove the snapshot with the
``nodetool clearsnapshot -t <snapshot>`` command to prevent running out of
space.
Back up the configuration file
------------------------------
Back up the ``scylla.yaml`` configuration file and the ScyllaDB packages
in case you need to roll back the upgrade.
.. tabs::
.. group-tab:: Debian/Ubuntu
.. code:: sh
sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
sudo cp /etc/apt/sources.list.d/scylla.list ~/scylla.list-backup
.. group-tab:: RHEL/CentOS
.. code:: sh
sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup
sudo cp /etc/yum.repos.d/scylla.repo ~/scylla.repo-backup
Gracefully stop the node
------------------------
.. code:: sh
sudo service scylla-server stop
Download and install the new release
------------------------------------
You don't need to update the ScyllaDB DEB or RPM repo when you upgrade to
a patch release.
.. tabs::
.. group-tab:: Debian/Ubuntu
To install a patch version on Debian or Ubuntu, run:
.. code:: sh
sudo apt-get clean all
sudo apt-get update
sudo apt-get dist-upgrade scylla
Answer y to the first two questions.
.. group-tab:: RHEL/CentOS
To install a patch version on RHEL or CentOS, run:
.. code:: sh
sudo yum clean all
sudo yum update scylla\* -y
.. group-tab:: EC2/GCP/Azure Ubuntu Image
If you're using the ScyllaDB official image (recommended), see
the **Debian/Ubuntu** tab for upgrade instructions.
If you're using your own image and have installed ScyllaDB packages for
Ubuntu or Debian, you need to apply an extended upgrade procedure:
#. Install the new ScyllaDB version with the additional
``scylla-machine-image`` package:
.. code-block:: console
sudo apt-get clean all
sudo apt-get update
sudo apt-get dist-upgrade scylla
sudo apt-get dist-upgrade scylla-machine-image
#. Run ``scylla_setup`` without running ``io_setup``.
#. Run ``sudo /opt/scylladb/scylla-machine-image/scylla_cloud_io_setup``.
Start the node
--------------
.. code:: sh
sudo service scylla-server start
Validate
--------
#. Check cluster status with ``nodetool status`` and make sure **all** nodes,
including the one you just upgraded, are in UN status.
#. Use ``curl -X GET "http://localhost:10000/storage_service/scylla_release_version"``
to check the ScyllaDB version.
#. Use ``journalctl _COMM=scylla`` to check there are no new errors in the log.
#. Check again after 2 minutes to validate that no new issues are introduced.
Once you are sure the node upgrade is successful, move to the next node in
the cluster.
Rollback Procedure
==================
The following procedure describes a rollback from ScyllaDB release
|NEW_VERSION| to |SRC_VERSION|. Apply this procedure if an upgrade from
|SRC_VERSION| to |NEW_VERSION| failed before completing on all nodes.
* Use this procedure only on nodes you upgraded to |NEW_VERSION|.
* Execute the following commands one node at a time, moving to the next node only
after the rollback procedure is completed successfully.
ScyllaDB rollback is a rolling procedure that does **not** require a full
cluster shutdown. For each of the nodes to roll back to |SRC_VERSION|, you will:
#. Drain the node and stop ScyllaDB.
#. Downgrade to the previous release.
#. Restore the configuration file.
#. Restart ScyllaDB.
#. Validate the rollback success.
Rollback Steps
==============
Gracefully shut down ScyllaDB
-----------------------------
.. code:: sh
nodetool drain
sudo service scylla-server stop
Downgrade to the previous release
----------------------------------
.. tabs::
.. group-tab:: Debian/Ubuntu
To downgrade to |SRC_VERSION| on Debian or Ubuntu, run:
.. code-block:: console
:substitutions:
sudo apt-get install scylla=|SRC_VERSION|\* scylla-server=|SRC_VERSION|\* scylla-tools=|SRC_VERSION|\* scylla-tools-core=|SRC_VERSION|\* scylla-kernel-conf=|SRC_VERSION|\* scylla-conf=|SRC_VERSION|\*
Answer y to the first two questions.
.. group-tab:: RHEL/CentOS
To downgrade to |SRC_VERSION| on RHEL or CentOS, run:
.. code-block:: console
:substitutions:
sudo yum downgrade scylla\*-|SRC_VERSION|-\* -y
.. group-tab:: EC2/GCP/Azure Ubuntu Image
If you're using the ScyllaDB official image (recommended), see
the **Debian/Ubuntu** tab for downgrade instructions.
If you're using your own image and have installed ScyllaDB packages for
Ubuntu or Debian, you need to additionally downgrade
the ``scylla-machine-image`` package.
.. code-block:: console
:substitutions:
sudo apt-get install scylla=|SRC_VERSION|\* scylla-server=|SRC_VERSION|\* scylla-tools=|SRC_VERSION|\* scylla-tools-core=|SRC_VERSION|\* scylla-kernel-conf=|SRC_VERSION|\* scylla-conf=|SRC_VERSION|\*
sudo apt-get install scylla-machine-image=|SRC_VERSION|\*
Answer y to the first two questions.
Restore the configuration file
------------------------------
.. code:: sh
sudo rm -rf /etc/scylla/scylla.yaml
sudo cp -a /etc/scylla/scylla.yaml.backup /etc/scylla/scylla.yaml
Start the node
--------------
.. code:: sh
sudo service scylla-server start
Validate
--------
Follow the validation steps from the upgrade procedure above. Once you are sure
the node rollback is successful, move to the next node in the cluster.

View File

@@ -727,7 +727,12 @@ public:
// now we need one page more to be able to save one for next lap
auto fill_size = align_up(buf1.size(), block_size) + block_size - buf1.size();
auto buf2 = co_await _input.read_exactly(fill_size);
// If the underlying stream is already at EOF (e.g. buf1 came from
// cached _next while the previous read_exactly drained the source),
// skip the read_exactly call — it would return empty anyway.
auto buf2 = _input.eof()
? temporary_buffer<char>()
: co_await _input.read_exactly(fill_size);
temporary_buffer<char> output(buf1.size() + buf2.size());

View File

@@ -437,7 +437,6 @@ void ldap_connection::poll_results() {
const auto found = _msgid_to_promise.find(id);
if (found == _msgid_to_promise.end()) {
mylog.error("poll_results: got valid result for unregistered id {}, dropping it", id);
ldap_msgfree(result);
} else {
found->second.set_value(std::move(result_ptr));
_msgid_to_promise.erase(found);

View File

@@ -42,7 +42,14 @@ void everywhere_replication_strategy::validate_options(const gms::feature_servic
sstring everywhere_replication_strategy::sanity_check_read_replicas(const effective_replication_map& erm, const host_id_vector_replica_set& read_replicas) const {
const auto replication_factor = erm.get_replication_factor();
if (read_replicas.size() > replication_factor) {
if (const auto& topo_info = erm.get_token_metadata().get_topology_change_info(); topo_info && topo_info->read_new) {
if (read_replicas.size() > replication_factor + 1) {
return seastar::format(
"everywhere_replication_strategy: the number of replicas for everywhere_replication_strategy is {}, "
"cannot be higher than replication factor {} + 1 during the 'read from new replicas' stage of a topology change",
read_replicas.size(), replication_factor);
}
} else if (read_replicas.size() > replication_factor) {
return seastar::format("everywhere_replication_strategy: the number of replicas for everywhere_replication_strategy is {}, cannot be higher than replication factor {}", read_replicas.size(), replication_factor);
}
return {};
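The relaxed bound introduced above can be summarized in a few lines (a Python sketch of the check's logic, not the actual C++):

```python
def sanity_check_read_replicas(n_read: int, rf: int,
                               reading_new_replicas: bool):
    """Return an error string when the read replica count is out of bounds.

    During the 'read from new replicas' stage of a topology change, one
    extra replica beyond the replication factor is tolerated; otherwise
    the replication factor itself is the hard limit.
    """
    limit = rf + 1 if reading_new_replicas else rf
    if n_read > limit:
        return f"{n_read} read replicas exceeds the allowed limit {limit}"
    return None
```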

View File

@@ -261,7 +261,7 @@ static collection_mutation serialize_collection_mutation(
writev(v.serialize());
}
return collection_mutation(type, ret);
return collection_mutation(type, std::move(ret));
}
collection_mutation collection_mutation_description::serialize(const abstract_type& type) const {

View File

@@ -103,7 +103,7 @@ future<std::optional<tasks::task_status>> node_ops_virtual_task::get_status(task
.entity = stats.entity,
.progress_units = "",
.progress = tasks::task_manager::task::progress{},
.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()))
.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr())
};
}

View File

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:52c9772c9ac334650d8b179b591c47769ee38d34fad784b61c682e11c03f2506
size 6530196
oid sha256:762ffcd253ff9a784fc58e36e1cbe83643e3fe576ac60eb1ce6e4bf8ac2eda8c
size 6548000

View File

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d1a869ebfe4e90d9681499246eb86bb032ae402c350357e19d97b989037a5bd3
size 6528308
oid sha256:3f788e2b36a4b87328997c60f0903e197bd193f977e02b5fc8888d79c364e21d
size 6540076

View File

@@ -1101,6 +1101,18 @@ future<> server_impl::process_fsm_output(index_t& last_stable, fsm_output&& batc
// case.
co_await _persistence->store_term_and_vote(batch.term_and_vote->first, batch.term_and_vote->second);
_stats.store_term_and_vote++;
// When the term advances, any in-flight snapshot transfers
// belong to an outdated term: the progress tracker has been
// reset in become_leader() or we are now a follower.
// Abort them before we dispatch this batch's messages, which
// may start fresh transfers for the new term.
//
// A vote may also change independently of the term (e.g. a
// follower voting for a candidate at the same term), but in
// that case there are no in-flight transfers and the abort
// is a no-op.
abort_snapshot_transfers();
}
if (batch.snp) {
@@ -1210,8 +1222,6 @@ future<> server_impl::process_fsm_output(index_t& last_stable, fsm_output&& batc
// quickly) stop happening (we're outside the config after all).
co_await _apply_entries.push_eventually(removed_from_config{});
}
// request aborts of snapshot transfers
abort_snapshot_transfers();
// abort all read barriers
for (auto& r : _reads) {
r.promise.set_value(not_a_leader{_fsm->current_leader()});

View File

@@ -1021,8 +1021,8 @@ void reader_concurrency_semaphore::signal(const resources& r) noexcept {
on_internal_error_noexcept(rcslog,
format("reader_concurrency_semaphore::signal(): semaphore {} detected resource leak, available {} exceeds initial {}", _name,
_resources, _initial_resources));
_resources.count = std::max(_resources.count, _initial_resources.count);
_resources.memory = std::max(_resources.memory, _initial_resources.memory);
_resources.count = std::min(_resources.count, _initial_resources.count);
_resources.memory = std::min(_resources.memory, _initial_resources.memory);
}
maybe_wake_execution_loop();
}
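The direction of the clamp is the point of this fix: on a detected leak, `_resources` exceeds `_initial_resources`, so restoring the invariant requires clamping down with `min`; the previous `max` left the leaked, larger value untouched. A Python sketch of the corrected behavior:

```python
def clamp_after_leak(available: int, initial: int) -> int:
    # A resource leak manifests as available > initial; restore the
    # invariant available <= initial by clamping down. Using max()
    # here (the old code) would be a no-op on the leaked value.
    return min(available, initial)
```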

View File

@@ -432,7 +432,9 @@ public:
// refresh_mutation_source must be called when there are changes to data source
// structures but logical state of data is not changed (e.g. when state for a
// new tablet replica is allocated).
virtual void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) = 0;
virtual void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
const locator::effective_replication_map& erm,
noncopyable_function<void()> refresh_mutation_source) = 0;
virtual compaction_group& compaction_group_for_token(dht::token token) const = 0;
virtual compaction_group& compaction_group_for_key(partition_key_view key, const schema_ptr& s) const = 0;
@@ -442,7 +444,7 @@ public:
virtual storage_group& storage_group_for_token(dht::token) const = 0;
virtual utils::chunked_vector<storage_group_ptr> storage_groups_for_token_range(dht::token_range tr) const = 0;
virtual locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const = 0;
virtual locator::combined_load_stats table_load_stats() const = 0;
virtual bool all_storage_groups_split() = 0;
virtual future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) = 0;
virtual future<> maybe_split_compaction_group_of(size_t idx) = 0;

View File

@@ -1697,7 +1697,7 @@ static db::rate_limiter::can_proceed account_singular_ranges_to_rate_limit(
if (!range.is_singular()) {
continue;
}
auto token = dht::token::to_int64(ranges.front().start()->value().token());
auto token = dht::token::to_int64(range.start()->value().token());
if (limiter.account_operation(read_label, token, table_limit, rate_limit_info) == db::rate_limiter::can_proceed::no) {
// Don't return immediately - account all ranges first
ret = can_proceed::no;

View File

@@ -1129,9 +1129,7 @@ public:
return _stats;
}
// The tablet filter is used to not double account migrating tablets, so it's important that
// only one of pending or leaving replica is accounted based on current migration stage.
locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const;
locator::combined_load_stats table_load_stats() const;
const db::view::stats& get_view_stats() const {
return _view_stats;

View File

@@ -711,7 +711,9 @@ public:
return make_ready_future<>();
}
void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) override {}
void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
const locator::effective_replication_map& erm,
noncopyable_function<void()> refresh_mutation_source) override {}
compaction_group& compaction_group_for_token(dht::token token) const override {
return get_compaction_group();
@@ -734,7 +736,7 @@ public:
return *_single_sg;
}
locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)>) const override {
locator::combined_load_stats table_load_stats() const override {
return locator::combined_load_stats{
.table_ls = locator::table_load_stats{
.size_in_bytes = _single_sg->live_disk_space_used(),
@@ -757,6 +759,11 @@ public:
}
};
struct background_merge_guard {
compaction::compaction_reenabler compaction_guard;
locator::effective_replication_map_ptr erm_guard;
};
class tablet_storage_group_manager final : public storage_group_manager {
replica::table& _t;
locator::host_id _my_host_id;
@@ -777,7 +784,7 @@ class tablet_storage_group_manager final : public storage_group_manager {
utils::phased_barrier _merge_fiber_barrier;
std::optional<utils::phased_barrier::operation> _pending_merge_fiber_work;
// Holds compaction reenabler which disables compaction temporarily during tablet merge
std::vector<compaction::compaction_reenabler> _compaction_reenablers_for_merging;
std::vector<background_merge_guard> _compaction_reenablers_for_merging;
private:
const schema_ptr& schema() const {
return _t.schema();
@@ -801,7 +808,8 @@ private:
// Called when coordinator executes tablet merge. Tablet ids X and X+1 are merged into
// the new tablet id (X >> 1). In practice, that means storage groups for X and X+1
// are merged into a new storage group with id (X >> 1).
void handle_tablet_merge_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap);
void handle_tablet_merge_completion(locator::effective_replication_map_ptr old_erm,
const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap);
// When merge completes, compaction groups of sibling tablets are added to same storage
// group, but they're not merged yet into one, since the merge completion handler happens
@@ -895,7 +903,9 @@ public:
std::exchange(_stop_fut, make_ready_future())).discard_result();
}
void update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) override;
void update_effective_replication_map(const locator::effective_replication_map_ptr& old_erm,
const locator::effective_replication_map& erm,
noncopyable_function<void()> refresh_mutation_source) override;
compaction_group& compaction_group_for_token(dht::token token) const override;
utils::chunked_vector<storage_group_ptr> storage_groups_for_token_range(dht::token_range tr) const override;
@@ -909,7 +919,7 @@ public:
return storage_group_for_id(storage_group_of(token).first);
}
locator::combined_load_stats table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const override;
locator::combined_load_stats table_load_stats() const override;
bool all_storage_groups_split() override;
future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) override;
future<> maybe_split_compaction_group_of(size_t idx) override;
@@ -1006,6 +1016,11 @@ bool storage_group::set_split_mode() {
return false;
}
if (!splitting_mode()) {
// Don't create new compaction groups if the main cg has compaction disabled
if (_main_cg->compaction_disabled()) {
tlogger.debug("storage_group::set_split_mode: split ready groups not created due to compaction disabled on the main group");
return false;
}
auto create_cg = [this] () -> compaction_group_ptr {
// TODO: use the actual sub-ranges instead, to help incremental selection on the read path.
return compaction_group::make_empty_group(*_main_cg);
@@ -1443,6 +1458,7 @@ table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
sstables::offstrategy offstrategy) {
std::vector<sstables::shared_sstable> ret, ssts;
std::exception_ptr ex;
log_level failure_log_level = log_level::error;
try {
bool trigger_compaction = offstrategy == sstables::offstrategy::no;
auto& cg = compaction_group_for_sstable(new_sst);
@@ -1464,6 +1480,9 @@ table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
co_await do_add_sstable_and_update_cache(cg, sst, offstrategy, trigger_compaction);
sst = nullptr;
}
} catch (compaction::compaction_stopped_exception&) {
failure_log_level = log_level::warn;
ex = std::current_exception();
} catch (...) {
ex = std::current_exception();
}
@@ -1471,13 +1490,13 @@ table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
if (ex) {
// on failed split, input sstable is unlinked here.
if (new_sst) {
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", new_sst->get_filename(), new_sst->get_origin(), ex);
tlogger.log(failure_log_level, "Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", new_sst->get_filename(), new_sst->get_origin(), ex);
co_await new_sst->unlink();
}
// on failure after successful split, sstables not attached yet will be unlinked
co_await coroutine::parallel_for_each(ssts, [&ex] (sstables::shared_sstable sst) -> future<> {
co_await coroutine::parallel_for_each(ssts, [&ex, failure_log_level] (sstables::shared_sstable sst) -> future<> {
if (sst) {
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
tlogger.log(failure_log_level, "Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
co_await sst->unlink();
}
});
@@ -1491,6 +1510,7 @@ table::add_new_sstables_and_update_cache(std::vector<sstables::shared_sstable> n
std::function<future<>(sstables::shared_sstable)> on_add) {
std::exception_ptr ex;
std::vector<sstables::shared_sstable> ret;
log_level failure_log_level = log_level::error;
// We rely on add_new_sstable_and_update_cache() to unlink the sstable fed into it,
// so the exception handling below will only have to unlink sstables not processed yet.
@@ -1500,14 +1520,17 @@ table::add_new_sstables_and_update_cache(std::vector<sstables::shared_sstable> n
std::ranges::move(ssts, std::back_inserter(ret));
}
} catch (compaction::compaction_stopped_exception&) {
failure_log_level = log_level::warn;
ex = std::current_exception();
} catch (...) {
ex = std::current_exception();
}
if (ex) {
co_await coroutine::parallel_for_each(new_ssts, [&ex] (sstables::shared_sstable sst) -> future<> {
co_await coroutine::parallel_for_each(new_ssts, [&ex, failure_log_level] (sstables::shared_sstable sst) -> future<> {
if (sst) {
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
tlogger.log(failure_log_level, "Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
co_await sst->unlink();
}
});
@@ -1743,7 +1766,9 @@ table::seal_active_memtable(compaction_group& cg, flush_permit&& flush_permit) n
utils::get_local_injector().inject("table_seal_active_memtable_try_flush", []() {
throw std::system_error(ENOSPC, std::system_category(), "Injected error");
});
co_return co_await this->try_flush_memtable_to_sstable(cg, old, std::move(write_permit));
co_await this->try_flush_memtable_to_sstable(cg, old, std::move(write_permit));
// signal a memtable was sealed
utils::get_local_injector().receive_message("table_seal_post_flush_waiters");
});
undo_stats.reset();
@@ -2933,17 +2958,108 @@ void table::on_flush_timer() {
});
}
locator::combined_load_stats tablet_storage_group_manager::table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const {
// The following functions return true if we should return the tablet size of a tablet in
// migration depending on its transition stage and whether it is a leaving or pending replica
bool has_size_on_leaving (locator::tablet_transition_stage stage) {
switch (stage) {
case locator::tablet_transition_stage::allow_write_both_read_old: [[fallthrough]];
case locator::tablet_transition_stage::write_both_read_old: [[fallthrough]];
case locator::tablet_transition_stage::streaming: [[fallthrough]];
case locator::tablet_transition_stage::write_both_read_new: [[fallthrough]];
case locator::tablet_transition_stage::use_new: [[fallthrough]];
case locator::tablet_transition_stage::cleanup_target: [[fallthrough]];
case locator::tablet_transition_stage::revert_migration: [[fallthrough]];
case locator::tablet_transition_stage::rebuild_repair: [[fallthrough]];
case locator::tablet_transition_stage::repair: [[fallthrough]];
case locator::tablet_transition_stage::end_repair:
return true;
case locator::tablet_transition_stage::cleanup: [[fallthrough]];
case locator::tablet_transition_stage::end_migration:
return false;
}
}
bool has_size_on_pending (locator::tablet_transition_stage stage) {
switch (stage) {
case locator::tablet_transition_stage::allow_write_both_read_old: [[fallthrough]];
case locator::tablet_transition_stage::write_both_read_old: [[fallthrough]];
case locator::tablet_transition_stage::streaming: [[fallthrough]];
case locator::tablet_transition_stage::cleanup_target: [[fallthrough]];
case locator::tablet_transition_stage::revert_migration: [[fallthrough]];
case locator::tablet_transition_stage::rebuild_repair:
return false;
case locator::tablet_transition_stage::write_both_read_new: [[fallthrough]];
case locator::tablet_transition_stage::use_new: [[fallthrough]];
case locator::tablet_transition_stage::cleanup: [[fallthrough]];
case locator::tablet_transition_stage::end_migration: [[fallthrough]];
case locator::tablet_transition_stage::repair: [[fallthrough]];
case locator::tablet_transition_stage::end_repair:
return true;
}
}
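The two stage tables can be condensed into a compact model (an illustrative Python rendering of the same rules; stage names mirror the C++ enum):

```python
STAGES = [
    'allow_write_both_read_old', 'write_both_read_old', 'streaming',
    'write_both_read_new', 'use_new', 'cleanup', 'cleanup_target',
    'end_migration', 'revert_migration', 'repair', 'end_repair',
    'rebuild_repair',
]

def has_size_on_leaving(stage: str) -> bool:
    # Leaving replicas report tablet size until their copy is removed.
    return stage not in ('cleanup', 'end_migration')

def has_size_on_pending(stage: str) -> bool:
    # Pending replicas report tablet size once streaming has completed.
    return stage not in ('allow_write_both_read_old', 'write_both_read_old',
                         'streaming', 'cleanup_target', 'revert_migration',
                         'rebuild_repair')
```

For every stage at least one side reports the size, so the balancer never loses sight of a migrating tablet; some stages are reported by both sides, which the surrounding code accepts as a tolerable overlap.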
locator::combined_load_stats tablet_storage_group_manager::table_load_stats() const {
locator::table_load_stats table_stats;
table_stats.split_ready_seq_number = _split_ready_seq_number;
locator::tablet_load_stats tablet_stats;
for_each_storage_group([&] (size_t id, storage_group& sg) {
locator::global_tablet_id gid { _t.schema()->id(), locator::tablet_id(id) };
if (tablet_filter(*_tablet_map, gid)) {
const uint64_t tablet_size = sg.live_disk_space_used();
auto tid = locator::tablet_id(id);
locator::global_tablet_id gid { _t.schema()->id(), tid };
locator::tablet_replica me { _my_host_id, this_shard_id() };
const uint64_t tablet_size = sg.live_disk_space_used();
auto transition = _tablet_map->get_tablet_transition_info(tid);
auto& info = _tablet_map->get_tablet_info(tid);
bool is_pending = transition && transition->pending_replica == me;
bool is_leaving = transition && locator::get_leaving_replica(info, *transition) == me;
// It's important to tackle the anomaly in reported size, since both leaving and
// pending replicas could otherwise be accounted during tablet migration.
// If transition hasn't reached write_both_read_new stage, then leaving replicas are accounted.
// Otherwise, pending replicas are accounted.
// This helps to reduce the discrepancy window.
auto table_size_filter = [&] () {
// if tablet is not in transit, it's filtered in.
if (!transition) {
return true;
}
auto s = transition->reads; // read selector
return (!is_pending && !is_leaving)
|| (is_leaving && s == locator::read_replica_set_selector::previous)
|| (is_pending && s == locator::read_replica_set_selector::next);
};
// When a tablet is in migration, we want to send its size during any migration stage when
// we still know the tablet's size. This way the balancer will have better information about
// tablet sizes, and we reduce the chance that the node will be ignored during balancing
// due to missing tablet size. On the leaving replica we include tablets until the use_new
// stage (inclusive), and on the pending we include tablets after the streaming stage.
// There is an overlap in tablet sizes (we report sizes on both the leaving and pending
// replicas for some stages), but that should not be a problem.
auto tablet_size_filter = [&] () {
// if tablet is not in transit, it's filtered in.
if (!transition) {
return true;
}
if (is_leaving) {
return has_size_on_leaving(transition->stage);
} else if (is_pending) {
return has_size_on_pending(transition->stage);
}
return true;
};
if (table_size_filter()) {
table_stats.size_in_bytes += tablet_size;
}
if (tablet_size_filter()) {
const dht::token_range trange = _tablet_map->get_token_range(gid.tablet);
// Make sure the token range is in the form (a, b]
SCYLLA_ASSERT(!trange.start()->is_inclusive() && trange.end()->is_inclusive());
@@ -2956,8 +3072,8 @@ locator::combined_load_stats tablet_storage_group_manager::table_load_stats(std:
};
}
locator::combined_load_stats table::table_load_stats(std::function<bool(const locator::tablet_map&, locator::global_tablet_id)> tablet_filter) const {
return _sg_manager->table_load_stats(std::move(tablet_filter));
locator::combined_load_stats table::table_load_stats() const {
return _sg_manager->table_load_stats();
}
void tablet_storage_group_manager::handle_tablet_split_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap) {
@@ -3069,7 +3185,9 @@ future<> tablet_storage_group_manager::merge_completion_fiber() {
}
}
void tablet_storage_group_manager::handle_tablet_merge_completion(const locator::tablet_map& old_tmap, const locator::tablet_map& new_tmap) {
void tablet_storage_group_manager::handle_tablet_merge_completion(locator::effective_replication_map_ptr old_erm,
const locator::tablet_map& old_tmap,
const locator::tablet_map& new_tmap) {
auto table_id = schema()->id();
size_t old_tablet_count = old_tmap.tablet_count();
size_t new_tablet_count = new_tmap.tablet_count();
@@ -3093,7 +3211,7 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(const locator:
auto new_cg = make_lw_shared<compaction_group>(_t, new_tid, new_range, make_repair_sstable_classifier_func());
for (auto& view : new_cg->all_views()) {
auto cre = _t.get_compaction_manager().stop_and_disable_compaction_no_wait(*view, "tablet merging");
_compaction_reenablers_for_merging.push_back(std::move(cre));
_compaction_reenablers_for_merging.push_back(background_merge_guard{std::move(cre), old_erm});
}
auto new_sg = make_lw_shared<storage_group>(std::move(new_cg));
@@ -3102,7 +3220,7 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(const locator:
auto it = _storage_groups.find(group_id);
if (it == _storage_groups.end()) {
throw std::runtime_error(format("Unable to find sibling tablet of id for table {}", group_id, table_id));
throw std::runtime_error(format("Unable to find sibling tablet of id {} for table {}", group_id, table_id));
}
auto& sg = it->second;
sg->for_each_compaction_group([&new_sg, new_range, new_tid, group_id] (const compaction_group_ptr& cg) {
@@ -3126,7 +3244,11 @@ void tablet_storage_group_manager::handle_tablet_merge_completion(const locator:
_merge_completion_event.signal();
}
void tablet_storage_group_manager::update_effective_replication_map(const locator::effective_replication_map& erm, noncopyable_function<void()> refresh_mutation_source) {
void tablet_storage_group_manager::update_effective_replication_map(
const locator::effective_replication_map_ptr& old_erm,
const locator::effective_replication_map& erm,
noncopyable_function<void()> refresh_mutation_source)
{
auto* new_tablet_map = &erm.get_token_metadata().tablets().get_tablet_map(schema()->id());
auto* old_tablet_map = std::exchange(_tablet_map, new_tablet_map);
@@ -3142,7 +3264,7 @@ void tablet_storage_group_manager::update_effective_replication_map(const locato
if (utils::get_local_injector().is_enabled("tablet_force_tablet_count_decrease_once")) {
utils::get_local_injector().disable("tablet_force_tablet_count_decrease");
}
handle_tablet_merge_completion(*old_tablet_map, *new_tablet_map);
handle_tablet_merge_completion(old_erm, *old_tablet_map, *new_tablet_map);
}
// Allocate storage group if tablet is migrating in, or deallocate if it's migrating out.
@@ -3228,7 +3350,7 @@ void table::update_effective_replication_map(locator::effective_replication_map_
};
if (uses_tablets()) {
_sg_manager->update_effective_replication_map(*_erm, refresh_mutation_source);
_sg_manager->update_effective_replication_map(old_erm, *_erm, refresh_mutation_source);
}
if (old_erm) {
old_erm->invalidate();
@@ -3690,7 +3812,6 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, c
tlogger.debug("Taking snapshot of {}.{}: name={}", s->ks_name(), s->cf_name(), name);
std::vector<snapshot_sstable_set> sstable_sets(smp::count);
std::vector<int64_t> tablet_counts(smp::count);
co_await writer->init();
co_await smp::invoke_on_all([&] -> future<> {
@@ -3698,7 +3819,6 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, c
auto [tables, permit] = co_await t.snapshot_sstables();
auto sstables_metadata = co_await t.get_sstables_manager().take_snapshot(std::move(tables), name);
sstable_sets[this_shard_id()] = make_foreign(std::make_unique<utils::chunked_vector<sstables::sstable_snapshot_metadata>>(std::move(sstables_metadata)));
tablet_counts[this_shard_id()] = t.calculate_tablet_count();
});
co_await writer->sync();
@@ -3712,12 +3832,13 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, c
});
tlogger.debug("snapshot {}: seal_snapshot", name);
const auto& topology = sharded_db.local().get_token_metadata().get_topology();
std::optional<int64_t> min_tablet_count;
std::optional<int64_t> tablet_count;
if (t.uses_tablets()) {
SCYLLA_ASSERT(!tablet_counts.empty());
min_tablet_count = *std::ranges::min_element(tablet_counts);
auto erm = t.get_effective_replication_map();
auto& tm = erm->get_token_metadata().tablets().get_tablet_map(s->id());
tablet_count = tm.tablet_count();
}
co_await write_manifest(topology, *writer, std::move(sstable_sets), name, std::move(opts), s, min_tablet_count).handle_exception([&] (std::exception_ptr ptr) {
co_await write_manifest(topology, *writer, std::move(sstable_sets), name, std::move(opts), s, tablet_count).handle_exception([&] (std::exception_ptr ptr) {
tlogger.error("Failed to seal snapshot in {}: {}.", name, ptr);
ex = std::move(ptr);
});
@@ -3775,6 +3896,7 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
}
auto lister = directory_lister(snapshots_dir, lister::dir_entry_types::of<directory_entry_type::directory>());
auto close_lister = deferred_close(lister);
while (auto de = lister.get().get()) {
auto snapshot_name = de->name;
all_snapshots.emplace(snapshot_name, snapshot_details());
@@ -3782,6 +3904,9 @@ future<std::unordered_map<sstring, table::snapshot_details>> table::get_snapshot
auto& sd = all_snapshots.at(snapshot_name);
sd.total += details.total;
sd.live += details.live;
utils::get_local_injector().inject("get_snapshot_details", [&] (auto& handler) -> future<> {
throw std::runtime_error("Injected exception in get_snapshot_details");
}).get();
}
}
return all_snapshots;
@@ -3801,53 +3926,66 @@ future<table::snapshot_details> table::get_snapshot_details(fs::path snapshot_di
}
auto lister = directory_lister(snapshot_directory, snapshot_dir, lister::dir_entry_types::of<directory_entry_type::regular>());
while (auto de = co_await lister.get()) {
const auto& name = de->name;
future<stat_data> (&file_stat)(file& directory, std::string_view name, follow_symlink) noexcept = seastar::file_stat;
auto sd = co_await io_check(file_stat, snapshot_directory, name, follow_symlink::no);
auto size = sd.allocated_size;
std::exception_ptr ex;
try {
while (auto de = co_await lister.get()) {
const auto& name = de->name;
future<stat_data> (&file_stat)(file& directory, std::string_view name, follow_symlink) noexcept = seastar::file_stat;
auto sd = co_await io_check(file_stat, snapshot_directory, name, follow_symlink::no);
auto size = sd.allocated_size;
// The manifest and schema.sql files are the only files expected to be in this directory not belonging to the SSTable.
//
// All the others should just generate an exception: there is something wrong, so don't blindly
// add it to the size.
if (name != "manifest.json" && name != "schema.cql") {
details.total += size;
if (sd.number_of_links == 1) {
// File exists only in the snapshot directory.
details.live += size;
utils::get_local_injector().inject("per-snapshot-get_snapshot_details", [&] (auto& handler) -> future<> {
throw std::runtime_error("Injected exception in per-snapshot-get_snapshot_details");
}).get();
// The manifest and schema.cql files are the only files expected to be in this directory not belonging to the SSTable.
//
// All the others should just generate an exception: there is something wrong, so don't blindly
// add it to the size.
if (name != "manifest.json" && name != "schema.cql") {
details.total += size;
if (sd.number_of_links == 1) {
// File exists only in the snapshot directory.
details.live += size;
continue;
}
// If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
// So check the datadir for the file too.
} else {
continue;
}
// If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
// So check the datadir for the file too.
} else {
continue;
}
auto exists_in_dir = [&] (file& dir, const fs::path& path, std::string_view name) -> future<bool> {
try {
// File exists in the main SSTable directory. Snapshots are not contributing to size
auto psd = co_await io_check(file_stat, dir, name, follow_symlink::no);
// File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
(path / name).native(), psd.device_id, psd.inode_number, psd.size,
(snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
auto exists_in_dir = [&] (file& dir, const fs::path& path, std::string_view name) -> future<bool> {
try {
// File exists in the main SSTable directory. Snapshots are not contributing to size
auto psd = co_await io_check(file_stat, dir, name, follow_symlink::no);
// File in main SSTable directory must be hardlinked to the file in the snapshot dir with the same name.
if (psd.device_id != sd.device_id || psd.inode_number != sd.inode_number) {
dblog.warn("[{} device_id={} inode_number={} size={}] is not the same file as [{} device_id={} inode_number={} size={}]",
(path / name).native(), psd.device_id, psd.inode_number, psd.size,
(snapshot_dir / name).native(), sd.device_id, sd.inode_number, sd.size);
co_return false;
}
co_return true;
} catch (std::system_error& e) {
if (e.code() != std::error_code(ENOENT, std::system_category())) {
throw;
}
co_return false;
}
};
// Check staging dir first, as files might be moved from there to the datadir concurrently to this check
if ((!staging_dir || !co_await exists_in_dir(staging_directory, *staging_dir, name)) &&
!co_await exists_in_dir(data_directory, datadir, name)) {
details.live += size;
}
co_return true;
} catch (std::system_error& e) {
if (e.code() != std::error_code(ENOENT, std::system_category())) {
throw;
}
co_return false;
}
};
// Check staging dir first, as files might be moved from there to the datadir concurrently to this check
if ((!staging_dir || !co_await exists_in_dir(staging_directory, *staging_dir, name)) &&
!co_await exists_in_dir(data_directory, datadir, name)) {
details.live += size;
}
} catch (...) {
ex = std::current_exception();
}
co_await lister.close();
if (ex) {
co_await coroutine::return_exception_ptr(std::move(ex));
}
co_return details;

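The restructured `get_snapshot_details` loop above decides, per file, whether snapshot bytes are "live" (freed when the snapshot is deleted) by looking at the hardlink count and, when a file has more than one link, by comparing device/inode identity against the datadir copy. A minimal stand-alone sketch of that accounting, using `std::filesystem` instead of Scylla's seastar wrappers (names like `account` and `snapshot_details` here are illustrative, not Scylla's API; the real code also uses `allocated_size` rather than the logical file size):

```cpp
#include <cassert>
#include <cstdint>
#include <filesystem>
#include <fstream>

namespace fs = std::filesystem;

// Illustrative model: `total` counts every SSTable file in the snapshot dir;
// `live` counts only bytes that deleting the snapshot would actually free,
// i.e. files not hardlinked from the main data directory.
struct snapshot_details { std::uintmax_t total = 0, live = 0; };

snapshot_details account(const fs::path& snapshot_dir, const fs::path& datadir) {
    snapshot_details d;
    for (const auto& de : fs::directory_iterator(snapshot_dir)) {
        auto name = de.path().filename().string();
        if (name == "manifest.json" || name == "schema.cql") {
            continue; // snapshot metadata, not part of the SSTable size
        }
        auto size = de.file_size();
        d.total += size;
        if (fs::hard_link_count(de.path()) == 1) {
            d.live += size; // exists only in the snapshot directory
            continue;
        }
        // More than one link: the file is live only if the same-named datadir
        // entry is absent or is a *different* file (different device/inode).
        auto in_datadir = datadir / name;
        if (!fs::exists(in_datadir) || !fs::equivalent(de.path(), in_datadir)) {
            d.live += size;
        }
    }
    return d;
}
```

`fs::equivalent` performs exactly the device-id/inode comparison the diff does by hand with `file_stat`; the real code additionally checks a staging directory first, since files can move from staging to the datadir concurrently.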

@@ -263,8 +263,9 @@ public:
void enable_schema_commitlog() {
_static_props.enable_schema_commitlog();
}
void set_is_group0_table(bool enabled = true) {
_static_props.is_group0_table = enabled;
void set_is_group0_table() {
_static_props.is_group0_table = true;
enable_schema_commitlog();
}
class default_names {


@@ -454,7 +454,7 @@ static future<cql3::untyped_result_set> do_execute_cql_with_timeout(sstring req,
auto ps_ptr = qp.get_prepared(cache_key);
if (!ps_ptr) {
const auto msg_ptr = co_await qp.prepare(req, qs, cql3::internal_dialect());
ps_ptr = std::move(msg_ptr->get_prepared());
ps_ptr = msg_ptr->get_prepared();
if (!ps_ptr) {
on_internal_error(paxos_state::logger, "prepared statement is null");
}


@@ -350,6 +350,10 @@ static void ensure_group0_schema(const group0_command& cmd, const replica::datab
if (!schema->static_props().is_group0_table) {
on_internal_error(slogger, fmt::format("ensure_group0_schema: schema is not group0: {}", schema->cf_name()));
}
if (!schema->static_props().use_schema_commitlog) {
on_internal_error(slogger, fmt::format("ensure_group0_schema: group0 table {} does not use schema commitlog", schema->cf_name()));
}
}
};


@@ -559,6 +559,7 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
group0_id = g0_info.group0_id;
raft::server_address my_addr{my_id, {}};
bool starting_server_as_follower = false;
if (server == nullptr) {
// This is the first time discovery is run. Create and start a Raft server for group 0 on this node.
raft::configuration initial_configuration;
@@ -586,6 +587,7 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
// trigger an empty snapshot transfer.
nontrivial_snapshot = true;
} else {
starting_server_as_follower = true;
co_await handshaker->pre_server_start(g0_info);
}
@@ -614,7 +616,9 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
}
SCYLLA_ASSERT(server);
if (server->get_configuration().contains(my_id)) {
co_await utils::get_local_injector().inject("join_group0_pause_before_config_check",
utils::wait_for_message(std::chrono::minutes{5}));
if (!starting_server_as_follower && server->get_configuration().contains(my_id)) {
// True if we started a new group or completed a configuration change initiated earlier.
group0_log.info("server {} already in group 0 (id {}) as {}", my_id, group0_id,
server->get_configuration().can_vote(my_id)? "voter" : "non-voter");


@@ -987,7 +987,7 @@ future<> storage_service::merge_topology_snapshot(raft_snapshot snp) {
frozen_muts_to_apply.push_back(co_await freeze_gently(mut));
} else {
co_await for_each_split_mutation(std::move(mut), max_size, [&] (mutation m) -> future<> {
frozen_muts_to_apply.push_back(co_await freeze_gently(mut));
frozen_muts_to_apply.push_back(co_await freeze_gently(m));
});
}
}
@@ -5003,6 +5003,8 @@ future<> storage_service::drain() {
}
future<> storage_service::do_drain() {
co_await utils::get_local_injector().inject("storage_service_drain_wait", utils::wait_for_message(60s));
// Need to stop transport before group0, otherwise RPCs may fail with raft_group_not_found.
co_await stop_transport();
@@ -6056,6 +6058,8 @@ future<> storage_service::process_tablet_split_candidate(table_id table) noexcep
});
};
co_await utils::get_local_injector().inject("tablet_split_monitor_wait", utils::wait_for_message(1min));
exponential_backoff_retry split_retry = exponential_backoff_retry(std::chrono::seconds(5), std::chrono::seconds(300));
while (!_async_gate.is_closed() && !_group0_as.abort_requested()) {
@@ -6090,6 +6094,9 @@ future<> storage_service::process_tablet_split_candidate(table_id table) noexcep
} catch (raft::request_aborted& ex) {
slogger.warn("Failed to complete splitting of table {} due to {}", table, ex);
break;
} catch (seastar::gate_closed_exception& ex) {
slogger.warn("Failed to complete splitting of table {} due to {}", table, ex);
break;
} catch (...) {
slogger.error("Failed to complete splitting of table {} due to {}, retrying after {} seconds",
table, std::current_exception(), split_retry.sleep_time());
@@ -6156,6 +6163,57 @@ future<> storage_service::snitch_reconfigured() {
}
}
future<> storage_service::local_topology_barrier() {
if (this_shard_id() != 0) {
co_await container().invoke_on(0, [] (storage_service& ss) {
return ss.local_topology_barrier();
});
co_return;
}
auto version = _topology_state_machine._topology.version;
utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail_before", [] {
throw std::runtime_error("raft_topology_barrier_and_drain_fail_before injected exception");
});
co_await utils::get_local_injector().inject("pause_before_barrier_and_drain", utils::wait_for_message(std::chrono::minutes(5)));
if (_topology_state_machine._topology.tstate == topology::transition_state::write_both_read_old) {
for (auto& n : _topology_state_machine._topology.transition_nodes) {
if (!_address_map.find(locator::host_id{n.first.uuid()})) {
rtlogger.error("The topology transition is in a double write state but the IP of the node in transition is not known");
break;
}
}
}
co_await container().invoke_on_all([version] (storage_service& ss) -> future<> {
const auto current_version = ss._shared_token_metadata.get()->get_version();
rtlogger.info("Got raft_topology_cmd::barrier_and_drain, version {}, current version {}",
version, current_version);
// This shouldn't happen under normal operation, it's only plausible
// if the topology change coordinator has
// moved to another node and managed to update the topology
// parallel to this method. The previous coordinator
// should be inactive now, so it won't observe this
// exception. By returning exception we aim
// to reveal any other conditions where this may arise.
if (current_version != version) {
co_await coroutine::return_exception(std::runtime_error(
::format("raft topology: command::barrier_and_drain, the version has changed, "
"version {}, current_version {}, the topology change coordinator "
" had probably migrated to another node",
version, current_version)));
}
co_await ss._shared_token_metadata.stale_versions_in_use();
co_await get_topology_session_manager().drain_closing_sessions();
rtlogger.info("raft_topology_cmd::barrier_and_drain done");
});
}
future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft::term_t term, uint64_t cmd_index, const raft_topology_cmd& cmd) {
raft_topology_cmd_result result;
rtlogger.info("topology cmd rpc {} is called index={}", cmd.cmd, cmd_index);
@@ -6183,12 +6241,6 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
state.last_index = cmd_index;
}
// We capture the topology version right after the checks
// above, before any yields. This is crucial since _topology_state_machine._topology
// might be altered concurrently while this method is running,
// which can cause the fence command to apply an invalid fence version.
const auto version = _topology_state_machine._topology.version;
switch (cmd.cmd) {
case raft_topology_cmd::command::barrier: {
utils::get_local_injector().inject("raft_topology_barrier_fail",
@@ -6227,43 +6279,7 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
}
break;
case raft_topology_cmd::command::barrier_and_drain: {
utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail_before", [] {
throw std::runtime_error("raft_topology_barrier_and_drain_fail_before injected exception");
});
co_await utils::get_local_injector().inject("pause_before_barrier_and_drain", utils::wait_for_message(std::chrono::minutes(5)));
if (_topology_state_machine._topology.tstate == topology::transition_state::write_both_read_old) {
for (auto& n : _topology_state_machine._topology.transition_nodes) {
if (!_address_map.find(locator::host_id{n.first.uuid()})) {
rtlogger.error("The topology transition is in a double write state but the IP of the node in transition is not known");
break;
}
}
}
co_await container().invoke_on_all([version] (storage_service& ss) -> future<> {
const auto current_version = ss._shared_token_metadata.get()->get_version();
rtlogger.info("Got raft_topology_cmd::barrier_and_drain, version {}, current version {}",
version, current_version);
// This shouldn't happen under normal operation, it's only plausible
// if the topology change coordinator has
// moved to another node and managed to update the topology
// parallel to this method. The previous coordinator
// should be inactive now, so it won't observe this
// exception. By returning exception we aim
// to reveal any other conditions where this may arise.
if (current_version != version) {
co_await coroutine::return_exception(std::runtime_error(
::format("raft topology: command::barrier_and_drain, the version has changed, "
"version {}, current_version {}, the topology change coordinator "
" had probably migrated to another node",
version, current_version)));
}
co_await ss._shared_token_metadata.stale_versions_in_use();
co_await get_topology_session_manager().drain_closing_sessions();
rtlogger.info("raft_topology_cmd::barrier_and_drain done");
});
co_await local_topology_barrier();
co_await utils::get_local_injector().inject("raft_topology_barrier_and_drain_fail", [this] (auto& handler) -> future<> {
auto ks = handler.get("keyspace");
@@ -7359,34 +7375,8 @@ future<locator::load_stats> storage_service::load_stats_for_tablet_based_tables(
if (!table) {
continue;
}
auto erm = table->get_effective_replication_map();
auto& token_metadata = erm->get_token_metadata();
auto me = locator::tablet_replica { token_metadata.get_my_id(), this_shard_id() };
// It's important to tackle the anomaly in reported size, since both leaving and
// pending replicas could otherwise be accounted during tablet migration.
// If transition hasn't reached cleanup stage, then leaving replicas are accounted.
// If transition is past cleanup stage, then pending replicas are accounted.
// This helps to reduce the discrepancy window.
auto tablet_filter = [&me] (const locator::tablet_map& tmap, locator::global_tablet_id id) {
auto transition = tmap.get_tablet_transition_info(id.tablet);
auto& info = tmap.get_tablet_info(id.tablet);
// if tablet is not in transit, it's filtered in.
if (!transition) {
return true;
}
bool is_pending = transition->pending_replica == me;
bool is_leaving = locator::get_leaving_replica(info, *transition) == me;
auto s = transition->reads; // read selector
return (!is_pending && !is_leaving)
|| (is_leaving && s == locator::read_replica_set_selector::previous)
|| (is_pending && s == locator::read_replica_set_selector::next);
};
locator::combined_load_stats combined_ls { table->table_load_stats(tablet_filter) };
locator::combined_load_stats combined_ls { table->table_load_stats() };
load_stats.tables.emplace(id, std::move(combined_ls.table_ls));
tablet_sizes_per_shard[this_shard_id()].size += load_stats.tablet_stats[this_host].add_tablet_sizes(combined_ls.tablet_ls);

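The `local_topology_barrier()` factored out above captures the topology version once, before any suspension points, and every shard then re-checks it so that a concurrent coordinator migration surfaces as an error rather than fencing with a stale version. A minimal single-process model of that version-fence check (the `token_metadata_model`/`barrier_check` names are illustrative, not Scylla's API):

```cpp
#include <atomic>
#include <cassert>
#include <stdexcept>
#include <string>

// Illustrative model: the version is captured once, up front, and the
// per-shard check rejects the barrier if the version moved underneath it.
struct token_metadata_model {
    std::atomic<long> version{0};
};

void barrier_check(const token_metadata_model& tm, long captured_version) {
    long current = tm.version.load();
    if (current != captured_version) {
        throw std::runtime_error(
            "barrier_and_drain: version changed from " + std::to_string(captured_version) +
            " to " + std::to_string(current) +
            "; the topology change coordinator probably migrated to another node");
    }
    // ...in the real code: wait for stale erms to drain and close sessions...
}
```

The point of throwing rather than silently proceeding is the same as in the diff's comment: the previous coordinator should be inactive and never observe the exception, so any other path that trips it is revealed as a bug.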

@@ -944,6 +944,9 @@ public:
future<bool> ongoing_rf_change(const group0_guard& guard, sstring ks) const;
future<> raft_initialize_discovery_leader(const join_node_request_params& params);
future<> initialize_done_topology_upgrade_state();
// Does the local part of global_token_metadata_barrier(), without a raft group0 barrier.
// In particular, waits for non-latest local erms to die.
future<> local_topology_barrier();
private:
// State machine that is responsible for topology change
topology_state_machine& _topology_state_machine;


@@ -21,7 +21,6 @@ namespace service {
struct status_helper {
tasks::task_status status;
utils::chunked_vector<locator::tablet_id> tablets;
std::optional<locator::tablet_replica> pending_replica;
};
@@ -141,27 +140,54 @@ future<std::optional<tasks::task_status>> tablet_virtual_task::wait(tasks::task_
auto task_type = hint.get_task_type();
auto tablet_id_opt = tablet_id_provided(task_type) ? std::make_optional(hint.get_tablet_id()) : std::nullopt;
size_t tablet_count = _ss.get_token_metadata().tablets().get_tablet_map(table).tablet_count();
const auto& tablets = _ss.get_token_metadata().tablets();
size_t tablet_count = tablets.has_tablet_map(table) ? tablets.get_tablet_map(table).tablet_count() : 0;
auto res = co_await get_status_helper(id, std::move(hint));
if (!res) {
co_return std::nullopt;
}
tasks::tmlogger.info("tablet_virtual_task: wait until tablet operation is finished");
co_await _ss._topology_state_machine.event.wait([&] {
auto& tmap = _ss.get_token_metadata().tablets().get_tablet_map(table);
if (is_resize_task(task_type)) { // Resize task.
return tmap.resize_task_info().tablet_task_id.uuid() != id.uuid();
} else if (tablet_id_opt.has_value()) { // Migration task.
return tmap.get_tablet_info(tablet_id_opt.value()).migration_task_info.tablet_task_id.uuid() != id.uuid();
} else { // Repair task.
return std::all_of(res->tablets.begin(), res->tablets.end(), [&] (const locator::tablet_id& tablet) {
return tmap.get_tablet_info(tablet).repair_task_info.tablet_task_id.uuid() != id.uuid();
});
co_await utils::get_local_injector().inject("tablet_virtual_task_wait", utils::wait_for_message(60s));
while (true) {
co_await _ss._topology_state_machine.event.wait([&] {
if (!_ss.get_token_metadata().tablets().has_tablet_map(table)) {
return true;
}
auto& tmap = _ss.get_token_metadata().tablets().get_tablet_map(table);
if (is_resize_task(task_type)) { // Resize task.
return tmap.resize_task_info().tablet_task_id.uuid() != id.uuid();
} else if (tablet_id_opt.has_value()) { // Migration task.
return tmap.get_tablet_info(tablet_id_opt.value()).migration_task_info.tablet_task_id.uuid() != id.uuid();
} else { // Repair task.
return true;
}
});
if (!is_repair_task(task_type)) {
break;
}
});
auto tmptr = _ss.get_token_metadata_ptr();
if (!_ss.get_token_metadata().tablets().has_tablet_map(table)) {
break;
}
auto& tmap = tmptr->tablets().get_tablet_map(table);
bool repair_still_running = false;
co_await tmap.for_each_tablet([&] (locator::tablet_id tid, const locator::tablet_info& info) {
repair_still_running = repair_still_running || (info.repair_task_info.is_valid() && info.repair_task_info.tablet_task_id.uuid() == id.uuid());
return make_ready_future();
});
if (!repair_still_running) {
break;
}
}
res->status.state = tasks::task_manager::task_state::done; // Failed repair task is retried.
if (!_ss.get_token_metadata().tablets().has_tablet_map(table)) {
res->status.end_time = db_clock::now();
co_return res->status;
}
if (is_migration_task(task_type)) {
auto& replicas = _ss.get_token_metadata().tablets().get_tablet_map(table).get_tablet_info(tablet_id_opt.value()).replicas;
auto migration_failed = std::all_of(replicas.begin(), replicas.end(), [&] (const auto& replica) { return res->pending_replica.has_value() && replica != res->pending_replica.value(); });
@@ -169,9 +195,9 @@ future<std::optional<tasks::task_status>> tablet_virtual_task::wait(tasks::task_
} else if (is_resize_task(task_type)) {
auto new_tablet_count = _ss.get_token_metadata().tablets().get_tablet_map(table).tablet_count();
res->status.state = new_tablet_count == tablet_count ? tasks::task_manager::task_state::suspended : tasks::task_manager::task_state::done;
res->status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{};
res->status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, _ss.get_token_metadata_ptr()) : utils::chunked_vector<tasks::task_identity>{};
} else {
res->status.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()));
res->status.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr());
}
res->status.end_time = db_clock::now(); // FIXME: Get precise end time.
co_return res->status;
@@ -244,7 +270,15 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
status_helper res;
auto table = hint.get_table_id();
auto task_type = hint.get_task_type();
auto schema = _ss._db.local().get_tables_metadata().get_table(table).schema();
auto table_ptr = _ss._db.local().get_tables_metadata().get_table_if_exists(table);
if (!table_ptr) {
co_return tasks::task_status {
.task_id = id,
.kind = tasks::task_kind::cluster,
.is_abortable = co_await is_abortable(std::move(hint)),
};
}
auto schema = table_ptr->schema();
res.status = {
.task_id = id,
.kind = tasks::task_kind::cluster,
@@ -257,6 +291,7 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
auto& tmap = tmptr->tablets().get_tablet_map(table);
bool repair_task_finished = false;
bool repair_task_pending = false;
bool no_tablets_processed = true;
if (is_repair_task(task_type)) {
auto progress = co_await _ss._repair.local().get_tablet_repair_task_progress(id);
if (progress) {
@@ -273,37 +308,37 @@ future<std::optional<status_helper>> tablet_virtual_task::get_status_helper(task
auto& task_info = info.repair_task_info;
if (task_info.tablet_task_id.uuid() == id.uuid()) {
update_status(task_info, res.status, sched_nr);
res.tablets.push_back(tid);
no_tablets_processed = false;
}
return make_ready_future();
});
res.status.children = co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper()));
res.status.children = co_await get_children(get_module(), id, _ss.get_token_metadata_ptr());
} else if (is_migration_task(task_type)) { // Migration task.
auto tablet_id = hint.get_tablet_id();
res.pending_replica = tmap.get_tablet_transition_info(tablet_id)->pending_replica;
auto& task_info = tmap.get_tablet_info(tablet_id).migration_task_info;
if (task_info.tablet_task_id.uuid() == id.uuid()) {
update_status(task_info, res.status, sched_nr);
res.tablets.push_back(tablet_id);
no_tablets_processed = false;
}
} else { // Resize task.
auto& task_info = tmap.resize_task_info();
if (task_info.tablet_task_id.uuid() == id.uuid()) {
update_status(task_info, res.status, sched_nr);
res.status.state = tasks::task_manager::task_state::running;
res.status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, std::bind_front(&gms::gossiper::is_alive, &_ss.gossiper())) : utils::chunked_vector<tasks::task_identity>{};
res.status.children = task_type == locator::tablet_task_type::split ? co_await get_children(get_module(), id, _ss.get_token_metadata_ptr()) : utils::chunked_vector<tasks::task_identity>{};
co_return res;
}
}
if (!res.tablets.empty()) {
if (!no_tablets_processed) {
res.status.state = sched_nr == 0 ? tasks::task_manager::task_state::created : tasks::task_manager::task_state::running;
co_return res;
}
if (repair_task_pending) {
// When repair_task_pending is true, the res.tablets will be empty iff the request is aborted by user.
res.status.state = res.tablets.empty() ? tasks::task_manager::task_state::failed : tasks::task_manager::task_state::running;
res.status.state = no_tablets_processed ? tasks::task_manager::task_state::failed : tasks::task_manager::task_state::running;
co_return res;
}
if (repair_task_finished) {


@@ -2193,6 +2193,19 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
_tablet_allocator.set_load_stats(reconciled_stats);
}
}
// Wait for the background storage group merge to finish before releasing the state machine.
// Background merge holds the old erm, so a successful barrier joins with it.
// This guarantees that the background merge doesn't run concurrently with the next merge.
// Replica-side storage group merge takes compaction locks on the tablet's main compaction group, released
// by the background merge. If the next merge starts before the background merge finishes, it can cause a deadlock.
The next merge would try to stop a compaction group which is still locked, and the lock is held
by the background merge fiber.
tm = nullptr;
if (!guard) {
guard = co_await start_operation();
}
co_await global_tablet_token_metadata_barrier(std::move(guard));
}
future<> handle_truncate_table(group0_guard guard) {


@@ -18,6 +18,13 @@ class service_permit {
friend service_permit empty_service_permit();
public:
size_t count() const { return _permit ? _permit->count() : 0; };
// Merge additional semaphore units into this permit.
// Used to grow the permit after the actual resource cost is known.
void adopt(seastar::semaphore_units<>&& units) {
if (_permit) {
_permit->adopt(std::move(units));
}
}
};
inline service_permit make_service_permit(seastar::semaphore_units<>&& permit) {

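The new `service_permit::adopt()` lets a request start with a small up-front permit and grow it once the real resource cost is known, so the semaphore keeps tracking the full cost without re-queuing. A minimal stand-alone sketch of the pattern, with a trivial `units` type standing in for `seastar::semaphore_units<>` (everything here is a simplified model, not Scylla's actual types):

```cpp
#include <cassert>
#include <cstddef>
#include <memory>
#include <utility>

// Illustrative stand-in for seastar::semaphore_units<>: a move-only
// handle over `n` units of some counted resource.
struct units {
    std::size_t n = 0;
    explicit units(std::size_t n) : n(n) {}
    units(units&& o) noexcept : n(std::exchange(o.n, 0)) {}
};

// Sketch of the adopt() pattern: the permit is shared (copies of the
// permit reference the same underlying units), and adopting merges
// additional units into it after the fact.
class service_permit {
    struct impl {
        units u;
        std::size_t count() const { return u.n; }
        void adopt(units&& more) { u.n += std::exchange(more.n, 0); }
    };
    std::shared_ptr<impl> _permit;
public:
    explicit service_permit(units&& u)
        : _permit(std::make_shared<impl>(impl{std::move(u)})) {}
    std::size_t count() const { return _permit ? _permit->count() : 0; }
    // Merge additional units into this permit, growing it in place.
    void adopt(units&& more) {
        if (_permit) {
            _permit->adopt(std::move(more));
        }
    }
};
```

As in the diff, `adopt()` on an empty permit is a no-op: there is no underlying state to grow, so the extra units are simply dropped (in the real code, returned to the semaphore when destroyed).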

@@ -201,95 +201,47 @@ public:
virtual future<std::optional<entry_info>> next_entry() = 0;
};
// Allocated inside LSA.
class promoted_index {
deletion_time _del_time;
uint64_t _promoted_index_start;
uint32_t _promoted_index_size;
uint32_t _num_blocks;
public:
promoted_index(const schema& s,
deletion_time del_time,
uint64_t promoted_index_start,
uint32_t promoted_index_size,
uint32_t num_blocks)
: _del_time{del_time}
, _promoted_index_start(promoted_index_start)
, _promoted_index_size(promoted_index_size)
, _num_blocks(num_blocks)
{ }
[[nodiscard]] deletion_time get_deletion_time() const { return _del_time; }
[[nodiscard]] uint32_t get_promoted_index_size() const { return _promoted_index_size; }
// Call under allocating_section.
// For sstable versions >= mc the returned cursor will be of type `bsearch_clustered_cursor`.
std::unique_ptr<clustered_index_cursor> make_cursor(shared_sstable,
reader_permit,
tracing::trace_state_ptr,
file_input_stream_options,
use_caching);
// Promoted index information produced by the parser.
struct parsed_promoted_index_entry {
deletion_time del_time;
uint64_t promoted_index_start;
uint32_t promoted_index_size;
uint32_t num_blocks;
};
using promoted_index = parsed_promoted_index_entry;
// A partition index element.
// Allocated inside LSA.
class index_entry {
private:
managed_bytes _key;
mutable std::optional<dht::token> _token;
uint64_t _position;
managed_ref<promoted_index> _index;
struct [[gnu::packed]] index_entry {
mutable int64_t raw_token;
uint64_t data_file_offset;
uint32_t key_offset;
public:
key_view get_key() const {
return key_view{_key};
}
// May allocate so must be called under allocating_section.
decorated_key_view get_decorated_key(const schema& s) const {
if (!_token) {
_token.emplace(s.get_partitioner().get_token(get_key()));
}
return decorated_key_view(*_token, get_key());
}
uint64_t position() const { return _position; };
std::optional<deletion_time> get_deletion_time() const {
if (_index) {
return _index->get_deletion_time();
}
return {};
}
index_entry(managed_bytes&& key, uint64_t position, managed_ref<promoted_index>&& index)
: _key(std::move(key))
, _position(position)
, _index(std::move(index))
{}
index_entry(index_entry&&) = default;
index_entry& operator=(index_entry&&) = default;
// Can be nullptr
const managed_ref<promoted_index>& get_promoted_index() const { return _index; }
managed_ref<promoted_index>& get_promoted_index() { return _index; }
uint32_t get_promoted_index_size() const { return _index ? _index->get_promoted_index_size() : 0; }
size_t external_memory_usage() const {
return _key.external_memory_usage() + _index.external_memory_usage();
}
uint64_t position() const { return data_file_offset; }
dht::raw_token token() const { return dht::raw_token(raw_token); }
};
// Required for optimized LSA migration of storage of managed_vector.
static_assert(std::is_trivially_move_assignable_v<index_entry>);
static_assert(std::is_trivially_move_assignable_v<parsed_promoted_index_entry>);
// A partition index page.
//
// Allocated in the standard allocator space but with an LSA allocator as the current allocator.
// So the shallow part is in the standard allocator but all indirect objects are inside LSA.
class partition_index_page {
public:
lsa::chunked_managed_vector<managed_ref<index_entry>> _entries;
lsa::chunked_managed_vector<index_entry> _entries;
managed_bytes _key_storage;
// Stores promoted index information of index entries.
// The i-th element corresponds to the i-th entry in _entries.
// Can be smaller than _entries. If _entries[i] doesn't have a matching element in _promoted_indexes then
// that entry doesn't have a promoted index.
// Kept separately to avoid paying for storage cost in pages where no entry has a promoted index,
// which is typical in workloads with small partitions.
lsa::chunked_managed_vector<promoted_index> _promoted_indexes;
public:
partition_index_page() = default;
partition_index_page(partition_index_page&&) noexcept = default;
@@ -298,15 +250,68 @@ public:
bool empty() const { return _entries.empty(); }
size_t size() const { return _entries.size(); }
stop_iteration clear_gently() {
// Vectors have trivial storage, so are fast to destroy.
return stop_iteration::yes;
}
void clear_one_entry() {
_entries.pop_back();
}
bool has_promoted_index(size_t i) const {
return i < _promoted_indexes.size() && _promoted_indexes[i].promoted_index_size > 0;
}
/// Get promoted index for the i-th entry.
/// Call only when has_promoted_index(i) is true.
const promoted_index& get_promoted_index(size_t i) const {
return _promoted_indexes[i];
}
/// Get promoted index for the i-th entry.
/// Call only when has_promoted_index(i) is true.
promoted_index& get_promoted_index(size_t i) {
return _promoted_indexes[i];
}
/// Get promoted index size for the i-th entry.
uint32_t get_promoted_index_size(size_t i) const {
return has_promoted_index(i) ? get_promoted_index(i).promoted_index_size : 0;
}
/// Get deletion_time for partition represented by the i-th entry.
/// Returns disengaged optional if the entry doesn't have a promoted index, so we don't know the deletion_time.
/// It has to be read from the data file.
std::optional<deletion_time> get_deletion_time(size_t i) const {
if (has_promoted_index(i)) {
return get_promoted_index(i).del_time;
}
return {};
}
key_view get_key(size_t i) const {
auto start = _entries[i].key_offset;
auto end = i + 1 < _entries.size() ? _entries[i + 1].key_offset : _key_storage.size();
auto v = managed_bytes_view(_key_storage).prefix(end);
v.remove_prefix(start);
return key_view(v);
}
decorated_key_view get_decorated_key(const schema& s, size_t i) const {
auto key = get_key(i);
auto t = _entries[i].token();
if (!t) {
t = dht::raw_token(s.get_partitioner().get_token(key));
_entries[i].raw_token = t.value;
}
return decorated_key_view(dht::token(t), key);
}
size_t external_memory_usage() const {
size_t size = _entries.external_memory_usage();
for (auto&& e : _entries) {
size += sizeof(index_entry) + e->external_memory_usage();
}
size += _promoted_indexes.external_memory_usage();
size += _key_storage.external_memory_usage();
return size;
}
};
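`get_key()` above recovers each key's extent from neighbouring offsets instead of storing a per-entry length: key i spans from its own offset to the next entry's offset (or to the end of the shared buffer for the last entry). A standalone sketch of that packing scheme, with hypothetical types (`std::string` standing in for `managed_bytes`):

```cpp
#include <cstddef>
#include <string>
#include <string_view>
#include <vector>

// Sketch: all keys concatenated in one buffer; each entry stores only
// the offset where its key starts. The i-th key ends where the
// (i+1)-th begins, or at the end of the buffer for the last entry.
struct packed_keys {
    std::string storage;          // concatenated key bytes
    std::vector<size_t> offsets;  // offsets[i] = start of key i

    void add(std::string_view key) {
        offsets.push_back(storage.size());
        storage.append(key);
    }
    std::string_view get(size_t i) const {
        size_t start = offsets[i];
        size_t end = i + 1 < offsets.size() ? offsets[i + 1] : storage.size();
        return std::string_view(storage).substr(start, end - start);
    }
};
```

This keeps the per-entry footprint fixed-size and scalar-only, which is what allows the entries themselves to stay trivially relocatable.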


@@ -25,14 +25,6 @@ namespace sstables {
extern seastar::logger sstlog;
extern thread_local mc::cached_promoted_index::metrics promoted_index_cache_metrics;
// Promoted index information produced by the parser.
struct parsed_promoted_index_entry {
deletion_time del_time;
uint64_t promoted_index_start;
uint32_t promoted_index_size;
uint32_t num_blocks;
};
// Partition index entry information produced by the parser.
struct parsed_partition_index_entry {
temporary_buffer<char> key;
@@ -53,9 +45,10 @@ class index_consumer {
schema_ptr _s;
logalloc::allocating_section _alloc_section;
logalloc::region& _region;
utils::chunked_vector<parsed_partition_index_entry> _parsed_entries;
size_t _max_promoted_index_entry_plus_one = 0; // Highest index +1 in _parsed_entries which has a promoted index.
size_t _key_storage_size = 0;
public:
index_list indexes;
index_consumer(logalloc::region& r, schema_ptr s)
: _s(s)
, _alloc_section(abstract_formatter([s] (fmt::format_context& ctx) {
@@ -64,36 +57,63 @@ public:
, _region(r)
{ }
~index_consumer() {
with_allocator(_region.allocator(), [&] {
indexes._entries.clear_and_release();
});
void consume_entry(parsed_partition_index_entry&& e) {
_key_storage_size += e.key.size();
_parsed_entries.emplace_back(std::move(e));
if (e.promoted_index) {
_max_promoted_index_entry_plus_one = std::max(_max_promoted_index_entry_plus_one, _parsed_entries.size());
}
}
void consume_entry(parsed_partition_index_entry&& e) {
_alloc_section(_region, [&] {
future<index_list> finalize() {
index_list result;
// In case of exception, need to deallocate under region allocator.
auto delete_result = seastar::defer([&] {
with_allocator(_region.allocator(), [&] {
managed_ref<promoted_index> pi;
if (e.promoted_index) {
pi = make_managed<promoted_index>(*_s,
e.promoted_index->del_time,
e.promoted_index->promoted_index_start,
e.promoted_index->promoted_index_size,
e.promoted_index->num_blocks);
}
auto key = managed_bytes(reinterpret_cast<const bytes::value_type*>(e.key.get()), e.key.size());
indexes._entries.emplace_back(make_managed<index_entry>(std::move(key), e.data_file_offset, std::move(pi)));
result._entries = {};
result._promoted_indexes = {};
result._key_storage = {};
});
});
auto i = _parsed_entries.begin();
size_t key_offset = 0;
while (i != _parsed_entries.end()) {
_alloc_section(_region, [&] {
with_allocator(_region.allocator(), [&] {
result._entries.reserve(_parsed_entries.size());
result._promoted_indexes.resize(_max_promoted_index_entry_plus_one);
if (result._key_storage.empty()) {
result._key_storage = managed_bytes(managed_bytes::initialized_later(), _key_storage_size);
}
managed_bytes_mutable_view key_out(result._key_storage);
key_out.remove_prefix(key_offset);
while (i != _parsed_entries.end()) {
parsed_partition_index_entry& e = *i;
if (e.promoted_index) {
result._promoted_indexes[result._entries.size()] = *e.promoted_index;
}
write_fragmented(key_out, std::string_view(e.key.begin(), e.key.size()));
result._entries.emplace_back(index_entry{dht::raw_token().value, e.data_file_offset, key_offset});
++i;
key_offset += e.key.size();
if (need_preempt()) {
break;
}
}
});
});
co_await coroutine::maybe_yield();
}
delete_result.cancel();
_parsed_entries.clear();
co_return std::move(result);
}
void prepare(uint64_t size) {
_alloc_section = logalloc::allocating_section();
_alloc_section(_region, [&] {
with_allocator(_region.allocator(), [&] {
indexes._entries.reserve(size);
});
});
_max_promoted_index_entry_plus_one = 0;
_key_storage_size = 0;
_parsed_entries.clear();
_parsed_entries.reserve(size);
}
};
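`finalize()` above arms a deferred cleanup that it cancels just before returning, so the partially built LSA state is torn down only on the exception path. The same pattern in miniature, with a hand-rolled guard standing in for `seastar::defer` and hypothetical names throughout:

```cpp
#include <stdexcept>
#include <vector>

// Minimal scope guard: runs `func` on scope exit unless cancel()ed.
template <typename F>
struct scoped_defer {
    F func;
    bool active = true;
    ~scoped_defer() { if (active) func(); }
    void cancel() { active = false; }
};
template <typename F> scoped_defer(F) -> scoped_defer<F>;

// Build a result under the guard. On exception the guard releases the
// partial result; on success cancel() hands ownership to the caller.
inline std::vector<int> build(bool fail, bool& cleaned) {
    std::vector<int> result;
    scoped_defer guard{[&] { result.clear(); cleaned = true; }};
    result.push_back(42);
    if (fail) {
        throw std::runtime_error("parse error");
    }
    guard.cancel();
    return result;
}
```

In the real code the cleanup additionally has to run under `with_allocator(_region.allocator(), ...)`, because the managed vectors must be freed by the same allocator that created them.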
@@ -198,10 +218,14 @@ public:
switch (_state) {
// START comes first, to make the handling of the 0-quantity case simpler
state_START:
case state::START:
sstlog.trace("{}: pos {} state {} - data.size()={}", fmt::ptr(this), current_pos(), state::START, data.size());
_state = state::KEY_SIZE;
break;
if (data.size() == 0) {
break;
}
[[fallthrough]];
case state::KEY_SIZE:
sstlog.trace("{}: pos {} state {}", fmt::ptr(this), current_pos(), state::KEY_SIZE);
_entry_offset = current_pos();
@@ -227,7 +251,16 @@ public:
case state::PROMOTED_SIZE:
sstlog.trace("{}: pos {} state {}", fmt::ptr(this), current_pos(), state::PROMOTED_SIZE);
_position = this->_u64;
if (read_vint_or_uint32(data) != continuous_data_consumer::read_status::ready) {
if (is_mc_format() && data.size() && *data.begin() == 0) { // promoted_index_size == 0
data.trim_front(1);
_consumer.consume_entry(parsed_partition_index_entry{
.key = std::move(_key),
.data_file_offset = _position,
.index_offset = _entry_offset,
.promoted_index = std::nullopt
});
goto state_START;
} else if (read_vint_or_uint32(data) != continuous_data_consumer::read_status::ready) {
_state = state::PARTITION_HEADER_LENGTH_1;
break;
}
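The fast path above works because, in the mc format, the promoted-index size is an unsigned vint and the value 0 is encoded as the single byte 0x00, so peeking one byte is enough to detect an entry with no promoted index without running the full vint decoder. A sketch of that peek (hypothetical helper, raw pointers standing in for the consumer's buffer type):

```cpp
#include <cstddef>
#include <cstdint>
#include <optional>

// If the next unsigned vint is 0 (a single 0x00 byte), consume it and
// report how many bytes were taken; otherwise leave the buffer untouched
// so the general vint decoder can run on it.
inline std::optional<size_t> consume_if_zero_vint(const uint8_t*& data, size_t& len) {
    if (len > 0 && data[0] == 0) {
        ++data;
        --len;
        return size_t{1};  // bytes consumed
    }
    return std::nullopt;
}
```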
@@ -339,33 +372,6 @@ inline file make_tracked_index_file(sstable& sst, reader_permit permit, tracing:
return tracing::make_traced_file(std::move(f), std::move(trace_state), format("{}:", sst.index_filename()));
}
inline
std::unique_ptr<clustered_index_cursor> promoted_index::make_cursor(shared_sstable sst,
reader_permit permit,
tracing::trace_state_ptr trace_state,
file_input_stream_options options,
use_caching caching)
{
if (sst->get_version() >= sstable_version_types::mc) [[likely]] {
seastar::shared_ptr<cached_file> cached_file_ptr = caching
? sst->_cached_index_file
: seastar::make_shared<cached_file>(make_tracked_index_file(*sst, permit, trace_state, caching),
sst->manager().get_cache_tracker().get_index_cached_file_stats(),
sst->manager().get_cache_tracker().get_lru(),
sst->manager().get_cache_tracker().region(),
sst->_index_file_size);
return std::make_unique<mc::bsearch_clustered_cursor>(*sst->get_schema(),
_promoted_index_start, _promoted_index_size,
promoted_index_cache_metrics, permit,
sst->get_column_translation(), cached_file_ptr, _num_blocks, trace_state, sst->features());
}
auto file = make_tracked_index_file(*sst, permit, std::move(trace_state), caching);
auto promoted_index_stream = make_file_input_stream(std::move(file), _promoted_index_start, _promoted_index_size, options);
return std::make_unique<scanning_clustered_index_cursor>(*sst->get_schema(), permit,
std::move(promoted_index_stream), _promoted_index_size, _num_blocks, std::nullopt);
}
// Less-comparator for lookups in the partition index.
class index_comparator {
dht::ring_position_comparator_for_sstables _tri_cmp;
@@ -376,27 +382,17 @@ public:
return _tri_cmp(e.get_decorated_key(), rp) < 0;
}
bool operator()(const index_entry& e, dht::ring_position_view rp) const {
return _tri_cmp(e.get_decorated_key(_tri_cmp.s), rp) < 0;
}
bool operator()(const managed_ref<index_entry>& e, dht::ring_position_view rp) const {
return operator()(*e, rp);
}
bool operator()(dht::ring_position_view rp, const managed_ref<index_entry>& e) const {
return operator()(rp, *e);
}
bool operator()(dht::ring_position_view rp, const summary_entry& e) const {
return _tri_cmp(e.get_decorated_key(), rp) > 0;
}
bool operator()(dht::ring_position_view rp, const index_entry& e) const {
return _tri_cmp(e.get_decorated_key(_tri_cmp.s), rp) > 0;
}
};
inline
std::strong_ordering index_entry_tri_cmp(const schema& s, partition_index_page& page, size_t idx, dht::ring_position_view rp) {
dht::ring_position_comparator_for_sstables tri_cmp(s);
return tri_cmp(page.get_decorated_key(s, idx), rp);
}
// Contains information about index_reader position in the index file
struct index_bound {
index_bound() = default;
@@ -537,7 +533,7 @@ private:
if (ex) {
return make_exception_future<index_list>(std::move(ex));
}
return make_ready_future<index_list>(std::move(bound.consumer->indexes));
return bound.consumer->finalize();
});
});
};
@@ -550,17 +546,18 @@ private:
if (bound.current_list->empty()) {
throw malformed_sstable_exception(format("missing index entry for summary index {} (bound {})", summary_idx, fmt::ptr(&bound)), _sstable->index_filename());
}
bound.data_file_position = bound.current_list->_entries[0]->position();
bound.data_file_position = bound.current_list->_entries[0].position();
bound.element = indexable_element::partition;
bound.end_open_marker.reset();
if (sstlog.is_enabled(seastar::log_level::trace)) {
sstlog.trace("index {} bound {}: page:", fmt::ptr(this), fmt::ptr(&bound));
logalloc::reclaim_lock rl(_region);
for (auto&& e : bound.current_list->_entries) {
for (size_t i = 0; i < bound.current_list->_entries.size(); ++i) {
auto& e = bound.current_list->_entries[i];
auto dk = dht::decorate_key(*_sstable->_schema,
e->get_key().to_partition_key(*_sstable->_schema));
sstlog.trace(" {} -> {}", dk, e->position());
bound.current_list->get_key(i).to_partition_key(*_sstable->_schema));
sstlog.trace(" {} -> {}", dk, e.position());
}
}
@@ -604,7 +601,13 @@ private:
// Valid if partition_data_ready(bound)
index_entry& current_partition_entry(index_bound& bound) {
parse_assert(bool(bound.current_list), _sstable->index_filename());
return *bound.current_list->_entries[bound.current_index_idx];
return bound.current_list->_entries[bound.current_index_idx];
}
// Valid if partition_data_ready(bound)
partition_index_page& current_page(index_bound& bound) {
parse_assert(bool(bound.current_list), _sstable->index_filename());
return *bound.current_list;
}
future<> advance_to_next_partition(index_bound& bound) {
@@ -617,7 +620,7 @@ private:
if (bound.current_index_idx + 1 < bound.current_list->size()) {
++bound.current_index_idx;
bound.current_pi_idx = 0;
bound.data_file_position = bound.current_list->_entries[bound.current_index_idx]->position();
bound.data_file_position = bound.current_list->_entries[bound.current_index_idx].position();
bound.element = indexable_element::partition;
bound.end_open_marker.reset();
return reset_clustered_cursor(bound);
@@ -680,9 +683,13 @@ private:
return advance_to_page(bound, summary_idx).then([this, &bound, pos, summary_idx] {
sstlog.trace("index {}: old page index = {}", fmt::ptr(this), bound.current_index_idx);
auto i = _alloc_section(_region, [&] {
auto& entries = bound.current_list->_entries;
return std::lower_bound(std::begin(entries) + bound.current_index_idx, std::end(entries), pos,
index_comparator(*_sstable->_schema));
auto& page = *bound.current_list;
auto& s = *_sstable->_schema;
auto r = std::views::iota(bound.current_index_idx, page._entries.size());
auto it = std::ranges::partition_point(r, [&] (int idx) {
return index_entry_tri_cmp(s, page, idx, pos) < 0;
});
return page._entries.begin() + bound.current_index_idx + std::ranges::distance(r.begin(), it);
});
// i is valid until next allocation point
auto& entries = bound.current_list->_entries;
@@ -697,7 +704,7 @@ private:
}
bound.current_index_idx = std::distance(std::begin(entries), i);
bound.current_pi_idx = 0;
bound.data_file_position = (*i)->position();
bound.data_file_position = (*i).position();
bound.element = indexable_element::partition;
bound.end_open_marker.reset();
sstlog.trace("index {}: new page index = {}, pos={}", fmt::ptr(this), bound.current_index_idx, bound.data_file_position);
@@ -800,6 +807,34 @@ public:
}
}
static
std::unique_ptr<clustered_index_cursor> make_cursor(const parsed_promoted_index_entry& pi,
shared_sstable sst,
reader_permit permit,
tracing::trace_state_ptr trace_state,
file_input_stream_options options,
use_caching caching)
{
if (sst->get_version() >= sstable_version_types::mc) [[likely]] {
seastar::shared_ptr<cached_file> cached_file_ptr = caching
? sst->_cached_index_file
: seastar::make_shared<cached_file>(make_tracked_index_file(*sst, permit, trace_state, caching),
sst->manager().get_cache_tracker().get_index_cached_file_stats(),
sst->manager().get_cache_tracker().get_lru(),
sst->manager().get_cache_tracker().region(),
sst->_index_file_size);
return std::make_unique<mc::bsearch_clustered_cursor>(*sst->get_schema(),
pi.promoted_index_start, pi.promoted_index_size,
promoted_index_cache_metrics, permit,
sst->get_column_translation(), cached_file_ptr, pi.num_blocks, trace_state, sst->features());
}
auto file = make_tracked_index_file(*sst, permit, std::move(trace_state), caching);
auto promoted_index_stream = make_file_input_stream(std::move(file), pi.promoted_index_start, pi.promoted_index_size, options);
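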
return std::make_unique<scanning_clustered_index_cursor>(*sst->get_schema(), permit,
std::move(promoted_index_stream), pi.promoted_index_size, pi.num_blocks, std::nullopt);
}
// Ensures that partition_data_ready() returns true.
// Can be called only when !eof()
future<> read_partition_data() override {
@@ -835,10 +870,10 @@ public:
clustered_index_cursor* current_clustered_cursor(index_bound& bound) {
if (!bound.clustered_cursor) {
_alloc_section(_region, [&] {
index_entry& e = current_partition_entry(bound);
promoted_index* pi = e.get_promoted_index().get();
if (pi) {
bound.clustered_cursor = pi->make_cursor(_sstable, _permit, _trace_state,
partition_index_page& page = current_page(bound);
if (page.has_promoted_index(bound.current_index_idx)) {
promoted_index& pi = page.get_promoted_index(bound.current_index_idx);
bound.clustered_cursor = make_cursor(pi, _sstable, _permit, _trace_state,
get_file_input_stream_options(), _use_caching);
}
});
@@ -861,15 +896,15 @@ public:
// It may be unavailable for old sstables for which this information was not generated.
// Can be called only when partition_data_ready().
std::optional<sstables::deletion_time> partition_tombstone() override {
return current_partition_entry(_lower_bound).get_deletion_time();
return current_page(_lower_bound).get_deletion_time(_lower_bound.current_index_idx);
}
// Returns the key for current partition.
// Can be called only when partition_data_ready().
std::optional<partition_key> get_partition_key() override {
return _alloc_section(_region, [this] {
index_entry& e = current_partition_entry(_lower_bound);
return e.get_key().to_partition_key(*_sstable->_schema);
return current_page(_lower_bound).get_key(_lower_bound.current_index_idx)
.to_partition_key(*_sstable->_schema);
});
}
@@ -883,8 +918,8 @@ public:
// Returns the number of promoted index entries for the current partition.
// Can be called only when partition_data_ready().
uint64_t get_promoted_index_size() {
index_entry& e = current_partition_entry(_lower_bound);
return e.get_promoted_index_size();
partition_index_page& page = current_page(_lower_bound);
return page.get_promoted_index_size(_lower_bound.current_index_idx);
}
bool partition_data_ready() const override {
@@ -975,9 +1010,9 @@ public:
return make_ready_future<bool>(false);
}
return read_partition_data().then([this, key] {
index_comparator cmp(*_sstable->_schema);
bool found = _alloc_section(_region, [&] {
return cmp(key, current_partition_entry(_lower_bound)) == 0;
auto& page = current_page(_lower_bound);
return index_entry_tri_cmp(*_sstable->_schema, page, _lower_bound.current_index_idx, key) == 0;
});
return make_ready_future<bool>(found);
});


@@ -189,10 +189,11 @@ public:
{}
future<std::optional<directory_entry>> get() override {
std::filesystem::path dir(_prefix);
do {
while (true) {
if (_pos == _info.size()) {
_info.clear();
_info = co_await _client->list_objects(_bucket, _prefix, _paging);
_pos = 0;
}
if (_info.empty()) {
break;
@@ -203,7 +204,7 @@ public:
continue;
}
co_return ent;
} while (false);
}
co_return std::nullopt;
}
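The `do { ... } while (false)` form changed above only allowed one pass: a `continue` on a skipped entry jumped to the (false) condition and fell out of the loop instead of refilling the page. With `while (true)`, `continue` re-enters the refill logic. The shape of the fixed loop, with hypothetical types (`std::vector<std::string>` in place of the S3 listing):

```cpp
#include <cstddef>
#include <functional>
#include <optional>
#include <string>
#include <vector>

// Refill-and-yield pagination: when the local page is exhausted, fetch
// the next one; an empty fetch means the listing is done. Unwanted
// entries are skipped with `continue`, which safely re-enters the
// refill check.
std::optional<std::string> next_entry(std::vector<std::string>& page, size_t& pos,
                                      const std::function<std::vector<std::string>()>& fetch_page) {
    while (true) {
        if (pos == page.size()) {
            page = fetch_page();
            pos = 0;
        }
        if (page.empty()) {
            break;
        }
        auto ent = page[pos++];
        if (ent.empty()) {
            continue;  // skip entries we don't want to surface
        }
        return ent;
    }
    return std::nullopt;
}
```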
@@ -276,7 +277,7 @@ public:
co_await f.close();
auto names = ranges | std::views::transform([](auto& p) { return p.name; }) | std::ranges::to<std::vector<std::string>>();
co_await _client->merge_objects(bucket, object, std::move(names), {}, as);
co_await _client->merge_objects(bucket, object, names, {}, as);
co_await parallel_for_each(names, [this, bucket](auto& name) -> future<> {
co_await _client->delete_object(bucket, name);


@@ -257,14 +257,11 @@ public:
while (partial_page || i != _cache.end()) {
if (partial_page) {
auto preempted = with_allocator(_region.allocator(), [&] {
while (!partial_page->empty()) {
partial_page->clear_one_entry();
if (need_preempt()) {
return true;
}
while (partial_page->clear_gently() != stop_iteration::yes) {
return true;
}
partial_page.reset();
return false;
return need_preempt();
});
if (preempted) {
auto key = (i != _cache.end()) ? std::optional(i->key()) : std::nullopt;


@@ -1094,7 +1094,6 @@ public:
friend class mc::writer;
friend class index_reader;
friend class promoted_index;
friend class sstables_manager;
template <typename DataConsumeRowsContext>
friend future<std::unique_ptr<DataConsumeRowsContext>>


@@ -436,7 +436,10 @@ tablet_stream_files(netw::messaging_service& ms, std::list<stream_blob_info> sou
stream_options.buffer_size = file_stream_buffer_size;
stream_options.read_ahead = file_stream_read_ahead;
for (auto& info : sources) {
for (auto&& source_info : sources) {
// Keep stream_blob_info alive only for the duration of streaming, allowing the file
// descriptor of the sstable component to be released right after it has been streamed.
auto info = std::exchange(source_info, {});
auto& filename = info.filename;
std::optional<input_stream<char>> fstream;
bool fstream_closed = false;
@@ -617,6 +620,7 @@ tablet_stream_files(netw::messaging_service& ms, std::list<stream_blob_info> sou
ops_id, filename, targets, total_size, get_bw(total_size, start_time));
}
}
co_await utils::get_local_injector().inject("tablet_stream_files_end_wait", utils::wait_for_message(std::chrono::seconds(60)));
if (error) {
blogger.warn("fstream[{}] Master failed sending files_nr={} files={} targets={} send_size={} bw={} error={}",
ops_id, sources.size(), sources, targets, ops_total_size, get_bw(ops_total_size, ops_start_time), error);
@@ -680,15 +684,20 @@ future<stream_files_response> tablet_stream_files_handler(replica::database& db,
if (files.empty()) {
co_return resp;
}
auto sstable_nr = sstables.size();
// Release the reference to the sstables to be streamed here. Since one sstable is
// streamed at a time, an sstable that has been compacted can have its disk space
// released right after that sstable's content has been fully streamed.
sstables.clear();
blogger.debug("stream_sstables[{}] Started sending sstable_nr={} files_nr={} files={} range={}",
req.ops_id, sstables.size(), files.size(), files, req.range);
req.ops_id, sstable_nr, files.size(), files, req.range);
auto ops_start_time = std::chrono::steady_clock::now();
auto files_nr = files.size();
size_t stream_bytes = co_await tablet_stream_files(ms, std::move(files), req.targets, req.table, req.ops_id, req.topo_guard);
resp.stream_bytes = stream_bytes;
auto duration = std::chrono::steady_clock::now() - ops_start_time;
blogger.info("stream_sstables[{}] Finished sending sstable_nr={} files_nr={} range={} stream_bytes={} stream_time={} stream_bw={}",
req.ops_id, sstables.size(), files_nr, req.range, stream_bytes, duration, get_bw(stream_bytes, ops_start_time));
req.ops_id, sstable_nr, files_nr, req.range, stream_bytes, duration, get_bw(stream_bytes, ops_start_time));
co_return resp;
}
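The `std::exchange(source_info, {})` in the loop above moves each element out of the container before working on it, so the moved-to local (and with it the open file descriptor) is destroyed at the end of that iteration, not when the whole loop finishes. The handoff in isolation (hypothetical `blob_info` with a string standing in for the held resource):

```cpp
#include <list>
#include <string>
#include <utility>

// Hypothetical resource slot: "holds" something while name is non-empty.
struct blob_info {
    std::string name;
};

// Move each element out of the container before processing it; the
// moved-to local is destroyed at the end of the iteration, releasing
// its resource early, and an empty shell is left behind in the list.
inline std::string drain_one_at_a_time(std::list<blob_info>& sources) {
    std::string processed;
    for (auto&& slot : sources) {
        auto info = std::exchange(slot, {});
        processed += info.name;  // "stream" it
        // `info` is destroyed here, before the next iteration begins
    }
    return processed;
}
```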


@@ -75,7 +75,7 @@ future<bool> table_helper::try_prepare(bool fallback, cql3::query_processor& qp,
auto& stmt = fallback ? _insert_cql_fallback.value() : _insert_cql;
try {
shared_ptr<cql_transport::messages::result_message::prepared> msg_ptr = co_await qp.prepare(stmt, qs.get_client_state(), dialect);
_prepared_stmt = std::move(msg_ptr->get_prepared());
_prepared_stmt = msg_ptr->get_prepared();
shared_ptr<cql3::cql_statement> cql_stmt = _prepared_stmt->statement;
_insert_stmt = dynamic_pointer_cast<cql3::statements::modification_statement>(cql_stmt);
_is_fallback_stmt = fallback;


@@ -400,7 +400,7 @@ task_manager::virtual_task::impl::impl(module_ptr module) noexcept
: _module(std::move(module))
{}
future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::get_children(module_ptr module, task_id parent_id, std::function<bool(locator::host_id)> is_host_alive) {
future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::get_children(module_ptr module, task_id parent_id, locator::token_metadata_ptr tmptr) {
auto ms = module->get_task_manager()._messaging;
if (!ms) {
auto ids = co_await module->get_task_manager().get_virtual_task_children(parent_id);
@@ -417,19 +417,18 @@ future<utils::chunked_vector<task_identity>> task_manager::virtual_task::impl::g
tmlogger.info("tasks_vt_get_children: waiting");
co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::seconds{10});
});
co_return co_await map_reduce(nodes, [ms, parent_id, is_host_alive = std::move(is_host_alive)] (auto host_id) -> future<utils::chunked_vector<task_identity>> {
if (is_host_alive(host_id)) {
return ser::tasks_rpc_verbs::send_tasks_get_children(ms, host_id, parent_id).then([host_id] (auto resp) {
return resp | std::views::transform([host_id] (auto id) {
return task_identity{
.host_id = host_id,
.task_id = id
};
}) | std::ranges::to<utils::chunked_vector<task_identity>>();
});
} else {
return make_ready_future<utils::chunked_vector<task_identity>>();
}
co_return co_await map_reduce(nodes, [ms, parent_id] (auto host_id) -> future<utils::chunked_vector<task_identity>> {
return ser::tasks_rpc_verbs::send_tasks_get_children(ms, host_id, parent_id).then([host_id] (auto resp) {
return resp | std::views::transform([host_id] (auto id) {
return task_identity{
.host_id = host_id,
.task_id = id
};
}) | std::ranges::to<utils::chunked_vector<task_identity>>();
}).handle_exception_type([host_id, parent_id] (const rpc::closed_error& ex) {
tmlogger.warn("Failed to get children of virtual task with id={} from node {}: {}", parent_id, host_id, ex);
return utils::chunked_vector<task_identity>{};
});
}, utils::chunked_vector<task_identity>{}, [] (auto a, auto&& b) {
std::move(b.begin(), b.end(), std::back_inserter(a));
return a;


@@ -19,6 +19,7 @@
#include "db_clock.hh"
#include "utils/log.hh"
#include "locator/host_id.hh"
#include "locator/token_metadata_fwd.hh"
#include "schema/schema_fwd.hh"
#include "tasks/types.hh"
#include "utils/chunked_vector.hh"
@@ -282,7 +283,7 @@ public:
impl& operator=(impl&&) = delete;
virtual ~impl() = default;
protected:
static future<utils::chunked_vector<task_identity>> get_children(module_ptr module, task_id parent_id, std::function<bool(locator::host_id)> is_host_alive);
static future<utils::chunked_vector<task_identity>> get_children(module_ptr module, task_id parent_id, locator::token_metadata_ptr tmptr);
public:
virtual task_group get_group() const noexcept = 0;
// Returns std::nullopt if an operation with task_id isn't tracked by this virtual_task.


@@ -423,14 +423,17 @@ def test_streams_operations(test_table_s, dynamodbstreams, metrics):
# to update latencies for one kind of operation (#17616, and compare #9406),
# and to do that checking that ..._count increases for that op is enough.
@contextmanager
def check_sets_latency(metrics, operation_names):
def check_sets_latency_by_metric(metrics, operation_names, metric_name):
the_metrics = get_metrics(metrics)
saved_latency_count = { x: get_metric(metrics, 'scylla_alternator_op_latency_count', {'op': x}, the_metrics) for x in operation_names }
saved_latency_count = { x: get_metric(metrics, f'{metric_name}_count', {'op': x}, the_metrics) for x in operation_names }
yield
the_metrics = get_metrics(metrics)
for op in operation_names:
# The total "count" on all shards should strictly increase
assert saved_latency_count[op] < get_metric(metrics, 'scylla_alternator_op_latency_count', {'op': op}, the_metrics)
assert saved_latency_count[op] < get_metric(metrics, f'{metric_name}_count', {'op': op}, the_metrics)
def check_sets_latency(metrics, operation_names):
return check_sets_latency_by_metric(metrics, operation_names, 'scylla_alternator_op_latency')
# Test latency metrics for PutItem, GetItem, DeleteItem, UpdateItem.
# We can't check what exactly the latency is - just that it gets updated.
@@ -446,6 +449,18 @@ def test_item_latency(test_table_s, metrics):
test_table_s.meta.client.batch_get_item(RequestItems = {
test_table_s.name: {'Keys': [{'p': random_string()}], 'ConsistentRead': True}})
def test_item_latency_per_table(test_table_s, metrics):
with check_sets_latency_by_metric(metrics, ['DeleteItem', 'GetItem', 'PutItem', 'UpdateItem', 'BatchWriteItem', 'BatchGetItem'], 'scylla_alternator_table_op_latency'):
p = random_string()
test_table_s.put_item(Item={'p': p})
test_table_s.get_item(Key={'p': p})
test_table_s.delete_item(Key={'p': p})
test_table_s.update_item(Key={'p': p})
test_table_s.meta.client.batch_write_item(RequestItems = {
test_table_s.name: [{'PutRequest': {'Item': {'p': random_string(), 'a': 'hi'}}}]})
test_table_s.meta.client.batch_get_item(RequestItems = {
test_table_s.name: {'Keys': [{'p': random_string()}], 'ConsistentRead': True}})
# Test latency metrics for GetRecords. Other Streams-related operations -
# ListStreams, DescribeStream, and GetShardIterator, have an operation
# count (tested above) but do NOT currently have a latency histogram.


@@ -18,7 +18,7 @@
#include <seastar/core/coroutine.hh>
#include <seastar/core/manual_clock.hh>
#include <seastar/util/later.hh>
#include <seastar/core/timer.hh>
#include <seastar/util/defer.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include <seastar/util/alloc_failure_injector.hh>
@@ -290,12 +290,17 @@ SEASTAR_THREAD_TEST_CASE(test_address_map_replication) {
m.set_expiring(id1);
BOOST_CHECK(m.find(id1) && *m.find(id1) == addr1);
m.barrier().get();
promise<> shard0_timer_expired;
timer<manual_clock> shard0_timer([&shard0_timer_expired] {
shard0_timer_expired.set_value();
});
shard0_timer.arm(manual_clock::now() + expiration_time);
m_svc.invoke_on(1, [] (address_map_t<manual_clock>& m) {
BOOST_CHECK(m.find(id1) && *m.find(id1) == addr1);
manual_clock::advance(expiration_time);
BOOST_CHECK(!m.find(id1));
return smp::submit_to(0, []{}); // Ensure shard 0 notices timer is expired.
}).get();
shard0_timer_expired.get_future().get();
BOOST_CHECK(!m.find(id1));
// Expiring entries are replicated


@@ -62,7 +62,11 @@ SEASTAR_TEST_CASE(test_index_doesnt_flood_cache_in_small_partition_workload) {
// cfg.db_config->index_cache_fraction.set(1.0);
return do_with_cql_env_thread([] (cql_test_env& e) {
// We disable compactions because they cause confusing cache mispopulations.
e.execute_cql("CREATE TABLE ks.t(pk blob PRIMARY KEY) WITH compaction = { 'class' : 'NullCompactionStrategy' };").get();
// We disable compression because the sstable writer targets a specific
// (*compressed* data file size : summary file size) ratio,
// so the number of keys per index page becomes hard to control,
// and might be arbitrarily large.
e.execute_cql("CREATE TABLE ks.t(pk blob PRIMARY KEY) WITH compaction = { 'class' : 'NullCompactionStrategy' } AND compression = {'sstable_compression': ''};").get();
auto insert_query = e.prepare("INSERT INTO ks.t(pk) VALUES (?)").get();
auto select_query = e.prepare("SELECT * FROM t WHERE pk = ?").get();
@@ -154,7 +158,11 @@ SEASTAR_TEST_CASE(test_index_is_cached_in_big_partition_workload) {
// cfg.db_config->index_cache_fraction.set(0.0);
return do_with_cql_env_thread([] (cql_test_env& e) {
// We disable compactions because they cause confusing cache mispopulations.
e.execute_cql("CREATE TABLE ks.t(pk bigint, ck bigint, v blob, primary key (pk, ck)) WITH compaction = { 'class' : 'NullCompactionStrategy' };").get();
// We disable compression because the sstable writer targets a specific
// (*compressed* data file size : summary file size) ratio,
// so the number of keys per index page becomes hard to control,
// and might be arbitrarily large.
e.execute_cql("CREATE TABLE ks.t(pk bigint, ck bigint, v blob, primary key (pk, ck)) WITH compaction = { 'class' : 'NullCompactionStrategy' } AND compression = {'sstable_compression': ''};").get();
auto insert_query = e.prepare("INSERT INTO ks.t(pk, ck, v) VALUES (?, ?, ?)").get();
auto select_query = e.prepare("SELECT * FROM t WHERE pk = ? AND ck = ?").get();


@@ -1111,6 +1111,30 @@ SEASTAR_TEST_CASE(test_snapshot_ctl_true_snapshots_size) {
});
}
SEASTAR_TEST_CASE(test_snapshot_ctl_details_exception_handling) {
#ifndef SCYLLA_ENABLE_ERROR_INJECTION
testlog.debug("Skipping test as it depends on error injection. Please run in mode where it's enabled (debug,dev).\n");
return make_ready_future();
#endif
return do_with_some_data_in_thread({"cf"}, [] (cql_test_env& e) {
sharded<db::snapshot_ctl> sc;
sc.start(std::ref(e.db()), std::ref(e.get_task_manager()), std::ref(e.get_sstorage_manager()), db::snapshot_ctl::config{}).get();
auto stop_sc = deferred_stop(sc);
auto& cf = e.local_db().find_column_family("ks", "cf");
take_snapshot(e).get();
utils::get_local_injector().enable("get_snapshot_details", true);
BOOST_REQUIRE_THROW(cf.get_snapshot_details().get(), std::runtime_error);
utils::get_local_injector().enable("per-snapshot-get_snapshot_details", true);
BOOST_REQUIRE_THROW(cf.get_snapshot_details().get(), std::runtime_error);
auto details = cf.get_snapshot_details().get();
BOOST_REQUIRE_EQUAL(details.size(), 1);
});
}
// toppartitions_query caused a lw_shared_ptr to cross shards when moving results, #5104
SEASTAR_TEST_CASE(toppartitions_cross_shard_schema_ptr) {
return do_with_cql_env_and_compaction_groups([] (cql_test_env& e) {
@@ -1857,7 +1881,7 @@ SEASTAR_THREAD_TEST_CASE(test_tombstone_gc_state_snapshot) {
schema_builder::register_schema_initializer([] (schema_builder& builder) {
if (builder.ks_name() == "test" && builder.cf_name() == "table_gc_mode_group0") {
builder.set_is_group0_table(true);
builder.set_is_group0_table();
}
});
auto table_gc_mode_group0 = schema_builder("test", "table_gc_mode_group0")


@@ -23,8 +23,11 @@
#include "test/lib/tmpdir.hh"
#include "test/lib/random_utils.hh"
#include "test/lib/exception_utils.hh"
#include "utils/limiting_data_source.hh"
#include "utils/io-wrappers.hh"
#include <seastar/util/memory-data-source.hh>
using namespace encryption;
static tmpdir dir;
@@ -595,6 +598,113 @@ SEASTAR_TEST_CASE(test_encrypted_data_source_simple) {
co_await test_random_data_source(sizes);
}
// Reproduces the production deadlock where encrypted SSTable component downloads
// got stuck during restore. The encrypted_data_source::get() caches a block in
// _next, then on the next call bypasses input_stream::read()'s _eof check and
// calls input_stream::read_exactly() — which does NOT check _eof when _buf is
// empty. This causes a second get() on the underlying source after EOS.
//
// In production the underlying source was chunked_download_source whose get()
// hung forever. Here we simulate it with a strict source that fails the test.
//
// The fix belongs in seastar's input_stream::read_exactly(): check _eof before
// calling _fd.get(), consistent with read(), read_up_to(), and consume().
static future<> test_encrypted_source_copy(size_t plaintext_size) {
testlog.info("test_encrypted_source_copy: plaintext_size={}", plaintext_size);
key_info info{"AES/CBC", 256};
auto k = ::make_shared<symmetric_key>(info);
// Step 1: Encrypt the plaintext into memory buffers
auto plaintext = generate_random<char>(plaintext_size);
std::vector<temporary_buffer<char>> encrypted_bufs;
{
data_sink sink(make_encrypted_sink(create_memory_sink(encrypted_bufs), k));
co_await sink.put(plaintext.clone());
co_await sink.close();
}
// Flatten encrypted buffers into a single contiguous buffer
size_t encrypted_total = 0;
for (const auto& b : encrypted_bufs) {
encrypted_total += b.size();
}
temporary_buffer<char> encrypted(encrypted_total);
size_t pos = 0;
for (const auto& b : encrypted_bufs) {
std::copy(b.begin(), b.end(), encrypted.get_write() + pos);
pos += b.size();
}
// Step 2: Create a data source from the encrypted data that fails on
// post-EOS get() — simulating a source like chunked_download_source
// that would hang forever in this situation.
class strict_memory_source final : public limiting_data_source_impl {
bool _eof = false;
public:
strict_memory_source(temporary_buffer<char> data, size_t chunk_size)
: limiting_data_source_impl(
data_source(std::make_unique<util::temporary_buffer_data_source>(std::move(data))),
[chunk_size] { return chunk_size; }) {}
future<temporary_buffer<char>> get() override {
BOOST_REQUIRE_MESSAGE(!_eof,
"get() called on source after it already returned EOS — "
"this is the production deadlock: read_exactly() does not "
"check _eof before calling _fd.get()");
auto buf = co_await limiting_data_source_impl::get();
_eof = buf.empty();
co_return buf;
}
};
// Step 3: Wrap in encrypted_data_source and drain via consume() —
// the exact code path used by seastar::copy() which is what
// sstables_loader_helpers::download_sstable() calls.
// Try multiple chunk sizes to hit different alignment scenarios.
for (size_t chunk_size : {1ul, 7ul, 4096ul, 8192ul, encrypted_total, encrypted_total + 1}) {
if (chunk_size == 0) continue;
auto src = data_source(make_encrypted_source(
data_source(std::make_unique<strict_memory_source>(encrypted.clone(), chunk_size)), k));
auto in = input_stream<char>(std::move(src));
// consume() is what seastar::copy() uses internally. It calls
// encrypted_data_source::get() via _fd.get() until EOF.
size_t total_decrypted = 0;
co_await in.consume([&total_decrypted](temporary_buffer<char> buf) {
total_decrypted += buf.size();
return make_ready_future<consumption_result<char>>(continue_consuming{});
});
co_await in.close();
BOOST_REQUIRE_EQUAL(total_decrypted, plaintext_size);
}
}
SEASTAR_TEST_CASE(test_encrypted_source_copy_8k) {
co_await test_encrypted_source_copy(8192);
}
SEASTAR_TEST_CASE(test_encrypted_source_copy_4k) {
co_await test_encrypted_source_copy(4096);
}
SEASTAR_TEST_CASE(test_encrypted_source_copy_small) {
co_await test_encrypted_source_copy(100);
}
SEASTAR_TEST_CASE(test_encrypted_source_copy_12k) {
co_await test_encrypted_source_copy(12288);
}
SEASTAR_TEST_CASE(test_encrypted_source_copy_unaligned) {
co_await test_encrypted_source_copy(8193);
}
SEASTAR_TEST_CASE(test_encrypted_source_copy_1byte) {
co_await test_encrypted_source_copy(1);
}
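The EOF-check fix described in the comment above (check `_eof` before calling `_fd.get()` in `read_exactly()`) can be modeled outside seastar. This is a hedged sketch with illustrative names (`model_source`, `model_stream`), not seastar's actual `input_stream` internals:

```cpp
#include <algorithm>
#include <cassert>
#include <deque>
#include <string>

// Illustrative stand-ins for a data source and an input stream; an empty
// chunk from get() signals end-of-stream (EOS).
struct model_source {
    std::deque<std::string> chunks;
    bool eos = false;
    int gets_after_eos = 0;   // in production this call would hang forever
    std::string get() {
        if (eos) { ++gets_after_eos; return {}; }
        if (chunks.empty()) { eos = true; return {}; }
        std::string c = std::move(chunks.front());
        chunks.pop_front();
        return c;
    }
};

struct model_stream {
    model_source src;
    std::string buf;          // plays the role of _buf
    bool eof = false;         // plays the role of _eof
    // Buggy shape: when buf is empty, get() is called without checking eof,
    // so a read issued after EOS pulls from the source again.
    std::string read_exactly_buggy(size_t n) { return read_impl(n, false); }
    // Fixed shape: check eof first, consistent with read()/read_up_to().
    std::string read_exactly_fixed(size_t n) { return read_impl(n, true); }
private:
    std::string read_impl(size_t n, bool check_eof) {
        std::string out;
        while (out.size() < n) {
            if (buf.empty()) {
                if (check_eof && eof) break;   // the fix
                buf = src.get();
                if (buf.empty()) { eof = true; break; }
            }
            size_t take = std::min(n - out.size(), buf.size());
            out.append(buf, 0, take);
            buf.erase(0, take);
        }
        return out;
    }
};
```

Reading past EOS with the buggy variant calls get() on the source again after it already returned an empty chunk, which is exactly the post-EOS get() that strict_memory_source above asserts against.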
SEASTAR_TEST_CASE(test_encrypted_data_source_fuzzy) {
std::mt19937_64 rand_gen(std::random_device{}());


@@ -252,7 +252,7 @@ SEASTAR_TEST_CASE(test_group0_batch) {
// (group0 mutations are not allowed on non-group0 tables)
schema_builder::register_schema_initializer([](schema_builder& builder) {
if (builder.cf_name() == "test_group0_batch") {
builder.set_is_group0_table(true);
builder.set_is_group0_table();
}
});
@@ -345,4 +345,29 @@ SEASTAR_TEST_CASE(test_group0_batch) {
});
}
SEASTAR_TEST_CASE(test_group0_tables_use_schema_commitlog) {
return do_with_cql_env([] (cql_test_env& e) {
schema_builder::register_schema_initializer([](schema_builder& builder) {
if (builder.cf_name() == "test_group0_tables_use_schema_commitlog1") {
builder.set_is_group0_table();
}
});
auto test_group0_tables_use_schema_commitlog1 = schema_builder("test", "test_group0_tables_use_schema_commitlog1")
.with_column("pk", utf8_type, column_kind::partition_key)
.build();
auto test_group0_tables_use_schema_commitlog2 = schema_builder("test", "test_group0_tables_use_schema_commitlog2")
.with_column("pk", utf8_type, column_kind::partition_key)
.build();
BOOST_REQUIRE(test_group0_tables_use_schema_commitlog1->static_props().is_group0_table);
BOOST_REQUIRE(test_group0_tables_use_schema_commitlog1->static_props().use_schema_commitlog);
BOOST_REQUIRE(!test_group0_tables_use_schema_commitlog2->static_props().is_group0_table);
BOOST_REQUIRE(!test_group0_tables_use_schema_commitlog2->static_props().use_schema_commitlog);
return make_ready_future();
});
}
BOOST_AUTO_TEST_SUITE_END()


@@ -1004,7 +1004,20 @@ SEASTAR_TEST_CASE(memtable_flush_compresses_mutations) {
}, db_config);
}
SEASTAR_TEST_CASE(memtable_flush_period) {
static auto check_has_error_injection() {
return boost::unit_test::precondition([](auto){
return
#ifdef SCYLLA_ENABLE_ERROR_INJECTION
true
#else
false
#endif
;
});
}
SEASTAR_TEST_CASE(memtable_flush_period, *check_has_error_injection()) {
#ifdef SCYLLA_ENABLE_ERROR_INJECTION
auto db_config = make_shared<db::config>();
db_config->enable_cache.set(false);
return do_with_cql_env_thread([](cql_test_env& env) {
@@ -1028,6 +1041,9 @@ SEASTAR_TEST_CASE(memtable_flush_period) {
t.apply(m);
BOOST_REQUIRE_EQUAL(t.sstables_count(), 0); // add mutation and check there are no sstables for this table
auto& errj = utils::get_local_injector();
errj.enable("table_seal_post_flush_waiters", true);
// change schema to set memtable flush period
// we use a small value in this test, but a period below 60000ms cannot be set using an ALTER TABLE statement
schema_builder b(t.schema());
@@ -1035,8 +1051,10 @@ SEASTAR_TEST_CASE(memtable_flush_period) {
schema_ptr s2 = b.build();
t.set_schema(s2);
sleep(500ms).get(); // wait until memtable flush starts at least once
BOOST_REQUIRE(t.sstables_count() == 1 || t.get_stats().pending_flushes > 0); // flush started
BOOST_TEST_MESSAGE("Wait for flush");
errj.inject("table_seal_post_flush_waiters", utils::wait_for_message(std::chrono::minutes(2))).get();
BOOST_TEST_MESSAGE("Flush received");
BOOST_REQUIRE(eventually_true([&] { // wait until memtable will be flushed at least once
return t.sstables_count() == 1;
}));
@@ -1047,6 +1065,10 @@ SEASTAR_TEST_CASE(memtable_flush_period) {
.produces(m)
.produces_end_of_stream();
}, db_config);
#else
BOOST_TEST_MESSAGE("Skipping test as it depends on error injection. Please run in a mode where it's enabled (debug, dev)");
return make_ready_future<>();
#endif
}
SEASTAR_TEST_CASE(sstable_compaction_does_not_resurrect_data) {


@@ -1990,6 +1990,116 @@ SEASTAR_TEST_CASE(test_reverse_cursor_refreshing_on_nonevictable_snapshot_with_e
});
}
// Reproducer for SCYLLADB-1253 (https://github.com/scylladb/scylladb/issues/18732)
// A reversed query with many overlapping range tombstones and a single live row
// near the end of the tombstone range fails to return the live row.
//
// The bug is in partition_snapshot_row_cursor::maybe_refresh(), in the
// !is_in_latest_version() path. When the cursor is reversed and positioned
// above all entries in the latest version (in table order), it incorrectly
// removes the latest version's entry from the heap, causing the live row
// to be skipped.
//
// This test creates a multi-version partition snapshot:
// - v0 (older): contains overlapping range tombstones
// - v1 (latest): contains the live row with a higher timestamp
// and directly exercises the cursor to verify that advance+maybe_refresh
// correctly keeps the live row in the heap during reversed traversal.
SEASTAR_TEST_CASE(test_reversed_maybe_refresh_keeps_latest_version_entry) {
return seastar::async([] {
logalloc::region region;
mutation_application_stats app_stats;
mutation_cleaner cleaner(region, no_cache_tracker, app_stats);
simple_schema ss(simple_schema::with_static::no);
auto s = ss.schema();
auto rev_s = s->make_reversed();
const int num_tombstones = 100;
const int range_span = num_tombstones; // each range covers [i, i + range_span)
const int row_ck = 5; // below all range tombstone boundary entries
// Step 1: Create a partition_entry with all range tombstones.
auto pe_ptr = with_allocator(region.allocator(), [&] {
logalloc::allocating_section as;
return as(region, [&] () -> std::unique_ptr<partition_entry> {
mutation m(s, ss.make_pkey(0));
for (int i = 0; i < num_tombstones; ++i) {
auto range = query::clustering_range::make(
query::clustering_range::bound(ss.make_ckey(i), true),
query::clustering_range::bound(ss.make_ckey(i + range_span), false));
ss.delete_range(m, range);
}
return std::make_unique<partition_entry>(*s, std::move(m.partition()));
});
});
// Step 2: Take a snapshot to pin the current version (v0 with tombstones).
auto snap1 = with_allocator(region.allocator(), [&] {
logalloc::allocating_section as;
return as(region, [&] {
return pe_ptr->read(region, cleaner, no_cache_tracker);
});
});
// Step 3: Apply the live row. Since v0 is pinned by snap1,
// this creates v1 (latest version) with just the live row.
with_allocator(region.allocator(), [&] {
logalloc::allocating_section as;
as(region, [&] {
mutation m(s, ss.make_pkey(0));
ss.add_row(m, ss.make_ckey(row_ck), "live_value");
pe_ptr->apply(region, cleaner, *s, m.partition(), *m.schema(), app_stats);
});
});
// Step 4: Take a second snapshot (sees both versions) and test cursor.
auto snap2 = with_allocator(region.allocator(), [&] {
logalloc::allocating_section as;
return as(region, [&] {
return pe_ptr->read(region, cleaner, no_cache_tracker);
});
});
{
logalloc::reclaim_lock rl(region);
partition_snapshot_row_cursor cursor(*rev_s, *snap2, false /* unique_owner */, true /* reversed */);
// Position cursor at the very end (in reversed/query order, this means
// the highest table-order position).
cursor.maybe_advance_to(position_in_partition_view::before_all_clustered_rows());
bool has_row = cursor.at_a_row();
// Traverse all entries in reversed order, calling maybe_refresh()
// before processing each row. This simulates what
// partition_snapshot_reader::next_interval() does and is where
// the bug manifests.
bool found_live_row = false;
while (has_row) {
cursor.maybe_refresh();
if (!cursor.dummy()) {
found_live_row = true;
break;
}
has_row = cursor.next();
}
BOOST_REQUIRE_MESSAGE(found_live_row,
fmt::format("Reversed cursor failed to find the live row at ck={}. "
"The !is_in_latest_version() path in maybe_refresh() "
"incorrectly removed the latest version's entry from the heap.",
row_ck));
}
// Cleanup
snap2 = {};
snap1 = {};
with_allocator(region.allocator(), [&] {
pe_ptr.reset();
});
});
}
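The traversal exercised above is essentially a reverse k-way merge over snapshot versions. A minimal model (hypothetical names, not the actual cursor code) shows why a version's entry must stay in the merge heap while that version still has rows; removing it, as the buggy `maybe_refresh()` path did, silently loses its rows:

```cpp
#include <algorithm>
#include <cassert>
#include <queue>
#include <tuple>
#include <vector>

// Illustrative model: each version holds clustering keys in ascending table
// order; a reversed read merges all versions in descending order via a
// max-heap keyed on the next entry of each version.
std::vector<int> reversed_merge(std::vector<std::vector<int>> versions) {
    // (key, version index, position within version), max-heap on key
    using item = std::tuple<int, size_t, size_t>;
    std::priority_queue<item> heap;
    for (size_t v = 0; v < versions.size(); ++v) {
        if (!versions[v].empty()) {
            heap.emplace(versions[v].back(), v, versions[v].size() - 1);
        }
    }
    std::vector<int> out;
    while (!heap.empty()) {
        auto [key, v, pos] = heap.top();
        heap.pop();
        out.push_back(key);
        // Re-insert the version's next (lower) entry; dropping it here
        // instead would skip the rest of that version's rows.
        if (pos > 0) heap.emplace(versions[v][pos - 1], v, pos - 1);
    }
    return out;
}
```

In the test above, v0 carries the tombstone boundary entries and v1 carries the single live row; the merge must still surface v1's key even when the cursor sits past it in table order.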
SEASTAR_TEST_CASE(test_apply_to_incomplete_with_dummies) {
return seastar::async([] {


@@ -1499,7 +1499,7 @@ SEASTAR_THREAD_TEST_CASE(tablets_simple_rack_aware_view_pairing_test) {
base_host,
base_erm,
view_erm,
*ars_ptr,
true, // uses NTS
base_token,
view_token,
use_tablets,


@@ -719,7 +719,7 @@ SEASTAR_THREAD_TEST_CASE(test_dht_subtract_ranges) {
auto get_random_ranges = [&] (size_t max_count) {
auto count = tests::random::get_int<size_t>(1, max_count);
dht::partition_range_vector ranges;
utils::chunked_vector<dht::partition_range> ranges;
ranges.reserve(count);
for (size_t i = 0; i < count; i++) {


@@ -2644,7 +2644,10 @@ SEASTAR_TEST_CASE(test_exception_safety_of_update_from_memtable) {
return rd;
};
populate_range(cache, population_range);
{
memory::scoped_critical_alloc_section dfg;
populate_range(cache, population_range);
}
auto rd1_v1 = assert_that(make_reader(population_range));
mutation_reader_opt snap;
auto close_snap = defer([&snap] {


@@ -20,16 +20,24 @@ static void add_entry(logalloc::region& r,
const schema& s,
partition_index_page& page,
const partition_key& key,
uint64_t position)
uint64_t position,
std::optional<parsed_promoted_index_entry> promoted_index = std::nullopt)
{
logalloc::allocating_section as;
as(r, [&] {
with_allocator(r.allocator(), [&] {
sstables::key sst_key = sstables::key::from_partition_key(s, key);
page._entries.push_back(make_managed<index_entry>(
managed_bytes(sst_key.get_bytes()),
position,
managed_ref<promoted_index>()));
auto key_offset = page._key_storage.size();
auto old_storage = std::move(page._key_storage);
page._key_storage = managed_bytes(managed_bytes::initialized_later(), key_offset + sst_key.get_bytes().size());
auto out = managed_bytes_mutable_view(page._key_storage);
write_fragmented(out, managed_bytes_view(old_storage));
write_fragmented(out, single_fragmented_view(bytes_view(sst_key)));
page._entries.push_back(index_entry{dht::raw_token_opt()->value, position, key_offset});
if (promoted_index) {
page._promoted_indexes.resize(page._entries.size());
page._promoted_indexes[page._entries.size() - 1] = *promoted_index;
}
});
});
}
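The key-storage packing shown in add_entry above (keys appended back-to-back into one buffer, each entry recording the offset where its key starts) can be sketched in isolation; the names here are illustrative, and the real code grows a fragment-aware managed_bytes via write_fragmented, but the offset bookkeeping is the same idea:

```cpp
#include <cassert>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

// Minimal model of packed key storage: instead of one allocation per key,
// all keys live contiguously in a single buffer and each entry stores the
// (offset, length) of its key within that buffer.
struct packed_keys {
    std::string storage;                              // ~ _key_storage
    std::vector<std::pair<size_t, size_t>> entries;   // (offset, length)

    void add(std::string_view key) {
        // Record where this key will start, then append its bytes.
        entries.emplace_back(storage.size(), key.size());
        storage.append(key);
    }
    std::string_view key(size_t i) const {
        auto [off, len] = entries[i];
        return std::string_view(storage).substr(off, len);
    }
};
```

This layout trades per-key allocations for one growing buffer, which is why add_entry rebuilds _key_storage and rewrites the old contents before appending the new key.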
@@ -54,10 +62,10 @@ static partition_index_page make_page0(logalloc::region& r, simple_schema& s) {
static void has_page0(partition_index_cache::entry_ptr ptr) {
BOOST_REQUIRE(!ptr->empty());
BOOST_REQUIRE_EQUAL(ptr->_entries.size(), 4);
BOOST_REQUIRE_EQUAL(ptr->_entries[0]->position(), 0);
BOOST_REQUIRE_EQUAL(ptr->_entries[1]->position(), 1);
BOOST_REQUIRE_EQUAL(ptr->_entries[2]->position(), 2);
BOOST_REQUIRE_EQUAL(ptr->_entries[3]->position(), 3);
BOOST_REQUIRE_EQUAL(ptr->_entries[0].position(), 0);
BOOST_REQUIRE_EQUAL(ptr->_entries[1].position(), 1);
BOOST_REQUIRE_EQUAL(ptr->_entries[2].position(), 2);
BOOST_REQUIRE_EQUAL(ptr->_entries[3].position(), 3);
};
SEASTAR_THREAD_TEST_CASE(test_caching) {
@@ -139,6 +147,59 @@ SEASTAR_THREAD_TEST_CASE(test_caching) {
}
}
SEASTAR_THREAD_TEST_CASE(test_sparse_promoted_index) {
::lru lru;
simple_schema s;
logalloc::region r;
partition_index_cache_stats stats;
partition_index_cache cache(lru, r, stats);
auto page0_loader = [&] (partition_index_cache::key_type k) -> future<partition_index_page> {
partition_index_page page;
auto destroy_page = defer([&] {
with_allocator(r.allocator(), [&] {
auto p = std::move(page);
});
});
add_entry(r, *s.schema(), page, s.make_pkey(0).key(), 0);
add_entry(r, *s.schema(), page, s.make_pkey(1).key(), 1, parsed_promoted_index_entry{
.promoted_index_start = 1,
.promoted_index_size = 10,
.num_blocks = 3
});
add_entry(r, *s.schema(), page, s.make_pkey(2).key(), 2);
add_entry(r, *s.schema(), page, s.make_pkey(3).key(), 3, parsed_promoted_index_entry{
.promoted_index_start = 2,
.promoted_index_size = 13,
.num_blocks = 1
});
add_entry(r, *s.schema(), page, s.make_pkey(4).key(), 4);
destroy_page.cancel();
co_return std::move(page);
};
auto page = cache.get_or_load(0, page0_loader).get();
BOOST_REQUIRE_EQUAL(page->has_promoted_index(0), false);
BOOST_REQUIRE_EQUAL(page->has_promoted_index(1), true);
BOOST_REQUIRE_EQUAL(page->has_promoted_index(2), false);
BOOST_REQUIRE_EQUAL(page->has_promoted_index(3), true);
BOOST_REQUIRE_EQUAL(page->has_promoted_index(4), false);
BOOST_REQUIRE_EQUAL(page->get_promoted_index(1).promoted_index_start, 1);
BOOST_REQUIRE_EQUAL(page->get_promoted_index(1).promoted_index_size, 10);
BOOST_REQUIRE_EQUAL(page->get_promoted_index(1).num_blocks, 3);
BOOST_REQUIRE_EQUAL(page->get_promoted_index(3).promoted_index_start, 2);
BOOST_REQUIRE_EQUAL(page->get_promoted_index(3).promoted_index_size, 13);
BOOST_REQUIRE_EQUAL(page->get_promoted_index(3).num_blocks, 1);
with_allocator(r.allocator(), [&] {
lru.evict_all();
});
}
template <typename T>
static future<> ignore_result(future<T>&& f) {
return f.then_wrapped([] (auto&& f) {


@@ -1607,6 +1607,29 @@ future<> apply_resize_plan(token_metadata& tm, const migration_plan& plan) {
}
}
static
future<group0_guard> save_token_metadata(cql_test_env& e, group0_guard guard) {
auto& stm = e.local_db().get_shared_token_metadata();
auto tm = stm.get();
e.get_topology_state_machine().local()._topology.version = tm->get_version();
co_await save_tablet_metadata(e.local_db(), tm->tablets(), guard.write_timestamp());
utils::chunked_vector<frozen_mutation> muts;
muts.push_back(freeze(topology_mutation_builder(guard.write_timestamp())
.set_version(tm->get_version())
.build().to_mutation(db::system_keyspace::topology())));
co_await e.local_db().apply(muts, db::no_timeout);
co_await e.get_storage_service().local().update_tablet_metadata({});
// Need a new guard to make sure later changes use later timestamp.
// Also, so that the table layer processes the changes we persisted, which is important for splits.
// Before we can finalize a split, the storage group needs to process the split by creating split-ready compaction groups.
release_guard(std::move(guard));
abort_source as;
co_return co_await e.get_raft_group0_client().start_operation(as);
}
static
future<> handle_resize_finalize(cql_test_env& e, group0_guard& guard, const migration_plan& plan, shared_load_stats* load_stats) {
auto& talloc = e.get_tablet_allocator().local();
@@ -1626,19 +1649,14 @@ future<> handle_resize_finalize(cql_test_env& e, group0_guard& guard, const migr
co_await stm.mutate_token_metadata([table_id, &new_tmap, &changed] (token_metadata& tm) {
changed = true;
tm.tablets().set_tablet_map(table_id, std::move(new_tmap));
tm.set_version(tm.get_version() + 1);
return make_ready_future<>();
});
}
if (changed) {
// Need to reload on each resize because the table object expects the tablet count to change by a factor of 2.
co_await save_tablet_metadata(e.local_db(), stm.get()->tablets(), guard.write_timestamp());
co_await e.get_storage_service().local().update_tablet_metadata({});
// Need a new guard to make sure later changes use later timestamp.
release_guard(std::move(guard));
abort_source as;
guard = co_await e.get_raft_group0_client().start_operation(as);
guard = co_await save_token_metadata(e, std::move(guard));
if (load_stats) {
auto new_tm = stm.get();
@@ -1647,6 +1665,11 @@ future<> handle_resize_finalize(cql_test_env& e, group0_guard& guard, const migr
load_stats->stats = *reconciled_stats;
}
}
testlog.debug("Calling local_topology_barrier()");
old_tm = nullptr;
co_await e.get_storage_service().local().local_topology_barrier();
testlog.debug("Finished local_topology_barrier()");
}
}
@@ -1750,13 +1773,22 @@ void do_rebalance_tablets(cql_test_env& e,
}).get();
if (auto_split && load_stats) {
bool reload = false;
auto& tm = *stm.get();
for (const auto& [table, tmap]: tm.tablets().all_tables_ungrouped()) {
if (std::holds_alternative<resize_decision::split>(tmap->resize_decision().way)) {
testlog.debug("set_split_ready_seq_number({}, {})", table, tmap->resize_decision().sequence_number);
load_stats->set_split_ready_seq_number(table, tmap->resize_decision().sequence_number);
if (load_stats->stats.tables[table].split_ready_seq_number != tmap->resize_decision().sequence_number) {
testlog.debug("set_split_ready_seq_number({}, {})", table, tmap->resize_decision().sequence_number);
load_stats->set_split_ready_seq_number(table, tmap->resize_decision().sequence_number);
reload = true;
}
}
}
// Need to order split-ack before split finalization, storage_group assumes that.
if (reload) {
guard = save_token_metadata(e, std::move(guard)).get();
}
}
handle_resize_finalize(e, guard, plan, load_stats).get();


@@ -331,4 +331,28 @@ SEASTAR_THREAD_TEST_CASE(test_stale_version_notification) {
std::cerr.rdbuf(oldCerr);
BOOST_TEST(my_stream.str().find("topology version 0 held for") != std::string::npos);
}
}
SEASTAR_THREAD_TEST_CASE(test_raw_token) {
const auto t1 = dht::token::from_int64(1);
const auto t2 = dht::token::from_int64(2);
dht::raw_token_opt rt_opt;
BOOST_REQUIRE(!rt_opt);
rt_opt = dht::raw_token(t1);
BOOST_REQUIRE(*rt_opt == t1);
BOOST_REQUIRE(dht::raw_token() == dht::minimum_token());
BOOST_REQUIRE(dht::raw_token() < dht::raw_token(dht::first_token()));
BOOST_REQUIRE(dht::raw_token() < dht::first_token());
BOOST_REQUIRE(dht::raw_token() < dht::maximum_token());
auto rt1 = dht::raw_token(t1);
BOOST_REQUIRE(bool(rt1));
BOOST_REQUIRE(rt1 > dht::raw_token());
BOOST_REQUIRE(rt1 > dht::minimum_token());
BOOST_REQUIRE_EQUAL(rt1, t1);
BOOST_REQUIRE(rt1 == t1);
BOOST_REQUIRE(rt1 < t2);
BOOST_REQUIRE(rt1 < dht::maximum_token());
}


@@ -57,6 +57,20 @@ BOOST_AUTO_TEST_CASE(test_null_is_not_empty) {
BOOST_REQUIRE(empty != null);
}
BOOST_AUTO_TEST_CASE(test_null_data_value_to_parsable_string) {
auto null_utf8 = data_value::make_null(utf8_type);
BOOST_REQUIRE_EQUAL(null_utf8.to_parsable_string(), "null");
auto null_int = data_value::make_null(int32_type);
BOOST_REQUIRE_EQUAL(null_int.to_parsable_string(), "null");
auto null_list = data_value::make_null(list_type_impl::get_instance(int32_type, true));
BOOST_REQUIRE_EQUAL(null_list.to_parsable_string(), "null");
auto null_map = data_value::make_null(map_type_impl::get_instance(utf8_type, int32_type, true));
BOOST_REQUIRE_EQUAL(null_map.to_parsable_string(), "null");
}
BOOST_AUTO_TEST_CASE(test_bytes_type_string_conversions) {
BOOST_REQUIRE(bytes_type->equal(bytes_type->from_string("616263646566"), bytes_type->decompose(data_value(bytes{"abcdef"}))));
}


@@ -3221,6 +3221,87 @@ SEASTAR_TEST_CASE(test_view_update_generating_writetime) {
});
}
// Usually, if only an unselected column in the base table is modified, we expect an optimization
// where the view update is skipped. However, we had a bug (https://scylladb.atlassian.net/browse/SCYLLADB-808)
// where the existence of a collection selected in the view caused us to skip this optimization,
// even when the collection was not modified. This test reproduces that bug.
SEASTAR_TEST_CASE(test_view_update_unmodified_collection) {
// In this test we verify that we correctly skip (or don't skip) view updates to a view that
// selects a collection column. We use two MVs, similarly to the test above.
return do_with_cql_env_thread([] (cql_test_env& e) {
auto f1 = e.local_view_builder().wait_until_built("ks", "mv1");
auto f2 = e.local_view_builder().wait_until_built("ks", "mv2");
e.execute_cql("CREATE TABLE t (k int, c int, a int, b list<int>, g int, primary key(k, c))").get();
e.execute_cql("CREATE MATERIALIZED VIEW mv1 AS SELECT k,c,a,b FROM t "
"WHERE k IS NOT NULL AND c IS NOT NULL PRIMARY KEY (c, k)").get();
e.execute_cql("CREATE MATERIALIZED VIEW mv2 AS SELECT k,c,a,b FROM t "
"WHERE k IS NOT NULL AND c IS NOT NULL AND a IS NOT NULL PRIMARY KEY (c, k, a)").get();
f1.get();
f2.get();
auto total_t_view_updates = [&] {
return e.db().map_reduce0([] (replica::database& local_db) {
const db::view::stats& local_stats = local_db.find_column_family("ks", "t").get_view_stats();
return local_stats.view_updates_pushed_local + local_stats.view_updates_pushed_remote;
}, 0, std::plus<int64_t>()).get();
};
auto total_mv1_updates = [&] {
return e.db().map_reduce0([] (replica::database& local_db) {
return local_db.find_column_family("ks", "mv1").get_stats().writes.hist.count;
}, 0, std::plus<int64_t>()).get();
};
auto total_mv2_updates = [&] {
return e.db().map_reduce0([] (replica::database& local_db) {
return local_db.find_column_family("ks", "mv2").get_stats().writes.hist.count;
}, 0, std::plus<int64_t>()).get();
};
::shared_ptr<cql_transport::messages::result_message> msg;
e.execute_cql("INSERT INTO t (k, c, a) VALUES (1, 1, 1)").get();
eventually([&] {
const update_counter results{total_mv1_updates(), total_mv2_updates(), total_t_view_updates()};
const update_counter expected{1, 1, 2};
BOOST_REQUIRE_EQUAL(results, expected);
});
// We update an unselected column and the collection remains NULL, so we should generate an
// update to the virtual column in mv1 but not to mv2.
e.execute_cql("UPDATE t SET g=1 WHERE k=1 AND c=1;").get();
eventually([&] {
const update_counter results{total_mv1_updates(), total_mv2_updates(), total_t_view_updates()};
const update_counter expected{2, 1, 3};
BOOST_REQUIRE_EQUAL(results, expected);
});
// We update the collection with an initial value
e.execute_cql("UPDATE t SET b=[1] WHERE k=1 AND c=1;").get();
eventually([&] {
const update_counter results{total_mv1_updates(), total_mv2_updates(), total_t_view_updates()};
const update_counter expected{3, 2, 5};
BOOST_REQUIRE_EQUAL(results, expected);
});
// We update an unselected column again with a non-NULL selected collection. Because the liveness of the updated column is unchanged
// and no other selected column is updated (in particular, the collection column), we should generate no view updates.
e.execute_cql("UPDATE t SET g=2 WHERE k=1 AND c=1;").get();
eventually([&] {
const update_counter results{total_mv1_updates(), total_mv2_updates(), total_t_view_updates()};
const update_counter expected{3, 2, 5};
BOOST_REQUIRE_EQUAL(results, expected);
});
});
}
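The skip-view-update decision the test above exercises can be modeled in a simplified form. This is a hedged sketch, not the actual Scylla logic, and it deliberately ignores the liveness/virtual-column subtlety that makes mv1 and mv2 diverge in the first UPDATE: the core rule is that an update may be skipped only when no selected column is modified, and a selected collection that the write leaves untouched must not by itself force an update (that was the bug):

```cpp
#include <cassert>
#include <set>
#include <string>

// Hedged model: does a base-table write need to produce a view update?
bool needs_view_update(const std::set<std::string>& modified_columns,
                       const std::set<std::string>& view_selected_columns) {
    for (const auto& c : modified_columns) {
        if (view_selected_columns.count(c)) {
            return true;   // a selected column changed: view rows must change
        }
    }
    return false;          // only unselected columns changed: skip the update
}
```

With the schema above, `UPDATE t SET g=2 ...` modifies only the unselected column g, so no update is needed even though the selected collection b is non-NULL; `UPDATE t SET b=[1] ...` modifies the selected collection and does need one.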
SEASTAR_TEST_CASE(test_conflicting_batch) {
return do_with_cql_env_thread([] (cql_test_env& e) {


@@ -254,27 +254,3 @@ async def test_node_ops_task_wait(manager: ManagerClient):
await decommission_task
await waiting_task
@pytest.mark.asyncio
async def test_get_children(manager: ManagerClient):
module_name = "node_ops"
tm = TaskManagerClient(manager.api)
servers = [await manager.server_add(cmdline=cmdline) for _ in range(2)]
injection = "tasks_vt_get_children"
handler = await inject_error_one_shot(manager.api, servers[0].ip_addr, injection)
log = await manager.server_open_log(servers[0].server_id)
mark = await log.mark()
bootstrap_task = [task for task in await tm.list_tasks(servers[0].ip_addr, module_name) if task.kind == "cluster"][0]
async def _decommission():
await log.wait_for('tasks_vt_get_children: waiting', from_mark=mark)
await manager.decommission_node(servers[1].server_id)
await handler.message()
async def _get_status():
await tm.get_task_status(servers[0].ip_addr, bootstrap_task.task_id)
await asyncio.gather(*(_decommission(), _get_status()))


@@ -12,9 +12,11 @@ import pytest
from test.pylib.internal_types import ServerInfo
from test.pylib.manager_client import ManagerClient
from test.pylib.repair import create_table_insert_data_for_repair, get_tablet_task_id
from test.pylib.rest_client import read_barrier
from test.pylib.tablets import get_all_tablet_replicas
from test.cluster.conftest import skip_mode
from test.cluster.util import create_new_test_keyspace, new_test_keyspace
from test.cluster.util import create_new_test_keyspace, new_test_keyspace, get_topology_coordinator, find_server_by_host_id
from test.cluster.test_incremental_repair import trigger_tablet_merge
from test.cluster.test_tablets2 import inject_error_on
from test.cluster.tasks.task_manager_client import TaskManagerClient
from test.cluster.tasks.task_manager_types import TaskStatus, TaskStats
@@ -96,6 +98,50 @@ async def test_tablet_repair_task(manager: ManagerClient):
await asyncio.gather(repair_task(), check_and_abort_repair_task(manager, tm, servers, module_name, ks))
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_tablet_repair_wait_with_table_drop(manager: ManagerClient):
module_name = "tablets"
tm = TaskManagerClient(manager.api)
injection = "tablet_virtual_task_wait"
cmdline = [
'--logger-log-level', 'debug_error_injection=debug',
]
servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager, cmdline=cmdline)
assert module_name in await tm.list_modules(servers[0].ip_addr), "tablets module wasn't registered"
token = -1
await enable_injection(manager, servers, "repair_tablet_fail_on_rpc_call")
await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, await_completion=False)
repair_tasks = await wait_tasks_created(tm, servers[0], module_name, 1, "user_repair", keyspace=ks)
task = repair_tasks[0]
assert task.scope == "table"
assert task.keyspace == ks
assert task.table == "test"
assert task.state in ["created", "running"]
log = await manager.server_open_log(servers[0].server_id)
mark = await log.mark()
await enable_injection(manager, [servers[0]], injection)
async def wait_for_task():
status_wait = await tm.wait_for_task(servers[0].ip_addr, task.task_id)
assert status_wait.state == "done"
async def drop_table():
await log.wait_for(f'"{injection}"', from_mark=mark)
await disable_injection(manager, servers, "repair_tablet_fail_on_rpc_call")
await manager.get_cql().run_async(f"DROP TABLE {ks}.test")
await manager.api.message_injection(servers[0].ip_addr, injection)
await asyncio.gather(wait_for_task(), drop_table())
await disable_injection(manager, servers, injection)
async def check_repair_task_list(tm: TaskManagerClient, servers: list[ServerInfo], module_name: str, keyspace: str):
def get_task_with_id(repair_tasks, task_id):
tasks_with_id1 = [task for task in repair_tasks if task.task_id == task_id]
@@ -151,6 +197,45 @@ async def test_tablet_repair_task_list(manager: ManagerClient):
await asyncio.gather(run_repair(0, "test"), run_repair(1, "test2"), run_repair(2, "test3"), check_repair_task_list(tm, servers, module_name, ks))
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_tablet_repair_wait(manager: ManagerClient):
module_name = "tablets"
tm = TaskManagerClient(manager.api)
stop_repair_injection = "repair_tablet_repair_task_impl_run"
servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager)
assert module_name in await tm.list_modules(servers[0].ip_addr), "tablets module wasn't registered"
await inject_error_on(manager, stop_repair_injection, servers)
await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", "all", await_completion=False)
repair_tasks = await wait_tasks_created(tm, servers[0], module_name, 1, "user_repair", keyspace=ks)
task = repair_tasks[0]
log = await manager.server_open_log(servers[0].server_id)
mark = await log.mark()
async def wait_for_task():
await enable_injection(manager, servers, "tablet_virtual_task_wait")
status_wait = await tm.wait_for_task(servers[0].ip_addr, task.task_id)
async def merge_tablets():
await log.wait_for('tablet_virtual_task: wait until tablet operation is finished', from_mark=mark)
# Resume repair.
await message_injection(manager, servers, stop_repair_injection)
# Merge tablets.
coord = await find_server_by_host_id(manager, servers, await get_topology_coordinator(manager))
log2 = await manager.server_open_log(coord.server_id)
await trigger_tablet_merge(manager, servers, [log2])
await read_barrier(manager.api, servers[0].ip_addr)
await message_injection(manager, servers, "tablet_virtual_task_wait")
await asyncio.gather(wait_for_task(), merge_tablets())
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_tablet_repair_task_children(manager: ManagerClient):


@@ -0,0 +1,70 @@
#
# Copyright (C) 2026-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#
import logging
import asyncio
import time
import pytest
from test.cluster.util import get_current_group0_config
from test.pylib.manager_client import ManagerClient
from test.pylib.rest_client import read_barrier
from test.pylib.util import wait_for
logger = logging.getLogger(__name__)
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_bootstrap_with_quick_group0_join(manager: ManagerClient):
"""Regression test for https://scylladb.atlassian.net/browse/SCYLLADB-959.
The bug was that when the bootstrapping node joined group0 before reaching
post_server_start, it skipped post_server_start and thus hung forever.
The test simulates the scenario by starting the second node with the
join_group0_pause_before_config_check injection. Without the fix, the
startup times out.
"""
logger.info("Adding first server")
s1 = await manager.server_add()
logger.info("Adding second server with join_group0_pause_before_config_check enabled")
s2 = await manager.server_add(start=False, config={
'error_injections_at_startup': ['join_group0_pause_before_config_check']
})
logger.info(f"Starting {s2}")
start_task = asyncio.create_task(manager.server_start(s2.server_id))
s2_log = await manager.server_open_log(s2.server_id)
await s2_log.wait_for("join_group0_pause_before_config_check: waiting for message", timeout=60)
s1_host_id = await manager.get_host_id(s1.server_id)
s2_host_id = await manager.get_host_id(s2.server_id)
async def s2_in_group0_config_on_s1():
config = await get_current_group0_config(manager, s1)
ids = {m[0] for m in config}
assert s1_host_id in ids # sanity check
return True if s2_host_id in ids else None
# Note: we would like to wait for s2 to see itself in the group0 config, but we can't execute
# get_current_group0_config for s2, as s2 doesn't handle CQL requests at this point. As a workaround, we wait for s1
# to see s2 and then perform a read barrier on s2.
logger.info(f"Waiting for {s1} to see {s2} in the group0 config")
await wait_for(s2_in_group0_config_on_s1, deadline=time.time() + 60, period=0.1)
logger.info(f"Performing read barrier on {s2} to make sure it sees itself in the group0 config")
await read_barrier(manager.api, s2.ip_addr)
logger.info(f"Unblocking {s2}")
await manager.api.message_injection(s2.ip_addr, 'join_group0_pause_before_config_check')
logger.info(f"Waiting for {s2} to complete bootstrap")
await asyncio.wait_for(start_task, timeout=60)
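The `s2_in_group0_config_on_s1` predicate above returns `None` to keep polling and a truthy value to stop. A minimal self-contained sketch of the polling contract this assumes of `test.pylib.util.wait_for` (the names here are illustrative, not the real implementation):

```python
import asyncio
import time

# Assumed contract: the predicate returns None to keep polling and
# any non-None value to stop and return it; the deadline bounds the wait.
async def wait_for(pred, deadline, period=0.1):
    while True:
        value = await pred()
        if value is not None:
            return value
        if time.time() > deadline:
            raise TimeoutError("deadline reached while polling")
        await asyncio.sleep(period)

async def demo():
    calls = {"n": 0}
    async def ready():
        calls["n"] += 1
        # Mirrors the `return True if ... else None` idiom in the test.
        return True if calls["n"] >= 3 else None
    return await wait_for(ready, time.time() + 5, period=0.01)

result = asyncio.run(demo())
```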


@@ -54,9 +54,9 @@ async def run_test_cache_tombstone_gc(manager: ManagerClient, statement_pairs: l
" AND compaction = {'class': 'NullCompactionStrategy'}")
for write_statement, delete_statement in statement_pairs:
execute_with_tracing(cql, write_statement.format(ks=ks), log = True)
execute_with_tracing(cql, SimpleStatement(write_statement.format(ks=ks), consistency_level=ConsistencyLevel.ALL), log = True)
await manager.api.enable_injection(node3.ip_addr, "database_apply", one_shot=False)
execute_with_tracing(cql, delete_statement.format(ks=ks), log = True)
execute_with_tracing(cql, SimpleStatement(delete_statement.format(ks=ks), consistency_level=ConsistencyLevel.LOCAL_QUORUM), log = True)
await manager.api.disable_injection(node3.ip_addr, "database_apply")
def check_data(host, data):


@@ -117,17 +117,18 @@ async def create_encrypted_cf(manager: ManagerClient, ks: str,
return new_test_table(manager, ks, columns, extra)
async def prepare_write_workload(cql: CassandraSession, table_name, flush=True, n: int = None):
"""write some data"""
keys = list(range(n if n else 100))
async def prepare_write_workload(cql: CassandraSession, table_name, flush=True, n: int = None) -> list[str]:
"""write some data, returns list of written partition keys"""
key_ids = list(range(n if n else 100))
c1_values = ['value1']
c2_values = ['value2']
statement = cql.prepare(f"INSERT INTO {table_name} (key, c1, c2) VALUES (?, ?, ?)")
statement.consistency_level = ConsistencyLevel.ALL
keys = [f"k{x}" for x in key_ids]
await asyncio.gather(*[cql.run_async(statement, params) for params in
list(map(lambda x, y, z: [f"k{x}", y, z], keys,
list(map(lambda x, y, z: [x, y, z], keys,
itertools.cycle(c1_values),
itertools.cycle(c2_values)))]
)
@@ -135,10 +136,14 @@ async def prepare_write_workload(cql: CassandraSession, table_name, flush=True,
if flush:
nodetool.flush(cql, table_name)
async def read_verify_workload(cql: CassandraSession, table_name: str, expected_len: int = 100):
"""check written data"""
rows = list(cql.execute(f"SELECT c1, c2 FROM {table_name}"))
assert len(rows) == expected_len
return keys
async def read_verify_workload(cql: CassandraSession, table_name: str, keys: list[str]):
"""check written data using single-partition queries"""
statement = cql.prepare(f"SELECT c1, c2 FROM {table_name} WHERE key = ?")
rows = await asyncio.gather(*[cql.run_async(statement, [key]) for key in keys])
for key, result in zip(keys, rows):
assert len(list(result)) == 1, f"Expected 1 row for key={key}, got {len(list(result))}"
async def _smoke_test(manager: ManagerClient, key_provider: KeyProviderFactory,
ciphers: dict[str, list[int]], compression: str = None,
@@ -167,8 +172,8 @@ async def _smoke_test(manager: ManagerClient, key_provider: KeyProviderFactory,
compression=compression,
additional_options=additional_options
))
await prepare_write_workload(cql, table_name=table_name)
cfs.append(table_name)
keys = await prepare_write_workload(cql, table_name=table_name)
cfs.append((table_name, keys))
except Exception as e:
if exception_handler:
exception_handler(e, cipher_algorithm, secret_key_strength)
@@ -176,12 +181,12 @@ async def _smoke_test(manager: ManagerClient, key_provider: KeyProviderFactory,
raise e
# restart the cluster
if restart:
await restart(manager, servers, cfs)
await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
await restart(manager, servers, [table_name for table_name, _ in cfs])
cql, _ = await manager.get_ready_cql(servers)
else:
await manager.rolling_restart(servers)
for table_name in cfs:
await read_verify_workload(cql, table_name=table_name)
for table_name, keys in cfs:
await read_verify_workload(cql, table_name=table_name, keys=keys)
# default: 'AES/CBC/PKCS5Padding', length 128
supported_cipher_algorithms = {
@@ -363,7 +368,7 @@ async def test_alter(manager, key_provider):
table_names[0], False,
expected_data=expected_data)
await read_verify_workload(cql, table_name=table_names[0])
await read_verify_workload(cql, table_name=table_names[0], keys=[row[0] for row in expected_data])
# enable encryption again
options = key_provider.additional_cf_options()
cql.execute(f"ALTER TABLE {table_names[0]} with scylla_encryption_options={options}")
@@ -433,7 +438,8 @@ async def test_non_existant_table_master_key(manager: ManagerClient, tmpdir):
async def test_system_auth_encryption(manager: ManagerClient, tmpdir):
cfg = {"authenticator": "org.apache.cassandra.auth.PasswordAuthenticator",
"authorizer": "org.apache.cassandra.auth.CassandraAuthorizer"}
"authorizer": "org.apache.cassandra.auth.CassandraAuthorizer",
"commitlog_sync": "batch" }
servers: list[ServerInfo] = await manager.servers_add(servers_num = 1, config=cfg,
driver_connect_opts={'auth_provider': PlainTextAuthProvider(username='cassandra', password='cassandra')})
@@ -450,11 +456,14 @@ async def test_system_auth_encryption(manager: ManagerClient, tmpdir):
file_paths = [f for f in file_paths if os.path.isfile(f) and not os.path.islink(f)]
for file_path in file_paths:
with open(file_path, 'rb') as f:
data = f.read()
if pbytes in data:
pattern_found_counter += 1
logger.debug("Pattern '%s' found in %s", pattern, file_path)
try:
with open(file_path, 'rb') as f:
data = f.read()
if pbytes in data:
pattern_found_counter += 1
logger.debug("Pattern '%s' found in %s", pattern, file_path)
except FileNotFoundError:
pass # assume just compacted away
if expect:
assert pattern_found_counter > 0
@@ -462,15 +471,15 @@ async def test_system_auth_encryption(manager: ManagerClient, tmpdir):
assert pattern_found_counter == 0
async def verify_system_info(expect: bool):
user = f"user_{str(uuid.uuid4())}"
user = f"user_{str(uuid.uuid4())}".replace('-','_')
pwd = f"pwd_{str(uuid.uuid4())}"
cql.execute(f"CREATE USER {user} WITH PASSWORD '{pwd}' NOSUPERUSER")
assert_one(cql, f"LIST ROLES of {user}", [user, False, True, {}])
logger.debug("Verify PART 1: check commitlogs -------------")
grep_database_files(pwd, "commitlog", "**/*.log", expect)
grep_database_files(user, "commitlog", "**/*.log", True)
await grep_database_files(pwd, "commitlog", "**/*.log", False)
await grep_database_files(user, "commitlog", "**/*.log", expect)
salted_hash = None
system_auth = None
@@ -487,39 +496,38 @@ async def test_system_auth_encryption(manager: ManagerClient, tmpdir):
assert salted_hash is not None
assert system_auth is not None
grep_database_files(salted_hash, "commitlog", "**/*.log", expect)
await grep_database_files(salted_hash, "commitlog", "**/*.log", expect)
rand_comment = f"comment_{str(uuid.uuid4())}"
async with await create_ks(manager) as ks:
async with await new_test_table(cql, ks, "key text PRIMARY KEY, c1 text, c2 text") as table:
async with new_test_table(manager, ks, "key text PRIMARY KEY, c1 text, c2 text") as table:
cql.execute(f"ALTER TABLE {table} WITH comment = '{rand_comment}'")
grep_database_files(rand_comment, "commitlog/schema", "**/*.log", expect)
nodetool.flush_all(cql)
await grep_database_files(rand_comment, "commitlog/schema", "**/*.log", expect)
# Note: the original test grepped in sstables. That no longer works
# since all system tables are compressed, so binary grepping will
# not find anything. We could do scylla sstable dump-data and grep in the
# json, but this is somewhat pointless: at best it would just
# decrypt the info from the sstable, so we can't really verify anything.
# We could maybe check that the expected system tables are in fact encrypted,
# though this is more a promise than a guarantee... Also, the only tables
# encrypted are paxos and batchlog -> pointless
logger.debug("Verify PART 2: check sstable files -------------\n`system_info_encryption` won't encrypt sstable files on disk")
logger.debug("GREP_DB_FILES: Check PM key user in sstable file ....")
grep_database_files(user, f"data/{system_auth}/", "**/*-Data.db", expect=True)
logger.debug("GREP_DB_FILES: Check original password in commitlogs .... Original password should never be saved")
grep_database_files(pwd, f"data/{system_auth}/", "**/*-Data.db", expect=False)
logger.debug("GREP_DB_FILES: Check salted_hash of password in sstable file ....")
grep_database_files(salted_hash, f"data/{system_auth}/", "**/*-Data.db", expect=False)
logger.debug("GREP_DB_FILES: Check table comment in sstable file ....")
grep_database_files(rand_comment, "data/system_schema/", "**/*-Data.db", expect=True)
verify_system_info(True) # not encrypted
await verify_system_info(True) # not encrypted
cfg = {"system_info_encryption": {
"enabled": True,
"key_provider": "LocalFileSystemKeyProviderFactory"}
"key_provider": "LocalFileSystemKeyProviderFactory"},
"system_key_directory": os.path.join(tmpdir, "resources/system_keys")
}
for server in servers:
manager.server_update_config(server.server_id, config_options=cfg)
await manager.server_update_config(server.server_id, config_options=cfg)
await manager.server_restart(server.server_id)
await manager.rolling_restart(servers)
verify_system_info(False) # should not see stuff now
await verify_system_info(False) # should not see stuff now
async def test_system_encryption_reboot(manager: ManagerClient, tmpdir):


@@ -8,8 +8,11 @@ import asyncio
import time
import pytest
import logging
from functools import partial
from test.pylib.manager_client import ManagerClient
from test.cluster.conftest import skip_mode
from test.pylib.util import wait_for
from test.pylib.internal_types import ServerInfo
logger = logging.getLogger(__name__)
@@ -17,6 +20,26 @@ logger = logging.getLogger(__name__)
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_crashed_node_substitution(manager: ManagerClient):
"""Test that a node which crashed after starting gossip but before joining group0
(an 'orphan' node) is eventually removed from gossip by the gossiper_orphan_remover_fiber.
The scenario:
1. Start 3 nodes with the 'fast_orphan_removal_fiber' injection enabled. This freezes
the gossiper_orphan_remover_fiber on each node before it enters its polling loop,
so it cannot remove any orphan until explicitly unblocked.
2. Start a 4th node with the 'crash_before_group0_join' injection enabled. This node
starts gossip normally but blocks inside pre_server_start(), just before sending
the join RPC to the topology coordinator. It never joins group0.
3. Wait until the 4th node's gossip state has fully propagated to all 3 running peers,
then trigger its crash via the injection. At this point all peers see it as an orphan:
present in gossip but absent from the group0 topology.
4. Assert the orphan is visible in gossip (live or down) on the surviving nodes.
5. Unblock the gossiper_orphan_remover_fiber on all 3 nodes (via message_injection) and
enable the 'speedup_orphan_removal' injection so the fiber removes the orphan immediately
without waiting for the normal 60-second age threshold.
6. Wait for the 'Finished to force remove node' log line confirming removal, then assert
the orphan is no longer present in gossip.
"""
servers = await manager.servers_add(3, config={
'error_injections_at_startup': ['fast_orphan_removal_fiber']
})
@@ -31,10 +54,24 @@ async def test_crashed_node_substitution(manager: ManagerClient):
log = await manager.server_open_log(failed_server.server_id)
await log.wait_for("finished do_send_ack2_msg")
failed_id = await manager.get_host_id(failed_server.server_id)
# Wait until the failed server's gossip state has propagated to all running peers.
# "finished do_send_ack2_msg" only guarantees that one peer completed a gossip round
# with the failed server; other nodes learn about it only in subsequent gossip rounds.
# Querying gossip before propagation completes would cause the assertion below to fail
# because the orphan node would not yet appear as live or down on every peer.
async def gossip_has_node(server: ServerInfo):
live = await manager.api.client.get_json("/gossiper/endpoint/live", host=server.ip_addr)
down = await manager.api.client.get_json("/gossiper/endpoint/down", host=server.ip_addr)
return True if failed_server.ip_addr in live + down else None
for s in servers:
await wait_for(partial(gossip_has_node, s), deadline=time.time() + 30)
await manager.api.message_injection(failed_server.ip_addr, 'crash_before_group0_join')
await task
live_eps = await manager.api.client.get_json("/gossiper/endpoint/live", host=servers[0].ip_addr)
down_eps = await manager.api.client.get_json("/gossiper/endpoint/down", host=servers[0].ip_addr)


@@ -87,7 +87,7 @@ async def test_limited_concurrency_of_writes(manager: ManagerClient):
})
node2 = await manager.server_add()
cql = manager.get_cql()
cql = await manager.get_cql_exclusive(node1)
async with new_test_keyspace(manager, "WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 2}") as ks:
table = f"{ks}.t"
await cql.run_async(f"CREATE TABLE {table} (pk int primary key, v int)")


@@ -312,14 +312,28 @@ async def test_tablet_incremental_repair_error(manager: ManagerClient):
token = -1
map0 = await load_tablet_sstables_repaired_at(manager, cql, servers[0], hosts[0], table_id)
# Repair should not finish with error
# Repair should not finish while the injection is enabled. We abort the task
# before turning the injection off, otherwise it may continue in background
# and increase sstables_repaired_at.
await inject_error_on(manager, "repair_tablet_fail_on_rpc_call", servers)
try:
await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, incremental_mode='incremental', timeout=10)
assert False # Check the tablet repair is not supposed to finish
except TimeoutError:
logger.info("Repair timeout as expected")
await inject_error_off(manager, "repair_tablet_fail_on_rpc_call", servers)
repair_response = await manager.api.tablet_repair(
servers[0].ip_addr,
ks,
"test",
token,
await_completion=False,
incremental_mode='incremental',
)
task_id = repair_response['tablet_task_id']
with pytest.raises(asyncio.TimeoutError):
await asyncio.wait_for(manager.api.wait_task(servers[0].ip_addr, task_id), timeout=10)
await manager.api.abort_task(servers[0].ip_addr, task_id)
await manager.api.wait_task(servers[0].ip_addr, task_id)
finally:
await inject_error_off(manager, "repair_tablet_fail_on_rpc_call", servers)
map1 = await load_tablet_sstables_repaired_at(manager, cql, servers[0], hosts[0], table_id)
@@ -609,14 +623,19 @@ async def do_test_tablet_incremental_repair_merge_error(manager, error):
scylla_path = get_scylla_path(cql)
coord = await get_topology_coordinator(manager)
coord_serv = await find_server_by_host_id(manager, servers, coord)
coord_log = await manager.server_open_log(coord_serv.server_id)
# Trigger merge and error in merge
s1_mark = await logs[0].mark()
await inject_error_on(manager, error, servers[:1])
mark = await coord_log.mark()
await inject_error_on(manager, error, [coord_serv])
await inject_error_on(manager, "tablet_force_tablet_count_decrease", servers)
await logs[0].wait_for(f'Got {error}', from_mark=s1_mark)
await inject_error_on(manager, "tablet_force_tablet_count_decrease_once", servers)
await coord_log.wait_for(f'Got {error}', from_mark=mark)
await inject_error_off(manager, "tablet_force_tablet_count_decrease", servers)
await manager.server_stop(servers[0].server_id)
await manager.server_start(servers[0].server_id)
await manager.server_stop(coord_serv.server_id)
await manager.server_start(coord_serv.server_id)
for server in servers:
await manager.server_stop_gracefully(server.server_id)
@@ -862,50 +881,6 @@ async def test_repair_sigsegv_with_diff_shard_count(manager: ManagerClient, use_
logger.info("Starting vnode repair")
await manager.api.repair(servers[1].ip_addr, ks, "test")
# Reproducer for https://github.com/scylladb/scylladb/issues/27365
# Incremental repair vs tablet merge
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_tablet_incremental_repair_tablet_merge_compaction_group_gone(manager: ManagerClient):
cmdline = ['--logger-log-level', 'repair=debug']
servers, cql, hosts, ks, table_id, logs, _, _, _, _ = await preapre_cluster_for_incremental_repair(manager, cmdline=cmdline)
coord = await get_topology_coordinator(manager)
coord_serv = await find_server_by_host_id(manager, servers, coord)
coord_log = await manager.server_open_log(coord_serv.server_id)
# Trigger merge and wait until the merge fiber starts
s1_mark = await coord_log.mark()
await inject_error_on(manager, "merge_completion_fiber", servers)
await inject_error_on(manager, "tablet_force_tablet_count_decrease_once", servers)
await inject_error_on(manager, "tablet_force_tablet_count_decrease", servers)
await coord_log.wait_for(f'Detected tablet merge for table', from_mark=s1_mark)
await inject_error_off(manager, "tablet_force_tablet_count_decrease", servers)
await coord_log.wait_for(f'merge_completion_fiber: waiting for message', from_mark=s1_mark)
# Trigger repair and wait for the inc repair prepare preparation to start
s1_mark = await coord_log.mark()
await inject_error_on(manager, "wait_after_prepare_sstables_for_incremental_repair", servers)
await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token=-1, await_completion=False, incremental_mode='incremental')
# Wait for preparation to start.
await coord_log.wait_for('Disabling compaction for range', from_mark=s1_mark)
# Without the serialization, sleep to increase chances of preparation finishing before merge fiber.
# With the serialization, preparation will wait for merge fiber to finish.
await asyncio.sleep(0.1)
# Continue to execute the merge fiber so that the compaction group is removed
await inject_error_on(manager, "replica_merge_completion_wait", servers)
for s in servers:
await manager.api.message_injection(s.ip_addr, "merge_completion_fiber")
await coord_log.wait_for(f'Merge completion fiber finished', from_mark=s1_mark)
# Continue the repair to trigger use-after-free
for s in servers:
await manager.api.message_injection(s.ip_addr, "wait_after_prepare_sstables_for_incremental_repair")
await coord_log.wait_for(f'Finished tablet repair', from_mark=s1_mark)
# Reproducer for https://github.com/scylladb/scylladb/issues/27365
# Incremental repair vs table drop
@pytest.mark.asyncio


@@ -162,7 +162,12 @@ async def do_test_internode_compression_between_datacenters(manager: ManagerClie
await asyncio.gather(*[manager.server_stop(s.server_id) for s,_ in servers])
await asyncio.gather(*[p.stop() for p in proxies])
# these will all raise, because we just stopped the servers and proxies above
for coro in proxy_futs:
try:
await coro
except Exception:
pass
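The drain loop above can also be written with `asyncio.gather`'s `return_exceptions` flag, which collects the expected failures instead of swallowing them one by one. A minimal self-contained sketch of that alternative (the `proxy_task` coroutine here is hypothetical, standing in for the proxy futures):

```python
import asyncio

async def proxy_task(i):
    # Stand-in for a proxy future whose peer was already stopped.
    raise ConnectionResetError(f"proxy {i} stopped")

async def demo():
    futs = [asyncio.ensure_future(proxy_task(i)) for i in range(3)]
    # With return_exceptions=True, gather returns the exception objects
    # instead of raising the first one, so nothing is left un-awaited.
    return await asyncio.gather(*futs, return_exceptions=True)

results = asyncio.run(demo())
```

This keeps the exceptions available for inspection, which can help when a failure other than the expected connection error sneaks in.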
async def test_internode_compression_compress_packets_between_nodes(request, manager: ManagerClient) -> None:
def check_expected(msg_size, node1_proxy, node2_proxy, node3_proxy):


@@ -20,6 +20,7 @@ from cassandra.query import SimpleStatement
from test.pylib.async_cql import _wrap_future
from test.pylib.manager_client import ManagerClient
from test.pylib.random_tables import RandomTables, TextType, Column
from test.pylib.rest_client import read_barrier
from test.pylib.util import unique_name
from test.cluster.conftest import cluster_con
@@ -403,6 +404,7 @@ async def test_arbiter_dc_rf_rack_valid_keyspaces(manager: ManagerClient):
for task in [*valid_keyspaces, *invalid_keyspaces]:
_ = tg.create_task(task)
@pytest.mark.asyncio
async def test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces(manager: ManagerClient):
"""
This test verifies that starting a Scylla node fails when there's an RF-rack-invalid keyspace.
@@ -464,22 +466,50 @@ async def test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces(manager:
for rfs, tablets in valid_keyspaces:
_ = tg.create_task(create_keyspace(rfs, tablets))
await manager.server_stop_gracefully(s1.server_id)
await manager.server_update_config(s1.server_id, "rf_rack_valid_keyspaces", "true")
# Precondition: s1 has rf_rack_valid_keyspaces set to false.
# Postcondition: s1 still has rf_rack_valid_keyspaces set to false.
async def try_fail(rfs: List[int], dc: str, rf: int, rack_count: int):
running_servers = await manager.running_servers()
should_start = s1.server_id not in [server.server_id for server in running_servers]
if should_start:
await manager.server_start(s1.server_id)
ks = await create_keyspace(rfs, True)
# We need to wait for the new schema to propagate.
# Otherwise, it's not clear when the mutation
# corresponding to the created keyspace will
# arrive at server 1.
# It could happen only after the node performs
# the check upon start-up, effectively leading
# to a successful start-up, which we don't want.
# For more context, see issue: SCYLLADB-1137.
await read_barrier(manager.api, s1.ip_addr)
await manager.server_stop_gracefully(s1.server_id)
await manager.server_update_config(s1.server_id, "rf_rack_valid_keyspaces", "true")
err = f"The keyspace '{ks}' is required to be RF-rack-valid. " \
f"That condition is violated for DC '{dc}': RF={rf} vs. rack count={rack_count}."
_ = await manager.server_start(s1.server_id, expected_error=err)
await manager.server_start(s1.server_id, expected_error=err)
await cql.run_async(f"DROP KEYSPACE {ks}")
await manager.server_update_config(s1.server_id, "rf_rack_valid_keyspaces", "false")
# Test RF-rack-invalid keyspaces.
await try_fail([2, 0], "dc1", 2, 3)
await try_fail([3, 2], "dc2", 2, 1)
await try_fail([4, 1], "dc1", 4, 3)
_ = await manager.server_start(s1.server_id)
# We need to perform a read barrier on the node to make
# sure that it processes the last DROP KEYSPACE.
# Otherwise, the node could think the RF-rack-invalid
# keyspace still exists.
await manager.server_start(s1.server_id)
await read_barrier(manager.api, s1.ip_addr)
await manager.server_stop_gracefully(s1.server_id)
await manager.server_update_config(s1.server_id, "rf_rack_valid_keyspaces", "true")
await manager.server_start(s1.server_id)
@pytest.mark.asyncio
async def test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces_but_not_enforced(manager: ManagerClient):


@@ -0,0 +1,65 @@
#
# Copyright (C) 2026-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#
import asyncio
import pytest
from test.cluster.util import new_test_keyspace, new_test_table
from test.pylib.manager_client import ManagerClient
from test.pylib.rest_client import inject_error_one_shot
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode="release", reason="error injections are not supported in release mode")
async def test_prepare_fails_if_cached_statement_is_invalidated_mid_prepare(manager: ManagerClient):
server = await manager.server_add()
cql = manager.get_cql()
log = await manager.server_open_log(server.server_id)
async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1};") as ks:
async with new_test_table(manager, ks, "pk int PRIMARY KEY") as table:
query = f"SELECT * FROM {table} WHERE pk = ?"
loop = asyncio.get_running_loop()
await cql.run_async(f"INSERT INTO {table} (pk) VALUES (7)")
await cql.run_async(f"INSERT INTO {table} (pk) VALUES (8)")
handler = await inject_error_one_shot(manager.api, server.ip_addr, "query_processor_prepare_wait_after_cache_get")
mark = await log.mark()
prepare_future = loop.run_in_executor(None, lambda: cql.prepare(query))
await log.wait_for("query_processor_prepare_wait_after_cache_get: waiting for message", from_mark=mark, timeout=60)
# Trigger table schema update (metadata-only) to invalidate prepared statements while PREPARE is paused.
await cql.run_async(f"ALTER TABLE {table} WITH comment = 'invalidate-prepared-race'")
await handler.message()
done, _ = await asyncio.wait({prepare_future}, timeout=15)
if not done:
pytest.fail("Timed out waiting for PREPARE to complete after signaling injection")
result = done.pop().result()
print(f"PREPARE succeeded as expected: {result!r}")
rows = cql.execute(result, [7])
row = rows.one()
assert row is not None and row.pk == 7
# Invalidate prepared statements again, then execute the same prepared object.
# The driver should transparently re-prepare and re-request execution.
await cql.run_async(f"ALTER TABLE {table} WITH comment = 'invalidate-prepared-race-again'")
reprepare_handler = await inject_error_one_shot(manager.api, server.ip_addr, "query_processor_prepare_wait_after_cache_get")
reprepare_mark = await log.mark()
execute_future = loop.run_in_executor(None, lambda: cql.execute(result, [8]))
await log.wait_for("query_processor_prepare_wait_after_cache_get: waiting for message", from_mark=reprepare_mark, timeout=60)
await reprepare_handler.message()
execute_done, _ = await asyncio.wait({execute_future}, timeout=15)
if not execute_done:
pytest.fail("Timed out waiting for driver execute to finish after re-prepare signaling")
retried_rows = execute_done.pop().result()
retried_row = retried_rows.one()
assert retried_row is not None and retried_row.pk == 8


@@ -16,8 +16,10 @@ import pytest
import socket
import ssl
import struct
import time
from test.pylib.manager_client import ManagerClient
from test.pylib.util import wait_for
logger = logging.getLogger(__name__)
@@ -269,6 +271,28 @@ async def send_cql_with_proxy_header_tls(
sock.close()
async def wait_for_results(cql, query: str, expected_count: int, timeout: float = 30.0, filter_fn=None):
"""
Polls `query` until at least `expected_count` rows satisfy `filter_fn` (all rows if no filter is given).
On timeout, logs the full result set from the last poll to aid debugging.
"""
last_rows: list = []
async def check_resultset():
nonlocal last_rows
last_rows = list(await cql.run_async(query))
matching = filter_fn(last_rows) if filter_fn is not None else last_rows
if len(matching) >= expected_count:
return matching
return None
try:
return await wait_for(check_resultset, time.time() + timeout, period=0.1)
except Exception:
logger.error('Timed out waiting for %d matching rows in system.clients. Last poll returned %d total rows:\n%s',
expected_count, len(last_rows), '\n'.join(str(r) for r in last_rows))
raise
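The same poll-with-filter pattern can be exercised without a cluster; a self-contained sketch with a hypothetical `fetch` coroutine standing in for `cql.run_async` (assumed behaviour only, not the helper's real internals):

```python
import asyncio
import time

async def wait_for_results(fetch, expected_count, timeout=5.0,
                           filter_fn=None, period=0.01):
    # Poll `fetch` until at least `expected_count` rows pass `filter_fn`
    # (all rows count if no filter is given), as in the helper above.
    deadline = time.time() + timeout
    while True:
        rows = list(await fetch())
        matching = filter_fn(rows) if filter_fn is not None else rows
        if len(matching) >= expected_count:
            return matching
        if time.time() > deadline:
            raise TimeoutError(
                f"only {len(matching)} of {expected_count} rows matched")
        await asyncio.sleep(period)

rows_source = []

async def fetch():
    # Simulates rows appearing gradually, one per poll.
    rows_source.append(("10.0.0.1", len(rows_source)))
    return rows_source

matching = asyncio.run(wait_for_results(
    fetch, expected_count=3,
    filter_fn=lambda rs: [r for r in rs if r[0] == "10.0.0.1"]))
```

The filter runs over the full result set on every poll, so a row that appears late (e.g. a connection registered on another shard) is still picked up.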
# Shared server configuration for all tests
# We configure explicit SSL ports to keep the standard ports unencrypted
# so the Python driver can connect without TLS.
@@ -368,9 +392,12 @@ async def test_proxy_protocol_shard_aware(proxy_server):
await do_cql_handshake(reader, writer)
# Now query system.clients to verify shard assignments
rows = list(cql.execute(
f"SELECT address, port, shard_id FROM system.clients WHERE address = '{fake_src_addr}' ALLOW FILTERING"
))
rows = await wait_for_results(
cql,
'SELECT address, port, shard_id FROM system.clients',
expected_count=num_shards,
filter_fn=lambda all_rows: [r for r in all_rows if str(r.address) == fake_src_addr],
)
# Build a map of port -> shard_id from the results
port_to_shard = {row.port: row.shard_id for row in rows}
@@ -446,9 +473,12 @@ async def test_proxy_protocol_port_preserved_in_system_clients(proxy_server):
# Now query system.clients using the driver to see our connection
cql = manager.get_cql()
rows = list(cql.execute(
f"SELECT address, port FROM system.clients WHERE address = '{fake_src_addr}' ALLOW FILTERING"
))
rows = await wait_for_results(
cql,
'SELECT address, port FROM system.clients',
expected_count=1,
filter_fn=lambda all_rows: [r for r in all_rows if str(r.address) == fake_src_addr],
)
# We should find our connection with the fake source address and port
assert len(rows) > 0, f"Expected to find connection from {fake_src_addr} in system.clients"
@@ -569,9 +599,12 @@ async def test_proxy_protocol_ssl_shard_aware(proxy_server):
ssl_sock.recv(4096)
# Now query system.clients to verify shard assignments
rows = list(cql.execute(
f"SELECT address, port, shard_id, ssl_enabled FROM system.clients WHERE address = '{fake_src_addr}' ALLOW FILTERING"
))
rows = await wait_for_results(
cql,
'SELECT address, port, shard_id, ssl_enabled FROM system.clients',
expected_count=num_shards,
filter_fn=lambda all_rows: [r for r in all_rows if str(r.address) == fake_src_addr],
)
# Build a map of port -> (shard_id, ssl_enabled) from the results
port_to_info = {row.port: (row.shard_id, row.ssl_enabled) for row in rows}
@@ -656,9 +689,12 @@ async def test_proxy_protocol_ssl_port_preserved(proxy_server):
# Now query system.clients using the driver to see our connection
cql = manager.get_cql()
rows = list(cql.execute(
f"SELECT address, port, ssl_enabled FROM system.clients WHERE address = '{fake_src_addr}' ALLOW FILTERING"
))
rows = await wait_for_results(
cql,
'SELECT address, port, ssl_enabled FROM system.clients',
expected_count=1,
filter_fn=lambda all_rows: [r for r in all_rows if str(r.address) == fake_src_addr],
)
# We should find our connection
assert len(rows) > 0, f"Expected to find connection from {fake_src_addr} in system.clients"


@@ -7,6 +7,7 @@ import logging
import pytest
import asyncio
from test.pylib.internal_types import ServerNum
from test.pylib.manager_client import ManagerClient
from test.cluster.conftest import skip_mode
from test.pylib.rest_client import inject_error_one_shot, InjectionHandler, read_barrier
@@ -20,6 +21,20 @@ def fixture_raft_op_timeout(build_mode):
return 10000 if build_mode == 'debug' else 1000
async def update_group0_raft_op_timeout(server_id: ServerNum, manager: ManagerClient, timeout: int) -> None:
logger.info(f"Updating group0_raft_op_timeout_in_ms on server {server_id} to {timeout}")
running_ids = [srv.server_id for srv in await manager.running_servers()]
if server_id in running_ids:
# If the node is alive, server_update_config only sends the SIGHUP signal to the Scylla process, so awaiting it
# doesn't guarantee that the new config file is active. Work around this by looking at the logs.
log_file = await manager.server_open_log(server_id)
mark = await log_file.mark()
await manager.server_update_config(server_id, 'group0_raft_op_timeout_in_ms', timeout)
await log_file.wait_for("completed re-reading configuration file", from_mark=mark, timeout=60)
else:
await manager.server_update_config(server_id, 'group0_raft_op_timeout_in_ms', timeout)
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
@pytest.mark.skip_mode(mode='debug', reason='aarch64/debug is unpredictably slow', platform_key='aarch64')
@@ -42,7 +57,6 @@ async def test_cannot_add_new_node(manager: ManagerClient, raft_op_timeout: int)
config = {
'direct_failure_detector_ping_timeout_in_ms': 300,
'group0_raft_op_timeout_in_ms': raft_op_timeout,
'error_injections_at_startup': [
{
'name': 'raft-group-registry-fd-threshold-in-ms',
@@ -64,6 +78,10 @@ async def test_cannot_add_new_node(manager: ManagerClient, raft_op_timeout: int)
manager.server_stop_gracefully(servers[3].server_id),
manager.server_stop_gracefully(servers[4].server_id))
# Do it here to prevent unexpected timeouts before quorum loss.
await asyncio.gather(*(update_group0_raft_op_timeout(srv.server_id, manager, raft_op_timeout)
for srv in servers[:2]))
logger.info("starting a sixth node with no quorum")
await manager.server_add(expected_error="raft operation \\[read_barrier\\] timed out, there is no raft quorum",
timeout=60)
@@ -76,7 +94,6 @@ async def test_cannot_add_new_node(manager: ManagerClient, raft_op_timeout: int)
@pytest.mark.skip_mode(mode='debug', reason='aarch64/debug is unpredictably slow', platform_key='aarch64')
async def test_quorum_lost_during_node_join(manager: ManagerClient, raft_op_timeout: int) -> None:
config = {
'group0_raft_op_timeout_in_ms': raft_op_timeout,
'error_injections_at_startup': [
{
'name': 'raft-group-registry-fd-threshold-in-ms',
@@ -107,6 +124,9 @@ async def test_quorum_lost_during_node_join(manager: ManagerClient, raft_op_time
await asyncio.gather(manager.server_stop_gracefully(servers[1].server_id),
manager.server_stop_gracefully(servers[2].server_id))
# Do it here to prevent unexpected timeouts before quorum loss.
await update_group0_raft_op_timeout(servers[0].server_id, manager, raft_op_timeout)
logger.info("release join-node-before-add-entry injection")
await injection_handler.message()
@@ -126,7 +146,6 @@ async def test_quorum_lost_during_node_join_response_handler(manager: ManagerCli
logger.info("adding a fourth node")
servers += [await manager.server_add(config={
'group0_raft_op_timeout_in_ms': raft_op_timeout,
'error_injections_at_startup': [
{
'name': 'raft-group-registry-fd-threshold-in-ms',
@@ -153,6 +172,9 @@ async def test_quorum_lost_during_node_join_response_handler(manager: ManagerCli
await asyncio.gather(manager.server_stop_gracefully(servers[1].server_id),
manager.server_stop_gracefully(servers[2].server_id))
# Do it here to prevent unexpected timeouts before quorum loss.
await update_group0_raft_op_timeout(servers[3].server_id, manager, raft_op_timeout)
logger.info("release join-node-response_handler-before-read-barrier injection")
injection_handler = InjectionHandler(manager.api,
'join-node-response_handler-before-read-barrier',
@@ -169,7 +191,6 @@ async def test_quorum_lost_during_node_join_response_handler(manager: ManagerCli
async def test_cannot_run_operations(manager: ManagerClient, raft_op_timeout: int) -> None:
logger.info("starting a first node (the leader)")
servers = [await manager.server_add(config={
'group0_raft_op_timeout_in_ms': raft_op_timeout,
'error_injections_at_startup': [
{
'name': 'raft-group-registry-fd-threshold-in-ms',
@@ -189,6 +210,9 @@ async def test_cannot_run_operations(manager: ManagerClient, raft_op_timeout: in
await asyncio.gather(manager.server_stop_gracefully(servers[1].server_id),
manager.server_stop_gracefully(servers[2].server_id))
# Do it here to prevent unexpected timeouts before quorum loss.
await update_group0_raft_op_timeout(servers[0].server_id, manager, raft_op_timeout)
logger.info("attempting removenode for the second node")
await manager.remove_node(servers[0].server_id, servers[1].server_id,
expected_error="raft operation [read_barrier] timed out, there is no raft quorum",
@@ -232,9 +256,7 @@ async def test_can_restart(manager: ManagerClient, raft_op_timeout: int) -> None
await asyncio.gather(*(manager.server_stop(srv.server_id) for srv in servers))
# This ensures the read barriers below fail quickly without group 0 quorum.
logger.info(f"Decreasing group0_raft_op_timeout_in_ms on {servers}")
await asyncio.gather(*(manager.server_update_config(srv.server_id, 'group0_raft_op_timeout_in_ms', raft_op_timeout)
for srv in servers))
await asyncio.gather(*(update_group0_raft_op_timeout(srv.server_id, manager, raft_op_timeout) for srv in servers))
logger.info(f"Restarting {servers[:2]} with no group 0 quorum")
for idx, srv in enumerate(servers[:2]):
@@ -246,8 +268,7 @@ async def test_can_restart(manager: ManagerClient, raft_op_timeout: int) -> None
# Increase the timeout back to 300s to ensure the new group 0 leader is elected before the first read barrier below
# times out.
await asyncio.gather(*(manager.server_update_config(srv.server_id, 'group0_raft_op_timeout_in_ms', 300000)
for srv in servers))
await asyncio.gather(*(update_group0_raft_op_timeout(srv.server_id, manager, 300000) for srv in servers))
logger.info(f"Restarting {servers[2:]} with group 0 quorum")
for srv in servers[2:]:


@@ -12,7 +12,6 @@ from test.cluster.util import check_token_ring_and_group0_consistency, new_test_
import pytest
import asyncio
import logging
import time
logger = logging.getLogger(__name__)
@pytest.mark.asyncio
@@ -54,7 +53,7 @@ async def test_cleanup_stop(manager: ManagerClient):
await s0_log.wait_for('sstable_cleanup_wait: waiting', from_mark=s0_mark)
stop_cleanup = asyncio.create_task(manager.api.stop_compaction(servers[0].ip_addr, "CLEANUP"))
time.sleep(1)
await asyncio.sleep(1)
await manager.api.message_injection(servers[0].ip_addr, "sstable_cleanup_wait")
await stop_cleanup

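The one-line change above matters because `time.sleep` inside a coroutine blocks the entire event loop, freezing every concurrent task, whereas `await asyncio.sleep` suspends only the current coroutine. A standalone illustration of the difference (not test-suite code):

```python
import asyncio
import time

async def ticker(ticks):
    # Appends a timestamp every 50 ms while the loop is responsive.
    for _ in range(5):
        await asyncio.sleep(0.05)
        ticks.append(time.monotonic())

async def blocking_wait():
    time.sleep(0.3)  # blocks the event loop: ticker cannot run meanwhile

async def cooperative_wait():
    await asyncio.sleep(0.3)  # yields to the loop: ticker keeps ticking

async def ticks_during_wait(waiter):
    ticks = []
    start = time.monotonic()
    await asyncio.gather(ticker(ticks), waiter())
    # Count ticks that landed inside the first 0.25 s of the wait.
    return sum(1 for t in ticks if t - start < 0.25)

blocked = asyncio.run(ticks_during_wait(blocking_wait))   # 0: loop was frozen
coop = asyncio.run(ticks_during_wait(cooperative_wait))   # several: loop ran
```

In the test above, the blocking `time.sleep(1)` would also have stalled the driver's I/O for the duration of the sleep.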

@@ -15,6 +15,7 @@ from test.pylib.manager_client import ManagerClient, ServerInfo
from test.pylib.rest_client import read_barrier, HTTPError
from test.pylib.scylla_cluster import ScyllaVersionDescription
from test.pylib.util import wait_for_cql_and_get_hosts, wait_for_feature
from test.cluster.util import reconnect_driver
from cassandra.cluster import ConsistencyLevel
from cassandra.policies import FallthroughRetryPolicy
from cassandra.protocol import ServerError
@@ -162,6 +163,7 @@ async def test_upgrade_and_rollback(manager: ManagerClient, scylla_2025_1: Scyll
)
logger.info("Waiting for SSTABLE_COMPRESSION_DICTS cluster feature")
cql = await reconnect_driver(manager)
hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
await asyncio.gather(*(wait_for_feature("SSTABLE_COMPRESSION_DICTS", cql, h, time.time() + 60) for h in hosts))


@@ -2212,4 +2212,75 @@ async def test_split_and_intranode_synchronization(manager: ManagerClient):
tablet_count = await get_tablet_count(manager, server, ks, 'test')
return tablet_count >= expected_tablet_count or None
# Give enough time for split to happen in debug mode
await wait_for(finished_splitting, time.time() + 120)
await wait_for(finished_splitting, time.time() + 120)
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_split_stopped_on_shutdown(manager: ManagerClient):
logger.info('Bootstrapping cluster')
cfg = { 'enable_tablets': True,
'tablet_load_stats_refresh_interval_in_seconds': 1
}
cmdline = [
'--logger-log-level', 'debug_error_injection=debug',
'--smp', '1',
]
server = await manager.server_add(cmdline=cmdline, config=cfg)
logger.info(f'server_id = {server.server_id}')
cql = manager.get_cql()
await manager.disable_tablet_balancing()
initial_tablets = 2
async with new_test_keyspace(manager, f"WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}}") as ks:
await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int) WITH tablets = {{'min_tablet_count': {initial_tablets}}};")
await manager.api.disable_autocompaction(server.ip_addr, ks, 'test')
# insert data
pks = range(256)
await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in pks])
# flush the table
await manager.api.flush_keyspace(server.ip_addr, ks)
# force split on the test table
expected_tablet_count = 4
await cql.run_async(f"ALTER TABLE {ks}.test WITH tablets = {{'min_tablet_count': {expected_tablet_count}}}")
log = await manager.server_open_log(server.server_id)
log_mark = await log.mark()
await manager.api.enable_injection(server.ip_addr, "splitting_mutation_writer_switch_wait", one_shot=True)
await manager.api.enable_injection(server.ip_addr, "storage_service_drain_wait", one_shot=True)
await manager.enable_tablet_balancing()
await log.wait_for('Emitting resize decision of type split', from_mark=log_mark)
await log.wait_for('splitting_mutation_writer_switch_wait: waiting', from_mark=log_mark)
log_mark = await log.mark()
shutdown_task = asyncio.create_task(manager.server_stop_gracefully(server.server_id))
await log.wait_for('Stopping.*ongoing compactions')
await manager.api.message_injection(server.ip_addr, "splitting_mutation_writer_switch_wait")
await log.wait_for('storage_service_drain_wait: waiting', from_mark=log_mark)
await log.wait_for('Failed to complete splitting of table', from_mark=log_mark)
await manager.api.message_injection(server.ip_addr, "storage_service_drain_wait")
await shutdown_task
errors = await log.grep_for_errors(from_mark=log_mark)
assert errors == []
await manager.server_start(server.server_id)
await wait_for_cql_and_get_hosts(cql, [server], time.time() + 60)
await log.wait_for('Detected tablet split for table', from_mark=log_mark)
tablet_count = await get_tablet_count(manager, server, ks, 'test')
assert tablet_count >= expected_tablet_count


@@ -978,7 +978,7 @@ async def test_tablets_merge_waits_for_lwt(manager: ManagerClient):
await wait_for_tablet_count(manager, s0, ks, 'test', lambda c: c == 1, 1, timeout_s=15)
logger.info("Ensure the guard decided to retain the erm")
await log0.wait_for("tablet_metadata_guard::check: retain the erm and abort the guard",
m, _ = await log0.wait_for("tablet_metadata_guard::check: retain the erm and abort the guard",
from_mark=m, timeout=10)
tablets = await get_all_tablet_replicas(manager, s0, ks, 'test')
@@ -986,7 +986,11 @@ async def test_tablets_merge_waits_for_lwt(manager: ManagerClient):
tablet = tablets[0]
assert tablet.replicas == [(s0_host_id, 0)]
m = await log0.mark()
# Since merge now waits for erms before releasing the state machine,
# the migration initiated below will not start until paxos has released the erm.
# The barrier that is blocked is the one in merge finalization.
# We keep the tablet movement as a guard against regressions in case the behavior changes.
migration_task = asyncio.create_task(manager.api.move_tablet(s0.ip_addr, ks, "test",
s0_host_id, 0,
s0_host_id, 1,


@@ -441,84 +441,6 @@ async def test_tablet_split_merge_with_many_tables(build_mode: str, manager: Man
await check_logs("after merge completion")
# Reproduces use-after-free when migration right after merge, but concurrently to background
# merge completion handler.
# See: https://github.com/scylladb/scylladb/issues/24045
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_migration_running_concurrently_to_merge_completion_handling(manager: ManagerClient):
cmdline = []
# Size based balancing can attempt to migrate the merged tablet as soon as the merge is complete
# because of a lower transient effective_capacity on the node with the merged tablet.
# This migration will timeout on cleanup because the compaction group still has an active task,
# which is held by the merge_completion_fiber injection, so the tablet's compaction group gate
# can not be closed, resulting in cleanup getting stuck. We force capacity based balancing to
# avoid this problem.
cfg = {'force_capacity_based_balancing': True}
servers = [await manager.server_add(cmdline=cmdline, config=cfg)]
await manager.disable_tablet_balancing()
cql = manager.get_cql()
async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 2}") as ks:
await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);")
tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
assert tablet_count == 2
old_tablet_count = tablet_count
keys = range(100)
await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys])
await cql.run_async(f"ALTER KEYSPACE {ks} WITH tablets = {{'initial': 1}};")
s0_log = await manager.server_open_log(servers[0].server_id)
s0_mark = await s0_log.mark()
await manager.api.enable_injection(servers[0].ip_addr, "merge_completion_fiber", one_shot=True)
await manager.api.enable_injection(servers[0].ip_addr, "replica_merge_completion_wait", one_shot=True)
await manager.enable_tablet_balancing()
servers.append(await manager.server_add(cmdline=cmdline, config=cfg))
s1_host_id = await manager.get_host_id(servers[1].server_id)
async def finished_merging():
tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
return tablet_count < old_tablet_count or None
await wait_for(finished_merging, time.time() + 120)
await manager.disable_tablet_balancing()
await manager.api.enable_injection(servers[0].ip_addr, "take_storage_snapshot", one_shot=True)
await s0_log.wait_for(f"merge_completion_fiber: waiting", from_mark=s0_mark)
tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
assert tablet_count == 1
tablet_token = 0 # Doesn't matter since there is one tablet
replica = await get_tablet_replica(manager, servers[0], ks, 'test', tablet_token)
s0_host_id = await manager.get_host_id(servers[0].server_id)
src_shard = replica[1]
dst_shard = src_shard
migration = asyncio.create_task(manager.api.move_tablet(servers[0].ip_addr, ks, "test", replica[0], src_shard, s1_host_id, dst_shard, tablet_token))
await s0_log.wait_for(f"take_storage_snapshot: waiting", from_mark=s0_mark)
await manager.api.message_injection(servers[0].ip_addr, "merge_completion_fiber")
await s0_log.wait_for(f"Merge completion fiber finished", from_mark=s0_mark)
await manager.api.message_injection(servers[0].ip_addr, "take_storage_snapshot")
await migration
rows = await cql.run_async(f"SELECT * FROM {ks}.test;")
assert len(rows) == len(keys)
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_missing_data(manager: ManagerClient):
@@ -655,3 +577,77 @@ async def test_merge_with_drop(manager: ManagerClient):
await asyncio.sleep(0.1)
await manager.api.message_injection(server.ip_addr, "compaction_group_stop_wait")
await drop_table_fut
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_background_merge_deadlock(manager: ManagerClient):
"""
Reproducer for https://scylladb.atlassian.net/browse/SCYLLADB-928
Reproduces a deadlock in the background merge completion handler that can happen when multiple merges accumulate.
If more than one merge cycle accumulates in the fiber, a deadlock occurs due to the compaction lock taken
on the main group (post-merge). The lock is held until the compaction groups are processed by the background
merge fiber.
Example:
Initial state:
cg0: main
cg1: main
cg2: main
cg3: main
After 1st merge:
cg0': main [locked], merging_groups=[cg0.main, cg1.main]
cg1': main [locked], merging_groups=[cg2.main, cg3.main]
After 2nd merge:
cg0'': main [locked], merging_groups=[cg0'.main [locked], cg0.main, cg1.main, cg1'.main [locked], cg2.main, cg3.main]
The test reproduces this by doing a tablet merge from 8 tablets to 1 (8 -> 4 -> 2 -> 1). The background merge fiber
is blocked until after the first merge (to 4), so that there is a higher chance of two merges queueing in the fiber.
If deadlock occurs, node shutdown will hang waiting for the background merge fiber. That's why the test
tries to stop the node at the end.
"""
cmdline = [
'--logger-log-level', 'load_balancer=debug',
'--logger-log-level', 'raft_topology=debug',
]
servers = [await manager.server_add(cmdline=cmdline)]
cql, _ = await manager.get_ready_cql(servers)
ks = await create_new_test_keyspace(cql, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}")
# Create a table which will go through 3 merge cycles.
await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int) with tablets = {{'min_tablet_count': 8}};")
await manager.api.enable_injection(servers[0].ip_addr, "merge_completion_fiber", one_shot=True)
log = await manager.server_open_log(servers[0].server_id)
mark = await log.mark()
# Trigger tablet merging
await cql.run_async(f"ALTER TABLE {ks}.test WITH tablets = {{'min_tablet_count': 1}};")
async def produced_one_merge():
tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
return tablet_count == 4 or None
await wait_for(produced_one_merge, time.time() + 120)
mark, _ = await log.wait_for(f"merge_completion_fiber: waiting", from_mark=mark)
await manager.api.message_injection(servers[0].ip_addr, "merge_completion_fiber")
mark, _ = await log.wait_for(f"merge_completion_fiber: message received", from_mark=mark)
async def finished_merge():
tablet_count = await get_tablet_count(manager, servers[0], ks, 'test')
return tablet_count == 1 or None
await wait_for(finished_merge, time.time() + 120)
await manager.server_stop(servers[0].server_id)
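The deadlock described in the docstring has a simple shape: a single background fiber must acquire a lock that will only be released once the fiber itself has made further progress. Reduced to plain asyncio (an illustrative sketch of the pattern only, not Scylla code), it looks like:

```python
import asyncio

async def merge_fiber(queue, locks):
    # A single fiber drains merge-completion work items in order.
    while True:
        group = await queue.get()
        # Processing a group needs its lock. If a later merge cycle queued
        # this group while its lock is still held (to be released only after
        # the fiber gets past it), the fiber waits on itself: deadlock.
        async with locks[group]:
            pass  # cleanup of the pre-merge groups would happen here

async def reproduce():
    queue = asyncio.Queue()
    locks = {"cg0'": asyncio.Lock()}
    # Simulate the second merge cycle: cg0' is queued for processing while
    # its compaction lock is still held by the first, unprocessed cycle.
    await locks["cg0'"].acquire()
    await queue.put("cg0'")
    fiber = asyncio.create_task(merge_fiber(queue, locks))
    done, _pending = await asyncio.wait({fiber}, timeout=0.2)
    stuck = fiber not in done  # the fiber never makes progress
    fiber.cancel()
    return stuck
```

In the real system the symptom is the one the test checks for: node shutdown hangs waiting for the background merge fiber, which is why the test ends by stopping the node.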

Some files were not shown because too many files have changed in this diff.