Compare commits

...

167 Commits

Author SHA1 Message Date
Botond Dénes
f6c2624c86 Merge '[branch-5.0] - minimal fix for crash caused by empty primary key range in LWT update' from Jan Ciołek
In #13001 we found a test case which causes a crash because it didn't handle `UNSET_VALUE` properly:

```python3
def test_unset_insert_where(cql, table2):
    p = unique_key_int()
    stmt = cql.prepare(f'INSERT INTO {table2} (p, c) VALUES ({p}, ?)')
    with pytest.raises(InvalidRequest, match="unset"):
        cql.execute(stmt, [UNSET_VALUE])

def test_unset_insert_where_lwt(cql, table2):
    p = unique_key_int()
    stmt = cql.prepare(f'INSERT INTO {table2} (p, c) VALUES ({p}, ?) IF NOT EXISTS')
    with pytest.raises(InvalidRequest, match="unset"):
        cql.execute(stmt, [UNSET_VALUE])
```

This PR makes an absolutely minimal change to fix the crash.
It adds a check at the last moment before the crash would happen.

To make sure that everything works correctly, and to detect any possible breaking changes, I wrote a bunch of tests that validate the current behavior.
I also ported some tests from the `master` branch, at least the ones that were in line with the behavior on `branch-5.0`.

The changes are the same as in #13133, just cherry-picked to `branch-5.0`

Closes #13178

* github.com:scylladb/scylladb:
  cql-pytest/test_unset: port some tests from master branch
  cql-pytest/test_unset: test unset value in UPDATEs with LWT conditions
  cql-pytest/test_unset: test unset value in UPDATEs with IF EXISTS
  cql-pytest/test_unset: test unset value in UPDATE statements
  cql-pytest/test_unset: test unset value in INSERTs with IF NOT EXISTS
  cql-pytest/test_unset: test unset value in INSERT statements
  cas_request: fix crash on unset value in primary key with LWT
2023-05-08 12:03:44 +03:00
Botond Dénes
f7d9afd209 Update seastar submodule
* seastar 07548b37...62fd873d (2):
  > core/on_internal_error: always log error with backtrace
  > on_internal_error: refactor log_error_and_backtrace

Fixes: #13786
2023-05-08 10:41:24 +03:00
Marcin Maliszkiewicz
b011cc2e78 db: view: use deferred_close for closing staging_sstable_reader
When consume_in_thread throws, the reader should still be closed.
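
For illustration, a minimal standard-C++ analogue of the idea (the real code uses Seastar's asynchronous `deferred_close`; names here are hypothetical):

```cpp
#include <iostream>
#include <stdexcept>

// Illustration only: tie closing the reader to scope exit, so it is
// closed even when consumption throws.
struct reader {
    void consume() { throw std::runtime_error("boom"); }
    void close() { std::cout << "reader closed\n"; }
};

struct close_guard {
    reader& r;
    ~close_guard() { r.close(); }
};

int main() {
    try {
        reader r;
        close_guard guard{r};
        r.consume(); // throws -- the guard still closes the reader
    } catch (const std::exception& e) {
        std::cout << "caught: " << e.what() << '\n';
    }
}
```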

Related https://github.com/scylladb/scylla-enterprise/issues/2661

Closes #13398
Refs: scylladb/scylla-enterprise#2661
Fixes: #13413

(cherry picked from commit 99f8d7dcbe)
2023-05-08 09:58:46 +03:00
Botond Dénes
fb466dd7b7 readers: evictable_reader: skip progress guarantee when next pos is partition start
The evictable reader must ensure that each buffer fill makes forward
progress, i.e. the last fragment in the buffer has a position larger
than the last fragment from the last buffer-fill. Otherwise, the reader
could get stuck in an infinite loop between buffer fills, if the reader
is evicted in-between.
The code guaranteeing this forward progress has a bug: when the next
expected position is a partition-start (another partition), the code
would loop forever, effectively reading all there is from the underlying
reader.
To avoid this, add a special case to ignore the progress guarantee loop
altogether when the next expected position is a partition start. In this
case, progress is guaranteed anyway, because there is exactly one
partition-start fragment in each partition.

Fixes: #13491

Closes #13563

(cherry picked from commit 72003dc35c)
2023-05-02 21:22:23 +03:00
Jan Ciolek
697e090659 cql-pytest/test_unset: port some tests from master branch
I copied cql-pytest tests from the master branch,
at least the ones that were compatible with branch-5.1

Some of them expected an InvalidRequest exception
when an UNSET_VALUE is present in places that
branch-5.1 allows, so I skipped these tests.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
(cherry picked from commit c75359d664)
2023-04-28 03:25:27 +02:00
Jan Ciolek
2c518f3131 cql-pytest/test_unset: test unset value in UPDATEs with LWT conditions
Test what happens when an UNSET_VALUE is passed to
an UPDATE statement with an LWT condition.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
(cherry picked from commit 24f76f40b7)
2023-04-28 03:25:27 +02:00
Jan Ciolek
e941a5ac34 cql-pytest/test_unset: test unset value in UPDATEs with IF EXISTS
Test what happens when an UNSET_VALUE is passed to
an UPDATE statement with IF EXISTS condition.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
(cherry picked from commit 3f133cfa87)
2023-04-28 03:25:27 +02:00
Jan Ciolek
3a7ce5e8aa cql-pytest/test_unset: test unset value in UPDATE statements
Test what happens when an UNSET_VALUE is passed to
an UPDATE statement.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
(cherry picked from commit d66e23b265)
2023-04-28 03:25:27 +02:00
Jan Ciolek
efa4f312f5 cql-pytest/test_unset: test unset value in INSERTs with IF NOT EXISTS
Add tests which test INSERT statements with IF NOT EXISTS,
when an UNSET_VALUE is passed for some column.
The tests are similar to the previous ones done for simple
INSERTs without IF NOT EXISTS.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
(cherry picked from commit 378e8761b9)
2023-04-28 03:25:27 +02:00
Jan Ciolek
fb4b71ea02 cql-pytest/test_unset: test unset value in INSERT statements
Add some tests which test what happens when an UNSET_VALUE
is passed to an INSERT statement.

Passing it for a partition key column is impossible
because the Python driver doesn't allow it.

Passing it for a clustering key column causes Scylla
to silently ignore the INSERT.

Passing it for a regular or static column
causes this column to remain unchanged,
as expected.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
(cherry picked from commit fc26f6b850)
2023-04-28 03:25:26 +02:00
Jan Ciolek
7387922a29 cas_request: fix crash on unset value in primary key with LWT
Doing an LWT INSERT/UPDATE and passing UNSET_VALUE
for the primary key column used to cause a crash.

This is a minimal fix for this crash.

The crash backtrace pointed to a place where
we tried doing .front() on an empty vector
of primary key ranges.

I added a check that the vector isn't empty.
If it's empty, we throw an error
mentioning that it's most likely
caused by an unset value.
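
A minimal sketch of the guard, with hypothetical names (the real code operates on a vector of partition-key ranges):

```cpp
#include <stdexcept>
#include <string>
#include <vector>

// Calling .front() on an empty vector is undefined behavior, so reject
// the request with a clear error instead of crashing.
const std::string& first_key_range(const std::vector<std::string>& ranges) {
    if (ranges.empty()) {
        throw std::invalid_argument(
            "empty key range in LWT request, most likely caused by an unset value");
    }
    return ranges.front();
}

int main() {
    std::vector<std::string> empty;
    try {
        first_key_range(empty);
    } catch (const std::invalid_argument&) {
        // what used to be a crash is now a clean error
    }
}
```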

This has been fixed on master,
but the PR that fixed it introduced
breaking changes, which I don't want
to add to branch-5.1.

This fix is absolutely minimal
- it performs the check at the
last moment before a crash.

It's not the prettiest, but it works
and can't introduce breaking changes,
because the new code gets activated
only in cases that would've caused
a crash.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
(cherry picked from commit 7663dc31b8)
2023-04-28 03:25:24 +02:00
Raphael S. Carvalho
cb78c3bf2c replica: Fix undefined behavior in table::generate_and_propagate_view_updates()
The behavior is undefined because the evaluation order of function
arguments is unspecified.

With GCC, where evaluation is right-to-left, the schema will be moved
once it's forwarded to make_flat_mutation_reader_from_mutations_v2().

The consequence is that memory tracking of mutation_fragment_v2
(which tracks only the permit used by the view update), which uses the
schema, can be incorrect. However, it's more likely that Scylla will crash
when estimating memory usage for a row, which accesses schema column
information using schema::column_at(), which in turn asserts that
the requested column really exists.

Fixes #13093.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #13092

(cherry picked from commit 3fae46203d)
2023-04-27 19:59:05 +03:00
Kefu Chai
aeac63a3ee dist/redhat: enforce dependency on %{release} also
* tools/python3 f725ec7...c888f39 (1):
  > dist: redhat: provide only a single version

s/%{version}/%{version}-%{release}/ in `Requires:` sections.

This enforces that the runtime dependencies between Scylla packages
are on exactly the same release.

Fixes #13222
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
(cherry picked from commit 7165551fd7)
2023-04-27 19:31:01 +03:00
Nadav Har'El
e7b50fb8d3 test/alternator: increase CQL connection timeout
This patch increases the connection timeout in the get_cql_cluster()
function in test/cql-pytest/run.py. This function is used to test
that Scylla came up, and also test/alternator/run uses it to set
up the authentication - which can only be done through CQL.

The Python driver has 2-second and 5-second default timeouts that should
have been more than enough for everybody (TM), but in #13239 we saw
that in one case it apparently wasn't enough. So to be extra safe,
let's increase the default connection-related timeouts to 60 seconds.

Note this change only affects the Scylla *boot* in the test/*/run
scripts, and it does not affect the actual tests - those have different
code to connect to Scylla (see cql_session() in test/cql-pytest/util.py),
and we already increased the timeouts there in #11289.

Fixes #13239

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #13291

(cherry picked from commit 4fdcee8415)
2023-04-27 19:15:58 +03:00
Benny Halevy
6b21f2a351 utils: clear_gently: do not clear null unique_ptr
Otherwise the null pointer is dereferenced.

Add a unit test reproducing the issue
and testing this fix.
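
A minimal sketch of the fix (simplified: the real utils::clear_gently is asynchronous and recursive):

```cpp
#include <memory>
#include <vector>

// Stand-in for the gradual, yield-friendly clearing of an object.
template <typename T>
void clear_gently(T& obj) {
    obj.clear();
}

// The fix: check for null before dereferencing the unique_ptr.
template <typename T>
void clear_gently(std::unique_ptr<T>& ptr) {
    if (!ptr) {
        return; // null: nothing to clear; dereferencing would crash
    }
    clear_gently(*ptr);
}

int main() {
    std::unique_ptr<std::vector<int>> null_ptr; // a null unique_ptr
    clear_gently(null_ptr); // previously dereferenced the null pointer
}
```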

Fixes #13636

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 12877ad026)
2023-04-24 17:51:31 +03:00
Petr Gusev
0db8e627a5 removenode: add warning in case of exception
The removenode_abort logic that follows the warning
may throw, in which case information about
the original exception was lost.

Fixes: #11722
Closes #11735

(cherry picked from commit 40bd9137f8)
2023-04-24 10:02:39 +02:00
Botond Dénes
f1121d2149 Merge 'db: system_keyspace: use microsecond resolution for group0_history range tombstone' from Kamil Braun
In `make_group0_history_state_id_mutation`, when adding a new entry to
the group 0 history table, if the parameter `gc_older_than` is engaged,
we create a range tombstone in the mutation which deletes entries older
than the new one by `gc_older_than`. In particular if
`gc_older_than = 0`, we want to delete all older entries.

There was a subtle bug there: we were using millisecond resolution when
generating the tombstone, while the provided state IDs used microsecond
resolution. On a super fast machine it could happen that we managed to
perform two schema changes in a single millisecond; this happened
sometimes in `group0_test.test_group0_history_clearing_old_entries`
on our new CI/promotion machines, causing the test to fail because the
tombstone didn't clear the entry corresponding to the previous schema
change when performing the next schema change (since they happened in
the same millisecond).

Use microsecond resolution to fix that. The consecutive state IDs used
in group 0 mutations are guaranteed to be strictly monotonic at
microsecond resolution (see `generate_group0_state_id` in
service/raft/raft_group0_client.cc).
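
A standalone illustration of the resolution mismatch:

```cpp
#include <cassert>
#include <chrono>

// Two state IDs from the same millisecond are distinct at microsecond
// resolution, but collapse to the same value once truncated to
// milliseconds, so a millisecond-resolution tombstone can fail to cover
// the older entry.
int main() {
    using namespace std::chrono;
    microseconds older(1'123'250); // first schema change
    microseconds newer(1'123'900); // second one, same millisecond
    assert(older < newer);         // distinguishable in microseconds
    assert(duration_cast<milliseconds>(older)
           == duration_cast<milliseconds>(newer)); // not in milliseconds
}
```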

Fixes #13594

Closes #13604

* github.com:scylladb/scylladb:
  db: system_keyspace: use microsecond resolution for group0_history range tombstone
  utils: UUID_gen: accept decimicroseconds in min_time_UUID

(cherry picked from commit 10c1f1dc80)
2023-04-23 16:03:39 +03:00
Beni Peled
a0ca8abe42 release: prepare for 5.0.13 2023-04-23 14:58:03 +03:00
Botond Dénes
8bceac1713 Merge 'Backport 5.0 distributed loader detect highest generation' from Benny Halevy
Backport of 4aa0b16852 to branch-5.0
Merge 'distributed_loader: detect highest generation before populating column families' from Benny Halevy

We should scan all sstables in the table directory and its
subdirectories to determine the highest sstable version and generation
before using it for creating new sstables (via reshard or reshape).

Otherwise, the generations of new sstables created when populating staging (via reshard or reshape) may collide with generations in the base directory, leading to https://github.com/scylladb/scylladb/issues/11789

\Refs https://github.com/scylladb/scylladb/issues/11789
\Fixes https://github.com/scylladb/scylladb/issues/11793

\Closes https://github.com/scylladb/scylladb/pull/11795

Closes #13613

* github.com:scylladb/scylladb:
  Merge 'distributed_loader: detect highest generation before populating column families' from Benny Halevy
  replica: distributed_loader: reindent populate_keyspace
  replica: distributed_loader: coroutinize populate_keyspace
2023-04-21 14:29:04 +03:00
Botond Dénes
6bcc7c6ed5 Merge 'distributed_loader: detect highest generation before populating column families' from Benny Halevy
We should scan all sstables in the table directory and its
subdirectories to determine the highest sstable version and generation
before using it for creating new sstables (via reshard or reshape).

Otherwise, the generations of new sstables created when populating staging (via reshard or reshape) may collide with generations in the base directory, leading to https://github.com/scylladb/scylladb/issues/11789

Refs scylladb/scylladb#11789
Fixes scylladb/scylladb#11793

Closes #11795

* github.com:scylladb/scylladb:
  distributed_loader: populate_column_family: reindent
  distributed_loader: coroutinize populate_column_family
  distributed_loader: table_population_metadata: start: reindent
  distributed_loader: table_population_metadata: coroutinize start_subdir
  distributed_loader: table_population_metadata: start_subdir: reindent
  distributed_loader: pre-load all sstables metadata for table before populating it

(cherry picked from commit 4aa0b16852)
2023-04-21 13:23:56 +03:00
Benny Halevy
67f85875cc replica: distributed_loader: reindent populate_keyspace
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit b3e2204fe6)
2023-04-21 13:23:28 +03:00
Benny Halevy
8b874cd4e4 replica: distributed_loader: coroutinize populate_keyspace
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit a3c1dc8cee)
2023-04-21 13:23:18 +03:00
Botond Dénes
b08c582134 mutation/mutation_compactor: consume_partition_end(): reset _stop
The purpose of `_stop` is to remember whether the consumption of the
last partition was interrupted or it was consumed fully. In the former
case, the compactor allows retrieving the compaction state for the given
partition, so that its compaction can be resumed at a later point in
time.
Currently, `_stop` is set to `stop_iteration::yes` whenever the return
value of any of the `consume()` methods is also `stop_iteration::yes`.
Meaning, if the consumption of the partition is interrupted, this is
remembered in `_stop`.
However, a partition whose consumption was interrupted is not always
continued later. Sometimes consumption of a partition is interrupted
because the partition is not interesting and the downstream consumer
wants to stop it. In these cases the compactor should not return an
engaged optional from `detach_state()`, because there is no state to
detach; the state should be thrown away. This was incorrectly handled so
far and is fixed in this patch, by overwriting `_stop` in
`consume_partition_end()` with whatever the downstream consumer returns.
Meaning if they want to skip the partition, then `_stop` is reset to
`stop_iteration::no` and `detach_state()` will return a disengaged
optional as it should in this case.
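
A simplified sketch of the fixed behavior (hypothetical types; the real compactor is more involved):

```cpp
#include <cassert>
#include <optional>

// The downstream verdict at partition end overwrites _stop, so a merely
// skipped partition is not mistaken for an interrupted, resumable one.
enum class stop_iteration { no, yes };

struct compactor {
    stop_iteration _stop = stop_iteration::no;
    int _state = 0; // stand-in for the detachable compaction state

    stop_iteration consume_partition_end(stop_iteration downstream) {
        _stop = downstream; // previously, an earlier consume() verdict stuck
        return _stop;
    }

    std::optional<int> detach_state() const {
        if (_stop == stop_iteration::no) {
            return std::nullopt; // partition skipped: nothing to resume
        }
        return _state; // interrupted: resumable state
    }
};

int main() {
    compactor c;
    c.consume_partition_end(stop_iteration::no); // downstream skipped it
    assert(!c.detach_state()); // disengaged, as it should be
}
```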

Fixes: #12629

Closes #13365

(cherry picked from commit bae62f899d)
2023-04-18 03:18:25 -04:00
Avi Kivity
41556b5f63 Merge 'Backport "reader_concurrency_semaphore: don't evict inactive readers needlessly" to branch-5.0' from Botond Dénes
The patch doesn't apply cleanly, so a targeted backport PR was necessary.
I also needed to cherry-pick two patches from https://github.com/scylladb/scylladb/pull/13255 that the backported patch depends on. Decided against backporting the entire https://github.com/scylladb/scylladb/pull/13255 as it is quite an intrusive change.

Fixes: https://github.com/scylladb/scylladb/issues/11803

Closes #13517

* github.com:scylladb/scylladb:
  reader_concurrency_semaphore: don't evict inactive readers needlessly
  reader_concurrency_semaphore: add stats to record reason for queueing permits
  reader_concurrency_semaphore: can_admit_read(): also return reason for rejection
  reader_concurrency_semaphore: add set_resources()
2023-04-17 12:26:38 +03:00
Raphael S. Carvalho
23e7e594c0 table: Fix disk-space related metrics
The total disk space used metric incorrectly reports the amount of
disk space ever used. It should report the size of
all sstables being used + the ones waiting to be deleted.
Live disk space used, by this definition, shouldn't account for the
ones waiting to be deleted.
And live sstable count shouldn't account for sstables waiting to
be deleted.

Fix all that.

Fixes #12717.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit 529a1239a9)
2023-04-16 22:19:05 +03:00
Michał Chojnowski
e6ac13314d locator: token_metadata: get rid of a quadratic behaviour in get_address_ranges()
Some callees of update_pending_ranges use the variant of get_address_ranges()
which builds a hashmap of all <endpoint, owned range> pairs. For
everywhere_topology, the size of this map is quadratic in the number of
endpoints, making it big enough to cause contiguous allocations of tens of MiB
for clusters of realistic size, potentially causing trouble for the
allocator (as seen e.g. in #12724). This deserves a correction.

This patch removes the quadratic variant of get_address_ranges() and replaces
its uses with its linear counterpart.

Refs #10337
Refs #10817
Refs #10836
Refs #10837
Fixes #12724

(cherry picked from commit 9e57b21e0c)
2023-04-16 22:03:04 +03:00
Botond Dénes
382d815459 reader_concurrency_semaphore: don't evict inactive readers needlessly
Inactive readers should only be evicted to free up resources for waiting
readers. Evicting them when waiters are not admitted for any other
reason than resources is wasteful and leads to extra load later on when
these evicted readers have to be recreated and requeued.
This patch changes the logic on both the registering path and the
admission path to not evict inactive readers unless there are readers
actually waiting on resources.
A unit test is also added, reproducing the overly-aggressive eviction and
checking that it doesn't happen anymore.

Fixes: #11803

Closes #13286

(cherry picked from commit bd57471e54)
2023-04-14 05:04:10 -04:00
Botond Dénes
a867b2c0e5 reader_concurrency_semaphore: add stats to record reason for queueing permits
When diagnosing problems, knowing why permits were queued is very
valuable. Record the reason in a new stats, one for each reason a permit
can be queued.

(cherry picked from commit 7b701ac52e)
2023-04-14 05:04:10 -04:00
Botond Dénes
846edf78c6 reader_concurrency_semaphore: can_admit_read(): also return reason for rejection
So the caller can bump the appropriate counters or log the reason why
the request cannot be admitted.

(cherry picked from commit bb00405818)
2023-04-14 05:04:10 -04:00
Botond Dénes
0ccc07322b reader_concurrency_semaphore: add set_resources()
Allow changing the total or initial resources the semaphore has.
After calling `set_resources()` the semaphore will look as if it
had been created with the specified amount of resources.

(cherry picked from commit ecc7c72acd)
2023-04-14 05:04:10 -04:00
Yaron Kaikov
0b170192a1 release: prepare for 5.0.12 2023-04-10 15:58:57 +03:00
Botond Dénes
fd4b2a3319 db/view/view_update_check: check_needs_view_update_path(): filter out non-member hosts
We currently don't clean up the system_distributed.view_build_status
table after removed nodes. This can cause a false-positive check for
whether view update generation is needed for streaming.
The proper fix is to clean up this table, but that will be more
involved, and even when done, it might not be immediate. So until then,
and to be on the safe side, filter out entries belonging to unknown
hosts from said table.

Fixes: #11905
Refs: #11836

Closes #11860

(cherry picked from commit 84a69b6adb)
2023-03-22 09:14:12 +02:00
Botond Dénes
416929fb2a Update seastar submodule
* seastar d1d40176...07548b37 (1):
  > reactor: re-raise fatal signals

Fixes: #9242
2023-03-22 08:26:32 +02:00
Kamil Braun
9d8d7048eb service: storage_proxy: sequence CDC preimage select with Paxos learn
`paxos_response_handler::learn_decision` was calling
`cdc_service::augment_mutation_call` concurrently with
`storage_proxy::mutate_internal`. `augment_mutation_call` was selecting
rows from the base table in order to create the preimage, while
`mutate_internal` was writing rows to the table. It was therefore
possible for the preimage to observe the update that it accompanied,
which doesn't make any sense, because the preimage is supposed to show
the state before the update.

Fix this by performing the operations sequentially. We can still perform
the CDC mutation write concurrently with the base mutation write.
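
A sketch of the new ordering, with hypothetical stand-ins for the real calls (assuming Seastar coroutines):

```cpp
#include <seastar/core/coroutine.hh>
#include <seastar/core/future.hh>
#include <seastar/core/when_all.hh>

// Hypothetical stand-ins for the real calls (augment_mutation_call and
// mutate_internal).
seastar::future<int> select_preimage() { co_return 0; }
seastar::future<> write_base(int) { co_return; }
seastar::future<> write_cdc_log(int) { co_return; }

seastar::future<> learn_decision() {
    // 1) Sequence the preimage SELECT before any write, so it cannot
    //    observe the very update it accompanies.
    int preimage = co_await select_preimage();
    // 2) The CDC-log write and the base write may still run concurrently.
    co_await seastar::when_all_succeed(write_base(preimage),
                                       write_cdc_log(preimage));
}
```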

`cdc_with_lwt_test` was sometimes failing in debug mode due to this bug
and was marked flaky. Unmark it.

Fixes #12098

(cherry picked from commit 1ef113691a)
2023-03-21 17:11:00 +01:00
Takuya ASADA
bae4155ab2 docker: prevent hostname -i failure when server address is specified
On some docker instance configurations, hostname resolution does not
work, so our script will fail on startup because we use `hostname -i` to
construct cqlshrc.
To prevent the error, we can use --rpc-address or --listen-address
for the address since it should be the same.

Fixes #12011

Closes #12115

(cherry picked from commit 642d035067)
2023-03-21 17:54:56 +02:00
Pavel Emelyanov
d6e2a326cf Merge '[backport] reader_concurrency_semaphore:: clear_inactive_reads(): defer evicting to evict() ' from Botond Dénes
This PR backports 2f4a793457 to branch-5.1. Said patch depends on some other patches that are not part of any release yet.

Closes #13224

* github.com:scylladb/scylladb:
  reader_concurrency_semaphore:: clear_inactive_reads(): defer evicting to evict()
  reader_permit: expose operator<<(reader_permit::state)
  reader_permit: add get_state() accessor
2023-03-17 14:15:17 +03:00
Botond Dénes
15645ff40b reader_concurrency_semaphore:: clear_inactive_reads(): defer evicting to evict()
Instead of open-coding the same, in an incomplete way.
clear_inactive_reads() does incomplete eviction in several ways:
* it doesn't decrement _stats.inactive_reads
* it doesn't set the permit to evicted state
* it doesn't cancel the ttl timer (if any)
* it doesn't call the eviction notifier on the permit (if there is one)

The list goes on. We already have an evict() method that does all this
correctly; use that instead of the current badly open-coded alternative.

This patch also enhances the existing test for clear_inactive_reads()
and adds a new one specifically for `stop()` being called while having
inactive reads.

Fixes: #13048

Closes #13049

(cherry picked from commit 2f4a793457)
2023-03-17 14:14:59 +03:00
Botond Dénes
a808fc7172 reader_permit: expose operator<<(reader_permit::state)
(cherry picked from commit ec1c615029)
2023-03-17 14:14:59 +03:00
Botond Dénes
dd260bfa82 reader_permit: add get_state() accessor
(cherry picked from commit 397266f420)
2023-03-17 14:14:59 +03:00
Takuya ASADA
c46935ed5c scylla_raid_setup: fix nonexistent out()
Since branch-5.0 does not have out(), it should be run(capture_output=True)
instead.

Closes #13155
2023-03-16 16:43:28 +02:00
Avi Kivity
985d6bc4c2 Merge 'scylla_raid_setup: prevent mount failed for /var/lib/scylla for branch-5.0' from Takuya ASADA
Just like 4a8ed4c, we also need to wait for udev event completion to
create /dev/disk/by-uuid/$UUID for newly formatted disk, to mount the
disk just after formatting.

Also added code to make sure the uuid and the uuid-based device path are valid.

Fixes #11359

Closes #13127

* github.com:scylladb/scylladb:
  scylla_raid_setup: run uuidpath existence check only after mount failed
  scylla_raid_setup: prevent mount failed for /var/lib/scylla
  scylla_raid_setup: check uuid and device path are valid
2023-03-09 23:04:52 +02:00
Takuya ASADA
7673ff4ae3 scylla_raid_setup: run uuidpath existence check only after mount failed
We added a UUID device file existence check in #11399; we expect the UUID
device file to be created before checking, and we wait for the creation with
"udevadm settle" after "mkfs.xfs".

However, we are actually getting an error which says the UUID device file is
missing; it probably means "udevadm settle" doesn't guarantee the device file
is created, under some conditions.

To avoid the error, use var-lib-scylla.mount to wait until the UUID device
file is ready, and run the file existence check when the service has
failed.

Fixes #11617

Closes #11666

(cherry picked from commit a938b009ca)
2023-03-09 22:34:03 +09:00
Takuya ASADA
c441eebf46 scylla_raid_setup: prevent mount failed for /var/lib/scylla
Just like 4a8ed4c, we also need to wait for udev event completion to
create /dev/disk/by-uuid/$UUID for newly formatted disk, to mount the
disk just after formatting.

Fixes #11359

(cherry picked from commit 8835a34ab6)
2023-03-09 22:33:38 +09:00
Takuya ASADA
bf4fa80dd7 scylla_raid_setup: check uuid and device path are valid
Added code to make sure the uuid and the uuid-based device path are valid.

(cherry picked from commit 40134efee4)
2023-03-09 22:32:38 +09:00
Jan Ciolek
2010231fe9 cql3: preserve binary_operator.order in search_and_replace
There was a bug in `expr::search_and_replace`:
it didn't preserve the `order` field of binary_operator.

The `order` field is used to mark relations created
using the SCYLLA_CLUSTERING_BOUND.
It is a CQL feature used for internal queries inside Scylla.
It means that we should handle the restriction as a raw
clustering bound, not as an expression in the CQL language.

Losing the SCYLLA_CLUSTERING_BOUND marker could cause issues,
the database could end up selecting the wrong clustering ranges.
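
A simplified model of the bug (the real expr::binary_operator has more fields):

```cpp
// Rebuilding the node without copying every field silently drops `order`.
struct binary_operator {
    int lhs;
    int rhs;
    bool order; // marks relations created via SCYLLA_CLUSTERING_BOUND
};

binary_operator replace_rhs_buggy(const binary_operator& op, int new_rhs) {
    return {op.lhs, new_rhs, false};    // `order` marker is lost
}

binary_operator replace_rhs_fixed(const binary_operator& op, int new_rhs) {
    return {op.lhs, new_rhs, op.order}; // `order` marker is preserved
}

int main() {
    binary_operator op{1, 2, true};
    return replace_rhs_fixed(op, 3).order ? 0 : 1;
}
```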

Fixes: #13055

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>

Closes #13056

(cherry picked from commit aa604bd935)
2023-03-09 12:53:01 +02:00
Takuya ASADA
0a51eb55e3 main: run --version before app_template initialize
Even in an environment that causes errors while initializing Scylla,
"scylla --version" should be able to run without error.
To do so, we need to parse and execute these options before
initializing Scylla/Seastar classes.

Fixes #11117

Closes #11179

(cherry picked from commit d7dfd0a696)
2023-03-09 12:48:25 +02:00
Avi Kivity
d9c6c6283b Update seastar submodule (tls fixes)
* seastar 9a7ba6d57e...d1d4017679 (2):
  > Merge 'tls: vec_push: handle async errors rather than throwing on_internal_error' from Benny Halevy
Fixes #11252
  > tls: vec_push: handle synchronous error from put
Fixes #11118
2023-03-09 12:45:41 +02:00
Tomasz Grabiec
90a5344261 row_cache: Destroy coroutine under region's allocator
The reason is an alloc-dealloc mismatch of position_in_partition objects
allocated by cursors inside the coroutine object stored in the update
variable in row_cache::do_update().

It is allocated under the cache region, but in case of exception it will
be destroyed under the standard allocator. If the update is successful, it
will be cleared under the region allocator, so there is no problem in the
normal case.

Fixes #12068

Closes #12233

(cherry picked from commit 992a73a861)
2023-03-08 20:54:06 +02:00
Gleb Natapov
68da667288 lwt: do not destroy capture in upgrade_if_needed lambda since the lambda is used more than once
If on the first call the capture is destroyed the second call may crash.
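
A standalone reproduction of the bug class:

```cpp
#include <iostream>
#include <string>
#include <utility>

// A lambda that moves from its own capture is only safe to invoke once;
// the second invocation sees a moved-from (typically empty) value.
int main() {
    std::string s = "schema";
    auto use = [s = std::move(s)]() mutable {
        std::string taken = std::move(s); // consumes the capture
        std::cout << "got: '" << taken << "'\n";
    };
    use(); // got: 'schema'
    use(); // got: '' -- the capture was already consumed
}
```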

Fixes: #12958

Message-Id: <Y/sks73Sb35F+PsC@scylladb.com>
(cherry picked from commit 1ce7ad1ee6)
2023-03-08 18:52:22 +02:00
Pavel Emelyanov
9adb1a8fdd azure_snitch: Handle empty zone returned from IMDS
The Azure metadata API may sometimes return an empty zone. If that happens,
shard-0 gets an empty string as its rack, but propagates UNKNOWN_RACK to
other shards.

An empty zone response should be handled regardless.

refs: #12185

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes #12274

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2023-03-02 09:18:04 +03:00
Pavel Emelyanov
7623fe01b7 snitch: Check http response codes to be OK
Several snitch drivers make HTTP requests to get
region/dc/zone/rack/whatever from the cloud provider. They blindly rely
on the response being successful and read the response body to parse the
data they need.

That's not nice; add checks that requests finish with HTTP OK statuses.

refs: #12185

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes #12287

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2023-03-02 09:17:57 +03:00
Botond Dénes
3b0a0c4876 types: unserialize_value for multiprecision_int,bool: don't read uninitialized memory
Check the first fragment before dereferencing it; the fragment might be
empty, in which case move to the next one.
Found by running range scan tests with random schema and random data.

Fixes: #12821
Fixes: #12823
Fixes: #12708

Closes #12824

(cherry picked from commit ef548e654d)
2023-02-23 22:38:39 +02:00
Yaron Kaikov
019d5cde1b release: prepare for 5.0.11 2023-02-23 14:30:57 +02:00
Gleb Natapov
a2e255833a lwt: upgrade stored mutations to the latest schema during prepare
Currently they are upgraded during learn on a replica. There are two
problems with this. First, the column mapping may not exist on a replica
if it missed this particular schema (because it was down, for instance)
and the mapping history is not part of the schema. In this case "Failed
to look up column mapping for schema version" will be thrown. Second, the
LWT request coordinator may not have the schema for the mutation either
(because it was freed from the registry already) and when a replica
tries to retrieve the schema from the coordinator the retrieval will fail,
causing the whole request to fail with "Schema version XXXX not found".

Both of those problems can be fixed by upgrading stored mutations
during prepare on the node they are stored at. To upgrade the mutation,
its column mapping is needed, and it is guaranteed to be present
at the node the mutation is stored at, since it is a prerequisite for
storing it that the corresponding schema is available. After that the
mutation is processed using the latest schema, which will be available
on all nodes.

Fixes #10770

Message-Id: <Y7/ifraPJghCWTsq@scylladb.com>
(cherry picked from commit 15ebd59071)
2023-02-22 21:58:20 +02:00
Tomasz Grabiec
f4aa5cacb1 db: Fix trim_clustering_row_ranges_to() for non-full keys and reverse order
trim_clustering_row_ranges_to() is broken for non-full keys in reverse
mode. It will trim the range to
position_in_partition_view::after_key(full_key) instead of
position_in_partition_view::before_key(key), hence it will include the
key in the resulting range rather than exclude it.

Fixes #12180
Refs #1446

(cherry picked from commit 536c0ab194)
2023-02-22 21:52:59 +02:00
Tomasz Grabiec
8ea9a16f9e types: Fix comparison of frozen sets with empty values
A frozen set can be part of the clustering key, and with compact
storage, the corresponding key component can have an empty value.

Comparison was not prepared for this, the iterator attempts to
deserialize the item count and will fail if the value is empty.

Fixes #12242

(cherry picked from commit 232ce699ab)
2023-02-22 21:44:49 +02:00
Michał Chojnowski
1aa5283a38 utils: config_file: fix handling of workdir,W in the YAML file
Option names given in db/config.cc are handled for the command line by passing
them to boost::program_options, and for YAML by comparing them with YAML
keys.
boost::program_options has logic for understanding the
long_name,short_name syntax, so for a "workdir,W" option both --workdir and -W
worked, as intended. But our YAML config parsing doesn't have this logic
and expected "workdir,W" verbatim, which is obviously not intended. Fix that.

Fixes #7478
Fixes #9500
Fixes #11503

Closes #11506

(cherry picked from commit af7ace3926)
2023-02-22 21:33:25 +02:00
Takuya ASADA
2e7b1858ad scylla_coredump_setup: fix coredump timeout settings
We currently configure only TimeoutStartSec, but it's probably not
enough to prevent a coredump timeout, since TimeoutStartSec is the maximum
waiting time for service startup, and there is another directive to
specify the maximum service running time (RuntimeMaxSec).

To fix the problem, we should specify RuntimeMaxSec and TimeoutSec (it
configures both TimeoutStartSec and TimeoutStopSec).

Fixes #5430

Closes #12757

(cherry picked from commit bf27fdeaa2)
2023-02-19 21:14:14 +02:00
Avi Kivity
2542b57ddc Merge 'reader_concurrency_semaphore: fix waiter/inactive race' from Botond Dénes
We recently (in 7fbad8de87) made sure all admission paths can trigger the eviction of inactive reads. As reader eviction happens in the background, a mechanism was added to make sure only a single eviction fiber was running at any given time. This mechanism however had a preemption point between stopping the fiber and releasing the evict lock. This gave an opportunity for either new waiters or inactive readers to be added, without the fiber acting on it. Since it still held onto the lock, it also prevented other eviction fibers from starting. This could create a situation where the semaphore could admit new reads by evicting inactive ones, but it still has waiters. Since an empty waitlist is also an admission criterion, once one waiter is wrongly added, many more can accumulate.
This series fixes this by ensuring the lock is released the instant the fiber decides there is no more work to do.
It also fixes the assert failure on recursive eviction and adds detection of the inactive/waiter contradiction.

Fixes: #11923
Refs: #11770

Closes #12026

* github.com:scylladb/scylladb:
  reader_concurrency_semaphore: do_wait_admission(): detect admission-waiter anomaly
  reader_concurrency_semaphore: evict_readers_in_the_background(): eliminate blind spot
  reader_concurrency_semaphore: do_detach_inactive_read(): do a complete detach

(cherry picked from commit 15ee8cfc05)
2023-02-09 11:45:53 +02:00
Botond Dénes
01a9871fc3 reader_concurrency_semaphore: unify admission logic across all paths
The semaphore currently has two admission paths: the
obtain_permit()/with_permit() methods which admit permits on user
request (the front door) and the maybe_admit_waiters() which admits
permits based on internal events like memory resource being returned
(the back door). The two paths used their own admission conditions
and naturally this means that they diverged in time. Notably,
maybe_admit_waiters() did not look at inactive readers assuming that if
there are waiters there cannot be inactive readers. This is not true
however since we merged the execution-stage into the semaphore. Waiters
can queue up even when there are inactive reads and thus
maybe_admit_waiters() has to consider evicting some of them to see if
this would allow for admitting new reads.
To avoid such divergence in the future, the admission logic was moved
into a new method can_admit_read() which is now shared between the two
method families. This method now checks for the possibility of evicting
inactive readers as well.
The admission logic was tuned slightly to only consider evicting
inactive readers if there is a real possibility that this will result
in admissions: notably, before this patch, resource availability was
checked before stalls were (used permits == blocked permits), so we
could evict readers even if this couldn't help.
Because now eviction can be started from maybe_admit_waiters(), which is
also downstream from eviction, we added a flag to avoid recursive
evict -> maybe admit -> evict ... loops.

Fixes: #11770

Closes #11784

(cherry picked from commit 7fbad8de87)
2023-02-09 11:45:47 +02:00
Beni Peled
6bb7fac8d8 release: prepare for 5.0.10 2023-02-06 14:42:32 +02:00
Botond Dénes
5dff7489b1 sstables: track decompressed buffers
Convert decompressed temporary buffers into tracked buffers just before
returning them to the upper layer. This ensures these buffers are known
to the reader concurrency semaphore and it has an accurate view of the
actual memory consumption of reads.

Fixes: #12448

Closes #12454

(cherry picked from commit c4688563e3)
2023-02-05 19:39:04 +02:00
Tomasz Grabiec
2775b1d136 row_cache: Fix violation of the "oldest version are evicted first" when evicting last dummy
Consider the following MVCC state of a partition:

   v2: ==== <7> [entry2] ==== <9> ===== <last dummy>
   v1: ================================ <last dummy> [entry1]

Where === means a continuous range and --- means a discontinuous range.

After two LRU items are evicted (entry1 and entry2), we will end up with:

   v2: ---------------------- <9> ===== <last dummy>
   v1: ================================ <last dummy> [entry1]

This will cause readers to incorrectly think there are no rows before
entry <9>, because the range is continuous in v1, and continuity of a
snapshot is a union of continuous intervals in all versions. The
cursor will see the interval before <9> as continuous and the reader
will produce no rows.

This is only temporary, because current MVCC merging rules are such
that the flag on the latest entry wins, so we'll end up with this once
v1 is no longer needed:

   v2: ---------------------- <9> ===== <last dummy>

...and the reader will go to sstables to fetch the evicted rows before
entry <9>, as expected.

The bug is in rows_entry::on_evicted(), which treats the last dummy
entry in a special way, and doesn't evict it, and doesn't clear the
continuity by omission.

The situation is not easy to trigger because it requires certain
eviction pattern concurrent with multiple reads of the same partition
in different versions, so across memtable flushes.

Closes #12452

(cherry-picked from commit f97268d8f2)

Fixes #12451.
2023-02-05 19:39:04 +02:00
Botond Dénes
2ae5675c0f types: is_tuple(): handle reverse types
Currently reverse types match the default case (false), even though they
might be wrapping a tuple type. One user-visible effect of this is that
a schema which has a reversed<frozen<UDT>> clustering key component
will have this component incorrectly represented in the schema CQL dump:
the UDT will lose the frozen attribute. When attempting to recreate
this schema based on the dump, it will fail, as only frozen UDTs are
allowed in primary key components.

Fixes: #12576

Closes #12579

(cherry picked from commit ebc100f74f)
2023-02-05 19:39:04 +02:00
Calle Wilund
d507ad9424 alternator::streams: Sort tables in list_streams to ensure no duplicates
Fixes #12601 (maybe?)

Sort the set of tables on ID. This should ensure we never
generate duplicates in a paged listing here. Can obviously miss things if they
are added between paged calls and end up with a "smaller" UUID/ARN, but that
is to be expected.

(cherry picked from commit da8adb4d26)
2023-02-05 19:39:00 +02:00
Benny Halevy
413af945c0 view: row_lock: lock_ck: find or construct row_lock under partition lock
Since we're potentially searching the row_lock in parallel to acquiring
the read_lock on the partition, we're racing with row_locker::unlock
that may erase the _row_locks entry for the same clustering key, since
there is no lock to protect it up until the partition lock has been
acquired and the lock_partition future is resolved.

This change moves the code to search for or allocate the row lock
_after_ the partition lock has been acquired to make sure we're
synchronously starting the read/write lock function on it, without
yielding, to prevent this use-after-free.

This adds an allocation for copying the clustering key in advance
even if a row_lock entry already exists, which wasn't needed before.
It only slows us down (a bit) when there is contention and the lock
already existed when we want to go locking. In the fast path there
is no contention and then the code already had to create the lock
and copy the key. In any case, the penalty of copying the key once
is tiny compared to the rest of the work that view updates are doing.

This is required on top of 5007ded2c1 as
seen in https://github.com/scylladb/scylladb/issues/12632
which is closely related to #12168 but demonstrates a different race
causing use-after-free.

Fixes #12632

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 4b5e324ecb)
2023-02-05 17:38:49 +02:00
Kefu Chai
9a71680dc7 cql3/selection: construct string_view using char* not size
Before this change, we constructed an sstring from a comma expression,
which evaluates to the return value of `name.size()`, but what we
expect is `sstring(const char*, size_t)`.

In this change:

* instead of passing the size of the string_view,
  both its address and size are used
* `std::string_view` is constructed instead of sstring, for better
  performance, as we don't need to perform a deep copy

The issue is reported by GCC-13:

```
In file included from cql3/selection/selectable.cc:11:
cql3/selection/field_selector.hh:83:60: error: ignoring return value of function declared with 'nodiscard' attribute [-Werror,-Wunused-result]
        auto sname = sstring(reinterpret_cast<const char*>(name.begin(), name.size()));
                                                           ^~~~~~~~~~
```
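
The shape of the corrected code, per the description above (illustrative types; the real `name` is a bytes view):

```cpp
#include <cstdint>
#include <string_view>
#include <vector>

// Pass the pointer and the size as two separate arguments, and build a
// string_view to avoid a deep copy.
std::string_view field_name(const std::vector<int8_t>& name) {
    return std::string_view(reinterpret_cast<const char*>(name.data()),
                            name.size());
}

int main() {
    std::vector<int8_t> raw{'a', 'b', 'c'};
    return field_name(raw) == "abc" ? 0 : 1;
}
```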

Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes #12666

(cherry picked from commit 186ceea009)

Fixes #12739.

(cherry picked from commit b588b19620)
2023-02-05 13:51:32 +02:00
Botond Dénes
94b8baa797 Revert "reader_concurrency_semaphore: unify admission logic across all paths"
This reverts commit 0e388d2140.

This patch is suspected to be the cause of read timeouts.
Refs: #12435
2023-01-11 07:09:17 +02:00
Botond Dénes
e372a5fe0a Revert "Merge 'reader_concurrency_semaphore: fix waiter/inactive race' from Botond Dénes"
This reverts commit bf92c2b44c.

This patch is suspected to be the cause of read timeouts.
Refs: #12435
2023-01-11 07:08:16 +02:00
Asias He
692e5ed175 gossip: Improve get_live_token_owners and get_unreachable_token_owners
The get_live_token_owners returns the nodes that are part of the ring
and live.

The get_unreachable_token_owners returns the nodes that are part of the ring
and are not alive.

The token_metadata::get_all_endpoints returns nodes that are part of the
ring.

The patch changes both functions to use the more authoritative source to
get the nodes that are part of the ring and call is_alive to check if
the node is up or down, so that correctness does not depend on
any derived information.

This patch fixes a truncate issue in storage_proxy::truncate_blocking
where it calls get_live_token_owners and get_unreachable_token_owners to
decide the nodes to talk with for truncate operation. The truncate
failed because incorrect nodes were returned.

Fixes #10296
Fixes #11928

Closes #11952

(cherry picked from commit 16bd9ec8b1)
2023-01-09 16:55:38 +02:00
Michał Chojnowski
5a299f65ff configure: don't reduce parsers' optimization level to 1 in release
The line modified in this patch was supposed to increase the
optimization levels of parsers in debug mode to 1, because they
were too slow otherwise. But as a side effect, it also reduced the
optimization level in release mode to 1. This is not a problem
for the CQL frontend, because statement preparation is not
performance-sensitive, but it is a serious performance problem
for Alternator, where it lies in the hot path.

Fix this by only applying -O1 to debug modes.

Fixes #12463

Closes #12460

(cherry picked from commit 08b3a9c786)
2023-01-08 01:35:15 +02:00
Botond Dénes
f4ae2fa5f9 Merge 'Branch 5.0: backport 'range_tombstone_change_generator: flush: emit closing range_tombstone_change'' from Benny Halevy
This series backports 0a3aba36e6 to branch 5.0.

It ensures that a closing range_tombstone_change is emitted if the highest tombstone is open ended
since range_tombstone_change_generator::flush does not do it by default.

With the additional testing added in 9a59e9369b87b1bcefed6d1d5edf25c5d3451bc4, unit tests fail without the additional patches in the series, so it exposes a latent bug in the branch where the closing range_tombstone_change is not always emitted when flushing on end of partition or end of position range.

One additional change was required for unit tests to pass:
```diff
diff --git a/range_tombstone_change_generator.hh b/range_tombstone_change_generator.hh
index 6f98be5dce..9cde8d9b20 100644
--- a/range_tombstone_change_generator.hh
+++ b/range_tombstone_change_generator.hh
@@ -78,6 +78,7 @@ class range_tombstone_change_generator {
     template<RangeTombstoneChangeConsumer C>
     void flush(const position_in_partition_view upper_bound, C consumer) {
         if (_range_tombstones.empty()) {
+            _lower_bound = upper_bound;
             return;
         }

```

Refs https://github.com/scylladb/scylla/issues/10316

Closes #10969

* github.com:scylladb/scylladb:
  reader: upgrading_consumer: let range_tombstone_change_generator emit last closing change
  range_tombstone_change_generator: flush: emit end_position when upper limit is after all clustered rows
  range_tombstone_change_generator: flush: use tri_compare rather than less
  range_tombstone_change_generator: flush: return early if empty
2023-01-04 12:52:01 +02:00
Nadav Har'El
07c20bdfea materialized view: fix bug in some large modifications to base partitions
Sometimes a single modification to a base partition requires updates to
a large number of view rows. A common example is deletion of a base
partition containing many rows. A large BATCH is also possible.

To avoid large allocations, we split the large amount of work into
batch of 100 (max_rows_for_view_updates) rows each. The existing code
assumed an empty result from one of these batches meant that we are
done. But this assumption was incorrect: There are several cases when
a base-table update may not need a view update to be generated (see
can_skip_view_updates()) so if all 100 rows in a batch were skipped,
the view update stopped prematurely. This patch includes two tests
showing when this bug can happen - one test using a partition deletion
with a USING TIMESTAMP causing the deletion to not affect the first
100 rows, and a second test using a specially-crafed large BATCH.
These use cases are fairly esoteric, but in fact hit a user in the
wild, which led to the discovery of this bug.

The fix is fairly simple: To detect when build_some() is done it is no
longer enough to check if it returned zero view-update rows; Rather,
it explicitly returns whether or not it is done as an std::optional.
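
A sketch of the changed control flow (hypothetical types):

```cpp
#include <cstddef>
#include <optional>
#include <vector>

// build_some() reports completion explicitly via std::optional, because
// an empty batch can simply mean that all rows in it were skippable.
std::optional<std::vector<int>> build_some() {
    static int calls = 0;
    switch (++calls) {
    case 1: return std::vector<int>{1, 2, 3}; // a batch of view updates
    case 2: return std::vector<int>{};        // empty, but NOT done
    default: return std::nullopt;             // done for real
    }
}

int main() {
    std::size_t total = 0;
    while (auto batch = build_some()) { // stop only on nullopt
        total += batch->size();
    }
    return total == 3 ? 0 : 1;
}
```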

The patch includes several tests for this bug, which pass on Cassandra,
failed on Scylla before this patch, and pass with this patch.

Fixes #12297.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12305

(cherry picked from commit 92d03be37b)
2023-01-04 11:36:39 +02:00
Botond Dénes
8a36c4be54 evictable_reader: avoid preemption pitfall around waiting for readmission
Permits have to wait for re-admission after having been evicted. This
happens via `reader_permit::maybe_wait_readmission()`. The user of this
method -- the evictable reader -- uses it to re-wait admission when the
underlying reader was evicted. There is one tricky scenario however,
when the underlying reader is created for the first time. When the
evictable reader is part of a multishard query stack, the created reader
might in fact be a resumed, saved one. These readers are kept in an
inactive state until actually resumed. The evictable reader shares it
permit with the to-be-resumed reader so it can check whether it has been
evicted while saved and needs to wait readmission before being resumed.
In this flow it is critical that there is no preemption point between
this check and actually resuming the reader, because if there is, the
reader might end up actually recreated, without having waited for
readmission first.
To help avoid this situation, the existing `maybe_wait_readmission()` is
split into two methods:
* `bool reader_permit::needs_readmission()`
* `future<> reader_permit::wait_for_readmission()`

The evictable reader can now ensure there is no preemption point between
`needs_readmission()` and resuming the reader.
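
A sketch of how the split API is used (the permit here is a hypothetical stand-in, assuming Seastar coroutines):

```cpp
#include <seastar/core/coroutine.hh>
#include <seastar/core/future.hh>

// Hypothetical stand-in for the real reader_permit.
struct permit_t {
    bool evicted = false;
    bool needs_readmission() const { return evicted; }
    seastar::future<> wait_for_readmission() {
        evicted = false;
        return seastar::make_ready_future<>();
    }
};

seastar::future<> resume_saved_reader(permit_t& permit) {
    if (permit.needs_readmission()) {
        co_await permit.wait_for_readmission();
    }
    // Resume the saved reader here: when readmission wasn't needed there
    // was no co_await, hence no preemption point, between the check and
    // this point.
    co_return;
}
```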

Fixes: #10187

Tests: unit(release)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20220315105851.170364-1-bdenes@scylladb.com>
(cherry picked from commit 61028ad718)
2023-01-04 11:20:28 +02:00
Avi Kivity
bf92c2b44c Merge 'reader_concurrency_semaphore: fix waiter/inactive race' from Botond Dénes
We recently (in 7fbad8de87) made sure all admission paths can trigger the eviction of inactive reads. As reader eviction happens in the background, a mechanism was added to make sure only a single eviction fiber was running at any given time. This mechanism however had a preemption point between stopping the fiber and releasing the evict lock. This gave an opportunity for either new waiters or inactive readers to be added, without the fiber acting on it. Since it still held onto the lock, it also prevented other eviction fibers from starting. This could create a situation where the semaphore could admit new reads by evicting inactive ones, but it still has waiters. Since an empty waitlist is also an admission criterion, once one waiter is wrongly added, many more can accumulate.
This series fixes this by ensuring the lock is released the instant the fiber decides there is no more work to do.
It also fixes the assert failure on recursive eviction and adds detection of the inactive/waiter contradiction.

Fixes: #11923
Refs: #11770

Closes #12026

* github.com:scylladb/scylladb:
  reader_concurrency_semaphore: do_wait_admission(): detect admission-waiter anomaly
  reader_concurrency_semaphore: evict_readers_in_the_background(): eliminate blind spot
  reader_concurrency_semaphore: do_detach_inactive_read(): do a complete detach

(cherry picked from commit 15ee8cfc05)
2023-01-03 16:46:44 +02:00
Botond Dénes
0e388d2140 reader_concurrency_semaphore: unify admission logic across all paths
The semaphore currently has two admission paths: the
obtain_permit()/with_permit() methods which admit permits on user
request (the front door) and the maybe_admit_waiters() which admits
permits based on internal events like memory resource being returned
(the back door). The two paths used their own admission conditions
and naturally this means that they diverged in time. Notably,
maybe_admit_waiters() did not look at inactive readers assuming that if
there are waiters there cannot be inactive readers. This is not true
however since we merged the execution-stage into the semaphore. Waiters
can queue up even when there are inactive reads and thus
maybe_admit_waiters() has to consider evicting some of them to see if
this would allow for admitting new reads.
To avoid such divergence in the future, the admission logic was moved
into a new method can_admit_read() which is now shared between the two
method families. This method now checks for the possibility of evicting
inactive readers as well.
The admission logic was tuned slightly to only consider evicting
inactive readers if there is a real possibility that this will result
in admissions: notably, before this patch, resource availability was
checked before stalls were (used permits == blocked permits), so we
could evict readers even if this couldn't help.
Because now eviction can be started from maybe_admit_waiters(), which is
also downstream from eviction, we added a flag to avoid recursive
evict -> maybe admit -> evict ... loops.

Fixes: #11770

Closes #11784

(cherry picked from commit 7fbad8de87)
2023-01-03 16:46:30 +02:00
Botond Dénes
288eb9d231 Merge 'Backport 5.0: cleanup compaction: flush memtable' from Benny Halevy
This is a backport of 9fa1783892 (#11902) to branch-5.0

Flush the memtable before cleaning up the table, so as not to leave any
disowned tokens in the memtable, as they might be resurrected if left there.

Refs #1239

Closes #12415

* github.com:scylladb/scylladb:
  table: perform_cleanup_compaction: flush memtable
  table: add perform_cleanup_compaction
  api: storage_service: add logging for compaction operations et al
2023-01-03 12:23:03 +02:00
Benny Halevy
9219a59802 table: perform_cleanup_compaction: flush memtable
We don't explicitly clean up the memtable, while
it might hold tokens disowned by the current node.

Flush the memtable before performing cleanup compaction
to make sure all tokens in the memtable are cleaned up.

Note that non-owned ranges are invalidated in the cache
in compaction_group::update_main_sstable_list_on_compaction_completion
using desc.ranges_for_cache_invalidation.

Fixes #1239

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from eb3a94e2bc)
2022-12-29 09:36:37 +02:00
Benny Halevy
f9cea4dc51 table: add perform_cleanup_compaction
Move the integration with compaction_manager
from the api layer to the table class so
it can also make sure the memtable is cleaned up in the next patch.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from fc278be6c4)
2022-12-29 09:36:37 +02:00
Benny Halevy
081b2b76cc api: storage_service: add logging for compaction operations et al
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from 85523c45c0)
2022-12-29 09:36:20 +02:00
Anna Mikhlin
dfb229a18a release: prepare for 5.0.9 2022-12-29 09:25:47 +02:00
Takuya ASADA
60da855c2d scylla_setup: fix incorrect type definition on --online-discard option
The --online-discard option is defined as a string parameter since it doesn't
specify "action=", but it has a boolean default value (default=True).
This breaks "provisioning in a similar environment", since the code
assumes a boolean value implies "action='store_true'", but that's not the case.

We should change the type of the option to int, and also specify
"choices=[0, 1]" just like --io-setup does.

Fixes #11700

Closes #11831

(cherry picked from commit acc408c976)
2022-12-28 20:44:12 +02:00
Benny Halevy
1718861e94 main: shutdown: do not abort on storage_io_error
Do not abort in defer_verbose_shutdown if the callback
throws storage_io_error, similarly to and in addition to
the system error handling that was added in
132c9d5933

As seen in https://github.com/scylladb/scylla/issues/9573#issuecomment-1148238291

Fixes #9573

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #10740

(cherry picked from commit 1daa7820c9)
2022-12-28 19:29:17 +02:00
Petr Gusev
e03e9b1abe cql: batch statement, inserting a row with a null key column should be forbidden
Regular INSERT statements with null values for primary key
components are rejected by Scylla since #9286 and #9314.
Batch statements missed a similar check, this patch
fixes it.

Fixes: #12060
(cherry picked from commit 7730c4718e)
2022-12-28 18:15:54 +02:00
Benny Halevy
26c51025c1 reader: upgrading_consumer: let range_tombstone_change_generator emit last closing change
When flushing range tombstones up to
position_in_partition::after_all_clustered_rows(),
the range_tombstone_change_generator now emits
the closing range_tombstone_change, so there's
no need for the upgrading_consumer to do so too.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 002be743f6)
2022-12-28 16:23:11 +02:00
Benny Halevy
5c39a4524a range_tombstone_change_generator: flush: emit end_position when upper limit is after all clustered rows
When the highest tombstone is open ended, we must
emit a closing range_tombstone_change at
position_in_partition::after_all_clustered_rows().

Since all consumers need to do it, implement the logic
in the range_tombstone_change_generator itself.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit cd171f309c)
2022-12-28 16:23:11 +02:00
Benny Halevy
9823e8d9c5 range_tombstone_change_generator: flush: use tri_compare rather than less
less is already using tri_compare internally,
and we'll use tri_compare for equality in the next patch.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 2c5a6b3894)
2022-12-28 16:23:11 +02:00
Benny Halevy
b48c9cae95 range_tombstone_change_generator: flush: return early if empty
Optimize the common, empty case.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 18a80a98b8)
(added _lower_bound = upper_bound on early return)
2022-12-28 16:23:11 +02:00
Nadav Har'El
14077d2def murmur3: fix inconsistent token for empty partition key
Traditionally in Scylla and in Cassandra, an empty partition key is mapped
to minimum_token() instead of the empty key's usual hash function (0).
The reasons for this are unknown (to me), but one possibility is that
having one known key that maps to the minimal token is useful for
various iterations.

In murmur3_partitioner.cc we have two variants of the token calculation
function - the first is get_token(bytes_view) and the second is
get_token(schema, partition_key_view). The first includes that empty-
key special case, but the second was missing this special case!
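
A toy model of the inconsistency (hash() stands in for murmur3):

```cpp
#include <cassert>
#include <cstdint>
#include <limits>
#include <string_view>

// Both get_token() variants must agree on mapping the empty key to
// minimum_token() instead of its regular hash (0).
constexpr int64_t minimum_token = std::numeric_limits<int64_t>::min();

int64_t hash(std::string_view key) { return key.empty() ? 0 : 42; }

int64_t get_token(std::string_view key) {
    if (key.empty()) {
        return minimum_token; // the traditional special case
    }
    return hash(key);
}

int main() {
    // Before the fix, one variant returned hash("") == 0 while the other
    // returned minimum_token, so index lookups missed the partition.
    assert(get_token("") == minimum_token);
}
```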

As Kamil first noted in #9352, the second variant is used when looking
up partitions in the index file - so if a partition with an empty-string
key is saved under one token, it will be looked up under a different
token and not found. I reproduced exactly this problem when fixing
issues #9364 and #9375 (empty-string keys in materialized views and
indexes) - where a partition with an empty key was visible in a
full-table scan but couldn't be found by looking up its key because of
the wrong index lookup.

I also tried an alternative fix - changing both implementations to return
minimum_token (and not 0) for the empty key. But this is undesirable -
minimum_token is not supposed to be a valid token, so the tokenizer and
sharder may not return a valid replica or shard for it, so we shouldn't
store data under such a token. We also have code (such as an increasing-
key sanity check in the flat mutation reader) which assumes that
no real key in the data can be minimum_token, and our plan is to start
allowing data with an empty key (at least for materialized views).

This patch does not risk a backward-incompatible disk format changes
for two reasons:

1. In the current Scylla, there was no valid case where an empty partition
   key may appear. CQL and Thrift forbid such keys, and materialized-views
   and indexes also (incorrectly - see #9364, #9375) drop such rows.
2. Although Cassandra *does* allow empty partition keys, they are only
   allowed in materialized views and indexes - and we don't support reading
   materialized views generated by Cassandra (the user must re-generate
   them in Scylla).

When #9364 and #9375 are fixed by the next patch, empty partition keys
will start appearing in Scylla (in materialized views and in the
materialized view backing a secondary index), and this fix will become
important.

Fixes #9352
Refs #9364
Refs #9375

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
(cherry picked from commit bc4d0fd5ad)
2022-12-28 15:24:53 +02:00
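The essence of the fix is that both token-computation entry points must share the same empty-key special case. A simplified sketch, with a toy FNV hash standing in for murmur3 and INT64_MIN standing in for minimum_token():

```cpp
#include <cstdint>
#include <iostream>
#include <string_view>

constexpr int64_t minimum_token = INT64_MIN;

// Toy FNV-1a hash, a stand-in for murmur3 over a serialized key.
int64_t hash_key(std::string_view key) {
    uint64_t h = 1469598103934665603ULL;
    for (unsigned char c : key) {
        h = (h ^ c) * 1099511628211ULL;
    }
    return static_cast<int64_t>(h);
}

// Both overloads must agree: an empty key maps to minimum_token,
// everything else to its hash.
int64_t get_token(std::string_view serialized_key) {
    if (serialized_key.empty()) {
        return minimum_token; // the special case one overload was missing
    }
    return hash_key(serialized_key);
}

int main() {
    // If one code path returned the plain hash of "" while the other
    // returned minimum_token, a partition written under one token could
    // never be found via the index, which is exactly the bug above.
    std::cout << get_token("") << "\n" << get_token("pk") << "\n";
}
```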
Piotr Grabowski
25508705a8 type_json: fix wrong blob JSON validation
Fixes a wrong condition for validating whether a JSON string representing
a blob value is valid. Previously, strings such as "6" or "0392fa" would
pass the validation, even though they are too short or don't start with
"0x". Add those test cases to json_cql_query_test.cc.

Fixes #10114

(cherry picked from commit f8b67c9bd1)
2022-12-28 15:17:31 +02:00
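A hedged sketch of the kind of check involved (the exact rules Scylla enforces may differ in detail, e.g. around odd digit counts): a JSON string for a blob must start with "0x" and the rest must be hex digits, so short strings like "6" or prefix-less ones like "0392fa" are rejected:

```cpp
#include <cctype>
#include <iostream>
#include <string_view>

bool is_valid_blob_json(std::string_view s) {
    // Must have the "0x" prefix; "6" and "0392fa" both fail here.
    if (s.size() < 2 || s[0] != '0' || s[1] != 'x') {
        return false;
    }
    // Everything after the prefix must be a hex digit.
    for (unsigned char c : s.substr(2)) {
        if (!std::isxdigit(c)) {
            return false;
        }
    }
    return true;
}

int main() {
    for (std::string_view s : {"6", "0392fa", "0x0392fa"}) {
        std::cout << s << " -> "
                  << (is_valid_blob_json(s) ? "ok" : "rejected") << "\n";
    }
}
```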
Botond Dénes
347da028e9 mutation_compactor: reset stop flag on page start
When the mutation compactor has all the rows it needs for a page, it
saves the decision to stop in a member flag: _stop.
For single partition queries, the mutation compactor is kept alive
across pages and so it has a method, start_new_page() to reset its state
for the next page. This method didn't clear the _stop flag. This meant
that the value set at the end of the previous page could cause the new page,
and subsequently the entire query, to be stopped prematurely.
This can happen if the new page starts with a row that is covered by a
higher level tombstone and is completely empty after compaction.
Reset the _stop flag in start_new_page() to prevent this.

This commit also adds a unit test which reproduces the bug.

Fixes: #12361

Closes #12384

(cherry picked from commit b0d95948e1)
2022-12-25 09:45:50 +02:00
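The shape of the bug and the fix, reduced to a toy class (the names mirror the commit message, but this is not the real mutation compactor):

```cpp
#include <iostream>

class compactor {
    bool _stop = false;
    int _rows_this_page = 0;
    int _page_limit;
public:
    explicit compactor(int page_limit) : _page_limit(page_limit) {}

    void consume_row() {
        if (++_rows_this_page >= _page_limit) {
            _stop = true; // enough rows for this page
        }
    }
    bool stopped() const { return _stop; }

    // Reset per-page state when a new page starts. Before the fix, _stop
    // kept the value from the end of the previous page, so a new page that
    // begins with rows that compact away to nothing would terminate the
    // whole query prematurely.
    void start_new_page() {
        _rows_this_page = 0;
        _stop = false;
    }
};

int main() {
    compactor c(/*page_limit=*/2);
    c.consume_row();
    c.consume_row();    // page 1 full, _stop == true
    c.start_new_page(); // must clear _stop here
    std::cout << (c.stopped() ? "bug" : "ok") << "\n";
}
```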
Yaron Kaikov
874fa15202 release: prepare for 5.0.8 2022-12-21 21:53:30 +02:00
Michał Chojnowski
99c03cb2af sstables: index_reader: always evict the local cache gently
Due to an oversight, the local index cache isn't evicted gently
when _upper_bound exists. This is a source of reactor stalls.
Fix that.

Fixes #12271

Closes #12364

(cherry picked from commit d9269abf5b)
2022-12-21 13:43:26 +02:00
Botond Dénes
6c35d3c5cd Merge 'Backport nodeops abort thread use-after-free patches' from Pavel Emelyanov
This includes merges 396d9e6a46 and 2c021affd1

Things that got changed here:

1. All the node_ops_... stuff in storage_service was coroutinized after 5.0, so in this merge the changes were de-coroutinized back
2. Had to cherry-pick molding for UUID (69fcc053bb and 489e50ef3a)
3. tracker::is_aborted() was added after 5.0, it caused minor context conflict
4. watchdog interval was changed, also caused minor context conflict

refs: #10284

Closes #12335

* github.com:scylladb/scylladb:
  repair: use sharded abort_source to abort repair_info
  repair: node_ops_info: add start and stop methods
  storage_service: node_ops_abort_thread: abort all node ops on shutdown
  storage_service: node_ops_abort_thread: co_return only after printing log message
  storage_service: node_ops_meta_data: add start and stop methods
  repair: node_ops_info: prevent accidental copy
  repair: Remove ops_uuid
  repair: Remove abort_repair_node_ops() altogether
  repair: Subscribe on node_ops_info::as abortion
  repair: Keep abort source on node_ops_info
  repair: Pass node_ops_info arg to do_sync_data_using_repair()
  repair: Mark repair_info::abort() noexcept
  node_ops: Remove _aborted bit
  node_ops: Simplify construction of node_ops_metadata
  main: Fix message about repair service starting
  utils: uuid: make operator bool explicit
  utils: uuid: add null_uuid
2022-12-16 10:49:49 +02:00
Benny Halevy
707622ce15 repair: use sharded abort_source to abort repair_info
Currently we use a single shared_ptr<abort_source>
that can't be copied across shards.

Instead, use a sharded<abort_source> in node_ops_info so that each
repair_info instance will use an (optional) abort_source*
on its own shard.

Added respective start and stop methods, plus a local_abort_source
getter to get the shard-local abort_source (if available).

Fixes #11826

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-15 18:48:45 +03:00
Benny Halevy
bab36b604c repair: node_ops_info: add start and stop methods
Prepare for adding a sharded<abort_source> member.

Wire start/stop in storage_service::node_ops_meta_data.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-15 18:48:45 +03:00
Benny Halevy
8840711e79 storage_service: node_ops_abort_thread: abort all node ops on shutdown
A later patch adds a sharded<abort_source> to node_ops_info.
On shutdown, we must stop it in an orderly fashion, so use the node_ops_abort_thread
shutdown path (where node_ops_signal_abort is called with a nullopt)
to abort (and stop) all outstanding node ops by passing
a null_uuid to node_ops_abort, and let it iterate over all
node ops to abort and stop them.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-15 18:48:45 +03:00
Benny Halevy
af18bb3fe9 storage_service: node_ops_abort_thread: co_return only after printing log message
Currently the function co_returns if (!uuid_opt)
so the log info message indicating it's stopped
is not printed.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-15 18:48:45 +03:00
Benny Halevy
6003cba7a8 storage_service: node_ops_meta_data: add start and stop methods
Prepare for starting and stopping repair node_ops_info

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-15 18:48:45 +03:00
Benny Halevy
e9afd076eb repair: node_ops_info: prevent accidental copy
Delete node_ops_info copy and move constructors before
we add a sharded<abort_source> member for the per-shard repairs
in the next patch.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
2022-12-15 18:48:45 +03:00
Pavel Emelyanov
c5f732d42a repair: Remove ops_uuid
It used to be used to abort repair_info by the corresponding node-ops
uuid, but this code is no longer there, so it's good to drop the uuid as
well

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 18:48:45 +03:00
Pavel Emelyanov
13a1408135 repair: Remove abort_repair_node_ops() altogether
This code is dead after previous patch

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 18:48:45 +03:00
Pavel Emelyanov
6685e00dd4 repair: Subscribe on node_ops_info::as abortion
When node_ops_meta_data aborts, it also kicks repair to find and abort
all relevant repair_infos. This can now be simplified by subscribing
repair_meta to the abort source and aborting it without an explicit kick.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 18:48:45 +03:00
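The subscription pattern replacing the explicit "kick", shown with a simplified look-alike of seastar's abort_source (this is a self-contained model, not seastar's actual class):

```cpp
#include <functional>
#include <iostream>
#include <vector>

// Simplified look-alike of an abort source with subscriptions.
class abort_source {
    bool _aborted = false;
    std::vector<std::function<void()>> _subs; // callbacks must not throw
public:
    void subscribe(std::function<void()> cb) { _subs.push_back(std::move(cb)); }
    bool abort_requested() const { return _aborted; }
    void request_abort() {
        _aborted = true;
        for (auto& cb : _subs) cb();
    }
};

struct repair_info {
    bool aborted = false;
    void abort() noexcept { aborted = true; } // noexcept: runs in the callback
};

int main() {
    abort_source as;
    repair_info ri;
    // Instead of node-ops code searching for and kicking each repair,
    // the repair subscribes itself and is aborted automatically.
    as.subscribe([&ri] { ri.abort(); });
    as.request_abort();
    std::cout << (ri.aborted ? "repair aborted" : "still running") << "\n";
}
```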
Pavel Emelyanov
350bb57291 repair: Keep abort source on node_ops_info
Next patches will need to subscribe to node_ops_meta_data's abort source
inside the repair code, so keep a pointer to it on node_ops_info too. At the
same time, node_ops_info::abort becomes obsolete, because the same
check can be performed via abort_source->abort_requested().

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 18:48:45 +03:00
Pavel Emelyanov
e186ad5b6c repair: Pass node_ops_info arg to do_sync_data_using_repair()
Next patches will need to know more than the ops_uuid. The needed info
is (well -- will be) sitting on node_ops_info, so pass it along

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 18:48:45 +03:00
Pavel Emelyanov
139e9afc89 repair: Mark repair_info::abort() noexcept
Next patch will call it inside abort_source subscription callback which
requires the calling code to be noexcept

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 18:48:45 +03:00
Pavel Emelyanov
a42c6f190c node_ops: Remove _aborted bit
A short cleanup "while at it" -- the node_ops_meta_data doesn't need to
carry a dedicated _aborted boolean -- the abort source that sets it is
available instantly.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 18:48:45 +03:00
Pavel Emelyanov
2b8f0cbd97 node_ops: Simplify construction of node_ops_metadata
It always constructs node_ops_info the same way

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 18:48:45 +03:00
Pavel Emelyanov
a2a762e18d main: Fix message about repair service starting
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-12-15 18:48:45 +03:00
Benny Halevy
aa973e2b9e utils: uuid: make operator bool explicit
Following up on 69fcc053bb

To prevent unintentional implicit conversions
e.g. to a number.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20220216081623.830627-1-bhalevy@scylladb.com>
2022-12-15 18:48:45 +03:00
Benny Halevy
e0777f1112 utils: uuid: add null_uuid
and a respective bool predicate and operator,
and a unit test.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20220215113438.473400-1-bhalevy@scylladb.com>
2022-12-15 18:48:45 +03:00
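Why explicit matters here, in a compact sketch (the uuid type below is a stand-in, not the real utils::UUID):

```cpp
#include <cstdint>
#include <iostream>

struct uuid {
    uint64_t msb = 0, lsb = 0;
    // explicit: `if (id)` and `if (!id)` still work via contextual
    // conversion, but `id + 1` or passing a uuid where a number is
    // expected no longer compiles through an implicit bool->int hop.
    explicit operator bool() const noexcept { return msb != 0 || lsb != 0; }
};

uuid null_uuid() { return {}; } // the all-zero uuid

int main() {
    uuid id = null_uuid();
    if (!id) {
        std::cout << "null uuid\n";
    }
    // int x = id + 1; // error with explicit; silently 0 or 1 without it
}
```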
Benny Halevy
cc6311cbc7 view: row_lock: lock_ck: serialize partition and row locking
The problematic scenario this patch fixes might happen due to
unfortunate serialization of locks/unlocks between lock_pk and lock_ck,
as follows:

    1. lock_pk acquires an exclusive lock on the partition.
    2.a lock_ck attempts to acquire shared lock on the partition
        and any lock on the row. Both cases currently use a fiber
        returning a future<rwlock::holder>.
    2.b since the partition is locked, the lock_partition times out
        returning an exceptional future.  lock_row has no such problem
        and succeeds, returning a future holding a rwlock::holder,
        pointing to the row lock.
    3.a the lock_holder previously returned by lock_pk is destroyed,
        calling `row_locker::unlock`
    3.b row_locker::unlock sees that the partition is not locked
        and erases it, including the row locks it contains.
    4.a when_all_succeeds continuation in lock_ck runs.  Since
        the lock_partition future failed, it destroys both futures.
    4.b the lock_row future is destroyed with the rwlock::holder value.
    4.c ~holder attempts to return the semaphore units to the row rwlock,
        but the latter was already destroyed in 3.b above.

Acquiring the partition lock and row lock in parallel
doesn't help anything, but it complicates error handling
as seen above.

This patch serializes acquiring the row lock in lock_ck
after locking the partition to prevent the above race.

This way, erasing the unlocked partition is never expected
to happen while any of its row locks is held.

Fixes #12168

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #12208

(cherry picked from commit 5007ded2c1)
2022-12-13 14:52:01 +02:00
Anna Mikhlin
0354e13718 release: prepare for 5.0.7 2022-12-07 14:57:09 +02:00
Nadav Har'El
2750d2e94b Merge 'alternator: fix wrong 'where' condition for GSI range key' from Marcin Maliszkiewicz
Contains fixes requested in the issue (and some tiny extras), together with analysis why they don't affect the users (see commit messages).

Fixes [ #11800](https://github.com/scylladb/scylladb/issues/11800)

Closes #11926

* github.com:scylladb/scylladb:
  alternator: add maybe_quote to secondary indexes 'where' condition
  test/alternator: correct xfail reason for test_gsi_backfill_empty_string
  test/alternator: correct indentation in test_lsi_describe
  alternator: fix wrong 'where' condition for GSI range key

(cherry picked from commit ce7c1a6c52)
2022-12-05 20:53:19 +02:00
Benny Halevy
b4383a389b repair_reader: construct _reader_handle before _reader
Currently, the `_reader` member is explicitly
initialized with the result of the call to `make_reader`.
And `make_reader`, as a side effect, assigns a value
to the `_reader_handle` member.

Since C++ initializes class members sequentially,
in the order they are defined, the assignment to `_reader_handle`
in `make_reader()` happens before `_reader_handle` is initialized.

This patch fixes that by changing the definition order,
and consequently, the member initialization order
in the constructor so that `_reader_handle` will be (default-)initialized
before the call to `make_reader()`, avoiding the undefined behavior.

Fixes #10882

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #10883

(cherry picked from commit 9c231ad0ce)
2022-12-05 20:33:58 +02:00
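The underlying C++ rule, in a minimal sketch: members are initialized in declaration order, not in mem-initializer-list order, so a side effect on a later-declared member runs before that member is initialized. The class below is illustrative, not the real repair_reader:

```cpp
#include <iostream>
#include <optional>

struct reader {};

class repair_reader {
    // Declared first => initialized first. Before the fix, _reader was
    // declared first, so make_reader()'s side effect wrote into
    // _reader_handle before _reader_handle's lifetime began: UB.
    std::optional<int> _reader_handle;
    reader _reader;

    reader make_reader() {
        _reader_handle = 42; // side effect on another member
        return reader{};
    }
public:
    repair_reader() : _reader(make_reader()) {}
    bool has_handle() const { return _reader_handle.has_value(); }
};

int main() {
    repair_reader r;
    std::cout << (r.has_handle() ? "handle set safely" : "oops") << "\n";
}
```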
Nadav Har'El
f667c5923a materialized views: fix view writes after base table schema change
When we write to a materialized view, we need to know some information
defined in the base table such as the columns in its schema. We have
a "view_info" object that tracks each view and its base.

This view_info object has a couple of mutable attributes which are
used to lazily-calculate and cache the SELECT statement needed to
read from the base table. If the base-table schema ever changes -
and the code calls set_base_info() at that point - we need to forget
this cached statement. If we don't (as before this patch), the SELECT
will use the wrong schema and writes will no longer work.

This patch also includes a reproducing test that failed before this
patch, and passes afterwords. The test creates a base table with a
view that has a non-trivial SELECT (it has a filter on one of the
base-regular columns), makes a benign modification to the base table
(just a silly addition of a comment), and then tries to write to the
view - and before this patch it fails.

Fixes #10026
Fixes #11542

(cherry picked from commit 2f2f01b045)
2022-12-05 20:09:36 +02:00
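The fix boils down to a cache-invalidation rule: a lazily computed, schema-derived value must be cleared whenever the schema it was derived from changes. A sketch with illustrative names:

```cpp
#include <iostream>
#include <optional>
#include <string>

class view_info {
    std::string _base_schema;
    mutable std::optional<std::string> _cached_select; // lazily built
public:
    explicit view_info(std::string base) : _base_schema(std::move(base)) {}

    const std::string& select_statement() const {
        if (!_cached_select) {
            _cached_select = "SELECT ... /* built against " + _base_schema + " */";
        }
        return *_cached_select;
    }

    // Called whenever the base table's schema changes. Forgetting the
    // reset here is exactly the bug: the stale SELECT keeps using the
    // old schema and view writes stop working.
    void set_base_info(std::string new_base) {
        _base_schema = std::move(new_base);
        _cached_select.reset();
    }
};

int main() {
    view_info v("base@v1");
    std::cout << v.select_statement() << "\n";
    v.set_base_info("base@v2");
    std::cout << v.select_statement() << "\n"; // rebuilt against v2
}
```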
Botond Dénes
e4ba0c56df db/view/view_builder: don't drop partition and range tombstones when resuming
The view builder builds the views from a given base table in
view_builder::batch_size batches of rows. After processing this many
rows, it suspends so the view builder can switch to building views for
other base tables in the name of fairness. When resuming the build step
for a given base table, it reuses the reader used previously (also
serving the role of a snapshot, pinning sstables read from). The
compactor however is created anew. As the reader can be in the middle of
a partition, the view builder injects a partition start into the
compactor to prime it for continuing the partition. This however only
included the partition-key, crucially missing any active tombstones:
partition tombstone or -- since the v2 transition -- active range
tombstone. This can result in base rows covered by either of these being
resurrected, and in the view builder generating view updates for them.
This patch solves this by using the detach-state mechanism of the
compactor which was explicitly developed for situations like this (in
the range scan code) -- resuming a read with the readers kept but the
compactor recreated.
Also included are two test cases reproducing the problem, one with a
range tombstone, the other with a partition tombstone.

Fixes: #11668

Closes #11671

(cherry picked from commit 5621cdd7f9)
2022-12-05 15:01:21 +02:00
Benny Halevy
329d55cc4f configure: add --perf-tests-debuginfo option
Provides separate control over debuginfo for perf tests
since enabling --tests-debuginfo currently affects both,
causing the Jenkins archives of perf-test binaries to
inflate considerably.

Refs https://github.com/scylladb/scylla-pkg/issues/3060

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 48021f3ceb)

Fixes #12191
2022-12-04 17:20:33 +02:00
Petr Gusev
b956293f47 modification_statement: fix LWT insert crash if clustering key is null
PR #9314 fixed a similar issue with regular insert statements
but missed the LWT code path.

It's expected behaviour for
modification_statement::create_clustering_ranges to return an
empty range in this case, since possible_lhs_values, which it
uses, explicitly returns empty_value_set if it evaluates the rhs
to null, and it has a comment about it ("All NULL
comparisons fail; no column values match."). On the other hand,
all components of the primary key are required to be set;
this is checked at the prepare phase, in
modification_statement::process_where_clause. So the only
problem was modification_statement::execute_with_condition
was not expecting an empty clustering_range in case of
a null clustering key.

Fixes: #11954
(cherry picked from commit 0d443dfd16)
2022-12-04 15:00:27 +02:00
Nadav Har'El
6a8c2d3f56 Merge 'cql3: don't ignore other restrictions when a multi column restriction is present during filtering' from Jan Ciołek
When filtering with multi column restriction present all other restrictions were ignored.
So a query like:
`SELECT * FROM tbl WHERE pk = 0 AND (ck1, ck2) < (0, 0) AND regular_col = 0 ALLOW FILTERING;`
would ignore the restriction `regular_col = 0`.

This was caused by a bug in the filtering code:
2779a171fc/cql3/selection/selection.cc (L433-L449)

When multi column restrictions were detected, the code checked if they are satisfied and returned immediately.
This is fixed by returning only when these restrictions are not satisfied. When they are satisfied the other restrictions are checked as well to ensure all of them are satisfied.

This code was introduced back in 2019, when fixing #3574.
Perhaps back then it was impossible to mix multi column and regular columns and this approach was correct.

Fixes: #6200
Fixes: #12014

Closes #12031

* github.com:scylladb/scylladb:
  cql-pytest: add a reproducer for #12014, verify that filtering multi column and regular restrictions works
  boost/restrictions-test: uncomment part of the test that passes now
  cql-pytest: enable test for filtering combined multi column and regular column restrictions
  cql3: don't ignore other restrictions when a multi column restriction is present during filtering

(cherry picked from commit 2d2034ea28)

Closes #12086
2022-11-26 14:24:08 +02:00
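The boolean structure of the bug and the fix, with hypothetical parameter names standing in for the real restriction checks:

```cpp
#include <iostream>

// Hypothetical stand-ins for the real per-row restriction checks.
bool row_matches(bool has_multi_column, bool multi_column_ok, bool others_ok) {
    if (has_multi_column) {
        // Buggy version: `return multi_column_ok;`, i.e. a passing
        // multi-column check skipped all remaining restrictions.
        if (!multi_column_ok) {
            return false; // fixed: early-return only on failure
        }
    }
    return others_ok; // remaining restrictions must also hold
}

int main() {
    // (ck1, ck2) < (0, 0) passes but regular_col = 0 fails:
    // the row must be filtered out, not kept.
    std::cout << (row_matches(true, true, false) ? "kept" : "filtered") << "\n";
}
```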
Piotr Grabowski
27a35c7f98 Update tools/jmx submodule (jackson dependency update)
* tools/jmx 53f7f55...fe351e8 (1):
  > Update jackson dependency

(cherry picked from commit 41b098f54e)

Refs #11929

Closes #11931
2022-11-20 20:10:14 +02:00
Pavel Emelyanov
d83134a245 Merge '[branch-5.0] multishard_mutation_query: don't unpop partition header of spent partition' from Botond Dénes
When stopping the read, the multishard reader will dismantle the
compaction state, pushing back (unpopping) the currently processed
partition's header to its originating reader. This ensures that if the
reader stops in the middle of a partition, on the next page the
partition-header is re-emitted as the compactor (and everything
downstream from it) expects.
It can happen however that there is nothing more for the current
partition in the reader and the next fragment is another partition.
Since we only push back the partition header (without a partition-end)
this can result in two partitions being emitted without being separated
by a partition end.
We could just add the missing partition-end when needed, but that is
pointless: if the partition has no more data, just drop the header; we
won't need it on the next page.

The missing partition-end can generate an "IDL frame truncated" message
as it ends up causing the query result writer to create a corrupt
partition entry.

Fixes: https://github.com/scylladb/scylladb/issues/9482

Closes #11912

* github.com:scylladb/scylladb:
  test/cql-pytest: add regression test for "IDL frame truncated" error
  mutation_compactor: detach_state(): make it no-op if partition was exhausted
2022-11-16 11:50:50 +03:00
Anna Mikhlin
b844d14829 release: prepare for 5.0.6 2022-11-13 16:39:30 +02:00
Eliran Sinvani
184df0393e cql: Fix crash upon use of the word empty for service level name
Wrong access to an uninitialized token instead of the actual
generated string caused the parser to crash. This wasn't
detected by the ANTLR3 compiler because all the temporary
variables defined in the ANTLR3 statements are global in the
generated code. This essentially caused a null dereference.

Tests: 1. The fixed issue scenario from github.
       2. Unit tests in release mode.

Fixes #11774

Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>
Message-Id: <20190612133151.20609-1-eliransin@scylladb.com>

Closes #11777

(cherry picked from commit ab7429b77d)
2022-11-10 20:43:21 +02:00
Nadav Har'El
1b550dd301 cql3: fix cql3::util::maybe_quote() for keywords
cql3::util::maybe_quote() is a utility function formatting an identifier
name (table name, column name, etc.) that needs to be embedded in a CQL
statement - and might require quoting if it contains non-alphanumeric
characters, uppercase characters, or a CQL keyword.

maybe_quote() made an effort to only quote the identifier name if necessary,
e.g., a lowercase name usually does not need quoting. But lowercase names
that are CQL keywords - e.g., to or where - cannot be used as identifiers
without quoting. This can cause problems for code that wants to generate
CQL statements, such as the materialized-view problem in issue #9450 - where
a user had a column called "to" and wanted to create a materialized view
for it.

So in this patch we fix maybe_quote() to recognize invalid identifiers by
using the CQL parser, and quote them. This will quote reserved keywords,
but not so-called unreserved keywords, which *are* allowed as identifiers
and don't need quoting. This addition slows down maybe_quote(), but
maybe_quote() is anyway only used in heavy operations which need to
generate CQL.

This patch also adds two tests that reproduce the bug and verify its
fix:

1. Add to the low-level maybe_quote() test (a C++ unit test) also tests
   that maybe_quote() quotes reserved keywords like "to", but doesn't
   quote unreserved keywords like "int".

2. Add a test reproducing issue #9450 - creating a materialized view
   whose key column is a keyword. This new test passes on Cassandra,
   failed on Scylla before this patch, and passes after this patch.

It is worth noting that maybe_quote() now has a "forward compatibility"
problem: If we save CQL statements generated by maybe_quote(), and a
future version introduces a new reserved keyword, the parser of the
future version may not be able to parse the saved CQL statement that
was generated with the old maybe_quote() and didn't quote what is now
a keyword. This problem can be solved in two ways:

1. Try hard not to introduce new reserved keywords. Instead, introduce
   unreserved keywords. We've been doing this even before recognizing
   this maybe_quote() future-compatibility problem.

2. In the next patch we will introduce quote() - which unconditionally
   quotes identifier names, even if lowercase. These quoted names will
   be uglier for lowercase names - but will be safe from future
   introduction of new keywords. So we can consider switching some or
   all uses of maybe_quote() to quote().

Fixes #9450

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20220118161217.231811-1-nyh@scylladb.com>
(cherry picked from commit 5d2f694a90)
2022-11-07 17:01:32 +02:00
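A simplified sketch of the quoting rule (the keyword set below is a tiny illustrative subset; the real function consults the CQL parser rather than a hard-coded list):

```cpp
#include <cctype>
#include <iostream>
#include <string>
#include <unordered_set>

// Tiny illustrative subset; Scylla derives this from the CQL grammar.
const std::unordered_set<std::string> reserved = {"to", "where", "select", "from"};

bool plain_lowercase_identifier(const std::string& s) {
    if (s.empty() || !std::islower((unsigned char)s[0])) return false;
    for (unsigned char c : s) {
        if (!std::islower(c) && !std::isdigit(c) && c != '_') return false;
    }
    return true;
}

std::string quote(const std::string& s) {
    std::string out = "\"";
    for (char c : s) {
        if (c == '"') out += '"'; // double embedded quotes
        out += c;
    }
    return out + "\"";
}

std::string maybe_quote(const std::string& s) {
    if (plain_lowercase_identifier(s) && !reserved.count(s)) {
        return s; // safe to use unquoted
    }
    return quote(s);
}

int main() {
    std::cout << maybe_quote("col1") << "\n";  // col1
    std::cout << maybe_quote("to") << "\n";    // "to"    (reserved keyword)
    std::cout << maybe_quote("MyCol") << "\n"; // "MyCol" (uppercase)
}
```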
Alexander Turetskiy
01ce53d7fb Alternator: Projection field added to return from DescribeTable which describes GSIs and LSIs.
The return from DescribeTable which describes GSIs and LSIs is missing
the Projection field. We do not yet support all the Projection settings
(see #5036), but the default which we support is ALL, and DescribeTable
should return that in its description.

Fixes #11470

Closes #11693

(cherry picked from commit 636e14cc77)
2022-11-07 17:01:32 +02:00
Jadw1
e9c7f89b32 CQL3: fromJson accepts string as bool
The problem was an incompatibility with Cassandra, which accepts a bool
given as a string in the `fromJson()` UDF. The remaining difference between
Cassandra and Scylla is that Scylla accepts whitespace around the word in
the string, while Cassandra doesn't. Both are case-insensitive.

Fixes: #7915
(cherry picked from commit 1902dbc9ff)
2022-11-07 17:01:32 +02:00
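What "accepts bool as a string, tolerating surrounding whitespace, case-insensitively" looks like in code, as a stand-alone sketch:

```cpp
#include <algorithm>
#include <cctype>
#include <iostream>
#include <optional>
#include <string>
#include <string_view>

std::optional<bool> parse_bool_string(std::string_view s) {
    // Scylla (unlike Cassandra) tolerates whitespace around the word.
    while (!s.empty() && std::isspace((unsigned char)s.front())) s.remove_prefix(1);
    while (!s.empty() && std::isspace((unsigned char)s.back())) s.remove_suffix(1);
    // Both accept any letter case.
    std::string lower(s);
    std::transform(lower.begin(), lower.end(), lower.begin(),
                   [](unsigned char c) { return std::tolower(c); });
    if (lower == "true") return true;
    if (lower == "false") return false;
    return std::nullopt; // not a boolean
}

int main() {
    std::cout << parse_bool_string("  TrUe ").value() << "\n"; // 1
    std::cout << parse_bool_string("false").value() << "\n";   // 0
}
```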
Takuya ASADA
93f468c12c locator::ec2_snitch: Retry HTTP request to EC2 instance metadata service
The EC2 instance metadata service can be busy; let's retry the connection
with an interval, just like we do in scylla-machine-image.

Fixes #10250

Signed-off-by: Takuya ASADA <syuu@scylladb.com>

Closes #11688

(cherry picked from commit 6b246dc119)
(cherry picked from commit e2809674d2)
2022-11-07 17:01:32 +02:00
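The retry shape described above, as a generic sketch (the interval, attempt count, and fetch function are illustrative; the real snitch issues HTTP requests to the instance metadata service):

```cpp
#include <chrono>
#include <iostream>
#include <stdexcept>
#include <string>
#include <thread>

// Stand-in for the HTTP GET against the metadata service.
std::string fetch_metadata(int attempt) {
    if (attempt < 3) throw std::runtime_error("service busy");
    return "us-east-1a";
}

std::string fetch_with_retries(int max_attempts, std::chrono::milliseconds interval) {
    for (int attempt = 1;; ++attempt) {
        try {
            return fetch_metadata(attempt);
        } catch (const std::exception& e) {
            if (attempt == max_attempts) throw; // give up, propagate
            std::cerr << "attempt " << attempt << " failed (" << e.what()
                      << "), retrying\n";
            std::this_thread::sleep_for(interval);
        }
    }
}

int main() {
    std::cout << fetch_with_retries(5, std::chrono::milliseconds(100)) << "\n";
}
```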
Botond Dénes
e54ae9efd9 test/cql-pytest: add regression test for "IDL frame truncated" error
(cherry picked from commit 11af489e84)
2022-11-07 13:43:53 +02:00
Botond Dénes
ef40e59c0e mutation_compactor: detach_state(): make it no-op if partition was exhausted
detach_state() allows the user to resume a compaction process later,
without having to keep the compactor object alive. This happens by
generating and returning the mutation fragments the user has to re-feed
to a newly constructed compactor to bring it into the exact same state
the current compactor was at the point of stopping the compaction.
This state includes the partition-header (partition-start and static-row
if any) and the currently active range tombstone.
Detaching the state is pointless, however, when the compaction was stopped
such that the currently compacted partition was completely exhausted.
Allowing the state to be detached in this case seems benign but it
caused a subtle bug in the main user of this feature: the partition
range scan algorithm, where the fragments included in the detached state
were pushed back into the reader which produced them. If the partition
happened to be exhausted -- meaning the next fragment in the reader was
a partition-start or EOS -- this resulted in the partition being
re-emitted later without a partition-end, resulting in corrupt
query-result being generated, in turn resulting in an obscure "IDL frame
truncated" error.

This patch solves this seemingly benign but sinister bug by making the
return value of `detach_state()` an std::optional and returning a
disengaged optional when the partition was exhausted.

(cherry picked from commit 70b4158ce0)
2022-11-07 13:42:43 +02:00
Botond Dénes
8c56b0b268 Merge 'Alternator, MV: fix bug in some view updates which set the view key to its existing value' from Nadav Har'El
As described in issue #11801, we saw in Alternator, when a GSI has both partition and sort keys that are non-key attributes in the base table, cases where updating the GSI-sort-key attribute to the same value it already had caused the entire GSI row to be deleted.

In this series fix this bug (it was a bug in our materialized views implementation) and add a reproducing test (plus a few more tests for similar situations which worked before the patch, and continue to work after it).

Fixes #11801

Closes #11808

* github.com:scylladb/scylladb:
  test/alternator: add test for issue 11801
  MV: fix handling of view update which reassign the same key value
  materialized views: inline used-once and confusing function, replace_entry()

(cherry picked from commit e981bd4f21)
2022-11-01 13:25:22 +02:00
Kamil Braun
fc78d88783 service: raft: raft_group0: don't call _abort_source.request_abort()
`raft_group0` does not own the source and is not responsible for calling
`request_abort`. The source comes from top-level `stop_signal` (see
main.cc) and that's where it's aborted.

Fixes #10668.

Closes #10678

(cherry picked from commit ef7643d504)
2022-10-16 11:42:22 +03:00
Pavel Emelyanov
31a20c4c54 compaction_manager: Swallow ENOSPCs in ::stop()
When being stopped, the compaction manager may step on ENOSPC. This is not a
reason to fail the stopping process with an abort; it is better to log a
warning and proceed as if nothing happened.

refs: #11245

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-10-13 15:56:53 +03:00
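The pattern in isolation: on the stop path, treat ENOSPC as a warning instead of letting it escape, while still propagating everything else. A sketch with illustrative function names:

```cpp
#include <iostream>
#include <system_error>

// Stand-in for a compaction-manager stop step that hits a full disk.
void flush_compaction_state() {
    throw std::system_error(std::make_error_code(std::errc::no_space_on_device));
}

void stop() {
    try {
        flush_compaction_state();
    } catch (const std::system_error& e) {
        if (e.code() == std::errc::no_space_on_device) {
            // Not a reason to abort the stop path; warn and carry on.
            std::cerr << "warning: ignoring ENOSPC while stopping: "
                      << e.what() << "\n";
            return;
        }
        throw; // anything else is still fatal for the caller to handle
    }
}

int main() {
    stop();
    std::cout << "stopped cleanly despite ENOSPC\n";
}
```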
Pavel Emelyanov
7e42bcfd61 exceptions: Mark storage_io_error::code() with noexcept
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-10-13 15:56:03 +03:00
Pavel Emelyanov
2107ffe2d2 compaction_manager: Shuffle really_do_stop()
Make it a future-returning method and set up the _stop_future in its
only caller. This makes the next patch much simpler.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-10-13 15:56:02 +03:00
Beni Peled
5a97a1060e release: prepare for 5.0.5 2022-10-09 08:44:14 +03:00
Nadav Har'El
2b0487c900 cql: validate bloom_filter_fp_chance up-front
Scylla's Bloom filter implementation has a minimal false-positive rate
that it can support (6.71e-5). When bloom_filter_fp_chance is set any
lower than that, the compute_bloom_spec() function, which writes the Bloom
filter, throws an exception. However, this is too late - it only happens
while flushing the memtable to disk, and a failure at that point causes
Scylla to crash.

Instead, we should refuse the table creation with the unsupported
bloom_filter_fp_chance. This is also what Cassandra did six years ago -
see CASSANDRA-11920.

This patch also includes a regression test, which crashes Scylla before
this patch but passes after the patch (and also passes on Cassandra).

Fixes #11524.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #11576

(cherry picked from commit 4c93a694b7)
2022-10-04 16:22:50 +03:00
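Validating up-front rather than at flush time, as a sketch (the 6.71e-5 lower bound is the one quoted above; the exception type and exact bounds are illustrative):

```cpp
#include <iostream>
#include <stdexcept>

// Minimal false-positive rate the Bloom filter can support,
// per the commit message above.
constexpr double min_supported_fp_chance = 6.71e-5;

void validate_bloom_filter_fp_chance(double fp) {
    // Reject at CREATE/ALTER TABLE time, so a bad value can never reach
    // memtable flush, where a throw would crash the server.
    if (fp < min_supported_fp_chance || fp > 1.0) {
        throw std::invalid_argument(
            "bloom_filter_fp_chance must be in [6.71e-5, 1.0]");
    }
}

int main() {
    try {
        validate_bloom_filter_fp_chance(1e-6);
    } catch (const std::invalid_argument& e) {
        std::cout << "rejected up-front: " << e.what() << "\n";
    }
}
```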
Pavel Emelyanov
d3b3c53d9f system_keyspace/config: Swallow string->value cast exception
When updating an updateable value via CQL the new value comes as a
string that's then boost::lexical_cast-ed to the desired value. If the
cast throws the respective exception is printed in logs which is very
likely uncalled for.

fixes: #10394
tests: manual

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20220503142942.8145-1-xemul@scylladb.com>
(cherry picked from commit 063d26bc9e)
2022-10-04 16:19:46 +03:00
Nadav Har'El
50c2c1b1d4 alternator: return ProvisionedThroughput in DescribeTable
DescribeTable is currently hard-coded to return PAY_PER_REQUEST billing
mode. Nevertheless, even in PAY_PER_REQUEST mode, the DescribeTable
operation must return a ProvisionedThroughput structure, listing both
ReadCapacityUnits and WriteCapacityUnits as 0. This requirement is not
stated in some DynamoDB documentation but is explicitly mentioned in
https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_ProvisionedThroughput.html
Also, empirically, DynamoDB returns ProvisionedThroughput with zeros
even in PAY_PER_REQUEST mode. We even had an xfailing test to confirm this.

The ProvisionedThroughput structure being missing was a problem for
applications like DynamoDB connectors for Spark, if they implicitly
assume that ProvisionedThroughput is returned by DescribeTable, and
fail (as described in issue #11222) if it's outright missing.

So this patch adds the missing ProvisionedThroughput structure, and
the xfailing test starts to pass.

Note that this patch doesn't change the fact that attempting to set
a table to PROVISIONED billing mode is ignored: DescribeTable continues
to always return PAY_PER_REQUEST as the billing mode and zero as the
provisioned capacities.

Fixes #11222

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #11298

(cherry picked from commit 941c719a23)
2022-10-03 14:28:16 +03:00
Tomasz Grabiec
aa647a637a test: lib: random_mutation_generator: Don't generate mutations with marker uncompacted with shadowable tombstone
The generator was first setting the marker then applied tombstones.

The marker was set like this:

  row.marker() = random_row_marker();

Later, when shadowable tombstones were applied, they were compacted
with the marker as expected.

However, the key for the row was chosen randomly in each iteration and
there are multiple keys set, so there was a possibility of a key clash
with an earlier row. This could override the marker without applying
any tombstones, which is conditional on random choice.

This could generate rows with markers uncompacted with shadowable tombstones.

This broke row_cache_test::test_concurrent_reads_and_eviction on
comparison between expected and read mutations. The latter was
compacted because it went through an extra merge path, which compacts
the row.

Fix by making sure there are no key clashes.

Closes #11663

(cherry picked from commit 5268f0f837)
2022-10-02 16:45:07 +03:00
Michael Livshin
2c0040fcb3 allow pre-scrub snapshots of materialized views and secondary indices
Previously, any attempt to take a materialized view or secondary index
snapshot was considered a mistake and caused the snapshot operation to
abort, with a suggestion to snapshot the base table instead.

But an automatic pre-scrub snapshot of a view cannot be attributed to
user error, so the operation should not be aborted in that case.

(It is an open question whether the more correct thing to do during
pre-scrub snapshot would be to silently ignore views.  Or perhaps they
should be ignored in all cases except when the user explicitly asks to
snapshot them, by name)

Closes #10760.

Signed-off-by: Michael Livshin <michael.livshin@scylladb.com>
(cherry picked from commit aab4cd850c)

Fixes #10760.
2022-10-02 14:04:11 +03:00
Nadav Har'El
54564adb7c alternator: forbid duplicate index (LSI and GSI) names
Adding an LSI and GSI with the same name to the same Alternator table
should be forbidden - because if both exist, only one of them (the GSI)
would actually be usable. DynamoDB also forbids such duplicate name.

So in this patch we add a test for this issue, and fix it.

Since the patch involves a few more uses of the IndexName string,
we also clean up its handling a bit, to use std::string_view instead
of the old-style std::string&.

Fixes #10789

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
(cherry picked from commit 8866c326de)
2022-10-02 13:00:03 +03:00
Tomasz Grabiec
839876e8f2 db: range_tombstone_list: Avoid quadratic behavior when applying
Range tombstones are kept in memory (cache/memtable) in
range_tombstone_list. It keeps them deoverlapped, so applying a range
tombstone which covers many range tombstones will erase existing range
tombstones from the list. This operation needs to be exception-safe,
so range_tombstone_list maintains an undo log. This undo log will
receive a record for each range tombstone which is removed. For
exception safety reasons, before pushing an undo log entry, we reserve
space in the log by calling std::vector::reserve(size() + 1). This is
O(N) where N is the number of undo log entries. Therefore, the whole
application is O(N^2).

This can cause reactor stalls and availability issues when replicas
apply such deletions.

This patch avoids the problem by reserving an exponentially increasing
amount of space. Also, to avoid large allocations, it switches the
container to chunked_vector.

Fixes #11211

Closes #11215

(cherry picked from commit 7f80602b01)
2022-09-30 17:55:23 +03:00
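The fix pattern in isolation: reserve(size() + 1) before every push pins capacity to exact sizes, so each push reallocates and copies, O(N^2) overall; growing geometrically restores amortized O(1) per push. (The chunked_vector switch, which avoids large contiguous allocations, is the other half; the sketch shows only the growth part.)

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Reserve space for one more element without defeating amortized growth.
// Calling v.reserve(v.size() + 1) before every push keeps the capacity
// exact, so each subsequent push reallocates and copies: O(N^2) overall.
template <typename T>
void reserve_one_more(std::vector<T>& v) {
    if (v.size() == v.capacity()) {
        v.reserve(std::max<std::size_t>(8, v.capacity() * 2)); // geometric growth
    }
}

int main() {
    std::vector<int> undo_log;
    for (int i = 0; i < 1000; ++i) {
        reserve_one_more(undo_log); // may throw; the log is still untouched
        undo_log.push_back(i);      // guaranteed not to reallocate here
    }
    std::cout << "capacity: " << undo_log.capacity() << "\n";
}
```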
Botond Dénes
36002e2b7c sstables: crawling mx-reader: make on_out_of_clustering_range() no-op
Said method currently emits a partition-end. This method is only called
when the last fragment in the stream is a range tombstone change with a
position after all clustered rows. The problem is that
consume_partition_end() is also called unconditionally, resulting in two
partition-end fragments being emitted. The fix is simple: make this
method a no-op, there is nothing to do there.

Also add two tests: one targeted to this bug and another one testing the
crawling reader with random mutations generated for random schema.

Fixes: #11421

Closes #11422

(cherry picked from commit be9d1c4df4)
2022-09-30 17:55:14 +03:00
Botond Dénes
91a8f9e09b test/lib/random_schema: add a simpler overload for fixed partition count
Some tests want to generate a fixed number of random partitions; make
their life easier.

(cherry picked from commit 98f3d516a2)

Ref #11421 (prerequisite)
2022-09-30 17:54:55 +03:00
Michael Livshin
bc29f350dd batchlog_manager: warn when a batch fails to replay
Only for reasons other than "no such KS", i.e. when the failure is
presumed transient and the batch in question is not deleted from
batchlog and will be retried in the future.

(Would info be more appropriate here than warning?)

Signed-off-by: Michael Livshin <michael.livshin@scylladb.com>

Closes #10556

Fixes #10636

(cherry picked from commit 00ed4ac74c)
2022-09-29 12:14:56 +03:00
Asias He
4fe571f470 streaming: Allow drop table during streaming
Currently, if a table is dropped during streaming, the streaming would
fail with a no_such_column_family error.

Since the table is dropped anyway, it makes more sense to ignore the
streaming result of the dropped table, whether it is successful or
failed.

This allows users to drop tables during node operations, e.g., bootstrap
or decommission a node.

This is especially useful for the cloud users where it is hard to
coordinate between a node operation by admin and user cql change.

This patch also fixes a possible use-after-free issue by not passing
the table reference object around.

Fixes #10395

Closes #10396

(cherry picked from commit 953af38281)
2022-09-21 10:26:22 +03:00
Michał Radwański
ebf38eaead flat_mutation_reader: allow destructing readers which are not closed and didn't initiate any IO.
In functions such as upgrade_to_v2 (excerpt below), if the constructor
of transforming_reader throws, r needs to be destroyed even though it
hasn't been closed. However, if a reader didn't start any operations, it
is safe to destruct such a reader. This issue can potentially manifest
itself in many more readers and might be hard to track down. This commit
adds a bool indicating whether a close is anticipated, thus avoiding
errors in the destructor.

Code excerpt:
flat_mutation_reader_v2 upgrade_to_v2(flat_mutation_reader r) {
    class transforming_reader : public flat_mutation_reader_v2::impl {
        // ...
    };
    return make_flat_mutation_reader_v2<transforming_reader>(std::move(r));
}

Fixes #9065.
Fixes #11491

(cherry picked from commit 9ada63a9cb)
2022-09-21 10:25:18 +03:00
Beni Peled
1c82766f33 release: prepare for 5.0.4 2022-09-21 09:16:13 +03:00
Piotr Sarna
e1f78c33b4 Merge 'Fix mutation commutativity with shadowable tombstone'
from Tomasz Grabiec

This series fixes lack of mutation associativity which manifests as
sporadic failures in
row_cache_test.cc::test_concurrent_reads_and_eviction due to differences
in mutations applied and read.

No known production impact.

Refs https://github.com/scylladb/scylladb/issues/11307

Closes #11312

* github.com:scylladb/scylladb:
  test: mutation_test: Add explicit test for mutation commutativity
  test: random_mutation_generator: Workaround for non-associativity of mutations with shadowable tombstones
  db: mutation_partition: Drop unnecessary maybe_shadow()
  db: mutation_partition: Maintain shadowable tombstone invariant when applying a hard tombstone
  mutation_partition: row: make row marker shadowing symmetric

(cherry picked from commit 484004e766)
2022-09-20 23:21:06 +02:00
Tomasz Grabiec
0634b5f734 test: row_cache: Use more narrow key range to stress overlapping reads more
This makes catching issues related to concurrent access of same or
adjacent entries more likely. For example, catches #11239.

Closes #11260

(cherry picked from commit 8ee5b69f80)
2022-09-20 23:20:43 +02:00
Avi Kivity
6f020b26e1 Merge 'Backport 3 fixes for the evictable reader v2' from Botond Dénes
This pull request backports 3 important fixes from adc08d0ab9. Said 3 commits fixed important bugs in the v2 variant of the evictable reader, but were not backported because they were part of a large series doing the v2 conversion in general. This means that 5.0 was left with a buggy evictable reader v2, which is used by repair. So far in the wild we've seen one bug manifest itself: the evictable reader getting stuck, spinning in a tight loop in `evictable_reader_v2::do_fill_buffer()`, in turn making repair stuck too.

Fixes: #11223

Closes #11540

* github.com:scylladb/scylladb:
  test/boost/mutation_reader_test: add v2 specific evictable reader tests
  evictable_reader_v2: terminate active range tombstones on reader recreation
  evictable_reader_v2: restore handling of non-monotonically increasing positions
  evictable_reader_v2: simplify handling of reader recreation
2022-09-20 13:42:10 +03:00
Pavel Emelyanov
7f8dcc5657 messaging_service: Fix gossiper verb group
When configuring tcp-nodelay unconditionally, the messaging service thinks
the gossiper uses group index 1, though that changed some time ago and now
those verbs belong to group 0.

fixes: #11465

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
(cherry picked from commit 2c74062962)
2022-09-19 10:31:58 +03:00
Botond Dénes
20451760fe tools/scylla-sstable: fix description template
Quote the '{' and '}' used in the CQL example, so format doesn't try to
interpret them.

Fixes: #11571

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20220221140652.173015-1-bdenes@scylladb.com>
(cherry picked from commit 10880fb0a7)
2022-09-19 06:54:25 +03:00
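The quoting rule in question: in fmt-style format strings, literal braces are written as {{ and }}. A tiny sketch using C++20's std::format, which follows the same rule as the fmt library:

```cpp
#include <format>
#include <iostream>

int main() {
    // "{{" and "}}" are literal braces; "{}" is a replacement field.
    // Without the doubling, format() would try to interpret the CQL
    // collection braces as fields and throw.
    std::cout << std::format("UPDATE t SET m = {{{}: {}}} WHERE pk = 0;\n",
                             1, "'v'"); // prints: ... m = {1: 'v'} ...
}
```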
Michał Chojnowski
51b031d04e sstables: add a flag for disabling long-term index caching
Long-term index caching in the global cache, as introduced in 4.6, is a major
pessimization for workloads where accesses to the index are (spacially) sparse.
We want to have a way to disable it for the affected workloads.

There is already infrastructure in place for disabling it for BYPASS CACHE
queries. One way of solving the issue is hijacking that infrastructure.

This patch adds a global flag (and a corresponding CLI option) which controls
index caching. Setting the flag to `false` causes all index reads to behave
like they would in BYPASS CACHE queries.

Consequences of this choice:

- The per-SSTable partition_index_cache is unused. Every index_reader has
  its own, and they die together. Independent reads can no longer reuse the
  work of other reads which hit the same index pages. This is not crucial,
  since partition accesses have no (natural) spatial locality. Note that
  the original reason for partition_index_cache -- the ability to share
  reads for the lower and upper bound of the query -- is unaffected.
- The per-SSTable cached_file is unused. Every index_reader has its own
  (uncached) input stream from the index file, and every
  bsearch_clustered_cursor has its own cached_file, which dies together with
  the cursor. Note that the cursor still can perform its binary search with
  caching. However, it won't be able to reuse the file pages read by
  index_reader. In particular, if the promoted index is small, and fits inside
  the same file page as its index_entry, that page will be re-read.
  It can also happen that index_reader will read the same index file page
  multiple times. When the summary is so dense that multiple index pages fit in
  one index file page, advancing the upper bound, which reads the next index
  page, will read the same index file page. Since summary:disk ratio is 1:2000,
  this is expected to happen for partitions with size greater than 2000
  partition keys.

Fixes #11202

(cherry picked from commit cdb3e71045)
2022-09-18 13:29:35 +03:00
Botond Dénes
82d1446ca9 test/boost/mutation_reader_test: add v2 specific evictable reader tests
One is a reincarnation of the recently removed
test_multishard_combining_reader_non_strictly_monotonic_positions. The
latter was actually targeting the evictable reader but through the
multishard reader, probably for historic reasons (evictable reader was
part of the multishard reader family).
The other one checks that active range tombstones changes are properly
terminated when the partition ends abruptly after recreating the reader.

(cherry picked from commit 014a23bf2a)
2022-09-15 13:51:13 +03:00
Botond Dénes
e0acb0766d evictable_reader_v2: terminate active range tombstones on reader recreation
Reader recreation messes with the continuity of the mutation fragment
stream because it breaks snapshot isolation. We cannot guarantee that a
range tombstone, or even the partition started before, will continue
afterwards. So we have to make sure to wrap up all loose ends when
recreating the reader. We already close uncontinued partitions. This
commit also takes care of closing any range tombstone started by
unconditionally emitting a null range tombstone. This is legal to do,
even if no range tombstone was in effect.

(cherry picked from commit 9e48237b86)
2022-09-14 19:15:50 +03:00
Botond Dénes
4f26d489a0 evictable_reader_v2: restore handling of non-monotonically increasing positions
We thought that, unlike v1, v2 would not need this. But it does.
Handled similarly to how v1 did it: we ensure each buffer represents
forward progress, when the last fragment in the buffer is a range
tombstone change:
* Ensure the content of the buffer represents progress w.r.t.
  _next_position_in_partition, thus ensuring the next time we recreate
  the reader it will continue from a later position.
* Continue reading until the next (peeked) fragment has a strictly
  larger position.

The code is just much nicer because it uses coroutines.

(cherry picked from commit 6db08ddeb2)
2022-09-14 19:15:49 +03:00
Botond Dénes
43cbc5c836 evictable_reader_v2: simplify handling of reader recreation
The evictable reader has a handful of flags dictating what to do after
the reader is recreated: what to validate, what to drop, etc. We
actually need a single flag telling us if the reader was recreated or
not, all other things can be derived from existing fields.
This patch does exactly that. Furthermore it folds do_fill_buffer() into
fill_buffer() and replaces the awkward-to-use `should_drop_fragment()`
with `examine_first_fragments()`, which does a much better job of
encapsulating all validation and fragment dropping logic.
This code reorganization also fixes two bugs introduced by the v2
conversion:
* The loop in `do_fill_buffer()` could become infinite in certain
  circumstances due to a difference between the v1 and v2 versions of
  `is_end_of_stream()`.
* The position of the first non-dropped fragment was not validated
  (this was integrated into the range tombstone trimming which was
  thrown out by the conversion).

(cherry picked from commit 498d03836b)
2022-09-14 19:15:49 +03:00
Nadav Har'El
f0c521efdf alternator: clean error shutdown in case of TLS misconfigration
The way our boot-time service "controllers" are written, if a
controller's start_server() finds an error and throws, it cannot
the caller (main.cc) to call stop_server(), and must clean up
resources already created (e.g., sharded services) before returning
or risk crashes on assertion failures.

This patch fixes such a mistake in Alternator's initialization.
As noted in issue #10025, if the Alternator TLS configuration is
broken - especially the certificate or key files are missing -
Scylla would crash on an assertion failure, instead of reporting
the error as expected. Before this patch such a misconfiguration
will result in the unintelligible:

<alternator::server>::~sharded() [Service = alternator::server]:
Assertion `_instances.empty()' failed. Aborting on shard 0.

After this patch we get the right error message:

ERROR 2022-03-21 15:25:07,553 [shard 0] init - Startup failed:
std::_Nested_exception<std::runtime_error> (Failed to set up Alternator
TLS credentials): std::_Nested_exception<std::runtime_error> (Could not
read certificate file conf/scylla.crt): std::filesystem::__cxx11::
filesystem_error (error system:2, filesystem error: open failed:
No such file or directory [conf/scylla.crt])

Arguably this error message is a bit ugly, so I opened
https://github.com/scylladb/seastar/issues/1029, but at least it says
exactly what the error is.

Fixes #10025
Fixes #11520

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20220321133323.3150939-1-nyh@scylladb.com>
(cherry picked from commit 7f89c8b3e3)
2022-09-11 14:43:18 +03:00
Beni Peled
b9a61c8e9a release: prepare for 5.0.3 2022-09-07 11:16:52 +03:00
Karol Baryła
32aa1e5287 transport/server.cc: Return correct size of decompressed lz4 buffer
An incorrect size is returned from the function, which could lead to
crashes or undefined behavior. Fix by erroring out in these cases.

Fixes #11476

(cherry picked from commit 1c2eef384d)
2022-09-07 10:58:42 +03:00
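The defensive pattern: LZ4_decompress_safe() returns a negative value on malformed input, and otherwise the actual decompressed byte count, which the caller must propagate instead of assuming a size. A sketch against the public LZ4 C API (assumes liblz4 is available):

```cpp
#include <lz4.h>

#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

std::string decompress(const std::vector<char>& src, int max_decompressed_size) {
    std::vector<char> dst(max_decompressed_size);
    int n = LZ4_decompress_safe(src.data(), dst.data(),
                                static_cast<int>(src.size()),
                                static_cast<int>(dst.size()));
    if (n < 0) {
        // Malformed frame: error out instead of returning a bogus size,
        // which downstream code would treat as valid data.
        throw std::runtime_error("lz4 decompression failed");
    }
    return std::string(dst.data(), static_cast<std::size_t>(n)); // actual size
}

int main() {
    const char msg[] = "hello hello hello hello";
    std::vector<char> compressed(LZ4_compressBound(sizeof msg));
    int c = LZ4_compress_default(msg, compressed.data(), sizeof msg,
                                 static_cast<int>(compressed.size()));
    compressed.resize(c);
    std::cout << decompress(compressed, sizeof msg) << "\n";
}
```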
Nadav Har'El
da6a126d79 cross-tree: fix header file self-sufficiency
Scylla's coding standard requires that each header is self-sufficient,
i.e., it includes whatever other headers it needs - so it can be included
without having to include any other header before it.

We have a test for this, "ninja dev-headers", but it isn't run very
frequently, and it turns out our code deviated from this requirement
in a few places. This patch fixes those places, and after it
"ninja dev-headers" succeeds again.

This is needed because our CI runs "ninja dev-headers".

Fixes #10995

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #11457
2022-09-06 15:45:34 +03:00
Avi Kivity
d07e902983 Merge 'database: evict all inactive reads for table when detaching table' from Botond Dénes
Currently, when detaching the table from the database, we force-evict all queriers for said table. This series broadens the scope of this force-evict to include all inactive reads registered at the semaphore. This ensures that any regular inactive read "forgotten" in the semaphore for any reason will not end up accessing a dangling table reference when destroyed later.

Fixes: https://github.com/scylladb/scylladb/issues/11264

Closes #11273

* github.com:scylladb/scylladb:
  querier: querier_cache: remove now unused evict_all_for_table()
  database: detach_column_family(): use reader_concurrency_semaphore::evict_inactive_reads_for_table()
  reader_concurrency_semaphore: add evict_inactive_reads_for_table()

(cherry picked from commit afa7960926)
2022-09-02 11:39:43 +03:00
Piotr Sarna
3c0fc42f84 cql3: fix misleading error message for service level timeouts
The error message incorrectly stated that the timeout value cannot
be longer than 24h, but it can - the actual restriction is that the
value cannot be expressed in units like days or months, which was done
in order to significantly simplify the parsing routines (and because
timeouts counted in days are not expected to be common).

Fixes #10286

Closes #10294

(cherry picked from commit 85e95a8cc3)
2022-09-01 20:34:12 +03:00
Piotr Grabowski
964ccf9192 type_json: support integers in scientific format
Add support for specifing integers in scientific format (for example
1.234e8) in INSERT JSON statement:

INSERT INTO table JSON '{"int_column": 1e7}';

Inserting a floating-point number ending with .0 is allowed, as
the fractional part is zero. Non-zero fractional part (for example
12.34) is disallowed. A new test is added to test all those behaviors.

Before the JSON parsing library was switched to RapidJSON from JsonCpp,
this statement used to work correctly, because JsonCpp transparently
casts double to integer value.

This behavior differs from Cassandra, which disallows those types of
numbers (1e7, 123.0 and 12.34).

Fix typo in if condition: "if (value.GetUint64())" to
"if (value.IsUint64())".

Fixes #10100

(cherry picked from commit efe7456f0a)
2022-09-01 16:03:49 +03:00
Avi Kivity
dfdc128faf Merge 'row_cache: Fix missing row if upper bound of population range is evicted and has adjacent dummy' from Tomasz Grabiec
Scenario:

cache = [
    row(pos=2, continuous=false),
    row(pos=after(2), dummy=true)
]

Scanning read starts, starts populating [-inf, before(2)] from sstables.

row(pos=2) is evicted.

cache = [
    row(pos=after(2), dummy=true)
]

Scanning read finishes reading from sstables.

Refreshes cache cursor via
partition_snapshot_row_cursor::maybe_refresh(), which calls
partition_snapshot_row_cursor::advance_to() because iterators are
invalidated. This advances the cursor to
after(2). no_clustering_row_between(2, after(2)) returns true, so
advance_to() returns true, and maybe_refresh() returns true. This is
interpreted by the cache reader as "the cursor has not moved forward",
so it marks the range as complete, without emitting the row with
pos=2. Also, it marks row(pos=after(2)) as continuous, so later reads
will also miss the row.

The bug is in advance_to(), which is using
no_clustering_row_between(a, b) to determine its result, which by
definition excludes the starting key.

Discovered by row_cache_test.cc::test_concurrent_reads_and_eviction
with reduced key range in the random_mutation_generator (1024 -> 16).

Fixes #11239

Closes #11240

* github.com:scylladb/scylladb:
  test: mvcc: Fix illegal use of maybe_refresh()
  tests: row_cache_test: Add test_eviction_of_upper_bound_of_population_range()
  tests: row_cache_test: Introduce one_shot mode to throttle
  row_cache: Fix missing row if upper bound of population range is evicted and has adjacent dummy
2022-08-11 18:36:44 +02:00
154 changed files with 4125 additions and 712 deletions


@@ -60,7 +60,7 @@ fi
# Default scylla product/version tags
PRODUCT=scylla
VERSION=5.0.2
VERSION=5.0.13
if test -f version
then


@@ -78,6 +78,11 @@ future<> controller::start_server() {
_executor.start(std::ref(_gossiper), std::ref(_proxy), std::ref(_mm), std::ref(_sys_dist_ks), sharded_parameter(get_cdc_metadata, std::ref(_cdc_gen_svc)), _ssg.value()).get();
_server.start(std::ref(_executor), std::ref(_proxy), std::ref(_gossiper)).get();
// Note: from this point on, if start_server() throws for any reason,
// it must first call stop_server() to stop the executor and server
// services we just started - or Scylla will cause an assertion
// failure when the controller object is destroyed in the exception
// unwinding.
std::optional<uint16_t> alternator_port;
if (_config.alternator_port()) {
alternator_port = _config.alternator_port();
@@ -104,7 +109,13 @@ future<> controller::start_server() {
}
opts.erase("require_client_auth");
opts.erase("truststore");
utils::configure_tls_creds_builder(creds.value(), std::move(opts)).get();
try {
utils::configure_tls_creds_builder(creds.value(), std::move(opts)).get();
} catch(...) {
logger.error("Failed to set up Alternator TLS credentials: {}", std::current_exception());
stop_server().get();
std::throw_with_nested(std::runtime_error("Failed to set up Alternator TLS credentials"));
}
}
bool alternator_enforce_authorization = _config.alternator_enforce_authorization();
_server.invoke_on_all(


@@ -34,6 +34,7 @@
#include "expressions.hh"
#include "conditions.hh"
#include "cql3/constants.hh"
#include "cql3/util.hh"
#include <optional>
#include "utils/overloaded_functor.hh"
#include "seastar/json/json_elements.hh"
@@ -46,6 +47,7 @@
#include <seastar/core/coroutine.hh>
#include <boost/range/adaptors.hpp>
#include <boost/range/algorithm/find_end.hpp>
#include <unordered_set>
#include "service/storage_proxy.hh"
#include "gms/gossiper.hh"
#include "schema_registry.hh"
@@ -148,16 +150,16 @@ static void validate_table_name(const std::string& name) {
// instead of each component individually as DynamoDB does.
// The view_name() function assumes the table_name has already been validated
// but validates the legality of index_name and the combination of both.
static std::string view_name(const std::string& table_name, const std::string& index_name, const std::string& delim = ":") {
static std::string view_name(const std::string& table_name, std::string_view index_name, const std::string& delim = ":") {
static const std::regex valid_index_name_chars ("[a-zA-Z0-9_.-]*");
if (index_name.length() < 3) {
throw api_error::validation("IndexName must be at least 3 characters long");
}
if (!std::regex_match(index_name.c_str(), valid_index_name_chars)) {
if (!std::regex_match(index_name.data(), valid_index_name_chars)) {
throw api_error::validation(
format("IndexName '{}' must satisfy regular expression pattern: [a-zA-Z0-9_.-]+", index_name));
}
std::string ret = table_name + delim + index_name;
std::string ret = table_name + delim + std::string(index_name);
if (ret.length() > max_table_name_length) {
throw api_error::validation(
format("The total length of TableName ('{}') and IndexName ('{}') cannot exceed {} characters",
@@ -166,7 +168,7 @@ static std::string view_name(const std::string& table_name, const std::string& i
return ret;
}
static std::string lsi_name(const std::string& table_name, const std::string& index_name) {
static std::string lsi_name(const std::string& table_name, std::string_view index_name) {
return view_name(table_name, index_name, "!:");
}
@@ -273,16 +275,16 @@ get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) {
if (index_name) {
if (index_name->IsString()) {
orig_table_name = std::move(table_name);
table_name = view_name(orig_table_name, index_name->GetString());
table_name = view_name(orig_table_name, rjson::to_string_view(*index_name));
type = table_or_view_type::gsi;
} else {
throw api_error::validation(
format("Non-string IndexName '{}'", index_name->GetString()));
format("Non-string IndexName '{}'", rjson::to_string_view(*index_name)));
}
// If no tables for global indexes were found, the index may be local
if (!proxy.data_dictionary().has_schema(keyspace_name, table_name)) {
type = table_or_view_type::lsi;
table_name = lsi_name(orig_table_name, index_name->GetString());
table_name = lsi_name(orig_table_name, rjson::to_string_view(*index_name));
}
}
@@ -432,6 +434,11 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
rjson::add(table_description, "BillingModeSummary", rjson::empty_object());
rjson::add(table_description["BillingModeSummary"], "BillingMode", "PAY_PER_REQUEST");
rjson::add(table_description["BillingModeSummary"], "LastUpdateToPayPerRequestDateTime", rjson::value(creation_date_seconds));
// In PAY_PER_REQUEST billing mode, provisioned capacity should return 0
rjson::add(table_description, "ProvisionedThroughput", rjson::empty_object());
rjson::add(table_description["ProvisionedThroughput"], "ReadCapacityUnits", 0);
rjson::add(table_description["ProvisionedThroughput"], "WriteCapacityUnits", 0);
rjson::add(table_description["ProvisionedThroughput"], "NumberOfDecreasesToday", 0);
std::unordered_map<std::string,std::string> key_attribute_types;
// Add base table's KeySchema and collect types for AttributeDefinitions:
@@ -453,6 +460,11 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
rjson::add(view_entry, "IndexName", rjson::from_string(index_name));
// Add indexes's KeySchema and collect types for AttributeDefinitions:
describe_key_schema(view_entry, *vptr, key_attribute_types);
// Add projection type
rjson::value projection = rjson::empty_object();
rjson::add(projection, "ProjectionType", "ALL");
// FIXME: we have to get ProjectionType from the schema when it is added
rjson::add(view_entry, "Projection", std::move(projection));
// Local secondary indexes are marked by an extra '!' sign occurring before the ':' delimiter
rjson::value& index_array = (delim_it > 1 && cf_name[delim_it-1] == '!') ? lsi_array : gsi_array;
rjson::push_back(index_array, std::move(view_entry));
@@ -884,17 +896,23 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
const rjson::value* gsi = rjson::find(request, "GlobalSecondaryIndexes");
std::vector<schema_builder> view_builders;
std::vector<sstring> where_clauses;
std::unordered_set<std::string> index_names;
if (gsi) {
if (!gsi->IsArray()) {
co_return api_error::validation("GlobalSecondaryIndexes must be an array.");
}
for (const rjson::value& g : gsi->GetArray()) {
const rjson::value* index_name = rjson::find(g, "IndexName");
if (!index_name || !index_name->IsString()) {
const rjson::value* index_name_v = rjson::find(g, "IndexName");
if (!index_name_v || !index_name_v->IsString()) {
co_return api_error::validation("GlobalSecondaryIndexes IndexName must be a string.");
}
std::string vname(view_name(table_name, index_name->GetString()));
elogger.trace("Adding GSI {}", index_name->GetString());
std::string_view index_name = rjson::to_string_view(*index_name_v);
auto [it, added] = index_names.emplace(index_name);
if (!added) {
co_return api_error::validation(format("Duplicate IndexName '{}', ", index_name));
}
std::string vname(view_name(table_name, index_name));
elogger.trace("Adding GSI {}", index_name);
// FIXME: read and handle "Projection" parameter. This will
// require the MV code to copy just parts of the attrs map.
schema_builder view_builder(keyspace_name, vname);
@@ -927,9 +945,10 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
if (!range_key.empty() && range_key != view_hash_key && range_key != view_range_key) {
add_column(view_builder, range_key, attribute_definitions, column_kind::clustering_key);
}
sstring where_clause = "\"" + view_hash_key + "\" IS NOT NULL";
sstring where_clause = format("{} IS NOT NULL", cql3::util::maybe_quote(view_hash_key));
if (!view_range_key.empty()) {
where_clause = where_clause + " AND \"" + view_hash_key + "\" IS NOT NULL";
where_clause = format("{} AND {} IS NOT NULL", where_clause,
cql3::util::maybe_quote(view_range_key));
}
where_clauses.push_back(std::move(where_clause));
view_builders.emplace_back(std::move(view_builder));
@@ -942,12 +961,17 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
throw api_error::validation("LocalSecondaryIndexes must be an array.");
}
for (const rjson::value& l : lsi->GetArray()) {
const rjson::value* index_name = rjson::find(l, "IndexName");
if (!index_name || !index_name->IsString()) {
const rjson::value* index_name_v = rjson::find(l, "IndexName");
if (!index_name_v || !index_name_v->IsString()) {
throw api_error::validation("LocalSecondaryIndexes IndexName must be a string.");
}
std::string vname(lsi_name(table_name, index_name->GetString()));
elogger.trace("Adding LSI {}", index_name->GetString());
std::string_view index_name = rjson::to_string_view(*index_name_v);
auto [it, added] = index_names.emplace(index_name);
if (!added) {
co_return api_error::validation(format("Duplicate IndexName '{}', ", index_name));
}
std::string vname(lsi_name(table_name, index_name));
elogger.trace("Adding LSI {}", index_name);
if (range_key.empty()) {
co_return api_error::validation("LocalSecondaryIndex requires that the base table have a range key");
}
@@ -979,9 +1003,10 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
// Note above we don't need to add virtual columns, as all
// base columns were copied to view. TODO: reconsider the need
// for virtual columns when we support Projection.
sstring where_clause = "\"" + view_hash_key + "\" IS NOT NULL";
sstring where_clause = format("{} IS NOT NULL", cql3::util::maybe_quote(view_hash_key));
if (!view_range_key.empty()) {
where_clause = where_clause + " AND \"" + view_range_key + "\" IS NOT NULL";
where_clause = format("{} AND {} IS NOT NULL", where_clause,
cql3::util::maybe_quote(view_range_key));
}
where_clauses.push_back(std::move(where_clause));
view_builders.emplace_back(std::move(view_builder));
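
Both the GSI and LSI loops above use the same set-based duplicate check. A minimal sketch of the idiom, with a hypothetical `check_unique()` helper: `unordered_set::emplace()` returns an `{iterator, inserted}` pair, and a false second element means the name was seen before.

```cpp
// Sketch only: check_unique() is a hypothetical helper, not Scylla code.
#include <initializer_list>
#include <iostream>
#include <stdexcept>
#include <string>
#include <string_view>
#include <unordered_set>

void check_unique(std::initializer_list<std::string_view> names) {
    std::unordered_set<std::string> seen;
    for (std::string_view name : names) {
        auto [it, added] = seen.emplace(name);
        if (!added) {
            throw std::invalid_argument("Duplicate IndexName '" + std::string(name) + "'");
        }
    }
}

int main() {
    try {
        check_unique({"by_time", "by_user", "by_time"});
    } catch (const std::exception& e) {
        std::cout << e.what() << '\n';  // Duplicate IndexName 'by_time'
    }
}
```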

View File

@@ -143,19 +143,24 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
auto table = find_table(_proxy, request);
auto db = _proxy.data_dictionary();
auto cfs = db.get_tables();
auto i = cfs.begin();
auto e = cfs.end();
if (limit < 1) {
throw api_error::validation("Limit must be 1 or more");
}
// TODO: the unordered_map here is not really well suited for partial
// querying - we're sorting on local hash order, and creating a table
// between queries may or may not miss info. But that should be rare,
// and we can probably expect this to be a single call.
// # 12601 (maybe?) - sort the set of tables on ID. This should ensure we never
// generate duplicates in a paged listing here. Can obviously miss things if they
// are added between paged calls and end up with a "smaller" UUID/ARN, but that
// is to be expected.
std::sort(cfs.begin(), cfs.end(), [](const data_dictionary::table& t1, const data_dictionary::table& t2) {
return t1.schema()->id() < t2.schema()->id();
});
auto i = cfs.begin();
auto e = cfs.end();
if (streams_start) {
i = std::find_if(i, e, [&](data_dictionary::table t) {
i = std::find_if(i, e, [&](const data_dictionary::table& t) {
return t.schema()->id() == streams_start
&& cdc::get_base_table(db.real_database(), *t.schema())
&& is_alternator_keyspace(t.schema()->ks_name())
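
A minimal sketch of the paging idiom the sort above enables, with a toy table type (not the real `data_dictionary` API): sorting on a stable key lets `find_if` resume exactly where the previous page stopped.

```cpp
// Sketch only: toy table type, not the real data_dictionary::table.
#include <algorithm>
#include <iostream>
#include <vector>

struct table { int id; };

int main() {
    std::vector<table> tables{{3}, {1}, {2}};
    std::sort(tables.begin(), tables.end(),
              [](const table& a, const table& b) { return a.id < b.id; });

    int streams_start = 1;  // ID at which the previous page ended
    auto i = std::find_if(tables.begin(), tables.end(),
                          [&](const table& t) { return t.id == streams_start; });
    if (i != tables.end()) ++i;  // resume after the last-seen table
    for (; i != tables.end(); ++i) {
        std::cout << i->id << '\n';  // prints 2 then 3
    }
}
```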

View File

@@ -12,6 +12,7 @@
#include <seastar/core/sharded.hh>
#include <seastar/core/abort_source.hh>
#include <seastar/core/semaphore.hh>
#include "data_dictionary/data_dictionary.hh"
namespace replica {
class database;

View File

@@ -593,6 +593,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
if (column_families.empty()) {
column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
}
apilog.debug("force_keyspace_compaction: keyspace={} tables={}", keyspace, column_families);
return ctx.db.invoke_on_all([keyspace, column_families] (replica::database& db) -> future<> {
auto table_ids = boost::copy_range<std::vector<utils::UUID>>(column_families | boost::adaptors::transformed([&] (auto& cf_name) {
return db.find_uuid(keyspace, cf_name);
@@ -617,6 +618,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
if (column_families.empty()) {
column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
}
apilog.info("force_keyspace_cleanup: keyspace={} tables={}", keyspace, column_families);
return ss.local().is_cleanup_allowed(keyspace).then([&ctx, keyspace,
column_families = std::move(column_families)] (bool is_cleanup_allowed) mutable {
if (!is_cleanup_allowed) {
@@ -635,7 +637,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
// as a table can be dropped during loop below, let's find it before issuing the cleanup request.
for (auto& id : table_ids) {
replica::table& t = db.find_column_family(id);
co_await cm.perform_cleanup(db, &t);
co_await t.perform_cleanup_compaction(db);
}
co_return;
}).then([]{
@@ -645,6 +647,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
ss::perform_keyspace_offstrategy_compaction.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> tables) -> future<json::json_return_type> {
apilog.info("perform_keyspace_offstrategy_compaction: keyspace={} tables={}", keyspace, tables);
co_return co_await ctx.db.map_reduce0([&keyspace, &tables] (replica::database& db) -> future<bool> {
bool needed = false;
for (const auto& table : tables) {
@@ -658,6 +661,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::upgrade_sstables.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);
apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, column_families, exclude_current_version);
return ctx.db.invoke_on_all([=] (replica::database& db) {
return do_for_each(column_families, [=, &db](sstring cfname) {
auto& cm = db.get_compaction_manager();
@@ -672,6 +676,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::force_keyspace_flush.set(r, [&ctx](std::unique_ptr<request> req) -> future<json::json_return_type> {
auto keyspace = validate_keyspace(ctx, req->param);
auto column_families = parse_tables(keyspace, ctx, req->query_parameters, "cf");
apilog.info("perform_keyspace_flush: keyspace={} tables={}", keyspace, column_families);
auto &db = ctx.db.local();
if (column_families.empty()) {
co_await db.flush_on_all(keyspace);
@@ -683,6 +688,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::decommission.set(r, [&ss](std::unique_ptr<request> req) {
apilog.info("decommission");
return ss.local().decommission().then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -698,6 +704,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::remove_node.set(r, [&ss](std::unique_ptr<request> req) {
auto host_id = req->get_query_param("host_id");
std::vector<sstring> ignore_nodes_strs= split(req->get_query_param("ignore_nodes"), ",");
apilog.info("remove_node: host_id={} ignore_nodes={}", host_id, ignore_nodes_strs);
auto ignore_nodes = std::list<gms::inet_address>();
for (std::string n : ignore_nodes_strs) {
try {
@@ -770,6 +777,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
ss::drain.set(r, [&ss](std::unique_ptr<request> req) {
apilog.info("drain");
return ss.local().drain().then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -802,12 +810,14 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
ss::stop_gossiping.set(r, [&ss](std::unique_ptr<request> req) {
apilog.info("stop_gossiping");
return ss.local().stop_gossiping().then([] {
return make_ready_future<json::json_return_type>(json_void());
});
});
ss::start_gossiping.set(r, [&ss](std::unique_ptr<request> req) {
apilog.info("start_gossiping");
return ss.local().start_gossiping().then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -904,6 +914,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::rebuild.set(r, [&ss](std::unique_ptr<request> req) {
auto source_dc = req->get_query_param("source_dc");
apilog.info("rebuild: source_dc={}", source_dc);
return ss.local().rebuild(std::move(source_dc)).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -940,6 +951,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
// FIXME: We should truncate schema tables if more than one node in the cluster.
auto& sp = service::get_storage_proxy();
auto& fs = sp.local().features();
apilog.info("reset_local_schema");
return db::schema_tables::recalculate_schema_version(sp, fs).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -947,6 +959,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::set_trace_probability.set(r, [](std::unique_ptr<request> req) {
auto probability = req->get_query_param("probability");
apilog.info("set_trace_probability: probability={}", probability);
return futurize_invoke([probability] {
double real_prob = std::stod(probability.c_str());
return tracing::tracing::tracing_instance().invoke_on_all([real_prob] (auto& local_tracing) {
@@ -984,6 +997,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
auto ttl = req->get_query_param("ttl");
auto threshold = req->get_query_param("threshold");
auto fast = req->get_query_param("fast");
apilog.info("set_slow_query: enable={} ttl={} threshold={} fast={}", enable, ttl, threshold, fast);
try {
return tracing::tracing::tracing_instance().invoke_on_all([enable, ttl, threshold, fast] (auto& local_tracing) {
if (threshold != "") {
@@ -1010,6 +1024,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
auto keyspace = validate_keyspace(ctx, req->param);
auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");
apilog.info("enable_auto_compaction: keyspace={} tables={}", keyspace, tables);
return set_tables_autocompaction(ctx, ss.local(), keyspace, tables, true);
});
@@ -1017,6 +1032,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
auto keyspace = validate_keyspace(ctx, req->param);
auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");
apilog.info("disable_auto_compaction: keyspace={} tables={}", keyspace, tables);
return set_tables_autocompaction(ctx, ss.local(), keyspace, tables, false);
});
@@ -1357,7 +1373,7 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
if (!req_param<bool>(*req, "disable_snapshot", false)) {
auto tag = format("pre-scrub-{:d}", db_clock::now().time_since_epoch().count());
f = parallel_for_each(column_families, [&snap_ctl, keyspace, tag](sstring cf) {
return snap_ctl.local().take_column_family_snapshot(keyspace, cf, tag);
return snap_ctl.local().take_column_family_snapshot(keyspace, cf, tag, db::snapshot_ctl::skip_flush::no, db::snapshot_ctl::allow_view_snapshots::yes);
});
}

View File

@@ -601,16 +601,11 @@ future<> compaction_manager::stop() {
}
}
void compaction_manager::really_do_stop() {
if (_state == state::none || _state == state::stopped) {
return;
}
_state = state::stopped;
future<> compaction_manager::really_do_stop() {
cmlog.info("Asked to stop");
// Reset the metrics registry
_metrics.clear();
_stop_future.emplace(stop_ongoing_compactions("shutdown").then([this] () mutable {
return stop_ongoing_compactions("shutdown").then([this] () mutable {
reevaluate_postponed_compactions();
return std::move(_waiting_reevalution);
}).then([this] {
@@ -618,12 +613,34 @@ void compaction_manager::really_do_stop() {
_compaction_submission_timer.cancel();
cmlog.info("Stopped");
return _compaction_controller.shutdown();
}));
});
}
template <typename Ex>
requires std::is_base_of_v<std::exception, Ex> &&
requires (const Ex& ex) {
{ ex.code() } noexcept -> std::same_as<const std::error_code&>;
}
auto swallow_enospc(const Ex& ex) noexcept {
if (ex.code().value() != ENOSPC) {
return make_exception_future<>(std::make_exception_ptr(ex));
}
cmlog.warn("Got ENOSPC on stop, ignoring...");
return make_ready_future<>();
}
void compaction_manager::do_stop() noexcept {
if (_state == state::none || _state == state::stopped) {
return;
}
try {
really_do_stop();
_state = state::stopped;
_stop_future = really_do_stop()
.handle_exception_type([] (const std::system_error& ex) { return swallow_enospc(ex); })
.handle_exception_type([] (const storage_io_error& ex) { return swallow_enospc(ex); })
;
} catch (...) {
try {
cmlog.error("Failed to stop the manager: {}", std::current_exception());

View File

@@ -235,7 +235,7 @@ public:
// Stop all fibers, without waiting. Safe to be called multiple times.
void do_stop() noexcept;
void really_do_stop();
future<> really_do_stop();
// Submit a table to be compacted.
void submit(replica::table* t);

View File

@@ -615,6 +615,8 @@ arg_parser.add_argument('--static-yaml-cpp', dest='staticyamlcpp', action='store
help='Link libyaml-cpp statically')
arg_parser.add_argument('--tests-debuginfo', action='store', dest='tests_debuginfo', type=int, default=0,
help='Enable(1)/disable(0) compiler debug information generation for tests')
arg_parser.add_argument('--perf-tests-debuginfo', action='store', dest='perf_tests_debuginfo', type=int, default=0,
help='Enable(1)/disable(0) compiler debug information generation for perf tests')
arg_parser.add_argument('--python', action='store', dest='python', default='python3',
help='Python3 path')
arg_parser.add_argument('--split-dwarf', dest='split_dwarf', action='store_true', default=False,
@@ -1377,6 +1379,7 @@ linker_flags = linker_flags(compiler=args.cxx)
dbgflag = '-g -gz' if args.debuginfo else ''
tests_link_rule = 'link' if args.tests_debuginfo else 'link_stripped'
perf_tests_link_rule = 'link' if args.perf_tests_debuginfo else 'link_stripped'
# Strip if debuginfo is disabled, otherwise we end up with partial
# debug info from the libraries we static link with
@@ -1901,7 +1904,8 @@ with open(buildfile_tmp, 'w') as f:
# So we strip the tests by default; The user can very
# quickly re-link the test unstripped by adding a "_g"
# to the test name, e.g., "ninja build/release/testname_g"
f.write('build $builddir/{}/{}: {}.{} {} | {} {}\n'.format(mode, binary, tests_link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
link_rule = perf_tests_link_rule if binary.startswith('test/perf/') else tests_link_rule
f.write('build $builddir/{}/{}: {}.{} {} | {} {}\n'.format(mode, binary, link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
f.write(' libs = {}\n'.format(local_libs))
f.write('build $builddir/{}/{}_g: {}.{} {} | {} {}\n'.format(mode, binary, regular_link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
f.write(' libs = {}\n'.format(local_libs))
@@ -2004,7 +2008,8 @@ with open(buildfile_tmp, 'w') as f:
f.write('build {}: cxx.{} {} || {}\n'.format(obj, mode, cc, ' '.join(serializers)))
if cc.endswith('Parser.cpp'):
# Unoptimized parsers end up using huge amounts of stack space and overflowing their stack
flags = '-O1'
flags = '-O1' if modes[mode]['optimization-level'] in ['0', 'g', 's'] else ''
if has_sanitize_address_use_after_scope:
flags += ' -fno-sanitize-address-use-after-scope'
f.write(' obj_cxxflags = %s\n' % flags)

View File

@@ -1386,7 +1386,7 @@ serviceLevelOrRoleName returns [sstring name]
std::transform($name.begin(), $name.end(), $name.begin(), ::tolower); }
| t=STRING_LITERAL { $name = sstring($t.text); }
| t=QUOTED_NAME { $name = sstring($t.text); }
| k=unreserved_keyword { $name = sstring($t.text);
| k=unreserved_keyword { $name = k;
std::transform($name.begin(), $name.end(), $name.begin(), ::tolower);}
| QMARK {add_recognition_error("Bind variables cannot be used for service levels or role names");}
;

View File

@@ -12,6 +12,7 @@
#include "cql3_type.hh"
#include "cql3/util.hh"
#include "exceptions/exceptions.hh"
#include "ut_name.hh"
#include "data_dictionary/data_dictionary.hh"
#include "data_dictionary/user_types_metadata.hh"
@@ -436,7 +437,20 @@ sstring maybe_quote(const sstring& identifier) {
}
if (!need_quotes) {
return identifier;
// A seemingly valid identifier matching [a-z][a-z0-9_]* may still
// need quoting if it is a CQL keyword, e.g., "to" (see issue #9450).
// While our parser Cql.g has different production rules for different
// types of identifiers (column names, table names, etc.), all of
// these behave identically for alphanumeric strings: they exclude
// many keywords but allow keywords listed as "unreserved keywords".
// So we can use any of them, for example cident.
try {
cql3::util::do_with_parser(identifier, std::mem_fn(&cql3_parser::CqlParser::cident));
return identifier;
} catch(exceptions::syntax_exception&) {
// This alphanumeric string is not a valid identifier, so fall
// through to have it quoted:
}
}
if (num_quotes == 0) {
return make_sstring("\"", identifier, "\"");

View File

@@ -1293,7 +1293,7 @@ expression search_and_replace(const expression& e,
};
},
[&] (const binary_operator& oper) -> expression {
return binary_operator(recurse(oper.lhs), oper.op, recurse(oper.rhs));
return binary_operator(recurse(oper.lhs), oper.op, recurse(oper.rhs), oper.order);
},
[&] (const column_mutation_attribute& cma) -> expression {
return column_mutation_attribute{cma.kind, recurse(cma.column)};

View File

@@ -83,7 +83,7 @@ public:
virtual sstring assignment_testable_source_context() const override {
auto&& name = _type->field_name(_field);
auto sname = sstring(reinterpret_cast<const char*>(name.begin(), name.size()));
auto sname = std::string_view(reinterpret_cast<const char*>(name.data()), name.size());
return format("{}.{}", _selected, sname);
}

View File

@@ -422,11 +422,16 @@ bool result_set_builder::restrictions_filter::do_filter(const selection& selecti
}
auto clustering_columns_restrictions = _restrictions->get_clustering_columns_restrictions();
if (dynamic_pointer_cast<cql3::restrictions::multi_column_restriction>(clustering_columns_restrictions)) {
bool has_multi_col_clustering_restrictions =
dynamic_pointer_cast<cql3::restrictions::multi_column_restriction>(clustering_columns_restrictions) != nullptr;
if (has_multi_col_clustering_restrictions) {
clustering_key_prefix ckey = clustering_key_prefix::from_exploded(clustering_key);
return expr::is_satisfied_by(
bool multi_col_clustering_satisfied = expr::is_satisfied_by(
clustering_columns_restrictions->expression,
partition_key, clustering_key, static_row, row, selection, _options);
if (!multi_col_clustering_satisfied) {
return false;
}
}
auto static_row_iterator = static_row.iterator();
@@ -474,6 +479,13 @@ bool result_set_builder::restrictions_filter::do_filter(const selection& selecti
if (_skip_ck_restrictions) {
continue;
}
if (has_multi_col_clustering_restrictions) {
// Mixing multi column and single column restrictions on clustering
// key columns is forbidden.
// Since there are multi column restrictions we have to skip
// evaluating single column restrictions or we will get an error.
continue;
}
auto clustering_key_restrictions_map = _restrictions->get_single_column_clustering_key_restrictions();
auto restr_it = clustering_key_restrictions_map.find(cdef);
if (restr_it == clustering_key_restrictions_map.end()) {

View File

@@ -254,6 +254,10 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
if (options.getSerialConsistency() == null)
throw new InvalidRequestException("Invalid empty serial consistency level");
#endif
for (size_t i = 0; i < _statements.size(); ++i) {
_statements[i].statement->validate_primary_key_restrictions(options.for_statement(i));
}
if (_has_conditions) {
++_stats.cas_batches;
_stats.statements_in_cas_batches += _statements.size();

View File

@@ -121,6 +121,9 @@ std::optional<mutation> cas_request::apply(foreign_ptr<lw_shared_ptr<query::resu
const update_parameters::prefetch_data::row* cas_request::find_old_row(const cas_row_update& op) const {
static const clustering_key empty_ckey = clustering_key::make_empty();
if (_key.empty()) {
throw exceptions::invalid_request_exception("partition key ranges empty - probably caused by an unset value");
}
const partition_key& pkey = _key.front().start()->value().key().value();
// If a statement has only static columns conditions, we must ignore its clustering columns
// restriction when choosing a row to check the conditions, i.e. choose any partition row,
@@ -134,6 +137,9 @@ const update_parameters::prefetch_data::row* cas_request::find_old_row(const cas
// Another case when we pass an empty clustering key prefix is apparently when the table
// doesn't have any clustering key columns and the clustering key range is empty (open
// ended on both sides).
if (op.ranges.empty()) {
throw exceptions::invalid_request_exception("clustering key ranges empty - probably caused by an unset value");
}
const clustering_key& ckey = !op.statement.has_only_static_column_conditions() && op.ranges.front().start() ?
op.ranges.front().start()->value() : empty_ckey;
return _rows.find_row(pkey, ckey);

View File

@@ -20,6 +20,7 @@
#include "gms/feature_service.hh"
#include "tombstone_gc_extension.hh"
#include "tombstone_gc.hh"
#include "utils/bloom_calculations.hh"
#include <boost/algorithm/string/predicate.hpp>
@@ -145,6 +146,16 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
throw exceptions::configuration_exception(KW_MAX_INDEX_INTERVAL + " must be greater than " + KW_MIN_INDEX_INTERVAL);
}
if (get_simple(KW_BF_FP_CHANCE)) {
double bloom_filter_fp_chance = get_double(KW_BF_FP_CHANCE, 0/*not used*/);
double min_bloom_filter_fp_chance = utils::bloom_calculations::min_supported_bloom_filter_fp_chance();
if (bloom_filter_fp_chance <= min_bloom_filter_fp_chance || bloom_filter_fp_chance > 1.0) {
throw exceptions::configuration_exception(format(
"{} must be larger than {} and less than or equal to 1.0 (got {})",
KW_BF_FP_CHANCE, min_bloom_filter_fp_chance, bloom_filter_fp_chance));
}
}
speculative_retry::from_sstring(get_string(KW_SPECULATIVE_RETRY, speculative_retry(speculative_retry::type::NONE, 0).to_sstring()));
}

View File

@@ -13,6 +13,7 @@
#include "cql3/statements/cf_prop_defs.hh"
#include "cql3/column_identifier.hh"
#include "data_dictionary/data_dictionary.hh"
namespace cql3 {

View File

@@ -110,9 +110,6 @@ future<> modification_statement::check_access(query_processor& qp, const service
future<std::vector<mutation>>
modification_statement::get_mutations(query_processor& qp, const query_options& options, db::timeout_clock::time_point timeout, bool local, int64_t now, service::query_state& qs) const {
if (_restrictions->range_or_slice_eq_null(options)) { // See #7852 and #9290.
throw exceptions::invalid_request_exception("Invalid null value in condition for a key column");
}
auto cl = options.get_consistency();
auto json_cache = maybe_prepare_json_cache(options);
auto keys = build_partition_keys(options, json_cache);
@@ -245,6 +242,12 @@ modification_statement::execute(query_processor& qp, service::query_state& qs, c
return modify_stage(this, seastar::ref(qp), seastar::ref(qs), seastar::cref(options));
}
void modification_statement::validate_primary_key_restrictions(const query_options& options) const {
if (_restrictions->range_or_slice_eq_null(options)) { // See #7852 and #9290.
throw exceptions::invalid_request_exception("Invalid null value in condition for a key column");
}
}
future<::shared_ptr<cql_transport::messages::result_message>>
modification_statement::do_execute(query_processor& qp, service::query_state& qs, const query_options& options) const {
if (has_conditions() && options.get_protocol_version() == 1) {
@@ -255,6 +258,8 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
inc_cql_stats(qs.get_client_state().is_internal());
validate_primary_key_restrictions(options);
if (has_conditions()) {
return execute_with_condition(qp, qs, options);
}

View File

@@ -231,6 +231,8 @@ public:
// True if this statement needs to read only static column values to check if it can be applied.
bool has_only_static_column_conditions() const { return !_has_regular_column_conditions && _has_static_column_conditions; }
void validate_primary_key_restrictions(const query_options& options) const;
virtual future<::shared_ptr<cql_transport::messages::result_message>>
execute(query_processor& qp, service::query_state& qs, const query_options& options) const override;

View File

@@ -30,7 +30,7 @@ void sl_prop_defs::validate() {
data_value v = duration_type->deserialize(duration_type->from_string(*repr));
cql_duration duration = static_pointer_cast<const duration_type_impl>(duration_type)->from_value(v);
if (duration.months || duration.days) {
throw exceptions::invalid_request_exception("Timeout values cannot be longer than 24h");
throw exceptions::invalid_request_exception("Timeout values cannot be expressed in days/months");
}
if (duration.nanoseconds % 1'000'000 != 0) {
throw exceptions::invalid_request_exception("Timeout values must be expressed in millisecond granularity");

View File

@@ -18,6 +18,8 @@
#include "types/listlike_partial_deserializing_iterator.hh"
#include "utils/managed_bytes.hh"
#include "exceptions/exceptions.hh"
#include <boost/algorithm/string/trim_all.hpp>
#include <boost/algorithm/string.hpp>
static inline bool is_control_char(char c) {
return c >= 0 && c <= 0x1F;
@@ -78,8 +80,35 @@ static int64_t to_int64_t(const rjson::value& value) {
return value.GetInt();
} else if (value.IsUint()) {
return value.GetUint();
} else if (value.GetUint64()) {
} else if (value.IsUint64()) {
return value.GetUint64(); // NOTICE: large uint64_t values will overflow
} else if (value.IsDouble()) {
// We allow specifying integer constants
// using scientific notation (for example 1.3e8)
// and floating-point numbers ending with .0 (for example 12.0),
// but not floating-point numbers with a fractional part (12.34).
//
// The reason is that JSON standard does not have separate
// types for integers and floating-point numbers, only
// a single "number" type. Some serializers may
// produce an integer in that floating-point format.
double double_value = value.GetDouble();
// Check if the value contains a disallowed fractional part (.34 from 12.34).
// With RapidJSON and an integer value in the range [-(2^53)+1, (2^53)-1],
// the fractional part will be zero, as the entire value
// fits in the 53-bit significand. RapidJSON's parsing code does not lose accuracy:
// when parsing a number like 12.34e8, it accumulates 1234 into an int64_t,
// then converts it to double and multiplies by a power of 10, never producing any
// digit in the fractional part.
double integral;
double fractional = std::modf(double_value, &integral);
if (fractional != 0.0 && fractional != -0.0) {
throw marshal_exception(format("Incorrect JSON floating-point value "
"for int64 type: {} (it should not contain fractional part {})", value, fractional));
}
return double_value;
}
throw marshal_exception(format("Incorrect JSON value for int64 type: {}", value));
}
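
A minimal sketch of the `std::modf` check described in the comment above: the fractional part is zero for 1.3e8 and 12.0 but not for 12.34.

```cpp
// Sketch only: the same modf-based test, isolated from the JSON machinery.
#include <cmath>
#include <iostream>

bool is_integral_double(double v) {
    double integral;
    double fractional = std::modf(v, &integral);
    return fractional == 0.0;  // -0.0 also compares equal to 0.0
}

int main() {
    std::cout << is_integral_double(1.3e8) << '\n';  // 1
    std::cout << is_integral_double(12.0)  << '\n';  // 1
    std::cout << is_integral_double(12.34) << '\n';  // 0
}
```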
@@ -189,7 +218,7 @@ struct from_json_object_visitor {
throw marshal_exception("bytes_type must be represented as string");
}
std::string_view string_v = rjson::to_string_view(value);
if (string_v.size() < 2 && string_v[0] != '0' && string_v[1] != 'x') {
if (string_v.size() < 2 || string_v[0] != '0' || string_v[1] != 'x') {
throw marshal_exception("Blob JSON strings must start with 0x");
}
string_v.remove_prefix(2);
@@ -197,6 +226,17 @@ struct from_json_object_visitor {
}
bytes operator()(const boolean_type_impl& t) {
if (!value.IsBool()) {
if (value.IsString()) {
std::string str(rjson::to_string_view(value));
boost::trim_all(str);
boost::to_lower(str);
if (str == "true") {
return t.decompose(true);
} else if (str == "false") {
return t.decompose(false);
}
}
throw marshal_exception(format("Invalid JSON object {}", value));
}
return t.decompose(value.GetBool());
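
A minimal sketch of the lenient boolean parsing added above, isolated from the marshalling code: trim, lower-case, then accept exactly "true" or "false".

```cpp
// Sketch only: parse_bool() is a hypothetical stand-in for the visitor logic.
#include <boost/algorithm/string.hpp>
#include <boost/algorithm/string/trim_all.hpp>
#include <iostream>
#include <optional>
#include <string>

std::optional<bool> parse_bool(std::string str) {
    boost::trim_all(str);
    boost::to_lower(str);
    if (str == "true") {
        return true;
    } else if (str == "false") {
        return false;
    }
    return std::nullopt;
}

int main() {
    std::cout << *parse_bool("  TRUE ") << '\n';            // 1
    std::cout << parse_bool("maybe").has_value() << '\n';   // 0
}
```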

View File

@@ -74,6 +74,13 @@ std::unique_ptr<cql3::statements::raw::select_statement> build_select_statement(
/// forbids non-alpha-numeric characters in identifier names.
/// Quoting involves wrapping the string in double-quotes ("). A double-quote
/// character itself is quoted by doubling it.
/// maybe_quote() also quotes reserved CQL keywords (e.g., "to", "where")
/// but doesn't quote *unreserved* keywords (like ttl, int or as).
/// Note that this means that if new reserved keywords are added to the
/// parser, a saved output of maybe_quote() may no longer be parsable by
/// parser. To avoid this forward-compatibility issue, use quote() instead
/// of maybe_quote() - to unconditionally quote an identifier even if it is
/// lowercase and not (yet) a keyword.
sstring maybe_quote(const sstring& s);
// Check whether timestamp is not too far in the future as this probably
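
The forward-compatibility note above recommends `quote()` over `maybe_quote()`. Here is a minimal sketch (not Scylla's implementation) of the unconditional quoting contract: wrap in double quotes and double any embedded quote.

```cpp
// Sketch only: illustrates the quoting rules, not the real cql3::util::quote().
#include <iostream>
#include <string>

std::string quote(const std::string& id) {
    std::string out = "\"";
    for (char c : id) {
        out += c;
        if (c == '"') {
            out += '"';  // a double-quote is quoted by doubling it
        }
    }
    out += '"';
    return out;
}

int main() {
    std::cout << quote("to") << '\n';       // "to" - safe even if a keyword
    std::cout << quote("my\"col") << '\n';  // "my""col"
}
```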

View File

@@ -11,6 +11,7 @@
*/
#include <chrono>
#include <exception>
#include <seastar/core/future-util.hh>
#include <seastar/core/do_with.hh>
#include <seastar/core/semaphore.hh>
@@ -247,6 +248,7 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
} catch (data_dictionary::no_such_keyspace& ex) {
// should probably ignore and drop the batch
} catch (...) {
blogger.warn("Replay failed (will retry): {}", std::current_exception());
// timeout, overload etc.
// Do _not_ remove the batch, assuming we got a node write error.
// Since we don't have hints (which origin is satisfied with),

View File

@@ -881,6 +881,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
"Flush tables in the system_schema keyspace after schema modification. This is required for crash recovery, but slows down tests and can be disabled for them")
, restrict_replication_simplestrategy(this, "restrict_replication_simplestrategy", liveness::LiveUpdate, value_status::Used, db::tri_mode_restriction_t::mode::FALSE, "Controls whether to disable SimpleStrategy replication. Can be true, false, or warn.")
, restrict_dtcs(this, "restrict_dtcs", liveness::LiveUpdate, value_status::Used, db::tri_mode_restriction_t::mode::WARN, "Controls whether to prevent setting DateTieredCompactionStrategy. Can be true, false, or warn.")
, cache_index_pages(this, "cache_index_pages", liveness::LiveUpdate, value_status::Used, true,
"Keep SSTable index pages in the global cache after a SSTable read. Expected to improve performance for workloads with big partitions, but may degrade performance for workloads with small partitions.")
, default_log_level(this, "default_log_level", value_status::Used)
, logger_log_level(this, "logger_log_level", value_status::Used)
, log_to_stdout(this, "log_to_stdout", value_status::Used)

View File

@@ -365,6 +365,9 @@ public:
named_value<tri_mode_restriction> restrict_replication_simplestrategy;
named_value<tri_mode_restriction> restrict_dtcs;
named_value<bool> cache_index_pages;
seastar::logging_settings logging_settings(const log_cli::options&) const;
const db::extensions& extensions() const;

View File

@@ -75,7 +75,7 @@ future<> snapshot_ctl::do_take_snapshot(sstring tag, std::vector<sstring> keyspa
});
}
future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf) {
future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf, allow_view_snapshots av) {
if (ks_name.empty()) {
throw std::runtime_error("You must supply a keyspace name");
}
@@ -86,25 +86,25 @@ future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<
throw std::runtime_error("You must supply a snapshot name.");
}
return run_snapshot_modify_operation([this, ks_name = std::move(ks_name), tables = std::move(tables), tag = std::move(tag), sf] () mutable {
return do_take_column_family_snapshot(std::move(ks_name), std::move(tables), std::move(tag), sf);
return run_snapshot_modify_operation([this, ks_name = std::move(ks_name), tables = std::move(tables), tag = std::move(tag), sf, av] () mutable {
return do_take_column_family_snapshot(std::move(ks_name), std::move(tables), std::move(tag), sf, av);
});
}
future<> snapshot_ctl::do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf) {
future<> snapshot_ctl::do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf, allow_view_snapshots av) {
co_await check_snapshot_not_exist(ks_name, tag, tables);
for (const auto& table_name : tables) {
auto& cf = _db.local().find_column_family(ks_name, table_name);
if (cf.schema()->is_view()) {
if (cf.schema()->is_view() && !av) {
throw std::invalid_argument("Do not take a snapshot of a materialized view or a secondary index by itself. Run snapshot on the base table instead.");
}
}
co_await _db.local().snapshot_on_all(ks_name, std::move(tables), std::move(tag), bool(sf));
}
future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf) {
return take_column_family_snapshot(ks_name, std::vector<sstring>{cf_name}, tag, sf);
future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf, allow_view_snapshots av) {
return take_column_family_snapshot(ks_name, std::vector<sstring>{cf_name}, tag, sf, av);
}
future<> snapshot_ctl::clear_snapshot(sstring tag, std::vector<sstring> keyspace_names, sstring cf_name) {

View File

@@ -27,6 +27,7 @@ namespace db {
class snapshot_ctl : public peering_sharded_service<snapshot_ctl> {
public:
using skip_flush = bool_class<class skip_flush_tag>;
using allow_view_snapshots = bool_class<class allow_view_snapsots_tag>;
struct snapshot_details {
int64_t live;
@@ -64,7 +65,7 @@ public:
* @param tables a vector of tables names to snapshot
* @param tag the tag given to the snapshot; may not be null or empty
*/
future<> take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no);
future<> take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no, allow_view_snapshots av = allow_view_snapshots::no);
/**
* Takes the snapshot of a specific column family. A snapshot name must be specified.
@@ -73,7 +74,7 @@ public:
* @param columnFamilyName the column family to snapshot
* @param tag the tag given to the snapshot; may not be null or empty
*/
future<> take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf = skip_flush::no);
future<> take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf = skip_flush::no, allow_view_snapshots av = allow_view_snapshots::no);
/**
* Remove the snapshot with the given name from the given keyspaces.
@@ -99,7 +100,7 @@ private:
std::result_of_t<Func()> run_snapshot_list_operation(Func&&);
future<> do_take_snapshot(sstring tag, std::vector<sstring> keyspace_names, skip_flush sf = skip_flush::no);
future<> do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no);
future<> do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no, allow_view_snapshots av = allow_view_snapshots::no);
};
}

View File

@@ -2482,10 +2482,14 @@ class db_config_table final : public streaming_virtual_table {
for (auto& c_ref : cfg.values()) {
auto& c = c_ref.get();
if (c.name() == name) {
if (c.set_value(value, utils::config_file::config_source::CQL)) {
return cfg.broadcast_to_all_shards();
} else {
return make_exception_future<>(virtual_table_update_exception("option is not live-updateable"));
try {
if (c.set_value(value, utils::config_file::config_source::CQL)) {
return cfg.broadcast_to_all_shards();
} else {
return make_exception_future<>(virtual_table_update_exception("option is not live-updateable"));
}
} catch (boost::bad_lexical_cast&) {
return make_exception_future<>(virtual_table_update_exception("cannot parse option value"));
}
}
}
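
A minimal sketch of the failure mode the new try/catch above handles: `boost::lexical_cast` throws `bad_lexical_cast` when the text cannot be parsed as the target type.

```cpp
// Sketch only: shows the exception the virtual table now maps to a CQL error.
#include <boost/lexical_cast.hpp>
#include <iostream>

int main() {
    try {
        auto v = boost::lexical_cast<unsigned>("not-a-number");
        std::cout << v << '\n';
    } catch (const boost::bad_lexical_cast&) {
        std::cout << "cannot parse option value\n";
    }
}
```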
@@ -3068,11 +3072,11 @@ mutation system_keyspace::make_group0_history_state_id_mutation(
using namespace std::chrono;
assert(*gc_older_than >= gc_clock::duration{0});
auto ts_millis = duration_cast<milliseconds>(microseconds{ts});
auto gc_older_than_millis = duration_cast<milliseconds>(*gc_older_than);
assert(gc_older_than_millis < ts_millis);
auto ts_micros = microseconds{ts};
auto gc_older_than_micros = duration_cast<microseconds>(*gc_older_than);
assert(gc_older_than_micros < ts_micros);
auto tomb_upper_bound = utils::UUID_gen::min_time_UUID(ts_millis - gc_older_than_millis);
auto tomb_upper_bound = utils::UUID_gen::min_time_UUID(ts_micros - gc_older_than_micros);
// We want to delete all entries with IDs smaller than `tomb_upper_bound`
// but the deleted range is of the form (x, +inf) since the schema is reversed.
auto range = query::clustering_range::make_starting_with({
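
One way to see the truncation hazard the switch from milliseconds to microseconds above avoids (illustrative values, not Scylla's): `duration_cast` rounds toward zero, so subtracting after casting to milliseconds can differ from subtracting in microseconds.

```cpp
// Sketch only: illustrative values showing duration_cast truncation.
#include <chrono>
#include <iostream>

int main() {
    using namespace std::chrono;
    microseconds ts{10'000'400};
    microseconds gc_older_than{1'700};

    auto exact = ts - gc_older_than;  // 9'998'700 us
    auto rounded = duration_cast<milliseconds>(ts)
                 - duration_cast<milliseconds>(gc_older_than);  // 10'000 ms - 1 ms
    std::cout << exact.count() << " us vs "
              << duration_cast<microseconds>(rounded).count() << " us\n";
    // prints: 9998700 us vs 9999000 us
}
```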

View File

@@ -10,6 +10,7 @@
#include <seastar/core/seastar.hh>
#include <seastar/core/sstring.hh>
#include <seastar/core/reactor.hh>
#include <utility>
#include <optional>
#include "dht/token.hh"

View File

@@ -10,8 +10,6 @@
#include "log.hh"
#include "utils/latency.hh"
#include <seastar/core/when_all.hh>
static logging::logger mylog("row_locking");
row_locker::row_locker(schema_ptr s)
@@ -76,35 +74,32 @@ row_locker::lock_pk(const dht::decorated_key& pk, bool exclusive, db::timeout_cl
future<row_locker::lock_holder>
row_locker::lock_ck(const dht::decorated_key& pk, const clustering_key_prefix& cpk, bool exclusive, db::timeout_clock::time_point timeout, stats& stats) {
mylog.debug("taking shared lock on partition {}, and {} lock on row {} in it", pk, (exclusive ? "exclusive" : "shared"), cpk);
auto ck = cpk;
// Create a two-level lock entry for the partition if it doesn't exist already.
auto i = _two_level_locks.try_emplace(pk, this).first;
// The two-level lock entry we've just created is guaranteed to be kept alive as long as it's locked.
// Initiating read locking in the background below ensures that even if the two-level lock is currently
// write-locked, releasing the write-lock will synchronously engage any waiting
// locks and will keep the entry alive.
future<lock_type::holder> lock_partition = i->second._partition_lock.hold_read_lock(timeout);
auto j = i->second._row_locks.find(cpk);
if (j == i->second._row_locks.end()) {
// Not yet locked, need to create the lock. This makes a copy of cpk.
try {
j = i->second._row_locks.emplace(cpk, lock_type()).first;
} catch(...) {
// If this emplace() failed, e.g., out of memory, we fail. We
// could do nothing - the partition lock we already started
// taking will be unlocked automatically after being locked.
// But it's better form to wait for the work we started, and it
// will also allow us to remove the hash-table row we added.
return lock_partition.then([ex = std::current_exception()] (auto lock) {
// The lock is automatically released when "lock" goes out of scope.
// TODO: unlock (lock = {}) now, search for the partition in the
// hash table (we know it's still there, because we held the lock until
// now) and remove the unused lock from the hash table if still unused.
return make_exception_future<row_locker::lock_holder>(std::current_exception());
});
}
}
single_lock_stats &single_lock_stats = exclusive ? stats.exclusive_row : stats.shared_row;
single_lock_stats.operations_currently_waiting_for_lock++;
utils::latency_counter waiting_latency;
waiting_latency.start();
future<lock_type::holder> lock_row = exclusive ? j->second.hold_write_lock(timeout) : j->second.hold_read_lock(timeout);
return when_all_succeed(std::move(lock_partition), std::move(lock_row))
.then_unpack([this, pk = &i->first, cpk = &j->first, exclusive, &single_lock_stats, waiting_latency = std::move(waiting_latency)] (auto lock1, auto lock2) mutable {
return lock_partition.then([this, pk = &i->first, row_locks = &i->second._row_locks, ck = std::move(ck), exclusive, &single_lock_stats, waiting_latency = std::move(waiting_latency), timeout] (auto lock1) mutable {
auto j = row_locks->find(ck);
if (j == row_locks->end()) {
// Not yet locked, need to create the lock.
j = row_locks->emplace(std::move(ck), lock_type()).first;
}
auto* cpk = &j->first;
auto& row_lock = j->second;
// Like the two-level lock entry above, the row_lock entry we've just created
// is guaranteed to be kept alive as long as it's locked.
// Initiating read/write locking in the background below ensures that.
auto lock_row = exclusive ? row_lock.hold_write_lock(timeout) : row_lock.hold_read_lock(timeout);
return lock_row.then([this, pk, cpk, exclusive, &single_lock_stats, waiting_latency = std::move(waiting_latency), lock1 = std::move(lock1)] (auto lock2) mutable {
// FIXME: indentation
lock1.release();
lock2.release();
waiting_latency.stop();
@@ -112,6 +107,7 @@ row_locker::lock_ck(const dht::decorated_key& pk, const clustering_key_prefix& c
single_lock_stats.lock_acquisitions++;
single_lock_stats.operations_currently_waiting_for_lock--;
return lock_holder(this, pk, cpk, exclusive);
});
});
}

View File

@@ -121,6 +121,9 @@ const column_definition* view_info::view_column(const column_definition& base_de
void view_info::set_base_info(db::view::base_info_ptr base_info) {
_base_info = std::move(base_info);
// Forget the cached objects which may refer to the base schema.
_select_statement = nullptr;
_partition_slice = std::nullopt;
}
// A constructor for a base info that can facilitate reads and writes from the materialized view.
@@ -863,13 +866,18 @@ void view_updates::generate_update(
bool same_row = true;
for (auto col_id : col_ids) {
auto* after = update.cells().find_cell(col_id);
// Note: multi-cell columns can't be part of the primary key.
auto& cdef = _base->regular_column_at(col_id);
if (existing) {
auto* before = existing->cells().find_cell(col_id);
// Note that this cell is necessarily atomic, because col_ids are
// view key columns, and keys must be atomic.
if (before && before->as_atomic_cell(cdef).is_live()) {
if (after && after->as_atomic_cell(cdef).is_live()) {
auto cmp = compare_atomic_cell_for_merge(before->as_atomic_cell(cdef), after->as_atomic_cell(cdef));
// We need to compare just the values of the keys, not
// metadata like the timestamp. This is because below,
// if the old and new view rows have the same key, we need
// to be sure to reach the update_entry() case.
auto cmp = compare_unsigned(before->as_atomic_cell(cdef).value(), after->as_atomic_cell(cdef).value());
if (cmp != 0) {
same_row = false;
}
@@ -889,7 +897,13 @@ void view_updates::generate_update(
if (same_row) {
update_entry(base_key, update, *existing, now);
} else {
replace_entry(base_key, update, *existing, now);
// This code doesn't work if the old and new view rows have the
// same key, because if they do we get both data and tombstone
// for the same timestamp (now) and the tombstone wins. This
// is why we need the "same_row" case above - it's not just a
// performance optimization.
delete_old_entry(base_key, *existing, update, now);
create_entry(base_key, update, now);
}
} else {
delete_old_entry(base_key, *existing, update, now);
@@ -933,8 +947,12 @@ future<stop_iteration> view_update_builder::stop() const {
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
future<utils::chunked_vector<frozen_mutation_and_schema>> view_update_builder::build_some() {
future<std::optional<utils::chunked_vector<frozen_mutation_and_schema>>> view_update_builder::build_some() {
return advance_all().then([this] (stop_iteration ignored) {
if (!_update && !_existing) {
// Tell the caller there is no more data to build.
return make_ready_future<std::optional<utils::chunked_vector<frozen_mutation_and_schema>>>(std::nullopt);
}
bool do_advance_updates = false;
bool do_advance_existings = false;
if (_update && _update->is_partition_start()) {
@@ -946,22 +964,23 @@ future<utils::chunked_vector<frozen_mutation_and_schema>> view_update_builder::b
_existing_tombstone_tracker.set_partition_tombstone(_existing->as_partition_start().partition_tombstone());
do_advance_existings = true;
}
future<stop_iteration> f = make_ready_future<stop_iteration>(stop_iteration::no);
if (do_advance_updates) {
return do_advance_existings ? advance_all() : advance_updates();
f = do_advance_existings ? advance_all() : advance_updates();
} else if (do_advance_existings) {
return advance_existings();
f = advance_existings();
}
return make_ready_future<stop_iteration>(stop_iteration::no);
}).then([this] (stop_iteration ignored) {
return repeat([this] {
return this->on_results();
return std::move(f).then([this] (stop_iteration ignored) {
return repeat([this] {
return this->on_results();
});
}).then([this] {
utils::chunked_vector<frozen_mutation_and_schema> mutations;
for (auto& update : _view_updates) {
update.move_to(mutations);
}
return std::make_optional(mutations);
});
}).then([this] {
utils::chunked_vector<frozen_mutation_and_schema> mutations;
for (auto& update : _view_updates) {
update.move_to(mutations);
}
return mutations;
});
}
@@ -2035,15 +2054,21 @@ public:
// Called in the context of a seastar::thread.
void view_builder::execute(build_step& step, exponential_backoff_retry r) {
gc_clock::time_point now = gc_clock::now();
auto consumer = compact_for_query<emit_only_live_rows::yes, view_builder::consumer>(
auto compaction_state = make_lw_shared<compact_for_query_state<emit_only_live_rows::yes>>(
*step.reader.schema(),
now,
step.pslice,
batch_size,
query::max_partitions,
view_builder::consumer{*this, step, now});
consumer.consume_new_partition(step.current_key); // Initialize the state in case we're resuming a partition
query::max_partitions);
auto consumer = compact_for_query<emit_only_live_rows::yes, view_builder::consumer>(compaction_state, view_builder::consumer{*this, step, now});
auto built = step.reader.consume_in_thread(std::move(consumer));
if (auto ds = std::move(*compaction_state).detach_state()) {
auto& range_tombstones = std::get<std::deque<range_tombstone>>(ds->range_tombstones);
for (auto& rt : range_tombstones) {
step.reader.unpop_mutation_fragment(mutation_fragment(*step.reader.schema(), step.reader.permit(), std::move(rt)));
}
step.reader.unpop_mutation_fragment(mutation_fragment(*step.reader.schema(), step.reader.permit(), std::move(ds->partition_start)));
}
_as.check();
@@ -2125,24 +2150,28 @@ update_backlog node_update_backlog::add_fetch(unsigned shard, update_backlog bac
return std::max(backlog, _max.load(std::memory_order_relaxed));
}
future<bool> check_view_build_ongoing(db::system_distributed_keyspace& sys_dist_ks, const sstring& ks_name, const sstring& cf_name) {
return sys_dist_ks.view_status(ks_name, cf_name).then([] (std::unordered_map<utils::UUID, sstring>&& view_statuses) {
return boost::algorithm::any_of(view_statuses | boost::adaptors::map_values, [] (const sstring& view_status) {
return view_status == "STARTED";
future<bool> check_view_build_ongoing(db::system_distributed_keyspace& sys_dist_ks, const locator::token_metadata& tm, const sstring& ks_name,
const sstring& cf_name) {
using view_statuses_type = std::unordered_map<utils::UUID, sstring>;
return sys_dist_ks.view_status(ks_name, cf_name).then([&tm] (view_statuses_type&& view_statuses) {
return boost::algorithm::any_of(view_statuses, [&tm] (const view_statuses_type::value_type& view_status) {
// Only consider status of known hosts.
return view_status.second == "STARTED" && tm.get_endpoint_for_host_id(view_status.first);
});
});
}
future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const replica::table& t, streaming::stream_reason reason) {
future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const locator::token_metadata& tm, const replica::table& t,
streaming::stream_reason reason) {
if (is_internal_keyspace(t.schema()->ks_name())) {
return make_ready_future<bool>(false);
}
if (reason == streaming::stream_reason::repair && !t.views().empty()) {
return make_ready_future<bool>(true);
}
return do_with(t.views(), [&sys_dist_ks] (auto& views) {
return do_with(t.views(), [&sys_dist_ks, &tm] (auto& views) {
return map_reduce(views,
[&sys_dist_ks] (const view_ptr& view) { return check_view_build_ongoing(sys_dist_ks, view->ks_name(), view->cf_name()); },
[&sys_dist_ks, &tm] (const view_ptr& view) { return check_view_build_ongoing(sys_dist_ks, tm, view->ks_name(), view->cf_name()); },
false,
std::logical_or<bool>());
});

View File

@@ -154,10 +154,7 @@ private:
void delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now);
void do_delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now);
void update_entry(const partition_key& base_key, const clustering_row& update, const clustering_row& existing, gc_clock::time_point now);
void replace_entry(const partition_key& base_key, const clustering_row& update, const clustering_row& existing, gc_clock::time_point now) {
create_entry(base_key, update, now);
delete_old_entry(base_key, existing, update, now);
}
void update_entry_for_computed_column(const partition_key& base_key, const clustering_row& update, const std::optional<clustering_row>& existing, gc_clock::time_point now);
};
class view_update_builder {
@@ -188,7 +185,15 @@ public:
}
view_update_builder(view_update_builder&& other) noexcept = default;
future<utils::chunked_vector<frozen_mutation_and_schema>> build_some();
// build_some() works on batches of 100 (max_rows_for_view_updates)
// updated rows, but can_skip_view_updates() can decide that some of
// these rows do not affect the view, and as a result build_some() can return
// fewer than 100 rows - in extreme cases even zero (see issue #12297).
// So we can't use an empty returned vector to signify that the view
// update building is done - and we wrap the return value in an
// std::optional, which is disengaged when the iteration is done.
future<std::optional<utils::chunked_vector<frozen_mutation_and_schema>>> build_some();
future<> close() noexcept;
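
The comment above explains why the caller can no longer treat an empty batch as end-of-input. A minimal sketch of the resulting consumption loop, with a hypothetical stand-in for `build_some()`: a disengaged optional ends the loop, while an engaged-but-empty batch is simply skippable work.

```cpp
// Sketch only: build_some() here is a stand-in, not the real builder method.
#include <optional>
#include <vector>

struct update {};

std::optional<std::vector<update>> build_some(int& batches_left) {
    if (batches_left-- == 0) {
        return std::nullopt;  // iteration done
    }
    return std::vector<update>{};  // may legitimately be empty (issue #12297)
}

int main() {
    int batches_left = 3;
    while (auto batch = build_some(batches_left)) {
        for (auto& u : *batch) {
            (void)u;  // apply the view update
        }
    }
}
```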

View File

@@ -22,9 +22,13 @@ class system_distributed_keyspace;
}
namespace locator {
class token_metadata;
}
namespace db::view {
future<bool> check_view_build_ongoing(db::system_distributed_keyspace& sys_dist_ks, const sstring& ks_name, const sstring& cf_name);
future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const replica::table& t, streaming::stream_reason reason);
future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const locator::token_metadata& tm, const replica::table& t,
streaming::stream_reason reason);
}

View File

@@ -83,10 +83,10 @@ future<> view_update_generator::start() {
service::get_local_streaming_priority(),
nullptr,
::mutation_reader::forwarding::no);
auto close_sr = deferred_close(staging_sstable_reader);
inject_failure("view_update_generator_consume_staging_sstable");
auto result = staging_sstable_reader.consume_in_thread(view_updating_consumer(s, std::move(permit), *t, sstables, _as, staging_sstable_reader_handle));
staging_sstable_reader.close().get();
if (result == stop_iteration::yes) {
break;
}

View File

@@ -16,6 +16,7 @@
#include "db/view/row_locking.hh"
#include <seastar/core/abort_source.hh>
#include "mutation.hh"
#include <seastar/core/circular_buffer.hh>
class evictable_reader_handle;

View File

@@ -15,11 +15,18 @@
namespace dht {
// Note: Cassandra has a special case where for an empty key it returns
// minimum_token() instead of 0 (the naturally-calculated hash function for
// an empty string). Their thinking was that empty partition keys are not
// allowed anyway. However, they *are* allowed in materialized views, so the
// empty-key partition should get a real token, not an invalid token, so
// we dropped this special case. Since we don't support migrating sstables of
// materialized-views from Cassandra, this Cassandra-Scylla incompatibility
// will not cause problems in practice.
// Note that get_token(const schema& s, partition_key_view key) below must
// use exactly the same algorithm as this function.
token
murmur3_partitioner::get_token(bytes_view key) const {
if (key.empty()) {
return minimum_token();
}
std::array<uint64_t, 2> hash;
utils::murmur_hash::hash3_x64_128(key, 0, hash);
return get_token(hash[0]);

View File

@@ -42,7 +42,8 @@ if __name__ == '__main__':
if systemd_unit.available('systemd-coredump@.service'):
dropin = '''
[Service]
TimeoutStartSec=infinity
RuntimeMaxSec=infinity
TimeoutSec=infinity
'''[1:-1]
os.makedirs('/etc/systemd/system/systemd-coredump@.service.d', exist_ok=True)
with open('/etc/systemd/system/systemd-coredump@.service.d/timeout.conf', 'w') as f:

View File

@@ -16,7 +16,7 @@ import stat
import distro
from pathlib import Path
from scylla_util import *
from subprocess import run
from subprocess import run, SubprocessError
if __name__ == '__main__':
if os.getuid() > 0:
@@ -137,7 +137,9 @@ if __name__ == '__main__':
# stalling. The minimum block size for crc enabled filesystems is 1024,
# and it also cannot be smaller than the sector size.
block_size = max(1024, sector_size)
run('udevadm settle', shell=True, check=True)
run(f'mkfs.xfs -b size={block_size} {fsdev} -f -K', shell=True, check=True)
run('udevadm settle', shell=True, check=True)
if is_debian_variant():
confpath = '/etc/mdadm/mdadm.conf'
@@ -153,6 +155,11 @@ if __name__ == '__main__':
os.makedirs(mount_at, exist_ok=True)
uuid = run(f'blkid -s UUID -o value {fsdev}', shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
if not uuid:
raise Exception(f'Failed to get UUID of {fsdev}')
uuidpath = f'/dev/disk/by-uuid/{uuid}'
after = 'local-fs.target'
wants = ''
if raid and args.raid_level != '0':
@@ -169,7 +176,7 @@ After={after}{wants}
DefaultDependencies=no
[Mount]
What=/dev/disk/by-uuid/{uuid}
What={uuidpath}
Where={mount_at}
Type=xfs
Options=noatime{opt_discard}
@@ -191,8 +198,16 @@ WantedBy=multi-user.target
systemd_unit.reload()
if args.raid_level != '0':
md_service.start()
mount = systemd_unit(mntunit_bn)
mount.start()
try:
mount = systemd_unit(mntunit_bn)
mount.start()
except SubprocessError as e:
if not os.path.exists(uuidpath):
print(f'\nERROR: {uuidpath} is not found\n')
elif not stat.S_ISBLK(os.stat(uuidpath).st_mode):
print(f'\nERROR: {uuidpath} is not block device\n')
raise e
if args.enable_on_nextboot:
mount.enable()
uid = pwd.getpwnam('scylla').pw_uid

View File

@@ -214,7 +214,7 @@ if __name__ == '__main__':
help='skip raid setup')
parser.add_argument('--raid-level-5', action='store_true', default=False,
help='use RAID5 for RAID volume')
parser.add_argument('--online-discard', default=True,
parser.add_argument('--online-discard', default=1, choices=[0, 1], type=int,
help='Configure XFS to discard unused blocks as soon as files are deleted')
parser.add_argument('--nic',
help='specify NIC')
@@ -458,7 +458,7 @@ if __name__ == '__main__':
args.no_raid_setup = not raid_setup
if raid_setup:
level = '5' if raid_level_5 else '0'
run_setup_script('RAID', f'scylla_raid_setup --disks {disks} --enable-on-nextboot --raid-level={level} --online-discard={int(online_discard)}')
run_setup_script('RAID', f'scylla_raid_setup --disks {disks} --enable-on-nextboot --raid-level={level} --online-discard={online_discard}')
coredump_setup = interactive_ask_service('Do you want to enable coredumps?', 'Yes - sets up coredump to allow a post-mortem analysis of the Scylla state just prior to a crash. No - skips this step.', coredump_setup)
args.no_coredump_setup = not coredump_setup

View File

@@ -68,7 +68,12 @@ class ScyllaSetup:
def cqlshrc(self):
home = os.environ['HOME']
hostname = subprocess.check_output(['hostname', '-i']).decode('ascii').strip()
if self._rpcAddress:
hostname = self._rpcAddress
elif self._listenAddress:
hostname = self._listenAddress
else:
hostname = subprocess.check_output(['hostname', '-i']).decode('ascii').strip()
with open("%s/.cqlshrc" % home, "w") as cqlshrc:
cqlshrc.write("[connection]\nhostname = %s\n" % hostname)

View File

@@ -7,7 +7,7 @@ Group: Applications/Databases
License: AGPLv3
URL: http://www.scylladb.com/
Source0: %{reloc_pkg}
Requires: %{product}-server = %{version} %{product}-conf = %{version} %{product}-python3 = %{version} %{product}-kernel-conf = %{version} %{product}-jmx = %{version} %{product}-tools = %{version} %{product}-tools-core = %{version} %{product}-node-exporter = %{version}
Requires: %{product}-server = %{version}-%{release} %{product}-conf = %{version}-%{release} %{product}-python3 = %{version}-%{release} %{product}-kernel-conf = %{version}-%{release} %{product}-jmx = %{version}-%{release} %{product}-tools = %{version}-%{release} %{product}-tools-core = %{version}-%{release} %{product}-node-exporter = %{version}-%{release}
Obsoletes: scylla-server < 1.1
%global _debugsource_template %{nil}
@@ -54,7 +54,7 @@ Group: Applications/Databases
Summary: The Scylla database server
License: AGPLv3
URL: http://www.scylladb.com/
Requires: %{product}-conf = %{version} %{product}-python3 = %{version}
Requires: %{product}-conf = %{version}-%{release} %{product}-python3 = %{version}-%{release}
Conflicts: abrt
AutoReqProv: no

View File

@@ -32,7 +32,7 @@
logging::logger fmr_logger("flat_mutation_reader");
flat_mutation_reader& flat_mutation_reader::operator=(flat_mutation_reader&& o) noexcept {
if (_impl) {
if (_impl && _impl->is_close_required()) {
impl* ip = _impl.get();
// Abort to enforce calling close() before readers are closed
// to prevent leaks and potential use-after-free due to background
@@ -45,7 +45,7 @@ flat_mutation_reader& flat_mutation_reader::operator=(flat_mutation_reader&& o)
}
flat_mutation_reader::~flat_mutation_reader() {
if (_impl) {
if (_impl && _impl->is_close_required()) {
impl* ip = _impl.get();
// Abort to enforce calling close() before readers are closed
// to prevent leaks and potential use-after-free due to background
@@ -774,11 +774,14 @@ make_flat_mutation_reader_from_mutations_v2(schema_ptr s, reader_permit permit,
std::optional<mutation_consume_cookie> _cookie;
private:
void flush_tombstones(position_in_partition_view pos) {
void flush_tombstones(position_in_partition_view pos, bool emit_end = false) {
_rt_gen.flush(pos, [&] (range_tombstone_change rt) {
_current_rt = rt.tombstone();
push_mutation_fragment(*_schema, _permit, std::move(rt));
});
if (emit_end && _current_rt) {
push_mutation_fragment(*_schema, _permit, range_tombstone_change(pos, {}));
}
}
void maybe_emit_partition_start() {
if (_dk) {
@@ -815,10 +818,7 @@ make_flat_mutation_reader_from_mutations_v2(schema_ptr s, reader_permit permit,
return stop_iteration::yes;
}
maybe_emit_partition_start();
flush_tombstones(position_in_partition::after_all_clustered_rows());
if (_current_rt) {
push_mutation_fragment(*_schema, _permit, range_tombstone_change(position_in_partition::after_all_clustered_rows(), {}));
}
flush_tombstones(position_in_partition::after_all_clustered_rows(), true);
push_mutation_fragment(*_schema, _permit, partition_end{});
return stop_iteration::no;
}
@@ -1786,7 +1786,7 @@ void mutation_fragment_stream_validating_filter::on_end_of_stream() {
}
flat_mutation_reader_v2& flat_mutation_reader_v2::operator=(flat_mutation_reader_v2&& o) noexcept {
if (_impl) {
if (_impl && _impl->is_close_required()) {
impl* ip = _impl.get();
// Abort to enforce calling close() before readers are closed
// to prevent leaks and potential use-after-free due to background
@@ -1799,7 +1799,7 @@ flat_mutation_reader_v2& flat_mutation_reader_v2::operator=(flat_mutation_reader
}
flat_mutation_reader_v2::~flat_mutation_reader_v2() {
if (_impl) {
if (_impl && _impl->is_close_required()) {
impl* ip = _impl.get();
// Abort to enforce calling close() before readers are closed
// to prevent leaks and potential use-after-free due to background
@@ -1986,11 +1986,14 @@ flat_mutation_reader_v2 upgrade_to_v2(flat_mutation_reader r) {
tombstone _current_rt;
std::optional<position_range> _pr;
public:
void flush_tombstones(position_in_partition_view pos) {
void flush_tombstones(position_in_partition_view pos, bool emit_end = false) {
_rt_gen.flush(pos, [&] (range_tombstone_change rt) {
_current_rt = rt.tombstone();
push_mutation_fragment(*_schema, _permit, std::move(rt));
});
if (emit_end && _current_rt) {
push_mutation_fragment(*_schema, _permit, range_tombstone_change(pos, {}));
}
}
void consume(static_row mf) {
push_mutation_fragment(*_schema, _permit, std::move(mf));
@@ -2015,11 +2018,9 @@ flat_mutation_reader_v2 upgrade_to_v2(flat_mutation_reader r) {
push_mutation_fragment(*_schema, _permit, std::move(mf));
}
void consume(partition_end mf) {
flush_tombstones(position_in_partition::after_all_clustered_rows());
flush_tombstones(position_in_partition::after_all_clustered_rows(), true);
if (_current_rt) {
assert(!_pr);
push_mutation_fragment(*_schema, _permit, range_tombstone_change(
position_in_partition::after_all_clustered_rows(), {}));
}
push_mutation_fragment(*_schema, _permit, std::move(mf));
}
@@ -2042,10 +2043,7 @@ flat_mutation_reader_v2 upgrade_to_v2(flat_mutation_reader r) {
if (_reader.is_end_of_stream() && _reader.is_buffer_empty()) {
if (_pr) {
// If !_pr we should flush on partition_end
flush_tombstones(_pr->end());
if (_current_rt) {
push_mutation_fragment(*_schema, _permit, range_tombstone_change(_pr->end(), {}));
}
flush_tombstones(_pr->end(), true);
}
_end_of_stream = true;
}
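The `emit_end` parameter folds a repeated epilogue (close the still-open range tombstone with a null `range_tombstone_change`) into `flush_tombstones()` itself, removing the duplicated `if (_current_rt)` blocks at the call sites. A reduced sketch of that contract, with stand-in types:

```cpp
#include <functional>

struct position {};                  // stand-in for position_in_partition_view
struct tombstone { bool set = false; };
struct rt_change { position pos; tombstone tomb; };

struct flusher_sketch {
    tombstone current;               // mirrors _current_rt

    void flush(position pos, const std::function<void(rt_change)>& emit,
               bool emit_end = false) {
        // ... emit the pending range_tombstone_change fragments up to pos ...
        if (emit_end && current.set) {
            emit(rt_change{pos, tombstone{}});  // close the open tombstone
            current = tombstone{};
        }
    }
};
```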

View File

@@ -132,6 +132,7 @@ public:
private:
tracked_buffer _buffer;
size_t _buffer_size = 0;
bool _close_required = false;
protected:
size_t max_buffer_size_in_bytes = default_max_buffer_size_in_bytes();
bool _end_of_stream = false;
@@ -167,6 +168,8 @@ public:
bool is_end_of_stream() const { return _end_of_stream; }
bool is_buffer_empty() const { return _buffer.empty(); }
bool is_buffer_full() const { return _buffer_size >= max_buffer_size_in_bytes; }
bool is_close_required() const { return _close_required; }
void set_close_required() { _close_required = true; }
static constexpr size_t default_max_buffer_size_in_bytes() { return 8 * 1024; }
mutation_fragment pop_mutation_fragment() {
@@ -504,9 +507,15 @@ public:
//
// Can be used to skip over entire partitions if interleaved with
// `operator()()` calls.
future<> next_partition() { return _impl->next_partition(); }
future<> next_partition() {
_impl->set_close_required();
return _impl->next_partition();
}
future<> fill_buffer() { return _impl->fill_buffer(); }
future<> fill_buffer() {
_impl->set_close_required();
return _impl->fill_buffer();
}
// Changes the range of partitions to pr. The range can only be moved
// forwards. pr.begin() needs to be larger than pr.end() of the previous range.
@@ -515,6 +524,7 @@ public:
// pr needs to be valid until the reader is destroyed or fast_forward_to()
// is called again.
future<> fast_forward_to(const dht::partition_range& pr) {
_impl->set_close_required();
return _impl->fast_forward_to(pr);
}
// Skips to a later range of rows.
@@ -544,6 +554,7 @@ public:
// In particular one must first enter a partition by fetching a `partition_start`
// fragment before calling `fast_forward_to`.
future<> fast_forward_to(position_range cr) {
_impl->set_close_required();
return _impl->fast_forward_to(std::move(cr));
}
// Closes the reader.
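The flag added above makes the destructor's abort conditional: only readers that ran an operation which may leave background state must be explicitly closed. A condensed sketch of the same guard, assuming a synchronous `close()`:

```cpp
#include <cstdlib>

class guarded_resource {
    bool _close_required = false;
    bool _closed = false;
public:
    void fill_buffer() { _close_required = true; /* mirrors set_close_required() */ }
    void close() { _closed = true; /* release background work */ }
    ~guarded_resource() {
        if (_close_required && !_closed) {
            std::abort();  // enforce close() before destruction
        }
    }
};
```

The practical payoff is that a freshly constructed, never-used reader can be destroyed without `close()`, which the previous unconditional `if (_impl)` check did not allow.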

View File

@@ -164,6 +164,7 @@ public:
private:
tracked_buffer _buffer;
size_t _buffer_size = 0;
bool _close_required = false;
protected:
size_t max_buffer_size_in_bytes = default_max_buffer_size_in_bytes();
@@ -205,6 +206,8 @@ public:
bool is_end_of_stream() const { return _end_of_stream; }
bool is_buffer_empty() const { return _buffer.empty(); }
bool is_buffer_full() const { return _buffer_size >= max_buffer_size_in_bytes; }
bool is_close_required() const { return _close_required; }
void set_close_required() { _close_required = true; }
static constexpr size_t default_max_buffer_size_in_bytes() { return 8 * 1024; }
mutation_fragment_v2 pop_mutation_fragment() {
@@ -542,9 +545,15 @@ public:
//
// Can be used to skip over entire partitions if interleaved with
// `operator()()` calls.
future<> next_partition() { return _impl->next_partition(); }
future<> next_partition() {
_impl->set_close_required();
return _impl->next_partition();
}
future<> fill_buffer() { return _impl->fill_buffer(); }
future<> fill_buffer() {
_impl->set_close_required();
return _impl->fill_buffer();
}
// Changes the range of partitions to pr. The range can only be moved
// forwards. pr.begin() needs to be larger than pr.end() of the previous range.
@@ -553,6 +562,7 @@ public:
// pr needs to be valid until the reader is destroyed or fast_forward_to()
// is called again.
future<> fast_forward_to(const dht::partition_range& pr) {
_impl->set_close_required();
return _impl->fast_forward_to(pr);
}
// Skips to a later range of rows.
@@ -582,6 +592,7 @@ public:
// In particular one must first enter a partition by fetching a `partition_start`
// fragment before calling `fast_forward_to`.
future<> fast_forward_to(position_range cr) {
_impl->set_close_required();
return _impl->fast_forward_to(std::move(cr));
}
// Closes the reader.

View File

@@ -1012,10 +1012,10 @@ std::set<inet_address> gossiper::get_live_members() {
std::set<inet_address> gossiper::get_live_token_owners() {
std::set<inet_address> token_owners;
for (auto& member : get_live_members()) {
auto es = get_endpoint_state_for_endpoint_ptr(member);
if (es && !is_dead_state(*es) && get_token_metadata_ptr()->is_member(member)) {
token_owners.insert(member);
auto normal_token_owners = get_token_metadata_ptr()->get_all_endpoints();
for (auto& node: normal_token_owners) {
if (is_alive(node)) {
token_owners.insert(node);
}
}
return token_owners;
@@ -1023,10 +1023,10 @@ std::set<inet_address> gossiper::get_live_token_owners() {
std::set<inet_address> gossiper::get_unreachable_token_owners() {
std::set<inet_address> token_owners;
for (auto&& x : _unreachable_endpoints) {
auto& endpoint = x.first;
if (get_token_metadata_ptr()->is_member(endpoint)) {
token_owners.insert(endpoint);
auto normal_token_owners = get_token_metadata_ptr()->get_all_endpoints();
for (auto& node: normal_token_owners) {
if (!is_alive(node)) {
token_owners.insert(node);
}
}
return token_owners;

View File

@@ -215,22 +215,6 @@ effective_replication_map::get_primary_ranges_within_dc(inet_address ep) const {
});
}
future<std::unordered_multimap<inet_address, dht::token_range>>
abstract_replication_strategy::get_address_ranges(const token_metadata& tm) const {
std::unordered_multimap<inet_address, dht::token_range> ret;
for (auto& t : tm.sorted_tokens()) {
dht::token_range_vector r = tm.get_primary_ranges_for(t);
auto eps = co_await calculate_natural_endpoints(t, tm);
rslogger.debug("token={}, primary_range={}, address={}", t, r, eps);
for (auto ep : eps) {
for (auto&& rng : r) {
ret.emplace(ep, rng);
}
}
}
co_return ret;
}
future<std::unordered_multimap<inet_address, dht::token_range>>
abstract_replication_strategy::get_address_ranges(const token_metadata& tm, inet_address endpoint) const {
std::unordered_multimap<inet_address, dht::token_range> ret;

View File

@@ -112,7 +112,6 @@ public:
future<dht::token_range_vector> get_ranges(inet_address ep, token_metadata_ptr tmptr) const;
public:
future<std::unordered_multimap<inet_address, dht::token_range>> get_address_ranges(const token_metadata& tm) const;
future<std::unordered_multimap<inet_address, dht::token_range>> get_address_ranges(const token_metadata& tm, inet_address endpoint) const;
// Caller must ensure that token_metadata will not change throughout the call.

View File

@@ -15,6 +15,7 @@
#include <seastar/core/coroutine.hh>
#include <seastar/core/seastar.hh>
#include <seastar/http/response_parser.hh>
#include <seastar/http/reply.hh>
#include <seastar/net/api.hh>
#include <seastar/net/dns.hh>
@@ -47,7 +48,8 @@ future<> azure_snitch::load_config() {
logger().info("AzureSnitch using region: {}, zone: {}.", azure_region, azure_zone);
_my_rack = azure_zone;
// Zoneless regions return empty zone
_my_rack = (azure_zone != "" ? azure_zone : azure_region);
_my_dc = azure_region;
co_return co_await _my_distributed->invoke_on_all([this] (snitch_ptr& local_s) {
@@ -90,6 +92,10 @@ future<sstring> azure_snitch::azure_api_call(sstring path) {
// Read HTTP response header first
auto rsp = parser.get_parsed_response();
if (rsp->_status_code != static_cast<int>(httpd::reply::status_type::ok)) {
throw std::runtime_error(format("Error: HTTP response status {}", rsp->_status_code));
}
auto it = rsp->_headers.find("Content-Length");
if (it == rsp->_headers.end()) {
throw std::runtime_error("Error: HTTP response does not contain: Content-Length\n");

View File

@@ -1,5 +1,8 @@
#include "locator/ec2_snitch.hh"
#include <seastar/core/seastar.hh>
#include <seastar/core/sleep.hh>
#include <seastar/core/do_with.hh>
#include <seastar/http/reply.hh>
#include <boost/algorithm/string/classification.hpp>
#include <boost/algorithm/string/split.hpp>
@@ -67,6 +70,30 @@ future<> ec2_snitch::start() {
}
future<sstring> ec2_snitch::aws_api_call(sstring addr, uint16_t port, sstring cmd) {
return do_with(int(0), [this, addr, port, cmd] (int& i) {
return repeat_until_value([this, addr, port, cmd, &i]() -> future<std::optional<sstring>> {
++i;
return aws_api_call_once(addr, port, cmd).then([] (auto res) {
return make_ready_future<std::optional<sstring>>(std::move(res));
}).handle_exception([&i] (auto ep) {
try {
std::rethrow_exception(ep);
} catch (const std::system_error &e) {
logger().error(e.what());
if (i >= AWS_API_CALL_RETRIES - 1) {
logger().error("Maximum number of retries exceeded");
throw e;
}
}
return sleep(AWS_API_CALL_RETRY_INTERVAL).then([] {
return make_ready_future<std::optional<sstring>>(std::nullopt);
});
});
});
});
}
future<sstring> ec2_snitch::aws_api_call_once(sstring addr, uint16_t port, sstring cmd) {
return connect(socket_address(inet_address{addr}, port))
.then([this, addr, cmd] (connected_socket fd) {
_sd = std::move(fd);
@@ -88,6 +115,9 @@ future<sstring> ec2_snitch::aws_api_call(sstring addr, uint16_t port, sstring cm
// Read HTTP response header first
auto _rsp = _parser.get_parsed_response();
if (_rsp->_status_code != static_cast<int>(httpd::reply::status_type::ok)) {
return make_exception_future<sstring>(std::runtime_error(format("Error: HTTP response status {}", _rsp->_status_code)));
}
auto it = _rsp->_headers.find("Content-Length");
if (it == _rsp->_headers.end()) {
return make_exception_future<sstring>("Error: HTTP response does not contain: Content-Length\n");
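The retry wrapper above is seastar's `repeat_until_value` plus `sleep`; in plain synchronous C++ the same bounded-retry policy looks as follows (constants mirror AWS_API_CALL_RETRIES / AWS_API_CALL_RETRY_INTERVAL from the accompanying header, the failing call is a stand-in):

```cpp
#include <chrono>
#include <string>
#include <system_error>
#include <thread>

constexpr int max_retries = 5;                            // AWS_API_CALL_RETRIES
constexpr auto retry_interval = std::chrono::seconds{1};  // shortened for the sketch

// Hypothetical fallible call standing in for aws_api_call_once(); here it
// fails a couple of times before succeeding.
std::string api_call_once() {
    static int failures = 2;
    if (failures-- > 0) {
        throw std::system_error(std::make_error_code(std::errc::timed_out));
    }
    return "us-east-1a";
}

std::string api_call_with_retries() {
    for (int attempt = 0; ; ++attempt) {
        try {
            return api_call_once();
        } catch (const std::system_error&) {
            if (attempt >= max_retries - 1) {
                throw;  // maximum number of retries exceeded
            }
        }
        std::this_thread::sleep_for(retry_interval);  // seastar::sleep() in the diff
    }
}
```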

View File

@@ -16,6 +16,8 @@ public:
static constexpr const char* ZONE_NAME_QUERY_REQ = "/latest/meta-data/placement/availability-zone";
static constexpr const char* AWS_QUERY_SERVER_ADDR = "169.254.169.254";
static constexpr uint16_t AWS_QUERY_SERVER_PORT = 80;
static constexpr int AWS_API_CALL_RETRIES = 5;
static constexpr auto AWS_API_CALL_RETRY_INTERVAL = std::chrono::seconds{5};
ec2_snitch(const sstring& fname = "", unsigned io_cpu_id = 0);
virtual future<> start() override;
@@ -32,5 +34,6 @@ private:
output_stream<char> _out;
http_response_parser _parser;
sstring _zone_req;
future<sstring> aws_api_call_once(sstring addr, uint16_t port, const sstring cmd);
};
} // namespace locator

View File

@@ -14,6 +14,7 @@
#include <seastar/net/dns.hh>
#include <seastar/core/seastar.hh>
#include "locator/gce_snitch.hh"
#include <seastar/http/reply.hh>
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/classification.hpp>
@@ -106,6 +107,10 @@ future<sstring> gce_snitch::gce_api_call(sstring addr, sstring cmd) {
// Read HTTP response header first
auto rsp = parser.get_parsed_response();
if (rsp->_status_code != static_cast<int>(httpd::reply::status_type::ok)) {
throw std::runtime_error(format("Error: HTTP response status {}", rsp->_status_code));
}
auto it = rsp->_headers.find("Content-Length");
if (it == rsp->_headers.end()) {
throw std::runtime_error("Error: HTTP response does not contain: Content-Length\n");

View File

@@ -786,13 +786,12 @@ void token_metadata_impl::calculate_pending_ranges_for_leaving(
const abstract_replication_strategy& strategy,
std::unordered_multimap<range<token>, inet_address>& new_pending_ranges,
mutable_token_metadata_ptr all_left_metadata) const {
std::unordered_multimap<inet_address, dht::token_range> address_ranges = strategy.get_address_ranges(unpimplified_this).get0();
// get all ranges that will be affected by leaving nodes
std::unordered_set<range<token>> affected_ranges;
for (auto endpoint : _leaving_endpoints) {
auto r = address_ranges.equal_range(endpoint);
for (auto x = r.first; x != r.second; x++) {
affected_ranges.emplace(x->second);
auto r = strategy.get_address_ranges(unpimplified_this, endpoint).get0();
for (const auto& x : r) {
affected_ranges.emplace(x.second);
}
}
// for each of those ranges, find what new nodes will be responsible for the range when
@@ -826,16 +825,14 @@ void token_metadata_impl::calculate_pending_ranges_for_replacing(
if (_replacing_endpoints.empty()) {
return;
}
auto address_ranges = strategy.get_address_ranges(unpimplified_this).get0();
for (const auto& node : _replacing_endpoints) {
auto existing_node = node.first;
auto replacing_node = node.second;
auto address_ranges = strategy.get_address_ranges(unpimplified_this, existing_node).get0();
for (const auto& x : address_ranges) {
seastar::thread::maybe_yield();
if (x.first == existing_node) {
tlogger.debug("Node {} replaces {} for range {}", replacing_node, existing_node, x.second);
new_pending_ranges.emplace(x.second, replacing_node);
}
tlogger.debug("Node {} replaces {} for range {}", replacing_node, existing_node, x.second);
new_pending_ranges.emplace(x.second, replacing_node);
}
}
}

main.cc
View File

@@ -383,6 +383,8 @@ static auto defer_verbose_shutdown(const char* what, Func&& func) {
break;
}
}
} catch (const storage_io_error& e) {
do_abort = false;
} catch (...) {
}
auto msg = fmt::format("Unexpected error shutting down {}: {}", what, ex);
@@ -425,6 +427,39 @@ static int scylla_main(int ac, char** av) {
exit(1);
}
// Even in an environment where initializing Scylla fails,
// "scylla --version" should still run without error.
// To make that possible, we parse and handle these options before
// initializing Scylla/Seastar classes.
bpo::options_description preinit_description("Scylla options");
bpo::variables_map preinit_vm;
preinit_description.add_options()
("version", bpo::bool_switch(), "print version number and exit")
("build-id", bpo::bool_switch(), "print build-id and exit")
("build-mode", bpo::bool_switch(), "print build mode and exit")
("list-tools", bpo::bool_switch(), "list included tools and exit");
auto preinit_parsed_opts = bpo::command_line_parser(ac, av).options(preinit_description).allow_unregistered().run();
bpo::store(preinit_parsed_opts, preinit_vm);
if (preinit_vm["version"].as<bool>()) {
fmt::print("{}\n", scylla_version());
return 0;
}
if (preinit_vm["build-id"].as<bool>()) {
fmt::print("{}\n", get_build_id());
return 0;
}
if (preinit_vm["build-mode"].as<bool>()) {
fmt::print("{}\n", scylla_build_mode());
return 0;
}
if (preinit_vm["list-tools"].as<bool>()) {
fmt::print(
"types - a command-line tool to examine values belonging to scylla types\n"
"sstable - a multifunctional command-line tool to examine the content of sstables\n"
);
return 0;
}
try {
runtime::init_uptime();
std::setvbuf(stdout, nullptr, _IOLBF, 1000);
@@ -479,26 +514,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
bpo::variables_map vm;
auto parsed_opts = bpo::command_line_parser(ac, av).options(app.get_options_description()).allow_unregistered().run();
bpo::store(parsed_opts, vm);
if (vm["version"].as<bool>()) {
fmt::print("{}\n", scylla_version());
return 0;
}
if (vm["build-id"].as<bool>()) {
fmt::print("{}\n", get_build_id());
return 0;
}
if (vm["build-mode"].as<bool>()) {
fmt::print("{}\n", scylla_build_mode());
return 0;
}
if (vm["list-tools"].as<bool>()) {
fmt::print(
"types - a command-line tool to examine values belonging to scylla types\n"
"sstable - a multifunctional command-line tool to examine the content of sstables\n"
);
return 0;
}
print_starting_message(ac, av, parsed_opts);
sharded<locator::shared_token_metadata> token_metadata;
@@ -574,6 +589,12 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
cfg->broadcast_to_all_shards().get();
// We pass this piece of config through a global as a temporary hack.
// See the comment at the definition of sstables::global_cache_index_pages.
smp::invoke_on_all([&cfg] {
sstables::global_cache_index_pages = cfg->cache_index_pages.operator utils::updateable_value<bool>();
}).get();
::sighup_handler sighup_handler(opts, *cfg);
auto stop_sighup_handler = defer_verbose_shutdown("sighup", [&] {
sighup_handler.stop().get();
@@ -1116,7 +1137,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
// ATTN -- sharded repair reference already sits on storage_service and if
// it calls repair.local() before this place it'll crash (now it doesn't do
// both)
supervisor::notify("starting messaging service");
supervisor::notify("starting repair service");
auto max_memory_repair = memory::stats().total_memory() * 0.1;
repair.start(std::ref(gossiper), std::ref(messaging), std::ref(db), std::ref(proxy), std::ref(bm), std::ref(sys_dist_ks), std::ref(view_update_generator), std::ref(mm), max_memory_repair).get();
auto stop_repair_service = defer_verbose_shutdown("repair service", [&repair] {
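The pre-init block added earlier in this file's diff relies on boost::program_options' `allow_unregistered()`, which defers unknown options instead of rejecting them, so the early parser can coexist with the full one. A trimmed-down sketch of such a pre-init parse (option set reduced to one; the version string is a placeholder):

```cpp
#include <boost/program_options.hpp>
#include <cstdio>

namespace bpo = boost::program_options;

int preinit(int ac, char** av) {
    bpo::options_description desc("early options");
    desc.add_options()
        ("version", bpo::bool_switch(), "print version number and exit");
    bpo::variables_map vm;
    auto parsed = bpo::command_line_parser(ac, av)
            .options(desc)
            .allow_unregistered()  // unknown options are deferred, not errors
            .run();
    bpo::store(parsed, vm);
    if (vm["version"].as<bool>()) {
        std::printf("1.0.0\n");  // placeholder version string
        return 0;
    }
    return -1;  // signal "continue with full startup"
}
```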

View File

@@ -15,6 +15,7 @@
#include "sstables/shared_sstable.hh"
#include <seastar/core/future.hh>
#include <seastar/core/io_priority_class.hh>
#include "reader_permit.hh"
class memtable;
class flat_mutation_reader;

View File

@@ -438,6 +438,8 @@ static constexpr unsigned do_get_rpc_client_idx(messaging_verb verb) {
// should not be blocked by any data requests.
case messaging_verb::GROUP0_PEER_EXCHANGE:
case messaging_verb::GROUP0_MODIFY_CONFIG:
// ATTN -- if moving GOSSIP_ verbs elsewhere, mind updating the tcp_nodelay
// setting in get_rpc_client(), which assumes gossiper verbs live in idx 0
return 0;
case messaging_verb::PREPARE_MESSAGE:
case messaging_verb::PREPARE_DONE_MESSAGE:
@@ -695,7 +697,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
}();
auto must_tcp_nodelay = [&] {
if (idx == 1) {
if (idx == 0) {
return true; // gossip
}
if (_cfg.tcp_nodelay == tcp_nodelay_what::local) {

View File

@@ -272,8 +272,8 @@ public:
future<> lookup_readers(db::timeout_clock::time_point timeout);
future<> save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, detached_compaction_state compaction_state,
std::optional<clustering_key_prefix> last_ckey);
future<> save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, std::optional<detached_compaction_state> compaction_state,
dht::decorated_key last_pkey, std::optional<clustering_key_prefix> last_ckey);
future<> stop();
};
@@ -580,19 +580,22 @@ future<> read_context::lookup_readers(db::timeout_clock::time_point timeout) {
});
}
future<> read_context::save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, detached_compaction_state compaction_state,
std::optional<clustering_key_prefix> last_ckey) {
future<> read_context::save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, std::optional<detached_compaction_state> compaction_state,
dht::decorated_key last_pkey, std::optional<clustering_key_prefix> last_ckey) {
if (_cmd.query_uuid == utils::UUID{}) {
return make_ready_future<>();
}
auto last_pkey = compaction_state.partition_start.key();
const auto cb_stats = dismantle_combined_buffer(std::move(unconsumed_buffer), last_pkey);
tracing::trace(_trace_state, "Dismantled combined buffer: {}", cb_stats);
const auto cs_stats = dismantle_compaction_state(std::move(compaction_state));
tracing::trace(_trace_state, "Dismantled compaction state: {}", cs_stats);
auto cs_stats = dismantle_buffer_stats{};
if (compaction_state) {
cs_stats = dismantle_compaction_state(std::move(*compaction_state));
tracing::trace(_trace_state, "Dismantled compaction state: {}", cs_stats);
} else {
tracing::trace(_trace_state, "No compaction state to dismantle, partition exhausted", cs_stats);
}
return do_with(std::move(last_pkey), std::move(last_ckey), [this] (const dht::decorated_key& last_pkey,
const std::optional<clustering_key_prefix>& last_ckey) {
@@ -754,7 +757,9 @@ future<typename ResultBuilder::result_type> do_query(
std::move(result_builder));
if (compaction_state->are_limits_reached() || result.is_short_read()) {
co_await ctx->save_readers(std::move(unconsumed_buffer), std::move(*compaction_state).detach_state(), std::move(last_ckey));
// Must be called before `detach_state()` consumes the compaction state.
auto last_pkey = *compaction_state->current_partition();
co_await ctx->save_readers(std::move(unconsumed_buffer), std::move(*compaction_state).detach_state(), std::move(last_pkey), std::move(last_ckey));
}
co_await ctx->stop();
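The ordering constraint in the comment above (read `current_partition()` before `detach_state()`) is the classic read-before-consume hazard with rvalue-qualified accessors. A minimal illustration (names hypothetical):

```cpp
#include <optional>
#include <string>
#include <utility>

struct state {
    std::optional<std::string> current_key;
    // Consuming accessor: callable only on an rvalue, leaves *this empty.
    std::optional<std::string> detach() && {
        return std::exchange(current_key, std::nullopt);
    }
};

void save(state s) {
    // Assumes a partition is in flight, i.e. current_key is engaged.
    auto last_key = *s.current_key;        // read *before* the state is consumed
    auto detached = std::move(s).detach();
    // Reading s.current_key from here on would see the cleared member.
    (void)last_key;
    (void)detached;
}
```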

View File

@@ -167,6 +167,9 @@ class compact_mutation_state {
std::unique_ptr<mutation_compactor_garbage_collector> _collector;
compaction_stats _stats;
// Remember if we requested to stop mid-partition.
stop_iteration _stop = stop_iteration::no;
private:
template <typename Consumer, typename GCConsumer>
requires CompactedFragmentsConsumer<Consumer> && CompactedFragmentsConsumer<GCConsumer>
@@ -304,6 +307,7 @@ public:
}
void consume_new_partition(const dht::decorated_key& dk) {
_stop = stop_iteration::no;
auto& pk = dk.key();
_dk = &dk;
_return_static_content_on_partition_with_no_rows =
@@ -370,9 +374,9 @@ public:
_static_row_live = is_live;
if (is_live || (!only_live() && !sr.empty())) {
partition_is_not_empty(consumer);
return consumer.consume(std::move(sr), current_tombstone, is_live);
_stop = consumer.consume(std::move(sr), current_tombstone, is_live);
}
return stop_iteration::no;
return _stop;
}
template <typename Consumer, typename GCConsumer>
@@ -424,22 +428,21 @@ public:
};
if (only_live() && is_live) {
auto stop = consume_row();
_stop = consume_row();
if (++_rows_in_current_partition == _current_partition_limit) {
return stop_iteration::yes;
_stop = stop_iteration::yes;
}
return stop;
return _stop;
} else if (!only_live()) {
auto stop = stop_iteration::no;
if (!cr.empty()) {
stop = consume_row();
_stop = consume_row();
}
if (!sstable_compaction() && is_live && ++_rows_in_current_partition == _current_partition_limit) {
return stop_iteration::yes;
_stop = stop_iteration::yes;
}
return stop;
return _stop;
}
return stop_iteration::no;
return _stop;
}
template <typename Consumer, typename GCConsumer>
@@ -448,7 +451,8 @@ public:
++_stats.range_tombstones;
_range_tombstones.apply(rt);
// FIXME: drop tombstone if it is fully covered by other range tombstones
return do_consume(std::move(rt), consumer, gc_consumer);
_stop = do_consume(std::move(rt), consumer, gc_consumer);
return _stop;
}
template <typename Consumer, typename GCConsumer>
@@ -459,9 +463,9 @@ public:
_rt_assembler.emplace();
}
if (auto rt_opt = _rt_assembler->consume(_schema, std::move(rtc))) {
return do_consume(std::move(*rt_opt), consumer, gc_consumer);
_stop = do_consume(std::move(*rt_opt), consumer, gc_consumer);
}
return stop_iteration::no;
return _stop;
}
template <typename Consumer, typename GCConsumer>
@@ -490,8 +494,16 @@ public:
_partition_limit -= _rows_in_current_partition > 0;
auto stop = consumer.consume_end_of_partition();
if (!sstable_compaction()) {
return _row_limit && _partition_limit && stop != stop_iteration::yes
stop = _row_limit && _partition_limit && stop != stop_iteration::yes
? stop_iteration::no : stop_iteration::yes;
// If we decided to stop earlier but now decide to continue, we are in
// effect skipping the rest of the partition. In this case do not leave
// `_stop` at `stop_iteration::yes`; reset it back to
// `stop_iteration::no`, as if we had exhausted the partition.
if (_stop && !stop) {
_stop = stop_iteration::no;
}
return stop;
}
}
return stop_iteration::no;
@@ -536,6 +548,7 @@ public:
_current_partition_limit = std::min(_row_limit, _partition_row_limit);
_query_time = query_time;
_stats = {};
_stop = stop_iteration::no;
noop_compacted_fragments_consumer nc;
@@ -562,16 +575,31 @@ public:
/// compactor will result in the new compactor being in the same state *this
/// is (given the same outside parameters of course). Practically this
/// allows the compaction state to be stored in the compacted reader.
detached_compaction_state detach_state() && {
/// If the currently compacted partition is exhausted, a disengaged optional
/// is returned -- in this case there is no state to detach.
std::optional<detached_compaction_state> detach_state() && {
// If we exhausted the partition, there is no need to detach-restore the
// compaction state.
// We exhausted the partition if `consume_partition_end()` was called
// without us requesting the consumption to stop (remembered in _stop)
// from one of the consume() overloads.
// The consume algorithm calls `consume_partition_end()` in two cases:
// * on a partition-end fragment
// * consume() requested to stop
// In the latter case, the partition is not exhausted. Even if the next
// fragment to process is a partition-end, it will not be consumed.
if (!_stop) {
return {};
}
partition_start ps(std::move(_last_dk), _range_tombstones.get_partition_tombstone());
if (_rt_assembler) {
if (_current_tombstone) {
return {std::move(ps), std::move(_last_static_row), range_tombstone_change(position_in_partition_view::after_key(_last_clustering_pos), _current_tombstone)};
return detached_compaction_state{std::move(ps), std::move(_last_static_row), range_tombstone_change(position_in_partition_view::after_key(_last_clustering_pos), _current_tombstone)};
} else {
return {std::move(ps), std::move(_last_static_row), std::optional<range_tombstone_change>{}};
return detached_compaction_state{std::move(ps), std::move(_last_static_row), std::optional<range_tombstone_change>{}};
}
}
return {std::move(ps), std::move(_last_static_row), std::move(_range_tombstones).range_tombstones()};
return detached_compaction_state{std::move(ps), std::move(_last_static_row), std::move(_range_tombstones).range_tombstones()};
}
const compaction_stats& stats() const { return _stats; }
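In essence, `detach_state()` now encodes the rule "state is only worth detaching if consumption stopped mid-partition". A boiled-down sketch of that rule (placeholder types):

```cpp
#include <optional>
#include <string>

struct compactor_sketch {
    bool stop_requested = false;   // mirrors _stop
    std::string partition_state;   // placeholder for detached_compaction_state

    std::optional<std::string> detach_state() && {
        if (!stop_requested) {
            return std::nullopt;   // partition exhausted: nothing to resume
        }
        return std::move(partition_state);
    }
};
```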

View File

@@ -826,6 +826,7 @@ public:
void apply(tombstone deleted_at) {
_deleted_at.apply(deleted_at);
maybe_shadow();
}
void apply(shadowable_tombstone deleted_at) {

View File

@@ -1240,7 +1240,10 @@ future<flat_mutation_reader> evictable_reader::resume_or_create_reader() {
if (auto reader_opt = try_resume()) {
co_return std::move(*reader_opt);
}
co_await _permit.maybe_wait_readmission();
// See evictable_reader_v2::resume_or_create_reader()
if (_permit.needs_readmission()) {
co_await _permit.wait_readmission();
}
co_return recreate_reader();
}
@@ -1581,11 +1584,7 @@ private:
tracing::global_trace_state_ptr _trace_state;
const mutation_reader::forwarding _fwd_mr;
reader_concurrency_semaphore::inactive_read_handle _irh;
bool _drop_partition_start = false;
bool _drop_static_row = false;
// Validate the partition key of the first emitted partition, set after the
// reader was recreated.
bool _validate_partition_key = false;
bool _reader_recreated = false; // set if reader was recreated since last operation
position_in_partition::tri_compare _tri_cmp;
std::optional<dht::decorated_key> _last_pkey;
@@ -1606,10 +1605,9 @@ private:
void adjust_partition_slice();
flat_mutation_reader_v2 recreate_reader();
future<flat_mutation_reader_v2> resume_or_create_reader();
void maybe_validate_partition_start(const flat_mutation_reader_v2::tracked_buffer& buffer);
void validate_partition_start(const partition_start& ps);
void validate_position_in_partition(position_in_partition_view pos) const;
bool should_drop_fragment(const mutation_fragment_v2& mf);
future<> do_fill_buffer();
void examine_first_fragments(mutation_fragment_v2_opt& mf1, mutation_fragment_v2_opt& mf2, mutation_fragment_v2_opt& mf3);
public:
evictable_reader_v2(
@@ -1725,9 +1723,6 @@ flat_mutation_reader_v2 evictable_reader_v2::recreate_reader() {
_range_override.reset();
_slice_override.reset();
_drop_partition_start = false;
_drop_static_row = false;
if (_last_pkey) {
bool partition_range_is_inclusive = true;
@@ -1736,11 +1731,8 @@ flat_mutation_reader_v2 evictable_reader_v2::recreate_reader() {
partition_range_is_inclusive = false;
break;
case partition_region::static_row:
_drop_partition_start = true;
break;
case partition_region::clustered:
_drop_partition_start = true;
_drop_static_row = true;
adjust_partition_slice();
slice = &*_slice_override;
break;
@@ -1763,7 +1755,7 @@ flat_mutation_reader_v2 evictable_reader_v2::recreate_reader() {
_range_override = dht::partition_range({dht::partition_range::bound(*_last_pkey, partition_range_is_inclusive)}, _pr->end());
range = &*_range_override;
_validate_partition_key = true;
_reader_recreated = true;
}
return _ms.make_reader_v2(
@@ -1784,45 +1776,48 @@ future<flat_mutation_reader_v2> evictable_reader_v2::resume_or_create_reader() {
if (auto reader_opt = try_resume()) {
co_return std::move(*reader_opt);
}
co_await _permit.maybe_wait_readmission();
// When we get here, either the reader is being created for the first
// time, or `recreate_reader()` is about to resume a saved reader. In the
// latter case there are two possibilities:
// * the reader is still alive (in inactive state)
// * the reader was evicted
// We check for eviction below with `needs_readmission()` and it is very
// important not to allow preemption between said check and
// `recreate_reader()`, otherwise the reader might be evicted between the
// check and `recreate_reader()` and the latter would recreate it without
// waiting for re-admission.
if (_permit.needs_readmission()) {
co_await _permit.wait_readmission();
}
co_return recreate_reader();
}
void evictable_reader_v2::maybe_validate_partition_start(const flat_mutation_reader_v2::tracked_buffer& buffer) {
if (!_validate_partition_key || buffer.empty()) {
return;
}
// If this is set we can assume the first fragment is a partition-start.
const auto& ps = buffer.front().as_partition_start();
void evictable_reader_v2::validate_partition_start(const partition_start& ps) {
const auto tri_cmp = dht::ring_position_comparator(*_schema);
// If we recreated the reader after fast-forwarding it we won't have
// _last_pkey set. In this case it is enough to check if the partition
// is in range.
if (_last_pkey) {
const auto cmp_res = tri_cmp(*_last_pkey, ps.key());
if (_drop_partition_start) { // we expect to continue from the same partition
if (_next_position_in_partition.region() != partition_region::partition_start) { // we expect to continue from the same partition
// We cannot assume the partition we stopped the read at is still alive
// when we recreate the reader. It might have been compacted away in the
// meanwhile, so allow for a larger partition too.
require(
cmp_res <= 0,
"{}(): validation failed, expected partition with key larger or equal to _last_pkey {} due to _drop_partition_start being set, but got {}",
"{}(): validation failed, expected partition with key larger or equal to _last_pkey {}, but got {}",
__FUNCTION__,
*_last_pkey,
ps.key());
// Reset drop flags and next pos if we are not continuing from the same partition
// Reset next pos if we are not continuing from the same partition
if (cmp_res < 0) {
// Close previous partition, we are not going to continue it.
push_mutation_fragment(*_schema, _permit, partition_end{});
_drop_partition_start = false;
_drop_static_row = false;
_next_position_in_partition = position_in_partition::for_partition_start();
}
} else { // should be a larger partition
require(
cmp_res < 0,
"{}(): validation failed, expected partition with key larger than _last_pkey {} due to _drop_partition_start being unset, but got {}",
"{}(): validation failed, expected partition with key larger than _last_pkey {}, but got {}",
__FUNCTION__,
*_last_pkey,
ps.key());
@@ -1836,8 +1831,6 @@ void evictable_reader_v2::maybe_validate_partition_start(const flat_mutation_rea
__FUNCTION__,
prange,
ps.key());
_validate_partition_key = false;
}
void evictable_reader_v2::validate_position_in_partition(position_in_partition_view pos) const {
@@ -1860,7 +1853,12 @@ void evictable_reader_v2::validate_position_in_partition(position_in_partition_v
const bool any_contains = std::any_of(ranges.begin(), ranges.end(), [this, &pos] (const query::clustering_range& cr) {
// TODO: somehow avoid this copy
auto range = position_range(cr);
return range.contains(*_schema, pos);
// We cannot use range.contains() because that treats range as a
// [a, b) range, meaning a range tombstone change with position
// after_key(b) will be considered outside of it. However, such range
// tombstone changes can be emitted when recreating the reader on a
// clustering range edge.
return _tri_cmp(range.start(), pos) <= 0 && _tri_cmp(pos, range.end()) <= 0;
});
require(
any_contains,
@@ -1871,42 +1869,40 @@ void evictable_reader_v2::validate_position_in_partition(position_in_partition_v
}
}
bool evictable_reader_v2::should_drop_fragment(const mutation_fragment_v2& mf) {
if (_drop_partition_start && mf.is_partition_start()) {
_drop_partition_start = false;
return true;
void evictable_reader_v2::examine_first_fragments(mutation_fragment_v2_opt& mf1, mutation_fragment_v2_opt& mf2, mutation_fragment_v2_opt& mf3) {
if (!mf1) {
return; // the reader is at EOS
}
// Unlike partition-start above, a partition is not guaranteed to have a
// static row fragment. So reset the flag regardless of whether we could
// drop one or not.
// We are guaranteed to get here only right after dropping a partition-start,
// so if we are not seeing a static row here, the partition doesn't have one.
if (_drop_static_row) {
_drop_static_row = false;
return mf.is_static_row();
}
return false;
}
future<> evictable_reader_v2::do_fill_buffer() {
if (!_drop_partition_start && !_drop_static_row) {
auto fill_buf_fut = _reader->fill_buffer();
if (_validate_partition_key) {
fill_buf_fut = fill_buf_fut.then([this] {
maybe_validate_partition_start(_reader->buffer());
});
}
return fill_buf_fut;
// If engaged, the first fragment is always a partition-start.
validate_partition_start(mf1->as_partition_start());
if (_tri_cmp(mf1->position(), _next_position_in_partition) < 0) {
mf1 = {}; // drop mf1
}
const auto continue_same_partition = _next_position_in_partition.region() != partition_region::partition_start;
// If we have a first fragment, we are guaranteed to have a second one -- if nothing else, a partition-end.
if (mf2->is_end_of_partition()) {
return; // no further fragments, nothing to do
}
// We want to validate the position of the first non-dropped fragment.
// If mf2 is a static row and we need to drop it, this will be mf3.
if (mf2->is_static_row() && _tri_cmp(mf2->position(), _next_position_in_partition) < 0) {
mf2 = {}; // drop mf2
} else {
if (continue_same_partition) {
validate_position_in_partition(mf2->position());
}
return;
}
if (mf3->is_end_of_partition()) {
return; // no further fragments, nothing to do
} else if (continue_same_partition) {
validate_position_in_partition(mf3->position());
}
return repeat([this] {
return _reader->fill_buffer().then([this] {
maybe_validate_partition_start(_reader->buffer());
while (!_reader->is_buffer_empty() && should_drop_fragment(_reader->peek_buffer())) {
_reader->pop_mutation_fragment();
}
return stop_iteration(_reader->is_buffer_full() || _reader->is_end_of_stream());
});
});
}
evictable_reader_v2::evictable_reader_v2(
@@ -1935,10 +1931,64 @@ future<> evictable_reader_v2::fill_buffer() {
co_return;
}
_reader = co_await resume_or_create_reader();
co_await do_fill_buffer();
if (_reader_recreated) {
// Recreating the reader breaks snapshot isolation and creates all sorts
// of complications around the continuity of range tombstone changes,
// e.g. a range tombstone started by the previous reader object
// might not exist anymore with the new reader object.
// To avoid complications we reset the tombstone state on each reader
// recreation by emitting a null tombstone change, if we read at least
// one clustering fragment from the partition.
if (_next_position_in_partition.region() == partition_region::clustered
&& _tri_cmp(_next_position_in_partition, position_in_partition::before_all_clustered_rows()) > 0) {
push_mutation_fragment(*_schema, _permit, range_tombstone_change{position_in_partition_view::before_key(_next_position_in_partition), {}});
}
auto mf1 = co_await (*_reader)();
auto mf2 = co_await (*_reader)();
auto mf3 = co_await (*_reader)();
examine_first_fragments(mf1, mf2, mf3);
if (mf3) {
_reader->unpop_mutation_fragment(std::move(*mf3));
}
if (mf2) {
_reader->unpop_mutation_fragment(std::move(*mf2));
}
if (mf1) {
_reader->unpop_mutation_fragment(std::move(*mf1));
}
_reader_recreated = false;
} else {
co_await _reader->fill_buffer();
}
_reader->move_buffer_content_to(*this);
// Ensure that each buffer represents forward progress. Only a concern when
// the last fragment in the buffer is a range tombstone change. In this case
// ensure that:
// * buffer().back().position() > _next_position_in_partition;
// * _reader.peek()->position() > buffer().back().position();
if (!is_buffer_empty() && buffer().back().is_range_tombstone_change()) {
auto* next_mf = co_await _reader->peek();
// First make sure we've made progress w.r.t. _next_position_in_partition.
// This loop becomes infinite when the next pos is a partition start.
// In that case progress is guaranteed anyway, so skip this loop entirely.
while (!_next_position_in_partition.is_partition_start() && next_mf && _tri_cmp(_next_position_in_partition, buffer().back().position()) <= 0) {
push_mutation_fragment(_reader->pop_mutation_fragment());
next_mf = co_await _reader->peek();
}
const auto last_pos = position_in_partition(buffer().back().position());
while (next_mf && _tri_cmp(last_pos, next_mf->position()) == 0) {
push_mutation_fragment(_reader->pop_mutation_fragment());
next_mf = co_await _reader->peek();
}
}
update_next_position();
_end_of_stream = _reader->is_end_of_stream() && _reader->is_buffer_empty();
_end_of_stream = _reader->is_end_of_stream();
maybe_pause(std::move(*_reader));
}
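The split of `maybe_wait_readmission()` into `needs_readmission()` plus `wait_readmission()` (shown in a later hunk of this series) gives the caller control over exactly where the suspension point is, which the no-preemption comment above depends on. A seastar-free sketch of the shape of that interface (std::future stands in for seastar futures):

```cpp
#include <future>

struct permit_sketch {
    bool evicted = false;
    bool needs_readmission() const { return evicted; }  // no suspension here
    std::future<void> wait_readmission() {
        std::promise<void> p;
        p.set_value();  // trivially ready in this sketch
        return p.get_future();
    }
};

void resume(permit_sketch& permit) {
    // The check and the follow-up action stay adjacent: nothing can run
    // between them unless we explicitly wait.
    if (permit.needs_readmission()) {
        permit.wait_readmission().get();
    }
    // recreate_reader() would go here, immediately after the wait
}
```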

View File

@@ -444,7 +444,7 @@ public:
// When throws, the cursor is invalidated and its position is not changed.
bool advance_to(position_in_partition_view lower_bound) {
maybe_advance_to(lower_bound);
return no_clustering_row_between(_schema, lower_bound, position());
return no_clustering_row_between_weak(_schema, lower_bound, position());
}
// Call only when valid.

View File

@@ -567,6 +567,20 @@ bool no_clustering_row_between(const schema& s, position_in_partition_view a, po
}
}
// Returns true if and only if there can't be any clustering_row with position >= a and < b.
// It is assumed that a <= b.
inline
bool no_clustering_row_between_weak(const schema& s, position_in_partition_view a, position_in_partition_view b) {
clustering_key_prefix::equality eq(s);
if (a.has_key() && b.has_key()) {
return eq(a.key(), b.key())
&& (a.get_bound_weight() == bound_weight::after_all_prefixed
|| b.get_bound_weight() != bound_weight::after_all_prefixed);
} else {
return !a.has_key() && !b.has_key();
}
}
// Includes all position_in_partition objects "p" for which: start <= p < end
// And only those.
class position_range {

View File

@@ -10,6 +10,7 @@
#include "seastarx.hh"
#include <seastar/core/future.hh>
#include <seastar/net/socket_defs.hh>
#include <vector>
// Abstraction for a server serving some kind of user-facing protocol.

View File

@@ -414,25 +414,6 @@ future<bool> querier_cache::evict_one() noexcept {
co_return false;
}
future<> querier_cache::evict_all_for_table(const utils::UUID& schema_id) noexcept {
for (auto ip : {&_data_querier_index, &_mutation_querier_index, &_shard_mutation_querier_index}) {
auto& idx = *ip;
for (auto it = idx.begin(); it != idx.end();) {
if (it->second->schema().id() == schema_id) {
auto reader_opt = it->second->permit().semaphore().unregister_inactive_read(querier_utils::get_inactive_read_handle(*it->second));
it = idx.erase(it);
--_stats.population;
if (reader_opt) {
co_await reader_opt->close();
}
} else {
++it;
}
}
}
co_return;
}
future<> querier_cache::stop() noexcept {
co_await _closing_gate.close();

View File

@@ -476,11 +476,6 @@ public:
/// is empty).
future<bool> evict_one() noexcept;
/// Evict all queriers that belong to a table.
///
/// Should be used when dropping a table.
future<> evict_all_for_table(const utils::UUID& schema_id) noexcept;
/// Close all queriers and wait on background work.
///
/// Should be used before destroying the querier_cache.

View File

@@ -92,14 +92,13 @@ void trim_clustering_row_ranges_to(const schema& s, clustering_row_ranges& range
}
void trim_clustering_row_ranges_to(const schema& s, clustering_row_ranges& ranges, const clustering_key& key, bool reversed) {
if (key.is_full(s)) {
if (key.is_full(s) || reversed) {
return trim_clustering_row_ranges_to(s, ranges,
reversed ? position_in_partition_view::before_key(key) : position_in_partition_view::after_key(key), reversed);
}
auto full_key = key;
clustering_key::make_full(s, full_key);
return trim_clustering_row_ranges_to(s, ranges,
reversed ? position_in_partition_view::after_key(full_key) : position_in_partition_view::before_key(full_key), reversed);
return trim_clustering_row_ranges_to(s, ranges, position_in_partition_view::before_key(full_key), reversed);
}

View File

@@ -68,22 +68,33 @@ public:
// for accumulated range tombstones.
// After this, only range_tombstones with positions >= upper_bound may be added,
// which guarantees that they won't affect the output of this flush.
//
// If upper_bound == position_in_partition::after_all_clustered_rows(),
// emits all remaining range_tombstone_changes.
// No range_tombstones may be added after this.
//
// FIXME: respect preemption
template<RangeTombstoneChangeConsumer C>
void flush(position_in_partition_view upper_bound, C consumer) {
position_in_partition::less_compare less(_schema);
std::optional<range_tombstone> prev;
void flush(const position_in_partition_view upper_bound, C consumer) {
if (_range_tombstones.empty()) {
_lower_bound = upper_bound;
return;
}
while (!_range_tombstones.empty() && less(_range_tombstones.begin()->end_position(), upper_bound)) {
position_in_partition::tri_compare cmp(_schema);
std::optional<range_tombstone> prev;
bool flush_all = cmp(upper_bound, position_in_partition::after_all_clustered_rows()) == 0;
while (!_range_tombstones.empty() && (flush_all || (cmp(_range_tombstones.begin()->end_position(), upper_bound) < 0))) {
auto rt = _range_tombstones.pop(_range_tombstones.begin());
if (prev && less(prev->end_position(), rt.position())) { // [1]
if (prev && (cmp(prev->end_position(), rt.position()) < 0)) { // [1]
// previous range tombstone not adjacent, emit gap.
consumer(range_tombstone_change(prev->end_position(), tombstone()));
}
// Check if start of rt was already emitted, emit if not.
if (!less(rt.position(), _lower_bound)) {
if (cmp(rt.position(), _lower_bound) >= 0) {
consumer(range_tombstone_change(rt.position(), rt.tomb));
}
@@ -95,15 +106,15 @@ public:
// It cannot get adjacent later because prev->end_position() < upper_bound,
// so nothing == prev->end_position() can be added after this invocation.
if (prev && (_range_tombstones.empty()
|| less(prev->end_position(), _range_tombstones.begin()->position()))) {
|| (cmp(prev->end_position(), _range_tombstones.begin()->position()) < 0))) {
consumer(range_tombstone_change(prev->end_position(), tombstone())); // [2]
}
// Emit the fragment for start bound of a range_tombstone which is overlapping with upper_bound,
// unless no such fragment or already emitted.
if (!_range_tombstones.empty()
&& less(_range_tombstones.begin()->position(), upper_bound)
&& (!less(_range_tombstones.begin()->position(), _lower_bound))) {
&& (cmp(_range_tombstones.begin()->position(), upper_bound) < 0)
&& (cmp(_range_tombstones.begin()->position(), _lower_bound) >= 0)) {
consumer(range_tombstone_change(
_range_tombstones.begin()->position(), _range_tombstones.begin()->tombstone().tomb));
}

View File

@@ -9,6 +9,7 @@
#include <boost/range/adaptor/reversed.hpp>
#include "range_tombstone_list.hh"
#include "utils/allocation_strategy.hh"
#include "utils/amortized_reserve.hh"
#include <seastar/util/variant_utils.hh>
range_tombstone_list::range_tombstone_list(const range_tombstone_list& x)
@@ -375,13 +376,13 @@ range_tombstone_list::reverter::insert(range_tombstones_type::iterator it, range
range_tombstone_list::range_tombstones_type::iterator
range_tombstone_list::reverter::erase(range_tombstones_type::iterator it) {
_ops.reserve(_ops.size() + 1);
amortized_reserve(_ops, _ops.size() + 1);
_ops.emplace_back(erase_undo_op(*it));
return _dst._tombstones.erase(it);
}
void range_tombstone_list::reverter::update(range_tombstones_type::iterator it, range_tombstone&& new_rt) {
_ops.reserve(_ops.size() + 1);
amortized_reserve(_ops, _ops.size() + 1);
swap(it->tombstone(), new_rt);
_ops.emplace_back(update_undo_op(std::move(new_rt), *it));
}
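`reserve(_ops.size() + 1)` before every append forces a reallocation per element once the initial capacity is exceeded, making N appends cost O(N^2) copying; an amortized reserve keeps growth geometric. A hedged sketch of what a helper like `amortized_reserve()` plausibly does (the real one lives in utils/amortized_reserve.hh, not shown here):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Grow capacity geometrically so that repeated single-element reserves keep
// amortized O(1) appends, instead of forcing a reallocation per element.
template <typename Vector>
void amortized_reserve_sketch(Vector& v, std::size_t needed) {
    if (needed <= v.capacity()) {
        return;  // reserve would be a no-op anyway
    }
    v.reserve(std::max(needed, v.capacity() * 2));
}
```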

View File

@@ -12,6 +12,7 @@
#include "range_tombstone.hh"
#include "query-request.hh"
#include "utils/preempt.hh"
#include "utils/chunked_vector.hh"
#include <iosfwd>
#include <variant>
@@ -106,7 +107,7 @@ class range_tombstone_list final {
class reverter {
private:
using op = std::variant<erase_undo_op, insert_undo_op, update_undo_op>;
std::vector<op> _ops;
utils::chunked_vector<op> _ops;
const schema& _s;
protected:
range_tombstone_list& _dst;

View File

@@ -294,10 +294,11 @@ public:
}
}
future<> maybe_wait_readmission() {
if (_state != reader_permit::state::evicted) {
return make_ready_future<>();
}
bool needs_readmission() const {
return _state == reader_permit::state::evicted;
}
future<> wait_readmission() {
return _semaphore.do_wait_admission(shared_from_this());
}
@@ -360,8 +361,16 @@ reader_concurrency_semaphore& reader_permit::semaphore() {
return _impl->semaphore();
}
future<> reader_permit::maybe_wait_readmission() {
return _impl->maybe_wait_readmission();
reader_permit::state reader_permit::get_state() const {
return _impl->get_state();
}
bool reader_permit::needs_readmission() const {
return _impl->needs_readmission();
}
future<> reader_permit::wait_readmission() {
return _impl->wait_readmission();
}
void reader_permit::consume(reader_resources res) {
@@ -661,11 +670,7 @@ reader_concurrency_semaphore::~reader_concurrency_semaphore() {
reader_concurrency_semaphore::inactive_read_handle reader_concurrency_semaphore::register_inactive_read(flat_mutation_reader_v2 reader) noexcept {
auto& permit_impl = *reader.permit()._impl;
permit_impl.on_register_as_inactive();
// Implies _inactive_reads.empty(), we don't queue new readers before
// evicting all inactive reads.
// Checking the _wait_list covers the count resources only, so check memory
// separately.
if (_wait_list.empty() && _resources.memory > 0) {
if (!should_evict_inactive_read()) {
try {
auto irp = std::make_unique<inactive_read>(std::move(reader));
auto& ir = *irp;
@@ -736,10 +741,24 @@ bool reader_concurrency_semaphore::try_evict_one_inactive_read(evict_reason reas
void reader_concurrency_semaphore::clear_inactive_reads() {
while (!_inactive_reads.empty()) {
auto& ir = _inactive_reads.front();
close_reader(std::move(ir.reader));
// Destroying the read unlinks it too.
std::unique_ptr<inactive_read> _(&*_inactive_reads.begin());
evict(_inactive_reads.front(), evict_reason::manual);
}
}
future<> reader_concurrency_semaphore::evict_inactive_reads_for_table(utils::UUID id) noexcept {
inactive_reads_type evicted_readers;
auto it = _inactive_reads.begin();
while (it != _inactive_reads.end()) {
auto& ir = *it;
++it;
if (ir.reader.schema()->id() == id) {
do_detach_inactive_reader(ir, evict_reason::manual);
evicted_readers.push_back(ir);
}
}
while (!evicted_readers.empty()) {
std::unique_ptr<inactive_read> irp(&evicted_readers.front());
co_await irp->reader.close();
}
}
@@ -765,11 +784,11 @@ future<> reader_concurrency_semaphore::stop() noexcept {
co_return;
}
flat_mutation_reader_v2 reader_concurrency_semaphore::detach_inactive_reader(inactive_read& ir, evict_reason reason) noexcept {
auto reader = std::move(ir.reader);
void reader_concurrency_semaphore::do_detach_inactive_reader(inactive_read& ir, evict_reason reason) noexcept {
ir.unlink();
ir.ttl_timer.cancel();
ir.detach();
reader.permit()._impl->on_evicted();
std::unique_ptr<inactive_read> irp(&ir);
ir.reader.permit()._impl->on_evicted();
try {
if (ir.notify_handler) {
ir.notify_handler(reason);
@@ -788,7 +807,12 @@ flat_mutation_reader_v2 reader_concurrency_semaphore::detach_inactive_reader(ina
break;
}
--_stats.inactive_reads;
return reader;
}
flat_mutation_reader_v2 reader_concurrency_semaphore::detach_inactive_reader(inactive_read& ir, evict_reason reason) noexcept {
std::unique_ptr<inactive_read> irp(&ir);
do_detach_inactive_reader(ir, reason);
return std::move(irp->reader);
}
void reader_concurrency_semaphore::evict(inactive_read& ir, evict_reason reason) noexcept {
@@ -836,35 +860,89 @@ future<> reader_concurrency_semaphore::enqueue_waiter(reader_permit permit, read
}
void reader_concurrency_semaphore::evict_readers_in_background() {
if (_evicting) {
return;
}
_evicting = true;
// Evict inactive readers in the background while the wait list isn't empty.
// This is safe since stop() closes _close_readers_gate.
(void)with_gate(_close_readers_gate, [this] {
return do_until([this] { return _wait_list.empty() || _inactive_reads.empty(); }, [this] {
return detach_inactive_reader(_inactive_reads.front(), evict_reason::permit).close();
return repeat([this] {
if (_inactive_reads.empty() || !should_evict_inactive_read()) {
_evicting = false;
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
return detach_inactive_reader(_inactive_reads.front(), evict_reason::permit).close().then([] {
return stop_iteration::no;
});
});
});
}
}
reader_concurrency_semaphore::admit_result
reader_concurrency_semaphore::can_admit_read(const reader_permit& permit) const noexcept {
if (!_ready_list.empty()) {
return {can_admit::no, reason::ready_list};
}
if (!all_used_permits_are_stalled()) {
return {can_admit::no, reason::used_permits};
}
if (!has_available_units(permit.base_resources())) {
auto reason = _resources.memory >= permit.base_resources().memory ? reason::memory_resources : reason::count_resources;
if (_inactive_reads.empty()) {
return {can_admit::no, reason};
} else {
return {can_admit::maybe, reason};
}
}
return {can_admit::yes, reason::all_ok};
}
bool reader_concurrency_semaphore::should_evict_inactive_read() const noexcept {
if (_resources.memory < 0 || _resources.count < 0) {
return true;
}
if (_wait_list.empty()) {
return false;
}
const auto r = can_admit_read(_wait_list.front().permit).why;
return r == reason::memory_resources || r == reason::count_resources;
}
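
Taken together, `can_admit_read()` and `should_evict_inactive_read()` implement a tri-state admission policy, with eviction triggered only by resource shortages. Before moving on to `do_wait_admission()` below, here is a condensed summary of the three outcomes (the `handle()` helper is hypothetical; the enums mirror the ones in the patch):

```cpp
// Condensed summary of the admission logic introduced above.
enum class can_admit { no, maybe, yes };
enum class reason { all_ok, ready_list, used_permits, memory_resources, count_resources };
struct admit_result { can_admit decision; reason why; };

void handle(admit_result r) {
    switch (r.decision) {
    case can_admit::yes:
        // Resources available, ready list empty, all used permits stalled:
        // admit immediately.
        break;
    case can_admit::maybe:
        // Not enough count/memory, but inactive reads exist: evicting them
        // may free enough resources, so queue and kick off eviction.
        break;
    case can_admit::no:
        // Blocked by the ready list, by unstalled used permits, or by a
        // resource shortage with nothing left to evict: just queue.
        break;
    }
}
```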
future<> reader_concurrency_semaphore::do_wait_admission(reader_permit permit, read_func func) {
if (!_execution_loop_future) {
_execution_loop_future.emplace(execution_loop());
}
if (!_wait_list.empty() || !_ready_list.empty()) {
return enqueue_waiter(std::move(permit), std::move(func));
}
if (!has_available_units(permit.base_resources())) {
static uint64_t stats::*stats_table[] = {
&stats::reads_admitted_immediately,
&stats::reads_queued_because_ready_list,
&stats::reads_queued_because_used_permits,
&stats::reads_queued_because_memory_resources,
&stats::reads_queued_because_count_resources
};
const auto [admit, why] = can_admit_read(permit);
++(_stats.*stats_table[static_cast<int>(why)]);
if (admit != can_admit::yes || !_wait_list.empty()) {
auto fut = enqueue_waiter(std::move(permit), std::move(func));
if (!_inactive_reads.empty()) {
if (admit == can_admit::yes && !_wait_list.empty()) {
// This is a contradiction: the semaphore could admit new reads, yet it has waiters.
// Normally, the semaphore admits waiters as soon as it can.
// So at any point in time, there should either be no waiters, or the semaphore
// shouldn't be able to admit new reads. Otherwise something went wrong.
maybe_dump_reader_permit_diagnostics(*this, _permit_list, "semaphore could admit new reads yet there are waiters");
maybe_admit_waiters();
} else if (admit == can_admit::maybe) {
++_stats.reads_queued_with_eviction;
evict_readers_in_background();
}
return fut;
}
if (!all_used_permits_are_stalled()) {
return enqueue_waiter(std::move(permit), std::move(func));
}
permit.on_admission();
++_stats.reads_admitted;
if (func) {
@@ -874,7 +952,8 @@ future<> reader_concurrency_semaphore::do_wait_admission(reader_permit permit, r
}
void reader_concurrency_semaphore::maybe_admit_waiters() noexcept {
while (!_wait_list.empty() && _ready_list.empty() && has_available_units(_wait_list.front().permit.base_resources()) && all_used_permits_are_stalled()) {
auto admit = can_admit::no;
while (!_wait_list.empty() && (admit = can_admit_read(_wait_list.front().permit).decision) == can_admit::yes) {
auto& x = _wait_list.front();
try {
x.permit.on_admission();
@@ -889,6 +968,10 @@ void reader_concurrency_semaphore::maybe_admit_waiters() noexcept {
}
_wait_list.pop_front();
}
if (admit == can_admit::maybe) {
// Evicting readers will trigger another call to `maybe_admit_waiters()` from `signal()`.
evict_readers_in_background();
}
}
void reader_concurrency_semaphore::on_permit_created(reader_permit::impl& permit) {
@@ -965,6 +1048,13 @@ future<> reader_concurrency_semaphore::with_ready_permit(reader_permit permit, r
return fut;
}
void reader_concurrency_semaphore::set_resources(resources r) {
auto delta = r - _initial_resources;
_initial_resources = r;
_resources += delta;
maybe_admit_waiters();
}
void reader_concurrency_semaphore::broken(std::exception_ptr ex) {
if (!ex) {
ex = std::make_exception_ptr(broken_semaphore{});

View File

@@ -74,6 +74,18 @@ public:
uint64_t reads_admitted = 0;
// Total number of reads enqueued to wait for admission.
uint64_t reads_enqueued = 0;
// Total number of reads admitted immediately, without queueing
uint64_t reads_admitted_immediately = 0;
// Total number of reads enqueued because ready_list wasn't empty
uint64_t reads_queued_because_ready_list = 0;
// Total number of reads enqueued because there are used but unblocked permits
uint64_t reads_queued_because_used_permits = 0;
// Total number of reads enqueued because there weren't enough memory resources
uint64_t reads_queued_because_memory_resources = 0;
// Total number of reads enqueued because there weren't enough count resources
uint64_t reads_queued_because_count_resources = 0;
// Total number of reads enqueued that may be admitted after evicting some inactive reads
uint64_t reads_queued_with_eviction = 0;
// Total number of permits created so far.
uint64_t total_permits = 0;
// Current number of permits.
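
The `reads_queued_because_*` counters above are bumped in `do_wait_admission()` through a `uint64_t stats::*` array indexed by the `reason` enum. A minimal self-contained sketch of that pointer-to-data-member technique (struct and enum reduced for illustration):

```cpp
#include <cstdint>
#include <cstdio>

struct stats {
    uint64_t reads_queued_because_memory_resources = 0;
    uint64_t reads_queued_because_count_resources = 0;
};

enum class reason { memory_resources = 0, count_resources };

int main() {
    // Array of pointers-to-member: entry order must match the enum order,
    // which is why the patch keeps the `reason` values starting at 0.
    static uint64_t stats::*table[] = {
        &stats::reads_queued_because_memory_resources,
        &stats::reads_queued_because_count_resources,
    };
    stats s;
    reason why = reason::count_resources;
    ++(s.*table[static_cast<int>(why)]); // bump the counter chosen at runtime
    std::printf("%lu\n", static_cast<unsigned long>(s.reads_queued_because_count_resources));
}
```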
@@ -169,7 +181,7 @@ public:
};
private:
const resources _initial_resources;
resources _initial_resources;
resources _resources;
expiring_fifo<entry, expiry_handler, db::timeout_clock> _wait_list;
@@ -181,11 +193,13 @@ private:
stats _stats;
permit_list_type _permit_list;
bool _stopped = false;
bool _evicting = false;
gate _close_readers_gate;
gate _permit_gate;
std::optional<future<>> _execution_loop_future;
private:
void do_detach_inactive_reader(inactive_read&, evict_reason reason) noexcept;
[[nodiscard]] flat_mutation_reader_v2 detach_inactive_reader(inactive_read&, evict_reason reason) noexcept;
void evict(inactive_read&, evict_reason reason) noexcept;
@@ -200,6 +214,19 @@ private:
future<> enqueue_waiter(reader_permit permit, read_func func);
void evict_readers_in_background();
future<> do_wait_admission(reader_permit permit, read_func func = {});
// Check whether the permit can be admitted or not.
// The wait list is not taken into consideration; checking it is the caller's
// responsibility.
// A return value of can_admit::maybe means admission might be possible if
// some of the inactive readers are evicted.
enum class can_admit { no, maybe, yes };
enum class reason { all_ok = 0, ready_list, used_permits, memory_resources, count_resources };
struct admit_result { can_admit decision; reason why; };
admit_result can_admit_read(const reader_permit& permit) const noexcept;
bool should_evict_inactive_read() const noexcept;
void maybe_admit_waiters() noexcept;
void on_permit_created(reader_permit::impl&);
@@ -301,6 +328,9 @@ public:
/// Clear all inactive reads.
void clear_inactive_reads();
/// Evict all inactive reads that belong to the table designated by the id.
future<> evict_inactive_reads_for_table(utils::UUID id) noexcept;
private:
// The following two functions are extension points for
// future inheriting classes that needs to run some stop
@@ -386,6 +416,12 @@ public:
/// optimal then just using \ref with_permit().
future<> with_ready_permit(reader_permit permit, read_func func);
/// Set the total resources of the semaphore to \p r.
///
/// After this call, \ref initial_resources() will reflect the new value.
/// Available resources will be adjusted by the delta.
void set_resources(resources r);
const resources initial_resources() const {
return _initial_resources;
}
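
`set_resources()` makes the semaphore's capacity adjustable at runtime: it records the new totals and shifts the currently available pool by the delta, so resources already held by admitted reads are unaffected. A hedged usage sketch (the values and the `sem` variable are made up; `resources` carries a {count, memory} pair):

```cpp
// Hypothetical runtime resize of an existing semaphore `sem`.
sem.set_resources({50, 2 * 1024 * 1024}); // now 50 concurrent reads, 2 MiB
// initial_resources() reflects the new totals; the available pool moved by
// the same delta, and maybe_admit_waiters() re-evaluates the wait queue
// against the new limits.
assert(sem.initial_resources().count == 50);
```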

View File

@@ -134,7 +134,12 @@ public:
reader_concurrency_semaphore& semaphore();
future<> maybe_wait_readmission();
state get_state() const;
bool needs_readmission() const;
// Call only when needs_readmission() returns true.
future<> wait_readmission();
void consume(reader_resources res);
@@ -182,6 +187,8 @@ public:
reader_resources resources() const { return _resources; }
};
std::ostream& operator<<(std::ostream& os, reader_permit::state s);
/// Mark a permit as used.
///
/// Conceptually, a permit is considered used, when at least one reader

View File

@@ -48,14 +48,42 @@
logging::logger rlogger("repair");
node_ops_info::node_ops_info(utils::UUID ops_uuid_, shared_ptr<abort_source> as_, std::list<gms::inet_address>&& ignore_nodes_) noexcept
: ops_uuid(ops_uuid_)
, as(std::move(as_))
, ignore_nodes(std::move(ignore_nodes_))
{}
void node_ops_info::check_abort() {
if (abort) {
if (as && as->abort_requested()) {
auto msg = format("Node operation with ops_uuid={} is aborted", ops_uuid);
rlogger.warn("{}", msg);
throw std::runtime_error(msg);
}
}
future<> node_ops_info::start() {
if (as) {
co_await _sas.start();
_abort_subscription = as->subscribe([this] () noexcept {
_abort_done = _sas.invoke_on_all([] (abort_source& as) noexcept {
as.request_abort();
});
});
}
}
future<> node_ops_info::stop() noexcept {
if (as) {
co_await std::exchange(_abort_done, make_ready_future<>());
co_await _sas.stop();
}
}
abort_source* node_ops_info::local_abort_source() {
return as ? &_sas.local() : nullptr;
}
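
The reworked `node_ops_info` bridges a single owner-side `abort_source` to one `abort_source` per shard: `start()` subscribes to the top-level source and, when it fires, fans the abort out with `invoke_on_all()`, while `local_abort_source()` hands each shard its own source. A reduced sketch of that fan-out pattern using Seastar primitives (the `abort_fanout` class is hypothetical; only the wiring is taken from the patch):

```cpp
#include <seastar/core/abort_source.hh>
#include <seastar/core/coroutine.hh>
#include <seastar/core/future.hh>
#include <seastar/core/sharded.hh>
#include <seastar/util/optimized_optional.hh>
#include <utility>

using namespace seastar;

// Hypothetical reduced version of the node_ops_info wiring: one top-level
// abort_source fans out to a per-shard sharded<abort_source>.
class abort_fanout {
    abort_source& _top;                 // owner-side source (shard 0)
    sharded<abort_source> _per_shard;   // one source per shard
    optimized_optional<abort_source::subscription> _sub;
    future<> _abort_done = make_ready_future<>();
public:
    explicit abort_fanout(abort_source& top) : _top(top) {}

    future<> start() {
        co_await _per_shard.start();
        // When the top-level source fires, request abort on every shard.
        _sub = _top.subscribe([this] () noexcept {
            _abort_done = _per_shard.invoke_on_all([] (abort_source& as) noexcept {
                as.request_abort();
            });
        });
    }

    future<> stop() noexcept {
        // Wait for any in-flight fan-out before tearing down the shards.
        co_await std::exchange(_abort_done, make_ready_future<>());
        co_await _per_shard.stop();
    }

    abort_source& local() { return _per_shard.local(); }
};
```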
node_ops_metrics::node_ops_metrics(tracker& tracker)
: _tracker(tracker)
{
@@ -436,16 +464,6 @@ void tracker::abort_all_repairs() {
rlogger.info0("Aborted {} repair job(s)", count);
}
void tracker::abort_repair_node_ops(utils::UUID ops_uuid) {
for (auto& x : _repairs) {
auto& ri = x.second;
if (ri->ops_uuid() && ri->ops_uuid().value() == ops_uuid) {
rlogger.info0("Aborted repair jobs for ops_uuid={}", ops_uuid);
ri->abort();
}
}
}
float tracker::report_progress(streaming::stream_reason reason) {
uint64_t nr_ranges_finished = 0;
uint64_t nr_ranges_total = 0;
@@ -534,7 +552,7 @@ repair_info::repair_info(repair_service& repair,
const std::vector<sstring>& hosts_,
const std::unordered_set<gms::inet_address>& ignore_nodes_,
streaming::stream_reason reason_,
std::optional<utils::UUID> ops_uuid,
abort_source* as,
bool hints_batchlog_flushed)
: rs(repair)
, db(repair.get_db())
@@ -556,8 +574,10 @@ repair_info::repair_info(repair_service& repair,
, reason(reason_)
, total_rf(db.local().find_keyspace(keyspace).get_effective_replication_map()->get_replication_factor())
, nr_ranges_total(ranges.size())
, _ops_uuid(std::move(ops_uuid))
, _hints_batchlog_flushed(std::move(hints_batchlog_flushed)) {
if (as != nullptr) {
_abort_subscription = as->subscribe([this] () noexcept { abort(); });
}
}
void repair_info::check_failed_ranges() {
@@ -575,7 +595,7 @@ void repair_info::check_failed_ranges() {
}
}
void repair_info::abort() {
void repair_info::abort() noexcept {
aborted = true;
}
@@ -1190,7 +1210,7 @@ int repair_service::do_repair_start(sstring keyspace, std::unordered_map<sstring
local_repair.get_metrics().repair_total_ranges_sum += ranges.size();
auto ri = make_lw_shared<repair_info>(local_repair,
std::move(keyspace), std::move(ranges), std::move(table_ids),
id, std::move(data_centers), std::move(hosts), std::move(ignore_nodes), streaming::stream_reason::repair, id.uuid, hints_batchlog_flushed);
id, std::move(data_centers), std::move(hosts), std::move(ignore_nodes), streaming::stream_reason::repair, nullptr, hints_batchlog_flushed);
return repair_ranges(ri);
});
repair_results.push_back(std::move(f));
@@ -1257,12 +1277,12 @@ future<> repair_service::sync_data_using_repair(
dht::token_range_vector ranges,
std::unordered_map<dht::token_range, repair_neighbors> neighbors,
streaming::stream_reason reason,
std::optional<utils::UUID> ops_uuid) {
shared_ptr<node_ops_info> ops_info) {
if (ranges.empty()) {
return make_ready_future<>();
}
return container().invoke_on(0, [keyspace = std::move(keyspace), ranges = std::move(ranges), neighbors = std::move(neighbors), reason, ops_uuid] (repair_service& local_repair) mutable {
return local_repair.do_sync_data_using_repair(std::move(keyspace), std::move(ranges), std::move(neighbors), reason, ops_uuid);
return container().invoke_on(0, [keyspace = std::move(keyspace), ranges = std::move(ranges), neighbors = std::move(neighbors), reason, ops_info] (repair_service& local_repair) mutable {
return local_repair.do_sync_data_using_repair(std::move(keyspace), std::move(ranges), std::move(neighbors), reason, ops_info);
});
}
@@ -1271,12 +1291,12 @@ future<> repair_service::do_sync_data_using_repair(
dht::token_range_vector ranges,
std::unordered_map<dht::token_range, repair_neighbors> neighbors,
streaming::stream_reason reason,
std::optional<utils::UUID> ops_uuid) {
shared_ptr<node_ops_info> ops_info) {
seastar::sharded<replica::database>& db = get_db();
repair_uniq_id id = repair_tracker().next_repair_command();
rlogger.info("repair id {} to sync data for keyspace={}, status=started", id, keyspace);
return repair_tracker().run(id, [this, id, &db, keyspace, ranges = std::move(ranges), neighbors = std::move(neighbors), reason, ops_uuid] () mutable {
return repair_tracker().run(id, [this, id, &db, keyspace, ranges = std::move(ranges), neighbors = std::move(neighbors), reason, ops_info] () mutable {
auto cfs = list_column_families(db.local(), keyspace);
if (cfs.empty()) {
rlogger.warn("repair id {} to sync data for keyspace={}, no table in this keyspace", id, keyspace);
@@ -1286,14 +1306,15 @@ future<> repair_service::do_sync_data_using_repair(
std::vector<future<>> repair_results;
repair_results.reserve(smp::count);
for (auto shard : boost::irange(unsigned(0), smp::count)) {
auto f = container().invoke_on(shard, [keyspace, table_ids, id, ranges, neighbors, reason, ops_uuid] (repair_service& local_repair) mutable {
auto f = container().invoke_on(shard, [keyspace, table_ids, id, ranges, neighbors, reason, ops_info] (repair_service& local_repair) mutable {
auto data_centers = std::vector<sstring>();
auto hosts = std::vector<sstring>();
auto ignore_nodes = std::unordered_set<gms::inet_address>();
bool hints_batchlog_flushed = false;
abort_source* asp = ops_info ? ops_info->local_abort_source() : nullptr;
auto ri = make_lw_shared<repair_info>(local_repair,
std::move(keyspace), std::move(ranges), std::move(table_ids),
id, std::move(data_centers), std::move(hosts), std::move(ignore_nodes), reason, ops_uuid, hints_batchlog_flushed);
id, std::move(data_centers), std::move(hosts), std::move(ignore_nodes), reason, asp, hints_batchlog_flushed);
ri->neighbors = std::move(neighbors);
return repair_ranges(ri);
});
@@ -1494,7 +1515,7 @@ future<> repair_service::bootstrap_with_repair(locator::token_metadata_ptr tmptr
}
}
auto nr_ranges = desired_ranges.size();
sync_data_using_repair(keyspace_name, std::move(desired_ranges), std::move(range_sources), reason, {}).get();
sync_data_using_repair(keyspace_name, std::move(desired_ranges), std::move(range_sources), reason, nullptr).get();
rlogger.info("bootstrap_with_repair: finished with keyspace={}, nr_ranges={}", keyspace_name, nr_ranges);
}
rlogger.info("bootstrap_with_repair: finished with keyspaces={}", keyspaces);
@@ -1690,8 +1711,7 @@ future<> repair_service::do_decommission_removenode_with_repair(locator::token_m
ranges.swap(ranges_for_removenode);
}
auto nr_ranges_synced = ranges.size();
std::optional<utils::UUID> opt_uuid = ops ? std::make_optional<utils::UUID>(ops->ops_uuid) : std::nullopt;
sync_data_using_repair(keyspace_name, std::move(ranges), std::move(range_sources), reason, opt_uuid).get();
sync_data_using_repair(keyspace_name, std::move(ranges), std::move(range_sources), reason, ops).get();
rlogger.info("{}: finished with keyspace={}, leaving_node={}, nr_ranges={}, nr_ranges_synced={}, nr_ranges_skipped={}",
op, keyspace_name, leaving_node, nr_ranges_total, nr_ranges_synced, nr_ranges_skipped);
}
@@ -1715,12 +1735,6 @@ future<> repair_service::removenode_with_repair(locator::token_metadata_ptr tmpt
});
}
future<> repair_service::abort_repair_node_ops(utils::UUID ops_uuid) {
return container().invoke_on_all([ops_uuid] (repair_service& rs) {
rs.repair_tracker().abort_repair_node_ops(ops_uuid);
});
}
future<> repair_service::do_rebuild_replace_with_repair(locator::token_metadata_ptr tmptr, sstring op, sstring source_dc, streaming::stream_reason reason, std::list<gms::inet_address> ignore_nodes) {
return seastar::async([this, tmptr = std::move(tmptr), source_dc = std::move(source_dc), op = std::move(op), reason, ignore_nodes = std::move(ignore_nodes)] () mutable {
seastar::sharded<replica::database>& db = get_db();
@@ -1799,7 +1813,7 @@ future<> repair_service::do_rebuild_replace_with_repair(locator::token_metadata_
}).get();
}
auto nr_ranges = ranges.size();
sync_data_using_repair(keyspace_name, std::move(ranges), std::move(range_sources), reason, {}).get();
sync_data_using_repair(keyspace_name, std::move(ranges), std::move(range_sources), reason, nullptr).get();
rlogger.info("{}: finished with keyspace={}, source_dc={}, nr_ranges={}", op, keyspace_name, source_dc, nr_ranges);
}
rlogger.info("{}: finished with keyspaces={}, source_dc={}", op, keyspaces, source_dc);

View File

@@ -67,11 +67,28 @@ struct repair_uniq_id {
};
std::ostream& operator<<(std::ostream& os, const repair_uniq_id& x);
struct node_ops_info {
class node_ops_info {
public:
utils::UUID ops_uuid;
bool abort = false;
shared_ptr<abort_source> as;
std::list<gms::inet_address> ignore_nodes;
private:
optimized_optional<abort_source::subscription> _abort_subscription;
sharded<abort_source> _sas;
future<> _abort_done = make_ready_future<>();
public:
node_ops_info(utils::UUID ops_uuid_, shared_ptr<abort_source> as_, std::list<gms::inet_address>&& ignore_nodes_) noexcept;
node_ops_info(const node_ops_info&) = delete;
node_ops_info(node_ops_info&&) = delete;
future<> start();
future<> stop() noexcept;
void check_abort();
abort_source* local_abort_source();
};
// NOTE: repair_start() can be run on any node, but starts a node-global
@@ -167,7 +184,7 @@ public:
int ranges_index = 0;
repair_stats _stats;
std::unordered_set<sstring> dropped_tables;
std::optional<utils::UUID> _ops_uuid;
optimized_optional<abort_source::subscription> _abort_subscription;
bool _hints_batchlog_flushed = false;
public:
repair_info(repair_service& repair,
@@ -179,10 +196,10 @@ public:
const std::vector<sstring>& hosts_,
const std::unordered_set<gms::inet_address>& ignore_nodes_,
streaming::stream_reason reason_,
std::optional<utils::UUID> ops_uuid,
abort_source* as,
bool hints_batchlog_flushed);
void check_failed_ranges();
void abort();
void abort() noexcept;
void check_in_abort();
void check_in_shutdown();
repair_neighbors get_repair_neighbors(const dht::token_range& range);
@@ -192,9 +209,6 @@ public:
const std::vector<sstring>& table_names() {
return cfs;
}
const std::optional<utils::UUID>& ops_uuid() const {
return _ops_uuid;
};
bool hints_batchlog_flushed() const {
return _hints_batchlog_flushed;
@@ -252,7 +266,6 @@ public:
future<> run(repair_uniq_id id, std::function<void ()> func);
future<repair_status> repair_await_completion(int id, std::chrono::steady_clock::time_point timeout);
float report_progress(streaming::stream_reason reason);
void abort_repair_node_ops(utils::UUID ops_uuid);
};
future<uint64_t> estimate_partitions(seastar::sharded<replica::database>& db, const sstring& keyspace,

View File

@@ -347,9 +347,9 @@ private:
// Only needed for local readers, the multishard reader takes care
// of pinning tables on used shards.
std::optional<utils::phased_barrier::operation> _local_read_op;
std::optional<evictable_reader_handle> _reader_handle;
// Local reader or multishard reader to read the range
flat_mutation_reader _reader;
std::optional<evictable_reader_handle> _reader_handle;
// Current partition read from disk
lw_shared_ptr<const decorated_key_with_hash> _current_dk;
uint64_t _reads_issued = 0;

View File

@@ -141,13 +141,13 @@ private:
dht::token_range_vector ranges,
std::unordered_map<dht::token_range, repair_neighbors> neighbors,
streaming::stream_reason reason,
std::optional<utils::UUID> ops_uuid);
shared_ptr<node_ops_info> ops_info);
future<> do_sync_data_using_repair(sstring keyspace,
dht::token_range_vector ranges,
std::unordered_map<dht::token_range, repair_neighbors> neighbors,
streaming::stream_reason reason,
std::optional<utils::UUID> ops_uuid);
shared_ptr<node_ops_info> ops_info);
future<repair_update_system_table_response> repair_update_system_table_handler(
gms::inet_address from,
@@ -193,8 +193,6 @@ public:
// Abort all the repairs
future<> abort_all();
future<> abort_repair_node_ops(utils::UUID ops_uuid);
std::unordered_map<node_repair_meta_id, repair_meta_ptr>& repair_meta_map() noexcept {
return _repair_metas;
}

View File

@@ -939,7 +939,9 @@ future<> database::drop_column_family(const sstring& ks_name, const sstring& cf_
remove(*cf);
cf->clear_views();
co_await cf->await_pending_ops();
co_await _querier_cache.evict_all_for_table(cf->schema()->id());
for (auto* sem : {&_read_concurrency_sem, &_streaming_concurrency_sem, &_compaction_concurrency_sem, &_system_read_concurrency_sem}) {
co_await sem->evict_inactive_reads_for_table(uuid);
}
std::exception_ptr ex;
try {
co_await truncate(ks, *cf, std::move(tsf), snapshot);

View File

@@ -279,6 +279,7 @@ using sstable_list = sstables::sstable_list;
namespace replica {
class distributed_loader;
struct table_population_metadata;
// The CF has a "stats" structure. But we don't want all fields here,
// since some of them are fairly complex for exporting to collectd. Also,
@@ -900,6 +901,8 @@ public:
// The future value is true iff offstrategy compaction was required.
future<bool> perform_offstrategy_compaction();
future<> run_offstrategy_compaction(sstables::compaction_data& info);
future<> perform_cleanup_compaction(replica::database& db);
void set_compaction_strategy(sstables::compaction_strategy_type strategy);
const sstables::compaction_strategy& get_compaction_strategy() const {
return _compaction_strategy;
@@ -925,7 +928,11 @@ public:
return _config;
}
compaction_manager& get_compaction_manager() const {
const compaction_manager& get_compaction_manager() const noexcept {
return _compaction_manager;
}
compaction_manager& get_compaction_manager() noexcept {
return _compaction_manager;
}
@@ -1080,6 +1087,7 @@ public:
friend class ::column_family_test;
friend class distributed_loader;
friend class table_population_metadata;
private:
timer<> _off_strategy_trigger;

View File

@@ -6,6 +6,7 @@
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
#include <seastar/core/coroutine.hh>
#include <seastar/util/closeable.hh>
#include "distributed_loader.hh"
#include "replica/database.hh"
@@ -361,7 +362,7 @@ distributed_loader::process_upload_dir(distributed<replica::database>& db, distr
&error_handler_gen_for_upload_dir);
}, sstables::sstable_directory::default_sstable_filter()).get();
const bool use_view_update_path = db::view::check_needs_view_update_path(sys_dist_ks.local(), *global_table, streaming::stream_reason::repair).get0();
const bool use_view_update_path = db::view::check_needs_view_update_path(sys_dist_ks.local(), db.local().get_token_metadata(), *global_table, streaming::stream_reason::repair).get0();
auto datadir = upload.parent_path();
if (use_view_update_path) {
@@ -454,92 +455,192 @@ future<> distributed_loader::handle_sstables_pending_delete(sstring pending_dele
});
}
future<> distributed_loader::populate_column_family(distributed<replica::database>& db, sstring sstdir, sstring ks, sstring cf, allow_offstrategy_compaction do_allow_offstrategy_compaction, must_exist dir_must_exist) {
dblog.debug("Populating {}/{}/{} allow_offstrategy_compaction={} must_exist={}", ks, cf, sstdir, do_allow_offstrategy_compaction, dir_must_exist);
return async([&db, sstdir = std::move(sstdir), ks = std::move(ks), cf = std::move(cf), do_allow_offstrategy_compaction, dir_must_exist] {
class table_population_metadata {
distributed<replica::database>& _db;
sstring _ks;
sstring _cf;
global_column_family_ptr _global_table;
fs::path _base_path;
std::unordered_map<sstring, lw_shared_ptr<sharded<sstables::sstable_directory>>> _sstable_directories;
sstables::sstable_version_types _highest_version = sstables::oldest_writable_sstable_format;
int64_t _highest_generation = 0;
public:
table_population_metadata(distributed<replica::database>& db, sstring ks, sstring cf)
: _db(db)
, _ks(std::move(ks))
, _cf(std::move(cf))
, _global_table(_db, _ks, _cf)
, _base_path(_global_table->dir())
{}
~table_population_metadata() {
// All directories must have been stopped
// using table_population_metadata::stop()
assert(_sstable_directories.empty());
}
future<> start() {
assert(this_shard_id() == 0);
if (!file_exists(sstdir).get0()) {
if (dir_must_exist) {
throw std::runtime_error(format("Populating {}/{} failed: {} does not exist", ks, cf, sstdir));
}
return;
for (auto subdir : { "", sstables::staging_dir, sstables::quarantine_dir }) {
co_await start_subdir(subdir);
}
// First pass, cleanup temporary sstable directories and sstables pending delete.
cleanup_column_family_temp_sst_dirs(sstdir).get();
auto pending_delete_dir = sstdir + "/" + sstables::sstable::pending_delete_dir_basename();
auto exists = file_exists(pending_delete_dir).get0();
if (exists) {
handle_sstables_pending_delete(pending_delete_dir).get();
co_await smp::invoke_on_all([this] {
_global_table->update_sstables_known_generation(_highest_generation);
return _global_table->disable_auto_compaction();
});
}
future<> stop() {
for (auto it = _sstable_directories.begin(); it != _sstable_directories.end(); it = _sstable_directories.erase(it)) {
co_await it->second->stop();
}
}
global_column_family_ptr global_table(db, ks, cf);
fs::path get_path(std::string_view subdir) {
return subdir.empty() ? _base_path : _base_path / subdir;
}
sharded<sstables::sstable_directory> directory;
directory.start(fs::path(sstdir), db.local().get_config().initial_sstable_loading_concurrency(), std::ref(db.local().get_sharded_sst_dir_semaphore()),
sstables::sstable_directory::need_mutate_level::no,
sstables::sstable_directory::lack_of_toc_fatal::yes,
sstables::sstable_directory::enable_dangerous_direct_import_of_cassandra_counters(db.local().get_config().enable_dangerous_direct_import_of_cassandra_counters()),
sstables::sstable_directory::allow_loading_materialized_view::yes,
[&global_table] (fs::path dir, int64_t gen, sstables::sstable_version_types v, sstables::sstable_format_types f) {
return global_table->make_sstable(dir.native(), gen, v, f);
}).get();
distributed<replica::database>& db() noexcept {
return _db;
}
auto stop = deferred_stop(directory);
const sstring& ks() const noexcept {
return _ks;
}
lock_table(directory, db, ks, cf).get();
process_sstable_dir(directory).get();
const sstring& cf() const noexcept {
return _cf;
}
// If we are resharding system tables before we can read them, we will not
// know which is the highest format we support: this information is itself stored
// in the system tables. In that case we'll rely on what we find on disk: we'll
// at least not downgrade any files. If we already know that we support a higher
// format than the one we see then we use that.
auto sys_format = global_table->get_sstables_manager().get_highest_supported_format();
auto sst_version = highest_version_seen(directory, sys_format).get0();
auto generation = highest_generation_seen(directory).get0();
global_column_family_ptr& global_table() noexcept {
return _global_table;
};
db.invoke_on_all([&global_table, generation] (replica::database& db) {
global_table->update_sstables_known_generation(generation);
return global_table->disable_auto_compaction();
}).get();
const global_column_family_ptr& global_table() const noexcept {
return _global_table;
};
reshard(directory, db, ks, cf, [&global_table, sstdir, sst_version] (shard_id shard) mutable {
auto gen = smp::submit_to(shard, [&global_table] () {
return global_table->calculate_generation_for_new_table();
}).get0();
const std::unordered_map<sstring, lw_shared_ptr<sharded<sstables::sstable_directory>>>& sstable_directories() const noexcept {
return _sstable_directories;
}
return global_table->make_sstable(sstdir, gen, sst_version, sstables::sstable::format_types::big);
}).get();
sstables::sstable::version_types highest_version() const noexcept {
return _highest_version;
}
// The node is offline at this point, so we are very lenient with what we consider
// offstrategy.
// SSTables created by repair may not conform to the compaction strategy's layout goal,
// because data segregation is only performed by compaction.
// Instead of reshaping them on boot, let's add them to the maintenance set and allow
// off-strategy compaction to reshape them. This allows the node to come online
// ASAP. Given that SSTables with repair origin are disjoint, they can be read
// from efficiently.
auto eligible_for_reshape_on_boot = [] (const sstables::shared_sstable& sst) {
return sst->get_origin() != sstables::repair_origin;
};
int64_t highest_generation() const noexcept {
return _highest_generation;
}
reshape(directory, db, sstables::reshape_mode::relaxed, ks, cf, [global_table, sstdir, sst_version] (shard_id shard) {
auto gen = global_table->calculate_generation_for_new_table();
return global_table->make_sstable(sstdir, gen, sst_version, sstables::sstable::format_types::big);
}, eligible_for_reshape_on_boot).get();
private:
future<> start_subdir(sstring subdir);
};
directory.invoke_on_all([global_table, &eligible_for_reshape_on_boot, do_allow_offstrategy_compaction] (sstables::sstable_directory& dir) {
return dir.do_for_each_sstable([&global_table, &eligible_for_reshape_on_boot, do_allow_offstrategy_compaction] (sstables::shared_sstable sst) {
auto requires_offstrategy = sstables::offstrategy(do_allow_offstrategy_compaction && !eligible_for_reshape_on_boot(sst));
return global_table->add_sstable_and_update_cache(sst, requires_offstrategy);
}).then([&global_table, do_allow_offstrategy_compaction] {
if (do_allow_offstrategy_compaction) {
global_table->trigger_offstrategy_compaction();
}
});
}).get();
future<> table_population_metadata::start_subdir(sstring subdir) {
sstring sstdir = get_path(subdir).native();
if (!co_await file_exists(sstdir)) {
co_return;
}
// First pass, cleanup temporary sstable directories and sstables pending delete.
co_await distributed_loader::cleanup_column_family_temp_sst_dirs(sstdir);
auto pending_delete_dir = sstdir + "/" + sstables::sstable::pending_delete_dir_basename();
auto exists = co_await file_exists(pending_delete_dir);
if (exists) {
co_await distributed_loader::handle_sstables_pending_delete(pending_delete_dir);
}
auto dptr = make_lw_shared<sharded<sstables::sstable_directory>>();
auto& directory = *dptr;
auto& global_table = _global_table;
auto& db = _db;
co_await directory.start(fs::path(sstdir),
db.local().get_config().initial_sstable_loading_concurrency(), std::ref(db.local().get_sharded_sst_dir_semaphore()),
sstables::sstable_directory::need_mutate_level::no,
sstables::sstable_directory::lack_of_toc_fatal::yes,
sstables::sstable_directory::enable_dangerous_direct_import_of_cassandra_counters(db.local().get_config().enable_dangerous_direct_import_of_cassandra_counters()),
sstables::sstable_directory::allow_loading_materialized_view::yes,
[&global_table] (fs::path dir, int64_t gen, sstables::sstable_version_types v, sstables::sstable_format_types f) {
return global_table->make_sstable(dir.native(), gen, v, f);
});
// The directory must be stopped using table_population_metadata::stop() below.
_sstable_directories[subdir] = dptr;
co_await distributed_loader::lock_table(directory, _db, _ks, _cf);
co_await distributed_loader::process_sstable_dir(directory);
// If we are resharding system tables before we can read them, we will not
// know which is the highest format we support: this information is itself stored
// in the system tables. In that case we'll rely on what we find on disk: we'll
// at least not downgrade any files. If we already know that we support a higher
// format than the one we see then we use that.
auto sys_format = global_table->get_sstables_manager().get_highest_supported_format();
auto sst_version = co_await highest_version_seen(directory, sys_format);
auto generation = co_await highest_generation_seen(directory);
_highest_version = std::max(sst_version, _highest_version);
_highest_generation = std::max(generation, _highest_generation);
}
future<> distributed_loader::populate_column_family(table_population_metadata& metadata, sstring subdir, allow_offstrategy_compaction do_allow_offstrategy_compaction, must_exist dir_must_exist) {
auto& db = metadata.db();
const auto& ks = metadata.ks();
const auto& cf = metadata.cf();
auto sstdir = metadata.get_path(subdir).native();
dblog.debug("Populating {}/{}/{} allow_offstrategy_compaction={} must_exist={}", ks, cf, sstdir, do_allow_offstrategy_compaction, dir_must_exist);
assert(this_shard_id() == 0);
if (!co_await file_exists(sstdir)) {
if (dir_must_exist) {
throw std::runtime_error(format("Populating {}/{} failed: {} does not exist", metadata.ks(), metadata.cf(), sstdir));
}
co_return;
}
auto& global_table = metadata.global_table();
if (!metadata.sstable_directories().contains(subdir)) {
dblog.error("Could not find sstables directory {}.{}/{}", ks, cf, subdir);
}
auto& directory = *metadata.sstable_directories().at(subdir);
auto sst_version = metadata.highest_version();
co_await reshard(directory, db, ks, cf, [&global_table, sstdir, sst_version] (shard_id shard) mutable {
auto gen = smp::submit_to(shard, [&global_table] () {
return global_table->calculate_generation_for_new_table();
}).get0();
return global_table->make_sstable(sstdir, gen, sst_version, sstables::sstable::format_types::big);
});
// The node is offline at this point, so we are very lenient with what we consider
// offstrategy.
// SSTables created by repair may not conform to the compaction strategy's layout goal,
// because data segregation is only performed by compaction.
// Instead of reshaping them on boot, let's add them to the maintenance set and allow
// off-strategy compaction to reshape them. This allows the node to come online
// ASAP. Given that SSTables with repair origin are disjoint, they can be read
// from efficiently.
auto eligible_for_reshape_on_boot = [] (const sstables::shared_sstable& sst) {
return sst->get_origin() != sstables::repair_origin;
};
co_await reshape(directory, db, sstables::reshape_mode::relaxed, ks, cf, [global_table, sstdir, sst_version] (shard_id shard) {
auto gen = global_table->calculate_generation_for_new_table();
return global_table->make_sstable(sstdir, gen, sst_version, sstables::sstable::format_types::big);
}, eligible_for_reshape_on_boot);
co_await directory.invoke_on_all([global_table, &eligible_for_reshape_on_boot, do_allow_offstrategy_compaction] (sstables::sstable_directory& dir) -> future<> {
co_await dir.do_for_each_sstable([&global_table, &eligible_for_reshape_on_boot, do_allow_offstrategy_compaction] (sstables::shared_sstable sst) {
auto requires_offstrategy = sstables::offstrategy(do_allow_offstrategy_compaction && !eligible_for_reshape_on_boot(sst));
return global_table->add_sstable_and_update_cache(sst, requires_offstrategy);
});
if (do_allow_offstrategy_compaction) {
global_table->trigger_offstrategy_compaction();
}
});
}
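
Taken together, the class turns the old monolithic populate into a three-phase flow, as used by `populate_keyspace()` in the next hunk: construct the metadata, `start()` it (which scans the main, staging and quarantine subdirectories once), populate each subdirectory, then `stop()` the `sharded<sstable_directory>` instances. In outline (a condensed restatement of the call sequence below, not new API):

```cpp
// Condensed call sequence, as used by populate_keyspace() further down:
table_population_metadata metadata(db, ks_name, cfname);
co_await metadata.start();   // scans "", staging and quarantine subdirs
co_await distributed_loader::populate_column_family(metadata, sstables::staging_dir, allow_offstrategy_compaction::no);
co_await distributed_loader::populate_column_family(metadata, sstables::quarantine_dir, allow_offstrategy_compaction::no, must_exist::no);
co_await distributed_loader::populate_column_family(metadata, "", allow_offstrategy_compaction::yes);
co_await metadata.stop();    // must run even on failure: the destructor asserts
```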
@@ -549,41 +650,51 @@ future<> distributed_loader::populate_keyspace(distributed<replica::database>& d
auto i = keyspaces.find(ks_name);
if (i == keyspaces.end()) {
dblog.warn("Skipping undefined keyspace: {}", ks_name);
return make_ready_future<>();
} else {
dblog.info("Populating Keyspace {}", ks_name);
auto& ks = i->second;
auto& column_families = db.local().get_column_families();
return parallel_for_each(ks.metadata()->cf_meta_data() | boost::adaptors::map_values,
[ks_name, ksdir, &ks, &column_families, &db] (schema_ptr s) {
utils::UUID uuid = s->id();
lw_shared_ptr<replica::column_family> cf = column_families[uuid];
sstring cfname = cf->schema()->cf_name();
auto sstdir = ks.column_family_directory(ksdir, cfname, uuid);
dblog.info("Keyspace {}: Reading CF {} id={} version={}", ks_name, cfname, uuid, s->version());
return ks.make_directory_for_column_family(cfname, uuid).then([&db, sstdir, uuid, ks_name, cfname] {
return distributed_loader::populate_column_family(db, sstdir + "/" + sstables::staging_dir, ks_name, cfname, allow_offstrategy_compaction::no);
}).then([&db, sstdir, ks_name, cfname] {
return distributed_loader::populate_column_family(db, sstdir + "/" + sstables::quarantine_dir, ks_name, cfname, allow_offstrategy_compaction::no, must_exist::no);
}).then([&db, sstdir, uuid, ks_name, cfname] {
return distributed_loader::populate_column_family(db, sstdir, ks_name, cfname, allow_offstrategy_compaction::yes);
}).handle_exception([ks_name, cfname, sstdir](std::exception_ptr eptr) {
std::string msg =
format("Exception while populating keyspace '{}' with column family '{}' from file '{}': {}",
ks_name, cfname, sstdir, eptr);
dblog.error("Exception while populating keyspace '{}' with column family '{}' from file '{}': {}",
ks_name, cfname, sstdir, eptr);
try {
std::rethrow_exception(eptr);
} catch (sstables::compaction_stopped_exception& e) {
// swallow compaction stopped exception, to allow clean shutdown.
} catch (...) {
throw std::runtime_error(msg.c_str());
}
});
});
co_return;
}
dblog.info("Populating Keyspace {}", ks_name);
auto& ks = i->second;
auto& column_families = db.local().get_column_families();
co_await parallel_for_each(ks.metadata()->cf_meta_data() | boost::adaptors::map_values, [&] (schema_ptr s) -> future<> {
utils::UUID uuid = s->id();
lw_shared_ptr<replica::column_family> cf = column_families[uuid];
sstring cfname = cf->schema()->cf_name();
auto sstdir = ks.column_family_directory(ksdir, cfname, uuid);
dblog.info("Keyspace {}: Reading CF {} id={} version={}", ks_name, cfname, uuid, s->version());
auto metadata = table_population_metadata(db, ks_name, cfname);
std::exception_ptr ex;
try {
co_await ks.make_directory_for_column_family(cfname, uuid);
co_await metadata.start();
co_await distributed_loader::populate_column_family(metadata, sstables::staging_dir, allow_offstrategy_compaction::no);
co_await distributed_loader::populate_column_family(metadata, sstables::quarantine_dir, allow_offstrategy_compaction::no, must_exist::no);
co_await distributed_loader::populate_column_family(metadata, "", allow_offstrategy_compaction::yes);
} catch (...) {
std::exception_ptr eptr = std::current_exception();
std::string msg =
format("Exception while populating keyspace '{}' with column family '{}' from file '{}': {}",
ks_name, cfname, sstdir, eptr);
dblog.error("Exception while populating keyspace '{}' with column family '{}' from file '{}': {}",
ks_name, cfname, sstdir, eptr);
try {
std::rethrow_exception(eptr);
} catch (sstables::compaction_stopped_exception& e) {
// swallow compaction stopped exception, to allow clean shutdown.
} catch (...) {
ex = std::make_exception_ptr(std::runtime_error(msg.c_str()));
}
}
co_await metadata.stop();
if (ex) {
std::rethrow_exception(std::move(ex));
}
});
}
future<> distributed_loader::init_system_keyspace(distributed<replica::database>& db, distributed<service::storage_service>& ss, sharded<gms::gossiper>& g, db::config& cfg) {

View File

@@ -57,8 +57,11 @@ class distributed_loader_for_tests;
namespace replica {
class table_population_metadata;
class distributed_loader {
friend class ::distributed_loader_for_tests;
friend class table_population_metadata;
static future<> reshape(sharded<sstables::sstable_directory>& dir, sharded<replica::database>& db, sstables::reshape_mode mode,
sstring ks_name, sstring table_name, sstables::compaction_sstable_creator_fn creator, std::function<bool (const sstables::shared_sstable&)> filter);
@@ -70,7 +73,7 @@ class distributed_loader {
std::filesystem::path datadir, sstring ks, sstring cf);
using allow_offstrategy_compaction = bool_class<struct allow_offstrategy_compaction_tag>;
using must_exist = bool_class<struct must_exist_tag>;
static future<> populate_column_family(distributed<replica::database>& db, sstring sstdir, sstring ks, sstring cf, allow_offstrategy_compaction, must_exist = must_exist::yes);
static future<> populate_column_family(table_population_metadata& metadata, sstring subdir, allow_offstrategy_compaction, must_exist = must_exist::yes);
static future<> populate_keyspace(distributed<replica::database>& db, sstring datadir, sstring ks_name);
static future<> cleanup_column_family_temp_sst_dirs(sstring sstdir);
static future<> handle_sstables_pending_delete(sstring pending_deletes_dir);

View File

@@ -803,16 +803,15 @@ void table::set_metrics() {
}
void table::rebuild_statistics() {
// zeroing live_disk_space_used and live_sstable_count because the
// sstable list was re-created
_stats.live_disk_space_used = 0;
_stats.live_sstable_count = 0;
_stats.total_disk_space_used = 0;
_sstables->for_each_sstable([this] (const sstables::shared_sstable& tab) {
update_stats_for_new_sstable(tab->bytes_on_disk());
});
for (auto& tab : _sstables_compacted_but_not_deleted) {
update_stats_for_new_sstable(tab->bytes_on_disk());
_stats.total_disk_space_used += tab->bytes_on_disk();
}
}
@@ -1137,6 +1136,11 @@ future<> table::run_offstrategy_compaction(sstables::compaction_data& info) {
tlogger.info("Done with off-strategy compaction for {}.{}", _schema->ks_name(), _schema->cf_name());
}
future<> table::perform_cleanup_compaction(replica::database& db) {
co_await flush();
co_await get_compaction_manager().perform_cleanup(db, this);
}
void table::set_compaction_strategy(sstables::compaction_strategy_type strategy) {
tlogger.debug("Setting compaction strategy of {}.{} to {}", _schema->ks_name(), _schema->cf_name(), sstables::compaction_strategy::name(strategy));
auto new_cs = make_compaction_strategy(strategy, _schema->compaction_strategy_options());
@@ -1772,29 +1776,30 @@ future<> table::generate_and_propagate_view_updates(const schema_ptr& base,
tracing::trace_state_ptr tr_state,
gc_clock::time_point now) const {
auto base_token = m.token();
auto m_schema = m.schema();
db::view::view_update_builder builder = co_await db::view::make_view_update_builder(
base,
std::move(views),
make_flat_mutation_reader_from_mutations(m.schema(), std::move(permit), {std::move(m)}),
make_flat_mutation_reader_from_mutations(std::move(m_schema), std::move(permit), {std::move(m)}),
std::move(existings),
now);
std::exception_ptr err = nullptr;
while (true) {
utils::chunked_vector<frozen_mutation_and_schema> updates;
std::optional<utils::chunked_vector<frozen_mutation_and_schema>> updates;
try {
updates = co_await builder.build_some();
} catch (...) {
err = std::current_exception();
break;
}
if (updates.empty()) {
if (!updates) {
break;
}
tracing::trace(tr_state, "Generated {} view update mutations", updates.size());
auto units = seastar::consume_units(*_config.view_update_concurrency_semaphore, memory_usage_of(updates));
tracing::trace(tr_state, "Generated {} view update mutations", updates->size());
auto units = seastar::consume_units(*_config.view_update_concurrency_semaphore, memory_usage_of(*updates));
try {
co_await db::view::mutate_MV(base_token, std::move(updates), _view_stats, *_config.cf_stats, tr_state,
co_await db::view::mutate_MV(base_token, std::move(*updates), _view_stats, *_config.cf_stats, tr_state,
std::move(units), service::allow_hints::yes, db::view::wait_for_all_updates::no);
} catch (...) {
// Ignore exceptions: any individual failure to propagate a view update will be reported
@@ -1918,14 +1923,14 @@ future<> table::populate_views(
while (true) {
try {
auto updates = co_await builder.build_some();
if (updates.empty()) {
if (!updates) {
break;
}
size_t update_size = memory_usage_of(updates);
size_t update_size = memory_usage_of(*updates);
size_t units_to_wait_for = std::min(_config.view_update_concurrency_semaphore_limit, update_size);
auto units = co_await seastar::get_units(*_config.view_update_concurrency_semaphore, units_to_wait_for);
units.adopt(seastar::consume_units(*_config.view_update_concurrency_semaphore, update_size - units_to_wait_for));
co_await db::view::mutate_MV(base_token, std::move(updates), _view_stats, *_config.cf_stats,
co_await db::view::mutate_MV(base_token, std::move(*updates), _view_stats, *_config.cf_stats,
tracing::trace_state_ptr(), std::move(units), service::allow_hints::no, db::view::wait_for_all_updates::yes);
} catch (...) {
if (!err) {
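
Both call sites above now follow the same contract: `build_some()` returns `std::optional<utils::chunked_vector<frozen_mutation_and_schema>>`, so "no more batches" (`std::nullopt`) is distinguishable from "an empty batch". The consumer collapses to the usual optional-as-sentinel loop (a sketch assuming a coroutine context; `builder` and `process()` stand in for `view_update_builder` and `mutate_MV()`):

```cpp
while (auto updates = co_await builder.build_some()) {
    // `updates` is engaged: *updates is the next batch, possibly empty.
    co_await process(std::move(*updates));
}
// nullopt: the builder is exhausted.
```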

View File

@@ -950,6 +950,11 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater)
_prev_snapshot = {};
});
utils::coroutine update; // Destroy before cleanup to release snapshots before invalidating.
auto destroy_update = defer([&] {
with_allocator(_tracker.allocator(), [&] {
update = {};
});
});
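
The ordering that the "Destroy before cleanup" comment relies on is plain C++ scoping: deferred actions run in reverse declaration order, so `destroy_update`, declared after the earlier cleanup, fires first. A self-contained sketch of that behavior (the `deferred` helper is a minimal stand-in for `seastar::defer`):

```cpp
#include <cstdio>
#include <utility>

// Minimal stand-in for seastar::defer: run a callback from the destructor.
template <typename Func>
struct deferred {
    Func f;
    ~deferred() { f(); }
};
template <typename Func>
deferred<Func> defer(Func f) { return deferred<Func>{std::move(f)}; }

int main() {
    auto cleanup = defer([] { std::puts("cleanup (declared first, runs last)"); });
    auto destroy_update = defer([] { std::puts("destroy update (declared last, runs first)"); });
    // Locals are destroyed in reverse declaration order, so the update is
    // torn down before the cleanup invalidates what it depends on,
    // exactly the ordering row_cache::do_update() relies on.
}
```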
partition_presence_checker is_present = _prev_snapshot->make_partition_presence_checker();
while (!m.partitions.empty()) {
with_allocator(_tracker.allocator(), [&] () {
@@ -1222,6 +1227,10 @@ void rows_entry::on_evicted(cache_tracker& tracker) noexcept {
// That dummy is linked in the LRU, because there may be partitions
// with no regular rows, and we need to track them.
unlink_from_lru();
// We still need to break continuity in order to preserve the "older versions are evicted first"
// invariant.
it->set_continuous(false);
} else {
// When evicting a dummy with both sides continuous we don't need to break continuity.
//

View File

@@ -9,6 +9,7 @@
#pragma once
#include "mutation_fragment.hh"
#include "mutation_fragment_v2.hh"
#include "converting_mutation_partition_applier.hh"
// A StreamedMutationTransformer which transforms the stream to a different schema

View File

@@ -63,4 +63,15 @@ MemoryLimit=$MEMORY_LIMIT
EOS
fi
if [ -e /etc/systemd/system/systemd-coredump@.service.d/timeout.conf ]; then
COREDUMP_RUNTIME_MAX=$(grep RuntimeMaxSec /etc/systemd/system/systemd-coredump@.service.d/timeout.conf)
if [ -z "$COREDUMP_RUNTIME_MAX" ]; then
cat << EOS > /etc/systemd/system/systemd-coredump@.service.d/timeout.conf
[Service]
RuntimeMaxSec=infinity
TimeoutSec=infinity
EOS
fi
fi
systemctl --system daemon-reload >/dev/null || true

Submodule seastar updated: 9a7ba6d57e...62fd873d09

View File

@@ -8,6 +8,7 @@
#pragma once
#include <optional>
#include <seastar/core/sharded.hh>
#include "timestamp.hh"

View File

@@ -78,7 +78,7 @@ future<prepare_response> paxos_state::prepare(storage_proxy& sp, tracing::trace_
prv, tr_state, timeout);
});
});
return when_all(std::move(f1), std::move(f2)).then([state = std::move(state), only_digest] (auto t) {
return when_all(std::move(f1), std::move(f2)).then([state = std::move(state), only_digest, schema] (auto t) mutable {
if (utils::get_local_injector().enter("paxos_error_after_save_promise")) {
return make_exception_future<prepare_response>(utils::injected_error("injected_error_after_save_promise"));
}
@@ -103,8 +103,25 @@ future<prepare_response> paxos_state::prepare(storage_proxy& sp, tracing::trace_
auto ex = f2.get_exception();
logger.debug("Failed to get data or digest: {}. Ignored.", std::move(ex));
}
return make_ready_future<prepare_response>(prepare_response(promise(std::move(state._accepted_proposal),
std::move(state._most_recent_commit), std::move(data_or_digest))));
auto upgrade_if_needed = [schema = std::move(schema)] (std::optional<proposal> p) {
if (!p || p->update.schema_version() == schema->version()) {
return make_ready_future<std::optional<proposal>>(std::move(p));
}
// In case current schema is not the same as the schema in the proposal
// try to look it up first in the local schema_registry cache and upgrade
// the mutation using schema from the cache.
//
// If there's no schema in the cache, then retrieve persisted column mapping
// for that version and upgrade the mutation with it.
logger.debug("Stored mutation references outdated schema version. "
"Trying to upgrade the accepted proposal mutation to the most recent schema version.");
return service::get_column_mapping(p->update.column_family_id(), p->update.schema_version()).then([schema, p = std::move(p)] (const column_mapping& cm) {
return make_ready_future<std::optional<proposal>>(proposal(p->ballot, freeze(p->update.unfreeze_upgrading(schema, cm))));
});
};
return when_all_succeed(upgrade_if_needed(std::move(state._accepted_proposal)), upgrade_if_needed(std::move(state._most_recent_commit))).then([data_or_digest = std::move(data_or_digest)] (auto&& u) mutable {
return prepare_response(promise(std::move(std::get<0>(u)), std::move(std::get<1>(u)), std::move(data_or_digest)));
});
});
} else {
logger.debug("Promise rejected; {} is not sufficiently newer than {}", ballot, state._promised_ballot);
@@ -200,15 +217,9 @@ future<> paxos_state::learn(storage_proxy& sp, schema_ptr schema, proposal decis
// If there's no schema in the cache, then retrieve persisted column mapping
// for that version and upgrade the mutation with it.
if (decision.update.schema_version() != schema->version()) {
logger.debug("Stored mutation references outdated schema version. "
"Trying to upgrade the accepted proposal mutation to the most recent schema version.");
return service::get_column_mapping(decision.update.column_family_id(), decision.update.schema_version())
.then([&sp, schema, tr_state, timeout, &decision] (const column_mapping& cm) {
return do_with(decision.update.unfreeze_upgrading(schema, cm), [&sp, tr_state, timeout] (const mutation& upgraded) {
return sp.mutate_locally(upgraded, tr_state, db::commitlog::force_sync::yes, timeout);
});
});
on_internal_error(logger, format("schema version in learn does not match current schema"));
}
return sp.mutate_locally(schema, decision.update, tr_state, db::commitlog::force_sync::yes, timeout);
});
} else {

View File

@@ -161,6 +161,11 @@ raft_group0::discover_group0(raft::server_address my_addr) {
}
}
future<> raft_group0::abort() {
return _shutdown_gate.close();
}
future<> raft_group0::join_group0() {
assert(this_shard_id() == 0);
if (!_raft_gr.is_enabled()) {

View File

@@ -47,13 +47,7 @@ public:
cql3::query_processor& qp,
migration_manager& mm);
future<> abort() {
if (!_abort_source.abort_requested()) {
_abort_source.request_abort();
}
return _shutdown_gate.close();
}
future<> abort();
// Join this node to the cluster-wide Raft group
// Called during bootstrap. Is idempotent - it

View File

@@ -1227,19 +1227,15 @@ future<> paxos_response_handler::learn_decision(lw_shared_ptr<paxos::proposal> d
auto cdc = _proxy->get_cdc_service();
if (cdc && cdc->needs_cdc_augmentation(update_mut_vec)) {
f_cdc = cdc->augment_mutation_call(_timeout, std::move(update_mut_vec), tr_state, _cl_for_learn)
.then([this, base_tbl_id, cdc = cdc->shared_from_this()] (std::tuple<std::vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>&& t) {
auto mutations = std::move(std::get<0>(t));
auto tracker = std::move(std::get<1>(t));
// Pick only the CDC ("augmenting") mutations
std::erase_if(mutations, [base_tbl_id = std::move(base_tbl_id)] (const mutation& v) {
return v.schema()->id() == base_tbl_id;
});
if (mutations.empty()) {
return make_ready_future<>();
}
return _proxy->mutate_internal(std::move(mutations), _cl_for_learn, false, tr_state, _permit, _timeout, std::move(tracker));
auto cdc_shared = cdc->shared_from_this(); // keep CDC service alive
auto [mutations, tracker] = co_await cdc->augment_mutation_call(_timeout, std::move(update_mut_vec), tr_state, _cl_for_learn);
// Pick only the CDC ("augmenting") mutations
std::erase_if(mutations, [base_tbl_id = std::move(base_tbl_id)] (const mutation& v) {
return v.schema()->id() == base_tbl_id;
});
if (!mutations.empty()) {
f_cdc = _proxy->mutate_internal(std::move(mutations), _cl_for_learn, false, tr_state, _permit, _timeout, std::move(tracker));
}
}
}
@@ -1247,7 +1243,7 @@ future<> paxos_response_handler::learn_decision(lw_shared_ptr<paxos::proposal> d
std::array<std::tuple<lw_shared_ptr<paxos::proposal>, schema_ptr, shared_ptr<paxos_response_handler>, dht::token>, 1> m{std::make_tuple(std::move(decision), _schema, shared_from_this(), _key.token())};
future<> f_lwt = _proxy->mutate_internal(std::move(m), _cl_for_learn, false, tr_state, _permit, _timeout);
return when_all_succeed(std::move(f_cdc), std::move(f_lwt)).discard_result();
co_await when_all_succeed(std::move(f_cdc), std::move(f_lwt)).discard_result();
}
void paxos_response_handler::prune(utils::UUID ballot) {

View File

@@ -2282,6 +2282,8 @@ future<> storage_service::removenode(sstring host_id_string, std::list<gms::inet
ss._group0->leave_group0(endpoint).get();
slogger.info("removenode[{}]: Finished removenode operation, removing node={}, sync_nodes={}, ignore_nodes={}", uuid, endpoint, nodes, ignore_nodes);
} catch (...) {
slogger.warn("removenode[{}]: removing node={}, sync_nodes={}, ignore_nodes={} failed, error {}",
uuid, endpoint, nodes, ignore_nodes, std::current_exception());
// we need to revert the effect of the prepare verb if the removenode op failed
req.cmd = node_ops_cmd::removenode_abort;
parallel_for_each(nodes, [&ss, &req, &nodes_unknown_verb, &nodes_down, uuid] (const gms::inet_address& node) {
@@ -2369,8 +2371,7 @@ future<node_ops_cmd_response> storage_service::node_ops_cmd_handler(gms::inet_ad
}
return update_pending_ranges(tmptr, format("removenode {}", req.leaving_nodes));
}).get();
auto ops = seastar::make_shared<node_ops_info>(node_ops_info{ops_uuid, false, std::move(req.ignore_nodes)});
auto meta = node_ops_meta_data(ops_uuid, coordinator, std::move(ops), [this, coordinator, req = std::move(req)] () mutable {
auto meta = node_ops_meta_data(ops_uuid, coordinator, std::move(req.ignore_nodes), [this, coordinator, req = std::move(req)] () mutable {
return mutate_token_metadata([this, coordinator, req = std::move(req)] (mutable_token_metadata_ptr tmptr) mutable {
for (auto& node : req.leaving_nodes) {
slogger.info("removenode[{}]: Removed node={} as leaving node, coordinator={}", req.ops_uuid, node, coordinator);
@@ -2380,6 +2381,7 @@ future<node_ops_cmd_response> storage_service::node_ops_cmd_handler(gms::inet_ad
});
},
[this, ops_uuid] () mutable { node_ops_singal_abort(ops_uuid); });
meta.start().get();
_node_ops.emplace(ops_uuid, std::move(meta));
} else if (req.cmd == node_ops_cmd::removenode_heartbeat) {
slogger.debug("removenode[{}]: Updated heartbeat from coordinator={}", req.ops_uuid, coordinator);
@@ -2418,8 +2420,7 @@ future<node_ops_cmd_response> storage_service::node_ops_cmd_handler(gms::inet_ad
}
return update_pending_ranges(tmptr, format("decommission {}", req.leaving_nodes));
}).get();
auto ops = seastar::make_shared<node_ops_info>(node_ops_info{ops_uuid, false, std::move(req.ignore_nodes)});
auto meta = node_ops_meta_data(ops_uuid, coordinator, std::move(ops), [this, coordinator, req = std::move(req)] () mutable {
auto meta = node_ops_meta_data(ops_uuid, coordinator, std::move(req.ignore_nodes), [this, coordinator, req = std::move(req)] () mutable {
return mutate_token_metadata([this, coordinator, req = std::move(req)] (mutable_token_metadata_ptr tmptr) mutable {
for (auto& node : req.leaving_nodes) {
slogger.info("decommission[{}]: Removed node={} as leaving node, coordinator={}", req.ops_uuid, node, coordinator);
@@ -2429,6 +2430,7 @@ future<node_ops_cmd_response> storage_service::node_ops_cmd_handler(gms::inet_ad
});
},
[this, ops_uuid] () mutable { node_ops_singal_abort(ops_uuid); });
meta.start().get();
_node_ops.emplace(ops_uuid, std::move(meta));
} else if (req.cmd == node_ops_cmd::decommission_heartbeat) {
slogger.debug("decommission[{}]: Updated heartbeat from coordinator={}", req.ops_uuid, coordinator);
@@ -2460,8 +2462,7 @@ future<node_ops_cmd_response> storage_service::node_ops_cmd_handler(gms::inet_ad
}
return make_ready_future<>();
}).get();
auto ops = seastar::make_shared<node_ops_info>(node_ops_info{ops_uuid, false, std::move(req.ignore_nodes)});
auto meta = node_ops_meta_data(ops_uuid, coordinator, std::move(ops), [this, coordinator, req = std::move(req)] () mutable {
auto meta = node_ops_meta_data(ops_uuid, coordinator, std::move(req.ignore_nodes), [this, coordinator, req = std::move(req)] () mutable {
return mutate_token_metadata([this, coordinator, req = std::move(req)] (mutable_token_metadata_ptr tmptr) mutable {
for (auto& x: req.replace_nodes) {
auto existing_node = x.first;
@@ -2473,6 +2474,7 @@ future<node_ops_cmd_response> storage_service::node_ops_cmd_handler(gms::inet_ad
});
},
[this, ops_uuid ] { node_ops_singal_abort(ops_uuid); });
+meta.start().get();
_node_ops.emplace(ops_uuid, std::move(meta));
} else if (req.cmd == node_ops_cmd::replace_prepare_mark_alive) {
// Wait until the local node has marked the replacing node as alive
@@ -2514,8 +2516,7 @@ future<node_ops_cmd_response> storage_service::node_ops_cmd_handler(gms::inet_ad
}
return update_pending_ranges(tmptr, format("bootstrap {}", req.bootstrap_nodes));
}).get();
-auto ops = seastar::make_shared<node_ops_info>(node_ops_info{ops_uuid, false, std::move(req.ignore_nodes)});
-auto meta = node_ops_meta_data(ops_uuid, coordinator, std::move(ops), [this, coordinator, req = std::move(req)] () mutable {
+auto meta = node_ops_meta_data(ops_uuid, coordinator, std::move(req.ignore_nodes), [this, coordinator, req = std::move(req)] () mutable {
return mutate_token_metadata([this, coordinator, req = std::move(req)] (mutable_token_metadata_ptr tmptr) mutable {
for (auto& x: req.bootstrap_nodes) {
auto& endpoint = x.first;
@@ -2527,6 +2528,7 @@ future<node_ops_cmd_response> storage_service::node_ops_cmd_handler(gms::inet_ad
});
},
[this, ops_uuid ] { node_ops_singal_abort(ops_uuid); });
+meta.start().get();
_node_ops.emplace(ops_uuid, std::move(meta));
} else if (req.cmd == node_ops_cmd::bootstrap_heartbeat) {
slogger.debug("bootstrap[{}]: Updated heartbeat from coordinator={}", req.ops_uuid, coordinator);
@@ -2789,7 +2791,7 @@ future<> storage_service::removenode_with_stream(gms::inet_address leaving_node,
future<> storage_service::restore_replica_count(inet_address endpoint, inet_address notify_endpoint) {
if (is_repair_based_node_ops_enabled(streaming::stream_reason::removenode)) {
auto ops_uuid = utils::make_random_uuid();
-auto ops = seastar::make_shared<node_ops_info>(node_ops_info{ops_uuid, false, std::list<gms::inet_address>()});
+auto ops = seastar::make_shared<node_ops_info>(ops_uuid, nullptr, std::list<gms::inet_address>());
return _repair.local().removenode_with_repair(get_token_metadata_ptr(), endpoint, ops).finally([this, notify_endpoint] () {
return send_replication_notification(notify_endpoint);
});
@@ -3550,7 +3552,7 @@ bool storage_service::is_repair_based_node_ops_enabled(streaming::stream_reason
node_ops_meta_data::node_ops_meta_data(
utils::UUID ops_uuid,
gms::inet_address coordinator,
-shared_ptr<node_ops_info> ops,
+std::list<gms::inet_address> ignore_nodes,
std::function<future<> ()> abort_func,
std::function<void ()> signal_func)
: _ops_uuid(std::move(ops_uuid))
@@ -3558,24 +3560,28 @@ node_ops_meta_data::node_ops_meta_data(
, _abort(std::move(abort_func))
, _abort_source(seastar::make_shared<abort_source>())
, _signal(std::move(signal_func))
-, _ops(std::move(ops))
+, _ops(seastar::make_shared<node_ops_info>(_ops_uuid, _abort_source, std::move(ignore_nodes)))
, _watchdog([sig = _signal] { sig(); }) {
_watchdog.arm(_watchdog_interval);
}
+future<> node_ops_meta_data::start() {
+return _ops ? _ops->start() : make_ready_future<>();
+}
+future<> node_ops_meta_data::stop() noexcept {
+return _ops ? _ops->stop() : make_ready_future<>();
+}
future<> node_ops_meta_data::abort() {
slogger.debug("node_ops_meta_data: ops_uuid={} abort", _ops_uuid);
-_aborted = true;
-if (_ops) {
-_ops->abort = true;
-}
_watchdog.cancel();
return _abort();
}
void node_ops_meta_data::update_watchdog() {
slogger.debug("node_ops_meta_data: ops_uuid={} update_watchdog", _ops_uuid);
-if (_aborted) {
+if (_abort_source->abort_requested()) {
return;
}
_watchdog.cancel();
@@ -3612,6 +3618,7 @@ void storage_service::node_ops_done(utils::UUID ops_uuid) {
if (it != _node_ops.end()) {
node_ops_meta_data& meta = it->second;
meta.cancel_watchdog();
+meta.stop().get();
_node_ops.erase(it);
}
}
@@ -3619,6 +3626,24 @@ void storage_service::node_ops_done(utils::UUID ops_uuid) {
void storage_service::node_ops_abort(utils::UUID ops_uuid) {
slogger.debug("node_ops_abort: ops_uuid={}", ops_uuid);
auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
+if (!ops_uuid) {
+for (auto& [uuid, meta] : _node_ops) {
+meta.abort().get();
+auto as = meta.get_abort_source();
+if (as && !as->abort_requested()) {
+as->request_abort();
+}
+}
+for (auto it = _node_ops.begin(); it != _node_ops.end(); it = _node_ops.erase(it)) {
+node_ops_meta_data& meta = it->second;
+meta.stop().get();
+}
+return;
+}
auto it = _node_ops.find(ops_uuid);
if (it != _node_ops.end()) {
node_ops_meta_data& meta = it->second;
@@ -3627,7 +3652,7 @@ void storage_service::node_ops_abort(utils::UUID ops_uuid) {
if (as && !as->abort_requested()) {
as->request_abort();
}
-_repair.local().abort_repair_node_ops(ops_uuid).get();
+meta.stop().get();
_node_ops.erase(it);
}
}
@@ -3647,17 +3672,18 @@ future<> storage_service::node_ops_abort_thread() {
while (!_node_ops_abort_queue.empty()) {
auto uuid_opt = _node_ops_abort_queue.front();
_node_ops_abort_queue.pop_front();
-if (!uuid_opt) {
-return;
-}
try {
-storage_service::node_ops_abort(*uuid_opt);
+storage_service::node_ops_abort(uuid_opt.value_or(utils::null_uuid()));
} catch (...) {
slogger.warn("Failed to abort node operation ops_uuid={}: {}", *uuid_opt, std::current_exception());
}
+if (!uuid_opt) {
+slogger.info("Stopped node_ops_abort_thread");
+return;
+}
}
}
slogger.info("Stopped node_ops_abort_thread");
__builtin_unreachable();
});
}
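The hunk above changes how the drain loop handles its sentinel: a disengaged optional used to make the thread return before aborting anything, whereas now it is mapped to `utils::null_uuid()`, which `node_ops_abort` interprets as "abort every pending operation", and the thread stops only after that call completes. Below is a minimal stand-alone C++ analogue of the fixed control flow; `uuid_t`, `abort_ops`, and `drain` are illustrative stand-ins, not the Scylla code itself.

```cpp
#include <deque>
#include <iostream>
#include <optional>

using uuid_t = unsigned long;       // stand-in for utils::UUID
constexpr uuid_t null_uuid = 0;     // stand-in for utils::null_uuid()

void abort_ops(uuid_t uuid) {       // stand-in for node_ops_abort()
    if (uuid == null_uuid) {
        std::cout << "aborting ALL pending operations\n";
    } else {
        std::cout << "aborting operation " << uuid << "\n";
    }
}

void drain(std::deque<std::optional<uuid_t>>& queue) {
    while (!queue.empty()) {
        auto uuid_opt = queue.front();
        queue.pop_front();
        // Key point of the fix: the sentinel still triggers an abort
        // (of everything) *before* the thread stops; the old code
        // returned here without aborting anything.
        try {
            abort_ops(uuid_opt.value_or(null_uuid));
        } catch (...) {
            std::cout << "abort failed, continuing\n";
        }
        if (!uuid_opt) {
            std::cout << "stopped abort thread\n";
            return;
        }
    }
}

int main() {
    std::deque<std::optional<uuid_t>> q{{42}, {7}, std::nullopt};
    drain(q);
}
```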


@@ -104,14 +104,15 @@ class node_ops_meta_data {
shared_ptr<node_ops_info> _ops;
seastar::timer<lowres_clock> _watchdog;
std::chrono::seconds _watchdog_interval{30};
-bool _aborted = false;
public:
explicit node_ops_meta_data(
utils::UUID ops_uuid,
gms::inet_address coordinator,
-shared_ptr<node_ops_info> ops,
+std::list<gms::inet_address> ignore_nodes,
std::function<future<> ()> abort_func,
std::function<void ()> signal_func);
+future<> start();
+future<> stop() noexcept;
shared_ptr<node_ops_info> get_ops_info();
shared_ptr<abort_source> get_abort_source();
future<> abort();
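Taken together with the storage_service.cc changes above, the new header implies a lifecycle: construct the meta data with the ignore list, call start() before registering the operation, and call stop() before erasing it, with both calls being safe no-ops when no ops object exists. A small sketch of that null-safe delegation pattern, with futures reduced to plain calls and all names illustrative rather than the real API:

```cpp
#include <iostream>
#include <memory>

struct ops_info {
    void start() { std::cout << "ops started\n"; }
    void stop()  { std::cout << "ops stopped\n"; }
};

class meta_data {
    std::shared_ptr<ops_info> _ops;
public:
    explicit meta_data(std::shared_ptr<ops_info> ops) : _ops(std::move(ops)) {}
    // Delegate to the owned ops object only if it exists; otherwise no-op.
    void start() { if (_ops) _ops->start(); }
    void stop()  { if (_ops) _ops->stop(); }
};

int main() {
    meta_data with_ops{std::make_shared<ops_info>()};
    with_ops.start();    // runs before the op is registered
    with_ops.stop();     // runs before the op is erased

    meta_data without_ops{nullptr};
    without_ops.start(); // safe no-op
    without_ops.stop();  // safe no-op
}
```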


@@ -21,6 +21,7 @@
#include "unimplemented.hh"
#include "segmented_compress_params.hh"
#include "utils/class_registrator.hh"
#include "reader_permit.hh"
namespace sstables {
@@ -338,16 +339,18 @@ class compressed_file_data_source_impl : public data_source_impl {
sstables::compression* _compression_metadata;
sstables::compression::segmented_offsets::accessor _offsets;
sstables::local_compression _compression;
+reader_permit _permit;
uint64_t _underlying_pos;
uint64_t _pos;
uint64_t _beg_pos;
uint64_t _end_pos;
public:
compressed_file_data_source_impl(file f, sstables::compression* cm,
-uint64_t pos, size_t len, file_input_stream_options options)
+uint64_t pos, size_t len, file_input_stream_options options, reader_permit permit)
: _compression_metadata(cm)
, _offsets(_compression_metadata->offsets.get_accessor())
, _compression(*cm)
+, _permit(std::move(permit))
{
_beg_pos = pos;
if (pos > _compression_metadata->uncompressed_file_length()) {
@@ -412,7 +415,7 @@ public:
_pos += out.size();
_underlying_pos += addr.chunk_len;
-return out;
+return make_tracked_temporary_buffer(std::move(out), _permit);
});
}
@@ -444,9 +447,9 @@ requires ChecksumUtils<ChecksumType>
class compressed_file_data_source : public data_source {
public:
compressed_file_data_source(file f, sstables::compression* cm,
-uint64_t offset, size_t len, file_input_stream_options options)
+uint64_t offset, size_t len, file_input_stream_options options, reader_permit permit)
: data_source(std::make_unique<compressed_file_data_source_impl<ChecksumType>>(
-std::move(f), cm, offset, len, std::move(options)))
+std::move(f), cm, offset, len, std::move(options), std::move(permit)))
{}
};
@@ -454,10 +457,10 @@ template <typename ChecksumType>
requires ChecksumUtils<ChecksumType>
inline input_stream<char> make_compressed_file_input_stream(
file f, sstables::compression *cm, uint64_t offset, size_t len,
-file_input_stream_options options)
+file_input_stream_options options, reader_permit permit)
{
return input_stream<char>(compressed_file_data_source<ChecksumType>(
-std::move(f), cm, offset, len, std::move(options)));
+std::move(f), cm, offset, len, std::move(options), std::move(permit)));
}
// For SSTables 2.x (formats 'ka' and 'la'), the full checksum is a combination of checksums of compressed chunks.
@@ -569,15 +572,15 @@ inline output_stream<char> make_compressed_file_output_stream(output_stream<char
input_stream<char> sstables::make_compressed_file_k_l_format_input_stream(file f,
sstables::compression* cm, uint64_t offset, size_t len,
-class file_input_stream_options options)
+class file_input_stream_options options, reader_permit permit)
{
-return make_compressed_file_input_stream<adler32_utils>(std::move(f), cm, offset, len, std::move(options));
+return make_compressed_file_input_stream<adler32_utils>(std::move(f), cm, offset, len, std::move(options), std::move(permit));
}
input_stream<char> sstables::make_compressed_file_m_format_input_stream(file f,
sstables::compression *cm, uint64_t offset, size_t len,
-class file_input_stream_options options) {
-return make_compressed_file_input_stream<crc32_utils>(std::move(f), cm, offset, len, std::move(options));
+class file_input_stream_options options, reader_permit permit) {
+return make_compressed_file_input_stream<crc32_utils>(std::move(f), cm, offset, len, std::move(options), std::move(permit));
}
output_stream<char> sstables::make_compressed_file_m_format_output_stream(output_stream<char> out,
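The changes in this file thread a reader_permit down into the compressed data source so that the decompressed buffers it returns are accounted against the permit (via make_tracked_temporary_buffer above). A rough stand-alone sketch of the underlying idea, where a buffer carries an RAII token that releases the accounted memory when the buffer is destroyed; `permit`, `units`, and `tracked_buffer` here are simplified stand-ins, not the seastar/Scylla types:

```cpp
#include <cstddef>
#include <iostream>
#include <memory>
#include <vector>

class permit {
    size_t _consumed = 0;
public:
    // Token returned by consume_memory; the accounted bytes are released
    // when the token is destroyed.
    struct units {
        permit* p;
        size_t n;
        ~units() { if (p) p->_consumed -= n; }
    };
    std::shared_ptr<units> consume_memory(size_t n) {
        _consumed += n;
        return std::shared_ptr<units>(new units{this, n});
    }
    size_t consumed() const { return _consumed; }
};

// A "tracked buffer": payload plus the units keeping the accounting alive.
struct tracked_buffer {
    std::vector<char> data;
    std::shared_ptr<permit::units> tracking;
};

tracked_buffer make_tracked_buffer(std::vector<char> data, permit& p) {
    auto u = p.consume_memory(data.size());
    return tracked_buffer{std::move(data), std::move(u)};
}

int main() {
    permit p;
    {
        auto buf = make_tracked_buffer(std::vector<char>(4096), p);
        std::cout << "in flight: " << p.consumed() << " bytes\n"; // 4096
    }
    std::cout << "after release: " << p.consumed() << " bytes\n"; // 0
}
```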


@@ -47,6 +47,8 @@
#include "checksum_utils.hh"
#include "../compress.hh"
+class reader_permit;
class compression_parameters;
class compressor;
using compressor_ptr = shared_ptr<compressor>;
@@ -371,11 +373,11 @@ compressor_ptr get_sstable_compressor(const compression&);
// sstable alive, and the compression metadata is only a part of it.
input_stream<char> make_compressed_file_k_l_format_input_stream(file f,
sstables::compression* cm, uint64_t offset, size_t len,
-class file_input_stream_options options);
+class file_input_stream_options options, reader_permit permit);
input_stream<char> make_compressed_file_m_format_input_stream(file f,
sstables::compression* cm, uint64_t offset, size_t len,
-class file_input_stream_options options);
+class file_input_stream_options options, reader_permit permit);
output_stream<char> make_compressed_file_m_format_output_stream(output_stream<char> out,
sstables::compression* cm,


@@ -16,6 +16,7 @@
#include <list>
#include <map>
#include <vector>
+#include <array>
#include <algorithm>
#include <iterator>
#include <cassert>


@@ -1070,10 +1070,9 @@ public:
future<> close() noexcept {
// index_bound::close must not fail
-return close(_lower_bound).then([this] {
-if (_upper_bound) {
-return close(*_upper_bound);
-}
+auto close_lb = close(_lower_bound);
+auto close_ub = _upper_bound ? close(*_upper_bound) : make_ready_future<>();
+return when_all(std::move(close_lb), std::move(close_ub)).discard_result().finally([this] {
if (_local_index_cache) {
return _local_index_cache->evict_gently();
}
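This rewrite makes the two index-bound closes independent: both close futures are created up front and joined with when_all, so a failure in one no longer prevents the other (or the cache eviction in the finally block) from running. Below is a plain-C++ analogue of the same "always run both cleanups, then surface the first error" pattern, with exceptions standing in for failed futures; note the real code additionally runs both closes concurrently, which this sequential sketch does not attempt.

```cpp
#include <exception>
#include <iostream>
#include <stdexcept>

void close_lower() { std::cout << "lower bound closed\n"; }
void close_upper() { throw std::runtime_error("upper close failed"); }

void close_both() {
    std::exception_ptr err;
    // Each cleanup always runs; in the chained version a failure in the
    // first close would have skipped everything after it.
    try { close_lower(); } catch (...) { err = std::current_exception(); }
    try { close_upper(); } catch (...) { if (!err) err = std::current_exception(); }
    if (err) std::rethrow_exception(err);
}

int main() {
    try {
        close_both();
    } catch (const std::exception& e) {
        std::cout << "close_both reported: " << e.what() << "\n";
    }
}
```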


@@ -1142,7 +1142,7 @@ private:
}
index_reader& get_index_reader() {
if (!_index_reader) {
-auto caching = use_caching(!_slice.options.contains(query::partition_slice::option::bypass_cache));
+auto caching = use_caching(global_cache_index_pages && !_slice.options.contains(query::partition_slice::option::bypass_cache));
_index_reader = std::make_unique<index_reader>(_sst, _consumer.permit(), _consumer.io_priority(),
_consumer.trace_state(), caching);
}


@@ -1319,7 +1319,7 @@ private:
}
index_reader& get_index_reader() {
if (!_index_reader) {
-auto caching = use_caching(!_slice.options.contains(query::partition_slice::option::bypass_cache));
+auto caching = use_caching(global_cache_index_pages && !_slice.options.contains(query::partition_slice::option::bypass_cache));
_index_reader = std::make_unique<index_reader>(_sst, _consumer.permit(), _consumer.io_priority(),
_consumer.trace_state(), caching);
}
@@ -1754,9 +1754,7 @@ public:
_monitor.on_read_started(_context->reader_position());
}
public:
-void on_out_of_clustering_range() override {
-push_mutation_fragment(mutation_fragment_v2(*_schema, _permit, partition_end()));
-}
+void on_out_of_clustering_range() override { }
virtual future<> fast_forward_to(const dht::partition_range& pr) override {
on_internal_error(sstlog, "mx_crawling_sstable_mutation_reader: doesn't support fast_forward_to(const dht::partition_range&)");
}


@@ -14,6 +14,7 @@
#include "m_format_read_helpers.hh"
#include "sstables/mx/parsers.hh"
#include "sstables/index_entry.hh"
+#include <seastar/core/circular_buffer.hh>
namespace sstables {


@@ -77,6 +77,18 @@ thread_local disk_error_signal_type sstable_write_error;
namespace sstables {
+// The below flag governs the mode of index file page caching used by the index
+// reader.
+//
+// If set to true, the reader will read and/or populate a common global cache,
+// which shares its capacity with the row cache. If false, the reader will use
+// BYPASS CACHE semantics for index caching.
+//
+// This flag is intended to be a temporary hack. The goal is to eventually
+// solve index caching problems via a smart cache replacement policy.
+//
+thread_local utils::updateable_value<bool> global_cache_index_pages(false);
logging::logger sstlog("sstable");
// Because this is a noop and won't hold any state, it is better to use a global than a
@@ -2275,7 +2287,7 @@ input_stream<char> sstable::data_stream(uint64_t pos, size_t len, const io_prior
options.read_ahead = 4;
options.dynamic_adjustments = std::move(history);
-file f = make_tracked_file(_data_file, std::move(permit));
+file f = make_tracked_file(_data_file, permit);
if (trace_state) {
f = tracing::make_traced_file(std::move(f), std::move(trace_state), format("{}:", get_filename()));
}
@@ -2284,10 +2296,10 @@ input_stream<char> sstable::data_stream(uint64_t pos, size_t len, const io_prior
if (_components->compression) {
if (_version >= sstable_version_types::mc) {
return make_compressed_file_m_format_input_stream(f, &_components->compression,
-pos, len, std::move(options));
+pos, len, std::move(options), permit);
} else {
return make_compressed_file_k_l_format_input_stream(f, &_components->compression,
-pos, len, std::move(options));
+pos, len, std::move(options), permit);
}
}
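The global_cache_index_pages flag added above is consumed at the get_index_reader() call sites earlier in this diff: index pages go through the shared global cache only when the flag is on and the query did not request BYPASS CACHE. A hedged sketch of that decision, with the live-updateable value reduced to a plain thread_local bool and `should_cache_index_pages` as an illustrative name:

```cpp
#include <iostream>

// Stand-in for the live-updateable configuration flag; defaults to off,
// matching global_cache_index_pages(false) above.
thread_local bool global_cache_index_pages = false;

struct query_options {
    bool bypass_cache = false;   // per-query BYPASS CACHE request
};

// Mirrors use_caching(global_cache_index_pages && !bypass) at the call
// sites: cache index pages only if the global flag is on AND the query
// allows caching.
bool should_cache_index_pages(const query_options& opts) {
    return global_cache_index_pages && !opts.bypass_cache;
}

int main() {
    query_options normal;
    query_options bypass;
    bypass.bypass_cache = true;

    std::cout << should_cache_index_pages(normal) << "\n"; // 0: flag is off
    global_cache_index_pages = true;                       // flipped at runtime
    std::cout << should_cache_index_pages(normal) << "\n"; // 1: uses the cache
    std::cout << should_cache_index_pages(bypass) << "\n"; // 0: bypass wins
}
```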

Some files were not shown because too many files have changed in this diff.