Merge '[branch-5.0] - minimal fix for crash caused by empty primary key range in LWT update' from Jan Ciołek

In #13001 we found a test case which causes a crash because it didn't handle `UNSET_VALUE` properly: ```python3 def test_unset_insert_where(cql, table2): p = unique_key_int() stmt = cql.prepare(f'INSERT INTO {table2} (p, c) VALUES ({p}, ?)') with pytest.raises(InvalidRequest, match="unset"): cql.execute(stmt, [UNSET_VALUE]) def test_unset_insert_where_lwt(cql, table2): p = unique_key_int() stmt = cql.prepare(f'INSERT INTO {table2} (p, c) VALUES ({p}, ?) IF NOT EXISTS') with pytest.raises(InvalidRequest, match="unset"): cql.execute(stmt, [UNSET_VALUE]) ``` This PR does an absolutely minimal change to fix the crash. It adds a check the moment before the crash would happen. To make sure that everything works correctly, and to detect any possible breaking changes, I wrote a bunch of tests that validate the current behavior. I also ported some tests from the `master` branch, at least the ones that were in line with the behavior on `branch-5.0`. The changes are the same as in #13133, just cherry-picked to `branch-5.0` Closes #13178 * github.com:scylladb/scylladb: cql-pytest/test_unset: port some tests from master branch cql-pytest/test_unset: test unset value in UPDATEs with LWT conditions cql-pytest/test_unset: test unset value in UPDATEs with IF EXISTS cql-pytest/test_unset: test unset value in UPDATE statements cql-pytest/test_unset: test unset value in INSERTs with IF NOT EXISTS cql-pytest/test_unset: test unset value in INSERT statements cas_request: fix crash on unset value in primary key with LWT
Update seastar submodule
2023-05-08 12:03:44 +03:00 · 2023-05-08 10:41:24 +03:00 · 2023-05-08 09:58:46 +03:00 · 2023-05-02 21:22:23 +03:00 · 2023-04-28 03:25:27 +02:00 · 2023-04-28 03:25:27 +02:00
219 changed files with 6454 additions and 1326 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/2
+++ b/2
@@ -60,7 +60,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=5.0.dev
+VERSION=5.0.13

 if test -f version
 then
--- a/alternator/controller.cc
+++ b/alternator/controller.cc
@@ -78,6 +78,11 @@ future<> controller::start_server() {

        _executor.start(std::ref(_gossiper), std::ref(_proxy), std::ref(_mm), std::ref(_sys_dist_ks), sharded_parameter(get_cdc_metadata, std::ref(_cdc_gen_svc)), _ssg.value()).get();
        _server.start(std::ref(_executor), std::ref(_proxy), std::ref(_gossiper)).get();
+        // Note: from this point on, if start_server() throws for any reason,
+        // it must first call stop_server() to stop the executor and server
+        // services we just started - or Scylla will cause an assertion
+        // failure when the controller object is destroyed in the exception
+        // unwinding.
        std::optional<uint16_t> alternator_port;
        if (_config.alternator_port()) {
            alternator_port = _config.alternator_port();
@@ -104,7 +109,13 @@ future<> controller::start_server() {
            }
            opts.erase("require_client_auth");
            opts.erase("truststore");
-            utils::configure_tls_creds_builder(creds.value(), std::move(opts)).get();
+            try {
+                utils::configure_tls_creds_builder(creds.value(), std::move(opts)).get();
+            } catch(...) {
+                logger.error("Failed to set up Alternator TLS credentials: {}", std::current_exception());
+                stop_server().get();
+                std::throw_with_nested(std::runtime_error("Failed to set up Alternator TLS credentials"));
+            }
        }
        bool alternator_enforce_authorization = _config.alternator_enforce_authorization();
        _server.invoke_on_all(
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -34,6 +34,7 @@
 #include "expressions.hh"
 #include "conditions.hh"
 #include "cql3/constants.hh"
+#include "cql3/util.hh"
 #include <optional>
 #include "utils/overloaded_functor.hh"
 #include "seastar/json/json_elements.hh"
@@ -46,6 +47,7 @@
 #include <seastar/core/coroutine.hh>
 #include <boost/range/adaptors.hpp>
 #include <boost/range/algorithm/find_end.hpp>
+#include <unordered_set>
 #include "service/storage_proxy.hh"
 #include "gms/gossiper.hh"
 #include "schema_registry.hh"
@@ -148,16 +150,16 @@ static void validate_table_name(const std::string& name) {
 // instead of each component individually as DynamoDB does.
 // The view_name() function assumes the table_name has already been validated
 // but validates the legality of index_name and the combination of both.
-static std::string view_name(const std::string& table_name, const std::string& index_name, const std::string& delim = ":") {
+static std::string view_name(const std::string& table_name, std::string_view index_name, const std::string& delim = ":") {
    static const std::regex valid_index_name_chars ("[a-zA-Z0-9_.-]*");
    if (index_name.length() < 3) {
        throw api_error::validation("IndexName must be at least 3 characters long");
    }
-    if (!std::regex_match(index_name.c_str(), valid_index_name_chars)) {
+    if (!std::regex_match(index_name.data(), valid_index_name_chars)) {
        throw api_error::validation(
                format("IndexName '{}' must satisfy regular expression pattern: [a-zA-Z0-9_.-]+", index_name));
    }
-    std::string ret = table_name + delim + index_name;
+    std::string ret = table_name + delim + std::string(index_name);
    if (ret.length() > max_table_name_length) {
        throw api_error::validation(
                format("The total length of TableName ('{}') and IndexName ('{}') cannot exceed {} characters",
@@ -166,7 +168,7 @@ static std::string view_name(const std::string& table_name, const std::string& i
    return ret;
 }

-static std::string lsi_name(const std::string& table_name, const std::string& index_name) {
+static std::string lsi_name(const std::string& table_name, std::string_view index_name) {
    return view_name(table_name, index_name, "!:");
 }

@@ -273,16 +275,16 @@ get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) {
    if (index_name) {
        if (index_name->IsString()) {
            orig_table_name = std::move(table_name);
-            table_name = view_name(orig_table_name, index_name->GetString());
+            table_name = view_name(orig_table_name, rjson::to_string_view(*index_name));
            type = table_or_view_type::gsi;
        } else {
            throw api_error::validation(
-                    format("Non-string IndexName '{}'", index_name->GetString()));
+                    format("Non-string IndexName '{}'", rjson::to_string_view(*index_name)));
        }
        // If no tables for global indexes were found, the index may be local
        if (!proxy.data_dictionary().has_schema(keyspace_name, table_name)) {
            type = table_or_view_type::lsi;
-            table_name = lsi_name(orig_table_name, index_name->GetString());
+            table_name = lsi_name(orig_table_name, rjson::to_string_view(*index_name));
        }
    }

@@ -432,6 +434,11 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
    rjson::add(table_description, "BillingModeSummary", rjson::empty_object());
    rjson::add(table_description["BillingModeSummary"], "BillingMode", "PAY_PER_REQUEST");
    rjson::add(table_description["BillingModeSummary"], "LastUpdateToPayPerRequestDateTime", rjson::value(creation_date_seconds));
+    // In PAY_PER_REQUEST billing mode, provisioned capacity should return 0
+    rjson::add(table_description, "ProvisionedThroughput", rjson::empty_object());
+    rjson::add(table_description["ProvisionedThroughput"], "ReadCapacityUnits", 0);
+    rjson::add(table_description["ProvisionedThroughput"], "WriteCapacityUnits", 0);
+    rjson::add(table_description["ProvisionedThroughput"], "NumberOfDecreasesToday", 0);

    std::unordered_map<std::string,std::string> key_attribute_types;
    // Add base table's KeySchema and collect types for AttributeDefinitions:
@@ -453,6 +460,11 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
            rjson::add(view_entry, "IndexName", rjson::from_string(index_name));
            // Add indexes's KeySchema and collect types for AttributeDefinitions:
            describe_key_schema(view_entry, *vptr, key_attribute_types);
+            // Add projection type
+            rjson::value projection = rjson::empty_object();
+            rjson::add(projection, "ProjectionType", "ALL");
+            // FIXME: we have to get ProjectionType from the schema when it is added
+            rjson::add(view_entry, "Projection", std::move(projection));
            // Local secondary indexes are marked by an extra '!' sign occurring before the ':' delimiter
            rjson::value& index_array = (delim_it > 1 && cf_name[delim_it-1] == '!') ? lsi_array : gsi_array;
            rjson::push_back(index_array, std::move(view_entry));
@@ -884,17 +896,23 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
    const rjson::value* gsi = rjson::find(request, "GlobalSecondaryIndexes");
    std::vector<schema_builder> view_builders;
    std::vector<sstring> where_clauses;
+    std::unordered_set<std::string> index_names;
    if (gsi) {
        if (!gsi->IsArray()) {
            co_return api_error::validation("GlobalSecondaryIndexes must be an array.");
        }
        for (const rjson::value& g : gsi->GetArray()) {
-            const rjson::value* index_name = rjson::find(g, "IndexName");
-            if (!index_name || !index_name->IsString()) {
+            const rjson::value* index_name_v = rjson::find(g, "IndexName");
+            if (!index_name_v || !index_name_v->IsString()) {
                co_return api_error::validation("GlobalSecondaryIndexes IndexName must be a string.");
            }
-            std::string vname(view_name(table_name, index_name->GetString()));
-            elogger.trace("Adding GSI {}", index_name->GetString());
+            std::string_view index_name = rjson::to_string_view(*index_name_v);
+            auto [it, added] = index_names.emplace(index_name);
+            if (!added) {
+                co_return api_error::validation(format("Duplicate IndexName '{}', ", index_name));
+            }
+            std::string vname(view_name(table_name, index_name));
+            elogger.trace("Adding GSI {}", index_name);
            // FIXME: read and handle "Projection" parameter. This will
            // require the MV code to copy just parts of the attrs map.
            schema_builder view_builder(keyspace_name, vname);
@@ -927,9 +945,10 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
            if  (!range_key.empty() && range_key != view_hash_key && range_key != view_range_key) {
                add_column(view_builder, range_key, attribute_definitions, column_kind::clustering_key);
            }
-            sstring where_clause = "\"" + view_hash_key + "\" IS NOT NULL";
+            sstring where_clause = format("{} IS NOT NULL", cql3::util::maybe_quote(view_hash_key));
            if (!view_range_key.empty()) {
-                where_clause = where_clause + " AND \"" + view_hash_key + "\" IS NOT NULL";
+                where_clause = format("{} AND {} IS NOT NULL", where_clause,
+                    cql3::util::maybe_quote(view_range_key));
            }
            where_clauses.push_back(std::move(where_clause));
            view_builders.emplace_back(std::move(view_builder));
@@ -942,12 +961,17 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
            throw api_error::validation("LocalSecondaryIndexes must be an array.");
        }
        for (const rjson::value& l : lsi->GetArray()) {
-            const rjson::value* index_name = rjson::find(l, "IndexName");
-            if (!index_name || !index_name->IsString()) {
+            const rjson::value* index_name_v = rjson::find(l, "IndexName");
+            if (!index_name_v || !index_name_v->IsString()) {
                throw api_error::validation("LocalSecondaryIndexes IndexName must be a string.");
            }
-            std::string vname(lsi_name(table_name, index_name->GetString()));
-            elogger.trace("Adding LSI {}", index_name->GetString());
+            std::string_view index_name = rjson::to_string_view(*index_name_v);
+            auto [it, added] = index_names.emplace(index_name);
+            if (!added) {
+                co_return api_error::validation(format("Duplicate IndexName '{}', ", index_name));
+            }
+            std::string vname(lsi_name(table_name, index_name));
+            elogger.trace("Adding LSI {}", index_name);
            if (range_key.empty()) {
                co_return api_error::validation("LocalSecondaryIndex requires that the base table have a range key");
            }
@@ -979,9 +1003,10 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
            // Note above we don't need to add virtual columns, as all
            // base columns were copied to view. TODO: reconsider the need
            // for virtual columns when we support Projection.
-            sstring where_clause = "\"" + view_hash_key + "\" IS NOT NULL";
+            sstring where_clause = format("{} IS NOT NULL", cql3::util::maybe_quote(view_hash_key));
            if (!view_range_key.empty()) {
-                where_clause = where_clause + " AND \"" + view_range_key + "\" IS NOT NULL";
+                where_clause = format("{} AND {} IS NOT NULL", where_clause,
+                    cql3::util::maybe_quote(view_range_key));
            }
            where_clauses.push_back(std::move(where_clause));
            view_builders.emplace_back(std::move(view_builder));
@@ -2173,6 +2198,9 @@ static attrs_to_get calculate_attrs_to_get(const rjson::value& req, std::unorder
        for (auto it = attributes_to_get.Begin(); it != attributes_to_get.End(); ++it) {
            attribute_path_map_add("AttributesToGet", ret, it->GetString());
        }
+        if (ret.empty()) {
+            throw api_error::validation("Empty AttributesToGet is not allowed. Consider using Select=COUNT instead.");
+        }
        return ret;
    } else if (has_projection_expression) {
        const rjson::value& projection_expression = req["ProjectionExpression"];
@@ -2577,8 +2605,8 @@ static bool hierarchy_actions(
                        // attr member so we can use add()
                        rjson::add_with_string_name(v, attr, std::move(*newv));
                    } else {
-                        throw api_error::validation(format("Can't remove document path {} - not present in item",
-                            subh.get_value()._path));
+                        // Removing a.b when a is a map but a.b doesn't exist
+                        // is silently ignored. It's not considered an error.
                    }
                } else {
                    throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
--- a/alternator/streams.cc
+++ b/alternator/streams.cc
@@ -143,19 +143,24 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
    auto table = find_table(_proxy, request);
    auto db = _proxy.data_dictionary();
    auto cfs = db.get_tables();
-    auto i = cfs.begin();
-    auto e = cfs.end();

    if (limit < 1) {
        throw api_error::validation("Limit must be 1 or more");
    }

-    // TODO: the unordered_map here is not really well suited for partial
-    // querying - we're sorting on local hash order, and creating a table
-    // between queries may or may not miss info. But that should be rare,
-    // and we can probably expect this to be a single call.
+    // # 12601 (maybe?) - sort the set of tables on ID. This should ensure we never
+    // generate duplicates in a paged listing here. Can obviously miss things if they 
+    // are added between paged calls and end up with a "smaller" UUID/ARN, but that 
+    // is to be expected.
+    std::sort(cfs.begin(), cfs.end(), [](const data_dictionary::table& t1, const data_dictionary::table& t2) {
+        return t1.schema()->id() < t2.schema()->id();
+    });
+
+    auto i = cfs.begin();
+    auto e = cfs.end();
+
    if (streams_start) {
-        i = std::find_if(i, e, [&](data_dictionary::table t) {
+        i = std::find_if(i, e, [&](const data_dictionary::table& t) {
            return t.schema()->id() == streams_start 
                && cdc::get_base_table(db.real_database(), *t.schema())
                && is_alternator_keyspace(t.schema()->ks_name())
--- a/alternator/ttl.cc
+++ b/alternator/ttl.cc
@@ -116,9 +116,6 @@ future<executor::request_return_type> executor::update_time_to_live(client_state

 future<executor::request_return_type> executor::describe_time_to_live(client_state& client_state, service_permit permit, rjson::value request) {
    _stats.api_operations.describe_time_to_live++;
-    if (!_proxy.data_dictionary().features().cluster_supports_alternator_ttl()) {
-        co_return api_error::unknown_operation("DescribeTimeToLive not yet supported. Experimental support is available if the 'alternator_ttl' experimental feature is enabled on all nodes.");
-    }
    schema_ptr schema = get_table(_proxy, request);
    std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
    rjson::value desc = rjson::empty_object();
--- a/alternator/ttl.hh
+++ b/alternator/ttl.hh
@@ -12,6 +12,7 @@
 #include <seastar/core/sharded.hh>
 #include <seastar/core/abort_source.hh>
 #include <seastar/core/semaphore.hh>
+#include "data_dictionary/data_dictionary.hh"

 namespace replica {
 class database;
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -624,7 +624,7 @@
                  },
                  {
                     "name":"kn",
-                     "description":"Comma seperated keyspaces name to snapshot",
+                     "description":"Keyspace(s) to snapshot. Multiple keyspaces can be provided using a comma-separated list. If omitted, snapshot all keyspaces.",
                     "required":false,
                     "allowMultiple":false,
                     "type":"string",
@@ -632,7 +632,7 @@
                  },
                  {
                     "name":"cf",
-                     "description":"the column family to snapshot",
+                     "description":"Table(s) to snapshot. Multiple tables (in a single keyspace) can be provided using a comma-separated list. If omitted, snapshot all tables in the given keyspace(s).",
                     "required":false,
                     "allowMultiple":false,
                     "type":"string",
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -593,6 +593,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        if (column_families.empty()) {
            column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
        }
+        apilog.debug("force_keyspace_compaction: keyspace={} tables={}", keyspace, column_families);
        return ctx.db.invoke_on_all([keyspace, column_families] (replica::database& db) -> future<> {
            auto table_ids = boost::copy_range<std::vector<utils::UUID>>(column_families | boost::adaptors::transformed([&] (auto& cf_name) {
                return db.find_uuid(keyspace, cf_name);
@@ -617,6 +618,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        if (column_families.empty()) {
            column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
        }
+        apilog.info("force_keyspace_cleanup: keyspace={} tables={}", keyspace, column_families);
        return ss.local().is_cleanup_allowed(keyspace).then([&ctx, keyspace,
                column_families = std::move(column_families)] (bool is_cleanup_allowed) mutable {
            if (!is_cleanup_allowed) {
@@ -635,7 +637,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
                // as a table can be dropped during loop below, let's find it before issuing the cleanup request.
                for (auto& id : table_ids) {
                    replica::table& t = db.find_column_family(id);
-                    co_await cm.perform_cleanup(db, &t);
+                    co_await t.perform_cleanup_compaction(db);
                }
                co_return;
            }).then([]{
@@ -645,6 +647,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    });

    ss::perform_keyspace_offstrategy_compaction.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> tables) -> future<json::json_return_type> {
+        apilog.info("perform_keyspace_offstrategy_compaction: keyspace={} tables={}", keyspace, tables);
        co_return co_await ctx.db.map_reduce0([&keyspace, &tables] (replica::database& db) -> future<bool> {
            bool needed = false;
            for (const auto& table : tables) {
@@ -658,6 +661,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    ss::upgrade_sstables.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
        bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);

+        apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, column_families, exclude_current_version);
        return ctx.db.invoke_on_all([=] (replica::database& db) {
            return do_for_each(column_families, [=, &db](sstring cfname) {
                auto& cm = db.get_compaction_manager();
@@ -669,23 +673,22 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        });
    }));

-    ss::force_keyspace_flush.set(r, [&ctx](std::unique_ptr<request> req) {
+    ss::force_keyspace_flush.set(r, [&ctx](std::unique_ptr<request> req) -> future<json::json_return_type> {
        auto keyspace = validate_keyspace(ctx, req->param);
        auto column_families = parse_tables(keyspace, ctx, req->query_parameters, "cf");
+        apilog.info("perform_keyspace_flush: keyspace={} tables={}", keyspace, column_families);
+        auto &db = ctx.db.local();
        if (column_families.empty()) {
-            column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
+            co_await db.flush_on_all(keyspace);
+        } else {
+            co_await db.flush_on_all(keyspace, std::move(column_families));
        }
-        return ctx.db.invoke_on_all([keyspace, column_families] (replica::database& db) {
-            return parallel_for_each(column_families, [&db, keyspace](const sstring& cf) mutable {
-                return db.find_column_family(keyspace, cf).flush();
-            });
-        }).then([]{
-                return make_ready_future<json::json_return_type>(json_void());
-        });
+        co_return json_void();
    });


    ss::decommission.set(r, [&ss](std::unique_ptr<request> req) {
+        apilog.info("decommission");
        return ss.local().decommission().then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -701,6 +704,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    ss::remove_node.set(r, [&ss](std::unique_ptr<request> req) {
        auto host_id = req->get_query_param("host_id");
        std::vector<sstring> ignore_nodes_strs= split(req->get_query_param("ignore_nodes"), ",");
+        apilog.info("remove_node: host_id={} ignore_nodes={}", host_id, ignore_nodes_strs);
        auto ignore_nodes = std::list<gms::inet_address>();
        for (std::string n : ignore_nodes_strs) {
            try {
@@ -773,6 +777,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    });

    ss::drain.set(r, [&ss](std::unique_ptr<request> req) {
+        apilog.info("drain");
        return ss.local().drain().then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -805,12 +810,14 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    });

    ss::stop_gossiping.set(r, [&ss](std::unique_ptr<request> req) {
+        apilog.info("stop_gossiping");
        return ss.local().stop_gossiping().then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });

    ss::start_gossiping.set(r, [&ss](std::unique_ptr<request> req) {
+        apilog.info("start_gossiping");
        return ss.local().start_gossiping().then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -907,6 +914,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_

    ss::rebuild.set(r, [&ss](std::unique_ptr<request> req) {
        auto source_dc = req->get_query_param("source_dc");
+        apilog.info("rebuild: source_dc={}", source_dc);
        return ss.local().rebuild(std::move(source_dc)).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -943,6 +951,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        // FIXME: We should truncate schema tables if more than one node in the cluster.
        auto& sp = service::get_storage_proxy();
        auto& fs = sp.local().features();
+        apilog.info("reset_local_schema");
        return db::schema_tables::recalculate_schema_version(sp, fs).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -950,6 +959,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_

    ss::set_trace_probability.set(r, [](std::unique_ptr<request> req) {
        auto probability = req->get_query_param("probability");
+        apilog.info("set_trace_probability: probability={}", probability);
        return futurize_invoke([probability] {
            double real_prob = std::stod(probability.c_str());
            return tracing::tracing::tracing_instance().invoke_on_all([real_prob] (auto& local_tracing) {
@@ -987,6 +997,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        auto ttl = req->get_query_param("ttl");
        auto threshold = req->get_query_param("threshold");
        auto fast = req->get_query_param("fast");
+        apilog.info("set_slow_query: enable={} ttl={} threshold={} fast={}", enable, ttl, threshold, fast);
        try {
            return tracing::tracing::tracing_instance().invoke_on_all([enable, ttl, threshold, fast] (auto& local_tracing) {
                if (threshold != "") {
@@ -1013,6 +1024,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        auto keyspace = validate_keyspace(ctx, req->param);
        auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");

+        apilog.info("enable_auto_compaction: keyspace={} tables={}", keyspace, tables);
        return set_tables_autocompaction(ctx, ss.local(), keyspace, tables, true);
    });

@@ -1020,6 +1032,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        auto keyspace = validate_keyspace(ctx, req->param);
        auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");

+        apilog.info("disable_auto_compaction: keyspace={} tables={}", keyspace, tables);
        return set_tables_autocompaction(ctx, ss.local(), keyspace, tables, false);
    });

@@ -1284,40 +1297,46 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
        });
    });

-    ss::take_snapshot.set(r, [&snap_ctl](std::unique_ptr<request> req) {
-        apilog.debug("take_snapshot: {}", req->query_parameters);
+    ss::take_snapshot.set(r, [&snap_ctl](std::unique_ptr<request> req) -> future<json::json_return_type> {
+        apilog.info("take_snapshot: {}", req->query_parameters);
        auto tag = req->get_query_param("tag");
        auto column_families = split(req->get_query_param("cf"), ",");
        auto sfopt = req->get_query_param("sf");
        auto sf = db::snapshot_ctl::skip_flush(strcasecmp(sfopt.c_str(), "true") == 0);

        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
-
-        auto resp = make_ready_future<>();
-        if (column_families.empty()) {
-            resp = snap_ctl.local().take_snapshot(tag, keynames, sf);
-        } else {
-            if (keynames.empty()) {
-                throw httpd::bad_param_exception("The keyspace of column families must be specified");
+        try {
+            if (column_families.empty()) {
+                co_await snap_ctl.local().take_snapshot(tag, keynames, sf);
+            } else {
+                if (keynames.empty()) {
+                    throw httpd::bad_param_exception("The keyspace of column families must be specified");
+                }
+                if (keynames.size() > 1) {
+                    throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
+                }
+                co_await snap_ctl.local().take_column_family_snapshot(keynames[0], column_families, tag, sf);
            }
-            if (keynames.size() > 1) {
-                throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
-            }
-            resp = snap_ctl.local().take_column_family_snapshot(keynames[0], column_families, tag, sf);
+            co_return json_void();
+        } catch (...) {
+            apilog.error("take_snapshot failed: {}", std::current_exception());
+            throw;
        }
-        return resp.then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
    });

-    ss::del_snapshot.set(r, [&snap_ctl](std::unique_ptr<request> req) {
+    ss::del_snapshot.set(r, [&snap_ctl](std::unique_ptr<request> req) -> future<json::json_return_type> {
+        apilog.info("del_snapshot: {}", req->query_parameters);
        auto tag = req->get_query_param("tag");
        auto column_family = req->get_query_param("cf");

        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
-        return snap_ctl.local().clear_snapshot(tag, keynames, column_family).then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
+        try {
+            co_await snap_ctl.local().clear_snapshot(tag, keynames, column_family);
+            co_return json_void();
+        } catch (...) {
+            apilog.error("del_snapshot failed: {}", std::current_exception());
+            throw;
+        }
    });

    ss::true_snapshots_size.set(r, [&snap_ctl](std::unique_ptr<request> req) {
@@ -1354,7 +1373,7 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
        if (!req_param<bool>(*req, "disable_snapshot", false)) {
            auto tag = format("pre-scrub-{:d}", db_clock::now().time_since_epoch().count());
            f = parallel_for_each(column_families, [&snap_ctl, keyspace, tag](sstring cf) {
-                return snap_ctl.local().take_column_family_snapshot(keyspace, cf, tag);
+                return snap_ctl.local().take_column_family_snapshot(keyspace, cf, tag, db::snapshot_ctl::skip_flush::no, db::snapshot_ctl::allow_view_snapshots::yes);
            });
        }

--- a/atomic_cell.cc
+++ b/atomic_cell.cc
@@ -87,19 +87,24 @@ compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
            // prefer expiring cells.
            return left.is_live_and_has_ttl() ? std::strong_ordering::greater : std::strong_ordering::less;
        }
-        if (left.is_live_and_has_ttl() && left.expiry() != right.expiry()) {
-            return left.expiry() <=> right.expiry();
+        if (left.is_live_and_has_ttl()) {
+            if (left.expiry() != right.expiry()) {
+                return left.expiry() <=> right.expiry();
+            } else {
+                // prefer the cell that was written later,
+                // so it survives longer after it expires, until purged.
+                return right.ttl() <=> left.ttl();
+            }
        }
    } else {
        // Both are deleted
-        if (left.deletion_time() != right.deletion_time()) {
-            // Origin compares big-endian serialized deletion time. That's because it
-            // delegates to AbstractCell.reconcile() which compares values after
-            // comparing timestamps, which in case of deleted cells will hold
-            // serialized expiry.
-            return (uint64_t) left.deletion_time().time_since_epoch().count()
-                   <=> (uint64_t) right.deletion_time().time_since_epoch().count();
-        }
+
+        // Origin compares big-endian serialized deletion time. That's because it
+        // delegates to AbstractCell.reconcile() which compares values after
+        // comparing timestamps, which in case of deleted cells will hold
+        // serialized expiry.
+        return (uint64_t) left.deletion_time().time_since_epoch().count()
+                <=> (uint64_t) right.deletion_time().time_since_epoch().count();
    }
    return std::strong_ordering::equal;
 }
--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -59,7 +59,7 @@ using namespace std::chrono_literals;
 logging::logger cdc_log("cdc");

 namespace cdc {
-static schema_ptr create_log_schema(const schema&, std::optional<utils::UUID> = {});
+static schema_ptr create_log_schema(const schema&, std::optional<utils::UUID> = {}, schema_ptr = nullptr);
 }

 static constexpr auto cdc_group_name = "cdc";
@@ -206,7 +206,7 @@ public:
                return;
            }

-            auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt);
+            auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt, log_schema);

            auto log_mut = log_schema 
                ? db::schema_tables::make_update_table_mutations(db, keyspace.metadata(), log_schema, new_log_schema, timestamp, false)
@@ -484,7 +484,7 @@ bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name) {
    return to_bytes(cdc_deleted_elements_column_prefix) + column_name;
 }

-static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid) {
+static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid, schema_ptr old) {
    schema_builder b(s.ks_name(), log_name(s.cf_name()));
    b.with_partitioner("com.scylladb.dht.CDCPartitioner");
    b.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
@@ -571,6 +571,20 @@ static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID>
        b.set_uuid(*uuid);
    }

+    /**
+     * #10473 - if we are redefining the log table, we need to ensure any dropped
+     * columns are registered in "dropped_columns" table, otherwise clients will not
+     * be able to read data older than now.
+     */
+    if (old) {
+        // not super efficient, but we don't do this often.
+        for (auto& col : old->all_columns()) {
+            if (!b.has_column({col.name(), col.name_as_text() })) {
+                b.without_column(col.name_as_text(), col.type, api::new_timestamp());
+            }
+        }
+    }
+
    return b.build();
 }

--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -1281,6 +1281,13 @@ private:

            const auto& key = _validator.previous_partition_key();

+            if (_validator.current_tombstone()) {
+                throw compaction_aborted_exception(
+                        _schema->ks_name(),
+                        _schema->cf_name(),
+                        "scrub compaction cannot handle invalid fragments with an active range tombstone change");
+            }
+
            // If the unexpected fragment is a partition end, we just drop it.
            // The only case a partition end is invalid is when it comes after
            // another partition end, and we can just drop it in that case.
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -317,9 +317,9 @@ future<> compaction_manager::run_custom_job(replica::table* t, sstables::compact

    auto job_ptr = std::make_unique<noncopyable_function<future<>(sstables::compaction_data&)>>(std::move(job));

-    task->compaction_done = with_semaphore(_maintenance_ops_sem, 1, [this, task, &job = *job_ptr] () mutable {
-        // take read lock for table, so major compaction and resharding can't proceed in parallel.
-        return with_lock(task->compaction_state.lock.for_read(), [this, task, &job] () mutable {
+    task->compaction_done = with_semaphore(_custom_jobs_sem, 1, [this, task, &job = *job_ptr] () mutable {
+            // We don't need to take task->compaction_state.lock.for_read() as it only serializes minor and major
+
            // Allow caller to know that task (e.g. reshape) was asked to stop while waiting for a chance to run.
            if (task->stopping) {
                throw sstables::compaction_stopped_exception(task->compacting_table->schema()->ks_name(), task->compacting_table->schema()->cf_name(),
@@ -335,7 +335,6 @@ future<> compaction_manager::run_custom_job(replica::table* t, sstables::compact
            // no need to register shared sstables because they're excluded from non-resharding
            // compaction and some of them may not even belong to current shard.
            return job(task->compaction_data);
-        });
    }).then_wrapped([this, task, job_ptr = std::move(job_ptr), type] (future<> f) {
        _stats.active_tasks--;
        _tasks.remove(task);
@@ -353,32 +352,50 @@ future<> compaction_manager::run_custom_job(replica::table* t, sstables::compact
    return task->compaction_done.get_future().then([task] {});
 }

+compaction_manager::compaction_reenabler::compaction_reenabler(compaction_manager& cm, replica::table* t)
+    : _cm(cm)
+    , _table(t)
+    , _compaction_state(cm.get_compaction_state(_table))
+    , _holder(_compaction_state.gate.hold())
+{
+    _compaction_state.compaction_disabled_counter++;
+    cmlog.debug("Temporarily disabled compaction for {}.{}. compaction_disabled_counter={}",
+            _table->schema()->ks_name(), _table->schema()->cf_name(), _compaction_state.compaction_disabled_counter);
+}
+
+compaction_manager::compaction_reenabler::compaction_reenabler(compaction_reenabler&& o) noexcept
+    : _cm(o._cm)
+    , _table(std::exchange(o._table, nullptr))
+    , _compaction_state(o._compaction_state)
+    , _holder(std::move(o._holder))
+{}
+
+compaction_manager::compaction_reenabler::~compaction_reenabler() {
+    // submit compaction request if we're the last holder of the gate which is still opened.
+    if (_table && --_compaction_state.compaction_disabled_counter == 0 && !_compaction_state.gate.is_closed()) {
+        cmlog.debug("Reenabling compaction for {}.{}",
+                _table->schema()->ks_name(), _table->schema()->cf_name());
+        try {
+            _cm.submit(_table);
+        } catch (...) {
+            cmlog.warn("compaction_reenabler could not reenable compaction for {}.{}: {}",
+                    _table->schema()->ks_name(), _table->schema()->cf_name(), std::current_exception());
+        }
+    }
+}
+
+future<compaction_manager::compaction_reenabler>
+compaction_manager::stop_and_disable_compaction(replica::table* t) {
+    compaction_reenabler cre(*this, t);
+    co_await stop_ongoing_compactions("user-triggered operation", t);
+    co_return cre;
+}
+
 future<>
 compaction_manager::run_with_compaction_disabled(replica::table* t, std::function<future<> ()> func) {
-    auto& c_state = _compaction_state[t];
-    auto holder = c_state.gate.hold();
+    compaction_reenabler cre = co_await stop_and_disable_compaction(t);

-    c_state.compaction_disabled_counter++;
-
-    std::exception_ptr err;
-    try {
-        co_await stop_ongoing_compactions("user-triggered operation", t);
-        co_await func();
-    } catch (...) {
-        err = std::current_exception();
-    }
-
-#ifdef DEBUG
-    assert(_compaction_state.contains(t));
-#endif
-    // submit compaction request if we're the last holder of the gate which is still opened.
-    if (--c_state.compaction_disabled_counter == 0 && !c_state.gate.is_closed()) {
-        submit(t);
-    }
-    if (err) {
-        std::rethrow_exception(err);
-    }
-    co_return;
+    co_await func();
 }

 void compaction_manager::task::setup_new_compaction() {
@@ -584,16 +601,11 @@ future<> compaction_manager::stop() {
    }
 }

-void compaction_manager::really_do_stop() {
-    if (_state == state::none || _state == state::stopped) {
-        return;
-    }
-
-    _state = state::stopped;
+future<> compaction_manager::really_do_stop() {
    cmlog.info("Asked to stop");
    // Reset the metrics registry
    _metrics.clear();
-    _stop_future.emplace(stop_ongoing_compactions("shutdown").then([this] () mutable {
+    return stop_ongoing_compactions("shutdown").then([this] () mutable {
        reevaluate_postponed_compactions();
        return std::move(_waiting_reevalution);
    }).then([this] {
@@ -601,12 +613,34 @@ void compaction_manager::really_do_stop() {
        _compaction_submission_timer.cancel();
        cmlog.info("Stopped");
        return _compaction_controller.shutdown();
-    }));
+    });
+}
+
+template <typename Ex>
+requires std::is_base_of_v<std::exception, Ex> &&
+requires (const Ex& ex) {
+    { ex.code() } noexcept -> std::same_as<const std::error_code&>;
+}
+auto swallow_enospc(const Ex& ex) noexcept {
+    if (ex.code().value() != ENOSPC) {
+        return make_exception_future<>(std::make_exception_ptr(ex));
+    }
+
+    cmlog.warn("Got ENOSPC on stop, ignoring...");
+    return make_ready_future<>();
 }

 void compaction_manager::do_stop() noexcept {
+    if (_state == state::none || _state == state::stopped) {
+        return;
+    }
+
    try {
-        really_do_stop();
+        _state = state::stopped;
+        _stop_future = really_do_stop()
+            .handle_exception_type([] (const std::system_error& ex) { return swallow_enospc(ex); })
+            .handle_exception_type([] (const storage_io_error& ex) { return swallow_enospc(ex); })
+        ;
    } catch (...) {
        try {
            cmlog.error("Failed to stop the manager: {}", std::current_exception());
@@ -742,6 +776,7 @@ future<> compaction_manager::perform_offstrategy(replica::table* t) {
                _stats.active_tasks++;
                task->setup_new_compaction();

+              return with_scheduling_group(_maintenance_sg.cpu, [this, task, t] {
                return t->run_offstrategy_compaction(task->compaction_data).then_wrapped([this, task, schema = t->schema()] (future<> f) mutable {
                    _stats.active_tasks--;
                    task->finish_compaction();
@@ -763,6 +798,7 @@ future<> compaction_manager::perform_offstrategy(replica::table* t) {
                    }
                    return make_ready_future<stop_iteration>(stop_iteration::yes);
                });
+              });
            });
        });
    }).finally([this, task] {
@@ -810,7 +846,8 @@ future<> compaction_manager::rewrite_sstables(replica::table* t, sstables::compa
            auto sstable_level = sst->get_sstable_level();
            auto run_identifier = sst->run_identifier();
            auto sstable_set_snapshot = can_purge ? std::make_optional(t.get_sstable_set()) : std::nullopt;
-            auto descriptor = sstables::compaction_descriptor({ sst }, std::move(sstable_set_snapshot), _maintenance_sg.io,
+            // FIXME: this compaction should run with maintenance priority.
+            auto descriptor = sstables::compaction_descriptor({ sst }, std::move(sstable_set_snapshot), service::get_local_compaction_priority(),
                sstable_level, sstables::compaction_descriptor::default_max_sstable_bytes, run_identifier, options);

            // Releases reference to cleaned sstable such that respective used disk space can be freed.
@@ -819,8 +856,9 @@ future<> compaction_manager::rewrite_sstables(replica::table* t, sstables::compa
            };

            auto maintenance_permit = co_await seastar::get_units(_maintenance_ops_sem, 1);
-            // Take write lock for table to serialize cleanup/upgrade sstables/scrub with major compaction/reshape/reshard.
-            auto write_lock_holder = co_await _compaction_state[&t].lock.hold_write_lock();
+            // FIXME: acquiring the read lock is not needed after acquiring the _maintenance_ops_sem
+            // only major compaction needs to acquire the write lock to synchronize with regular compaction.
+            auto lock_holder = co_await _compaction_state[&t].lock.hold_read_lock();

            _stats.pending_tasks--;
            _stats.active_tasks++;
@@ -852,7 +890,7 @@ future<> compaction_manager::rewrite_sstables(replica::table* t, sstables::compa
            };

            compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_compaction_controller.backlog_of_shares(200), _available_memory));
-            completed = co_await with_scheduling_group(_maintenance_sg.cpu, std::ref(perform_rewrite));
+            completed = co_await with_scheduling_group(_compaction_controller.sg(), std::ref(perform_rewrite));
        } while (!completed);
    };

--- a/compaction/compaction_manager.hh
+++ b/compaction/compaction_manager.hh
@@ -147,6 +147,8 @@ private:
    // If the operation must be serialized with regular, then the per-table write lock must be taken.
    seastar::named_semaphore _maintenance_ops_sem = {1, named_semaphore_exception_factory{"maintenance operation"}};

+    seastar::named_semaphore _custom_jobs_sem = {1, named_semaphore_exception_factory{"custom jobs"}};
+
    std::function<void()> compaction_submission_callback();
    // all registered tables are reevaluated at a constant interval.
    // Submission is a NO-OP when there's nothing to do, so it's fine to call it regularly.
@@ -233,7 +235,7 @@ public:

    // Stop all fibers, without waiting. Safe to be called multiple times.
    void do_stop() noexcept;
-    void really_do_stop();
+    future<> really_do_stop();

    // Submit a table to be compacted.
    void submit(replica::table* t);
@@ -269,6 +271,31 @@ public:
    // parameter job is a function that will carry the operation
    future<> run_custom_job(replica::table* t, sstables::compaction_type type, noncopyable_function<future<>(sstables::compaction_data&)> job);

+    class compaction_reenabler {
+        compaction_manager& _cm;
+        replica::table* _table;
+        compaction_state& _compaction_state;
+        gate::holder _holder;
+
+    public:
+        compaction_reenabler(compaction_manager&, replica::table*);
+        compaction_reenabler(compaction_reenabler&&) noexcept;
+
+        ~compaction_reenabler();
+
+        replica::table* compacting_table() const noexcept {
+            return _table;
+        }
+
+        const compaction_state& compaction_state() const noexcept {
+            return _compaction_state;
+        }
+    };
+
+    // Disable compaction temporarily for a table t.
+    // Caller should call the compaction_reenabler::reenable
+    future<compaction_reenabler> stop_and_disable_compaction(replica::table* t);
+
    // Run a function with compaction temporarily disabled for a table T.
    future<> run_with_compaction_disabled(replica::table* t, std::function<future<> ()> func);

--- a/compaction/leveled_compaction_strategy.cc
+++ b/compaction/leveled_compaction_strategy.cc
@@ -69,7 +69,11 @@ compaction_descriptor leveled_compaction_strategy::get_major_compaction_job(tabl
 }

 void leveled_compaction_strategy::notify_completion(const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) {
-    if (removed.empty() || added.empty()) {
+    // All the update here is only relevant for regular compaction's round-robin picking policy, and if
+    // last_compacted_keys wasn't generated by regular, it means regular is disabled since last restart,
+    // therefore we can skip the updates here until regular runs for the first time. Once it runs,
+    // it will be able to generate last_compacted_keys correctly by looking at metadata of files.
+    if (removed.empty() || added.empty() || !_last_compacted_keys) {
        return;
    }
    auto min_level = std::numeric_limits<uint32_t>::max();
--- a/compaction/time_window_compaction_strategy.cc
+++ b/compaction/time_window_compaction_strategy.cc
@@ -217,6 +217,7 @@ time_window_compaction_strategy::get_sstables_for_compaction(table_state& table_
    auto compaction_time = gc_clock::now();

    if (candidates.empty()) {
+        _estimated_remaining_tasks = 0;
        return compaction_descriptor();
    }

--- a/configure.py
+++ b/configure.py
@@ -615,6 +615,8 @@ arg_parser.add_argument('--static-yaml-cpp', dest='staticyamlcpp', action='store
                        help='Link libyaml-cpp statically')
 arg_parser.add_argument('--tests-debuginfo', action='store', dest='tests_debuginfo', type=int, default=0,
                        help='Enable(1)/disable(0)compiler debug information generation for tests')
+arg_parser.add_argument('--perf-tests-debuginfo', action='store', dest='perf_tests_debuginfo', type=int, default=0,
+                        help='Enable(1)/disable(0)compiler debug information generation for perf tests')
 arg_parser.add_argument('--python', action='store', dest='python', default='python3',
                        help='Python3 path')
 arg_parser.add_argument('--split-dwarf', dest='split_dwarf', action='store_true', default=False,
@@ -1377,6 +1379,7 @@ linker_flags = linker_flags(compiler=args.cxx)

 dbgflag = '-g -gz' if args.debuginfo else ''
 tests_link_rule = 'link' if args.tests_debuginfo else 'link_stripped'
+perf_tests_link_rule = 'link' if args.perf_tests_debuginfo else 'link_stripped'

 # Strip if debuginfo is disabled, otherwise we end up with partial
 # debug info from the libraries we static link with
@@ -1901,7 +1904,8 @@ with open(buildfile_tmp, 'w') as f:
                    # So we strip the tests by default; The user can very
                    # quickly re-link the test unstripped by adding a "_g"
                    # to the test name, e.g., "ninja build/release/testname_g"
-                    f.write('build $builddir/{}/{}: {}.{} {} | {} {}\n'.format(mode, binary, tests_link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
+                    link_rule = perf_tests_link_rule if binary.startswith('test/perf/') else tests_link_rule
+                    f.write('build $builddir/{}/{}: {}.{} {} | {} {}\n'.format(mode, binary, link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
                    f.write('   libs = {}\n'.format(local_libs))
                    f.write('build $builddir/{}/{}_g: {}.{} {} | {} {}\n'.format(mode, binary, regular_link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
                    f.write('   libs = {}\n'.format(local_libs))
@@ -2004,7 +2008,8 @@ with open(buildfile_tmp, 'w') as f:
                f.write('build {}: cxx.{} {} || {}\n'.format(obj, mode, cc, ' '.join(serializers)))
                if cc.endswith('Parser.cpp'):
                    # Unoptimized parsers end up using huge amounts of stack space and overflowing their stack
-                    flags = '-O1'
+                    flags = '-O1' if modes[mode]['optimization-level'] in ['0', 'g', 's'] else ''
+
                    if has_sanitize_address_use_after_scope:
                        flags += ' -fno-sanitize-address-use-after-scope'
                    f.write('  obj_cxxflags = %s\n' % flags)
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -1386,7 +1386,7 @@ serviceLevelOrRoleName returns [sstring name]
 						 std::transform($name.begin(), $name.end(), $name.begin(), ::tolower); }
 | t=STRING_LITERAL     { $name = sstring($t.text); }
 | t=QUOTED_NAME        { $name = sstring($t.text); }
-| k=unreserved_keyword { $name = sstring($t.text); 
+| k=unreserved_keyword { $name = k;
 						 std::transform($name.begin(), $name.end(), $name.begin(), ::tolower);}
 | QMARK {add_recognition_error("Bind variables cannot be used for service levels or role names");}
 ;
--- a/cql3/cql3_type.cc
+++ b/cql3/cql3_type.cc
@@ -12,6 +12,7 @@

 #include "cql3_type.hh"
 #include "cql3/util.hh"
+#include "exceptions/exceptions.hh"
 #include "ut_name.hh"
 #include "data_dictionary/data_dictionary.hh"
 #include "data_dictionary/user_types_metadata.hh"
@@ -436,7 +437,20 @@ sstring maybe_quote(const sstring& identifier) {
    }

    if (!need_quotes) {
-        return identifier;
+        // A seemingly valid identifier matching [a-z][a-z0-9_]* may still
+        // need quoting if it is a CQL keyword, e.g., "to" (see issue #9450).
+        // While our parser Cql.g has different production rules for different
+        // types of identifiers (column names, table names, etc.), all of
+        // these behave identically for alphanumeric strings: they exclude
+        // many keywords but allow keywords listed as "unreserved keywords".
+        // So we can use any of them, for example cident.
+        try {
+            cql3::util::do_with_parser(identifier, std::mem_fn(&cql3_parser::CqlParser::cident));
+            return identifier;
+        } catch(exceptions::syntax_exception&) {
+            // This alphanumeric string is not a valid identifier, so fall
+            // through to have it quoted:
+        }
    }
    if (num_quotes == 0) {
        return make_sstring("\"", identifier, "\"");
--- a/cql3/cql_statement.hh
+++ b/cql3/cql_statement.hh
@@ -81,9 +81,7 @@ public:
    virtual seastar::future<seastar::shared_ptr<cql_transport::messages::result_message>>
        execute(query_processor& qp, service::query_state& state, const query_options& options) const = 0;

-    virtual bool depends_on_keyspace(const seastar::sstring& ks_name) const = 0;
-
-    virtual bool depends_on_column_family(const seastar::sstring& cf_name) const = 0;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const = 0;

    virtual seastar::shared_ptr<const metadata> get_result_metadata() const = 0;

--- a/cql3/expr/expression.cc
+++ b/cql3/expr/expression.cc
@@ -103,10 +103,50 @@ managed_bytes_opt get_value(const column_value& col, const column_value_eval_bag
        if (!col_type->is_map()) {
            throw exceptions::invalid_request_exception(format("subscripting non-map column {}", cdef->name_as_text()));
        }
-        const auto deserialized = cdef->type->deserialize(managed_bytes_view(*data.other_columns[data.sel.index_of(*cdef)]));
+        int32_t index = data.sel.index_of(*cdef);
+        if (index == -1) {
+            throw std::runtime_error(
+                    format("Column definition {} does not match any column in the query selection",
+                    cdef->name_as_text()));
+        }
+        const managed_bytes_opt& serialized = data.other_columns[index];
+        if (!serialized) {
+            // For null[i] we return null.
+            return std::nullopt;
+        }
+        const auto deserialized = cdef->type->deserialize(managed_bytes_view(*serialized));
        const auto& data_map = value_cast<map_type_impl::native_type>(deserialized);
        const auto key = evaluate(*col.sub, options);
        auto&& key_type = col_type->name_comparator();
+        if (key.is_null()) {
+            // For m[null] return null.
+            // This is different from Cassandra - which treats m[null]
+            // as an invalid request error. But m[null] -> null is more
+            // consistent with our usual null treatement (e.g., both
+            // null[2] and null < 2 return null). It will also allow us
+            // to support non-constant subscripts (e.g., m[a]) where "a"
+            // may be null in some rows and non-null in others, and it's
+            // not an error.
+            return std::nullopt;
+        }
+        if (key.is_unset_value()) {
+            // An m[?] with ? bound to UNSET_VALUE is a invalid query.
+            // We could have detected it earlier while binding, but since
+            // we currently don't, we must protect the following code
+            // which can't work with an UNSET_VALUE. Note that the
+            // placement of this check here means that in an empty table,
+            // where we never need to evaluate the filter expression, this
+            // error will not be detected.
+            throw exceptions::invalid_request_exception(
+                format("Unsupported unset map key for column {}",
+                    cdef->name_as_text()));
+        }
+        if (key.type != key_type) {
+            // This can't happen, we always verify the index type earlier.
+            throw std::logic_error(
+                format("Tried to evaluate expression with wrong type for subscript of {}",
+                    cdef->name_as_text()));
+        }
        const auto found = key.view().with_linearized([&] (bytes_view key_bv) {
            using entry = std::pair<data_value, data_value>;
            return std::find_if(data_map.cbegin(), data_map.cend(), [&] (const entry& element) {
@@ -121,8 +161,16 @@ managed_bytes_opt get_value(const column_value& col, const column_value_eval_bag
        case column_kind::clustering_key:
            return managed_bytes(data.clustering_key[cdef->id]);
        case column_kind::static_column:
-        case column_kind::regular_column:
-            return managed_bytes_opt(data.other_columns[data.sel.index_of(*cdef)]);
+            [[fallthrough]];
+        case column_kind::regular_column: {
+            int32_t index = data.sel.index_of(*cdef);
+            if (index == -1) {
+                throw std::runtime_error(
+                        format("Column definition {} does not match any column in the query selection",
+                        cdef->name_as_text()));
+            }
+            return managed_bytes_opt(data.other_columns[index]);
+        }
        default:
            throw exceptions::unsupported_operation_exception("Unknown column kind");
        }
@@ -1245,7 +1293,7 @@ expression search_and_replace(const expression& e,
                    };
                },
                [&] (const binary_operator& oper) -> expression {
-                    return binary_operator(recurse(oper.lhs), oper.op, recurse(oper.rhs));
+                    return binary_operator(recurse(oper.lhs), oper.op, recurse(oper.rhs), oper.order);
                },
                [&] (const column_mutation_attribute& cma) -> expression {
                    return column_mutation_attribute{cma.kind, recurse(cma.column)};
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -953,7 +953,7 @@ bool query_processor::migration_subscriber::should_invalidate(
        sstring ks_name,
        std::optional<sstring> cf_name,
        ::shared_ptr<cql_statement> statement) {
-    return statement->depends_on_keyspace(ks_name) && (!cf_name || statement->depends_on_column_family(*cf_name));
+    return statement->depends_on(ks_name, cf_name);
 }

 future<> query_processor::query_internal(
--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -514,7 +514,7 @@ statement_restrictions::statement_restrictions(data_dictionary::database db,
    }

    if (!_nonprimary_key_restrictions->empty()) {
-        if (_has_queriable_regular_index) {
+        if (_has_queriable_regular_index && _partition_range_is_simple) {
            _uses_secondary_indexing = true;
        } else if (!allow_filtering) {
            throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
--- a/cql3/result_set.hh
+++ b/cql3/result_set.hh
@@ -165,7 +165,7 @@ public:

    template<typename RowComparator>
    void sort(const RowComparator& cmp) {
-        std::sort(_rows.begin(), _rows.end(), std::ref(cmp));
+        std::sort(_rows.begin(), _rows.end(), cmp);
    }

    metadata& get_metadata();
--- a/cql3/selection/field_selector.hh
+++ b/cql3/selection/field_selector.hh
@@ -83,7 +83,7 @@ public:

    virtual sstring assignment_testable_source_context() const override {
        auto&& name = _type->field_name(_field);
-        auto sname = sstring(reinterpret_cast<const char*>(name.begin(), name.size()));
+        auto sname = std::string_view(reinterpret_cast<const char*>(name.data()), name.size());
        return format("{}.{}", _selected, sname);
    }

--- a/cql3/selection/selection.cc
+++ b/cql3/selection/selection.cc
@@ -422,11 +422,16 @@ bool result_set_builder::restrictions_filter::do_filter(const selection& selecti
    }

    auto clustering_columns_restrictions = _restrictions->get_clustering_columns_restrictions();
-    if (dynamic_pointer_cast<cql3::restrictions::multi_column_restriction>(clustering_columns_restrictions)) {
+    bool has_multi_col_clustering_restrictions =
+        dynamic_pointer_cast<cql3::restrictions::multi_column_restriction>(clustering_columns_restrictions) != nullptr;
+    if (has_multi_col_clustering_restrictions) {
        clustering_key_prefix ckey = clustering_key_prefix::from_exploded(clustering_key);
-        return expr::is_satisfied_by(
+        bool multi_col_clustering_satisfied = expr::is_satisfied_by(
                clustering_columns_restrictions->expression,
                partition_key, clustering_key, static_row, row, selection, _options);
+        if (!multi_col_clustering_satisfied) {
+            return false;
+        }
    }

    auto static_row_iterator = static_row.iterator();
@@ -474,6 +479,13 @@ bool result_set_builder::restrictions_filter::do_filter(const selection& selecti
            if (_skip_ck_restrictions) {
                continue;
            }
+            if (has_multi_col_clustering_restrictions) {
+                // Mixing multi column and single column restrictions on clustering
+                // key columns is forbidden.
+                // Since there are multi column restrictions we have to skip
+                // evaluating single column restrictions or we will get an error.
+                continue;
+            }
            auto clustering_key_restrictions_map = _restrictions->get_single_column_clustering_key_restrictions();
            auto restr_it = clustering_key_restrictions_map.find(cdef);
            if (restr_it == clustering_key_restrictions_map.end()) {
--- a/cql3/statements/authentication_statement.cc
+++ b/cql3/statements/authentication_statement.cc
@@ -18,13 +18,7 @@ uint32_t cql3::statements::authentication_statement::get_bound_terms() const {
    return 0;
 }

-bool cql3::statements::authentication_statement::depends_on_keyspace(
-                const sstring& ks_name) const {
-    return false;
-}
-
-bool cql3::statements::authentication_statement::depends_on_column_family(
-                const sstring& cf_name) const {
+bool cql3::statements::authentication_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
    return false;
 }

--- a/cql3/statements/authentication_statement.hh
+++ b/cql3/statements/authentication_statement.hh
@@ -27,9 +27,7 @@ public:

    uint32_t get_bound_terms() const override;

-    bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    bool depends_on_column_family(const sstring& cf_name) const override;
+    bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    future<> check_access(query_processor& qp, const service::client_state& state) const override;

--- a/cql3/statements/authorization_statement.cc
+++ b/cql3/statements/authorization_statement.cc
@@ -20,13 +20,7 @@ uint32_t cql3::statements::authorization_statement::get_bound_terms() const {
    return 0;
 }

-bool cql3::statements::authorization_statement::depends_on_keyspace(
-                const sstring& ks_name) const {
-    return false;
-}
-
-bool cql3::statements::authorization_statement::depends_on_column_family(
-                const sstring& cf_name) const {
+bool cql3::statements::authorization_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
    return false;
 }

--- a/cql3/statements/authorization_statement.hh
+++ b/cql3/statements/authorization_statement.hh
@@ -31,9 +31,7 @@ public:

    uint32_t get_bound_terms() const override;

-    bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    bool depends_on_column_family(const sstring& cf_name) const override;
+    bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    future<> check_access(query_processor& qp, const service::client_state& state) const override;

--- a/cql3/statements/batch_statement.cc
+++ b/cql3/statements/batch_statement.cc
@@ -70,14 +70,9 @@ batch_statement::batch_statement(type type_,
 {
 }

-bool batch_statement::depends_on_keyspace(const sstring& ks_name) const
+bool batch_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
 {
-    return false;
-}
-
-bool batch_statement::depends_on_column_family(const sstring& cf_name) const
-{
-    return false;
+    return boost::algorithm::any_of(_statements, [&ks_name, &cf_name] (auto&& s) { return s.statement->depends_on(ks_name, cf_name); });
 }

 uint32_t batch_statement::get_bound_terms() const
@@ -259,6 +254,10 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
    if (options.getSerialConsistency() == null)
        throw new InvalidRequestException("Invalid empty serial consistency level");
 #endif
+    for (size_t i = 0; i < _statements.size(); ++i) {
+        _statements[i].statement->validate_primary_key_restrictions(options.for_statement(i));
+    }
+
    if (_has_conditions) {
        ++_stats.cas_batches;
        _stats.statements_in_cas_batches += _statements.size();
--- a/cql3/statements/batch_statement.hh
+++ b/cql3/statements/batch_statement.hh
@@ -88,9 +88,7 @@ public:
                    std::unique_ptr<attributes> attrs,
                    cql_stats& stats);

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual uint32_t get_bound_terms() const override;

--- a/cql3/statements/cas_request.cc
+++ b/cql3/statements/cas_request.cc
@@ -121,6 +121,9 @@ std::optional<mutation> cas_request::apply(foreign_ptr<lw_shared_ptr<query::resu

 const update_parameters::prefetch_data::row* cas_request::find_old_row(const cas_row_update& op) const {
    static const clustering_key empty_ckey = clustering_key::make_empty();
+    if (_key.empty()) {
+        throw exceptions::invalid_request_exception("partition key ranges empty - probably caused by an unset value");
+    }
    const partition_key& pkey = _key.front().start()->value().key().value();
    // If a statement has only static columns conditions, we must ignore its clustering columns
    // restriction when choosing a row to check the conditions, i.e. choose any partition row,
@@ -134,6 +137,9 @@ const update_parameters::prefetch_data::row* cas_request::find_old_row(const cas
    // Another case when we pass an empty clustering key prefix is apparently when the table
    // doesn't have any clustering key columns and the clustering key range is empty (open
    // ended on both sides).
+    if (op.ranges.empty()) {
+        throw exceptions::invalid_request_exception("clustering key ranges empty - probably caused by an unset value");
+    }
    const clustering_key& ckey = !op.statement.has_only_static_column_conditions() && op.ranges.front().start() ?
        op.ranges.front().start()->value() : empty_ckey;
    return _rows.find_row(pkey, ckey);
--- a/cql3/statements/cf_prop_defs.cc
+++ b/cql3/statements/cf_prop_defs.cc
@@ -20,6 +20,7 @@
 #include "gms/feature_service.hh"
 #include "tombstone_gc_extension.hh"
 #include "tombstone_gc.hh"
+#include "utils/bloom_calculations.hh"

 #include <boost/algorithm/string/predicate.hpp>

@@ -145,6 +146,16 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
        throw exceptions::configuration_exception(KW_MAX_INDEX_INTERVAL + " must be greater than " + KW_MIN_INDEX_INTERVAL);
    }

+    if (get_simple(KW_BF_FP_CHANCE)) {
+        double bloom_filter_fp_chance = get_double(KW_BF_FP_CHANCE, 0/*not used*/);
+        double min_bloom_filter_fp_chance = utils::bloom_calculations::min_supported_bloom_filter_fp_chance();
+        if (bloom_filter_fp_chance <= min_bloom_filter_fp_chance || bloom_filter_fp_chance > 1.0) {
+            throw exceptions::configuration_exception(format(
+                "{} must be larger than {} and less than or equal to 1.0 (got {})",
+                KW_BF_FP_CHANCE, min_bloom_filter_fp_chance, bloom_filter_fp_chance));
+        }
+    }
+
    speculative_retry::from_sstring(get_string(KW_SPECULATIVE_RETRY, speculative_retry(speculative_retry::type::NONE, 0).to_sstring()));
 }

--- a/cql3/statements/cf_properties.hh
+++ b/cql3/statements/cf_properties.hh
@@ -13,6 +13,7 @@

 #include "cql3/statements/cf_prop_defs.hh"
 #include "cql3/column_identifier.hh"
+#include "data_dictionary/data_dictionary.hh"

 namespace cql3 {

--- a/cql3/statements/modification_statement.cc
+++ b/cql3/statements/modification_statement.cc
@@ -110,9 +110,6 @@ future<> modification_statement::check_access(query_processor& qp, const service

 future<std::vector<mutation>>
 modification_statement::get_mutations(query_processor& qp, const query_options& options, db::timeout_clock::time_point timeout, bool local, int64_t now, service::query_state& qs) const {
-    if (_restrictions->range_or_slice_eq_null(options)) { // See #7852 and #9290.
-        throw exceptions::invalid_request_exception("Invalid null value in condition for a key column");
-    }
    auto cl = options.get_consistency();
    auto json_cache = maybe_prepare_json_cache(options);
    auto keys = build_partition_keys(options, json_cache);
@@ -245,6 +242,12 @@ modification_statement::execute(query_processor& qp, service::query_state& qs, c
    return modify_stage(this, seastar::ref(qp), seastar::ref(qs), seastar::cref(options));
 }

+void modification_statement::validate_primary_key_restrictions(const query_options& options) const {
+    if (_restrictions->range_or_slice_eq_null(options)) { // See #7852 and #9290.
+        throw exceptions::invalid_request_exception("Invalid null value in condition for a key column");
+    }
+}
+
 future<::shared_ptr<cql_transport::messages::result_message>>
 modification_statement::do_execute(query_processor& qp, service::query_state& qs, const query_options& options) const {
    if (has_conditions() && options.get_protocol_version() == 1) {
@@ -255,6 +258,8 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs

    inc_cql_stats(qs.get_client_state().is_internal());

+    validate_primary_key_restrictions(options);
+
    if (has_conditions()) {
        return execute_with_condition(qp, qs, options);
    }
@@ -539,12 +544,8 @@ modification_statement::validate(query_processor&, const service::client_state&
    }
 }

-bool modification_statement::depends_on_keyspace(const sstring& ks_name) const {
-    return keyspace() == ks_name;
-}
-
-bool modification_statement::depends_on_column_family(const sstring& cf_name) const {
-    return column_family() == cf_name;
+bool modification_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
+    return keyspace() == ks_name && (!cf_name || column_family() == *cf_name);
 }

 void modification_statement::add_operation(::shared_ptr<operation> op) {
--- a/cql3/statements/modification_statement.hh
+++ b/cql3/statements/modification_statement.hh
@@ -137,9 +137,7 @@ public:
    // Validate before execute, using client state and current schema
    void validate(query_processor&, const service::client_state& state) const override;

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    void add_operation(::shared_ptr<operation> op);

@@ -233,6 +231,8 @@ public:
    // True if this statement needs to read only static column values to check if it can be applied.
    bool has_only_static_column_conditions() const { return !_has_regular_column_conditions && _has_static_column_conditions; }

+    void validate_primary_key_restrictions(const query_options& options) const;
+
    virtual future<::shared_ptr<cql_transport::messages::result_message>>
    execute(query_processor& qp, service::query_state& qs, const query_options& options) const override;

--- a/cql3/statements/schema_altering_statement.cc
+++ b/cql3/statements/schema_altering_statement.cc
@@ -45,12 +45,7 @@ future<> schema_altering_statement::grant_permissions_to_creator(const service::
    return make_ready_future<>();
 }

-bool schema_altering_statement::depends_on_keyspace(const sstring& ks_name) const
-{
-    return false;
-}
-
-bool schema_altering_statement::depends_on_column_family(const sstring& cf_name) const
+bool schema_altering_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
 {
    return false;
 }
--- a/cql3/statements/schema_altering_statement.hh
+++ b/cql3/statements/schema_altering_statement.hh
@@ -53,9 +53,7 @@ protected:
     */
    virtual future<> grant_permissions_to_creator(const service::client_state&) const;

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual uint32_t get_bound_terms() const override;

--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -167,12 +167,8 @@ void select_statement::validate(query_processor&, const service::client_state& s
    // Nothing to do, all validation has been done by raw_statemet::prepare()
 }

-bool select_statement::depends_on_keyspace(const sstring& ks_name) const {
-    return keyspace() == ks_name;
-}
-
-bool select_statement::depends_on_column_family(const sstring& cf_name) const {
-    return column_family() == cf_name;
+bool select_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
+    return keyspace() == ks_name && (!cf_name || column_family() == *cf_name);
 }

 const sstring& select_statement::keyspace() const {
--- a/cql3/statements/select_statement.hh
+++ b/cql3/statements/select_statement.hh
@@ -100,8 +100,7 @@ public:
    virtual uint32_t get_bound_terms() const override;
    virtual future<> check_access(query_processor& qp, const service::client_state& state) const override;
    virtual void validate(query_processor&, const service::client_state& state) const override;
-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual future<::shared_ptr<cql_transport::messages::result_message>> execute(query_processor& qp,
        service::query_state& state, const query_options& options) const override;
--- a/cql3/statements/service_level_statement.cc
+++ b/cql3/statements/service_level_statement.cc
@@ -17,13 +17,7 @@ uint32_t service_level_statement::get_bound_terms() const {
    return 0;
 }

-bool service_level_statement::depends_on_keyspace(
-        const sstring &ks_name) const {
-    return false;
-}
-
-bool service_level_statement::depends_on_column_family(
-        const sstring &cf_name) const {
+bool service_level_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
    return false;
 }

--- a/cql3/statements/service_level_statement.hh
+++ b/cql3/statements/service_level_statement.hh
@@ -43,9 +43,7 @@ public:

    uint32_t get_bound_terms() const override;

-    bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    bool depends_on_column_family(const sstring& cf_name) const override;
+    bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    future<> check_access(query_processor& qp, const service::client_state& state) const override;

--- a/cql3/statements/sl_prop_defs.cc
+++ b/cql3/statements/sl_prop_defs.cc
@@ -30,7 +30,7 @@ void sl_prop_defs::validate() {
        data_value v = duration_type->deserialize(duration_type->from_string(*repr));
        cql_duration duration = static_pointer_cast<const duration_type_impl>(duration_type)->from_value(v);
        if (duration.months || duration.days) {
-            throw exceptions::invalid_request_exception("Timeout values cannot be longer than 24h");
+            throw exceptions::invalid_request_exception("Timeout values cannot be expressed in days/months");
        }
        if (duration.nanoseconds % 1'000'000 != 0) {
            throw exceptions::invalid_request_exception("Timeout values must be expressed in millisecond granularity");
--- a/cql3/statements/truncate_statement.cc
+++ b/cql3/statements/truncate_statement.cc
@@ -39,12 +39,7 @@ std::unique_ptr<prepared_statement> truncate_statement::prepare(data_dictionary:
    return std::make_unique<prepared_statement>(::make_shared<truncate_statement>(*this));
 }

-bool truncate_statement::depends_on_keyspace(const sstring& ks_name) const
-{
-    return false;
-}
-
-bool truncate_statement::depends_on_column_family(const sstring& cf_name) const
+bool truncate_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
 {
    return false;
 }
--- a/cql3/statements/truncate_statement.hh
+++ b/cql3/statements/truncate_statement.hh
@@ -30,9 +30,7 @@ public:

    virtual std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats) override;

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual future<> check_access(query_processor& qp, const service::client_state& state) const override;

--- a/cql3/statements/use_statement.cc
+++ b/cql3/statements/use_statement.cc
@@ -46,12 +46,7 @@ std::unique_ptr<prepared_statement> use_statement::prepare(data_dictionary::data

 }

-bool use_statement::depends_on_keyspace(const sstring& ks_name) const
-{
-    return false;
-}
-
-bool use_statement::depends_on_column_family(const sstring& cf_name) const
+bool use_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
 {
    return false;
 }
--- a/cql3/statements/use_statement.hh
+++ b/cql3/statements/use_statement.hh
@@ -31,9 +31,7 @@ public:

    virtual uint32_t get_bound_terms() const override;

-    virtual bool depends_on_keyspace(const seastar::sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const seastar::sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual seastar::future<> check_access(query_processor& qp, const service::client_state& state) const override;

--- a/cql3/type_json.cc
+++ b/cql3/type_json.cc
@@ -18,6 +18,8 @@
 #include "types/listlike_partial_deserializing_iterator.hh"
 #include "utils/managed_bytes.hh"
 #include "exceptions/exceptions.hh"
+#include <boost/algorithm/string/trim_all.hpp>
+#include <boost/algorithm/string.hpp>

 static inline bool is_control_char(char c) {
    return c >= 0 && c <= 0x1F;
@@ -78,8 +80,35 @@ static int64_t to_int64_t(const rjson::value& value) {
        return value.GetInt();
    } else if (value.IsUint()) {
        return value.GetUint();
-    } else if (value.GetUint64()) {
+    } else if (value.IsUint64()) {
        return value.GetUint64(); //NOTICE: large uint64_t values will get overflown
+    } else if (value.IsDouble()) {
+        // We allow specifing integer constants
+        // using scientific notation (for example 1.3e8)
+        // and floating-point numbers ending with .0 (for example 12.0),
+        // but not floating-point numbers with fractional part (12.34).
+        //
+        // The reason is that JSON standard does not have separate
+        // types for integers and floating-point numbers, only
+        // a single "number" type. Some serializers may
+        // produce an integer in that floating-point format.
+        double double_value = value.GetDouble();
+
+        // Check if the value contains disallowed fractional part (.34 from 12.34).
+        // With RapidJSON and an integer value in range [-(2^53)+1, (2^53)-1], 
+        // the fractional part will be zero as the entire value
+        // fits in 53-bit significand. RapidJSON's parsing code does not lose accuracy:
+        // when parsing a number like 12.34e8, it accumulates 1234 to a int64_t number,
+        // then converts it to double and multiples by power of 10, never having any
+        // digit in fractional part.
+        double integral;
+        double fractional = std::modf(double_value, &integral);
+        if (fractional != 0.0 && fractional != -0.0) {
+            throw marshal_exception(format("Incorrect JSON floating-point value "
+                "for int64 type: {} (it should not contain fractional part {})", value, fractional));
+        }
+
+        return double_value;
    }
    throw marshal_exception(format("Incorrect JSON value for int64 type: {}", value));
 }
@@ -189,7 +218,7 @@ struct from_json_object_visitor {
            throw marshal_exception("bytes_type must be represented as string");
        }
        std::string_view string_v = rjson::to_string_view(value);
-        if (string_v.size() < 2 && string_v[0] != '0' && string_v[1] != 'x') {
+        if (string_v.size() < 2 || string_v[0] != '0' || string_v[1] != 'x') {
            throw marshal_exception("Blob JSON strings must start with 0x");
        }
        string_v.remove_prefix(2);
@@ -197,6 +226,17 @@ struct from_json_object_visitor {
    }
    bytes operator()(const boolean_type_impl& t) {
        if (!value.IsBool()) {
+            if (value.IsString()) {
+                std::string str(rjson::to_string_view(value));
+                boost::trim_all(str);
+                boost::to_lower(str);
+
+                if (str == "true") {
+                    return t.decompose(true);
+                } else if (str == "false") {
+                    return t.decompose(false);
+                }
+            }
            throw marshal_exception(format("Invalid JSON object {}", value));
        }
        return t.decompose(value.GetBool());
--- a/cql3/util.hh
+++ b/cql3/util.hh
@@ -74,6 +74,13 @@ std::unique_ptr<cql3::statements::raw::select_statement> build_select_statement(
 /// forbids non-alpha-numeric characters in identifier names.
 /// Quoting involves wrapping the string in double-quotes ("). A double-quote
 /// character itself is quoted by doubling it.
+/// maybe_quote() also quotes reserved CQL keywords (e.g., "to", "where")
+/// but doesn't quote *unreserved* keywords (like ttl, int or as).
+/// Note that this means that if new reserved keywords are added to the
+/// parser, a saved output of maybe_quote() may no longer be parsable by
+/// parser. To avoid this forward-compatibility issue, use quote() instead
+/// of maybe_quote() - to unconditionally quote an identifier even if it is
+/// lowercase and not (yet) a keyword.
 sstring maybe_quote(const sstring& s);

 // Check whether timestamp is not too far in the future as this probably
--- a/db/batchlog_manager.cc
+++ b/db/batchlog_manager.cc
@@ -11,6 +11,7 @@
 */

 #include <chrono>
+#include <exception>
 #include <seastar/core/future-util.hh>
 #include <seastar/core/do_with.hh>
 #include <seastar/core/semaphore.hh>
@@ -247,6 +248,7 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
            } catch (data_dictionary::no_such_keyspace& ex) {
                // should probably ignore and drop the batch
            } catch (...) {
+                blogger.warn("Replay failed (will retry): {}", std::current_exception());
                // timeout, overload etc.
                // Do _not_ remove the batch, assuning we got a node write error.
                // Since we don't have hints (which origin is satisfied with),
--- a/db/config.cc
+++ b/db/config.cc
@@ -65,6 +65,25 @@ hinted_handoff_enabled_to_json(const db::config::hinted_handoff_enabled_type& h)
    return value_to_json(h.to_configuration_string());
 }

+// Convert a value that can be printed with operator<<, or a vector of
+// such values, to JSON. An example is enum_option<T>, because enum_option<T>
+// has a operator<<.
+template <typename T>
+static json::json_return_type
+printable_to_json(const T& e) {
+    return value_to_json(format("{}", e));
+}
+template <typename T>
+static json::json_return_type
+printable_vector_to_json(const std::vector<T>& e) {
+    std::vector<sstring> converted;
+    converted.reserve(e.size());
+    for (const auto& option : e) {
+        converted.push_back(format("{}", option));
+    }
+    return value_to_json(converted);
+}
+
 template <>
 const config_type config_type_for<bool> = config_type("bool", value_to_json<bool>);

@@ -109,11 +128,11 @@ const config_type config_type_for<db::seed_provider_type> = config_type("seed pr

 template <>
 const config_type config_type_for<std::vector<enum_option<db::experimental_features_t>>> = config_type(
-        "experimental features", value_to_json<std::vector<sstring>>);
+        "experimental features", printable_vector_to_json<enum_option<db::experimental_features_t>>);

 template <>
 const config_type config_type_for<enum_option<db::tri_mode_restriction_t>> = config_type(
-        "restriction mode", value_to_json<sstring>);
+        "restriction mode", printable_to_json<enum_option<db::tri_mode_restriction_t>>);

 template <>
 const config_type config_type_for<db::config::hinted_handoff_enabled_type> = config_type("hinted handoff enabled", hinted_handoff_enabled_to_json);
@@ -862,6 +881,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "Flush tables in the system_schema keyspace after schema modification. This is required for crash recovery, but slows down tests and can be disabled for them")
    , restrict_replication_simplestrategy(this, "restrict_replication_simplestrategy", liveness::LiveUpdate, value_status::Used, db::tri_mode_restriction_t::mode::FALSE, "Controls whether to disable SimpleStrategy replication. Can be true, false, or warn.")
    , restrict_dtcs(this, "restrict_dtcs", liveness::LiveUpdate, value_status::Used, db::tri_mode_restriction_t::mode::WARN, "Controls whether to prevent setting DateTieredCompactionStrategy. Can be true, false, or warn.")
+    , cache_index_pages(this, "cache_index_pages", liveness::LiveUpdate, value_status::Used, true,
+        "Keep SSTable index pages in the global cache after a SSTable read. Expected to improve performance for workloads with big partitions, but may degrade performance for workloads with small partitions.")
    , default_log_level(this, "default_log_level", value_status::Used)
    , logger_log_level(this, "logger_log_level", value_status::Used)
    , log_to_stdout(this, "log_to_stdout", value_status::Used)
--- a/db/config.hh
+++ b/db/config.hh
@@ -365,6 +365,9 @@ public:
    named_value<tri_mode_restriction> restrict_replication_simplestrategy;
    named_value<tri_mode_restriction> restrict_dtcs;

+
+    named_value<bool> cache_index_pages;
+
    seastar::logging_settings logging_settings(const log_cli::options&) const;

    const db::extensions& extensions() const;
--- a/db/legacy_schema_migrator.cc
+++ b/db/legacy_schema_migrator.cc
@@ -574,12 +574,8 @@ public:
    }

    future<> flush_schemas() {
-        return _qp.proxy().get_db().invoke_on_all([this] (replica::database& db) {
-            return parallel_for_each(db::schema_tables::all_table_names(schema_features::full()), [this, &db](const sstring& cf_name) {
-                auto& cf = db.find_column_family(db::schema_tables::NAME, cf_name);
-                return cf.flush();
-            });
-        });
+        auto& db = _qp.db().real_database();
+        return db.flush_on_all(db::schema_tables::NAME, db::schema_tables::all_table_names(schema_features::full()));
    }

    future<> migrate() {
--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -1042,12 +1042,9 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std:
    co_await proxy.local().mutate_locally(std::move(mutations), tracing::trace_state_ptr());

    if (do_flush) {
-        co_await proxy.local().get_db().invoke_on_all([&] (replica::database& db) -> future<> {
-            auto& cfs = column_families;
-            co_await parallel_for_each(cfs.begin(), cfs.end(), [&] (const utils::UUID& id) -> future<> {
-                auto& cf = db.find_column_family(id);
-                co_await cf.flush();
-            });
+        auto& db = proxy.local().local_db();
+        co_await parallel_for_each(column_families, [&db] (const utils::UUID& id) -> future<> {
+            return db.flush_on_all(id);
        });
    }

--- a/db/snapshot-ctl.cc
+++ b/db/snapshot-ctl.cc
@@ -11,6 +11,8 @@
 */

 #include <boost/range/adaptors.hpp>
+#include <seastar/core/coroutine.hh>
+#include <seastar/coroutine/maybe_yield.hh>
 #include "db/snapshot-ctl.hh"
 #include "replica/database.hh"

@@ -59,24 +61,21 @@ future<> snapshot_ctl::take_snapshot(sstring tag, std::vector<sstring> keyspace_
        boost::copy(_db.local().get_keyspaces() | boost::adaptors::map_keys, std::back_inserter(keyspace_names));
    };

-    return run_snapshot_modify_operation([tag = std::move(tag), keyspace_names = std::move(keyspace_names), sf, this] {
-        return parallel_for_each(keyspace_names, [tag, this] (auto& ks_name) {
-            return check_snapshot_not_exist(ks_name, tag);
-        }).then([this, tag, keyspace_names, sf] {
-            return _db.invoke_on_all([tag = std::move(tag), keyspace_names, sf] (replica::database& db) {
-                return parallel_for_each(keyspace_names, [&db, tag = std::move(tag), sf] (auto& ks_name) {
-                    auto& ks = db.find_keyspace(ks_name);
-                    return parallel_for_each(ks.metadata()->cf_meta_data(), [&db, tag = std::move(tag), sf] (auto& pair) {
-                        auto& cf = db.find_column_family(pair.second);
-                        return cf.snapshot(db, tag, bool(sf));
-                    });
-                });
-            });
-        });
+    return run_snapshot_modify_operation([tag = std::move(tag), keyspace_names = std::move(keyspace_names), sf, this] () mutable {
+        return do_take_snapshot(std::move(tag), std::move(keyspace_names), sf);
    });
 }

-future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf) {
+future<> snapshot_ctl::do_take_snapshot(sstring tag, std::vector<sstring> keyspace_names, skip_flush sf) {
+    co_await parallel_for_each(keyspace_names, [tag, this] (const auto& ks_name) {
+        return check_snapshot_not_exist(ks_name, tag);
+    });
+    co_await parallel_for_each(keyspace_names, [this, tag = std::move(tag), sf] (const auto& ks_name) {
+        return _db.local().snapshot_on_all(ks_name, tag, bool(sf));
+    });
+}
+
+future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf, allow_view_snapshots av) {
    if (ks_name.empty()) {
        throw std::runtime_error("You must supply a keyspace name");
    }
@@ -87,25 +86,25 @@ future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<
        throw std::runtime_error("You must supply a snapshot name.");
    }

-    return run_snapshot_modify_operation([this, ks_name = std::move(ks_name), tables = std::move(tables), tag = std::move(tag), sf] {
-        return check_snapshot_not_exist(ks_name, tag, tables).then([this, ks_name, tables, tag, sf] {
-            return do_with(std::vector<sstring>(std::move(tables)),[this, ks_name, tag, sf](const std::vector<sstring>& tables) {
-                return do_for_each(tables, [ks_name, tag, sf, this] (const sstring& table_name) {
-                    if (table_name.find(".") != sstring::npos) {
-                        throw std::invalid_argument("Cannot take a snapshot of a secondary index by itself. Run snapshot on the table that owns the index.");
-                    }
-                    return _db.invoke_on_all([ks_name, table_name, tag, sf] (replica::database &db) {
-                        auto& cf = db.find_column_family(ks_name, table_name);
-                        return cf.snapshot(db, tag, bool(sf));
-                    });
-                });
-            });
-        });
+    return run_snapshot_modify_operation([this, ks_name = std::move(ks_name), tables = std::move(tables), tag = std::move(tag), sf, av] () mutable {
+        return do_take_column_family_snapshot(std::move(ks_name), std::move(tables), std::move(tag), sf, av);
    });
 }

-future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf) {
-    return take_column_family_snapshot(ks_name, std::vector<sstring>{cf_name}, tag, sf);
+future<> snapshot_ctl::do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf, allow_view_snapshots av) {
+    co_await check_snapshot_not_exist(ks_name, tag, tables);
+
+    for (const auto& table_name : tables) {
+        auto& cf = _db.local().find_column_family(ks_name, table_name);
+        if (cf.schema()->is_view() && !av) {
+            throw std::invalid_argument("Do not take a snapshot of a materialized view or a secondary index by itself. Run snapshot on the base table instead.");
+        }
+    }
+    co_await _db.local().snapshot_on_all(ks_name, std::move(tables), std::move(tag), bool(sf));
+}
+
+future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf, allow_view_snapshots av) {
+    return take_column_family_snapshot(ks_name, std::vector<sstring>{cf_name}, tag, sf, av);
 }

 future<> snapshot_ctl::clear_snapshot(sstring tag, std::vector<sstring> keyspace_names, sstring cf_name) {
--- a/db/snapshot-ctl.hh
+++ b/db/snapshot-ctl.hh
@@ -27,6 +27,7 @@ namespace db {
 class snapshot_ctl : public peering_sharded_service<snapshot_ctl> {
 public:
    using skip_flush = bool_class<class skip_flush_tag>;
+    using allow_view_snapshots = bool_class<class allow_view_snapsots_tag>;

    struct snapshot_details {
        int64_t live;
@@ -64,7 +65,7 @@ public:
     * @param tables a vector of tables names to snapshot
     * @param tag the tag given to the snapshot; may not be null or empty
     */
-    future<> take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no);
+    future<> take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no, allow_view_snapshots av = allow_view_snapshots::no);

    /**
     * Takes the snapshot of a specific column family. A snapshot name must be specified.
@@ -73,7 +74,7 @@ public:
     * @param columnFamilyName the column family to snapshot
     * @param tag the tag given to the snapshot; may not be null or empty
     */
-    future<> take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf = skip_flush::no);
+    future<> take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf = skip_flush::no, allow_view_snapshots av = allow_view_snapshots::no);

    /**
     * Remove the snapshot with the given name from the given keyspaces.
@@ -97,6 +98,9 @@ private:

    template <typename Func>
    std::result_of_t<Func()> run_snapshot_list_operation(Func&&);
+
+    future<> do_take_snapshot(sstring tag, std::vector<sstring> keyspace_names, skip_flush sf = skip_flush::no);
+    future<> do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no, allow_view_snapshots av = allow_view_snapshots::no);
 };

 }
--- a/db/system_keyspace.cc
+++ b/db/system_keyspace.cc
@@ -2482,10 +2482,14 @@ class db_config_table final : public streaming_virtual_table {
            for (auto& c_ref : cfg.values()) {
                auto& c = c_ref.get();
                if (c.name() == name) {
-                    if (c.set_value(value, utils::config_file::config_source::CQL)) {
-                        return cfg.broadcast_to_all_shards();
-                    } else {
-                        return make_exception_future<>(virtual_table_update_exception("option is not live-updateable"));
+                    try {
+                        if (c.set_value(value, utils::config_file::config_source::CQL)) {
+                            return cfg.broadcast_to_all_shards();
+                        } else {
+                            return make_exception_future<>(virtual_table_update_exception("option is not live-updateable"));
+                        }
+                    } catch (boost::bad_lexical_cast&) {
+                        return make_exception_future<>(virtual_table_update_exception("cannot parse option value"));
                    }
                }
            }
@@ -3068,11 +3072,11 @@ mutation system_keyspace::make_group0_history_state_id_mutation(
        using namespace std::chrono;
        assert(*gc_older_than >= gc_clock::duration{0});

-        auto ts_millis = duration_cast<milliseconds>(microseconds{ts});
-        auto gc_older_than_millis = duration_cast<milliseconds>(*gc_older_than);
-        assert(gc_older_than_millis < ts_millis);
+        auto ts_micros = microseconds{ts};
+        auto gc_older_than_micros = duration_cast<microseconds>(*gc_older_than);
+        assert(gc_older_than_micros < ts_micros);

-        auto tomb_upper_bound = utils::UUID_gen::min_time_UUID(ts_millis - gc_older_than_millis);
+        auto tomb_upper_bound = utils::UUID_gen::min_time_UUID(ts_micros - gc_older_than_micros);
        // We want to delete all entries with IDs smaller than `tomb_upper_bound`
        // but the deleted range is of the form (x, +inf) since the schema is reversed.
        auto range = query::clustering_range::make_starting_with({
--- a/db/system_keyspace_view_types.hh
+++ b/db/system_keyspace_view_types.hh
@@ -10,6 +10,7 @@

 #include <seastar/core/seastar.hh>
 #include <seastar/core/sstring.hh>
+#include <seastar/core/reactor.hh>
 #include <utility>
 #include <optional>
 #include "dht/token.hh"
--- a/db/view/row_locking.cc
+++ b/db/view/row_locking.cc
@@ -10,8 +10,6 @@
 #include "log.hh"
 #include "utils/latency.hh"

-#include <seastar/core/when_all.hh>
-
 static logging::logger mylog("row_locking");

 row_locker::row_locker(schema_ptr s)
@@ -76,35 +74,32 @@ row_locker::lock_pk(const dht::decorated_key& pk, bool exclusive, db::timeout_cl
 future<row_locker::lock_holder>
 row_locker::lock_ck(const dht::decorated_key& pk, const clustering_key_prefix& cpk, bool exclusive, db::timeout_clock::time_point timeout, stats& stats) {
    mylog.debug("taking shared lock on partition {}, and {} lock on row {} in it", pk, (exclusive ? "exclusive" : "shared"), cpk);
+    auto ck = cpk;
+    // Create a two-level lock entry for the partition if it doesn't exist already.
    auto i = _two_level_locks.try_emplace(pk, this).first;
+    // The two-level lock entry we've just created is guaranteed to be kept alive as long as it's locked.
+    // Initiating read locking in the background below ensures that even if the two-level lock is currently
+    // write-locked, releasing the write-lock will synchronously engage any waiting
+    // locks and will keep the entry alive.
    future<lock_type::holder> lock_partition = i->second._partition_lock.hold_read_lock(timeout);
-    auto j = i->second._row_locks.find(cpk);
-    if (j == i->second._row_locks.end()) {
-        // Not yet locked, need to create the lock. This makes a copy of cpk.
-        try {
-            j = i->second._row_locks.emplace(cpk, lock_type()).first;
-        } catch(...) {
-            // If this emplace() failed, e.g., out of memory, we fail. We
-            // could do nothing - the partition lock we already started
-            // taking will be unlocked automatically after being locked.
-            // But it's better form to wait for the work we started, and it
-            // will also allow us to remove the hash-table row we added.
-            return lock_partition.then([ex = std::current_exception()] (auto lock) {
-                // The lock is automatically released when "lock" goes out of scope.
-                // TODO: unlock (lock = {}) now, search for the partition in the
-                // hash table (we know it's still there, because we held the lock until
-                // now) and remove the unused lock from the hash table if still unused.
-                return make_exception_future<row_locker::lock_holder>(std::current_exception());
-            });
-        }
-    }
    single_lock_stats &single_lock_stats = exclusive ? stats.exclusive_row : stats.shared_row;
    single_lock_stats.operations_currently_waiting_for_lock++;
    utils::latency_counter waiting_latency;
    waiting_latency.start();
-    future<lock_type::holder> lock_row = exclusive ? j->second.hold_write_lock(timeout) : j->second.hold_read_lock(timeout);
-    return when_all_succeed(std::move(lock_partition), std::move(lock_row))
-    .then_unpack([this, pk = &i->first, cpk = &j->first, exclusive, &single_lock_stats, waiting_latency = std::move(waiting_latency)] (auto lock1, auto lock2) mutable {
+    return lock_partition.then([this, pk = &i->first, row_locks = &i->second._row_locks, ck = std::move(ck), exclusive, &single_lock_stats, waiting_latency = std::move(waiting_latency), timeout] (auto lock1) mutable {
+        auto j = row_locks->find(ck);
+        if (j == row_locks->end()) {
+            // Not yet locked, need to create the lock.
+            j = row_locks->emplace(std::move(ck), lock_type()).first;
+        }
+        auto* cpk = &j->first;
+        auto& row_lock = j->second;
+        // Like to the two-level lock entry above, the row_lock entry we've just created
+        // is guaranteed to be kept alive as long as it's locked.
+        // Initiating read/write locking in the background below ensures that.
+        auto lock_row = exclusive ? row_lock.hold_write_lock(timeout) : row_lock.hold_read_lock(timeout);
+        return lock_row.then([this, pk, cpk, exclusive, &single_lock_stats, waiting_latency = std::move(waiting_latency), lock1 = std::move(lock1)] (auto lock2) mutable {
+        // FIXME: indentation
        lock1.release();
        lock2.release();
        waiting_latency.stop();
@@ -112,6 +107,7 @@ row_locker::lock_ck(const dht::decorated_key& pk, const clustering_key_prefix& c
        single_lock_stats.lock_acquisitions++;
        single_lock_stats.operations_currently_waiting_for_lock--;
        return lock_holder(this, pk, cpk, exclusive);
+        });
    });
 }

--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -121,6 +121,9 @@ const column_definition* view_info::view_column(const column_definition& base_de

 void view_info::set_base_info(db::view::base_info_ptr base_info) {
    _base_info = std::move(base_info);
+    // Forget the cached objects which may refer to the base schema.
+    _select_statement = nullptr;
+    _partition_slice = std::nullopt;
 }

 // A constructor for a base info that can facilitate reads and writes from the materialized view.
@@ -322,7 +325,11 @@ public:
    view_filter_checking_visitor(const schema& base, const view_info& view)
        : _base(base)
        , _view(view)
-        , _selection(cql3::selection::selection::wildcard(_base.shared_from_this()))
+        , _selection(cql3::selection::selection::for_columns(_base.shared_from_this(),
+            boost::copy_range<std::vector<const column_definition*>>(
+                _base.regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return &cdef; }))
+            )
+        )
    {}

    void accept_new_partition(const partition_key& key, uint64_t row_count) {
@@ -859,13 +866,18 @@ void view_updates::generate_update(
    bool same_row = true;
    for (auto col_id : col_ids) {
        auto* after = update.cells().find_cell(col_id);
-        // Note: multi-cell columns can't be part of the primary key.
        auto& cdef = _base->regular_column_at(col_id);
        if (existing) {
            auto* before = existing->cells().find_cell(col_id);
+            // Note that this cell is necessarily atomic, because col_ids are
+            // view key columns, and keys must be atomic.
            if (before && before->as_atomic_cell(cdef).is_live()) {
                if (after && after->as_atomic_cell(cdef).is_live()) {
-                    auto cmp = compare_atomic_cell_for_merge(before->as_atomic_cell(cdef), after->as_atomic_cell(cdef));
+                    // We need to compare just the values of the keys, not
+                    // metadata like the timestamp. This is because below,
+                    // if the old and new view row have the same key, we need
+                    // to be sure to reach the update_entry() case.
+                    auto cmp = compare_unsigned(before->as_atomic_cell(cdef).value(), after->as_atomic_cell(cdef).value());
                    if (cmp != 0) {
                        same_row = false;
                    }
@@ -885,7 +897,13 @@ void view_updates::generate_update(
            if (same_row) {
                update_entry(base_key, update, *existing, now);
            } else {
-                replace_entry(base_key, update, *existing, now);
+                // This code doesn't work if the old and new view row have the
+                // same key, because if they do we get both data and tombstone
+                // for the same timestamp (now) and the tombstone wins. This
+                // is why we need the "same_row" case above - it's not just a
+                // performance optimization.
+                delete_old_entry(base_key, *existing, update, now);
+                create_entry(base_key, update, now);
            }
        } else {
            delete_old_entry(base_key, *existing, update, now);
@@ -929,8 +947,12 @@ future<stop_iteration> view_update_builder::stop() const {
    return make_ready_future<stop_iteration>(stop_iteration::yes);
 }

-future<utils::chunked_vector<frozen_mutation_and_schema>> view_update_builder::build_some() {
+future<std::optional<utils::chunked_vector<frozen_mutation_and_schema>>> view_update_builder::build_some() {
    return advance_all().then([this] (stop_iteration ignored) {
+        if (!_update && !_existing) {
+            // Tell the caller there is no more data to build.
+            return make_ready_future<std::optional<utils::chunked_vector<frozen_mutation_and_schema>>>(std::nullopt);
+        }
        bool do_advance_updates = false;
        bool do_advance_existings = false;
        if (_update && _update->is_partition_start()) {
@@ -942,22 +964,23 @@ future<utils::chunked_vector<frozen_mutation_and_schema>> view_update_builder::b
            _existing_tombstone_tracker.set_partition_tombstone(_existing->as_partition_start().partition_tombstone());
            do_advance_existings = true;
        }
+        future<stop_iteration> f = make_ready_future<stop_iteration>(stop_iteration::no);
        if (do_advance_updates) {
-            return do_advance_existings ? advance_all() : advance_updates();
+            f = do_advance_existings ? advance_all() : advance_updates();
        } else if (do_advance_existings) {
-            return advance_existings();
+            f = advance_existings();
        }
-        return make_ready_future<stop_iteration>(stop_iteration::no);
-    }).then([this] (stop_iteration ignored) {
-        return repeat([this] {
-            return this->on_results();
+        return std::move(f).then([this] (stop_iteration ignored) {
+            return repeat([this] {
+                return this->on_results();
+            });
+        }).then([this] {
+            utils::chunked_vector<frozen_mutation_and_schema> mutations;
+            for (auto& update : _view_updates) {
+                update.move_to(mutations);
+            }
+            return std::make_optional(mutations);
        });
-    }).then([this] {
-        utils::chunked_vector<frozen_mutation_and_schema> mutations;
-        for (auto& update : _view_updates) {
-            update.move_to(mutations);
-        }
-        return mutations;
    });
 }

@@ -1293,7 +1316,7 @@ future<> mutate_MV(
            auto mut_ptr = remote_endpoints.empty() ? std::make_unique<frozen_mutation>(std::move(mut.fm)) : std::make_unique<frozen_mutation>(mut.fm);
            tracing::trace(tr_state, "Locally applying view update for {}.{}; base token = {}; view token = {}",
                    mut.s->ks_name(), mut.s->cf_name(), base_token, view_token);
-            local_view_update = service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr, std::move(tr_state), db::commitlog::force_sync::no).then_wrapped(
+            local_view_update = service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr, tr_state, db::commitlog::force_sync::no).then_wrapped(
                    [s = mut.s, &stats, &cf_stats, tr_state, base_token, view_token, my_address, mut_ptr = std::move(mut_ptr),
                            units = sem_units.split(sem_units.count())] (future<>&& f) {
                --stats.writes;
@@ -2031,15 +2054,21 @@ public:
 // Called in the context of a seastar::thread.
 void view_builder::execute(build_step& step, exponential_backoff_retry r) {
    gc_clock::time_point now = gc_clock::now();
-    auto consumer = compact_for_query<emit_only_live_rows::yes, view_builder::consumer>(
+    auto compaction_state = make_lw_shared<compact_for_query_state<emit_only_live_rows::yes>>(
            *step.reader.schema(),
            now,
            step.pslice,
            batch_size,
-            query::max_partitions,
-            view_builder::consumer{*this, step, now});
-    consumer.consume_new_partition(step.current_key); // Initialize the state in case we're resuming a partition
+            query::max_partitions);
+    auto consumer = compact_for_query<emit_only_live_rows::yes, view_builder::consumer>(compaction_state, view_builder::consumer{*this, step, now});
    auto built = step.reader.consume_in_thread(std::move(consumer));
+    if (auto ds = std::move(*compaction_state).detach_state()) {
+        auto& range_tombstones = std::get<std::deque<range_tombstone>>(ds->range_tombstones);
+        for (auto& rt : range_tombstones) {
+            step.reader.unpop_mutation_fragment(mutation_fragment(*step.reader.schema(), step.reader.permit(), std::move(rt)));
+        }
+        step.reader.unpop_mutation_fragment(mutation_fragment(*step.reader.schema(), step.reader.permit(), std::move(ds->partition_start)));
+    }

    _as.check();

@@ -2121,24 +2150,28 @@ update_backlog node_update_backlog::add_fetch(unsigned shard, update_backlog bac
    return std::max(backlog, _max.load(std::memory_order_relaxed));
 }

-future<bool> check_view_build_ongoing(db::system_distributed_keyspace& sys_dist_ks, const sstring& ks_name, const sstring& cf_name) {
-    return sys_dist_ks.view_status(ks_name, cf_name).then([] (std::unordered_map<utils::UUID, sstring>&& view_statuses) {
-        return boost::algorithm::any_of(view_statuses | boost::adaptors::map_values, [] (const sstring& view_status) {
-            return view_status == "STARTED";
+future<bool> check_view_build_ongoing(db::system_distributed_keyspace& sys_dist_ks, const locator::token_metadata& tm, const sstring& ks_name,
+        const sstring& cf_name) {
+    using view_statuses_type = std::unordered_map<utils::UUID, sstring>;
+    return sys_dist_ks.view_status(ks_name, cf_name).then([&tm] (view_statuses_type&& view_statuses) {
+        return boost::algorithm::any_of(view_statuses, [&tm] (const view_statuses_type::value_type& view_status) {
+            // Only consider status of known hosts.
+            return view_status.second == "STARTED" && tm.get_endpoint_for_host_id(view_status.first);
        });
    });
 }

-future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const replica::table& t, streaming::stream_reason reason) {
+future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const locator::token_metadata& tm, const replica::table& t,
+        streaming::stream_reason reason) {
    if (is_internal_keyspace(t.schema()->ks_name())) {
        return make_ready_future<bool>(false);
    }
    if (reason == streaming::stream_reason::repair && !t.views().empty()) {
        return make_ready_future<bool>(true);
    }
-    return do_with(t.views(), [&sys_dist_ks] (auto& views) {
+    return do_with(t.views(), [&sys_dist_ks, &tm] (auto& views) {
        return map_reduce(views,
-                [&sys_dist_ks] (const view_ptr& view) { return check_view_build_ongoing(sys_dist_ks, view->ks_name(), view->cf_name()); },
+                [&sys_dist_ks, &tm] (const view_ptr& view) { return check_view_build_ongoing(sys_dist_ks, tm, view->ks_name(), view->cf_name()); },
                false,
                std::logical_or<bool>());
    });
--- a/db/view/view.hh
+++ b/db/view/view.hh
@@ -154,10 +154,7 @@ private:
    void delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now);
    void do_delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now);
    void update_entry(const partition_key& base_key, const clustering_row& update, const clustering_row& existing, gc_clock::time_point now);
-    void replace_entry(const partition_key& base_key, const clustering_row& update, const clustering_row& existing, gc_clock::time_point now) {
-        create_entry(base_key, update, now);
-        delete_old_entry(base_key, existing, update, now);
-    }
+    void update_entry_for_computed_column(const partition_key& base_key, const clustering_row& update, const std::optional<clustering_row>& existing, gc_clock::time_point now);
 };

 class view_update_builder {
@@ -188,7 +185,15 @@ public:
    }
    view_update_builder(view_update_builder&& other) noexcept = default;

-    future<utils::chunked_vector<frozen_mutation_and_schema>> build_some();
+
+    // build_some() works on batches of 100 (max_rows_for_view_updates)
+    // updated rows, but can_skip_view_updates() can decide that some of
+    // these rows do not effect the view, and as a result build_some() can
+    // fewer than 100 rows - in extreme cases even zero (see issue #12297).
+    // So we can't use an empty returned vector to signify that the view
+    // update building is done - and we wrap the return value in an
+    // std::optional, which is disengaged when the iteration is done.
+    future<std::optional<utils::chunked_vector<frozen_mutation_and_schema>>> build_some();

    future<> close() noexcept;

--- a/db/view/view_update_checks.hh
+++ b/db/view/view_update_checks.hh
@@ -22,9 +22,13 @@ class system_distributed_keyspace;

 }

+namespace locator {
+class token_metadata;
+}
+
 namespace db::view {

-future<bool> check_view_build_ongoing(db::system_distributed_keyspace& sys_dist_ks, const sstring& ks_name, const sstring& cf_name);
-future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const replica::table& t, streaming::stream_reason reason);
+future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const locator::token_metadata& tm, const replica::table& t,
+        streaming::stream_reason reason);

 }
--- a/db/view/view_update_generator.cc
+++ b/db/view/view_update_generator.cc
@@ -83,10 +83,10 @@ future<> view_update_generator::start() {
                            service::get_local_streaming_priority(),
                            nullptr,
                            ::mutation_reader::forwarding::no);
+                    auto close_sr = deferred_close(staging_sstable_reader);

                    inject_failure("view_update_generator_consume_staging_sstable");
                    auto result = staging_sstable_reader.consume_in_thread(view_updating_consumer(s, std::move(permit), *t, sstables, _as, staging_sstable_reader_handle));
-                    staging_sstable_reader.close().get();
                    if (result == stop_iteration::yes) {
                        break;
                    }
--- a/db/view/view_updating_consumer.hh
+++ b/db/view/view_updating_consumer.hh
@@ -16,6 +16,7 @@
 #include "db/view/row_locking.hh"
 #include <seastar/core/abort_source.hh>
 #include "mutation.hh"
+#include <seastar/core/circular_buffer.hh>

 class evictable_reader_handle;

--- a/dht/murmur3_partitioner.cc
+++ b/dht/murmur3_partitioner.cc
@@ -15,11 +15,18 @@

 namespace dht {

+// Note: Cassandra has a special case where for an empty key it returns
+// minimum_token() instead of 0 (the naturally-calculated hash function for
+// an empty string). Their thinking was that empty partition keys are not
+// allowed anyway. However, they *are* allowed in materialized views, so the
+// empty-key partition should get a real token, not an invalid token, so
+// we dropped this special case. Since we don't support migrating sstables of
+// materialized-views from Cassandra, this Cassandra-Scylla incompatiblity
+// will not cause problems in practice.
+// Note that get_token(const schema& s, partition_key_view key) below must
+// use exactly the same algorithm as this function.
 token
 murmur3_partitioner::get_token(bytes_view key) const {
-    if (key.empty()) {
-        return minimum_token();
-    }
    std::array<uint64_t, 2> hash;
    utils::murmur_hash::hash3_x64_128(key, 0, hash);
    return get_token(hash[0]);
--- a/dirty_memory_manager.hh
+++ b/dirty_memory_manager.hh
@@ -202,6 +202,12 @@ public:
        });
    }

+    future<flush_permit> get_all_flush_permits() {
+        return get_units(_background_work_flush_serializer, _max_background_work).then([this] (auto&& units) {
+            return this->get_flush_permit(std::move(units));
+        });
+    }
+
    bool has_extraneous_flushes_requested() const {
        return _extraneous_flushes > 0;
    }
--- a/dist/common/scripts/scylla_coredump_setup
+++ b/dist/common/scripts/scylla_coredump_setup
@@ -42,7 +42,8 @@ if __name__ == '__main__':
        if systemd_unit.available('systemd-coredump@.service'):
            dropin = '''
 [Service]
-TimeoutStartSec=infinity
+RuntimeMaxSec=infinity
+TimeoutSec=infinity
 '''[1:-1]
            os.makedirs('/etc/systemd/system/systemd-coredump@.service.d', exist_ok=True)
            with open('/etc/systemd/system/systemd-coredump@.service.d/timeout.conf', 'w') as f:
@@ -123,10 +124,14 @@ WantedBy=multi-user.target
        #  - Storage: /path/to/file (inacessible)
        #  - Storage: /path/to/file
        #
+        # After systemd-v248, available coredump file output changed like this:
+        #  - Storage: /path/to/file (present)
+        # We need to support both versions.
+        #
        # reference: https://github.com/systemd/systemd/commit/47f50642075a7a215c9f7b600599cbfee81a2913

        corefail = False
-        res = re.findall(r'Storage: (.*)$', coreinfo, flags=re.MULTILINE)
+        res = re.findall(r'Storage: (\S+)(?: \(.+\))?$', coreinfo, flags=re.MULTILINE)
        # v232 or later
        if res:
            corepath = res[0]
--- a/dist/common/scripts/scylla_raid_setup
+++ b/dist/common/scripts/scylla_raid_setup
@@ -16,7 +16,7 @@ import stat
 import distro
 from pathlib import Path
 from scylla_util import *
-from subprocess import run
+from subprocess import run, SubprocessError

 if __name__ == '__main__':
    if os.getuid() > 0:
@@ -137,7 +137,9 @@ if __name__ == '__main__':
    # stalling. The minimum block size for crc enabled filesystems is 1024,
    # and it also cannot be smaller than the sector size.
    block_size = max(1024, sector_size)
+    run('udevadm settle', shell=True, check=True)
    run(f'mkfs.xfs -b size={block_size} {fsdev} -f -K', shell=True, check=True)
+    run('udevadm settle', shell=True, check=True)

    if is_debian_variant():
        confpath = '/etc/mdadm/mdadm.conf'
@@ -153,6 +155,11 @@ if __name__ == '__main__':
    os.makedirs(mount_at, exist_ok=True)

    uuid = run(f'blkid -s UUID -o value {fsdev}', shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
+    if not uuid:
+        raise Exception(f'Failed to get UUID of {fsdev}')
+
+    uuidpath = f'/dev/disk/by-uuid/{uuid}'
+
    after = 'local-fs.target'
    wants = ''
    if raid and args.raid_level != '0':
@@ -169,7 +176,7 @@ After={after}{wants}
 DefaultDependencies=no

 [Mount]
-What=/dev/disk/by-uuid/{uuid}
+What={uuidpath}
 Where={mount_at}
 Type=xfs
 Options=noatime{opt_discard}
@@ -191,8 +198,16 @@ WantedBy=multi-user.target
    systemd_unit.reload()
    if args.raid_level != '0':
        md_service.start()
-    mount = systemd_unit(mntunit_bn)
-    mount.start()
+    try:
+        mount = systemd_unit(mntunit_bn)
+        mount.start()
+    except SubprocessError as e:
+        if not os.path.exists(uuidpath):
+            print(f'\nERROR: {uuidpath} is not found\n')
+        elif not stat.S_ISBLK(os.stat(uuidpath).st_mode):
+            print(f'\nERROR: {uuidpath} is not block device\n')
+        raise e
+
    if args.enable_on_nextboot:
        mount.enable()
    uid = pwd.getpwnam('scylla').pw_uid
--- a/dist/common/scripts/scylla_setup
+++ b/dist/common/scripts/scylla_setup
@@ -214,7 +214,7 @@ if __name__ == '__main__':
                        help='skip raid setup')
    parser.add_argument('--raid-level-5', action='store_true', default=False,
                        help='use RAID5 for RAID volume')
-    parser.add_argument('--online-discard', default=True,
+    parser.add_argument('--online-discard', default=1, choices=[0, 1], type=int,
                        help='Configure XFS to discard unused blocks as soon as files are deleted')
    parser.add_argument('--nic',
                        help='specify NIC')
@@ -458,7 +458,7 @@ if __name__ == '__main__':
        args.no_raid_setup = not raid_setup
        if raid_setup:
            level = '5' if raid_level_5 else '0'
-            run_setup_script('RAID', f'scylla_raid_setup --disks {disks} --enable-on-nextboot --raid-level={level} --online-discard={int(online_discard)}')
+            run_setup_script('RAID', f'scylla_raid_setup --disks {disks} --enable-on-nextboot --raid-level={level} --online-discard={online_discard}')

        coredump_setup = interactive_ask_service('Do you want to enable coredumps?', 'Yes - sets up coredump to allow a post-mortem analysis of the Scylla state just prior to a crash. No - skips this step.', coredump_setup)
        args.no_coredump_setup = not coredump_setup
--- a/dist/common/scripts/scylla_sysconfig_setup
+++ b/dist/common/scripts/scylla_sysconfig_setup
@@ -70,7 +70,17 @@ if __name__ == '__main__':
    network_mode = args.mode if args.mode else cfg.get('NETWORK_MODE')

    if args.setup_nic_and_disks:
-        rps_cpus = run('{} --tune net --nic {} --get-cpu-mask'.format(perftune_base_command(), ifname), shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
+        res = run('{} --tune net --nic {} --get-cpu-mask'.format(perftune_base_command(), ifname), shell=True, check=True, capture_output=True, encoding='utf-8').stdout
+        # we need to extract CPU mask from output, since perftune.py may also print warning messages (#10082)
+        match = re.match('(.*\n)?(0x[0-9a-f]+(?:,0x[0-9a-f]+)*)', res, re.DOTALL)
+        try:
+            warning = match.group(1)
+            rps_cpus = match.group(2)
+        except:
+            raise Exception(f'Failed to retrive CPU mask: {res}')
+        # print warning message if available
+        if warning:
+            print(warning.strip())
        if len(rps_cpus) > 0:
            cpuset = hex2list(rps_cpus)
            run('/opt/scylladb/scripts/scylla_cpuset_setup --cpuset {}'.format(cpuset), shell=True, check=True)
--- a/dist/common/supervisor/scylla_util.sh
+++ b/dist/common/supervisor/scylla_util.sh
@@ -6,12 +6,16 @@ is_nonroot() {
    [ -f "$scylladir"/SCYLLA-NONROOT-FILE ]
 }

+is_container() {
+    [ -f "$scylladir"/SCYLLA-CONTAINER-FILE ]
+}
+
 is_privileged() {
    [ ${EUID:-${UID}} = 0 ]
 }

 execsudo() {
-    if is_nonroot; then
+    if is_nonroot || is_container; then
        exec "$@"
    else
        exec sudo -u scylla -g scylla "$@"
--- a/dist/docker/debian/build_docker.sh
+++ b/dist/docker/debian/build_docker.sh
@@ -82,15 +82,17 @@ run bash -ec "echo 'debconf debconf/frontend select Noninteractive' | debconf-se
 run bash -ec "rm -rf /etc/rsyslog.conf"
 run apt-get -y install hostname supervisor openssh-server openssh-client openjdk-11-jre-headless python python-yaml curl rsyslog locales sudo
 run locale-gen en_US.UTF-8
-run update-locale LANG=en_US.UTF-8 LANGUAGE=en_US:en LC_ALL=en_US.UTF_8
+run update-locale LANG=en_US.UTF-8 LANGUAGE=en_US:en LC_ALL=en_US.UTF-8
 run bash -ec "dpkg -i packages/*.deb"
 run apt-get -y clean all
 run bash -ec "cat /scylla_bashrc >> /etc/bash.bashrc"
 run mkdir -p /etc/supervisor.conf.d
 run mkdir -p /var/log/scylla
 run chown -R scylla:scylla /var/lib/scylla
+run sed -i -e 's/^SCYLLA_ARGS=".*"$/SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --network-stack posix"/' /etc/default/scylla-server

 run mkdir -p /opt/scylladb/supervisor
+run touch /opt/scylladb/SCYLLA-CONTAINER-FILE
 bcp dist/common/supervisor/scylla-server.sh /opt/scylladb/supervisor/scylla-server.sh
 bcp dist/common/supervisor/scylla-jmx.sh /opt/scylladb/supervisor/scylla-jmx.sh
 bcp dist/common/supervisor/scylla-node-exporter.sh /opt/scylladb/supervisor/scylla-node-exporter.sh
--- a/dist/docker/etc/supervisord.conf.d/scylla-server.conf
+++ b/dist/docker/etc/supervisord.conf.d/scylla-server.conf
@@ -1,4 +1,4 @@
-[program:scylla-server]
+[program:scylla]
 command=/opt/scylladb/supervisor/scylla-server.sh
 stdout_logfile=/dev/stdout
 stdout_logfile_maxbytes=0
--- a/dist/docker/etc/sysconfig/scylla-server
+++ b/dist/docker/etc/sysconfig/scylla-server
@@ -1,41 +0,0 @@
-# choose following mode: virtio, dpdk, posix
-NETWORK_MODE=posix
-
-# tap device name(virtio)
-TAP=tap0
-
-# bridge device name (virtio)
-BRIDGE=virbr0
-
-# ethernet device name
-IFNAME=eth0
-
-# setup NIC's and disks' interrupts, RPS, XPS, nomerges and I/O scheduler (posix)
-SET_NIC_AND_DISKS=no
-
-# ethernet device driver (dpdk)
-ETHDRV=
-
-# ethernet device PCI ID (dpdk)
-ETHPCIID=
-
-# number of hugepages
-NR_HUGEPAGES=64
-
-# user for process (must be root for dpdk)
-USER=scylla
-
-# group for process
-GROUP=scylla
-
-# scylla home dir
-SCYLLA_HOME=/var/lib/scylla
-
-# scylla config dir
-SCYLLA_CONF=/etc/scylla
-
-# scylla arguments
-SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --network-stack posix"
-
-# setup as AMI instance
-AMI=no
--- a/dist/docker/scyllasetup.py
+++ b/dist/docker/scyllasetup.py
@@ -68,7 +68,12 @@ class ScyllaSetup:

    def cqlshrc(self):
        home = os.environ['HOME']
-        hostname = subprocess.check_output(['hostname', '-i']).decode('ascii').strip()
+        if self._rpcAddress:
+            hostname = self._rpcAddress
+        elif self._listenAddress:
+            hostname = self._listenAddress
+        else:
+            hostname = subprocess.check_output(['hostname', '-i']).decode('ascii').strip()
        with open("%s/.cqlshrc" % home, "w") as cqlshrc:
            cqlshrc.write("[connection]\nhostname = %s\n" % hostname)

--- a/dist/redhat/scylla.spec
+++ b/dist/redhat/scylla.spec
@@ -7,7 +7,7 @@ Group:          Applications/Databases
 License:        AGPLv3
 URL:            http://www.scylladb.com/
 Source0:        %{reloc_pkg}
-Requires:       %{product}-server = %{version} %{product}-conf = %{version} %{product}-python3 = %{version} %{product}-kernel-conf = %{version} %{product}-jmx = %{version} %{product}-tools = %{version} %{product}-tools-core = %{version} %{product}-node-exporter = %{version}
+Requires:       %{product}-server = %{version}-%{release} %{product}-conf = %{version}-%{release} %{product}-python3 = %{version}-%{release} %{product}-kernel-conf = %{version}-%{release} %{product}-jmx = %{version}-%{release} %{product}-tools = %{version}-%{release} %{product}-tools-core = %{version}-%{release} %{product}-node-exporter = %{version}-%{release}
 Obsoletes:	scylla-server < 1.1

 %global _debugsource_template %{nil}
@@ -54,7 +54,7 @@ Group:          Applications/Databases
 Summary:        The Scylla database server
 License:        AGPLv3
 URL:            http://www.scylladb.com/
-Requires:       %{product}-conf  = %{version} %{product}-python3 = %{version}
+Requires:       %{product}-conf  = %{version}-%{release} %{product}-python3 = %{version}-%{release}
 Conflicts:      abrt
 AutoReqProv:    no

--- a/flat_mutation_reader.cc
+++ b/flat_mutation_reader.cc
@@ -32,7 +32,7 @@
 logging::logger fmr_logger("flat_mutation_reader");

 flat_mutation_reader& flat_mutation_reader::operator=(flat_mutation_reader&& o) noexcept {
-    if (_impl) {
+    if (_impl && _impl->is_close_required()) {
        impl* ip = _impl.get();
        // Abort to enforce calling close() before readers are closed
        // to prevent leaks and potential use-after-free due to background
@@ -45,7 +45,7 @@ flat_mutation_reader& flat_mutation_reader::operator=(flat_mutation_reader&& o)
 }

 flat_mutation_reader::~flat_mutation_reader() {
-    if (_impl) {
+    if (_impl && _impl->is_close_required()) {
        impl* ip = _impl.get();
        // Abort to enforce calling close() before readers are closed
        // to prevent leaks and potential use-after-free due to background
@@ -774,11 +774,14 @@ make_flat_mutation_reader_from_mutations_v2(schema_ptr s, reader_permit permit,
        std::optional<mutation_consume_cookie> _cookie;

    private:
-        void flush_tombstones(position_in_partition_view pos) {
+        void flush_tombstones(position_in_partition_view pos, bool emit_end = false) {
            _rt_gen.flush(pos, [&] (range_tombstone_change rt) {
                _current_rt = rt.tombstone();
                push_mutation_fragment(*_schema, _permit, std::move(rt));
            });
+            if (emit_end && _current_rt) {
+                push_mutation_fragment(*_schema, _permit, range_tombstone_change(pos, {}));
+            }
        }
        void maybe_emit_partition_start() {
            if (_dk) {
@@ -815,10 +818,7 @@ make_flat_mutation_reader_from_mutations_v2(schema_ptr s, reader_permit permit,
                return stop_iteration::yes;
            }
            maybe_emit_partition_start();
-            flush_tombstones(position_in_partition::after_all_clustered_rows());
-            if (_current_rt) {
-                push_mutation_fragment(*_schema, _permit, range_tombstone_change(position_in_partition::after_all_clustered_rows(), {}));
-            }
+            flush_tombstones(position_in_partition::after_all_clustered_rows(), true);
            push_mutation_fragment(*_schema, _permit, partition_end{});
            return stop_iteration::no;
        }
@@ -1580,6 +1580,9 @@ bool mutation_fragment_stream_validator::operator()(dht::token t) {
 }

 bool mutation_fragment_stream_validator::operator()(mutation_fragment_v2::kind kind, position_in_partition_view pos) {
+    if (kind == mutation_fragment_v2::kind::partition_end && _current_tombstone) {
+        return false;
+    }
    if (_prev_kind == mutation_fragment_v2::kind::partition_end) {
        const bool valid = (kind == mutation_fragment_v2::kind::partition_start);
        if (valid) {
@@ -1607,7 +1610,11 @@ bool mutation_fragment_stream_validator::operator()(mutation_fragment::kind kind
 }

 bool mutation_fragment_stream_validator::operator()(const mutation_fragment_v2& mf) {
-    return (*this)(mf.mutation_fragment_kind(), mf.position());
+    const auto valid = (*this)(mf.mutation_fragment_kind(), mf.position());
+    if (valid && mf.is_range_tombstone_change()) {
+        _current_tombstone = mf.as_range_tombstone_change().tombstone();
+    }
+    return valid;
 }
 bool mutation_fragment_stream_validator::operator()(const mutation_fragment& mf) {
    return (*this)(to_mutation_fragment_kind_v2(mf.mutation_fragment_kind()), mf.position());
@@ -1646,11 +1653,17 @@ void mutation_fragment_stream_validator::reset(dht::decorated_key dk) {
    _prev_partition_key = dk;
    _prev_pos = position_in_partition::for_partition_start();
    _prev_kind = mutation_fragment_v2::kind::partition_start;
+    _current_tombstone = {};
 }

 void mutation_fragment_stream_validator::reset(const mutation_fragment_v2& mf) {
    _prev_pos = mf.position();
    _prev_kind = mf.mutation_fragment_kind();
+    if (mf.is_range_tombstone_change()) {
+        _current_tombstone = mf.as_range_tombstone_change().tombstone();
+    } else {
+        _current_tombstone = {};
+    }
 }
 void mutation_fragment_stream_validator::reset(const mutation_fragment& mf) {
    _prev_pos = mf.position();
@@ -1719,6 +1732,11 @@ bool mutation_fragment_stream_validating_filter::operator()(mutation_fragment_v2

    fmr_logger.debug("[validator {}] {}:{}", static_cast<void*>(this), kind, pos);

+    if (kind == mutation_fragment_v2::kind::partition_end && _current_tombstone) {
+        on_validation_error(fmr_logger, format("[validator {} for {}] Unexpected active tombstone at partition-end: partition key {}: tombstone {}",
+                static_cast<void*>(this), _name, _validator.previous_partition_key(), _current_tombstone));
+    }
+
    if (_validation_level >= mutation_fragment_stream_validation_level::clustering_key) {
        valid = _validator(kind, pos);
    } else {
@@ -1745,7 +1763,11 @@ bool mutation_fragment_stream_validating_filter::operator()(mutation_fragment::k
 }

 bool mutation_fragment_stream_validating_filter::operator()(const mutation_fragment_v2& mv) {
-    return (*this)(mv.mutation_fragment_kind(), mv.position());
+    auto valid = (*this)(mv.mutation_fragment_kind(), mv.position());
+    if (valid && mv.is_range_tombstone_change()) {
+        _current_tombstone = mv.as_range_tombstone_change().tombstone();
+    }
+    return valid;
 }
 bool mutation_fragment_stream_validating_filter::operator()(const mutation_fragment& mv) {
    return (*this)(to_mutation_fragment_kind_v2(mv.mutation_fragment_kind()), mv.position());
@@ -1764,7 +1786,7 @@ void mutation_fragment_stream_validating_filter::on_end_of_stream() {
 }

 flat_mutation_reader_v2& flat_mutation_reader_v2::operator=(flat_mutation_reader_v2&& o) noexcept {
-    if (_impl) {
+    if (_impl && _impl->is_close_required()) {
        impl* ip = _impl.get();
        // Abort to enforce calling close() before readers are closed
        // to prevent leaks and potential use-after-free due to background
@@ -1777,7 +1799,7 @@ flat_mutation_reader_v2& flat_mutation_reader_v2::operator=(flat_mutation_reader
 }

 flat_mutation_reader_v2::~flat_mutation_reader_v2() {
-    if (_impl) {
+    if (_impl && _impl->is_close_required()) {
        impl* ip = _impl.get();
        // Abort to enforce calling close() before readers are closed
        // to prevent leaks and potential use-after-free due to background
@@ -1964,11 +1986,14 @@ flat_mutation_reader_v2 upgrade_to_v2(flat_mutation_reader r) {
        tombstone _current_rt;
        std::optional<position_range> _pr;
    public:
-        void flush_tombstones(position_in_partition_view pos) {
+        void flush_tombstones(position_in_partition_view pos, bool emit_end = false) {
            _rt_gen.flush(pos, [&] (range_tombstone_change rt) {
                _current_rt = rt.tombstone();
                push_mutation_fragment(*_schema, _permit, std::move(rt));
            });
+            if (emit_end && _current_rt) {
+                push_mutation_fragment(*_schema, _permit, range_tombstone_change(pos, {}));
+            }
        }
        void consume(static_row mf) {
            push_mutation_fragment(*_schema, _permit, std::move(mf));
@@ -1993,11 +2018,9 @@ flat_mutation_reader_v2 upgrade_to_v2(flat_mutation_reader r) {
            push_mutation_fragment(*_schema, _permit, std::move(mf));
        }
        void consume(partition_end mf) {
-            flush_tombstones(position_in_partition::after_all_clustered_rows());
+            flush_tombstones(position_in_partition::after_all_clustered_rows(), true);
            if (_current_rt) {
                assert(!_pr);
-                push_mutation_fragment(*_schema, _permit, range_tombstone_change(
-                        position_in_partition::after_all_clustered_rows(), {}));
            }
            push_mutation_fragment(*_schema, _permit, std::move(mf));
        }
@@ -2020,10 +2043,7 @@ flat_mutation_reader_v2 upgrade_to_v2(flat_mutation_reader r) {
                if (_reader.is_end_of_stream() && _reader.is_buffer_empty()) {
                    if (_pr) {
                        // If !_pr we should flush on partition_end
-                        flush_tombstones(_pr->end());
-                        if (_current_rt) {
-                            push_mutation_fragment(*_schema, _permit, range_tombstone_change(_pr->end(), {}));
-                        }
+                        flush_tombstones(_pr->end(), true);
                    }
                    _end_of_stream = true;
                }
--- a/flat_mutation_reader.hh
+++ b/flat_mutation_reader.hh
@@ -132,6 +132,7 @@ public:
    private:
        tracked_buffer _buffer;
        size_t _buffer_size = 0;
+        bool _close_required = false;
    protected:
        size_t max_buffer_size_in_bytes = default_max_buffer_size_in_bytes();
        bool _end_of_stream = false;
@@ -167,6 +168,8 @@ public:
        bool is_end_of_stream() const { return _end_of_stream; }
        bool is_buffer_empty() const { return _buffer.empty(); }
        bool is_buffer_full() const { return _buffer_size >= max_buffer_size_in_bytes; }
+        bool is_close_required() const { return _close_required; }
+        void set_close_required() { _close_required = true; }
        static constexpr size_t default_max_buffer_size_in_bytes() { return 8 * 1024; }

        mutation_fragment pop_mutation_fragment() {
@@ -504,9 +507,15 @@ public:
    //
    // Can be used to skip over entire partitions if interleaved with
    // `operator()()` calls.
-    future<> next_partition() { return _impl->next_partition(); }
+    future<> next_partition() {
+        _impl->set_close_required();
+        return _impl->next_partition();
+    }

-    future<> fill_buffer() { return _impl->fill_buffer(); }
+    future<> fill_buffer() {
+        _impl->set_close_required();
+        return _impl->fill_buffer();
+    }

    // Changes the range of partitions to pr. The range can only be moved
    // forwards. pr.begin() needs to be larger than pr.end() of the previousl
@@ -515,6 +524,7 @@ public:
    // pr needs to be valid until the reader is destroyed or fast_forward_to()
    // is called again.
    future<> fast_forward_to(const dht::partition_range& pr) {
+        _impl->set_close_required();
        return _impl->fast_forward_to(pr);
    }
    // Skips to a later range of rows.
@@ -544,6 +554,7 @@ public:
    // In particular one must first enter a partition by fetching a `partition_start`
    // fragment before calling `fast_forward_to`.
    future<> fast_forward_to(position_range cr) {
+        _impl->set_close_required();
        return _impl->fast_forward_to(std::move(cr));
    }
    // Closes the reader.
--- a/flat_mutation_reader_v2.hh
+++ b/flat_mutation_reader_v2.hh
@@ -164,6 +164,7 @@ public:
    private:
        tracked_buffer _buffer;
        size_t _buffer_size = 0;
+        bool _close_required = false;
    protected:
        size_t max_buffer_size_in_bytes = default_max_buffer_size_in_bytes();

@@ -205,6 +206,8 @@ public:
        bool is_end_of_stream() const { return _end_of_stream; }
        bool is_buffer_empty() const { return _buffer.empty(); }
        bool is_buffer_full() const { return _buffer_size >= max_buffer_size_in_bytes; }
+        bool is_close_required() const { return _close_required; }
+        void set_close_required() { _close_required = true; }
        static constexpr size_t default_max_buffer_size_in_bytes() { return 8 * 1024; }

        mutation_fragment_v2 pop_mutation_fragment() {
@@ -542,9 +545,15 @@ public:
    //
    // Can be used to skip over entire partitions if interleaved with
    // `operator()()` calls.
-    future<> next_partition() { return _impl->next_partition(); }
+    future<> next_partition() {
+        _impl->set_close_required();
+        return _impl->next_partition();
+    }

-    future<> fill_buffer() { return _impl->fill_buffer(); }
+    future<> fill_buffer() {
+        _impl->set_close_required();
+        return _impl->fill_buffer();
+    }

    // Changes the range of partitions to pr. The range can only be moved
    // forwards. pr.begin() needs to be larger than pr.end() of the previousl
@@ -553,6 +562,7 @@ public:
    // pr needs to be valid until the reader is destroyed or fast_forward_to()
    // is called again.
    future<> fast_forward_to(const dht::partition_range& pr) {
+        _impl->set_close_required();
        return _impl->fast_forward_to(pr);
    }
    // Skips to a later range of rows.
@@ -582,6 +592,7 @@ public:
    // In particular one must first enter a partition by fetching a `partition_start`
    // fragment before calling `fast_forward_to`.
    future<> fast_forward_to(position_range cr) {
+        _impl->set_close_required();
        return _impl->fast_forward_to(std::move(cr));
    }
    // Closes the reader.
--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -1012,10 +1012,10 @@ std::set<inet_address> gossiper::get_live_members() {

 std::set<inet_address> gossiper::get_live_token_owners() {
    std::set<inet_address> token_owners;
-    for (auto& member : get_live_members()) {
-        auto es = get_endpoint_state_for_endpoint_ptr(member);
-        if (es && !is_dead_state(*es) && get_token_metadata_ptr()->is_member(member)) {
-            token_owners.insert(member);
+    auto normal_token_owners = get_token_metadata_ptr()->get_all_endpoints();
+    for (auto& node: normal_token_owners) {
+        if (is_alive(node)) {
+            token_owners.insert(node);
        }
    }
    return token_owners;
@@ -1023,10 +1023,10 @@ std::set<inet_address> gossiper::get_live_token_owners() {

 std::set<inet_address> gossiper::get_unreachable_token_owners() {
    std::set<inet_address> token_owners;
-    for (auto&& x : _unreachable_endpoints) {
-        auto& endpoint = x.first;
-        if (get_token_metadata_ptr()->is_member(endpoint)) {
-            token_owners.insert(endpoint);
+    auto normal_token_owners = get_token_metadata_ptr()->get_all_endpoints();
+    for (auto& node: normal_token_owners) {
+        if (!is_alive(node)) {
+            token_owners.insert(node);
        }
    }
    return token_owners;
--- a/install.sh
+++ b/install.sh
@@ -143,7 +143,7 @@ export LD_LIBRARY_PATH="$prefix/libreloc"
 export UBSAN_OPTIONS="${UBSAN_OPTIONS:+$UBSAN_OPTIONS:}suppressions=$prefix/libexec/ubsan-suppressions.supp"
 exec -a "\$0" "$prefix/libexec/$bin" "\$@"
 EOF
-    chmod +x "$root/$prefix/bin/$bin"
+    chmod 755 "$root/$prefix/bin/$bin"
 }

 relocate_python3() {
@@ -156,11 +156,11 @@ relocate_python3() {
    local pythonpath="$(dirname "$pythoncmd")"

    if [ ! -x "$script" ]; then
-        cp "$script" "$install"
+        install -m755 "$script" "$install"
        return
    fi
-    mkdir -p "$relocateddir"
-    cp "$script" "$relocateddir"
+    install -d -m755 "$relocateddir"
+    install -m755 "$script" "$relocateddir"
    cat > "$install"<<EOF
 #!/usr/bin/env bash
 [[ -z "\$LD_PRELOAD" ]] || { echo "\$0: not compatible with LD_PRELOAD" >&2; exit 110; }
@@ -178,7 +178,7 @@ if [ -f "\${DEBIAN_SSL_CERT_FILE}" ]; then
 fi
 PYTHONPATH="\${d}:\${d}/libexec:\$PYTHONPATH" PATH="\${d}/../bin:\${d}/$pythonpath:\${PATH}" SSL_CERT_FILE="\${c}" exec -a "\$0" "\${d}/libexec/\${b}" "\$@"
 EOF
-    chmod +x "$install"
+    chmod 755 "$install"
 }

 install() {
@@ -392,6 +392,7 @@ install -d -m755 -d "$rprefix"/scyllatop
 cp -r tools/scyllatop/* "$rprefix"/scyllatop
 install -d -m755 -d "$rprefix"/scripts
 cp -r dist/common/scripts/* "$rprefix"/scripts
+chmod 755 "$rprefix"/scripts/*
 ln -srf "$rprefix/scyllatop/scyllatop.py" "$rprefix/bin/scyllatop"
 if $supervisor; then
    install -d -m755 "$rprefix"/supervisor
@@ -508,8 +509,13 @@ relocate_python3 "$rprefix"/scripts fix_system_distributed_tables.py
 if $supervisor; then
    install -d -m755 `supervisor_dir $retc`
    for service in scylla-server scylla-jmx scylla-node-exporter; do
+        if [ "$service" = "scylla-server" ]; then
+            program="scylla"
+        else
+            program=$service
+        fi
        cat << EOS > `supervisor_conf $retc $service`
-[program:$service]
+[program:$program]
 directory=$rprefix
 command=/bin/bash -c './supervisor/$service.sh'
 EOS
--- a/locator/abstract_replication_strategy.cc
+++ b/locator/abstract_replication_strategy.cc
@@ -215,22 +215,6 @@ effective_replication_map::get_primary_ranges_within_dc(inet_address ep) const {
    });
 }

-future<std::unordered_multimap<inet_address, dht::token_range>>
-abstract_replication_strategy::get_address_ranges(const token_metadata& tm) const {
-    std::unordered_multimap<inet_address, dht::token_range> ret;
-    for (auto& t : tm.sorted_tokens()) {
-        dht::token_range_vector r = tm.get_primary_ranges_for(t);
-        auto eps = co_await calculate_natural_endpoints(t, tm);
-        rslogger.debug("token={}, primary_range={}, address={}", t, r, eps);
-        for (auto ep : eps) {
-            for (auto&& rng : r) {
-                ret.emplace(ep, rng);
-            }
-        }
-    }
-    co_return ret;
-}
-
 future<std::unordered_multimap<inet_address, dht::token_range>>
 abstract_replication_strategy::get_address_ranges(const token_metadata& tm, inet_address endpoint) const {
    std::unordered_multimap<inet_address, dht::token_range> ret;
--- a/locator/abstract_replication_strategy.hh
+++ b/locator/abstract_replication_strategy.hh
@@ -112,7 +112,6 @@ public:
    future<dht::token_range_vector> get_ranges(inet_address ep, token_metadata_ptr tmptr) const;

 public:
-    future<std::unordered_multimap<inet_address, dht::token_range>> get_address_ranges(const token_metadata& tm) const;
    future<std::unordered_multimap<inet_address, dht::token_range>> get_address_ranges(const token_metadata& tm, inet_address endpoint) const;

    // Caller must ensure that token_metadata will not change throughout the call.
--- a/locator/azure_snitch.cc
+++ b/locator/azure_snitch.cc
@@ -15,6 +15,7 @@
 #include <seastar/core/coroutine.hh>
 #include <seastar/core/seastar.hh>
 #include <seastar/http/response_parser.hh>
+#include <seastar/http/reply.hh>
 #include <seastar/net/api.hh>
 #include <seastar/net/dns.hh>

@@ -34,6 +35,10 @@ azure_snitch::azure_snitch(const sstring& fname, unsigned io_cpuid) : production
 }

 future<> azure_snitch::load_config() {
+    if (this_shard_id() != io_cpu_id()) {
+        co_return;
+    }
+
    sstring region = co_await azure_api_call(REGION_NAME_QUERY_PATH);
    sstring azure_zone = co_await azure_api_call(ZONE_NAME_QUERY_PATH);

@@ -43,7 +48,8 @@ future<> azure_snitch::load_config() {

    logger().info("AzureSnitch using region: {}, zone: {}.", azure_region, azure_zone);

-    _my_rack = azure_zone;
+    // Zoneless regions return empty zone
+    _my_rack = (azure_zone != "" ? azure_zone : azure_region);
    _my_dc = azure_region;

    co_return co_await _my_distributed->invoke_on_all([this] (snitch_ptr& local_s) {
@@ -86,6 +92,10 @@ future<sstring> azure_snitch::azure_api_call(sstring path) {

        // Read HTTP response header first
        auto rsp = parser.get_parsed_response();
+        if (rsp->_status_code != static_cast<int>(httpd::reply::status_type::ok)) {
+            throw std::runtime_error(format("Error: HTTP response status {}", rsp->_status_code));
+        }
+
        auto it = rsp->_headers.find("Content-Length");
        if (it == rsp->_headers.end()) {
            throw std::runtime_error("Error: HTTP response does not contain: Content-Length\n");
--- a/locator/ec2_snitch.cc
+++ b/locator/ec2_snitch.cc
@@ -1,5 +1,8 @@
 #include "locator/ec2_snitch.hh"
 #include <seastar/core/seastar.hh>
+#include <seastar/core/sleep.hh>
+#include <seastar/core/do_with.hh>
+#include <seastar/http/reply.hh>

 #include <boost/algorithm/string/classification.hpp>
 #include <boost/algorithm/string/split.hpp>
@@ -67,6 +70,30 @@ future<> ec2_snitch::start() {
 }

 future<sstring> ec2_snitch::aws_api_call(sstring addr, uint16_t port, sstring cmd) {
+    return do_with(int(0), [this, addr, port, cmd] (int& i) {
+        return repeat_until_value([this, addr, port, cmd, &i]() -> future<std::optional<sstring>> {
+            ++i;
+            return aws_api_call_once(addr, port, cmd).then([] (auto res) {
+                return make_ready_future<std::optional<sstring>>(std::move(res));
+            }).handle_exception([&i] (auto ep) {
+                try {
+                    std::rethrow_exception(ep);
+                } catch (const std::system_error &e) {
+                    logger().error(e.what());
+                    if (i >= AWS_API_CALL_RETRIES - 1) {
+                        logger().error("Maximum number of retries exceeded");
+                        throw e;
+                    }
+                }
+                return sleep(AWS_API_CALL_RETRY_INTERVAL).then([] {
+                    return make_ready_future<std::optional<sstring>>(std::nullopt);
+                });
+            });
+        });
+    });
+}
+
+future<sstring> ec2_snitch::aws_api_call_once(sstring addr, uint16_t port, sstring cmd) {
    return connect(socket_address(inet_address{addr}, port))
    .then([this, addr, cmd] (connected_socket fd) {
        _sd = std::move(fd);
@@ -88,6 +115,9 @@ future<sstring> ec2_snitch::aws_api_call(sstring addr, uint16_t port, sstring cm

            // Read HTTP response header first
            auto _rsp = _parser.get_parsed_response();
+            if (_rsp->_status_code != static_cast<int>(httpd::reply::status_type::ok)) {
+                return make_exception_future<sstring>(std::runtime_error(format("Error: HTTP response status {}", _rsp->_status_code)));
+            }
            auto it = _rsp->_headers.find("Content-Length");
            if (it == _rsp->_headers.end()) {
                return make_exception_future<sstring>("Error: HTTP response does not contain: Content-Length\n");
--- a/locator/ec2_snitch.hh
+++ b/locator/ec2_snitch.hh
@@ -16,6 +16,8 @@ public:
    static constexpr const char* ZONE_NAME_QUERY_REQ = "/latest/meta-data/placement/availability-zone";
    static constexpr const char* AWS_QUERY_SERVER_ADDR = "169.254.169.254";
    static constexpr uint16_t AWS_QUERY_SERVER_PORT = 80;
+    static constexpr int AWS_API_CALL_RETRIES = 5;
+    static constexpr auto AWS_API_CALL_RETRY_INTERVAL = std::chrono::seconds{5};

    ec2_snitch(const sstring& fname = "", unsigned io_cpu_id = 0);
    virtual future<> start() override;
@@ -32,5 +34,6 @@ private:
    output_stream<char> _out;
    http_response_parser _parser;
    sstring _zone_req;
+    future<sstring> aws_api_call_once(sstring addr, uint16_t port, const sstring cmd);
 };
 } // namespace locator
--- a/locator/gce_snitch.cc
+++ b/locator/gce_snitch.cc
@@ -14,6 +14,7 @@
 #include <seastar/net/dns.hh>
 #include <seastar/core/seastar.hh>
 #include "locator/gce_snitch.hh"
+#include <seastar/http/reply.hh>

 #include <boost/algorithm/string/split.hpp>
 #include <boost/algorithm/string/classification.hpp>
@@ -106,6 +107,10 @@ future<sstring> gce_snitch::gce_api_call(sstring addr, sstring cmd) {

        // Read HTTP response header first
        auto rsp = parser.get_parsed_response();
+        if (rsp->_status_code != static_cast<int>(httpd::reply::status_type::ok)) {
+            throw std::runtime_error(format("Error: HTTP response status {}", rsp->_status_code));
+        }
+
        auto it = rsp->_headers.find("Content-Length");
        if (it == rsp->_headers.end()) {
            throw std::runtime_error("Error: HTTP response does not contain: Content-Length\n");
--- a/locator/token_metadata.cc
+++ b/locator/token_metadata.cc
@@ -786,13 +786,12 @@ void token_metadata_impl::calculate_pending_ranges_for_leaving(
        const abstract_replication_strategy& strategy,
        std::unordered_multimap<range<token>, inet_address>& new_pending_ranges,
        mutable_token_metadata_ptr all_left_metadata) const {
-    std::unordered_multimap<inet_address, dht::token_range> address_ranges = strategy.get_address_ranges(unpimplified_this).get0();
    // get all ranges that will be affected by leaving nodes
    std::unordered_set<range<token>> affected_ranges;
    for (auto endpoint : _leaving_endpoints) {
-        auto r = address_ranges.equal_range(endpoint);
-        for (auto x = r.first; x != r.second; x++) {
-            affected_ranges.emplace(x->second);
+        auto r = strategy.get_address_ranges(unpimplified_this, endpoint).get0();
+        for (const auto& x : r) {
+            affected_ranges.emplace(x.second);
        }
    }
    // for each of those ranges, find what new nodes will be responsible for the range when
@@ -826,16 +825,14 @@ void token_metadata_impl::calculate_pending_ranges_for_replacing(
    if (_replacing_endpoints.empty()) {
        return;
    }
-    auto address_ranges = strategy.get_address_ranges(unpimplified_this).get0();
    for (const auto& node : _replacing_endpoints) {
        auto existing_node = node.first;
        auto replacing_node = node.second;
+        auto address_ranges = strategy.get_address_ranges(unpimplified_this, existing_node).get0();
        for (const auto& x : address_ranges) {
            seastar::thread::maybe_yield();
-            if (x.first == existing_node) {
-                tlogger.debug("Node {} replaces {} for range {}", replacing_node, existing_node, x.second);
-                new_pending_ranges.emplace(x.second, replacing_node);
-            }
+            tlogger.debug("Node {} replaces {} for range {}", replacing_node, existing_node, x.second);
+            new_pending_ranges.emplace(x.second, replacing_node);
        }
    }
 }
--- a/main.cc
+++ b/main.cc
@@ -367,11 +367,40 @@ static auto defer_verbose_shutdown(const char* what, Func&& func) {
        startlog.info("Shutting down {}", what);
        try {
            func();
+            startlog.info("Shutting down {} was successful", what);
        } catch (...) {
-            startlog.error("Unexpected error shutting down {}: {}", what, std::current_exception());
-            throw;
+            auto ex = std::current_exception();
+            bool do_abort = true;
+            try {
+                std::rethrow_exception(ex);
+            } catch (const std::system_error& e) {
+                // System error codes we consider "environmental",
+                // i.e. not scylla's fault, therefore there is no point in
+                // aborting and dumping core.
+                for (int i : {EIO, EACCES, ENOSPC}) {
+                    if (e.code() == std::error_code(i, std::system_category())) {
+                        do_abort = false;
+                        break;
+                    }
+                }
+            } catch (const storage_io_error& e) {
+                do_abort = false;
+            } catch (...) {
+            }
+            auto msg = fmt::format("Unexpected error shutting down {}: {}", what, ex);
+            if (do_abort) {
+                startlog.error("{}: aborting", msg);
+                abort();
+            } else {
+                startlog.error("{}: exiting, at {}", msg, current_backtrace());
+
+                // Call _exit() rather than exit() to exit immediately
+                // without calling exit handlers, avoiding
+                // boost::intrusive::detail::destructor_impl assert failure
+                // from ~segment_pool exit handler.
+                _exit(255);
+            }
        }
-        startlog.info("Shutting down {} was successful", what);
    };

    auto ret = deferred_action(std::move(vfunc));
@@ -398,6 +427,39 @@ static int scylla_main(int ac, char** av) {
        exit(1);
    }

+    // Even on the environment which causes error during initalize Scylla,
+    // "scylla --version" should be able to run without error.
+    // To do so, we need to parse and execute these options before
+    // initializing Scylla/Seastar classes.
+    bpo::options_description preinit_description("Scylla options");
+    bpo::variables_map preinit_vm;
+    preinit_description.add_options()
+        ("version", bpo::bool_switch(), "print version number and exit")
+        ("build-id", bpo::bool_switch(), "print build-id and exit")
+        ("build-mode", bpo::bool_switch(), "print build mode and exit")
+        ("list-tools", bpo::bool_switch(), "list included tools and exit");
+    auto preinit_parsed_opts = bpo::command_line_parser(ac, av).options(preinit_description).allow_unregistered().run();
+    bpo::store(preinit_parsed_opts, preinit_vm);
+    if (preinit_vm["version"].as<bool>()) {
+        fmt::print("{}\n", scylla_version());
+        return 0;
+    }
+    if (preinit_vm["build-id"].as<bool>()) {
+        fmt::print("{}\n", get_build_id());
+        return 0;
+    }
+    if (preinit_vm["build-mode"].as<bool>()) {
+        fmt::print("{}\n", scylla_build_mode());
+        return 0;
+    }
+    if (preinit_vm["list-tools"].as<bool>()) {
+        fmt::print(
+                "types - a command-line tool to examine values belonging to scylla types\n"
+                "sstable - a multifunctional command-line tool to examine the content of sstables\n"
+        );
+        return 0;
+    }
+
  try {
    runtime::init_uptime();
    std::setvbuf(stdout, nullptr, _IOLBF, 1000);
@@ -452,26 +514,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
    bpo::variables_map vm;
    auto parsed_opts = bpo::command_line_parser(ac, av).options(app.get_options_description()).allow_unregistered().run();
    bpo::store(parsed_opts, vm);
-    if (vm["version"].as<bool>()) {
-        fmt::print("{}\n", scylla_version());
-        return 0;
-    }
-    if (vm["build-id"].as<bool>()) {
-        fmt::print("{}\n", get_build_id());
-        return 0;
-    }
-    if (vm["build-mode"].as<bool>()) {
-        fmt::print("{}\n", scylla_build_mode());
-        return 0;
-    }
-    if (vm["list-tools"].as<bool>()) {
-        fmt::print(
-                "types - a command-line tool to examine values belonging to scylla types\n"
-                "sstable - a multifunctional command-line tool to examine the content of sstables\n"
-        );
-        return 0;
-    }
-
    print_starting_message(ac, av, parsed_opts);

    sharded<locator::shared_token_metadata> token_metadata;
@@ -547,6 +589,12 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl

            cfg->broadcast_to_all_shards().get();

+            // We pass this piece of config through a global as a temporary hack.
+            // See the comment at the definition of sstables::global_cache_index_pages.
+            smp::invoke_on_all([&cfg] {
+                sstables::global_cache_index_pages = cfg->cache_index_pages.operator utils::updateable_value<bool>();
+            }).get();
+
            ::sighup_handler sighup_handler(opts, *cfg);
            auto stop_sighup_handler = defer_verbose_shutdown("sighup", [&] {
                sighup_handler.stop().get();
@@ -1089,7 +1137,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            // ATTN -- sharded repair reference already sits on storage_service and if
            // it calls repair.local() before this place it'll crash (now it doesn't do
            // both)
-            supervisor::notify("starting messaging service");
+            supervisor::notify("starting repair service");
            auto max_memory_repair = memory::stats().total_memory() * 0.1;
            repair.start(std::ref(gossiper), std::ref(messaging), std::ref(db), std::ref(proxy), std::ref(bm), std::ref(sys_dist_ks), std::ref(view_update_generator), std::ref(mm), max_memory_repair).get();
            auto stop_repair_service = defer_verbose_shutdown("repair service", [&repair] {
--- a/memtable-sstable.hh
+++ b/memtable-sstable.hh
@@ -15,6 +15,7 @@
 #include "sstables/shared_sstable.hh"
 #include <seastar/core/future.hh>
 #include <seastar/core/io_priority_class.hh>
+#include "reader_permit.hh"

 class memtable;
 class flat_mutation_reader;
--- a/message/messaging_service.cc
+++ b/message/messaging_service.cc
@@ -438,6 +438,8 @@ static constexpr unsigned do_get_rpc_client_idx(messaging_verb verb) {
    // should not be blocked by any data requests.
    case messaging_verb::GROUP0_PEER_EXCHANGE:
    case messaging_verb::GROUP0_MODIFY_CONFIG:
+        // ATTN -- if moving GOSSIP_ verbs elsewhere, mind updating the tcp_nodelay
+        // setting in get_rpc_client(), which assumes gossiper verbs live in idx 0
        return 0;
    case messaging_verb::PREPARE_MESSAGE:
    case messaging_verb::PREPARE_DONE_MESSAGE:
@@ -695,7 +697,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
    }();

    auto must_tcp_nodelay = [&] {
-        if (idx == 1) {
+        if (idx == 0) {
            return true; // gossip
        }
        if (_cfg.tcp_nodelay == tcp_nodelay_what::local) {
@@ -716,10 +718,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
    }
    opts.tcp_nodelay = must_tcp_nodelay;
    opts.reuseaddr = true;
-    // We send cookies only for non-default statement tenant clients.
-    if (idx > 3) {
-        opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;
-    }
+    opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;

    auto client = must_encrypt ?
                    ::make_shared<rpc_protocol_client_wrapper>(_rpc->protocol(), std::move(opts),
--- a/multishard_mutation_query.cc
+++ b/multishard_mutation_query.cc
@@ -272,8 +272,8 @@ public:

    future<> lookup_readers(db::timeout_clock::time_point timeout);

-    future<> save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, detached_compaction_state compaction_state,
-            std::optional<clustering_key_prefix> last_ckey);
+    future<> save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, std::optional<detached_compaction_state> compaction_state,
+            dht::decorated_key last_pkey, std::optional<clustering_key_prefix> last_ckey);

    future<> stop();
 };
@@ -580,19 +580,22 @@ future<> read_context::lookup_readers(db::timeout_clock::time_point timeout) {
    });
 }

-future<> read_context::save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, detached_compaction_state compaction_state,
-            std::optional<clustering_key_prefix> last_ckey) {
+future<> read_context::save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, std::optional<detached_compaction_state> compaction_state,
+            dht::decorated_key last_pkey, std::optional<clustering_key_prefix> last_ckey) {
    if (_cmd.query_uuid == utils::UUID{}) {
        return make_ready_future<>();
    }

-    auto last_pkey = compaction_state.partition_start.key();
-
    const auto cb_stats = dismantle_combined_buffer(std::move(unconsumed_buffer), last_pkey);
    tracing::trace(_trace_state, "Dismantled combined buffer: {}", cb_stats);

-    const auto cs_stats = dismantle_compaction_state(std::move(compaction_state));
-    tracing::trace(_trace_state, "Dismantled compaction state: {}", cs_stats);
+    auto cs_stats = dismantle_buffer_stats{};
+    if (compaction_state) {
+        cs_stats = dismantle_compaction_state(std::move(*compaction_state));
+        tracing::trace(_trace_state, "Dismantled compaction state: {}", cs_stats);
+    } else {
+        tracing::trace(_trace_state, "No compaction state to dismantle, partition exhausted", cs_stats);
+    }

    return do_with(std::move(last_pkey), std::move(last_ckey), [this] (const dht::decorated_key& last_pkey,
            const std::optional<clustering_key_prefix>& last_ckey) {
@@ -745,16 +748,18 @@ future<typename ResultBuilder::result_type> do_query(
        ResultBuilder&& result_builder) {
    auto ctx = seastar::make_shared<read_context>(db, s, cmd, ranges, trace_state, timeout);

-    co_await ctx->lookup_readers(timeout);
-
    std::exception_ptr ex;

    try {
+        co_await ctx->lookup_readers(timeout);
+
        auto [last_ckey, result, unconsumed_buffer, compaction_state] = co_await read_page<ResultBuilder>(ctx, s, cmd, ranges, trace_state,
                std::move(result_builder));

        if (compaction_state->are_limits_reached() || result.is_short_read()) {
-            co_await ctx->save_readers(std::move(unconsumed_buffer), std::move(*compaction_state).detach_state(), std::move(last_ckey));
+            // Must call before calling 'detached_state()`.
+            auto last_pkey = *compaction_state->current_partition();
+            co_await ctx->save_readers(std::move(unconsumed_buffer), std::move(*compaction_state).detach_state(), std::move(last_pkey), std::move(last_ckey));
        }

        co_await ctx->stop();
--- a/mutation_compactor.hh
+++ b/mutation_compactor.hh
@@ -167,6 +167,9 @@ class compact_mutation_state {
    std::unique_ptr<mutation_compactor_garbage_collector> _collector;

    compaction_stats _stats;
+
+    // Remember if we requested to stop mid-partition.
+    stop_iteration _stop = stop_iteration::no;
 private:
    template <typename Consumer, typename GCConsumer>
    requires CompactedFragmentsConsumer<Consumer> && CompactedFragmentsConsumer<GCConsumer>
@@ -304,6 +307,7 @@ public:
    }

    void consume_new_partition(const dht::decorated_key& dk) {
+        _stop = stop_iteration::no;
        auto& pk = dk.key();
        _dk = &dk;
        _return_static_content_on_partition_with_no_rows =
@@ -370,9 +374,9 @@ public:
        _static_row_live = is_live;
        if (is_live || (!only_live() && !sr.empty())) {
            partition_is_not_empty(consumer);
-            return consumer.consume(std::move(sr), current_tombstone, is_live);
+            _stop = consumer.consume(std::move(sr), current_tombstone, is_live);
        }
-        return stop_iteration::no;
+        return _stop;
    }

    template <typename Consumer, typename GCConsumer>
@@ -424,22 +428,21 @@ public:
        };

        if (only_live() && is_live) {
-            auto stop = consume_row();
+            _stop = consume_row();
            if (++_rows_in_current_partition == _current_partition_limit) {
-                return stop_iteration::yes;
+                _stop = stop_iteration::yes;
            }
-            return stop;
+            return _stop;
        } else if (!only_live()) {
-            auto stop = stop_iteration::no;
            if (!cr.empty()) {
-                stop = consume_row();
+                _stop = consume_row();
            }
            if (!sstable_compaction() && is_live && ++_rows_in_current_partition == _current_partition_limit) {
-                return stop_iteration::yes;
+                _stop = stop_iteration::yes;
            }
-            return stop;
+            return _stop;
        }
-        return stop_iteration::no;
+        return _stop;
    }

    template <typename Consumer, typename GCConsumer>
@@ -448,7 +451,8 @@ public:
        ++_stats.range_tombstones;
        _range_tombstones.apply(rt);
        // FIXME: drop tombstone if it is fully covered by other range tombstones
-        return do_consume(std::move(rt), consumer, gc_consumer);
+        _stop = do_consume(std::move(rt), consumer, gc_consumer);
+        return _stop;
    }

    template <typename Consumer, typename GCConsumer>
@@ -459,9 +463,9 @@ public:
            _rt_assembler.emplace();
        }
        if (auto rt_opt = _rt_assembler->consume(_schema, std::move(rtc))) {
-            return do_consume(std::move(*rt_opt), consumer, gc_consumer);
+            _stop = do_consume(std::move(*rt_opt), consumer, gc_consumer);
        }
-        return stop_iteration::no;
+        return _stop;
    }

    template <typename Consumer, typename GCConsumer>
@@ -490,8 +494,16 @@ public:
            _partition_limit -= _rows_in_current_partition > 0;
            auto stop = consumer.consume_end_of_partition();
            if (!sstable_compaction()) {
-                return _row_limit && _partition_limit && stop != stop_iteration::yes
+                stop = _row_limit && _partition_limit && stop != stop_iteration::yes
                       ? stop_iteration::no : stop_iteration::yes;
+                // If we decided to stop earlier but decide to continue now, we
+                // are in effect skipping the partition. Do not leave `_stop` at
+                // `stop_iteration::yes` in this case, reset it back to
+                // `stop_iteration::no` as if we exhausted the partition.
+                if (_stop && !stop) {
+                    _stop = stop_iteration::no;
+                }
+                return stop;
            }
        }
        return stop_iteration::no;
@@ -536,6 +548,7 @@ public:
        _current_partition_limit = std::min(_row_limit, _partition_row_limit);
        _query_time = query_time;
        _stats = {};
+        _stop = stop_iteration::no;

        noop_compacted_fragments_consumer nc;

@@ -562,16 +575,31 @@ public:
    /// compactor will result in the new compactor being in the same state *this
    /// is (given the same outside parameters of course). Practically this
    /// allows the compaction state to be stored in the compacted reader.
-    detached_compaction_state detach_state() && {
+    /// If the currently compacted partition is exhausted a disengaged optional
+    /// is returned -- in this case there is no state to detach.
+    std::optional<detached_compaction_state> detach_state() && {
+        // If we exhausted the partition, there is no need to detach-restore the
+        // compaction state.
+        // We exhausted the partition if `consume_partition_end()` was called
+        // without us requesting the consumption to stop (remembered in _stop)
+        // from one of the consume() overloads.
+        // The consume algorithm calls `consume_partition_end()` in two cases:
+        // * on a partition-end fragment
+        // * consume() requested to stop
+        // In the latter case, the partition is not exhausted. Even if the next
+        // fragment to process is a partition-end, it will not be consumed.
+        if (!_stop) {
+            return {};
+        }
        partition_start ps(std::move(_last_dk), _range_tombstones.get_partition_tombstone());
        if (_rt_assembler) {
            if (_current_tombstone) {
-                return {std::move(ps), std::move(_last_static_row), range_tombstone_change(position_in_partition_view::after_key(_last_clustering_pos), _current_tombstone)};
+                return detached_compaction_state{std::move(ps), std::move(_last_static_row), range_tombstone_change(position_in_partition_view::after_key(_last_clustering_pos), _current_tombstone)};
            } else {
-                return {std::move(ps), std::move(_last_static_row), std::optional<range_tombstone_change>{}};
+                return detached_compaction_state{std::move(ps), std::move(_last_static_row), std::optional<range_tombstone_change>{}};
            }
        }
-        return {std::move(ps), std::move(_last_static_row), std::move(_range_tombstones).range_tombstones()};
+        return detached_compaction_state{std::move(ps), std::move(_last_static_row), std::move(_range_tombstones).range_tombstones()};
    }

    const compaction_stats& stats() const { return _stats; }
--- a/mutation_fragment_stream_validator.hh
+++ b/mutation_fragment_stream_validator.hh
@@ -28,6 +28,7 @@ class mutation_fragment_stream_validator {
    mutation_fragment_v2::kind _prev_kind;
    position_in_partition _prev_pos;
    dht::decorated_key _prev_partition_key;
+    tombstone _current_tombstone;
 public:
    explicit mutation_fragment_stream_validator(const schema& s);

@@ -122,6 +123,12 @@ public:
    const position_in_partition& previous_position() const {
        return _prev_pos;
    }
+    /// Get the current effective tombstone
+    ///
+    /// Not meaningful, when operator()(mutation_fragment_v2) is not used.
+    tombstone current_tombstone() const {
+        return _current_tombstone;
+    }
    /// The previous valid partition key.
    ///
    /// Only valid if `operator()(const dht::decorated_key&)` or
@@ -151,6 +158,7 @@ class mutation_fragment_stream_validating_filter {
    mutation_fragment_stream_validator _validator;
    sstring _name;
    mutation_fragment_stream_validation_level _validation_level;
+    tombstone _current_tombstone;

 public:
    /// Constructor.
--- a/mutation_partition.hh
+++ b/mutation_partition.hh
@@ -826,6 +826,7 @@ public:

    void apply(tombstone deleted_at) {
        _deleted_at.apply(deleted_at);
+        maybe_shadow();
    }

    void apply(shadowable_tombstone deleted_at) {
--- a/mutation_reader.cc
+++ b/mutation_reader.cc
@@ -1240,7 +1240,10 @@ future<flat_mutation_reader> evictable_reader::resume_or_create_reader() {
    if (auto reader_opt = try_resume()) {
        co_return std::move(*reader_opt);
    }
-    co_await _permit.maybe_wait_readmission();
+    // See evictable_reader_v2::resume_or_create_reader()
+    if (_permit.needs_readmission()) {
+        co_await _permit.wait_readmission();
+    }
    co_return recreate_reader();
 }

@@ -1581,11 +1584,7 @@ private:
    tracing::global_trace_state_ptr _trace_state;
    const mutation_reader::forwarding _fwd_mr;
    reader_concurrency_semaphore::inactive_read_handle _irh;
-    bool _drop_partition_start = false;
-    bool _drop_static_row = false;
-    // Validate the partition key of the first emitted partition, set after the
-    // reader was recreated.
-    bool _validate_partition_key = false;
+    bool _reader_recreated = false; // set if reader was recreated since last operation
    position_in_partition::tri_compare _tri_cmp;

    std::optional<dht::decorated_key> _last_pkey;
@@ -1606,10 +1605,9 @@ private:
    void adjust_partition_slice();
    flat_mutation_reader_v2 recreate_reader();
    future<flat_mutation_reader_v2> resume_or_create_reader();
-    void maybe_validate_partition_start(const flat_mutation_reader_v2::tracked_buffer& buffer);
+    void validate_partition_start(const partition_start& ps);
    void validate_position_in_partition(position_in_partition_view pos) const;
-    bool should_drop_fragment(const mutation_fragment_v2& mf);
-    future<> do_fill_buffer();
+    void examine_first_fragments(mutation_fragment_v2_opt& mf1, mutation_fragment_v2_opt& mf2, mutation_fragment_v2_opt& mf3);

 public:
    evictable_reader_v2(
@@ -1725,9 +1723,6 @@ flat_mutation_reader_v2 evictable_reader_v2::recreate_reader() {
    _range_override.reset();
    _slice_override.reset();

-    _drop_partition_start = false;
-    _drop_static_row = false;
-
    if (_last_pkey) {
        bool partition_range_is_inclusive = true;

@@ -1736,11 +1731,8 @@ flat_mutation_reader_v2 evictable_reader_v2::recreate_reader() {
            partition_range_is_inclusive = false;
            break;
        case partition_region::static_row:
-            _drop_partition_start = true;
            break;
        case partition_region::clustered:
-            _drop_partition_start = true;
-            _drop_static_row = true;
            adjust_partition_slice();
            slice = &*_slice_override;
            break;
@@ -1763,7 +1755,7 @@ flat_mutation_reader_v2 evictable_reader_v2::recreate_reader() {
        _range_override = dht::partition_range({dht::partition_range::bound(*_last_pkey, partition_range_is_inclusive)}, _pr->end());
        range = &*_range_override;

-        _validate_partition_key = true;
+        _reader_recreated = true;
    }

    return _ms.make_reader_v2(
@@ -1784,45 +1776,48 @@ future<flat_mutation_reader_v2> evictable_reader_v2::resume_or_create_reader() {
    if (auto reader_opt = try_resume()) {
        co_return std::move(*reader_opt);
    }
-    co_await _permit.maybe_wait_readmission();
+    // When the reader is created the first time and we are actually resuming a
+    // saved reader in `recreate_reader()`, we have two cases here:
+    // * the reader is still alive (in inactive state)
+    // * the reader was evicted
+    // We check for this below with `needs_readmission()` and it is very
+    // important to not allow for preemption between said check and
+    // `recreate_reader()`, otherwise the reader might be evicted between the
+    // check and `recreate_reader()` and the latter will recreate it without
+    // waiting for re-admission.
+    if (_permit.needs_readmission()) {
+        co_await _permit.wait_readmission();
+    }
    co_return recreate_reader();
 }

-void evictable_reader_v2::maybe_validate_partition_start(const flat_mutation_reader_v2::tracked_buffer& buffer) {
-    if (!_validate_partition_key || buffer.empty()) {
-        return;
-    }
-
-    // If this is set we can assume the first fragment is a partition-start.
-    const auto& ps = buffer.front().as_partition_start();
+void evictable_reader_v2::validate_partition_start(const partition_start& ps) {
    const auto tri_cmp = dht::ring_position_comparator(*_schema);
    // If we recreated the reader after fast-forwarding it we won't have
    // _last_pkey set. In this case it is enough to check if the partition
    // is in range.
    if (_last_pkey) {
        const auto cmp_res = tri_cmp(*_last_pkey, ps.key());
-        if (_drop_partition_start) { // we expect to continue from the same partition
+        if (_next_position_in_partition.region() != partition_region::partition_start) { // we expect to continue from the same partition
            // We cannot assume the partition we stopped the read at is still alive
            // when we recreate the reader. It might have been compacted away in the
            // meanwhile, so allow for a larger partition too.
            require(
                    cmp_res <= 0,
-                    "{}(): validation failed, expected partition with key larger or equal to _last_pkey {} due to _drop_partition_start being set, but got {}",
+                    "{}(): validation failed, expected partition with key larger or equal to _last_pkey {}, but got {}",
                    __FUNCTION__,
                    *_last_pkey,
                    ps.key());
-            // Reset drop flags and next pos if we are not continuing from the same partition
+            // Reset next pos if we are not continuing from the same partition
            if (cmp_res < 0) {
                // Close previous partition, we are not going to continue it.
                push_mutation_fragment(*_schema, _permit, partition_end{});
-                _drop_partition_start = false;
-                _drop_static_row = false;
                _next_position_in_partition = position_in_partition::for_partition_start();
            }
        } else { // should be a larger partition
            require(
                    cmp_res < 0,
-                    "{}(): validation failed, expected partition with key larger than _last_pkey {} due to _drop_partition_start being unset, but got {}",
+                    "{}(): validation failed, expected partition with key larger than _last_pkey {}, but got {}",
                    __FUNCTION__,
                    *_last_pkey,
                    ps.key());
@@ -1836,8 +1831,6 @@ void evictable_reader_v2::maybe_validate_partition_start(const flat_mutation_rea
            __FUNCTION__,
            prange,
            ps.key());
-
-    _validate_partition_key = false;
 }

 void evictable_reader_v2::validate_position_in_partition(position_in_partition_view pos) const {
@@ -1860,7 +1853,12 @@ void evictable_reader_v2::validate_position_in_partition(position_in_partition_v
        const bool any_contains = std::any_of(ranges.begin(), ranges.end(), [this, &pos] (const query::clustering_range& cr) {
            // TODO: somehow avoid this copy
            auto range = position_range(cr);
-            return range.contains(*_schema, pos);
+            // We cannot use range.contains() because that treats range as a
+            // [a, b) range, meaning a range tombstone change with position
+            // after_key(b) will be considered outside of it. Such range
+            // tombstone changes can be emitted however when recreating the
+            // reader on clustering range edge.
+            return _tri_cmp(range.start(), pos) <= 0 && _tri_cmp(pos, range.end()) <= 0;
        });
        require(
                any_contains,
@@ -1871,42 +1869,40 @@ void evictable_reader_v2::validate_position_in_partition(position_in_partition_v
    }
 }

-bool evictable_reader_v2::should_drop_fragment(const mutation_fragment_v2& mf) {
-    if (_drop_partition_start && mf.is_partition_start()) {
-        _drop_partition_start = false;
-        return true;
+void evictable_reader_v2::examine_first_fragments(mutation_fragment_v2_opt& mf1, mutation_fragment_v2_opt& mf2, mutation_fragment_v2_opt& mf3) {
+    if (!mf1) {
+        return; // the reader is at EOS
    }
-    // Unlike partition-start above, a partition is not guaranteed to have a
-    // static row fragment. So reset the flag regardless of whether we could
-    // drop one or not.
-    // We are guaranteed to get here only right after dropping a partition-start,
-    // so if we are not seeing a static row here, the partition doesn't have one.
-    if (_drop_static_row) {
-         _drop_static_row = false;
-        return mf.is_static_row();
-    }
-    return false;
-}

-future<> evictable_reader_v2::do_fill_buffer() {
-    if (!_drop_partition_start && !_drop_static_row) {
-        auto fill_buf_fut = _reader->fill_buffer();
-        if (_validate_partition_key) {
-            fill_buf_fut = fill_buf_fut.then([this] {
-                maybe_validate_partition_start(_reader->buffer());
-            });
-        }
-        return fill_buf_fut;
+    // If engaged, the first fragment is always a partition-start.
+    validate_partition_start(mf1->as_partition_start());
+    if (_tri_cmp(mf1->position(), _next_position_in_partition) < 0) {
+        mf1 = {}; // drop mf1
+    }
+
+    const auto continue_same_partition = _next_position_in_partition.region() != partition_region::partition_start;
+
+    // If we have a first fragment, we are guaranteed to have a second one -- if not else, a partition-end.
+    if (mf2->is_end_of_partition()) {
+        return; // no further fragments, nothing to do
+    }
+
+    // We want to validate the position of the first non-dropped fragment.
+    // If mf2 is a static row and we need to drop it, this will be mf3.
+    if (mf2->is_static_row() && _tri_cmp(mf2->position(), _next_position_in_partition) < 0) {
+        mf2 = {}; // drop mf2
+    } else {
+        if (continue_same_partition) {
+            validate_position_in_partition(mf2->position());
+        }
+        return;
+    }
+
+    if (mf3->is_end_of_partition()) {
+        return; // no further fragments, nothing to do
+    } else if (continue_same_partition) {
+        validate_position_in_partition(mf3->position());
    }
-    return repeat([this] {
-        return _reader->fill_buffer().then([this] {
-            maybe_validate_partition_start(_reader->buffer());
-            while (!_reader->is_buffer_empty() && should_drop_fragment(_reader->peek_buffer())) {
-                _reader->pop_mutation_fragment();
-            }
-            return stop_iteration(_reader->is_buffer_full() || _reader->is_end_of_stream());
-        });
-    });
 }

 evictable_reader_v2::evictable_reader_v2(
@@ -1935,10 +1931,64 @@ future<> evictable_reader_v2::fill_buffer() {
        co_return;
    }
    _reader = co_await resume_or_create_reader();
-    co_await do_fill_buffer();
+
+    if (_reader_recreated) {
+        // Recreating the reader breaks snapshot isolation and creates all sorts
+        // of complications around the continuity of range tombstone changes,
+        // e.g. a range tombstone started by the previous reader object
+        // might not exist anymore with the new reader object.
+        // To avoid complications we reset the tombstone state on each reader
+        // recreation by emitting a null tombstone change, if we read at least
+        // one clustering fragment from the partition.
+        if (_next_position_in_partition.region() == partition_region::clustered
+                && _tri_cmp(_next_position_in_partition, position_in_partition::before_all_clustered_rows()) > 0) {
+            push_mutation_fragment(*_schema, _permit, range_tombstone_change{position_in_partition_view::before_key(_next_position_in_partition), {}});
+        }
+        auto mf1 = co_await (*_reader)();
+        auto mf2 = co_await (*_reader)();
+        auto mf3 = co_await (*_reader)();
+        examine_first_fragments(mf1, mf2, mf3);
+        if (mf3) {
+            _reader->unpop_mutation_fragment(std::move(*mf3));
+        }
+        if (mf2) {
+            _reader->unpop_mutation_fragment(std::move(*mf2));
+        }
+        if (mf1) {
+            _reader->unpop_mutation_fragment(std::move(*mf1));
+        }
+        _reader_recreated = false;
+    } else {
+        co_await _reader->fill_buffer();
+    }
+
    _reader->move_buffer_content_to(*this);
+
+    // Ensure that each buffer represents forward progress. Only a concern when
+    // the last fragment in the buffer is range tombstone change. In this case
+    // ensure that:
+    // * buffer().back().position() > _next_position_in_partition;
+    // * _reader.peek()->position() > buffer().back().position();
+    if (!is_buffer_empty() && buffer().back().is_range_tombstone_change()) {
+        auto* next_mf = co_await _reader->peek();
+
+        // First make sure we've made progress w.r.t. _next_position_in_partition.
+        // This loop becomes inifinite when next pos is a partition start.
+        // In that case progress is guranteed anyway, so skip this loop entirely.
+        while (!_next_position_in_partition.is_partition_start() && next_mf && _tri_cmp(_next_position_in_partition, buffer().back().position()) <= 0) {
+            push_mutation_fragment(_reader->pop_mutation_fragment());
+            next_mf = co_await _reader->peek();
+        }
+
+        const auto last_pos = position_in_partition(buffer().back().position());
+        while (next_mf && _tri_cmp(last_pos, next_mf->position()) == 0) {
+            push_mutation_fragment(_reader->pop_mutation_fragment());
+            next_mf = co_await _reader->peek();
+        }
+    }
+
    update_next_position();
-    _end_of_stream = _reader->is_end_of_stream() && _reader->is_buffer_empty();
+    _end_of_stream = _reader->is_end_of_stream();
    maybe_pause(std::move(*_reader));
 }

--- a/partition_snapshot_reader.hh
+++ b/partition_snapshot_reader.hh
@@ -292,14 +292,23 @@ class partition_snapshot_flat_reader : public flat_mutation_reader::impl, public
                const std::optional<position_in_partition>& last_row,
                const std::optional<position_in_partition>& last_rts,
                position_in_partition_view pos) {
-            if (!_rt_stream.empty()) {
-                return _rt_stream.get_next(std::move(pos));
-            }
            return in_alloc_section([&] () -> mutation_fragment_opt {
                maybe_refresh_state(ck_range_snapshot, last_row, last_rts);

                position_in_partition::less_compare rt_less(_query_schema);

+                // The while below moves range tombstones from partition versions
+                // into _rt_stream, just enough to produce the next range tombstone
+                // The main goal behind moving to _rt_stream is to deoverlap range tombstones
+                // which have the same starting position. This is not in order to satisfy
+                // flat_mutation_reader stream requirements, the reader can emit range tombstones
+                // which have the same position incrementally. This is to guarantee forward
+                // progress in the case iterators get invalidated and maybe_refresh_state()
+                // above needs to restore them. It does so using last_rts, which tracks
+                // the position of the last emitted range tombstone. All range tombstones
+                // with positions <= than last_rts are skipped on refresh. To make progress,
+                // we need to make sure that all range tombstones with duplicated positions
+                // are emitted before maybe_refresh_state().
                while (has_more_range_tombstones()
                        && !rt_less(pos, peek_range_tombstone().position())
                        && (_rt_stream.empty() || !rt_less(_rt_stream.peek_next().position(), peek_range_tombstone().position()))) {
--- a/partition_snapshot_row_cursor.hh
+++ b/partition_snapshot_row_cursor.hh
@@ -444,7 +444,7 @@ public:
    // When throws, the cursor is invalidated and its position is not changed.
    bool advance_to(position_in_partition_view lower_bound) {
        maybe_advance_to(lower_bound);
-        return no_clustering_row_between(_schema, lower_bound, position());
+        return no_clustering_row_between_weak(_schema, lower_bound, position());
    }

    // Call only when valid.
--- a/position_in_partition.hh
+++ b/position_in_partition.hh
@@ -567,6 +567,20 @@ bool no_clustering_row_between(const schema& s, position_in_partition_view a, po
    }
 }

+// Returns true if and only if there can't be any clustering_row with position >= a and < b.
+// It is assumed that a <= b.
+inline
+bool no_clustering_row_between_weak(const schema& s, position_in_partition_view a, position_in_partition_view b) {
+    clustering_key_prefix::equality eq(s);
+    if (a.has_key() && b.has_key()) {
+        return eq(a.key(), b.key())
+               && (a.get_bound_weight() == bound_weight::after_all_prefixed
+                   || b.get_bound_weight() != bound_weight::after_all_prefixed);
+    } else {
+        return !a.has_key() && !b.has_key();
+    }
+}
+
 // Includes all position_in_partition objects "p" for which: start <= p < end
 // And only those.
 class position_range {
--- a/Show More
+++ b/Show More