release: prepare for 5.0.12

db/view/view_update_check: check_needs_view_update_path(): filter out non-member hosts
We currently don't clean up the system_distributed.view_build_status table after removed nodes. This can cause false-positive check for whether view update generation is needed for streaming. The proper fix is to clean up this table, but that will be more involved, it even when done, it might not be immediate. So until then and to be on the safe side, filter out entries belonging to unknown hosts from said table. Fixes: #11905 Refs: #11836 Closes #11860 (cherry picked from commit 84a69b6adb)
2023-04-10 15:58:57 +03:00 · 2023-03-22 09:14:12 +02:00 · 2023-03-22 08:26:32 +02:00 · 2023-03-21 17:11:00 +01:00 · 2023-03-21 17:54:56 +02:00 · 2023-03-17 14:15:17 +03:00
207 changed files with 5103 additions and 1186 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/2
+++ b/2
@@ -60,7 +60,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=5.0.dev
+VERSION=5.0.12

 if test -f version
 then
--- a/alternator/controller.cc
+++ b/alternator/controller.cc
@@ -78,6 +78,11 @@ future<> controller::start_server() {

        _executor.start(std::ref(_gossiper), std::ref(_proxy), std::ref(_mm), std::ref(_sys_dist_ks), sharded_parameter(get_cdc_metadata, std::ref(_cdc_gen_svc)), _ssg.value()).get();
        _server.start(std::ref(_executor), std::ref(_proxy), std::ref(_gossiper)).get();
+        // Note: from this point on, if start_server() throws for any reason,
+        // it must first call stop_server() to stop the executor and server
+        // services we just started - or Scylla will cause an assertion
+        // failure when the controller object is destroyed in the exception
+        // unwinding.
        std::optional<uint16_t> alternator_port;
        if (_config.alternator_port()) {
            alternator_port = _config.alternator_port();
@@ -104,7 +109,13 @@ future<> controller::start_server() {
            }
            opts.erase("require_client_auth");
            opts.erase("truststore");
-            utils::configure_tls_creds_builder(creds.value(), std::move(opts)).get();
+            try {
+                utils::configure_tls_creds_builder(creds.value(), std::move(opts)).get();
+            } catch(...) {
+                logger.error("Failed to set up Alternator TLS credentials: {}", std::current_exception());
+                stop_server().get();
+                std::throw_with_nested(std::runtime_error("Failed to set up Alternator TLS credentials"));
+            }
        }
        bool alternator_enforce_authorization = _config.alternator_enforce_authorization();
        _server.invoke_on_all(
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -34,6 +34,7 @@
 #include "expressions.hh"
 #include "conditions.hh"
 #include "cql3/constants.hh"
+#include "cql3/util.hh"
 #include <optional>
 #include "utils/overloaded_functor.hh"
 #include "seastar/json/json_elements.hh"
@@ -46,6 +47,7 @@
 #include <seastar/core/coroutine.hh>
 #include <boost/range/adaptors.hpp>
 #include <boost/range/algorithm/find_end.hpp>
+#include <unordered_set>
 #include "service/storage_proxy.hh"
 #include "gms/gossiper.hh"
 #include "schema_registry.hh"
@@ -148,16 +150,16 @@ static void validate_table_name(const std::string& name) {
 // instead of each component individually as DynamoDB does.
 // The view_name() function assumes the table_name has already been validated
 // but validates the legality of index_name and the combination of both.
-static std::string view_name(const std::string& table_name, const std::string& index_name, const std::string& delim = ":") {
+static std::string view_name(const std::string& table_name, std::string_view index_name, const std::string& delim = ":") {
    static const std::regex valid_index_name_chars ("[a-zA-Z0-9_.-]*");
    if (index_name.length() < 3) {
        throw api_error::validation("IndexName must be at least 3 characters long");
    }
-    if (!std::regex_match(index_name.c_str(), valid_index_name_chars)) {
+    if (!std::regex_match(index_name.data(), valid_index_name_chars)) {
        throw api_error::validation(
                format("IndexName '{}' must satisfy regular expression pattern: [a-zA-Z0-9_.-]+", index_name));
    }
-    std::string ret = table_name + delim + index_name;
+    std::string ret = table_name + delim + std::string(index_name);
    if (ret.length() > max_table_name_length) {
        throw api_error::validation(
                format("The total length of TableName ('{}') and IndexName ('{}') cannot exceed {} characters",
@@ -166,7 +168,7 @@ static std::string view_name(const std::string& table_name, const std::string& i
    return ret;
 }

-static std::string lsi_name(const std::string& table_name, const std::string& index_name) {
+static std::string lsi_name(const std::string& table_name, std::string_view index_name) {
    return view_name(table_name, index_name, "!:");
 }

@@ -273,16 +275,16 @@ get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) {
    if (index_name) {
        if (index_name->IsString()) {
            orig_table_name = std::move(table_name);
-            table_name = view_name(orig_table_name, index_name->GetString());
+            table_name = view_name(orig_table_name, rjson::to_string_view(*index_name));
            type = table_or_view_type::gsi;
        } else {
            throw api_error::validation(
-                    format("Non-string IndexName '{}'", index_name->GetString()));
+                    format("Non-string IndexName '{}'", rjson::to_string_view(*index_name)));
        }
        // If no tables for global indexes were found, the index may be local
        if (!proxy.data_dictionary().has_schema(keyspace_name, table_name)) {
            type = table_or_view_type::lsi;
-            table_name = lsi_name(orig_table_name, index_name->GetString());
+            table_name = lsi_name(orig_table_name, rjson::to_string_view(*index_name));
        }
    }

@@ -432,6 +434,11 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
    rjson::add(table_description, "BillingModeSummary", rjson::empty_object());
    rjson::add(table_description["BillingModeSummary"], "BillingMode", "PAY_PER_REQUEST");
    rjson::add(table_description["BillingModeSummary"], "LastUpdateToPayPerRequestDateTime", rjson::value(creation_date_seconds));
+    // In PAY_PER_REQUEST billing mode, provisioned capacity should return 0
+    rjson::add(table_description, "ProvisionedThroughput", rjson::empty_object());
+    rjson::add(table_description["ProvisionedThroughput"], "ReadCapacityUnits", 0);
+    rjson::add(table_description["ProvisionedThroughput"], "WriteCapacityUnits", 0);
+    rjson::add(table_description["ProvisionedThroughput"], "NumberOfDecreasesToday", 0);

    std::unordered_map<std::string,std::string> key_attribute_types;
    // Add base table's KeySchema and collect types for AttributeDefinitions:
@@ -453,6 +460,11 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
            rjson::add(view_entry, "IndexName", rjson::from_string(index_name));
            // Add indexes's KeySchema and collect types for AttributeDefinitions:
            describe_key_schema(view_entry, *vptr, key_attribute_types);
+            // Add projection type
+            rjson::value projection = rjson::empty_object();
+            rjson::add(projection, "ProjectionType", "ALL");
+            // FIXME: we have to get ProjectionType from the schema when it is added
+            rjson::add(view_entry, "Projection", std::move(projection));
            // Local secondary indexes are marked by an extra '!' sign occurring before the ':' delimiter
            rjson::value& index_array = (delim_it > 1 && cf_name[delim_it-1] == '!') ? lsi_array : gsi_array;
            rjson::push_back(index_array, std::move(view_entry));
@@ -884,17 +896,23 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
    const rjson::value* gsi = rjson::find(request, "GlobalSecondaryIndexes");
    std::vector<schema_builder> view_builders;
    std::vector<sstring> where_clauses;
+    std::unordered_set<std::string> index_names;
    if (gsi) {
        if (!gsi->IsArray()) {
            co_return api_error::validation("GlobalSecondaryIndexes must be an array.");
        }
        for (const rjson::value& g : gsi->GetArray()) {
-            const rjson::value* index_name = rjson::find(g, "IndexName");
-            if (!index_name || !index_name->IsString()) {
+            const rjson::value* index_name_v = rjson::find(g, "IndexName");
+            if (!index_name_v || !index_name_v->IsString()) {
                co_return api_error::validation("GlobalSecondaryIndexes IndexName must be a string.");
            }
-            std::string vname(view_name(table_name, index_name->GetString()));
-            elogger.trace("Adding GSI {}", index_name->GetString());
+            std::string_view index_name = rjson::to_string_view(*index_name_v);
+            auto [it, added] = index_names.emplace(index_name);
+            if (!added) {
+                co_return api_error::validation(format("Duplicate IndexName '{}', ", index_name));
+            }
+            std::string vname(view_name(table_name, index_name));
+            elogger.trace("Adding GSI {}", index_name);
            // FIXME: read and handle "Projection" parameter. This will
            // require the MV code to copy just parts of the attrs map.
            schema_builder view_builder(keyspace_name, vname);
@@ -927,9 +945,10 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
            if  (!range_key.empty() && range_key != view_hash_key && range_key != view_range_key) {
                add_column(view_builder, range_key, attribute_definitions, column_kind::clustering_key);
            }
-            sstring where_clause = "\"" + view_hash_key + "\" IS NOT NULL";
+            sstring where_clause = format("{} IS NOT NULL", cql3::util::maybe_quote(view_hash_key));
            if (!view_range_key.empty()) {
-                where_clause = where_clause + " AND \"" + view_hash_key + "\" IS NOT NULL";
+                where_clause = format("{} AND {} IS NOT NULL", where_clause,
+                    cql3::util::maybe_quote(view_range_key));
            }
            where_clauses.push_back(std::move(where_clause));
            view_builders.emplace_back(std::move(view_builder));
@@ -942,12 +961,17 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
            throw api_error::validation("LocalSecondaryIndexes must be an array.");
        }
        for (const rjson::value& l : lsi->GetArray()) {
-            const rjson::value* index_name = rjson::find(l, "IndexName");
-            if (!index_name || !index_name->IsString()) {
+            const rjson::value* index_name_v = rjson::find(l, "IndexName");
+            if (!index_name_v || !index_name_v->IsString()) {
                throw api_error::validation("LocalSecondaryIndexes IndexName must be a string.");
            }
-            std::string vname(lsi_name(table_name, index_name->GetString()));
-            elogger.trace("Adding LSI {}", index_name->GetString());
+            std::string_view index_name = rjson::to_string_view(*index_name_v);
+            auto [it, added] = index_names.emplace(index_name);
+            if (!added) {
+                co_return api_error::validation(format("Duplicate IndexName '{}', ", index_name));
+            }
+            std::string vname(lsi_name(table_name, index_name));
+            elogger.trace("Adding LSI {}", index_name);
            if (range_key.empty()) {
                co_return api_error::validation("LocalSecondaryIndex requires that the base table have a range key");
            }
@@ -979,9 +1003,10 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
            // Note above we don't need to add virtual columns, as all
            // base columns were copied to view. TODO: reconsider the need
            // for virtual columns when we support Projection.
-            sstring where_clause = "\"" + view_hash_key + "\" IS NOT NULL";
+            sstring where_clause = format("{} IS NOT NULL", cql3::util::maybe_quote(view_hash_key));
            if (!view_range_key.empty()) {
-                where_clause = where_clause + " AND \"" + view_range_key + "\" IS NOT NULL";
+                where_clause = format("{} AND {} IS NOT NULL", where_clause,
+                    cql3::util::maybe_quote(view_range_key));
            }
            where_clauses.push_back(std::move(where_clause));
            view_builders.emplace_back(std::move(view_builder));
@@ -2173,6 +2198,9 @@ static attrs_to_get calculate_attrs_to_get(const rjson::value& req, std::unorder
        for (auto it = attributes_to_get.Begin(); it != attributes_to_get.End(); ++it) {
            attribute_path_map_add("AttributesToGet", ret, it->GetString());
        }
+        if (ret.empty()) {
+            throw api_error::validation("Empty AttributesToGet is not allowed. Consider using Select=COUNT instead.");
+        }
        return ret;
    } else if (has_projection_expression) {
        const rjson::value& projection_expression = req["ProjectionExpression"];
@@ -2577,8 +2605,8 @@ static bool hierarchy_actions(
                        // attr member so we can use add()
                        rjson::add_with_string_name(v, attr, std::move(*newv));
                    } else {
-                        throw api_error::validation(format("Can't remove document path {} - not present in item",
-                            subh.get_value()._path));
+                        // Removing a.b when a is a map but a.b doesn't exist
+                        // is silently ignored. It's not considered an error.
                    }
                } else {
                    throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
--- a/alternator/streams.cc
+++ b/alternator/streams.cc
@@ -143,19 +143,24 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
    auto table = find_table(_proxy, request);
    auto db = _proxy.data_dictionary();
    auto cfs = db.get_tables();
-    auto i = cfs.begin();
-    auto e = cfs.end();

    if (limit < 1) {
        throw api_error::validation("Limit must be 1 or more");
    }

-    // TODO: the unordered_map here is not really well suited for partial
-    // querying - we're sorting on local hash order, and creating a table
-    // between queries may or may not miss info. But that should be rare,
-    // and we can probably expect this to be a single call.
+    // # 12601 (maybe?) - sort the set of tables on ID. This should ensure we never
+    // generate duplicates in a paged listing here. Can obviously miss things if they 
+    // are added between paged calls and end up with a "smaller" UUID/ARN, but that 
+    // is to be expected.
+    std::sort(cfs.begin(), cfs.end(), [](const data_dictionary::table& t1, const data_dictionary::table& t2) {
+        return t1.schema()->id() < t2.schema()->id();
+    });
+
+    auto i = cfs.begin();
+    auto e = cfs.end();
+
    if (streams_start) {
-        i = std::find_if(i, e, [&](data_dictionary::table t) {
+        i = std::find_if(i, e, [&](const data_dictionary::table& t) {
            return t.schema()->id() == streams_start 
                && cdc::get_base_table(db.real_database(), *t.schema())
                && is_alternator_keyspace(t.schema()->ks_name())
--- a/alternator/ttl.cc
+++ b/alternator/ttl.cc
@@ -116,9 +116,6 @@ future<executor::request_return_type> executor::update_time_to_live(client_state

 future<executor::request_return_type> executor::describe_time_to_live(client_state& client_state, service_permit permit, rjson::value request) {
    _stats.api_operations.describe_time_to_live++;
-    if (!_proxy.data_dictionary().features().cluster_supports_alternator_ttl()) {
-        co_return api_error::unknown_operation("DescribeTimeToLive not yet supported. Experimental support is available if the 'alternator_ttl' experimental feature is enabled on all nodes.");
-    }
    schema_ptr schema = get_table(_proxy, request);
    std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
    rjson::value desc = rjson::empty_object();
--- a/alternator/ttl.hh
+++ b/alternator/ttl.hh
@@ -12,6 +12,7 @@
 #include <seastar/core/sharded.hh>
 #include <seastar/core/abort_source.hh>
 #include <seastar/core/semaphore.hh>
+#include "data_dictionary/data_dictionary.hh"

 namespace replica {
 class database;
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -624,7 +624,7 @@
                  },
                  {
                     "name":"kn",
-                     "description":"Comma seperated keyspaces name to snapshot",
+                     "description":"Keyspace(s) to snapshot. Multiple keyspaces can be provided using a comma-separated list. If omitted, snapshot all keyspaces.",
                     "required":false,
                     "allowMultiple":false,
                     "type":"string",
@@ -632,7 +632,7 @@
                  },
                  {
                     "name":"cf",
-                     "description":"the column family to snapshot",
+                     "description":"Table(s) to snapshot. Multiple tables (in a single keyspace) can be provided using a comma-separated list. If omitted, snapshot all tables in the given keyspace(s).",
                     "required":false,
                     "allowMultiple":false,
                     "type":"string",
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -593,6 +593,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        if (column_families.empty()) {
            column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
        }
+        apilog.debug("force_keyspace_compaction: keyspace={} tables={}", keyspace, column_families);
        return ctx.db.invoke_on_all([keyspace, column_families] (replica::database& db) -> future<> {
            auto table_ids = boost::copy_range<std::vector<utils::UUID>>(column_families | boost::adaptors::transformed([&] (auto& cf_name) {
                return db.find_uuid(keyspace, cf_name);
@@ -617,6 +618,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        if (column_families.empty()) {
            column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
        }
+        apilog.info("force_keyspace_cleanup: keyspace={} tables={}", keyspace, column_families);
        return ss.local().is_cleanup_allowed(keyspace).then([&ctx, keyspace,
                column_families = std::move(column_families)] (bool is_cleanup_allowed) mutable {
            if (!is_cleanup_allowed) {
@@ -635,7 +637,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
                // as a table can be dropped during loop below, let's find it before issuing the cleanup request.
                for (auto& id : table_ids) {
                    replica::table& t = db.find_column_family(id);
-                    co_await cm.perform_cleanup(db, &t);
+                    co_await t.perform_cleanup_compaction(db);
                }
                co_return;
            }).then([]{
@@ -645,6 +647,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    });

    ss::perform_keyspace_offstrategy_compaction.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> tables) -> future<json::json_return_type> {
+        apilog.info("perform_keyspace_offstrategy_compaction: keyspace={} tables={}", keyspace, tables);
        co_return co_await ctx.db.map_reduce0([&keyspace, &tables] (replica::database& db) -> future<bool> {
            bool needed = false;
            for (const auto& table : tables) {
@@ -658,6 +661,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    ss::upgrade_sstables.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
        bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);

+        apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, column_families, exclude_current_version);
        return ctx.db.invoke_on_all([=] (replica::database& db) {
            return do_for_each(column_families, [=, &db](sstring cfname) {
                auto& cm = db.get_compaction_manager();
@@ -669,23 +673,22 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        });
    }));

-    ss::force_keyspace_flush.set(r, [&ctx](std::unique_ptr<request> req) {
+    ss::force_keyspace_flush.set(r, [&ctx](std::unique_ptr<request> req) -> future<json::json_return_type> {
        auto keyspace = validate_keyspace(ctx, req->param);
        auto column_families = parse_tables(keyspace, ctx, req->query_parameters, "cf");
+        apilog.info("perform_keyspace_flush: keyspace={} tables={}", keyspace, column_families);
+        auto &db = ctx.db.local();
        if (column_families.empty()) {
-            column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
+            co_await db.flush_on_all(keyspace);
+        } else {
+            co_await db.flush_on_all(keyspace, std::move(column_families));
        }
-        return ctx.db.invoke_on_all([keyspace, column_families] (replica::database& db) {
-            return parallel_for_each(column_families, [&db, keyspace](const sstring& cf) mutable {
-                return db.find_column_family(keyspace, cf).flush();
-            });
-        }).then([]{
-                return make_ready_future<json::json_return_type>(json_void());
-        });
+        co_return json_void();
    });


    ss::decommission.set(r, [&ss](std::unique_ptr<request> req) {
+        apilog.info("decommission");
        return ss.local().decommission().then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -701,6 +704,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    ss::remove_node.set(r, [&ss](std::unique_ptr<request> req) {
        auto host_id = req->get_query_param("host_id");
        std::vector<sstring> ignore_nodes_strs= split(req->get_query_param("ignore_nodes"), ",");
+        apilog.info("remove_node: host_id={} ignore_nodes={}", host_id, ignore_nodes_strs);
        auto ignore_nodes = std::list<gms::inet_address>();
        for (std::string n : ignore_nodes_strs) {
            try {
@@ -773,6 +777,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    });

    ss::drain.set(r, [&ss](std::unique_ptr<request> req) {
+        apilog.info("drain");
        return ss.local().drain().then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -805,12 +810,14 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
    });

    ss::stop_gossiping.set(r, [&ss](std::unique_ptr<request> req) {
+        apilog.info("stop_gossiping");
        return ss.local().stop_gossiping().then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });

    ss::start_gossiping.set(r, [&ss](std::unique_ptr<request> req) {
+        apilog.info("start_gossiping");
        return ss.local().start_gossiping().then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -907,6 +914,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_

    ss::rebuild.set(r, [&ss](std::unique_ptr<request> req) {
        auto source_dc = req->get_query_param("source_dc");
+        apilog.info("rebuild: source_dc={}", source_dc);
        return ss.local().rebuild(std::move(source_dc)).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -943,6 +951,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        // FIXME: We should truncate schema tables if more than one node in the cluster.
        auto& sp = service::get_storage_proxy();
        auto& fs = sp.local().features();
+        apilog.info("reset_local_schema");
        return db::schema_tables::recalculate_schema_version(sp, fs).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
@@ -950,6 +959,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_

    ss::set_trace_probability.set(r, [](std::unique_ptr<request> req) {
        auto probability = req->get_query_param("probability");
+        apilog.info("set_trace_probability: probability={}", probability);
        return futurize_invoke([probability] {
            double real_prob = std::stod(probability.c_str());
            return tracing::tracing::tracing_instance().invoke_on_all([real_prob] (auto& local_tracing) {
@@ -987,6 +997,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        auto ttl = req->get_query_param("ttl");
        auto threshold = req->get_query_param("threshold");
        auto fast = req->get_query_param("fast");
+        apilog.info("set_slow_query: enable={} ttl={} threshold={} fast={}", enable, ttl, threshold, fast);
        try {
            return tracing::tracing::tracing_instance().invoke_on_all([enable, ttl, threshold, fast] (auto& local_tracing) {
                if (threshold != "") {
@@ -1013,6 +1024,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        auto keyspace = validate_keyspace(ctx, req->param);
        auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");

+        apilog.info("enable_auto_compaction: keyspace={} tables={}", keyspace, tables);
        return set_tables_autocompaction(ctx, ss.local(), keyspace, tables, true);
    });

@@ -1020,6 +1032,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        auto keyspace = validate_keyspace(ctx, req->param);
        auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");

+        apilog.info("disable_auto_compaction: keyspace={} tables={}", keyspace, tables);
        return set_tables_autocompaction(ctx, ss.local(), keyspace, tables, false);
    });

@@ -1284,40 +1297,46 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
        });
    });

-    ss::take_snapshot.set(r, [&snap_ctl](std::unique_ptr<request> req) {
-        apilog.debug("take_snapshot: {}", req->query_parameters);
+    ss::take_snapshot.set(r, [&snap_ctl](std::unique_ptr<request> req) -> future<json::json_return_type> {
+        apilog.info("take_snapshot: {}", req->query_parameters);
        auto tag = req->get_query_param("tag");
        auto column_families = split(req->get_query_param("cf"), ",");
        auto sfopt = req->get_query_param("sf");
        auto sf = db::snapshot_ctl::skip_flush(strcasecmp(sfopt.c_str(), "true") == 0);

        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
-
-        auto resp = make_ready_future<>();
-        if (column_families.empty()) {
-            resp = snap_ctl.local().take_snapshot(tag, keynames, sf);
-        } else {
-            if (keynames.empty()) {
-                throw httpd::bad_param_exception("The keyspace of column families must be specified");
+        try {
+            if (column_families.empty()) {
+                co_await snap_ctl.local().take_snapshot(tag, keynames, sf);
+            } else {
+                if (keynames.empty()) {
+                    throw httpd::bad_param_exception("The keyspace of column families must be specified");
+                }
+                if (keynames.size() > 1) {
+                    throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
+                }
+                co_await snap_ctl.local().take_column_family_snapshot(keynames[0], column_families, tag, sf);
            }
-            if (keynames.size() > 1) {
-                throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
-            }
-            resp = snap_ctl.local().take_column_family_snapshot(keynames[0], column_families, tag, sf);
+            co_return json_void();
+        } catch (...) {
+            apilog.error("take_snapshot failed: {}", std::current_exception());
+            throw;
        }
-        return resp.then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
    });

-    ss::del_snapshot.set(r, [&snap_ctl](std::unique_ptr<request> req) {
+    ss::del_snapshot.set(r, [&snap_ctl](std::unique_ptr<request> req) -> future<json::json_return_type> {
+        apilog.info("del_snapshot: {}", req->query_parameters);
        auto tag = req->get_query_param("tag");
        auto column_family = req->get_query_param("cf");

        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
-        return snap_ctl.local().clear_snapshot(tag, keynames, column_family).then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
+        try {
+            co_await snap_ctl.local().clear_snapshot(tag, keynames, column_family);
+            co_return json_void();
+        } catch (...) {
+            apilog.error("del_snapshot failed: {}", std::current_exception());
+            throw;
+        }
    });

    ss::true_snapshots_size.set(r, [&snap_ctl](std::unique_ptr<request> req) {
@@ -1354,7 +1373,7 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
        if (!req_param<bool>(*req, "disable_snapshot", false)) {
            auto tag = format("pre-scrub-{:d}", db_clock::now().time_since_epoch().count());
            f = parallel_for_each(column_families, [&snap_ctl, keyspace, tag](sstring cf) {
-                return snap_ctl.local().take_column_family_snapshot(keyspace, cf, tag);
+                return snap_ctl.local().take_column_family_snapshot(keyspace, cf, tag, db::snapshot_ctl::skip_flush::no, db::snapshot_ctl::allow_view_snapshots::yes);
            });
        }

--- a/atomic_cell.cc
+++ b/atomic_cell.cc
@@ -87,19 +87,24 @@ compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
            // prefer expiring cells.
            return left.is_live_and_has_ttl() ? std::strong_ordering::greater : std::strong_ordering::less;
        }
-        if (left.is_live_and_has_ttl() && left.expiry() != right.expiry()) {
-            return left.expiry() <=> right.expiry();
+        if (left.is_live_and_has_ttl()) {
+            if (left.expiry() != right.expiry()) {
+                return left.expiry() <=> right.expiry();
+            } else {
+                // prefer the cell that was written later,
+                // so it survives longer after it expires, until purged.
+                return right.ttl() <=> left.ttl();
+            }
        }
    } else {
        // Both are deleted
-        if (left.deletion_time() != right.deletion_time()) {
-            // Origin compares big-endian serialized deletion time. That's because it
-            // delegates to AbstractCell.reconcile() which compares values after
-            // comparing timestamps, which in case of deleted cells will hold
-            // serialized expiry.
-            return (uint64_t) left.deletion_time().time_since_epoch().count()
-                   <=> (uint64_t) right.deletion_time().time_since_epoch().count();
-        }
+
+        // Origin compares big-endian serialized deletion time. That's because it
+        // delegates to AbstractCell.reconcile() which compares values after
+        // comparing timestamps, which in case of deleted cells will hold
+        // serialized expiry.
+        return (uint64_t) left.deletion_time().time_since_epoch().count()
+                <=> (uint64_t) right.deletion_time().time_since_epoch().count();
    }
    return std::strong_ordering::equal;
 }
--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -59,7 +59,7 @@ using namespace std::chrono_literals;
 logging::logger cdc_log("cdc");

 namespace cdc {
-static schema_ptr create_log_schema(const schema&, std::optional<utils::UUID> = {});
+static schema_ptr create_log_schema(const schema&, std::optional<utils::UUID> = {}, schema_ptr = nullptr);
 }

 static constexpr auto cdc_group_name = "cdc";
@@ -206,7 +206,7 @@ public:
                return;
            }

-            auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt);
+            auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt, log_schema);

            auto log_mut = log_schema 
                ? db::schema_tables::make_update_table_mutations(db, keyspace.metadata(), log_schema, new_log_schema, timestamp, false)
@@ -484,7 +484,7 @@ bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name) {
    return to_bytes(cdc_deleted_elements_column_prefix) + column_name;
 }

-static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid) {
+static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid, schema_ptr old) {
    schema_builder b(s.ks_name(), log_name(s.cf_name()));
    b.with_partitioner("com.scylladb.dht.CDCPartitioner");
    b.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
@@ -571,6 +571,20 @@ static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID>
        b.set_uuid(*uuid);
    }

+    /**
+     * #10473 - if we are redefining the log table, we need to ensure any dropped
+     * columns are registered in "dropped_columns" table, otherwise clients will not
+     * be able to read data older than now.
+     */
+    if (old) {
+        // not super efficient, but we don't do this often.
+        for (auto& col : old->all_columns()) {
+            if (!b.has_column({col.name(), col.name_as_text() })) {
+                b.without_column(col.name_as_text(), col.type, api::new_timestamp());
+            }
+        }
+    }
+
    return b.build();
 }

--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -1281,6 +1281,13 @@ private:

            const auto& key = _validator.previous_partition_key();

+            if (_validator.current_tombstone()) {
+                throw compaction_aborted_exception(
+                        _schema->ks_name(),
+                        _schema->cf_name(),
+                        "scrub compaction cannot handle invalid fragments with an active range tombstone change");
+            }
+
            // If the unexpected fragment is a partition end, we just drop it.
            // The only case a partition end is invalid is when it comes after
            // another partition end, and we can just drop it in that case.
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -317,9 +317,9 @@ future<> compaction_manager::run_custom_job(replica::table* t, sstables::compact

    auto job_ptr = std::make_unique<noncopyable_function<future<>(sstables::compaction_data&)>>(std::move(job));

-    task->compaction_done = with_semaphore(_maintenance_ops_sem, 1, [this, task, &job = *job_ptr] () mutable {
-        // take read lock for table, so major compaction and resharding can't proceed in parallel.
-        return with_lock(task->compaction_state.lock.for_read(), [this, task, &job] () mutable {
+    task->compaction_done = with_semaphore(_custom_jobs_sem, 1, [this, task, &job = *job_ptr] () mutable {
+            // We don't need to take task->compaction_state.lock.for_read() as it only serializes minor and major
+
            // Allow caller to know that task (e.g. reshape) was asked to stop while waiting for a chance to run.
            if (task->stopping) {
                throw sstables::compaction_stopped_exception(task->compacting_table->schema()->ks_name(), task->compacting_table->schema()->cf_name(),
@@ -335,7 +335,6 @@ future<> compaction_manager::run_custom_job(replica::table* t, sstables::compact
            // no need to register shared sstables because they're excluded from non-resharding
            // compaction and some of them may not even belong to current shard.
            return job(task->compaction_data);
-        });
    }).then_wrapped([this, task, job_ptr = std::move(job_ptr), type] (future<> f) {
        _stats.active_tasks--;
        _tasks.remove(task);
@@ -353,32 +352,50 @@ future<> compaction_manager::run_custom_job(replica::table* t, sstables::compact
    return task->compaction_done.get_future().then([task] {});
 }

+compaction_manager::compaction_reenabler::compaction_reenabler(compaction_manager& cm, replica::table* t)
+    : _cm(cm)
+    , _table(t)
+    , _compaction_state(cm.get_compaction_state(_table))
+    , _holder(_compaction_state.gate.hold())
+{
+    _compaction_state.compaction_disabled_counter++;
+    cmlog.debug("Temporarily disabled compaction for {}.{}. compaction_disabled_counter={}",
+            _table->schema()->ks_name(), _table->schema()->cf_name(), _compaction_state.compaction_disabled_counter);
+}
+
+compaction_manager::compaction_reenabler::compaction_reenabler(compaction_reenabler&& o) noexcept
+    : _cm(o._cm)
+    , _table(std::exchange(o._table, nullptr))
+    , _compaction_state(o._compaction_state)
+    , _holder(std::move(o._holder))
+{}
+
+compaction_manager::compaction_reenabler::~compaction_reenabler() {
+    // submit compaction request if we're the last holder of the gate which is still opened.
+    if (_table && --_compaction_state.compaction_disabled_counter == 0 && !_compaction_state.gate.is_closed()) {
+        cmlog.debug("Reenabling compaction for {}.{}",
+                _table->schema()->ks_name(), _table->schema()->cf_name());
+        try {
+            _cm.submit(_table);
+        } catch (...) {
+            cmlog.warn("compaction_reenabler could not reenable compaction for {}.{}: {}",
+                    _table->schema()->ks_name(), _table->schema()->cf_name(), std::current_exception());
+        }
+    }
+}
+
+future<compaction_manager::compaction_reenabler>
+compaction_manager::stop_and_disable_compaction(replica::table* t) {
+    compaction_reenabler cre(*this, t);
+    co_await stop_ongoing_compactions("user-triggered operation", t);
+    co_return cre;
+}
+
 future<>
 compaction_manager::run_with_compaction_disabled(replica::table* t, std::function<future<> ()> func) {
-    auto& c_state = _compaction_state[t];
-    auto holder = c_state.gate.hold();
+    compaction_reenabler cre = co_await stop_and_disable_compaction(t);

-    c_state.compaction_disabled_counter++;
-
-    std::exception_ptr err;
-    try {
-        co_await stop_ongoing_compactions("user-triggered operation", t);
-        co_await func();
-    } catch (...) {
-        err = std::current_exception();
-    }
-
-#ifdef DEBUG
-    assert(_compaction_state.contains(t));
-#endif
-    // submit compaction request if we're the last holder of the gate which is still opened.
-    if (--c_state.compaction_disabled_counter == 0 && !c_state.gate.is_closed()) {
-        submit(t);
-    }
-    if (err) {
-        std::rethrow_exception(err);
-    }
-    co_return;
+    co_await func();
 }

 void compaction_manager::task::setup_new_compaction() {
@@ -584,16 +601,11 @@ future<> compaction_manager::stop() {
    }
 }

-void compaction_manager::really_do_stop() {
-    if (_state == state::none || _state == state::stopped) {
-        return;
-    }
-
-    _state = state::stopped;
+future<> compaction_manager::really_do_stop() {
    cmlog.info("Asked to stop");
    // Reset the metrics registry
    _metrics.clear();
-    _stop_future.emplace(stop_ongoing_compactions("shutdown").then([this] () mutable {
+    return stop_ongoing_compactions("shutdown").then([this] () mutable {
        reevaluate_postponed_compactions();
        return std::move(_waiting_reevalution);
    }).then([this] {
@@ -601,12 +613,34 @@ void compaction_manager::really_do_stop() {
        _compaction_submission_timer.cancel();
        cmlog.info("Stopped");
        return _compaction_controller.shutdown();
-    }));
+    });
+}
+
+template <typename Ex>
+requires std::is_base_of_v<std::exception, Ex> &&
+requires (const Ex& ex) {
+    { ex.code() } noexcept -> std::same_as<const std::error_code&>;
+}
+auto swallow_enospc(const Ex& ex) noexcept {
+    if (ex.code().value() != ENOSPC) {
+        return make_exception_future<>(std::make_exception_ptr(ex));
+    }
+
+    cmlog.warn("Got ENOSPC on stop, ignoring...");
+    return make_ready_future<>();
 }

 void compaction_manager::do_stop() noexcept {
+    if (_state == state::none || _state == state::stopped) {
+        return;
+    }
+
    try {
-        really_do_stop();
+        _state = state::stopped;
+        _stop_future = really_do_stop()
+            .handle_exception_type([] (const std::system_error& ex) { return swallow_enospc(ex); })
+            .handle_exception_type([] (const storage_io_error& ex) { return swallow_enospc(ex); })
+        ;
    } catch (...) {
        try {
            cmlog.error("Failed to stop the manager: {}", std::current_exception());
@@ -742,6 +776,7 @@ future<> compaction_manager::perform_offstrategy(replica::table* t) {
                _stats.active_tasks++;
                task->setup_new_compaction();

+              return with_scheduling_group(_maintenance_sg.cpu, [this, task, t] {
                return t->run_offstrategy_compaction(task->compaction_data).then_wrapped([this, task, schema = t->schema()] (future<> f) mutable {
                    _stats.active_tasks--;
                    task->finish_compaction();
@@ -763,6 +798,7 @@ future<> compaction_manager::perform_offstrategy(replica::table* t) {
                    }
                    return make_ready_future<stop_iteration>(stop_iteration::yes);
                });
+              });
            });
        });
    }).finally([this, task] {
@@ -810,7 +846,8 @@ future<> compaction_manager::rewrite_sstables(replica::table* t, sstables::compa
            auto sstable_level = sst->get_sstable_level();
            auto run_identifier = sst->run_identifier();
            auto sstable_set_snapshot = can_purge ? std::make_optional(t.get_sstable_set()) : std::nullopt;
-            auto descriptor = sstables::compaction_descriptor({ sst }, std::move(sstable_set_snapshot), _maintenance_sg.io,
+            // FIXME: this compaction should run with maintenance priority.
+            auto descriptor = sstables::compaction_descriptor({ sst }, std::move(sstable_set_snapshot), service::get_local_compaction_priority(),
                sstable_level, sstables::compaction_descriptor::default_max_sstable_bytes, run_identifier, options);

            // Releases reference to cleaned sstable such that respective used disk space can be freed.
@@ -819,8 +856,9 @@ future<> compaction_manager::rewrite_sstables(replica::table* t, sstables::compa
            };

            auto maintenance_permit = co_await seastar::get_units(_maintenance_ops_sem, 1);
-            // Take write lock for table to serialize cleanup/upgrade sstables/scrub with major compaction/reshape/reshard.
-            auto write_lock_holder = co_await _compaction_state[&t].lock.hold_write_lock();
+            // FIXME: acquiring the read lock is not needed after acquiring the _maintenance_ops_sem
+            // only major compaction needs to acquire the write lock to synchronize with regular compaction.
+            auto lock_holder = co_await _compaction_state[&t].lock.hold_read_lock();

            _stats.pending_tasks--;
            _stats.active_tasks++;
@@ -852,7 +890,7 @@ future<> compaction_manager::rewrite_sstables(replica::table* t, sstables::compa
            };

            compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_compaction_controller.backlog_of_shares(200), _available_memory));
-            completed = co_await with_scheduling_group(_maintenance_sg.cpu, std::ref(perform_rewrite));
+            completed = co_await with_scheduling_group(_compaction_controller.sg(), std::ref(perform_rewrite));
        } while (!completed);
    };

--- a/compaction/compaction_manager.hh
+++ b/compaction/compaction_manager.hh
@@ -147,6 +147,8 @@ private:
    // If the operation must be serialized with regular, then the per-table write lock must be taken.
    seastar::named_semaphore _maintenance_ops_sem = {1, named_semaphore_exception_factory{"maintenance operation"}};

+    seastar::named_semaphore _custom_jobs_sem = {1, named_semaphore_exception_factory{"custom jobs"}};
+
    std::function<void()> compaction_submission_callback();
    // all registered tables are reevaluated at a constant interval.
    // Submission is a NO-OP when there's nothing to do, so it's fine to call it regularly.
@@ -233,7 +235,7 @@ public:

    // Stop all fibers, without waiting. Safe to be called multiple times.
    void do_stop() noexcept;
-    void really_do_stop();
+    future<> really_do_stop();

    // Submit a table to be compacted.
    void submit(replica::table* t);
@@ -269,6 +271,31 @@ public:
    // parameter job is a function that will carry the operation
    future<> run_custom_job(replica::table* t, sstables::compaction_type type, noncopyable_function<future<>(sstables::compaction_data&)> job);

+    class compaction_reenabler {
+        compaction_manager& _cm;
+        replica::table* _table;
+        compaction_state& _compaction_state;
+        gate::holder _holder;
+
+    public:
+        compaction_reenabler(compaction_manager&, replica::table*);
+        compaction_reenabler(compaction_reenabler&&) noexcept;
+
+        ~compaction_reenabler();
+
+        replica::table* compacting_table() const noexcept {
+            return _table;
+        }
+
+        const compaction_state& compaction_state() const noexcept {
+            return _compaction_state;
+        }
+    };
+
+    // Disable compaction temporarily for a table t.
+    // Caller should call the compaction_reenabler::reenable
+    future<compaction_reenabler> stop_and_disable_compaction(replica::table* t);
+
    // Run a function with compaction temporarily disabled for a table T.
    future<> run_with_compaction_disabled(replica::table* t, std::function<future<> ()> func);

--- a/compaction/leveled_compaction_strategy.cc
+++ b/compaction/leveled_compaction_strategy.cc
@@ -69,7 +69,11 @@ compaction_descriptor leveled_compaction_strategy::get_major_compaction_job(tabl
 }

 void leveled_compaction_strategy::notify_completion(const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) {
-    if (removed.empty() || added.empty()) {
+    // All the update here is only relevant for regular compaction's round-robin picking policy, and if
+    // last_compacted_keys wasn't generated by regular, it means regular is disabled since last restart,
+    // therefore we can skip the updates here until regular runs for the first time. Once it runs,
+    // it will be able to generate last_compacted_keys correctly by looking at metadata of files.
+    if (removed.empty() || added.empty() || !_last_compacted_keys) {
        return;
    }
    auto min_level = std::numeric_limits<uint32_t>::max();
--- a/compaction/time_window_compaction_strategy.cc
+++ b/compaction/time_window_compaction_strategy.cc
@@ -217,6 +217,7 @@ time_window_compaction_strategy::get_sstables_for_compaction(table_state& table_
    auto compaction_time = gc_clock::now();

    if (candidates.empty()) {
+        _estimated_remaining_tasks = 0;
        return compaction_descriptor();
    }

--- a/configure.py
+++ b/configure.py
@@ -615,6 +615,8 @@ arg_parser.add_argument('--static-yaml-cpp', dest='staticyamlcpp', action='store
                        help='Link libyaml-cpp statically')
 arg_parser.add_argument('--tests-debuginfo', action='store', dest='tests_debuginfo', type=int, default=0,
                        help='Enable(1)/disable(0)compiler debug information generation for tests')
+arg_parser.add_argument('--perf-tests-debuginfo', action='store', dest='perf_tests_debuginfo', type=int, default=0,
+                        help='Enable(1)/disable(0)compiler debug information generation for perf tests')
 arg_parser.add_argument('--python', action='store', dest='python', default='python3',
                        help='Python3 path')
 arg_parser.add_argument('--split-dwarf', dest='split_dwarf', action='store_true', default=False,
@@ -1377,6 +1379,7 @@ linker_flags = linker_flags(compiler=args.cxx)

 dbgflag = '-g -gz' if args.debuginfo else ''
 tests_link_rule = 'link' if args.tests_debuginfo else 'link_stripped'
+perf_tests_link_rule = 'link' if args.perf_tests_debuginfo else 'link_stripped'

 # Strip if debuginfo is disabled, otherwise we end up with partial
 # debug info from the libraries we static link with
@@ -1901,7 +1904,8 @@ with open(buildfile_tmp, 'w') as f:
                    # So we strip the tests by default; The user can very
                    # quickly re-link the test unstripped by adding a "_g"
                    # to the test name, e.g., "ninja build/release/testname_g"
-                    f.write('build $builddir/{}/{}: {}.{} {} | {} {}\n'.format(mode, binary, tests_link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
+                    link_rule = perf_tests_link_rule if binary.startswith('test/perf/') else tests_link_rule
+                    f.write('build $builddir/{}/{}: {}.{} {} | {} {}\n'.format(mode, binary, link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
                    f.write('   libs = {}\n'.format(local_libs))
                    f.write('build $builddir/{}/{}_g: {}.{} {} | {} {}\n'.format(mode, binary, regular_link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
                    f.write('   libs = {}\n'.format(local_libs))
@@ -2004,7 +2008,8 @@ with open(buildfile_tmp, 'w') as f:
                f.write('build {}: cxx.{} {} || {}\n'.format(obj, mode, cc, ' '.join(serializers)))
                if cc.endswith('Parser.cpp'):
                    # Unoptimized parsers end up using huge amounts of stack space and overflowing their stack
-                    flags = '-O1'
+                    flags = '-O1' if modes[mode]['optimization-level'] in ['0', 'g', 's'] else ''
+
                    if has_sanitize_address_use_after_scope:
                        flags += ' -fno-sanitize-address-use-after-scope'
                    f.write('  obj_cxxflags = %s\n' % flags)
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -1386,7 +1386,7 @@ serviceLevelOrRoleName returns [sstring name]
 						 std::transform($name.begin(), $name.end(), $name.begin(), ::tolower); }
 | t=STRING_LITERAL     { $name = sstring($t.text); }
 | t=QUOTED_NAME        { $name = sstring($t.text); }
-| k=unreserved_keyword { $name = sstring($t.text); 
+| k=unreserved_keyword { $name = k;
 						 std::transform($name.begin(), $name.end(), $name.begin(), ::tolower);}
 | QMARK {add_recognition_error("Bind variables cannot be used for service levels or role names");}
 ;
--- a/cql3/cql3_type.cc
+++ b/cql3/cql3_type.cc
@@ -12,6 +12,7 @@

 #include "cql3_type.hh"
 #include "cql3/util.hh"
+#include "exceptions/exceptions.hh"
 #include "ut_name.hh"
 #include "data_dictionary/data_dictionary.hh"
 #include "data_dictionary/user_types_metadata.hh"
@@ -436,7 +437,20 @@ sstring maybe_quote(const sstring& identifier) {
    }

    if (!need_quotes) {
-        return identifier;
+        // A seemingly valid identifier matching [a-z][a-z0-9_]* may still
+        // need quoting if it is a CQL keyword, e.g., "to" (see issue #9450).
+        // While our parser Cql.g has different production rules for different
+        // types of identifiers (column names, table names, etc.), all of
+        // these behave identically for alphanumeric strings: they exclude
+        // many keywords but allow keywords listed as "unreserved keywords".
+        // So we can use any of them, for example cident.
+        try {
+            cql3::util::do_with_parser(identifier, std::mem_fn(&cql3_parser::CqlParser::cident));
+            return identifier;
+        } catch(exceptions::syntax_exception&) {
+            // This alphanumeric string is not a valid identifier, so fall
+            // through to have it quoted:
+        }
    }
    if (num_quotes == 0) {
        return make_sstring("\"", identifier, "\"");
--- a/cql3/cql_statement.hh
+++ b/cql3/cql_statement.hh
@@ -81,9 +81,7 @@ public:
    virtual seastar::future<seastar::shared_ptr<cql_transport::messages::result_message>>
        execute(query_processor& qp, service::query_state& state, const query_options& options) const = 0;

-    virtual bool depends_on_keyspace(const seastar::sstring& ks_name) const = 0;
-
-    virtual bool depends_on_column_family(const seastar::sstring& cf_name) const = 0;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const = 0;

    virtual seastar::shared_ptr<const metadata> get_result_metadata() const = 0;

--- a/cql3/expr/expression.cc
+++ b/cql3/expr/expression.cc
@@ -103,10 +103,50 @@ managed_bytes_opt get_value(const column_value& col, const column_value_eval_bag
        if (!col_type->is_map()) {
            throw exceptions::invalid_request_exception(format("subscripting non-map column {}", cdef->name_as_text()));
        }
-        const auto deserialized = cdef->type->deserialize(managed_bytes_view(*data.other_columns[data.sel.index_of(*cdef)]));
+        int32_t index = data.sel.index_of(*cdef);
+        if (index == -1) {
+            throw std::runtime_error(
+                    format("Column definition {} does not match any column in the query selection",
+                    cdef->name_as_text()));
+        }
+        const managed_bytes_opt& serialized = data.other_columns[index];
+        if (!serialized) {
+            // For null[i] we return null.
+            return std::nullopt;
+        }
+        const auto deserialized = cdef->type->deserialize(managed_bytes_view(*serialized));
        const auto& data_map = value_cast<map_type_impl::native_type>(deserialized);
        const auto key = evaluate(*col.sub, options);
        auto&& key_type = col_type->name_comparator();
+        if (key.is_null()) {
+            // For m[null] return null.
+            // This is different from Cassandra - which treats m[null]
+            // as an invalid request error. But m[null] -> null is more
+            // consistent with our usual null treatement (e.g., both
+            // null[2] and null < 2 return null). It will also allow us
+            // to support non-constant subscripts (e.g., m[a]) where "a"
+            // may be null in some rows and non-null in others, and it's
+            // not an error.
+            return std::nullopt;
+        }
+        if (key.is_unset_value()) {
+            // An m[?] with ? bound to UNSET_VALUE is a invalid query.
+            // We could have detected it earlier while binding, but since
+            // we currently don't, we must protect the following code
+            // which can't work with an UNSET_VALUE. Note that the
+            // placement of this check here means that in an empty table,
+            // where we never need to evaluate the filter expression, this
+            // error will not be detected.
+            throw exceptions::invalid_request_exception(
+                format("Unsupported unset map key for column {}",
+                    cdef->name_as_text()));
+        }
+        if (key.type != key_type) {
+            // This can't happen, we always verify the index type earlier.
+            throw std::logic_error(
+                format("Tried to evaluate expression with wrong type for subscript of {}",
+                    cdef->name_as_text()));
+        }
        const auto found = key.view().with_linearized([&] (bytes_view key_bv) {
            using entry = std::pair<data_value, data_value>;
            return std::find_if(data_map.cbegin(), data_map.cend(), [&] (const entry& element) {
@@ -121,8 +161,16 @@ managed_bytes_opt get_value(const column_value& col, const column_value_eval_bag
        case column_kind::clustering_key:
            return managed_bytes(data.clustering_key[cdef->id]);
        case column_kind::static_column:
-        case column_kind::regular_column:
-            return managed_bytes_opt(data.other_columns[data.sel.index_of(*cdef)]);
+            [[fallthrough]];
+        case column_kind::regular_column: {
+            int32_t index = data.sel.index_of(*cdef);
+            if (index == -1) {
+                throw std::runtime_error(
+                        format("Column definition {} does not match any column in the query selection",
+                        cdef->name_as_text()));
+            }
+            return managed_bytes_opt(data.other_columns[index]);
+        }
        default:
            throw exceptions::unsupported_operation_exception("Unknown column kind");
        }
@@ -1245,7 +1293,7 @@ expression search_and_replace(const expression& e,
                    };
                },
                [&] (const binary_operator& oper) -> expression {
-                    return binary_operator(recurse(oper.lhs), oper.op, recurse(oper.rhs));
+                    return binary_operator(recurse(oper.lhs), oper.op, recurse(oper.rhs), oper.order);
                },
                [&] (const column_mutation_attribute& cma) -> expression {
                    return column_mutation_attribute{cma.kind, recurse(cma.column)};
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -953,7 +953,7 @@ bool query_processor::migration_subscriber::should_invalidate(
        sstring ks_name,
        std::optional<sstring> cf_name,
        ::shared_ptr<cql_statement> statement) {
-    return statement->depends_on_keyspace(ks_name) && (!cf_name || statement->depends_on_column_family(*cf_name));
+    return statement->depends_on(ks_name, cf_name);
 }

 future<> query_processor::query_internal(
--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -514,7 +514,7 @@ statement_restrictions::statement_restrictions(data_dictionary::database db,
    }

    if (!_nonprimary_key_restrictions->empty()) {
-        if (_has_queriable_regular_index) {
+        if (_has_queriable_regular_index && _partition_range_is_simple) {
            _uses_secondary_indexing = true;
        } else if (!allow_filtering) {
            throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
--- a/cql3/result_set.hh
+++ b/cql3/result_set.hh
@@ -165,7 +165,7 @@ public:

    template<typename RowComparator>
    void sort(const RowComparator& cmp) {
-        std::sort(_rows.begin(), _rows.end(), std::ref(cmp));
+        std::sort(_rows.begin(), _rows.end(), cmp);
    }

    metadata& get_metadata();
--- a/cql3/selection/field_selector.hh
+++ b/cql3/selection/field_selector.hh
@@ -83,7 +83,7 @@ public:

    virtual sstring assignment_testable_source_context() const override {
        auto&& name = _type->field_name(_field);
-        auto sname = sstring(reinterpret_cast<const char*>(name.begin(), name.size()));
+        auto sname = std::string_view(reinterpret_cast<const char*>(name.data()), name.size());
        return format("{}.{}", _selected, sname);
    }

--- a/cql3/selection/selection.cc
+++ b/cql3/selection/selection.cc
@@ -422,11 +422,16 @@ bool result_set_builder::restrictions_filter::do_filter(const selection& selecti
    }

    auto clustering_columns_restrictions = _restrictions->get_clustering_columns_restrictions();
-    if (dynamic_pointer_cast<cql3::restrictions::multi_column_restriction>(clustering_columns_restrictions)) {
+    bool has_multi_col_clustering_restrictions =
+        dynamic_pointer_cast<cql3::restrictions::multi_column_restriction>(clustering_columns_restrictions) != nullptr;
+    if (has_multi_col_clustering_restrictions) {
        clustering_key_prefix ckey = clustering_key_prefix::from_exploded(clustering_key);
-        return expr::is_satisfied_by(
+        bool multi_col_clustering_satisfied = expr::is_satisfied_by(
                clustering_columns_restrictions->expression,
                partition_key, clustering_key, static_row, row, selection, _options);
+        if (!multi_col_clustering_satisfied) {
+            return false;
+        }
    }

    auto static_row_iterator = static_row.iterator();
@@ -474,6 +479,13 @@ bool result_set_builder::restrictions_filter::do_filter(const selection& selecti
            if (_skip_ck_restrictions) {
                continue;
            }
+            if (has_multi_col_clustering_restrictions) {
+                // Mixing multi column and single column restrictions on clustering
+                // key columns is forbidden.
+                // Since there are multi column restrictions we have to skip
+                // evaluating single column restrictions or we will get an error.
+                continue;
+            }
            auto clustering_key_restrictions_map = _restrictions->get_single_column_clustering_key_restrictions();
            auto restr_it = clustering_key_restrictions_map.find(cdef);
            if (restr_it == clustering_key_restrictions_map.end()) {
--- a/cql3/statements/authentication_statement.cc
+++ b/cql3/statements/authentication_statement.cc
@@ -18,13 +18,7 @@ uint32_t cql3::statements::authentication_statement::get_bound_terms() const {
    return 0;
 }

-bool cql3::statements::authentication_statement::depends_on_keyspace(
-                const sstring& ks_name) const {
-    return false;
-}
-
-bool cql3::statements::authentication_statement::depends_on_column_family(
-                const sstring& cf_name) const {
+bool cql3::statements::authentication_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
    return false;
 }

--- a/cql3/statements/authentication_statement.hh
+++ b/cql3/statements/authentication_statement.hh
@@ -27,9 +27,7 @@ public:

    uint32_t get_bound_terms() const override;

-    bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    bool depends_on_column_family(const sstring& cf_name) const override;
+    bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    future<> check_access(query_processor& qp, const service::client_state& state) const override;

--- a/cql3/statements/authorization_statement.cc
+++ b/cql3/statements/authorization_statement.cc
@@ -20,13 +20,7 @@ uint32_t cql3::statements::authorization_statement::get_bound_terms() const {
    return 0;
 }

-bool cql3::statements::authorization_statement::depends_on_keyspace(
-                const sstring& ks_name) const {
-    return false;
-}
-
-bool cql3::statements::authorization_statement::depends_on_column_family(
-                const sstring& cf_name) const {
+bool cql3::statements::authorization_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
    return false;
 }

--- a/cql3/statements/authorization_statement.hh
+++ b/cql3/statements/authorization_statement.hh
@@ -31,9 +31,7 @@ public:

    uint32_t get_bound_terms() const override;

-    bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    bool depends_on_column_family(const sstring& cf_name) const override;
+    bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    future<> check_access(query_processor& qp, const service::client_state& state) const override;

--- a/cql3/statements/batch_statement.cc
+++ b/cql3/statements/batch_statement.cc
@@ -70,14 +70,9 @@ batch_statement::batch_statement(type type_,
 {
 }

-bool batch_statement::depends_on_keyspace(const sstring& ks_name) const
+bool batch_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
 {
-    return false;
-}
-
-bool batch_statement::depends_on_column_family(const sstring& cf_name) const
-{
-    return false;
+    return boost::algorithm::any_of(_statements, [&ks_name, &cf_name] (auto&& s) { return s.statement->depends_on(ks_name, cf_name); });
 }

 uint32_t batch_statement::get_bound_terms() const
@@ -259,6 +254,10 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
    if (options.getSerialConsistency() == null)
        throw new InvalidRequestException("Invalid empty serial consistency level");
 #endif
+    for (size_t i = 0; i < _statements.size(); ++i) {
+        _statements[i].statement->validate_primary_key_restrictions(options.for_statement(i));
+    }
+
    if (_has_conditions) {
        ++_stats.cas_batches;
        _stats.statements_in_cas_batches += _statements.size();
--- a/cql3/statements/batch_statement.hh
+++ b/cql3/statements/batch_statement.hh
@@ -88,9 +88,7 @@ public:
                    std::unique_ptr<attributes> attrs,
                    cql_stats& stats);

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual uint32_t get_bound_terms() const override;

--- a/cql3/statements/cf_prop_defs.cc
+++ b/cql3/statements/cf_prop_defs.cc
@@ -20,6 +20,7 @@
 #include "gms/feature_service.hh"
 #include "tombstone_gc_extension.hh"
 #include "tombstone_gc.hh"
+#include "utils/bloom_calculations.hh"

 #include <boost/algorithm/string/predicate.hpp>

@@ -145,6 +146,16 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
        throw exceptions::configuration_exception(KW_MAX_INDEX_INTERVAL + " must be greater than " + KW_MIN_INDEX_INTERVAL);
    }

+    if (get_simple(KW_BF_FP_CHANCE)) {
+        double bloom_filter_fp_chance = get_double(KW_BF_FP_CHANCE, 0/*not used*/);
+        double min_bloom_filter_fp_chance = utils::bloom_calculations::min_supported_bloom_filter_fp_chance();
+        if (bloom_filter_fp_chance <= min_bloom_filter_fp_chance || bloom_filter_fp_chance > 1.0) {
+            throw exceptions::configuration_exception(format(
+                "{} must be larger than {} and less than or equal to 1.0 (got {})",
+                KW_BF_FP_CHANCE, min_bloom_filter_fp_chance, bloom_filter_fp_chance));
+        }
+    }
+
    speculative_retry::from_sstring(get_string(KW_SPECULATIVE_RETRY, speculative_retry(speculative_retry::type::NONE, 0).to_sstring()));
 }

--- a/cql3/statements/cf_properties.hh
+++ b/cql3/statements/cf_properties.hh
@@ -13,6 +13,7 @@

 #include "cql3/statements/cf_prop_defs.hh"
 #include "cql3/column_identifier.hh"
+#include "data_dictionary/data_dictionary.hh"

 namespace cql3 {

--- a/cql3/statements/modification_statement.cc
+++ b/cql3/statements/modification_statement.cc
@@ -110,9 +110,6 @@ future<> modification_statement::check_access(query_processor& qp, const service

 future<std::vector<mutation>>
 modification_statement::get_mutations(query_processor& qp, const query_options& options, db::timeout_clock::time_point timeout, bool local, int64_t now, service::query_state& qs) const {
-    if (_restrictions->range_or_slice_eq_null(options)) { // See #7852 and #9290.
-        throw exceptions::invalid_request_exception("Invalid null value in condition for a key column");
-    }
    auto cl = options.get_consistency();
    auto json_cache = maybe_prepare_json_cache(options);
    auto keys = build_partition_keys(options, json_cache);
@@ -245,6 +242,12 @@ modification_statement::execute(query_processor& qp, service::query_state& qs, c
    return modify_stage(this, seastar::ref(qp), seastar::ref(qs), seastar::cref(options));
 }

+void modification_statement::validate_primary_key_restrictions(const query_options& options) const {
+    if (_restrictions->range_or_slice_eq_null(options)) { // See #7852 and #9290.
+        throw exceptions::invalid_request_exception("Invalid null value in condition for a key column");
+    }
+}
+
 future<::shared_ptr<cql_transport::messages::result_message>>
 modification_statement::do_execute(query_processor& qp, service::query_state& qs, const query_options& options) const {
    if (has_conditions() && options.get_protocol_version() == 1) {
@@ -255,6 +258,8 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs

    inc_cql_stats(qs.get_client_state().is_internal());

+    validate_primary_key_restrictions(options);
+
    if (has_conditions()) {
        return execute_with_condition(qp, qs, options);
    }
@@ -539,12 +544,8 @@ modification_statement::validate(query_processor&, const service::client_state&
    }
 }

-bool modification_statement::depends_on_keyspace(const sstring& ks_name) const {
-    return keyspace() == ks_name;
-}
-
-bool modification_statement::depends_on_column_family(const sstring& cf_name) const {
-    return column_family() == cf_name;
+bool modification_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
+    return keyspace() == ks_name && (!cf_name || column_family() == *cf_name);
 }

 void modification_statement::add_operation(::shared_ptr<operation> op) {
--- a/cql3/statements/modification_statement.hh
+++ b/cql3/statements/modification_statement.hh
@@ -137,9 +137,7 @@ public:
    // Validate before execute, using client state and current schema
    void validate(query_processor&, const service::client_state& state) const override;

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    void add_operation(::shared_ptr<operation> op);

@@ -233,6 +231,8 @@ public:
    // True if this statement needs to read only static column values to check if it can be applied.
    bool has_only_static_column_conditions() const { return !_has_regular_column_conditions && _has_static_column_conditions; }

+    void validate_primary_key_restrictions(const query_options& options) const;
+
    virtual future<::shared_ptr<cql_transport::messages::result_message>>
    execute(query_processor& qp, service::query_state& qs, const query_options& options) const override;

--- a/cql3/statements/schema_altering_statement.cc
+++ b/cql3/statements/schema_altering_statement.cc
@@ -45,12 +45,7 @@ future<> schema_altering_statement::grant_permissions_to_creator(const service::
    return make_ready_future<>();
 }

-bool schema_altering_statement::depends_on_keyspace(const sstring& ks_name) const
-{
-    return false;
-}
-
-bool schema_altering_statement::depends_on_column_family(const sstring& cf_name) const
+bool schema_altering_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
 {
    return false;
 }
--- a/cql3/statements/schema_altering_statement.hh
+++ b/cql3/statements/schema_altering_statement.hh
@@ -53,9 +53,7 @@ protected:
     */
    virtual future<> grant_permissions_to_creator(const service::client_state&) const;

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual uint32_t get_bound_terms() const override;

--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -167,12 +167,8 @@ void select_statement::validate(query_processor&, const service::client_state& s
    // Nothing to do, all validation has been done by raw_statemet::prepare()
 }

-bool select_statement::depends_on_keyspace(const sstring& ks_name) const {
-    return keyspace() == ks_name;
-}
-
-bool select_statement::depends_on_column_family(const sstring& cf_name) const {
-    return column_family() == cf_name;
+bool select_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
+    return keyspace() == ks_name && (!cf_name || column_family() == *cf_name);
 }

 const sstring& select_statement::keyspace() const {
--- a/cql3/statements/select_statement.hh
+++ b/cql3/statements/select_statement.hh
@@ -100,8 +100,7 @@ public:
    virtual uint32_t get_bound_terms() const override;
    virtual future<> check_access(query_processor& qp, const service::client_state& state) const override;
    virtual void validate(query_processor&, const service::client_state& state) const override;
-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual future<::shared_ptr<cql_transport::messages::result_message>> execute(query_processor& qp,
        service::query_state& state, const query_options& options) const override;
--- a/cql3/statements/service_level_statement.cc
+++ b/cql3/statements/service_level_statement.cc
@@ -17,13 +17,7 @@ uint32_t service_level_statement::get_bound_terms() const {
    return 0;
 }

-bool service_level_statement::depends_on_keyspace(
-        const sstring &ks_name) const {
-    return false;
-}
-
-bool service_level_statement::depends_on_column_family(
-        const sstring &cf_name) const {
+bool service_level_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
    return false;
 }

--- a/cql3/statements/service_level_statement.hh
+++ b/cql3/statements/service_level_statement.hh
@@ -43,9 +43,7 @@ public:

    uint32_t get_bound_terms() const override;

-    bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    bool depends_on_column_family(const sstring& cf_name) const override;
+    bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    future<> check_access(query_processor& qp, const service::client_state& state) const override;

--- a/cql3/statements/sl_prop_defs.cc
+++ b/cql3/statements/sl_prop_defs.cc
@@ -30,7 +30,7 @@ void sl_prop_defs::validate() {
        data_value v = duration_type->deserialize(duration_type->from_string(*repr));
        cql_duration duration = static_pointer_cast<const duration_type_impl>(duration_type)->from_value(v);
        if (duration.months || duration.days) {
-            throw exceptions::invalid_request_exception("Timeout values cannot be longer than 24h");
+            throw exceptions::invalid_request_exception("Timeout values cannot be expressed in days/months");
        }
        if (duration.nanoseconds % 1'000'000 != 0) {
            throw exceptions::invalid_request_exception("Timeout values must be expressed in millisecond granularity");
--- a/cql3/statements/truncate_statement.cc
+++ b/cql3/statements/truncate_statement.cc
@@ -39,12 +39,7 @@ std::unique_ptr<prepared_statement> truncate_statement::prepare(data_dictionary:
    return std::make_unique<prepared_statement>(::make_shared<truncate_statement>(*this));
 }

-bool truncate_statement::depends_on_keyspace(const sstring& ks_name) const
-{
-    return false;
-}
-
-bool truncate_statement::depends_on_column_family(const sstring& cf_name) const
+bool truncate_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
 {
    return false;
 }
--- a/cql3/statements/truncate_statement.hh
+++ b/cql3/statements/truncate_statement.hh
@@ -30,9 +30,7 @@ public:

    virtual std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats) override;

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual future<> check_access(query_processor& qp, const service::client_state& state) const override;

--- a/cql3/statements/use_statement.cc
+++ b/cql3/statements/use_statement.cc
@@ -46,12 +46,7 @@ std::unique_ptr<prepared_statement> use_statement::prepare(data_dictionary::data

 }

-bool use_statement::depends_on_keyspace(const sstring& ks_name) const
-{
-    return false;
-}
-
-bool use_statement::depends_on_column_family(const sstring& cf_name) const
+bool use_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
 {
    return false;
 }
--- a/cql3/statements/use_statement.hh
+++ b/cql3/statements/use_statement.hh
@@ -31,9 +31,7 @@ public:

    virtual uint32_t get_bound_terms() const override;

-    virtual bool depends_on_keyspace(const seastar::sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const seastar::sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual seastar::future<> check_access(query_processor& qp, const service::client_state& state) const override;

--- a/cql3/type_json.cc
+++ b/cql3/type_json.cc
@@ -18,6 +18,8 @@
 #include "types/listlike_partial_deserializing_iterator.hh"
 #include "utils/managed_bytes.hh"
 #include "exceptions/exceptions.hh"
+#include <boost/algorithm/string/trim_all.hpp>
+#include <boost/algorithm/string.hpp>

 static inline bool is_control_char(char c) {
    return c >= 0 && c <= 0x1F;
@@ -78,8 +80,35 @@ static int64_t to_int64_t(const rjson::value& value) {
        return value.GetInt();
    } else if (value.IsUint()) {
        return value.GetUint();
-    } else if (value.GetUint64()) {
+    } else if (value.IsUint64()) {
        return value.GetUint64(); //NOTICE: large uint64_t values will get overflown
+    } else if (value.IsDouble()) {
+        // We allow specifing integer constants
+        // using scientific notation (for example 1.3e8)
+        // and floating-point numbers ending with .0 (for example 12.0),
+        // but not floating-point numbers with fractional part (12.34).
+        //
+        // The reason is that JSON standard does not have separate
+        // types for integers and floating-point numbers, only
+        // a single "number" type. Some serializers may
+        // produce an integer in that floating-point format.
+        double double_value = value.GetDouble();
+
+        // Check if the value contains disallowed fractional part (.34 from 12.34).
+        // With RapidJSON and an integer value in range [-(2^53)+1, (2^53)-1], 
+        // the fractional part will be zero as the entire value
+        // fits in 53-bit significand. RapidJSON's parsing code does not lose accuracy:
+        // when parsing a number like 12.34e8, it accumulates 1234 to a int64_t number,
+        // then converts it to double and multiples by power of 10, never having any
+        // digit in fractional part.
+        double integral;
+        double fractional = std::modf(double_value, &integral);
+        if (fractional != 0.0 && fractional != -0.0) {
+            throw marshal_exception(format("Incorrect JSON floating-point value "
+                "for int64 type: {} (it should not contain fractional part {})", value, fractional));
+        }
+
+        return double_value;
    }
    throw marshal_exception(format("Incorrect JSON value for int64 type: {}", value));
 }
@@ -189,7 +218,7 @@ struct from_json_object_visitor {
            throw marshal_exception("bytes_type must be represented as string");
        }
        std::string_view string_v = rjson::to_string_view(value);
-        if (string_v.size() < 2 && string_v[0] != '0' && string_v[1] != 'x') {
+        if (string_v.size() < 2 || string_v[0] != '0' || string_v[1] != 'x') {
            throw marshal_exception("Blob JSON strings must start with 0x");
        }
        string_v.remove_prefix(2);
@@ -197,6 +226,17 @@ struct from_json_object_visitor {
    }
    bytes operator()(const boolean_type_impl& t) {
        if (!value.IsBool()) {
+            if (value.IsString()) {
+                std::string str(rjson::to_string_view(value));
+                boost::trim_all(str);
+                boost::to_lower(str);
+
+                if (str == "true") {
+                    return t.decompose(true);
+                } else if (str == "false") {
+                    return t.decompose(false);
+                }
+            }
            throw marshal_exception(format("Invalid JSON object {}", value));
        }
        return t.decompose(value.GetBool());
--- a/cql3/util.hh
+++ b/cql3/util.hh
@@ -74,6 +74,13 @@ std::unique_ptr<cql3::statements::raw::select_statement> build_select_statement(
 /// forbids non-alpha-numeric characters in identifier names.
 /// Quoting involves wrapping the string in double-quotes ("). A double-quote
 /// character itself is quoted by doubling it.
+/// maybe_quote() also quotes reserved CQL keywords (e.g., "to", "where")
+/// but doesn't quote *unreserved* keywords (like ttl, int or as).
+/// Note that this means that if new reserved keywords are added to the
+/// parser, a saved output of maybe_quote() may no longer be parsable by
+/// parser. To avoid this forward-compatibility issue, use quote() instead
+/// of maybe_quote() - to unconditionally quote an identifier even if it is
+/// lowercase and not (yet) a keyword.
 sstring maybe_quote(const sstring& s);

 // Check whether timestamp is not too far in the future as this probably
--- a/db/batchlog_manager.cc
+++ b/db/batchlog_manager.cc
@@ -11,6 +11,7 @@
 */

 #include <chrono>
+#include <exception>
 #include <seastar/core/future-util.hh>
 #include <seastar/core/do_with.hh>
 #include <seastar/core/semaphore.hh>
@@ -247,6 +248,7 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
            } catch (data_dictionary::no_such_keyspace& ex) {
                // should probably ignore and drop the batch
            } catch (...) {
+                blogger.warn("Replay failed (will retry): {}", std::current_exception());
                // timeout, overload etc.
                // Do _not_ remove the batch, assuning we got a node write error.
                // Since we don't have hints (which origin is satisfied with),
--- a/db/config.cc
+++ b/db/config.cc
@@ -65,6 +65,25 @@ hinted_handoff_enabled_to_json(const db::config::hinted_handoff_enabled_type& h)
    return value_to_json(h.to_configuration_string());
 }

+// Convert a value that can be printed with operator<<, or a vector of
+// such values, to JSON. An example is enum_option<T>, because enum_option<T>
+// has a operator<<.
+template <typename T>
+static json::json_return_type
+printable_to_json(const T& e) {
+    return value_to_json(format("{}", e));
+}
+template <typename T>
+static json::json_return_type
+printable_vector_to_json(const std::vector<T>& e) {
+    std::vector<sstring> converted;
+    converted.reserve(e.size());
+    for (const auto& option : e) {
+        converted.push_back(format("{}", option));
+    }
+    return value_to_json(converted);
+}
+
 template <>
 const config_type config_type_for<bool> = config_type("bool", value_to_json<bool>);

@@ -109,11 +128,11 @@ const config_type config_type_for<db::seed_provider_type> = config_type("seed pr

 template <>
 const config_type config_type_for<std::vector<enum_option<db::experimental_features_t>>> = config_type(
-        "experimental features", value_to_json<std::vector<sstring>>);
+        "experimental features", printable_vector_to_json<enum_option<db::experimental_features_t>>);

 template <>
 const config_type config_type_for<enum_option<db::tri_mode_restriction_t>> = config_type(
-        "restriction mode", value_to_json<sstring>);
+        "restriction mode", printable_to_json<enum_option<db::tri_mode_restriction_t>>);

 template <>
 const config_type config_type_for<db::config::hinted_handoff_enabled_type> = config_type("hinted handoff enabled", hinted_handoff_enabled_to_json);
@@ -862,6 +881,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "Flush tables in the system_schema keyspace after schema modification. This is required for crash recovery, but slows down tests and can be disabled for them")
    , restrict_replication_simplestrategy(this, "restrict_replication_simplestrategy", liveness::LiveUpdate, value_status::Used, db::tri_mode_restriction_t::mode::FALSE, "Controls whether to disable SimpleStrategy replication. Can be true, false, or warn.")
    , restrict_dtcs(this, "restrict_dtcs", liveness::LiveUpdate, value_status::Used, db::tri_mode_restriction_t::mode::WARN, "Controls whether to prevent setting DateTieredCompactionStrategy. Can be true, false, or warn.")
+    , cache_index_pages(this, "cache_index_pages", liveness::LiveUpdate, value_status::Used, true,
+        "Keep SSTable index pages in the global cache after a SSTable read. Expected to improve performance for workloads with big partitions, but may degrade performance for workloads with small partitions.")
    , default_log_level(this, "default_log_level", value_status::Used)
    , logger_log_level(this, "logger_log_level", value_status::Used)
    , log_to_stdout(this, "log_to_stdout", value_status::Used)
--- a/db/config.hh
+++ b/db/config.hh
@@ -365,6 +365,9 @@ public:
    named_value<tri_mode_restriction> restrict_replication_simplestrategy;
    named_value<tri_mode_restriction> restrict_dtcs;

+
+    named_value<bool> cache_index_pages;
+
    seastar::logging_settings logging_settings(const log_cli::options&) const;

    const db::extensions& extensions() const;
--- a/db/legacy_schema_migrator.cc
+++ b/db/legacy_schema_migrator.cc
@@ -574,12 +574,8 @@ public:
    }

    future<> flush_schemas() {
-        return _qp.proxy().get_db().invoke_on_all([this] (replica::database& db) {
-            return parallel_for_each(db::schema_tables::all_table_names(schema_features::full()), [this, &db](const sstring& cf_name) {
-                auto& cf = db.find_column_family(db::schema_tables::NAME, cf_name);
-                return cf.flush();
-            });
-        });
+        auto& db = _qp.db().real_database();
+        return db.flush_on_all(db::schema_tables::NAME, db::schema_tables::all_table_names(schema_features::full()));
    }

    future<> migrate() {
--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -1042,12 +1042,9 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std:
    co_await proxy.local().mutate_locally(std::move(mutations), tracing::trace_state_ptr());

    if (do_flush) {
-        co_await proxy.local().get_db().invoke_on_all([&] (replica::database& db) -> future<> {
-            auto& cfs = column_families;
-            co_await parallel_for_each(cfs.begin(), cfs.end(), [&] (const utils::UUID& id) -> future<> {
-                auto& cf = db.find_column_family(id);
-                co_await cf.flush();
-            });
+        auto& db = proxy.local().local_db();
+        co_await parallel_for_each(column_families, [&db] (const utils::UUID& id) -> future<> {
+            return db.flush_on_all(id);
        });
    }

--- a/db/snapshot-ctl.cc
+++ b/db/snapshot-ctl.cc
@@ -11,6 +11,8 @@
 */

 #include <boost/range/adaptors.hpp>
+#include <seastar/core/coroutine.hh>
+#include <seastar/coroutine/maybe_yield.hh>
 #include "db/snapshot-ctl.hh"
 #include "replica/database.hh"

@@ -59,24 +61,21 @@ future<> snapshot_ctl::take_snapshot(sstring tag, std::vector<sstring> keyspace_
        boost::copy(_db.local().get_keyspaces() | boost::adaptors::map_keys, std::back_inserter(keyspace_names));
    };

-    return run_snapshot_modify_operation([tag = std::move(tag), keyspace_names = std::move(keyspace_names), sf, this] {
-        return parallel_for_each(keyspace_names, [tag, this] (auto& ks_name) {
-            return check_snapshot_not_exist(ks_name, tag);
-        }).then([this, tag, keyspace_names, sf] {
-            return _db.invoke_on_all([tag = std::move(tag), keyspace_names, sf] (replica::database& db) {
-                return parallel_for_each(keyspace_names, [&db, tag = std::move(tag), sf] (auto& ks_name) {
-                    auto& ks = db.find_keyspace(ks_name);
-                    return parallel_for_each(ks.metadata()->cf_meta_data(), [&db, tag = std::move(tag), sf] (auto& pair) {
-                        auto& cf = db.find_column_family(pair.second);
-                        return cf.snapshot(db, tag, bool(sf));
-                    });
-                });
-            });
-        });
+    return run_snapshot_modify_operation([tag = std::move(tag), keyspace_names = std::move(keyspace_names), sf, this] () mutable {
+        return do_take_snapshot(std::move(tag), std::move(keyspace_names), sf);
    });
 }

-future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf) {
+future<> snapshot_ctl::do_take_snapshot(sstring tag, std::vector<sstring> keyspace_names, skip_flush sf) {
+    co_await parallel_for_each(keyspace_names, [tag, this] (const auto& ks_name) {
+        return check_snapshot_not_exist(ks_name, tag);
+    });
+    co_await parallel_for_each(keyspace_names, [this, tag = std::move(tag), sf] (const auto& ks_name) {
+        return _db.local().snapshot_on_all(ks_name, tag, bool(sf));
+    });
+}
+
+future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf, allow_view_snapshots av) {
    if (ks_name.empty()) {
        throw std::runtime_error("You must supply a keyspace name");
    }
@@ -87,25 +86,25 @@ future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<
        throw std::runtime_error("You must supply a snapshot name.");
    }

-    return run_snapshot_modify_operation([this, ks_name = std::move(ks_name), tables = std::move(tables), tag = std::move(tag), sf] {
-        return check_snapshot_not_exist(ks_name, tag, tables).then([this, ks_name, tables, tag, sf] {
-            return do_with(std::vector<sstring>(std::move(tables)),[this, ks_name, tag, sf](const std::vector<sstring>& tables) {
-                return do_for_each(tables, [ks_name, tag, sf, this] (const sstring& table_name) {
-                    if (table_name.find(".") != sstring::npos) {
-                        throw std::invalid_argument("Cannot take a snapshot of a secondary index by itself. Run snapshot on the table that owns the index.");
-                    }
-                    return _db.invoke_on_all([ks_name, table_name, tag, sf] (replica::database &db) {
-                        auto& cf = db.find_column_family(ks_name, table_name);
-                        return cf.snapshot(db, tag, bool(sf));
-                    });
-                });
-            });
-        });
+    return run_snapshot_modify_operation([this, ks_name = std::move(ks_name), tables = std::move(tables), tag = std::move(tag), sf, av] () mutable {
+        return do_take_column_family_snapshot(std::move(ks_name), std::move(tables), std::move(tag), sf, av);
    });
 }

-future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf) {
-    return take_column_family_snapshot(ks_name, std::vector<sstring>{cf_name}, tag, sf);
+future<> snapshot_ctl::do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf, allow_view_snapshots av) {
+    co_await check_snapshot_not_exist(ks_name, tag, tables);
+
+    for (const auto& table_name : tables) {
+        auto& cf = _db.local().find_column_family(ks_name, table_name);
+        if (cf.schema()->is_view() && !av) {
+            throw std::invalid_argument("Do not take a snapshot of a materialized view or a secondary index by itself. Run snapshot on the base table instead.");
+        }
+    }
+    co_await _db.local().snapshot_on_all(ks_name, std::move(tables), std::move(tag), bool(sf));
+}
+
+future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf, allow_view_snapshots av) {
+    return take_column_family_snapshot(ks_name, std::vector<sstring>{cf_name}, tag, sf, av);
 }

 future<> snapshot_ctl::clear_snapshot(sstring tag, std::vector<sstring> keyspace_names, sstring cf_name) {
--- a/db/snapshot-ctl.hh
+++ b/db/snapshot-ctl.hh
@@ -27,6 +27,7 @@ namespace db {
 class snapshot_ctl : public peering_sharded_service<snapshot_ctl> {
 public:
    using skip_flush = bool_class<class skip_flush_tag>;
+    using allow_view_snapshots = bool_class<class allow_view_snapsots_tag>;

    struct snapshot_details {
        int64_t live;
@@ -64,7 +65,7 @@ public:
     * @param tables a vector of tables names to snapshot
     * @param tag the tag given to the snapshot; may not be null or empty
     */
-    future<> take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no);
+    future<> take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no, allow_view_snapshots av = allow_view_snapshots::no);

    /**
     * Takes the snapshot of a specific column family. A snapshot name must be specified.
@@ -73,7 +74,7 @@ public:
     * @param columnFamilyName the column family to snapshot
     * @param tag the tag given to the snapshot; may not be null or empty
     */
-    future<> take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf = skip_flush::no);
+    future<> take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf = skip_flush::no, allow_view_snapshots av = allow_view_snapshots::no);

    /**
     * Remove the snapshot with the given name from the given keyspaces.
@@ -97,6 +98,9 @@ private:

    template <typename Func>
    std::result_of_t<Func()> run_snapshot_list_operation(Func&&);
+
+    future<> do_take_snapshot(sstring tag, std::vector<sstring> keyspace_names, skip_flush sf = skip_flush::no);
+    future<> do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no, allow_view_snapshots av = allow_view_snapshots::no);
 };

 }
--- a/db/system_keyspace.cc
+++ b/db/system_keyspace.cc
@@ -2482,10 +2482,14 @@ class db_config_table final : public streaming_virtual_table {
            for (auto& c_ref : cfg.values()) {
                auto& c = c_ref.get();
                if (c.name() == name) {
-                    if (c.set_value(value, utils::config_file::config_source::CQL)) {
-                        return cfg.broadcast_to_all_shards();
-                    } else {
-                        return make_exception_future<>(virtual_table_update_exception("option is not live-updateable"));
+                    try {
+                        if (c.set_value(value, utils::config_file::config_source::CQL)) {
+                            return cfg.broadcast_to_all_shards();
+                        } else {
+                            return make_exception_future<>(virtual_table_update_exception("option is not live-updateable"));
+                        }
+                    } catch (boost::bad_lexical_cast&) {
+                        return make_exception_future<>(virtual_table_update_exception("cannot parse option value"));
                    }
                }
            }
--- a/db/system_keyspace_view_types.hh
+++ b/db/system_keyspace_view_types.hh
@@ -10,6 +10,7 @@

 #include <seastar/core/seastar.hh>
 #include <seastar/core/sstring.hh>
+#include <seastar/core/reactor.hh>
 #include <utility>
 #include <optional>
 #include "dht/token.hh"
--- a/db/view/row_locking.cc
+++ b/db/view/row_locking.cc
@@ -10,8 +10,6 @@
 #include "log.hh"
 #include "utils/latency.hh"

-#include <seastar/core/when_all.hh>
-
 static logging::logger mylog("row_locking");

 row_locker::row_locker(schema_ptr s)
@@ -76,35 +74,32 @@ row_locker::lock_pk(const dht::decorated_key& pk, bool exclusive, db::timeout_cl
 future<row_locker::lock_holder>
 row_locker::lock_ck(const dht::decorated_key& pk, const clustering_key_prefix& cpk, bool exclusive, db::timeout_clock::time_point timeout, stats& stats) {
    mylog.debug("taking shared lock on partition {}, and {} lock on row {} in it", pk, (exclusive ? "exclusive" : "shared"), cpk);
+    auto ck = cpk;
+    // Create a two-level lock entry for the partition if it doesn't exist already.
    auto i = _two_level_locks.try_emplace(pk, this).first;
+    // The two-level lock entry we've just created is guaranteed to be kept alive as long as it's locked.
+    // Initiating read locking in the background below ensures that even if the two-level lock is currently
+    // write-locked, releasing the write-lock will synchronously engage any waiting
+    // locks and will keep the entry alive.
    future<lock_type::holder> lock_partition = i->second._partition_lock.hold_read_lock(timeout);
-    auto j = i->second._row_locks.find(cpk);
-    if (j == i->second._row_locks.end()) {
-        // Not yet locked, need to create the lock. This makes a copy of cpk.
-        try {
-            j = i->second._row_locks.emplace(cpk, lock_type()).first;
-        } catch(...) {
-            // If this emplace() failed, e.g., out of memory, we fail. We
-            // could do nothing - the partition lock we already started
-            // taking will be unlocked automatically after being locked.
-            // But it's better form to wait for the work we started, and it
-            // will also allow us to remove the hash-table row we added.
-            return lock_partition.then([ex = std::current_exception()] (auto lock) {
-                // The lock is automatically released when "lock" goes out of scope.
-                // TODO: unlock (lock = {}) now, search for the partition in the
-                // hash table (we know it's still there, because we held the lock until
-                // now) and remove the unused lock from the hash table if still unused.
-                return make_exception_future<row_locker::lock_holder>(std::current_exception());
-            });
-        }
-    }
    single_lock_stats &single_lock_stats = exclusive ? stats.exclusive_row : stats.shared_row;
    single_lock_stats.operations_currently_waiting_for_lock++;
    utils::latency_counter waiting_latency;
    waiting_latency.start();
-    future<lock_type::holder> lock_row = exclusive ? j->second.hold_write_lock(timeout) : j->second.hold_read_lock(timeout);
-    return when_all_succeed(std::move(lock_partition), std::move(lock_row))
-    .then_unpack([this, pk = &i->first, cpk = &j->first, exclusive, &single_lock_stats, waiting_latency = std::move(waiting_latency)] (auto lock1, auto lock2) mutable {
+    return lock_partition.then([this, pk = &i->first, row_locks = &i->second._row_locks, ck = std::move(ck), exclusive, &single_lock_stats, waiting_latency = std::move(waiting_latency), timeout] (auto lock1) mutable {
+        auto j = row_locks->find(ck);
+        if (j == row_locks->end()) {
+            // Not yet locked, need to create the lock.
+            j = row_locks->emplace(std::move(ck), lock_type()).first;
+        }
+        auto* cpk = &j->first;
+        auto& row_lock = j->second;
+        // Like to the two-level lock entry above, the row_lock entry we've just created
+        // is guaranteed to be kept alive as long as it's locked.
+        // Initiating read/write locking in the background below ensures that.
+        auto lock_row = exclusive ? row_lock.hold_write_lock(timeout) : row_lock.hold_read_lock(timeout);
+        return lock_row.then([this, pk, cpk, exclusive, &single_lock_stats, waiting_latency = std::move(waiting_latency), lock1 = std::move(lock1)] (auto lock2) mutable {
+        // FIXME: indentation
        lock1.release();
        lock2.release();
        waiting_latency.stop();
@@ -112,6 +107,7 @@ row_locker::lock_ck(const dht::decorated_key& pk, const clustering_key_prefix& c
        single_lock_stats.lock_acquisitions++;
        single_lock_stats.operations_currently_waiting_for_lock--;
        return lock_holder(this, pk, cpk, exclusive);
+        });
    });
 }

--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -121,6 +121,9 @@ const column_definition* view_info::view_column(const column_definition& base_de

 void view_info::set_base_info(db::view::base_info_ptr base_info) {
    _base_info = std::move(base_info);
+    // Forget the cached objects which may refer to the base schema.
+    _select_statement = nullptr;
+    _partition_slice = std::nullopt;
 }

 // A constructor for a base info that can facilitate reads and writes from the materialized view.
@@ -322,7 +325,11 @@ public:
    view_filter_checking_visitor(const schema& base, const view_info& view)
        : _base(base)
        , _view(view)
-        , _selection(cql3::selection::selection::wildcard(_base.shared_from_this()))
+        , _selection(cql3::selection::selection::for_columns(_base.shared_from_this(),
+            boost::copy_range<std::vector<const column_definition*>>(
+                _base.regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return &cdef; }))
+            )
+        )
    {}

    void accept_new_partition(const partition_key& key, uint64_t row_count) {
@@ -859,13 +866,18 @@ void view_updates::generate_update(
    bool same_row = true;
    for (auto col_id : col_ids) {
        auto* after = update.cells().find_cell(col_id);
-        // Note: multi-cell columns can't be part of the primary key.
        auto& cdef = _base->regular_column_at(col_id);
        if (existing) {
            auto* before = existing->cells().find_cell(col_id);
+            // Note that this cell is necessarily atomic, because col_ids are
+            // view key columns, and keys must be atomic.
            if (before && before->as_atomic_cell(cdef).is_live()) {
                if (after && after->as_atomic_cell(cdef).is_live()) {
-                    auto cmp = compare_atomic_cell_for_merge(before->as_atomic_cell(cdef), after->as_atomic_cell(cdef));
+                    // We need to compare just the values of the keys, not
+                    // metadata like the timestamp. This is because below,
+                    // if the old and new view row have the same key, we need
+                    // to be sure to reach the update_entry() case.
+                    auto cmp = compare_unsigned(before->as_atomic_cell(cdef).value(), after->as_atomic_cell(cdef).value());
                    if (cmp != 0) {
                        same_row = false;
                    }
@@ -885,7 +897,13 @@ void view_updates::generate_update(
            if (same_row) {
                update_entry(base_key, update, *existing, now);
            } else {
-                replace_entry(base_key, update, *existing, now);
+                // This code doesn't work if the old and new view row have the
+                // same key, because if they do we get both data and tombstone
+                // for the same timestamp (now) and the tombstone wins. This
+                // is why we need the "same_row" case above - it's not just a
+                // performance optimization.
+                delete_old_entry(base_key, *existing, update, now);
+                create_entry(base_key, update, now);
            }
        } else {
            delete_old_entry(base_key, *existing, update, now);
@@ -929,8 +947,12 @@ future<stop_iteration> view_update_builder::stop() const {
    return make_ready_future<stop_iteration>(stop_iteration::yes);
 }

-future<utils::chunked_vector<frozen_mutation_and_schema>> view_update_builder::build_some() {
+future<std::optional<utils::chunked_vector<frozen_mutation_and_schema>>> view_update_builder::build_some() {
    return advance_all().then([this] (stop_iteration ignored) {
+        if (!_update && !_existing) {
+            // Tell the caller there is no more data to build.
+            return make_ready_future<std::optional<utils::chunked_vector<frozen_mutation_and_schema>>>(std::nullopt);
+        }
        bool do_advance_updates = false;
        bool do_advance_existings = false;
        if (_update && _update->is_partition_start()) {
@@ -942,22 +964,23 @@ future<utils::chunked_vector<frozen_mutation_and_schema>> view_update_builder::b
            _existing_tombstone_tracker.set_partition_tombstone(_existing->as_partition_start().partition_tombstone());
            do_advance_existings = true;
        }
+        future<stop_iteration> f = make_ready_future<stop_iteration>(stop_iteration::no);
        if (do_advance_updates) {
-            return do_advance_existings ? advance_all() : advance_updates();
+            f = do_advance_existings ? advance_all() : advance_updates();
        } else if (do_advance_existings) {
-            return advance_existings();
+            f = advance_existings();
        }
-        return make_ready_future<stop_iteration>(stop_iteration::no);
-    }).then([this] (stop_iteration ignored) {
-        return repeat([this] {
-            return this->on_results();
+        return std::move(f).then([this] (stop_iteration ignored) {
+            return repeat([this] {
+                return this->on_results();
+            });
+        }).then([this] {
+            utils::chunked_vector<frozen_mutation_and_schema> mutations;
+            for (auto& update : _view_updates) {
+                update.move_to(mutations);
+            }
+            return std::make_optional(mutations);
        });
-    }).then([this] {
-        utils::chunked_vector<frozen_mutation_and_schema> mutations;
-        for (auto& update : _view_updates) {
-            update.move_to(mutations);
-        }
-        return mutations;
    });
 }

@@ -1293,7 +1316,7 @@ future<> mutate_MV(
            auto mut_ptr = remote_endpoints.empty() ? std::make_unique<frozen_mutation>(std::move(mut.fm)) : std::make_unique<frozen_mutation>(mut.fm);
            tracing::trace(tr_state, "Locally applying view update for {}.{}; base token = {}; view token = {}",
                    mut.s->ks_name(), mut.s->cf_name(), base_token, view_token);
-            local_view_update = service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr, std::move(tr_state), db::commitlog::force_sync::no).then_wrapped(
+            local_view_update = service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr, tr_state, db::commitlog::force_sync::no).then_wrapped(
                    [s = mut.s, &stats, &cf_stats, tr_state, base_token, view_token, my_address, mut_ptr = std::move(mut_ptr),
                            units = sem_units.split(sem_units.count())] (future<>&& f) {
                --stats.writes;
@@ -2031,15 +2054,21 @@ public:
 // Called in the context of a seastar::thread.
 void view_builder::execute(build_step& step, exponential_backoff_retry r) {
    gc_clock::time_point now = gc_clock::now();
-    auto consumer = compact_for_query<emit_only_live_rows::yes, view_builder::consumer>(
+    auto compaction_state = make_lw_shared<compact_for_query_state<emit_only_live_rows::yes>>(
            *step.reader.schema(),
            now,
            step.pslice,
            batch_size,
-            query::max_partitions,
-            view_builder::consumer{*this, step, now});
-    consumer.consume_new_partition(step.current_key); // Initialize the state in case we're resuming a partition
+            query::max_partitions);
+    auto consumer = compact_for_query<emit_only_live_rows::yes, view_builder::consumer>(compaction_state, view_builder::consumer{*this, step, now});
    auto built = step.reader.consume_in_thread(std::move(consumer));
+    if (auto ds = std::move(*compaction_state).detach_state()) {
+        auto& range_tombstones = std::get<std::deque<range_tombstone>>(ds->range_tombstones);
+        for (auto& rt : range_tombstones) {
+            step.reader.unpop_mutation_fragment(mutation_fragment(*step.reader.schema(), step.reader.permit(), std::move(rt)));
+        }
+        step.reader.unpop_mutation_fragment(mutation_fragment(*step.reader.schema(), step.reader.permit(), std::move(ds->partition_start)));
+    }

    _as.check();

@@ -2121,24 +2150,28 @@ update_backlog node_update_backlog::add_fetch(unsigned shard, update_backlog bac
    return std::max(backlog, _max.load(std::memory_order_relaxed));
 }

-future<bool> check_view_build_ongoing(db::system_distributed_keyspace& sys_dist_ks, const sstring& ks_name, const sstring& cf_name) {
-    return sys_dist_ks.view_status(ks_name, cf_name).then([] (std::unordered_map<utils::UUID, sstring>&& view_statuses) {
-        return boost::algorithm::any_of(view_statuses | boost::adaptors::map_values, [] (const sstring& view_status) {
-            return view_status == "STARTED";
+future<bool> check_view_build_ongoing(db::system_distributed_keyspace& sys_dist_ks, const locator::token_metadata& tm, const sstring& ks_name,
+        const sstring& cf_name) {
+    using view_statuses_type = std::unordered_map<utils::UUID, sstring>;
+    return sys_dist_ks.view_status(ks_name, cf_name).then([&tm] (view_statuses_type&& view_statuses) {
+        return boost::algorithm::any_of(view_statuses, [&tm] (const view_statuses_type::value_type& view_status) {
+            // Only consider status of known hosts.
+            return view_status.second == "STARTED" && tm.get_endpoint_for_host_id(view_status.first);
        });
    });
 }

-future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const replica::table& t, streaming::stream_reason reason) {
+future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const locator::token_metadata& tm, const replica::table& t,
+        streaming::stream_reason reason) {
    if (is_internal_keyspace(t.schema()->ks_name())) {
        return make_ready_future<bool>(false);
    }
    if (reason == streaming::stream_reason::repair && !t.views().empty()) {
        return make_ready_future<bool>(true);
    }
-    return do_with(t.views(), [&sys_dist_ks] (auto& views) {
+    return do_with(t.views(), [&sys_dist_ks, &tm] (auto& views) {
        return map_reduce(views,
-                [&sys_dist_ks] (const view_ptr& view) { return check_view_build_ongoing(sys_dist_ks, view->ks_name(), view->cf_name()); },
+                [&sys_dist_ks, &tm] (const view_ptr& view) { return check_view_build_ongoing(sys_dist_ks, tm, view->ks_name(), view->cf_name()); },
                false,
                std::logical_or<bool>());
    });
--- a/db/view/view.hh
+++ b/db/view/view.hh
@@ -154,10 +154,7 @@ private:
    void delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now);
    void do_delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now);
    void update_entry(const partition_key& base_key, const clustering_row& update, const clustering_row& existing, gc_clock::time_point now);
-    void replace_entry(const partition_key& base_key, const clustering_row& update, const clustering_row& existing, gc_clock::time_point now) {
-        create_entry(base_key, update, now);
-        delete_old_entry(base_key, existing, update, now);
-    }
+    void update_entry_for_computed_column(const partition_key& base_key, const clustering_row& update, const std::optional<clustering_row>& existing, gc_clock::time_point now);
 };

 class view_update_builder {
@@ -188,7 +185,15 @@ public:
    }
    view_update_builder(view_update_builder&& other) noexcept = default;

-    future<utils::chunked_vector<frozen_mutation_and_schema>> build_some();
+
+    // build_some() works on batches of 100 (max_rows_for_view_updates)
+    // updated rows, but can_skip_view_updates() can decide that some of
+    // these rows do not effect the view, and as a result build_some() can
+    // fewer than 100 rows - in extreme cases even zero (see issue #12297).
+    // So we can't use an empty returned vector to signify that the view
+    // update building is done - and we wrap the return value in an
+    // std::optional, which is disengaged when the iteration is done.
+    future<std::optional<utils::chunked_vector<frozen_mutation_and_schema>>> build_some();

    future<> close() noexcept;

--- a/db/view/view_update_checks.hh
+++ b/db/view/view_update_checks.hh
@@ -22,9 +22,13 @@ class system_distributed_keyspace;

 }

+namespace locator {
+class token_metadata;
+}
+
 namespace db::view {

-future<bool> check_view_build_ongoing(db::system_distributed_keyspace& sys_dist_ks, const sstring& ks_name, const sstring& cf_name);
-future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const replica::table& t, streaming::stream_reason reason);
+future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const locator::token_metadata& tm, const replica::table& t,
+        streaming::stream_reason reason);

 }
--- a/db/view/view_updating_consumer.hh
+++ b/db/view/view_updating_consumer.hh
@@ -16,6 +16,7 @@
 #include "db/view/row_locking.hh"
 #include <seastar/core/abort_source.hh>
 #include "mutation.hh"
+#include <seastar/core/circular_buffer.hh>

 class evictable_reader_handle;

--- a/dht/murmur3_partitioner.cc
+++ b/dht/murmur3_partitioner.cc
@@ -15,11 +15,18 @@

 namespace dht {

+// Note: Cassandra has a special case where for an empty key it returns
+// minimum_token() instead of 0 (the naturally-calculated hash function for
+// an empty string). Their thinking was that empty partition keys are not
+// allowed anyway. However, they *are* allowed in materialized views, so the
+// empty-key partition should get a real token, not an invalid token, so
+// we dropped this special case. Since we don't support migrating sstables of
+// materialized-views from Cassandra, this Cassandra-Scylla incompatiblity
+// will not cause problems in practice.
+// Note that get_token(const schema& s, partition_key_view key) below must
+// use exactly the same algorithm as this function.
 token
 murmur3_partitioner::get_token(bytes_view key) const {
-    if (key.empty()) {
-        return minimum_token();
-    }
    std::array<uint64_t, 2> hash;
    utils::murmur_hash::hash3_x64_128(key, 0, hash);
    return get_token(hash[0]);
--- a/dirty_memory_manager.hh
+++ b/dirty_memory_manager.hh
@@ -202,6 +202,12 @@ public:
        });
    }

+    future<flush_permit> get_all_flush_permits() {
+        return get_units(_background_work_flush_serializer, _max_background_work).then([this] (auto&& units) {
+            return this->get_flush_permit(std::move(units));
+        });
+    }
+
    bool has_extraneous_flushes_requested() const {
        return _extraneous_flushes > 0;
    }
--- a/dist/common/scripts/scylla_coredump_setup
+++ b/dist/common/scripts/scylla_coredump_setup
@@ -42,7 +42,8 @@ if __name__ == '__main__':
        if systemd_unit.available('systemd-coredump@.service'):
            dropin = '''
 [Service]
-TimeoutStartSec=infinity
+RuntimeMaxSec=infinity
+TimeoutSec=infinity
 '''[1:-1]
            os.makedirs('/etc/systemd/system/systemd-coredump@.service.d', exist_ok=True)
            with open('/etc/systemd/system/systemd-coredump@.service.d/timeout.conf', 'w') as f:
@@ -123,10 +124,14 @@ WantedBy=multi-user.target
        #  - Storage: /path/to/file (inacessible)
        #  - Storage: /path/to/file
        #
+        # After systemd-v248, available coredump file output changed like this:
+        #  - Storage: /path/to/file (present)
+        # We need to support both versions.
+        #
        # reference: https://github.com/systemd/systemd/commit/47f50642075a7a215c9f7b600599cbfee81a2913

        corefail = False
-        res = re.findall(r'Storage: (.*)$', coreinfo, flags=re.MULTILINE)
+        res = re.findall(r'Storage: (\S+)(?: \(.+\))?$', coreinfo, flags=re.MULTILINE)
        # v232 or later
        if res:
            corepath = res[0]
--- a/dist/common/scripts/scylla_raid_setup
+++ b/dist/common/scripts/scylla_raid_setup
@@ -16,7 +16,7 @@ import stat
 import distro
 from pathlib import Path
 from scylla_util import *
-from subprocess import run
+from subprocess import run, SubprocessError

 if __name__ == '__main__':
    if os.getuid() > 0:
@@ -137,7 +137,9 @@ if __name__ == '__main__':
    # stalling. The minimum block size for crc enabled filesystems is 1024,
    # and it also cannot be smaller than the sector size.
    block_size = max(1024, sector_size)
+    run('udevadm settle', shell=True, check=True)
    run(f'mkfs.xfs -b size={block_size} {fsdev} -f -K', shell=True, check=True)
+    run('udevadm settle', shell=True, check=True)

    if is_debian_variant():
        confpath = '/etc/mdadm/mdadm.conf'
@@ -153,6 +155,11 @@ if __name__ == '__main__':
    os.makedirs(mount_at, exist_ok=True)

    uuid = run(f'blkid -s UUID -o value {fsdev}', shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
+    if not uuid:
+        raise Exception(f'Failed to get UUID of {fsdev}')
+
+    uuidpath = f'/dev/disk/by-uuid/{uuid}'
+
    after = 'local-fs.target'
    wants = ''
    if raid and args.raid_level != '0':
@@ -169,7 +176,7 @@ After={after}{wants}
 DefaultDependencies=no

 [Mount]
-What=/dev/disk/by-uuid/{uuid}
+What={uuidpath}
 Where={mount_at}
 Type=xfs
 Options=noatime{opt_discard}
@@ -191,8 +198,16 @@ WantedBy=multi-user.target
    systemd_unit.reload()
    if args.raid_level != '0':
        md_service.start()
-    mount = systemd_unit(mntunit_bn)
-    mount.start()
+    try:
+        mount = systemd_unit(mntunit_bn)
+        mount.start()
+    except SubprocessError as e:
+        if not os.path.exists(uuidpath):
+            print(f'\nERROR: {uuidpath} is not found\n')
+        elif not stat.S_ISBLK(os.stat(uuidpath).st_mode):
+            print(f'\nERROR: {uuidpath} is not block device\n')
+        raise e
+
    if args.enable_on_nextboot:
        mount.enable()
    uid = pwd.getpwnam('scylla').pw_uid
--- a/dist/common/scripts/scylla_setup
+++ b/dist/common/scripts/scylla_setup
@@ -214,7 +214,7 @@ if __name__ == '__main__':
                        help='skip raid setup')
    parser.add_argument('--raid-level-5', action='store_true', default=False,
                        help='use RAID5 for RAID volume')
-    parser.add_argument('--online-discard', default=True,
+    parser.add_argument('--online-discard', default=1, choices=[0, 1], type=int,
                        help='Configure XFS to discard unused blocks as soon as files are deleted')
    parser.add_argument('--nic',
                        help='specify NIC')
@@ -458,7 +458,7 @@ if __name__ == '__main__':
        args.no_raid_setup = not raid_setup
        if raid_setup:
            level = '5' if raid_level_5 else '0'
-            run_setup_script('RAID', f'scylla_raid_setup --disks {disks} --enable-on-nextboot --raid-level={level} --online-discard={int(online_discard)}')
+            run_setup_script('RAID', f'scylla_raid_setup --disks {disks} --enable-on-nextboot --raid-level={level} --online-discard={online_discard}')

        coredump_setup = interactive_ask_service('Do you want to enable coredumps?', 'Yes - sets up coredump to allow a post-mortem analysis of the Scylla state just prior to a crash. No - skips this step.', coredump_setup)
        args.no_coredump_setup = not coredump_setup
--- a/dist/common/scripts/scylla_sysconfig_setup
+++ b/dist/common/scripts/scylla_sysconfig_setup
@@ -70,7 +70,17 @@ if __name__ == '__main__':
    network_mode = args.mode if args.mode else cfg.get('NETWORK_MODE')

    if args.setup_nic_and_disks:
-        rps_cpus = run('{} --tune net --nic {} --get-cpu-mask'.format(perftune_base_command(), ifname), shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
+        res = run('{} --tune net --nic {} --get-cpu-mask'.format(perftune_base_command(), ifname), shell=True, check=True, capture_output=True, encoding='utf-8').stdout
+        # we need to extract CPU mask from output, since perftune.py may also print warning messages (#10082)
+        match = re.match('(.*\n)?(0x[0-9a-f]+(?:,0x[0-9a-f]+)*)', res, re.DOTALL)
+        try:
+            warning = match.group(1)
+            rps_cpus = match.group(2)
+        except:
+            raise Exception(f'Failed to retrive CPU mask: {res}')
+        # print warning message if available
+        if warning:
+            print(warning.strip())
        if len(rps_cpus) > 0:
            cpuset = hex2list(rps_cpus)
            run('/opt/scylladb/scripts/scylla_cpuset_setup --cpuset {}'.format(cpuset), shell=True, check=True)
--- a/dist/common/supervisor/scylla_util.sh
+++ b/dist/common/supervisor/scylla_util.sh
@@ -6,12 +6,16 @@ is_nonroot() {
    [ -f "$scylladir"/SCYLLA-NONROOT-FILE ]
 }

+is_container() {
+    [ -f "$scylladir"/SCYLLA-CONTAINER-FILE ]
+}
+
 is_privileged() {
    [ ${EUID:-${UID}} = 0 ]
 }

 execsudo() {
-    if is_nonroot; then
+    if is_nonroot || is_container; then
        exec "$@"
    else
        exec sudo -u scylla -g scylla "$@"
--- a/dist/docker/debian/build_docker.sh
+++ b/dist/docker/debian/build_docker.sh
@@ -82,15 +82,17 @@ run bash -ec "echo 'debconf debconf/frontend select Noninteractive' | debconf-se
 run bash -ec "rm -rf /etc/rsyslog.conf"
 run apt-get -y install hostname supervisor openssh-server openssh-client openjdk-11-jre-headless python python-yaml curl rsyslog locales sudo
 run locale-gen en_US.UTF-8
-run update-locale LANG=en_US.UTF-8 LANGUAGE=en_US:en LC_ALL=en_US.UTF_8
+run update-locale LANG=en_US.UTF-8 LANGUAGE=en_US:en LC_ALL=en_US.UTF-8
 run bash -ec "dpkg -i packages/*.deb"
 run apt-get -y clean all
 run bash -ec "cat /scylla_bashrc >> /etc/bash.bashrc"
 run mkdir -p /etc/supervisor.conf.d
 run mkdir -p /var/log/scylla
 run chown -R scylla:scylla /var/lib/scylla
+run sed -i -e 's/^SCYLLA_ARGS=".*"$/SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --network-stack posix"/' /etc/default/scylla-server

 run mkdir -p /opt/scylladb/supervisor
+run touch /opt/scylladb/SCYLLA-CONTAINER-FILE
 bcp dist/common/supervisor/scylla-server.sh /opt/scylladb/supervisor/scylla-server.sh
 bcp dist/common/supervisor/scylla-jmx.sh /opt/scylladb/supervisor/scylla-jmx.sh
 bcp dist/common/supervisor/scylla-node-exporter.sh /opt/scylladb/supervisor/scylla-node-exporter.sh
--- a/dist/docker/etc/supervisord.conf.d/scylla-server.conf
+++ b/dist/docker/etc/supervisord.conf.d/scylla-server.conf
@@ -1,4 +1,4 @@
-[program:scylla-server]
+[program:scylla]
 command=/opt/scylladb/supervisor/scylla-server.sh
 stdout_logfile=/dev/stdout
 stdout_logfile_maxbytes=0
--- a/dist/docker/etc/sysconfig/scylla-server
+++ b/dist/docker/etc/sysconfig/scylla-server
@@ -1,41 +0,0 @@
-# choose following mode: virtio, dpdk, posix
-NETWORK_MODE=posix
-
-# tap device name(virtio)
-TAP=tap0
-
-# bridge device name (virtio)
-BRIDGE=virbr0
-
-# ethernet device name
-IFNAME=eth0
-
-# setup NIC's and disks' interrupts, RPS, XPS, nomerges and I/O scheduler (posix)
-SET_NIC_AND_DISKS=no
-
-# ethernet device driver (dpdk)
-ETHDRV=
-
-# ethernet device PCI ID (dpdk)
-ETHPCIID=
-
-# number of hugepages
-NR_HUGEPAGES=64
-
-# user for process (must be root for dpdk)
-USER=scylla
-
-# group for process
-GROUP=scylla
-
-# scylla home dir
-SCYLLA_HOME=/var/lib/scylla
-
-# scylla config dir
-SCYLLA_CONF=/etc/scylla
-
-# scylla arguments
-SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --network-stack posix"
-
-# setup as AMI instance
-AMI=no
--- a/dist/docker/scyllasetup.py
+++ b/dist/docker/scyllasetup.py
@@ -68,7 +68,12 @@ class ScyllaSetup:

    def cqlshrc(self):
        home = os.environ['HOME']
-        hostname = subprocess.check_output(['hostname', '-i']).decode('ascii').strip()
+        if self._rpcAddress:
+            hostname = self._rpcAddress
+        elif self._listenAddress:
+            hostname = self._listenAddress
+        else:
+            hostname = subprocess.check_output(['hostname', '-i']).decode('ascii').strip()
        with open("%s/.cqlshrc" % home, "w") as cqlshrc:
            cqlshrc.write("[connection]\nhostname = %s\n" % hostname)

--- a/flat_mutation_reader.cc
+++ b/flat_mutation_reader.cc
@@ -32,7 +32,7 @@
 logging::logger fmr_logger("flat_mutation_reader");

 flat_mutation_reader& flat_mutation_reader::operator=(flat_mutation_reader&& o) noexcept {
-    if (_impl) {
+    if (_impl && _impl->is_close_required()) {
        impl* ip = _impl.get();
        // Abort to enforce calling close() before readers are closed
        // to prevent leaks and potential use-after-free due to background
@@ -45,7 +45,7 @@ flat_mutation_reader& flat_mutation_reader::operator=(flat_mutation_reader&& o)
 }

 flat_mutation_reader::~flat_mutation_reader() {
-    if (_impl) {
+    if (_impl && _impl->is_close_required()) {
        impl* ip = _impl.get();
        // Abort to enforce calling close() before readers are closed
        // to prevent leaks and potential use-after-free due to background
@@ -774,11 +774,14 @@ make_flat_mutation_reader_from_mutations_v2(schema_ptr s, reader_permit permit,
        std::optional<mutation_consume_cookie> _cookie;

    private:
-        void flush_tombstones(position_in_partition_view pos) {
+        void flush_tombstones(position_in_partition_view pos, bool emit_end = false) {
            _rt_gen.flush(pos, [&] (range_tombstone_change rt) {
                _current_rt = rt.tombstone();
                push_mutation_fragment(*_schema, _permit, std::move(rt));
            });
+            if (emit_end && _current_rt) {
+                push_mutation_fragment(*_schema, _permit, range_tombstone_change(pos, {}));
+            }
        }
        void maybe_emit_partition_start() {
            if (_dk) {
@@ -815,10 +818,7 @@ make_flat_mutation_reader_from_mutations_v2(schema_ptr s, reader_permit permit,
                return stop_iteration::yes;
            }
            maybe_emit_partition_start();
-            flush_tombstones(position_in_partition::after_all_clustered_rows());
-            if (_current_rt) {
-                push_mutation_fragment(*_schema, _permit, range_tombstone_change(position_in_partition::after_all_clustered_rows(), {}));
-            }
+            flush_tombstones(position_in_partition::after_all_clustered_rows(), true);
            push_mutation_fragment(*_schema, _permit, partition_end{});
            return stop_iteration::no;
        }
@@ -1580,6 +1580,9 @@ bool mutation_fragment_stream_validator::operator()(dht::token t) {
 }

 bool mutation_fragment_stream_validator::operator()(mutation_fragment_v2::kind kind, position_in_partition_view pos) {
+    if (kind == mutation_fragment_v2::kind::partition_end && _current_tombstone) {
+        return false;
+    }
    if (_prev_kind == mutation_fragment_v2::kind::partition_end) {
        const bool valid = (kind == mutation_fragment_v2::kind::partition_start);
        if (valid) {
@@ -1607,7 +1610,11 @@ bool mutation_fragment_stream_validator::operator()(mutation_fragment::kind kind
 }

 bool mutation_fragment_stream_validator::operator()(const mutation_fragment_v2& mf) {
-    return (*this)(mf.mutation_fragment_kind(), mf.position());
+    const auto valid = (*this)(mf.mutation_fragment_kind(), mf.position());
+    if (valid && mf.is_range_tombstone_change()) {
+        _current_tombstone = mf.as_range_tombstone_change().tombstone();
+    }
+    return valid;
 }
 bool mutation_fragment_stream_validator::operator()(const mutation_fragment& mf) {
    return (*this)(to_mutation_fragment_kind_v2(mf.mutation_fragment_kind()), mf.position());
@@ -1646,11 +1653,17 @@ void mutation_fragment_stream_validator::reset(dht::decorated_key dk) {
    _prev_partition_key = dk;
    _prev_pos = position_in_partition::for_partition_start();
    _prev_kind = mutation_fragment_v2::kind::partition_start;
+    _current_tombstone = {};
 }

 void mutation_fragment_stream_validator::reset(const mutation_fragment_v2& mf) {
    _prev_pos = mf.position();
    _prev_kind = mf.mutation_fragment_kind();
+    if (mf.is_range_tombstone_change()) {
+        _current_tombstone = mf.as_range_tombstone_change().tombstone();
+    } else {
+        _current_tombstone = {};
+    }
 }
 void mutation_fragment_stream_validator::reset(const mutation_fragment& mf) {
    _prev_pos = mf.position();
@@ -1719,6 +1732,11 @@ bool mutation_fragment_stream_validating_filter::operator()(mutation_fragment_v2

    fmr_logger.debug("[validator {}] {}:{}", static_cast<void*>(this), kind, pos);

+    if (kind == mutation_fragment_v2::kind::partition_end && _current_tombstone) {
+        on_validation_error(fmr_logger, format("[validator {} for {}] Unexpected active tombstone at partition-end: partition key {}: tombstone {}",
+                static_cast<void*>(this), _name, _validator.previous_partition_key(), _current_tombstone));
+    }
+
    if (_validation_level >= mutation_fragment_stream_validation_level::clustering_key) {
        valid = _validator(kind, pos);
    } else {
@@ -1745,7 +1763,11 @@ bool mutation_fragment_stream_validating_filter::operator()(mutation_fragment::k
 }

 bool mutation_fragment_stream_validating_filter::operator()(const mutation_fragment_v2& mv) {
-    return (*this)(mv.mutation_fragment_kind(), mv.position());
+    auto valid = (*this)(mv.mutation_fragment_kind(), mv.position());
+    if (valid && mv.is_range_tombstone_change()) {
+        _current_tombstone = mv.as_range_tombstone_change().tombstone();
+    }
+    return valid;
 }
 bool mutation_fragment_stream_validating_filter::operator()(const mutation_fragment& mv) {
    return (*this)(to_mutation_fragment_kind_v2(mv.mutation_fragment_kind()), mv.position());
@@ -1764,7 +1786,7 @@ void mutation_fragment_stream_validating_filter::on_end_of_stream() {
 }

 flat_mutation_reader_v2& flat_mutation_reader_v2::operator=(flat_mutation_reader_v2&& o) noexcept {
-    if (_impl) {
+    if (_impl && _impl->is_close_required()) {
        impl* ip = _impl.get();
        // Abort to enforce calling close() before readers are closed
        // to prevent leaks and potential use-after-free due to background
@@ -1777,7 +1799,7 @@ flat_mutation_reader_v2& flat_mutation_reader_v2::operator=(flat_mutation_reader
 }

 flat_mutation_reader_v2::~flat_mutation_reader_v2() {
-    if (_impl) {
+    if (_impl && _impl->is_close_required()) {
        impl* ip = _impl.get();
        // Abort to enforce calling close() before readers are closed
        // to prevent leaks and potential use-after-free due to background
@@ -1964,11 +1986,14 @@ flat_mutation_reader_v2 upgrade_to_v2(flat_mutation_reader r) {
        tombstone _current_rt;
        std::optional<position_range> _pr;
    public:
-        void flush_tombstones(position_in_partition_view pos) {
+        void flush_tombstones(position_in_partition_view pos, bool emit_end = false) {
            _rt_gen.flush(pos, [&] (range_tombstone_change rt) {
                _current_rt = rt.tombstone();
                push_mutation_fragment(*_schema, _permit, std::move(rt));
            });
+            if (emit_end && _current_rt) {
+                push_mutation_fragment(*_schema, _permit, range_tombstone_change(pos, {}));
+            }
        }
        void consume(static_row mf) {
            push_mutation_fragment(*_schema, _permit, std::move(mf));
@@ -1993,11 +2018,9 @@ flat_mutation_reader_v2 upgrade_to_v2(flat_mutation_reader r) {
            push_mutation_fragment(*_schema, _permit, std::move(mf));
        }
        void consume(partition_end mf) {
-            flush_tombstones(position_in_partition::after_all_clustered_rows());
+            flush_tombstones(position_in_partition::after_all_clustered_rows(), true);
            if (_current_rt) {
                assert(!_pr);
-                push_mutation_fragment(*_schema, _permit, range_tombstone_change(
-                        position_in_partition::after_all_clustered_rows(), {}));
            }
            push_mutation_fragment(*_schema, _permit, std::move(mf));
        }
@@ -2020,10 +2043,7 @@ flat_mutation_reader_v2 upgrade_to_v2(flat_mutation_reader r) {
                if (_reader.is_end_of_stream() && _reader.is_buffer_empty()) {
                    if (_pr) {
                        // If !_pr we should flush on partition_end
-                        flush_tombstones(_pr->end());
-                        if (_current_rt) {
-                            push_mutation_fragment(*_schema, _permit, range_tombstone_change(_pr->end(), {}));
-                        }
+                        flush_tombstones(_pr->end(), true);
                    }
                    _end_of_stream = true;
                }
--- a/flat_mutation_reader.hh
+++ b/flat_mutation_reader.hh
@@ -132,6 +132,7 @@ public:
    private:
        tracked_buffer _buffer;
        size_t _buffer_size = 0;
+        bool _close_required = false;
    protected:
        size_t max_buffer_size_in_bytes = default_max_buffer_size_in_bytes();
        bool _end_of_stream = false;
@@ -167,6 +168,8 @@ public:
        bool is_end_of_stream() const { return _end_of_stream; }
        bool is_buffer_empty() const { return _buffer.empty(); }
        bool is_buffer_full() const { return _buffer_size >= max_buffer_size_in_bytes; }
+        bool is_close_required() const { return _close_required; }
+        void set_close_required() { _close_required = true; }
        static constexpr size_t default_max_buffer_size_in_bytes() { return 8 * 1024; }

        mutation_fragment pop_mutation_fragment() {
@@ -504,9 +507,15 @@ public:
    //
    // Can be used to skip over entire partitions if interleaved with
    // `operator()()` calls.
-    future<> next_partition() { return _impl->next_partition(); }
+    future<> next_partition() {
+        _impl->set_close_required();
+        return _impl->next_partition();
+    }

-    future<> fill_buffer() { return _impl->fill_buffer(); }
+    future<> fill_buffer() {
+        _impl->set_close_required();
+        return _impl->fill_buffer();
+    }

    // Changes the range of partitions to pr. The range can only be moved
    // forwards. pr.begin() needs to be larger than pr.end() of the previousl
@@ -515,6 +524,7 @@ public:
    // pr needs to be valid until the reader is destroyed or fast_forward_to()
    // is called again.
    future<> fast_forward_to(const dht::partition_range& pr) {
+        _impl->set_close_required();
        return _impl->fast_forward_to(pr);
    }
    // Skips to a later range of rows.
@@ -544,6 +554,7 @@ public:
    // In particular one must first enter a partition by fetching a `partition_start`
    // fragment before calling `fast_forward_to`.
    future<> fast_forward_to(position_range cr) {
+        _impl->set_close_required();
        return _impl->fast_forward_to(std::move(cr));
    }
    // Closes the reader.
--- a/flat_mutation_reader_v2.hh
+++ b/flat_mutation_reader_v2.hh
@@ -164,6 +164,7 @@ public:
    private:
        tracked_buffer _buffer;
        size_t _buffer_size = 0;
+        bool _close_required = false;
    protected:
        size_t max_buffer_size_in_bytes = default_max_buffer_size_in_bytes();

@@ -205,6 +206,8 @@ public:
        bool is_end_of_stream() const { return _end_of_stream; }
        bool is_buffer_empty() const { return _buffer.empty(); }
        bool is_buffer_full() const { return _buffer_size >= max_buffer_size_in_bytes; }
+        bool is_close_required() const { return _close_required; }
+        void set_close_required() { _close_required = true; }
        static constexpr size_t default_max_buffer_size_in_bytes() { return 8 * 1024; }

        mutation_fragment_v2 pop_mutation_fragment() {
@@ -542,9 +545,15 @@ public:
    //
    // Can be used to skip over entire partitions if interleaved with
    // `operator()()` calls.
-    future<> next_partition() { return _impl->next_partition(); }
+    future<> next_partition() {
+        _impl->set_close_required();
+        return _impl->next_partition();
+    }

-    future<> fill_buffer() { return _impl->fill_buffer(); }
+    future<> fill_buffer() {
+        _impl->set_close_required();
+        return _impl->fill_buffer();
+    }

    // Changes the range of partitions to pr. The range can only be moved
    // forwards. pr.begin() needs to be larger than pr.end() of the previousl
@@ -553,6 +562,7 @@ public:
    // pr needs to be valid until the reader is destroyed or fast_forward_to()
    // is called again.
    future<> fast_forward_to(const dht::partition_range& pr) {
+        _impl->set_close_required();
        return _impl->fast_forward_to(pr);
    }
    // Skips to a later range of rows.
@@ -582,6 +592,7 @@ public:
    // In particular one must first enter a partition by fetching a `partition_start`
    // fragment before calling `fast_forward_to`.
    future<> fast_forward_to(position_range cr) {
+        _impl->set_close_required();
        return _impl->fast_forward_to(std::move(cr));
    }
    // Closes the reader.
--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -1012,10 +1012,10 @@ std::set<inet_address> gossiper::get_live_members() {

 std::set<inet_address> gossiper::get_live_token_owners() {
    std::set<inet_address> token_owners;
-    for (auto& member : get_live_members()) {
-        auto es = get_endpoint_state_for_endpoint_ptr(member);
-        if (es && !is_dead_state(*es) && get_token_metadata_ptr()->is_member(member)) {
-            token_owners.insert(member);
+    auto normal_token_owners = get_token_metadata_ptr()->get_all_endpoints();
+    for (auto& node: normal_token_owners) {
+        if (is_alive(node)) {
+            token_owners.insert(node);
        }
    }
    return token_owners;
@@ -1023,10 +1023,10 @@ std::set<inet_address> gossiper::get_live_token_owners() {

 std::set<inet_address> gossiper::get_unreachable_token_owners() {
    std::set<inet_address> token_owners;
-    for (auto&& x : _unreachable_endpoints) {
-        auto& endpoint = x.first;
-        if (get_token_metadata_ptr()->is_member(endpoint)) {
-            token_owners.insert(endpoint);
+    auto normal_token_owners = get_token_metadata_ptr()->get_all_endpoints();
+    for (auto& node: normal_token_owners) {
+        if (!is_alive(node)) {
+            token_owners.insert(node);
        }
    }
    return token_owners;
--- a/install.sh
+++ b/install.sh
@@ -143,7 +143,7 @@ export LD_LIBRARY_PATH="$prefix/libreloc"
 export UBSAN_OPTIONS="${UBSAN_OPTIONS:+$UBSAN_OPTIONS:}suppressions=$prefix/libexec/ubsan-suppressions.supp"
 exec -a "\$0" "$prefix/libexec/$bin" "\$@"
 EOF
-    chmod +x "$root/$prefix/bin/$bin"
+    chmod 755 "$root/$prefix/bin/$bin"
 }

 relocate_python3() {
@@ -156,11 +156,11 @@ relocate_python3() {
    local pythonpath="$(dirname "$pythoncmd")"

    if [ ! -x "$script" ]; then
-        cp "$script" "$install"
+        install -m755 "$script" "$install"
        return
    fi
-    mkdir -p "$relocateddir"
-    cp "$script" "$relocateddir"
+    install -d -m755 "$relocateddir"
+    install -m755 "$script" "$relocateddir"
    cat > "$install"<<EOF
 #!/usr/bin/env bash
 [[ -z "\$LD_PRELOAD" ]] || { echo "\$0: not compatible with LD_PRELOAD" >&2; exit 110; }
@@ -178,7 +178,7 @@ if [ -f "\${DEBIAN_SSL_CERT_FILE}" ]; then
 fi
 PYTHONPATH="\${d}:\${d}/libexec:\$PYTHONPATH" PATH="\${d}/../bin:\${d}/$pythonpath:\${PATH}" SSL_CERT_FILE="\${c}" exec -a "\$0" "\${d}/libexec/\${b}" "\$@"
 EOF
-    chmod +x "$install"
+    chmod 755 "$install"
 }

 install() {
@@ -392,6 +392,7 @@ install -d -m755 -d "$rprefix"/scyllatop
 cp -r tools/scyllatop/* "$rprefix"/scyllatop
 install -d -m755 -d "$rprefix"/scripts
 cp -r dist/common/scripts/* "$rprefix"/scripts
+chmod 755 "$rprefix"/scripts/*
 ln -srf "$rprefix/scyllatop/scyllatop.py" "$rprefix/bin/scyllatop"
 if $supervisor; then
    install -d -m755 "$rprefix"/supervisor
@@ -508,8 +509,13 @@ relocate_python3 "$rprefix"/scripts fix_system_distributed_tables.py
 if $supervisor; then
    install -d -m755 `supervisor_dir $retc`
    for service in scylla-server scylla-jmx scylla-node-exporter; do
+        if [ "$service" = "scylla-server" ]; then
+            program="scylla"
+        else
+            program=$service
+        fi
        cat << EOS > `supervisor_conf $retc $service`
-[program:$service]
+[program:$program]
 directory=$rprefix
 command=/bin/bash -c './supervisor/$service.sh'
 EOS
--- a/locator/azure_snitch.cc
+++ b/locator/azure_snitch.cc
@@ -15,6 +15,7 @@
 #include <seastar/core/coroutine.hh>
 #include <seastar/core/seastar.hh>
 #include <seastar/http/response_parser.hh>
+#include <seastar/http/reply.hh>
 #include <seastar/net/api.hh>
 #include <seastar/net/dns.hh>

@@ -34,6 +35,10 @@ azure_snitch::azure_snitch(const sstring& fname, unsigned io_cpuid) : production
 }

 future<> azure_snitch::load_config() {
+    if (this_shard_id() != io_cpu_id()) {
+        co_return;
+    }
+
    sstring region = co_await azure_api_call(REGION_NAME_QUERY_PATH);
    sstring azure_zone = co_await azure_api_call(ZONE_NAME_QUERY_PATH);

@@ -43,7 +48,8 @@ future<> azure_snitch::load_config() {

    logger().info("AzureSnitch using region: {}, zone: {}.", azure_region, azure_zone);

-    _my_rack = azure_zone;
+    // Zoneless regions return empty zone
+    _my_rack = (azure_zone != "" ? azure_zone : azure_region);
    _my_dc = azure_region;

    co_return co_await _my_distributed->invoke_on_all([this] (snitch_ptr& local_s) {
@@ -86,6 +92,10 @@ future<sstring> azure_snitch::azure_api_call(sstring path) {

        // Read HTTP response header first
        auto rsp = parser.get_parsed_response();
+        if (rsp->_status_code != static_cast<int>(httpd::reply::status_type::ok)) {
+            throw std::runtime_error(format("Error: HTTP response status {}", rsp->_status_code));
+        }
+
        auto it = rsp->_headers.find("Content-Length");
        if (it == rsp->_headers.end()) {
            throw std::runtime_error("Error: HTTP response does not contain: Content-Length\n");
--- a/locator/ec2_snitch.cc
+++ b/locator/ec2_snitch.cc
@@ -1,5 +1,8 @@
 #include "locator/ec2_snitch.hh"
 #include <seastar/core/seastar.hh>
+#include <seastar/core/sleep.hh>
+#include <seastar/core/do_with.hh>
+#include <seastar/http/reply.hh>

 #include <boost/algorithm/string/classification.hpp>
 #include <boost/algorithm/string/split.hpp>
@@ -67,6 +70,30 @@ future<> ec2_snitch::start() {
 }

 future<sstring> ec2_snitch::aws_api_call(sstring addr, uint16_t port, sstring cmd) {
+    return do_with(int(0), [this, addr, port, cmd] (int& i) {
+        return repeat_until_value([this, addr, port, cmd, &i]() -> future<std::optional<sstring>> {
+            ++i;
+            return aws_api_call_once(addr, port, cmd).then([] (auto res) {
+                return make_ready_future<std::optional<sstring>>(std::move(res));
+            }).handle_exception([&i] (auto ep) {
+                try {
+                    std::rethrow_exception(ep);
+                } catch (const std::system_error &e) {
+                    logger().error(e.what());
+                    if (i >= AWS_API_CALL_RETRIES - 1) {
+                        logger().error("Maximum number of retries exceeded");
+                        throw e;
+                    }
+                }
+                return sleep(AWS_API_CALL_RETRY_INTERVAL).then([] {
+                    return make_ready_future<std::optional<sstring>>(std::nullopt);
+                });
+            });
+        });
+    });
+}
+
+future<sstring> ec2_snitch::aws_api_call_once(sstring addr, uint16_t port, sstring cmd) {
    return connect(socket_address(inet_address{addr}, port))
    .then([this, addr, cmd] (connected_socket fd) {
        _sd = std::move(fd);
@@ -88,6 +115,9 @@ future<sstring> ec2_snitch::aws_api_call(sstring addr, uint16_t port, sstring cm

            // Read HTTP response header first
            auto _rsp = _parser.get_parsed_response();
+            if (_rsp->_status_code != static_cast<int>(httpd::reply::status_type::ok)) {
+                return make_exception_future<sstring>(std::runtime_error(format("Error: HTTP response status {}", _rsp->_status_code)));
+            }
            auto it = _rsp->_headers.find("Content-Length");
            if (it == _rsp->_headers.end()) {
                return make_exception_future<sstring>("Error: HTTP response does not contain: Content-Length\n");
--- a/locator/ec2_snitch.hh
+++ b/locator/ec2_snitch.hh
@@ -16,6 +16,8 @@ public:
    static constexpr const char* ZONE_NAME_QUERY_REQ = "/latest/meta-data/placement/availability-zone";
    static constexpr const char* AWS_QUERY_SERVER_ADDR = "169.254.169.254";
    static constexpr uint16_t AWS_QUERY_SERVER_PORT = 80;
+    static constexpr int AWS_API_CALL_RETRIES = 5;
+    static constexpr auto AWS_API_CALL_RETRY_INTERVAL = std::chrono::seconds{5};

    ec2_snitch(const sstring& fname = "", unsigned io_cpu_id = 0);
    virtual future<> start() override;
@@ -32,5 +34,6 @@ private:
    output_stream<char> _out;
    http_response_parser _parser;
    sstring _zone_req;
+    future<sstring> aws_api_call_once(sstring addr, uint16_t port, const sstring cmd);
 };
 } // namespace locator
--- a/locator/gce_snitch.cc
+++ b/locator/gce_snitch.cc
@@ -14,6 +14,7 @@
 #include <seastar/net/dns.hh>
 #include <seastar/core/seastar.hh>
 #include "locator/gce_snitch.hh"
+#include <seastar/http/reply.hh>

 #include <boost/algorithm/string/split.hpp>
 #include <boost/algorithm/string/classification.hpp>
@@ -106,6 +107,10 @@ future<sstring> gce_snitch::gce_api_call(sstring addr, sstring cmd) {

        // Read HTTP response header first
        auto rsp = parser.get_parsed_response();
+        if (rsp->_status_code != static_cast<int>(httpd::reply::status_type::ok)) {
+            throw std::runtime_error(format("Error: HTTP response status {}", rsp->_status_code));
+        }
+
        auto it = rsp->_headers.find("Content-Length");
        if (it == rsp->_headers.end()) {
            throw std::runtime_error("Error: HTTP response does not contain: Content-Length\n");
--- a/main.cc
+++ b/main.cc
@@ -367,11 +367,40 @@ static auto defer_verbose_shutdown(const char* what, Func&& func) {
        startlog.info("Shutting down {}", what);
        try {
            func();
+            startlog.info("Shutting down {} was successful", what);
        } catch (...) {
-            startlog.error("Unexpected error shutting down {}: {}", what, std::current_exception());
-            throw;
+            auto ex = std::current_exception();
+            bool do_abort = true;
+            try {
+                std::rethrow_exception(ex);
+            } catch (const std::system_error& e) {
+                // System error codes we consider "environmental",
+                // i.e. not scylla's fault, therefore there is no point in
+                // aborting and dumping core.
+                for (int i : {EIO, EACCES, ENOSPC}) {
+                    if (e.code() == std::error_code(i, std::system_category())) {
+                        do_abort = false;
+                        break;
+                    }
+                }
+            } catch (const storage_io_error& e) {
+                do_abort = false;
+            } catch (...) {
+            }
+            auto msg = fmt::format("Unexpected error shutting down {}: {}", what, ex);
+            if (do_abort) {
+                startlog.error("{}: aborting", msg);
+                abort();
+            } else {
+                startlog.error("{}: exiting, at {}", msg, current_backtrace());
+
+                // Call _exit() rather than exit() to exit immediately
+                // without calling exit handlers, avoiding
+                // boost::intrusive::detail::destructor_impl assert failure
+                // from ~segment_pool exit handler.
+                _exit(255);
+            }
        }
-        startlog.info("Shutting down {} was successful", what);
    };

    auto ret = deferred_action(std::move(vfunc));
@@ -398,6 +427,39 @@ static int scylla_main(int ac, char** av) {
        exit(1);
    }

+    // Even on the environment which causes error during initalize Scylla,
+    // "scylla --version" should be able to run without error.
+    // To do so, we need to parse and execute these options before
+    // initializing Scylla/Seastar classes.
+    bpo::options_description preinit_description("Scylla options");
+    bpo::variables_map preinit_vm;
+    preinit_description.add_options()
+        ("version", bpo::bool_switch(), "print version number and exit")
+        ("build-id", bpo::bool_switch(), "print build-id and exit")
+        ("build-mode", bpo::bool_switch(), "print build mode and exit")
+        ("list-tools", bpo::bool_switch(), "list included tools and exit");
+    auto preinit_parsed_opts = bpo::command_line_parser(ac, av).options(preinit_description).allow_unregistered().run();
+    bpo::store(preinit_parsed_opts, preinit_vm);
+    if (preinit_vm["version"].as<bool>()) {
+        fmt::print("{}\n", scylla_version());
+        return 0;
+    }
+    if (preinit_vm["build-id"].as<bool>()) {
+        fmt::print("{}\n", get_build_id());
+        return 0;
+    }
+    if (preinit_vm["build-mode"].as<bool>()) {
+        fmt::print("{}\n", scylla_build_mode());
+        return 0;
+    }
+    if (preinit_vm["list-tools"].as<bool>()) {
+        fmt::print(
+                "types - a command-line tool to examine values belonging to scylla types\n"
+                "sstable - a multifunctional command-line tool to examine the content of sstables\n"
+        );
+        return 0;
+    }
+
  try {
    runtime::init_uptime();
    std::setvbuf(stdout, nullptr, _IOLBF, 1000);
@@ -452,26 +514,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
    bpo::variables_map vm;
    auto parsed_opts = bpo::command_line_parser(ac, av).options(app.get_options_description()).allow_unregistered().run();
    bpo::store(parsed_opts, vm);
-    if (vm["version"].as<bool>()) {
-        fmt::print("{}\n", scylla_version());
-        return 0;
-    }
-    if (vm["build-id"].as<bool>()) {
-        fmt::print("{}\n", get_build_id());
-        return 0;
-    }
-    if (vm["build-mode"].as<bool>()) {
-        fmt::print("{}\n", scylla_build_mode());
-        return 0;
-    }
-    if (vm["list-tools"].as<bool>()) {
-        fmt::print(
-                "types - a command-line tool to examine values belonging to scylla types\n"
-                "sstable - a multifunctional command-line tool to examine the content of sstables\n"
-        );
-        return 0;
-    }
-
    print_starting_message(ac, av, parsed_opts);

    sharded<locator::shared_token_metadata> token_metadata;
@@ -547,6 +589,12 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl

            cfg->broadcast_to_all_shards().get();

+            // We pass this piece of config through a global as a temporary hack.
+            // See the comment at the definition of sstables::global_cache_index_pages.
+            smp::invoke_on_all([&cfg] {
+                sstables::global_cache_index_pages = cfg->cache_index_pages.operator utils::updateable_value<bool>();
+            }).get();
+
            ::sighup_handler sighup_handler(opts, *cfg);
            auto stop_sighup_handler = defer_verbose_shutdown("sighup", [&] {
                sighup_handler.stop().get();
@@ -1089,7 +1137,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            // ATTN -- sharded repair reference already sits on storage_service and if
            // it calls repair.local() before this place it'll crash (now it doesn't do
            // both)
-            supervisor::notify("starting messaging service");
+            supervisor::notify("starting repair service");
            auto max_memory_repair = memory::stats().total_memory() * 0.1;
            repair.start(std::ref(gossiper), std::ref(messaging), std::ref(db), std::ref(proxy), std::ref(bm), std::ref(sys_dist_ks), std::ref(view_update_generator), std::ref(mm), max_memory_repair).get();
            auto stop_repair_service = defer_verbose_shutdown("repair service", [&repair] {
--- a/memtable-sstable.hh
+++ b/memtable-sstable.hh
@@ -15,6 +15,7 @@
 #include "sstables/shared_sstable.hh"
 #include <seastar/core/future.hh>
 #include <seastar/core/io_priority_class.hh>
+#include "reader_permit.hh"

 class memtable;
 class flat_mutation_reader;
--- a/message/messaging_service.cc
+++ b/message/messaging_service.cc
@@ -438,6 +438,8 @@ static constexpr unsigned do_get_rpc_client_idx(messaging_verb verb) {
    // should not be blocked by any data requests.
    case messaging_verb::GROUP0_PEER_EXCHANGE:
    case messaging_verb::GROUP0_MODIFY_CONFIG:
+        // ATTN -- if moving GOSSIP_ verbs elsewhere, mind updating the tcp_nodelay
+        // setting in get_rpc_client(), which assumes gossiper verbs live in idx 0
        return 0;
    case messaging_verb::PREPARE_MESSAGE:
    case messaging_verb::PREPARE_DONE_MESSAGE:
@@ -695,7 +697,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
    }();

    auto must_tcp_nodelay = [&] {
-        if (idx == 1) {
+        if (idx == 0) {
            return true; // gossip
        }
        if (_cfg.tcp_nodelay == tcp_nodelay_what::local) {
@@ -716,10 +718,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
    }
    opts.tcp_nodelay = must_tcp_nodelay;
    opts.reuseaddr = true;
-    // We send cookies only for non-default statement tenant clients.
-    if (idx > 3) {
-        opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;
-    }
+    opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;

    auto client = must_encrypt ?
                    ::make_shared<rpc_protocol_client_wrapper>(_rpc->protocol(), std::move(opts),
--- a/multishard_mutation_query.cc
+++ b/multishard_mutation_query.cc
@@ -272,8 +272,8 @@ public:

    future<> lookup_readers(db::timeout_clock::time_point timeout);

-    future<> save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, detached_compaction_state compaction_state,
-            std::optional<clustering_key_prefix> last_ckey);
+    future<> save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, std::optional<detached_compaction_state> compaction_state,
+            dht::decorated_key last_pkey, std::optional<clustering_key_prefix> last_ckey);

    future<> stop();
 };
@@ -580,19 +580,22 @@ future<> read_context::lookup_readers(db::timeout_clock::time_point timeout) {
    });
 }

-future<> read_context::save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, detached_compaction_state compaction_state,
-            std::optional<clustering_key_prefix> last_ckey) {
+future<> read_context::save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, std::optional<detached_compaction_state> compaction_state,
+            dht::decorated_key last_pkey, std::optional<clustering_key_prefix> last_ckey) {
    if (_cmd.query_uuid == utils::UUID{}) {
        return make_ready_future<>();
    }

-    auto last_pkey = compaction_state.partition_start.key();
-
    const auto cb_stats = dismantle_combined_buffer(std::move(unconsumed_buffer), last_pkey);
    tracing::trace(_trace_state, "Dismantled combined buffer: {}", cb_stats);

-    const auto cs_stats = dismantle_compaction_state(std::move(compaction_state));
-    tracing::trace(_trace_state, "Dismantled compaction state: {}", cs_stats);
+    auto cs_stats = dismantle_buffer_stats{};
+    if (compaction_state) {
+        cs_stats = dismantle_compaction_state(std::move(*compaction_state));
+        tracing::trace(_trace_state, "Dismantled compaction state: {}", cs_stats);
+    } else {
+        tracing::trace(_trace_state, "No compaction state to dismantle, partition exhausted", cs_stats);
+    }

    return do_with(std::move(last_pkey), std::move(last_ckey), [this] (const dht::decorated_key& last_pkey,
            const std::optional<clustering_key_prefix>& last_ckey) {
@@ -745,16 +748,18 @@ future<typename ResultBuilder::result_type> do_query(
        ResultBuilder&& result_builder) {
    auto ctx = seastar::make_shared<read_context>(db, s, cmd, ranges, trace_state, timeout);

-    co_await ctx->lookup_readers(timeout);
-
    std::exception_ptr ex;

    try {
+        co_await ctx->lookup_readers(timeout);
+
        auto [last_ckey, result, unconsumed_buffer, compaction_state] = co_await read_page<ResultBuilder>(ctx, s, cmd, ranges, trace_state,
                std::move(result_builder));

        if (compaction_state->are_limits_reached() || result.is_short_read()) {
-            co_await ctx->save_readers(std::move(unconsumed_buffer), std::move(*compaction_state).detach_state(), std::move(last_ckey));
+            // Must call before calling 'detached_state()`.
+            auto last_pkey = *compaction_state->current_partition();
+            co_await ctx->save_readers(std::move(unconsumed_buffer), std::move(*compaction_state).detach_state(), std::move(last_pkey), std::move(last_ckey));
        }

        co_await ctx->stop();
--- a/mutation_compactor.hh
+++ b/mutation_compactor.hh
@@ -167,6 +167,9 @@ class compact_mutation_state {
    std::unique_ptr<mutation_compactor_garbage_collector> _collector;

    compaction_stats _stats;
+
+    // Remember if we requested to stop mid-partition.
+    stop_iteration _stop = stop_iteration::no;
 private:
    template <typename Consumer, typename GCConsumer>
    requires CompactedFragmentsConsumer<Consumer> && CompactedFragmentsConsumer<GCConsumer>
@@ -304,6 +307,7 @@ public:
    }

    void consume_new_partition(const dht::decorated_key& dk) {
+        _stop = stop_iteration::no;
        auto& pk = dk.key();
        _dk = &dk;
        _return_static_content_on_partition_with_no_rows =
@@ -370,9 +374,9 @@ public:
        _static_row_live = is_live;
        if (is_live || (!only_live() && !sr.empty())) {
            partition_is_not_empty(consumer);
-            return consumer.consume(std::move(sr), current_tombstone, is_live);
+            _stop = consumer.consume(std::move(sr), current_tombstone, is_live);
        }
-        return stop_iteration::no;
+        return _stop;
    }

    template <typename Consumer, typename GCConsumer>
@@ -424,22 +428,21 @@ public:
        };

        if (only_live() && is_live) {
-            auto stop = consume_row();
+            _stop = consume_row();
            if (++_rows_in_current_partition == _current_partition_limit) {
-                return stop_iteration::yes;
+                _stop = stop_iteration::yes;
            }
-            return stop;
+            return _stop;
        } else if (!only_live()) {
-            auto stop = stop_iteration::no;
            if (!cr.empty()) {
-                stop = consume_row();
+                _stop = consume_row();
            }
            if (!sstable_compaction() && is_live && ++_rows_in_current_partition == _current_partition_limit) {
-                return stop_iteration::yes;
+                _stop = stop_iteration::yes;
            }
-            return stop;
+            return _stop;
        }
-        return stop_iteration::no;
+        return _stop;
    }

    template <typename Consumer, typename GCConsumer>
@@ -448,7 +451,8 @@ public:
        ++_stats.range_tombstones;
        _range_tombstones.apply(rt);
        // FIXME: drop tombstone if it is fully covered by other range tombstones
-        return do_consume(std::move(rt), consumer, gc_consumer);
+        _stop = do_consume(std::move(rt), consumer, gc_consumer);
+        return _stop;
    }

    template <typename Consumer, typename GCConsumer>
@@ -459,9 +463,9 @@ public:
            _rt_assembler.emplace();
        }
        if (auto rt_opt = _rt_assembler->consume(_schema, std::move(rtc))) {
-            return do_consume(std::move(*rt_opt), consumer, gc_consumer);
+            _stop = do_consume(std::move(*rt_opt), consumer, gc_consumer);
        }
-        return stop_iteration::no;
+        return _stop;
    }

    template <typename Consumer, typename GCConsumer>
@@ -536,6 +540,7 @@ public:
        _current_partition_limit = std::min(_row_limit, _partition_row_limit);
        _query_time = query_time;
        _stats = {};
+        _stop = stop_iteration::no;

        noop_compacted_fragments_consumer nc;

@@ -562,16 +567,31 @@ public:
    /// compactor will result in the new compactor being in the same state *this
    /// is (given the same outside parameters of course). Practically this
    /// allows the compaction state to be stored in the compacted reader.
-    detached_compaction_state detach_state() && {
+    /// If the currently compacted partition is exhausted a disengaged optional
+    /// is returned -- in this case there is no state to detach.
+    std::optional<detached_compaction_state> detach_state() && {
+        // If we exhausted the partition, there is no need to detach-restore the
+        // compaction state.
+        // We exhausted the partition if `consume_partition_end()` was called
+        // without us requesting the consumption to stop (remembered in _stop)
+        // from one of the consume() overloads.
+        // The consume algorithm calls `consume_partition_end()` in two cases:
+        // * on a partition-end fragment
+        // * consume() requested to stop
+        // In the latter case, the partition is not exhausted. Even if the next
+        // fragment to process is a partition-end, it will not be consumed.
+        if (!_stop) {
+            return {};
+        }
        partition_start ps(std::move(_last_dk), _range_tombstones.get_partition_tombstone());
        if (_rt_assembler) {
            if (_current_tombstone) {
-                return {std::move(ps), std::move(_last_static_row), range_tombstone_change(position_in_partition_view::after_key(_last_clustering_pos), _current_tombstone)};
+                return detached_compaction_state{std::move(ps), std::move(_last_static_row), range_tombstone_change(position_in_partition_view::after_key(_last_clustering_pos), _current_tombstone)};
            } else {
-                return {std::move(ps), std::move(_last_static_row), std::optional<range_tombstone_change>{}};
+                return detached_compaction_state{std::move(ps), std::move(_last_static_row), std::optional<range_tombstone_change>{}};
            }
        }
-        return {std::move(ps), std::move(_last_static_row), std::move(_range_tombstones).range_tombstones()};
+        return detached_compaction_state{std::move(ps), std::move(_last_static_row), std::move(_range_tombstones).range_tombstones()};
    }

    const compaction_stats& stats() const { return _stats; }
--- a/mutation_fragment_stream_validator.hh
+++ b/mutation_fragment_stream_validator.hh
@@ -28,6 +28,7 @@ class mutation_fragment_stream_validator {
    mutation_fragment_v2::kind _prev_kind;
    position_in_partition _prev_pos;
    dht::decorated_key _prev_partition_key;
+    tombstone _current_tombstone;
 public:
    explicit mutation_fragment_stream_validator(const schema& s);

@@ -122,6 +123,12 @@ public:
    const position_in_partition& previous_position() const {
        return _prev_pos;
    }
+    /// Get the current effective tombstone
+    ///
+    /// Not meaningful, when operator()(mutation_fragment_v2) is not used.
+    tombstone current_tombstone() const {
+        return _current_tombstone;
+    }
    /// The previous valid partition key.
    ///
    /// Only valid if `operator()(const dht::decorated_key&)` or
@@ -151,6 +158,7 @@ class mutation_fragment_stream_validating_filter {
    mutation_fragment_stream_validator _validator;
    sstring _name;
    mutation_fragment_stream_validation_level _validation_level;
+    tombstone _current_tombstone;

 public:
    /// Constructor.
--- a/mutation_partition.hh
+++ b/mutation_partition.hh
@@ -826,6 +826,7 @@ public:

    void apply(tombstone deleted_at) {
        _deleted_at.apply(deleted_at);
+        maybe_shadow();
    }

    void apply(shadowable_tombstone deleted_at) {
--- a/mutation_reader.cc
+++ b/mutation_reader.cc
@@ -1240,7 +1240,10 @@ future<flat_mutation_reader> evictable_reader::resume_or_create_reader() {
    if (auto reader_opt = try_resume()) {
        co_return std::move(*reader_opt);
    }
-    co_await _permit.maybe_wait_readmission();
+    // See evictable_reader_v2::resume_or_create_reader()
+    if (_permit.needs_readmission()) {
+        co_await _permit.wait_readmission();
+    }
    co_return recreate_reader();
 }

@@ -1581,11 +1584,7 @@ private:
    tracing::global_trace_state_ptr _trace_state;
    const mutation_reader::forwarding _fwd_mr;
    reader_concurrency_semaphore::inactive_read_handle _irh;
-    bool _drop_partition_start = false;
-    bool _drop_static_row = false;
-    // Validate the partition key of the first emitted partition, set after the
-    // reader was recreated.
-    bool _validate_partition_key = false;
+    bool _reader_recreated = false; // set if reader was recreated since last operation
    position_in_partition::tri_compare _tri_cmp;

    std::optional<dht::decorated_key> _last_pkey;
@@ -1606,10 +1605,9 @@ private:
    void adjust_partition_slice();
    flat_mutation_reader_v2 recreate_reader();
    future<flat_mutation_reader_v2> resume_or_create_reader();
-    void maybe_validate_partition_start(const flat_mutation_reader_v2::tracked_buffer& buffer);
+    void validate_partition_start(const partition_start& ps);
    void validate_position_in_partition(position_in_partition_view pos) const;
-    bool should_drop_fragment(const mutation_fragment_v2& mf);
-    future<> do_fill_buffer();
+    void examine_first_fragments(mutation_fragment_v2_opt& mf1, mutation_fragment_v2_opt& mf2, mutation_fragment_v2_opt& mf3);

 public:
    evictable_reader_v2(
@@ -1725,9 +1723,6 @@ flat_mutation_reader_v2 evictable_reader_v2::recreate_reader() {
    _range_override.reset();
    _slice_override.reset();

-    _drop_partition_start = false;
-    _drop_static_row = false;
-
    if (_last_pkey) {
        bool partition_range_is_inclusive = true;

@@ -1736,11 +1731,8 @@ flat_mutation_reader_v2 evictable_reader_v2::recreate_reader() {
            partition_range_is_inclusive = false;
            break;
        case partition_region::static_row:
-            _drop_partition_start = true;
            break;
        case partition_region::clustered:
-            _drop_partition_start = true;
-            _drop_static_row = true;
            adjust_partition_slice();
            slice = &*_slice_override;
            break;
@@ -1763,7 +1755,7 @@ flat_mutation_reader_v2 evictable_reader_v2::recreate_reader() {
        _range_override = dht::partition_range({dht::partition_range::bound(*_last_pkey, partition_range_is_inclusive)}, _pr->end());
        range = &*_range_override;

-        _validate_partition_key = true;
+        _reader_recreated = true;
    }

    return _ms.make_reader_v2(
@@ -1784,45 +1776,48 @@ future<flat_mutation_reader_v2> evictable_reader_v2::resume_or_create_reader() {
    if (auto reader_opt = try_resume()) {
        co_return std::move(*reader_opt);
    }
-    co_await _permit.maybe_wait_readmission();
+    // When the reader is created the first time and we are actually resuming a
+    // saved reader in `recreate_reader()`, we have two cases here:
+    // * the reader is still alive (in inactive state)
+    // * the reader was evicted
+    // We check for this below with `needs_readmission()` and it is very
+    // important to not allow for preemption between said check and
+    // `recreate_reader()`, otherwise the reader might be evicted between the
+    // check and `recreate_reader()` and the latter will recreate it without
+    // waiting for re-admission.
+    if (_permit.needs_readmission()) {
+        co_await _permit.wait_readmission();
+    }
    co_return recreate_reader();
 }

-void evictable_reader_v2::maybe_validate_partition_start(const flat_mutation_reader_v2::tracked_buffer& buffer) {
-    if (!_validate_partition_key || buffer.empty()) {
-        return;
-    }
-
-    // If this is set we can assume the first fragment is a partition-start.
-    const auto& ps = buffer.front().as_partition_start();
+void evictable_reader_v2::validate_partition_start(const partition_start& ps) {
    const auto tri_cmp = dht::ring_position_comparator(*_schema);
    // If we recreated the reader after fast-forwarding it we won't have
    // _last_pkey set. In this case it is enough to check if the partition
    // is in range.
    if (_last_pkey) {
        const auto cmp_res = tri_cmp(*_last_pkey, ps.key());
-        if (_drop_partition_start) { // we expect to continue from the same partition
+        if (_next_position_in_partition.region() != partition_region::partition_start) { // we expect to continue from the same partition
            // We cannot assume the partition we stopped the read at is still alive
            // when we recreate the reader. It might have been compacted away in the
            // meanwhile, so allow for a larger partition too.
            require(
                    cmp_res <= 0,
-                    "{}(): validation failed, expected partition with key larger or equal to _last_pkey {} due to _drop_partition_start being set, but got {}",
+                    "{}(): validation failed, expected partition with key larger or equal to _last_pkey {}, but got {}",
                    __FUNCTION__,
                    *_last_pkey,
                    ps.key());
-            // Reset drop flags and next pos if we are not continuing from the same partition
+            // Reset next pos if we are not continuing from the same partition
            if (cmp_res < 0) {
                // Close previous partition, we are not going to continue it.
                push_mutation_fragment(*_schema, _permit, partition_end{});
-                _drop_partition_start = false;
-                _drop_static_row = false;
                _next_position_in_partition = position_in_partition::for_partition_start();
            }
        } else { // should be a larger partition
            require(
                    cmp_res < 0,
-                    "{}(): validation failed, expected partition with key larger than _last_pkey {} due to _drop_partition_start being unset, but got {}",
+                    "{}(): validation failed, expected partition with key larger than _last_pkey {}, but got {}",
                    __FUNCTION__,
                    *_last_pkey,
                    ps.key());
@@ -1836,8 +1831,6 @@ void evictable_reader_v2::maybe_validate_partition_start(const flat_mutation_rea
            __FUNCTION__,
            prange,
            ps.key());
-
-    _validate_partition_key = false;
 }

 void evictable_reader_v2::validate_position_in_partition(position_in_partition_view pos) const {
@@ -1860,7 +1853,12 @@ void evictable_reader_v2::validate_position_in_partition(position_in_partition_v
        const bool any_contains = std::any_of(ranges.begin(), ranges.end(), [this, &pos] (const query::clustering_range& cr) {
            // TODO: somehow avoid this copy
            auto range = position_range(cr);
-            return range.contains(*_schema, pos);
+            // We cannot use range.contains() because that treats range as a
+            // [a, b) range, meaning a range tombstone change with position
+            // after_key(b) will be considered outside of it. Such range
+            // tombstone changes can be emitted however when recreating the
+            // reader on clustering range edge.
+            return _tri_cmp(range.start(), pos) <= 0 && _tri_cmp(pos, range.end()) <= 0;
        });
        require(
                any_contains,
@@ -1871,42 +1869,40 @@ void evictable_reader_v2::validate_position_in_partition(position_in_partition_v
    }
 }

-bool evictable_reader_v2::should_drop_fragment(const mutation_fragment_v2& mf) {
-    if (_drop_partition_start && mf.is_partition_start()) {
-        _drop_partition_start = false;
-        return true;
+void evictable_reader_v2::examine_first_fragments(mutation_fragment_v2_opt& mf1, mutation_fragment_v2_opt& mf2, mutation_fragment_v2_opt& mf3) {
+    if (!mf1) {
+        return; // the reader is at EOS
    }
-    // Unlike partition-start above, a partition is not guaranteed to have a
-    // static row fragment. So reset the flag regardless of whether we could
-    // drop one or not.
-    // We are guaranteed to get here only right after dropping a partition-start,
-    // so if we are not seeing a static row here, the partition doesn't have one.
-    if (_drop_static_row) {
-         _drop_static_row = false;
-        return mf.is_static_row();
-    }
-    return false;
-}

-future<> evictable_reader_v2::do_fill_buffer() {
-    if (!_drop_partition_start && !_drop_static_row) {
-        auto fill_buf_fut = _reader->fill_buffer();
-        if (_validate_partition_key) {
-            fill_buf_fut = fill_buf_fut.then([this] {
-                maybe_validate_partition_start(_reader->buffer());
-            });
-        }
-        return fill_buf_fut;
+    // If engaged, the first fragment is always a partition-start.
+    validate_partition_start(mf1->as_partition_start());
+    if (_tri_cmp(mf1->position(), _next_position_in_partition) < 0) {
+        mf1 = {}; // drop mf1
+    }
+
+    const auto continue_same_partition = _next_position_in_partition.region() != partition_region::partition_start;
+
+    // If we have a first fragment, we are guaranteed to have a second one -- if not else, a partition-end.
+    if (mf2->is_end_of_partition()) {
+        return; // no further fragments, nothing to do
+    }
+
+    // We want to validate the position of the first non-dropped fragment.
+    // If mf2 is a static row and we need to drop it, this will be mf3.
+    if (mf2->is_static_row() && _tri_cmp(mf2->position(), _next_position_in_partition) < 0) {
+        mf2 = {}; // drop mf2
+    } else {
+        if (continue_same_partition) {
+            validate_position_in_partition(mf2->position());
+        }
+        return;
+    }
+
+    if (mf3->is_end_of_partition()) {
+        return; // no further fragments, nothing to do
+    } else if (continue_same_partition) {
+        validate_position_in_partition(mf3->position());
    }
-    return repeat([this] {
-        return _reader->fill_buffer().then([this] {
-            maybe_validate_partition_start(_reader->buffer());
-            while (!_reader->is_buffer_empty() && should_drop_fragment(_reader->peek_buffer())) {
-                _reader->pop_mutation_fragment();
-            }
-            return stop_iteration(_reader->is_buffer_full() || _reader->is_end_of_stream());
-        });
-    });
 }

 evictable_reader_v2::evictable_reader_v2(
@@ -1935,10 +1931,62 @@ future<> evictable_reader_v2::fill_buffer() {
        co_return;
    }
    _reader = co_await resume_or_create_reader();
-    co_await do_fill_buffer();
+
+    if (_reader_recreated) {
+        // Recreating the reader breaks snapshot isolation and creates all sorts
+        // of complications around the continuity of range tombstone changes,
+        // e.g. a range tombstone started by the previous reader object
+        // might not exist anymore with the new reader object.
+        // To avoid complications we reset the tombstone state on each reader
+        // recreation by emitting a null tombstone change, if we read at least
+        // one clustering fragment from the partition.
+        if (_next_position_in_partition.region() == partition_region::clustered
+                && _tri_cmp(_next_position_in_partition, position_in_partition::before_all_clustered_rows()) > 0) {
+            push_mutation_fragment(*_schema, _permit, range_tombstone_change{position_in_partition_view::before_key(_next_position_in_partition), {}});
+        }
+        auto mf1 = co_await (*_reader)();
+        auto mf2 = co_await (*_reader)();
+        auto mf3 = co_await (*_reader)();
+        examine_first_fragments(mf1, mf2, mf3);
+        if (mf3) {
+            _reader->unpop_mutation_fragment(std::move(*mf3));
+        }
+        if (mf2) {
+            _reader->unpop_mutation_fragment(std::move(*mf2));
+        }
+        if (mf1) {
+            _reader->unpop_mutation_fragment(std::move(*mf1));
+        }
+        _reader_recreated = false;
+    } else {
+        co_await _reader->fill_buffer();
+    }
+
    _reader->move_buffer_content_to(*this);
+
+    // Ensure that each buffer represents forward progress. Only a concern when
+    // the last fragment in the buffer is range tombstone change. In this case
+    // ensure that:
+    // * buffer().back().position() > _next_position_in_partition;
+    // * _reader.peek()->position() > buffer().back().position();
+    if (!is_buffer_empty() && buffer().back().is_range_tombstone_change()) {
+        auto* next_mf = co_await _reader->peek();
+
+        // First make sure we've made progress w.r.t. _next_position_in_partition.
+        while (next_mf && _tri_cmp(_next_position_in_partition, buffer().back().position()) <= 0) {
+            push_mutation_fragment(_reader->pop_mutation_fragment());
+            next_mf = co_await _reader->peek();
+        }
+
+        const auto last_pos = position_in_partition(buffer().back().position());
+        while (next_mf && _tri_cmp(last_pos, next_mf->position()) == 0) {
+            push_mutation_fragment(_reader->pop_mutation_fragment());
+            next_mf = co_await _reader->peek();
+        }
+    }
+
    update_next_position();
-    _end_of_stream = _reader->is_end_of_stream() && _reader->is_buffer_empty();
+    _end_of_stream = _reader->is_end_of_stream();
    maybe_pause(std::move(*_reader));
 }

--- a/partition_snapshot_reader.hh
+++ b/partition_snapshot_reader.hh
@@ -292,14 +292,23 @@ class partition_snapshot_flat_reader : public flat_mutation_reader::impl, public
                const std::optional<position_in_partition>& last_row,
                const std::optional<position_in_partition>& last_rts,
                position_in_partition_view pos) {
-            if (!_rt_stream.empty()) {
-                return _rt_stream.get_next(std::move(pos));
-            }
            return in_alloc_section([&] () -> mutation_fragment_opt {
                maybe_refresh_state(ck_range_snapshot, last_row, last_rts);

                position_in_partition::less_compare rt_less(_query_schema);

+                // The while below moves range tombstones from partition versions
+                // into _rt_stream, just enough to produce the next range tombstone
+                // The main goal behind moving to _rt_stream is to deoverlap range tombstones
+                // which have the same starting position. This is not in order to satisfy
+                // flat_mutation_reader stream requirements, the reader can emit range tombstones
+                // which have the same position incrementally. This is to guarantee forward
+                // progress in the case iterators get invalidated and maybe_refresh_state()
+                // above needs to restore them. It does so using last_rts, which tracks
+                // the position of the last emitted range tombstone. All range tombstones
+                // with positions <= than last_rts are skipped on refresh. To make progress,
+                // we need to make sure that all range tombstones with duplicated positions
+                // are emitted before maybe_refresh_state().
                while (has_more_range_tombstones()
                        && !rt_less(pos, peek_range_tombstone().position())
                        && (_rt_stream.empty() || !rt_less(_rt_stream.peek_next().position(), peek_range_tombstone().position()))) {
--- a/partition_snapshot_row_cursor.hh
+++ b/partition_snapshot_row_cursor.hh
@@ -444,7 +444,7 @@ public:
    // When throws, the cursor is invalidated and its position is not changed.
    bool advance_to(position_in_partition_view lower_bound) {
        maybe_advance_to(lower_bound);
-        return no_clustering_row_between(_schema, lower_bound, position());
+        return no_clustering_row_between_weak(_schema, lower_bound, position());
    }

    // Call only when valid.
--- a/position_in_partition.hh
+++ b/position_in_partition.hh
@@ -567,6 +567,20 @@ bool no_clustering_row_between(const schema& s, position_in_partition_view a, po
    }
 }

+// Returns true if and only if there can't be any clustering_row with position >= a and < b.
+// It is assumed that a <= b.
+inline
+bool no_clustering_row_between_weak(const schema& s, position_in_partition_view a, position_in_partition_view b) {
+    clustering_key_prefix::equality eq(s);
+    if (a.has_key() && b.has_key()) {
+        return eq(a.key(), b.key())
+               && (a.get_bound_weight() == bound_weight::after_all_prefixed
+                   || b.get_bound_weight() != bound_weight::after_all_prefixed);
+    } else {
+        return !a.has_key() && !b.has_key();
+    }
+}
+
 // Includes all position_in_partition objects "p" for which: start <= p < end
 // And only those.
 class position_range {
--- a/protocol_server.hh
+++ b/protocol_server.hh
@@ -10,6 +10,7 @@

 #include "seastarx.hh"
 #include <seastar/core/future.hh>
+#include <seastar/net/socket_defs.hh>
 #include <vector>

 // Abstraction for a server serving some kind of user-facing protocol.
--- a/querier.cc
+++ b/querier.cc
@@ -414,25 +414,6 @@ future<bool> querier_cache::evict_one() noexcept {
    co_return false;
 }

-future<> querier_cache::evict_all_for_table(const utils::UUID& schema_id) noexcept {
-    for (auto ip : {&_data_querier_index, &_mutation_querier_index, &_shard_mutation_querier_index}) {
-        auto& idx = *ip;
-        for (auto it = idx.begin(); it != idx.end();) {
-            if (it->second->schema().id() == schema_id) {
-                auto reader_opt = it->second->permit().semaphore().unregister_inactive_read(querier_utils::get_inactive_read_handle(*it->second));
-                it = idx.erase(it);
-                --_stats.population;
-                if (reader_opt) {
-                    co_await reader_opt->close();
-                }
-            } else {
-                ++it;
-            }
-        }
-    }
-    co_return;
-}
-
 future<> querier_cache::stop() noexcept {
    co_await _closing_gate.close();

--- a/querier.hh
+++ b/querier.hh
@@ -476,11 +476,6 @@ public:
    /// is empty).
    future<bool> evict_one() noexcept;

-    /// Evict all queriers that belong to a table.
-    ///
-    /// Should be used when dropping a table.
-    future<> evict_all_for_table(const utils::UUID& schema_id) noexcept;
-
    /// Close all queriers and wait on background work.
    ///
    /// Should be used before destroying the querier_cache.
--- a/query.cc
+++ b/query.cc
@@ -92,14 +92,13 @@ void trim_clustering_row_ranges_to(const schema& s, clustering_row_ranges& range
 }

 void trim_clustering_row_ranges_to(const schema& s, clustering_row_ranges& ranges, const clustering_key& key, bool reversed) {
-    if (key.is_full(s)) {
+    if (key.is_full(s) || reversed) {
        return trim_clustering_row_ranges_to(s, ranges,
                reversed ? position_in_partition_view::before_key(key) : position_in_partition_view::after_key(key), reversed);
    }
    auto full_key = key;
    clustering_key::make_full(s, full_key);
-    return trim_clustering_row_ranges_to(s, ranges,
-            reversed ? position_in_partition_view::after_key(full_key) : position_in_partition_view::before_key(full_key), reversed);
+    return trim_clustering_row_ranges_to(s, ranges, position_in_partition_view::before_key(full_key), reversed);
 }


--- a/range_tombstone_change_generator.hh
+++ b/range_tombstone_change_generator.hh
@@ -68,22 +68,33 @@ public:
    // for accumulated range tombstones.
    // After this, only range_tombstones with positions >= upper_bound may be added,
    // which guarantees that they won't affect the output of this flush.
+    //
+    // If upper_bound == position_in_partition::after_all_clustered_rows(),
+    // emits all remaining range_tombstone_changes.
+    // No range_tombstones may be added after this.
+    //
    // FIXME: respect preemption
    template<RangeTombstoneChangeConsumer C>
-    void flush(position_in_partition_view upper_bound, C consumer) {
-        position_in_partition::less_compare less(_schema);
-        std::optional<range_tombstone> prev;
+    void flush(const position_in_partition_view upper_bound, C consumer) {
+        if (_range_tombstones.empty()) {
+            _lower_bound = upper_bound;
+            return;
+        }

-        while (!_range_tombstones.empty() && less(_range_tombstones.begin()->end_position(), upper_bound)) {
+        position_in_partition::tri_compare cmp(_schema);
+        std::optional<range_tombstone> prev;
+        bool flush_all = cmp(upper_bound, position_in_partition::after_all_clustered_rows()) == 0;
+
+        while (!_range_tombstones.empty() && (flush_all || (cmp(_range_tombstones.begin()->end_position(), upper_bound) < 0))) {
            auto rt = _range_tombstones.pop(_range_tombstones.begin());

-            if (prev && less(prev->end_position(), rt.position())) { // [1]
+            if (prev && (cmp(prev->end_position(), rt.position()) < 0)) { // [1]
                // previous range tombstone not adjacent, emit gap.
                consumer(range_tombstone_change(prev->end_position(), tombstone()));
            }

            // Check if start of rt was already emitted, emit if not.
-            if (!less(rt.position(), _lower_bound)) {
+            if (cmp(rt.position(), _lower_bound) >= 0) {
                consumer(range_tombstone_change(rt.position(), rt.tomb));
            }

@@ -95,15 +106,15 @@ public:
        // It cannot get adjacent later because prev->end_position() < upper_bound,
        // so nothing == prev->end_position() can be added after this invocation.
        if (prev && (_range_tombstones.empty()
-                     || less(prev->end_position(), _range_tombstones.begin()->position()))) {
+                     || (cmp(prev->end_position(), _range_tombstones.begin()->position()) < 0))) {
            consumer(range_tombstone_change(prev->end_position(), tombstone())); // [2]
        }

        // Emit the fragment for start bound of a range_tombstone which is overlapping with upper_bound,
        // unless no such fragment or already emitted.
        if (!_range_tombstones.empty()
-            && less(_range_tombstones.begin()->position(), upper_bound)
-            && (!less(_range_tombstones.begin()->position(), _lower_bound))) {
+            && (cmp(_range_tombstones.begin()->position(), upper_bound) < 0)
+            && (cmp(_range_tombstones.begin()->position(), _lower_bound) >= 0)) {
            consumer(range_tombstone_change(
                    _range_tombstones.begin()->position(), _range_tombstones.begin()->tombstone().tomb));
        }
--- a/range_tombstone_list.cc
+++ b/range_tombstone_list.cc
@@ -9,6 +9,7 @@
 #include <boost/range/adaptor/reversed.hpp>
 #include "range_tombstone_list.hh"
 #include "utils/allocation_strategy.hh"
+#include "utils/amortized_reserve.hh"
 #include <seastar/util/variant_utils.hh>

 range_tombstone_list::range_tombstone_list(const range_tombstone_list& x)
@@ -96,7 +97,7 @@ void range_tombstone_list::insert_from(const schema& s,
        if (cmp(end, it->position()) < 0) {
            // not overlapping
            if (it->tombstone().tomb == tomb && cmp(end, it->position()) == 0) {
-                rev.update(it, {std::move(start), std::move(start), tomb});
+                rev.update(it, {std::move(start), std::move(end), tomb});
            } else {
                auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), tomb);
                rev.insert(it, *rt);
@@ -375,13 +376,13 @@ range_tombstone_list::reverter::insert(range_tombstones_type::iterator it, range

 range_tombstone_list::range_tombstones_type::iterator
 range_tombstone_list::reverter::erase(range_tombstones_type::iterator it) {
-    _ops.reserve(_ops.size() + 1);
+    amortized_reserve(_ops, _ops.size() + 1);
    _ops.emplace_back(erase_undo_op(*it));
    return _dst._tombstones.erase(it);
 }

 void range_tombstone_list::reverter::update(range_tombstones_type::iterator it, range_tombstone&& new_rt) {
-    _ops.reserve(_ops.size() + 1);
+    amortized_reserve(_ops, _ops.size() + 1);
    swap(it->tombstone(), new_rt);
    _ops.emplace_back(update_undo_op(std::move(new_rt), *it));
 }
--- a/Show More
+++ b/Show More