Compare commits

...

144 Commits

Author SHA1 Message Date
Anna Mikhlin
0354e13718 release: prepare for 5.0.7 2022-12-07 14:57:09 +02:00
Nadav Har'El
2750d2e94b Merge 'alternator: fix wrong 'where' condition for GSI range key' from Marcin Maliszkiewicz
Contains the fixes requested in the issue (and some tiny extras), together with an analysis of why they don't affect users (see the commit messages).

Fixes [#11800](https://github.com/scylladb/scylladb/issues/11800)

Closes #11926

* github.com:scylladb/scylladb:
  alternator: add maybe_quote to secondary indexes 'where' condition
  test/alternator: correct xfail reason for test_gsi_backfill_empty_string
  test/alternator: correct indentation in test_lsi_describe
  alternator: fix wrong 'where' condition for GSI range key

(cherry picked from commit ce7c1a6c52)
2022-12-05 20:53:19 +02:00
Benny Halevy
b4383a389b repair_reader: construct _reader_handle before _reader
Currently, the `_reader` member is explicitly
initialized with the result of the call to `make_reader`.
And `make_reader`, as a side effect, assigns a value
to the `_reader_handle` member.

Since C++ initializes class members sequentially,
in the order they are defined, the assignment to `_reader_handle`
in `make_reader()` happens before `_reader_handle` is initialized.

This patch fixes that by changing the definition order,
and consequently, the member initialization order
in the constructor so that `_reader_handle` will be (default-)initialized
before the call to `make_reader()`, avoiding the undefined behavior.
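
A minimal self-contained sketch of the pattern described above; the type names are illustrative stand-ins, not Scylla's real reader types. Because C++ initializes non-static members in declaration order, the handle must be declared before the member whose initializer produces the side effect.

#include <string>

// Illustrative stand-ins; these are not Scylla's real reader types.
struct reader_handle { int id = 0; };
struct reader { std::string name; };

class repair_reader_like {
    // Declared (and therefore initialized) before _reader, because
    // make_reader() assigns to it as a side effect.
    reader_handle _reader_handle;
    reader _reader;

    reader make_reader() {
        _reader_handle.id = 42;       // side effect on another member
        return reader{"underlying"};
    }

public:
    repair_reader_like()
        : _reader(make_reader())      // with the old order (_reader declared first),
                                      // this wrote to a not-yet-constructed member:
                                      // undefined behavior
    {}
};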

Fixes #10882

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #10883

(cherry picked from commit 9c231ad0ce)
2022-12-05 20:33:58 +02:00
Nadav Har'El
f667c5923a materialized views: fix view writes after base table schema change
When we write to a materialized view, we need to know some information
defined in the base table such as the columns in its schema. We have
a "view_info" object that tracks each view and its base.

This view_info object has a couple of mutable attributes which are
used to lazily-calculate and cache the SELECT statement needed to
read from the base table. If the base-table schema ever changes -
and the code calls set_base_info() at that point - we need to forget
this cached statement. If we don't (as before this patch), the SELECT
will use the wrong schema and writes will no longer work.

This patch also includes a reproducing test that failed before this
patch, and passes afterwords. The test creates a base table with a
view that has a non-trivial SELECT (it has a filter on one of the
base-regular columns), makes a benign modification to the base table
(just a silly addition of a comment), and then tries to write to the
view - and before this patch it fails.

Fixes #10026
Fixes #11542

(cherry picked from commit 2f2f01b045)
2022-12-05 20:09:36 +02:00
Botond Dénes
e4ba0c56df db/view/view_builder: don't drop partition and range tombstones when resuming
The view builder builds the views from a given base table in
view_builder::batch_size batches of rows. After processing this many
rows, it suspends so the view builder can switch to building views for
other base tables in the name of fairness. When resuming the build step
for a given base table, it reuses the reader used previously (also
serving the role of a snapshot, pinning sstables read from). The
compactor however is created anew. As the reader can be in the middle of
a partition, the view builder injects a partition start into the
compactor to prime it for continuing the partition. This, however, only
included the partition key, crucially missing any active tombstones: the
partition tombstone or -- since the v2 transition -- an active range
tombstone. This can result in base rows covered by either of these being
resurrected and the view builder generating view updates for them.
This patch solves this by using the detach-state mechanism of the
compactor which was explicitly developed for situations like this (in
the range scan code) -- resuming a read with the readers kept but the
compactor recreated.
Also included are two test cases reproducing the problem, one with a
range tombstone, the other with a partition tombstone.

Fixes: #11668

Closes #11671

(cherry picked from commit 5621cdd7f9)
2022-12-05 15:01:21 +02:00
Benny Halevy
329d55cc4f configure: add --perf-tests-debuginfo option
Provides separate control over debuginfo for perf tests
since enabling --tests-debuginfo affects both today
causing the Jenkins archives of perf tests binaries to
inflate considerably.

Refs https://github.com/scylladb/scylla-pkg/issues/3060

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 48021f3ceb)

Fixes #12191
2022-12-04 17:20:33 +02:00
Petr Gusev
b956293f47 modification_statement: fix LWT insert crash if clustering key is null
PR #9314 fixed a similar issue with regular insert statements
but missed the LWT code path.

It is expected behaviour for
modification_statement::create_clustering_ranges to return an
empty range in this case, since the possible_lhs_values it
uses explicitly returns empty_value_set if it evaluates the rhs
to null, and it has a comment about it ("All NULL
comparisons fail; no column values match."). On the other hand,
all components of the primary key are required to be set;
this is checked at the prepare phase, in
modification_statement::process_where_clause. So the only
problem was that modification_statement::execute_with_condition
was not expecting an empty clustering_range in the case of
a null clustering key.
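
A hedged sketch of the kind of guard this implies; the function names mirror the commit message but the bodies are illustrative, not the actual code. The LWT path must tolerate an empty clustering-range set instead of assuming at least one range exists.

#include <vector>

struct clustering_range {};
struct result { bool applied = false; };

// Illustrative: possible_lhs_values() returns an empty value set for NULL,
// so create_clustering_ranges() can legitimately produce no ranges at all.
std::vector<clustering_range> create_clustering_ranges(bool ck_is_null) {
    return ck_is_null ? std::vector<clustering_range>{}
                      : std::vector<clustering_range>{clustering_range{}};
}

result execute_with_condition(bool ck_is_null) {
    auto ranges = create_clustering_ranges(ck_is_null);
    if (ranges.empty()) {
        // NULL clustering key: no row can match. The real fix may instead
        // reject the request; the point is simply not to dereference
        // ranges.front() when there is nothing there.
        return result{false};
    }
    // ... build the CAS request from ranges.front(), etc. ...
    return result{true};
}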

Fixes: #11954
(cherry picked from commit 0d443dfd16)
2022-12-04 15:00:27 +02:00
Nadav Har'El
6a8c2d3f56 Merge 'cql3: don't ignore other restrictions when a multi column restriction is present during filtering' from Jan Ciołek
When filtering with a multi-column restriction present, all other restrictions were ignored.
So a query like:
`SELECT * FROM WHERE pk = 0 AND (ck1, ck2) < (0, 0) AND regular_col = 0 ALLOW FILTERING;`
would ignore the restriction `regular_col = 0`.

This was caused by a bug in the filtering code:
2779a171fc/cql3/selection/selection.cc (L433-L449)

When multi-column restrictions were detected, the code checked whether they were satisfied and returned immediately.
This is fixed by returning early only when these restrictions are not satisfied; when they are satisfied, the other restrictions are checked as well to ensure all of them hold.
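
A hedged sketch of the control-flow change; the function and parameter names are illustrative, not the actual selection.cc code. The early return must only happen when the multi-column restriction fails; otherwise the remaining restrictions still have to be evaluated.

#include <functional>
#include <vector>

struct row {};

bool passes_filter(const row& r,
                   const std::function<bool(const row&)>& multi_column_ok,
                   const std::vector<std::function<bool(const row&)>>& other_restrictions) {
    if (multi_column_ok) {
        if (!multi_column_ok(r)) {
            return false;                  // multi-column restriction fails: reject the row
        }
        // The old code effectively returned here even on success,
        // silently ignoring everything else.
    }
    for (const auto& restriction : other_restrictions) {
        if (!restriction(r)) {
            return false;                  // e.g. regular_col = 0 must hold too
        }
    }
    return true;
}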

This code was introduced back in 2019, when fixing #3574.
Perhaps back then it was impossible to mix multi-column and regular-column restrictions, and this approach was correct.

Fixes: #6200
Fixes: #12014

Closes #12031

* github.com:scylladb/scylladb:
  cql-pytest: add a reproducer for #12014, verify that filtering multi column and regular restrictions works
  boost/restrictions-test: uncomment part of the test that passes now
  cql-pytest: enable test for filtering combined multi column and regular column restrictions
  cql3: don't ignore other restrictions when a multi column restriction is present during filtering

(cherry picked from commit 2d2034ea28)

Closes #12086
2022-11-26 14:24:08 +02:00
Piotr Grabowski
27a35c7f98 Update tools/jmx submodule (jackson dependency update)
* tools/jmx 53f7f55...fe351e8 (1):
  > Update jackson dependency

(cherry picked from commit 41b098f54e)

Refs #11929

Closes #11931
2022-11-20 20:10:14 +02:00
Pavel Emelyanov
d83134a245 Merge '[branch-5.0] multishard_mutation_query: don't unpop partition header of spent partition' from Botond Dénes
When stopping the read, the multishard reader will dismantle the
compaction state, pushing back (unpopping) the currently processed
partition's header to its originating reader. This ensures that if the
reader stops in the middle of a partition, on the next page the
partition-header is re-emitted as the compactor (and everything
downstream from it) expects.
It can happen however that there is nothing more for the current
partition in the reader and the next fragment is another partition.
Since we only push back the partition header (without a partition-end)
this can result in two partitions being emitted without being separated
by a partition end.
We could just add the missing partition-end when needed, but it is
pointless: if the partition has no more data, just drop the header; we
won't need it on the next page.

The missing partition-end can generate an "IDL frame truncated" message
as it ends up causing the query result writer to create a corrupt
partition entry.

Fixes: https://github.com/scylladb/scylladb/issues/9482

Closes #11912

* github.com:scylladb/scylladb:
  test/cql-pytest: add regression test for "IDL frame truncated" error
  mutation_compactor: detach_state(): make it no-op if partition was exhausted
2022-11-16 11:50:50 +03:00
Anna Mikhlin
b844d14829 release: prepare for 5.0.6 2022-11-13 16:39:30 +02:00
Eliran Sinvani
184df0393e cql: Fix crash upon use of the word empty for service level name
Wrong access to an uninitialized token instead of the actual
generated string caused the parser to crash. This wasn't
detected by the ANTLR3 compiler because all the temporary
variables defined in the ANTLR3 statements are global in the
generated code. This essentially caused a null dereference.

Tests: 1. The fixed issue scenario from github.
       2. Unit tests in release mode.

Fixes #11774

Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>
Message-Id: <20190612133151.20609-1-eliransin@scylladb.com>

Closes #11777

(cherry picked from commit ab7429b77d)
2022-11-10 20:43:21 +02:00
Nadav Har'El
1b550dd301 cql3: fix cql3::util::maybe_quote() for keywords
cql3::util::maybe_quote() is a utility function formatting an identifier
name (table name, column name, etc.) that needs to be embedded in a CQL
statement - and might require quoting if it contains non-alphanumeric
characters, uppercase characters, or a CQL keyword.

maybe_quote() made an effort to only quote the identifier name if necessary,
e.g., a lowercase name usually does not need quoting. But lowercase names
that are CQL keywords - e.g., to or where - cannot be used as identifiers
without quoting. This can cause problems for code that wants to generate
CQL statements, such as the materialized-view problem in issue #9450 - where
a user had a column called "to" and wanted to create a materialized view
for it.

So in this patch we fix maybe_quote() to recognize invalid identifiers by
using the CQL parser, and quote them. This will quote reserved keywords,
but not so-called unreserved keywords, which *are* allowed as identifiers
and don't need quoting. This addition slows down maybe_quote(), but
maybe_quote() is anyway only used in heavy operations which need to
generate CQL.
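
A minimal sketch of the decision. It is not the real implementation: the actual fix asks the CQL parser to classify identifiers, whereas this sketch uses a tiny hard-coded keyword set. A name is left bare only if it is a valid unquoted identifier and not a reserved keyword; otherwise it is wrapped in double quotes with embedded quotes doubled.

#include <cctype>
#include <set>
#include <string>

// Unconditional quoting, doubling any embedded double quote (CQL's escape rule).
std::string quote(const std::string& name) {
    std::string out = "\"";
    for (char c : name) {
        if (c == '"') {
            out += '"';
        }
        out += c;
    }
    return out + "\"";
}

std::string maybe_quote(const std::string& name) {
    // Tiny illustrative keyword set; the real code asks the CQL parser, which
    // lets unreserved keywords (e.g. "int") through without quotes.
    static const std::set<std::string> reserved = {"to", "where", "select", "from"};
    bool plain = !name.empty() && !reserved.count(name);
    for (char c : name) {
        unsigned char uc = static_cast<unsigned char>(c);
        if (!std::islower(uc) && !std::isdigit(uc) && c != '_') {
            plain = false;                 // uppercase or non-alphanumeric: must quote
        }
    }
    return plain ? name : quote(name);
}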

This patch also adds two tests that reproduce the bug and verify its
fix:

1. Add to the low-level maybe_quote() test (a C++ unit test) also tests
   that maybe_quote() quotes reserved keywords like "to", but doesn't
   quote unreserved keywords like "int".

2. Add a test reproducing issue #9450 - creating a materialized view
   whose key column is a keyword. This new test passes on Cassandra,
   failed on Scylla before this patch, and passes after this patch.

It is worth noting that maybe_quote() now has a "forward compatibility"
problem: If we save CQL statements generated by maybe_quote(), and a
future version introduces a new reserved keyword, the parser of the
future version may not be able to parse the saved CQL statement that
was generated with the old maybe_quote() and didn't quote what is now
a keyword. This problem can be solved in two ways:

1. Try hard not to introduce new reserved keywords. Instead, introduce
   unreserved keywords. We've been doing this even before recognizing
   this maybe_quote() future-compatibility problem.

2. In the next patch we will introduce quote() - which unconditionally
   quotes identifier names, even if lowercase. These quoted names will
   be uglier for lowercase names - but will be safe from future
   introduction of new keywords. So we can consider switching some or
   all uses of maybe_quote() to quote().

Fixes #9450

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20220118161217.231811-1-nyh@scylladb.com>
(cherry picked from commit 5d2f694a90)
2022-11-07 17:01:32 +02:00
Alexander Turetskiy
01ce53d7fb Alternator: Projection field added to return from DescribeTable which describes GSIs and LSIs.
The return from DescribeTable which describes GSIs and LSIs is missing
the Projection field. We do not yet support all the settings Projection
(see #5036), but the default which we support is ALL, and DescribeTable
should return that in its description.

Fixes #11470

Closes #11693

(cherry picked from commit 636e14cc77)
2022-11-07 17:01:32 +02:00
Jadw1
e9c7f89b32 CQL3: fromJson accepts string as bool
The problem was an incompatibility with Cassandra, which accepts bool
as a string in the `fromJson()` UDF. The remaining difference between Cassandra
and Scylla is that Scylla accepts whitespace around the word in the string,
while Cassandra doesn't. Both are case insensitive.

Fixes: #7915
(cherry picked from commit 1902dbc9ff)
2022-11-07 17:01:32 +02:00
Takuya ASADA
93f468c12c locator::ec2_snitch: Retry HTTP request to EC2 instance metadata service
The EC2 instance metadata service can be busy; let's retry the connection with an
interval, just like we do in scylla-machine-image.
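
A minimal sketch of such a retry loop in seastar, not the actual ec2_snitch code: the http_get helper and the retry count/interval are assumptions made for illustration.

#include <chrono>
#include <seastar/core/coroutine.hh>
#include <seastar/core/future.hh>
#include <seastar/core/sleep.hh>
#include <seastar/core/sstring.hh>

// Hypothetical helper performing the actual HTTP GET against the metadata service.
seastar::future<seastar::sstring> http_get(seastar::sstring url);

seastar::future<seastar::sstring> read_ec2_metadata(seastar::sstring url) {
    constexpr int max_attempts = 5;
    for (int attempt = 1;; ++attempt) {
        try {
            co_return co_await http_get(url);
        } catch (...) {
            if (attempt == max_attempts) {
                throw;                               // out of retries, propagate the error
            }
        }
        // The metadata service was busy or unreachable; wait and try again.
        co_await seastar::sleep(std::chrono::seconds(5));
    }
}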

Fixes #10250

Signed-off-by: Takuya ASADA <syuu@scylladb.com>

Closes #11688

(cherry picked from commit 6b246dc119)
(cherry picked from commit e2809674d2)
2022-11-07 17:01:32 +02:00
Botond Dénes
e54ae9efd9 test/cql-pytest: add regression test for "IDL frame truncated" error
(cherry picked from commit 11af489e84)
2022-11-07 13:43:53 +02:00
Botond Dénes
ef40e59c0e mutation_compactor: detach_state(): make it no-op if partition was exhausted
detach_state() allows the user to resume a compaction process later,
without having to keep the compactor object alive. This happens by
generating and returning the mutation fragments the user has to re-feed
to a newly constructed compactor to bring it into the exact same state
the current compactor was in at the point of stopping the compaction.
This state includes the partition-header (partition-start and static-row
if any) and the currently active range tombstone.
Detaching the state is pointless however when the compaction was stopped
such that the currently compacted partition was completely exhausted.
Allowing the state to be detached in this case seems benign but it
caused a subtle bug in the main user of this feature: the partition
range scan algorithm, where the fragments included in the detached state
were pushed back into the reader which produced them. If the partition
happened to be exhausted -- meaning the next fragment in the reader was
a partition-start or EOS -- this resulted in the partition being
re-emitted later without a partition-end, resulting in corrupt
query-result being generated, in turn resulting in an obscure "IDL frame
truncated" error.

This patch solves this seemingly benign but sinister bug by making the
return value of `detach_state()` an std::optional and returning a
disengaged optional when the partition was exhausted.
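
A hedged sketch of the interface change with the types simplified to stand-ins: returning a disengaged optional when the partition was exhausted makes it impossible for the caller to re-feed a stale partition header.

#include <optional>
#include <vector>

struct mutation_fragment {};                 // stand-in for the real fragment type

struct detached_compaction_state {
    mutation_fragment partition_start;
    std::vector<mutation_fragment> rest;     // static row, active range tombstone, ...
};

struct compactor_like {
    bool _inside_partition = false;

    std::optional<detached_compaction_state> detach_state() {
        if (!_inside_partition) {
            // The partition was fully consumed: there is no header to hand
            // back, so return nothing instead of a state the caller would
            // wrongly push back into the reader.
            return std::nullopt;
        }
        return detached_compaction_state{};
    }
};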

(cherry picked from commit 70b4158ce0)
2022-11-07 13:42:43 +02:00
Botond Dénes
8c56b0b268 Merge 'Alternator, MV: fix bug in some view updates which set the view key to its existing value' from Nadav Har'El
As described in issue #11801, we saw in Alternator, when a GSI has both its partition and sort keys as non-key attributes in the base, cases where updating the GSI-sort-key attribute to the same value it already had caused the entire GSI row to be deleted.

In this series we fix this bug (it was a bug in our materialized views implementation) and add a reproducing test (plus a few more tests for similar situations which worked before the patch, and continue to work after it).

Fixes #11801

Closes #11808

* github.com:scylladb/scylladb:
  test/alternator: add test for issue 11801
  MV: fix handling of view update which reassign the same key value
  materialized views: inline used-once and confusing function, replace_entry()

(cherry picked from commit e981bd4f21)
2022-11-01 13:25:22 +02:00
Kamil Braun
fc78d88783 service: raft: raft_group0: don't call _abort_source.request_abort()
`raft_group0` does not own the source and is not responsible for calling
`request_abort`. The source comes from top-level `stop_signal` (see
main.cc) and that's where it's aborted.

Fixes #10668.

Closes #10678

(cherry picked from commit ef7643d504)
2022-10-16 11:42:22 +03:00
Pavel Emelyanov
31a20c4c54 compaction_manager: Swallow ENOSPCs in ::stop()
When being stopped, the compaction manager may run into ENOSPC. This is not a
reason to fail the stopping process with an abort; it is better to warn about
this in the logs and proceed as if nothing happened.

refs: #11245

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-10-13 15:56:53 +03:00
Pavel Emelyanov
7e42bcfd61 exceptions: Mark storage_io_error::code() with noexcept
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-10-13 15:56:03 +03:00
Pavel Emelyanov
2107ffe2d2 compaction_manager: Shuffle really_do_stop()
Make it a future-returning method and set up the _stop_future in its
only caller. This makes the next patch much simpler.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-10-13 15:56:02 +03:00
Beni Peled
5a97a1060e release: prepare for 5.0.5 2022-10-09 08:44:14 +03:00
Nadav Har'El
2b0487c900 cql: validate bloom_filter_fp_chance up-front
Scylla's Bloom filter implementation has a minimal false-positive rate
that it can support (6.71e-5). When bloom_filter_fp_chance is set any
lower than that, the compute_bloom_spec() function, which writes the bloom
filter, throws an exception. However, this is too late - it only happens
while flushing the memtable to disk, and a failure at that point causes
Scylla to crash.

Instead, we should refuse the table creation with the unsupported
bloom_filter_fp_chance. This is also what Cassandra did six years ago -
see CASSANDRA-11920.
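
A hedged sketch of the up-front check; the exception type is an assumption for illustration (the real code presumably reports an invalid-request/configuration error), while the minimum supported rate comes from the commit message above. The point is that validation happens at CREATE/ALTER TABLE time, not at memtable flush time.

#include <stdexcept>
#include <string>

// Smallest false-positive rate the Bloom filter implementation can honor,
// per the commit message above.
constexpr double min_supported_bloom_filter_fp_chance = 6.71e-5;

void validate_bloom_filter_fp_chance(double fp_chance) {
    if (fp_chance < min_supported_bloom_filter_fp_chance) {
        // Rejecting here turns what used to be a crash during memtable flush
        // into an ordinary invalid-request error at CREATE/ALTER TABLE time.
        throw std::invalid_argument(
            "bloom_filter_fp_chance " + std::to_string(fp_chance) +
            " is smaller than the minimum supported value " +
            std::to_string(min_supported_bloom_filter_fp_chance));
    }
}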

This patch also includes a regression test, which crashes Scylla before
this patch but passes after the patch (and also passes on Cassandra).

Fixes #11524.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #11576

(cherry picked from commit 4c93a694b7)
2022-10-04 16:22:50 +03:00
Pavel Emelyanov
d3b3c53d9f system_keyspace/config: Swallow string->value cast exception
When updating an updateable value via CQL, the new value comes as a
string that is then boost::lexical_cast-ed to the desired type. If the
cast throws, the resulting exception is printed in the logs, which is very
likely uncalled for.
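
A hedged sketch of the behavior, not the actual system_keyspace code; the logging call and helper name are illustrative. A bad string leaves the option unchanged and produces one warning, instead of letting boost::lexical_cast's exception escape into the logs.

#include <boost/lexical_cast.hpp>
#include <iostream>
#include <string>

template <typename T>
bool try_update_option(const std::string& raw, T& out) {
    try {
        out = boost::lexical_cast<T>(raw);
        return true;
    } catch (const boost::bad_lexical_cast&) {
        // Swallow the cast failure: warn and keep the previous value rather
        // than letting the exception be printed as if it were an internal error.
        std::cerr << "invalid value '" << raw << "' for live-updateable option\n";
        return false;
    }
}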

fixes: #10394
tests: manual

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20220503142942.8145-1-xemul@scylladb.com>
(cherry picked from commit 063d26bc9e)
2022-10-04 16:19:46 +03:00
Nadav Har'El
50c2c1b1d4 alternator: return ProvisionedThroughput in DescribeTable
DescribeTable is currently hard-coded to return PAY_PER_REQUEST billing
mode. Nevertheless, even in PAY_PER_REQUEST mode, the DescribeTable
operation must return a ProvisionedThroughput structure, listing both
ReadCapacityUnits and WriteCapacityUnits as 0. This requirement is not
stated in some of the DynamoDB documentation but is explicitly mentioned in
https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_ProvisionedThroughput.html
Also, empirically, DynamoDB returns ProvisionedThroughput with zeros
even in PAY_PER_REQUEST mode. We even had an xfailing test to confirm this.

The missing ProvisionedThroughput structure was a problem for
applications like DynamoDB connectors for Spark, which implicitly
assume that ProvisionedThroughput is returned by DescribeTable and
fail (as described in issue #11222) if it's outright missing.

So this patch adds the missing ProvisionedThroughput structure, and
the xfailing test starts to pass.

Note that this patch doesn't change the fact that attempting to set
a table to PROVISIONED billing mode is ignored: DescribeTable continues
to always return PAY_PER_REQUEST as the billing mode and zero as the
provisioned capacities.

Fixes #11222

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #11298

(cherry picked from commit 941c719a23)
2022-10-03 14:28:16 +03:00
Tomasz Grabiec
aa647a637a test: lib: random_mutation_generator: Don't generate mutations with marker uncompacted with shadowable tombstone
The generator first set the marker and then applied tombstones.

The marker was set like this:

  row.marker() = random_row_marker();

Later, when shadowable tombstones were applied, they were compacted
with the marker as expected.

However, the key for the row was chosen randomly in each iteration and
there are multiple keys set, so there was a possibility of a key clash
with an earlier row. This could override the marker without applying
any tombstones, which is conditional on random choice.

This could generate rows with markers uncompacted with shadowable tombstones.

This broke row_cache_test::test_concurrent_reads_and_eviction on the
comparison between expected and read mutations. The latter were
compacted because they went through an extra merge path, which compacts
the row.

Fix by making sure there are no key clashes.

Closes #11663

(cherry picked from commit 5268f0f837)
2022-10-02 16:45:07 +03:00
Michael Livshin
2c0040fcb3 allow pre-scrub snapshots of materialized views and secondary indices
Previously, any attempt to take a materialized view or secondary index
snapshot was considered a mistake and caused the snapshot operation to
abort, with a suggestion to snapshot the base table instead.

But an automatic pre-scrub snapshot of a view cannot be attributed to
user error, so the operation should not be aborted in that case.

(It is an open question whether the more correct thing to do during
pre-scrub snapshot would be to silently ignore views.  Or perhaps they
should be ignored in all cases except when the user explicitly asks to
snapshot them, by name)

Closes #10760.

Signed-off-by: Michael Livshin <michael.livshin@scylladb.com>
(cherry picked from commit aab4cd850c)

Fixes #10760.
2022-10-02 14:04:11 +03:00
Nadav Har'El
54564adb7c alternator: forbid duplicate index (LSI and GSI) names
Adding an LSI and a GSI with the same name to the same Alternator table
should be forbidden - because if both exist, only one of them (the GSI)
would actually be usable. DynamoDB also forbids such duplicate names.

So in this patch we add a test for this issue, and fix it.

Since the patch involves a few more uses of the IndexName string,
we also clean up its handling a bit, to use std::string_view instead
of the old-style std::string&.

Fixes #10789

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
(cherry picked from commit 8866c326de)
2022-10-02 13:00:03 +03:00
Tomasz Grabiec
839876e8f2 db: range_tombstone_list: Avoid quadratic behavior when applying
Range tombstones are kept in memory (cache/memtable) in
range_tombstone_list. It keeps them deoverlapped, so applying a range
tombstone which covers many range tombstones will erase existing range
tombstones from the list. This operation needs to be exception-safe,
so range_tombstone_list maintains an undo log. This undo log will
receive a record for each range tombstone which is removed. For
exception safety reasons, before pushing an undo log entry, we reserve
space in the log by calling std::vector::reserve(size() + 1). This is
O(N) where N is the number of undo log entries. Therefore, the whole
application is O(N^2).

This can cause reactor stalls and availability issues when replicas
apply such deletions.

This patch avoids the problem by reserving an exponentially increasing
amount of space. Also, to avoid large allocations, it switches the
container to chunked_vector.
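
A minimal sketch of the reservation change with the container and names simplified; it is not the real undo-log code. Reserving size()+1 before every push makes N pushes O(N^2) in a vector that grows exactly to fit, while growing the reservation geometrically keeps the whole sequence amortized O(N). The actual patch additionally switches to chunked_vector to avoid single large allocations.

#include <algorithm>
#include <cstddef>
#include <vector>

struct undo_entry {};

// The quadratic pattern described above: when the capacity is exhausted,
// reserve(size() + 1) reallocates to an exactly-fitting buffer, so every
// subsequent push pays an O(N) copy.
void push_quadratic(std::vector<undo_entry>& log, undo_entry e) {
    log.reserve(log.size() + 1);
    log.push_back(std::move(e));
}

// Amortized O(1) variant: grow the reservation geometrically, so a burst of
// N pushes triggers only O(log N) reallocations.
void push_amortized(std::vector<undo_entry>& log, undo_entry e) {
    if (log.size() == log.capacity()) {
        log.reserve(std::max<std::size_t>(16, log.capacity() * 2));
    }
    log.push_back(std::move(e));
}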

Fixes #11211

Closes #11215

(cherry picked from commit 7f80602b01)
2022-09-30 17:55:23 +03:00
Botond Dénes
36002e2b7c sstables: crawling mx-reader: make on_out_of_clustering_range() no-op
Said method currently emits a partition-end. This method is only called
when the last fragment in the stream is a range tombstone change with a
position after all clustered rows. The problem is that
consume_partition_end() is also called unconditionally, resulting in two
partition-end fragments being emitted. The fix is simple: make this
method a no-op; there is nothing to do there.

Also add two tests: one targeted to this bug and another one testing the
crawling reader with random mutations generated for random schema.

Fixes: #11421

Closes #11422

(cherry picked from commit be9d1c4df4)
2022-09-30 17:55:14 +03:00
Botond Dénes
91a8f9e09b test/lib/random_schema: add a simpler overload for fixed partition count
Some tests want to generate a fixed number of random partitions; make
their life easier.

(cherry picked from commit 98f3d516a2)

Ref #11421 (prerequisite)
2022-09-30 17:54:55 +03:00
Michael Livshin
bc29f350dd batchlog_manager: warn when a batch fails to replay
Only for reasons other than "no such KS", i.e. when the failure is
presumed transient and the batch in question is not deleted from
batchlog and will be retried in the future.

(Would info be more appropriate here than warning?)

Signed-off-by: Michael Livshin <michael.livshin@scylladb.com>

Closes #10556

Fixes #10636

(cherry picked from commit 00ed4ac74c)
2022-09-29 12:14:56 +03:00
Asias He
4fe571f470 streaming: Allow drop table during streaming
Currently, if a table is dropped during streaming, the streaming would
fail with a no_such_column_family error.

Since the table is dropped anyway, it makes more sense to ignore the
streaming result of the dropped table, whether it is successful or
failed.

This allows users to drop tables during node operations, e.g., bootstrap
or decommission a node.

This is especially useful for the cloud users where it is hard to
coordinate between a node operation by admin and user cql change.

This patch also fixes a possible use-after-free issue by not passing
the table reference object around.

Fixes #10395

Closes #10396

(cherry picked from commit 953af38281)
2022-09-21 10:26:22 +03:00
Michał Radwański
ebf38eaead flat_mutation_reader: allow destructing readers which are not closed and didn't initiate any IO.
In functions such as upgrade_to_v2 (excerpt below), if the constructor
of transforming_reader throws, r needs to be destroyed, however it
hasn't been closed. However, if a reader didn't start any operations, it
is safe to destruct such a reader. This issue can potentially manifest
itself in many more readers and might be hard to track down. This commit
adds a bool indicating whether a close is anticipated, thus avoiding
errors in the destructor.

Code excerpt:
flat_mutation_reader_v2 upgrade_to_v2(flat_mutation_reader r) {
    class transforming_reader : public flat_mutation_reader_v2::impl {
        // ...
    };
    return make_flat_mutation_reader_v2<transforming_reader>(std::move(r));
}

Fixes #9065.
Fixes #11491

(cherry picked from commit 9ada63a9cb)
2022-09-21 10:25:18 +03:00
Beni Peled
1c82766f33 release: prepare for 5.0.4 2022-09-21 09:16:13 +03:00
Piotr Sarna
e1f78c33b4 Merge 'Fix mutation commutativity with shadowable tombstone' from Tomasz Grabiec

This series fixes lack of mutation associativity which manifests as
sporadic failures in
row_cache_test.cc::test_concurrent_reads_and_eviction due to differences
in mutations applied and read.

No known production impact.

Refs https://github.com/scylladb/scylladb/issues/11307

Closes #11312

* github.com:scylladb/scylladb:
  test: mutation_test: Add explicit test for mutation commutativity
  test: random_mutation_generator: Workaround for non-associativity of mutations with shadowable tombstones
  db: mutation_partition: Drop unnecessary maybe_shadow()
  db: mutation_partition: Maintain shadowable tombstone invariant when applying a hard tombstone
  mutation_partition: row: make row marker shadowing symmetric

(cherry picked from commit 484004e766)
2022-09-20 23:21:06 +02:00
Tomasz Grabiec
0634b5f734 test: row_cache: Use more narrow key range to stress overlapping reads more
This makes catching issues related to concurrent access of same or
adjacent entries more likely. For example, catches #11239.

Closes #11260

(cherry picked from commit 8ee5b69f80)
2022-09-20 23:20:43 +02:00
Avi Kivity
6f020b26e1 Merge 'Backport 3 fixes for the evictable reader v2' from Botond Dénes
This pull request backports 3 important fixes from adc08d0ab9. Said 3 commits fixed important bugs in the v2 variant of the evictable reader, but were not backported because they were part of a large series doing v2 conversion in general. This means that 5.0 was left with a buggy evictable reader v2, which is used by repair. So far in the wild we've seen one bug manifest itself: the evictable reader getting stuck, spinning in a tight loop in `evictable_reader_v2::do_fill_buffer()`, in turn making repair get stuck too.

Fixes: #11223

Closes #11540

* github.com:scylladb/scylladb:
  test/boost/mutation_reader_test: add v2 specific evictable reader tests
  evictable_reader_v2: terminate active range tombstones on reader recreation
  evictable_reader_v2: restore handling of non-monotonically increasing positions
  evictable_reader_v2: simplify handling of reader recreation
2022-09-20 13:42:10 +03:00
Pavel Emelyanov
7f8dcc5657 messaging_service: Fix gossiper verb group
When configuring tcp-nodelay unconditionally, the messaging service thinks
the gossiper uses group index 1, though that changed some time ago and
those verbs now belong to group 0.

fixes: #11465

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
(cherry picked from commit 2c74062962)
2022-09-19 10:31:58 +03:00
Botond Dénes
20451760fe tools/scylla-sstable: fix description template
Quote the '{' and '}' used in the CQL example, so format doesn't try to
interpret them.

Fixes: #11571

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20220221140652.173015-1-bdenes@scylladb.com>
(cherry picked from commit 10880fb0a7)
2022-09-19 06:54:25 +03:00
Michał Chojnowski
51b031d04e sstables: add a flag for disabling long-term index caching
Long-term index caching in the global cache, as introduced in 4.6, is a major
pessimization for workloads where accesses to the index are (spatially) sparse.
We want to have a way to disable it for the affected workloads.

There is already infrastructure in place for disabling it for BYPASS CACHE
queries. One way of solving the issue is hijacking that infrastructure.

This patch adds a global flag (and a corresponding CLI option) which controls
index caching. Setting the flag to `false` causes all index reads to behave
like they would in BYPASS CACHE queries.

Consequences of this choice:

- The per-SSTable partition_index_cache is unused. Every index_reader has
  its own, and they die together. Independent reads can no longer reuse the
  work of other reads which hit the same index pages. This is not crucial,
  since partition accesses have no (natural) spatial locality. Note that
  the original reason for partition_index_cache -- the ability to share
  reads for the lower and upper bound of the query -- is unaffected.
- The per-SSTable cached_file is unused. Every index_reader has its own
  (uncached) input stream from the index file, and every
  bsearch_clustered_cursor has its own cached_file, which dies together with
  the cursor. Note that the cursor still can perform its binary search with
  caching. However, it won't be able to reuse the file pages read by
  index_reader. In particular, if the promoted index is small, and fits inside
  the same file page as its index_entry, that page will be re-read.
  It can also happen that index_reader will read the same index file page
  multiple times. When the summary is so dense that multiple index pages fit in
  one index file page, advancing the upper bound, which reads the next index
  page, will read the same index file page. Since summary:disk ratio is 1:2000,
  this is expected to happen for partitions with size greater than 2000
  partition keys.

Fixes #11202

(cherry picked from commit cdb3e71045)
2022-09-18 13:29:35 +03:00
Botond Dénes
82d1446ca9 test/boost/mutation_reader_test: add v2 specific evictable reader tests
One is a reincarnation of the recently removed
test_multishard_combining_reader_non_strictly_monotonic_positions. The
latter was actually targeting the evictable reader but through the
multishard reader, probably for historic reasons (evictable reader was
part of the multishard reader family).
The other one checks that active range tombstone changes are properly
terminated when the partition ends abruptly after recreating the reader.

(cherry picked from commit 014a23bf2a)
2022-09-15 13:51:13 +03:00
Botond Dénes
e0acb0766d evictable_reader_v2: terminate active range tombstones on reader recreation
Reader recreation messes with the continuity of the mutation fragment
stream because it breaks snapshot isolation. We cannot guarantee that a
range tombstone, or even the partition, started before the recreation will
continue after it. So we have to make sure to wrap up all loose ends when
recreating the reader. We already close uncontinued partitions. This
commit also takes care of closing any range tombstone started by
unconditionally emitting a null range tombstone. This is legal to do,
even if no range tombstone was in effect.

(cherry picked from commit 9e48237b86)
2022-09-14 19:15:50 +03:00
Botond Dénes
4f26d489a0 evictable_reader_v2: restore handling of non-monotonically increasing positions
We thought that, unlike v1, v2 would not need this. But it does.
Handled similarly to how v1 did it: we ensure each buffer represents
forward progress, when the last fragment in the buffer is a range
tombstone change:
* Ensure the content of the buffer represents progress w.r.t.
  _next_position_in_partition, thus ensuring the next time we recreate
  the reader it will continue from a later position.
* Continue reading until the next (peeked) fragment has a strictly
  larger position.

The code is just much nicer because it uses coroutines.

(cherry picked from commit 6db08ddeb2)
2022-09-14 19:15:49 +03:00
Botond Dénes
43cbc5c836 evictable_reader_v2: simplify handling of reader recreation
The evictable reader has a handful of flags dictating what to do after
the reader is recreated: what to validate, what to drop, etc. We
actually need a single flag telling us if the reader was recreated or
not; all other things can be derived from existing fields.
This patch does exactly that. Furthermore it folds do_fill_buffer() into
fill_buffer() and replaces the awkward-to-use `should_drop_fragment()`
with `examine_first_fragments()`, which does a much better job of
encapsulating all validation and fragment dropping logic.
This code reorganization also fixes two bugs introduced by the v2
conversion:
* The loop in `do_fill_buffer()` could become infinite in certain
  circumstances due to a difference between the v1 and v2 versions of
  `is_end_of_stream()`.
* The position of the first non-dropped fragment was not validated
  (this was integrated into the range tombstone trimming which was
  thrown out by the conversion).

(cherry picked from commit 498d03836b)
2022-09-14 19:15:49 +03:00
Nadav Har'El
f0c521efdf alternator: clean error shutdown in case of TLS misconfiguration
The way our boot-time service "controllers" are written, if a
controller's start_server() finds an error and throws, it cannot expect
the caller (main.cc) to call stop_server(), and must clean up
resources already created (e.g., sharded services) before returning
or risk crashes on assertion failures.

This patch fixes such a mistake in Alternator's initialization.
As noted in issue #10025, if the Alternator TLS configuration is
broken - especially the certificate or key files are missing -
Scylla would crash on an assertion failure, instead of reporting
the error as expected. Before this patch such a misconfiguration
will result in the unintelligible:

<alternator::server>::~sharded() [Service = alternator::server]:
Assertion `_instances.empty()' failed. Aborting on shard 0.

After this patch we get the right error message:

ERROR 2022-03-21 15:25:07,553 [shard 0] init - Startup failed:
std::_Nested_exception<std::runtime_error> (Failed to set up Alternator
TLS credentials): std::_Nested_exception<std::runtime_error> (Could not
read certificate file conf/scylla.crt): std::filesystem::__cxx11::
filesystem_error (error system:2, filesystem error: open failed:
No such file or directory [conf/scylla.crt])

Arguably this error message is a bit ugly, so I opened
https://github.com/scylladb/seastar/issues/1029, but at least it says
exactly what the error is.

Fixes #10025
Fixes #11520

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20220321133323.3150939-1-nyh@scylladb.com>
(cherry picked from commit 7f89c8b3e3)
2022-09-11 14:43:18 +03:00
Beni Peled
b9a61c8e9a release: prepare for 5.0.3 2022-09-07 11:16:52 +03:00
Karol Baryła
32aa1e5287 transport/server.cc: Return correct size of decompressed lz4 buffer
An incorrect size is returned from the function, which could lead to
crashes or undefined behavior. Fix by erroring out in these cases.

Fixes #11476

(cherry picked from commit 1c2eef384d)
2022-09-07 10:58:42 +03:00
Nadav Har'El
da6a126d79 cross-tree: fix header file self-sufficiency
Scylla's coding standard requires that each header is self-sufficient,
i.e., it includes whatever other headers it needs - so it can be included
without having to include any other header before it.

We have a test for this, "ninja dev-headers", but it isn't run very
frequently, and it turns out our code deviated from this requirement
in a few places. This patch fixes those places, and after it
"ninja dev-headers" succeeds again.

This is needed because our CI runs "ninja dev-headers".

Fixes #10995

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #11457
2022-09-06 15:45:34 +03:00
Avi Kivity
d07e902983 Merge 'database: evict all inactive reads for table when detaching table' from Botond Dénes
Currently, when detaching the table from the database, we force-evict all queriers for said table. This series broadens the scope of this force-evict to include all inactive reads registered at the semaphore. This ensures that any regular inactive read "forgotten" for any reason in the semaphore will not end up accessing a dangling table reference when destroyed later.

Fixes: https://github.com/scylladb/scylladb/issues/11264

Closes #11273

* github.com:scylladb/scylladb:
  querier: querier_cache: remove now unused evict_all_for_table()
  database: detach_column_family(): use reader_concurrency_semaphore::evict_inactive_reads_for_table()
  reader_concurrency_semaphore: add evict_inactive_reads_for_table()

(cherry picked from commit afa7960926)
2022-09-02 11:39:43 +03:00
Piotr Sarna
3c0fc42f84 cql3: fix misleading error message for service level timeouts
The error message incorrectly stated that the timeout value cannot
be longer than 24h, but it can - the actual restriction is that the
value cannot be expressed in units like days or months, which was done
in order to significantly simplify the parsing routines (and the fact
that timeouts counted in days are not expected to be common).

Fixes #10286

Closes #10294

(cherry picked from commit 85e95a8cc3)
2022-09-01 20:34:12 +03:00
Piotr Grabowski
964ccf9192 type_json: support integers in scientific format
Add support for specifying integers in scientific format (for example
1.234e8) in INSERT JSON statement:

INSERT INTO table JSON '{"int_column": 1e7}';

Inserting a floating-point number ending with .0 is allowed, as
the fractional part is zero. Non-zero fractional part (for example
12.34) is disallowed. A new test is added to test all those behaviors.

Before the JSON parsing library was switched to RapidJSON from JsonCpp,
this statement used to work correctly, because JsonCpp transparently
casts double to integer value.

This behavior differs from Cassandra, which disallows those types of
numbers (1e7, 123.0 and 12.34).

Fix a typo in the if condition: "if (value.GetUint64())" should be
"if (value.IsUint64())".

Fixes #10100

(cherry picked from commit efe7456f0a)
2022-09-01 16:03:49 +03:00
Avi Kivity
dfdc128faf Merge 'row_cache: Fix missing row if upper bound of population range is evicted and has adjacent dummy' from Tomasz Grabiec
Scenario:

cache = [
    row(pos=2, continuous=false),
    row(pos=after(2), dummy=true)
]

Scanning read starts, starts populating [-inf, before(2)] from sstables.

row(pos=2) is evicted.

cache = [
    row(pos=after(2), dummy=true)
]

Scanning read finishes reading from sstables.

Refreshes cache cursor via
partition_snapshot_row_cursor::maybe_refresh(), which calls
partition_snapshot_row_cursor::advance_to() because iterators are
invalidated. This advances the cursor to
after(2). no_clustering_row_between(2, after(2)) returns true, so
advance_to() returns true, and maybe_refresh() returns true. This is
interpreted by the cache reader as "the cursor has not moved forward",
so it marks the range as complete, without emitting the row with
pos=2. Also, it marks row(pos=after(2)) as continuous, so later reads
will also miss the row.

The bug is in advance_to(), which is using
no_clustering_row_between(a, b) to determine its result, which by
definition excludes the starting key.

Discovered by row_cache_test.cc::test_concurrent_reads_and_eviction
with reduced key range in the random_mutation_generator (1024 -> 16).

Fixes #11239

Closes #11240

* github.com:scylladb/scylladb:
  test: mvcc: Fix illegal use of maybe_refresh()
  tests: row_cache_test: Add test_eviction_of_upper_bound_of_population_range()
  tests: row_cache_test: Introduce one_shot mode to throttle
  row_cache: Fix missing row if upper bound of population range is evicted and has adjacent dummy
2022-08-11 18:36:44 +02:00
Yaron Kaikov
299122e78d release: prepare for 5.0.2 2022-08-07 16:15:02 +03:00
Avi Kivity
23a34d7e42 Merge 'Backport: Fix map subscript crashes when map or subscript is null' from Nadav Har'El
This is a backport of https://github.com/scylladb/scylla/pull/10420 to branch 5.0.
Branch 5.0 had somewhat different code in this expression area, so the backport was not automatic, but it was nevertheless fairly straightforward - just copying the exact same checking code to its right place, and keeping the exact same tests to see that we indeed fixed the bug.

Refs #10535.

The original cover letter from https://github.com/scylladb/scylla/pull/10420:

In the filtering expression "WHERE m[?] = 2", our implementation was buggy when either the map, or the subscript, was NULL (and also when the latter was an UNSET_VALUE). Our code ended up dereferencing null objects, yielding bizarre errors when we were lucky, or crashes when we were less lucky - see examples of both in issues https://github.com/scylladb/scylla/issues/10361, https://github.com/scylladb/scylla/issues/10399, https://github.com/scylladb/scylla/pull/10401. The existing test test_null.py::test_map_subscript_null reproduced all these bugs sporadically.

In this series we improve the test to reproduce the separate bugs separately, and also reproduce additional problems (like the UNSET_VALUE). We then define both m[NULL] and NULL[2] to result in NULL instead of the existing undefined (and buggy, and crashing) behavior. This new definition is consistent with our usual SQL-inspired tradition that NULL "wins" in expressions - e.g., NULL < 2 is also defined as resulting in NULL.

However, this decision differs from Cassandra, where m[NULL] is considered an error but NULL[2] is allowed. We believe that making m[NULL] be a NULL instead of an error is more consistent, and moreover - necessary if we ever want to support more complicated expressions like m[a], where the column a can be NULL for some rows and non-NULL for others, and it doesn't make sense to return an "invalid query" error in the middle of the scan.

Fixes https://github.com/scylladb/scylla/issues/10361
Fixes https://github.com/scylladb/scylla/issues/10399
Fixes https://github.com/scylladb/scylla/pull/10401

Closes #11142

* github.com:scylladb/scylla:
  test/cql-pytest: reproducer for CONTAINS NULL bug
  expressions: don't dereference invalid map subscript in filter
  expressions: fix invalid dereference in map subscript evaluation
  test/cql-pytest: improve tests for map subscripts and nulls
2022-07-28 15:31:28 +03:00
Nadav Har'El
67a2f3aa67 test/cql-pytest: reproducer for CONTAINS NULL bug
This is a reproducer for issue #10359: "CONTAINS NULL" and
"CONTAINS KEY NULL" restrictions should not match any set, but currently
do match non-empty or all sets.

The tests currently fail on Scylla, so they are marked xfail. They also fail on
Cassandra because Cassandra considers such a request an error, which
we consider a mistake (see #4776) - so the tests are marked "cassandra_bug".

Refs #10359.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20220412130914.823646-1-nyh@scylladb.com>
(cherry picked from commit ae0e1574dc)
2022-07-27 20:03:30 +03:00
Nadav Har'El
66e8cf8cea expressions: don't dereference invalid map subscript in filter
If we have the filter expression "WHERE m[?] = 2", the existing code
simply assumed that the subscript is an object of the right type.
However, while it should indeed be the right type (we already have code
that verifies that), there are two more options: It can also be a NULL,
or an UNSET_VALUE. Either of these cases causes the existing code to
dereference a non-object as an object, leading to bizarre errors (as
in issue #10361) or even crashes (as in issue #10399).

Cassandra returns an invalid request error in these cases: "Unsupported
unset map key for column m" or "Unsupported null map key for column m".
We decided to do things differently:

 * For NULL, we consider m[NULL] to result in NULL - instead of an error.
   This behavior is more consistent with other expressions that contain
   null - for example NULL[2] and NULL<2 both result in NULL as well.
   Moreover, if in the future we allow more complex expressions, such
   as m[a] (where a is a column), we can find the subscript to be null
   for some rows and non-null for other rows - and throwing an "invalid
   query" in the middle of the filtering doesn't make sense.

 * For UNSET_VALUE, we do consider this an error like Cassandra, and use
   the same error message as Cassandra. However, the current implementation
   checks for this error only when the expression is evaluated - not
   before. It means that if the scan is empty before the filtering, the
   error will not be reported and we'll silently return an empty result
   set. We currently consider this ok, but we can also change this in the
   future by binding the expression only once (today we do it on every
   evaluation) and validating it once after this binding.

Fixes #10361
Fixes #10399

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
(cherry picked from commit fbb2a41246)
2022-07-27 19:56:17 +03:00
Nadav Har'El
35b66c844c expressions: fix invalid dereference in map subscript evaluation
When we have a filter such as "WHERE m[2] = 3" (where m is a map
column), if a row had a null value for m, our expression evaluation
code incorrectly dereferenced an unset optional and continued
processing the result of this dereference, which resulted in undefined
behavior - sometimes we were lucky enough to get a "marshaling error",
but other times Scylla crashed.

The fix is trivial - just check before dereferencing the optional value
of the map. We return null in that case, which means that we consider
the result of null[2] to be null. I think this is a reasonable approach
and fits our overall approach of making null dominate expressions (e.g.,
the value of "null < 2" is also null).

The test test_filtering.py::test_filtering_null_map_with_subscript,
which used to frequently fail with marshaling errors or crashes, now
passes every time so its "xfail" mark is removed.

Fixes #10417

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
(cherry picked from commit 808a93d29b)
2022-07-27 19:50:24 +03:00
Nadav Har'El
9e7a1340b9 test/cql-pytest: improve tests for map subscripts and nulls
The test test_null.py::test_map_subscript_null turned out to reproduce
multiple bugs related to using map subscripts in filtering expressions.
One was issue #10361 (m[null] resulted in a bizarre error) or #10399
(m[null] resulted in a crash), and a different issue was #10401 (m[2]
resulted in a bizarre error or a crash if m itself was null). Moreover,
the same test uncovered different bugs depending on how it was run - alone
or with other tests - because it was using a shared table.

In this patch we introduce two separate tests in test_filtering.py
which are designed to reproduce these separate bugs instead of mixing
them into one test. The new tests also cover a few more corners which
the previous test (which focused on nulls) missed - such as UNSET_VALUE.

The two new tests (and the old test_map_subscript_null) pass on
Cassandra so still assume that the Cassandra behavior - that m[null]
should be an error - is the correct behavior. We may want to change
the desired behavior (e.g., to decide that m[null] be null, not an
error), and change the tests accordingly later - but for now the
tests follow Cassandra's behavior exactly, and pass on Cassandra
and fail on Scylla (so are marked xfail).

The bugs reproduced by these tests involve randomness or reading
uninitialized memory, so these tests sometimes pass, sometimes fail,
and sometimes even crash (as reported in #10399 and #10401). So to
reproduce these bugs run the tests multiple times. For example:

    test/cql-pytest/run --count 100 --runxfail
        test_filtering.py::test_filtering_null_map_with_subscript

Refs #10361
Refs #10399
Refs #10401

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
(cherry picked from commit 189b8845fe)
2022-07-27 19:28:17 +03:00
Benny Halevy
d5a0750ef3 multishard_mutation_query: do_query: stop ctx if lookup_readers fails
lookup_readers might fail after populating some readers
and those must be closed before returning the exception.

Fixes #10351

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #10425

(cherry picked from commit 055141fc2e)
2022-07-25 14:52:44 +03:00
Benny Halevy
618c483c73 sstables: time_series_sstable_set: insert: make exception safe
We need to erase the shared sstable from _sstables
if insertion into _sstables_reversed fails.
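
A hedged sketch of the exception-safety pattern with the container types and member names simplified, not the actual time_series_sstable_set code: if the second insertion throws, the first one is rolled back so the two containers stay consistent.

#include <set>

struct sstable_like { int generation = 0; };

struct by_gen {
    bool operator()(const sstable_like& a, const sstable_like& b) const {
        return a.generation < b.generation;
    }
};
struct by_gen_rev {
    bool operator()(const sstable_like& a, const sstable_like& b) const {
        return b.generation < a.generation;
    }
};

struct dual_index {
    std::set<sstable_like, by_gen> _sstables;
    std::set<sstable_like, by_gen_rev> _sstables_reversed;

    void insert(const sstable_like& sst) {
        auto [it, inserted] = _sstables.insert(sst);
        try {
            _sstables_reversed.insert(sst);
        } catch (...) {
            if (inserted) {
                // Undo the first insertion so the two containers stay consistent.
                _sstables.erase(it);
            }
            throw;
        }
    }
};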

Fixes #10787

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit cd68b04fbf)
2022-07-25 14:21:45 +03:00
Tomasz Grabiec
f10fd1bc12 test: memtable: Make failed_flush_prevents_writes() immune to background merging
Before the change, the test artificially set the soft-pressure
condition, hoping that the background flusher would flush the
memtable. That won't happen if, by the time the background flusher runs,
the LSA region is updated and the soft pressure (which is not really
there) is lifted. Once apply() becomes preemptible, background partition
version merging can lift the soft pressure, making the memtable flush
not occur and making the test fail.

Fix by triggering soft pressure on retries.

Fixes #10801
Refs #10793

(cherry picked from commit 0e78ad50ea)

Closes #10802

(cherry picked from commit 3bec1cc19f)
2022-07-25 14:19:48 +03:00
Tomasz Grabiec
1891f10141 memtable: Fix missing range tombstones during reads under certain rare conditions
There is a bug introduced in e74c3c8 (4.6.0) which makes the memtable
reader skip a range tombstone for a certain pattern of deletions
and under a certain sequence of events.

_rt_stream contains the result of deoverlapping range tombstones which
had the same position, which were sipped from all the versions. The
result of deoverlapping may produce a range tombstone which starts
later, at the same position as a more recent tombstone which has not
been sipped from the partition version yet. If we consume the old
range tombstone from _rt_stream and then refresh the iterators, the
refresh will skip over the newer tombstone.

The fix is to drop the logic which drains _rt_stream so that
_rt_stream is always merged with partition versions.

For the problem to trigger, there have to be multiple MVCC versions
(at least 2) which contain deletions of the following form:

[a, c] @ t0
[a, b) @ t1, [b, d] @ t2

c > b

The proper sequence for such versions is (assuming d > c):

[a, b) @ t1,
[b, d] @ t2

Due to the bug, the reader will produce:

[a, b) @ t1,
[b, c] @ t0

The reader also needs to be preempted right before processing [b, d] @
t2 and iterators need to get invalidated so that
lsa_partition_reader::do_refresh_state() is called and it skips over
[b, d] @ t2. Otherwise, the reader will emit [b, d] @ t2 later. If it
does emit the proper range tombstone, it's possible that it will violate
fragment order in the stream if _rt_stream accumulated remainders
(possible with 3 MVCC versions).

The problem goes away once MVCC versions merge.

Fixes #10913
Fixes #10830

Closes #10914

(cherry picked from commit a6aef60b93)
2022-07-19 19:33:51 +03:00
Pavel Emelyanov
b177dacd36 Update seastar submodule (auto-increase latency goal fixes)
* seastar dbf79189...9a7ba6d5 (3):
  > io: Adjust IO latency goal on fair-queue level
  > reactor: Check IOPS/bandwidth and increase latency goal
  > Revert "io_queue: Auto-increase the io-latency goal"

refs: #10927

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-07-19 13:06:43 +03:00
Yaron Kaikov
283a722923 release: prepare for 5.0.1 2022-07-19 06:39:11 +03:00
Pavel Emelyanov
522d0a81e7 azure_snitch: Do nothing on non-io-cpu
All snitch drivers are supposed to snitch info on some shard and
replicate the dc/rack info across the others. All but azure really do so.
The azure one gets dc/rack on all shards, which is excessive but not
terrible, but when all shards start to replicate their data to all the
others, this may lead to use-after-frees.

fixes: #10494

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
(cherry picked from commit c6d0bc87d0)
2022-07-17 14:13:25 +03:00
Pavel Emelyanov
cd13911db4 Merge 'Scrub compaction: prevent mishandling of range tombstone changes' from Botond
With v2 having individual bounds of range tombstone as separate
fragments, out-of-order fragments become more difficult to handle,
especially in the presence of an active range tombstone.
Scrub in both SKIP and SEGREGATE modes closes the partition on
seeing the first invalid fragment (SEGREGATE re-opens it immediately).
If there is an active range tombstone, scrub now also has to take care
of closing said tombstone when closing the partition. In a normal stream
it could just use the last position-in-partition to create a closing
bound. But when out-of-order fragments are on the table this is not
possible: the closing bound may be found later in the stream, with a
position smaller than that of the current position-in-partition.
To prevent extending range tombstone changes like that, Scrub now aborts
the compaction on the first invalid fragment seen *inside* an active
range tombstone.
Fixing a v2 stream with range tombstone changes is definitely possible,
but non-trivial, so we defer it until there is demand for it.

This series also makes the mutation fragment stream validator check for
open range tombstones on partition-end and adds a comprehensive
test-suite for the validator.

Fixes: #10168

Tests: unit(dev)

* scrub-rtc-handling-fix/v2 of github.com/denesb/scylla.git:
  compaction/compaction: abort scrub when attempting to rectify stream with active tombstone
  test/boost/mutation_test: add test for mutation_fragment_stream_validator
  mutation_fragment_stream_validator: validate range tombstone changes

(cherry picked from commit edd0481b38)
2022-07-14 18:49:13 +03:00
Nadav Har'El
32423ebc38 Merge 'Handle errors during snapshot' from Benny Halevy
This series refactors `table::snapshot` and moves the responsibility
to flush the table before taking the snapshot to the caller.

`flush_on_all` and `snapshot_on_all` helpers are added to replica::database
(by making it a peering_sharded_service) and upper layers,
including api and snapshot-ctl now call it instead of calling cf.snapshot directly.

With that, errors are handled in table::snapshot and propagated
back to the callers.

Failure to allocate the `snapshot_manager` object is fatal,
similar to failure to allocate a continuation, since we can't
coordinate across the shards without it.

Test: unit(dev), rest_api(debug)

* github.com:scylladb/scylla:
  table: snapshot: handle errors
  table: snapshot: get rid of skip_flush param
  database: truncate: skip flush when taking snapshot
  test: rest_api: storage_service: verify_snapshot_details: add truncate
  database: snapshot_on_all: flush before snapshot if needed
  table: make snapshot method private
  database: add snapshot_on_all
  snapshot-ctl: run_snapshot_modify_operation: reject views and secondary index using the schema
  snapshot-ctl: refactor and coroutinize take_snapshot / take_column_family_snapshot
  api: storage_service: increase visibility of snapshot ops in the log
  api: storage_service: coroutinize take_snapshot and del_snapshot
  api: storage_service: take_snapshot: improve api help messages
  test: rest_api: storage_service: add test_storage_service_snapshot
  database: add flush_on_all variants
  test: rest_api: add test_storage_service_flush

(cherry picked from commit 2c39c4c284)
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #10975
2022-07-12 15:24:24 +03:00
Pavel Emelyanov
97054ee691 view: Fix trace-state pointer use after move
It's moved into .mutate_locally() but it is also captured and used in its
continuation. This happens to work only because the moved-from pointer looks
like nullptr and all the tracing code checks that it is non-null.
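
For illustration, a minimal standalone sketch of the safe pattern (plain C++ with hypothetical stand-ins, not the actual Scylla code or fix):

```
#include <functional>
#include <iostream>
#include <memory>
#include <string>

// Stand-in for tracing::trace_state_ptr; the real type is a Seastar smart pointer.
using trace_state_ptr = std::shared_ptr<std::string>;

// Stand-in for mutate_locally(): consumes the pointer, then runs the continuation.
void mutate_locally(trace_state_ptr tr, const std::function<void()>& continuation) {
    continuation();
}

int main() {
    trace_state_ptr tr = std::make_shared<std::string>("trace session");

    // Safe pattern: copy the pointer for the continuation *before* moving it
    // into the call. The buggy pattern captured `tr` and read the moved-from
    // pointer, which merely happens to look like nullptr.
    auto tr_for_continuation = tr;
    mutate_locally(std::move(tr), [tr_for_continuation] {
        if (tr_for_continuation) {
            std::cout << *tr_for_continuation << "\n";
        }
    });
}
```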

tests: https://jenkins.scylladb.com/job/releng/job/Scylla-CI/1266/
       (CI job failed on post-actions thus it's red)

Fixes #11015

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20220711134152.30346-1-xemul@scylladb.com>
(cherry picked from commit 5526738794)
2022-07-12 14:20:57 +03:00
Piotr Sarna
34085c364f view: exclude using static columns in the view filter
The code which applied view filtering (i.e. a condition placed
on a view column, e.g. "WHERE v = 42") erroneously used a wildcard
selection, which also assumes that static columns are needed,
if the base table contains any such columns.
The filtering code currently assumes that no such columns are fetched,
so the selection is amended to only ask for regular columns
(primary key columns are sent anyway, because they are enabled
via slice options, so no need to ask for them explicitly).

Fixes #10851

Closes #10855

(cherry picked from commit bc3a635c42)
2022-07-11 17:06:55 +03:00
Takuya ASADA
323521f4c8 install.sh: install files with correct permission in strict umask setting
To avoid failing to run scripts as a non-root user, we need to set
permissions explicitly on executables.

Fixes #10752

Closes #10840

(cherry picked from commit 13caac7ae6)
2022-07-10 16:46:30 +03:00
Asias He
1ad59d6a7b repair: Do not flush hints and batchlog if tombstone_gc_mode is not repair
The flush of hints and batchlog is needed only for tables with
tombstone_gc_mode set to repair mode. We should skip the flush if the
tombstone_gc_mode is not repair mode.

Fixes #10004

Closes #10124

(cherry picked from commit ec59f7a079)
2022-07-04 10:31:51 +03:00
Nadav Har'El
d3045df9c9 Merge 'types: fix is_string for reversed types' from Piotr Sarna
Checking if the type is string is subtly broken for reversed types,
and these types will not be recognized as strings, even though they are.
As a result, if somebody creates a column with DESC order and then
tries to use operator LIKE on it, it will fail because the type
would not be recognized as a string.
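
A minimal sketch of the shape of the fix, using hypothetical stand-in types rather than Scylla's abstract_type hierarchy: classify the underlying type, not the reversed wrapper.

```
#include <cassert>

// Hypothetical stand-ins for the type system; only the shape of the fix matters.
struct abstract_type {
    virtual ~abstract_type() = default;
    virtual bool is_string_kind() const { return false; }             // this node only
    virtual const abstract_type& underlying_type() const { return *this; }
};

struct utf8_type final : abstract_type {
    bool is_string_kind() const override { return true; }
};

struct reversed_type final : abstract_type {
    const abstract_type& inner;
    explicit reversed_type(const abstract_type& t) : inner(t) {}
    const abstract_type& underlying_type() const override { return inner.underlying_type(); }
};

// The gist of the fix: look through the reversed wrapper before classifying.
bool is_string(const abstract_type& t) {
    return t.underlying_type().is_string_kind();
}

int main() {
    utf8_type text;
    reversed_type desc_text(text);   // a column declared with DESC order
    assert(is_string(text));
    assert(is_string(desc_text));    // failed before the fix
}
```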

Fixes #10183

Closes #10181

* github.com:scylladb/scylla:
  test: add a case for LIKE operator on a descending order column
  types: fix is_string for reversed types

(cherry picked from commit 733672fc54)
2022-07-03 17:59:33 +03:00
Benny Halevy
be48b7aa8b compaction_manager: perform_offstrategy: run_offstrategy_compaction in maintenance scheduling group
It was assumed that offstrategy compaction is always triggered by streaming/repair
where it would inherit the caller's scheduling group.

However, offstrategy is triggered by a timer via table::_off_strategy_trigger so I don't see
how the expiration of this timer will inherit anything from streaming/repair.

Also, since d309a86, offstrategy compaction
may be triggered by the api where it will run in the default scheduling group.

The bottom line is that the compaction manager needs to explicitly perform offstrategy compaction
in the maintenance scheduling group similar to `perform_sstable_scrub_validate_mode`.

Fixes #10151

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20220302084821.2239706-1-bhalevy@scylladb.com>
(cherry picked from commit 0764e511bb)
2022-07-03 14:28:47 +03:00
Takuya ASADA
3c4688bcfa scylla_coredump_setup: support new format of Storage field
The Storage field of "coredumpctl info" changed in systemd v248: it now appends
"(present)" at the end of the line when the coredump file is available.

Fixes #10669

Closes #10714

(cherry picked from commit ad2344a864)
2022-07-03 13:55:18 +03:00
Nadav Har'El
cc22021876 alternator: forbid empty AttributesToGet
In DynamoDB one can retrieve only a subset of the attributes using the
AttributesToGet or ProjectionExpression parameters to read requests.
Neither allows an empty list of attributes - if you don't want any
attributes, you should use Select=COUNT instead.

Currently we correctly refuse an empty ProjectionExpression - and have
a test for it:
test_projection_expression.py::test_projection_expression_toplevel_syntax

However, Alternator is missing the same empty-forbidding logic for
AttributesToGet. An empty AttributesToGet is currently allowed, and
basically says "retrieve everything", which is sort of unexpected.

So this patch adds the missing logic, and the missing test (actually
two tests for the same thing - one using GetItem and the other Query).
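
A sketch of the added check in plain C++, with a hypothetical request struct standing in for the parsed rjson request:

```
#include <optional>
#include <stdexcept>
#include <string>
#include <vector>

// Hypothetical request representation; the real code inspects the rjson request.
struct read_request {
    std::optional<std::vector<std::string>> attributes_to_get;
};

void validate_attributes_to_get(const read_request& req) {
    // An empty AttributesToGet is rejected; "no attributes" must be expressed
    // with Select=COUNT instead.
    if (req.attributes_to_get && req.attributes_to_get->empty()) {
        throw std::invalid_argument("ValidationException: AttributesToGet must not be empty");
    }
}

int main() {
    read_request absent{};
    read_request filled{std::vector<std::string>{"a", "b"}};
    read_request empty{std::vector<std::string>{}};

    validate_attributes_to_get(absent);    // no AttributesToGet at all: fine
    validate_attributes_to_get(filled);    // non-empty list: fine
    try {
        validate_attributes_to_get(empty); // empty list: rejected
    } catch (const std::invalid_argument&) {
    }
}
```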

Fixes #10332

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20220405113700.9768-1-nyh@scylladb.com>
(cherry picked from commit 9c1ebdceea)
2022-07-03 13:35:50 +03:00
Yaron Kaikov
c9e79cb4a3 release: prepare for 5.0.0 2022-06-28 15:51:29 +03:00
Yaron Kaikov
f28542a71e release: prepare for 5.0.rc8 2022-06-12 14:44:47 +03:00
Pavel Emelyanov
527a75a4c0 Update seastar submodule (Calculate max IO lengths as lengths)
* seastar 8b2c13b3...dbf79189 (1):
  > Merge 'Calculate max IO lengths as lengths'
     io_queue: Type alias for internal::io_direction_and_length
     io_queue, fair_group: Throw instead of assert
     io_queue: Keep max lengths on board
     io_queue: Toss request_fq_ticket()
     io_queue: Introduce make_ticket() helper
     io_queue: Remove max_ticket_size
     io_queue: Make make_ticket() non-brancy
     io_queue: Add devid to group creation log

tests: cstress(release)
fixes: #10704
2022-06-09 21:15:21 +03:00
Avi Kivity
df00f8fcfb Update seastar submodule (json crash in describe_ring)
* seastar 7a430a0830...8b2c13b346 (1):
  > Merge 'stream_range_as_array: always close output stream' from Benny Halevy

Fixes #10592.
2022-06-08 16:48:28 +03:00
Yaron Kaikov
41a00c744f release: prepare for 5.0.rc7 2022-06-02 15:13:59 +03:00
Avi Kivity
2d7b6cd702 messaging: do isolate default tenants
In 10dd08c9 ("messaging_service: supply and interpret rpc isolation_cookies",
4.2), we added a mechanism to perform rpc calls in remote scheduling groups
based on the connection identity (rather than the verb), so that
connection processing itself can run in the correct group (not just
verb processing), and so that one verb can run in different groups according
to need.

In 16d8cdadc ("messaging_service: introduce the tenant concept", 4.2), we
changed the way isolation cookies are sent:

 scheduling_group
 messaging_service::scheduling_group_for_verb(messaging_verb verb) const {
     return _scheduling_info_for_connection_index[get_rpc_client_idx(verb)].sched_group;
@@ -665,11 +694,14 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
     if (must_compress) {
         opts.compressor_factory = &compressor_factory;
     }
     opts.tcp_nodelay = must_tcp_nodelay;
     opts.reuseaddr = true;
-    opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;
+    // We send cookies only for non-default statement tenant clients.
+    if (idx > 3) {
+        opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;
+    }

This effectively disables the mechanism for the default tenant. As a
result some verbs will be executed in whatever group the messaging
service listener was started in. This used to be the main group,
but in 554ab03 ("main: Run init_server and join_cluster inside
maintenance scheduling group", 4.5), this was changed to the maintenance
group. As a result normal read/writes now compete with maintenance
operations, raising their latency significantly.

Fix by sending the isolation cookie for all connections. With this,
a 2-node cassandra-stress load sees its 99th percentile latency increase by just
3ms during repair, compared to 10ms+ before.

Fixes #9505.

Closes #10673

(cherry picked from commit c83393e819)
2022-06-01 17:20:30 +03:00
Avi Kivity
ff79228178 Merge 'Allow trigger off strategy compaction early for node operations' from Asias He
This patch set adds two commits to allow triggering off-strategy compaction early for node operations.

*) repair: Repair table by table internally

This patch changes the way a repair job walks through tables and ranges
if multiple tables and ranges are requested by users.

Before:

```
for range in ranges
   for table in tables
       repair(range, table)
```

After:

```
for table in tables
    for range in ranges
       repair(range, table)
```

The motivation for this change is to allow off-strategy compaction to trigger
early, as soon as a table is finished. This reduces the number of
temporary sstables on disk. For example, if there are 50 tables and 256 ranges
to repair, each range will generate one sstable. Before this change, there will
be 50 * 256 sstables on disk before off-strategy compaction triggers. After this
change, once a table is finished, off-strategy compaction can compact the 256
sstables. As a result, this would reduce the number of sstables by 50X.

This is very useful for repair based node operations since multiple ranges and
tables can be requested in a single repair job.

Refs: #10462

*) repair: Trigger off strategy compaction after all ranges of a table is repaired

When the repair reason is not repair, i.e. the repair was triggered by a
node operation (bootstrap, replace and so on), a single repair job contains all
the ranges of a table that need to be repaired.

To trigger off strategy compaction early and reduce the number of
temporary sstable files on disk, we can trigger the compaction as soon
as a table is finished.

Refs: #10462

Closes #10551

* github.com:scylladb/scylla:
  repair: Trigger off strategy compaction after all ranges of a table is repaired
  repair: Repair table by table internally

(cherry picked from commit e65b3ed50a)
2022-06-01 14:17:01 +03:00
Nadav Har'El
1803124cc6 alternator: allow DescribeTimeToLive even without TTL enabled
We still consider the TTL support in Alternator to be experimental, so we
don't want to allow a user to enable TTL on a table without turning on a
"--experimental-features" flag. However, there is no reason not to allow
the DescribeTimeToLive call when this experimental flag is off - this call
would simply reply with the truth - that the TTL feature is disabled for
the table!

This is important for client code (such as the Terraform module
described in issue #10660) which uses DescribeTimeToLive for
information, even when it never intends to actually enable TTL.

The patch is trivial - we simply remove the flag check in
DescribeTimeToLive, the code works just as before.

After this patch, the following test now works on Scylla without
experimental flags turned on:

    test/alternator/run test_ttl.py::test_describe_ttl_without_ttl

Refs #10660

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
(cherry picked from commit 8ecf1e306f)
2022-05-30 20:08:41 +03:00
Takuya ASADA
6fcbf66bfb scylla_sysconfig_setup: handle >=32CPUs correctly
Seems like 59adf05 has a bug: the regex pattern only handles the first
32 CPUs of the cpuset and ignores the rest.
We should extend the regex pattern to handle all CPUs.

Fixes #10523

Closes #10524

(cherry picked from commit a9dfe5a8f4)
2022-05-30 14:27:27 +03:00
Takuya ASADA
e9a3dee234 scylla_sysconfig_setup: avoid parse error on perftune.py --get-cpu-mask
Currently, we just pass the entire output of perftune.py when getting the CPU
mask from the script, but this may cause a parse error since the script may
also print warning messages.

To avoid that, we need to extract the CPU mask from the output.

Fixes #10082

Closes #10107

(cherry picked from commit 59adf05951)
2022-05-30 14:25:21 +03:00
Avi Kivity
279cd44c7f Update seastar submodule (xfs project attribute zeroed)
* seastar 6745a43c10...7a430a0830 (1):
  > file: don't trample on xfs flags when setting xfs size hint

Fixes #10667.
2022-05-29 17:43:43 +03:00
Avi Kivity
c99f768381 Merge 'Rework off strategy compaction locking for branch 5.0' from Raphael "Raph" Carvalho
The first patch removes an incorrect usage of the rwlock, which should be restricted to minor and major compaction tasks.

The second patch revives a semaphore which was lost in 6737c88045, as we want major compaction not to wait on off-strategy completion before deciding whether or not it should proceed with execution. It wouldn't proceed with execution if the user asked major to stop while it was waiting for a chance to run.

For master, we're going to rely on abortable variant of get_units() to allow major to be quickly aborted.

Fixes #10485.

Closes #10582

* github.com:scylladb/scylla:
  compaction_manager: Revive custom job semaphore
  compaction_manager: Remove rwlock usage in run_custom_job()
2022-05-29 17:38:01 +03:00
Tomasz Grabiec
89a540d54a sstable: partition_index_cache: Fix abort on bad_alloc during page loading
When entry loading fails and there is another request blocked on the
same page, an attempt to erase the failed entry will abort because that
would violate the entry_ptr guarantees, which are supposed to keep the
entry alive.

The fix in 92727ac36c was incomplete. It
only helped for the case of a single loader. This patch takes a more
general approach by relaxing the assert.

The assert manifested like this:

scylla: ./sstables/partition_index_cache.hh:71: sstables::partition_index_cache::entry::~entry(): Assertion `!is_referenced()' failed.

Fixes #10617

Closes #10653

(cherry picked from commit f87274f66a)
2022-05-27 09:50:32 +03:00
Yaron Kaikov
338edcc02e release: prepare for 5.0.rc6 2022-05-23 11:37:37 +03:00
Avi Kivity
a8eb5164b2 Update seastar submodule (io_queue delay metrics in 25ms granularity)
* seastar 4a30c44c4c...6745a43c10 (1):
  > metrics: Report IO total times as real numbers

Ref #10392
2022-05-19 18:20:15 +03:00
Raphael S. Carvalho
9accb44f9c compaction_manager: Revive custom job semaphore
In commit 6737c88045, we started using a single semaphore for
maintenance operations, which is a good change.

However, after introduction of off-strategy, major cannot proceed
until off-strategy is done reshaping all its input files.

If user requests major to abort, the command will only return
once off-strategy is done, and that can take lots of time.

In master, we'll allow pending major to be quickly aborted, but
that's not possible here as abortable variant of get_units()
is not available yet.

Here, we'll allow major to proceed in parallel to off-strategy,
so major can decide whether or not it should run in parallel.

Fixes #10485.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-05-16 20:46:31 -03:00
Raphael S. Carvalho
8878007106 compaction_manager: Remove rwlock usage in run_custom_job()
The rwlock usage was introduced in 2017 commit 10eaa2339e.

Resharding was online back then and we wanted to serialize it with
major.

Rwlock usage should be restricted to major and minor, as clearly
stated in the documentation, but we're still using it in
run_custom_job().

It gains us nothing, it only prevents off-strategy and other
custom jobs from running concurrently to major.

Let's kill this as we want to allow off-strategy to not prevent
a major from happening in parallel, as the former works only
on the maintenance sstable set and won't interfere with
the latter.

Refs #10485.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2022-05-16 20:45:54 -03:00
Yaron Kaikov
9da666e778 release: prepare for 5.0.rc5 2022-05-15 22:09:16 +03:00
Benny Halevy
aca355dec1 table: clear: serialize with ongoing flush
Get all flush permits to serialize with any
ongoing flushes and to prevent further flushes
during table::clear, in particular while calling
discard_completed_segments for every table and
clearing the memtables in clear_and_add.

Fixes #10423

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit aae532a96b)
2022-05-15 13:39:03 +03:00
Raphael S. Carvalho
efbb2efd3f compaction: LCS: don't write to disengaged optional on compaction completion
Dtest triggers the problem by:
1) creating table with LCS
2) disabling regular compaction
3) writing a few sstables
4) running maintenance compaction, e.g. cleanup

Once the maintenance compaction completes, disengaged optional _last_compacted_keys
triggers an exception in notify_completion().

_last_compacted_keys is used by regular compaction for its round-robin file
picking policy. It stores the last compacted key for each level, meaning it's
irrelevant for any other compaction type.

Regular compaction is responsible for initializing it when it runs for
the first time to pick files. But with it disabled, notify_completion()
will find it uninitialized, therefore resulting in bad_optional_access.

To fix this, the procedure is skipped if _last_compacted_keys is
disengaged. Regular compaction, once re-enabled, will be able to
fill _last_compacted_keys by looking at metadata of the files.
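
A tiny sketch of the guard, with std::optional standing in for the strategy's _last_compacted_keys state (illustrative, not the actual class):

```
#include <map>
#include <optional>
#include <string>

struct lcs_state {
    std::optional<std::map<int, std::string>> last_compacted_keys;

    void notify_completion(int level, std::string last_key) {
        if (!last_compacted_keys) {
            // Regular compaction has not initialized the round-robin state yet
            // (e.g. only a maintenance compaction such as cleanup ran), so there
            // is nothing to update; dereferencing would throw bad_optional_access.
            return;
        }
        (*last_compacted_keys)[level] = std::move(last_key);
    }
};

int main() {
    lcs_state s;
    s.notify_completion(1, "pk17"); // safe no-op before regular compaction initializes the state
}
```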

compaction_test.py::TestCompaction::test_disable_autocompaction_doesnt_
block_user_initiated_compactions[CLEANUP-LeveledCompactionStrategy]
now passes.

Fixes #10378.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #10508

(cherry picked from commit 8e99d3912e)
2022-05-15 13:20:11 +03:00
Eliran Sinvani
44dc5c4a1d Revert "table: disable_auto_compaction: stop ongoing compactions"
This reverts commit 4affa801a5.
In issue #10146 a write throughput drop of ~50% was reported. After
bisecting, it was found that the change that caused it was adding some
code to table::disable_auto_compaction which stops ongoing
compactions and returns a future that resolves once all the compaction
tasks for a table, if any, have terminated. It turns out that this function
is used only at startup (and in REST API calls, which are not used in the test)
in the distributed loader just before resharding and loading of
the sstable data. Auto compaction is then re-enabled after the resharding
and loading is done.
For a still unknown reason, adding the extra logic of stopping ongoing
compactions made the write throughput drop to 50%.
Strangely enough this extra logic **should** (still unvalidated) not
have any side effects since no compactions for a table are supposed to
be running prior to loading it.
This regains the performance but also undoes a change which eventually
should get in once we find the actual culprit.

Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>

Closes #10559

Reopens #9313.

(cherry picked from commit 8e8dc2c930)
2022-05-15 08:50:38 +03:00
Juliusz Stasiewicz
6b34ba3a4f CQL: Replace assert by exception on invalid auth opcode
One user observed this assertion fail, but it's an extremely rare event.
The root cause - interlacing of processing STARTUP and OPTIONS messages -
is still there, but now it's harmless enough to leave it as is.

Fixes #10487

Closes #10503

(cherry picked from commit 603dd72f9e)
2022-05-10 14:04:52 +02:00
Yaron Kaikov
f1e25cb4a6 release: prepare for 5.0.rc4 2022-05-10 07:35:53 +03:00
Benny Halevy
c9798746ae compaction: time_window_compaction_strategy: reset estimated_remaining_tasks when running out of candidates
_estimated_remaining_tasks gets updated via get_next_non_expired_sstables ->
get_compaction_candidates, but otherwise if we return earlier from
get_sstables_for_compaction, it does not get updated and may go out of sync.

Refs #10418
(to be closed when the fix reaches branch-4.6)

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #10419

(cherry picked from commit 01f41630a5)
2022-05-09 09:35:53 +03:00
Eliran Sinvani
7f70ffc5ce prepared_statements: Invalidate batch statement too
It seems that batch prepared statements always return false for
depends_on; this in turn renders the removal criteria of the
prepared statements cache always false, which results in these
queries never being evicted.
Here we change the function to return the true state, meaning
it will return true if one of the sub-queries is dependent
upon the keyspace and/or column family.
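
A minimal sketch of the change in shape (hypothetical classes, not the actual cql3 statement hierarchy):

```
#include <algorithm>
#include <memory>
#include <string>
#include <vector>

// Illustrative shapes only; the real classes live in cql3/statements.
struct statement {
    virtual ~statement() = default;
    virtual bool depends_on(const std::string& ks, const std::string& cf) const = 0;
};

struct modification_statement final : statement {
    std::string keyspace, column_family;
    bool depends_on(const std::string& ks, const std::string& cf) const override {
        return keyspace == ks && (cf.empty() || column_family == cf);
    }
};

struct batch_statement final : statement {
    std::vector<std::unique_ptr<statement>> statements;
    // Before the fix this returned false unconditionally, so batches were never
    // evicted from the prepared-statement cache on keyspace/table changes.
    bool depends_on(const std::string& ks, const std::string& cf) const override {
        return std::any_of(statements.begin(), statements.end(),
                           [&] (const auto& s) { return s->depends_on(ks, cf); });
    }
};

int main() {
    auto m = std::make_unique<modification_statement>();
    m->keyspace = "ks1";
    m->column_family = "table1";

    batch_statement batch;
    batch.statements.push_back(std::move(m));
    return batch.depends_on("ks1", "table1") ? 0 : 1; // true after the fix
}
```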

Fixes #10129

Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>
(cherry picked from commit 4eb0398457)
2022-05-08 12:31:42 +03:00
Eliran Sinvani
551636ec89 cql3 statements: Change dependency test API to better express its
purpose

CQL statements used to have two API functions, depends_on_keyspace and
depends_on_column_family. The former took as a parameter only a table
name, which makes no sense: there could be multiple tables with the same
name, each in a different keyspace, and it doesn't make sense to
generalize the test - i.e. to ask "Does a statement depend on any table
named XXX?"
In this change we unify the two calls into one - depends_on - that takes a
keyspace name and optionally also a table name; that way every logical
dependency test that makes sense is supported by a single API call.

(cherry picked from commit bf50dbd35b)

Ref #10129
2022-05-08 12:31:02 +03:00
Raphael S. Carvalho
e1130a01e7 table: Close reader if flush fails to peek into fragment
An OOM failure while peeking into a fragment, to determine if the reader will
produce any fragments, causes Scylla to abort, as flat_mutation_reader
expects the reader to be closed before it is destroyed. Let's close it if
peek() fails, to handle the scenario more gracefully.

Fixes #10027.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20220204031553.124848-1-raphaelsc@scylladb.com>
(cherry picked from commit 755cec1199)
2022-05-08 12:16:15 +03:00
Calle Wilund
b0233cb7c5 cdc: Ensure columns removed from log table are registered as dropped
If we are redefining the log table, we need to ensure any dropped
columns are registered in "dropped_columns" table, otherwise clients will not
be able to read data older than now.
Includes unit test.

Should probably be backported to all CDC enabled versions.

Fixes #10473
Closes #10474

(cherry picked from commit 78350a7e1b)
2022-05-05 11:38:18 +02:00
Avi Kivity
e480c5bf4d Merge 'loading_cache: force minimum size of unprivileged ' from Piotr Grabowski
This series enforces a minimum size of the unprivileged section when
performing `shrink()` operation.

When the cache is shrunk, we still drop entries first from unprivileged
section (as before this commit), however, if this section is already small
(smaller than `max_size / 2`), we will drop entries from the privileged
section.

This is necessary, as before this change the unprivileged section could
be starved. For example if the cache could store at most 50 entries and
there are 49 entries in privileged section, after adding 5 entries (that would
go to unprivileged section) 4 of them would get evicted and only the 5th one
would stay. This caused problems with BATCH statements where all
prepared statements in the batch have to stay in cache at the same time
for the batch to correctly execute.

To correctly check if the unprivileged section might get too small after
dropping an entry, the `_current_size` variable, which tracked the overall size
of the cache, is split into two variables: `_unprivileged_section_size` and
`_privileged_section_size`, tracking the section sizes separately.
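
A toy model of the shrink policy, tracking only the two section sizes (not the real loading_cache, and the threshold rule is simplified):

```
#include <cassert>
#include <cstddef>

struct two_section_cache {
    std::size_t max_size;
    std::size_t privileged_size = 0;
    std::size_t unprivileged_size = 0;

    // Drop one entry, but never starve the unprivileged section below
    // max_size / 2 while the privileged section still has entries to give up.
    void drop_one() {
        bool protect_unprivileged = unprivileged_size <= max_size / 2;
        if (privileged_size > 0 && (unprivileged_size == 0 || protect_unprivileged)) {
            --privileged_size;
        } else {
            --unprivileged_size;
        }
    }

    void shrink() {
        while (privileged_size + unprivileged_size > max_size) {
            drop_one();
        }
    }
};

int main() {
    // The example from the description: 49 privileged entries, 5 new ones.
    two_section_cache c{.max_size = 50, .privileged_size = 49, .unprivileged_size = 5};
    c.shrink();
    // With the minimum-size rule, the 5 freshly inserted (unprivileged) entries
    // survive and the privileged section shrinks instead.
    assert(c.unprivileged_size == 5);
    assert(c.privileged_size + c.unprivileged_size <= c.max_size);
}
```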

New tests are added to check this new behavior and bookkeeping of the section
sizes. A test is added that sets up a CQL environment with a very small
prepared statement cache, reproduces the issue in #10440, and stresses the cache.

Fixes #10440.

Closes #10456

* github.com:scylladb/scylla:
  loading_cache_test: test prepared stmts cache
  loading_cache: force minimum size of unprivileged
  loading_cache: extract dropping entries to lambdas
  loading_cache: separately track size of sections
  loading_cache: fix typo in 'privileged'

(cherry picked from commit 5169ce40ef)
2022-05-04 14:35:53 +03:00
Tomasz Grabiec
7d90f7e93f loading_cache: Make invalidation take immediate effect
There are two issues with current implementation of remove/remove_if:

  1) If it happens concurrently with get_ptr(), the latter may still
  populate the cache using value obtained from before remove() was
  called. remove() is used to invalidate caches, e.g. the prepared
  statements cache, and the expected semantic is that values
  calculated from before remove() should not be present in the cache
  after invalidation.

  2) As long as there is any active pointer to the cached value
  (obtained by get_ptr()), the old value from before remove() will be
  still accessible and returned by get_ptr(). This can make remove()
  have no effect indefinitely if there is persistent use of the cache.

One of the user-perceived effects of this bug is that some prepared
statements may not get invalidated after a schema change and still use
the old schema (until next invalidation). If the schema change was
modifying UDT, this can cause statement execution failures. CQL
coordinator will try to interpret bound values using the old set of
fields. If the driver uses the new schema, the coordinator will fail
to process the value with the following exception:

  User Defined Type value contained too many fields (expected 5, got 6)

The patch fixes the problem by making remove()/remove_if() erase old
entries from _loading_values immediately.

The predicate-based remove_if() variant has to also invalidate values
which are concurrently loading to be safe. The predicate cannot be
evaluated on values which are not ready. This may invalidate some
values unnecessarily, but I think it's fine.

Fixes #10117

Message-Id: <20220309135902.261734-1-tgrabiec@scylladb.com>
(cherry picked from commit 8fa704972f)
2022-05-04 14:35:37 +03:00
Avi Kivity
3e6e8579c6 loading_cache: fix indentation of timestamped_val and two nested type aliases
timestamped_val (and two other type aliases) are nested inside loading_cache,
but indented as if they were top-level names. Adjust the indent to
avoid confusion.

Closes #10118

(cherry picked from commit d1a394fd97)

Ref #10117 - backport prerequisite
2022-05-04 14:35:15 +03:00
Avi Kivity
3e98e17d18 Merge 'replica/database: drop_column_family(): properly cleanup stale querier cache entries' from Botond Dénes
Said method has to evict all querier cache entries belonging to the to-be-dropped table. This is already the case, but there was a window where new entries could sneak in, causing a stale reference to the table to be de-referenced later when they are evicted due to TTL. This window is now closed: the entries are evicted after the method has waited for all ongoing operations on said table to stop.

Fixes: #10450

Closes #10451

* github.com:scylladb/scylla:
  replica/database: drop_column_family(): drop querier cache entries after waiting for ops
  replica/database: finish coroutinizing drop_column_family()
  replica/database: make remove(const column_family&) private

(cherry picked from commit 7f1e368e92)
2022-05-01 17:22:57 +03:00
Avi Kivity
a214f8cf6e Update tools/java submodule (bad IPv6 addresses in nodetool)
* tools/java b1e09c8b8f...2241a63bda (1):
  > CASSANDRA-17581 fix NodeProbe: Malformed IPv6 address at index

Fixes #10442
2022-04-28 11:33:15 +03:00
Benny Halevy
e8b92fe34d replica: distributed_database: populate_column_family: trigger offstrategy compaction only for the base directory
In https://github.com/scylladb/scylla/issues/10218
we see off-strategy compaction happening on a table
during the initial phases of
`distributed_loader::populate_column_family`.

It is caused by triggering offstrategy compaction
too early, when sstables are populated from the staging
directory in a144d30162.

We need to trigger offstrategy compaction only of the base
table directory, never the staging or quarantine dirs.

Fixes #10218

Test: unit(dev)
DTest: materialized_views_test.py::TestInterruptBuildProcess

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20220316152812.3344634-1-bhalevy@scylladb.com>
(cherry picked from commit a1d0f089c8)
2022-04-24 17:38:53 +03:00
Nadav Har'El
fa479c84ac config: fix some types in system.config virtual table
The system.config virtual table prints each configuration variable of
type T based on the JSON printer specified in config_type_for<T>
in db/config.cc.

For two variable types - experimental_features and tri_mode_restriction,
the specified converter was wrong: We used value_to_json<string> or
value_to_json<vector<string>> on something which was *not* a string.
Unfortunately, value_to_json silently cast the given objects into
strings, and the result was garbage: for example, as noted in #10047,
for experimental_features instead of printing a list of features *names*,
e.g., "raft", we got a bizarre list of one-byte strings with each feature's
number (which isn't documented or even guaranteed to not change) as well
as carriage-return characters (!?).

The solution is a new printable_to_json<T> which works on a type T that
can be printed with operator<< - as in fact the above two types can be -
and converts the value into a string or vector of strings using this
operator<<, not a cast.
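
A minimal sketch of the approach (illustrative names; the real printable_to_json also handles vectors of strings and proper JSON escaping):

```
#include <iostream>
#include <sstream>
#include <string>

// Render the value with operator<< and wrap it as a JSON string, instead of
// casting the object itself to a string type.
template <typename T>
std::string printable_to_json_string(const T& value) {
    std::ostringstream os;
    os << value;
    return "\"" + os.str() + "\""; // the real code would also escape the contents
}

// A hypothetical option type that knows how to print itself.
enum class example_mode { quick, thorough };

std::ostream& operator<<(std::ostream& os, example_mode m) {
    return os << (m == example_mode::quick ? "quick" : "thorough");
}

int main() {
    std::cout << printable_to_json_string(example_mode::thorough) << "\n"; // "thorough"
}
```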

Also added a cql-pytest test for reading system.config and in particular
options of the above two types - checking that they contain sensible
strings and not "garbage" like before this patch.

Fixes #10047.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20220209090421.298849-1-nyh@scylladb.com>
(cherry picked from commit fef7934a2d)
2022-04-14 19:29:08 +03:00
Tomasz Grabiec
40c26dd2c5 utils/chunked_managed_vector: Fix sigsegv during reserve()
Fixes the case of make_room() invoked with last_chunk_capacity_deficit
but _size not in the last reserved chunk.

Found during code review, no user impact.

Fixes #10364.

Message-Id: <20220411224741.644113-1-tgrabiec@scylladb.com>
(cherry picked from commit 0c365818c3)
2022-04-13 09:48:34 +03:00
Tomasz Grabiec
2c6f069fd1 utils/chunked_vector: Fix sigsegv during reserve()
Fixes the case of make_room() invoked with last_chunk_capacity_deficit
but _size not in the last reserved chunk.

Found during code review, no known user impact.

Fixes #10363.

Message-Id: <20220411222605.641614-1-tgrabiec@scylladb.com>
(cherry picked from commit 01eeb33c6e)
2022-04-13 09:47:24 +03:00
Avi Kivity
e27dff0c50 transport: return correct error codes when downgrading v4 {WRITE,READ}_FAILURE to {WRITE,READ}_TIMEOUT
Protocol v4 added WRITE_FAILURE and READ_FAILURE. When running under v3
we downgrade these exceptions to WRITE_TIMEOUT and READ_TIMEOUT (since
the client won't understand the v4 errors), but we still send the new
error codes. This causes the client to become confused.

Fix by updating the error codes.
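
The gist of the fix, sketched as standalone C++ with the error codes defined by the CQL native protocol (an illustration, not the Scylla transport code):

```
#include <cstdint>

// Error codes as defined by the CQL native protocol specification.
enum class error_code : int32_t {
    write_timeout = 0x1100,
    read_timeout  = 0x1200,
    read_failure  = 0x1300,
    write_failure = 0x1500,
};

// When talking protocol v3, map the v4-only failure codes onto their timeout
// counterparts, including the numeric code sent on the wire (previously only
// the exception type changed, not the code).
error_code downgrade_for_v3(error_code ec) {
    switch (ec) {
    case error_code::write_failure: return error_code::write_timeout;
    case error_code::read_failure:  return error_code::read_timeout;
    default:                        return ec;
    }
}

int main() {
    return downgrade_for_v3(error_code::write_failure) == error_code::write_timeout ? 0 : 1;
}
```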

A better fix is to move the error code from the constructor parameter
list and hard-code it in the constructor, but that is left for a follow-up
after this minimal fix.

Fixes #5610.

Closes #10362

(cherry picked from commit 987e6533d2)
2022-04-13 09:47:24 +03:00
Tomasz Grabiec
3f03260ffb utils/chunked_managed_vector: Fix corruption in case there is more than one chunk
If reserve() allocates more than one chunk, push_back() should not
work with the last chunk. This can result in items being pushed to the
wrong chunk, breaking internal invariants.

Also, pop_back() should not work with the last chunk. This breaks when
there is more than one chunk.

Currently, the container is only used in the sstable partition index
cache.

Manifests as crashes in the sstable reader when it touches sstables that have
partition index pages with more than 1638 partition entries.
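
A toy illustration of the invariant (not the real chunked_vector, which has more bookkeeping): after reserve() pre-allocates several chunks, push_back() must address the chunk derived from the element count, not the last allocated chunk.

```
#include <cassert>
#include <cstddef>
#include <memory>
#include <vector>

template <typename T, std::size_t ChunkSize = 4>
class toy_chunked_vector {
    std::vector<std::unique_ptr<T[]>> _chunks;
    std::size_t _size = 0;
public:
    void reserve(std::size_t n) {
        while (_chunks.size() * ChunkSize < n) {
            _chunks.push_back(std::make_unique<T[]>(ChunkSize));
        }
    }
    void push_back(T v) {
        reserve(_size + 1);
        // Index by element count, not _chunks.back(): back() may be a chunk
        // that reserve() allocated far ahead of the current write position.
        _chunks[_size / ChunkSize][_size % ChunkSize] = std::move(v);
        ++_size;
    }
    T& operator[](std::size_t i) { return _chunks[i / ChunkSize][i % ChunkSize]; }
    std::size_t size() const { return _size; }
};

int main() {
    toy_chunked_vector<int> v;
    v.reserve(16);               // pre-allocates 4 chunks; only chunk 0 is "current"
    for (int i = 0; i < 10; ++i) {
        v.push_back(i);
    }
    for (int i = 0; i < 10; ++i) {
        assert(v[i] == i);       // would fail if push_back wrote to the last chunk
    }
}
```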

Introduced in 78e5b9fd85 (4.6.0)

Fixes #10290

Message-Id: <20220407174023.527059-1-tgrabiec@scylladb.com>
(cherry picked from commit 41fe01ecff)
2022-04-08 10:53:33 +03:00
Takuya ASADA
1315135fca docker: enable --log-to-stdout which was mistakenly disabled
Since our Docker image moved to Ubuntu, we mistakenly copy
dist/docker/etc/sysconfig/scylla-server to /etc/sysconfig, which is not
used in Ubuntu (it should be /etc/default).
So /etc/default/scylla-server is just the default configuration of the
scylla-server .deb package, where --log-to-stdout is 0, same as in a normal installation.

We don't want to keep the duplicated configuration file anyway,
so let's drop dist/docker/etc/sysconfig/scylla-server and configure
/etc/default/scylla-server in build_docker.sh.

Fixes #10270

Closes #10280

(cherry picked from commit bdefea7c82)
2022-04-07 12:13:19 +03:00
Yaron Kaikov
f92622e0de release: prepare for 5.0.rc3 2022-04-06 14:31:03 +03:00
Takuya ASADA
3bca608db5 docker: run scylla as root
Previous versions of the Docker image run scylla as root, but cb19048
accidentally changed it to the scylla user.
To keep compatibility we need to revert this to root.

Fixes #10261

Closes #10325

(cherry picked from commit f95a531407)
2022-04-05 12:46:25 +03:00
Takuya ASADA
a93b72d5dd docker: revert scylla-server.conf service name change
We changed the supervisor service name in cb19048, but this breaks
compatibility with scylla-operator.
To fix the issue we need to revert the service name to the previous one.

Fixes #10269

Closes #10323

(cherry picked from commit 41edc045d9)
2022-04-05 12:40:59 +03:00
Benny Halevy
d58ca2edbd range_tombstone_list: insert_from: correct rev.update range_tombstone in not overlapping case
2nd std::move(start) looks like a typo
in fe2fa3f20d.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20220404124741.1775076-1-bhalevy@scylladb.com>
(cherry picked from commit 2d80057617)

Fixes #10326
2022-04-05 12:39:13 +03:00
Alexey Kartashov
75740ace2a dist/docker: fix incorrect locale value
The Docker build script contains an incorrect locale specification for the LC_ALL setting;
this commit fixes that.

Fixes #10310

Closes #10321

(cherry picked from commit d86c3a8061)
2022-04-04 12:51:02 +03:00
Piotr Sarna
d7a1bf6331 cql3: fix qualifying restrictions with IN for indexing
When a query contains IN restriction on its partition key,
it's currently not eligible for indexing. It was however
erroneously qualified as such, which led to fetching incorrect
results. This commit fixes the issue by not allowing such queries
to undergo indexing, and comes with a regression test.

Fixes #10300

Closes #10302

(cherry picked from commit c0fd53a9d7)
2022-04-03 11:20:49 +03:00
Avi Kivity
bbd7d657cc Update seastar submodule (pidof command not installed)
* seastar 1c0d622ba0...4a30c44c4c (1):
  > seastar-cpu-map.sh: switch from pidof to pgrep
Fixes #10238.
2022-03-29 12:36:06 +03:00
Avi Kivity
f5bf4c81d1 Merge 'replica/database: truncate: temporarily disable compaction on table and views before flush' from Benny Halevy
Flushing the base table triggers view building
and corresponding compactions on the view tables.

Temporarily disable compaction on both the base
table and all its views before flush and snapshot
since those flushed sstables are about to be truncated
anyway right after the snapshot is taken.

This should make truncate go faster.

In the process, this series also embeds `database::truncate_views`
into `truncate` and coroutinizes both

Refs #6309

Test: unit(dev)

Closes #10203

* github.com:scylladb/scylla:
  replica/database: truncate: fixup indentation
  replica/database: truncate: temporarily disable compaction on table and views before flush
  replica/database: truncate: coroutinize per-view logic
  replica/database: open-code truncate_view in truncate
  replica/database: truncate: coroutinize run_with_compaction_disabled lambda
  replica/database: coroutinize truncate
  compaction_manager: add disable_compaction method

(cherry picked from commit aab052c0d5)
2022-03-28 15:40:40 +03:00
Benny Halevy
02e8336659 atomic_cell: compare_atomic_cell_for_merge: compare ttl if expiry is equal
Following up on a57c087c89,
compare_atomic_cell_for_merge should compare the ttl value in the
reverse order since, when comparing two cells that are identical
in all attributes but their ttl, we want to keep the cell with the
smaller ttl value rather than the larger one, because it was written
at a later (wall-clock) time and so would remain longer after it
expires, until purged after gc_grace seconds.
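
A simplified sketch of the tie-break order (illustrative fields only; the real compare_atomic_cell_for_merge also considers liveness and deletion time):

```
#include <compare>
#include <cstdint>

struct cell {
    int64_t timestamp;
    int64_t expiry;   // absolute expiration point
    int64_t ttl;      // original time-to-live
};

// Compare timestamp, then expiry, then ttl in reverse, so that of two
// otherwise-identical cells the one with the smaller ttl (written later in
// wall-clock time) wins the merge.
std::strong_ordering compare_for_merge(const cell& left, const cell& right) {
    if (auto c = left.timestamp <=> right.timestamp; c != 0) {
        return c;
    }
    if (auto c = left.expiry <=> right.expiry; c != 0) {
        return c;
    }
    return right.ttl <=> left.ttl; // reversed: smaller ttl compares greater
}

int main() {
    cell a{.timestamp = 10, .expiry = 1000, .ttl = 300};
    cell b{.timestamp = 10, .expiry = 1000, .ttl = 100};
    // b was written later (same expiry, smaller ttl), so it should win.
    return compare_for_merge(b, a) > 0 ? 0 : 1;
}
```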

Fixes #10173

Test: mutation_test.test_cell_ordering, unit(dev)

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20220302154328.2400717-1-bhalevy@scylladb.com>
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20220306091913.106508-1-bhalevy@scylladb.com>
(cherry picked from commit a085ef74ff)
2022-03-24 18:00:11 +02:00
Benny Halevy
601812e11b atomic_cell: compare_atomic_cell_for_merge: compare ttl if expiry is equal
Unlike atomic_cell_or_collection::equals, compare_atomic_cell_for_merge
currently returns std::strong_ordering::equal if two cells are equal in
every way except their ttls.

The problem with that is that the cells' hashes are different and this
will cause repair to keep trying to repair discrepancies caused by the
ttl being different.

This may be triggered by e.g. the spark migrator, which computes the ttl
from the expiry time by subtracting the current time from the expiry
time to produce a respective ttl.

If the cell is migrated multiple times at different times, it will generate
cells that have the same expiry (by design) but different ttl values.

Fixes #10156

Test: mutation_test.test_cell_ordering, unit(dev)

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20220302154328.2400717-1-bhalevy@scylladb.com>
(cherry picked from commit a57c087c89)
2022-03-24 18:00:11 +02:00
Benny Halevy
ea466320d2 atomic_cell: compare_atomic_cell_for_merge: fixup indentation
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20220302113833.2308533-2-bhalevy@scylladb.com>
(cherry picked from commit d43da5d6dc)
2022-03-24 18:00:11 +02:00
Benny Halevy
25ea831a15 atomic_cell: compare_atomic_cell_for_merge: simplify expiry/deletion_time comparison
No need to first check that the cells' expiry is different
or that deletion_time is different before comparing them
with `<=>`.

If they are the same the function returns std::strong_ordering::equal
anyhow and that is the same as `<=>` comparing identical values.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20220302113833.2308533-1-bhalevy@scylladb.com>
(cherry picked from commit be865a29b8)
2022-03-24 18:00:11 +02:00
Benny Halevy
8648c79c9e main: shutdown: do not abort on certain system errors
Currently any unhandled error during deferred shutdown
is rethrown in a noexcept context (in ~deferred_action),
generating a core dump.

The core dump is not helpful if the cause of the
error is "environmental", i.e. in the system, rather
than in scylla itself.

This change detects several such errors and calls
_Exit(255) to exit the process early, without leaving
a coredump behind. Otherwise, it calls abort() explicitly,
rather than letting terminate() be called implicitly
by the destructor exception handling code.
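
A standalone sketch of the policy; the specific errno values below are illustrative examples of "environmental" errors, not the exact list in the patch:

```
#include <cerrno>
#include <cstdlib>
#include <exception>
#include <system_error>

[[noreturn]] void handle_shutdown_error(std::exception_ptr ep) {
    try {
        std::rethrow_exception(ep);
    } catch (const std::system_error& e) {
        switch (e.code().value()) {
        case ENOSPC:   // no space left on device
        case EIO:      // I/O error
        case EDQUOT:   // disk quota exceeded
            std::_Exit(255); // environmental problem: exit without a core dump
        default:
            break;
        }
    } catch (...) {
    }
    std::abort(); // a real bug: abort explicitly instead of letting terminate() run
}

int main() {
    try {
        throw std::system_error(std::error_code(ENOSPC, std::generic_category()), "flush failed");
    } catch (...) {
        handle_shutdown_error(std::current_exception()); // exits with status 255
    }
}
```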

Fixes #9573

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20220227101054.1294368-1-bhalevy@scylladb.com>
(cherry picked from commit 132c9d5933)
2022-03-24 14:48:52 +02:00
Nadav Har'El
7ae4d0e6f8 Seastar: backport Seastar fix for missing string escape in JSON output
Backported Seastar fix:
  > Merge 'json/formatter: Escape strings' from Juliusz Stasiewicz

Fixes #9061

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2022-03-23 20:29:50 +02:00
Piotr Sarna
f3564db941 expression: fix get_value for mismatched column definitions
As observed in #10026, after schema changes it somehow happened
that a column definition that does not match any of the base table
columns was passed to expression verification code.
The function that looks up the index of a column happens to return
-1 when it doesn't find anything, so using this returned index
without checking if it's nonnegative results in accessing invalid
vector data, and a segfault or silent memory corruption.
Therefore, an explicit check is added to see if the column was actually
found. This serves two purposes:
 - avoiding segfaults/memory corruption
 - making it easier to investigate the root cause of #10026
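
An illustration of the added guard in plain C++ (the real code deals with column_definition objects and the base-table schema):

```
#include <cstddef>
#include <optional>
#include <string>
#include <string_view>
#include <vector>

std::optional<std::size_t> find_column_index(const std::vector<std::string>& columns,
                                             std::string_view name) {
    for (std::size_t i = 0; i < columns.size(); ++i) {
        if (columns[i] == name) {
            return i;
        }
    }
    return std::nullopt; // previously: a bare -1 that callers forgot to check
}

int main() {
    std::vector<std::string> base_columns{"pk", "v"};
    std::vector<int> values{1, 42};
    // A column left over from an older schema version is simply not found,
    // instead of indexing the values vector out of bounds.
    if (auto idx = find_column_index(base_columns, "dropped_col")) {
        return values[*idx];
    }
    return 0;
}
```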

Closes #10039

(cherry picked from commit 7b364fec9849e9a342af1c240e3a7185bf5401ef)
2022-03-21 10:37:48 +01:00
Pavel Emelyanov
97caf12836 Update seastar submodule (IO preemption overlap)
* seastar 47573503...8ef87d48 (3):
  > io_queue: Don't let preemption overlap requests
  > io_queue: Pending needs to keep capacity instead of ticket
  > io_queue: Extend grab_capacity() return codes

Fixes #10233
2022-03-17 11:26:38 +03:00
Yaron Kaikov
839d9ef41a release: prepare for 5.0.rc2 2022-03-16 14:35:52 +02:00
Benny Halevy
782bd50f92 compaction_manager: rewrite_sstables: do not acquire table write lock
Since regular compaction may run in parallel no lock
is required per-table.

We still acquire a read lock in this patch, for backporting
purposes, in case the branch doesn't contain
6737c88045.
But it can be removed entirely in master in a follow-up patch.

This should solve some of the slowness in cleanup compaction (and
likely in upgrade sstables seen in #10060, and
possibly #10166.

Fixes #10175

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #10177

(cherry picked from commit 11ea2ffc3c)
2022-03-14 13:13:48 +02:00
Avi Kivity
0a4d971b4a Merge 'utils: cached_file: Fix alloc-dealloc mismatch during eviction' from Tomasz Grabiec
cached_page::on_evicted() is invoked in the LSA allocator context, set in the
reclaimer callback installed by the cache_tracker. However,
cached_pages are allocated in the standard allocator context (note:
page content is allocated inside LSA via lsa_buffer). The LSA region
will happily deallocate these, thinking that these are large
objects which were delegated to the standard allocator. But the
_non_lsa_memory_in_use metric will underflow. When it underflows
enough, shard_segment_pool.total_memory() will become 0 and memory
reclamation will stop doing anything, leading to apparent OOM.

The fix is to switch to the standard allocator context inside
cached_page::on_evicted(). evict_range() was also given the same
treatment as a precaution; it is currently only invoked in the
standard allocator context.

The series also adds two safety checks to LSA to catch such problems earlier.

Fixes #10056

\cc @slivne @bhalevy

Closes #10130

* github.com:scylladb/scylla:
  lsa: Abort when trying to free a standard allocator object not allocated through the region
  lsa: Abort when _non_lsa_memory_in_use goes negative
  tests: utils: cached_file: Validate occupancy after eviction
  test: sstable_partition_index_cache_test: Fix alloc-dealloc mismatch
  utils: cached_file: Fix alloc-dealloc mismatch during eviction

(cherry picked from commit ff2cd72766)
2022-02-26 11:28:36 +02:00
Benny Halevy
22562f767f cql3: result_set: remove std::ref from comparator&
Applying std::ref on `RowComparator& cmp` hits the
following compilation error on Fedora 34 with
libstdc++-devel-11.2.1-9.fc34.x86_64

```
FAILED: build/dev/cql3/statements/select_statement.o
clang++ -MD -MT build/dev/cql3/statements/select_statement.o -MF build/dev/cql3/statements/select_statement.o.d -I/home/bhalevy/dev/scylla/seastar/include -I/home/bhalevy/dev/scylla/build/dev/seastar/gen/include -std=gnu++20 -U_FORTIFY_SOURCE -DSEASTAR_SSTRING -Werror=unused-result -fstack-clash-protection -DSEASTAR_API_LEVEL=6 -DSEASTAR_ENABLE_ALLOC_FAILURE_INJECTION -DSEASTAR_SCHEDULING_GROUPS_COUNT=16 -DSEASTAR_TYPE_ERASE_MORE -DFMT_LOCALE -DFMT_SHARED -I/usr/include/p11-kit-1  -DDEVEL -DSEASTAR_ENABLE_ALLOC_FAILURE_INJECTION -DSCYLLA_ENABLE_ERROR_INJECTION -O2 -DSCYLLA_ENABLE_WASMTIME -iquote. -iquote build/dev/gen --std=gnu++20  -ffile-prefix-map=/home/bhalevy/dev/scylla=.  -march=westmere -DBOOST_TEST_DYN_LINK   -Iabseil -fvisibility=hidden  -Wall -Werror -Wno-mismatched-tags -Wno-tautological-compare -Wno-parentheses-equality -Wno-c++11-narrowing -Wno-sometimes-uninitialized -Wno-return-stack-address -Wno-missing-braces -Wno-unused-lambda-capture -Wno-overflow -Wno-noexcept-type -Wno-error=cpp -Wno-ignored-attributes -Wno-overloaded-virtual -Wno-unused-command-line-argument -Wno-defaulted-function-deleted -Wno-redeclared-class-member -Wno-unsupported-friend -Wno-unused-variable -Wno-delete-non-abstract-non-virtual-dtor -Wno-braced-scalar-init -Wno-implicit-int-float-conversion -Wno-delete-abstract-non-virtual-dtor -Wno-uninitialized-const-reference -Wno-psabi -Wno-narrowing -Wno-array-bounds -Wno-nonnull -Wno-error=deprecated-declarations -DXXH_PRIVATE_API -DSEASTAR_TESTING_MAIN -DHAVE_LZ4_COMPRESS_DEFAULT  -c -o build/dev/cql3/statements/select_statement.o cql3/statements/select_statement.cc
In file included from cql3/statements/select_statement.cc:14:
In file included from ./cql3/statements/select_statement.hh:16:
In file included from ./cql3/statements/raw/select_statement.hh:16:
In file included from ./cql3/statements/raw/cf_statement.hh:16:
In file included from ./cql3/cf_name.hh:16:
In file included from ./cql3/keyspace_element_name.hh:16:
In file included from /home/bhalevy/dev/scylla/seastar/include/seastar/core/sstring.hh:25:
In file included from /usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/algorithm:74:
In file included from /usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/pstl/glue_algorithm_defs.h:13:
In file included from /usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/functional:58:
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/bits/refwrap.h:319:40: error: exception specification of 'function<__gnu_cxx::__ops::_Iter_comp_iter<std::reference_wrapper<const std::function<bool (const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &, const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &)>>>, void>' uses itself
                = decltype(reference_wrapper::_S_fun(std::declval<_Up>()))>
                                                     ^
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/bits/refwrap.h:319:40: note: in instantiation of exception specification for 'function<__gnu_cxx::__ops::_Iter_comp_iter<std::reference_wrapper<const std::function<bool (const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &, const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &)>>>, void>' requested here
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/bits/refwrap.h:321:2: note: in instantiation of default argument for 'reference_wrapper<__gnu_cxx::__ops::_Iter_comp_iter<std::reference_wrapper<const std::function<bool (const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &, const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &)>>>, void>' required here
        reference_wrapper(_Up&& __uref)
        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/type_traits:1017:57: note: while substituting deduced template arguments into function template 'reference_wrapper' [with _Up = __gnu_cxx::__ops::_Iter_comp_iter<std::reference_wrapper<const std::function<bool (const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &, const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &)>>>, $1 = (no value), $2 = (no value)]
      = __bool_constant<__is_nothrow_constructible(_Tp, _Args...)>;
                                                        ^
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/type_traits:1023:14: note: in instantiation of template type alias '__is_nothrow_constructible_impl' requested here
    : public __is_nothrow_constructible_impl<_Tp, _Args...>::type
             ^
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/type_traits:153:14: note: in instantiation of template class 'std::is_nothrow_constructible<__gnu_cxx::__ops::_Iter_comp_iter<std::reference_wrapper<const std::function<bool (const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &, const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &)>>>, __gnu_cxx::__ops::_Iter_comp_iter<std::reference_wrapper<const std::function<bool (const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &, const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &)>>>>' requested here
    : public conditional<_B1::value, _B2, _B1>::type
             ^
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/bits/std_function.h:298:11: note: (skipping 8 contexts in backtrace; use -ftemplate-backtrace-limit=0 to see all)
          return __and_<typename _Base::_Local_storage,
                 ^
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/bits/stl_algo.h:1933:13: note: in instantiation of function template specialization 'std::__partial_sort<utils::chunked_vector<std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>>, 131072>::iterator_type<std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>>>, __gnu_cxx::__ops::_Iter_comp_iter<std::reference_wrapper<const std::function<bool (const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &, const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &)>>>>' requested here
              std::__partial_sort(__first, __last, __last, __comp);
                   ^
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/bits/stl_algo.h:1954:9: note: in instantiation of function template specialization 'std::__introsort_loop<utils::chunked_vector<std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>>, 131072>::iterator_type<std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>>>, long, __gnu_cxx::__ops::_Iter_comp_iter<std::reference_wrapper<const std::function<bool (const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &, const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &)>>>>' requested here
          std::__introsort_loop(__first, __last,
               ^
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/bits/stl_algo.h:4875:12: note: in instantiation of function template specialization 'std::__sort<utils::chunked_vector<std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>>, 131072>::iterator_type<std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>>>, __gnu_cxx::__ops::_Iter_comp_iter<std::reference_wrapper<const std::function<bool (const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &, const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &)>>>>' requested here
      std::__sort(__first, __last, __gnu_cxx::__ops::__iter_comp_iter(__comp));
           ^
./cql3/result_set.hh:168:14: note: in instantiation of function template specialization 'std::sort<utils::chunked_vector<std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>>, 131072>::iterator_type<std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>>>, std::reference_wrapper<const std::function<bool (const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &, const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &)>>>' requested here
        std::sort(_rows.begin(), _rows.end(), std::ref(cmp));
             ^
cql3/statements/select_statement.cc:773:21: note: in instantiation of function template specialization 'cql3::result_set::sort<std::function<bool (const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &, const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &)>>' requested here
                rs->sort(_ordering_comparator);
                    ^
1 error generated.
ninja: build stopped: subcommand failed.
```

Fixes #10079.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20220215071955.316895-3-bhalevy@scylladb.com>
(cherry picked from commit 3e20fee070)

[avi: backport for developer quality-of-life rather than as a bug fix]
2022-02-16 10:07:11 +02:00
Raphael S. Carvalho
eb80dd1db5 Revert "sstables/compaction_manager: rewrite_sstables(): resolve maintenance group FIXME"
This reverts commit 4c05e5f966.

Moving cleanup to maintenance group made its operation time up to
10x slower than previous release. It's a blocker to 4.6 release,
so let's revert it until we figure this all out.

Probably this happens because maintenance group is fixed at a
relatively small constant, and cleanup may be incrementally
generating backlog for regular compaction, where the former is
fighting for resources against the latter.

Fixes #10060.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20220213184306.91585-1-raphaelsc@scylladb.com>
(cherry picked from commit a9427f150a)
2022-02-14 18:05:43 +02:00
Avi Kivity
51d699ee21 Update seastar submodule (overzealous log silencer)
* seastar 0d250d15ac...47573503cd (1):
  > log: Fix silencer to be shard-local and logger-global
Fixes #9784.
2022-02-14 17:54:54 +02:00
Avi Kivity
83a33bff8c Point seastar submodule at scylla-seastar.git
This allows us to backport Seastar fixes to this branch.
2022-02-14 17:54:16 +02:00
Nadav Har'El
273563b9ad alternator: allow REMOVE of non-existent nested attribute
DynamoDB allows an UpdateItem operation "REMOVE x.y" when a map x
exists in the item, but x.y doesn't - the removal silently does
nothing. Alternator incorrectly generated an error in this case,
and unfortunately we didn't have a test for this case.

So in this patch we add the missing test (which fails on Alternator
before this patch - and passes on DynamoDB) and then fix the behavior.
After this patch, "REMOVE x.y" will remain an error if "x" doesn't
exist (saying "document paths not valid for this item"), but if "x"
exists and is a map, but "x.y" doesn't, the removal will silently
do nothing and will not be an error.

Fixes #10043.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20220207133652.181994-1-nyh@scylladb.com>
(cherry picked from commit 9982a28007)
2022-02-08 11:37:31 +02:00
Yaron Kaikov
891990ec09 release: prepare for 5.0.rc1 2022-02-06 16:41:05 +02:00
Yaron Kaikov
da0cd2b107 release: prepare for 5.0.rc0 2022-02-03 08:10:30 +02:00
178 changed files with 4086 additions and 899 deletions

.gitmodules vendored
View File

@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui

View File

@@ -60,7 +60,7 @@ fi
 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=5.0.dev
+VERSION=5.0.7
 if test -f version
 then

View File

@@ -78,6 +78,11 @@ future<> controller::start_server() {
_executor.start(std::ref(_gossiper), std::ref(_proxy), std::ref(_mm), std::ref(_sys_dist_ks), sharded_parameter(get_cdc_metadata, std::ref(_cdc_gen_svc)), _ssg.value()).get();
_server.start(std::ref(_executor), std::ref(_proxy), std::ref(_gossiper)).get();
// Note: from this point on, if start_server() throws for any reason,
// it must first call stop_server() to stop the executor and server
// services we just started - or Scylla will cause an assertion
// failure when the controller object is destroyed in the exception
// unwinding.
std::optional<uint16_t> alternator_port;
if (_config.alternator_port()) {
alternator_port = _config.alternator_port();
@@ -104,7 +109,13 @@ future<> controller::start_server() {
}
opts.erase("require_client_auth");
opts.erase("truststore");
utils::configure_tls_creds_builder(creds.value(), std::move(opts)).get();
try {
utils::configure_tls_creds_builder(creds.value(), std::move(opts)).get();
} catch(...) {
logger.error("Failed to set up Alternator TLS credentials: {}", std::current_exception());
stop_server().get();
std::throw_with_nested(std::runtime_error("Failed to set up Alternator TLS credentials"));
}
}
bool alternator_enforce_authorization = _config.alternator_enforce_authorization();
_server.invoke_on_all(

View File

@@ -34,6 +34,7 @@
#include "expressions.hh"
#include "conditions.hh"
#include "cql3/constants.hh"
#include "cql3/util.hh"
#include <optional>
#include "utils/overloaded_functor.hh"
#include "seastar/json/json_elements.hh"
@@ -46,6 +47,7 @@
#include <seastar/core/coroutine.hh>
#include <boost/range/adaptors.hpp>
#include <boost/range/algorithm/find_end.hpp>
#include <unordered_set>
#include "service/storage_proxy.hh"
#include "gms/gossiper.hh"
#include "schema_registry.hh"
@@ -148,16 +150,16 @@ static void validate_table_name(const std::string& name) {
// instead of each component individually as DynamoDB does.
// The view_name() function assumes the table_name has already been validated
// but validates the legality of index_name and the combination of both.
static std::string view_name(const std::string& table_name, const std::string& index_name, const std::string& delim = ":") {
static std::string view_name(const std::string& table_name, std::string_view index_name, const std::string& delim = ":") {
static const std::regex valid_index_name_chars ("[a-zA-Z0-9_.-]*");
if (index_name.length() < 3) {
throw api_error::validation("IndexName must be at least 3 characters long");
}
if (!std::regex_match(index_name.c_str(), valid_index_name_chars)) {
if (!std::regex_match(index_name.data(), valid_index_name_chars)) {
throw api_error::validation(
format("IndexName '{}' must satisfy regular expression pattern: [a-zA-Z0-9_.-]+", index_name));
}
std::string ret = table_name + delim + index_name;
std::string ret = table_name + delim + std::string(index_name);
if (ret.length() > max_table_name_length) {
throw api_error::validation(
format("The total length of TableName ('{}') and IndexName ('{}') cannot exceed {} characters",
@@ -166,7 +168,7 @@ static std::string view_name(const std::string& table_name, const std::string& i
return ret;
}
static std::string lsi_name(const std::string& table_name, const std::string& index_name) {
static std::string lsi_name(const std::string& table_name, std::string_view index_name) {
return view_name(table_name, index_name, "!:");
}
@@ -273,16 +275,16 @@ get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) {
if (index_name) {
if (index_name->IsString()) {
orig_table_name = std::move(table_name);
table_name = view_name(orig_table_name, index_name->GetString());
table_name = view_name(orig_table_name, rjson::to_string_view(*index_name));
type = table_or_view_type::gsi;
} else {
throw api_error::validation(
format("Non-string IndexName '{}'", index_name->GetString()));
format("Non-string IndexName '{}'", rjson::to_string_view(*index_name)));
}
// If no tables for global indexes were found, the index may be local
if (!proxy.data_dictionary().has_schema(keyspace_name, table_name)) {
type = table_or_view_type::lsi;
table_name = lsi_name(orig_table_name, index_name->GetString());
table_name = lsi_name(orig_table_name, rjson::to_string_view(*index_name));
}
}
@@ -432,6 +434,11 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
rjson::add(table_description, "BillingModeSummary", rjson::empty_object());
rjson::add(table_description["BillingModeSummary"], "BillingMode", "PAY_PER_REQUEST");
rjson::add(table_description["BillingModeSummary"], "LastUpdateToPayPerRequestDateTime", rjson::value(creation_date_seconds));
// In PAY_PER_REQUEST billing mode, provisioned capacity should return 0
rjson::add(table_description, "ProvisionedThroughput", rjson::empty_object());
rjson::add(table_description["ProvisionedThroughput"], "ReadCapacityUnits", 0);
rjson::add(table_description["ProvisionedThroughput"], "WriteCapacityUnits", 0);
rjson::add(table_description["ProvisionedThroughput"], "NumberOfDecreasesToday", 0);
std::unordered_map<std::string,std::string> key_attribute_types;
// Add base table's KeySchema and collect types for AttributeDefinitions:
@@ -453,6 +460,11 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
rjson::add(view_entry, "IndexName", rjson::from_string(index_name));
// Add indexes's KeySchema and collect types for AttributeDefinitions:
describe_key_schema(view_entry, *vptr, key_attribute_types);
// Add projection type
rjson::value projection = rjson::empty_object();
rjson::add(projection, "ProjectionType", "ALL");
// FIXME: we have to get ProjectionType from the schema when it is added
rjson::add(view_entry, "Projection", std::move(projection));
// Local secondary indexes are marked by an extra '!' sign occurring before the ':' delimiter
rjson::value& index_array = (delim_it > 1 && cf_name[delim_it-1] == '!') ? lsi_array : gsi_array;
rjson::push_back(index_array, std::move(view_entry));
@@ -884,17 +896,23 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
const rjson::value* gsi = rjson::find(request, "GlobalSecondaryIndexes");
std::vector<schema_builder> view_builders;
std::vector<sstring> where_clauses;
std::unordered_set<std::string> index_names;
if (gsi) {
if (!gsi->IsArray()) {
co_return api_error::validation("GlobalSecondaryIndexes must be an array.");
}
for (const rjson::value& g : gsi->GetArray()) {
const rjson::value* index_name = rjson::find(g, "IndexName");
if (!index_name || !index_name->IsString()) {
const rjson::value* index_name_v = rjson::find(g, "IndexName");
if (!index_name_v || !index_name_v->IsString()) {
co_return api_error::validation("GlobalSecondaryIndexes IndexName must be a string.");
}
std::string vname(view_name(table_name, index_name->GetString()));
elogger.trace("Adding GSI {}", index_name->GetString());
std::string_view index_name = rjson::to_string_view(*index_name_v);
auto [it, added] = index_names.emplace(index_name);
if (!added) {
co_return api_error::validation(format("Duplicate IndexName '{}', ", index_name));
}
std::string vname(view_name(table_name, index_name));
elogger.trace("Adding GSI {}", index_name);
// FIXME: read and handle "Projection" parameter. This will
// require the MV code to copy just parts of the attrs map.
schema_builder view_builder(keyspace_name, vname);
@@ -927,9 +945,10 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
if (!range_key.empty() && range_key != view_hash_key && range_key != view_range_key) {
add_column(view_builder, range_key, attribute_definitions, column_kind::clustering_key);
}
sstring where_clause = "\"" + view_hash_key + "\" IS NOT NULL";
sstring where_clause = format("{} IS NOT NULL", cql3::util::maybe_quote(view_hash_key));
if (!view_range_key.empty()) {
where_clause = where_clause + " AND \"" + view_hash_key + "\" IS NOT NULL";
where_clause = format("{} AND {} IS NOT NULL", where_clause,
cql3::util::maybe_quote(view_range_key));
}
where_clauses.push_back(std::move(where_clause));
view_builders.emplace_back(std::move(view_builder));
@@ -942,12 +961,17 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
throw api_error::validation("LocalSecondaryIndexes must be an array.");
}
for (const rjson::value& l : lsi->GetArray()) {
const rjson::value* index_name = rjson::find(l, "IndexName");
if (!index_name || !index_name->IsString()) {
const rjson::value* index_name_v = rjson::find(l, "IndexName");
if (!index_name_v || !index_name_v->IsString()) {
throw api_error::validation("LocalSecondaryIndexes IndexName must be a string.");
}
std::string vname(lsi_name(table_name, index_name->GetString()));
elogger.trace("Adding LSI {}", index_name->GetString());
std::string_view index_name = rjson::to_string_view(*index_name_v);
auto [it, added] = index_names.emplace(index_name);
if (!added) {
co_return api_error::validation(format("Duplicate IndexName '{}', ", index_name));
}
std::string vname(lsi_name(table_name, index_name));
elogger.trace("Adding LSI {}", index_name);
if (range_key.empty()) {
co_return api_error::validation("LocalSecondaryIndex requires that the base table have a range key");
}
@@ -979,9 +1003,10 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
// Note above we don't need to add virtual columns, as all
// base columns were copied to view. TODO: reconsider the need
// for virtual columns when we support Projection.
sstring where_clause = "\"" + view_hash_key + "\" IS NOT NULL";
sstring where_clause = format("{} IS NOT NULL", cql3::util::maybe_quote(view_hash_key));
if (!view_range_key.empty()) {
where_clause = where_clause + " AND \"" + view_range_key + "\" IS NOT NULL";
where_clause = format("{} AND {} IS NOT NULL", where_clause,
cql3::util::maybe_quote(view_range_key));
}
where_clauses.push_back(std::move(where_clause));
view_builders.emplace_back(std::move(view_builder));
@@ -2173,6 +2198,9 @@ static attrs_to_get calculate_attrs_to_get(const rjson::value& req, std::unorder
for (auto it = attributes_to_get.Begin(); it != attributes_to_get.End(); ++it) {
attribute_path_map_add("AttributesToGet", ret, it->GetString());
}
if (ret.empty()) {
throw api_error::validation("Empty AttributesToGet is not allowed. Consider using Select=COUNT instead.");
}
return ret;
} else if (has_projection_expression) {
const rjson::value& projection_expression = req["ProjectionExpression"];
@@ -2577,8 +2605,8 @@ static bool hierarchy_actions(
// attr member so we can use add()
rjson::add_with_string_name(v, attr, std::move(*newv));
} else {
throw api_error::validation(format("Can't remove document path {} - not present in item",
subh.get_value()._path));
// Removing a.b when a is a map but a.b doesn't exist
// is silently ignored. It's not considered an error.
}
} else {
throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));

View File

@@ -116,9 +116,6 @@ future<executor::request_return_type> executor::update_time_to_live(client_state
future<executor::request_return_type> executor::describe_time_to_live(client_state& client_state, service_permit permit, rjson::value request) {
_stats.api_operations.describe_time_to_live++;
if (!_proxy.data_dictionary().features().cluster_supports_alternator_ttl()) {
co_return api_error::unknown_operation("DescribeTimeToLive not yet supported. Experimental support is available if the 'alternator_ttl' experimental feature is enabled on all nodes.");
}
schema_ptr schema = get_table(_proxy, request);
std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
rjson::value desc = rjson::empty_object();

View File

@@ -12,6 +12,7 @@
#include <seastar/core/sharded.hh>
#include <seastar/core/abort_source.hh>
#include <seastar/core/semaphore.hh>
#include "data_dictionary/data_dictionary.hh"
namespace replica {
class database;

View File

@@ -624,7 +624,7 @@
},
{
"name":"kn",
"description":"Comma seperated keyspaces name to snapshot",
"description":"Keyspace(s) to snapshot. Multiple keyspaces can be provided using a comma-separated list. If omitted, snapshot all keyspaces.",
"required":false,
"allowMultiple":false,
"type":"string",
@@ -632,7 +632,7 @@
},
{
"name":"cf",
"description":"the column family to snapshot",
"description":"Table(s) to snapshot. Multiple tables (in a single keyspace) can be provided using a comma-separated list. If omitted, snapshot all tables in the given keyspace(s).",
"required":false,
"allowMultiple":false,
"type":"string",

View File

@@ -669,19 +669,16 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
}));
ss::force_keyspace_flush.set(r, [&ctx](std::unique_ptr<request> req) {
ss::force_keyspace_flush.set(r, [&ctx](std::unique_ptr<request> req) -> future<json::json_return_type> {
auto keyspace = validate_keyspace(ctx, req->param);
auto column_families = parse_tables(keyspace, ctx, req->query_parameters, "cf");
auto &db = ctx.db.local();
if (column_families.empty()) {
column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
co_await db.flush_on_all(keyspace);
} else {
co_await db.flush_on_all(keyspace, std::move(column_families));
}
return ctx.db.invoke_on_all([keyspace, column_families] (replica::database& db) {
return parallel_for_each(column_families, [&db, keyspace](const sstring& cf) mutable {
return db.find_column_family(keyspace, cf).flush();
});
}).then([]{
return make_ready_future<json::json_return_type>(json_void());
});
co_return json_void();
});
@@ -1284,40 +1281,46 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
});
});
ss::take_snapshot.set(r, [&snap_ctl](std::unique_ptr<request> req) {
apilog.debug("take_snapshot: {}", req->query_parameters);
ss::take_snapshot.set(r, [&snap_ctl](std::unique_ptr<request> req) -> future<json::json_return_type> {
apilog.info("take_snapshot: {}", req->query_parameters);
auto tag = req->get_query_param("tag");
auto column_families = split(req->get_query_param("cf"), ",");
auto sfopt = req->get_query_param("sf");
auto sf = db::snapshot_ctl::skip_flush(strcasecmp(sfopt.c_str(), "true") == 0);
std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
auto resp = make_ready_future<>();
if (column_families.empty()) {
resp = snap_ctl.local().take_snapshot(tag, keynames, sf);
} else {
if (keynames.empty()) {
throw httpd::bad_param_exception("The keyspace of column families must be specified");
try {
if (column_families.empty()) {
co_await snap_ctl.local().take_snapshot(tag, keynames, sf);
} else {
if (keynames.empty()) {
throw httpd::bad_param_exception("The keyspace of column families must be specified");
}
if (keynames.size() > 1) {
throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
}
co_await snap_ctl.local().take_column_family_snapshot(keynames[0], column_families, tag, sf);
}
if (keynames.size() > 1) {
throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
}
resp = snap_ctl.local().take_column_family_snapshot(keynames[0], column_families, tag, sf);
co_return json_void();
} catch (...) {
apilog.error("take_snapshot failed: {}", std::current_exception());
throw;
}
return resp.then([] {
return make_ready_future<json::json_return_type>(json_void());
});
});
ss::del_snapshot.set(r, [&snap_ctl](std::unique_ptr<request> req) {
ss::del_snapshot.set(r, [&snap_ctl](std::unique_ptr<request> req) -> future<json::json_return_type> {
apilog.info("del_snapshot: {}", req->query_parameters);
auto tag = req->get_query_param("tag");
auto column_family = req->get_query_param("cf");
std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
return snap_ctl.local().clear_snapshot(tag, keynames, column_family).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
try {
co_await snap_ctl.local().clear_snapshot(tag, keynames, column_family);
co_return json_void();
} catch (...) {
apilog.error("del_snapshot failed: {}", std::current_exception());
throw;
}
});
ss::true_snapshots_size.set(r, [&snap_ctl](std::unique_ptr<request> req) {
@@ -1354,7 +1357,7 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
if (!req_param<bool>(*req, "disable_snapshot", false)) {
auto tag = format("pre-scrub-{:d}", db_clock::now().time_since_epoch().count());
f = parallel_for_each(column_families, [&snap_ctl, keyspace, tag](sstring cf) {
return snap_ctl.local().take_column_family_snapshot(keyspace, cf, tag);
return snap_ctl.local().take_column_family_snapshot(keyspace, cf, tag, db::snapshot_ctl::skip_flush::no, db::snapshot_ctl::allow_view_snapshots::yes);
});
}

View File

@@ -87,19 +87,24 @@ compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
// prefer expiring cells.
return left.is_live_and_has_ttl() ? std::strong_ordering::greater : std::strong_ordering::less;
}
if (left.is_live_and_has_ttl() && left.expiry() != right.expiry()) {
return left.expiry() <=> right.expiry();
if (left.is_live_and_has_ttl()) {
if (left.expiry() != right.expiry()) {
return left.expiry() <=> right.expiry();
} else {
// prefer the cell that was written later,
// so it survives longer after it expires, until purged.
return right.ttl() <=> left.ttl();
}
}
} else {
// Both are deleted
if (left.deletion_time() != right.deletion_time()) {
// Origin compares big-endian serialized deletion time. That's because it
// delegates to AbstractCell.reconcile() which compares values after
// comparing timestamps, which in case of deleted cells will hold
// serialized expiry.
return (uint64_t) left.deletion_time().time_since_epoch().count()
<=> (uint64_t) right.deletion_time().time_since_epoch().count();
}
// Origin compares big-endian serialized deletion time. That's because it
// delegates to AbstractCell.reconcile() which compares values after
// comparing timestamps, which in case of deleted cells will hold
// serialized expiry.
return (uint64_t) left.deletion_time().time_since_epoch().count()
<=> (uint64_t) right.deletion_time().time_since_epoch().count();
}
return std::strong_ordering::equal;
}
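
A toy model of the new tie-break for two live TTL'd cells with identical expiry (write time + TTL): the cell written later, which therefore has the smaller TTL, wins the reconciliation so it survives longer after it expires, until purged. The struct below is illustrative, not the real atomic_cell API:

#include <compare>
#include <cstdio>

struct cell {
    long write_time;
    long ttl;                                  // seconds
    long expiry() const { return write_time + ttl; }
};

// Returns greater if 'left' should win the merge.
std::strong_ordering compare_live_ttl_cells(const cell& left, const cell& right) {
    if (left.expiry() != right.expiry()) {
        return left.expiry() <=> right.expiry();   // later expiry wins
    }
    // Same expiry: prefer the later write, i.e. the smaller TTL.
    return right.ttl <=> left.ttl;
}

int main() {
    cell a{.write_time = 100, .ttl = 10};  // expires at 110
    cell b{.write_time = 105, .ttl = 5};   // also expires at 110, written later
    std::puts(compare_live_ttl_cells(a, b) < 0 ? "b wins" : "a wins");  // prints "b wins"
}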

View File

@@ -59,7 +59,7 @@ using namespace std::chrono_literals;
logging::logger cdc_log("cdc");
namespace cdc {
static schema_ptr create_log_schema(const schema&, std::optional<utils::UUID> = {});
static schema_ptr create_log_schema(const schema&, std::optional<utils::UUID> = {}, schema_ptr = nullptr);
}
static constexpr auto cdc_group_name = "cdc";
@@ -206,7 +206,7 @@ public:
return;
}
auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt);
auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt, log_schema);
auto log_mut = log_schema
? db::schema_tables::make_update_table_mutations(db, keyspace.metadata(), log_schema, new_log_schema, timestamp, false)
@@ -484,7 +484,7 @@ bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name) {
return to_bytes(cdc_deleted_elements_column_prefix) + column_name;
}
static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid) {
static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid, schema_ptr old) {
schema_builder b(s.ks_name(), log_name(s.cf_name()));
b.with_partitioner("com.scylladb.dht.CDCPartitioner");
b.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
@@ -571,6 +571,20 @@ static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID>
b.set_uuid(*uuid);
}
/**
* #10473 - if we are redefining the log table, we need to ensure any dropped
* columns are registered in "dropped_columns" table, otherwise clients will not
* be able to read data older than now.
*/
if (old) {
// not super efficient, but we don't do this often.
for (auto& col : old->all_columns()) {
if (!b.has_column({col.name(), col.name_as_text() })) {
b.without_column(col.name_as_text(), col.type, api::new_timestamp());
}
}
}
return b.build();
}

View File

@@ -1281,6 +1281,13 @@ private:
const auto& key = _validator.previous_partition_key();
if (_validator.current_tombstone()) {
throw compaction_aborted_exception(
_schema->ks_name(),
_schema->cf_name(),
"scrub compaction cannot handle invalid fragments with an active range tombstone change");
}
// If the unexpected fragment is a partition end, we just drop it.
// The only case a partition end is invalid is when it comes after
// another partition end, and we can just drop it in that case.

View File

@@ -317,9 +317,9 @@ future<> compaction_manager::run_custom_job(replica::table* t, sstables::compact
auto job_ptr = std::make_unique<noncopyable_function<future<>(sstables::compaction_data&)>>(std::move(job));
task->compaction_done = with_semaphore(_maintenance_ops_sem, 1, [this, task, &job = *job_ptr] () mutable {
// take read lock for table, so major compaction and resharding can't proceed in parallel.
return with_lock(task->compaction_state.lock.for_read(), [this, task, &job] () mutable {
task->compaction_done = with_semaphore(_custom_jobs_sem, 1, [this, task, &job = *job_ptr] () mutable {
// We don't need to take task->compaction_state.lock.for_read() as it only serializes minor and major
// Allow caller to know that task (e.g. reshape) was asked to stop while waiting for a chance to run.
if (task->stopping) {
throw sstables::compaction_stopped_exception(task->compacting_table->schema()->ks_name(), task->compacting_table->schema()->cf_name(),
@@ -335,7 +335,6 @@ future<> compaction_manager::run_custom_job(replica::table* t, sstables::compact
// no need to register shared sstables because they're excluded from non-resharding
// compaction and some of them may not even belong to current shard.
return job(task->compaction_data);
});
}).then_wrapped([this, task, job_ptr = std::move(job_ptr), type] (future<> f) {
_stats.active_tasks--;
_tasks.remove(task);
@@ -353,32 +352,50 @@ future<> compaction_manager::run_custom_job(replica::table* t, sstables::compact
return task->compaction_done.get_future().then([task] {});
}
compaction_manager::compaction_reenabler::compaction_reenabler(compaction_manager& cm, replica::table* t)
: _cm(cm)
, _table(t)
, _compaction_state(cm.get_compaction_state(_table))
, _holder(_compaction_state.gate.hold())
{
_compaction_state.compaction_disabled_counter++;
cmlog.debug("Temporarily disabled compaction for {}.{}. compaction_disabled_counter={}",
_table->schema()->ks_name(), _table->schema()->cf_name(), _compaction_state.compaction_disabled_counter);
}
compaction_manager::compaction_reenabler::compaction_reenabler(compaction_reenabler&& o) noexcept
: _cm(o._cm)
, _table(std::exchange(o._table, nullptr))
, _compaction_state(o._compaction_state)
, _holder(std::move(o._holder))
{}
compaction_manager::compaction_reenabler::~compaction_reenabler() {
// submit compaction request if we're the last holder of the gate which is still opened.
if (_table && --_compaction_state.compaction_disabled_counter == 0 && !_compaction_state.gate.is_closed()) {
cmlog.debug("Reenabling compaction for {}.{}",
_table->schema()->ks_name(), _table->schema()->cf_name());
try {
_cm.submit(_table);
} catch (...) {
cmlog.warn("compaction_reenabler could not reenable compaction for {}.{}: {}",
_table->schema()->ks_name(), _table->schema()->cf_name(), std::current_exception());
}
}
}
future<compaction_manager::compaction_reenabler>
compaction_manager::stop_and_disable_compaction(replica::table* t) {
compaction_reenabler cre(*this, t);
co_await stop_ongoing_compactions("user-triggered operation", t);
co_return cre;
}
future<>
compaction_manager::run_with_compaction_disabled(replica::table* t, std::function<future<> ()> func) {
auto& c_state = _compaction_state[t];
auto holder = c_state.gate.hold();
compaction_reenabler cre = co_await stop_and_disable_compaction(t);
c_state.compaction_disabled_counter++;
std::exception_ptr err;
try {
co_await stop_ongoing_compactions("user-triggered operation", t);
co_await func();
} catch (...) {
err = std::current_exception();
}
#ifdef DEBUG
assert(_compaction_state.contains(t));
#endif
// submit compaction request if we're the last holder of the gate which is still opened.
if (--c_state.compaction_disabled_counter == 0 && !c_state.gate.is_closed()) {
submit(t);
}
if (err) {
std::rethrow_exception(err);
}
co_return;
co_await func();
}
void compaction_manager::task::setup_new_compaction() {
@@ -584,16 +601,11 @@ future<> compaction_manager::stop() {
}
}
void compaction_manager::really_do_stop() {
if (_state == state::none || _state == state::stopped) {
return;
}
_state = state::stopped;
future<> compaction_manager::really_do_stop() {
cmlog.info("Asked to stop");
// Reset the metrics registry
_metrics.clear();
_stop_future.emplace(stop_ongoing_compactions("shutdown").then([this] () mutable {
return stop_ongoing_compactions("shutdown").then([this] () mutable {
reevaluate_postponed_compactions();
return std::move(_waiting_reevalution);
}).then([this] {
@@ -601,12 +613,34 @@ void compaction_manager::really_do_stop() {
_compaction_submission_timer.cancel();
cmlog.info("Stopped");
return _compaction_controller.shutdown();
}));
});
}
template <typename Ex>
requires std::is_base_of_v<std::exception, Ex> &&
requires (const Ex& ex) {
{ ex.code() } noexcept -> std::same_as<const std::error_code&>;
}
auto swallow_enospc(const Ex& ex) noexcept {
if (ex.code().value() != ENOSPC) {
return make_exception_future<>(std::make_exception_ptr(ex));
}
cmlog.warn("Got ENOSPC on stop, ignoring...");
return make_ready_future<>();
}
void compaction_manager::do_stop() noexcept {
if (_state == state::none || _state == state::stopped) {
return;
}
try {
really_do_stop();
_state = state::stopped;
_stop_future = really_do_stop()
.handle_exception_type([] (const std::system_error& ex) { return swallow_enospc(ex); })
.handle_exception_type([] (const storage_io_error& ex) { return swallow_enospc(ex); })
;
} catch (...) {
try {
cmlog.error("Failed to stop the manager: {}", std::current_exception());
@@ -742,6 +776,7 @@ future<> compaction_manager::perform_offstrategy(replica::table* t) {
_stats.active_tasks++;
task->setup_new_compaction();
return with_scheduling_group(_maintenance_sg.cpu, [this, task, t] {
return t->run_offstrategy_compaction(task->compaction_data).then_wrapped([this, task, schema = t->schema()] (future<> f) mutable {
_stats.active_tasks--;
task->finish_compaction();
@@ -763,6 +798,7 @@ future<> compaction_manager::perform_offstrategy(replica::table* t) {
}
return make_ready_future<stop_iteration>(stop_iteration::yes);
});
});
});
});
}).finally([this, task] {
@@ -810,7 +846,8 @@ future<> compaction_manager::rewrite_sstables(replica::table* t, sstables::compa
auto sstable_level = sst->get_sstable_level();
auto run_identifier = sst->run_identifier();
auto sstable_set_snapshot = can_purge ? std::make_optional(t.get_sstable_set()) : std::nullopt;
auto descriptor = sstables::compaction_descriptor({ sst }, std::move(sstable_set_snapshot), _maintenance_sg.io,
// FIXME: this compaction should run with maintenance priority.
auto descriptor = sstables::compaction_descriptor({ sst }, std::move(sstable_set_snapshot), service::get_local_compaction_priority(),
sstable_level, sstables::compaction_descriptor::default_max_sstable_bytes, run_identifier, options);
// Releases reference to cleaned sstable such that respective used disk space can be freed.
@@ -819,8 +856,9 @@ future<> compaction_manager::rewrite_sstables(replica::table* t, sstables::compa
};
auto maintenance_permit = co_await seastar::get_units(_maintenance_ops_sem, 1);
// Take write lock for table to serialize cleanup/upgrade sstables/scrub with major compaction/reshape/reshard.
auto write_lock_holder = co_await _compaction_state[&t].lock.hold_write_lock();
// FIXME: acquiring the read lock is not needed after acquiring the _maintenance_ops_sem
// only major compaction needs to acquire the write lock to synchronize with regular compaction.
auto lock_holder = co_await _compaction_state[&t].lock.hold_read_lock();
_stats.pending_tasks--;
_stats.active_tasks++;
@@ -852,7 +890,7 @@ future<> compaction_manager::rewrite_sstables(replica::table* t, sstables::compa
};
compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_compaction_controller.backlog_of_shares(200), _available_memory));
completed = co_await with_scheduling_group(_maintenance_sg.cpu, std::ref(perform_rewrite));
completed = co_await with_scheduling_group(_compaction_controller.sg(), std::ref(perform_rewrite));
} while (!completed);
};
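
The compaction_reenabler introduced above is an RAII guard: its constructor bumps a per-table "compaction disabled" counter (while holding the table's gate) and its destructor decrements it, resubmitting the table for compaction when the counter drops back to zero. A minimal, Scylla-independent sketch of the same idiom (names and types are illustrative only):

#include <functional>
#include <cstdio>

struct disable_guard {
    int& counter;
    std::function<void()> resubmit;

    disable_guard(int& c, std::function<void()> r) : counter(c), resubmit(std::move(r)) {
        ++counter;                 // compaction stays disabled while any guard is alive
    }
    ~disable_guard() {
        if (--counter == 0) {
            resubmit();            // the last guard re-enables and resubmits the table
        }
    }
};

int main() {
    int disabled = 0;
    {
        disable_guard outer(disabled, [] { std::puts("resubmitting compaction"); });
        disable_guard inner(disabled, [] { std::puts("resubmitting compaction"); });
        // ... run the maintenance operation with compaction disabled ...
    }   // prints "resubmitting compaction" exactly once, from the last guard destroyed
}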

View File

@@ -147,6 +147,8 @@ private:
// If the operation must be serialized with regular, then the per-table write lock must be taken.
seastar::named_semaphore _maintenance_ops_sem = {1, named_semaphore_exception_factory{"maintenance operation"}};
seastar::named_semaphore _custom_jobs_sem = {1, named_semaphore_exception_factory{"custom jobs"}};
std::function<void()> compaction_submission_callback();
// all registered tables are reevaluated at a constant interval.
// Submission is a NO-OP when there's nothing to do, so it's fine to call it regularly.
@@ -233,7 +235,7 @@ public:
// Stop all fibers, without waiting. Safe to be called multiple times.
void do_stop() noexcept;
void really_do_stop();
future<> really_do_stop();
// Submit a table to be compacted.
void submit(replica::table* t);
@@ -269,6 +271,31 @@ public:
// parameter job is a function that will carry the operation
future<> run_custom_job(replica::table* t, sstables::compaction_type type, noncopyable_function<future<>(sstables::compaction_data&)> job);
class compaction_reenabler {
compaction_manager& _cm;
replica::table* _table;
compaction_state& _compaction_state;
gate::holder _holder;
public:
compaction_reenabler(compaction_manager&, replica::table*);
compaction_reenabler(compaction_reenabler&&) noexcept;
~compaction_reenabler();
replica::table* compacting_table() const noexcept {
return _table;
}
const compaction_state& compaction_state() const noexcept {
return _compaction_state;
}
};
// Disable compaction temporarily for a table t.
// Caller should call the compaction_reenabler::reenable
future<compaction_reenabler> stop_and_disable_compaction(replica::table* t);
// Run a function with compaction temporarily disabled for a table T.
future<> run_with_compaction_disabled(replica::table* t, std::function<future<> ()> func);

View File

@@ -69,7 +69,11 @@ compaction_descriptor leveled_compaction_strategy::get_major_compaction_job(tabl
}
void leveled_compaction_strategy::notify_completion(const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) {
if (removed.empty() || added.empty()) {
// All the update here is only relevant for regular compaction's round-robin picking policy, and if
// last_compacted_keys wasn't generated by regular, it means regular is disabled since last restart,
// therefore we can skip the updates here until regular runs for the first time. Once it runs,
// it will be able to generate last_compacted_keys correctly by looking at metadata of files.
if (removed.empty() || added.empty() || !_last_compacted_keys) {
return;
}
auto min_level = std::numeric_limits<uint32_t>::max();

View File

@@ -217,6 +217,7 @@ time_window_compaction_strategy::get_sstables_for_compaction(table_state& table_
auto compaction_time = gc_clock::now();
if (candidates.empty()) {
_estimated_remaining_tasks = 0;
return compaction_descriptor();
}

View File

@@ -615,6 +615,8 @@ arg_parser.add_argument('--static-yaml-cpp', dest='staticyamlcpp', action='store
help='Link libyaml-cpp statically')
arg_parser.add_argument('--tests-debuginfo', action='store', dest='tests_debuginfo', type=int, default=0,
help='Enable(1)/disable(0)compiler debug information generation for tests')
arg_parser.add_argument('--perf-tests-debuginfo', action='store', dest='perf_tests_debuginfo', type=int, default=0,
help='Enable(1)/disable(0)compiler debug information generation for perf tests')
arg_parser.add_argument('--python', action='store', dest='python', default='python3',
help='Python3 path')
arg_parser.add_argument('--split-dwarf', dest='split_dwarf', action='store_true', default=False,
@@ -1377,6 +1379,7 @@ linker_flags = linker_flags(compiler=args.cxx)
dbgflag = '-g -gz' if args.debuginfo else ''
tests_link_rule = 'link' if args.tests_debuginfo else 'link_stripped'
perf_tests_link_rule = 'link' if args.perf_tests_debuginfo else 'link_stripped'
# Strip if debuginfo is disabled, otherwise we end up with partial
# debug info from the libraries we static link with
@@ -1901,7 +1904,8 @@ with open(buildfile_tmp, 'w') as f:
# So we strip the tests by default; The user can very
# quickly re-link the test unstripped by adding a "_g"
# to the test name, e.g., "ninja build/release/testname_g"
f.write('build $builddir/{}/{}: {}.{} {} | {} {}\n'.format(mode, binary, tests_link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
link_rule = perf_tests_link_rule if binary.startswith('test/perf/') else tests_link_rule
f.write('build $builddir/{}/{}: {}.{} {} | {} {}\n'.format(mode, binary, link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
f.write(' libs = {}\n'.format(local_libs))
f.write('build $builddir/{}/{}_g: {}.{} {} | {} {}\n'.format(mode, binary, regular_link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
f.write(' libs = {}\n'.format(local_libs))

View File

@@ -1386,7 +1386,7 @@ serviceLevelOrRoleName returns [sstring name]
std::transform($name.begin(), $name.end(), $name.begin(), ::tolower); }
| t=STRING_LITERAL { $name = sstring($t.text); }
| t=QUOTED_NAME { $name = sstring($t.text); }
| k=unreserved_keyword { $name = sstring($t.text);
| k=unreserved_keyword { $name = k;
std::transform($name.begin(), $name.end(), $name.begin(), ::tolower);}
| QMARK {add_recognition_error("Bind variables cannot be used for service levels or role names");}
;

View File

@@ -12,6 +12,7 @@
#include "cql3_type.hh"
#include "cql3/util.hh"
#include "exceptions/exceptions.hh"
#include "ut_name.hh"
#include "data_dictionary/data_dictionary.hh"
#include "data_dictionary/user_types_metadata.hh"
@@ -436,7 +437,20 @@ sstring maybe_quote(const sstring& identifier) {
}
if (!need_quotes) {
return identifier;
// A seemingly valid identifier matching [a-z][a-z0-9_]* may still
// need quoting if it is a CQL keyword, e.g., "to" (see issue #9450).
// While our parser Cql.g has different production rules for different
// types of identifiers (column names, table names, etc.), all of
// these behave identically for alphanumeric strings: they exclude
// many keywords but allow keywords listed as "unreserved keywords".
// So we can use any of them, for example cident.
try {
cql3::util::do_with_parser(identifier, std::mem_fn(&cql3_parser::CqlParser::cident));
return identifier;
} catch(exceptions::syntax_exception&) {
// This alphanumeric string is not a valid identifier, so fall
// through to have it quoted:
}
}
if (num_quotes == 0) {
return make_sstring("\"", identifier, "\"");
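
The effect of the keyword check added above: an all-lowercase identifier that happens to be a reserved CQL keyword now gets quoted, while unreserved keywords and ordinary lowercase names are left alone. A simplified standalone sketch (the keyword set here is a tiny illustrative subset, not the parser's real list, and the real code also doubles embedded quotes):

#include <set>
#include <string>
#include <cstdio>

std::string maybe_quote_sketch(const std::string& id) {
    static const std::set<std::string> reserved = {"to", "where", "select", "from"}; // tiny subset
    bool plain = !id.empty() && id[0] >= 'a' && id[0] <= 'z';
    for (char c : id) {
        plain = plain && ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '_');
    }
    if (plain && !reserved.count(id)) {
        return id;                    // no quoting needed
    }
    return "\"" + id + "\"";
}

int main() {
    std::printf("%s %s %s\n",
                maybe_quote_sketch("ttl").c_str(),    // ttl   (unreserved keyword, kept as-is)
                maybe_quote_sketch("to").c_str(),     // "to"  (reserved keyword, issue #9450)
                maybe_quote_sketch("MyCol").c_str()); // "MyCol"
}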

View File

@@ -81,9 +81,7 @@ public:
virtual seastar::future<seastar::shared_ptr<cql_transport::messages::result_message>>
execute(query_processor& qp, service::query_state& state, const query_options& options) const = 0;
virtual bool depends_on_keyspace(const seastar::sstring& ks_name) const = 0;
virtual bool depends_on_column_family(const seastar::sstring& cf_name) const = 0;
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const = 0;
virtual seastar::shared_ptr<const metadata> get_result_metadata() const = 0;

View File

@@ -103,10 +103,50 @@ managed_bytes_opt get_value(const column_value& col, const column_value_eval_bag
if (!col_type->is_map()) {
throw exceptions::invalid_request_exception(format("subscripting non-map column {}", cdef->name_as_text()));
}
const auto deserialized = cdef->type->deserialize(managed_bytes_view(*data.other_columns[data.sel.index_of(*cdef)]));
int32_t index = data.sel.index_of(*cdef);
if (index == -1) {
throw std::runtime_error(
format("Column definition {} does not match any column in the query selection",
cdef->name_as_text()));
}
const managed_bytes_opt& serialized = data.other_columns[index];
if (!serialized) {
// For null[i] we return null.
return std::nullopt;
}
const auto deserialized = cdef->type->deserialize(managed_bytes_view(*serialized));
const auto& data_map = value_cast<map_type_impl::native_type>(deserialized);
const auto key = evaluate(*col.sub, options);
auto&& key_type = col_type->name_comparator();
if (key.is_null()) {
// For m[null] return null.
// This is different from Cassandra - which treats m[null]
// as an invalid request error. But m[null] -> null is more
// consistent with our usual null treatement (e.g., both
// null[2] and null < 2 return null). It will also allow us
// to support non-constant subscripts (e.g., m[a]) where "a"
// may be null in some rows and non-null in others, and it's
// not an error.
return std::nullopt;
}
if (key.is_unset_value()) {
// An m[?] with ? bound to UNSET_VALUE is a invalid query.
// We could have detected it earlier while binding, but since
// we currently don't, we must protect the following code
// which can't work with an UNSET_VALUE. Note that the
// placement of this check here means that in an empty table,
// where we never need to evaluate the filter expression, this
// error will not be detected.
throw exceptions::invalid_request_exception(
format("Unsupported unset map key for column {}",
cdef->name_as_text()));
}
if (key.type != key_type) {
// This can't happen, we always verify the index type earlier.
throw std::logic_error(
format("Tried to evaluate expression with wrong type for subscript of {}",
cdef->name_as_text()));
}
const auto found = key.view().with_linearized([&] (bytes_view key_bv) {
using entry = std::pair<data_value, data_value>;
return std::find_if(data_map.cbegin(), data_map.cend(), [&] (const entry& element) {
@@ -121,8 +161,16 @@ managed_bytes_opt get_value(const column_value& col, const column_value_eval_bag
case column_kind::clustering_key:
return managed_bytes(data.clustering_key[cdef->id]);
case column_kind::static_column:
case column_kind::regular_column:
return managed_bytes_opt(data.other_columns[data.sel.index_of(*cdef)]);
[[fallthrough]];
case column_kind::regular_column: {
int32_t index = data.sel.index_of(*cdef);
if (index == -1) {
throw std::runtime_error(
format("Column definition {} does not match any column in the query selection",
cdef->name_as_text()));
}
return managed_bytes_opt(data.other_columns[index]);
}
default:
throw exceptions::unsupported_operation_exception("Unknown column kind");
}
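
The subscript handling above makes m[x] null-tolerant: a null map or a null key yields null rather than an error, while an unset bind marker is rejected as an invalid request. A small sketch of that lookup rule using plain standard-library types (illustrative only, not the expression-evaluation code itself):

#include <map>
#include <optional>
#include <string>
#include <cstdio>

using opt_str = std::optional<std::string>;
using opt_map = std::optional<std::map<std::string, std::string>>;

// m[key]: null map -> null, null key -> null, missing key -> null.
opt_str subscript(const opt_map& m, const opt_str& key) {
    if (!m || !key) {
        return std::nullopt;
    }
    auto it = m->find(*key);
    return it == m->end() ? opt_str{} : opt_str{it->second};
}

int main() {
    opt_map m = std::map<std::string, std::string>{{"a", "1"}};
    std::puts(subscript(m, "a").value_or("null").c_str());            // 1
    std::puts(subscript(m, std::nullopt).value_or("null").c_str());   // null
    std::puts(subscript(std::nullopt, "a").value_or("null").c_str()); // null
}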

View File

@@ -953,7 +953,7 @@ bool query_processor::migration_subscriber::should_invalidate(
sstring ks_name,
std::optional<sstring> cf_name,
::shared_ptr<cql_statement> statement) {
return statement->depends_on_keyspace(ks_name) && (!cf_name || statement->depends_on_column_family(*cf_name));
return statement->depends_on(ks_name, cf_name);
}
future<> query_processor::query_internal(

View File

@@ -514,7 +514,7 @@ statement_restrictions::statement_restrictions(data_dictionary::database db,
}
if (!_nonprimary_key_restrictions->empty()) {
if (_has_queriable_regular_index) {
if (_has_queriable_regular_index && _partition_range_is_simple) {
_uses_secondary_indexing = true;
} else if (!allow_filtering) {
throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "

View File

@@ -165,7 +165,7 @@ public:
template<typename RowComparator>
void sort(const RowComparator& cmp) {
std::sort(_rows.begin(), _rows.end(), std::ref(cmp));
std::sort(_rows.begin(), _rows.end(), cmp);
}
metadata& get_metadata();

View File

@@ -422,11 +422,16 @@ bool result_set_builder::restrictions_filter::do_filter(const selection& selecti
}
auto clustering_columns_restrictions = _restrictions->get_clustering_columns_restrictions();
if (dynamic_pointer_cast<cql3::restrictions::multi_column_restriction>(clustering_columns_restrictions)) {
bool has_multi_col_clustering_restrictions =
dynamic_pointer_cast<cql3::restrictions::multi_column_restriction>(clustering_columns_restrictions) != nullptr;
if (has_multi_col_clustering_restrictions) {
clustering_key_prefix ckey = clustering_key_prefix::from_exploded(clustering_key);
return expr::is_satisfied_by(
bool multi_col_clustering_satisfied = expr::is_satisfied_by(
clustering_columns_restrictions->expression,
partition_key, clustering_key, static_row, row, selection, _options);
if (!multi_col_clustering_satisfied) {
return false;
}
}
auto static_row_iterator = static_row.iterator();
@@ -474,6 +479,13 @@ bool result_set_builder::restrictions_filter::do_filter(const selection& selecti
if (_skip_ck_restrictions) {
continue;
}
if (has_multi_col_clustering_restrictions) {
// Mixing multi column and single column restrictions on clustering
// key columns is forbidden.
// Since there are multi column restrictions we have to skip
// evaluating single column restrictions or we will get an error.
continue;
}
auto clustering_key_restrictions_map = _restrictions->get_single_column_clustering_key_restrictions();
auto restr_it = clustering_key_restrictions_map.find(cdef);
if (restr_it == clustering_key_restrictions_map.end()) {

View File

@@ -18,13 +18,7 @@ uint32_t cql3::statements::authentication_statement::get_bound_terms() const {
return 0;
}
bool cql3::statements::authentication_statement::depends_on_keyspace(
const sstring& ks_name) const {
return false;
}
bool cql3::statements::authentication_statement::depends_on_column_family(
const sstring& cf_name) const {
bool cql3::statements::authentication_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
return false;
}

View File

@@ -27,9 +27,7 @@ public:
uint32_t get_bound_terms() const override;
bool depends_on_keyspace(const sstring& ks_name) const override;
bool depends_on_column_family(const sstring& cf_name) const override;
bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
future<> check_access(query_processor& qp, const service::client_state& state) const override;

View File

@@ -20,13 +20,7 @@ uint32_t cql3::statements::authorization_statement::get_bound_terms() const {
return 0;
}
bool cql3::statements::authorization_statement::depends_on_keyspace(
const sstring& ks_name) const {
return false;
}
bool cql3::statements::authorization_statement::depends_on_column_family(
const sstring& cf_name) const {
bool cql3::statements::authorization_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
return false;
}

View File

@@ -31,9 +31,7 @@ public:
uint32_t get_bound_terms() const override;
bool depends_on_keyspace(const sstring& ks_name) const override;
bool depends_on_column_family(const sstring& cf_name) const override;
bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
future<> check_access(query_processor& qp, const service::client_state& state) const override;

View File

@@ -70,14 +70,9 @@ batch_statement::batch_statement(type type_,
{
}
bool batch_statement::depends_on_keyspace(const sstring& ks_name) const
bool batch_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
{
return false;
}
bool batch_statement::depends_on_column_family(const sstring& cf_name) const
{
return false;
return boost::algorithm::any_of(_statements, [&ks_name, &cf_name] (auto&& s) { return s.statement->depends_on(ks_name, cf_name); });
}
uint32_t batch_statement::get_bound_terms() const

View File

@@ -88,9 +88,7 @@ public:
std::unique_ptr<attributes> attrs,
cql_stats& stats);
virtual bool depends_on_keyspace(const sstring& ks_name) const override;
virtual bool depends_on_column_family(const sstring& cf_name) const override;
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
virtual uint32_t get_bound_terms() const override;

View File

@@ -20,6 +20,7 @@
#include "gms/feature_service.hh"
#include "tombstone_gc_extension.hh"
#include "tombstone_gc.hh"
#include "utils/bloom_calculations.hh"
#include <boost/algorithm/string/predicate.hpp>
@@ -145,6 +146,16 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name,
throw exceptions::configuration_exception(KW_MAX_INDEX_INTERVAL + " must be greater than " + KW_MIN_INDEX_INTERVAL);
}
if (get_simple(KW_BF_FP_CHANCE)) {
double bloom_filter_fp_chance = get_double(KW_BF_FP_CHANCE, 0/*not used*/);
double min_bloom_filter_fp_chance = utils::bloom_calculations::min_supported_bloom_filter_fp_chance();
if (bloom_filter_fp_chance <= min_bloom_filter_fp_chance || bloom_filter_fp_chance > 1.0) {
throw exceptions::configuration_exception(format(
"{} must be larger than {} and less than or equal to 1.0 (got {})",
KW_BF_FP_CHANCE, min_bloom_filter_fp_chance, bloom_filter_fp_chance));
}
}
speculative_retry::from_sstring(get_string(KW_SPECULATIVE_RETRY, speculative_retry(speculative_retry::type::NONE, 0).to_sstring()));
}

View File

@@ -13,6 +13,7 @@
#include "cql3/statements/cf_prop_defs.hh"
#include "cql3/column_identifier.hh"
#include "data_dictionary/data_dictionary.hh"
namespace cql3 {

View File

@@ -110,9 +110,6 @@ future<> modification_statement::check_access(query_processor& qp, const service
future<std::vector<mutation>>
modification_statement::get_mutations(query_processor& qp, const query_options& options, db::timeout_clock::time_point timeout, bool local, int64_t now, service::query_state& qs) const {
if (_restrictions->range_or_slice_eq_null(options)) { // See #7852 and #9290.
throw exceptions::invalid_request_exception("Invalid null value in condition for a key column");
}
auto cl = options.get_consistency();
auto json_cache = maybe_prepare_json_cache(options);
auto keys = build_partition_keys(options, json_cache);
@@ -255,6 +252,10 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
inc_cql_stats(qs.get_client_state().is_internal());
if (_restrictions->range_or_slice_eq_null(options)) { // See #7852 and #9290.
throw exceptions::invalid_request_exception("Invalid null value in condition for a key column");
}
if (has_conditions()) {
return execute_with_condition(qp, qs, options);
}
@@ -539,12 +540,8 @@ modification_statement::validate(query_processor&, const service::client_state&
}
}
bool modification_statement::depends_on_keyspace(const sstring& ks_name) const {
return keyspace() == ks_name;
}
bool modification_statement::depends_on_column_family(const sstring& cf_name) const {
return column_family() == cf_name;
bool modification_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
return keyspace() == ks_name && (!cf_name || column_family() == *cf_name);
}
void modification_statement::add_operation(::shared_ptr<operation> op) {

View File

@@ -137,9 +137,7 @@ public:
// Validate before execute, using client state and current schema
void validate(query_processor&, const service::client_state& state) const override;
virtual bool depends_on_keyspace(const sstring& ks_name) const override;
virtual bool depends_on_column_family(const sstring& cf_name) const override;
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
void add_operation(::shared_ptr<operation> op);

View File

@@ -45,12 +45,7 @@ future<> schema_altering_statement::grant_permissions_to_creator(const service::
return make_ready_future<>();
}
bool schema_altering_statement::depends_on_keyspace(const sstring& ks_name) const
{
return false;
}
bool schema_altering_statement::depends_on_column_family(const sstring& cf_name) const
bool schema_altering_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
{
return false;
}

View File

@@ -53,9 +53,7 @@ protected:
*/
virtual future<> grant_permissions_to_creator(const service::client_state&) const;
virtual bool depends_on_keyspace(const sstring& ks_name) const override;
virtual bool depends_on_column_family(const sstring& cf_name) const override;
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
virtual uint32_t get_bound_terms() const override;

View File

@@ -167,12 +167,8 @@ void select_statement::validate(query_processor&, const service::client_state& s
// Nothing to do, all validation has been done by raw_statemet::prepare()
}
bool select_statement::depends_on_keyspace(const sstring& ks_name) const {
return keyspace() == ks_name;
}
bool select_statement::depends_on_column_family(const sstring& cf_name) const {
return column_family() == cf_name;
bool select_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
return keyspace() == ks_name && (!cf_name || column_family() == *cf_name);
}
const sstring& select_statement::keyspace() const {

View File

@@ -100,8 +100,7 @@ public:
virtual uint32_t get_bound_terms() const override;
virtual future<> check_access(query_processor& qp, const service::client_state& state) const override;
virtual void validate(query_processor&, const service::client_state& state) const override;
virtual bool depends_on_keyspace(const sstring& ks_name) const override;
virtual bool depends_on_column_family(const sstring& cf_name) const override;
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
virtual future<::shared_ptr<cql_transport::messages::result_message>> execute(query_processor& qp,
service::query_state& state, const query_options& options) const override;

View File

@@ -17,13 +17,7 @@ uint32_t service_level_statement::get_bound_terms() const {
return 0;
}
bool service_level_statement::depends_on_keyspace(
const sstring &ks_name) const {
return false;
}
bool service_level_statement::depends_on_column_family(
const sstring &cf_name) const {
bool service_level_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
return false;
}

View File

@@ -43,9 +43,7 @@ public:
uint32_t get_bound_terms() const override;
bool depends_on_keyspace(const sstring& ks_name) const override;
bool depends_on_column_family(const sstring& cf_name) const override;
bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
future<> check_access(query_processor& qp, const service::client_state& state) const override;

View File

@@ -30,7 +30,7 @@ void sl_prop_defs::validate() {
data_value v = duration_type->deserialize(duration_type->from_string(*repr));
cql_duration duration = static_pointer_cast<const duration_type_impl>(duration_type)->from_value(v);
if (duration.months || duration.days) {
throw exceptions::invalid_request_exception("Timeout values cannot be longer than 24h");
throw exceptions::invalid_request_exception("Timeout values cannot be expressed in days/months");
}
if (duration.nanoseconds % 1'000'000 != 0) {
throw exceptions::invalid_request_exception("Timeout values must be expressed in millisecond granularity");

View File

@@ -39,12 +39,7 @@ std::unique_ptr<prepared_statement> truncate_statement::prepare(data_dictionary:
return std::make_unique<prepared_statement>(::make_shared<truncate_statement>(*this));
}
bool truncate_statement::depends_on_keyspace(const sstring& ks_name) const
{
return false;
}
bool truncate_statement::depends_on_column_family(const sstring& cf_name) const
bool truncate_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
{
return false;
}

View File

@@ -30,9 +30,7 @@ public:
virtual std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats) override;
virtual bool depends_on_keyspace(const sstring& ks_name) const override;
virtual bool depends_on_column_family(const sstring& cf_name) const override;
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
virtual future<> check_access(query_processor& qp, const service::client_state& state) const override;

View File

@@ -46,12 +46,7 @@ std::unique_ptr<prepared_statement> use_statement::prepare(data_dictionary::data
}
bool use_statement::depends_on_keyspace(const sstring& ks_name) const
{
return false;
}
bool use_statement::depends_on_column_family(const sstring& cf_name) const
bool use_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
{
return false;
}

View File

@@ -31,9 +31,7 @@ public:
virtual uint32_t get_bound_terms() const override;
virtual bool depends_on_keyspace(const seastar::sstring& ks_name) const override;
virtual bool depends_on_column_family(const seastar::sstring& cf_name) const override;
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
virtual seastar::future<> check_access(query_processor& qp, const service::client_state& state) const override;

View File

@@ -18,6 +18,8 @@
#include "types/listlike_partial_deserializing_iterator.hh"
#include "utils/managed_bytes.hh"
#include "exceptions/exceptions.hh"
#include <boost/algorithm/string/trim_all.hpp>
#include <boost/algorithm/string.hpp>
static inline bool is_control_char(char c) {
return c >= 0 && c <= 0x1F;
@@ -78,8 +80,35 @@ static int64_t to_int64_t(const rjson::value& value) {
return value.GetInt();
} else if (value.IsUint()) {
return value.GetUint();
} else if (value.GetUint64()) {
} else if (value.IsUint64()) {
return value.GetUint64(); //NOTICE: large uint64_t values will get overflown
} else if (value.IsDouble()) {
// We allow specifing integer constants
// using scientific notation (for example 1.3e8)
// and floating-point numbers ending with .0 (for example 12.0),
// but not floating-point numbers with fractional part (12.34).
//
// The reason is that JSON standard does not have separate
// types for integers and floating-point numbers, only
// a single "number" type. Some serializers may
// produce an integer in that floating-point format.
double double_value = value.GetDouble();
// Check if the value contains disallowed fractional part (.34 from 12.34).
// With RapidJSON and an integer value in range [-(2^53)+1, (2^53)-1],
// the fractional part will be zero as the entire value
// fits in 53-bit significand. RapidJSON's parsing code does not lose accuracy:
// when parsing a number like 12.34e8, it accumulates 1234 to a int64_t number,
// then converts it to double and multiples by power of 10, never having any
// digit in fractional part.
double integral;
double fractional = std::modf(double_value, &integral);
if (fractional != 0.0 && fractional != -0.0) {
throw marshal_exception(format("Incorrect JSON floating-point value "
"for int64 type: {} (it should not contain fractional part {})", value, fractional));
}
return double_value;
}
throw marshal_exception(format("Incorrect JSON value for int64 type: {}", value));
}
@@ -197,6 +226,17 @@ struct from_json_object_visitor {
}
bytes operator()(const boolean_type_impl& t) {
if (!value.IsBool()) {
if (value.IsString()) {
std::string str(rjson::to_string_view(value));
boost::trim_all(str);
boost::to_lower(str);
if (str == "true") {
return t.decompose(true);
} else if (str == "false") {
return t.decompose(false);
}
}
throw marshal_exception(format("Invalid JSON object {}", value));
}
return t.decompose(value.GetBool());
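
The int64 parsing change earlier in this file accepts floating-point spellings of whole numbers (1.3e8, 12.0) and rejects values with a fractional part (12.34), using std::modf to test the fraction. A standalone sketch of that check, taking a plain double instead of an rjson value:

#include <cmath>
#include <cstdint>
#include <optional>
#include <cstdio>

// Returns the integer if 'v' is a whole number, std::nullopt otherwise.
std::optional<int64_t> whole_number_to_int64(double v) {
    double integral;
    double fractional = std::modf(v, &integral);
    if (fractional != 0.0) {          // 12.34 -> fractional 0.34 -> rejected
        return std::nullopt;
    }
    return static_cast<int64_t>(v);   // 1.3e8 and 12.0 are accepted
}

int main() {
    std::printf("%lld\n", (long long)*whole_number_to_int64(1.3e8));  // 130000000
    std::printf("%d\n", whole_number_to_int64(12.34).has_value());    // 0
}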

View File

@@ -74,6 +74,13 @@ std::unique_ptr<cql3::statements::raw::select_statement> build_select_statement(
/// forbids non-alpha-numeric characters in identifier names.
/// Quoting involves wrapping the string in double-quotes ("). A double-quote
/// character itself is quoted by doubling it.
/// maybe_quote() also quotes reserved CQL keywords (e.g., "to", "where")
/// but doesn't quote *unreserved* keywords (like ttl, int or as).
/// Note that this means that if new reserved keywords are added to the
/// parser, a saved output of maybe_quote() may no longer be parsable by
/// parser. To avoid this forward-compatibility issue, use quote() instead
/// of maybe_quote() - to unconditionally quote an identifier even if it is
/// lowercase and not (yet) a keyword.
sstring maybe_quote(const sstring& s);
// Check whether timestamp is not too far in the future as this probably

View File

@@ -11,6 +11,7 @@
*/
#include <chrono>
#include <exception>
#include <seastar/core/future-util.hh>
#include <seastar/core/do_with.hh>
#include <seastar/core/semaphore.hh>
@@ -247,6 +248,7 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
} catch (data_dictionary::no_such_keyspace& ex) {
// should probably ignore and drop the batch
} catch (...) {
blogger.warn("Replay failed (will retry): {}", std::current_exception());
// timeout, overload etc.
// Do _not_ remove the batch, assuning we got a node write error.
// Since we don't have hints (which origin is satisfied with),

View File

@@ -65,6 +65,25 @@ hinted_handoff_enabled_to_json(const db::config::hinted_handoff_enabled_type& h)
return value_to_json(h.to_configuration_string());
}
// Convert a value that can be printed with operator<<, or a vector of
// such values, to JSON. An example is enum_option<T>, because enum_option<T>
// has a operator<<.
template <typename T>
static json::json_return_type
printable_to_json(const T& e) {
return value_to_json(format("{}", e));
}
template <typename T>
static json::json_return_type
printable_vector_to_json(const std::vector<T>& e) {
std::vector<sstring> converted;
converted.reserve(e.size());
for (const auto& option : e) {
converted.push_back(format("{}", option));
}
return value_to_json(converted);
}
template <>
const config_type config_type_for<bool> = config_type("bool", value_to_json<bool>);
@@ -109,11 +128,11 @@ const config_type config_type_for<db::seed_provider_type> = config_type("seed pr
template <>
const config_type config_type_for<std::vector<enum_option<db::experimental_features_t>>> = config_type(
"experimental features", value_to_json<std::vector<sstring>>);
"experimental features", printable_vector_to_json<enum_option<db::experimental_features_t>>);
template <>
const config_type config_type_for<enum_option<db::tri_mode_restriction_t>> = config_type(
"restriction mode", value_to_json<sstring>);
"restriction mode", printable_to_json<enum_option<db::tri_mode_restriction_t>>);
template <>
const config_type config_type_for<db::config::hinted_handoff_enabled_type> = config_type("hinted handoff enabled", hinted_handoff_enabled_to_json);
@@ -862,6 +881,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
"Flush tables in the system_schema keyspace after schema modification. This is required for crash recovery, but slows down tests and can be disabled for them")
, restrict_replication_simplestrategy(this, "restrict_replication_simplestrategy", liveness::LiveUpdate, value_status::Used, db::tri_mode_restriction_t::mode::FALSE, "Controls whether to disable SimpleStrategy replication. Can be true, false, or warn.")
, restrict_dtcs(this, "restrict_dtcs", liveness::LiveUpdate, value_status::Used, db::tri_mode_restriction_t::mode::WARN, "Controls whether to prevent setting DateTieredCompactionStrategy. Can be true, false, or warn.")
, cache_index_pages(this, "cache_index_pages", liveness::LiveUpdate, value_status::Used, true,
"Keep SSTable index pages in the global cache after a SSTable read. Expected to improve performance for workloads with big partitions, but may degrade performance for workloads with small partitions.")
, default_log_level(this, "default_log_level", value_status::Used)
, logger_log_level(this, "logger_log_level", value_status::Used)
, log_to_stdout(this, "log_to_stdout", value_status::Used)

View File

@@ -365,6 +365,9 @@ public:
named_value<tri_mode_restriction> restrict_replication_simplestrategy;
named_value<tri_mode_restriction> restrict_dtcs;
named_value<bool> cache_index_pages;
seastar::logging_settings logging_settings(const log_cli::options&) const;
const db::extensions& extensions() const;

View File

@@ -574,12 +574,8 @@ public:
}
future<> flush_schemas() {
return _qp.proxy().get_db().invoke_on_all([this] (replica::database& db) {
return parallel_for_each(db::schema_tables::all_table_names(schema_features::full()), [this, &db](const sstring& cf_name) {
auto& cf = db.find_column_family(db::schema_tables::NAME, cf_name);
return cf.flush();
});
});
auto& db = _qp.db().real_database();
return db.flush_on_all(db::schema_tables::NAME, db::schema_tables::all_table_names(schema_features::full()));
}
future<> migrate() {

View File

@@ -1042,12 +1042,9 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std:
co_await proxy.local().mutate_locally(std::move(mutations), tracing::trace_state_ptr());
if (do_flush) {
co_await proxy.local().get_db().invoke_on_all([&] (replica::database& db) -> future<> {
auto& cfs = column_families;
co_await parallel_for_each(cfs.begin(), cfs.end(), [&] (const utils::UUID& id) -> future<> {
auto& cf = db.find_column_family(id);
co_await cf.flush();
});
auto& db = proxy.local().local_db();
co_await parallel_for_each(column_families, [&db] (const utils::UUID& id) -> future<> {
return db.flush_on_all(id);
});
}

View File

@@ -11,6 +11,8 @@
*/
#include <boost/range/adaptors.hpp>
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include "db/snapshot-ctl.hh"
#include "replica/database.hh"
@@ -59,24 +61,21 @@ future<> snapshot_ctl::take_snapshot(sstring tag, std::vector<sstring> keyspace_
boost::copy(_db.local().get_keyspaces() | boost::adaptors::map_keys, std::back_inserter(keyspace_names));
};
return run_snapshot_modify_operation([tag = std::move(tag), keyspace_names = std::move(keyspace_names), sf, this] {
return parallel_for_each(keyspace_names, [tag, this] (auto& ks_name) {
return check_snapshot_not_exist(ks_name, tag);
}).then([this, tag, keyspace_names, sf] {
return _db.invoke_on_all([tag = std::move(tag), keyspace_names, sf] (replica::database& db) {
return parallel_for_each(keyspace_names, [&db, tag = std::move(tag), sf] (auto& ks_name) {
auto& ks = db.find_keyspace(ks_name);
return parallel_for_each(ks.metadata()->cf_meta_data(), [&db, tag = std::move(tag), sf] (auto& pair) {
auto& cf = db.find_column_family(pair.second);
return cf.snapshot(db, tag, bool(sf));
});
});
});
});
return run_snapshot_modify_operation([tag = std::move(tag), keyspace_names = std::move(keyspace_names), sf, this] () mutable {
return do_take_snapshot(std::move(tag), std::move(keyspace_names), sf);
});
}
future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf) {
future<> snapshot_ctl::do_take_snapshot(sstring tag, std::vector<sstring> keyspace_names, skip_flush sf) {
co_await parallel_for_each(keyspace_names, [tag, this] (const auto& ks_name) {
return check_snapshot_not_exist(ks_name, tag);
});
co_await parallel_for_each(keyspace_names, [this, tag = std::move(tag), sf] (const auto& ks_name) {
return _db.local().snapshot_on_all(ks_name, tag, bool(sf));
});
}
future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf, allow_view_snapshots av) {
if (ks_name.empty()) {
throw std::runtime_error("You must supply a keyspace name");
}
@@ -87,25 +86,25 @@ future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<
throw std::runtime_error("You must supply a snapshot name.");
}
return run_snapshot_modify_operation([this, ks_name = std::move(ks_name), tables = std::move(tables), tag = std::move(tag), sf] {
return check_snapshot_not_exist(ks_name, tag, tables).then([this, ks_name, tables, tag, sf] {
return do_with(std::vector<sstring>(std::move(tables)),[this, ks_name, tag, sf](const std::vector<sstring>& tables) {
return do_for_each(tables, [ks_name, tag, sf, this] (const sstring& table_name) {
if (table_name.find(".") != sstring::npos) {
throw std::invalid_argument("Cannot take a snapshot of a secondary index by itself. Run snapshot on the table that owns the index.");
}
return _db.invoke_on_all([ks_name, table_name, tag, sf] (replica::database &db) {
auto& cf = db.find_column_family(ks_name, table_name);
return cf.snapshot(db, tag, bool(sf));
});
});
});
});
return run_snapshot_modify_operation([this, ks_name = std::move(ks_name), tables = std::move(tables), tag = std::move(tag), sf, av] () mutable {
return do_take_column_family_snapshot(std::move(ks_name), std::move(tables), std::move(tag), sf, av);
});
}
future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf) {
return take_column_family_snapshot(ks_name, std::vector<sstring>{cf_name}, tag, sf);
future<> snapshot_ctl::do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf, allow_view_snapshots av) {
co_await check_snapshot_not_exist(ks_name, tag, tables);
for (const auto& table_name : tables) {
auto& cf = _db.local().find_column_family(ks_name, table_name);
if (cf.schema()->is_view() && !av) {
throw std::invalid_argument("Do not take a snapshot of a materialized view or a secondary index by itself. Run snapshot on the base table instead.");
}
}
co_await _db.local().snapshot_on_all(ks_name, std::move(tables), std::move(tag), bool(sf));
}
future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf, allow_view_snapshots av) {
return take_column_family_snapshot(ks_name, std::vector<sstring>{cf_name}, tag, sf, av);
}
future<> snapshot_ctl::clear_snapshot(sstring tag, std::vector<sstring> keyspace_names, sstring cf_name) {

View File

@@ -27,6 +27,7 @@ namespace db {
class snapshot_ctl : public peering_sharded_service<snapshot_ctl> {
public:
using skip_flush = bool_class<class skip_flush_tag>;
using allow_view_snapshots = bool_class<class allow_view_snapsots_tag>;
struct snapshot_details {
int64_t live;
@@ -64,7 +65,7 @@ public:
* @param tables a vector of tables names to snapshot
* @param tag the tag given to the snapshot; may not be null or empty
*/
future<> take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no);
future<> take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no, allow_view_snapshots av = allow_view_snapshots::no);
/**
* Takes the snapshot of a specific column family. A snapshot name must be specified.
@@ -73,7 +74,7 @@ public:
* @param columnFamilyName the column family to snapshot
* @param tag the tag given to the snapshot; may not be null or empty
*/
future<> take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf = skip_flush::no);
future<> take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf = skip_flush::no, allow_view_snapshots av = allow_view_snapshots::no);
/**
* Remove the snapshot with the given name from the given keyspaces.
@@ -97,6 +98,9 @@ private:
template <typename Func>
std::result_of_t<Func()> run_snapshot_list_operation(Func&&);
future<> do_take_snapshot(sstring tag, std::vector<sstring> keyspace_names, skip_flush sf = skip_flush::no);
future<> do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no, allow_view_snapshots av = allow_view_snapshots::no);
};
}
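For readers unfamiliar with the bool_class idiom used for the new allow_view_snapshots flag: it is seastar's strongly typed boolean, so the flag cannot be confused with skip_flush at a call site. A small self-contained sketch of the idiom (the function below is hypothetical, not part of snapshot_ctl):

// Standalone sketch of seastar::bool_class strong booleans.
#include <seastar/util/bool_class.hh>

using skip_flush = seastar::bool_class<class skip_flush_tag>;
using allow_view_snapshots = seastar::bool_class<class allow_view_snapshots_tag>;

void take_snapshot_sketch(skip_flush sf, allow_view_snapshots av) {
    if (bool(av)) { /* view snapshot explicitly allowed */ }
    if (!bool(sf)) { /* flush before snapshotting */ }
}

int main() {
    // Each flag is its own type with ::yes / ::no; swapping the argument
    // order or passing a bare `true` would not compile.
    take_snapshot_sketch(skip_flush::no, allow_view_snapshots::yes);
}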

View File

@@ -2482,10 +2482,14 @@ class db_config_table final : public streaming_virtual_table {
for (auto& c_ref : cfg.values()) {
auto& c = c_ref.get();
if (c.name() == name) {
if (c.set_value(value, utils::config_file::config_source::CQL)) {
return cfg.broadcast_to_all_shards();
} else {
return make_exception_future<>(virtual_table_update_exception("option is not live-updateable"));
try {
if (c.set_value(value, utils::config_file::config_source::CQL)) {
return cfg.broadcast_to_all_shards();
} else {
return make_exception_future<>(virtual_table_update_exception("option is not live-updateable"));
}
} catch (boost::bad_lexical_cast&) {
return make_exception_future<>(virtual_table_update_exception("cannot parse option value"));
}
}
}
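The new try/catch matters because option parsing goes through Boost lexical casts: an UPDATE that writes, say, a non-numeric string into a numeric option would otherwise let boost::bad_lexical_cast escape the virtual table. A minimal standalone illustration of that failure mode (not the Scylla code path itself):

#include <boost/lexical_cast.hpp>
#include <iostream>
#include <string>

int main() {
    try {
        // Mirrors what a numeric config option does internally with a bad value.
        auto v = boost::lexical_cast<int>(std::string("not-a-number"));
        std::cout << v << '\n';
    } catch (const boost::bad_lexical_cast&) {
        // The patched code maps this to virtual_table_update_exception
        // ("cannot parse option value") instead of failing the query path.
        std::cout << "cannot parse option value\n";
    }
}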

View File

@@ -10,6 +10,7 @@
#include <seastar/core/seastar.hh>
#include <seastar/core/sstring.hh>
#include <seastar/core/reactor.hh>
#include <utility>
#include <optional>
#include "dht/token.hh"

View File

@@ -121,6 +121,9 @@ const column_definition* view_info::view_column(const column_definition& base_de
void view_info::set_base_info(db::view::base_info_ptr base_info) {
_base_info = std::move(base_info);
// Forget the cached objects which may refer to the base schema.
_select_statement = nullptr;
_partition_slice = std::nullopt;
}
// A constructor for a base info that can facilitate reads and writes from the materialized view.
@@ -322,7 +325,11 @@ public:
view_filter_checking_visitor(const schema& base, const view_info& view)
: _base(base)
, _view(view)
, _selection(cql3::selection::selection::wildcard(_base.shared_from_this()))
, _selection(cql3::selection::selection::for_columns(_base.shared_from_this(),
boost::copy_range<std::vector<const column_definition*>>(
_base.regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return &cdef; }))
)
)
{}
void accept_new_partition(const partition_key& key, uint64_t row_count) {
@@ -859,13 +866,18 @@ void view_updates::generate_update(
bool same_row = true;
for (auto col_id : col_ids) {
auto* after = update.cells().find_cell(col_id);
// Note: multi-cell columns can't be part of the primary key.
auto& cdef = _base->regular_column_at(col_id);
if (existing) {
auto* before = existing->cells().find_cell(col_id);
// Note that this cell is necessarily atomic, because col_ids are
// view key columns, and keys must be atomic.
if (before && before->as_atomic_cell(cdef).is_live()) {
if (after && after->as_atomic_cell(cdef).is_live()) {
auto cmp = compare_atomic_cell_for_merge(before->as_atomic_cell(cdef), after->as_atomic_cell(cdef));
// We need to compare just the values of the keys, not
// metadata like the timestamp. This is because below,
// if the old and new view row have the same key, we need
// to be sure to reach the update_entry() case.
auto cmp = compare_unsigned(before->as_atomic_cell(cdef).value(), after->as_atomic_cell(cdef).value());
if (cmp != 0) {
same_row = false;
}
@@ -885,7 +897,13 @@ void view_updates::generate_update(
if (same_row) {
update_entry(base_key, update, *existing, now);
} else {
replace_entry(base_key, update, *existing, now);
// This code doesn't work if the old and new view row have the
// same key, because if they do we get both data and tombstone
// for the same timestamp (now) and the tombstone wins. This
// is why we need the "same_row" case above - it's not just a
// performance optimization.
delete_old_entry(base_key, *existing, update, now);
create_entry(base_key, update, now);
}
} else {
delete_old_entry(base_key, *existing, update, now);
@@ -1293,7 +1311,7 @@ future<> mutate_MV(
auto mut_ptr = remote_endpoints.empty() ? std::make_unique<frozen_mutation>(std::move(mut.fm)) : std::make_unique<frozen_mutation>(mut.fm);
tracing::trace(tr_state, "Locally applying view update for {}.{}; base token = {}; view token = {}",
mut.s->ks_name(), mut.s->cf_name(), base_token, view_token);
local_view_update = service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr, std::move(tr_state), db::commitlog::force_sync::no).then_wrapped(
local_view_update = service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr, tr_state, db::commitlog::force_sync::no).then_wrapped(
[s = mut.s, &stats, &cf_stats, tr_state, base_token, view_token, my_address, mut_ptr = std::move(mut_ptr),
units = sem_units.split(sem_units.count())] (future<>&& f) {
--stats.writes;
@@ -2031,15 +2049,21 @@ public:
// Called in the context of a seastar::thread.
void view_builder::execute(build_step& step, exponential_backoff_retry r) {
gc_clock::time_point now = gc_clock::now();
auto consumer = compact_for_query<emit_only_live_rows::yes, view_builder::consumer>(
auto compaction_state = make_lw_shared<compact_for_query_state<emit_only_live_rows::yes>>(
*step.reader.schema(),
now,
step.pslice,
batch_size,
query::max_partitions,
view_builder::consumer{*this, step, now});
consumer.consume_new_partition(step.current_key); // Initialize the state in case we're resuming a partition
query::max_partitions);
auto consumer = compact_for_query<emit_only_live_rows::yes, view_builder::consumer>(compaction_state, view_builder::consumer{*this, step, now});
auto built = step.reader.consume_in_thread(std::move(consumer));
if (auto ds = std::move(*compaction_state).detach_state()) {
auto& range_tombstones = std::get<std::deque<range_tombstone>>(ds->range_tombstones);
for (auto& rt : range_tombstones) {
step.reader.unpop_mutation_fragment(mutation_fragment(*step.reader.schema(), step.reader.permit(), std::move(rt)));
}
step.reader.unpop_mutation_fragment(mutation_fragment(*step.reader.schema(), step.reader.permit(), std::move(ds->partition_start)));
}
_as.check();

View File

@@ -154,10 +154,7 @@ private:
void delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now);
void do_delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now);
void update_entry(const partition_key& base_key, const clustering_row& update, const clustering_row& existing, gc_clock::time_point now);
void replace_entry(const partition_key& base_key, const clustering_row& update, const clustering_row& existing, gc_clock::time_point now) {
create_entry(base_key, update, now);
delete_old_entry(base_key, existing, update, now);
}
void update_entry_for_computed_column(const partition_key& base_key, const clustering_row& update, const std::optional<clustering_row>& existing, gc_clock::time_point now);
};
class view_update_builder {

View File

@@ -16,6 +16,7 @@
#include "db/view/row_locking.hh"
#include <seastar/core/abort_source.hh>
#include "mutation.hh"
#include <seastar/core/circular_buffer.hh>
class evictable_reader_handle;

View File

@@ -202,6 +202,12 @@ public:
});
}
future<flush_permit> get_all_flush_permits() {
return get_units(_background_work_flush_serializer, _max_background_work).then([this] (auto&& units) {
return this->get_flush_permit(std::move(units));
});
}
bool has_extraneous_flushes_requested() const {
return _extraneous_flushes > 0;
}

View File

@@ -123,10 +123,14 @@ WantedBy=multi-user.target
# - Storage: /path/to/file (inaccessible)
# - Storage: /path/to/file
#
# After systemd-v248, the coredump file output changed to:
# - Storage: /path/to/file (present)
# We need to support both versions.
#
# reference: https://github.com/systemd/systemd/commit/47f50642075a7a215c9f7b600599cbfee81a2913
corefail = False
res = re.findall(r'Storage: (.*)$', coreinfo, flags=re.MULTILINE)
res = re.findall(r'Storage: (\S+)(?: \(.+\))?$', coreinfo, flags=re.MULTILINE)
# v232 or later
if res:
corepath = res[0]

View File

@@ -70,7 +70,17 @@ if __name__ == '__main__':
network_mode = args.mode if args.mode else cfg.get('NETWORK_MODE')
if args.setup_nic_and_disks:
rps_cpus = run('{} --tune net --nic {} --get-cpu-mask'.format(perftune_base_command(), ifname), shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
res = run('{} --tune net --nic {} --get-cpu-mask'.format(perftune_base_command(), ifname), shell=True, check=True, capture_output=True, encoding='utf-8').stdout
# we need to extract CPU mask from output, since perftune.py may also print warning messages (#10082)
match = re.match('(.*\n)?(0x[0-9a-f]+(?:,0x[0-9a-f]+)*)', res, re.DOTALL)
try:
warning = match.group(1)
rps_cpus = match.group(2)
except:
raise Exception(f'Failed to retrieve CPU mask: {res}')
# print warning message if available
if warning:
print(warning.strip())
if len(rps_cpus) > 0:
cpuset = hex2list(rps_cpus)
run('/opt/scylladb/scripts/scylla_cpuset_setup --cpuset {}'.format(cpuset), shell=True, check=True)

View File

@@ -6,12 +6,16 @@ is_nonroot() {
[ -f "$scylladir"/SCYLLA-NONROOT-FILE ]
}
is_container() {
[ -f "$scylladir"/SCYLLA-CONTAINER-FILE ]
}
is_privileged() {
[ ${EUID:-${UID}} = 0 ]
}
execsudo() {
if is_nonroot; then
if is_nonroot || is_container; then
exec "$@"
else
exec sudo -u scylla -g scylla "$@"

View File

@@ -82,15 +82,17 @@ run bash -ec "echo 'debconf debconf/frontend select Noninteractive' | debconf-se
run bash -ec "rm -rf /etc/rsyslog.conf"
run apt-get -y install hostname supervisor openssh-server openssh-client openjdk-11-jre-headless python python-yaml curl rsyslog locales sudo
run locale-gen en_US.UTF-8
run update-locale LANG=en_US.UTF-8 LANGUAGE=en_US:en LC_ALL=en_US.UTF_8
run update-locale LANG=en_US.UTF-8 LANGUAGE=en_US:en LC_ALL=en_US.UTF-8
run bash -ec "dpkg -i packages/*.deb"
run apt-get -y clean all
run bash -ec "cat /scylla_bashrc >> /etc/bash.bashrc"
run mkdir -p /etc/supervisor.conf.d
run mkdir -p /var/log/scylla
run chown -R scylla:scylla /var/lib/scylla
run sed -i -e 's/^SCYLLA_ARGS=".*"$/SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --network-stack posix"/' /etc/default/scylla-server
run mkdir -p /opt/scylladb/supervisor
run touch /opt/scylladb/SCYLLA-CONTAINER-FILE
bcp dist/common/supervisor/scylla-server.sh /opt/scylladb/supervisor/scylla-server.sh
bcp dist/common/supervisor/scylla-jmx.sh /opt/scylladb/supervisor/scylla-jmx.sh
bcp dist/common/supervisor/scylla-node-exporter.sh /opt/scylladb/supervisor/scylla-node-exporter.sh

View File

@@ -1,4 +1,4 @@
[program:scylla-server]
[program:scylla]
command=/opt/scylladb/supervisor/scylla-server.sh
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0

View File

@@ -1,41 +0,0 @@
# choose following mode: virtio, dpdk, posix
NETWORK_MODE=posix
# tap device name(virtio)
TAP=tap0
# bridge device name (virtio)
BRIDGE=virbr0
# ethernet device name
IFNAME=eth0
# setup NIC's and disks' interrupts, RPS, XPS, nomerges and I/O scheduler (posix)
SET_NIC_AND_DISKS=no
# ethernet device driver (dpdk)
ETHDRV=
# ethernet device PCI ID (dpdk)
ETHPCIID=
# number of hugepages
NR_HUGEPAGES=64
# user for process (must be root for dpdk)
USER=scylla
# group for process
GROUP=scylla
# scylla home dir
SCYLLA_HOME=/var/lib/scylla
# scylla config dir
SCYLLA_CONF=/etc/scylla
# scylla arguments
SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --network-stack posix"
# setup as AMI instance
AMI=no

View File

@@ -32,7 +32,7 @@
logging::logger fmr_logger("flat_mutation_reader");
flat_mutation_reader& flat_mutation_reader::operator=(flat_mutation_reader&& o) noexcept {
if (_impl) {
if (_impl && _impl->is_close_required()) {
impl* ip = _impl.get();
// Abort to enforce calling close() before readers are closed
// to prevent leaks and potential use-after-free due to background
@@ -45,7 +45,7 @@ flat_mutation_reader& flat_mutation_reader::operator=(flat_mutation_reader&& o)
}
flat_mutation_reader::~flat_mutation_reader() {
if (_impl) {
if (_impl && _impl->is_close_required()) {
impl* ip = _impl.get();
// Abort to enforce calling close() before readers are closed
// to prevent leaks and potential use-after-free due to background
@@ -1580,6 +1580,9 @@ bool mutation_fragment_stream_validator::operator()(dht::token t) {
}
bool mutation_fragment_stream_validator::operator()(mutation_fragment_v2::kind kind, position_in_partition_view pos) {
if (kind == mutation_fragment_v2::kind::partition_end && _current_tombstone) {
return false;
}
if (_prev_kind == mutation_fragment_v2::kind::partition_end) {
const bool valid = (kind == mutation_fragment_v2::kind::partition_start);
if (valid) {
@@ -1607,7 +1610,11 @@ bool mutation_fragment_stream_validator::operator()(mutation_fragment::kind kind
}
bool mutation_fragment_stream_validator::operator()(const mutation_fragment_v2& mf) {
return (*this)(mf.mutation_fragment_kind(), mf.position());
const auto valid = (*this)(mf.mutation_fragment_kind(), mf.position());
if (valid && mf.is_range_tombstone_change()) {
_current_tombstone = mf.as_range_tombstone_change().tombstone();
}
return valid;
}
bool mutation_fragment_stream_validator::operator()(const mutation_fragment& mf) {
return (*this)(to_mutation_fragment_kind_v2(mf.mutation_fragment_kind()), mf.position());
@@ -1646,11 +1653,17 @@ void mutation_fragment_stream_validator::reset(dht::decorated_key dk) {
_prev_partition_key = dk;
_prev_pos = position_in_partition::for_partition_start();
_prev_kind = mutation_fragment_v2::kind::partition_start;
_current_tombstone = {};
}
void mutation_fragment_stream_validator::reset(const mutation_fragment_v2& mf) {
_prev_pos = mf.position();
_prev_kind = mf.mutation_fragment_kind();
if (mf.is_range_tombstone_change()) {
_current_tombstone = mf.as_range_tombstone_change().tombstone();
} else {
_current_tombstone = {};
}
}
void mutation_fragment_stream_validator::reset(const mutation_fragment& mf) {
_prev_pos = mf.position();
@@ -1719,6 +1732,11 @@ bool mutation_fragment_stream_validating_filter::operator()(mutation_fragment_v2
fmr_logger.debug("[validator {}] {}:{}", static_cast<void*>(this), kind, pos);
if (kind == mutation_fragment_v2::kind::partition_end && _current_tombstone) {
on_validation_error(fmr_logger, format("[validator {} for {}] Unexpected active tombstone at partition-end: partition key {}: tombstone {}",
static_cast<void*>(this), _name, _validator.previous_partition_key(), _current_tombstone));
}
if (_validation_level >= mutation_fragment_stream_validation_level::clustering_key) {
valid = _validator(kind, pos);
} else {
@@ -1745,7 +1763,11 @@ bool mutation_fragment_stream_validating_filter::operator()(mutation_fragment::k
}
bool mutation_fragment_stream_validating_filter::operator()(const mutation_fragment_v2& mv) {
return (*this)(mv.mutation_fragment_kind(), mv.position());
auto valid = (*this)(mv.mutation_fragment_kind(), mv.position());
if (valid && mv.is_range_tombstone_change()) {
_current_tombstone = mv.as_range_tombstone_change().tombstone();
}
return valid;
}
bool mutation_fragment_stream_validating_filter::operator()(const mutation_fragment& mv) {
return (*this)(to_mutation_fragment_kind_v2(mv.mutation_fragment_kind()), mv.position());
@@ -1764,7 +1786,7 @@ void mutation_fragment_stream_validating_filter::on_end_of_stream() {
}
flat_mutation_reader_v2& flat_mutation_reader_v2::operator=(flat_mutation_reader_v2&& o) noexcept {
if (_impl) {
if (_impl && _impl->is_close_required()) {
impl* ip = _impl.get();
// Abort to enforce calling close() before readers are closed
// to prevent leaks and potential use-after-free due to background
@@ -1777,7 +1799,7 @@ flat_mutation_reader_v2& flat_mutation_reader_v2::operator=(flat_mutation_reader
}
flat_mutation_reader_v2::~flat_mutation_reader_v2() {
if (_impl) {
if (_impl && _impl->is_close_required()) {
impl* ip = _impl.get();
// Abort to enforce calling close() before readers are closed
// to prevent leaks and potential use-after-free due to background

View File

@@ -132,6 +132,7 @@ public:
private:
tracked_buffer _buffer;
size_t _buffer_size = 0;
bool _close_required = false;
protected:
size_t max_buffer_size_in_bytes = default_max_buffer_size_in_bytes();
bool _end_of_stream = false;
@@ -167,6 +168,8 @@ public:
bool is_end_of_stream() const { return _end_of_stream; }
bool is_buffer_empty() const { return _buffer.empty(); }
bool is_buffer_full() const { return _buffer_size >= max_buffer_size_in_bytes; }
bool is_close_required() const { return _close_required; }
void set_close_required() { _close_required = true; }
static constexpr size_t default_max_buffer_size_in_bytes() { return 8 * 1024; }
mutation_fragment pop_mutation_fragment() {
@@ -504,9 +507,15 @@ public:
//
// Can be used to skip over entire partitions if interleaved with
// `operator()()` calls.
future<> next_partition() { return _impl->next_partition(); }
future<> next_partition() {
_impl->set_close_required();
return _impl->next_partition();
}
future<> fill_buffer() { return _impl->fill_buffer(); }
future<> fill_buffer() {
_impl->set_close_required();
return _impl->fill_buffer();
}
// Changes the range of partitions to pr. The range can only be moved
// forwards. pr.begin() needs to be larger than pr.end() of the previously
@@ -515,6 +524,7 @@ public:
// pr needs to be valid until the reader is destroyed or fast_forward_to()
// is called again.
future<> fast_forward_to(const dht::partition_range& pr) {
_impl->set_close_required();
return _impl->fast_forward_to(pr);
}
// Skips to a later range of rows.
@@ -544,6 +554,7 @@ public:
// In particular one must first enter a partition by fetching a `partition_start`
// fragment before calling `fast_forward_to`.
future<> fast_forward_to(position_range cr) {
_impl->set_close_required();
return _impl->fast_forward_to(std::move(cr));
}
// Closes the reader.
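The intent of the new _close_required flag: a reader that never had a buffer-affecting entry point run on it (fill_buffer, next_partition, fast_forward_to) has nothing in flight, so destroying it without close() is harmless; once one of those operations runs, dropping the reader without close() aborts. A tiny standalone sketch of the same guard pattern (not the reader class itself):

#include <cstdlib>
#include <iostream>

// Hedged sketch: a resource that only insists on close() after it was used.
class guarded_resource {
    bool _close_required = false;
    bool _closed = false;
public:
    void use() { _close_required = true; /* background work may start here */ }
    void close() { _closed = true; /* background work is drained here */ }
    ~guarded_resource() {
        if (_close_required && !_closed) {
            std::cerr << "destroyed without close() after use -- aborting\n";
            std::abort();   // mirrors the reader's abort-on-missing-close policy
        }
    }
};

int main() {
    { guarded_resource r; }                        // never used: fine to drop silently
    { guarded_resource r; r.use(); r.close(); }    // used and closed: fine
    // { guarded_resource r; r.use(); }            // would abort, like a leaked reader
}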

View File

@@ -164,6 +164,7 @@ public:
private:
tracked_buffer _buffer;
size_t _buffer_size = 0;
bool _close_required = false;
protected:
size_t max_buffer_size_in_bytes = default_max_buffer_size_in_bytes();
@@ -205,6 +206,8 @@ public:
bool is_end_of_stream() const { return _end_of_stream; }
bool is_buffer_empty() const { return _buffer.empty(); }
bool is_buffer_full() const { return _buffer_size >= max_buffer_size_in_bytes; }
bool is_close_required() const { return _close_required; }
void set_close_required() { _close_required = true; }
static constexpr size_t default_max_buffer_size_in_bytes() { return 8 * 1024; }
mutation_fragment_v2 pop_mutation_fragment() {
@@ -542,9 +545,15 @@ public:
//
// Can be used to skip over entire partitions if interleaved with
// `operator()()` calls.
future<> next_partition() { return _impl->next_partition(); }
future<> next_partition() {
_impl->set_close_required();
return _impl->next_partition();
}
future<> fill_buffer() { return _impl->fill_buffer(); }
future<> fill_buffer() {
_impl->set_close_required();
return _impl->fill_buffer();
}
// Changes the range of partitions to pr. The range can only be moved
// forwards. pr.begin() needs to be larger than pr.end() of the previously
@@ -553,6 +562,7 @@ public:
// pr needs to be valid until the reader is destroyed or fast_forward_to()
// is called again.
future<> fast_forward_to(const dht::partition_range& pr) {
_impl->set_close_required();
return _impl->fast_forward_to(pr);
}
// Skips to a later range of rows.
@@ -582,6 +592,7 @@ public:
// In particular one must first enter a partition by fetching a `partition_start`
// fragment before calling `fast_forward_to`.
future<> fast_forward_to(position_range cr) {
_impl->set_close_required();
return _impl->fast_forward_to(std::move(cr));
}
// Closes the reader.

View File

@@ -143,7 +143,7 @@ export LD_LIBRARY_PATH="$prefix/libreloc"
export UBSAN_OPTIONS="${UBSAN_OPTIONS:+$UBSAN_OPTIONS:}suppressions=$prefix/libexec/ubsan-suppressions.supp"
exec -a "\$0" "$prefix/libexec/$bin" "\$@"
EOF
chmod +x "$root/$prefix/bin/$bin"
chmod 755 "$root/$prefix/bin/$bin"
}
relocate_python3() {
@@ -156,11 +156,11 @@ relocate_python3() {
local pythonpath="$(dirname "$pythoncmd")"
if [ ! -x "$script" ]; then
cp "$script" "$install"
install -m755 "$script" "$install"
return
fi
mkdir -p "$relocateddir"
cp "$script" "$relocateddir"
install -d -m755 "$relocateddir"
install -m755 "$script" "$relocateddir"
cat > "$install"<<EOF
#!/usr/bin/env bash
[[ -z "\$LD_PRELOAD" ]] || { echo "\$0: not compatible with LD_PRELOAD" >&2; exit 110; }
@@ -178,7 +178,7 @@ if [ -f "\${DEBIAN_SSL_CERT_FILE}" ]; then
fi
PYTHONPATH="\${d}:\${d}/libexec:\$PYTHONPATH" PATH="\${d}/../bin:\${d}/$pythonpath:\${PATH}" SSL_CERT_FILE="\${c}" exec -a "\$0" "\${d}/libexec/\${b}" "\$@"
EOF
chmod +x "$install"
chmod 755 "$install"
}
install() {
@@ -392,6 +392,7 @@ install -d -m755 -d "$rprefix"/scyllatop
cp -r tools/scyllatop/* "$rprefix"/scyllatop
install -d -m755 -d "$rprefix"/scripts
cp -r dist/common/scripts/* "$rprefix"/scripts
chmod 755 "$rprefix"/scripts/*
ln -srf "$rprefix/scyllatop/scyllatop.py" "$rprefix/bin/scyllatop"
if $supervisor; then
install -d -m755 "$rprefix"/supervisor
@@ -508,8 +509,13 @@ relocate_python3 "$rprefix"/scripts fix_system_distributed_tables.py
if $supervisor; then
install -d -m755 `supervisor_dir $retc`
for service in scylla-server scylla-jmx scylla-node-exporter; do
if [ "$service" = "scylla-server" ]; then
program="scylla"
else
program=$service
fi
cat << EOS > `supervisor_conf $retc $service`
[program:$service]
[program:$program]
directory=$rprefix
command=/bin/bash -c './supervisor/$service.sh'
EOS

View File

@@ -34,6 +34,10 @@ azure_snitch::azure_snitch(const sstring& fname, unsigned io_cpuid) : production
}
future<> azure_snitch::load_config() {
if (this_shard_id() != io_cpu_id()) {
co_return;
}
sstring region = co_await azure_api_call(REGION_NAME_QUERY_PATH);
sstring azure_zone = co_await azure_api_call(ZONE_NAME_QUERY_PATH);

View File

@@ -1,5 +1,7 @@
#include "locator/ec2_snitch.hh"
#include <seastar/core/seastar.hh>
#include <seastar/core/sleep.hh>
#include <seastar/core/do_with.hh>
#include <boost/algorithm/string/classification.hpp>
#include <boost/algorithm/string/split.hpp>
@@ -67,6 +69,30 @@ future<> ec2_snitch::start() {
}
future<sstring> ec2_snitch::aws_api_call(sstring addr, uint16_t port, sstring cmd) {
return do_with(int(0), [this, addr, port, cmd] (int& i) {
return repeat_until_value([this, addr, port, cmd, &i]() -> future<std::optional<sstring>> {
++i;
return aws_api_call_once(addr, port, cmd).then([] (auto res) {
return make_ready_future<std::optional<sstring>>(std::move(res));
}).handle_exception([&i] (auto ep) {
try {
std::rethrow_exception(ep);
} catch (const std::system_error &e) {
logger().error(e.what());
if (i >= AWS_API_CALL_RETRIES - 1) {
logger().error("Maximum number of retries exceeded");
throw e;
}
}
return sleep(AWS_API_CALL_RETRY_INTERVAL).then([] {
return make_ready_future<std::optional<sstring>>(std::nullopt);
});
});
});
});
}
future<sstring> ec2_snitch::aws_api_call_once(sstring addr, uint16_t port, sstring cmd) {
return connect(socket_address(inet_address{addr}, port))
.then([this, addr, cmd] (connected_socket fd) {
_sd = std::move(fd);
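The retry loop above is written with seastar's repeat_until_value() and sleep(); stripped of the asynchrony, the control flow is a bounded retry with a fixed pause. A synchronous sketch of the same policy, with a hypothetical fetch_once() standing in for aws_api_call_once():

#include <chrono>
#include <stdexcept>
#include <string>
#include <system_error>
#include <thread>

constexpr int retries = 5;                                  // cf. AWS_API_CALL_RETRIES
constexpr auto retry_interval = std::chrono::seconds{5};    // cf. AWS_API_CALL_RETRY_INTERVAL

// Hypothetical stand-in for aws_api_call_once(); fails the first two calls.
std::string fetch_once(const std::string& path) {
    static int calls = 0;
    if (++calls < 3) {
        throw std::system_error(std::make_error_code(std::errc::connection_refused));
    }
    return "us-east-1a";   // pretend metadata response
}

std::string fetch_with_retries(const std::string& path) {
    for (int i = 0; i < retries; ++i) {
        try {
            return fetch_once(path);
        } catch (const std::system_error&) {
            if (i == retries - 1) {
                throw;                  // "Maximum number of retries exceeded"
            }
        }
        std::this_thread::sleep_for(retry_interval);
    }
    throw std::logic_error("unreachable");
}

int main() {
    return fetch_with_retries("/latest/meta-data/placement/availability-zone").empty();
}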

View File

@@ -16,6 +16,8 @@ public:
static constexpr const char* ZONE_NAME_QUERY_REQ = "/latest/meta-data/placement/availability-zone";
static constexpr const char* AWS_QUERY_SERVER_ADDR = "169.254.169.254";
static constexpr uint16_t AWS_QUERY_SERVER_PORT = 80;
static constexpr int AWS_API_CALL_RETRIES = 5;
static constexpr auto AWS_API_CALL_RETRY_INTERVAL = std::chrono::seconds{5};
ec2_snitch(const sstring& fname = "", unsigned io_cpu_id = 0);
virtual future<> start() override;
@@ -32,5 +34,6 @@ private:
output_stream<char> _out;
http_response_parser _parser;
sstring _zone_req;
future<sstring> aws_api_call_once(sstring addr, uint16_t port, const sstring cmd);
};
} // namespace locator

main.cc
View File

@@ -367,11 +367,38 @@ static auto defer_verbose_shutdown(const char* what, Func&& func) {
startlog.info("Shutting down {}", what);
try {
func();
startlog.info("Shutting down {} was successful", what);
} catch (...) {
startlog.error("Unexpected error shutting down {}: {}", what, std::current_exception());
throw;
auto ex = std::current_exception();
bool do_abort = true;
try {
std::rethrow_exception(ex);
} catch (const std::system_error& e) {
// System error codes we consider "environmental",
// i.e. not scylla's fault, therefore there is no point in
// aborting and dumping core.
for (int i : {EIO, EACCES, ENOSPC}) {
if (e.code() == std::error_code(i, std::system_category())) {
do_abort = false;
break;
}
}
} catch (...) {
}
auto msg = fmt::format("Unexpected error shutting down {}: {}", what, ex);
if (do_abort) {
startlog.error("{}: aborting", msg);
abort();
} else {
startlog.error("{}: exiting, at {}", msg, current_backtrace());
// Call _exit() rather than exit() to exit immediately
// without calling exit handlers, avoiding
// boost::intrusive::detail::destructor_impl assert failure
// from ~segment_pool exit handler.
_exit(255);
}
}
startlog.info("Shutting down {} was successful", what);
};
auto ret = deferred_action(std::move(vfunc));
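The new error handling distinguishes "environmental" failures (disk I/O errors, permission problems, lack of space) from everything else: only the latter justify abort() and a core dump. A standalone sketch of just the classification step, using only <system_error> (the surrounding shutdown machinery is omitted):

#include <cerrno>
#include <exception>
#include <system_error>

// Hedged sketch: returns true for errors that are the environment's fault,
// mirroring the EIO/EACCES/ENOSPC whitelist in the hunk above.
static bool is_environmental(const std::exception_ptr& ep) {
    try {
        std::rethrow_exception(ep);
    } catch (const std::system_error& e) {
        for (int err : {EIO, EACCES, ENOSPC}) {
            if (e.code() == std::error_code(err, std::system_category())) {
                return true;    // exit cleanly instead of dumping core
            }
        }
    } catch (...) {
        // Unknown exception type: treat as unexpected.
    }
    return false;               // caller aborts to capture a core dump
}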
@@ -547,6 +574,12 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
cfg->broadcast_to_all_shards().get();
// We pass this piece of config through a global as a temporary hack.
// See the comment at the definition of sstables::global_cache_index_pages.
smp::invoke_on_all([&cfg] {
sstables::global_cache_index_pages = cfg->cache_index_pages.operator utils::updateable_value<bool>();
}).get();
::sighup_handler sighup_handler(opts, *cfg);
auto stop_sighup_handler = defer_verbose_shutdown("sighup", [&] {
sighup_handler.stop().get();

View File

@@ -15,6 +15,7 @@
#include "sstables/shared_sstable.hh"
#include <seastar/core/future.hh>
#include <seastar/core/io_priority_class.hh>
#include "reader_permit.hh"
class memtable;
class flat_mutation_reader;

View File

@@ -438,6 +438,8 @@ static constexpr unsigned do_get_rpc_client_idx(messaging_verb verb) {
// should not be blocked by any data requests.
case messaging_verb::GROUP0_PEER_EXCHANGE:
case messaging_verb::GROUP0_MODIFY_CONFIG:
// ATTN -- if moving GOSSIP_ verbs elsewhere, mind updating the tcp_nodelay
// setting in get_rpc_client(), which assumes gossiper verbs live in idx 0
return 0;
case messaging_verb::PREPARE_MESSAGE:
case messaging_verb::PREPARE_DONE_MESSAGE:
@@ -695,7 +697,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
}();
auto must_tcp_nodelay = [&] {
if (idx == 1) {
if (idx == 0) {
return true; // gossip
}
if (_cfg.tcp_nodelay == tcp_nodelay_what::local) {
@@ -716,10 +718,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
}
opts.tcp_nodelay = must_tcp_nodelay;
opts.reuseaddr = true;
// We send cookies only for non-default statement tenant clients.
if (idx > 3) {
opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;
}
opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;
auto client = must_encrypt ?
::make_shared<rpc_protocol_client_wrapper>(_rpc->protocol(), std::move(opts),

View File

@@ -272,8 +272,8 @@ public:
future<> lookup_readers(db::timeout_clock::time_point timeout);
future<> save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, detached_compaction_state compaction_state,
std::optional<clustering_key_prefix> last_ckey);
future<> save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, std::optional<detached_compaction_state> compaction_state,
dht::decorated_key last_pkey, std::optional<clustering_key_prefix> last_ckey);
future<> stop();
};
@@ -580,19 +580,22 @@ future<> read_context::lookup_readers(db::timeout_clock::time_point timeout) {
});
}
future<> read_context::save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, detached_compaction_state compaction_state,
std::optional<clustering_key_prefix> last_ckey) {
future<> read_context::save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, std::optional<detached_compaction_state> compaction_state,
dht::decorated_key last_pkey, std::optional<clustering_key_prefix> last_ckey) {
if (_cmd.query_uuid == utils::UUID{}) {
return make_ready_future<>();
}
auto last_pkey = compaction_state.partition_start.key();
const auto cb_stats = dismantle_combined_buffer(std::move(unconsumed_buffer), last_pkey);
tracing::trace(_trace_state, "Dismantled combined buffer: {}", cb_stats);
const auto cs_stats = dismantle_compaction_state(std::move(compaction_state));
tracing::trace(_trace_state, "Dismantled compaction state: {}", cs_stats);
auto cs_stats = dismantle_buffer_stats{};
if (compaction_state) {
cs_stats = dismantle_compaction_state(std::move(*compaction_state));
tracing::trace(_trace_state, "Dismantled compaction state: {}", cs_stats);
} else {
tracing::trace(_trace_state, "No compaction state to dismantle, partition exhausted", cs_stats);
}
return do_with(std::move(last_pkey), std::move(last_ckey), [this] (const dht::decorated_key& last_pkey,
const std::optional<clustering_key_prefix>& last_ckey) {
@@ -745,16 +748,18 @@ future<typename ResultBuilder::result_type> do_query(
ResultBuilder&& result_builder) {
auto ctx = seastar::make_shared<read_context>(db, s, cmd, ranges, trace_state, timeout);
co_await ctx->lookup_readers(timeout);
std::exception_ptr ex;
try {
co_await ctx->lookup_readers(timeout);
auto [last_ckey, result, unconsumed_buffer, compaction_state] = co_await read_page<ResultBuilder>(ctx, s, cmd, ranges, trace_state,
std::move(result_builder));
if (compaction_state->are_limits_reached() || result.is_short_read()) {
co_await ctx->save_readers(std::move(unconsumed_buffer), std::move(*compaction_state).detach_state(), std::move(last_ckey));
// Must be called before calling `detach_state()`.
auto last_pkey = *compaction_state->current_partition();
co_await ctx->save_readers(std::move(unconsumed_buffer), std::move(*compaction_state).detach_state(), std::move(last_pkey), std::move(last_ckey));
}
co_await ctx->stop();

View File

@@ -167,6 +167,9 @@ class compact_mutation_state {
std::unique_ptr<mutation_compactor_garbage_collector> _collector;
compaction_stats _stats;
// Remember if we requested to stop mid-partition.
stop_iteration _stop = stop_iteration::no;
private:
template <typename Consumer, typename GCConsumer>
requires CompactedFragmentsConsumer<Consumer> && CompactedFragmentsConsumer<GCConsumer>
@@ -304,6 +307,7 @@ public:
}
void consume_new_partition(const dht::decorated_key& dk) {
_stop = stop_iteration::no;
auto& pk = dk.key();
_dk = &dk;
_return_static_content_on_partition_with_no_rows =
@@ -370,9 +374,9 @@ public:
_static_row_live = is_live;
if (is_live || (!only_live() && !sr.empty())) {
partition_is_not_empty(consumer);
return consumer.consume(std::move(sr), current_tombstone, is_live);
_stop = consumer.consume(std::move(sr), current_tombstone, is_live);
}
return stop_iteration::no;
return _stop;
}
template <typename Consumer, typename GCConsumer>
@@ -424,22 +428,21 @@ public:
};
if (only_live() && is_live) {
auto stop = consume_row();
_stop = consume_row();
if (++_rows_in_current_partition == _current_partition_limit) {
return stop_iteration::yes;
_stop = stop_iteration::yes;
}
return stop;
return _stop;
} else if (!only_live()) {
auto stop = stop_iteration::no;
if (!cr.empty()) {
stop = consume_row();
_stop = consume_row();
}
if (!sstable_compaction() && is_live && ++_rows_in_current_partition == _current_partition_limit) {
return stop_iteration::yes;
_stop = stop_iteration::yes;
}
return stop;
return _stop;
}
return stop_iteration::no;
return _stop;
}
template <typename Consumer, typename GCConsumer>
@@ -448,7 +451,8 @@ public:
++_stats.range_tombstones;
_range_tombstones.apply(rt);
// FIXME: drop tombstone if it is fully covered by other range tombstones
return do_consume(std::move(rt), consumer, gc_consumer);
_stop = do_consume(std::move(rt), consumer, gc_consumer);
return _stop;
}
template <typename Consumer, typename GCConsumer>
@@ -459,9 +463,9 @@ public:
_rt_assembler.emplace();
}
if (auto rt_opt = _rt_assembler->consume(_schema, std::move(rtc))) {
return do_consume(std::move(*rt_opt), consumer, gc_consumer);
_stop = do_consume(std::move(*rt_opt), consumer, gc_consumer);
}
return stop_iteration::no;
return _stop;
}
template <typename Consumer, typename GCConsumer>
@@ -562,16 +566,31 @@ public:
/// compactor will result in the new compactor being in the same state *this
/// is (given the same outside parameters of course). Practically this
/// allows the compaction state to be stored in the compacted reader.
detached_compaction_state detach_state() && {
/// If the currently compacted partition is exhausted a disengaged optional
/// is returned -- in this case there is no state to detach.
std::optional<detached_compaction_state> detach_state() && {
// If we exhausted the partition, there is no need to detach-restore the
// compaction state.
// We exhausted the partition if `consume_partition_end()` was called
// without us requesting the consumption to stop (remembered in _stop)
// from one of the consume() overloads.
// The consume algorithm calls `consume_partition_end()` in two cases:
// * on a partition-end fragment
// * consume() requested to stop
// In the latter case, the partition is not exhausted. Even if the next
// fragment to process is a partition-end, it will not be consumed.
if (!_stop) {
return {};
}
partition_start ps(std::move(_last_dk), _range_tombstones.get_partition_tombstone());
if (_rt_assembler) {
if (_current_tombstone) {
return {std::move(ps), std::move(_last_static_row), range_tombstone_change(position_in_partition_view::after_key(_last_clustering_pos), _current_tombstone)};
return detached_compaction_state{std::move(ps), std::move(_last_static_row), range_tombstone_change(position_in_partition_view::after_key(_last_clustering_pos), _current_tombstone)};
} else {
return {std::move(ps), std::move(_last_static_row), std::optional<range_tombstone_change>{}};
return detached_compaction_state{std::move(ps), std::move(_last_static_row), std::optional<range_tombstone_change>{}};
}
}
return {std::move(ps), std::move(_last_static_row), std::move(_range_tombstones).range_tombstones()};
return detached_compaction_state{std::move(ps), std::move(_last_static_row), std::move(_range_tombstones).range_tombstones()};
}
const compaction_stats& stats() const { return _stats; }

View File

@@ -28,6 +28,7 @@ class mutation_fragment_stream_validator {
mutation_fragment_v2::kind _prev_kind;
position_in_partition _prev_pos;
dht::decorated_key _prev_partition_key;
tombstone _current_tombstone;
public:
explicit mutation_fragment_stream_validator(const schema& s);
@@ -122,6 +123,12 @@ public:
const position_in_partition& previous_position() const {
return _prev_pos;
}
/// Get the current effective tombstone
///
/// Not meaningful when operator()(mutation_fragment_v2) is not used.
tombstone current_tombstone() const {
return _current_tombstone;
}
/// The previous valid partition key.
///
/// Only valid if `operator()(const dht::decorated_key&)` or
@@ -151,6 +158,7 @@ class mutation_fragment_stream_validating_filter {
mutation_fragment_stream_validator _validator;
sstring _name;
mutation_fragment_stream_validation_level _validation_level;
tombstone _current_tombstone;
public:
/// Constructor.

View File

@@ -826,6 +826,7 @@ public:
void apply(tombstone deleted_at) {
_deleted_at.apply(deleted_at);
maybe_shadow();
}
void apply(shadowable_tombstone deleted_at) {

View File

@@ -1581,11 +1581,7 @@ private:
tracing::global_trace_state_ptr _trace_state;
const mutation_reader::forwarding _fwd_mr;
reader_concurrency_semaphore::inactive_read_handle _irh;
bool _drop_partition_start = false;
bool _drop_static_row = false;
// Validate the partition key of the first emitted partition, set after the
// reader was recreated.
bool _validate_partition_key = false;
bool _reader_recreated = false; // set if reader was recreated since last operation
position_in_partition::tri_compare _tri_cmp;
std::optional<dht::decorated_key> _last_pkey;
@@ -1606,10 +1602,9 @@ private:
void adjust_partition_slice();
flat_mutation_reader_v2 recreate_reader();
future<flat_mutation_reader_v2> resume_or_create_reader();
void maybe_validate_partition_start(const flat_mutation_reader_v2::tracked_buffer& buffer);
void validate_partition_start(const partition_start& ps);
void validate_position_in_partition(position_in_partition_view pos) const;
bool should_drop_fragment(const mutation_fragment_v2& mf);
future<> do_fill_buffer();
void examine_first_fragments(mutation_fragment_v2_opt& mf1, mutation_fragment_v2_opt& mf2, mutation_fragment_v2_opt& mf3);
public:
evictable_reader_v2(
@@ -1725,9 +1720,6 @@ flat_mutation_reader_v2 evictable_reader_v2::recreate_reader() {
_range_override.reset();
_slice_override.reset();
_drop_partition_start = false;
_drop_static_row = false;
if (_last_pkey) {
bool partition_range_is_inclusive = true;
@@ -1736,11 +1728,8 @@ flat_mutation_reader_v2 evictable_reader_v2::recreate_reader() {
partition_range_is_inclusive = false;
break;
case partition_region::static_row:
_drop_partition_start = true;
break;
case partition_region::clustered:
_drop_partition_start = true;
_drop_static_row = true;
adjust_partition_slice();
slice = &*_slice_override;
break;
@@ -1763,7 +1752,7 @@ flat_mutation_reader_v2 evictable_reader_v2::recreate_reader() {
_range_override = dht::partition_range({dht::partition_range::bound(*_last_pkey, partition_range_is_inclusive)}, _pr->end());
range = &*_range_override;
_validate_partition_key = true;
_reader_recreated = true;
}
return _ms.make_reader_v2(
@@ -1788,41 +1777,33 @@ future<flat_mutation_reader_v2> evictable_reader_v2::resume_or_create_reader() {
co_return recreate_reader();
}
void evictable_reader_v2::maybe_validate_partition_start(const flat_mutation_reader_v2::tracked_buffer& buffer) {
if (!_validate_partition_key || buffer.empty()) {
return;
}
// If this is set we can assume the first fragment is a partition-start.
const auto& ps = buffer.front().as_partition_start();
void evictable_reader_v2::validate_partition_start(const partition_start& ps) {
const auto tri_cmp = dht::ring_position_comparator(*_schema);
// If we recreated the reader after fast-forwarding it we won't have
// _last_pkey set. In this case it is enough to check if the partition
// is in range.
if (_last_pkey) {
const auto cmp_res = tri_cmp(*_last_pkey, ps.key());
if (_drop_partition_start) { // we expect to continue from the same partition
if (_next_position_in_partition.region() != partition_region::partition_start) { // we expect to continue from the same partition
// We cannot assume the partition we stopped the read at is still alive
// when we recreate the reader. It might have been compacted away in the
// meanwhile, so allow for a larger partition too.
require(
cmp_res <= 0,
"{}(): validation failed, expected partition with key larger or equal to _last_pkey {} due to _drop_partition_start being set, but got {}",
"{}(): validation failed, expected partition with key larger or equal to _last_pkey {}, but got {}",
__FUNCTION__,
*_last_pkey,
ps.key());
// Reset drop flags and next pos if we are not continuing from the same partition
// Reset next pos if we are not continuing from the same partition
if (cmp_res < 0) {
// Close previous partition, we are not going to continue it.
push_mutation_fragment(*_schema, _permit, partition_end{});
_drop_partition_start = false;
_drop_static_row = false;
_next_position_in_partition = position_in_partition::for_partition_start();
}
} else { // should be a larger partition
require(
cmp_res < 0,
"{}(): validation failed, expected partition with key larger than _last_pkey {} due to _drop_partition_start being unset, but got {}",
"{}(): validation failed, expected partition with key larger than _last_pkey {}, but got {}",
__FUNCTION__,
*_last_pkey,
ps.key());
@@ -1836,8 +1817,6 @@ void evictable_reader_v2::maybe_validate_partition_start(const flat_mutation_rea
__FUNCTION__,
prange,
ps.key());
_validate_partition_key = false;
}
void evictable_reader_v2::validate_position_in_partition(position_in_partition_view pos) const {
@@ -1860,7 +1839,12 @@ void evictable_reader_v2::validate_position_in_partition(position_in_partition_v
const bool any_contains = std::any_of(ranges.begin(), ranges.end(), [this, &pos] (const query::clustering_range& cr) {
// TODO: somehow avoid this copy
auto range = position_range(cr);
return range.contains(*_schema, pos);
// We cannot use range.contains() because that treats range as a
// [a, b) range, meaning a range tombstone change with position
// after_key(b) will be considered outside of it. Such range
// tombstone changes can be emitted however when recreating the
// reader on clustering range edge.
return _tri_cmp(range.start(), pos) <= 0 && _tri_cmp(pos, range.end()) <= 0;
});
require(
any_contains,
@@ -1871,42 +1855,40 @@ void evictable_reader_v2::validate_position_in_partition(position_in_partition_v
}
}
bool evictable_reader_v2::should_drop_fragment(const mutation_fragment_v2& mf) {
if (_drop_partition_start && mf.is_partition_start()) {
_drop_partition_start = false;
return true;
void evictable_reader_v2::examine_first_fragments(mutation_fragment_v2_opt& mf1, mutation_fragment_v2_opt& mf2, mutation_fragment_v2_opt& mf3) {
if (!mf1) {
return; // the reader is at EOS
}
// Unlike partition-start above, a partition is not guaranteed to have a
// static row fragment. So reset the flag regardless of whether we could
// drop one or not.
// We are guaranteed to get here only right after dropping a partition-start,
// so if we are not seeing a static row here, the partition doesn't have one.
if (_drop_static_row) {
_drop_static_row = false;
return mf.is_static_row();
}
return false;
}
future<> evictable_reader_v2::do_fill_buffer() {
if (!_drop_partition_start && !_drop_static_row) {
auto fill_buf_fut = _reader->fill_buffer();
if (_validate_partition_key) {
fill_buf_fut = fill_buf_fut.then([this] {
maybe_validate_partition_start(_reader->buffer());
});
}
return fill_buf_fut;
// If engaged, the first fragment is always a partition-start.
validate_partition_start(mf1->as_partition_start());
if (_tri_cmp(mf1->position(), _next_position_in_partition) < 0) {
mf1 = {}; // drop mf1
}
const auto continue_same_partition = _next_position_in_partition.region() != partition_region::partition_start;
// If we have a first fragment, we are guaranteed to have a second one -- if nothing else, a partition-end.
if (mf2->is_end_of_partition()) {
return; // no further fragments, nothing to do
}
// We want to validate the position of the first non-dropped fragment.
// If mf2 is a static row and we need to drop it, this will be mf3.
if (mf2->is_static_row() && _tri_cmp(mf2->position(), _next_position_in_partition) < 0) {
mf2 = {}; // drop mf2
} else {
if (continue_same_partition) {
validate_position_in_partition(mf2->position());
}
return;
}
if (mf3->is_end_of_partition()) {
return; // no further fragments, nothing to do
} else if (continue_same_partition) {
validate_position_in_partition(mf3->position());
}
return repeat([this] {
return _reader->fill_buffer().then([this] {
maybe_validate_partition_start(_reader->buffer());
while (!_reader->is_buffer_empty() && should_drop_fragment(_reader->peek_buffer())) {
_reader->pop_mutation_fragment();
}
return stop_iteration(_reader->is_buffer_full() || _reader->is_end_of_stream());
});
});
}
evictable_reader_v2::evictable_reader_v2(
@@ -1935,10 +1917,62 @@ future<> evictable_reader_v2::fill_buffer() {
co_return;
}
_reader = co_await resume_or_create_reader();
co_await do_fill_buffer();
if (_reader_recreated) {
// Recreating the reader breaks snapshot isolation and creates all sorts
// of complications around the continuity of range tombstone changes,
// e.g. a range tombstone started by the previous reader object
// might not exist anymore with the new reader object.
// To avoid complications we reset the tombstone state on each reader
// recreation by emitting a null tombstone change, if we read at least
// one clustering fragment from the partition.
if (_next_position_in_partition.region() == partition_region::clustered
&& _tri_cmp(_next_position_in_partition, position_in_partition::before_all_clustered_rows()) > 0) {
push_mutation_fragment(*_schema, _permit, range_tombstone_change{position_in_partition_view::before_key(_next_position_in_partition), {}});
}
auto mf1 = co_await (*_reader)();
auto mf2 = co_await (*_reader)();
auto mf3 = co_await (*_reader)();
examine_first_fragments(mf1, mf2, mf3);
if (mf3) {
_reader->unpop_mutation_fragment(std::move(*mf3));
}
if (mf2) {
_reader->unpop_mutation_fragment(std::move(*mf2));
}
if (mf1) {
_reader->unpop_mutation_fragment(std::move(*mf1));
}
_reader_recreated = false;
} else {
co_await _reader->fill_buffer();
}
_reader->move_buffer_content_to(*this);
// Ensure that each buffer represents forward progress. Only a concern when
// the last fragment in the buffer is range tombstone change. In this case
// ensure that:
// * buffer().back().position() > _next_position_in_partition;
// * _reader.peek()->position() > buffer().back().position();
if (!is_buffer_empty() && buffer().back().is_range_tombstone_change()) {
auto* next_mf = co_await _reader->peek();
// First make sure we've made progress w.r.t. _next_position_in_partition.
while (next_mf && _tri_cmp(_next_position_in_partition, buffer().back().position()) <= 0) {
push_mutation_fragment(_reader->pop_mutation_fragment());
next_mf = co_await _reader->peek();
}
const auto last_pos = position_in_partition(buffer().back().position());
while (next_mf && _tri_cmp(last_pos, next_mf->position()) == 0) {
push_mutation_fragment(_reader->pop_mutation_fragment());
next_mf = co_await _reader->peek();
}
}
update_next_position();
_end_of_stream = _reader->is_end_of_stream() && _reader->is_buffer_empty();
_end_of_stream = _reader->is_end_of_stream();
maybe_pause(std::move(*_reader));
}

View File

@@ -292,14 +292,23 @@ class partition_snapshot_flat_reader : public flat_mutation_reader::impl, public
const std::optional<position_in_partition>& last_row,
const std::optional<position_in_partition>& last_rts,
position_in_partition_view pos) {
if (!_rt_stream.empty()) {
return _rt_stream.get_next(std::move(pos));
}
return in_alloc_section([&] () -> mutation_fragment_opt {
maybe_refresh_state(ck_range_snapshot, last_row, last_rts);
position_in_partition::less_compare rt_less(_query_schema);
// The while below moves range tombstones from partition versions
// into _rt_stream, just enough to produce the next range tombstone
// The main goal behind moving to _rt_stream is to deoverlap range tombstones
// which have the same starting position. This is not done to satisfy
// flat_mutation_reader stream requirements; the reader can emit range tombstones
// which have the same position incrementally. It is to guarantee forward
// progress in case iterators get invalidated and maybe_refresh_state()
// above needs to restore them. It does so using last_rts, which tracks
// the position of the last emitted range tombstone. All range tombstones
// with positions <= last_rts are skipped on refresh. To make progress,
// we need to make sure that all range tombstones with duplicated positions
// are emitted before maybe_refresh_state().
while (has_more_range_tombstones()
&& !rt_less(pos, peek_range_tombstone().position())
&& (_rt_stream.empty() || !rt_less(_rt_stream.peek_next().position(), peek_range_tombstone().position()))) {

View File

@@ -444,7 +444,7 @@ public:
// When throws, the cursor is invalidated and its position is not changed.
bool advance_to(position_in_partition_view lower_bound) {
maybe_advance_to(lower_bound);
return no_clustering_row_between(_schema, lower_bound, position());
return no_clustering_row_between_weak(_schema, lower_bound, position());
}
// Call only when valid.

View File

@@ -567,6 +567,20 @@ bool no_clustering_row_between(const schema& s, position_in_partition_view a, po
}
}
// Returns true if and only if there can't be any clustering_row with position >= a and < b.
// It is assumed that a <= b.
inline
bool no_clustering_row_between_weak(const schema& s, position_in_partition_view a, position_in_partition_view b) {
clustering_key_prefix::equality eq(s);
if (a.has_key() && b.has_key()) {
return eq(a.key(), b.key())
&& (a.get_bound_weight() == bound_weight::after_all_prefixed
|| b.get_bound_weight() != bound_weight::after_all_prefixed);
} else {
return !a.has_key() && !b.has_key();
}
}
// Includes all position_in_partition objects "p" for which: start <= p < end
// And only those.
class position_range {

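Worked examples for no_clustering_row_between_weak() above, assuming a <= b and a single clustering key k (this only restates the truth table implied by the code):

  * a and b both name k and a is an after_all_prefixed bound: true, since a already lies past every row prefixed by k.
  * a and b both name k and neither is an after_all_prefixed bound (say, a is the start bound of k and b is the row position of k): true.
  * a and b both name k, a is not an after_all_prefixed bound but b is: false, since the clustering row k itself falls in [a, b).
  * the keys differ, or exactly one side carries a key: false; only when neither side carries a key (for example, two partition-level bounds) does the function return true.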

@@ -10,6 +10,7 @@
#include "seastarx.hh"
#include <seastar/core/future.hh>
#include <seastar/net/socket_defs.hh>
#include <vector>
// Abstraction for a server serving some kind of user-facing protocol.


@@ -414,25 +414,6 @@ future<bool> querier_cache::evict_one() noexcept {
co_return false;
}
future<> querier_cache::evict_all_for_table(const utils::UUID& schema_id) noexcept {
for (auto ip : {&_data_querier_index, &_mutation_querier_index, &_shard_mutation_querier_index}) {
auto& idx = *ip;
for (auto it = idx.begin(); it != idx.end();) {
if (it->second->schema().id() == schema_id) {
auto reader_opt = it->second->permit().semaphore().unregister_inactive_read(querier_utils::get_inactive_read_handle(*it->second));
it = idx.erase(it);
--_stats.population;
if (reader_opt) {
co_await reader_opt->close();
}
} else {
++it;
}
}
}
co_return;
}
future<> querier_cache::stop() noexcept {
co_await _closing_gate.close();


@@ -476,11 +476,6 @@ public:
/// is empty).
future<bool> evict_one() noexcept;
/// Evict all queriers that belong to a table.
///
/// Should be used when dropping a table.
future<> evict_all_for_table(const utils::UUID& schema_id) noexcept;
/// Close all queriers and wait on background work.
///
/// Should be used before destroying the querier_cache.


@@ -9,6 +9,7 @@
#include <boost/range/adaptor/reversed.hpp>
#include "range_tombstone_list.hh"
#include "utils/allocation_strategy.hh"
#include "utils/amortized_reserve.hh"
#include <seastar/util/variant_utils.hh>
range_tombstone_list::range_tombstone_list(const range_tombstone_list& x)
@@ -96,7 +97,7 @@ void range_tombstone_list::insert_from(const schema& s,
if (cmp(end, it->position()) < 0) {
// not overlapping
if (it->tombstone().tomb == tomb && cmp(end, it->position()) == 0) {
rev.update(it, {std::move(start), std::move(start), tomb});
rev.update(it, {std::move(start), std::move(end), tomb});
} else {
auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), tomb);
rev.insert(it, *rt);
@@ -375,13 +376,13 @@ range_tombstone_list::reverter::insert(range_tombstones_type::iterator it, range
range_tombstone_list::range_tombstones_type::iterator
range_tombstone_list::reverter::erase(range_tombstones_type::iterator it) {
_ops.reserve(_ops.size() + 1);
amortized_reserve(_ops, _ops.size() + 1);
_ops.emplace_back(erase_undo_op(*it));
return _dst._tombstones.erase(it);
}
void range_tombstone_list::reverter::update(range_tombstones_type::iterator it, range_tombstone&& new_rt) {
_ops.reserve(_ops.size() + 1);
amortized_reserve(_ops, _ops.size() + 1);
swap(it->tombstone(), new_rt);
_ops.emplace_back(update_undo_op(std::move(new_rt), *it));
}
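The two hunks above replace an exact _ops.reserve(_ops.size() + 1) before every undo-op append with amortized_reserve(). Reserving exactly one more element each time can force a reallocation per append and make N appends cost O(N^2) copying; an amortized reserve grows capacity geometrically instead. A minimal sketch of what such a helper can look like (hypothetical stand-in; the real utils/amortized_reserve.hh may differ):

    #include <algorithm>
    #include <cstddef>

    // Grow geometrically rather than to the exact requested size, so repeated
    // single-element growth keeps appends amortized O(1).
    template <typename Container>
    void amortized_reserve(Container& c, std::size_t n) {
        if (n <= c.capacity()) {
            return;  // already enough room
        }
        c.reserve(std::max<std::size_t>(n, c.capacity() * 2));
    }

With this shape, the reverter's _ops container roughly doubles its capacity whenever it runs out of room instead of reallocating on every push.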


@@ -12,6 +12,7 @@
#include "range_tombstone.hh"
#include "query-request.hh"
#include "utils/preempt.hh"
#include "utils/chunked_vector.hh"
#include <iosfwd>
#include <variant>
@@ -106,7 +107,7 @@ class range_tombstone_list final {
class reverter {
private:
using op = std::variant<erase_undo_op, insert_undo_op, update_undo_op>;
std::vector<op> _ops;
utils::chunked_vector<op> _ops;
const schema& _s;
protected:
range_tombstone_list& _dst;


@@ -743,6 +743,25 @@ void reader_concurrency_semaphore::clear_inactive_reads() {
}
}
future<> reader_concurrency_semaphore::evict_inactive_reads_for_table(utils::UUID id) noexcept {
inactive_reads_type evicted_readers;
auto it = _inactive_reads.begin();
while (it != _inactive_reads.end()) {
auto& ir = *it;
++it;
if (ir.reader.schema()->id() == id) {
do_detach_inactive_reader(ir, evict_reason::manual);
ir.ttl_timer.cancel();
ir.unlink();
evicted_readers.push_back(ir);
}
}
while (!evicted_readers.empty()) {
std::unique_ptr<inactive_read> irp(&evicted_readers.front());
co_await irp->reader.close();
}
}
std::runtime_error reader_concurrency_semaphore::stopped_exception() {
return std::runtime_error(format("{} was stopped", _name));
}
@@ -765,11 +784,9 @@ future<> reader_concurrency_semaphore::stop() noexcept {
co_return;
}
flat_mutation_reader_v2 reader_concurrency_semaphore::detach_inactive_reader(inactive_read& ir, evict_reason reason) noexcept {
auto reader = std::move(ir.reader);
void reader_concurrency_semaphore::do_detach_inactive_reader(inactive_read& ir, evict_reason reason) noexcept {
ir.detach();
reader.permit()._impl->on_evicted();
std::unique_ptr<inactive_read> irp(&ir);
ir.reader.permit()._impl->on_evicted();
try {
if (ir.notify_handler) {
ir.notify_handler(reason);
@@ -788,7 +805,12 @@ flat_mutation_reader_v2 reader_concurrency_semaphore::detach_inactive_reader(ina
break;
}
--_stats.inactive_reads;
return reader;
}
flat_mutation_reader_v2 reader_concurrency_semaphore::detach_inactive_reader(inactive_read& ir, evict_reason reason) noexcept {
std::unique_ptr<inactive_read> irp(&ir);
do_detach_inactive_reader(ir, reason);
return std::move(irp->reader);
}
void reader_concurrency_semaphore::evict(inactive_read& ir, evict_reason reason) noexcept {

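The new evict_inactive_reads_for_table() above works in two phases: it first unlinks every matching entry from the semaphore's inactive-read list in one pass with no suspension points, and only then closes the detached readers one by one, where each close may suspend. A minimal non-coroutine sketch of that shape with standard containers (illustrative only; the real code uses intrusive lists and seastar futures):

    #include <iostream>
    #include <iterator>
    #include <list>
    #include <string>

    struct inactive_read {
        std::string table;
    };

    int main() {
        std::list<inactive_read> inactive{{"t1"}, {"t2"}, {"t1"}};
        std::list<inactive_read> evicted;

        // Phase 1: move matching entries out in a single pass, so later
        // suspension points can never observe the list mid-iteration.
        for (auto it = inactive.begin(); it != inactive.end();) {
            auto next = std::next(it);
            if (it->table == "t1") {
                evicted.splice(evicted.end(), inactive, it);
            }
            it = next;
        }

        // Phase 2: "close" each detached reader; in the coroutine this is
        // where co_await reader.close() happens, outside the main-list walk.
        for (const auto& ir : evicted) {
            std::cout << "closing reader for table " << ir.table << '\n';
        }
    }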

@@ -186,6 +186,7 @@ private:
std::optional<future<>> _execution_loop_future;
private:
void do_detach_inactive_reader(inactive_read&, evict_reason reason) noexcept;
[[nodiscard]] flat_mutation_reader_v2 detach_inactive_reader(inactive_read&, evict_reason reason) noexcept;
void evict(inactive_read&, evict_reason reason) noexcept;
@@ -301,6 +302,9 @@ public:
/// Clear all inactive reads.
void clear_inactive_reads();
/// Evict all inactive reads that belong to the table designated by the id.
future<> evict_inactive_reads_for_table(utils::UUID id) noexcept;
private:
// The following two functions are extension points for
// future inheriting classes that need to run some stop


@@ -25,6 +25,7 @@
#include "utils/bit_cast.hh"
#include "service/migration_manager.hh"
#include "partition_range_compat.hh"
#include "gms/feature_service.hh"
#include <boost/algorithm/string/predicate.hpp>
#include <boost/algorithm/string/split.hpp>
@@ -41,6 +42,7 @@
#include <seastar/core/sleep.hh>
#include <cfloat>
#include <algorithm>
#include "idl/partition_checksum.dist.hh"
@@ -118,6 +120,13 @@ std::ostream& operator<<(std::ostream& out, row_level_diff_detect_algorithm algo
return out << "unknown";
}
static size_t get_nr_tables(const replica::database& db, const sstring& keyspace) {
auto& m = db.get_column_families_mapping();
return std::count_if(m.begin(), m.end(), [&keyspace] (auto& e) {
return e.first.first == keyspace;
});
}
static std::vector<sstring> list_column_families(const replica::database& db, const sstring& keyspace) {
std::vector<sstring> ret;
for (auto &&e : db.get_column_families_mapping()) {
@@ -443,7 +452,7 @@ float tracker::report_progress(streaming::stream_reason reason) {
for (auto& x : _repairs) {
auto& ri = x.second;
if (ri->reason == reason) {
nr_ranges_total += ri->nr_ranges_total;
nr_ranges_total += ri->ranges_size();
nr_ranges_finished += ri->nr_ranges_finished;
}
}
@@ -555,8 +564,8 @@ void repair_info::check_failed_ranges() {
rlogger.info("repair id {} on shard {} stats: repair_reason={}, keyspace={}, tables={}, ranges_nr={}, {}",
id, shard, reason, keyspace, table_names(), ranges.size(), _stats.get_stats());
if (nr_failed_ranges) {
rlogger.warn("repair id {} on shard {} failed - {} out of {} ranges failed", id, shard, nr_failed_ranges, ranges.size());
throw std::runtime_error(format("repair id {} on shard {} failed to repair {} out of {} ranges", id, shard, nr_failed_ranges, ranges.size()));
rlogger.warn("repair id {} on shard {} failed - {} out of {} ranges failed", id, shard, nr_failed_ranges, ranges_size());
throw std::runtime_error(format("repair id {} on shard {} failed to repair {} out of {} ranges", id, shard, nr_failed_ranges, ranges_size()));
} else {
if (dropped_tables.size()) {
rlogger.warn("repair id {} on shard {} completed successfully, keyspace={}, ignoring dropped tables={}", id, shard, keyspace, dropped_tables);
@@ -582,14 +591,18 @@ repair_neighbors repair_info::get_repair_neighbors(const dht::token_range& range
neighbors[range];
}
size_t repair_info::ranges_size() {
return ranges.size() * table_ids.size();
}
// Repair a single local range, multiple column families.
// Comparable to RepairSession in Origin
future<> repair_info::repair_range(const dht::token_range& range) {
future<> repair_info::repair_range(const dht::token_range& range, utils::UUID table_id) {
check_in_shutdown();
check_in_abort();
ranges_index++;
repair_neighbors neighbors = get_repair_neighbors(range);
return do_with(std::move(neighbors.all), std::move(neighbors.mandatory), [this, range] (auto& neighbors, auto& mandatory_neighbors) {
return do_with(std::move(neighbors.all), std::move(neighbors.mandatory), [this, range, table_id] (auto& neighbors, auto& mandatory_neighbors) {
auto live_neighbors = boost::copy_range<std::vector<gms::inet_address>>(neighbors |
boost::adaptors::filtered([this] (const gms::inet_address& node) { return gossiper.is_alive(node); }));
for (auto& node : mandatory_neighbors) {
@@ -598,7 +611,7 @@ future<> repair_info::repair_range(const dht::token_range& range) {
nr_failed_ranges++;
auto status = format("failed: mandatory neighbor={} is not alive", node);
rlogger.error("Repair {} out of {} ranges, id={}, shard={}, keyspace={}, table={}, range={}, peers={}, live_peers={}, status={}",
ranges_index, ranges.size(), id, shard, keyspace, table_names(), range, neighbors, live_neighbors, status);
ranges_index, ranges_size(), id, shard, keyspace, table_names(), range, neighbors, live_neighbors, status);
abort();
return make_exception_future<>(std::runtime_error(format("Repair mandatory neighbor={} is not alive, keyspace={}, mandatory_neighbors={}",
node, keyspace, mandatory_neighbors)));
@@ -608,7 +621,7 @@ future<> repair_info::repair_range(const dht::token_range& range) {
nr_failed_ranges++;
auto status = live_neighbors.empty() ? "skipped" : "partial";
rlogger.warn("Repair {} out of {} ranges, id={}, shard={}, keyspace={}, table={}, range={}, peers={}, live_peers={}, status={}",
ranges_index, ranges.size(), id, shard, keyspace, table_names(), range, neighbors, live_neighbors, status);
ranges_index, ranges_size(), id, shard, keyspace, table_names(), range, neighbors, live_neighbors, status);
if (live_neighbors.empty()) {
return make_ready_future<>();
}
@@ -617,13 +630,12 @@ future<> repair_info::repair_range(const dht::token_range& range) {
if (neighbors.empty()) {
auto status = "skipped_no_followers";
rlogger.warn("Repair {} out of {} ranges, id={}, shard={}, keyspace={}, table={}, range={}, peers={}, live_peers={}, status={}",
ranges_index, ranges.size(), id, shard, keyspace, table_names(), range, neighbors, live_neighbors, status);
ranges_index, ranges_size(), id, shard, keyspace, table_names(), range, neighbors, live_neighbors, status);
return make_ready_future<>();
}
rlogger.info("Repair {} out of {} ranges, id={}, shard={}, keyspace={}, table={}, range={}, peers={}, live_peers={}",
ranges_index, ranges.size(), id, shard, keyspace, table_names(), range, neighbors, live_neighbors);
return mm.sync_schema(db.local(), neighbors).then([this, &neighbors, range] {
return do_for_each(table_ids.begin(), table_ids.end(), [this, &neighbors, range] (utils::UUID table_id) {
ranges_index, ranges_size(), id, shard, keyspace, table_names(), range, neighbors, live_neighbors);
return mm.sync_schema(db.local(), neighbors).then([this, &neighbors, range, table_id] {
sstring cf;
try {
cf = db.local().find_column_family(table_id).schema()->cf_name();
@@ -641,7 +653,6 @@ future<> repair_info::repair_range(const dht::token_range& range) {
nr_failed_ranges++;
return make_exception_future<>(std::move(ep));
});
});
});
});
}
@@ -914,27 +925,55 @@ private:
static future<> do_repair_ranges(lw_shared_ptr<repair_info> ri) {
// repair all the ranges in limited parallelism
return parallel_for_each(ri->ranges, [ri] (auto&& range) {
return with_semaphore(ri->rs.repair_tracker().range_parallelism_semaphore(), 1, [ri, &range] {
return ri->repair_range(range).then([ri] {
if (ri->reason == streaming::stream_reason::bootstrap) {
ri->rs.get_metrics().bootstrap_finished_ranges++;
} else if (ri->reason == streaming::stream_reason::replace) {
ri->rs.get_metrics().replace_finished_ranges++;
} else if (ri->reason == streaming::stream_reason::rebuild) {
ri->rs.get_metrics().rebuild_finished_ranges++;
} else if (ri->reason == streaming::stream_reason::decommission) {
ri->rs.get_metrics().decommission_finished_ranges++;
} else if (ri->reason == streaming::stream_reason::removenode) {
ri->rs.get_metrics().removenode_finished_ranges++;
} else if (ri->reason == streaming::stream_reason::repair) {
ri->rs.get_metrics().repair_finished_ranges_sum++;
ri->nr_ranges_finished++;
}
// Repair tables in the keyspace one after another
assert(ri->table_names().size() == ri->table_ids.size());
for (int idx = 0; idx < ri->table_ids.size(); idx++) {
auto table_id = ri->table_ids[idx];
auto table_name = ri->table_names()[idx];
// repair all the ranges in limited parallelism
rlogger.info("repair[{}]: Started to repair {} out of {} tables in keyspace={}, table={}, table_id={}, repair_reason={}",
ri->id.uuid, idx + 1, ri->table_ids.size(), ri->keyspace, table_name, table_id, ri->reason);
co_await parallel_for_each(ri->ranges, [ri, table_id] (auto&& range) {
return with_semaphore(ri->rs.repair_tracker().range_parallelism_semaphore(), 1, [ri, &range, table_id] {
return ri->repair_range(range, table_id).then([ri] {
if (ri->reason == streaming::stream_reason::bootstrap) {
ri->rs.get_metrics().bootstrap_finished_ranges++;
} else if (ri->reason == streaming::stream_reason::replace) {
ri->rs.get_metrics().replace_finished_ranges++;
} else if (ri->reason == streaming::stream_reason::rebuild) {
ri->rs.get_metrics().rebuild_finished_ranges++;
} else if (ri->reason == streaming::stream_reason::decommission) {
ri->rs.get_metrics().decommission_finished_ranges++;
} else if (ri->reason == streaming::stream_reason::removenode) {
ri->rs.get_metrics().removenode_finished_ranges++;
} else if (ri->reason == streaming::stream_reason::repair) {
ri->rs.get_metrics().repair_finished_ranges_sum++;
ri->nr_ranges_finished++;
}
rlogger.debug("repair[{}]: node ops progress bootstrap={}, replace={}, rebuild={}, decommission={}, removenode={}, repair={}",
ri->id.uuid,
ri->rs.get_metrics().bootstrap_finished_percentage(),
ri->rs.get_metrics().replace_finished_percentage(),
ri->rs.get_metrics().rebuild_finished_percentage(),
ri->rs.get_metrics().decommission_finished_percentage(),
ri->rs.get_metrics().removenode_finished_percentage(),
ri->rs.get_metrics().repair_finished_percentage());
});
});
});
});
if (ri->reason != streaming::stream_reason::repair) {
try {
auto& table = ri->db.local().find_column_family(table_id);
rlogger.debug("repair[{}]: Trigger off-strategy compaction for keyspace={}, table={}",
ri->id.uuid, table.schema()->ks_name(), table.schema()->cf_name());
table.trigger_offstrategy_compaction();
} catch (replica::no_such_column_family&) {
// Ignore dropped table
}
}
}
co_return;
}
// repair_ranges repairs a list of token ranges, each assumed to be a token
@@ -1060,33 +1099,48 @@ int repair_service::do_repair_start(sstring keyspace, std::unordered_map<sstring
cfs = std::move(cfs), ranges = std::move(ranges), options = std::move(options), ignore_nodes = std::move(ignore_nodes)] () mutable {
auto uuid = id.uuid;
auto waiting_nodes = db.local().get_token_metadata().get_all_endpoints();
std::erase_if(waiting_nodes, [&] (const auto& addr) {
return ignore_nodes.contains(addr);
});
auto participants = get_hosts_participating_in_repair(db.local(), keyspace, ranges, options.data_centers, options.hosts, ignore_nodes).get();
auto hints_timeout = std::chrono::seconds(300);
auto batchlog_timeout = std::chrono::seconds(300);
repair_flush_hints_batchlog_request req{id.uuid, participants, hints_timeout, batchlog_timeout};
bool needs_flush_before_repair = false;
if (db.local().features().cluster_supports_tombstone_gc_options()) {
for (auto& table: cfs) {
auto s = db.local().find_column_family(keyspace, table).schema();
const auto& options = s->tombstone_gc_options();
if (options.mode() == tombstone_gc_mode::repair) {
needs_flush_before_repair = true;
}
}
}
bool hints_batchlog_flushed = false;
try {
parallel_for_each(waiting_nodes, [this, uuid, &req, &participants] (gms::inet_address node) -> future<> {
rlogger.info("repair[{}]: Sending repair_flush_hints_batchlog to node={}, participants={}, started",
uuid, node, participants);
try {
auto& ms = get_messaging();
auto resp = co_await ser::partition_checksum_rpc_verbs::send_repair_flush_hints_batchlog(&ms, netw::msg_addr(node), req);
} catch (...) {
rlogger.warn("repair[{}]: Sending repair_flush_hints_batchlog to node={}, participants={}, failed: {}",
uuid, node, participants, std::current_exception());
throw;
}
}).get();
hints_batchlog_flushed = true;
} catch (...) {
rlogger.warn("repair[{}]: Sending repair_flush_hints_batchlog to participants={} failed, continue to run repair",
uuid, participants);
auto participants = get_hosts_participating_in_repair(db.local(), keyspace, ranges, options.data_centers, options.hosts, ignore_nodes).get();
if (needs_flush_before_repair) {
auto waiting_nodes = db.local().get_token_metadata().get_all_endpoints();
std::erase_if(waiting_nodes, [&] (const auto& addr) {
return ignore_nodes.contains(addr);
});
auto hints_timeout = std::chrono::seconds(300);
auto batchlog_timeout = std::chrono::seconds(300);
repair_flush_hints_batchlog_request req{id.uuid, participants, hints_timeout, batchlog_timeout};
try {
parallel_for_each(waiting_nodes, [this, uuid, &req, &participants] (gms::inet_address node) -> future<> {
rlogger.info("repair[{}]: Sending repair_flush_hints_batchlog to node={}, participants={}, started",
uuid, node, participants);
try {
auto& ms = get_messaging();
auto resp = co_await ser::partition_checksum_rpc_verbs::send_repair_flush_hints_batchlog(&ms, netw::msg_addr(node), req);
} catch (...) {
rlogger.warn("repair[{}]: Sending repair_flush_hints_batchlog to node={}, participants={}, failed: {}",
uuid, node, participants, std::current_exception());
throw;
}
}).get();
hints_batchlog_flushed = true;
} catch (...) {
rlogger.warn("repair[{}]: Sending repair_flush_hints_batchlog to participants={} failed, continue to run repair",
uuid, participants);
}
} else {
rlogger.info("repair[{}]: Skipped sending repair_flush_hints_batchlog to nodes={}", uuid, participants);
}
std::vector<future<>> repair_results;
@@ -1288,7 +1342,8 @@ future<> repair_service::bootstrap_with_repair(locator::token_metadata_ptr tmptr
auto& strat = ks.get_replication_strategy();
dht::token_range_vector desired_ranges = strat.get_pending_address_ranges(tmptr, tokens, myip).get0();
seastar::thread::maybe_yield();
nr_ranges_total += desired_ranges.size();
auto nr_tables = get_nr_tables(db.local(), keyspace_name);
nr_ranges_total += desired_ranges.size() * nr_tables;
}
container().invoke_on_all([nr_ranges_total] (repair_service& rs) {
rs.get_metrics().bootstrap_finished_ranges = 0;
@@ -1320,7 +1375,8 @@ future<> repair_service::bootstrap_with_repair(locator::token_metadata_ptr tmptr
//Collects the source that will have its range moved to the new node
std::unordered_map<dht::token_range, repair_neighbors> range_sources;
rlogger.info("bootstrap_with_repair: started with keyspace={}, nr_ranges={}", keyspace_name, desired_ranges.size());
auto nr_tables = get_nr_tables(db.local(), keyspace_name);
rlogger.info("bootstrap_with_repair: started with keyspace={}, nr_ranges={}", keyspace_name, desired_ranges.size() * nr_tables);
for (auto& desired_range : desired_ranges) {
for (auto& x : range_addresses) {
const range<dht::token>& src_range = x.first;
@@ -1461,7 +1517,8 @@ future<> repair_service::do_decommission_removenode_with_repair(locator::token_m
}
auto& ks = db.local().find_keyspace(keyspace_name);
dht::token_range_vector ranges = ks.get_effective_replication_map()->get_ranges(leaving_node);
nr_ranges_total += ranges.size();
auto nr_tables = get_nr_tables(db.local(), keyspace_name);
nr_ranges_total += ranges.size() * nr_tables;
}
if (reason == streaming::stream_reason::decommission) {
container().invoke_on_all([nr_ranges_total] (repair_service& rs) {
@@ -1485,8 +1542,9 @@ future<> repair_service::do_decommission_removenode_with_repair(locator::token_m
auto erm = ks.get_effective_replication_map();
// First get all ranges the leaving node is responsible for
dht::token_range_vector ranges = erm->get_ranges(leaving_node);
rlogger.info("{}: started with keyspace={}, leaving_node={}, nr_ranges={}", op, keyspace_name, leaving_node, ranges.size());
size_t nr_ranges_total = ranges.size();
auto nr_tables = get_nr_tables(db.local(), keyspace_name);
rlogger.info("{}: started with keyspace={}, leaving_node={}, nr_ranges={}", op, keyspace_name, leaving_node, ranges.size() * nr_tables);
size_t nr_ranges_total = ranges.size() * nr_tables;
size_t nr_ranges_skipped = 0;
std::unordered_map<dht::token_range, inet_address_vector_replica_set> current_replica_endpoints;
// Find (for each range) all nodes that store replicas for these ranges as well
@@ -1677,7 +1735,8 @@ future<> repair_service::do_rebuild_replace_with_repair(locator::token_metadata_
auto& strat = ks.get_replication_strategy();
// Okay to yield since tm is immutable
dht::token_range_vector ranges = strat.get_ranges(myip, tmptr).get0();
nr_ranges_total += ranges.size();
auto nr_tables = get_nr_tables(db.local(), keyspace_name);
nr_ranges_total += ranges.size() * nr_tables;
}
if (reason == streaming::stream_reason::rebuild) {
@@ -1702,7 +1761,8 @@ future<> repair_service::do_rebuild_replace_with_repair(locator::token_metadata_
auto& strat = ks.get_replication_strategy();
dht::token_range_vector ranges = strat.get_ranges(myip, tmptr).get0();
std::unordered_map<dht::token_range, repair_neighbors> range_sources;
rlogger.info("{}: started with keyspace={}, source_dc={}, nr_ranges={}, ignore_nodes={}", op, keyspace_name, source_dc, ranges.size(), ignore_nodes);
auto nr_tables = get_nr_tables(db.local(), keyspace_name);
rlogger.info("{}: started with keyspace={}, source_dc={}, nr_ranges={}, ignore_nodes={}", op, keyspace_name, source_dc, ranges.size() * nr_tables, ignore_nodes);
for (auto it = ranges.begin(); it != ranges.end();) {
auto& r = *it;
seastar::thread::maybe_yield();
@@ -1730,12 +1790,12 @@ future<> repair_service::do_rebuild_replace_with_repair(locator::token_metadata_
}
}
if (reason == streaming::stream_reason::rebuild) {
container().invoke_on_all([nr_ranges_skipped] (repair_service& rs) {
rs.get_metrics().rebuild_finished_ranges += nr_ranges_skipped;
container().invoke_on_all([nr_ranges_skipped, nr_tables] (repair_service& rs) {
rs.get_metrics().rebuild_finished_ranges += nr_ranges_skipped * nr_tables;
}).get();
} else if (reason == streaming::stream_reason::replace) {
container().invoke_on_all([nr_ranges_skipped] (repair_service& rs) {
rs.get_metrics().replace_finished_ranges += nr_ranges_skipped;
container().invoke_on_all([nr_ranges_skipped, nr_tables] (repair_service& rs) {
rs.get_metrics().replace_finished_ranges += nr_ranges_skipped * nr_tables;
}).get();
}
auto nr_ranges = ranges.size();

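Throughout this file, repair progress accounting switches from counting token ranges to counting (range, table) pairs: get_nr_tables() counts the tables in a keyspace, ranges_size() returns ranges.size() * table_ids.size(), and repair_range() now takes the table it repairs. As a worked example, a node that owns 3 ranges in a keyspace with 4 tables now reports nr_ranges_total = 3 * 4 = 12, and each completed (range, table) pair bumps the corresponding finished-ranges counter by one, so the percentages reported by report_progress() line up with the per-table loop in do_repair_ranges().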

@@ -200,7 +200,9 @@ public:
return _hints_batchlog_flushed;
}
future<> repair_range(const dht::token_range& range);
future<> repair_range(const dht::token_range& range, utils::UUID table_id);
size_t ranges_size();
};
// The repair_tracker tracks ongoing repair operations and their progress.

View File

@@ -347,9 +347,9 @@ private:
// Only needed for local readers, the multishard reader takes care
// of pinning tables on used shards.
std::optional<utils::phased_barrier::operation> _local_read_op;
std::optional<evictable_reader_handle> _reader_handle;
// Local reader or multishard reader to read the range
flat_mutation_reader _reader;
std::optional<evictable_reader_handle> _reader_handle;
// Current partition read from disk
lw_shared_ptr<const decorated_key_with_hash> _current_dk;
uint64_t _reads_issued = 0;


@@ -67,6 +67,7 @@ public:
uint64_t repair_finished_ranges_sum{0};
private:
seastar::metrics::metric_groups _metrics;
public:
float bootstrap_finished_percentage();
float replace_finished_percentage();
float rebuild_finished_percentage();


@@ -910,10 +910,9 @@ bool database::update_column_family(schema_ptr new_schema) {
return columns_changed;
}
future<> database::remove(const column_family& cf) noexcept {
void database::remove(const table& cf) noexcept {
auto s = cf.schema();
auto& ks = find_keyspace(s->ks_name());
co_await _querier_cache.evict_all_for_table(s->id());
_column_families.erase(s->id());
ks.metadata()->remove_column_family(s);
_ks_cf_to_uuid.erase(std::make_pair(s->ks_name(), s->cf_name()));
@@ -937,13 +936,22 @@ future<> database::drop_column_family(const sstring& ks_name, const sstring& cf_
on_internal_error(dblog, fmt::format("drop_column_family {}.{}: UUID={} not found", ks_name, cf_name, uuid));
}
dblog.debug("Dropping {}.{}", ks_name, cf_name);
co_await remove(*cf);
remove(*cf);
cf->clear_views();
co_return co_await cf->await_pending_ops().then([this, &ks, cf, tsf = std::move(tsf), snapshot] {
return truncate(ks, *cf, std::move(tsf), snapshot).finally([this, cf] {
return cf->stop();
});
}).finally([cf] {});
co_await cf->await_pending_ops();
for (auto* sem : {&_read_concurrency_sem, &_streaming_concurrency_sem, &_compaction_concurrency_sem, &_system_read_concurrency_sem}) {
co_await sem->evict_inactive_reads_for_table(uuid);
}
std::exception_ptr ex;
try {
co_await truncate(ks, *cf, std::move(tsf), snapshot);
} catch (...) {
ex = std::current_exception();
}
co_await cf->stop();
if (ex) {
std::rethrow_exception(std::move(ex));
}
}
const utils::UUID& database::find_uuid(std::string_view ks, std::string_view cf) const {
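The drop_column_family() change above uses a capture-then-rethrow shape: the failure of truncate() is stashed in an exception_ptr, cf->stop() always runs, and only then is the original error propagated. A minimal plain-C++ sketch of that shape (names are illustrative; the coroutine version needs the same structure because the cleanup is itself a co_await and cannot sit inside a catch handler):

    #include <exception>
    #include <iostream>
    #include <stdexcept>

    void truncate_step() {   // stands in for truncate(ks, *cf, ...)
        throw std::runtime_error("truncate failed");
    }

    void stop_step() {       // stands in for cf->stop(): must always run
        std::cout << "table stopped\n";
    }

    void drop_like_flow() {
        std::exception_ptr ex;
        try {
            truncate_step();
        } catch (...) {
            ex = std::current_exception();  // remember the failure
        }
        stop_step();                        // cleanup runs regardless
        if (ex) {
            std::rethrow_exception(ex);     // propagate the original error last
        }
    }

    int main() {
        try {
            drop_like_flow();
        } catch (const std::exception& e) {
            std::cout << "propagated: " << e.what() << '\n';
        }
    }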
@@ -2054,6 +2062,53 @@ future<> database::flush(const sstring& ksname, const sstring& cfname) {
return cf.flush();
}
future<> database::flush_on_all(utils::UUID id) {
return container().invoke_on_all([id] (replica::database& db) {
return db.find_column_family(id).flush();
});
}
future<> database::flush_on_all(std::string_view ks_name, std::string_view table_name) {
return flush_on_all(find_uuid(ks_name, table_name));
}
future<> database::flush_on_all(std::string_view ks_name, std::vector<sstring> table_names) {
return parallel_for_each(table_names, [this, ks_name] (const auto& table_name) {
return flush_on_all(ks_name, table_name);
});
}
future<> database::flush_on_all(std::string_view ks_name) {
return parallel_for_each(find_keyspace(ks_name).metadata()->cf_meta_data(), [this] (auto& pair) {
return flush_on_all(pair.second->id());
});
}
future<> database::snapshot_on_all(std::string_view ks_name, std::vector<sstring> table_names, sstring tag, bool skip_flush) {
co_await parallel_for_each(table_names, [this, ks_name, tag = std::move(tag), skip_flush] (const auto& table_name) -> future<> {
if (!skip_flush) {
co_await flush_on_all(ks_name, table_name);
}
co_await container().invoke_on_all([ks_name, &table_name, tag, skip_flush] (replica::database& db) {
auto& t = db.find_column_family(ks_name, table_name);
return t.snapshot(db, tag);
});
});
}
future<> database::snapshot_on_all(std::string_view ks_name, sstring tag, bool skip_flush) {
auto& ks = find_keyspace(ks_name);
co_await parallel_for_each(ks.metadata()->cf_meta_data(), [this, tag = std::move(tag), skip_flush] (const auto& pair) -> future<> {
if (!skip_flush) {
co_await flush_on_all(pair.second->id());
}
co_await container().invoke_on_all([id = pair.second, tag, skip_flush] (replica::database& db) {
auto& t = db.find_column_family(id);
return t.snapshot(db, tag);
});
});
}
future<> database::truncate(sstring ksname, sstring cfname, timestamp_func tsf) {
auto& ks = find_keyspace(ksname);
auto& cf = find_column_family(ksname, cfname);
@@ -2062,80 +2117,77 @@ future<> database::truncate(sstring ksname, sstring cfname, timestamp_func tsf)
future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_func tsf, bool with_snapshot) {
dblog.debug("Truncating {}.{}", cf.schema()->ks_name(), cf.schema()->cf_name());
return with_gate(cf.async_gate(), [this, &ks, &cf, tsf = std::move(tsf), with_snapshot] () mutable -> future<> {
const auto auto_snapshot = with_snapshot && get_config().auto_snapshot();
const auto should_flush = auto_snapshot;
auto holder = cf.async_gate().hold();
// Force mutations coming in to re-acquire higher rp:s
// This creates a "soft" ordering, in that we will guarantee that
// any sstable written _after_ we issue the flush below will
// only have higher rp:s than we will get from the discard_sstable
// call.
auto low_mark = cf.set_low_replay_position_mark();
const auto auto_snapshot = with_snapshot && get_config().auto_snapshot();
const auto should_flush = auto_snapshot;
const auto uuid = cf.schema()->id();
// Force mutations coming in to re-acquire higher rp:s
// This creates a "soft" ordering, in that we will guarantee that
// any sstable written _after_ we issue the flush below will
// only have higher rp:s than we will get from the discard_sstable
// call.
auto low_mark = cf.set_low_replay_position_mark();
return _compaction_manager->run_with_compaction_disabled(&cf, [this, &cf, should_flush, auto_snapshot, tsf = std::move(tsf), low_mark]() mutable {
future<> f = make_ready_future<>();
bool did_flush = false;
if (should_flush && cf.can_flush()) {
// TODO:
// this is not really a guarantee at all that we've actually
// gotten all things to disk. Again, need queue-ish or something.
f = cf.flush();
did_flush = true;
} else {
f = cf.clear();
}
return f.then([this, &cf, auto_snapshot, tsf = std::move(tsf), low_mark, should_flush, did_flush] {
dblog.debug("Discarding sstable data for truncated CF + indexes");
// TODO: notify truncation
const auto uuid = cf.schema()->id();
return tsf().then([this, &cf, auto_snapshot, low_mark, should_flush, did_flush](db_clock::time_point truncated_at) {
future<> f = make_ready_future<>();
if (auto_snapshot) {
auto name = format("{:d}-{}", truncated_at.time_since_epoch().count(), cf.schema()->cf_name());
f = cf.snapshot(*this, name);
}
return f.then([this, &cf, truncated_at, low_mark, should_flush, did_flush] {
return cf.discard_sstables(truncated_at).then([this, &cf, truncated_at, low_mark, should_flush, did_flush](db::replay_position rp) {
// TODO: indexes.
// Note: since discard_sstables was changed to only count tables owned by this shard,
// we can get zero rp back. Changed assert, and ensure we save at least low_mark.
// #6995 - the assert below was broken in c2c6c71 and remained so for many years.
// We nowadays do not flush tables with sstables but autosnapshot=false. This means
// the low_mark assertion does not hold, because we maybe/probably never got around to
// creating the sstables that would create them.
assert(!did_flush || low_mark <= rp || rp == db::replay_position());
rp = std::max(low_mark, rp);
return truncate_views(cf, truncated_at, should_flush).then([&cf, truncated_at, rp] {
// save_truncation_record() may actually fail after we cached the truncation time
// but this is no worse than failing without caching: at least the correct time
// will be available until next reboot and a client will have to retry truncation anyway.
cf.cache_truncation_record(truncated_at);
return db::system_keyspace::save_truncation_record(cf, truncated_at, rp);
});
});
});
});
});
}).then([this, uuid] {
drop_repair_history_map_for_table(uuid);
});
});
}
std::vector<compaction_manager::compaction_reenabler> cres;
cres.reserve(1 + cf.views().size());
future<> database::truncate_views(const column_family& base, db_clock::time_point truncated_at, bool should_flush) {
return parallel_for_each(base.views(), [this, truncated_at, should_flush] (view_ptr v) {
cres.emplace_back(co_await _compaction_manager->stop_and_disable_compaction(&cf));
co_await parallel_for_each(cf.views(), [&, this] (view_ptr v) -> future<> {
auto& vcf = find_column_family(v);
return _compaction_manager->run_with_compaction_disabled(&vcf, [&vcf, truncated_at, should_flush] {
return (should_flush ? vcf.flush() : vcf.clear()).then([&vcf, truncated_at, should_flush] {
return vcf.discard_sstables(truncated_at).then([&vcf, truncated_at, should_flush](db::replay_position rp) {
return db::system_keyspace::save_truncation_record(vcf, truncated_at, rp);
});
});
});
cres.emplace_back(co_await _compaction_manager->stop_and_disable_compaction(&vcf));
});
bool did_flush = false;
if (should_flush && cf.can_flush()) {
// TODO:
// this is not really a guarantee at all that we've actually
// gotten all things to disk. Again, need queue-ish or something.
co_await cf.flush();
did_flush = true;
} else {
co_await cf.clear();
}
dblog.debug("Discarding sstable data for truncated CF + indexes");
// TODO: notify truncation
db_clock::time_point truncated_at = co_await tsf();
if (auto_snapshot) {
auto name = format("{:d}-{}", truncated_at.time_since_epoch().count(), cf.schema()->cf_name());
co_await cf.snapshot(*this, name);
}
db::replay_position rp = co_await cf.discard_sstables(truncated_at);
// TODO: indexes.
// Note: since discard_sstables was changed to only count tables owned by this shard,
// we can get zero rp back. Changed assert, and ensure we save at least low_mark.
// #6995 - the assert below was broken in c2c6c71 and remained so for many years.
// We nowadays do not flush tables with sstables but autosnapshot=false. This means
// the low_mark assertion does not hold, because we maybe/probably never got around to
// creating the sstables that would create them.
assert(!did_flush || low_mark <= rp || rp == db::replay_position());
rp = std::max(low_mark, rp);
co_await parallel_for_each(cf.views(), [this, truncated_at, should_flush] (view_ptr v) -> future<> {
auto& vcf = find_column_family(v);
if (should_flush) {
co_await vcf.flush();
} else {
co_await vcf.clear();
}
db::replay_position rp = co_await vcf.discard_sstables(truncated_at);
co_await db::system_keyspace::save_truncation_record(vcf, truncated_at, rp);
});
// save_truncation_record() may actually fail after we cached the truncation time
// but this is no worse than failing without caching: at least the correct time
// will be available until next reboot and a client will have to retry truncation anyway.
cf.cache_truncation_record(truncated_at);
co_await db::system_keyspace::save_truncation_record(cf, truncated_at, rp);
drop_repair_history_map_for_table(uuid);
}
const sstring& database::get_snitch_name() const {


@@ -839,7 +839,11 @@ public:
db::replay_position set_low_replay_position_mark();
future<> snapshot(database& db, sstring name, bool skip_flush = false);
private:
future<> snapshot(database& db, sstring name);
friend class database;
public:
future<std::unordered_map<sstring, snapshot_details>> get_snapshot_details();
/*!
@@ -1217,7 +1221,7 @@ struct string_pair_eq {
// local metadata reads
// use shard_of() for data
class database {
class database : public peering_sharded_service<database> {
friend class ::database_test;
public:
enum class table_kind {
@@ -1371,6 +1375,7 @@ private:
Future update_write_metrics(Future&& f);
void update_write_metrics_for_timed_out_write();
future<> create_keyspace(const lw_shared_ptr<keyspace_metadata>&, locator::effective_replication_map_factory& erm_factory, bool is_bootstrap, system_keyspace system);
void remove(const table&) noexcept;
public:
static utils::UUID empty_version;
@@ -1560,6 +1565,17 @@ public:
future<> flush_all_memtables();
future<> flush(const sstring& ks, const sstring& cf);
// flush a table identified by the given id on all shards.
future<> flush_on_all(utils::UUID id);
// flush a single table in a keyspace on all shards.
future<> flush_on_all(std::string_view ks_name, std::string_view table_name);
// flush a list of tables in a keyspace on all shards.
future<> flush_on_all(std::string_view ks_name, std::vector<sstring> table_names);
// flush all tables in a keyspace on all shards.
future<> flush_on_all(std::string_view ks_name);
future<> snapshot_on_all(std::string_view ks_name, std::vector<sstring> table_names, sstring tag, bool skip_flush);
future<> snapshot_on_all(std::string_view ks_name, sstring tag, bool skip_flush);
// See #937. Truncation now requires a callback to get a time stamp
// that must be guaranteed to be the same for all shards.
@@ -1568,11 +1584,9 @@ public:
/** Truncates the given column family */
future<> truncate(sstring ksname, sstring cfname, timestamp_func);
future<> truncate(const keyspace& ks, column_family& cf, timestamp_func, bool with_snapshot = true);
future<> truncate_views(const column_family& base, db_clock::time_point truncated_at, bool should_flush);
bool update_column_family(schema_ptr s);
future<> drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func, bool with_snapshot = true);
future<> remove(const column_family&) noexcept;
const logalloc::region_group& dirty_memory_region_group() const {
return _dirty_memory_manager.region_group();


@@ -454,12 +454,13 @@ future<> distributed_loader::handle_sstables_pending_delete(sstring pending_dele
});
}
future<> distributed_loader::populate_column_family(distributed<replica::database>& db, sstring sstdir, sstring ks, sstring cf, bool must_exist) {
return async([&db, sstdir = std::move(sstdir), ks = std::move(ks), cf = std::move(cf), must_exist] {
future<> distributed_loader::populate_column_family(distributed<replica::database>& db, sstring sstdir, sstring ks, sstring cf, allow_offstrategy_compaction do_allow_offstrategy_compaction, must_exist dir_must_exist) {
dblog.debug("Populating {}/{}/{} allow_offstrategy_compaction={} must_exist={}", ks, cf, sstdir, do_allow_offstrategy_compaction, dir_must_exist);
return async([&db, sstdir = std::move(sstdir), ks = std::move(ks), cf = std::move(cf), do_allow_offstrategy_compaction, dir_must_exist] {
assert(this_shard_id() == 0);
if (!file_exists(sstdir).get0()) {
if (must_exist) {
if (dir_must_exist) {
throw std::runtime_error(format("Populating {}/{} failed: {} does not exist", ks, cf, sstdir));
}
return;
@@ -529,12 +530,14 @@ future<> distributed_loader::populate_column_family(distributed<replica::databas
return global_table->make_sstable(sstdir, gen, sst_version, sstables::sstable::format_types::big);
}, eligible_for_reshape_on_boot).get();
directory.invoke_on_all([global_table, &eligible_for_reshape_on_boot] (sstables::sstable_directory& dir) {
return dir.do_for_each_sstable([&global_table, &eligible_for_reshape_on_boot] (sstables::shared_sstable sst) {
auto requires_offstrategy = sstables::offstrategy(!eligible_for_reshape_on_boot(sst));
directory.invoke_on_all([global_table, &eligible_for_reshape_on_boot, do_allow_offstrategy_compaction] (sstables::sstable_directory& dir) {
return dir.do_for_each_sstable([&global_table, &eligible_for_reshape_on_boot, do_allow_offstrategy_compaction] (sstables::shared_sstable sst) {
auto requires_offstrategy = sstables::offstrategy(do_allow_offstrategy_compaction && !eligible_for_reshape_on_boot(sst));
return global_table->add_sstable_and_update_cache(sst, requires_offstrategy);
}).then([&global_table] {
}).then([&global_table, do_allow_offstrategy_compaction] {
if (do_allow_offstrategy_compaction) {
global_table->trigger_offstrategy_compaction();
}
});
}).get();
});
@@ -560,11 +563,11 @@ future<> distributed_loader::populate_keyspace(distributed<replica::database>& d
auto sstdir = ks.column_family_directory(ksdir, cfname, uuid);
dblog.info("Keyspace {}: Reading CF {} id={} version={}", ks_name, cfname, uuid, s->version());
return ks.make_directory_for_column_family(cfname, uuid).then([&db, sstdir, uuid, ks_name, cfname] {
return distributed_loader::populate_column_family(db, sstdir + "/" + sstables::staging_dir, ks_name, cfname);
return distributed_loader::populate_column_family(db, sstdir + "/" + sstables::staging_dir, ks_name, cfname, allow_offstrategy_compaction::no);
}).then([&db, sstdir, ks_name, cfname] {
return distributed_loader::populate_column_family(db, sstdir + "/" + sstables::quarantine_dir, ks_name, cfname, false /* must_exist */);
return distributed_loader::populate_column_family(db, sstdir + "/" + sstables::quarantine_dir, ks_name, cfname, allow_offstrategy_compaction::no, must_exist::no);
}).then([&db, sstdir, uuid, ks_name, cfname] {
return distributed_loader::populate_column_family(db, sstdir, ks_name, cfname);
return distributed_loader::populate_column_family(db, sstdir, ks_name, cfname, allow_offstrategy_compaction::yes);
}).handle_exception([ks_name, cfname, sstdir](std::exception_ptr eptr) {
std::string msg =
format("Exception while populating keyspace '{}' with column family '{}' from file '{}': {}",


@@ -13,6 +13,7 @@
#include <seastar/core/distributed.hh>
#include <seastar/core/sstring.hh>
#include <seastar/core/file.hh>
#include <seastar/util/bool_class.hh>
#include <vector>
#include <functional>
#include <filesystem>
@@ -67,7 +68,9 @@ class distributed_loader {
static future<size_t> make_sstables_available(sstables::sstable_directory& dir,
sharded<replica::database>& db, sharded<db::view::view_update_generator>& view_update_generator,
std::filesystem::path datadir, sstring ks, sstring cf);
static future<> populate_column_family(distributed<replica::database>& db, sstring sstdir, sstring ks, sstring cf, bool must_exist = true);
using allow_offstrategy_compaction = bool_class<struct allow_offstrategy_compaction_tag>;
using must_exist = bool_class<struct must_exist_tag>;
static future<> populate_column_family(distributed<replica::database>& db, sstring sstdir, sstring ks, sstring cf, allow_offstrategy_compaction, must_exist = must_exist::yes);
static future<> populate_keyspace(distributed<replica::database>& db, sstring datadir, sstring ks_name);
static future<> cleanup_column_family_temp_sst_dirs(sstring sstdir);
static future<> handle_sstables_pending_delete(sstring pending_deletes_dir);

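populate_column_family() drops its trailing bool must_exist = true parameter in favour of two bool_class-style flag types, so call sites read allow_offstrategy_compaction::no / must_exist::yes and the two booleans can no longer be swapped silently. A minimal standard-C++ sketch of the idiom (a hypothetical stand-in for seastar's bool_class from <seastar/util/bool_class.hh>):

    #include <iostream>

    // Hypothetical stand-in for seastar::bool_class<Tag>: one distinct type per flag.
    template <typename Tag>
    struct flag {
        bool value;
        explicit operator bool() const { return value; }
        static const flag yes;
        static const flag no;
    };
    template <typename Tag> const flag<Tag> flag<Tag>::yes{true};
    template <typename Tag> const flag<Tag> flag<Tag>::no{false};

    using allow_offstrategy_compaction = flag<struct allow_offstrategy_compaction_tag>;
    using must_exist = flag<struct must_exist_tag>;

    void populate(allow_offstrategy_compaction compact, must_exist exists) {
        std::cout << "offstrategy=" << bool(compact) << " must_exist=" << bool(exists) << '\n';
    }

    int main() {
        // The type names the meaning at each call site; passing the flags in
        // the wrong order is a compile error rather than a silent bug.
        populate(allow_offstrategy_compaction::no, must_exist::yes);
    }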

@@ -9,6 +9,7 @@
#include <seastar/core/seastar.hh>
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include <seastar/coroutine/exception.hh>
#include <seastar/util/closeable.hh>
#include "replica/database.hh"
@@ -662,11 +663,21 @@ table::try_flush_memtable_to_sstable(lw_shared_ptr<memtable> old, sstable_write_
[] (const dht::decorated_key&) { return api::min_timestamp; });
}
mutation_fragment* fragment = co_await reader.peek();
if (!fragment) {
std::exception_ptr err;
try {
mutation_fragment* fragment = co_await reader.peek();
if (!fragment) {
co_await reader.close();
_memtables->erase(old);
co_return stop_iteration::yes;
}
} catch (...) {
err = std::current_exception();
}
if (err) {
tlogger.error("failed to flush memtable for {}.{}: {}", old->schema()->ks_name(), old->schema()->cf_name(), err);
co_await reader.close();
_memtables->erase(old);
co_return stop_iteration::yes;
co_return stop_iteration(_async_gate.is_closed());
}
auto f = consumer(upgrade_to_v2(std::move(reader)));
@@ -1426,70 +1437,86 @@ future<> table::write_schema_as_cql(database& db, sstring dir) const {
}
future<> table::snapshot(database& db, sstring name, bool skip_flush) {
future<> table::snapshot(database& db, sstring name) {
auto jsondir = _config.datadir + "/snapshots/" + name;
tlogger.debug("snapshot {}: skip_flush={}", jsondir, skip_flush);
auto f = skip_flush ? make_ready_future<>() : flush();
return f.then([this, &db, jsondir = std::move(jsondir)]() {
return with_semaphore(_sstable_deletion_sem, 1, [this, &db, jsondir = std::move(jsondir)]() {
auto tables = boost::copy_range<std::vector<sstables::shared_sstable>>(*_sstables->all());
return do_with(std::move(tables), std::move(jsondir), [this, &db] (std::vector<sstables::shared_sstable>& tables, const sstring& jsondir) {
return io_check([&jsondir] { return recursive_touch_directory(jsondir); }).then([this, &db, &jsondir, &tables] {
return max_concurrent_for_each(tables, db.get_config().initial_sstable_loading_concurrency(), [&db, &jsondir] (sstables::shared_sstable sstable) {
return with_semaphore(db.get_sharded_sst_dir_semaphore().local(), 1, [&jsondir, sstable] {
return io_check([sstable, &dir = jsondir] {
return sstable->create_links(dir);
});
});
});
}).then([&jsondir, &tables] {
return io_check(sync_directory, jsondir);
}).finally([this, &tables, &db, &jsondir] {
auto shard = std::hash<sstring>()(jsondir) % smp::count;
std::unordered_set<sstring> table_names;
for (auto& sst : tables) {
auto f = sst->get_filename();
auto rf = f.substr(sst->get_dir().size() + 1);
table_names.insert(std::move(rf));
}
return smp::submit_to(shard, [requester = this_shard_id(), &jsondir, this, &db,
tables = std::move(table_names), datadir = _config.datadir] {
tlogger.debug("snapshot {}", jsondir);
if (!pending_snapshots.contains(jsondir)) {
pending_snapshots.emplace(jsondir, make_lw_shared<snapshot_manager>());
}
auto snapshot = pending_snapshots.at(jsondir);
for (auto&& sst: tables) {
snapshot->files.insert(std::move(sst));
}
auto sstable_deletion_guard = co_await get_units(_sstable_deletion_sem, 1);
std::exception_ptr ex;
snapshot->requests.signal(1);
auto my_work = make_ready_future<>();
if (requester == this_shard_id()) {
my_work = snapshot->requests.wait(smp::count).then([&jsondir,
&db, snapshot, this] {
// this_shard_id() here == requester == this_shard_id() before submit_to() above,
// so the db reference is still local
return write_schema_as_cql(db, jsondir).handle_exception([&jsondir](std::exception_ptr ptr) {
tlogger.error("Failed writing schema file in snapshot in {} with exception {}", jsondir, ptr);
return make_ready_future<>();
}).finally([&jsondir, snapshot] () mutable {
return seal_snapshot(jsondir).handle_exception([&jsondir] (std::exception_ptr ex) {
tlogger.error("Failed to seal snapshot in {}: {}. Ignored.", jsondir, ex);
}).then([snapshot] {
snapshot->manifest_write.signal(smp::count);
return make_ready_future<>();
});
});
});
}
return my_work.finally([snapshot] {
return snapshot->manifest_write.wait(1);
}).then([snapshot] {});
std::vector<sstables::shared_sstable> tables;
try {
tables = boost::copy_range<std::vector<sstables::shared_sstable>>(*_sstables->all());
co_await io_check([&jsondir] { return recursive_touch_directory(jsondir); });
co_await max_concurrent_for_each(tables, db.get_config().initial_sstable_loading_concurrency(), [&db, &jsondir] (sstables::shared_sstable sstable) {
return with_semaphore(db.get_sharded_sst_dir_semaphore().local(), 1, [&jsondir, sstable] {
return io_check([sstable, &dir = jsondir] {
return sstable->create_links(dir);
});
});
});
});
co_await io_check(sync_directory, jsondir);
} catch (...) {
ex = std::current_exception();
}
auto shard = std::hash<sstring>()(jsondir) % smp::count;
std::unordered_set<sstring> table_names;
try {
for (auto& sst : tables) {
auto f = sst->get_filename();
auto rf = f.substr(sst->get_dir().size() + 1);
table_names.insert(std::move(rf));
}
} catch (...) {
ex = std::current_exception();
}
co_await smp::submit_to(shard, [requester = this_shard_id(), &jsondir, this, &db,
tables = std::move(table_names), datadir = _config.datadir, ex = std::move(ex)] () mutable -> future<> {
if (!pending_snapshots.contains(jsondir)) {
try {
pending_snapshots.emplace(jsondir, make_lw_shared<snapshot_manager>());
} catch (...) {
// abort since the process will hang if we can't coordinate
// snapshot across shards, similar to failing to allocate a continuation.
tlogger.error("Failed allocating snapshot_manager: {}. Aborting.", std::current_exception());
abort();
}
}
auto snapshot = pending_snapshots.at(jsondir);
try {
for (auto&& sst: tables) {
snapshot->files.insert(std::move(sst));
}
} catch (...) {
ex = std::current_exception();
}
tlogger.debug("snapshot {}: signal requests", jsondir);
snapshot->requests.signal(1);
if (requester == this_shard_id()) {
tlogger.debug("snapshot {}: waiting for all shards", jsondir);
co_await snapshot->requests.wait(smp::count);
// this_shard_id() here == requester == this_shard_id() before submit_to() above,
// so the db reference is still local
tlogger.debug("snapshot {}: writing schema.cql", jsondir);
co_await write_schema_as_cql(db, jsondir).handle_exception([&] (std::exception_ptr ptr) {
tlogger.error("Failed writing schema file in snapshot in {} with exception {}", jsondir, ptr);
ex = std::move(ptr);
});
tlogger.debug("snapshot {}: seal_snapshot", jsondir);
co_await seal_snapshot(jsondir).handle_exception([&] (std::exception_ptr ptr) {
tlogger.error("Failed to seal snapshot in {}: {}.", jsondir, ptr);
ex = std::move(ptr);
});
snapshot->manifest_write.signal(smp::count);
}
tlogger.debug("snapshot {}: waiting for manifest on behalf of shard {}", jsondir, requester);
co_await snapshot->manifest_write.wait(1);
tlogger.debug("snapshot {}: done: error={}", jsondir, ex);
if (ex) {
std::rethrow_exception(std::move(ex));
}
});
}
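table::snapshot() above becomes a coroutine, but its cross-shard rendezvous is unchanged: every shard signals snapshot->requests once, the requesting shard waits for smp::count signals, writes schema.cql and seals the manifest, then signals snapshot->manifest_write smp::count times, and every shard waits for one of those signals before returning. A minimal sketch of that rendezvous with standard threads and semaphores (illustrative only; the real code uses seastar semaphores and smp::submit_to):

    #include <iostream>
    #include <semaphore>
    #include <thread>
    #include <vector>

    int main() {
        constexpr int smp_count = 4;
        std::counting_semaphore<64> requests(0);
        std::counting_semaphore<64> manifest_write(0);

        auto shard_work = [&](int shard, bool requester) {
            // ... link sstables into the snapshot directory ...
            requests.release(1);                       // snapshot->requests.signal(1)
            if (requester) {
                for (int i = 0; i < smp_count; ++i) {  // requests.wait(smp::count)
                    requests.acquire();
                }
                std::cout << "shard " << shard << " writes schema.cql and the manifest\n";
                manifest_write.release(smp_count);     // manifest_write.signal(smp::count)
            }
            manifest_write.acquire();                  // manifest_write.wait(1)
        };

        std::vector<std::thread> shards;
        for (int i = 0; i < smp_count; ++i) {
            shards.emplace_back(shard_work, i, i == 0);
        }
        for (auto& t : shards) {
            t.join();
        }
    }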
@@ -1571,13 +1598,14 @@ bool table::can_flush() const {
}
future<> table::clear() {
auto permits = co_await _config.dirty_memory_manager->get_all_flush_permits();
if (_commitlog) {
for (auto& t : *_memtables) {
_commitlog->discard_completed_segments(_schema->id(), t->get_and_discard_rp_set());
}
}
_memtables->clear_and_add();
return _cache.invalidate(row_cache::external_updater([] { /* There is no underlying mutation source */ }));
co_await _cache.invalidate(row_cache::external_updater([] { /* There is no underlying mutation source */ }));
}
// NOTE: does not need to be futurized, but might eventually, depending on
@@ -2235,7 +2263,7 @@ std::chrono::milliseconds table::get_coordinator_read_latency_percentile(double
void
table::enable_auto_compaction() {
// FIXME: unmute backlog. turn table backlog back on.
// XXX: unmute backlog. turn table backlog back on.
// see table::disable_auto_compaction() notes.
_compaction_disabled_by_user = false;
trigger_compaction();
@@ -2243,7 +2271,7 @@ table::enable_auto_compaction() {
future<>
table::disable_auto_compaction() {
// FIXME: mute backlog. When we disable background compactions
// XXX: mute backlog. When we disable background compactions
// for the table, we must also disable current backlog of the
// table compaction strategy that contributes to the scheduling
// group resources prioritization.
@@ -2270,9 +2298,8 @@ table::disable_auto_compaction() {
// - it will break computation of major compaction descriptor
// for new submissions
_compaction_disabled_by_user = true;
return with_gate(_async_gate, [this] {
return compaction_manager().stop_ongoing_compactions("disable auto-compaction", this, sstables::compaction_type::Compaction);
});
// FIXME: stop ongoing compactions
return make_ready_future<>();
}
flat_mutation_reader

Some files were not shown because too many files have changed in this diff.