release: prepare for 5.0.4

Merge 'Fix mutation commutativity with shadowable tombstone' from Tomasz Grabiec

This series fixes lack of mutation associativity which manifests as sporadic failures in row_cache_test.cc::test_concurrent_reads_and_eviction due to differences in mutations applied and read.

No known production impact.

Refs https://github.com/scylladb/scylladb/issues/11307

Closes #11312

* github.com:scylladb/scylladb:
  test: mutation_test: Add explicit test for mutation commutativity
  test: random_mutation_generator: Workaround for non-associativity of mutations with shadowable tombstones
  db: mutation_partition: Drop unnecessary maybe_shadow()
  db: mutation_partition: Maintain shadowable tombstone invariant when applying a hard tombstone
  mutation_partition: row: make row marker shadowing symmetric

(cherry picked from commit 484004e766)
2022-09-21 09:16:13 +03:00 · 2022-09-20 23:21:06 +02:00 · 2022-09-20 23:20:43 +02:00 · 2022-09-20 13:42:10 +03:00 · 2022-09-19 10:31:58 +03:00 · 2022-09-19 06:54:25 +03:00
142 changed files with 3045 additions and 768 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/SCYLLA-VERSION-GEN
+++ b/SCYLLA-VERSION-GEN
@@ -60,7 +60,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=5.0.dev
+VERSION=5.0.4

 if test -f version
 then
--- a/alternator/controller.cc
+++ b/alternator/controller.cc
@@ -78,6 +78,11 @@ future<> controller::start_server() {

        _executor.start(std::ref(_gossiper), std::ref(_proxy), std::ref(_mm), std::ref(_sys_dist_ks), sharded_parameter(get_cdc_metadata, std::ref(_cdc_gen_svc)), _ssg.value()).get();
        _server.start(std::ref(_executor), std::ref(_proxy), std::ref(_gossiper)).get();
+        // Note: from this point on, if start_server() throws for any reason,
+        // it must first call stop_server() to stop the executor and server
+        // services we just started - or Scylla will cause an assertion
+        // failure when the controller object is destroyed in the exception
+        // unwinding.
        std::optional<uint16_t> alternator_port;
        if (_config.alternator_port()) {
            alternator_port = _config.alternator_port();
@@ -104,7 +109,13 @@ future<> controller::start_server() {
            }
            opts.erase("require_client_auth");
            opts.erase("truststore");
-            utils::configure_tls_creds_builder(creds.value(), std::move(opts)).get();
+            try {
+                utils::configure_tls_creds_builder(creds.value(), std::move(opts)).get();
+            } catch(...) {
+                logger.error("Failed to set up Alternator TLS credentials: {}", std::current_exception());
+                stop_server().get();
+                std::throw_with_nested(std::runtime_error("Failed to set up Alternator TLS credentials"));
+            }
        }
        bool alternator_enforce_authorization = _config.alternator_enforce_authorization();
        _server.invoke_on_all(
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -2173,6 +2173,9 @@ static attrs_to_get calculate_attrs_to_get(const rjson::value& req, std::unorder
        for (auto it = attributes_to_get.Begin(); it != attributes_to_get.End(); ++it) {
            attribute_path_map_add("AttributesToGet", ret, it->GetString());
        }
+        if (ret.empty()) {
+            throw api_error::validation("Empty AttributesToGet is not allowed. Consider using Select=COUNT instead.");
+        }
        return ret;
    } else if (has_projection_expression) {
        const rjson::value& projection_expression = req["ProjectionExpression"];
@@ -2577,8 +2580,8 @@ static bool hierarchy_actions(
                        // attr member so we can use add()
                        rjson::add_with_string_name(v, attr, std::move(*newv));
                    } else {
-                        throw api_error::validation(format("Can't remove document path {} - not present in item",
-                            subh.get_value()._path));
+                        // Removing a.b when a is a map but a.b doesn't exist
+                        // is silently ignored. It's not considered an error.
                    }
                } else {
                    throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
--- a/alternator/ttl.cc
+++ b/alternator/ttl.cc
@@ -116,9 +116,6 @@ future<executor::request_return_type> executor::update_time_to_live(client_state

 future<executor::request_return_type> executor::describe_time_to_live(client_state& client_state, service_permit permit, rjson::value request) {
    _stats.api_operations.describe_time_to_live++;
-    if (!_proxy.data_dictionary().features().cluster_supports_alternator_ttl()) {
-        co_return api_error::unknown_operation("DescribeTimeToLive not yet supported. Experimental support is available if the 'alternator_ttl' experimental feature is enabled on all nodes.");
-    }
    schema_ptr schema = get_table(_proxy, request);
    std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
    rjson::value desc = rjson::empty_object();
--- a/alternator/ttl.hh
+++ b/alternator/ttl.hh
@@ -12,6 +12,7 @@
 #include <seastar/core/sharded.hh>
 #include <seastar/core/abort_source.hh>
 #include <seastar/core/semaphore.hh>
+#include "data_dictionary/data_dictionary.hh"

 namespace replica {
 class database;
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -624,7 +624,7 @@
                  },
                  {
                     "name":"kn",
-                     "description":"Comma seperated keyspaces name to snapshot",
+                     "description":"Keyspace(s) to snapshot. Multiple keyspaces can be provided using a comma-separated list. If omitted, snapshot all keyspaces.",
                     "required":false,
                     "allowMultiple":false,
                     "type":"string",
@@ -632,7 +632,7 @@
                  },
                  {
                     "name":"cf",
-                     "description":"the column family to snapshot",
+                     "description":"Table(s) to snapshot. Multiple tables (in a single keyspace) can be provided using a comma-separated list. If omitted, snapshot all tables in the given keyspace(s).",
                     "required":false,
                     "allowMultiple":false,
                     "type":"string",
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -669,19 +669,16 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
        });
    }));

-    ss::force_keyspace_flush.set(r, [&ctx](std::unique_ptr<request> req) {
+    ss::force_keyspace_flush.set(r, [&ctx](std::unique_ptr<request> req) -> future<json::json_return_type> {
        auto keyspace = validate_keyspace(ctx, req->param);
        auto column_families = parse_tables(keyspace, ctx, req->query_parameters, "cf");
+        auto &db = ctx.db.local();
        if (column_families.empty()) {
-            column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
+            co_await db.flush_on_all(keyspace);
+        } else {
+            co_await db.flush_on_all(keyspace, std::move(column_families));
        }
-        return ctx.db.invoke_on_all([keyspace, column_families] (replica::database& db) {
-            return parallel_for_each(column_families, [&db, keyspace](const sstring& cf) mutable {
-                return db.find_column_family(keyspace, cf).flush();
-            });
-        }).then([]{
-                return make_ready_future<json::json_return_type>(json_void());
-        });
+        co_return json_void();
    });


@@ -1284,40 +1281,46 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
        });
    });

-    ss::take_snapshot.set(r, [&snap_ctl](std::unique_ptr<request> req) {
-        apilog.debug("take_snapshot: {}", req->query_parameters);
+    ss::take_snapshot.set(r, [&snap_ctl](std::unique_ptr<request> req) -> future<json::json_return_type> {
+        apilog.info("take_snapshot: {}", req->query_parameters);
        auto tag = req->get_query_param("tag");
        auto column_families = split(req->get_query_param("cf"), ",");
        auto sfopt = req->get_query_param("sf");
        auto sf = db::snapshot_ctl::skip_flush(strcasecmp(sfopt.c_str(), "true") == 0);

        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
-
-        auto resp = make_ready_future<>();
-        if (column_families.empty()) {
-            resp = snap_ctl.local().take_snapshot(tag, keynames, sf);
-        } else {
-            if (keynames.empty()) {
-                throw httpd::bad_param_exception("The keyspace of column families must be specified");
+        try {
+            if (column_families.empty()) {
+                co_await snap_ctl.local().take_snapshot(tag, keynames, sf);
+            } else {
+                if (keynames.empty()) {
+                    throw httpd::bad_param_exception("The keyspace of column families must be specified");
+                }
+                if (keynames.size() > 1) {
+                    throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
+                }
+                co_await snap_ctl.local().take_column_family_snapshot(keynames[0], column_families, tag, sf);
            }
-            if (keynames.size() > 1) {
-                throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
-            }
-            resp = snap_ctl.local().take_column_family_snapshot(keynames[0], column_families, tag, sf);
+            co_return json_void();
+        } catch (...) {
+            apilog.error("take_snapshot failed: {}", std::current_exception());
+            throw;
        }
-        return resp.then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
    });

-    ss::del_snapshot.set(r, [&snap_ctl](std::unique_ptr<request> req) {
+    ss::del_snapshot.set(r, [&snap_ctl](std::unique_ptr<request> req) -> future<json::json_return_type> {
+        apilog.info("del_snapshot: {}", req->query_parameters);
        auto tag = req->get_query_param("tag");
        auto column_family = req->get_query_param("cf");

        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
-        return snap_ctl.local().clear_snapshot(tag, keynames, column_family).then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
+        try {
+            co_await snap_ctl.local().clear_snapshot(tag, keynames, column_family);
+            co_return json_void();
+        } catch (...) {
+            apilog.error("del_snapshot failed: {}", std::current_exception());
+            throw;
+        }
    });

    ss::true_snapshots_size.set(r, [&snap_ctl](std::unique_ptr<request> req) {
--- a/atomic_cell.cc
+++ b/atomic_cell.cc
@@ -87,19 +87,24 @@ compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
            // prefer expiring cells.
            return left.is_live_and_has_ttl() ? std::strong_ordering::greater : std::strong_ordering::less;
        }
-        if (left.is_live_and_has_ttl() && left.expiry() != right.expiry()) {
-            return left.expiry() <=> right.expiry();
+        if (left.is_live_and_has_ttl()) {
+            if (left.expiry() != right.expiry()) {
+                return left.expiry() <=> right.expiry();
+            } else {
+                // prefer the cell that was written later,
+                // so it survives longer after it expires, until purged.
+                return right.ttl() <=> left.ttl();
+            }
        }
    } else {
        // Both are deleted
-        if (left.deletion_time() != right.deletion_time()) {
-            // Origin compares big-endian serialized deletion time. That's because it
-            // delegates to AbstractCell.reconcile() which compares values after
-            // comparing timestamps, which in case of deleted cells will hold
-            // serialized expiry.
-            return (uint64_t) left.deletion_time().time_since_epoch().count()
-                   <=> (uint64_t) right.deletion_time().time_since_epoch().count();
-        }
+
+        // Origin compares big-endian serialized deletion time. That's because it
+        // delegates to AbstractCell.reconcile() which compares values after
+        // comparing timestamps, which in case of deleted cells will hold
+        // serialized expiry.
+        return (uint64_t) left.deletion_time().time_since_epoch().count()
+                <=> (uint64_t) right.deletion_time().time_since_epoch().count();
    }
    return std::strong_ordering::equal;
 }
--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -59,7 +59,7 @@ using namespace std::chrono_literals;
 logging::logger cdc_log("cdc");

 namespace cdc {
-static schema_ptr create_log_schema(const schema&, std::optional<utils::UUID> = {});
+static schema_ptr create_log_schema(const schema&, std::optional<utils::UUID> = {}, schema_ptr = nullptr);
 }

 static constexpr auto cdc_group_name = "cdc";
@@ -206,7 +206,7 @@ public:
                return;
            }

-            auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt);
+            auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt, log_schema);

            auto log_mut = log_schema 
                ? db::schema_tables::make_update_table_mutations(db, keyspace.metadata(), log_schema, new_log_schema, timestamp, false)
@@ -484,7 +484,7 @@ bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name) {
    return to_bytes(cdc_deleted_elements_column_prefix) + column_name;
 }

-static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid) {
+static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid, schema_ptr old) {
    schema_builder b(s.ks_name(), log_name(s.cf_name()));
    b.with_partitioner("com.scylladb.dht.CDCPartitioner");
    b.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
@@ -571,6 +571,20 @@ static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID>
        b.set_uuid(*uuid);
    }

+    /**
+     * #10473 - if we are redefining the log table, we need to ensure any dropped
+     * columns are registered in "dropped_columns" table, otherwise clients will not
+     * be able to read data older than now.
+     */
+    if (old) {
+        // not super efficient, but we don't do this often.
+        for (auto& col : old->all_columns()) {
+            if (!b.has_column({col.name(), col.name_as_text() })) {
+                b.without_column(col.name_as_text(), col.type, api::new_timestamp());
+            }
+        }
+    }
+
    return b.build();
 }

--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -1281,6 +1281,13 @@ private:

            const auto& key = _validator.previous_partition_key();

+            if (_validator.current_tombstone()) {
+                throw compaction_aborted_exception(
+                        _schema->ks_name(),
+                        _schema->cf_name(),
+                        "scrub compaction cannot handle invalid fragments with an active range tombstone change");
+            }
+
            // If the unexpected fragment is a partition end, we just drop it.
            // The only case a partition end is invalid is when it comes after
            // another partition end, and we can just drop it in that case.
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -317,9 +317,9 @@ future<> compaction_manager::run_custom_job(replica::table* t, sstables::compact

    auto job_ptr = std::make_unique<noncopyable_function<future<>(sstables::compaction_data&)>>(std::move(job));

-    task->compaction_done = with_semaphore(_maintenance_ops_sem, 1, [this, task, &job = *job_ptr] () mutable {
-        // take read lock for table, so major compaction and resharding can't proceed in parallel.
-        return with_lock(task->compaction_state.lock.for_read(), [this, task, &job] () mutable {
+    task->compaction_done = with_semaphore(_custom_jobs_sem, 1, [this, task, &job = *job_ptr] () mutable {
+            // We don't need to take task->compaction_state.lock.for_read() as it only serializes minor and major compaction.
+
            // Allow caller to know that task (e.g. reshape) was asked to stop while waiting for a chance to run.
            if (task->stopping) {
                throw sstables::compaction_stopped_exception(task->compacting_table->schema()->ks_name(), task->compacting_table->schema()->cf_name(),
@@ -335,7 +335,6 @@ future<> compaction_manager::run_custom_job(replica::table* t, sstables::compact
            // no need to register shared sstables because they're excluded from non-resharding
            // compaction and some of them may not even belong to current shard.
            return job(task->compaction_data);
-        });
    }).then_wrapped([this, task, job_ptr = std::move(job_ptr), type] (future<> f) {
        _stats.active_tasks--;
        _tasks.remove(task);
@@ -353,32 +352,50 @@ future<> compaction_manager::run_custom_job(replica::table* t, sstables::compact
    return task->compaction_done.get_future().then([task] {});
 }

+compaction_manager::compaction_reenabler::compaction_reenabler(compaction_manager& cm, replica::table* t)
+    : _cm(cm)
+    , _table(t)
+    , _compaction_state(cm.get_compaction_state(_table))
+    , _holder(_compaction_state.gate.hold())
+{
+    _compaction_state.compaction_disabled_counter++;
+    cmlog.debug("Temporarily disabled compaction for {}.{}. compaction_disabled_counter={}",
+            _table->schema()->ks_name(), _table->schema()->cf_name(), _compaction_state.compaction_disabled_counter);
+}
+
+compaction_manager::compaction_reenabler::compaction_reenabler(compaction_reenabler&& o) noexcept
+    : _cm(o._cm)
+    , _table(std::exchange(o._table, nullptr))
+    , _compaction_state(o._compaction_state)
+    , _holder(std::move(o._holder))
+{}
+
+compaction_manager::compaction_reenabler::~compaction_reenabler() {
+    // submit compaction request if we're the last holder of the gate which is still opened.
+    if (_table && --_compaction_state.compaction_disabled_counter == 0 && !_compaction_state.gate.is_closed()) {
+        cmlog.debug("Reenabling compaction for {}.{}",
+                _table->schema()->ks_name(), _table->schema()->cf_name());
+        try {
+            _cm.submit(_table);
+        } catch (...) {
+            cmlog.warn("compaction_reenabler could not reenable compaction for {}.{}: {}",
+                    _table->schema()->ks_name(), _table->schema()->cf_name(), std::current_exception());
+        }
+    }
+}
+
+future<compaction_manager::compaction_reenabler>
+compaction_manager::stop_and_disable_compaction(replica::table* t) {
+    compaction_reenabler cre(*this, t);
+    co_await stop_ongoing_compactions("user-triggered operation", t);
+    co_return cre;
+}
+
 future<>
 compaction_manager::run_with_compaction_disabled(replica::table* t, std::function<future<> ()> func) {
-    auto& c_state = _compaction_state[t];
-    auto holder = c_state.gate.hold();
+    compaction_reenabler cre = co_await stop_and_disable_compaction(t);

-    c_state.compaction_disabled_counter++;
-
-    std::exception_ptr err;
-    try {
-        co_await stop_ongoing_compactions("user-triggered operation", t);
-        co_await func();
-    } catch (...) {
-        err = std::current_exception();
-    }
-
-#ifdef DEBUG
-    assert(_compaction_state.contains(t));
-#endif
-    // submit compaction request if we're the last holder of the gate which is still opened.
-    if (--c_state.compaction_disabled_counter == 0 && !c_state.gate.is_closed()) {
-        submit(t);
-    }
-    if (err) {
-        std::rethrow_exception(err);
-    }
-    co_return;
+    co_await func();
 }

 void compaction_manager::task::setup_new_compaction() {
@@ -742,6 +759,7 @@ future<> compaction_manager::perform_offstrategy(replica::table* t) {
                _stats.active_tasks++;
                task->setup_new_compaction();

+              return with_scheduling_group(_maintenance_sg.cpu, [this, task, t] {
                return t->run_offstrategy_compaction(task->compaction_data).then_wrapped([this, task, schema = t->schema()] (future<> f) mutable {
                    _stats.active_tasks--;
                    task->finish_compaction();
@@ -763,6 +781,7 @@ future<> compaction_manager::perform_offstrategy(replica::table* t) {
                    }
                    return make_ready_future<stop_iteration>(stop_iteration::yes);
                });
+              });
            });
        });
    }).finally([this, task] {
@@ -810,7 +829,8 @@ future<> compaction_manager::rewrite_sstables(replica::table* t, sstables::compa
            auto sstable_level = sst->get_sstable_level();
            auto run_identifier = sst->run_identifier();
            auto sstable_set_snapshot = can_purge ? std::make_optional(t.get_sstable_set()) : std::nullopt;
-            auto descriptor = sstables::compaction_descriptor({ sst }, std::move(sstable_set_snapshot), _maintenance_sg.io,
+            // FIXME: this compaction should run with maintenance priority.
+            auto descriptor = sstables::compaction_descriptor({ sst }, std::move(sstable_set_snapshot), service::get_local_compaction_priority(),
                sstable_level, sstables::compaction_descriptor::default_max_sstable_bytes, run_identifier, options);

            // Releases reference to cleaned sstable such that respective used disk space can be freed.
@@ -819,8 +839,9 @@ future<> compaction_manager::rewrite_sstables(replica::table* t, sstables::compa
            };

            auto maintenance_permit = co_await seastar::get_units(_maintenance_ops_sem, 1);
-            // Take write lock for table to serialize cleanup/upgrade sstables/scrub with major compaction/reshape/reshard.
-            auto write_lock_holder = co_await _compaction_state[&t].lock.hold_write_lock();
+            // FIXME: acquiring the read lock is not needed after acquiring the _maintenance_ops_sem
+            // only major compaction needs to acquire the write lock to synchronize with regular compaction.
+            auto lock_holder = co_await _compaction_state[&t].lock.hold_read_lock();

            _stats.pending_tasks--;
            _stats.active_tasks++;
@@ -852,7 +873,7 @@ future<> compaction_manager::rewrite_sstables(replica::table* t, sstables::compa
            };

            compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_compaction_controller.backlog_of_shares(200), _available_memory));
-            completed = co_await with_scheduling_group(_maintenance_sg.cpu, std::ref(perform_rewrite));
+            completed = co_await with_scheduling_group(_compaction_controller.sg(), std::ref(perform_rewrite));
        } while (!completed);
    };

--- a/compaction/compaction_manager.hh
+++ b/compaction/compaction_manager.hh
@@ -147,6 +147,8 @@ private:
    // If the operation must be serialized with regular, then the per-table write lock must be taken.
    seastar::named_semaphore _maintenance_ops_sem = {1, named_semaphore_exception_factory{"maintenance operation"}};

+    seastar::named_semaphore _custom_jobs_sem = {1, named_semaphore_exception_factory{"custom jobs"}};
+
    std::function<void()> compaction_submission_callback();
    // all registered tables are reevaluated at a constant interval.
    // Submission is a NO-OP when there's nothing to do, so it's fine to call it regularly.
@@ -269,6 +271,31 @@ public:
    // parameter job is a function that will carry the operation
    future<> run_custom_job(replica::table* t, sstables::compaction_type type, noncopyable_function<future<>(sstables::compaction_data&)> job);

+    class compaction_reenabler {
+        compaction_manager& _cm;
+        replica::table* _table;
+        compaction_state& _compaction_state;
+        gate::holder _holder;
+
+    public:
+        compaction_reenabler(compaction_manager&, replica::table*);
+        compaction_reenabler(compaction_reenabler&&) noexcept;
+
+        ~compaction_reenabler();
+
+        replica::table* compacting_table() const noexcept {
+            return _table;
+        }
+
+        const compaction_state& compaction_state() const noexcept {
+            return _compaction_state;
+        }
+    };
+
+    // Stop ongoing compactions on table t and disable compaction for it temporarily.
+    // Compaction is re-enabled when the last outstanding compaction_reenabler for t is destroyed.
+    future<compaction_reenabler> stop_and_disable_compaction(replica::table* t);
+
    // Run a function with compaction temporarily disabled for a table T.
    future<> run_with_compaction_disabled(replica::table* t, std::function<future<> ()> func);

--- a/compaction/leveled_compaction_strategy.cc
+++ b/compaction/leveled_compaction_strategy.cc
@@ -69,7 +69,11 @@ compaction_descriptor leveled_compaction_strategy::get_major_compaction_job(tabl
 }

 void leveled_compaction_strategy::notify_completion(const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) {
-    if (removed.empty() || added.empty()) {
+    // All the updates here are only relevant for regular compaction's round-robin picking policy, and if
+    // last_compacted_keys wasn't generated by regular, it means regular is disabled since last restart,
+    // therefore we can skip the updates here until regular runs for the first time. Once it runs,
+    // it will be able to generate last_compacted_keys correctly by looking at metadata of files.
+    if (removed.empty() || added.empty() || !_last_compacted_keys) {
        return;
    }
    auto min_level = std::numeric_limits<uint32_t>::max();
--- a/compaction/time_window_compaction_strategy.cc
+++ b/compaction/time_window_compaction_strategy.cc
@@ -217,6 +217,7 @@ time_window_compaction_strategy::get_sstables_for_compaction(table_state& table_
    auto compaction_time = gc_clock::now();

    if (candidates.empty()) {
+        _estimated_remaining_tasks = 0;
        return compaction_descriptor();
    }

--- a/cql3/cql_statement.hh
+++ b/cql3/cql_statement.hh
@@ -81,9 +81,7 @@ public:
    virtual seastar::future<seastar::shared_ptr<cql_transport::messages::result_message>>
        execute(query_processor& qp, service::query_state& state, const query_options& options) const = 0;

-    virtual bool depends_on_keyspace(const seastar::sstring& ks_name) const = 0;
-
-    virtual bool depends_on_column_family(const seastar::sstring& cf_name) const = 0;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const = 0;

    virtual seastar::shared_ptr<const metadata> get_result_metadata() const = 0;

--- a/cql3/expr/expression.cc
+++ b/cql3/expr/expression.cc
@@ -103,10 +103,50 @@ managed_bytes_opt get_value(const column_value& col, const column_value_eval_bag
        if (!col_type->is_map()) {
            throw exceptions::invalid_request_exception(format("subscripting non-map column {}", cdef->name_as_text()));
        }
-        const auto deserialized = cdef->type->deserialize(managed_bytes_view(*data.other_columns[data.sel.index_of(*cdef)]));
+        int32_t index = data.sel.index_of(*cdef);
+        if (index == -1) {
+            throw std::runtime_error(
+                    format("Column definition {} does not match any column in the query selection",
+                    cdef->name_as_text()));
+        }
+        const managed_bytes_opt& serialized = data.other_columns[index];
+        if (!serialized) {
+            // For null[i] we return null.
+            return std::nullopt;
+        }
+        const auto deserialized = cdef->type->deserialize(managed_bytes_view(*serialized));
        const auto& data_map = value_cast<map_type_impl::native_type>(deserialized);
        const auto key = evaluate(*col.sub, options);
        auto&& key_type = col_type->name_comparator();
+        if (key.is_null()) {
+            // For m[null] return null.
+            // This is different from Cassandra - which treats m[null]
+            // as an invalid request error. But m[null] -> null is more
+            // consistent with our usual null treatment (e.g., both
+            // null[2] and null < 2 return null). It will also allow us
+            // to support non-constant subscripts (e.g., m[a]) where "a"
+            // may be null in some rows and non-null in others, and it's
+            // not an error.
+            return std::nullopt;
+        }
+        if (key.is_unset_value()) {
+            // An m[?] with ? bound to UNSET_VALUE is an invalid query.
+            // We could have detected it earlier while binding, but since
+            // we currently don't, we must protect the following code
+            // which can't work with an UNSET_VALUE. Note that the
+            // placement of this check here means that in an empty table,
+            // where we never need to evaluate the filter expression, this
+            // error will not be detected.
+            throw exceptions::invalid_request_exception(
+                format("Unsupported unset map key for column {}",
+                    cdef->name_as_text()));
+        }
+        if (key.type != key_type) {
+            // This can't happen, we always verify the index type earlier.
+            throw std::logic_error(
+                format("Tried to evaluate expression with wrong type for subscript of {}",
+                    cdef->name_as_text()));
+        }
        const auto found = key.view().with_linearized([&] (bytes_view key_bv) {
            using entry = std::pair<data_value, data_value>;
            return std::find_if(data_map.cbegin(), data_map.cend(), [&] (const entry& element) {
@@ -121,8 +161,16 @@ managed_bytes_opt get_value(const column_value& col, const column_value_eval_bag
        case column_kind::clustering_key:
            return managed_bytes(data.clustering_key[cdef->id]);
        case column_kind::static_column:
-        case column_kind::regular_column:
-            return managed_bytes_opt(data.other_columns[data.sel.index_of(*cdef)]);
+            [[fallthrough]];
+        case column_kind::regular_column: {
+            int32_t index = data.sel.index_of(*cdef);
+            if (index == -1) {
+                throw std::runtime_error(
+                        format("Column definition {} does not match any column in the query selection",
+                        cdef->name_as_text()));
+            }
+            return managed_bytes_opt(data.other_columns[index]);
+        }
        default:
            throw exceptions::unsupported_operation_exception("Unknown column kind");
        }
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -953,7 +953,7 @@ bool query_processor::migration_subscriber::should_invalidate(
        sstring ks_name,
        std::optional<sstring> cf_name,
        ::shared_ptr<cql_statement> statement) {
-    return statement->depends_on_keyspace(ks_name) && (!cf_name || statement->depends_on_column_family(*cf_name));
+    return statement->depends_on(ks_name, cf_name);
 }

 future<> query_processor::query_internal(
--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -514,7 +514,7 @@ statement_restrictions::statement_restrictions(data_dictionary::database db,
    }

    if (!_nonprimary_key_restrictions->empty()) {
-        if (_has_queriable_regular_index) {
+        if (_has_queriable_regular_index && _partition_range_is_simple) {
            _uses_secondary_indexing = true;
        } else if (!allow_filtering) {
            throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
--- a/cql3/result_set.hh
+++ b/cql3/result_set.hh
@@ -165,7 +165,7 @@ public:

    template<typename RowComparator>
    void sort(const RowComparator& cmp) {
-        std::sort(_rows.begin(), _rows.end(), std::ref(cmp));
+        std::sort(_rows.begin(), _rows.end(), cmp);
    }

    metadata& get_metadata();
--- a/cql3/statements/authentication_statement.cc
+++ b/cql3/statements/authentication_statement.cc
@@ -18,13 +18,7 @@ uint32_t cql3::statements::authentication_statement::get_bound_terms() const {
    return 0;
 }

-bool cql3::statements::authentication_statement::depends_on_keyspace(
-                const sstring& ks_name) const {
-    return false;
-}
-
-bool cql3::statements::authentication_statement::depends_on_column_family(
-                const sstring& cf_name) const {
+bool cql3::statements::authentication_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
    return false;
 }

--- a/cql3/statements/authentication_statement.hh
+++ b/cql3/statements/authentication_statement.hh
@@ -27,9 +27,7 @@ public:

    uint32_t get_bound_terms() const override;

-    bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    bool depends_on_column_family(const sstring& cf_name) const override;
+    bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    future<> check_access(query_processor& qp, const service::client_state& state) const override;

--- a/cql3/statements/authorization_statement.cc
+++ b/cql3/statements/authorization_statement.cc
@@ -20,13 +20,7 @@ uint32_t cql3::statements::authorization_statement::get_bound_terms() const {
    return 0;
 }

-bool cql3::statements::authorization_statement::depends_on_keyspace(
-                const sstring& ks_name) const {
-    return false;
-}
-
-bool cql3::statements::authorization_statement::depends_on_column_family(
-                const sstring& cf_name) const {
+bool cql3::statements::authorization_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
    return false;
 }

--- a/cql3/statements/authorization_statement.hh
+++ b/cql3/statements/authorization_statement.hh
@@ -31,9 +31,7 @@ public:

    uint32_t get_bound_terms() const override;

-    bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    bool depends_on_column_family(const sstring& cf_name) const override;
+    bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    future<> check_access(query_processor& qp, const service::client_state& state) const override;

--- a/cql3/statements/batch_statement.cc
+++ b/cql3/statements/batch_statement.cc
@@ -70,14 +70,9 @@ batch_statement::batch_statement(type type_,
 {
 }

-bool batch_statement::depends_on_keyspace(const sstring& ks_name) const
+bool batch_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
 {
-    return false;
-}
-
-bool batch_statement::depends_on_column_family(const sstring& cf_name) const
-{
-    return false;
+    return boost::algorithm::any_of(_statements, [&ks_name, &cf_name] (auto&& s) { return s.statement->depends_on(ks_name, cf_name); });
 }

 uint32_t batch_statement::get_bound_terms() const
--- a/cql3/statements/batch_statement.hh
+++ b/cql3/statements/batch_statement.hh
@@ -88,9 +88,7 @@ public:
                    std::unique_ptr<attributes> attrs,
                    cql_stats& stats);

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual uint32_t get_bound_terms() const override;

--- a/cql3/statements/cf_properties.hh
+++ b/cql3/statements/cf_properties.hh
@@ -13,6 +13,7 @@

 #include "cql3/statements/cf_prop_defs.hh"
 #include "cql3/column_identifier.hh"
+#include "data_dictionary/data_dictionary.hh"

 namespace cql3 {

--- a/cql3/statements/modification_statement.cc
+++ b/cql3/statements/modification_statement.cc
@@ -539,12 +539,8 @@ modification_statement::validate(query_processor&, const service::client_state&
    }
 }

-bool modification_statement::depends_on_keyspace(const sstring& ks_name) const {
-    return keyspace() == ks_name;
-}
-
-bool modification_statement::depends_on_column_family(const sstring& cf_name) const {
-    return column_family() == cf_name;
+bool modification_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
+    return keyspace() == ks_name && (!cf_name || column_family() == *cf_name);
 }

 void modification_statement::add_operation(::shared_ptr<operation> op) {
--- a/cql3/statements/modification_statement.hh
+++ b/cql3/statements/modification_statement.hh
@@ -137,9 +137,7 @@ public:
    // Validate before execute, using client state and current schema
    void validate(query_processor&, const service::client_state& state) const override;

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    void add_operation(::shared_ptr<operation> op);

--- a/cql3/statements/schema_altering_statement.cc
+++ b/cql3/statements/schema_altering_statement.cc
@@ -45,12 +45,7 @@ future<> schema_altering_statement::grant_permissions_to_creator(const service::
    return make_ready_future<>();
 }

-bool schema_altering_statement::depends_on_keyspace(const sstring& ks_name) const
-{
-    return false;
-}
-
-bool schema_altering_statement::depends_on_column_family(const sstring& cf_name) const
+bool schema_altering_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
 {
    return false;
 }
--- a/cql3/statements/schema_altering_statement.hh
+++ b/cql3/statements/schema_altering_statement.hh
@@ -53,9 +53,7 @@ protected:
     */
    virtual future<> grant_permissions_to_creator(const service::client_state&) const;

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual uint32_t get_bound_terms() const override;

--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -167,12 +167,8 @@ void select_statement::validate(query_processor&, const service::client_state& s
    // Nothing to do, all validation has been done by raw_statemet::prepare()
 }

-bool select_statement::depends_on_keyspace(const sstring& ks_name) const {
-    return keyspace() == ks_name;
-}
-
-bool select_statement::depends_on_column_family(const sstring& cf_name) const {
-    return column_family() == cf_name;
+bool select_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
+    return keyspace() == ks_name && (!cf_name || column_family() == *cf_name);
 }

 const sstring& select_statement::keyspace() const {
--- a/cql3/statements/select_statement.hh
+++ b/cql3/statements/select_statement.hh
@@ -100,8 +100,7 @@ public:
    virtual uint32_t get_bound_terms() const override;
    virtual future<> check_access(query_processor& qp, const service::client_state& state) const override;
    virtual void validate(query_processor&, const service::client_state& state) const override;
-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual future<::shared_ptr<cql_transport::messages::result_message>> execute(query_processor& qp,
        service::query_state& state, const query_options& options) const override;
--- a/cql3/statements/service_level_statement.cc
+++ b/cql3/statements/service_level_statement.cc
@@ -17,13 +17,7 @@ uint32_t service_level_statement::get_bound_terms() const {
    return 0;
 }

-bool service_level_statement::depends_on_keyspace(
-        const sstring &ks_name) const {
-    return false;
-}
-
-bool service_level_statement::depends_on_column_family(
-        const sstring &cf_name) const {
+bool service_level_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
    return false;
 }

--- a/cql3/statements/service_level_statement.hh
+++ b/cql3/statements/service_level_statement.hh
@@ -43,9 +43,7 @@ public:

    uint32_t get_bound_terms() const override;

-    bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    bool depends_on_column_family(const sstring& cf_name) const override;
+    bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    future<> check_access(query_processor& qp, const service::client_state& state) const override;

--- a/cql3/statements/sl_prop_defs.cc
+++ b/cql3/statements/sl_prop_defs.cc
@@ -30,7 +30,7 @@ void sl_prop_defs::validate() {
        data_value v = duration_type->deserialize(duration_type->from_string(*repr));
        cql_duration duration = static_pointer_cast<const duration_type_impl>(duration_type)->from_value(v);
        if (duration.months || duration.days) {
-            throw exceptions::invalid_request_exception("Timeout values cannot be longer than 24h");
+            throw exceptions::invalid_request_exception("Timeout values cannot be expressed in days/months");
        }
        if (duration.nanoseconds % 1'000'000 != 0) {
            throw exceptions::invalid_request_exception("Timeout values must be expressed in millisecond granularity");
--- a/cql3/statements/truncate_statement.cc
+++ b/cql3/statements/truncate_statement.cc
@@ -39,12 +39,7 @@ std::unique_ptr<prepared_statement> truncate_statement::prepare(data_dictionary:
    return std::make_unique<prepared_statement>(::make_shared<truncate_statement>(*this));
 }

-bool truncate_statement::depends_on_keyspace(const sstring& ks_name) const
-{
-    return false;
-}
-
-bool truncate_statement::depends_on_column_family(const sstring& cf_name) const
+bool truncate_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
 {
    return false;
 }
--- a/cql3/statements/truncate_statement.hh
+++ b/cql3/statements/truncate_statement.hh
@@ -30,9 +30,7 @@ public:

    virtual std::unique_ptr<prepared_statement> prepare(data_dictionary::database db, cql_stats& stats) override;

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual future<> check_access(query_processor& qp, const service::client_state& state) const override;

--- a/cql3/statements/use_statement.cc
+++ b/cql3/statements/use_statement.cc
@@ -46,12 +46,7 @@ std::unique_ptr<prepared_statement> use_statement::prepare(data_dictionary::data

 }

-bool use_statement::depends_on_keyspace(const sstring& ks_name) const
-{
-    return false;
-}
-
-bool use_statement::depends_on_column_family(const sstring& cf_name) const
+bool use_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
 {
    return false;
 }
--- a/cql3/statements/use_statement.hh
+++ b/cql3/statements/use_statement.hh
@@ -31,9 +31,7 @@ public:

    virtual uint32_t get_bound_terms() const override;

-    virtual bool depends_on_keyspace(const seastar::sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const seastar::sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual seastar::future<> check_access(query_processor& qp, const service::client_state& state) const override;

--- a/cql3/type_json.cc
+++ b/cql3/type_json.cc
@@ -78,8 +78,35 @@ static int64_t to_int64_t(const rjson::value& value) {
        return value.GetInt();
    } else if (value.IsUint()) {
        return value.GetUint();
-    } else if (value.GetUint64()) {
+    } else if (value.IsUint64()) {
        return value.GetUint64(); //NOTICE: large uint64_t values will get overflown
+    } else if (value.IsDouble()) {
+        // We allow specifying integer constants
+        // using scientific notation (for example 1.3e8)
+        // and floating-point numbers ending with .0 (for example 12.0),
+        // but not floating-point numbers with fractional part (12.34).
+        //
+        // The reason is that JSON standard does not have separate
+        // types for integers and floating-point numbers, only
+        // a single "number" type. Some serializers may
+        // produce an integer in that floating-point format.
+        double double_value = value.GetDouble();
+
+        // Check if the value contains disallowed fractional part (.34 from 12.34).
+        // With RapidJSON and an integer value in range [-(2^53)+1, (2^53)-1], 
+        // the fractional part will be zero as the entire value
+        // fits in 53-bit significand. RapidJSON's parsing code does not lose accuracy:
+        // when parsing a number like 12.34e8, it accumulates 1234 into an int64_t number,
+        // then converts it to double and multiplies by a power of 10, never having any
+        // digit in the fractional part.
+        double integral;
+        double fractional = std::modf(double_value, &integral);
+        if (fractional != 0.0 && fractional != -0.0) {
+            throw marshal_exception(format("Incorrect JSON floating-point value "
+                "for int64 type: {} (it should not contain fractional part {})", value, fractional));
+        }
+
+        return double_value;
    }
    throw marshal_exception(format("Incorrect JSON value for int64 type: {}", value));
 }
--- a/db/config.cc
+++ b/db/config.cc
@@ -65,6 +65,25 @@ hinted_handoff_enabled_to_json(const db::config::hinted_handoff_enabled_type& h)
    return value_to_json(h.to_configuration_string());
 }

+// Convert a value that can be printed with operator<<, or a vector of
+// such values, to JSON. An example is enum_option<T>, because enum_option<T>
+// has an operator<<.
+template <typename T>
+static json::json_return_type
+printable_to_json(const T& e) {
+    return value_to_json(format("{}", e));
+}
+template <typename T>
+static json::json_return_type
+printable_vector_to_json(const std::vector<T>& e) {
+    std::vector<sstring> converted;
+    converted.reserve(e.size());
+    for (const auto& option : e) {
+        converted.push_back(format("{}", option));
+    }
+    return value_to_json(converted);
+}
+
 template <>
 const config_type config_type_for<bool> = config_type("bool", value_to_json<bool>);

@@ -109,11 +128,11 @@ const config_type config_type_for<db::seed_provider_type> = config_type("seed pr

 template <>
 const config_type config_type_for<std::vector<enum_option<db::experimental_features_t>>> = config_type(
-        "experimental features", value_to_json<std::vector<sstring>>);
+        "experimental features", printable_vector_to_json<enum_option<db::experimental_features_t>>);

 template <>
 const config_type config_type_for<enum_option<db::tri_mode_restriction_t>> = config_type(
-        "restriction mode", value_to_json<sstring>);
+        "restriction mode", printable_to_json<enum_option<db::tri_mode_restriction_t>>);

 template <>
 const config_type config_type_for<db::config::hinted_handoff_enabled_type> = config_type("hinted handoff enabled", hinted_handoff_enabled_to_json);
@@ -862,6 +881,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "Flush tables in the system_schema keyspace after schema modification. This is required for crash recovery, but slows down tests and can be disabled for them")
    , restrict_replication_simplestrategy(this, "restrict_replication_simplestrategy", liveness::LiveUpdate, value_status::Used, db::tri_mode_restriction_t::mode::FALSE, "Controls whether to disable SimpleStrategy replication. Can be true, false, or warn.")
    , restrict_dtcs(this, "restrict_dtcs", liveness::LiveUpdate, value_status::Used, db::tri_mode_restriction_t::mode::WARN, "Controls whether to prevent setting DateTieredCompactionStrategy. Can be true, false, or warn.")
+    , cache_index_pages(this, "cache_index_pages", liveness::LiveUpdate, value_status::Used, true,
+        "Keep SSTable index pages in the global cache after a SSTable read. Expected to improve performance for workloads with big partitions, but may degrade performance for workloads with small partitions.")
    , default_log_level(this, "default_log_level", value_status::Used)
    , logger_log_level(this, "logger_log_level", value_status::Used)
    , log_to_stdout(this, "log_to_stdout", value_status::Used)
--- a/db/config.hh
+++ b/db/config.hh
@@ -365,6 +365,9 @@ public:
    named_value<tri_mode_restriction> restrict_replication_simplestrategy;
    named_value<tri_mode_restriction> restrict_dtcs;

+
+    named_value<bool> cache_index_pages;
+
    seastar::logging_settings logging_settings(const log_cli::options&) const;

    const db::extensions& extensions() const;
--- a/db/legacy_schema_migrator.cc
+++ b/db/legacy_schema_migrator.cc
@@ -574,12 +574,8 @@ public:
    }

    future<> flush_schemas() {
-        return _qp.proxy().get_db().invoke_on_all([this] (replica::database& db) {
-            return parallel_for_each(db::schema_tables::all_table_names(schema_features::full()), [this, &db](const sstring& cf_name) {
-                auto& cf = db.find_column_family(db::schema_tables::NAME, cf_name);
-                return cf.flush();
-            });
-        });
+        auto& db = _qp.db().real_database();
+        return db.flush_on_all(db::schema_tables::NAME, db::schema_tables::all_table_names(schema_features::full()));
    }

    future<> migrate() {
--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -1042,12 +1042,9 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std:
    co_await proxy.local().mutate_locally(std::move(mutations), tracing::trace_state_ptr());

    if (do_flush) {
-        co_await proxy.local().get_db().invoke_on_all([&] (replica::database& db) -> future<> {
-            auto& cfs = column_families;
-            co_await parallel_for_each(cfs.begin(), cfs.end(), [&] (const utils::UUID& id) -> future<> {
-                auto& cf = db.find_column_family(id);
-                co_await cf.flush();
-            });
+        auto& db = proxy.local().local_db();
+        co_await parallel_for_each(column_families, [&db] (const utils::UUID& id) -> future<> {
+            return db.flush_on_all(id);
        });
    }

--- a/db/snapshot-ctl.cc
+++ b/db/snapshot-ctl.cc
@@ -11,6 +11,8 @@
 */

 #include <boost/range/adaptors.hpp>
+#include <seastar/core/coroutine.hh>
+#include <seastar/coroutine/maybe_yield.hh>
 #include "db/snapshot-ctl.hh"
 #include "replica/database.hh"

@@ -59,20 +61,17 @@ future<> snapshot_ctl::take_snapshot(sstring tag, std::vector<sstring> keyspace_
        boost::copy(_db.local().get_keyspaces() | boost::adaptors::map_keys, std::back_inserter(keyspace_names));
    };

-    return run_snapshot_modify_operation([tag = std::move(tag), keyspace_names = std::move(keyspace_names), sf, this] {
-        return parallel_for_each(keyspace_names, [tag, this] (auto& ks_name) {
-            return check_snapshot_not_exist(ks_name, tag);
-        }).then([this, tag, keyspace_names, sf] {
-            return _db.invoke_on_all([tag = std::move(tag), keyspace_names, sf] (replica::database& db) {
-                return parallel_for_each(keyspace_names, [&db, tag = std::move(tag), sf] (auto& ks_name) {
-                    auto& ks = db.find_keyspace(ks_name);
-                    return parallel_for_each(ks.metadata()->cf_meta_data(), [&db, tag = std::move(tag), sf] (auto& pair) {
-                        auto& cf = db.find_column_family(pair.second);
-                        return cf.snapshot(db, tag, bool(sf));
-                    });
-                });
-            });
-        });
+    return run_snapshot_modify_operation([tag = std::move(tag), keyspace_names = std::move(keyspace_names), sf, this] () mutable {
+        return do_take_snapshot(std::move(tag), std::move(keyspace_names), sf);
+    });
+}
+
+future<> snapshot_ctl::do_take_snapshot(sstring tag, std::vector<sstring> keyspace_names, skip_flush sf) {
+    co_await parallel_for_each(keyspace_names, [tag, this] (const auto& ks_name) {
+        return check_snapshot_not_exist(ks_name, tag);
+    });
+    co_await parallel_for_each(keyspace_names, [this, tag = std::move(tag), sf] (const auto& ks_name) {
+        return _db.local().snapshot_on_all(ks_name, tag, bool(sf));
    });
 }

@@ -87,23 +86,23 @@ future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<
        throw std::runtime_error("You must supply a snapshot name.");
    }

-    return run_snapshot_modify_operation([this, ks_name = std::move(ks_name), tables = std::move(tables), tag = std::move(tag), sf] {
-        return check_snapshot_not_exist(ks_name, tag, tables).then([this, ks_name, tables, tag, sf] {
-            return do_with(std::vector<sstring>(std::move(tables)),[this, ks_name, tag, sf](const std::vector<sstring>& tables) {
-                return do_for_each(tables, [ks_name, tag, sf, this] (const sstring& table_name) {
-                    if (table_name.find(".") != sstring::npos) {
-                        throw std::invalid_argument("Cannot take a snapshot of a secondary index by itself. Run snapshot on the table that owns the index.");
-                    }
-                    return _db.invoke_on_all([ks_name, table_name, tag, sf] (replica::database &db) {
-                        auto& cf = db.find_column_family(ks_name, table_name);
-                        return cf.snapshot(db, tag, bool(sf));
-                    });
-                });
-            });
-        });
+    return run_snapshot_modify_operation([this, ks_name = std::move(ks_name), tables = std::move(tables), tag = std::move(tag), sf] () mutable {
+        return do_take_column_family_snapshot(std::move(ks_name), std::move(tables), std::move(tag), sf);
    });
 }

+future<> snapshot_ctl::do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf) {
+    co_await check_snapshot_not_exist(ks_name, tag, tables);
+
+    for (const auto& table_name : tables) {
+        auto& cf = _db.local().find_column_family(ks_name, table_name);
+        if (cf.schema()->is_view()) {
+            throw std::invalid_argument("Do not take a snapshot of a materialized view or a secondary index by itself. Run snapshot on the base table instead.");
+        }
+    }
+    co_await _db.local().snapshot_on_all(ks_name, std::move(tables), std::move(tag), bool(sf));
+}
+
 future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag, skip_flush sf) {
    return take_column_family_snapshot(ks_name, std::vector<sstring>{cf_name}, tag, sf);
 }
--- a/db/snapshot-ctl.hh
+++ b/db/snapshot-ctl.hh
@@ -97,6 +97,9 @@ private:

    template <typename Func>
    std::result_of_t<Func()> run_snapshot_list_operation(Func&&);
+
+    future<> do_take_snapshot(sstring tag, std::vector<sstring> keyspace_names, skip_flush sf = skip_flush::no);
+    future<> do_take_column_family_snapshot(sstring ks_name, std::vector<sstring> tables, sstring tag, skip_flush sf = skip_flush::no);
 };

 }
--- a/db/system_keyspace_view_types.hh
+++ b/db/system_keyspace_view_types.hh
@@ -10,6 +10,7 @@

 #include <seastar/core/seastar.hh>
 #include <seastar/core/sstring.hh>
+#include <seastar/core/reactor.hh>
 #include <utility>
 #include <optional>
 #include "dht/token.hh"
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -322,7 +322,11 @@ public:
    view_filter_checking_visitor(const schema& base, const view_info& view)
        : _base(base)
        , _view(view)
-        , _selection(cql3::selection::selection::wildcard(_base.shared_from_this()))
+        , _selection(cql3::selection::selection::for_columns(_base.shared_from_this(),
+            boost::copy_range<std::vector<const column_definition*>>(
+                _base.regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return &cdef; }))
+            )
+        )
    {}

    void accept_new_partition(const partition_key& key, uint64_t row_count) {
@@ -1293,7 +1297,7 @@ future<> mutate_MV(
            auto mut_ptr = remote_endpoints.empty() ? std::make_unique<frozen_mutation>(std::move(mut.fm)) : std::make_unique<frozen_mutation>(mut.fm);
            tracing::trace(tr_state, "Locally applying view update for {}.{}; base token = {}; view token = {}",
                    mut.s->ks_name(), mut.s->cf_name(), base_token, view_token);
-            local_view_update = service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr, std::move(tr_state), db::commitlog::force_sync::no).then_wrapped(
+            local_view_update = service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr, tr_state, db::commitlog::force_sync::no).then_wrapped(
                    [s = mut.s, &stats, &cf_stats, tr_state, base_token, view_token, my_address, mut_ptr = std::move(mut_ptr),
                            units = sem_units.split(sem_units.count())] (future<>&& f) {
                --stats.writes;
--- a/db/view/view_updating_consumer.hh
+++ b/db/view/view_updating_consumer.hh
@@ -16,6 +16,7 @@
 #include "db/view/row_locking.hh"
 #include <seastar/core/abort_source.hh>
 #include "mutation.hh"
+#include <seastar/core/circular_buffer.hh>

 class evictable_reader_handle;

--- a/dirty_memory_manager.hh
+++ b/dirty_memory_manager.hh
@@ -202,6 +202,12 @@ public:
        });
    }

+    future<flush_permit> get_all_flush_permits() {
+        return get_units(_background_work_flush_serializer, _max_background_work).then([this] (auto&& units) {
+            return this->get_flush_permit(std::move(units));
+        });
+    }
+
    bool has_extraneous_flushes_requested() const {
        return _extraneous_flushes > 0;
    }
--- a/dist/common/scripts/scylla_coredump_setup
+++ b/dist/common/scripts/scylla_coredump_setup
@@ -123,10 +123,14 @@ WantedBy=multi-user.target
        #  - Storage: /path/to/file (inacessible)
        #  - Storage: /path/to/file
        #
+        # After systemd-v248, available coredump file output changed like this:
+        #  - Storage: /path/to/file (present)
+        # We need to support both versions.
+        #
        # reference: https://github.com/systemd/systemd/commit/47f50642075a7a215c9f7b600599cbfee81a2913

        corefail = False
-        res = re.findall(r'Storage: (.*)$', coreinfo, flags=re.MULTILINE)
+        res = re.findall(r'Storage: (\S+)(?: \(.+\))?$', coreinfo, flags=re.MULTILINE)
        # v232 or later
        if res:
            corepath = res[0]
--- a/dist/common/scripts/scylla_sysconfig_setup
+++ b/dist/common/scripts/scylla_sysconfig_setup
@@ -70,7 +70,17 @@ if __name__ == '__main__':
    network_mode = args.mode if args.mode else cfg.get('NETWORK_MODE')

    if args.setup_nic_and_disks:
-        rps_cpus = run('{} --tune net --nic {} --get-cpu-mask'.format(perftune_base_command(), ifname), shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
+        res = run('{} --tune net --nic {} --get-cpu-mask'.format(perftune_base_command(), ifname), shell=True, check=True, capture_output=True, encoding='utf-8').stdout
+        # we need to extract CPU mask from output, since perftune.py may also print warning messages (#10082)
+        match = re.match('(.*\n)?(0x[0-9a-f]+(?:,0x[0-9a-f]+)*)', res, re.DOTALL)
+        try:
+            warning = match.group(1)
+            rps_cpus = match.group(2)
+        except:
+            raise Exception(f'Failed to retrive CPU mask: {res}')
+        # print warning message if available
+        if warning:
+            print(warning.strip())
        if len(rps_cpus) > 0:
            cpuset = hex2list(rps_cpus)
            run('/opt/scylladb/scripts/scylla_cpuset_setup --cpuset {}'.format(cpuset), shell=True, check=True)
--- a/dist/common/supervisor/scylla_util.sh
+++ b/dist/common/supervisor/scylla_util.sh
@@ -6,12 +6,16 @@ is_nonroot() {
    [ -f "$scylladir"/SCYLLA-NONROOT-FILE ]
 }

+is_container() {
+    [ -f "$scylladir"/SCYLLA-CONTAINER-FILE ]
+}
+
 is_privileged() {
    [ ${EUID:-${UID}} = 0 ]
 }

 execsudo() {
-    if is_nonroot; then
+    if is_nonroot || is_container; then
        exec "$@"
    else
        exec sudo -u scylla -g scylla "$@"
--- a/dist/docker/debian/build_docker.sh
+++ b/dist/docker/debian/build_docker.sh
@@ -82,15 +82,17 @@ run bash -ec "echo 'debconf debconf/frontend select Noninteractive' | debconf-se
 run bash -ec "rm -rf /etc/rsyslog.conf"
 run apt-get -y install hostname supervisor openssh-server openssh-client openjdk-11-jre-headless python python-yaml curl rsyslog locales sudo
 run locale-gen en_US.UTF-8
-run update-locale LANG=en_US.UTF-8 LANGUAGE=en_US:en LC_ALL=en_US.UTF_8
+run update-locale LANG=en_US.UTF-8 LANGUAGE=en_US:en LC_ALL=en_US.UTF-8
 run bash -ec "dpkg -i packages/*.deb"
 run apt-get -y clean all
 run bash -ec "cat /scylla_bashrc >> /etc/bash.bashrc"
 run mkdir -p /etc/supervisor.conf.d
 run mkdir -p /var/log/scylla
 run chown -R scylla:scylla /var/lib/scylla
+run sed -i -e 's/^SCYLLA_ARGS=".*"$/SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --network-stack posix"/' /etc/default/scylla-server

 run mkdir -p /opt/scylladb/supervisor
+run touch /opt/scylladb/SCYLLA-CONTAINER-FILE
 bcp dist/common/supervisor/scylla-server.sh /opt/scylladb/supervisor/scylla-server.sh
 bcp dist/common/supervisor/scylla-jmx.sh /opt/scylladb/supervisor/scylla-jmx.sh
 bcp dist/common/supervisor/scylla-node-exporter.sh /opt/scylladb/supervisor/scylla-node-exporter.sh
--- a/dist/docker/etc/supervisord.conf.d/scylla-server.conf
+++ b/dist/docker/etc/supervisord.conf.d/scylla-server.conf
@@ -1,4 +1,4 @@
-[program:scylla-server]
+[program:scylla]
 command=/opt/scylladb/supervisor/scylla-server.sh
 stdout_logfile=/dev/stdout
 stdout_logfile_maxbytes=0
--- a/dist/docker/etc/sysconfig/scylla-server
+++ b/dist/docker/etc/sysconfig/scylla-server
@@ -1,41 +0,0 @@
-# choose following mode: virtio, dpdk, posix
-NETWORK_MODE=posix
-
-# tap device name(virtio)
-TAP=tap0
-
-# bridge device name (virtio)
-BRIDGE=virbr0
-
-# ethernet device name
-IFNAME=eth0
-
-# setup NIC's and disks' interrupts, RPS, XPS, nomerges and I/O scheduler (posix)
-SET_NIC_AND_DISKS=no
-
-# ethernet device driver (dpdk)
-ETHDRV=
-
-# ethernet device PCI ID (dpdk)
-ETHPCIID=
-
-# number of hugepages
-NR_HUGEPAGES=64
-
-# user for process (must be root for dpdk)
-USER=scylla
-
-# group for process
-GROUP=scylla
-
-# scylla home dir
-SCYLLA_HOME=/var/lib/scylla
-
-# scylla config dir
-SCYLLA_CONF=/etc/scylla
-
-# scylla arguments
-SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --network-stack posix"
-
-# setup as AMI instance
-AMI=no
--- a/flat_mutation_reader.cc
+++ b/flat_mutation_reader.cc
@@ -1580,6 +1580,9 @@ bool mutation_fragment_stream_validator::operator()(dht::token t) {
 }

 bool mutation_fragment_stream_validator::operator()(mutation_fragment_v2::kind kind, position_in_partition_view pos) {
+    if (kind == mutation_fragment_v2::kind::partition_end && _current_tombstone) {
+        return false;
+    }
    if (_prev_kind == mutation_fragment_v2::kind::partition_end) {
        const bool valid = (kind == mutation_fragment_v2::kind::partition_start);
        if (valid) {
@@ -1607,7 +1610,11 @@ bool mutation_fragment_stream_validator::operator()(mutation_fragment::kind kind
 }

 bool mutation_fragment_stream_validator::operator()(const mutation_fragment_v2& mf) {
-    return (*this)(mf.mutation_fragment_kind(), mf.position());
+    const auto valid = (*this)(mf.mutation_fragment_kind(), mf.position());
+    if (valid && mf.is_range_tombstone_change()) {
+        _current_tombstone = mf.as_range_tombstone_change().tombstone();
+    }
+    return valid;
 }
 bool mutation_fragment_stream_validator::operator()(const mutation_fragment& mf) {
    return (*this)(to_mutation_fragment_kind_v2(mf.mutation_fragment_kind()), mf.position());
@@ -1646,11 +1653,17 @@ void mutation_fragment_stream_validator::reset(dht::decorated_key dk) {
    _prev_partition_key = dk;
    _prev_pos = position_in_partition::for_partition_start();
    _prev_kind = mutation_fragment_v2::kind::partition_start;
+    _current_tombstone = {};
 }

 void mutation_fragment_stream_validator::reset(const mutation_fragment_v2& mf) {
    _prev_pos = mf.position();
    _prev_kind = mf.mutation_fragment_kind();
+    if (mf.is_range_tombstone_change()) {
+        _current_tombstone = mf.as_range_tombstone_change().tombstone();
+    } else {
+        _current_tombstone = {};
+    }
 }
 void mutation_fragment_stream_validator::reset(const mutation_fragment& mf) {
    _prev_pos = mf.position();
@@ -1719,6 +1732,11 @@ bool mutation_fragment_stream_validating_filter::operator()(mutation_fragment_v2

    fmr_logger.debug("[validator {}] {}:{}", static_cast<void*>(this), kind, pos);

+    if (kind == mutation_fragment_v2::kind::partition_end && _current_tombstone) {
+        on_validation_error(fmr_logger, format("[validator {} for {}] Unexpected active tombstone at partition-end: partition key {}: tombstone {}",
+                static_cast<void*>(this), _name, _validator.previous_partition_key(), _current_tombstone));
+    }
+
    if (_validation_level >= mutation_fragment_stream_validation_level::clustering_key) {
        valid = _validator(kind, pos);
    } else {
@@ -1745,7 +1763,11 @@ bool mutation_fragment_stream_validating_filter::operator()(mutation_fragment::k
 }

 bool mutation_fragment_stream_validating_filter::operator()(const mutation_fragment_v2& mv) {
-    return (*this)(mv.mutation_fragment_kind(), mv.position());
+    auto valid = (*this)(mv.mutation_fragment_kind(), mv.position());
+    if (valid && mv.is_range_tombstone_change()) {
+        _current_tombstone = mv.as_range_tombstone_change().tombstone();
+    }
+    return valid;
 }
 bool mutation_fragment_stream_validating_filter::operator()(const mutation_fragment& mv) {
    return (*this)(to_mutation_fragment_kind_v2(mv.mutation_fragment_kind()), mv.position());
--- a/install.sh
+++ b/install.sh
@@ -143,7 +143,7 @@ export LD_LIBRARY_PATH="$prefix/libreloc"
 export UBSAN_OPTIONS="${UBSAN_OPTIONS:+$UBSAN_OPTIONS:}suppressions=$prefix/libexec/ubsan-suppressions.supp"
 exec -a "\$0" "$prefix/libexec/$bin" "\$@"
 EOF
-    chmod +x "$root/$prefix/bin/$bin"
+    chmod 755 "$root/$prefix/bin/$bin"
 }

 relocate_python3() {
@@ -156,11 +156,11 @@ relocate_python3() {
    local pythonpath="$(dirname "$pythoncmd")"

    if [ ! -x "$script" ]; then
-        cp "$script" "$install"
+        install -m755 "$script" "$install"
        return
    fi
-    mkdir -p "$relocateddir"
-    cp "$script" "$relocateddir"
+    install -d -m755 "$relocateddir"
+    install -m755 "$script" "$relocateddir"
    cat > "$install"<<EOF
 #!/usr/bin/env bash
 [[ -z "\$LD_PRELOAD" ]] || { echo "\$0: not compatible with LD_PRELOAD" >&2; exit 110; }
@@ -178,7 +178,7 @@ if [ -f "\${DEBIAN_SSL_CERT_FILE}" ]; then
 fi
 PYTHONPATH="\${d}:\${d}/libexec:\$PYTHONPATH" PATH="\${d}/../bin:\${d}/$pythonpath:\${PATH}" SSL_CERT_FILE="\${c}" exec -a "\$0" "\${d}/libexec/\${b}" "\$@"
 EOF
-    chmod +x "$install"
+    chmod 755 "$install"
 }

 install() {
@@ -392,6 +392,7 @@ install -d -m755 -d "$rprefix"/scyllatop
 cp -r tools/scyllatop/* "$rprefix"/scyllatop
 install -d -m755 -d "$rprefix"/scripts
 cp -r dist/common/scripts/* "$rprefix"/scripts
+chmod 755 "$rprefix"/scripts/*
 ln -srf "$rprefix/scyllatop/scyllatop.py" "$rprefix/bin/scyllatop"
 if $supervisor; then
    install -d -m755 "$rprefix"/supervisor
@@ -508,8 +509,13 @@ relocate_python3 "$rprefix"/scripts fix_system_distributed_tables.py
 if $supervisor; then
    install -d -m755 `supervisor_dir $retc`
    for service in scylla-server scylla-jmx scylla-node-exporter; do
+        if [ "$service" = "scylla-server" ]; then
+            program="scylla"
+        else
+            program=$service
+        fi
        cat << EOS > `supervisor_conf $retc $service`
-[program:$service]
+[program:$program]
 directory=$rprefix
 command=/bin/bash -c './supervisor/$service.sh'
 EOS
--- a/locator/azure_snitch.cc
+++ b/locator/azure_snitch.cc
@@ -34,6 +34,10 @@ azure_snitch::azure_snitch(const sstring& fname, unsigned io_cpuid) : production
 }

 future<> azure_snitch::load_config() {
+    if (this_shard_id() != io_cpu_id()) {
+        co_return;
+    }
+
    sstring region = co_await azure_api_call(REGION_NAME_QUERY_PATH);
    sstring azure_zone = co_await azure_api_call(ZONE_NAME_QUERY_PATH);

--- a/main.cc
+++ b/main.cc
@@ -367,11 +367,38 @@ static auto defer_verbose_shutdown(const char* what, Func&& func) {
        startlog.info("Shutting down {}", what);
        try {
            func();
+            startlog.info("Shutting down {} was successful", what);
        } catch (...) {
-            startlog.error("Unexpected error shutting down {}: {}", what, std::current_exception());
-            throw;
+            auto ex = std::current_exception();
+            bool do_abort = true;
+            try {
+                std::rethrow_exception(ex);
+            } catch (const std::system_error& e) {
+                // System error codes we consider "environmental",
+                // i.e. not scylla's fault, therefore there is no point in
+                // aborting and dumping core.
+                for (int i : {EIO, EACCES, ENOSPC}) {
+                    if (e.code() == std::error_code(i, std::system_category())) {
+                        do_abort = false;
+                        break;
+                    }
+                }
+            } catch (...) {
+            }
+            auto msg = fmt::format("Unexpected error shutting down {}: {}", what, ex);
+            if (do_abort) {
+                startlog.error("{}: aborting", msg);
+                abort();
+            } else {
+                startlog.error("{}: exiting, at {}", msg, current_backtrace());
+
+                // Call _exit() rather than exit() to exit immediately
+                // without calling exit handlers, avoiding
+                // boost::intrusive::detail::destructor_impl assert failure
+                // from ~segment_pool exit handler.
+                _exit(255);
+            }
        }
-        startlog.info("Shutting down {} was successful", what);
    };

    auto ret = deferred_action(std::move(vfunc));
@@ -547,6 +574,12 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl

            cfg->broadcast_to_all_shards().get();

+            // We pass this piece of config through a global as a temporary hack.
+            // See the comment at the definition of sstables::global_cache_index_pages.
+            smp::invoke_on_all([&cfg] {
+                sstables::global_cache_index_pages = cfg->cache_index_pages.operator utils::updateable_value<bool>();
+            }).get();
+
            ::sighup_handler sighup_handler(opts, *cfg);
            auto stop_sighup_handler = defer_verbose_shutdown("sighup", [&] {
                sighup_handler.stop().get();
--- a/memtable-sstable.hh
+++ b/memtable-sstable.hh
@@ -15,6 +15,7 @@
 #include "sstables/shared_sstable.hh"
 #include <seastar/core/future.hh>
 #include <seastar/core/io_priority_class.hh>
+#include "reader_permit.hh"

 class memtable;
 class flat_mutation_reader;
--- a/message/messaging_service.cc
+++ b/message/messaging_service.cc
@@ -438,6 +438,8 @@ static constexpr unsigned do_get_rpc_client_idx(messaging_verb verb) {
    // should not be blocked by any data requests.
    case messaging_verb::GROUP0_PEER_EXCHANGE:
    case messaging_verb::GROUP0_MODIFY_CONFIG:
+        // ATTN -- if moving GOSSIP_ verbs elsewhere, mind updating the tcp_nodelay
+        // setting in get_rpc_client(), which assumes gossiper verbs live in idx 0
        return 0;
    case messaging_verb::PREPARE_MESSAGE:
    case messaging_verb::PREPARE_DONE_MESSAGE:
@@ -695,7 +697,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
    }();

    auto must_tcp_nodelay = [&] {
-        if (idx == 1) {
+        if (idx == 0) {
            return true; // gossip
        }
        if (_cfg.tcp_nodelay == tcp_nodelay_what::local) {
@@ -716,10 +718,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
    }
    opts.tcp_nodelay = must_tcp_nodelay;
    opts.reuseaddr = true;
-    // We send cookies only for non-default statement tenant clients.
-    if (idx > 3) {
-        opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;
-    }
+    opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;

    auto client = must_encrypt ?
                    ::make_shared<rpc_protocol_client_wrapper>(_rpc->protocol(), std::move(opts),
--- a/multishard_mutation_query.cc
+++ b/multishard_mutation_query.cc
@@ -745,11 +745,11 @@ future<typename ResultBuilder::result_type> do_query(
        ResultBuilder&& result_builder) {
    auto ctx = seastar::make_shared<read_context>(db, s, cmd, ranges, trace_state, timeout);

-    co_await ctx->lookup_readers(timeout);
-
    std::exception_ptr ex;

    try {
+        co_await ctx->lookup_readers(timeout);
+
        auto [last_ckey, result, unconsumed_buffer, compaction_state] = co_await read_page<ResultBuilder>(ctx, s, cmd, ranges, trace_state,
                std::move(result_builder));

--- a/mutation_fragment_stream_validator.hh
+++ b/mutation_fragment_stream_validator.hh
@@ -28,6 +28,7 @@ class mutation_fragment_stream_validator {
    mutation_fragment_v2::kind _prev_kind;
    position_in_partition _prev_pos;
    dht::decorated_key _prev_partition_key;
+    tombstone _current_tombstone;
 public:
    explicit mutation_fragment_stream_validator(const schema& s);

@@ -122,6 +123,12 @@ public:
    const position_in_partition& previous_position() const {
        return _prev_pos;
    }
+    /// Get the current effective tombstone
+    ///
+    /// Not meaningful when operator()(mutation_fragment_v2) is not used.
+    tombstone current_tombstone() const {
+        return _current_tombstone;
+    }
    /// The previous valid partition key.
    ///
    /// Only valid if `operator()(const dht::decorated_key&)` or
@@ -151,6 +158,7 @@ class mutation_fragment_stream_validating_filter {
    mutation_fragment_stream_validator _validator;
    sstring _name;
    mutation_fragment_stream_validation_level _validation_level;
+    tombstone _current_tombstone;

 public:
    /// Constructor.
--- a/mutation_partition.hh
+++ b/mutation_partition.hh
@@ -826,6 +826,7 @@ public:

    void apply(tombstone deleted_at) {
        _deleted_at.apply(deleted_at);
+        maybe_shadow();
    }

    void apply(shadowable_tombstone deleted_at) {
--- a/mutation_reader.cc
+++ b/mutation_reader.cc
@@ -1581,11 +1581,7 @@ private:
    tracing::global_trace_state_ptr _trace_state;
    const mutation_reader::forwarding _fwd_mr;
    reader_concurrency_semaphore::inactive_read_handle _irh;
-    bool _drop_partition_start = false;
-    bool _drop_static_row = false;
-    // Validate the partition key of the first emitted partition, set after the
-    // reader was recreated.
-    bool _validate_partition_key = false;
+    bool _reader_recreated = false; // set if reader was recreated since last operation
    position_in_partition::tri_compare _tri_cmp;

    std::optional<dht::decorated_key> _last_pkey;
@@ -1606,10 +1602,9 @@ private:
    void adjust_partition_slice();
    flat_mutation_reader_v2 recreate_reader();
    future<flat_mutation_reader_v2> resume_or_create_reader();
-    void maybe_validate_partition_start(const flat_mutation_reader_v2::tracked_buffer& buffer);
+    void validate_partition_start(const partition_start& ps);
    void validate_position_in_partition(position_in_partition_view pos) const;
-    bool should_drop_fragment(const mutation_fragment_v2& mf);
-    future<> do_fill_buffer();
+    void examine_first_fragments(mutation_fragment_v2_opt& mf1, mutation_fragment_v2_opt& mf2, mutation_fragment_v2_opt& mf3);

 public:
    evictable_reader_v2(
@@ -1725,9 +1720,6 @@ flat_mutation_reader_v2 evictable_reader_v2::recreate_reader() {
    _range_override.reset();
    _slice_override.reset();

-    _drop_partition_start = false;
-    _drop_static_row = false;
-
    if (_last_pkey) {
        bool partition_range_is_inclusive = true;

@@ -1736,11 +1728,8 @@ flat_mutation_reader_v2 evictable_reader_v2::recreate_reader() {
            partition_range_is_inclusive = false;
            break;
        case partition_region::static_row:
-            _drop_partition_start = true;
            break;
        case partition_region::clustered:
-            _drop_partition_start = true;
-            _drop_static_row = true;
            adjust_partition_slice();
            slice = &*_slice_override;
            break;
@@ -1763,7 +1752,7 @@ flat_mutation_reader_v2 evictable_reader_v2::recreate_reader() {
        _range_override = dht::partition_range({dht::partition_range::bound(*_last_pkey, partition_range_is_inclusive)}, _pr->end());
        range = &*_range_override;

-        _validate_partition_key = true;
+        _reader_recreated = true;
    }

    return _ms.make_reader_v2(
@@ -1788,41 +1777,33 @@ future<flat_mutation_reader_v2> evictable_reader_v2::resume_or_create_reader() {
    co_return recreate_reader();
 }

-void evictable_reader_v2::maybe_validate_partition_start(const flat_mutation_reader_v2::tracked_buffer& buffer) {
-    if (!_validate_partition_key || buffer.empty()) {
-        return;
-    }
-
-    // If this is set we can assume the first fragment is a partition-start.
-    const auto& ps = buffer.front().as_partition_start();
+void evictable_reader_v2::validate_partition_start(const partition_start& ps) {
    const auto tri_cmp = dht::ring_position_comparator(*_schema);
    // If we recreated the reader after fast-forwarding it we won't have
    // _last_pkey set. In this case it is enough to check if the partition
    // is in range.
    if (_last_pkey) {
        const auto cmp_res = tri_cmp(*_last_pkey, ps.key());
-        if (_drop_partition_start) { // we expect to continue from the same partition
+        if (_next_position_in_partition.region() != partition_region::partition_start) { // we expect to continue from the same partition
            // We cannot assume the partition we stopped the read at is still alive
            // when we recreate the reader. It might have been compacted away in the
            // meanwhile, so allow for a larger partition too.
            require(
                    cmp_res <= 0,
-                    "{}(): validation failed, expected partition with key larger or equal to _last_pkey {} due to _drop_partition_start being set, but got {}",
+                    "{}(): validation failed, expected partition with key larger or equal to _last_pkey {}, but got {}",
                    __FUNCTION__,
                    *_last_pkey,
                    ps.key());
-            // Reset drop flags and next pos if we are not continuing from the same partition
+            // Reset next pos if we are not continuing from the same partition
            if (cmp_res < 0) {
                // Close previous partition, we are not going to continue it.
                push_mutation_fragment(*_schema, _permit, partition_end{});
-                _drop_partition_start = false;
-                _drop_static_row = false;
                _next_position_in_partition = position_in_partition::for_partition_start();
            }
        } else { // should be a larger partition
            require(
                    cmp_res < 0,
-                    "{}(): validation failed, expected partition with key larger than _last_pkey {} due to _drop_partition_start being unset, but got {}",
+                    "{}(): validation failed, expected partition with key larger than _last_pkey {}, but got {}",
                    __FUNCTION__,
                    *_last_pkey,
                    ps.key());
@@ -1836,8 +1817,6 @@ void evictable_reader_v2::maybe_validate_partition_start(const flat_mutation_rea
            __FUNCTION__,
            prange,
            ps.key());
-
-    _validate_partition_key = false;
 }

 void evictable_reader_v2::validate_position_in_partition(position_in_partition_view pos) const {
@@ -1860,7 +1839,12 @@ void evictable_reader_v2::validate_position_in_partition(position_in_partition_v
        const bool any_contains = std::any_of(ranges.begin(), ranges.end(), [this, &pos] (const query::clustering_range& cr) {
            // TODO: somehow avoid this copy
            auto range = position_range(cr);
-            return range.contains(*_schema, pos);
+            // We cannot use range.contains() because that treats range as a
+            // [a, b) range, meaning a range tombstone change with position
+            // after_key(b) will be considered outside of it. Such range
+            // tombstone changes can be emitted however when recreating the
+            // reader on clustering range edge.
+            return _tri_cmp(range.start(), pos) <= 0 && _tri_cmp(pos, range.end()) <= 0;
        });
        require(
                any_contains,
@@ -1871,42 +1855,40 @@ void evictable_reader_v2::validate_position_in_partition(position_in_partition_v
    }
 }

-bool evictable_reader_v2::should_drop_fragment(const mutation_fragment_v2& mf) {
-    if (_drop_partition_start && mf.is_partition_start()) {
-        _drop_partition_start = false;
-        return true;
+void evictable_reader_v2::examine_first_fragments(mutation_fragment_v2_opt& mf1, mutation_fragment_v2_opt& mf2, mutation_fragment_v2_opt& mf3) {
+    if (!mf1) {
+        return; // the reader is at EOS
    }
-    // Unlike partition-start above, a partition is not guaranteed to have a
-    // static row fragment. So reset the flag regardless of whether we could
-    // drop one or not.
-    // We are guaranteed to get here only right after dropping a partition-start,
-    // so if we are not seeing a static row here, the partition doesn't have one.
-    if (_drop_static_row) {
-         _drop_static_row = false;
-        return mf.is_static_row();
-    }
-    return false;
-}

-future<> evictable_reader_v2::do_fill_buffer() {
-    if (!_drop_partition_start && !_drop_static_row) {
-        auto fill_buf_fut = _reader->fill_buffer();
-        if (_validate_partition_key) {
-            fill_buf_fut = fill_buf_fut.then([this] {
-                maybe_validate_partition_start(_reader->buffer());
-            });
-        }
-        return fill_buf_fut;
+    // If engaged, the first fragment is always a partition-start.
+    validate_partition_start(mf1->as_partition_start());
+    if (_tri_cmp(mf1->position(), _next_position_in_partition) < 0) {
+        mf1 = {}; // drop mf1
+    }
+
+    const auto continue_same_partition = _next_position_in_partition.region() != partition_region::partition_start;
+
+    // If we have a first fragment, we are guaranteed to have a second one -- if nothing else, a partition-end.
+    if (mf2->is_end_of_partition()) {
+        return; // no further fragments, nothing to do
+    }
+
+    // We want to validate the position of the first non-dropped fragment.
+    // If mf2 is a static row and we need to drop it, this will be mf3.
+    if (mf2->is_static_row() && _tri_cmp(mf2->position(), _next_position_in_partition) < 0) {
+        mf2 = {}; // drop mf2
+    } else {
+        if (continue_same_partition) {
+            validate_position_in_partition(mf2->position());
+        }
+        return;
+    }
+
+    if (mf3->is_end_of_partition()) {
+        return; // no further fragments, nothing to do
+    } else if (continue_same_partition) {
+        validate_position_in_partition(mf3->position());
    }
-    return repeat([this] {
-        return _reader->fill_buffer().then([this] {
-            maybe_validate_partition_start(_reader->buffer());
-            while (!_reader->is_buffer_empty() && should_drop_fragment(_reader->peek_buffer())) {
-                _reader->pop_mutation_fragment();
-            }
-            return stop_iteration(_reader->is_buffer_full() || _reader->is_end_of_stream());
-        });
-    });
 }

 evictable_reader_v2::evictable_reader_v2(
@@ -1935,10 +1917,62 @@ future<> evictable_reader_v2::fill_buffer() {
        co_return;
    }
    _reader = co_await resume_or_create_reader();
-    co_await do_fill_buffer();
+
+    if (_reader_recreated) {
+        // Recreating the reader breaks snapshot isolation and creates all sorts
+        // of complications around the continuity of range tombstone changes,
+        // e.g. a range tombstone started by the previous reader object
+        // might not exist anymore with the new reader object.
+        // To avoid complications we reset the tombstone state on each reader
+        // recreation by emitting a null tombstone change, if we read at least
+        // one clustering fragment from the partition.
+        if (_next_position_in_partition.region() == partition_region::clustered
+                && _tri_cmp(_next_position_in_partition, position_in_partition::before_all_clustered_rows()) > 0) {
+            push_mutation_fragment(*_schema, _permit, range_tombstone_change{position_in_partition_view::before_key(_next_position_in_partition), {}});
+        }
+        auto mf1 = co_await (*_reader)();
+        auto mf2 = co_await (*_reader)();
+        auto mf3 = co_await (*_reader)();
+        examine_first_fragments(mf1, mf2, mf3);
+        if (mf3) {
+            _reader->unpop_mutation_fragment(std::move(*mf3));
+        }
+        if (mf2) {
+            _reader->unpop_mutation_fragment(std::move(*mf2));
+        }
+        if (mf1) {
+            _reader->unpop_mutation_fragment(std::move(*mf1));
+        }
+        _reader_recreated = false;
+    } else {
+        co_await _reader->fill_buffer();
+    }
+
    _reader->move_buffer_content_to(*this);
+
+    // Ensure that each buffer represents forward progress. Only a concern when
+    // the last fragment in the buffer is a range tombstone change. In this case
+    // ensure that:
+    // * buffer().back().position() > _next_position_in_partition;
+    // * _reader.peek()->position() > buffer().back().position();
+    if (!is_buffer_empty() && buffer().back().is_range_tombstone_change()) {
+        auto* next_mf = co_await _reader->peek();
+
+        // First make sure we've made progress w.r.t. _next_position_in_partition.
+        while (next_mf && _tri_cmp(_next_position_in_partition, buffer().back().position()) <= 0) {
+            push_mutation_fragment(_reader->pop_mutation_fragment());
+            next_mf = co_await _reader->peek();
+        }
+
+        const auto last_pos = position_in_partition(buffer().back().position());
+        while (next_mf && _tri_cmp(last_pos, next_mf->position()) == 0) {
+            push_mutation_fragment(_reader->pop_mutation_fragment());
+            next_mf = co_await _reader->peek();
+        }
+    }
+
    update_next_position();
-    _end_of_stream = _reader->is_end_of_stream() && _reader->is_buffer_empty();
+    _end_of_stream = _reader->is_end_of_stream();
    maybe_pause(std::move(*_reader));
 }

--- a/partition_snapshot_reader.hh
+++ b/partition_snapshot_reader.hh
@@ -292,14 +292,23 @@ class partition_snapshot_flat_reader : public flat_mutation_reader::impl, public
                const std::optional<position_in_partition>& last_row,
                const std::optional<position_in_partition>& last_rts,
                position_in_partition_view pos) {
-            if (!_rt_stream.empty()) {
-                return _rt_stream.get_next(std::move(pos));
-            }
            return in_alloc_section([&] () -> mutation_fragment_opt {
                maybe_refresh_state(ck_range_snapshot, last_row, last_rts);

                position_in_partition::less_compare rt_less(_query_schema);

+                // The while below moves range tombstones from partition versions
+                // into _rt_stream, just enough to produce the next range tombstone.
+                // The main goal behind moving to _rt_stream is to deoverlap range tombstones
+                // which have the same starting position. This is not in order to satisfy
+                // flat_mutation_reader stream requirements, the reader can emit range tombstones
+                // which have the same position incrementally. This is to guarantee forward
+                // progress in the case iterators get invalidated and maybe_refresh_state()
+                // above needs to restore them. It does so using last_rts, which tracks
+                // the position of the last emitted range tombstone. All range tombstones
+                // with positions <= last_rts are skipped on refresh. To make progress,
+                // we need to make sure that all range tombstones with duplicated positions
+                // are emitted before maybe_refresh_state().
                while (has_more_range_tombstones()
                        && !rt_less(pos, peek_range_tombstone().position())
                        && (_rt_stream.empty() || !rt_less(_rt_stream.peek_next().position(), peek_range_tombstone().position()))) {
--- a/partition_snapshot_row_cursor.hh
+++ b/partition_snapshot_row_cursor.hh
@@ -444,7 +444,7 @@ public:
    // When throws, the cursor is invalidated and its position is not changed.
    bool advance_to(position_in_partition_view lower_bound) {
        maybe_advance_to(lower_bound);
-        return no_clustering_row_between(_schema, lower_bound, position());
+        return no_clustering_row_between_weak(_schema, lower_bound, position());
    }

    // Call only when valid.
--- a/position_in_partition.hh
+++ b/position_in_partition.hh
@@ -567,6 +567,20 @@ bool no_clustering_row_between(const schema& s, position_in_partition_view a, po
    }
 }

+// Returns true if and only if there can't be any clustering_row with position >= a and < b.
+// It is assumed that a <= b.
+inline
+bool no_clustering_row_between_weak(const schema& s, position_in_partition_view a, position_in_partition_view b) {
+    clustering_key_prefix::equality eq(s);
+    if (a.has_key() && b.has_key()) {
+        return eq(a.key(), b.key())
+               && (a.get_bound_weight() == bound_weight::after_all_prefixed
+                   || b.get_bound_weight() != bound_weight::after_all_prefixed);
+    } else {
+        return !a.has_key() && !b.has_key();
+    }
+}
+
 // Includes all position_in_partition objects "p" for which: start <= p < end
 // And only those.
 class position_range {
--- a/protocol_server.hh
+++ b/protocol_server.hh
@@ -10,6 +10,7 @@

 #include "seastarx.hh"
 #include <seastar/core/future.hh>
+#include <seastar/net/socket_defs.hh>
 #include <vector>

 // Abstraction for a server serving some kind of user-facing protocol.
--- a/querier.cc
+++ b/querier.cc
@@ -414,25 +414,6 @@ future<bool> querier_cache::evict_one() noexcept {
    co_return false;
 }

-future<> querier_cache::evict_all_for_table(const utils::UUID& schema_id) noexcept {
-    for (auto ip : {&_data_querier_index, &_mutation_querier_index, &_shard_mutation_querier_index}) {
-        auto& idx = *ip;
-        for (auto it = idx.begin(); it != idx.end();) {
-            if (it->second->schema().id() == schema_id) {
-                auto reader_opt = it->second->permit().semaphore().unregister_inactive_read(querier_utils::get_inactive_read_handle(*it->second));
-                it = idx.erase(it);
-                --_stats.population;
-                if (reader_opt) {
-                    co_await reader_opt->close();
-                }
-            } else {
-                ++it;
-            }
-        }
-    }
-    co_return;
-}
-
 future<> querier_cache::stop() noexcept {
    co_await _closing_gate.close();

--- a/querier.hh
+++ b/querier.hh
@@ -476,11 +476,6 @@ public:
    /// is empty).
    future<bool> evict_one() noexcept;

-    /// Evict all queriers that belong to a table.
-    ///
-    /// Should be used when dropping a table.
-    future<> evict_all_for_table(const utils::UUID& schema_id) noexcept;
-
    /// Close all queriers and wait on background work.
    ///
    /// Should be used before destroying the querier_cache.
--- a/range_tombstone_list.cc
+++ b/range_tombstone_list.cc
@@ -96,7 +96,7 @@ void range_tombstone_list::insert_from(const schema& s,
        if (cmp(end, it->position()) < 0) {
            // not overlapping
            if (it->tombstone().tomb == tomb && cmp(end, it->position()) == 0) {
-                rev.update(it, {std::move(start), std::move(start), tomb});
+                rev.update(it, {std::move(start), std::move(end), tomb});
            } else {
                auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), tomb);
                rev.insert(it, *rt);
--- a/reader_concurrency_semaphore.cc
+++ b/reader_concurrency_semaphore.cc
@@ -743,6 +743,25 @@ void reader_concurrency_semaphore::clear_inactive_reads() {
    }
 }

+future<> reader_concurrency_semaphore::evict_inactive_reads_for_table(utils::UUID id) noexcept {
+    inactive_reads_type evicted_readers;
+    auto it = _inactive_reads.begin();
+    while (it != _inactive_reads.end()) {
+        auto& ir = *it;
+        ++it;
+        if (ir.reader.schema()->id() == id) {
+            do_detach_inactive_reader(ir, evict_reason::manual);
+            ir.ttl_timer.cancel();
+            ir.unlink();
+            evicted_readers.push_back(ir);
+        }
+    }
+    while (!evicted_readers.empty()) {
+        std::unique_ptr<inactive_read> irp(&evicted_readers.front());
+        co_await irp->reader.close();
+    }
+}
+
 std::runtime_error reader_concurrency_semaphore::stopped_exception() {
    return std::runtime_error(format("{} was stopped", _name));
 }
@@ -765,11 +784,9 @@ future<> reader_concurrency_semaphore::stop() noexcept {
    co_return;
 }

-flat_mutation_reader_v2 reader_concurrency_semaphore::detach_inactive_reader(inactive_read& ir, evict_reason reason) noexcept {
-    auto reader = std::move(ir.reader);
+void reader_concurrency_semaphore::do_detach_inactive_reader(inactive_read& ir, evict_reason reason) noexcept {
    ir.detach();
-    reader.permit()._impl->on_evicted();
-    std::unique_ptr<inactive_read> irp(&ir);
+    ir.reader.permit()._impl->on_evicted();
    try {
        if (ir.notify_handler) {
            ir.notify_handler(reason);
@@ -788,7 +805,12 @@ flat_mutation_reader_v2 reader_concurrency_semaphore::detach_inactive_reader(ina
            break;
    }
    --_stats.inactive_reads;
-    return reader;
+}
+
+flat_mutation_reader_v2 reader_concurrency_semaphore::detach_inactive_reader(inactive_read& ir, evict_reason reason) noexcept {
+    std::unique_ptr<inactive_read> irp(&ir);
+    do_detach_inactive_reader(ir, reason);
+    return std::move(irp->reader);
 }

 void reader_concurrency_semaphore::evict(inactive_read& ir, evict_reason reason) noexcept {
--- a/reader_concurrency_semaphore.hh
+++ b/reader_concurrency_semaphore.hh
@@ -186,6 +186,7 @@ private:
    std::optional<future<>> _execution_loop_future;

 private:
+    void do_detach_inactive_reader(inactive_read&, evict_reason reason) noexcept;
    [[nodiscard]] flat_mutation_reader_v2 detach_inactive_reader(inactive_read&, evict_reason reason) noexcept;
    void evict(inactive_read&, evict_reason reason) noexcept;

@@ -301,6 +302,9 @@ public:

    /// Clear all inactive reads.
    void clear_inactive_reads();
+
+    /// Evict all inactive reads that belong to the table designated by the id.
+    future<> evict_inactive_reads_for_table(utils::UUID id) noexcept;
 private:
    // The following two functions are extension points for
    // future inheriting classes that needs to run some stop
--- a/repair/repair.cc
+++ b/repair/repair.cc
@@ -25,6 +25,7 @@
 #include "utils/bit_cast.hh"
 #include "service/migration_manager.hh"
 #include "partition_range_compat.hh"
+#include "gms/feature_service.hh"

 #include <boost/algorithm/string/predicate.hpp>
 #include <boost/algorithm/string/split.hpp>
@@ -41,6 +42,7 @@
 #include <seastar/core/sleep.hh>

 #include <cfloat>
+#include <algorithm>

 #include "idl/partition_checksum.dist.hh"

@@ -118,6 +120,13 @@ std::ostream& operator<<(std::ostream& out, row_level_diff_detect_algorithm algo
    return out << "unknown";
 }

+static size_t get_nr_tables(const replica::database& db, const sstring& keyspace) {
+    // Count the mapping entries whose key's first component equals `keyspace`.
+    const auto& mapping = db.get_column_families_mapping();
+    auto in_keyspace = [&keyspace] (const auto& entry) { return entry.first.first == keyspace; };
+    return std::count_if(mapping.begin(), mapping.end(), in_keyspace);
+}
+
 static std::vector<sstring> list_column_families(const replica::database& db, const sstring& keyspace) {
    std::vector<sstring> ret;
    for (auto &&e : db.get_column_families_mapping()) {
@@ -443,7 +452,7 @@ float tracker::report_progress(streaming::stream_reason reason) {
    for (auto& x : _repairs) {
        auto& ri = x.second;
        if (ri->reason == reason) {
-            nr_ranges_total += ri->nr_ranges_total;
+            nr_ranges_total += ri->ranges_size();
            nr_ranges_finished += ri->nr_ranges_finished;
        }
    }
@@ -555,8 +564,8 @@ void repair_info::check_failed_ranges() {
    rlogger.info("repair id {} on shard {} stats: repair_reason={}, keyspace={}, tables={}, ranges_nr={}, {}",
        id, shard, reason, keyspace, table_names(), ranges.size(), _stats.get_stats());
    if (nr_failed_ranges) {
-        rlogger.warn("repair id {} on shard {} failed - {} out of {} ranges failed", id, shard, nr_failed_ranges, ranges.size());
-        throw std::runtime_error(format("repair id {} on shard {} failed to repair {} out of {} ranges", id, shard, nr_failed_ranges, ranges.size()));
+        rlogger.warn("repair id {} on shard {} failed - {} out of {} ranges failed", id, shard, nr_failed_ranges, ranges_size());
+        throw std::runtime_error(format("repair id {} on shard {} failed to repair {} out of {} ranges", id, shard, nr_failed_ranges, ranges_size()));
    } else {
        if (dropped_tables.size()) {
            rlogger.warn("repair id {} on shard {} completed successfully, keyspace={}, ignoring dropped tables={}", id, shard, keyspace, dropped_tables);
@@ -582,14 +591,18 @@ repair_neighbors repair_info::get_repair_neighbors(const dht::token_range& range
        neighbors[range];
 }

+size_t repair_info::ranges_size() {
+    return ranges.size() * table_ids.size(); // number of (range, table) pairs: each range is counted once per table
+}
+
 // Repair a single local range, multiple column families.
 // Comparable to RepairSession in Origin
-future<> repair_info::repair_range(const dht::token_range& range) {
+future<> repair_info::repair_range(const dht::token_range& range, utils::UUID table_id) {
    check_in_shutdown();
    check_in_abort();
    ranges_index++;
    repair_neighbors neighbors = get_repair_neighbors(range);
-    return do_with(std::move(neighbors.all), std::move(neighbors.mandatory), [this, range] (auto& neighbors, auto& mandatory_neighbors) {
+    return do_with(std::move(neighbors.all), std::move(neighbors.mandatory), [this, range, table_id] (auto& neighbors, auto& mandatory_neighbors) {
      auto live_neighbors = boost::copy_range<std::vector<gms::inet_address>>(neighbors |
                    boost::adaptors::filtered([this] (const gms::inet_address& node) { return gossiper.is_alive(node); }));
      for (auto& node : mandatory_neighbors) {
@@ -598,7 +611,7 @@ future<> repair_info::repair_range(const dht::token_range& range) {
                nr_failed_ranges++;
                auto status = format("failed: mandatory neighbor={} is not alive", node);
                rlogger.error("Repair {} out of {} ranges, id={}, shard={}, keyspace={}, table={}, range={}, peers={}, live_peers={}, status={}",
-                    ranges_index, ranges.size(), id, shard, keyspace, table_names(), range, neighbors, live_neighbors, status);
+                    ranges_index, ranges_size(), id, shard, keyspace, table_names(), range, neighbors, live_neighbors, status);
                abort();
                return make_exception_future<>(std::runtime_error(format("Repair mandatory neighbor={} is not alive, keyspace={}, mandatory_neighbors={}",
                    node, keyspace, mandatory_neighbors)));
@@ -608,7 +621,7 @@ future<> repair_info::repair_range(const dht::token_range& range) {
            nr_failed_ranges++;
            auto status = live_neighbors.empty() ? "skipped" : "partial";
            rlogger.warn("Repair {} out of {} ranges, id={}, shard={}, keyspace={}, table={}, range={}, peers={}, live_peers={}, status={}",
-            ranges_index, ranges.size(), id, shard, keyspace, table_names(), range, neighbors, live_neighbors, status);
+            ranges_index, ranges_size(), id, shard, keyspace, table_names(), range, neighbors, live_neighbors, status);
            if (live_neighbors.empty()) {
                return make_ready_future<>();
            }
@@ -617,13 +630,12 @@ future<> repair_info::repair_range(const dht::token_range& range) {
      if (neighbors.empty()) {
            auto status = "skipped_no_followers";
            rlogger.warn("Repair {} out of {} ranges, id={}, shard={}, keyspace={}, table={}, range={}, peers={}, live_peers={}, status={}",
-            ranges_index, ranges.size(), id, shard, keyspace, table_names(), range, neighbors, live_neighbors, status);
+            ranges_index, ranges_size(), id, shard, keyspace, table_names(), range, neighbors, live_neighbors, status);
            return make_ready_future<>();
      }
      rlogger.info("Repair {} out of {} ranges, id={}, shard={}, keyspace={}, table={}, range={}, peers={}, live_peers={}",
-            ranges_index, ranges.size(), id, shard, keyspace, table_names(), range, neighbors, live_neighbors);
-      return mm.sync_schema(db.local(), neighbors).then([this, &neighbors, range] {
-        return do_for_each(table_ids.begin(), table_ids.end(), [this, &neighbors, range] (utils::UUID table_id) {
+            ranges_index, ranges_size(), id, shard, keyspace, table_names(), range, neighbors, live_neighbors);
+      return mm.sync_schema(db.local(), neighbors).then([this, &neighbors, range, table_id] {
            sstring cf;
            try {
                cf = db.local().find_column_family(table_id).schema()->cf_name();
@@ -641,7 +653,6 @@ future<> repair_info::repair_range(const dht::token_range& range) {
                nr_failed_ranges++;
                return make_exception_future<>(std::move(ep));
            });
-        });
      });
    });
 }
@@ -914,27 +925,55 @@ private:


 static future<> do_repair_ranges(lw_shared_ptr<repair_info> ri) {
-    // repair all the ranges in limited parallelism
-    return parallel_for_each(ri->ranges, [ri] (auto&& range) {
-        return with_semaphore(ri->rs.repair_tracker().range_parallelism_semaphore(), 1, [ri, &range] {
-            return ri->repair_range(range).then([ri] {
-                if (ri->reason == streaming::stream_reason::bootstrap) {
-                    ri->rs.get_metrics().bootstrap_finished_ranges++;
-                } else if (ri->reason == streaming::stream_reason::replace) {
-                    ri->rs.get_metrics().replace_finished_ranges++;
-                } else if (ri->reason == streaming::stream_reason::rebuild) {
-                    ri->rs.get_metrics().rebuild_finished_ranges++;
-                } else if (ri->reason == streaming::stream_reason::decommission) {
-                    ri->rs.get_metrics().decommission_finished_ranges++;
-                } else if (ri->reason == streaming::stream_reason::removenode) {
-                    ri->rs.get_metrics().removenode_finished_ranges++;
-                } else if (ri->reason == streaming::stream_reason::repair) {
-                    ri->rs.get_metrics().repair_finished_ranges_sum++;
-                    ri->nr_ranges_finished++;
-                }
+    // Repair the tables one after another; table_names() and table_ids are indexed in lockstep, so they must match in size.
+    assert(ri->table_names().size() == ri->table_ids.size());
+    for (size_t idx = 0; idx < ri->table_ids.size(); idx++) {
+        auto table_id = ri->table_ids[idx];
+        auto table_name = ri->table_names()[idx];
+        rlogger.info("repair[{}]: Started to repair {} out of {} tables in keyspace={}, table={}, table_id={}, repair_reason={}",
+                ri->id.uuid, idx + 1, ri->table_ids.size(), ri->keyspace, table_name, table_id, ri->reason);
+        // Repair all the ranges of this table in limited parallelism: each range takes one unit of the tracker's range semaphore.
+        co_await parallel_for_each(ri->ranges, [ri, table_id] (auto&& range) {
+            return with_semaphore(ri->rs.repair_tracker().range_parallelism_semaphore(), 1, [ri, &range, table_id] {
+                return ri->repair_range(range, table_id).then([ri] {
+                    if (ri->reason == streaming::stream_reason::bootstrap) {
+                        ri->rs.get_metrics().bootstrap_finished_ranges++;
+                    } else if (ri->reason == streaming::stream_reason::replace) {
+                        ri->rs.get_metrics().replace_finished_ranges++;
+                    } else if (ri->reason == streaming::stream_reason::rebuild) {
+                        ri->rs.get_metrics().rebuild_finished_ranges++;
+                    } else if (ri->reason == streaming::stream_reason::decommission) {
+                        ri->rs.get_metrics().decommission_finished_ranges++;
+                    } else if (ri->reason == streaming::stream_reason::removenode) {
+                        ri->rs.get_metrics().removenode_finished_ranges++;
+                    } else if (ri->reason == streaming::stream_reason::repair) {
+                        ri->rs.get_metrics().repair_finished_ranges_sum++;
+                        ri->nr_ranges_finished++;
+                    }
+                    rlogger.debug("repair[{}]: node ops progress bootstrap={}, replace={}, rebuild={}, decommission={}, removenode={}, repair={}",
+                        ri->id.uuid,
+                        ri->rs.get_metrics().bootstrap_finished_percentage(),
+                        ri->rs.get_metrics().replace_finished_percentage(),
+                        ri->rs.get_metrics().rebuild_finished_percentage(),
+                        ri->rs.get_metrics().decommission_finished_percentage(),
+                        ri->rs.get_metrics().removenode_finished_percentage(),
+                        ri->rs.get_metrics().repair_finished_percentage());
+                });
             });
         });
-    });
+
+        if (ri->reason != streaming::stream_reason::repair) { // every reason except a plain repair
+            try {
+                auto& table = ri->db.local().find_column_family(table_id);
+                rlogger.debug("repair[{}]: Trigger off-strategy compaction for keyspace={}, table={}",
+                    ri->id.uuid, table.schema()->ks_name(), table.schema()->cf_name());
+                table.trigger_offstrategy_compaction();
+            } catch (replica::no_such_column_family&) {
+                // Ignore dropped table
+            }
+        }
+    }
+    co_return;
 }

 // repair_ranges repairs a list of token ranges, each assumed to be a token
@@ -1060,33 +1099,48 @@ int repair_service::do_repair_start(sstring keyspace, std::unordered_map<sstring
            cfs = std::move(cfs), ranges = std::move(ranges), options = std::move(options), ignore_nodes = std::move(ignore_nodes)] () mutable {
        auto uuid = id.uuid;

-        auto waiting_nodes = db.local().get_token_metadata().get_all_endpoints();
-        std::erase_if(waiting_nodes, [&] (const auto& addr) {
-            return ignore_nodes.contains(addr);
-        });
-        auto participants = get_hosts_participating_in_repair(db.local(), keyspace, ranges, options.data_centers, options.hosts, ignore_nodes).get();
-        auto hints_timeout = std::chrono::seconds(300);
-        auto batchlog_timeout = std::chrono::seconds(300);
-        repair_flush_hints_batchlog_request req{id.uuid, participants, hints_timeout, batchlog_timeout};
+        bool needs_flush_before_repair = false;
+        if (db.local().features().cluster_supports_tombstone_gc_options()) {
+            for (auto& table: cfs) {
+                auto s = db.local().find_column_family(keyspace, table).schema();
+                const auto& options = s->tombstone_gc_options();
+                if (options.mode() == tombstone_gc_mode::repair) {
+                    needs_flush_before_repair = true;
+                }
+            }
+        }

        bool hints_batchlog_flushed = false;
-        try {
-            parallel_for_each(waiting_nodes, [this, uuid, &req, &participants] (gms::inet_address node) -> future<> {
-                rlogger.info("repair[{}]: Sending repair_flush_hints_batchlog to node={}, participants={}, started",
-                        uuid, node, participants);
-                try {
-                    auto& ms = get_messaging();
-                    auto resp = co_await ser::partition_checksum_rpc_verbs::send_repair_flush_hints_batchlog(&ms, netw::msg_addr(node), req);
-                } catch (...) {
-                    rlogger.warn("repair[{}]: Sending repair_flush_hints_batchlog to node={}, participants={}, failed: {}",
-                            uuid, node, participants, std::current_exception());
-                    throw;
-                }
-            }).get();
-            hints_batchlog_flushed = true;
-        } catch (...) {
-            rlogger.warn("repair[{}]: Sending repair_flush_hints_batchlog to participants={} failed, continue to run repair",
-                    uuid, participants);
+        auto participants = get_hosts_participating_in_repair(db.local(), keyspace, ranges, options.data_centers, options.hosts, ignore_nodes).get();
+        if (needs_flush_before_repair) {
+            auto waiting_nodes = db.local().get_token_metadata().get_all_endpoints();
+            std::erase_if(waiting_nodes, [&] (const auto& addr) {
+                return ignore_nodes.contains(addr);
+            });
+            auto hints_timeout = std::chrono::seconds(300);
+            auto batchlog_timeout = std::chrono::seconds(300);
+            repair_flush_hints_batchlog_request req{id.uuid, participants, hints_timeout, batchlog_timeout};
+
+            try {
+                parallel_for_each(waiting_nodes, [this, uuid, &req, &participants] (gms::inet_address node) -> future<> {
+                    rlogger.info("repair[{}]: Sending repair_flush_hints_batchlog to node={}, participants={}, started",
+                            uuid, node, participants);
+                    try {
+                        auto& ms = get_messaging();
+                        auto resp = co_await ser::partition_checksum_rpc_verbs::send_repair_flush_hints_batchlog(&ms, netw::msg_addr(node), req);
+                    } catch (...) {
+                        rlogger.warn("repair[{}]: Sending repair_flush_hints_batchlog to node={}, participants={}, failed: {}",
+                                uuid, node, participants, std::current_exception());
+                        throw;
+                    }
+                }).get();
+                hints_batchlog_flushed = true;
+            } catch (...) {
+                rlogger.warn("repair[{}]: Sending repair_flush_hints_batchlog to participants={} failed, continue to run repair",
+                        uuid, participants);
+            }
+        } else {
+            rlogger.info("repair[{}]: Skipped sending repair_flush_hints_batchlog to nodes={}", uuid, participants);
        }

        std::vector<future<>> repair_results;
@@ -1288,7 +1342,8 @@ future<> repair_service::bootstrap_with_repair(locator::token_metadata_ptr tmptr
            auto& strat = ks.get_replication_strategy();
            dht::token_range_vector desired_ranges = strat.get_pending_address_ranges(tmptr, tokens, myip).get0();
            seastar::thread::maybe_yield();
-            nr_ranges_total += desired_ranges.size();
+            auto nr_tables = get_nr_tables(db.local(), keyspace_name);
+            nr_ranges_total += desired_ranges.size() * nr_tables;
        }
        container().invoke_on_all([nr_ranges_total] (repair_service& rs) {
            rs.get_metrics().bootstrap_finished_ranges = 0;
@@ -1320,7 +1375,8 @@ future<> repair_service::bootstrap_with_repair(locator::token_metadata_ptr tmptr
            //Collects the source that will have its range moved to the new node
            std::unordered_map<dht::token_range, repair_neighbors> range_sources;

-            rlogger.info("bootstrap_with_repair: started with keyspace={}, nr_ranges={}", keyspace_name, desired_ranges.size());
+            auto nr_tables = get_nr_tables(db.local(), keyspace_name);
+            rlogger.info("bootstrap_with_repair: started with keyspace={}, nr_ranges={}", keyspace_name, desired_ranges.size() * nr_tables);
            for (auto& desired_range : desired_ranges) {
                for (auto& x : range_addresses) {
                    const range<dht::token>& src_range = x.first;
@@ -1461,7 +1517,8 @@ future<> repair_service::do_decommission_removenode_with_repair(locator::token_m
            }
            auto& ks = db.local().find_keyspace(keyspace_name);
            dht::token_range_vector ranges = ks.get_effective_replication_map()->get_ranges(leaving_node);
-            nr_ranges_total += ranges.size();
+            auto nr_tables = get_nr_tables(db.local(), keyspace_name);
+            nr_ranges_total += ranges.size() * nr_tables;
        }
        if (reason == streaming::stream_reason::decommission) {
            container().invoke_on_all([nr_ranges_total] (repair_service& rs) {
@@ -1485,8 +1542,9 @@ future<> repair_service::do_decommission_removenode_with_repair(locator::token_m
            auto erm = ks.get_effective_replication_map();
            // First get all ranges the leaving node is responsible for
            dht::token_range_vector ranges = erm->get_ranges(leaving_node);
-            rlogger.info("{}: started with keyspace={}, leaving_node={}, nr_ranges={}", op, keyspace_name, leaving_node, ranges.size());
-            size_t nr_ranges_total = ranges.size();
+            auto nr_tables = get_nr_tables(db.local(), keyspace_name);
+            rlogger.info("{}: started with keyspace={}, leaving_node={}, nr_ranges={}", op, keyspace_name, leaving_node, ranges.size() * nr_tables);
+            size_t nr_ranges_total = ranges.size() * nr_tables;
            size_t nr_ranges_skipped = 0;
            std::unordered_map<dht::token_range, inet_address_vector_replica_set> current_replica_endpoints;
            // Find (for each range) all nodes that store replicas for these ranges as well
@@ -1677,7 +1735,8 @@ future<> repair_service::do_rebuild_replace_with_repair(locator::token_metadata_
            auto& strat = ks.get_replication_strategy();
            // Okay to yield since tm is immutable
            dht::token_range_vector ranges = strat.get_ranges(myip, tmptr).get0();
-            nr_ranges_total += ranges.size();
+            auto nr_tables = get_nr_tables(db.local(), keyspace_name);
+            nr_ranges_total += ranges.size() * nr_tables;

        }
        if (reason == streaming::stream_reason::rebuild) {
@@ -1702,7 +1761,8 @@ future<> repair_service::do_rebuild_replace_with_repair(locator::token_metadata_
            auto& strat = ks.get_replication_strategy();
            dht::token_range_vector ranges = strat.get_ranges(myip, tmptr).get0();
            std::unordered_map<dht::token_range, repair_neighbors> range_sources;
-            rlogger.info("{}: started with keyspace={}, source_dc={}, nr_ranges={}, ignore_nodes={}", op, keyspace_name, source_dc, ranges.size(), ignore_nodes);
+            auto nr_tables = get_nr_tables(db.local(), keyspace_name);
+            rlogger.info("{}: started with keyspace={}, source_dc={}, nr_ranges={}, ignore_nodes={}", op, keyspace_name, source_dc, ranges.size() * nr_tables, ignore_nodes);
            for (auto it = ranges.begin(); it != ranges.end();) {
                auto& r = *it;
                seastar::thread::maybe_yield();
@@ -1730,12 +1790,12 @@ future<> repair_service::do_rebuild_replace_with_repair(locator::token_metadata_
                }
            }
            if (reason == streaming::stream_reason::rebuild) {
-                container().invoke_on_all([nr_ranges_skipped] (repair_service& rs) {
-                    rs.get_metrics().rebuild_finished_ranges += nr_ranges_skipped;
+                container().invoke_on_all([nr_ranges_skipped, nr_tables] (repair_service& rs) {
+                    rs.get_metrics().rebuild_finished_ranges += nr_ranges_skipped * nr_tables;
                }).get();
            } else if (reason == streaming::stream_reason::replace) {
-                container().invoke_on_all([nr_ranges_skipped] (repair_service& rs) {
-                    rs.get_metrics().replace_finished_ranges += nr_ranges_skipped;
+                container().invoke_on_all([nr_ranges_skipped, nr_tables] (repair_service& rs) {
+                    rs.get_metrics().replace_finished_ranges += nr_ranges_skipped * nr_tables;
                }).get();
            }
            auto nr_ranges = ranges.size();
--- a/repair/repair.hh
+++ b/repair/repair.hh
@@ -200,7 +200,9 @@ public:
        return _hints_batchlog_flushed;
    }

-    future<> repair_range(const dht::token_range& range);
+    future<> repair_range(const dht::token_range& range, utils::UUID table_id);
+
+    size_t ranges_size();
 };

 // The repair_tracker tracks ongoing repair operations and their progress.
--- a/repair/row_level.hh
+++ b/repair/row_level.hh
@@ -67,6 +67,7 @@ public:
    uint64_t repair_finished_ranges_sum{0};
 private:
    seastar::metrics::metric_groups _metrics;
+public:
    float bootstrap_finished_percentage();
    float replace_finished_percentage();
    float rebuild_finished_percentage();
--- a/replica/database.cc
+++ b/replica/database.cc
@@ -910,10 +910,9 @@ bool database::update_column_family(schema_ptr new_schema) {
    return columns_changed;
 }

-future<> database::remove(const column_family& cf) noexcept {
+void database::remove(const table& cf) noexcept {
    auto s = cf.schema();
    auto& ks = find_keyspace(s->ks_name());
-    co_await _querier_cache.evict_all_for_table(s->id());
    _column_families.erase(s->id());
    ks.metadata()->remove_column_family(s);
    _ks_cf_to_uuid.erase(std::make_pair(s->ks_name(), s->cf_name()));
@@ -937,13 +936,22 @@ future<> database::drop_column_family(const sstring& ks_name, const sstring& cf_
        on_internal_error(dblog, fmt::format("drop_column_family {}.{}: UUID={} not found", ks_name, cf_name, uuid));
    }
    dblog.debug("Dropping {}.{}", ks_name, cf_name);
-    co_await remove(*cf);
+    remove(*cf);
    cf->clear_views();
-    co_return co_await cf->await_pending_ops().then([this, &ks, cf, tsf = std::move(tsf), snapshot] {
-        return truncate(ks, *cf, std::move(tsf), snapshot).finally([this, cf] {
-            return cf->stop();
-        });
-    }).finally([cf] {});
+    co_await cf->await_pending_ops();
+    for (auto* sem : {&_read_concurrency_sem, &_streaming_concurrency_sem, &_compaction_concurrency_sem, &_system_read_concurrency_sem}) {
+        co_await sem->evict_inactive_reads_for_table(uuid);
+    }
+    std::exception_ptr ex;
+    try {
+        co_await truncate(ks, *cf, std::move(tsf), snapshot);
+    } catch (...) {
+        ex = std::current_exception();
+    }
+    co_await cf->stop();
+    if (ex) {
+        std::rethrow_exception(std::move(ex));
+    }
 }

 const utils::UUID& database::find_uuid(std::string_view ks, std::string_view cf) const {
@@ -2054,6 +2062,53 @@ future<> database::flush(const sstring& ksname, const sstring& cfname) {
    return cf.flush();
 }

+future<> database::flush_on_all(utils::UUID id) {
+    // Look the table up by id on every shard's database instance and flush it there.
+    auto flush_local = [id] (replica::database& shard_db) { return shard_db.find_column_family(id).flush(); };
+    return container().invoke_on_all(std::move(flush_local));
+}
+
+future<> database::flush_on_all(std::string_view ks_name, std::string_view table_name) {
+    return flush_on_all(find_uuid(ks_name, table_name)); // resolve the (keyspace, table) name to an id, then flush by id
+}
+
+future<> database::flush_on_all(std::string_view ks_name, std::vector<sstring> table_names) {
+    return parallel_for_each(table_names, [this, ks_name] (const auto& table_name) { // one flush per listed table
+        return flush_on_all(ks_name, table_name); // delegates to the single-table overload
+    });
+}
+
+future<> database::flush_on_all(std::string_view ks_name) {
+    return parallel_for_each(find_keyspace(ks_name).metadata()->cf_meta_data(), [this] (auto& pair) { // every table in the keyspace metadata
+        return flush_on_all(pair.second->id()); // pair.second is the table's schema; flush by its id
+    });
+}
+
+future<> database::snapshot_on_all(std::string_view ks_name, std::vector<sstring> table_names, sstring tag, bool skip_flush) {
+    // Snapshot each listed table under `tag` on all shards; ks_name is a view used after suspension, so it must stay valid until the future resolves.
+    co_await parallel_for_each(table_names, [this, ks_name, tag = std::move(tag), skip_flush] (const auto& table_name) -> future<> {
+        if (!skip_flush) {
+            co_await flush_on_all(ks_name, table_name);
+        }
+        co_await container().invoke_on_all([ks_name, &table_name, tag] (replica::database& db) { // table_name lives in this coroutine's table_names
+            return db.find_column_family(ks_name, table_name).snapshot(db, tag);
+        });
+    });
+}
+
+future<> database::snapshot_on_all(std::string_view ks_name, sstring tag, bool skip_flush) {
+    // Snapshot every table of the keyspace under `tag` on all shards, flushing each table first unless skip_flush is set.
+    co_await parallel_for_each(find_keyspace(ks_name).metadata()->cf_meta_data(), [this, tag = std::move(tag), skip_flush] (const auto& pair) -> future<> {
+        const auto id = pair.second->id(); // copy the id before suspending; other shards get the id, not this shard's schema pointer
+        if (!skip_flush) {
+            co_await flush_on_all(id);
+        }
+        co_await container().invoke_on_all([id, tag] (replica::database& db) {
+            return db.find_column_family(id).snapshot(db, tag);
+        });
+    });
+}
+
 future<> database::truncate(sstring ksname, sstring cfname, timestamp_func tsf) {
    auto& ks = find_keyspace(ksname);
    auto& cf = find_column_family(ksname, cfname);
@@ -2062,80 +2117,77 @@ future<> database::truncate(sstring ksname, sstring cfname, timestamp_func tsf)

 future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_func tsf, bool with_snapshot) {
    dblog.debug("Truncating {}.{}", cf.schema()->ks_name(), cf.schema()->cf_name());
-    return with_gate(cf.async_gate(), [this, &ks, &cf, tsf = std::move(tsf), with_snapshot] () mutable -> future<> {
-        const auto auto_snapshot = with_snapshot && get_config().auto_snapshot();
-        const auto should_flush = auto_snapshot;
+    auto holder = cf.async_gate().hold();

-        // Force mutations coming in to re-acquire higher rp:s
-        // This creates a "soft" ordering, in that we will guarantee that
-        // any sstable written _after_ we issue the flush below will
-        // only have higher rp:s than we will get from the discard_sstable
-        // call.
-        auto low_mark = cf.set_low_replay_position_mark();
+    const auto auto_snapshot = with_snapshot && get_config().auto_snapshot();
+    const auto should_flush = auto_snapshot;

-        const auto uuid = cf.schema()->id();
+    // Force mutations coming in to re-acquire higher rp:s
+    // This creates a "soft" ordering, in that we will guarantee that
+    // any sstable written _after_ we issue the flush below will
+    // only have higher rp:s than we will get from the discard_sstable
+    // call.
+    auto low_mark = cf.set_low_replay_position_mark();

-        return _compaction_manager->run_with_compaction_disabled(&cf, [this, &cf, should_flush, auto_snapshot, tsf = std::move(tsf), low_mark]() mutable {
-            future<> f = make_ready_future<>();
-            bool did_flush = false;
-            if (should_flush && cf.can_flush()) {
-                // TODO:
-                // this is not really a guarantee at all that we've actually
-                // gotten all things to disk. Again, need queue-ish or something.
-                f = cf.flush();
-                did_flush = true;
-            } else {
-                f = cf.clear();
-            }
-            return f.then([this, &cf, auto_snapshot, tsf = std::move(tsf), low_mark, should_flush, did_flush] {
-                dblog.debug("Discarding sstable data for truncated CF + indexes");
-                // TODO: notify truncation
+    const auto uuid = cf.schema()->id();

-                return tsf().then([this, &cf, auto_snapshot, low_mark, should_flush, did_flush](db_clock::time_point truncated_at) {
-                    future<> f = make_ready_future<>();
-                    if (auto_snapshot) {
-                        auto name = format("{:d}-{}", truncated_at.time_since_epoch().count(), cf.schema()->cf_name());
-                        f = cf.snapshot(*this, name);
-                    }
-                    return f.then([this, &cf, truncated_at, low_mark, should_flush, did_flush] {
-                        return cf.discard_sstables(truncated_at).then([this, &cf, truncated_at, low_mark, should_flush, did_flush](db::replay_position rp) {
-                            // TODO: indexes.
-                            // Note: since discard_sstables was changed to only count tables owned by this shard,
-                            // we can get zero rp back. Changed assert, and ensure we save at least low_mark.
-                            // #6995 - the assert below was broken in c2c6c71 and remained so for many years. 
-                            // We nowadays do not flush tables with sstables but autosnapshot=false. This means
-                            // the low_mark assertion does not hold, because we maybe/probably never got around to 
-                            // creating the sstables that would create them.
-                            assert(!did_flush || low_mark <= rp || rp == db::replay_position());
-                            rp = std::max(low_mark, rp);
-                            return truncate_views(cf, truncated_at, should_flush).then([&cf, truncated_at, rp] {
-                                // save_truncation_record() may actually fail after we cached the truncation time
-                                // but this is not be worse that if failing without caching: at least the correct time
-                                // will be available until next reboot and a client will have to retry truncation anyway.
-                                cf.cache_truncation_record(truncated_at);
-                                return db::system_keyspace::save_truncation_record(cf, truncated_at, rp);
-                            });
-                        });
-                    });
-                });
-            });
-        }).then([this, uuid] {
-            drop_repair_history_map_for_table(uuid);
-        });
-    });
-}
+    std::vector<compaction_manager::compaction_reenabler> cres;
+    cres.reserve(1 + cf.views().size());

-future<> database::truncate_views(const column_family& base, db_clock::time_point truncated_at, bool should_flush) {
-    return parallel_for_each(base.views(), [this, truncated_at, should_flush] (view_ptr v) {
+    cres.emplace_back(co_await _compaction_manager->stop_and_disable_compaction(&cf));
+    co_await parallel_for_each(cf.views(), [&, this] (view_ptr v) -> future<> {
        auto& vcf = find_column_family(v);
-        return _compaction_manager->run_with_compaction_disabled(&vcf, [&vcf, truncated_at, should_flush] {
-            return (should_flush ? vcf.flush() : vcf.clear()).then([&vcf, truncated_at, should_flush] {
-                return vcf.discard_sstables(truncated_at).then([&vcf, truncated_at, should_flush](db::replay_position rp) {
-                    return db::system_keyspace::save_truncation_record(vcf, truncated_at, rp);
-                });
-            });
-        });
+        cres.emplace_back(co_await _compaction_manager->stop_and_disable_compaction(&vcf));
    });
+
+    bool did_flush = false;
+    if (should_flush && cf.can_flush()) {
+        // TODO:
+        // this is not really a guarantee at all that we've actually
+        // gotten all things to disk. Again, need queue-ish or something.
+        co_await cf.flush();
+        did_flush = true;
+    } else {
+        co_await cf.clear();
+    }
+
+    dblog.debug("Discarding sstable data for truncated CF + indexes");
+    // TODO: notify truncation
+
+    db_clock::time_point truncated_at = co_await tsf();
+
+    if (auto_snapshot) {
+        auto name = format("{:d}-{}", truncated_at.time_since_epoch().count(), cf.schema()->cf_name());
+        co_await cf.snapshot(*this, name);
+    }
+
+    db::replay_position rp = co_await cf.discard_sstables(truncated_at);
+    // TODO: indexes.
+    // Note: since discard_sstables was changed to only count tables owned by this shard,
+    // we can get zero rp back. Changed assert, and ensure we save at least low_mark.
+    // #6995 - the assert below was broken in c2c6c71 and remained so for many years. 
+    // We nowadays do not flush tables with sstables but autosnapshot=false. This means
+    // the low_mark assertion does not hold, because we maybe/probably never got around to 
+    // creating the sstables that would create them.
+    assert(!did_flush || low_mark <= rp || rp == db::replay_position());
+    rp = std::max(low_mark, rp);
+    co_await parallel_for_each(cf.views(), [this, truncated_at, should_flush] (view_ptr v) -> future<> {
+        auto& vcf = find_column_family(v);
+            if (should_flush) {
+                co_await vcf.flush();
+            } else {
+                co_await vcf.clear();
+            }
+            db::replay_position rp = co_await vcf.discard_sstables(truncated_at);
+            co_await db::system_keyspace::save_truncation_record(vcf, truncated_at, rp);
+    });
+    // save_truncation_record() may actually fail after we cached the truncation time,
+    // but this is no worse than failing without caching: at least the correct time
+    // will be available until the next reboot, and a client will have to retry truncation anyway.
+    cf.cache_truncation_record(truncated_at);
+    co_await db::system_keyspace::save_truncation_record(cf, truncated_at, rp);
+
+    drop_repair_history_map_for_table(uuid);
 }

 const sstring& database::get_snitch_name() const {
--- a/replica/database.hh
+++ b/replica/database.hh
@@ -839,7 +839,11 @@ public:

    db::replay_position set_low_replay_position_mark();

-    future<> snapshot(database& db, sstring name, bool skip_flush = false);
+private:
+    future<> snapshot(database& db, sstring name);
+
+    friend class database;
+public:
    future<std::unordered_map<sstring, snapshot_details>> get_snapshot_details();

    /*!
@@ -1217,7 +1221,7 @@ struct string_pair_eq {
 //   local metadata reads
 //   use shard_of() for data

-class database {
+class database : public peering_sharded_service<database> {
    friend class ::database_test;
 public:
    enum class table_kind {
@@ -1371,6 +1375,7 @@ private:
    Future update_write_metrics(Future&& f);
    void update_write_metrics_for_timed_out_write();
    future<> create_keyspace(const lw_shared_ptr<keyspace_metadata>&, locator::effective_replication_map_factory& erm_factory, bool is_bootstrap, system_keyspace system);
+    void remove(const table&) noexcept;
 public:
    static utils::UUID empty_version;

@@ -1560,6 +1565,17 @@ public:

    future<> flush_all_memtables();
    future<> flush(const sstring& ks, const sstring& cf);
+    // flush a table identified by the given id on all shards.
+    future<> flush_on_all(utils::UUID id);
+    // flush a single table in a keyspace on all shards.
+    future<> flush_on_all(std::string_view ks_name, std::string_view table_name);
+    // flush a list of tables in a keyspace on all shards.
+    future<> flush_on_all(std::string_view ks_name, std::vector<sstring> table_names);
+    // flush all tables in a keyspace on all shards.
+    future<> flush_on_all(std::string_view ks_name);
+
+    future<> snapshot_on_all(std::string_view ks_name, std::vector<sstring> table_names, sstring tag, bool skip_flush);
+    future<> snapshot_on_all(std::string_view ks_name, sstring tag, bool skip_flush);

    // See #937. Truncation now requires a callback to get a time stamp
    // that must be guaranteed to be the same for all shards.
@@ -1568,11 +1584,9 @@ public:
    /** Truncates the given column family */
    future<> truncate(sstring ksname, sstring cfname, timestamp_func);
    future<> truncate(const keyspace& ks, column_family& cf, timestamp_func, bool with_snapshot = true);
-    future<> truncate_views(const column_family& base, db_clock::time_point truncated_at, bool should_flush);

    bool update_column_family(schema_ptr s);
    future<> drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func, bool with_snapshot = true);
-    future<> remove(const column_family&) noexcept;

    const logalloc::region_group& dirty_memory_region_group() const {
        return _dirty_memory_manager.region_group();
--- a/replica/distributed_loader.cc
+++ b/replica/distributed_loader.cc
@@ -454,12 +454,13 @@ future<> distributed_loader::handle_sstables_pending_delete(sstring pending_dele
    });
 }

-future<> distributed_loader::populate_column_family(distributed<replica::database>& db, sstring sstdir, sstring ks, sstring cf, bool must_exist) {
-    return async([&db, sstdir = std::move(sstdir), ks = std::move(ks), cf = std::move(cf), must_exist] {
+future<> distributed_loader::populate_column_family(distributed<replica::database>& db, sstring sstdir, sstring ks, sstring cf, allow_offstrategy_compaction do_allow_offstrategy_compaction, must_exist dir_must_exist) {
+    dblog.debug("Populating {}/{}/{} allow_offstrategy_compaction={} must_exist={}", ks, cf, sstdir, do_allow_offstrategy_compaction, dir_must_exist);
+    return async([&db, sstdir = std::move(sstdir), ks = std::move(ks), cf = std::move(cf), do_allow_offstrategy_compaction, dir_must_exist] {
        assert(this_shard_id() == 0);

        if (!file_exists(sstdir).get0()) {
-            if (must_exist) {
+            if (dir_must_exist) {
                throw std::runtime_error(format("Populating {}/{} failed: {} does not exist", ks, cf, sstdir));
            }
            return;
@@ -529,12 +530,14 @@ future<> distributed_loader::populate_column_family(distributed<replica::databas
            return global_table->make_sstable(sstdir, gen, sst_version, sstables::sstable::format_types::big);
        }, eligible_for_reshape_on_boot).get();

-        directory.invoke_on_all([global_table, &eligible_for_reshape_on_boot] (sstables::sstable_directory& dir) {
-            return dir.do_for_each_sstable([&global_table, &eligible_for_reshape_on_boot] (sstables::shared_sstable sst) {
-                auto requires_offstrategy = sstables::offstrategy(!eligible_for_reshape_on_boot(sst));
+        directory.invoke_on_all([global_table, &eligible_for_reshape_on_boot, do_allow_offstrategy_compaction] (sstables::sstable_directory& dir) {
+            return dir.do_for_each_sstable([&global_table, &eligible_for_reshape_on_boot, do_allow_offstrategy_compaction] (sstables::shared_sstable sst) {
+                auto requires_offstrategy = sstables::offstrategy(do_allow_offstrategy_compaction && !eligible_for_reshape_on_boot(sst));
                return global_table->add_sstable_and_update_cache(sst, requires_offstrategy);
-            }).then([&global_table] {
+            }).then([&global_table, do_allow_offstrategy_compaction] {
+              if (do_allow_offstrategy_compaction) {
                global_table->trigger_offstrategy_compaction();
+              }
            });
        }).get();
    });
@@ -560,11 +563,11 @@ future<> distributed_loader::populate_keyspace(distributed<replica::database>& d
                auto sstdir = ks.column_family_directory(ksdir, cfname, uuid);
                dblog.info("Keyspace {}: Reading CF {} id={} version={}", ks_name, cfname, uuid, s->version());
                return ks.make_directory_for_column_family(cfname, uuid).then([&db, sstdir, uuid, ks_name, cfname] {
-                    return distributed_loader::populate_column_family(db, sstdir + "/" + sstables::staging_dir, ks_name, cfname);
+                    return distributed_loader::populate_column_family(db, sstdir + "/" + sstables::staging_dir, ks_name, cfname, allow_offstrategy_compaction::no);
                }).then([&db, sstdir, ks_name, cfname] {
-                    return distributed_loader::populate_column_family(db, sstdir + "/" + sstables::quarantine_dir, ks_name, cfname, false /* must_exist */);
+                    return distributed_loader::populate_column_family(db, sstdir + "/" + sstables::quarantine_dir, ks_name, cfname, allow_offstrategy_compaction::no, must_exist::no);
                }).then([&db, sstdir, uuid, ks_name, cfname] {
-                    return distributed_loader::populate_column_family(db, sstdir, ks_name, cfname);
+                    return distributed_loader::populate_column_family(db, sstdir, ks_name, cfname, allow_offstrategy_compaction::yes);
                }).handle_exception([ks_name, cfname, sstdir](std::exception_ptr eptr) {
                    std::string msg =
                        format("Exception while populating keyspace '{}' with column family '{}' from file '{}': {}",
--- a/replica/distributed_loader.hh
+++ b/replica/distributed_loader.hh
@@ -13,6 +13,7 @@
 #include <seastar/core/distributed.hh>
 #include <seastar/core/sstring.hh>
 #include <seastar/core/file.hh>
+#include <seastar/util/bool_class.hh>
 #include <vector>
 #include <functional>
 #include <filesystem>
@@ -67,7 +68,9 @@ class distributed_loader {
    static future<size_t> make_sstables_available(sstables::sstable_directory& dir,
            sharded<replica::database>& db, sharded<db::view::view_update_generator>& view_update_generator,
            std::filesystem::path datadir, sstring ks, sstring cf);
-    static future<> populate_column_family(distributed<replica::database>& db, sstring sstdir, sstring ks, sstring cf, bool must_exist = true);
+    using allow_offstrategy_compaction = bool_class<struct allow_offstrategy_compaction_tag>;
+    using must_exist = bool_class<struct must_exist_tag>;
+    static future<> populate_column_family(distributed<replica::database>& db, sstring sstdir, sstring ks, sstring cf, allow_offstrategy_compaction, must_exist = must_exist::yes);
    static future<> populate_keyspace(distributed<replica::database>& db, sstring datadir, sstring ks_name);
    static future<> cleanup_column_family_temp_sst_dirs(sstring sstdir);
    static future<> handle_sstables_pending_delete(sstring pending_deletes_dir);
--- a/replica/table.cc
+++ b/replica/table.cc
@@ -9,6 +9,7 @@
 #include <seastar/core/seastar.hh>
 #include <seastar/core/coroutine.hh>
 #include <seastar/coroutine/maybe_yield.hh>
+#include <seastar/coroutine/exception.hh>
 #include <seastar/util/closeable.hh>

 #include "replica/database.hh"
@@ -662,11 +663,21 @@ table::try_flush_memtable_to_sstable(lw_shared_ptr<memtable> old, sstable_write_
                [] (const dht::decorated_key&) { return api::min_timestamp; });
        }

-        mutation_fragment* fragment = co_await reader.peek();
-        if (!fragment) {
+        std::exception_ptr err;
+        try {
+            mutation_fragment* fragment = co_await reader.peek();
+            if (!fragment) {
+                co_await reader.close();
+                _memtables->erase(old);
+                co_return stop_iteration::yes;
+            }
+        } catch (...) {
+            err = std::current_exception();
+        }
+        if (err) {
+            tlogger.error("failed to flush memtable for {}.{}: {}", old->schema()->ks_name(), old->schema()->cf_name(), err);
            co_await reader.close();
-            _memtables->erase(old);
-            co_return stop_iteration::yes;
+            co_return stop_iteration(_async_gate.is_closed());
        }

        auto f = consumer(upgrade_to_v2(std::move(reader)));
@@ -1426,70 +1437,86 @@ future<> table::write_schema_as_cql(database& db, sstring dir) const {

 }

-future<> table::snapshot(database& db, sstring name, bool skip_flush) {
+future<> table::snapshot(database& db, sstring name) {
    auto jsondir = _config.datadir + "/snapshots/" + name;
-    tlogger.debug("snapshot {}: skip_flush={}", jsondir, skip_flush);
-    auto f = skip_flush ? make_ready_future<>() : flush();
-    return f.then([this, &db, jsondir = std::move(jsondir)]() {
-       return with_semaphore(_sstable_deletion_sem, 1, [this, &db, jsondir = std::move(jsondir)]() {
-        auto tables = boost::copy_range<std::vector<sstables::shared_sstable>>(*_sstables->all());
-        return do_with(std::move(tables), std::move(jsondir), [this, &db] (std::vector<sstables::shared_sstable>& tables, const sstring& jsondir) {
-            return io_check([&jsondir] { return recursive_touch_directory(jsondir); }).then([this, &db, &jsondir, &tables] {
-                return max_concurrent_for_each(tables, db.get_config().initial_sstable_loading_concurrency(), [&db, &jsondir] (sstables::shared_sstable sstable) {
-                  return with_semaphore(db.get_sharded_sst_dir_semaphore().local(), 1, [&jsondir, sstable] {
-                    return io_check([sstable, &dir = jsondir] {
-                        return sstable->create_links(dir);
-                    });
-                  });
-                });
-            }).then([&jsondir, &tables] {
-                return io_check(sync_directory, jsondir);
-            }).finally([this, &tables, &db, &jsondir] {
-                auto shard = std::hash<sstring>()(jsondir) % smp::count;
-                std::unordered_set<sstring> table_names;
-                for (auto& sst : tables) {
-                    auto f = sst->get_filename();
-                    auto rf = f.substr(sst->get_dir().size() + 1);
-                    table_names.insert(std::move(rf));
-                }
-                return smp::submit_to(shard, [requester = this_shard_id(), &jsondir, this, &db,
-                                              tables = std::move(table_names), datadir = _config.datadir] {
+    tlogger.debug("snapshot {}", jsondir);

-                    if (!pending_snapshots.contains(jsondir)) {
-                        pending_snapshots.emplace(jsondir, make_lw_shared<snapshot_manager>());
-                    }
-                    auto snapshot = pending_snapshots.at(jsondir);
-                    for (auto&& sst: tables) {
-                        snapshot->files.insert(std::move(sst));
-                    }
+    auto sstable_deletion_guard = co_await get_units(_sstable_deletion_sem, 1);
+    std::exception_ptr ex;

-                    snapshot->requests.signal(1);
-                    auto my_work = make_ready_future<>();
-                    if (requester == this_shard_id()) {
-                        my_work = snapshot->requests.wait(smp::count).then([&jsondir,
-                                                                            &db, snapshot, this] {
-                            // this_shard_id() here == requester == this_shard_id() before submit_to() above,
-                            // so the db reference is still local
-                            return write_schema_as_cql(db, jsondir).handle_exception([&jsondir](std::exception_ptr ptr) {
-                                tlogger.error("Failed writing schema file in snapshot in {} with exception {}", jsondir, ptr);
-                                return make_ready_future<>();
-                            }).finally([&jsondir, snapshot] () mutable {
-                                return seal_snapshot(jsondir).handle_exception([&jsondir] (std::exception_ptr ex) {
-                                    tlogger.error("Failed to seal snapshot in {}: {}. Ignored.", jsondir, ex);
-                                }).then([snapshot] {
-                                    snapshot->manifest_write.signal(smp::count);
-                                    return make_ready_future<>();
-                                });
-                            });
-                        });
-                    }
-                    return my_work.finally([snapshot] {
-                        return snapshot->manifest_write.wait(1);
-                    }).then([snapshot] {});
+    std::vector<sstables::shared_sstable> tables;
+    try {
+        tables = boost::copy_range<std::vector<sstables::shared_sstable>>(*_sstables->all());
+        co_await io_check([&jsondir] { return recursive_touch_directory(jsondir); });
+        co_await max_concurrent_for_each(tables, db.get_config().initial_sstable_loading_concurrency(), [&db, &jsondir] (sstables::shared_sstable sstable) {
+            return with_semaphore(db.get_sharded_sst_dir_semaphore().local(), 1, [&jsondir, sstable] {
+                return io_check([sstable, &dir = jsondir] {
+                    return sstable->create_links(dir);
                });
            });
        });
-       });
+        co_await io_check(sync_directory, jsondir);
+    } catch (...) {
+        ex = std::current_exception();
+    }
+
+    auto shard = std::hash<sstring>()(jsondir) % smp::count;
+    std::unordered_set<sstring> table_names;
+    try {
+        for (auto& sst : tables) {
+            auto f = sst->get_filename();
+            auto rf = f.substr(sst->get_dir().size() + 1);
+            table_names.insert(std::move(rf));
+        }
+    } catch (...) {
+        ex = std::current_exception();
+    }
+    co_await smp::submit_to(shard, [requester = this_shard_id(), &jsondir, this, &db,
+            tables = std::move(table_names), datadir = _config.datadir, ex = std::move(ex)] () mutable -> future<> {
+        if (!pending_snapshots.contains(jsondir)) {
+            try {
+                pending_snapshots.emplace(jsondir, make_lw_shared<snapshot_manager>());
+            } catch (...) {
+                // abort since the process will hang if we can't coordinate
+                // snapshot across shards, similar to failing to allocate a continuation.
+                tlogger.error("Failed allocating snapshot_manager: {}. Aborting.", std::current_exception());
+                abort();
+            }
+        }
+        auto snapshot = pending_snapshots.at(jsondir);
+        try {
+            for (auto&& sst: tables) {
+                snapshot->files.insert(std::move(sst));
+            }
+        } catch (...) {
+            ex = std::current_exception();
+        }
+
+        tlogger.debug("snapshot {}: signal requests", jsondir);
+        snapshot->requests.signal(1);
+        if (requester == this_shard_id()) {
+            tlogger.debug("snapshot {}: waiting for all shards", jsondir);
+            co_await snapshot->requests.wait(smp::count);
+            // this_shard_id() here == requester == this_shard_id() before submit_to() above,
+            // so the db reference is still local
+            tlogger.debug("snapshot {}: writing schema.cql", jsondir);
+            co_await write_schema_as_cql(db, jsondir).handle_exception([&] (std::exception_ptr ptr) {
+                tlogger.error("Failed writing schema file in snapshot in {} with exception {}", jsondir, ptr);
+                ex = std::move(ptr);
+            });
+            tlogger.debug("snapshot {}: seal_snapshot", jsondir);
+            co_await seal_snapshot(jsondir).handle_exception([&] (std::exception_ptr ptr) {
+                tlogger.error("Failed to seal snapshot in {}: {}.", jsondir, ptr);
+                ex = std::move(ptr);
+            });
+            snapshot->manifest_write.signal(smp::count);
+        }
+        tlogger.debug("snapshot {}: waiting for manifest on behalf of shard {}", jsondir, requester);
+        co_await snapshot->manifest_write.wait(1);
+        tlogger.debug("snapshot {}: done: error={}", jsondir, ex);
+        if (ex) {
+            std::rethrow_exception(std::move(ex));
+        }
    });
 }

@@ -1571,13 +1598,14 @@ bool table::can_flush() const {
 }

 future<> table::clear() {
+    auto permits = co_await _config.dirty_memory_manager->get_all_flush_permits();
    if (_commitlog) {
        for (auto& t : *_memtables) {
            _commitlog->discard_completed_segments(_schema->id(), t->get_and_discard_rp_set());
        }
    }
    _memtables->clear_and_add();
-    return _cache.invalidate(row_cache::external_updater([] { /* There is no underlying mutation source */ }));
+    co_await _cache.invalidate(row_cache::external_updater([] { /* There is no underlying mutation source */ }));
 }

 // NOTE: does not need to be futurized, but might eventually, depending on
@@ -2235,7 +2263,7 @@ std::chrono::milliseconds table::get_coordinator_read_latency_percentile(double

 void
 table::enable_auto_compaction() {
-    // FIXME: unmute backlog. turn table backlog back on.
+    // XXX: unmute backlog. turn table backlog back on.
    //      see table::disable_auto_compaction() notes.
    _compaction_disabled_by_user = false;
    trigger_compaction();
@@ -2243,7 +2271,7 @@ table::enable_auto_compaction() {

 future<>
 table::disable_auto_compaction() {
-    // FIXME: mute backlog. When we disable background compactions
+    // XXX: mute backlog. When we disable background compactions
    // for the table, we must also disable current backlog of the
    // table compaction strategy that contributes to the scheduling
    // group resources prioritization.
@@ -2270,9 +2298,8 @@ table::disable_auto_compaction() {
    // - it will break computation of major compaction descriptor
    //   for new submissions
    _compaction_disabled_by_user = true;
-    return with_gate(_async_gate, [this] {
-        return compaction_manager().stop_ongoing_compactions("disable auto-compaction", this, sstables::compaction_type::Compaction);
-    });
+    // FIXME: stop ongoing compactions
+    return make_ready_future<>();
 }

 flat_mutation_reader
--- a/schema_upgrader.hh
+++ b/schema_upgrader.hh
@@ -9,6 +9,7 @@
 #pragma once

 #include "mutation_fragment.hh"
+#include "mutation_fragment_v2.hh"
 #include "converting_mutation_partition_applier.hh"

 // A StreamedMutationTransformer which transforms the stream to a different schema
--- a/2
+++ b/2
--- a/service/paxos/cas_request.hh
+++ b/service/paxos/cas_request.hh
@@ -8,6 +8,7 @@
 #pragma once

 #include <optional>
+#include <seastar/core/sharded.hh>

 #include "timestamp.hh"

--- a/sstables/downsampling.hh
+++ b/sstables/downsampling.hh
@@ -16,6 +16,7 @@
 #include <list>
 #include <map>
 #include <vector>
+#include <array>
 #include <algorithm>
 #include <iterator>
 #include <cassert>
--- a/sstables/kl/reader.cc
+++ b/sstables/kl/reader.cc
@@ -1142,7 +1142,7 @@ private:
    }
    index_reader& get_index_reader() {
        if (!_index_reader) {
-            auto caching = use_caching(!_slice.options.contains(query::partition_slice::option::bypass_cache));
+            auto caching = use_caching(global_cache_index_pages && !_slice.options.contains(query::partition_slice::option::bypass_cache));
            _index_reader = std::make_unique<index_reader>(_sst, _consumer.permit(), _consumer.io_priority(),
                                                           _consumer.trace_state(), caching);
        }
--- a/sstables/mx/reader.cc
+++ b/sstables/mx/reader.cc
@@ -1319,7 +1319,7 @@ private:
    }
    index_reader& get_index_reader() {
        if (!_index_reader) {
-            auto caching = use_caching(!_slice.options.contains(query::partition_slice::option::bypass_cache));
+            auto caching = use_caching(global_cache_index_pages && !_slice.options.contains(query::partition_slice::option::bypass_cache));
            _index_reader = std::make_unique<index_reader>(_sst, _consumer.permit(), _consumer.io_priority(),
                                                           _consumer.trace_state(), caching);
        }
--- a/sstables/partition_index_cache.hh
+++ b/sstables/partition_index_cache.hh
@@ -55,7 +55,12 @@ private:
        entry(entry&&) noexcept = default;

        ~entry() {
-            assert(!is_referenced());
+            if (is_referenced()) {
+                // Live entry_ptr should keep the entry alive, except when the entry failed on loading.
+                // In that case, entry_ptr holders are not supposed to use the pointer, so it's safe
+                // to nullify those entry_ptrs.
+                assert(!ready());
+            }
        }

        void on_evicted() noexcept override;
--- a/sstables/promoted_index_blocks_reader.hh
+++ b/sstables/promoted_index_blocks_reader.hh
@@ -14,6 +14,7 @@
 #include "m_format_read_helpers.hh"
 #include "sstables/mx/parsers.hh"
 #include "sstables/index_entry.hh"
+#include <seastar/core/circular_buffer.hh>

 namespace sstables {

--- a/sstables/sstable_set.cc
+++ b/sstables/sstable_set.cc
@@ -387,10 +387,15 @@ void time_series_sstable_set::for_each_sstable(std::function<void(const shared_s

 // O(log n)
 void time_series_sstable_set::insert(shared_sstable sst) {
+  try {
     auto min_pos = sst->min_position();
     auto max_pos_reversed = sst->max_position().reversed();
     _sstables->emplace(std::move(min_pos), sst);
-    _sstables_reversed->emplace(std::move(max_pos_reversed), std::move(sst));
+    _sstables_reversed->emplace(std::move(max_pos_reversed), sst); // copy, not move: the catch handler below still dereferences sst
+  } catch (...) {
+    erase(sst); // roll back a partial insertion before rethrowing
+    throw;
+  }
 }

 // O(n) worst case, but should be close to O(log n) most of the time
--- a/sstables/sstables.cc
+++ b/sstables/sstables.cc
@@ -77,6 +77,18 @@ thread_local disk_error_signal_type sstable_write_error;

 namespace sstables {

+// The below flag governs the mode of index file page caching used by the index
+// reader.
+//
+// If set to true, the reader will read and/or populate a common global cache,
+// which shares its capacity with the row cache. If false, the reader will use
+// BYPASS CACHE semantics for index caching.
+//
+// This flag is intended to be a temporary hack. The goal is to eventually
+// solve index caching problems via a smart cache replacement policy.
+//
+thread_local utils::updateable_value<bool> global_cache_index_pages(false);
+
 logging::logger sstlog("sstable");

 // Because this is a noop and won't hold any state, it is better to use a global than a
--- a/sstables/sstables.hh
+++ b/sstables/sstables.hh
@@ -48,6 +48,7 @@
 #include "sstables/open_info.hh"
 #include "query-request.hh"
 #include "mutation_fragment_stream_validator.hh"
+#include "utils/updateable_value.hh"

 #include <seastar/util/optimized_optional.hh>

@@ -57,6 +58,8 @@ class cached_file;

 namespace sstables {

+extern thread_local utils::updateable_value<bool> global_cache_index_pages;
+
 namespace mc {
 class writer;
 }
--- a/test/alternator/test_item.py
+++ b/test/alternator/test_item.py
@@ -361,6 +361,14 @@ def test_getitem_attributes_to_get_duplicate(dynamodb, test_table):
    with pytest.raises(ClientError, match='ValidationException.*Duplicate'):
        test_table.get_item(Key={'p': p, 'c': c}, AttributesToGet=['a', 'a'], ConsistentRead=True)

+# Verify that it is forbidden to ask for an empty AttributesToGet
+# Reproduces issue #10332.
+def test_getitem_attributes_to_get_empty(dynamodb, test_table):
+    # GetItem with an explicitly empty AttributesToGet list must raise ValidationException.
+    key = {'p': random_string(), 'c': random_string()}
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.get_item(Key=key, AttributesToGet=[], ConsistentRead=True)
+
 # Basic test for DeleteItem, with hash key only
 def test_delete_item_hash(test_table_s):
    p = random_string()
--- a/test/alternator/test_query.py
+++ b/test/alternator/test_query.py
@@ -157,6 +157,13 @@ def test_query_attributes_to_get(dynamodb, test_table):
        expected_items = [{k: x[k] for k in wanted if k in x} for x in items]
        assert multiset(expected_items) == multiset(got_items)

+# Verify that it is forbidden to ask for an empty AttributesToGet
+# Reproduces issue #10332.
+def test_query_attributes_to_get_empty(dynamodb, test_table):
+    hash_key_eq = {'p': {'AttributeValueList': [random_string()], 'ComparisonOperator': 'EQ'}}
+    with pytest.raises(ClientError, match='ValidationException'):
+        full_query(test_table, KeyConditions=hash_key_eq, AttributesToGet=[])
+
 # Test that in a table with both hash key and sort key, which keys we can
 # Query by: We can Query by the hash key, by a combination of both hash and
 # sort keys, but *cannot* query by just the sort key, and obviously not
--- a/test/alternator/test_update_expression.py
+++ b/test/alternator/test_update_expression.py
@@ -1030,6 +1030,20 @@ def test_nested_attribute_remove_from_missing_item(test_table_s):
    test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE x.y')
    test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE x[0]')

+# Though in an above test (test_nested_attribute_update_bad_path_dot) we
+# showed that DynamoDB does not allow REMOVE x.y if attribute x doesn't
+# exist - and generates a ValidationException, if x *does* exist but y
+# doesn't, it's fine and the removal should just be silently ignored.
+def test_nested_attribute_remove_missing_leaf(test_table_s):
+    key = {'p': random_string()}
+    stored = {**key, 'a': {'x': 3}, 'b': ['hi']}
+    test_table_s.put_item(Item=stored)
+    # Each expression targets something the item lacks: a map member, a list
+    # index past the end, and a top-level attribute. None may alter the item.
+    for expr in ('REMOVE a.y', 'REMOVE b[7]', 'REMOVE c'):
+        test_table_s.update_item(Key=key, UpdateExpression=expr)
+    assert test_table_s.get_item(Key=key, ConsistentRead=True)['Item'] == stored
+
 # Similarly for other types of bad paths - using [0] on something which
 # doesn't exist or isn't an array.
 def test_nested_attribute_update_bad_path_array(test_table_s):
--- a/test/boost/cached_file_test.cc
+++ b/test/boost/cached_file_test.cc
@@ -207,7 +207,9 @@ SEASTAR_THREAD_TEST_CASE(test_eviction_via_lru) {
        }

        {
-            cf_lru.evict_all();
+            with_allocator(region.allocator(), [] {
+                cf_lru.evict_all();
+            });

            BOOST_REQUIRE_EQUAL(0, metrics.cached_bytes); // change here
            BOOST_REQUIRE_EQUAL(0, cf.cached_bytes()); // change here
@@ -215,6 +217,8 @@ SEASTAR_THREAD_TEST_CASE(test_eviction_via_lru) {
            BOOST_REQUIRE_EQUAL(3, metrics.page_evictions); // change here
            BOOST_REQUIRE_EQUAL(0, metrics.page_hits);
            BOOST_REQUIRE_EQUAL(3, metrics.page_populations);
+
+            BOOST_REQUIRE_EQUAL(region.occupancy().used_space(), 0);
        }

        {
--- a/test/boost/chunked_managed_vector_test.cc
+++ b/test/boost/chunked_managed_vector_test.cc
@@ -12,6 +12,8 @@
 #include <deque>
 #include <random>
 #include "utils/lsa/chunked_managed_vector.hh"
+#include "utils/managed_ref.hh"
+#include "test/lib/log.hh"

 #include <boost/range/algorithm/sort.hpp>
 #include <boost/range/algorithm/equal.hpp>
@@ -203,3 +205,106 @@ SEASTAR_TEST_CASE(tests_reserve_partial) {
  });
  return make_ready_future<>();
 }
+
+SEASTAR_TEST_CASE(test_clear_and_release) { // Fills a chunked_managed_vector of managed_ref<uint64_t>, then calls clear_and_release() on it.
+    region region;
+    allocating_section as;
+
+    with_allocator(region.allocator(), [&] {
+        lsa::chunked_managed_vector<managed_ref<uint64_t>> v;
+
+        for (uint64_t i = 1; i < 4000; ++i) { // appends 3999 elements holding the values 1..3999
+            as(region, [&] { // each append is run through the allocating_section for this region
+                v.emplace_back(make_managed<uint64_t>(i));
+            });
+        }
+
+        v.clear_and_release(); // no explicit assertions in this test: it passes if this call completes
+    });
+
+    return make_ready_future<>(); // nothing above returns a future to wait on
+}
+
+SEASTAR_TEST_CASE(test_chunk_reserve) { // For each (reserve size, push count) pair: reserve(), append push-count elements, verify them in order.
+    region region;
+    allocating_section as;
+
+    for (auto conf :
+            { // std::make_pair(reserve size, push count)
+                std::make_pair(0, 4000),
+                std::make_pair(100, 4000),
+                std::make_pair(200, 4000),
+                std::make_pair(1000, 4000),
+                std::make_pair(2000, 4000),
+                std::make_pair(3000, 4000),
+                std::make_pair(5000, 4000),
+                std::make_pair(500, 8000),
+                std::make_pair(1000, 8000),
+                std::make_pair(2000, 8000),
+                std::make_pair(8000, 500),
+            })
+    {
+        with_allocator(region.allocator(), [&] {
+            auto [reserve_size, push_count] = conf; // NOTE(review): both are int (from make_pair(int, int)) but are compared with uint64_t below; all values here are positive
+            testlog.info("Testing reserve({}), {}x emplace_back()", reserve_size, push_count);
+            lsa::chunked_managed_vector<managed_ref<uint64_t>> v; // a fresh vector for every configuration
+            v.reserve(reserve_size);
+            uint64_t seed = rand(); // value stored in the first element; element i stores seed + i
+            for (uint64_t i = 0; i < push_count; ++i) {
+                as(region, [&] {
+                    v.emplace_back(make_managed<uint64_t>(seed + i));
+                    BOOST_REQUIRE(**v.begin() == seed); // the first element must still read back as seed after every append
+                });
+            }
+            auto v_it = v.begin();
+            for (uint64_t i = 0; i < push_count; ++i) { // second pass: walk the vector from the start and check every element
+                BOOST_REQUIRE(**v_it++ == seed + i);
+            }
+            v.clear_and_release();
+        });
+    }
+
+    return make_ready_future<>(); // nothing above returns a future to wait on
+}
+
+// Tests the case of make_room() invoked with last_chunk_capacity_deficit but _size not in
+// the last reserved chunk.
+SEASTAR_TEST_CASE(test_shrinking_and_expansion_involving_chunk_boundary) { // no explicit assertions: passes if the sequence below completes
+    region region;
+    allocating_section as;
+
+    with_allocator(region.allocator(), [&] {
+        lsa::chunked_managed_vector<managed_ref<uint64_t>> v;
+
+        // Fill two chunks
+        v.reserve(2000);
+        for (uint64_t i = 0; i < 2000; ++i) { // appends 2000 elements holding the values 0..1999
+            as(region, [&] {
+                v.emplace_back(make_managed<uint64_t>(i));
+            });
+        }
+
+        // Make the last chunk smaller than max size to trigger the last_chunk_capacity_deficit path in make_room()
+        v.shrink_to_fit();
+
+        // Leave the last chunk reserved but empty
+        for (uint64_t i = 0; i < 1000; ++i) { // removes the last 1000 of the 2000 elements
+            v.pop_back();
+        }
+
+        // Try to reserve more than the currently reserved capacity and trigger last_chunk_capacity_deficit path
+        // with _size not in the last chunk. Should not sigsegv.
+        v.reserve(8000);
+
+        for (uint64_t i = 0; i < 2000; ++i) { // appends 2000 more elements on top of the 1000 that remain
+            as(region, [&] {
+                v.emplace_back(make_managed<uint64_t>(i));
+            });
+        }
+
+        v.clear_and_release();
+    });
+
+    return make_ready_future<>(); // nothing above returns a future to wait on
+}
+
--- a/Show More
+++ b/Show More