release: prepare for 4.6.11

Merge 'cql3: don't ignore other restrictions when a multi column restriction is present during filtering' from Jan Ciołek
When filtering with a multi-column restriction present, all other restrictions were ignored. So a query like: `SELECT * FROM tbl WHERE pk = 0 AND (ck1, ck2) < (0, 0) AND regular_col = 0 ALLOW FILTERING;` would ignore the restriction `regular_col = 0`. This was caused by a bug in the filtering code: 2779a171fc/cql3/selection/selection.cc (L433-L449). When multi-column restrictions were detected, the code checked whether they were satisfied and returned immediately. This is fixed by returning only when these restrictions are not satisfied. When they are satisfied, the other restrictions are checked as well to ensure all of them are satisfied. This code was introduced back in 2019, when fixing #3574. Perhaps back then it was impossible to mix multi-column and regular-column restrictions, and this approach was correct. Fixes: #6200 Fixes: #12014 Closes #12031 * github.com:scylladb/scylladb: cql-pytest: add a reproducer for #12014, verify that filtering multi column and regular restrictions works; boost/restrictions-test: uncomment part of the test that passes now; cql-pytest: enable test for filtering combined multi column and regular column restrictions; cql3: don't ignore other restrictions when a multi column restriction is present during filtering (cherry picked from commit 2d2034ea28) Closes #12086
2022-11-28 15:45:26 +02:00 · 2022-11-27 00:15:04 +02:00 · 2022-11-16 11:52:51 +03:00 · 2022-11-14 10:30:20 +02:00 · 2022-11-10 20:43:44 +02:00 · 2022-11-07 16:51:14 +02:00
156 changed files with 2850 additions and 585 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/2
+++ b/2
@@ -60,7 +60,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=4.6.dev
+VERSION=4.6.11

 if test -f version
 then
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -415,6 +415,11 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
    rjson::add(table_description, "BillingModeSummary", rjson::empty_object());
    rjson::add(table_description["BillingModeSummary"], "BillingMode", "PAY_PER_REQUEST");
    rjson::add(table_description["BillingModeSummary"], "LastUpdateToPayPerRequestDateTime", rjson::value(creation_date_seconds));
+    // In PAY_PER_REQUEST billing mode, provisioned capacity should return 0
+    rjson::add(table_description, "ProvisionedThroughput", rjson::empty_object());
+    rjson::add(table_description["ProvisionedThroughput"], "ReadCapacityUnits", 0);
+    rjson::add(table_description["ProvisionedThroughput"], "WriteCapacityUnits", 0);
+    rjson::add(table_description["ProvisionedThroughput"], "NumberOfDecreasesToday", 0);

    std::unordered_map<std::string,std::string> key_attribute_types;
    // Add base table's KeySchema and collect types for AttributeDefinitions:
@@ -1017,18 +1022,16 @@ future<executor::request_return_type> executor::update_table(client_state& clien
    _stats.api_operations.update_table++;
    elogger.trace("Updating table {}", request);

-    std::string table_name = get_table_name(request);
-    if (table_name.find(INTERNAL_TABLE_PREFIX) == 0) {
+    schema_ptr tab = get_table(_proxy, request);
+    // the ugly but harmless conversion to string_view here is because
+    // Seastar's sstring is missing a find(std::string_view) :-(
+    if (std::string_view(tab->cf_name()).find(INTERNAL_TABLE_PREFIX) == 0) {
        return make_ready_future<request_return_type>(api_error::validation(
                format("Prefix {} is reserved for accessing internal tables", INTERNAL_TABLE_PREFIX)));
    }
-    std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
-    tracing::add_table_name(trace_state, keyspace_name, table_name);
+    tracing::add_table_name(trace_state, tab->ks_name(), tab->cf_name());

-    auto& db = _proxy.get_db().local();
-    auto& cf = db.find_column_family(keyspace_name, table_name);
-
-    schema_builder builder(cf.schema());
+    schema_builder builder(tab);

    rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
    if (stream_specification && stream_specification->IsObject()) {
@@ -2080,6 +2083,9 @@ static attrs_to_get calculate_attrs_to_get(const rjson::value& req, std::unorder
        for (auto it = attributes_to_get.Begin(); it != attributes_to_get.End(); ++it) {
            attribute_path_map_add("AttributesToGet", ret, it->GetString());
        }
+        if (ret.empty()) {
+            throw api_error::validation("Empty AttributesToGet is not allowed. Consider using Select=COUNT instead.");
+        }
        return ret;
    } else if (has_projection_expression) {
        const rjson::value& projection_expression = req["ProjectionExpression"];
@@ -2481,8 +2487,8 @@ static bool hierarchy_actions(
                        // attr member so we can use add()
                        rjson::add_with_string_name(v, attr, std::move(*newv));
                    } else {
-                        throw api_error::validation(format("Can't remove document path {} - not present in item",
-                            subh.get_value()._path));
+                        // Removing a.b when a is a map but a.b doesn't exist
+                        // is silently ignored. It's not considered an error.
                    }
                } else {
                    throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
--- a/alternator/ttl.cc
+++ b/alternator/ttl.cc
@@ -94,10 +94,7 @@ future<executor::request_return_type> executor::update_time_to_live(client_state
 }

 future<executor::request_return_type> executor::describe_time_to_live(client_state& client_state, service_permit permit, rjson::value request) {
-    _stats.api_operations.update_time_to_live++;
-    if (!_proxy.get_db().local().features().cluster_supports_alternator_ttl()) {
-        co_return api_error::unknown_operation("DescribeTimeToLive not yet supported. Experimental support is available if the 'alternator_ttl' experimental feature is enabled on all nodes.");
-    }
+    _stats.api_operations.describe_time_to_live++;
    schema_ptr schema = get_table(_proxy, request);
    std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
    rjson::value desc = rjson::empty_object();
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -604,15 +604,21 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
                return make_exception_future<json::json_return_type>(
                        std::runtime_error("Can not perform cleanup operation when topology changes"));
            }
-            return ctx.db.invoke_on_all([keyspace, column_families] (database& db) {
-                std::vector<column_family*> column_families_vec;
-                auto& cm = db.get_compaction_manager();
-                for (auto cf : column_families) {
-                    column_families_vec.push_back(&db.find_column_family(keyspace, cf));
-                }
-                return parallel_for_each(column_families_vec, [&cm, &db] (column_family* cf) {
-                    return cm.perform_cleanup(db, cf);
+            return ctx.db.invoke_on_all([keyspace, column_families] (database& db) -> future<> {
+                auto table_ids = boost::copy_range<std::vector<utils::UUID>>(column_families | boost::adaptors::transformed([&] (auto& table_name) {
+                    return db.find_uuid(keyspace, table_name);
+                }));
+                // cleanup smaller tables first, to increase chances of success if low on space.
+                std::ranges::sort(table_ids, std::less<>(), [&] (const utils::UUID& id) {
+                    return db.find_column_family(id).get_stats().live_disk_space_used;
                });
+                auto& cm = db.get_compaction_manager();
+                // as a table can be dropped during loop below, let's find it before issuing the cleanup request.
+                for (auto& id : table_ids) {
+                    table& t = db.find_column_family(id);
+                    co_await cm.perform_cleanup(db, &t);
+                }
+                co_return;
            }).then([]{
                return make_ready_future<json::json_return_type>(0);
            });
--- a/atomic_cell.cc
+++ b/atomic_cell.cc
@@ -79,6 +79,49 @@ atomic_cell::atomic_cell(const abstract_type& type, atomic_cell_view other)
    set_view(_data);
 }

+// Orders two cells for merging; `greater` marks the preferred cell. Based on:
+//  - org.apache.cassandra.db.AbstractCell#reconcile()
+//  - org.apache.cassandra.db.BufferExpiringCell#reconcile()
+//  - org.apache.cassandra.db.BufferDeletedCell#reconcile()
+std::strong_ordering
+compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
+    if (left.timestamp() != right.timestamp()) { // 1st key: the higher timestamp is greater
+        return left.timestamp() <=> right.timestamp();
+    }
+    if (left.is_live() != right.is_live()) { // 2nd key: a dead cell is greater than a live one
+        return left.is_live() ? std::strong_ordering::less : std::strong_ordering::greater;
+    }
+    if (left.is_live()) { // liveness is equal here, so both cells are live
+        auto c = compare_unsigned(left.value(), right.value()) <=> 0; // presumably an unsigned byte-wise value comparison -- confirm against compare_unsigned()
+        if (c != 0) {
+            return c;
+        }
+        if (left.is_live_and_has_ttl() != right.is_live_and_has_ttl()) {
+            // equal values: prefer expiring cells.
+            return left.is_live_and_has_ttl() ? std::strong_ordering::greater : std::strong_ordering::less;
+        }
+        if (left.is_live_and_has_ttl()) { // both cells expire
+            if (left.expiry() != right.expiry()) { // the later expiry is greater
+                return left.expiry() <=> right.expiry();
+            } else {
+                // same expiry: prefer the cell that was written later (smaller ttl),
+                // so it survives longer after it expires, until purged.
+                return right.ttl() <=> left.ttl(); // operands swapped on purpose: smaller ttl is greater
+            }
+        }
+    } else {
+        // Both are deleted
+
+        // Origin compares big-endian serialized deletion time. That's because it
+        // delegates to AbstractCell.reconcile() which compares values after
+        // comparing timestamps, which in case of deleted cells will hold
+        // serialized expiry.
+        return (uint64_t) left.deletion_time().time_since_epoch().count() // compared as unsigned 64-bit counts
+                <=> (uint64_t) right.deletion_time().time_since_epoch().count();
+    }
+    return std::strong_ordering::equal; // no key above distinguished the two cells
+}
+
 atomic_cell_or_collection atomic_cell_or_collection::copy(const abstract_type& type) const {
    if (_data.empty()) {
        return atomic_cell_or_collection();
--- a/cache_flat_mutation_reader.hh
+++ b/cache_flat_mutation_reader.hh
@@ -593,8 +593,8 @@ void cache_flat_mutation_reader::move_to_range(query::clustering_row_ranges::con
                clogger.trace("csm {}: insert dummy at {}", fmt::ptr(this), _lower_bound);
                auto it = with_allocator(_lsa_manager.region().allocator(), [&] {
                    auto& rows = _snp->version()->partition().mutable_clustered_rows();
-                    auto new_entry = current_allocator().construct<rows_entry>(*_schema, _lower_bound, is_dummy::yes, is_continuous::no);
-                    return rows.insert_before(_next_row.get_iterator_in_latest_version(), *new_entry);
+                    auto new_entry = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(*_schema, _lower_bound, is_dummy::yes, is_continuous::no));
+                    return rows.insert_before(_next_row.get_iterator_in_latest_version(), std::move(new_entry));
                });
                _snp->tracker()->insert(*it);
                _last_row = partition_snapshot_row_weakref(*_snp, it, true);
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -765,8 +765,12 @@ future<> generation_service::check_and_repair_cdc_streams() {
    std::optional<cdc::generation_id> latest = _gen_id;
    const auto& endpoint_states = _gossiper.get_endpoint_states();
    for (const auto& [addr, state] : endpoint_states) {
-        if (!_gossiper.is_normal(addr))  {
-            throw std::runtime_error(format("All nodes must be in NORMAL state while performing check_and_repair_cdc_streams"
+        if (_gossiper.is_left(addr)) {
+            cdc_log.info("check_and_repair_cdc_streams ignored node {} because it is in LEFT state", addr);
+            continue;
+        }
+        if (!_gossiper.is_normal(addr)) {
+            throw std::runtime_error(format("All nodes must be in NORMAL or LEFT state while performing check_and_repair_cdc_streams"
                    " ({} is in state {})", addr, _gossiper.get_gossip_status(state)));
        }

@@ -830,6 +834,11 @@ future<> generation_service::check_and_repair_cdc_streams() {
                latest, db_clock::now());
            should_regenerate = true;
        } else {
+          if (tmptr->sorted_tokens().size() != gen->entries().size()) {
+              // We probably have garbage streams from old generations
+              cdc_log.info("Generation size does not match the token ring, regenerating");
+              should_regenerate = true;
+          } else {
            std::unordered_set<dht::token> gen_ends;
            for (const auto& entry : gen->entries()) {
                gen_ends.insert(entry.token_range_end);
@@ -841,6 +850,7 @@ future<> generation_service::check_and_repair_cdc_streams() {
                    break;
                }
            }
+          }
        }
    }

--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -73,7 +73,7 @@ using namespace std::chrono_literals;
 logging::logger cdc_log("cdc");

 namespace cdc {
-static schema_ptr create_log_schema(const schema&, std::optional<utils::UUID> = {});
+static schema_ptr create_log_schema(const schema&, std::optional<utils::UUID> = {}, schema_ptr = nullptr);
 }

 static constexpr auto cdc_group_name = "cdc";
@@ -220,7 +220,7 @@ public:
                return;
            }

-            auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt);
+            auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt, log_schema);

            auto log_mut = log_schema 
                ? db::schema_tables::make_update_table_mutations(db, keyspace.metadata(), log_schema, new_log_schema, timestamp, false)
@@ -503,7 +503,7 @@ bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name) {
    return to_bytes(cdc_deleted_elements_column_prefix) + column_name;
 }

-static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid) {
+static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid, schema_ptr old) {
    schema_builder b(s.ks_name(), log_name(s.cf_name()));
    b.with_partitioner("com.scylladb.dht.CDCPartitioner");
    b.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
@@ -590,6 +590,20 @@ static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID>
        b.set_uuid(*uuid);
    }

+    /**
+     * #10473 - if we are redefining the log table, we need to ensure any dropped
+     * columns are registered in "dropped_columns" table, otherwise clients will not
+     * be able to read data older than now.
+     */
+    if (old) {
+        // not super efficient, but we don't do this often.
+        for (auto& col : old->all_columns()) {
+            if (!b.has_column({col.name(), col.name_as_text() })) {
+                b.without_column(col.name_as_text(), col.type, api::new_timestamp());
+            }
+        }
+    }
+
    return b.build();
 }

@@ -1511,6 +1525,11 @@ public:
        }

        auto process_cell = [&, this] (const column_definition& cdef) {
+            // If table uses compact storage it may contain a column of type empty
+            // and we need to ignore such a field because it is not present in CDC log.
+            if (cdef.type->get_kind() == abstract_type::kind::empty) {
+                return;
+            }
            if (auto current = get_col_from_row_state(row_state, cdef)) {
                _builder->set_value(image_ck, cdef, *current);
            } else if (op == operation::pre_image) {
--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -1634,7 +1634,7 @@ future<bool> scrub_validate_mode_validate_reader(flat_mutation_reader reader, co
        while (auto mf_opt = co_await reader()) {
            if (cdata.is_stop_requested()) [[unlikely]] {
                // Compaction manager will catch this exception and re-schedule the compaction.
-                co_return coroutine::make_exception(compaction_stopped_exception(schema->ks_name(), schema->cf_name(), cdata.stop_requested));
+                throw compaction_stopped_exception(schema->ks_name(), schema->cf_name(), cdata.stop_requested);
            }

            const auto& mf = *mf_opt;
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -326,6 +326,11 @@ future<> compaction_manager::run_custom_job(column_family* cf, sstables::compact
    task->compaction_done = with_semaphore(_custom_job_sem, 1, [this, task, cf, &job = *job_ptr] () mutable {
        // take read lock for cf, so major compaction and resharding can't proceed in parallel.
        return with_lock(_compaction_locks[cf].for_read(), [this, task, cf, &job] () mutable {
+            // Allow caller to know that task (e.g. reshape) was asked to stop while waiting for a chance to run.
+            if (task->compaction_data.is_stop_requested()) {
+                throw sstables::compaction_stopped_exception(task->compacting_cf->schema()->ks_name(), task->compacting_cf->schema()->cf_name(),
+                    task->compaction_data.stop_requested);
+            }
            _stats.active_tasks++;
            if (!can_proceed(task)) {
                return make_ready_future<>();
@@ -522,16 +527,11 @@ future<> compaction_manager::stop() {
    }
 }

-void compaction_manager::really_do_stop() {
-    if (_state == state::none || _state == state::stopped) {
-        return;
-    }
-
-    _state = state::stopped;
+future<> compaction_manager::really_do_stop() {
    cmlog.info("Asked to stop");
    // Reset the metrics registry
    _metrics.clear();
-    _stop_future.emplace(stop_ongoing_compactions("shutdown").then([this] () mutable {
+    return stop_ongoing_compactions("shutdown").then([this] () mutable {
        reevaluate_postponed_compactions();
        return std::move(_waiting_reevalution);
    }).then([this] {
@@ -539,12 +539,34 @@ void compaction_manager::really_do_stop() {
        _compaction_submission_timer.cancel();
        cmlog.info("Stopped");
        return _compaction_controller.shutdown();
-    }));
+    });
+}
+
+template <typename Ex>
+requires std::is_base_of_v<std::exception, Ex> &&
+requires (const Ex& ex) {
+    { ex.code() } noexcept -> std::same_as<const std::error_code&>; // Ex must expose a noexcept code()
+}
+auto swallow_enospc(const Ex& ex) noexcept { // maps an ENOSPC error to a ready future; any other error stays a failed future
+    if (ex.code().value() != ENOSPC) {
+        return make_exception_future<>(std::make_exception_ptr(ex)); // std::make_exception_ptr stores a copy of static type Ex
+    }
+
+    cmlog.warn("Got ENOSPC on stop, ignoring..."); // out-of-space is logged and then dropped
+    return make_ready_future<>();
 }

 void compaction_manager::do_stop() noexcept {
+    if (_state == state::none || _state == state::stopped) {
+        return;
+    }
+
    try {
-        really_do_stop();
+        _state = state::stopped;
+        _stop_future = really_do_stop()
+            .handle_exception_type([] (const std::system_error& ex) { return swallow_enospc(ex); })
+            .handle_exception_type([] (const storage_io_error& ex) { return swallow_enospc(ex); })
+        ;
    } catch (...) {
        try {
            cmlog.error("Failed to stop the manager: {}", std::current_exception());
@@ -676,6 +698,7 @@ void compaction_manager::submit_offstrategy(column_family* cf) {
                _stats.active_tasks++;
                task->setup_new_compaction();

+              return with_scheduling_group(_maintenance_sg.cpu, [this, task, cf] {
                return cf->run_offstrategy_compaction(task->compaction_data).then_wrapped([this, task] (future<> f) mutable {
                    _stats.active_tasks--;
                    task->finish_compaction();
@@ -698,6 +721,7 @@ void compaction_manager::submit_offstrategy(column_family* cf) {
                    _tasks.remove(task);
                    return make_ready_future<stop_iteration>(stop_iteration::yes);
                });
+              });
            });
        });
    });
@@ -714,9 +738,20 @@ inline bool compaction_manager::check_for_cleanup(column_family* cf) {

 future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compaction_type_options options, get_candidates_func get_func, can_purge_tombstones can_purge) {
    auto task = make_lw_shared<compaction_manager::task>(cf, options.type());
-    _tasks.push_back(task);

-    auto sstables = std::make_unique<std::vector<sstables::shared_sstable>>(get_func(*cf));
+    std::unique_ptr<std::vector<sstables::shared_sstable>> sstables;
+    lw_shared_ptr<compacting_sstable_registration> compacting;
+
+    // since we might potentially have ongoing compactions, and we
+    // must ensure that all sstables created before we run are included
+    // in the re-write, we need to barrier out any previously running
+    // compaction.
+    auto get_and_register_candidates_func = [this, &sstables, &compacting, &get_func] () mutable -> future<> {
+        sstables = std::make_unique<std::vector<sstables::shared_sstable>>(co_await get_func());
+        compacting = make_lw_shared<compacting_sstable_registration>(this, *sstables);
+    };
+
+    co_await cf->run_with_compaction_disabled(std::ref(get_and_register_candidates_func));
    // sort sstables by size in descending order, such that the smallest files will be rewritten first
    // (as sstable to be rewritten is popped off from the back of container), so rewrite will have higher
    // chance to succeed when the biggest files are reached.
@@ -724,10 +759,11 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
        return a->data_size() > b->data_size();
    });

-    auto compacting = make_lw_shared<compacting_sstable_registration>(this, *sstables);
    auto sstables_ptr = sstables.get();
    _stats.pending_tasks += sstables->size();

+    _tasks.push_back(task);
+
    task->compaction_done = do_until([this, sstables_ptr, task] { return sstables_ptr->empty() || !can_proceed(task); },
             [this, task, options, sstables_ptr, compacting, can_purge] () mutable {
        auto sst = sstables_ptr->back();
@@ -737,8 +773,10 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
            column_family& cf = *task->compacting_cf;
            auto sstable_level = sst->get_sstable_level();
            auto run_identifier = sst->run_identifier();
+
            auto sstable_set_snapshot = can_purge ? std::make_optional(cf.get_sstable_set()) : std::nullopt;
-            auto descriptor = sstables::compaction_descriptor({ sst }, std::move(sstable_set_snapshot), _maintenance_sg.io,
+            // FIXME: this compaction should run with maintenance priority.
+            auto descriptor = sstables::compaction_descriptor({ sst }, std::move(sstable_set_snapshot), service::get_local_compaction_priority(),
                sstable_level, sstables::compaction_descriptor::default_max_sstable_bytes, run_identifier, options);

            // Releases reference to cleaned sstable such that respective used disk space can be freed.
@@ -747,15 +785,14 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
            };

            return with_semaphore(_rewrite_sstables_sem, 1, [this, task, &cf, descriptor = std::move(descriptor), compacting] () mutable {
-              // Take write lock for cf to serialize cleanup/upgrade sstables/scrub with major compaction/reshape/reshard.
-              return with_lock(_compaction_locks[&cf].for_write(), [this, task, &cf, descriptor = std::move(descriptor), compacting] () mutable {
+              return with_lock(_compaction_locks[&cf].for_read(), [this, task, &cf, descriptor = std::move(descriptor), compacting] () mutable {
                _stats.pending_tasks--;
                _stats.active_tasks++;
                task->setup_new_compaction();
                task->output_run_identifier = descriptor.run_identifier;
                compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_compaction_controller.backlog_of_shares(200), _available_memory));
                return do_with(std::move(user_initiated), [this, &cf, descriptor = std::move(descriptor), task] (compaction_backlog_tracker& bt) mutable {
-                    return with_scheduling_group(_maintenance_sg.cpu, [this, &cf, descriptor = std::move(descriptor), task]() mutable {
+                    return with_scheduling_group(_compaction_controller.sg(), [this, &cf, descriptor = std::move(descriptor), task]() mutable {
                        return cf.compact_sstables(std::move(descriptor), task->compaction_data);
                    });
                });
@@ -783,7 +820,7 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
        _tasks.remove(task);
    });

-    return task->compaction_done.get_future().then([task] {});
+    co_return co_await task->compaction_done.get_future();
 }

 future<> compaction_manager::perform_sstable_scrub_validate_mode(column_family* cf) {
@@ -865,31 +902,29 @@ future<> compaction_manager::perform_cleanup(database& db, column_family* cf) {
        return make_exception_future<>(std::runtime_error(format("cleanup request failed: there is an ongoing cleanup on {}.{}",
            cf->schema()->ks_name(), cf->schema()->cf_name())));
    }
-    return seastar::async([this, cf, &db] {
+  // FIXME: indentation
+  auto sorted_owned_ranges = db.get_keyspace_local_ranges(cf->schema()->ks_name());
+  auto get_sstables = [this, &db, cf, sorted_owned_ranges] () -> future<std::vector<sstables::shared_sstable>> {
+    return seastar::async([this, &db, cf, sorted_owned_ranges = std::move(sorted_owned_ranges)] {
        auto schema = cf->schema();
-        auto sorted_owned_ranges = db.get_keyspace_local_ranges(schema->ks_name());
        auto sstables = std::vector<sstables::shared_sstable>{};
        const auto candidates = get_candidates(*cf);
        std::copy_if(candidates.begin(), candidates.end(), std::back_inserter(sstables), [&sorted_owned_ranges, schema] (const sstables::shared_sstable& sst) {
            seastar::thread::maybe_yield();
            return sorted_owned_ranges.empty() || needs_cleanup(sst, sorted_owned_ranges, schema);
        });
-        return std::tuple<dht::token_range_vector, std::vector<sstables::shared_sstable>>(sorted_owned_ranges, sstables);
-    }).then_unpack([this, cf, &db] (dht::token_range_vector owned_ranges, std::vector<sstables::shared_sstable> sstables) {
-        return rewrite_sstables(cf, sstables::compaction_type_options::make_cleanup(std::move(owned_ranges)),
-                [sstables = std::move(sstables)] (const table&) { return sstables; });
+       return sstables;
    });
+  };
+  return rewrite_sstables(cf, sstables::compaction_type_options::make_cleanup(std::move(sorted_owned_ranges)), std::move(get_sstables));
 }

 // Submit a column family to be upgraded and wait for its termination.
 future<> compaction_manager::perform_sstable_upgrade(database& db, column_family* cf, bool exclude_current_version) {
-    using shared_sstables = std::vector<sstables::shared_sstable>;
-    return do_with(shared_sstables{}, [this, &db, cf, exclude_current_version](shared_sstables& tables) {
-        // since we might potentially have ongoing compactions, and we
-        // must ensure that all sstables created before we run are included
-        // in the re-write, we need to barrier out any previously running
-        // compaction.
-        return cf->run_with_compaction_disabled([this, cf, &tables, exclude_current_version] {
+    auto get_sstables = [this, &db, cf, exclude_current_version] {
+            // FIXME: indentation
+            std::vector<sstables::shared_sstable> tables;
+
            auto last_version = cf->get_sstables_manager().get_highest_supported_format();

            for (auto& sst : get_candidates(*cf)) {
@@ -900,21 +935,17 @@ future<> compaction_manager::perform_sstable_upgrade(database& db, column_family
                    tables.emplace_back(sst);
                }
            }
-            return make_ready_future<>();
-        }).then([&db, cf] {
-             return db.get_keyspace_local_ranges(cf->schema()->ks_name());
-        }).then([this, &db, cf, &tables] (dht::token_range_vector owned_ranges) {
-            // doing a "cleanup" is about as compacting as we need
-            // to be, provided we get to decide the tables to process,
-            // and ignoring any existing operations.
-            // Note that we potentially could be doing multiple
-            // upgrades here in parallel, but that is really the users
-            // problem.
-            return rewrite_sstables(cf, sstables::compaction_type_options::make_upgrade(std::move(owned_ranges)), [&](auto&) mutable {
-                return std::exchange(tables, {});
-            });
-        });
-    });
+
+            return make_ready_future<std::vector<sstables::shared_sstable>>(tables);
+    };
+
+    // doing a "cleanup" is about as compacting as we need
+    // to be, provided we get to decide the tables to process,
+    // and ignoring any existing operations.
+    // Note that we potentially could be doing multiple
+    // upgrades here in parallel, but that is really the user's
+    // problem.
+    return rewrite_sstables(cf, sstables::compaction_type_options::make_upgrade(db.get_keyspace_local_ranges(cf->schema()->ks_name())), std::move(get_sstables));
 }

 // Submit a column family to be scrubbed and wait for its termination.
@@ -922,14 +953,10 @@ future<> compaction_manager::perform_sstable_scrub(column_family* cf, sstables::
    if (scrub_mode == sstables::compaction_type_options::scrub::mode::validate) {
        return perform_sstable_scrub_validate_mode(cf);
    }
-    // since we might potentially have ongoing compactions, and we
-    // must ensure that all sstables created before we run are scrubbed,
-    // we need to barrier out any previously running compaction.
-    return cf->run_with_compaction_disabled([this, cf, scrub_mode] {
-        return rewrite_sstables(cf, sstables::compaction_type_options::make_scrub(scrub_mode), [this] (const table& cf) {
-            return get_candidates(cf);
+        // FIXME: indentation
+        return rewrite_sstables(cf, sstables::compaction_type_options::make_scrub(scrub_mode), [this, cf] {
+            return make_ready_future<std::vector<sstables::shared_sstable>>(get_candidates(*cf));
        }, can_purge_tombstones::no);
-    });
 }

 future<> compaction_manager::remove(column_family* cf) {
@@ -979,7 +1006,7 @@ void compaction_manager::stop_compaction(sstring type) {
    }
    // FIXME: switch to task_stop(), and wait for their termination, so API user can know when compactions actually stopped.
    for (auto& task : _tasks) {
-        if (task->compaction_running && target_type == task->type) {
+        if (target_type == task->type) {
            task->compaction_data.stop("user request");
        }
    }
--- a/compaction/compaction_manager.hh
+++ b/compaction/compaction_manager.hh
@@ -178,7 +178,7 @@ private:
    maintenance_scheduling_group _maintenance_sg;
    size_t _available_memory;

-    using get_candidates_func = std::function<std::vector<sstables::shared_sstable>(const column_family&)>;
+    using get_candidates_func = std::function<future<std::vector<sstables::shared_sstable>>()>;
    class can_purge_tombstones_tag;
    using can_purge_tombstones = bool_class<can_purge_tombstones_tag>;

@@ -209,7 +209,7 @@ public:

    // Stop all fibers, without waiting. Safe to be called multiple times.
    void do_stop() noexcept;
-    void really_do_stop();
+    future<> really_do_stop();

    // Submit a column family to be compacted.
    void submit(column_family* cf);
--- a/compaction/leveled_compaction_strategy.cc
+++ b/compaction/leveled_compaction_strategy.cc
@@ -80,7 +80,11 @@ compaction_descriptor leveled_compaction_strategy::get_major_compaction_job(colu
 }

 void leveled_compaction_strategy::notify_completion(const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) {
-    if (removed.empty() || added.empty()) {
+    // All the updates here are only relevant for regular compaction's round-robin picking policy, and if
+    // last_compacted_keys wasn't generated by regular, it means regular is disabled since last restart,
+    // therefore we can skip the updates here until regular runs for the first time. Once it runs,
+    // it will be able to generate last_compacted_keys correctly by looking at metadata of files.
+    if (removed.empty() || added.empty() || !_last_compacted_keys) {
        return;
    }
    auto min_level = std::numeric_limits<uint32_t>::max();
--- a/compaction/time_window_compaction_strategy.cc
+++ b/compaction/time_window_compaction_strategy.cc
@@ -225,6 +225,7 @@ time_window_compaction_strategy::get_sstables_for_compaction(column_family& cf,
    auto gc_before = gc_clock::now() - cf.schema()->gc_grace_seconds();

    if (candidates.empty()) {
+        _estimated_remaining_tasks = 0;
        return compaction_descriptor();
    }

--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -1403,7 +1403,7 @@ serviceLevelOrRoleName returns [sstring name]
 						 std::transform($name.begin(), $name.end(), $name.begin(), ::tolower); }
 | t=STRING_LITERAL     { $name = sstring($t.text); }
 | t=QUOTED_NAME        { $name = sstring($t.text); }
-| k=unreserved_keyword { $name = sstring($t.text); 
+| k=unreserved_keyword { $name = k;
 						 std::transform($name.begin(), $name.end(), $name.begin(), ::tolower);}
 | QMARK {add_recognition_error("Bind variables cannot be used for service levels or role names");}
 ;
--- a/cql3/cql3_type.cc
+++ b/cql3/cql3_type.cc
@@ -25,6 +25,7 @@

 #include "cql3_type.hh"
 #include "cql3/util.hh"
+#include "exceptions/exceptions.hh"
 #include "ut_name.hh"
 #include "database.hh"
 #include "user_types_metadata.hh"
@@ -448,7 +449,20 @@ sstring maybe_quote(const sstring& identifier) {
    }

    if (!need_quotes) {
-        return identifier;
+        // A seemingly valid identifier matching [a-z][a-z0-9_]* may still
+        // need quoting if it is a CQL keyword, e.g., "to" (see issue #9450).
+        // While our parser Cql.g has different production rules for different
+        // types of identifiers (column names, table names, etc.), all of
+        // these behave identically for alphanumeric strings: they exclude
+        // many keywords but allow keywords listed as "unreserved keywords".
+        // So we can use any of them, for example cident.
+        try {
+            cql3::util::do_with_parser(identifier, std::mem_fn(&cql3_parser::CqlParser::cident));
+            return identifier;
+        } catch(exceptions::syntax_exception&) {
+            // This alphanumeric string is not a valid identifier, so fall
+            // through to have it quoted:
+        }
    }
    if (num_quotes == 0) {
        return make_sstring("\"", identifier, "\"");
--- a/cql3/cql_statement.hh
+++ b/cql3/cql_statement.hh
@@ -109,9 +109,7 @@ public:
    virtual seastar::future<seastar::shared_ptr<cql_transport::messages::result_message>>
        execute(query_processor& qp, service::query_state& state, const query_options& options) const = 0;

-    virtual bool depends_on_keyspace(const seastar::sstring& ks_name) const = 0;
-
-    virtual bool depends_on_column_family(const seastar::sstring& cf_name) const = 0;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const = 0;

    virtual seastar::shared_ptr<const metadata> get_result_metadata() const = 0;

--- a/cql3/expr/expression.cc
+++ b/cql3/expr/expression.cc
@@ -117,10 +117,44 @@ managed_bytes_opt get_value(const column_value& col, const column_value_eval_bag
        if (!col_type->is_map()) {
            throw exceptions::invalid_request_exception(format("subscripting non-map column {}", cdef->name_as_text()));
        }
-        const auto deserialized = cdef->type->deserialize(managed_bytes_view(*data.other_columns[data.sel.index_of(*cdef)]));
+        int32_t index = data.sel.index_of(*cdef);
+        if (index == -1) {
+            throw std::runtime_error(
+                    format("Column definition {} does not match any column in the query selection",
+                    cdef->name_as_text()));
+        }
+        const managed_bytes_opt& serialized = data.other_columns[index];
+        if (!serialized) {
+            // For null[i] we return null.
+            return std::nullopt;
+        }
+        const auto deserialized = cdef->type->deserialize(managed_bytes_view(*serialized));
        const auto& data_map = value_cast<map_type_impl::native_type>(deserialized);
        const auto key = evaluate_to_raw_view(col.sub, options);
        auto&& key_type = col_type->name_comparator();
+        if (key.is_null()) {
+            // For m[null] return null.
+            // This is different from Cassandra - which treats m[null]
+            // as an invalid request error. But m[null] -> null is more
+            // consistent with our usual null treatment (e.g., both
+            // null[2] and null < 2 return null). It will also allow us
+            // to support non-constant subscripts (e.g., m[a]) where "a"
+            // may be null in some rows and non-null in others, and it's
+            // not an error.
+            return std::nullopt;
+        }
+        if (key.is_unset_value()) {
+            // An m[?] with ? bound to UNSET_VALUE is an invalid query.
+            // We could have detected it earlier while binding, but since
+            // we currently don't, we must protect the following code
+            // which can't work with an UNSET_VALUE. Note that the
+            // placement of this check here means that in an empty table,
+            // where we never need to evaluate the filter expression, this
+            // error will not be detected.
+            throw exceptions::invalid_request_exception(
+                format("Unsupported unset map key for column {}",
+                    cdef->name_as_text()));
+        }
        const auto found = key.with_linearized([&] (bytes_view key_bv) {
            using entry = std::pair<data_value, data_value>;
            return std::find_if(data_map.cbegin(), data_map.cend(), [&] (const entry& element) {
@@ -135,8 +169,16 @@ managed_bytes_opt get_value(const column_value& col, const column_value_eval_bag
        case column_kind::clustering_key:
            return managed_bytes(data.clustering_key[cdef->id]);
        case column_kind::static_column:
-        case column_kind::regular_column:
-            return managed_bytes_opt(data.other_columns[data.sel.index_of(*cdef)]);
+            [[fallthrough]];
+        case column_kind::regular_column: {
+            int32_t index = data.sel.index_of(*cdef);
+            if (index == -1) {
+                throw std::runtime_error(
+                        format("Column definition {} does not match any column in the query selection",
+                        cdef->name_as_text()));
+            }
+            return managed_bytes_opt(data.other_columns[index]);
+        }
        default:
            throw exceptions::unsupported_operation_exception("Unknown column kind");
        }
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -970,7 +970,7 @@ bool query_processor::migration_subscriber::should_invalidate(
        sstring ks_name,
        std::optional<sstring> cf_name,
        ::shared_ptr<cql_statement> statement) {
-    return statement->depends_on_keyspace(ks_name) && (!cf_name || statement->depends_on_column_family(*cf_name));
+    return statement->depends_on(ks_name, cf_name);
 }

 future<> query_processor::query_internal(
--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -528,7 +528,7 @@ statement_restrictions::statement_restrictions(database& db,
    }

    if (!_nonprimary_key_restrictions->empty()) {
-        if (_has_queriable_regular_index) {
+        if (_has_queriable_regular_index && _partition_range_is_simple) {
            _uses_secondary_indexing = true;
        } else if (!allow_filtering) {
            throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
--- a/cql3/result_set.hh
+++ b/cql3/result_set.hh
@@ -193,7 +193,7 @@ public:

    template<typename RowComparator>
    void sort(const RowComparator& cmp) {
-        std::sort(_rows.begin(), _rows.end(), std::ref(cmp));
+        std::sort(_rows.begin(), _rows.end(), cmp);
    }

    metadata& get_metadata();
--- a/cql3/selection/selection.cc
+++ b/cql3/selection/selection.cc
@@ -450,11 +450,16 @@ bool result_set_builder::restrictions_filter::do_filter(const selection& selecti
    }

    auto clustering_columns_restrictions = _restrictions->get_clustering_columns_restrictions();
-    if (dynamic_pointer_cast<cql3::restrictions::multi_column_restriction>(clustering_columns_restrictions)) {
+    bool has_multi_col_clustering_restrictions =
+        dynamic_pointer_cast<cql3::restrictions::multi_column_restriction>(clustering_columns_restrictions) != nullptr;
+    if (has_multi_col_clustering_restrictions) {
        clustering_key_prefix ckey = clustering_key_prefix::from_exploded(clustering_key);
-        return expr::is_satisfied_by(
+        bool multi_col_clustering_satisfied = expr::is_satisfied_by(
                clustering_columns_restrictions->expression,
                partition_key, clustering_key, static_row, row, selection, _options);
+        if (!multi_col_clustering_satisfied) {
+            return false;
+        }
    }

    auto static_row_iterator = static_row.iterator();
@@ -502,6 +507,13 @@ bool result_set_builder::restrictions_filter::do_filter(const selection& selecti
            if (_skip_ck_restrictions) {
                continue;
            }
+            if (has_multi_col_clustering_restrictions) {
+                // Mixing multi column and single column restrictions on clustering
+                // key columns is forbidden.
+                // Since there are multi column restrictions we have to skip
+                // evaluating single column restrictions or we will get an error.
+                continue;
+            }
            auto clustering_key_restrictions_map = _restrictions->get_single_column_clustering_key_restrictions();
            auto restr_it = clustering_key_restrictions_map.find(cdef);
            if (restr_it == clustering_key_restrictions_map.end()) {
--- a/cql3/statements/authentication_statement.cc
+++ b/cql3/statements/authentication_statement.cc
@@ -46,13 +46,7 @@ uint32_t cql3::statements::authentication_statement::get_bound_terms() const {
    return 0;
 }

-bool cql3::statements::authentication_statement::depends_on_keyspace(
-                const sstring& ks_name) const {
-    return false;
-}
-
-bool cql3::statements::authentication_statement::depends_on_column_family(
-                const sstring& cf_name) const {
+bool cql3::statements::authentication_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
    return false;
 }

--- a/cql3/statements/authentication_statement.hh
+++ b/cql3/statements/authentication_statement.hh
@@ -55,9 +55,7 @@ public:

    uint32_t get_bound_terms() const override;

-    bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    bool depends_on_column_family(const sstring& cf_name) const override;
+    bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;

--- a/cql3/statements/authorization_statement.cc
+++ b/cql3/statements/authorization_statement.cc
@@ -48,13 +48,7 @@ uint32_t cql3::statements::authorization_statement::get_bound_terms() const {
    return 0;
 }

-bool cql3::statements::authorization_statement::depends_on_keyspace(
-                const sstring& ks_name) const {
-    return false;
-}
-
-bool cql3::statements::authorization_statement::depends_on_column_family(
-                const sstring& cf_name) const {
+bool cql3::statements::authorization_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
    return false;
 }

--- a/cql3/statements/authorization_statement.hh
+++ b/cql3/statements/authorization_statement.hh
@@ -59,9 +59,7 @@ public:

    uint32_t get_bound_terms() const override;

-    bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    bool depends_on_column_family(const sstring& cf_name) const override;
+    bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;

--- a/cql3/statements/batch_statement.cc
+++ b/cql3/statements/batch_statement.cc
@@ -98,14 +98,9 @@ batch_statement::batch_statement(type type_,
 {
 }

-bool batch_statement::depends_on_keyspace(const sstring& ks_name) const
+bool batch_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
 {
-    return false;
-}
-
-bool batch_statement::depends_on_column_family(const sstring& cf_name) const
-{
-    return false;
+    return boost::algorithm::any_of(_statements, [&ks_name, &cf_name] (auto&& s) { return s.statement->depends_on(ks_name, cf_name); });
 }

 uint32_t batch_statement::get_bound_terms() const
--- a/cql3/statements/batch_statement.hh
+++ b/cql3/statements/batch_statement.hh
@@ -115,9 +115,7 @@ public:
                    std::unique_ptr<attributes> attrs,
                    cql_stats& stats);

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual uint32_t get_bound_terms() const override;

--- a/cql3/statements/cf_prop_defs.cc
+++ b/cql3/statements/cf_prop_defs.cc
@@ -46,6 +46,7 @@
 #include "cdc/cdc_extension.hh"
 #include "gms/feature.hh"
 #include "gms/feature_service.hh"
+#include "utils/bloom_calculations.hh"

 #include <boost/algorithm/string/predicate.hpp>

@@ -168,6 +169,16 @@ void cf_prop_defs::validate(const database& db, const schema::extensions_map& sc
        throw exceptions::configuration_exception(KW_MAX_INDEX_INTERVAL + " must be greater than " + KW_MIN_INDEX_INTERVAL);
    }

+    if (get_simple(KW_BF_FP_CHANCE)) {
+        double bloom_filter_fp_chance = get_double(KW_BF_FP_CHANCE, 0/*not used*/);
+        double min_bloom_filter_fp_chance = utils::bloom_calculations::min_supported_bloom_filter_fp_chance();
+        if (bloom_filter_fp_chance <= min_bloom_filter_fp_chance || bloom_filter_fp_chance > 1.0) {
+            throw exceptions::configuration_exception(format(
+                "{} must be larger than {} and less than or equal to 1.0 (got {})",
+                KW_BF_FP_CHANCE, min_bloom_filter_fp_chance, bloom_filter_fp_chance));
+        }
+    }
+
    speculative_retry::from_sstring(get_string(KW_SPECULATIVE_RETRY, speculative_retry(speculative_retry::type::NONE, 0).to_sstring()));
 }

--- a/cql3/statements/modification_statement.cc
+++ b/cql3/statements/modification_statement.cc
@@ -571,12 +571,8 @@ modification_statement::validate(service::storage_proxy&, const service::client_
    }
 }

-bool modification_statement::depends_on_keyspace(const sstring& ks_name) const {
-    return keyspace() == ks_name;
-}
-
-bool modification_statement::depends_on_column_family(const sstring& cf_name) const {
-    return column_family() == cf_name;
+bool modification_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
+    return keyspace() == ks_name && (!cf_name || column_family() == *cf_name);
 }

 void modification_statement::add_operation(::shared_ptr<operation> op) {
--- a/cql3/statements/modification_statement.hh
+++ b/cql3/statements/modification_statement.hh
@@ -165,9 +165,7 @@ public:
    // Validate before execute, using client state and current schema
    void validate(service::storage_proxy&, const service::client_state& state) const override;

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    void add_operation(::shared_ptr<operation> op);

--- a/cql3/statements/schema_altering_statement.cc
+++ b/cql3/statements/schema_altering_statement.cc
@@ -67,12 +67,7 @@ future<> schema_altering_statement::grant_permissions_to_creator(const service::
    return make_ready_future<>();
 }

-bool schema_altering_statement::depends_on_keyspace(const sstring& ks_name) const
-{
-    return false;
-}
-
-bool schema_altering_statement::depends_on_column_family(const sstring& cf_name) const
+bool schema_altering_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
 {
    return false;
 }
--- a/cql3/statements/schema_altering_statement.hh
+++ b/cql3/statements/schema_altering_statement.hh
@@ -79,9 +79,7 @@ protected:
     */
    virtual future<> grant_permissions_to_creator(const service::client_state&) const;

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual uint32_t get_bound_terms() const override;

--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -194,12 +194,8 @@ void select_statement::validate(service::storage_proxy&, const service::client_s
    // Nothing to do, all validation has been done by raw_statemet::prepare()
 }

-bool select_statement::depends_on_keyspace(const sstring& ks_name) const {
-    return keyspace() == ks_name;
-}
-
-bool select_statement::depends_on_column_family(const sstring& cf_name) const {
-    return column_family() == cf_name;
+bool select_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
+    return keyspace() == ks_name && (!cf_name || column_family() == *cf_name);
 }

 const sstring& select_statement::keyspace() const {
@@ -995,6 +991,7 @@ lw_shared_ptr<const service::pager::paging_state> indexed_table_select_statement
    }

    auto paging_state_copy = make_lw_shared<service::pager::paging_state>(service::pager::paging_state(*paging_state));
+    paging_state_copy->set_remaining(internal_paging_size);
    paging_state_copy->set_partition_key(std::move(index_pk));
    paging_state_copy->set_clustering_key(std::move(index_ck));
    return std::move(paging_state_copy);
--- a/cql3/statements/select_statement.hh
+++ b/cql3/statements/select_statement.hh
@@ -127,8 +127,7 @@ public:
    virtual uint32_t get_bound_terms() const override;
    virtual future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;
    virtual void validate(service::storage_proxy&, const service::client_state& state) const override;
-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual future<::shared_ptr<cql_transport::messages::result_message>> execute(query_processor& qp,
        service::query_state& state, const query_options& options) const override;
--- a/cql3/statements/service_level_statement.cc
+++ b/cql3/statements/service_level_statement.cc
@@ -30,13 +30,7 @@ uint32_t service_level_statement::get_bound_terms() const {
    return 0;
 }

-bool service_level_statement::depends_on_keyspace(
-        const sstring &ks_name) const {
-    return false;
-}
-
-bool service_level_statement::depends_on_column_family(
-        const sstring &cf_name) const {
+bool service_level_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
    return false;
 }

--- a/cql3/statements/service_level_statement.hh
+++ b/cql3/statements/service_level_statement.hh
@@ -56,9 +56,7 @@ public:

    uint32_t get_bound_terms() const override;

-    bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    bool depends_on_column_family(const sstring& cf_name) const override;
+    bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    future<> check_access(service::storage_proxy& sp, const service::client_state& state) const override;

--- a/cql3/statements/sl_prop_defs.cc
+++ b/cql3/statements/sl_prop_defs.cc
@@ -43,7 +43,7 @@ void sl_prop_defs::validate() {
        data_value v = duration_type->deserialize(duration_type->from_string(*repr));
        cql_duration duration = static_pointer_cast<const duration_type_impl>(duration_type)->from_value(v);
        if (duration.months || duration.days) {
-            throw exceptions::invalid_request_exception("Timeout values cannot be longer than 24h");
+            throw exceptions::invalid_request_exception("Timeout values cannot be expressed in days/months");
        }
        if (duration.nanoseconds % 1'000'000 != 0) {
            throw exceptions::invalid_request_exception("Timeout values must be expressed in millisecond granularity");
--- a/cql3/statements/truncate_statement.cc
+++ b/cql3/statements/truncate_statement.cc
@@ -67,12 +67,7 @@ std::unique_ptr<prepared_statement> truncate_statement::prepare(database& db,cql
    return std::make_unique<prepared_statement>(::make_shared<truncate_statement>(*this));
 }

-bool truncate_statement::depends_on_keyspace(const sstring& ks_name) const
-{
-    return false;
-}
-
-bool truncate_statement::depends_on_column_family(const sstring& cf_name) const
+bool truncate_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
 {
    return false;
 }
--- a/cql3/statements/truncate_statement.hh
+++ b/cql3/statements/truncate_statement.hh
@@ -58,9 +58,7 @@ public:

    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;

--- a/cql3/statements/update_statement.cc
+++ b/cql3/statements/update_statement.cc
@@ -53,6 +53,7 @@
 #include "types/list.hh"
 #include "types/user.hh"
 #include "concrete_types.hh"
+#include "validation.hh"

 namespace cql3 {

@@ -251,6 +252,7 @@ insert_prepared_json_statement::build_partition_keys(const query_options& option
        exploded.emplace_back(json_value->second);
    }
    auto pkey = partition_key::from_optional_exploded(*s, std::move(exploded));
+    validation::validate_cql_key(*s, pkey);
    auto k = query::range<query::ring_position>::make_singular(dht::decorate_key(*s, std::move(pkey)));
    ranges.emplace_back(std::move(k));
    return ranges;
--- a/cql3/statements/use_statement.cc
+++ b/cql3/statements/use_statement.cc
@@ -74,12 +74,7 @@ std::unique_ptr<prepared_statement> use_statement::prepare(database& db, cql_sta

 }

-bool use_statement::depends_on_keyspace(const sstring& ks_name) const
-{
-    return false;
-}
-
-bool use_statement::depends_on_column_family(const sstring& cf_name) const
+bool use_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
 {
    return false;
 }
--- a/cql3/statements/use_statement.hh
+++ b/cql3/statements/use_statement.hh
@@ -59,9 +59,7 @@ public:

    virtual uint32_t get_bound_terms() const override;

-    virtual bool depends_on_keyspace(const seastar::sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const seastar::sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual seastar::future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;

--- a/cql3/type_json.cc
+++ b/cql3/type_json.cc
@@ -31,6 +31,8 @@
 #include "types/listlike_partial_deserializing_iterator.hh"
 #include "utils/managed_bytes.hh"
 #include "exceptions/exceptions.hh"
+#include <boost/algorithm/string/trim_all.hpp>
+#include <boost/algorithm/string.hpp>

 static inline bool is_control_char(char c) {
    return c >= 0 && c <= 0x1F;
@@ -212,6 +214,17 @@ struct from_json_object_visitor {
    }
    bytes operator()(const boolean_type_impl& t) {
        if (!value.IsBool()) {
+            if (value.IsString()) {
+                std::string str(rjson::to_string_view(value));
+                boost::trim_all(str);
+                boost::to_lower(str);
+
+                if (str == "true") {
+                    return t.decompose(true);
+                } else if (str == "false") {
+                    return t.decompose(false);
+                }
+            }
            throw marshal_exception(format("Invalid JSON object {}", value));
        }
        return t.decompose(value.GetBool());
--- a/cql3/util.hh
+++ b/cql3/util.hh
@@ -87,6 +87,13 @@ std::unique_ptr<cql3::statements::raw::select_statement> build_select_statement(
 /// forbids non-alpha-numeric characters in identifier names.
 /// Quoting involves wrapping the string in double-quotes ("). A double-quote
 /// character itself is quoted by doubling it.
+/// maybe_quote() also quotes reserved CQL keywords (e.g., "to", "where")
+/// but doesn't quote *unreserved* keywords (like ttl, int or as).
+/// Note that this means that if new reserved keywords are added to the
+/// parser, a saved output of maybe_quote() may no longer be parsable by
+/// the parser. To avoid this forward-compatibility issue, use quote() instead
+/// of maybe_quote() - to unconditionally quote an identifier even if it is
+/// lowercase and not (yet) a keyword.
 sstring maybe_quote(const sstring& s);

 // Check whether timestamp is not too far in the future as this probably
--- a/database.cc
+++ b/database.cc
@@ -926,10 +926,9 @@ bool database::update_column_family(schema_ptr new_schema) {
    return columns_changed;
 }

-future<> database::remove(const column_family& cf) noexcept {
+void database::remove(const table& cf) noexcept {
    auto s = cf.schema();
    auto& ks = find_keyspace(s->ks_name());
-    co_await _querier_cache.evict_all_for_table(s->id());
    _column_families.erase(s->id());
    ks.metadata()->remove_column_family(s);
    _ks_cf_to_uuid.erase(std::make_pair(s->ks_name(), s->cf_name()));
@@ -946,13 +945,20 @@ future<> database::drop_column_family(const sstring& ks_name, const sstring& cf_
    auto& ks = find_keyspace(ks_name);
    auto uuid = find_uuid(ks_name, cf_name);
    auto cf = _column_families.at(uuid);
-    co_await remove(*cf);
+    remove(*cf);
    cf->clear_views();
-    co_return co_await cf->await_pending_ops().then([this, &ks, cf, tsf = std::move(tsf), snapshot] {
-        return truncate(ks, *cf, std::move(tsf), snapshot).finally([this, cf] {
-            return cf->stop();
-        });
-    }).finally([cf] {});
+    co_await cf->await_pending_ops();
+    co_await _querier_cache.evict_all_for_table(cf->schema()->id());
+    std::exception_ptr ex;
+    try {
+        co_await truncate(ks, *cf, std::move(tsf), snapshot);
+    } catch (...) {
+        ex = std::current_exception();
+    }
+    co_await cf->stop();
+    if (ex) {
+        std::rethrow_exception(std::move(ex));
+    }
 }

 const utils::UUID& database::find_uuid(std::string_view ks, std::string_view cf) const {
@@ -1348,44 +1354,6 @@ database::existing_index_names(const sstring& ks_name, const sstring& cf_to_excl
    return names;
 }

-// Based on:
-//  - org.apache.cassandra.db.AbstractCell#reconcile()
-//  - org.apache.cassandra.db.BufferExpiringCell#reconcile()
-//  - org.apache.cassandra.db.BufferDeletedCell#reconcile()
-std::strong_ordering
-compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
-    if (left.timestamp() != right.timestamp()) {
-        return left.timestamp() <=> right.timestamp();
-    }
-    if (left.is_live() != right.is_live()) {
-        return left.is_live() ? std::strong_ordering::less : std::strong_ordering::greater;
-    }
-    if (left.is_live()) {
-        auto c = compare_unsigned(left.value(), right.value()) <=> 0;
-        if (c != 0) {
-            return c;
-        }
-        if (left.is_live_and_has_ttl() != right.is_live_and_has_ttl()) {
-            // prefer expiring cells.
-            return left.is_live_and_has_ttl() ? std::strong_ordering::greater : std::strong_ordering::less;
-        }
-        if (left.is_live_and_has_ttl() && left.expiry() != right.expiry()) {
-            return left.expiry() <=> right.expiry();
-        }
-    } else {
-        // Both are deleted
-        if (left.deletion_time() != right.deletion_time()) {
-            // Origin compares big-endian serialized deletion time. That's because it
-            // delegates to AbstractCell.reconcile() which compares values after
-            // comparing timestamps, which in case of deleted cells will hold
-            // serialized expiry.
-            return (uint64_t) left.deletion_time().time_since_epoch().count()
-                   <=> (uint64_t) right.deletion_time().time_since_epoch().count();
-        }
-    }
-    return std::strong_ordering::equal;
-}
-
 future<std::tuple<lw_shared_ptr<query::result>, cache_temperature>>
 database::query(schema_ptr s, const query::read_command& cmd, query::result_options opts, const dht::partition_range_vector& ranges,
                tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) {
--- a/database.hh
+++ b/database.hh
@@ -1384,6 +1384,7 @@ private:
    Future update_write_metrics(Future&& f);
    void update_write_metrics_for_timed_out_write();
    future<> create_keyspace(const lw_shared_ptr<keyspace_metadata>&, bool is_bootstrap, system_keyspace system);
+    void remove(const table&) noexcept;
 public:
    static utils::UUID empty_version;

@@ -1582,7 +1583,6 @@ public:

    bool update_column_family(schema_ptr s);
    future<> drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func, bool with_snapshot = true);
-    future<> remove(const column_family&) noexcept;

    const logalloc::region_group& dirty_memory_region_group() const {
        return _dirty_memory_manager.region_group();
--- a/db/batchlog_manager.cc
+++ b/db/batchlog_manager.cc
@@ -39,6 +39,7 @@
 */

 #include <chrono>
+#include <exception>
 #include <seastar/core/future-util.hh>
 #include <seastar/core/do_with.hh>
 #include <seastar/core/semaphore.hh>
@@ -306,6 +307,7 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
            } catch (no_such_keyspace& ex) {
                // should probably ignore and drop the batch
            } catch (...) {
+                blogger.warn("Replay failed (will retry): {}", std::current_exception());
                // timeout, overload etc.
                // Do _not_ remove the batch, assuning we got a node write error.
                // Since we don't have hints (which origin is satisfied with),
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -428,6 +428,8 @@ private:
    void abort_recycled_list(std::exception_ptr);
    void abort_deletion_promise(std::exception_ptr);

+    future<> recalculate_footprint();
+
    future<> rename_file(sstring, sstring) const;
    size_t max_request_controller_units() const;
    segment_id_type _ids = 0;
@@ -444,6 +446,7 @@ private:
    seastar::gate _gate;
    uint64_t _new_counter = 0;
    std::optional<size_t> _disk_write_alignment;
+    seastar::semaphore _reserve_recalculation_guard;
 };

 template<typename T>
@@ -512,6 +515,7 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
    uint64_t _file_pos = 0;
    uint64_t _flush_pos = 0;
    uint64_t _size_on_disk = 0;
+    uint64_t _waste = 0;

    size_t _alignment;

@@ -598,7 +602,7 @@ public:
            clogger.debug("Segment {} is no longer active and will submitted for delete now", *this);
            ++_segment_manager->totals.segments_destroyed;
            _segment_manager->totals.active_size_on_disk -= file_position();
-            _segment_manager->totals.wasted_size_on_disk -= (_size_on_disk - file_position());
+            _segment_manager->totals.wasted_size_on_disk -= _waste;
            _segment_manager->add_file_to_delete(_file_name, _desc);
        } else if (_segment_manager->cfg.warn_about_segments_left_on_disk_after_shutdown) {
            clogger.warn("Segment {} is dirty and is left on disk.", *this);
@@ -725,7 +729,8 @@ public:
        auto s = co_await sync();
        co_await flush();
        co_await terminate();
-        _segment_manager->totals.wasted_size_on_disk += (_size_on_disk - file_position());
+        _waste = _size_on_disk - file_position();
+        _segment_manager->totals.wasted_size_on_disk += _waste;
        co_return s;
    }
    future<sseg_ptr> do_flush(uint64_t pos) {
@@ -1223,6 +1228,7 @@ db::commitlog::segment_manager::segment_manager(config c)
    , _recycled_segments(std::numeric_limits<size_t>::max())
    , _reserve_replenisher(make_ready_future<>())
    , _background_sync(make_ready_future<>())
+    , _reserve_recalculation_guard(1)
 {
    assert(max_size > 0);
    assert(max_mutation_size < segment::multi_entry_size_magic);
@@ -1248,6 +1254,11 @@ future<> db::commitlog::segment_manager::replenish_reserve() {
        }
        try {
            gate::holder g(_gate);
+            auto guard = co_await get_units(_reserve_recalculation_guard, 1);
+            if (_reserve_segments.full()) {
+                // can happen if we recalculate
+                continue;
+            }
            // note: if we were strict with disk size, we would refuse to do this 
            // unless disk footprint is lower than threshold. but we cannot (yet?)
            // trust that flush logic will absolutely free up an existing 
@@ -1519,7 +1530,7 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:

        if (cfg.extensions && !cfg.extensions->commitlog_file_extensions().empty()) {
            for (auto * ext : cfg.extensions->commitlog_file_extensions()) {
-                auto nf = co_await ext->wrap_file(std::move(filename), f, flags);
+                auto nf = co_await ext->wrap_file(filename, f, flags);
                if (nf) {
                    f = std::move(nf);
                    align = is_overwrite ? f.disk_overwrite_dma_alignment() : f.disk_write_dma_alignment();
@@ -1530,12 +1541,21 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
        f = make_checked_file(commit_error_handler, std::move(f));
    } catch (...) {
        ep = std::current_exception();
-        commit_error_handler(ep);
+    }
+    if (ep) {
+        // do this early, so if we are to fast-fail the server,
+        // we do it before anything else can go wrong.
+        try {
+            commit_error_handler(ep);
+        } catch (...) {
+            ep = std::current_exception();
+        }
    }
    if (ep && f) {
        co_await f.close();
    }
    if (ep) {
+        add_file_to_delete(filename, d);
        co_return coroutine::exception(std::move(ep));
    }

@@ -1594,6 +1614,8 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
 }

 future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::new_segment() {
+    gate::holder g(_gate);
+
    if (_shutdown) {
        co_return coroutine::make_exception(std::runtime_error("Commitlog has been shut down. Cannot add data"));
    }
@@ -1628,22 +1650,23 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
            co_return _segments.back();
        }

-        if (_segment_allocating) {
-            co_await _segment_allocating->get_future(timeout);
-            continue;
-        }
-
-        promise<> p;
-        _segment_allocating.emplace(p.get_future());
-        auto finally = defer([&] () noexcept { _segment_allocating = std::nullopt; });
-        try {
-            gate::holder g(_gate);
-            auto s = co_await with_timeout(timeout, new_segment());
-            p.set_value();
-        } catch (...) {
-            p.set_exception(std::current_exception());
-            throw;
+        // #9896 - we don't want to issue a new_segment call until
+        // the old one has terminated with either result or exception.
+        // Do all waiting through the shared_future
+        if (!_segment_allocating) {
+            auto f = new_segment();
+            // must check that we are not already done.
+            if (f.available()) {
+                f.get(); // maybe force exception
+                continue;
+            }
+            _segment_allocating.emplace(f.discard_result().finally([this] {
+                // clear the shared_future _before_ resolving its contents
+                // (i.e. with result of this finally)
+                _segment_allocating = std::nullopt;
+            }));
        }
+        co_await _segment_allocating->get_future(timeout);
    }
 }

@@ -1865,6 +1888,8 @@ future<> db::commitlog::segment_manager::delete_segments(std::vector<sstring> fi

    std::exception_ptr recycle_error;

+    size_t num_deleted = 0;
+    bool except = false;
    while (!files.empty()) {
        auto filename = std::move(files.back());
        files.pop_back();
@@ -1914,8 +1939,10 @@ future<> db::commitlog::segment_manager::delete_segments(std::vector<sstring> fi
                }
            }
            co_await delete_file(filename);
+            ++num_deleted;
        } catch (...) {
            clogger.error("Could not delete segment {}: {}", filename, std::current_exception());
+            except = true;
        }
    }

@@ -1928,6 +1955,16 @@ future<> db::commitlog::segment_manager::delete_segments(std::vector<sstring> fi
    if (recycle_error && _recycled_segments.empty()) {
        abort_recycled_list(recycle_error);
    }
+    // If recycle failed and turned into a delete, we should fake-wakeup waiters
+    // since we might still have cleaned up disk space.
+    if (!recycle_error && num_deleted && cfg.reuse_segments && _recycled_segments.empty()) {
+        abort_recycled_list(std::make_exception_ptr(std::runtime_error("deleted files")));
+    }
+
+    // #9348 - if we had an exception, we can't trust our bookkeeping any more. recalculate.
+    if (except) {
+        co_await recalculate_footprint();
+    }
 }

 void db::commitlog::segment_manager::abort_recycled_list(std::exception_ptr ep) {
@@ -1942,6 +1979,67 @@ void db::commitlog::segment_manager::abort_deletion_promise(std::exception_ptr e
    std::exchange(_disk_deletions, {}).set_exception(ep);
 }

+future<> db::commitlog::segment_manager::recalculate_footprint() { // Rebuilds totals.total_size_on_disk from the segment lists and recycled files; failures are logged, not propagated.
+    try {
+        co_await do_pending_deletes();
+
+        auto guard = co_await get_units(_reserve_recalculation_guard, 1); // same semaphore replenish_reserve() takes before creating a reserve segment
+        auto segments_copy = _segments; // NOTE(review): not read again in this function
+        std::vector<sseg_ptr> reserves;
+        std::vector<sstring> recycles;
+        // Drain both queues into local vectors so their contents can be inspected.
+        while (!_reserve_segments.empty()) {
+            reserves.push_back(_reserve_segments.pop());
+        }
+        while (!_recycled_segments.empty()) {
+            recycles.push_back(_recycled_segments.pop());
+        }
+        // #9955 - the queues must be re-stocked before anything that can
+        // suspend this coroutine (any co_await). Both queues are eventually
+        // used through push/pop calls that _wait_ for a signal but do _not_
+        // re-verify the condition once they resume. So push copies back
+        // right away and inspect the local vectors instead.
+        for (auto& filename : recycles) {
+            _recycled_segments.push(sstring(filename));
+        }
+        for (auto& s : reserves) {
+            _reserve_segments.push(sseg_ptr(s)); // hand a copy of the pointer back to the queue
+        }
+
+        // First pass: estimate, counting every recycled file as max_size bytes.
+        uint64_t recycle_size = recycles.size() * max_size;
+        auto old = totals.total_size_on_disk; // NOTE(review): not read again in this function
+
+        totals.total_size_on_disk = recycle_size;
+        for (auto& s : _segments) {
+            totals.total_size_on_disk += s->_size_on_disk;
+        }
+        for (auto& s : reserves) {
+            totals.total_size_on_disk += s->_size_on_disk;
+        }
+
+        // Second pass: replace the recycled-file estimate with sizes read from disk.
+
+        uint64_t actual_recycled_size = 0;
+
+        try {
+            for (auto& filename : recycles) {
+                auto s = co_await seastar::file_size(filename);
+                actual_recycled_size += s;
+            }
+        } catch (...) {
+            clogger.error("Exception reading disk footprint ({}).", std::current_exception());
+            actual_recycled_size = recycle_size; // fall back to the estimate, so the correction below is zero
+        }
+
+        totals.total_size_on_disk += actual_recycled_size - recycle_size;
+        // Pushing entries back to the reserve/recycled queues above will have
+        // resumed any waiters, so there is nothing more to do here.
+    } catch (...) {
+        clogger.error("Exception recalculating disk footprint ({}). Values might be off...", std::current_exception());
+    }
+}
+
 future<> db::commitlog::segment_manager::do_pending_deletes() {
    auto ftc = std::exchange(_files_to_close, {});
    auto ftd = std::exchange(_files_to_delete, {});
--- a/db/config.cc
+++ b/db/config.cc
@@ -860,6 +860,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "Flush tables in the system_schema keyspace after schema modification. This is required for crash recovery, but slows down tests and can be disabled for them")
    , restrict_replication_simplestrategy(this, "restrict_replication_simplestrategy", liveness::LiveUpdate, value_status::Used, db::tri_mode_restriction_t::mode::FALSE, "Controls whether to disable SimpleStrategy replication. Can be true, false, or warn.")
    , restrict_dtcs(this, "restrict_dtcs", liveness::LiveUpdate, value_status::Used, db::tri_mode_restriction_t::mode::WARN, "Controls whether to prevent setting DateTieredCompactionStrategy. Can be true, false, or warn.")
+    , cache_index_pages(this, "cache_index_pages", liveness::LiveUpdate, value_status::Used, true,
+        "Keep SSTable index pages in the global cache after a SSTable read. Expected to improve performance for workloads with big partitions, but may degrade performance for workloads with small partitions.")
    , default_log_level(this, "default_log_level", value_status::Used)
    , logger_log_level(this, "logger_log_level", value_status::Used)
    , log_to_stdout(this, "log_to_stdout", value_status::Used)
--- a/db/config.hh
+++ b/db/config.hh
@@ -372,6 +372,8 @@ public:
    named_value<tri_mode_restriction> restrict_replication_simplestrategy;
    named_value<tri_mode_restriction> restrict_dtcs;

+    named_value<bool> cache_index_pages;
+
    seastar::logging_settings logging_settings(const boost::program_options::variables_map&) const;

    const db::extensions& extensions() const;
--- a/db/snapshot-ctl.cc
+++ b/db/snapshot-ctl.cc
@@ -119,8 +119,9 @@ future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<
        return check_snapshot_not_exist(ks_name, tag, tables).then([this, ks_name, tables, tag, sf] {
            return do_with(std::vector<sstring>(std::move(tables)),[this, ks_name, tag, sf](const std::vector<sstring>& tables) {
                return do_for_each(tables, [ks_name, tag, sf, this] (const sstring& table_name) {
-                    if (table_name.find(".") != sstring::npos) {
-                        throw std::invalid_argument("Cannot take a snapshot of a secondary index by itself. Run snapshot on the table that owns the index.");
+                    auto& cf = _db.local().find_column_family(ks_name, table_name);
+                    if (cf.schema()->is_view()) {
+                        throw std::invalid_argument("Do not take a snapshot of a materialized view or a secondary index by itself. Run snapshot on the base table instead.");
                    }
                    return _db.invoke_on_all([ks_name, table_name, tag, sf] (database &db) {
                        auto& cf = db.find_column_family(ks_name, table_name);
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -350,7 +350,11 @@ public:
    view_filter_checking_visitor(const schema& base, const view_info& view)
        : _base(base)
        , _view(view)
-        , _selection(cql3::selection::selection::wildcard(_base.shared_from_this()))
+        , _selection(cql3::selection::selection::for_columns(_base.shared_from_this(),
+            boost::copy_range<std::vector<const column_definition*>>(
+                _base.regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return &cdef; }))
+            )
+        )
    {}

    void accept_new_partition(const partition_key& key, uint64_t row_count) {
@@ -887,13 +891,18 @@ void view_updates::generate_update(
    bool same_row = true;
    for (auto col_id : col_ids) {
        auto* after = update.cells().find_cell(col_id);
-        // Note: multi-cell columns can't be part of the primary key.
        auto& cdef = _base->regular_column_at(col_id);
        if (existing) {
            auto* before = existing->cells().find_cell(col_id);
+            // Note that this cell is necessarily atomic, because col_ids are
+            // view key columns, and keys must be atomic.
            if (before && before->as_atomic_cell(cdef).is_live()) {
                if (after && after->as_atomic_cell(cdef).is_live()) {
-                    auto cmp = compare_atomic_cell_for_merge(before->as_atomic_cell(cdef), after->as_atomic_cell(cdef));
+                    // We need to compare just the values of the keys, not
+                    // metadata like the timestamp. This is because below,
+                    // if the old and new view row have the same key, we need
+                    // to be sure to reach the update_entry() case.
+                    auto cmp = compare_unsigned(before->as_atomic_cell(cdef).value(), after->as_atomic_cell(cdef).value());
                    if (cmp != 0) {
                        same_row = false;
                    }
@@ -913,7 +922,13 @@ void view_updates::generate_update(
            if (same_row) {
                update_entry(base_key, update, *existing, now);
            } else {
-                replace_entry(base_key, update, *existing, now);
+                // This code doesn't work if the old and new view row have the
+                // same key, because if they do we get both data and tombstone
+                // for the same timestamp (now) and the tombstone wins. This
+                // is why we need the "same_row" case above - it's not just a
+                // performance optimization.
+                delete_old_entry(base_key, *existing, update, now);
+                create_entry(base_key, update, now);
            }
        } else {
            delete_old_entry(base_key, *existing, update, now);
@@ -1320,7 +1335,7 @@ future<> mutate_MV(
            auto mut_ptr = remote_endpoints.empty() ? std::make_unique<frozen_mutation>(std::move(mut.fm)) : std::make_unique<frozen_mutation>(mut.fm);
            tracing::trace(tr_state, "Locally applying view update for {}.{}; base token = {}; view token = {}",
                    mut.s->ks_name(), mut.s->cf_name(), base_token, view_token);
-            local_view_update = service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr, std::move(tr_state), db::commitlog::force_sync::no).then_wrapped(
+            local_view_update = service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr, tr_state, db::commitlog::force_sync::no).then_wrapped(
                    [s = mut.s, &stats, &cf_stats, tr_state, base_token, view_token, my_address, mut_ptr = std::move(mut_ptr),
                            units = sem_units.split(sem_units.count())] (future<>&& f) {
                --stats.writes;
--- a/db/view/view.hh
+++ b/db/view/view.hh
@@ -164,10 +164,7 @@ private:
    void delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now);
    void do_delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now);
    void update_entry(const partition_key& base_key, const clustering_row& update, const clustering_row& existing, gc_clock::time_point now);
-    void replace_entry(const partition_key& base_key, const clustering_row& update, const clustering_row& existing, gc_clock::time_point now) {
-        create_entry(base_key, update, now);
-        delete_old_entry(base_key, existing, update, now);
-    }
+    void update_entry_for_computed_column(const partition_key& base_key, const clustering_row& update, const std::optional<clustering_row>& existing, gc_clock::time_point now);
 };

 class view_update_builder {
--- a/dirty_memory_manager.hh
+++ b/dirty_memory_manager.hh
@@ -215,6 +215,12 @@ public:
        });
    }

+    future<flush_permit> get_all_flush_permits() { // Like a flush-permit request, but takes every background-work unit at once.
+        return get_units(_background_work_flush_serializer, _max_background_work).then([this] (auto&& units) { // waits for _max_background_work units of the serializer semaphore
+            return this->get_flush_permit(std::move(units)); // the acquired units are handed to get_flush_permit()
+        });
+    }
+
    bool has_extraneous_flushes_requested() const {
        return _extraneous_flushes > 0;
    }
--- a/dist/common/scripts/scylla-housekeeping
+++ b/dist/common/scripts/scylla-housekeeping
@@ -100,6 +100,7 @@ def version_compare(a, b):
 def create_uuid_file(fl):
    with open(args.uuid_file, 'w') as myfile:
        myfile.write(str(uuid.uuid1()) + "\n")
+    os.chmod(args.uuid_file, 0o644)


 def sanitize_version(version):
--- a/dist/common/scripts/scylla_coredump_setup
+++ b/dist/common/scripts/scylla_coredump_setup
@@ -127,10 +127,14 @@ WantedBy=multi-user.target
        #  - Storage: /path/to/file (inacessible)
        #  - Storage: /path/to/file
        #
+        # Since systemd v248, the output for an available coredump file looks like this:
+        #  - Storage: /path/to/file (present)
+        # We need to support both versions.
+        #
        # reference: https://github.com/systemd/systemd/commit/47f50642075a7a215c9f7b600599cbfee81a2913

        corefail = False
-        res = re.findall(r'Storage: (.*)$', coreinfo, flags=re.MULTILINE)
+        res = re.findall(r'Storage: (\S+)(?: \(.+\))?$', coreinfo, flags=re.MULTILINE)
        # v232 or later
        if res:
            corepath = res[0]
--- a/dist/common/scripts/scylla_io_setup
+++ b/dist/common/scripts/scylla_io_setup
@@ -278,6 +278,66 @@ if __name__ == "__main__":
                    disk_properties["read_bandwidth"] = 2527296683 * nr_disks
                    disk_properties["write_iops"] = 156326 * nr_disks
                    disk_properties["write_bandwidth"] = 1063657088 * nr_disks
+            elif idata.instance() == "im4gn.large":
+                disk_properties["read_iops"] = 33943
+                disk_properties["read_bandwidth"] = 288433525
+                disk_properties["write_iops"] = 27877
+                disk_properties["write_bandwidth"] = 126864680
+            elif idata.instance() == "im4gn.xlarge":
+                disk_properties["read_iops"] = 68122
+                disk_properties["read_bandwidth"] = 576603520
+                disk_properties["write_iops"] = 55246
+                disk_properties["write_bandwidth"] = 254534954
+            elif idata.instance() == "im4gn.2xlarge":
+                disk_properties["read_iops"] = 136422
+                disk_properties["read_bandwidth"] = 1152663765
+                disk_properties["write_iops"] = 92184
+                disk_properties["write_bandwidth"] = 508926453
+            elif idata.instance() == "im4gn.4xlarge":
+                disk_properties["read_iops"] = 273050
+                disk_properties["read_bandwidth"] = 1638427264
+                disk_properties["write_iops"] = 92173
+                disk_properties["write_bandwidth"] = 1027966826
+            elif idata.instance() == "im4gn.8xlarge":
+                disk_properties["read_iops"] = 250241 * nr_disks
+                disk_properties["read_bandwidth"] = 1163130709 * nr_disks
+                disk_properties["write_iops"] = 86374 * nr_disks
+                disk_properties["write_bandwidth"] = 977617664 * nr_disks
+            elif idata.instance() == "im4gn.16xlarge":
+                disk_properties["read_iops"] = 273030 * nr_disks
+                disk_properties["read_bandwidth"] = 1638211413 * nr_disks
+                disk_properties["write_iops"] = 92607 * nr_disks
+                disk_properties["write_bandwidth"] = 1028340266 * nr_disks
+            elif idata.instance() == "is4gen.medium":
+                disk_properties["read_iops"] = 33965
+                disk_properties["read_bandwidth"] = 288462506
+                disk_properties["write_iops"] = 27876
+                disk_properties["write_bandwidth"] = 126954200
+            elif idata.instance() == "is4gen.large":
+                disk_properties["read_iops"] = 68131
+                disk_properties["read_bandwidth"] = 576654869
+                disk_properties["write_iops"] = 55257
+                disk_properties["write_bandwidth"] = 254551002
+            elif idata.instance() == "is4gen.xlarge":
+                disk_properties["read_iops"] = 136413
+                disk_properties["read_bandwidth"] = 1152747904
+                disk_properties["write_iops"] = 92180
+                disk_properties["write_bandwidth"] = 508889546
+            elif idata.instance() == "is4gen.2xlarge":
+                disk_properties["read_iops"] = 273038
+                disk_properties["read_bandwidth"] = 1628982613
+                disk_properties["write_iops"] = 92182
+                disk_properties["write_bandwidth"] = 1027983530
+            elif idata.instance() == "is4gen.4xlarge":
+                disk_properties["read_iops"] = 260493 * nr_disks
+                disk_properties["read_bandwidth"] = 1217396928 * nr_disks
+                disk_properties["write_iops"] = 83169 * nr_disks
+                disk_properties["write_bandwidth"] = 1000390784 * nr_disks
+            elif idata.instance() == "is4gen.8xlarge":
+                disk_properties["read_iops"] = 273021 * nr_disks
+                disk_properties["read_bandwidth"] = 1656354602 * nr_disks
+                disk_properties["write_iops"] = 92233 * nr_disks
+                disk_properties["write_bandwidth"] = 1028010325 * nr_disks
            properties_file = open(etcdir() + "/scylla.d/io_properties.yaml", "w")
            yaml.dump({ "disks": [ disk_properties ] }, properties_file,  default_flow_style=False)
            ioconf = open(etcdir() + "/scylla.d/io.conf", "w")
--- a/dist/common/scripts/scylla_ntp_setup
+++ b/dist/common/scripts/scylla_ntp_setup
@@ -66,18 +66,18 @@ if __name__ == '__main__':

    target = None
    if os.path.exists('/lib/systemd/systemd-timesyncd'):
-        if systemd_unit('systemd-timesyncd').is_active():
+        if systemd_unit('systemd-timesyncd').is_active() == 'active':
            print('ntp is already configured, skip setup')
            sys.exit(0)
        target = 'systemd-timesyncd'
    if shutil.which('chronyd'):
-        if get_chrony_unit().is_active():
+        if get_chrony_unit().is_active() == 'active':
            print('ntp is already configured, skip setup')
            sys.exit(0)
        if not target:
            target = 'chrony'
    if shutil.which('ntpd'):
-        if get_ntp_unit().is_active():
+        if get_ntp_unit().is_active() == 'active':
            print('ntp is already configured, skip setup')
            sys.exit(0)
        if not target:
--- a/dist/common/scripts/scylla_raid_setup
+++ b/dist/common/scripts/scylla_raid_setup
@@ -117,10 +117,11 @@ if __name__ == '__main__':
        pkg_install('xfsprogs')
    if not shutil.which('mdadm'):
        pkg_install('mdadm')
-    try:
-        md_service = systemd_unit('mdmonitor.service')
-    except SystemdException:
-        md_service = systemd_unit('mdadm.service')
+    if args.raid_level != '0':
+        try:
+            md_service = systemd_unit('mdmonitor.service')
+        except SystemdException:
+            md_service = systemd_unit('mdadm.service')

    print('Creating {type} for scylla using {nr_disk} disk(s): {disks}'.format(type='fRAID{args.raid_level}' if raid else 'XFS volume', nr_disk=len(disks), disks=args.disks))
    procs=[]
@@ -164,14 +165,15 @@ if __name__ == '__main__':

    uuid = run(f'blkid -s UUID -o value {fsdev}', shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
    after = 'local-fs.target'
-    if raid:
+    wants = ''
+    if raid and args.raid_level != '0':
        after += f' {md_service}'
+        wants = f'\nWants={md_service}'
    unit_data = f'''
 [Unit]
 Description=Scylla data directory
 Before=scylla-server.service
-After={after}
-Wants={md_service}
+After={after}{wants}
 DefaultDependencies=no

 [Mount]
@@ -195,7 +197,8 @@ WantedBy=multi-user.target
            f.write(f'RequiresMountsFor={mount_at}\n')

    systemd_unit.reload()
-    md_service.start()
+    if args.raid_level != '0':
+        md_service.start()
    mount = systemd_unit(mntunit_bn)
    mount.start()
    if args.enable_on_nextboot:
--- a/dist/common/scripts/scylla_setup
+++ b/dist/common/scripts/scylla_setup
@@ -370,6 +370,10 @@ if __name__ == '__main__':
            version_check = interactive_ask_service('Do you want to enable Scylla to check if there is a newer version of Scylla available?', 'Yes - start the Scylla-housekeeping service to check for a newer version. This check runs periodically. No - skips this step.', version_check)
            args.no_version_check = not version_check
            if version_check:
+                cfg = sysconfig_parser(sysconfdir_p() / 'scylla-housekeeping')
+                repo_files = cfg.get('REPO_FILES')
+                for f in glob.glob(repo_files):
+                    os.chmod(f, 0o644)
                with open('/etc/scylla.d/housekeeping.cfg', 'w') as f:
                    f.write('[housekeeping]\ncheck-version: True\n')
                os.chmod('/etc/scylla.d/housekeeping.cfg', 0o644)
--- a/dist/common/scripts/scylla_util.py
+++ b/dist/common/scripts/scylla_util.py
@@ -674,7 +674,7 @@ class aws_instance:
        return self._type.split(".")[0]

    def is_supported_instance_class(self):
-        if self.instance_class() in ['i2', 'i3', 'i3en', 'c5d', 'm5d', 'm5ad', 'r5d', 'z1d', 'c6gd', 'm6gd', 'r6gd', 'x2gd']:
+        if self.instance_class() in ['i2', 'i3', 'i3en', 'c5d', 'm5d', 'm5ad', 'r5d', 'z1d', 'c6gd', 'm6gd', 'r6gd', 'x2gd', 'im4gn', 'is4gen']:
            return True
        return False

@@ -683,7 +683,7 @@ class aws_instance:
        instance_size = self.instance_size()
        if instance_class in ['c3', 'c4', 'd2', 'i2', 'r3']:
            return 'ixgbevf'
-        if instance_class in ['a1', 'c5', 'c5a', 'c5d', 'c5n', 'c6g', 'c6gd', 'f1', 'g3', 'g4', 'h1', 'i3', 'i3en', 'inf1', 'm5', 'm5a', 'm5ad', 'm5d', 'm5dn', 'm5n', 'm6g', 'm6gd', 'p2', 'p3', 'r4', 'r5', 'r5a', 'r5ad', 'r5b', 'r5d', 'r5dn', 'r5n', 't3', 't3a', 'u-6tb1', 'u-9tb1', 'u-12tb1', 'u-18tn1', 'u-24tb1', 'x1', 'x1e', 'z1d', 'c6g', 'c6gd', 'm6g', 'm6gd', 't4g', 'r6g', 'r6gd', 'x2gd']:
+        if instance_class in ['a1', 'c5', 'c5a', 'c5d', 'c5n', 'c6g', 'c6gd', 'f1', 'g3', 'g4', 'h1', 'i3', 'i3en', 'inf1', 'm5', 'm5a', 'm5ad', 'm5d', 'm5dn', 'm5n', 'm6g', 'm6gd', 'p2', 'p3', 'r4', 'r5', 'r5a', 'r5ad', 'r5b', 'r5d', 'r5dn', 'r5n', 't3', 't3a', 'u-6tb1', 'u-9tb1', 'u-12tb1', 'u-18tn1', 'u-24tb1', 'x1', 'x1e', 'z1d', 'c6g', 'c6gd', 'm6g', 'm6gd', 't4g', 'r6g', 'r6gd', 'x2gd', 'im4gn', 'is4gen']:
            return 'ena'
        if instance_class == 'm4':
            if instance_size == '16xlarge':
@@ -1041,7 +1041,7 @@ class systemd_unit:
        return run('systemctl {} disable {}'.format(self.ctlparam, self._unit), shell=True, check=True)

    def is_active(self):
-        return True if run('systemctl {} is-active {}'.format(self.ctlparam, self._unit), shell=True, capture_output=True, encoding='utf-8').stdout.strip() == 'active' else False
+        return run('systemctl {} is-active {}'.format(self.ctlparam, self._unit), shell=True, capture_output=True, encoding='utf-8').stdout.strip()

    def mask(self):
        return run('systemctl {} mask {}'.format(self.ctlparam, self._unit), shell=True, check=True)
--- a/dist/common/supervisor/scylla_util.sh
+++ b/dist/common/supervisor/scylla_util.sh
@@ -6,12 +6,16 @@ is_nonroot() {
    [ -f "$scylladir"/SCYLLA-NONROOT-FILE ]
 }

+is_container() { # succeeds when the SCYLLA-CONTAINER-FILE marker file exists under $scylladir
+    [ -f "$scylladir"/SCYLLA-CONTAINER-FILE ]
+}
+
 is_privileged() {
    [ ${EUID:-${UID}} = 0 ]
 }

 execsudo() {
-    if is_nonroot; then
+    if is_nonroot || is_container; then
        exec "$@"
    else
        exec sudo -u scylla -g scylla "$@"
--- a/dist/docker/debian/build_docker.sh
+++ b/dist/docker/debian/build_docker.sh
@@ -25,6 +25,10 @@ product="$(<build/SCYLLA-PRODUCT-FILE)"
 version="$(<build/SCYLLA-VERSION-FILE)"
 release="$(<build/SCYLLA-RELEASE-FILE)"

+if [[ "$version" = *rc* ]]; then
+ version=$(echo $version |sed 's/\(.*\)\.)*/\1~/')
+fi
+
 mode="release"

 if uname -m | grep x86_64 ; then
@@ -93,12 +97,14 @@ run apt-get -y install hostname supervisor openssh-server openssh-client openjdk
 run locale-gen en_US.UTF-8
 run bash -ec "dpkg -i packages/*.deb"
 run apt-get -y clean all
-run bash -ec "cat /scylla_bashrc >> /etc/bashrc"
+run bash -ec "cat /scylla_bashrc >> /etc/bash.bashrc"
 run mkdir -p /etc/supervisor.conf.d
 run mkdir -p /var/log/scylla
 run chown -R scylla:scylla /var/lib/scylla
+run sed -i -e 's/^SCYLLA_ARGS=".*"$/SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --network-stack posix"/' /etc/default/scylla-server

 run mkdir -p /opt/scylladb/supervisor
+run touch /opt/scylladb/SCYLLA-CONTAINER-FILE
 bcp dist/common/supervisor/scylla-server.sh /opt/scylladb/supervisor/scylla-server.sh
 bcp dist/common/supervisor/scylla-jmx.sh /opt/scylladb/supervisor/scylla-jmx.sh
 bcp dist/common/supervisor/scylla-node-exporter.sh /opt/scylladb/supervisor/scylla-node-exporter.sh
--- a/dist/docker/etc/supervisord.conf.d/scylla-server.conf
+++ b/dist/docker/etc/supervisord.conf.d/scylla-server.conf
@@ -1,4 +1,4 @@
-[program:scylla-server]
+[program:scylla]
 command=/opt/scylladb/supervisor/scylla-server.sh
 stdout_logfile=/dev/stdout
 stdout_logfile_maxbytes=0
--- a/dist/docker/etc/sysconfig/scylla-server
+++ b/dist/docker/etc/sysconfig/scylla-server
@@ -1,41 +0,0 @@
-# choose following mode: virtio, dpdk, posix
-NETWORK_MODE=posix
-
-# tap device name(virtio)
-TAP=tap0
-
-# bridge device name (virtio)
-BRIDGE=virbr0
-
-# ethernet device name
-IFNAME=eth0
-
-# setup NIC's and disks' interrupts, RPS, XPS, nomerges and I/O scheduler (posix)
-SET_NIC_AND_DISKS=no
-
-# ethernet device driver (dpdk)
-ETHDRV=
-
-# ethernet device PCI ID (dpdk)
-ETHPCIID=
-
-# number of hugepages
-NR_HUGEPAGES=64
-
-# user for process (must be root for dpdk)
-USER=scylla
-
-# group for process
-GROUP=scylla
-
-# scylla home dir
-SCYLLA_HOME=/var/lib/scylla
-
-# scylla config dir
-SCYLLA_CONF=/etc/scylla
-
-# scylla arguments
-SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --network-stack posix"
-
-# setup as AMI instance
-AMI=no
--- a/dist/docker/scyllasetup.py
+++ b/dist/docker/scyllasetup.py
@@ -121,12 +121,13 @@ class ScyllaSetup:
        if self._apiAddress is not None:
            args += ["--api-address %s" % self._apiAddress]

-        if self._alternatorPort is not None:
+        if self._alternatorAddress is not None:
            args += ["--alternator-address %s" % self._alternatorAddress]
+
+        if self._alternatorPort is not None:
            args += ["--alternator-port %s" % self._alternatorPort]

        if self._alternatorHttpsPort is not None:
-            args += ["--alternator-address %s" % self._alternatorAddress]
            args += ["--alternator-https-port %s" % self._alternatorHttpsPort]

        if self._alternatorWriteIsolation is not None:
--- a/flat_mutation_reader.cc
+++ b/flat_mutation_reader.cc
@@ -45,7 +45,7 @@
 logging::logger fmr_logger("flat_mutation_reader");

 flat_mutation_reader& flat_mutation_reader::operator=(flat_mutation_reader&& o) noexcept {
-    if (_impl) {
+    if (_impl && _impl->is_close_required()) {
        impl* ip = _impl.get();
        // Abort to enforce calling close() before readers are closed
        // to prevent leaks and potential use-after-free due to background
@@ -58,7 +58,7 @@ flat_mutation_reader& flat_mutation_reader::operator=(flat_mutation_reader&& o)
 }

 flat_mutation_reader::~flat_mutation_reader() {
-    if (_impl) {
+    if (_impl && _impl->is_close_required()) {
        impl* ip = _impl.get();
        // Abort to enforce calling close() before readers are closed
        // to prevent leaks and potential use-after-free due to background
@@ -1344,7 +1344,7 @@ void mutation_fragment_stream_validating_filter::on_end_of_stream() {
 }

 flat_mutation_reader_v2& flat_mutation_reader_v2::operator=(flat_mutation_reader_v2&& o) noexcept {
-    if (_impl) {
+    if (_impl && _impl->is_close_required()) {
        impl* ip = _impl.get();
        // Abort to enforce calling close() before readers are closed
        // to prevent leaks and potential use-after-free due to background
@@ -1357,7 +1357,7 @@ flat_mutation_reader_v2& flat_mutation_reader_v2::operator=(flat_mutation_reader
 }

 flat_mutation_reader_v2::~flat_mutation_reader_v2() {
-    if (_impl) {
+    if (_impl && _impl->is_close_required()) {
        impl* ip = _impl.get();
        // Abort to enforce calling close() before readers are closed
        // to prevent leaks and potential use-after-free due to background
--- a/flat_mutation_reader.hh
+++ b/flat_mutation_reader.hh
@@ -142,6 +142,7 @@ public:
    private:
        tracked_buffer _buffer;
        size_t _buffer_size = 0;
+        bool _close_required = false;
    protected:
        size_t max_buffer_size_in_bytes = default_max_buffer_size_in_bytes();
        bool _end_of_stream = false;
@@ -175,6 +176,8 @@ public:
        bool is_end_of_stream() const { return _end_of_stream; }
        bool is_buffer_empty() const { return _buffer.empty(); }
        bool is_buffer_full() const { return _buffer_size >= max_buffer_size_in_bytes; }
+        bool is_close_required() const { return _close_required; }
+        void set_close_required() { _close_required = true; }
        static constexpr size_t default_max_buffer_size_in_bytes() { return 8 * 1024; }

        mutation_fragment pop_mutation_fragment() {
@@ -506,9 +509,15 @@ public:
    //
    // Can be used to skip over entire partitions if interleaved with
    // `operator()()` calls.
-    future<> next_partition() { return _impl->next_partition(); }
+    future<> next_partition() {
+        _impl->set_close_required();
+        return _impl->next_partition();
+    }

-    future<> fill_buffer() { return _impl->fill_buffer(); }
+    future<> fill_buffer() {
+        _impl->set_close_required();
+        return _impl->fill_buffer();
+    }

    // Changes the range of partitions to pr. The range can only be moved
    // forwards. pr.begin() needs to be larger than pr.end() of the previousl
@@ -517,6 +526,7 @@ public:
    // pr needs to be valid until the reader is destroyed or fast_forward_to()
    // is called again.
    future<> fast_forward_to(const dht::partition_range& pr) {
+        _impl->set_close_required();
        return _impl->fast_forward_to(pr);
    }
    // Skips to a later range of rows.
@@ -546,6 +556,7 @@ public:
    // In particular one must first enter a partition by fetching a `partition_start`
    // fragment before calling `fast_forward_to`.
    future<> fast_forward_to(position_range cr) {
+        _impl->set_close_required();
        return _impl->fast_forward_to(std::move(cr));
    }
    // Closes the reader.
--- a/flat_mutation_reader_v2.hh
+++ b/flat_mutation_reader_v2.hh
@@ -177,6 +177,7 @@ public:
    private:
        tracked_buffer _buffer;
        size_t _buffer_size = 0;
+        bool _close_required = false;
    protected:
        size_t max_buffer_size_in_bytes = default_max_buffer_size_in_bytes();

@@ -216,6 +217,8 @@ public:
        bool is_end_of_stream() const { return _end_of_stream; }
        bool is_buffer_empty() const { return _buffer.empty(); }
        bool is_buffer_full() const { return _buffer_size >= max_buffer_size_in_bytes; }
+        bool is_close_required() const { return _close_required; }
+        void set_close_required() { _close_required = true; }
        static constexpr size_t default_max_buffer_size_in_bytes() { return 8 * 1024; }

        mutation_fragment_v2 pop_mutation_fragment() {
@@ -547,9 +550,15 @@ public:
    //
    // Can be used to skip over entire partitions if interleaved with
    // `operator()()` calls.
-    future<> next_partition() { return _impl->next_partition(); }
+    future<> next_partition() {
+        _impl->set_close_required();
+        return _impl->next_partition();
+    }

-    future<> fill_buffer() { return _impl->fill_buffer(); }
+    future<> fill_buffer() {
+        _impl->set_close_required();
+        return _impl->fill_buffer();
+    }

    // Changes the range of partitions to pr. The range can only be moved
    // forwards. pr.begin() needs to be larger than pr.end() of the previousl
@@ -558,6 +567,7 @@ public:
    // pr needs to be valid until the reader is destroyed or fast_forward_to()
    // is called again.
    future<> fast_forward_to(const dht::partition_range& pr) {
+        _impl->set_close_required();
        return _impl->fast_forward_to(pr);
    }
    // Skips to a later range of rows.
@@ -587,6 +597,7 @@ public:
    // In particular one must first enter a partition by fetching a `partition_start`
    // fragment before calling `fast_forward_to`.
    future<> fast_forward_to(position_range cr) {
+        _impl->set_close_required();
        return _impl->fast_forward_to(std::move(cr));
    }
    // Closes the reader.
--- a/generic_server.cc
+++ b/generic_server.cc
@@ -184,14 +184,18 @@ future<> server::do_accepts(int which, bool keepalive, socket_address server_add
                    _logger.info("exception while advertising new connection: {}", std::current_exception());
                }
                // Block while monitoring for lifetime/errors.
-                return conn->process().finally([this, conn] {
-                    return unadvertise_connection(conn);
-                }).handle_exception([this] (std::exception_ptr ep) {
-                    if (is_broken_pipe_or_connection_reset(ep)) {
-                        // expected if another side closes a connection or we're shutting down
-                        return;
+                return conn->process().then_wrapped([this, conn] (auto f) {
+                    try {
+                        f.get();
+                    } catch (...) {
+                        auto ep = std::current_exception();
+                        if (!is_broken_pipe_or_connection_reset(ep)) {
+                            // some exceptions are expected if another side closes a connection
+                            // or we're shutting down
+                            _logger.info("exception while processing connection: {}", ep);
+                        }
                    }
-                    _logger.info("exception while processing connection: {}", ep);
+                    return unadvertise_connection(conn);
                });
            });
            return stop_iteration::no;
--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -477,49 +477,42 @@ gossiper::handle_get_endpoint_states_msg(gossip_get_endpoint_states_request requ
    return make_ready_future<gossip_get_endpoint_states_response>(gossip_get_endpoint_states_response{std::move(map)});
 }

+rpc::no_wait_type gossiper::background_msg(sstring type, noncopyable_function<future<>(gossiper&)> fn) {
+    (void)with_gate(_background_msg, [this, type = std::move(type), fn = std::move(fn)] () mutable {
+        return container().invoke_on(0, std::move(fn)).handle_exception([type = std::move(type)] (auto ep) {
+            logger.warn("Failed to handle {}: {}", type, ep);
+        });
+    });
+    return messaging_service::no_wait();
+}
+
 void gossiper::init_messaging_service_handler() {
    _messaging.register_gossip_digest_syn([this] (const rpc::client_info& cinfo, gossip_digest_syn syn_msg) {
        auto from = netw::messaging_service::get_source(cinfo);
-        // In a new fiber.
-        (void)container().invoke_on(0, [from, syn_msg = std::move(syn_msg)] (gms::gossiper& gossiper) mutable {
+        return background_msg("GOSSIP_DIGEST_SYN", [from, syn_msg = std::move(syn_msg)] (gms::gossiper& gossiper) mutable {
            return gossiper.handle_syn_msg(from, std::move(syn_msg));
-        }).handle_exception([] (auto ep) {
-            logger.warn("Fail to handle GOSSIP_DIGEST_SYN: {}", ep);
        });
-        return messaging_service::no_wait();
    });
    _messaging.register_gossip_digest_ack([this] (const rpc::client_info& cinfo, gossip_digest_ack msg) {
        auto from = netw::messaging_service::get_source(cinfo);
-        // In a new fiber.
-        (void)container().invoke_on(0, [from, msg = std::move(msg)] (gms::gossiper& gossiper) mutable {
+        return background_msg("GOSSIP_DIGEST_ACK", [from, msg = std::move(msg)] (gms::gossiper& gossiper) mutable {
            return gossiper.handle_ack_msg(from, std::move(msg));
-        }).handle_exception([] (auto ep) {
-            logger.warn("Fail to handle GOSSIP_DIGEST_ACK: {}", ep);
        });
-        return messaging_service::no_wait();
    });
    _messaging.register_gossip_digest_ack2([this] (const rpc::client_info& cinfo, gossip_digest_ack2 msg) {
        auto from = netw::messaging_service::get_source(cinfo);
-        // In a new fiber.
-        (void)container().invoke_on(0, [from, msg = std::move(msg)] (gms::gossiper& gossiper) mutable {
+        return background_msg("GOSSIP_DIGEST_ACK2", [from, msg = std::move(msg)] (gms::gossiper& gossiper) mutable {
            return gossiper.handle_ack2_msg(from, std::move(msg));
-        }).handle_exception([] (auto ep) {
-            logger.warn("Fail to handle GOSSIP_DIGEST_ACK2: {}", ep);
        });
-        return messaging_service::no_wait();
    });
    _messaging.register_gossip_echo([this] (const rpc::client_info& cinfo, rpc::optional<int64_t> generation_number_opt) {
        auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
        return handle_echo_msg(from, generation_number_opt);
    });
    _messaging.register_gossip_shutdown([this] (inet_address from, rpc::optional<int64_t> generation_number_opt) {
-        // In a new fiber.
-        (void)container().invoke_on(0, [from, generation_number_opt] (gms::gossiper& gossiper) {
+        return background_msg("GOSSIP_SHUTDOWN", [from, generation_number_opt] (gms::gossiper& gossiper) {
            return gossiper.handle_shutdown_msg(from, generation_number_opt);
-        }).handle_exception([] (auto ep) {
-            logger.warn("Fail to handle GOSSIP_SHUTDOWN: {}", ep);
        });
-        return messaging_service::no_wait();
    });
    _messaging.register_gossip_get_endpoint_states([this] (const rpc::client_info& cinfo, gossip_get_endpoint_states_request request) {
        return container().invoke_on(0, [request = std::move(request)] (gms::gossiper& gossiper) mutable {
@@ -1679,6 +1672,10 @@ bool gossiper::is_normal(const inet_address& endpoint) const {
    return get_gossip_status(endpoint) == sstring(versioned_value::STATUS_NORMAL);
 }

+bool gossiper::is_left(const inet_address& endpoint) const {
+    return get_gossip_status(endpoint) == sstring(versioned_value::STATUS_LEFT);
+}
+
 bool gossiper::is_normal_ring_member(const inet_address& endpoint) const {
    auto status = get_gossip_status(endpoint);
    return status == sstring(versioned_value::STATUS_NORMAL) || status == sstring(versioned_value::SHUTDOWN);
@@ -2178,6 +2175,9 @@ future<> gossiper::start() {
 }

 future<> gossiper::shutdown() {
+    if (!_background_msg.is_closed()) {
+        co_await _background_msg.close();
+    }
    if (this_shard_id() == 0) {
        co_await do_stop_gossiping();
    }
--- a/gms/gossiper.hh
+++ b/gms/gossiper.hh
@@ -41,7 +41,9 @@
 #include "unimplemented.hh"
 #include <seastar/core/distributed.hh>
 #include <seastar/core/shared_ptr.hh>
+#include <seastar/core/gate.hh>
 #include <seastar/core/print.hh>
+#include <seastar/rpc/rpc_types.hh>
 #include "utils/atomic_vector.hh"
 #include "utils/UUID.hh"
 #include "utils/fb_utilities.hh"
@@ -138,12 +140,16 @@ private:
    bool _enabled = false;
    semaphore _callback_running{1};
    semaphore _apply_state_locally_semaphore{100};
+    seastar::gate _background_msg;
    std::unordered_map<gms::inet_address, syn_msg_pending> _syn_handlers;
    std::unordered_map<gms::inet_address, ack_msg_pending> _ack_handlers;
    bool _advertise_myself = true;
    // Map ip address and generation number
    std::unordered_map<gms::inet_address, int32_t> _advertise_to_nodes;
    future<> _failure_detector_loop_done{make_ready_future<>()} ;
+
+    rpc::no_wait_type background_msg(sstring type, noncopyable_function<future<>(gossiper&)> fn);
+
 public:
    // Get current generation number for the given nodes
    future<std::unordered_map<gms::inet_address, int32_t>>
@@ -565,6 +571,7 @@ public:
    bool is_seed(const inet_address& endpoint) const;
    bool is_shutdown(const inet_address& endpoint) const;
    bool is_normal(const inet_address& endpoint) const;
+    bool is_left(const inet_address& endpoint) const;
    // Check if a node is in NORMAL or SHUTDOWN status which means the node is
    // part of the token ring from the gossip point of view and operates in
    // normal status or was in normal status but is shutdown.
--- a/install.sh
+++ b/install.sh
@@ -520,8 +520,13 @@ relocate_python3 "$rprefix"/scyllatop tools/scyllatop/scyllatop.py
 if $supervisor; then
    install -d -m755 `supervisor_dir $retc`
    for service in scylla-server scylla-jmx scylla-node-exporter; do
+        if [ "$service" = "scylla-server" ]; then
+            program="scylla"
+        else
+            program=$service
+        fi
        cat << EOS > `supervisor_conf $retc $service`
-[program:$service]
+[program:$program]
 directory=$rprefix
 command=/bin/bash -c './supervisor/$service.sh'
 EOS
--- a/locator/azure_snitch.cc
+++ b/locator/azure_snitch.cc
@@ -61,6 +61,10 @@ azure_snitch::azure_snitch(const sstring& fname, unsigned io_cpuid) : production
 }

 future<> azure_snitch::load_config() {
+    if (this_shard_id() != io_cpu_id()) {
+        co_return;
+    }
+
    sstring region = co_await azure_api_call(REGION_NAME_QUERY_PATH);
    sstring azure_zone = co_await azure_api_call(ZONE_NAME_QUERY_PATH);

--- a/locator/ec2_snitch.cc
+++ b/locator/ec2_snitch.cc
@@ -1,5 +1,7 @@
 #include "locator/ec2_snitch.hh"
 #include <seastar/core/seastar.hh>
+#include <seastar/core/sleep.hh>
+#include <seastar/core/do_with.hh>

 #include <boost/algorithm/string/classification.hpp>
 #include <boost/algorithm/string/split.hpp>
@@ -67,6 +69,30 @@ future<> ec2_snitch::start() {
 }

 future<sstring> ec2_snitch::aws_api_call(sstring addr, uint16_t port, sstring cmd) {
+    return do_with(int(0), [this, addr, port, cmd] (int& i) {
+        return repeat_until_value([this, addr, port, cmd, &i]() -> future<std::optional<sstring>> {
+            ++i;
+            return aws_api_call_once(addr, port, cmd).then([] (auto res) {
+                return make_ready_future<std::optional<sstring>>(std::move(res));
+            }).handle_exception([&i] (auto ep) {
+                try {
+                    std::rethrow_exception(ep);
+                } catch (const std::system_error &e) {
+                    logger().error(e.what());
+                    if (i >= AWS_API_CALL_RETRIES - 1) {
+                        logger().error("Maximum number of retries exceeded");
+                        throw e;
+                    }
+                }
+                return sleep(AWS_API_CALL_RETRY_INTERVAL).then([] {
+                    return make_ready_future<std::optional<sstring>>(std::nullopt);
+                });
+            });
+        });
+    });
+}
+
+future<sstring> ec2_snitch::aws_api_call_once(sstring addr, uint16_t port, sstring cmd) {
    return connect(socket_address(inet_address{addr}, port))
    .then([this, addr, cmd] (connected_socket fd) {
        _sd = std::move(fd);
--- a/locator/ec2_snitch.hh
+++ b/locator/ec2_snitch.hh
@@ -29,6 +29,8 @@ public:
    static constexpr const char* ZONE_NAME_QUERY_REQ = "/latest/meta-data/placement/availability-zone";
    static constexpr const char* AWS_QUERY_SERVER_ADDR = "169.254.169.254";
    static constexpr uint16_t AWS_QUERY_SERVER_PORT = 80;
+    static constexpr int AWS_API_CALL_RETRIES = 5;
+    static constexpr auto AWS_API_CALL_RETRY_INTERVAL = std::chrono::seconds{5};

    ec2_snitch(const sstring& fname = "", unsigned io_cpu_id = 0);
    virtual future<> start() override;
@@ -45,5 +47,6 @@ private:
    output_stream<char> _out;
    http_response_parser _parser;
    sstring _zone_req;
+    future<sstring> aws_api_call_once(sstring addr, uint16_t port, const sstring cmd);
 };
 } // namespace locator
--- a/main.cc
+++ b/main.cc
@@ -377,11 +377,38 @@ static auto defer_verbose_shutdown(const char* what, Func&& func) {
        startlog.info("Shutting down {}", what);
        try {
            func();
+            startlog.info("Shutting down {} was successful", what);
        } catch (...) {
-            startlog.error("Unexpected error shutting down {}: {}", what, std::current_exception());
-            throw;
+            auto ex = std::current_exception();
+            bool do_abort = true;
+            try {
+                std::rethrow_exception(ex);
+            } catch (const std::system_error& e) {
+                // System error codes we consider "environmental",
+                // i.e. not scylla's fault, therefore there is no point in
+                // aborting and dumping core.
+                for (int i : {EIO, EACCES, ENOSPC}) {
+                    if (e.code() == std::error_code(i, std::system_category())) {
+                        do_abort = false;
+                        break;
+                    }
+                }
+            } catch (...) {
+            }
+            auto msg = fmt::format("Unexpected error shutting down {}: {}", what, ex);
+            if (do_abort) {
+                startlog.error("{}: aborting", msg);
+                abort();
+            } else {
+                startlog.error("{}: exiting, at {}", msg, current_backtrace());
+
+                // Call _exit() rather than exit() to exit immediately
+                // without calling exit handlers, avoiding
+                // boost::intrusive::detail::destructor_impl assert failure
+                // from ~segment_pool exit handler.
+                _exit(255);
+            }
        }
-        startlog.info("Shutting down {} was successful", what);
    };

    auto ret = deferred_action(std::move(vfunc));
@@ -535,6 +562,12 @@ int main(int ac, char** av) {

            cfg->broadcast_to_all_shards().get();

+            // We pass this piece of config through a global as a temporary hack.
+            // See the comment at the definition of sstables::global_cache_index_pages.
+            smp::invoke_on_all([&cfg] {
+                sstables::global_cache_index_pages = cfg->cache_index_pages.operator utils::updateable_value<bool>();
+            }).get();
+
            ::sighup_handler sighup_handler(opts, *cfg);
            auto stop_sighup_handler = defer_verbose_shutdown("sighup", [&] {
                sighup_handler.stop().get();
--- a/memtable-sstable.hh
+++ b/memtable-sstable.hh
@@ -30,6 +30,7 @@
 #include <seastar/core/io_priority_class.hh>

 class memtable;
+class reader_permit;
 class flat_mutation_reader;

 namespace sstables {
--- a/memtable.cc
+++ b/memtable.cc
@@ -613,7 +613,8 @@ static flat_mutation_reader make_partition_snapshot_flat_reader_from_snp_schema(
        schema_ptr rev_snp_schema = snp->schema()->make_reversed();
        return make_partition_snapshot_flat_reader<true, partition_snapshot_read_accounter>(std::move(rev_snp_schema), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
    } else {
-        return make_partition_snapshot_flat_reader<false, partition_snapshot_read_accounter>(snp->schema(), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
+        schema_ptr snp_schema = snp->schema();
+        return make_partition_snapshot_flat_reader<false, partition_snapshot_read_accounter>(std::move(snp_schema), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
    }
 }

--- a/message/messaging_service.cc
+++ b/message/messaging_service.cc
@@ -442,6 +442,8 @@ static constexpr unsigned do_get_rpc_client_idx(messaging_verb verb) {
    case messaging_verb::GOSSIP_ECHO:
    case messaging_verb::GOSSIP_GET_ENDPOINT_STATES:
    case messaging_verb::GET_SCHEMA_VERSION:
+        // ATTN -- if moving GOSSIP_ verbs elsewhere, mind updating the tcp_nodelay
+        // setting in get_rpc_client(), which assumes gossiper verbs live in idx 0
        return 0;
    case messaging_verb::PREPARE_MESSAGE:
    case messaging_verb::PREPARE_DONE_MESSAGE:
@@ -628,7 +630,12 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
        remove_error_rpc_client(verb, id);
    }

-    auto must_encrypt = [&id, &verb, this] {
+    auto addr = get_preferred_ip(id.addr);
+    auto broadcast_address = utils::fb_utilities::get_broadcast_address();
+    bool listen_to_bc = _cfg.listen_on_broadcast_address && _cfg.ip != broadcast_address;
+    auto laddr = socket_address(listen_to_bc ? broadcast_address : _cfg.ip, 0);
+
+    auto must_encrypt = [&] {
        if (_cfg.encrypt == encrypt_what::none) {
            return false;
        }
@@ -646,13 +653,27 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
        auto& snitch_ptr = locator::i_endpoint_snitch::get_local_snitch_ptr();

        // either rack/dc need to be in same dc to use non-tls
-        if (snitch_ptr->get_datacenter(id.addr) != snitch_ptr->get_datacenter(utils::fb_utilities::get_broadcast_address())) {
+        auto my_dc = snitch_ptr->get_datacenter(broadcast_address);
+        if (snitch_ptr->get_datacenter(addr) != my_dc) {
+            return true;
+        }
+        // #9653 - if our idea of dc for bind address differs from our official endpoint address,
+        // we cannot trust downgrading. We need to ensure either (local) bind address is same as
+        // broadcast or that the dc info we get for it is the same.
+        if (broadcast_address != laddr && snitch_ptr->get_datacenter(laddr) != my_dc) {
            return true;
        }
        // if cross-rack tls, check rack.
-        return _cfg.encrypt == encrypt_what::rack &&
-            snitch_ptr->get_rack(id.addr) != snitch_ptr->get_rack(utils::fb_utilities::get_broadcast_address())
-            ;
+        if (_cfg.encrypt == encrypt_what::dc) {
+            return false;
+        }
+        auto my_rack = snitch_ptr->get_rack(broadcast_address);
+        if (snitch_ptr->get_rack(addr) != my_rack) {
+            return true;
+        }
+        // See above: We need to ensure either (local) bind address is same as
+        // broadcast or that the rack info we get for it is the same.
+        return broadcast_address != laddr && snitch_ptr->get_rack(laddr) != my_rack;
    }();

    auto must_compress = [&id, this] {
@@ -670,7 +691,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
    }();

    auto must_tcp_nodelay = [&] {
-        if (idx == 1) {
+        if (idx == 0) {
            return true; // gossip
        }
        if (_cfg.tcp_nodelay == tcp_nodelay_what::local) {
@@ -681,7 +702,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
        return true;
    }();

-    auto remote_addr = socket_address(get_preferred_ip(id.addr), must_encrypt ? _cfg.ssl_port : _cfg.port);
+    auto remote_addr = socket_address(addr, must_encrypt ? _cfg.ssl_port : _cfg.port);

    rpc::client_options opts;
    // send keepalive messages each minute if connection is idle, drop connection after 10 failures
@@ -691,13 +712,8 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
    }
    opts.tcp_nodelay = must_tcp_nodelay;
    opts.reuseaddr = true;
-    // We send cookies only for non-default statement tenant clients.
-    if (idx > 3) {
-        opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;
-    }
+    opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;

-    bool listen_to_bc = _cfg.listen_on_broadcast_address && _cfg.ip != utils::fb_utilities::get_broadcast_address();
-    auto laddr = socket_address(listen_to_bc ? utils::fb_utilities::get_broadcast_address() : _cfg.ip, 0);
    auto client = must_encrypt ?
                    ::make_shared<rpc_protocol_client_wrapper>(_rpc->protocol(), std::move(opts),
                                    remote_addr, laddr, _credentials) :
--- a/multishard_mutation_query.cc
+++ b/multishard_mutation_query.cc
@@ -283,8 +283,8 @@ public:

    future<> lookup_readers(db::timeout_clock::time_point timeout);

-    future<> save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, detached_compaction_state compaction_state,
-            std::optional<clustering_key_prefix> last_ckey);
+    future<> save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, std::optional<detached_compaction_state> compaction_state,
+            dht::decorated_key last_pkey, std::optional<clustering_key_prefix> last_ckey);

    future<> stop();
 };
@@ -583,19 +583,22 @@ future<> read_context::lookup_readers(db::timeout_clock::time_point timeout) {
    });
 }

-future<> read_context::save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, detached_compaction_state compaction_state,
-            std::optional<clustering_key_prefix> last_ckey) {
+future<> read_context::save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, std::optional<detached_compaction_state> compaction_state,
+            dht::decorated_key last_pkey, std::optional<clustering_key_prefix> last_ckey) {
    if (_cmd.query_uuid == utils::UUID{}) {
        return make_ready_future<>();
    }

-    auto last_pkey = compaction_state.partition_start.key();
-
    const auto cb_stats = dismantle_combined_buffer(std::move(unconsumed_buffer), last_pkey);
    tracing::trace(_trace_state, "Dismantled combined buffer: {}", cb_stats);

-    const auto cs_stats = dismantle_compaction_state(std::move(compaction_state));
-    tracing::trace(_trace_state, "Dismantled compaction state: {}", cs_stats);
+    auto cs_stats = dismantle_buffer_stats{};
+    if (compaction_state) {
+        cs_stats = dismantle_compaction_state(std::move(*compaction_state));
+        tracing::trace(_trace_state, "Dismantled compaction state: {}", cs_stats);
+    } else {
+        tracing::trace(_trace_state, "No compaction state to dismantle, partition exhausted", cs_stats);
+    }

    return do_with(std::move(last_pkey), std::move(last_ckey), [this] (const dht::decorated_key& last_pkey,
            const std::optional<clustering_key_prefix>& last_ckey) {
@@ -694,16 +697,18 @@ future<typename ResultBuilder::result_type> do_query(
        ResultBuilder&& result_builder) {
    auto ctx = seastar::make_shared<read_context>(db, s, cmd, ranges, trace_state, timeout);

-    co_await ctx->lookup_readers(timeout);
-
    std::exception_ptr ex;

    try {
+        co_await ctx->lookup_readers(timeout);
+
        auto [last_ckey, result, unconsumed_buffer, compaction_state] = co_await read_page<ResultBuilder>(ctx, s, cmd, ranges, trace_state,
                std::move(result_builder));

        if (compaction_state->are_limits_reached() || result.is_short_read()) {
-            co_await ctx->save_readers(std::move(unconsumed_buffer), std::move(*compaction_state).detach_state(), std::move(last_ckey));
+            // Must call before calling 'detached_state()`.
+            auto last_pkey = *compaction_state->current_partition();
+            co_await ctx->save_readers(std::move(unconsumed_buffer), std::move(*compaction_state).detach_state(), std::move(last_pkey), std::move(last_ckey));
        }

        co_await ctx->stop();
--- a/mutation_compactor.hh
+++ b/mutation_compactor.hh
@@ -175,6 +175,9 @@ class compact_mutation_state {
    std::unique_ptr<mutation_compactor_garbage_collector> _collector;

    compaction_stats _stats;
+
+    // Remember if we requested to stop mid-partition.
+    stop_iteration _stop = stop_iteration::no;
 private:
    static constexpr bool only_live() {
        return OnlyLive == emit_only_live_rows::yes;
@@ -270,6 +273,7 @@ public:
    }

    void consume_new_partition(const dht::decorated_key& dk) {
+        _stop = stop_iteration::no;
        auto& pk = dk.key();
        _dk = &dk;
        _return_static_content_on_partition_with_no_rows =
@@ -323,9 +327,9 @@ public:
        _static_row_live = is_live;
        if (is_live || (!only_live() && !sr.empty())) {
            partition_is_not_empty(consumer);
-            return consumer.consume(std::move(sr), current_tombstone, is_live);
+            _stop = consumer.consume(std::move(sr), current_tombstone, is_live);
        }
-        return stop_iteration::no;
+        return _stop;
    }

    template <typename Consumer, typename GCConsumer>
@@ -370,23 +374,22 @@ public:

        if (only_live() && is_live) {
            partition_is_not_empty(consumer);
-            auto stop = consumer.consume(std::move(cr), t, true);
+            _stop = consumer.consume(std::move(cr), t, true);
            if (++_rows_in_current_partition == _current_partition_limit) {
-                return stop_iteration::yes;
+                _stop = stop_iteration::yes;
            }
-            return stop;
+            return _stop;
        } else if (!only_live()) {
-            auto stop = stop_iteration::no;
            if (!cr.empty()) {
                partition_is_not_empty(consumer);
-                stop = consumer.consume(std::move(cr), t, is_live);
+                _stop = consumer.consume(std::move(cr), t, is_live);
            }
            if (!sstable_compaction() && is_live && ++_rows_in_current_partition == _current_partition_limit) {
-                return stop_iteration::yes;
+                _stop = stop_iteration::yes;
            }
-            return stop;
+            return _stop;
        }
-        return stop_iteration::no;
+        return _stop;
    }

    template <typename Consumer, typename GCConsumer>
@@ -398,13 +401,13 @@ public:
        if (rt.tomb > _range_tombstones.get_partition_tombstone()) {
            if (can_purge_tombstone(rt.tomb)) {
                partition_is_not_empty_for_gc_consumer(gc_consumer);
-                return gc_consumer.consume(std::move(rt));
+                _stop = gc_consumer.consume(std::move(rt));
            } else {
                partition_is_not_empty(consumer);
-                return consumer.consume(std::move(rt));
+                _stop = consumer.consume(std::move(rt));
            }
         }
-        return stop_iteration::no;
+        return _stop;
    }

    template <typename Consumer, typename GCConsumer>
@@ -492,9 +495,24 @@ public:
    /// compactor will result in the new compactor being in the same state *this
    /// is (given the same outside parameters of course). Practically this
    /// allows the compaction state to be stored in the compacted reader.
-    detached_compaction_state detach_state() && {
+    /// If the currently compacted partition is exhausted a disengaged optional
+    /// is returned -- in this case there is no state to detach.
+    std::optional<detached_compaction_state> detach_state() && {
+        // If we exhausted the partition, there is no need to detach-restore the
+        // compaction state.
+        // We exhausted the partition if `consume_partition_end()` was called
+        // without us requesting the consumption to stop (remembered in _stop)
+        // from one of the consume() overloads.
+        // The consume algorithm calls `consume_partition_end()` in two cases:
+        // * on a partition-end fragment
+        // * consume() requested to stop
+        // In the latter case, the partition is not exhausted. Even if the next
+        // fragment to process is a partition-end, it will not be consumed.
+        if (!_stop) {
+            return {};
+        }
        partition_start ps(std::move(_last_dk), _range_tombstones.get_partition_tombstone());
-        return {std::move(ps), std::move(_last_static_row), std::move(_range_tombstones).range_tombstones()};
+        return detached_compaction_state{std::move(ps), std::move(_last_static_row), std::move(_range_tombstones).range_tombstones()};
    }

    const compaction_stats& stats() const { return _stats; }
--- a/mutation_partition.hh
+++ b/mutation_partition.hh
@@ -843,7 +843,6 @@ public:

    void apply(shadowable_tombstone deleted_at) {
        _deleted_at.apply(deleted_at, _marker);
-        maybe_shadow();
    }

    void apply(row_tombstone deleted_at) {
--- a/mutation_reader.cc
+++ b/mutation_reader.cc
@@ -1545,18 +1545,20 @@ public:
 };

 future<> shard_reader::close() noexcept {
-    // Nothing to do if there was no reader created, nor is there a background
-    // read ahead in progress which will create one.
-    if (!_reader && !_read_ahead) {
-        co_return;
+    if (_read_ahead) {
+        try {
+            co_await *std::exchange(_read_ahead, std::nullopt);
+        } catch (...) {
+            mrlog.warn("shard_reader::close(): read_ahead on shard {} failed: {}", _shard, std::current_exception());
+        }
    }

    try {
-        if (_read_ahead) {
-            co_await *std::exchange(_read_ahead, std::nullopt);
-        }
-
        co_await smp::submit_to(_shard, [this] {
+            if (!_reader) {
+                return make_ready_future<>();
+            }
+
            auto irh = std::move(*_reader).inactive_read_handle();
            return with_closeable(flat_mutation_reader(_reader.release()), [this] (flat_mutation_reader& reader) mutable {
                auto permit = reader.permit();
--- a/mutation_writer/feed_writers.hh
+++ b/mutation_writer/feed_writers.hh
@@ -54,7 +54,7 @@ future<> feed_writer(flat_mutation_reader&& rd_ref, Writer wr) {
    auto rd = std::move(rd_ref);
    std::exception_ptr ex;
    try {
-        while (!rd.is_end_of_stream()) {
+        while (!rd.is_end_of_stream() || !rd.is_buffer_empty()) {
            co_await rd.fill_buffer();
            while (!rd.is_buffer_empty()) {
                co_await rd.pop_mutation_fragment().consume(wr);
--- a/partition_snapshot_reader.hh
+++ b/partition_snapshot_reader.hh
@@ -305,14 +305,23 @@ class partition_snapshot_flat_reader : public flat_mutation_reader::impl, public
                const std::optional<position_in_partition>& last_row,
                const std::optional<position_in_partition>& last_rts,
                position_in_partition_view pos) {
-            if (!_rt_stream.empty()) {
-                return _rt_stream.get_next(std::move(pos));
-            }
            return in_alloc_section([&] () -> mutation_fragment_opt {
                maybe_refresh_state(ck_range_snapshot, last_row, last_rts);

                position_in_partition::less_compare rt_less(_query_schema);

+                // The while loop below moves range tombstones from partition versions
+                // into _rt_stream, just enough to produce the next range tombstone.
+                // The main goal behind moving to _rt_stream is to deoverlap range tombstones
+                // which have the same starting position. This is not in order to satisfy
+                // flat_mutation_reader stream requirements, the reader can emit range tombstones
+                // which have the same position incrementally. This is to guarantee forward
+                // progress in the case iterators get invalidated and maybe_refresh_state()
+                // above needs to restore them. It does so using last_rts, which tracks
+                // the position of the last emitted range tombstone. All range tombstones
+                // with positions <= last_rts are skipped on refresh. To make progress,
+                // we need to make sure that all range tombstones with duplicated positions
+                // are emitted before maybe_refresh_state().
                while (has_more_range_tombstones()
                        && !rt_less(pos, peek_range_tombstone().position())
                        && (_rt_stream.empty() || !rt_less(_rt_stream.peek_next().position(), peek_range_tombstone().position()))) {
--- a/partition_snapshot_row_cursor.hh
+++ b/partition_snapshot_row_cursor.hh
@@ -325,7 +325,7 @@ public:
    // When throws, the cursor is invalidated and its position is not changed.
    bool advance_to(position_in_partition_view lower_bound) {
        prepare_heap(lower_bound);
-        bool found = no_clustering_row_between(_schema, lower_bound, _heap[0].it->position());
+        bool found = no_clustering_row_between_weak(_schema, lower_bound, _heap[0].it->position());
        recreate_current_row();
        return found;
    }
@@ -411,11 +411,11 @@ public:
        } else {
            // Copy row from older version because rows in evictable versions must
            // hold values which are independently complete to be consistent on eviction.
-            auto e = current_allocator().construct<rows_entry>(_schema, *_current_row[0].it);
+            auto e = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(_schema, *_current_row[0].it));
            e->set_continuous(latest_i && latest_i->continuous());
            _snp.tracker()->insert(*e);
-            rows.insert_before(latest_i, *e);
-            return {*e, true};
+            auto e_i = rows.insert_before(latest_i, std::move(e));
+            return ensure_result{*e_i, true};
        }
    }

@@ -447,11 +447,11 @@ public:
        }
        auto&& rows = _snp.version()->partition().mutable_clustered_rows();
        auto latest_i = get_iterator_in_latest_version();
-        auto e = current_allocator().construct<rows_entry>(_schema, pos, is_dummy(!pos.is_clustering_row()),
-            is_continuous(latest_i && latest_i->continuous()));
+        auto e = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(_schema, pos, is_dummy(!pos.is_clustering_row()),
+            is_continuous(latest_i && latest_i->continuous())));
        _snp.tracker()->insert(*e);
-        rows.insert_before(latest_i, *e);
-        return ensure_result{*e, true};
+        auto e_i = rows.insert_before(latest_i, std::move(e));
+        return ensure_result{*e_i, true};
    }

    // Brings the entry pointed to by the cursor to the front of the LRU
--- a/position_in_partition.hh
+++ b/position_in_partition.hh
@@ -575,6 +575,20 @@ bool no_clustering_row_between(const schema& s, position_in_partition_view a, po
    }
 }

+// Returns true if and only if there can't be any clustering_row with position >= a and < b.
+// It is assumed that a <= b.
+inline
+bool no_clustering_row_between_weak(const schema& s, position_in_partition_view a, position_in_partition_view b) {
+    clustering_key_prefix::equality eq(s);
+    if (a.has_key() && b.has_key()) {
+        return eq(a.key(), b.key())
+               && (a.get_bound_weight() == bound_weight::after_all_prefixed
+                   || b.get_bound_weight() != bound_weight::after_all_prefixed);
+    } else {
+        return !a.has_key() && !b.has_key();
+    }
+}
+
 // Includes all position_in_partition objects "p" for which: start <= p < end
 // And only those.
 class position_range {
@@ -659,3 +673,9 @@ inline
 bool position_range::is_all_clustered_rows(const schema& s) const {
    return _start.is_before_all_clustered_rows(s) && _end.is_after_all_clustered_rows(s);
 }
+
+// Assumes that the bounds of `r` are of 'clustered' type
+// and that `r` is non-empty (the left bound is smaller than the right bound).
+//
+// If `r` does not contain any keys, returns nullopt.
+std::optional<query::clustering_range> position_range_to_clustering_range(const position_range& r, const schema&);
--- a/query.cc
+++ b/query.cc
@@ -379,3 +379,52 @@ foreign_ptr<lw_shared_ptr<query::result>> result_merger::get() {
 }

 }
+
+std::optional<query::clustering_range> position_range_to_clustering_range(const position_range& r, const schema& s) {
+    assert(r.start().get_type() == partition_region::clustered);
+    assert(r.end().get_type() == partition_region::clustered);
+
+    if (r.start().has_key() && r.end().has_key()
+            && clustering_key_prefix::equality(s)(r.start().key(), r.end().key())) {
+        assert(r.start().get_bound_weight() != r.end().get_bound_weight());
+
+        if (r.end().get_bound_weight() == bound_weight::after_all_prefixed
+                && r.start().get_bound_weight() != bound_weight::after_all_prefixed) {
+            // [before x, after x) and [for x, after x) get converted to [x, x].
+            return query::clustering_range::make_singular(r.start().key());
+        }
+
+        // [before x, for x) does not contain any keys.
+        return std::nullopt;
+    }
+
+    // position_range -> clustering_range
+    // (recall that position_ranges are always left-closed, right-open):
+    // [before x, ...), [for x, ...) -> [x, ...
+    // [after x, ...) -> (x, ...
+    // [..., before x), [..., for x) -> ..., x)
+    // [..., after x) -> ..., x]
+
+    auto to_bound = [&s] (const position_in_partition& p, bool left) -> std::optional<query::clustering_range::bound> {
+        if (p.is_before_all_clustered_rows(s)) {
+            assert(left);
+            return {};
+        }
+
+        if (p.is_after_all_clustered_rows(s)) {
+            assert(!left);
+            return {};
+        }
+
+        assert(p.has_key());
+
+        auto bw = p.get_bound_weight();
+        bool inclusive = left
+            ? bw != bound_weight::after_all_prefixed
+            : bw == bound_weight::after_all_prefixed;
+
+        return query::clustering_range::bound{p.key(), inclusive};
+    };
+
+    return query::clustering_range{to_bound(r.start(), true), to_bound(r.end(), false)};
+}
--- a/range_tombstone_list.cc
+++ b/range_tombstone_list.cc
@@ -42,28 +42,34 @@ static auto construct_range_tombstone_entry(Args&&... args) {
 }

 void range_tombstone_list::apply_reversibly(const schema& s,
-        clustering_key_prefix start, bound_kind start_kind,
-        clustering_key_prefix end,
+        clustering_key_prefix start_key, bound_kind start_kind,
+        clustering_key_prefix end_key,
        bound_kind end_kind,
        tombstone tomb,
        reverter& rev)
 {
+    position_in_partition::less_compare less(s);
+    position_in_partition start(position_in_partition::range_tag_t(), bound_view(std::move(start_key), start_kind));
+    position_in_partition end(position_in_partition::range_tag_t(), bound_view(std::move(end_key), end_kind));
+
+    if (!less(start, end)) {
+        return;
+    }
+
    if (!_tombstones.empty()) {
-        bound_view::compare less(s);
-        bound_view start_bound(start, start_kind);
        auto last = --_tombstones.end();
        range_tombstones_type::iterator it;
-        if (less(start_bound, last->end_bound())) {
-            it = _tombstones.upper_bound(start_bound, [less](auto&& sb, auto&& rt) {
-                return less(sb, rt.end_bound());
+        if (less(start, last->end_position())) {
+            it = _tombstones.upper_bound(start, [less](auto&& sb, auto&& rt) {
+                return less(sb, rt.end_position());
            });
        } else {
            it = _tombstones.end();
        }
-        insert_from(s, std::move(it), std::move(start), start_kind, std::move(end), end_kind, std::move(tomb), rev);
+        insert_from(s, std::move(it), std::move(start), std::move(end), std::move(tomb), rev);
        return;
    }
-    auto rt = construct_range_tombstone_entry(std::move(start), start_kind, std::move(end), end_kind, std::move(tomb));
+    auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), std::move(tomb));
    rev.insert(_tombstones.end(), *rt);
    rt.release();
 }
@@ -81,35 +87,31 @@ void range_tombstone_list::apply_reversibly(const schema& s,
 */
 void range_tombstone_list::insert_from(const schema& s,
    range_tombstones_type::iterator it,
-    clustering_key_prefix start,
-    bound_kind start_kind,
-    clustering_key_prefix end,
-    bound_kind end_kind,
+    position_in_partition start,
+    position_in_partition end,
    tombstone tomb,
    reverter& rev)
 {
-    bound_view::compare less(s);
-    bound_view end_bound(end, end_kind);
+    position_in_partition::tri_compare cmp(s);
+
    if (it != _tombstones.begin()) {
        auto prev = std::prev(it);
-        if (prev->tombstone().tomb == tomb && prev->end_bound().adjacent(s, bound_view(start, start_kind))) {
-            start = prev->tombstone().start;
-            start_kind = prev->tombstone().start_kind;
+        if (prev->tombstone().tomb == tomb && cmp(prev->end_position(), start) == 0) {
+            start = prev->position();
            rev.erase(prev);
        }
    }
    while (it != _tombstones.end()) {
-        bound_view start_bound(start, start_kind);
-        if (less(end_bound, start_bound)) {
+        if (cmp(end, start) <= 0) {
            return;
        }

-        if (less(end_bound, it->start_bound())) {
+        if (cmp(end, it->position()) < 0) {
            // not overlapping
-            if (it->tombstone().tomb == tomb && end_bound.adjacent(s, it->start_bound())) {
-                rev.update(it, {std::move(start), start_kind, it->tombstone().end, it->tombstone().end_kind, tomb});
+            if (it->tombstone().tomb == tomb && cmp(end, it->position()) == 0) {
+                rev.update(it, {std::move(start), std::move(end), tomb});
            } else {
-                auto rt = construct_range_tombstone_entry(std::move(start), start_kind, std::move(end), end_kind, tomb);
+                auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), tomb);
                rev.insert(it, *rt);
                rt.release();
            }
@@ -119,34 +121,29 @@ void range_tombstone_list::insert_from(const schema& s,
        auto c = tomb <=> it->tombstone().tomb;
        if (c == 0) {
            // same timestamp, overlapping or adjacent, so merge.
-            if (less(it->start_bound(), start_bound)) {
-                start = it->tombstone().start;
-                start_kind = it->tombstone().start_kind;
+            if (cmp(it->position(), start) < 0) {
+                start = it->position();
            }
-            if (less(end_bound, it->end_bound())) {
-                end = it->tombstone().end;
-                end_kind = it->tombstone().end_kind;
-                end_bound = bound_view(end, end_kind);
+            if (cmp(end, it->end_position()) < 0) {
+                end = it->end_position();
            }
            it = rev.erase(it);
        } else if (c > 0) {
            // We overwrite the current tombstone.

-            if (less(it->start_bound(), start_bound)) {
-                auto new_end = bound_view(start, invert_kind(start_kind));
-                if (!less(new_end, it->start_bound())) {
-                    // Here it->start < start
-                    auto rt = construct_range_tombstone_entry(it->start_bound(), new_end, it->tombstone().tomb);
-                    rev.update(it, {start_bound, it->end_bound(), it->tombstone().tomb});
+            if (cmp(it->position(), start) < 0) {
+                {
+                    auto rt = construct_range_tombstone_entry(it->position(), start, it->tombstone().tomb);
+                    rev.update(it, {start, it->end_position(), it->tombstone().tomb});
                    rev.insert(it, *rt);
                    rt.release();
                }
            }

-            if (less(end_bound, it->end_bound())) {
+            if (cmp(end, it->end_position()) < 0) {
                // Here start <= it->start and end < it->end.
-                auto rt = construct_range_tombstone_entry(std::move(start), start_kind, end, end_kind, std::move(tomb));
-                rev.update(it, {std::move(end), invert_kind(end_kind), it->tombstone().end, it->tombstone().end_kind, it->tombstone().tomb});
+                auto rt = construct_range_tombstone_entry(std::move(start), end, std::move(tomb));
+                rev.update(it, {std::move(end), it->end_position(), it->tombstone().tomb});
                rev.insert(it, *rt);
                rt.release();
                return;
@@ -157,30 +154,28 @@ void range_tombstone_list::insert_from(const schema& s,
        } else {
            // We don't overwrite the current tombstone.

-            if (less(start_bound, it->start_bound())) {
+            if (cmp(start, it->position()) < 0) {
                // The new tombstone starts before the current one.
-                if (less(it->start_bound(), end_bound)) {
+                if (cmp(it->position(), end) < 0) {
                    // Here start < it->start and it->start < end.
-                    auto new_end_kind = invert_kind(it->tombstone().start_kind);
-                    if (!less(bound_view(it->tombstone().start, new_end_kind), start_bound)) {
-                        auto rt = construct_range_tombstone_entry(std::move(start), start_kind, it->tombstone().start, new_end_kind, tomb);
+                    {
+                        auto rt = construct_range_tombstone_entry(std::move(start), it->position(), tomb);
                        it = rev.insert(it, *rt);
                        rt.release();
                        ++it;
                    }
                } else {
                    // Here start < it->start and end <= it->start, so just insert the new tombstone.
-                    auto rt = construct_range_tombstone_entry(std::move(start), start_kind, std::move(end), end_kind, std::move(tomb));
+                    auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), std::move(tomb));
                    rev.insert(it, *rt);
                    rt.release();
                    return;
                }
            }

-            if (less(it->end_bound(), end_bound)) {
+            if (cmp(it->end_position(), end) < 0) {
                // Here the current tombstone overwrites a range of the new one.
-                start = it->tombstone().end;
-                start_kind = invert_kind(it->tombstone().end_kind);
+                start = it->end_position();
                ++it;
            } else {
                // Here the current tombstone completely overwrites the new one.
@@ -190,7 +185,7 @@ void range_tombstone_list::insert_from(const schema& s,
    }

    // If we got here, then just insert the remainder at the end.
-    auto rt = construct_range_tombstone_entry(std::move(start), start_kind, std::move(end), end_kind, std::move(tomb));
+    auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), std::move(tomb));
    rev.insert(it, *rt);
    rt.release();
 }
--- a/range_tombstone_list.hh
+++ b/range_tombstone_list.hh
@@ -297,7 +297,13 @@ public:
 private:
    void apply_reversibly(const schema& s, clustering_key_prefix start, bound_kind start_kind,
                          clustering_key_prefix end, bound_kind end_kind, tombstone tomb, reverter& rev);
-    void insert_from(const schema& s, range_tombstones_type::iterator it, clustering_key_prefix start,
-                     bound_kind start_kind, clustering_key_prefix end, bound_kind end_kind, tombstone tomb, reverter& rev);
+
+    void insert_from(const schema& s,
+                     range_tombstones_type::iterator it,
+                     position_in_partition start,
+                     position_in_partition end,
+                     tombstone tomb,
+                     reverter& rev);
+
    range_tombstones_type::iterator find(const schema& s, const range_tombstone_entry& rt);
 };
--- a/reader_concurrency_semaphore.cc
+++ b/reader_concurrency_semaphore.cc
@@ -249,6 +249,14 @@ public:
        return _base_resources;
    }

+    void release_base_resources() noexcept {
+        if (_base_resources_consumed) {
+            _resources -= _base_resources;
+            _base_resources_consumed = false;
+        }
+        _semaphore.signal(std::exchange(_base_resources, {}));
+    }
+
    sstring description() const {
        return format("{}.{}:{}",
                _schema ? _schema->ks_name() : "*",
@@ -394,6 +402,10 @@ reader_resources reader_permit::base_resources() const {
    return _impl->base_resources();
 }

+void reader_permit::release_base_resources() noexcept {
+    return _impl->release_base_resources();
+}
+
 sstring reader_permit::description() const {
    return _impl->description();
 }
--- a/reader_permit.hh
+++ b/reader_permit.hh
@@ -161,6 +161,8 @@ public:

    reader_resources base_resources() const;

+    void release_base_resources() noexcept;
+
    sstring description() const;

    db::timeout_clock::time_point timeout() const noexcept;
--- a/repair/row_level.cc
+++ b/repair/row_level.cc
@@ -407,6 +407,10 @@ public:
                    {},
                    mutation_reader::forwarding::no);
        } else {
+            // A single repair must not hold two permits that each carry a count
+            // resource. Release the base resources held by _permit so that the
+            // only count resource is the one the shard reader will obtain.
+            _permit.release_base_resources();
            _reader = make_multishard_streaming_reader(db, _schema, _permit, [this] {
                auto shard_range = _sharder.next();
                if (shard_range) {
--- a/schema_upgrader.hh
+++ b/schema_upgrader.hh
@@ -22,6 +22,7 @@
 #pragma once

 #include "mutation_fragment.hh"
+#include "mutation_fragment_v2.hh"
 #include "converting_mutation_partition_applier.hh"

 // A StreamedMutationTransformer which transforms the stream to a different schema
--- a/2
+++ b/2
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -635,16 +635,16 @@ void storage_service::bootstrap() {

        // Update pending ranges now, so we correctly count ourselves as a pending replica
        // when inserting the new CDC generation.
-      if (!bootstrap_rbno) {
-        // When is_repair_based_node_ops_enabled is true, the bootstrap node
-        // will use node_ops_cmd to bootstrap, node_ops_cmd will update the pending ranges.
-        slogger.debug("bootstrap: update pending ranges: endpoint={} bootstrap_tokens={}", get_broadcast_address(), _bootstrap_tokens);
-        mutate_token_metadata([this] (mutable_token_metadata_ptr tmptr) {
-            auto endpoint = get_broadcast_address();
-            tmptr->add_bootstrap_tokens(_bootstrap_tokens, endpoint);
-            return update_pending_ranges(std::move(tmptr), format("bootstrapping node {}", endpoint));
-        }).get();
-      }
+        if (!bootstrap_rbno) {
+            // When is_repair_based_node_ops_enabled is true, the bootstrap node
+            // will use node_ops_cmd to bootstrap, node_ops_cmd will update the pending ranges.
+            slogger.debug("bootstrap: update pending ranges: endpoint={} bootstrap_tokens={}", get_broadcast_address(), _bootstrap_tokens);
+            mutate_token_metadata([this] (mutable_token_metadata_ptr tmptr) {
+                auto endpoint = get_broadcast_address();
+                tmptr->add_bootstrap_tokens(_bootstrap_tokens, endpoint);
+                return update_pending_ranges(std::move(tmptr), format("bootstrapping node {}", endpoint));
+            }).get();
+        }

        // After we pick a generation timestamp, we start gossiping it, and we stick with it.
        // We don't do any other generation switches (unless we crash before complecting bootstrap).
@@ -652,19 +652,23 @@ void storage_service::bootstrap() {

        _cdc_gen_id = _cdc_gen_service.local().make_new_generation(_bootstrap_tokens, !is_first_node()).get0();

-      if (!bootstrap_rbno) {
-        // When is_repair_based_node_ops_enabled is true, the bootstrap node
-        // will use node_ops_cmd to bootstrap, bootstrapping gossip status is not needed for bootstrap.
-        _gossiper.add_local_application_state({
-            // Order is important: both the CDC streams timestamp and tokens must be known when a node handles our status.
-            { gms::application_state::TOKENS, versioned_value::tokens(_bootstrap_tokens) },
-            { gms::application_state::CDC_GENERATION_ID, versioned_value::cdc_generation_id(_cdc_gen_id) },
-            { gms::application_state::STATUS, versioned_value::bootstrapping(_bootstrap_tokens) },
-        }).get();
+        if (!bootstrap_rbno) {
+            // When is_repair_based_node_ops_enabled is true, the bootstrap node
+            // will use node_ops_cmd to bootstrap, bootstrapping gossip status is not needed for bootstrap.
+            _gossiper.add_local_application_state({
+                { gms::application_state::TOKENS, versioned_value::tokens(_bootstrap_tokens) },
+                { gms::application_state::CDC_GENERATION_ID, versioned_value::cdc_generation_id(_cdc_gen_id) },
+                { gms::application_state::STATUS, versioned_value::bootstrapping(_bootstrap_tokens) },
+            }).get();

-        set_mode(mode::JOINING, format("sleeping {} ms for pending range setup", get_ring_delay().count()), true);
-        _gossiper.wait_for_range_setup().get();
-     }
+            set_mode(mode::JOINING, format("sleeping {} ms for pending range setup", get_ring_delay().count()), true);
+            _gossiper.wait_for_range_setup().get();
+        } else {
+            // Even with RBNO bootstrap we need to announce the new CDC generation immediately after it's created.
+            _gossiper.add_local_application_state({
+                { gms::application_state::CDC_GENERATION_ID, versioned_value::cdc_generation_id(_cdc_gen_id) },
+            }).get();
+        }
    } else {
        // Wait until we know tokens of existing node before announcing replacing status.
        set_mode(mode::JOINING, fmt::format("Wait until local node knows tokens of peer nodes"), true);
@@ -3670,7 +3674,7 @@ shared_ptr<abort_source> node_ops_meta_data::get_abort_source() {

 void storage_service::node_ops_update_heartbeat(utils::UUID ops_uuid) {
    slogger.debug("node_ops_update_heartbeat: ops_uuid={}", ops_uuid);
-    auto permit = seastar::get_units(_node_ops_abort_sem, 1);
+    auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
    auto it = _node_ops.find(ops_uuid);
    if (it != _node_ops.end()) {
        node_ops_meta_data& meta = it->second;
@@ -3680,7 +3684,7 @@ void storage_service::node_ops_update_heartbeat(utils::UUID ops_uuid) {

 void storage_service::node_ops_done(utils::UUID ops_uuid) {
    slogger.debug("node_ops_done: ops_uuid={}", ops_uuid);
-    auto permit = seastar::get_units(_node_ops_abort_sem, 1);
+    auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
    auto it = _node_ops.find(ops_uuid);
    if (it != _node_ops.end()) {
        node_ops_meta_data& meta = it->second;
@@ -3691,7 +3695,7 @@ void storage_service::node_ops_done(utils::UUID ops_uuid) {

 void storage_service::node_ops_abort(utils::UUID ops_uuid) {
    slogger.debug("node_ops_abort: ops_uuid={}", ops_uuid);
-    auto permit = seastar::get_units(_node_ops_abort_sem, 1);
+    auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
    auto it = _node_ops.find(ops_uuid);
    if (it != _node_ops.end()) {
        node_ops_meta_data& meta = it->second;
--- a/sstables/kl/reader.cc
+++ b/sstables/kl/reader.cc
@@ -1155,7 +1155,7 @@ private:
    }
    index_reader& get_index_reader() {
        if (!_index_reader) {
-            auto caching = use_caching(!_slice.options.contains(query::partition_slice::option::bypass_cache));
+            auto caching = use_caching(global_cache_index_pages && !_slice.options.contains(query::partition_slice::option::bypass_cache));
            _index_reader = std::make_unique<index_reader>(_sst, _consumer.permit(), _consumer.io_priority(),
                                                           _consumer.trace_state(), caching);
        }
--- a/sstables/mx/reader.cc
+++ b/sstables/mx/reader.cc
@@ -1308,7 +1308,7 @@ private:
    }
    index_reader& get_index_reader() {
        if (!_index_reader) {
-            auto caching = use_caching(!_slice.options.contains(query::partition_slice::option::bypass_cache));
+            auto caching = use_caching(global_cache_index_pages && !_slice.options.contains(query::partition_slice::option::bypass_cache));
            _index_reader = std::make_unique<index_reader>(_sst, _consumer.permit(), _consumer.io_priority(),
                                                           _consumer.trace_state(), caching);
        }
@@ -1745,9 +1745,7 @@ public:
        _monitor.on_read_started(_context->reader_position());
    }
 public:
-    void on_out_of_clustering_range() override {
-        push_mutation_fragment(mutation_fragment_v2(*_schema, _permit, partition_end()));
-    }
+    void on_out_of_clustering_range() override { }
    virtual future<> fast_forward_to(const dht::partition_range& pr) override {
        on_internal_error(sstlog, "mx_crawling_sstable_mutation_reader: doesn't support fast_forward_to(const dht::partition_range&)");
    }
--- a/Show More
+++ b/Show More