release: prepare for 4.6.7

transport/server.cc: Return correct size of decompressed lz4 buffer
An incorrect size is returned from the function, which could lead to crashes or undefined behavior. Fix by erroring out in these cases. Fixes #11476 (cherry picked from commit 1c2eef384d)
2022-09-07 11:17:55 +03:00 · 2022-09-07 10:58:54 +03:00 · 2022-09-06 17:56:30 +03:00 · 2022-09-01 20:34:22 +03:00 · 2022-09-01 15:44:35 +03:00 · 2022-08-11 19:19:30 +02:00
115 changed files with 2234 additions and 507 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/2
+++ b/2
@@ -60,7 +60,7 @@ fi

 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=4.6.dev
+VERSION=4.6.7

 if test -f version
 then
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -1017,18 +1017,16 @@ future<executor::request_return_type> executor::update_table(client_state& clien
    _stats.api_operations.update_table++;
    elogger.trace("Updating table {}", request);

-    std::string table_name = get_table_name(request);
-    if (table_name.find(INTERNAL_TABLE_PREFIX) == 0) {
+    schema_ptr tab = get_table(_proxy, request);
+    // the ugly but harmless conversion to string_view here is because
+    // Seastar's sstring is missing a find(std::string_view) :-()
+    if (std::string_view(tab->cf_name()).find(INTERNAL_TABLE_PREFIX) == 0) {
        return make_ready_future<request_return_type>(api_error::validation(
                format("Prefix {} is reserved for accessing internal tables", INTERNAL_TABLE_PREFIX)));
    }
-    std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
-    tracing::add_table_name(trace_state, keyspace_name, table_name);
+    tracing::add_table_name(trace_state, tab->ks_name(), tab->cf_name());

-    auto& db = _proxy.get_db().local();
-    auto& cf = db.find_column_family(keyspace_name, table_name);
-
-    schema_builder builder(cf.schema());
+    schema_builder builder(tab);

    rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
    if (stream_specification && stream_specification->IsObject()) {
@@ -2080,6 +2078,9 @@ static attrs_to_get calculate_attrs_to_get(const rjson::value& req, std::unorder
        for (auto it = attributes_to_get.Begin(); it != attributes_to_get.End(); ++it) {
            attribute_path_map_add("AttributesToGet", ret, it->GetString());
        }
+        if (ret.empty()) {
+            throw api_error::validation("Empty AttributesToGet is not allowed. Consider using Select=COUNT instead.");
+        }
        return ret;
    } else if (has_projection_expression) {
        const rjson::value& projection_expression = req["ProjectionExpression"];
@@ -2481,8 +2482,8 @@ static bool hierarchy_actions(
                        // attr member so we can use add()
                        rjson::add_with_string_name(v, attr, std::move(*newv));
                    } else {
-                        throw api_error::validation(format("Can't remove document path {} - not present in item",
-                            subh.get_value()._path));
+                        // Removing a.b when a is a map but a.b doesn't exist
+                        // is silently ignored. It's not considered an error.
                    }
                } else {
                    throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
--- a/alternator/ttl.cc
+++ b/alternator/ttl.cc
@@ -94,10 +94,7 @@ future<executor::request_return_type> executor::update_time_to_live(client_state
 }

 future<executor::request_return_type> executor::describe_time_to_live(client_state& client_state, service_permit permit, rjson::value request) {
-    _stats.api_operations.update_time_to_live++;
-    if (!_proxy.get_db().local().features().cluster_supports_alternator_ttl()) {
-        co_return api_error::unknown_operation("DescribeTimeToLive not yet supported. Experimental support is available if the 'alternator_ttl' experimental feature is enabled on all nodes.");
-    }
+    _stats.api_operations.describe_time_to_live++;
    schema_ptr schema = get_table(_proxy, request);
    std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
    rjson::value desc = rjson::empty_object();
--- a/atomic_cell.cc
+++ b/atomic_cell.cc
@@ -79,6 +79,49 @@ atomic_cell::atomic_cell(const abstract_type& type, atomic_cell_view other)
    set_view(_data);
 }

+// Orders two cells for merging: by timestamp first, then by the tie-breakers below. Based on:
+//  - org.apache.cassandra.db.AbstractCell#reconcile()
+//  - org.apache.cassandra.db.BufferExpiringCell#reconcile()
+//  - org.apache.cassandra.db.BufferDeletedCell#reconcile()
+std::strong_ordering
+compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
+    if (left.timestamp() != right.timestamp()) {
+        return left.timestamp() <=> right.timestamp();
+    }
+    if (left.is_live() != right.is_live()) { // timestamps tie: the live cell orders below the dead one
+        return left.is_live() ? std::strong_ordering::less : std::strong_ordering::greater;
+    }
+    if (left.is_live()) {
+        auto c = compare_unsigned(left.value(), right.value()) <=> 0;
+        if (c != 0) {
+            return c;
+        }
+        if (left.is_live_and_has_ttl() != right.is_live_and_has_ttl()) {
+            // prefer expiring cells.
+            return left.is_live_and_has_ttl() ? std::strong_ordering::greater : std::strong_ordering::less;
+        }
+        if (left.is_live_and_has_ttl()) {
+            if (left.expiry() != right.expiry()) {
+                return left.expiry() <=> right.expiry();
+            } else {
+                // prefer the cell that was written later,
+                // so it survives longer after it expires, until purged.
+                return right.ttl() <=> left.ttl();
+            }
+        }
+    } else {
+        // Both are deleted
+
+        // Origin compares big-endian serialized deletion time. That's because it
+        // delegates to AbstractCell.reconcile() which compares values after
+        // comparing timestamps, which in case of deleted cells will hold
+        // serialized expiry.
+        return (uint64_t) left.deletion_time().time_since_epoch().count()
+                <=> (uint64_t) right.deletion_time().time_since_epoch().count();
+    }
+    return std::strong_ordering::equal; // both live, values compare equal, neither has a TTL
+}
+
 atomic_cell_or_collection atomic_cell_or_collection::copy(const abstract_type& type) const {
    if (_data.empty()) {
        return atomic_cell_or_collection();
--- a/cache_flat_mutation_reader.hh
+++ b/cache_flat_mutation_reader.hh
@@ -593,8 +593,8 @@ void cache_flat_mutation_reader::move_to_range(query::clustering_row_ranges::con
                clogger.trace("csm {}: insert dummy at {}", fmt::ptr(this), _lower_bound);
                auto it = with_allocator(_lsa_manager.region().allocator(), [&] {
                    auto& rows = _snp->version()->partition().mutable_clustered_rows();
-                    auto new_entry = current_allocator().construct<rows_entry>(*_schema, _lower_bound, is_dummy::yes, is_continuous::no);
-                    return rows.insert_before(_next_row.get_iterator_in_latest_version(), *new_entry);
+                    auto new_entry = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(*_schema, _lower_bound, is_dummy::yes, is_continuous::no));
+                    return rows.insert_before(_next_row.get_iterator_in_latest_version(), std::move(new_entry));
                });
                _snp->tracker()->insert(*it);
                _last_row = partition_snapshot_row_weakref(*_snp, it, true);
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -765,8 +765,12 @@ future<> generation_service::check_and_repair_cdc_streams() {
    std::optional<cdc::generation_id> latest = _gen_id;
    const auto& endpoint_states = _gossiper.get_endpoint_states();
    for (const auto& [addr, state] : endpoint_states) {
-        if (!_gossiper.is_normal(addr))  {
-            throw std::runtime_error(format("All nodes must be in NORMAL state while performing check_and_repair_cdc_streams"
+        if (_gossiper.is_left(addr)) {
+            cdc_log.info("check_and_repair_cdc_streams ignored node {} because it is in LEFT state", addr);
+            continue;
+        }
+        if (!_gossiper.is_normal(addr)) {
+            throw std::runtime_error(format("All nodes must be in NORMAL or LEFT state while performing check_and_repair_cdc_streams"
                    " ({} is in state {})", addr, _gossiper.get_gossip_status(state)));
        }

@@ -830,6 +834,11 @@ future<> generation_service::check_and_repair_cdc_streams() {
                latest, db_clock::now());
            should_regenerate = true;
        } else {
+          if (tmptr->sorted_tokens().size() != gen->entries().size()) {
+              // We probably have garbage streams from old generations
+              cdc_log.info("Generation size does not match the token ring, regenerating");
+              should_regenerate = true;
+          } else {
            std::unordered_set<dht::token> gen_ends;
            for (const auto& entry : gen->entries()) {
                gen_ends.insert(entry.token_range_end);
@@ -841,6 +850,7 @@ future<> generation_service::check_and_repair_cdc_streams() {
                    break;
                }
            }
+          }
        }
    }

--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -73,7 +73,7 @@ using namespace std::chrono_literals;
 logging::logger cdc_log("cdc");

 namespace cdc {
-static schema_ptr create_log_schema(const schema&, std::optional<utils::UUID> = {});
+static schema_ptr create_log_schema(const schema&, std::optional<utils::UUID> = {}, schema_ptr = nullptr);
 }

 static constexpr auto cdc_group_name = "cdc";
@@ -220,7 +220,7 @@ public:
                return;
            }

-            auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt);
+            auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt, log_schema);

            auto log_mut = log_schema 
                ? db::schema_tables::make_update_table_mutations(db, keyspace.metadata(), log_schema, new_log_schema, timestamp, false)
@@ -503,7 +503,7 @@ bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name) {
    return to_bytes(cdc_deleted_elements_column_prefix) + column_name;
 }

-static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid) {
+static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid, schema_ptr old) {
    schema_builder b(s.ks_name(), log_name(s.cf_name()));
    b.with_partitioner("com.scylladb.dht.CDCPartitioner");
    b.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
@@ -590,6 +590,20 @@ static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID>
        b.set_uuid(*uuid);
    }

+    /**
+     * #10473 - if we are redefining the log table, we need to ensure any dropped
+     * columns are registered in "dropped_columns" table, otherwise clients will not
+     * be able to read data older than now.
+     */
+    if (old) {
+        // not super efficient, but we don't do this often.
+        for (auto& col : old->all_columns()) {
+            if (!b.has_column({col.name(), col.name_as_text() })) {
+                b.without_column(col.name_as_text(), col.type, api::new_timestamp());
+            }
+        }
+    }
+
    return b.build();
 }

@@ -1511,6 +1525,11 @@ public:
        }

        auto process_cell = [&, this] (const column_definition& cdef) {
+            // If table uses compact storage it may contain a column of type empty
+            // and we need to ignore such a field because it is not present in CDC log.
+            if (cdef.type->get_kind() == abstract_type::kind::empty) {
+                return;
+            }
            if (auto current = get_col_from_row_state(row_state, cdef)) {
                _builder->set_value(image_ck, cdef, *current);
            } else if (op == operation::pre_image) {
--- a/compaction/compaction.cc
+++ b/compaction/compaction.cc
@@ -1634,7 +1634,7 @@ future<bool> scrub_validate_mode_validate_reader(flat_mutation_reader reader, co
        while (auto mf_opt = co_await reader()) {
            if (cdata.is_stop_requested()) [[unlikely]] {
                // Compaction manager will catch this exception and re-schedule the compaction.
-                co_return coroutine::make_exception(compaction_stopped_exception(schema->ks_name(), schema->cf_name(), cdata.stop_requested));
+                throw compaction_stopped_exception(schema->ks_name(), schema->cf_name(), cdata.stop_requested);
            }

            const auto& mf = *mf_opt;
--- a/compaction/compaction_manager.cc
+++ b/compaction/compaction_manager.cc
@@ -326,6 +326,11 @@ future<> compaction_manager::run_custom_job(column_family* cf, sstables::compact
    task->compaction_done = with_semaphore(_custom_job_sem, 1, [this, task, cf, &job = *job_ptr] () mutable {
        // take read lock for cf, so major compaction and resharding can't proceed in parallel.
        return with_lock(_compaction_locks[cf].for_read(), [this, task, cf, &job] () mutable {
+            // Allow caller to know that task (e.g. reshape) was asked to stop while waiting for a chance to run.
+            if (task->compaction_data.is_stop_requested()) {
+                throw sstables::compaction_stopped_exception(task->compacting_cf->schema()->ks_name(), task->compacting_cf->schema()->cf_name(),
+                    task->compaction_data.stop_requested);
+            }
            _stats.active_tasks++;
            if (!can_proceed(task)) {
                return make_ready_future<>();
@@ -676,6 +681,7 @@ void compaction_manager::submit_offstrategy(column_family* cf) {
                _stats.active_tasks++;
                task->setup_new_compaction();

+              return with_scheduling_group(_maintenance_sg.cpu, [this, task, cf] {
                return cf->run_offstrategy_compaction(task->compaction_data).then_wrapped([this, task] (future<> f) mutable {
                    _stats.active_tasks--;
                    task->finish_compaction();
@@ -698,6 +704,7 @@ void compaction_manager::submit_offstrategy(column_family* cf) {
                    _tasks.remove(task);
                    return make_ready_future<stop_iteration>(stop_iteration::yes);
                });
+              });
            });
        });
    });
@@ -714,9 +721,20 @@ inline bool compaction_manager::check_for_cleanup(column_family* cf) {

 future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compaction_type_options options, get_candidates_func get_func, can_purge_tombstones can_purge) {
    auto task = make_lw_shared<compaction_manager::task>(cf, options.type());
-    _tasks.push_back(task);

-    auto sstables = std::make_unique<std::vector<sstables::shared_sstable>>(get_func(*cf));
+    std::unique_ptr<std::vector<sstables::shared_sstable>> sstables;
+    lw_shared_ptr<compacting_sstable_registration> compacting;
+
+    // since we might potentially have ongoing compactions, and we
+    // must ensure that all sstables created before we run are included
+    // in the re-write, we need to barrier out any previously running
+    // compaction.
+    auto get_and_register_candidates_func = [this, &sstables, &compacting, &get_func] () mutable -> future<> {
+        sstables = std::make_unique<std::vector<sstables::shared_sstable>>(co_await get_func());
+        compacting = make_lw_shared<compacting_sstable_registration>(this, *sstables);
+    };
+
+    co_await cf->run_with_compaction_disabled(std::ref(get_and_register_candidates_func));
    // sort sstables by size in descending order, such that the smallest files will be rewritten first
    // (as sstable to be rewritten is popped off from the back of container), so rewrite will have higher
    // chance to succeed when the biggest files are reached.
@@ -724,10 +742,11 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
        return a->data_size() > b->data_size();
    });

-    auto compacting = make_lw_shared<compacting_sstable_registration>(this, *sstables);
    auto sstables_ptr = sstables.get();
    _stats.pending_tasks += sstables->size();

+    _tasks.push_back(task);
+
    task->compaction_done = do_until([this, sstables_ptr, task] { return sstables_ptr->empty() || !can_proceed(task); },
             [this, task, options, sstables_ptr, compacting, can_purge] () mutable {
        auto sst = sstables_ptr->back();
@@ -737,8 +756,10 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
            column_family& cf = *task->compacting_cf;
            auto sstable_level = sst->get_sstable_level();
            auto run_identifier = sst->run_identifier();
+
            auto sstable_set_snapshot = can_purge ? std::make_optional(cf.get_sstable_set()) : std::nullopt;
-            auto descriptor = sstables::compaction_descriptor({ sst }, std::move(sstable_set_snapshot), _maintenance_sg.io,
+            // FIXME: this compaction should run with maintenance priority.
+            auto descriptor = sstables::compaction_descriptor({ sst }, std::move(sstable_set_snapshot), service::get_local_compaction_priority(),
                sstable_level, sstables::compaction_descriptor::default_max_sstable_bytes, run_identifier, options);

            // Releases reference to cleaned sstable such that respective used disk space can be freed.
@@ -747,15 +768,14 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
            };

            return with_semaphore(_rewrite_sstables_sem, 1, [this, task, &cf, descriptor = std::move(descriptor), compacting] () mutable {
-              // Take write lock for cf to serialize cleanup/upgrade sstables/scrub with major compaction/reshape/reshard.
-              return with_lock(_compaction_locks[&cf].for_write(), [this, task, &cf, descriptor = std::move(descriptor), compacting] () mutable {
+              return with_lock(_compaction_locks[&cf].for_read(), [this, task, &cf, descriptor = std::move(descriptor), compacting] () mutable {
                _stats.pending_tasks--;
                _stats.active_tasks++;
                task->setup_new_compaction();
                task->output_run_identifier = descriptor.run_identifier;
                compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_compaction_controller.backlog_of_shares(200), _available_memory));
                return do_with(std::move(user_initiated), [this, &cf, descriptor = std::move(descriptor), task] (compaction_backlog_tracker& bt) mutable {
-                    return with_scheduling_group(_maintenance_sg.cpu, [this, &cf, descriptor = std::move(descriptor), task]() mutable {
+                    return with_scheduling_group(_compaction_controller.sg(), [this, &cf, descriptor = std::move(descriptor), task]() mutable {
                        return cf.compact_sstables(std::move(descriptor), task->compaction_data);
                    });
                });
@@ -783,7 +803,7 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
        _tasks.remove(task);
    });

-    return task->compaction_done.get_future().then([task] {});
+    co_return co_await task->compaction_done.get_future();
 }

 future<> compaction_manager::perform_sstable_scrub_validate_mode(column_family* cf) {
@@ -865,31 +885,29 @@ future<> compaction_manager::perform_cleanup(database& db, column_family* cf) {
        return make_exception_future<>(std::runtime_error(format("cleanup request failed: there is an ongoing cleanup on {}.{}",
            cf->schema()->ks_name(), cf->schema()->cf_name())));
    }
-    return seastar::async([this, cf, &db] {
+  // FIXME: indentation
+  auto sorted_owned_ranges = db.get_keyspace_local_ranges(cf->schema()->ks_name());
+  auto get_sstables = [this, &db, cf, sorted_owned_ranges] () -> future<std::vector<sstables::shared_sstable>> {
+    return seastar::async([this, &db, cf, sorted_owned_ranges = std::move(sorted_owned_ranges)] {
        auto schema = cf->schema();
-        auto sorted_owned_ranges = db.get_keyspace_local_ranges(schema->ks_name());
        auto sstables = std::vector<sstables::shared_sstable>{};
        const auto candidates = get_candidates(*cf);
        std::copy_if(candidates.begin(), candidates.end(), std::back_inserter(sstables), [&sorted_owned_ranges, schema] (const sstables::shared_sstable& sst) {
            seastar::thread::maybe_yield();
            return sorted_owned_ranges.empty() || needs_cleanup(sst, sorted_owned_ranges, schema);
        });
-        return std::tuple<dht::token_range_vector, std::vector<sstables::shared_sstable>>(sorted_owned_ranges, sstables);
-    }).then_unpack([this, cf, &db] (dht::token_range_vector owned_ranges, std::vector<sstables::shared_sstable> sstables) {
-        return rewrite_sstables(cf, sstables::compaction_type_options::make_cleanup(std::move(owned_ranges)),
-                [sstables = std::move(sstables)] (const table&) { return sstables; });
+       return sstables;
    });
+  };
+  return rewrite_sstables(cf, sstables::compaction_type_options::make_cleanup(std::move(sorted_owned_ranges)), std::move(get_sstables));
 }

 // Submit a column family to be upgraded and wait for its termination.
 future<> compaction_manager::perform_sstable_upgrade(database& db, column_family* cf, bool exclude_current_version) {
-    using shared_sstables = std::vector<sstables::shared_sstable>;
-    return do_with(shared_sstables{}, [this, &db, cf, exclude_current_version](shared_sstables& tables) {
-        // since we might potentially have ongoing compactions, and we
-        // must ensure that all sstables created before we run are included
-        // in the re-write, we need to barrier out any previously running
-        // compaction.
-        return cf->run_with_compaction_disabled([this, cf, &tables, exclude_current_version] {
+    auto get_sstables = [this, &db, cf, exclude_current_version] {
+            // FIXME: indentation
+            std::vector<sstables::shared_sstable> tables;
+
            auto last_version = cf->get_sstables_manager().get_highest_supported_format();

            for (auto& sst : get_candidates(*cf)) {
@@ -900,21 +918,17 @@ future<> compaction_manager::perform_sstable_upgrade(database& db, column_family
                    tables.emplace_back(sst);
                }
            }
-            return make_ready_future<>();
-        }).then([&db, cf] {
-             return db.get_keyspace_local_ranges(cf->schema()->ks_name());
-        }).then([this, &db, cf, &tables] (dht::token_range_vector owned_ranges) {
-            // doing a "cleanup" is about as compacting as we need
-            // to be, provided we get to decide the tables to process,
-            // and ignoring any existing operations.
-            // Note that we potentially could be doing multiple
-            // upgrades here in parallel, but that is really the users
-            // problem.
-            return rewrite_sstables(cf, sstables::compaction_type_options::make_upgrade(std::move(owned_ranges)), [&](auto&) mutable {
-                return std::exchange(tables, {});
-            });
-        });
-    });
+
+            return make_ready_future<std::vector<sstables::shared_sstable>>(tables);
+    };
+
+    // doing a "cleanup" is about as compacting as we need
+    // to be, provided we get to decide the tables to process,
+    // and ignoring any existing operations.
+    // Note that we potentially could be doing multiple
+    // upgrades here in parallel, but that is really the user's
+    // problem.
+    return rewrite_sstables(cf, sstables::compaction_type_options::make_upgrade(db.get_keyspace_local_ranges(cf->schema()->ks_name())), std::move(get_sstables));
 }

 // Submit a column family to be scrubbed and wait for its termination.
@@ -922,14 +936,10 @@ future<> compaction_manager::perform_sstable_scrub(column_family* cf, sstables::
    if (scrub_mode == sstables::compaction_type_options::scrub::mode::validate) {
        return perform_sstable_scrub_validate_mode(cf);
    }
-    // since we might potentially have ongoing compactions, and we
-    // must ensure that all sstables created before we run are scrubbed,
-    // we need to barrier out any previously running compaction.
-    return cf->run_with_compaction_disabled([this, cf, scrub_mode] {
-        return rewrite_sstables(cf, sstables::compaction_type_options::make_scrub(scrub_mode), [this] (const table& cf) {
-            return get_candidates(cf);
+        // FIXME: indentation
+        return rewrite_sstables(cf, sstables::compaction_type_options::make_scrub(scrub_mode), [this, cf] {
+            return make_ready_future<std::vector<sstables::shared_sstable>>(get_candidates(*cf));
        }, can_purge_tombstones::no);
-    });
 }

 future<> compaction_manager::remove(column_family* cf) {
@@ -979,7 +989,7 @@ void compaction_manager::stop_compaction(sstring type) {
    }
    // FIXME: switch to task_stop(), and wait for their termination, so API user can know when compactions actually stopped.
    for (auto& task : _tasks) {
-        if (task->compaction_running && target_type == task->type) {
+        if (target_type == task->type) {
            task->compaction_data.stop("user request");
        }
    }
--- a/compaction/compaction_manager.hh
+++ b/compaction/compaction_manager.hh
@@ -178,7 +178,7 @@ private:
    maintenance_scheduling_group _maintenance_sg;
    size_t _available_memory;

-    using get_candidates_func = std::function<std::vector<sstables::shared_sstable>(const column_family&)>;
+    using get_candidates_func = std::function<future<std::vector<sstables::shared_sstable>>()>;
    class can_purge_tombstones_tag;
    using can_purge_tombstones = bool_class<can_purge_tombstones_tag>;

--- a/compaction/leveled_compaction_strategy.cc
+++ b/compaction/leveled_compaction_strategy.cc
@@ -80,7 +80,11 @@ compaction_descriptor leveled_compaction_strategy::get_major_compaction_job(colu
 }

 void leveled_compaction_strategy::notify_completion(const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) {
-    if (removed.empty() || added.empty()) {
+    // All the updates here are only relevant for regular compaction's round-robin picking policy, and if
+    // last_compacted_keys wasn't generated by regular, it means regular has been disabled since last restart,
+    // therefore we can skip the updates here until regular runs for the first time. Once it runs,
+    // it will be able to generate last_compacted_keys correctly by looking at metadata of files.
+    if (removed.empty() || added.empty() || !_last_compacted_keys) {
        return;
    }
    auto min_level = std::numeric_limits<uint32_t>::max();
--- a/compaction/time_window_compaction_strategy.cc
+++ b/compaction/time_window_compaction_strategy.cc
@@ -225,6 +225,7 @@ time_window_compaction_strategy::get_sstables_for_compaction(column_family& cf,
    auto gc_before = gc_clock::now() - cf.schema()->gc_grace_seconds();

    if (candidates.empty()) {
+        _estimated_remaining_tasks = 0;
        return compaction_descriptor();
    }

--- a/cql3/cql_statement.hh
+++ b/cql3/cql_statement.hh
@@ -109,9 +109,7 @@ public:
    virtual seastar::future<seastar::shared_ptr<cql_transport::messages::result_message>>
        execute(query_processor& qp, service::query_state& state, const query_options& options) const = 0;

-    virtual bool depends_on_keyspace(const seastar::sstring& ks_name) const = 0;
-
-    virtual bool depends_on_column_family(const seastar::sstring& cf_name) const = 0;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const = 0;

    virtual seastar::shared_ptr<const metadata> get_result_metadata() const = 0;

--- a/cql3/expr/expression.cc
+++ b/cql3/expr/expression.cc
@@ -117,10 +117,44 @@ managed_bytes_opt get_value(const column_value& col, const column_value_eval_bag
        if (!col_type->is_map()) {
            throw exceptions::invalid_request_exception(format("subscripting non-map column {}", cdef->name_as_text()));
        }
-        const auto deserialized = cdef->type->deserialize(managed_bytes_view(*data.other_columns[data.sel.index_of(*cdef)]));
+        int32_t index = data.sel.index_of(*cdef);
+        if (index == -1) {
+            throw std::runtime_error(
+                    format("Column definition {} does not match any column in the query selection",
+                    cdef->name_as_text()));
+        }
+        const managed_bytes_opt& serialized = data.other_columns[index];
+        if (!serialized) {
+            // For null[i] we return null.
+            return std::nullopt;
+        }
+        const auto deserialized = cdef->type->deserialize(managed_bytes_view(*serialized));
        const auto& data_map = value_cast<map_type_impl::native_type>(deserialized);
        const auto key = evaluate_to_raw_view(col.sub, options);
        auto&& key_type = col_type->name_comparator();
+        if (key.is_null()) {
+            // For m[null] return null.
+            // This is different from Cassandra - which treats m[null]
+            // as an invalid request error. But m[null] -> null is more
+            // consistent with our usual null treatment (e.g., both
+            // null[2] and null < 2 return null). It will also allow us
+            // to support non-constant subscripts (e.g., m[a]) where "a"
+            // may be null in some rows and non-null in others, and it's
+            // not an error.
+            return std::nullopt;
+        }
+        if (key.is_unset_value()) {
+            // An m[?] with ? bound to UNSET_VALUE is an invalid query.
+            // We could have detected it earlier while binding, but since
+            // we currently don't, we must protect the following code
+            // which can't work with an UNSET_VALUE. Note that the
+            // placement of this check here means that in an empty table,
+            // where we never need to evaluate the filter expression, this
+            // error will not be detected.
+            throw exceptions::invalid_request_exception(
+                format("Unsupported unset map key for column {}",
+                    cdef->name_as_text()));
+        }
        const auto found = key.with_linearized([&] (bytes_view key_bv) {
            using entry = std::pair<data_value, data_value>;
            return std::find_if(data_map.cbegin(), data_map.cend(), [&] (const entry& element) {
@@ -135,8 +169,16 @@ managed_bytes_opt get_value(const column_value& col, const column_value_eval_bag
        case column_kind::clustering_key:
            return managed_bytes(data.clustering_key[cdef->id]);
        case column_kind::static_column:
-        case column_kind::regular_column:
-            return managed_bytes_opt(data.other_columns[data.sel.index_of(*cdef)]);
+            [[fallthrough]];
+        case column_kind::regular_column: {
+            int32_t index = data.sel.index_of(*cdef);
+            if (index == -1) {
+                throw std::runtime_error(
+                        format("Column definition {} does not match any column in the query selection",
+                        cdef->name_as_text()));
+            }
+            return managed_bytes_opt(data.other_columns[index]);
+        }
        default:
            throw exceptions::unsupported_operation_exception("Unknown column kind");
        }
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -970,7 +970,7 @@ bool query_processor::migration_subscriber::should_invalidate(
        sstring ks_name,
        std::optional<sstring> cf_name,
        ::shared_ptr<cql_statement> statement) {
-    return statement->depends_on_keyspace(ks_name) && (!cf_name || statement->depends_on_column_family(*cf_name));
+    return statement->depends_on(ks_name, cf_name);
 }

 future<> query_processor::query_internal(
--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -528,7 +528,7 @@ statement_restrictions::statement_restrictions(database& db,
    }

    if (!_nonprimary_key_restrictions->empty()) {
-        if (_has_queriable_regular_index) {
+        if (_has_queriable_regular_index && _partition_range_is_simple) {
            _uses_secondary_indexing = true;
        } else if (!allow_filtering) {
            throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
--- a/cql3/result_set.hh
+++ b/cql3/result_set.hh
@@ -193,7 +193,7 @@ public:

    template<typename RowComparator>
    void sort(const RowComparator& cmp) {
-        std::sort(_rows.begin(), _rows.end(), std::ref(cmp));
+        std::sort(_rows.begin(), _rows.end(), cmp);
    }

    metadata& get_metadata();
--- a/cql3/statements/authentication_statement.cc
+++ b/cql3/statements/authentication_statement.cc
@@ -46,13 +46,7 @@ uint32_t cql3::statements::authentication_statement::get_bound_terms() const {
    return 0;
 }

-bool cql3::statements::authentication_statement::depends_on_keyspace(
-                const sstring& ks_name) const {
-    return false;
-}
-
-bool cql3::statements::authentication_statement::depends_on_column_family(
-                const sstring& cf_name) const {
+bool cql3::statements::authentication_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
    return false;
 }

--- a/cql3/statements/authentication_statement.hh
+++ b/cql3/statements/authentication_statement.hh
@@ -55,9 +55,7 @@ public:

    uint32_t get_bound_terms() const override;

-    bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    bool depends_on_column_family(const sstring& cf_name) const override;
+    bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;

--- a/cql3/statements/authorization_statement.cc
+++ b/cql3/statements/authorization_statement.cc
@@ -48,13 +48,7 @@ uint32_t cql3::statements::authorization_statement::get_bound_terms() const {
    return 0;
 }

-bool cql3::statements::authorization_statement::depends_on_keyspace(
-                const sstring& ks_name) const {
-    return false;
-}
-
-bool cql3::statements::authorization_statement::depends_on_column_family(
-                const sstring& cf_name) const {
+bool cql3::statements::authorization_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
    return false;
 }

--- a/cql3/statements/authorization_statement.hh
+++ b/cql3/statements/authorization_statement.hh
@@ -59,9 +59,7 @@ public:

    uint32_t get_bound_terms() const override;

-    bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    bool depends_on_column_family(const sstring& cf_name) const override;
+    bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;

--- a/cql3/statements/batch_statement.cc
+++ b/cql3/statements/batch_statement.cc
@@ -98,14 +98,9 @@ batch_statement::batch_statement(type type_,
 {
 }

-bool batch_statement::depends_on_keyspace(const sstring& ks_name) const
+bool batch_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
 {
-    return false;
-}
-
-bool batch_statement::depends_on_column_family(const sstring& cf_name) const
-{
-    return false;
+    return boost::algorithm::any_of(_statements, [&ks_name, &cf_name] (auto&& s) { return s.statement->depends_on(ks_name, cf_name); });
 }

 uint32_t batch_statement::get_bound_terms() const
--- a/cql3/statements/batch_statement.hh
+++ b/cql3/statements/batch_statement.hh
@@ -115,9 +115,7 @@ public:
                    std::unique_ptr<attributes> attrs,
                    cql_stats& stats);

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual uint32_t get_bound_terms() const override;

--- a/cql3/statements/modification_statement.cc
+++ b/cql3/statements/modification_statement.cc
@@ -571,12 +571,8 @@ modification_statement::validate(service::storage_proxy&, const service::client_
    }
 }

-bool modification_statement::depends_on_keyspace(const sstring& ks_name) const {
-    return keyspace() == ks_name;
-}
-
-bool modification_statement::depends_on_column_family(const sstring& cf_name) const {
-    return column_family() == cf_name;
+bool modification_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
+    return keyspace() == ks_name && (!cf_name || column_family() == *cf_name);
 }

 void modification_statement::add_operation(::shared_ptr<operation> op) {
--- a/cql3/statements/modification_statement.hh
+++ b/cql3/statements/modification_statement.hh
@@ -165,9 +165,7 @@ public:
    // Validate before execute, using client state and current schema
    void validate(service::storage_proxy&, const service::client_state& state) const override;

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    void add_operation(::shared_ptr<operation> op);

--- a/cql3/statements/schema_altering_statement.cc
+++ b/cql3/statements/schema_altering_statement.cc
@@ -67,12 +67,7 @@ future<> schema_altering_statement::grant_permissions_to_creator(const service::
    return make_ready_future<>();
 }

-bool schema_altering_statement::depends_on_keyspace(const sstring& ks_name) const
-{
-    return false;
-}
-
-bool schema_altering_statement::depends_on_column_family(const sstring& cf_name) const
+bool schema_altering_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
 {
    return false;
 }
--- a/cql3/statements/schema_altering_statement.hh
+++ b/cql3/statements/schema_altering_statement.hh
@@ -79,9 +79,7 @@ protected:
     */
    virtual future<> grant_permissions_to_creator(const service::client_state&) const;

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual uint32_t get_bound_terms() const override;

--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -194,12 +194,8 @@ void select_statement::validate(service::storage_proxy&, const service::client_s
    // Nothing to do, all validation has been done by raw_statemet::prepare()
 }

-bool select_statement::depends_on_keyspace(const sstring& ks_name) const {
-    return keyspace() == ks_name;
-}
-
-bool select_statement::depends_on_column_family(const sstring& cf_name) const {
-    return column_family() == cf_name;
+bool select_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
+    return keyspace() == ks_name && (!cf_name || column_family() == *cf_name);
 }

 const sstring& select_statement::keyspace() const {
@@ -995,6 +991,7 @@ lw_shared_ptr<const service::pager::paging_state> indexed_table_select_statement
    }

    auto paging_state_copy = make_lw_shared<service::pager::paging_state>(service::pager::paging_state(*paging_state));
+    paging_state_copy->set_remaining(internal_paging_size);
    paging_state_copy->set_partition_key(std::move(index_pk));
    paging_state_copy->set_clustering_key(std::move(index_ck));
    return std::move(paging_state_copy);
--- a/cql3/statements/select_statement.hh
+++ b/cql3/statements/select_statement.hh
@@ -127,8 +127,7 @@ public:
    virtual uint32_t get_bound_terms() const override;
    virtual future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;
    virtual void validate(service::storage_proxy&, const service::client_state& state) const override;
-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual future<::shared_ptr<cql_transport::messages::result_message>> execute(query_processor& qp,
        service::query_state& state, const query_options& options) const override;
--- a/cql3/statements/service_level_statement.cc
+++ b/cql3/statements/service_level_statement.cc
@@ -30,13 +30,7 @@ uint32_t service_level_statement::get_bound_terms() const {
    return 0;
 }

-bool service_level_statement::depends_on_keyspace(
-        const sstring &ks_name) const {
-    return false;
-}
-
-bool service_level_statement::depends_on_column_family(
-        const sstring &cf_name) const {
+bool service_level_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
    return false;
 }

--- a/cql3/statements/service_level_statement.hh
+++ b/cql3/statements/service_level_statement.hh
@@ -56,9 +56,7 @@ public:

    uint32_t get_bound_terms() const override;

-    bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    bool depends_on_column_family(const sstring& cf_name) const override;
+    bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    future<> check_access(service::storage_proxy& sp, const service::client_state& state) const override;

--- a/cql3/statements/sl_prop_defs.cc
+++ b/cql3/statements/sl_prop_defs.cc
@@ -43,7 +43,7 @@ void sl_prop_defs::validate() {
        data_value v = duration_type->deserialize(duration_type->from_string(*repr));
        cql_duration duration = static_pointer_cast<const duration_type_impl>(duration_type)->from_value(v);
        if (duration.months || duration.days) {
-            throw exceptions::invalid_request_exception("Timeout values cannot be longer than 24h");
+            throw exceptions::invalid_request_exception("Timeout values cannot be expressed in days/months");
        }
        if (duration.nanoseconds % 1'000'000 != 0) {
            throw exceptions::invalid_request_exception("Timeout values must be expressed in millisecond granularity");
--- a/cql3/statements/truncate_statement.cc
+++ b/cql3/statements/truncate_statement.cc
@@ -67,12 +67,7 @@ std::unique_ptr<prepared_statement> truncate_statement::prepare(database& db,cql
    return std::make_unique<prepared_statement>(::make_shared<truncate_statement>(*this));
 }

-bool truncate_statement::depends_on_keyspace(const sstring& ks_name) const
-{
-    return false;
-}
-
-bool truncate_statement::depends_on_column_family(const sstring& cf_name) const
+bool truncate_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
 {
    return false;
 }
--- a/cql3/statements/truncate_statement.hh
+++ b/cql3/statements/truncate_statement.hh
@@ -58,9 +58,7 @@ public:

    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;

--- a/cql3/statements/update_statement.cc
+++ b/cql3/statements/update_statement.cc
@@ -53,6 +53,7 @@
 #include "types/list.hh"
 #include "types/user.hh"
 #include "concrete_types.hh"
+#include "validation.hh"

 namespace cql3 {

@@ -251,6 +252,7 @@ insert_prepared_json_statement::build_partition_keys(const query_options& option
        exploded.emplace_back(json_value->second);
    }
    auto pkey = partition_key::from_optional_exploded(*s, std::move(exploded));
+    validation::validate_cql_key(*s, pkey);
    auto k = query::range<query::ring_position>::make_singular(dht::decorate_key(*s, std::move(pkey)));
    ranges.emplace_back(std::move(k));
    return ranges;
--- a/cql3/statements/use_statement.cc
+++ b/cql3/statements/use_statement.cc
@@ -74,12 +74,7 @@ std::unique_ptr<prepared_statement> use_statement::prepare(database& db, cql_sta

 }

-bool use_statement::depends_on_keyspace(const sstring& ks_name) const
-{
-    return false;
-}
-
-bool use_statement::depends_on_column_family(const sstring& cf_name) const
+bool use_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
 {
    return false;
 }
--- a/cql3/statements/use_statement.hh
+++ b/cql3/statements/use_statement.hh
@@ -59,9 +59,7 @@ public:

    virtual uint32_t get_bound_terms() const override;

-    virtual bool depends_on_keyspace(const seastar::sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const seastar::sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual seastar::future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;

--- a/database.cc
+++ b/database.cc
@@ -926,10 +926,9 @@ bool database::update_column_family(schema_ptr new_schema) {
    return columns_changed;
 }

-future<> database::remove(const column_family& cf) noexcept {
+void database::remove(const table& cf) noexcept {
    auto s = cf.schema();
    auto& ks = find_keyspace(s->ks_name());
-    co_await _querier_cache.evict_all_for_table(s->id());
    _column_families.erase(s->id());
    ks.metadata()->remove_column_family(s);
    _ks_cf_to_uuid.erase(std::make_pair(s->ks_name(), s->cf_name()));
@@ -946,13 +945,20 @@ future<> database::drop_column_family(const sstring& ks_name, const sstring& cf_
    auto& ks = find_keyspace(ks_name);
    auto uuid = find_uuid(ks_name, cf_name);
    auto cf = _column_families.at(uuid);
-    co_await remove(*cf);
+    remove(*cf);
    cf->clear_views();
-    co_return co_await cf->await_pending_ops().then([this, &ks, cf, tsf = std::move(tsf), snapshot] {
-        return truncate(ks, *cf, std::move(tsf), snapshot).finally([this, cf] {
-            return cf->stop();
-        });
-    }).finally([cf] {});
+    co_await cf->await_pending_ops();
+    co_await _querier_cache.evict_all_for_table(cf->schema()->id());
+    std::exception_ptr ex;
+    try {
+        co_await truncate(ks, *cf, std::move(tsf), snapshot);
+    } catch (...) {
+        ex = std::current_exception();
+    }
+    co_await cf->stop();
+    if (ex) {
+        std::rethrow_exception(std::move(ex));
+    }
 }

 const utils::UUID& database::find_uuid(std::string_view ks, std::string_view cf) const {
@@ -1348,44 +1354,6 @@ database::existing_index_names(const sstring& ks_name, const sstring& cf_to_excl
    return names;
 }

-// Based on:
-//  - org.apache.cassandra.db.AbstractCell#reconcile()
-//  - org.apache.cassandra.db.BufferExpiringCell#reconcile()
-//  - org.apache.cassandra.db.BufferDeletedCell#reconcile()
-std::strong_ordering
-compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
-    if (left.timestamp() != right.timestamp()) {
-        return left.timestamp() <=> right.timestamp();
-    }
-    if (left.is_live() != right.is_live()) {
-        return left.is_live() ? std::strong_ordering::less : std::strong_ordering::greater;
-    }
-    if (left.is_live()) {
-        auto c = compare_unsigned(left.value(), right.value()) <=> 0;
-        if (c != 0) {
-            return c;
-        }
-        if (left.is_live_and_has_ttl() != right.is_live_and_has_ttl()) {
-            // prefer expiring cells.
-            return left.is_live_and_has_ttl() ? std::strong_ordering::greater : std::strong_ordering::less;
-        }
-        if (left.is_live_and_has_ttl() && left.expiry() != right.expiry()) {
-            return left.expiry() <=> right.expiry();
-        }
-    } else {
-        // Both are deleted
-        if (left.deletion_time() != right.deletion_time()) {
-            // Origin compares big-endian serialized deletion time. That's because it
-            // delegates to AbstractCell.reconcile() which compares values after
-            // comparing timestamps, which in case of deleted cells will hold
-            // serialized expiry.
-            return (uint64_t) left.deletion_time().time_since_epoch().count()
-                   <=> (uint64_t) right.deletion_time().time_since_epoch().count();
-        }
-    }
-    return std::strong_ordering::equal;
-}
-
 future<std::tuple<lw_shared_ptr<query::result>, cache_temperature>>
 database::query(schema_ptr s, const query::read_command& cmd, query::result_options opts, const dht::partition_range_vector& ranges,
                tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) {
--- a/database.hh
+++ b/database.hh
@@ -1384,6 +1384,7 @@ private:
    Future update_write_metrics(Future&& f);
    void update_write_metrics_for_timed_out_write();
    future<> create_keyspace(const lw_shared_ptr<keyspace_metadata>&, bool is_bootstrap, system_keyspace system);
+    void remove(const table&) noexcept;
 public:
    static utils::UUID empty_version;

@@ -1582,7 +1583,6 @@ public:

    bool update_column_family(schema_ptr s);
    future<> drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func, bool with_snapshot = true);
-    future<> remove(const column_family&) noexcept;

    const logalloc::region_group& dirty_memory_region_group() const {
        return _dirty_memory_manager.region_group();
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -428,6 +428,8 @@ private:
    void abort_recycled_list(std::exception_ptr);
    void abort_deletion_promise(std::exception_ptr);

+    future<> recalculate_footprint();
+
    future<> rename_file(sstring, sstring) const;
    size_t max_request_controller_units() const;
    segment_id_type _ids = 0;
@@ -444,6 +446,7 @@ private:
    seastar::gate _gate;
    uint64_t _new_counter = 0;
    std::optional<size_t> _disk_write_alignment;
+    seastar::semaphore _reserve_recalculation_guard;
 };

 template<typename T>
@@ -512,6 +515,7 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
    uint64_t _file_pos = 0;
    uint64_t _flush_pos = 0;
    uint64_t _size_on_disk = 0;
+    uint64_t _waste = 0;

    size_t _alignment;

@@ -598,7 +602,7 @@ public:
            clogger.debug("Segment {} is no longer active and will submitted for delete now", *this);
            ++_segment_manager->totals.segments_destroyed;
            _segment_manager->totals.active_size_on_disk -= file_position();
-            _segment_manager->totals.wasted_size_on_disk -= (_size_on_disk - file_position());
+            _segment_manager->totals.wasted_size_on_disk -= _waste;
            _segment_manager->add_file_to_delete(_file_name, _desc);
        } else if (_segment_manager->cfg.warn_about_segments_left_on_disk_after_shutdown) {
            clogger.warn("Segment {} is dirty and is left on disk.", *this);
@@ -725,7 +729,8 @@ public:
        auto s = co_await sync();
        co_await flush();
        co_await terminate();
-        _segment_manager->totals.wasted_size_on_disk += (_size_on_disk - file_position());
+        _waste = _size_on_disk - file_position();
+        _segment_manager->totals.wasted_size_on_disk += _waste;
        co_return s;
    }
    future<sseg_ptr> do_flush(uint64_t pos) {
@@ -1223,6 +1228,7 @@ db::commitlog::segment_manager::segment_manager(config c)
    , _recycled_segments(std::numeric_limits<size_t>::max())
    , _reserve_replenisher(make_ready_future<>())
    , _background_sync(make_ready_future<>())
+    , _reserve_recalculation_guard(1)
 {
    assert(max_size > 0);
    assert(max_mutation_size < segment::multi_entry_size_magic);
@@ -1248,6 +1254,11 @@ future<> db::commitlog::segment_manager::replenish_reserve() {
        }
        try {
            gate::holder g(_gate);
+            auto guard = co_await get_units(_reserve_recalculation_guard, 1);
+            if (_reserve_segments.full()) {
+                // can happen if we recalculate
+                continue;
+            }
            // note: if we were strict with disk size, we would refuse to do this 
            // unless disk footprint is lower than threshold. but we cannot (yet?)
            // trust that flush logic will absolutely free up an existing 
@@ -1519,7 +1530,7 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:

        if (cfg.extensions && !cfg.extensions->commitlog_file_extensions().empty()) {
            for (auto * ext : cfg.extensions->commitlog_file_extensions()) {
-                auto nf = co_await ext->wrap_file(std::move(filename), f, flags);
+                auto nf = co_await ext->wrap_file(filename, f, flags);
                if (nf) {
                    f = std::move(nf);
                    align = is_overwrite ? f.disk_overwrite_dma_alignment() : f.disk_write_dma_alignment();
@@ -1530,12 +1541,21 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
        f = make_checked_file(commit_error_handler, std::move(f));
    } catch (...) {
        ep = std::current_exception();
-        commit_error_handler(ep);
+    }
+    if (ep) {
+        // do this early, so if we are to fast-fail the server,
+        // we do it before anything else can go wrong.
+        try {
+            commit_error_handler(ep);
+        } catch (...) {
+            ep = std::current_exception();
+        }
    }
    if (ep && f) {
        co_await f.close();
    }
    if (ep) {
+        add_file_to_delete(filename, d);
        co_return coroutine::exception(std::move(ep));
    }

@@ -1594,6 +1614,8 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
 }

 future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::new_segment() {
+    gate::holder g(_gate);
+
    if (_shutdown) {
        co_return coroutine::make_exception(std::runtime_error("Commitlog has been shut down. Cannot add data"));
    }
@@ -1628,22 +1650,23 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
            co_return _segments.back();
        }

-        if (_segment_allocating) {
-            co_await _segment_allocating->get_future(timeout);
-            continue;
-        }
-
-        promise<> p;
-        _segment_allocating.emplace(p.get_future());
-        auto finally = defer([&] () noexcept { _segment_allocating = std::nullopt; });
-        try {
-            gate::holder g(_gate);
-            auto s = co_await with_timeout(timeout, new_segment());
-            p.set_value();
-        } catch (...) {
-            p.set_exception(std::current_exception());
-            throw;
+        // #9896 - we don't want to issue a new_segment call until
+        // the old one has terminated with either result or exception.
+        // Do all waiting through the shared_future
+        if (!_segment_allocating) {
+            auto f = new_segment();
+            // must check that we are not already done.
+            if (f.available()) {
+                f.get(); // maybe force exception
+                continue;
+            }
+            _segment_allocating.emplace(f.discard_result().finally([this] {
+                // clear the shared_future _before_ resolving its contents
+                // (i.e. with result of this finally)
+                _segment_allocating = std::nullopt;
+            }));
        }
+        co_await _segment_allocating->get_future(timeout);
    }
 }

@@ -1865,6 +1888,8 @@ future<> db::commitlog::segment_manager::delete_segments(std::vector<sstring> fi

    std::exception_ptr recycle_error;

+    size_t num_deleted = 0;
+    bool except = false;
    while (!files.empty()) {
        auto filename = std::move(files.back());
        files.pop_back();
@@ -1914,8 +1939,10 @@ future<> db::commitlog::segment_manager::delete_segments(std::vector<sstring> fi
                }
            }
            co_await delete_file(filename);
+            ++num_deleted;
        } catch (...) {
            clogger.error("Could not delete segment {}: {}", filename, std::current_exception());
+            except = true;
        }
    }

@@ -1928,6 +1955,16 @@ future<> db::commitlog::segment_manager::delete_segments(std::vector<sstring> fi
    if (recycle_error && _recycled_segments.empty()) {
        abort_recycled_list(recycle_error);
    }
+    // If recycle failed and turned into a delete, we should fake-wakeup waiters
+    // since we might still have cleaned up disk space.
+    if (!recycle_error && num_deleted && cfg.reuse_segments && _recycled_segments.empty()) {
+        abort_recycled_list(std::make_exception_ptr(std::runtime_error("deleted files")));
+    }
+
+    // #9348 - if we had an exception, we can't trust our bookkeeping any more. recalculate.
+    if (except) {
+        co_await recalculate_footprint();
+    }
 }

 void db::commitlog::segment_manager::abort_recycled_list(std::exception_ptr ep) {
@@ -1942,6 +1979,67 @@ void db::commitlog::segment_manager::abort_deletion_promise(std::exception_ptr e
    std::exchange(_disk_deletions, {}).set_exception(ep);
 }

+future<> db::commitlog::segment_manager::recalculate_footprint() {
+    try {
+        co_await do_pending_deletes();
+
+        auto guard = co_await get_units(_reserve_recalculation_guard, 1);
+        auto segments_copy = _segments;
+        std::vector<sseg_ptr> reserves;
+        std::vector<sstring> recycles;
+        // this causes haywire things while we steal stuff, but...
+        while (!_reserve_segments.empty()) {
+            reserves.push_back(_reserve_segments.pop());
+        }
+        while (!_recycled_segments.empty()) {
+            recycles.push_back(_recycled_segments.pop());
+        }
+        // #9955 - must re-stock the queues before we do anything
+        // interruptible (i.e. any continuation). Both queues are
+        // eventually used with push/pop, which _waits_ for a signal
+        // but does _not_ verify that the condition is true once
+        // we return. So copy the objects and look at the copies instead.
+        for (auto& filename : recycles) {
+            _recycled_segments.push(sstring(filename));
+        }
+        for (auto& s : reserves) {
+            _reserve_segments.push(sseg_ptr(s)); // you can have it back now.
+        }
+
+        // first, guesstimate sizes
+        uint64_t recycle_size = recycles.size() * max_size;
+        auto old = totals.total_size_on_disk;
+
+        totals.total_size_on_disk = recycle_size;
+        for (auto& s : _segments) {
+            totals.total_size_on_disk += s->_size_on_disk;
+        }
+        for (auto& s : reserves) {
+            totals.total_size_on_disk += s->_size_on_disk;
+        }
+
+        // now we need to adjust the actual sizes of recycled files
+
+        uint64_t actual_recycled_size = 0;
+
+        try {
+            for (auto& filename : recycles) {
+                auto s = co_await seastar::file_size(filename);
+                actual_recycled_size += s;
+            }
+        } catch (...) {
+            clogger.error("Exception reading disk footprint ({}).", std::current_exception());
+            actual_recycled_size = recycle_size; // best we got
+        }
+
+        totals.total_size_on_disk += actual_recycled_size - recycle_size;
+        // pushing things to reserve/recycled queues will have resumed any
+        // waiters, so we should be done.
+    } catch (...) {
+        clogger.error("Exception recalculating disk footprint ({}). Values might be off...", std::current_exception());
+    }
+}
+
 future<> db::commitlog::segment_manager::do_pending_deletes() {
    auto ftc = std::exchange(_files_to_close, {});
    auto ftd = std::exchange(_files_to_delete, {});
--- a/db/snapshot-ctl.cc
+++ b/db/snapshot-ctl.cc
@@ -119,8 +119,9 @@ future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<
        return check_snapshot_not_exist(ks_name, tag, tables).then([this, ks_name, tables, tag, sf] {
            return do_with(std::vector<sstring>(std::move(tables)),[this, ks_name, tag, sf](const std::vector<sstring>& tables) {
                return do_for_each(tables, [ks_name, tag, sf, this] (const sstring& table_name) {
-                    if (table_name.find(".") != sstring::npos) {
-                        throw std::invalid_argument("Cannot take a snapshot of a secondary index by itself. Run snapshot on the table that owns the index.");
+                    auto& cf = _db.local().find_column_family(ks_name, table_name);
+                    if (cf.schema()->is_view()) {
+                        throw std::invalid_argument("Do not take a snapshot of a materialized view or a secondary index by itself. Run snapshot on the base table instead.");
                    }
                    return _db.invoke_on_all([ks_name, table_name, tag, sf] (database &db) {
                        auto& cf = db.find_column_family(ks_name, table_name);
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -350,7 +350,11 @@ public:
    view_filter_checking_visitor(const schema& base, const view_info& view)
        : _base(base)
        , _view(view)
-        , _selection(cql3::selection::selection::wildcard(_base.shared_from_this()))
+        , _selection(cql3::selection::selection::for_columns(_base.shared_from_this(),
+            boost::copy_range<std::vector<const column_definition*>>(
+                _base.regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return &cdef; }))
+            )
+        )
    {}

    void accept_new_partition(const partition_key& key, uint64_t row_count) {
@@ -1320,7 +1324,7 @@ future<> mutate_MV(
            auto mut_ptr = remote_endpoints.empty() ? std::make_unique<frozen_mutation>(std::move(mut.fm)) : std::make_unique<frozen_mutation>(mut.fm);
            tracing::trace(tr_state, "Locally applying view update for {}.{}; base token = {}; view token = {}",
                    mut.s->ks_name(), mut.s->cf_name(), base_token, view_token);
-            local_view_update = service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr, std::move(tr_state), db::commitlog::force_sync::no).then_wrapped(
+            local_view_update = service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr, tr_state, db::commitlog::force_sync::no).then_wrapped(
                    [s = mut.s, &stats, &cf_stats, tr_state, base_token, view_token, my_address, mut_ptr = std::move(mut_ptr),
                            units = sem_units.split(sem_units.count())] (future<>&& f) {
                --stats.writes;
--- a/dirty_memory_manager.hh
+++ b/dirty_memory_manager.hh
@@ -215,6 +215,12 @@ public:
        });
    }

+    future<flush_permit> get_all_flush_permits() { // Takes _max_background_work units from _background_work_flush_serializer and hands them to get_flush_permit().
+        return get_units(_background_work_flush_serializer, _max_background_work).then([this] (auto&& units) { // continuation runs once the requested units have been obtained
+            return this->get_flush_permit(std::move(units)); // the units are moved into the permit request
+        });
+    }
+
    bool has_extraneous_flushes_requested() const {
        return _extraneous_flushes > 0;
    }
--- a/dist/common/scripts/scylla-housekeeping
+++ b/dist/common/scripts/scylla-housekeeping
@@ -100,6 +100,7 @@ def version_compare(a, b):
 def create_uuid_file(fl):
    with open(args.uuid_file, 'w') as myfile:
        myfile.write(str(uuid.uuid1()) + "\n")
+    os.chmod(args.uuid_file, 0o644)


 def sanitize_version(version):
--- a/dist/common/scripts/scylla_coredump_setup
+++ b/dist/common/scripts/scylla_coredump_setup
@@ -127,10 +127,14 @@ WantedBy=multi-user.target
        #  - Storage: /path/to/file (inacessible)
        #  - Storage: /path/to/file
        #
+        # Since systemd v248, the output for an available coredump file looks like this:
+        #  - Storage: /path/to/file (present)
+        # We need to support both versions.
+        #
        # reference: https://github.com/systemd/systemd/commit/47f50642075a7a215c9f7b600599cbfee81a2913

        corefail = False
-        res = re.findall(r'Storage: (.*)$', coreinfo, flags=re.MULTILINE)
+        res = re.findall(r'Storage: (\S+)(?: \(.+\))?$', coreinfo, flags=re.MULTILINE)
        # v232 or later
        if res:
            corepath = res[0]
--- a/dist/common/scripts/scylla_io_setup
+++ b/dist/common/scripts/scylla_io_setup
@@ -278,6 +278,66 @@ if __name__ == "__main__":
                    disk_properties["read_bandwidth"] = 2527296683 * nr_disks
                    disk_properties["write_iops"] = 156326 * nr_disks
                    disk_properties["write_bandwidth"] = 1063657088 * nr_disks
+            elif idata.instance() == "im4gn.large":
+                disk_properties["read_iops"] = 33943
+                disk_properties["read_bandwidth"] = 288433525
+                disk_properties["write_iops"] = 27877
+                disk_properties["write_bandwidth"] = 126864680
+            elif idata.instance() == "im4gn.xlarge":
+                disk_properties["read_iops"] = 68122
+                disk_properties["read_bandwidth"] = 576603520
+                disk_properties["write_iops"] = 55246
+                disk_properties["write_bandwidth"] = 254534954
+            elif idata.instance() == "im4gn.2xlarge":
+                disk_properties["read_iops"] = 136422
+                disk_properties["read_bandwidth"] = 1152663765
+                disk_properties["write_iops"] = 92184
+                disk_properties["write_bandwidth"] = 508926453
+            elif idata.instance() == "im4gn.4xlarge":
+                disk_properties["read_iops"] = 273050
+                disk_properties["read_bandwidth"] = 1638427264
+                disk_properties["write_iops"] = 92173
+                disk_properties["write_bandwidth"] = 1027966826
+            elif idata.instance() == "im4gn.8xlarge":
+                disk_properties["read_iops"] = 250241 * nr_disks
+                disk_properties["read_bandwidth"] = 1163130709 * nr_disks
+                disk_properties["write_iops"] = 86374 * nr_disks
+                disk_properties["write_bandwidth"] = 977617664 * nr_disks
+            elif idata.instance() == "im4gn.16xlarge":
+                disk_properties["read_iops"] = 273030 * nr_disks
+                disk_properties["read_bandwidth"] = 1638211413 * nr_disks
+                disk_properties["write_iops"] = 92607 * nr_disks
+                disk_properties["write_bandwidth"] = 1028340266 * nr_disks
+            elif idata.instance() == "is4gen.medium":
+                disk_properties["read_iops"] = 33965
+                disk_properties["read_bandwidth"] = 288462506
+                disk_properties["write_iops"] = 27876
+                disk_properties["write_bandwidth"] = 126954200
+            elif idata.instance() == "is4gen.large":
+                disk_properties["read_iops"] = 68131
+                disk_properties["read_bandwidth"] = 576654869
+                disk_properties["write_iops"] = 55257
+                disk_properties["write_bandwidth"] = 254551002
+            elif idata.instance() == "is4gen.xlarge":
+                disk_properties["read_iops"] = 136413
+                disk_properties["read_bandwidth"] = 1152747904
+                disk_properties["write_iops"] = 92180
+                disk_properties["write_bandwidth"] = 508889546
+            elif idata.instance() == "is4gen.2xlarge":
+                disk_properties["read_iops"] = 273038
+                disk_properties["read_bandwidth"] = 1628982613
+                disk_properties["write_iops"] = 92182
+                disk_properties["write_bandwidth"] = 1027983530
+            elif idata.instance() == "is4gen.4xlarge":
+                disk_properties["read_iops"] = 260493 * nr_disks
+                disk_properties["read_bandwidth"] = 1217396928 * nr_disks
+                disk_properties["write_iops"] = 83169 * nr_disks
+                disk_properties["write_bandwidth"] = 1000390784 * nr_disks
+            elif idata.instance() == "is4gen.8xlarge":
+                disk_properties["read_iops"] = 273021 * nr_disks
+                disk_properties["read_bandwidth"] = 1656354602 * nr_disks
+                disk_properties["write_iops"] = 92233 * nr_disks
+                disk_properties["write_bandwidth"] = 1028010325 * nr_disks
            properties_file = open(etcdir() + "/scylla.d/io_properties.yaml", "w")
            yaml.dump({ "disks": [ disk_properties ] }, properties_file,  default_flow_style=False)
            ioconf = open(etcdir() + "/scylla.d/io.conf", "w")
--- a/dist/common/scripts/scylla_ntp_setup
+++ b/dist/common/scripts/scylla_ntp_setup
@@ -66,18 +66,18 @@ if __name__ == '__main__':

    target = None
    if os.path.exists('/lib/systemd/systemd-timesyncd'):
-        if systemd_unit('systemd-timesyncd').is_active():
+        if systemd_unit('systemd-timesyncd').is_active() == 'active':
            print('ntp is already configured, skip setup')
            sys.exit(0)
        target = 'systemd-timesyncd'
    if shutil.which('chronyd'):
-        if get_chrony_unit().is_active():
+        if get_chrony_unit().is_active() == 'active':
            print('ntp is already configured, skip setup')
            sys.exit(0)
        if not target:
            target = 'chrony'
    if shutil.which('ntpd'):
-        if get_ntp_unit().is_active():
+        if get_ntp_unit().is_active() == 'active':
            print('ntp is already configured, skip setup')
            sys.exit(0)
        if not target:
--- a/dist/common/scripts/scylla_raid_setup
+++ b/dist/common/scripts/scylla_raid_setup
@@ -117,10 +117,11 @@ if __name__ == '__main__':
        pkg_install('xfsprogs')
    if not shutil.which('mdadm'):
        pkg_install('mdadm')
-    try:
-        md_service = systemd_unit('mdmonitor.service')
-    except SystemdException:
-        md_service = systemd_unit('mdadm.service')
+    if args.raid_level != '0':
+        try:
+            md_service = systemd_unit('mdmonitor.service')
+        except SystemdException:
+            md_service = systemd_unit('mdadm.service')

    print('Creating {type} for scylla using {nr_disk} disk(s): {disks}'.format(type='fRAID{args.raid_level}' if raid else 'XFS volume', nr_disk=len(disks), disks=args.disks))
    procs=[]
@@ -164,14 +165,15 @@ if __name__ == '__main__':

    uuid = run(f'blkid -s UUID -o value {fsdev}', shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
    after = 'local-fs.target'
-    if raid:
+    wants = ''
+    if raid and args.raid_level != '0':
        after += f' {md_service}'
+        wants = f'\nWants={md_service}'
    unit_data = f'''
 [Unit]
 Description=Scylla data directory
 Before=scylla-server.service
-After={after}
-Wants={md_service}
+After={after}{wants}
 DefaultDependencies=no

 [Mount]
@@ -195,7 +197,8 @@ WantedBy=multi-user.target
            f.write(f'RequiresMountsFor={mount_at}\n')

    systemd_unit.reload()
-    md_service.start()
+    if args.raid_level != '0':
+        md_service.start()
    mount = systemd_unit(mntunit_bn)
    mount.start()
    if args.enable_on_nextboot:
--- a/dist/common/scripts/scylla_setup
+++ b/dist/common/scripts/scylla_setup
@@ -370,6 +370,10 @@ if __name__ == '__main__':
            version_check = interactive_ask_service('Do you want to enable Scylla to check if there is a newer version of Scylla available?', 'Yes - start the Scylla-housekeeping service to check for a newer version. This check runs periodically. No - skips this step.', version_check)
            args.no_version_check = not version_check
            if version_check:
+                cfg = sysconfig_parser(sysconfdir_p() / 'scylla-housekeeping')
+                repo_files = cfg.get('REPO_FILES')
+                for f in glob.glob(repo_files):
+                    os.chmod(f, 0o644)
                with open('/etc/scylla.d/housekeeping.cfg', 'w') as f:
                    f.write('[housekeeping]\ncheck-version: True\n')
                os.chmod('/etc/scylla.d/housekeeping.cfg', 0o644)
--- a/dist/common/scripts/scylla_util.py
+++ b/dist/common/scripts/scylla_util.py
@@ -674,7 +674,7 @@ class aws_instance:
        return self._type.split(".")[0]

    def is_supported_instance_class(self):
-        if self.instance_class() in ['i2', 'i3', 'i3en', 'c5d', 'm5d', 'm5ad', 'r5d', 'z1d', 'c6gd', 'm6gd', 'r6gd', 'x2gd']:
+        if self.instance_class() in ['i2', 'i3', 'i3en', 'c5d', 'm5d', 'm5ad', 'r5d', 'z1d', 'c6gd', 'm6gd', 'r6gd', 'x2gd', 'im4gn', 'is4gen']:
            return True
        return False

@@ -683,7 +683,7 @@ class aws_instance:
        instance_size = self.instance_size()
        if instance_class in ['c3', 'c4', 'd2', 'i2', 'r3']:
            return 'ixgbevf'
-        if instance_class in ['a1', 'c5', 'c5a', 'c5d', 'c5n', 'c6g', 'c6gd', 'f1', 'g3', 'g4', 'h1', 'i3', 'i3en', 'inf1', 'm5', 'm5a', 'm5ad', 'm5d', 'm5dn', 'm5n', 'm6g', 'm6gd', 'p2', 'p3', 'r4', 'r5', 'r5a', 'r5ad', 'r5b', 'r5d', 'r5dn', 'r5n', 't3', 't3a', 'u-6tb1', 'u-9tb1', 'u-12tb1', 'u-18tn1', 'u-24tb1', 'x1', 'x1e', 'z1d', 'c6g', 'c6gd', 'm6g', 'm6gd', 't4g', 'r6g', 'r6gd', 'x2gd']:
+        if instance_class in ['a1', 'c5', 'c5a', 'c5d', 'c5n', 'c6g', 'c6gd', 'f1', 'g3', 'g4', 'h1', 'i3', 'i3en', 'inf1', 'm5', 'm5a', 'm5ad', 'm5d', 'm5dn', 'm5n', 'm6g', 'm6gd', 'p2', 'p3', 'r4', 'r5', 'r5a', 'r5ad', 'r5b', 'r5d', 'r5dn', 'r5n', 't3', 't3a', 'u-6tb1', 'u-9tb1', 'u-12tb1', 'u-18tn1', 'u-24tb1', 'x1', 'x1e', 'z1d', 'c6g', 'c6gd', 'm6g', 'm6gd', 't4g', 'r6g', 'r6gd', 'x2gd', 'im4gn', 'is4gen']:
            return 'ena'
        if instance_class == 'm4':
            if instance_size == '16xlarge':
@@ -1041,7 +1041,7 @@ class systemd_unit:
        return run('systemctl {} disable {}'.format(self.ctlparam, self._unit), shell=True, check=True)

    def is_active(self):
-        return True if run('systemctl {} is-active {}'.format(self.ctlparam, self._unit), shell=True, capture_output=True, encoding='utf-8').stdout.strip() == 'active' else False
+        return run('systemctl {} is-active {}'.format(self.ctlparam, self._unit), shell=True, capture_output=True, encoding='utf-8').stdout.strip()

    def mask(self):
        return run('systemctl {} mask {}'.format(self.ctlparam, self._unit), shell=True, check=True)
--- a/dist/common/supervisor/scylla_util.sh
+++ b/dist/common/supervisor/scylla_util.sh
@@ -6,12 +6,16 @@ is_nonroot() {
    [ -f "$scylladir"/SCYLLA-NONROOT-FILE ]
 }

+is_container() { # succeeds when $scylladir/SCYLLA-CONTAINER-FILE exists as a regular file
+    [ -f "$scylladir"/SCYLLA-CONTAINER-FILE ]
+}
+
 is_privileged() {
    [ ${EUID:-${UID}} = 0 ]
 }

 execsudo() {
-    if is_nonroot; then
+    if is_nonroot || is_container; then
        exec "$@"
    else
        exec sudo -u scylla -g scylla "$@"
--- a/dist/docker/debian/build_docker.sh
+++ b/dist/docker/debian/build_docker.sh
@@ -25,6 +25,10 @@ product="$(<build/SCYLLA-PRODUCT-FILE)"
 version="$(<build/SCYLLA-VERSION-FILE)"
 release="$(<build/SCYLLA-RELEASE-FILE)"

+if [[ "$version" = *rc* ]]; then # only versions containing rc are rewritten
+ version=$(echo $version |sed 's/\(.*\)\.)*/\1~/') # rewrites the last dot as a tilde, e.g. 4.6.rc1 becomes 4.6~rc1. NOTE(review): the trailing )* only matches literal closing parentheses and looks unintended - confirm
+fi
+
 mode="release"

 if uname -m | grep x86_64 ; then
@@ -93,12 +97,14 @@ run apt-get -y install hostname supervisor openssh-server openssh-client openjdk
 run locale-gen en_US.UTF-8
 run bash -ec "dpkg -i packages/*.deb"
 run apt-get -y clean all
-run bash -ec "cat /scylla_bashrc >> /etc/bashrc"
+run bash -ec "cat /scylla_bashrc >> /etc/bash.bashrc"
 run mkdir -p /etc/supervisor.conf.d
 run mkdir -p /var/log/scylla
 run chown -R scylla:scylla /var/lib/scylla
+run sed -i -e 's/^SCYLLA_ARGS=".*"$/SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --network-stack posix"/' /etc/default/scylla-server

 run mkdir -p /opt/scylladb/supervisor
+run touch /opt/scylladb/SCYLLA-CONTAINER-FILE
 bcp dist/common/supervisor/scylla-server.sh /opt/scylladb/supervisor/scylla-server.sh
 bcp dist/common/supervisor/scylla-jmx.sh /opt/scylladb/supervisor/scylla-jmx.sh
 bcp dist/common/supervisor/scylla-node-exporter.sh /opt/scylladb/supervisor/scylla-node-exporter.sh
--- a/dist/docker/etc/supervisord.conf.d/scylla-server.conf
+++ b/dist/docker/etc/supervisord.conf.d/scylla-server.conf
@@ -1,4 +1,4 @@
-[program:scylla-server]
+[program:scylla]
 command=/opt/scylladb/supervisor/scylla-server.sh
 stdout_logfile=/dev/stdout
 stdout_logfile_maxbytes=0
--- a/dist/docker/etc/sysconfig/scylla-server
+++ b/dist/docker/etc/sysconfig/scylla-server
@@ -1,41 +0,0 @@
-# choose following mode: virtio, dpdk, posix
-NETWORK_MODE=posix
-
-# tap device name(virtio)
-TAP=tap0
-
-# bridge device name (virtio)
-BRIDGE=virbr0
-
-# ethernet device name
-IFNAME=eth0
-
-# setup NIC's and disks' interrupts, RPS, XPS, nomerges and I/O scheduler (posix)
-SET_NIC_AND_DISKS=no
-
-# ethernet device driver (dpdk)
-ETHDRV=
-
-# ethernet device PCI ID (dpdk)
-ETHPCIID=
-
-# number of hugepages
-NR_HUGEPAGES=64
-
-# user for process (must be root for dpdk)
-USER=scylla
-
-# group for process
-GROUP=scylla
-
-# scylla home dir
-SCYLLA_HOME=/var/lib/scylla
-
-# scylla config dir
-SCYLLA_CONF=/etc/scylla
-
-# scylla arguments
-SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --network-stack posix"
-
-# setup as AMI instance
-AMI=no
--- a/dist/docker/scyllasetup.py
+++ b/dist/docker/scyllasetup.py
@@ -121,12 +121,13 @@ class ScyllaSetup:
        if self._apiAddress is not None:
            args += ["--api-address %s" % self._apiAddress]

-        if self._alternatorPort is not None:
+        if self._alternatorAddress is not None:
            args += ["--alternator-address %s" % self._alternatorAddress]
+
+        if self._alternatorPort is not None:
            args += ["--alternator-port %s" % self._alternatorPort]

        if self._alternatorHttpsPort is not None:
-            args += ["--alternator-address %s" % self._alternatorAddress]
            args += ["--alternator-https-port %s" % self._alternatorHttpsPort]

        if self._alternatorWriteIsolation is not None:
--- a/generic_server.cc
+++ b/generic_server.cc
@@ -184,14 +184,18 @@ future<> server::do_accepts(int which, bool keepalive, socket_address server_add
                    _logger.info("exception while advertising new connection: {}", std::current_exception());
                }
                // Block while monitoring for lifetime/errors.
-                return conn->process().finally([this, conn] {
-                    return unadvertise_connection(conn);
-                }).handle_exception([this] (std::exception_ptr ep) {
-                    if (is_broken_pipe_or_connection_reset(ep)) {
-                        // expected if another side closes a connection or we're shutting down
-                        return;
+                return conn->process().then_wrapped([this, conn] (auto f) {
+                    try {
+                        f.get();
+                    } catch (...) {
+                        auto ep = std::current_exception();
+                        if (!is_broken_pipe_or_connection_reset(ep)) {
+                            // some exceptions are expected if another side closes a connection
+                            // or we're shutting down
+                            _logger.info("exception while processing connection: {}", ep);
+                        }
                    }
-                    _logger.info("exception while processing connection: {}", ep);
+                    return unadvertise_connection(conn);
                });
            });
            return stop_iteration::no;
--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -477,49 +477,42 @@ gossiper::handle_get_endpoint_states_msg(gossip_get_endpoint_states_request requ
    return make_ready_future<gossip_get_endpoint_states_response>(gossip_get_endpoint_states_response{std::move(map)});
 }

+rpc::no_wait_type gossiper::background_msg(sstring type, noncopyable_function<future<>(gossiper&)> fn) { // Starts fn on shard 0 in the background; type is only used to label the failure log message.
+    (void)with_gate(_background_msg, [this, type = std::move(type), fn = std::move(fn)] () mutable { // resulting future is deliberately discarded; the handler runs under the _background_msg gate
+        return container().invoke_on(0, std::move(fn)).handle_exception([type = std::move(type)] (auto ep) { // fn is invoked on shard 0
+            logger.warn("Failed to handle {}: {}", type, ep); // failures are logged as warnings, not propagated
+        });
+    });
+    return messaging_service::no_wait(); // returned immediately, without waiting for fn to finish
+}
+
 void gossiper::init_messaging_service_handler() {
    _messaging.register_gossip_digest_syn([this] (const rpc::client_info& cinfo, gossip_digest_syn syn_msg) {
        auto from = netw::messaging_service::get_source(cinfo);
-        // In a new fiber.
-        (void)container().invoke_on(0, [from, syn_msg = std::move(syn_msg)] (gms::gossiper& gossiper) mutable {
+        return background_msg("GOSSIP_DIGEST_SYN", [from, syn_msg = std::move(syn_msg)] (gms::gossiper& gossiper) mutable {
            return gossiper.handle_syn_msg(from, std::move(syn_msg));
-        }).handle_exception([] (auto ep) {
-            logger.warn("Fail to handle GOSSIP_DIGEST_SYN: {}", ep);
        });
-        return messaging_service::no_wait();
    });
    _messaging.register_gossip_digest_ack([this] (const rpc::client_info& cinfo, gossip_digest_ack msg) {
        auto from = netw::messaging_service::get_source(cinfo);
-        // In a new fiber.
-        (void)container().invoke_on(0, [from, msg = std::move(msg)] (gms::gossiper& gossiper) mutable {
+        return background_msg("GOSSIP_DIGEST_ACK", [from, msg = std::move(msg)] (gms::gossiper& gossiper) mutable {
            return gossiper.handle_ack_msg(from, std::move(msg));
-        }).handle_exception([] (auto ep) {
-            logger.warn("Fail to handle GOSSIP_DIGEST_ACK: {}", ep);
        });
-        return messaging_service::no_wait();
    });
    _messaging.register_gossip_digest_ack2([this] (const rpc::client_info& cinfo, gossip_digest_ack2 msg) {
        auto from = netw::messaging_service::get_source(cinfo);
-        // In a new fiber.
-        (void)container().invoke_on(0, [from, msg = std::move(msg)] (gms::gossiper& gossiper) mutable {
+        return background_msg("GOSSIP_DIGEST_ACK2", [from, msg = std::move(msg)] (gms::gossiper& gossiper) mutable {
            return gossiper.handle_ack2_msg(from, std::move(msg));
-        }).handle_exception([] (auto ep) {
-            logger.warn("Fail to handle GOSSIP_DIGEST_ACK2: {}", ep);
        });
-        return messaging_service::no_wait();
    });
    _messaging.register_gossip_echo([this] (const rpc::client_info& cinfo, rpc::optional<int64_t> generation_number_opt) {
        auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
        return handle_echo_msg(from, generation_number_opt);
    });
    _messaging.register_gossip_shutdown([this] (inet_address from, rpc::optional<int64_t> generation_number_opt) {
-        // In a new fiber.
-        (void)container().invoke_on(0, [from, generation_number_opt] (gms::gossiper& gossiper) {
+        return background_msg("GOSSIP_SHUTDOWN", [from, generation_number_opt] (gms::gossiper& gossiper) {
            return gossiper.handle_shutdown_msg(from, generation_number_opt);
-        }).handle_exception([] (auto ep) {
-            logger.warn("Fail to handle GOSSIP_SHUTDOWN: {}", ep);
        });
-        return messaging_service::no_wait();
    });
    _messaging.register_gossip_get_endpoint_states([this] (const rpc::client_info& cinfo, gossip_get_endpoint_states_request request) {
        return container().invoke_on(0, [request = std::move(request)] (gms::gossiper& gossiper) mutable {
@@ -1679,6 +1672,10 @@ bool gossiper::is_normal(const inet_address& endpoint) const {
    return get_gossip_status(endpoint) == sstring(versioned_value::STATUS_NORMAL);
 }

+bool gossiper::is_left(const inet_address& endpoint) const { // True when the endpoint's gossip status equals versioned_value::STATUS_LEFT.
+    return get_gossip_status(endpoint) == sstring(versioned_value::STATUS_LEFT);
+}
+
 bool gossiper::is_normal_ring_member(const inet_address& endpoint) const {
    auto status = get_gossip_status(endpoint);
    return status == sstring(versioned_value::STATUS_NORMAL) || status == sstring(versioned_value::SHUTDOWN);
@@ -2178,6 +2175,9 @@ future<> gossiper::start() {
 }

 future<> gossiper::shutdown() {
+    if (!_background_msg.is_closed()) {
+        co_await _background_msg.close();
+    }
    if (this_shard_id() == 0) {
        co_await do_stop_gossiping();
    }
--- a/gms/gossiper.hh
+++ b/gms/gossiper.hh
@@ -41,7 +41,9 @@
 #include "unimplemented.hh"
 #include <seastar/core/distributed.hh>
 #include <seastar/core/shared_ptr.hh>
+#include <seastar/core/gate.hh>
 #include <seastar/core/print.hh>
+#include <seastar/rpc/rpc_types.hh>
 #include "utils/atomic_vector.hh"
 #include "utils/UUID.hh"
 #include "utils/fb_utilities.hh"
@@ -138,12 +140,16 @@ private:
    bool _enabled = false;
    semaphore _callback_running{1};
    semaphore _apply_state_locally_semaphore{100};
+    seastar::gate _background_msg;
    std::unordered_map<gms::inet_address, syn_msg_pending> _syn_handlers;
    std::unordered_map<gms::inet_address, ack_msg_pending> _ack_handlers;
    bool _advertise_myself = true;
    // Map ip address and generation number
    std::unordered_map<gms::inet_address, int32_t> _advertise_to_nodes;
    future<> _failure_detector_loop_done{make_ready_future<>()} ;
+
+    rpc::no_wait_type background_msg(sstring type, noncopyable_function<future<>(gossiper&)> fn);
+
 public:
    // Get current generation number for the given nodes
    future<std::unordered_map<gms::inet_address, int32_t>>
@@ -565,6 +571,7 @@ public:
    bool is_seed(const inet_address& endpoint) const;
    bool is_shutdown(const inet_address& endpoint) const;
    bool is_normal(const inet_address& endpoint) const;
+    bool is_left(const inet_address& endpoint) const;
    // Check if a node is in NORMAL or SHUTDOWN status which means the node is
    // part of the token ring from the gossip point of view and operates in
    // normal status or was in normal status but is shutdown.
--- a/install.sh
+++ b/install.sh
@@ -520,8 +520,13 @@ relocate_python3 "$rprefix"/scyllatop tools/scyllatop/scyllatop.py
 if $supervisor; then
    install -d -m755 `supervisor_dir $retc`
    for service in scylla-server scylla-jmx scylla-node-exporter; do
+        if [ "$service" = "scylla-server" ]; then
+            program="scylla"
+        else
+            program=$service
+        fi
        cat << EOS > `supervisor_conf $retc $service`
-[program:$service]
+[program:$program]
 directory=$rprefix
 command=/bin/bash -c './supervisor/$service.sh'
 EOS
--- a/locator/azure_snitch.cc
+++ b/locator/azure_snitch.cc
@@ -61,6 +61,10 @@ azure_snitch::azure_snitch(const sstring& fname, unsigned io_cpuid) : production
 }

 future<> azure_snitch::load_config() {
+    if (this_shard_id() != io_cpu_id()) {
+        co_return;
+    }
+
    sstring region = co_await azure_api_call(REGION_NAME_QUERY_PATH);
    sstring azure_zone = co_await azure_api_call(ZONE_NAME_QUERY_PATH);

--- a/main.cc
+++ b/main.cc
@@ -377,11 +377,38 @@ static auto defer_verbose_shutdown(const char* what, Func&& func) {
        startlog.info("Shutting down {}", what);
        try {
            func();
+            startlog.info("Shutting down {} was successful", what);
        } catch (...) {
-            startlog.error("Unexpected error shutting down {}: {}", what, std::current_exception());
-            throw;
+            auto ex = std::current_exception();
+            bool do_abort = true;
+            try {
+                std::rethrow_exception(ex);
+            } catch (const std::system_error& e) {
+                // System error codes we consider "environmental",
+                // i.e. not scylla's fault, therefore there is no point in
+                // aborting and dumping core.
+                for (int i : {EIO, EACCES, ENOSPC}) {
+                    if (e.code() == std::error_code(i, std::system_category())) {
+                        do_abort = false;
+                        break;
+                    }
+                }
+            } catch (...) {
+            }
+            auto msg = fmt::format("Unexpected error shutting down {}: {}", what, ex);
+            if (do_abort) {
+                startlog.error("{}: aborting", msg);
+                abort();
+            } else {
+                startlog.error("{}: exiting, at {}", msg, current_backtrace());
+
+                // Call _exit() rather than exit() to exit immediately
+                // without calling exit handlers, avoiding
+                // boost::intrusive::detail::destructor_impl assert failure
+                // from ~segment_pool exit handler.
+                _exit(255);
+            }
        }
-        startlog.info("Shutting down {} was successful", what);
    };

    auto ret = deferred_action(std::move(vfunc));
--- a/memtable.cc
+++ b/memtable.cc
@@ -613,7 +613,8 @@ static flat_mutation_reader make_partition_snapshot_flat_reader_from_snp_schema(
        schema_ptr rev_snp_schema = snp->schema()->make_reversed();
        return make_partition_snapshot_flat_reader<true, partition_snapshot_read_accounter>(std::move(rev_snp_schema), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
    } else {
-        return make_partition_snapshot_flat_reader<false, partition_snapshot_read_accounter>(snp->schema(), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
+        schema_ptr snp_schema = snp->schema();
+        return make_partition_snapshot_flat_reader<false, partition_snapshot_read_accounter>(std::move(snp_schema), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
    }
 }

--- a/message/messaging_service.cc
+++ b/message/messaging_service.cc
@@ -628,7 +628,12 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
        remove_error_rpc_client(verb, id);
    }

-    auto must_encrypt = [&id, &verb, this] {
+    auto addr = get_preferred_ip(id.addr);
+    auto broadcast_address = utils::fb_utilities::get_broadcast_address();
+    bool listen_to_bc = _cfg.listen_on_broadcast_address && _cfg.ip != broadcast_address;
+    auto laddr = socket_address(listen_to_bc ? broadcast_address : _cfg.ip, 0);
+
+    auto must_encrypt = [&] {
        if (_cfg.encrypt == encrypt_what::none) {
            return false;
        }
@@ -646,13 +651,27 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
        auto& snitch_ptr = locator::i_endpoint_snitch::get_local_snitch_ptr();

        // either rack/dc need to be in same dc to use non-tls
-        if (snitch_ptr->get_datacenter(id.addr) != snitch_ptr->get_datacenter(utils::fb_utilities::get_broadcast_address())) {
+        auto my_dc = snitch_ptr->get_datacenter(broadcast_address);
+        if (snitch_ptr->get_datacenter(addr) != my_dc) {
+            return true;
+        }
+        // #9653 - if our idea of dc for bind address differs from our official endpoint address,
+        // we cannot trust downgrading. We need to ensure either (local) bind address is same as
+        // broadcast or that the dc info we get for it is the same.
+        if (broadcast_address != laddr && snitch_ptr->get_datacenter(laddr) != my_dc) {
            return true;
        }
        // if cross-rack tls, check rack.
-        return _cfg.encrypt == encrypt_what::rack &&
-            snitch_ptr->get_rack(id.addr) != snitch_ptr->get_rack(utils::fb_utilities::get_broadcast_address())
-            ;
+        if (_cfg.encrypt == encrypt_what::dc) {
+            return false;
+        }
+        auto my_rack = snitch_ptr->get_rack(broadcast_address);
+        if (snitch_ptr->get_rack(addr) != my_rack) {
+            return true;
+        }
+        // See above: We need to ensure either (local) bind address is same as
+        // broadcast or that the rack info we get for it is the same.
+        return broadcast_address != laddr && snitch_ptr->get_rack(laddr) != my_rack;
    }();

    auto must_compress = [&id, this] {
@@ -681,7 +700,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
        return true;
    }();

-    auto remote_addr = socket_address(get_preferred_ip(id.addr), must_encrypt ? _cfg.ssl_port : _cfg.port);
+    auto remote_addr = socket_address(addr, must_encrypt ? _cfg.ssl_port : _cfg.port);

    rpc::client_options opts;
    // send keepalive messages each minute if connection is idle, drop connection after 10 failures
@@ -691,13 +710,8 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
    }
    opts.tcp_nodelay = must_tcp_nodelay;
    opts.reuseaddr = true;
-    // We send cookies only for non-default statement tenant clients.
-    if (idx > 3) {
-        opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;
-    }
+    opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;

-    bool listen_to_bc = _cfg.listen_on_broadcast_address && _cfg.ip != utils::fb_utilities::get_broadcast_address();
-    auto laddr = socket_address(listen_to_bc ? utils::fb_utilities::get_broadcast_address() : _cfg.ip, 0);
    auto client = must_encrypt ?
                    ::make_shared<rpc_protocol_client_wrapper>(_rpc->protocol(), std::move(opts),
                                    remote_addr, laddr, _credentials) :
--- a/multishard_mutation_query.cc
+++ b/multishard_mutation_query.cc
@@ -694,11 +694,11 @@ future<typename ResultBuilder::result_type> do_query(
        ResultBuilder&& result_builder) {
    auto ctx = seastar::make_shared<read_context>(db, s, cmd, ranges, trace_state, timeout);

-    co_await ctx->lookup_readers(timeout);
-
    std::exception_ptr ex;

    try {
+        co_await ctx->lookup_readers(timeout);
+
        auto [last_ckey, result, unconsumed_buffer, compaction_state] = co_await read_page<ResultBuilder>(ctx, s, cmd, ranges, trace_state,
                std::move(result_builder));

--- a/mutation_reader.cc
+++ b/mutation_reader.cc
@@ -1545,18 +1545,20 @@ public:
 };

 future<> shard_reader::close() noexcept {
-    // Nothing to do if there was no reader created, nor is there a background
-    // read ahead in progress which will create one.
-    if (!_reader && !_read_ahead) {
-        co_return;
+    if (_read_ahead) {
+        try {
+            co_await *std::exchange(_read_ahead, std::nullopt);
+        } catch (...) {
+            mrlog.warn("shard_reader::close(): read_ahead on shard {} failed: {}", _shard, std::current_exception());
+        }
    }

    try {
-        if (_read_ahead) {
-            co_await *std::exchange(_read_ahead, std::nullopt);
-        }
-
        co_await smp::submit_to(_shard, [this] {
+            if (!_reader) {
+                return make_ready_future<>();
+            }
+
            auto irh = std::move(*_reader).inactive_read_handle();
            return with_closeable(flat_mutation_reader(_reader.release()), [this] (flat_mutation_reader& reader) mutable {
                auto permit = reader.permit();
--- a/mutation_writer/feed_writers.hh
+++ b/mutation_writer/feed_writers.hh
@@ -54,7 +54,7 @@ future<> feed_writer(flat_mutation_reader&& rd_ref, Writer wr) {
    auto rd = std::move(rd_ref);
    std::exception_ptr ex;
    try {
-        while (!rd.is_end_of_stream()) {
+        while (!rd.is_end_of_stream() || !rd.is_buffer_empty()) {
            co_await rd.fill_buffer();
            while (!rd.is_buffer_empty()) {
                co_await rd.pop_mutation_fragment().consume(wr);
--- a/partition_snapshot_reader.hh
+++ b/partition_snapshot_reader.hh
@@ -305,14 +305,23 @@ class partition_snapshot_flat_reader : public flat_mutation_reader::impl, public
                const std::optional<position_in_partition>& last_row,
                const std::optional<position_in_partition>& last_rts,
                position_in_partition_view pos) {
-            if (!_rt_stream.empty()) {
-                return _rt_stream.get_next(std::move(pos));
-            }
            return in_alloc_section([&] () -> mutation_fragment_opt {
                maybe_refresh_state(ck_range_snapshot, last_row, last_rts);

                position_in_partition::less_compare rt_less(_query_schema);

+                // The while loop below moves range tombstones from partition versions
+                // into _rt_stream, just enough to produce the next range tombstone.
+                // The main goal behind moving to _rt_stream is to deoverlap range tombstones
+                // which have the same starting position. This is not in order to satisfy
+                // flat_mutation_reader stream requirements; the reader can emit range tombstones
+                // which have the same position incrementally. This is to guarantee forward
+                // progress in the case iterators get invalidated and maybe_refresh_state()
+                // above needs to restore them. It does so using last_rts, which tracks
+                // the position of the last emitted range tombstone. All range tombstones
+                // with positions <= last_rts are skipped on refresh. To make progress,
+                // we need to make sure that all range tombstones with duplicated positions
+                // are emitted before maybe_refresh_state().
                while (has_more_range_tombstones()
                        && !rt_less(pos, peek_range_tombstone().position())
                        && (_rt_stream.empty() || !rt_less(_rt_stream.peek_next().position(), peek_range_tombstone().position()))) {
--- a/partition_snapshot_row_cursor.hh
+++ b/partition_snapshot_row_cursor.hh
@@ -325,7 +325,7 @@ public:
    // When throws, the cursor is invalidated and its position is not changed.
    bool advance_to(position_in_partition_view lower_bound) {
        prepare_heap(lower_bound);
-        bool found = no_clustering_row_between(_schema, lower_bound, _heap[0].it->position());
+        bool found = no_clustering_row_between_weak(_schema, lower_bound, _heap[0].it->position());
        recreate_current_row();
        return found;
    }
@@ -411,11 +411,11 @@ public:
        } else {
            // Copy row from older version because rows in evictable versions must
            // hold values which are independently complete to be consistent on eviction.
-            auto e = current_allocator().construct<rows_entry>(_schema, *_current_row[0].it);
+            auto e = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(_schema, *_current_row[0].it));
            e->set_continuous(latest_i && latest_i->continuous());
            _snp.tracker()->insert(*e);
-            rows.insert_before(latest_i, *e);
-            return {*e, true};
+            auto e_i = rows.insert_before(latest_i, std::move(e));
+            return ensure_result{*e_i, true};
        }
    }

@@ -447,11 +447,11 @@ public:
        }
        auto&& rows = _snp.version()->partition().mutable_clustered_rows();
        auto latest_i = get_iterator_in_latest_version();
-        auto e = current_allocator().construct<rows_entry>(_schema, pos, is_dummy(!pos.is_clustering_row()),
-            is_continuous(latest_i && latest_i->continuous()));
+        auto e = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(_schema, pos, is_dummy(!pos.is_clustering_row()),
+            is_continuous(latest_i && latest_i->continuous())));
        _snp.tracker()->insert(*e);
-        rows.insert_before(latest_i, *e);
-        return ensure_result{*e, true};
+        auto e_i = rows.insert_before(latest_i, std::move(e));
+        return ensure_result{*e_i, true};
    }

    // Brings the entry pointed to by the cursor to the front of the LRU
--- a/position_in_partition.hh
+++ b/position_in_partition.hh
@@ -575,6 +575,20 @@ bool no_clustering_row_between(const schema& s, position_in_partition_view a, po
    }
 }

+// Returns true if and only if there can't be any clustering_row with position >= a and < b.
+// It is assumed that a <= b.
+inline
+bool no_clustering_row_between_weak(const schema& s, position_in_partition_view a, position_in_partition_view b) {
+    clustering_key_prefix::equality eq(s);
+    if (a.has_key() && b.has_key()) {
+        return eq(a.key(), b.key())
+               && (a.get_bound_weight() == bound_weight::after_all_prefixed
+                   || b.get_bound_weight() != bound_weight::after_all_prefixed);
+    } else {
+        return !a.has_key() && !b.has_key();
+    }
+}
+
 // Includes all position_in_partition objects "p" for which: start <= p < end
 // And only those.
 class position_range {
@@ -659,3 +673,9 @@ inline
 bool position_range::is_all_clustered_rows(const schema& s) const {
    return _start.is_before_all_clustered_rows(s) && _end.is_after_all_clustered_rows(s);
 }
+
+// Assumes that the bounds of `r` are of 'clustered' type
+// and that `r` is non-empty (the left bound is smaller than the right bound).
+//
+// If `r` does not contain any keys, returns nullopt.
+std::optional<query::clustering_range> position_range_to_clustering_range(const position_range& r, const schema&);
--- a/query.cc
+++ b/query.cc
@@ -379,3 +379,52 @@ foreign_ptr<lw_shared_ptr<query::result>> result_merger::get() {
 }

 }
+
+std::optional<query::clustering_range> position_range_to_clustering_range(const position_range& r, const schema& s) {
+    assert(r.start().get_type() == partition_region::clustered);
+    assert(r.end().get_type() == partition_region::clustered);
+
+    if (r.start().has_key() && r.end().has_key()
+            && clustering_key_prefix::equality(s)(r.start().key(), r.end().key())) {
+        assert(r.start().get_bound_weight() != r.end().get_bound_weight());
+
+        if (r.end().get_bound_weight() == bound_weight::after_all_prefixed
+                && r.start().get_bound_weight() != bound_weight::after_all_prefixed) {
+            // [before x, after x) and [for x, after x) get converted to [x, x].
+            return query::clustering_range::make_singular(r.start().key());
+        }
+
+        // [before x, for x) does not contain any keys.
+        return std::nullopt;
+    }
+
+    // position_range -> clustering_range
+    // (recall that position_ranges are always left-closed, right-open):
+    // [before x, ...), [for x, ...) -> [x, ...
+    // [after x, ...) -> (x, ...
+    // [..., before x), [..., for x) -> ..., x)
+    // [..., after x) -> ..., x]
+
+    auto to_bound = [&s] (const position_in_partition& p, bool left) -> std::optional<query::clustering_range::bound> {
+        if (p.is_before_all_clustered_rows(s)) {
+            assert(left);
+            return {};
+        }
+
+        if (p.is_after_all_clustered_rows(s)) {
+            assert(!left);
+            return {};
+        }
+
+        assert(p.has_key());
+
+        auto bw = p.get_bound_weight();
+        bool inclusive = left
+            ? bw != bound_weight::after_all_prefixed
+            : bw == bound_weight::after_all_prefixed;
+
+        return query::clustering_range::bound{p.key(), inclusive};
+    };
+
+    return query::clustering_range{to_bound(r.start(), true), to_bound(r.end(), false)};
+}
--- a/range_tombstone_list.cc
+++ b/range_tombstone_list.cc
@@ -42,28 +42,34 @@ static auto construct_range_tombstone_entry(Args&&... args) {
 }

 void range_tombstone_list::apply_reversibly(const schema& s,
-        clustering_key_prefix start, bound_kind start_kind,
-        clustering_key_prefix end,
+        clustering_key_prefix start_key, bound_kind start_kind,
+        clustering_key_prefix end_key,
        bound_kind end_kind,
        tombstone tomb,
        reverter& rev)
 {
+    position_in_partition::less_compare less(s);
+    position_in_partition start(position_in_partition::range_tag_t(), bound_view(std::move(start_key), start_kind));
+    position_in_partition end(position_in_partition::range_tag_t(), bound_view(std::move(end_key), end_kind));
+
+    if (!less(start, end)) {
+        return;
+    }
+
    if (!_tombstones.empty()) {
-        bound_view::compare less(s);
-        bound_view start_bound(start, start_kind);
        auto last = --_tombstones.end();
        range_tombstones_type::iterator it;
-        if (less(start_bound, last->end_bound())) {
-            it = _tombstones.upper_bound(start_bound, [less](auto&& sb, auto&& rt) {
-                return less(sb, rt.end_bound());
+        if (less(start, last->end_position())) {
+            it = _tombstones.upper_bound(start, [less](auto&& sb, auto&& rt) {
+                return less(sb, rt.end_position());
            });
        } else {
            it = _tombstones.end();
        }
-        insert_from(s, std::move(it), std::move(start), start_kind, std::move(end), end_kind, std::move(tomb), rev);
+        insert_from(s, std::move(it), std::move(start), std::move(end), std::move(tomb), rev);
        return;
    }
-    auto rt = construct_range_tombstone_entry(std::move(start), start_kind, std::move(end), end_kind, std::move(tomb));
+    auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), std::move(tomb));
    rev.insert(_tombstones.end(), *rt);
    rt.release();
 }
@@ -81,35 +87,31 @@ void range_tombstone_list::apply_reversibly(const schema& s,
 */
 void range_tombstone_list::insert_from(const schema& s,
    range_tombstones_type::iterator it,
-    clustering_key_prefix start,
-    bound_kind start_kind,
-    clustering_key_prefix end,
-    bound_kind end_kind,
+    position_in_partition start,
+    position_in_partition end,
    tombstone tomb,
    reverter& rev)
 {
-    bound_view::compare less(s);
-    bound_view end_bound(end, end_kind);
+    position_in_partition::tri_compare cmp(s);
+
    if (it != _tombstones.begin()) {
        auto prev = std::prev(it);
-        if (prev->tombstone().tomb == tomb && prev->end_bound().adjacent(s, bound_view(start, start_kind))) {
-            start = prev->tombstone().start;
-            start_kind = prev->tombstone().start_kind;
+        if (prev->tombstone().tomb == tomb && cmp(prev->end_position(), start) == 0) {
+            start = prev->position();
            rev.erase(prev);
        }
    }
    while (it != _tombstones.end()) {
-        bound_view start_bound(start, start_kind);
-        if (less(end_bound, start_bound)) {
+        if (cmp(end, start) <= 0) {
            return;
        }

-        if (less(end_bound, it->start_bound())) {
+        if (cmp(end, it->position()) < 0) {
            // not overlapping
-            if (it->tombstone().tomb == tomb && end_bound.adjacent(s, it->start_bound())) {
-                rev.update(it, {std::move(start), start_kind, it->tombstone().end, it->tombstone().end_kind, tomb});
+            if (it->tombstone().tomb == tomb && cmp(end, it->position()) == 0) {
+                rev.update(it, {std::move(start), std::move(start), tomb});
            } else {
-                auto rt = construct_range_tombstone_entry(std::move(start), start_kind, std::move(end), end_kind, tomb);
+                auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), tomb);
                rev.insert(it, *rt);
                rt.release();
            }
@@ -119,34 +121,29 @@ void range_tombstone_list::insert_from(const schema& s,
        auto c = tomb <=> it->tombstone().tomb;
        if (c == 0) {
            // same timestamp, overlapping or adjacent, so merge.
-            if (less(it->start_bound(), start_bound)) {
-                start = it->tombstone().start;
-                start_kind = it->tombstone().start_kind;
+            if (cmp(it->position(), start) < 0) {
+                start = it->position();
            }
-            if (less(end_bound, it->end_bound())) {
-                end = it->tombstone().end;
-                end_kind = it->tombstone().end_kind;
-                end_bound = bound_view(end, end_kind);
+            if (cmp(end, it->end_position()) < 0) {
+                end = it->end_position();
            }
            it = rev.erase(it);
        } else if (c > 0) {
            // We overwrite the current tombstone.

-            if (less(it->start_bound(), start_bound)) {
-                auto new_end = bound_view(start, invert_kind(start_kind));
-                if (!less(new_end, it->start_bound())) {
-                    // Here it->start < start
-                    auto rt = construct_range_tombstone_entry(it->start_bound(), new_end, it->tombstone().tomb);
-                    rev.update(it, {start_bound, it->end_bound(), it->tombstone().tomb});
+            if (cmp(it->position(), start) < 0) {
+                {
+                    auto rt = construct_range_tombstone_entry(it->position(), start, it->tombstone().tomb);
+                    rev.update(it, {start, it->end_position(), it->tombstone().tomb});
                    rev.insert(it, *rt);
                    rt.release();
                }
            }

-            if (less(end_bound, it->end_bound())) {
+            if (cmp(end, it->end_position()) < 0) {
                // Here start <= it->start and end < it->end.
-                auto rt = construct_range_tombstone_entry(std::move(start), start_kind, end, end_kind, std::move(tomb));
-                rev.update(it, {std::move(end), invert_kind(end_kind), it->tombstone().end, it->tombstone().end_kind, it->tombstone().tomb});
+                auto rt = construct_range_tombstone_entry(std::move(start), end, std::move(tomb));
+                rev.update(it, {std::move(end), it->end_position(), it->tombstone().tomb});
                rev.insert(it, *rt);
                rt.release();
                return;
@@ -157,30 +154,28 @@ void range_tombstone_list::insert_from(const schema& s,
        } else {
            // We don't overwrite the current tombstone.

-            if (less(start_bound, it->start_bound())) {
+            if (cmp(start, it->position()) < 0) {
                // The new tombstone starts before the current one.
-                if (less(it->start_bound(), end_bound)) {
+                if (cmp(it->position(), end) < 0) {
                    // Here start < it->start and it->start < end.
-                    auto new_end_kind = invert_kind(it->tombstone().start_kind);
-                    if (!less(bound_view(it->tombstone().start, new_end_kind), start_bound)) {
-                        auto rt = construct_range_tombstone_entry(std::move(start), start_kind, it->tombstone().start, new_end_kind, tomb);
+                    {
+                        auto rt = construct_range_tombstone_entry(std::move(start), it->position(), tomb);
                        it = rev.insert(it, *rt);
                        rt.release();
                        ++it;
                    }
                } else {
                    // Here start < it->start and end <= it->start, so just insert the new tombstone.
-                    auto rt = construct_range_tombstone_entry(std::move(start), start_kind, std::move(end), end_kind, std::move(tomb));
+                    auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), std::move(tomb));
                    rev.insert(it, *rt);
                    rt.release();
                    return;
                }
            }

-            if (less(it->end_bound(), end_bound)) {
+            if (cmp(it->end_position(), end) < 0) {
                // Here the current tombstone overwrites a range of the new one.
-                start = it->tombstone().end;
-                start_kind = invert_kind(it->tombstone().end_kind);
+                start = it->end_position();
                ++it;
            } else {
                // Here the current tombstone completely overwrites the new one.
@@ -190,7 +185,7 @@ void range_tombstone_list::insert_from(const schema& s,
    }

    // If we got here, then just insert the remainder at the end.
-    auto rt = construct_range_tombstone_entry(std::move(start), start_kind, std::move(end), end_kind, std::move(tomb));
+    auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), std::move(tomb));
    rev.insert(it, *rt);
    rt.release();
 }
--- a/range_tombstone_list.hh
+++ b/range_tombstone_list.hh
@@ -297,7 +297,13 @@ public:
 private:
    void apply_reversibly(const schema& s, clustering_key_prefix start, bound_kind start_kind,
                          clustering_key_prefix end, bound_kind end_kind, tombstone tomb, reverter& rev);
-    void insert_from(const schema& s, range_tombstones_type::iterator it, clustering_key_prefix start,
-                     bound_kind start_kind, clustering_key_prefix end, bound_kind end_kind, tombstone tomb, reverter& rev);
+
+    void insert_from(const schema& s,
+                     range_tombstones_type::iterator it,
+                     position_in_partition start,
+                     position_in_partition end,
+                     tombstone tomb,
+                     reverter& rev);
+
    range_tombstones_type::iterator find(const schema& s, const range_tombstone_entry& rt);
 };
--- a/reader_concurrency_semaphore.cc
+++ b/reader_concurrency_semaphore.cc
@@ -249,6 +249,14 @@ public:
        return _base_resources;
    }

+    void release_base_resources() noexcept {
+        if (_base_resources_consumed) {
+            _resources -= _base_resources;
+            _base_resources_consumed = false;
+        }
+        _semaphore.signal(std::exchange(_base_resources, {}));
+    }
+
    sstring description() const {
        return format("{}.{}:{}",
                _schema ? _schema->ks_name() : "*",
@@ -394,6 +402,10 @@ reader_resources reader_permit::base_resources() const {
    return _impl->base_resources();
 }

+void reader_permit::release_base_resources() noexcept {
+    return _impl->release_base_resources();
+}
+
 sstring reader_permit::description() const {
    return _impl->description();
 }
--- a/reader_permit.hh
+++ b/reader_permit.hh
@@ -161,6 +161,8 @@ public:

    reader_resources base_resources() const;

+    void release_base_resources() noexcept;
+
    sstring description() const;

    db::timeout_clock::time_point timeout() const noexcept;
--- a/repair/row_level.cc
+++ b/repair/row_level.cc
@@ -407,6 +407,10 @@ public:
                    {},
                    mutation_reader::forwarding::no);
        } else {
+            // We can't have two permits with a count resource for one repair,
+            // so we release the one on _permit; the only one left is the one
+            // the shard reader will obtain.
+            _permit.release_base_resources();
            _reader = make_multishard_streaming_reader(db, _schema, _permit, [this] {
                auto shard_range = _sharder.next();
                if (shard_range) {
--- a/2
+++ b/2
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -635,16 +635,16 @@ void storage_service::bootstrap() {

        // Update pending ranges now, so we correctly count ourselves as a pending replica
        // when inserting the new CDC generation.
-      if (!bootstrap_rbno) {
-        // When is_repair_based_node_ops_enabled is true, the bootstrap node
-        // will use node_ops_cmd to bootstrap, node_ops_cmd will update the pending ranges.
-        slogger.debug("bootstrap: update pending ranges: endpoint={} bootstrap_tokens={}", get_broadcast_address(), _bootstrap_tokens);
-        mutate_token_metadata([this] (mutable_token_metadata_ptr tmptr) {
-            auto endpoint = get_broadcast_address();
-            tmptr->add_bootstrap_tokens(_bootstrap_tokens, endpoint);
-            return update_pending_ranges(std::move(tmptr), format("bootstrapping node {}", endpoint));
-        }).get();
-      }
+        if (!bootstrap_rbno) {
+            // When is_repair_based_node_ops_enabled is true, the bootstrap node
+            // will use node_ops_cmd to bootstrap, node_ops_cmd will update the pending ranges.
+            slogger.debug("bootstrap: update pending ranges: endpoint={} bootstrap_tokens={}", get_broadcast_address(), _bootstrap_tokens);
+            mutate_token_metadata([this] (mutable_token_metadata_ptr tmptr) {
+                auto endpoint = get_broadcast_address();
+                tmptr->add_bootstrap_tokens(_bootstrap_tokens, endpoint);
+                return update_pending_ranges(std::move(tmptr), format("bootstrapping node {}", endpoint));
+            }).get();
+        }

        // After we pick a generation timestamp, we start gossiping it, and we stick with it.
        // We don't do any other generation switches (unless we crash before complecting bootstrap).
@@ -652,19 +652,23 @@ void storage_service::bootstrap() {

        _cdc_gen_id = _cdc_gen_service.local().make_new_generation(_bootstrap_tokens, !is_first_node()).get0();

-      if (!bootstrap_rbno) {
-        // When is_repair_based_node_ops_enabled is true, the bootstrap node
-        // will use node_ops_cmd to bootstrap, bootstrapping gossip status is not needed for bootstrap.
-        _gossiper.add_local_application_state({
-            // Order is important: both the CDC streams timestamp and tokens must be known when a node handles our status.
-            { gms::application_state::TOKENS, versioned_value::tokens(_bootstrap_tokens) },
-            { gms::application_state::CDC_GENERATION_ID, versioned_value::cdc_generation_id(_cdc_gen_id) },
-            { gms::application_state::STATUS, versioned_value::bootstrapping(_bootstrap_tokens) },
-        }).get();
+        if (!bootstrap_rbno) {
+            // When is_repair_based_node_ops_enabled is true, the bootstrap node
+            // will use node_ops_cmd to bootstrap, bootstrapping gossip status is not needed for bootstrap.
+            _gossiper.add_local_application_state({
+                { gms::application_state::TOKENS, versioned_value::tokens(_bootstrap_tokens) },
+                { gms::application_state::CDC_GENERATION_ID, versioned_value::cdc_generation_id(_cdc_gen_id) },
+                { gms::application_state::STATUS, versioned_value::bootstrapping(_bootstrap_tokens) },
+            }).get();

-        set_mode(mode::JOINING, format("sleeping {} ms for pending range setup", get_ring_delay().count()), true);
-        _gossiper.wait_for_range_setup().get();
-     }
+            set_mode(mode::JOINING, format("sleeping {} ms for pending range setup", get_ring_delay().count()), true);
+            _gossiper.wait_for_range_setup().get();
+        } else {
+            // Even with RBNO bootstrap we need to announce the new CDC generation immediately after it's created.
+            _gossiper.add_local_application_state({
+                { gms::application_state::CDC_GENERATION_ID, versioned_value::cdc_generation_id(_cdc_gen_id) },
+            }).get();
+        }
    } else {
        // Wait until we know tokens of existing node before announcing replacing status.
        set_mode(mode::JOINING, fmt::format("Wait until local node knows tokens of peer nodes"), true);
@@ -3670,7 +3674,7 @@ shared_ptr<abort_source> node_ops_meta_data::get_abort_source() {

 void storage_service::node_ops_update_heartbeat(utils::UUID ops_uuid) {
    slogger.debug("node_ops_update_heartbeat: ops_uuid={}", ops_uuid);
-    auto permit = seastar::get_units(_node_ops_abort_sem, 1);
+    auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
    auto it = _node_ops.find(ops_uuid);
    if (it != _node_ops.end()) {
        node_ops_meta_data& meta = it->second;
@@ -3680,7 +3684,7 @@ void storage_service::node_ops_update_heartbeat(utils::UUID ops_uuid) {

 void storage_service::node_ops_done(utils::UUID ops_uuid) {
    slogger.debug("node_ops_done: ops_uuid={}", ops_uuid);
-    auto permit = seastar::get_units(_node_ops_abort_sem, 1);
+    auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
    auto it = _node_ops.find(ops_uuid);
    if (it != _node_ops.end()) {
        node_ops_meta_data& meta = it->second;
@@ -3691,7 +3695,7 @@ void storage_service::node_ops_done(utils::UUID ops_uuid) {

 void storage_service::node_ops_abort(utils::UUID ops_uuid) {
    slogger.debug("node_ops_abort: ops_uuid={}", ops_uuid);
-    auto permit = seastar::get_units(_node_ops_abort_sem, 1);
+    auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
    auto it = _node_ops.find(ops_uuid);
    if (it != _node_ops.end()) {
        node_ops_meta_data& meta = it->second;
--- a/sstables/partition_index_cache.hh
+++ b/sstables/partition_index_cache.hh
@@ -49,12 +49,13 @@ private:
    public:
        partition_index_cache* _parent;
        key_type _key;
-        std::variant<shared_promise<>, partition_index_page> _page;
+        std::variant<lw_shared_ptr<shared_promise<>>, partition_index_page> _page;
        size_t _size_in_allocator = 0;
    public:
        entry(partition_index_cache* parent, key_type key)
                : _parent(parent)
                , _key(key)
+                , _page(make_lw_shared<shared_promise<>>())
        { }

        void set_page(partition_index_page&& page) noexcept {
@@ -67,7 +68,12 @@ private:
        entry(entry&&) noexcept = default;

        ~entry() {
-            assert(!is_referenced());
+            if (is_referenced()) {
+                // Live entry_ptr should keep the entry alive, except when the entry failed on loading.
+                // In that case, entry_ptr holders are not supposed to use the pointer, so it's safe
+                // to nullify those entry_ptrs.
+                assert(!ready());
+            }
        }

        void on_evicted() noexcept override;
@@ -76,7 +82,7 @@ private:
        // Always returns the same value for a given state of _page.
        size_t size_in_allocator() const { return _size_in_allocator; }

-        shared_promise<>& promise() { return std::get<shared_promise<>>(_page); }
+        lw_shared_ptr<shared_promise<>> promise() { return std::get<lw_shared_ptr<shared_promise<>>>(_page); }
        bool ready() const { return std::holds_alternative<partition_index_page>(_page); }
        partition_index_page& page() { return std::get<partition_index_page>(_page); }
        const partition_index_page& page() const { return std::get<partition_index_page>(_page); }
@@ -207,9 +213,7 @@ public:
                return make_ready_future<entry_ptr>(std::move(ptr));
            } else {
                ++_shard_stats.blocks;
-                return _as(_region, [ptr] () mutable {
-                    return ptr.get_entry().promise().get_shared_future();
-                }).then([ptr] () mutable {
+                return ptr.get_entry().promise()->get_shared_future().then([ptr] () mutable {
                    return std::move(ptr);
                });
            }
@@ -234,23 +238,23 @@ public:

        // No exceptions before then_wrapped() is installed so that ptr will be eventually populated.

-        return futurize_invoke(loader, key).then_wrapped([this, key, ptr] (auto&& f) mutable {
+        return futurize_invoke(loader, key).then_wrapped([this, key, ptr = std::move(ptr)] (auto&& f) mutable {
            entry& e = ptr.get_entry();
            try {
                partition_index_page&& page = f.get0();
-                e.promise().set_value();
+                e.promise()->set_value();
                e.set_page(std::move(page));
                _shard_stats.used_bytes += e.size_in_allocator();
                ++_shard_stats.populations;
+                return ptr;
            } catch (...) {
-                e.promise().set_exception(std::current_exception());
+                e.promise()->set_exception(std::current_exception());
+                ptr = {};
                with_allocator(_region.allocator(), [&] {
                    _cache.erase(key);
                });
                throw;
            }
-        }).then([ptr] {
-            return ptr;
        });
    }

--- a/sstables/sstable_set.cc
+++ b/sstables/sstable_set.cc
@@ -400,10 +400,15 @@ void time_series_sstable_set::for_each_sstable(std::function<void(const shared_s

 // O(log n)
 void time_series_sstable_set::insert(shared_sstable sst) {
+  try {
    auto min_pos = sst->min_position();
    auto max_pos_reversed = sst->max_position().reversed();
    _sstables->emplace(std::move(min_pos), sst);
    _sstables_reversed->emplace(std::move(max_pos_reversed), std::move(sst));
+  } catch (...) {
+    erase(sst);
+    throw;
+  }
 }

 // O(n) worst case, but should be close to O(log n) most of the time
--- a/table.cc
+++ b/table.cc
@@ -1493,13 +1493,14 @@ bool table::can_flush() const {
 }

 future<> table::clear() {
+    auto permits = co_await _config.dirty_memory_manager->get_all_flush_permits();
    if (_commitlog) {
        for (auto& t : *_memtables) {
            _commitlog->discard_completed_segments(_schema->id(), t->get_and_discard_rp_set());
        }
    }
    _memtables->clear_and_add();
-    return _cache.invalidate(row_cache::external_updater([] { /* There is no underlying mutation source */ }));
+    co_await _cache.invalidate(row_cache::external_updater([] { /* There is no underlying mutation source */ }));
 }

 // NOTE: does not need to be futurized, but might eventually, depending on
--- a/test.py
+++ b/test.py
@@ -291,6 +291,8 @@ class Test:
    def print_summary(self):
        pass

+    def get_junit_etree(self):
+        return None

    def check_log(self, trim):
        """Check and trim logs and xml output for tests which have it"""
@@ -338,9 +340,36 @@ class BoostTest(UnitTest):
        boost_args += ['--color_output=false']
        boost_args += ['--']
        self.args = boost_args + self.args
+        self.casename = casename
+        self.__junit_etree = None
+
+    def get_junit_etree(self):
+        def adjust_suite_name(name):
+            # Normalize "path/to/file.cc" to "path.to.file" to conform to
+            # Jenkins expectations that the suite name is a class name. ".cc"
+            # doesn't add any information. Add the mode, otherwise failures
+            # in different modes are indistinguishable. The "test/" prefix adds
+            # no information, so remove it.
+            import re
+            name = re.sub(r'^test/', '', name)
+            name = re.sub(r'\.cc$', '', name)
+            name = re.sub(r'/', '.', name)
+            name = f'{name}.{self.mode}'
+            return name
+        if self.__junit_etree is None:
+            self.__junit_etree = ET.parse(self.xmlout)
+            root = self.__junit_etree.getroot()
+            suites = root.findall('.//TestSuite')
+            for suite in suites:
+                suite.attrib['name'] = adjust_suite_name(suite.attrib['name'])
+                skipped = suite.findall('./TestCase[@reason="disabled"]')
+                for e in skipped:
+                    suite.remove(e)
+            os.unlink(self.xmlout)
+        return self.__junit_etree

    def check_log(self, trim):
-        ET.parse(self.xmlout)
+        self.get_junit_etree()
        super().check_log(trim)


@@ -800,6 +829,17 @@ def write_junit_report(tmpdir, mode):
    with open(junit_filename, "w") as f:
        ET.ElementTree(xml_results).write(f, encoding="unicode")

+def write_consolidated_boost_junit_xml(tmpdir, mode):
+    """Merge the per-test boost XML trees of one mode into a single report.
+
+    Collects the TestSuite elements of every test whose mode equals `mode`
+    and which provides a tree, places them under a TestLog root, and writes
+    the result to <tmpdir>/<mode>/xml/boost.xunit.xml.
+    """
+    merged_root = ET.Element("TestLog")
+    tests_in_mode = (t for s in TestSuite.suites.values() for t in s.tests if t.mode == mode)
+    for t in tests_in_mode:
+        tree = t.get_junit_etree()
+        if tree is None:
+            # Tests without an XML report contribute nothing.
+            continue
+        merged_root.extend(tree.getroot().findall('.//TestSuite'))
+    ET.ElementTree(merged_root).write(f'{tmpdir}/{mode}/xml/boost.xunit.xml', encoding='unicode')

 def open_log(tmpdir):
    pathlib.Path(tmpdir).mkdir(parents=True, exist_ok=True)
@@ -839,6 +879,7 @@ async def main():

    for mode in options.modes:
        write_junit_report(options.tmpdir, mode)
+        write_consolidated_boost_junit_xml(options.tmpdir, mode)

    if 'coverage' in options.modes:
        coverage.generate_coverage_report("build/coverage", "tests")
--- a/test/alternator/test_item.py
+++ b/test/alternator/test_item.py
@@ -374,6 +374,14 @@ def test_getitem_attributes_to_get_duplicate(dynamodb, test_table):
    with pytest.raises(ClientError, match='ValidationException.*Duplicate'):
        test_table.get_item(Key={'p': p, 'c': c}, AttributesToGet=['a', 'a'], ConsistentRead=True)

+# Verify that it is forbidden to ask for an empty AttributesToGet
+# Reproduces issue #10332.
+def test_getitem_attributes_to_get_empty(dynamodb, test_table):
+    # GetItem with an empty AttributesToGet list must raise a ValidationException.
+    key = {'p': random_string(), 'c': random_string()}
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.get_item(Key=key, AttributesToGet=[], ConsistentRead=True)
+
 # Basic test for DeleteItem, with hash key only
 def test_delete_item_hash(test_table_s):
    p = random_string()
--- a/test/alternator/test_query.py
+++ b/test/alternator/test_query.py
@@ -170,6 +170,13 @@ def test_query_attributes_to_get(dynamodb, test_table):
        expected_items = [{k: x[k] for k in wanted if k in x} for x in items]
        assert multiset(expected_items) == multiset(got_items)

+# Verify that it is forbidden to ask for an empty AttributesToGet
+# Reproduces issue #10332.
+def test_query_attributes_to_get_empty(dynamodb, test_table):
+    # Query with an empty AttributesToGet list must raise a ValidationException.
+    hash_key_equals = {'AttributeValueList': [random_string()], 'ComparisonOperator': 'EQ'}
+    with pytest.raises(ClientError, match='ValidationException'):
+        full_query(test_table, KeyConditions={'p': hash_key_equals}, AttributesToGet=[])
+
 # Test that in a table with both hash key and sort key, which keys we can
 # Query by: We can Query by the hash key, by a combination of both hash and
 # sort keys, but *cannot* query by just the sort key, and obviously not
--- a/test/alternator/test_table.py
+++ b/test/alternator/test_table.py
@@ -16,6 +16,9 @@
 # along with Scylla.  If not, see <http://www.gnu.org/licenses/>.

 # Tests for basic table operations: CreateTable, DeleteTable, ListTables.
+# Also some basic tests for UpdateTable - although UpdateTable usually
+# enables more elaborate features (such as GSI or Streams) and those are
+# tested elsewhere.

 import pytest
 from botocore.exceptions import ClientError
@@ -311,3 +314,17 @@ def test_table_sse_off(dynamodb):
        KeySchema=[{ 'AttributeName': 'p', 'KeyType': 'HASH' }],
        AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }]);
    table.delete();
+
+# Test that trying to delete a table that doesn't exist fails in the
+# appropriate way (ResourceNotFoundException)
+def test_delete_table_non_existent(dynamodb, test_table):
+    # A random 20-character name stands in for the table that does not exist.
+    low_level = dynamodb.meta.client
+    missing_table = random_string(20)
+    with pytest.raises(ClientError, match='ResourceNotFoundException'):
+        low_level.delete_table(TableName=missing_table)
+
+# Test that trying to update a table that doesn't exist fails in the
+# appropriate way (ResourceNotFoundException)
+def test_update_table_non_existent(dynamodb, test_table):
+    # A random 20-character name stands in for the table that does not exist.
+    low_level = dynamodb.meta.client
+    missing_table = random_string(20)
+    with pytest.raises(ClientError, match='ResourceNotFoundException'):
+        low_level.update_table(TableName=missing_table, BillingMode='PAY_PER_REQUEST')
--- a/test/alternator/test_update_expression.py
+++ b/test/alternator/test_update_expression.py
@@ -1043,6 +1043,20 @@ def test_nested_attribute_remove_from_missing_item(test_table_s):
    test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE x.y')
    test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE x[0]')

+# Though in an above test (test_nested_attribute_update_bad_path_dot) we
+# showed that DynamoDB does not allow REMOVE x.y if attribute x doesn't
+# exist - and generates a ValidationException, if x *does* exist but y
+# doesn't, it's fine and the removal should just be silently ignored.
+def test_nested_attribute_remove_missing_leaf(test_table_s):
+    p = random_string()
+    original = {'p': p, 'a': {'x': 3}, 'b': ['hi']}
+    test_table_s.put_item(Item=original)
+    # Each expression removes something that is not there: a map member,
+    # a list index, and a top-level attribute.
+    for expression in ('REMOVE a.y', 'REMOVE b[7]', 'REMOVE c'):
+        test_table_s.update_item(Key={'p': p}, UpdateExpression=expression)
+    # None of the UpdateItem calls above may have changed the stored item.
+    stored = test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']
+    assert stored == original
+
 # Similarly for other types of bad paths - using [0] on something which
 # doesn't exist or isn't an array.
 def test_nested_attribute_update_bad_path_array(test_table_s):
--- a/test/boost/cached_file_test.cc
+++ b/test/boost/cached_file_test.cc
@@ -19,6 +19,7 @@
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

+#include <boost/range/irange.hpp>
 #include <seastar/testing/test_case.hh>
 #include <seastar/testing/thread_test_case.hh>
 #include <seastar/core/iostream.hh>
@@ -49,6 +50,15 @@ static sstring read_to_string(cached_file::stream& s, size_t limit = std::numeri
    return b.substr(0, limit);
 }

+// Drains the stream and throws the data away. Reading stops at the first
+// empty buffer, or as soon as a buffer is at least as large as what is left
+// of `limit`.
+static void read_to_void(cached_file::stream& s, size_t limit = std::numeric_limits<size_t>::max()) {
+    for (;;) {
+        auto chunk = s.next().get0();
+        if (!chunk) {
+            return;
+        }
+        const auto chunk_size = chunk.size();
+        if (chunk_size >= limit) {
+            return;
+        }
+        limit -= chunk_size;
+    }
+}
+
 static sstring read_to_string(file& f, size_t start, size_t len) {
    file_input_stream_options opt;
    auto in = make_file_input_stream(f, start, len, opt);
@@ -61,6 +71,12 @@ static sstring read_to_string(cached_file& cf, size_t off, size_t limit = std::n
    return read_to_string(s, limit);
 }

+// Opens a stream over `cf` at offset `off` and discards up to `limit` bytes.
+// The only caller in this file is compiled out under
+// SEASTAR_DEFAULT_ALLOCATOR, hence the unused attribute.
+[[gnu::unused]]
+static void read_to_void(cached_file& cf, size_t off, size_t limit = std::numeric_limits<size_t>::max()) {
+    auto stream = cf.read(off, default_priority_class(), std::nullopt);
+    read_to_void(stream, limit);
+}
+
 struct test_file {
    tmpdir dir;
    file f;
@@ -204,7 +220,9 @@ SEASTAR_THREAD_TEST_CASE(test_eviction_via_lru) {
        }

        {
-            cf_lru.evict_all();
+            with_allocator(region.allocator(), [] {
+                cf_lru.evict_all();
+            });

            BOOST_REQUIRE_EQUAL(0, metrics.cached_bytes); // change here
            BOOST_REQUIRE_EQUAL(0, cf.cached_bytes()); // change here
@@ -212,6 +230,8 @@ SEASTAR_THREAD_TEST_CASE(test_eviction_via_lru) {
            BOOST_REQUIRE_EQUAL(3, metrics.page_evictions); // change here
            BOOST_REQUIRE_EQUAL(0, metrics.page_hits);
            BOOST_REQUIRE_EQUAL(3, metrics.page_populations);
+
+            BOOST_REQUIRE_EQUAL(region.occupancy().used_space(), 0);
        }

        {
@@ -255,6 +275,88 @@ SEASTAR_THREAD_TEST_CASE(test_eviction_via_lru) {
    }
 }

+// A file which serves garbage but is very fast.
+//
+// Only dma_read_bulk() and close() are usable: dma_read_bulk() returns a
+// ready future holding a freshly constructed buffer of the requested size
+// that nothing here writes to, and close() is a no-op. Every other
+// operation throws std::logic_error through unsupported().
+class garbage_file_impl : public file_impl {
+private:
+    // Common failure path for all operations this stub does not implement.
+    [[noreturn]] void unsupported() {
+        throw_with_backtrace<std::logic_error>("unsupported operation");
+    }
+public:
+    // unsupported
+    virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override { unsupported(); }
+    virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override { unsupported(); }
+    virtual future<> flush(void) override { unsupported(); }
+    virtual future<> truncate(uint64_t length) override { unsupported(); }
+    virtual future<> discard(uint64_t offset, uint64_t length) override { unsupported(); }
+    virtual future<> allocate(uint64_t position, uint64_t length) override { unsupported(); }
+    virtual subscription<directory_entry> list_directory(std::function<future<>(directory_entry)>) override { unsupported(); }
+    virtual future<struct stat> stat(void) override { unsupported(); }
+    virtual future<uint64_t> size(void) override { unsupported(); }
+    virtual std::unique_ptr<seastar::file_handle_impl> dup() override { unsupported(); }
+
+    // Nothing to release.
+    virtual future<> close() override { return make_ready_future<>(); }
+
+    // The offset and priority class are ignored; only `size` determines the result.
+    virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t size, const io_priority_class& pc) override {
+        return make_ready_future<temporary_buffer<uint8_t>>(temporary_buffer<uint8_t>(size));
+    }
+
+    virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) override {
+        unsupported(); // FIXME
+    }
+
+    virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override {
+        unsupported(); // FIXME
+    }
+};
+
+#ifndef SEASTAR_DEFAULT_ALLOCATOR // Eviction works only with the seastar allocator
+// Stress test for page eviction. It reads the first cached_size bytes of a
+// garbage-backed cached_file page by page, allocates 1024-byte blobs until
+// the region's reclaim counter changes, and then issues n_pages * 16 reads
+// at random page offsets, each limited to `stride` pages.
+SEASTAR_THREAD_TEST_CASE(test_stress_eviction) {
+    auto page_size = cached_file::page_size;
+    auto n_pages = 8'000'000 / page_size;
+    auto file_size = page_size * n_pages;
+    auto cached_size = 4'000'000;
+
+    cached_file::metrics metrics;
+    logalloc::region region;
+
+    auto f = file(make_shared<garbage_file_impl>());
+    cached_file cf(f, metrics, cf_lru, region, file_size);
+
+    // Eviction callback: drop pages from the front of the file, then ask the LRU to evict.
+    region.make_evictable([&] {
+        testlog.trace("Evicting");
+        cf.invalidate_at_most_front(file_size / 2);
+        return cf_lru.evict();
+    });
+
+    // Read the first cached_size bytes one page at a time.
+    for (int i = 0; i < (cached_size / page_size); ++i) {
+        read_to_string(cf, page_size * i, page_size);
+    }
+
+    testlog.debug("Saturating memory...");
+
+    // Disable background reclaiming which will prevent bugs from reproducing
+    // We want reclamation to happen synchronously with page cache population in read_to_void()
+    seastar::memory::set_min_free_pages(0);
+
+    // Saturate std memory
+    chunked_fifo<bytes> blobs;
+    auto rc = region.reclaim_counter();
+    // Keep allocating until the region's reclaim counter moves.
+    while (region.reclaim_counter() == rc) {
+        blobs.emplace_back(bytes(bytes::initialized_later(), 1024));
+    }
+
+    testlog.debug("Memory: allocated={}, free={}", seastar::memory::stats().allocated_memory(), seastar::memory::stats().free_memory());
+    testlog.debug("Starting test...");
+
+    for (int j = 0; j < n_pages * 16; ++j) {
+        testlog.trace("Allocating");
+        auto stride = tests::random::get_int(1, 20);
+        // Subtracting stride keeps the read window inside the file.
+        auto page_idx = tests::random::get_int(n_pages - stride);
+        read_to_void(cf, page_idx * page_size, page_size * stride);
+    }
+}
+#endif
+
 SEASTAR_THREAD_TEST_CASE(test_invalidation) {
    auto page_size = cached_file::page_size;
    test_file tf = make_test_file(page_size * 2);
--- a/test/boost/chunked_managed_vector_test.cc
+++ b/test/boost/chunked_managed_vector_test.cc
@@ -25,6 +25,8 @@
 #include <deque>
 #include <random>
 #include "utils/lsa/chunked_managed_vector.hh"
+#include "utils/managed_ref.hh"
+#include "test/lib/log.hh"

 #include <boost/range/algorithm/sort.hpp>
 #include <boost/range/algorithm/equal.hpp>
@@ -216,3 +218,106 @@ SEASTAR_TEST_CASE(tests_reserve_partial) {
  });
  return make_ready_future<>();
 }
+
+// Appends the managed values 1..3999 to a chunked_managed_vector under the
+// region's allocator and then calls clear_and_release(). There are no
+// explicit assertions; the test passes when this completes without a crash
+// or exception.
+SEASTAR_TEST_CASE(test_clear_and_release) {
+    region region;
+    allocating_section as;
+
+    with_allocator(region.allocator(), [&] {
+        lsa::chunked_managed_vector<managed_ref<uint64_t>> v;
+
+        for (uint64_t i = 1; i < 4000; ++i) {
+            // Every insertion runs inside the allocating section.
+            as(region, [&] {
+                v.emplace_back(make_managed<uint64_t>(i));
+            });
+        }
+
+        v.clear_and_release();
+    });
+
+    return make_ready_future<>();
+}
+
+// For each (reserve size, push count) pair: reserves, appends push_count
+// managed values seed, seed + 1, ... and checks after every append that the
+// first element is still `seed`. Afterwards it walks the vector and checks
+// that every element holds the expected value.
+SEASTAR_TEST_CASE(test_chunk_reserve) {
+    region region;
+    allocating_section as;
+
+    for (auto conf :
+            { // std::make_pair(reserve size, push count)
+                std::make_pair(0, 4000),
+                std::make_pair(100, 4000),
+                std::make_pair(200, 4000),
+                std::make_pair(1000, 4000),
+                std::make_pair(2000, 4000),
+                std::make_pair(3000, 4000),
+                std::make_pair(5000, 4000),
+                std::make_pair(500, 8000),
+                std::make_pair(1000, 8000),
+                std::make_pair(2000, 8000),
+                std::make_pair(8000, 500),
+            })
+    {
+        with_allocator(region.allocator(), [&] {
+            auto [reserve_size, push_count] = conf;
+            testlog.info("Testing reserve({}), {}x emplace_back()", reserve_size, push_count);
+            lsa::chunked_managed_vector<managed_ref<uint64_t>> v;
+            v.reserve(reserve_size);
+            // Base value for the stored sequence; it varies with rand().
+            uint64_t seed = rand();
+            for (uint64_t i = 0; i < push_count; ++i) {
+                as(region, [&] {
+                    v.emplace_back(make_managed<uint64_t>(seed + i));
+                    // The first element must survive every growth step.
+                    BOOST_REQUIRE(**v.begin() == seed);
+                });
+            }
+            // Verify the whole sequence in order.
+            auto v_it = v.begin();
+            for (uint64_t i = 0; i < push_count; ++i) {
+                BOOST_REQUIRE(**v_it++ == seed + i);
+            }
+            v.clear_and_release();
+        });
+    }
+
+    return make_ready_future<>();
+}
+
+// Tests the case of make_room() invoked with last_chunk_capacity_deficit but _size not in
+// the last reserved chunk.
+//
+// Sequence: reserve and fill 2000 elements, shrink_to_fit(), pop 1000
+// elements, reserve(8000), append 2000 more, then clear_and_release().
+// There are no explicit assertions; the test passes when nothing crashes.
+SEASTAR_TEST_CASE(test_shrinking_and_expansion_involving_chunk_boundary) {
+    region region;
+    allocating_section as;
+
+    with_allocator(region.allocator(), [&] {
+        lsa::chunked_managed_vector<managed_ref<uint64_t>> v;
+
+        // Fill two chunks
+        // NOTE(review): presumably 2000 elements span two chunks for this element type — confirm.
+        v.reserve(2000);
+        for (uint64_t i = 0; i < 2000; ++i) {
+            as(region, [&] {
+                v.emplace_back(make_managed<uint64_t>(i));
+            });
+        }
+
+        // Make the last chunk smaller than max size to trigger the last_chunk_capacity_deficit path in make_room()
+        v.shrink_to_fit();
+
+        // Leave the last chunk reserved but empty
+        for (uint64_t i = 0; i < 1000; ++i) {
+            v.pop_back();
+        }
+
+        // Try to reserve more than the currently reserved capacity and trigger last_chunk_capacity_deficit path
+        // with _size not in the last chunk. Should not sigsegv.
+        v.reserve(8000);
+
+        // Appending after the reserve exercises the newly reserved storage.
+        for (uint64_t i = 0; i < 2000; ++i) {
+            as(region, [&] {
+                v.emplace_back(make_managed<uint64_t>(i));
+            });
+        }
+
+        v.clear_and_release();
+    });
+
+    return make_ready_future<>();
+}
+
--- a/test/boost/chunked_vector_test.cc
+++ b/test/boost/chunked_vector_test.cc
@@ -191,3 +191,32 @@ BOOST_AUTO_TEST_CASE(tests_reserve_partial) {
        BOOST_REQUIRE_EQUAL(v.capacity(), orig_size);
    }
 }
+
+// Tests the case of make_room() invoked with last_chunk_capacity_deficit but _size not in
+// the last reserved chunk.
+//
+// There are no explicit assertions; the test passes when the sequence below
+// completes without a crash.
+BOOST_AUTO_TEST_CASE(test_shrinking_and_expansion_involving_chunk_boundary) {
+    using vector_type = utils::chunked_vector<std::unique_ptr<uint64_t>>;
+    vector_type v;
+
+    // Fill two chunks (1.5x max_chunk_capacity() elements in total)
+    v.reserve(vector_type::max_chunk_capacity() * 3 / 2);
+    for (uint64_t i = 0; i < vector_type::max_chunk_capacity() * 3 / 2; ++i) {
+        v.emplace_back(std::make_unique<uint64_t>(i));
+    }
+
+    // Make the last chunk smaller than max size to trigger the last_chunk_capacity_deficit path in make_room()
+    v.shrink_to_fit();
+
+    // Leave the last chunk reserved but empty
+    for (uint64_t i = 0; i < vector_type::max_chunk_capacity(); ++i) {
+        v.pop_back();
+    }
+
+    // Try to reserve more than the currently reserved capacity and trigger last_chunk_capacity_deficit path
+    // with _size not in the last chunk. Should not sigsegv.
+    v.reserve(vector_type::max_chunk_capacity() * 4);
+
+    // Appending after the reserve exercises the newly reserved storage.
+    for (uint64_t i = 0; i < vector_type::max_chunk_capacity() * 2; ++i) {
+        v.emplace_back(std::make_unique<uint64_t>(i));
+    }
+}
--- a/test/boost/commitlog_test.cc
+++ b/test/boost/commitlog_test.cc
@@ -44,7 +44,9 @@
 #include "test/lib/tmpdir.hh"
 #include "db/commitlog/commitlog.hh"
 #include "db/commitlog/commitlog_replayer.hh"
+#include "db/commitlog/commitlog_extensions.hh"
 #include "db/commitlog/rp_set.hh"
+#include "db/extensions.hh"
 #include "log.hh"
 #include "service/priority_manager.hh"
 #include "test/lib/exception_utils.hh"
@@ -947,3 +949,113 @@ SEASTAR_TEST_CASE(test_commitlog_deadlock_with_flush_threshold) {
        co_await log.clear();
    }
 }
+
+// Drives a commitlog until the test's file extension has thrown once from
+// wrap_file().
+//
+// do_file_delete: when true, the extension closes and removes the file
+//                 before throwing.
+// reuse:          stored in cfg.reuse_segments.
+//
+// The flush handler discards completed segments and arms the failure.
+// Records of max_record_size() are added until the extension reports that
+// it has thrown; the log is then shut down and cleared.
+static future<> do_test_exception_in_allocate_ex(bool do_file_delete, bool reuse = true) {
+    commitlog::config cfg;
+
+    constexpr auto max_size_mb = 1;
+
+    cfg.commitlog_segment_size_in_mb = max_size_mb;
+    cfg.commitlog_total_space_in_mb = 2 * max_size_mb * smp::count;
+    cfg.commitlog_sync_period_in_ms = 10;
+    cfg.reuse_segments = reuse;
+    cfg.allow_going_over_size_limit = false; // #9348 - now can enforce size limit always
+    cfg.use_o_dsync = true; // make sure we pre-allocate.
+
+    // not using cl_test, because we need to be able to abandon
+    // the log.
+
+    tmpdir tmp;
+    cfg.commit_log_location = tmp.path().string();
+
+    // Exception type thrown by the extension below.
+    class myfail : public std::exception {
+    public:
+        using std::exception::exception;
+    };
+
+    // File extension that throws myfail exactly once, the first time
+    // wrap_file() is called after `fail` has been set.
+    struct myext: public db::commitlog_file_extension {
+    public:
+        bool fail = false;
+        bool thrown = false;
+        bool do_file_delete;
+
+        myext(bool dd)
+            : do_file_delete(dd)
+        {}
+
+        seastar::future<seastar::file> wrap_file(const seastar::sstring& filename, seastar::file f, seastar::open_flags flags) override {
+            if (fail && !thrown) {
+                thrown = true;
+                if (do_file_delete) {
+                    co_await f.close();
+                    co_await seastar::remove_file(filename);
+                }
+                throw myfail{};
+            }
+            co_return f;
+        }
+        seastar::future<> before_delete(const seastar::sstring&) override {
+            co_return;
+        }
+    };
+
+    auto ep = std::make_unique<myext>(do_file_delete);
+    // Keep a reference: ownership of the extension moves into `myexts` below.
+    auto& mx = *ep;
+
+    db::extensions myexts;
+    myexts.add_commitlog_file_extension("hufflepuff", std::move(ep));
+
+    cfg.extensions = &myexts;
+
+    auto log = co_await commitlog::create_commitlog(cfg);
+
+    rp_set rps;
+    // uncomment for verbosity
+    // logging::logger_registry().set_logger_level("commitlog", logging::log_level::debug);
+
+    auto uuid = utils::UUID_gen::get_time_UUID();
+    auto size = log.max_record_size();
+
+    // On flush: release completed segments and arm the failure.
+    auto r = log.add_flush_handler([&](cf_id_type id, replay_position pos) {
+        log.discard_completed_segments(id, rps);
+        mx.fail = true;
+    });
+
+    try {
+        while (!mx.thrown) {
+            rp_handle h = co_await log.add_mutation(uuid, size, db::commitlog::force_sync::no, [&](db::commitlog::output& dst) {
+                dst.fill('1', size);
+            });
+            rps.put(std::move(h));
+        }
+    } catch (...) {
+        // NOTE(review): every exception escaping the loop lands here, yet the
+        // message only mentions a timeout — confirm the wording is intended.
+        BOOST_FAIL("log write timed out. maybe it is deadlocked... Will not free log. ASAN errors and leaks will follow...");
+    }
+
+    co_await log.shutdown();
+    co_await log.clear();
+}
+
+/**
+ * Test generating an exception in segment file allocation.
+ *
+ * Arguments to do_test_exception_in_allocate_ex are (do_file_delete, reuse);
+ * the "_no_recycle" variants run with reuse == false.
+ */
+SEASTAR_TEST_CASE(test_commitlog_exceptions_in_allocate_ex) {
+    co_await do_test_exception_in_allocate_ex(false, true);
+}
+
+SEASTAR_TEST_CASE(test_commitlog_exceptions_in_allocate_ex_no_recycle) {
+    co_await do_test_exception_in_allocate_ex(false, false);
+}
+
+/**
+ * Test generating an exception in segment file allocation, but also
+ * delete the file, which in turn should cause follow-up exceptions
+ * in cleanup delete, which the commitlog should handle.
+ *
+ * As above, only the "_no_recycle" variant disables segment reuse.
+ */
+SEASTAR_TEST_CASE(test_commitlog_exceptions_in_allocate_ex_deleted_file) {
+    co_await do_test_exception_in_allocate_ex(true, true);
+}
+
+SEASTAR_TEST_CASE(test_commitlog_exceptions_in_allocate_ex_deleted_file_no_recycle) {
+    co_await do_test_exception_in_allocate_ex(true, false);
+}
--- a/test/boost/database_test.cc
+++ b/test/boost/database_test.cc
@@ -784,3 +784,38 @@ SEASTAR_TEST_CASE(upgrade_sstables) {
        }).get();
    });
 }
+
+// Checks that dropping a table also removes queriers that were inserted into
+// the querier cache while the drop was already in progress. The read
+// operation held in `op` is what lets the test insert the querier before
+// the drop finishes.
+SEASTAR_TEST_CASE(database_drop_column_family_clears_querier_cache) {
+    return do_with_cql_env_thread([] (cql_test_env& e) {
+        e.execute_cql("create table ks.cf (k text, v int, primary key (k));").get();
+        auto& db = e.local_db();
+        const auto ts = db_clock::now();
+        auto& tbl = db.find_column_family("ks", "cf");
+
+        // Held until op.reset() below, which is expected to unblock the drop.
+        auto op = std::optional(tbl.read_in_progress());
+        auto s = tbl.schema();
+        auto q = query::data_querier(
+                tbl.as_mutation_source(),
+                tbl.schema(),
+                database_test(db).get_user_read_concurrency_semaphore().make_tracking_only_permit(s.get(), "test", db::no_timeout),
+                query::full_partition_range,
+                s->full_slice(),
+                default_priority_class(),
+                nullptr);
+
+        // Start the drop on all shards, but do not wait for it yet.
+        auto f = e.db().invoke_on_all([ts] (database& db) {
+            return db.drop_column_family("ks", "cf", [ts] { return make_ready_future<db_clock::time_point>(ts); });
+        });
+
+        // we add a querier to the querier cache while the drop is ongoing
+        auto& qc = db.get_querier_cache();
+        qc.insert(utils::make_random_uuid(), std::move(q), nullptr);
+        BOOST_REQUIRE_EQUAL(qc.get_stats().population, 1);
+
+        op.reset(); // this should allow the drop to finish
+        f.get();
+
+        // the drop should have cleaned up all entries belonging to that table
+        BOOST_REQUIRE_EQUAL(qc.get_stats().population, 0);
+    });
+}
--- a/test/boost/index_with_paging_test.cc
+++ b/test/boost/index_with_paging_test.cc
@@ -22,6 +22,8 @@
 #include <seastar/testing/test_case.hh>
 #include "test/lib/cql_test_env.hh"
 #include "test/lib/cql_assertions.hh"
+#include "cql3/untyped_result_set.hh"
+#include "cql3/query_processor.hh"
 #include "transport/messages/result_message.hh"

 SEASTAR_TEST_CASE(test_index_with_paging) {
@@ -56,3 +58,51 @@ SEASTAR_TEST_CASE(test_index_with_paging) {
        });
    });
 }
+
+// Inserts 67 rows, spread over three partitions, that all share the indexed
+// value v = 1 and each carry a string twice the maximum result size. It
+// then checks that an internal query through the index eventually reports
+// every row.
+SEASTAR_TEST_CASE(test_index_with_paging_with_base_short_read) {
+    return do_with_cql_env_thread([] (auto& e) {
+        e.execute_cql("CREATE TABLE tab (pk int, ck text, v int, v2 int, v3 text, PRIMARY KEY (pk, ck))").get();
+        e.execute_cql("CREATE INDEX ON tab (v)").get();
+
+        // Enough to trigger a short read on the base table during scan
+        sstring big_string(2 * query::result_memory_limiter::maximum_result_size, 'j');
+
+        const int row_count = 67;
+        for (int i = 0; i < row_count; ++i) {
+            e.execute_cql(format("INSERT INTO tab (pk, ck, v, v2, v3) VALUES ({}, 'hello{}', 1, {}, '{}')", i % 3, i, i, big_string)).get();
+        }
+
+        // The check is retried through eventually() until it passes.
+        eventually([&] {
+            uint64_t count = 0;
+            e.qp().local().query_internal("SELECT * FROM ks.tab WHERE v = 1", [&] (const cql3::untyped_result_set_row&) {
+                ++count;
+                return make_ready_future<stop_iteration>(stop_iteration::no);
+            }).get();
+            BOOST_REQUIRE_EQUAL(count, row_count);
+        });
+    });
+}
+
+// Same scenario as the clustering-key variant, but the table has no
+// clustering key, so each of the 67 rows lives in its own partition.
+SEASTAR_TEST_CASE(test_index_with_paging_with_base_short_read_no_ck) {
+    return do_with_cql_env_thread([] (auto& e) {
+        e.execute_cql("CREATE TABLE tab (pk int, v int, v2 int, v3 text, PRIMARY KEY (pk))").get();
+        e.execute_cql("CREATE INDEX ON tab (v)").get();
+
+        // Enough to trigger a short read on the base table during scan
+        sstring big_string(2 * query::result_memory_limiter::maximum_result_size, 'j');
+
+        const int row_count = 67;
+        for (int i = 0; i < row_count; ++i) {
+            e.execute_cql(format("INSERT INTO tab (pk, v, v2, v3) VALUES ({}, 1, {}, '{}')", i, i, big_string)).get();
+        }
+
+        // The check is retried through eventually() until it passes.
+        eventually([&] {
+            uint64_t count = 0;
+            e.qp().local().query_internal("SELECT * FROM ks.tab WHERE v = 1", [&] (const cql3::untyped_result_set_row&) {
+                ++count;
+                return make_ready_future<stop_iteration>(stop_iteration::no);
+            }).get();
+            BOOST_REQUIRE_EQUAL(count, row_count);
+        });
+    });
+}
--- a/test/boost/loading_cache_test.cc
+++ b/test/boost/loading_cache_test.cc
@@ -391,3 +391,87 @@ SEASTAR_TEST_CASE(test_loading_cache_reload_during_eviction) {
        BOOST_REQUIRE_EQUAL(loading_cache.size(), 1);
    });
 }
+
+// Checks that remove() and remove_if() leave nothing behind in the cache,
+// both when they race with a load that is still in flight and when a
+// pointer to the removed value is still held by the caller. The two scopes
+// below run the same scenario, once per removal API.
+SEASTAR_THREAD_TEST_CASE(test_loading_cache_remove_leaves_no_old_entries_behind) {
+    using namespace std::chrono;
+    // Reset here, although none of the loaders defined below modify it.
+    load_count = 0;
+
+    auto load_v1 = [] (auto key) { return make_ready_future<sstring>("v1"); };
+    auto load_v2 = [] (auto key) { return make_ready_future<sstring>("v2"); };
+    auto load_v3 = [] (auto key) { return make_ready_future<sstring>("v3"); };
+
+    {
+        utils::loading_cache<int, sstring> loading_cache(num_loaders, 100s, testlog);
+        auto stop_cache_reload = seastar::defer([&loading_cache] { loading_cache.stop().get(); });
+
+        //
+        // Test remove() concurrent with loading
+        //
+
+        // later() defers the load, so remove() below runs while it is pending.
+        auto f = loading_cache.get_ptr(0, [&](auto key) {
+            return later().then([&] {
+                return load_v1(key);
+            });
+        });
+
+        loading_cache.remove(0);
+
+        BOOST_REQUIRE_EQUAL(loading_cache.find(0), nullptr);
+        BOOST_REQUIRE_EQUAL(loading_cache.size(), 0);
+
+        // The caller still receives the loaded value...
+        auto ptr1 = f.get0();
+        BOOST_REQUIRE_EQUAL(*ptr1, "v1");
+
+        // ...but the cache must not have kept it.
+        BOOST_REQUIRE_EQUAL(loading_cache.find(0), nullptr);
+        BOOST_REQUIRE_EQUAL(loading_cache.size(), 0);
+
+        ptr1 = loading_cache.get_ptr(0, load_v2).get0();
+        loading_cache.remove(0);
+        BOOST_REQUIRE_EQUAL(*ptr1, "v2");
+
+        //
+        // Test that live ptr1, removed from cache, does not prevent reload of new value
+        //
+        auto ptr2 = loading_cache.get_ptr(0, load_v3).get0();
+        ptr1 = nullptr;
+        BOOST_REQUIRE_EQUAL(*ptr2, "v3");
+    }
+
+    // Test remove_if()
+    {
+        utils::loading_cache<int, sstring> loading_cache(num_loaders, 100s, testlog);
+        auto stop_cache_reload = seastar::defer([&loading_cache] { loading_cache.stop().get(); });
+
+        //
+        // Test remove_if() concurrent with loading
+        //
+        auto f = loading_cache.get_ptr(0, [&](auto key) {
+            return later().then([&] {
+                return load_v1(key);
+            });
+        });
+
+        loading_cache.remove_if([] (auto&& v) { return v == "v1"; });
+
+        BOOST_REQUIRE_EQUAL(loading_cache.find(0), nullptr);
+        BOOST_REQUIRE_EQUAL(loading_cache.size(), 0);
+
+        auto ptr1 = f.get0();
+        BOOST_REQUIRE_EQUAL(*ptr1, "v1");
+
+        BOOST_REQUIRE_EQUAL(loading_cache.find(0), nullptr);
+        BOOST_REQUIRE_EQUAL(loading_cache.size(), 0);
+
+        ptr1 = loading_cache.get_ptr(0, load_v2).get0();
+        loading_cache.remove_if([] (auto&& v) { return v == "v2"; });
+        BOOST_REQUIRE_EQUAL(*ptr1, "v2");
+
+        //
+        // Test that live ptr1, removed from cache, does not prevent reload of new value
+        //
+        auto ptr2 = loading_cache.get_ptr(0, load_v3).get0();
+        ptr1 = nullptr;
+        BOOST_REQUIRE_EQUAL(*ptr2, "v3");
+        ptr2 = nullptr;
+    }
+}
--- a/test/boost/memtable_test.cc
+++ b/test/boost/memtable_test.cc
@@ -39,6 +39,9 @@
 #include "test/lib/random_utils.hh"
 #include "test/lib/log.hh"
 #include "test/lib/reader_concurrency_semaphore.hh"
+#include "test/lib/simple_schema.hh"
+#include "test/lib/make_random_string.hh"
+#include "utils/error_injection.hh"

 static api::timestamp_type next_timestamp() {
    static thread_local api::timestamp_type next_timestamp = 1;
@@ -528,6 +531,74 @@ SEASTAR_TEST_CASE(test_exception_safety_of_single_partition_reads) {
    });
 }

+// Applies three mutations carrying overlapping range tombstones to one
+// memtable partition, opening a reader after each of the first two. It then
+// checks that a fresh reader yields monotonic positions and produces
+// m1 + m2 + m3.
+SEASTAR_THREAD_TEST_CASE(test_tombstone_merging_with_multiple_versions) {
+    tests::reader_concurrency_semaphore_wrapper semaphore;
+    simple_schema ss;
+    auto s = ss.schema();
+    auto mt = make_lw_shared<memtable>(ss.schema());
+
+    auto pk = ss.make_pkey(0);
+    auto pr = dht::partition_range::make_singular(pk);
+
+    // Created in this order; m3 below uses t0, m1 uses t1, m2 uses t2 and t3.
+    auto t0 = ss.new_tombstone();
+    auto t1 = ss.new_tombstone();
+    auto t2 = ss.new_tombstone();
+    auto t3 = ss.new_tombstone();
+
+    mutation m1(s, pk);
+    ss.delete_range(m1, *position_range_to_clustering_range(position_range(
+                position_in_partition::before_key(ss.make_ckey(0)),
+                position_in_partition::for_key(ss.make_ckey(3))), *s), t1);
+    ss.add_row(m1, ss.make_ckey(0), "v");
+    ss.add_row(m1, ss.make_ckey(1), "v");
+
+    // Fill so that rd1 stays in the partition snapshot
+    int n_rows = 1000;
+    auto v = make_random_string(512);
+    for (int i = 0; i < n_rows; ++i) {
+        ss.add_row(m1, ss.make_ckey(i), v);
+    }
+
+    mutation m2(s, pk);
+    ss.delete_range(m2, *position_range_to_clustering_range(position_range(
+            position_in_partition::before_key(ss.make_ckey(0)),
+            position_in_partition::before_key(ss.make_ckey(1))), *s), t2);
+    ss.delete_range(m2, *position_range_to_clustering_range(position_range(
+            position_in_partition::before_key(ss.make_ckey(1)),
+            position_in_partition::for_key(ss.make_ckey(3))), *s), t3);
+
+    mutation m3(s, pk);
+    ss.delete_range(m3, *position_range_to_clustering_range(position_range(
+            position_in_partition::before_key(ss.make_ckey(0)),
+            position_in_partition::for_key(ss.make_ckey(4))), *s), t0);
+
+    mt->apply(m1);
+
+    auto rd1 = mt->make_flat_reader(s, semaphore.make_permit(), pr, s->full_slice(), default_priority_class(),
+                                    nullptr, streamed_mutation::forwarding::no, mutation_reader::forwarding::no);
+    auto close_rd1 = defer([&] { rd1.close().get(); });
+
+    rd1.fill_buffer().get();
+    BOOST_REQUIRE(!rd1.is_end_of_stream()); // rd1 must keep the m1 version alive
+
+    mt->apply(m2);
+
+    auto rd2 = mt->make_flat_reader(s, semaphore.make_permit(), pr, s->full_slice(), default_priority_class(),
+                                    nullptr, streamed_mutation::forwarding::no, mutation_reader::forwarding::no);
+    auto close_r2 = defer([&] { rd2.close().get(); });
+
+    rd2.fill_buffer().get();
+    // rd2 must keep its version alive
+    // NOTE(review): the original comment named the m1 version here; presumably the one holding m2 is meant — confirm.
+    BOOST_REQUIRE(!rd2.is_end_of_stream());
+
+    mt->apply(m3);
+
+    assert_that(mt->make_flat_reader(s, semaphore.make_permit(), pr))
+        .has_monotonic_positions();
+
+    assert_that(mt->make_flat_reader(s, semaphore.make_permit(), pr))
+        .produces(m1 + m2 + m3);
+}
+
 SEASTAR_TEST_CASE(test_hash_is_cached) {
    return seastar::async([] {
        auto s = schema_builder("ks", "cf")
--- a/test/boost/mutation_test.cc
+++ b/test/boost/mutation_test.cc
@@ -702,6 +702,7 @@ SEASTAR_TEST_CASE(test_cell_ordering) {
    };

    auto assert_equal = [] (atomic_cell_view c1, atomic_cell_view c2) {
+        testlog.trace("Expected {} == {}", c1, c2);
        BOOST_REQUIRE(compare_atomic_cell_for_merge(c1, c2) == 0);
        BOOST_REQUIRE(compare_atomic_cell_for_merge(c2, c1) == 0);
    };
@@ -723,9 +724,11 @@ SEASTAR_TEST_CASE(test_cell_ordering) {
        atomic_cell::make_live(*bytes_type, 1, bytes(), expiry_2, ttl_2));

    // Origin doesn't compare ttl (is it wise?)
-    assert_equal(
-        atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_1),
-        atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_2));
+    // But we do. See https://github.com/scylladb/scylla/issues/10156
+    // and https://github.com/scylladb/scylla/issues/10173
+    assert_order(
+        atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_2),
+        atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_1));

    assert_order(
        atomic_cell::make_live(*bytes_type, 0, bytes("value1")),
--- a/test/boost/mvcc_test.cc
+++ b/test/boost/mvcc_test.cc
@@ -560,7 +560,7 @@ SEASTAR_TEST_CASE(test_apply_to_incomplete_respects_continuity) {
 static mutation_partition read_using_cursor(partition_snapshot& snap) {
    tests::reader_concurrency_semaphore_wrapper semaphore;
    partition_snapshot_row_cursor cur(*snap.schema(), snap);
-    cur.maybe_refresh();
+    cur.advance_to(position_in_partition::before_all_clustered_rows());
    auto mp = read_partition_from(*snap.schema(), cur);
    for (auto&& rt : snap.range_tombstones()) {
        mp.apply_delete(*snap.schema(), rt);
--- a/test/boost/range_tombstone_list_test.cc
+++ b/test/boost/range_tombstone_list_test.cc
@@ -210,6 +210,35 @@ BOOST_AUTO_TEST_CASE(test_overlapping_addition) {
    BOOST_REQUIRE(it == l.end());
 }

+BOOST_AUTO_TEST_CASE(test_adjacent_empty_range_tombstone) {
+    range_tombstone_list l(*s);
+
+    l.apply(*s, rtie(1, 1, 2));
+    l.apply(*s, rt(1, 2, 3));
+    l.apply(*s, rtei(2, 2, 2));
+    l.apply(*s, rtei(2, 4, 3));
+
+    auto it = l.begin();
+    assert_rt(rt(1, 4, 3), *it++);
+    BOOST_REQUIRE(it == l.end());
+}
+
+BOOST_AUTO_TEST_CASE(test_empty_range_tombstones_are_dropped) {
+    range_tombstone_list l(*s);
+
+    l.apply(*s, rtei(0, 0, 1));
+    l.apply(*s, rtie(0, 0, 1));
+    l.apply(*s, rt(1, 2, 1));
+    l.apply(*s, rtei(4, 4, 1));
+    l.apply(*s, rtie(5, 5, 1));
+    l.apply(*s, rt(7, 8, 1));
+
+    auto it = l.begin();
+    assert_rt(rt(1, 2, 1), *it++);
+    assert_rt(rt(7, 8, 1), *it++);
+    BOOST_REQUIRE(it == l.end());
+}
+
 BOOST_AUTO_TEST_CASE(test_simple_overlap) {
    range_tombstone_list l1(*s);

@@ -473,6 +502,23 @@ static std::vector<range_tombstone> make_random() {
        rts.emplace_back(std::move(start_b), std::move(end_b), tombstone(dist(gen), gc_now));
    }

+    int32_t size_empty = dist(gen) / 2;
+    for (int32_t i = 0; i < size_empty; ++i) {
+        clustering_key_prefix key = make_random_ckey();
+        bool start_incl = dist(gen) > 25;
+        if (start_incl) {
+            rts.emplace_back(
+                    position_in_partition::before_key(key),
+                    position_in_partition::before_key(key),
+                    tombstone(dist(gen), gc_now));
+        } else {
+            rts.emplace_back(
+                    position_in_partition::after_key(key),
+                    position_in_partition::after_key(key),
+                    tombstone(dist(gen), gc_now));
+        }
+    }
+
    return rts;
 }

--- a/test/boost/row_cache_test.cc
+++ b/test/boost/row_cache_test.cc
@@ -1242,9 +1242,13 @@ SEASTAR_TEST_CASE(test_update_failure) {
 class throttle {
    unsigned _block_counter = 0;
    promise<> _p; // valid when _block_counter != 0, resolves when goes down to 0
+    std::optional<promise<>> _entered;
+    bool _one_shot;
 public:
+    // When one_shot is true, only the first enter() after block() blocks; later ones pass through.
+    throttle(bool one_shot = false) : _one_shot(one_shot) {}
    future<> enter() {
-        if (_block_counter) {
+        if (_block_counter && (!_one_shot || _entered)) {
            promise<> p1;
            promise<> p2;

@@ -1256,16 +1260,21 @@ public:
                p3.set_value();
            });
            _p = std::move(p2);
-
+            if (_entered) {
+                _entered->set_value();
+                _entered.reset();
+            }
            return f1;
        } else {
            return make_ready_future<>();
        }
    }

-    void block() {
+    future<> block() {
        ++_block_counter;
        _p = promise<>();
+        _entered = promise<>();
+        return _entered->get_future();
    }

    void unblock() {
@@ -1410,7 +1419,7 @@ SEASTAR_TEST_CASE(test_cache_population_and_update_race) {
            mt2->apply(m);
        }

-        thr.block();
+        auto f = thr.block();

        auto m0_range = dht::partition_range::make_singular(ring[0].ring_position());
        auto rd1 = cache.make_reader(s, semaphore.make_permit(), m0_range);
@@ -1421,6 +1430,7 @@ SEASTAR_TEST_CASE(test_cache_population_and_update_race) {
        rd2.set_max_buffer_size(1);
        auto rd2_fill_buffer = rd2.fill_buffer();

+        f.get();
        sleep(10ms).get();

        // This update should miss on all partitions
@@ -1548,12 +1558,13 @@ SEASTAR_TEST_CASE(test_cache_population_and_clear_race) {
            mt2->apply(m);
        }

-        thr.block();
+        auto f = thr.block();

        auto rd1 = cache.make_reader(s, semaphore.make_permit());
        rd1.set_max_buffer_size(1);
        auto rd1_fill_buffer = rd1.fill_buffer();

+        f.get();
        sleep(10ms).get();

        // This update should miss on all partitions
@@ -3777,3 +3788,81 @@ SEASTAR_TEST_CASE(test_scans_erase_dummies) {
        BOOST_REQUIRE_EQUAL(tracker.get_stats().rows, 2);
    });
 }
+
+SEASTAR_TEST_CASE(test_eviction_of_upper_bound_of_population_range) {
+    return seastar::async([] {
+        simple_schema s;
+        tests::reader_concurrency_semaphore_wrapper semaphore;
+        auto cache_mt = make_lw_shared<memtable>(s.schema());
+
+        auto pkey = s.make_pkey("pk");
+
+        mutation m1(s.schema(), pkey);
+        s.add_row(m1, s.make_ckey(1), "v1");
+        s.add_row(m1, s.make_ckey(2), "v2");
+        cache_mt->apply(m1);
+
+        cache_tracker tracker;
+        throttle thr(true);
+        auto cache_source = make_decorated_snapshot_source(snapshot_source([&] { return cache_mt->as_data_source(); }),
+                                                           [&] (mutation_source src) {
+            return throttled_mutation_source(thr, std::move(src));
+        });
+        row_cache cache(s.schema(), cache_source, tracker);
+
+        auto pr = dht::partition_range::make_singular(pkey);
+
+        auto read = [&] (int start, int end) {
+            auto slice = partition_slice_builder(*s.schema())
+                    .with_range(query::clustering_range::make(s.make_ckey(start), s.make_ckey(end)))
+                    .build();
+            auto rd = cache.make_reader(s.schema(), semaphore.make_permit(), pr, slice);
+            auto close_rd = deferred_close(rd);
+            auto m_cache = read_mutation_from_flat_mutation_reader(rd).get0();
+            close_rd.close_now();
+            rd = cache_mt->make_flat_reader(s.schema(), semaphore.make_permit(), pr, slice);
+            auto close_rd2 = deferred_close(rd);
+            auto m_mt = read_mutation_from_flat_mutation_reader(rd).get0();
+            BOOST_REQUIRE(m_mt);
+            assert_that(m_cache).has_mutation().is_equal_to(*m_mt);
+        };
+
+        // populate [2]
+        {
+            auto slice = partition_slice_builder(*s.schema())
+                    .with_range(query::clustering_range::make_singular(s.make_ckey(2)))
+                    .build();
+            assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr, slice))
+                    .has_monotonic_positions();
+        }
+
+        auto arrived = thr.block();
+
+        // Read [0, 2]
+        auto f = seastar::async([&] {
+            read(0, 2);
+        });
+
+        arrived.get();
+
+        // populate (2, 3]
+        {
+            auto slice = partition_slice_builder(*s.schema())
+                    .with_range(query::clustering_range::make(query::clustering_range::bound(s.make_ckey(2), false),
+                                                              query::clustering_range::bound(s.make_ckey(3), true)))
+                    .build();
+            assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr, slice))
+                    .has_monotonic_positions();
+        }
+
+        testlog.trace("Evicting");
+        evict_one_row(tracker); // Evicts before(0)
+        evict_one_row(tracker); // Evicts ck(2)
+        testlog.trace("Unblocking");
+
+        thr.unblock();
+        f.get();
+
+        read(0, 3);
+    });
+}
--- a/test/boost/sstable_partition_index_cache_test.cc
+++ b/test/boost/sstable_partition_index_cache_test.cc
@@ -37,20 +37,30 @@ static void add_entry(logalloc::region& r,
 {
    logalloc::allocating_section as;
    as(r, [&] {
-        sstables::key sst_key = sstables::key::from_partition_key(s, key);
-        page._entries.push_back(make_managed<index_entry>(
-                managed_bytes(sst_key.get_bytes()),
-                position,
-                managed_ref<promoted_index>()));
+        with_allocator(r.allocator(), [&] {
+            sstables::key sst_key = sstables::key::from_partition_key(s, key);
+            page._entries.push_back(make_managed<index_entry>(
+                    managed_bytes(sst_key.get_bytes()),
+                    position,
+                    managed_ref<promoted_index>()));
+        });
    });
 }

 static partition_index_page make_page0(logalloc::region& r, simple_schema& s) {
    partition_index_page page;
+    auto destroy_page = defer([&] {
+        with_allocator(r.allocator(), [&] {
+           auto p = std::move(page);
+        });
+    });
+
    add_entry(r, *s.schema(), page, s.make_pkey(0).key(), 0);
    add_entry(r, *s.schema(), page, s.make_pkey(1).key(), 1);
    add_entry(r, *s.schema(), page, s.make_pkey(2).key(), 2);
    add_entry(r, *s.schema(), page, s.make_pkey(3).key(), 3);
+
+    destroy_page.cancel();
    return page;
 }

@@ -141,6 +151,47 @@ SEASTAR_THREAD_TEST_CASE(test_caching) {
    }
 }

+template <typename T>
+static future<> ignore_result(future<T>&& f) {
+    return f.then_wrapped([] (auto&& f) {
+        try {
+            f.get();
+        } catch (...) {
+            // expected, silence warnings about ignored failed futures
+        }
+    });
+}
+
+SEASTAR_THREAD_TEST_CASE(test_exception_while_loading) {
+    ::lru lru;
+    simple_schema s;
+    logalloc::region r;
+    partition_index_cache cache(lru, r);
+
+    auto clear_lru = defer([&] {
+        with_allocator(r.allocator(), [&] {
+            lru.evict_all();
+        });
+    });
+
+    auto page0_loader = [&] (partition_index_cache::key_type k) {
+        return later().then([&] {
+            return make_page0(r, s);
+        });
+    };
+
+    memory::with_allocation_failures([&] {
+        cache.evict_gently().get();
+        auto f0 = ignore_result(cache.get_or_load(0, page0_loader));
+        auto f1 = ignore_result(cache.get_or_load(0, page0_loader));
+        f0.get();
+        f1.get();
+    });
+
+    auto ptr = cache.get_or_load(0, page0_loader).get0();
+    has_page0(ptr);
+}
+
 SEASTAR_THREAD_TEST_CASE(test_auto_clear) {
    ::lru lru;
    simple_schema s;
--- a/test/cql-pytest/test_cdc.py
+++ b/test/cql-pytest/test_cdc.py
@@ -19,6 +19,7 @@ from cassandra.cluster import ConsistencyLevel
 from cassandra.query import SimpleStatement

 from util import new_test_table
+from nodetool import flush

 def test_cdc_log_entries_use_cdc_streams(scylla_only, cql, test_keyspace):
    '''Test that the stream IDs chosen for CDC log entries come from the CDC generation
@@ -44,3 +45,16 @@ def test_cdc_log_entries_use_cdc_streams(scylla_only, cql, test_keyspace):

    assert(log_stream_ids.issubset(stream_ids))

+
+# Test for #10473 - reading the CDC log (from sstable) after dropping
+# a column in the base table.
+def test_cdc_alter_table_drop_column(scylla_only, cql, test_keyspace):
+    schema = "pk int primary key, v int"
+    extra = " with cdc = {'enabled': true}"
+    with new_test_table(cql, test_keyspace, schema, extra) as table:
+        cql.execute(f"insert into {table} (pk, v) values (0, 0)")
+        cql.execute(f"insert into {table} (pk, v) values (1, null)")
+        flush(cql, table)
+        flush(cql, table + "_scylla_cdc_log")
+        cql.execute(f"alter table {table} drop v")
+        cql.execute(f"select * from {table}_scylla_cdc_log")
--- a/Show More
+++ b/Show More