Compare commits


77 Commits

Author SHA1 Message Date
Beni Peled
eb372d7f03 release: prepare for 4.6.2 2022-04-05 16:59:53 +03:00
Takuya ASADA
e232711e7e docker: run scylla as root
Previous versions of the Docker image ran scylla as root, but cb19048
accidentally changed it to the scylla user.
To keep compatibility we need to revert this to root.

Fixes #10261

Closes #10325

(cherry picked from commit f95a531407)
2022-04-05 12:46:12 +03:00
Takuya ASADA
0a440b6d4a docker: revert scylla-server.conf service name change
We changed the supervisor service name in cb19048, but this breaks
compatibility with scylla-operator.
To fix the issue we need to revert the service name to the previous one.

Fixes #10269

Closes #10323

(cherry picked from commit 41edc045d9)
2022-04-05 12:42:36 +03:00
Piotr Sarna
00bb1e8145 cql3: fix qualifying restrictions with IN for indexing
When a query contains an IN restriction on its partition key,
it's currently not eligible for indexing. It was however
erroneously qualified as such, which led to fetching incorrect
results. This commit fixes the issue by not allowing such queries
to undergo indexing, and comes with a regression test.

Fixes #10300

Closes #10302

(cherry picked from commit c0fd53a9d7)
2022-04-03 11:21:43 +03:00
Avi Kivity
e30dbee2db Update seastar submodule (pidof command not installed)
* seastar 50e1549b2c...34e58f9995 (1):
  > seastar-cpu-map.sh: switch from pidof to pgrep
Fixes #10238.
2022-03-29 12:40:17 +03:00
Beni Peled
2309d6b51e release: prepare for 4.6.1 2022-03-28 10:57:31 +03:00
Benny Halevy
b77ca07709 atomic_cell: compare_atomic_cell_for_merge: compare ttl if expiry is equal
Following up on a57c087c89,
compare_atomic_cell_for_merge should compare the ttl value in
reverse order: when comparing two cells that are identical
in all attributes but their ttl, we want to keep the cell with the
smaller ttl value rather than the larger one, since it was written
at a later (wall-clock) time, and so would remain longer after it
expires, until purged after gc_grace seconds.

Fixes #10173

Test: mutation_test.test_cell_ordering, unit(dev)

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20220302154328.2400717-1-bhalevy@scylladb.com>
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20220306091913.106508-1-bhalevy@scylladb.com>
(cherry picked from commit a085ef74ff)
2022-03-24 18:08:07 +02:00
Benny Halevy
bb0a38f889 atomic_cell: compare_atomic_cell_for_merge: compare ttl if expiry is equal
Unlike atomic_cell_or_collection::equals, compare_atomic_cell_for_merge
currently returns std::strong_ordering::equal if two cells are equal in
every way except their ttls.

The problem with that is that the cells' hashes are different and this
will cause repair to keep trying to repair discrepancies caused by the
ttl being different.

This may be triggered e.g. by the spark migrator, which computes the ttl
from the expiry time by subtracting the current time from the expiry time
to produce a respective ttl.

If the cell is migrated multiple times at different times, it will generate
cells that have the same expiry (by design) but different ttl values.

Fixes #10156

Test: mutation_test.test_cell_ordering, unit(dev)

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20220302154328.2400717-1-bhalevy@scylladb.com>
(cherry picked from commit a57c087c89)
2022-03-24 18:08:07 +02:00
Benny Halevy
c48fd03463 atomic_cell: compare_atomic_cell_for_merge: fixup indentation
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20220302113833.2308533-2-bhalevy@scylladb.com>
(cherry picked from commit d43da5d6dc)

Ref #10156
2022-03-24 18:07:54 +02:00
Benny Halevy
eb78e6d4b8 atomic_cell: compare_atomic_cell_for_merge: simplify expiry/deletion_time comparison
No need to first check that the cells' expiry is different
or that deletion_time is different before comparing them
with `<=>`.

If they are the same, the function returns std::strong_ordering::equal
anyhow, which is the same as `<=>` comparing identical values.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20220302113833.2308533-1-bhalevy@scylladb.com>
(cherry picked from commit be865a29b8)

Ref #10156
2022-03-24 18:07:32 +02:00
Avi Kivity
4b1b0a55c0 replica, atomic_cell: move atomic_cell merge code from replica module to atomic_cell.cc
compare_atomic_cell_for_merge() was placed in database.cc, before
atomic_cell.cc existed. Move it to its correct place.

Closes #9889

(cherry picked from commit 6c53717a39)
2022-03-24 18:07:11 +02:00
Benny Halevy
172a8628d5 main: shutdown: do not abort on certain system errors
Currently any unhandled error during deferred shutdown
is rethrown in a noexcept context (in ~deferred_action),
generating a core dump.

The core dump is not helpful if the cause of the
error is "environmental", i.e. in the system, rather
than in scylla itself.

This change detects several such errors and calls
_Exit(255) to exit the process early, without leaving
a coredump behind.  Otherwise, call abort() explicitly,
rather than letting terminate() be called implicitly
by the destructor exception handling code.

Fixes #9573

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20220227101054.1294368-1-bhalevy@scylladb.com>
(cherry picked from commit 132c9d5933)
2022-03-24 14:49:24 +02:00
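A minimal sketch of the idea, with an assumed classification (treating any std::system_error as "environmental"), not Scylla's exact list of detected errors:

```cpp
#include <cstdlib>
#include <stdexcept>
#include <system_error>

// Assumed classification for illustration: system errors (e.g. ENOSPC, EIO)
// come from the environment rather than from scylla itself.
bool is_environmental(const std::exception& e) {
    return dynamic_cast<const std::system_error*>(&e) != nullptr;
}

[[noreturn]] void on_shutdown_error(const std::exception& e) {
    if (is_environmental(e)) {
        std::_Exit(255);  // exit early, no core dump: the system is at fault
    }
    std::abort();         // explicit abort instead of an implicit terminate()
}
```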
Nadav Har'El
5688b125e6 Seastar: backport Seastar fix for missing string escape in JSON output
Backported Seastar fix:
  > Merge 'json/formatter: Escape strings' from Juliusz Stasiewicz

Fixes #9061

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2022-03-23 21:27:13 +02:00
Piotr Sarna
6da4acb41e expression: fix get_value for mismatched column definitions
As observed in #10026, after schema changes it somehow happened
that a column definition that does not match any of the base table
columns was passed to expression verification code.
The function that looks up the index of a column happens to return
-1 when it doesn't find anything, so using this returned index
without checking if it's nonnegative results in accessing invalid
vector data, and a segfault or silent memory corruption.
Therefore, an explicit check is added to see if the column was actually
found. This serves two purposes:
 - avoiding segfaults/memory corruption
 - making it easier to investigate the root cause of #10026

Closes #10039

(cherry picked from commit 7b364fec9849e9a342af1c240e3a7185bf5401ef)
2022-03-21 10:46:34 +01:00
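The guarded lookup can be sketched as follows, with hypothetical `find_column`/`get_value` helpers standing in for the real expression code:

```cpp
#include <optional>
#include <string>
#include <vector>

// A lookup that returns -1 on a miss must be checked before indexing;
// otherwise columns[size_t(-1)] reads invalid vector data.
int find_column(const std::vector<std::string>& columns, const std::string& name) {
    for (size_t i = 0; i < columns.size(); ++i) {
        if (columns[i] == name) {
            return static_cast<int>(i);
        }
    }
    return -1;
}

std::optional<std::string> get_value(const std::vector<std::string>& columns,
                                     const std::string& name) {
    int idx = find_column(columns, name);
    if (idx < 0) {
        return std::nullopt;  // column not found: report it, don't index with -1
    }
    return columns[idx];
}
```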
Botond Dénes
f09cc9a01d Merge 'service: storage_service: announce new CDC generation immediately with RBNO' from Kamil Braun
When a new CDC generation is created (during bootstrap or otherwise), it
is assigned a timestamp. The timestamp must be propagated as soon as
possible, so all live nodes can learn about the generation before their
clocks reach the generation's timestamp. The propagation mechanism for
generation timestamps is gossip.

When bootstrap RBNO was enabled this was not the case: the generation
timestamp was inserted into gossiper state too late, after the repair
phase finished. Fix this.

Also remove an obsolete comment.

Fixes https://github.com/scylladb/scylla/issues/10149.

Closes #10154

* github.com:scylladb/scylla:
  service: storage_service: announce new CDC generation immediately with RBNO
  service: storage_service: fix indentation

(cherry picked from commit f1b2ff1722)
2022-03-16 12:27:24 +01:00
Raphael S. Carvalho
cd2e33ede4 compaction_manager: Abort reshape for tables waiting for a chance to run
Tables waiting for a chance to run reshape wouldn't trigger the stop
exception, as the exception was only being triggered for ongoing
compactions. Given that the stop reshape API must abort all ongoing
tasks and all pending ones, let's change run_custom_job() to
trigger the exception if it finds that the pending task was
asked to stop.

Tests:
dtest: compaction_additional_test.py::TestCompactionAdditional::test_stop_reshape_with_multiple_keyspaces
unit: dev

Fixes #9836.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20211223002157.215571-1-raphaelsc@scylladb.com>
(cherry picked from commit 07fba4ab5d)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20220311183053.46625-1-raphaelsc@scylladb.com>
2022-03-15 16:58:47 +02:00
Benny Halevy
32d0698d78 compaction_manager: rewrite_sstables: do not acquire table write lock
Since regular compaction may run in parallel, no per-table lock
is required.

We still acquire a read lock in this patch, for backporting
purposes, in case the branch doesn't contain
6737c88045.
But it can be removed entirely in master in a follow-up patch.

This should solve some of the slowness in cleanup compaction (and
likely in upgrade sstables) seen in #10060, and
possibly #10166.

Fixes #10175

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #10177

(cherry picked from commit 11ea2ffc3c)
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20220314151416.2496374-1-bhalevy@scylladb.com>
2022-03-14 18:15:49 +02:00
Piotr Jastrzebski
93cf43ae4b cdc: Handle compact storage correctly in preimage
Base tables that use compact storage may have a special artificial
column that has an empty type.

c010cefc4d fixed the main CDC path to
handle such columns correctly and to not include them in the CDC Log
schema.

This patch makes sure that generation of preimage ignores such empty
columns as well.

Fixes #9876
Closes #9910

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
(cherry picked from commit 09d4438a0d)
2022-03-10 14:25:02 +02:00
Nadav Har'El
2f2d22a864 cql: INSERT JSON should refuse empty-string partition key
Add the missing partition-key validation in INSERT JSON statements.

Scylla, following the lead of Cassandra, forbids an empty-string partition
key (please note that this is not the same as a null partition key, and
that null clustering keys *are* allowed).

Trying to INSERT, UPDATE or DELETE a partition with an empty string as
the partition key fails with a "Key may not be empty". However, we had a
loophole - you could insert such empty-string partition keys using an
"INSERT ... JSON" statement.

The problem was that the partition key validation was done in one place -
`modification_statement::build_partition_keys()`. The INSERT, UPDATE and
DELETE statements all inherited this same method and got the correct
validation. But the INSERT JSON statement - insert_prepared_json_statement
overrode the build_partition_keys() method and this override forgot to call
the validation function. So in this patch we add the missing validation.

Note that the validation function checks for more than just empty strings -
there is also a length limit for partition keys.

This patch also adds a cql-pytest reproducer for this bug. Before this
patch, the test passed on Cassandra but failed on Scylla.

Reported by @FortTell
Fixes #9853.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20220116085216.21774-1-nyh@scylladb.com>
(cherry picked from commit 8fd5041092)
2022-03-02 22:00:15 +02:00
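A minimal sketch of the check that `build_partition_keys()` performs and the JSON override skipped; the 64 KiB length bound here is an assumed value for illustration only:

```cpp
#include <cstddef>
#include <string>

// Sketch of the shared partition-key validation: empty keys are rejected,
// and so are keys above a length limit (the limit used here is assumed).
bool partition_key_valid(const std::string& key, std::size_t max_len = 65535) {
    return !key.empty() && key.size() <= max_len;
}
```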
Avi Kivity
5f92f54f06 Merge 'utils: cached_file: Fix alloc-dealloc mismatch during eviction' from Tomasz Grabiec
cached_page::on_evicted() is invoked in the LSA allocator context, set in the
reclaimer callback installed by the cache_tracker. However,
cached_pages are allocated in the standard allocator context (note:
page content is allocated inside LSA via lsa_buffer). The LSA region
will happily deallocate these, thinking that these are large
objects which were delegated to the standard allocator. But the
_non_lsa_memory_in_use metric will underflow. When it underflows
enough, shard_segment_pool.total_memory() will become 0 and memory
reclamation will stop doing anything, leading to apparent OOM.

The fix is to switch to the standard allocator context inside
cached_page::on_evicted(). evict_range() was also given the same
treatment as a precaution; it is currently only invoked in the
standard allocator context.

The series also adds two safety checks to LSA to catch such problems earlier.

Fixes #10056

\cc @slivne @bhalevy

Closes #10130

* github.com:scylladb/scylla:
  lsa: Abort when trying to free a standard allocator object not allocated through the region
  lsa: Abort when _non_lsa_memory_in_use goes negative
  tests: utils: cached_file: Validate occupancy after eviction
  test: sstable_partition_index_cache_test: Fix alloc-dealloc mismatch
  utils: cached_file: Fix alloc-dealloc mismatch during eviction

(cherry picked from commit ff2cd72766)
2022-02-26 11:28:53 +02:00
Benny Halevy
395f2459b4 cql3: result_set: remove std::ref from comparator&
Applying std::ref on `RowComparator& cmp` hits the
following compilation error on Fedora 34 with
libstdc++-devel-11.2.1-9.fc34.x86_64

```
FAILED: build/dev/cql3/statements/select_statement.o
clang++ -MD -MT build/dev/cql3/statements/select_statement.o -MF build/dev/cql3/statements/select_statement.o.d -I/home/bhalevy/dev/scylla/seastar/include -I/home/bhalevy/dev/scylla/build/dev/seastar/gen/include -std=gnu++20 -U_FORTIFY_SOURCE -DSEASTAR_SSTRING -Werror=unused-result -fstack-clash-protection -DSEASTAR_API_LEVEL=6 -DSEASTAR_ENABLE_ALLOC_FAILURE_INJECTION -DSEASTAR_SCHEDULING_GROUPS_COUNT=16 -DSEASTAR_TYPE_ERASE_MORE -DFMT_LOCALE -DFMT_SHARED -I/usr/include/p11-kit-1  -DDEVEL -DSEASTAR_ENABLE_ALLOC_FAILURE_INJECTION -DSCYLLA_ENABLE_ERROR_INJECTION -O2 -DSCYLLA_ENABLE_WASMTIME -iquote. -iquote build/dev/gen --std=gnu++20  -ffile-prefix-map=/home/bhalevy/dev/scylla=.  -march=westmere -DBOOST_TEST_DYN_LINK   -Iabseil -fvisibility=hidden  -Wall -Werror -Wno-mismatched-tags -Wno-tautological-compare -Wno-parentheses-equality -Wno-c++11-narrowing -Wno-sometimes-uninitialized -Wno-return-stack-address -Wno-missing-braces -Wno-unused-lambda-capture -Wno-overflow -Wno-noexcept-type -Wno-error=cpp -Wno-ignored-attributes -Wno-overloaded-virtual -Wno-unused-command-line-argument -Wno-defaulted-function-deleted -Wno-redeclared-class-member -Wno-unsupported-friend -Wno-unused-variable -Wno-delete-non-abstract-non-virtual-dtor -Wno-braced-scalar-init -Wno-implicit-int-float-conversion -Wno-delete-abstract-non-virtual-dtor -Wno-uninitialized-const-reference -Wno-psabi -Wno-narrowing -Wno-array-bounds -Wno-nonnull -Wno-error=deprecated-declarations -DXXH_PRIVATE_API -DSEASTAR_TESTING_MAIN -DHAVE_LZ4_COMPRESS_DEFAULT  -c -o build/dev/cql3/statements/select_statement.o cql3/statements/select_statement.cc
In file included from cql3/statements/select_statement.cc:14:
In file included from ./cql3/statements/select_statement.hh:16:
In file included from ./cql3/statements/raw/select_statement.hh:16:
In file included from ./cql3/statements/raw/cf_statement.hh:16:
In file included from ./cql3/cf_name.hh:16:
In file included from ./cql3/keyspace_element_name.hh:16:
In file included from /home/bhalevy/dev/scylla/seastar/include/seastar/core/sstring.hh:25:
In file included from /usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/algorithm:74:
In file included from /usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/pstl/glue_algorithm_defs.h:13:
In file included from /usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/functional:58:
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/bits/refwrap.h:319:40: error: exception specification of 'function<__gnu_cxx::__ops::_Iter_comp_iter<std::reference_wrapper<const std::function<bool (const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &, const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &)>>>, void>' uses itself
                = decltype(reference_wrapper::_S_fun(std::declval<_Up>()))>
                                                     ^
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/bits/refwrap.h:319:40: note: in instantiation of exception specification for 'function<__gnu_cxx::__ops::_Iter_comp_iter<std::reference_wrapper<const std::function<bool (const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &, const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &)>>>, void>' requested here
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/bits/refwrap.h:321:2: note: in instantiation of default argument for 'reference_wrapper<__gnu_cxx::__ops::_Iter_comp_iter<std::reference_wrapper<const std::function<bool (const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &, const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &)>>>, void>' required here
        reference_wrapper(_Up&& __uref)
        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/type_traits:1017:57: note: while substituting deduced template arguments into function template 'reference_wrapper' [with _Up = __gnu_cxx::__ops::_Iter_comp_iter<std::reference_wrapper<const std::function<bool (const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &, const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &)>>>, $1 = (no value), $2 = (no value)]
      = __bool_constant<__is_nothrow_constructible(_Tp, _Args...)>;
                                                        ^
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/type_traits:1023:14: note: in instantiation of template type alias '__is_nothrow_constructible_impl' requested here
    : public __is_nothrow_constructible_impl<_Tp, _Args...>::type
             ^
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/type_traits:153:14: note: in instantiation of template class 'std::is_nothrow_constructible<__gnu_cxx::__ops::_Iter_comp_iter<std::reference_wrapper<const std::function<bool (const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &, const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &)>>>, __gnu_cxx::__ops::_Iter_comp_iter<std::reference_wrapper<const std::function<bool (const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &, const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &)>>>>' requested here
    : public conditional<_B1::value, _B2, _B1>::type
             ^
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/bits/std_function.h:298:11: note: (skipping 8 contexts in backtrace; use -ftemplate-backtrace-limit=0 to see all)
          return __and_<typename _Base::_Local_storage,
                 ^
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/bits/stl_algo.h:1933:13: note: in instantiation of function template specialization 'std::__partial_sort<utils::chunked_vector<std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>>, 131072>::iterator_type<std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>>>, __gnu_cxx::__ops::_Iter_comp_iter<std::reference_wrapper<const std::function<bool (const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &, const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &)>>>>' requested here
              std::__partial_sort(__first, __last, __last, __comp);
                   ^
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/bits/stl_algo.h:1954:9: note: in instantiation of function template specialization 'std::__introsort_loop<utils::chunked_vector<std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>>, 131072>::iterator_type<std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>>>, long, __gnu_cxx::__ops::_Iter_comp_iter<std::reference_wrapper<const std::function<bool (const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &, const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &)>>>>' requested here
          std::__introsort_loop(__first, __last,
               ^
/usr/lib/gcc/x86_64-redhat-linux/11/../../../../include/c++/11/bits/stl_algo.h:4875:12: note: in instantiation of function template specialization 'std::__sort<utils::chunked_vector<std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>>, 131072>::iterator_type<std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>>>, __gnu_cxx::__ops::_Iter_comp_iter<std::reference_wrapper<const std::function<bool (const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &, const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &)>>>>' requested here
      std::__sort(__first, __last, __gnu_cxx::__ops::__iter_comp_iter(__comp));
           ^
./cql3/result_set.hh:168:14: note: in instantiation of function template specialization 'std::sort<utils::chunked_vector<std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>>, 131072>::iterator_type<std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>>>, std::reference_wrapper<const std::function<bool (const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &, const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &)>>>' requested here
        std::sort(_rows.begin(), _rows.end(), std::ref(cmp));
             ^
cql3/statements/select_statement.cc:773:21: note: in instantiation of function template specialization 'cql3::result_set::sort<std::function<bool (const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &, const std::vector<std::optional<seastar::basic_sstring<signed char, unsigned int, 31, false>>> &)>>' requested here
                rs->sort(_ordering_comparator);
                    ^
1 error generated.
ninja: build stopped: subcommand failed.
```

Fixes #10079.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20220215071955.316895-3-bhalevy@scylladb.com>
(cherry picked from commit 3e20fee070)

[avi: backport for developer quality-of-life rather than as a bug fix]
2022-02-16 10:08:24 +02:00
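A reduced sketch of the workaround (a toy `sort_rows`, not the real result_set code): pass the std::function comparator directly instead of wrapping it in std::ref, which is what triggered the libstdc++ 11 instantiation failure above.

```cpp
#include <algorithm>
#include <functional>
#include <vector>

// Pass the comparator by value/copy; wrapping it in std::ref is what hit the
// reference_wrapper exception-specification recursion on libstdc++ 11.
void sort_rows(std::vector<int>& rows, const std::function<bool(int, int)>& cmp) {
    std::sort(rows.begin(), rows.end(), cmp);
    // Before the fix: std::sort(rows.begin(), rows.end(), std::ref(cmp));
}
```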
Raphael S. Carvalho
019d50bb5c Revert "sstables/compaction_manager: rewrite_sstables(): resolve maintenance group FIXME"
This reverts commit 4c05e5f966.

Moving cleanup to the maintenance group made its operation time up to
10x slower than in the previous release. It's a blocker to the 4.6
release, so let's revert it until we figure this all out.

This probably happens because the maintenance group is fixed at a
relatively small constant, and cleanup may be incrementally
generating backlog for regular compaction, with the former
fighting for resources against the latter.

Fixes #10060.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20220213165147.56204-1-raphaelsc@scylladb.com>

Ref: a9427f150a
2022-02-14 12:10:38 +02:00
Avi Kivity
bbe775b926 utils: logalloc: correct and adjust timing unit in stall report
The stall report uses the millisecond unit, but actually reports
nanoseconds.

Switch to microseconds (milliseconds are a bit too coarse) and
use the safer "duration / 1us" style rather than "duration::count()"
that leads to unit confusion.

Fixes #9733.

Closes #9734

(cherry picked from commit f907205b92)
2022-02-12 15:56:42 +02:00
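The safer style can be illustrated in isolation; `stall_report_us` is a hypothetical helper, not the actual logalloc code:

```cpp
#include <chrono>

using namespace std::chrono_literals;

// "duration / 1us" divides two durations, yielding a microsecond count no
// matter what unit `d` is stored in. By contrast, `d.count()` returns the raw
// tick count and is easy to mislabel (the bug: nanosecond counts printed as
// milliseconds).
long long stall_report_us(std::chrono::nanoseconds d) {
    return d / 1us;
}
```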
Yaron Kaikov
469c94ea17 release: prepare for 4.6.0 2022-02-08 16:45:50 +02:00
Nadav Har'El
4c780d0265 alternator: allow REMOVE of non-existent nested attribute
DynamoDB allows an UpdateItem operation "REMOVE x.y" when a map x
exists in the item, but x.y doesn't - the removal silently does
nothing. Alternator incorrectly generated an error in this case,
and unfortunately we didn't have a test for this case.

So in this patch we add the missing test (which fails on Alternator
before this patch - and passes on DynamoDB) and then fix the behavior.
After this patch, "REMOVE x.y" will remain an error if "x" doesn't
exist (saying "document paths not valid for this item"), but if "x"
exists and is a map, but "x.y" doesn't, the removal will silently
do nothing and will not be an error.

Fixes #10043.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20220207133652.181994-1-nyh@scylladb.com>
(cherry picked from commit 9982a28007)
2022-02-08 11:48:18 +02:00
Michael Livshin
0181de1f2c shard_reader: check that _reader is valid before dereferencing
After fc729a804, `shard_reader::close()` is not interrupted with an
exception any more if read-ahead fails, so `_reader` may in fact be
null.

Fixes #9923

Signed-off-by: Michael Livshin <michael.livshin@scylladb.com>
Message-Id: <20220117120405.152927-1-michael.livshin@scylladb.com>
(cherry picked from commit d7a993043d)
2022-02-07 10:10:58 +02:00
Benny Halevy
7597a79ef9 shard_reader: Continue after read_ahead error
If read-ahead fails, just issue a log warning
and proceed to close the reader.

Currently co_await will throw and the evictable reader
won't be closed.

This is seen occasionally in testing, e.g.
https://jenkins.scylladb.com/view/master/job/scylla-master/job/dtest-debug/1010/artifact/logs-all.debug.2/1640918573898_lwt_banking_load_test.py%3A%3ATestLWTBankingLoad%3A%3Atest_bank_with_nemesis/node2.log
```
ERROR 2021-12-31 02:40:56,160 [shard 0] mutation_reader - shard_reader::close(): failed to stop reader on shard 1: seastar::named_semaphore_timed_out (Semaphore timed out: _system_read_concurrency_sem)
```

Fixes #9865.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20220102124636.2791544-1-bhalevy@scylladb.com>
(cherry picked from commit fc729a804b)
2022-02-07 10:09:05 +02:00
Nadav Har'El
8f5148e921 docker: don't repeat "--alternator-address" option twice
If the Docker startup script is passed both "--alternator-port" and
"--alternator-https-port", a combination which is supposed to be
allowed, it passes to Scylla the "--alternator-address" option twice.
This isn't necessary, and worse - not allowed.

So this patch fixes the scyllasetup.py script to only pass this
parameter once.

Fixes #10016.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20220202165814.1700047-1-nyh@scylladb.com>
(cherry picked from commit cb6630040d)
2022-02-03 18:39:47 +02:00
Yaron Kaikov
5694ec189f release: prepare for 4.6.rc5 2022-02-03 16:19:46 +02:00
Calle Wilund
34d470967a commitlog: Fix double clearing of _segment_allocating shared_future.
Fixes #10020

The previous fix, 445e1d3, tried to fix one double invocation, but added
another, since it failed to ensure that all potential nullings of the opt
shared_future happened before a new allocator could reset it.

This simplifies the code by making clearing of the shared_future a
prerequisite for resolving its contents (as read by waiters).

This also removes any need for try-catch etc.

Closes #10024

(cherry picked from commit 1e66043412)
2022-02-03 07:43:18 +02:00
Calle Wilund
61db571a44 commitlog: Ensure we never have more than one new_segment call at a time
Refs #9896

Found by @eliransin. The call to new_segment was wrapped in with_timeout.
This means that if the primary caller timed out, we would leave new_segment
calls running, but potentially issue new ones for the next caller.

This could lead to the reserve segment queue being read simultaneously,
which is not what we want.

Change to always use the shared_future wait for all callers, and clear it
only on a result (exception or segment).

Closes #10001

(cherry picked from commit 445e1d3e41)
2022-02-01 09:10:27 +02:00
Tomasz Grabiec
5b5a300a9e util: cached_file: Fix corruption after memory reclamation was triggered from population
If memory reclamation is triggered inside _cache.emplace(), the _cache
btree can get corrupted. Reclaimers erase from it, and emplace()
assumes that the tree is not modified during its execution. It first
locates the target node and then does memory allocation.

Fix by running emplace() under allocating section, which disables
memory reclamation.

The bug manifests with assert failures, e.g:

./utils/bptree.hh:1699: void bplus::node<unsigned long, cached_file::cached_page, cached_file::page_idx_less_comparator, 12, bplus::key_search::linear, bplus::with_debug::no>::refill(Less) [Key = unsigned long, T = cached_file::cached_page, Less = cached_file::page_idx_less_comparator, NodeSize = 12, Search = bplus::key_search::linear, Debug = bplus::with_debug::no]: Assertion `p._kids[i].n == this' failed.

Fixes #9915

Message-Id: <20220130175639.15258-1-tgrabiec@scylladb.com>
(cherry picked from commit b734615f51)
2022-01-31 01:24:47 +02:00
Avi Kivity
148a65d0d6 Update seastar submodule (gratuitous exceptions on allocation failure)
* seastar a189cdc45d...a375681303 (1):
  > core: memory: Avoid current_backtrace() on alloc failure when logging suppressed

Fixes #9982.
2022-01-30 20:02:24 +02:00
Avi Kivity
e3ad14d55f Point seastar submodule at scylla-seastar.git
This allows us to backport fixes to seastar selectively.
2022-01-30 20:01:12 +02:00
Calle Wilund
2b506c2d4a commitlog: Ensure we don't run continuation (task switch) with queues modified
Fixes #9955

In #9348 we handled the problem of failing to delete segment files on disk, and
the need to recompute disk footprint to keep data flow consistent across intermittent
failures. However, because _reserve_segments and _recycled_segments are queues, we
have to empty them to inspect the contents. One would think it is ok for these
queues to be empty for a while, whilst we do some recalculating, including
disk listing -> continuation switching. But then one (i.e. I) misses the fact
that these queues use the pop_eventually mechanism, which does _not_ handle
a scenario where we push something into an empty queue, thus triggering the
future that resumes a waiting task, but then pop the element immediately, before
the waiting task is run. In fact, _iff_ one does this, not only will things break,
they will in fact start creating undefined behaviour, because the underlying
std::queue<T, circular_buffer> will _not_ do any bounds checks on the pop/push
operations -> we will pop an empty queue, immediately making it non-empty, but
using undefined memory (with luck null/zeroes).

Strictly speaking, seastar::queue::pop_eventually should be fixed to handle
this scenario, but nonetheless we can fix the usage here as well, by simply
copying the objects and doing the calculation "in background" while we
potentially start popping the queue again.

Closes #9966

(cherry picked from commit 43f51e9639)
2022-01-27 10:24:03 +02:00
Avi Kivity
50aad1c668 Merge 'scylla_raid_setup: use mdmonitor only when RAID level > 0' from Takuya ASADA
We found that the monitor mode of mdadm does not work on RAID0, and this is
not a bug but expected behavior, according to a RHEL developer.
Therefore, we should stop enabling mdmonitor when RAID0 is specified.

Fixes #9540

----

This reverts 0d8f932 and introduces the correct fix.

Closes #9970

* github.com:scylladb/scylla:
  scylla_raid_setup: use mdmonitor only when RAID level > 0
  Revert "scylla_raid_setup: workaround for mdmonitor.service issue on CentOS8"

(cherry picked from commit df22396a34)
2022-01-27 10:21:25 +02:00
Yaron Kaikov
7bf3f37cd1 release: prepare for 4.6.rc4 2022-01-23 10:44:09 +02:00
Botond Dénes
0f7f8585f2 reader_permit: release_base_resources(): also update _resources
If the permit was admitted, _base_resources was already accounted for in
_resources and therefore has to be deducted from it; otherwise the permit
will think it leaked some resources on destruction.

Test:
dtest(repair_additional_test.py.test_repair_one_missing_row_diff_shard_count)

Refs: #9751
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20220119132550.532073-1-bdenes@scylladb.com>
(cherry picked from commit a65b38a9f7)
2022-01-20 18:39:25 +02:00
Pavel Emelyanov
2c65c4a569 Merge 'db: range_tombstone_list: Deoverlap empty range tombstones' from Tomasz Grabiec
Appending an empty range adjacent to an existing range tombstone would
not deoverlap (by dropping the empty range tombstone), resulting in a
different (non-canonical) result depending on the order of appending.

Suppose that range tombstone [a, b) covers range tombstone [x, x), and [a, x) and [x, b) are the range tombstones which correspond to [a, b) split around position x.

Appending [a, x) then [x, b) then [x, x) would give [a, b)
Appending [a, x) then [x, x) then [x, b) would give [a, x), [x, x), [x, b)

The fix is to drop empty range tombstones in range_tombstone_list so that the result is canonical.

Fixes #9661

Closes #9764

* github.com:scylladb/scylla:
  range_tombstone_list: Deoverlap adjacent empty ranges
  range_tombstone_list: Convert to work in terms of position_in_partition

(cherry picked from commit b2a62d2b59)
2022-01-20 12:35:21 +02:00
Avi Kivity
f85cd289bc Merge "repair: make sure there is one permit per repair with count res" from Botond
"
Repair obtains a permit for each repair-meta instance it creates. This
permit is supposed to track all resources consumed by that repair as
well as ensure concurrency limit is respected. However when the
non-local reader path is used (shard config of master != shard config of
follower), a second permit will be obtained -- for the shard reader of
the multishard reader. This creates a situation where the repair-meta's
permit can block the shard permit, creating a deadlock situation.
This patch solves this by dropping the count resource on the
repair-meta's permit when a non-local reader path is executed -- that is
a multishard reader is created.

Fixes: #9751
"

* 'repair-double-permit-block/v4' of https://github.com/denesb/scylla:
  repair: make sure there is one permit per repair with count res
  reader_permit: add release_base_resource()

(cherry picked from commit 52b7778ae6)
2022-01-17 16:02:55 +02:00
Beni Peled
5e661af9a4 release: prepare for 4.6.rc3 2022-01-17 13:11:54 +02:00
Calle Wilund
5629b67d25 messaging_service: Make dc/rack encryption check for connection more strict
Fixes #9653

When making an outgoing connection in an internode_encryption=dc/rack situation,
we should not rely solely on the endpoint/local broadcast address to determine
whether we can downgrade a connection.

If gossip/message_service determines that we will connect to a different
address than the "official" endpoint address, we should use this to determine
the association of the target node, and similarly, if we bind the outgoing
connection to an interface != the broadcast address, we need to use this to
decide the local one.

Note: This will effectively _disable_ internode_encryption=dc/rack on ec2 etc
until such time that gossip can give accurate info on dc/rack for "internal"
ip addresses of nodes.

(cherry picked from commit 4778770814)
2022-01-16 19:10:57 +02:00
Takuya ASADA
ad632cf7fc dist: fix scylla-housekeeping uuid file chmod call
Should use chmod() on a file, not fchmod()

Fixes #9683

Closes #9802

(cherry picked from commit 7064ae3d90)
2022-01-10 16:57:34 +02:00
Botond Dénes
ca24bebcf2 sstables/partition_index_cache: destroy entry ptr on error
The error-handling code removes the cache entry but this leads to an
assertion because the entry is still referenced by the entry pointer
instance which is returned on the normal path. To avoid this clear the
pointer on the error path and make sure there are no additional
references kept to it.

Fixes #9887

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20220105140859.586234-2-bdenes@scylladb.com>
(cherry picked from commit 92727ac36c)
2022-01-07 21:21:44 +01:00
Calle Wilund
7dc5abb6f8 commitlog: Don't allow error_handler to swallow exception
Fixes #9798

If an exception in allocate_segment_ex is a (sub)type of std::system_error,
commit_error_handler might _not_ throw, in which case the error-handling
code would forget the current exception and return an unusable segment.

The handler is now only used as an exception pointer replacer.

Closes #9870

(cherry picked from commit 3c02cab2f7)
2022-01-06 14:10:18 +02:00
Yaron Kaikov
e8a1cfb6f8 release: prepare for 4.6.rc2 2022-01-02 09:15:47 +02:00
Tomasz Grabiec
fc312b3021 lsa: Fix segment leak on memory reclamation during alloc_buf
alloc_buf() calls new_buf_active() when there is no active segment to
allocate a new active segment. new_buf_active() allocates memory
(e.g. a new segment) so may cause memory reclamation, which may cause
segment compaction, which may call alloc_buf() and re-enter
new_buf_active(). The first call to new_buf_active() would then
override _buf_active and cause the segment allocated during segment
compaction to be leaked.

This then causes abort when objects from the leaked segment are freed
because the segment is expected to be present in _closed_segments, but
isn't. boost::intrusive::list::erase() will fail on assertion that the
object being erased is linked.

Introduced in b5ca0eb2a2.

Fixes #9821
Fixes #9192
Fixes #9825
Fixes #9544
Fixes #9508
Refs #9573

Message-Id: <20211229201443.119812-1-tgrabiec@scylladb.com>
(cherry picked from commit 7038dc7003)
2021-12-30 18:56:28 +02:00
Nadav Har'El
7b82aaf939 alternator: fix error on UpdateTable for non-existent table
When the UpdateTable operation is called for a non-existent table, the
appropriate error is ResourceNotFoundException, but before this patch
we ran into an exception, which resulted in an ugly "internal server
error".

In this patch we use the existing get_table() function which most other
operations use, and which does all the appropriate verifications and
generates the appropriate Alternator api_error instead of letting
internal Scylla exceptions escape to the user.

This patch also includes a test for UpdateTable on a non-existent table,
which used to fail before this patch and pass afterwards. We also add a
test for DeleteTable in the same scenario, and see it didn't have this
bug. As usual, both tests pass on DynamoDB, which confirms we generate
the right error codes.

Fixes #9747.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20211206181605.1182431-1-nyh@scylladb.com>
(cherry picked from commit 31eeb44d28)
2021-12-29 22:59:25 +02:00
Nadav Har'El
894a4abfae commitlog: fix missing wait for semaphore units
Commit dcc73c5d4e introduced a semaphore
for excluding concurrent recalculations - _reserve_recalculation_guard.

Unfortunately, the two places in the code which tried to take this
guard just called get_units() - which returns a future<units>, not
units - and never waited for this future to become available.

So this patch adds the missing "co_await" needed to wait for the
units to become available.

Fixes #9770.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20211214122612.1462436-1-nyh@scylladb.com>
(cherry picked from commit b8786b96f4)
2021-12-29 13:18:59 +02:00
Takuya ASADA
4dcf023470 scylla_raid_setup: workaround for mdmonitor.service issue on CentOS8
On CentOS8, mdmonitor.service does not work correctly when using
mdadm-4.1-15.el8.x86_64 and later versions.
Until we find a solution, let's pin the package version to an older one
which does not cause the issue (4.1-14.el8.x86_64).

Fixes #9540

Closes #9782

(cherry picked from commit 0d8f932f0b)
2021-12-28 11:38:04 +02:00
Benny Halevy
283788828e compaction: scrub_validate_mode_validate_reader: throw compaction_stopped_exception if stop is requested
Currently when scrub/validate is stopped (e.g. via the api),
scrub_validate_mode_validate_reader co_return:s without
closing the reader passed to it - causing a crash due
to internal error check, see #9766.

Throwing a compaction_stopped_exception rather than co_return:ing
an exception will be handled as any other exeption, including closing
the reader.

Fixes #9766

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20211213125528.2422745-1-bhalevy@scylladb.com>
(cherry picked from commit c89876c975)
2021-12-15 15:03:59 +02:00
Pavel Emelyanov
730a147ba6 row-cache: Handle exception (un)safety of rows_entry insertion
The B-tree's insert_before() is a throwing operation; its caller
must account for that. When the rows_entry collection was
switched to the B-tree, all the risky places were fixed by ee9e1045,
but a few places went under the radar.

In cache_flat_mutation_reader there's a place where a C-pointer
is inserted into the tree, thus potentially leaking the entry.

In partition_snapshot_row_cursor there are two places that not
only leak the entry, but also leave it in the LRU list. The latter
is quite nasty, because such an entry can be evicted: the eviction code
tries to get the rows_entry iterator from "this", but the hook happens
to be unattached (because insertion threw) and fails the assert.

fixes: #9728

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
(cherry picked from commit ee103636ac)
2021-12-14 15:53:42 +02:00
Pavel Emelyanov
9897e83029 partition_snapshot_row_cursor: Shuffle ensure_result creation
Both places get a C-pointer to the freshly allocated rows_entry,
insert it where needed and return the dereferenced pointer.

The C-pointer is going to become a smart pointer that would go out
of scope before the return. This change prepares for that by constructing
the ensure_result from the iterator that is returned from insertion
of the entry.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
(cherry picked from commit 9fd8db318d)

Ref #9728
2021-12-14 15:52:37 +02:00
Asias He
1a9b64e6f6 storage_service: Wait for seastar::get_units in node_ops
seastar::get_units returns a future; we have to wait for it.

Fixes #9767

Closes #9768

(cherry picked from commit 9859c76de1)
2021-12-12 18:42:20 +02:00
Takuya ASADA
49fe9e2c8e dist: allow running scylla-housekeeping with strict umask setting
To avoid failing scylla-housekeeping in strict umask environment,
we need to chmod a+r on repository file and housekeeping.uuid.

Fixes #9683

Closes #9739

(cherry picked from commit ea20f89c56)
2021-12-12 14:25:57 +02:00
Takuya ASADA
d0580c41ee dist: add support im4gn/is4gen instance on AWS
Add support for the next-generation, storage-optimized ARM64 instance types.

Fixes #9711

Closes #9730

(cherry picked from commit 097a6ee245)
2021-12-08 14:29:44 +02:00
Beni Peled
542394c82f release: prepare for 4.6.rc1 2021-12-08 11:08:45 +02:00
Avi Kivity
018ad3f6f4 test: refine test suite names exposed via xunit format
The test suite names seen by Jenkins are suboptimal: there is
no distinction between modes, and the ".cc" suffix of file names
is interpreted as a class name, which is converted to a tree node
that must be clicked to expand. Massage the names to remove
unnecessary information and add the mode.

Closes #9696

(cherry picked from commit ef3edcf848)

Fixes #9738.
2021-12-05 19:58:22 +02:00
Avi Kivity
9b8b7efb54 tests: consolidate boost xunit result files
The recent parallelization of boost unit tests caused an increase
in xml result files. This is challenging to Jenkins, since it
appears to use rpc-over-ssh to read the result files, and as a result
it takes more than an hour to read all result files when the Jenkins
main node is not on the same continent as the agent.

To fix this, merge the result files in test.py and leave one result
file per mode. Later we can leave one result file overall (integrating
the mode into the testsuite name), but that can wait.

Tested on a local Jenkins instance (just reading the result files,
not the entire build).

Closes #9668

(cherry picked from commit b23af15432)

Fixes #9738
2021-12-05 19:57:39 +02:00
Botond Dénes
1c3e63975f Merge 'Backport of #9348 (exceptions in commitlog::segment_manager::delete_segments could cause footprint counters to lose track)' from Calle Wilund
Backport of series to 4.6
Upstream merge commit: e2c27ee743.
Refs #9348

Closes #9702

* github.com:scylladb/scylla:
  commitlog: Recalculate footprint on delete_segment exceptions
  commitlog_test: Add test for exception in alloc w. deleted underlying file
  commitlog: Ensure failed-to-create-segment is re-deleted
  commitlog::allocate_segment_ex: Don't re-throw out of function
2021-12-02 09:22:19 +02:00
Calle Wilund
11bb03e46d commitlog: Recalculate footprint on delete_segment exceptions
Fixes #9348

If we get exceptions in delete_segments, we can, and probably will, lose
track of the footprint counters. We need to recompute the used disk footprint,
otherwise we will flush too often, and even block indefinitely on new_seg
iff using hard limits.
2021-11-29 14:56:48 +00:00
Calle Wilund
810e410c5d commitlog_test: Add test for exception in alloc w. deleted underlying file
Tests that we can handle exception-in-alloc cleanup if the file actually
does not exist. This however uncovers another weakness (addressed in the next
patch): that we can lose track of the disk footprint here, and with hard limits
end up waiting for disk space that never comes. Thus the test does not use a
hard limit.
2021-11-29 14:56:43 +00:00
Calle Wilund
97f6da0c3e commitlog: Ensure failed-to-create-segment is re-deleted
Fixes #9343

If we fail in allocate_segment_ex, we should push the file opened/created
to the delete set to ensure we reclaim the disk space. We should also
ensure that if we did not recycle a file in delete_segments, we still
wake up any recycle waiters iff we made a file delete instead.

Included a small unit test.
2021-11-29 14:51:39 +00:00
Calle Wilund
c229fe9694 commitlog::allocate_segment_ex: Don't re-throw out of function
Fixes #9342

commitlog_error_handler rethrows, but here we don't want that; we want to
run the post-handler cleanup (co_await) instead.
2021-11-29 14:51:39 +00:00
Tomasz Grabiec
ee1ca8ae4d lsa: Add sanity checks around lsa_buffer operations
We've been observing hard-to-explain crashes recently around
lsa_buffer destruction, where the containing segment is absent from
_segment_descs, which causes log_heap::adjust_up to abort. Add more
checks to catch certain impossible scenarios which can lead to this
sooner.

Refs #9192.
Message-Id: <20211116122346.814437-1-tgrabiec@scylladb.com>

(cherry picked from commit bf6898a5a0)
2021-11-24 15:17:37 +01:00
Tomasz Grabiec
6bfd322e3b lsa: Mark compact_segment_locked() as noexcept
We cannot recover from a failure in this method. The implementation
makes sure it never happens. Invariants will be broken if this
throws. Detect violations early by marking as noexcept.

We could make it exception safe and try to leave the data structures
in a consistent state but the reclaimer cannot make progress if this throws, so
it's pointless.

Refs #9192
Message-Id: <20211116122019.813418-1-tgrabiec@scylladb.com>

(cherry picked from commit 4d627affc3)
2021-11-24 15:17:35 +01:00
Tomasz Grabiec
afc18d5070 cql: Fix missing data in indexed queries with base table short reads
Indexed queries use paging over the materialized view
table. Results of the view read are then used to issue reads of the
base table. If the base table reads are short reads, the page is returned
to the user and the paging state is adjusted accordingly so that when
paging is resumed it will query the view starting from the row
corresponding to the next row in the base which was not yet
returned. However, the paging state's "remaining" count was not reset, so
if the view read was exhausted, reading stopped even though the
base table read was short.

Fix by restoring the "remaining" count when adjusting the paging state
on short read.

Tests:

  - index_with_paging_test
  - secondary_index_test

Fixes #9198
Message-Id: <20210818131840.1160267-1-tgrabiec@scylladb.com>

(cherry picked from commit 1e4da2dcce)
2021-11-23 11:22:00 +02:00
Tomasz Grabiec
2ec22c2404 sstables: partition_index_cache: Avoid abort due to benign bad_alloc inside allocating section
shared_promise::get_shared_future() is marked noexcept, but can
allocate memory. It is invoked by the sstable partition index cache inside
an allocating section, which means that allocations can throw
bad_alloc even though there is memory to reclaim, i.e. under normal
conditions.

Fix by allocating the shared_promise in stable memory (in the
standard allocator, via lw_shared_ptr<>), so that it can be accessed outside
the allocating section.

Fixes #9666

Tests:

  - build/dev/test/boost/sstable_partition_index_cache_test

Message-Id: <20211122165100.1606854-1-tgrabiec@scylladb.com>
(cherry picked from commit 1d84bc6c3b)
2021-11-23 11:21:27 +02:00
Avi Kivity
19da778271 Merge "Run gossiper message handlers in a gate" from Pavel E
"
When the gossiper processes its messages in the background, some of
the continuations may pop up after the gossiper is shut down.
This, in turn, may result in unwanted code being executed when
it isn't expected.

In particular, storage_service notification hooks may try to
update the system keyspace (with "fresh" peer info/state/tokens/etc).
This update doesn't work after drain because drain shuts down the
commitlog. The intention was that the gossiper would _not_ notify
anyone after drain, because it's shut down during drain too.
But since there are background continuations left, it's not
working as expected.

refs: #9567
tests: unit(dev), dtest.concurrent_schema_changes.snapshot(dev)
"

* 'br-gossiper-background-messages-2' of https://github.com/xemul/scylla:
  gossiper: Guard background processing with gate
  gossiper: Helper for background messaging processing

(cherry picked from commit 9e2b6176a2)
2021-11-19 07:25:26 +02:00
Avi Kivity
cbd4c13ba6 Merge 'Revert "scylla_util.py: return bool value on systemd_unit.is_active()"' from Takuya ASADA
In scylla_util.py, we provide `systemd_unit.is_active()` to return the `systemctl is-active` output.
When we introduced the systemd_unit class, we just returned the `systemctl is-active` output as a string, but we later changed the return value to bool (2545d7fd43).
This was because `if unit.is_active():` always evaluates to True even when the unit is "failed" or "inactive"; the change was meant to avoid such a scripting bug.
However, this was probably a mistake: systemd unit state is not a two-state value like "start" / "stop"; there are many states.

And we are already using multiple unit states ("activating", "failed", "inactive", "active") in our Cloud image login prompt:
https://github.com/scylladb/scylla-machine-image/blob/next/common/scylla_login#L135
After we merged 2545d7fd43, the login prompt broke, because is_active() does not return the string the script expected (https://github.com/scylladb/scylla-machine-image/issues/241).

I think we should revert 2545d7fd43; is_active() should return exactly the same value as `systemctl is-active` reports.

Fixes #9627
Fixes scylladb/scylla-machine-image#241

Closes #9628

* github.com:scylladb/scylla:
  scylla_ntp_setup: use string in systemd_unit.is_active()
  Revert "scylla_util.py: return bool value on systemd_unit.is_active()"

(cherry picked from commit c17101604f)
2021-11-18 11:44:11 +02:00
Pavel Emelyanov
338871802d generic_server: Keep server alive during conn background processing
There's at least one tiny race in the generic_server code. The trailing
.handle_exception after conn->process() captures this, but since the
whole continuation chain happens in the background, this can be
released, causing the whole lambda to execute on a freed generic_server
instance. This, in turn, is not nice because the captured this is used to get
the _logger from.

The fix is based on the observation that all connections pin the server
in memory until all of them (connections) are destructed. That said, to
keep the server alive in the aforementioned lambda it's enough to make
sure the conn variable (an lw_shared_ptr to the connection) is alive in
it. To avoid generating a bunch of tiny continuations with an identical set of
captures, tail the single .then_wrapped() one and do whatever is needed
to wrap up the connection processing in it.

tests: unit(dev)
fixes: #9316

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20211115105818.11348-1-xemul@scylladb.com>
(cherry picked from commit ba16318457)
2021-11-17 10:21:11 +02:00
Yaron Kaikov
8b5b1b8af6 dist/docker/debian/build_docker.sh: debian version fix for rc releases
When building a docker image we rely on the `VERSION` value from
`SCYLLA-VERSION-GEN`. For `rc` releases only, there is a difference
between the configured version (X.X.rcX) and the actual debian package
we generate (X.X~rcX).

Using a similar solution as I did in dcb10374a5

Fixes: #9616

Closes #9617

(cherry picked from commit 060a91431d)
2021-11-12 20:07:19 +02:00
Takuya ASADA
ea89eff95d dist/docker: fix bashrc filename for Ubuntu
For Debian variants, the correct filename is /etc/bash.bashrc.

Fixes #9588

Closes #9589

(cherry picked from commit 201a97e4a4)
2021-11-10 14:25:27 +02:00
Michał Radwański
96421e7779 memtable: fix gcc function argument evaluation order induced use after move
clang evaluates function arguments from left to right, while gcc does so
in reverse. Therefore, this code can be correct on clang and incorrect
on gcc:
```
f(x.sth(), std::move(x))
```

This patch fixes one such instance of this bug, in memtable.cc.

Fixes #9605.

Closes #9606

(cherry picked from commit eff392073c)
2021-11-10 08:58:09 +02:00
Botond Dénes
142336ca53 mutation_writer/feed_writer: don't drop readers with small amount of content
Due to an error in transforming the above routine, readers that have <= a
buffer's worth of content are dropped without being consumed.
This is because the outer consume loop is conditioned on
`is_end_of_stream()`, which will be set for readers that eagerly
pre-fill their buffer and have no more data than what is in their
buffer.
Change the condition to also check `is_buffer_empty()` and only drop
the reader if both of these are true.

Fixes: #9594

Tests: unit(mutation_writer_test --repeat=200, dev)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20211108092923.104504-1-bdenes@scylladb.com>
(cherry picked from commit 4b6c0fe592)
2021-11-09 14:13:21 +02:00
Calle Wilund
492f12248c commitlog: Add explicit track var for "wasted space" to avoid double counting
Refs #9331

In segment::close() we add space to managers "wasted" counter. In destructor,
if we can cleanly delete/recycle the file we remove it. However, if we never
went through close (shutdown - ok, exception in batch_cycle - not ok), we can
end up subtracting numbers that were never added in the first place.
Just keep track of the bytes added in a var.

Observed behaviour in above issue is timeouts in batch_cycle, where we
declare the segment closed early (because we cannot add anything more safely
- chunks could get partial/misplaced). Exception will propagate to caller(s),
but the segment will not go through actual close() call -> destructor should
not assume such.

Closes #9598

(cherry picked from commit 3929b7da1f)
2021-11-09 14:07:04 +02:00
Yaron Kaikov
7eb7a0e5fe release: prepare for 4.6.rc0 2021-11-08 09:18:26 +02:00
58 changed files with 1139 additions and 277 deletions

.gitmodules

@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui


@@ -60,7 +60,7 @@ fi
 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=4.6.dev
+VERSION=4.6.2
 if test -f version
 then


@@ -1017,18 +1017,16 @@ future<executor::request_return_type> executor::update_table(client_state& clien
_stats.api_operations.update_table++;
elogger.trace("Updating table {}", request);
std::string table_name = get_table_name(request);
if (table_name.find(INTERNAL_TABLE_PREFIX) == 0) {
schema_ptr tab = get_table(_proxy, request);
// the ugly but harmless conversion to string_view here is because
// Seastar's sstring is missing a find(std::string_view) :-()
if (std::string_view(tab->cf_name()).find(INTERNAL_TABLE_PREFIX) == 0) {
return make_ready_future<request_return_type>(api_error::validation(
format("Prefix {} is reserved for accessing internal tables", INTERNAL_TABLE_PREFIX)));
}
std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
tracing::add_table_name(trace_state, keyspace_name, table_name);
tracing::add_table_name(trace_state, tab->ks_name(), tab->cf_name());
auto& db = _proxy.get_db().local();
auto& cf = db.find_column_family(keyspace_name, table_name);
schema_builder builder(cf.schema());
schema_builder builder(tab);
rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
if (stream_specification && stream_specification->IsObject()) {
@@ -2481,8 +2479,8 @@ static bool hierarchy_actions(
             // attr member so we can use add()
             rjson::add_with_string_name(v, attr, std::move(*newv));
         } else {
-            throw api_error::validation(format("Can't remove document path {} - not present in item",
-                    subh.get_value()._path));
+            // Removing a.b when a is a map but a.b doesn't exist
+            // is silently ignored. It's not considered an error.
         }
     } else {
         throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));


@@ -79,6 +79,49 @@ atomic_cell::atomic_cell(const abstract_type& type, atomic_cell_view other)
     set_view(_data);
 }
 
+// Based on:
+// - org.apache.cassandra.db.AbstractCell#reconcile()
+// - org.apache.cassandra.db.BufferExpiringCell#reconcile()
+// - org.apache.cassandra.db.BufferDeletedCell#reconcile()
+std::strong_ordering
+compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
+    if (left.timestamp() != right.timestamp()) {
+        return left.timestamp() <=> right.timestamp();
+    }
+    if (left.is_live() != right.is_live()) {
+        return left.is_live() ? std::strong_ordering::less : std::strong_ordering::greater;
+    }
+    if (left.is_live()) {
+        auto c = compare_unsigned(left.value(), right.value()) <=> 0;
+        if (c != 0) {
+            return c;
+        }
+        if (left.is_live_and_has_ttl() != right.is_live_and_has_ttl()) {
+            // prefer expiring cells.
+            return left.is_live_and_has_ttl() ? std::strong_ordering::greater : std::strong_ordering::less;
+        }
+        if (left.is_live_and_has_ttl()) {
+            if (left.expiry() != right.expiry()) {
+                return left.expiry() <=> right.expiry();
+            } else {
+                // prefer the cell that was written later,
+                // so it survives longer after it expires, until purged.
+                return right.ttl() <=> left.ttl();
+            }
+        }
+    } else {
+        // Both are deleted
+        // Origin compares big-endian serialized deletion time. That's because it
+        // delegates to AbstractCell.reconcile() which compares values after
+        // comparing timestamps, which in case of deleted cells will hold
+        // serialized expiry.
+        return (uint64_t) left.deletion_time().time_since_epoch().count()
+                <=> (uint64_t) right.deletion_time().time_since_epoch().count();
+    }
+    return std::strong_ordering::equal;
+}
+
 atomic_cell_or_collection atomic_cell_or_collection::copy(const abstract_type& type) const {
     if (_data.empty()) {
         return atomic_cell_or_collection();


@@ -593,8 +593,8 @@ void cache_flat_mutation_reader::move_to_range(query::clustering_row_ranges::con
     clogger.trace("csm {}: insert dummy at {}", fmt::ptr(this), _lower_bound);
     auto it = with_allocator(_lsa_manager.region().allocator(), [&] {
         auto& rows = _snp->version()->partition().mutable_clustered_rows();
-        auto new_entry = current_allocator().construct<rows_entry>(*_schema, _lower_bound, is_dummy::yes, is_continuous::no);
-        return rows.insert_before(_next_row.get_iterator_in_latest_version(), *new_entry);
+        auto new_entry = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(*_schema, _lower_bound, is_dummy::yes, is_continuous::no));
+        return rows.insert_before(_next_row.get_iterator_in_latest_version(), std::move(new_entry));
     });
     _snp->tracker()->insert(*it);
     _last_row = partition_snapshot_row_weakref(*_snp, it, true);


@@ -1511,6 +1511,11 @@ public:
     }
     auto process_cell = [&, this] (const column_definition& cdef) {
+        // If table uses compact storage it may contain a column of type empty
+        // and we need to ignore such a field because it is not present in CDC log.
+        if (cdef.type->get_kind() == abstract_type::kind::empty) {
+            return;
+        }
         if (auto current = get_col_from_row_state(row_state, cdef)) {
             _builder->set_value(image_ck, cdef, *current);
         } else if (op == operation::pre_image) {


@@ -1634,7 +1634,7 @@ future<bool> scrub_validate_mode_validate_reader(flat_mutation_reader reader, co
     while (auto mf_opt = co_await reader()) {
         if (cdata.is_stop_requested()) [[unlikely]] {
             // Compaction manager will catch this exception and re-schedule the compaction.
-            co_return coroutine::make_exception(compaction_stopped_exception(schema->ks_name(), schema->cf_name(), cdata.stop_requested));
+            throw compaction_stopped_exception(schema->ks_name(), schema->cf_name(), cdata.stop_requested);
         }
         const auto& mf = *mf_opt;


@@ -326,6 +326,11 @@ future<> compaction_manager::run_custom_job(column_family* cf, sstables::compact
     task->compaction_done = with_semaphore(_custom_job_sem, 1, [this, task, cf, &job = *job_ptr] () mutable {
         // take read lock for cf, so major compaction and resharding can't proceed in parallel.
         return with_lock(_compaction_locks[cf].for_read(), [this, task, cf, &job] () mutable {
+            // Allow caller to know that task (e.g. reshape) was asked to stop while waiting for a chance to run.
+            if (task->compaction_data.is_stop_requested()) {
+                throw sstables::compaction_stopped_exception(task->compacting_cf->schema()->ks_name(), task->compacting_cf->schema()->cf_name(),
+                        task->compaction_data.stop_requested);
+            }
             _stats.active_tasks++;
             if (!can_proceed(task)) {
                 return make_ready_future<>();
@@ -737,8 +742,10 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
     column_family& cf = *task->compacting_cf;
     auto sstable_level = sst->get_sstable_level();
     auto run_identifier = sst->run_identifier();
     auto sstable_set_snapshot = can_purge ? std::make_optional(cf.get_sstable_set()) : std::nullopt;
-    auto descriptor = sstables::compaction_descriptor({ sst }, std::move(sstable_set_snapshot), _maintenance_sg.io,
+    // FIXME: this compaction should run with maintenance priority.
+    auto descriptor = sstables::compaction_descriptor({ sst }, std::move(sstable_set_snapshot), service::get_local_compaction_priority(),
         sstable_level, sstables::compaction_descriptor::default_max_sstable_bytes, run_identifier, options);
     // Releases reference to cleaned sstable such that respective used disk space can be freed.
@@ -747,15 +754,14 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
     };
     return with_semaphore(_rewrite_sstables_sem, 1, [this, task, &cf, descriptor = std::move(descriptor), compacting] () mutable {
         // Take write lock for cf to serialize cleanup/upgrade sstables/scrub with major compaction/reshape/reshard.
-        return with_lock(_compaction_locks[&cf].for_write(), [this, task, &cf, descriptor = std::move(descriptor), compacting] () mutable {
+        return with_lock(_compaction_locks[&cf].for_read(), [this, task, &cf, descriptor = std::move(descriptor), compacting] () mutable {
             _stats.pending_tasks--;
             _stats.active_tasks++;
             task->setup_new_compaction();
             task->output_run_identifier = descriptor.run_identifier;
             compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_compaction_controller.backlog_of_shares(200), _available_memory));
             return do_with(std::move(user_initiated), [this, &cf, descriptor = std::move(descriptor), task] (compaction_backlog_tracker& bt) mutable {
-                return with_scheduling_group(_maintenance_sg.cpu, [this, &cf, descriptor = std::move(descriptor), task]() mutable {
+                return with_scheduling_group(_compaction_controller.sg(), [this, &cf, descriptor = std::move(descriptor), task]() mutable {
                     return cf.compact_sstables(std::move(descriptor), task->compaction_data);
                 });
             });
@@ -979,7 +985,7 @@ void compaction_manager::stop_compaction(sstring type) {
     }
     // FIXME: switch to task_stop(), and wait for their termination, so API user can know when compactions actually stopped.
     for (auto& task : _tasks) {
-        if (task->compaction_running && target_type == task->type) {
+        if (target_type == task->type) {
             task->compaction_data.stop("user request");
         }
     }


@@ -117,7 +117,13 @@ managed_bytes_opt get_value(const column_value& col, const column_value_eval_bag
if (!col_type->is_map()) {
throw exceptions::invalid_request_exception(format("subscripting non-map column {}", cdef->name_as_text()));
}
const auto deserialized = cdef->type->deserialize(managed_bytes_view(*data.other_columns[data.sel.index_of(*cdef)]));
int32_t index = data.sel.index_of(*cdef);
if (index == -1) {
throw std::runtime_error(
format("Column definition {} does not match any column in the query selection",
cdef->name_as_text()));
}
const auto deserialized = cdef->type->deserialize(managed_bytes_view(*data.other_columns[index]));
const auto& data_map = value_cast<map_type_impl::native_type>(deserialized);
const auto key = evaluate_to_raw_view(col.sub, options);
auto&& key_type = col_type->name_comparator();
@@ -135,8 +141,16 @@ managed_bytes_opt get_value(const column_value& col, const column_value_eval_bag
case column_kind::clustering_key:
return managed_bytes(data.clustering_key[cdef->id]);
case column_kind::static_column:
case column_kind::regular_column:
return managed_bytes_opt(data.other_columns[data.sel.index_of(*cdef)]);
[[fallthrough]];
case column_kind::regular_column: {
int32_t index = data.sel.index_of(*cdef);
if (index == -1) {
throw std::runtime_error(
format("Column definition {} does not match any column in the query selection",
cdef->name_as_text()));
}
return managed_bytes_opt(data.other_columns[index]);
}
default:
throw exceptions::unsupported_operation_exception("Unknown column kind");
}
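The hunk above hardens both `index_of` call sites: instead of indexing `other_columns` with a possibly negative result, it checks for `-1` and fails loudly. A minimal Python sketch of the same defensive-lookup pattern (names are illustrative, not the actual API):

```python
def get_column_value(selection, other_columns, name):
    """Return the value for `name`, failing loudly when the column is
    not part of the query selection instead of mis-indexing the list."""
    index = selection.index(name) if name in selection else -1
    if index == -1:
        raise RuntimeError(
            f"Column definition {name} does not match any column "
            "in the query selection")
    return other_columns[index]
```

The point of the guard is that a `-1` from the lookup would otherwise silently index the last element, returning a wrong value rather than an error.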


@@ -528,7 +528,7 @@ statement_restrictions::statement_restrictions(database& db,
}
if (!_nonprimary_key_restrictions->empty()) {
if (_has_queriable_regular_index) {
if (_has_queriable_regular_index && _partition_range_is_simple) {
_uses_secondary_indexing = true;
} else if (!allow_filtering) {
throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "


@@ -193,7 +193,7 @@ public:
template<typename RowComparator>
void sort(const RowComparator& cmp) {
std::sort(_rows.begin(), _rows.end(), std::ref(cmp));
std::sort(_rows.begin(), _rows.end(), cmp);
}
metadata& get_metadata();


@@ -995,6 +995,7 @@ lw_shared_ptr<const service::pager::paging_state> indexed_table_select_statement
}
auto paging_state_copy = make_lw_shared<service::pager::paging_state>(service::pager::paging_state(*paging_state));
paging_state_copy->set_remaining(internal_paging_size);
paging_state_copy->set_partition_key(std::move(index_pk));
paging_state_copy->set_clustering_key(std::move(index_ck));
return std::move(paging_state_copy);


@@ -53,6 +53,7 @@
#include "types/list.hh"
#include "types/user.hh"
#include "concrete_types.hh"
#include "validation.hh"
namespace cql3 {
@@ -251,6 +252,7 @@ insert_prepared_json_statement::build_partition_keys(const query_options& option
exploded.emplace_back(json_value->second);
}
auto pkey = partition_key::from_optional_exploded(*s, std::move(exploded));
validation::validate_cql_key(*s, pkey);
auto k = query::range<query::ring_position>::make_singular(dht::decorate_key(*s, std::move(pkey)));
ranges.emplace_back(std::move(k));
return ranges;


@@ -1348,44 +1348,6 @@ database::existing_index_names(const sstring& ks_name, const sstring& cf_to_excl
return names;
}
// Based on:
// - org.apache.cassandra.db.AbstractCell#reconcile()
// - org.apache.cassandra.db.BufferExpiringCell#reconcile()
// - org.apache.cassandra.db.BufferDeletedCell#reconcile()
std::strong_ordering
compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
if (left.timestamp() != right.timestamp()) {
return left.timestamp() <=> right.timestamp();
}
if (left.is_live() != right.is_live()) {
return left.is_live() ? std::strong_ordering::less : std::strong_ordering::greater;
}
if (left.is_live()) {
auto c = compare_unsigned(left.value(), right.value()) <=> 0;
if (c != 0) {
return c;
}
if (left.is_live_and_has_ttl() != right.is_live_and_has_ttl()) {
// prefer expiring cells.
return left.is_live_and_has_ttl() ? std::strong_ordering::greater : std::strong_ordering::less;
}
if (left.is_live_and_has_ttl() && left.expiry() != right.expiry()) {
return left.expiry() <=> right.expiry();
}
} else {
// Both are deleted
if (left.deletion_time() != right.deletion_time()) {
// Origin compares big-endian serialized deletion time. That's because it
// delegates to AbstractCell.reconcile() which compares values after
// comparing timestamps, which in case of deleted cells will hold
// serialized expiry.
return (uint64_t) left.deletion_time().time_since_epoch().count()
<=> (uint64_t) right.deletion_time().time_since_epoch().count();
}
}
return std::strong_ordering::equal;
}
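The reconciliation rules above order cells by timestamp first, then let tombstones beat live cells, then compare values, then prefer expiring cells and later expiry. As a toy illustration only (dict "cells" with made-up field names, ignoring the deletion-time branch), the ordering can be modeled as a comparison key where the bigger tuple wins:

```python
def merge_key(cell):
    """Bigger tuple wins the merge (toy model of the C++ ordering)."""
    return (cell["timestamp"],
            0 if cell["live"] else 1,     # tombstones win timestamp ties
            cell.get("value", b""),       # then compare values
            1 if cell.get("ttl") else 0,  # prefer expiring cells
            cell.get("expiry", 0))        # then the later expiry

def reconcile(left, right):
    return left if merge_key(left) >= merge_key(right) else right
```

This is just a sketch of the precedence; the real function also tie-breaks deleted cells on deletion time, and (per the 4.6.1 commit message above) a follow-up additionally compares ttl when expiry is equal.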
future<std::tuple<lw_shared_ptr<query::result>, cache_temperature>>
database::query(schema_ptr s, const query::read_command& cmd, query::result_options opts, const dht::partition_range_vector& ranges,
tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) {


@@ -428,6 +428,8 @@ private:
void abort_recycled_list(std::exception_ptr);
void abort_deletion_promise(std::exception_ptr);
future<> recalculate_footprint();
future<> rename_file(sstring, sstring) const;
size_t max_request_controller_units() const;
segment_id_type _ids = 0;
@@ -444,6 +446,7 @@ private:
seastar::gate _gate;
uint64_t _new_counter = 0;
std::optional<size_t> _disk_write_alignment;
seastar::semaphore _reserve_recalculation_guard;
};
template<typename T>
@@ -512,6 +515,7 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
uint64_t _file_pos = 0;
uint64_t _flush_pos = 0;
uint64_t _size_on_disk = 0;
uint64_t _waste = 0;
size_t _alignment;
@@ -598,7 +602,7 @@ public:
clogger.debug("Segment {} is no longer active and will be submitted for delete now", *this);
++_segment_manager->totals.segments_destroyed;
_segment_manager->totals.active_size_on_disk -= file_position();
_segment_manager->totals.wasted_size_on_disk -= (_size_on_disk - file_position());
_segment_manager->totals.wasted_size_on_disk -= _waste;
_segment_manager->add_file_to_delete(_file_name, _desc);
} else if (_segment_manager->cfg.warn_about_segments_left_on_disk_after_shutdown) {
clogger.warn("Segment {} is dirty and is left on disk.", *this);
@@ -725,7 +729,8 @@ public:
auto s = co_await sync();
co_await flush();
co_await terminate();
_segment_manager->totals.wasted_size_on_disk += (_size_on_disk - file_position());
_waste = _size_on_disk - file_position();
_segment_manager->totals.wasted_size_on_disk += _waste;
co_return s;
}
future<sseg_ptr> do_flush(uint64_t pos) {
@@ -1223,6 +1228,7 @@ db::commitlog::segment_manager::segment_manager(config c)
, _recycled_segments(std::numeric_limits<size_t>::max())
, _reserve_replenisher(make_ready_future<>())
, _background_sync(make_ready_future<>())
, _reserve_recalculation_guard(1)
{
assert(max_size > 0);
assert(max_mutation_size < segment::multi_entry_size_magic);
@@ -1248,6 +1254,11 @@ future<> db::commitlog::segment_manager::replenish_reserve() {
}
try {
gate::holder g(_gate);
auto guard = co_await get_units(_reserve_recalculation_guard, 1);
if (_reserve_segments.full()) {
// can happen if we recalculate
continue;
}
// note: if we were strict with disk size, we would refuse to do this
// unless disk footprint is lower than threshold. but we cannot (yet?)
// trust that flush logic will absolutely free up an existing
@@ -1519,7 +1530,7 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
if (cfg.extensions && !cfg.extensions->commitlog_file_extensions().empty()) {
for (auto * ext : cfg.extensions->commitlog_file_extensions()) {
auto nf = co_await ext->wrap_file(std::move(filename), f, flags);
auto nf = co_await ext->wrap_file(filename, f, flags);
if (nf) {
f = std::move(nf);
align = is_overwrite ? f.disk_overwrite_dma_alignment() : f.disk_write_dma_alignment();
@@ -1530,12 +1541,21 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
f = make_checked_file(commit_error_handler, std::move(f));
} catch (...) {
ep = std::current_exception();
commit_error_handler(ep);
}
if (ep) {
// do this early, so iff we are to fast-fail server,
// we do it before anything else can go wrong.
try {
commit_error_handler(ep);
} catch (...) {
ep = std::current_exception();
}
}
if (ep && f) {
co_await f.close();
}
if (ep) {
add_file_to_delete(filename, d);
co_return coroutine::exception(std::move(ep));
}
@@ -1594,6 +1614,8 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
}
future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::new_segment() {
gate::holder g(_gate);
if (_shutdown) {
co_return coroutine::make_exception(std::runtime_error("Commitlog has been shut down. Cannot add data"));
}
@@ -1628,22 +1650,23 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
co_return _segments.back();
}
if (_segment_allocating) {
co_await _segment_allocating->get_future(timeout);
continue;
}
promise<> p;
_segment_allocating.emplace(p.get_future());
auto finally = defer([&] () noexcept { _segment_allocating = std::nullopt; });
try {
gate::holder g(_gate);
auto s = co_await with_timeout(timeout, new_segment());
p.set_value();
} catch (...) {
p.set_exception(std::current_exception());
throw;
// #9896 - we don't want to issue a new_segment call until
// the old one has terminated with either result or exception.
// Do all waiting through the shared_future
if (!_segment_allocating) {
auto f = new_segment();
// must check that we are not already done.
if (f.available()) {
f.get(); // maybe force exception
continue;
}
_segment_allocating.emplace(f.discard_result().finally([this] {
// clear the shared_future _before_ resolving its contents
// (i.e. with result of this finally)
_segment_allocating = std::nullopt;
}));
}
co_await _segment_allocating->get_future(timeout);
}
}
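The #9896 rewrite above replaces the promise/defer dance with a shared future: the first caller starts `new_segment()`, every concurrent caller waits on the same future, and the future clears itself before its result resolves. A Python asyncio sketch of that deduplication pattern (an illustration under stated assumptions, not the Seastar code):

```python
import asyncio

class SegmentManager:
    """Sketch of the #9896 pattern: at most one new_segment() call is in
    flight; concurrent callers all wait on the same shared future."""

    def __init__(self):
        self._segment_allocating = None   # the shared future, or None
        self.allocations = 0              # how many real allocations ran

    async def _new_segment(self):
        self.allocations += 1
        await asyncio.sleep(0)            # stand-in for disk I/O

    async def active_segment(self):
        if self._segment_allocating is None:
            task = asyncio.ensure_future(self._new_segment())
            # clear the shared future _before_ waiters resume with its result
            task.add_done_callback(
                lambda _: setattr(self, "_segment_allocating", None))
            self._segment_allocating = task
        await self._segment_allocating
```

Running many `active_segment()` calls concurrently performs exactly one allocation, which is the property the hunk is after: never issue a new `new_segment` call until the previous one has terminated.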
@@ -1865,6 +1888,8 @@ future<> db::commitlog::segment_manager::delete_segments(std::vector<sstring> fi
std::exception_ptr recycle_error;
size_t num_deleted = 0;
bool except = false;
while (!files.empty()) {
auto filename = std::move(files.back());
files.pop_back();
@@ -1914,8 +1939,10 @@ future<> db::commitlog::segment_manager::delete_segments(std::vector<sstring> fi
}
}
co_await delete_file(filename);
++num_deleted;
} catch (...) {
clogger.error("Could not delete segment {}: {}", filename, std::current_exception());
except = true;
}
}
@@ -1928,6 +1955,16 @@ future<> db::commitlog::segment_manager::delete_segments(std::vector<sstring> fi
if (recycle_error && _recycled_segments.empty()) {
abort_recycled_list(recycle_error);
}
// If recycle failed and turned into a delete, we should fake-wakeup waiters
// since we might still have cleaned up disk space.
if (!recycle_error && num_deleted && cfg.reuse_segments && _recycled_segments.empty()) {
abort_recycled_list(std::make_exception_ptr(std::runtime_error("deleted files")));
}
// #9348 - if we had an exception, we can't trust our bookkeeping any more. recalculate.
if (except) {
co_await recalculate_footprint();
}
}
void db::commitlog::segment_manager::abort_recycled_list(std::exception_ptr ep) {
@@ -1942,6 +1979,67 @@ void db::commitlog::segment_manager::abort_deletion_promise(std::exception_ptr e
std::exchange(_disk_deletions, {}).set_exception(ep);
}
future<> db::commitlog::segment_manager::recalculate_footprint() {
try {
co_await do_pending_deletes();
auto guard = co_await get_units(_reserve_recalculation_guard, 1);
auto segments_copy = _segments;
std::vector<sseg_ptr> reserves;
std::vector<sstring> recycles;
// this causes haywire things while we steal stuff, but...
while (!_reserve_segments.empty()) {
reserves.push_back(_reserve_segments.pop());
}
while (!_recycled_segments.empty()) {
recycles.push_back(_recycled_segments.pop());
}
// #9955 - must re-stock the queues before we do anything
// interruptible/continuation. Because both queues are
// used with push/pop eventually which _waits_ for signal
// but does _not_ verify that the condition is true once
// we return. So copy the objects and look at those instead.
for (auto& filename : recycles) {
_recycled_segments.push(sstring(filename));
}
for (auto& s : reserves) {
_reserve_segments.push(sseg_ptr(s)); // you can have it back now.
}
// first, guesstimate sizes
uint64_t recycle_size = recycles.size() * max_size;
auto old = totals.total_size_on_disk;
totals.total_size_on_disk = recycle_size;
for (auto& s : _segments) {
totals.total_size_on_disk += s->_size_on_disk;
}
for (auto& s : reserves) {
totals.total_size_on_disk += s->_size_on_disk;
}
// now we need to adjust the actual sizes of recycled files
uint64_t actual_recycled_size = 0;
try {
for (auto& filename : recycles) {
auto s = co_await seastar::file_size(filename);
actual_recycled_size += s;
}
} catch (...) {
clogger.error("Exception reading disk footprint ({}).", std::current_exception());
actual_recycled_size = recycle_size; // best we got
}
totals.total_size_on_disk += actual_recycled_size - recycle_size;
// pushing things to reserve/recycled queues will have resumed any
// waiters, so we should be done.
} catch (...) {
clogger.error("Exception recalculating disk footprint ({}). Values might be off...", std::current_exception());
}
}
future<> db::commitlog::segment_manager::do_pending_deletes() {
auto ftc = std::exchange(_files_to_close, {});
auto ftd = std::exchange(_files_to_delete, {});


@@ -100,6 +100,7 @@ def version_compare(a, b):
def create_uuid_file(fl):
with open(args.uuid_file, 'w') as myfile:
myfile.write(str(uuid.uuid1()) + "\n")
os.chmod(args.uuid_file, 0o644)
def sanitize_version(version):


@@ -278,6 +278,66 @@ if __name__ == "__main__":
disk_properties["read_bandwidth"] = 2527296683 * nr_disks
disk_properties["write_iops"] = 156326 * nr_disks
disk_properties["write_bandwidth"] = 1063657088 * nr_disks
elif idata.instance() == "im4gn.large":
disk_properties["read_iops"] = 33943
disk_properties["read_bandwidth"] = 288433525
disk_properties["write_iops"] = 27877
disk_properties["write_bandwidth"] = 126864680
elif idata.instance() == "im4gn.xlarge":
disk_properties["read_iops"] = 68122
disk_properties["read_bandwidth"] = 576603520
disk_properties["write_iops"] = 55246
disk_properties["write_bandwidth"] = 254534954
elif idata.instance() == "im4gn.2xlarge":
disk_properties["read_iops"] = 136422
disk_properties["read_bandwidth"] = 1152663765
disk_properties["write_iops"] = 92184
disk_properties["write_bandwidth"] = 508926453
elif idata.instance() == "im4gn.4xlarge":
disk_properties["read_iops"] = 273050
disk_properties["read_bandwidth"] = 1638427264
disk_properties["write_iops"] = 92173
disk_properties["write_bandwidth"] = 1027966826
elif idata.instance() == "im4gn.8xlarge":
disk_properties["read_iops"] = 250241 * nr_disks
disk_properties["read_bandwidth"] = 1163130709 * nr_disks
disk_properties["write_iops"] = 86374 * nr_disks
disk_properties["write_bandwidth"] = 977617664 * nr_disks
elif idata.instance() == "im4gn.16xlarge":
disk_properties["read_iops"] = 273030 * nr_disks
disk_properties["read_bandwidth"] = 1638211413 * nr_disks
disk_properties["write_iops"] = 92607 * nr_disks
disk_properties["write_bandwidth"] = 1028340266 * nr_disks
elif idata.instance() == "is4gen.medium":
disk_properties["read_iops"] = 33965
disk_properties["read_bandwidth"] = 288462506
disk_properties["write_iops"] = 27876
disk_properties["write_bandwidth"] = 126954200
elif idata.instance() == "is4gen.large":
disk_properties["read_iops"] = 68131
disk_properties["read_bandwidth"] = 576654869
disk_properties["write_iops"] = 55257
disk_properties["write_bandwidth"] = 254551002
elif idata.instance() == "is4gen.xlarge":
disk_properties["read_iops"] = 136413
disk_properties["read_bandwidth"] = 1152747904
disk_properties["write_iops"] = 92180
disk_properties["write_bandwidth"] = 508889546
elif idata.instance() == "is4gen.2xlarge":
disk_properties["read_iops"] = 273038
disk_properties["read_bandwidth"] = 1628982613
disk_properties["write_iops"] = 92182
disk_properties["write_bandwidth"] = 1027983530
elif idata.instance() == "is4gen.4xlarge":
disk_properties["read_iops"] = 260493 * nr_disks
disk_properties["read_bandwidth"] = 1217396928 * nr_disks
disk_properties["write_iops"] = 83169 * nr_disks
disk_properties["write_bandwidth"] = 1000390784 * nr_disks
elif idata.instance() == "is4gen.8xlarge":
disk_properties["read_iops"] = 273021 * nr_disks
disk_properties["read_bandwidth"] = 1656354602 * nr_disks
disk_properties["write_iops"] = 92233 * nr_disks
disk_properties["write_bandwidth"] = 1028010325 * nr_disks
properties_file = open(etcdir() + "/scylla.d/io_properties.yaml", "w")
yaml.dump({ "disks": [ disk_properties ] }, properties_file, default_flow_style=False)
ioconf = open(etcdir() + "/scylla.d/io.conf", "w")


@@ -66,18 +66,18 @@ if __name__ == '__main__':
target = None
if os.path.exists('/lib/systemd/systemd-timesyncd'):
if systemd_unit('systemd-timesyncd').is_active():
if systemd_unit('systemd-timesyncd').is_active() == 'active':
print('ntp is already configured, skip setup')
sys.exit(0)
target = 'systemd-timesyncd'
if shutil.which('chronyd'):
if get_chrony_unit().is_active():
if get_chrony_unit().is_active() == 'active':
print('ntp is already configured, skip setup')
sys.exit(0)
if not target:
target = 'chrony'
if shutil.which('ntpd'):
if get_ntp_unit().is_active():
if get_ntp_unit().is_active() == 'active':
print('ntp is already configured, skip setup')
sys.exit(0)
if not target:


@@ -117,10 +117,11 @@ if __name__ == '__main__':
pkg_install('xfsprogs')
if not shutil.which('mdadm'):
pkg_install('mdadm')
try:
md_service = systemd_unit('mdmonitor.service')
except SystemdException:
md_service = systemd_unit('mdadm.service')
if args.raid_level != '0':
try:
md_service = systemd_unit('mdmonitor.service')
except SystemdException:
md_service = systemd_unit('mdadm.service')
    print('Creating {type} for scylla using {nr_disk} disk(s): {disks}'.format(type=f'RAID{args.raid_level}' if raid else 'XFS volume', nr_disk=len(disks), disks=args.disks))
procs=[]
@@ -164,14 +165,15 @@ if __name__ == '__main__':
uuid = run(f'blkid -s UUID -o value {fsdev}', shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
after = 'local-fs.target'
if raid:
wants = ''
if raid and args.raid_level != '0':
after += f' {md_service}'
wants = f'\nWants={md_service}'
unit_data = f'''
[Unit]
Description=Scylla data directory
Before=scylla-server.service
After={after}
Wants={md_service}
After={after}{wants}
DefaultDependencies=no
[Mount]
@@ -195,7 +197,8 @@ WantedBy=multi-user.target
f.write(f'RequiresMountsFor={mount_at}\n')
systemd_unit.reload()
md_service.start()
if args.raid_level != '0':
md_service.start()
mount = systemd_unit(mntunit_bn)
mount.start()
if args.enable_on_nextboot:


@@ -370,6 +370,10 @@ if __name__ == '__main__':
version_check = interactive_ask_service('Do you want to enable Scylla to check if there is a newer version of Scylla available?', 'Yes - start the Scylla-housekeeping service to check for a newer version. This check runs periodically. No - skips this step.', version_check)
args.no_version_check = not version_check
if version_check:
cfg = sysconfig_parser(sysconfdir_p() / 'scylla-housekeeping')
repo_files = cfg.get('REPO_FILES')
for f in glob.glob(repo_files):
os.chmod(f, 0o644)
with open('/etc/scylla.d/housekeeping.cfg', 'w') as f:
f.write('[housekeeping]\ncheck-version: True\n')
os.chmod('/etc/scylla.d/housekeeping.cfg', 0o644)


@@ -674,7 +674,7 @@ class aws_instance:
return self._type.split(".")[0]
def is_supported_instance_class(self):
if self.instance_class() in ['i2', 'i3', 'i3en', 'c5d', 'm5d', 'm5ad', 'r5d', 'z1d', 'c6gd', 'm6gd', 'r6gd', 'x2gd']:
if self.instance_class() in ['i2', 'i3', 'i3en', 'c5d', 'm5d', 'm5ad', 'r5d', 'z1d', 'c6gd', 'm6gd', 'r6gd', 'x2gd', 'im4gn', 'is4gen']:
return True
return False
@@ -683,7 +683,7 @@ class aws_instance:
instance_size = self.instance_size()
if instance_class in ['c3', 'c4', 'd2', 'i2', 'r3']:
return 'ixgbevf'
if instance_class in ['a1', 'c5', 'c5a', 'c5d', 'c5n', 'c6g', 'c6gd', 'f1', 'g3', 'g4', 'h1', 'i3', 'i3en', 'inf1', 'm5', 'm5a', 'm5ad', 'm5d', 'm5dn', 'm5n', 'm6g', 'm6gd', 'p2', 'p3', 'r4', 'r5', 'r5a', 'r5ad', 'r5b', 'r5d', 'r5dn', 'r5n', 't3', 't3a', 'u-6tb1', 'u-9tb1', 'u-12tb1', 'u-18tn1', 'u-24tb1', 'x1', 'x1e', 'z1d', 'c6g', 'c6gd', 'm6g', 'm6gd', 't4g', 'r6g', 'r6gd', 'x2gd']:
if instance_class in ['a1', 'c5', 'c5a', 'c5d', 'c5n', 'c6g', 'c6gd', 'f1', 'g3', 'g4', 'h1', 'i3', 'i3en', 'inf1', 'm5', 'm5a', 'm5ad', 'm5d', 'm5dn', 'm5n', 'm6g', 'm6gd', 'p2', 'p3', 'r4', 'r5', 'r5a', 'r5ad', 'r5b', 'r5d', 'r5dn', 'r5n', 't3', 't3a', 'u-6tb1', 'u-9tb1', 'u-12tb1', 'u-18tn1', 'u-24tb1', 'x1', 'x1e', 'z1d', 'c6g', 'c6gd', 'm6g', 'm6gd', 't4g', 'r6g', 'r6gd', 'x2gd', 'im4gn', 'is4gen']:
return 'ena'
if instance_class == 'm4':
if instance_size == '16xlarge':
@@ -1041,7 +1041,7 @@ class systemd_unit:
return run('systemctl {} disable {}'.format(self.ctlparam, self._unit), shell=True, check=True)
def is_active(self):
return True if run('systemctl {} is-active {}'.format(self.ctlparam, self._unit), shell=True, capture_output=True, encoding='utf-8').stdout.strip() == 'active' else False
return run('systemctl {} is-active {}'.format(self.ctlparam, self._unit), shell=True, capture_output=True, encoding='utf-8').stdout.strip()
def mask(self):
return run('systemctl {} mask {}'.format(self.ctlparam, self._unit), shell=True, check=True)
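The change above makes `is_active()` return the raw `systemctl is-active` state string (e.g. `active`, `inactive`, `failed`) instead of a bool, which is why the ntp-setup call sites now compare against `'active'` explicitly. A stubbed-out sketch (the real wrapper shells out to `systemctl`; here the state is injected) of why truthiness becomes a trap:

```python
class systemd_unit:
    """Stub of the setup scripts' wrapper; is_active() returns the raw
    state string, as in the diff above."""

    def __init__(self, unit, state='inactive'):
        self._unit = unit
        self._state = state   # stand-in for `systemctl is-active <unit>`

    def is_active(self):
        return self._state    # e.g. 'active', 'inactive', 'failed'

# Any non-empty state string is truthy, even 'failed', so callers must
# compare explicitly rather than rely on `if unit.is_active():`.
unit = systemd_unit('chronyd', 'failed')
already_configured = unit.is_active() == 'active'
```

The payoff is that callers can now distinguish `failed` from `inactive`, at the cost of having to spell out the comparison.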


@@ -6,12 +6,16 @@ is_nonroot() {
[ -f "$scylladir"/SCYLLA-NONROOT-FILE ]
}
is_container() {
[ -f "$scylladir"/SCYLLA-CONTAINER-FILE ]
}
is_privileged() {
[ ${EUID:-${UID}} = 0 ]
}
execsudo() {
if is_nonroot; then
if is_nonroot || is_container; then
exec "$@"
else
exec sudo -u scylla -g scylla "$@"


@@ -25,6 +25,10 @@ product="$(<build/SCYLLA-PRODUCT-FILE)"
version="$(<build/SCYLLA-VERSION-FILE)"
release="$(<build/SCYLLA-RELEASE-FILE)"
if [[ "$version" = *rc* ]]; then
version=$(echo $version |sed 's/\(.*\)\.\(rc.*\)/\1~\2/')
fi
mode="release"
if uname -m | grep x86_64 ; then
@@ -93,12 +97,13 @@ run apt-get -y install hostname supervisor openssh-server openssh-client openjdk
run locale-gen en_US.UTF-8
run bash -ec "dpkg -i packages/*.deb"
run apt-get -y clean all
run bash -ec "cat /scylla_bashrc >> /etc/bashrc"
run bash -ec "cat /scylla_bashrc >> /etc/bash.bashrc"
run mkdir -p /etc/supervisor.conf.d
run mkdir -p /var/log/scylla
run chown -R scylla:scylla /var/lib/scylla
run mkdir -p /opt/scylladb/supervisor
run touch /opt/scylladb/SCYLLA-CONTAINER-FILE
bcp dist/common/supervisor/scylla-server.sh /opt/scylladb/supervisor/scylla-server.sh
bcp dist/common/supervisor/scylla-jmx.sh /opt/scylladb/supervisor/scylla-jmx.sh
bcp dist/common/supervisor/scylla-node-exporter.sh /opt/scylladb/supervisor/scylla-node-exporter.sh


@@ -1,4 +1,4 @@
[program:scylla-server]
[program:scylla]
command=/opt/scylladb/supervisor/scylla-server.sh
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0


@@ -121,12 +121,13 @@ class ScyllaSetup:
if self._apiAddress is not None:
args += ["--api-address %s" % self._apiAddress]
if self._alternatorPort is not None:
if self._alternatorAddress is not None:
args += ["--alternator-address %s" % self._alternatorAddress]
if self._alternatorPort is not None:
args += ["--alternator-port %s" % self._alternatorPort]
if self._alternatorHttpsPort is not None:
args += ["--alternator-address %s" % self._alternatorAddress]
args += ["--alternator-https-port %s" % self._alternatorHttpsPort]
if self._alternatorWriteIsolation is not None:


@@ -184,14 +184,18 @@ future<> server::do_accepts(int which, bool keepalive, socket_address server_add
_logger.info("exception while advertising new connection: {}", std::current_exception());
}
// Block while monitoring for lifetime/errors.
return conn->process().finally([this, conn] {
return unadvertise_connection(conn);
}).handle_exception([this] (std::exception_ptr ep) {
if (is_broken_pipe_or_connection_reset(ep)) {
// expected if another side closes a connection or we're shutting down
return;
return conn->process().then_wrapped([this, conn] (auto f) {
try {
f.get();
} catch (...) {
auto ep = std::current_exception();
if (!is_broken_pipe_or_connection_reset(ep)) {
// some exceptions are expected if another side closes a connection
// or we're shutting down
_logger.info("exception while processing connection: {}", ep);
}
}
_logger.info("exception while processing connection: {}", ep);
return unadvertise_connection(conn);
});
});
return stop_iteration::no;


@@ -477,49 +477,42 @@ gossiper::handle_get_endpoint_states_msg(gossip_get_endpoint_states_request requ
return make_ready_future<gossip_get_endpoint_states_response>(gossip_get_endpoint_states_response{std::move(map)});
}
rpc::no_wait_type gossiper::background_msg(sstring type, noncopyable_function<future<>(gossiper&)> fn) {
(void)with_gate(_background_msg, [this, type = std::move(type), fn = std::move(fn)] () mutable {
return container().invoke_on(0, std::move(fn)).handle_exception([type = std::move(type)] (auto ep) {
logger.warn("Failed to handle {}: {}", type, ep);
});
});
return messaging_service::no_wait();
}
void gossiper::init_messaging_service_handler() {
_messaging.register_gossip_digest_syn([this] (const rpc::client_info& cinfo, gossip_digest_syn syn_msg) {
auto from = netw::messaging_service::get_source(cinfo);
// In a new fiber.
(void)container().invoke_on(0, [from, syn_msg = std::move(syn_msg)] (gms::gossiper& gossiper) mutable {
return background_msg("GOSSIP_DIGEST_SYN", [from, syn_msg = std::move(syn_msg)] (gms::gossiper& gossiper) mutable {
return gossiper.handle_syn_msg(from, std::move(syn_msg));
}).handle_exception([] (auto ep) {
logger.warn("Fail to handle GOSSIP_DIGEST_SYN: {}", ep);
});
return messaging_service::no_wait();
});
_messaging.register_gossip_digest_ack([this] (const rpc::client_info& cinfo, gossip_digest_ack msg) {
auto from = netw::messaging_service::get_source(cinfo);
// In a new fiber.
(void)container().invoke_on(0, [from, msg = std::move(msg)] (gms::gossiper& gossiper) mutable {
return background_msg("GOSSIP_DIGEST_ACK", [from, msg = std::move(msg)] (gms::gossiper& gossiper) mutable {
return gossiper.handle_ack_msg(from, std::move(msg));
}).handle_exception([] (auto ep) {
logger.warn("Fail to handle GOSSIP_DIGEST_ACK: {}", ep);
});
return messaging_service::no_wait();
});
_messaging.register_gossip_digest_ack2([this] (const rpc::client_info& cinfo, gossip_digest_ack2 msg) {
auto from = netw::messaging_service::get_source(cinfo);
// In a new fiber.
(void)container().invoke_on(0, [from, msg = std::move(msg)] (gms::gossiper& gossiper) mutable {
return background_msg("GOSSIP_DIGEST_ACK2", [from, msg = std::move(msg)] (gms::gossiper& gossiper) mutable {
return gossiper.handle_ack2_msg(from, std::move(msg));
}).handle_exception([] (auto ep) {
logger.warn("Fail to handle GOSSIP_DIGEST_ACK2: {}", ep);
});
return messaging_service::no_wait();
});
_messaging.register_gossip_echo([this] (const rpc::client_info& cinfo, rpc::optional<int64_t> generation_number_opt) {
auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
return handle_echo_msg(from, generation_number_opt);
});
_messaging.register_gossip_shutdown([this] (inet_address from, rpc::optional<int64_t> generation_number_opt) {
// In a new fiber.
(void)container().invoke_on(0, [from, generation_number_opt] (gms::gossiper& gossiper) {
return background_msg("GOSSIP_SHUTDOWN", [from, generation_number_opt] (gms::gossiper& gossiper) {
return gossiper.handle_shutdown_msg(from, generation_number_opt);
}).handle_exception([] (auto ep) {
logger.warn("Fail to handle GOSSIP_SHUTDOWN: {}", ep);
});
return messaging_service::no_wait();
});
_messaging.register_gossip_get_endpoint_states([this] (const rpc::client_info& cinfo, gossip_get_endpoint_states_request request) {
return container().invoke_on(0, [request = std::move(request)] (gms::gossiper& gossiper) mutable {
@@ -2178,6 +2171,9 @@ future<> gossiper::start() {
}
future<> gossiper::shutdown() {
if (!_background_msg.is_closed()) {
co_await _background_msg.close();
}
if (this_shard_id() == 0) {
co_await do_stop_gossiping();
}


@@ -41,7 +41,9 @@
#include "unimplemented.hh"
#include <seastar/core/distributed.hh>
#include <seastar/core/shared_ptr.hh>
#include <seastar/core/gate.hh>
#include <seastar/core/print.hh>
#include <seastar/rpc/rpc_types.hh>
#include "utils/atomic_vector.hh"
#include "utils/UUID.hh"
#include "utils/fb_utilities.hh"
@@ -138,12 +140,16 @@ private:
bool _enabled = false;
semaphore _callback_running{1};
semaphore _apply_state_locally_semaphore{100};
seastar::gate _background_msg;
std::unordered_map<gms::inet_address, syn_msg_pending> _syn_handlers;
std::unordered_map<gms::inet_address, ack_msg_pending> _ack_handlers;
bool _advertise_myself = true;
// Map ip address and generation number
std::unordered_map<gms::inet_address, int32_t> _advertise_to_nodes;
future<> _failure_detector_loop_done{make_ready_future<>()} ;
rpc::no_wait_type background_msg(sstring type, noncopyable_function<future<>(gossiper&)> fn);
public:
// Get current generation number for the given nodes
future<std::unordered_map<gms::inet_address, int32_t>>


@@ -520,8 +520,13 @@ relocate_python3 "$rprefix"/scyllatop tools/scyllatop/scyllatop.py
if $supervisor; then
install -d -m755 `supervisor_dir $retc`
for service in scylla-server scylla-jmx scylla-node-exporter; do
if [ "$service" = "scylla-server" ]; then
program="scylla"
else
program=$service
fi
cat << EOS > `supervisor_conf $retc $service`
[program:$service]
[program:$program]
directory=$rprefix
command=/bin/bash -c './supervisor/$service.sh'
EOS

main.cc

@@ -377,11 +377,38 @@ static auto defer_verbose_shutdown(const char* what, Func&& func) {
startlog.info("Shutting down {}", what);
try {
func();
startlog.info("Shutting down {} was successful", what);
} catch (...) {
startlog.error("Unexpected error shutting down {}: {}", what, std::current_exception());
throw;
auto ex = std::current_exception();
bool do_abort = true;
try {
std::rethrow_exception(ex);
} catch (const std::system_error& e) {
// System error codes we consider "environmental",
// i.e. not scylla's fault, therefore there is no point in
// aborting and dumping core.
for (int i : {EIO, EACCES, ENOSPC}) {
if (e.code() == std::error_code(i, std::system_category())) {
do_abort = false;
break;
}
}
} catch (...) {
}
auto msg = fmt::format("Unexpected error shutting down {}: {}", what, ex);
if (do_abort) {
startlog.error("{}: aborting", msg);
abort();
} else {
startlog.error("{}: exiting, at {}", msg, current_backtrace());
// Call _exit() rather than exit() to exit immediately
// without calling exit handlers, avoiding
// boost::intrusive::detail::destructor_impl assert failure
// from ~segment_pool exit handler.
_exit(255);
}
}
startlog.info("Shutting down {} was successful", what);
};
auto ret = deferred_action(std::move(vfunc));
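The main.cc hunk above classifies shutdown failures: `EIO`, `EACCES` and `ENOSPC` are treated as environmental (not Scylla's fault), so the process exits immediately instead of aborting and dumping core. A Python sketch of that classification, assuming the same three codes (`'abort'`/`'exit'` are illustrative labels):

```python
import errno

# Codes treated as "environmental" - the environment failed, not the
# program - so a core dump would add no debugging value.
ENVIRONMENTAL = {errno.EIO, errno.EACCES, errno.ENOSPC}

def shutdown_disposition(exc):
    """'abort' keeps the core dump for genuine bugs; 'exit' terminates
    quietly for environmental failures."""
    if isinstance(exc, OSError) and exc.errno in ENVIRONMENTAL:
        return 'exit'
    return 'abort'
```

Note the C++ code also uses `_exit()` rather than `exit()` in the environmental case, to skip exit handlers that could themselves assert during teardown.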


@@ -613,7 +613,8 @@ static flat_mutation_reader make_partition_snapshot_flat_reader_from_snp_schema(
schema_ptr rev_snp_schema = snp->schema()->make_reversed();
return make_partition_snapshot_flat_reader<true, partition_snapshot_read_accounter>(std::move(rev_snp_schema), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
} else {
return make_partition_snapshot_flat_reader<false, partition_snapshot_read_accounter>(snp->schema(), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
schema_ptr snp_schema = snp->schema();
return make_partition_snapshot_flat_reader<false, partition_snapshot_read_accounter>(std::move(snp_schema), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
}
}


@@ -628,7 +628,12 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
remove_error_rpc_client(verb, id);
}
auto must_encrypt = [&id, &verb, this] {
auto addr = get_preferred_ip(id.addr);
auto broadcast_address = utils::fb_utilities::get_broadcast_address();
bool listen_to_bc = _cfg.listen_on_broadcast_address && _cfg.ip != broadcast_address;
auto laddr = socket_address(listen_to_bc ? broadcast_address : _cfg.ip, 0);
auto must_encrypt = [&] {
if (_cfg.encrypt == encrypt_what::none) {
return false;
}
@@ -646,13 +651,27 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
auto& snitch_ptr = locator::i_endpoint_snitch::get_local_snitch_ptr();
// either rack/dc need to be in same dc to use non-tls
if (snitch_ptr->get_datacenter(id.addr) != snitch_ptr->get_datacenter(utils::fb_utilities::get_broadcast_address())) {
auto my_dc = snitch_ptr->get_datacenter(broadcast_address);
if (snitch_ptr->get_datacenter(addr) != my_dc) {
return true;
}
// #9653 - if our idea of dc for bind address differs from our official endpoint address,
// we cannot trust downgrading. We need to ensure either (local) bind address is same as
// broadcast or that the dc info we get for it is the same.
if (broadcast_address != laddr && snitch_ptr->get_datacenter(laddr) != my_dc) {
return true;
}
// if cross-rack tls, check rack.
return _cfg.encrypt == encrypt_what::rack &&
snitch_ptr->get_rack(id.addr) != snitch_ptr->get_rack(utils::fb_utilities::get_broadcast_address())
;
if (_cfg.encrypt == encrypt_what::dc) {
return false;
}
auto my_rack = snitch_ptr->get_rack(broadcast_address);
if (snitch_ptr->get_rack(addr) != my_rack) {
return true;
}
// See above: We need to ensure either (local) bind address is same as
// broadcast or that the rack info we get for it is the same.
return broadcast_address != laddr && snitch_ptr->get_rack(laddr) != my_rack;
}();
auto must_compress = [&id, this] {
@@ -681,7 +700,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
return true;
}();
auto remote_addr = socket_address(get_preferred_ip(id.addr), must_encrypt ? _cfg.ssl_port : _cfg.port);
auto remote_addr = socket_address(addr, must_encrypt ? _cfg.ssl_port : _cfg.port);
rpc::client_options opts;
// send keepalive messages each minute if connection is idle, drop connection after 10 failures
@@ -696,8 +715,6 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;
}
bool listen_to_bc = _cfg.listen_on_broadcast_address && _cfg.ip != utils::fb_utilities::get_broadcast_address();
auto laddr = socket_address(listen_to_bc ? utils::fb_utilities::get_broadcast_address() : _cfg.ip, 0);
auto client = must_encrypt ?
::make_shared<rpc_protocol_client_wrapper>(_rpc->protocol(), std::move(opts),
remote_addr, laddr, _credentials) :

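The encryption decision in the hunk above (issue #9653) can be sketched as a pure function. This is a simplified model: the `dc`/`rack` strings stand in for snitch lookups, `bind_differs` stands in for `broadcast_address != laddr`, and the `all` mode is assumed from context (only `none`, `dc`, and `rack` appear in the hunk).

```cpp
#include <cassert>
#include <string>

enum class encrypt_what { none, dc, rack, all };

// If our idea of the datacenter (or rack) for the local bind address
// differs from the one for the broadcast address, we can't trust
// downgrading to plaintext, so we encrypt.
bool must_encrypt(encrypt_what mode,
                  const std::string& remote_dc, const std::string& remote_rack,
                  const std::string& my_dc, const std::string& my_rack,
                  const std::string& bind_dc, const std::string& bind_rack,
                  bool bind_differs) {
    if (mode == encrypt_what::none) return false;
    if (mode == encrypt_what::all) return true;
    if (remote_dc != my_dc) return true;
    if (bind_differs && bind_dc != my_dc) return true;   // can't trust downgrade
    if (mode == encrypt_what::dc) return false;
    if (remote_rack != my_rack) return true;
    return bind_differs && bind_rack != my_rack;          // same check for rack
}
```

The key addition relative to the old code is the two `bind_differs` checks: one for the datacenter and a symmetric one for the rack.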

@@ -1545,18 +1545,20 @@ public:
};
future<> shard_reader::close() noexcept {
// Nothing to do if there was no reader created, nor is there a background
// read ahead in progress which will create one.
if (!_reader && !_read_ahead) {
co_return;
if (_read_ahead) {
try {
co_await *std::exchange(_read_ahead, std::nullopt);
} catch (...) {
mrlog.warn("shard_reader::close(): read_ahead on shard {} failed: {}", _shard, std::current_exception());
}
}
try {
if (_read_ahead) {
co_await *std::exchange(_read_ahead, std::nullopt);
}
co_await smp::submit_to(_shard, [this] {
if (!_reader) {
return make_ready_future<>();
}
auto irh = std::move(*_reader).inactive_read_handle();
return with_closeable(flat_mutation_reader(_reader.release()), [this] (flat_mutation_reader& reader) mutable {
auto permit = reader.permit();


@@ -54,7 +54,7 @@ future<> feed_writer(flat_mutation_reader&& rd_ref, Writer wr) {
auto rd = std::move(rd_ref);
std::exception_ptr ex;
try {
while (!rd.is_end_of_stream()) {
while (!rd.is_end_of_stream() || !rd.is_buffer_empty()) {
co_await rd.fill_buffer();
while (!rd.is_buffer_empty()) {
co_await rd.pop_mutation_fragment().consume(wr);

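The `feed_writer` fix above addresses a classic drain-loop bug: stopping on end-of-stream alone can drop fragments still sitting in the buffer. A minimal synchronous sketch of the corrected loop shape (the `reader` type here is illustrative, not Scylla's):

```cpp
#include <cassert>
#include <deque>

struct reader {
    std::deque<int> buffer;
    bool eos = false;
    bool is_end_of_stream() const { return eos; }
    bool is_buffer_empty() const { return buffer.empty(); }
    void fill_buffer() { eos = true; }  // no further input in this sketch
};

// Drain until the stream is exhausted *and* the buffer is empty; the old
// condition (`!is_end_of_stream()` alone) would skip a pre-filled buffer
// when end-of-stream is already set.
int drain(reader& rd) {
    int consumed = 0;
    while (!rd.is_end_of_stream() || !rd.is_buffer_empty()) {
        rd.fill_buffer();
        while (!rd.is_buffer_empty()) {
            rd.buffer.pop_front();
            ++consumed;
        }
    }
    return consumed;
}
```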

@@ -411,11 +411,11 @@ public:
} else {
// Copy row from older version because rows in evictable versions must
// hold values which are independently complete to be consistent on eviction.
auto e = current_allocator().construct<rows_entry>(_schema, *_current_row[0].it);
auto e = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(_schema, *_current_row[0].it));
e->set_continuous(latest_i && latest_i->continuous());
_snp.tracker()->insert(*e);
rows.insert_before(latest_i, *e);
return {*e, true};
auto e_i = rows.insert_before(latest_i, std::move(e));
return ensure_result{*e_i, true};
}
}
@@ -447,11 +447,11 @@ public:
}
auto&& rows = _snp.version()->partition().mutable_clustered_rows();
auto latest_i = get_iterator_in_latest_version();
auto e = current_allocator().construct<rows_entry>(_schema, pos, is_dummy(!pos.is_clustering_row()),
is_continuous(latest_i && latest_i->continuous()));
auto e = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(_schema, pos, is_dummy(!pos.is_clustering_row()),
is_continuous(latest_i && latest_i->continuous())));
_snp.tracker()->insert(*e);
rows.insert_before(latest_i, *e);
return ensure_result{*e, true};
auto e_i = rows.insert_before(latest_i, std::move(e));
return ensure_result{*e_i, true};
}
// Brings the entry pointed to by the cursor to the front of the LRU


@@ -42,28 +42,34 @@ static auto construct_range_tombstone_entry(Args&&... args) {
}
void range_tombstone_list::apply_reversibly(const schema& s,
clustering_key_prefix start, bound_kind start_kind,
clustering_key_prefix end,
clustering_key_prefix start_key, bound_kind start_kind,
clustering_key_prefix end_key,
bound_kind end_kind,
tombstone tomb,
reverter& rev)
{
position_in_partition::less_compare less(s);
position_in_partition start(position_in_partition::range_tag_t(), bound_view(std::move(start_key), start_kind));
position_in_partition end(position_in_partition::range_tag_t(), bound_view(std::move(end_key), end_kind));
if (!less(start, end)) {
return;
}
if (!_tombstones.empty()) {
bound_view::compare less(s);
bound_view start_bound(start, start_kind);
auto last = --_tombstones.end();
range_tombstones_type::iterator it;
if (less(start_bound, last->end_bound())) {
it = _tombstones.upper_bound(start_bound, [less](auto&& sb, auto&& rt) {
return less(sb, rt.end_bound());
if (less(start, last->end_position())) {
it = _tombstones.upper_bound(start, [less](auto&& sb, auto&& rt) {
return less(sb, rt.end_position());
});
} else {
it = _tombstones.end();
}
insert_from(s, std::move(it), std::move(start), start_kind, std::move(end), end_kind, std::move(tomb), rev);
insert_from(s, std::move(it), std::move(start), std::move(end), std::move(tomb), rev);
return;
}
auto rt = construct_range_tombstone_entry(std::move(start), start_kind, std::move(end), end_kind, std::move(tomb));
auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), std::move(tomb));
rev.insert(_tombstones.end(), *rt);
rt.release();
}
@@ -81,35 +87,31 @@ void range_tombstone_list::apply_reversibly(const schema& s,
*/
void range_tombstone_list::insert_from(const schema& s,
range_tombstones_type::iterator it,
clustering_key_prefix start,
bound_kind start_kind,
clustering_key_prefix end,
bound_kind end_kind,
position_in_partition start,
position_in_partition end,
tombstone tomb,
reverter& rev)
{
bound_view::compare less(s);
bound_view end_bound(end, end_kind);
position_in_partition::tri_compare cmp(s);
if (it != _tombstones.begin()) {
auto prev = std::prev(it);
if (prev->tombstone().tomb == tomb && prev->end_bound().adjacent(s, bound_view(start, start_kind))) {
start = prev->tombstone().start;
start_kind = prev->tombstone().start_kind;
if (prev->tombstone().tomb == tomb && cmp(prev->end_position(), start) == 0) {
start = prev->position();
rev.erase(prev);
}
}
while (it != _tombstones.end()) {
bound_view start_bound(start, start_kind);
if (less(end_bound, start_bound)) {
if (cmp(end, start) <= 0) {
return;
}
if (less(end_bound, it->start_bound())) {
if (cmp(end, it->position()) < 0) {
// not overlapping
if (it->tombstone().tomb == tomb && end_bound.adjacent(s, it->start_bound())) {
rev.update(it, {std::move(start), start_kind, it->tombstone().end, it->tombstone().end_kind, tomb});
if (it->tombstone().tomb == tomb && cmp(end, it->position()) == 0) {
rev.update(it, {std::move(start), std::move(start), tomb});
} else {
auto rt = construct_range_tombstone_entry(std::move(start), start_kind, std::move(end), end_kind, tomb);
auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), tomb);
rev.insert(it, *rt);
rt.release();
}
@@ -119,34 +121,29 @@ void range_tombstone_list::insert_from(const schema& s,
auto c = tomb <=> it->tombstone().tomb;
if (c == 0) {
// same timestamp, overlapping or adjacent, so merge.
if (less(it->start_bound(), start_bound)) {
start = it->tombstone().start;
start_kind = it->tombstone().start_kind;
if (cmp(it->position(), start) < 0) {
start = it->position();
}
if (less(end_bound, it->end_bound())) {
end = it->tombstone().end;
end_kind = it->tombstone().end_kind;
end_bound = bound_view(end, end_kind);
if (cmp(end, it->end_position()) < 0) {
end = it->end_position();
}
it = rev.erase(it);
} else if (c > 0) {
// We overwrite the current tombstone.
if (less(it->start_bound(), start_bound)) {
auto new_end = bound_view(start, invert_kind(start_kind));
if (!less(new_end, it->start_bound())) {
// Here it->start < start
auto rt = construct_range_tombstone_entry(it->start_bound(), new_end, it->tombstone().tomb);
rev.update(it, {start_bound, it->end_bound(), it->tombstone().tomb});
if (cmp(it->position(), start) < 0) {
{
auto rt = construct_range_tombstone_entry(it->position(), start, it->tombstone().tomb);
rev.update(it, {start, it->end_position(), it->tombstone().tomb});
rev.insert(it, *rt);
rt.release();
}
}
if (less(end_bound, it->end_bound())) {
if (cmp(end, it->end_position()) < 0) {
// Here start <= it->start and end < it->end.
auto rt = construct_range_tombstone_entry(std::move(start), start_kind, end, end_kind, std::move(tomb));
rev.update(it, {std::move(end), invert_kind(end_kind), it->tombstone().end, it->tombstone().end_kind, it->tombstone().tomb});
auto rt = construct_range_tombstone_entry(std::move(start), end, std::move(tomb));
rev.update(it, {std::move(end), it->end_position(), it->tombstone().tomb});
rev.insert(it, *rt);
rt.release();
return;
@@ -157,30 +154,28 @@ void range_tombstone_list::insert_from(const schema& s,
} else {
// We don't overwrite the current tombstone.
if (less(start_bound, it->start_bound())) {
if (cmp(start, it->position()) < 0) {
// The new tombstone starts before the current one.
if (less(it->start_bound(), end_bound)) {
if (cmp(it->position(), end) < 0) {
// Here start < it->start and it->start < end.
auto new_end_kind = invert_kind(it->tombstone().start_kind);
if (!less(bound_view(it->tombstone().start, new_end_kind), start_bound)) {
auto rt = construct_range_tombstone_entry(std::move(start), start_kind, it->tombstone().start, new_end_kind, tomb);
{
auto rt = construct_range_tombstone_entry(std::move(start), it->position(), tomb);
it = rev.insert(it, *rt);
rt.release();
++it;
}
} else {
// Here start < it->start and end <= it->start, so just insert the new tombstone.
auto rt = construct_range_tombstone_entry(std::move(start), start_kind, std::move(end), end_kind, std::move(tomb));
auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), std::move(tomb));
rev.insert(it, *rt);
rt.release();
return;
}
}
if (less(it->end_bound(), end_bound)) {
if (cmp(it->end_position(), end) < 0) {
// Here the current tombstone overwrites a range of the new one.
start = it->tombstone().end;
start_kind = invert_kind(it->tombstone().end_kind);
start = it->end_position();
++it;
} else {
// Here the current tombstone completely overwrites the new one.
@@ -190,7 +185,7 @@ void range_tombstone_list::insert_from(const schema& s,
}
// If we got here, then just insert the remainder at the end.
auto rt = construct_range_tombstone_entry(std::move(start), start_kind, std::move(end), end_kind, std::move(tomb));
auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), std::move(tomb));
rev.insert(it, *rt);
rt.release();
}
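The same-timestamp branch of `insert_from` above (the `c == 0` case) merges the new tombstone with any overlapping or adjacent tombstone carrying an equal timestamp. A heavily simplified sketch over integer positions, covering only that merge case (the overwrite cases for `c > 0` / `c < 0` are out of scope here):

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

struct rt { int start, end; long ts; };

// Insert `n` into a sorted tombstone list, absorbing every existing entry
// with the same timestamp that overlaps or touches it.
std::vector<rt> insert_merged(std::vector<rt> list, rt n) {
    std::vector<rt> out;
    for (auto& t : list) {
        if (t.ts == n.ts && t.end >= n.start && t.start <= n.end) {
            n.start = std::min(n.start, t.start);  // widen instead of keeping both
            n.end = std::max(n.end, t.end);
        } else {
            out.push_back(t);
        }
    }
    auto it = std::find_if(out.begin(), out.end(),
                           [&](const rt& t) { return t.start > n.start; });
    out.insert(it, n);
    return out;
}
```

The refactoring in the hunk does not change this logic; it replaces `bound_view` comparisons with `position_in_partition` comparisons, so "adjacent" becomes "end position equals start position".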


@@ -297,7 +297,13 @@ public:
private:
void apply_reversibly(const schema& s, clustering_key_prefix start, bound_kind start_kind,
clustering_key_prefix end, bound_kind end_kind, tombstone tomb, reverter& rev);
void insert_from(const schema& s, range_tombstones_type::iterator it, clustering_key_prefix start,
bound_kind start_kind, clustering_key_prefix end, bound_kind end_kind, tombstone tomb, reverter& rev);
void insert_from(const schema& s,
range_tombstones_type::iterator it,
position_in_partition start,
position_in_partition end,
tombstone tomb,
reverter& rev);
range_tombstones_type::iterator find(const schema& s, const range_tombstone_entry& rt);
};


@@ -249,6 +249,14 @@ public:
return _base_resources;
}
void release_base_resources() noexcept {
if (_base_resources_consumed) {
_resources -= _base_resources;
_base_resources_consumed = false;
}
_semaphore.signal(std::exchange(_base_resources, {}));
}
sstring description() const {
return format("{}.{}:{}",
_schema ? _schema->ks_name() : "*",
@@ -394,6 +402,10 @@ reader_resources reader_permit::base_resources() const {
return _impl->base_resources();
}
void reader_permit::release_base_resources() noexcept {
return _impl->release_base_resources();
}
sstring reader_permit::description() const {
return _impl->description();
}

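The `release_base_resources()` hunk above uses `std::exchange` to make the release idempotent: the stored amount is zeroed as it is handed back, so a second call signals nothing. A minimal analog (an `int` counter stands in for the semaphore and `reader_resources`):

```cpp
#include <cassert>
#include <utility>

struct permit {
    int base;   // resources consumed up front
    int* sem;   // stand-in for the semaphore to signal on release

    // Hand the base resources back exactly once; subsequent calls are no-ops.
    void release_base_resources() noexcept {
        *sem += std::exchange(base, 0);
    }
};
```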

@@ -161,6 +161,8 @@ public:
reader_resources base_resources() const;
void release_base_resources() noexcept;
sstring description() const;
db::timeout_clock::time_point timeout() const noexcept;


@@ -407,6 +407,10 @@ public:
{},
mutation_reader::forwarding::no);
} else {
// We can't have two permits holding count resources for a single repair,
// so release the one on _permit; the only one left is the one the
// shard reader will obtain.
_permit.release_base_resources();
_reader = make_multishard_streaming_reader(db, _schema, _permit, [this] {
auto shard_range = _sharder.next();
if (shard_range) {

Submodule seastar updated: a189cdc45d...34e58f9995


@@ -635,16 +635,16 @@ void storage_service::bootstrap() {
// Update pending ranges now, so we correctly count ourselves as a pending replica
// when inserting the new CDC generation.
if (!bootstrap_rbno) {
// When is_repair_based_node_ops_enabled is true, the bootstrap node
// will use node_ops_cmd to bootstrap, node_ops_cmd will update the pending ranges.
slogger.debug("bootstrap: update pending ranges: endpoint={} bootstrap_tokens={}", get_broadcast_address(), _bootstrap_tokens);
mutate_token_metadata([this] (mutable_token_metadata_ptr tmptr) {
auto endpoint = get_broadcast_address();
tmptr->add_bootstrap_tokens(_bootstrap_tokens, endpoint);
return update_pending_ranges(std::move(tmptr), format("bootstrapping node {}", endpoint));
}).get();
}
if (!bootstrap_rbno) {
// When is_repair_based_node_ops_enabled is true, the bootstrap node
// will use node_ops_cmd to bootstrap, node_ops_cmd will update the pending ranges.
slogger.debug("bootstrap: update pending ranges: endpoint={} bootstrap_tokens={}", get_broadcast_address(), _bootstrap_tokens);
mutate_token_metadata([this] (mutable_token_metadata_ptr tmptr) {
auto endpoint = get_broadcast_address();
tmptr->add_bootstrap_tokens(_bootstrap_tokens, endpoint);
return update_pending_ranges(std::move(tmptr), format("bootstrapping node {}", endpoint));
}).get();
}
// After we pick a generation timestamp, we start gossiping it, and we stick with it.
// We don't do any other generation switches (unless we crash before completing bootstrap).
@@ -652,19 +652,23 @@ void storage_service::bootstrap() {
_cdc_gen_id = _cdc_gen_service.local().make_new_generation(_bootstrap_tokens, !is_first_node()).get0();
if (!bootstrap_rbno) {
// When is_repair_based_node_ops_enabled is true, the bootstrap node
// will use node_ops_cmd to bootstrap, bootstrapping gossip status is not needed for bootstrap.
_gossiper.add_local_application_state({
// Order is important: both the CDC streams timestamp and tokens must be known when a node handles our status.
{ gms::application_state::TOKENS, versioned_value::tokens(_bootstrap_tokens) },
{ gms::application_state::CDC_GENERATION_ID, versioned_value::cdc_generation_id(_cdc_gen_id) },
{ gms::application_state::STATUS, versioned_value::bootstrapping(_bootstrap_tokens) },
}).get();
if (!bootstrap_rbno) {
// When is_repair_based_node_ops_enabled is true, the bootstrap node
// will use node_ops_cmd to bootstrap, bootstrapping gossip status is not needed for bootstrap.
_gossiper.add_local_application_state({
{ gms::application_state::TOKENS, versioned_value::tokens(_bootstrap_tokens) },
{ gms::application_state::CDC_GENERATION_ID, versioned_value::cdc_generation_id(_cdc_gen_id) },
{ gms::application_state::STATUS, versioned_value::bootstrapping(_bootstrap_tokens) },
}).get();
set_mode(mode::JOINING, format("sleeping {} ms for pending range setup", get_ring_delay().count()), true);
_gossiper.wait_for_range_setup().get();
}
set_mode(mode::JOINING, format("sleeping {} ms for pending range setup", get_ring_delay().count()), true);
_gossiper.wait_for_range_setup().get();
} else {
// Even with RBNO bootstrap we need to announce the new CDC generation immediately after it's created.
_gossiper.add_local_application_state({
{ gms::application_state::CDC_GENERATION_ID, versioned_value::cdc_generation_id(_cdc_gen_id) },
}).get();
}
} else {
// Wait until we know tokens of existing node before announcing replacing status.
set_mode(mode::JOINING, fmt::format("Wait until local node knows tokens of peer nodes"), true);
@@ -3670,7 +3674,7 @@ shared_ptr<abort_source> node_ops_meta_data::get_abort_source() {
void storage_service::node_ops_update_heartbeat(utils::UUID ops_uuid) {
slogger.debug("node_ops_update_heartbeat: ops_uuid={}", ops_uuid);
auto permit = seastar::get_units(_node_ops_abort_sem, 1);
auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
auto it = _node_ops.find(ops_uuid);
if (it != _node_ops.end()) {
node_ops_meta_data& meta = it->second;
@@ -3680,7 +3684,7 @@ void storage_service::node_ops_update_heartbeat(utils::UUID ops_uuid) {
void storage_service::node_ops_done(utils::UUID ops_uuid) {
slogger.debug("node_ops_done: ops_uuid={}", ops_uuid);
auto permit = seastar::get_units(_node_ops_abort_sem, 1);
auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
auto it = _node_ops.find(ops_uuid);
if (it != _node_ops.end()) {
node_ops_meta_data& meta = it->second;
@@ -3691,7 +3695,7 @@ void storage_service::node_ops_done(utils::UUID ops_uuid) {
void storage_service::node_ops_abort(utils::UUID ops_uuid) {
slogger.debug("node_ops_abort: ops_uuid={}", ops_uuid);
auto permit = seastar::get_units(_node_ops_abort_sem, 1);
auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
auto it = _node_ops.find(ops_uuid);
if (it != _node_ops.end()) {
node_ops_meta_data& meta = it->second;

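The `.get0()` fixes above address a subtle bug: `seastar::get_units()` returns a *future* to the semaphore units, so `auto permit = get_units(...)` without waiting stores the future and proceeds before the unit is actually held. A rough standard-library analog of the hazard (deferred `std::async` stands in for the not-yet-resolved future; names are illustrative):

```cpp
#include <cassert>
#include <future>
#include <mutex>

std::mutex m;

struct units { std::unique_lock<std::mutex> lk; };

// Analog of seastar::get_units(): yields a future to the RAII guard.
// Nothing is acquired until the future is waited on.
std::future<units> get_units() {
    return std::async(std::launch::deferred,
                      [] { return units{std::unique_lock<std::mutex>(m)}; });
}
```

Storing the unresolved future gives no mutual exclusion; calling `.get()` (the analog of `.get0()`) actually acquires.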

@@ -49,12 +49,13 @@ private:
public:
partition_index_cache* _parent;
key_type _key;
std::variant<shared_promise<>, partition_index_page> _page;
std::variant<lw_shared_ptr<shared_promise<>>, partition_index_page> _page;
size_t _size_in_allocator = 0;
public:
entry(partition_index_cache* parent, key_type key)
: _parent(parent)
, _key(key)
, _page(make_lw_shared<shared_promise<>>())
{ }
void set_page(partition_index_page&& page) noexcept {
@@ -76,7 +77,7 @@ private:
// Always returns the same value for a given state of _page.
size_t size_in_allocator() const { return _size_in_allocator; }
shared_promise<>& promise() { return std::get<shared_promise<>>(_page); }
lw_shared_ptr<shared_promise<>> promise() { return std::get<lw_shared_ptr<shared_promise<>>>(_page); }
bool ready() const { return std::holds_alternative<partition_index_page>(_page); }
partition_index_page& page() { return std::get<partition_index_page>(_page); }
const partition_index_page& page() const { return std::get<partition_index_page>(_page); }
@@ -207,9 +208,7 @@ public:
return make_ready_future<entry_ptr>(std::move(ptr));
} else {
++_shard_stats.blocks;
return _as(_region, [ptr] () mutable {
return ptr.get_entry().promise().get_shared_future();
}).then([ptr] () mutable {
return ptr.get_entry().promise()->get_shared_future().then([ptr] () mutable {
return std::move(ptr);
});
}
@@ -234,23 +233,23 @@ public:
// No exceptions before then_wrapped() is installed so that ptr will be eventually populated.
return futurize_invoke(loader, key).then_wrapped([this, key, ptr] (auto&& f) mutable {
return futurize_invoke(loader, key).then_wrapped([this, key, ptr = std::move(ptr)] (auto&& f) mutable {
entry& e = ptr.get_entry();
try {
partition_index_page&& page = f.get0();
e.promise().set_value();
e.promise()->set_value();
e.set_page(std::move(page));
_shard_stats.used_bytes += e.size_in_allocator();
++_shard_stats.populations;
return ptr;
} catch (...) {
e.promise().set_exception(std::current_exception());
e.promise()->set_exception(std::current_exception());
ptr = {};
with_allocator(_region.allocator(), [&] {
_cache.erase(key);
});
throw;
}
}).then([ptr] {
return ptr;
});
}

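The `partition_index_cache` change above wraps the `shared_promise` in an `lw_shared_ptr` so that waiters hold their own reference and the promise survives even if the cache entry is erased before the load completes. A simplified standard-library sketch of the entry layout (`std::shared_ptr`/`std::promise` stand in for Seastar's `lw_shared_ptr`/`shared_promise`, and a `std::string` stands in for the index page):

```cpp
#include <cassert>
#include <future>
#include <memory>
#include <string>
#include <variant>

struct entry {
    // Either a shared-ptr-owned promise (page still loading) or the page.
    std::variant<std::shared_ptr<std::promise<void>>, std::string> page;

    entry() : page(std::make_shared<std::promise<void>>()) {}
    bool ready() const { return std::holds_alternative<std::string>(page); }
};
```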
test.py

@@ -291,6 +291,8 @@ class Test:
def print_summary(self):
pass
def get_junit_etree(self):
return None
def check_log(self, trim):
"""Check and trim logs and xml output for tests which have it"""
@@ -338,9 +340,36 @@ class BoostTest(UnitTest):
boost_args += ['--color_output=false']
boost_args += ['--']
self.args = boost_args + self.args
self.casename = casename
self.__junit_etree = None
def get_junit_etree(self):
def adjust_suite_name(name):
# Normalize "path/to/file.cc" to "path.to.file" to conform to
# Jenkins expectations that the suite name is a class name. ".cc"
# doesn't add any information. Add the mode, otherwise failures
# in different modes are indistinguishable. The "test/" prefix adds
# no information, so remove it.
import re
name = re.sub(r'^test/', '', name)
name = re.sub(r'\.cc$', '', name)
name = re.sub(r'/', '.', name)
name = f'{name}.{self.mode}'
return name
if self.__junit_etree is None:
self.__junit_etree = ET.parse(self.xmlout)
root = self.__junit_etree.getroot()
suites = root.findall('.//TestSuite')
for suite in suites:
suite.attrib['name'] = adjust_suite_name(suite.attrib['name'])
skipped = suite.findall('./TestCase[@reason="disabled"]')
for e in skipped:
suite.remove(e)
os.unlink(self.xmlout)
return self.__junit_etree
def check_log(self, trim):
ET.parse(self.xmlout)
self.get_junit_etree()
super().check_log(trim)
@@ -800,6 +829,17 @@ def write_junit_report(tmpdir, mode):
with open(junit_filename, "w") as f:
ET.ElementTree(xml_results).write(f, encoding="unicode")
def write_consolidated_boost_junit_xml(tmpdir, mode):
xml = ET.Element("TestLog")
for suite in TestSuite.suites.values():
for test in suite.tests:
if test.mode != mode:
continue
test_xml = test.get_junit_etree()
if test_xml is not None:
xml.extend(test_xml.getroot().findall('.//TestSuite'))
et = ET.ElementTree(xml)
et.write(f'{tmpdir}/{mode}/xml/boost.xunit.xml', encoding='unicode')
def open_log(tmpdir):
pathlib.Path(tmpdir).mkdir(parents=True, exist_ok=True)
@@ -839,6 +879,7 @@ async def main():
for mode in options.modes:
write_junit_report(options.tmpdir, mode)
write_consolidated_boost_junit_xml(options.tmpdir, mode)
if 'coverage' in options.modes:
coverage.generate_coverage_report("build/coverage", "tests")

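The `adjust_suite_name` normalization in the test.py hunk above is a three-step regex rewrite: strip the `test/` prefix, strip the `.cc` suffix, turn path separators into dots, then append the build mode. Re-expressed in C++ with `std::regex` for illustration (the Python `re.sub` calls above are the authoritative version):

```cpp
#include <cassert>
#include <regex>
#include <string>

// "test/boost/foo_test.cc" in mode "release" -> "boost.foo_test.release"
std::string adjust_suite_name(std::string name, const std::string& mode) {
    name = std::regex_replace(name, std::regex("^test/"), "");
    name = std::regex_replace(name, std::regex("\\.cc$"), "");
    name = std::regex_replace(name, std::regex("/"), ".");
    return name + "." + mode;
}
```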

@@ -16,6 +16,9 @@
# along with Scylla. If not, see <http://www.gnu.org/licenses/>.
# Tests for basic table operations: CreateTable, DeleteTable, ListTables.
# Also some basic tests for UpdateTable - although UpdateTable usually
# enables more elaborate features (such as GSI or Streams) and those are
# tested elsewhere.
import pytest
from botocore.exceptions import ClientError
@@ -311,3 +314,17 @@ def test_table_sse_off(dynamodb):
KeySchema=[{ 'AttributeName': 'p', 'KeyType': 'HASH' }],
AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }]);
table.delete();
# Test that trying to delete a table that doesn't exist fails in the
# appropriate way (ResourceNotFoundException)
def test_delete_table_non_existent(dynamodb, test_table):
client = dynamodb.meta.client
with pytest.raises(ClientError, match='ResourceNotFoundException'):
client.delete_table(TableName=random_string(20))
# Test that trying to update a table that doesn't exist fails in the
# appropriate way (ResourceNotFoundException)
def test_update_table_non_existent(dynamodb, test_table):
client = dynamodb.meta.client
with pytest.raises(ClientError, match='ResourceNotFoundException'):
client.update_table(TableName=random_string(20), BillingMode='PAY_PER_REQUEST')


@@ -1043,6 +1043,20 @@ def test_nested_attribute_remove_from_missing_item(test_table_s):
test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE x.y')
test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE x[0]')
# Though in an above test (test_nested_attribute_update_bad_path_dot) we
# showed that DynamoDB does not allow REMOVE x.y if attribute x doesn't
# exist - and generates a ValidationException, if x *does* exist but y
# doesn't, it's fine and the removal should just be silently ignored.
def test_nested_attribute_remove_missing_leaf(test_table_s):
p = random_string()
item = {'p': p, 'a': {'x': 3}, 'b': ['hi']}
test_table_s.put_item(Item=item)
test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE a.y')
test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE b[7]')
test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE c')
# The above UpdateItem calls didn't change anything...
assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == item
# Similarly for other types of bad paths - using [0] on something which
# doesn't exist or isn't an array.
def test_nested_attribute_update_bad_path_array(test_table_s):


@@ -19,6 +19,7 @@
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include <boost/range/irange.hpp>
#include <seastar/testing/test_case.hh>
#include <seastar/testing/thread_test_case.hh>
#include <seastar/core/iostream.hh>
@@ -49,6 +50,15 @@ static sstring read_to_string(cached_file::stream& s, size_t limit = std::numeri
return b.substr(0, limit);
}
static void read_to_void(cached_file::stream& s, size_t limit = std::numeric_limits<size_t>::max()) {
while (auto buf = s.next().get0()) {
if (buf.size() >= limit) {
break;
}
limit -= buf.size();
}
}
static sstring read_to_string(file& f, size_t start, size_t len) {
file_input_stream_options opt;
auto in = make_file_input_stream(f, start, len, opt);
@@ -61,6 +71,12 @@ static sstring read_to_string(cached_file& cf, size_t off, size_t limit = std::n
return read_to_string(s, limit);
}
[[gnu::unused]]
static void read_to_void(cached_file& cf, size_t off, size_t limit = std::numeric_limits<size_t>::max()) {
auto s = cf.read(off, default_priority_class(), std::nullopt);
read_to_void(s, limit);
}
struct test_file {
tmpdir dir;
file f;
@@ -204,7 +220,9 @@ SEASTAR_THREAD_TEST_CASE(test_eviction_via_lru) {
}
{
cf_lru.evict_all();
with_allocator(region.allocator(), [] {
cf_lru.evict_all();
});
BOOST_REQUIRE_EQUAL(0, metrics.cached_bytes); // change here
BOOST_REQUIRE_EQUAL(0, cf.cached_bytes()); // change here
@@ -212,6 +230,8 @@ SEASTAR_THREAD_TEST_CASE(test_eviction_via_lru) {
BOOST_REQUIRE_EQUAL(3, metrics.page_evictions); // change here
BOOST_REQUIRE_EQUAL(0, metrics.page_hits);
BOOST_REQUIRE_EQUAL(3, metrics.page_populations);
BOOST_REQUIRE_EQUAL(region.occupancy().used_space(), 0);
}
{
@@ -255,6 +275,88 @@ SEASTAR_THREAD_TEST_CASE(test_eviction_via_lru) {
}
}
// A file which serves garbage but is very fast.
class garbage_file_impl : public file_impl {
private:
[[noreturn]] void unsupported() {
throw_with_backtrace<std::logic_error>("unsupported operation");
}
public:
// unsupported
virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override { unsupported(); }
virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override { unsupported(); }
virtual future<> flush(void) override { unsupported(); }
virtual future<> truncate(uint64_t length) override { unsupported(); }
virtual future<> discard(uint64_t offset, uint64_t length) override { unsupported(); }
virtual future<> allocate(uint64_t position, uint64_t length) override { unsupported(); }
virtual subscription<directory_entry> list_directory(std::function<future<>(directory_entry)>) override { unsupported(); }
virtual future<struct stat> stat(void) override { unsupported(); }
virtual future<uint64_t> size(void) override { unsupported(); }
virtual std::unique_ptr<seastar::file_handle_impl> dup() override { unsupported(); }
virtual future<> close() override { return make_ready_future<>(); }
virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t size, const io_priority_class& pc) override {
return make_ready_future<temporary_buffer<uint8_t>>(temporary_buffer<uint8_t>(size));
}
virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) override {
unsupported(); // FIXME
}
virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override {
unsupported(); // FIXME
}
};
#ifndef SEASTAR_DEFAULT_ALLOCATOR // Eviction works only with the seastar allocator
SEASTAR_THREAD_TEST_CASE(test_stress_eviction) {
auto page_size = cached_file::page_size;
auto n_pages = 8'000'000 / page_size;
auto file_size = page_size * n_pages;
auto cached_size = 4'000'000;
cached_file::metrics metrics;
logalloc::region region;
auto f = file(make_shared<garbage_file_impl>());
cached_file cf(f, metrics, cf_lru, region, file_size);
region.make_evictable([&] {
testlog.trace("Evicting");
cf.invalidate_at_most_front(file_size / 2);
return cf_lru.evict();
});
for (int i = 0; i < (cached_size / page_size); ++i) {
read_to_string(cf, page_size * i, page_size);
}
testlog.debug("Saturating memory...");
// Disable background reclaiming, which would otherwise prevent bugs from reproducing.
// We want reclamation to happen synchronously with page cache population in read_to_void().
seastar::memory::set_min_free_pages(0);
// Saturate std memory
chunked_fifo<bytes> blobs;
auto rc = region.reclaim_counter();
while (region.reclaim_counter() == rc) {
blobs.emplace_back(bytes(bytes::initialized_later(), 1024));
}
testlog.debug("Memory: allocated={}, free={}", seastar::memory::stats().allocated_memory(), seastar::memory::stats().free_memory());
testlog.debug("Starting test...");
for (int j = 0; j < n_pages * 16; ++j) {
testlog.trace("Allocating");
auto stride = tests::random::get_int(1, 20);
auto page_idx = tests::random::get_int(n_pages - stride);
read_to_void(cf, page_idx * page_size, page_size * stride);
}
}
#endif
SEASTAR_THREAD_TEST_CASE(test_invalidation) {
auto page_size = cached_file::page_size;
test_file tf = make_test_file(page_size * 2);


@@ -44,7 +44,9 @@
#include "test/lib/tmpdir.hh"
#include "db/commitlog/commitlog.hh"
#include "db/commitlog/commitlog_replayer.hh"
#include "db/commitlog/commitlog_extensions.hh"
#include "db/commitlog/rp_set.hh"
#include "db/extensions.hh"
#include "log.hh"
#include "service/priority_manager.hh"
#include "test/lib/exception_utils.hh"
@@ -947,3 +949,113 @@ SEASTAR_TEST_CASE(test_commitlog_deadlock_with_flush_threshold) {
co_await log.clear();
}
}
static future<> do_test_exception_in_allocate_ex(bool do_file_delete, bool reuse = true) {
commitlog::config cfg;
constexpr auto max_size_mb = 1;
cfg.commitlog_segment_size_in_mb = max_size_mb;
cfg.commitlog_total_space_in_mb = 2 * max_size_mb * smp::count;
cfg.commitlog_sync_period_in_ms = 10;
cfg.reuse_segments = reuse;
cfg.allow_going_over_size_limit = false; // #9348 - now can enforce size limit always
cfg.use_o_dsync = true; // make sure we pre-allocate.
// not using cl_test, because we need to be able to abandon
// the log.
tmpdir tmp;
cfg.commit_log_location = tmp.path().string();
class myfail : public std::exception {
public:
using std::exception::exception;
};
struct myext: public db::commitlog_file_extension {
public:
bool fail = false;
bool thrown = false;
bool do_file_delete;
myext(bool dd)
: do_file_delete(dd)
{}
seastar::future<seastar::file> wrap_file(const seastar::sstring& filename, seastar::file f, seastar::open_flags flags) override {
if (fail && !thrown) {
thrown = true;
if (do_file_delete) {
co_await f.close();
co_await seastar::remove_file(filename);
}
throw myfail{};
}
co_return f;
}
seastar::future<> before_delete(const seastar::sstring&) override {
co_return;
}
};
auto ep = std::make_unique<myext>(do_file_delete);
auto& mx = *ep;
db::extensions myexts;
myexts.add_commitlog_file_extension("hufflepuff", std::move(ep));
cfg.extensions = &myexts;
auto log = co_await commitlog::create_commitlog(cfg);
rp_set rps;
// uncomment for verbosity
// logging::logger_registry().set_logger_level("commitlog", logging::log_level::debug);
auto uuid = utils::UUID_gen::get_time_UUID();
auto size = log.max_record_size();
auto r = log.add_flush_handler([&](cf_id_type id, replay_position pos) {
log.discard_completed_segments(id, rps);
mx.fail = true;
});
try {
while (!mx.thrown) {
rp_handle h = co_await log.add_mutation(uuid, size, db::commitlog::force_sync::no, [&](db::commitlog::output& dst) {
dst.fill('1', size);
});
rps.put(std::move(h));
}
} catch (...) {
BOOST_FAIL("log write timed out. maybe it is deadlocked... Will not free log. ASAN errors and leaks will follow...");
}
co_await log.shutdown();
co_await log.clear();
}
/**
* Test generating an exception in segment file allocation
*/
SEASTAR_TEST_CASE(test_commitlog_exceptions_in_allocate_ex) {
co_await do_test_exception_in_allocate_ex(false);
}
SEASTAR_TEST_CASE(test_commitlog_exceptions_in_allocate_ex_no_recycle) {
co_await do_test_exception_in_allocate_ex(false, false);
}
/**
* Test generating an exception in segment file allocation, but also
* delete the file, which in turn should cause follow-up exceptions
* in cleanup delete, which CL should handle.
*/
SEASTAR_TEST_CASE(test_commitlog_exceptions_in_allocate_ex_deleted_file) {
co_await do_test_exception_in_allocate_ex(true);
}
SEASTAR_TEST_CASE(test_commitlog_exceptions_in_allocate_ex_deleted_file_no_recycle) {
co_await do_test_exception_in_allocate_ex(true, false);
}


@@ -22,6 +22,8 @@
#include <seastar/testing/test_case.hh>
#include "test/lib/cql_test_env.hh"
#include "test/lib/cql_assertions.hh"
#include "cql3/untyped_result_set.hh"
#include "cql3/query_processor.hh"
#include "transport/messages/result_message.hh"
SEASTAR_TEST_CASE(test_index_with_paging) {
@@ -56,3 +58,51 @@ SEASTAR_TEST_CASE(test_index_with_paging) {
});
});
}
SEASTAR_TEST_CASE(test_index_with_paging_with_base_short_read) {
return do_with_cql_env_thread([] (auto& e) {
e.execute_cql("CREATE TABLE tab (pk int, ck text, v int, v2 int, v3 text, PRIMARY KEY (pk, ck))").get();
e.execute_cql("CREATE INDEX ON tab (v)").get();
// Enough to trigger a short read on the base table during scan
sstring big_string(2 * query::result_memory_limiter::maximum_result_size, 'j');
const int row_count = 67;
for (int i = 0; i < row_count; ++i) {
e.execute_cql(format("INSERT INTO tab (pk, ck, v, v2, v3) VALUES ({}, 'hello{}', 1, {}, '{}')", i % 3, i, i, big_string)).get();
}
eventually([&] {
uint64_t count = 0;
e.qp().local().query_internal("SELECT * FROM ks.tab WHERE v = 1", [&] (const cql3::untyped_result_set_row&) {
++count;
return make_ready_future<stop_iteration>(stop_iteration::no);
}).get();
BOOST_REQUIRE_EQUAL(count, row_count);
});
});
}
SEASTAR_TEST_CASE(test_index_with_paging_with_base_short_read_no_ck) {
return do_with_cql_env_thread([] (auto& e) {
e.execute_cql("CREATE TABLE tab (pk int, v int, v2 int, v3 text, PRIMARY KEY (pk))").get();
e.execute_cql("CREATE INDEX ON tab (v)").get();
// Enough to trigger a short read on the base table during scan
sstring big_string(2 * query::result_memory_limiter::maximum_result_size, 'j');
const int row_count = 67;
for (int i = 0; i < row_count; ++i) {
e.execute_cql(format("INSERT INTO tab (pk, v, v2, v3) VALUES ({}, 1, {}, '{}')", i, i, big_string)).get();
}
eventually([&] {
uint64_t count = 0;
e.qp().local().query_internal("SELECT * FROM ks.tab WHERE v = 1", [&] (const cql3::untyped_result_set_row&) {
++count;
return make_ready_future<stop_iteration>(stop_iteration::no);
}).get();
BOOST_REQUIRE_EQUAL(count, row_count);
});
});
}


@@ -702,6 +702,7 @@ SEASTAR_TEST_CASE(test_cell_ordering) {
};
auto assert_equal = [] (atomic_cell_view c1, atomic_cell_view c2) {
testlog.trace("Expected {} == {}", c1, c2);
BOOST_REQUIRE(compare_atomic_cell_for_merge(c1, c2) == 0);
BOOST_REQUIRE(compare_atomic_cell_for_merge(c2, c1) == 0);
};
@@ -723,9 +724,11 @@ SEASTAR_TEST_CASE(test_cell_ordering) {
atomic_cell::make_live(*bytes_type, 1, bytes(), expiry_2, ttl_2));
// Origin doesn't compare ttl (is it wise?)
assert_equal(
atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_1),
atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_2));
// But we do. See https://github.com/scylladb/scylla/issues/10156
// and https://github.com/scylladb/scylla/issues/10173
assert_order(
atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_2),
atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_1));
assert_order(
atomic_cell::make_live(*bytes_type, 0, bytes("value1")),
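The ordering rationale from the comments above (for cells identical in everything but TTL, the smaller TTL wins, since it implies a later wall-clock write) follows from simple arithmetic: `expiry = write_time + ttl`. A minimal illustrative sketch, with hypothetical names that are not Scylla's API:

```python
# Illustrative sketch (not Scylla code): for two cells with equal expiry,
# expiry = write_time + ttl means the cell with the smaller ttl was written
# later, so it should win the merge.
def merge_winner(cell_a, cell_b):
    """Each cell is an (expiry, ttl) pair; equal expiry is assumed."""
    assert cell_a[0] == cell_b[0]
    return cell_a if cell_a[1] < cell_b[1] else cell_b

# expiry 100: ttl=10 means written at t=90; ttl=30 means written at t=70.
assert merge_winner((100, 10), (100, 30)) == (100, 10)
```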


@@ -210,6 +210,35 @@ BOOST_AUTO_TEST_CASE(test_overlapping_addition) {
BOOST_REQUIRE(it == l.end());
}
BOOST_AUTO_TEST_CASE(test_adjacent_empty_range_tombstone) {
range_tombstone_list l(*s);
l.apply(*s, rtie(1, 1, 2));
l.apply(*s, rt(1, 2, 3));
l.apply(*s, rtei(2, 2, 2));
l.apply(*s, rtei(2, 4, 3));
auto it = l.begin();
assert_rt(rt(1, 4, 3), *it++);
BOOST_REQUIRE(it == l.end());
}
BOOST_AUTO_TEST_CASE(test_empty_range_tombstones_are_dropped) {
range_tombstone_list l(*s);
l.apply(*s, rtei(0, 0, 1));
l.apply(*s, rtie(0, 0, 1));
l.apply(*s, rt(1, 2, 1));
l.apply(*s, rtei(4, 4, 1));
l.apply(*s, rtie(5, 5, 1));
l.apply(*s, rt(7, 8, 1));
auto it = l.begin();
assert_rt(rt(1, 2, 1), *it++);
assert_rt(rt(7, 8, 1), *it++);
BOOST_REQUIRE(it == l.end());
}
BOOST_AUTO_TEST_CASE(test_simple_overlap) {
range_tombstone_list l1(*s);
@@ -473,6 +502,23 @@ static std::vector<range_tombstone> make_random() {
rts.emplace_back(std::move(start_b), std::move(end_b), tombstone(dist(gen), gc_now));
}
int32_t size_empty = dist(gen) / 2;
for (int32_t i = 0; i < size_empty; ++i) {
clustering_key_prefix key = make_random_ckey();
bool start_incl = dist(gen) > 25;
if (start_incl) {
rts.emplace_back(
position_in_partition::before_key(key),
position_in_partition::before_key(key),
tombstone(dist(gen), gc_now));
} else {
rts.emplace_back(
position_in_partition::after_key(key),
position_in_partition::after_key(key),
tombstone(dist(gen), gc_now));
}
}
return rts;
}


@@ -37,11 +37,13 @@ static void add_entry(logalloc::region& r,
{
logalloc::allocating_section as;
as(r, [&] {
sstables::key sst_key = sstables::key::from_partition_key(s, key);
page._entries.push_back(make_managed<index_entry>(
managed_bytes(sst_key.get_bytes()),
position,
managed_ref<promoted_index>()));
with_allocator(r.allocator(), [&] {
sstables::key sst_key = sstables::key::from_partition_key(s, key);
page._entries.push_back(make_managed<index_entry>(
managed_bytes(sst_key.get_bytes()),
position,
managed_ref<promoted_index>()));
});
});
}


@@ -128,3 +128,16 @@ def test_operator_ne_not_supported(cql, table1):
cql.execute(f'SELECT a FROM {table1} WHERE a != 0')
with pytest.raises(InvalidRequest, match='Unsupported.*!='):
cql.execute(f'SELECT a FROM {table1} WHERE token(a) != 0')
# Test that the fact that a column is indexed does not cause us to fetch
# incorrect results from a filtering query (issue #10300).
def test_index_with_in_relation(scylla_only, cql, test_keyspace):
schema = 'p int, c int, v boolean, primary key (p,c)'
with new_test_table(cql, test_keyspace, schema) as table:
cql.execute(f"create index on {table}(v)")
for p, c, v in [(0,0,True),(0,1,False),(0,2,True),(0,3,False),
(1,0,True),(1,1,False),(1,2,True),(1,3,False),
(2,0,True),(2,1,False),(2,2,True),(2,3,False)]:
cql.execute(f"insert into {table} (p,c,v) values ({p}, {c}, {v})")
res = cql.execute(f"select * from {table} where p in (0,1) and v = False ALLOW FILTERING")
assert set(res) == set([(0,1,False),(0,3,False),(1,1,False), (1,3,False)])
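As a sanity check, the expected set in the test above can be re-derived by plain filtering over the same data, independent of any index path. A quick illustrative sketch (the data mirrors the inserts: v is True exactly when c is even):

```python
# Re-derive the expected rows by plain filtering, with no index involved:
# the test inserts v = (c is even) for p in 0..2, c in 0..3.
rows = [(p, c, c % 2 == 0) for p in range(3) for c in range(4)]
expected = {(p, c, v) for (p, c, v) in rows if p in (0, 1) and v is False}
assert expected == {(0, 1, False), (0, 3, False), (1, 1, False), (1, 3, False)}
```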


@@ -63,8 +63,9 @@ def test_insert_null_key(cql, table1):
with pytest.raises(InvalidRequest, match='null value'):
cql.execute(stmt, [None, s])
# Tests handling of "key_column in ?" where ? is bound to null.
# Reproduces issue #8265.
def test_primary_key_in_null(cql, table1):
'''Tests handling of "key_column in ?" where ? is bound to null.'''
with pytest.raises(InvalidRequest, match='null value'):
cql.execute(cql.prepare(f"SELECT p FROM {table1} WHERE p IN ?"), [None])
with pytest.raises(InvalidRequest, match='null value'):
@@ -159,6 +160,20 @@ def test_delete_empty_string_key(cql, table1):
with pytest.raises(InvalidRequest, match='Key may not be empty'):
cql.execute(f"DELETE FROM {table1} WHERE p='' AND c='{s}'")
# Another test like test_insert_empty_string_key() just using an INSERT JSON
# instead of a regular INSERT. Because INSERT JSON takes a different code path
# from regular INSERT, we need the emptiness test in yet another place.
# Reproduces issue #9853 (the empty-string partition key was allowed, and
# actually inserted into the table).
def test_insert_json_empty_string_key(cql, table1):
s = random_string()
# An empty-string clustering *is* allowed:
cql.execute("""INSERT INTO %s JSON '{"p": "%s", "c": "", "v": "cat"}'""" % (table1, s))
assert list(cql.execute(f"SELECT v FROM {table1} WHERE p='{s}' AND c=''")) == [('cat',)]
# But an empty-string partition key is *not* allowed, with a specific
# error that a "Key may not be empty":
with pytest.raises(InvalidRequest, match='Key may not be empty'):
cql.execute("""INSERT INTO %s JSON '{"p": "", "c": "%s", "v": "cat"}'""" % (table1, s))
# Although an empty string is not allowed as a partition key (as tested
# above by test_empty_string_key()), it turns out that in a *compound*


@@ -1,4 +1,14 @@
create table tb2 (pk int, ck int, PRIMARY KEY (pk, ck)) with compact storage and cdc = {'enabled': true, 'preimage': true, 'postimage': true};
-- Should add 3 rows (preimage + postimage + delta). Delta has only key columns and "pk" + "ck"
insert into tb2 (pk, ck) VALUES (2, 22) USING TTL 2222;
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck from tb2_scylla_cdc_log;
create table tb2 (pk int, ck int, v int, PRIMARY KEY (pk, ck)) with compact storage and cdc = {'enabled': true, 'preimage': true, 'postimage': true};
-- Should add 2 rows (postimage + delta).
insert into tb2 (pk, ck, v) VALUES (2, 22, 111) USING TTL 2222;
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck, v from tb2_scylla_cdc_log;
-- Should add 3 rows (preimage + postimage + delta).
insert into tb2 (pk, ck, v) VALUES (2, 22, 1111) USING TTL 2223;
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck, v from tb2_scylla_cdc_log;
create table tb3 (pk int, ck int, PRIMARY KEY (pk, ck)) with compact storage and cdc = {'enabled': true, 'preimage': true, 'postimage': true};
-- Should add 2 rows (postimage + delta).
insert into tb3 (pk, ck) VALUES (2, 22) USING TTL 2222;
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck from tb3_scylla_cdc_log;
-- Should add 3 rows (preimage + postimage + delta).
insert into tb3 (pk, ck) VALUES (2, 22) USING TTL 2223;
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck from tb3_scylla_cdc_log;


@@ -1,13 +1,91 @@
create table tb2 (pk int, ck int, PRIMARY KEY (pk, ck)) with compact storage and cdc = {'enabled': true, 'preimage': true, 'postimage': true};
create table tb2 (pk int, ck int, v int, PRIMARY KEY (pk, ck)) with compact storage and cdc = {'enabled': true, 'preimage': true, 'postimage': true};
{
"status" : "ok"
}
-- Should add 3 rows (preimage + postimage + delta). Delta has only key columns and "pk" + "ck"
insert into tb2 (pk, ck) VALUES (2, 22) USING TTL 2222;
-- Should add 2 rows (postimage + delta).
insert into tb2 (pk, ck, v) VALUES (2, 22, 111) USING TTL 2222;
{
"status" : "ok"
}
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck from tb2_scylla_cdc_log;
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck, v from tb2_scylla_cdc_log;
{
"rows" :
[
{
"cdc$batch_seq_no" : "0",
"cdc$operation" : "1",
"cdc$ttl" : "2222",
"ck" : "22",
"pk" : "2",
"v" : "111"
},
{
"cdc$batch_seq_no" : "1",
"cdc$operation" : "9",
"ck" : "22",
"pk" : "2",
"v" : "111"
}
]
}
-- Should add 3 rows (preimage + postimage + delta).
insert into tb2 (pk, ck, v) VALUES (2, 22, 1111) USING TTL 2223;
{
"status" : "ok"
}
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck, v from tb2_scylla_cdc_log;
{
"rows" :
[
{
"cdc$batch_seq_no" : "0",
"cdc$operation" : "1",
"cdc$ttl" : "2222",
"ck" : "22",
"pk" : "2",
"v" : "111"
},
{
"cdc$batch_seq_no" : "1",
"cdc$operation" : "9",
"ck" : "22",
"pk" : "2",
"v" : "111"
},
{
"cdc$batch_seq_no" : "0",
"cdc$operation" : "0",
"ck" : "22",
"pk" : "2",
"v" : "111"
},
{
"cdc$batch_seq_no" : "1",
"cdc$operation" : "1",
"cdc$ttl" : "2223",
"ck" : "22",
"pk" : "2",
"v" : "1111"
},
{
"cdc$batch_seq_no" : "2",
"cdc$operation" : "9",
"ck" : "22",
"pk" : "2",
"v" : "1111"
}
]
}
create table tb3 (pk int, ck int, PRIMARY KEY (pk, ck)) with compact storage and cdc = {'enabled': true, 'preimage': true, 'postimage': true};
{
"status" : "ok"
}
-- Should add 2 rows (postimage + delta).
insert into tb3 (pk, ck) VALUES (2, 22) USING TTL 2222;
{
"status" : "ok"
}
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck from tb3_scylla_cdc_log;
{
"rows" :
[
@@ -26,3 +104,46 @@ select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck from tb2_scylla_cd
}
]
}
-- Should add 3 rows (preimage + postimage + delta).
insert into tb3 (pk, ck) VALUES (2, 22) USING TTL 2223;
{
"status" : "ok"
}
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck from tb3_scylla_cdc_log;
{
"rows" :
[
{
"cdc$batch_seq_no" : "0",
"cdc$operation" : "1",
"cdc$ttl" : "2222",
"ck" : "22",
"pk" : "2"
},
{
"cdc$batch_seq_no" : "1",
"cdc$operation" : "9",
"ck" : "22",
"pk" : "2"
},
{
"cdc$batch_seq_no" : "0",
"cdc$operation" : "0",
"ck" : "22",
"pk" : "2"
},
{
"cdc$batch_seq_no" : "1",
"cdc$operation" : "1",
"cdc$ttl" : "2223",
"ck" : "22",
"pk" : "2"
},
{
"cdc$batch_seq_no" : "2",
"cdc$operation" : "9",
"ck" : "22",
"pk" : "2"
}
]
}


@@ -157,6 +157,7 @@ private:
metrics& _metrics;
lru& _lru;
logalloc::region& _region;
logalloc::allocating_section _as;
using cache_type = bplus::tree<page_idx_type, cached_page, page_idx_less_comparator, 12, bplus::key_search::linear>;
cache_type _cache;
@@ -187,10 +188,15 @@ private:
.then([this, idx] (temporary_buffer<char>&& buf) mutable {
cached_page::ptr_type first_page;
while (buf.size()) {
auto this_buf = buf.share();
this_buf.trim(std::min(page_size, buf.size()));
buf.trim_front(this_buf.size());
auto it_and_flag = _cache.emplace(idx, this, idx, std::move(this_buf));
auto this_size = std::min(page_size, buf.size());
// _cache.emplace() needs to run under allocating section even though it lives in the std space
// because bplus::tree operations are not reentrant, so we need to prevent memory reclamation.
auto it_and_flag = _as(_region, [&] {
auto this_buf = buf.share();
this_buf.trim(this_size);
return _cache.emplace(idx, this, idx, std::move(this_buf));
});
buf.trim_front(this_size);
++idx;
cached_page &cp = *it_and_flag.first;
if (it_and_flag.second) {
@@ -333,6 +339,7 @@ public:
}
size_t evict_range(cache_type::iterator start, cache_type::iterator end) noexcept {
return with_allocator(standard_allocator(), [&] {
size_t count = 0;
auto disposer = [] (auto* p) noexcept {};
while (start != end) {
@@ -345,6 +352,7 @@ public:
}
}
return count;
});
}
public:
/// \brief Constructs a cached_file.
@@ -471,8 +479,10 @@ public:
inline
void cached_file::cached_page::on_evicted() noexcept {
parent->on_evicted(*this);
cached_file::cache_type::iterator it(this);
it.erase(page_idx_less_comparator());
with_allocator(standard_allocator(), [this] {
cached_file::cache_type::iterator it(this);
it.erase(page_idx_less_comparator());
});
}
class cached_file_impl : public file_impl {
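The page-splitting loop in the hunk above carves one contiguous read into page-sized cache entries keyed by page index (the last entry may be short). A minimal sketch of the same chunking, using a hypothetical helper in Python for brevity:

```python
def split_into_pages(buf: bytes, page_size: int, first_idx: int) -> dict:
    """Carve a contiguous buffer into page_size chunks keyed by page index,
    mirroring the emplace loop above; the last page may be shorter."""
    pages = {}
    idx = first_idx
    while buf:
        this_size = min(page_size, len(buf))
        pages[idx] = buf[:this_size]
        buf = buf[this_size:]
        idx += 1
    return pages

# 7 bytes split into 3-byte pages starting at index 10.
assert split_into_pages(b"abcdefg", 3, 10) == {10: b"abc", 11: b"def", 12: b"g"}
```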


@@ -597,6 +597,10 @@ static constexpr auto max_used_space_ratio_for_compaction = 0.85;
static constexpr size_t max_used_space_for_compaction = segment_size * max_used_space_ratio_for_compaction;
static constexpr size_t min_free_space_for_compaction = segment_size - max_used_space_for_compaction;
struct [[gnu::packed]] non_lsa_object_cookie {
uint64_t value = 0xbadcaffe;
};
static_assert(min_free_space_for_compaction >= max_managed_object_size,
"Segments which cannot fit max_managed_object_size must not be considered compactible for the sake of forward progress of compaction");
@@ -840,9 +844,13 @@ public:
void clear_allocation_failure_flag() { _allocation_failure_flag = false; }
bool allocation_failure_flag() { return _allocation_failure_flag; }
void refill_emergency_reserve();
void update_non_lsa_memory_in_use(ssize_t n) {
void add_non_lsa_memory_in_use(size_t n) {
_non_lsa_memory_in_use += n;
}
void subtract_non_lsa_memory_in_use(size_t n) {
assert(_non_lsa_memory_in_use >= n);
_non_lsa_memory_in_use -= n;
}
size_t non_lsa_memory_in_use() const {
return _non_lsa_memory_in_use;
}
@@ -1395,6 +1403,8 @@ private:
}
lsa_buffer alloc_buf(size_t buf_size) {
// Note: Can be re-entered from allocation sites below due to memory reclamation which
// invokes segment compaction.
static_assert(segment::size % buf_align == 0);
if (buf_size > segment::size) {
throw_with_backtrace<std::runtime_error>(format("Buffer size {} too large", buf_size));
@@ -1447,6 +1457,7 @@ private:
if (seg != _buf_active) {
if (desc.is_empty()) {
assert(desc._buf_pointers.empty());
_segment_descs.erase(desc);
desc._buf_pointers = std::vector<entangled>();
free_segment(seg, desc);
@@ -1457,7 +1468,7 @@ private:
}
}
void compact_segment_locked(segment* seg, segment_descriptor& desc) {
void compact_segment_locked(segment* seg, segment_descriptor& desc) noexcept {
auto seg_occupancy = desc.occupancy();
llogger.debug("Compacting segment {} from region {}, {}", fmt::ptr(seg), id(), seg_occupancy);
@@ -1472,6 +1483,7 @@ private:
for (entangled& e : _buf_ptrs_for_compact_segment) {
if (e) {
lsa_buffer* old_ptr = e.get(&lsa_buffer::_link);
assert(&desc == old_ptr->_desc);
lsa_buffer dst = alloc_buf(old_ptr->_size);
memcpy(dst._buf, old_ptr->_buf, dst._size);
old_ptr->_link = std::move(dst._link);
@@ -1502,6 +1514,10 @@ private:
std::vector<entangled> ptrs;
ptrs.reserve(segment::size / buf_align);
segment* new_active = new_segment();
if (_buf_active) [[unlikely]] {
// Memory allocation above could allocate active buffer during segment compaction.
close_buf_active();
}
assert((uintptr_t)new_active->at(0) % buf_align == 0);
segment_descriptor& desc = shard_segment_pool.descriptor(new_active);
desc._buf_pointers = std::move(ptrs);
@@ -1635,17 +1651,18 @@ public:
memory::on_alloc_point();
shard_segment_pool.on_memory_allocation(size);
if (size > max_managed_object_size) {
auto ptr = standard_allocator().alloc(migrator, size, alignment);
auto ptr = standard_allocator().alloc(migrator, size + sizeof(non_lsa_object_cookie), alignment);
// This isn't very accurate, the correct free_space value would be
// malloc_usable_size(ptr) - size, but there is no way to get
// the exact object size at free.
auto allocated_size = malloc_usable_size(ptr);
new ((char*)ptr + allocated_size - sizeof(non_lsa_object_cookie)) non_lsa_object_cookie();
_non_lsa_occupancy += occupancy_stats(0, allocated_size);
if (_group) {
_evictable_space += allocated_size;
_group->increase_usage(_heap_handle, allocated_size);
}
shard_segment_pool.update_non_lsa_memory_in_use(allocated_size);
shard_segment_pool.add_non_lsa_memory_in_use(allocated_size);
return ptr;
} else {
auto ptr = alloc_small(object_descriptor(migrator), (segment::size_type) size, alignment);
@@ -1657,12 +1674,14 @@ public:
private:
void on_non_lsa_free(void* obj) noexcept {
auto allocated_size = malloc_usable_size(obj);
auto cookie = (non_lsa_object_cookie*)((char*)obj + allocated_size) - 1;
assert(cookie->value == non_lsa_object_cookie().value);
_non_lsa_occupancy -= occupancy_stats(0, allocated_size);
if (_group) {
_evictable_space -= allocated_size;
_group->decrease_usage(_heap_handle, allocated_size);
}
shard_segment_pool.update_non_lsa_memory_in_use(-allocated_size);
shard_segment_pool.subtract_non_lsa_memory_in_use(allocated_size);
}
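The non_lsa_object_cookie hunks above append a known marker just past each large (non-LSA) allocation and assert it on free, catching size-accounting mismatches and overruns. A hypothetical sketch of the same trailing-canary idea:

```python
COOKIE = 0xBADCAFFE  # same marker value as in the diff

def alloc_with_cookie(size: int) -> bytearray:
    """Allocate size bytes plus a trailing 4-byte cookie, analogous to
    allocating size + sizeof(non_lsa_object_cookie) and writing the
    cookie at the end of the usable block."""
    buf = bytearray(size + 4)
    buf[size:] = COOKIE.to_bytes(4, "little")
    return buf

def cookie_intact(buf: bytearray) -> bool:
    """Mirror of the assert in on_non_lsa_free(): the last 4 bytes must match."""
    return int.from_bytes(buf[-4:], "little") == COOKIE

buf = alloc_with_cookie(16)
assert cookie_intact(buf)
buf[-1] ^= 0xFF  # corrupt the cookie: the check now fails
assert not cookie_intact(buf)
```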
public:
virtual void free(void* obj) noexcept override {
@@ -2188,8 +2207,8 @@ private:
auto info_level = _stall_detected ? log_level::info : log_level::debug;
auto MiB = 1024*1024;
timing_logger.log(time_level, "Reclamation cycle took {} ms, trying to release {:.3f} MiB {}preemptibly",
_duration.count(), (float)_memory_to_release / MiB, _preemptible ? "" : "non-");
timing_logger.log(time_level, "Reclamation cycle took {} us, trying to release {:.3f} MiB {}preemptibly",
_duration / 1us, (float)_memory_to_release / MiB, _preemptible ? "" : "non-");
log_if_any(info_level, "reserved segments", _reserve_segments);
if (_memory_released > 0) {
auto bytes_per_second =