Compare commits: next...scylla-4.6 (84 commits)
Commits (SHA1):

8bf149fdd6, 0265d56173, e50452ba43, a205f644cb, f136b5b950, 69a1325884, ab153c9b94,
eb372d7f03, e232711e7e, 0a440b6d4a, 00bb1e8145, e30dbee2db, 2309d6b51e, b77ca07709,
bb0a38f889, c48fd03463, eb78e6d4b8, 4b1b0a55c0, 172a8628d5, 5688b125e6, 6da4acb41e,
f09cc9a01d, cd2e33ede4, 32d0698d78, 93cf43ae4b, 2f2d22a864, 5f92f54f06, 395f2459b4,
019d50bb5c, bbe775b926, 469c94ea17, 4c780d0265, 0181de1f2c, 7597a79ef9, 8f5148e921,
5694ec189f, 34d470967a, 61db571a44, 5b5a300a9e, 148a65d0d6, e3ad14d55f, 2b506c2d4a,
50aad1c668, 7bf3f37cd1, 0f7f8585f2, 2c65c4a569, f85cd289bc, 5e661af9a4, 5629b67d25,
ad632cf7fc, ca24bebcf2, 7dc5abb6f8, e8a1cfb6f8, fc312b3021, 7b82aaf939, 894a4abfae,
4dcf023470, 283788828e, 730a147ba6, 9897e83029, 1a9b64e6f6, 49fe9e2c8e, d0580c41ee,
542394c82f, 018ad3f6f4, 9b8b7efb54, 1c3e63975f, 11bb03e46d, 810e410c5d, 97f6da0c3e,
c229fe9694, ee1ca8ae4d, 6bfd322e3b, afc18d5070, 2ec22c2404, 19da778271, cbd4c13ba6,
338871802d, 8b5b1b8af6, ea89eff95d, 96421e7779, 142336ca53, 492f12248c, 7eb7a0e5fe
.gitmodules (2 changes, vendored)

```diff
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
```
```diff
@@ -60,7 +60,7 @@ fi
 
 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=4.6.dev
+VERSION=4.6.3
 
 if test -f version
 then
```
||||
```diff
@@ -1017,18 +1017,16 @@ future<executor::request_return_type> executor::update_table(client_state& clien
     _stats.api_operations.update_table++;
     elogger.trace("Updating table {}", request);
 
-    std::string table_name = get_table_name(request);
-    if (table_name.find(INTERNAL_TABLE_PREFIX) == 0) {
+    schema_ptr tab = get_table(_proxy, request);
+    // the ugly but harmless conversion to string_view here is because
+    // Seastar's sstring is missing a find(std::string_view) :-()
+    if (std::string_view(tab->cf_name()).find(INTERNAL_TABLE_PREFIX) == 0) {
         return make_ready_future<request_return_type>(api_error::validation(
             format("Prefix {} is reserved for accessing internal tables", INTERNAL_TABLE_PREFIX)));
     }
-    std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
-    tracing::add_table_name(trace_state, keyspace_name, table_name);
+    tracing::add_table_name(trace_state, tab->ks_name(), tab->cf_name());
 
-    auto& db = _proxy.get_db().local();
-    auto& cf = db.find_column_family(keyspace_name, table_name);
-
-    schema_builder builder(cf.schema());
+    schema_builder builder(tab);
 
     rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
     if (stream_specification && stream_specification->IsObject()) {
@@ -2481,8 +2479,8 @@ static bool hierarchy_actions(
             // attr member so we can use add()
             rjson::add_with_string_name(v, attr, std::move(*newv));
         } else {
-            throw api_error::validation(format("Can't remove document path {} - not present in item",
-                subh.get_value()._path));
+            // Removing a.b when a is a map but a.b doesn't exist
+            // is silently ignored. It's not considered an error.
         }
     } else {
         throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
```
```diff
@@ -79,6 +79,49 @@ atomic_cell::atomic_cell(const abstract_type& type, atomic_cell_view other)
     set_view(_data);
 }
 
+// Based on:
+//  - org.apache.cassandra.db.AbstractCell#reconcile()
+//  - org.apache.cassandra.db.BufferExpiringCell#reconcile()
+//  - org.apache.cassandra.db.BufferDeletedCell#reconcile()
+std::strong_ordering
+compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
+    if (left.timestamp() != right.timestamp()) {
+        return left.timestamp() <=> right.timestamp();
+    }
+    if (left.is_live() != right.is_live()) {
+        return left.is_live() ? std::strong_ordering::less : std::strong_ordering::greater;
+    }
+    if (left.is_live()) {
+        auto c = compare_unsigned(left.value(), right.value()) <=> 0;
+        if (c != 0) {
+            return c;
+        }
+        if (left.is_live_and_has_ttl() != right.is_live_and_has_ttl()) {
+            // prefer expiring cells.
+            return left.is_live_and_has_ttl() ? std::strong_ordering::greater : std::strong_ordering::less;
+        }
+        if (left.is_live_and_has_ttl()) {
+            if (left.expiry() != right.expiry()) {
+                return left.expiry() <=> right.expiry();
+            } else {
+                // prefer the cell that was written later,
+                // so it survives longer after it expires, until purged.
+                return right.ttl() <=> left.ttl();
+            }
+        }
+    } else {
+        // Both are deleted
+
+        // Origin compares big-endian serialized deletion time. That's because it
+        // delegates to AbstractCell.reconcile() which compares values after
+        // comparing timestamps, which in case of deleted cells will hold
+        // serialized expiry.
+        return (uint64_t) left.deletion_time().time_since_epoch().count()
+               <=> (uint64_t) right.deletion_time().time_since_epoch().count();
+    }
+    return std::strong_ordering::equal;
+}
+
 atomic_cell_or_collection atomic_cell_or_collection::copy(const abstract_type& type) const {
     if (_data.empty()) {
         return atomic_cell_or_collection();
```
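The hunk above moves cell reconciliation onto C++20's `std::strong_ordering`, so a single comparator defines a total order over conflicting cells. A minimal stand-alone sketch of how such a comparator drives a merge — toy types, not Scylla's actual ones:

```cpp
#include <compare>

// Toy cell: a write timestamp plus an opaque value.
struct cell { long ts; int value; };

// Same shape as compare_atomic_cell_for_merge(): compare timestamps first,
// then apply a deterministic tie-break so every replica picks the same winner.
std::strong_ordering compare_cells(const cell& a, const cell& b) {
    if (a.ts != b.ts) {
        return a.ts <=> b.ts;
    }
    return a.value <=> b.value;
}

const cell& reconcile(const cell& a, const cell& b) {
    // The "greater" cell wins the merge; a tie keeps the left operand.
    return compare_cells(a, b) < 0 ? b : a;
}
```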
```diff
@@ -593,8 +593,8 @@ void cache_flat_mutation_reader::move_to_range(query::clustering_row_ranges::con
     clogger.trace("csm {}: insert dummy at {}", fmt::ptr(this), _lower_bound);
     auto it = with_allocator(_lsa_manager.region().allocator(), [&] {
         auto& rows = _snp->version()->partition().mutable_clustered_rows();
-        auto new_entry = current_allocator().construct<rows_entry>(*_schema, _lower_bound, is_dummy::yes, is_continuous::no);
-        return rows.insert_before(_next_row.get_iterator_in_latest_version(), *new_entry);
+        auto new_entry = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(*_schema, _lower_bound, is_dummy::yes, is_continuous::no));
+        return rows.insert_before(_next_row.get_iterator_in_latest_version(), std::move(new_entry));
     });
     _snp->tracker()->insert(*it);
     _last_row = partition_snapshot_row_weakref(*_snp, it, true);
```
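This hunk (and the matching `partition_version` hunks further down) swaps a raw construct-then-insert sequence for a unique-pointer handoff, so the freshly constructed `rows_entry` cannot leak if the insertion path throws. A minimal sketch of the idiom, with `std::unique_ptr` and a toy container standing in for `alloc_strategy_unique_ptr` and the intrusive row tree:

```cpp
#include <memory>
#include <vector>

struct entry { int key; };

struct toy_rows {
    std::vector<std::unique_ptr<entry>> v;
    // Ownership transfers only when push_back() succeeds; if anything throws
    // before that point, the caller's unique_ptr still frees the entry.
    entry* insert_back(std::unique_ptr<entry> e) {
        v.push_back(std::move(e));
        return v.back().get();
    }
};

int main() {
    toy_rows rows;
    auto e = std::make_unique<entry>(entry{42});
    entry* inserted = rows.insert_back(std::move(e)); // no leak on throw
    (void)inserted;
}
```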
```diff
@@ -1511,6 +1511,11 @@ public:
     }
 
     auto process_cell = [&, this] (const column_definition& cdef) {
+        // If table uses compact storage it may contain a column of type empty
+        // and we need to ignore such a field because it is not present in CDC log.
+        if (cdef.type->get_kind() == abstract_type::kind::empty) {
+            return;
+        }
         if (auto current = get_col_from_row_state(row_state, cdef)) {
             _builder->set_value(image_ck, cdef, *current);
         } else if (op == operation::pre_image) {
```
```diff
@@ -1634,7 +1634,7 @@ future<bool> scrub_validate_mode_validate_reader(flat_mutation_reader reader, co
     while (auto mf_opt = co_await reader()) {
         if (cdata.is_stop_requested()) [[unlikely]] {
             // Compaction manager will catch this exception and re-schedule the compaction.
-            co_return coroutine::make_exception(compaction_stopped_exception(schema->ks_name(), schema->cf_name(), cdata.stop_requested));
+            throw compaction_stopped_exception(schema->ks_name(), schema->cf_name(), cdata.stop_requested);
         }
 
         const auto& mf = *mf_opt;
```
```diff
@@ -326,6 +326,11 @@ future<> compaction_manager::run_custom_job(column_family* cf, sstables::compact
     task->compaction_done = with_semaphore(_custom_job_sem, 1, [this, task, cf, &job = *job_ptr] () mutable {
         // take read lock for cf, so major compaction and resharding can't proceed in parallel.
         return with_lock(_compaction_locks[cf].for_read(), [this, task, cf, &job] () mutable {
+            // Allow caller to know that task (e.g. reshape) was asked to stop while waiting for a chance to run.
+            if (task->compaction_data.is_stop_requested()) {
+                throw sstables::compaction_stopped_exception(task->compacting_cf->schema()->ks_name(), task->compacting_cf->schema()->cf_name(),
+                    task->compaction_data.stop_requested);
+            }
             _stats.active_tasks++;
             if (!can_proceed(task)) {
                 return make_ready_future<>();
@@ -737,8 +742,10 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
     column_family& cf = *task->compacting_cf;
     auto sstable_level = sst->get_sstable_level();
     auto run_identifier = sst->run_identifier();
 
     auto sstable_set_snapshot = can_purge ? std::make_optional(cf.get_sstable_set()) : std::nullopt;
-    auto descriptor = sstables::compaction_descriptor({ sst }, std::move(sstable_set_snapshot), _maintenance_sg.io,
+    // FIXME: this compaction should run with maintenance priority.
+    auto descriptor = sstables::compaction_descriptor({ sst }, std::move(sstable_set_snapshot), service::get_local_compaction_priority(),
         sstable_level, sstables::compaction_descriptor::default_max_sstable_bytes, run_identifier, options);
 
     // Releases reference to cleaned sstable such that respective used disk space can be freed.
@@ -747,15 +754,14 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
     };
 
     return with_semaphore(_rewrite_sstables_sem, 1, [this, task, &cf, descriptor = std::move(descriptor), compacting] () mutable {
-        // Take write lock for cf to serialize cleanup/upgrade sstables/scrub with major compaction/reshape/reshard.
-        return with_lock(_compaction_locks[&cf].for_write(), [this, task, &cf, descriptor = std::move(descriptor), compacting] () mutable {
+        return with_lock(_compaction_locks[&cf].for_read(), [this, task, &cf, descriptor = std::move(descriptor), compacting] () mutable {
             _stats.pending_tasks--;
             _stats.active_tasks++;
             task->setup_new_compaction();
             task->output_run_identifier = descriptor.run_identifier;
             compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_compaction_controller.backlog_of_shares(200), _available_memory));
             return do_with(std::move(user_initiated), [this, &cf, descriptor = std::move(descriptor), task] (compaction_backlog_tracker& bt) mutable {
-                return with_scheduling_group(_maintenance_sg.cpu, [this, &cf, descriptor = std::move(descriptor), task]() mutable {
+                return with_scheduling_group(_compaction_controller.sg(), [this, &cf, descriptor = std::move(descriptor), task]() mutable {
                     return cf.compact_sstables(std::move(descriptor), task->compaction_data);
                 });
             });
@@ -979,7 +985,7 @@ void compaction_manager::stop_compaction(sstring type) {
     }
     // FIXME: switch to task_stop(), and wait for their termination, so API user can know when compactions actually stopped.
     for (auto& task : _tasks) {
-        if (task->compaction_running && target_type == task->type) {
+        if (target_type == task->type) {
             task->compaction_data.stop("user request");
         }
     }
```
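The locking scheme in these hunks gives each table a reader/writer lock: custom jobs and (after this change) sstable rewrites take the read side so they can overlap each other, while operations that must run alone take the write side. A standard-C++ analogue of that split, assuming a hypothetical per-table `std::shared_mutex`:

```cpp
#include <mutex>
#include <shared_mutex>

std::shared_mutex table_lock; // hypothetical per-table lock

void run_custom_job() {
    // Shared (read) side: many jobs may hold this concurrently,
    // but none while an exclusive holder is active.
    std::shared_lock guard(table_lock);
    // ... run the job ...
}

void exclusive_maintenance() {
    // Exclusive (write) side: serializes against all shared holders.
    std::unique_lock guard(table_lock);
    // ... run the operation that must be alone ...
}
```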
```diff
@@ -117,7 +117,13 @@ managed_bytes_opt get_value(const column_value& col, const column_value_eval_bag
         if (!col_type->is_map()) {
             throw exceptions::invalid_request_exception(format("subscripting non-map column {}", cdef->name_as_text()));
         }
-        const auto deserialized = cdef->type->deserialize(managed_bytes_view(*data.other_columns[data.sel.index_of(*cdef)]));
+        int32_t index = data.sel.index_of(*cdef);
+        if (index == -1) {
+            throw std::runtime_error(
+                    format("Column definition {} does not match any column in the query selection",
+                    cdef->name_as_text()));
+        }
+        const auto deserialized = cdef->type->deserialize(managed_bytes_view(*data.other_columns[index]));
         const auto& data_map = value_cast<map_type_impl::native_type>(deserialized);
         const auto key = evaluate_to_raw_view(col.sub, options);
         auto&& key_type = col_type->name_comparator();
@@ -135,8 +141,16 @@ managed_bytes_opt get_value(const column_value& col, const column_value_eval_bag
     case column_kind::clustering_key:
         return managed_bytes(data.clustering_key[cdef->id]);
     case column_kind::static_column:
-    case column_kind::regular_column:
-        return managed_bytes_opt(data.other_columns[data.sel.index_of(*cdef)]);
+        [[fallthrough]];
+    case column_kind::regular_column: {
+        int32_t index = data.sel.index_of(*cdef);
+        if (index == -1) {
+            throw std::runtime_error(
+                    format("Column definition {} does not match any column in the query selection",
+                    cdef->name_as_text()));
+        }
+        return managed_bytes_opt(data.other_columns[index]);
+    }
     default:
         throw exceptions::unsupported_operation_exception("Unknown column kind");
     }
```
```diff
@@ -528,7 +528,7 @@ statement_restrictions::statement_restrictions(database& db,
     }
 
     if (!_nonprimary_key_restrictions->empty()) {
-        if (_has_queriable_regular_index) {
+        if (_has_queriable_regular_index && _partition_range_is_simple) {
             _uses_secondary_indexing = true;
         } else if (!allow_filtering) {
             throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
```
```diff
@@ -193,7 +193,7 @@ public:
 
     template<typename RowComparator>
     void sort(const RowComparator& cmp) {
-        std::sort(_rows.begin(), _rows.end(), std::ref(cmp));
+        std::sort(_rows.begin(), _rows.end(), cmp);
    }
 
     metadata& get_metadata();
```
```diff
@@ -995,6 +995,7 @@ lw_shared_ptr<const service::pager::paging_state> indexed_table_select_statement
     }
 
     auto paging_state_copy = make_lw_shared<service::pager::paging_state>(service::pager::paging_state(*paging_state));
+    paging_state_copy->set_remaining(internal_paging_size);
     paging_state_copy->set_partition_key(std::move(index_pk));
     paging_state_copy->set_clustering_key(std::move(index_ck));
     return std::move(paging_state_copy);
```
```diff
@@ -53,6 +53,7 @@
 #include "types/list.hh"
 #include "types/user.hh"
 #include "concrete_types.hh"
+#include "validation.hh"
 
 namespace cql3 {
 
@@ -251,6 +252,7 @@ insert_prepared_json_statement::build_partition_keys(const query_options& option
         exploded.emplace_back(json_value->second);
     }
     auto pkey = partition_key::from_optional_exploded(*s, std::move(exploded));
+    validation::validate_cql_key(*s, pkey);
     auto k = query::range<query::ring_position>::make_singular(dht::decorate_key(*s, std::move(pkey)));
     ranges.emplace_back(std::move(k));
     return ranges;
```
database.cc (38 changes)

```diff
@@ -1348,44 +1348,6 @@ database::existing_index_names(const sstring& ks_name, const sstring& cf_to_excl
     return names;
 }
 
-// Based on:
-//  - org.apache.cassandra.db.AbstractCell#reconcile()
-//  - org.apache.cassandra.db.BufferExpiringCell#reconcile()
-//  - org.apache.cassandra.db.BufferDeletedCell#reconcile()
-std::strong_ordering
-compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
-    if (left.timestamp() != right.timestamp()) {
-        return left.timestamp() <=> right.timestamp();
-    }
-    if (left.is_live() != right.is_live()) {
-        return left.is_live() ? std::strong_ordering::less : std::strong_ordering::greater;
-    }
-    if (left.is_live()) {
-        auto c = compare_unsigned(left.value(), right.value()) <=> 0;
-        if (c != 0) {
-            return c;
-        }
-        if (left.is_live_and_has_ttl() != right.is_live_and_has_ttl()) {
-            // prefer expiring cells.
-            return left.is_live_and_has_ttl() ? std::strong_ordering::greater : std::strong_ordering::less;
-        }
-        if (left.is_live_and_has_ttl() && left.expiry() != right.expiry()) {
-            return left.expiry() <=> right.expiry();
-        }
-    } else {
-        // Both are deleted
-        if (left.deletion_time() != right.deletion_time()) {
-            // Origin compares big-endian serialized deletion time. That's because it
-            // delegates to AbstractCell.reconcile() which compares values after
-            // comparing timestamps, which in case of deleted cells will hold
-            // serialized expiry.
-            return (uint64_t) left.deletion_time().time_since_epoch().count()
-                   <=> (uint64_t) right.deletion_time().time_since_epoch().count();
-        }
-    }
-    return std::strong_ordering::equal;
-}
-
 future<std::tuple<lw_shared_ptr<query::result>, cache_temperature>>
 database::query(schema_ptr s, const query::read_command& cmd, query::result_options opts, const dht::partition_range_vector& ranges,
     tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) {
```
```diff
@@ -428,6 +428,8 @@ private:
     void abort_recycled_list(std::exception_ptr);
     void abort_deletion_promise(std::exception_ptr);
 
+    future<> recalculate_footprint();
+
     future<> rename_file(sstring, sstring) const;
     size_t max_request_controller_units() const;
     segment_id_type _ids = 0;
@@ -444,6 +446,7 @@ private:
     seastar::gate _gate;
     uint64_t _new_counter = 0;
     std::optional<size_t> _disk_write_alignment;
+    seastar::semaphore _reserve_recalculation_guard;
 };
 
 template<typename T>
@@ -512,6 +515,7 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
     uint64_t _file_pos = 0;
     uint64_t _flush_pos = 0;
     uint64_t _size_on_disk = 0;
+    uint64_t _waste = 0;
 
     size_t _alignment;
 
@@ -598,7 +602,7 @@ public:
             clogger.debug("Segment {} is no longer active and will submitted for delete now", *this);
             ++_segment_manager->totals.segments_destroyed;
             _segment_manager->totals.active_size_on_disk -= file_position();
-            _segment_manager->totals.wasted_size_on_disk -= (_size_on_disk - file_position());
+            _segment_manager->totals.wasted_size_on_disk -= _waste;
             _segment_manager->add_file_to_delete(_file_name, _desc);
         } else if (_segment_manager->cfg.warn_about_segments_left_on_disk_after_shutdown) {
             clogger.warn("Segment {} is dirty and is left on disk.", *this);
@@ -725,7 +729,8 @@ public:
         auto s = co_await sync();
         co_await flush();
         co_await terminate();
-        _segment_manager->totals.wasted_size_on_disk += (_size_on_disk - file_position());
+        _waste = _size_on_disk - file_position();
+        _segment_manager->totals.wasted_size_on_disk += _waste;
         co_return s;
     }
     future<sseg_ptr> do_flush(uint64_t pos) {
@@ -1223,6 +1228,7 @@ db::commitlog::segment_manager::segment_manager(config c)
     , _recycled_segments(std::numeric_limits<size_t>::max())
     , _reserve_replenisher(make_ready_future<>())
     , _background_sync(make_ready_future<>())
+    , _reserve_recalculation_guard(1)
 {
     assert(max_size > 0);
     assert(max_mutation_size < segment::multi_entry_size_magic);
@@ -1248,6 +1254,11 @@ future<> db::commitlog::segment_manager::replenish_reserve() {
         }
         try {
             gate::holder g(_gate);
+            auto guard = co_await get_units(_reserve_recalculation_guard, 1);
+            if (_reserve_segments.full()) {
+                // can happen if we recalculate
+                continue;
+            }
             // note: if we were strict with disk size, we would refuse to do this
             // unless disk footprint is lower than threshold. but we cannot (yet?)
             // trust that flush logic will absolutely free up an existing
@@ -1519,7 +1530,7 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
 
     if (cfg.extensions && !cfg.extensions->commitlog_file_extensions().empty()) {
         for (auto * ext : cfg.extensions->commitlog_file_extensions()) {
-            auto nf = co_await ext->wrap_file(std::move(filename), f, flags);
+            auto nf = co_await ext->wrap_file(filename, f, flags);
             if (nf) {
                 f = std::move(nf);
                 align = is_overwrite ? f.disk_overwrite_dma_alignment() : f.disk_write_dma_alignment();
@@ -1530,12 +1541,21 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
         f = make_checked_file(commit_error_handler, std::move(f));
     } catch (...) {
         ep = std::current_exception();
-        commit_error_handler(ep);
     }
     if (ep) {
+        // do this early, so iff we are to fast-fail server,
+        // we do it before anything else can go wrong.
+        try {
+            commit_error_handler(ep);
+        } catch (...) {
+            ep = std::current_exception();
+        }
+    }
+    if (ep && f) {
+        co_await f.close();
+    }
+    if (ep) {
         add_file_to_delete(filename, d);
         co_return coroutine::exception(std::move(ep));
     }
@@ -1594,6 +1614,8 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
 }
 
 future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::new_segment() {
+    gate::holder g(_gate);
+
     if (_shutdown) {
         co_return coroutine::make_exception(std::runtime_error("Commitlog has been shut down. Cannot add data"));
     }
@@ -1628,22 +1650,23 @@
         co_return _segments.back();
     }
 
-    if (_segment_allocating) {
-        co_await _segment_allocating->get_future(timeout);
-        continue;
-    }
-
-    promise<> p;
-    _segment_allocating.emplace(p.get_future());
-    auto finally = defer([&] () noexcept { _segment_allocating = std::nullopt; });
-    try {
-        gate::holder g(_gate);
-        auto s = co_await with_timeout(timeout, new_segment());
-        p.set_value();
-    } catch (...) {
-        p.set_exception(std::current_exception());
-        throw;
-    }
+    // #9896 - we don't want to issue a new_segment call until
+    // the old one has terminated with either result or exception.
+    // Do all waiting through the shared_future
+    if (!_segment_allocating) {
+        auto f = new_segment();
+        // must check that we are not already done.
+        if (f.available()) {
+            f.get(); // maybe force exception
+            continue;
+        }
+        _segment_allocating.emplace(f.discard_result().finally([this] {
+            // clear the shared_future _before_ resolving its contents
+            // (i.e. with result of this finally)
+            _segment_allocating = std::nullopt;
+        }));
+    }
+    co_await _segment_allocating->get_future(timeout);
 }
```
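The #9896 hunk above de-duplicates concurrent segment allocation through a `seastar::shared_future`: only one fiber issues `new_segment()`, every other caller awaits the shared result, and the slot is cleared before waiters resume so a retry starts fresh. A condensed sketch of the same pattern, assuming Seastar and a hypothetical `allocate_resource()`:

```cpp
#include <seastar/core/future.hh>
#include <seastar/core/shared_future.hh>
#include <seastar/core/sleep.hh>
#include <chrono>
#include <optional>

seastar::future<> allocate_resource() {           // hypothetical slow operation
    using namespace std::chrono_literals;
    return seastar::sleep(100ms);
}

std::optional<seastar::shared_future<>> inflight; // at most one pending allocation

seastar::future<> get_resource() {
    if (!inflight) {
        auto f = allocate_resource();
        if (f.available()) {
            f.get();                              // propagate an immediate failure
            return seastar::make_ready_future<>();
        }
        inflight.emplace(f.finally([] {
            // Clear the slot *before* the shared state resolves, so a
            // retrying waiter starts a fresh allocation instead of
            // re-awaiting a finished one.
            inflight = std::nullopt;
        }));
    }
    return inflight->get_future();
}
```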
```diff
@@ -1865,6 +1888,8 @@ future<> db::commitlog::segment_manager::delete_segments(std::vector<sstring> fi
 
     std::exception_ptr recycle_error;
 
+    size_t num_deleted = 0;
+    bool except = false;
     while (!files.empty()) {
         auto filename = std::move(files.back());
         files.pop_back();
@@ -1914,8 +1939,10 @@ future<> db::commitlog::segment_manager::delete_segments(std::vector<sstring> fi
             }
         }
         co_await delete_file(filename);
+        ++num_deleted;
     } catch (...) {
         clogger.error("Could not delete segment {}: {}", filename, std::current_exception());
+        except = true;
     }
 }
@@ -1928,6 +1955,16 @@ future<> db::commitlog::segment_manager::delete_segments(std::vector<sstring> fi
     if (recycle_error && _recycled_segments.empty()) {
         abort_recycled_list(recycle_error);
     }
+    // If recycle failed and turned into a delete, we should fake-wakeup waiters
+    // since we might still have cleaned up disk space.
+    if (!recycle_error && num_deleted && cfg.reuse_segments && _recycled_segments.empty()) {
+        abort_recycled_list(std::make_exception_ptr(std::runtime_error("deleted files")));
+    }
+
+    // #9348 - if we had an exception, we can't trust our bookeep any more. recalculate.
+    if (except) {
+        co_await recalculate_footprint();
+    }
 }
 
 void db::commitlog::segment_manager::abort_recycled_list(std::exception_ptr ep) {
@@ -1942,6 +1979,67 @@ void db::commitlog::segment_manager::abort_deletion_promise(std::exception_ptr e
     std::exchange(_disk_deletions, {}).set_exception(ep);
 }
 
+future<> db::commitlog::segment_manager::recalculate_footprint() {
+    try {
+        co_await do_pending_deletes();
+
+        auto guard = co_await get_units(_reserve_recalculation_guard, 1);
+        auto segments_copy = _segments;
+        std::vector<sseg_ptr> reserves;
+        std::vector<sstring> recycles;
+        // this causes haywire things while we steal stuff, but...
+        while (!_reserve_segments.empty()) {
+            reserves.push_back(_reserve_segments.pop());
+        }
+        while (!_recycled_segments.empty()) {
+            recycles.push_back(_recycled_segments.pop());
+        }
+        // #9955 - must re-stock the queues before we do anything
+        // interruptable/continuation. Because both queues are
+        // used with push/pop eventually which _waits_ for signal
+        // but does _not_ verify that the condition is true once
+        // we return. So copy the objects and look at instead.
+        for (auto& filename : recycles) {
+            _recycled_segments.push(sstring(filename));
+        }
+        for (auto& s : reserves) {
+            _reserve_segments.push(sseg_ptr(s)); // you can have it back now.
+        }
+
+        // first, guesstimate sizes
+        uint64_t recycle_size = recycles.size() * max_size;
+        auto old = totals.total_size_on_disk;
+
+        totals.total_size_on_disk = recycle_size;
+        for (auto& s : _segments) {
+            totals.total_size_on_disk += s->_size_on_disk;
+        }
+        for (auto& s : reserves) {
+            totals.total_size_on_disk += s->_size_on_disk;
+        }
+
+        // now we need to adjust the actual sizes of recycled files
+
+        uint64_t actual_recycled_size = 0;
+
+        try {
+            for (auto& filename : recycles) {
+                auto s = co_await seastar::file_size(filename);
+                actual_recycled_size += s;
+            }
+        } catch (...) {
+            clogger.error("Exception reading disk footprint ({}).", std::current_exception());
+            actual_recycled_size = recycle_size; // best we got
+        }
+
+        totals.total_size_on_disk += actual_recycled_size - recycle_size;
+        // pushing things to reserve/recycled queues will have resumed any
+        // waiters, so we should be done.
+    } catch (...) {
+        clogger.error("Exception recalculating disk footprint ({}). Values might be off...", std::current_exception());
+    }
+}
+
 future<> db::commitlog::segment_manager::do_pending_deletes() {
     auto ftc = std::exchange(_files_to_close, {});
     auto ftd = std::exchange(_files_to_delete, {});
```
dist/common/scripts/scylla-housekeeping (1 change, vendored)

```diff
@@ -100,6 +100,7 @@ def version_compare(a, b):
 def create_uuid_file(fl):
     with open(args.uuid_file, 'w') as myfile:
         myfile.write(str(uuid.uuid1()) + "\n")
+    os.chmod(args.uuid_file, 0o644)
 
 
 def sanitize_version(version):
```
dist/common/scripts/scylla_io_setup (60 changes, vendored)

```diff
@@ -278,6 +278,66 @@ if __name__ == "__main__":
             disk_properties["read_bandwidth"] = 2527296683 * nr_disks
             disk_properties["write_iops"] = 156326 * nr_disks
             disk_properties["write_bandwidth"] = 1063657088 * nr_disks
+        elif idata.instance() == "im4gn.large":
+            disk_properties["read_iops"] = 33943
+            disk_properties["read_bandwidth"] = 288433525
+            disk_properties["write_iops"] = 27877
+            disk_properties["write_bandwidth"] = 126864680
+        elif idata.instance() == "im4gn.xlarge":
+            disk_properties["read_iops"] = 68122
+            disk_properties["read_bandwidth"] = 576603520
+            disk_properties["write_iops"] = 55246
+            disk_properties["write_bandwidth"] = 254534954
+        elif idata.instance() == "im4gn.2xlarge":
+            disk_properties["read_iops"] = 136422
+            disk_properties["read_bandwidth"] = 1152663765
+            disk_properties["write_iops"] = 92184
+            disk_properties["write_bandwidth"] = 508926453
+        elif idata.instance() == "im4gn.4xlarge":
+            disk_properties["read_iops"] = 273050
+            disk_properties["read_bandwidth"] = 1638427264
+            disk_properties["write_iops"] = 92173
+            disk_properties["write_bandwidth"] = 1027966826
+        elif idata.instance() == "im4gn.8xlarge":
+            disk_properties["read_iops"] = 250241 * nr_disks
+            disk_properties["read_bandwidth"] = 1163130709 * nr_disks
+            disk_properties["write_iops"] = 86374 * nr_disks
+            disk_properties["write_bandwidth"] = 977617664 * nr_disks
+        elif idata.instance() == "im4gn.16xlarge":
+            disk_properties["read_iops"] = 273030 * nr_disks
+            disk_properties["read_bandwidth"] = 1638211413 * nr_disks
+            disk_properties["write_iops"] = 92607 * nr_disks
+            disk_properties["write_bandwidth"] = 1028340266 * nr_disks
+        elif idata.instance() == "is4gen.medium":
+            disk_properties["read_iops"] = 33965
+            disk_properties["read_bandwidth"] = 288462506
+            disk_properties["write_iops"] = 27876
+            disk_properties["write_bandwidth"] = 126954200
+        elif idata.instance() == "is4gen.large":
+            disk_properties["read_iops"] = 68131
+            disk_properties["read_bandwidth"] = 576654869
+            disk_properties["write_iops"] = 55257
+            disk_properties["write_bandwidth"] = 254551002
+        elif idata.instance() == "is4gen.xlarge":
+            disk_properties["read_iops"] = 136413
+            disk_properties["read_bandwidth"] = 1152747904
+            disk_properties["write_iops"] = 92180
+            disk_properties["write_bandwidth"] = 508889546
+        elif idata.instance() == "is4gen.2xlarge":
+            disk_properties["read_iops"] = 273038
+            disk_properties["read_bandwidth"] = 1628982613
+            disk_properties["write_iops"] = 92182
+            disk_properties["write_bandwidth"] = 1027983530
+        elif idata.instance() == "is4gen.4xlarge":
+            disk_properties["read_iops"] = 260493 * nr_disks
+            disk_properties["read_bandwidth"] = 1217396928 * nr_disks
+            disk_properties["write_iops"] = 83169 * nr_disks
+            disk_properties["write_bandwidth"] = 1000390784 * nr_disks
+        elif idata.instance() == "is4gen.8xlarge":
+            disk_properties["read_iops"] = 273021 * nr_disks
+            disk_properties["read_bandwidth"] = 1656354602 * nr_disks
+            disk_properties["write_iops"] = 92233 * nr_disks
+            disk_properties["write_bandwidth"] = 1028010325 * nr_disks
         properties_file = open(etcdir() + "/scylla.d/io_properties.yaml", "w")
         yaml.dump({ "disks": [ disk_properties ] }, properties_file, default_flow_style=False)
         ioconf = open(etcdir() + "/scylla.d/io.conf", "w")
```
dist/common/scripts/scylla_ntp_setup (6 changes, vendored)

```diff
@@ -66,18 +66,18 @@ if __name__ == '__main__':
 
     target = None
     if os.path.exists('/lib/systemd/systemd-timesyncd'):
-        if systemd_unit('systemd-timesyncd').is_active():
+        if systemd_unit('systemd-timesyncd').is_active() == 'active':
             print('ntp is already configured, skip setup')
             sys.exit(0)
         target = 'systemd-timesyncd'
     if shutil.which('chronyd'):
-        if get_chrony_unit().is_active():
+        if get_chrony_unit().is_active() == 'active':
             print('ntp is already configured, skip setup')
             sys.exit(0)
         if not target:
             target = 'chrony'
     if shutil.which('ntpd'):
-        if get_ntp_unit().is_active():
+        if get_ntp_unit().is_active() == 'active':
             print('ntp is already configured, skip setup')
             sys.exit(0)
         if not target:
```
dist/common/scripts/scylla_raid_setup (19 changes, vendored)

```diff
@@ -117,10 +117,11 @@ if __name__ == '__main__':
     pkg_install('xfsprogs')
     if not shutil.which('mdadm'):
         pkg_install('mdadm')
-    try:
-        md_service = systemd_unit('mdmonitor.service')
-    except SystemdException:
-        md_service = systemd_unit('mdadm.service')
+    if args.raid_level != '0':
+        try:
+            md_service = systemd_unit('mdmonitor.service')
+        except SystemdException:
+            md_service = systemd_unit('mdadm.service')
 
     print('Creating {type} for scylla using {nr_disk} disk(s): {disks}'.format(type=f'RAID{args.raid_level}' if raid else 'XFS volume', nr_disk=len(disks), disks=args.disks))
     procs=[]
@@ -164,14 +165,15 @@ if __name__ == '__main__':
 
     uuid = run(f'blkid -s UUID -o value {fsdev}', shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
     after = 'local-fs.target'
-    if raid:
+    wants = ''
+    if raid and args.raid_level != '0':
         after += f' {md_service}'
+        wants = f'\nWants={md_service}'
     unit_data = f'''
 [Unit]
 Description=Scylla data directory
 Before=scylla-server.service
-After={after}
-Wants={md_service}
+After={after}{wants}
 DefaultDependencies=no
 
 [Mount]
@@ -195,7 +197,8 @@ WantedBy=multi-user.target
         f.write(f'RequiresMountsFor={mount_at}\n')
 
     systemd_unit.reload()
-    md_service.start()
+    if args.raid_level != '0':
+        md_service.start()
     mount = systemd_unit(mntunit_bn)
     mount.start()
     if args.enable_on_nextboot:
```
dist/common/scripts/scylla_setup (4 changes, vendored)

```diff
@@ -370,6 +370,10 @@ if __name__ == '__main__':
     version_check = interactive_ask_service('Do you want to enable Scylla to check if there is a newer version of Scylla available?', 'Yes - start the Scylla-housekeeping service to check for a newer version. This check runs periodically. No - skips this step.', version_check)
     args.no_version_check = not version_check
     if version_check:
+        cfg = sysconfig_parser(sysconfdir_p() / 'scylla-housekeeping')
+        repo_files = cfg.get('REPO_FILES')
+        for f in glob.glob(repo_files):
+            os.chmod(f, 0o644)
         with open('/etc/scylla.d/housekeeping.cfg', 'w') as f:
             f.write('[housekeeping]\ncheck-version: True\n')
         os.chmod('/etc/scylla.d/housekeeping.cfg', 0o644)
```
dist/common/scripts/scylla_util.py (6 changes, vendored)

```diff
@@ -674,7 +674,7 @@ class aws_instance:
         return self._type.split(".")[0]
 
     def is_supported_instance_class(self):
-        if self.instance_class() in ['i2', 'i3', 'i3en', 'c5d', 'm5d', 'm5ad', 'r5d', 'z1d', 'c6gd', 'm6gd', 'r6gd', 'x2gd']:
+        if self.instance_class() in ['i2', 'i3', 'i3en', 'c5d', 'm5d', 'm5ad', 'r5d', 'z1d', 'c6gd', 'm6gd', 'r6gd', 'x2gd', 'im4gn', 'is4gen']:
             return True
         return False
 
@@ -683,7 +683,7 @@ class aws_instance:
         instance_size = self.instance_size()
         if instance_class in ['c3', 'c4', 'd2', 'i2', 'r3']:
             return 'ixgbevf'
-        if instance_class in ['a1', 'c5', 'c5a', 'c5d', 'c5n', 'c6g', 'c6gd', 'f1', 'g3', 'g4', 'h1', 'i3', 'i3en', 'inf1', 'm5', 'm5a', 'm5ad', 'm5d', 'm5dn', 'm5n', 'm6g', 'm6gd', 'p2', 'p3', 'r4', 'r5', 'r5a', 'r5ad', 'r5b', 'r5d', 'r5dn', 'r5n', 't3', 't3a', 'u-6tb1', 'u-9tb1', 'u-12tb1', 'u-18tn1', 'u-24tb1', 'x1', 'x1e', 'z1d', 'c6g', 'c6gd', 'm6g', 'm6gd', 't4g', 'r6g', 'r6gd', 'x2gd']:
+        if instance_class in ['a1', 'c5', 'c5a', 'c5d', 'c5n', 'c6g', 'c6gd', 'f1', 'g3', 'g4', 'h1', 'i3', 'i3en', 'inf1', 'm5', 'm5a', 'm5ad', 'm5d', 'm5dn', 'm5n', 'm6g', 'm6gd', 'p2', 'p3', 'r4', 'r5', 'r5a', 'r5ad', 'r5b', 'r5d', 'r5dn', 'r5n', 't3', 't3a', 'u-6tb1', 'u-9tb1', 'u-12tb1', 'u-18tn1', 'u-24tb1', 'x1', 'x1e', 'z1d', 'c6g', 'c6gd', 'm6g', 'm6gd', 't4g', 'r6g', 'r6gd', 'x2gd', 'im4gn', 'is4gen']:
             return 'ena'
         if instance_class == 'm4':
             if instance_size == '16xlarge':
@@ -1041,7 +1041,7 @@ class systemd_unit:
         return run('systemctl {} disable {}'.format(self.ctlparam, self._unit), shell=True, check=True)
 
     def is_active(self):
-        return True if run('systemctl {} is-active {}'.format(self.ctlparam, self._unit), shell=True, capture_output=True, encoding='utf-8').stdout.strip() == 'active' else False
+        return run('systemctl {} is-active {}'.format(self.ctlparam, self._unit), shell=True, capture_output=True, encoding='utf-8').stdout.strip()
 
     def mask(self):
         return run('systemctl {} mask {}'.format(self.ctlparam, self._unit), shell=True, check=True)
```
dist/common/supervisor/scylla_util.sh (6 changes, vendored)

```diff
@@ -6,12 +6,16 @@ is_nonroot() {
     [ -f "$scylladir"/SCYLLA-NONROOT-FILE ]
 }
 
+is_container() {
+    [ -f "$scylladir"/SCYLLA-CONTAINER-FILE ]
+}
+
 is_privileged() {
     [ ${EUID:-${UID}} = 0 ]
 }
 
 execsudo() {
-    if is_nonroot; then
+    if is_nonroot || is_container; then
         exec "$@"
     else
         exec sudo -u scylla -g scylla "$@"
```
dist/docker/debian/build_docker.sh (8 changes, vendored)

```diff
@@ -25,6 +25,10 @@
 product="$(<build/SCYLLA-PRODUCT-FILE)"
 version="$(<build/SCYLLA-VERSION-FILE)"
 release="$(<build/SCYLLA-RELEASE-FILE)"
 
+if [[ "$version" = *rc* ]]; then
+    version=$(echo $version |sed 's/\(.*\)\.)*/\1~/')
+fi
+
 mode="release"
 
 if uname -m | grep x86_64 ; then
@@ -93,12 +97,14 @@ run apt-get -y install hostname supervisor openssh-server openssh-client openjdk
 run locale-gen en_US.UTF-8
 run bash -ec "dpkg -i packages/*.deb"
 run apt-get -y clean all
-run bash -ec "cat /scylla_bashrc >> /etc/bashrc"
+run bash -ec "cat /scylla_bashrc >> /etc/bash.bashrc"
 run mkdir -p /etc/supervisor.conf.d
 run mkdir -p /var/log/scylla
 run chown -R scylla:scylla /var/lib/scylla
 run sed -i -e 's/^SCYLLA_ARGS=".*"$/SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --network-stack posix"/' /etc/default/scylla-server
 
 run mkdir -p /opt/scylladb/supervisor
+run touch /opt/scylladb/SCYLLA-CONTAINER-FILE
 bcp dist/common/supervisor/scylla-server.sh /opt/scylladb/supervisor/scylla-server.sh
 bcp dist/common/supervisor/scylla-jmx.sh /opt/scylladb/supervisor/scylla-jmx.sh
 bcp dist/common/supervisor/scylla-node-exporter.sh /opt/scylladb/supervisor/scylla-node-exporter.sh
```

```diff
@@ -1,4 +1,4 @@
-[program:scylla-server]
+[program:scylla]
 command=/opt/scylladb/supervisor/scylla-server.sh
 stdout_logfile=/dev/stdout
 stdout_logfile_maxbytes=0
```
dist/docker/etc/sysconfig/scylla-server (41 changes, vendored; file deleted)

```diff
@@ -1,41 +0,0 @@
-# choose following mode: virtio, dpdk, posix
-NETWORK_MODE=posix
-
-# tap device name(virtio)
-TAP=tap0
-
-# bridge device name (virtio)
-BRIDGE=virbr0
-
-# ethernet device name
-IFNAME=eth0
-
-# setup NIC's and disks' interrupts, RPS, XPS, nomerges and I/O scheduler (posix)
-SET_NIC_AND_DISKS=no
-
-# ethernet device driver (dpdk)
-ETHDRV=
-
-# ethernet device PCI ID (dpdk)
-ETHPCIID=
-
-# number of hugepages
-NR_HUGEPAGES=64
-
-# user for process (must be root for dpdk)
-USER=scylla
-
-# group for process
-GROUP=scylla
-
-# scylla home dir
-SCYLLA_HOME=/var/lib/scylla
-
-# scylla config dir
-SCYLLA_CONF=/etc/scylla
-
-# scylla arguments
-SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --network-stack posix"
-
-# setup as AMI instance
-AMI=no
```
dist/docker/scyllasetup.py (5 changes, vendored)

```diff
@@ -121,12 +121,13 @@ class ScyllaSetup:
         if self._apiAddress is not None:
             args += ["--api-address %s" % self._apiAddress]
 
-        if self._alternatorPort is not None:
+        if self._alternatorAddress is not None:
+            args += ["--alternator-address %s" % self._alternatorAddress]
+
+        if self._alternatorPort is not None:
             args += ["--alternator-port %s" % self._alternatorPort]
 
         if self._alternatorHttpsPort is not None:
-            args += ["--alternator-address %s" % self._alternatorAddress]
             args += ["--alternator-https-port %s" % self._alternatorHttpsPort]
 
         if self._alternatorWriteIsolation is not None:
```
```diff
@@ -184,14 +184,18 @@ future<> server::do_accepts(int which, bool keepalive, socket_address server_add
             _logger.info("exception while advertising new connection: {}", std::current_exception());
         }
         // Block while monitoring for lifetime/errors.
-        return conn->process().finally([this, conn] {
-            return unadvertise_connection(conn);
-        }).handle_exception([this] (std::exception_ptr ep) {
-            if (is_broken_pipe_or_connection_reset(ep)) {
-                // expected if another side closes a connection or we're shutting down
-                return;
+        return conn->process().then_wrapped([this, conn] (auto f) {
+            try {
+                f.get();
+            } catch (...) {
+                auto ep = std::current_exception();
+                if (!is_broken_pipe_or_connection_reset(ep)) {
+                    // some exceptions are expected if another side closes a connection
+                    // or we're shutting down
+                    _logger.info("exception while processing connection: {}", ep);
+                }
             }
-            _logger.info("exception while processing connection: {}", ep);
+            return unadvertise_connection(conn);
         });
         return stop_iteration::no;
```
```diff
@@ -477,49 +477,42 @@ gossiper::handle_get_endpoint_states_msg(gossip_get_endpoint_states_request requ
     return make_ready_future<gossip_get_endpoint_states_response>(gossip_get_endpoint_states_response{std::move(map)});
 }
 
+rpc::no_wait_type gossiper::background_msg(sstring type, noncopyable_function<future<>(gossiper&)> fn) {
+    (void)with_gate(_background_msg, [this, type = std::move(type), fn = std::move(fn)] () mutable {
+        return container().invoke_on(0, std::move(fn)).handle_exception([type = std::move(type)] (auto ep) {
+            logger.warn("Failed to handle {}: {}", type, ep);
+        });
+    });
+    return messaging_service::no_wait();
+}
+
 void gossiper::init_messaging_service_handler() {
     _messaging.register_gossip_digest_syn([this] (const rpc::client_info& cinfo, gossip_digest_syn syn_msg) {
         auto from = netw::messaging_service::get_source(cinfo);
-        // In a new fiber.
-        (void)container().invoke_on(0, [from, syn_msg = std::move(syn_msg)] (gms::gossiper& gossiper) mutable {
+        return background_msg("GOSSIP_DIGEST_SYN", [from, syn_msg = std::move(syn_msg)] (gms::gossiper& gossiper) mutable {
             return gossiper.handle_syn_msg(from, std::move(syn_msg));
-        }).handle_exception([] (auto ep) {
-            logger.warn("Fail to handle GOSSIP_DIGEST_SYN: {}", ep);
         });
-        return messaging_service::no_wait();
     });
     _messaging.register_gossip_digest_ack([this] (const rpc::client_info& cinfo, gossip_digest_ack msg) {
         auto from = netw::messaging_service::get_source(cinfo);
-        // In a new fiber.
-        (void)container().invoke_on(0, [from, msg = std::move(msg)] (gms::gossiper& gossiper) mutable {
+        return background_msg("GOSSIP_DIGEST_ACK", [from, msg = std::move(msg)] (gms::gossiper& gossiper) mutable {
             return gossiper.handle_ack_msg(from, std::move(msg));
-        }).handle_exception([] (auto ep) {
-            logger.warn("Fail to handle GOSSIP_DIGEST_ACK: {}", ep);
         });
-        return messaging_service::no_wait();
     });
     _messaging.register_gossip_digest_ack2([this] (const rpc::client_info& cinfo, gossip_digest_ack2 msg) {
         auto from = netw::messaging_service::get_source(cinfo);
-        // In a new fiber.
-        (void)container().invoke_on(0, [from, msg = std::move(msg)] (gms::gossiper& gossiper) mutable {
+        return background_msg("GOSSIP_DIGEST_ACK2", [from, msg = std::move(msg)] (gms::gossiper& gossiper) mutable {
             return gossiper.handle_ack2_msg(from, std::move(msg));
-        }).handle_exception([] (auto ep) {
-            logger.warn("Fail to handle GOSSIP_DIGEST_ACK2: {}", ep);
         });
-        return messaging_service::no_wait();
     });
     _messaging.register_gossip_echo([this] (const rpc::client_info& cinfo, rpc::optional<int64_t> generation_number_opt) {
         auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
         return handle_echo_msg(from, generation_number_opt);
     });
     _messaging.register_gossip_shutdown([this] (inet_address from, rpc::optional<int64_t> generation_number_opt) {
-        // In a new fiber.
-        (void)container().invoke_on(0, [from, generation_number_opt] (gms::gossiper& gossiper) {
+        return background_msg("GOSSIP_SHUTDOWN", [from, generation_number_opt] (gms::gossiper& gossiper) {
             return gossiper.handle_shutdown_msg(from, generation_number_opt);
-        }).handle_exception([] (auto ep) {
-            logger.warn("Fail to handle GOSSIP_SHUTDOWN: {}", ep);
         });
-        return messaging_service::no_wait();
     });
     _messaging.register_gossip_get_endpoint_states([this] (const rpc::client_info& cinfo, gossip_get_endpoint_states_request request) {
         return container().invoke_on(0, [request = std::move(request)] (gms::gossiper& gossiper) mutable {
@@ -2178,6 +2171,9 @@ future<> gossiper::start() {
 }
 
 future<> gossiper::shutdown() {
+    if (!_background_msg.is_closed()) {
+        co_await _background_msg.close();
+    }
     if (this_shard_id() == 0) {
         co_await do_stop_gossiping();
     }
```
```diff
@@ -41,7 +41,9 @@
 #include "unimplemented.hh"
 #include <seastar/core/distributed.hh>
 #include <seastar/core/shared_ptr.hh>
+#include <seastar/core/gate.hh>
 #include <seastar/core/print.hh>
+#include <seastar/rpc/rpc_types.hh>
 #include "utils/atomic_vector.hh"
 #include "utils/UUID.hh"
 #include "utils/fb_utilities.hh"
@@ -138,12 +140,16 @@ private:
     bool _enabled = false;
     semaphore _callback_running{1};
     semaphore _apply_state_locally_semaphore{100};
+    seastar::gate _background_msg;
     std::unordered_map<gms::inet_address, syn_msg_pending> _syn_handlers;
     std::unordered_map<gms::inet_address, ack_msg_pending> _ack_handlers;
     bool _advertise_myself = true;
     // Map ip address and generation number
     std::unordered_map<gms::inet_address, int32_t> _advertise_to_nodes;
     future<> _failure_detector_loop_done{make_ready_future<>()} ;
 
+    rpc::no_wait_type background_msg(sstring type, noncopyable_function<future<>(gossiper&)> fn);
+
 public:
     // Get current generation number for the given nodes
     future<std::unordered_map<gms::inet_address, int32_t>>
```
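The `background_msg()` refactor above centralizes a fire-and-forget pattern: handler work runs detached from the RPC callback, but inside a `seastar::gate` so `shutdown()` can wait for every in-flight fiber. A compact sketch of the pattern under the same assumptions (Seastar gate and logger):

```cpp
#include <seastar/core/future.hh>
#include <seastar/core/gate.hh>
#include <seastar/util/log.hh>

static seastar::logger lg("bg-example");
static seastar::gate background;

// Detach a handler, but keep it tracked: the gate counts the fiber and
// exceptions are logged rather than silently dropped.
template <typename Fn>
void run_in_background(const char* what, Fn fn) {
    (void)seastar::with_gate(background, [what, fn = std::move(fn)] () mutable {
        return fn().handle_exception([what] (std::exception_ptr ep) {
            lg.warn("failed to handle {}: {}", what, ep);
        });
    });
}

seastar::future<> shutdown() {
    return background.close(); // resolves once all tracked fibers finish
}
```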
```diff
@@ -520,8 +520,13 @@ relocate_python3 "$rprefix"/scyllatop tools/scyllatop/scyllatop.py
 if $supervisor; then
     install -d -m755 `supervisor_dir $retc`
     for service in scylla-server scylla-jmx scylla-node-exporter; do
+        if [ "$service" = "scylla-server" ]; then
+            program="scylla"
+        else
+            program=$service
+        fi
         cat << EOS > `supervisor_conf $retc $service`
-[program:$service]
+[program:$program]
 directory=$rprefix
 command=/bin/bash -c './supervisor/$service.sh'
 EOS
```
main.cc (33 changes)

```diff
@@ -377,11 +377,38 @@ static auto defer_verbose_shutdown(const char* what, Func&& func) {
         startlog.info("Shutting down {}", what);
         try {
             func();
-            startlog.info("Shutting down {} was successful", what);
         } catch (...) {
-            startlog.error("Unexpected error shutting down {}: {}", what, std::current_exception());
-            throw;
+            auto ex = std::current_exception();
+            bool do_abort = true;
+            try {
+                std::rethrow_exception(ex);
+            } catch (const std::system_error& e) {
+                // System error codes we consider "environmental",
+                // i.e. not scylla's fault, therefore there is no point in
+                // aborting and dumping core.
+                for (int i : {EIO, EACCES, ENOSPC}) {
+                    if (e.code() == std::error_code(i, std::system_category())) {
+                        do_abort = false;
+                        break;
+                    }
+                }
+            } catch (...) {
+            }
+            auto msg = fmt::format("Unexpected error shutting down {}: {}", what, ex);
+            if (do_abort) {
+                startlog.error("{}: aborting", msg);
+                abort();
+            } else {
+                startlog.error("{}: exiting, at {}", msg, current_backtrace());
+
+                // Call _exit() rather than exit() to exit immediately
+                // without calling exit handlers, avoiding
+                // boost::intrusive::detail::destructor_impl assert failure
+                // from ~segment_pool exit handler.
+                _exit(255);
+            }
         }
+        startlog.info("Shutting down {} was successful", what);
     };
 
     auto ret = deferred_action(std::move(vfunc));
```
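The rewritten shutdown handler classifies failures before deciding between `abort()` (core dump for logic errors) and `_exit(255)` (quiet exit for environmental errors). The classification itself is plain standard C++ and can be isolated like this:

```cpp
#include <cerrno>
#include <exception>
#include <system_error>

// True for error codes treated as "environmental" (disk full, I/O error,
// permissions) - failures that do not warrant a core dump.
bool is_environmental(const std::exception_ptr& ex) {
    try {
        std::rethrow_exception(ex);
    } catch (const std::system_error& e) {
        for (int err : {EIO, EACCES, ENOSPC}) {
            if (e.code() == std::error_code(err, std::system_category())) {
                return true;
            }
        }
    } catch (...) {
        // Any other exception type: not environmental.
    }
    return false;
}
```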
```diff
@@ -613,7 +613,8 @@ static flat_mutation_reader make_partition_snapshot_flat_reader_from_snp_schema(
         schema_ptr rev_snp_schema = snp->schema()->make_reversed();
         return make_partition_snapshot_flat_reader<true, partition_snapshot_read_accounter>(std::move(rev_snp_schema), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
     } else {
-        return make_partition_snapshot_flat_reader<false, partition_snapshot_read_accounter>(snp->schema(), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
+        schema_ptr snp_schema = snp->schema();
+        return make_partition_snapshot_flat_reader<false, partition_snapshot_read_accounter>(std::move(snp_schema), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
     }
 }
```
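The one-line hoist above fixes a classic C++ hazard: function-argument evaluation is unsequenced, so `f(p->x(), std::move(p))` may move from `p` before `p->x()` runs. Naming the intermediate value forces the order:

```cpp
#include <memory>
#include <utility>

struct widget { int id() const { return 7; } };

void use(int, std::unique_ptr<widget>) {}

void fixed(std::unique_ptr<widget> p) {
    // Hazardous: use(p->id(), std::move(p)); - 'p' may be moved first.
    auto id = p->id();       // sequenced before the call below
    use(id, std::move(p));   // safe
}
```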
```diff
@@ -628,7 +628,12 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
         remove_error_rpc_client(verb, id);
     }
 
-    auto must_encrypt = [&id, &verb, this] {
+    auto addr = get_preferred_ip(id.addr);
+    auto broadcast_address = utils::fb_utilities::get_broadcast_address();
+    bool listen_to_bc = _cfg.listen_on_broadcast_address && _cfg.ip != broadcast_address;
+    auto laddr = socket_address(listen_to_bc ? broadcast_address : _cfg.ip, 0);
+
+    auto must_encrypt = [&] {
         if (_cfg.encrypt == encrypt_what::none) {
             return false;
         }
@@ -646,13 +651,27 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
         auto& snitch_ptr = locator::i_endpoint_snitch::get_local_snitch_ptr();
 
         // either rack/dc need to be in same dc to use non-tls
-        if (snitch_ptr->get_datacenter(id.addr) != snitch_ptr->get_datacenter(utils::fb_utilities::get_broadcast_address())) {
+        auto my_dc = snitch_ptr->get_datacenter(broadcast_address);
+        if (snitch_ptr->get_datacenter(addr) != my_dc) {
             return true;
         }
+        // #9653 - if our idea of dc for bind address differs from our official endpoint address,
+        // we cannot trust downgrading. We need to ensure either (local) bind address is same as
+        // broadcast or that the dc info we get for it is the same.
+        if (broadcast_address != laddr && snitch_ptr->get_datacenter(laddr) != my_dc) {
+            return true;
+        }
         // if cross-rack tls, check rack.
-        return _cfg.encrypt == encrypt_what::rack &&
-            snitch_ptr->get_rack(id.addr) != snitch_ptr->get_rack(utils::fb_utilities::get_broadcast_address())
-            ;
+        if (_cfg.encrypt == encrypt_what::dc) {
+            return false;
+        }
+        auto my_rack = snitch_ptr->get_rack(broadcast_address);
+        if (snitch_ptr->get_rack(addr) != my_rack) {
+            return true;
+        }
+        // See above: We need to ensure either (local) bind address is same as
+        // broadcast or that the rack info we get for it is the same.
+        return broadcast_address != laddr && snitch_ptr->get_rack(laddr) != my_rack;
     }();
 
     auto must_compress = [&id, this] {
@@ -681,7 +700,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
         return true;
     }();
 
-    auto remote_addr = socket_address(get_preferred_ip(id.addr), must_encrypt ? _cfg.ssl_port : _cfg.port);
+    auto remote_addr = socket_address(addr, must_encrypt ? _cfg.ssl_port : _cfg.port);
 
     rpc::client_options opts;
     // send keepalive messages each minute if connection is idle, drop connection after 10 failures
@@ -696,8 +715,6 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
         opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;
     }
 
-    bool listen_to_bc = _cfg.listen_on_broadcast_address && _cfg.ip != utils::fb_utilities::get_broadcast_address();
-    auto laddr = socket_address(listen_to_bc ? utils::fb_utilities::get_broadcast_address() : _cfg.ip, 0);
     auto client = must_encrypt ?
         ::make_shared<rpc_protocol_client_wrapper>(_rpc->protocol(), std::move(opts),
             remote_addr, laddr, _credentials) :
```
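The #9653 hunks tighten the "may we skip TLS?" decision: when the node binds on a different address than it broadcasts, the plaintext downgrade is trusted only if both addresses resolve to the same datacenter (and rack, for rack-level encryption). The DC half of the rule, sketched with a hypothetical snitch stand-in:

```cpp
#include <string>

// Hypothetical snitch stand-in: maps an address to its datacenter.
struct topology {
    std::string dc(const std::string& addr) const {
        return addr.substr(0, addr.find('.')); // placeholder mapping
    }
};

bool must_encrypt_dc(const topology& t, const std::string& peer,
                     const std::string& broadcast, const std::string& bind) {
    auto my_dc = t.dc(broadcast);
    if (t.dc(peer) != my_dc) {
        return true;                 // cross-DC traffic: always encrypt
    }
    // #9653: if the bind address maps to a different DC, the downgrade
    // cannot be trusted either.
    return bind != broadcast && t.dc(bind) != my_dc;
}
```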
```diff
@@ -1545,18 +1545,20 @@ public:
 };
 
 future<> shard_reader::close() noexcept {
-    // Nothing to do if there was no reader created, nor is there a background
-    // read ahead in progress which will create one.
-    if (!_reader && !_read_ahead) {
-        co_return;
+    if (_read_ahead) {
+        try {
+            co_await *std::exchange(_read_ahead, std::nullopt);
+        } catch (...) {
+            mrlog.warn("shard_reader::close(): read_ahead on shard {} failed: {}", _shard, std::current_exception());
+        }
     }
 
-    try {
-        if (_read_ahead) {
-            co_await *std::exchange(_read_ahead, std::nullopt);
-        }
-
-        co_await smp::submit_to(_shard, [this] {
+    co_await smp::submit_to(_shard, [this] {
+        if (!_reader) {
+            return make_ready_future<>();
+        }
+
         auto irh = std::move(*_reader).inactive_read_handle();
         return with_closeable(flat_mutation_reader(_reader.release()), [this] (flat_mutation_reader& reader) mutable {
             auto permit = reader.permit();
```
```diff
@@ -54,7 +54,7 @@ future<> feed_writer(flat_mutation_reader&& rd_ref, Writer wr) {
     auto rd = std::move(rd_ref);
     std::exception_ptr ex;
     try {
-        while (!rd.is_end_of_stream()) {
+        while (!rd.is_end_of_stream() || !rd.is_buffer_empty()) {
             co_await rd.fill_buffer();
             while (!rd.is_buffer_empty()) {
                 co_await rd.pop_mutation_fragment().consume(wr);
```
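The one-line condition change encodes the drain invariant for buffered readers: the stream is finished only when end-of-stream has been reached *and* the buffer is empty. A toy model of the fixed loop:

```cpp
#include <deque>
#include <vector>

struct toy_reader {
    std::deque<int> buffer;
    std::vector<int> source{1, 2, 3};
    size_t pos = 0;
    bool eof = false;
    void fill() {                   // refill one item; flag eof at the end
        if (pos < source.size()) {
            buffer.push_back(source[pos++]);
        } else {
            eof = true;
        }
    }
};

int consume(toy_reader& rd) {
    int sum = 0;
    // Fixed condition: keep draining while buffered data remains, even
    // after the underlying stream reports end-of-stream.
    while (!rd.eof || !rd.buffer.empty()) {
        rd.fill();
        while (!rd.buffer.empty()) {
            sum += rd.buffer.front();
            rd.buffer.pop_front();
        }
    }
    return sum;
}
```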
@@ -411,11 +411,11 @@ public:
        } else {
            // Copy row from older version because rows in evictable versions must
            // hold values which are independently complete to be consistent on eviction.
            auto e = current_allocator().construct<rows_entry>(_schema, *_current_row[0].it);
            auto e = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(_schema, *_current_row[0].it));
            e->set_continuous(latest_i && latest_i->continuous());
            _snp.tracker()->insert(*e);
            rows.insert_before(latest_i, *e);
            return {*e, true};
            auto e_i = rows.insert_before(latest_i, std::move(e));
            return ensure_result{*e_i, true};
        }
    }

@@ -447,11 +447,11 @@ public:
        }
        auto&& rows = _snp.version()->partition().mutable_clustered_rows();
        auto latest_i = get_iterator_in_latest_version();
        auto e = current_allocator().construct<rows_entry>(_schema, pos, is_dummy(!pos.is_clustering_row()),
            is_continuous(latest_i && latest_i->continuous()));
        auto e = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(_schema, pos, is_dummy(!pos.is_clustering_row()),
            is_continuous(latest_i && latest_i->continuous())));
        _snp.tracker()->insert(*e);
        rows.insert_before(latest_i, *e);
        return ensure_result{*e, true};
        auto e_i = rows.insert_before(latest_i, std::move(e));
        return ensure_result{*e_i, true};
    }

    // Brings the entry pointed to by the cursor to the front of the LRU
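Both hunks above switch the raw construct() result to a uniquely-owning handle so the entry is freed if a later step throws before the container takes ownership. A minimal sketch of the pattern, with std::unique_ptr standing in for alloc_strategy_unique_ptr (names are illustrative):

#include <list>
#include <memory>

struct entry { int v; explicit entry(int v) : v(v) {} };

void may_throw(entry&) { /* e.g. registering with an eviction tracker */ }

// Exception-safe insert: the unique_ptr owns the entry until the
// container has accepted it, so a throw in may_throw() cannot leak.
entry& insert_safely(std::list<std::unique_ptr<entry>>& rows, int v) {
    auto e = std::make_unique<entry>(v); // owned here, freed on throw
    may_throw(*e);                       // if this throws, ~unique_ptr cleans up
    rows.push_back(std::move(e));        // ownership transferred last
    return *rows.back();
}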
@@ -42,28 +42,34 @@ static auto construct_range_tombstone_entry(Args&&... args) {
}

void range_tombstone_list::apply_reversibly(const schema& s,
        clustering_key_prefix start, bound_kind start_kind,
        clustering_key_prefix end,
        clustering_key_prefix start_key, bound_kind start_kind,
        clustering_key_prefix end_key,
        bound_kind end_kind,
        tombstone tomb,
        reverter& rev)
{
    position_in_partition::less_compare less(s);
    position_in_partition start(position_in_partition::range_tag_t(), bound_view(std::move(start_key), start_kind));
    position_in_partition end(position_in_partition::range_tag_t(), bound_view(std::move(end_key), end_kind));

    if (!less(start, end)) {
        return;
    }

    if (!_tombstones.empty()) {
        bound_view::compare less(s);
        bound_view start_bound(start, start_kind);
        auto last = --_tombstones.end();
        range_tombstones_type::iterator it;
        if (less(start_bound, last->end_bound())) {
            it = _tombstones.upper_bound(start_bound, [less](auto&& sb, auto&& rt) {
                return less(sb, rt.end_bound());
        if (less(start, last->end_position())) {
            it = _tombstones.upper_bound(start, [less](auto&& sb, auto&& rt) {
                return less(sb, rt.end_position());
            });
        } else {
            it = _tombstones.end();
        }
        insert_from(s, std::move(it), std::move(start), start_kind, std::move(end), end_kind, std::move(tomb), rev);
        insert_from(s, std::move(it), std::move(start), std::move(end), std::move(tomb), rev);
        return;
    }
    auto rt = construct_range_tombstone_entry(std::move(start), start_kind, std::move(end), end_kind, std::move(tomb));
    auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), std::move(tomb));
    rev.insert(_tombstones.end(), *rt);
    rt.release();
}
@@ -81,35 +87,31 @@ void range_tombstone_list::apply_reversibly(const schema& s,
 */
void range_tombstone_list::insert_from(const schema& s,
        range_tombstones_type::iterator it,
        clustering_key_prefix start,
        bound_kind start_kind,
        clustering_key_prefix end,
        bound_kind end_kind,
        position_in_partition start,
        position_in_partition end,
        tombstone tomb,
        reverter& rev)
{
    bound_view::compare less(s);
    bound_view end_bound(end, end_kind);
    position_in_partition::tri_compare cmp(s);

    if (it != _tombstones.begin()) {
        auto prev = std::prev(it);
        if (prev->tombstone().tomb == tomb && prev->end_bound().adjacent(s, bound_view(start, start_kind))) {
            start = prev->tombstone().start;
            start_kind = prev->tombstone().start_kind;
        if (prev->tombstone().tomb == tomb && cmp(prev->end_position(), start) == 0) {
            start = prev->position();
            rev.erase(prev);
        }
    }
    while (it != _tombstones.end()) {
        bound_view start_bound(start, start_kind);
        if (less(end_bound, start_bound)) {
        if (cmp(end, start) <= 0) {
            return;
        }

        if (less(end_bound, it->start_bound())) {
        if (cmp(end, it->position()) < 0) {
            // not overlapping
            if (it->tombstone().tomb == tomb && end_bound.adjacent(s, it->start_bound())) {
                rev.update(it, {std::move(start), start_kind, it->tombstone().end, it->tombstone().end_kind, tomb});
            if (it->tombstone().tomb == tomb && cmp(end, it->position()) == 0) {
                rev.update(it, {std::move(start), std::move(start), tomb});
            } else {
                auto rt = construct_range_tombstone_entry(std::move(start), start_kind, std::move(end), end_kind, tomb);
                auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), tomb);
                rev.insert(it, *rt);
                rt.release();
            }
@@ -119,34 +121,29 @@ void range_tombstone_list::insert_from(const schema& s,
        auto c = tomb <=> it->tombstone().tomb;
        if (c == 0) {
            // same timestamp, overlapping or adjacent, so merge.
            if (less(it->start_bound(), start_bound)) {
                start = it->tombstone().start;
                start_kind = it->tombstone().start_kind;
            if (cmp(it->position(), start) < 0) {
                start = it->position();
            }
            if (less(end_bound, it->end_bound())) {
                end = it->tombstone().end;
                end_kind = it->tombstone().end_kind;
                end_bound = bound_view(end, end_kind);
            if (cmp(end, it->end_position()) < 0) {
                end = it->end_position();
            }
            it = rev.erase(it);
        } else if (c > 0) {
            // We overwrite the current tombstone.

            if (less(it->start_bound(), start_bound)) {
                auto new_end = bound_view(start, invert_kind(start_kind));
                if (!less(new_end, it->start_bound())) {
                    // Here it->start < start
                    auto rt = construct_range_tombstone_entry(it->start_bound(), new_end, it->tombstone().tomb);
                    rev.update(it, {start_bound, it->end_bound(), it->tombstone().tomb});
            if (cmp(it->position(), start) < 0) {
                {
                    auto rt = construct_range_tombstone_entry(it->position(), start, it->tombstone().tomb);
                    rev.update(it, {start, it->end_position(), it->tombstone().tomb});
                    rev.insert(it, *rt);
                    rt.release();
                }
            }

            if (less(end_bound, it->end_bound())) {
            if (cmp(end, it->end_position()) < 0) {
                // Here start <= it->start and end < it->end.
                auto rt = construct_range_tombstone_entry(std::move(start), start_kind, end, end_kind, std::move(tomb));
                rev.update(it, {std::move(end), invert_kind(end_kind), it->tombstone().end, it->tombstone().end_kind, it->tombstone().tomb});
                auto rt = construct_range_tombstone_entry(std::move(start), end, std::move(tomb));
                rev.update(it, {std::move(end), it->end_position(), it->tombstone().tomb});
                rev.insert(it, *rt);
                rt.release();
                return;
@@ -157,30 +154,28 @@ void range_tombstone_list::insert_from(const schema& s,
        } else {
            // We don't overwrite the current tombstone.

            if (less(start_bound, it->start_bound())) {
            if (cmp(start, it->position()) < 0) {
                // The new tombstone starts before the current one.
                if (less(it->start_bound(), end_bound)) {
                if (cmp(it->position(), end) < 0) {
                    // Here start < it->start and it->start < end.
                    auto new_end_kind = invert_kind(it->tombstone().start_kind);
                    if (!less(bound_view(it->tombstone().start, new_end_kind), start_bound)) {
                        auto rt = construct_range_tombstone_entry(std::move(start), start_kind, it->tombstone().start, new_end_kind, tomb);
                    {
                        auto rt = construct_range_tombstone_entry(std::move(start), it->position(), tomb);
                        it = rev.insert(it, *rt);
                        rt.release();
                        ++it;
                    }
                } else {
                    // Here start < it->start and end <= it->start, so just insert the new tombstone.
                    auto rt = construct_range_tombstone_entry(std::move(start), start_kind, std::move(end), end_kind, std::move(tomb));
                    auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), std::move(tomb));
                    rev.insert(it, *rt);
                    rt.release();
                    return;
                }
            }

            if (less(it->end_bound(), end_bound)) {
            if (cmp(it->end_position(), end) < 0) {
                // Here the current tombstone overwrites a range of the new one.
                start = it->tombstone().end;
                start_kind = invert_kind(it->tombstone().end_kind);
                start = it->end_position();
                ++it;
            } else {
                // Here the current tombstone completely overwrites the new one.
@@ -190,7 +185,7 @@ void range_tombstone_list::insert_from(const schema& s,
    }

    // If we got here, then just insert the remainder at the end.
    auto rt = construct_range_tombstone_entry(std::move(start), start_kind, std::move(end), end_kind, std::move(tomb));
    auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), std::move(tomb));
    rev.insert(it, *rt);
    rt.release();
}
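The rewritten loop is an interval merge over position_in_partition ranges: equal-timestamp neighbors are coalesced, and higher timestamps carve up lower ones. A compact sketch of the same merge rule over integer intervals (illustrative types, not the real data structures):

#include <algorithm>
#include <cstdint>
#include <vector>

struct interval { int64_t start, end, ts; }; // [start, end), deletion timestamp

// Apply `nu` to a sorted, non-overlapping list, keeping the higher
// timestamp wherever ranges overlap -- the rule insert_from() implements.
std::vector<interval> apply(std::vector<interval> l, interval nu) {
    std::vector<interval> out;
    for (auto& cur : l) {
        if (nu.start >= nu.end || cur.end <= nu.start || nu.end <= cur.start) {
            out.push_back(cur); // no overlap (or new range already consumed)
            continue;
        }
        if (cur.ts > nu.ts) {
            // Existing range wins: keep it, clip the new one around it.
            if (nu.start < cur.start) out.push_back({nu.start, cur.start, nu.ts});
            nu.start = std::max(nu.start, cur.end);
            out.push_back(cur);
        } else {
            // New range wins (equal timestamps cover the same data either way).
            if (cur.start < nu.start) out.push_back({cur.start, nu.start, cur.ts});
            if (nu.end < cur.end) {
                out.push_back({nu.start, nu.end, nu.ts}); // new range ends inside cur
                out.push_back({nu.end, cur.end, cur.ts}); // tail of cur survives
                nu.start = nu.end;                        // new range fully emitted
            }
        }
    }
    if (nu.start < nu.end) out.push_back({nu.start, nu.end, nu.ts});
    std::sort(out.begin(), out.end(),
              [](auto& a, auto& b) { return a.start < b.start; });
    return out;
}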
@@ -297,7 +297,13 @@ public:
private:
    void apply_reversibly(const schema& s, clustering_key_prefix start, bound_kind start_kind,
            clustering_key_prefix end, bound_kind end_kind, tombstone tomb, reverter& rev);
    void insert_from(const schema& s, range_tombstones_type::iterator it, clustering_key_prefix start,
            bound_kind start_kind, clustering_key_prefix end, bound_kind end_kind, tombstone tomb, reverter& rev);

    void insert_from(const schema& s,
            range_tombstones_type::iterator it,
            position_in_partition start,
            position_in_partition end,
            tombstone tomb,
            reverter& rev);

    range_tombstones_type::iterator find(const schema& s, const range_tombstone_entry& rt);
};
@@ -249,6 +249,14 @@ public:
        return _base_resources;
    }

    void release_base_resources() noexcept {
        if (_base_resources_consumed) {
            _resources -= _base_resources;
            _base_resources_consumed = false;
        }
        _semaphore.signal(std::exchange(_base_resources, {}));
    }

    sstring description() const {
        return format("{}.{}:{}",
                _schema ? _schema->ks_name() : "*",
@@ -394,6 +402,10 @@ reader_resources reader_permit::base_resources() const {
    return _impl->base_resources();
}

void reader_permit::release_base_resources() noexcept {
    return _impl->release_base_resources();
}

sstring reader_permit::description() const {
    return _impl->description();
}

@@ -161,6 +161,8 @@ public:

    reader_resources base_resources() const;

    void release_base_resources() noexcept;

    sstring description() const;

    db::timeout_clock::time_point timeout() const noexcept;

@@ -407,6 +407,10 @@ public:
            {},
            mutation_reader::forwarding::no);
    } else {
        // We can't have two permits with count resource for 1 repair.
        // So we release the one on _permit so the only one is the one the
        // shard reader will obtain.
        _permit.release_base_resources();
        _reader = make_multishard_streaming_reader(db, _schema, _permit, [this] {
            auto shard_range = _sharder.next();
            if (shard_range) {
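release_base_resources() returns the units a permit was initially charged back to the semaphore, so the repair path does not hold two count units for one operation. A minimal sketch of the idea with a counting budget (illustrative types, not the real reader_concurrency_semaphore API):

#include <cassert>

struct budget { int units = 0; };

// A permit holds `base` units taken from the shared budget at creation.
struct permit {
    budget* sem;
    int base;

    permit(budget& s, int b) : sem(&s), base(b) { sem->units -= b; }

    // Give the base units back early, e.g. before handing work to a
    // component that will take its own permit -- what the repair path does.
    void release_base_resources() noexcept {
        sem->units += base;
        base = 0; // released at most once
    }

    ~permit() { sem->units += base; } // return whatever is still held
};

int main() {
    budget sem{1};
    permit outer(sem, 1);
    assert(sem.units == 0);
    outer.release_base_resources(); // free the unit for the inner reader
    permit inner(sem, 1);           // would otherwise exceed the budget
    assert(sem.units == 0);
}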
2
seastar
Submodule seastar updated: a189cdc45d...94a462d94b
@@ -635,16 +635,16 @@ void storage_service::bootstrap() {

    // Update pending ranges now, so we correctly count ourselves as a pending replica
    // when inserting the new CDC generation.
    if (!bootstrap_rbno) {
        // When is_repair_based_node_ops_enabled is true, the bootstrap node
        // will use node_ops_cmd to bootstrap, node_ops_cmd will update the pending ranges.
        slogger.debug("bootstrap: update pending ranges: endpoint={} bootstrap_tokens={}", get_broadcast_address(), _bootstrap_tokens);
        mutate_token_metadata([this] (mutable_token_metadata_ptr tmptr) {
            auto endpoint = get_broadcast_address();
            tmptr->add_bootstrap_tokens(_bootstrap_tokens, endpoint);
            return update_pending_ranges(std::move(tmptr), format("bootstrapping node {}", endpoint));
        }).get();
    }
    if (!bootstrap_rbno) {
        // When is_repair_based_node_ops_enabled is true, the bootstrap node
        // will use node_ops_cmd to bootstrap, node_ops_cmd will update the pending ranges.
        slogger.debug("bootstrap: update pending ranges: endpoint={} bootstrap_tokens={}", get_broadcast_address(), _bootstrap_tokens);
        mutate_token_metadata([this] (mutable_token_metadata_ptr tmptr) {
            auto endpoint = get_broadcast_address();
            tmptr->add_bootstrap_tokens(_bootstrap_tokens, endpoint);
            return update_pending_ranges(std::move(tmptr), format("bootstrapping node {}", endpoint));
        }).get();
    }

    // After we pick a generation timestamp, we start gossiping it, and we stick with it.
    // We don't do any other generation switches (unless we crash before completing bootstrap).
@@ -652,19 +652,23 @@ void storage_service::bootstrap() {

    _cdc_gen_id = _cdc_gen_service.local().make_new_generation(_bootstrap_tokens, !is_first_node()).get0();

    if (!bootstrap_rbno) {
        // When is_repair_based_node_ops_enabled is true, the bootstrap node
        // will use node_ops_cmd to bootstrap, bootstrapping gossip status is not needed for bootstrap.
        _gossiper.add_local_application_state({
            // Order is important: both the CDC streams timestamp and tokens must be known when a node handles our status.
            { gms::application_state::TOKENS, versioned_value::tokens(_bootstrap_tokens) },
            { gms::application_state::CDC_GENERATION_ID, versioned_value::cdc_generation_id(_cdc_gen_id) },
            { gms::application_state::STATUS, versioned_value::bootstrapping(_bootstrap_tokens) },
        }).get();
    if (!bootstrap_rbno) {
        // When is_repair_based_node_ops_enabled is true, the bootstrap node
        // will use node_ops_cmd to bootstrap, bootstrapping gossip status is not needed for bootstrap.
        _gossiper.add_local_application_state({
            { gms::application_state::TOKENS, versioned_value::tokens(_bootstrap_tokens) },
            { gms::application_state::CDC_GENERATION_ID, versioned_value::cdc_generation_id(_cdc_gen_id) },
            { gms::application_state::STATUS, versioned_value::bootstrapping(_bootstrap_tokens) },
        }).get();

        set_mode(mode::JOINING, format("sleeping {} ms for pending range setup", get_ring_delay().count()), true);
        _gossiper.wait_for_range_setup().get();
    }
        set_mode(mode::JOINING, format("sleeping {} ms for pending range setup", get_ring_delay().count()), true);
        _gossiper.wait_for_range_setup().get();
    } else {
        // Even with RBNO bootstrap we need to announce the new CDC generation immediately after it's created.
        _gossiper.add_local_application_state({
            { gms::application_state::CDC_GENERATION_ID, versioned_value::cdc_generation_id(_cdc_gen_id) },
        }).get();
    }
} else {
    // Wait until we know tokens of existing node before announcing replacing status.
    set_mode(mode::JOINING, fmt::format("Wait until local node knows tokens of peer nodes"), true);
@@ -3670,7 +3674,7 @@ shared_ptr<abort_source> node_ops_meta_data::get_abort_source() {

void storage_service::node_ops_update_heartbeat(utils::UUID ops_uuid) {
    slogger.debug("node_ops_update_heartbeat: ops_uuid={}", ops_uuid);
    auto permit = seastar::get_units(_node_ops_abort_sem, 1);
    auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
    auto it = _node_ops.find(ops_uuid);
    if (it != _node_ops.end()) {
        node_ops_meta_data& meta = it->second;
@@ -3680,7 +3684,7 @@ void storage_service::node_ops_update_heartbeat(utils::UUID ops_uuid) {

void storage_service::node_ops_done(utils::UUID ops_uuid) {
    slogger.debug("node_ops_done: ops_uuid={}", ops_uuid);
    auto permit = seastar::get_units(_node_ops_abort_sem, 1);
    auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
    auto it = _node_ops.find(ops_uuid);
    if (it != _node_ops.end()) {
        node_ops_meta_data& meta = it->second;
@@ -3691,7 +3695,7 @@ void storage_service::node_ops_done(utils::UUID ops_uuid) {

void storage_service::node_ops_abort(utils::UUID ops_uuid) {
    slogger.debug("node_ops_abort: ops_uuid={}", ops_uuid);
    auto permit = seastar::get_units(_node_ops_abort_sem, 1);
    auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
    auto it = _node_ops.find(ops_uuid);
    if (it != _node_ops.end()) {
        node_ops_meta_data& meta = it->second;
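Without .get0(), `permit` is a future<semaphore_units<>> rather than the units themselves, so the critical section runs without actually holding the semaphore. A minimal sketch of the difference (must run inside a seastar::thread context, which is where these functions execute):

#include <seastar/core/semaphore.hh>
#include <seastar/core/thread.hh>

seastar::semaphore sem{1};

void critical_section_wrong() {
    // BUG: `permit` is a future<semaphore_units<>>; nothing is held yet,
    // so two fibers can be in here at once.
    auto permit = seastar::get_units(sem, 1);
    // ... mutate shared state ...
}

void critical_section_right() {
    // get0() waits for and takes the units; they are released when
    // `permit` goes out of scope.
    auto permit = seastar::get_units(sem, 1).get0();
    // ... mutate shared state ...
}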
@@ -49,12 +49,13 @@ private:
public:
    partition_index_cache* _parent;
    key_type _key;
    std::variant<shared_promise<>, partition_index_page> _page;
    std::variant<lw_shared_ptr<shared_promise<>>, partition_index_page> _page;
    size_t _size_in_allocator = 0;
public:
    entry(partition_index_cache* parent, key_type key)
        : _parent(parent)
        , _key(key)
        , _page(make_lw_shared<shared_promise<>>())
    { }

    void set_page(partition_index_page&& page) noexcept {
@@ -76,7 +77,7 @@ private:
    // Always returns the same value for a given state of _page.
    size_t size_in_allocator() const { return _size_in_allocator; }

    shared_promise<>& promise() { return std::get<shared_promise<>>(_page); }
    lw_shared_ptr<shared_promise<>> promise() { return std::get<lw_shared_ptr<shared_promise<>>>(_page); }
    bool ready() const { return std::holds_alternative<partition_index_page>(_page); }
    partition_index_page& page() { return std::get<partition_index_page>(_page); }
    const partition_index_page& page() const { return std::get<partition_index_page>(_page); }
@@ -207,9 +208,7 @@ public:
            return make_ready_future<entry_ptr>(std::move(ptr));
        } else {
            ++_shard_stats.blocks;
            return _as(_region, [ptr] () mutable {
                return ptr.get_entry().promise().get_shared_future();
            }).then([ptr] () mutable {
            return ptr.get_entry().promise()->get_shared_future().then([ptr] () mutable {
                return std::move(ptr);
            });
        }
@@ -234,23 +233,23 @@ public:

        // No exceptions before then_wrapped() is installed so that ptr will be eventually populated.

        return futurize_invoke(loader, key).then_wrapped([this, key, ptr] (auto&& f) mutable {
        return futurize_invoke(loader, key).then_wrapped([this, key, ptr = std::move(ptr)] (auto&& f) mutable {
            entry& e = ptr.get_entry();
            try {
                partition_index_page&& page = f.get0();
                e.promise().set_value();
                e.promise()->set_value();
                e.set_page(std::move(page));
                _shard_stats.used_bytes += e.size_in_allocator();
                ++_shard_stats.populations;
                return ptr;
            } catch (...) {
                e.promise().set_exception(std::current_exception());
                e.promise()->set_exception(std::current_exception());
                ptr = {};
                with_allocator(_region.allocator(), [&] {
                    _cache.erase(key);
                });
                throw;
            }
        }).then([ptr] {
            return ptr;
        });
    }
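Moving the shared_promise behind an lw_shared_ptr lets waiters keep the promise alive even if the cache entry that created it goes away before the load completes. A minimal sketch of the pattern with std::shared_ptr and std::promise standing in for the real cache types (illustrative only):

#include <future>
#include <memory>
#include <variant>

struct page { int data; };

// Entry starts as a shared promise; once loaded it becomes the page.
// Waiters copy the shared_ptr, so the promise outlives the entry.
struct entry {
    std::variant<std::shared_ptr<std::promise<void>>, page> state{
        std::make_shared<std::promise<void>>()};

    std::shared_ptr<std::promise<void>> promise() {
        return std::get<std::shared_ptr<std::promise<void>>>(state);
    }

    void set_page(page p) {
        state = p; // drops the entry's reference; waiters keep theirs
    }
};

int main() {
    entry e;
    auto p = e.promise();   // waiter grabs its own reference
    auto fut = p->get_future();
    p->set_value();         // publish "loaded"
    e.set_page(page{42});   // the entry could even be destroyed now
    fut.wait();             // waiter still completes safely
}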
43
test.py
@@ -291,6 +291,8 @@ class Test:
    def print_summary(self):
        pass

    def get_junit_etree(self):
        return None

    def check_log(self, trim):
        """Check and trim logs and xml output for tests which have it"""
@@ -338,9 +340,36 @@ class BoostTest(UnitTest):
        boost_args += ['--color_output=false']
        boost_args += ['--']
        self.args = boost_args + self.args
        self.casename = casename
        self.__junit_etree = None

    def get_junit_etree(self):
        def adjust_suite_name(name):
            # Normalize "path/to/file.cc" to "path.to.file" to conform to
            # Jenkins expectations that the suite name is a class name. ".cc"
            # doesn't add any information. Add the mode, otherwise failures
            # in different modes are indistinguishable. The "test/" prefix adds
            # no information, so remove it.
            import re
            name = re.sub(r'^test/', '', name)
            name = re.sub(r'\.cc$', '', name)
            name = re.sub(r'/', '.', name)
            name = f'{name}.{self.mode}'
            return name
        if self.__junit_etree is None:
            self.__junit_etree = ET.parse(self.xmlout)
            root = self.__junit_etree.getroot()
            suites = root.findall('.//TestSuite')
            for suite in suites:
                suite.attrib['name'] = adjust_suite_name(suite.attrib['name'])
                skipped = suite.findall('./TestCase[@reason="disabled"]')
                for e in skipped:
                    suite.remove(e)
            os.unlink(self.xmlout)
        return self.__junit_etree

    def check_log(self, trim):
        ET.parse(self.xmlout)
        self.get_junit_etree()
        super().check_log(trim)
@@ -800,6 +829,17 @@ def write_junit_report(tmpdir, mode):
    with open(junit_filename, "w") as f:
        ET.ElementTree(xml_results).write(f, encoding="unicode")

def write_consolidated_boost_junit_xml(tmpdir, mode):
    xml = ET.Element("TestLog")
    for suite in TestSuite.suites.values():
        for test in suite.tests:
            if test.mode != mode:
                continue
            test_xml = test.get_junit_etree()
            if test_xml is not None:
                xml.extend(test_xml.getroot().findall('.//TestSuite'))
    et = ET.ElementTree(xml)
    et.write(f'{tmpdir}/{mode}/xml/boost.xunit.xml', encoding='unicode')

def open_log(tmpdir):
    pathlib.Path(tmpdir).mkdir(parents=True, exist_ok=True)
@@ -839,6 +879,7 @@ async def main():

    for mode in options.modes:
        write_junit_report(options.tmpdir, mode)
        write_consolidated_boost_junit_xml(options.tmpdir, mode)

    if 'coverage' in options.modes:
        coverage.generate_coverage_report("build/coverage", "tests")
@@ -16,6 +16,9 @@
# along with Scylla. If not, see <http://www.gnu.org/licenses/>.

# Tests for basic table operations: CreateTable, DeleteTable, ListTables.
# Also some basic tests for UpdateTable - although UpdateTable usually
# enables more elaborate features (such as GSI or Streams) and those are
# tested elsewhere.

import pytest
from botocore.exceptions import ClientError
@@ -311,3 +314,17 @@ def test_table_sse_off(dynamodb):
        KeySchema=[{ 'AttributeName': 'p', 'KeyType': 'HASH' }],
        AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }]);
    table.delete();

# Test that trying to delete a table that doesn't exist fails in the
# appropriate way (ResourceNotFoundException)
def test_delete_table_non_existent(dynamodb, test_table):
    client = dynamodb.meta.client
    with pytest.raises(ClientError, match='ResourceNotFoundException'):
        client.delete_table(TableName=random_string(20))

# Test that trying to update a table that doesn't exist fails in the
# appropriate way (ResourceNotFoundException)
def test_update_table_non_existent(dynamodb, test_table):
    client = dynamodb.meta.client
    with pytest.raises(ClientError, match='ResourceNotFoundException'):
        client.update_table(TableName=random_string(20), BillingMode='PAY_PER_REQUEST')

@@ -1043,6 +1043,20 @@ def test_nested_attribute_remove_from_missing_item(test_table_s):
    test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE x.y')
    test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE x[0]')

# Though in an above test (test_nested_attribute_update_bad_path_dot) we
# showed that DynamoDB does not allow REMOVE x.y if attribute x doesn't
# exist - and generates a ValidationException, if x *does* exist but y
# doesn't, it's fine and the removal should just be silently ignored.
def test_nested_attribute_remove_missing_leaf(test_table_s):
    p = random_string()
    item = {'p': p, 'a': {'x': 3}, 'b': ['hi']}
    test_table_s.put_item(Item=item)
    test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE a.y')
    test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE b[7]')
    test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE c')
    # The above UpdateItem calls didn't change anything...
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == item

# Similarly for other types of bad paths - using [0] on something which
# doesn't exist or isn't an array.
def test_nested_attribute_update_bad_path_array(test_table_s):
@@ -19,6 +19,7 @@
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#include <boost/range/irange.hpp>
#include <seastar/testing/test_case.hh>
#include <seastar/testing/thread_test_case.hh>
#include <seastar/core/iostream.hh>
@@ -49,6 +50,15 @@ static sstring read_to_string(cached_file::stream& s, size_t limit = std::numeri
    return b.substr(0, limit);
}

static void read_to_void(cached_file::stream& s, size_t limit = std::numeric_limits<size_t>::max()) {
    while (auto buf = s.next().get0()) {
        if (buf.size() >= limit) {
            break;
        }
        limit -= buf.size();
    }
}

static sstring read_to_string(file& f, size_t start, size_t len) {
    file_input_stream_options opt;
    auto in = make_file_input_stream(f, start, len, opt);
@@ -61,6 +71,12 @@ static sstring read_to_string(cached_file& cf, size_t off, size_t limit = std::n
    return read_to_string(s, limit);
}

[[gnu::unused]]
static void read_to_void(cached_file& cf, size_t off, size_t limit = std::numeric_limits<size_t>::max()) {
    auto s = cf.read(off, default_priority_class(), std::nullopt);
    read_to_void(s, limit);
}

struct test_file {
    tmpdir dir;
    file f;
@@ -204,7 +220,9 @@ SEASTAR_THREAD_TEST_CASE(test_eviction_via_lru) {
    }

    {
        cf_lru.evict_all();
        with_allocator(region.allocator(), [] {
            cf_lru.evict_all();
        });

        BOOST_REQUIRE_EQUAL(0, metrics.cached_bytes); // change here
        BOOST_REQUIRE_EQUAL(0, cf.cached_bytes()); // change here
@@ -212,6 +230,8 @@ SEASTAR_THREAD_TEST_CASE(test_eviction_via_lru) {
        BOOST_REQUIRE_EQUAL(3, metrics.page_evictions); // change here
        BOOST_REQUIRE_EQUAL(0, metrics.page_hits);
        BOOST_REQUIRE_EQUAL(3, metrics.page_populations);

        BOOST_REQUIRE_EQUAL(region.occupancy().used_space(), 0);
    }

    {
@@ -255,6 +275,88 @@ SEASTAR_THREAD_TEST_CASE(test_eviction_via_lru) {
    }
}

// A file which serves garbage but is very fast.
class garbage_file_impl : public file_impl {
private:
    [[noreturn]] void unsupported() {
        throw_with_backtrace<std::logic_error>("unsupported operation");
    }
public:
    // unsupported
    virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override { unsupported(); }
    virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override { unsupported(); }
    virtual future<> flush(void) override { unsupported(); }
    virtual future<> truncate(uint64_t length) override { unsupported(); }
    virtual future<> discard(uint64_t offset, uint64_t length) override { unsupported(); }
    virtual future<> allocate(uint64_t position, uint64_t length) override { unsupported(); }
    virtual subscription<directory_entry> list_directory(std::function<future<>(directory_entry)>) override { unsupported(); }
    virtual future<struct stat> stat(void) override { unsupported(); }
    virtual future<uint64_t> size(void) override { unsupported(); }
    virtual std::unique_ptr<seastar::file_handle_impl> dup() override { unsupported(); }

    virtual future<> close() override { return make_ready_future<>(); }

    virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t size, const io_priority_class& pc) override {
        return make_ready_future<temporary_buffer<uint8_t>>(temporary_buffer<uint8_t>(size));
    }

    virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) override {
        unsupported(); // FIXME
    }

    virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override {
        unsupported(); // FIXME
    }
};

#ifndef SEASTAR_DEFAULT_ALLOCATOR // Eviction works only with the seastar allocator
SEASTAR_THREAD_TEST_CASE(test_stress_eviction) {
    auto page_size = cached_file::page_size;
    auto n_pages = 8'000'000 / page_size;
    auto file_size = page_size * n_pages;
    auto cached_size = 4'000'000;

    cached_file::metrics metrics;
    logalloc::region region;

    auto f = file(make_shared<garbage_file_impl>());
    cached_file cf(f, metrics, cf_lru, region, file_size);

    region.make_evictable([&] {
        testlog.trace("Evicting");
        cf.invalidate_at_most_front(file_size / 2);
        return cf_lru.evict();
    });

    for (int i = 0; i < (cached_size / page_size); ++i) {
        read_to_string(cf, page_size * i, page_size);
    }

    testlog.debug("Saturating memory...");

    // Disable background reclaiming, which would otherwise prevent bugs from reproducing.
    // We want reclamation to happen synchronously with page cache population in read_to_void()
    seastar::memory::set_min_free_pages(0);

    // Saturate std memory
    chunked_fifo<bytes> blobs;
    auto rc = region.reclaim_counter();
    while (region.reclaim_counter() == rc) {
        blobs.emplace_back(bytes(bytes::initialized_later(), 1024));
    }

    testlog.debug("Memory: allocated={}, free={}", seastar::memory::stats().allocated_memory(), seastar::memory::stats().free_memory());
    testlog.debug("Starting test...");

    for (int j = 0; j < n_pages * 16; ++j) {
        testlog.trace("Allocating");
        auto stride = tests::random::get_int(1, 20);
        auto page_idx = tests::random::get_int(n_pages - stride);
        read_to_void(cf, page_idx * page_size, page_size * stride);
    }
}
#endif

SEASTAR_THREAD_TEST_CASE(test_invalidation) {
    auto page_size = cached_file::page_size;
    test_file tf = make_test_file(page_size * 2);
@@ -25,6 +25,8 @@
#include <deque>
#include <random>
#include "utils/lsa/chunked_managed_vector.hh"
#include "utils/managed_ref.hh"
#include "test/lib/log.hh"

#include <boost/range/algorithm/sort.hpp>
#include <boost/range/algorithm/equal.hpp>
@@ -216,3 +218,106 @@ SEASTAR_TEST_CASE(tests_reserve_partial) {
    });
    return make_ready_future<>();
}

SEASTAR_TEST_CASE(test_clear_and_release) {
    region region;
    allocating_section as;

    with_allocator(region.allocator(), [&] {
        lsa::chunked_managed_vector<managed_ref<uint64_t>> v;

        for (uint64_t i = 1; i < 4000; ++i) {
            as(region, [&] {
                v.emplace_back(make_managed<uint64_t>(i));
            });
        }

        v.clear_and_release();
    });

    return make_ready_future<>();
}

SEASTAR_TEST_CASE(test_chunk_reserve) {
    region region;
    allocating_section as;

    for (auto conf :
        { // std::make_pair(reserve size, push count)
            std::make_pair(0, 4000),
            std::make_pair(100, 4000),
            std::make_pair(200, 4000),
            std::make_pair(1000, 4000),
            std::make_pair(2000, 4000),
            std::make_pair(3000, 4000),
            std::make_pair(5000, 4000),
            std::make_pair(500, 8000),
            std::make_pair(1000, 8000),
            std::make_pair(2000, 8000),
            std::make_pair(8000, 500),
        })
    {
        with_allocator(region.allocator(), [&] {
            auto [reserve_size, push_count] = conf;
            testlog.info("Testing reserve({}), {}x emplace_back()", reserve_size, push_count);
            lsa::chunked_managed_vector<managed_ref<uint64_t>> v;
            v.reserve(reserve_size);
            uint64_t seed = rand();
            for (uint64_t i = 0; i < push_count; ++i) {
                as(region, [&] {
                    v.emplace_back(make_managed<uint64_t>(seed + i));
                    BOOST_REQUIRE(**v.begin() == seed);
                });
            }
            auto v_it = v.begin();
            for (uint64_t i = 0; i < push_count; ++i) {
                BOOST_REQUIRE(**v_it++ == seed + i);
            }
            v.clear_and_release();
        });
    }

    return make_ready_future<>();
}

// Tests the case of make_room() invoked with last_chunk_capacity_deficit but _size not in
// the last reserved chunk.
SEASTAR_TEST_CASE(test_shrinking_and_expansion_involving_chunk_boundary) {
    region region;
    allocating_section as;

    with_allocator(region.allocator(), [&] {
        lsa::chunked_managed_vector<managed_ref<uint64_t>> v;

        // Fill two chunks
        v.reserve(2000);
        for (uint64_t i = 0; i < 2000; ++i) {
            as(region, [&] {
                v.emplace_back(make_managed<uint64_t>(i));
            });
        }

        // Make the last chunk smaller than max size to trigger the last_chunk_capacity_deficit path in make_room()
        v.shrink_to_fit();

        // Leave the last chunk reserved but empty
        for (uint64_t i = 0; i < 1000; ++i) {
            v.pop_back();
        }

        // Try to reserve more than the currently reserved capacity and trigger last_chunk_capacity_deficit path
        // with _size not in the last chunk. Should not sigsegv.
        v.reserve(8000);

        for (uint64_t i = 0; i < 2000; ++i) {
            as(region, [&] {
                v.emplace_back(make_managed<uint64_t>(i));
            });
        }

        v.clear_and_release();
    });

    return make_ready_future<>();
}
@@ -191,3 +191,32 @@ BOOST_AUTO_TEST_CASE(tests_reserve_partial) {
        BOOST_REQUIRE_EQUAL(v.capacity(), orig_size);
    }
}

// Tests the case of make_room() invoked with last_chunk_capacity_deficit but _size not in
// the last reserved chunk.
BOOST_AUTO_TEST_CASE(test_shrinking_and_expansion_involving_chunk_boundary) {
    using vector_type = utils::chunked_vector<std::unique_ptr<uint64_t>>;
    vector_type v;

    // Fill two chunks
    v.reserve(vector_type::max_chunk_capacity() * 3 / 2);
    for (uint64_t i = 0; i < vector_type::max_chunk_capacity() * 3 / 2; ++i) {
        v.emplace_back(std::make_unique<uint64_t>(i));
    }

    // Make the last chunk smaller than max size to trigger the last_chunk_capacity_deficit path in make_room()
    v.shrink_to_fit();

    // Leave the last chunk reserved but empty
    for (uint64_t i = 0; i < vector_type::max_chunk_capacity(); ++i) {
        v.pop_back();
    }

    // Try to reserve more than the currently reserved capacity and trigger last_chunk_capacity_deficit path
    // with _size not in the last chunk. Should not sigsegv.
    v.reserve(vector_type::max_chunk_capacity() * 4);

    for (uint64_t i = 0; i < vector_type::max_chunk_capacity() * 2; ++i) {
        v.emplace_back(std::make_unique<uint64_t>(i));
    }
}
@@ -44,7 +44,9 @@
#include "test/lib/tmpdir.hh"
#include "db/commitlog/commitlog.hh"
#include "db/commitlog/commitlog_replayer.hh"
#include "db/commitlog/commitlog_extensions.hh"
#include "db/commitlog/rp_set.hh"
#include "db/extensions.hh"
#include "log.hh"
#include "service/priority_manager.hh"
#include "test/lib/exception_utils.hh"
@@ -947,3 +949,113 @@ SEASTAR_TEST_CASE(test_commitlog_deadlock_with_flush_threshold) {
        co_await log.clear();
    }
}

static future<> do_test_exception_in_allocate_ex(bool do_file_delete, bool reuse = true) {
    commitlog::config cfg;

    constexpr auto max_size_mb = 1;

    cfg.commitlog_segment_size_in_mb = max_size_mb;
    cfg.commitlog_total_space_in_mb = 2 * max_size_mb * smp::count;
    cfg.commitlog_sync_period_in_ms = 10;
    cfg.reuse_segments = reuse;
    cfg.allow_going_over_size_limit = false; // #9348 - now can enforce size limit always
    cfg.use_o_dsync = true; // make sure we pre-allocate.

    // not using cl_test, because we need to be able to abandon
    // the log.

    tmpdir tmp;
    cfg.commit_log_location = tmp.path().string();

    class myfail : public std::exception {
    public:
        using std::exception::exception;
    };

    struct myext: public db::commitlog_file_extension {
    public:
        bool fail = false;
        bool thrown = false;
        bool do_file_delete;

        myext(bool dd)
            : do_file_delete(dd)
        {}

        seastar::future<seastar::file> wrap_file(const seastar::sstring& filename, seastar::file f, seastar::open_flags flags) override {
            if (fail && !thrown) {
                thrown = true;
                if (do_file_delete) {
                    co_await f.close();
                    co_await seastar::remove_file(filename);
                }
                throw myfail{};
            }
            co_return f;
        }
        seastar::future<> before_delete(const seastar::sstring&) override {
            co_return;
        }
    };

    auto ep = std::make_unique<myext>(do_file_delete);
    auto& mx = *ep;

    db::extensions myexts;
    myexts.add_commitlog_file_extension("hufflepuff", std::move(ep));

    cfg.extensions = &myexts;

    auto log = co_await commitlog::create_commitlog(cfg);

    rp_set rps;
    // uncomment for verbosity
    // logging::logger_registry().set_logger_level("commitlog", logging::log_level::debug);

    auto uuid = utils::UUID_gen::get_time_UUID();
    auto size = log.max_record_size();

    auto r = log.add_flush_handler([&](cf_id_type id, replay_position pos) {
        log.discard_completed_segments(id, rps);
        mx.fail = true;
    });

    try {
        while (!mx.thrown) {
            rp_handle h = co_await log.add_mutation(uuid, size, db::commitlog::force_sync::no, [&](db::commitlog::output& dst) {
                dst.fill('1', size);
            });
            rps.put(std::move(h));
        }
    } catch (...) {
        BOOST_FAIL("log write timed out. maybe it is deadlocked... Will not free log. ASAN errors and leaks will follow...");
    }

    co_await log.shutdown();
    co_await log.clear();
}

/**
 * Test generating an exception in segment file allocation
 */
SEASTAR_TEST_CASE(test_commitlog_exceptions_in_allocate_ex) {
    co_await do_test_exception_in_allocate_ex(false);
}

SEASTAR_TEST_CASE(test_commitlog_exceptions_in_allocate_ex_no_recycle) {
    co_await do_test_exception_in_allocate_ex(false, false);
}

/**
 * Test generating an exception in segment file allocation, but also
 * delete the file, which in turn should cause follow-up exceptions
 * in cleanup delete, which CL should handle.
 */
SEASTAR_TEST_CASE(test_commitlog_exceptions_in_allocate_ex_deleted_file) {
    co_await do_test_exception_in_allocate_ex(true, false);
}

SEASTAR_TEST_CASE(test_commitlog_exceptions_in_allocate_ex_deleted_file_no_recycle) {
    co_await do_test_exception_in_allocate_ex(true);
}
@@ -22,6 +22,8 @@
#include <seastar/testing/test_case.hh>
#include "test/lib/cql_test_env.hh"
#include "test/lib/cql_assertions.hh"
#include "cql3/untyped_result_set.hh"
#include "cql3/query_processor.hh"
#include "transport/messages/result_message.hh"

SEASTAR_TEST_CASE(test_index_with_paging) {
@@ -56,3 +58,51 @@ SEASTAR_TEST_CASE(test_index_with_paging) {
        });
    });
}

SEASTAR_TEST_CASE(test_index_with_paging_with_base_short_read) {
    return do_with_cql_env_thread([] (auto& e) {
        e.execute_cql("CREATE TABLE tab (pk int, ck text, v int, v2 int, v3 text, PRIMARY KEY (pk, ck))").get();
        e.execute_cql("CREATE INDEX ON tab (v)").get();

        // Enough to trigger a short read on the base table during scan
        sstring big_string(2 * query::result_memory_limiter::maximum_result_size, 'j');

        const int row_count = 67;
        for (int i = 0; i < row_count; ++i) {
            e.execute_cql(format("INSERT INTO tab (pk, ck, v, v2, v3) VALUES ({}, 'hello{}', 1, {}, '{}')", i % 3, i, i, big_string)).get();
        }

        eventually([&] {
            uint64_t count = 0;
            e.qp().local().query_internal("SELECT * FROM ks.tab WHERE v = 1", [&] (const cql3::untyped_result_set_row&) {
                ++count;
                return make_ready_future<stop_iteration>(stop_iteration::no);
            }).get();
            BOOST_REQUIRE_EQUAL(count, row_count);
        });
    });
}

SEASTAR_TEST_CASE(test_index_with_paging_with_base_short_read_no_ck) {
    return do_with_cql_env_thread([] (auto& e) {
        e.execute_cql("CREATE TABLE tab (pk int, v int, v2 int, v3 text, PRIMARY KEY (pk))").get();
        e.execute_cql("CREATE INDEX ON tab (v)").get();

        // Enough to trigger a short read on the base table during scan
        sstring big_string(2 * query::result_memory_limiter::maximum_result_size, 'j');

        const int row_count = 67;
        for (int i = 0; i < row_count; ++i) {
            e.execute_cql(format("INSERT INTO tab (pk, v, v2, v3) VALUES ({}, 1, {}, '{}')", i, i, big_string)).get();
        }

        eventually([&] {
            uint64_t count = 0;
            e.qp().local().query_internal("SELECT * FROM ks.tab WHERE v = 1", [&] (const cql3::untyped_result_set_row&) {
                ++count;
                return make_ready_future<stop_iteration>(stop_iteration::no);
            }).get();
            BOOST_REQUIRE_EQUAL(count, row_count);
        });
    });
}
@@ -702,6 +702,7 @@ SEASTAR_TEST_CASE(test_cell_ordering) {
    };

    auto assert_equal = [] (atomic_cell_view c1, atomic_cell_view c2) {
        testlog.trace("Expected {} == {}", c1, c2);
        BOOST_REQUIRE(compare_atomic_cell_for_merge(c1, c2) == 0);
        BOOST_REQUIRE(compare_atomic_cell_for_merge(c2, c1) == 0);
    };
@@ -723,9 +724,11 @@ SEASTAR_TEST_CASE(test_cell_ordering) {
        atomic_cell::make_live(*bytes_type, 1, bytes(), expiry_2, ttl_2));

    // Origin doesn't compare ttl (is it wise?)
    assert_equal(
        atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_1),
        atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_2));
    // But we do. See https://github.com/scylladb/scylla/issues/10156
    // and https://github.com/scylladb/scylla/issues/10173
    assert_order(
        atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_2),
        atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_1));

    assert_order(
        atomic_cell::make_live(*bytes_type, 0, bytes("value1")),
@@ -210,6 +210,35 @@ BOOST_AUTO_TEST_CASE(test_overlapping_addition) {
    BOOST_REQUIRE(it == l.end());
}

BOOST_AUTO_TEST_CASE(test_adjacent_empty_range_tombstone) {
    range_tombstone_list l(*s);

    l.apply(*s, rtie(1, 1, 2));
    l.apply(*s, rt(1, 2, 3));
    l.apply(*s, rtei(2, 2, 2));
    l.apply(*s, rtei(2, 4, 3));

    auto it = l.begin();
    assert_rt(rt(1, 4, 3), *it++);
    BOOST_REQUIRE(it == l.end());
}

BOOST_AUTO_TEST_CASE(test_empty_range_tombstones_are_dropped) {
    range_tombstone_list l(*s);

    l.apply(*s, rtei(0, 0, 1));
    l.apply(*s, rtie(0, 0, 1));
    l.apply(*s, rt(1, 2, 1));
    l.apply(*s, rtei(4, 4, 1));
    l.apply(*s, rtie(5, 5, 1));
    l.apply(*s, rt(7, 8, 1));

    auto it = l.begin();
    assert_rt(rt(1, 2, 1), *it++);
    assert_rt(rt(7, 8, 1), *it++);
    BOOST_REQUIRE(it == l.end());
}

BOOST_AUTO_TEST_CASE(test_simple_overlap) {
    range_tombstone_list l1(*s);

@@ -473,6 +502,23 @@ static std::vector<range_tombstone> make_random() {
        rts.emplace_back(std::move(start_b), std::move(end_b), tombstone(dist(gen), gc_now));
    }

    int32_t size_empty = dist(gen) / 2;
    for (int32_t i = 0; i < size_empty; ++i) {
        clustering_key_prefix key = make_random_ckey();
        bool start_incl = dist(gen) > 25;
        if (start_incl) {
            rts.emplace_back(
                position_in_partition::before_key(key),
                position_in_partition::before_key(key),
                tombstone(dist(gen), gc_now));
        } else {
            rts.emplace_back(
                position_in_partition::after_key(key),
                position_in_partition::after_key(key),
                tombstone(dist(gen), gc_now));
        }
    }

    return rts;
}
@@ -37,11 +37,13 @@ static void add_entry(logalloc::region& r,
{
    logalloc::allocating_section as;
    as(r, [&] {
        sstables::key sst_key = sstables::key::from_partition_key(s, key);
        page._entries.push_back(make_managed<index_entry>(
            managed_bytes(sst_key.get_bytes()),
            position,
            managed_ref<promoted_index>()));
        with_allocator(r.allocator(), [&] {
            sstables::key sst_key = sstables::key::from_partition_key(s, key);
            page._entries.push_back(make_managed<index_entry>(
                managed_bytes(sst_key.get_bytes()),
                position,
                managed_ref<promoted_index>()));
        });
    });
}
@@ -128,3 +128,16 @@ def test_operator_ne_not_supported(cql, table1):
        cql.execute(f'SELECT a FROM {table1} WHERE a != 0')
    with pytest.raises(InvalidRequest, match='Unsupported.*!='):
        cql.execute(f'SELECT a FROM {table1} WHERE token(a) != 0')

# Test that the fact that a column is indexed does not cause us to fetch
# incorrect results from a filtering query (issue #10300).
def test_index_with_in_relation(scylla_only, cql, test_keyspace):
    schema = 'p int, c int, v boolean, primary key (p,c)'
    with new_test_table(cql, test_keyspace, schema) as table:
        cql.execute(f"create index on {table}(v)")
        for p, c, v in [(0,0,True),(0,1,False),(0,2,True),(0,3,False),
                        (1,0,True),(1,1,False),(1,2,True),(1,3,False),
                        (2,0,True),(2,1,False),(2,2,True),(2,3,False)]:
            cql.execute(f"insert into {table} (p,c,v) values ({p}, {c}, {v})")
        res = cql.execute(f"select * from {table} where p in (0,1) and v = False ALLOW FILTERING")
        assert set(res) == set([(0,1,False),(0,3,False),(1,1,False),(1,3,False)])
@@ -63,8 +63,9 @@ def test_insert_null_key(cql, table1):
    with pytest.raises(InvalidRequest, match='null value'):
        cql.execute(stmt, [None, s])

# Tests handling of "key_column in ?" where ? is bound to null.
# Reproduces issue #8265.
def test_primary_key_in_null(cql, table1):
    '''Tests handling of "key_column in ?" where ? is bound to null.'''
    with pytest.raises(InvalidRequest, match='null value'):
        cql.execute(cql.prepare(f"SELECT p FROM {table1} WHERE p IN ?"), [None])
    with pytest.raises(InvalidRequest, match='null value'):
@@ -159,6 +160,20 @@ def test_delete_empty_string_key(cql, table1):
    with pytest.raises(InvalidRequest, match='Key may not be empty'):
        cql.execute(f"DELETE FROM {table1} WHERE p='' AND c='{s}'")

# Another test like test_insert_empty_string_key() just using an INSERT JSON
# instead of a regular INSERT. Because INSERT JSON takes a different code path
# from regular INSERT, we need the emptiness test in yet another place.
# Reproduces issue #9853 (the empty-string partition key was allowed, and
# actually inserted into the table.)
def test_insert_json_empty_string_key(cql, table1):
    s = random_string()
    # An empty-string clustering *is* allowed:
    cql.execute("""INSERT INTO %s JSON '{"p": "%s", "c": "", "v": "cat"}'""" % (table1, s))
    assert list(cql.execute(f"SELECT v FROM {table1} WHERE p='{s}' AND c=''")) == [('cat',)]
    # But an empty-string partition key is *not* allowed, with a specific
    # error that a "Key may not be empty":
    with pytest.raises(InvalidRequest, match='Key may not be empty'):
        cql.execute("""INSERT INTO %s JSON '{"p": "", "c": "%s", "v": "cat"}'""" % (table1, s))

# Although an empty string is not allowed as a partition key (as tested
# above by test_empty_string_key()), it turns out that in a *compound*
@@ -1,4 +1,14 @@
create table tb2 (pk int, ck int, PRIMARY KEY (pk, ck)) with compact storage and cdc = {'enabled': true, 'preimage': true, 'postimage': true};
-- Should add 3 rows (preimage + postimage + delta). Delta has only key columns and "pk" + "ck"
insert into tb2 (pk, ck) VALUES (2, 22) USING TTL 2222;
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck from tb2_scylla_cdc_log;
create table tb2 (pk int, ck int, v int, PRIMARY KEY (pk, ck)) with compact storage and cdc = {'enabled': true, 'preimage': true, 'postimage': true};
-- Should add 2 rows (postimage + delta).
insert into tb2 (pk, ck, v) VALUES (2, 22, 111) USING TTL 2222;
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck, v from tb2_scylla_cdc_log;
-- Should add 3 rows (preimage + postimage + delta).
insert into tb2 (pk, ck, v) VALUES (2, 22, 1111) USING TTL 2223;
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck, v from tb2_scylla_cdc_log;
create table tb3 (pk int, ck int, PRIMARY KEY (pk, ck)) with compact storage and cdc = {'enabled': true, 'preimage': true, 'postimage': true};
-- Should add 2 rows (postimage + delta).
insert into tb3 (pk, ck) VALUES (2, 22) USING TTL 2222;
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck from tb3_scylla_cdc_log;
-- Should add 3 rows (preimage + postimage + delta).
insert into tb3 (pk, ck) VALUES (2, 22) USING TTL 2223;
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck from tb3_scylla_cdc_log;
@@ -1,13 +1,91 @@
create table tb2 (pk int, ck int, PRIMARY KEY (pk, ck)) with compact storage and cdc = {'enabled': true, 'preimage': true, 'postimage': true};
create table tb2 (pk int, ck int, v int, PRIMARY KEY (pk, ck)) with compact storage and cdc = {'enabled': true, 'preimage': true, 'postimage': true};
{
    "status" : "ok"
}
-- Should add 3 rows (preimage + postimage + delta). Delta has only key columns and "pk" + "ck"
insert into tb2 (pk, ck) VALUES (2, 22) USING TTL 2222;
-- Should add 2 rows (postimage + delta).
insert into tb2 (pk, ck, v) VALUES (2, 22, 111) USING TTL 2222;
{
    "status" : "ok"
}
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck from tb2_scylla_cdc_log;
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck, v from tb2_scylla_cdc_log;
{
    "rows" :
    [
        {
            "cdc$batch_seq_no" : "0",
            "cdc$operation" : "1",
            "cdc$ttl" : "2222",
            "ck" : "22",
            "pk" : "2",
            "v" : "111"
        },
        {
            "cdc$batch_seq_no" : "1",
            "cdc$operation" : "9",
            "ck" : "22",
            "pk" : "2",
            "v" : "111"
        }
    ]
}
-- Should add 3 rows (preimage + postimage + delta).
insert into tb2 (pk, ck, v) VALUES (2, 22, 1111) USING TTL 2223;
{
    "status" : "ok"
}
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck, v from tb2_scylla_cdc_log;
{
    "rows" :
    [
        {
            "cdc$batch_seq_no" : "0",
            "cdc$operation" : "1",
            "cdc$ttl" : "2222",
            "ck" : "22",
            "pk" : "2",
            "v" : "111"
        },
        {
            "cdc$batch_seq_no" : "1",
            "cdc$operation" : "9",
            "ck" : "22",
            "pk" : "2",
            "v" : "111"
        },
        {
            "cdc$batch_seq_no" : "0",
            "cdc$operation" : "0",
            "ck" : "22",
            "pk" : "2",
            "v" : "111"
        },
        {
            "cdc$batch_seq_no" : "1",
            "cdc$operation" : "1",
            "cdc$ttl" : "2223",
            "ck" : "22",
            "pk" : "2",
            "v" : "1111"
        },
        {
            "cdc$batch_seq_no" : "2",
            "cdc$operation" : "9",
            "ck" : "22",
            "pk" : "2",
            "v" : "1111"
        }
    ]
}
create table tb3 (pk int, ck int, PRIMARY KEY (pk, ck)) with compact storage and cdc = {'enabled': true, 'preimage': true, 'postimage': true};
{
    "status" : "ok"
}
-- Should add 2 rows (postimage + delta).
insert into tb3 (pk, ck) VALUES (2, 22) USING TTL 2222;
{
    "status" : "ok"
}
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck from tb3_scylla_cdc_log;
{
    "rows" :
    [
@@ -26,3 +104,46 @@ select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck from tb2_scylla_cd
        }
    ]
}
-- Should add 3 rows (preimage + postimage + delta).
insert into tb3 (pk, ck) VALUES (2, 22) USING TTL 2223;
{
    "status" : "ok"
}
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck from tb3_scylla_cdc_log;
{
    "rows" :
    [
        {
            "cdc$batch_seq_no" : "0",
            "cdc$operation" : "1",
            "cdc$ttl" : "2222",
            "ck" : "22",
            "pk" : "2"
        },
        {
            "cdc$batch_seq_no" : "1",
            "cdc$operation" : "9",
            "ck" : "22",
            "pk" : "2"
        },
        {
            "cdc$batch_seq_no" : "0",
            "cdc$operation" : "0",
            "ck" : "22",
            "pk" : "2"
        },
        {
            "cdc$batch_seq_no" : "1",
            "cdc$operation" : "1",
            "cdc$ttl" : "2223",
            "ck" : "22",
            "pk" : "2"
        },
        {
            "cdc$batch_seq_no" : "2",
            "cdc$operation" : "9",
            "ck" : "22",
            "pk" : "2"
        }
    ]
}
@@ -1230,7 +1230,7 @@ std::unique_ptr<cql_server::response> cql_server::connection::make_read_timeout_
std::unique_ptr<cql_server::response> cql_server::connection::make_read_failure_error(int16_t stream, exceptions::exception_code err, sstring msg, db::consistency_level cl, int32_t received, int32_t numfailures, int32_t blockfor, bool data_present, const tracing::trace_state_ptr& tr_state) const
{
if (_version < 4) {
return make_read_timeout_error(stream, err, std::move(msg), cl, received, blockfor, data_present, tr_state);
return make_read_timeout_error(stream, exceptions::exception_code::READ_TIMEOUT, std::move(msg), cl, received, blockfor, data_present, tr_state);
}
auto response = std::make_unique<cql_server::response>(stream, cql_binary_opcode::ERROR, tr_state);
response->write_int(static_cast<int32_t>(err));
@@ -1258,7 +1258,7 @@ std::unique_ptr<cql_server::response> cql_server::connection::make_mutation_writ
std::unique_ptr<cql_server::response> cql_server::connection::make_mutation_write_failure_error(int16_t stream, exceptions::exception_code err, sstring msg, db::consistency_level cl, int32_t received, int32_t numfailures, int32_t blockfor, db::write_type type, const tracing::trace_state_ptr& tr_state) const
{
if (_version < 4) {
return make_mutation_write_timeout_error(stream, err, std::move(msg), cl, received, blockfor, type, tr_state);
return make_mutation_write_timeout_error(stream, exceptions::exception_code::WRITE_TIMEOUT, std::move(msg), cl, received, blockfor, type, tr_state);
}
auto response = std::make_unique<cql_server::response>(stream, cql_binary_opcode::ERROR, tr_state);
response->write_int(static_cast<int32_t>(err));
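Both hunks above fix the same protocol downgrade: Read_failure and Write_failure error codes exist only from CQL binary protocol v4, so for a v3 client the server rewrites the error as a timeout, and it must substitute the corresponding timeout code rather than forward the v4 failure code the client cannot decode. A standalone sketch of the substitution, using the native-protocol error-code values (the helper name is illustrative, not Scylla's):

    #include <cassert>
    #include <cstdint>

    enum class exception_code : int32_t {       // subset of the protocol's error codes
        WRITE_TIMEOUT = 0x1100,
        READ_TIMEOUT  = 0x1200,
        READ_FAILURE  = 0x1300,
        WRITE_FAILURE = 0x1500,
    };

    // What the fixed code effectively does before writing a pre-v4 error frame.
    exception_code downgrade_for_pre_v4(exception_code err) {
        switch (err) {
        case exception_code::READ_FAILURE:  return exception_code::READ_TIMEOUT;
        case exception_code::WRITE_FAILURE: return exception_code::WRITE_TIMEOUT;
        default:                            return err;
        }
    }

    int main() {
        const int version = 3;                  // client's negotiated protocol version
        exception_code err = exception_code::READ_FAILURE;
        if (version < 4) {
            err = downgrade_for_pre_v4(err);
        }
        assert(err == exception_code::READ_TIMEOUT);
    }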
@@ -157,6 +157,7 @@ private:
metrics& _metrics;
lru& _lru;
logalloc::region& _region;
logalloc::allocating_section _as;

using cache_type = bplus::tree<page_idx_type, cached_page, page_idx_less_comparator, 12, bplus::key_search::linear>;
cache_type _cache;
@@ -187,10 +188,15 @@ private:
.then([this, idx] (temporary_buffer<char>&& buf) mutable {
cached_page::ptr_type first_page;
while (buf.size()) {
auto this_buf = buf.share();
this_buf.trim(std::min(page_size, buf.size()));
buf.trim_front(this_buf.size());
auto it_and_flag = _cache.emplace(idx, this, idx, std::move(this_buf));
auto this_size = std::min(page_size, buf.size());
// _cache.emplace() needs to run under allocating section even though it lives in the std space
// because bplus::tree operations are not reentrant, so we need to prevent memory reclamation.
auto it_and_flag = _as(_region, [&] {
auto this_buf = buf.share();
this_buf.trim(this_size);
return _cache.emplace(idx, this, idx, std::move(this_buf));
});
buf.trim_front(this_size);
++idx;
cached_page &cp = *it_and_flag.first;
if (it_and_flag.second) {
@@ -333,6 +339,7 @@ public:
}

size_t evict_range(cache_type::iterator start, cache_type::iterator end) noexcept {
return with_allocator(standard_allocator(), [&] {
size_t count = 0;
auto disposer = [] (auto* p) noexcept {};
while (start != end) {
@@ -345,6 +352,7 @@ public:
}
}
return count;
});
}
public:
/// \brief Constructs a cached_file.
@@ -471,8 +479,10 @@ public:
inline
void cached_file::cached_page::on_evicted() noexcept {
parent->on_evicted(*this);
cached_file::cache_type::iterator it(this);
it.erase(page_idx_less_comparator());
with_allocator(standard_allocator(), [this] {
cached_file::cache_type::iterator it(this);
it.erase(page_idx_less_comparator());
});
}

class cached_file_impl : public file_impl {
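The populate hunk above routes _cache.emplace() through logalloc::allocating_section so that memory reclamation, which can re-enter arbitrary code and would race with the non-reentrant bplus::tree mutation, stays out of the critical section; the evict_range and on_evicted hunks wrap the matching erases in with_allocator(standard_allocator(), ...) so nodes are freed in the same (std) space they were allocated in. A simplified standalone sketch of the first pattern, with hypothetical names (reclaimable_heap, with_reclaim_disabled) standing in for the region/allocating_section machinery:

    #include <cassert>
    #include <map>
    #include <vector>

    class reclaimable_heap {
        bool _reclaim_enabled = true;
    public:
        bool reclaim_enabled() const { return _reclaim_enabled; }
        void set_reclaim(bool on) { _reclaim_enabled = on; }
    };

    // Mirrors allocating_section::operator()(region&, func): disable
    // reclamation, run the non-reentrant mutation, restore on any exit.
    template <typename Func>
    auto with_reclaim_disabled(reclaimable_heap& heap, Func&& f) {
        bool prev = heap.reclaim_enabled();
        heap.set_reclaim(false);
        try {
            auto ret = f();
            heap.set_reclaim(prev);
            return ret;
        } catch (...) {
            heap.set_reclaim(prev);
            throw;
        }
    }

    int main() {
        reclaimable_heap heap;
        std::map<int, std::vector<char>> cache;  // stand-in for bplus::tree
        auto it_and_flag = with_reclaim_disabled(heap, [&] {
            assert(!heap.reclaim_enabled());     // reclamation cannot re-enter here
            return cache.emplace(0, std::vector<char>(4096));
        });
        assert(it_and_flag.second);
    }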
@@ -52,10 +52,11 @@ class chunked_vector {
utils::small_vector<chunk_ptr, 1> _chunks;
size_t _size = 0;
size_t _capacity = 0;
private:
public:
static size_t max_chunk_capacity() {
return std::max(max_contiguous_allocation / sizeof(T), size_t(1));
}
private:
void reserve_for_push_back() {
if (_size == _capacity) {
do_reserve_for_push_back();
@@ -387,7 +388,9 @@ chunked_vector<T, max_contiguous_allocation>::make_room(size_t n, bool stop_afte
auto new_last_chunk_capacity = last_chunk_capacity + capacity_increase;
// FIXME: realloc? maybe not worth the complication; only works for PODs
auto new_last_chunk = new_chunk(new_last_chunk_capacity);
migrate(addr(_capacity - last_chunk_capacity), addr(_size), new_last_chunk.get());
if (_size > _capacity - last_chunk_capacity) {
migrate(addr(_capacity - last_chunk_capacity), addr(_size), new_last_chunk.get());
}
_chunks.back() = std::move(new_last_chunk);
_capacity += capacity_increase;
}
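The make_room() hunk adds a guard around the migration: after reserve(), the last chunk can hold no live elements (_size <= _capacity - last_chunk_capacity), and the source range passed to migrate() would then be inverted. A small self-contained check of the guard condition (growth_state and needs_migration are illustrative stand-ins for the fields in the hunk):

    #include <cassert>
    #include <cstddef>

    struct growth_state {
        size_t size;                 // live elements
        size_t capacity;             // total slots across all chunks
        size_t last_chunk_capacity;  // slots in the chunk being regrown
    };

    // The guard added above: migrate only when the last chunk holds live
    // elements; otherwise addr(_capacity - last_chunk_capacity) > addr(_size).
    bool needs_migration(const growth_state& s) {
        return s.size > s.capacity - s.last_chunk_capacity;
    }

    int main() {
        // reserve() grew capacity ahead of size: the last chunk is still empty.
        assert(!needs_migration({8, 24, 16}));
        // push_back-driven growth: elements 8..19 live in the last chunk.
        assert(needs_migration({20, 24, 16}));
    }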
@@ -597,6 +597,10 @@ static constexpr auto max_used_space_ratio_for_compaction = 0.85;
static constexpr size_t max_used_space_for_compaction = segment_size * max_used_space_ratio_for_compaction;
static constexpr size_t min_free_space_for_compaction = segment_size - max_used_space_for_compaction;

struct [[gnu::packed]] non_lsa_object_cookie {
uint64_t value = 0xbadcaffe;
};

static_assert(min_free_space_for_compaction >= max_managed_object_size,
"Segments which cannot fit max_managed_object_size must not be considered compactible for the sake of forward progress of compaction");

@@ -840,9 +844,13 @@ public:
void clear_allocation_failure_flag() { _allocation_failure_flag = false; }
bool allocation_failure_flag() { return _allocation_failure_flag; }
void refill_emergency_reserve();
void update_non_lsa_memory_in_use(ssize_t n) {
void add_non_lsa_memory_in_use(size_t n) {
_non_lsa_memory_in_use += n;
}
void subtract_non_lsa_memory_in_use(size_t n) {
assert(_non_lsa_memory_in_use >= n);
_non_lsa_memory_in_use -= n;
}
size_t non_lsa_memory_in_use() const {
return _non_lsa_memory_in_use;
}
@@ -1395,6 +1403,8 @@ private:
}

lsa_buffer alloc_buf(size_t buf_size) {
// Note: Can be re-entered from allocation sites below due to memory reclamation which
// invokes segment compaction.
static_assert(segment::size % buf_align == 0);
if (buf_size > segment::size) {
throw_with_backtrace<std::runtime_error>(format("Buffer size {} too large", buf_size));
@@ -1447,6 +1457,7 @@ private:

if (seg != _buf_active) {
if (desc.is_empty()) {
assert(desc._buf_pointers.empty());
_segment_descs.erase(desc);
desc._buf_pointers = std::vector<entangled>();
free_segment(seg, desc);
@@ -1457,7 +1468,7 @@ private:
}
}

void compact_segment_locked(segment* seg, segment_descriptor& desc) {
void compact_segment_locked(segment* seg, segment_descriptor& desc) noexcept {
auto seg_occupancy = desc.occupancy();
llogger.debug("Compacting segment {} from region {}, {}", fmt::ptr(seg), id(), seg_occupancy);

@@ -1472,6 +1483,7 @@ private:
for (entangled& e : _buf_ptrs_for_compact_segment) {
if (e) {
lsa_buffer* old_ptr = e.get(&lsa_buffer::_link);
assert(&desc == old_ptr->_desc);
lsa_buffer dst = alloc_buf(old_ptr->_size);
memcpy(dst._buf, old_ptr->_buf, dst._size);
old_ptr->_link = std::move(dst._link);
@@ -1502,6 +1514,10 @@ private:
std::vector<entangled> ptrs;
ptrs.reserve(segment::size / buf_align);
segment* new_active = new_segment();
if (_buf_active) [[unlikely]] {
// Memory allocation above could allocate active buffer during segment compaction.
close_buf_active();
}
assert((uintptr_t)new_active->at(0) % buf_align == 0);
segment_descriptor& desc = shard_segment_pool.descriptor(new_active);
desc._buf_pointers = std::move(ptrs);
@@ -1635,17 +1651,18 @@ public:
memory::on_alloc_point();
shard_segment_pool.on_memory_allocation(size);
if (size > max_managed_object_size) {
auto ptr = standard_allocator().alloc(migrator, size, alignment);
auto ptr = standard_allocator().alloc(migrator, size + sizeof(non_lsa_object_cookie), alignment);
// This isn't very accurate, the correct free_space value would be
// malloc_usable_size(ptr) - size, but there is no way to get
// the exact object size at free.
auto allocated_size = malloc_usable_size(ptr);
new ((char*)ptr + allocated_size - sizeof(non_lsa_object_cookie)) non_lsa_object_cookie();
_non_lsa_occupancy += occupancy_stats(0, allocated_size);
if (_group) {
_evictable_space += allocated_size;
_group->increase_usage(_heap_handle, allocated_size);
}
shard_segment_pool.update_non_lsa_memory_in_use(allocated_size);
shard_segment_pool.add_non_lsa_memory_in_use(allocated_size);
return ptr;
} else {
auto ptr = alloc_small(object_descriptor(migrator), (segment::size_type) size, alignment);
@@ -1657,12 +1674,14 @@ public:
private:
void on_non_lsa_free(void* obj) noexcept {
auto allocated_size = malloc_usable_size(obj);
auto cookie = (non_lsa_object_cookie*)((char*)obj + allocated_size) - 1;
assert(cookie->value == non_lsa_object_cookie().value);
_non_lsa_occupancy -= occupancy_stats(0, allocated_size);
if (_group) {
_evictable_space -= allocated_size;
_group->decrease_usage(_heap_handle, allocated_size);
}
shard_segment_pool.update_non_lsa_memory_in_use(-allocated_size);
shard_segment_pool.subtract_non_lsa_memory_in_use(allocated_size);
}
public:
virtual void free(void* obj) noexcept override {
@@ -2188,8 +2207,8 @@ private:
auto info_level = _stall_detected ? log_level::info : log_level::debug;
auto MiB = 1024*1024;

timing_logger.log(time_level, "Reclamation cycle took {} ms, trying to release {:.3f} MiB {}preemptibly",
_duration.count(), (float)_memory_to_release / MiB, _preemptible ? "" : "non-");
timing_logger.log(time_level, "Reclamation cycle took {} us, trying to release {:.3f} MiB {}preemptibly",
_duration / 1us, (float)_memory_to_release / MiB, _preemptible ? "" : "non-");
log_if_any(info_level, "reserved segments", _reserve_segments);
if (_memory_released > 0) {
auto bytes_per_second =
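The non_lsa_object_cookie hunks add a trailing canary to oversized (non-LSA) allocations: the allocation is padded by sizeof(non_lsa_object_cookie), the cookie is constructed at the end of the usable size, and on_non_lsa_free() asserts it is intact, catching objects freed through the wrong path or overrun past their end. A standalone sketch of the same pattern (malloc_usable_size is glibc-specific; the names here are illustrative):

    #include <cassert>
    #include <cstdint>
    #include <cstdlib>
    #include <cstring>
    #include <malloc.h>
    #include <new>

    struct [[gnu::packed]] trailing_cookie {
        uint64_t value = 0xbadcaffe;  // same magic as non_lsa_object_cookie
    };

    void* alloc_with_cookie(size_t size) {
        // Over-allocate, then place the cookie at the end of the *usable*
        // allocation, exactly as the hunk does.
        void* ptr = std::malloc(size + sizeof(trailing_cookie));
        size_t usable = malloc_usable_size(ptr);
        new ((char*)ptr + usable - sizeof(trailing_cookie)) trailing_cookie();
        return ptr;
    }

    void free_with_cookie(void* ptr) {
        size_t usable = malloc_usable_size(ptr);
        auto* cookie = (trailing_cookie*)((char*)ptr + usable) - 1;
        assert(cookie->value == trailing_cookie{}.value);  // dies on mismatch
        std::free(ptr);
    }

    int main() {
        void* p = alloc_with_cookie(100);
        std::memset(p, 0xab, 100);  // writing the requested bytes is fine
        free_with_cookie(p);
    }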
@@ -73,6 +73,9 @@ private:
throw std::out_of_range("chunked_managed_vector out of range access");
}
}
chunk_ptr& back_chunk() {
return _chunks[_size / max_chunk_capacity()];
}
static void migrate(T* begin, T* end, managed_vector<T>& result);
public:
using value_type = T;
@@ -119,24 +122,24 @@ public:

void push_back(const T& x) {
reserve_for_push_back();
_chunks.back().emplace_back(x);
back_chunk().emplace_back(x);
++_size;
}
void push_back(T&& x) {
reserve_for_push_back();
_chunks.back().emplace_back(std::move(x));
back_chunk().emplace_back(std::move(x));
++_size;
}
template <typename... Args>
T& emplace_back(Args&&... args) {
reserve_for_push_back();
auto& ret = _chunks.back().emplace_back(std::forward<Args>(args)...);
auto& ret = back_chunk().emplace_back(std::forward<Args>(args)...);
++_size;
return ret;
}
void pop_back() {
--_size;
_chunks.back().pop_back();
back_chunk().pop_back();
}
const T& back() const {
return *addr(_size - 1);
@@ -394,7 +397,9 @@ chunked_managed_vector<T>::make_room(size_t n, bool stop_after_one) {
auto new_last_chunk_capacity = last_chunk_capacity + capacity_increase;
// FIXME: realloc? maybe not worth the complication; only works for PODs
auto new_last_chunk = new_chunk(new_last_chunk_capacity);
migrate(addr(_capacity - last_chunk_capacity), addr(_size), new_last_chunk);
if (_size > _capacity - last_chunk_capacity) {
migrate(addr(_capacity - last_chunk_capacity), addr(_size), new_last_chunk);
}
_chunks.back() = std::move(new_last_chunk);
_capacity += capacity_increase;
}
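The back_chunk() hunks fix appends after reserve(): _chunks.back() is the last allocated chunk, which after a large reserve() may lie beyond the chunk that element number _size actually belongs to, so push_back/emplace_back/pop_back must select the chunk by position. A toy illustration with 4-element chunks (toy_chunked_vector is a hypothetical stand-in):

    #include <cassert>
    #include <cstddef>
    #include <memory>
    #include <vector>

    struct toy_chunked_vector {
        static constexpr size_t max_chunk_capacity = 4;
        std::vector<std::unique_ptr<std::vector<int>>> chunks;
        size_t size = 0;

        // The fixed accessor: chunk chosen by element position.
        std::vector<int>& back_chunk() {
            return *chunks[size / max_chunk_capacity];
        }
        // reserve() can append several empty chunks ahead of any elements.
        void reserve_chunks(size_t n) {
            while (chunks.size() < n) {
                auto c = std::make_unique<std::vector<int>>();
                c->reserve(max_chunk_capacity);
                chunks.push_back(std::move(c));
            }
        }
        void push_back(int x) {
            back_chunk().push_back(x);  // chunks.back() would hit chunks[2]
            ++size;
        }
    };

    int main() {
        toy_chunked_vector v;
        v.reserve_chunks(3);   // room for 12 elements, none live yet
        v.push_back(42);       // must land in chunks[0]
        assert(v.chunks[0]->size() == 1);
        assert(v.chunks[2]->empty());
    }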