Compare commits

...

91 Commits

Author SHA1 Message Date
Botond Dénes
f6c2624c86 Merge '[branch-5.0] - minimal fix for crash caused by empty primary key range in LWT update' from Jan Ciołek
In #13001 we found a test case that causes a crash because `UNSET_VALUE` wasn't handled properly:

```python3
def test_unset_insert_where(cql, table2):
    p = unique_key_int()
    stmt = cql.prepare(f'INSERT INTO {table2} (p, c) VALUES ({p}, ?)')
    with pytest.raises(InvalidRequest, match="unset"):
        cql.execute(stmt, [UNSET_VALUE])

def test_unset_insert_where_lwt(cql, table2):
    p = unique_key_int()
    stmt = cql.prepare(f'INSERT INTO {table2} (p, c) VALUES ({p}, ?) IF NOT EXISTS')
    with pytest.raises(InvalidRequest, match="unset"):
        cql.execute(stmt, [UNSET_VALUE])
```

This PR does an absolutely minimal change to fix the crash.
It adds a check the moment before the crash would happen.

To make sure that everything works correctly, and to detect any possible breaking changes, I wrote a bunch of tests that validate the current behavior.
I also ported some tests from the `master` branch, at least the ones that were in line with the behavior on `branch-5.0`.

The changes are the same as in #13133, just cherry-picked to `branch-5.0`.

Closes #13178

* github.com:scylladb/scylladb:
  cql-pytest/test_unset: port some tests from master branch
  cql-pytest/test_unset: test unset value in UPDATEs with LWT conditions
  cql-pytest/test_unset: test unset value in UPDATEs with IF EXISTS
  cql-pytest/test_unset: test unset value in UPDATE statements
  cql-pytest/test_unset: test unset value in INSERTs with IF NOT EXISTS
  cql-pytest/test_unset: test unset value in INSERT statements
  cas_request: fix crash on unset value in primary key with LWT
2023-05-08 12:03:44 +03:00
Botond Dénes
f7d9afd209 Update seastar submodule
* seastar 07548b37...62fd873d (2):
  > core/on_internal_error: always log error with backtrace
  > on_internal_error: refactor log_error_and_backtrace

Fixes: #13786
2023-05-08 10:41:24 +03:00
Marcin Maliszkiewicz
b011cc2e78 db: view: use deferred_close for closing staging_sstable_reader
When consume_in_thread throws the reader should still be closed.
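In Python terms, the pattern `deferred_close` provides is a try/finally around consumption; the names below are illustrative, not Scylla's API:

```python
# Illustrative sketch: close the reader even when consumption throws,
# which is what deferred_close guarantees. Reader/consume are made-up names.
class Reader:
    def __init__(self):
        self.closed = False

    def close(self):
        self.closed = True

def consume(reader, fn):
    try:
        return fn(reader)
    finally:
        reader.close()  # runs on both the success and the exception path
```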

Related https://github.com/scylladb/scylla-enterprise/issues/2661

Closes #13398
Refs: scylladb/scylla-enterprise#2661
Fixes: #13413

(cherry picked from commit 99f8d7dcbe)
2023-05-08 09:58:46 +03:00
Botond Dénes
fb466dd7b7 readers: evictable_reader: skip progress guarantee when next pos is partition start
The evictable reader must ensure that each buffer fill makes forward
progress, i.e. the last fragment in the buffer has a position larger
than the last fragment from the last buffer-fill. Otherwise, the reader
could get stuck in an infinite loop between buffer fills, if the reader
is evicted in-between.
The code guaranteeing this forward progress has a bug: when the next
expected position is a partition-start (another partition), the code
would loop forever, effectively reading all there is from the underlying
reader.
To avoid this, add a special case to ignore the progress guarantee loop
altogether when the next expected position is a partition start. In this
case, progress is guaranteed anyway, because there is exactly one
partition-start fragment in each partition.
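The special case can be sketched abstractly (hypothetical names; the real logic lives in the C++ evictable reader):

```python
# Sketch: the forward-progress loop is skipped when the next expected
# position is a partition start, because each partition contributes exactly
# one partition-start fragment, so progress is guaranteed anyway.
def needs_progress_loop(next_position_kind):
    return next_position_kind != "partition-start"
```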

Fixes: #13491

Closes #13563

(cherry picked from commit 72003dc35c)
2023-05-02 21:22:23 +03:00
Jan Ciolek
697e090659 cql-pytest/test_unset: port some tests from master branch
I copied cql-pytest tests from the master branch,
at least the ones that were compatible with branch-5.1

Some of them were expecting an InvalidRequest exception
when unset values were present in places that
branch-5.1 allows, so I skipped these tests.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
(cherry picked from commit c75359d664)
2023-04-28 03:25:27 +02:00
Jan Ciolek
2c518f3131 cql-pytest/test_unset: test unset value in UPDATEs with LWT conditions
Test what happens when an UNSET_VALUE is passed to
an UPDATE statement with an LWT condition.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
(cherry picked from commit 24f76f40b7)
2023-04-28 03:25:27 +02:00
Jan Ciolek
e941a5ac34 cql-pytest/test_unset: test unset value in UPDATEs with IF EXISTS
Test what happens when an UNSET_VALUE is passed to
an UPDATE statement with IF EXISTS condition.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
(cherry picked from commit 3f133cfa87)
2023-04-28 03:25:27 +02:00
Jan Ciolek
3a7ce5e8aa cql-pytest/test_unset: test unset value in UPDATE statements
Test what happens when an UNSET_VALUE is passed to
an UPDATE statement.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
(cherry picked from commit d66e23b265)
2023-04-28 03:25:27 +02:00
Jan Ciolek
efa4f312f5 cql-pytest/test_unset: test unset value in INSERTs with IF NOT EXISTS
Add tests which test INSERT statements with IF NOT EXISTS,
when an UNSET_VALUE is passed for some column.
The tests are similar to the previous ones done for simple
INSERTs without IF NOT EXISTS.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
(cherry picked from commit 378e8761b9)
2023-04-28 03:25:27 +02:00
Jan Ciolek
fb4b71ea02 cql-pytest/test_unset: test unset value in INSERT statements
Add some tests which test what happens when an UNSET_VALUE
is passed to an INSERT statement.

Passing it for a partition key column is impossible
because the Python driver doesn't allow it.

Passing it for a clustering key column causes Scylla
to silently ignore the INSERT.

Passing it for a regular or static column
causes this column to remain unchanged,
as expected.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
(cherry picked from commit fc26f6b850)
2023-04-28 03:25:26 +02:00
Jan Ciolek
7387922a29 cas_request: fix crash on unset value in primary key with LWT
Doing an LWT INSERT/UPDATE and passing UNSET_VALUE
for the primary key column used to cause a crash.

This is a minimal fix for this crash.

The crash backtrace pointed to a place where
we tried doing .front() on an empty vector
of primary key ranges.

I added a check that the vector isn't empty.
If it is empty, throw an error
mentioning that it's most likely
caused by an unset value.
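The guard can be sketched in Python terms; the actual fix is C++ code in `cas_request`, and `InvalidRequest`/`first_key_range` below are hypothetical names:

```python
# Illustrative Python sketch of the guard the commit describes; the real
# fix guards a .front() call on an empty vector of primary key ranges.
class InvalidRequest(Exception):
    pass

def first_key_range(ranges):
    # Check emptiness before taking the first element, instead of crashing.
    if not ranges:
        raise InvalidRequest(
            "No primary key ranges; most likely caused by an unset value")
    return ranges[0]
```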

This has been fixed on master,
but the PR that fixed it introduced
breaking changes, which I don't want
to add to branch-5.1.

This fix is absolutely minimal
- it performs the check at the
last moment before a crash.

It's not the prettiest, but it works
and can't introduce breaking changes,
because the new code gets activated
only in cases that would've caused
a crash.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
(cherry picked from commit 7663dc31b8)
2023-04-28 03:25:24 +02:00
Raphael S. Carvalho
cb78c3bf2c replica: Fix undefined behavior in table::generate_and_propagate_view_updates()
This is undefined behavior because the evaluation order of function arguments is unspecified.

With GCC, where evaluation is right-to-left, schema will be moved
once it's forwarded to make_flat_mutation_reader_from_mutations_v2().

The consequence is that memory tracking of mutation_fragment_v2
(tracking only the permit used by view updates), which uses the schema,
can be incorrect. However, it's more likely that Scylla will crash
when estimating memory usage for a row, which accesses schema column
information using schema::column_at(), which in turn asserts that
the requested column really exists.

Fixes #13093.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #13092

(cherry picked from commit 3fae46203d)
2023-04-27 19:59:05 +03:00
Kefu Chai
aeac63a3ee dist/redhat: enforce dependency on %{release} also
* tools/python3 f725ec7...c888f39 (1):
  > dist: redhat: provide only a single version

s/%{version}/%{version}-%{release}/ in `Requires:` sections.

This enforces that Scylla packages have runtime dependencies
on exactly the same release.

Fixes #13222
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>
(cherry picked from commit 7165551fd7)
2023-04-27 19:31:01 +03:00
Nadav Har'El
e7b50fb8d3 test/alternator: increase CQL connection timeout
This patch increases the connection timeout in the get_cql_cluster()
function in test/cql-pytest/run.py. This function is used to test
that Scylla came up, and also test/alternator/run uses it to set
up the authentication - which can only be done through CQL.

The Python driver has 2-second and 5-second default timeouts that should
have been more than enough for everybody (TM), but in #13239 we saw
that in one case it apparently wasn't enough. So to be extra safe,
let's increase the default connection-related timeouts to 60 seconds.

Note this change only affects the Scylla *boot* in the test/*/run
scripts, and it does not affect the actual tests - those have different
code to connect to Scylla (see cql_session() in test/cql-pytest/util.py),
and we already increased the timeouts there in #11289.

Fixes #13239

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #13291

(cherry picked from commit 4fdcee8415)
2023-04-27 19:15:58 +03:00
Benny Halevy
6b21f2a351 utils: clear_gently: do not clear null unique_ptr
Otherwise the null pointer is dereferenced.
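A Python analogue of the guard, with None standing in for a null `unique_ptr` (illustrative only; the real code is the C++ `clear_gently` utility):

```python
# Sketch: skip clearing a null pointer instead of dereferencing it.
def clear_gently(obj):
    if obj is None:
        return  # nothing to clear; dereferencing would crash in C++
    obj.clear()
```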

Add a unit test reproducing the issue
and testing this fix.

Fixes #13636

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 12877ad026)
2023-04-24 17:51:31 +03:00
Petr Gusev
0db8e627a5 removenode: add warning in case of exception
The removenode_abort logic that follows the warning
may throw, in which case information about
the original exception would be lost.

Fixes: #11722
Closes #11735

(cherry picked from commit 40bd9137f8)
2023-04-24 10:02:39 +02:00
Botond Dénes
f1121d2149 Merge 'db: system_keyspace: use microsecond resolution for group0_history range tombstone' from Kamil Braun
In `make_group0_history_state_id_mutation`, when adding a new entry to
the group 0 history table, if the parameter `gc_older_than` is engaged,
we create a range tombstone in the mutation which deletes entries older
than the new one by `gc_older_than`. In particular if
`gc_older_than = 0`, we want to delete all older entries.

There was a subtle bug there: we were using millisecond resolution when
generating the tombstone, while the provided state IDs used microsecond
resolution. On a super fast machine it could happen that we managed to
perform two schema changes in a single millisecond; this happened
sometimes in `group0_test.test_group0_history_clearing_old_entries`
on our new CI/promotion machines, causing the test to fail because the
tombstone didn't clear the entry corresponding to the previous schema
change when performing the next schema change (since they happened in
the same millisecond).

Use microsecond resolution to fix that. The consecutive state IDs used
in group 0 mutations are guaranteed to be strictly monotonic at
microsecond resolution (see `generate_group0_state_id` in
service/raft/raft_group0_client.cc).
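The resolution mismatch can be sketched with made-up timestamps: two state IDs one microsecond apart collapse to the same value at millisecond resolution, so a millisecond-resolution tombstone can miss the older entry.

```python
# Sketch of the bug (illustrative numbers): truncating to milliseconds
# makes two microsecond-distinct state IDs indistinguishable.
def to_millis(timestamp_us):
    return timestamp_us // 1000  # millisecond resolution loses the distinction

t1 = 1_700_000_000_123_456  # microseconds
t2 = t1 + 1                 # a second schema change in the same millisecond
assert to_millis(t1) == to_millis(t2)  # the tombstone cannot separate them
```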

Fixes #13594

Closes #13604

* github.com:scylladb/scylladb:
  db: system_keyspace: use microsecond resolution for group0_history range tombstone
  utils: UUID_gen: accept decimicroseconds in min_time_UUID

(cherry picked from commit 10c1f1dc80)
2023-04-23 16:03:39 +03:00
Beni Peled
a0ca8abe42 release: prepare for 5.0.13 2023-04-23 14:58:03 +03:00
Botond Dénes
8bceac1713 Merge 'Backport 5.0 distributed loader detect highest generation' from Benny Halevy
Backport of 4aa0b16852 to branch-5.0
Merge 'distributed_loader: detect highest generation before populating column families' from Benny Halevy

We should scan all sstables in the table directory and its
subdirectories to determine the highest sstable version and generation
before using it for creating new sstables (via reshard or reshape).

Otherwise, the generations of new sstables created when populating staging (via reshard or reshape) may collide with generations in the base directory, leading to https://github.com/scylladb/scylladb/issues/11789

\Refs https://github.com/scylladb/scylladb/issues/11789
\Fixes https://github.com/scylladb/scylladb/issues/11793

\Closes https://github.com/scylladb/scylladb/pull/11795

Closes #13613

* github.com:scylladb/scylladb:
  Merge 'distributed_loader: detect highest generation before populating column families' from Benny Halevy
  replica: distributed_loader: reindent populate_keyspace
  replica: distributed_loader: coroutinize populate_keyspace
2023-04-21 14:29:04 +03:00
Botond Dénes
6bcc7c6ed5 Merge 'distributed_loader: detect highest generation before populating column families' from Benny Halevy
We should scan all sstables in the table directory and its
subdirectories to determine the highest sstable version and generation
before using it for creating new sstables (via reshard or reshape).

Otherwise, the generations of new sstables created when populating staging (via reshard or reshape) may collide with generations in the base directory, leading to https://github.com/scylladb/scylladb/issues/11789

Refs scylladb/scylladb#11789
Fixes scylladb/scylladb#11793

Closes #11795

* github.com:scylladb/scylladb:
  distributed_loader: populate_column_family: reindent
  distributed_loader: coroutinize populate_column_family
  distributed_loader: table_population_metadata: start: reindent
  distributed_loader: table_population_metadata: coroutinize start_subdir
  distributed_loader: table_population_metadata: start_subdir: reindent
  distributed_loader: pre-load all sstables metadata for table before populating it

(cherry picked from commit 4aa0b16852)
2023-04-21 13:23:56 +03:00
Benny Halevy
67f85875cc replica: distributed_loader: reindent populate_keyspace
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit b3e2204fe6)
2023-04-21 13:23:28 +03:00
Benny Halevy
8b874cd4e4 replica: distributed_loader: coroutinize populate_keyspace
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit a3c1dc8cee)
2023-04-21 13:23:18 +03:00
Botond Dénes
b08c582134 mutation/mutation_compactor: consume_partition_end(): reset _stop
The purpose of `_stop` is to remember whether the consumption of the
last partition was interrupted or whether it was consumed fully. In the
former case, the compactor allows retrieving the compaction state for
the given partition, so that its compaction can be resumed at a later
point in time.
Currently, `_stop` is set to `stop_iteration::yes` whenever the return
value of any of the `consume()` methods is also `stop_iteration::yes`.
Meaning, if the consumption of the partition is interrupted, this is
remembered in `_stop`.
However, a partition whose consumption was interrupted is not always
continued later. Sometimes consumption of a partition is interrupted
because the partition is not interesting and the downstream consumer
wants to stop it. In these cases the compactor should not return an
engaged optional from `detach_state()`, because there is no state to
detach; the state should be thrown away. This was handled incorrectly so
far and is fixed in this patch, by overwriting `_stop` in
`consume_partition_end()` with whatever the downstream consumer returns.
Meaning, if they want to skip the partition, then `_stop` is reset to
`stop_iteration::no` and `detach_state()` will return a disengaged
optional, as it should in this case.
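The fix can be sketched abstractly; the class and names below are illustrative, not the C++ compactor's API:

```python
# Sketch: at partition end, _stop is overwritten with the downstream
# consumer's verdict instead of keeping an earlier "interrupted" flag,
# so a skipped partition leaves no detachable state.
class Compactor:
    def __init__(self):
        self._stop = False
        self._state = "partition-state"

    def consume_partition_end(self, downstream_stop):
        self._stop = downstream_stop  # reset, not OR-ed with earlier value
        return self._stop

    def detach_state(self):
        # Only a partition interrupted to be resumed has state to detach.
        return self._state if self._stop else None
```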

Fixes: #12629

Closes #13365

(cherry picked from commit bae62f899d)
2023-04-18 03:18:25 -04:00
Avi Kivity
41556b5f63 Merge 'Backport "reader_concurrency_semaphore: don't evict inactive readers needlessly" to branch-5.0' from Botond Dénes
The patch doesn't apply cleanly, so a targeted backport PR was necessary.
I also needed to cherry-pick two patches from https://github.com/scylladb/scylladb/pull/13255 that the backported patch depends on. Decided against backporting the entire https://github.com/scylladb/scylladb/pull/13255 as it is quite an intrusive change.

Fixes: https://github.com/scylladb/scylladb/issues/11803

Closes #13517

* github.com:scylladb/scylladb:
  reader_concurrency_semaphore: don't evict inactive readers needlessly
  reader_concurrency_semaphore: add stats to record reason for queueing permits
  reader_concurrency_semaphore: can_admit_read(): also return reason for rejection
  reader_concurrency_semaphore: add set_resources()
2023-04-17 12:26:38 +03:00
Raphael S. Carvalho
23e7e594c0 table: Fix disk-space related metrics
The total disk space used metric incorrectly reports the amount of
disk space ever used, which is wrong. It should report the size of
all sstables in use plus the ones waiting to be deleted.
Live disk space used, by this definition, shouldn't account for the
ones waiting to be deleted,
and live sstable count shouldn't account for sstables waiting to
be deleted.

Fix all that.

Fixes #12717.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit 529a1239a9)
2023-04-16 22:19:05 +03:00
Michał Chojnowski
e6ac13314d locator: token_metadata: get rid of a quadratic behaviour in get_address_ranges()
Some callees of update_pending_ranges use the variant of get_address_ranges()
which builds a hashmap of all <endpoint, owned range> pairs. For
everywhere_topology, the size of this map is quadratic in the number of
endpoints, making it big enough to cause contiguous allocations of tens of MiB
for clusters of realistic size, potentially causing trouble for the
allocator (as seen e.g. in #12724). This deserves a correction.

This patch removes the quadratic variant of get_address_ranges() and replaces
its uses with its linear counterpart.

Refs #10337
Refs #10817
Refs #10836
Refs #10837
Fixes #12724

(cherry picked from commit 9e57b21e0c)
2023-04-16 22:03:04 +03:00
Botond Dénes
382d815459 reader_concurrency_semaphore: don't evict inactive readers needlessly
Inactive readers should only be evicted to free up resources for waiting
readers. Evicting them when waiters are not admitted for any reason
other than resources is wasteful and leads to extra load later on, when
these evicted readers have to be recreated and requeued.
This patch changes the logic on both the registering path and the
admission path to not evict inactive readers unless there are readers
actually waiting on resources.
A unit test is also added, reproducing the overly aggressive eviction
and checking that it doesn't happen anymore.

Fixes: #11803

Closes #13286

(cherry picked from commit bd57471e54)
2023-04-14 05:04:10 -04:00
Botond Dénes
a867b2c0e5 reader_concurrency_semaphore: add stats to record reason for queueing permits
When diagnosing problems, knowing why permits were queued is very
valuable. Record the reason in new stats counters, one for each reason a
permit can be queued.

(cherry picked from commit 7b701ac52e)
2023-04-14 05:04:10 -04:00
Botond Dénes
846edf78c6 reader_concurrency_semaphore: can_admit_read(): also return reason for rejection
So the caller can bump the appropriate counters or log the reason why
the request cannot be admitted.

(cherry picked from commit bb00405818)
2023-04-14 05:04:10 -04:00
Botond Dénes
0ccc07322b reader_concurrency_semaphore: add set_resources()
Allow changing the total or initial resources the semaphore has.
After calling `set_resources()` the semaphore will look as if it
had been created with the specified amount of resources.

(cherry picked from commit ecc7c72acd)
2023-04-14 05:04:10 -04:00
Yaron Kaikov
0b170192a1 release: prepare for 5.0.12 2023-04-10 15:58:57 +03:00
Botond Dénes
fd4b2a3319 db/view/view_update_check: check_needs_view_update_path(): filter out non-member hosts
We currently don't clean up the system_distributed.view_build_status
table after removed nodes. This can cause a false-positive check for
whether view update generation is needed for streaming.
The proper fix is to clean up this table, but that will be more
involved, and even when done, it might not be immediate. So until then,
and to be on the safe side, filter out entries belonging to unknown
hosts from said table.

Fixes: #11905
Refs: #11836

Closes #11860

(cherry picked from commit 84a69b6adb)
2023-03-22 09:14:12 +02:00
Botond Dénes
416929fb2a Update seastar submodule
* seastar d1d40176...07548b37 (1):
  > reactor: re-raise fatal signals

Fixes: #9242
2023-03-22 08:26:32 +02:00
Kamil Braun
9d8d7048eb service: storage_proxy: sequence CDC preimage select with Paxos learn
`paxos_response_handler::learn_decision` was calling
`cdc_service::augment_mutation_call` concurrently with
`storage_proxy::mutate_internal`. `augment_mutation_call` was selecting
rows from the base table in order to create the preimage, while
`mutate_internal` was writing rows to the table. It was therefore
possible for the preimage to observe the update that it accompanied,
which doesn't make any sense, because the preimage is supposed to show
the state before the update.

Fix this by performing the operations sequentially. We can still perform
the CDC mutation write concurrently with the base mutation write.

`cdc_with_lwt_test` was sometimes failing in debug mode due to this bug
and was marked flaky. Unmark it.

Fixes #12098

(cherry picked from commit 1ef113691a)
2023-03-21 17:11:00 +01:00
Takuya ASADA
bae4155ab2 docker: prevent hostname -i failure when server address is specified
On some docker instance configuration, hostname resolution does not
work, so our script will fail on startup because we use hostname -i to
construct cqlshrc.
To prevent the error, we can use --rpc-address or --listen-address
for the address since it should be same.

Fixes #12011

Closes #12115

(cherry picked from commit 642d035067)
2023-03-21 17:54:56 +02:00
Pavel Emelyanov
d6e2a326cf Merge '[backport] reader_concurrency_semaphore:: clear_inactive_reads(): defer evicting to evict() ' from Botond Dénes
This PR backports 2f4a793457 to branch-5.1. Said patch depends on some other patches that are not part of any release yet.

Closes #13224

* github.com:scylladb/scylladb:
  reader_concurrency_semaphore:: clear_inactive_reads(): defer evicting to evict()
  reader_permit: expose operator<<(reader_permit::state)
  reader_permit: add get_state() accessor
2023-03-17 14:15:17 +03:00
Botond Dénes
15645ff40b reader_concurrency_semaphore:: clear_inactive_reads(): defer evicting to evict()
Instead of open-coding the same, in an incomplete way.
clear_inactive_reads() does incomplete eviction in several ways:
* it doesn't decrement _stats.inactive_reads
* it doesn't set the permit to evicted state
* it doesn't cancel the ttl timer (if any)
* it doesn't call the eviction notifier on the permit (if there is one)

The list goes on. We already have an evict() method that does all this
correctly; use that instead of the current badly open-coded alternative.

This patch also enhances the existing test for clear_inactive_reads()
and adds a new one specifically for `stop()` being called while having
inactive reads.

Fixes: #13048

Closes #13049

(cherry picked from commit 2f4a793457)
2023-03-17 14:14:59 +03:00
Botond Dénes
a808fc7172 reader_permit: expose operator<<(reader_permit::state)
(cherry picked from commit ec1c615029)
2023-03-17 14:14:59 +03:00
Botond Dénes
dd260bfa82 reader_permit: add get_state() accessor
(cherry picked from commit 397266f420)
2023-03-17 14:14:59 +03:00
Takuya ASADA
c46935ed5c scylla_raid_setup: fix nonexistant out()
Since branch-5.0 does not have out(), it should be run(capture_output=True)
instead.
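The replacement the commit describes is plain `subprocess.run` with output capture; the wrapper name below is illustrative:

```python
# Sketch of the replacement: standard subprocess.run with
# capture_output=True, rather than a helper out() that branch-5.0 lacks.
import subprocess

def run_capture(cmd):
    result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    return result.stdout.strip()
```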

Closes #13155
2023-03-16 16:43:28 +02:00
Avi Kivity
985d6bc4c2 Merge 'scylla_raid_setup: prevent mount failed for /var/lib/scylla for branch-5.0' from Takuya ASADA
Just like 4a8ed4c, we also need to wait for udev event completion to
create /dev/disk/by-uuid/$UUID for newly formatted disk, to mount the
disk just after formatting.

Also added code to check that the uuid and uuid-based device path are valid.

Fixes #11359

Closes #13127

* github.com:scylladb/scylladb:
  scylla_raid_setup: run uuidpath existance check only after mount failed
  scylla_raid_setup: prevent mount failed for /var/lib/scylla
  scylla_raid_setup: check uuid and device path are valid
2023-03-09 23:04:52 +02:00
Takuya ASADA
7673ff4ae3 scylla_raid_setup: run uuidpath existance check only after mount failed
We added a UUID device file existence check in #11399; we expect the
UUID device file to be created before checking, and we wait for its
creation with "udevadm settle" after "mkfs.xfs".

However, we actually get an error saying the UUID device file is
missing; it probably means "udevadm settle" doesn't guarantee that the
device file is created, under some conditions.

To avoid the error, use var-lib-scylla.mount to wait until the UUID
device file is ready, and run the file existence check only when the
service fails.

Fixes #11617

Closes #11666

(cherry picked from commit a938b009ca)
2023-03-09 22:34:03 +09:00
Takuya ASADA
c441eebf46 scylla_raid_setup: prevent mount failed for /var/lib/scylla
Just like 4a8ed4c, we also need to wait for udev event completion to
create /dev/disk/by-uuid/$UUID for newly formatted disk, to mount the
disk just after formatting.

Fixes #11359

(cherry picked from commit 8835a34ab6)
2023-03-09 22:33:38 +09:00
Takuya ASADA
bf4fa80dd7 scylla_raid_setup: check uuid and device path are valid
Added code to check that the uuid and uuid-based device path are valid.

(cherry picked from commit 40134efee4)
2023-03-09 22:32:38 +09:00
Jan Ciolek
2010231fe9 cql3: preserve binary_operator.order in search_and_replace
There was a bug in `expr::search_and_replace`:
it doesn't preserve the `order` field of binary_operator.

The `order` field is used to mark relations created
using SCYLLA_CLUSTERING_BOUND.
It is a CQL feature used for internal queries inside Scylla.
It means that we should handle the restriction as a raw
clustering bound, not as an expression in the CQL language.

Losing the SCYLLA_CLUSTERING_BOUND marker could cause issues;
the database could end up selecting the wrong clustering ranges.

Fixes: #13055

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>

Closes #13056

(cherry picked from commit aa604bd935)
2023-03-09 12:53:01 +02:00
Takuya ASADA
0a51eb55e3 main: run --version before app_template initialize
Even in an environment that causes errors while initializing Scylla,
"scylla --version" should run without error.
To do so, we need to parse and execute these options before
initializing the Scylla/Seastar classes.

Fixes #11117

Closes #11179

(cherry picked from commit d7dfd0a696)
2023-03-09 12:48:25 +02:00
Avi Kivity
d9c6c6283b Update seastar submodule (tls fixes)
* seastar 9a7ba6d57e...d1d4017679 (2):
  > Merge 'tls: vec_push: handle async errors rather than throwing on_internal_error' from Benny Halevy
Fixes #11252
  > tls: vec_push: handle synchronous error from put
Fixes #11118
2023-03-09 12:45:41 +02:00
Tomasz Grabiec
90a5344261 row_cache: Destroy coroutine under region's allocator
The reason is an alloc-dealloc mismatch of position_in_partition objects
allocated by cursors inside the coroutine object stored in the update
variable in row_cache::do_update().

It is allocated under the cache region, but in case of an exception it
will be destroyed under the standard allocator. If the update is
successful, it will be cleared under the region allocator, so there is
no problem in the normal case.

Fixes #12068

Closes #12233

(cherry picked from commit 992a73a861)
2023-03-08 20:54:06 +02:00
Gleb Natapov
68da667288 lwt: do not destroy capture in upgrade_if_needed lambda since the lambda is used more then once
If on the first call the capture is destroyed the second call may crash.

Fixes: #12958

Message-Id: <Y/sks73Sb35F+PsC@scylladb.com>
(cherry picked from commit 1ce7ad1ee6)
2023-03-08 18:52:22 +02:00
Pavel Emelyanov
9adb1a8fdd azure_snitch: Handle empty zone returned from IMDS
The Azure metadata API may sometimes return an empty zone. If that
happens, shard 0 gets an empty string as its rack, but propagates
UNKNOWN_RACK to other shards.

An empty zone response should be handled regardless.

refs: #12185

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes #12274

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2023-03-02 09:18:04 +03:00
Pavel Emelyanov
7623fe01b7 snitch: Check http response codes to be OK
Several snitch drivers make HTTP requests to get
region/dc/zone/rack/whatever from the cloud provider. They blindly rely
on the response being successful and read the response body to parse the
data they need from it.

That's not nice; add checks that requests finish with HTTP OK statuses.
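The kind of check added can be sketched as follows; the helper name and the plain (status, body) pair are illustrative, as the real code uses Seastar's HTTP client:

```python
# Sketch: refuse to parse cloud metadata unless the HTTP status is OK,
# instead of blindly reading the response body.
def parse_metadata_response(status, body):
    if status != 200:
        raise RuntimeError(f"metadata request failed with HTTP {status}")
    return body.strip()
```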

refs: #12185

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>

Closes #12287

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2023-03-02 09:17:57 +03:00
Botond Dénes
3b0a0c4876 types: unserialize_value for multiprecision_int,bool: don't read uninitialized memory
Check the first fragment before dereferencing it, the fragment might be
empty, in which case move to the next one.
Found by running range scan tests with random schema and random data.
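A Python sketch of the fix (illustrative; the real code walks fragmented buffers in C++):

```python
# Sketch: skip empty fragments before reading the first byte, rather
# than dereferencing a possibly-empty first fragment.
def first_byte(fragments):
    for frag in fragments:
        if frag:  # move past empty fragments instead of reading them
            return frag[0]
    raise ValueError("value has no bytes")
```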

Fixes: #12821
Fixes: #12823
Fixes: #12708

Closes #12824

(cherry picked from commit ef548e654d)
2023-02-23 22:38:39 +02:00
Yaron Kaikov
019d5cde1b release: prepare for 5.0.11 2023-02-23 14:30:57 +02:00
Gleb Natapov
a2e255833a lwt: upgrade stored mutations to the latest schema during prepare
Currently they are upgraded during learn on a replica. There are two
problems with this. First, the column mapping may not exist on a replica
if it missed this particular schema (because it was down, for instance)
and the mapping history is not part of the schema. In this case "Failed
to look up column mapping for schema version" will be thrown. Second,
the LWT request coordinator may not have the schema for the mutation
either (because it was freed from the registry already), and when a
replica tries to retrieve the schema from the coordinator the retrieval
will fail, causing the whole request to fail with "Schema version XXXX
not found".

Both of those problems can be fixed by upgrading stored mutations
during prepare on the node they are stored at. To upgrade the mutation
its column mapping is needed, and it is guaranteed to be present at the
node the mutation is stored at, since it is a prerequisite for storing
the mutation that the corresponding schema is available. After that the
mutation is processed using the latest schema, which will be available
on all nodes.

Fixes #10770

Message-Id: <Y7/ifraPJghCWTsq@scylladb.com>
(cherry picked from commit 15ebd59071)
2023-02-22 21:58:20 +02:00
Tomasz Grabiec
f4aa5cacb1 db: Fix trim_clustering_row_ranges_to() for non-full keys and reverse order
trim_clustering_row_ranges_to() is broken for non-full keys in reverse
mode. It will trim the range to
position_in_partition_view::after_key(full_key) instead of
position_in_partition_view::before_key(key), hence it will include the
key in the resulting range rather than exclude it.

Fixes #12180
Refs #1446

(cherry picked from commit 536c0ab194)
2023-02-22 21:52:59 +02:00
Tomasz Grabiec
8ea9a16f9e types: Fix comparison of frozen sets with empty values
A frozen set can be part of the clustering key, and with compact
storage, the corresponding key component can have an empty value.

Comparison was not prepared for this, the iterator attempts to
deserialize the item count and will fail if the value is empty.

Fixes #12242

(cherry picked from commit 232ce699ab)
2023-02-22 21:44:49 +02:00
Michał Chojnowski
1aa5283a38 utils: config_file: fix handling of workdir,W in the YAML file
Option names given in db/config.cc are handled for the command line by passing
them to boost::program_options, and by YAML by comparing them with YAML
keys.
boost::program_options has logic for understanding the
long_name,short_name syntax, so for a "workdir,W" option both --workdir and -W
worked, as intended. But our YAML config parsing doesn't have this logic
and expected "workdir,W" verbatim, which is obviously not intended. Fix that.
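The intended matching can be sketched as follows (hypothetical helper; the real parsing lives in utils/config_file):

```python
# Sketch: a "long_name,short_name" option spec should match a YAML key
# equal to the long name, mirroring boost::program_options' behavior
# on the command line.
def matches_yaml_key(option_spec, yaml_key):
    long_name = option_spec.split(",", 1)[0]
    return yaml_key == long_name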

Fixes #7478
Fixes #9500
Fixes #11503

Closes #11506

(cherry picked from commit af7ace3926)
2023-02-22 21:33:25 +02:00
Takuya ASADA
2e7b1858ad scylla_coredump_setup: fix coredump timeout settings
We currently configure only TimeoutStartSec, but that is probably not
enough to prevent the coredump timeout, since TimeoutStartSec is the
maximum waiting time for service startup, and there is another directive
that specifies the maximum service running time (RuntimeMaxSec).

To fix the problem, we should specify RuntimeMaxSec and TimeoutSec (which
configures both TimeoutStartSec and TimeoutStopSec).
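
A hedged sketch of such a drop-in (the unit and file names are illustrative, not necessarily what the script writes):

```ini
# /etc/systemd/system/scylla-save-coredump.service.d/timeout.conf (illustrative path)
[Service]
# TimeoutSec sets both TimeoutStartSec and TimeoutStopSec.
TimeoutSec=infinity
# RuntimeMaxSec caps how long the service may run once started;
# without it, a long-running coredump save can still be killed.
RuntimeMaxSec=infinity
```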

Fixes #5430

Closes #12757

(cherry picked from commit bf27fdeaa2)
2023-02-19 21:14:14 +02:00
Avi Kivity
2542b57ddc Merge 'reader_concurrency_semaphore: fix waiter/inactive race' from Botond Dénes
We recently (in 7fbad8de87) made sure all admission paths can trigger the eviction of inactive reads. As reader eviction happens in the background, a mechanism was added to make sure only a single eviction fiber was running at any given time. This mechanism however had a preemption point between stopping the fiber and releasing the evict lock. This gave an opportunity for new waiters or inactive readers to be added without the fiber acting on them. Since it still held the lock, it also prevented other eviction fibers from starting. This could create a situation where the semaphore could admit new reads by evicting inactive ones, yet it still had waiters. Since an empty waitlist is also an admission criterion, once one waiter is wrongly added, many more can accumulate.
This series fixes this by ensuring the lock is released the instant the fiber decides there is no more work to do.
It also fixes the assert failure on recursive eviction and adds detection of the inactive/waiter contradiction.

Fixes: #11923
Refs: #11770

Closes #12026

* github.com:scylladb/scylladb:
  reader_concurrency_semaphore: do_wait_admission(): detect admission-waiter anomaly
  reader_concurrency_semaphore: evict_readers_in_the_background(): eliminate blind spot
  reader_concurrency_semaphore: do_detach_inactive_read(): do a complete detach

(cherry picked from commit 15ee8cfc05)
2023-02-09 11:45:53 +02:00
Botond Dénes
01a9871fc3 reader_concurrency_semaphore: unify admission logic across all paths
The semaphore currently has two admission paths: the
obtain_permit()/with_permit() methods, which admit permits on user
request (the front door), and maybe_admit_waiters(), which admits
permits based on internal events like memory resources being returned
(the back door). The two paths used their own admission conditions,
and naturally this means that they diverged over time. Notably,
maybe_admit_waiters() did not look at inactive readers assuming that if
there are waiters there cannot be inactive readers. This is not true
however since we merged the execution-stage into the semaphore. Waiters
can queue up even when there are inactive reads and thus
maybe_admit_waiters() has to consider evicting some of them to see if
this would allow for admitting new reads.
To avoid such divergence in the future, the admission logic was moved
into a new method can_admit_read() which is now shared between the two
method families. This method now checks for the possibility of evicting
inactive readers as well.
The admission logic was tuned slightly to only consider evicting
inactive readers if there is a real possibility that this will result
in admissions: notably, before this patch, resource availability was
checked before stalls were (used permits == blocked permits), so we
could evict readers even if this couldn't help.
Because now eviction can be started from maybe_admit_waiters(), which is
also downstream from eviction, we added a flag to avoid recursive
evict -> maybe admit -> evict ... loops.
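
The shape of the fix can be modeled in a few lines: one shared admission predicate used by both paths, which also considers evicting inactive reads (all names and resource accounting are illustrative, not the real reader_concurrency_semaphore):

```python
class Semaphore:
    """Toy model: one admission predicate shared by both admission paths."""
    def __init__(self, memory):
        self.memory = memory
        self.inactive = []      # memory held by evictable, inactive reads

    def can_admit_read(self, cost):
        if self.memory >= cost:
            return 'admit'
        if self.memory + sum(self.inactive) >= cost:
            return 'evict_and_admit'   # evicting inactive reads frees room
        return 'wait'

    def obtain_permit(self, cost):          # front door
        return self.can_admit_read(cost)

    def maybe_admit_waiters(self, cost):    # back door
        return self.can_admit_read(cost)    # same predicate: no divergence

s = Semaphore(memory=10)
s.inactive = [20]
assert s.obtain_permit(25) == s.maybe_admit_waiters(25) == 'evict_and_admit'
assert s.obtain_permit(5) == 'admit'
assert s.obtain_permit(100) == 'wait'
```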

Fixes: #11770

Closes #11784

(cherry picked from commit 7fbad8de87)
2023-02-09 11:45:47 +02:00
Beni Peled
6bb7fac8d8 release: prepare for 5.0.10 2023-02-06 14:42:32 +02:00
Botond Dénes
5dff7489b1 sstables: track decompressed buffers
Convert decompressed temporary buffers into tracked buffers just before
returning them to the upper layer. This ensures these buffers are known
to the reader concurrency semaphore and it has an accurate view of the
actual memory consumption of reads.
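
The idea can be sketched in a few lines (Permit/track are illustrative names, not the actual reader_permit API):

```python
class Permit:
    """Minimal stand-in for a reader permit's memory accounting."""
    def __init__(self):
        self.consumed = 0

    def track(self, buf: bytes) -> bytes:
        self.consumed += len(buf)   # the semaphore now sees this memory
        return buf

def decompress(permit: Permit, compressed: bytes) -> bytes:
    raw = compressed * 4            # pretend decompression expands 4x
    return permit.track(raw)        # track before handing the buffer upward

p = Permit()
out = decompress(p, b'abcd')
assert len(out) == 16
assert p.consumed == 16             # accounts for the decompressed size
```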

Fixes: #12448

Closes #12454

(cherry picked from commit c4688563e3)
2023-02-05 19:39:04 +02:00
Tomasz Grabiec
2775b1d136 row_cache: Fix violation of the "oldest version are evicted first" when evicting last dummy
Consider the following MVCC state of a partition:

   v2: ==== <7> [entry2] ==== <9> ===== <last dummy>
   v1: ================================ <last dummy> [entry1]

Where === means a continuous range and --- means a discontinuous range.

After two LRU items are evicted (entry1 and entry2), we will end up with:

   v2: ---------------------- <9> ===== <last dummy>
   v1: ================================ <last dummy> [entry1]

This will cause readers to incorrectly think there are no rows before
entry <9>, because the range is continuous in v1, and continuity of a
snapshot is a union of continuous intervals in all versions. The
cursor will see the interval before <9> as continuous and the reader
will produce no rows.

This is only temporary, because current MVCC merging rules are such
that the flag on the latest entry wins, so we'll end up with this once
v1 is no longer needed:

   v2: ---------------------- <9> ===== <last dummy>

...and the reader will go to sstables to fetch the evicted rows before
entry <9>, as expected.

The bug is in rows_entry::on_evicted(), which treats the last dummy
entry in a special way, and doesn't evict it, and doesn't clear the
continuity by omission.

The situation is not easy to trigger because it requires certain
eviction pattern concurrent with multiple reads of the same partition
in different versions, so across memtable flushes.

Closes #12452

(cherry picked from commit f97268d8f2)

Fixes #12451.
2023-02-05 19:39:04 +02:00
Botond Dénes
2ae5675c0f types: is_tuple(): handle reverse types
Currently reverse types match the default case (false), even though they
might be wrapping a tuple type. One user-visible effect of this is that
a schema, which has a reversed<frozen<UDT>> clustering key component,
will have this component incorrectly represented in the schema cql dump:
the UDT will lose the frozen attribute. When attempting to recreate
this schema based on the dump, it will fail, as only frozen UDTs are
allowed in primary key components.
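
A minimal model of the fix (class names illustrative — the point is unwrapping the reversed wrapper before classifying):

```python
class Type:
    pass

class TupleType(Type):
    pass

class ReversedType(Type):
    def __init__(self, underlying):
        self.underlying = underlying

def is_tuple(t: Type) -> bool:
    # Unwrap reversed types before classifying, instead of falling
    # through to the default `False`.
    while isinstance(t, ReversedType):
        t = t.underlying
    return isinstance(t, TupleType)

assert is_tuple(TupleType())
assert is_tuple(ReversedType(TupleType()))   # previously reported False
assert not is_tuple(ReversedType(Type()))
```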

Fixes: #12576

Closes #12579

(cherry picked from commit ebc100f74f)
2023-02-05 19:39:04 +02:00
Calle Wilund
d507ad9424 alternator::streams: Sort tables in list_streams to ensure no duplicates
Fixes #12601 (maybe?)

Sort the set of tables on ID. This should ensure we never
generate duplicates in a paged listing here. Can obviously miss things if they
are added between paged calls and end up with a "smaller" UUID/ARN, but that
is to be expected.
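
The pagination invariant can be sketched like this (a toy listing keyed by table id; shapes are illustrative, not the Alternator API):

```python
import uuid

def list_page(tables, start_id, limit):
    """Return one page of a listing ordered by table id, plus a cursor.
    `tables` maps id -> name; order is stable across pages."""
    ordered = sorted(tables)                    # stable order across pages
    if start_id is not None:
        ordered = [i for i in ordered if i > start_id]
    page = ordered[:limit]
    cursor = page[-1] if len(page) == limit else None
    return page, cursor

ids = sorted(uuid.uuid4() for _ in range(5))
tables = {i: f't{n}' for n, i in enumerate(ids)}
page1, cursor = list_page(tables, None, 3)
page2, _ = list_page(tables, cursor, 3)
assert not set(page1) & set(page2)   # no duplicates across pages
assert page1 + page2 == ids          # nothing already present is missed
```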

(cherry picked from commit da8adb4d26)
2023-02-05 19:39:00 +02:00
Benny Halevy
413af945c0 view: row_lock: lock_ck: find or construct row_lock under partition lock
Since we're potentially searching the row_lock in parallel to acquiring
the read_lock on the partition, we're racing with row_locker::unlock
that may erase the _row_locks entry for the same clustering key, since
there is no lock to protect it up until the partition lock has been
acquired and the lock_partition future is resolved.

This change moves the code to search for or allocate the row lock
_after_ the partition lock has been acquired to make sure we're
synchronously starting the read/write lock function on it, without
yielding, to prevent this use-after-free.

This adds an allocation for copying the clustering key in advance
even if a row_lock entry already exists, which wasn't needed before.
It only slows us down (a bit) when there is contention and the lock
already exists when we go to lock it. In the fast path there
is no contention, and the code already had to create the lock
and copy the key. In any case, the penalty of copying the key once
is tiny compared to the rest of the work that view updates are doing.

This is required on top of 5007ded2c1 as
seen in https://github.com/scylladb/scylladb/issues/12632
which is closely related to #12168 but demonstrates a different race
causing use-after-free.

Fixes #12632

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 4b5e324ecb)
2023-02-05 17:38:49 +02:00
Kefu Chai
9a71680dc7 cql3/selection: construct string_view using char* not size
before this change, we construct an sstring from a comma expression,
which evaluates to the return value of `name.size()`, but what we
expect is `sstring(const char*, size_t)`.

in this change

* instead of passing the size of the string_view,
  both its address and size are used
* `std::string_view` is constructed instead of sstring, for better
  performance, as we don't need to perform a deep copy

the issue is reported by GCC-13:

```
In file included from cql3/selection/selectable.cc:11:
cql3/selection/field_selector.hh:83:60: error: ignoring return value of function declared with 'nodiscard' attribute [-Werror,-Wunused-result]
        auto sname = sstring(reinterpret_cast<const char*>(name.begin(), name.size()));
                                                           ^~~~~~~~~~
```

Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes #12666

(cherry picked from commit 186ceea009)

Fixes #12739.

(cherry picked from commit b588b19620)
2023-02-05 13:51:32 +02:00
Botond Dénes
94b8baa797 Revert "reader_concurrency_semaphore: unify admission logic across all paths"
This reverts commit 0e388d2140.

This patch is suspected to be the cause of read timeouts.
Refs: #12435
2023-01-11 07:09:17 +02:00
Botond Dénes
e372a5fe0a Revert "Merge 'reader_concurrency_semaphore: fix waiter/inactive race' from Botond Dénes"
This reverts commit bf92c2b44c.

This patch is suspected to be the cause of read timeouts.
Refs: #12435
2023-01-11 07:08:16 +02:00
Asias He
692e5ed175 gossip: Improve get_live_token_owners and get_unreachable_token_owners
The get_live_token_owners returns the nodes that are part of the ring
and live.

The get_unreachable_token_owners returns the nodes that are part of the ring
and are not alive.

The token_metadata::get_all_endpoints returns nodes that are part of the
ring.

The patch changes both functions to use the more authoritative source to
get the nodes that are part of the ring, and to call is_alive to check if
each node is up or down, so that correctness does not depend on
any derived information.

This patch fixes a truncate issue in storage_proxy::truncate_blocking
where it calls get_live_token_owners and get_unreachable_token_owners to
decide the nodes to talk with for truncate operation. The truncate
failed because incorrect nodes were returned.
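
A minimal sketch of the principle — derive both sets from the same authoritative ring membership plus a liveness check (names illustrative):

```python
def get_live_token_owners(ring_members, is_alive):
    return {n for n in ring_members if is_alive(n)}

def get_unreachable_token_owners(ring_members, is_alive):
    return {n for n in ring_members if not is_alive(n)}

ring = {'n1', 'n2', 'n3'}
alive = {'n1', 'n3'}
live = get_live_token_owners(ring, alive.__contains__)
down = get_unreachable_token_owners(ring, alive.__contains__)
assert live == {'n1', 'n3'} and down == {'n2'}
assert live | down == ring   # the two sets always partition the ring
```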

Fixes #10296
Fixes #11928

Closes #11952

(cherry picked from commit 16bd9ec8b1)
2023-01-09 16:55:38 +02:00
Michał Chojnowski
5a299f65ff configure: don't reduce parsers' optimization level to 1 in release
The line modified in this patch was supposed to increase the
optimization levels of parsers in debug mode to 1, because they
were too slow otherwise. But as a side effect, it also reduced the
optimization level in release mode to 1. This is not a problem
for the CQL frontend, because statement preparation is not
performance-sensitive, but it is a serious performance problem
for Alternator, where it lies in the hot path.

Fix this by applying the -O1 only to debug modes.

Fixes #12463

Closes #12460

(cherry picked from commit 08b3a9c786)
2023-01-08 01:35:15 +02:00
Botond Dénes
f4ae2fa5f9 Merge 'Branch 5.0: backport 'range_tombstone_change_generator: flush: emit closing range_tombstone_change'' from Benny Halevy
This series backports 0a3aba36e6 to branch 5.0.

It ensures that a closing range_tombstone_change is emitted if the highest tombstone is open ended
since range_tombstone_change_generator::flush does not do it by default.

With the additional testing added in 9a59e9369b87b1bcefed6d1d5edf25c5d3451bc4, unit tests fail without the additional patches in the series, so it exposes a latent bug in the branch where the closing range_tombstone_change is not always emitted when flushing on end of partition or end of position range.

One additional change was required for unit tests to pass:
```diff
diff --git a/range_tombstone_change_generator.hh b/range_tombstone_change_generator.hh
index 6f98be5dce..9cde8d9b20 100644
--- a/range_tombstone_change_generator.hh
+++ b/range_tombstone_change_generator.hh
@@ -78,6 +78,7 @@ class range_tombstone_change_generator {
     template<RangeTombstoneChangeConsumer C>
     void flush(const position_in_partition_view upper_bound, C consumer) {
         if (_range_tombstones.empty()) {
+            _lower_bound = upper_bound;
             return;
         }

```

Refs https://github.com/scylladb/scylla/issues/10316

Closes #10969

* github.com:scylladb/scylladb:
  reader: upgrading_consumer: let range_tombstone_change_generator emit last closing change
  range_tombstone_change_generator: flush: emit end_position when upper limit is after all clustered rows
  range_tombstone_change_generator: flush: use tri_compare rather than less
  range_tombstone_change_generator: flush: return early if empty
2023-01-04 12:52:01 +02:00
Nadav Har'El
07c20bdfea materialized view: fix bug in some large modifications to base partitions
Sometimes a single modification to a base partition requires updates to
a large number of view rows. A common example is deletion of a base
partition containing many rows. A large BATCH is also possible.

To avoid large allocations, we split the large amount of work into
batch of 100 (max_rows_for_view_updates) rows each. The existing code
assumed an empty result from one of these batches meant that we are
done. But this assumption was incorrect: There are several cases when
a base-table update may not need a view update to be generated (see
can_skip_view_updates()) so if all 100 rows in a batch were skipped,
the view update stopped prematurely. This patch includes two tests
showing when this bug can happen - one test using a partition deletion
with a USING TIMESTAMP causing the deletion to not affect the first
100 rows, and a second test using a specially-crafted large BATCH.
These use cases are fairly esoteric, but in fact hit a user in the
wild, which led to the discovery of this bug.

The fix is fairly simple: to detect when build_some() is done, it is no
longer enough to check whether it returned zero view-update rows; rather,
it explicitly returns whether or not it is done as an std::optional.

The patch includes several tests for this bug, which pass on Cassandra,
failed on Scylla before this patch, and pass with this patch.
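
The fixed stop condition can be sketched as a paging loop that carries an explicit done signal rather than inferring it from an empty batch (helper shapes are hypothetical; the real build_some() signals this via an std::optional):

```python
def build_view_updates(rows, batch_size=100, can_skip=lambda r: False):
    """Yield (updates, done) per batch; 'done' is explicit, so a batch in
    which every row was skipped no longer ends the build prematurely."""
    for start in range(0, len(rows), batch_size):
        batch = rows[start:start + batch_size]
        updates = [r for r in batch if not can_skip(r)]
        done = start + batch_size >= len(rows)
        yield updates, done
        if done:
            break

rows = list(range(250))
# Every row in the first batch is skippable; the build must still continue.
skip_first_100 = lambda r: r < 100
produced = []
for updates, done in build_view_updates(rows, 100, skip_first_100):
    produced.extend(updates)
assert produced == list(range(100, 250))  # rows after the skipped batch survive
```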

Fixes #12297.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #12305

(cherry picked from commit 92d03be37b)
2023-01-04 11:36:39 +02:00
Botond Dénes
8a36c4be54 evictable_reader: avoid preemption pitfall around waiting for readmission
Permits have to wait for re-admission after having been evicted. This
happens via `reader_permit::maybe_wait_readmission()`. The user of this
method -- the evictable reader -- uses it to re-wait admission when the
underlying reader was evicted. There is one tricky scenario however,
when the underlying reader is created for the first time. When the
evictable reader is part of a multishard query stack, the created reader
might in fact be a resumed, saved one. These readers are kept in an
inactive state until actually resumed. The evictable reader shares its
permit with the to-be-resumed reader so it can check whether it has been
evicted while saved and needs to wait for readmission before being resumed.
In this flow it is critical that there is no preemption point between
this check and actually resuming the reader, because if there is, the
reader might end up actually recreated, without having waited for
readmission first.
To help avoid this situation, the existing `maybe_wait_readmission()` is
split into two methods:
* `bool reader_permit::needs_readmission()`
* `future<> reader_permit::wait_for_readmission()`

The evictable reader can now ensure there is no preemption point between
`needs_readmission()` and resuming the reader.
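
The split can be modeled with asyncio (a toy with illustrative names; the real code uses Seastar futures):

```python
import asyncio

class Permit:
    """Minimal stand-in; method names follow the commit message."""
    def __init__(self, evicted):
        self._evicted = evicted

    def needs_readmission(self) -> bool:   # synchronous: no preemption point
        return self._evicted

    async def wait_for_readmission(self):
        await asyncio.sleep(0)             # stand-in for queueing on the semaphore
        self._evicted = False

async def resume_reader(permit, saved_reader):
    if permit.needs_readmission():         # cheap check, no await
        await permit.wait_for_readmission()
    # From here to the actual resume there is no await, so the saved
    # reader cannot be dropped and recreated behind our back.
    return f'resumed:{saved_reader}'

assert asyncio.run(resume_reader(Permit(evicted=True), 'r1')) == 'resumed:r1'
```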

Fixes: #10187

Tests: unit(release)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20220315105851.170364-1-bdenes@scylladb.com>
(cherry picked from commit 61028ad718)
2023-01-04 11:20:28 +02:00
Avi Kivity
bf92c2b44c Merge 'reader_concurrency_semaphore: fix waiter/inactive race' from Botond Dénes
We recently (in 7fbad8de87) made sure all admission paths can trigger the eviction of inactive reads. As reader eviction happens in the background, a mechanism was added to make sure only a single eviction fiber was running at any given time. This mechanism however had a preemption point between stopping the fiber and releasing the evict lock. This gave an opportunity for new waiters or inactive readers to be added without the fiber acting on them. Since it still held the lock, it also prevented other eviction fibers from starting. This could create a situation where the semaphore could admit new reads by evicting inactive ones, yet it still had waiters. Since an empty waitlist is also an admission criterion, once one waiter is wrongly added, many more can accumulate.
This series fixes this by ensuring the lock is released the instant the fiber decides there is no more work to do.
It also fixes the assert failure on recursive eviction and adds detection of the inactive/waiter contradiction.

Fixes: #11923
Refs: #11770

Closes #12026

* github.com:scylladb/scylladb:
  reader_concurrency_semaphore: do_wait_admission(): detect admission-waiter anomaly
  reader_concurrency_semaphore: evict_readers_in_the_background(): eliminate blind spot
  reader_concurrency_semaphore: do_detach_inactive_read(): do a complete detach

(cherry picked from commit 15ee8cfc05)
2023-01-03 16:46:44 +02:00
Botond Dénes
0e388d2140 reader_concurrency_semaphore: unify admission logic across all paths
The semaphore currently has two admission paths: the
obtain_permit()/with_permit() methods, which admit permits on user
request (the front door), and maybe_admit_waiters(), which admits
permits based on internal events like memory resources being returned
(the back door). The two paths used their own admission conditions,
and naturally this means that they diverged over time. Notably,
maybe_admit_waiters() did not look at inactive readers assuming that if
there are waiters there cannot be inactive readers. This is not true
however since we merged the execution-stage into the semaphore. Waiters
can queue up even when there are inactive reads and thus
maybe_admit_waiters() has to consider evicting some of them to see if
this would allow for admitting new reads.
To avoid such divergence in the future, the admission logic was moved
into a new method can_admit_read() which is now shared between the two
method families. This method now checks for the possibility of evicting
inactive readers as well.
The admission logic was tuned slightly to only consider evicting
inactive readers if there is a real possibility that this will result
in admissions: notably, before this patch, resource availability was
checked before stalls were (used permits == blocked permits), so we
could evict readers even if this couldn't help.
Because now eviction can be started from maybe_admit_waiters(), which is
also downstream from eviction, we added a flag to avoid recursive
evict -> maybe admit -> evict ... loops.

Fixes: #11770

Closes #11784

(cherry picked from commit 7fbad8de87)
2023-01-03 16:46:30 +02:00
Botond Dénes
288eb9d231 Merge 'Backport 5.0: cleanup compaction: flush memtable' from Benny Halevy
This is a backport of 9fa1783892 (#11902) to branch-5.0.

Flush the memtable before cleaning up the table so as not to leave any disowned tokens in the memtable,
as they might be resurrected if left there.

Refs #1239

Closes #12415

* github.com:scylladb/scylladb:
  table: perform_cleanup_compaction: flush memtable
  table: add perform_cleanup_compaction
  api: storage_service: add logging for compaction operations et al
2023-01-03 12:23:03 +02:00
Benny Halevy
9219a59802 table: perform_cleanup_compaction: flush memtable
We don't explicitly cleanup the memtable, while
it might hold tokens disowned by the current node.

Flush the memtable before performing cleanup compaction
to make sure all tokens in the memtable are cleaned up.

Note that non-owned ranges are invalidated in the cache
in compaction_group::update_main_sstable_list_on_compaction_completion
using desc.ranges_for_cache_invalidation.

Fixes #1239

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from eb3a94e2bc)
2022-12-29 09:36:37 +02:00
Benny Halevy
f9cea4dc51 table: add perform_cleanup_compaction
Move the integration with compaction_manager
from the api layer to the table class so
it can also make sure the memtable is cleaned up in the next patch.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from fc278be6c4)
2022-12-29 09:36:37 +02:00
Benny Halevy
081b2b76cc api: storage_service: add logging for compaction operations et al
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from 85523c45c0)
2022-12-29 09:36:20 +02:00
Anna Mikhlin
dfb229a18a release: prepare for 5.0.9 2022-12-29 09:25:47 +02:00
Takuya ASADA
60da855c2d scylla_setup: fix incorrect type definition on --online-discard option
The --online-discard option is defined as a string parameter since it doesn't
specify "action=", but it has a boolean default value (default=True).
This breaks "provisioning in a similar environment", since the code
assumed the boolean value came from "action='store_true'", but it doesn't.

We should change the type of the option to int, and also specify
"choices=[0, 1]" just like --io-setup does.
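
A sketch of the fixed declaration in argparse terms (option wiring simplified from the real scylla_setup script; the default shown is an assumption):

```python
import argparse

parser = argparse.ArgumentParser()
# An explicit int with constrained choices, mirroring how --io-setup is
# declared, instead of a plain string option with a boolean default.
parser.add_argument('--online-discard', type=int, choices=[0, 1], default=1)

assert parser.parse_args([]).online_discard == 1
assert parser.parse_args(['--online-discard', '0']).online_discard == 0
```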

Fixes #11700

Closes #11831

(cherry picked from commit acc408c976)
2022-12-28 20:44:12 +02:00
Benny Halevy
1718861e94 main: shutdown: do not abort on storage_io_error
Do not abort in defer_verbose_shutdown if the callback
throws storage_io_error, similar and in addition to
the system errors handling that was added in
132c9d5933

As seen in https://github.com/scylladb/scylla/issues/9573#issuecomment-1148238291

Fixes #9573

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #10740

(cherry picked from commit 1daa7820c9)
2022-12-28 19:29:17 +02:00
Petr Gusev
e03e9b1abe cql: batch statement, inserting a row with a null key column should be forbidden
Regular INSERT statements with null values for primary key
components have been rejected by Scylla since #9286 and #9314.
Batch statements were missing a similar check; this patch
fixes it.
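
The check can be sketched like this (statement and column shapes are illustrative, not Scylla's CQL internals):

```python
def check_batch(statements, pk_columns):
    """Reject any statement in a batch that binds NULL (None) to a primary
    key column -- the same check single INSERTs already perform."""
    for i, bound in enumerate(statements):
        for col in pk_columns:
            if bound.get(col) is None:
                raise ValueError(
                    f'Invalid null value for key column {col} '
                    f'in batch statement {i}')

check_batch([{'p': 1, 'c': 2}, {'p': 3, 'c': 4}], ['p', 'c'])  # passes
try:
    check_batch([{'p': 1, 'c': None}], ['p', 'c'])
    raised = False
except ValueError:
    raised = True
assert raised
```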

Fixes: #12060
(cherry picked from commit 7730c4718e)
2022-12-28 18:15:54 +02:00
Benny Halevy
26c51025c1 reader: upgrading_consumer: let range_tombstone_change_generator emit last closing change
When flushing range tombstones up to
position_in_partition::after_all_clustered_rows(),
the range_tombstone_change_generator now emits
the closing range_tombstone_change, so there's
no need for the upgrading_consumer to do so too.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 002be743f6)
2022-12-28 16:23:11 +02:00
Benny Halevy
5c39a4524a range_tombstone_change_generator: flush: emit end_position when upper limit is after all clustered rows
When the highest tombstone is open ended, we must
emit a closing range_tombstone_change at
position_in_partition::after_all_clustered_rows().

Since all consumers need to do it, implement the logic
in the range_tombstone_change_generator itself.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit cd171f309c)
2022-12-28 16:23:11 +02:00
Benny Halevy
9823e8d9c5 range_tombstone_change_generator: flush: use tri_compare rather than less
less is already using tri_compare internally,
and we'll use tri_compare for equality in the next patch.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 2c5a6b3894)
2022-12-28 16:23:11 +02:00
Benny Halevy
b48c9cae95 range_tombstone_change_generator: flush: return early if empty
Optimize the common, empty case.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 18a80a98b8)
(added _lower_bound = upper_bound on early return)
2022-12-28 16:23:11 +02:00
Nadav Har'El
14077d2def murmur3: fix inconsistent token for empty partition key
Traditionally in Scylla and in Cassandra, an empty partition key is mapped
to minimum_token() instead of the empty key's usual hash function (0).
The reasons for this are unknown (to me), but one possibility is that
having one known key that maps to the minimal token is useful for
various iterations.

In murmur3_partitioner.cc we have two variants of the token calculation
function - the first is get_token(bytes_view) and the second is
get_token(schema, partition_key_view). The first includes that empty-
key special case, but the second was missing this special case!

As Kamil first noted in #9352, the second variant is used when looking
up partitions in the index file - so if a partition with an empty-string
key is saved under one token, it will be looked up under a different
token and not found. I reproduced exactly this problem when fixing
issues #9364 and #9375 (empty-string keys in materialized views and
indexes) - where a partition with an empty key was visible in a
full-table scan but couldn't be found by looking up its key because of
the wrong index lookup.

I also tried an alternative fix - changing both implementations to return
minimum_token (and not 0) for the empty key. But this is undesirable -
minimum_token is not supposed to be a valid token, so the tokenizer and
sharder may not return a valid replica or shard for it, so we shouldn't
store data under such a token. We also have code (such as an
increasing-key sanity check in the flat mutation reader) which assumes that
no real key in the data can be minimum_token, and our plan is to start
allowing data with an empty key (at least for materialized views).

This patch does not risk backward-incompatible disk format changes
for two reasons:

1. In the current Scylla, there was no valid case where an empty partition
   key may appear. CQL and Thrift forbid such keys, and materialized-views
   and indexes also (incorrectly - see #9364, #9375) drop such rows.
2. Although Cassandra *does* allow empty partition keys, they are only
   allowed in materialized views and indexes - and we don't support reading
   materialized views generated by Cassandra (the user must re-generate
   them in Scylla).

When #9364 and #9375 are fixed by the next patch, empty partition keys
will start appearing in Scylla (in materialized views and in the
materialized view backing a secondary index), and this fix will become
important.
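
A toy model of the invariant the fix restores — both token-computation variants agree on every key, including the empty one, which gets its ordinary hash (0) rather than a special-cased minimum token (the hash below is a stand-in, not murmur3):

```python
def hash_bytes(key: bytes) -> int:
    # Stand-in for murmur3; like the real thing, the empty key hashes to 0.
    return sum((b + 1) * i for i, b in enumerate(key, 1))

def serialize(pk_components) -> bytes:
    # Toy serialization of a partition key's components.
    return b''.join(pk_components)

def token_from_bytes(key: bytes) -> int:
    return hash_bytes(key)

def token_from_partition_key(pk_components) -> int:
    # Must agree with token_from_bytes for every key, empty included,
    # or index lookups use a different token than the write path.
    return hash_bytes(serialize(pk_components))

assert token_from_bytes(b'') == token_from_partition_key([]) == 0
assert token_from_bytes(b'ab') == token_from_partition_key([b'ab'])
```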

Fixes #9352
Refs #9364
Refs #9375

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
(cherry picked from commit bc4d0fd5ad)
2022-12-28 15:24:53 +02:00
Piotr Grabowski
25508705a8 type_json: fix wrong blob JSON validation
Fixes a wrong condition for validating whether a JSON string representing
a blob value is valid. Previously, strings such as "6" or "0392fa" would
pass the validation, even though they are too short or don't start with
"0x". Add those test cases to json_cql_query_test.cc.
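
The tightened check can be sketched like this (whether a bare "0x" counts as a valid empty blob is an assumption of this sketch):

```python
import string

def is_valid_json_blob(s: str) -> bool:
    """A JSON string encoding a blob: '0x' followed by an even number of
    hex digits."""
    return (len(s) >= 2 and len(s) % 2 == 0
            and s.startswith('0x')
            and all(c in string.hexdigits for c in s[2:]))

assert is_valid_json_blob('0x6a')
assert is_valid_json_blob('0x')          # empty blob (assumed valid here)
assert not is_valid_json_blob('6')       # too short, no prefix
assert not is_valid_json_blob('0392fa')  # missing '0x' prefix
assert not is_valid_json_blob('0x6aZ')   # odd length and non-hex digit
```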

Fixes #10114

(cherry picked from commit f8b67c9bd1)
2022-12-28 15:17:31 +02:00
Botond Dénes
347da028e9 mutation_compactor: reset stop flag on page start
When the mutation compactor has all the rows it needs for a page, it
saves the decision to stop in a member flag: _stop.
For single partition queries, the mutation compactor is kept alive
across pages, and so it has a method, start_new_page(), to reset its state
for the next page. This method didn't clear the _stop flag. This meant
that the value set at the end of the previous page could cause the new page,
and subsequently the entire query, to be stopped prematurely.
This can happen if the new page starts with a row that is covered by a
higher level tombstone and is completely empty after compaction.
Reset the _stop flag in start_new_page() to prevent this.

This commit also adds a unit test which reproduces the bug.
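
The bug and its fix can be modeled with a toy paged compactor (class and method names only loosely mirror the real mutation_compactor):

```python
class Compactor:
    """Toy paged compactor kept alive across the pages of one query."""
    def __init__(self, page_limit):
        self.page_limit = page_limit
        self._stop = False

    def start_new_page(self):
        self._stop = False          # the missing reset

    def consume_rows(self, rows):
        out = []
        for row in rows:
            if self._stop:
                break
            if row is not None:     # None: row compacted away by a tombstone
                out.append(row)
            if len(out) >= self.page_limit:
                self._stop = True
        return out

c = Compactor(page_limit=2)
assert c.consume_rows([1, 2, 3]) == [1, 2]   # page filled, _stop set
c.start_new_page()
# The next page starts with a row shadowed by a tombstone; without the
# reset, the stale _stop would end the query here with an empty page.
assert c.consume_rows([None, 4, 5]) == [4, 5]
```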

Fixes: #12361

Closes #12384

(cherry picked from commit b0d95948e1)
2022-12-25 09:45:50 +02:00
71 changed files with 2227 additions and 371 deletions

View File

@@ -60,7 +60,7 @@ fi
# Default scylla product/version tags
PRODUCT=scylla
VERSION=5.0.8
VERSION=5.0.13
if test -f version
then

View File

@@ -143,19 +143,24 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
auto table = find_table(_proxy, request);
auto db = _proxy.data_dictionary();
auto cfs = db.get_tables();
auto i = cfs.begin();
auto e = cfs.end();
if (limit < 1) {
throw api_error::validation("Limit must be 1 or more");
}
// TODO: the unordered_map here is not really well suited for partial
// querying - we're sorting on local hash order, and creating a table
// between queries may or may not miss info. But that should be rare,
// and we can probably expect this to be a single call.
// # 12601 (maybe?) - sort the set of tables on ID. This should ensure we never
// generate duplicates in a paged listing here. Can obviously miss things if they
// are added between paged calls and end up with a "smaller" UUID/ARN, but that
// is to be expected.
std::sort(cfs.begin(), cfs.end(), [](const data_dictionary::table& t1, const data_dictionary::table& t2) {
return t1.schema()->id() < t2.schema()->id();
});
auto i = cfs.begin();
auto e = cfs.end();
if (streams_start) {
i = std::find_if(i, e, [&](data_dictionary::table t) {
i = std::find_if(i, e, [&](const data_dictionary::table& t) {
return t.schema()->id() == streams_start
&& cdc::get_base_table(db.real_database(), *t.schema())
&& is_alternator_keyspace(t.schema()->ks_name())

View File

@@ -593,6 +593,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
if (column_families.empty()) {
column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
}
apilog.debug("force_keyspace_compaction: keyspace={} tables={}", keyspace, column_families);
return ctx.db.invoke_on_all([keyspace, column_families] (replica::database& db) -> future<> {
auto table_ids = boost::copy_range<std::vector<utils::UUID>>(column_families | boost::adaptors::transformed([&] (auto& cf_name) {
return db.find_uuid(keyspace, cf_name);
@@ -617,6 +618,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
if (column_families.empty()) {
column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
}
apilog.info("force_keyspace_cleanup: keyspace={} tables={}", keyspace, column_families);
return ss.local().is_cleanup_allowed(keyspace).then([&ctx, keyspace,
column_families = std::move(column_families)] (bool is_cleanup_allowed) mutable {
if (!is_cleanup_allowed) {
@@ -635,7 +637,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
// as a table can be dropped during loop below, let's find it before issuing the cleanup request.
for (auto& id : table_ids) {
replica::table& t = db.find_column_family(id);
co_await cm.perform_cleanup(db, &t);
co_await t.perform_cleanup_compaction(db);
}
co_return;
}).then([]{
@@ -645,6 +647,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
ss::perform_keyspace_offstrategy_compaction.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> tables) -> future<json::json_return_type> {
apilog.info("perform_keyspace_offstrategy_compaction: keyspace={} tables={}", keyspace, tables);
co_return co_await ctx.db.map_reduce0([&keyspace, &tables] (replica::database& db) -> future<bool> {
bool needed = false;
for (const auto& table : tables) {
@@ -658,6 +661,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::upgrade_sstables.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);
apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, column_families, exclude_current_version);
return ctx.db.invoke_on_all([=] (replica::database& db) {
return do_for_each(column_families, [=, &db](sstring cfname) {
auto& cm = db.get_compaction_manager();
@@ -672,6 +676,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::force_keyspace_flush.set(r, [&ctx](std::unique_ptr<request> req) -> future<json::json_return_type> {
auto keyspace = validate_keyspace(ctx, req->param);
auto column_families = parse_tables(keyspace, ctx, req->query_parameters, "cf");
apilog.info("perform_keyspace_flush: keyspace={} tables={}", keyspace, column_families);
auto &db = ctx.db.local();
if (column_families.empty()) {
co_await db.flush_on_all(keyspace);
@@ -683,6 +688,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::decommission.set(r, [&ss](std::unique_ptr<request> req) {
apilog.info("decommission");
return ss.local().decommission().then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -698,6 +704,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::remove_node.set(r, [&ss](std::unique_ptr<request> req) {
auto host_id = req->get_query_param("host_id");
std::vector<sstring> ignore_nodes_strs = split(req->get_query_param("ignore_nodes"), ",");
apilog.info("remove_node: host_id={} ignore_nodes={}", host_id, ignore_nodes_strs);
auto ignore_nodes = std::list<gms::inet_address>();
for (std::string n : ignore_nodes_strs) {
try {
@@ -770,6 +777,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
ss::drain.set(r, [&ss](std::unique_ptr<request> req) {
apilog.info("drain");
return ss.local().drain().then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -802,12 +810,14 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
ss::stop_gossiping.set(r, [&ss](std::unique_ptr<request> req) {
apilog.info("stop_gossiping");
return ss.local().stop_gossiping().then([] {
return make_ready_future<json::json_return_type>(json_void());
});
});
ss::start_gossiping.set(r, [&ss](std::unique_ptr<request> req) {
apilog.info("start_gossiping");
return ss.local().start_gossiping().then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -904,6 +914,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::rebuild.set(r, [&ss](std::unique_ptr<request> req) {
auto source_dc = req->get_query_param("source_dc");
apilog.info("rebuild: source_dc={}", source_dc);
return ss.local().rebuild(std::move(source_dc)).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -940,6 +951,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
// FIXME: We should truncate schema tables if more than one node in the cluster.
auto& sp = service::get_storage_proxy();
auto& fs = sp.local().features();
apilog.info("reset_local_schema");
return db::schema_tables::recalculate_schema_version(sp, fs).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -947,6 +959,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::set_trace_probability.set(r, [](std::unique_ptr<request> req) {
auto probability = req->get_query_param("probability");
apilog.info("set_trace_probability: probability={}", probability);
return futurize_invoke([probability] {
double real_prob = std::stod(probability.c_str());
return tracing::tracing::tracing_instance().invoke_on_all([real_prob] (auto& local_tracing) {
@@ -984,6 +997,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
auto ttl = req->get_query_param("ttl");
auto threshold = req->get_query_param("threshold");
auto fast = req->get_query_param("fast");
apilog.info("set_slow_query: enable={} ttl={} threshold={} fast={}", enable, ttl, threshold, fast);
try {
return tracing::tracing::tracing_instance().invoke_on_all([enable, ttl, threshold, fast] (auto& local_tracing) {
if (threshold != "") {
@@ -1010,6 +1024,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
auto keyspace = validate_keyspace(ctx, req->param);
auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");
apilog.info("enable_auto_compaction: keyspace={} tables={}", keyspace, tables);
return set_tables_autocompaction(ctx, ss.local(), keyspace, tables, true);
});
@@ -1017,6 +1032,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
auto keyspace = validate_keyspace(ctx, req->param);
auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");
apilog.info("disable_auto_compaction: keyspace={} tables={}", keyspace, tables);
return set_tables_autocompaction(ctx, ss.local(), keyspace, tables, false);
});

View File

@@ -2008,7 +2008,8 @@ with open(buildfile_tmp, 'w') as f:
f.write('build {}: cxx.{} {} || {}\n'.format(obj, mode, cc, ' '.join(serializers)))
if cc.endswith('Parser.cpp'):
# Unoptimized parsers end up using huge amounts of stack space and overflowing their stack
flags = '-O1'
flags = '-O1' if modes[mode]['optimization-level'] in ['0', 'g', 's'] else ''
if has_sanitize_address_use_after_scope:
flags += ' -fno-sanitize-address-use-after-scope'
f.write(' obj_cxxflags = %s\n' % flags)

View File

@@ -1293,7 +1293,7 @@ expression search_and_replace(const expression& e,
};
},
[&] (const binary_operator& oper) -> expression {
return binary_operator(recurse(oper.lhs), oper.op, recurse(oper.rhs));
return binary_operator(recurse(oper.lhs), oper.op, recurse(oper.rhs), oper.order);
},
[&] (const column_mutation_attribute& cma) -> expression {
return column_mutation_attribute{cma.kind, recurse(cma.column)};

View File

@@ -83,7 +83,7 @@ public:
virtual sstring assignment_testable_source_context() const override {
auto&& name = _type->field_name(_field);
auto sname = sstring(reinterpret_cast<const char*>(name.begin(), name.size()));
auto sname = std::string_view(reinterpret_cast<const char*>(name.data()), name.size());
return format("{}.{}", _selected, sname);
}

View File

@@ -254,6 +254,10 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
if (options.getSerialConsistency() == null)
throw new InvalidRequestException("Invalid empty serial consistency level");
#endif
for (size_t i = 0; i < _statements.size(); ++i) {
_statements[i].statement->validate_primary_key_restrictions(options.for_statement(i));
}
if (_has_conditions) {
++_stats.cas_batches;
_stats.statements_in_cas_batches += _statements.size();

View File

@@ -121,6 +121,9 @@ std::optional<mutation> cas_request::apply(foreign_ptr<lw_shared_ptr<query::resu
const update_parameters::prefetch_data::row* cas_request::find_old_row(const cas_row_update& op) const {
static const clustering_key empty_ckey = clustering_key::make_empty();
if (_key.empty()) {
throw exceptions::invalid_request_exception("partition key ranges empty - probably caused by an unset value");
}
const partition_key& pkey = _key.front().start()->value().key().value();
// If a statement has only static columns conditions, we must ignore its clustering columns
// restriction when choosing a row to check the conditions, i.e. choose any partition row,
@@ -134,6 +137,9 @@ const update_parameters::prefetch_data::row* cas_request::find_old_row(const cas
// Another case when we pass an empty clustering key prefix is apparently when the table
// doesn't have any clustering key columns and the clustering key range is empty (open
// ended on both sides).
if (op.ranges.empty()) {
throw exceptions::invalid_request_exception("clustering key ranges empty - probably caused by an unset value");
}
const clustering_key& ckey = !op.statement.has_only_static_column_conditions() && op.ranges.front().start() ?
op.ranges.front().start()->value() : empty_ckey;
return _rows.find_row(pkey, ckey);

View File

@@ -242,6 +242,12 @@ modification_statement::execute(query_processor& qp, service::query_state& qs, c
return modify_stage(this, seastar::ref(qp), seastar::ref(qs), seastar::cref(options));
}
void modification_statement::validate_primary_key_restrictions(const query_options& options) const {
if (_restrictions->range_or_slice_eq_null(options)) { // See #7852 and #9290.
throw exceptions::invalid_request_exception("Invalid null value in condition for a key column");
}
}
future<::shared_ptr<cql_transport::messages::result_message>>
modification_statement::do_execute(query_processor& qp, service::query_state& qs, const query_options& options) const {
if (has_conditions() && options.get_protocol_version() == 1) {
@@ -252,9 +258,7 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
inc_cql_stats(qs.get_client_state().is_internal());
if (_restrictions->range_or_slice_eq_null(options)) { // See #7852 and #9290.
throw exceptions::invalid_request_exception("Invalid null value in condition for a key column");
}
validate_primary_key_restrictions(options);
if (has_conditions()) {
return execute_with_condition(qp, qs, options);

View File

@@ -231,6 +231,8 @@ public:
// True if this statement needs to read only static column values to check if it can be applied.
bool has_only_static_column_conditions() const { return !_has_regular_column_conditions && _has_static_column_conditions; }
void validate_primary_key_restrictions(const query_options& options) const;
virtual future<::shared_ptr<cql_transport::messages::result_message>>
execute(query_processor& qp, service::query_state& qs, const query_options& options) const override;

View File

@@ -218,7 +218,7 @@ struct from_json_object_visitor {
throw marshal_exception("bytes_type must be represented as string");
}
std::string_view string_v = rjson::to_string_view(value);
if (string_v.size() < 2 && string_v[0] != '0' && string_v[1] != 'x') {
if (string_v.size() < 2 || string_v[0] != '0' || string_v[1] != 'x') {
throw marshal_exception("Blob JSON strings must start with 0x");
}
string_v.remove_prefix(2);

View File

@@ -3072,11 +3072,11 @@ mutation system_keyspace::make_group0_history_state_id_mutation(
using namespace std::chrono;
assert(*gc_older_than >= gc_clock::duration{0});
auto ts_millis = duration_cast<milliseconds>(microseconds{ts});
auto gc_older_than_millis = duration_cast<milliseconds>(*gc_older_than);
assert(gc_older_than_millis < ts_millis);
auto ts_micros = microseconds{ts};
auto gc_older_than_micros = duration_cast<microseconds>(*gc_older_than);
assert(gc_older_than_micros < ts_micros);
auto tomb_upper_bound = utils::UUID_gen::min_time_UUID(ts_millis - gc_older_than_millis);
auto tomb_upper_bound = utils::UUID_gen::min_time_UUID(ts_micros - gc_older_than_micros);
// We want to delete all entries with IDs smaller than `tomb_upper_bound`
// but the deleted range is of the form (x, +inf) since the schema is reversed.
auto range = query::clustering_range::make_starting_with({

View File

@@ -74,33 +74,29 @@ row_locker::lock_pk(const dht::decorated_key& pk, bool exclusive, db::timeout_cl
future<row_locker::lock_holder>
row_locker::lock_ck(const dht::decorated_key& pk, const clustering_key_prefix& cpk, bool exclusive, db::timeout_clock::time_point timeout, stats& stats) {
mylog.debug("taking shared lock on partition {}, and {} lock on row {} in it", pk, (exclusive ? "exclusive" : "shared"), cpk);
auto ck = cpk;
// Create a two-level lock entry for the partition if it doesn't exist already.
auto i = _two_level_locks.try_emplace(pk, this).first;
// The two-level lock entry we've just created is guaranteed to be kept alive as long as it's locked.
// Initiating read locking in the background below ensures that even if the two-level lock is currently
// write-locked, releasing the write-lock will synchronously engage any waiting
// locks and will keep the entry alive.
future<lock_type::holder> lock_partition = i->second._partition_lock.hold_read_lock(timeout);
auto j = i->second._row_locks.find(cpk);
if (j == i->second._row_locks.end()) {
// Not yet locked, need to create the lock. This makes a copy of cpk.
try {
j = i->second._row_locks.emplace(cpk, lock_type()).first;
} catch(...) {
// If this emplace() failed, e.g., out of memory, we fail. We
// could do nothing - the partition lock we already started
// taking will be unlocked automatically after being locked.
// But it's better form to wait for the work we started, and it
// will also allow us to remove the hash-table row we added.
return lock_partition.then([ex = std::current_exception()] (auto lock) {
// The lock is automatically released when "lock" goes out of scope.
// TODO: unlock (lock = {}) now, search for the partition in the
// hash table (we know it's still there, because we held the lock until
// now) and remove the unused lock from the hash table if still unused.
return make_exception_future<row_locker::lock_holder>(std::current_exception());
});
}
}
single_lock_stats &single_lock_stats = exclusive ? stats.exclusive_row : stats.shared_row;
single_lock_stats.operations_currently_waiting_for_lock++;
utils::latency_counter waiting_latency;
waiting_latency.start();
return lock_partition.then([this, pk = &i->first, cpk = &j->first, &row_lock = j->second, exclusive, &single_lock_stats, waiting_latency = std::move(waiting_latency), timeout] (auto lock1) mutable {
return lock_partition.then([this, pk = &i->first, row_locks = &i->second._row_locks, ck = std::move(ck), exclusive, &single_lock_stats, waiting_latency = std::move(waiting_latency), timeout] (auto lock1) mutable {
auto j = row_locks->find(ck);
if (j == row_locks->end()) {
// Not yet locked, need to create the lock.
j = row_locks->emplace(std::move(ck), lock_type()).first;
}
auto* cpk = &j->first;
auto& row_lock = j->second;
// Like the two-level lock entry above, the row_lock entry we've just created
// is guaranteed to be kept alive as long as it's locked.
// Initiating read/write locking in the background below ensures that.
auto lock_row = exclusive ? row_lock.hold_write_lock(timeout) : row_lock.hold_read_lock(timeout);
return lock_row.then([this, pk, cpk, exclusive, &single_lock_stats, waiting_latency = std::move(waiting_latency), lock1 = std::move(lock1)] (auto lock2) mutable {
// FIXME: indentation

View File

@@ -947,8 +947,12 @@ future<stop_iteration> view_update_builder::stop() const {
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
future<utils::chunked_vector<frozen_mutation_and_schema>> view_update_builder::build_some() {
future<std::optional<utils::chunked_vector<frozen_mutation_and_schema>>> view_update_builder::build_some() {
return advance_all().then([this] (stop_iteration ignored) {
if (!_update && !_existing) {
// Tell the caller there is no more data to build.
return make_ready_future<std::optional<utils::chunked_vector<frozen_mutation_and_schema>>>(std::nullopt);
}
bool do_advance_updates = false;
bool do_advance_existings = false;
if (_update && _update->is_partition_start()) {
@@ -960,22 +964,23 @@ future<utils::chunked_vector<frozen_mutation_and_schema>> view_update_builder::b
_existing_tombstone_tracker.set_partition_tombstone(_existing->as_partition_start().partition_tombstone());
do_advance_existings = true;
}
future<stop_iteration> f = make_ready_future<stop_iteration>(stop_iteration::no);
if (do_advance_updates) {
return do_advance_existings ? advance_all() : advance_updates();
f = do_advance_existings ? advance_all() : advance_updates();
} else if (do_advance_existings) {
return advance_existings();
f = advance_existings();
}
return make_ready_future<stop_iteration>(stop_iteration::no);
}).then([this] (stop_iteration ignored) {
return repeat([this] {
return this->on_results();
return std::move(f).then([this] (stop_iteration ignored) {
return repeat([this] {
return this->on_results();
});
}).then([this] {
utils::chunked_vector<frozen_mutation_and_schema> mutations;
for (auto& update : _view_updates) {
update.move_to(mutations);
}
return std::make_optional(mutations);
});
}).then([this] {
utils::chunked_vector<frozen_mutation_and_schema> mutations;
for (auto& update : _view_updates) {
update.move_to(mutations);
}
return mutations;
});
}
@@ -2145,24 +2150,28 @@ update_backlog node_update_backlog::add_fetch(unsigned shard, update_backlog bac
return std::max(backlog, _max.load(std::memory_order_relaxed));
}
future<bool> check_view_build_ongoing(db::system_distributed_keyspace& sys_dist_ks, const sstring& ks_name, const sstring& cf_name) {
return sys_dist_ks.view_status(ks_name, cf_name).then([] (std::unordered_map<utils::UUID, sstring>&& view_statuses) {
return boost::algorithm::any_of(view_statuses | boost::adaptors::map_values, [] (const sstring& view_status) {
return view_status == "STARTED";
future<bool> check_view_build_ongoing(db::system_distributed_keyspace& sys_dist_ks, const locator::token_metadata& tm, const sstring& ks_name,
const sstring& cf_name) {
using view_statuses_type = std::unordered_map<utils::UUID, sstring>;
return sys_dist_ks.view_status(ks_name, cf_name).then([&tm] (view_statuses_type&& view_statuses) {
return boost::algorithm::any_of(view_statuses, [&tm] (const view_statuses_type::value_type& view_status) {
// Only consider status of known hosts.
return view_status.second == "STARTED" && tm.get_endpoint_for_host_id(view_status.first);
});
});
}
future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const replica::table& t, streaming::stream_reason reason) {
future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const locator::token_metadata& tm, const replica::table& t,
streaming::stream_reason reason) {
if (is_internal_keyspace(t.schema()->ks_name())) {
return make_ready_future<bool>(false);
}
if (reason == streaming::stream_reason::repair && !t.views().empty()) {
return make_ready_future<bool>(true);
}
return do_with(t.views(), [&sys_dist_ks] (auto& views) {
return do_with(t.views(), [&sys_dist_ks, &tm] (auto& views) {
return map_reduce(views,
[&sys_dist_ks] (const view_ptr& view) { return check_view_build_ongoing(sys_dist_ks, view->ks_name(), view->cf_name()); },
[&sys_dist_ks, &tm] (const view_ptr& view) { return check_view_build_ongoing(sys_dist_ks, tm, view->ks_name(), view->cf_name()); },
false,
std::logical_or<bool>());
});

View File

@@ -185,7 +185,15 @@ public:
}
view_update_builder(view_update_builder&& other) noexcept = default;
future<utils::chunked_vector<frozen_mutation_and_schema>> build_some();
// build_some() works on batches of 100 (max_rows_for_view_updates)
// updated rows, but can_skip_view_updates() can decide that some of
// these rows do not affect the view, and as a result build_some() can
// build fewer than 100 rows - in extreme cases even zero (see issue #12297).
// So we can't use an empty returned vector to signify that the view
// update building is done - and we wrap the return value in an
// std::optional, which is disengaged when the iteration is done.
future<std::optional<utils::chunked_vector<frozen_mutation_and_schema>>> build_some();
future<> close() noexcept;

View File

@@ -22,9 +22,13 @@ class system_distributed_keyspace;
}
namespace locator {
class token_metadata;
}
namespace db::view {
future<bool> check_view_build_ongoing(db::system_distributed_keyspace& sys_dist_ks, const sstring& ks_name, const sstring& cf_name);
future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const replica::table& t, streaming::stream_reason reason);
future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_dist_ks, const locator::token_metadata& tm, const replica::table& t,
streaming::stream_reason reason);
}

View File

@@ -83,10 +83,10 @@ future<> view_update_generator::start() {
service::get_local_streaming_priority(),
nullptr,
::mutation_reader::forwarding::no);
auto close_sr = deferred_close(staging_sstable_reader);
inject_failure("view_update_generator_consume_staging_sstable");
auto result = staging_sstable_reader.consume_in_thread(view_updating_consumer(s, std::move(permit), *t, sstables, _as, staging_sstable_reader_handle));
staging_sstable_reader.close().get();
if (result == stop_iteration::yes) {
break;
}

View File

@@ -15,11 +15,18 @@
namespace dht {
// Note: Cassandra has a special case where for an empty key it returns
// minimum_token() instead of 0 (the naturally-calculated hash function for
// an empty string). Their thinking was that empty partition keys are not
// allowed anyway. However, they *are* allowed in materialized views, so the
// empty-key partition should get a real token, not an invalid token, so
// we dropped this special case. Since we don't support migrating sstables of
// materialized-views from Cassandra, this Cassandra-Scylla incompatibility
// will not cause problems in practice.
// Note that get_token(const schema& s, partition_key_view key) below must
// use exactly the same algorithm as this function.
token
murmur3_partitioner::get_token(bytes_view key) const {
if (key.empty()) {
return minimum_token();
}
std::array<uint64_t, 2> hash;
utils::murmur_hash::hash3_x64_128(key, 0, hash);
return get_token(hash[0]);

View File

@@ -42,7 +42,8 @@ if __name__ == '__main__':
if systemd_unit.available('systemd-coredump@.service'):
dropin = '''
[Service]
TimeoutStartSec=infinity
RuntimeMaxSec=infinity
TimeoutSec=infinity
'''[1:-1]
os.makedirs('/etc/systemd/system/systemd-coredump@.service.d', exist_ok=True)
with open('/etc/systemd/system/systemd-coredump@.service.d/timeout.conf', 'w') as f:

View File

@@ -16,7 +16,7 @@ import stat
import distro
from pathlib import Path
from scylla_util import *
from subprocess import run
from subprocess import run, SubprocessError
if __name__ == '__main__':
if os.getuid() > 0:
@@ -137,7 +137,9 @@ if __name__ == '__main__':
# stalling. The minimum block size for crc enabled filesystems is 1024,
# and it also cannot be smaller than the sector size.
block_size = max(1024, sector_size)
run('udevadm settle', shell=True, check=True)
run(f'mkfs.xfs -b size={block_size} {fsdev} -f -K', shell=True, check=True)
run('udevadm settle', shell=True, check=True)
if is_debian_variant():
confpath = '/etc/mdadm/mdadm.conf'
@@ -153,6 +155,11 @@ if __name__ == '__main__':
os.makedirs(mount_at, exist_ok=True)
uuid = run(f'blkid -s UUID -o value {fsdev}', shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
if not uuid:
raise Exception(f'Failed to get UUID of {fsdev}')
uuidpath = f'/dev/disk/by-uuid/{uuid}'
after = 'local-fs.target'
wants = ''
if raid and args.raid_level != '0':
@@ -169,7 +176,7 @@ After={after}{wants}
DefaultDependencies=no
[Mount]
What=/dev/disk/by-uuid/{uuid}
What={uuidpath}
Where={mount_at}
Type=xfs
Options=noatime{opt_discard}
@@ -191,8 +198,16 @@ WantedBy=multi-user.target
systemd_unit.reload()
if args.raid_level != '0':
md_service.start()
mount = systemd_unit(mntunit_bn)
mount.start()
try:
mount = systemd_unit(mntunit_bn)
mount.start()
except SubprocessError as e:
if not os.path.exists(uuidpath):
print(f'\nERROR: {uuidpath} is not found\n')
elif not stat.S_ISBLK(os.stat(uuidpath).st_mode):
print(f'\nERROR: {uuidpath} is not a block device\n')
raise e
if args.enable_on_nextboot:
mount.enable()
uid = pwd.getpwnam('scylla').pw_uid

View File

@@ -214,7 +214,7 @@ if __name__ == '__main__':
help='skip raid setup')
parser.add_argument('--raid-level-5', action='store_true', default=False,
help='use RAID5 for RAID volume')
parser.add_argument('--online-discard', default=True,
parser.add_argument('--online-discard', default=1, choices=[0, 1], type=int,
help='Configure XFS to discard unused blocks as soon as files are deleted')
parser.add_argument('--nic',
help='specify NIC')
@@ -458,7 +458,7 @@ if __name__ == '__main__':
args.no_raid_setup = not raid_setup
if raid_setup:
level = '5' if raid_level_5 else '0'
run_setup_script('RAID', f'scylla_raid_setup --disks {disks} --enable-on-nextboot --raid-level={level} --online-discard={int(online_discard)}')
run_setup_script('RAID', f'scylla_raid_setup --disks {disks} --enable-on-nextboot --raid-level={level} --online-discard={online_discard}')
coredump_setup = interactive_ask_service('Do you want to enable coredumps?', 'Yes - sets up coredump to allow a post-mortem analysis of the Scylla state just prior to a crash. No - skips this step.', coredump_setup)
args.no_coredump_setup = not coredump_setup

View File

@@ -68,7 +68,12 @@ class ScyllaSetup:
def cqlshrc(self):
home = os.environ['HOME']
hostname = subprocess.check_output(['hostname', '-i']).decode('ascii').strip()
if self._rpcAddress:
hostname = self._rpcAddress
elif self._listenAddress:
hostname = self._listenAddress
else:
hostname = subprocess.check_output(['hostname', '-i']).decode('ascii').strip()
with open("%s/.cqlshrc" % home, "w") as cqlshrc:
cqlshrc.write("[connection]\nhostname = %s\n" % hostname)

View File

@@ -7,7 +7,7 @@ Group: Applications/Databases
License: AGPLv3
URL: http://www.scylladb.com/
Source0: %{reloc_pkg}
Requires: %{product}-server = %{version} %{product}-conf = %{version} %{product}-python3 = %{version} %{product}-kernel-conf = %{version} %{product}-jmx = %{version} %{product}-tools = %{version} %{product}-tools-core = %{version} %{product}-node-exporter = %{version}
Requires: %{product}-server = %{version}-%{release} %{product}-conf = %{version}-%{release} %{product}-python3 = %{version}-%{release} %{product}-kernel-conf = %{version}-%{release} %{product}-jmx = %{version}-%{release} %{product}-tools = %{version}-%{release} %{product}-tools-core = %{version}-%{release} %{product}-node-exporter = %{version}-%{release}
Obsoletes: scylla-server < 1.1
%global _debugsource_template %{nil}
@@ -54,7 +54,7 @@ Group: Applications/Databases
Summary: The Scylla database server
License: AGPLv3
URL: http://www.scylladb.com/
Requires: %{product}-conf = %{version} %{product}-python3 = %{version}
Requires: %{product}-conf = %{version}-%{release} %{product}-python3 = %{version}-%{release}
Conflicts: abrt
AutoReqProv: no

View File

@@ -774,11 +774,14 @@ make_flat_mutation_reader_from_mutations_v2(schema_ptr s, reader_permit permit,
std::optional<mutation_consume_cookie> _cookie;
private:
void flush_tombstones(position_in_partition_view pos) {
void flush_tombstones(position_in_partition_view pos, bool emit_end = false) {
_rt_gen.flush(pos, [&] (range_tombstone_change rt) {
_current_rt = rt.tombstone();
push_mutation_fragment(*_schema, _permit, std::move(rt));
});
if (emit_end && _current_rt) {
push_mutation_fragment(*_schema, _permit, range_tombstone_change(pos, {}));
}
}
void maybe_emit_partition_start() {
if (_dk) {
@@ -815,10 +818,7 @@ make_flat_mutation_reader_from_mutations_v2(schema_ptr s, reader_permit permit,
return stop_iteration::yes;
}
maybe_emit_partition_start();
flush_tombstones(position_in_partition::after_all_clustered_rows());
if (_current_rt) {
push_mutation_fragment(*_schema, _permit, range_tombstone_change(position_in_partition::after_all_clustered_rows(), {}));
}
flush_tombstones(position_in_partition::after_all_clustered_rows(), true);
push_mutation_fragment(*_schema, _permit, partition_end{});
return stop_iteration::no;
}
@@ -1986,11 +1986,14 @@ flat_mutation_reader_v2 upgrade_to_v2(flat_mutation_reader r) {
tombstone _current_rt;
std::optional<position_range> _pr;
public:
void flush_tombstones(position_in_partition_view pos) {
void flush_tombstones(position_in_partition_view pos, bool emit_end = false) {
_rt_gen.flush(pos, [&] (range_tombstone_change rt) {
_current_rt = rt.tombstone();
push_mutation_fragment(*_schema, _permit, std::move(rt));
});
if (emit_end && _current_rt) {
push_mutation_fragment(*_schema, _permit, range_tombstone_change(pos, {}));
}
}
void consume(static_row mf) {
push_mutation_fragment(*_schema, _permit, std::move(mf));
@@ -2015,11 +2018,9 @@ flat_mutation_reader_v2 upgrade_to_v2(flat_mutation_reader r) {
push_mutation_fragment(*_schema, _permit, std::move(mf));
}
void consume(partition_end mf) {
flush_tombstones(position_in_partition::after_all_clustered_rows());
flush_tombstones(position_in_partition::after_all_clustered_rows(), true);
if (_current_rt) {
assert(!_pr);
push_mutation_fragment(*_schema, _permit, range_tombstone_change(
position_in_partition::after_all_clustered_rows(), {}));
}
push_mutation_fragment(*_schema, _permit, std::move(mf));
}
@@ -2042,10 +2043,7 @@ flat_mutation_reader_v2 upgrade_to_v2(flat_mutation_reader r) {
if (_reader.is_end_of_stream() && _reader.is_buffer_empty()) {
if (_pr) {
// If !_pr we should flush on partition_end
flush_tombstones(_pr->end());
if (_current_rt) {
push_mutation_fragment(*_schema, _permit, range_tombstone_change(_pr->end(), {}));
}
flush_tombstones(_pr->end(), true);
}
_end_of_stream = true;
}

View File

@@ -1012,10 +1012,10 @@ std::set<inet_address> gossiper::get_live_members() {
std::set<inet_address> gossiper::get_live_token_owners() {
std::set<inet_address> token_owners;
for (auto& member : get_live_members()) {
auto es = get_endpoint_state_for_endpoint_ptr(member);
if (es && !is_dead_state(*es) && get_token_metadata_ptr()->is_member(member)) {
token_owners.insert(member);
auto normal_token_owners = get_token_metadata_ptr()->get_all_endpoints();
for (auto& node: normal_token_owners) {
if (is_alive(node)) {
token_owners.insert(node);
}
}
return token_owners;
@@ -1023,10 +1023,10 @@ std::set<inet_address> gossiper::get_live_token_owners() {
std::set<inet_address> gossiper::get_unreachable_token_owners() {
std::set<inet_address> token_owners;
for (auto&& x : _unreachable_endpoints) {
auto& endpoint = x.first;
if (get_token_metadata_ptr()->is_member(endpoint)) {
token_owners.insert(endpoint);
auto normal_token_owners = get_token_metadata_ptr()->get_all_endpoints();
for (auto& node: normal_token_owners) {
if (!is_alive(node)) {
token_owners.insert(node);
}
}
return token_owners;

View File

@@ -215,22 +215,6 @@ effective_replication_map::get_primary_ranges_within_dc(inet_address ep) const {
});
}
future<std::unordered_multimap<inet_address, dht::token_range>>
abstract_replication_strategy::get_address_ranges(const token_metadata& tm) const {
std::unordered_multimap<inet_address, dht::token_range> ret;
for (auto& t : tm.sorted_tokens()) {
dht::token_range_vector r = tm.get_primary_ranges_for(t);
auto eps = co_await calculate_natural_endpoints(t, tm);
rslogger.debug("token={}, primary_range={}, address={}", t, r, eps);
for (auto ep : eps) {
for (auto&& rng : r) {
ret.emplace(ep, rng);
}
}
}
co_return ret;
}
future<std::unordered_multimap<inet_address, dht::token_range>>
abstract_replication_strategy::get_address_ranges(const token_metadata& tm, inet_address endpoint) const {
std::unordered_multimap<inet_address, dht::token_range> ret;

View File

@@ -112,7 +112,6 @@ public:
future<dht::token_range_vector> get_ranges(inet_address ep, token_metadata_ptr tmptr) const;
public:
future<std::unordered_multimap<inet_address, dht::token_range>> get_address_ranges(const token_metadata& tm) const;
future<std::unordered_multimap<inet_address, dht::token_range>> get_address_ranges(const token_metadata& tm, inet_address endpoint) const;
// Caller must ensure that token_metadata will not change throughout the call.

View File

@@ -15,6 +15,7 @@
#include <seastar/core/coroutine.hh>
#include <seastar/core/seastar.hh>
#include <seastar/http/response_parser.hh>
#include <seastar/http/reply.hh>
#include <seastar/net/api.hh>
#include <seastar/net/dns.hh>
@@ -47,7 +48,8 @@ future<> azure_snitch::load_config() {
logger().info("AzureSnitch using region: {}, zone: {}.", azure_region, azure_zone);
_my_rack = azure_zone;
// Zoneless regions return empty zone
_my_rack = (azure_zone != "" ? azure_zone : azure_region);
_my_dc = azure_region;
co_return co_await _my_distributed->invoke_on_all([this] (snitch_ptr& local_s) {
@@ -90,6 +92,10 @@ future<sstring> azure_snitch::azure_api_call(sstring path) {
// Read HTTP response header first
auto rsp = parser.get_parsed_response();
if (rsp->_status_code != static_cast<int>(httpd::reply::status_type::ok)) {
throw std::runtime_error(format("Error: HTTP response status {}", rsp->_status_code));
}
auto it = rsp->_headers.find("Content-Length");
if (it == rsp->_headers.end()) {
throw std::runtime_error("Error: HTTP response does not contain: Content-Length\n");

View File

@@ -2,6 +2,7 @@
#include <seastar/core/seastar.hh>
#include <seastar/core/sleep.hh>
#include <seastar/core/do_with.hh>
#include <seastar/http/reply.hh>
#include <boost/algorithm/string/classification.hpp>
#include <boost/algorithm/string/split.hpp>
@@ -114,6 +115,9 @@ future<sstring> ec2_snitch::aws_api_call_once(sstring addr, uint16_t port, sstri
// Read HTTP response header first
auto _rsp = _parser.get_parsed_response();
if (_rsp->_status_code != static_cast<int>(httpd::reply::status_type::ok)) {
return make_exception_future<sstring>(std::runtime_error(format("Error: HTTP response status {}", _rsp->_status_code)));
}
auto it = _rsp->_headers.find("Content-Length");
if (it == _rsp->_headers.end()) {
return make_exception_future<sstring>("Error: HTTP response does not contain: Content-Length\n");

View File

@@ -14,6 +14,7 @@
#include <seastar/net/dns.hh>
#include <seastar/core/seastar.hh>
#include "locator/gce_snitch.hh"
#include <seastar/http/reply.hh>
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/classification.hpp>
@@ -106,6 +107,10 @@ future<sstring> gce_snitch::gce_api_call(sstring addr, sstring cmd) {
// Read HTTP response header first
auto rsp = parser.get_parsed_response();
if (rsp->_status_code != static_cast<int>(httpd::reply::status_type::ok)) {
throw std::runtime_error(format("Error: HTTP response status {}", rsp->_status_code));
}
auto it = rsp->_headers.find("Content-Length");
if (it == rsp->_headers.end()) {
throw std::runtime_error("Error: HTTP response does not contain: Content-Length\n");
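The azure, ec2 and gce snitch hunks above all add the same guard: reject any metadata-service response whose status is not 200 before going on to look for `Content-Length`. A minimal standalone sketch of that validation order, with a toy struct standing in for Seastar's parsed HTTP response (hypothetical names, not the real API):

```cpp
#include <cassert>
#include <map>
#include <stdexcept>
#include <string>

// Stands in for the parsed HTTP response the snitches inspect.
struct toy_response {
    int status_code;
    std::map<std::string, std::string> headers;
};

// Validate the status first, then the header the body read depends on,
// mirroring the order of the checks added in the snitch hunks.
std::string content_length_or_throw(const toy_response& rsp) {
    if (rsp.status_code != 200) {
        throw std::runtime_error("Error: HTTP response status " + std::to_string(rsp.status_code));
    }
    auto it = rsp.headers.find("Content-Length");
    if (it == rsp.headers.end()) {
        throw std::runtime_error("Error: HTTP response does not contain: Content-Length");
    }
    return it->second;
}
```

Checking the status before the header matters: a 404 or 500 from the metadata service may carry no `Content-Length` at all, and the status error is the more actionable one to surface.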

View File

@@ -786,13 +786,12 @@ void token_metadata_impl::calculate_pending_ranges_for_leaving(
const abstract_replication_strategy& strategy,
std::unordered_multimap<range<token>, inet_address>& new_pending_ranges,
mutable_token_metadata_ptr all_left_metadata) const {
std::unordered_multimap<inet_address, dht::token_range> address_ranges = strategy.get_address_ranges(unpimplified_this).get0();
// get all ranges that will be affected by leaving nodes
std::unordered_set<range<token>> affected_ranges;
for (auto endpoint : _leaving_endpoints) {
auto r = address_ranges.equal_range(endpoint);
for (auto x = r.first; x != r.second; x++) {
affected_ranges.emplace(x->second);
auto r = strategy.get_address_ranges(unpimplified_this, endpoint).get0();
for (const auto& x : r) {
affected_ranges.emplace(x.second);
}
}
// for each of those ranges, find what new nodes will be responsible for the range when
@@ -826,16 +825,14 @@ void token_metadata_impl::calculate_pending_ranges_for_replacing(
if (_replacing_endpoints.empty()) {
return;
}
auto address_ranges = strategy.get_address_ranges(unpimplified_this).get0();
for (const auto& node : _replacing_endpoints) {
auto existing_node = node.first;
auto replacing_node = node.second;
auto address_ranges = strategy.get_address_ranges(unpimplified_this, existing_node).get0();
for (const auto& x : address_ranges) {
seastar::thread::maybe_yield();
if (x.first == existing_node) {
tlogger.debug("Node {} replaces {} for range {}", replacing_node, existing_node, x.second);
new_pending_ranges.emplace(x.second, replacing_node);
}
tlogger.debug("Node {} replaces {} for range {}", replacing_node, existing_node, x.second);
new_pending_ranges.emplace(x.second, replacing_node);
}
}
}
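The token_metadata hunks above replace a "compute address ranges for every endpoint, then filter with `equal_range()`" pattern with a per-endpoint query. A minimal standalone sketch of the shape of that change (toy types, not the Scylla API):

```cpp
#include <cassert>
#include <map>
#include <string>
#include <utility>
#include <vector>

using token_range = std::pair<int, int>;

// Toy "strategy": each endpoint owns some fixed ranges.
static const std::multimap<std::string, token_range> ownership = {
    {"n1", {0, 10}}, {"n1", {20, 30}}, {"n2", {10, 20}},
};

// Old shape: build the full endpoint->ranges multimap, filter with equal_range().
std::vector<token_range> ranges_via_full_map(const std::string& ep) {
    std::vector<token_range> out;
    auto [first, last] = ownership.equal_range(ep);
    for (auto it = first; it != last; ++it) {
        out.push_back(it->second);
    }
    return out;
}

// New shape: ask for a single endpoint's ranges directly, so callers that only
// care about a few leaving/replaced nodes never materialize the whole map.
std::vector<token_range> ranges_for_endpoint(const std::string& ep) {
    std::vector<token_range> out;
    for (const auto& [owner, rng] : ownership) {
        if (owner == ep) {
            out.push_back(rng);
        }
    }
    return out;
}
```

Both return the same result per endpoint; the win in the real code is that `calculate_pending_ranges_for_leaving()` and `_for_replacing()` only inspect a handful of endpoints, so building the full cluster-wide multimap up front was wasted work.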

55
main.cc
View File

@@ -383,6 +383,8 @@ static auto defer_verbose_shutdown(const char* what, Func&& func) {
break;
}
}
} catch (const storage_io_error& e) {
do_abort = false;
} catch (...) {
}
auto msg = fmt::format("Unexpected error shutting down {}: {}", what, ex);
@@ -425,6 +427,39 @@ static int scylla_main(int ac, char** av) {
exit(1);
}
// Even in an environment that causes errors during Scylla initialization,
// "scylla --version" should be able to run without error.
// To do so, we need to parse and execute these options before
// initializing Scylla/Seastar classes.
bpo::options_description preinit_description("Scylla options");
bpo::variables_map preinit_vm;
preinit_description.add_options()
("version", bpo::bool_switch(), "print version number and exit")
("build-id", bpo::bool_switch(), "print build-id and exit")
("build-mode", bpo::bool_switch(), "print build mode and exit")
("list-tools", bpo::bool_switch(), "list included tools and exit");
auto preinit_parsed_opts = bpo::command_line_parser(ac, av).options(preinit_description).allow_unregistered().run();
bpo::store(preinit_parsed_opts, preinit_vm);
if (preinit_vm["version"].as<bool>()) {
fmt::print("{}\n", scylla_version());
return 0;
}
if (preinit_vm["build-id"].as<bool>()) {
fmt::print("{}\n", get_build_id());
return 0;
}
if (preinit_vm["build-mode"].as<bool>()) {
fmt::print("{}\n", scylla_build_mode());
return 0;
}
if (preinit_vm["list-tools"].as<bool>()) {
fmt::print(
"types - a command-line tool to examine values belonging to scylla types\n"
"sstable - a multifunctional command-line tool to examine the content of sstables\n"
);
return 0;
}
try {
runtime::init_uptime();
std::setvbuf(stdout, nullptr, _IOLBF, 1000);
@@ -479,26 +514,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
bpo::variables_map vm;
auto parsed_opts = bpo::command_line_parser(ac, av).options(app.get_options_description()).allow_unregistered().run();
bpo::store(parsed_opts, vm);
if (vm["version"].as<bool>()) {
fmt::print("{}\n", scylla_version());
return 0;
}
if (vm["build-id"].as<bool>()) {
fmt::print("{}\n", get_build_id());
return 0;
}
if (vm["build-mode"].as<bool>()) {
fmt::print("{}\n", scylla_build_mode());
return 0;
}
if (vm["list-tools"].as<bool>()) {
fmt::print(
"types - a command-line tool to examine values belonging to scylla types\n"
"sstable - a multifunctional command-line tool to examine the content of sstables\n"
);
return 0;
}
print_starting_message(ac, av, parsed_opts);
sharded<locator::shared_token_metadata> token_metadata;
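The main.cc hunks move the handling of `--version`, `--build-id`, `--build-mode` and `--list-tools` ahead of any Seastar/Scylla initialization, so they keep working even in environments where full startup would fail. A minimal sketch of the same "print and exit before heavy init" pattern, using plain argv scanning instead of `boost::program_options` (placeholder output strings, not the real ones):

```cpp
#include <cassert>
#include <optional>
#include <string>
#include <vector>

// Handle "print and exit" flags before any heavy initialization.
// Returns the text to print (and exit with) if a pre-init flag matched,
// std::nullopt to fall through to normal startup.
std::optional<std::string> handle_preinit_flags(const std::vector<std::string>& args) {
    for (const auto& a : args) {
        if (a == "--version")    return "5.0.x";     // placeholder version string
        if (a == "--build-id")   return "deadbeef";  // placeholder build id
        if (a == "--build-mode") return "release";   // placeholder build mode
    }
    return std::nullopt;
}
```

The real change parses these with a dedicated `preinit_description` and `allow_unregistered()`, so unknown server options are tolerated at this stage and re-parsed later by the full option set; the duplicated handlers further down `scylla_main()` are then deleted.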

View File

@@ -494,8 +494,16 @@ public:
_partition_limit -= _rows_in_current_partition > 0;
auto stop = consumer.consume_end_of_partition();
if (!sstable_compaction()) {
return _row_limit && _partition_limit && stop != stop_iteration::yes
stop = _row_limit && _partition_limit && stop != stop_iteration::yes
? stop_iteration::no : stop_iteration::yes;
// If we decided to stop earlier but decide to continue now, we
// are in effect skipping the partition. Do not leave `_stop` at
// `stop_iteration::yes` in this case, reset it back to
// `stop_iteration::no` as if we exhausted the partition.
if (_stop && !stop) {
_stop = stop_iteration::no;
}
return stop;
}
}
return stop_iteration::no;
@@ -540,6 +548,7 @@ public:
_current_partition_limit = std::min(_row_limit, _partition_row_limit);
_query_time = query_time;
_stats = {};
_stop = stop_iteration::no;
noop_compacted_fragments_consumer nc;

View File

@@ -1240,7 +1240,10 @@ future<flat_mutation_reader> evictable_reader::resume_or_create_reader() {
if (auto reader_opt = try_resume()) {
co_return std::move(*reader_opt);
}
co_await _permit.maybe_wait_readmission();
// See evictable_reader_v2::resume_or_create_reader()
if (_permit.needs_readmission()) {
co_await _permit.wait_readmission();
}
co_return recreate_reader();
}
@@ -1773,7 +1776,18 @@ future<flat_mutation_reader_v2> evictable_reader_v2::resume_or_create_reader() {
if (auto reader_opt = try_resume()) {
co_return std::move(*reader_opt);
}
co_await _permit.maybe_wait_readmission();
// When the reader is created the first time and we are actually resuming a
// saved reader in `recreate_reader()`, we have two cases here:
// * the reader is still alive (in inactive state)
// * the reader was evicted
// We check for this below with `needs_readmission()` and it is very
// important to not allow for preemption between said check and
// `recreate_reader()`, otherwise the reader might be evicted between the
// check and `recreate_reader()` and the latter will recreate it without
// waiting for re-admission.
if (_permit.needs_readmission()) {
co_await _permit.wait_readmission();
}
co_return recreate_reader();
}
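The two `resume_or_create_reader()` hunks above replace a single `maybe_wait_readmission()` call with an explicit `needs_readmission()` check followed by `wait_readmission()`, so the caller controls exactly where suspension may happen and nothing can preempt between the check and `recreate_reader()`. A toy, non-coroutine model of that split (hypothetical names):

```cpp
#include <cassert>
#include <string>
#include <vector>

enum class permit_state { active, inactive, evicted };

struct toy_permit {
    permit_state state = permit_state::active;
    // Cheap, non-suspending check: only evicted permits need readmission.
    bool needs_readmission() const { return state == permit_state::evicted; }
    // Stands in for the future-returning wait; the only suspension point.
    void wait_readmission() { state = permit_state::active; }
};

// resume_or_create_reader() analogue: records which steps ran.
std::vector<std::string> resume(toy_permit& p) {
    std::vector<std::string> trace;
    if (p.needs_readmission()) {
        p.wait_readmission();           // may suspend in the real code
        trace.push_back("readmitted");
    }
    trace.push_back("recreate_reader"); // must follow the check without yielding
    return trace;
}
```

The point of splitting the call: a combined `maybe_wait_readmission()` that is `co_await`ed unconditionally introduces a suspension point even when no wait is needed, and the reader could be evicted during that window, letting `recreate_reader()` run without re-admission.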
@@ -1959,7 +1973,9 @@ future<> evictable_reader_v2::fill_buffer() {
auto* next_mf = co_await _reader->peek();
// First make sure we've made progress w.r.t. _next_position_in_partition.
while (next_mf && _tri_cmp(_next_position_in_partition, buffer().back().position()) <= 0) {
// This loop becomes infinite when next pos is a partition start.
// In that case progress is guaranteed anyway, so skip this loop entirely.
while (!_next_position_in_partition.is_partition_start() && next_mf && _tri_cmp(_next_position_in_partition, buffer().back().position()) <= 0) {
push_mutation_fragment(_reader->pop_mutation_fragment());
next_mf = co_await _reader->peek();
}

View File

@@ -92,14 +92,13 @@ void trim_clustering_row_ranges_to(const schema& s, clustering_row_ranges& range
}
void trim_clustering_row_ranges_to(const schema& s, clustering_row_ranges& ranges, const clustering_key& key, bool reversed) {
if (key.is_full(s)) {
if (key.is_full(s) || reversed) {
return trim_clustering_row_ranges_to(s, ranges,
reversed ? position_in_partition_view::before_key(key) : position_in_partition_view::after_key(key), reversed);
}
auto full_key = key;
clustering_key::make_full(s, full_key);
return trim_clustering_row_ranges_to(s, ranges,
reversed ? position_in_partition_view::after_key(full_key) : position_in_partition_view::before_key(full_key), reversed);
return trim_clustering_row_ranges_to(s, ranges, position_in_partition_view::before_key(full_key), reversed);
}

View File

@@ -68,22 +68,33 @@ public:
// for accumulated range tombstones.
// After this, only range_tombstones with positions >= upper_bound may be added,
// which guarantees that they won't affect the output of this flush.
//
// If upper_bound == position_in_partition::after_all_clustered_rows(),
// emits all remaining range_tombstone_changes.
// No range_tombstones may be added after this.
//
// FIXME: respect preemption
template<RangeTombstoneChangeConsumer C>
void flush(position_in_partition_view upper_bound, C consumer) {
position_in_partition::less_compare less(_schema);
std::optional<range_tombstone> prev;
void flush(const position_in_partition_view upper_bound, C consumer) {
if (_range_tombstones.empty()) {
_lower_bound = upper_bound;
return;
}
while (!_range_tombstones.empty() && less(_range_tombstones.begin()->end_position(), upper_bound)) {
position_in_partition::tri_compare cmp(_schema);
std::optional<range_tombstone> prev;
bool flush_all = cmp(upper_bound, position_in_partition::after_all_clustered_rows()) == 0;
while (!_range_tombstones.empty() && (flush_all || (cmp(_range_tombstones.begin()->end_position(), upper_bound) < 0))) {
auto rt = _range_tombstones.pop(_range_tombstones.begin());
if (prev && less(prev->end_position(), rt.position())) { // [1]
if (prev && (cmp(prev->end_position(), rt.position()) < 0)) { // [1]
// previous range tombstone not adjacent, emit gap.
consumer(range_tombstone_change(prev->end_position(), tombstone()));
}
// Check if start of rt was already emitted, emit if not.
if (!less(rt.position(), _lower_bound)) {
if (cmp(rt.position(), _lower_bound) >= 0) {
consumer(range_tombstone_change(rt.position(), rt.tomb));
}
@@ -95,15 +106,15 @@ public:
// It cannot get adjacent later because prev->end_position() < upper_bound,
// so nothing == prev->end_position() can be added after this invocation.
if (prev && (_range_tombstones.empty()
|| less(prev->end_position(), _range_tombstones.begin()->position()))) {
|| (cmp(prev->end_position(), _range_tombstones.begin()->position()) < 0))) {
consumer(range_tombstone_change(prev->end_position(), tombstone())); // [2]
}
// Emit the fragment for start bound of a range_tombstone which is overlapping with upper_bound,
// unless no such fragment or already emitted.
if (!_range_tombstones.empty()
&& less(_range_tombstones.begin()->position(), upper_bound)
&& (!less(_range_tombstones.begin()->position(), _lower_bound))) {
&& (cmp(_range_tombstones.begin()->position(), upper_bound) < 0)
&& (cmp(_range_tombstones.begin()->position(), _lower_bound) >= 0)) {
consumer(range_tombstone_change(
_range_tombstones.begin()->position(), _range_tombstones.begin()->tombstone().tomb));
}
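The `flush()` hunk above adds a `flush_all` mode: when `upper_bound` equals `position_in_partition::after_all_clustered_rows()`, every buffered range tombstone is emitted regardless of how its end position compares to the bound. A toy sketch of that sentinel-driven flush, with ints standing in for positions (simplified, not the real comparator logic):

```cpp
#include <cassert>
#include <limits>
#include <vector>

// Stands in for after_all_clustered_rows(): an upper bound equal to this
// sentinel flushes everything buffered.
constexpr int end_of_partition = std::numeric_limits<int>::max();

std::vector<int> flush(std::vector<int>& buffered, int upper_bound) {
    std::vector<int> emitted;
    const bool flush_all = (upper_bound == end_of_partition);
    while (!buffered.empty() && (flush_all || buffered.front() < upper_bound)) {
        emitted.push_back(buffered.front());
        buffered.erase(buffered.begin());
    }
    return emitted;
}
```

The hunk also swaps `less_compare` for a single `tri_compare`, so the `flush_all` test and the strict-less/greater-equal checks all reuse one three-way comparison instead of constructing two comparator objects.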

View File

@@ -294,10 +294,11 @@ public:
}
}
future<> maybe_wait_readmission() {
if (_state != reader_permit::state::evicted) {
return make_ready_future<>();
}
bool needs_readmission() const {
return _state == reader_permit::state::evicted;
}
future<> wait_readmission() {
return _semaphore.do_wait_admission(shared_from_this());
}
@@ -360,8 +361,16 @@ reader_concurrency_semaphore& reader_permit::semaphore() {
return _impl->semaphore();
}
future<> reader_permit::maybe_wait_readmission() {
return _impl->maybe_wait_readmission();
reader_permit::state reader_permit::get_state() const {
return _impl->get_state();
}
bool reader_permit::needs_readmission() const {
return _impl->needs_readmission();
}
future<> reader_permit::wait_readmission() {
return _impl->wait_readmission();
}
void reader_permit::consume(reader_resources res) {
@@ -661,11 +670,7 @@ reader_concurrency_semaphore::~reader_concurrency_semaphore() {
reader_concurrency_semaphore::inactive_read_handle reader_concurrency_semaphore::register_inactive_read(flat_mutation_reader_v2 reader) noexcept {
auto& permit_impl = *reader.permit()._impl;
permit_impl.on_register_as_inactive();
// Implies _inactive_reads.empty(), we don't queue new readers before
// evicting all inactive reads.
// Checking the _wait_list covers the count resources only, so check memory
// separately.
if (_wait_list.empty() && _resources.memory > 0) {
if (!should_evict_inactive_read()) {
try {
auto irp = std::make_unique<inactive_read>(std::move(reader));
auto& ir = *irp;
@@ -736,10 +741,7 @@ bool reader_concurrency_semaphore::try_evict_one_inactive_read(evict_reason reas
void reader_concurrency_semaphore::clear_inactive_reads() {
while (!_inactive_reads.empty()) {
auto& ir = _inactive_reads.front();
close_reader(std::move(ir.reader));
// Destroying the read unlinks it too.
std::unique_ptr<inactive_read> _(&*_inactive_reads.begin());
evict(_inactive_reads.front(), evict_reason::manual);
}
}
@@ -751,8 +753,6 @@ future<> reader_concurrency_semaphore::evict_inactive_reads_for_table(utils::UUI
++it;
if (ir.reader.schema()->id() == id) {
do_detach_inactive_reader(ir, evict_reason::manual);
ir.ttl_timer.cancel();
ir.unlink();
evicted_readers.push_back(ir);
}
}
@@ -785,6 +785,8 @@ future<> reader_concurrency_semaphore::stop() noexcept {
}
void reader_concurrency_semaphore::do_detach_inactive_reader(inactive_read& ir, evict_reason reason) noexcept {
ir.unlink();
ir.ttl_timer.cancel();
ir.detach();
ir.reader.permit()._impl->on_evicted();
try {
@@ -858,35 +860,89 @@ future<> reader_concurrency_semaphore::enqueue_waiter(reader_permit permit, read
}
void reader_concurrency_semaphore::evict_readers_in_background() {
if (_evicting) {
return;
}
_evicting = true;
// Evict inactive readers in the background while wait list isn't empty
// This is safe since stop() closes _gate;
(void)with_gate(_close_readers_gate, [this] {
return do_until([this] { return _wait_list.empty() || _inactive_reads.empty(); }, [this] {
return detach_inactive_reader(_inactive_reads.front(), evict_reason::permit).close();
return repeat([this] {
if (_inactive_reads.empty() || !should_evict_inactive_read()) {
_evicting = false;
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
return detach_inactive_reader(_inactive_reads.front(), evict_reason::permit).close().then([] {
return stop_iteration::no;
});
});
});
}
}
reader_concurrency_semaphore::admit_result
reader_concurrency_semaphore::can_admit_read(const reader_permit& permit) const noexcept {
if (!_ready_list.empty()) {
return {can_admit::no, reason::ready_list};
}
if (!all_used_permits_are_stalled()) {
return {can_admit::no, reason::used_permits};
}
if (!has_available_units(permit.base_resources())) {
auto reason = _resources.memory >= permit.base_resources().memory ? reason::memory_resources : reason::count_resources;
if (_inactive_reads.empty()) {
return {can_admit::no, reason};
} else {
return {can_admit::maybe, reason};
}
}
return {can_admit::yes, reason::all_ok};
}
bool reader_concurrency_semaphore::should_evict_inactive_read() const noexcept {
if (_resources.memory < 0 || _resources.count < 0) {
return true;
}
if (_wait_list.empty()) {
return false;
}
const auto r = can_admit_read(_wait_list.front().permit).why;
return r == reason::memory_resources || r == reason::count_resources;
}
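`can_admit_read()` above returns a decision paired with a reason, and a resource shortfall is downgraded from `no` to `maybe` when there are inactive reads that eviction could reclaim. A compilable toy of that decision table (simplified, and not the full Scylla admission logic):

```cpp
#include <cassert>

enum class can_admit { no, maybe, yes };
enum class reason { all_ok, ready_list, used_permits, memory_resources, count_resources };
struct admit_result { can_admit decision; reason why; };

struct toy_semaphore {
    bool ready_list_empty = true;
    bool used_permits_stalled = true;
    int memory = 0;                  // available units
    int count = 0;
    bool has_inactive_reads = false;

    admit_result can_admit_read(int need_memory, int need_count) const {
        if (!ready_list_empty)      return {can_admit::no, reason::ready_list};
        if (!used_permits_stalled)  return {can_admit::no, reason::used_permits};
        if (memory < need_memory || count < need_count) {
            auto r = memory >= need_memory ? reason::count_resources
                                           : reason::memory_resources;
            // Evicting inactive reads can free units, so the answer is
            // "maybe", not a definitive "no".
            return {has_inactive_reads ? can_admit::maybe : can_admit::no, r};
        }
        return {can_admit::yes, reason::all_ok};
    }
};
```

The `reason` enum doubles as an index into the `stats_table` of pointers-to-member further down the hunk, which is how each queueing cause gets its own counter with a single increment statement.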
future<> reader_concurrency_semaphore::do_wait_admission(reader_permit permit, read_func func) {
if (!_execution_loop_future) {
_execution_loop_future.emplace(execution_loop());
}
if (!_wait_list.empty() || !_ready_list.empty()) {
return enqueue_waiter(std::move(permit), std::move(func));
}
if (!has_available_units(permit.base_resources())) {
static uint64_t stats::*stats_table[] = {
&stats::reads_admitted_immediately,
&stats::reads_queued_because_ready_list,
&stats::reads_queued_because_used_permits,
&stats::reads_queued_because_memory_resources,
&stats::reads_queued_because_count_resources
};
const auto [admit, why] = can_admit_read(permit);
++(_stats.*stats_table[static_cast<int>(why)]);
if (admit != can_admit::yes || !_wait_list.empty()) {
auto fut = enqueue_waiter(std::move(permit), std::move(func));
if (!_inactive_reads.empty()) {
if (admit == can_admit::yes && !_wait_list.empty()) {
// This is a contradiction: the semaphore could admit waiters yet it has waiters.
// Normally, the semaphore should admit waiters as soon as it can.
// So at any point in time, there should either be no waiters, or it
// shouldn't be able to admit new reads. Otherwise something went wrong.
maybe_dump_reader_permit_diagnostics(*this, _permit_list, "semaphore could admit new reads yet there are waiters");
maybe_admit_waiters();
} else if (admit == can_admit::maybe) {
++_stats.reads_queued_with_eviction;
evict_readers_in_background();
}
return fut;
}
if (!all_used_permits_are_stalled()) {
return enqueue_waiter(std::move(permit), std::move(func));
}
permit.on_admission();
++_stats.reads_admitted;
if (func) {
@@ -896,7 +952,8 @@ future<> reader_concurrency_semaphore::do_wait_admission(reader_permit permit, r
}
void reader_concurrency_semaphore::maybe_admit_waiters() noexcept {
while (!_wait_list.empty() && _ready_list.empty() && has_available_units(_wait_list.front().permit.base_resources()) && all_used_permits_are_stalled()) {
auto admit = can_admit::no;
while (!_wait_list.empty() && (admit = can_admit_read(_wait_list.front().permit).decision) == can_admit::yes) {
auto& x = _wait_list.front();
try {
x.permit.on_admission();
@@ -911,6 +968,10 @@ void reader_concurrency_semaphore::maybe_admit_waiters() noexcept {
}
_wait_list.pop_front();
}
if (admit == can_admit::maybe) {
// Evicting readers will trigger another call to `maybe_admit_waiters()` from `signal()`.
evict_readers_in_background();
}
}
void reader_concurrency_semaphore::on_permit_created(reader_permit::impl& permit) {
@@ -987,6 +1048,13 @@ future<> reader_concurrency_semaphore::with_ready_permit(reader_permit permit, r
return fut;
}
void reader_concurrency_semaphore::set_resources(resources r) {
auto delta = r - _initial_resources;
_initial_resources = r;
_resources += delta;
maybe_admit_waiters();
}
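The new `set_resources()` above adjusts both the initial and the currently available resources by the same delta, so whatever permits currently consume stays consumed across a resize. A small arithmetic sketch with toy types:

```cpp
#include <cassert>

struct res { int count; int memory; };
res operator-(res a, res b) { return {a.count - b.count, a.memory - b.memory}; }
res& operator+=(res& a, res b) { a.count += b.count; a.memory += b.memory; return a; }

struct toy_semaphore {
    res initial;
    res available;  // initial minus whatever permits currently consume
    void set_resources(res r) {
        auto delta = r - initial;  // apply only the change, preserving consumption
        initial = r;
        available += delta;
    }
};
```

For example, with initial {10, 100} and available {4, 40} (6 count and 60 memory consumed), `set_resources({12, 100})` leaves available at {6, 40}: the two extra count units become available immediately, while outstanding consumption is untouched. This is also why the hunk drops `const` from `_initial_resources` and follows up with `maybe_admit_waiters()`, since a resize upward may unblock queued reads.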
void reader_concurrency_semaphore::broken(std::exception_ptr ex) {
if (!ex) {
ex = std::make_exception_ptr(broken_semaphore{});

View File

@@ -74,6 +74,18 @@ public:
uint64_t reads_admitted = 0;
// Total number of reads enqueued to wait for admission.
uint64_t reads_enqueued = 0;
// Total number of reads admitted immediately, without queueing
uint64_t reads_admitted_immediately = 0;
// Total number of reads enqueued because ready_list wasn't empty
uint64_t reads_queued_because_ready_list = 0;
// Total number of reads enqueued because there are used but unblocked permits
uint64_t reads_queued_because_used_permits = 0;
// Total number of reads enqueued because there weren't enough memory resources
uint64_t reads_queued_because_memory_resources = 0;
// Total number of reads enqueued because there weren't enough count resources
uint64_t reads_queued_because_count_resources = 0;
// Total number of reads enqueued to be maybe admitted after evicting some inactive reads
uint64_t reads_queued_with_eviction = 0;
// Total number of permits created so far.
uint64_t total_permits = 0;
// Current number of permits.
@@ -169,7 +181,7 @@ public:
};
private:
const resources _initial_resources;
resources _initial_resources;
resources _resources;
expiring_fifo<entry, expiry_handler, db::timeout_clock> _wait_list;
@@ -181,6 +193,7 @@ private:
stats _stats;
permit_list_type _permit_list;
bool _stopped = false;
bool _evicting = false;
gate _close_readers_gate;
gate _permit_gate;
std::optional<future<>> _execution_loop_future;
@@ -201,6 +214,19 @@ private:
future<> enqueue_waiter(reader_permit permit, read_func func);
void evict_readers_in_background();
future<> do_wait_admission(reader_permit permit, read_func func = {});
// Check whether permit can be admitted or not.
// The wait list is not taken into consideration, this is the caller's
// responsibility.
// A return value of can_admit::maybe means admission might be possible if
// some of the inactive readers are evicted.
enum class can_admit { no, maybe, yes };
enum class reason { all_ok = 0, ready_list, used_permits, memory_resources, count_resources };
struct admit_result { can_admit decision; reason why; };
admit_result can_admit_read(const reader_permit& permit) const noexcept;
bool should_evict_inactive_read() const noexcept;
void maybe_admit_waiters() noexcept;
void on_permit_created(reader_permit::impl&);
@@ -390,6 +416,12 @@ public:
/// optimal then just using \ref with_permit().
future<> with_ready_permit(reader_permit permit, read_func func);
/// Set the total resources of the semaphore to \p r.
///
/// After this call, \ref initial_resources() will reflect the new value.
/// Available resources will be adjusted by the delta.
void set_resources(resources r);
const resources initial_resources() const {
return _initial_resources;
}

View File

@@ -134,7 +134,12 @@ public:
reader_concurrency_semaphore& semaphore();
future<> maybe_wait_readmission();
state get_state() const;
bool needs_readmission() const;
// Call only when needs_readmission() = true.
future<> wait_readmission();
void consume(reader_resources res);
@@ -182,6 +187,8 @@ public:
reader_resources resources() const { return _resources; }
};
std::ostream& operator<<(std::ostream& os, reader_permit::state s);
/// Mark a permit as used.
///
/// Conceptually, a permit is considered used, when at least one reader

View File

@@ -279,6 +279,7 @@ using sstable_list = sstables::sstable_list;
namespace replica {
class distributed_loader;
struct table_population_metadata;
// The CF has a "stats" structure. But we don't want all fields here,
// since some of them are fairly complex for exporting to collectd. Also,
@@ -900,6 +901,8 @@ public:
// The future value is true iff offstrategy compaction was required.
future<bool> perform_offstrategy_compaction();
future<> run_offstrategy_compaction(sstables::compaction_data& info);
future<> perform_cleanup_compaction(replica::database& db);
void set_compaction_strategy(sstables::compaction_strategy_type strategy);
const sstables::compaction_strategy& get_compaction_strategy() const {
return _compaction_strategy;
@@ -925,7 +928,11 @@ public:
return _config;
}
compaction_manager& get_compaction_manager() const {
const compaction_manager& get_compaction_manager() const noexcept {
return _compaction_manager;
}
compaction_manager& get_compaction_manager() noexcept {
return _compaction_manager;
}
@@ -1080,6 +1087,7 @@ public:
friend class ::column_family_test;
friend class distributed_loader;
friend class table_population_metadata;
private:
timer<> _off_strategy_trigger;

View File

@@ -6,6 +6,7 @@
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
#include <seastar/core/coroutine.hh>
#include <seastar/util/closeable.hh>
#include "distributed_loader.hh"
#include "replica/database.hh"
@@ -361,7 +362,7 @@ distributed_loader::process_upload_dir(distributed<replica::database>& db, distr
&error_handler_gen_for_upload_dir);
}, sstables::sstable_directory::default_sstable_filter()).get();
const bool use_view_update_path = db::view::check_needs_view_update_path(sys_dist_ks.local(), *global_table, streaming::stream_reason::repair).get0();
const bool use_view_update_path = db::view::check_needs_view_update_path(sys_dist_ks.local(), db.local().get_token_metadata(), *global_table, streaming::stream_reason::repair).get0();
auto datadir = upload.parent_path();
if (use_view_update_path) {
@@ -454,92 +455,192 @@ future<> distributed_loader::handle_sstables_pending_delete(sstring pending_dele
});
}
future<> distributed_loader::populate_column_family(distributed<replica::database>& db, sstring sstdir, sstring ks, sstring cf, allow_offstrategy_compaction do_allow_offstrategy_compaction, must_exist dir_must_exist) {
dblog.debug("Populating {}/{}/{} allow_offstrategy_compaction={} must_exist={}", ks, cf, sstdir, do_allow_offstrategy_compaction, dir_must_exist);
return async([&db, sstdir = std::move(sstdir), ks = std::move(ks), cf = std::move(cf), do_allow_offstrategy_compaction, dir_must_exist] {
class table_population_metadata {
distributed<replica::database>& _db;
sstring _ks;
sstring _cf;
global_column_family_ptr _global_table;
fs::path _base_path;
std::unordered_map<sstring, lw_shared_ptr<sharded<sstables::sstable_directory>>> _sstable_directories;
sstables::sstable_version_types _highest_version = sstables::oldest_writable_sstable_format;
int64_t _highest_generation = 0;
public:
table_population_metadata(distributed<replica::database>& db, sstring ks, sstring cf)
: _db(db)
, _ks(std::move(ks))
, _cf(std::move(cf))
, _global_table(_db, _ks, _cf)
, _base_path(_global_table->dir())
{}
~table_population_metadata() {
// All directories must have been stopped
// using table_population_metadata::stop()
assert(_sstable_directories.empty());
}
future<> start() {
assert(this_shard_id() == 0);
if (!file_exists(sstdir).get0()) {
if (dir_must_exist) {
throw std::runtime_error(format("Populating {}/{} failed: {} does not exist", ks, cf, sstdir));
}
return;
for (auto subdir : { "", sstables::staging_dir, sstables::quarantine_dir }) {
co_await start_subdir(subdir);
}
// First pass, cleanup temporary sstable directories and sstables pending delete.
cleanup_column_family_temp_sst_dirs(sstdir).get();
auto pending_delete_dir = sstdir + "/" + sstables::sstable::pending_delete_dir_basename();
auto exists = file_exists(pending_delete_dir).get0();
if (exists) {
handle_sstables_pending_delete(pending_delete_dir).get();
co_await smp::invoke_on_all([this] {
_global_table->update_sstables_known_generation(_highest_generation);
return _global_table->disable_auto_compaction();
});
}
future<> stop() {
for (auto it = _sstable_directories.begin(); it != _sstable_directories.end(); it = _sstable_directories.erase(it)) {
co_await it->second->stop();
}
}
global_column_family_ptr global_table(db, ks, cf);
fs::path get_path(std::string_view subdir) {
return subdir.empty() ? _base_path : _base_path / subdir;
}
sharded<sstables::sstable_directory> directory;
directory.start(fs::path(sstdir), db.local().get_config().initial_sstable_loading_concurrency(), std::ref(db.local().get_sharded_sst_dir_semaphore()),
sstables::sstable_directory::need_mutate_level::no,
sstables::sstable_directory::lack_of_toc_fatal::yes,
sstables::sstable_directory::enable_dangerous_direct_import_of_cassandra_counters(db.local().get_config().enable_dangerous_direct_import_of_cassandra_counters()),
sstables::sstable_directory::allow_loading_materialized_view::yes,
[&global_table] (fs::path dir, int64_t gen, sstables::sstable_version_types v, sstables::sstable_format_types f) {
return global_table->make_sstable(dir.native(), gen, v, f);
}).get();
distributed<replica::database>& db() noexcept {
return _db;
}
auto stop = deferred_stop(directory);
const sstring& ks() const noexcept {
return _ks;
}
lock_table(directory, db, ks, cf).get();
process_sstable_dir(directory).get();
const sstring& cf() const noexcept {
return _cf;
}
// If we are resharding system tables before we can read them, we will not
// know which is the highest format we support: this information is itself stored
// in the system tables. In that case we'll rely on what we find on disk: we'll
// at least not downgrade any files. If we already know that we support a higher
// format than the one we see then we use that.
auto sys_format = global_table->get_sstables_manager().get_highest_supported_format();
auto sst_version = highest_version_seen(directory, sys_format).get0();
auto generation = highest_generation_seen(directory).get0();
global_column_family_ptr& global_table() noexcept {
return _global_table;
};
db.invoke_on_all([&global_table, generation] (replica::database& db) {
global_table->update_sstables_known_generation(generation);
return global_table->disable_auto_compaction();
}).get();
const global_column_family_ptr& global_table() const noexcept {
return _global_table;
};
reshard(directory, db, ks, cf, [&global_table, sstdir, sst_version] (shard_id shard) mutable {
auto gen = smp::submit_to(shard, [&global_table] () {
return global_table->calculate_generation_for_new_table();
}).get0();
const std::unordered_map<sstring, lw_shared_ptr<sharded<sstables::sstable_directory>>>& sstable_directories() const noexcept {
return _sstable_directories;
}
return global_table->make_sstable(sstdir, gen, sst_version, sstables::sstable::format_types::big);
}).get();
sstables::sstable::version_types highest_version() const noexcept {
return _highest_version;
}
// The node is offline at this point so we are very lenient with what we consider
// offstrategy.
// SSTables created by repair may not conform to compaction strategy layout goal
// because data segregation is only performed by compaction
// Instead of reshaping them on boot, let's add them to maintenance set and allow
// off-strategy compaction to reshape them. This will allow the node to come online
// ASAP. Given that SSTables with repair origin are disjoint, they can be efficiently
// read from.
auto eligible_for_reshape_on_boot = [] (const sstables::shared_sstable& sst) {
return sst->get_origin() != sstables::repair_origin;
};
int64_t highest_generation() const noexcept {
return _highest_generation;
}
reshape(directory, db, sstables::reshape_mode::relaxed, ks, cf, [global_table, sstdir, sst_version] (shard_id shard) {
auto gen = global_table->calculate_generation_for_new_table();
return global_table->make_sstable(sstdir, gen, sst_version, sstables::sstable::format_types::big);
}, eligible_for_reshape_on_boot).get();
private:
future<> start_subdir(sstring subdir);
};
directory.invoke_on_all([global_table, &eligible_for_reshape_on_boot, do_allow_offstrategy_compaction] (sstables::sstable_directory& dir) {
return dir.do_for_each_sstable([&global_table, &eligible_for_reshape_on_boot, do_allow_offstrategy_compaction] (sstables::shared_sstable sst) {
auto requires_offstrategy = sstables::offstrategy(do_allow_offstrategy_compaction && !eligible_for_reshape_on_boot(sst));
return global_table->add_sstable_and_update_cache(sst, requires_offstrategy);
}).then([&global_table, do_allow_offstrategy_compaction] {
if (do_allow_offstrategy_compaction) {
global_table->trigger_offstrategy_compaction();
}
});
}).get();
future<> table_population_metadata::start_subdir(sstring subdir) {
sstring sstdir = get_path(subdir).native();
if (!co_await file_exists(sstdir)) {
co_return;
}
// First pass, cleanup temporary sstable directories and sstables pending delete.
co_await distributed_loader::cleanup_column_family_temp_sst_dirs(sstdir);
auto pending_delete_dir = sstdir + "/" + sstables::sstable::pending_delete_dir_basename();
auto exists = co_await file_exists(pending_delete_dir);
if (exists) {
co_await distributed_loader::handle_sstables_pending_delete(pending_delete_dir);
}
auto dptr = make_lw_shared<sharded<sstables::sstable_directory>>();
auto& directory = *dptr;
auto& global_table = _global_table;
auto& db = _db;
co_await directory.start(fs::path(sstdir),
db.local().get_config().initial_sstable_loading_concurrency(), std::ref(db.local().get_sharded_sst_dir_semaphore()),
sstables::sstable_directory::need_mutate_level::no,
sstables::sstable_directory::lack_of_toc_fatal::yes,
sstables::sstable_directory::enable_dangerous_direct_import_of_cassandra_counters(db.local().get_config().enable_dangerous_direct_import_of_cassandra_counters()),
sstables::sstable_directory::allow_loading_materialized_view::yes,
[&global_table] (fs::path dir, int64_t gen, sstables::sstable_version_types v, sstables::sstable_format_types f) {
return global_table->make_sstable(dir.native(), gen, v, f);
});
// directory must be stopped using table_population_metadata::stop below
_sstable_directories[subdir] = dptr;
co_await distributed_loader::lock_table(directory, _db, _ks, _cf);
co_await distributed_loader::process_sstable_dir(directory);
// If we are resharding system tables before we can read them, we will not
// know which is the highest format we support: this information is itself stored
// in the system tables. In that case we'll rely on what we find on disk: we'll
// at least not downgrade any files. If we already know that we support a higher
// format than the one we see then we use that.
auto sys_format = global_table->get_sstables_manager().get_highest_supported_format();
auto sst_version = co_await highest_version_seen(directory, sys_format);
auto generation = co_await highest_generation_seen(directory);
_highest_version = std::max(sst_version, _highest_version);
_highest_generation = std::max(generation, _highest_generation);
}
future<> distributed_loader::populate_column_family(table_population_metadata& metadata, sstring subdir, allow_offstrategy_compaction do_allow_offstrategy_compaction, must_exist dir_must_exist) {
auto& db = metadata.db();
const auto& ks = metadata.ks();
const auto& cf = metadata.cf();
auto sstdir = metadata.get_path(subdir).native();
dblog.debug("Populating {}/{}/{} allow_offstrategy_compaction={} must_exist={}", ks, cf, sstdir, do_allow_offstrategy_compaction, dir_must_exist);
assert(this_shard_id() == 0);
if (!co_await file_exists(sstdir)) {
if (dir_must_exist) {
throw std::runtime_error(format("Populating {}/{} failed: {} does not exist", metadata.ks(), metadata.cf(), sstdir));
}
co_return;
}
auto& global_table = metadata.global_table();
if (!metadata.sstable_directories().contains(subdir)) {
dblog.error("Could not find sstables directory {}.{}/{}", ks, cf, subdir);
}
auto& directory = *metadata.sstable_directories().at(subdir);
auto sst_version = metadata.highest_version();
co_await reshard(directory, db, ks, cf, [&global_table, sstdir, sst_version] (shard_id shard) mutable {
auto gen = smp::submit_to(shard, [&global_table] () {
return global_table->calculate_generation_for_new_table();
}).get0();
return global_table->make_sstable(sstdir, gen, sst_version, sstables::sstable::format_types::big);
});
// The node is offline at this point so we are very lenient with what we consider
// offstrategy.
// SSTables created by repair may not conform to the compaction strategy's layout goal,
// because data segregation is only performed by compaction.
// Instead of reshaping them on boot, let's add them to the maintenance set and allow
// off-strategy compaction to reshape them. This allows the node to come online
// ASAP. Given that SSTables with repair origin are disjoint, they can be efficiently
// read from.
auto eligible_for_reshape_on_boot = [] (const sstables::shared_sstable& sst) {
return sst->get_origin() != sstables::repair_origin;
};
co_await reshape(directory, db, sstables::reshape_mode::relaxed, ks, cf, [global_table, sstdir, sst_version] (shard_id shard) {
auto gen = global_table->calculate_generation_for_new_table();
return global_table->make_sstable(sstdir, gen, sst_version, sstables::sstable::format_types::big);
}, eligible_for_reshape_on_boot);
co_await directory.invoke_on_all([global_table, &eligible_for_reshape_on_boot, do_allow_offstrategy_compaction] (sstables::sstable_directory& dir) -> future<> {
co_await dir.do_for_each_sstable([&global_table, &eligible_for_reshape_on_boot, do_allow_offstrategy_compaction] (sstables::shared_sstable sst) {
auto requires_offstrategy = sstables::offstrategy(do_allow_offstrategy_compaction && !eligible_for_reshape_on_boot(sst));
return global_table->add_sstable_and_update_cache(sst, requires_offstrategy);
});
if (do_allow_offstrategy_compaction) {
global_table->trigger_offstrategy_compaction();
}
});
}
@@ -549,41 +650,51 @@ future<> distributed_loader::populate_keyspace(distributed<replica::database>& d
auto i = keyspaces.find(ks_name);
if (i == keyspaces.end()) {
dblog.warn("Skipping undefined keyspace: {}", ks_name);
return make_ready_future<>();
} else {
dblog.info("Populating Keyspace {}", ks_name);
auto& ks = i->second;
auto& column_families = db.local().get_column_families();
return parallel_for_each(ks.metadata()->cf_meta_data() | boost::adaptors::map_values,
[ks_name, ksdir, &ks, &column_families, &db] (schema_ptr s) {
utils::UUID uuid = s->id();
lw_shared_ptr<replica::column_family> cf = column_families[uuid];
sstring cfname = cf->schema()->cf_name();
auto sstdir = ks.column_family_directory(ksdir, cfname, uuid);
dblog.info("Keyspace {}: Reading CF {} id={} version={}", ks_name, cfname, uuid, s->version());
return ks.make_directory_for_column_family(cfname, uuid).then([&db, sstdir, uuid, ks_name, cfname] {
return distributed_loader::populate_column_family(db, sstdir + "/" + sstables::staging_dir, ks_name, cfname, allow_offstrategy_compaction::no);
}).then([&db, sstdir, ks_name, cfname] {
return distributed_loader::populate_column_family(db, sstdir + "/" + sstables::quarantine_dir, ks_name, cfname, allow_offstrategy_compaction::no, must_exist::no);
}).then([&db, sstdir, uuid, ks_name, cfname] {
return distributed_loader::populate_column_family(db, sstdir, ks_name, cfname, allow_offstrategy_compaction::yes);
}).handle_exception([ks_name, cfname, sstdir](std::exception_ptr eptr) {
std::string msg =
format("Exception while populating keyspace '{}' with column family '{}' from file '{}': {}",
ks_name, cfname, sstdir, eptr);
dblog.error("Exception while populating keyspace '{}' with column family '{}' from file '{}': {}",
ks_name, cfname, sstdir, eptr);
try {
std::rethrow_exception(eptr);
} catch (sstables::compaction_stopped_exception& e) {
// swallow compaction stopped exception, to allow clean shutdown.
} catch (...) {
throw std::runtime_error(msg.c_str());
}
});
});
co_return;
}
dblog.info("Populating Keyspace {}", ks_name);
auto& ks = i->second;
auto& column_families = db.local().get_column_families();
co_await parallel_for_each(ks.metadata()->cf_meta_data() | boost::adaptors::map_values, [&] (schema_ptr s) -> future<> {
utils::UUID uuid = s->id();
lw_shared_ptr<replica::column_family> cf = column_families[uuid];
sstring cfname = cf->schema()->cf_name();
auto sstdir = ks.column_family_directory(ksdir, cfname, uuid);
dblog.info("Keyspace {}: Reading CF {} id={} version={}", ks_name, cfname, uuid, s->version());
auto metadata = table_population_metadata(db, ks_name, cfname);
std::exception_ptr ex;
try {
co_await ks.make_directory_for_column_family(cfname, uuid);
co_await metadata.start();
co_await distributed_loader::populate_column_family(metadata, sstables::staging_dir, allow_offstrategy_compaction::no);
co_await distributed_loader::populate_column_family(metadata, sstables::quarantine_dir, allow_offstrategy_compaction::no, must_exist::no);
co_await distributed_loader::populate_column_family(metadata, "", allow_offstrategy_compaction::yes);
} catch (...) {
std::exception_ptr eptr = std::current_exception();
std::string msg =
format("Exception while populating keyspace '{}' with column family '{}' from file '{}': {}",
ks_name, cfname, sstdir, eptr);
dblog.error("Exception while populating keyspace '{}' with column family '{}' from file '{}': {}",
ks_name, cfname, sstdir, eptr);
try {
std::rethrow_exception(eptr);
} catch (sstables::compaction_stopped_exception& e) {
// swallow compaction stopped exception, to allow clean shutdown.
} catch (...) {
ex = std::make_exception_ptr(std::runtime_error(msg.c_str()));
}
}
co_await metadata.stop();
if (ex) {
std::rethrow_exception(std::move(ex));
}
});
}
future<> distributed_loader::init_system_keyspace(distributed<replica::database>& db, distributed<service::storage_service>& ss, sharded<gms::gossiper>& g, db::config& cfg) {


@@ -57,8 +57,11 @@ class distributed_loader_for_tests;
namespace replica {
class table_population_metadata;
class distributed_loader {
friend class ::distributed_loader_for_tests;
friend class table_population_metadata;
static future<> reshape(sharded<sstables::sstable_directory>& dir, sharded<replica::database>& db, sstables::reshape_mode mode,
sstring ks_name, sstring table_name, sstables::compaction_sstable_creator_fn creator, std::function<bool (const sstables::shared_sstable&)> filter);
@@ -70,7 +73,7 @@ class distributed_loader {
std::filesystem::path datadir, sstring ks, sstring cf);
using allow_offstrategy_compaction = bool_class<struct allow_offstrategy_compaction_tag>;
using must_exist = bool_class<struct must_exist_tag>;
static future<> populate_column_family(distributed<replica::database>& db, sstring sstdir, sstring ks, sstring cf, allow_offstrategy_compaction, must_exist = must_exist::yes);
static future<> populate_column_family(table_population_metadata& metadata, sstring subdir, allow_offstrategy_compaction, must_exist = must_exist::yes);
static future<> populate_keyspace(distributed<replica::database>& db, sstring datadir, sstring ks_name);
static future<> cleanup_column_family_temp_sst_dirs(sstring sstdir);
static future<> handle_sstables_pending_delete(sstring pending_deletes_dir);


@@ -803,16 +803,15 @@ void table::set_metrics() {
}
void table::rebuild_statistics() {
// zeroing live_disk_space_used and live_sstable_count because the
// sstable list was re-created
_stats.live_disk_space_used = 0;
_stats.live_sstable_count = 0;
_stats.total_disk_space_used = 0;
_sstables->for_each_sstable([this] (const sstables::shared_sstable& tab) {
update_stats_for_new_sstable(tab->bytes_on_disk());
});
for (auto& tab : _sstables_compacted_but_not_deleted) {
update_stats_for_new_sstable(tab->bytes_on_disk());
_stats.total_disk_space_used += tab->bytes_on_disk();
}
}
@@ -1137,6 +1136,11 @@ future<> table::run_offstrategy_compaction(sstables::compaction_data& info) {
tlogger.info("Done with off-strategy compaction for {}.{}", _schema->ks_name(), _schema->cf_name());
}
future<> table::perform_cleanup_compaction(replica::database& db) {
co_await flush();
co_await get_compaction_manager().perform_cleanup(db, this);
}
void table::set_compaction_strategy(sstables::compaction_strategy_type strategy) {
tlogger.debug("Setting compaction strategy of {}.{} to {}", _schema->ks_name(), _schema->cf_name(), sstables::compaction_strategy::name(strategy));
auto new_cs = make_compaction_strategy(strategy, _schema->compaction_strategy_options());
@@ -1772,29 +1776,30 @@ future<> table::generate_and_propagate_view_updates(const schema_ptr& base,
tracing::trace_state_ptr tr_state,
gc_clock::time_point now) const {
auto base_token = m.token();
auto m_schema = m.schema();
db::view::view_update_builder builder = co_await db::view::make_view_update_builder(
base,
std::move(views),
make_flat_mutation_reader_from_mutations(m.schema(), std::move(permit), {std::move(m)}),
make_flat_mutation_reader_from_mutations(std::move(m_schema), std::move(permit), {std::move(m)}),
std::move(existings),
now);
std::exception_ptr err = nullptr;
while (true) {
utils::chunked_vector<frozen_mutation_and_schema> updates;
std::optional<utils::chunked_vector<frozen_mutation_and_schema>> updates;
try {
updates = co_await builder.build_some();
} catch (...) {
err = std::current_exception();
break;
}
if (updates.empty()) {
if (!updates) {
break;
}
tracing::trace(tr_state, "Generated {} view update mutations", updates.size());
auto units = seastar::consume_units(*_config.view_update_concurrency_semaphore, memory_usage_of(updates));
tracing::trace(tr_state, "Generated {} view update mutations", updates->size());
auto units = seastar::consume_units(*_config.view_update_concurrency_semaphore, memory_usage_of(*updates));
try {
co_await db::view::mutate_MV(base_token, std::move(updates), _view_stats, *_config.cf_stats, tr_state,
co_await db::view::mutate_MV(base_token, std::move(*updates), _view_stats, *_config.cf_stats, tr_state,
std::move(units), service::allow_hints::yes, db::view::wait_for_all_updates::no);
} catch (...) {
// Ignore exceptions: any individual failure to propagate a view update will be reported
@@ -1918,14 +1923,14 @@ future<> table::populate_views(
while (true) {
try {
auto updates = co_await builder.build_some();
if (updates.empty()) {
if (!updates) {
break;
}
size_t update_size = memory_usage_of(updates);
size_t update_size = memory_usage_of(*updates);
size_t units_to_wait_for = std::min(_config.view_update_concurrency_semaphore_limit, update_size);
auto units = co_await seastar::get_units(*_config.view_update_concurrency_semaphore, units_to_wait_for);
units.adopt(seastar::consume_units(*_config.view_update_concurrency_semaphore, update_size - units_to_wait_for));
co_await db::view::mutate_MV(base_token, std::move(updates), _view_stats, *_config.cf_stats,
co_await db::view::mutate_MV(base_token, std::move(*updates), _view_stats, *_config.cf_stats,
tracing::trace_state_ptr(), std::move(units), service::allow_hints::no, db::view::wait_for_all_updates::yes);
} catch (...) {
if (!err) {


@@ -950,6 +950,11 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater)
_prev_snapshot = {};
});
utils::coroutine update; // Destroy before cleanup to release snapshots before invalidating.
auto destroy_update = defer([&] {
with_allocator(_tracker.allocator(), [&] {
update = {};
});
});
partition_presence_checker is_present = _prev_snapshot->make_partition_presence_checker();
while (!m.partitions.empty()) {
with_allocator(_tracker.allocator(), [&] () {
@@ -1222,6 +1227,10 @@ void rows_entry::on_evicted(cache_tracker& tracker) noexcept {
// That dummy is linked in the LRU, because there may be partitions
// with no regular rows, and we need to track them.
unlink_from_lru();
// We still need to break continuity in order to preserve the "older versions are evicted first"
// invariant.
it->set_continuous(false);
} else {
// When evicting a dummy with both sides continuous we don't need to break continuity.
//


@@ -63,4 +63,15 @@ MemoryLimit=$MEMORY_LIMIT
EOS
fi
if [ -e /etc/systemd/system/systemd-coredump@.service.d/timeout.conf ]; then
COREDUMP_RUNTIME_MAX=$(grep RuntimeMaxSec /etc/systemd/system/systemd-coredump@.service.d/timeout.conf)
if [ -z "$COREDUMP_RUNTIME_MAX" ]; then
cat << EOS > /etc/systemd/system/systemd-coredump@.service.d/timeout.conf
[Service]
RuntimeMaxSec=infinity
TimeoutSec=infinity
EOS
fi
fi
systemctl --system daemon-reload >/dev/null || true

Submodule seastar updated: 9a7ba6d57e...62fd873d09


@@ -78,7 +78,7 @@ future<prepare_response> paxos_state::prepare(storage_proxy& sp, tracing::trace_
prv, tr_state, timeout);
});
});
return when_all(std::move(f1), std::move(f2)).then([state = std::move(state), only_digest] (auto t) {
return when_all(std::move(f1), std::move(f2)).then([state = std::move(state), only_digest, schema] (auto t) mutable {
if (utils::get_local_injector().enter("paxos_error_after_save_promise")) {
return make_exception_future<prepare_response>(utils::injected_error("injected_error_after_save_promise"));
}
@@ -103,8 +103,25 @@ future<prepare_response> paxos_state::prepare(storage_proxy& sp, tracing::trace_
auto ex = f2.get_exception();
logger.debug("Failed to get data or digest: {}. Ignored.", std::move(ex));
}
return make_ready_future<prepare_response>(prepare_response(promise(std::move(state._accepted_proposal),
std::move(state._most_recent_commit), std::move(data_or_digest))));
auto upgrade_if_needed = [schema = std::move(schema)] (std::optional<proposal> p) {
if (!p || p->update.schema_version() == schema->version()) {
return make_ready_future<std::optional<proposal>>(std::move(p));
}
// In case current schema is not the same as the schema in the proposal
// try to look it up first in the local schema_registry cache and upgrade
// the mutation using schema from the cache.
//
// If there's no schema in the cache, then retrieve persisted column mapping
// for that version and upgrade the mutation with it.
logger.debug("Stored mutation references outdated schema version. "
"Trying to upgrade the accepted proposal mutation to the most recent schema version.");
return service::get_column_mapping(p->update.column_family_id(), p->update.schema_version()).then([schema, p = std::move(p)] (const column_mapping& cm) {
return make_ready_future<std::optional<proposal>>(proposal(p->ballot, freeze(p->update.unfreeze_upgrading(schema, cm))));
});
};
return when_all_succeed(upgrade_if_needed(std::move(state._accepted_proposal)), upgrade_if_needed(std::move(state._most_recent_commit))).then([data_or_digest = std::move(data_or_digest)] (auto&& u) mutable {
return prepare_response(promise(std::move(std::get<0>(u)), std::move(std::get<1>(u)), std::move(data_or_digest)));
});
});
} else {
logger.debug("Promise rejected; {} is not sufficiently newer than {}", ballot, state._promised_ballot);
@@ -200,15 +217,9 @@ future<> paxos_state::learn(storage_proxy& sp, schema_ptr schema, proposal decis
// If there's no schema in the cache, then retrieve persisted column mapping
// for that version and upgrade the mutation with it.
if (decision.update.schema_version() != schema->version()) {
logger.debug("Stored mutation references outdated schema version. "
"Trying to upgrade the accepted proposal mutation to the most recent schema version.");
return service::get_column_mapping(decision.update.column_family_id(), decision.update.schema_version())
.then([&sp, schema, tr_state, timeout, &decision] (const column_mapping& cm) {
return do_with(decision.update.unfreeze_upgrading(schema, cm), [&sp, tr_state, timeout] (const mutation& upgraded) {
return sp.mutate_locally(upgraded, tr_state, db::commitlog::force_sync::yes, timeout);
});
});
on_internal_error(logger, format("schema version in learn does not match current schema"));
}
return sp.mutate_locally(schema, decision.update, tr_state, db::commitlog::force_sync::yes, timeout);
});
} else {


@@ -1227,19 +1227,15 @@ future<> paxos_response_handler::learn_decision(lw_shared_ptr<paxos::proposal> d
auto cdc = _proxy->get_cdc_service();
if (cdc && cdc->needs_cdc_augmentation(update_mut_vec)) {
f_cdc = cdc->augment_mutation_call(_timeout, std::move(update_mut_vec), tr_state, _cl_for_learn)
.then([this, base_tbl_id, cdc = cdc->shared_from_this()] (std::tuple<std::vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>&& t) {
auto mutations = std::move(std::get<0>(t));
auto tracker = std::move(std::get<1>(t));
// Pick only the CDC ("augmenting") mutations
std::erase_if(mutations, [base_tbl_id = std::move(base_tbl_id)] (const mutation& v) {
return v.schema()->id() == base_tbl_id;
});
if (mutations.empty()) {
return make_ready_future<>();
}
return _proxy->mutate_internal(std::move(mutations), _cl_for_learn, false, tr_state, _permit, _timeout, std::move(tracker));
auto cdc_shared = cdc->shared_from_this(); // keep CDC service alive
auto [mutations, tracker] = co_await cdc->augment_mutation_call(_timeout, std::move(update_mut_vec), tr_state, _cl_for_learn);
// Pick only the CDC ("augmenting") mutations
std::erase_if(mutations, [base_tbl_id = std::move(base_tbl_id)] (const mutation& v) {
return v.schema()->id() == base_tbl_id;
});
if (!mutations.empty()) {
f_cdc = _proxy->mutate_internal(std::move(mutations), _cl_for_learn, false, tr_state, _permit, _timeout, std::move(tracker));
}
}
}
@@ -1247,7 +1243,7 @@ future<> paxos_response_handler::learn_decision(lw_shared_ptr<paxos::proposal> d
std::array<std::tuple<lw_shared_ptr<paxos::proposal>, schema_ptr, shared_ptr<paxos_response_handler>, dht::token>, 1> m{std::make_tuple(std::move(decision), _schema, shared_from_this(), _key.token())};
future<> f_lwt = _proxy->mutate_internal(std::move(m), _cl_for_learn, false, tr_state, _permit, _timeout);
return when_all_succeed(std::move(f_cdc), std::move(f_lwt)).discard_result();
co_await when_all_succeed(std::move(f_cdc), std::move(f_lwt)).discard_result();
}
void paxos_response_handler::prune(utils::UUID ballot) {


@@ -2282,6 +2282,8 @@ future<> storage_service::removenode(sstring host_id_string, std::list<gms::inet
ss._group0->leave_group0(endpoint).get();
slogger.info("removenode[{}]: Finished removenode operation, removing node={}, sync_nodes={}, ignore_nodes={}", uuid, endpoint, nodes, ignore_nodes);
} catch (...) {
slogger.warn("removenode[{}]: removing node={}, sync_nodes={}, ignore_nodes={} failed, error {}",
uuid, endpoint, nodes, ignore_nodes, std::current_exception());
// we need to revert the effect of the prepare verb if the removenode operation failed
req.cmd = node_ops_cmd::removenode_abort;
parallel_for_each(nodes, [&ss, &req, &nodes_unknown_verb, &nodes_down, uuid] (const gms::inet_address& node) {


@@ -21,6 +21,7 @@
#include "unimplemented.hh"
#include "segmented_compress_params.hh"
#include "utils/class_registrator.hh"
#include "reader_permit.hh"
namespace sstables {
@@ -338,16 +339,18 @@ class compressed_file_data_source_impl : public data_source_impl {
sstables::compression* _compression_metadata;
sstables::compression::segmented_offsets::accessor _offsets;
sstables::local_compression _compression;
reader_permit _permit;
uint64_t _underlying_pos;
uint64_t _pos;
uint64_t _beg_pos;
uint64_t _end_pos;
public:
compressed_file_data_source_impl(file f, sstables::compression* cm,
uint64_t pos, size_t len, file_input_stream_options options)
uint64_t pos, size_t len, file_input_stream_options options, reader_permit permit)
: _compression_metadata(cm)
, _offsets(_compression_metadata->offsets.get_accessor())
, _compression(*cm)
, _permit(std::move(permit))
{
_beg_pos = pos;
if (pos > _compression_metadata->uncompressed_file_length()) {
@@ -412,7 +415,7 @@ public:
_pos += out.size();
_underlying_pos += addr.chunk_len;
return out;
return make_tracked_temporary_buffer(std::move(out), _permit);
});
}
@@ -444,9 +447,9 @@ requires ChecksumUtils<ChecksumType>
class compressed_file_data_source : public data_source {
public:
compressed_file_data_source(file f, sstables::compression* cm,
uint64_t offset, size_t len, file_input_stream_options options)
uint64_t offset, size_t len, file_input_stream_options options, reader_permit permit)
: data_source(std::make_unique<compressed_file_data_source_impl<ChecksumType>>(
std::move(f), cm, offset, len, std::move(options)))
std::move(f), cm, offset, len, std::move(options), std::move(permit)))
{}
};
@@ -454,10 +457,10 @@ template <typename ChecksumType>
requires ChecksumUtils<ChecksumType>
inline input_stream<char> make_compressed_file_input_stream(
file f, sstables::compression *cm, uint64_t offset, size_t len,
file_input_stream_options options)
file_input_stream_options options, reader_permit permit)
{
return input_stream<char>(compressed_file_data_source<ChecksumType>(
std::move(f), cm, offset, len, std::move(options)));
std::move(f), cm, offset, len, std::move(options), std::move(permit)));
}
// For SSTables 2.x (formats 'ka' and 'la'), the full checksum is a combination of checksums of compressed chunks.
@@ -569,15 +572,15 @@ inline output_stream<char> make_compressed_file_output_stream(output_stream<char
input_stream<char> sstables::make_compressed_file_k_l_format_input_stream(file f,
sstables::compression* cm, uint64_t offset, size_t len,
class file_input_stream_options options)
class file_input_stream_options options, reader_permit permit)
{
return make_compressed_file_input_stream<adler32_utils>(std::move(f), cm, offset, len, std::move(options));
return make_compressed_file_input_stream<adler32_utils>(std::move(f), cm, offset, len, std::move(options), std::move(permit));
}
input_stream<char> sstables::make_compressed_file_m_format_input_stream(file f,
sstables::compression *cm, uint64_t offset, size_t len,
class file_input_stream_options options) {
return make_compressed_file_input_stream<crc32_utils>(std::move(f), cm, offset, len, std::move(options));
class file_input_stream_options options, reader_permit permit) {
return make_compressed_file_input_stream<crc32_utils>(std::move(f), cm, offset, len, std::move(options), std::move(permit));
}
output_stream<char> sstables::make_compressed_file_m_format_output_stream(output_stream<char> out,


@@ -47,6 +47,8 @@
#include "checksum_utils.hh"
#include "../compress.hh"
class reader_permit;
class compression_parameters;
class compressor;
using compressor_ptr = shared_ptr<compressor>;
@@ -371,11 +373,11 @@ compressor_ptr get_sstable_compressor(const compression&);
// sstable alive, and the compression metadata is only a part of it.
input_stream<char> make_compressed_file_k_l_format_input_stream(file f,
sstables::compression* cm, uint64_t offset, size_t len,
class file_input_stream_options options);
class file_input_stream_options options, reader_permit permit);
input_stream<char> make_compressed_file_m_format_input_stream(file f,
sstables::compression* cm, uint64_t offset, size_t len,
class file_input_stream_options options);
class file_input_stream_options options, reader_permit permit);
output_stream<char> make_compressed_file_m_format_output_stream(output_stream<char> out,
sstables::compression* cm,


@@ -2287,7 +2287,7 @@ input_stream<char> sstable::data_stream(uint64_t pos, size_t len, const io_prior
options.read_ahead = 4;
options.dynamic_adjustments = std::move(history);
file f = make_tracked_file(_data_file, std::move(permit));
file f = make_tracked_file(_data_file, permit);
if (trace_state) {
f = tracing::make_traced_file(std::move(f), std::move(trace_state), format("{}:", get_filename()));
}
@@ -2296,10 +2296,10 @@ input_stream<char> sstable::data_stream(uint64_t pos, size_t len, const io_prior
if (_components->compression) {
if (_version >= sstable_version_types::mc) {
return make_compressed_file_m_format_input_stream(f, &_components->compression,
pos, len, std::move(options));
pos, len, std::move(options), permit);
} else {
return make_compressed_file_k_l_format_input_stream(f, &_components->compression,
pos, len, std::move(options));
pos, len, std::move(options), permit);
}
}


@@ -29,7 +29,7 @@ std::function<future<> (flat_mutation_reader)> make_streaming_consumer(sstring o
std::exception_ptr ex;
try {
auto cf = db.local().find_column_family(reader.schema()).shared_from_this();
auto use_view_update_path = co_await db::view::check_needs_view_update_path(sys_dist_ks.local(), *cf, reason);
auto use_view_update_path = co_await db::view::check_needs_view_update_path(sys_dist_ks.local(), db.local().get_token_metadata(), *cf, reason);
//FIXME: for better estimations this should be transmitted from remote
auto metadata = mutation_source_metadata{};
auto& cs = cf->get_compaction_strategy();


@@ -296,6 +296,17 @@ SEASTAR_TEST_CASE(test_insert_json_types) {
}
});
BOOST_REQUIRE_THROW(e.execute_cql(R"(
INSERT INTO all_types JSON '{
"a": "abc", "c": "6"
}'
)").get(), marshal_exception);
BOOST_REQUIRE_THROW(e.execute_cql(R"(
INSERT INTO all_types JSON '{
"a": "abc", "c": "0392fa"
}'
)").get(), marshal_exception);
e.execute_cql("CREATE TABLE multi_column_pk_table (p1 int, p2 int, p3 int, c1 int, c2 int, v int, PRIMARY KEY((p1, p2, p3), c1, c2));").get();
e.require_table_exists("ks", "multi_column_pk_table").get();


@@ -1607,13 +1607,13 @@ SEASTAR_TEST_CASE(test_trim_clustering_row_ranges_to) {
check_reversed(
{ {excl{9, 39}, incl{10}} },
{10},
{ {excl{9, 39}, incl{10, null{}}} });
{ {excl{9, 39}, excl{10}} });
// (13)
check_reversed(
{ {incl{9, 10}, incl{10, 30}} },
{10},
{ {incl{9, 10}, incl{10, null{}}} });
{ {incl{9, 10}, excl{10}} });
// (14)
check_reversed(
@@ -3865,6 +3865,50 @@ SEASTAR_THREAD_TEST_CASE(test_evictable_reader_clear_tombstone_in_discontinued_p
check(empty_buffer, "end of stream");
}
SEASTAR_THREAD_TEST_CASE(test_evictable_reader_next_pos_is_partition_start) {
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), 1, 0);
auto stop_sem = deferred_stop(semaphore);
simple_schema s;
auto schema = s.schema();
auto permit = semaphore.make_tracking_only_permit(s.schema().get(), get_name(), db::no_timeout);
auto pk = s.make_pkey();
const auto prange = dht::partition_range::make_open_ended_both_sides();
std::deque<mutation_fragment_v2> frags;
frags.emplace_back(*schema, permit, partition_start(pk, {}));
for (size_t ck = 0; ck < 1000; ++ck) {
frags.emplace_back(*schema, permit, range_tombstone_change(position_in_partition::before_key(s.make_ckey(ck)), tombstone(s.new_timestamp(), {})));
}
frags.emplace_back(*schema, permit, range_tombstone_change(position_in_partition::before_key(s.make_ckey(1001)), tombstone()));
frags.emplace_back(*schema, permit, partition_end{});
const auto max_buf_size = frags[0].memory_usage() + frags[1].memory_usage() + frags[2].memory_usage();
auto ms = mutation_source([&frags, max_buf_size] (
schema_ptr schema,
reader_permit permit,
const dht::partition_range& pr,
const query::partition_slice& ps,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd_sm,
mutation_reader::forwarding fwd_mr) {
auto rd = make_flat_mutation_reader_from_fragments(std::move(schema), std::move(permit), std::move(frags), pr, ps);
rd.set_max_buffer_size(max_buf_size);
return rd;
});
auto [rd, handle] = make_manually_paused_evictable_reader_v2(ms, schema, permit, prange, schema->full_slice(), default_priority_class(), {},
mutation_reader::forwarding::no);
auto stop_rd = deferred_close(rd);
rd.set_max_buffer_size(max_buf_size);
rd.fill_buffer().get();
auto buf1 = rd.detach_buffer();
BOOST_REQUIRE_EQUAL(buf1.size(), 3);
}
struct mutation_bounds {
std::optional<mutation> m;
position_in_partition lower;


@@ -3461,3 +3461,100 @@ SEASTAR_THREAD_TEST_CASE(test_compactor_range_tombstone_spanning_many_pages) {
BOOST_REQUIRE_EQUAL(res_mut, ref_mut);
}
}
SEASTAR_THREAD_TEST_CASE(test_compactor_detach_state) {
simple_schema ss;
auto pk = ss.make_pkey();
auto s = ss.schema();
tests::reader_concurrency_semaphore_wrapper semaphore;
auto permit = semaphore.make_permit();
const auto expiry_point = gc_clock::now() + std::chrono::days(10);
const auto marker_ts = ss.new_timestamp();
const auto tomb_ts = ss.new_timestamp();
const auto row_ts = ss.new_timestamp();
const auto query_time = gc_clock::now();
const auto max_rows = std::numeric_limits<uint64_t>::max();
const auto max_partitions = std::numeric_limits<uint32_t>::max();
auto make_frags = [&] {
std::deque<mutation_fragment_v2> frags;
frags.emplace_back(*s, permit, partition_start(pk, {}));
frags.emplace_back(*s, permit, ss.make_static_row_v2(permit, "static_row"));
const auto& v_def = *s->get_column_definition(to_bytes("v"));
frags.emplace_back(*s, permit, range_tombstone_change(position_in_partition::before_key(ss.make_ckey(0)), tombstone{tomb_ts, expiry_point}));
for (uint32_t ck = 0; ck < 1; ++ck) {
auto row = clustering_row(ss.make_ckey(ck));
row.cells().apply(v_def, atomic_cell::make_live(*v_def.type, row_ts, serialized("v")));
row.marker() = row_marker(marker_ts);
frags.emplace_back(mutation_fragment_v2(*s, permit, std::move(row)));
}
frags.emplace_back(*s, permit, range_tombstone_change(position_in_partition::after_key(ss.make_ckey(10)), tombstone{}));
frags.emplace_back(*s, permit, partition_end{});
return frags;
};
struct consumer {
uint64_t frags = 0;
const uint64_t frag_limit;
const bool final_stop;
consumer(uint64_t stop_at, bool final_stop) : frag_limit(stop_at + 1), final_stop(final_stop) { }
void consume_new_partition(const dht::decorated_key& dk) { }
void consume(const tombstone& t) { }
stop_iteration consume(static_row&& sr, tombstone, bool) {
const auto ret = ++frags >= frag_limit;
testlog.trace("consume(static_row) ret={}", ret);
return stop_iteration(ret);
}
stop_iteration consume(clustering_row&& cr, row_tombstone t, bool is_alive) {
const auto ret = ++frags >= frag_limit;
testlog.trace("consume(clustering_row) ret={}", ret);
return stop_iteration(ret);
}
stop_iteration consume(range_tombstone&& rt) {
const auto ret = ++frags >= frag_limit;
testlog.trace("consume(range_tombstone) ret={}", ret);
return stop_iteration(ret);
}
stop_iteration consume_end_of_partition() {
testlog.trace("consume_end_of_partition()");
return stop_iteration(final_stop);
}
void consume_end_of_stream() { }
};
// deduct 2 for partition start and end respectively
const auto inter_partition_frag_count = make_frags().size() - 2;
auto check = [&] (uint64_t stop_at, bool final_stop) {
testlog.debug("stop_at={}, final_stop={}", stop_at, final_stop);
auto compaction_state = make_lw_shared<compact_mutation_state<emit_only_live_rows::no, compact_for_sstables::no>>(*s, query_time, s->full_slice(), max_rows, max_partitions);
auto reader = make_flat_mutation_reader_from_fragments(s, permit, make_frags());
auto close_reader = deferred_close(reader);
reader.consume(compact_for_query<emit_only_live_rows::no, consumer>(compaction_state, consumer(stop_at, final_stop))).get();
const auto has_detached_state = bool(std::move(*compaction_state).detach_state());
if (stop_at < inter_partition_frag_count) {
BOOST_CHECK_EQUAL(has_detached_state, final_stop);
} else {
BOOST_CHECK(!has_detached_state);
}
};
for (unsigned stop_at = 0; stop_at < inter_partition_frag_count; ++stop_at) {
check(stop_at, true);
check(stop_at, false);
}
};


@@ -19,22 +19,30 @@
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_clear_inactive_reads) {
simple_schema s;
std::vector<reader_permit> permits;
std::vector<reader_concurrency_semaphore::inactive_read_handle> handles;
{
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::no_limits{}, get_name());
auto stop_sem = deferred_stop(semaphore);
auto clear_permits = defer([&permits] { permits.clear(); });
for (int i = 0; i < 10; ++i) {
handles.emplace_back(semaphore.register_inactive_read(make_empty_flat_reader_v2(s.schema(), semaphore.make_tracking_only_permit(s.schema().get(), get_name(), db::no_timeout))));
permits.emplace_back(semaphore.make_tracking_only_permit(s.schema().get(), get_name(), db::no_timeout));
handles.emplace_back(semaphore.register_inactive_read(make_empty_flat_reader_v2(s.schema(), permits.back())));
}
BOOST_REQUIRE(std::all_of(handles.begin(), handles.end(), [] (const reader_concurrency_semaphore::inactive_read_handle& handle) { return bool(handle); }));
BOOST_REQUIRE(std::all_of(permits.begin(), permits.end(), [] (const reader_permit& permit) { return permit.get_state() == reader_permit::state::inactive; }));
semaphore.clear_inactive_reads();
BOOST_REQUIRE(std::all_of(handles.begin(), handles.end(), [] (const reader_concurrency_semaphore::inactive_read_handle& handle) { return !bool(handle); }));
BOOST_REQUIRE(std::all_of(permits.begin(), permits.end(), [] (const reader_permit& permit) { return permit.get_state() == reader_permit::state::evicted; }));
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);
permits.clear();
handles.clear();
for (int i = 0; i < 10; ++i) {
@@ -134,13 +142,18 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_readmission_preserves
const auto consumed_resources = semaphore.available_resources();
semaphore.consume(consumed_resources);
auto fut = permit->maybe_wait_readmission();
auto fut = make_ready_future<>();
if (permit->needs_readmission()) {
fut = permit->wait_readmission();
}
BOOST_REQUIRE(!fut.available());
semaphore.signal(consumed_resources);
fut.get();
} else {
permit->maybe_wait_readmission().get();
if (permit->needs_readmission()) {
permit->wait_readmission().get();
}
}
BOOST_REQUIRE_EQUAL(permit->consumed_resources(), residue_units->resources() + base_resources);
@@ -223,7 +236,9 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_forward_progress) {
if (auto reader = _permit->semaphore().unregister_inactive_read(std::move(handle)); reader) {
_reader = downgrade_to_v1(std::move(*reader));
} else {
co_await _permit->maybe_wait_readmission();
if (_permit->needs_readmission()) {
co_await _permit->wait_readmission();
}
make_reader();
}
co_await tick(std::get<flat_mutation_reader>(_reader));
@@ -690,7 +705,10 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_admission) {
const auto stats_before = semaphore.get_stats();
auto wait_fut = permit.maybe_wait_readmission();
auto wait_fut = make_ready_future<>();
if (permit.needs_readmission()) {
wait_fut = permit.wait_readmission();
}
wait_fut.wait();
BOOST_REQUIRE(!wait_fut.failed());
@@ -946,3 +964,392 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_evict_inactive_reads_
handles.clear();
}
}
// Reproduces https://github.com/scylladb/scylladb/issues/11770
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_evict_inactive_reads_when_all_is_blocked) {
simple_schema ss;
const auto& s = *ss.schema();
const auto initial_resources = reader_concurrency_semaphore::resources{2, 32 * 1024};
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count, initial_resources.memory);
auto stop_sem = deferred_stop(semaphore);
class read {
reader_permit _permit;
promise<> _read_started_pr;
future<> _read_started_fut;
promise<> _read_done_pr;
reader_permit::used_guard _ug;
std::optional<reader_permit::blocked_guard> _bg;
public:
explicit read(reader_permit p) : _permit(std::move(p)), _read_started_fut(_read_started_pr.get_future()), _ug(_permit) { }
future<> wait_read_started() { return std::move(_read_started_fut); }
void set_read_done() { _read_done_pr.set_value(); }
void mark_as_blocked() { _bg.emplace(_permit); }
void mark_as_unblocked() { _bg.reset(); }
reader_concurrency_semaphore::read_func get_read_func() {
return [this] (reader_permit permit) -> future<> {
_read_started_pr.set_value();
co_await _read_done_pr.get_future();
};
}
};
auto p1 = semaphore.obtain_permit(&s, get_name(), 1024, db::no_timeout).get();
auto irh1 = semaphore.register_inactive_read(make_empty_flat_reader_v2(ss.schema(), p1));
auto p2 = semaphore.obtain_permit(&s, get_name(), 1024, db::no_timeout).get();
read rd2(p2);
auto fut2 = semaphore.with_ready_permit(p2, rd2.get_read_func());
// At this point we expect to have:
// * 1 inactive read (not evicted)
// * 1 used (but not blocked) read on the ready list
// * 1 waiter
// * no more count resources left
auto p3_fut = semaphore.obtain_permit(&s, get_name(), 1024, db::no_timeout);
BOOST_REQUIRE_EQUAL(semaphore.waiters(), 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_enqueued, 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().used_permits, 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().blocked_permits, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, 0);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 0);
BOOST_REQUIRE(irh1);
// Start the read, emptying the ready list; this should not be enough to admit p3
rd2.wait_read_started().get();
BOOST_REQUIRE_EQUAL(semaphore.waiters(), 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().used_permits, 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().blocked_permits, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, 0);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 0);
BOOST_REQUIRE(irh1);
// Marking p2 as blocked should now allow p3 to be admitted by evicting p1
rd2.mark_as_blocked();
BOOST_REQUIRE_EQUAL(semaphore.waiters(), 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().used_permits, 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().blocked_permits, 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, 1);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 0);
BOOST_REQUIRE(!irh1);
p3_fut.get();
rd2.mark_as_unblocked();
rd2.set_read_done();
fut2.get();
}
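The admission rule this test exercises can be sketched as a toy model (an illustrative Python sketch, not Scylla's implementation; `can_admit_by_evicting`, `admitted`, and `inactive` are names invented for this example): a waiter is allowed to make progress by evicting inactive reads only once every admitted permit is blocked, because nothing else can free resources at that point.

```python
# Toy model (not Scylla code) of the eviction rule from issue #11770:
# evicting inactive reads to admit a waiter is justified only when every
# admitted permit is blocked, i.e. no running read can free resources.
def can_admit_by_evicting(admitted, inactive):
    # admitted: state of each admitted permit, 'used' or 'blocked'
    # inactive: registered inactive reads available for eviction
    return bool(inactive) and all(state == 'blocked' for state in admitted)

# Mirrors the test: while rd2 is merely 'used', p1's inactive read stays;
# once rd2 is marked blocked, p1 is evicted and p3 can be admitted.
assert not can_admit_by_evicting(['used'], ['p1'])
assert can_admit_by_evicting(['blocked'], ['p1'])
assert not can_admit_by_evicting(['blocked'], [])  # nothing to evict
```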
// Check that `stop()` correctly evicts all inactive reads.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_stop_with_inactive_reads) {
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::no_limits{}, get_name());
simple_schema ss;
auto s = ss.schema();
auto permit = reader_permit_opt(semaphore.obtain_permit(s.get(), get_name(), 1024, db::no_timeout).get());
auto handle = semaphore.register_inactive_read(make_empty_flat_reader_v2(s, *permit));
BOOST_REQUIRE(handle);
BOOST_REQUIRE_EQUAL(permit->get_state(), reader_permit::state::inactive);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
// Using BOOST_CHECK_* because an exception thrown here causes a segfault,
// due to the stop future not being waited for.
auto stop_f = semaphore.stop();
BOOST_CHECK(!stop_f.available());
BOOST_CHECK(eventually_true([&] { return !semaphore.get_stats().inactive_reads; }));
BOOST_CHECK(!handle);
BOOST_CHECK_EQUAL(permit->get_state(), reader_permit::state::evicted);
// Stop waits on all permits, so we need to destroy the permit before we can
// wait on the stop future.
permit = {};
stop_f.get();
}
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_set_resources) {
const auto initial_resources = reader_concurrency_semaphore::resources{4, 4 * 1024};
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::for_tests{}, get_name(), initial_resources.count, initial_resources.memory);
auto stop_sem = deferred_stop(semaphore);
auto permit1 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get0();
auto permit2 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get0();
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), reader_resources(2, 2 * 1024));
BOOST_REQUIRE_EQUAL(semaphore.initial_resources(), reader_resources(4, 4 * 1024));
semaphore.set_resources({8, 8 * 1024});
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), reader_resources(6, 6 * 1024));
BOOST_REQUIRE_EQUAL(semaphore.initial_resources(), reader_resources(8, 8 * 1024));
semaphore.set_resources({2, 2 * 1024});
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), reader_resources(0, 0));
BOOST_REQUIRE_EQUAL(semaphore.initial_resources(), reader_resources(2, 2 * 1024));
semaphore.set_resources({3, 128});
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), reader_resources(1, 128 - 2 * 1024));
BOOST_REQUIRE_EQUAL(semaphore.initial_resources(), reader_resources(3, 128));
semaphore.set_resources({1, 3 * 1024});
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), reader_resources(-1, 1024));
BOOST_REQUIRE_EQUAL(semaphore.initial_resources(), reader_resources(1, 3 * 1024));
auto permit3_fut = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_enqueued, 1);
BOOST_REQUIRE_EQUAL(semaphore.waiters(), 1);
semaphore.set_resources({4, 4 * 1024});
BOOST_REQUIRE_EQUAL(semaphore.waiters(), 0);
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), reader_resources(1, 1024));
BOOST_REQUIRE_EQUAL(semaphore.initial_resources(), reader_resources(4, 4 * 1024));
permit3_fut.get();
}
// Check that inactive reads are not needlessly evicted when admission is not
// blocked on resources.
// This test covers all the cases where eviction should **not** happen.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_no_unnecessary_evicting) {
const auto initial_resources = reader_concurrency_semaphore::resources{2, 4 * 1024};
reader_concurrency_semaphore semaphore(initial_resources.count, initial_resources.memory, get_name(), 100);
auto stop_sem = deferred_stop(semaphore);
simple_schema ss;
auto s = ss.schema();
auto permit1 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get();
// There are available resources
{
BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 1);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 3 * 1024);
auto handle = semaphore.register_inactive_read(make_empty_flat_reader_v2(s, permit1));
BOOST_REQUIRE(handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
semaphore.set_resources(initial_resources);
BOOST_REQUIRE(handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
BOOST_REQUIRE(semaphore.unregister_inactive_read(std::move(handle)));
}
// Count resources are on the limit but no one wants more
{
auto permit2 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get();
BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 0);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 2 * 1024);
auto handle = semaphore.register_inactive_read(make_empty_flat_reader_v2(s, permit1));
BOOST_REQUIRE(handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
semaphore.set_resources(initial_resources);
BOOST_REQUIRE(handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
BOOST_REQUIRE(semaphore.unregister_inactive_read(std::move(handle)));
}
// Memory resources are on the limit but no one wants more
{
auto units = permit1.consume_memory(3 * 1024);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 1);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 0);
auto handle = semaphore.register_inactive_read(make_empty_flat_reader_v2(s, permit1));
BOOST_REQUIRE(handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
BOOST_REQUIRE(semaphore.unregister_inactive_read(std::move(handle)));
}
// Up the resource count; we need more permits to check the rest of the scenarios
semaphore.set_resources({4, 4 * 1024});
// There are waiters but they are not blocked on resources
{
auto permit2 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get();
auto permit3 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get();
std::optional<reader_permit::used_guard> ug1{permit1};
std::optional<reader_permit::used_guard> ug2{permit2};
auto permit4_fut = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout);
BOOST_REQUIRE_EQUAL(semaphore.waiters(), 1);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().reads_queued_because_used_permits, 1);
// First check the register path.
auto handle = semaphore.register_inactive_read(make_empty_flat_reader_v2(s, permit3));
BOOST_REQUIRE(handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
BOOST_REQUIRE_EQUAL(permit3.get_state(), reader_permit::state::inactive);
// Now check the callback admission path (admission check on resources being freed).
ug2.reset();
BOOST_REQUIRE(handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
BOOST_REQUIRE_EQUAL(permit3.get_state(), reader_permit::state::inactive);
}
}
// Check that inactive reads are evicted when they are blocking admission
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_necessary_evicting) {
const auto initial_resources = reader_concurrency_semaphore::resources{2, 4 * 1024};
reader_concurrency_semaphore semaphore(initial_resources.count, initial_resources.memory, get_name(), 100);
auto stop_sem = deferred_stop(semaphore);
simple_schema ss;
auto s = ss.schema();
uint64_t evicted_reads = 0;
auto permit1 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get();
// No count resources - obtaining new permit
{
auto permit2 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get();
BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 0);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 2 * 1024);
auto handle = semaphore.register_inactive_read(make_empty_flat_reader_v2(s, permit1));
BOOST_REQUIRE(handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
auto new_permit = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get();
BOOST_REQUIRE(!handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, ++evicted_reads);
}
BOOST_REQUIRE(permit1.needs_readmission());
permit1.wait_readmission().get();
// No count resources - waiter
{
auto permit2 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get();
BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 0);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 2 * 1024);
auto new_permit_fut = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout);
BOOST_REQUIRE_EQUAL(semaphore.waiters(), 1);
auto handle = semaphore.register_inactive_read(make_empty_flat_reader_v2(s, permit1));
BOOST_REQUIRE(!handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, ++evicted_reads);
new_permit_fut.get();
}
BOOST_REQUIRE(permit1.needs_readmission());
permit1.wait_readmission().get();
// No memory resources
{
auto units = permit1.consume_memory(3 * 1024);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 1);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 0);
auto handle = semaphore.register_inactive_read(make_empty_flat_reader_v2(s, permit1));
BOOST_REQUIRE(handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
auto new_permit = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get();
BOOST_REQUIRE(!handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, ++evicted_reads);
}
BOOST_REQUIRE(permit1.needs_readmission());
permit1.wait_readmission().get();
// No memory resources - waiter
{
auto units = permit1.consume_memory(3 * 1024);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 1);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 0);
auto new_permit_fut = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout);
BOOST_REQUIRE_EQUAL(semaphore.waiters(), 1);
auto handle = semaphore.register_inactive_read(make_empty_flat_reader_v2(s, permit1));
BOOST_REQUIRE(!handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, ++evicted_reads);
new_permit_fut.get();
}
BOOST_REQUIRE(permit1.needs_readmission());
permit1.wait_readmission().get();
// No count resources - waiter blocked on something else too
{
auto permit2 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get();
BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 0);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 2 * 1024);
std::optional<reader_permit::used_guard> ug{permit2};
auto new_permit_fut = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout);
BOOST_REQUIRE_EQUAL(semaphore.waiters(), 1);
auto handle = semaphore.register_inactive_read(make_empty_flat_reader_v2(s, permit1));
BOOST_REQUIRE(handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
ug.reset();
BOOST_REQUIRE(!handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, ++evicted_reads);
new_permit_fut.get();
}
BOOST_REQUIRE(permit1.needs_readmission());
permit1.wait_readmission().get();
// No memory resources - waiter blocked on something else too
{
semaphore.set_resources({initial_resources.count + 1, initial_resources.memory});
auto permit2 = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get();
auto units = permit1.consume_memory(2 * 1024);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().count, 1);
BOOST_REQUIRE_EQUAL(semaphore.available_resources().memory, 0);
std::optional<reader_permit::used_guard> ug{permit2};
auto new_permit_fut = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout);
BOOST_REQUIRE_EQUAL(semaphore.waiters(), 1);
auto handle = semaphore.register_inactive_read(make_empty_flat_reader_v2(s, permit1));
BOOST_REQUIRE(handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
ug.reset();
thread::yield(); // allow debug builds to schedule the fiber evicting the reads again
BOOST_REQUIRE(!handle);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().permit_based_evictions, ++evicted_reads);
new_permit_fut.get();
semaphore.set_resources(initial_resources);
}
}


@@ -3944,6 +3944,92 @@ SEASTAR_TEST_CASE(test_scans_erase_dummies) {
});
}
// Tests the following scenario:
//
// Initial state:
//
// v2: ==== <7> [entry2] ==== <9> === <13> ==== <last dummy>
// v1: ======================================== <last dummy> [entry1]
//
// After two eviction events which evict entry1 and entry2, we should end up with:
//
// v2: ---------------------- <9> === <13> ==== <last dummy>
// v1: ---------------------------------------- <last dummy>
//
// last dummy entries are treated in a special way in rows_entry::on_evicted(), and there
// was a bug which didn't clear the continuity on last dummy when it was selected for eviction.
// As a result, the view was this:
//
// v2: ---------------------- <9> === <13> ==== <last dummy>
// v1: ======================================== <last dummy>
//
// This would violate the "older versions are evicted first" rule, which implies
// that when entry2 is evicted in v2, the range into which entry2 falls in all older versions
// must be discontinuous. This won't hold if we don't clear continuity on last dummy in v1.
// As a result, the range into which entry2 falls, from the perspective of the v2 snapshot,
// would appear as continuous and <7> would be missing from the read result, because
// continuity of a snapshot is a union of continuous ranges in all versions.
//
// Reproduces https://github.com/scylladb/scylladb/issues/12451
SEASTAR_TEST_CASE(test_version_merging_with_range_tombstones_over_rowless_version) {
return seastar::async([] {
simple_schema s;
tests::reader_concurrency_semaphore_wrapper semaphore;
auto pkey = s.make_pkey("pk");
auto pr = dht::partition_range::make_singular(pkey);
memtable_snapshot_source underlying(s.schema());
mutation m1(s.schema(), pkey);
m1.partition().apply(s.new_tombstone());
underlying.apply(m1);
cache_tracker tracker;
row_cache cache(s.schema(), snapshot_source([&] { return underlying(); }), tracker);
// Populate cache
assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr))
.produces(m1);
mutation m2(s.schema(), pkey);
s.delete_range(m2, s.make_ckey_range(7, 13));
s.add_row(m2, s.make_ckey(7), "v");
s.delete_range(m2, s.make_ckey_range(9, 17));
s.add_row(m2, s.make_ckey(9), "v");
s.add_row(m2, s.make_ckey(17), "v");
{
auto rd1 = cache.make_reader(s.schema(), semaphore.make_permit(), pr);
auto close_rd1 = deferred_close(rd1);
rd1.set_max_buffer_size(1); // To hold the snapshot
rd1.fill_buffer().get();
apply(cache, underlying, m2);
evict_one_row(tracker); // hits last dummy in oldest version.
assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr))
.produces(m1 + m2);
evict_one_row(tracker); // hits entry in the latest version, row (v1) or rtc (v2)
assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr))
.produces(m1 + m2);
evict_one_row(tracker); // hits entry in the latest version, row (both v1 and v2)
assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr))
.produces(m1 + m2);
}
tracker.cleaner().drain().get();
assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr))
.produces(m1 + m2);
});
}
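The continuity invariant described in the comment above can be illustrated with a toy model (a hedged Python sketch with made-up names; Scylla's actual MVCC structures are more involved): a snapshot sees a position as continuous if any of its versions marks it continuous, so failing to clear continuity on v1's last dummy makes the evicted range around `<7>` look continuous.

```python
# Toy model (not Scylla code): each version is a list of half-open
# (lo, hi) ranges marked continuous; snapshot continuity is their union.
def snapshot_continuous(versions, key):
    return any(lo <= key < hi for version in versions for (lo, hi) in version)

v2 = [(9, 100)]        # after eviction: continuity cleared up to <9>
v1_fixed = []          # last dummy evicted AND continuity cleared
v1_buggy = [(0, 100)]  # the bug: last dummy evicted, continuity kept

# With the bug, key 7 appears continuous to the snapshot, so the evicted
# row <7> is wrongly treated as known-absent and goes missing from reads.
assert snapshot_continuous([v2, v1_buggy], 7)
assert not snapshot_continuous([v2, v1_fixed], 7)
```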
SEASTAR_TEST_CASE(test_eviction_of_upper_bound_of_population_range) {
return seastar::async([] {
simple_schema s;


@@ -18,6 +18,7 @@
#include "compaction/compaction_manager.hh"
#include "sstables/key.hh"
#include "test/lib/sstable_utils.hh"
#include "test/lib/reader_concurrency_semaphore.hh"
#include <seastar/testing/test_case.hh>
#include "schema.hh"
#include "compress.hh"
@@ -752,6 +753,8 @@ SEASTAR_TEST_CASE(sub_partitions_read) {
SEASTAR_TEST_CASE(test_skipping_in_compressed_stream) {
return seastar::async([] {
tests::reader_concurrency_semaphore_wrapper semaphore;
tmpdir tmp;
auto file_path = (tmp.path() / "test").string();
file f = open_file_dma(file_path, open_flags::create | open_flags::wo).get0();
@@ -787,7 +790,7 @@ SEASTAR_TEST_CASE(test_skipping_in_compressed_stream) {
auto make_is = [&] {
f = open_file_dma(file_path, open_flags::ro).get0();
return make_compressed_file_m_format_input_stream(f, &c, 0, uncompressed_size, opts);
return make_compressed_file_m_format_input_stream(f, &c, 0, uncompressed_size, opts, semaphore.make_permit());
};
auto expect = [] (input_stream<char>& in, const temporary_buffer<char>& buf) {


@@ -89,6 +89,24 @@ SEASTAR_THREAD_TEST_CASE(test_clear_gently_non_trivial_unique_ptr) {
utils::clear_gently(p).get();
BOOST_CHECK(p);
BOOST_REQUIRE_EQUAL(cleared_gently, 1);
cleared_gently = 0;
p.reset();
utils::clear_gently(p).get();
BOOST_CHECK(!p);
BOOST_REQUIRE_EQUAL(cleared_gently, 0);
}
SEASTAR_THREAD_TEST_CASE(test_clear_gently_vector_of_unique_ptrs) {
int cleared_gently = 0;
std::vector<std::unique_ptr<clear_gently_tracker<int>>> v;
v.emplace_back(std::make_unique<clear_gently_tracker<int>>(0, [&cleared_gently] (int) {
cleared_gently++;
}));
v.emplace_back(nullptr);
utils::clear_gently(v).get();
BOOST_REQUIRE_EQUAL(cleared_gently, 1);
}
SEASTAR_THREAD_TEST_CASE(test_clear_gently_foreign_ptr) {


@@ -683,6 +683,13 @@ BOOST_AUTO_TEST_CASE(test_parse_valid_frozen_set) {
BOOST_REQUIRE(type->as_cql3_type().to_string() == "frozen<set<int>>");
}
BOOST_AUTO_TEST_CASE(test_empty_frozen_set_compare) {
auto parser = db::marshal::type_parser("org.apache.cassandra.db.marshal.FrozenType(org.apache.cassandra.db.marshal.SetType(org.apache.cassandra.db.marshal.Int32Type))");
auto type = parser.parse();
std::unordered_set<int> set = {1, 2, 3};
type->compare(bytes_view(), bytes_view(*data_value(set).serialize()));
}
BOOST_AUTO_TEST_CASE(test_parse_valid_set_frozen_set) {
sstring frozen = "org.apache.cassandra.db.marshal.FrozenType(org.apache.cassandra.db.marshal.SetType(org.apache.cassandra.db.marshal.Int32Type))";
auto parser = db::marshal::type_parser("org.apache.cassandra.db.marshal.SetType(" + frozen + ")");


@@ -244,7 +244,14 @@ def get_cql_cluster(ip, ssl_context=None):
auth_provider = cassandra.auth.PlainTextAuthProvider(username='cassandra', password='cassandra')
return cassandra.cluster.Cluster([ip],
auth_provider=auth_provider,
ssl_context=ssl_context)
ssl_context=ssl_context,
# The default timeout for new connections is 5 seconds, and for
# requests made by the control connection is 2 seconds. These should
# have been more than enough, but in some extreme cases with a very
# slow debug build running on a very busy machine, they may not be,
# so let's increase them to 60 seconds. See issue #13239.
connect_timeout = 60,
control_connection_timeout = 60)
## Test that CQL is serving, for wait_for_services() below.
def check_cql(ip, ssl_context=None):


@@ -233,3 +233,106 @@ def test_view_update_and_alter_base(cql, test_keyspace, scylla_only):
# Try to modify an item. This failed in #11542.
cql.execute(f'UPDATE {table} SET v=-1 WHERE p=1')
assert len(list(cql.execute(f"SELECT v from {mv}"))) == 0
# Reproducer for issue #12297, reproducing a specific way in which a view
# table could be made inconsistent with the base table:
# The test writes 500 rows to one partition in a base table, and then uses
# USING TIMESTAMP with the right value to cause a base partition deletion
# which deletes not the entire partition but just its last 50 rows. As the
# 50 rows of the base partition get deleted, we expect 50 rows from the
# view table to also get deleted - but bug #12297 was that this wasn't
# happening - rather, all rows remained in the view.
# The bug cannot be reproduced with 100 rows (and deleting the last 10)
# but 113 rows (and 101 rows after deleting the last 12) does reproduce
# it. Reproducing the bug also required a setup where USING TIMESTAMP
# deleted the *last* rows - using it to delete the *first* rows did not
# have a bug (the view rows were deleted fine).
@pytest.mark.parametrize("size", [100, 113, 500])
def test_long_skipped_view_update_delete_with_timestamp(cql, test_keyspace, size):
with new_test_table(cql, test_keyspace, 'p int, c int, x int, y int, primary key (p,c)') as table:
with new_materialized_view(cql, table, '*', 'p, x, c', 'p is not null and x is not null and c is not null') as mv:
# Write size rows with c=0..(size-1). Because the iteration is in
# reverse order, the first row in clustering order (c=0) will
# have the latest write timestamp.
for i in reversed(range(size)):
cql.execute(f'INSERT INTO {table} (p,c,x,y) VALUES (1,{i},{i},{i})')
assert list(cql.execute(f"SELECT c FROM {table} WHERE p = 1")) == list(cql.execute(f"SELECT c FROM {mv} WHERE p = 1"))
# Get the timestamp of the size*0.9th item. Because we wrote items
# in reverse, items 0.9-1.0*size all have earlier timestamp than
# that.
t = list(cql.execute(f"SELECT writetime(y) FROM {table} WHERE p = 1 and c = {int(size*0.9)}"))[0].writetime_y
cql.execute(f'DELETE FROM {table} USING TIMESTAMP {t} WHERE p=1')
# After the deletion we expect to see size*0.9 rows remaining
# (timestamp ties cannot happen for separate writes, if they
# did we could have a bit less), but most importantly, the view
# should have exactly the same rows.
assert list(cql.execute(f"SELECT c FROM {table} WHERE p = 1")) == list(cql.execute(f"SELECT c FROM {mv} WHERE p = 1"))
# Same test as above, just that in this version the view partition key is
# different from the base's, so we can be sure that Scylla needs to go
# through the loop of deleting many view rows and cannot delete an entire
# view partition in one fell swoop. In the above test, Scylla *may* contain
# such an optimization (currently it doesn't), so it may reach a different
# code path.
def test_long_skipped_view_update_delete_with_timestamp2(cql, test_keyspace):
size = 200
with new_test_table(cql, test_keyspace, 'p int, c int, x int, y int, primary key (p,c)') as table:
with new_materialized_view(cql, table, '*', 'x, p, c', 'p is not null and x is not null and c is not null') as mv:
for i in reversed(range(size)):
cql.execute(f'INSERT INTO {table} (p,c,x,y) VALUES (1,{i},{i},{i})')
assert list(cql.execute(f"SELECT c FROM {table}")) == sorted(list(cql.execute(f"SELECT c FROM {mv}")))
t = list(cql.execute(f"SELECT writetime(y) FROM {table} WHERE p = 1 and c = {int(size*0.9)}"))[0].writetime_y
cql.execute(f'DELETE FROM {table} USING TIMESTAMP {t} WHERE p=1')
assert list(cql.execute(f"SELECT c FROM {table}")) == sorted(list(cql.execute(f"SELECT c FROM {mv}")))
# Another, more fundamental, reproducer for issue #12297, where a certain
# modification to a base partition modifying more than 100 rows was not
# applied to the view beyond the 100th row.
# The test above, test_long_skipped_view_update_delete_with_timestamp was one
# such specific case, which involved a partition tombstone and a specific
# choice of timestamp which causes the first 100 rows to NOT be changed.
# In this test we show that the bug is not just about do-nothing tombstones:
# In any base modification which involves more than 100 rows, if the first
# 100 rows don't change the view (as decided by the can_skip_view_updates()
# function), the other rows are wrongly skipped at well and not applied to
# the view!
# The specific case we use here is an update that sets some irrelevant
# (not-selected-by-the-view) column y on 200 rows, and additionally writes
# a new row as the 201st row. With bug #12297, that 201st row will be
# missing in the view.
def test_long_skipped_view_update_irrelevant_column(cql, test_keyspace):
size = 200
with new_test_table(cql, test_keyspace, 'p int, c int, x int, y int, primary key (p,c)') as table:
# Note that column "y" is not selected by the materialized view
with new_materialized_view(cql, table, 'p, x, c', 'p, x, c', 'p is not null and x is not null and c is not null') as mv:
for i in range(size):
cql.execute(f'INSERT INTO {table} (p,c,x,y) VALUES (1,{i},{i},{i})')
# In a single batch (a single mutation), update "y" column in all
# 'size' existing rows, plus add one new row in the last position
# (the partition is sorted by the "c" column). The first 'size'
# UPDATEs can be skipped in the view (because y isn't selected),
# but the last INSERT can't be skipped - it really adds a new row.
cmd = 'BEGIN BATCH '
for i in range(size):
cmd += f'UPDATE {table} SET y=7 where p=1 and c={i}; '
cmd += f'INSERT INTO {table} (p,c,x,y) VALUES (1,{size+1},{size+1},{size+1}); '
cmd += 'APPLY BATCH;'
cql.execute(cmd)
# We should now have the same size+1 rows in both base and view
assert list(cql.execute(f"SELECT c FROM {table} WHERE p = 1")) == list(cql.execute(f"SELECT c FROM {mv} WHERE p = 1"))
# After the previous tests checked elaborate conditions where modifying a
# base-table partition resulted in many skipped view updates, let's also
# check the more basic situation where the base-table partition modification
# (in this case, a deletion) results in many view-table updates, and all
# of them should happen even if the code needs to do it internally in
# several batches of 100 (for example).
def test_mv_long_delete(cql, test_keyspace):
size = 300
with new_test_table(cql, test_keyspace, 'p int, c int, x int, y int, primary key (p,c)') as table:
with new_materialized_view(cql, table, '*', 'p, x, c', 'p is not null and x is not null and c is not null') as mv:
for i in range(size):
cql.execute(f'INSERT INTO {table} (p,c,x,y) VALUES (1,{i},{i},{i})')
cql.execute(f'DELETE FROM {table} WHERE p=1')
assert list(cql.execute(f"SELECT c FROM {table} WHERE p = 1")) == []
assert list(cql.execute(f"SELECT c FROM {mv} WHERE p = 1")) == []


@@ -65,6 +65,31 @@ def test_insert_null_key_lwt(cql, table1):
with pytest.raises(InvalidRequest, match='null value'):
cql.execute(stmt, [None, s])
# Contains the same checks as test_insert_null_key() and test_insert_null_key_lwt() above, just inside a batch
def test_insert_null_key_in_batch(cql, table1):
s = unique_key_string()
with pytest.raises(InvalidRequest, match='null value'):
cql.execute(f"BEGIN BATCH INSERT INTO {table1} (p,c) VALUES ('{s}', null);APPLY BATCH;")
with pytest.raises(InvalidRequest, match='null value'):
cql.execute(f"BEGIN BATCH INSERT INTO {table1} (p,c) VALUES ('{s}', null) IF NOT EXISTS;APPLY BATCH;")
with pytest.raises(InvalidRequest, match='null value'):
cql.execute(f"BEGIN BATCH INSERT INTO {table1} (p,c) VALUES (null, '{s}');APPLY BATCH;")
with pytest.raises(InvalidRequest, match='null value'):
cql.execute(f"BEGIN BATCH INSERT INTO {table1} (p,c) VALUES (null, '{s}') IF NOT EXISTS;APPLY BATCH;")
# Try the same thing with prepared statement.
stmt = cql.prepare(f"BEGIN BATCH INSERT INTO {table1} (p,c) VALUES (?, ?);APPLY BATCH;")
with pytest.raises(InvalidRequest, match='null value'):
cql.execute(stmt, [s, None])
with pytest.raises(InvalidRequest, match='null value'):
cql.execute(stmt, [None, s])
stmt = cql.prepare(f"BEGIN BATCH INSERT INTO {table1} (p,c) VALUES (?, ?) IF NOT EXISTS;APPLY BATCH;")
with pytest.raises(InvalidRequest, match='null value'):
cql.execute(stmt, [s, None])
with pytest.raises(InvalidRequest, match='null value'):
cql.execute(stmt, [None, s])
# Tests handling of "key_column in ?" where ? is bound to null.
# Reproduces issue #8265.
def test_primary_key_in_null(cql, table1):


@@ -0,0 +1,37 @@
# Copyright 2020-present ScyllaDB
#
# SPDX-License-Identifier: AGPL-3.0-or-later
from util import new_test_table
from cassandra.query import SimpleStatement
import pytest
import nodetool
# Test that the _stop flag set in the compactor at the end of a page is not
# sticky and doesn't remain set on the following page. If it does, it can cause
# the next page (and consequently the entire query) to be terminated prematurely.
# This can happen if the code path on the very first consumed fragment doesn't
# reset this flag. Currently this is the case for rows completely covered by a
# higher level tombstone.
def test_sticky_stop_flag(cql, test_keyspace):
with new_test_table(cql, test_keyspace, 'pk int, ck int, v int, PRIMARY KEY (pk, ck)') as table:
insert_row_id = cql.prepare(f"INSERT INTO {table} (pk, ck, v) VALUES (?, ?, ?)")
pk = 0
# Flush the row to disk, to prevent it being compacted away in the
# memtable upon writing the partition tombstone.
cql.execute(insert_row_id, (pk, 100, 0))
nodetool.flush(cql, table)
cql.execute(f"DELETE FROM {table} WHERE pk = {pk}")
for ck in range(0, 200):
if ck == 100:
continue
cql.execute(insert_row_id, (pk, ck, 0))
statement = SimpleStatement(f"SELECT * FROM {table} WHERE pk = {pk}", fetch_size=100)
res = list(cql.execute(statement))
assert len(res) == 199


@@ -0,0 +1,598 @@
# Copyright 2023-present ScyllaDB
#
# SPDX-License-Identifier: AGPL-3.0-or-later
import pytest
from util import new_test_table, unique_key_int
from cassandra.query import UNSET_VALUE
from cassandra.protocol import InvalidRequest
@pytest.fixture(scope="module")
def table1(cql, test_keyspace):
with new_test_table(cql, test_keyspace, "p int PRIMARY KEY, a int, b int, c int, li list<int>") as table:
yield table
@pytest.fixture(scope="module")
def table2(cql, test_keyspace):
with new_test_table(cql, test_keyspace, "p int, c int, PRIMARY KEY (p, c)") as table:
yield table
@pytest.fixture(scope="module")
def table3(cql, test_keyspace):
with new_test_table(cql, test_keyspace, "p int, c int, r int, PRIMARY KEY (p, c)") as table:
yield table
@pytest.fixture(scope="module")
def table4(cql, test_keyspace):
with new_test_table(cql, test_keyspace, "p int, c int, s int, r int, PRIMARY KEY (p, c)") as table:
yield table
# Test INSERT with UNSET_VALUE for the clustering column value
def test_insert_unset_clustering_col(cql, table4, scylla_only):
p = unique_key_int()
def insert(c, s, r):
cql.execute(cql.prepare(f"INSERT INTO {table4} (p, c, s, r) VALUES ({p}, ?, ?, ?)"), [c, s, r])
def select_rows():
return sorted(list(cql.execute(f"SELECT p, c, s, r FROM {table4} WHERE p = {p}")))
# INSERT (p, UNSET, 2, 3)
insert(c=UNSET_VALUE, s=2, r=3)
assert select_rows() == []
# INSERT (p, 1, 2, 3)
insert(c=1, s=2, r=3)
assert select_rows() == [(p, 1, 2, 3)]
# INSERT (p, UNSET, 2, 3)
insert(c=UNSET_VALUE, s=2, r=3)
assert select_rows() == [(p, 1, 2, 3)]
# INSERT (p, UNSET, 5, 6)
insert(c=UNSET_VALUE, s=5, r=6)
assert select_rows() == [(p, 1, 2, 3)]
# Test INSERT with UNSET_VALUE for the static column value
def test_insert_unset_static_col(cql, table4):
p = unique_key_int()
def insert(c, s, r):
cql.execute(cql.prepare(f"INSERT INTO {table4} (p, c, s, r) VALUES ({p}, ?, ?, ?)"), [c, s, r])
def select_rows():
return sorted(list(cql.execute(f"SELECT p, c, s, r FROM {table4} WHERE p = {p}")))
# INSERT (p, 1, UNSET, 3)
insert(c=1, s=UNSET_VALUE, r=3)
assert select_rows() == [(p, 1, None, 3)]
# INSERT (p, 1, 2, 3)
insert(c=1, s=2, r=3)
assert select_rows() == [(p, 1, 2, 3)]
# INSERT (p, 1, UNSET, 3)
insert(c=1, s=UNSET_VALUE, r=3)
assert select_rows() == [(p, 1, 2, 3)]
# INSERT (p, 1, UNSET, 4)
insert(c=1, s=UNSET_VALUE, r=4)
assert select_rows() == [(p, 1, 2, 4)]
# Test INSERT with UNSET_VALUE for the regular column value
def test_insert_unset_regular_col(cql, table4):
p = unique_key_int()
def insert(c, s, r):
cql.execute(cql.prepare(f"INSERT INTO {table4} (p, c, s, r) VALUES ({p}, ?, ?, ?)"), [c, s, r])
def select_rows():
return list(cql.execute(f"SELECT p, c, s, r FROM {table4} WHERE p = {p}"))
# INSERT (p, 1, 2, UNSET)
insert(c=1, s=2, r=UNSET_VALUE)
assert select_rows() == [(p, 1, 2, None)]
# INSERT (p, 1, 2, 3)
insert(c=1, s=2, r=3)
assert select_rows() == [(p, 1, 2, 3)]
# INSERT (p, 1, 2, UNSET)
insert(c=1, s=2, r=UNSET_VALUE)
assert select_rows() == [(p, 1, 2, 3)]
# INSERT (p, 1, 5, UNSET)
insert(c=1, s=5, r=UNSET_VALUE)
assert select_rows() == [(p, 1, 5, 3)]
# Test INSERT with UNSET_VALUE for the clustering column value, using IF NOT EXISTS
def test_insert_unset_clustering_col_if_not_exists(cql, table4):
p = unique_key_int()
def insert(c, s, r):
cql.execute(cql.prepare(f"INSERT INTO {table4} (p, c, s, r) VALUES ({p}, ?, ?, ?) IF NOT EXISTS"), [c, s, r])
def select_rows():
return sorted(list(cql.execute(f"SELECT p, c, s, r FROM {table4} WHERE p = {p}")))
# INSERT (p, UNSET, 2, 3)
with pytest.raises(InvalidRequest, match='unset'):
insert(c=UNSET_VALUE, s=2, r=3)
# INSERT (p, 1, 2, 3)
insert(c=1, s=2, r=3)
assert select_rows() == [(p, 1, 2, 3)]
# INSERT (p, UNSET, 2, 3)
with pytest.raises(InvalidRequest, match='unset'):
insert(c=UNSET_VALUE, s=2, r=3)
# INSERT (p, UNSET, 5, 6)
with pytest.raises(InvalidRequest, match='unset'):
insert(c=UNSET_VALUE, s=5, r=6)
assert select_rows() == [(p, 1, 2, 3)]
# Test INSERT with UNSET_VALUE for the static column value, using IF NOT EXISTS
def test_insert_unset_static_col_if_not_exists(cql, table4):
p = unique_key_int()
def insert(c, s, r):
cql.execute(cql.prepare(f"INSERT INTO {table4} (p, c, s, r) VALUES ({p}, ?, ?, ?) IF NOT EXISTS"), [c, s, r])
def select_rows():
return sorted(list(cql.execute(f"SELECT p, c, s, r FROM {table4} WHERE p = {p}")))
# INSERT (p, 1, UNSET, 3)
insert(c=1, s=UNSET_VALUE, r=3)
assert select_rows() == [(p, 1, None, 3)]
# INSERT (p, 1, 2, 3)
insert(c=1, s=2, r=3)
assert select_rows() == [(p, 1, None, 3)]
# INSERT (p, 1, UNSET, 3)
insert(c=1, s=UNSET_VALUE, r=3)
assert select_rows() == [(p, 1, None, 3)]
# INSERT (p, 1, UNSET, 4)
insert(c=1, s=UNSET_VALUE, r=4)
assert select_rows() == [(p, 1, None, 3)]
# Test INSERT with UNSET_VALUE for the regular column value, using IF NOT EXISTS
def test_insert_unset_regular_col_if_not_exists(cql, table4):
p = unique_key_int()
def insert(c, s, r):
cql.execute(cql.prepare(f"INSERT INTO {table4} (p, c, s, r) VALUES ({p}, ?, ?, ?) IF NOT EXISTS"), [c, s, r])
def select_rows():
return list(cql.execute(f"SELECT p, c, s, r FROM {table4} WHERE p = {p}"))
# INSERT (p, 1, 2, UNSET)
insert(c=1, s=2, r=UNSET_VALUE)
assert select_rows() == [(p, 1, 2, None)]
# INSERT (p, 1, 2, 3)
insert(c=1, s=2, r=3)
assert select_rows() == [(p, 1, 2, None)]
# INSERT (p, 1, 2, UNSET)
insert(c=1, s=2, r=UNSET_VALUE)
assert select_rows() == [(p, 1, 2, None)]
# INSERT (p, 1, 5, UNSET)
insert(c=1, s=5, r=UNSET_VALUE)
assert select_rows() == [(p, 1, 2, None)]
# Try doing UPDATE table4 SET c=UNSET_VALUE
# Should fail, clustering columns can't be updated
def test_update_unset_clustering_column(cql, table4):
with pytest.raises(InvalidRequest):
cql.prepare(f"UPDATE {table4} SET r=123, c = ? WHERE p = 0 AND c = ?")
# Prepare fails, no point in executing it.
# Test doing UPDATE table4 SET s=UNSET_VALUE
def test_update_unset_static_column(cql, table4, scylla_only):
p = unique_key_int()
def select_rows():
return list(cql.execute(f"SELECT p, c, s, r FROM {table4} WHERE p = {p}"))
# UPDATE SET s=UNSET_VALUE
update1 = cql.prepare(f"UPDATE {table4} SET s=? WHERE p = {p} AND c = ?")
cql.execute(update1, [UNSET_VALUE, 1])
assert select_rows() == []
# Try the same with c = UNSET_VALUE
cql.execute(update1, [UNSET_VALUE, UNSET_VALUE])
assert select_rows() == []
# UPDATE SET s=UNSET_VALUE, r=123
update2 = cql.prepare(f"UPDATE {table4} SET r=123, s=? WHERE p = {p} AND c = ?")
cql.execute(update2, [UNSET_VALUE, 1])
assert select_rows() == [(p, 1, None, 123)]
# Try the same with c = UNSET_VALUE
cql.execute(update2, [UNSET_VALUE, UNSET_VALUE])
assert select_rows() == [(p, 1, None, 123)]
# UPDATE SET s=4321
update3 = cql.prepare(f"UPDATE {table4} SET s=4321 WHERE p = {p} AND c = ?")
cql.execute(update3, [1])
assert select_rows() == [(p, 1, 4321, 123)]
# Try the same with c = UNSET_VALUE
cql.execute(update3, [UNSET_VALUE])
assert select_rows() == [(p, 1, 4321, 123)]
# UPDATE SET r=567, s=UNSET_VALUE
update4 = cql.prepare(f"UPDATE {table4} SET r=567, s=? WHERE p = {p} AND c = ?")
cql.execute(update4, [UNSET_VALUE, 1])
assert select_rows() == [(p, 1, 4321, 567)]
# Try the same with c = UNSET_VALUE
cql.execute(update4, [UNSET_VALUE, UNSET_VALUE])
assert select_rows() == [(p, 1, 4321, 567)]
# Test doing UPDATE table4 SET r=UNSET_VALUE
def test_update_unset_regular_column(cql, table4, scylla_only):
p = unique_key_int()
def select_rows():
return list(cql.execute(f"SELECT p, c, s, r FROM {table4} WHERE p = {p}"))
# UPDATE SET r = UNSET_VALUE
update1 = cql.prepare(f"UPDATE {table4} SET r = ? WHERE p = {p} AND c = ?")
cql.execute(update1, [UNSET_VALUE, 1])
assert select_rows() == []
# Try the same with c = UNSET_VALUE
cql.execute(update1, [UNSET_VALUE, UNSET_VALUE])
assert select_rows() == []
# UPDATE SET r = UNSET_VALUE, s = 100
update2 = cql.prepare(f"UPDATE {table4} SET r = ?, s = 100 WHERE p = {p} AND c = ?")
cql.execute(update2, [UNSET_VALUE, 1])
assert select_rows() == [(p, 1, 100, None)]
# Try the same with c = UNSET_VALUE
cql.execute(update2, [UNSET_VALUE, UNSET_VALUE])
assert select_rows() == [(p, 1, 100, None)]
# UPDATE SET r = 200
update3 = cql.prepare(f"UPDATE {table4} SET r = 200 WHERE p = {p} AND c = ?")
cql.execute(update3, [1])
assert select_rows() == [(p, 1, 100, 200)]
# Try the same with c = UNSET_VALUE
cql.execute(update3, [UNSET_VALUE])
assert select_rows() == [(p, 1, 100, 200)]
# UPDATE SET r = UNSET_VALUE, s = 300
update4 = cql.prepare(f"UPDATE {table4} SET r = ?, s = 300 WHERE p = {p} AND c = ?")
cql.execute(update4, [UNSET_VALUE, 1])
assert select_rows() == [(p, 1, 300, 200)]
# Try the same with c = UNSET_VALUE
cql.execute(update4, [UNSET_VALUE, UNSET_VALUE])
assert select_rows() == [(p, 1, 300, 200)]
# Test doing UPDATE table4 SET s=UNSET_VALUE IF EXISTS
def test_update_unset_static_column_if_exists(cql, table4):
p = unique_key_int()
def select_rows():
return list(cql.execute(f"SELECT p, c, s, r FROM {table4} WHERE p = {p}"))
# UPDATE SET s=UNSET_VALUE
update1 = cql.prepare(f"UPDATE {table4} SET s=? WHERE p = {p} AND c = ? IF EXISTS")
cql.execute(update1, [UNSET_VALUE, 1])
assert select_rows() == []
# Try the same with c = UNSET_VALUE
with pytest.raises(InvalidRequest):
cql.execute(update1, [UNSET_VALUE, UNSET_VALUE])
# Insert something into the table so that the updates actually change something
cql.execute(f"INSERT INTO {table4} (p, c, s, r) VALUES ({p}, 1, 2, 3)")
# UPDATE SET s=UNSET_VALUE, r=123
update2 = cql.prepare(f"UPDATE {table4} SET r=123, s=? WHERE p = {p} AND c = ? IF EXISTS")
cql.execute(update2, [UNSET_VALUE, 1])
assert select_rows() == [(p, 1, 2, 123)]
# Try the same with c = UNSET_VALUE
with pytest.raises(InvalidRequest):
cql.execute(update2, [UNSET_VALUE, UNSET_VALUE])
# UPDATE SET s=4321
update3 = cql.prepare(f"UPDATE {table4} SET s=4321 WHERE p = {p} AND c = ? IF EXISTS")
cql.execute(update3, [1])
assert select_rows() == [(p, 1, 4321, 123)]
# Try the same with c = UNSET_VALUE
with pytest.raises(InvalidRequest):
cql.execute(update3, [UNSET_VALUE])
# UPDATE SET r=567, s=UNSET_VALUE
update4 = cql.prepare(f"UPDATE {table4} SET r=567, s=? WHERE p = {p} AND c = ? IF EXISTS")
cql.execute(update4, [UNSET_VALUE, 1])
assert select_rows() == [(p, 1, 4321, 567)]
# Try the same with c = UNSET_VALUE
with pytest.raises(InvalidRequest):
cql.execute(update4, [UNSET_VALUE, UNSET_VALUE])
# Test doing UPDATE table4 SET r=UNSET_VALUE
def test_update_unset_regular_column_if_exists(cql, table4):
p = unique_key_int()
def select_rows():
return list(cql.execute(f"SELECT p, c, s, r FROM {table4} WHERE p = {p}"))
# UPDATE SET r = UNSET_VALUE
update1 = cql.prepare(f"UPDATE {table4} SET r = ? WHERE p = {p} AND c = ? IF EXISTS")
cql.execute(update1, [UNSET_VALUE, 1])
assert select_rows() == []
# Try the same with c = UNSET_VALUE
with pytest.raises(InvalidRequest):
cql.execute(update1, [UNSET_VALUE, UNSET_VALUE])
# Insert something into the table so that the updates actually change something
cql.execute(f"INSERT INTO {table4} (p, c, s, r) VALUES ({p}, 1, 2, 3)")
# UPDATE SET r = UNSET_VALUE, s = 100
update2 = cql.prepare(f"UPDATE {table4} SET r = ?, s = 100 WHERE p = {p} AND c = ? IF EXISTS")
cql.execute(update2, [UNSET_VALUE, 1])
assert select_rows() == [(p, 1, 100, 3)]
# Try the same with c = UNSET_VALUE
with pytest.raises(InvalidRequest):
cql.execute(update2, [UNSET_VALUE, UNSET_VALUE])
# UPDATE SET r = 200
update3 = cql.prepare(f"UPDATE {table4} SET r = 200 WHERE p = {p} AND c = ? IF EXISTS")
cql.execute(update3, [1])
assert select_rows() == [(p, 1, 100, 200)]
# Try the same with c = UNSET_VALUE
with pytest.raises(InvalidRequest):
cql.execute(update3, [UNSET_VALUE])
# UPDATE SET r = UNSET_VALUE, s = 300
update4 = cql.prepare(f"UPDATE {table4} SET r = ?, s = 300 WHERE p = {p} AND c = ? IF EXISTS")
cql.execute(update4, [UNSET_VALUE, 1])
assert select_rows() == [(p, 1, 300, 200)]
# Try the same with c = UNSET_VALUE
with pytest.raises(InvalidRequest):
cql.execute(update4, [UNSET_VALUE, UNSET_VALUE])
# Test doing UPDATE table4 SET s=UNSET_VALUE IF <lwt condition>, unset values in lwt condition are also tested
def test_update_unset_static_column_with_lwt(cql, table4):
p = unique_key_int()
def select_rows():
return list(cql.execute(f"SELECT p, c, s, r FROM {table4} WHERE p = {p}"))
update1 = cql.prepare(f"UPDATE {table4} SET s = ? WHERE p = {p} AND c = ? IF s = ? AND r = ?")
# UPDATE SET s = 123 WHERE c = 123 AND s = 123 AND r = 123
cql.execute(update1, [123, 123, 123, 123])
assert select_rows() == []
# UPDATE SET s = UNSET_VALUE WHERE c = 123 AND s = 123 AND r = 123
cql.execute(update1, [UNSET_VALUE, 123, 123, 123])
assert select_rows() == []
# Insert something into the table so that the updates actually change something
cql.execute(f"INSERT INTO {table4} (p, c, s, r) VALUES ({p}, 123, 123, 123)")
# UPDATE SET s = UNSET_VALUE WHERE c = 123 AND s = 123 AND r = 123
cql.execute(update1, [UNSET_VALUE, 123, 123, 123])
assert select_rows() == [(p, 123, 123, 123)]
# UPDATE table4 SET s = 321 WHERE c = 123 AND s = 123 AND r = 123
cql.execute(update1, [321, 123, 123, 123])
assert select_rows() == [(p, 123, 321, 123)]
# Setting c (clustering column) to UNSET_VALUE should generate an error
with pytest.raises(InvalidRequest):
cql.execute(update1, [123, UNSET_VALUE, 123, 123])
# Doing IF s = UNSET generates an error
with pytest.raises(InvalidRequest):
cql.execute(update1, [9000000, 123, UNSET_VALUE, 123])
# Doing IF r = UNSET silently skips the update
cql.execute(update1, [9000000, 123, 123, UNSET_VALUE])
assert select_rows() == [(p, 123, 321, 123)]
# Doing IF s = UNSET AND r = UNSET generates an error
with pytest.raises(InvalidRequest):
cql.execute(update1, [9000000, 123, UNSET_VALUE, UNSET_VALUE])
# Setting everything to UNSET generates an error
with pytest.raises(InvalidRequest):
cql.execute(update1, [UNSET_VALUE, UNSET_VALUE, UNSET_VALUE, UNSET_VALUE])
# Test doing UPDATE table4 SET r=UNSET_VALUE IF <lwt condition>, unset values in lwt condition are also tested
def test_update_unset_regular_column_with_lwt(cql, table4):
p = unique_key_int()
def select_rows():
return list(cql.execute(f"SELECT p, c, s, r FROM {table4} WHERE p = {p}"))
update1 = cql.prepare(f"UPDATE {table4} SET r = ? WHERE p = {p} AND c = ? IF s = ? AND r = ?")
# UPDATE SET r = 123 WHERE c = 123 AND s = 123 AND r = 123
cql.execute(update1, [123, 123, 123, 123])
assert select_rows() == []
# UPDATE SET r = UNSET_VALUE WHERE c = 123 AND s = 123 AND r = 123
cql.execute(update1, [UNSET_VALUE, 123, 123, 123])
assert select_rows() == []
# Insert something into the table so that the updates actually change something
cql.execute(f"INSERT INTO {table4} (p, c, s, r) VALUES ({p}, 123, 123, 123)")
# UPDATE SET r = UNSET_VALUE WHERE c = 123 AND s = 123 AND r = 123
cql.execute(update1, [UNSET_VALUE, 123, 123, 123])
assert select_rows() == [(p, 123, 123, 123)]
# UPDATE table4 SET r = 321 WHERE c = 123 AND s = 123 AND r = 123
cql.execute(update1, [321, 123, 123, 123])
assert select_rows() == [(p, 123, 123, 321)]
# Setting c (clustering column) to UNSET_VALUE should generate an error
with pytest.raises(InvalidRequest):
cql.execute(update1, [123, UNSET_VALUE, 123, 123])
# Doing IF s = UNSET generates an error
with pytest.raises(InvalidRequest):
cql.execute(update1, [9000000, 123, UNSET_VALUE, 123])
# Doing IF r = UNSET generates an error
# This didn't cause an error when updating s instead of r
with pytest.raises(InvalidRequest):
cql.execute(update1, [9000000, 123, 123, UNSET_VALUE])
# Doing IF s = UNSET AND r = UNSET generates an error
with pytest.raises(InvalidRequest):
cql.execute(update1, [9000000, 123, UNSET_VALUE, UNSET_VALUE])
# Setting everything to UNSET generates an error
with pytest.raises(InvalidRequest):
cql.execute(update1, [UNSET_VALUE, UNSET_VALUE, UNSET_VALUE, UNSET_VALUE])
# A basic test that in a prepared statement with three assignments, one
# bound by an UNSET_VALUE is simply not done, but the other ones are.
# Try all 2^3 combinations of a 3 column updates with each one set to either
# a real value or an UNSET_VALUE.
def test_update_unset_value_basic(cql, table1):
p = unique_key_int()
stmt = cql.prepare(f'UPDATE {table1} SET a=?, b=?, c=? WHERE p={p}')
a = 1
b = 2
c = 3
cql.execute(stmt, [a, b, c])
assert [(a, b, c)] == list(cql.execute(f'SELECT a,b,c FROM {table1} WHERE p = {p}'))
i = 4
for unset_a in [False, True]:
for unset_b in [False, True]:
for unset_c in [False, True]:
if unset_a:
newa = UNSET_VALUE
else:
newa = i
a = i
i += 1
if unset_b:
newb = UNSET_VALUE
else:
newb = i
b = i
i += 1
if unset_c:
newc = UNSET_VALUE
else:
newc = i
c = i
i += 1
cql.execute(stmt, [newa, newb, newc])
assert [(a, b, c)] == list(cql.execute(f'SELECT a,b,c FROM {table1} WHERE p = {p}'))
# The expression "SET a=?" is skipped if the bound value is UNSET_VALUE.
# But what if it is part of a more complex expression like "SET a=(int)?+1"
# (arithmetic expression on the bind variable)? Does the SET also get
# skipped? Cassandra, and Scylla, decided that the answer will be no:
# We refuse to evaluate expressions involving an UNSET_VALUE, and in
# such case the whole write request will fail instead of parts of it being
# skipped. See discussion in pull request #12517.
@pytest.mark.xfail(reason="issue #2693 - Scylla doesn't yet support arithmetic expressions")
def test_update_unset_value_expr_arithmetic(cql, table1):
p = unique_key_int()
stmt = cql.prepare(f'UPDATE {table1} SET a=(int)?+1 WHERE p={p}')
cql.execute(stmt, [7])
assert [(8,)] == list(cql.execute(f'SELECT a FROM {table1} WHERE p = {p}'))
with pytest.raises(InvalidRequest):
cql.execute(stmt, [UNSET_VALUE])
# Despite the decision that expressions will not allow UNSET_VALUE, Cassandra
# decided that (quoting its NEWS.txt) "an unset bind counter operation does
# not change the counter value.". So "c = c + ?" for a counter, when given
# an UNSET_VALUE, will cause the write to be skipped, without error.
# The rationale is that "c = c + ?" is not an expression - it doesn't actually
# calculate c + ?, but rather it is a primitive increment operation, and
# passing ?=UNSET_VALUE should be able to skip this primitive operation.
def test_unset_counter_increment(cql, test_keyspace):
with new_test_table(cql, test_keyspace, "p int PRIMARY KEY, c counter") as table:
p = unique_key_int()
stmt = cql.prepare(f'UPDATE {table} SET c=c+? WHERE p={p}')
cql.execute(stmt, [3])
assert [(3,)] == list(cql.execute(f'SELECT c FROM {table} WHERE p = {p}'))
cql.execute(stmt, [UNSET_VALUE])
assert [(3,)] == list(cql.execute(f'SELECT c FROM {table} WHERE p = {p}'))
# Like the counter increment, a list append operation (li=li+?) is a primitive
# operation and not an expression, so we believe UNSET_VALUE should be able
# to skip it, and Scylla indeed does as this test shows. Cassandra fails
# this test - it produces an internal error on a bad cast, and we consider
# this a Cassandra bug and hence the cassandra_bug tag.
def test_unset_list_append(cql, table1, cassandra_bug):
p = unique_key_int()
stmt = cql.prepare(f'UPDATE {table1} SET li=li+? WHERE p={p}')
cql.execute(stmt, [[7]])
assert [([7],)] == list(cql.execute(f'SELECT li FROM {table1} WHERE p = {p}'))
cql.execute(stmt, [UNSET_VALUE])
assert [([7],)] == list(cql.execute(f'SELECT li FROM {table1} WHERE p = {p}'))
# According to Cassandra's NEWS.txt, "an unset bind ttl is treated as
# 'unlimited'". It shouldn't skip the write.
# Note that the NEWS.txt is not accurate: An unset ttl isn't really treated
# as unlimited, but rather as the default ttl set on the table. The default
# ttl is usually unlimited, but not always. We test that case in
# test_ttl.py::test_default_ttl_unset()
def test_unset_ttl(cql, table1):
p = unique_key_int()
# First write using a normal TTL:
stmt = cql.prepare(f'UPDATE {table1} USING TTL ? SET a=? WHERE p={p}')
cql.execute(stmt, [20000, 3])
res = list(cql.execute(f'SELECT a, ttl(a) FROM {table1} WHERE p = {p}'))
assert res[0].a == 3
assert res[0].ttl_a > 10000
# Check that an UNSET_VALUE ttl didn't skip the write but reset the TTL
# to unlimited (None)
cql.execute(stmt, [UNSET_VALUE, 4])
assert [(4, None)] == list(cql.execute(f'SELECT a, ttl(a) FROM {table1} WHERE p = {p}'))
# According to Cassandra's NEWS.txt, "an unset bind timestamp is treated
# as 'now'". It shouldn't skip the write.
def test_unset_timestamp(cql, table1):
p = unique_key_int()
stmt = cql.prepare(f'UPDATE {table1} USING TIMESTAMP ? SET a=? WHERE p={p}')
cql.execute(stmt, [UNSET_VALUE, 3])
assert [(3,)] == list(cql.execute(f'SELECT a FROM {table1} WHERE p = {p}'))
# According to Cassandra's NEWS.txt, "In a QUERY request an unset limit
# is treated as 'unlimited'.". It mustn't cause the query to fail (let alone
# be skipped somehow).
def test_unset_limit(cql, table2):
p = unique_key_int()
cql.execute(f'INSERT INTO {table2} (p, c) VALUES ({p}, 1)')
cql.execute(f'INSERT INTO {table2} (p, c) VALUES ({p}, 2)')
cql.execute(f'INSERT INTO {table2} (p, c) VALUES ({p}, 3)')
cql.execute(f'INSERT INTO {table2} (p, c) VALUES ({p}, 4)')
stmt = cql.prepare(f'SELECT c FROM {table2} WHERE p={p} limit ?')
assert [(1,),(2,)] == list(cql.execute(stmt, [2]))
assert [(1,),(2,),(3,),(4,)] == list(cql.execute(stmt, [UNSET_VALUE]))
# TODO: check that (according to NEWS.txt documentation): "Unset tuple field,
# UDT field and map key are not allowed.".
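That TODO can be prototyped without a cluster. Below is a minimal, hypothetical sketch (plain Python, not driver or server code; `UNSET` and `validate_map_key` are made-up names) of the rule the future test would assert: an unset bind marker must be rejected wherever it would become a map key:

```python
# Stand-in for the driver's UNSET_VALUE sentinel (hypothetical, for illustration).
UNSET = object()

def validate_map_key(key):
    """Model of the rule from NEWS.txt: an unset map key is not allowed."""
    if key is UNSET:
        raise ValueError("unset value is not allowed as a map key")
    return key
```

A real cql-pytest version would prepare something like `UPDATE ... SET m[?] = ?` and expect `InvalidRequest` when binding `UNSET_VALUE` as the key.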
# Similar to test_unset_insert_where() above, just use an LWT write ("IF
# NOT EXISTS"). Test that using an UNSET_VALUE in an LWT condition causes
# a clear error, not silent skip and not a crash as in issue #13001.
def test_unset_insert_where_lwt(cql, table2):
p = unique_key_int()
stmt = cql.prepare(f'INSERT INTO {table2} (p, c) VALUES ({p}, ?) IF NOT EXISTS')
with pytest.raises(InvalidRequest, match="unset"):
cql.execute(stmt, [UNSET_VALUE])
# Like test_unset_insert_where_lwt, but using UPDATE
# The Python driver doesn't allow sending an UNSET_VALUE for the partition key,
# so only the clustering key is tested.
def test_unset_update_where_lwt(cql, table3):
stmt = cql.prepare(f"UPDATE {table3} SET r = 42 WHERE p = 0 AND c = ? IF r = ?")
with pytest.raises(InvalidRequest, match="unset"):
cql.execute(stmt, [UNSET_VALUE, 2])
with pytest.raises(InvalidRequest, match="unset"):
cql.execute(stmt, [1, UNSET_VALUE])


@@ -757,6 +757,7 @@ bool abstract_type::is_collection() const {
bool abstract_type::is_tuple() const {
struct visitor {
bool operator()(const abstract_type&) { return false; }
bool operator()(const reversed_type_impl& t) { return t.underlying_type()->is_tuple(); }
bool operator()(const tuple_type_impl&) { return true; }
};
return visit(*this, visitor{});
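The one-line addition above gives the visitor a `reversed_type_impl` case, so `is_tuple()` sees through reversed types instead of falling into the catch-all `false`. A rough Python model of that dispatch (class names are illustrative, not Scylla's):

```python
class AbstractType:
    # Catch-all: most types are not tuples.
    def is_tuple(self):
        return False

class TupleType(AbstractType):
    def is_tuple(self):
        return True

class ReversedType(AbstractType):
    # A reversed type only changes sort order; type predicates must
    # delegate to the underlying type, which is what the fix adds.
    def __init__(self, underlying):
        self.underlying = underlying

    def is_tuple(self):
        return self.underlying.is_tuple()
```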
@@ -1997,6 +1998,10 @@ data_value deserialize_aux(const tuple_type_impl& t, View v) {
template<FragmentedView View>
utils::multiprecision_int deserialize_value(const varint_type_impl&, View v) {
if (v.empty()) {
throw marshal_exception("cannot deserialize multiprecision int - empty buffer");
}
skip_empty_fragments(v);
bool negative = v.current_fragment().front() < 0;
utils::multiprecision_int num;
while (v.size_bytes()) {
@@ -2093,6 +2098,7 @@ bool deserialize_value(const boolean_type_impl&, View v) {
if (v.size_bytes() != 1) {
throw marshal_exception(format("cannot deserialize boolean, size mismatch ({:d})", v.size_bytes()));
}
skip_empty_fragments(v);
return v.current_fragment().front() != 0;
}
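The added `skip_empty_fragments(v)` call matters because a fragmented buffer may begin with zero-length fragments, and `current_fragment().front()` must read from the first non-empty one. A tiny Python model of the same idea (function name is illustrative):

```python
def first_byte(fragments):
    """Return the first byte of a fragmented buffer, skipping empty fragments."""
    for frag in fragments:
        if frag:  # skip zero-length fragments at the front
            return frag[0]
    raise ValueError("cannot read from an empty buffer")
```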
@@ -2264,9 +2270,11 @@ struct compare_visitor {
std::strong_ordering operator()(const listlike_collection_type_impl& l) {
using llpdi = listlike_partial_deserializing_iterator;
auto sf = cql_serialization_format::internal();
-return lexicographical_tri_compare(llpdi::begin(v1, sf), llpdi::end(v1, sf), llpdi::begin(v2, sf),
-llpdi::end(v2, sf),
-[&] (const managed_bytes_view& o1, const managed_bytes_view& o2) { return l.get_elements_type()->compare(o1, o2); });
+return with_empty_checks([&] {
+return lexicographical_tri_compare(llpdi::begin(v1, sf), llpdi::end(v1, sf), llpdi::begin(v2, sf),
+llpdi::end(v2, sf),
+[&] (const managed_bytes_view& o1, const managed_bytes_view& o2) { return l.get_elements_type()->compare(o1, o2); });
+});
}
std::strong_ordering operator()(const map_type_impl& m) {
return map_type_impl::compare_maps(m.get_keys_type(), m.get_values_type(), v1, v2);


@@ -289,7 +289,7 @@ public:
* <b>Warning:</b> this method should only be used for querying as this
* doesn't at all guarantee the uniqueness of the resulting UUID.
*/
-static UUID min_time_UUID(milliseconds timestamp = milliseconds{0})
+static UUID min_time_UUID(decimicroseconds timestamp = decimicroseconds{0})
{
auto uuid = UUID(create_time(from_unix_timestamp(timestamp)), MIN_CLOCK_SEQ_AND_NODE);
assert(uuid.is_timestamp());
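The signature change is a unit fix: version-1 UUID timestamps count 100-nanosecond ("decimicrosecond") intervals since the UUID epoch (1582-10-15), not milliseconds. A sketch of the arithmetic involved (the offset constant is the standard RFC 4122 gap between the UUID and Unix epochs; function names are illustrative):

```python
# 100-ns intervals between 1582-10-15 (UUID epoch) and 1970-01-01 (Unix epoch).
UUID_UNIX_EPOCH_GAP = 0x01B21DD213814000

def millis_to_decimicros(ms):
    return ms * 10_000  # 1 ms = 10,000 * 100 ns

def from_unix_timestamp(decimicros):
    # Shift a Unix-relative 100-ns count to the UUID epoch.
    return decimicros + UUID_UNIX_EPOCH_GAP
```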


@@ -199,7 +199,13 @@ utils::config_type::to_json(const void* value) const {
bool
utils::config_file::config_src::matches(std::string_view name) const {
-if (_name == name) {
+// The below line provides support for option names in the "long_name,short_name" format,
+// such as "workdir,W". We only want the long name ("workdir") to be used in the YAML.
+// But since at some point (due to a bug) the YAML config parser expected the silly
+// double form ("workdir,W") instead, we support both for backward compatibility.
+std::string_view long_name = _name.substr(0, _name.find_first_of(','));
+if (_name == name || long_name == name) {
return true;
}
if (!_alias.empty() && _alias == name) {
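The new matching rule is small enough to model directly. A sketch in Python (not the real `config_file` code): the long name is whatever precedes the first comma, and both it and the historical "long,short" spelling are accepted:

```python
def matches(stored_name, name, alias=""):
    # "workdir,W" -> long name "workdir"; a name with no comma is its own long name.
    long_name = stored_name.split(",", 1)[0]
    if stored_name == name or long_name == name:
        return True
    return bool(alias) and alias == name
```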


@@ -60,6 +60,7 @@ concept HasClearGentlyMethod = requires (T x) {
template <typename T>
concept SmartPointer = requires (T x) {
{ x.get() } -> std::same_as<typename T::element_type*>;
{ *x } -> std::same_as<typename T::element_type&>;
};
@@ -177,7 +178,11 @@ future<> clear_gently(T& o) noexcept {
template <SmartPointer T>
future<> clear_gently(T& o) noexcept {
-return internal::clear_gently(*o);
+if (auto p = o.get()) {
+return internal::clear_gently(*p);
+} else {
+return make_ready_future<>();
+}
}
template <typename T, std::size_t N>