Compare commits

...

50 Commits

Author SHA1 Message Date
Anna Mikhlin
61a8003ad1 release: prepare for 5.2.0-rc3 2023-03-20 10:10:27 +02:00
Botond Dénes
8a17066961 Merge 'doc: Updates the recommended OS to be Ubuntu 22.04' from Anna Stuchlik
Fixes https://github.com/scylladb/scylladb/issues/13138
Fixes https://github.com/scylladb/scylladb/issues/13153

This PR:

- Fixes outdated information about the recommended OS. Since version 5.2, the recommended OS should be Ubuntu 22.04 because that OS is used for building the ScyllaDB image.
- Adds the OS support information for version 5.2.

This PR (both commits) needs to be backported to branch-5.2.

Closes #13188

* github.com:scylladb/scylladb:
  doc: Add OS support for version 5.2
  doc: Updates the recommended OS to be Ubuntu 22.04

(cherry picked from commit f4b5679804)
2023-03-17 10:30:06 +02:00
Pavel Emelyanov
487ba9f3e1 Merge '[backport] reader_concurrency_semaphore:: clear_inactive_reads(): defer evicting to evict()' from Botond Dénes
This PR backports 2f4a793457 to branch-5.2. Said patch depends on some other patches that are not part of any release yet.
This PR should apply to 5.1 and 5.0 too.

Closes #13162

* github.com:scylladb/scylladb:
  reader_concurrency_semaphore:: clear_inactive_reads(): defer evicting to evict()
  reader_permit: expose operator<<(reader_permit::state)
  reader_permit: add get_state() accessor
2023-03-16 18:41:08 +03:00
Botond Dénes
bd4f9e3615 Merge 'readers/nonforwarding: don't emit partition_end on next_partition,fast_forward_to' from Gusev Petr
The series fixes the `make_nonforwardable` reader: it shouldn't emit `partition_end` for the previous partition after `next_partition()` and `fast_forward_to()`

Fixes: #12249

Closes #12978

* github.com:scylladb/scylladb:
  flat_mutation_reader_test: cleanup, seastar::async -> SEASTAR_THREAD_TEST_CASE
  make_nonforwardable: test through run_mutation_source_tests
  make_nonforwardable: next_partition and fast_forward_to when single_partition is true
  make_forwardable: fix next_partition
  flat_mutation_reader_v2: drop forward_buffer_to
  nonforwardable reader: fix indentation
  nonforwardable reader: refactor, extract reset_partition
  nonforwardable reader: add more tests
  nonforwardable reader: no partition_end after fast_forward_to()
  nonforwardable reader: no partition_end after next_partition()
  nonforwardable reader: no partition_end for empty reader
  row_cache: pass partition_start though nonforwardable reader

(cherry picked from commit 46efdfa1a1)
2023-03-16 10:42:03 +02:00
Botond Dénes
c68deb2461 reader_concurrency_semaphore:: clear_inactive_reads(): defer evicting to evict()
Instead of open-coding the same logic in an incomplete way.
clear_inactive_reads() does incomplete eviction in several ways:
* it doesn't decrement _stats.inactive_reads
* it doesn't set the permit to evicted state
* it doesn't cancel the ttl timer (if any)
* it doesn't call the eviction notifier on the permit (if there is one)

The list goes on. We already have an evict() method that does all this
correctly; use that instead of the current badly open-coded alternative.

This patch also enhances the existing test for clear_inactive_reads()
and adds a new one specifically for `stop()` being called while having
inactive reads.

Fixes: #13048

Closes #13049

(cherry picked from commit 2f4a793457)
2023-03-14 09:50:16 +02:00
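The commit above argues for routing all cleanup through a single method. A minimal Python sketch of that shape (illustrative names only; ScyllaDB's semaphore is C++, and this is not its API):

```python
# Illustrative sketch: one complete evict() path prevents the partial,
# open-coded cleanup the commit describes (missed stats decrement, state
# change, and notifier call).
class Semaphore:
    def __init__(self):
        self.inactive = []            # (permit, notifier) pairs
        self.stats_inactive_reads = 0

    def register_inactive(self, permit, notifier=None):
        self.inactive.append((permit, notifier))
        self.stats_inactive_reads += 1

    def evict(self, permit, notifier):
        # The one complete eviction path: stats, state, and notification.
        self.stats_inactive_reads -= 1
        permit["state"] = "evicted"
        if notifier:
            notifier(permit)

    def clear_inactive_reads(self):
        # Defer to evict() instead of partially re-implementing it.
        while self.inactive:
            self.evict(*self.inactive.pop())

sem = Semaphore()
permit = {"state": "inactive"}
sem.register_inactive(permit)
sem.clear_inactive_reads()
assert sem.stats_inactive_reads == 0 and permit["state"] == "evicted"
```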
Botond Dénes
dd96d3017a reader_permit: expose operator<<(reader_permit::state)
(cherry picked from commit ec1c615029)
2023-03-14 09:50:16 +02:00
Botond Dénes
6ca80ee118 reader_permit: add get_state() accessor
(cherry picked from commit 397266f420)
2023-03-14 09:40:11 +02:00
Jan Ciolek
eee8f750cc cql3: preserve binary_operator.order in search_and_replace
There was a bug in `expr::search_and_replace`:
it didn't preserve the `order` field of `binary_operator`.

The `order` field is used to mark relations created
using SCYLLA_CLUSTERING_BOUND, a CQL feature used for
internal queries inside Scylla. It means that we should
handle the restriction as a raw clustering bound, not as
an expression in the CQL language.

Losing the SCYLLA_CLUSTERING_BOUND marker could cause issues,
the database could end up selecting the wrong clustering ranges.

Fixes: #13055

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>

Closes #13056

(cherry picked from commit aa604bd935)
2023-03-09 12:52:39 +02:00
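The pitfall described above is generic to tree-rewriting code: a rebuilt node must carry over every untouched field, not just the one being substituted. A hypothetical Python sketch (not Scylla's C++ expression types):

```python
# Hypothetical sketch: rebuilding a node with dataclasses.replace copies all
# unchanged fields, so a marker like the SCYLLA_CLUSTERING_BOUND order survives.
from dataclasses import dataclass, replace

@dataclass(frozen=True)
class BinaryOperator:
    lhs: str
    op: str
    rhs: str
    order: str = "regular"   # e.g. "clustering_bound" for SCYLLA_CLUSTERING_BOUND

def search_and_replace(node: BinaryOperator, old: str, new: str) -> BinaryOperator:
    if node.rhs != old:
        return node
    # replace() preserves every field not explicitly overridden, including `order`.
    return replace(node, rhs=new)

op = BinaryOperator("ck", "<", "?", order="clustering_bound")
assert search_and_replace(op, "?", ":v").order == "clustering_bound"
```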
Botond Dénes
8d5206e6c6 sstables/sstable: validate_checksums(): force-check EOF
EOF is only guaranteed to be set if one tried to read past the end of the
file. So when checking for EOF, also try to read some more. This
should force the EOF flag into a correct value. We can then check that
the read yielded 0 bytes.
This should ensure that `validate_checksums()` will not falsely declare
the validation to have failed.

Fixes: #11190

Closes #12696

(cherry picked from commit 693c22595a)
2023-03-09 12:30:44 +02:00
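The idea generalizes beyond sstable validation: an EOF condition is only observable after attempting to read past the end. A sketch with plain Python file objects (the real code uses Seastar streams):

```python
# Sketch: force one extra read and require it to yield zero bytes, rather than
# trusting an EOF flag that may not be set yet.
import io

def at_eof(f) -> bool:
    return f.read(1) == b""     # a zero-byte read proves we are at the end

buf = io.BytesIO(b"checksummed data")
buf.read()                      # consume the whole stream
assert at_eof(buf)
```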
Anna Stuchlik
cfa40402f4 doc: Update the documentation landing page
This commit makes the following changes to the docs landing page:

- Adds the ScyllaDB enterprise docs as one of three tiles.

- Modifies the three tiles to reflect the three flavors of ScyllaDB.

- Moves the "New to ScyllaDB? Start here!" under the page title.

- Renames "Our Products" to "Other Products" to list the products other
  than ScyllaDB itself. In addition, the boxes are enlarged to
  large-4 to look better.

The major purpose of this commit is to expose the ScyllaDB
documentation.

docs: fix the link
(cherry picked from commit 27bb8c2302)

Closes #13086
2023-03-06 14:18:15 +02:00
Botond Dénes
2d170e51cf Merge 'doc: specify the versions where Alternator TTL is no longer experimental' from Anna Stuchlik
This PR adds a note to the Alternator TTL section to specify in which Open Source and Enterprise versions the feature was promoted from experimental to non-experimental.

The challenge here is that OSS and Enterprise are (still) **documented together**, but they're **not in sync** in promoting the TTL feature: it's still experimental in 5.1 (released) but no longer experimental in 2022.2 (to be released soon).

We can take one of the following approaches:
a) Merge this PR with master and ask the 2022.2 users to refer to master.
b) Merge this PR with master and then backport to branch-5.1. If we choose this approach, it is necessary to backport https://github.com/scylladb/scylladb/pull/11997 beforehand to avoid conflicts.

I'd opt for a) because it makes more sense from the OSS perspective and helps us avoid mess and backporting.

Closes #12295

* github.com:scylladb/scylladb:
  doc: fix the version in the comment on removing the note
  doc: specify the versions where Alternator TTL is no longer experimental

(cherry picked from commit d5dee43be7)
2023-03-02 12:09:16 +02:00
Anna Stuchlik
860e79e4b1 doc: fixes https://github.com/scylladb/scylladb/issues/12954, adds the minimal version from which the 2021.1-to-2022.1 upgrade is supported for Ubuntu, Debian, and image
Closes #12974

(cherry picked from commit 91b611209f)
2023-02-28 13:02:05 +02:00
Anna Mikhlin
908a82bea0 release: prepare for 5.2.0-rc2 2023-02-28 10:13:06 +02:00
Gleb Natapov
39158f55d0 lwt: do not destroy capture in upgrade_if_needed lambda since the lambda is used more than once
If on the first call the capture is destroyed the second call may crash.

Fixes: #12958

Message-Id: <Y/sks73Sb35F+PsC@scylladb.com>
(cherry picked from commit 1ce7ad1ee6)
2023-02-27 14:19:37 +02:00
Raphael S. Carvalho
22c1685b3d sstables: Temporarily disable loading of first and last position metadata
It's known that reading large cells in reverse causes large allocations.
Source: https://github.com/scylladb/scylladb/issues/11642

The loading is preliminary work for splitting large partitions into
fragments composing a run, so that such a run can later be read
in an efficient way using the position metadata.

The splitting is not turned on yet, anywhere. Therefore, we can
temporarily disable the loading, as a way to avoid regressions in
stable versions. Large allocations can cause stalls due to foreground
memory eviction kicking in.
The default values for position metadata say that first and last
position include all clustering rows, but they aren't used anywhere
other than by sstable_run to determine if a run is disjoint at
clustering level, but given that no splitting is done yet, it
does not really matter.

Unit tests relying on position metadata were adjusted to enable
the loading, such that they can still pass.

Fixes #11642.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #12979

(cherry picked from commit d73ffe7220)
2023-02-27 08:58:34 +02:00
Botond Dénes
9ba6fc73f1 mutation_compactor: only pass consumed range-tombstone-change to validator
Currently all consumed range tombstone changes are unconditionally
forwarded to the validator. Even if they are shadowed by a higher level
tombstone and/or purgeable. This can result in a situation where a range
tombstone change was seen by the validator but not passed to the
consumer. The validator expects the range tombstone change to be closed
by end-of-partition but the end fragment won't come as the tombstone was
dropped, resulting in a false-positive validation failure.
Fix by passing to the validator only those tombstones that are actually
passed to the consumer too.

Fixes: #12575

Closes #12578

(cherry picked from commit e2c9cdb576)
2023-02-23 22:52:47 +02:00
Botond Dénes
f2e2c0127a types: unserialize_value for multiprecision_int,bool: don't read uninitialized memory
Check the first fragment before dereferencing it, the fragment might be
empty, in which case move to the next one.
Found by running range scan tests with random schema and random data.

Fixes: #12821
Fixes: #12823
Fixes: #12708

Closes #12824

(cherry picked from commit ef548e654d)
2023-02-23 22:38:03 +02:00
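A hypothetical sketch of the shape of that fix: with a fragmented buffer, the first fragment may be empty, so advance to the next fragment instead of dereferencing blindly.

```python
# Sketch (illustrative, not Scylla's C++ fragmented-buffer API): skip empty
# fragments before touching the first byte.
def first_byte(fragments):
    for frag in fragments:
        if frag:                # skip empty fragments
            return frag[0]
    raise ValueError("buffer holds no data")

assert first_byte([b"", b"", b"\x2a"]) == 42
```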
Gleb Natapov
363ea87f51 raft: abort applier fiber when a state machine aborts
After 5badf20c7a, the applier fiber does not
stop after it gets an abort error from a state machine, which may trigger an
assertion because the previous batch is not applied. Fix it.

Fixes #12863

(cherry picked from commit 9bdef9158e)
2023-02-23 14:12:12 +02:00
Kefu Chai
c49fd6f176 tools/schema_loader: do not return ref to a local variable
We should never return a reference to a local variable,
so in this change a reference to a static variable is returned
instead. This should address the following warning from Clang 17:

```
/home/kefu/dev/scylladb/tools/schema_loader.cc:146:16: error: returning reference to local temporary object [-Werror,-Wreturn-stack-address]
        return {};
               ^~
```

Fixes #12875
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes #12876

(cherry picked from commit 6eab8720c4)
2023-02-22 22:02:43 +02:00
Takuya ASADA
3114589a30 scylla_coredump_setup: fix coredump timeout settings
We currently configure only TimeoutStartSec, but probably it's not
enough to prevent coredump timeout, since TimeoutStartSec is maximum
waiting time for service startup, and there is another directive to
specify maximum service running time (RuntimeMaxSec).

To fix the problem, we should specify RuntimeMaxSec and TimeoutSec (it
configures both TimeoutStartSec and TimeoutStopSec).

Fixes #5430

Closes #12757

(cherry picked from commit bf27fdeaa2)
2023-02-19 21:13:36 +02:00
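`RuntimeMaxSec`, `TimeoutSec`, `TimeoutStartSec`, and `TimeoutStopSec` are real systemd service directives; a unit drop-in along the lines the commit describes might look like this (the values are illustrative, not necessarily what the patch ships):

```ini
# Illustrative drop-in for the coredump service
[Service]
TimeoutSec=infinity
RuntimeMaxSec=infinity
```

`TimeoutSec` is shorthand that sets both `TimeoutStartSec` and `TimeoutStopSec`, while `RuntimeMaxSec` bounds how long the service may keep running after startup.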
Anna Stuchlik
34f68a4c0f doc: related https://github.com/scylladb/scylladb/issues/12658, fix the service name in the upgrade guide from 2022.1 to 2022.2
Closes #12698

(cherry picked from commit 826f67a298)
2023-02-17 12:17:48 +02:00
Botond Dénes
b336e11f59 Merge 'doc: fix the service name from "scylla-enterprise-server" "to "scylla-server"' from Anna Stuchlik
Related https://github.com/scylladb/scylladb/issues/12658.

This PR fixes the bug in the upgrade guides for the released versions.

Closes #12679

* github.com:scylladb/scylladb:
  doc: fix the service name in the upgrade guide for patch releases versions 2022
  doc: fix the service name in the upgrade guide from 2021.1 to 2022.1

(cherry picked from commit 325246ab2a)
2023-02-17 12:16:52 +02:00
Anna Stuchlik
9ef73d7e36 doc: fixes https://github.com/scylladb/scylladb/issues/12754, document the metric update in 5.2
Closes #12891

(cherry picked from commit bcca706ff5)
2023-02-17 12:16:13 +02:00
Botond Dénes
8700a72b4c Merge 'Backport compaction-backlog-tracker fixes to branch-5.2' from Raphael "Raph" Carvalho
Both patches are important to fix inefficiencies when updating the backlog tracker, which can manifest as a reactor stall, on a special event like schema change.

No conflicts when backporting.

Regression since 1d9f53c881, which is present in branch 5.1 onwards.

Closes #12851

* github.com:scylladb/scylladb:
  compaction: Fix inefficiency when updating LCS backlog tracker
  table: Fix quadratic behavior when inserting sstables into tracker on schema change
2023-02-15 07:22:25 +02:00
Raphael S. Carvalho
886dd3e1d2 compaction: Fix inefficiency when updating LCS backlog tracker
The LCS backlog tracker uses the STCS tracker for L0. Turns out the LCS
tracker calls the STCS tracker's replace_sstables() with empty arguments
even when *only* the higher levels (> 0) had sstables replaced.
This unnecessary call to STCS tracker will cause it to recompute
the L0 backlog, yielding the same value as before.

As LCS has a fragment size of 0.16G on higher levels, we may be
updating the tracker multiple times during incremental compaction,
which operates on SSTables on higher levels.

Inefficiency is fixed by only updating the STCS tracker if any
L0 sstable is being added or removed from the table.

This may be fixing a quadratic behavior during boot or refresh,
as new sstables are loaded one by one.
Higher levels have a substantially higher number of sstables;
therefore, updating the STCS tracker only when level 0 changes
significantly reduces the number of times the L0 backlog is recomputed.

Refs #12499.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #12676

(cherry picked from commit 1b2140e416)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2023-02-14 12:14:27 -03:00
Raphael S. Carvalho
f565f3de06 table: Fix quadratic behavior when inserting sstables into tracker on schema change
Each time the backlog tracker is informed about a new or old sstable, it
recomputes the static part of the backlog, whose complexity is
proportional to the total number of sstables.
On schema change, we call backlog_tracker::replace_sstables()
for each existing sstable, which produces O(N^2) complexity.

Fixes #12499.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #12593

(cherry picked from commit 87ee547120)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2023-02-14 12:14:21 -03:00
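The complexity argument in the commit above can be shown with a toy Python model (illustrative only, not the tracker code): recomputing a full-scan quantity once per inserted sstable costs O(N²) total work, while a single recompute over the final set is O(N).

```python
# Toy model of the O(N^2) behavior the commit describes.
def insert_one_by_one(tracker, new_sstables):
    passes = 0
    for s in new_sstables:
        tracker.append(s)
        passes += len(tracker)      # each insert rescans the whole set
    return passes

def insert_batched(tracker, new_sstables):
    tracker.extend(new_sstables)
    return len(tracker)             # one recompute over the final set

n = 100
assert insert_one_by_one([], list(range(n))) == n * (n + 1) // 2   # 5050 passes
assert insert_batched([], list(range(n))) == n                     # 100 passes
```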
Anna Stuchlik
76ff6d981c doc: related https://github.com/scylladb/scylladb/issues/12754, add the requirement to upgrade Monitoring to version 4.3
Closes #12784

(cherry picked from commit c7778dd30b)
2023-02-10 10:28:35 +02:00
Botond Dénes
f924f59055 Merge 'Backport test.py improvements to 5.2' from Kamil Braun
Backport the following improvements for test.py efficiency and user experience:
- https://github.com/scylladb/scylladb/pull/12542
- https://github.com/scylladb/scylladb/pull/12560
- https://github.com/scylladb/scylladb/pull/12564
- https://github.com/scylladb/scylladb/pull/12563
- https://github.com/scylladb/scylladb/pull/12588
- https://github.com/scylladb/scylladb/pull/12613
- https://github.com/scylladb/scylladb/pull/12569
- https://github.com/scylladb/scylladb/pull/12612
- https://github.com/scylladb/scylladb/pull/12549
- https://github.com/scylladb/scylladb/pull/12678

Fixes #12617

Closes #12770

* github.com:scylladb/scylladb:
  test/pylib: put UNIX-domain socket in /tmp
  Merge 'test/pylib: scylla_cluster: ensure there's space in the cluster pool when running a sequence of tests' from Kamil Braun
  Merge 'test.py: manual cluster pool handling for Python suite' from Alecco
  Merge 'test.py: handle broken clusters for Python suite' from Alecco
  test/pylib: scylla_cluster: don't leak server if stopping it fails
  Merge 'test/pylib: scylla_cluster: improve server startup check' from Kamil Braun
  test/pylib: scylla_cluster: return error details from test framework endpoints
  test/pylib: scylla_cluster: release cluster IPs when stopping ScyllaClusterManager
  test/pylib: scylla_cluster: mark cluster as dirty if it fails to boot
  test: disable commitlog O_DSYNC, preallocation
2023-02-08 15:09:09 +02:00
Nadav Har'El
d5cef05810 test/pylib: put UNIX-domain socket in /tmp
The "cluster manager" used by the topology test suite uses a UNIX-domain
socket to communicate between the cluster manager and the individual tests.
The socket is currently located in the test directory but there is a
problem: In Linux the length of the path used as a UNIX-domain socket
address is limited to just a little over 100 bytes. In Jenkins runs, the
test directory names are very long; we sometimes go over this length
limit, and the result is that test.py fails to create this socket.

In this patch we simply put the socket in /tmp instead of the test
directory. We only need to make this change in one place - the cluster
manager, as it already passes the socket path to the individual tests
(using the "--manager-api" option).

Tested by cloning Scylla in a very long directory name.
A test like ./test.py --mode=dev test_concurrent_schema fails before
this patch, and passes with it.

Fixes #12622

Closes #12678

(cherry picked from commit 681a066923)
2023-02-07 17:12:14 +01:00
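The constraint behind that fix is the Linux `sun_path` limit of roughly 108 bytes for UNIX-domain socket addresses. The patch simply puts the socket in /tmp; a defensive variant can be sketched like this (names are illustrative, and a unique filename would be needed in practice):

```python
# Sketch: fall back to a short /tmp path when the natural socket location
# would exceed the sun_path limit.
import os
import tempfile

SUN_PATH_MAX = 108   # typical Linux limit, including the trailing NUL

def socket_path(test_dir: str, name: str = "manager.sock") -> str:
    candidate = os.path.join(test_dir, name)
    if len(candidate.encode()) + 1 <= SUN_PATH_MAX:
        return candidate
    # Too long to bind on Linux; use a short path in /tmp instead.
    return os.path.join(tempfile.gettempdir(), name)

assert socket_path("/x" * 100).startswith(tempfile.gettempdir())
```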
Nadav Har'El
e0f4e99e9b Merge 'test/pylib: scylla_cluster: ensure there's space in the cluster pool when running a sequence of tests' from Kamil Braun
`ScyllaClusterManager` is used to run a sequence of test cases from
a single test file. Between two consecutive tests, if the previous test
left the cluster 'dirty', meaning the cluster cannot be reused, it would
free up space in the pool (using `steal`), stop the cluster, then get a
new cluster from the pool.

Between the `steal` and the `get`, a concurrent test run (with its own
instance of `ScyllaClusterManager`) would start, because there was free
space in the pool.

This resulted in undesirable behavior when we ran tests with
`--repeat X` for a large `X`: we would start with e.g. 4 concurrent
runs of a test file, because the pool size was 4. As soon as one of the
runs freed up space in the pool, we would start another concurrent run.
Soon we'd end up with 8 concurrent runs. Then 16 concurrent runs. And so
on. We would have a large number of concurrent runs, even though the
original 4 runs didn't finish yet. All of these concurrent runs would
compete waiting on the pool, and waiting for space in the pool would
take longer and longer (the duration is linear w.r.t. the number of
concurrent competing runs). Tests would then time out because they would
have to wait too long.

Fix that by using the new `replace_dirty` function introduced to the
pool. This function frees up space by returning a dirty cluster and then
immediately takes it away to be used for a new cluster. Thanks to this,
we will only have at most as many concurrent runs as the pool size. For
example with --repeat 8 and pool size 4, we would run 4 concurrent runs
and start the 5th run only when one of the original 4 runs finishes,
then the 6th run when a second run finishes and so on.

The fix is preceded by a refactor that replaces `steal` with `put(is_dirty=True)`
and a `destroy` function passed to the pool (now the pool is responsible
for stopping the cluster and releasing its IPs).

Fixes #11757

Closes #12549

* github.com:scylladb/scylladb:
  test/pylib: scylla_cluster: ensure there's space in the cluster pool when running a sequence of tests
  test/pylib: pool: introduce `replace_dirty`
  test/pylib: pool: replace `steal` with `put(is_dirty=True)`

(cherry picked from commit 132af20057)
2023-02-07 17:08:17 +01:00
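The essential property of the change above is that the pool's free count never rises during the swap, so no concurrent waiter can slip in between "return dirty" and "take fresh". A toy Python model (illustrative, not the test/pylib API):

```python
# Toy pool: put() + get() as two steps briefly raises the free count, letting
# a waiter jump in; replace_dirty() swaps without the count ever rising.
class Pool:
    def __init__(self, size):
        self.free = size

    def get(self):
        assert self.free > 0, "would block waiting for a slot"
        self.free -= 1
        return object()              # stand-in for a fresh cluster

    def put(self, cluster):
        self.free += 1               # a competing waiter could grab this slot

    def replace_dirty(self, dirty):
        # Atomic swap: the slot never becomes visible to other waiters.
        return object()              # fresh cluster built in the dirty one's place

pool = Pool(size=1)
cluster = pool.get()
fresh = pool.replace_dirty(cluster)  # occupancy stayed at 1 throughout
assert pool.free == 0
```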
Kamil Braun
6795715011 Merge 'test.py: manual cluster pool handling for Python suite' from Alecco
From reviews of https://github.com/scylladb/scylladb/pull/12569, avoid
using `async with` and access the `Pool` of clusters with
`get()`/`put()`.

Closes #12612

* github.com:scylladb/scylladb:
  test.py: manual cluster handling for PythonSuite
  test.py: stop cluster if PythonSuite fails to start
  test.py: minor fix for failed PythonSuite test

(cherry picked from commit 5bc7f0732e)
2023-02-07 17:07:43 +01:00
Nadav Har'El
aa9e91c376 Merge 'test.py: handle broken clusters for Python suite' from Alecco
If the after-test check fails (is_after_test_ok is False), discard the cluster and raise an exception so the context manager (pool) does not recycle it.

Ignore the exception re-raised by the context manager.

Fixes #12360

Closes #12569

* github.com:scylladb/scylladb:
  test.py: handle broken clusters for Python suite
  test.py: Pool discard method

(cherry picked from commit 54f174a1f4)
2023-02-07 17:07:36 +01:00
Kamil Braun
ddfb9ebab2 test/pylib: scylla_cluster: don't leak server if stopping it fails
`ScyllaCluster.server_stop` had this piece of code:
```
        server = self.running.pop(server_id)
        if gracefully:
            await server.stop_gracefully()
        else:
            await server.stop()
        self.stopped[server_id] = server
```

We observed `stop_gracefully()` failing due to a server hanging during
shutdown. We then ended up in a state where neither `self.running` nor
`self.stopped` had this server. Later, when releasing the cluster and
its IPs, we would release that server's IP - but the server might have
still been running (all servers in `self.running` are killed before
releasing IPs, but this one wasn't in `self.running`).

Fix this by popping the server from `self.running` only after
`stop_gracefully`/`stop` finishes.

Make an analogous fix in `server_start`: put `server` into
`self.running` *before* we actually start it. If the start fails, the
server will be considered "running" even though it isn't necessarily,
but that is OK - if it isn't running, then trying to stop it later will
simply do nothing; if it is actually running, we will kill it (which we
should do) when clearing after the cluster; and we don't leak it.

Closes #12613

(cherry picked from commit a0ff33e777)
2023-02-07 17:05:20 +01:00
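The ordering fix in that commit can be sketched directly (illustrative names, not the real `ScyllaCluster` class): pop the server from `running` only after the stop succeeds, so a failed or hanging stop leaves the server tracked.

```python
# Sketch: a raising stop() leaves the server in `running` instead of in limbo.
import asyncio

async def server_stop(running, stopped, server_id):
    server = running[server_id]          # look up, but do not pop yet
    await server.stop()                  # may raise; server stays in `running`
    stopped[server_id] = running.pop(server_id)

class FailingServer:
    async def stop(self):
        raise RuntimeError("hung during shutdown")

running, stopped = {1: FailingServer()}, {}
try:
    asyncio.run(server_stop(running, stopped, 1))
except RuntimeError:
    pass
assert 1 in running and 1 not in stopped   # not leaked: still tracked as running
```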
Nadav Har'El
d58a3e4d16 Merge 'test/pylib: scylla_cluster: improve server startup check' from Kamil Braun
Don't use a range scan, which is very inefficient, to perform a query for checking CQL availability.

Improve logging when waiting for server startup times out. Provide details about the failure: whether we managed to obtain the Host ID of the server and whether we managed to establish a CQL connection.

Closes #12588

* github.com:scylladb/scylladb:
  test/pylib: scylla_cluster: better logging for timeout on server startup
  test/pylib: scylla_cluster: use less expensive query to check for CQL availability

(cherry picked from commit ccc2c6b5dd)
2023-02-07 17:05:02 +01:00
Kamil Braun
2ebac52d2d test/pylib: scylla_cluster: return error details from test framework endpoints
If an endpoint handler throws an exception, the details of the exception
are not returned to the client. Normally this is desirable so that
information is not leaked, but in this test framework we do want to
return the details to the client so it can log a useful error message.

Do it by wrapping every handler into a catch clause that returns
the exception message.

Also modify a bit how HTTPErrors are rendered so it's easier to discern
the actual body of the error from other details (such as the params used
to make the request etc.)

Before:
```
E test.pylib.rest_client.HTTPError: HTTP error 500: 500 Internal Server Error
E
E Server got itself in trouble, params None, json None, uri http+unix://api/cluster/before-test/test_stuff
```

After:
```
E test.pylib.rest_client.HTTPError: HTTP error 500, uri: http+unix://api/cluster/before-test/test_stuff, params: None, json: None, body:
E Failed to start server at host 127.155.129.1.
E Check the log files:
E /home/kbraun/dev/scylladb/testlog/test.py.dev.log
E /home/kbraun/dev/scylladb/testlog/dev/scylla-1.log
```

Closes #12563

(cherry picked from commit 2f84e820fd)
2023-02-07 17:04:37 +01:00
Kamil Braun
b536614913 test/pylib: scylla_cluster: release cluster IPs when stopping ScyllaClusterManager
When we obtained a new cluster for a test case after the previous test
case left a dirty cluster, we would release the old cluster's used IP
addresses (`_before_test` function). However, we would not release the
last cluster's IP after the last test case. We would run out of IPs with
sufficiently many test files or `--repeat` runs. Fix this.

Also reorder the operations a bit: stop the cluster (and release its
IPs) before freeing up space in the cluster pool (i.e. call
`self.cluster.stop()` before `self.clusters.steal()`). This reduces
concurrency a bit - fewer Scyllas running at the same time, which is
good (the pool size gives a limit on the desired max number of
concurrently running clusters). Killing a cluster is quick so it won't
make a significant difference for the next guy waiting on the pool.

Closes #12564

(cherry picked from commit 3ed3966f13)
2023-02-07 17:04:19 +01:00
Kamil Braun
85df0fd2b1 test/pylib: scylla_cluster: mark cluster as dirty if it fails to boot
If a cluster fails to boot, it saves the exception in
`self.start_exception` variable; the exception will be rethrown when
a test tries to start using this cluster. As explained in `before_test`:
```
    def before_test(self, name) -> None:
        """Check that  the cluster is ready for a test. If
        there was a start error, throw it here - the server is
        running when it's added to the pool, which can't be attributed
        to any specific test, throwing it here would stop a specific
        test."""
```
It's arguable whether we should blame some random test for a failure
that it didn't cause, but nevertheless, there's a problem here: the
`start_exception` will be rethrown and the test will fail, but then the
cluster will be simply returned to the pool and the next test will
attempt to use it... and so on.

Prevent this by marking the cluster as dirty the first time we rethrow
the exception.

Closes #12560

(cherry picked from commit 147dd73996)
2023-02-07 17:03:56 +01:00
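A minimal sketch of that fix (illustrative names): mark the cluster dirty the moment the saved start error is rethrown, so it gets replaced rather than handed to test after test.

```python
# Sketch: a cluster that failed to boot is poisoned after the first rethrow.
class Cluster:
    def __init__(self, start_exception=None):
        self.start_exception = start_exception
        self.is_dirty = False

    def before_test(self, name):
        if self.start_exception:
            self.is_dirty = True     # never hand this cluster to another test
            raise self.start_exception

c = Cluster(start_exception=RuntimeError("failed to boot"))
try:
    c.before_test("test_topology")
except RuntimeError:
    pass
assert c.is_dirty
```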
Avi Kivity
cdf9fe7023 test: disable commitlog O_DSYNC, preallocation
Commitlog O_DSYNC is intended to make Raft and schema writes durable
in the face of power loss. To make O_DSYNC performant, we preallocate
the commitlog segments, so that the commitlog writes only change file
data and not file metadata (which would require the filesystem to commit
its own log).

However, in tests, this causes each ScyllaDB instance to write 384MB
of commitlog segments. This overloads the disks and slows everything
down.

Fix this by disabling O_DSYNC (and therefore preallocation) during
the tests. They can't survive power loss, and run with
--unsafe-bypass-fsync anyway.

Closes #12542

(cherry picked from commit 9029b8dead)
2023-02-07 17:02:59 +01:00
Beni Peled
8ff4717fd0 release: prepare for 5.2.0-rc1 2023-02-06 22:13:53 +02:00
Kamil Braun
291b1f6e7f service/raft: raft_group0: prevent double abort
There was a small chance that we called `timeout_src.request_abort()`
twice in the `with_timeout` function, first by timeout and then by
shutdown. `abort_source` fails on an assertion in this case. Fix this.

Fixes: #12512

Closes #12514

(cherry picked from commit 54170749b8)
2023-02-05 18:31:50 +02:00
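One way to avoid such a double abort is to guard the call so the timeout path and the shutdown path cannot both trigger it. An illustrative Python sketch (Seastar's `abort_source` is C++ and asserts on a second `request_abort`; this toy ignores the synchronization a real fix needs):

```python
# Sketch: make the abort request idempotent from the caller's side.
class OneShotAbort:
    def __init__(self):
        self.requested = False

    def request_abort(self):
        assert not self.requested, "double abort"
        self.requested = True

def request_abort_once(src: OneShotAbort):
    if not src.requested:            # check before aborting
        src.request_abort()

src = OneShotAbort()
request_abort_once(src)              # e.g. the timeout fires...
request_abort_once(src)              # ...then shutdown; now a harmless no-op
assert src.requested
```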
Kefu Chai
b2699743cc db: system_keyspace: take the reserved_memory into account
Before this change, we returned the total memory managed by Seastar
in the "total" field in system.memory, but this value only reflects
the total memory managed by Seastar's allocator. If
`reserve_additional_memory` is set when starting app_template,
Seastar's memory subsystem just reserves a chunk of memory of the
specified size for the system and takes the remaining memory. Since
f05d612da8, we set this value to 50MB for the wasmtime runtime; hence
the `TestRuntimeInfoTable.test_default_content` test in dtest
fails. The test expects the size passed via the `--memory`
option to be identical to the value reported by system.memory's
"total" field.

After this change, the "total" field takes the memory reserved
for the wasm UDF runtime into account. The "total" field should reflect
the total size of memory used by Scylla, no matter how we use a certain
portion of the allocated memory.

Fixes #12522
Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes #12573

(cherry picked from commit 4a0134a097)
2023-02-05 18:30:05 +02:00
Botond Dénes
50ae73a4bd types: is_tuple(): handle reverse types
Currently reverse types match the default case (false), even though they
might be wrapping a tuple type. One user-visible effect of this is that
a schema, which has a reversed<frozen<UDT>> clustering key component,
will have this component incorrectly represented in the schema cql dump:
the UDT will lose the frozen attribute. When attempting to recreate
this schema based on the dump, it will fail, as only frozen UDTs are
allowed in primary key components.

Fixes: #12576

Closes #12579

(cherry picked from commit ebc100f74f)
2023-02-05 18:20:21 +02:00
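The shape of that fix is to look through the reversed wrapper before matching. An illustrative Python sketch (Scylla's type system is C++; types are modeled here as nested tuples):

```python
# Sketch: unwrap a reversed type before testing for tuple-ness, instead of
# letting it fall into the default (False) case.
def is_tuple(t) -> bool:
    kind, *rest = t
    if kind == "reversed":
        return is_tuple(rest[0])     # look through the reversed wrapper
    return kind == "tuple"

assert is_tuple(("tuple",))
assert is_tuple(("reversed", ("tuple",)))
assert not is_tuple(("reversed", ("int",)))
```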
Calle Wilund
c3dd4a2b87 alternator::streams: Sort tables in list_streams to ensure no duplicates
Fixes #12601 (maybe?)

Sort the set of tables on ID. This should ensure we never
generate duplicates in a paged listing here. Can obviously miss things if they
are added between paged calls and end up with a "smaller" UUID/ARN, but that
is to be expected.

(cherry picked from commit da8adb4d26)
2023-02-05 17:44:00 +02:00
Benny Halevy
0f9fe61d91 view: row_lock: lock_ck: find or construct row_lock under partition lock
Since we're potentially searching the row_lock in parallel to acquiring
the read_lock on the partition, we're racing with row_locker::unlock
that may erase the _row_locks entry for the same clustering key, since
there is no lock to protect it up until the partition lock has been
acquired and the lock_partition future is resolved.

This change moves the code to search for or allocate the row lock
_after_ the partition lock has been acquired to make sure we're
synchronously starting the read/write lock function on it, without
yielding, to prevent this use-after-free.

This adds an allocation for copying the clustering key in advance
even if a row_lock entry already exists, which wasn't needed before.
It only slows us down (a bit) when there is contention and the lock
already existed when we want to go locking. In the fast path there
is no contention, and then the code already had to create the lock
and copy the key. In any case, the penalty of copying the key once
is tiny compared to the rest of the work that view updates are doing.

This is required on top of 5007ded2c1 as
seen in https://github.com/scylladb/scylladb/issues/12632
which is closely related to #12168 but demonstrates a different race
causing use-after-free.

Fixes #12632

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 4b5e324ecb)
2023-02-05 17:22:31 +02:00
Anna Stuchlik
59d30ff241 docs: fixes https://github.com/scylladb/scylladb/issues/12654, update the links to the Download Center
Closes #12655

(cherry picked from commit 64cc4c8515)
2023-02-05 17:19:56 +02:00
Anna Stuchlik
fb82dff89e doc: fixes https://github.com/scylladb/scylladb/issues/12672, fix the redirects to the Cloud docs
Closes #12673

(cherry picked from commit 2be131da83)
2023-02-05 17:17:35 +02:00
Kefu Chai
b588b19620 cql3/selection: construct string_view using char* not size
Before this change, we constructed an sstring from a comma expression,
which evaluates to the return value of `name.size()`, but what we
expected was `sstring(const char*, size_t)`.

In this change:

* instead of passing the size of the string_view,
  both its address and size are used
* `std::string_view` is constructed instead of sstring, for better
  performance, as we don't need to perform a deep copy

the issue is reported by GCC-13:

```
In file included from cql3/selection/selectable.cc:11:
cql3/selection/field_selector.hh:83:60: error: ignoring return value of function declared with 'nodiscard' attribute [-Werror,-Wunused-result]
        auto sname = sstring(reinterpret_cast<const char*>(name.begin(), name.size()));
                                                           ^~~~~~~~~~
```

Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes #12666

(cherry picked from commit 186ceea009)

Fixes #12739.
2023-02-05 13:50:48 +02:00
Michał Chojnowski
608ef92a71 commitlog: fix total_size_on_disk accounting after segment file removal
Currently, segment file removal first calls `f.remove_file()` and
does `total_size_on_disk -= f.known_size()` later.
However, `remove_file()` resets `known_size` to 0, so in effect
the freed space is not accounted for.

`total_size_on_disk` is not just a metric. It is also responsible
for deciding whether a segment should be recycled -- it is recycled
only if `total_size_on_disk - known_size < max_disk_size`.
Therefore this bug has dire performance consequences:
if `total_size_on_disk - known_size` ever exceeds `max_disk_size`,
the recycling of commitlog segments will stop permanently, because
`total_size_on_disk - known_size` will never go back below
`max_disk_size` due to the accounting bug. All new segments from this
point will be allocated from scratch.

The bug was uncovered by a QA performance test. It isn't easy to trigger --
it took the test 7 hours of constant high load to step into it.
However, the fact that the effect is permanent, and degrades the
performance of the cluster silently, makes the bug potentially quite severe.

The bug can be easily spotted with Prometheus as infinitely rising
`commitlog_total_size_on_disk` on the affected shards.

Fixes #12645

Closes #12646

(cherry picked from commit fa7e904cd6)
2023-02-01 21:54:37 +02:00
Kamil Braun
d2732b2663 Merge 'Enable Raft by default in new clusters' from Kamil Braun
New clusters that use a fresh conf/scylla.yaml will have `consistent_cluster_management: true`, which will enable Raft, unless the user explicitly turns it off before booting the cluster.

People using existing yaml files will continue without Raft, unless consistent_cluster_management is explicitly requested during/after upgrade.

Also update the docs: cluster creation and node addition procedures.

Fixes #12572.

Closes #12585

* github.com:scylladb/scylladb:
  docs: mention `consistent_cluster_management` for creating cluster and adding node procedures
  conf: enable `consistent_cluster_management` by default

(cherry picked from commit 5c886e59de)
2023-01-26 12:21:55 +01:00
Anna Mikhlin
34ab98e1be release: prepare for 5.2.0-rc0 2023-01-18 14:54:36 +02:00
74 changed files with 1109 additions and 650 deletions

View File

@@ -72,7 +72,7 @@ fi
# Default scylla product/version tags
PRODUCT=scylla
VERSION=5.2.0-dev
VERSION=5.2.0-rc3
if test -f version
then

View File

@@ -145,19 +145,24 @@ future<alternator::executor::request_return_type> alternator::executor::list_str
auto table = find_table(_proxy, request);
auto db = _proxy.data_dictionary();
auto cfs = db.get_tables();
auto i = cfs.begin();
auto e = cfs.end();
if (limit < 1) {
throw api_error::validation("Limit must be 1 or more");
}
// TODO: the unordered_map here is not really well suited for partial
// querying - we're sorting on local hash order, and creating a table
// between queries may or may not miss info. But that should be rare,
// and we can probably expect this to be a single call.
// # 12601 (maybe?) - sort the set of tables on ID. This should ensure we never
// generate duplicates in a paged listing here. Can obviously miss things if they
// are added between paged calls and end up with a "smaller" UUID/ARN, but that
// is to be expected.
std::sort(cfs.begin(), cfs.end(), [](const data_dictionary::table& t1, const data_dictionary::table& t2) {
return t1.schema()->id().uuid() < t2.schema()->id().uuid();
});
auto i = cfs.begin();
auto e = cfs.end();
if (streams_start) {
i = std::find_if(i, e, [&](data_dictionary::table t) {
i = std::find_if(i, e, [&](const data_dictionary::table& t) {
return t.schema()->id().uuid() == streams_start
&& cdc::get_base_table(db.real_database(), *t.schema())
&& is_alternator_keyspace(t.schema()->ks_name())

View File

@@ -409,7 +409,9 @@ public:
l0_old_ssts.push_back(std::move(sst));
}
}
_l0_scts.replace_sstables(std::move(l0_old_ssts), std::move(l0_new_ssts));
if (l0_old_ssts.size() || l0_new_ssts.size()) {
_l0_scts.replace_sstables(std::move(l0_old_ssts), std::move(l0_new_ssts));
}
}
};

View File

@@ -553,4 +553,16 @@ murmur3_partitioner_ignore_msb_bits: 12
# WARNING: It's unsafe to set this to false if the node previously booted
# with the schema commit log enabled. In such case, some schema changes
# may be lost if the node was not cleanly stopped.
force_schema_commit_log: true
force_schema_commit_log: true
# Use Raft to consistently manage schema information in the cluster.
# Refer to https://docs.scylladb.com/master/architecture/raft.html for more details.
# The 'Handling Failures' section is especially important.
#
# Once enabled in a cluster, this cannot be turned off.
# If you want to bootstrap a new cluster without Raft, make sure to set this to `false`
# before starting your nodes for the first time.
#
# A cluster not using Raft can be 'upgraded' to use Raft. Refer to the aforementioned
# documentation, section 'Enabling Raft in ScyllaDB 5.2 and further', for the procedure.
consistent_cluster_management: true

View File

@@ -1416,7 +1416,7 @@ expression search_and_replace(const expression& e,
};
},
[&] (const binary_operator& oper) -> expression {
return binary_operator(recurse(oper.lhs), oper.op, recurse(oper.rhs));
return binary_operator(recurse(oper.lhs), oper.op, recurse(oper.rhs), oper.order);
},
[&] (const column_mutation_attribute& cma) -> expression {
return column_mutation_attribute{cma.kind, recurse(cma.column)};

View File

@@ -80,7 +80,7 @@ public:
virtual sstring assignment_testable_source_context() const override {
auto&& name = _type->field_name(_field);
auto sname = sstring(reinterpret_cast<const char*>(name.begin(), name.size()));
auto sname = std::string_view(reinterpret_cast<const char*>(name.data()), name.size());
return format("{}.{}", _selected, sname);
}

View File

@@ -59,7 +59,7 @@ public:
}
_end_of_stream = false;
forward_buffer_to(pr.start());
clear_buffer();
return _underlying->fast_forward_to(std::move(pr));
}

View File

@@ -2116,6 +2116,9 @@ future<> db::commitlog::segment_manager::do_pending_deletes() {
clogger.debug("Discarding segments {}", ftd);
for (auto& [f, mode] : ftd) {
// `f.remove_file()` resets known_size to 0, so remember the size here,
// in order to subtract it from total_size_on_disk accurately.
size_t size = f.known_size();
try {
if (f) {
co_await f.close();
@@ -2132,7 +2135,6 @@ future<> db::commitlog::segment_manager::do_pending_deletes() {
}
}
auto size = f.known_size();
auto usage = totals.total_size_on_disk;
auto next_usage = usage - size;
@@ -2165,7 +2167,7 @@ future<> db::commitlog::segment_manager::do_pending_deletes() {
// or had such an exception that we consider the file dead
// anyway. In either case we _remove_ the file size from
// footprint, because it is no longer our problem.
totals.total_size_on_disk -= f.known_size();
totals.total_size_on_disk -= size;
}
// #8376 - if we had an error in recycling (disk rename?), and no elements

View File

@@ -401,6 +401,10 @@ public:
named_value<uint64_t> wasm_udf_yield_fuel;
named_value<uint64_t> wasm_udf_total_fuel;
named_value<size_t> wasm_udf_memory_limit;
// wasm_udf_reserved_memory is static because the options in db::config
// are parsed using seastar::app_template, while this option is used for
// configuring the Seastar memory subsystem.
static constexpr size_t wasm_udf_reserved_memory = 50 * 1024 * 1024;
seastar::logging_settings logging_settings(const log_cli::options&) const;

View File

@@ -295,7 +295,7 @@ future<> size_estimates_mutation_reader::fast_forward_to(const dht::partition_ra
}
future<> size_estimates_mutation_reader::fast_forward_to(position_range pr) {
forward_buffer_to(pr.start());
clear_buffer();
_end_of_stream = false;
if (_partition_reader) {
return _partition_reader->fast_forward_to(std::move(pr));

View File

@@ -2276,7 +2276,10 @@ public:
add_partition(mutation_sink, "trace_probability", format("{:.2}", tracing::tracing::get_local_tracing_instance().get_trace_probability()));
co_await add_partition(mutation_sink, "memory", [this] () {
struct stats {
uint64_t total = 0;
// take the pre-reserved memory into account, as seastar only returns
// the stats of memory managed by the seastar allocator, but we instruct
// it to reserve addition memory for system.
uint64_t total = db::config::wasm_udf_reserved_memory;
uint64_t free = 0;
static stats reduce(stats a, stats b) { return stats{a.total + b.total, a.free + b.free}; }
};

View File

@@ -172,7 +172,7 @@ class build_progress_virtual_reader {
}
virtual future<> fast_forward_to(position_range range) override {
forward_buffer_to(range.start());
clear_buffer();
_end_of_stream = false;
return _underlying.fast_forward_to(std::move(range));
}

View File

@@ -85,29 +85,25 @@ future<row_locker::lock_holder>
row_locker::lock_ck(const dht::decorated_key& pk, const clustering_key_prefix& cpk, bool exclusive, db::timeout_clock::time_point timeout, stats& stats) {
mylog.debug("taking shared lock on partition {}, and {} lock on row {} in it", pk, (exclusive ? "exclusive" : "shared"), cpk);
auto tracker = latency_stats_tracker(exclusive ? stats.exclusive_row : stats.shared_row);
auto ck = cpk;
// Create a two-level lock entry for the partition if it doesn't exist already.
auto i = _two_level_locks.try_emplace(pk, this).first;
// The two-level lock entry we've just created is guaranteed to be kept alive as long as it's locked.
// Initiating read locking in the background below ensures that even if the two-level lock is currently
// write-locked, releasing the write-lock will synchronously engage any waiting
// locks and will keep the entry alive.
future<lock_type::holder> lock_partition = i->second._partition_lock.hold_read_lock(timeout);
auto j = i->second._row_locks.find(cpk);
if (j == i->second._row_locks.end()) {
// Not yet locked, need to create the lock. This makes a copy of cpk.
try {
j = i->second._row_locks.emplace(cpk, lock_type()).first;
} catch(...) {
// If this emplace() failed, e.g., out of memory, we fail. We
// could do nothing - the partition lock we already started
// taking will be unlocked automatically after being locked.
// But it's better form to wait for the work we started, and it
// will also allow us to remove the hash-table row we added.
return lock_partition.then([ex = std::current_exception()] (auto lock) {
// The lock is automatically released when "lock" goes out of scope.
// TODO: unlock (lock = {}) now, search for the partition in the
// hash table (we know it's still there, because we held the lock until
// now) and remove the unused lock from the hash table if still unused.
return make_exception_future<row_locker::lock_holder>(std::current_exception());
});
return lock_partition.then([this, pk = &i->first, row_locks = &i->second._row_locks, ck = std::move(ck), exclusive, tracker = std::move(tracker), timeout] (auto lock1) mutable {
auto j = row_locks->find(ck);
if (j == row_locks->end()) {
// Not yet locked, need to create the lock.
j = row_locks->emplace(std::move(ck), lock_type()).first;
}
}
return lock_partition.then([this, pk = &i->first, cpk = &j->first, &row_lock = j->second, exclusive, tracker = std::move(tracker), timeout] (auto lock1) mutable {
auto* cpk = &j->first;
auto& row_lock = j->second;
// Like to the two-level lock entry above, the row_lock entry we've just created
// is guaranteed to be kept alive as long as it's locked.
// Initiating read/write locking in the background below ensures that.
auto lock_row = exclusive ? row_lock.hold_write_lock(timeout) : row_lock.hold_read_lock(timeout);
return lock_row.then([this, pk, cpk, exclusive, tracker = std::move(tracker), lock1 = std::move(lock1)] (auto lock2) mutable {
lock1.release();

View File

@@ -42,7 +42,8 @@ if __name__ == '__main__':
if systemd_unit.available('systemd-coredump@.service'):
dropin = '''
[Service]
TimeoutStartSec=infinity
RuntimeMaxSec=infinity
TimeoutSec=infinity
'''[1:-1]
os.makedirs('/etc/systemd/system/systemd-coredump@.service.d', exist_ok=True)
with open('/etc/systemd/system/systemd-coredump@.service.d/timeout.conf', 'w') as f:

View File

@@ -1112,14 +1112,14 @@ tls-ssl/index.html: /stable/operating-scylla/security
/using-scylla/integrations/integration_kairos/index.html: /stable/using-scylla/integrations/integration-kairos
/upgrade/ami_upgrade/index.html: /stable/upgrade/ami-upgrade
/scylla-cloud/cloud-setup/gcp-vpc-peering/index.html: /stable/scylla-cloud/cloud-setup/GCP/gcp-vpc-peering
/scylla-cloud/cloud-setup/GCP/gcp-vcp-peering/index.html: /stable/scylla-cloud/cloud-setup/GCP/gcp-vpc-peering
/scylla-cloud/cloud-setup/gcp-vpc-peering/index.html: https://cloud.docs.scylladb.com/stable/cloud-setup/gcp-vpc-peering.html
/scylla-cloud/cloud-setup/GCP/gcp-vcp-peering/index.html: https://cloud.docs.scylladb.com/stable/cloud-setup/gcp-vpc-peering.html
# move scylla cloud for AWS to dedicated directory
/scylla-cloud/cloud-setup/aws-vpc-peering/index.html: /stable/scylla-cloud/cloud-setup/AWS/aws-vpc-peering
/scylla-cloud/cloud-setup/cloud-prom-proxy/index.html: /stable/scylla-cloud/cloud-setup/AWS/cloud-prom-proxy
/scylla-cloud/cloud-setup/outposts/index.html: /stable/scylla-cloud/cloud-setup/AWS/outposts
/scylla-cloud/cloud-setup/scylla-cloud-byoa/index.html: /stable/scylla-cloud/cloud-setup/AWS/scylla-cloud-byoa
/scylla-cloud/cloud-setup/aws-vpc-peering/index.html: https://cloud.docs.scylladb.com/stable/cloud-setup/aws-vpc-peering.html
/scylla-cloud/cloud-setup/cloud-prom-proxy/index.html: https://cloud.docs.scylladb.com/stable/monitoring/cloud-prom-proxy.html
/scylla-cloud/cloud-setup/outposts/index.html: https://cloud.docs.scylladb.com/stable/cloud-setup/outposts.html
/scylla-cloud/cloud-setup/scylla-cloud-byoa/index.html: https://cloud.docs.scylladb.com/stable/cloud-setup/scylla-cloud-byoa.html
/scylla-cloud/cloud-services/scylla_cloud_costs/index.html: /stable/scylla-cloud/cloud-services/scylla-cloud-costs
/scylla-cloud/cloud-services/scylla_cloud_managin_versions/index.html: /stable/scylla-cloud/cloud-services/scylla-cloud-managin-versions
/scylla-cloud/cloud-services/scylla_cloud_support_alerts_sla/index.html: /stable/scylla-cloud/cloud-services/scylla-cloud-support-alerts-sla

View File

@@ -161,6 +161,10 @@ events appear in the Streams API as normal deletions - without the
distinctive marker on deletions which are really expirations.
See <https://github.com/scylladb/scylla/issues/5060>.
<!--- REMOVE IN FUTURE VERSIONS - Remove the note below in version 5.3/2023.1 -->
> **Note** This feature is experimental in versions earlier than ScyllaDB Open Source 5.2 and ScyllaDB Enterprise 2022.2.
---

View File

@@ -25,7 +25,7 @@ Getting Started
:id: "getting-started"
:class: my-panel
* `Install ScyllaDB (Binary Packages, Docker, or EC2) <https://www.scylladb.com/download/>`_ - Links to the ScyllaDB Download Center
* `Install ScyllaDB (Binary Packages, Docker, or EC2) <https://www.scylladb.com/download/#core>`_ - Links to the ScyllaDB Download Center
* :doc:`Configure ScyllaDB </getting-started/system-configuration/>`
* :doc:`Run ScyllaDB in a Shared Environment </getting-started/scylla-in-a-shared-environment>`

View File

@@ -20,7 +20,7 @@ Install ScyllaDB
Keep your versions up-to-date. The two latest versions are supported. Also always install the latest patches for your version.
* Download and install ScyllaDB Server, Drivers and Tools in `Scylla Download Center <https://www.scylladb.com/download/#server/>`_
* Download and install ScyllaDB Server, Drivers and Tools in `ScyllaDB Download Center <https://www.scylladb.com/download/#core>`_
* :doc:`ScyllaDB Web Installer for Linux <scylla-web-installer>`
* :doc:`ScyllaDB Unified Installer (relocatable executable) <unified-installer>`
* :doc:`Air-gapped Server Installation <air-gapped-install>`

View File

@@ -4,7 +4,7 @@ ScyllaDB Web Installer for Linux
ScyllaDB Web Installer is a platform-agnostic installation script you can run with ``curl`` to install ScyllaDB on Linux.
See `ScyllaDB Download Center <https://www.scylladb.com/download/#server>`_ for information on manually installing ScyllaDB with platform-specific installation packages.
See `ScyllaDB Download Center <https://www.scylladb.com/download/#core>`_ for information on manually installing ScyllaDB with platform-specific installation packages.
Prerequisites
--------------

View File

@@ -25,11 +25,7 @@ ScyllaDB Open Source
.. note::
Recommended OS and ScyllaDB AMI/Image OS for ScyllaDB Open Source:
- Ubuntu 20.04 for versions 4.6 and later.
- CentOS 7 for versions earlier than 4.6.
The recommended OS for ScyllaDB Open Source is Ubuntu 22.04.
+----------------------------+----------------------------------+-----------------------------+---------+-------+
| Linux Distributions | Ubuntu | Debian | CentOS /| Rocky/|
@@ -37,6 +33,8 @@ ScyllaDB Open Source
+----------------------------+------+------+------+------+------+------+------+-------+-------+---------+-------+
| ScyllaDB Version / Version | 14.04| 16.04| 18.04|20.04 |22.04 | 8 | 9 | 10 | 11 | 7 | 8 |
+============================+======+======+======+======+======+======+======+=======+=======+=========+=======+
| 5.2 | |x| | |x| | |v| | |v| | |v| | |x| | |x| | |v| | |v| | |v| | |v| |
+----------------------------+------+------+------+------+------+------+------+-------+-------+---------+-------+
| 5.1 | |x| | |x| | |v| | |v| | |v| | |x| | |x| | |v| | |v| | |v| | |v| |
+----------------------------+------+------+------+------+------+------+------+-------+-------+---------+-------+
| 5.0 | |x| | |x| | |v| | |v| | |v| | |x| | |x| | |v| | |v| | |v| | |v| |
@@ -63,17 +61,18 @@ ScyllaDB Open Source
+----------------------------+------+------+------+------+------+------+------+-------+-------+---------+-------+
All releases are available as a Docker container, EC2 AMI, and a GCP image (GCP image from version 4.3).
All releases are available as a Docker container, EC2 AMI, and a GCP image (GCP image from version 4.3). Since
version 5.2, the ScyllaDB AMI/Image OS for ScyllaDB Open Source is based on Ubuntu 22.04.
ScyllaDB Enterprise
--------------------
.. note::
Recommended OS and ScyllaDB AMI/Image OS for ScyllaDB Enterprise:
- Ubuntu 20.04 for versions 2021.1 and later.
- CentOS 7 for versions earlier than 2021.1.
The recommended OS for ScyllaDB Enterprise is Ubuntu 22.04.
+----------------------------+-----------------------------------+---------------------------+--------+-------+
| Linux Distributions | Ubuntu | Debian | CentOS/| Rocky/|
@@ -95,4 +94,5 @@ ScyllaDB Enterprise
+----------------------------+------+------+------+------+-------+------+------+------+------+--------+-------+
All releases are available as a Docker container, EC2 AMI, and a GCP image (GCP image from version 2021.1).
All releases are available as a Docker container, EC2 AMI, and a GCP image (GCP image from version 2021.1). Since
version 2023.1, the ScyllaDB AMI/Image OS for ScyllaDB Enterprise is based on Ubuntu 22.04.

View File

@@ -13,7 +13,7 @@
:image: /_static/img/mascots/scylla-docs.svg
:search_box:
The most up-to-date documents for the fastest, best performing, high availability NoSQL database.
New to ScyllaDB? Start `here <https://cloud.docs.scylladb.com/stable/scylladb-basics/>`_!
.. raw:: html
@@ -26,16 +26,7 @@
<div class="grid-x grid-margin-x hs">
.. topic-box::
:title: New to ScyllaDB? Start here!
:link: https://cloud.docs.scylladb.com/stable/scylladb-basics/
:class: large-4
:anchor: ScyllaDB Basics
Learn the essentials of ScyllaDB.
.. topic-box::
:title: Let us manage your DB
:title: ScyllaDB Cloud
:link: https://cloud.docs.scylladb.com
:class: large-4
:anchor: ScyllaDB Cloud Documentation
@@ -43,12 +34,20 @@
Simplify application development with ScyllaDB Cloud - a fully managed database-as-a-service.
.. topic-box::
:title: Manage your own DB
:title: ScyllaDB Enterprise
:link: https://enterprise.docs.scylladb.com
:class: large-4
:anchor: ScyllaDB Enterprise Documentation
Deploy and manage ScyllaDB's most stable enterprise-grade database with premium features and 24/7 support.
.. topic-box::
:title: ScyllaDB Open Source
:link: getting-started
:class: large-4
:anchor: ScyllaDB Open Source and Enterprise Documentation
:anchor: ScyllaDB Open Source Documentation
Deploy and manage your database in your own environment.
Deploy and manage your database in your environment.
.. raw:: html
@@ -59,40 +58,16 @@
<div class="topics-grid topics-grid--products">
<h2 class="topics-grid__title">Our Products</h2>
<h2 class="topics-grid__title">Other Products</h2>
<div class="grid-container full">
<div class="grid-x grid-margin-x">
.. topic-box::
:title: ScyllaDB Enterprise
:link: getting-started
:image: /_static/img/mascots/scylla-enterprise.svg
:class: topic-box--product,large-3,small-6
ScyllaDBs most stable high-performance enterprise-grade NoSQL database.
.. topic-box::
:title: ScyllaDB Open Source
:link: getting-started
:image: /_static/img/mascots/scylla-opensource.svg
:class: topic-box--product,large-3,small-6
A high-performance NoSQL database with a close-to-the-hardware, shared-nothing approach.
.. topic-box::
:title: ScyllaDB Cloud
:link: https://cloud.docs.scylladb.com
:image: /_static/img/mascots/scylla-cloud.svg
:class: topic-box--product,large-3,small-6
A fully managed NoSQL database as a service powered by ScyllaDB Enterprise.
.. topic-box::
:title: ScyllaDB Alternator
:link: https://docs.scylladb.com/stable/alternator/alternator.html
:image: /_static/img/mascots/scylla-alternator.svg
:class: topic-box--product,large-3,small-6
:class: topic-box--product,large-4,small-6
Open source Amazon DynamoDB-compatible API.
@@ -100,7 +75,7 @@
:title: ScyllaDB Monitoring Stack
:link: https://monitoring.docs.scylladb.com
:image: /_static/img/mascots/scylla-monitor.svg
:class: topic-box--product,large-3,small-6
:class: topic-box--product,large-4,small-6
Complete open source monitoring solution for your ScyllaDB clusters.
@@ -108,7 +83,7 @@
:title: ScyllaDB Manager
:link: https://manager.docs.scylladb.com
:image: /_static/img/mascots/scylla-manager.svg
:class: topic-box--product,large-3,small-6
:class: topic-box--product,large-4,small-6
Hassle-free ScyllaDB NoSQL database management for scale-out clusters.
@@ -116,7 +91,7 @@
:title: ScyllaDB Drivers
:link: https://docs.scylladb.com/stable/using-scylla/drivers/
:image: /_static/img/mascots/scylla-drivers.svg
:class: topic-box--product,large-3,small-6
:class: topic-box--product,large-4,small-6
Shard-aware drivers for superior performance.
@@ -124,7 +99,7 @@
:title: ScyllaDB Operator
:link: https://operator.docs.scylladb.com
:image: /_static/img/mascots/scylla-enterprise.svg
:class: topic-box--product,large-3,small-6
:class: topic-box--product,large-4,small-6
Easily run and manage your ScyllaDB cluster on Kubernetes.

View File

@@ -3,6 +3,7 @@
* endpoint_snitch - ``grep endpoint_snitch /etc/scylla/scylla.yaml``
* Scylla version - ``scylla --version``
* Authenticator - ``grep authenticator /etc/scylla/scylla.yaml``
* consistent_cluster_management - ``grep consistent_cluster_management /etc/scylla/scylla.yaml``
.. Note::

View File

@@ -119,6 +119,7 @@ Add New DC
* **listen_address** - IP address that Scylla used to connect to the other Scylla nodes in the cluster.
* **endpoint_snitch** - Set the selected snitch.
* **rpc_address** - Address for client connections (Thrift, CQL).
* **consistent_cluster_management** - set to the same value as used by your existing nodes.
The parameters ``seeds``, ``cluster_name`` and ``endpoint_snitch`` need to match the existing cluster.

View File

@@ -54,6 +54,8 @@ Procedure
* **seeds** - Specifies the IP address of an existing node in the cluster. The new node will use this IP to connect to the cluster and learn the cluster topology and state.
* **consistent_cluster_management** - set to the same value as used by your existing nodes.
.. note::
In earlier versions of ScyllaDB, seed nodes assisted in gossip. Starting with Scylla Open Source 4.3 and Scylla Enterprise 2021.1, the seed concept in gossip has been removed. If you are using an earlier version of ScyllaDB, you need to configure the seeds parameter in the following way:

View File

@@ -70,6 +70,7 @@ the file can be found under ``/etc/scylla/``
- **listen_address** - IP address that the Scylla use to connect to other Scylla nodes in the cluster
- **endpoint_snitch** - Set the selected snitch
- **rpc_address** - Address for client connection (Thrift, CQLSH)
- **consistent_cluster_management** - ``true`` by default, can be set to ``false`` if you don't want to use Raft for consistent schema management in this cluster (will be mandatory in later versions). Check the :doc:`Raft in ScyllaDB document</architecture/raft/>` to learn more.
3. In the ``cassandra-rackdc.properties`` file, edit the rack and data center information.
The file can be found under ``/etc/scylla/``.

View File

@@ -26,6 +26,7 @@ The file can be found under ``/etc/scylla/``
- **listen_address** - IP address that Scylla used to connect to other Scylla nodes in the cluster
- **endpoint_snitch** - Set the selected snitch
- **rpc_address** - Address for client connection (Thrift, CQL)
- **consistent_cluster_management** - ``true`` by default, can be set to ``false`` if you don't want to use Raft for consistent schema management in this cluster (will be mandatory in later versions). Check the :doc:`Raft in ScyllaDB document</architecture/raft/>` to learn more.
3. This step needs to be done **only** if you are using the **GossipingPropertyFileSnitch**. If not, skip this step.
In the ``cassandra-rackdc.properties`` file, edit the parameters listed below.

View File

@@ -63,6 +63,7 @@ Perform the following steps for each node in the new cluster:
* **rpc_address** - Address for client connection (Thrift, CQL).
* **broadcast_address** - The IP address a node tells other nodes in the cluster to contact it by.
* **broadcast_rpc_address** - Default: unset. The RPC address to broadcast to drivers and other Scylla nodes. It cannot be set to 0.0.0.0. If left blank, it will be set to the value of ``rpc_address``. If ``rpc_address`` is set to 0.0.0.0, ``broadcast_rpc_address`` must be explicitly configured.
* **consistent_cluster_management** - ``true`` by default, can be set to ``false`` if you don't want to use Raft for consistent schema management in this cluster (will be mandatory in later versions). Check the :doc:`Raft in ScyllaDB document</architecture/raft/>` to learn more.
#. After you have installed and configured Scylla and edited ``scylla.yaml`` file on all the nodes, start the node specified with the ``seeds`` parameter. Then start the rest of the nodes in your cluster, one at a time, using
``sudo systemctl start scylla-server``.

View File

@@ -25,6 +25,7 @@ Login to one of the nodes in the cluster with (UN) status, collect the following
* seeds - ``cat /etc/scylla/scylla.yaml | grep seeds:``
* endpoint_snitch - ``cat /etc/scylla/scylla.yaml | grep endpoint_snitch``
* Scylla version - ``scylla --version``
* consistent_cluster_management - ``grep consistent_cluster_management /etc/scylla/scylla.yaml``
Procedure
---------

View File

@@ -66,6 +66,8 @@ Procedure
- **rpc_address** - Address for client connection (Thrift, CQL)
- **consistent_cluster_management** - set to the same value as used by your existing nodes.
#. Add the ``replace_node_first_boot`` parameter to the ``scylla.yaml`` config file on the new node. This line can be added to any place in the config file. After a successful node replacement, there is no need to remove it from the ``scylla.yaml`` file. (Note: The obsolete parameters "replace_address" and "replace_address_first_boot" are not supported and should not be used). The value of the ``replace_node_first_boot`` parameter should be the Host ID of the node to be replaced.
For example (using the Host ID of the failed node from above):

View File

@@ -68,7 +68,7 @@ Gracefully stop the node
.. code:: sh
sudo service scylla-enterprise-server stop
sudo service scylla-server stop
Download and install the new release
------------------------------------
@@ -92,13 +92,13 @@ Start the node
.. code:: sh
sudo service scylla-enterprise-server start
sudo service scylla-server start
Validate
--------
1. Check cluster status with ``nodetool status`` and make sure **all** nodes, including the one you just upgraded, are in UN status.
2. Use ``curl -X GET "http://localhost:10000/storage_service/scylla_release_version"`` to check the ScyllaDB version.
3. Check scylla-enterprise-server log (by ``journalctl _COMM=scylla``) and ``/var/log/syslog`` to validate there are no errors.
3. Check scylla-server log (by ``journalctl _COMM=scylla``) and ``/var/log/syslog`` to validate there are no errors.
4. Check again after 2 minutes to validate no new issues are introduced.
Once you are sure the node upgrade is successful, move to the next node in the cluster.
@@ -130,7 +130,7 @@ Gracefully shutdown ScyllaDB
.. code:: sh
nodetool drain
sudo service scylla-enterprise-server stop
sudo service scylla-server stop
Downgrade to the previous release
----------------------------------
@@ -164,7 +164,7 @@ Start the node
.. code:: sh
sudo service scylla-enterprise-server start
sudo service scylla-server start
Validate
--------

View File

@@ -66,7 +66,7 @@ Gracefully stop the node
.. code:: sh
sudo service scylla-enterprise-server stop
sudo service scylla-server stop
Download and install the new release
------------------------------------

View File

@@ -16,13 +16,13 @@ Start the node
.. code:: sh
sudo service scylla-enterprise-server start
sudo service scylla-server start
Validate
--------
#. Check cluster status with ``nodetool status`` and make sure **all** nodes, including the one you just upgraded, are in UN status.
#. Use ``curl -X GET "http://localhost:10000/storage_service/scylla_release_version"`` to check the ScyllaDB version.
#. Check scylla-enterprise-server log (by ``journalctl _COMM=scylla``) and ``/var/log/syslog`` to validate there are no errors.
#. Check scylla-server log (by ``journalctl _COMM=scylla``) and ``/var/log/syslog`` to validate there are no errors.
#. Check again after 2 minutes to validate no new issues are introduced.
Once you are sure the node upgrade is successful, move to the next node in the cluster.
@@ -54,7 +54,7 @@ Gracefully shutdown ScyllaDB
.. code:: sh
nodetool drain
sudo service scylla-enterprise-server stop
sudo service scylla-server stop
Downgrade to the previous release
----------------------------------
@@ -88,7 +88,7 @@ Start the node
.. code:: sh
sudo service scylla-enterprise-server start
sudo service scylla-server start
Validate
--------

View File

@@ -7,7 +7,7 @@ This document is a step-by-step procedure for upgrading from ScyllaDB Enterprise
Applicable Versions
===================
This guide covers upgrading ScyllaDB Enterprise from version 2021.1.x to ScyllaDB Enterprise version 2022.1.y on |OS|. See :doc:`OS Support by Platform and Version </getting-started/os-support>` for information about supported versions.
This guide covers upgrading ScyllaDB Enterprise from version **2021.1.8** or later to ScyllaDB Enterprise version 2022.1.y on |OS|. See :doc:`OS Support by Platform and Version </getting-started/os-support>` for information about supported versions.
Upgrade Procedure
=================
@@ -69,7 +69,7 @@ Gracefully stop the node
.. code:: sh
sudo service scylla-enterprise-server stop
sudo service scylla-server stop
Download and install the new release
------------------------------------


@@ -36,13 +36,13 @@ A new io.conf format was introduced in Scylla 2.3 and 2019.1. If your io.conf do
.. code:: sh
sudo service scylla-enterprise-server start
sudo service scylla-server start
Validate
--------
#. Check cluster status with ``nodetool status`` and make sure **all** nodes, including the one you just upgraded, are in UN status.
#. Use ``curl -X GET "http://localhost:10000/storage_service/scylla_release_version"`` to check the ScyllaDB version.
#. Check scylla-enterprise-server log (by ``journalctl _COMM=scylla``) and ``/var/log/syslog`` to validate there are no errors.
#. Check scylla-server log (by ``journalctl _COMM=scylla``) and ``/var/log/syslog`` to validate there are no errors.
#. Check again after two minutes to validate no new issues are introduced.
Once you are sure the node upgrade is successful, move to the next node in the cluster.
@@ -75,7 +75,7 @@ Gracefully shutdown ScyllaDB
.. code:: sh
nodetool drain
sudo service scylla-enterprise-server stop
sudo service scylla-server stop
Download and install the old release
------------------------------------
@@ -120,7 +120,7 @@ Start the node
.. code:: sh
sudo service scylla-enterprise-server start
sudo service scylla-server start
Validate
--------


@@ -102,7 +102,7 @@ Gracefully stop the node
.. code:: sh
sudo service scylla-enterprise-server stop
sudo service scylla-server stop
.. _upgrade-debian-ubuntu-enterprise-2022.2:
@@ -138,7 +138,7 @@ Download and install the new release
sudo apt-get clean all
sudo apt-get update
sudo apt-get dist-upgrade scylla-enterprise-server
sudo apt-get dist-upgrade scylla-enterprise
Answer y to the first two questions.
@@ -213,13 +213,13 @@ Start the node
.. code:: sh
sudo service scylla-enterprise-server start
sudo service scylla-server start
Validate
--------
#. Check cluster status with ``nodetool status`` and make sure **all** nodes, including the one you just upgraded, are in ``UN`` status.
#. Use ``curl -X GET "http://localhost:10000/storage_service/scylla_release_version"`` to check the ScyllaDB version. Validate that the version matches the one you upgraded to.
#. Check scylla-enterprise-server log (using ``journalctl _COMM=scylla``) and ``/var/log/syslog`` to validate there are no new errors in the log.
#. Check scylla-server log (using ``journalctl _COMM=scylla``) and ``/var/log/syslog`` to validate there are no new errors in the log.
#. Check again after two minutes, to validate no new issues are introduced.
Once you are sure the node upgrade was successful, move to the next node in the cluster.
@@ -260,7 +260,7 @@ Drain and gracefully stop the node
.. code:: sh
nodetool drain
sudo service scylla-enterprise-server stop
sudo service scylla-server stop
Download and install the old release
------------------------------------
@@ -359,7 +359,7 @@ Start the node
.. code:: sh
sudo service scylla-enterprise-server start
sudo service scylla-server start
Validate
--------


@@ -63,7 +63,7 @@ Stop ScyllaDB
.. code:: sh
sudo systemctl stop scylla-enterprise-server
sudo systemctl stop scylla-server
Download and install the new release
------------------------------------
@@ -84,7 +84,7 @@ Start the node
.. code:: sh
sudo systemctl start scylla-enterprise-server
sudo systemctl start scylla-server
Validate
--------
@@ -125,7 +125,7 @@ Gracefully shutdown ScyllaDB
.. code:: sh
nodetool drain
sudo systemctl stop scylla-enterprise-server
sudo systemctl stop scylla-server
Downgrade to the previous release
-----------------------------------
@@ -149,7 +149,7 @@ Start the node
.. code:: sh
sudo systemctl start scylla-enterprise-server
sudo systemctl start scylla-server
Validate
--------


@@ -1,5 +1,5 @@
Scylla Metric Update - Scylla 5.1 to 5.2
========================================
ScyllaDB Metric Update - Scylla 5.1 to 5.2
============================================
.. toctree::
:maxdepth: 2
@@ -7,8 +7,8 @@ Scylla Metric Update - Scylla 5.1 to 5.2
Scylla 5.2 Dashboards are available as part of the latest |mon_root|.
The following metrics are new in Scylla 5.2
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The following metrics are new in ScyllaDB 5.2
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. list-table::
:widths: 25 150
@@ -16,5 +16,42 @@ The following metrics are new in Scylla 5.2
* - Metric
- Description
* - TODO
- TODO
* - scylla_database_disk_reads
- Holds the number of currently active disk read operations.
* - scylla_database_sstables_read
- Holds the number of currently read sstables.
* - scylla_memory_malloc_failed
- Total count of failed memory allocations
* - scylla_raft_group0_status
- status of the raft group, 0 - disabled, 1 - normal, 2 - aborted
* - scylla_storage_proxy_coordinator_cas_read_latency_summary
- CAS read latency summary
* - scylla_storage_proxy_coordinator_cas_write_latency_summary
- CAS write latency summary
* - scylla_storage_proxy_coordinator_read_latency_summary
- Read latency summary
* - scylla_storage_proxy_coordinator_write_latency_summary
- Write latency summary
* - scylla_streaming_finished_percentage
- Finished percentage of node operation on this shard
* - scylla_view_update_generator_sstables_pending_work
- Number of bytes remaining to be processed from SSTables for view updates
The following metrics are renamed in ScyllaDB 5.2
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. list-table::
:widths: 25 150
:header-rows: 1
* - 5.1
- 5.2
* - scylla_database_active_reads_memory_consumption
- scylla_database_reads_memory_consumption
* - scylla_memory_regular_virtual_dirty_bytes
- scylla_memory_regular_unspooled_dirty_bytes
* - scylla_memory_system_virtual_dirty_bytes
- scylla_memory_system_unspooled_dirty_bytes
* - scylla_memory_virtual_dirty_bytes
- scylla_memory_unspooled_dirty_bytes
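Dashboards or alert rules that reference the 5.1 names need updating after the upgrade. A minimal sketch, assuming only the four renames in the table above: the hypothetical helper `rename_metrics` rewrites old metric names in a query or dashboard definition read from stdin:

```shell
# Rewrite the metrics renamed between 5.1 and 5.2 (per the table
# above) in text read from stdin. The more specific names come first
# so the generic scylla_memory_virtual_dirty_bytes rule cannot
# interfere with the regular_/system_ variants.
rename_metrics() {
  sed -e 's/scylla_database_active_reads_memory_consumption/scylla_database_reads_memory_consumption/g' \
      -e 's/scylla_memory_regular_virtual_dirty_bytes/scylla_memory_regular_unspooled_dirty_bytes/g' \
      -e 's/scylla_memory_system_virtual_dirty_bytes/scylla_memory_system_unspooled_dirty_bytes/g' \
      -e 's/scylla_memory_virtual_dirty_bytes/scylla_memory_unspooled_dirty_bytes/g'
}

# Intended usage:
#   rename_metrics < dashboard.json > dashboard-5.2.json
```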


@@ -67,7 +67,11 @@ Apply the following procedure **serially** on each node. Do not move to the next
If you enabled consistent cluster management in each node's configuration file, then as soon as every node has been upgraded to the new version, the cluster will start a procedure which initializes the Raft algorithm for consistent cluster metadata management.
You must then :ref:`verify <validate-raft-setup>` that this procedure successfully finishes.
.. note:: Before upgrading, make sure to use the latest `ScyllaDB Monitoring <https://monitoring.docs.scylladb.com/>`_ stack.
.. note::
If you use the `ScyllaDB Monitoring Stack <https://monitoring.docs.scylladb.com/>`_, we recommend upgrading the Monitoring Stack to the latest version **before** upgrading ScyllaDB.
For ScyllaDB 5.2, you MUST upgrade the Monitoring Stack to version 4.3 or later.
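The Monitoring Stack version requirement can be checked mechanically before starting the upgrade. A minimal sketch with a hypothetical helper; obtaining the installed version string is out of scope here:

```shell
# Succeeds (exit 0) if a "major.minor" version string is at least 4.3,
# the minimum Monitoring Stack version required for ScyllaDB 5.2.
monitoring_version_ok() {
  awk -v v="$1" 'BEGIN {
    split(v, a, ".")
    # Accept any major > 4, or major 4 with minor >= 3.
    exit (a[1] > 4 || (a[1] == 4 && a[2] >= 3)) ? 0 : 1
  }'
}

# Intended usage:
#   monitoring_version_ok "$installed_version" || echo "upgrade Monitoring first"
```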
Upgrade Steps
=============


@@ -175,7 +175,7 @@ class built_indexes_virtual_reader {
}
virtual future<> fast_forward_to(position_range range) override {
forward_buffer_to(range.start());
clear_buffer();
_end_of_stream = false;
// range contains index names (e.g., xyz) but the underlying table
// contains view names (e.g., xyz_index) so we need to add the


@@ -476,7 +476,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
// We need to have the entire app config to run the app, but we need to
// run the app to read the config file with UDF specific options so that
// we know whether we need to reserve additional memory for UDFs.
app_cfg.reserve_additional_memory = 50 * 1024 * 1024;
app_cfg.reserve_additional_memory = db::config::wasm_udf_reserved_memory;
app_template app(std::move(app_cfg));
auto ext = std::make_shared<db::extensions>();


@@ -177,7 +177,6 @@ private:
template <typename Consumer, typename GCConsumer>
requires CompactedFragmentsConsumerV2<Consumer> && CompactedFragmentsConsumerV2<GCConsumer>
stop_iteration do_consume(range_tombstone_change&& rtc, Consumer& consumer, GCConsumer& gc_consumer) {
_validator(mutation_fragment_v2::kind::range_tombstone_change, rtc.position(), rtc.tombstone());
stop_iteration gc_consumer_stop = stop_iteration::no;
stop_iteration consumer_stop = stop_iteration::no;
if (rtc.tombstone() <= _partition_tombstone) {
@@ -199,6 +198,7 @@ private:
partition_is_not_empty(consumer);
_current_emitted_tombstone = rtc.tombstone();
consumer_stop = consumer.consume(std::move(rtc));
_validator(mutation_fragment_v2::kind::range_tombstone_change, rtc.position(), rtc.tombstone());
}
return gc_consumer_stop || consumer_stop;
}


@@ -1144,7 +1144,7 @@ future<> server_impl::applier_fiber() {
co_await _state_machine->apply(std::move(commands));
} catch (abort_requested_exception& e) {
logger.info("[{}] applier fiber stopped because state machine was aborted: {}", _id, e);
co_return;
throw stop_apply_fiber{};
} catch (...) {
std::throw_with_nested(raft::state_machine_error{});
}


@@ -383,6 +383,10 @@ reader_concurrency_semaphore& reader_permit::semaphore() {
return _impl->semaphore();
}
reader_permit::state reader_permit::get_state() const {
return _impl->get_state();
}
bool reader_permit::needs_readmission() const {
return _impl->needs_readmission();
}
@@ -771,10 +775,7 @@ bool reader_concurrency_semaphore::try_evict_one_inactive_read(evict_reason reas
void reader_concurrency_semaphore::clear_inactive_reads() {
while (!_inactive_reads.empty()) {
auto& ir = _inactive_reads.front();
close_reader(std::move(ir.reader));
// Destroying the read unlinks it too.
std::unique_ptr<inactive_read> _(&*_inactive_reads.begin());
evict(_inactive_reads.front(), evict_reason::manual);
}
}


@@ -130,6 +130,8 @@ public:
reader_concurrency_semaphore& semaphore();
state get_state() const;
bool needs_readmission() const;
// Call only when needs_readmission() = true.
@@ -184,6 +186,8 @@ public:
reader_resources resources() const { return _resources; }
};
std::ostream& operator<<(std::ostream& os, reader_permit::state s);
/// Mark a permit as used.
///
/// Conceptually, a permit is considered used, when at least one reader


@@ -733,7 +733,7 @@ future<> merging_reader<P>::fast_forward_to(const dht::partition_range& pr) {
template <FragmentProducer P>
future<> merging_reader<P>::fast_forward_to(position_range pr) {
forward_buffer_to(pr.start());
clear_buffer();
_end_of_stream = false;
return _merger.fast_forward_to(std::move(pr));
}


@@ -40,7 +40,7 @@ public:
}
virtual future<> fast_forward_to(position_range pr) override {
_end_of_stream = false;
forward_buffer_to(pr.start());
clear_buffer();
return _underlying->fast_forward_to(std::move(pr));
}
virtual future<> next_partition() override {


@@ -54,7 +54,7 @@ public:
return _rd.fast_forward_to(pr);
}
virtual future<> fast_forward_to(position_range pr) override {
forward_buffer_to(pr.start());
clear_buffer();
_end_of_stream = false;
return _rd.fast_forward_to(std::move(pr));
}


@@ -153,7 +153,6 @@ public:
void reserve_additional(size_t n) {
_buffer.reserve(_buffer.size() + n);
}
void forward_buffer_to(const position_in_partition& pos);
void clear_buffer_to_next_partition();
template<typename Source>
future<bool> fill_buffer_from(Source&);
@@ -722,7 +721,7 @@ flat_mutation_reader_v2 transform(flat_mutation_reader_v2 r, T t) {
return _reader.fast_forward_to(pr);
}
virtual future<> fast_forward_to(position_range pr) override {
forward_buffer_to(pr.start());
clear_buffer();
_end_of_stream = false;
return _reader.fast_forward_to(std::move(pr));
}


@@ -158,7 +158,7 @@ future<> foreign_reader::fast_forward_to(const dht::partition_range& pr) {
}
future<> foreign_reader::fast_forward_to(position_range pr) {
forward_buffer_to(pr.start());
clear_buffer();
_end_of_stream = false;
return forward_operation([reader = _reader.get(), pr = std::move(pr)] () {
return reader->fast_forward_to(std::move(pr));


@@ -385,11 +385,6 @@ flat_mutation_reader_v2::~flat_mutation_reader_v2() {
}
}
void flat_mutation_reader_v2::impl::forward_buffer_to(const position_in_partition& pos) {
clear_buffer();
_buffer_size = compute_buffer_size(*_schema, _buffer);
}
void flat_mutation_reader_v2::impl::clear_buffer_to_next_partition() {
auto next_partition_start = std::find_if(_buffer.begin(), _buffer.end(), [] (const mutation_fragment_v2& mf) {
return mf.is_partition_start();


@@ -167,16 +167,19 @@ flat_mutation_reader_v2 make_forwardable(flat_mutation_reader_v2 m) {
_current = std::move(pr);
_end_of_stream = false;
_current_has_content = false;
forward_buffer_to(_current.start());
clear_buffer();
return make_ready_future<>();
}
virtual future<> next_partition() override {
clear_buffer_to_next_partition();
if (!is_buffer_empty()) {
co_return;
}
_end_of_stream = false;
if (!_next || !_next->is_partition_start()) {
co_await _underlying.next_partition();
_next = {};
}
clear_buffer_to_next_partition();
_current = {
position_in_partition::for_partition_start(),
position_in_partition(position_in_partition::after_static_row_tag_t())
@@ -267,7 +270,7 @@ flat_mutation_reader_v2 make_slicing_filtering_reader(flat_mutation_reader_v2 rd
}
virtual future<> fast_forward_to(position_range pr) override {
forward_buffer_to(pr.start());
clear_buffer();
_end_of_stream = false;
return _rd.fast_forward_to(std::move(pr));
}
@@ -411,25 +414,32 @@ flat_mutation_reader_v2 make_nonforwardable(flat_mutation_reader_v2 r, bool sing
flat_mutation_reader_v2 _underlying;
bool _single_partition;
bool _static_row_done = false;
bool _partition_is_open = false;
bool is_end_end_of_underlying_stream() const {
return _underlying.is_buffer_empty() && _underlying.is_end_of_stream();
}
future<> on_end_of_underlying_stream() {
if (!_static_row_done) {
_static_row_done = true;
return _underlying.fast_forward_to(position_range::all_clustered_rows());
if (_partition_is_open) {
if (!_static_row_done) {
_static_row_done = true;
return _underlying.fast_forward_to(position_range::all_clustered_rows());
}
push_mutation_fragment(*_schema, _permit, partition_end());
reset_partition();
}
push_mutation_fragment(*_schema, _permit, partition_end());
if (_single_partition) {
_end_of_stream = true;
return make_ready_future<>();
}
return _underlying.next_partition().then([this] {
_static_row_done = false;
return _underlying.fill_buffer().then([this] {
_end_of_stream = is_end_end_of_underlying_stream();
return _underlying.next_partition().then([this] {
return _underlying.fill_buffer().then([this] {
_end_of_stream = is_end_end_of_underlying_stream();
});
});
});
}
void reset_partition() {
_partition_is_open = false;
_static_row_done = false;
}
public:
reader(flat_mutation_reader_v2 r, bool single_partition)
@@ -440,6 +450,9 @@ flat_mutation_reader_v2 make_nonforwardable(flat_mutation_reader_v2 r, bool sing
virtual future<> fill_buffer() override {
return do_until([this] { return is_end_of_stream() || is_buffer_full(); }, [this] {
return fill_buffer_from(_underlying).then([this] (bool underlying_finished) {
if (!_partition_is_open && !is_buffer_empty()) {
_partition_is_open = true;
}
if (underlying_finished) {
return on_end_of_underlying_stream();
}
@@ -452,17 +465,27 @@ flat_mutation_reader_v2 make_nonforwardable(flat_mutation_reader_v2 r, bool sing
}
virtual future<> next_partition() override {
clear_buffer_to_next_partition();
auto maybe_next_partition = make_ready_future<>();;
auto maybe_next_partition = make_ready_future<>();
if (is_buffer_empty()) {
if (_end_of_stream || (_partition_is_open && _single_partition)) {
_end_of_stream = true;
return maybe_next_partition;
}
reset_partition();
maybe_next_partition = _underlying.next_partition();
}
return maybe_next_partition.then([this] {
_end_of_stream = is_end_end_of_underlying_stream();
});
return maybe_next_partition.then([this] {
_end_of_stream = is_end_end_of_underlying_stream();
});
}
virtual future<> fast_forward_to(const dht::partition_range& pr) override {
_end_of_stream = false;
clear_buffer();
if (_single_partition) {
_end_of_stream = true;
return make_ready_future<>();
}
reset_partition();
_end_of_stream = false;
return _underlying.fast_forward_to(pr);
}
virtual future<> close() noexcept override {
@@ -1532,7 +1555,7 @@ public:
return _reader.fast_forward_to(pr);
}
virtual future<> fast_forward_to(position_range pr) override {
forward_buffer_to(pr.start());
clear_buffer();
_end_of_stream = false;
return _reader.fast_forward_to(std::move(pr));
}


@@ -1253,10 +1253,13 @@ void table::set_compaction_strategy(sstables::compaction_strategy_type strategy)
cg.get_backlog_tracker().copy_ongoing_charges(new_bt, move_read_charges);
new_sstables = make_lw_shared<sstables::sstable_set>(new_cs.make_sstable_set(t._schema));
cg.main_sstables()->for_each_sstable([this] (const sstables::shared_sstable& s) {
add_sstable_to_backlog_tracker(new_bt, s);
std::vector<sstables::shared_sstable> new_sstables_for_backlog_tracker;
new_sstables_for_backlog_tracker.reserve(cg.main_sstables()->all()->size());
cg.main_sstables()->for_each_sstable([this, &new_sstables_for_backlog_tracker] (const sstables::shared_sstable& s) {
new_sstables->insert(s);
new_sstables_for_backlog_tracker.push_back(s);
});
new_bt.replace_sstables({}, std::move(new_sstables_for_backlog_tracker));
}
void execute() noexcept {


@@ -347,8 +347,9 @@ future<> read_context::create_underlying() {
});
}
static flat_mutation_reader_v2 read_directly_from_underlying(read_context& reader) {
static flat_mutation_reader_v2 read_directly_from_underlying(read_context& reader, mutation_fragment_v2 partition_start) {
auto res = make_delegating_reader(reader.underlying().underlying());
res.unpop_mutation_fragment(std::move(partition_start));
res.upgrade_schema(reader.schema());
return make_nonforwardable(std::move(res), true);
}
@@ -381,8 +382,7 @@ private:
});
} else {
_cache._tracker.on_mispopulate();
_reader = read_directly_from_underlying(*_read_context);
this->push_mutation_fragment(std::move(*mfopt));
_reader = read_directly_from_underlying(*_read_context, std::move(*mfopt));
}
});
});
@@ -507,15 +507,13 @@ public:
, _read_context(ctx)
{}
using read_result = std::tuple<flat_mutation_reader_v2_opt, mutation_fragment_v2_opt>;
future<read_result> operator()() {
future<flat_mutation_reader_v2_opt> operator()() {
return _reader.move_to_next_partition().then([this] (auto&& mfopt) mutable {
{
if (!mfopt) {
return _cache._read_section(_cache._tracker.region(), [&] {
this->handle_end_of_stream();
return make_ready_future<read_result>(read_result(std::nullopt, std::nullopt));
return make_ready_future<flat_mutation_reader_v2_opt>(std::nullopt);
});
}
_cache.on_partition_miss();
@@ -526,14 +524,12 @@ public:
cache_entry& e = _cache.find_or_create_incomplete(ps, _reader.creation_phase(),
this->can_set_continuity() ? &*_last_key : nullptr);
_last_key = row_cache::previous_entry_pointer(key);
return make_ready_future<read_result>(
read_result(e.read(_cache, _read_context, _reader.creation_phase()), std::nullopt));
return make_ready_future<flat_mutation_reader_v2_opt>(e.read(_cache, _read_context, _reader.creation_phase()));
});
} else {
_cache._tracker.on_mispopulate();
_last_key = row_cache::previous_entry_pointer(key);
return make_ready_future<read_result>(
read_result(read_directly_from_underlying(_read_context), std::move(mfopt)));
return make_ready_future<flat_mutation_reader_v2_opt>(read_directly_from_underlying(_read_context, std::move(*mfopt)));
}
}
});
@@ -637,12 +633,8 @@ private:
}
future<flat_mutation_reader_v2_opt> read_from_secondary() {
return _secondary_reader().then([this] (range_populating_reader::read_result&& res) {
auto&& [fropt, ps] = res;
return _secondary_reader().then([this] (flat_mutation_reader_v2_opt&& fropt) {
if (fropt) {
if (ps) {
push_mutation_fragment(std::move(*ps));
}
return make_ready_future<flat_mutation_reader_v2_opt>(std::move(fropt));
} else {
_secondary_in_progress = false;


@@ -63,4 +63,15 @@ MemoryLimit=$MEMORY_LIMIT
EOS
fi
if [ -e /etc/systemd/system/systemd-coredump@.service.d/timeout.conf ]; then
COREDUMP_RUNTIME_MAX=$(grep RuntimeMaxSec /etc/systemd/system/systemd-coredump@.service.d/timeout.conf)
if [ -z $COREDUMP_RUNTIME_MAX ]; then
cat << EOS > /etc/systemd/system/systemd-coredump@.service.d/timeout.conf
[Service]
RuntimeMaxSec=infinity
TimeoutSec=infinity
EOS
fi
fi
systemctl --system daemon-reload >/dev/null || true


@@ -103,7 +103,7 @@ future<prepare_response> paxos_state::prepare(storage_proxy& sp, tracing::trace_
auto ex = f2.get_exception();
logger.debug("Failed to get data or digest: {}. Ignored.", std::move(ex));
}
auto upgrade_if_needed = [schema = std::move(schema)] (std::optional<proposal> p) mutable {
auto upgrade_if_needed = [schema = std::move(schema)] (std::optional<proposal> p) {
if (!p || p->update.schema_version() == schema->version()) {
return make_ready_future<std::optional<proposal>>(std::move(p));
}
@@ -115,7 +115,7 @@ future<prepare_response> paxos_state::prepare(storage_proxy& sp, tracing::trace_
// for that version and upgrade the mutation with it.
logger.debug("Stored mutation references outdated schema version. "
"Trying to upgrade the accepted proposal mutation to the most recent schema version.");
return service::get_column_mapping(p->update.column_family_id(), p->update.schema_version()).then([schema = std::move(schema), p = std::move(p)] (const column_mapping& cm) {
return service::get_column_mapping(p->update.column_family_id(), p->update.schema_version()).then([schema, p = std::move(p)] (const column_mapping& cm) {
return make_ready_future<std::optional<proposal>>(proposal(p->ballot, freeze(p->update.unfreeze_upgrading(schema, cm))));
});
};


@@ -969,7 +969,11 @@ with_timeout(abort_source& as, db::timeout_clock::duration d, F&& fun) {
// FIXME: using lambda as workaround for clang bug #50345 (miscompiling coroutine templates).
auto impl = [] (abort_source& as, db::timeout_clock::duration d, F&& fun) -> future_t {
abort_source timeout_src;
auto sub = as.subscribe([&timeout_src] () noexcept { timeout_src.request_abort(); });
auto sub = as.subscribe([&timeout_src] () noexcept {
if (!timeout_src.abort_requested()) {
timeout_src.request_abort();
}
});
if (!sub) {
throw abort_requested_exception{};
}


@@ -1465,7 +1465,7 @@ public:
// If _ds is not created then next_partition() has no effect because there was no partition_start emitted yet.
}
virtual future<> fast_forward_to(position_range cr) override {
forward_buffer_to(cr.start());
clear_buffer();
if (!_partition_finished) {
_end_of_stream = false;
return advance_context(_consumer.fast_forward_to(std::move(cr)));


@@ -1653,7 +1653,7 @@ public:
// If _ds is not created then next_partition() has no effect because there was no partition_start emitted yet.
}
virtual future<> fast_forward_to(position_range cr) override {
forward_buffer_to(cr.start());
clear_buffer();
if (!_partition_finished) {
_end_of_stream = false;
return advance_context(_consumer.fast_forward_to(std::move(cr)));


@@ -2518,9 +2518,14 @@ static future<bool> do_validate_uncompressed(input_stream<char>& stream, const c
offset += buf.size();
}
if (!stream.eof()) {
sstlog.error("Chunk count mismatch between CRC.db and Data.db at offset {}: expected {} chunks but data file has more", offset, checksum.checksums.size());
valid = false;
{
// We should be at EOF here, but the flag might not be set yet. To ensure
// it is set, try to read some more. This should return an empty buffer.
auto buf = co_await stream.read();
if (!buf.empty()) {
sstlog.error("Chunk count mismatch between CRC.db and Data.db at offset {}: expected {} chunks but data file has more", offset, checksum.checksums.size());
valid = false;
}
}
if (actual_full_checksum != expected_digest) {


@@ -136,7 +136,9 @@ struct sstable_open_config {
// fields respectively. Problematic sstables might fail to load. Set to
// false if you want to disable this, to be able to read such sstables.
// Should only be disabled for diagnostics purposes.
bool load_first_and_last_position_metadata = true;
// FIXME: Enable it by default once the root cause of large allocation when reading sstable in reverse is fixed.
// Ref: https://github.com/scylladb/scylladb/issues/11642
bool load_first_and_last_position_metadata = false;
};
class sstable : public enable_lw_shared_from_this<sstable> {

test.py

@@ -343,7 +343,16 @@ class PythonTestSuite(TestSuite):
pool_size = cfg.get("pool_size", 2)
self.create_cluster = self.get_cluster_factory(cluster_size)
self.clusters = Pool(pool_size, self.create_cluster)
async def recycle_cluster(cluster: ScyllaCluster) -> None:
"""When a dirty cluster is returned to the cluster pool,
stop it and release the used IPs. We don't necessarily uninstall() it yet,
which would delete the log file and directory - we might want to preserve
these if it came from a failed test.
"""
await cluster.stop()
await cluster.release_ips()
self.clusters = Pool(pool_size, self.create_cluster, recycle_cluster)
def get_cluster_factory(self, cluster_size: int) -> Callable[..., Awaitable]:
def create_server(create_cfg: ScyllaCluster.CreateServerParams):
@@ -686,7 +695,8 @@ class CQLApprovalTest(Test):
if self.server_log is not None:
logger.info("Server log:\n%s", self.server_log)
async with self.suite.clusters.instance(logger) as cluster:
# TODO: consider dirty_on_exception=True
async with self.suite.clusters.instance(False, logger) as cluster:
try:
cluster.before_test(self.uname)
logger.info("Leasing Scylla cluster %s for test %s", cluster, self.uname)
@@ -842,26 +852,32 @@ class PythonTest(Test):
loggerPrefix = self.mode + '/' + self.uname
logger = LogPrefixAdapter(logging.getLogger(loggerPrefix), {'prefix': loggerPrefix})
async with self.suite.clusters.instance(logger) as cluster:
try:
cluster.before_test(self.uname)
logger.info("Leasing Scylla cluster %s for test %s", cluster, self.uname)
self.args.insert(0, "--host={}".format(cluster.endpoint()))
self.is_before_test_ok = True
cluster.take_log_savepoint()
status = await run_test(self, options)
cluster.after_test(self.uname)
self.is_after_test_ok = True
self.success = status
except Exception as e:
self.server_log = cluster.read_server_log()
self.server_log_filename = cluster.server_log_filename()
if self.is_before_test_ok is False:
print("Test {} pre-check failed: {}".format(self.name, str(e)))
print("Server log of the first server:\n{}".format(self.server_log))
# Don't try to continue if the cluster is broken
raise
logger.info("Test %s %s", self.uname, "succeeded" if self.success else "failed ")
cluster = await self.suite.clusters.get(logger)
try:
cluster.before_test(self.uname)
logger.info("Leasing Scylla cluster %s for test %s", cluster, self.uname)
self.args.insert(0, "--host={}".format(cluster.endpoint()))
self.is_before_test_ok = True
cluster.take_log_savepoint()
status = await run_test(self, options)
cluster.after_test(self.uname)
self.is_after_test_ok = True
self.success = status
except Exception as e:
self.server_log = cluster.read_server_log()
self.server_log_filename = cluster.server_log_filename()
if self.is_before_test_ok is False:
print("Test {} pre-check failed: {}".format(self.name, str(e)))
print("Server log of the first server:\n{}".format(self.server_log))
logger.info(f"Discarding cluster after failed start for test %s...", self.name)
elif self.is_after_test_ok is False:
print("Test {} post-check failed: {}".format(self.name, str(e)))
print("Server log of the first server:\n{}".format(self.server_log))
logger.info(f"Discarding cluster after failed test %s...", self.name)
await self.suite.clusters.put(cluster, is_dirty=True)
else:
await self.suite.clusters.put(cluster, is_dirty=False)
logger.info("Test %s %s", self.uname, "succeeded" if self.success else "failed ")
return self
def write_junit_failure_report(self, xml_res: ET.Element) -> None:


@@ -38,6 +38,7 @@
#include "readers/from_fragments_v2.hh"
#include "readers/forwardable_v2.hh"
#include "readers/compacting.hh"
#include "readers/nonforwardable.hh"
struct mock_consumer {
struct result {
@@ -110,193 +111,187 @@ static size_t count_fragments(mutation m) {
return res;
}
SEASTAR_TEST_CASE(test_flat_mutation_reader_consume_single_partition) {
return seastar::async([] {
tests::reader_concurrency_semaphore_wrapper semaphore;
for_each_mutation([&] (const mutation& m) {
size_t fragments_in_m = count_fragments(m);
for (size_t depth = 1; depth <= fragments_in_m + 1; ++depth) {
auto r = make_flat_mutation_reader_from_mutations_v2(m.schema(), semaphore.make_permit(), m);
auto close_reader = deferred_close(r);
auto result = r.consume(mock_consumer(*m.schema(), semaphore.make_permit(), depth)).get0();
BOOST_REQUIRE(result._consume_end_of_stream_called);
BOOST_REQUIRE_EQUAL(1, result._consume_new_partition_call_count);
BOOST_REQUIRE_EQUAL(1, result._consume_end_of_partition_call_count);
BOOST_REQUIRE_EQUAL(m.partition().partition_tombstone() ? 1 : 0, result._consume_tombstone_call_count);
auto r2 = assert_that(make_flat_mutation_reader_from_mutations_v2(m.schema(), semaphore.make_permit(), m));
r2.produces_partition_start(m.decorated_key(), m.partition().partition_tombstone());
if (result._fragments.empty()) {
continue;
}
for (auto& mf : result._fragments) {
r2.produces(*m.schema(), mf);
}
SEASTAR_THREAD_TEST_CASE(test_flat_mutation_reader_consume_single_partition) {
tests::reader_concurrency_semaphore_wrapper semaphore;
for_each_mutation([&] (const mutation& m) {
size_t fragments_in_m = count_fragments(m);
for (size_t depth = 1; depth <= fragments_in_m + 1; ++depth) {
auto r = make_flat_mutation_reader_from_mutations_v2(m.schema(), semaphore.make_permit(), m);
auto close_reader = deferred_close(r);
auto result = r.consume(mock_consumer(*m.schema(), semaphore.make_permit(), depth)).get0();
BOOST_REQUIRE(result._consume_end_of_stream_called);
BOOST_REQUIRE_EQUAL(1, result._consume_new_partition_call_count);
BOOST_REQUIRE_EQUAL(1, result._consume_end_of_partition_call_count);
BOOST_REQUIRE_EQUAL(m.partition().partition_tombstone() ? 1 : 0, result._consume_tombstone_call_count);
auto r2 = assert_that(make_flat_mutation_reader_from_mutations_v2(m.schema(), semaphore.make_permit(), m));
r2.produces_partition_start(m.decorated_key(), m.partition().partition_tombstone());
if (result._fragments.empty()) {
continue;
}
});
for (auto& mf : result._fragments) {
r2.produces(*m.schema(), mf);
}
}
});
}
SEASTAR_TEST_CASE(test_flat_mutation_reader_consume_two_partitions) {
return seastar::async([] {
tests::reader_concurrency_semaphore_wrapper semaphore;
auto test = [&semaphore] (mutation m1, mutation m2) {
size_t fragments_in_m1 = count_fragments(m1);
size_t fragments_in_m2 = count_fragments(m2);
for (size_t depth = 1; depth < fragments_in_m1; ++depth) {
auto r = make_flat_mutation_reader_from_mutations_v2(m1.schema(), semaphore.make_permit(), {m1, m2});
auto close_r = deferred_close(r);
auto result = r.consume(mock_consumer(*m1.schema(), semaphore.make_permit(), depth)).get0();
BOOST_REQUIRE(result._consume_end_of_stream_called);
BOOST_REQUIRE_EQUAL(1, result._consume_new_partition_call_count);
BOOST_REQUIRE_EQUAL(1, result._consume_end_of_partition_call_count);
BOOST_REQUIRE_EQUAL(m1.partition().partition_tombstone() ? 1 : 0, result._consume_tombstone_call_count);
auto r2 = make_flat_mutation_reader_from_mutations_v2(m1.schema(), semaphore.make_permit(), {m1, m2});
auto close_r2 = deferred_close(r2);
auto start = r2().get0();
BOOST_REQUIRE(start);
BOOST_REQUIRE(start->is_partition_start());
for (auto& mf : result._fragments) {
auto mfopt = r2().get0();
BOOST_REQUIRE(mfopt);
BOOST_REQUIRE(mf.equal(*m1.schema(), *mfopt));
}
SEASTAR_THREAD_TEST_CASE(test_flat_mutation_reader_consume_two_partitions) {
tests::reader_concurrency_semaphore_wrapper semaphore;
auto test = [&semaphore] (mutation m1, mutation m2) {
size_t fragments_in_m1 = count_fragments(m1);
size_t fragments_in_m2 = count_fragments(m2);
for (size_t depth = 1; depth < fragments_in_m1; ++depth) {
auto r = make_flat_mutation_reader_from_mutations_v2(m1.schema(), semaphore.make_permit(), {m1, m2});
auto close_r = deferred_close(r);
auto result = r.consume(mock_consumer(*m1.schema(), semaphore.make_permit(), depth)).get0();
BOOST_REQUIRE(result._consume_end_of_stream_called);
BOOST_REQUIRE_EQUAL(1, result._consume_new_partition_call_count);
BOOST_REQUIRE_EQUAL(1, result._consume_end_of_partition_call_count);
BOOST_REQUIRE_EQUAL(m1.partition().partition_tombstone() ? 1 : 0, result._consume_tombstone_call_count);
auto r2 = make_flat_mutation_reader_from_mutations_v2(m1.schema(), semaphore.make_permit(), {m1, m2});
auto close_r2 = deferred_close(r2);
auto start = r2().get0();
BOOST_REQUIRE(start);
BOOST_REQUIRE(start->is_partition_start());
for (auto& mf : result._fragments) {
auto mfopt = r2().get0();
BOOST_REQUIRE(mfopt);
BOOST_REQUIRE(mf.equal(*m1.schema(), *mfopt));
}
}
for (size_t depth = fragments_in_m1; depth < fragments_in_m1 + fragments_in_m2 + 1; ++depth) {
auto r = make_flat_mutation_reader_from_mutations_v2(m1.schema(), semaphore.make_permit(), {m1, m2});
auto close_r = deferred_close(r);
auto result = r.consume(mock_consumer(*m1.schema(), semaphore.make_permit(), depth)).get0();
BOOST_REQUIRE(result._consume_end_of_stream_called);
BOOST_REQUIRE_EQUAL(2, result._consume_new_partition_call_count);
BOOST_REQUIRE_EQUAL(2, result._consume_end_of_partition_call_count);
size_t tombstones_count = 0;
if (m1.partition().partition_tombstone()) {
++tombstones_count;
}
if (m2.partition().partition_tombstone()) {
++tombstones_count;
}
BOOST_REQUIRE_EQUAL(tombstones_count, result._consume_tombstone_call_count);
auto r2 = make_flat_mutation_reader_from_mutations_v2(m1.schema(), semaphore.make_permit(), {m1, m2});
auto close_r2 = deferred_close(r2);
auto start = r2().get0();
BOOST_REQUIRE(start);
BOOST_REQUIRE(start->is_partition_start());
for (auto& mf : result._fragments) {
auto mfopt = r2().get0();
BOOST_REQUIRE(mfopt);
if (mfopt->is_partition_start() || mfopt->is_end_of_partition()) {
mfopt = r2().get0();
}
BOOST_REQUIRE(mfopt);
BOOST_REQUIRE(mf.equal(*m1.schema(), *mfopt));
}
}
};
for_each_mutation_pair([&] (auto&& m, auto&& m2, are_equal) {
if (m.decorated_key().less_compare(*m.schema(), m2.decorated_key())) {
test(m, m2);
} else if (m2.decorated_key().less_compare(*m.schema(), m.decorated_key())) {
test(m2, m);
}
});
}
SEASTAR_THREAD_TEST_CASE(test_fragmenting_and_freezing) {
tests::reader_concurrency_semaphore_wrapper semaphore;
for_each_mutation([&] (const mutation& m) {
std::vector<frozen_mutation> fms;
fragment_and_freeze(make_flat_mutation_reader_from_mutations_v2(m.schema(), semaphore.make_permit(), { mutation(m) }), [&] (auto fm, bool frag) {
BOOST_REQUIRE(!frag);
fms.emplace_back(std::move(fm));
return make_ready_future<stop_iteration>(stop_iteration::no);
}, std::numeric_limits<size_t>::max()).get0();
BOOST_REQUIRE_EQUAL(fms.size(), 1);
auto m1 = fms.back().unfreeze(m.schema());
BOOST_REQUIRE_EQUAL(m, m1);
fms.clear();
std::optional<bool> fragmented;
fragment_and_freeze(make_flat_mutation_reader_from_mutations_v2(m.schema(), semaphore.make_permit(), { mutation(m) }), [&] (auto fm, bool frag) {
BOOST_REQUIRE(!fragmented || *fragmented == frag);
*fragmented = frag;
fms.emplace_back(std::move(fm));
return make_ready_future<stop_iteration>(stop_iteration::no);
}, 1).get0();
auto&& rows = m.partition().non_dummy_rows();
auto expected_fragments = std::distance(rows.begin(), rows.end())
+ m.partition().row_tombstones().size()
+ !m.partition().static_row().empty();
BOOST_REQUIRE_EQUAL(fms.size(), std::max(expected_fragments, size_t(1)));
BOOST_REQUIRE(expected_fragments < 2 || *fragmented);
auto m2 = fms.back().unfreeze(m.schema());
fms.pop_back();
mutation_application_stats app_stats;
while (!fms.empty()) {
m2.partition().apply(*m.schema(), fms.back().partition(), *m.schema(), app_stats);
fms.pop_back();
}
BOOST_REQUIRE_EQUAL(m, m2);
});
auto test_random_streams = [&semaphore] (random_mutation_generator&& gen) {
for (auto i = 0; i < 4; i++) {
auto muts = gen(4);
auto s = muts[0].schema();
std::vector<frozen_mutation> frozen;
// Freeze all
fragment_and_freeze(make_flat_mutation_reader_from_mutations_v2(gen.schema(), semaphore.make_permit(), muts), [&] (auto fm, bool frag) {
BOOST_REQUIRE(!frag);
frozen.emplace_back(fm);
return make_ready_future<stop_iteration>(stop_iteration::no);
}, std::numeric_limits<size_t>::max()).get0();
BOOST_REQUIRE_EQUAL(muts.size(), frozen.size());
for (auto j = 0u; j < muts.size(); j++) {
BOOST_REQUIRE_EQUAL(muts[j], frozen[j].unfreeze(s));
}
// Freeze first
frozen.clear();
fragment_and_freeze(make_flat_mutation_reader_from_mutations_v2(gen.schema(), semaphore.make_permit(), muts), [&] (auto fm, bool frag) {
BOOST_REQUIRE(!frag);
frozen.emplace_back(fm);
return make_ready_future<stop_iteration>(stop_iteration::yes);
}, std::numeric_limits<size_t>::max()).get0();
BOOST_REQUIRE_EQUAL(frozen.size(), 1);
BOOST_REQUIRE_EQUAL(muts[0], frozen[0].unfreeze(s));
// Fragment and freeze all
frozen.clear();
fragment_and_freeze(make_flat_mutation_reader_from_mutations_v2(gen.schema(), semaphore.make_permit(), muts), [&] (auto fm, bool frag) {
frozen.emplace_back(fm);
return make_ready_future<stop_iteration>(stop_iteration::no);
}, 1).get0();
std::vector<mutation> unfrozen;
while (!frozen.empty()) {
auto m = frozen.front().unfreeze(s);
frozen.erase(frozen.begin());
if (unfrozen.empty() || !unfrozen.back().decorated_key().equal(*s, m.decorated_key())) {
unfrozen.emplace_back(std::move(m));
} else {
unfrozen.back().apply(std::move(m));
}
}
BOOST_REQUIRE_EQUAL(muts, unfrozen);
}
};
test_random_streams(random_mutation_generator(random_mutation_generator::generate_counters::no));
test_random_streams(random_mutation_generator(random_mutation_generator::generate_counters::yes));
}
SEASTAR_THREAD_TEST_CASE(test_flat_mutation_reader_move_buffer_content_to) {
@@ -371,111 +366,109 @@ SEASTAR_THREAD_TEST_CASE(test_flat_mutation_reader_move_buffer_content_to) {
.is_equal_to(mut_orig);
}
SEASTAR_THREAD_TEST_CASE(test_multi_range_reader) {
simple_schema s;
tests::reader_concurrency_semaphore_wrapper semaphore;
auto permit = semaphore.make_permit();
auto keys = s.make_pkeys(10);
auto ring = s.to_ring_positions(keys);
auto crs = boost::copy_range<std::vector<mutation_fragment>>(boost::irange(0, 3) | boost::adaptors::transformed([&] (auto n) {
return s.make_row(permit, s.make_ckey(n), "value");
}));
auto ms = boost::copy_range<std::vector<mutation>>(keys | boost::adaptors::transformed([&] (auto& key) {
auto m = mutation(s.schema(), key);
for (auto& mf : crs) {
m.apply(mf);
}
return m;
}));
auto source = mutation_source([&] (schema_ptr, reader_permit permit, const dht::partition_range& range) {
return make_flat_mutation_reader_from_mutations_v2(s.schema(), std::move(permit), ms, range);
});
const auto empty_ranges = dht::partition_range_vector{};
const auto single_ranges = dht::partition_range_vector{
dht::partition_range::make(ring[1], ring[2]),
};
const auto multiple_ranges = dht::partition_range_vector {
dht::partition_range::make(ring[1], ring[2]),
dht::partition_range::make_singular(ring[4]),
dht::partition_range::make(ring[6], ring[8]),
};
const auto empty_generator = [] { return std::optional<dht::partition_range>{}; };
const auto single_generator = [r = std::optional<dht::partition_range>(single_ranges.front())] () mutable {
return std::exchange(r, {});
};
const auto multiple_generator = [it = multiple_ranges.cbegin(), end = multiple_ranges.cend()] () mutable -> std::optional<dht::partition_range> {
if (it == end) {
return std::nullopt;
}
return *(it++);
};
auto fft_range = dht::partition_range::make_starting_with(ring[9]);
// Generator ranges are single pass, so we need a new range each time they are used.
auto run_test = [&] (auto make_empty_ranges, auto make_single_ranges, auto make_multiple_ranges) {
testlog.info("empty ranges");
assert_that(make_flat_multi_range_reader(s.schema(), semaphore.make_permit(), source, make_empty_ranges(), s.schema()->full_slice()))
.produces_end_of_stream()
.fast_forward_to(fft_range)
.produces(ms[9])
.produces_end_of_stream();
testlog.info("single range");
assert_that(make_flat_multi_range_reader(s.schema(), semaphore.make_permit(), source, make_single_ranges(), s.schema()->full_slice()))
.produces(ms[1])
.produces(ms[2])
.produces_end_of_stream()
.fast_forward_to(fft_range)
.produces(ms[9])
.produces_end_of_stream();
testlog.info("read full partitions and fast forward");
assert_that(make_flat_multi_range_reader(s.schema(), semaphore.make_permit(), source, make_multiple_ranges(), s.schema()->full_slice()))
.produces(ms[1])
.produces(ms[2])
.produces(ms[4])
.produces(ms[6])
.fast_forward_to(fft_range)
.produces(ms[9])
.produces_end_of_stream();
testlog.info("read, skip partitions and fast forward");
assert_that(make_flat_multi_range_reader(s.schema(), semaphore.make_permit(), source, make_multiple_ranges(), s.schema()->full_slice()))
.produces_partition_start(keys[1])
.next_partition()
.produces_partition_start(keys[2])
.produces_row_with_key(crs[0].as_clustering_row().key())
.next_partition()
.produces(ms[4])
.next_partition()
.produces_partition_start(keys[6])
.produces_row_with_key(crs[0].as_clustering_row().key())
.produces_row_with_key(crs[1].as_clustering_row().key())
.fast_forward_to(fft_range)
.next_partition()
.produces_partition_start(keys[9])
.next_partition()
.produces_end_of_stream();
};
testlog.info("vector version");
run_test(
[&] { return empty_ranges; },
[&] { return single_ranges; },
[&] { return multiple_ranges; });
testlog.info("generator version");
run_test(
[&] { return empty_generator; },
[&] { return single_generator; },
[&] { return multiple_generator; });
}
using reversed_partitions = seastar::bool_class<class reversed_partitions_tag>;
@@ -648,95 +641,290 @@ void test_flat_stream(schema_ptr s, std::vector<mutation> muts, reversed_partiti
}
}
SEASTAR_THREAD_TEST_CASE(test_consume_flat) {
auto test_random_streams = [&] (random_mutation_generator&& gen) {
for (auto i = 0; i < 4; i++) {
auto muts = gen(4);
test_flat_stream(gen.schema(), muts, reversed_partitions::no, in_thread::no);
test_flat_stream(gen.schema(), muts, reversed_partitions::yes, in_thread::no);
test_flat_stream(gen.schema(), muts, reversed_partitions::no, in_thread::yes);
}
};
test_random_streams(random_mutation_generator(random_mutation_generator::generate_counters::no));
test_random_streams(random_mutation_generator(random_mutation_generator::generate_counters::yes));
}
SEASTAR_THREAD_TEST_CASE(test_make_forwardable) {
simple_schema s;
tests::reader_concurrency_semaphore_wrapper semaphore;
auto permit = semaphore.make_permit();
auto keys = s.make_pkeys(10);
auto crs = boost::copy_range < std::vector <
mutation_fragment >> (boost::irange(0, 3) | boost::adaptors::transformed([&](auto n) {
return s.make_row(permit, s.make_ckey(n), "value");
}));
auto ms = boost::copy_range < std::vector < mutation >> (keys | boost::adaptors::transformed([&](auto &key) {
auto m = mutation(s.schema(), key);
for (auto &mf : crs) {
m.apply(mf);
}
return m;
}));
auto make_reader = [&] (auto& range) {
return assert_that(
make_forwardable(make_flat_mutation_reader_from_mutations_v2(s.schema(), semaphore.make_permit(), ms, range, streamed_mutation::forwarding::no)));
};
auto test = [&] (auto& rd, auto& partition) {
rd.produces_partition_start(partition.decorated_key(), partition.partition().partition_tombstone());
rd.produces_end_of_stream();
rd.fast_forward_to(position_range::all_clustered_rows());
for (auto &row : partition.partition().clustered_rows()) {
rd.produces_row_with_key(row.key());
}
rd.produces_end_of_stream();
rd.next_partition();
};
auto rd = make_reader(query::full_partition_range);
for (auto& partition : ms) {
test(rd, partition);
}
auto single_range = dht::partition_range::make_singular(ms[0].decorated_key());
auto rd2 = make_reader(single_range);
rd2.produces_partition_start(ms[0].decorated_key(), ms[0].partition().partition_tombstone());
rd2.produces_end_of_stream();
rd2.fast_forward_to(position_range::all_clustered_rows());
rd2.produces_row_with_key(ms[0].partition().clustered_rows().begin()->key());
rd2.produces_row_with_key(std::next(ms[0].partition().clustered_rows().begin())->key());
auto remaining_range = dht::partition_range::make_starting_with({ms[0].decorated_key(), false});
rd2.fast_forward_to(remaining_range);
for (auto i = size_t(1); i < ms.size(); ++i) {
test(rd2, ms[i]);
}
}
SEASTAR_THREAD_TEST_CASE(test_make_forwardable_next_partition) {
simple_schema s;
tests::reader_concurrency_semaphore_wrapper semaphore;
const auto permit = semaphore.make_permit();
auto make_reader = [&](std::vector<mutation> mutations, const dht::partition_range& pr) {
auto result = make_flat_mutation_reader_from_mutations_v2(s.schema(),
permit,
std::move(mutations),
pr,
streamed_mutation::forwarding::yes);
return assert_that(std::move(result)).exact();
};
const auto pk1 = s.make_pkey(1);
auto m1 = mutation(s.schema(), pk1);
s.add_static_row(m1, "test-static-1");
const auto pk2 = s.make_pkey(2);
auto m2 = mutation(s.schema(), pk2);
s.add_static_row(m2, "test-static-2");
dht::ring_position_comparator cmp{*s.schema()};
BOOST_CHECK_EQUAL(cmp(m1.decorated_key(), m2.decorated_key()), std::strong_ordering::less);
auto rd = make_reader({m1, m2}, query::full_partition_range);
rd.fill_buffer().get();
rd.next_partition();
rd.produces_partition_start(m1.decorated_key(), m1.partition().partition_tombstone());
rd.produces_static_row(
{{s.schema()->get_column_definition(to_bytes("s1")), to_bytes("test-static-1")}});
rd.produces_end_of_stream();
rd.next_partition();
rd.produces_partition_start(m2.decorated_key(), m2.partition().partition_tombstone());
rd.produces_static_row(
{{s.schema()->get_column_definition(to_bytes("s1")), to_bytes("test-static-2")}});
rd.produces_end_of_stream();
rd.next_partition();
rd.produces_end_of_stream();
}
SEASTAR_THREAD_TEST_CASE(test_make_nonforwardable) {
simple_schema s;
tests::reader_concurrency_semaphore_wrapper semaphore;
const auto permit = semaphore.make_permit();
auto make_reader = [&](std::vector<mutation> mutations,
bool single_partition,
const dht::partition_range& pr)
{
auto result = make_flat_mutation_reader_from_mutations_v2(s.schema(),
permit,
std::move(mutations),
pr,
streamed_mutation::forwarding::yes);
result = make_nonforwardable(std::move(result), single_partition);
return assert_that(std::move(result)).exact();
};
const auto pk1 = s.make_pkey(1);
auto m1 = mutation(s.schema(), pk1);
m1.apply(s.make_row(permit, s.make_ckey(11), "value1"));
const auto pk2 = s.make_pkey(2);
auto m2 = mutation(s.schema(), pk2);
m2.apply(s.make_row(permit, s.make_ckey(22), "value2"));
const auto pk3 = s.make_pkey(3);
auto m3 = mutation(s.schema(), pk3);
m3.apply(s.make_row(permit, s.make_ckey(33), "value3"));
dht::ring_position_comparator cmp{*s.schema()};
BOOST_CHECK_EQUAL(cmp(m1.decorated_key(), m2.decorated_key()), std::strong_ordering::less);
BOOST_CHECK_EQUAL(cmp(m2.decorated_key(), m3.decorated_key()), std::strong_ordering::less);
// no input -> no output
{
auto rd = make_reader({}, false, query::full_partition_range);
rd.produces_end_of_stream();
}
// next_partition()
{
auto check = [&] (flat_reader_assertions_v2 rd) {
rd.produces_partition_start(m1.decorated_key(), m1.partition().partition_tombstone());
rd.next_partition();
rd.produces_partition_start(m2.decorated_key(), m2.partition().partition_tombstone());
rd.produces_row_with_key(m2.partition().clustered_rows().begin()->key());
rd.produces_partition_end();
rd.produces_end_of_stream();
};
// buffer is not empty
check(make_reader({m1, m2}, false, query::full_partition_range));
// buffer is empty
{
auto rd = make_reader({m1, m2}, false, query::full_partition_range);
rd.set_max_buffer_size(1);
check(std::move(rd));
}
}
// fast_forward_to()
{
const auto m1_range = dht::partition_range::make_singular(m1.decorated_key());
auto rd = make_reader({m1, m2}, false, m1_range);
rd.set_max_buffer_size(1);
rd.produces_partition_start(m1.decorated_key(), m1.partition().partition_tombstone());
const auto m2_range = dht::partition_range::make_singular(m2.decorated_key());
rd.fast_forward_to(m2_range);
rd.produces_partition_start(m2.decorated_key(), m2.partition().partition_tombstone());
rd.produces_row_with_key(m2.partition().clustered_rows().begin()->key());
rd.produces_partition_end();
rd.next_partition();
rd.produces_end_of_stream();
}
// single_partition
{
auto rd = make_reader({m1, m2}, true, query::full_partition_range);
rd.set_max_buffer_size(1);
rd.produces_partition_start(m1.decorated_key(), m1.partition().partition_tombstone());
rd.produces_row_with_key(m1.partition().clustered_rows().begin()->key());
rd.next_partition();
rd.produces_end_of_stream();
rd.next_partition();
rd.produces_end_of_stream();
}
// single_partition with fast_forward_to
{
const auto m1_range = dht::partition_range::make_singular(m1.decorated_key());
auto rd = make_reader({m1, m2}, true, m1_range);
rd.set_max_buffer_size(1);
rd.produces_partition_start(m1.decorated_key(), m1.partition().partition_tombstone());
const auto m2_range = dht::partition_range::make_singular(m2.decorated_key());
rd.fast_forward_to(m2_range);
rd.produces_end_of_stream();
rd.next_partition();
rd.produces_end_of_stream();
}
// static row
{
s.add_static_row(m1, "test-static");
const auto m1_range = dht::partition_range::make_singular(m1.decorated_key());
auto rd = make_reader({m1, m2}, false, m1_range);
rd.set_max_buffer_size(1);
rd.produces_partition_start(m1.decorated_key(), m1.partition().partition_tombstone());
rd.produces_static_row(
{{s.schema()->get_column_definition(to_bytes("s1")), to_bytes("test-static")}});
rd.produces_row(
m1.partition().clustered_rows().begin()->key(),
{{s.schema()->get_column_definition(to_bytes("v")), to_bytes("value1")}}
);
rd.produces_partition_end();
rd.produces_end_of_stream();
}
}
SEASTAR_THREAD_TEST_CASE(test_make_nonforwardable_from_mutations_as_mutation_source) {
auto populate = [] (schema_ptr, const std::vector<mutation> &muts) {
return mutation_source([=] (
schema_ptr schema,
reader_permit permit,
const dht::partition_range& range,
const query::partition_slice& slice,
const io_priority_class&,
tracing::trace_state_ptr,
streamed_mutation::forwarding fwd_sm,
mutation_reader::forwarding) mutable {
auto squashed_muts = squash_mutations(muts);
const auto single_partition = squashed_muts.size() == 1;
auto reader = make_flat_mutation_reader_from_mutations_v2(schema,
std::move(permit),
std::move(squashed_muts),
range,
slice,
streamed_mutation::forwarding::yes);
reader = make_nonforwardable(std::move(reader), single_partition);
if (fwd_sm) {
reader = make_forwardable(std::move(reader));
}
return reader;
});
};
run_mutation_source_tests(populate);
}
SEASTAR_THREAD_TEST_CASE(test_abandoned_flat_mutation_reader_from_mutation) {
tests::reader_concurrency_semaphore_wrapper semaphore;
for_each_mutation([&] (const mutation& m) {
auto rd = make_flat_mutation_reader_from_mutations_v2(m.schema(), semaphore.make_permit(), {mutation(m)});
auto close_rd = deferred_close(rd);
rd().get();
rd().get();
// We rely on AddressSanitizer telling us if nothing was leaked.
});
}


@@ -24,22 +24,30 @@
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_clear_inactive_reads) {
simple_schema s;
std::vector<reader_permit> permits;
std::vector<reader_concurrency_semaphore::inactive_read_handle> handles;
{
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::no_limits{}, get_name());
auto stop_sem = deferred_stop(semaphore);
auto clear_permits = defer([&permits] { permits.clear(); });
for (int i = 0; i < 10; ++i) {
permits.emplace_back(semaphore.make_tracking_only_permit(s.schema().get(), get_name(), db::no_timeout));
handles.emplace_back(semaphore.register_inactive_read(make_empty_flat_reader_v2(s.schema(), permits.back())));
}
BOOST_REQUIRE(std::all_of(handles.begin(), handles.end(), [] (const reader_concurrency_semaphore::inactive_read_handle& handle) { return bool(handle); }));
BOOST_REQUIRE(std::all_of(permits.begin(), permits.end(), [] (const reader_permit& permit) { return permit.get_state() == reader_permit::state::inactive; }));
semaphore.clear_inactive_reads();
BOOST_REQUIRE(std::all_of(handles.begin(), handles.end(), [] (const reader_concurrency_semaphore::inactive_read_handle& handle) { return !bool(handle); }));
BOOST_REQUIRE(std::all_of(permits.begin(), permits.end(), [] (const reader_permit& permit) { return permit.get_state() == reader_permit::state::evicted; }));
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 0);
permits.clear();
handles.clear();
for (int i = 0; i < 10; ++i) {
@@ -1077,3 +1085,33 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_set_resources) {
BOOST_REQUIRE_EQUAL(semaphore.initial_resources(), reader_resources(4, 4 * 1024));
permit3_fut.get();
}
// Check that `stop()` correctly evicts all inactive reads.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_stop_with_inactive_reads) {
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::no_limits{}, get_name());
simple_schema ss;
auto s = ss.schema();
auto permit = reader_permit_opt(semaphore.obtain_permit(s.get(), get_name(), 1024, db::no_timeout).get());
auto handle = semaphore.register_inactive_read(make_empty_flat_reader_v2(s, *permit));
BOOST_REQUIRE(handle);
BOOST_REQUIRE_EQUAL(permit->get_state(), reader_permit::state::inactive);
BOOST_REQUIRE_EQUAL(semaphore.get_stats().inactive_reads, 1);
// Using BOOST_CHECK_* because an exception thrown here causes a segfault,
// due to the stop future not being waited for.
auto stop_f = semaphore.stop();
BOOST_CHECK(!stop_f.available());
BOOST_CHECK(eventually_true([&] { return !semaphore.get_stats().inactive_reads; }));
BOOST_CHECK(!handle);
BOOST_CHECK_EQUAL(permit->get_state(), reader_permit::state::evicted);
// Stop waits on all permits, so we need to destroy the permit before we can
// wait on the stop future.
permit = {};
stop_f.get();
}

@@ -4926,6 +4926,7 @@ SEASTAR_TEST_CASE(test_large_partition_splitting_on_compaction) {
position_in_partition::tri_compare pos_tri_cmp(*s);
for (auto& sst : ret.new_sstables) {
sst = env.reusable_sst(s, tmp.path().string(), sst->generation().value()).get0();
BOOST_REQUIRE(sst->may_have_partition_tombstones());
auto reader = sstable_reader(sst, s, env.make_reader_permit());

@@ -205,6 +205,7 @@ def run_scylla_cmd(pid, dir):
'--max-networking-io-control-blocks', '100',
'--unsafe-bypass-fsync', '1',
'--kernel-page-cache', '1',
'--commitlog-use-o-dsync', '0',
'--flush-schema-tables-after-modification', 'false',
'--api-address', ip,
'--rpc-address', ip,

@@ -106,6 +106,7 @@ cql_test_config::cql_test_config(shared_ptr<db::config> cfg)
db_config->add_per_partition_rate_limit_extension();
db_config->flush_schema_tables_after_modification.set(false);
db_config->commitlog_use_o_dsync(false);
}
cql_test_config::cql_test_config(const cql_test_config&) = default;

@@ -85,7 +85,8 @@ public:
future<shared_sstable> reusable_sst(schema_ptr schema, sstring dir, unsigned long generation,
sstable::version_types version, sstable::format_types f = sstable::format_types::big) {
auto sst = make_sstable(std::move(schema), dir, generation, version, f);
return sst->load().then([sst = std::move(sst)] {
sstable_open_config cfg { .load_first_and_last_position_metadata = true };
return sst->load(default_priority_class(), cfg).then([sst = std::move(sst)] {
return make_ready_future<shared_sstable>(std::move(sst));
});
}

@@ -43,7 +43,8 @@ sstables::shared_sstable make_sstable_containing(std::function<sstables::shared_
}
}
write_memtable_to_sstable_for_test(*mt, sst).get();
sst->open_data().get();
sstable_open_config cfg { .load_first_and_last_position_metadata = true };
sst->open_data(cfg).get();
std::set<mutation, mutation_decorated_key_less_comparator> merged;
for (auto&& m : muts) {

@@ -72,7 +72,11 @@ class HostRegistry:
self.next_host_id += 1
return Host(self.subnet.format(self.next_host_id))
self.pool = Pool[Host](254, create_host)
async def destroy_host(h: Host) -> None:
# Doesn't matter, we never return hosts to the pool as 'dirty'.
pass
self.pool = Pool[Host](254, create_host, destroy_host)
async def cleanup() -> None:
if self.lock_filename:
@@ -85,5 +89,5 @@ class HostRegistry:
return await self.pool.get()
async def release_host(self, host: Host) -> None:
return await self.pool.put(host)
return await self.pool.put(host, is_dirty=False)

@@ -1,5 +1,5 @@
import asyncio
from typing import Generic, Callable, Awaitable, TypeVar, AsyncContextManager, Final
from typing import Generic, Callable, Awaitable, TypeVar, AsyncContextManager, Final, Optional
T = TypeVar('T')
@@ -10,12 +10,15 @@ class Pool(Generic[T]):
on demand, so that if you use less, you don't create anything upfront.
If there is no object in the pool and all N objects are in use, you want
to wait until one of the objects is returned to the pool. Expects a
builder async function to build a new object.
builder async function to build a new object and a destruction async
function to clean up after a 'dirty' object (see below).
Usage example:
async def start_server():
return Server()
pool = Pool(4, start_server)
async def destroy_server(server):
await server.free_resources()
pool = Pool(4, start_server, destroy_server)
server = await pool.get()
try:
@@ -24,25 +27,51 @@ class Pool(Generic[T]):
await pool.put(server)
Alternatively:
async with pool.instance() as server:
async with pool.instance(dirty_on_exception=False) as server:
await run_test(test, server)
If the object is considered no longer usable by other users of the pool
you can 'steal' it, which frees up space in the pool.
you can pass `is_dirty=True` flag to `put`, which will cause the object
to be 'destroyed' (by calling the provided `destroy` function on it) and
will free up space in the pool.
server = await pool.get()
dirty = True
try:
dirty = await run_test(test, server)
finally:
if dirty:
await pool.steal()
else:
await pool.put(server)
await pool.put(server, is_dirty=dirty)
Alternatively:
async with (cm := pool.instance(dirty_on_exception=True)) as server:
cm.dirty = await run_test(test, server)
# It will also be considered dirty if run_test throws an exception
To atomically return a dirty object and use the freed space to obtain
another object, you can use `replace_dirty`. This is different from a
`put(is_dirty=True)` call followed by a `get` call, where a concurrent
waiter might take the space freed up by `put`.
server = await pool.get()
dirty = False
try:
for _ in range(num_runs):
if dirty:
srv = server
server = None
server = await pool.replace_dirty(srv)
dirty = await run_test(test, server)
finally:
if server:
await pool.put(server, is_dirty=dirty)
"""
def __init__(self, max_size: int, build: Callable[..., Awaitable[T]]):
def __init__(self, max_size: int,
build: Callable[..., Awaitable[T]],
destroy: Callable[[T], Awaitable[None]]):
assert(max_size >= 0)
self.max_size: Final[int] = max_size
self.build: Final[Callable[..., Awaitable[T]]] = build
self.destroy: Final[Callable[[T], Awaitable]] = destroy
self.cond: Final[asyncio.Condition] = asyncio.Condition()
self.pool: list[T] = []
self.total: int = 0 # len(self.pool) + leased objects
@@ -64,6 +93,68 @@ class Pool(Generic[T]):
# No object in pool, but total < max_size so we can construct one
self.total += 1
return await self._build_and_get(*args, **kwargs)
async def put(self, obj: T, is_dirty: bool):
"""Return a previously borrowed object to the pool
if it's not dirty, otherwise destroy the object
and free up space in the pool.
"""
if is_dirty:
await self.destroy(obj)
async with self.cond:
if is_dirty:
self.total -= 1
else:
self.pool.append(obj)
self.cond.notify()
async def replace_dirty(self, obj: T, *args, **kwargs) -> T:
"""Atomically `put` a previously borrowed dirty object and `get` another one.
The 'atomicity' guarantees that the space freed up by the returned object
is used to return another object to the caller. The caller doesn't need
to wait for space to be freed by another user of the pool.
Note: the returned object might have been constructed earlier or it might
be built right now, as in `get`.
*args and **kwargs are used as in `get`.
"""
await self.destroy(obj)
async with self.cond:
if self.pool:
self.total -= 1
return self.pool.pop()
# Need to construct a new object.
# The space for this object is already accounted for in self.total.
return await self._build_and_get(*args, **kwargs)
def instance(self, dirty_on_exception: bool, *args, **kwargs) -> AsyncContextManager[T]:
class Instance:
def __init__(self, pool: Pool[T], dirty_on_exception: bool):
self.pool = pool
self.dirty = False
self.dirty_on_exception = dirty_on_exception
async def __aenter__(self):
self.obj = await self.pool.get(*args, **kwargs)
return self.obj
async def __aexit__(self, exc_type, exc, obj):
if self.obj:
self.dirty |= self.dirty_on_exception and exc is not None
await self.pool.put(self.obj, is_dirty=self.dirty)
self.obj = None
return Instance(self, dirty_on_exception)
async def _build_and_get(self, *args, **kwargs) -> T:
"""Precondition: we allocated space for this object
(it's included in self.total).
"""
try:
obj = await self.build(*args, **kwargs)
except:
@@ -72,33 +163,3 @@ class Pool(Generic[T]):
self.cond.notify()
raise
return obj
async def steal(self) -> None:
"""Take ownership of a previously borrowed object.
Frees up space in the pool.
"""
async with self.cond:
self.total -= 1
self.cond.notify()
async def put(self, obj: T):
"""Return a previously borrowed object to the pool."""
async with self.cond:
self.pool.append(obj)
self.cond.notify()
def instance(self, *args, **kwargs) -> AsyncContextManager[T]:
class Instance:
def __init__(self, pool):
self.pool = pool
async def __aenter__(self):
self.obj = await self.pool.get(*args, **kwargs)
return self.obj
async def __aexit__(self, exc_type, exc, obj):
if self.obj:
await self.pool.put(self.obj)
self.obj = None
return Instance(self)
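The dirty-object protocol this diff adds to `Pool` can be sketched in isolation. The `AsyncPool` below is a hypothetical, simplified analogue (not the actual `test.pylib.pool.Pool`): `put(obj, is_dirty=True)` destroys the object and frees its slot for a future build, while a clean `put` returns the object for reuse.

```python
import asyncio
from typing import Awaitable, Callable, Generic, TypeVar

T = TypeVar('T')

class AsyncPool(Generic[T]):
    """Simplified sketch of a bounded async pool with dirty-object destruction."""
    def __init__(self, max_size: int,
                 build: Callable[[], Awaitable[T]],
                 destroy: Callable[[T], Awaitable[None]]):
        self.max_size = max_size
        self.build = build
        self.destroy = destroy
        self.cond = asyncio.Condition()
        self.pool: list[T] = []
        self.total = 0  # pooled + leased objects

    async def get(self) -> T:
        async with self.cond:
            await self.cond.wait_for(
                lambda: bool(self.pool) or self.total < self.max_size)
            if self.pool:
                return self.pool.pop()
            self.total += 1  # reserve a slot, then build outside the lock
        return await self.build()  # (build-failure rollback omitted)

    async def put(self, obj: T, is_dirty: bool) -> None:
        if is_dirty:
            await self.destroy(obj)  # destroy outside the lock
        async with self.cond:
            if is_dirty:
                self.total -= 1      # free the slot for a future build
            else:
                self.pool.append(obj)
            self.cond.notify()

async def demo() -> list[str]:
    events: list[str] = []
    async def build() -> int:
        events.append('build')
        return 42
    async def destroy(obj: int) -> None:
        events.append('destroy')
    pool: AsyncPool[int] = AsyncPool(1, build, destroy)
    obj = await pool.get()
    await pool.put(obj, is_dirty=True)   # destroyed; slot freed
    obj = await pool.get()               # a fresh object is built
    await pool.put(obj, is_dirty=False)  # kept for reuse
    return events

# asyncio.run(demo()) returns ['build', 'destroy', 'build']
```

Destroying outside the lock mirrors the real `put`: slow cleanup (such as stopping a whole Scylla cluster) must not block other users of the pool.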

@@ -21,14 +21,17 @@ logger = logging.getLogger(__name__)
class HTTPError(Exception):
def __init__(self, uri, code, message):
def __init__(self, uri, code, params, json, message):
super().__init__(message)
self.uri = uri
self.code = code
self.params = params
self.json = json
self.message = message
def __str__(self):
return f"HTTP error {self.code}: {self.message}, uri {self.uri}"
return f"HTTP error {self.code}, uri: {self.uri}, " \
f"params: {self.params}, json: {self.json}, body:\n{self.message}"
# TODO: support ssl and verify_ssl
@@ -63,7 +66,7 @@ class RESTClient(metaclass=ABCMeta):
params = params, json = json, timeout = client_timeout) as resp:
if resp.status != 200:
text = await resp.text()
raise HTTPError(uri, resp.status, f"{text}, params {params}, json {json}")
raise HTTPError(uri, resp.status, params, json, text)
if response_type is not None:
# Return response.text() or response.json()
return await getattr(resp, response_type)()
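The point of the `HTTPError` change is to carry the request context as structured fields rather than baking it into the message string, so callers can inspect `params` and `json` programmatically. A minimal stand-alone sketch of that shape (matching the diff, but independent of `aiohttp`):

```python
class HTTPError(Exception):
    """HTTP failure carrying the request context as structured fields."""
    def __init__(self, uri, code, params, json, message):
        super().__init__(message)
        self.uri = uri
        self.code = code
        self.params = params
        self.json = json
        self.message = message

    def __str__(self):
        return (f"HTTP error {self.code}, uri: {self.uri}, "
                f"params: {self.params}, json: {self.json}, "
                f"body:\n{self.message}")

try:
    raise HTTPError("/cluster/up", 500, {"id": 1}, None, "boom")
except HTTPError as e:
    # Fields are inspectable instead of being flattened into one string.
    assert e.code == 500 and e.params == {"id": 1}
    summary = str(e)

# summary starts with "HTTP error 500, uri: /cluster/up"
```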

@@ -17,8 +17,10 @@ import pathlib
import shutil
import tempfile
import time
import traceback
from typing import Optional, Dict, List, Set, Tuple, Callable, AsyncIterator, NamedTuple, Union
import uuid
from enum import Enum
from io import BufferedWriter
from test.pylib.host_registry import Host, HostRegistry
from test.pylib.pool import Pool
@@ -111,6 +113,7 @@ SCYLLA_CMDLINE_OPTIONS = [
'--max-networking-io-control-blocks', '100',
'--unsafe-bypass-fsync', '1',
'--kernel-page-cache', '1',
'--commitlog-use-o-dsync', '0',
'--abort-on-lsa-bad-alloc', '1',
'--abort-on-seastar-bad-alloc',
'--abort-on-internal-error', '1',
@@ -173,6 +176,11 @@ def merge_cmdline_options(base: List[str], override: List[str]) -> List[str]:
return run()
class CqlUpState(Enum):
NOT_CONNECTED = 1
CONNECTED = 2
QUERIED = 3
class ScyllaServer:
"""Starts and handles a single Scylla server, managing logs, checking if responsive,
and cleanup when finished."""
@@ -295,7 +303,7 @@ class ScyllaServer:
except Exception as exc: # pylint: disable=broad-except
return f"Exception when reading server log {self.log_filename}: {exc}"
async def cql_is_up(self) -> bool:
async def cql_is_up(self) -> CqlUpState:
"""Test that CQL is serving (a check we use at start up)."""
caslog = logging.getLogger('cassandra')
oldlevel = caslog.getEffectiveLevel()
@@ -310,6 +318,7 @@ class ScyllaServer:
# work, so rely on this "side effect".
profile = ExecutionProfile(load_balancing_policy=WhiteListRoundRobinPolicy([self.ip_addr]),
request_timeout=self.START_TIMEOUT)
connected = False
try:
# In a cluster setup, it's possible that the CQL
# here is directed to a node different from the initial contact
@@ -321,16 +330,19 @@ class ScyllaServer:
protocol_version=4,
auth_provider=auth) as cluster:
with cluster.connect() as session:
session.execute("SELECT * FROM system.local")
connected = True
# See the comment above about `auth::standard_role_manager`. We execute
# a 'real' query to ensure that the auth service has finished initializing.
session.execute("SELECT key FROM system.local where key = 'local'")
self.control_cluster = Cluster(execution_profiles=
{EXEC_PROFILE_DEFAULT: profile},
contact_points=[self.ip_addr],
auth_provider=auth)
self.control_connection = self.control_cluster.connect()
return True
return CqlUpState.QUERIED
except (NoHostAvailable, InvalidRequest, OperationTimedOut) as exc:
self.logger.debug("Exception when checking if CQL is up: %s", exc)
return False
return CqlUpState.CONNECTED if connected else CqlUpState.NOT_CONNECTED
finally:
caslog.setLevel(oldlevel)
# Any other exception may indicate a problem, and is passed to the caller.
@@ -363,6 +375,7 @@ class ScyllaServer:
self.start_time = time.time()
sleep_interval = 0.1
cql_up_state = CqlUpState.NOT_CONNECTED
while time.time() < self.start_time + self.START_TIMEOUT:
if self.cmd.returncode:
@@ -377,20 +390,30 @@ class ScyllaServer:
logpath = log_handler.baseFilename # type: ignore
else:
logpath = "?"
raise RuntimeError(f"Failed to start server at host {self.ip_addr}.\n"
raise RuntimeError(f"Failed to start server with ID = {self.server_id}, IP = {self.ip_addr}.\n"
"Check the log files:\n"
f"{logpath}\n"
f"{self.log_filename}")
if hasattr(self, "host_id") or await self.get_host_id(api):
if await self.cql_is_up():
cql_up_state = await self.cql_is_up()
if cql_up_state == CqlUpState.QUERIED:
return
# Sleep and retry
await asyncio.sleep(sleep_interval)
raise RuntimeError(f"failed to start server {self.ip_addr}, "
f"check server log at {self.log_filename}")
err = f"Failed to start server with ID = {self.server_id}, IP = {self.ip_addr}."
if hasattr(self, "host_id"):
err += f" Managed to obtain the server's Host ID ({self.host_id})"
if cql_up_state == CqlUpState.CONNECTED:
err += " and to connect the CQL driver, but failed to execute a query."
else:
err += " but failed to connect the CQL driver."
else:
err += " Failed to obtain the server's Host ID."
err += f"\nCheck server log at {self.log_filename}."
raise RuntimeError(err)
async def force_schema_migration(self) -> None:
"""This is a hack to change schema hash on an existing cluster node
@@ -705,6 +728,8 @@ class ScyllaCluster:
to any specific test, throwing it here would stop a specific
test."""
if self.start_exception:
# Mark as dirty so further test cases don't try to reuse this cluster.
self.is_dirty = True
raise self.start_exception
for server in self.running.values():
@@ -729,11 +754,14 @@ class ScyllaCluster:
if server_id not in self.running:
return ScyllaCluster.ActionReturn(success=False, msg=f"Server {server_id} unknown")
self.is_dirty = True
server = self.running.pop(server_id)
server = self.running[server_id]
# Remove the server from `running` only after we successfully stop it.
# Stopping may fail and if we removed it from `running` now it might leak.
if gracefully:
await server.stop_gracefully()
else:
await server.stop()
self.running.pop(server_id)
self.stopped[server_id] = server
return ScyllaCluster.ActionReturn(success=True, msg=f"{server} stopped")
@@ -753,8 +781,10 @@ class ScyllaCluster:
self.is_dirty = True
server = self.stopped.pop(server_id)
server.seeds = self._seeds()
await server.start(self.api)
# Put the server in `running` before starting it.
# Starting may fail and if we didn't add it now it might leak.
self.running[server_id] = server
await server.start(self.api)
return ScyllaCluster.ActionReturn(success=True, msg=f"{server} started")
async def server_restart(self, server_id: ServerNum) -> ActionReturn:
@@ -817,7 +847,9 @@ class ScyllaClusterManager:
self.is_after_test_ok: bool = False
# API
# NOTE: need to make a safe temp dir as tempfile can't make a safe temp sock name
self.manager_dir: str = tempfile.mkdtemp(prefix="manager-", dir=base_dir)
# Put the socket in /tmp, not base_dir, to avoid going over the length
# limit of UNIX-domain socket addresses (issue #12622).
self.manager_dir: str = tempfile.mkdtemp(prefix="manager-", dir="/tmp")
self.sock_path: str = f"{self.manager_dir}/api"
app = aiohttp.web.Application()
self._setup_routes(app)
@@ -828,7 +860,8 @@ class ScyllaClusterManager:
if self.is_running:
self.logger.warning("ScyllaClusterManager already running")
return
await self._get_cluster()
self.cluster = await self.clusters.get(self.logger)
self.logger.info("First Scylla cluster: %s", self.cluster)
self.cluster.setLogger(self.logger)
await self.runner.setup()
self.site = aiohttp.web.UnixSite(self.runner, path=self.sock_path)
@@ -839,12 +872,10 @@ class ScyllaClusterManager:
self.current_test_case_full_name = f'{self.test_uname}::{test_case_name}'
self.logger.info("Setting up %s", self.current_test_case_full_name)
if self.cluster.is_dirty:
self.logger.info(f"Current cluster %s is dirty after last test, stopping...", self.cluster.name)
await self.clusters.steal()
await self.cluster.stop()
await self.cluster.release_ips()
self.logger.info(f"Waiting for new cluster for test %s...", self.current_test_case_full_name)
await self._get_cluster()
self.logger.info("Current cluster %s is dirty after test %s, replacing with a new one...",
self.cluster.name, self.current_test_case_full_name)
self.cluster = await self.clusters.replace_dirty(self.cluster, self.logger)
self.logger.info("Got new Scylla cluster: %s", self.cluster.name)
self.cluster.setLogger(self.logger)
self.logger.info("Leasing Scylla cluster %s for test %s", self.cluster, self.current_test_case_full_name)
self.cluster.before_test(self.current_test_case_full_name)
@@ -860,44 +891,56 @@ class ScyllaClusterManager:
del self.site
if not self.cluster.is_dirty:
self.logger.info("Returning Scylla cluster %s for test %s", self.cluster, self.test_uname)
await self.clusters.put(self.cluster)
await self.clusters.put(self.cluster, is_dirty=False)
else:
self.logger.info("ScyllaManager: Scylla cluster %s is dirty after %s, stopping it",
self.cluster, self.test_uname)
await self.clusters.steal()
await self.cluster.stop()
await self.clusters.put(self.cluster, is_dirty=True)
del self.cluster
if os.path.exists(self.manager_dir):
shutil.rmtree(self.manager_dir)
self.is_running = False
async def _get_cluster(self) -> None:
self.cluster = await self.clusters.get(self.logger)
self.logger.info("Got new Scylla cluster %s", self.cluster)
def _setup_routes(self, app: aiohttp.web.Application) -> None:
app.router.add_get('/up', self._manager_up)
app.router.add_get('/cluster/up', self._cluster_up)
app.router.add_get('/cluster/is-dirty', self._is_dirty)
app.router.add_get('/cluster/replicas', self._cluster_replicas)
app.router.add_get('/cluster/running-servers', self._cluster_running_servers)
app.router.add_get('/cluster/host-ip/{server_id}', self._cluster_server_ip_addr)
app.router.add_get('/cluster/host-id/{server_id}', self._cluster_host_id)
app.router.add_get('/cluster/before-test/{test_case_name}', self._before_test_req)
app.router.add_get('/cluster/after-test', self._after_test)
app.router.add_get('/cluster/mark-dirty', self._mark_dirty)
app.router.add_get('/cluster/server/{server_id}/stop', self._cluster_server_stop)
app.router.add_get('/cluster/server/{server_id}/stop_gracefully',
self._cluster_server_stop_gracefully)
app.router.add_get('/cluster/server/{server_id}/start', self._cluster_server_start)
app.router.add_get('/cluster/server/{server_id}/restart', self._cluster_server_restart)
app.router.add_put('/cluster/addserver', self._cluster_server_add)
app.router.add_put('/cluster/remove-node/{initiator}', self._cluster_remove_node)
app.router.add_get('/cluster/decommission-node/{server_id}',
self._cluster_decommission_node)
app.router.add_get('/cluster/server/{server_id}/get_config', self._server_get_config)
app.router.add_put('/cluster/server/{server_id}/update_config', self._server_update_config)
def make_catching_handler(handler: Callable) -> Callable:
async def catching_handler(request) -> aiohttp.web.Response:
"""Catch all exceptions and return them to the client.
Without this, the client would get an 'Internal server error' message
without any details. Thanks to this the test log shows the actual error.
"""
try:
return await handler(request)
except Exception as e:
tb = traceback.format_exc()
self.logger.error(f'Exception when executing {handler.__name__}: {e}\n{tb}')
return aiohttp.web.Response(status=500, text=str(e))
return catching_handler
def add_get(route: str, handler: Callable):
app.router.add_get(route, make_catching_handler(handler))
def add_put(route: str, handler: Callable):
app.router.add_put(route, make_catching_handler(handler))
add_get('/up', self._manager_up)
add_get('/cluster/up', self._cluster_up)
add_get('/cluster/is-dirty', self._is_dirty)
add_get('/cluster/replicas', self._cluster_replicas)
add_get('/cluster/running-servers', self._cluster_running_servers)
add_get('/cluster/host-ip/{server_id}', self._cluster_server_ip_addr)
add_get('/cluster/host-id/{server_id}', self._cluster_host_id)
add_get('/cluster/before-test/{test_case_name}', self._before_test_req)
add_get('/cluster/after-test', self._after_test)
add_get('/cluster/mark-dirty', self._mark_dirty)
add_get('/cluster/server/{server_id}/stop', self._cluster_server_stop)
add_get('/cluster/server/{server_id}/stop_gracefully', self._cluster_server_stop_gracefully)
add_get('/cluster/server/{server_id}/start', self._cluster_server_start)
add_get('/cluster/server/{server_id}/restart', self._cluster_server_restart)
add_put('/cluster/addserver', self._cluster_server_add)
add_put('/cluster/remove-node/{initiator}', self._cluster_remove_node)
add_get('/cluster/decommission-node/{server_id}', self._cluster_decommission_node)
add_get('/cluster/server/{server_id}/get_config', self._server_get_config)
add_put('/cluster/server/{server_id}/update_config', self._server_update_config)
async def _manager_up(self, _request) -> aiohttp.web.Response:
return aiohttp.web.Response(text=f"{self.is_running}")
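`make_catching_handler` is a plain decorator pattern: wrap every route handler so an uncaught exception is logged with its traceback and surfaced to the client, instead of an opaque 'Internal server error'. A framework-free sketch of the same pattern (a hypothetical handler returning a `(status, text)` tuple stands in for `aiohttp.web.Response`):

```python
import asyncio
import traceback

def make_catching_handler(handler, log):
    # Wrap an async handler: uncaught exceptions become a 500 response
    # and the full traceback is recorded for the test log.
    async def catching_handler(request):
        try:
            return await handler(request)
        except Exception as e:
            log.append(f"Exception when executing {handler.__name__}: "
                       f"{e}\n{traceback.format_exc()}")
            return (500, str(e))
    return catching_handler

async def broken_handler(request):
    raise ValueError("bad request payload")

log: list[str] = []
status, text = asyncio.run(make_catching_handler(broken_handler, log)(None))
# status == 500, text == "bad request payload", log[0] holds the traceback
```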

@@ -143,7 +143,8 @@ private:
throw std::bad_function_call();
}
virtual const std::vector<view_ptr>& get_table_views(data_dictionary::table t) const override {
return {};
static const std::vector<view_ptr> empty;
return empty;
}
virtual sstring get_available_index_name(data_dictionary::database db, std::string_view ks_name, std::string_view table_name,
std::optional<sstring> index_name_root) const override {

@@ -735,6 +735,7 @@ bool abstract_type::is_collection() const {
bool abstract_type::is_tuple() const {
struct visitor {
bool operator()(const abstract_type&) { return false; }
bool operator()(const reversed_type_impl& t) { return t.underlying_type()->is_tuple(); }
bool operator()(const tuple_type_impl&) { return true; }
};
return visit(*this, visitor{});
@@ -1956,6 +1957,10 @@ data_value deserialize_aux(const tuple_type_impl& t, View v) {
template<FragmentedView View>
utils::multiprecision_int deserialize_value(const varint_type_impl&, View v) {
if (v.empty()) {
throw marshal_exception("cannot deserialize multiprecision int - empty buffer");
}
skip_empty_fragments(v);
bool negative = v.current_fragment().front() < 0;
utils::multiprecision_int num;
while (v.size_bytes()) {
@@ -2052,6 +2057,7 @@ bool deserialize_value(const boolean_type_impl&, View v) {
if (v.size_bytes() != 1) {
throw marshal_exception(format("cannot deserialize boolean, size mismatch ({:d})", v.size_bytes()));
}
skip_empty_fragments(v);
return v.current_fragment().front() != 0;
}
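Both deserializer fixes above address the same hazard: with a fragmented buffer, the first fragment can be empty even when the view as a whole is not, so `current_fragment().front()` may only be called after skipping empty fragments. The idea, sketched in Python over a list of byte-string fragments (hypothetical helper names):

```python
def skip_empty_fragments(fragments):
    """Drop empty leading fragments so the first fragment is non-empty.
    Mirrors the fix above: reading the front of an empty first fragment
    is invalid even when the view as a whole holds data."""
    while fragments and not fragments[0]:
        fragments.pop(0)
    return fragments

def deserialize_boolean(fragments):
    # Total size across all fragments must be exactly one byte.
    if sum(len(f) for f in fragments) != 1:
        raise ValueError("cannot deserialize boolean, size mismatch")
    skip_empty_fragments(fragments)
    return fragments[0][0] != 0

# deserialize_boolean([b'', b'\x01']) returns True: without the skip,
# the empty leading fragment would be dereferenced.
```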