Compare commits
59 Commits
scylla-4.4
...
next-4.4
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c37f5938fd | ||
|
|
fa90112787 | ||
|
|
f5895e5c04 | ||
|
|
ce944911f2 | ||
|
|
b220130e4a | ||
|
|
de4f5b3b1f | ||
|
|
84a42570ec | ||
|
|
001f57ec0c | ||
|
|
3279718d52 | ||
|
|
c128994f90 | ||
|
|
9af2e5ead1 | ||
|
|
be695a7353 | ||
|
|
cc9285697d | ||
|
|
21d140febc | ||
|
|
77e05ca482 | ||
|
|
5375b8f1a1 | ||
|
|
7a82432e38 | ||
|
|
146f7b5421 | ||
|
|
e1c7a906f0 | ||
|
|
c5d6e75db8 | ||
|
|
da630e80ea | ||
|
|
8ea1cbe78d | ||
|
|
03b04d40f2 | ||
|
|
175d004513 | ||
|
|
091b794742 | ||
|
|
8be87bb0b1 | ||
|
|
a84142705a | ||
|
|
fc32534aee | ||
|
|
4e526ad88a | ||
|
|
176f253aa3 | ||
|
|
c49cd5d9b6 | ||
|
|
5d4abb521b | ||
|
|
cfc2562dec | ||
|
|
4a1171e2fa | ||
|
|
542a508c50 | ||
|
|
dd018d4de4 | ||
|
|
70098a1991 | ||
|
|
008f2ff370 | ||
|
|
f71cdede5e | ||
|
|
0fd17af2ee | ||
|
|
77cb6596c4 | ||
|
|
c81c7d2d89 | ||
|
|
b3a762f179 | ||
|
|
2bba07bdf4 | ||
|
|
87bfb57ccf | ||
|
|
6ca8590540 | ||
|
|
da57d6c7cd | ||
|
|
61469d62b8 | ||
|
|
c63092038e | ||
|
|
cb7fbb859b | ||
|
|
01920c1293 | ||
|
|
fd64cae856 | ||
|
|
b1032a2699 | ||
|
|
90941622df | ||
|
|
4250ab27d8 | ||
|
|
475e0d0893 | ||
|
|
27333587a8 | ||
|
|
0cfe0e8c8e | ||
|
|
cb3225f2de |
@@ -1,7 +1,7 @@
|
||||
#!/bin/sh
|
||||
|
||||
PRODUCT=scylla
|
||||
VERSION=4.4.4
|
||||
VERSION=4.4.9
|
||||
|
||||
if test -f version
|
||||
then
|
||||
|
||||
@@ -2509,7 +2509,7 @@ update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::t
|
||||
const attribute_path_map_node<parsed::update_expression::action>* h = nullptr) {
|
||||
any_updates = true;
|
||||
if (_returnvalues == returnvalues::ALL_NEW) {
|
||||
rjson::set_with_string_name(_return_attributes,
|
||||
rjson::replace_with_string_name(_return_attributes,
|
||||
to_sstring_view(column_name), rjson::copy(json_value));
|
||||
} else if (_returnvalues == returnvalues::UPDATED_NEW) {
|
||||
rjson::value&& v = rjson::copy(json_value);
|
||||
|
||||
@@ -93,6 +93,10 @@ public:
|
||||
[&] (const json::json_return_type& json_return_value) {
|
||||
slogger.trace("api_handler success case");
|
||||
if (json_return_value._body_writer) {
|
||||
// Unfortunately, write_body() forces us to choose
|
||||
// from a fixed and irrelevant list of "mime-types"
|
||||
// at this point. But we'll override it with the
|
||||
// one (application/x-amz-json-1.0) below.
|
||||
rep->write_body("json", std::move(json_return_value._body_writer));
|
||||
} else {
|
||||
rep->_content += json_return_value._res;
|
||||
@@ -105,14 +109,15 @@ public:
|
||||
|
||||
return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
|
||||
});
|
||||
}), _type("json") { }
|
||||
}) { }
|
||||
|
||||
api_handler(const api_handler&) = default;
|
||||
future<std::unique_ptr<reply>> handle(const sstring& path,
|
||||
std::unique_ptr<request> req, std::unique_ptr<reply> rep) override {
|
||||
return _f_handle(std::move(req), std::move(rep)).then(
|
||||
[this](std::unique_ptr<reply> rep) {
|
||||
rep->done(_type);
|
||||
rep->set_mime_type("application/x-amz-json-1.0");
|
||||
rep->done();
|
||||
return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
|
||||
});
|
||||
}
|
||||
@@ -126,7 +131,6 @@ protected:
|
||||
}
|
||||
|
||||
future_handler_function _f_handle;
|
||||
sstring _type;
|
||||
};
|
||||
|
||||
class gated_handler : public handler_base {
|
||||
@@ -192,24 +196,31 @@ future<> server::verify_signature(const request& req) {
|
||||
throw api_error::missing_authentication_token("Authorization header is mandatory for signature verification");
|
||||
}
|
||||
std::string host = host_it->second;
|
||||
std::vector<std::string_view> credentials_raw = split(authorization_it->second, ' ');
|
||||
std::string_view authorization_header = authorization_it->second;
|
||||
auto pos = authorization_header.find_first_of(' ');
|
||||
if (pos == std::string_view::npos || authorization_header.substr(0, pos) != "AWS4-HMAC-SHA256") {
|
||||
throw api_error::invalid_signature(format("Authorization header must use AWS4-HMAC-SHA256 algorithm: {}", authorization_header));
|
||||
}
|
||||
authorization_header.remove_prefix(pos+1);
|
||||
std::string credential;
|
||||
std::string user_signature;
|
||||
std::string signed_headers_str;
|
||||
std::vector<std::string_view> signed_headers;
|
||||
for (std::string_view entry : credentials_raw) {
|
||||
do {
|
||||
// Either one of a comma or space can mark the end of an entry
|
||||
pos = authorization_header.find_first_of(" ,");
|
||||
std::string_view entry = authorization_header.substr(0, pos);
|
||||
if (pos != std::string_view::npos) {
|
||||
authorization_header.remove_prefix(pos + 1);
|
||||
}
|
||||
if (entry.empty()) {
|
||||
continue;
|
||||
}
|
||||
std::vector<std::string_view> entry_split = split(entry, '=');
|
||||
if (entry_split.size() != 2) {
|
||||
if (entry != "AWS4-HMAC-SHA256") {
|
||||
throw api_error::invalid_signature(format("Only AWS4-HMAC-SHA256 algorithm is supported. Found: {}", entry));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
std::string_view auth_value = entry_split[1];
|
||||
// Commas appear as an additional (quite redundant) delimiter
|
||||
if (auth_value.back() == ',') {
|
||||
auth_value.remove_suffix(1);
|
||||
}
|
||||
if (entry_split[0] == "Credential") {
|
||||
credential = std::string(auth_value);
|
||||
} else if (entry_split[0] == "Signature") {
|
||||
@@ -219,7 +230,8 @@ future<> server::verify_signature(const request& req) {
|
||||
signed_headers = split(auth_value, ';');
|
||||
std::sort(signed_headers.begin(), signed_headers.end());
|
||||
}
|
||||
}
|
||||
} while (pos != std::string_view::npos);
|
||||
|
||||
std::vector<std::string_view> credential_split = split(credential, '/');
|
||||
if (credential_split.size() != 5) {
|
||||
throw api_error::validation(format("Incorrect credential information format: {}", credential));
|
||||
|
||||
@@ -38,6 +38,7 @@ stats::stats() : api_operations{} {
|
||||
#define OPERATION_LATENCY(name, CamelCaseName) \
|
||||
seastar::metrics::make_histogram("op_latency", \
|
||||
seastar::metrics::description("Latency histogram of an operation via Alternator API"), {op(CamelCaseName)}, [this]{return to_metrics_histogram(api_operations.name);}),
|
||||
OPERATION(batch_get_item, "BatchGetItem")
|
||||
OPERATION(batch_write_item, "BatchWriteItem")
|
||||
OPERATION(create_backup, "CreateBackup")
|
||||
OPERATION(create_global_table, "CreateGlobalTable")
|
||||
|
||||
@@ -225,7 +225,7 @@ void set_repair(http_context& ctx, routes& r, sharded<netw::messaging_service>&
|
||||
try {
|
||||
res = fut.get0();
|
||||
} catch (std::exception& e) {
|
||||
return make_exception_future<json::json_return_type>(httpd::server_error_exception(e.what()));
|
||||
return make_exception_future<json::json_return_type>(httpd::bad_param_exception(e.what()));
|
||||
}
|
||||
return make_ready_future<json::json_return_type>(json::json_return_type(res));
|
||||
});
|
||||
|
||||
@@ -39,7 +39,7 @@ public:
|
||||
using size_type = bytes::size_type;
|
||||
using value_type = bytes::value_type;
|
||||
using fragment_type = bytes_view;
|
||||
static constexpr size_type max_chunk_size() { return 128 * 1024; }
|
||||
static constexpr size_type max_chunk_size() { return max_alloc_size() - sizeof(chunk); }
|
||||
private:
|
||||
static_assert(sizeof(value_type) == 1, "value_type is assumed to be one byte long");
|
||||
struct chunk {
|
||||
@@ -59,6 +59,7 @@ private:
|
||||
void operator delete(void* ptr) { free(ptr); }
|
||||
};
|
||||
static constexpr size_type default_chunk_size{512};
|
||||
static constexpr size_type max_alloc_size() { return 128 * 1024; }
|
||||
private:
|
||||
std::unique_ptr<chunk> _begin;
|
||||
chunk* _current;
|
||||
@@ -132,16 +133,15 @@ private:
|
||||
return _current->size - _current->offset;
|
||||
}
|
||||
// Figure out next chunk size.
|
||||
// - must be enough for data_size
|
||||
// - must be enough for data_size + sizeof(chunk)
|
||||
// - must be at least _initial_chunk_size
|
||||
// - try to double each time to prevent too many allocations
|
||||
// - do not exceed max_chunk_size
|
||||
// - should not exceed max_alloc_size, unless data_size requires so
|
||||
size_type next_alloc_size(size_t data_size) const {
|
||||
auto next_size = _current
|
||||
? _current->size * 2
|
||||
: _initial_chunk_size;
|
||||
next_size = std::min(next_size, max_chunk_size());
|
||||
// FIXME: check for overflow?
|
||||
next_size = std::min(next_size, max_alloc_size());
|
||||
return std::max<size_type>(next_size, data_size + sizeof(chunk));
|
||||
}
|
||||
// Makes room for a contiguous region of given size.
|
||||
|
||||
@@ -709,16 +709,16 @@ private:
|
||||
}
|
||||
return false;
|
||||
}
|
||||
bool compare(const T&, const value_type& v);
|
||||
int32_t compare(const T&, const value_type& v);
|
||||
};
|
||||
|
||||
template<>
|
||||
bool maybe_back_insert_iterator<std::vector<std::pair<bytes_view, bytes_view>>, bytes_view>::compare(const bytes_view& t, const value_type& v) {
|
||||
int32_t maybe_back_insert_iterator<std::vector<std::pair<bytes_view, bytes_view>>, bytes_view>::compare(const bytes_view& t, const value_type& v) {
|
||||
return _type.compare(t, v.first);
|
||||
}
|
||||
|
||||
template<>
|
||||
bool maybe_back_insert_iterator<std::vector<bytes_view>, bytes_view>::compare(const bytes_view& t, const value_type& v) {
|
||||
int32_t maybe_back_insert_iterator<std::vector<bytes_view>, bytes_view>::compare(const bytes_view& t, const value_type& v) {
|
||||
return _type.compare(t, v);
|
||||
}
|
||||
|
||||
|
||||
@@ -99,8 +99,8 @@ listen_address: localhost
|
||||
# listen_on_broadcast_address: false
|
||||
|
||||
# port for the CQL native transport to listen for clients on
|
||||
# For security reasons, you should not expose this port to the internet. Firewall it if needed.
|
||||
# To disable the CQL native transport, set this option to 0.
|
||||
# For security reasons, you should not expose this port to the internet. Firewall it if needed.
|
||||
# To disable the CQL native transport, remove this option and configure native_transport_port_ssl.
|
||||
native_transport_port: 9042
|
||||
|
||||
# Like native_transport_port, but clients are forwarded to specific shards, based on the
|
||||
|
||||
@@ -281,7 +281,7 @@ scylla_tests = set([
|
||||
'test/boost/cdc_generation_test',
|
||||
'test/boost/aggregate_fcts_test',
|
||||
'test/boost/allocation_strategy_test',
|
||||
'test/boost/alternator_base64_test',
|
||||
'test/boost/alternator_unit_test',
|
||||
'test/boost/anchorless_list_test',
|
||||
'test/boost/auth_passwords_test',
|
||||
'test/boost/auth_resource_test',
|
||||
@@ -1033,7 +1033,7 @@ pure_boost_tests = set([
|
||||
])
|
||||
|
||||
tests_not_using_seastar_test_framework = set([
|
||||
'test/boost/alternator_base64_test',
|
||||
'test/boost/alternator_unit_test',
|
||||
'test/boost/small_vector_test',
|
||||
'test/manual/gossip',
|
||||
'test/manual/message',
|
||||
@@ -1107,7 +1107,7 @@ deps['test/boost/linearizing_input_stream_test'] = [
|
||||
]
|
||||
|
||||
deps['test/boost/duration_test'] += ['test/lib/exception_utils.cc']
|
||||
deps['test/boost/alternator_base64_test'] += ['alternator/base64.cc']
|
||||
deps['test/boost/alternator_unit_test'] += ['alternator/base64.cc']
|
||||
|
||||
deps['test/raft/replication_test'] = ['test/raft/replication_test.cc'] + scylla_raft_dependencies
|
||||
deps['test/boost/raft_fsm_test'] = ['test/boost/raft_fsm_test.cc', 'test/lib/log.cc'] + scylla_raft_dependencies
|
||||
@@ -1969,7 +1969,7 @@ with open(buildfile_tmp, 'w') as f:
|
||||
command = ./dist/debian/debian_files_gen.py
|
||||
build $builddir/debian/debian: debian_files_gen | always
|
||||
rule extract_node_exporter
|
||||
command = tar -C build -xvpf {node_exporter_filename} && rm -rfv build/node_exporter && mv -v build/{node_exporter_dirname} build/node_exporter
|
||||
command = tar -C build -xvpf {node_exporter_filename} --no-same-owner && rm -rfv build/node_exporter && mv -v build/{node_exporter_dirname} build/node_exporter
|
||||
build $builddir/node_exporter: extract_node_exporter | always
|
||||
''').format(**globals()))
|
||||
|
||||
|
||||
@@ -181,13 +181,18 @@ inline
|
||||
shared_ptr<function>
|
||||
make_from_json_function(database& db, const sstring& keyspace, data_type t) {
|
||||
return make_native_scalar_function<true>("fromjson", t, {utf8_type},
|
||||
[&db, &keyspace, t](cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
|
||||
rjson::value json_value = rjson::parse(utf8_type->to_string(parameters[0].value()));
|
||||
bytes_opt parsed_json_value;
|
||||
if (!json_value.IsNull()) {
|
||||
parsed_json_value.emplace(from_json_object(*t, json_value, sf));
|
||||
[&db, keyspace, t](cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
|
||||
try {
|
||||
rjson::value json_value = rjson::parse(utf8_type->to_string(parameters[0].value()));
|
||||
bytes_opt parsed_json_value;
|
||||
if (!json_value.IsNull()) {
|
||||
parsed_json_value.emplace(from_json_object(*t, json_value, sf));
|
||||
}
|
||||
return parsed_json_value;
|
||||
} catch(rjson::error& e) {
|
||||
throw exceptions::function_execution_exception("fromJson",
|
||||
format("Failed parsing fromJson parameter: {}", e.what()), keyspace, {t->name()});
|
||||
}
|
||||
return parsed_json_value;
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -78,7 +78,22 @@ public:
|
||||
return Pure;
|
||||
}
|
||||
virtual bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) override {
|
||||
return _func(sf, parameters);
|
||||
try {
|
||||
return _func(sf, parameters);
|
||||
} catch(exceptions::cassandra_exception&) {
|
||||
// If the function's code took the time to produce an official
|
||||
// cassandra_exception, pass it through. Otherwise, below we will
|
||||
// wrap the unknown exception in a function_execution_exception.
|
||||
throw;
|
||||
} catch(...) {
|
||||
std::vector<sstring> args;
|
||||
args.reserve(arg_types().size());
|
||||
for (const data_type& a : arg_types()) {
|
||||
args.push_back(a->name());
|
||||
}
|
||||
throw exceptions::function_execution_exception(name().name,
|
||||
format("Failed execution of function {}: {}", name(), std::current_exception()), name().keyspace, std::move(args));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -551,16 +551,27 @@ bool statement_restrictions::need_filtering() const {
|
||||
// clustering restrictions. Therefore, a continuous clustering range is guaranteed.
|
||||
return false;
|
||||
}
|
||||
if (!_clustering_columns_restrictions->needs_filtering(*_schema)) { // Guaranteed continuous clustering range.
|
||||
return false;
|
||||
}
|
||||
// Now we know there are some clustering-column restrictions that are out-of-order or not EQ. A naive base-table
|
||||
// query must be filtered. What about an index-table query? That can only avoid filtering if there is exactly one
|
||||
// EQ supported by an index.
|
||||
return !(_clustering_columns_restrictions->size() == 1 && _has_queriable_ck_index);
|
||||
|
||||
// TODO: it is also possible to avoid filtering here if a non-empty CK prefix is specified and token_known, plus
|
||||
// there's exactly one out-of-order-but-index-supported clustering-column restriction.
|
||||
if (_has_queriable_ck_index && _uses_secondary_indexing) {
|
||||
// In cases where we use an index, clustering column restrictions might cause the need for filtering.
|
||||
// TODO: This is overly conservative, there are some cases when this returns true but filtering
|
||||
// is not needed. Because of that the database will sometimes perform filtering when it's not actually needed.
|
||||
// Query performance shouldn't be affected much, at most we will filter rows that are all correct.
|
||||
// Here are some cases to consider:
|
||||
// On a table with primary key (p, c1, c2, c3) with an index on c3
|
||||
// WHERE c3 = ? - doesn't require filtering
|
||||
// WHERE c1 = ? AND c2 = ? AND c3 = ? - requires filtering
|
||||
// WHERE p = ? AND c1 = ? AND c3 = ? - doesn't require filtering, but we conservatively report it does
|
||||
// WHERE p = ? AND c1 LIKE ? AND c3 = ? - requires filtering
|
||||
// WHERE p = ? AND c1 = ? AND c2 LIKE ? AND c3 = ? - requires filtering
|
||||
// WHERE p = ? AND c1 = ? AND c2 = ? AND c3 = ? - doesn't use an index
|
||||
// WHERE p = ? AND c1 = ? AND c2 < ? AND c3 = ? - doesn't require filtering, but we report it does
|
||||
return _clustering_columns_restrictions->size() > 1;
|
||||
}
|
||||
// Now we know that the query doesn't use an index.
|
||||
|
||||
// The only thing that can cause filtering now are the clustering columns.
|
||||
return _clustering_columns_restrictions->needs_filtering(*_schema);
|
||||
}
|
||||
|
||||
void statement_restrictions::validate_secondary_index_selections(bool selects_only_static_columns) {
|
||||
|
||||
@@ -306,6 +306,13 @@ create_index_statement::announce_migration(service::storage_proxy& proxy) const
|
||||
format("Index {} is a duplicate of existing index {}", index.name(), existing_index.value().name()));
|
||||
}
|
||||
}
|
||||
auto index_table_name = secondary_index::index_table_name(accepted_name);
|
||||
if (db.has_schema(keyspace(), index_table_name)) {
|
||||
return make_exception_future<::shared_ptr<cql_transport::event::schema_change>>(
|
||||
exceptions::invalid_request_exception(format("Index {} cannot be created, because table {} already exists",
|
||||
accepted_name, index_table_name))
|
||||
);
|
||||
}
|
||||
++_cql_stats->secondary_index_creates;
|
||||
schema_builder builder{schema};
|
||||
builder.with_index(index);
|
||||
|
||||
@@ -964,6 +964,7 @@ lw_shared_ptr<const service::pager::paging_state> indexed_table_select_statement
|
||||
}
|
||||
|
||||
auto paging_state_copy = make_lw_shared<service::pager::paging_state>(service::pager::paging_state(*paging_state));
|
||||
paging_state_copy->set_remaining(internal_paging_size);
|
||||
paging_state_copy->set_partition_key(std::move(index_pk));
|
||||
paging_state_copy->set_clustering_key(std::move(index_ck));
|
||||
return std::move(paging_state_copy);
|
||||
|
||||
@@ -1948,7 +1948,11 @@ sstring database::get_available_index_name(const sstring &ks_name, const sstring
|
||||
auto base_name = index_metadata::get_default_index_name(cf_name, index_name_root);
|
||||
sstring accepted_name = base_name;
|
||||
int i = 0;
|
||||
while (existing_names.contains(accepted_name)) {
|
||||
auto name_accepted = [&] {
|
||||
auto index_table_name = secondary_index::index_table_name(accepted_name);
|
||||
return !has_schema(ks_name, index_table_name) && !existing_names.contains(accepted_name);
|
||||
};
|
||||
while (!name_accepted()) {
|
||||
accepted_name = base_name + "_" + std::to_string(++i);
|
||||
}
|
||||
return accepted_name;
|
||||
|
||||
18
database.hh
18
database.hh
@@ -240,9 +240,13 @@ public:
|
||||
return _memtables.back();
|
||||
}
|
||||
|
||||
// The caller has to make sure the element exist before calling this.
|
||||
// # 8904 - this method is akin to std::set::erase(key_type), not
|
||||
// erase(iterator). Should be tolerant against non-existing.
|
||||
void erase(const shared_memtable& element) {
|
||||
_memtables.erase(boost::range::find(_memtables, element));
|
||||
auto i = boost::range::find(_memtables, element);
|
||||
if (i != _memtables.end()) {
|
||||
_memtables.erase(i);
|
||||
}
|
||||
}
|
||||
void clear() {
|
||||
_memtables.clear();
|
||||
@@ -893,7 +897,7 @@ public:
|
||||
return _pending_writes_phaser.start();
|
||||
}
|
||||
|
||||
future<> await_pending_writes() {
|
||||
future<> await_pending_writes() noexcept {
|
||||
return _pending_writes_phaser.advance_and_await();
|
||||
}
|
||||
|
||||
@@ -905,7 +909,7 @@ public:
|
||||
return _pending_reads_phaser.start();
|
||||
}
|
||||
|
||||
future<> await_pending_reads() {
|
||||
future<> await_pending_reads() noexcept {
|
||||
return _pending_reads_phaser.advance_and_await();
|
||||
}
|
||||
|
||||
@@ -917,7 +921,7 @@ public:
|
||||
return _pending_streams_phaser.start();
|
||||
}
|
||||
|
||||
future<> await_pending_streams() {
|
||||
future<> await_pending_streams() noexcept {
|
||||
return _pending_streams_phaser.advance_and_await();
|
||||
}
|
||||
|
||||
@@ -925,11 +929,11 @@ public:
|
||||
return _pending_streams_phaser.operations_in_progress();
|
||||
}
|
||||
|
||||
future<> await_pending_flushes() {
|
||||
future<> await_pending_flushes() noexcept {
|
||||
return _pending_flushes_phaser.advance_and_await();
|
||||
}
|
||||
|
||||
future<> await_pending_ops() {
|
||||
future<> await_pending_ops() noexcept {
|
||||
return when_all(await_pending_reads(), await_pending_writes(), await_pending_streams(), await_pending_flushes()).discard_result();
|
||||
}
|
||||
|
||||
|
||||
@@ -124,7 +124,7 @@ static future<> try_record(std::string_view large_table, const sstables::sstable
|
||||
const auto sstable_name = sst.get_filename();
|
||||
std::string pk_str = key_to_str(partition_key.to_partition_key(s), s);
|
||||
auto timestamp = db_clock::now();
|
||||
large_data_logger.warn("Writing large {} {}/{}: {}{} ({} bytes)", desc, ks_name, cf_name, pk_str, extra_path, size);
|
||||
large_data_logger.warn("Writing large {} {}/{}: {}{} ({} bytes) to {}", desc, ks_name, cf_name, pk_str, extra_path, size, sstable_name);
|
||||
return db::qctx->execute_cql(req, ks_name, cf_name, sstable_name, size, pk_str, timestamp, args...)
|
||||
.discard_result()
|
||||
.handle_exception([ks_name, cf_name, large_table, sstable_name] (std::exception_ptr ep) {
|
||||
@@ -140,9 +140,10 @@ future<> cql_table_large_data_handler::record_large_partitions(const sstables::s
|
||||
void cql_table_large_data_handler::log_too_many_rows(const sstables::sstable& sst, const sstables::key& partition_key,
|
||||
uint64_t rows_count) const {
|
||||
const schema& s = *sst.get_schema();
|
||||
large_data_logger.warn("Writing a partition with too many rows [{}/{}:{}] ({} rows)",
|
||||
const auto sstable_name = sst.get_filename();
|
||||
large_data_logger.warn("Writing a partition with too many rows [{}/{}:{}] ({} rows) to {}",
|
||||
s.ks_name(), s.cf_name(), partition_key.to_partition_key(s).with_schema(s),
|
||||
rows_count);
|
||||
rows_count, sstable_name);
|
||||
}
|
||||
|
||||
future<> cql_table_large_data_handler::record_large_cells(const sstables::sstable& sst, const sstables::key& partition_key,
|
||||
|
||||
@@ -43,9 +43,13 @@
|
||||
|
||||
namespace db {
|
||||
|
||||
future<> snapshot_ctl::check_snapshot_not_exist(sstring ks_name, sstring name) {
|
||||
future<> snapshot_ctl::check_snapshot_not_exist(sstring ks_name, sstring name, std::optional<std::vector<sstring>> filter) {
|
||||
auto& ks = _db.local().find_keyspace(ks_name);
|
||||
return parallel_for_each(ks.metadata()->cf_meta_data(), [this, ks_name = std::move(ks_name), name = std::move(name)] (auto& pair) {
|
||||
return parallel_for_each(ks.metadata()->cf_meta_data(), [this, ks_name = std::move(ks_name), name = std::move(name), filter = std::move(filter)] (auto& pair) {
|
||||
auto& cf_name = pair.first;
|
||||
if (filter && std::find(filter->begin(), filter->end(), cf_name) == filter->end()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
auto& cf = _db.local().find_column_family(pair.second);
|
||||
return cf.snapshot_exists(name).then([ks_name = std::move(ks_name), name] (bool exists) {
|
||||
if (exists) {
|
||||
@@ -111,7 +115,7 @@ future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<
|
||||
}
|
||||
|
||||
return run_snapshot_modify_operation([this, ks_name = std::move(ks_name), tables = std::move(tables), tag = std::move(tag)] {
|
||||
return check_snapshot_not_exist(ks_name, tag).then([this, ks_name, tables = std::move(tables), tag] {
|
||||
return check_snapshot_not_exist(ks_name, tag, tables).then([this, ks_name, tables, tag] {
|
||||
return do_with(std::vector<sstring>(std::move(tables)),[this, ks_name, tag](const std::vector<sstring>& tables) {
|
||||
return do_for_each(tables, [ks_name, tag, this] (const sstring& table_name) {
|
||||
if (table_name.find(".") != sstring::npos) {
|
||||
|
||||
@@ -40,6 +40,8 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include <seastar/core/sharded.hh>
|
||||
#include <seastar/core/future.hh>
|
||||
#include "database.hh"
|
||||
@@ -112,7 +114,7 @@ private:
|
||||
seastar::rwlock _lock;
|
||||
seastar::gate _ops;
|
||||
|
||||
future<> check_snapshot_not_exist(sstring ks_name, sstring name);
|
||||
future<> check_snapshot_not_exist(sstring ks_name, sstring name, std::optional<std::vector<sstring>> filter = {});
|
||||
|
||||
template <typename Func>
|
||||
std::result_of_t<Func()> run_snapshot_modify_operation(Func&&);
|
||||
|
||||
13
dist/common/scripts/scylla_cpuscaling_setup
vendored
13
dist/common/scripts/scylla_cpuscaling_setup
vendored
@@ -22,6 +22,7 @@
|
||||
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
import shlex
|
||||
import distro
|
||||
from scylla_util import *
|
||||
@@ -46,7 +47,12 @@ if __name__ == '__main__':
|
||||
if os.getuid() > 0:
|
||||
print('Requires root permission.')
|
||||
sys.exit(1)
|
||||
if not os.path.exists('/sys/devices/system/cpu/cpufreq/policy0/scaling_governor'):
|
||||
parser = argparse.ArgumentParser(description='CPU scaling setup script for Scylla.')
|
||||
parser.add_argument('--force', dest='force', action='store_true',
|
||||
help='force running setup even CPU scaling unsupported')
|
||||
args = parser.parse_args()
|
||||
|
||||
if not args.force and not os.path.exists('/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor'):
|
||||
print('This computer doesn\'t supported CPU scaling configuration.')
|
||||
sys.exit(0)
|
||||
if not is_debian_variant():
|
||||
@@ -56,6 +62,11 @@ if __name__ == '__main__':
|
||||
if not shutil.which('cpufreq-set'):
|
||||
pkg_install('cpufrequtils')
|
||||
if is_debian_variant():
|
||||
try:
|
||||
ondemand = systemd_unit('ondemand')
|
||||
ondemand.disable()
|
||||
except:
|
||||
pass
|
||||
cfg = sysconfig_parser('/etc/default/cpufrequtils')
|
||||
cfg.set('GOVERNOR', 'performance')
|
||||
cfg.commit()
|
||||
|
||||
4
dist/common/scripts/scylla_io_setup
vendored
4
dist/common/scripts/scylla_io_setup
vendored
@@ -254,7 +254,7 @@ if __name__ == "__main__":
|
||||
disk_properties["read_bandwidth"] = 2650 * mbs
|
||||
disk_properties["write_iops"] = 360000
|
||||
disk_properties["write_bandwidth"] = 1400 * mbs
|
||||
elif nr_disks == "16":
|
||||
elif nr_disks == 16:
|
||||
disk_properties["read_iops"] = 1600000
|
||||
disk_properties["read_bandwidth"] = 4521251328
|
||||
#below is google, above is our measured
|
||||
@@ -263,7 +263,7 @@ if __name__ == "__main__":
|
||||
disk_properties["write_bandwidth"] = 2759452672
|
||||
#below is google, above is our measured
|
||||
#disk_properties["write_bandwidth"] = 3120 * mbs
|
||||
elif nr_disks == "24":
|
||||
elif nr_disks == 24:
|
||||
disk_properties["read_iops"] = 2400000
|
||||
disk_properties["read_bandwidth"] = 5921532416
|
||||
#below is google, above is our measured
|
||||
|
||||
6
dist/common/scripts/scylla_ntp_setup
vendored
6
dist/common/scripts/scylla_ntp_setup
vendored
@@ -90,12 +90,12 @@ if __name__ == '__main__':
|
||||
with open('/etc/ntp.conf') as f:
|
||||
conf = f.read()
|
||||
if args.subdomain:
|
||||
conf2 = re.sub(r'server\s+([0-9]+)\.(\S+)\.pool\.ntp\.org', 'server \\1.{}.pool.ntp.org'.format(args.subdomain), conf, flags=re.MULTILINE)
|
||||
conf2 = re.sub(r'(server|pool)\s+([0-9]+)\.(\S+)\.pool\.ntp\.org', '\\1 \\2.{}.pool.ntp.org'.format(args.subdomain), conf, flags=re.MULTILINE)
|
||||
with open('/etc/ntp.conf', 'w') as f:
|
||||
f.write(conf2)
|
||||
conf = conf2
|
||||
match = re.search(r'^server\s+(\S*)(\s+\S+)?', conf, flags=re.MULTILINE)
|
||||
server = match.group(1)
|
||||
match = re.search(r'^(server|pool)\s+(\S*)(\s+\S+)?', conf, flags=re.MULTILINE)
|
||||
server = match.group(2)
|
||||
ntpd = systemd_unit('ntpd.service')
|
||||
ntpd.stop()
|
||||
# ignore error, ntpd may able to adjust clock later
|
||||
|
||||
21
dist/common/scripts/scylla_raid_setup
vendored
21
dist/common/scripts/scylla_raid_setup
vendored
@@ -30,6 +30,8 @@ import distro
|
||||
from pathlib import Path
|
||||
from scylla_util import *
|
||||
from subprocess import run
|
||||
import distro
|
||||
from pkg_resources import parse_version
|
||||
|
||||
if __name__ == '__main__':
|
||||
if os.getuid() > 0:
|
||||
@@ -115,6 +117,25 @@ if __name__ == '__main__':
|
||||
pkg_install('xfsprogs')
|
||||
if not shutil.which('mdadm'):
|
||||
pkg_install('mdadm')
|
||||
# XXX: Workaround for mdmonitor.service issue on CentOS8
|
||||
if is_redhat_variant() and distro.version() == '8':
|
||||
mdadm_rpm = run('rpm -q mdadm', shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
|
||||
match = re.match(r'^mdadm-([0-9]+\.[0-9]+-[a-zA-Z0-9]+)\.', mdadm_rpm)
|
||||
mdadm_version = match.group(1)
|
||||
if parse_version('4.1-14') < parse_version(mdadm_version):
|
||||
repo_data = '''
|
||||
[BaseOS_8_3_2011]
|
||||
name=CentOS8.3.2011 - Base
|
||||
baseurl=http://vault.centos.org/8.3.2011/BaseOS/$basearch/os/
|
||||
gpgcheck=1
|
||||
enabled=0
|
||||
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-centosofficial
|
||||
'''[1:-1]
|
||||
with open('/etc/yum.repos.d/CentOS-Vault-8.3.repo', 'w') as f:
|
||||
f.write(repo_data)
|
||||
run('dnf downgrade --enablerepo=BaseOS_8_3_2011 -y mdadm', shell=True, check=True)
|
||||
run('dnf install -y python3-dnf-plugin-versionlock', shell=True, check=True)
|
||||
run('dnf versionlock add mdadm', shell=True, check=True)
|
||||
try:
|
||||
md_service = systemd_unit('mdmonitor.service')
|
||||
except SystemdException:
|
||||
|
||||
5
dist/common/scripts/scylla_util.py
vendored
5
dist/common/scripts/scylla_util.py
vendored
@@ -147,6 +147,11 @@ class gcp_instance:
|
||||
if af == socket.AF_INET:
|
||||
addr, port = sa
|
||||
if addr == "169.254.169.254":
|
||||
# Make sure it is not on GKE
|
||||
try:
|
||||
gcp_instance().__instance_metadata("machine-type")
|
||||
except urllib.error.HTTPError:
|
||||
return False
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
2
dist/docker/redhat/Dockerfile
vendored
2
dist/docker/redhat/Dockerfile
vendored
@@ -6,7 +6,7 @@ ENV container docker
|
||||
|
||||
# The SCYLLA_REPO_URL argument specifies the URL to the RPM repository this Docker image uses to install Scylla. The default value is the Scylla's unstable RPM repository, which contains the daily build.
|
||||
ARG SCYLLA_REPO_URL=http://downloads.scylladb.com/rpm/unstable/centos/branch-4.4/latest/scylla.repo
|
||||
ARG VERSION=4.4.4
|
||||
ARG VERSION=4.4.9
|
||||
|
||||
ADD scylla_bashrc /scylla_bashrc
|
||||
|
||||
|
||||
@@ -4,3 +4,4 @@ stdout_logfile=/dev/stdout
|
||||
stdout_logfile_maxbytes=0
|
||||
stderr_logfile=/dev/stderr
|
||||
stderr_logfile_maxbytes=0
|
||||
stopwaitsecs=900
|
||||
|
||||
5
dist/docker/redhat/scyllasetup.py
vendored
5
dist/docker/redhat/scyllasetup.py
vendored
@@ -121,12 +121,13 @@ class ScyllaSetup:
|
||||
if self._apiAddress is not None:
|
||||
args += ["--api-address %s" % self._apiAddress]
|
||||
|
||||
if self._alternatorPort is not None:
|
||||
if self._alternatorAddress is not None:
|
||||
args += ["--alternator-address %s" % self._alternatorAddress]
|
||||
|
||||
if self._alternatorPort is not None:
|
||||
args += ["--alternator-port %s" % self._alternatorPort]
|
||||
|
||||
if self._alternatorHttpsPort is not None:
|
||||
args += ["--alternator-address %s" % self._alternatorAddress]
|
||||
args += ["--alternator-https-port %s" % self._alternatorHttpsPort]
|
||||
|
||||
if self._alternatorWriteIsolation is not None:
|
||||
|
||||
4
dist/redhat/scylla.spec
vendored
4
dist/redhat/scylla.spec
vendored
@@ -7,7 +7,7 @@ Group: Applications/Databases
|
||||
License: AGPLv3
|
||||
URL: http://www.scylladb.com/
|
||||
Source0: %{reloc_pkg}
|
||||
Requires: %{product}-server = %{version} %{product}-conf = %{version} %{product}-kernel-conf = %{version} %{product}-jmx = %{version} %{product}-tools = %{version} %{product}-tools-core = %{version} %{product}-node-exporter = %{version}
|
||||
Requires: %{product}-server = %{version} %{product}-conf = %{version} %{product}-python3 = %{version} %{product}-kernel-conf = %{version} %{product}-jmx = %{version} %{product}-tools = %{version} %{product}-tools-core = %{version} %{product}-node-exporter = %{version}
|
||||
Obsoletes: scylla-server < 1.1
|
||||
|
||||
%global _debugsource_template %{nil}
|
||||
@@ -54,7 +54,7 @@ Group: Applications/Databases
|
||||
Summary: The Scylla database server
|
||||
License: AGPLv3
|
||||
URL: http://www.scylladb.com/
|
||||
Requires: %{product}-conf %{product}-python3
|
||||
Requires: %{product}-conf = %{version} %{product}-python3 = %{version}
|
||||
Conflicts: abrt
|
||||
AutoReqProv: no
|
||||
|
||||
|
||||
@@ -340,4 +340,18 @@ public:
|
||||
unsupported_operation_exception(const sstring& msg) : std::runtime_error("unsupported operation: " + msg) {}
|
||||
};
|
||||
|
||||
class function_execution_exception : public cassandra_exception {
|
||||
public:
|
||||
const sstring ks_name;
|
||||
const sstring func_name;
|
||||
const std::vector<sstring> args;
|
||||
function_execution_exception(sstring func_name_, sstring detail, sstring ks_name_, std::vector<sstring> args_) noexcept
|
||||
: cassandra_exception{exception_code::FUNCTION_FAILURE,
|
||||
format("execution of {} failed: {}", func_name_, detail)}
|
||||
, ks_name(std::move(ks_name_))
|
||||
, func_name(std::move(func_name_))
|
||||
, args(std::move(args_))
|
||||
{ }
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -1445,7 +1445,7 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
|
||||
logger.trace("marking as alive {}", addr);
|
||||
|
||||
// Do not mark a node with status shutdown as UP.
|
||||
auto status = get_gossip_status(local_state);
|
||||
auto status = sstring(get_gossip_status(local_state));
|
||||
if (status == sstring(versioned_value::SHUTDOWN)) {
|
||||
logger.warn("Skip marking node {} with status = {} as UP", addr, status);
|
||||
return;
|
||||
@@ -1464,6 +1464,8 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Make a copy for endpoint_state because the code below can yield
|
||||
endpoint_state state = local_state;
|
||||
_live_endpoints.push_back(addr);
|
||||
if (_endpoints_to_talk_with.empty()) {
|
||||
_endpoints_to_talk_with.push_back({addr});
|
||||
@@ -1475,8 +1477,8 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
|
||||
logger.info("InetAddress {} is now UP, status = {}", addr, status);
|
||||
}
|
||||
|
||||
_subscribers.for_each([addr, local_state] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
|
||||
subscriber->on_alive(addr, local_state);
|
||||
_subscribers.for_each([addr, state] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
|
||||
subscriber->on_alive(addr, state);
|
||||
logger.trace("Notified {}", fmt::ptr(subscriber.get()));
|
||||
});
|
||||
}
|
||||
@@ -1485,11 +1487,12 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
|
||||
void gossiper::mark_dead(inet_address addr, endpoint_state& local_state) {
|
||||
logger.trace("marking as down {}", addr);
|
||||
local_state.mark_dead();
|
||||
endpoint_state state = local_state;
|
||||
_live_endpoints.resize(std::distance(_live_endpoints.begin(), std::remove(_live_endpoints.begin(), _live_endpoints.end(), addr)));
|
||||
_unreachable_endpoints[addr] = now();
|
||||
logger.info("InetAddress {} is now DOWN, status = {}", addr, get_gossip_status(local_state));
|
||||
_subscribers.for_each([addr, local_state] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
|
||||
subscriber->on_dead(addr, local_state);
|
||||
logger.info("InetAddress {} is now DOWN, status = {}", addr, get_gossip_status(state));
|
||||
_subscribers.for_each([addr, state] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
|
||||
subscriber->on_dead(addr, state);
|
||||
logger.trace("Notified {}", fmt::ptr(subscriber.get()));
|
||||
});
|
||||
}
|
||||
|
||||
@@ -26,6 +26,7 @@
|
||||
|
||||
#include "mutation_reader.hh"
|
||||
#include <seastar/core/future-util.hh>
|
||||
#include <seastar/core/coroutine.hh>
|
||||
#include "flat_mutation_reader.hh"
|
||||
#include "schema_registry.hh"
|
||||
#include "mutation_compactor.hh"
|
||||
@@ -1176,6 +1177,9 @@ flat_mutation_reader evictable_reader::recreate_reader() {
|
||||
_range_override.reset();
|
||||
_slice_override.reset();
|
||||
|
||||
_drop_partition_start = false;
|
||||
_drop_static_row = false;
|
||||
|
||||
if (_last_pkey) {
|
||||
bool partition_range_is_inclusive = true;
|
||||
|
||||
@@ -1261,13 +1265,25 @@ void evictable_reader::maybe_validate_partition_start(const flat_mutation_reader
|
||||
// is in range.
|
||||
if (_last_pkey) {
|
||||
const auto cmp_res = tri_cmp(*_last_pkey, ps.key());
|
||||
if (_drop_partition_start) { // should be the same partition
|
||||
if (_drop_partition_start) { // we expect to continue from the same partition
|
||||
// We cannot assume the partition we stopped the read at is still alive
|
||||
// when we recreate the reader. It might have been compacted away in the
|
||||
// meanwhile, so allow for a larger partition too.
|
||||
require(
|
||||
cmp_res == 0,
|
||||
"{}(): validation failed, expected partition with key equal to _last_pkey {} due to _drop_partition_start being set, but got {}",
|
||||
cmp_res <= 0,
|
||||
"{}(): validation failed, expected partition with key larger or equal to _last_pkey {} due to _drop_partition_start being set, but got {}",
|
||||
__FUNCTION__,
|
||||
*_last_pkey,
|
||||
ps.key());
|
||||
// Reset drop flags and next pos if we are not continuing from the same partition
|
||||
if (cmp_res < 0) {
|
||||
// Close previous partition, we are not going to continue it.
|
||||
push_mutation_fragment(*_schema, _permit, partition_end{});
|
||||
_drop_partition_start = false;
|
||||
_drop_static_row = false;
|
||||
_next_position_in_partition = position_in_partition::for_partition_start();
|
||||
_trim_range_tombstones = false;
|
||||
}
|
||||
} else { // should be a larger partition
|
||||
require(
|
||||
cmp_res < 0,
|
||||
@@ -1318,9 +1334,14 @@ bool evictable_reader::should_drop_fragment(const mutation_fragment& mf) {
|
||||
_drop_partition_start = false;
|
||||
return true;
|
||||
}
|
||||
if (_drop_static_row && mf.is_static_row()) {
|
||||
_drop_static_row = false;
|
||||
return true;
|
||||
// Unlike partition-start above, a partition is not guaranteed to have a
|
||||
// static row fragment. So reset the flag regardless of whether we could
|
||||
// drop one or not.
|
||||
// We are guaranteed to get here only right after dropping a partition-start,
|
||||
// so if we are not seeing a static row here, the partition doesn't have one.
|
||||
if (_drop_static_row) {
|
||||
_drop_static_row = false;
|
||||
return mf.is_static_row();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@@ -1505,18 +1526,18 @@ future<> evictable_reader::fast_forward_to(const dht::partition_range& pr, db::t
|
||||
_end_of_stream = false;
|
||||
|
||||
if (_reader) {
|
||||
return _reader->fast_forward_to(pr, timeout);
|
||||
co_await _reader->fast_forward_to(pr, timeout);
|
||||
_range_override.reset();
|
||||
co_return;
|
||||
}
|
||||
if (!_reader_created || !_irh) {
|
||||
return make_ready_future<>();
|
||||
co_return;
|
||||
}
|
||||
if (auto reader_opt = try_resume()) {
|
||||
auto f = reader_opt->fast_forward_to(pr, timeout);
|
||||
return f.then([this, reader = std::move(*reader_opt)] () mutable {
|
||||
maybe_pause(std::move(reader));
|
||||
});
|
||||
co_await reader_opt->fast_forward_to(pr, timeout);
|
||||
_range_override.reset();
|
||||
maybe_pause(std::move(*reader_opt));
|
||||
}
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
evictable_reader_handle::evictable_reader_handle(evictable_reader& r) : _r(&r)
|
||||
@@ -1569,8 +1590,8 @@ class shard_reader : public enable_lw_shared_from_this<shard_reader>, public fla
|
||||
private:
|
||||
shared_ptr<reader_lifecycle_policy> _lifecycle_policy;
|
||||
const unsigned _shard;
|
||||
const dht::partition_range* _pr;
|
||||
const query::partition_slice& _ps;
|
||||
dht::partition_range _pr;
|
||||
query::partition_slice _ps;
|
||||
const io_priority_class& _pc;
|
||||
tracing::global_trace_state_ptr _trace_state;
|
||||
const mutation_reader::forwarding _fwd_mr;
|
||||
@@ -1596,7 +1617,7 @@ public:
|
||||
: impl(std::move(schema), std::move(permit))
|
||||
, _lifecycle_policy(std::move(lifecycle_policy))
|
||||
, _shard(shard)
|
||||
, _pr(&pr)
|
||||
, _pr(pr)
|
||||
, _ps(ps)
|
||||
, _pc(pc)
|
||||
, _trace_state(std::move(trace_state))
|
||||
@@ -1681,7 +1702,7 @@ future<> shard_reader::do_fill_buffer(db::timeout_clock::time_point timeout) {
|
||||
});
|
||||
auto s = gs.get();
|
||||
auto rreader = make_foreign(std::make_unique<evictable_reader>(evictable_reader::auto_pause::yes, std::move(ms),
|
||||
s, _lifecycle_policy->semaphore().make_permit(s.get(), "shard-reader"), *_pr, _ps, _pc, _trace_state, _fwd_mr));
|
||||
s, _lifecycle_policy->semaphore().make_permit(s.get(), "shard-reader"), _pr, _ps, _pc, _trace_state, _fwd_mr));
|
||||
tracing::trace(_trace_state, "Creating shard reader on shard: {}", this_shard_id());
|
||||
auto f = rreader->fill_buffer(timeout);
|
||||
return f.then([rreader = std::move(rreader)] () mutable {
|
||||
@@ -1730,7 +1751,7 @@ void shard_reader::next_partition() {
|
||||
}
|
||||
|
||||
future<> shard_reader::fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) {
|
||||
_pr = ≺
|
||||
_pr = pr;
|
||||
|
||||
if (!_reader && !_read_ahead) {
|
||||
// No need to fast-forward uncreated readers, they will be passed the new
|
||||
@@ -1739,12 +1760,12 @@ future<> shard_reader::fast_forward_to(const dht::partition_range& pr, db::timeo
|
||||
}
|
||||
|
||||
auto f = _read_ahead ? *std::exchange(_read_ahead, std::nullopt) : make_ready_future<>();
|
||||
return f.then([this, &pr, timeout] {
|
||||
return f.then([this, timeout] {
|
||||
_end_of_stream = false;
|
||||
clear_buffer();
|
||||
|
||||
return smp::submit_to(_shard, [this, &pr, timeout] {
|
||||
return _reader->fast_forward_to(pr, timeout);
|
||||
return smp::submit_to(_shard, [this, timeout] {
|
||||
return _reader->fast_forward_to(_pr, timeout);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
@@ -57,6 +57,8 @@ future<> feed_writer(flat_mutation_reader&& rd, Writer&& wr) {
|
||||
auto f2 = rd.is_buffer_empty() ? rd.fill_buffer(db::no_timeout) : make_ready_future<>();
|
||||
return when_all_succeed(std::move(f1), std::move(f2)).discard_result();
|
||||
});
|
||||
}).then([&wr] {
|
||||
wr.consume_end_of_stream();
|
||||
}).then_wrapped([&wr] (future<> f) {
|
||||
if (f.failed()) {
|
||||
auto ex = f.get_exception();
|
||||
@@ -70,7 +72,6 @@ future<> feed_writer(flat_mutation_reader&& rd, Writer&& wr) {
|
||||
return make_exception_future<>(std::move(ex));
|
||||
});
|
||||
} else {
|
||||
wr.consume_end_of_stream();
|
||||
return wr.close();
|
||||
}
|
||||
});
|
||||
|
||||
@@ -267,9 +267,14 @@ public:
|
||||
return _current_tombstone;
|
||||
}
|
||||
|
||||
const std::deque<range_tombstone>& range_tombstones_for_row(const clustering_key_prefix& ck) {
|
||||
std::vector<range_tombstone> range_tombstones_for_row(const clustering_key_prefix& ck) {
|
||||
drop_unneeded_tombstones(ck);
|
||||
return _range_tombstones;
|
||||
std::vector<range_tombstone> result(_range_tombstones.begin(), _range_tombstones.end());
|
||||
auto cmp = [&] (const range_tombstone& rt1, const range_tombstone& rt2) {
|
||||
return _cmp(rt1.start_bound(), rt2.start_bound());
|
||||
};
|
||||
std::sort(result.begin(), result.end(), cmp);
|
||||
return result;
|
||||
}
|
||||
|
||||
std::deque<range_tombstone> range_tombstones() && {
|
||||
|
||||
@@ -1783,6 +1783,7 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, seastar::sharded<
|
||||
auto& strat = ks.get_replication_strategy();
|
||||
dht::token_range_vector desired_ranges = strat.get_pending_address_ranges(tmptr, tokens, myip, utils::can_yield::yes);
|
||||
bool find_node_in_local_dc_only = strat.get_type() == locator::replication_strategy_type::network_topology;
|
||||
bool everywhere_topology = strat.get_type() == locator::replication_strategy_type::everywhere_topology;
|
||||
|
||||
//Active ranges
|
||||
auto metadata_clone = tmptr->clone_only_token_map().get0();
|
||||
@@ -1860,7 +1861,9 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, seastar::sharded<
|
||||
};
|
||||
auto old_endpoints_in_local_dc = get_old_endpoints_in_local_dc();
|
||||
auto rf_in_local_dc = get_rf_in_local_dc();
|
||||
if (old_endpoints.size() == strat.get_replication_factor()) {
|
||||
if (everywhere_topology) {
|
||||
neighbors = old_endpoints_in_local_dc;
|
||||
} else if (old_endpoints.size() == strat.get_replication_factor()) {
|
||||
// For example, with RF = 3 and 3 nodes n1, n2, n3
|
||||
// in the cluster, n4 is bootstrapped, old_replicas
|
||||
// = {n1, n2, n3}, new_replicas = {n1, n2, n4}, n3
|
||||
|
||||
2
seastar
2
seastar
Submodule seastar updated: 4b7d434965...1fb2187322
@@ -2532,7 +2532,13 @@ future<> storage_service::restore_replica_count(inet_address endpoint, inet_addr
|
||||
}
|
||||
return seastar::async([this, endpoint, notify_endpoint] {
|
||||
auto tmptr = get_token_metadata_ptr();
|
||||
auto streamer = make_lw_shared<dht::range_streamer>(_db, tmptr, _abort_source, get_broadcast_address(), "Restore_replica_count", streaming::stream_reason::removenode);
|
||||
abort_source as;
|
||||
auto sub = _abort_source.subscribe([&as] () noexcept {
|
||||
if (!as.abort_requested()) {
|
||||
as.request_abort();
|
||||
}
|
||||
});
|
||||
auto streamer = make_lw_shared<dht::range_streamer>(_db, tmptr, as, get_broadcast_address(), "Restore_replica_count", streaming::stream_reason::removenode);
|
||||
auto my_address = get_broadcast_address();
|
||||
auto non_system_keyspaces = _db.local().get_non_system_keyspaces();
|
||||
for (const auto& keyspace_name : non_system_keyspaces) {
|
||||
@@ -2550,6 +2556,42 @@ future<> storage_service::restore_replica_count(inet_address endpoint, inet_addr
|
||||
}
|
||||
streamer->add_rx_ranges(keyspace_name, std::move(ranges_per_endpoint));
|
||||
}
|
||||
auto status_checker = seastar::async([this, endpoint, &as] {
|
||||
slogger.info("restore_replica_count: Started status checker for removing node {}", endpoint);
|
||||
while (!as.abort_requested()) {
|
||||
auto status = _gossiper.get_gossip_status(endpoint);
|
||||
// If the node to be removed is already in removed status, it has
|
||||
// probably been removed forcely with `nodetool removenode force`.
|
||||
// Abort the restore_replica_count in such case to avoid streaming
|
||||
// attempt since the user has removed the node forcely.
|
||||
if (status == sstring(versioned_value::REMOVED_TOKEN)) {
|
||||
slogger.info("restore_replica_count: Detected node {} has left the cluster, status={}, abort restore_replica_count for removing node {}",
|
||||
endpoint, status, endpoint);
|
||||
if (!as.abort_requested()) {
|
||||
as.request_abort();
|
||||
}
|
||||
return;
|
||||
}
|
||||
slogger.debug("restore_replica_count: Sleep and detect removing node {}, status={}", endpoint, status);
|
||||
sleep_abortable(std::chrono::seconds(10), as).get();
|
||||
}
|
||||
});
|
||||
auto stop_status_checker = defer([endpoint, &status_checker, &as] () mutable {
|
||||
try {
|
||||
slogger.info("restore_replica_count: Started to stop status checker for removing node {}", endpoint);
|
||||
if (!as.abort_requested()) {
|
||||
as.request_abort();
|
||||
}
|
||||
status_checker.get();
|
||||
} catch (const seastar::sleep_aborted& ignored) {
|
||||
slogger.debug("restore_replica_count: Got sleep_abort to stop status checker for removing node {}: {}", endpoint, ignored);
|
||||
} catch (...) {
|
||||
slogger.warn("restore_replica_count: Found error in status checker for removing node {}: {}",
|
||||
endpoint, std::current_exception());
|
||||
}
|
||||
slogger.info("restore_replica_count: Finished to stop status checker for removing node {}", endpoint);
|
||||
});
|
||||
|
||||
streamer->stream_async().then_wrapped([this, streamer, notify_endpoint] (auto&& f) {
|
||||
try {
|
||||
f.get();
|
||||
|
||||
@@ -440,7 +440,6 @@ protected:
|
||||
mutation_source_metadata _ms_metadata = {};
|
||||
garbage_collected_sstable_writer::data _gc_sstable_writer_data;
|
||||
compaction_sstable_replacer_fn _replacer;
|
||||
std::optional<compaction_weight_registration> _weight_registration;
|
||||
utils::UUID _run_identifier;
|
||||
::io_priority_class _io_priority;
|
||||
// optional clone of sstable set to be used for expiration purposes, so it will be set if expiration is enabled.
|
||||
@@ -459,7 +458,6 @@ protected:
|
||||
, _sstable_level(descriptor.level)
|
||||
, _gc_sstable_writer_data(*this)
|
||||
, _replacer(std::move(descriptor.replacer))
|
||||
, _weight_registration(std::move(descriptor.weight_registration))
|
||||
, _run_identifier(descriptor.run_identifier)
|
||||
, _io_priority(descriptor.io_priority)
|
||||
, _sstable_set(std::move(descriptor.all_sstables_snapshot))
|
||||
@@ -919,9 +917,6 @@ public:
|
||||
}
|
||||
|
||||
virtual void on_end_of_compaction() override {
|
||||
if (_weight_registration) {
|
||||
_cf.get_compaction_manager().on_compaction_complete(*_weight_registration);
|
||||
}
|
||||
replace_remaining_exhausted_sstables();
|
||||
}
|
||||
|
||||
|
||||
@@ -134,8 +134,6 @@ struct compaction_descriptor {
|
||||
uint64_t max_sstable_bytes;
|
||||
// Run identifier of output sstables.
|
||||
utils::UUID run_identifier;
|
||||
// Holds ownership of a weight assigned to this compaction iff it's a regular one.
|
||||
std::optional<compaction_weight_registration> weight_registration;
|
||||
// Calls compaction manager's task for this compaction to release reference to exhausted sstables.
|
||||
std::function<void(const std::vector<shared_sstable>& exhausted_sstables)> release_exhausted;
|
||||
// The options passed down to the compaction code.
|
||||
|
||||
@@ -439,7 +439,7 @@ void compaction_manager::reevaluate_postponed_compactions() {
|
||||
}
|
||||
|
||||
void compaction_manager::postpone_compaction_for_column_family(column_family* cf) {
|
||||
_postponed.push_back(cf);
|
||||
_postponed.insert(cf);
|
||||
}
|
||||
|
||||
future<> compaction_manager::stop_ongoing_compactions(sstring reason) {
|
||||
@@ -579,7 +579,7 @@ void compaction_manager::submit(column_family* cf) {
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
}
|
||||
auto compacting = make_lw_shared<compacting_sstable_registration>(this, descriptor.sstables);
|
||||
descriptor.weight_registration = compaction_weight_registration(this, weight);
|
||||
auto weight_r = compaction_weight_registration(this, weight);
|
||||
descriptor.release_exhausted = [compacting] (const std::vector<sstables::shared_sstable>& exhausted_sstables) {
|
||||
compacting->release_compacting(exhausted_sstables);
|
||||
};
|
||||
@@ -589,7 +589,7 @@ void compaction_manager::submit(column_family* cf) {
|
||||
_stats.pending_tasks--;
|
||||
_stats.active_tasks++;
|
||||
task->compaction_running = true;
|
||||
return cf.run_compaction(std::move(descriptor)).then_wrapped([this, task, compacting = std::move(compacting)] (future<> f) mutable {
|
||||
return cf.run_compaction(std::move(descriptor)).then_wrapped([this, task, compacting = std::move(compacting), weight_r = std::move(weight_r)] (future<> f) mutable {
|
||||
_stats.active_tasks--;
|
||||
task->compaction_running = false;
|
||||
|
||||
@@ -802,12 +802,15 @@ future<> compaction_manager::remove(column_family* cf) {
|
||||
task->stopping = true;
|
||||
}
|
||||
}
|
||||
_postponed.erase(boost::remove(_postponed, cf), _postponed.end());
|
||||
_postponed.erase(cf);
|
||||
|
||||
// Wait for the termination of an ongoing compaction on cf, if any.
|
||||
return do_for_each(*tasks_to_stop, [this, cf] (auto& task) {
|
||||
return this->task_stop(task);
|
||||
}).then([this, cf, tasks_to_stop] {
|
||||
#ifdef DEBUG
|
||||
assert(std::find_if(_tasks.begin(), _tasks.end(), [cf] (auto& task) { return task->compacting_cf == cf; }) == _tasks.end());
|
||||
#endif
|
||||
_compaction_locks.erase(cf);
|
||||
});
|
||||
}
|
||||
@@ -838,11 +841,6 @@ void compaction_manager::stop_compaction(sstring type) {
|
||||
}
|
||||
}
|
||||
|
||||
void compaction_manager::on_compaction_complete(compaction_weight_registration& weight_registration) {
|
||||
weight_registration.deregister();
|
||||
reevaluate_postponed_compactions();
|
||||
}
|
||||
|
||||
void compaction_manager::propagate_replacement(column_family* cf,
|
||||
const std::vector<sstables::shared_sstable>& removed, const std::vector<sstables::shared_sstable>& added) {
|
||||
for (auto& info : _compactions) {
|
||||
|
||||
@@ -99,7 +99,7 @@ private:
|
||||
future<> _waiting_reevalution = make_ready_future<>();
|
||||
condition_variable _postponed_reevaluation;
|
||||
// column families that wait for compaction but had its submission postponed due to ongoing compaction.
|
||||
std::vector<column_family*> _postponed;
|
||||
std::unordered_set<column_family*> _postponed;
|
||||
// tracks taken weights of ongoing compactions, only one compaction per weight is allowed.
|
||||
// weight is value assigned to a compaction job that is log base N of total size of all input sstables.
|
||||
std::unordered_set<int> _weight_tracker;
|
||||
@@ -256,11 +256,6 @@ public:
|
||||
// Stops ongoing compaction of a given type.
|
||||
void stop_compaction(sstring type);
|
||||
|
||||
// Called by compaction procedure to release the weight lock assigned to it, such that
|
||||
// another compaction waiting on same weight can start as soon as possible. That's usually
|
||||
// called before compaction seals sstable and such and after all compaction work is done.
|
||||
void on_compaction_complete(compaction_weight_registration& weight_registration);
|
||||
|
||||
double backlog() {
|
||||
return _backlog_manager.backlog();
|
||||
}
|
||||
|
||||
@@ -503,7 +503,8 @@ date_tiered_manifest::get_compaction_candidates(column_family& cf, std::vector<s
|
||||
|
||||
int64_t date_tiered_manifest::get_now(column_family& cf) {
|
||||
int64_t max_timestamp = 0;
|
||||
for (auto& sst : *cf.get_sstables()) {
|
||||
auto shared_set = cf.get_sstables();
|
||||
for (auto& sst : *shared_set) {
|
||||
int64_t candidate = sst->get_stats_metadata().max_timestamp;
|
||||
max_timestamp = candidate > max_timestamp ? candidate : max_timestamp;
|
||||
}
|
||||
|
||||
@@ -129,7 +129,7 @@ void sstable_writer_k_l::maybe_flush_pi_block(file_writer& out,
|
||||
// block includes them), but we set block_next_start_offset after - so
|
||||
// even if we wrote a lot of open tombstones, we still get a full
|
||||
// block size of new data.
|
||||
auto& rts = _pi_write.tombstone_accumulator->range_tombstones_for_row(
|
||||
auto rts = _pi_write.tombstone_accumulator->range_tombstones_for_row(
|
||||
clustering_key_prefix::from_range(clustering_key.values()));
|
||||
for (const auto& rt : rts) {
|
||||
auto start = composite::from_clustering_element(*_pi_write.schemap, rt.start);
|
||||
|
||||
@@ -380,7 +380,7 @@ future<prepare_message> stream_session::prepare(std::vector<stream_request> requ
|
||||
try {
|
||||
db.find_column_family(ks, cf);
|
||||
} catch (no_such_column_family&) {
|
||||
auto err = format("[Stream #{{}}] prepare requested ks={{}} cf={{}} does not exist", plan_id, ks, cf);
|
||||
auto err = format("[Stream #{}] prepare requested ks={} cf={} does not exist", plan_id, ks, cf);
|
||||
sslog.warn(err.c_str());
|
||||
throw std::runtime_error(err);
|
||||
}
|
||||
@@ -394,7 +394,7 @@ future<prepare_message> stream_session::prepare(std::vector<stream_request> requ
|
||||
try {
|
||||
db.find_column_family(cf_id);
|
||||
} catch (no_such_column_family&) {
|
||||
auto err = format("[Stream #{{}}] prepare cf_id={} does not exist", plan_id, cf_id);
|
||||
auto err = format("[Stream #{}] prepare cf_id={} does not exist", plan_id, cf_id);
|
||||
sslog.warn(err.c_str());
|
||||
throw std::runtime_error(err);
|
||||
}
|
||||
|
||||
4
table.cc
4
table.cc
@@ -864,8 +864,8 @@ void table::try_trigger_compaction() noexcept {
|
||||
}
|
||||
|
||||
void table::do_trigger_compaction() {
|
||||
// But only submit if we're not locked out
|
||||
if (!_compaction_disabled) {
|
||||
// But not if we're locked out or stopping
|
||||
if (!_compaction_disabled && !_async_gate.is_closed()) {
|
||||
_compaction_manager.submit(this);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -85,3 +85,20 @@ def test_signature_too_futuristic(dynamodb, test_table):
|
||||
response = requests.post(url, headers=headers, verify=False)
|
||||
assert not response.ok
|
||||
assert "InvalidSignatureException" in response.text and "Signature not yet current" in response.text
|
||||
|
||||
# A test that commas can be uses instead of whitespace to separate components
|
||||
# of the Authorization headers - reproducing issue #9568.
|
||||
def test_authorization_no_whitespace(dynamodb, test_table):
|
||||
# Unlike the above tests which checked error cases so didn't need to
|
||||
# calculate a real signature, in this test we really a correct signature,
|
||||
# so we use a function we already have in test_manual_requests.py.
|
||||
from test_manual_requests import get_signed_request
|
||||
payload = '{"TableName": "' + test_table.name + '", "Item": {"p": {"S": "x"}, "c": {"S": "x"}}}'
|
||||
req = get_signed_request(dynamodb, 'PutItem', payload)
|
||||
# Boto3 separates the components of the Authorization header by spaces.
|
||||
# Let's remove all of them except the first one (which separates the
|
||||
# signature algorithm name from the rest) and check the result still works:
|
||||
a = req.headers['Authorization'].split()
|
||||
req.headers['Authorization'] = a[0] + ' ' + ''.join(a[1:])
|
||||
response = requests.post(req.url, headers=req.headers, data=req.body, verify=False)
|
||||
assert response.ok
|
||||
|
||||
@@ -154,3 +154,25 @@ def test_incorrect_numbers(dynamodb, test_table):
|
||||
req = get_signed_request(dynamodb, 'PutItem', payload)
|
||||
response = requests.post(req.url, headers=req.headers, data=req.body, verify=False)
|
||||
assert "ValidationException" in response.text and "numeric" in response.text
|
||||
|
||||
# Although the DynamoDB API responses are JSON, additional conventions apply
|
||||
# to these responses - such as how error codes are encoded in JSON. For this
|
||||
# reason, DynamoDB uses the content type 'application/x-amz-json-1.0' instead
|
||||
# of the standard 'application/json'. This test verifies that we return the
|
||||
# correct content type header.
|
||||
# While most DynamoDB libraries we tried do not care about an unexpected
|
||||
# content-type, it turns out that one (aiodynamo) does. Moreover, AWS already
|
||||
# defined x-amz-json-1.1 - see
|
||||
# https://awslabs.github.io/smithy/1.0/spec/aws/aws-json-1_1-protocol.html
|
||||
# which differs (only) in how it encodes error replies.
|
||||
# So in the future it may become even more important that Scylla return the
|
||||
# correct content type.
|
||||
def test_content_type(dynamodb, test_table):
|
||||
payload = '{"TableName": "' + test_table.name + '", "Item": {"p": {"S": "x"}, "c": {"S": "x"}}}'
|
||||
# Note that get_signed_request() uses x-amz-json-1.0 to encode the
|
||||
# *request*. In the future this may or may not effect the content type
|
||||
# in the response (today, DynamoDB doesn't allow any other content type
|
||||
# in the request anyway).
|
||||
req = get_signed_request(dynamodb, 'PutItem', payload)
|
||||
response = requests.post(req.url, headers=req.headers, data=req.body, verify=False)
|
||||
assert response.headers['Content-Type'] == 'application/x-amz-json-1.0'
|
||||
|
||||
113
test/alternator/test_metrics.py
Normal file
113
test/alternator/test_metrics.py
Normal file
@@ -0,0 +1,113 @@
|
||||
# Copyright 2021-present ScyllaDB
|
||||
#
|
||||
# This file is part of Scylla.
|
||||
#
|
||||
# Scylla is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# Scylla is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
##############################################################################
|
||||
# Tests for Scylla's metrics (see docs/design-notes/metrics.md) for Alternator
|
||||
# queries. Reproduces issue #9406, where although metrics was implemented for
|
||||
# Alternator requests, they were missing for some operations (BatchGetItem).
|
||||
# In the tests here we attempt to ensure that the metrics continue to work
|
||||
# for the relevant operations as the code evolves.
|
||||
#
|
||||
# Note that all tests in this file test Scylla-specific features, and are
|
||||
# "skipped" when not running against Scylla, or when unable to retrieve
|
||||
# metrics through out-of-band HTTP requests to Scylla's Prometheus port (9180).
|
||||
#
|
||||
# IMPORTANT: we do not want these tests to assume that are not running in
|
||||
# parallel with any other tests or workload - because such an assumption
|
||||
# would limit our test deployment options in the future. NOT making this
|
||||
# assumption means that these tests can't check that a certain operation
|
||||
# increases a certain counter by exactly 1 - because other concurrent
|
||||
# operations might increase it further! So our test can only check that the
|
||||
# counter increases.
|
||||
##############################################################################
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
import re
|
||||
|
||||
from util import random_string
|
||||
|
||||
# Fixture for checking if we are able to test Scylla metrics. Scylla metrics
|
||||
# are not available on AWS (of course), but may also not be available for
|
||||
# Scylla if for some reason we have only access to the Alternator protocol
|
||||
# port but no access to the metrics port (9180).
|
||||
# If metrics are *not* available, tests using this fixture will be skipped.
|
||||
# Tests using this fixture may call get_metrics(metrics).
|
||||
@pytest.fixture(scope="module")
|
||||
def metrics(dynamodb):
|
||||
if dynamodb.meta.client._endpoint.host.endswith('.amazonaws.com'):
|
||||
pytest.skip('Scylla-only feature not supported by AWS')
|
||||
url = dynamodb.meta.client._endpoint.host
|
||||
# The Prometheus API is on port 9180, and always http, not https.
|
||||
url = re.sub(r':[0-9]+(/|$)', ':9180', url)
|
||||
url = re.sub(r'^https:', 'http:', url)
|
||||
url = url + '/metrics'
|
||||
resp = requests.get(url)
|
||||
if resp.status_code != 200:
|
||||
pytest.skip('Metrics port 9180 is not available')
|
||||
yield url
|
||||
|
||||
# Utility function for fetching all metrics from Scylla, using an HTTP request
|
||||
# to port 9180. The response format is defined by the Prometheus protocol.
|
||||
# Only use get_metrics() in a test using the metrics_available fixture.
|
||||
def get_metrics(metrics):
|
||||
response = requests.get(metrics)
|
||||
assert response.status_code == 200
|
||||
return response.text
|
||||
|
||||
# Utility function for fetching a metric with a given name and optionally a
|
||||
# given sub-metric label (which should be a name-value map). If multiple
|
||||
# matches are found, they are summed - this is useful for summing up the
|
||||
# counts from multiple shards.
|
||||
def get_metric(metrics, name, requested_labels=None):
|
||||
total = 0.0
|
||||
lines = re.compile('^'+name+'{.*$', re.MULTILINE)
|
||||
for match in re.findall(lines, get_metrics(metrics)):
|
||||
a = match.split()
|
||||
metric = a[0]
|
||||
val = float(a[1])
|
||||
# Check if match also matches the requested labels
|
||||
if requested_labels:
|
||||
# we know metric begins with name{ and ends with } - the labels
|
||||
# are what we have between those
|
||||
got_labels = metric[len(name)+1:-1].split(',')
|
||||
# Check that every one of the requested labels is in got_labels:
|
||||
for k, v in requested_labels.items():
|
||||
if not f'{k}="{v}"' in got_labels:
|
||||
# No match for requested label, skip this metric (python
|
||||
# doesn't have "continue 2" so let's just set val to 0...
|
||||
val = 0
|
||||
break
|
||||
total += float(val)
|
||||
return total
|
||||
|
||||
def test_batch_write_item(test_table_s, metrics):
|
||||
n1 = get_metric(metrics, 'scylla_alternator_operation', {'op': 'BatchWriteItem'})
|
||||
test_table_s.meta.client.batch_write_item(RequestItems = {
|
||||
test_table_s.name: [{'PutRequest': {'Item': {'p': random_string(), 'a': 'hi'}}}]})
|
||||
n2 = get_metric(metrics, 'scylla_alternator_operation', {'op': 'BatchWriteItem'})
|
||||
assert n2 > n1
|
||||
|
||||
# Reproduces issue #9406:
|
||||
def test_batch_get_item(test_table_s, metrics):
|
||||
n1 = get_metric(metrics, 'scylla_alternator_operation', {'op': 'BatchGetItem'})
|
||||
test_table_s.meta.client.batch_get_item(RequestItems = {
|
||||
test_table_s.name: {'Keys': [{'p': random_string()}], 'ConsistentRead': True}})
|
||||
n2 = get_metric(metrics, 'scylla_alternator_operation', {'op': 'BatchGetItem'})
|
||||
assert n2 > n1
|
||||
|
||||
# TODO: check the rest of the operations
|
||||
@@ -431,3 +431,14 @@ def test_update_item_returnvalues_nested(test_table_s):
|
||||
ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_NEW',
|
||||
UpdateExpression='REMOVE a.c[1]')
|
||||
assert ret['Attributes'] == {'a': {'c': [70]}}
|
||||
|
||||
# A reproducer for issue #9542 - when UpdateExpression's REMOVE operation
|
||||
# actually deletes an existing attribute, it breaks the ALL_NEW ReturnValues
|
||||
# for other attributes set in the same command.
|
||||
def test_update_item_returnvalues_all_new_remove_etc(test_table_s):
|
||||
p = random_string()
|
||||
test_table_s.put_item(Item={'p': p, 's': 'dog', 'd': 'foo'})
|
||||
ret=test_table_s.update_item(Key={'p': p}, ReturnValues='ALL_NEW',
|
||||
UpdateExpression='REMOVE d SET s = :v',
|
||||
ExpressionAttributeValues={':v': 'cat'})
|
||||
assert ret['Attributes']['s'] == 'cat'
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
#define BOOST_TEST_MODULE alternator
|
||||
#include <boost/test/included/unit_test.hpp>
|
||||
|
||||
#include <seastar/util/defer.hh>
|
||||
#include "alternator/base64.hh"
|
||||
|
||||
static bytes_view to_bytes_view(const std::string& s) {
|
||||
@@ -78,3 +79,22 @@ BOOST_AUTO_TEST_CASE(test_base64_begins_with) {
|
||||
BOOST_REQUIRE(!base64_begins_with(encoded_str3, encoded_non_prefix));
|
||||
}
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(test_allocator_fail_gracefully) {
|
||||
// Unfortunately the address sanitizer fails if the allocator is not able
|
||||
// to allocate the requested memory. The test is therefore skipped for debug mode
|
||||
#ifndef DEBUG
|
||||
static constexpr size_t too_large_alloc_size = 0xffffffffff;
|
||||
rjson::allocator allocator;
|
||||
// Impossible allocation should throw
|
||||
BOOST_REQUIRE_THROW(allocator.Malloc(too_large_alloc_size), rjson::error);
|
||||
// So should impossible reallocation
|
||||
void* memory = allocator.Malloc(1);
|
||||
auto release = defer([memory] { rjson::allocator::Free(memory); });
|
||||
BOOST_REQUIRE_THROW(allocator.Realloc(memory, 1, too_large_alloc_size), rjson::error);
|
||||
// Internal rapidjson stack should also throw
|
||||
// and also be destroyed gracefully later
|
||||
rapidjson::internal::Stack stack(&allocator, 0);
|
||||
BOOST_REQUIRE_THROW(stack.Push<char>(too_large_alloc_size), rjson::error);
|
||||
#endif
|
||||
}
|
||||
@@ -269,6 +269,21 @@ BOOST_AUTO_TEST_CASE(test_writing_placeholders) {
|
||||
BOOST_REQUIRE(in.size() == 0);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(test_large_placeholder) {
|
||||
bytes_ostream::size_type size;
|
||||
try {
|
||||
for (size = 1; (int32_t)size > 0; size *= 2) {
|
||||
bytes_ostream buf;
|
||||
int8_t* ph;
|
||||
BOOST_TEST_MESSAGE(fmt::format("try size={}", size));
|
||||
ph = buf.write_place_holder(size);
|
||||
std::fill(ph, ph + size, 0);
|
||||
}
|
||||
} catch (const std::bad_alloc&) {
|
||||
}
|
||||
BOOST_REQUIRE(size >= bytes_ostream::max_chunk_size());
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(test_append_big_and_small_chunks) {
|
||||
bytes_ostream small;
|
||||
append_sequence(small, 12);
|
||||
|
||||
@@ -22,6 +22,8 @@
|
||||
#include <seastar/testing/test_case.hh>
|
||||
#include "test/lib/cql_test_env.hh"
|
||||
#include "test/lib/cql_assertions.hh"
|
||||
#include "cql3/untyped_result_set.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "transport/messages/result_message.hh"
|
||||
|
||||
SEASTAR_TEST_CASE(test_index_with_paging) {
|
||||
@@ -48,3 +50,51 @@ SEASTAR_TEST_CASE(test_index_with_paging) {
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_index_with_paging_with_base_short_read) {
|
||||
return do_with_cql_env_thread([] (auto& e) {
|
||||
e.execute_cql("CREATE TABLE tab (pk int, ck text, v int, v2 int, v3 text, PRIMARY KEY (pk, ck))").get();
|
||||
e.execute_cql("CREATE INDEX ON tab (v)").get();
|
||||
|
||||
// Enough to trigger a short read on the base table during scan
|
||||
sstring big_string(2 * query::result_memory_limiter::maximum_result_size, 'j');
|
||||
|
||||
const int row_count = 67;
|
||||
for (int i = 0; i < row_count; ++i) {
|
||||
e.execute_cql(format("INSERT INTO tab (pk, ck, v, v2, v3) VALUES ({}, 'hello{}', 1, {}, '{}')", i % 3, i, i, big_string)).get();
|
||||
}
|
||||
|
||||
eventually([&] {
|
||||
uint64_t count = 0;
|
||||
e.qp().local().query_internal("SELECT * FROM ks.tab WHERE v = 1", [&] (const cql3::untyped_result_set_row&) {
|
||||
++count;
|
||||
return make_ready_future<stop_iteration>(stop_iteration::no);
|
||||
}).get();
|
||||
BOOST_REQUIRE_EQUAL(count, row_count);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_index_with_paging_with_base_short_read_no_ck) {
|
||||
return do_with_cql_env_thread([] (auto& e) {
|
||||
e.execute_cql("CREATE TABLE tab (pk int, v int, v2 int, v3 text, PRIMARY KEY (pk))").get();
|
||||
e.execute_cql("CREATE INDEX ON tab (v)").get();
|
||||
|
||||
// Enough to trigger a short read on the base table during scan
|
||||
sstring big_string(2 * query::result_memory_limiter::maximum_result_size, 'j');
|
||||
|
||||
const int row_count = 67;
|
||||
for (int i = 0; i < row_count; ++i) {
|
||||
e.execute_cql(format("INSERT INTO tab (pk, v, v2, v3) VALUES ({}, 1, {}, '{}')", i, i, big_string)).get();
|
||||
}
|
||||
|
||||
eventually([&] {
|
||||
uint64_t count = 0;
|
||||
e.qp().local().query_internal("SELECT * FROM ks.tab WHERE v = 1", [&] (const cql3::untyped_result_set_row&) {
|
||||
++count;
|
||||
return make_ready_future<stop_iteration>(stop_iteration::no);
|
||||
}).get();
|
||||
BOOST_REQUIRE_EQUAL(count, row_count);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
@@ -3258,39 +3258,30 @@ flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(
|
||||
reader_permit permit,
|
||||
const dht::partition_range& prange,
|
||||
const query::partition_slice& slice,
|
||||
std::deque<mutation_fragment> first_buffer,
|
||||
position_in_partition_view last_fragment_position,
|
||||
std::deque<mutation_fragment> second_buffer,
|
||||
size_t max_buffer_size) {
|
||||
std::list<std::deque<mutation_fragment>> buffers,
|
||||
position_in_partition_view first_buf_last_fragment_position,
|
||||
size_t max_buffer_size,
|
||||
bool detach_buffer = true) {
|
||||
class factory {
|
||||
schema_ptr _schema;
|
||||
reader_permit _permit;
|
||||
std::optional<std::deque<mutation_fragment>> _first_buffer;
|
||||
std::optional<std::deque<mutation_fragment>> _second_buffer;
|
||||
std::list<std::deque<mutation_fragment>> _buffers;
|
||||
size_t _max_buffer_size;
|
||||
|
||||
private:
|
||||
std::optional<std::deque<mutation_fragment>> copy_buffer(const std::optional<std::deque<mutation_fragment>>& o) {
|
||||
if (!o) {
|
||||
return {};
|
||||
}
|
||||
return copy_fragments(*_schema, _permit, *o);
|
||||
}
|
||||
|
||||
public:
|
||||
factory(schema_ptr schema, reader_permit permit, std::deque<mutation_fragment> first_buffer, std::deque<mutation_fragment> second_buffer, size_t max_buffer_size)
|
||||
factory(schema_ptr schema, reader_permit permit, std::list<std::deque<mutation_fragment>> buffers, size_t max_buffer_size)
|
||||
: _schema(std::move(schema))
|
||||
, _permit(std::move(permit))
|
||||
, _first_buffer(std::move(first_buffer))
|
||||
, _second_buffer(std::move(second_buffer))
|
||||
, _buffers(std::move(buffers))
|
||||
, _max_buffer_size(max_buffer_size) {
|
||||
}
|
||||
|
||||
factory(const factory& o)
|
||||
: _schema(o._schema)
|
||||
, _permit(o._permit)
|
||||
, _first_buffer(copy_buffer(o._first_buffer))
|
||||
, _second_buffer(copy_buffer(o._second_buffer)) {
|
||||
, _permit(o._permit) {
|
||||
for (const auto& buf : o._buffers) {
|
||||
_buffers.emplace_back(copy_fragments(*_schema, _permit, buf));
|
||||
}
|
||||
}
|
||||
factory(factory&& o) = default;
|
||||
|
||||
@@ -3304,14 +3295,9 @@ flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(
|
||||
streamed_mutation::forwarding fwd_sm,
|
||||
mutation_reader::forwarding fwd_mr) {
|
||||
BOOST_REQUIRE(s == _schema);
|
||||
if (_first_buffer) {
|
||||
auto buf = *std::exchange(_first_buffer, {});
|
||||
auto rd = make_flat_mutation_reader_from_fragments(_schema, std::move(permit), std::move(buf));
|
||||
rd.set_max_buffer_size(_max_buffer_size);
|
||||
return rd;
|
||||
}
|
||||
if (_second_buffer) {
|
||||
auto buf = *std::exchange(_second_buffer, {});
|
||||
if (!_buffers.empty()) {
|
||||
auto buf = std::move(_buffers.front());
|
||||
_buffers.pop_front();
|
||||
auto rd = make_flat_mutation_reader_from_fragments(_schema, std::move(permit), std::move(buf));
|
||||
rd.set_max_buffer_size(_max_buffer_size);
|
||||
return rd;
|
||||
@@ -3319,9 +3305,9 @@ flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(
|
||||
return make_empty_flat_reader(_schema, std::move(permit));
|
||||
}
|
||||
};
|
||||
auto ms = mutation_source(factory(schema, permit, std::move(first_buffer), std::move(second_buffer), max_buffer_size));
|
||||
auto ms = mutation_source(factory(schema, permit, std::move(buffers), max_buffer_size));
|
||||
|
||||
auto [rd, handle] = make_manually_paused_evictable_reader(
|
||||
auto rd = make_auto_paused_evictable_reader(
|
||||
std::move(ms),
|
||||
schema,
|
||||
permit,
|
||||
@@ -3337,18 +3323,42 @@ flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(
|
||||
|
||||
const auto eq_cmp = position_in_partition::equal_compare(*schema);
|
||||
BOOST_REQUIRE(rd.is_buffer_full());
|
||||
BOOST_REQUIRE(eq_cmp(rd.buffer().back().position(), last_fragment_position));
|
||||
BOOST_REQUIRE(eq_cmp(rd.buffer().back().position(), first_buf_last_fragment_position));
|
||||
BOOST_REQUIRE(!rd.is_end_of_stream());
|
||||
|
||||
rd.detach_buffer();
|
||||
|
||||
handle.pause();
|
||||
if (detach_buffer) {
|
||||
rd.detach_buffer();
|
||||
}
|
||||
|
||||
while(permit.semaphore().try_evict_one_inactive_read());
|
||||
|
||||
return std::move(rd);
|
||||
}
|
||||
|
||||
flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(
|
||||
schema_ptr schema,
|
||||
reader_permit permit,
|
||||
const dht::partition_range& prange,
|
||||
const query::partition_slice& slice,
|
||||
std::deque<mutation_fragment> first_buffer,
|
||||
position_in_partition_view last_fragment_position,
|
||||
std::deque<mutation_fragment> last_buffer,
|
||||
size_t max_buffer_size,
|
||||
bool detach_buffer = true) {
|
||||
std::list<std::deque<mutation_fragment>> list;
|
||||
list.emplace_back(std::move(first_buffer));
|
||||
list.emplace_back(std::move(last_buffer));
|
||||
return create_evictable_reader_and_evict_after_first_buffer(
|
||||
std::move(schema),
|
||||
std::move(permit),
|
||||
prange,
|
||||
slice,
|
||||
std::move(list),
|
||||
last_fragment_position,
|
||||
max_buffer_size,
|
||||
detach_buffer);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_evictable_reader_trim_range_tombstones) {
|
||||
@@ -3650,7 +3660,7 @@ SEASTAR_THREAD_TEST_CASE(test_evictable_reader_self_validation) {
|
||||
|
||||
check_evictable_reader_validation_is_triggered(
|
||||
"pkey > _last_pkey; pkey ∈ pkrange",
|
||||
partition_error_prefix,
|
||||
"",
|
||||
s.schema(),
|
||||
permit,
|
||||
prange,
|
||||
@@ -3739,6 +3749,317 @@ SEASTAR_THREAD_TEST_CASE(test_evictable_reader_self_validation) {
|
||||
max_buffer_size);
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_evictable_reader_recreate_before_fast_forward_to) {
|
||||
class test_reader : public flat_mutation_reader::impl {
|
||||
simple_schema _s;
|
||||
const std::vector<dht::decorated_key> _pkeys;
|
||||
std::vector<dht::decorated_key>::const_iterator _it;
|
||||
std::vector<dht::decorated_key>::const_iterator _end;
|
||||
private:
|
||||
void on_range_change(const dht::partition_range& pr) {
|
||||
dht::ring_position_comparator cmp(*_schema);
|
||||
_it = _pkeys.begin();
|
||||
while (_it != _pkeys.end() && !pr.contains(*_it, cmp)) {
|
||||
++_it;
|
||||
}
|
||||
_end = _it;
|
||||
while (_end != _pkeys.end() && pr.contains(*_end, cmp)) {
|
||||
++_end;
|
||||
}
|
||||
}
|
||||
public:
|
||||
test_reader(simple_schema s, reader_permit permit, const dht::partition_range& pr, std::vector<dht::decorated_key> pkeys)
|
||||
: impl(s.schema(), std::move(permit))
|
||||
, _s(std::move(s))
|
||||
, _pkeys(std::move(pkeys)) {
|
||||
on_range_change(pr);
|
||||
}
|
||||
|
||||
virtual future<> fill_buffer(db::timeout_clock::time_point) override {
|
||||
if (_it == _end) {
|
||||
_end_of_stream = true;
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
push_mutation_fragment(*_schema, _permit, partition_start(*_it++, {}));
|
||||
|
||||
uint32_t ck = 0;
|
||||
while (!is_buffer_full()) {
|
||||
auto ckey = _s.make_ckey(ck);
|
||||
push_mutation_fragment(*_schema, _permit, _s.make_row(_s.make_ckey(ck++), make_random_string(1024)));
|
||||
++ck;
|
||||
}
|
||||
|
||||
push_mutation_fragment(*_schema, _permit, partition_end());
|
||||
return make_ready_future<>();
|
||||
}
|
||||
virtual void next_partition() override {
|
||||
}
|
||||
virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point) override {
|
||||
on_range_change(pr);
|
||||
clear_buffer();
|
||||
_end_of_stream = false;
|
||||
return make_ready_future<>();
|
||||
}
|
||||
virtual future<> fast_forward_to(position_range, db::timeout_clock::time_point) override {
|
||||
return make_exception_future<>(make_backtraced_exception_ptr<std::bad_function_call>());
|
||||
}
|
||||
};
|
||||
|
||||
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::no_limits{}, get_name());
|
||||
simple_schema s;
|
||||
auto permit = semaphore.make_permit(s.schema().get(), get_name());
|
||||
auto pkeys = s.make_pkeys(6);
|
||||
boost::sort(pkeys, dht::decorated_key::less_comparator(s.schema()));
|
||||
|
||||
auto ms = mutation_source([&] (schema_ptr schema,
|
||||
reader_permit permit,
|
||||
const dht::partition_range& range,
|
||||
const query::partition_slice& slice,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr tr,
|
||||
streamed_mutation::forwarding fwd,
|
||||
mutation_reader::forwarding fwd_mr) {
|
||||
std::vector<dht::decorated_key> pkeys_with_data;
|
||||
bool empty = false;
|
||||
for (const auto& pkey : pkeys) {
|
||||
empty = !empty;
|
||||
if (empty) {
|
||||
pkeys_with_data.push_back(pkey);
|
||||
}
|
||||
}
|
||||
return make_flat_mutation_reader<test_reader>(
|
||||
s,
|
||||
std::move(permit),
|
||||
range,
|
||||
std::move(pkeys_with_data));
|
||||
});
|
||||
|
||||
auto pr0 = dht::partition_range::make({pkeys[0], true}, {pkeys[3], true});
|
||||
auto [reader, handle] = make_manually_paused_evictable_reader(std::move(ms), s.schema(), permit, pr0, s.schema()->full_slice(),
|
||||
seastar::default_priority_class(), {}, mutation_reader::forwarding::yes);
|
||||
|
||||
auto reader_assert = assert_that(std::move(reader));
|
||||
reader_assert.produces(pkeys[0]);
|
||||
reader_assert.produces(pkeys[2]);
|
||||
|
||||
handle.pause();
|
||||
BOOST_REQUIRE(semaphore.try_evict_one_inactive_read());
|
||||
|
||||
reader_assert.produces_end_of_stream();
|
||||
|
||||
auto pr1 = dht::partition_range::make({pkeys[4], true}, {pkeys[5], true});
|
||||
reader_assert.fast_forward_to(pr1);
|
||||
|
||||
// Failure will happen in the form of `on_internal_error()`.
|
||||
reader_assert.produces(pkeys[4]);
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_evictable_reader_drop_flags) {
|
||||
reader_concurrency_semaphore semaphore(1, 0, get_name());
|
||||
simple_schema s;
|
||||
auto permit = semaphore.make_permit(s.schema().get(), get_name());
|
||||
|
||||
auto pkeys = s.make_pkeys(2);
|
||||
std::sort(pkeys.begin(), pkeys.end(), [&s] (const auto& pk1, const auto& pk2) {
|
||||
return pk1.less_compare(*s.schema(), pk2);
|
||||
});
|
||||
const auto& pkey1 = pkeys[0];
|
||||
const auto& pkey2 = pkeys[1];
|
||||
const int second_buffer_ck = 10;
|
||||
|
||||
struct buffer {
|
||||
simple_schema& s;
|
||||
reader_permit permit;
|
||||
std::deque<mutation_fragment> frags;
|
||||
std::vector<mutation> muts;
|
||||
size_t size = 0;
|
||||
std::optional<position_in_partition_view> last_pos;
|
||||
|
||||
buffer(simple_schema& s_, reader_permit permit_, dht::decorated_key key)
|
||||
: s(s_), permit(std::move(permit_)) {
|
||||
add_partition(key);
|
||||
}
|
||||
size_t add_partition(dht::decorated_key key) {
|
||||
size += frags.emplace_back(*s.schema(), permit, partition_start{key, {}}).memory_usage();
|
||||
muts.emplace_back(s.schema(), key);
|
||||
return size;
|
||||
}
|
||||
size_t add_mutation_fragment(mutation_fragment&& mf, bool only_to_frags = false) {
|
||||
if (!only_to_frags) {
|
||||
muts.back().apply(mf);
|
||||
}
|
||||
size += frags.emplace_back(*s.schema(), permit, std::move(mf)).memory_usage();
|
||||
return size;
|
||||
}
|
||||
size_t add_static_row(std::optional<mutation_fragment> sr = {}) {
|
||||
auto srow = sr ? std::move(*sr) : s.make_static_row("s");
|
||||
return add_mutation_fragment(std::move(srow));
|
||||
}
|
||||
size_t add_clustering_row(int i, bool only_to_frags = false) {
|
||||
return add_mutation_fragment(mutation_fragment(*s.schema(), permit, s.make_row(s.make_ckey(i), "v")), only_to_frags);
|
||||
}
|
||||
size_t add_clustering_rows(int start, int end) {
|
||||
for (int i = start; i < end; ++i) {
|
||||
add_clustering_row(i);
|
||||
}
|
||||
return size;
|
||||
}
|
||||
size_t add_partition_end() {
|
||||
size += frags.emplace_back(*s.schema(), permit, partition_end{}).memory_usage();
|
||||
return size;
|
||||
}
|
||||
void save_position() { last_pos = frags.back().position(); }
|
||||
void find_position(size_t buf_size) {
|
||||
size_t s = 0;
|
||||
for (const auto& frag : frags) {
|
||||
s += frag.memory_usage();
|
||||
if (s >= buf_size) {
|
||||
last_pos = frag.position();
|
||||
break;
|
||||
}
|
||||
}
|
||||
BOOST_REQUIRE(last_pos);
|
||||
}
|
||||
};
|
||||
|
||||
auto make_reader = [&] (const buffer& first_buffer, const buffer& second_buffer, const buffer* const third_buffer, size_t max_buffer_size) {
|
||||
std::list<std::deque<mutation_fragment>> buffers;
|
||||
buffers.emplace_back(copy_fragments(*s.schema(), permit, first_buffer.frags));
|
||||
buffers.emplace_back(copy_fragments(*s.schema(), permit, second_buffer.frags));
|
||||
if (third_buffer) {
|
||||
buffers.emplace_back(copy_fragments(*s.schema(), permit, third_buffer->frags));
|
||||
}
|
||||
return create_evictable_reader_and_evict_after_first_buffer(
|
||||
s.schema(),
|
||||
permit,
|
||||
query::full_partition_range,
|
||||
s.schema()->full_slice(),
|
||||
std::move(buffers),
|
||||
*first_buffer.last_pos,
|
||||
max_buffer_size,
|
||||
false);
|
||||
};
|
||||
|
||||
testlog.info("Same partition, with static row");
|
||||
{
|
||||
buffer first_buffer(s, permit, pkey1);
|
||||
first_buffer.add_static_row();
|
||||
auto srow = mutation_fragment(*s.schema(), permit, first_buffer.frags.back());
|
||||
const auto buf_size = first_buffer.add_clustering_rows(0, second_buffer_ck);
|
||||
first_buffer.save_position();
|
||||
first_buffer.add_clustering_row(second_buffer_ck);
|
||||
|
||||
buffer second_buffer(s, permit, pkey1);
|
||||
second_buffer.add_static_row(std::move(srow));
|
||||
second_buffer.add_clustering_row(second_buffer_ck);
|
||||
second_buffer.add_clustering_row(second_buffer_ck + 1);
|
||||
second_buffer.add_partition_end();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
|
||||
.has_monotonic_positions();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
|
||||
.produces(first_buffer.muts[0] + second_buffer.muts[0])
|
||||
.produces_end_of_stream();
|
||||
}
|
||||
|
||||
testlog.info("Same partition, no static row");
|
||||
{
|
||||
buffer first_buffer(s, permit, pkey1);
|
||||
const auto buf_size = first_buffer.add_clustering_rows(0, second_buffer_ck);
|
||||
first_buffer.save_position();
|
||||
first_buffer.add_clustering_row(second_buffer_ck);
|
||||
|
||||
buffer second_buffer(s, permit, pkey1);
|
||||
second_buffer.add_clustering_row(second_buffer_ck);
|
||||
second_buffer.add_clustering_row(second_buffer_ck + 1);
|
||||
second_buffer.add_partition_end();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
|
||||
.has_monotonic_positions();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
|
||||
.produces(first_buffer.muts[0] + second_buffer.muts[0])
|
||||
.produces_end_of_stream();
|
||||
}
|
||||
|
||||
testlog.info("Same partition as expected, no static row, next partition has static row (#8923)");
|
||||
{
|
||||
buffer second_buffer(s, permit, pkey1);
|
||||
second_buffer.add_clustering_rows(second_buffer_ck, second_buffer_ck + second_buffer_ck / 2);
|
||||
// We want to end the buffer on the partition-start below, but since a
|
||||
// partition start will be dropped from it, we have to use the size
|
||||
// without it.
|
||||
const auto buf_size = second_buffer.add_partition_end();
|
||||
second_buffer.add_partition(pkey2);
|
||||
second_buffer.add_static_row();
|
||||
auto srow = mutation_fragment(*s.schema(), permit, second_buffer.frags.back());
|
||||
second_buffer.add_clustering_rows(0, 2);
|
||||
|
||||
buffer first_buffer(s, permit, pkey1);
|
||||
for (int i = 0; first_buffer.add_clustering_row(i) < buf_size; ++i);
|
||||
first_buffer.save_position();
|
||||
first_buffer.add_mutation_fragment(mutation_fragment(*s.schema(), permit, second_buffer.frags[1]));
|
||||
|
||||
buffer third_buffer(s, permit, pkey2);
|
||||
third_buffer.add_static_row(std::move(srow));
|
||||
third_buffer.add_clustering_rows(0, 2);
|
||||
third_buffer.add_partition_end();
|
||||
|
||||
first_buffer.find_position(buf_size);
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, &third_buffer, buf_size))
|
||||
.has_monotonic_positions();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, &third_buffer, buf_size))
|
||||
.produces(first_buffer.muts[0] + second_buffer.muts[0])
|
||||
.produces(second_buffer.muts[1] + third_buffer.muts[0])
|
||||
.produces_end_of_stream();
|
||||
}
|
||||
|
||||
testlog.info("Next partition, with no static row");
|
||||
{
|
||||
buffer first_buffer(s, permit, pkey1);
|
||||
const auto buf_size = first_buffer.add_clustering_rows(0, second_buffer_ck);
|
||||
first_buffer.save_position();
|
||||
first_buffer.add_clustering_row(second_buffer_ck + 1, true);
|
||||
|
||||
buffer second_buffer(s, permit, pkey2);
|
||||
second_buffer.add_clustering_rows(0, second_buffer_ck / 2);
|
||||
second_buffer.add_partition_end();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
|
||||
.has_monotonic_positions();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
|
||||
.produces(first_buffer.muts[0])
|
||||
.produces(second_buffer.muts[0])
|
||||
.produces_end_of_stream();
|
||||
}
|
||||
|
||||
testlog.info("Next partition, with static row");
|
||||
{
|
||||
buffer first_buffer(s, permit, pkey1);
|
||||
const auto buf_size = first_buffer.add_clustering_rows(0, second_buffer_ck);
|
||||
first_buffer.save_position();
|
||||
first_buffer.add_clustering_row(second_buffer_ck + 1, true);
|
||||
|
||||
buffer second_buffer(s, permit, pkey2);
|
||||
second_buffer.add_static_row();
|
||||
second_buffer.add_clustering_rows(0, second_buffer_ck / 2);
|
||||
second_buffer.add_partition_end();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
|
||||
.has_monotonic_positions();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
|
||||
.produces(first_buffer.muts[0])
|
||||
.produces(second_buffer.muts[0])
|
||||
.produces_end_of_stream();
|
||||
}
|
||||
}
|
||||
|
||||
struct mutation_bounds {
|
||||
std::optional<mutation> m;
|
||||
position_in_partition lower;
|
||||
|
||||
@@ -28,6 +28,8 @@
|
||||
#include "sstables/sstables.hh"
|
||||
#include "test/lib/mutation_source_test.hh"
|
||||
#include "test/lib/sstable_utils.hh"
|
||||
#include "test/lib/mutation_assertions.hh"
|
||||
#include "partition_slice_builder.hh"
|
||||
|
||||
using namespace sstables;
|
||||
using namespace std::chrono_literals;
|
||||
@@ -62,3 +64,69 @@ SEASTAR_TEST_CASE(test_sstable_conforms_to_mutation_source) {
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Regression test for scylladb/scylla-enterprise#2016
|
||||
SEASTAR_THREAD_TEST_CASE(test_produces_range_tombstone) {
|
||||
auto s = schema_builder("ks", "cf")
|
||||
.with_column("pk", int32_type, column_kind::partition_key)
|
||||
.with_column("ck", int32_type, column_kind::clustering_key)
|
||||
.with_column("v", int32_type, column_kind::regular_column)
|
||||
.build();
|
||||
|
||||
mutation m(s, partition_key::from_single_value(*s, int32_type->decompose(0)));
|
||||
m.partition().apply_row_tombstone(*s, range_tombstone{
|
||||
clustering_key::from_exploded(*s, {int32_type->decompose(6)}), bound_kind::excl_start,
|
||||
clustering_key::from_exploded(*s, {int32_type->decompose(10)}), bound_kind::incl_end,
|
||||
tombstone(0, gc_clock::time_point())
|
||||
});
|
||||
|
||||
{
|
||||
auto ckey = clustering_key::from_exploded(*s, {int32_type->decompose(6)});
|
||||
deletable_row& row = m.partition().clustered_row(*s, ckey, is_dummy::no, is_continuous(false));
|
||||
row.marker() = row_marker(4);
|
||||
}
|
||||
{
|
||||
auto ckey = clustering_key::from_exploded(*s, {int32_type->decompose(8)});
|
||||
deletable_row& row = m.partition().clustered_row(*s, ckey, is_dummy::no, is_continuous(false));
|
||||
row.apply(tombstone(2, gc_clock::time_point()));
|
||||
row.marker() = row_marker(5);
|
||||
}
|
||||
|
||||
testlog.info("m: {}", m);
|
||||
|
||||
auto slice = partition_slice_builder(*s)
|
||||
.with_range(query::clustering_range::make(
|
||||
{clustering_key::from_exploded(*s, {int32_type->decompose(8)}), false},
|
||||
{clustering_key::from_exploded(*s, {int32_type->decompose(10)}), true}
|
||||
))
|
||||
.build();
|
||||
|
||||
auto pr = dht::partition_range::make_singular(m.decorated_key());
|
||||
|
||||
std::vector<tmpdir> dirs;
|
||||
dirs.emplace_back();
|
||||
sstables::test_env::do_with_async([&] (sstables::test_env& env) {
|
||||
storage_service_for_tests ssft;
|
||||
auto version = sstable_version_types::la;
|
||||
auto index_block_size = 1;
|
||||
sstable_writer_config cfg = env.manager().configure_writer();
|
||||
cfg.promoted_index_block_size = index_block_size;
|
||||
|
||||
auto source = make_sstable_mutation_source(env, s, dirs.back().path().string(), {m}, cfg, version, gc_clock::now());
|
||||
|
||||
{
|
||||
auto rd = source.make_reader(s, tests::make_permit(), pr, slice);
|
||||
while (auto mf = rd(db::no_timeout).get0()) {
|
||||
testlog.info("produced {}", mutation_fragment::printer(*s, *mf));
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
auto rd = source.make_reader(s, tests::make_permit(), pr, slice);
|
||||
mutation_opt sliced_m = read_mutation_from_flat_mutation_reader(rd, db::no_timeout).get0();
|
||||
BOOST_REQUIRE(bool(sliced_m));
|
||||
|
||||
assert_that(*sliced_m).is_equal_to(m, slice.row_ranges(*m.schema(), m.key()));
|
||||
}
|
||||
}).get();
|
||||
}
|
||||
|
||||
@@ -6886,3 +6886,115 @@ SEASTAR_TEST_CASE(test_twcs_single_key_reader_filtering) {
|
||||
cf_stats.sstables_checked_by_clustering_filter - checked_by_ck);
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(max_ongoing_compaction_test) {
|
||||
return test_env::do_with_async([] (test_env& env) {
|
||||
BOOST_REQUIRE(smp::count == 1);
|
||||
|
||||
auto make_schema = [] (auto idx) {
|
||||
auto builder = schema_builder("tests", std::to_string(idx))
|
||||
.with_column("id", utf8_type, column_kind::partition_key)
|
||||
.with_column("cl", int32_type, column_kind::clustering_key)
|
||||
.with_column("value", int32_type);
|
||||
builder.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
|
||||
std::map <sstring, sstring> opts = {
|
||||
{time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY, "HOURS"},
|
||||
{time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, "1"},
|
||||
{time_window_compaction_strategy_options::EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY, "0"},
|
||||
};
|
||||
builder.set_compaction_strategy_options(std::move(opts));
|
||||
builder.set_gc_grace_seconds(0);
|
||||
return builder.build();
|
||||
};
|
||||
|
||||
auto cm = make_lw_shared<compaction_manager>();
|
||||
cm->enable();
|
||||
auto stop_cm = defer([&cm] {
|
||||
cm->stop().get();
|
||||
});
|
||||
|
||||
auto tmp = tmpdir();
|
||||
auto cl_stats = make_lw_shared<cell_locker_stats>();
|
||||
auto tracker = make_lw_shared<cache_tracker>();
|
||||
auto tokens = token_generation_for_shard(1, this_shard_id(), test_db_config.murmur3_partitioner_ignore_msb_bits(), smp::count);
|
||||
|
||||
auto next_timestamp = [] (auto step) {
|
||||
using namespace std::chrono;
|
||||
return (gc_clock::now().time_since_epoch() - duration_cast<microseconds>(step)).count();
|
||||
};
|
||||
auto make_expiring_cell = [&] (schema_ptr s, std::chrono::hours step) {
|
||||
static thread_local int32_t value = 1;
|
||||
|
||||
auto key_str = tokens[0].first;
|
||||
auto key = partition_key::from_exploded(*s, {to_bytes(key_str)});
|
||||
|
||||
mutation m(s, key);
|
||||
auto c_key = clustering_key::from_exploded(*s, {int32_type->decompose(value++)});
|
||||
m.set_clustered_cell(c_key, bytes("value"), data_value(int32_t(value)), next_timestamp(step), gc_clock::duration(step + 5s));
|
||||
return m;
|
||||
};
|
||||
|
||||
auto make_table_with_single_fully_expired_sstable = [&] (auto idx) {
|
||||
auto s = make_schema(idx);
|
||||
column_family::config cfg = column_family_test_config(env.manager());
|
||||
cfg.datadir = tmp.path().string() + "/" + std::to_string(idx);
|
||||
touch_directory(cfg.datadir).get();
|
||||
cfg.enable_commitlog = false;
|
||||
cfg.enable_incremental_backups = false;
|
||||
|
||||
auto sst_gen = [&env, s, dir = cfg.datadir, gen = make_lw_shared<unsigned>(1)] () mutable {
|
||||
return env.make_sstable(s, dir, (*gen)++, sstables::sstable::version_types::md, big);
|
||||
};
|
||||
|
||||
auto cf = make_lw_shared<column_family>(s, cfg, column_family::no_commitlog(), *cm, *cl_stats, *tracker);
|
||||
cf->start();
|
||||
cf->mark_ready_for_writes();
|
||||
|
||||
auto muts = { make_expiring_cell(s, std::chrono::hours(1)) };
|
||||
auto sst = make_sstable_containing(sst_gen, muts);
|
||||
column_family_test(cf).add_sstable(sst);
|
||||
return cf;
|
||||
};
|
||||
|
||||
std::vector<lw_shared_ptr<column_family>> tables;
|
||||
auto stop_tables = defer([&tables] {
|
||||
for (auto& t : tables) {
|
||||
t->stop().get();
|
||||
}
|
||||
});
|
||||
for (auto i = 0; i < 100; i++) {
|
||||
tables.push_back(make_table_with_single_fully_expired_sstable(i));
|
||||
}
|
||||
|
||||
// Make sure everything is expired
|
||||
forward_jump_clocks(std::chrono::hours(100));
|
||||
|
||||
for (auto& t : tables) {
|
||||
BOOST_REQUIRE(t->sstables_count() == 1);
|
||||
t->trigger_compaction();
|
||||
}
|
||||
|
||||
BOOST_REQUIRE(cm->get_stats().pending_tasks >= 1 || cm->get_stats().active_tasks >= 1);
|
||||
|
||||
size_t max_ongoing_compaction = 0;
|
||||
|
||||
// wait for submitted jobs to finish.
|
||||
auto end = [cm, &tables] {
|
||||
return cm->get_stats().pending_tasks == 0 && cm->get_stats().active_tasks == 0
|
||||
&& boost::algorithm::all_of(tables, [] (auto& t) { return t->sstables_count() == 0; });
|
||||
};
|
||||
while (!end()) {
|
||||
if (!cm->get_stats().pending_tasks && !cm->get_stats().active_tasks) {
|
||||
for (auto& t : tables) {
|
||||
if (t->sstables_count()) {
|
||||
t->trigger_compaction();
|
||||
}
|
||||
}
|
||||
}
|
||||
max_ongoing_compaction = std::max(cm->get_stats().active_tasks, max_ongoing_compaction);
|
||||
later().get();
|
||||
}
|
||||
BOOST_REQUIRE(cm->get_stats().errors == 0);
|
||||
BOOST_REQUIRE(max_ongoing_compaction == 1);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -440,7 +440,6 @@ def testNestedClusteringKeyUsage(cql, test_keyspace):
|
||||
)
|
||||
|
||||
# Reproduces issue #7868 and #7902
|
||||
@pytest.mark.xfail(reason="fails because of issue #7902")
|
||||
def testNestedClusteringKeyUsageWithReverseOrder(cql, test_keyspace):
|
||||
with create_table(cql, test_keyspace, "(a int, b frozen<map<set<int>, list<int>>>, c frozen<set<int>>, d int, PRIMARY KEY (a, b, c)) WITH CLUSTERING ORDER BY (b DESC)") as table:
|
||||
execute(cql, table, "INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, {}, set(), 0)
|
||||
|
||||
@@ -175,9 +175,12 @@ def wait_for_index(cql, table, column, everything):
|
||||
results = []
|
||||
for v in column_values:
|
||||
results.extend(list(cql.execute(f'SELECT * FROM {table} WHERE {column}={v}')))
|
||||
if set(results) == set(everything):
|
||||
|
||||
if sorted(results) == sorted(everything):
|
||||
return
|
||||
|
||||
time.sleep(0.1)
|
||||
|
||||
pytest.fail('Timeout waiting for index to become up to date.')
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
@@ -291,3 +294,46 @@ def test_contains_frozen_collection_ck(cql, test_keyspace):
|
||||
"SELECT * FROM " + table + " WHERE a=0 AND c=0 AND b CONTAINS 0 ALLOW FILTERING")))
|
||||
assert 1 == len(list(cql.execute(
|
||||
"SELECT * FROM " + table + " WHERE a=0 AND c=0 AND b CONTAINS KEY 0 ALLOW FILTERING")))
|
||||
|
||||
# table5 contains an indexed table with 3 clustering columns.
|
||||
# used to test correct filtering of rows fetched from an index table.
|
||||
@pytest.fixture(scope="module")
|
||||
def table5(cql, test_keyspace):
|
||||
table = test_keyspace + "." + unique_name()
|
||||
cql.execute(f"CREATE TABLE {table} (p int, c1 frozen<list<int>>, c2 frozen<list<int>>, c3 int, PRIMARY KEY (p,c1,c2,c3))")
|
||||
cql.execute(f"CREATE INDEX ON {table} (c3)")
|
||||
cql.execute(f"INSERT INTO {table} (p, c1, c2, c3) VALUES (0, [1], [2], 0)")
|
||||
cql.execute(f"INSERT INTO {table} (p, c1, c2, c3) VALUES (0, [2], [2], 0)")
|
||||
cql.execute(f"INSERT INTO {table} (p, c1, c2, c3) VALUES (0, [1], [3], 0)")
|
||||
cql.execute(f"INSERT INTO {table} (p, c1, c2, c3) VALUES (0, [1], [2], 1)")
|
||||
|
||||
everything = list(cql.execute(f"SELECT * FROM {table}"))
|
||||
wait_for_index(cql, table, 'c3', everything)
|
||||
yield (table, everything)
|
||||
cql.execute(f"DROP TABLE {table}")
|
||||
|
||||
# Test that implementation of filtering for indexes works ok.
|
||||
# Current implementation is a bit conservative - it might sometimes state
|
||||
# that filtering is needed when it isn't actually required, but at least it's safe.
|
||||
def test_select_indexed_cluster_three_keys(cql, table5):
|
||||
def check_good_row(row):
|
||||
return row.p == 0 and row.c1 == [1] and row.c2 == [2] and row.c3 == 0
|
||||
|
||||
check_af_optional(cql, table5, "c3 = 0", lambda r : r.c3 == 0)
|
||||
check_af_mandatory(cql, table5, "c1 = [1] AND c2 = [2] AND c3 = 0", check_good_row)
|
||||
check_af_mandatory(cql, table5, "p = 0 AND c1 CONTAINS 1 AND c3 = 0", lambda r : r.p == 0 and r.c1 == [1] and r.c3 == 0)
|
||||
check_af_mandatory(cql, table5, "p = 0 AND c1 = [1] AND c2 CONTAINS 2 AND c3 = 0", check_good_row)
|
||||
|
||||
# Doesn't use an index - shouldn't be affected
|
||||
check_af_optional(cql, table5, "p = 0 AND c1 = [1] AND c2 = [2] AND c3 = 0", check_good_row)
|
||||
|
||||
# Here are the cases where current implementation of need_filtering() fails
|
||||
# By coincidence they also fail on cassandra, it looks like cassandra is buggy
|
||||
@pytest.mark.xfail(reason="Too conservative need_filtering() implementation")
|
||||
def test_select_indexed_cluster_three_keys_conservative(cql, table5, cassandra_bug):
|
||||
def check_good_row(row):
|
||||
return row.p == 0 and row.c1 == [1] and row.c3 == 0
|
||||
|
||||
# Don't require filtering, but for now we report they do
|
||||
check_af_optional(cql, table5, "p = 0 AND c1 = [1] AND c3 = 0", check_good_row)
|
||||
check_af_optional(cql, table5, "p = 0 AND c1 = [1] AND c2 < [3] AND c3 = 0", lambda r : check_good_row(r) and r.c2 < [3])
|
||||
|
||||
@@ -26,7 +26,7 @@
|
||||
|
||||
from util import unique_name, new_test_table
|
||||
|
||||
from cassandra.protocol import FunctionFailure
|
||||
from cassandra.protocol import FunctionFailure, InvalidRequest
|
||||
|
||||
import pytest
|
||||
import random
|
||||
@@ -34,58 +34,62 @@ import random
|
||||
@pytest.fixture(scope="session")
|
||||
def table1(cql, test_keyspace):
|
||||
table = test_keyspace + "." + unique_name()
|
||||
cql.execute(f"CREATE TABLE {table} (p int PRIMARY KEY, v int, a ascii)")
|
||||
cql.execute(f"CREATE TABLE {table} (p int PRIMARY KEY, v int, a ascii, b boolean)")
|
||||
yield table
|
||||
cql.execute("DROP TABLE " + table)
|
||||
|
||||
# Test that failed fromJson() parsing an invalid JSON results in the expected
|
||||
# error - FunctionFailure - and not some weird internal error.
|
||||
# Reproduces issue #7911.
|
||||
@pytest.mark.xfail(reason="issue #7911")
|
||||
def test_failed_json_parsing_unprepared(cql, table1):
|
||||
p = random.randint(1,1000000000)
|
||||
with pytest.raises(FunctionFailure):
|
||||
cql.execute(f"INSERT INTO {table1} (p, v) VALUES ({p}, fromJson('dog'))")
|
||||
@pytest.mark.xfail(reason="issue #7911")
|
||||
def test_failed_json_parsing_prepared(cql, table1):
|
||||
p = random.randint(1,1000000000)
|
||||
stmt = cql.prepare(f"INSERT INTO {table1} (p, v) VALUES (?, fromJson(?))")
|
||||
with pytest.raises(FunctionFailure):
|
||||
cql.execute(stmt, [0, 'dog'])
|
||||
cql.execute(stmt, [p, 'dog'])
|
||||
|
||||
# Similarly, if the JSON parsing did not fail, but yielded a type which is
|
||||
# incompatible with the type we want it to yield, we should get a clean
|
||||
# FunctionFailure, not some internal server error.
|
||||
# We have here examples of returning a string where a number was expected,
|
||||
# and returning a unicode string where ASCII was expected.
|
||||
# and returning a unicode string where ASCII was expected, and returning
|
||||
# a number of the wrong type
|
||||
# Reproduces issue #7911.
|
||||
@pytest.mark.xfail(reason="issue #7911")
|
||||
def test_fromjson_wrong_type_unprepared(cql, table1):
|
||||
p = random.randint(1,1000000000)
|
||||
with pytest.raises(FunctionFailure):
|
||||
cql.execute(f"INSERT INTO {table1} (p, v) VALUES ({p}, fromJson('\"dog\"'))")
|
||||
with pytest.raises(FunctionFailure):
|
||||
cql.execute(f"INSERT INTO {table1} (p, a) VALUES ({p}, fromJson('3'))")
|
||||
@pytest.mark.xfail(reason="issue #7911")
|
||||
def test_fromjson_wrong_type_prepared(cql, table1):
|
||||
p = random.randint(1,1000000000)
|
||||
stmt = cql.prepare(f"INSERT INTO {table1} (p, v) VALUES (?, fromJson(?))")
|
||||
with pytest.raises(FunctionFailure):
|
||||
cql.execute(stmt, [0, '"dog"'])
|
||||
cql.execute(stmt, [p, '"dog"'])
|
||||
stmt = cql.prepare(f"INSERT INTO {table1} (p, a) VALUES (?, fromJson(?))")
|
||||
with pytest.raises(FunctionFailure):
|
||||
cql.execute(stmt, [0, '3'])
|
||||
@pytest.mark.xfail(reason="issue #7911")
|
||||
cql.execute(stmt, [p, '3'])
|
||||
def test_fromjson_bad_ascii_unprepared(cql, table1):
|
||||
p = random.randint(1,1000000000)
|
||||
with pytest.raises(FunctionFailure):
|
||||
cql.execute(f"INSERT INTO {table1} (p, a) VALUES ({p}, fromJson('\"שלום\"'))")
|
||||
@pytest.mark.xfail(reason="issue #7911")
|
||||
def test_fromjson_bad_ascii_prepared(cql, table1):
|
||||
p = random.randint(1,1000000000)
|
||||
stmt = cql.prepare(f"INSERT INTO {table1} (p, a) VALUES (?, fromJson(?))")
|
||||
with pytest.raises(FunctionFailure):
|
||||
cql.execute(stmt, [0, '"שלום"'])
|
||||
cql.execute(stmt, [p, '"שלום"'])
|
||||
def test_fromjson_nonint_unprepared(cql, table1):
|
||||
p = random.randint(1,1000000000)
|
||||
with pytest.raises(FunctionFailure):
|
||||
cql.execute(f"INSERT INTO {table1} (p, v) VALUES ({p}, fromJson('1.2'))")
|
||||
def test_fromjson_nonint_prepared(cql, table1):
|
||||
p = random.randint(1,1000000000)
|
||||
stmt = cql.prepare(f"INSERT INTO {table1} (p, v) VALUES (?, fromJson(?))")
|
||||
with pytest.raises(FunctionFailure):
|
||||
cql.execute(stmt, [p, '1.2'])
|
||||
|
||||
# The JSON standard does not define or limit the range or precision of
|
||||
# numbers. However, if a number is assigned to a Scylla number type, the
|
||||
@@ -105,7 +109,27 @@ def test_fromjson_int_overflow_prepared(cql, table1):
|
||||
p = random.randint(1,1000000000)
|
||||
stmt = cql.prepare(f"INSERT INTO {table1} (p, v) VALUES (?, fromJson(?))")
|
||||
with pytest.raises(FunctionFailure):
|
||||
cql.execute(stmt, [0, '2147483648'])
|
||||
cql.execute(stmt, [p, '2147483648'])
|
||||
|
||||
# Cassandra allows the strings "true" and "false", not just the JSON constants
|
||||
# true and false, to be assigned to a boolean column. However, very strangely,
|
||||
# it only allows this for prepared statements, and *not* for unprepared
|
||||
# statements - which result in an InvalidRequest!
|
||||
# Reproduces #7915.
|
||||
def test_fromjson_boolean_string_unprepared(cql, table1):
|
||||
p = random.randint(1,1000000000)
|
||||
with pytest.raises(InvalidRequest):
|
||||
cql.execute(f"INSERT INTO {table1} (p, b) VALUES ({p}, '\"true\"')")
|
||||
with pytest.raises(InvalidRequest):
|
||||
cql.execute(f"INSERT INTO {table1} (p, b) VALUES ({p}, '\"false\"')")
|
||||
@pytest.mark.xfail(reason="issue #7915")
|
||||
def test_fromjson_boolean_string_prepared(cql, table1):
|
||||
p = random.randint(1,1000000000)
|
||||
stmt = cql.prepare(f"INSERT INTO {table1} (p, b) VALUES (?, fromJson(?))")
|
||||
cql.execute(stmt, [p, '"true"'])
|
||||
assert list(cql.execute(f"SELECT p, b from {table1} where p = {p}")) == [(p, True)]
|
||||
cql.execute(stmt, [p, '"false"'])
|
||||
assert list(cql.execute(f"SELECT p, b from {table1} where p = {p}")) == [(p, False)]
|
||||
|
||||
# Test that null argument is allowed for fromJson(), with unprepared statement
|
||||
# Reproduces issue #7912.
|
||||
|
||||
@@ -22,7 +22,7 @@ import pytest
|
||||
from cassandra.protocol import SyntaxException, AlreadyExists, InvalidRequest, ConfigurationException, ReadFailure
|
||||
from cassandra.query import SimpleStatement
|
||||
|
||||
from util import new_test_table
|
||||
from util import new_test_table, unique_name
|
||||
|
||||
# A reproducer for issue #7443: Normally, when the entire table is SELECTed,
|
||||
# the partitions are returned sorted by the partitions' token. When there
|
||||
@@ -81,3 +81,107 @@ def test_paging_with_desc_clustering_order(cql, test_keyspace):
|
||||
cql.execute(f"INSERT INTO {table}(p,c) VALUES ({i}, 42)")
|
||||
stmt = SimpleStatement(f"SELECT * FROM {table} WHERE c = 42", fetch_size=1)
|
||||
assert len([row for row in cql.execute(stmt)]) == 3
|
||||
|
||||
# Test which ensures that indexes for a query are picked by the order in which
|
||||
# they appear in restrictions. That way, users can deterministically pick
|
||||
# which indexes are used for which queries.
|
||||
# Note that the order of picking indexing is not set in stone and may be
|
||||
# subject to change - in which case this test case should be amended as well.
|
||||
# The order tested in this case was decided as a good first step in issue
|
||||
# #7969, but it's possible that it will eventually be implemented another
|
||||
# way, e.g. dynamically based on estimated query selectivity statistics.
|
||||
# Ref: #7969
|
||||
@pytest.mark.xfail(reason="The order of picking indexes is currently arbitrary. Issue #7969")
|
||||
def test_order_of_indexes(scylla_only, cql, test_keyspace):
|
||||
schema = 'p int primary key, v1 int, v2 int, v3 int'
|
||||
with new_test_table(cql, test_keyspace, schema) as table:
|
||||
cql.execute(f"CREATE INDEX my_v3_idx ON {table}(v3)")
|
||||
cql.execute(f"CREATE INDEX my_v1_idx ON {table}(v1)")
|
||||
cql.execute(f"CREATE INDEX my_v2_idx ON {table}((p),v2)")
|
||||
# All queries below should use the first index they find in the list
|
||||
# of restrictions. Tracing information will be consulted to ensure
|
||||
# it's true. Currently some of the cases below succeed, because the
|
||||
# order is not well defined (and may, for instance, change upon
|
||||
# server restart), but some of them fail. Once a proper ordering
|
||||
# is implemented, all cases below should succeed.
|
||||
def index_used(query, index_name):
|
||||
assert any([index_name in event.description for event in cql.execute(query, trace=True).get_query_trace().events])
|
||||
index_used(f"SELECT * FROM {table} WHERE v3 = 1", "my_v3_idx")
|
||||
index_used(f"SELECT * FROM {table} WHERE v3 = 1 and v1 = 2 allow filtering", "my_v3_idx")
|
||||
index_used(f"SELECT * FROM {table} WHERE p = 1 and v1 = 1 and v3 = 2 allow filtering", "my_v1_idx")
|
||||
index_used(f"SELECT * FROM {table} WHERE p = 1 and v3 = 1 and v1 = 2 allow filtering", "my_v3_idx")
|
||||
# Local indexes are still skipped if they cannot be used
|
||||
index_used(f"SELECT * FROM {table} WHERE v2 = 1 and v1 = 2 allow filtering", "my_v1_idx")
|
||||
index_used(f"SELECT * FROM {table} WHERE v2 = 1 and v3 = 2 and v1 = 3 allow filtering", "my_v3_idx")
|
||||
index_used(f"SELECT * FROM {table} WHERE v1 = 1 and v2 = 2 and v3 = 3 allow filtering", "my_v1_idx")
|
||||
# Local indexes are still preferred over global ones, if they can be used
|
||||
index_used(f"SELECT * FROM {table} WHERE p = 1 and v1 = 1 and v3 = 2 and v2 = 2 allow filtering", "my_v2_idx")
|
||||
index_used(f"SELECT * FROM {table} WHERE p = 1 and v2 = 1 and v1 = 2 allow filtering", "my_v2_idx")
|
||||
|
||||
# Indexes can be created without an explicit name, in which case a default name is chosen.
|
||||
# However, due to #8620 it was possible to break the index creation mechanism by creating
|
||||
# a properly named regular table, which conflicts with the generated index name.
|
||||
def test_create_unnamed_index_when_its_name_is_taken(cql, test_keyspace):
|
||||
schema = 'p int primary key, v int'
|
||||
with new_test_table(cql, test_keyspace, schema) as table:
|
||||
try:
|
||||
cql.execute(f"CREATE TABLE {table}_v_idx_index (i_do_not_exist_in_the_base_table int primary key)")
|
||||
# Creating an index should succeed, even though its default name is taken
|
||||
# by the table above
|
||||
cql.execute(f"CREATE INDEX ON {table}(v)")
|
||||
finally:
|
||||
cql.execute(f"DROP TABLE {table}_v_idx_index")
|
||||
|
||||
# Indexed created with an explicit name cause a materialized view to be created,
|
||||
# and this view has a specific name - <index-name>_index. If there happens to be
|
||||
# a regular table (or another view) named just like that, index creation should fail.
|
||||
def test_create_named_index_when_its_name_is_taken(scylla_only, cql, test_keyspace):
|
||||
schema = 'p int primary key, v int'
|
||||
with new_test_table(cql, test_keyspace, schema) as table:
|
||||
index_name = unique_name()
|
||||
try:
|
||||
cql.execute(f"CREATE TABLE {test_keyspace}.{index_name}_index (i_do_not_exist_in_the_base_table int primary key)")
|
||||
# Creating an index should fail, because it's impossible to create
|
||||
# its underlying materialized view, because its name is taken by a regular table
|
||||
with pytest.raises(InvalidRequest, match="already exists"):
|
||||
cql.execute(f"CREATE INDEX {index_name} ON {table}(v)")
|
||||
finally:
|
||||
cql.execute(f"DROP TABLE {test_keyspace}.{index_name}_index")
|
||||
|
||||
# Tests for CREATE INDEX IF NOT EXISTS
|
||||
# Reproduces issue #8717.
|
||||
def test_create_index_if_not_exists(cql, test_keyspace):
|
||||
with new_test_table(cql, test_keyspace, 'p int primary key, v int') as table:
|
||||
cql.execute(f"CREATE INDEX ON {table}(v)")
|
||||
# Can't create the same index again without "IF NOT EXISTS", but can
|
||||
# do it with "IF NOT EXISTS":
|
||||
with pytest.raises(InvalidRequest, match="duplicate"):
|
||||
cql.execute(f"CREATE INDEX ON {table}(v)")
|
||||
cql.execute(f"CREATE INDEX IF NOT EXISTS ON {table}(v)")
|
||||
cql.execute(f"DROP INDEX {test_keyspace}.{table.split('.')[1]}_v_idx")
|
||||
|
||||
# Now test the same thing for named indexes. This is what broke in #8717:
|
||||
cql.execute(f"CREATE INDEX xyz ON {table}(v)")
|
||||
with pytest.raises(InvalidRequest, match="already exists"):
|
||||
cql.execute(f"CREATE INDEX xyz ON {table}(v)")
|
||||
cql.execute(f"CREATE INDEX IF NOT EXISTS xyz ON {table}(v)")
|
||||
cql.execute(f"DROP INDEX {test_keyspace}.xyz")
|
||||
|
||||
# Exactly the same with non-lower case name.
|
||||
cql.execute(f'CREATE INDEX "CamelCase" ON {table}(v)')
|
||||
with pytest.raises(InvalidRequest, match="already exists"):
|
||||
cql.execute(f'CREATE INDEX "CamelCase" ON {table}(v)')
|
||||
cql.execute(f'CREATE INDEX IF NOT EXISTS "CamelCase" ON {table}(v)')
|
||||
cql.execute(f'DROP INDEX {test_keyspace}."CamelCase"')
|
||||
|
||||
# Trying to create an index for an attribute that's already indexed,
|
||||
# but with a different name. The "IF NOT EXISTS" appears to succeed
|
||||
# in this case, but does not actually create the new index name -
|
||||
# only the old one remains.
|
||||
cql.execute(f"CREATE INDEX xyz ON {table}(v)")
|
||||
with pytest.raises(InvalidRequest, match="duplicate"):
|
||||
cql.execute(f"CREATE INDEX abc ON {table}(v)")
|
||||
cql.execute(f"CREATE INDEX IF NOT EXISTS abc ON {table}(v)")
|
||||
with pytest.raises(InvalidRequest):
|
||||
cql.execute(f"DROP INDEX {test_keyspace}.abc")
|
||||
cql.execute(f"DROP INDEX {test_keyspace}.xyz")
|
||||
|
||||
@@ -118,6 +118,8 @@ public:
|
||||
return stop_iteration::no;
|
||||
});
|
||||
});
|
||||
}).finally([&ir] () {
|
||||
return ir->close();
|
||||
});
|
||||
}).then([l] {
|
||||
return std::move(*l);
|
||||
|
||||
Submodule tools/java updated: 14e635e5de...e8accfbf45
@@ -97,12 +97,18 @@ future<> controller::do_start_server() {
|
||||
};
|
||||
|
||||
std::vector<listen_cfg> configs;
|
||||
int native_port_idx = -1, native_shard_aware_port_idx = -1;
|
||||
|
||||
if (cfg.native_transport_port() != 0) {
|
||||
configs.push_back(listen_cfg{ socket_address{ip, cfg.native_transport_port()}, false });
|
||||
if (cfg.native_transport_port.is_set() ||
|
||||
(!cfg.native_transport_port_ssl.is_set() && !cfg.native_transport_port.is_set())) {
|
||||
// Non-SSL port is specified || neither SSL nor non-SSL ports are specified
|
||||
configs.emplace_back(listen_cfg{ socket_address{ip, cfg.native_transport_port()}, false });
|
||||
native_port_idx = 0;
|
||||
}
|
||||
if (cfg.native_shard_aware_transport_port.is_set()) {
|
||||
configs.push_back(listen_cfg{ socket_address{ip, cfg.native_shard_aware_transport_port()}, true });
|
||||
if (cfg.native_shard_aware_transport_port.is_set() ||
|
||||
(!cfg.native_shard_aware_transport_port_ssl.is_set() && !cfg.native_shard_aware_transport_port.is_set())) {
|
||||
configs.emplace_back(listen_cfg{ socket_address{ip, cfg.native_shard_aware_transport_port()}, true });
|
||||
native_shard_aware_port_idx = native_port_idx + 1;
|
||||
}
|
||||
|
||||
// main should have made sure values are clean and neatish
|
||||
@@ -127,15 +133,20 @@ future<> controller::do_start_server() {
|
||||
|
||||
logger.info("Enabling encrypted CQL connections between client and server");
|
||||
|
||||
if (cfg.native_transport_port_ssl.is_set() && cfg.native_transport_port_ssl() != cfg.native_transport_port()) {
|
||||
if (cfg.native_transport_port_ssl.is_set() &&
|
||||
(!cfg.native_transport_port.is_set() ||
|
||||
cfg.native_transport_port_ssl() != cfg.native_transport_port())) {
|
||||
// SSL port is specified && non-SSL port is either left out or set to a different value
|
||||
configs.emplace_back(listen_cfg{{ip, cfg.native_transport_port_ssl()}, false, cred});
|
||||
} else {
|
||||
configs[0].cred = cred;
|
||||
} else if (native_port_idx >= 0) {
|
||||
configs[native_port_idx].cred = cred;
|
||||
}
|
||||
if (cfg.native_shard_aware_transport_port_ssl.is_set() && cfg.native_shard_aware_transport_port_ssl() != cfg.native_shard_aware_transport_port()) {
|
||||
if (cfg.native_shard_aware_transport_port_ssl.is_set() &&
|
||||
(!cfg.native_shard_aware_transport_port.is_set() ||
|
||||
cfg.native_shard_aware_transport_port_ssl() != cfg.native_shard_aware_transport_port())) {
|
||||
configs.emplace_back(listen_cfg{{ip, cfg.native_shard_aware_transport_port_ssl()}, true, std::move(cred)});
|
||||
} else if (cfg.native_shard_aware_transport_port.is_set()) {
|
||||
configs[1].cred = std::move(cred);
|
||||
} else if (native_shard_aware_port_idx >= 0) {
|
||||
configs[native_shard_aware_port_idx].cred = std::move(cred);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -572,7 +572,17 @@ future<foreign_ptr<std::unique_ptr<cql_server::response>>>
|
||||
} catch (const exceptions::prepared_query_not_found_exception& ex) {
|
||||
try { ++_server._stats.errors[ex.code()]; } catch(...) {}
|
||||
return make_unprepared_error(stream, ex.code(), ex.what(), ex.id, trace_state);
|
||||
} catch (const exceptions::function_execution_exception& ex) {
|
||||
try { ++_server._stats.errors[ex.code()]; } catch(...) {}
|
||||
return make_function_failure_error(stream, ex.code(), ex.what(), ex.ks_name, ex.func_name, ex.args, trace_state);
|
||||
} catch (const exceptions::cassandra_exception& ex) {
|
||||
// Note: the CQL protocol specifies that many types of errors have
|
||||
// mandatory parameters. These cassandra_exception subclasses MUST
|
||||
// be handled above. This default "cassandra_exception" case is
|
||||
// only appropriate for the specific types of errors which do not have
|
||||
// additional information, such as invalid_request_exception.
|
||||
// TODO: consider listing those types explicitly, instead of the
|
||||
// catch-all type cassandra_exception.
|
||||
try { ++_server._stats.errors[ex.code()]; } catch(...) {}
|
||||
return make_error(stream, ex.code(), ex.what(), trace_state);
|
||||
} catch (std::exception& ex) {
|
||||
@@ -1334,6 +1344,17 @@ std::unique_ptr<cql_server::response> cql_server::connection::make_unprepared_er
|
||||
return response;
|
||||
}
|
||||
|
||||
std::unique_ptr<cql_server::response> cql_server::connection::make_function_failure_error(int16_t stream, exceptions::exception_code err, sstring msg, sstring ks_name, sstring func_name, std::vector<sstring> args, const tracing::trace_state_ptr& tr_state) const
|
||||
{
|
||||
auto response = std::make_unique<cql_server::response>(stream, cql_binary_opcode::ERROR, tr_state);
|
||||
response->write_int(static_cast<int32_t>(err));
|
||||
response->write_string(msg);
|
||||
response->write_string(ks_name);
|
||||
response->write_string(func_name);
|
||||
response->write_string_list(args);
|
||||
return response;
|
||||
}
|
||||
|
||||
std::unique_ptr<cql_server::response> cql_server::connection::make_error(int16_t stream, exceptions::exception_code err, sstring msg, const tracing::trace_state_ptr& tr_state) const
|
||||
{
|
||||
auto response = std::make_unique<cql_server::response>(stream, cql_binary_opcode::ERROR, tr_state);
|
||||
|
||||
@@ -235,6 +235,7 @@ private:
|
||||
std::unique_ptr<cql_server::response> make_mutation_write_failure_error(int16_t stream, exceptions::exception_code err, sstring msg, db::consistency_level cl, int32_t received, int32_t numfailures, int32_t blockfor, db::write_type type, const tracing::trace_state_ptr& tr_state) const;
|
||||
std::unique_ptr<cql_server::response> make_already_exists_error(int16_t stream, exceptions::exception_code err, sstring msg, sstring ks_name, sstring cf_name, const tracing::trace_state_ptr& tr_state) const;
|
||||
std::unique_ptr<cql_server::response> make_unprepared_error(int16_t stream, exceptions::exception_code err, sstring msg, bytes id, const tracing::trace_state_ptr& tr_state) const;
|
||||
std::unique_ptr<cql_server::response> make_function_failure_error(int16_t stream, exceptions::exception_code err, sstring msg, sstring ks_name, sstring func_name, std::vector<sstring> args, const tracing::trace_state_ptr& tr_state) const;
|
||||
std::unique_ptr<cql_server::response> make_error(int16_t stream, exceptions::exception_code err, sstring msg, const tracing::trace_state_ptr& tr_state) const;
|
||||
std::unique_ptr<cql_server::response> make_ready(int16_t stream, const tracing::trace_state_ptr& tr_state) const;
|
||||
std::unique_ptr<cql_server::response> make_supported(int16_t stream, const tracing::trace_state_ptr& tr_state) const;
|
||||
|
||||
6
types.cc
6
types.cc
@@ -1637,10 +1637,10 @@ static void serialize_aux(const tuple_type_impl& type, const tuple_type_impl::na
|
||||
assert(elems.size() <= type.size());
|
||||
|
||||
for (size_t i = 0; i < elems.size(); ++i) {
|
||||
const data_type& t = type.type(i);
|
||||
const abstract_type& t = type.type(i)->without_reversed();
|
||||
const data_value& v = elems[i];
|
||||
if (!v.is_null() && t != v.type()) {
|
||||
throw std::runtime_error(format("tuple element type mismatch: expected {}, got {}", t->name(), v.type()->name()));
|
||||
if (!v.is_null() && t != *v.type()) {
|
||||
throw std::runtime_error(format("tuple element type mismatch: expected {}, got {}", t.name(), v.type()->name()));
|
||||
}
|
||||
|
||||
if (v.is_null()) {
|
||||
|
||||
@@ -263,6 +263,13 @@ decltype(auto) with_simplified(const View& v, Function&& fn)
|
||||
}
|
||||
}
|
||||
|
||||
template<FragmentedView View>
|
||||
void skip_empty_fragments(View& v) {
|
||||
while (!v.empty() && v.current_fragment().empty()) {
|
||||
v.remove_current();
|
||||
}
|
||||
}
|
||||
|
||||
template<FragmentedView V1, FragmentedView V2>
|
||||
int compare_unsigned(V1 v1, V2 v2) {
|
||||
while (!v1.empty() && !v2.empty()) {
|
||||
@@ -272,6 +279,8 @@ int compare_unsigned(V1 v1, V2 v2) {
|
||||
}
|
||||
v1.remove_prefix(n);
|
||||
v2.remove_prefix(n);
|
||||
skip_empty_fragments(v1);
|
||||
skip_empty_fragments(v2);
|
||||
}
|
||||
return v1.size_bytes() - v2.size_bytes();
|
||||
}
|
||||
@@ -286,5 +295,7 @@ void write_fragmented(Dest& dest, Src src) {
|
||||
memcpy(dest.current_fragment().data(), src.current_fragment().data(), n);
|
||||
dest.remove_prefix(n);
|
||||
src.remove_prefix(n);
|
||||
skip_empty_fragments(dest);
|
||||
skip_empty_fragments(src);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -69,11 +69,15 @@ public:
|
||||
// Starts a new phase and waits for all operations started in any of the earlier phases.
|
||||
// It is fine to start multiple awaits in parallel.
|
||||
// Strong exception guarantees.
|
||||
future<> advance_and_await() {
|
||||
future<> advance_and_await() noexcept {
|
||||
try {
|
||||
auto new_gate = make_lw_shared<gate>();
|
||||
++_phase;
|
||||
auto old_gate = std::exchange(_gate, std::move(new_gate));
|
||||
return old_gate->close().then([old_gate, op = start()] {});
|
||||
} catch (...) {
|
||||
return current_exception_as_future();
|
||||
}
|
||||
}
|
||||
|
||||
// Returns current phase number. The smallest value returned is 0.
|
||||
|
||||
@@ -120,6 +120,26 @@ protected:
|
||||
}
|
||||
};
|
||||
|
||||
void* internal::throwing_allocator::Malloc(size_t size) {
|
||||
void* ret = base::Malloc(size);
|
||||
if (size > 0 && !ret) {
|
||||
throw rjson::error(format("Failed to allocate {} bytes", size));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
void* internal::throwing_allocator::Realloc(void* orig_ptr, size_t orig_size, size_t new_size) {
|
||||
void* ret = base::Realloc(orig_ptr, orig_size, new_size);
|
||||
if (new_size > 0 && !ret) {
|
||||
throw rjson::error(format("Failed to reallocate {} bytes to {} bytes from {}", orig_size, new_size, orig_ptr));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
void internal::throwing_allocator::Free(void* ptr) {
|
||||
base::Free(ptr);
|
||||
}
|
||||
|
||||
std::string print(const rjson::value& value) {
|
||||
string_buffer buffer;
|
||||
guarded_yieldable_json_handler<writer, false> writer(buffer, 78);
|
||||
@@ -262,6 +282,15 @@ void set(rjson::value& base, rjson::string_ref_type name, rjson::string_ref_type
|
||||
base.AddMember(name, rjson::value(member), the_allocator);
|
||||
}
|
||||
|
||||
void replace_with_string_name(rjson::value& base, const std::string_view name, rjson::value&& member) {
|
||||
rjson::value *m = rjson::find(base, name);
|
||||
if (m) {
|
||||
*m = std::move(member);
|
||||
} else {
|
||||
set_with_string_name(base, name, std::move(member));
|
||||
}
|
||||
}
|
||||
|
||||
void push_back(rjson::value& base_array, rjson::value&& item) {
|
||||
base_array.PushBack(std::move(item), the_allocator);
|
||||
|
||||
|
||||
@@ -66,18 +66,35 @@ public:
|
||||
#include <rapidjson/writer.h>
|
||||
#include <rapidjson/stringbuffer.h>
|
||||
#include <rapidjson/error/en.h>
|
||||
#include <rapidjson/allocators.h>
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include "seastarx.hh"
|
||||
|
||||
namespace rjson {
|
||||
|
||||
using allocator = rapidjson::CrtAllocator;
|
||||
// The internal namespace is a workaround for the fact that fmt::format
|
||||
// also has a to_string_view function and erroneously looks up our rjson::to_string_view
|
||||
// if this allocator is in the rjson namespace.
|
||||
namespace internal {
|
||||
// Implements an interface conforming to the one in rapidjson/allocators.h,
|
||||
// but throws rjson::error on allocation failures
|
||||
class throwing_allocator : public rapidjson::CrtAllocator {
|
||||
using base = rapidjson::CrtAllocator;
|
||||
public:
|
||||
static const bool kNeedFree = base::kNeedFree;
|
||||
void* Malloc(size_t size);
|
||||
void* Realloc(void* orig_ptr, size_t orig_size, size_t new_size);
|
||||
static void Free(void* ptr);
|
||||
};
|
||||
}
|
||||
|
||||
using allocator = internal::throwing_allocator;
|
||||
using encoding = rapidjson::UTF8<>;
|
||||
using document = rapidjson::GenericDocument<encoding, allocator>;
|
||||
using document = rapidjson::GenericDocument<encoding, allocator, allocator>;
|
||||
using value = rapidjson::GenericValue<encoding, allocator>;
|
||||
using string_ref_type = value::StringRefType;
|
||||
using string_buffer = rapidjson::GenericStringBuffer<encoding>;
|
||||
using writer = rapidjson::Writer<string_buffer, encoding>;
|
||||
using string_buffer = rapidjson::GenericStringBuffer<encoding, allocator>;
|
||||
using writer = rapidjson::Writer<string_buffer, encoding, encoding, allocator>;
|
||||
using type = rapidjson::Type;
|
||||
|
||||
/**
|
||||
@@ -186,24 +203,37 @@ std::optional<T> get_opt(const rjson::value& value, std::string_view name) {
|
||||
}
|
||||
}
|
||||
|
||||
// Sets a member in given JSON object by moving the member - allocates the name.
|
||||
// The various set*() functions below *add* a new member to a JSON object.
|
||||
// They all assume that a member with the same key (name) doesn't already
|
||||
// exist in that object, so they are meant to be used just to build a new
|
||||
// object from scratch. If a member with the same name *may* exist, and
|
||||
// might need to be replaced, use the replace*() functions instead.
|
||||
// The benefit of the set*() functions is that they are faster (O(1),
// compared to O(n) for the replace*() functions, which need to inspect
// the existing members).
|
||||
|
||||
// Adds a member to a given JSON object by moving the member - allocates the name.
|
||||
// Throws if base is not a JSON object.
|
||||
// Assumes a member with the same name does not yet exist in base.
|
||||
void set_with_string_name(rjson::value& base, std::string_view name, rjson::value&& member);
|
||||
|
||||
// Sets a string member in given JSON object by assigning its reference - allocates the name.
|
||||
// Adds a string member to a given JSON object by assigning its reference - allocates the name.
|
||||
// NOTICE: member string liveness must be ensured to be at least as long as base's.
|
||||
// Throws if base is not a JSON object.
|
||||
// Assumes a member with the same name does not yet exist in base.
|
||||
void set_with_string_name(rjson::value& base, std::string_view name, rjson::string_ref_type member);
|
||||
|
||||
// Sets a member in given JSON object by moving the member.
|
||||
// Adds a member to a given JSON object by moving the member.
|
||||
// NOTICE: name liveness must be ensured to be at least as long as base's.
|
||||
// Throws if base is not a JSON object.
|
||||
// Assumes a member with the same name does not yet exist in base.
|
||||
void set(rjson::value& base, rjson::string_ref_type name, rjson::value&& member);
|
||||
|
||||
// Sets a string member in given JSON object by assigning its reference.
|
||||
// Adds a string member to a given JSON object by assigning its reference.
|
||||
// NOTICE: name liveness must be ensured to be at least as long as base's.
|
||||
// NOTICE: member liveness must be ensured to be at least as long as base's.
|
||||
// Throws if base is not a JSON object.
|
||||
// Assumes a member with the same name does not yet exist in base.
|
||||
void set(rjson::value& base, rjson::string_ref_type name, rjson::string_ref_type member);
|
||||
|
||||
/**
|
||||
@@ -224,6 +254,12 @@ set(rjson::value& base, rjson::string_ref_type name, T&& member) {
|
||||
set(base, std::move(name), std::move(v));
|
||||
}
|
||||
|
||||
// Sets a member in a given JSON object by moving the member - allocates the name.
// If a member with the same name already exists in base, it is replaced.
// Throws if base is not a JSON object.
|
||||
void replace_with_string_name(rjson::value& base, std::string_view name, rjson::value&& member);
|
||||
|
||||
|
||||
// Adds a value to a JSON list by moving the item to its end.
|
||||
// Throws if base_array is not a JSON array.
|
||||
void push_back(rjson::value& base_array, rjson::value&& item);
|
||||
|
||||
@@ -44,8 +44,9 @@ void merge_to_gently(std::list<T>& list1, const std::list<T>& list2, Compare com
|
||||
seastar::thread::maybe_yield();
|
||||
if (first1 == last1) {
|
||||
// Copy remaining items of list2 into list1
|
||||
std::copy_if(first2, last2, std::back_inserter(list1), [] (const auto&) { return true; });
|
||||
return;
|
||||
list1.insert(last1, *first2);
|
||||
++first2;
|
||||
continue;
|
||||
}
|
||||
if (comp(*first2, *first1)) {
|
||||
first1 = list1.insert(first1, *first2);
|
||||
|
||||
Reference in New Issue
Block a user