Compare commits


124 Commits

Author SHA1 Message Date
Botond Dénes
c37f5938fd mutation_writer: feed_writer(): handle exceptions from consume_end_of_stream()
Currently the exception handling code of feed_writer() assumes
consume_end_of_stream() doesn't throw. This is false: an exception
from said method can lead to an unclean destroy of the writer
and reader. Fix by handling exceptions from consume_end_of_stream()
as well.
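A minimal Python sketch of the fixed control flow (the `consume`/`abort`/`close` names are illustrative, not the actual Scylla API): the end-of-stream call sits inside the same try block as the consume loop, so a throw there still tears down both objects cleanly.

```python
def feed_writer(reader, writer):
    # Sketch, assuming duck-typed reader/writer objects: any exception,
    # including one from consume_end_of_stream(), aborts the writer and
    # closes the reader instead of leaving them to an unclean destroy.
    try:
        for fragment in reader.fragments():
            writer.consume(fragment)
        result = writer.consume_end_of_stream()  # may throw too
    except Exception:
        writer.abort()
        reader.close()
        raise
    reader.close()
    return result
```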

Closes #10147

(cherry picked from commit 1963d1cc25)
2022-03-03 10:45:40 +01:00
Yaron Kaikov
fa90112787 release: prepare for 4.4.9 2022-02-16 14:24:54 +02:00
Nadav Har'El
f5895e5c04 docker: don't repeat "--alternator-address" option twice
If the Docker startup script is passed both "--alternator-port" and
"--alternator-https-port", a combination which is supposed to be
allowed, it passes to Scylla the "--alternator-address" option twice.
This isn't necessary, and worse - not allowed.

So this patch fixes the scyllasetup.py script to only pass this
parameter once.
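A hedged sketch of the corrected scyllasetup.py logic (function name, defaults, and flag handling are illustrative): the address flag is appended once if any Alternator port is enabled, never once per port.

```python
def build_scylla_args(alternator_port=None, alternator_https_port=None,
                      alternator_address="0.0.0.0"):
    # Hypothetical helper: collect Scylla command-line arguments, emitting
    # --alternator-address at most once even when both ports are set.
    args = []
    if alternator_port:
        args += ["--alternator-port", str(alternator_port)]
    if alternator_https_port:
        args += ["--alternator-https-port", str(alternator_https_port)]
    if alternator_port or alternator_https_port:
        args += ["--alternator-address", alternator_address]
    return args
```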

Fixes #10016.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20220202165814.1700047-1-nyh@scylladb.com>
(cherry picked from commit cb6630040d)
2022-02-03 18:40:12 +02:00
Avi Kivity
ce944911f2 Update seastar submodule (gratuitous exceptions on allocation failure)
* seastar 59eeadc720...1fb2187322 (1):
  > core: memory: Avoid current_backtrace() on alloc failure when logging suppressed

Fixes #9982.
2022-01-30 20:08:43 +02:00
Avi Kivity
b220130e4a Revert "Merge 'scylla_raid_setup: use mdmonitor only when RAID level > 0' from Takuya ASADA"
This reverts commit de4f5b3b1f. This branch
doesn't support RAID 5, so it breaks at runtime.

Ref #9540.
2022-01-30 11:00:21 +02:00
Avi Kivity
de4f5b3b1f Merge 'scylla_raid_setup: use mdmonitor only when RAID level > 0' from Takuya ASADA
We found that the monitor mode of mdadm does not work on RAID0; this
is not a bug but expected behavior, according to a RHEL developer.
Therefore, we should stop enabling mdmonitor when RAID0 is specified.

Fixes #9540

----

This reverts 0d8f932 and introduces the correct fix.

Closes #9970

* github.com:scylladb/scylla:
  scylla_raid_setup: use mdmonitor only when RAID level > 0
  Revert "scylla_raid_setup: workaround for mdmonitor.service issue on CentOS8"

(cherry picked from commit df22396a34)
2022-01-27 10:27:45 +02:00
Avi Kivity
84a42570ec Update tools/java submodule (maxPendingPerConnection default)
* tools/java 14e635e5de...e8accfbf45 (2):
  > Fix NullPointerException in SettingsMode
  > cassandra-stress: Remove maxPendingPerConnection default

Ref #7748.
2022-01-12 21:38:48 +02:00
Nadav Har'El
001f57ec0c alternator: allow Authorization header to be without spaces
The "Authorization" HTTP header is used in DynamoDB API to sign
requests. Our parser for this header, in server::verify_signature(),
required the different components of this header to be separated by
a comma followed by whitespace - but it turns out that in DynamoDB
both spaces and commas are optional: one of them is enough.

At least one DynamoDB client library - the old "boto" (which predated
boto3) - builds this header without spaces.

In this patch we add a test that shows that an Authorization header
with spaces removed works fine in DynamoDB but didn't work in
Alternator, and after this patch modifies the parsing code for this
header, the test begins to pass (and the other tests show that the
previously-working cases didn't break).
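The relaxed separator rule described above - components split by commas, spaces, or both - can be sketched like this (a simplified model, not Alternator's actual parser; field names follow the SigV4 Authorization header):

```python
import re

def parse_authorization(header):
    # Split on any run of commas and/or whitespace, then keep the
    # "Name=value" components; tokens without "=" (e.g. the algorithm
    # name) are skipped.
    fields = {}
    for part in re.split(r"[,\s]+", header):
        name, sep, value = part.partition("=")
        if sep:
            fields[name] = value
    return fields
```

With this rule, a header built by old boto (no spaces) parses identically to one with the conventional ", " separators.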

Fixes #9568

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20211101214114.35693-1-nyh@scylladb.com>
(cherry picked from commit 56eb994d8f)
2021-12-29 15:07:47 +02:00
Nadav Har'El
3279718d52 alternator: return the correct Content-Type header
Although the DynamoDB API responses are JSON, additional conventions apply
to these responses - such as how error codes are encoded in JSON. For this
reason, DynamoDB uses the content type `application/x-amz-json-1.0` instead
of the standard `application/json` in its responses.

Until this patch, Scylla used `application/json` in its responses. This
unexpected content-type didn't bother any of the AWS libraries which we
tested, but it does bother the aiodynamo library (see HENNGE/aiodynamo#27).

Moreover, we should return the x-amz-json-1.0 content type for future
proofing: It turns out that AWS already defined x-amz-json-1.1 - see:
https://awslabs.github.io/smithy/1.0/spec/aws/aws-json-1_1-protocol.html
The 1.1 content type differs (only) in how it encodes error replies.
If one day DynamoDB starts to use this new reply format (it doesn't yet)
and DynamoDB libraries need to differentiate between the two
reply formats, Alternator had better return the right one.
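A tiny sketch of the changed response header (the helper and the header subset are illustrative; only the content-type value comes from the patch description):

```python
AMZ_JSON_1_0 = "application/x-amz-json-1.0"  # DynamoDB wire content type

def response_headers(body_len):
    # Illustrative subset of an Alternator reply's headers: the DynamoDB
    # content type is returned instead of plain application/json.
    return {
        "Content-Type": AMZ_JSON_1_0,
        "Content-Length": str(body_len),
    }
```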

This patch also includes a new test that the Content-Type header is
returned with the expected value. The test passes on DynamoDB, and
after this patch it starts to pass on Alternator as well.

Fixes #9554.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20211031094621.1193387-1-nyh@scylladb.com>
(cherry picked from commit 6ae0ea0c48)
2021-12-29 14:14:24 +02:00
Takuya ASADA
c128994f90 scylla_raid_setup: workaround for mdmonitor.service issue on CentOS8
On CentOS8, mdmonitor.service does not work correctly when using
mdadm-4.1-15.el8.x86_64 and later versions.
Until we find a solution, let's pin the package version to an older
one which does not cause the issue (4.1-14.el8.x86_64).

Fixes #9540

Closes #9782

(cherry picked from commit 0d8f932f0b)
2021-12-28 11:38:33 +02:00
Nadav Har'El
9af2e5ead1 Update Seastar module with additional backports
Backported an additional Seastar patch:

  > Merge 'metrics: Fix dtest->ulong conversion error' from Benny Halevy

Fixes #9794.
2021-12-14 13:06:02 +02:00
Avi Kivity
be695a7353 Revert "cql3: Reject updates with NULL key values"
This reverts commit 146f7b5421. It
causes a regression, and needs an additional fix. The bug is not
important enough to merit this complication.

Ref #9311.
2021-12-08 15:17:45 +02:00
Botond Dénes
cc9285697d mutation_reader: shard_reader: ensure referenced objects are kept alive
The shard reader can outlive its parent reader (the multishard reader).
This creates a problem for lifecycle management: readers take the range
and slice parameters by reference, and users keep these alive only as
long as the reader is alive. The shard reader outliving the top-level reader means
that any background read-ahead that it has to wait on will potentially
have stale references to the range and the slice. This was seen in the
wild recently when the evictable reader wrapped by the shard reader hit
a use-after-free while wrapping up a background read-ahead.
This problem was solved by fa43d76 but all previous versions are
susceptible to it.

This patch solves this problem by having the shard reader copy and keep
the range and slice parameters in stable storage, before passing them
further down.
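The ownership fix can be pictured with this Python model (the real code is C++; `deepcopy` stands in for copying into stable storage owned by the shard reader):

```python
import copy

class ShardReader:
    # Sketch: the shard reader keeps its own copies of the range and slice,
    # so background read-ahead that outlives the parent reader never
    # dereferences the caller's (possibly destroyed) objects.
    def __init__(self, prange, slice_):
        self._prange = copy.deepcopy(prange)  # stable storage
        self._slice = copy.deepcopy(slice_)

    def read_ahead(self):
        return (self._prange, self._slice)
```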

Fixes: #9719

Tests: unit(dev)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20211202113910.484591-1-bdenes@scylladb.com>
(cherry picked from commit 417e853b9b)
2021-12-06 15:25:57 +02:00
Nadav Har'El
21d140febc alternator: add missing BatchGetItem metric
Unfortunately, defining metrics in Scylla requires some code
duplication, with the metrics declared in one place but exported in a
different place in the code. When we duplicated this code in Alternator,
we accidentally dropped the first metric - for BatchGetItem. The metric
was accounted in the code, but not exported to Prometheus.

In addition to fixing the missing metric, this patch also adds a test
that confirms that the BatchGetItem metric increases when the
BatchGetItem operation is used. This test failed before this patch, and
passes with it. The test only currently tests this for BatchGetItem
(and BatchWriteItem) but it can be later expanded to cover all the other
operations as well.

Fixes #9406

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20210929121611.373074-1-nyh@scylladb.com>
(cherry picked from commit 5cbe9178fd)
2021-12-06 12:45:34 +02:00
Yaron Kaikov
77e05ca482 release: prepare for 4.4.8 2021-12-05 21:52:32 +02:00
Eliran Sinvani
5375b8f1a1 testlib: close index_reader to avoid race condition
In order to avoid the race condition introduced in 9dce1e4, the
index_reader should be closed prior to its destruction.
This only exposes 4.4 and earlier releases to this specific race.
However, it is always a good idea to first close the index reader
and only then destroy it, since developers who change the reader
index in the future will most likely assume it has been closed.

Ref #9704 (because 4.4 and earlier releases are vulnerable).

Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>

Fixes #9704

(cherry picked from commit ddd7248b3b)

Closes #9717
2021-12-05 12:02:13 +01:00
Juliusz Stasiewicz
7a82432e38 transport: Fix abort on certain configurations of native_transport_port(_ssl)
The reason was accessing the `configs` table out of bounds. Also,
native_transport_port-s can no longer be disabled by setting them to 0,
as per the table below.

Rules for port/encryption (the same apply to shard_aware counterpart):

np  := native_transport_port.is_set()
nps := native_transport_port_ssl.is_set()
ceo := ceo.at("enabled") == "true"
eq  := native_transport_port_ssl() == native_transport_port()

+-----+-----+-----+-----+
|  np | nps | ceo |  eq |
+-----+-----+-----+-----+
|  0  |  0  |  0  |  *  |   =>   listen on native_transport_port, unencrypted
|  0  |  0  |  1  |  *  |   =>   listen on native_transport_port, encrypted
|  0  |  1  |  0  |  *  |   =>   nonsense, don't listen
|  0  |  1  |  1  |  *  |   =>   listen on native_transport_port_ssl, encrypted
|  1  |  0  |  0  |  *  |   =>   listen on native_transport_port, unencrypted
|  1  |  0  |  1  |  *  |   =>   listen on native_transport_port, encrypted
|  1  |  1  |  0  |  *  |   =>   listen on native_transport_port, unencrypted
|  1  |  1  |  1  |  0  |   =>   listen on native_transport_port, unencrypted + native_transport_port_ssl, encrypted
|  1  |  1  |  1  |  1  |   =>   native_transport_port(_ssl), encrypted
+-----+-----+-----+-----+
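The table above can be encoded as a small decision function (a hedged model of the rules, not the actual C++ code; listeners are returned as `(port_name, encrypted)` pairs, and an empty list means "don't listen"):

```python
def listeners(np, nps, ceo, eq):
    # np/nps: ports set; ceo: client encryption enabled; eq: ports equal.
    if not nps:                      # only the plain port is configured
        return [("native_transport_port", ceo)]
    if not np:                       # only the ssl port is configured
        return [("native_transport_port_ssl", True)] if ceo else []
    if not ceo:                      # both set, encryption disabled
        return [("native_transport_port", False)]
    if eq:                           # both set to the same value
        return [("native_transport_port", True)]
    return [("native_transport_port", False),        # distinct ports:
            ("native_transport_port_ssl", True)]     # one of each
```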

Fixes #7783
Fixes #7866

Closes #7992

(cherry picked from commit 29e4737a9b)
2021-11-29 17:37:31 +02:00
Dejan Mircevski
146f7b5421 cql3: Reject updates with NULL key values
We were silently ignoring INSERTs with NULL values for primary-key
columns, which Cassandra rejects.  Fix it by rejecting any
modification_statement that would operate on empty partition or
clustering range.

This is the most direct fix, because range and slice are calculated in
one place for all modification statements.  It covers not only NULL
cases, but also impossible restrictions like c>0 AND c<0.
Unfortunately, Cassandra doesn't treat all modification statements
consistently, so this fix cannot fully match its behavior.  We err on
the side of tolerance, accepting some DELETE statements that Cassandra
rejects.  We add a TODO for rejecting such DELETEs later.
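The shape of the fix - reject any modification whose computed key ranges came out empty - can be sketched as (names and the exception type are illustrative):

```python
class InvalidRequest(Exception):
    pass

def check_modification(partition_ranges, clustering_ranges):
    # Sketch: a NULL key value, or an impossible restriction such as
    # "c > 0 AND c < 0", yields an empty range; reject the statement
    # instead of silently ignoring it.
    if not partition_ranges:
        raise InvalidRequest("modification targets an empty partition range")
    if not clustering_ranges:
        raise InvalidRequest("modification targets an empty clustering range")
```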

Fixes #7852.

Tests: unit (dev), cql-pytest against Cassandra 4.0

Signed-off-by: Dejan Mircevski <dejan@scylladb.com>

Closes #9286

(cherry picked from commit 1fdaeca7d0)
2021-11-29 17:31:54 +02:00
Nadav Har'El
e1c7a906f0 cql: fix error return from execution of fromJson() and other functions
As reproduced in cql-pytest/test_json.py and reported in issue #7911,
failing fromJson() calls should return a FUNCTION_FAILURE error, but
currently produce a generic SERVER_ERROR, which can lead the client
to think the server experienced some unknown internal error and the
query can be retried on another server.

This patch adds a new cassandra_exception subclass that we were missing -
function_execution_exception - properly formats this error message (as
described in the CQL protocol documentation), and uses this exception
in two cases:

1. Parse errors in fromJson()'s parameters are converted into a
   function_execution_exception.

2. Any exceptions during the execute() of a native_scalar_function_for
   function are converted into a function_execution_exception.
   In particular, fromJson() uses a native_scalar_function_for.

   Note, however, that for functions which already took care to produce
   a specific Cassandra error, that error is passed through and not
   converted to a function_execution_exception. An example is
   blobAsText(), which can return an invalid_request error, so
   it is left as such and not converted. This also happens in Cassandra.
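The conversion rule can be modeled like this (class and function names are illustrative; only the pass-through vs. convert behavior comes from the patch description):

```python
class CassandraException(Exception):
    """Base for errors that already carry a specific CQL error code."""

class InvalidRequest(CassandraException):
    pass

class FunctionExecutionException(CassandraException):
    """Stands in for the new FUNCTION_FAILURE error."""

def execute_scalar_function(name, fn, args):
    # A specific Cassandra error passes through untouched; anything else
    # becomes a function-execution error rather than a generic SERVER_ERROR.
    try:
        return fn(*args)
    except CassandraException:
        raise
    except Exception as e:
        raise FunctionExecutionException(f"execution of {name} failed: {e}")
```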

All relevant tests in cql-pytest/test_json.py now pass, and are
no longer marked xfail. This patch also includes a few more improvements
to test_json.py.

Fixes #7911

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20210118140114.4149997-1-nyh@scylladb.com>
(cherry picked from commit 702b1b97bf)
2021-11-29 16:59:56 +02:00
Piotr Jastrzebski
c5d6e75db8 sstables: Fix writing KA/LA sstables index
Before this patch when writing an index block, the sstables writer was
storing range tombstones that span the boundary of the block in order
of end bounds. This led to a range tombstone being ignored by a reader
if there was a row tombstone inside it.

This patch sorts the range tombstones based on start bound before
writing them to the index file.
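The reordering itself is a one-liner; here is a sketch with tombstones modeled as `(start, end)` pairs (a simplification of the real bound objects):

```python
def order_for_index_block(spanning_tombstones):
    # The fix: write range tombstones that span the block boundary sorted
    # by start bound, not by end bound.
    return sorted(spanning_tombstones, key=lambda t: t[0])
```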

The assumption is that writing an index block is rare so we can afford
sortting the tombstones at that point. Additionally this is a writer of
an old format and writing to it will be dropped in the next major
release so it should be rarely used already.

Kudos to Kamil Braun <kbraun@scylladb.com> for finding the reproducer.

Test: unit(dev)

Fixes #9690

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
(cherry picked from commit scylladb/scylla-enterprise@eb093afd6f)
(cherry picked from commit ab425a11a8)
2021-11-28 11:09:39 +02:00
Tomasz Grabiec
da630e80ea cql: Fix missing data in indexed queries with base table short reads
Indexed queries are using paging over the materialized view
table. Results of the view read are then used to issue reads of the
base table. If base table reads are short reads, the page is returned
to the user and paging state is adjusted accordingly so that when
paging is resumed it will query the view starting from the row
corresponding to the next row in the base which was not yet
returned. However, paging state's "remaining" count was not reset, so
if the view read was exhausted the reading will stop even though the
base table read was short.

Fix by restoring the "remaining" count when adjusting the paging state
on short read.

Tests:

  - index_with_paging_test
  - secondary_index_test

Fixes #9198
Message-Id: <20210818131840.1160267-1-tgrabiec@scylladb.com>

(cherry picked from commit 1e4da2dcce)
2021-11-23 11:22:30 +02:00
Takuya ASADA
8ea1cbe78d docker: add stopwaitsecs
We need stopwaitsecs just like we do TimeoutStopSec=900 on
scylla-server.service, to avoid a timeout on scylla-server shutdown.

Fixes #9485

Closes #9545

(cherry picked from commit c9499230c3)
2021-11-15 13:36:48 +02:00
Asias He
03b04d40f2 gossip: Fix use-after-free in real_mark_alive and mark_dead
In commit 11a8912093 (gossiper:
get_gossip_status: return string_view and make noexcept)
get_gossip_status returns a pointer to an endpoint_state in
endpoint_state_map.

After commit 425e3b1182 (gossip: Introduce
direct failure detector), gossiper::mark_dead and gossiper::real_mark_alive
can yield in the middle of the function. It is possible that
endpoint_state can be removed, causing use-after-free to access it.

To fix, make a copy before we yield.

Fixes #8859

Closes #8862

(cherry picked from commit 7a32cab524)
2021-11-15 13:23:11 +02:00
Takuya ASADA
175d004513 scylla_util.py: On is_gce(), return False when it's on GKE
The GKE metadata server does not provide the same metadata as GCE, so
we should not return True on is_gce().
Instead, try to fetch machine-type from the metadata server, and return
False if it returns 404 Not Found.
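A testable sketch of this check (the real scylla_util.py makes the HTTP call itself; here the metadata request is injected as a callable so the decision logic stands alone):

```python
def is_gce(fetch_machine_type_status):
    # fetch_machine_type_status() performs the metadata-server request for
    # the instance machine-type and returns the HTTP status code, raising
    # OSError on network failure. GKE's metadata server answers 404 here,
    # so only a 200 counts as a real GCE instance.
    try:
        return fetch_machine_type_status() == 200
    except OSError:
        return False
```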

Fixes #9471

Signed-off-by: Takuya ASADA <syuu@scylladb.com>

Closes #9582

(cherry picked from commit 9b4cf8c532)
2021-11-15 13:18:13 +02:00
Asias He
091b794742 repair: Return HTTP 400 when repair id is not found
There are two APIs for checking the repair status and they behave
differently in case the id is not found.

```
{"host": "192.168.100.11:10001", "method": "GET", "uri":
"/storage_service/repair_async/system_auth?id=999", "duration": "1ms",
"status": 400, "bytes": 49, "dump": "HTTP/1.1 400 Bad
Request\r\nContent-Length: 49\r\nContent-Type: application/json\r\nDate:
Wed, 03 Nov 2021 10:49:33 GMT\r\nServer: Seastar
httpd\r\n\r\n{\"message\": \"unknown repair id 999\", \"code\": 400}"}

{"host": "192.168.100.11:10001", "method": "GET", "uri":
"/storage_service/repair_status?id=999&timeout=1", "duration": "0ms",
"status": 500, "bytes": 49, "dump": "HTTP/1.1 500 Internal Server
Error\r\nContent-Length: 49\r\nContent-Type: application/json\r\nDate:
Wed, 03 Nov 2021 10:49:33 GMT\r\nServer: Seastar
httpd\r\n\r\n{\"message\": \"unknown repair id 999\", \"code\": 500}"}
```

The correct status code is 400 as this is a parameter error and should
not be retried.

Returning status code 500 makes smarter http clients retry the request
in hopes of server recovering.

After this patch:

curl -X GET
'http://127.0.0.1:10000/storage_service/repair_async/system_auth?id=9999'
{"message": "unknown repair id 9999", "code": 400}

curl -X GET
'http://127.0.0.1:10000/storage_service/repair_status?id=9999'
{"message": "unknown repair id 9999", "code": 400}

Fixes #9576

Closes #9578

(cherry picked from commit f5f5714aa6)
2021-11-15 13:16:08 +02:00
Calle Wilund
8be87bb0b1 cdc: fix broken function signature in maybe_back_insert_iterator
Fixes #9103

The compare overload was declared as "bool" even though it is a tri-cmp.
This causes us to never use the speed-up shortcut (lessening the search
set), in turn meaning more overhead for collections.

Closes #9104

(cherry picked from commit 59555fa363)
2021-11-15 13:13:51 +02:00
Takuya ASADA
a84142705a scylla_io_setup: handle nr_disks on GCP correctly
nr_disks is an int, and should not be a string.

Fixes #9429

Closes #9430

(cherry picked from commit 3b798afc1e)
2021-11-15 13:06:40 +02:00
Michał Chojnowski
fc32534aee utils: fragment_range: fix FragmentedView utils for views with empty fragments
The copying and comparing utilities for FragmentedView are not prepared
to deal with empty fragments in non-empty views, and will fall into an
infinite loop in such a case.
But data coming in result_row_view can contain such fragments, so we
need to fix that.
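The fixed walk can be modeled in Python with a view as a list of bytes fragments (a simplification of the C++ cursor): the cursor advances unconditionally on every iteration, so an empty fragment inside a non-empty view cannot stall it.

```python
def copy_fragmented(view):
    # Sketch of the corrected copy loop: progress is measured by the
    # fragment index, not by bytes consumed, so zero-length fragments
    # are skipped over rather than looping forever on them.
    out = bytearray()
    i = 0
    while i < len(view):
        out += view[i]   # copying zero bytes is fine; we still advance
        i += 1
    return bytes(out)
```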

Fixes #8398.

Closes #8397

(cherry picked from commit f23a47e365)
2021-11-15 12:57:21 +02:00
Hagit Segev
4e526ad88a release: prepare for 4.4.7 2021-11-14 19:54:05 +02:00
Avi Kivity
176f253aa3 build: clobber user/group info from node_exporter tarball
node_exporter is packaged with some random uid/gid in the tarball.
When extracting it as an ordinary user this isn't a problem, since
the uid/gid are reset to the current user, but that doesn't happen
under dbuild since `tar` thinks the current user is root. This causes
a problem if one wants to delete the build directory later, since it
becomes owned by some random user (see /etc/subuid)

Reset the uid/gid information so this doesn't happen.

Closes #9579

Fixes #9610.

(cherry picked from commit e1817b536f)
2021-11-10 14:19:28 +02:00
Nadav Har'El
c49cd5d9b6 alternator: fix bug in ReturnValues=ALL_NEW
This patch fixes a bug in UpdateItem's ReturnValues=ALL_NEW, which in
some cases returned the OLD (pre-modification) value of some of the
attributes, instead of its NEW value.

The bug was caused by a confusion in our JSON utility function,
rjson::set(), which sounds like it can set any member of a map, but in
fact may only be used to add a *new* member - if a member with the same
name (key) already existed, the result is undefined (two values for the
same key). In ReturnValues=ALL_NEW we did exactly this: we started with
a copy of the original item, and then used set() to override some of the
members. This is not allowed.

So in this patch, we introduce a new function, rjson::replace(), which
does what we previously thought that rjson::set() does - i.e., replace a
member if it exists, or if not, add it. We call this function in
the ReturnValues=ALL_NEW code.
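The difference between the two functions can be modeled with members stored as a flat list of pairs, which is roughly how rapidjson keeps object members (a sketch, not the rjson API):

```python
def rjson_set(members, key, value):
    # Models the old rjson::set(): appends blindly, so "setting" an
    # existing key leaves two members with the same name (undefined
    # behavior for callers reading the object back).
    members.append([key, value])

def rjson_replace(members, key, value):
    # Models the new rjson::replace(): overwrite the member if it
    # exists, otherwise add it.
    for member in members:
        if member[0] == key:
            member[1] = value
            return
    members.append([key, value])
```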

This patch also adds a test case that reproduces the incorrect ALL_NEW
results - and gets fixed by this patch.

In an upcoming patch, we should rename the confusingly-named set()
functions and audit all their uses. But we don't do this in this patch
yet. We just add some comments to clarify what set() does - but don't
change it, and just add one new function for replace().

Fixes #9542

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20211104134937.40797-1-nyh@scylladb.com>
(cherry picked from commit b95e431228)
2021-11-08 14:10:36 +02:00
Dejan Mircevski
5d4abb521b types: Unreverse tuple subtype for serialization
When a tuple value is serialized, we go through every element type and
use it to serialize element values.  But an element type can be
reversed, which is artificially different from the type of the value
being read.  This results in a server error due to the type mismatch.
Fix it by unreversing the element type prior to comparing it to the
value type.
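A sketch of the unreversing step, with a reversed type modeled as a `("reversed", inner)` wrapper (purely illustrative; the real code works on CQL abstract types):

```python
def unreversed(t):
    # Strip any "reversed" wrapper(s) to get the underlying type.
    while isinstance(t, tuple) and t[0] == "reversed":
        t = t[1]
    return t

def element_types_match(declared, actual):
    # Per the fix: compare tuple element types with reversedness stripped,
    # so a reversed clustering type no longer trips the mismatch check.
    return unreversed(declared) == unreversed(actual)
```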

Fixes #7902

Tests: unit (dev)

Signed-off-by: Dejan Mircevski <dejan@scylladb.com>

Closes #8316

(cherry picked from commit 318f773d81)
2021-11-07 19:25:43 +02:00
Asias He
cfc2562dec storage_service: Abort restore_replica_count when node is removed from the cluster
Consider the following procedure:

- n1, n2, n3
- n3 is down
- n1 runs nodetool removenode uuid_of_n3 to remove node n3 from the
  cluster
- n1 is down in the middle of removenode operation

Node n1 will set n3 to removing gossip status during removenode
operation. Whenever existing nodes learn a node is in removing gossip
status, they will call restore_replica_count to stream data from other
nodes for the ranges n3 loses if n3 was removed from the cluster. If
the streaming fails, the streaming will sleep and retry. The current
max number of retry attempts is 5. The sleep interval starts at 60
seconds and increases 1.5 times per sleep.

This can leave the cluster in a bad state. For example, nodes can go
out of disk space if the streaming continues.  We need a way to abort
such streaming attempts.

To abort the removenode operation and forcibly remove the node, users
can run `nodetool removenode force` on any existing nodes to move the
node from removing gossip status to removed gossip status. However,
the restore_replica_count will not be aborted.

In this patch, a status checker is added in restore_replica_count, so
that once a node is in removed gossip status, restore_replica_count
will be aborted.
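The status checker can be sketched like this (retry/backoff details elided; the streaming and gossip-status lookups are injected as callables for illustration):

```python
def restore_replica_count(do_stream, gossip_status, max_retries=5):
    # Sketch of the fix: consult the node's gossip status before each
    # attempt; once the operator has forced it to "removed" (nodetool
    # removenode force), abort instead of sleeping and retrying.
    for _ in range(max_retries):
        if gossip_status() == "removed":
            return "aborted"
        if do_stream():
            return "done"
        # (real code sleeps here, starting at 60s and growing 1.5x)
    return "failed"
```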

This patch is for older releases without the new NODE_OPS_CMD
infrastructure where such abort will happen automatically in case of
error.

Fixes #8651

Closes #8655

(cherry picked from commit 0858619cba)
2021-11-02 17:26:35 +02:00
Benny Halevy
4a1171e2fa large_data_handle: add sstable name to log messages
Although the sstable name is part of the system.large_* records,
it is not printed in the log.
In particular, this is essential for the "too many rows" warning,
which currently does not record a row in any large_* table,
so we can't correlate it with an sstable.

Fixes #9524

Test: unit(dev)
DTest: wide_rows_test.py

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20211027074104.1753093-1-bhalevy@scylladb.com>
(cherry picked from commit a21b1fbb2f)
2021-10-29 10:48:35 +03:00
Asias He
542a508c50 repair: Handle everywhere_topology in bootstrap_with_repair
The everywhere_topology returns the number of nodes in the cluster as
RF. This makes streaming only from the node losing the range impossible,
since no node loses the range after bootstrap.

Shortcut to stream from all nodes in local dc in case the keyspace is
everywhere_topology.

Fixes #8503

(cherry picked from commit 3c36517598)
2021-10-28 18:56:23 +03:00
Hagit Segev
dd018d4de4 release: prepare for 4.4.6 2021-10-28 18:00:13 +03:00
Benny Halevy
70098a1991 date_tiered_manifest: get_now: fix use after free of sstable_list
The sstable_list is destroyed right after the temporary
lw_shared_ptr<sstable_list> returned from `cf.get_sstables()`
is dereferenced.

Fixes #9138

Test: unit(dev)
DTest: resharding_test.py:ReshardingTombstones_with_DateTieredCompactionStrategy.disable_tombstone_removal_during_reshard_test (debug)

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20210804075813.42526-1-bhalevy@scylladb.com>
(cherry picked from commit 3ad0067272)
2021-10-28 11:24:03 +03:00
Jan Ciolek
008f2ff370 cql3: Fix need_filtering on indexed table
There were cases where a query on an indexed table
needed filtering but need_filtering returned false.

This is fixed by using new conditions in cases where
we are using an index.

Fixes #8991.
Fixes #7708.

For now this is an overly conservative implementation
that returns true in some cases where filtering
is not needed.

Signed-off-by: Jan Ciolek <jan.ciolek@scylladb.com>
(cherry picked from commit 54149242b4)
2021-10-28 11:24:03 +03:00
Benny Halevy
f71cdede5e bytes_ostream: max_chunk_size: account for chunk header
Currently, if the data_size is greater than
max_chunk_size - sizeof(chunk), we end up
allocating up to max_chunk_size + sizeof(chunk) bytes,
exceeding buf.max_chunk_size().

This may lead to allocation failures, as seen in
https://github.com/scylladb/scylla/issues/7950,
where we couldn't allocate 131088 (= 128K + 16) bytes.

This change adjusts the exposed max_chunk_size()
to be max_alloc_size (128KB) - sizeof(chunk),
so that chunks in the write() path are normally
allocated in 128KB allocations.
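The arithmetic is easy to check in a few lines (the 16-byte header size is an assumption inferred from the 131088 = 128K + 16 figure in the issue):

```python
MAX_ALLOC_SIZE = 128 * 1024   # 131072 bytes, the allocation limit
CHUNK_HEADER = 16             # assumed sizeof(chunk), per the 131088 figure

def max_chunk_size():
    # Per the fix: leave room for the header so header + payload never
    # exceeds the 128KB allocation limit.
    return MAX_ALLOC_SIZE - CHUNK_HEADER

def allocation_size(data_size):
    # Total bytes requested from the allocator for one chunk.
    return CHUNK_HEADER + min(data_size, max_chunk_size())
```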

Added a unit test - test_large_placeholder - that
stresses the chunk allocation path from the
write_place_holder(size) entry point to make
sure it handles large chunk allocations correctly.

Refs #7950
Refs #8081

Test: unit(release), bytes_ostream_test(debug)
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20210303143413.902968-1-bhalevy@scylladb.com>
(cherry picked from commit ff5b42a0fa)
2021-10-28 11:24:03 +03:00
Botond Dénes
0fd17af2ee evictable_reader: reset _range_override after fast-forwarding
`_range_override` is used to store the modified range the reader reads
after it has to be recreated (when recreating a reader, its read range
is reduced to account for partitions it already read). When engaged,
this field overrides the `_pr` field as the definitive range the reader
is supposed to be currently reading. Fast forwarding conceptually
overrides the range the reader is currently reading, however currently
it doesn't reset the `_range_override` field. This resulted in
`_range_override` (containing the modified pre-fast-forward range)
incorrectly overriding the fast-forwarded-to range in `_pr` when
validating the first partition produced by the just recreated reader,
resulting in a false-positive validation failure.

Fixes: #8059

Tests: unit(release)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20210217164744.420100-1-bdenes@scylladb.com>
[avi: add #include]
(cherry picked from commit c3b4c3f451)
2021-10-28 11:12:01 +03:00
Benny Halevy
77cb6596c4 utils: phased_barrier: advance_and_await: make noexcept
As a function returning a future, simplify
its interface by handling any exceptions and
returning an exceptional future instead of
propagating the exception.

In this specific case, throwing from advance_and_await()
will propagate through table::await_pending_* calls
short-circuiting a .finally clause in table::stop().

Also, mark as noexcept methods of class table calling
advance_and_await and table::await_pending_ops that depends on them.

Fixes #8636

A followup patch will convert advance_and_await to a coroutine.
This is done separately to facilitate backporting of this patch.

Test: unit(dev)
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20210511161407.218402-1-bhalevy@scylladb.com>
(cherry picked from commit c0dafa75d9)
2021-10-13 12:26:12 +03:00
Avi Kivity
c81c7d2d89 Merge 'rjson: Add throwing allocator' from Piotr Sarna
This series adds a wrapper for the default rjson allocator which throws on allocation/reallocation failures. It's done to work around several rapidjson (the underlying JSON parsing library) bugs - in a few cases, the malloc/realloc return value is not checked, which results in dereferencing a null pointer (or an arbitrary pointer computed as 0 + `size`, with the `size` parameter being provided by the user). The new allocator will throw an `rjson::error` if it fails to allocate or reallocate memory.
This series comes with unit tests which check the new allocator behavior and also validate that an internal rapidjson structure which we indirectly rely upon (Stack) is not left in an invalid state after throwing. The last part is verified by the fact that its destructor ran without errors.
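The wrapper idea, modeled in Python (the base allocator mimics the C convention of returning a null value on failure; class and method names are illustrative):

```python
class RJsonError(Exception):
    pass

class ThrowingAllocator:
    # Sketch: wrap a base allocator whose malloc may return None on
    # failure, and convert that into an exception so the JSON library
    # never dereferences a null (or 0 + size) pointer.
    def __init__(self, base_malloc):
        self._malloc = base_malloc

    def allocate(self, size):
        p = self._malloc(size)
        if p is None:
            raise RJsonError(f"allocation of {size} bytes failed")
        return p
```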

Fixes #8521
Refs #8515

Tests:
 * unit(release)
 * YCSB: inserting data similar to the one mentioned in #8515 - 1.5MB objects clustered in partitions 30k objects in size - nothing crashed during various YCSB workloads, but nothing also crashed for me locally before this patch, so it's not 100% robust
 relevant YCSB workload config for using 1.5MB objects:
```yaml
fieldcount=150
fieldlength=10000
```

Closes #8529

* github.com:scylladb/scylla:
  test: add a test for rjson allocation
  test: rename alternator_base64_test to alternator_unit_test
  rjson: add a throwing allocator

(cherry picked from commit c36549b22e)
2021-10-12 13:57:15 +03:00
Benny Halevy
b3a762f179 streaming: stream_session: do not escape curly braces in format strings
Those turn into '{}' in the formatted strings and trigger
a logger error in the following sstlog.warn(err.c_str())
call.

Fixes #8436

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20210408173048.124417-1-bhalevy@scylladb.com>
(cherry picked from commit 76cd315c42)
2021-10-12 13:49:24 +03:00
Calle Wilund
2bba07bdf4 table: ensure memtable is actually in memtable list before erasing
Fixes #8749

If a table::clear() was issued while we were flushing a memtable,
the memtable is already gone from the list. We need to check this
before erasing; otherwise we get random memory corruption via
std::vector::erase.

v2:
* Make interface more set-like (tolerate non-existence in erase).

Closes #8904

(cherry picked from commit 373fa3fa07)
2021-10-12 13:47:33 +03:00
Benny Halevy
87bfb57ccf utils: merge_to_gently: prevent stall in std::copy_if
std::copy_if runs without yielding.

See https://github.com/scylladb/scylla/issues/8897#issuecomment-867522480

Note that the standard states that no iterators or references are invalidated
on insert so we can keep inserting before last1 when merging the
remainder of list2 at the tail of list1.

Fixes #8897

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 453e7c8795)
2021-10-12 13:05:58 +03:00
Michael Livshin
6ca8590540 avoid race between compaction and table stop
Also add a debug-only compaction-manager-side assertion that tests
that no new compaction tasks were submitted for a table that is being
removed (debug-only because not constant-time).

Fixes #9448.

Signed-off-by: Michael Livshin <michael.livshin@scylladb.com>
Message-Id: <20211007110416.159110-1-michael.livshin@scylladb.com>
(cherry picked from commit e88891a8af)
2021-10-12 12:51:44 +03:00
Takuya ASADA
da57d6c7cd scylla_cpuscaling_setup: add --force option
To build an Ubuntu AMI with CPU scaling configuration, we need a forced
running mode for scylla_cpuscaling_setup, which runs setup without
checking scaling_governor support.

See scylladb/scylla-machine-image#204

Closes #9326

(cherry picked from commit f928dced0c)
2021-10-05 16:20:22 +03:00
Takuya ASADA
61469d62b8 scylla_ntp_setup: support 'pool' directive on ntp.conf
Currently, scylla_ntp_setup only supports 'server' directive, we should
support 'pool' too.

Fixes #9393

Closes #9397
2021-10-03 14:11:54 +03:00
Takuya ASADA
c63092038e scylla_cpuscaling_setup: disable ondemand.service on Ubuntu
On Ubuntu, scaling_governor becomes powersave after a reboot, even when we have configured cpufrequtils.
This is because of ondemand.service, which unconditionally changes scaling_governor to ondemand or powersave.
cpufrequtils starts before ondemand.service, so scaling_governor is overwritten by ondemand.service.
To configure scaling_governor correctly, we have to disable this service.

Fixes #9324

Closes #9325

(cherry picked from commit cd7fe9a998)
2021-10-03 14:08:37 +03:00
Raphael S. Carvalho
cb7fbb859b compaction_manager: prevent unbounded growth of pending tasks
There will be unbounded growth of pending tasks if they are submitted
faster than they are retired. That can potentially happen if memtables
are frequently flushed too early. It was observed that this unbounded
growth caused task queue violations, as the queue gets filled
with tons of tasks being reevaluated. By avoiding duplication in the
pending task list for a given table T, growth is no longer unbounded
and consequently reevaluation is no longer aggressive.

Refs #9331.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20210930125718.41243-1-raphaelsc@scylladb.com>
(cherry picked from commit 52302c3238)
2021-10-03 13:11:14 +03:00
Yaron Kaikov
01920c1293 release: prepare for 4.4.5 2021-09-23 14:37:52 +03:00
Eliran Sinvani
fd64cae856 dist: rpm: Add specific versioning and python3 dependency
The Red Hat packages were missing two things: first, the metapackage
didn't depend at all on the python3 package, and second, the
scylla-server package dependencies didn't contain a version as part
of the dependency, which can cause some problems during upgrade.
Doing both of the things listed here is a bit of overkill, as either
one of them separately would solve the problem described in #XXXX,
but both should be applied in order to express the correct concept.

Fixes #8829

Closes #8832

(cherry picked from commit 9bfb2754eb)
2021-09-12 16:01:15 +03:00
Calle Wilund
b1032a2699 snapshot: Add filter to check for existing snapshot
Fixes #8212

Some snapshotting operations call in on a single table at a time.
When checking for existing snapshots in this case, we should not
bother with snapshots in other tables. Add an optional "filter"
to the check routine, which, if non-empty, lists the tables to check.

The use case is "scrub", which calls with a limited set of tables
to snapshot.

Closes #8240

(cherry picked from commit f44420f2c9)
2021-09-12 11:16:12 +03:00
Avi Kivity
90941622df Merge "evictable_readers: don't drop static rows, drop assumption about snapshot isolation" from Botond
"
This mini-series fixes two loosely related bugs around reader recreation
in the evictable reader (related by both being around reader
recreation). A unit test is also added which reproduces both of them and
checks that the fixes indeed work. More details in the patches
themselves.
This series replaces the two independent patches sent before:
* [PATCH v1] evictable_reader: always reset static row drop flag
* [PATCH v1] evictable_reader: relax partition key check on reader
  recreation

As they depend on each other, it is easier to add a test if they are in
a series.

Fixes: #8923
Fixes: #8893

Tests: unit(dev, mutation_reader_test:debug)
"

* 'evictable-reader-recreation-more-bugs/v1' of https://github.com/denesb/scylla:
  test: mutation_reader_test: add more test for reader recreation
  evictable_reader: relax partition key check on reader recreation
  evictable_reader: always reset static row drop flag

(cherry picked from commit 4209dfd753)
2021-09-06 17:28:49 +03:00
Avi Kivity
4250ab27d8 Update seastar submodule (perftune failure on bond NIC)
* seastar 4b7d434965...4a58d76fea (1):
  > perftune.py: instrument bonding tuning flow with 'nic' parameter

Fixes #9225.
2021-08-19 16:59:41 +03:00
Takuya ASADA
475e0d0893 scylla_cpuscaling_setup: change scaling_governor path
On some environments, /sys/devices/system/cpu/cpufreq/policy0/scaling_governor
does not exist even though CPU scaling is supported.
Instead, /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor is
available on both environments, so we should switch to it.

Fixes #9191

Closes #9193

(cherry picked from commit e5bb88b69a)
2021-08-12 12:10:15 +03:00
Raphael S. Carvalho
27333587a8 compaction: Prevent tons of compaction of fully expired sstable from happening in parallel
The compaction manager can start tons of compactions of fully expired sstables in
parallel, which may consume a significant amount of resources.
This problem is caused by the weight being released too early in compaction, after
all data is compacted but before the table is called to update its state, like
replacing sstables and so on.
Fully expired sstables aren't actually compacted, so the following can happen:
- compaction 1 starts for expired sst A with weight W, but there's nothing to
be compacted, so weight W is released, then calls table to update state.
- compaction 2 starts for expired sst B with weight W, but there's nothing to
be compacted, so weight W is released, then calls table to update state.
- compaction 3 starts for expired sst C with weight W, but there's nothing to
be compacted, so weight W is released, then calls table to update state.
- compaction 1 is done updating table state, so it finally completes and
releases all the resources.
- compaction 2 is done updating table state, so it finally completes and
releases all the resources.
- compaction 3 is done updating table state, so it finally completes and
releases all the resources.

This happens because, with expired sstables, compaction releases the weight
faster than it updates the table state, as there's nothing to be compacted.

With my reproducer, it's very easy to reach 50 parallel compactions on a single
shard, but that number can easily be worse depending on the number of sstables
with fully expired data across all tables. This high parallelism can happen
with only a couple of tables, if there are many time windows with expired data,
as they can be compacted in parallel.

Prior to 55a8b6e3c9, the weight was released earlier in compaction, before
the last sstable was sealed, but now there's no need to release the weight
early. The weight can be released in a much simpler way, after the compaction is
actually done. So such compactions will be serialized from now on.

Fixes #8710.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20210527165443.165198-1-raphaelsc@scylladb.com>

[avi: drop now unneeded storage_service_for_tests]

(cherry picked from commit a7cdd846da)
2021-08-10 18:16:47 +03:00
Nadav Har'El
0cfe0e8c8e secondary index: fix regression in CREATE INDEX IF NOT EXISTS
The recent commit 0ef0a4c78d added helpful
error messages in case an index cannot be created because the intended
name of its materialized view is already taken - but accidentally broke
the "CREATE INDEX IF NOT EXISTS" feature.

The checking code was correct, but in the wrong place: we need to first
check whether the index already exists and "IF NOT EXISTS" was chosen -
and only do the new error checking if that is not the case.

This patch also includes a cql-pytest test for reproducing this bug.
The bug is also reproduced by the translated Cassandra unit tests
    cassandra_tests/validation/entities/secondary_index_test.py::
    testCreateAndDropIndex
and this is how I found this bug. After this patch, all these tests
pass.

Fixes #8717.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20210526143635.624398-1-nyh@scylladb.com>
(cherry picked from commit 97e827e3e1)
2021-08-10 17:41:51 +03:00
Nadav Har'El
cb3225f2de Merge 'Fix index name conflicts with regular tables' from Piotr Sarna
When an index is created without an explicit name, a default name
is chosen. However, there was no check whether a table with a conflicting
name already exists. The check is now in place, and if any conflicts
are found, a new index name is chosen instead.
When an index is created *with* an explicit name and a conflicting
regular table is found, index creation should simply fail.

This series comes with a test.

Fixes #8620
Tests: unit(release)

Closes #8632

* github.com:scylladb/scylla:
  cql-pytest: add regression tests for index creation
  cql3: fail to create an index if there is a name conflict
  database: check for conflicting table names for indexes

(cherry picked from commit cee4c075d2)
2021-08-10 12:56:52 +03:00
Hagit Segev
69daa9fd00 release: prepare for 4.4.4 2021-08-01 14:22:01 +03:00
Piotr Jastrzebski
f91cea66a6 api: use proper type to reduce partition count
Partition count is of type size_t, but we use std::plus<int>
to reduce the partition counts of various column families.
This patch changes the template argument of std::plus to the right type.
Using std::plus<int> for size_t compiles but does not work as expected:
for example, plus<int>(2147483648LL, 1LL) = -2147483647, while the code
presumably wants 2147483649.

Fixes #9090

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>

Closes #9074

(cherry picked from commit 90a607e844)
2021-07-27 12:38:22 +03:00
Raphael S. Carvalho
9dce1e4b2b sstables: Close promoted index readers when advancing to next summary index
The problem is fixed on master since 5ed559c, so branch-4.5 and up aren't affected.

The index reader fails to close the input streams of promoted index readers when advancing
to the next summary entry, so Scylla can abort as a result of a stream being destroyed
while there were reads in progress. This problem was seen when the row cache issued
a fast forward, so the index reader was asked to advance to the next summary entry while
the previous one still had reads in progress.
By closing the list of index readers when there's only one owner holding it,
the problem is safely fixed, because an index_bound like
_lower_bound or _upper_bound cannot be left with a list that's already closed.

Fixes #9049.

test: mode(dev, debug).

No observable perf regression:

BEFORE:

   read    skip      time (s)   iterations     frags     frag/s    mad f/s    max f/s    min f/s    avg aio    aio      (KiB) blocked dropped  idx hit idx miss  idx blk    c hit   c miss    c blk    cpu
-> 1       0         8.168640            4    100000      12242        108      12262      11982    50032.2  50049    6403116   20707       0        0        8        8        0        0        0  83.3%
-> 1       1        22.257916            4     50000       2246          3       2249       2238   150025.0 150025    6454272  100001       0    49999   100000   149999        0        0        0  54.7%
-> 1       8         9.384961            4     11112       1184          5       1184       1178    77781.2  77781    1439328   66618   11111        1    33334    44444        0        0        0  44.0%
-> 1       16        4.976144            4      5883       1182          6       1184       1173    41180.0  41180     762053   35264    5882        0    17648    23530        0        0        0  44.1%
-> 1       32        2.582744            4      3031       1174          4       1175       1167    21216.0  21216     392619   18176    3031        0     9092    12122        0        0        0  43.8%
-> 1       64        1.308410            4      1539       1176          2       1178       1173    10772.0  10772     199353    9233    1539        0     4616     6154        0        0        0  44.0%
-> 1       256       0.331037            4       390       1178         12       1190       1165     2729.0   2729      50519    2338     390        0     1169     1558        0        0        0  44.0%
-> 1       1024      0.085108            4        98       1151          7       1155       1141      685.0    685      12694     587      98        0      293      390        0        0        0  42.9%
-> 1       4096      0.024393            6        25       1025          5       1029       1020      174.0    174       3238     149      25        0       74       98        0        0        0  37.4%
-> 64      1         8.765446            4     98462      11233         16      11236      11182    54642.0  54648    6405470   23632       1     1538     4615     4615        0        0        0  79.3%
-> 64      8         8.456430            4     88896      10512         48      10582      10464    55578.0  55578    6405971   24031    4166        0     5553     5553        0        0        0  77.3%
-> 64      16        7.798197            4     80000      10259        108      10299      10077    51248.0  51248    5922500   22160    4996        0     4998     4998        0        0        0  74.8%
-> 64      32        6.605148            4     66688      10096         64      10168      10033    42715.0  42715    4936359   18796    4164        0     4165     4165        0        0        0  75.5%
-> 64      64        4.933287            4     50016      10138         28      10189      10111    32039.0  32039    3702428   14106    3124        0     3125     3125        0        0        0  75.3%
-> 64      256       1.971701            4     20032      10160         57      10347      10103    12831.0  12831    1482993    5731    1252        0     1250     1250        0        0        0  74.1%
-> 64      1024      0.587026            4      5888      10030         84      10277       9946     3770.0   3770     435895    1635     368        0      366      366        0        0        0  74.6%
-> 64      4096      0.157401            4      1600      10165         69      10202       9698     1023.0   1023     118449     455     100        0       98       98        0        0        0  73.9%

AFTER:

   read    skip      time (s)   iterations     frags     frag/s    mad f/s    max f/s    min f/s    avg aio    aio      (KiB) blocked dropped  idx hit idx miss  idx blk    c hit   c miss    c blk    cpu
-> 1       0         8.191639            4    100000      12208         46      12279      12161    50031.2  50025    6403108   20243       0        0        0        0        0        0        0  87.0%
-> 1       1        22.933121            4     50000       2180         36       2198       2115   150025.0 150025    6454272  100001       0    49999   100000   149999        0        0        0  54.9%
-> 1       8         9.471735            4     11112       1173          5       1178       1168    77781.2  77781    1439328   66663   11111        0    33334    44445        0        0        0  44.6%
-> 1       16        5.001569            4      5883       1176          2       1176       1170    41180.0  41180     762053   35296    5882        1    17648    23529        0        0        0  44.6%
-> 1       32        2.587069            4      3031       1172          1       1173       1164    21216.0  21216     392619   18185    3031        1     9092    12121        0        0        0  44.8%
-> 1       64        1.310747            4      1539       1174          3       1177       1171    10772.0  10772     199353    9233    1539        0     4616     6154        0        0        0  44.9%
-> 1       256       0.335490            4       390       1162          2       1167       1161     2729.0   2729      50519    2338     390        0     1169     1558        0        0        0  45.7%
-> 1       1024      0.081944            4        98       1196         21       1210       1162      685.0    685      12694     585      98        0      293      390        0        0        0  46.2%
-> 1       4096      0.022266            6        25       1123          3       1125       1105      174.0    174       3238     149      24        0       74       98        0        0        0  41.9%
-> 64      1         8.731741            4     98462      11276         45      11417      11231    54642.0  54640    6405470   23686       0     1538     4615     4615        0        0        0  80.2%
-> 64      8         8.396247            4     88896      10588         19      10596      10560    55578.0  55578    6405971   24275    4166        0     5553     5553        0        0        0  77.6%
-> 64      16        7.700995            4     80000      10388         88      10405      10221    51248.0  51248    5922500   22100    5000        0     4998     4998        0        0        0  76.4%
-> 64      32        6.517276            4     66688      10232         31      10342      10201    42715.0  42715    4936359   19013    4164        0     4165     4165        0        0        0  75.3%
-> 64      64        4.898669            4     50016      10210         60      10291      10150    32039.0  32039    3702428   14110    3124        0     3125     3125        0        0        0  74.4%
-> 64      256       1.969972            4     20032      10169         22      10173      10091    12831.0  12831    1482993    5660    1252        0     1250     1250        0        0        0  74.3%
-> 64      1024      0.575180            4      5888      10237         84      10316      10028     3770.0   3770     435895    1656     368        0      366      366        0        0        0  74.6%
-> 64      4096      0.158503            4      1600      10094         81      10195      10014     1023.0   1023     118449     460     100        0       98       98        0        0        0  73.5%

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20210722180302.64675-1-raphaelsc@scylladb.com>
2021-07-25 14:03:04 +03:00
Asias He
99b8c04a40 repair: Consider memory bloat when calculate repair parallelism
The repair parallelism is calculated from the amount of memory allocated to
repair and the memory usage per repair instance. Currently, it does not
consider memory bloat issues (e.g., issue #8640), which cause repair to
use more memory and cause std::bad_alloc.

Be more conservative when calculating the parallelism to avoid repair
using too much memory.

Fixes #8641

Closes #8652

(cherry picked from commit b8749f51cb)
2021-07-15 13:02:01 +03:00
Takuya ASADA
74cd6928c0 scylla-fstrim.timer: drop BindsTo=scylla-server.service
To avoid restarting scylla-server.service unexpectedly, drop BindsTo=
from scylla-fstrim.timer.

Fixes #8921

Closes #8973

(cherry picked from commit def81807aa)
2021-07-08 10:06:42 +03:00
Avi Kivity
a178098277 Update tools/java submodule for rack/dc properties
* tools/java aab793d9f5...14e635e5de (1):
  > cassandra.in.sh: Add path to rack/dc properties file to classpath
Fixes #7930.
2021-07-08 09:53:15 +03:00
Takuya ASADA
da1a9c7bc7 dist/redhat: fix systemd unit name of scylla-node-exporter
systemd unit name of scylla-node-exporter is
scylla-node-exporter.service, not node-exporter.service.

Fixes #8966

Closes #8967

(cherry picked from commit f19ebe5709)
2021-07-07 18:37:55 +03:00
Takuya ASADA
3666bb84a7 dist: stop removing /etc/systemd/system/*.mount on package uninstall
Listing /etc/systemd/system/*.mount as a ghost file seems incorrect,
since the user may want to keep using the RAID volume / coredump directory after
uninstalling Scylla, or may want to upgrade to the enterprise version.

Also, we mixed two types of files as ghost files; they should be handled differently:
 1. automatically generated by the postinst scriptlet
 2. generated by user-invoked scylla_setup

The package should remove only 1, since 2 was generated by the user's decision.

However, just dropping .mount from the %files section causes another
problem: rpm will remove these files during upgrade, instead of
uninstall (#8924).

To fix both problems, specify the .mount files as "%ghost %config".
This keeps the files on both package upgrade and package removal.

See scylladb/scylla-enterprise#1780

Closes #8810
Closes #8924

Closes #8959

(cherry picked from commit f71f9786c7)
2021-07-07 18:37:55 +03:00
Pavel Emelyanov
df6d471e08 hasher: More picky noexcept marking of feed_hash()
Commit 5adb8e555c marked the ::feed_hash() and a visitor lambda of
digester::feed_hash() as noexcept. This was quite recklesl as the
appending_hash<>::operator()s called by ::feed_hash() are not all
marked noexcept. In particular, the appending_hash<row>() is not
such and seem to throw.

The original intent of the mentioned commit was to facilitate the
partition_hasher in the repair/ code. The hasher itself had been removed
by 0af7a22c21, so the feed_hash-es no longer need to be
noexcept.

The fix is to inherit noexcept from the called hashers, but for the
digester::feed_hash part the noexcept is just removed until clang
compilation bug #50994 is fixed.

fixes: #8983
tests: unit(dev)

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20210706153608.4299-1-xemul@scylladb.com>
(cherry picked from commit 63a2fed585)
2021-07-07 18:36:18 +03:00
Raphael S. Carvalho
92b85da380 LCS: reshape: Fix overlapping check when determining if a sstable set is disjoint
The wrong comparison operator is used when checking for overlap. It
would miss an overlap when the last key of an sstable is equal to the first
key of the sstable that comes next in the set, which is sorted by
first key.

Fixes #8531.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit 39ecddbd34)
2021-07-07 14:04:22 +03:00
Juliusz Stasiewicz
d214d91a09 tests: Adjusted tests for DC checking in NTS
A CQL test relied on quietly accepting non-existing DCs, so it had
to be removed. Also, one boost test referred to the nonexistent
`datacenter2` and had to be removed.

(cherry picked from commit 97bb15b2f2)
2021-06-21 17:53:47 +03:00
Nadav Har'El
a7a1e59594 cql-pytest: remove "xfail" tag from two passing tests
Issue #7595 was already fixed last week, in commit
b6fb5ee912, so the two tests which failed
because of this issue no longer fail and their "xfail" tag can be removed.

Refs #7595.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20210216160606.1172855-1-nyh@scylladb.com>
(cherry picked from commit 946e63ee6e)
2021-06-20 19:37:28 +03:00
Juliusz Stasiewicz
856aeb5ddb locator: Check DC names in NTS
The same trick is used as in C*:
79e693e16e/src/java/org/apache/cassandra/locator/NetworkTopologyStrategy.java (L241)

Fixes #7595

(cherry picked from commit b6fb5ee912)
2021-06-20 19:24:29 +03:00
Piotr Sarna
61659fdbdb Merge 'view: fix use-after-move when handling view update failures'
Backport of 6726fe79b6.

The code was susceptible to use-after-move if both local
and remote updates were going to be sent.
The whole routine for sending view updates is now rewritten
to avoid use-after-move.

Fixes #8830
Tests: unit(release),
       dtest(secondary_indexes_test.py:TestSecondaryIndexes.test_remove_node_during_index_build)

Closes #8834

* backport-6726fe7-4.4:
  view: fix use-after-move when handling view update failures
  db,view: explicitly move the mutation to its helper function
  db,view: pass base token by value to mutate_MV
2021-06-16 14:15:12 +02:00
Piotr Sarna
b06e9447b1 view: fix use-after-move when handling view update failures
The code was susceptible to use-after-move if both local
and remote updates were going to be sent.
The whole routine for sending view updates is now rewritten
to avoid use-after-move.

Refs #8830
Tests: unit(release),
       dtest(secondary_indexes_test.py:TestSecondaryIndexes.test_remove_node_during_index_build)

(cherry picked from commit 8a049c9116)
2021-06-16 13:40:57 +02:00
Piotr Sarna
6a407984d8 db,view: explicitly move the mutation to its helper function
The `apply_to_remote_endpoints` helper function used to take
its `mut` parameter by reference, but then moved the value from it,
which is confusing and prone to errors. Since the value is moved-from,
let's pass it to the helper function as an rvalue reference explicitly.

(cherry picked from commit 7cdbb7951a)
2021-06-16 13:38:39 +02:00
Piotr Sarna
74df68c67f db,view: pass base token by value to mutate_MV
The base token is passed across continuations, so the current way
of passing it by const reference probably only works because
copying the token is cheap enough to optimize the reference out.
Fix by explicitly taking the token by value.

(cherry picked from commit 88d4a66e90)
2021-06-16 13:38:01 +02:00
Raphael S. Carvalho
a6b3a2b945 LCS: Fix terrible write amplification when reshaping level 0
LCS reshape is basically 'major compacting' level 0 until it contains fewer than
N sstables.

That produces terrible write amplification, because any given byte will be
compacted (initial # of sstables / max_threshold (32)) times. So if L0 initially
contained 256 ssts, there would be a WA of about 8.

This terrible write amplification can be reduced by performing STCS on
L0 instead, which will leave L0 in good shape without hurting WA the way it
does now.

Fixes #8345.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20210322150655.27011-1-raphaelsc@scylladb.com>
(cherry picked from commit bcbb39999b)
2021-06-14 20:27:41 +03:00
Michał Chojnowski
2cf998e418 cdc: log: fix use-after-free in process_bytes_visitor
Due to the small-value optimization used in `bytes`, views into `bytes` objects stored
in a `vector` can be invalidated when the vector resizes, resulting in
use-after-free and data corruption. Fix that.

Fixes #8117

(cherry picked from commit 8cc4f39472)
2021-06-13 19:06:25 +03:00
Botond Dénes
6a23208ce4 mutation_test: test_mutation_diff_with_random_generator: compact input mutations
This test checks that `mutation_partition::difference()` works correctly.
One of the checks it does is: m1 + m2 == m1 + (m2 - m1).
If the two mutations are identical but have compactable data, e.g. a
shadowable tombstone shadowed by a row marker, the apply will collapse
these, causing the above equality check to fail (as m2 - m1 is null).
To prevent this, compact the two input mutations.

Fixes: #8221
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20210310141118.212538-1-bdenes@scylladb.com>
(cherry picked from commit cf28552357)
2021-06-13 18:30:43 +03:00
Takuya ASADA
94d73d2d26 scylla_coredump_setup: avoid coredump failure when hard limit of coredump is set to zero
On environments where the hard limit for coredumps is set to zero, the coredump test
script will fail since the system does not generate a coredump.
To avoid this issue, set ulimit -c 0 before generating the SEGV in the script.

Note that scylla-server.service can generate a coredump even with ulimit -c 0,
because we set LimitCORE=infinity in its systemd unit file.

Fixes #8238

Closes #8245

(cherry picked from commit af8eae317b)
2021-06-13 18:27:03 +03:00
Avi Kivity
033d56234b Update seastar submodule (nested exception logging)
* seastar 61939b5b8a...4b7d434965 (2):
  > utils/log.cc: fix nested_exception logging (again)
  > log: skip on unknown nested mixing instead of stopping the logging

Fixes #8327.
2021-06-13 18:22:59 +03:00
Benny Halevy
35d89298da test: commitlog_test: test_allocation_failure: fill memory using smaller allocations
commitlog was changed to use fragmented_temporary_buffer::ostream (db::commitlog::output).
So if there are discontiguous small memory blocks, they can be used to satisfy
an allocation even if no contiguous memory blocks are available.

To prevent that, as Avi suggested, this change allocates in 128K blocks
and frees the last one to succeed (so that we won't fail on allocating continuations).

Fixes #8028

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20210203100333.862036-1-bhalevy@scylladb.com>
(cherry picked from commit ca6f5cb0bc)
2021-06-10 19:35:40 +03:00
Dejan Mircevski
8011d181b5 cql3: Skip indexed column for CK restrictions
When querying an index table, we assemble clustering-column
restrictions for that query by going over the base table token,
partition columns, and clustering columns.  But if one of those
columns is the indexed column, there is a problem: the indexed column
is the index table's partition key, not its clustering key. We end up
with an invalid clustering slice, which can cause problems downstream.

Fix this by skipping the indexed column when assembling the clustering
restrictions.

Tests: unit (dev)

Fixes #7888

Signed-off-by: Dejan Mircevski <dejan@scylladb.com>

Closes #8320

(cherry picked from commit 0bd201d3ca)
2021-06-10 10:43:14 +03:00
Hagit Segev
bfafb84567 release: prepare for 4.4.3 2021-06-09 19:51:57 +03:00
Yaron Kaikov
0d9c09ed04 install.sh: Setup aio-max-nr upon installation
This is a follow-up change to #8512.

Let's add the aio conf file during the Scylla installation process and make sure
we also remove this file when uninstalling Scylla.

As per Avi Kivity's suggestion, let's set the aio value as static
configuration, and make it large enough to work with 500 CPUs.

Closes #8650

Refs: #8713

(cherry picked from commit dd453ffe6a)
2021-06-07 16:30:00 +03:00
Yaron Kaikov
36a4eba22e scylla_io_setup: configure "aio-max-nr" before iotune
On several instance types in AWS and Azure, we get the following failure
during the scylla_io_setup process:
```
ERROR 2021-04-14 07:50:35,666 [shard 5] seastar - Could not setup Async
I/O: Resource temporarily unavailable. The most common cause is not
enough request capacity in /proc/sys/fs/aio-max-nr. Try increasing that
number or reducing the amount of logical CPUs available for your
application
```

We have scylla_prepare:configure_io_slots() running before
scylla-server.service starts, but scylla_io_setup takes place
before that.

1) Let's move configure_io_slots() to scylla_util.py, since both
   scylla_io_setup and scylla_prepare import functions from it
2) Clean up scylla_prepare, since we don't need the same function twice
3) Let's use configure_io_slots() during scylla_io_setup to avoid this
failure

Fixes: #8587

Closes #8512

Refs: #8713

(cherry picked from commit 588a065304)
2021-06-07 16:29:38 +03:00
Nadav Har'El
e9b1f10654 Update tools/java submodule with backported patches
* tools/java 6ca351c221...aab793d9f5 (2):
  > nodetool: alternate way to specify table name which includes a dot
  > nodetool: do no treat table name with dot as a secondary index

Fixes #6521

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2021-06-07 09:38:53 +03:00
Nadav Har'El
6057be3f42 alternator: fix equality check of nested document containing a set
In issue #5021 we noticed that the equality check in Alternator's condition
expressions needs to handle sets differently - we need to compare the set's
elements ignoring their order. But the implementation we added to fix that
issue was only correct when the entire attribute was a set... In the
general case, an attribute can be a nested document, with only some
inner set. The equality-checking function needs to tranverse this nested
document, and compare the sets inside it as appropriate. This is what
we do in this patch.

This patch also adds a new test comparing equality of a nested document with
some inner sets. This test passes on DynamoDB, failed on Alternator before
this patch, and passes with this patch.

Refs #5021
Fixes #8514

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20210419184840.471858-1-nyh@scylladb.com>
(cherry picked from commit dae7528fe5)
2021-06-07 09:10:08 +03:00
Nadav Har'El
673f823d8b alternator: fix inequality check of two sets
In issue #5021 we noted that Alternator's equality operator needs to be
fixed for the case of comparing two sets, because the equality check needs
to take into account the possibility of different element order.

Unfortunately, we fixed only the equality check operator, but forgot there
is also an inequality operator!

So in this patch we fix the inequality operator, and also add a test for
it that was previously missing.

The implementation of the inequality operator is trivial - it's just the
negation of the equality test. Our pre-existing tests verify that this is
the correct implementation (e.g., if attribute x doesn't exist, then "x = 3"
is false but "x <> 3" is true).

Refs #5021
Fixes #8513

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20210419141450.464968-1-nyh@scylladb.com>
(cherry picked from commit 50f3201ee2)
2021-06-07 08:45:54 +03:00
Nadav Har'El
0082968bd8 alternator: fix equality check of two unset attributes
When a condition expression (ConditionExpression, FilterExpression, etc.)
checks for equality of two item attributes, i.e., "x = y", and when one of
these attributes was missing we correctly returned false.
However, we also need to return false when *both* attributes are missing in
the item, because this is what DynamoDB does in this case. In other words
an unset attribute is never equal to anything - not even to another unset
attribute. This was not happening before this patch:

When x and y were both missing attributes, Alternator incorrectly returned
true for "x = y", and this patch fixes this case. It also fixes "x <> y"
which should be true when both x and y are unset (but was false
before this patch).

The other comparison operators - <, <=, >, >=, BETWEEN, were all
implemented correctly even before this patch.

This patch also includes tests for all the two-unset-attribute cases of
all the operators listed above. As usual, we check that these tests pass
on both DynamoDB and Alternator to confirm our new behavior is the correct
one - before this patch, two of the new tests failed on Alternator and
passed on DynamoDB.

Fixes #8511

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20210419123911.462579-1-nyh@scylladb.com>
(cherry picked from commit 46448b0983)
2021-06-06 16:28:27 +03:00
Takuya ASADA
542cd7aff1 scylla_raid_setup: use /dev/disk/by-uuid to specify filesystem
Currently, var-lib-scylla.mount may fail because it can start before
the MDRAID volume is initialized.
We could add "After=dev-disk-by\x2duuid-<uuid>.device" to wait for the
device to become available, but the systemd manual says it automatically
configures the dependency for a mount unit when we specify the
filesystem by the "absolute path of a device node".

So we need to replace What=UUID=<uuid> with What=/dev/disk/by-uuid/<uuid>.
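For illustration, a mount unit along these lines (the UUID and filesystem type are placeholders) lets systemd infer the device dependency automatically from the absolute device-node path:

```ini
# var-lib-scylla.mount (sketch; UUID and Type are placeholders)
[Unit]
Description=Scylla data directory

[Mount]
# Using an absolute device-node path instead of What=UUID=<uuid> makes
# systemd add the device dependency for the mount unit automatically.
What=/dev/disk/by-uuid/aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee
Where=/var/lib/scylla
Type=xfs

[Install]
WantedBy=multi-user.target
```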

Fixes #8279

Closes #8681

(cherry picked from commit 3d307919c3)
2021-05-24 17:24:07 +03:00
Raphael S. Carvalho
2b29568bf4 sstables/mp_row_consumer: Fix unbounded memory usage when consuming a large run of partition tombstones
mp_row_consumer does not stop consuming a large run of partition
tombstones until a live row is found, which is what allows the consumer
to stop. So partition tombstones from a large run are all accumulated in
memory, leading to OOM and stalls.
The fix is to stop the consumer when the buffer is full, allowing the
produced fragments to be consumed by the sstable writer.
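The shape of the fix (hypothetical names, not the actual mp_row_consumer code) is to make the consumer yield whenever its output buffer fills up, so the writer can drain it before more tombstones accumulate:

```python
# Sketch of a bounded consumer: instead of accumulating an unbounded run of
# tombstones, it stops (yields a batch) whenever the buffer reaches its limit.
def consume(fragments, buffer_limit=4):
    buffer = []
    for frag in fragments:
        buffer.append(frag)
        if len(buffer) >= buffer_limit:
            yield buffer  # hand the batch to the writer; memory stays bounded
            buffer = []
    if buffer:
        yield buffer

# A long run of tombstones now produces several small batches instead of
# one huge in-memory accumulation.
batches = list(consume(["tombstone"] * 10))
```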

Fixes #8071.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20210514202640.346594-1-raphaelsc@scylladb.com>


Upstream fix: db4b9215dd
2021-05-20 21:26:07 +03:00
Hagit Segev
93457807b8 release: prepare for 4.4.2 2021-05-20 00:02:31 +03:00
Takuya ASADA
cee62ab41b install.sh: apply correct file security context when copying files
Currently, the unified installer does not apply the correct file
security context while copying files, which causes a permission error on
scylla-server.service. We should apply the default file security context
while copying files, using the '-Z' option of /usr/bin/install.

Also, because install -Z requires a normalized path to apply the correct
security context, use 'realpath -m <PATH>' on path variables in the
script.

Fixes #8589

Closes #8602

(cherry picked from commit 60c0b37a4c)
2021-05-19 12:41:20 +03:00
Takuya ASADA
728a5e433f install.sh: fix 'no such file or directory' on nonroot
Since we added scylla-node-exporter, we need to run 'install -d' for the
systemd and sysconfig directories before copying files.

Fixes #8663

Closes #8664

(cherry picked from commit 6faa8b97ec)
2021-05-19 12:41:20 +03:00
Avi Kivity
9a2d4a7cc7 Merge 'Fix type checking in index paging' from Piotr Sarna
When recreating the paging state from an indexed query,
a bunch of panic checks were introduced to make sure that
the code is correct. However, one of the checks is too eager -
namely, it throws an error if the base column type is not equal
to the view column type. It usually works correctly, unless the
base column type is a clustering key with DESC clustering order,
in which case the type is actually "reversed". From the point of view
of the paging state generation it's not important, because both
types deserialize in the same way, so the check should be less
strict and allow the base type to be reversed.

Tests: unit(release), along with the additional test case
       introduced in this series; the test also passes
       on Cassandra

Fixes #8666

Closes #8667

* github.com:scylladb/scylla:
  test: add a test case for paging with desc clustering order
  cql3: relax a type check for index paging

(cherry picked from commit 593ad4de1e)
2021-05-19 12:41:05 +03:00
Takuya ASADA
cc050fd499 dist/redhat: stop using systemd macros, call systemctl directly
The Fedora version of the systemd macros does not work correctly on
CentOS7, since CentOS7 does not support the "file trigger" feature.
To fix the issue we need to stop using the systemd macros and call
systemctl directly.

See scylladb/scylla-jmx#94

Closes #8005

(cherry picked from commit 7b310c591e)
2021-05-18 13:50:07 +03:00
Raphael S. Carvalho
61145af5d9 compaction_manager: Don't swallow exception in procedure used by reshape and resharding
run_custom_job() was swallowing all exceptions, which is definitely
wrong, because a failure in resharding or reshape would be incorrectly
interpreted as success, meaning the upper layer would continue as if
everything were ok. For example, ignoring a failure in resharding could
result in a shared sstable being left unresharded, so when that sstable
reaches a table, scylla would abort, as shared sstables are no longer
accepted in the main sstable set.
Let's allow the exception to propagate, so the failure will be
communicated, and resharding and reshape will be all-or-nothing, as
originally intended.

Fixes #8657.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20210515015721.384667-1-raphaelsc@scylladb.com>
(cherry picked from commit 10ae77966c)
2021-05-18 13:00:20 +03:00
Avi Kivity
11bd83e319 Update tools/jmx (rpm systemd macros)
* tools/jmx c510a56...7a101a0 (1):
  > dist/redhat: stop using systemd macros, call systemctl directly

Ref scylladb/jmx#94.
2021-05-13 18:24:52 +03:00
Raphael S. Carvalho
b58305d919 compaction_manager: Redefine weight for better control of parallel compactions
Compaction manager allows compaction of different weights to proceed in
parallel. For example, a small-sized compaction job can happen in parallel to a
large-sized one, but similar-sized jobs are serialized.

The problem is the current definition of weight, which is the log (base 4) of
total size (size of all sstables) of a job.

This is what we get with the current weight definition:
    weight=5	for sizes=[1K, 3K]
    weight=6	for sizes=[4K, 15K]
    weight=7	for sizes=[16K, 63K]
    weight=8	for sizes=[64K, 255K]
    weight=9	for sizes=[258K, 1019K]
    weight=10	for sizes=[1M, 3M]
    weight=11	for sizes=[4M, 15M]
    weight=12	for sizes=[16M, 63M]
    weight=13	for sizes=[64M, 254M]
    weight=14	for sizes=[256M, 1022M]
    weight=15	for sizes=[1033M, 4078M]
    weight=16	for sizes=[4119M, 10188M]
    total weights: 12

Note that for jobs smaller than 1MB, we have 5 different weights, meaning 5
jobs smaller than 1MB could proceed in parallel. High number of parallel
compactions can be observed after repair, which potentially produces tons of
small sstables of varying sizes. That causes compaction to use a significant
amount of resources.

To fix this problem, let's add a fixed tax to the size before taking the log,
so that jobs smaller than 1M will all have the same weight.

Look at what we get with the new weight definition:
    weight=10	for sizes=[1K, 2M]
    weight=11	for sizes=[3M, 14M]
    weight=12	for sizes=[15M, 62M]
    weight=13	for sizes=[63M, 254M]
    weight=14	for sizes=[256M, 1022M]
    weight=15	for sizes=[1033M, 4078M]
    weight=16	for sizes=[4119M, 10188M]
    total weights: 7
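Both tables can be reproduced with a small sketch: the weight is the floor of log base 4 of the total size, and the new scheme adds a fixed tax (assumed here to be 1 MB, matching the tables) before taking the log:

```python
# Sketch of the compaction-job weight: floor(log4(total size)), plus the
# fixed "tax" (assumed here to be 1 MB) that flattens all small jobs into
# one weight bucket so they no longer run in parallel after repair.
def weight(total_size):
    # floor(log4(n)) computed exactly with integer math
    return (total_size.bit_length() - 1) // 2

def weight_with_tax(total_size, tax=1 << 20):
    return weight(total_size + tax)

# Old scheme: a 1K job and a 1M job land in different buckets and may run
# in parallel.  New scheme: everything up to ~2M shares weight 10.
```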

Fixes #8124.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20210217123022.241724-1-raphaelsc@scylladb.com>
(cherry picked from commit 81d773e5d8)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20210512224405.68925-1-raphaelsc@scylladb.com>
2021-05-13 08:38:40 +03:00
Lauro Ramos Venancio
065111b42b TWCS: initialize _highest_window_seen
The timestamp_type is an int64_t, so it has to be explicitly
initialized before use.

This missing initialization prevented the major compaction
from happening when a time window finishes, as described in #8569.

Fixes #8569

Signed-off-by: Lauro Ramos Venancio <lauro.venancio@incognia.com>

Closes #8590

(cherry picked from commit 15f72f7c9e)
2021-05-06 08:52:15 +03:00
Nadav Har'El
ebd2c9bab0 Update tools/java submodule
Backport sstableloader fix in tools/java submodule.
Fixes #8230.

* tools/java a3e010ee4f...6ca351c221 (1):
  > sstableloader: Handle non-prepared batches with ":" in identifier names

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2021-05-03 10:08:54 +03:00
Avi Kivity
bf9e1f6d2e Merge '[branch 4.4] Backport reader_permit: always forward resources to the semaphore ' from Botond Dénes
This is a backport of 8aaa3a7 to branch-4.4. The main conflicts were around Benny's reader close series (fa43d76), but it also turned out that an additional patch (2f1d65c) had to be backported to make sure admission on signaling resources doesn't deadlock.

Refs: #8493

Closes #8571

* github.com:scylladb/scylla:
  test: mutation_reader_test: add test_reader_concurrency_semaphore_forward_progress
  test: mutation_reader_test: add test_reader_concurrency_semaphore_readmission_preserves_units
  reader_concurrency_semaphore: add dump_diagnostics()
  reader_permit: always forward resources
  test: multishard_mutation_query_test: fuzzy-test: don't consume resource up-front
  reader_concurrency_semaphore: make admission conditions consistent
2021-04-30 22:02:46 +03:00
Botond Dénes
a710866235 test: mutation_reader_test: add test_reader_concurrency_semaphore_forward_progress
This unit test checks that the semaphore doesn't get into a deadlock
when contended, in the presence of many memory-only reads (that don't
wait for admission). This is tested by simulating the 3 kinds of reads we
currently have in the system:
* memory-only: reads that don't pass admission and only own memory.
* admitted: reads that pass admission.
* evictable: admitted reads that are furthermore evictable.

The test creates and runs a large number of these reads in parallel,
read kinds being selected randomly, then creates a watchdog which
kills the test if no progress is being made.

(cherry picked from commit 45d580f056)
2021-04-30 11:03:09 +03:00
Botond Dénes
3c3fc18777 test: mutation_reader_test: add test_reader_concurrency_semaphore_readmission_preserves_units
This unit test passes a read through admission again and again, just
like an evictable reader would be during its lifetime. When readmitted,
the read sometimes has to wait and sometimes not. This checks that
readmitting a previously admitted reader doesn't leak any units.

(cherry picked from commit cadc26de38)
2021-04-30 11:03:09 +03:00
Botond Dénes
960f93383b reader_concurrency_semaphore: add dump_diagnostics()
Allow semaphore related tests to include a diagnostics printout in error
messages to help determine why the test failed.

(cherry picked from commit d246e2df0a)
2021-04-30 09:08:18 +03:00
Botond Dénes
1c0557c638 reader_permit: always forward resources
This commit conceptually reverts 4c8ab10. Said commit was meant to
prevent the scenario where memory-only permits -- those that don't pass
admission but still consume memory -- completely prevent the admission
of reads, possibly even causing a deadlock because a permit might even
block its own admission.
however proved to be very problematic. It made the status of resources
on the permit very hard to reason about and created loopholes via which
permits could accumulate without tracking or they could even leak
resources. Instead of continuing to patch this broken system, this
commit does away with this "protection" based on the observation that
deadlocks are now prevented anyway by the admission criteria introduced
by 0fe75571d9, which admits a read anyway when all the initial count
resources are available (meaning no admitted reader is alive),
regardless of availability of memory.
The benefit of this revert is that the semaphore now knows about all
the resources and is able to do its job better, as it is not "lied to"
about resources by the permits. Furthermore, the status of a permit's
resources is much simpler to reason about; there are no more loopholes
or unexpected state transitions that swallow or leak resources.
To prove that this revert is indeed safe, in the next commit we add
robust tests that stress test admission on a highly contested semaphore.
This patch also does away with the registered/admitted differentiation
of permits, as this doesn't make much sense anymore, instead these two
are unified into a single "active" state. One can always tell whether a
permit was admitted or not from whether it owns count resources anyway.

(cherry picked from commit caaa8ef59a)
2021-04-30 09:08:17 +03:00
Botond Dénes
f23052ae64 test: multishard_mutation_query_test: fuzzy-test: don't consume resource up-front
The fuzzy test consumes a large chunk of resource from the semaphore
up-front to simulate a contested semaphore. This isn't an accurate
simulation, because no permit will have more than 1 unit in reality.
Furthermore this can even cause a deadlock since 8aaa3a7 as now we rely
on all count units being available to make forward progress when memory
is scarce.
This patch just cuts out this part of the test; we now have a dedicated
unit test for checking a heavily contested semaphore that does it
properly, so there is no need to try to fix this clumsy attempt, which
is just making trouble at this point.

Refs: #8493

Tests: release(multishard_mutation_query_test:fuzzy_test)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20210429084458.40406-1-bdenes@scylladb.com>
(cherry picked from commit 26ae9555d1)
2021-04-30 08:57:12 +03:00
Botond Dénes
15a157611a reader_concurrency_semaphore: make admission conditions consistent
Currently there are two places where we check admission conditions:
`do_wait_admission()` and `signal()`. Both use `has_available_units()`
to check resource availability, but the former has some additional
resource related conditions on top (in `may_proceed()`), which lead to
the two paths working with slightly different conditions. To fix, push
down all resource availability related checks to `has_available_units()`
to ensure admission conditions are consistent across all paths.

(cherry picked from commit d90cd6402c)
2021-04-30 08:57:12 +03:00
Eliran Sinvani
d0b82e1e68 Materialized views: fix possibly old views coming from other nodes
Migration manager has a function to get a schema (for read or write);
this function queries a peer node and retrieves the schema from it. One
scenario where an old view can arrive is when an old node queries an
old, not-yet-fixed index.
This opens a hole through which views that are only adjusted for
reading can slip.

Here we plug the hole by fixing such views before they are registered.

Closes #8509

(cherry picked from commit 480a12d7b3)

Fixes #8554.
2021-04-29 14:03:03 +03:00
Botond Dénes
840ca41393 database: clear inactive reads in stop()
If any inactive read is left in the semaphore, it can block
`database::stop()` from shutting down, as sstables pinned by these reads
will prevent `sstables::sstables_manager::close()` from finishing. This
causes a deadlock.
It is not clear how inactive reads can be left in the semaphore, as all
users are supposed to clean up after themselves. Post 4.4 releases don't
have this problem anymore as the inactive read handle was made a RAII
object, removing the associated inactive read when destroyed. In 4.4 and
earlier releases this wasn't so, so errors could be made.
is not a big issue, as these orphaned inactive reads are just evicted
when the resources they own are needed, but it does become a serious
issue during shutdown. To prevent a deadlock, clear the inactive reads
earlier, in `database::stop()` (currently they are cleared in the
destructor). This is a simple and foolproof way of ensuring any
leftover inactive reads don't cause problems.

Fixes: #8561

Tests: unit(dev)

Closes #8562
2021-04-28 19:32:46 +03:00
Takuya ASADA
07051f25f2 dist: increase fs.aio-max-nr value for other apps
The current fs.aio-max-nr value, cpu_count() * 11026, is exactly what
scylla uses; if other apps on the host also try to use aio, the aio
slots will run out.
So increase the value by 65536 to leave room for other apps.
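The resulting sysctl value can be sketched as follows (11026 aio slots per CPU, as in the commit, plus the 65536 headroom for other applications):

```python
# Sketch of the tuned fs.aio-max-nr: Scylla's exact need (11026 aio slots
# per CPU) plus a fixed 65536 reserve for other applications on the host.
def aio_max_nr(cpu_count, reserve=65536):
    return cpu_count * 11026 + reserve
```

For example, an i3en.24xlarge with 96 vCPUs would get 96 * 11026 + 65536 slots.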

Related #8133

Closes #8228

(cherry picked from commit 53c7600da8)
2021-04-25 16:15:25 +03:00
Takuya ASADA
8437f71b1b dist: tune fs.aio-max-nr based on the number of cpus
Currently, aio-max-nr is statically set to 1048576 in
/etc/sysctl.d/99-scylla-aio.conf.
This is sufficient for most use cases, but falls short on larger
machines such as i3en.24xlarge on AWS, which has 96 vCPUs.

We need to tune the parameter based on the number of cpus instead of
using a static setting.

Fixes #8133

Signed-off-by: Takuya ASADA <syuu@scylladb.com>

Closes #8188

(cherry picked from commit d0297c599a)
2021-04-25 16:15:12 +03:00
Avi Kivity
9f32f5a60c Update seastar submodule (io_queue request size)
* seastar 37eb6022fc...61939b5b8a (1):
  > io_queue: Double max request size

Fixes #8496
2021-04-25 12:35:34 +03:00
Avi Kivity
910bc2417a Update seastar submodule (low bandwidth disks)
* seastar a75171fc89...37eb6022fc (2):
  > io_queue: Honor disks with tiny request rate
  > io_queue: Shuffle fair_group creation

Fixes #8378.
2021-04-21 14:02:15 +03:00
Piotr Jastrzebski
7790beb655 row_cache: remove redundant check in make_reader
This check is always true because a dummy entry is added at the end of
each cache entry. If that weren't true, the check in the else-if would
be undefined behavior.

Refs #8435.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
(cherry picked from commit cb3dbb1a4b)
2021-04-20 13:53:23 +02:00
Piotr Jastrzebski
1379f141c2 cache_flat_mutation_reader: fix do_fill_buffer
Make sure that when a partition does not exist in underlying,
do_fill_buffer does not try to fast forward within this nonexistent
partition.

Test: unit(dev)

Fixes #8435
Fixes #8411

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
(cherry picked from commit 1f644df09d)
2021-04-20 13:53:17 +02:00
Piotr Jastrzebski
d14ec86e7d read_context: add _partition_exists
This new state stores whether the current partition, represented by
_key, is present in underlying.

Refs #8435.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
(cherry picked from commit ceab5f026d)
2021-04-20 13:53:10 +02:00
Piotr Jastrzebski
bbada5b9e4 read_context: remove skip_first_fragment arg from create_underlying
All callers pass false for its value, so there is no need to keep it around.

Refs #8435.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
(cherry picked from commit b3b68dc662)
2021-04-20 13:53:01 +02:00
Piotr Jastrzebski
d73ec88916 read_context: skip first fragment in ensure_underlying
This was previously done in create_underlying but ensure_underlying is
a better place because we will add more related logic to this
consumption in the following patches.

Refs #8435.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
(cherry picked from commit 088a02aafd)
2021-04-20 13:52:46 +02:00
Kamil Braun
2efb458c7a time_series_sstable_set: return partition start if some sstables were ck-filtered out
When a particular partition exists in at least one sstable, the cache
expects any single-partition query to this partition to return a `partition_start`
fragment, even if the result is empty.

In `time_series_sstable_set::create_single_key_sstable_reader` it could
happen that all sstables containing data for the given query get
filtered out and only sstables without the relevant partition are left,
resulting in a reader which immediately returns end-of-stream (while it
should return a `partition_start` and if not in forwarding mode, a
`partition_end`). This commit fixes that.

We do it by extending the reader queue (used by the clustering reader
merger) with a `dummy_reader` which will be returned by the queue as
the very first reader. This reader only emits a `partition_start` and,
if not in forwarding mode, a `partition_end` fragment.
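Conceptually (hypothetical names, not the actual C++ reader), the dummy reader prepended to the queue just brackets an empty partition:

```python
# Sketch: when every data-bearing sstable was filtered out, the merged
# stream must still open (and, outside forwarding mode, close) the
# partition, so the cache sees a partition_start for the queried key.
def dummy_reader(partition_key, forwarding=False):
    yield ("partition_start", partition_key)
    if not forwarding:
        yield ("partition_end", partition_key)

frags = list(dummy_reader("pk1"))
```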

Fixes #8447.
Closes #8448.
(cherry picked from commit 5c7ed7a83f)
2021-04-20 13:52:34 +02:00
Kamil Braun
c05d8fcef1 clustering_order_reader_merger: handle empty readers
The merger could return end-of-stream if some (but not all) of the
underlying readers were empty (i.e. not even returning a
`partition_start`). This could happen in places where it was used
(`time_series_sstable_set::create_single_key_sstable_reader`) if we
opened an sstable which did not have the queried partition but passed
all the filters (specifically, the bloom filter returned a false
positive for this sstable).

The commit also extends the random tests for the merger to include empty
readers and adds an explicit test case that catches this bug (in a
limited scope: when we merge a single empty reader).

It also modifies `test_twcs_single_key_reader_filtering` (regression
test for #8432) because the time where the clustering key filter is
invoked changes (some invocations move from the constructor of the
merger to operator()). I checked manually that it still catches the bug
when I reintroduce it.

Fixes #8445.
Closes #8446.
(cherry picked from commit 7ffb0d826b)
2021-04-20 13:52:13 +02:00
Kamil Braun
d29960da47 sstables: fix TWCS single key reader sstable filter
The filter passed to `min_position_reader_queue`, which was used by
`clustering_order_reader_merger`, would incorrectly include sstables as
soon as they passed through the PK (bloom) filter, and would include
sstables which didn't pass the PK filter (if they passed the CK
filter). Fortunately this wouldn't cause incorrect data to be returned,
but it would cause sstables to be opened unnecessarily (these sstables
would immediately return eof), resulting in a performance drop. This commit
fixes the filter and adds a regression test which uses statistics to
check how many times the CK filter was invoked.

Fixes #8432.
Closes #8433.
(cherry picked from commit 3687757115)
2021-04-20 13:51:52 +02:00
Avi Kivity
e0d67ad6e4 Update seastar submodule (fair_queue fixes)
* seastar 2c884a7449...a75171fc89 (2):
  > fair_queue: Preempted requests got re-queued too far
  > fair_queue: Improve requests preemption while in pending state

Fixes #8296.
2021-04-14 15:40:45 +03:00
108 changed files with 2342 additions and 565 deletions


@@ -1,7 +1,7 @@
#!/bin/sh
PRODUCT=scylla
VERSION=4.4.1
VERSION=4.4.9
if test -f version
then


@@ -123,7 +123,7 @@ struct rjson_engaged_ptr_comp {
// as internally they're stored in an array, and the order of elements is
// not important in set equality. See issue #5021
static bool check_EQ_for_sets(const rjson::value& set1, const rjson::value& set2) {
if (set1.Size() != set2.Size()) {
if (!set1.IsArray() || !set2.IsArray() || set1.Size() != set2.Size()) {
return false;
}
std::set<const rjson::value*, rjson_engaged_ptr_comp> set1_raw;
@@ -137,25 +137,70 @@ static bool check_EQ_for_sets(const rjson::value& set1, const rjson::value& set2
}
return true;
}
// Moreover, the JSON being compared can be a nested document with outer
// layers of lists and maps and some inner set - and we need to get to that
// inner set to compare it correctly with check_EQ_for_sets() (issue #8514).
static bool check_EQ(const rjson::value* v1, const rjson::value& v2);
static bool check_EQ_for_lists(const rjson::value& list1, const rjson::value& list2) {
if (!list1.IsArray() || !list2.IsArray() || list1.Size() != list2.Size()) {
return false;
}
auto it1 = list1.Begin();
auto it2 = list2.Begin();
while (it1 != list1.End()) {
// Note: Alternator limits an item's depth (rjson::parse() limits
// it to around 37 levels), so this recursion is safe.
if (!check_EQ(&*it1, *it2)) {
return false;
}
++it1;
++it2;
}
return true;
}
static bool check_EQ_for_maps(const rjson::value& list1, const rjson::value& list2) {
if (!list1.IsObject() || !list2.IsObject() || list1.MemberCount() != list2.MemberCount()) {
return false;
}
for (auto it1 = list1.MemberBegin(); it1 != list1.MemberEnd(); ++it1) {
auto it2 = list2.FindMember(it1->name);
if (it2 == list2.MemberEnd() || !check_EQ(&it1->value, it2->value)) {
return false;
}
}
return true;
}
// Check if two JSON-encoded values match with the EQ relation
static bool check_EQ(const rjson::value* v1, const rjson::value& v2) {
if (!v1) {
return false;
}
if (v1->IsObject() && v1->MemberCount() == 1 && v2.IsObject() && v2.MemberCount() == 1) {
if (v1 && v1->IsObject() && v1->MemberCount() == 1 && v2.IsObject() && v2.MemberCount() == 1) {
auto it1 = v1->MemberBegin();
auto it2 = v2.MemberBegin();
if ((it1->name == "SS" && it2->name == "SS") || (it1->name == "NS" && it2->name == "NS") || (it1->name == "BS" && it2->name == "BS")) {
return check_EQ_for_sets(it1->value, it2->value);
if (it1->name != it2->name) {
return false;
}
if (it1->name == "SS" || it1->name == "NS" || it1->name == "BS") {
return check_EQ_for_sets(it1->value, it2->value);
} else if(it1->name == "L") {
return check_EQ_for_lists(it1->value, it2->value);
} else if(it1->name == "M") {
return check_EQ_for_maps(it1->value, it2->value);
} else {
// Other, non-nested types (number, string, etc.) can be compared
// literally, comparing their JSON representation.
return it1->value == it2->value;
}
} else {
// If v1 and/or v2 are missing (IsNull()) the result should be false.
// In the unlikely case that the object is malformed (issue #8070),
// let's also return false.
return false;
}
return *v1 == v2;
}
// Check if two JSON-encoded values match with the NE relation
static bool check_NE(const rjson::value* v1, const rjson::value& v2) {
return !v1 || *v1 != v2; // null is unequal to anything.
return !check_EQ(v1, v2);
}
// Check if two JSON-encoded values match with the BEGINS_WITH relation
@@ -298,6 +343,8 @@ static bool check_NOT_NULL(const rjson::value* val) {
// Only types S, N or B (string, number or bytes) may be compared by the
// various comparion operators - lt, le, gt, ge, and between.
// Note that in particular, if the value is missing (v->IsNull()), this
// check returns false.
static bool check_comparable_type(const rjson::value& v) {
if (!v.IsObject() || v.MemberCount() != 1) {
return false;


@@ -2509,7 +2509,7 @@ update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::t
const attribute_path_map_node<parsed::update_expression::action>* h = nullptr) {
any_updates = true;
if (_returnvalues == returnvalues::ALL_NEW) {
rjson::set_with_string_name(_return_attributes,
rjson::replace_with_string_name(_return_attributes,
to_sstring_view(column_name), rjson::copy(json_value));
} else if (_returnvalues == returnvalues::UPDATED_NEW) {
rjson::value&& v = rjson::copy(json_value);


@@ -93,6 +93,10 @@ public:
[&] (const json::json_return_type& json_return_value) {
slogger.trace("api_handler success case");
if (json_return_value._body_writer) {
// Unfortunately, write_body() forces us to choose
// from a fixed and irrelevant list of "mime-types"
// at this point. But we'll override it with the
// one (application/x-amz-json-1.0) below.
rep->write_body("json", std::move(json_return_value._body_writer));
} else {
rep->_content += json_return_value._res;
@@ -105,14 +109,15 @@ public:
return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
});
}), _type("json") { }
}) { }
api_handler(const api_handler&) = default;
future<std::unique_ptr<reply>> handle(const sstring& path,
std::unique_ptr<request> req, std::unique_ptr<reply> rep) override {
return _f_handle(std::move(req), std::move(rep)).then(
[this](std::unique_ptr<reply> rep) {
rep->done(_type);
rep->set_mime_type("application/x-amz-json-1.0");
rep->done();
return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
});
}
@@ -126,7 +131,6 @@ protected:
}
future_handler_function _f_handle;
sstring _type;
};
class gated_handler : public handler_base {
@@ -192,24 +196,31 @@ future<> server::verify_signature(const request& req) {
throw api_error::missing_authentication_token("Authorization header is mandatory for signature verification");
}
std::string host = host_it->second;
std::vector<std::string_view> credentials_raw = split(authorization_it->second, ' ');
std::string_view authorization_header = authorization_it->second;
auto pos = authorization_header.find_first_of(' ');
if (pos == std::string_view::npos || authorization_header.substr(0, pos) != "AWS4-HMAC-SHA256") {
throw api_error::invalid_signature(format("Authorization header must use AWS4-HMAC-SHA256 algorithm: {}", authorization_header));
}
authorization_header.remove_prefix(pos+1);
std::string credential;
std::string user_signature;
std::string signed_headers_str;
std::vector<std::string_view> signed_headers;
for (std::string_view entry : credentials_raw) {
do {
// Either one of a comma or space can mark the end of an entry
pos = authorization_header.find_first_of(" ,");
std::string_view entry = authorization_header.substr(0, pos);
if (pos != std::string_view::npos) {
authorization_header.remove_prefix(pos + 1);
}
if (entry.empty()) {
continue;
}
std::vector<std::string_view> entry_split = split(entry, '=');
if (entry_split.size() != 2) {
if (entry != "AWS4-HMAC-SHA256") {
throw api_error::invalid_signature(format("Only AWS4-HMAC-SHA256 algorithm is supported. Found: {}", entry));
}
continue;
}
std::string_view auth_value = entry_split[1];
// Commas appear as an additional (quite redundant) delimiter
if (auth_value.back() == ',') {
auth_value.remove_suffix(1);
}
if (entry_split[0] == "Credential") {
credential = std::string(auth_value);
} else if (entry_split[0] == "Signature") {
@@ -219,7 +230,8 @@ future<> server::verify_signature(const request& req) {
signed_headers = split(auth_value, ';');
std::sort(signed_headers.begin(), signed_headers.end());
}
}
} while (pos != std::string_view::npos);
std::vector<std::string_view> credential_split = split(credential, '/');
if (credential_split.size() != 5) {
throw api_error::validation(format("Incorrect credential information format: {}", credential));


@@ -38,6 +38,7 @@ stats::stats() : api_operations{} {
#define OPERATION_LATENCY(name, CamelCaseName) \
seastar::metrics::make_histogram("op_latency", \
seastar::metrics::description("Latency histogram of an operation via Alternator API"), {op(CamelCaseName)}, [this]{return to_metrics_histogram(api_operations.name);}),
OPERATION(batch_get_item, "BatchGetItem")
OPERATION(batch_write_item, "BatchWriteItem")
OPERATION(create_backup, "CreateBackup")
OPERATION(create_global_table, "CreateGlobalTable")


@@ -331,15 +331,15 @@ void set_column_family(http_context& ctx, routes& r) {
});
cf::get_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, req->param["name"], 0, [](column_family& cf) {
return map_reduce_cf(ctx, req->param["name"], uint64_t{0}, [](column_family& cf) {
return cf.active_memtable().partition_count();
}, std::plus<int>());
}, std::plus<>());
});
cf::get_all_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, 0, [](column_family& cf) {
return map_reduce_cf(ctx, uint64_t{0}, [](column_family& cf) {
return cf.active_memtable().partition_count();
}, std::plus<int>());
}, std::plus<>());
});
cf::get_memtable_on_heap_size.set(r, [] (const_req req) {


@@ -225,7 +225,7 @@ void set_repair(http_context& ctx, routes& r, sharded<netw::messaging_service>&
try {
res = fut.get0();
} catch (std::exception& e) {
return make_exception_future<json::json_return_type>(httpd::server_error_exception(e.what()));
return make_exception_future<json::json_return_type>(httpd::bad_param_exception(e.what()));
}
return make_ready_future<json::json_return_type>(json::json_return_type(res));
});


@@ -39,7 +39,7 @@ public:
using size_type = bytes::size_type;
using value_type = bytes::value_type;
using fragment_type = bytes_view;
static constexpr size_type max_chunk_size() { return 128 * 1024; }
static constexpr size_type max_chunk_size() { return max_alloc_size() - sizeof(chunk); }
private:
static_assert(sizeof(value_type) == 1, "value_type is assumed to be one byte long");
struct chunk {
@@ -59,6 +59,7 @@ private:
void operator delete(void* ptr) { free(ptr); }
};
static constexpr size_type default_chunk_size{512};
static constexpr size_type max_alloc_size() { return 128 * 1024; }
private:
std::unique_ptr<chunk> _begin;
chunk* _current;
@@ -132,16 +133,15 @@ private:
return _current->size - _current->offset;
}
// Figure out next chunk size.
// - must be enough for data_size
// - must be enough for data_size + sizeof(chunk)
// - must be at least _initial_chunk_size
// - try to double each time to prevent too many allocations
// - do not exceed max_chunk_size
// - should not exceed max_alloc_size, unless data_size requires so
size_type next_alloc_size(size_t data_size) const {
auto next_size = _current
? _current->size * 2
: _initial_chunk_size;
next_size = std::min(next_size, max_chunk_size());
// FIXME: check for overflow?
next_size = std::min(next_size, max_alloc_size());
return std::max<size_type>(next_size, data_size + sizeof(chunk));
}
// Makes room for a contiguous region of given size.
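The sizing rules in the updated comment (double the previous chunk, stay under `max_alloc_size`, yet always fit `data_size + sizeof(chunk)`) can be sketched in isolation. This is a minimal Python model of the same arithmetic; the header size is a hypothetical stand-in for `sizeof(chunk)`:

```python
MAX_ALLOC = 128 * 1024   # mirrors max_alloc_size() in the diff
CHUNK_HEADER = 16        # hypothetical stand-in for sizeof(chunk)

def next_alloc_size(current_size, data_size, initial_chunk_size=512):
    # Double the previous chunk, or start from the initial chunk size.
    next_size = current_size * 2 if current_size else initial_chunk_size
    # Do not exceed the allocation cap...
    next_size = min(next_size, MAX_ALLOC)
    # ...unless the payload plus the chunk header requires more.
    return max(next_size, data_size + CHUNK_HEADER)
```

Note the order of the two clamps: the cap is applied first, so an oversized `data_size` can still override it, which is exactly the "should not exceed max_alloc_size, unless data_size requires so" case in the updated comment.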


@@ -264,6 +264,9 @@ future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_poin
}
_state = state::reading_from_underlying;
_population_range_starts_before_all_rows = _lower_bound.is_before_all_clustered_rows(*_schema);
if (!_read_context->partition_exists()) {
return read_from_underlying(timeout);
}
auto end = _next_row_in_range ? position_in_partition(_next_row.position())
: position_in_partition(_upper_bound);
return _underlying->fast_forward_to(position_range{_lower_bound, std::move(end)}, timeout).then([this, timeout] {


@@ -709,16 +709,16 @@ private:
}
return false;
}
bool compare(const T&, const value_type& v);
int32_t compare(const T&, const value_type& v);
};
template<>
bool maybe_back_insert_iterator<std::vector<std::pair<bytes_view, bytes_view>>, bytes_view>::compare(const bytes_view& t, const value_type& v) {
int32_t maybe_back_insert_iterator<std::vector<std::pair<bytes_view, bytes_view>>, bytes_view>::compare(const bytes_view& t, const value_type& v) {
return _type.compare(t, v.first);
}
template<>
bool maybe_back_insert_iterator<std::vector<bytes_view>, bytes_view>::compare(const bytes_view& t, const value_type& v) {
int32_t maybe_back_insert_iterator<std::vector<bytes_view>, bytes_view>::compare(const bytes_view& t, const value_type& v) {
return _type.compare(t, v);
}
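The bool-to-int32_t change above is needed because `_type.compare` is a three-way comparison; squeezing its result through `bool` collapses "less than" and "greater than" into the same truthy value. A toy Python illustration of the lost information (not Scylla's actual comparator):

```python
def tri_compare(a, b):
    # Three-way compare: negative, zero, or positive, like type::compare.
    return (a > b) - (a < b)

# Truncated to bool, -1 and +1 become indistinguishable...
assert bool(tri_compare(1, 2)) == bool(tri_compare(2, 1))
# ...while the signed result still carries the ordering.
assert tri_compare(1, 2) < 0 and tri_compare(2, 1) > 0 and tri_compare(1, 1) == 0
```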
@@ -981,9 +981,9 @@ static bytes get_bytes(const atomic_cell_view& acv) {
return acv.value().linearize();
}
static bytes_view get_bytes_view(const atomic_cell_view& acv, std::vector<bytes>& buf) {
static bytes_view get_bytes_view(const atomic_cell_view& acv, std::forward_list<bytes>& buf) {
return acv.value().is_fragmented()
? bytes_view{buf.emplace_back(acv.value().linearize())}
? bytes_view{buf.emplace_front(acv.value().linearize())}
: acv.value().first_fragment();
}
@@ -1138,9 +1138,9 @@ struct process_row_visitor {
struct udt_visitor : public collection_visitor {
std::vector<bytes_opt> _added_cells;
std::vector<bytes>& _buf;
std::forward_list<bytes>& _buf;
udt_visitor(ttl_opt& ttl_column, size_t num_keys, std::vector<bytes>& buf)
udt_visitor(ttl_opt& ttl_column, size_t num_keys, std::forward_list<bytes>& buf)
: collection_visitor(ttl_column), _added_cells(num_keys), _buf(buf) {}
void live_collection_cell(bytes_view key, const atomic_cell_view& cell) {
@@ -1149,7 +1149,7 @@ struct process_row_visitor {
}
};
std::vector<bytes> buf;
std::forward_list<bytes> buf;
udt_visitor v(_ttl_column, type.size(), buf);
visit_collection(v);
@@ -1168,9 +1168,9 @@ struct process_row_visitor {
struct map_or_list_visitor : public collection_visitor {
std::vector<std::pair<bytes_view, bytes_view>> _added_cells;
std::vector<bytes>& _buf;
std::forward_list<bytes>& _buf;
map_or_list_visitor(ttl_opt& ttl_column, std::vector<bytes>& buf)
map_or_list_visitor(ttl_opt& ttl_column, std::forward_list<bytes>& buf)
: collection_visitor(ttl_column), _buf(buf) {}
void live_collection_cell(bytes_view key, const atomic_cell_view& cell) {
@@ -1179,7 +1179,7 @@ struct process_row_visitor {
}
};
std::vector<bytes> buf;
std::forward_list<bytes> buf;
map_or_list_visitor v(_ttl_column, buf);
visit_collection(v);


@@ -99,8 +99,8 @@ listen_address: localhost
# listen_on_broadcast_address: false
# port for the CQL native transport to listen for clients on
# For security reasons, you should not expose this port to the internet. Firewall it if needed.
# To disable the CQL native transport, set this option to 0.
# For security reasons, you should not expose this port to the internet. Firewall it if needed.
# To disable the CQL native transport, remove this option and configure native_transport_port_ssl.
native_transport_port: 9042
# Like native_transport_port, but clients are forwarded to specific shards, based on the


@@ -281,7 +281,7 @@ scylla_tests = set([
'test/boost/cdc_generation_test',
'test/boost/aggregate_fcts_test',
'test/boost/allocation_strategy_test',
'test/boost/alternator_base64_test',
'test/boost/alternator_unit_test',
'test/boost/anchorless_list_test',
'test/boost/auth_passwords_test',
'test/boost/auth_resource_test',
@@ -1033,7 +1033,7 @@ pure_boost_tests = set([
])
tests_not_using_seastar_test_framework = set([
'test/boost/alternator_base64_test',
'test/boost/alternator_unit_test',
'test/boost/small_vector_test',
'test/manual/gossip',
'test/manual/message',
@@ -1107,7 +1107,7 @@ deps['test/boost/linearizing_input_stream_test'] = [
]
deps['test/boost/duration_test'] += ['test/lib/exception_utils.cc']
deps['test/boost/alternator_base64_test'] += ['alternator/base64.cc']
deps['test/boost/alternator_unit_test'] += ['alternator/base64.cc']
deps['test/raft/replication_test'] = ['test/raft/replication_test.cc'] + scylla_raft_dependencies
deps['test/boost/raft_fsm_test'] = ['test/boost/raft_fsm_test.cc', 'test/lib/log.cc'] + scylla_raft_dependencies
@@ -1969,7 +1969,7 @@ with open(buildfile_tmp, 'w') as f:
command = ./dist/debian/debian_files_gen.py
build $builddir/debian/debian: debian_files_gen | always
rule extract_node_exporter
command = tar -C build -xvpf {node_exporter_filename} && rm -rfv build/node_exporter && mv -v build/{node_exporter_dirname} build/node_exporter
command = tar -C build -xvpf {node_exporter_filename} --no-same-owner && rm -rfv build/node_exporter && mv -v build/{node_exporter_dirname} build/node_exporter
build $builddir/node_exporter: extract_node_exporter | always
''').format(**globals()))


@@ -181,13 +181,18 @@ inline
shared_ptr<function>
make_from_json_function(database& db, const sstring& keyspace, data_type t) {
return make_native_scalar_function<true>("fromjson", t, {utf8_type},
[&db, &keyspace, t](cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
rjson::value json_value = rjson::parse(utf8_type->to_string(parameters[0].value()));
bytes_opt parsed_json_value;
if (!json_value.IsNull()) {
parsed_json_value.emplace(from_json_object(*t, json_value, sf));
[&db, keyspace, t](cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
try {
rjson::value json_value = rjson::parse(utf8_type->to_string(parameters[0].value()));
bytes_opt parsed_json_value;
if (!json_value.IsNull()) {
parsed_json_value.emplace(from_json_object(*t, json_value, sf));
}
return parsed_json_value;
} catch(rjson::error& e) {
throw exceptions::function_execution_exception("fromJson",
format("Failed parsing fromJson parameter: {}", e.what()), keyspace, {t->name()});
}
return parsed_json_value;
});
}


@@ -78,7 +78,22 @@ public:
return Pure;
}
virtual bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) override {
return _func(sf, parameters);
try {
return _func(sf, parameters);
} catch(exceptions::cassandra_exception&) {
// If the function's code took the time to produce an official
// cassandra_exception, pass it through. Otherwise, below we will
// wrap the unknown exception in a function_execution_exception.
throw;
} catch(...) {
std::vector<sstring> args;
args.reserve(arg_types().size());
for (const data_type& a : arg_types()) {
args.push_back(a->name());
}
throw exceptions::function_execution_exception(name().name,
format("Failed execution of function {}: {}", name(), std::current_exception()), name().keyspace, std::move(args));
}
}
};
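The catch structure above follows a common pattern: exceptions already in the official error hierarchy pass through untouched, while anything else is wrapped so the client still receives a well-formed error. A minimal sketch of the same pattern (the class and message are illustrative, not Scylla's actual types):

```python
class FunctionExecutionError(Exception):
    """Stands in for exceptions::function_execution_exception."""

def execute(func, *args):
    try:
        return func(*args)
    except FunctionExecutionError:
        # Already an "official" error: pass it through unchanged.
        raise
    except Exception as exc:
        # Unknown failure: wrap it so the caller sees a structured error.
        raise FunctionExecutionError(f"Failed execution: {exc}") from exc
```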


@@ -551,16 +551,27 @@ bool statement_restrictions::need_filtering() const {
// clustering restrictions. Therefore, a continuous clustering range is guaranteed.
return false;
}
if (!_clustering_columns_restrictions->needs_filtering(*_schema)) { // Guaranteed continuous clustering range.
return false;
}
// Now we know there are some clustering-column restrictions that are out-of-order or not EQ. A naive base-table
// query must be filtered. What about an index-table query? That can only avoid filtering if there is exactly one
// EQ supported by an index.
return !(_clustering_columns_restrictions->size() == 1 && _has_queriable_ck_index);
// TODO: it is also possible to avoid filtering here if a non-empty CK prefix is specified and token_known, plus
// there's exactly one out-of-order-but-index-supported clustering-column restriction.
if (_has_queriable_ck_index && _uses_secondary_indexing) {
// In cases where we use an index, clustering column restrictions might cause the need for filtering.
// TODO: This is overly conservative, there are some cases when this returns true but filtering
// is not needed. Because of that the database will sometimes perform filtering when it's not actually needed.
// Query performance shouldn't be affected much, at most we will filter rows that are all correct.
// Here are some cases to consider:
// On a table with primary key (p, c1, c2, c3) with an index on c3
// WHERE c3 = ? - doesn't require filtering
// WHERE c1 = ? AND c2 = ? AND c3 = ? - requires filtering
// WHERE p = ? AND c1 = ? AND c3 = ? - doesn't require filtering, but we conservatively report it does
// WHERE p = ? AND c1 LIKE ? AND c3 = ? - requires filtering
// WHERE p = ? AND c1 = ? AND c2 LIKE ? AND c3 = ? - requires filtering
// WHERE p = ? AND c1 = ? AND c2 = ? AND c3 = ? - doesn't use an index
// WHERE p = ? AND c1 = ? AND c2 < ? AND c3 = ? - doesn't require filtering, but we report it does
return _clustering_columns_restrictions->size() > 1;
}
// Now we know that the query doesn't use an index.
// The only thing that can cause filtering now are the clustering columns.
return _clustering_columns_restrictions->needs_filtering(*_schema);
}
void statement_restrictions::validate_secondary_index_selections(bool selects_only_static_columns) {


@@ -306,6 +306,13 @@ create_index_statement::announce_migration(service::storage_proxy& proxy) const
format("Index {} is a duplicate of existing index {}", index.name(), existing_index.value().name()));
}
}
auto index_table_name = secondary_index::index_table_name(accepted_name);
if (db.has_schema(keyspace(), index_table_name)) {
return make_exception_future<::shared_ptr<cql_transport::event::schema_change>>(
exceptions::invalid_request_exception(format("Index {} cannot be created, because table {} already exists",
accepted_name, index_table_name))
);
}
++_cql_stats->secondary_index_creates;
schema_builder builder{schema};
builder.with_index(index);


@@ -460,7 +460,7 @@ generate_base_key_from_index_pk(const partition_key& index_pk, const std::option
if (!view_col) {
throw std::runtime_error(format("Base key column not found in the view: {}", base_col.name_as_text()));
}
if (base_col.type != view_col->type) {
if (base_col.type->without_reversed() != *view_col->type) {
throw std::runtime_error(format("Mismatched types for base and view columns {}: {} and {}",
base_col.name_as_text(), base_col.type->cql3_type_name(), view_col->type->cql3_type_name()));
}
@@ -964,6 +964,7 @@ lw_shared_ptr<const service::pager::paging_state> indexed_table_select_statement
}
auto paging_state_copy = make_lw_shared<service::pager::paging_state>(service::pager::paging_state(*paging_state));
paging_state_copy->set_remaining(internal_paging_size);
paging_state_copy->set_partition_key(std::move(index_pk));
paging_state_copy->set_clustering_key(std::move(index_ck));
return std::move(paging_state_copy);
@@ -1145,7 +1146,11 @@ query::partition_slice indexed_table_select_statement::get_partition_slice_for_g
if (single_ck_restrictions) {
auto prefix_restrictions = single_ck_restrictions->get_longest_prefix_restrictions();
auto clustering_restrictions_from_base = ::make_shared<restrictions::single_column_clustering_key_restrictions>(_view_schema, *prefix_restrictions);
const auto indexed_column = _view_schema->get_column_definition(to_bytes(_index.target_column()));
for (auto restriction_it : clustering_restrictions_from_base->restrictions()) {
if (restriction_it.first == indexed_column) {
continue; // In the index table, the indexed column is the partition (not clustering) key.
}
clustering_restrictions->merge_with(restriction_it.second);
}
}


@@ -760,9 +760,6 @@ void database::set_format_by_config() {
}
database::~database() {
_read_concurrency_sem.clear_inactive_reads();
_streaming_concurrency_sem.clear_inactive_reads();
_system_read_concurrency_sem.clear_inactive_reads();
}
void database::update_version(const utils::UUID& version) {
@@ -1951,7 +1948,11 @@ sstring database::get_available_index_name(const sstring &ks_name, const sstring
auto base_name = index_metadata::get_default_index_name(cf_name, index_name_root);
sstring accepted_name = base_name;
int i = 0;
while (existing_names.contains(accepted_name)) {
auto name_accepted = [&] {
auto index_table_name = secondary_index::index_table_name(accepted_name);
return !has_schema(ks_name, index_table_name) && !existing_names.contains(accepted_name);
};
while (!name_accepted()) {
accepted_name = base_name + "_" + std::to_string(++i);
}
return accepted_name;
@@ -2016,6 +2017,13 @@ future<>
database::stop() {
assert(!_large_data_handler->running());
// Inactive reads might hold on to sstables, blocking the
// `sstables_manager::close()` calls below. No one will come back for these
// reads at this point so clear them before proceeding with the shutdown.
_read_concurrency_sem.clear_inactive_reads();
_streaming_concurrency_sem.clear_inactive_reads();
_system_read_concurrency_sem.clear_inactive_reads();
// try to ensure that CL has done disk flushing
future<> maybe_shutdown_commitlog = _commitlog != nullptr ? _commitlog->shutdown() : make_ready_future<>();
return maybe_shutdown_commitlog.then([this] {


@@ -240,9 +240,13 @@ public:
return _memtables.back();
}
// The caller has to make sure the element exist before calling this.
// # 8904 - this method is akin to std::set::erase(key_type), not
// erase(iterator). Should be tolerant against non-existing.
void erase(const shared_memtable& element) {
_memtables.erase(boost::range::find(_memtables, element));
auto i = boost::range::find(_memtables, element);
if (i != _memtables.end()) {
_memtables.erase(i);
}
}
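The #8904 fix above makes `erase` behave like `std::set::erase(key_type)`: erasing a missing element becomes a no-op instead of erasing an `end()` iterator. The same contract in a short sketch:

```python
def erase(memtables, element):
    # Tolerant erase: remove the element if present, otherwise do nothing,
    # mirroring the fix rather than erasing a not-found position.
    try:
        memtables.remove(element)
    except ValueError:
        pass
```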
void clear() {
_memtables.clear();
@@ -893,7 +897,7 @@ public:
return _pending_writes_phaser.start();
}
future<> await_pending_writes() {
future<> await_pending_writes() noexcept {
return _pending_writes_phaser.advance_and_await();
}
@@ -905,7 +909,7 @@ public:
return _pending_reads_phaser.start();
}
future<> await_pending_reads() {
future<> await_pending_reads() noexcept {
return _pending_reads_phaser.advance_and_await();
}
@@ -917,7 +921,7 @@ public:
return _pending_streams_phaser.start();
}
future<> await_pending_streams() {
future<> await_pending_streams() noexcept {
return _pending_streams_phaser.advance_and_await();
}
@@ -925,11 +929,11 @@ public:
return _pending_streams_phaser.operations_in_progress();
}
future<> await_pending_flushes() {
future<> await_pending_flushes() noexcept {
return _pending_flushes_phaser.advance_and_await();
}
future<> await_pending_ops() {
future<> await_pending_ops() noexcept {
return when_all(await_pending_reads(), await_pending_writes(), await_pending_streams(), await_pending_flushes()).discard_result();
}


@@ -124,7 +124,7 @@ static future<> try_record(std::string_view large_table, const sstables::sstable
const auto sstable_name = sst.get_filename();
std::string pk_str = key_to_str(partition_key.to_partition_key(s), s);
auto timestamp = db_clock::now();
large_data_logger.warn("Writing large {} {}/{}: {}{} ({} bytes)", desc, ks_name, cf_name, pk_str, extra_path, size);
large_data_logger.warn("Writing large {} {}/{}: {}{} ({} bytes) to {}", desc, ks_name, cf_name, pk_str, extra_path, size, sstable_name);
return db::qctx->execute_cql(req, ks_name, cf_name, sstable_name, size, pk_str, timestamp, args...)
.discard_result()
.handle_exception([ks_name, cf_name, large_table, sstable_name] (std::exception_ptr ep) {
@@ -140,9 +140,10 @@ future<> cql_table_large_data_handler::record_large_partitions(const sstables::s
void cql_table_large_data_handler::log_too_many_rows(const sstables::sstable& sst, const sstables::key& partition_key,
uint64_t rows_count) const {
const schema& s = *sst.get_schema();
large_data_logger.warn("Writing a partition with too many rows [{}/{}:{}] ({} rows)",
const auto sstable_name = sst.get_filename();
large_data_logger.warn("Writing a partition with too many rows [{}/{}:{}] ({} rows) to {}",
s.ks_name(), s.cf_name(), partition_key.to_partition_key(s).with_schema(s),
rows_count);
rows_count, sstable_name);
}
future<> cql_table_large_data_handler::record_large_cells(const sstables::sstable& sst, const sstables::key& partition_key,


@@ -43,9 +43,13 @@
namespace db {
future<> snapshot_ctl::check_snapshot_not_exist(sstring ks_name, sstring name) {
future<> snapshot_ctl::check_snapshot_not_exist(sstring ks_name, sstring name, std::optional<std::vector<sstring>> filter) {
auto& ks = _db.local().find_keyspace(ks_name);
return parallel_for_each(ks.metadata()->cf_meta_data(), [this, ks_name = std::move(ks_name), name = std::move(name)] (auto& pair) {
return parallel_for_each(ks.metadata()->cf_meta_data(), [this, ks_name = std::move(ks_name), name = std::move(name), filter = std::move(filter)] (auto& pair) {
auto& cf_name = pair.first;
if (filter && std::find(filter->begin(), filter->end(), cf_name) == filter->end()) {
return make_ready_future<>();
}
auto& cf = _db.local().find_column_family(pair.second);
return cf.snapshot_exists(name).then([ks_name = std::move(ks_name), name] (bool exists) {
if (exists) {
@@ -111,7 +115,7 @@ future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<
}
return run_snapshot_modify_operation([this, ks_name = std::move(ks_name), tables = std::move(tables), tag = std::move(tag)] {
return check_snapshot_not_exist(ks_name, tag).then([this, ks_name, tables = std::move(tables), tag] {
return check_snapshot_not_exist(ks_name, tag, tables).then([this, ks_name, tables, tag] {
return do_with(std::vector<sstring>(std::move(tables)),[this, ks_name, tag](const std::vector<sstring>& tables) {
return do_for_each(tables, [ks_name, tag, this] (const sstring& table_name) {
if (table_name.find(".") != sstring::npos) {


@@ -40,6 +40,8 @@
#pragma once
#include <vector>
#include <seastar/core/sharded.hh>
#include <seastar/core/future.hh>
#include "database.hh"
@@ -112,7 +114,7 @@ private:
seastar::rwlock _lock;
seastar::gate _ops;
future<> check_snapshot_not_exist(sstring ks_name, sstring name);
future<> check_snapshot_not_exist(sstring ks_name, sstring name, std::optional<std::vector<sstring>> filter = {});
template <typename Func>
std::result_of_t<Func()> run_snapshot_modify_operation(Func&&);


@@ -1170,7 +1170,7 @@ get_view_natural_endpoint(const sstring& keyspace_name,
}
static future<> apply_to_remote_endpoints(gms::inet_address target, std::vector<gms::inet_address>&& pending_endpoints,
frozen_mutation_and_schema& mut, const dht::token& base_token, const dht::token& view_token,
frozen_mutation_and_schema&& mut, const dht::token& base_token, const dht::token& view_token,
service::allow_hints allow_hints, tracing::trace_state_ptr tr_state) {
tracing::trace(tr_state, "Sending view update for {}.{} to {}, with pending endpoints = {}; base token = {}; view token = {}",
@@ -1189,7 +1189,7 @@ static future<> apply_to_remote_endpoints(gms::inet_address target, std::vector<
// appropriate paired replicas. This is done asynchronously - we do not wait
// for the writes to complete.
future<> mutate_MV(
const dht::token& base_token,
dht::token base_token,
std::vector<frozen_mutation_and_schema> view_updates,
db::view::stats& stats,
cf_stats& cf_stats,
@@ -1205,27 +1205,7 @@ future<> mutate_MV(
auto& keyspace_name = mut.s->ks_name();
auto target_endpoint = get_view_natural_endpoint(keyspace_name, base_token, view_token);
auto remote_endpoints = service::get_local_storage_service().get_token_metadata().pending_endpoints_for(view_token, keyspace_name);
auto maybe_account_failure = [tr_state, &stats, &cf_stats, units = pending_view_updates.split(mut.fm.representation().size())] (
future<>&& f,
gms::inet_address target,
bool is_local,
size_t remotes) {
if (f.failed()) {
stats.view_updates_failed_local += is_local;
stats.view_updates_failed_remote += remotes;
cf_stats.total_view_updates_failed_local += is_local;
cf_stats.total_view_updates_failed_remote += remotes;
auto ep = f.get_exception();
tracing::trace(tr_state, "Failed to apply {}view update for {} and {} remote endpoints",
seastar::value_of([is_local]{return is_local ? "local " : "";}), target, remotes);
vlogger.error("Error applying view update to {}: {}", target, ep);
return make_exception_future<>(std::move(ep));
} else {
tracing::trace(tr_state, "Successfully applied {}view update for {} and {} remote endpoints",
seastar::value_of([is_local]{return is_local ? "local " : "";}), target, remotes);
return make_ready_future<>();
}
};
auto sem_units = pending_view_updates.split(mut.fm.representation().size());
// First, find the local endpoint and ensure that if it exists,
// it will be the target endpoint. That way, all endpoints in the
@@ -1262,11 +1242,20 @@ future<> mutate_MV(
tracing::trace(tr_state, "Locally applying view update for {}.{}; base token = {}; view token = {}",
mut.s->ks_name(), mut.s->cf_name(), base_token, view_token);
future<> local_view_update = service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr, std::move(tr_state), db::commitlog::force_sync::no).then_wrapped(
[&stats,
maybe_account_failure = std::move(maybe_account_failure),
mut_ptr = std::move(mut_ptr)] (future<>&& f) {
[s = mut.s, &stats, &cf_stats, tr_state, base_token, view_token, my_address, mut_ptr = std::move(mut_ptr),
units = sem_units.split(sem_units.count())] (future<>&& f) {
--stats.writes;
return maybe_account_failure(std::move(f), utils::fb_utilities::get_broadcast_address(), true, 0);
if (f.failed()) {
++stats.view_updates_failed_local;
++cf_stats.total_view_updates_failed_local;
auto ep = f.get_exception();
tracing::trace(tr_state, "Failed to apply local view update for {}", my_address);
vlogger.error("Error applying view update to {} (view: {}.{}, base token: {}, view token: {}): {}",
my_address, s->ks_name(), s->cf_name(), base_token, view_token, ep);
return make_exception_future<>(std::move(ep));
}
tracing::trace(tr_state, "Successfully applied local view update for {}", my_address);
return make_ready_future<>();
});
fs->push_back(std::move(local_view_update));
// We just applied a local update to the target endpoint, so it should now be removed
@@ -1288,11 +1277,23 @@ future<> mutate_MV(
size_t updates_pushed_remote = remote_endpoints.size() + 1;
stats.view_updates_pushed_remote += updates_pushed_remote;
cf_stats.total_view_updates_pushed_remote += updates_pushed_remote;
future<> view_update = apply_to_remote_endpoints(*target_endpoint, std::move(remote_endpoints), mut, base_token, view_token, allow_hints, tr_state).then_wrapped(
[target_endpoint,
updates_pushed_remote,
maybe_account_failure = std::move(maybe_account_failure)] (future<>&& f) mutable {
return maybe_account_failure(std::move(f), std::move(*target_endpoint), false, updates_pushed_remote);
schema_ptr s = mut.s;
future<> view_update = apply_to_remote_endpoints(*target_endpoint, std::move(remote_endpoints), std::move(mut), base_token, view_token, allow_hints, tr_state).then_wrapped(
[s = std::move(s), &stats, &cf_stats, tr_state, base_token, view_token, target_endpoint, updates_pushed_remote,
units = sem_units.split(sem_units.count())] (future<>&& f) mutable {
if (f.failed()) {
stats.view_updates_failed_remote += updates_pushed_remote;
cf_stats.total_view_updates_failed_remote += updates_pushed_remote;
auto ep = f.get_exception();
tracing::trace(tr_state, "Failed to apply view update for {} and {} remote endpoints",
*target_endpoint, updates_pushed_remote);
vlogger.error("Error applying view update to {} (view: {}.{}, base token: {}, view token: {}): {}",
*target_endpoint, s->ks_name(), s->cf_name(), base_token, view_token, ep);
return make_exception_future<>(std::move(ep));
}
tracing::trace(tr_state, "Successfully applied view update for {} and {} remote endpoints",
*target_endpoint, updates_pushed_remote);
return make_ready_future<>();
});
if (wait_for_all) {
fs->push_back(std::move(view_update));


@@ -153,7 +153,7 @@ query::clustering_row_ranges calculate_affected_clustering_ranges(
struct wait_for_all_updates_tag {};
using wait_for_all_updates = bool_class<wait_for_all_updates_tag>;
future<> mutate_MV(
const dht::token& base_token,
dht::token base_token,
std::vector<frozen_mutation_and_schema> view_updates,
db::view::stats& stats,
cf_stats& cf_stats,


@@ -58,7 +58,8 @@ public:
template<typename T, typename... Args>
void feed_hash(const T& value, Args&&... args) {
std::visit([&] (auto& hasher) noexcept -> void {
// FIXME uncomment the noexcept marking once clang bug 50994 is fixed or gcc compilation is turned on
std::visit([&] (auto& hasher) /* noexcept(noexcept(::feed_hash(hasher, value, args...))) */ -> void {
::feed_hash(hasher, value, std::forward<Args>(args)...);
}, _impl);
};


@@ -87,7 +87,8 @@ WantedBy=multi-user.target
run('sysctl -p /etc/sysctl.d/99-scylla-coredump.conf', shell=True, check=True)
fp = tempfile.NamedTemporaryFile()
fp.write(b'kill -SEGV $$')
fp.write(b'ulimit -c unlimited\n')
fp.write(b'kill -SEGV $$\n')
fp.flush()
p = subprocess.Popen(['/bin/bash', fp.name], stdout=subprocess.PIPE)
pid = p.pid


@@ -22,6 +22,7 @@
import os
import sys
import argparse
import shlex
import distro
from scylla_util import *
@@ -46,7 +47,12 @@ if __name__ == '__main__':
if os.getuid() > 0:
print('Requires root permission.')
sys.exit(1)
if not os.path.exists('/sys/devices/system/cpu/cpufreq/policy0/scaling_governor'):
parser = argparse.ArgumentParser(description='CPU scaling setup script for Scylla.')
parser.add_argument('--force', dest='force', action='store_true',
help='force running setup even CPU scaling unsupported')
args = parser.parse_args()
if not args.force and not os.path.exists('/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor'):
print('This computer doesn\'t supported CPU scaling configuration.')
sys.exit(0)
if not is_debian_variant():
@@ -56,6 +62,11 @@ if __name__ == '__main__':
if not shutil.which('cpufreq-set'):
pkg_install('cpufrequtils')
if is_debian_variant():
try:
ondemand = systemd_unit('ondemand')
ondemand.disable()
except:
pass
cfg = sysconfig_parser('/etc/default/cpufrequtils')
cfg.set('GOVERNOR', 'performance')
cfg.commit()


@@ -254,7 +254,7 @@ if __name__ == "__main__":
disk_properties["read_bandwidth"] = 2650 * mbs
disk_properties["write_iops"] = 360000
disk_properties["write_bandwidth"] = 1400 * mbs
elif nr_disks == "16":
elif nr_disks == 16:
disk_properties["read_iops"] = 1600000
disk_properties["read_bandwidth"] = 4521251328
#below is google, above is our measured
@@ -263,7 +263,7 @@ if __name__ == "__main__":
disk_properties["write_bandwidth"] = 2759452672
#below is google, above is our measured
#disk_properties["write_bandwidth"] = 3120 * mbs
elif nr_disks == "24":
elif nr_disks == 24:
disk_properties["read_iops"] = 2400000
disk_properties["read_bandwidth"] = 5921532416
#below is google, above is our measured
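The quoting fix above (`"16"`/`"24"` to `16`/`24`) matters because in Python an `int` never compares equal to a `str`, so the original branches could never fire:

```python
nr_disks = 16  # computed as an integer by the setup script

# The old comparison was dead code: int == str is always False in Python.
assert (nr_disks == "16") is False
# The fixed comparison selects the 16-disk I/O properties as intended.
assert (nr_disks == 16) is True
```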


@@ -90,12 +90,12 @@ if __name__ == '__main__':
with open('/etc/ntp.conf') as f:
conf = f.read()
if args.subdomain:
conf2 = re.sub(r'server\s+([0-9]+)\.(\S+)\.pool\.ntp\.org', 'server \\1.{}.pool.ntp.org'.format(args.subdomain), conf, flags=re.MULTILINE)
conf2 = re.sub(r'(server|pool)\s+([0-9]+)\.(\S+)\.pool\.ntp\.org', '\\1 \\2.{}.pool.ntp.org'.format(args.subdomain), conf, flags=re.MULTILINE)
with open('/etc/ntp.conf', 'w') as f:
f.write(conf2)
conf = conf2
match = re.search(r'^server\s+(\S*)(\s+\S+)?', conf, flags=re.MULTILINE)
server = match.group(1)
match = re.search(r'^(server|pool)\s+(\S*)(\s+\S+)?', conf, flags=re.MULTILINE)
server = match.group(2)
ntpd = systemd_unit('ntpd.service')
ntpd.stop()
# ignore error, ntpd may able to adjust clock later
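The broadened pattern above matters because ntp.conf files can list upstream servers with `pool` directives as well as `server`, and the old expression silently skipped the former. A quick sketch of the difference (the sample line and the `ec2` subdomain are illustrative):

```python
import re

conf = "pool 2.centos.pool.ntp.org iburst\n"  # illustrative config line

# Old pattern: only "server" lines are rewritten, so "pool" lines pass through.
old = re.sub(r'server\s+([0-9]+)\.(\S+)\.pool\.ntp\.org',
             r'server \1.ec2.pool.ntp.org', conf, flags=re.MULTILINE)
# New pattern: matches either directive and preserves it in the replacement.
new = re.sub(r'(server|pool)\s+([0-9]+)\.(\S+)\.pool\.ntp\.org',
             r'\1 \2.ec2.pool.ntp.org', conf, flags=re.MULTILINE)
```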


@@ -142,4 +142,3 @@ if __name__ == '__main__':
print(f'Exception occurred while creating perftune.yaml: {e}')
print('To fix the error, please re-run scylla_setup.')
sys.exit(1)


@@ -30,6 +30,8 @@ import distro
from pathlib import Path
from scylla_util import *
from subprocess import run
import distro
from pkg_resources import parse_version
if __name__ == '__main__':
if os.getuid() > 0:
@@ -115,6 +117,25 @@ if __name__ == '__main__':
pkg_install('xfsprogs')
if not shutil.which('mdadm'):
pkg_install('mdadm')
# XXX: Workaround for mdmonitor.service issue on CentOS8
if is_redhat_variant() and distro.version() == '8':
mdadm_rpm = run('rpm -q mdadm', shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
match = re.match(r'^mdadm-([0-9]+\.[0-9]+-[a-zA-Z0-9]+)\.', mdadm_rpm)
mdadm_version = match.group(1)
if parse_version('4.1-14') < parse_version(mdadm_version):
repo_data = '''
[BaseOS_8_3_2011]
name=CentOS8.3.2011 - Base
baseurl=http://vault.centos.org/8.3.2011/BaseOS/$basearch/os/
gpgcheck=1
enabled=0
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-centosofficial
'''[1:-1]
with open('/etc/yum.repos.d/CentOS-Vault-8.3.repo', 'w') as f:
f.write(repo_data)
run('dnf downgrade --enablerepo=BaseOS_8_3_2011 -y mdadm', shell=True, check=True)
run('dnf install -y python3-dnf-plugin-versionlock', shell=True, check=True)
run('dnf versionlock add mdadm', shell=True, check=True)
try:
md_service = systemd_unit('mdmonitor.service')
except SystemdException:
@@ -163,7 +184,7 @@ Before=scylla-server.service
After={after}
[Mount]
What=UUID={uuid}
What=/dev/disk/by-uuid/{uuid}
Where={mount_at}
Type=xfs
Options=noatime


@@ -36,6 +36,7 @@ from subprocess import run, DEVNULL
import distro
from scylla_sysconfdir import SYSCONFDIR
from multiprocessing import cpu_count
def scriptsdir_p():
p = Path(sys.argv[0]).resolve()
@@ -146,6 +147,11 @@ class gcp_instance:
if af == socket.AF_INET:
addr, port = sa
if addr == "169.254.169.254":
# Make sure it is not on GKE
try:
gcp_instance().__instance_metadata("machine-type")
except urllib.error.HTTPError:
return False
return True
return False


@@ -1,2 +1,2 @@
# Raise max AIO events
fs.aio-max-nr = 1048576
fs.aio-max-nr = 5578536


@@ -1,7 +1,5 @@
[Unit]
Description=Run Scylla fstrim daily
After=scylla-server.service
BindsTo=scylla-server.service
[Timer]
OnCalendar=Sat *-*-* 00:00:00


@@ -9,9 +9,9 @@ if [[ $KVER =~ 3\.13\.0\-([0-9]+)-generic ]]; then
else
# expect failures in virtualized environments
sysctl -p/usr/lib/sysctl.d/99-scylla-sched.conf || :
sysctl -p/usr/lib/sysctl.d/99-scylla-aio.conf || :
sysctl -p/usr/lib/sysctl.d/99-scylla-vm.conf || :
sysctl -p/usr/lib/sysctl.d/99-scylla-inotify.conf || :
sysctl -p/usr/lib/sysctl.d/99-scylla-aio.conf || :
fi
#DEBHELPER#


@@ -12,8 +12,6 @@ case "$1" in
if [ "$1" = "purge" ]; then
rm -rf /etc/systemd/system/scylla-server.service.d/
fi
rm -f /etc/systemd/system/var-lib-systemd-coredump.mount
rm -f /etc/systemd/system/var-lib-scylla.mount
;;
esac


@@ -6,7 +6,7 @@ ENV container docker
# The SCYLLA_REPO_URL argument specifies the URL to the RPM repository this Docker image uses to install Scylla. The default value is Scylla's unstable RPM repository, which contains the daily build.
ARG SCYLLA_REPO_URL=http://downloads.scylladb.com/rpm/unstable/centos/branch-4.4/latest/scylla.repo
ARG VERSION=4.4
ARG VERSION=4.4.9
ADD scylla_bashrc /scylla_bashrc


@@ -4,3 +4,4 @@ stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
stopwaitsecs=900


@@ -121,12 +121,13 @@ class ScyllaSetup:
if self._apiAddress is not None:
args += ["--api-address %s" % self._apiAddress]
if self._alternatorPort is not None:
if self._alternatorAddress is not None:
args += ["--alternator-address %s" % self._alternatorAddress]
if self._alternatorPort is not None:
args += ["--alternator-port %s" % self._alternatorPort]
if self._alternatorHttpsPort is not None:
args += ["--alternator-address %s" % self._alternatorAddress]
args += ["--alternator-https-port %s" % self._alternatorHttpsPort]
if self._alternatorWriteIsolation is not None:
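The hunk above hoists the `--alternator-address` argument out of the two port branches, so it is emitted exactly once even when both `--alternator-port` and `--alternator-https-port` are set. A sketch of the corrected logic, with a simplified `Setup` stand-in for the real `ScyllaSetup` class:

```python
class Setup:
    """Simplified stand-in for the real ScyllaSetup argument builder."""
    def __init__(self, address=None, port=None, https_port=None):
        self._alternatorAddress = address
        self._alternatorPort = port
        self._alternatorHttpsPort = https_port

    def alternator_args(self):
        args = []
        # Emit the address once, independently of which port options are
        # set, instead of once per port option as the buggy code did.
        if self._alternatorAddress is not None:
            args.append("--alternator-address %s" % self._alternatorAddress)
        if self._alternatorPort is not None:
            args.append("--alternator-port %s" % self._alternatorPort)
        if self._alternatorHttpsPort is not None:
            args.append("--alternator-https-port %s" % self._alternatorHttpsPort)
        return args
```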


@@ -7,7 +7,7 @@ Group: Applications/Databases
License: AGPLv3
URL: http://www.scylladb.com/
Source0: %{reloc_pkg}
Requires: %{product}-server = %{version} %{product}-conf = %{version} %{product}-kernel-conf = %{version} %{product}-jmx = %{version} %{product}-tools = %{version} %{product}-tools-core = %{version} %{product}-node-exporter = %{version}
Requires: %{product}-server = %{version} %{product}-conf = %{version} %{product}-python3 = %{version} %{product}-kernel-conf = %{version} %{product}-jmx = %{version} %{product}-tools = %{version} %{product}-tools-core = %{version} %{product}-node-exporter = %{version}
Obsoletes: scylla-server < 1.1
%global _debugsource_template %{nil}
@@ -54,7 +54,7 @@ Group: Applications/Databases
Summary: The Scylla database server
License: AGPLv3
URL: http://www.scylladb.com/
Requires: %{product}-conf %{product}-python3
Requires: %{product}-conf = %{version} %{product}-python3 = %{version}
Conflicts: abrt
AutoReqProv: no
@@ -78,13 +78,18 @@ getent passwd scylla || /usr/sbin/useradd -g scylla -s /sbin/nologin -r -d %{_sh
%post server
/opt/scylladb/scripts/scylla_post_install.sh
%systemd_post scylla-server.service
if [ $1 -eq 1 ] ; then
/usr/bin/systemctl preset scylla-server.service ||:
fi
%preun server
%systemd_preun scylla-server.service
if [ $1 -eq 0 ] ; then
/usr/bin/systemctl --no-reload disable scylla-server.service ||:
/usr/bin/systemctl stop scylla-server.service ||:
fi
%postun server
%systemd_postun scylla-server.service
/usr/bin/systemctl daemon-reload ||:
%posttrans server
if [ -d /tmp/%{name}-%{version}-%{release} ]; then
@@ -137,9 +142,9 @@ rm -rf $RPM_BUILD_ROOT
%ghost /etc/systemd/system/scylla-server.service.d/capabilities.conf
%ghost /etc/systemd/system/scylla-server.service.d/mounts.conf
/etc/systemd/system/scylla-server.service.d/dependencies.conf
%ghost /etc/systemd/system/var-lib-systemd-coredump.mount
%ghost %config /etc/systemd/system/var-lib-systemd-coredump.mount
%ghost /etc/systemd/system/scylla-cpupower.service
%ghost /etc/systemd/system/var-lib-scylla.mount
%ghost %config /etc/systemd/system/var-lib-scylla.mount
%package conf
Group: Applications/Databases
@@ -205,9 +210,9 @@ if Scylla is the main application on your server and you wish to optimize its la
# We cannot use the sysctl_apply rpm macro because it is not present in 7.0
# following is a "manual" expansion
/usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
/usr/lib/systemd/systemd-sysctl 99-scylla-aio.conf >/dev/null 2>&1 || :
/usr/lib/systemd/systemd-sysctl 99-scylla-vm.conf >/dev/null 2>&1 || :
/usr/lib/systemd/systemd-sysctl 99-scylla-inotify.conf >/dev/null 2>&1 || :
/usr/lib/systemd/systemd-sysctl 99-scylla-aio.conf >/dev/null 2>&1 || :
%files kernel-conf
%defattr(-,root,root)
@@ -224,10 +229,18 @@ URL: https://github.com/prometheus/node_exporter
Prometheus exporter for machine metrics, written in Go with pluggable metric collectors.
%post node-exporter
%systemd_post node-exporter.service
if [ $1 -eq 1 ] ; then
/usr/bin/systemctl preset scylla-node-exporter.service ||:
fi
%preun node-exporter
%systemd_preun node-exporter.service
if [ $1 -eq 0 ] ; then
/usr/bin/systemctl --no-reload disable scylla-node-exporter.service ||:
/usr/bin/systemctl stop scylla-node-exporter.service ||:
fi
%postun node-exporter
/usr/bin/systemctl daemon-reload ||:
%files node-exporter
%defattr(-,root,root)


@@ -340,4 +340,18 @@ public:
unsupported_operation_exception(const sstring& msg) : std::runtime_error("unsupported operation: " + msg) {}
};
class function_execution_exception : public cassandra_exception {
public:
const sstring ks_name;
const sstring func_name;
const std::vector<sstring> args;
function_execution_exception(sstring func_name_, sstring detail, sstring ks_name_, std::vector<sstring> args_) noexcept
: cassandra_exception{exception_code::FUNCTION_FAILURE,
format("execution of {} failed: {}", func_name_, detail)}
, ks_name(std::move(ks_name_))
, func_name(std::move(func_name_))
, args(std::move(args_))
{ }
};
}


@@ -1445,7 +1445,7 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
logger.trace("marking as alive {}", addr);
// Do not mark a node with status shutdown as UP.
auto status = get_gossip_status(local_state);
auto status = sstring(get_gossip_status(local_state));
if (status == sstring(versioned_value::SHUTDOWN)) {
logger.warn("Skip marking node {} with status = {} as UP", addr, status);
return;
@@ -1464,6 +1464,8 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
return;
}
// Make a copy for endpoint_state because the code below can yield
endpoint_state state = local_state;
_live_endpoints.push_back(addr);
if (_endpoints_to_talk_with.empty()) {
_endpoints_to_talk_with.push_back({addr});
@@ -1475,8 +1477,8 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
logger.info("InetAddress {} is now UP, status = {}", addr, status);
}
_subscribers.for_each([addr, local_state] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
subscriber->on_alive(addr, local_state);
_subscribers.for_each([addr, state] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
subscriber->on_alive(addr, state);
logger.trace("Notified {}", fmt::ptr(subscriber.get()));
});
}
@@ -1485,11 +1487,12 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
void gossiper::mark_dead(inet_address addr, endpoint_state& local_state) {
logger.trace("marking as down {}", addr);
local_state.mark_dead();
endpoint_state state = local_state;
_live_endpoints.resize(std::distance(_live_endpoints.begin(), std::remove(_live_endpoints.begin(), _live_endpoints.end(), addr)));
_unreachable_endpoints[addr] = now();
logger.info("InetAddress {} is now DOWN, status = {}", addr, get_gossip_status(local_state));
_subscribers.for_each([addr, local_state] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
subscriber->on_dead(addr, local_state);
logger.info("InetAddress {} is now DOWN, status = {}", addr, get_gossip_status(state));
_subscribers.for_each([addr, state] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
subscriber->on_dead(addr, state);
logger.trace("Notified {}", fmt::ptr(subscriber.get()));
});
}
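The copies introduced above ("Make a copy for endpoint_state because the code below can yield") guard against the referenced state mutating while the fiber is suspended. The hazard reproduces in any cooperative framework; an asyncio illustration (illustrative only, not Scylla code):

```python
import asyncio

async def notify_with_reference(state, log):
    await asyncio.sleep(0)        # yield point: state may change under us
    log.append(state["status"])

async def notify_with_copy(state, log):
    snapshot = dict(state)        # copy before yielding, analogous to
    await asyncio.sleep(0)        # `endpoint_state state = local_state;`
    log.append(snapshot["status"])

async def main():
    log_ref, log_copy = [], []
    state = {"status": "UP"}
    t1 = asyncio.ensure_future(notify_with_reference(state, log_ref))
    t2 = asyncio.ensure_future(notify_with_copy(state, log_copy))
    await asyncio.sleep(0)        # let both tasks run up to their first yield
    state["status"] = "DOWN"      # mutation racing with the notifications
    await asyncio.gather(t1, t2)
    return log_ref, log_copy

log_ref, log_copy = asyncio.run(main())
```

The reference-holding notifier observes the racing mutation, while the copy-holding one still sees the snapshot taken before its yield point.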


@@ -62,7 +62,7 @@ struct appending_hash;
template<typename H, typename T, typename... Args>
requires Hasher<H>
inline
void feed_hash(H& h, const T& value, Args&&... args) noexcept {
void feed_hash(H& h, const T& value, Args&&... args) noexcept(noexcept(std::declval<appending_hash<T>>()(h, value, args...))) {
appending_hash<T>()(h, value, std::forward<Args>(args)...);
};


@@ -150,6 +150,10 @@ EOF
chmod +x "$install"
}
install() {
command install -Z "$@"
}
installconfig() {
local perm="$1"
local src="$2"
@@ -210,13 +214,13 @@ if [ -z "$python3" ]; then
fi
rpython3=$(realpath -m "$root/$python3")
if ! $nonroot; then
retc="$root/etc"
rsysconfdir="$root/$sysconfdir"
rusr="$root/usr"
rsystemd="$rusr/lib/systemd/system"
retc=$(realpath -m "$root/etc")
rsysconfdir=$(realpath -m "$root/$sysconfdir")
rusr=$(realpath -m "$root/usr")
rsystemd=$(realpath -m "$rusr/lib/systemd/system")
rdoc="$rprefix/share/doc"
rdata="$root/var/lib/scylla"
rhkdata="$root/var/lib/scylla-housekeeping"
rdata=$(realpath -m "$root/var/lib/scylla")
rhkdata=$(realpath -m "$root/var/lib/scylla-housekeeping")
else
retc="$rprefix/etc"
rsysconfdir="$rprefix/$sysconfdir"
@@ -245,6 +249,7 @@ if ! $nonroot; then
done
fi
# scylla-node-exporter
install -d -m755 "$rsysconfdir" "$rsystemd"
install -d -m755 "$rprefix"/node_exporter
install -d -m755 "$rprefix"/node_exporter/licenses
install -m755 node_exporter/node_exporter "$rprefix"/node_exporter
@@ -278,7 +283,6 @@ fi
# scylla-server
install -m755 -d "$rprefix"
install -m755 -d "$rsysconfdir"
install -m755 -d "$retc/scylla.d"
installconfig 644 dist/common/sysconfig/scylla-housekeeping "$rsysconfdir"
installconfig 644 dist/common/sysconfig/scylla-server "$rsysconfdir"
@@ -286,7 +290,7 @@ for file in dist/common/scylla.d/*.conf; do
installconfig 644 "$file" "$retc"/scylla.d
done
install -d -m755 "$retc"/scylla "$rsystemd" "$rprefix/bin" "$rprefix/libexec" "$rprefix/libreloc" "$rprefix/scripts" "$rprefix/bin"
install -d -m755 "$retc"/scylla "$rprefix/bin" "$rprefix/libexec" "$rprefix/libreloc" "$rprefix/scripts" "$rprefix/bin"
install -m644 dist/common/systemd/scylla-fstrim.service -Dt "$rsystemd"
install -m644 dist/common/systemd/scylla-housekeeping-daily.service -Dt "$rsystemd"
install -m644 dist/common/systemd/scylla-housekeeping-restart.service -Dt "$rsystemd"


@@ -273,8 +273,12 @@ void network_topology_strategy::validate_options() const {
}
std::optional<std::set<sstring>> network_topology_strategy::recognized_options() const {
// We explicitly allow all options
return std::nullopt;
std::set<sstring> datacenters;
for (const auto& [dc_name, endpoints] : _shared_token_metadata.get()->get_topology().get_datacenter_endpoints()) {
datacenters.insert(dc_name);
}
// We only allow datacenter names as options
return datacenters;
}
using registry = class_registrator<abstract_replication_strategy, network_topology_strategy, const sstring&, const shared_token_metadata&, snitch_ptr&, const std::map<sstring, sstring>&>;
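`recognized_options()` now returns the set of datacenter names known to the topology instead of `std::nullopt` (which meant "allow any option"), so strategy-option validation can reject misspelled DC names. The idea in sketch form (hypothetical helper names, not the C++ API):

```python
def recognized_options(topology):
    # NetworkTopologyStrategy only recognizes datacenter names as option
    # keys; `topology` is a mapping of DC name -> endpoints.
    return set(topology.keys())

def validate_options(options, topology):
    # Reject any option key that is not a known datacenter.
    unknown = set(options) - recognized_options(topology)
    if unknown:
        raise ValueError("unrecognized strategy options: %s" % sorted(unknown))
```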


@@ -26,6 +26,7 @@
#include "mutation_reader.hh"
#include <seastar/core/future-util.hh>
#include <seastar/core/coroutine.hh>
#include "flat_mutation_reader.hh"
#include "schema_registry.hh"
#include "mutation_compactor.hh"
@@ -1176,6 +1177,9 @@ flat_mutation_reader evictable_reader::recreate_reader() {
_range_override.reset();
_slice_override.reset();
_drop_partition_start = false;
_drop_static_row = false;
if (_last_pkey) {
bool partition_range_is_inclusive = true;
@@ -1261,13 +1265,25 @@ void evictable_reader::maybe_validate_partition_start(const flat_mutation_reader
// is in range.
if (_last_pkey) {
const auto cmp_res = tri_cmp(*_last_pkey, ps.key());
if (_drop_partition_start) { // should be the same partition
if (_drop_partition_start) { // we expect to continue from the same partition
// We cannot assume the partition we stopped the read at is still alive
// when we recreate the reader. It might have been compacted away in the
// meanwhile, so allow for a larger partition too.
require(
cmp_res == 0,
"{}(): validation failed, expected partition with key equal to _last_pkey {} due to _drop_partition_start being set, but got {}",
cmp_res <= 0,
"{}(): validation failed, expected partition with key larger or equal to _last_pkey {} due to _drop_partition_start being set, but got {}",
__FUNCTION__,
*_last_pkey,
ps.key());
// Reset drop flags and next pos if we are not continuing from the same partition
if (cmp_res < 0) {
// Close previous partition, we are not going to continue it.
push_mutation_fragment(*_schema, _permit, partition_end{});
_drop_partition_start = false;
_drop_static_row = false;
_next_position_in_partition = position_in_partition::for_partition_start();
_trim_range_tombstones = false;
}
} else { // should be a larger partition
require(
cmp_res < 0,
@@ -1318,9 +1334,14 @@ bool evictable_reader::should_drop_fragment(const mutation_fragment& mf) {
_drop_partition_start = false;
return true;
}
if (_drop_static_row && mf.is_static_row()) {
_drop_static_row = false;
return true;
// Unlike partition-start above, a partition is not guaranteed to have a
// static row fragment. So reset the flag regardless of whether we could
// drop one or not.
// We are guaranteed to get here only right after dropping a partition-start,
// so if we are not seeing a static row here, the partition doesn't have one.
if (_drop_static_row) {
_drop_static_row = false;
return mf.is_static_row();
}
return false;
}
@@ -1505,18 +1526,18 @@ future<> evictable_reader::fast_forward_to(const dht::partition_range& pr, db::t
_end_of_stream = false;
if (_reader) {
return _reader->fast_forward_to(pr, timeout);
co_await _reader->fast_forward_to(pr, timeout);
_range_override.reset();
co_return;
}
if (!_reader_created || !_irh) {
return make_ready_future<>();
co_return;
}
if (auto reader_opt = try_resume()) {
auto f = reader_opt->fast_forward_to(pr, timeout);
return f.then([this, reader = std::move(*reader_opt)] () mutable {
maybe_pause(std::move(reader));
});
co_await reader_opt->fast_forward_to(pr, timeout);
_range_override.reset();
maybe_pause(std::move(*reader_opt));
}
return make_ready_future<>();
}
evictable_reader_handle::evictable_reader_handle(evictable_reader& r) : _r(&r)
@@ -1569,8 +1590,8 @@ class shard_reader : public enable_lw_shared_from_this<shard_reader>, public fla
private:
shared_ptr<reader_lifecycle_policy> _lifecycle_policy;
const unsigned _shard;
const dht::partition_range* _pr;
const query::partition_slice& _ps;
dht::partition_range _pr;
query::partition_slice _ps;
const io_priority_class& _pc;
tracing::global_trace_state_ptr _trace_state;
const mutation_reader::forwarding _fwd_mr;
@@ -1596,7 +1617,7 @@ public:
: impl(std::move(schema), std::move(permit))
, _lifecycle_policy(std::move(lifecycle_policy))
, _shard(shard)
, _pr(&pr)
, _pr(pr)
, _ps(ps)
, _pc(pc)
, _trace_state(std::move(trace_state))
@@ -1681,7 +1702,7 @@ future<> shard_reader::do_fill_buffer(db::timeout_clock::time_point timeout) {
});
auto s = gs.get();
auto rreader = make_foreign(std::make_unique<evictable_reader>(evictable_reader::auto_pause::yes, std::move(ms),
s, _lifecycle_policy->semaphore().make_permit(s.get(), "shard-reader"), *_pr, _ps, _pc, _trace_state, _fwd_mr));
s, _lifecycle_policy->semaphore().make_permit(s.get(), "shard-reader"), _pr, _ps, _pc, _trace_state, _fwd_mr));
tracing::trace(_trace_state, "Creating shard reader on shard: {}", this_shard_id());
auto f = rreader->fill_buffer(timeout);
return f.then([rreader = std::move(rreader)] () mutable {
@@ -1730,7 +1751,7 @@ void shard_reader::next_partition() {
}
future<> shard_reader::fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) {
_pr = &pr;
_pr = pr;
if (!_reader && !_read_ahead) {
// No need to fast-forward uncreated readers, they will be passed the new
@@ -1739,12 +1760,12 @@ future<> shard_reader::fast_forward_to(const dht::partition_range& pr, db::timeo
}
auto f = _read_ahead ? *std::exchange(_read_ahead, std::nullopt) : make_ready_future<>();
return f.then([this, &pr, timeout] {
return f.then([this, timeout] {
_end_of_stream = false;
clear_buffer();
return smp::submit_to(_shard, [this, &pr, timeout] {
return _reader->fast_forward_to(pr, timeout);
return smp::submit_to(_shard, [this, timeout] {
return _reader->fast_forward_to(_pr, timeout);
});
});
}
@@ -2308,9 +2329,9 @@ position_reader_queue::~position_reader_queue() {}
// are not implemented and throw an error; the reader is only used for single partition queries.
//
// Assumes that:
// - the queue contains at least one reader,
// - there are no static rows,
// - the returned fragments do not contain partition tombstones.
// - the returned fragments do not contain partition tombstones,
// - the merged readers return fragments from the same partition (but some or even all of them may be empty).
class clustering_order_reader_merger {
const schema_ptr _schema;
const reader_permit _permit;
@@ -2422,12 +2443,17 @@ class clustering_order_reader_merger {
if (!mf) {
// The reader returned end-of-stream before returning end-of-partition
// (otherwise we would have removed it in a previous peek). This means that
// we are in forwarding mode and the reader won't return any more fragments in the current range.
// either the reader was empty from the beginning (not even returning a `partition_start`)
// or we are in forwarding mode and the reader won't return any more fragments in the current range.
// If the reader's upper bound is smaller than the end of the current range then it won't
// return any more fragments in later ranges as well (subsequent fast-forward-to ranges
// are non-overlapping and strictly increasing), so we can remove it now.
// Otherwise it may start returning fragments later, so we save it for the moment
// in _halted_readers and will bring it back when we get fast-forwarded.
// Otherwise, if it previously returned a `partition_start`, it may start returning more fragments
// later (after we fast-forward) so we save it for the moment in _halted_readers and will bring it
// back when we get fast-forwarded.
// We also save the reader if it was empty from the beginning (no `partition_start`) since
// it makes the code simpler (to check for this here we would need additional state); it is a bit wasteful
// but completely empty readers should be rare.
if (_cmp(it->upper_bound, _pr_end) < 0) {
_all_readers.erase(it);
} else {
@@ -2557,19 +2583,6 @@ public:
: position_in_partition_view::after_all_clustered_rows())
, _should_emit_partition_end(fwd_sm == streamed_mutation::forwarding::no)
{
// The first call to `_reader_queue::pop` uses `after_all_clustered_rows`
// so we obtain at least one reader; we will return this reader's `partition_start`
// as the first fragment.
auto rs = _reader_queue->pop(position_in_partition_view::after_all_clustered_rows());
for (auto& r: rs) {
_all_readers.push_front(std::move(r));
_unpeeked_readers.push_back(_all_readers.begin());
}
if (rs.empty()) {
// No readers, no partition.
_should_emit_partition_end = false;
}
}
// We assume that operator() is called sequentially and that the caller doesn't use the batch
@@ -2586,8 +2599,22 @@ public:
return peek_readers(timeout).then([this, timeout] { return (*this)(timeout); });
}
auto next_peeked_pos = _peeked_readers.empty() ? _pr_end : _peeked_readers.front()->reader.peek_buffer().position();
// There might be queued readers containing fragments with positions <= next_peeked_pos:
// Before we return a batch of fragments using currently opened readers we must check the queue
// for potential new readers that must be opened. There are three cases which determine how ``far''
// we should look:
// - If there are some peeked readers in the heap, we must check for new readers
// whose `min_position`s are <= the position of the first peeked reader; there is no need
// to check for ``later'' readers (yet).
// - Otherwise, if we already fetched a partition start fragment, we need to look no further
// than the end of the current position range (_pr_end).
// - Otherwise we need to look for any reader (by calling the queue with `after_all_clustered_rows`),
// even for readers whose `min_position`s may be outside the current position range since they
// may be the only readers which have a `partition_start` fragment which we need to return
// before end-of-stream.
auto next_peeked_pos =
_peeked_readers.empty()
? (_partition_start_fetched ? _pr_end : position_in_partition_view::after_all_clustered_rows())
: _peeked_readers.front()->reader.peek_buffer().position();
if (!_reader_queue->empty(next_peeked_pos)) {
auto rs = _reader_queue->pop(next_peeked_pos);
for (auto& r: rs) {
@@ -2601,8 +2628,11 @@ public:
// We are either in forwarding mode and waiting for a fast-forward,
// or we've exhausted all the readers.
if (_should_emit_partition_end) {
// Not forwarding, so all readers must be exhausted. Return the last fragment.
_current_batch.push_back(mutation_fragment(*_schema, _permit, partition_end()));
// Not forwarding, so all readers must be exhausted.
// Return a partition end fragment unless all readers have been empty from the beginning.
if (_partition_start_fetched) {
_current_batch.push_back(mutation_fragment(*_schema, _permit, partition_end()));
}
_should_emit_partition_end = false;
}
return make_ready_future<mutation_fragment_batch>(_current_batch);


@@ -57,6 +57,8 @@ future<> feed_writer(flat_mutation_reader&& rd, Writer&& wr) {
auto f2 = rd.is_buffer_empty() ? rd.fill_buffer(db::no_timeout) : make_ready_future<>();
return when_all_succeed(std::move(f1), std::move(f2)).discard_result();
});
}).then([&wr] {
wr.consume_end_of_stream();
}).then_wrapped([&wr] (future<> f) {
if (f.failed()) {
auto ex = f.get_exception();
@@ -70,7 +72,6 @@ future<> feed_writer(flat_mutation_reader&& rd, Writer&& wr) {
return make_exception_future<>(std::move(ex));
});
} else {
wr.consume_end_of_stream();
return wr.close();
}
});
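The fix moves `consume_end_of_stream()` out of the success-only branch and into the guarded continuation chain, so an exception thrown by it takes the same abort path as a failure during consumption. The corrected control flow can be sketched synchronously (illustrative stand-ins, not the Scylla API):

```python
class Writer:
    """Toy writer recording its lifecycle events; stand-in for the real one."""
    def __init__(self, fail_at_eos=False):
        self.fail_at_eos = fail_at_eos
        self.events = []
    def consume(self, fragment):
        self.events.append(fragment)
    def consume_end_of_stream(self):
        self.events.append("eos")
        if self.fail_at_eos:
            raise RuntimeError("eos failed")
    def abort(self):
        self.events.append("abort")
    def close(self):
        self.events.append("close")

def feed_writer(read_all, writer):
    # consume_end_of_stream() may throw, so it must run inside the guarded
    # region: any failure, including one from end-of-stream handling,
    # aborts the writer instead of leaking a half-closed one.
    try:
        for fragment in read_all():
            writer.consume(fragment)
        writer.consume_end_of_stream()
    except Exception:
        writer.abort()
        raise
    writer.close()
```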


@@ -267,9 +267,14 @@ public:
return _current_tombstone;
}
const std::deque<range_tombstone>& range_tombstones_for_row(const clustering_key_prefix& ck) {
std::vector<range_tombstone> range_tombstones_for_row(const clustering_key_prefix& ck) {
drop_unneeded_tombstones(ck);
return _range_tombstones;
std::vector<range_tombstone> result(_range_tombstones.begin(), _range_tombstones.end());
auto cmp = [&] (const range_tombstone& rt1, const range_tombstone& rt2) {
return _cmp(rt1.start_bound(), rt2.start_bound());
};
std::sort(result.begin(), result.end(), cmp);
return result;
}
std::deque<range_tombstone> range_tombstones() && {


@@ -141,6 +141,7 @@ class read_context final : public enable_lw_shared_from_this<read_context> {
mutation_source_opt _underlying_snapshot;
dht::partition_range _sm_range;
std::optional<dht::decorated_key> _key;
bool _partition_exists;
row_cache::phase_type _phase;
public:
read_context(row_cache& cache,
@@ -189,22 +190,34 @@ public:
autoupdating_underlying_reader& underlying() { return _underlying; }
row_cache::phase_type phase() const { return _phase; }
const dht::decorated_key& key() const { return *_key; }
bool partition_exists() const { return _partition_exists; }
void on_underlying_created() { ++_underlying_created; }
bool digest_requested() const { return _slice.options.contains<query::partition_slice::option::with_digest>(); }
public:
future<> ensure_underlying(db::timeout_clock::time_point timeout) {
if (_underlying_snapshot) {
return create_underlying(true, timeout);
return create_underlying(timeout).then([this, timeout] {
return _underlying.underlying()(timeout).then([this] (mutation_fragment_opt&& mfopt) {
_partition_exists = bool(mfopt);
});
});
}
// We know the partition exists because all callers of
// enter_partition(const dht::decorated_key&, row_cache::phase_type)
// check that, and there is no other way of setting _underlying_snapshot
// to empty except by calling create_underlying().
_partition_exists = true;
return make_ready_future<>();
}
public:
future<> create_underlying(bool skip_first_fragment, db::timeout_clock::time_point timeout);
future<> create_underlying(db::timeout_clock::time_point timeout);
void enter_partition(const dht::decorated_key& dk, mutation_source& snapshot, row_cache::phase_type phase) {
_phase = phase;
_underlying_snapshot = snapshot;
_key = dk;
}
// Precondition: each caller must ensure that a partition with the |dk| key
// exists in the underlying source before calling this function.
void enter_partition(const dht::decorated_key& dk, row_cache::phase_type phase) {
_phase = phase;
_underlying_snapshot = {};


@@ -76,7 +76,7 @@ class reader_permit::impl : public boost::intrusive::list_base_hook<boost::intru
sstring _op_name;
std::string_view _op_name_view;
reader_resources _resources;
reader_permit::state _state = reader_permit::state::registered;
reader_permit::state _state = reader_permit::state::active;
public:
struct value_tag {};
@@ -124,22 +124,17 @@ public:
}
void on_admission() {
_state = reader_permit::state::admitted;
_semaphore.consume(_resources);
_state = reader_permit::state::active;
}
void consume(reader_resources res) {
_resources += res;
if (_state == reader_permit::state::admitted) {
_semaphore.consume(res);
}
_semaphore.consume(res);
}
void signal(reader_resources res) {
_resources -= res;
if (_state == reader_permit::state::admitted) {
_semaphore.signal(res);
}
_semaphore.signal(res);
}
reader_resources resources() const {
@@ -206,14 +201,11 @@ reader_resources reader_permit::consumed_resources() const {
std::ostream& operator<<(std::ostream& os, reader_permit::state s) {
switch (s) {
case reader_permit::state::registered:
os << "registered";
break;
case reader_permit::state::waiting:
os << "waiting";
break;
case reader_permit::state::admitted:
os << "admitted";
case reader_permit::state::active:
os << "active";
break;
}
return os;
@@ -250,7 +242,7 @@ struct permit_group_key_hash {
using permit_groups = std::unordered_map<permit_group_key, permit_stats, permit_group_key_hash>;
static permit_stats do_dump_reader_permit_diagnostics(std::ostream& os, const permit_groups& permits, reader_permit::state state, bool sort_by_memory) {
static permit_stats do_dump_reader_permit_diagnostics(std::ostream& os, const permit_groups& permits, reader_permit::state state) {
struct permit_summary {
const schema* s;
std::string_view op_name;
@@ -266,25 +258,17 @@ static permit_stats do_dump_reader_permit_diagnostics(std::ostream& os, const pe
}
}
std::ranges::sort(permit_summaries, [sort_by_memory] (const permit_summary& a, const permit_summary& b) {
if (sort_by_memory) {
return a.memory < b.memory;
} else {
return a.count < b.count;
}
std::ranges::sort(permit_summaries, [] (const permit_summary& a, const permit_summary& b) {
return a.memory < b.memory;
});
permit_stats total;
auto print_line = [&os, sort_by_memory] (auto col1, auto col2, auto col3) {
if (sort_by_memory) {
fmt::print(os, "{}\t{}\t{}\n", col2, col1, col3);
} else {
fmt::print(os, "{}\t{}\t{}\n", col1, col2, col3);
}
auto print_line = [&os] (auto col1, auto col2, auto col3) {
fmt::print(os, "{}\t{}\t{}\n", col2, col1, col3);
};
fmt::print(os, "Permits with state {}, sorted by {}\n", state, sort_by_memory ? "memory" : "count");
fmt::print(os, "Permits with state {}\n", state);
print_line("count", "memory", "name");
for (const auto& summary : permit_summaries) {
total.count += summary.count;
@@ -310,11 +294,9 @@ static void do_dump_reader_permit_diagnostics(std::ostream& os, const reader_con
permit_stats total;
fmt::print(os, "Semaphore {}: {}, dumping permit diagnostics:\n", semaphore.name(), problem);
total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::admitted, true);
total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::active);
fmt::print(os, "\n");
total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::waiting, false);
fmt::print(os, "\n");
total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::registered, false);
total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::waiting);
fmt::print(os, "\n");
fmt::print(os, "Total: permits: {}, memory: {}\n", total.count, utils::to_hr_size(total.memory));
}
@@ -375,7 +357,7 @@ reader_concurrency_semaphore::~reader_concurrency_semaphore() {
reader_concurrency_semaphore::inactive_read_handle reader_concurrency_semaphore::register_inactive_read(std::unique_ptr<inactive_read> ir) {
// Implies _inactive_reads.empty(), we don't queue new readers before
// evicting all inactive reads.
if (_wait_list.empty()) {
if (_wait_list.empty() && _resources.memory > 0) {
const auto [it, _] = _inactive_reads.emplace(_next_id++, std::move(ir));
(void)_;
++_stats.inactive_reads;
@@ -425,13 +407,13 @@ bool reader_concurrency_semaphore::try_evict_one_inactive_read() {
}
bool reader_concurrency_semaphore::has_available_units(const resources& r) const {
return bool(_resources) && _resources >= r;
// Special case: when there is no active reader (based on count) admit one
// regardless of availability of memory.
return (bool(_resources) && _resources >= r) || _resources.count == _initial_resources.count;
}
bool reader_concurrency_semaphore::may_proceed(const resources& r) const {
// Special case: when there is no active reader (based on count) admit one
// regardless of availability of memory.
return _wait_list.empty() && (has_available_units(r) || _resources.count == _initial_resources.count);
return _wait_list.empty() && has_available_units(r);
}
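The admission special case moves from `may_proceed()` into `has_available_units()`: when no reader currently holds a count unit, one reader is admitted even if the memory budget is exhausted, which guarantees forward progress. A simplified model of the check (not the real `reader_concurrency_semaphore`):

```python
class Resources:
    def __init__(self, count, memory):
        self.count, self.memory = count, memory
    def __bool__(self):
        # "Has resources" means both dimensions are positive.
        return self.count > 0 and self.memory > 0
    def __ge__(self, other):
        return self.count >= other.count and self.memory >= other.memory

class Semaphore:
    def __init__(self, count, memory):
        self._initial = Resources(count, memory)
        self._resources = Resources(count, memory)

    def has_available_units(self, r):
        # Special case: when there is no active reader (based on count),
        # admit one regardless of availability of memory.
        no_active_reader = self._resources.count == self._initial.count
        return (bool(self._resources) and self._resources >= r) or no_active_reader
```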
future<reader_permit::resource_units> reader_concurrency_semaphore::do_wait_admission(reader_permit permit, size_t memory,
@@ -482,6 +464,12 @@ void reader_concurrency_semaphore::broken(std::exception_ptr ex) {
}
}
std::string reader_concurrency_semaphore::dump_diagnostics() const {
std::ostringstream os;
do_dump_reader_permit_diagnostics(os, *this, *_permit_list, "user request");
return os.str();
}
// A file that tracks the memory usage of buffers resulting from read
// operations.
class tracking_file_impl : public file_impl {


@@ -237,4 +237,6 @@ public:
}
void broken(std::exception_ptr ex);
std::string dump_diagnostics() const;
};


@@ -91,9 +91,8 @@ public:
class resource_units;
enum class state {
registered, // read is registered, but didn't attempt admission yet
waiting, // waiting for admission
admitted,
active,
};
class impl;


@@ -311,7 +311,7 @@ float node_ops_metrics::repair_finished_percentage() {
tracker::tracker(size_t nr_shards, size_t max_repair_memory)
: _shutdown(false)
, _repairs(nr_shards) {
auto nr = std::max(size_t(1), size_t(max_repair_memory / max_repair_memory_per_range()));
auto nr = std::max(size_t(1), size_t(max_repair_memory / max_repair_memory_per_range() / 4));
rlogger.info("Setting max_repair_memory={}, max_repair_memory_per_range={}, max_repair_ranges_in_parallel={}",
max_repair_memory, max_repair_memory_per_range(), nr);
_range_parallelism_semaphores.reserve(nr_shards);
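The tracker change above divides the parallelism cap by an additional factor of 4, leaving headroom so concurrent range repairs cannot consume the whole memory budget. A sketch of the arithmetic (names mirror the C++ code; the constant 4 comes from the diff above):

```python
def max_repair_ranges_in_parallel(max_repair_memory, max_repair_memory_per_range):
    # At least one range must always be repairable, hence the max(1, ...).
    return max(1, max_repair_memory // max_repair_memory_per_range // 4)
```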
@@ -1783,6 +1783,7 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, seastar::sharded<
auto& strat = ks.get_replication_strategy();
dht::token_range_vector desired_ranges = strat.get_pending_address_ranges(tmptr, tokens, myip, utils::can_yield::yes);
bool find_node_in_local_dc_only = strat.get_type() == locator::replication_strategy_type::network_topology;
bool everywhere_topology = strat.get_type() == locator::replication_strategy_type::everywhere_topology;
//Active ranges
auto metadata_clone = tmptr->clone_only_token_map().get0();
@@ -1860,7 +1861,9 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, seastar::sharded<
};
auto old_endpoints_in_local_dc = get_old_endpoints_in_local_dc();
auto rf_in_local_dc = get_rf_in_local_dc();
if (old_endpoints.size() == strat.get_replication_factor()) {
if (everywhere_topology) {
neighbors = old_endpoints_in_local_dc;
} else if (old_endpoints.size() == strat.get_replication_factor()) {
// For example, with RF = 3 and 3 nodes n1, n2, n3
// in the cluster, n4 is bootstrapped, old_replicas
// = {n1, n2, n3}, new_replicas = {n1, n2, n4}, n3


@@ -327,7 +327,7 @@ public:
}
};
future<> read_context::create_underlying(bool skip_first_fragment, db::timeout_clock::time_point timeout) {
future<> read_context::create_underlying(db::timeout_clock::time_point timeout) {
if (_range_query) {
// FIXME: Singular-range mutation readers don't support fast_forward_to(), so need to use a wide range
// here in case the same reader will need to be fast forwarded later.
@@ -335,13 +335,8 @@ future<> read_context::create_underlying(bool skip_first_fragment, db::timeout_c
} else {
_sm_range = dht::partition_range::make_singular({dht::ring_position(*_key)});
}
return _underlying.fast_forward_to(std::move(_sm_range), *_underlying_snapshot, _phase, timeout).then([this, skip_first_fragment, timeout] {
return _underlying.fast_forward_to(std::move(_sm_range), *_underlying_snapshot, _phase, timeout).then([this] {
_underlying_snapshot = {};
if (skip_first_fragment) {
return _underlying.underlying()(timeout).then([](auto &&mf) {});
} else {
return make_ready_future<>();
}
});
}
@@ -361,7 +356,7 @@ private:
auto src_and_phase = _cache.snapshot_of(_read_context->range().start()->value());
auto phase = src_and_phase.phase;
_read_context->enter_partition(_read_context->range().start()->value().as_decorated_key(), src_and_phase.snapshot, phase);
return _read_context->create_underlying(false, timeout).then([this, phase, timeout] {
return _read_context->create_underlying(timeout).then([this, phase, timeout] {
return _read_context->underlying().underlying()(timeout).then([this, phase] (auto&& mfopt) {
if (!mfopt) {
if (phase == _cache.phase_of(_read_context->range().start()->value())) {
@@ -722,7 +717,7 @@ row_cache::make_reader(schema_ptr s,
auto&& pos = ctx->range().start()->value();
partitions_type::bound_hint hint;
auto i = _partitions.lower_bound(pos, cmp, hint);
if (i != _partitions.end() && hint.match) {
if (hint.match) {
cache_entry& e = *i;
upgrade_entry(e);
on_partition_hit();

Submodule seastar updated: 2c884a7449...1fb2187322

@@ -53,6 +53,7 @@
#include "database.hh"
#include "db/schema_tables.hh"
#include "types/user.hh"
#include "db/schema_tables.hh"
namespace service {
@@ -1070,8 +1071,19 @@ future<schema_ptr> get_schema_definition(table_schema_version v, netw::messaging
// referenced by the incoming request.
// That means the column mapping for the schema should always be inserted
// with TTL (refresh TTL in case column mapping already existed prior to that).
return db::schema_tables::store_column_mapping(proxy, s.unfreeze(db::schema_ctxt(proxy)), true).then([s] {
return s;
auto us = s.unfreeze(db::schema_ctxt(proxy));
// if this is a view - we might need to fix its schema before registering it.
if (us->is_view()) {
auto& db = proxy.local().local_db();
schema_ptr base_schema = db.find_schema(us->view_info()->base_id());
auto fixed_view = db::schema_tables::maybe_fix_legacy_secondary_index_mv_schema(db, view_ptr(us), base_schema,
db::schema_tables::preserve_version::yes);
if (fixed_view) {
us = fixed_view;
}
}
return db::schema_tables::store_column_mapping(proxy, us, true).then([us] {
return frozen_schema{us};
});
});
}).then([] (schema_ptr s) {
@@ -1079,7 +1091,7 @@ future<schema_ptr> get_schema_definition(table_schema_version v, netw::messaging
// table.
if (s->is_view()) {
if (!s->view_info()->base_info()) {
auto& db = service::get_local_storage_proxy().get_db().local();
auto& db = service::get_local_storage_proxy().local_db();
// This line might throw a no_such_column_family
// It should be fine since if we tried to register a view for which
// we don't know the base table, our registry is broken.

@@ -2532,7 +2532,13 @@ future<> storage_service::restore_replica_count(inet_address endpoint, inet_addr
}
return seastar::async([this, endpoint, notify_endpoint] {
auto tmptr = get_token_metadata_ptr();
auto streamer = make_lw_shared<dht::range_streamer>(_db, tmptr, _abort_source, get_broadcast_address(), "Restore_replica_count", streaming::stream_reason::removenode);
abort_source as;
auto sub = _abort_source.subscribe([&as] () noexcept {
if (!as.abort_requested()) {
as.request_abort();
}
});
auto streamer = make_lw_shared<dht::range_streamer>(_db, tmptr, as, get_broadcast_address(), "Restore_replica_count", streaming::stream_reason::removenode);
auto my_address = get_broadcast_address();
auto non_system_keyspaces = _db.local().get_non_system_keyspaces();
for (const auto& keyspace_name : non_system_keyspaces) {
@@ -2550,6 +2556,42 @@ future<> storage_service::restore_replica_count(inet_address endpoint, inet_addr
}
streamer->add_rx_ranges(keyspace_name, std::move(ranges_per_endpoint));
}
auto status_checker = seastar::async([this, endpoint, &as] {
slogger.info("restore_replica_count: Started status checker for removing node {}", endpoint);
while (!as.abort_requested()) {
auto status = _gossiper.get_gossip_status(endpoint);
// If the node to be removed is already in removed status, it has
// probably been removed forcibly with `nodetool removenode force`.
// Abort restore_replica_count in that case to avoid a streaming
// attempt, since the user has removed the node forcibly.
if (status == sstring(versioned_value::REMOVED_TOKEN)) {
slogger.info("restore_replica_count: Detected node {} has left the cluster, status={}, abort restore_replica_count for removing node {}",
endpoint, status, endpoint);
if (!as.abort_requested()) {
as.request_abort();
}
return;
}
slogger.debug("restore_replica_count: Sleep and detect removing node {}, status={}", endpoint, status);
sleep_abortable(std::chrono::seconds(10), as).get();
}
});
auto stop_status_checker = defer([endpoint, &status_checker, &as] () mutable {
try {
slogger.info("restore_replica_count: Started to stop status checker for removing node {}", endpoint);
if (!as.abort_requested()) {
as.request_abort();
}
status_checker.get();
} catch (const seastar::sleep_aborted& ignored) {
slogger.debug("restore_replica_count: Got sleep_abort to stop status checker for removing node {}: {}", endpoint, ignored);
} catch (...) {
slogger.warn("restore_replica_count: Found error in status checker for removing node {}: {}",
endpoint, std::current_exception());
}
slogger.info("restore_replica_count: Finished to stop status checker for removing node {}", endpoint);
});
streamer->stream_async().then_wrapped([this, streamer, notify_endpoint] (auto&& f) {
try {
f.get();

@@ -440,7 +440,6 @@ protected:
mutation_source_metadata _ms_metadata = {};
garbage_collected_sstable_writer::data _gc_sstable_writer_data;
compaction_sstable_replacer_fn _replacer;
std::optional<compaction_weight_registration> _weight_registration;
utils::UUID _run_identifier;
::io_priority_class _io_priority;
// optional clone of sstable set to be used for expiration purposes, so it will be set if expiration is enabled.
@@ -459,7 +458,6 @@ protected:
, _sstable_level(descriptor.level)
, _gc_sstable_writer_data(*this)
, _replacer(std::move(descriptor.replacer))
, _weight_registration(std::move(descriptor.weight_registration))
, _run_identifier(descriptor.run_identifier)
, _io_priority(descriptor.io_priority)
, _sstable_set(std::move(descriptor.all_sstables_snapshot))
@@ -919,9 +917,6 @@ public:
}
virtual void on_end_of_compaction() override {
if (_weight_registration) {
_cf.get_compaction_manager().on_compaction_complete(*_weight_registration);
}
replace_remaining_exhausted_sstables();
}

@@ -134,8 +134,6 @@ struct compaction_descriptor {
uint64_t max_sstable_bytes;
// Run identifier of output sstables.
utils::UUID run_identifier;
// Holds ownership of a weight assigned to this compaction iff it's a regular one.
std::optional<compaction_weight_registration> weight_registration;
// Calls compaction manager's task for this compaction to release reference to exhausted sstables.
std::function<void(const std::vector<shared_sstable>& exhausted_sstables)> release_exhausted;
// The options passed down to the compaction code.

@@ -131,9 +131,12 @@ static inline int calculate_weight(uint64_t total_size) {
// comes along, you do them in parallel.
// TODO: Find a possibly better log base through experimentation.
static constexpr int WEIGHT_LOG_BASE = 4;
// Fixed tax is added to size before taking the log, to make sure all jobs
// smaller than the tax (i.e. 1MB) will be serialized.
static constexpr int fixed_size_tax = 1024*1024;
// computes the logarithm (base WEIGHT_LOG_BASE) of total_size.
return int(std::log(total_size) / std::log(WEIGHT_LOG_BASE));
return int(std::log(total_size + fixed_size_tax) / std::log(WEIGHT_LOG_BASE));
}
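The effect of the fixed tax can be seen numerically: with a 1 MB tax, a 512 KB job and a 1 MB job land in the same weight bucket (so they serialize), whereas without the tax they would get weights 9 and 10 and could run in parallel. A sketch of the computation:

```python
import math

WEIGHT_LOG_BASE = 4
FIXED_SIZE_TAX = 1024 * 1024  # 1 MB

def calculate_weight(total_size: int) -> int:
    # log base WEIGHT_LOG_BASE of (size + tax); the tax pushes all jobs
    # smaller than ~1 MB into the same weight bucket, serializing them.
    return int(math.log(total_size + FIXED_SIZE_TAX) / math.log(WEIGHT_LOG_BASE))

print(calculate_weight(512 * 1024))   # → 10
print(calculate_weight(1024 * 1024))  # → 10  (same bucket: serialized)
```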
static inline int calculate_weight(const std::vector<sstables::shared_sstable>& sstables) {
@@ -311,6 +314,7 @@ future<> compaction_manager::run_custom_job(column_family* cf, sstring name, non
cmlog.info("{} was abruptly stopped, reason: {}", name, e.what());
} catch (...) {
cmlog.error("{} failed: {}", name, std::current_exception());
throw;
}
});
return task->compaction_done.get_future().then([task] {});
@@ -435,7 +439,7 @@ void compaction_manager::reevaluate_postponed_compactions() {
}
void compaction_manager::postpone_compaction_for_column_family(column_family* cf) {
_postponed.push_back(cf);
_postponed.insert(cf);
}
future<> compaction_manager::stop_ongoing_compactions(sstring reason) {
@@ -575,7 +579,7 @@ void compaction_manager::submit(column_family* cf) {
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
auto compacting = make_lw_shared<compacting_sstable_registration>(this, descriptor.sstables);
descriptor.weight_registration = compaction_weight_registration(this, weight);
auto weight_r = compaction_weight_registration(this, weight);
descriptor.release_exhausted = [compacting] (const std::vector<sstables::shared_sstable>& exhausted_sstables) {
compacting->release_compacting(exhausted_sstables);
};
@@ -585,7 +589,7 @@ void compaction_manager::submit(column_family* cf) {
_stats.pending_tasks--;
_stats.active_tasks++;
task->compaction_running = true;
return cf.run_compaction(std::move(descriptor)).then_wrapped([this, task, compacting = std::move(compacting)] (future<> f) mutable {
return cf.run_compaction(std::move(descriptor)).then_wrapped([this, task, compacting = std::move(compacting), weight_r = std::move(weight_r)] (future<> f) mutable {
_stats.active_tasks--;
task->compaction_running = false;
@@ -798,12 +802,15 @@ future<> compaction_manager::remove(column_family* cf) {
task->stopping = true;
}
}
_postponed.erase(boost::remove(_postponed, cf), _postponed.end());
_postponed.erase(cf);
// Wait for the termination of an ongoing compaction on cf, if any.
return do_for_each(*tasks_to_stop, [this, cf] (auto& task) {
return this->task_stop(task);
}).then([this, cf, tasks_to_stop] {
#ifdef DEBUG
assert(std::find_if(_tasks.begin(), _tasks.end(), [cf] (auto& task) { return task->compacting_cf == cf; }) == _tasks.end());
#endif
_compaction_locks.erase(cf);
});
}
@@ -834,11 +841,6 @@ void compaction_manager::stop_compaction(sstring type) {
}
}
void compaction_manager::on_compaction_complete(compaction_weight_registration& weight_registration) {
weight_registration.deregister();
reevaluate_postponed_compactions();
}
void compaction_manager::propagate_replacement(column_family* cf,
const std::vector<sstables::shared_sstable>& removed, const std::vector<sstables::shared_sstable>& added) {
for (auto& info : _compactions) {

@@ -99,7 +99,7 @@ private:
future<> _waiting_reevalution = make_ready_future<>();
condition_variable _postponed_reevaluation;
// column families that wait for compaction but had its submission postponed due to ongoing compaction.
std::vector<column_family*> _postponed;
std::unordered_set<column_family*> _postponed;
// tracks taken weights of ongoing compactions, only one compaction per weight is allowed.
// weight is value assigned to a compaction job that is log base N of total size of all input sstables.
std::unordered_set<int> _weight_tracker;
@@ -256,11 +256,6 @@ public:
// Stops ongoing compaction of a given type.
void stop_compaction(sstring type);
// Called by compaction procedure to release the weight lock assigned to it, such that
// another compaction waiting on same weight can start as soon as possible. That's usually
// called before compaction seals sstable and such and after all compaction work is done.
void on_compaction_complete(compaction_weight_registration& weight_registration);
double backlog() {
return _backlog_manager.backlog();
}

@@ -503,7 +503,8 @@ date_tiered_manifest::get_compaction_candidates(column_family& cf, std::vector<s
int64_t date_tiered_manifest::get_now(column_family& cf) {
int64_t max_timestamp = 0;
for (auto& sst : *cf.get_sstables()) {
auto shared_set = cf.get_sstables();
for (auto& sst : *shared_set) {
int64_t candidate = sst->get_stats_metadata().max_timestamp;
max_timestamp = candidate > max_timestamp ? candidate : max_timestamp;
}

@@ -367,6 +367,7 @@ class index_reader {
const io_priority_class& _pc;
tracing::trace_state_ptr _trace_state;
shared_index_lists _index_lists;
future<> _background_closes = make_ready_future<>();
struct reader {
index_consumer _consumer;
@@ -472,6 +473,16 @@ private:
};
return _index_lists.get_or_load(summary_idx, loader).then([this, &bound, summary_idx] (shared_index_lists::list_ptr ref) {
// To make sure a list is not closed while another bound is still using it, an index list is only closed when there's only one owner holding it.
if (bound.current_list && bound.current_list.use_count() == 1) {
// a new background close will only be initiated when previous ones terminate, so as to limit the concurrency.
_background_closes = _background_closes.then_wrapped([current_list = std::move(bound.current_list)] (future<>&& f) mutable {
f.ignore_ready_future();
return do_with(std::move(current_list), [] (shared_index_lists::list_ptr& current_list) mutable {
return close_index_list(current_list);
});
});
}
bound.current_list = std::move(ref);
bound.current_summary_idx = summary_idx;
bound.current_index_idx = 0;
@@ -841,6 +852,8 @@ public:
return close_index_list(_upper_bound->current_list);
}
return make_ready_future<>();
}).then([this] () mutable {
return std::move(_background_closes);
});
}
};

@@ -129,7 +129,7 @@ void sstable_writer_k_l::maybe_flush_pi_block(file_writer& out,
// block includes them), but we set block_next_start_offset after - so
// even if we wrote a lot of open tombstones, we still get a full
// block size of new data.
auto& rts = _pi_write.tombstone_accumulator->range_tombstones_for_row(
auto rts = _pi_write.tombstone_accumulator->range_tombstones_for_row(
clustering_key_prefix::from_range(clustering_key.values()));
for (const auto& rt : rts) {
auto start = composite::from_clustering_element(*_pi_write.schemap, rt.start);

@@ -147,7 +147,7 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input
unsigned overlapping_sstables = 0;
auto prev_last = dht::ring_position::min();
for (auto& sst : sstables) {
if (dht::ring_position(sst->get_first_decorated_key()).less_compare(*schema, prev_last)) {
if (dht::ring_position(sst->get_first_decorated_key()).tri_compare(*schema, prev_last) <= 0) {
overlapping_sstables++;
}
prev_last = dht::ring_position(sst->get_last_decorated_key());
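The change from a strict less-than comparison to `tri_compare(...) <= 0` matters when one sstable's first key equals the previous sstable's last key: a strict comparison misses that overlap. Sketched with plain integer key ranges (the tuples are illustrative, not real tokens):

```python
def count_overlapping(sstables):
    # sstables: list of (first_key, last_key) pairs, sorted by first_key.
    # An sstable overlaps its predecessor when first_key <= prev_last;
    # a strict '<' would miss the boundary case first_key == prev_last.
    overlapping = 0
    prev_last = None
    for first, last in sstables:
        if prev_last is not None and first <= prev_last:
            overlapping += 1
        prev_last = last
    return overlapping

print(count_overlapping([(1, 5), (5, 9), (10, 12)]))  # → 1
```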
@@ -189,10 +189,8 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input
};
if (level_info[0].size() > offstrategy_threshold) {
level_info[0].resize(std::min(level_info[0].size(), max_sstables));
compaction_descriptor desc(std::move(level_info[0]), std::optional<sstables::sstable_set>(), iop);
desc.options = compaction_options::make_reshape();
return desc;
size_tiered_compaction_strategy stcs(_stcs_options);
return stcs.get_reshaping_job(std::move(level_info[0]), schema, iop, mode);
}
for (unsigned level = leveled_manifest::MAX_LEVELS - 1; level > 0; --level) {

@@ -1145,7 +1145,11 @@ public:
setup_for_partition(pk);
auto dk = dht::decorate_key(*_schema, pk);
_reader->on_next_partition(std::move(dk), tombstone(deltime));
return proceed::yes;
// Only the partition start is consumed when processing a large run of partition tombstones,
// so stop the consumer if the buffer is full.
// Otherwise, partition tombstones keep accumulating in memory until another fragment type
// is found that can stop the consumer (and there may be none if the sstable is full of tombstones).
return proceed(!_reader->is_buffer_full());
}
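The `proceed(!_reader->is_buffer_full())` return can be modeled as a consumer that yields control whenever its buffer fills, instead of accumulating a long run of tombstones unboundedly (the buffer limit below is illustrative):

```python
def consume_partition_tombstones(tombstones, buffer, buffer_limit):
    # Consume partition tombstones one at a time; stop (proceed::no) as
    # soon as the buffer is full so the caller can drain it, rather than
    # accumulating an arbitrarily long run of tombstones in memory.
    consumed = 0
    for t in tombstones:
        buffer.append(t)
        consumed += 1
        if len(buffer) >= buffer_limit:
            return consumed  # proceed::no - yield to let the buffer drain
    return consumed  # proceed::yes - input exhausted

buf = []
print(consume_partition_tombstones(range(100), buf, buffer_limit=4))  # → 4
```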
virtual consumer_m::row_processing_result consume_row_start(const std::vector<temporary_buffer<char>>& ecp) override {

@@ -419,6 +419,11 @@ std::unique_ptr<incremental_selector_impl> time_series_sstable_set::make_increme
// exactly once after all sstables are iterated over.
//
// The readers are created lazily on-demand using the supplied factory function.
//
// In addition to the sstable readers, the queue always returns one ``dummy reader''
// that contains only the partition_start/end markers. This dummy reader is always
// returned as the first on the first `pop(b)` call for any `b`. Its upper bound
// is `before_all_clustered_rows`.
class min_position_reader_queue : public position_reader_queue {
using container_t = time_series_sstable_set::container_t;
using value_t = container_t::value_type;
@@ -436,6 +441,11 @@ class min_position_reader_queue : public position_reader_queue {
std::function<flat_mutation_reader(sstable&)> _create_reader;
std::function<bool(const sstable&)> _filter;
// After construction contains a reader which returns only the partition
// start (and end, if not in forwarding mode) markers. This is the first
// returned reader.
std::optional<flat_mutation_reader> _dummy_reader;
flat_mutation_reader create_reader(sstable& sst) {
return _create_reader(sst);
}
@@ -445,10 +455,14 @@ class min_position_reader_queue : public position_reader_queue {
}
public:
// Assumes that `create_reader` returns readers that emit only fragments from partition `pk`.
min_position_reader_queue(schema_ptr schema,
lw_shared_ptr<const time_series_sstable_set::container_t> sstables,
std::function<flat_mutation_reader(sstable&)> create_reader,
std::function<bool(const sstable&)> filter)
std::function<bool(const sstable&)> filter,
partition_key pk,
reader_permit permit,
streamed_mutation::forwarding fwd_sm)
: _schema(std::move(schema))
, _sstables(std::move(sstables))
, _it(_sstables->begin())
@@ -456,6 +470,8 @@ public:
, _cmp(*_schema)
, _create_reader(std::move(create_reader))
, _filter(std::move(filter))
, _dummy_reader(flat_mutation_reader_from_mutations(
std::move(permit), {mutation(_schema, std::move(pk))}, _schema->full_slice(), fwd_sm))
{
while (_it != _end && !this->filter(*_it->second)) {
++_it;
@@ -464,7 +480,8 @@ public:
virtual ~min_position_reader_queue() override = default;
// Open sstable readers to all sstables with smallest min_position() from the set
// If the dummy reader was not yet returned, return the dummy reader.
// Otherwise, open sstable readers to all sstables with smallest min_position() from the set
// {S: filter(S) and prev_min_pos < S.min_position() <= bound}, where `prev_min_pos` is the min_position()
// of the sstables returned from last non-empty pop() or -infinity if no sstables were previously returned,
// and `filter` is the filtering function provided when creating the queue.
@@ -478,6 +495,12 @@ public:
return {};
}
if (_dummy_reader) {
std::vector<reader_and_upper_bound> ret;
ret.emplace_back(*std::exchange(_dummy_reader, std::nullopt), position_in_partition::before_all_clustered_rows());
return ret;
}
// by !empty(bound) and `_it` invariant:
// _it != _end, _it->first <= bound, and filter(*_it->second) == true
assert(_cmp(_it->first, bound) <= 0);
@@ -506,17 +529,22 @@ public:
return ret;
}
// Is the set of sstables {S: filter(S) and prev_min_pos < S.min_position() <= bound} empty?
// (see pop() for definition of `prev_min_pos`)
// If the dummy reader was not returned yet, returns false.
// Otherwise checks if the set of sstables {S: filter(S) and prev_min_pos < S.min_position() <= bound}
// is empty (see pop() for definition of `prev_min_pos`).
virtual bool empty(position_in_partition_view bound) const override {
return _it == _end || _cmp(_it->first, bound) > 0;
return !_dummy_reader && (_it == _end || _cmp(_it->first, bound) > 0);
}
};
std::unique_ptr<position_reader_queue> time_series_sstable_set::make_min_position_reader_queue(
std::function<flat_mutation_reader(sstable&)> create_reader,
std::function<bool(const sstable&)> filter) const {
return std::make_unique<min_position_reader_queue>(_schema, _sstables, std::move(create_reader), std::move(filter));
std::function<bool(const sstable&)> filter,
partition_key pk, schema_ptr schema, reader_permit permit,
streamed_mutation::forwarding fwd_sm) const {
return std::make_unique<min_position_reader_queue>(
std::move(schema), _sstables, std::move(create_reader), std::move(filter),
std::move(pk), std::move(permit), fwd_sm);
}
std::unique_ptr<incremental_selector_impl> partitioned_sstable_set::make_incremental_selector() const {
@@ -775,30 +803,19 @@ time_series_sstable_set::create_single_key_sstable_reader(
auto& stats = *cf->cf_stats();
stats.clustering_filter_count++;
auto ck_filter = [ranges = slice.get_all_ranges()] (const sstable& sst) { return sst.may_contain_rows(ranges); };
{
auto next = std::find_if(it, _sstables->end(), [&] (const sst_entry& e) { return ck_filter(*e.second); });
stats.sstables_checked_by_clustering_filter += std::distance(it, next);
it = next;
}
if (it == _sstables->end()) {
// Some sstables passed the partition key filter, but none passed the clustering key filter.
// However, we still have to emit a partition (even though it will be empty) so we don't fool the cache
// into thinking this partition doesn't exist in any sstable (#3552).
return flat_mutation_reader_from_mutations(std::move(permit), {mutation(schema, *pos.key())}, slice, fwd_sm);
}
auto create_reader = [schema, permit, &pos, &slice, &pc, trace_state, fwd_sm] (sstable& sst) {
return sst.read_row_flat(schema, permit, pos, slice, pc, trace_state, fwd_sm);
};
auto ck_filter = [ranges = slice.get_all_ranges()] (const sstable& sst) { return sst.may_contain_rows(ranges); };
// We're going to pass this filter into min_position_reader_queue. The queue guarantees that
// the filter is going to be called at most once for each sstable and exactly once after
// the queue is exhausted. We use that fact to gather statistics.
auto filter = [pk_filter = std::move(pk_filter), ck_filter = std::move(ck_filter), &stats]
(const sstable& sst) {
if (pk_filter(sst)) {
return true;
if (!pk_filter(sst)) {
return false;
}
++stats.sstables_checked_by_clustering_filter;
@@ -810,9 +827,12 @@ time_series_sstable_set::create_single_key_sstable_reader(
return false;
};
// Note that `min_position_reader_queue` always includes a reader which emits a `partition_start` fragment,
// guaranteeing that the reader we return emits it as well; this helps us avoid the problem from #3552.
return make_clustering_combined_reader(
std::move(schema), std::move(permit), fwd_sm,
make_min_position_reader_queue(std::move(create_reader), std::move(filter)));
schema, permit, fwd_sm,
make_min_position_reader_queue(
std::move(create_reader), std::move(filter), *pos.key(), schema, permit, fwd_sm));
}
flat_mutation_reader

@@ -128,7 +128,9 @@ public:
std::unique_ptr<position_reader_queue> make_min_position_reader_queue(
std::function<flat_mutation_reader(sstable&)> create_reader,
std::function<bool(const sstable&)> filter) const;
std::function<bool(const sstable&)> filter,
partition_key pk, schema_ptr schema, reader_permit permit,
streamed_mutation::forwarding fwd_sm) const;
virtual flat_mutation_reader create_single_key_sstable_reader(
column_family*,

@@ -101,7 +101,8 @@ class time_window_compaction_strategy : public compaction_strategy_impl {
time_window_compaction_strategy_options _options;
int64_t _estimated_remaining_tasks = 0;
db_clock::time_point _last_expired_check;
timestamp_type _highest_window_seen;
// As timestamp_type is an int64_t, a primitive type, it must be initialized here.
timestamp_type _highest_window_seen = 0;
// Keep track of all recent active windows that still need to be compacted into a single SSTable
std::unordered_set<timestamp_type> _recent_active_windows;
size_tiered_compaction_strategy_options _stcs_options;

@@ -380,7 +380,7 @@ future<prepare_message> stream_session::prepare(std::vector<stream_request> requ
try {
db.find_column_family(ks, cf);
} catch (no_such_column_family&) {
auto err = format("[Stream #{{}}] prepare requested ks={{}} cf={{}} does not exist", plan_id, ks, cf);
auto err = format("[Stream #{}] prepare requested ks={} cf={} does not exist", plan_id, ks, cf);
sslog.warn(err.c_str());
throw std::runtime_error(err);
}
@@ -394,7 +394,7 @@ future<prepare_message> stream_session::prepare(std::vector<stream_request> requ
try {
db.find_column_family(cf_id);
} catch (no_such_column_family&) {
auto err = format("[Stream #{{}}] prepare cf_id={} does not exist", plan_id, cf_id);
auto err = format("[Stream #{}] prepare cf_id={} does not exist", plan_id, cf_id);
sslog.warn(err.c_str());
throw std::runtime_error(err);
}

@@ -864,8 +864,8 @@ void table::try_trigger_compaction() noexcept {
}
void table::do_trigger_compaction() {
// But only submit if we're not locked out
if (!_compaction_disabled) {
// But not if we're locked out or stopping
if (!_compaction_disabled && !_async_gate.is_closed()) {
_compaction_manager.submit(this);
}
}

@@ -85,3 +85,20 @@ def test_signature_too_futuristic(dynamodb, test_table):
response = requests.post(url, headers=headers, verify=False)
assert not response.ok
assert "InvalidSignatureException" in response.text and "Signature not yet current" in response.text
# A test that commas can be used instead of whitespace to separate components
# of the Authorization headers - reproducing issue #9568.
def test_authorization_no_whitespace(dynamodb, test_table):
# Unlike the above tests, which checked error cases so didn't need to
# calculate a real signature, in this test we really need a correct signature,
# so we use a function we already have in test_manual_requests.py.
from test_manual_requests import get_signed_request
payload = '{"TableName": "' + test_table.name + '", "Item": {"p": {"S": "x"}, "c": {"S": "x"}}}'
req = get_signed_request(dynamodb, 'PutItem', payload)
# Boto3 separates the components of the Authorization header by spaces.
# Let's remove all of them except the first one (which separates the
# signature algorithm name from the rest) and check the result still works:
a = req.headers['Authorization'].split()
req.headers['Authorization'] = a[0] + ' ' + ''.join(a[1:])
response = requests.post(req.url, headers=req.headers, data=req.body, verify=False)
assert response.ok

@@ -154,6 +154,27 @@ def test_update_condition_eq_unequal(test_table_s):
ConditionExpression='q = :oldval',
ExpressionAttributeValues={':val1': 3, ':oldval': 2})
# In test_update_condition_eq_unequal() above we saw that a non-existent
# attribute is not "=" to a value. Here we check what happens when two
# non-existent attributes are checked for equality. It turns out, they should
# *not* be considered equal. In short, an unset attribute is never equal to
# anything - not even to another unset attribute.
# Reproduces issue #8511.
def test_update_condition_eq_two_unset(test_table_s):
p = random_string()
with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
test_table_s.update_item(Key={'p': p},
UpdateExpression='SET a = :val1',
ConditionExpression='q = z',
ExpressionAttributeValues={':val1': 2})
test_table_s.update_item(Key={'p': p},
AttributeUpdates={'a': {'Value': 1, 'Action': 'PUT'}})
with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
test_table_s.update_item(Key={'p': p},
UpdateExpression='SET a = :val1',
ConditionExpression='q = z',
ExpressionAttributeValues={':val1': 3})
# Check that set equality is checked correctly. Unlike string equality (for
# example), it cannot be done with just naive string comparison of the JSON
# representation, and we need to allow for any order. (see issue #5021)
@@ -175,6 +196,38 @@ def test_update_condition_eq_set(test_table_s):
ExpressionAttributeValues={':val1': 3, ':oldval': set(['chinchilla', 'cat', 'dog', 'mouse'])})
assert 'b' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']
# The above test (test_update_condition_eq_set()) checked equality of simple
# set attributes. But an attribute can contain a nested document, where the
# set sits at a deep level (the set itself is a leaf in this hierarchy because
# it can only contain numbers, strings or bytes). We need to correctly support
# equality check in that case too.
# Reproduces issue #8514.
def test_update_condition_eq_nested_set(test_table_s):
p = random_string()
# Because boto3 sorts the set values we give it, in order to generate a
# set with a different order, we need to build it incrementally.
test_table_s.update_item(Key={'p': p},
AttributeUpdates={'a': {'Value': {'b': 'c', 'd': ['e', 'f', set(['g', 'h'])], 'i': set(['j', 'k'])}, 'Action': 'PUT'}})
test_table_s.update_item(Key={'p': p},
UpdateExpression='ADD a.d[2] :val1, a.i :val2',
ExpressionAttributeValues={':val1': set(['l', 'm']), ':val2': set(['n', 'o'])})
# Sanity check - the attribute contains the set we think it does
expected = {'b': 'c', 'd': ['e', 'f', set(['g', 'h', 'l', 'm'])], 'i': set(['j', 'k', 'n', 'o'])}
assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['a'] == expected
# Now finally check that condition expression check knows the equality too.
test_table_s.update_item(Key={'p': p},
UpdateExpression='SET b = :val1',
ConditionExpression='a = :oldval',
ExpressionAttributeValues={':val1': 3, ':oldval': expected})
assert 'b' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']
# Check that equality can also fail, if the inner set differs
wrong = {'b': 'c', 'd': ['e', 'f', set(['g', 'h', 'l', 'bad'])], 'i': set(['j', 'k', 'n', 'o'])}
with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
test_table_s.update_item(Key={'p': p},
UpdateExpression='SET b = :val1',
ConditionExpression='a = :oldval',
ExpressionAttributeValues={':val1': 4, ':oldval': wrong})
# Test for ConditionExpression with operator "<>" (non-equality).
def test_update_condition_ne(test_table_s):
p = random_string()
@@ -215,6 +268,54 @@ def test_update_condition_ne(test_table_s):
ExpressionAttributeValues={':newval': 3, ':oldval': 1})
assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['c'] == 3
# Check that set inequality is checked correctly. This reproduces the same
# bug #5021 that we reproduced above in test_update_condition_eq_set(), just
# that here we check the inequality operator instead of equality.
# Reproduces issue #8513.
def test_update_condition_ne_set(test_table_s):
p = random_string()
# Because boto3 sorts the set values we give it, in order to generate a
# set with a different order, we need to build it incrementally.
test_table_s.update_item(Key={'p': p},
AttributeUpdates={'a': {'Value': set(['dog', 'chinchilla']), 'Action': 'PUT'}})
test_table_s.update_item(Key={'p': p},
UpdateExpression='ADD a :val1',
ExpressionAttributeValues={':val1': set(['cat', 'mouse'])})
# Sanity check - the attribute contains the set we think it does
assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['a'] == set(['chinchilla', 'cat', 'dog', 'mouse'])
# Now check that condition expression check knows there is no inequality
# here.
with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
test_table_s.update_item(Key={'p': p},
UpdateExpression='SET b = :val1',
ConditionExpression='a <> :oldval',
ExpressionAttributeValues={':val1': 2, ':oldval': set(['chinchilla', 'cat', 'dog', 'mouse'])})
# As a sanity check, also check something which should be unequal:
test_table_s.update_item(Key={'p': p},
UpdateExpression='SET b = :val1',
ConditionExpression='a <> :oldval',
ExpressionAttributeValues={':val1': 3, ':oldval': set(['chinchilla', 'cat', 'dog', 'horse'])})
assert 'b' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']
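The set-comparison property these tests rely on can be seen in plain Python - sets compare by membership, not by insertion order, which is exactly what the server-side condition check must honor (the subject of issues #5021 and #8513 above). A small illustrative sketch, not part of the test suite:

```python
# Sets compare by membership, regardless of the order elements were added.
a = set(['dog', 'chinchilla'])
a |= set(['cat', 'mouse'])          # build incrementally, like the test does
b = set(['chinchilla', 'cat', 'dog', 'mouse'])
print(a == b)                        # True: same members, any order
print(a != set(['chinchilla', 'cat', 'dog', 'horse']))  # True: 'horse' differs
```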
# In test_update_condition_ne() above we saw that a non-existent attribute is
# "not equal" to any value. Here we check what happens when two non-existent
# attributes are checked for non-equality. It turns out, they are also
# considered "not equal". In short, an unset attribute is always "not equal" to
# anything - even to another unset attribute.
# Reproduces issue #8511.
def test_update_condition_ne_two_unset(test_table_s):
p = random_string()
test_table_s.update_item(Key={'p': p},
UpdateExpression='SET a = :val1',
ConditionExpression='q <> z',
ExpressionAttributeValues={':val1': 2})
assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['a'] == 2
test_table_s.update_item(Key={'p': p},
UpdateExpression='SET a = :val1',
ConditionExpression='q <> z',
ExpressionAttributeValues={':val1': 3})
assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['a'] == 3
# Test for ConditionExpression with operator "<"
def test_update_condition_lt(test_table_s):
p = random_string()
@@ -316,6 +417,45 @@ def test_update_condition_lt(test_table_s):
ExpressionAttributeValues={':newval': 2, ':oldval': 1})
assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 4
# In test_update_condition_lt() above we saw that a non-existent attribute is
# not "<" any value. Here we check what happens when two non-existent
# attributes are compared with "<". It turns out that the result of such
# comparison is also false.
# The same is true for other order operators - any order comparison involving
# one unset attribute should be false - even if the second operand is an
# unset attribute as well. Note that the <> operator is different - it
# always results in true if one of the operands is an unset attribute (see
# test_update_condition_ne_two_unset() above).
# This test is related to issue #8511 (although it passed even before fixing
# that issue).
def test_update_condition_comparison_two_unset(test_table_s):
p = random_string()
ops = ['<', '<=', '>', '>=']
for op in ops:
with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
test_table_s.update_item(Key={'p': p},
UpdateExpression='SET a = :val1',
ConditionExpression='q ' + op + ' z',
ExpressionAttributeValues={':val1': 2})
with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
test_table_s.update_item(Key={'p': p},
UpdateExpression='SET a = :val1',
ConditionExpression='q between z and x',
ExpressionAttributeValues={':val1': 2})
test_table_s.update_item(Key={'p': p},
AttributeUpdates={'a': {'Value': 1, 'Action': 'PUT'}})
for op in ops:
with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
test_table_s.update_item(Key={'p': p},
UpdateExpression='SET a = :val1',
ConditionExpression='q ' + op + ' z',
ExpressionAttributeValues={':val1': 3})
with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
test_table_s.update_item(Key={'p': p},
UpdateExpression='SET a = :val1',
ConditionExpression='q between z and x',
ExpressionAttributeValues={':val1': 2})
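The rules these two tests pin down - `<>` is true whenever either operand is unset, while the order operators are always false - can be summarized in a small illustrative model. This is pure Python, not Scylla or DynamoDB code, and `UNSET` is a made-up sentinel for a missing attribute:

```python
import operator

# Illustrative model of ConditionExpression comparisons against unset
# (missing) attributes; UNSET is a made-up sentinel, not a DynamoDB value.
UNSET = object()
_ORDER_OPS = {'<': operator.lt, '<=': operator.le,
              '>': operator.gt, '>=': operator.ge}

def evaluate(op, lhs, rhs):
    # An unset operand makes '<>' true and every order comparison false.
    if lhs is UNSET or rhs is UNSET:
        return op == '<>'
    if op == '<>':
        return lhs != rhs
    return _ORDER_OPS[op](lhs, rhs)

print(evaluate('<>', UNSET, UNSET))  # True: two unset attributes are "not equal"
print(any(evaluate(op, UNSET, UNSET) for op in _ORDER_OPS))  # False
```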
# Test for ConditionExpression with operator "<="
def test_update_condition_le(test_table_s):
p = random_string()

View File

@@ -154,3 +154,25 @@ def test_incorrect_numbers(dynamodb, test_table):
req = get_signed_request(dynamodb, 'PutItem', payload)
response = requests.post(req.url, headers=req.headers, data=req.body, verify=False)
assert "ValidationException" in response.text and "numeric" in response.text
# Although the DynamoDB API responses are JSON, additional conventions apply
# to these responses - such as how error codes are encoded in JSON. For this
# reason, DynamoDB uses the content type 'application/x-amz-json-1.0' instead
# of the standard 'application/json'. This test verifies that we return the
# correct content type header.
# While most DynamoDB libraries we tried do not care about an unexpected
# content-type, it turns out that one (aiodynamo) does. Moreover, AWS already
# defined x-amz-json-1.1 - see
# https://awslabs.github.io/smithy/1.0/spec/aws/aws-json-1_1-protocol.html
# which differs (only) in how it encodes error replies.
# So in the future it may become even more important that Scylla return the
# correct content type.
def test_content_type(dynamodb, test_table):
payload = '{"TableName": "' + test_table.name + '", "Item": {"p": {"S": "x"}, "c": {"S": "x"}}}'
# Note that get_signed_request() uses x-amz-json-1.0 to encode the
# *request*. In the future this may or may not affect the content type
# in the response (today, DynamoDB doesn't allow any other content type
# in the request anyway).
req = get_signed_request(dynamodb, 'PutItem', payload)
response = requests.post(req.url, headers=req.headers, data=req.body, verify=False)
assert response.headers['Content-Type'] == 'application/x-amz-json-1.0'
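A related convention of the x-amz-json protocols mentioned above is how error replies are encoded: the error code travels in a `__type` JSON field, usually namespace-qualified with a `#`. A hedged sketch of extracting it (the payload below is made up for illustration):

```python
import json

# Made-up example of an x-amz-json error body; real services qualify the
# error code with a service namespace before the '#'.
body = ('{"__type": "com.amazonaws.dynamodb.v20120810#'
        'ConditionalCheckFailedException", '
        '"message": "The conditional request failed"}')

def error_code(response_body):
    # Keep whatever follows the last '#'; unqualified codes pass through as-is.
    return json.loads(response_body).get('__type', '').rsplit('#', 1)[-1]

print(error_code(body))  # ConditionalCheckFailedException
```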

View File

@@ -0,0 +1,113 @@
# Copyright 2021-present ScyllaDB
#
# This file is part of Scylla.
#
# Scylla is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Scylla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with Scylla. If not, see <http://www.gnu.org/licenses/>.
##############################################################################
# Tests for Scylla's metrics (see docs/design-notes/metrics.md) for Alternator
# queries. Reproduces issue #9406, where although metrics were implemented for
# Alternator requests, they were missing for some operations (BatchGetItem).
# In the tests here we attempt to ensure that the metrics continue to work
# for the relevant operations as the code evolves.
#
# Note that all tests in this file test Scylla-specific features, and are
# "skipped" when not running against Scylla, or when unable to retrieve
# metrics through out-of-band HTTP requests to Scylla's Prometheus port (9180).
#
# IMPORTANT: we do not want these tests to assume that they are not running in
# parallel with any other tests or workload - because such an assumption
# would limit our test deployment options in the future. NOT making this
# assumption means that these tests can't check that a certain operation
# increases a certain counter by exactly 1 - because other concurrent
# operations might increase it further! So our test can only check that the
# counter increases.
##############################################################################
import pytest
import requests
import re
from util import random_string
# Fixture for checking if we are able to test Scylla metrics. Scylla metrics
# are not available on AWS (of course), but may also not be available for
# Scylla if for some reason we have only access to the Alternator protocol
# port but no access to the metrics port (9180).
# If metrics are *not* available, tests using this fixture will be skipped.
# Tests using this fixture may call get_metrics(metrics).
@pytest.fixture(scope="module")
def metrics(dynamodb):
if dynamodb.meta.client._endpoint.host.endswith('.amazonaws.com'):
pytest.skip('Scylla-only feature not supported by AWS')
url = dynamodb.meta.client._endpoint.host
# The Prometheus API is on port 9180, and always http, not https.
url = re.sub(r':[0-9]+(/|$)', ':9180', url)
url = re.sub(r'^https:', 'http:', url)
url = url + '/metrics'
resp = requests.get(url)
if resp.status_code != 200:
pytest.skip('Metrics port 9180 is not available')
yield url
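The two `re.sub()` calls in the fixture turn whatever Alternator endpoint is in use into the metrics URL. A standalone sketch of the same rewriting, using a made-up endpoint:

```python
import re

# Made-up endpoint; any host:port is rewritten the same way.
url = 'https://127.0.0.1:8043'
url = re.sub(r':[0-9]+(/|$)', ':9180', url)  # replace the port with 9180
url = re.sub(r'^https:', 'http:', url)       # metrics are served over plain http
url = url + '/metrics'
print(url)  # http://127.0.0.1:9180/metrics
```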
# Utility function for fetching all metrics from Scylla, using an HTTP request
# to port 9180. The response format is defined by the Prometheus protocol.
# Only use get_metrics() in a test using the metrics fixture.
def get_metrics(metrics):
response = requests.get(metrics)
assert response.status_code == 200
return response.text
# Utility function for fetching a metric with a given name and, optionally,
# given sub-metric labels (a name-value map). If multiple
# matches are found, they are summed - this is useful for summing up the
# counts from multiple shards.
def get_metric(metrics, name, requested_labels=None):
total = 0.0
lines = re.compile('^'+name+'{.*$', re.MULTILINE)
for match in re.findall(lines, get_metrics(metrics)):
a = match.split()
metric = a[0]
val = float(a[1])
# Check if match also matches the requested labels
if requested_labels:
# we know metric begins with name{ and ends with } - the labels
# are what we have between those
got_labels = metric[len(name)+1:-1].split(',')
# Check that every one of the requested labels is in got_labels:
for k, v in requested_labels.items():
if not f'{k}="{v}"' in got_labels:
# No match for requested label, skip this metric (python
# doesn't have "continue 2" so let's just set val to 0...)
val = 0
break
total += float(val)
return total
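get_metric() sums every sample whose name and labels match - typically one sample per shard. A self-contained rerun of the same parsing logic against canned Prometheus-format text (the sample lines below are made up, not real Scylla output):

```python
import re

# Made-up Prometheus exposition text, two shards' worth of one counter.
sample = '''scylla_alternator_operation{op="BatchGetItem",shard="0"} 3
scylla_alternator_operation{op="BatchGetItem",shard="1"} 4
scylla_alternator_operation{op="BatchWriteItem",shard="0"} 7
'''

def get_metric(text, name, requested_labels=None):
    total = 0.0
    for match in re.findall('^' + name + '{.*$', text, re.MULTILINE):
        metric, val = match.split()
        if requested_labels:
            # The labels sit between 'name{' and the trailing '}'.
            got_labels = metric[len(name) + 1:-1].split(',')
            if not all(f'{k}="{v}"' in got_labels
                       for k, v in requested_labels.items()):
                continue
        total += float(val)
    return total

# Samples matching the label are summed across shards: 3 + 4
print(get_metric(sample, 'scylla_alternator_operation', {'op': 'BatchGetItem'}))  # 7.0
```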
def test_batch_write_item(test_table_s, metrics):
n1 = get_metric(metrics, 'scylla_alternator_operation', {'op': 'BatchWriteItem'})
test_table_s.meta.client.batch_write_item(RequestItems = {
test_table_s.name: [{'PutRequest': {'Item': {'p': random_string(), 'a': 'hi'}}}]})
n2 = get_metric(metrics, 'scylla_alternator_operation', {'op': 'BatchWriteItem'})
assert n2 > n1
# Reproduces issue #9406:
def test_batch_get_item(test_table_s, metrics):
n1 = get_metric(metrics, 'scylla_alternator_operation', {'op': 'BatchGetItem'})
test_table_s.meta.client.batch_get_item(RequestItems = {
test_table_s.name: {'Keys': [{'p': random_string()}], 'ConsistentRead': True}})
n2 = get_metric(metrics, 'scylla_alternator_operation', {'op': 'BatchGetItem'})
assert n2 > n1
# TODO: check the rest of the operations

View File

@@ -431,3 +431,14 @@ def test_update_item_returnvalues_nested(test_table_s):
ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_NEW',
UpdateExpression='REMOVE a.c[1]')
assert ret['Attributes'] == {'a': {'c': [70]}}
# A reproducer for issue #9542 - when UpdateExpression's REMOVE operation
# actually deletes an existing attribute, it breaks the ALL_NEW ReturnValues
# for other attributes set in the same command.
def test_update_item_returnvalues_all_new_remove_etc(test_table_s):
p = random_string()
test_table_s.put_item(Item={'p': p, 's': 'dog', 'd': 'foo'})
ret=test_table_s.update_item(Key={'p': p}, ReturnValues='ALL_NEW',
UpdateExpression='REMOVE d SET s = :v',
ExpressionAttributeValues={':v': 'cat'})
assert ret['Attributes']['s'] == 'cat'

View File

@@ -22,6 +22,7 @@
#define BOOST_TEST_MODULE alternator
#include <boost/test/included/unit_test.hpp>
#include <seastar/util/defer.hh>
#include "alternator/base64.hh"
static bytes_view to_bytes_view(const std::string& s) {
@@ -78,3 +79,22 @@ BOOST_AUTO_TEST_CASE(test_base64_begins_with) {
BOOST_REQUIRE(!base64_begins_with(encoded_str3, encoded_non_prefix));
}
}
BOOST_AUTO_TEST_CASE(test_allocator_fail_gracefully) {
// Unfortunately the address sanitizer fails if the allocator is not able
// to allocate the requested memory. The test is therefore skipped in debug mode.
#ifndef DEBUG
static constexpr size_t too_large_alloc_size = 0xffffffffff;
rjson::allocator allocator;
// Impossible allocation should throw
BOOST_REQUIRE_THROW(allocator.Malloc(too_large_alloc_size), rjson::error);
// So should impossible reallocation
void* memory = allocator.Malloc(1);
auto release = defer([memory] { rjson::allocator::Free(memory); });
BOOST_REQUIRE_THROW(allocator.Realloc(memory, 1, too_large_alloc_size), rjson::error);
// The internal rapidjson stack should also throw,
// and later be destroyed gracefully
rapidjson::internal::Stack stack(&allocator, 0);
BOOST_REQUIRE_THROW(stack.Push<char>(too_large_alloc_size), rjson::error);
#endif
}

View File

@@ -269,6 +269,21 @@ BOOST_AUTO_TEST_CASE(test_writing_placeholders) {
BOOST_REQUIRE(in.size() == 0);
}
BOOST_AUTO_TEST_CASE(test_large_placeholder) {
bytes_ostream::size_type size;
try {
for (size = 1; (int32_t)size > 0; size *= 2) {
bytes_ostream buf;
int8_t* ph;
BOOST_TEST_MESSAGE(fmt::format("try size={}", size));
ph = buf.write_place_holder(size);
std::fill(ph, ph + size, 0);
}
} catch (const std::bad_alloc&) {
}
BOOST_REQUIRE(size >= bytes_ostream::max_chunk_size());
}
BOOST_AUTO_TEST_CASE(test_append_big_and_small_chunks) {
bytes_ostream small;
append_sequence(small, 12);

View File

@@ -578,11 +578,14 @@ SEASTAR_TEST_CASE(test_allocation_failure){
// Use up loads of memory so we can OOM at the appropriate place
try {
assert(fragmented_temporary_buffer::default_fragment_size < size);
for (;;) {
junk->emplace_back(new char[size]);
junk->emplace_back(new char[fragmented_temporary_buffer::default_fragment_size]);
}
} catch (std::bad_alloc&) {
}
auto last = junk->end();
junk->erase(--last);
return log.add_mutation(utils::UUID_gen::get_time_UUID(), size, db::commitlog::force_sync::no, [size](db::commitlog::output& dst) {
dst.fill(char(1), size);
}).then_wrapped([junk, size](future<db::rp_handle> f) {

View File

@@ -3634,19 +3634,9 @@ SEASTAR_TEST_CASE(test_rf_expand) {
{"datacenter1", "3"}
});
e.execute_cql("CREATE KEYSPACE rf_expand_3 WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 3}").get();
assert_replication_contains("rf_expand_3", {
{"class", simple},
{"replication_factor", "3"}
});
// Should not auto-expand when switching to NetworkTopologyStrategy with additional options.
e.execute_cql("ALTER KEYSPACE rf_expand_3 WITH replication = {'class': 'NetworkTopologyStrategy', 'datacenter2': 2}").get();
assert_replication_not_contains("rf_expand_3", {"datacenter1"});
// Respect factors specified manually.
e.execute_cql("CREATE KEYSPACE rf_expand_4 WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3, 'datacenter1': 2}").get();
assert_replication_contains("rf_expand_4", {
e.execute_cql("CREATE KEYSPACE rf_expand_3 WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3, 'datacenter1': 2}").get();
assert_replication_contains("rf_expand_3", {
{"class", network_topology},
{"datacenter1", "2"}
});

View File

@@ -22,6 +22,8 @@
#include <seastar/testing/test_case.hh>
#include "test/lib/cql_test_env.hh"
#include "test/lib/cql_assertions.hh"
#include "cql3/untyped_result_set.hh"
#include "cql3/query_processor.hh"
#include "transport/messages/result_message.hh"
SEASTAR_TEST_CASE(test_index_with_paging) {
@@ -48,3 +50,51 @@ SEASTAR_TEST_CASE(test_index_with_paging) {
});
});
}
SEASTAR_TEST_CASE(test_index_with_paging_with_base_short_read) {
return do_with_cql_env_thread([] (auto& e) {
e.execute_cql("CREATE TABLE tab (pk int, ck text, v int, v2 int, v3 text, PRIMARY KEY (pk, ck))").get();
e.execute_cql("CREATE INDEX ON tab (v)").get();
// Enough to trigger a short read on the base table during scan
sstring big_string(2 * query::result_memory_limiter::maximum_result_size, 'j');
const int row_count = 67;
for (int i = 0; i < row_count; ++i) {
e.execute_cql(format("INSERT INTO tab (pk, ck, v, v2, v3) VALUES ({}, 'hello{}', 1, {}, '{}')", i % 3, i, i, big_string)).get();
}
eventually([&] {
uint64_t count = 0;
e.qp().local().query_internal("SELECT * FROM ks.tab WHERE v = 1", [&] (const cql3::untyped_result_set_row&) {
++count;
return make_ready_future<stop_iteration>(stop_iteration::no);
}).get();
BOOST_REQUIRE_EQUAL(count, row_count);
});
});
}
SEASTAR_TEST_CASE(test_index_with_paging_with_base_short_read_no_ck) {
return do_with_cql_env_thread([] (auto& e) {
e.execute_cql("CREATE TABLE tab (pk int, v int, v2 int, v3 text, PRIMARY KEY (pk))").get();
e.execute_cql("CREATE INDEX ON tab (v)").get();
// Enough to trigger a short read on the base table during scan
sstring big_string(2 * query::result_memory_limiter::maximum_result_size, 'j');
const int row_count = 67;
for (int i = 0; i < row_count; ++i) {
e.execute_cql(format("INSERT INTO tab (pk, v, v2, v3) VALUES ({}, 1, {}, '{}')", i, i, big_string)).get();
}
eventually([&] {
uint64_t count = 0;
e.qp().local().query_internal("SELECT * FROM ks.tab WHERE v = 1", [&] (const cql3::untyped_result_set_row&) {
++count;
return make_ready_future<stop_iteration>(stop_iteration::no);
}).get();
BOOST_REQUIRE_EQUAL(count, row_count);
});
});
}

View File

@@ -977,14 +977,7 @@ SEASTAR_THREAD_TEST_CASE(fuzzy_test) {
const auto& partitions = pop_desc.partitions;
smp::invoke_on_all([cfg, db = &env.db(), gs = global_schema_ptr(pop_desc.schema), &partitions] {
auto s = gs.get();
auto& sem = db->local().get_reader_concurrency_semaphore();
auto resources = sem.available_resources();
resources -= reader_concurrency_semaphore::resources{1, 0};
auto permit = sem.make_permit(s.get(), "fuzzy-test");
return run_fuzzy_test_workload(cfg, *db, std::move(s), partitions).finally([units = permit.consume_resources(resources)] {});
return run_fuzzy_test_workload(cfg, *db, gs.get(), partitions);
}).handle_exception([seed] (std::exception_ptr e) {
testlog.error("Test workload failed with exception {}."
" To repeat this particular run, replace the random seed of the test, with that of this run ({})."

View File

@@ -896,6 +896,221 @@ sstables::shared_sstable create_sstable(sstables::test_env& env, simple_schema&
, mutations);
}
namespace {
class generic_inactive_read : public reader_concurrency_semaphore::inactive_read {
flat_mutation_reader_opt _reader;
private:
explicit generic_inactive_read(flat_mutation_reader&& rd) : _reader(std::move(rd)) { }
virtual void evict() override {
_reader = {};
}
public:
static std::unique_ptr<inactive_read> make(flat_mutation_reader&& rd) {
return std::make_unique<generic_inactive_read>(generic_inactive_read(std::move(rd)));
}
static flat_mutation_reader_opt get_reader(std::unique_ptr<inactive_read>&& ir) {
if (!ir) {
return {};
}
auto gir = dynamic_cast<generic_inactive_read*>(ir.get());
BOOST_REQUIRE(gir);
return std::move(gir->_reader);
}
};
} // anonymous namespace
// This unit test passes a read through admission again-and-again, just
// like an evictable reader would be during its lifetime. When readmitted
// the read sometimes has to wait and sometimes not. This is to check that
// readmitting a previously admitted reader doesn't leak any units.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_readmission_preserves_units) {
simple_schema s;
const auto initial_resources = reader_concurrency_semaphore::resources{10, 1024 * 1024};
reader_concurrency_semaphore semaphore(initial_resources.count, initial_resources.memory, get_name());
auto permit = semaphore.make_permit(s.schema().get(), get_name());
std::optional<reader_permit::resource_units> residue_units;
for (int i = 0; i < 10; ++i) {
const auto have_residue_units = bool(residue_units);
auto current_resources = initial_resources;
if (have_residue_units) {
current_resources -= residue_units->resources();
}
BOOST_REQUIRE(semaphore.available_resources() == current_resources);
std::optional<reader_permit::resource_units> admitted_units;
if (i % 2) {
const auto consumed_resources = semaphore.available_resources();
semaphore.consume(consumed_resources);
auto units_fut = permit.wait_admission(1024, db::no_timeout);
BOOST_REQUIRE(!units_fut.available());
semaphore.signal(consumed_resources);
admitted_units = units_fut.get();
} else {
admitted_units = permit.wait_admission(1024, db::no_timeout).get();
}
current_resources -= admitted_units->resources();
BOOST_REQUIRE(semaphore.available_resources() == current_resources);
residue_units.emplace(permit.consume_resources(reader_resources(0, 100)));
if (!have_residue_units) {
current_resources -= residue_units->resources();
}
BOOST_REQUIRE(semaphore.available_resources() == current_resources);
auto handle = semaphore.register_inactive_read(generic_inactive_read::make(make_empty_flat_reader(s.schema(), permit)));
BOOST_REQUIRE(semaphore.try_evict_one_inactive_read());
}
BOOST_REQUIRE(semaphore.available_resources() == initial_resources - residue_units->resources());
residue_units.reset();
BOOST_REQUIRE(semaphore.available_resources() == initial_resources);
}
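The accounting invariant the test above enforces - every unit taken from the semaphore is eventually returned, leaving it at its initial resources - can be sketched with a toy model. The names below are made up for illustration; this is not the Scylla `reader_concurrency_semaphore` API:

```python
# Toy resource accountant (made-up, not the Scylla API) illustrating the
# invariant: consume() and signal() must balance across readmissions.
class ToySemaphore:
    def __init__(self, count, memory):
        self.initial = (count, memory)
        self.count, self.memory = count, memory

    def consume(self, count, memory):
        self.count -= count
        self.memory -= memory
        return (count, memory)            # "units" the caller must return

    def signal(self, units):
        count, memory = units
        self.count += count
        self.memory += memory

sem = ToySemaphore(10, 1024 * 1024)
for _ in range(10):                        # ten admission/eviction rounds
    admitted = sem.consume(1, 1024)        # admission: one count, some memory
    residue = sem.consume(0, 100)          # extra memory held while reading
    sem.signal(admitted)                   # eviction returns the admission units
    sem.signal(residue)                    # residue released afterwards
print((sem.count, sem.memory) == sem.initial)  # True: nothing leaked
```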
// This unit test checks that the semaphore doesn't get into a deadlock
// when contended, in the presence of many memory-only reads (that don't
// wait for admission). This is tested by simulating the 3 kinds of reads we
// currently have in the system:
// * memory-only: reads that don't pass admission and only own memory.
// * admitted: reads that pass admission.
// * evictable: admitted reads that are furthermore evictable.
//
// The test creates and runs a large number of these reads in parallel,
// read kinds being selected randomly, then creates a watchdog which
// kills the test if no progress is being made.
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_forward_progress) {
class reader {
class skeleton_reader : public flat_mutation_reader::impl {
reader_permit::resource_units _base_resources;
std::optional<reader_permit::resource_units> _resources;
public:
skeleton_reader(schema_ptr s, reader_permit permit, reader_permit::resource_units res)
: impl(std::move(s), std::move(permit)), _base_resources(std::move(res)) { }
virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override {
_resources.emplace(_permit.consume_resources(reader_resources(0, tests::random::get_int(1024, 2048))));
return make_ready_future<>();
}
virtual void next_partition() override { }
virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override { return make_ready_future<>(); }
virtual future<> fast_forward_to(position_range, db::timeout_clock::time_point timeout) override { return make_ready_future<>(); }
};
struct reader_visitor {
reader& r;
future<> operator()(std::monostate& ms) { return r.tick(ms); }
future<> operator()(flat_mutation_reader& reader) { return r.tick(reader); }
future<> operator()(reader_concurrency_semaphore::inactive_read_handle& handle) { return r.tick(handle); }
};
private:
schema_ptr _schema;
reader_permit _permit;
bool _memory_only = true;
bool _evictable = false;
std::optional<reader_permit::resource_units> _units;
std::variant<std::monostate, flat_mutation_reader, reader_concurrency_semaphore::inactive_read_handle> _reader;
private:
future<> make_reader() {
auto res = _permit.consume_memory();
if (!_memory_only) {
res = co_await _permit.wait_admission(1024, db::no_timeout);
}
_reader = make_flat_mutation_reader<skeleton_reader>(_schema, _permit, std::move(res));
}
future<> tick(std::monostate&) {
co_await make_reader();
co_await tick(std::get<flat_mutation_reader>(_reader));
}
future<> tick(flat_mutation_reader& reader) {
co_await reader.fill_buffer(db::no_timeout);
if (_evictable) {
_reader = _permit.semaphore().register_inactive_read(generic_inactive_read::make(std::move(reader)));
}
}
future<> tick(reader_concurrency_semaphore::inactive_read_handle& handle) {
if (auto reader = generic_inactive_read::get_reader(_permit.semaphore().unregister_inactive_read(std::move(handle))); reader) {
_reader = std::move(*reader);
} else {
co_await make_reader();
}
co_await tick(std::get<flat_mutation_reader>(_reader));
}
public:
reader(schema_ptr s, reader_permit permit, bool memory_only, bool evictable)
: _schema(std::move(s))
, _permit(std::move(permit))
, _memory_only(memory_only)
, _evictable(evictable)
, _units(_permit.consume_memory(tests::random::get_int(128, 1024)))
{
}
future<> tick() {
return std::visit(reader_visitor{*this}, _reader);
}
};
const auto count = 10;
const auto num_readers = 512;
const auto ticks = 1000;
simple_schema s;
reader_concurrency_semaphore semaphore(count, count * 1024, get_name());
std::list<std::optional<reader>> readers;
unsigned nr_memory_only = 0;
unsigned nr_admitted = 0;
unsigned nr_evictable = 0;
for (auto i = 0; i < num_readers; ++i) {
const auto memory_only = tests::random::get_bool();
const auto evictable = !memory_only && tests::random::get_bool();
if (memory_only) {
++nr_memory_only;
} else if (evictable) {
++nr_evictable;
} else {
++nr_admitted;
}
readers.emplace_back(reader(s.schema(), semaphore.make_permit(s.schema().get(), fmt::format("reader{}", i)), memory_only, evictable));
}
testlog.info("Created {} readers, memory_only={}, admitted={}, evictable={}", readers.size(), nr_memory_only, nr_admitted, nr_evictable);
bool watchdog_touched = false;
auto watchdog = timer<db::timeout_clock>([&semaphore, &watchdog_touched] {
if (!watchdog_touched) {
testlog.error("Watchdog detected a deadlock, dumping diagnostics before killing the test: {}", semaphore.dump_diagnostics());
semaphore.broken(std::make_exception_ptr(std::runtime_error("test killed by watchdog")));
}
watchdog_touched = false;
});
watchdog.arm_periodic(std::chrono::seconds(30));
parallel_for_each(readers, [&] (std::optional<reader>& r) -> future<> {
for (auto i = 0; i < ticks; ++i) {
watchdog_touched = true;
co_await r->tick();
}
r.reset();
watchdog_touched = true;
}).get();
}
static
sstables::shared_sstable create_sstable(sstables::test_env& env, schema_ptr s, std::vector<mutation> mutations) {
static thread_local auto tmp = tmpdir();
@@ -3043,39 +3258,30 @@ flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(
reader_permit permit,
const dht::partition_range& prange,
const query::partition_slice& slice,
std::deque<mutation_fragment> first_buffer,
position_in_partition_view last_fragment_position,
std::deque<mutation_fragment> second_buffer,
size_t max_buffer_size) {
std::list<std::deque<mutation_fragment>> buffers,
position_in_partition_view first_buf_last_fragment_position,
size_t max_buffer_size,
bool detach_buffer = true) {
class factory {
schema_ptr _schema;
reader_permit _permit;
std::optional<std::deque<mutation_fragment>> _first_buffer;
std::optional<std::deque<mutation_fragment>> _second_buffer;
std::list<std::deque<mutation_fragment>> _buffers;
size_t _max_buffer_size;
private:
std::optional<std::deque<mutation_fragment>> copy_buffer(const std::optional<std::deque<mutation_fragment>>& o) {
if (!o) {
return {};
}
return copy_fragments(*_schema, _permit, *o);
}
public:
factory(schema_ptr schema, reader_permit permit, std::deque<mutation_fragment> first_buffer, std::deque<mutation_fragment> second_buffer, size_t max_buffer_size)
factory(schema_ptr schema, reader_permit permit, std::list<std::deque<mutation_fragment>> buffers, size_t max_buffer_size)
: _schema(std::move(schema))
, _permit(std::move(permit))
, _first_buffer(std::move(first_buffer))
, _second_buffer(std::move(second_buffer))
, _buffers(std::move(buffers))
, _max_buffer_size(max_buffer_size) {
}
factory(const factory& o)
: _schema(o._schema)
, _permit(o._permit)
, _first_buffer(copy_buffer(o._first_buffer))
, _second_buffer(copy_buffer(o._second_buffer)) {
, _permit(o._permit) {
for (const auto& buf : o._buffers) {
_buffers.emplace_back(copy_fragments(*_schema, _permit, buf));
}
}
factory(factory&& o) = default;
@@ -3089,14 +3295,9 @@ flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(
streamed_mutation::forwarding fwd_sm,
mutation_reader::forwarding fwd_mr) {
BOOST_REQUIRE(s == _schema);
if (_first_buffer) {
auto buf = *std::exchange(_first_buffer, {});
auto rd = make_flat_mutation_reader_from_fragments(_schema, std::move(permit), std::move(buf));
rd.set_max_buffer_size(_max_buffer_size);
return rd;
}
if (_second_buffer) {
auto buf = *std::exchange(_second_buffer, {});
if (!_buffers.empty()) {
auto buf = std::move(_buffers.front());
_buffers.pop_front();
auto rd = make_flat_mutation_reader_from_fragments(_schema, std::move(permit), std::move(buf));
rd.set_max_buffer_size(_max_buffer_size);
return rd;
@@ -3104,9 +3305,9 @@ flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(
return make_empty_flat_reader(_schema, std::move(permit));
}
};
auto ms = mutation_source(factory(schema, permit, std::move(first_buffer), std::move(second_buffer), max_buffer_size));
auto ms = mutation_source(factory(schema, permit, std::move(buffers), max_buffer_size));
auto [rd, handle] = make_manually_paused_evictable_reader(
auto rd = make_auto_paused_evictable_reader(
std::move(ms),
schema,
permit,
@@ -3122,18 +3323,42 @@ flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(
const auto eq_cmp = position_in_partition::equal_compare(*schema);
BOOST_REQUIRE(rd.is_buffer_full());
BOOST_REQUIRE(eq_cmp(rd.buffer().back().position(), last_fragment_position));
BOOST_REQUIRE(eq_cmp(rd.buffer().back().position(), first_buf_last_fragment_position));
BOOST_REQUIRE(!rd.is_end_of_stream());
rd.detach_buffer();
handle.pause();
if (detach_buffer) {
rd.detach_buffer();
}
while(permit.semaphore().try_evict_one_inactive_read());
return std::move(rd);
}
flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(
schema_ptr schema,
reader_permit permit,
const dht::partition_range& prange,
const query::partition_slice& slice,
std::deque<mutation_fragment> first_buffer,
position_in_partition_view last_fragment_position,
std::deque<mutation_fragment> last_buffer,
size_t max_buffer_size,
bool detach_buffer = true) {
std::list<std::deque<mutation_fragment>> list;
list.emplace_back(std::move(first_buffer));
list.emplace_back(std::move(last_buffer));
return create_evictable_reader_and_evict_after_first_buffer(
std::move(schema),
std::move(permit),
prange,
slice,
std::move(list),
last_fragment_position,
max_buffer_size,
detach_buffer);
}
}
SEASTAR_THREAD_TEST_CASE(test_evictable_reader_trim_range_tombstones) {
@@ -3435,7 +3660,7 @@ SEASTAR_THREAD_TEST_CASE(test_evictable_reader_self_validation) {
check_evictable_reader_validation_is_triggered(
"pkey > _last_pkey; pkey ∈ pkrange",
partition_error_prefix,
"",
s.schema(),
permit,
prange,
@@ -3524,12 +3749,338 @@ SEASTAR_THREAD_TEST_CASE(test_evictable_reader_self_validation) {
max_buffer_size);
}
SEASTAR_THREAD_TEST_CASE(test_evictable_reader_recreate_before_fast_forward_to) {
class test_reader : public flat_mutation_reader::impl {
simple_schema _s;
const std::vector<dht::decorated_key> _pkeys;
std::vector<dht::decorated_key>::const_iterator _it;
std::vector<dht::decorated_key>::const_iterator _end;
private:
void on_range_change(const dht::partition_range& pr) {
dht::ring_position_comparator cmp(*_schema);
_it = _pkeys.begin();
while (_it != _pkeys.end() && !pr.contains(*_it, cmp)) {
++_it;
}
_end = _it;
while (_end != _pkeys.end() && pr.contains(*_end, cmp)) {
++_end;
}
}
public:
test_reader(simple_schema s, reader_permit permit, const dht::partition_range& pr, std::vector<dht::decorated_key> pkeys)
: impl(s.schema(), std::move(permit))
, _s(std::move(s))
, _pkeys(std::move(pkeys)) {
on_range_change(pr);
}
virtual future<> fill_buffer(db::timeout_clock::time_point) override {
if (_it == _end) {
_end_of_stream = true;
return make_ready_future<>();
}
push_mutation_fragment(*_schema, _permit, partition_start(*_it++, {}));
uint32_t ck = 0;
while (!is_buffer_full()) {
push_mutation_fragment(*_schema, _permit, _s.make_row(_s.make_ckey(ck++), make_random_string(1024)));
}
push_mutation_fragment(*_schema, _permit, partition_end());
return make_ready_future<>();
}
virtual void next_partition() override {
}
virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point) override {
on_range_change(pr);
clear_buffer();
_end_of_stream = false;
return make_ready_future<>();
}
virtual future<> fast_forward_to(position_range, db::timeout_clock::time_point) override {
return make_exception_future<>(make_backtraced_exception_ptr<std::bad_function_call>());
}
};
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::no_limits{}, get_name());
simple_schema s;
auto permit = semaphore.make_permit(s.schema().get(), get_name());
auto pkeys = s.make_pkeys(6);
boost::sort(pkeys, dht::decorated_key::less_comparator(s.schema()));
auto ms = mutation_source([&] (schema_ptr schema,
reader_permit permit,
const dht::partition_range& range,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr tr,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) {
std::vector<dht::decorated_key> pkeys_with_data;
bool empty = false;
for (const auto& pkey : pkeys) {
empty = !empty;
if (empty) {
pkeys_with_data.push_back(pkey);
}
}
return make_flat_mutation_reader<test_reader>(
s,
std::move(permit),
range,
std::move(pkeys_with_data));
});
auto pr0 = dht::partition_range::make({pkeys[0], true}, {pkeys[3], true});
auto [reader, handle] = make_manually_paused_evictable_reader(std::move(ms), s.schema(), permit, pr0, s.schema()->full_slice(),
seastar::default_priority_class(), {}, mutation_reader::forwarding::yes);
auto reader_assert = assert_that(std::move(reader));
reader_assert.produces(pkeys[0]);
reader_assert.produces(pkeys[2]);
handle.pause();
BOOST_REQUIRE(semaphore.try_evict_one_inactive_read());
reader_assert.produces_end_of_stream();
auto pr1 = dht::partition_range::make({pkeys[4], true}, {pkeys[5], true});
reader_assert.fast_forward_to(pr1);
// Failure will happen in the form of `on_internal_error()`.
reader_assert.produces(pkeys[4]);
}
SEASTAR_THREAD_TEST_CASE(test_evictable_reader_drop_flags) {
reader_concurrency_semaphore semaphore(1, 0, get_name());
simple_schema s;
auto permit = semaphore.make_permit(s.schema().get(), get_name());
auto pkeys = s.make_pkeys(2);
std::sort(pkeys.begin(), pkeys.end(), [&s] (const auto& pk1, const auto& pk2) {
return pk1.less_compare(*s.schema(), pk2);
});
const auto& pkey1 = pkeys[0];
const auto& pkey2 = pkeys[1];
const int second_buffer_ck = 10;
struct buffer {
simple_schema& s;
reader_permit permit;
std::deque<mutation_fragment> frags;
std::vector<mutation> muts;
size_t size = 0;
std::optional<position_in_partition_view> last_pos;
buffer(simple_schema& s_, reader_permit permit_, dht::decorated_key key)
: s(s_), permit(std::move(permit_)) {
add_partition(key);
}
size_t add_partition(dht::decorated_key key) {
size += frags.emplace_back(*s.schema(), permit, partition_start{key, {}}).memory_usage();
muts.emplace_back(s.schema(), key);
return size;
}
size_t add_mutation_fragment(mutation_fragment&& mf, bool only_to_frags = false) {
if (!only_to_frags) {
muts.back().apply(mf);
}
size += frags.emplace_back(*s.schema(), permit, std::move(mf)).memory_usage();
return size;
}
size_t add_static_row(std::optional<mutation_fragment> sr = {}) {
auto srow = sr ? std::move(*sr) : s.make_static_row("s");
return add_mutation_fragment(std::move(srow));
}
size_t add_clustering_row(int i, bool only_to_frags = false) {
return add_mutation_fragment(mutation_fragment(*s.schema(), permit, s.make_row(s.make_ckey(i), "v")), only_to_frags);
}
size_t add_clustering_rows(int start, int end) {
for (int i = start; i < end; ++i) {
add_clustering_row(i);
}
return size;
}
size_t add_partition_end() {
size += frags.emplace_back(*s.schema(), permit, partition_end{}).memory_usage();
return size;
}
void save_position() { last_pos = frags.back().position(); }
void find_position(size_t buf_size) {
size_t s = 0;
for (const auto& frag : frags) {
s += frag.memory_usage();
if (s >= buf_size) {
last_pos = frag.position();
break;
}
}
BOOST_REQUIRE(last_pos);
}
};
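For reference, `find_position()` above walks the fragments accumulating `memory_usage()` until the requested buffer size is reached, and remembers that fragment's position. A minimal language-agnostic sketch of the same scan (names are illustrative, not Scylla's API):

```python
def find_flush_position(frag_sizes, buf_size):
    """Return the index of the fragment at which cumulative size
    first reaches buf_size, or None if the total stays below it
    (mirrors buffer::find_position above)."""
    total = 0
    for i, size in enumerate(frag_sizes):
        total += size
        if total >= buf_size:
            return i
    return None
```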
auto make_reader = [&] (const buffer& first_buffer, const buffer& second_buffer, const buffer* const third_buffer, size_t max_buffer_size) {
std::list<std::deque<mutation_fragment>> buffers;
buffers.emplace_back(copy_fragments(*s.schema(), permit, first_buffer.frags));
buffers.emplace_back(copy_fragments(*s.schema(), permit, second_buffer.frags));
if (third_buffer) {
buffers.emplace_back(copy_fragments(*s.schema(), permit, third_buffer->frags));
}
return create_evictable_reader_and_evict_after_first_buffer(
s.schema(),
permit,
query::full_partition_range,
s.schema()->full_slice(),
std::move(buffers),
*first_buffer.last_pos,
max_buffer_size,
false);
};
testlog.info("Same partition, with static row");
{
buffer first_buffer(s, permit, pkey1);
first_buffer.add_static_row();
auto srow = mutation_fragment(*s.schema(), permit, first_buffer.frags.back());
const auto buf_size = first_buffer.add_clustering_rows(0, second_buffer_ck);
first_buffer.save_position();
first_buffer.add_clustering_row(second_buffer_ck);
buffer second_buffer(s, permit, pkey1);
second_buffer.add_static_row(std::move(srow));
second_buffer.add_clustering_row(second_buffer_ck);
second_buffer.add_clustering_row(second_buffer_ck + 1);
second_buffer.add_partition_end();
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
.has_monotonic_positions();
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
.produces(first_buffer.muts[0] + second_buffer.muts[0])
.produces_end_of_stream();
}
testlog.info("Same partition, no static row");
{
buffer first_buffer(s, permit, pkey1);
const auto buf_size = first_buffer.add_clustering_rows(0, second_buffer_ck);
first_buffer.save_position();
first_buffer.add_clustering_row(second_buffer_ck);
buffer second_buffer(s, permit, pkey1);
second_buffer.add_clustering_row(second_buffer_ck);
second_buffer.add_clustering_row(second_buffer_ck + 1);
second_buffer.add_partition_end();
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
.has_monotonic_positions();
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
.produces(first_buffer.muts[0] + second_buffer.muts[0])
.produces_end_of_stream();
}
testlog.info("Same partition as expected, no static row, next partition has static row (#8923)");
{
buffer second_buffer(s, permit, pkey1);
second_buffer.add_clustering_rows(second_buffer_ck, second_buffer_ck + second_buffer_ck / 2);
// We want to end the buffer on the partition-start below, but since a
// partition start will be dropped from it, we have to use the size
// without it.
const auto buf_size = second_buffer.add_partition_end();
second_buffer.add_partition(pkey2);
second_buffer.add_static_row();
auto srow = mutation_fragment(*s.schema(), permit, second_buffer.frags.back());
second_buffer.add_clustering_rows(0, 2);
buffer first_buffer(s, permit, pkey1);
for (int i = 0; first_buffer.add_clustering_row(i) < buf_size; ++i);
first_buffer.save_position();
first_buffer.add_mutation_fragment(mutation_fragment(*s.schema(), permit, second_buffer.frags[1]));
buffer third_buffer(s, permit, pkey2);
third_buffer.add_static_row(std::move(srow));
third_buffer.add_clustering_rows(0, 2);
third_buffer.add_partition_end();
first_buffer.find_position(buf_size);
assert_that(make_reader(first_buffer, second_buffer, &third_buffer, buf_size))
.has_monotonic_positions();
assert_that(make_reader(first_buffer, second_buffer, &third_buffer, buf_size))
.produces(first_buffer.muts[0] + second_buffer.muts[0])
.produces(second_buffer.muts[1] + third_buffer.muts[0])
.produces_end_of_stream();
}
testlog.info("Next partition, with no static row");
{
buffer first_buffer(s, permit, pkey1);
const auto buf_size = first_buffer.add_clustering_rows(0, second_buffer_ck);
first_buffer.save_position();
first_buffer.add_clustering_row(second_buffer_ck + 1, true);
buffer second_buffer(s, permit, pkey2);
second_buffer.add_clustering_rows(0, second_buffer_ck / 2);
second_buffer.add_partition_end();
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
.has_monotonic_positions();
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
.produces(first_buffer.muts[0])
.produces(second_buffer.muts[0])
.produces_end_of_stream();
}
testlog.info("Next partition, with static row");
{
buffer first_buffer(s, permit, pkey1);
const auto buf_size = first_buffer.add_clustering_rows(0, second_buffer_ck);
first_buffer.save_position();
first_buffer.add_clustering_row(second_buffer_ck + 1, true);
buffer second_buffer(s, permit, pkey2);
second_buffer.add_static_row();
second_buffer.add_clustering_rows(0, second_buffer_ck / 2);
second_buffer.add_partition_end();
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
.has_monotonic_positions();
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
.produces(first_buffer.muts[0])
.produces(second_buffer.muts[0])
.produces_end_of_stream();
}
}
struct mutation_bounds {
std::optional<mutation> m;
position_in_partition lower;
position_in_partition upper;
};
static reader_bounds make_reader_bounds(
schema_ptr s, reader_permit permit, mutation_bounds mb, streamed_mutation::forwarding fwd,
const query::partition_slice* slice = nullptr) {
if (!slice) {
slice = &s->full_slice();
}
return reader_bounds {
.r = mb.m ? flat_mutation_reader_from_mutations(permit, {std::move(*mb.m)}, *slice, fwd)
: make_empty_flat_reader(s, permit),
.lower = std::move(mb.lower),
.upper = std::move(mb.upper)
};
}
struct clustering_order_merger_test_generator {
struct scenario {
std::vector<mutation_bounds> readers_data;
@@ -3539,13 +4090,13 @@ struct clustering_order_merger_test_generator {
schema_ptr _s;
partition_key _pk;
clustering_order_merger_test_generator(std::optional<sstring> pk = std::nullopt)
: _s(make_schema()), _pk(partition_key::from_single_value(*_s, utf8_type->decompose(pk ? *pk : make_local_key(make_schema()))))
{}
static schema_ptr make_schema() {
return schema_builder("ks", "t")
.with_column("pk", utf8_type, column_kind::partition_key)
.with_column("ck", int32_type, column_kind::clustering_key)
.with_column("v", int32_type, column_kind::regular_column)
.build();
@@ -3569,15 +4120,18 @@ struct clustering_order_merger_test_generator {
return m;
}
dht::decorated_key decorated_pk() const {
return dht::decorate_key(*_s, _pk);
}
scenario generate_scenario(std::mt19937& engine) const {
std::set<int> all_ks;
std::vector<mutation_bounds> readers_data;
auto num_readers = tests::random::get_int(1, 10, engine);
auto num_empty_readers = tests::random::get_int(1, num_readers, engine);
while (num_empty_readers--) {
auto lower = -tests::random::get_int(0, 5, engine);
auto upper = tests::random::get_int(0, 5, engine);
readers_data.push_back(mutation_bounds{std::nullopt, mk_pos_for(lower), mk_pos_for(upper)});
num_readers--;
}
while (num_readers--) {
auto len = tests::random::get_int(0, 15, engine);
auto ks = tests::random::random_subset<int>(100, len, engine);
@@ -3625,16 +4179,17 @@ struct clustering_order_merger_test_generator {
SEASTAR_THREAD_TEST_CASE(test_clustering_order_merger_in_memory) {
clustering_order_merger_test_generator g;
auto make_authority = [s = g._s] (std::optional<mutation> mut, streamed_mutation::forwarding fwd) {
if (mut) {
return flat_mutation_reader_from_mutations(tests::make_permit(), {std::move(*mut)}, fwd);
}
return make_empty_flat_reader(s, tests::make_permit());
};
auto make_tested = [s = g._s] (std::vector<mutation_bounds> ms, streamed_mutation::forwarding fwd) {
auto rs = boost::copy_range<std::vector<reader_bounds>>(std::move(ms)
| boost::adaptors::transformed([s, fwd] (auto&& mb) {
return make_reader_bounds(s, tests::make_permit(), std::move(mb), fwd);
}));
auto q = std::make_unique<simple_position_reader_queue>(*s, std::move(rs));
return make_clustering_combined_reader(s, tests::make_permit(), fwd, std::move(q));
@@ -3647,7 +4202,15 @@ SEASTAR_THREAD_TEST_CASE(test_clustering_order_merger_in_memory) {
for (int run = 0; run < 1000; ++run) {
auto scenario = g.generate_scenario(engine);
auto merged = std::accumulate(scenario.readers_data.begin(), scenario.readers_data.end(),
std::optional<mutation>{}, [&g] (std::optional<mutation> curr, const mutation_bounds& mb) {
if (mb.m) {
if (!curr) {
curr = mutation(g._s, g._pk);
}
*curr += *mb.m;
}
return curr;
});
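The fold above merges a sequence of optional mutations into a single optional result, staying empty when every input is empty. A minimal Python sketch of that pattern (dicts and `update()` stand in for `mutation` and its apply; names are illustrative):

```python
from functools import reduce
from typing import Optional

def merge_optionals(items: list) -> Optional[dict]:
    """Fold a list of optional values; None entries are skipped, and the
    result stays None when every entry is None (mirroring the accumulate
    over std::optional<mutation> above)."""
    def step(curr, item):
        if item is None:
            return curr
        if curr is None:
            curr = {}
        curr.update(item)  # stand-in for applying one mutation onto another
        return curr
    return reduce(step, items, None)
```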
{
auto fwd = streamed_mutation::forwarding::no;
@@ -3670,13 +4233,16 @@ SEASTAR_THREAD_TEST_CASE(test_clustering_order_merger_in_memory) {
SEASTAR_THREAD_TEST_CASE(test_clustering_order_merger_sstable_set) {
sstables::test_env::do_with_async([] (sstables::test_env& env) {
storage_service_for_tests ssft;
auto pkeys = make_local_keys(2, clustering_order_merger_test_generator::make_schema());
clustering_order_merger_test_generator g(pkeys[0]);
auto make_authority = [s = g._s] (mutation mut, streamed_mutation::forwarding fwd) {
return flat_mutation_reader_from_mutations(tests::make_permit(), {std::move(mut)}, fwd);
};
auto pos = dht::ring_position(dht::decorate_key(*g._s, g._pk));
auto make_tested = [s = g._s, pk = g._pk, &pos]
(const time_series_sstable_set& sst_set,
const std::unordered_set<int64_t>& included_gens, streamed_mutation::forwarding fwd) {
auto q = sst_set.make_min_position_reader_queue(
@@ -3684,7 +4250,8 @@ SEASTAR_THREAD_TEST_CASE(test_clustering_order_merger_sstable_set) {
return sst.read_row_flat(s, tests::make_permit(), pos,
s->full_slice(), seastar::default_priority_class(), nullptr, fwd);
},
[included_gens] (const sstable& sst) { return included_gens.contains(sst.generation()); },
pk, s, tests::make_permit(), fwd);
return make_clustering_combined_reader(s, tests::make_permit(), fwd, std::move(q));
};
@@ -3702,32 +4269,39 @@ SEASTAR_THREAD_TEST_CASE(test_clustering_order_merger_sstable_set) {
std::unordered_set<int64_t> included_gens;
int64_t gen = 0;
for (auto& mb: scenario.readers_data) {
auto sst_factory = [s = g._s, &env, &tmp, gen = ++gen] () {
return env.make_sstable(std::move(s), tmp.path().string(), gen,
sstables::sstable::version_types::md, sstables::sstable::format_types::big);
};
if (mb.m) {
sst_set.insert(make_sstable_containing(std::move(sst_factory), {*mb.m}));
} else {
// We want to have an sstable that won't return any fragments when we query it
// for our partition (not even `partition_start`). For that we create an sstable
// with a different partition.
auto pk = partition_key::from_single_value(*g._s, utf8_type->decompose(pkeys[1]));
assert(pk != g._pk);
sst_set.insert(make_sstable_containing(std::move(sst_factory), {mutation(g._s, pk)}));
}
if (dist(engine)) {
included_gens.insert(gen);
if (mb.m) {
merged += *mb.m;
}
}
}
if (included_gens.empty()) {
for (auto fwd: {streamed_mutation::forwarding::no, streamed_mutation::forwarding::yes}) {
assert_that(make_tested(sst_set, included_gens, fwd)).produces_end_of_stream();
}
continue;
}
{
auto fwd = streamed_mutation::forwarding::no;
compare_readers(*g._s, make_authority(merged, fwd), make_tested(sst_set, included_gens, fwd));
}
auto fwd = streamed_mutation::forwarding::yes;
compare_readers(*g._s, make_authority(std::move(merged), fwd),
make_tested(sst_set, included_gens, fwd), scenario.fwd_ranges);
}
}).get();
@@ -3915,9 +4489,7 @@ SEASTAR_THREAD_TEST_CASE(clustering_combined_reader_mutation_source_test) {
for (auto& [k, ms]: good) {
auto rs = boost::copy_range<std::vector<reader_bounds>>(std::move(ms)
| boost::adaptors::transformed([&] (auto&& mb) {
return make_reader_bounds(s, permit, std::move(mb), fwd_sm, &slice);
}));
std::sort(rs.begin(), rs.end(), [less = position_in_partition::less_compare(*s)]
(const reader_bounds& a, const reader_bounds& b) { return less(a.lower, b.lower); });
@@ -3937,3 +4509,23 @@ SEASTAR_THREAD_TEST_CASE(clustering_combined_reader_mutation_source_test) {
run_mutation_source_tests(std::move(populate));
}
// Regression test for #8445.
SEASTAR_THREAD_TEST_CASE(test_clustering_combining_of_empty_readers) {
auto s = clustering_order_merger_test_generator::make_schema();
std::vector<reader_bounds> rs;
rs.push_back({
.r = make_empty_flat_reader(s, tests::make_permit()),
.lower = position_in_partition::before_all_clustered_rows(),
.upper = position_in_partition::after_all_clustered_rows()
});
auto r = make_clustering_combined_reader(
s, tests::make_permit(), streamed_mutation::forwarding::no,
std::make_unique<simple_position_reader_queue>(*s, std::move(rs)));
auto mf = r(db::no_timeout).get0();
if (mf) {
BOOST_FAIL(format("reader combined of empty readers returned fragment {}", mutation_fragment::printer(*s, *mf)));
}
}
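For context, the clustering-order merger exercised by the regression test above combines readers whose fragments are already sorted by position, where some readers may be empty. A minimal sketch of merging k pre-sorted streams with Python's `heapq` (illustrative only, not Scylla's implementation):

```python
import heapq

def merge_sorted_streams(streams):
    """Merge k ascending iterables into one ascending sequence.
    Empty streams contribute nothing, and merging only empty
    streams yields an empty result (the case the regression
    test above exercises)."""
    return list(heapq.merge(*streams))
```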


@@ -1805,12 +1805,16 @@ SEASTAR_TEST_CASE(test_mutation_diff_with_random_generator) {
BOOST_FAIL(format("Partitions don't match, got: {}\n...and: {}", mutation_partition::printer(s, mp1), mutation_partition::printer(s, mp2)));
}
};
const auto now = gc_clock::now();
can_gc_fn never_gc = [] (tombstone) { return false; };
for_each_mutation_pair([&] (auto m1, auto m2, are_equal eq) {
mutation_application_stats app_stats;
auto s = m1.schema();
if (s != m2.schema()) {
return;
}
m1.partition().compact_for_compaction(*s, never_gc, now);
m2.partition().compact_for_compaction(*s, never_gc, now);
auto m12 = m1;
m12.apply(m2);
auto m12_with_diff = m1;


@@ -712,7 +712,10 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) {
nullptr,
db::no_timeout).get();
// The second read might be evicted too if it consumes more
// memory than the first and hence triggers memory control when
// saved in the querier cache.
BOOST_CHECK_GE(db.get_querier_cache_stats().resource_based_evictions, 1);
// We want to read the entire partition so that the querier
// is not saved at the end and thus ensure it is destroyed.


@@ -28,6 +28,8 @@
#include "sstables/sstables.hh"
#include "test/lib/mutation_source_test.hh"
#include "test/lib/sstable_utils.hh"
#include "test/lib/mutation_assertions.hh"
#include "partition_slice_builder.hh"
using namespace sstables;
using namespace std::chrono_literals;
@@ -62,3 +64,69 @@ SEASTAR_TEST_CASE(test_sstable_conforms_to_mutation_source) {
}
});
}
// Regression test for scylladb/scylla-enterprise#2016
SEASTAR_THREAD_TEST_CASE(test_produces_range_tombstone) {
auto s = schema_builder("ks", "cf")
.with_column("pk", int32_type, column_kind::partition_key)
.with_column("ck", int32_type, column_kind::clustering_key)
.with_column("v", int32_type, column_kind::regular_column)
.build();
mutation m(s, partition_key::from_single_value(*s, int32_type->decompose(0)));
m.partition().apply_row_tombstone(*s, range_tombstone{
clustering_key::from_exploded(*s, {int32_type->decompose(6)}), bound_kind::excl_start,
clustering_key::from_exploded(*s, {int32_type->decompose(10)}), bound_kind::incl_end,
tombstone(0, gc_clock::time_point())
});
{
auto ckey = clustering_key::from_exploded(*s, {int32_type->decompose(6)});
deletable_row& row = m.partition().clustered_row(*s, ckey, is_dummy::no, is_continuous(false));
row.marker() = row_marker(4);
}
{
auto ckey = clustering_key::from_exploded(*s, {int32_type->decompose(8)});
deletable_row& row = m.partition().clustered_row(*s, ckey, is_dummy::no, is_continuous(false));
row.apply(tombstone(2, gc_clock::time_point()));
row.marker() = row_marker(5);
}
testlog.info("m: {}", m);
auto slice = partition_slice_builder(*s)
.with_range(query::clustering_range::make(
{clustering_key::from_exploded(*s, {int32_type->decompose(8)}), false},
{clustering_key::from_exploded(*s, {int32_type->decompose(10)}), true}
))
.build();
auto pr = dht::partition_range::make_singular(m.decorated_key());
std::vector<tmpdir> dirs;
dirs.emplace_back();
sstables::test_env::do_with_async([&] (sstables::test_env& env) {
storage_service_for_tests ssft;
auto version = sstable_version_types::la;
auto index_block_size = 1;
sstable_writer_config cfg = env.manager().configure_writer();
cfg.promoted_index_block_size = index_block_size;
auto source = make_sstable_mutation_source(env, s, dirs.back().path().string(), {m}, cfg, version, gc_clock::now());
{
auto rd = source.make_reader(s, tests::make_permit(), pr, slice);
while (auto mf = rd(db::no_timeout).get0()) {
testlog.info("produced {}", mutation_fragment::printer(*s, *mf));
}
}
{
auto rd = source.make_reader(s, tests::make_permit(), pr, slice);
mutation_opt sliced_m = read_mutation_from_flat_mutation_reader(rd, db::no_timeout).get0();
BOOST_REQUIRE(bool(sliced_m));
assert_that(*sliced_m).is_equal_to(m, slice.row_ranges(*m.schema(), m.key()));
}
}).get();
}


@@ -6814,3 +6814,187 @@ SEASTAR_TEST_CASE(test_twcs_interposer_on_memtable_flush) {
test_interposer_on_flush(false);
});
}
// Regression test for #8432
SEASTAR_TEST_CASE(test_twcs_single_key_reader_filtering) {
return test_env::do_with_async([] (test_env& env) {
auto builder = schema_builder("tests", "twcs_single_key_reader_filtering")
.with_column("pk", int32_type, column_kind::partition_key)
.with_column("ck", int32_type, column_kind::clustering_key)
.with_column("v", int32_type);
builder.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
auto s = builder.build();
auto tmp = tmpdir();
auto sst_gen = [&env, s, &tmp, gen = make_lw_shared<unsigned>(1)]() {
return env.make_sstable(s, tmp.path().string(), (*gen)++, sstables::sstable::version_types::md, big);
};
auto make_row = [&] (int32_t pk, int32_t ck) {
mutation m(s, partition_key::from_single_value(*s, int32_type->decompose(pk)));
m.set_clustered_cell(clustering_key::from_single_value(*s, int32_type->decompose(ck)), to_bytes("v"), int32_t(0), api::new_timestamp());
return m;
};
auto sst1 = make_sstable_containing(sst_gen, {make_row(0, 0)});
auto sst2 = make_sstable_containing(sst_gen, {make_row(0, 1)});
auto dkey = sst1->get_first_decorated_key();
auto cm = make_lw_shared<compaction_manager>();
column_family::config cfg = column_family_test_config(env.manager());
::cf_stats cf_stats{0};
cfg.cf_stats = &cf_stats;
cfg.datadir = tmp.path().string();
auto tracker = make_lw_shared<cache_tracker>();
cell_locker_stats cl_stats;
column_family cf(s, cfg, column_family::no_commitlog(), *cm, cl_stats, *tracker);
cf.mark_ready_for_writes();
cf.start();
auto cs = sstables::make_compaction_strategy(sstables::compaction_strategy_type::time_window, {});
auto set = cs.make_sstable_set(s);
set.insert(std::move(sst1));
set.insert(std::move(sst2));
reader_permit permit = tests::make_permit();
utils::estimated_histogram eh;
auto slice = partition_slice_builder(*s)
.with_range(query::clustering_range {
query::clustering_range::bound { clustering_key_prefix::from_single_value(*s, int32_type->decompose(0)) },
query::clustering_range::bound { clustering_key_prefix::from_single_value(*s, int32_type->decompose(1)) },
}).build();
auto pos = dht::ring_position(dkey);
auto reader = set.create_single_key_sstable_reader(
&cf, s, permit, eh, pos, slice, default_priority_class(),
tracing::trace_state_ptr(), ::streamed_mutation::forwarding::no,
::mutation_reader::forwarding::no);
auto checked_by_ck = cf_stats.sstables_checked_by_clustering_filter;
auto surviving_after_ck = cf_stats.surviving_sstables_after_clustering_filter;
// consume all fragments
while (reader(db::no_timeout).get());
// At least sst2 should be checked by the CK filter during fragment consumption and should pass.
// With the bug in #8432, sst2 wouldn't even be checked by the CK filter since it would pass right after checking the PK filter.
BOOST_REQUIRE_GE(cf_stats.sstables_checked_by_clustering_filter - checked_by_ck, 1);
BOOST_REQUIRE_EQUAL(
cf_stats.surviving_sstables_after_clustering_filter - surviving_after_ck,
cf_stats.sstables_checked_by_clustering_filter - checked_by_ck);
});
}
SEASTAR_TEST_CASE(max_ongoing_compaction_test) {
return test_env::do_with_async([] (test_env& env) {
BOOST_REQUIRE(smp::count == 1);
auto make_schema = [] (auto idx) {
auto builder = schema_builder("tests", std::to_string(idx))
.with_column("id", utf8_type, column_kind::partition_key)
.with_column("cl", int32_type, column_kind::clustering_key)
.with_column("value", int32_type);
builder.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
std::map <sstring, sstring> opts = {
{time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY, "HOURS"},
{time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, "1"},
{time_window_compaction_strategy_options::EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY, "0"},
};
builder.set_compaction_strategy_options(std::move(opts));
builder.set_gc_grace_seconds(0);
return builder.build();
};
auto cm = make_lw_shared<compaction_manager>();
cm->enable();
auto stop_cm = defer([&cm] {
cm->stop().get();
});
auto tmp = tmpdir();
auto cl_stats = make_lw_shared<cell_locker_stats>();
auto tracker = make_lw_shared<cache_tracker>();
auto tokens = token_generation_for_shard(1, this_shard_id(), test_db_config.murmur3_partitioner_ignore_msb_bits(), smp::count);
auto next_timestamp = [] (auto step) {
using namespace std::chrono;
return (gc_clock::now().time_since_epoch() - duration_cast<microseconds>(step)).count();
};
auto make_expiring_cell = [&] (schema_ptr s, std::chrono::hours step) {
static thread_local int32_t value = 1;
auto key_str = tokens[0].first;
auto key = partition_key::from_exploded(*s, {to_bytes(key_str)});
mutation m(s, key);
auto c_key = clustering_key::from_exploded(*s, {int32_type->decompose(value++)});
m.set_clustered_cell(c_key, bytes("value"), data_value(int32_t(value)), next_timestamp(step), gc_clock::duration(step + 5s));
return m;
};
auto make_table_with_single_fully_expired_sstable = [&] (auto idx) {
auto s = make_schema(idx);
column_family::config cfg = column_family_test_config(env.manager());
cfg.datadir = tmp.path().string() + "/" + std::to_string(idx);
touch_directory(cfg.datadir).get();
cfg.enable_commitlog = false;
cfg.enable_incremental_backups = false;
auto sst_gen = [&env, s, dir = cfg.datadir, gen = make_lw_shared<unsigned>(1)] () mutable {
return env.make_sstable(s, dir, (*gen)++, sstables::sstable::version_types::md, big);
};
auto cf = make_lw_shared<column_family>(s, cfg, column_family::no_commitlog(), *cm, *cl_stats, *tracker);
cf->start();
cf->mark_ready_for_writes();
auto muts = { make_expiring_cell(s, std::chrono::hours(1)) };
auto sst = make_sstable_containing(sst_gen, muts);
column_family_test(cf).add_sstable(sst);
return cf;
};
std::vector<lw_shared_ptr<column_family>> tables;
auto stop_tables = defer([&tables] {
for (auto& t : tables) {
t->stop().get();
}
});
for (auto i = 0; i < 100; i++) {
tables.push_back(make_table_with_single_fully_expired_sstable(i));
}
// Make sure everything is expired
forward_jump_clocks(std::chrono::hours(100));
for (auto& t : tables) {
BOOST_REQUIRE(t->sstables_count() == 1);
t->trigger_compaction();
}
BOOST_REQUIRE(cm->get_stats().pending_tasks >= 1 || cm->get_stats().active_tasks >= 1);
size_t max_ongoing_compaction = 0;
// wait for submitted jobs to finish.
auto end = [cm, &tables] {
return cm->get_stats().pending_tasks == 0 && cm->get_stats().active_tasks == 0
&& boost::algorithm::all_of(tables, [] (auto& t) { return t->sstables_count() == 0; });
};
while (!end()) {
if (!cm->get_stats().pending_tasks && !cm->get_stats().active_tasks) {
for (auto& t : tables) {
if (t->sstables_count()) {
t->trigger_compaction();
}
}
}
max_ongoing_compaction = std::max(cm->get_stats().active_tasks, max_ongoing_compaction);
later().get();
}
BOOST_REQUIRE(cm->get_stats().errors == 0);
BOOST_REQUIRE(max_ongoing_compaction == 1);
});
}


@@ -440,7 +440,6 @@ def testNestedClusteringKeyUsage(cql, test_keyspace):
)
# Reproduces issue #7868 and #7902
def testNestedClusteringKeyUsageWithReverseOrder(cql, test_keyspace):
with create_table(cql, test_keyspace, "(a int, b frozen<map<set<int>, list<int>>>, c frozen<set<int>>, d int, PRIMARY KEY (a, b, c)) WITH CLUSTERING ORDER BY (b DESC)") as table:
execute(cql, table, "INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, {}, set(), 0)


@@ -175,9 +175,12 @@ def wait_for_index(cql, table, column, everything):
results = []
for v in column_values:
results.extend(list(cql.execute(f'SELECT * FROM {table} WHERE {column}={v}')))
if sorted(results) == sorted(everything):
return
time.sleep(0.1)
pytest.fail('Timeout waiting for index to become up to date.')
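The change from `set()` to `sorted()` above matters because a query against a not-yet-consistent index can legitimately return duplicate rows; set comparison collapses the duplicates and can declare the index caught up too early. A minimal illustration with plain tuples standing in for result rows:

```python
def index_caught_up(results, everything):
    # sorted() preserves multiplicity, so a result set containing a
    # duplicated row is not mistaken for the complete table
    return sorted(results) == sorted(everything)
```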
@pytest.fixture(scope="session")
@@ -280,7 +283,6 @@ def test_allow_filtering_index_in(cql, table2):
# single partition. Analogous to CASSANDRA-8302, and also reproduced by
# Cassandra's more elaborate test for this issue,
# cassandra_tests/validation/entities/frozen_collections_test.py::testClusteringColumnFiltering
def test_contains_frozen_collection_ck(cql, test_keyspace):
with new_test_table(cql, test_keyspace, "a int, b frozen<map<int, int>>, c int, PRIMARY KEY (a,b,c)") as table:
# The CREATE INDEX for c is necessary to reproduce this bug.
@@ -292,3 +294,46 @@ def test_contains_frozen_collection_ck(cql, test_keyspace):
"SELECT * FROM " + table + " WHERE a=0 AND c=0 AND b CONTAINS 0 ALLOW FILTERING")))
assert 1 == len(list(cql.execute(
"SELECT * FROM " + table + " WHERE a=0 AND c=0 AND b CONTAINS KEY 0 ALLOW FILTERING")))
# table5 contains an indexed table with 3 clustering columns.
# used to test correct filtering of rows fetched from an index table.
@pytest.fixture(scope="module")
def table5(cql, test_keyspace):
table = test_keyspace + "." + unique_name()
cql.execute(f"CREATE TABLE {table} (p int, c1 frozen<list<int>>, c2 frozen<list<int>>, c3 int, PRIMARY KEY (p,c1,c2,c3))")
cql.execute(f"CREATE INDEX ON {table} (c3)")
cql.execute(f"INSERT INTO {table} (p, c1, c2, c3) VALUES (0, [1], [2], 0)")
cql.execute(f"INSERT INTO {table} (p, c1, c2, c3) VALUES (0, [2], [2], 0)")
cql.execute(f"INSERT INTO {table} (p, c1, c2, c3) VALUES (0, [1], [3], 0)")
cql.execute(f"INSERT INTO {table} (p, c1, c2, c3) VALUES (0, [1], [2], 1)")
everything = list(cql.execute(f"SELECT * FROM {table}"))
wait_for_index(cql, table, 'c3', everything)
yield (table, everything)
cql.execute(f"DROP TABLE {table}")
# Test that implementation of filtering for indexes works ok.
# Current implementation is a bit conservative - it might sometimes state
# that filtering is needed when it isn't actually required, but at least it's safe.
def test_select_indexed_cluster_three_keys(cql, table5):
def check_good_row(row):
return row.p == 0 and row.c1 == [1] and row.c2 == [2] and row.c3 == 0
check_af_optional(cql, table5, "c3 = 0", lambda r : r.c3 == 0)
check_af_mandatory(cql, table5, "c1 = [1] AND c2 = [2] AND c3 = 0", check_good_row)
check_af_mandatory(cql, table5, "p = 0 AND c1 CONTAINS 1 AND c3 = 0", lambda r : r.p == 0 and r.c1 == [1] and r.c3 == 0)
check_af_mandatory(cql, table5, "p = 0 AND c1 = [1] AND c2 CONTAINS 2 AND c3 = 0", check_good_row)
# Doesn't use an index - shouldn't be affected
check_af_optional(cql, table5, "p = 0 AND c1 = [1] AND c2 = [2] AND c3 = 0", check_good_row)
# Here are the cases where current implementation of need_filtering() fails
# By coincidence they also fail on cassandra, it looks like cassandra is buggy
@pytest.mark.xfail(reason="Too conservative need_filtering() implementation")
def test_select_indexed_cluster_three_keys_conservative(cql, table5, cassandra_bug):
def check_good_row(row):
return row.p == 0 and row.c1 == [1] and row.c3 == 0
# These queries don't require filtering, but for now we report that they do
check_af_optional(cql, table5, "p = 0 AND c1 = [1] AND c3 = 0", check_good_row)
check_af_optional(cql, table5, "p = 0 AND c1 = [1] AND c2 < [3] AND c3 = 0", lambda r : check_good_row(r) and r.c2 < [3])
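The conservative behavior these tests probe can be modeled with a small sketch (hypothetical, not Scylla's actual need_filtering() code): when querying through an index, any restriction besides the indexed column itself is pessimistically assumed to require filtering, which is safe but sometimes stricter than necessary.

```python
def need_filtering(restrictions, indexed_column):
    """Toy model of a conservative need_filtering() check.

    When a query is served through the index on `indexed_column`, any
    extra restricted column is assumed to need post-read filtering.
    This over-reports (e.g. for "p = 0 AND c1 = [1] AND c3 = 0"), but
    never under-reports, so results stay correct - just slower.
    """
    extra = [r for r in restrictions if r != indexed_column]
    return bool(extra)

# "c3 = 0" alone: served purely by the index, no filtering needed.
assert not need_filtering(["c3"], "c3")
# "c1 = [1] AND c2 = [2] AND c3 = 0": extra restrictions, filtering reported.
assert need_filtering(["c1", "c2", "c3"], "c3")
# The xfail case: ideally no filtering, but this conservative model reports it.
assert need_filtering(["p", "c1", "c3"], "c3")
```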


@@ -26,7 +26,7 @@
from util import unique_name, new_test_table
from cassandra.protocol import FunctionFailure, InvalidRequest
import pytest
import random
@@ -34,58 +34,62 @@ import random
@pytest.fixture(scope="session")
def table1(cql, test_keyspace):
table = test_keyspace + "." + unique_name()
cql.execute(f"CREATE TABLE {table} (p int PRIMARY KEY, v int, a ascii, b boolean)")
yield table
cql.execute("DROP TABLE " + table)
# Test that failed fromJson() parsing an invalid JSON results in the expected
# error - FunctionFailure - and not some weird internal error.
# Reproduces issue #7911.
@pytest.mark.xfail(reason="issue #7911")
def test_failed_json_parsing_unprepared(cql, table1):
p = random.randint(1,1000000000)
with pytest.raises(FunctionFailure):
cql.execute(f"INSERT INTO {table1} (p, v) VALUES ({p}, fromJson('dog'))")
@pytest.mark.xfail(reason="issue #7911")
def test_failed_json_parsing_prepared(cql, table1):
p = random.randint(1,1000000000)
stmt = cql.prepare(f"INSERT INTO {table1} (p, v) VALUES (?, fromJson(?))")
with pytest.raises(FunctionFailure):
cql.execute(stmt, [p, 'dog'])
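The clean failure these tests demand boils down to catching the JSON parse error and converting it into a per-function error, rather than letting it escape as an internal server error. A minimal sketch of that contract, using Python's json module as a stand-in for Scylla's parser (illustrative only):

```python
import json

def from_json_or_error(text):
    """Sketch of the fromJson() contract: an invalid JSON document must
    yield a clean, catchable error - never an internal crash."""
    try:
        return json.loads(text)
    except json.JSONDecodeError as e:
        # Scylla surfaces this over CQL as a FunctionFailure.
        raise ValueError(f"fromJson: invalid JSON: {e}") from None

assert from_json_or_error('"dog"') == "dog"   # valid JSON string
assert from_json_or_error('3') == 3           # valid JSON number
# 'dog' (unquoted) is not valid JSON and must raise cleanly.
```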
# Similarly, if the JSON parsing did not fail, but yielded a type which is
# incompatible with the type we want it to yield, we should get a clean
# FunctionFailure, not some internal server error.
# We have here examples of returning a string where a number was expected,
# and returning a unicode string where ASCII was expected, and returning
# a number of the wrong type.
# Reproduces issue #7911.
@pytest.mark.xfail(reason="issue #7911")
def test_fromjson_wrong_type_unprepared(cql, table1):
p = random.randint(1,1000000000)
with pytest.raises(FunctionFailure):
cql.execute(f"INSERT INTO {table1} (p, v) VALUES ({p}, fromJson('\"dog\"'))")
with pytest.raises(FunctionFailure):
cql.execute(f"INSERT INTO {table1} (p, a) VALUES ({p}, fromJson('3'))")
@pytest.mark.xfail(reason="issue #7911")
def test_fromjson_wrong_type_prepared(cql, table1):
p = random.randint(1,1000000000)
stmt = cql.prepare(f"INSERT INTO {table1} (p, v) VALUES (?, fromJson(?))")
with pytest.raises(FunctionFailure):
cql.execute(stmt, [p, '"dog"'])
stmt = cql.prepare(f"INSERT INTO {table1} (p, a) VALUES (?, fromJson(?))")
with pytest.raises(FunctionFailure):
cql.execute(stmt, [p, '3'])
@pytest.mark.xfail(reason="issue #7911")
def test_fromjson_bad_ascii_unprepared(cql, table1):
p = random.randint(1,1000000000)
with pytest.raises(FunctionFailure):
cql.execute(f"INSERT INTO {table1} (p, a) VALUES ({p}, fromJson('\"שלום\"'))")
@pytest.mark.xfail(reason="issue #7911")
def test_fromjson_bad_ascii_prepared(cql, table1):
p = random.randint(1,1000000000)
stmt = cql.prepare(f"INSERT INTO {table1} (p, a) VALUES (?, fromJson(?))")
with pytest.raises(FunctionFailure):
cql.execute(stmt, [p, '"שלום"'])
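Beyond parsing, the value must also match the target column's type; for an `ascii` column that means the decoded JSON string must be pure ASCII. A sketch of that validation step (hypothetical helper, not Scylla's code):

```python
def validate_ascii(value):
    """Sketch: a JSON value bound to an `ascii` column must be a string
    containing only ASCII characters; anything else is a clean error."""
    if not isinstance(value, str):
        raise ValueError("ascii column expects a JSON string")
    try:
        value.encode("ascii")
    except UnicodeEncodeError:
        raise ValueError("non-ASCII character in ascii column") from None
    return value

assert validate_ascii("dog") == "dog"
# "שלום" parses fine as JSON but must be rejected for an ascii column.
```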
def test_fromjson_nonint_unprepared(cql, table1):
p = random.randint(1,1000000000)
with pytest.raises(FunctionFailure):
cql.execute(f"INSERT INTO {table1} (p, v) VALUES ({p}, fromJson('1.2'))")
def test_fromjson_nonint_prepared(cql, table1):
p = random.randint(1,1000000000)
stmt = cql.prepare(f"INSERT INTO {table1} (p, v) VALUES (?, fromJson(?))")
with pytest.raises(FunctionFailure):
cql.execute(stmt, [p, '1.2'])
# The JSON standard does not define or limit the range or precision of
# numbers. However, if a number is assigned to a Scylla number type, the
@@ -105,7 +109,27 @@ def test_fromjson_int_overflow_prepared(cql, table1):
p = random.randint(1,1000000000)
stmt = cql.prepare(f"INSERT INTO {table1} (p, v) VALUES (?, fromJson(?))")
with pytest.raises(FunctionFailure):
cql.execute(stmt, [p, '2147483648'])
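The non-integer and overflow tests above check the same boundary: a JSON number bound to a CQL `int` must be an exact integer fitting in a signed 32-bit range. A sketch of that check (hypothetical helper, not Scylla's implementation):

```python
def validate_int32(number):
    """Sketch: JSON places no bound on numbers, but a CQL `int` does -
    reject non-integral values (1.2) and out-of-range ones (2147483648)."""
    if isinstance(number, bool) or not isinstance(number, (int, float)):
        raise ValueError("int column expects a JSON number")
    if isinstance(number, float) and not number.is_integer():
        raise ValueError("non-integer value for int column")   # e.g. 1.2
    n = int(number)
    if not -2**31 <= n <= 2**31 - 1:
        raise ValueError("value out of range for 32-bit int")  # e.g. 2147483648
    return n

assert validate_int32(3) == 3
assert validate_int32(2.0) == 2  # integral float is acceptable
```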
# Cassandra allows the strings "true" and "false", not just the JSON constants
# true and false, to be assigned to a boolean column. However, very strangely,
# it only allows this for prepared statements, and *not* for unprepared
# statements - which results in an InvalidRequest!
# Reproduces #7915.
def test_fromjson_boolean_string_unprepared(cql, table1):
p = random.randint(1,1000000000)
with pytest.raises(InvalidRequest):
cql.execute(f"INSERT INTO {table1} (p, b) VALUES ({p}, fromJson('\"true\"'))")
with pytest.raises(InvalidRequest):
cql.execute(f"INSERT INTO {table1} (p, b) VALUES ({p}, fromJson('\"false\"'))")
@pytest.mark.xfail(reason="issue #7915")
def test_fromjson_boolean_string_prepared(cql, table1):
p = random.randint(1,1000000000)
stmt = cql.prepare(f"INSERT INTO {table1} (p, b) VALUES (?, fromJson(?))")
cql.execute(stmt, [p, '"true"'])
assert list(cql.execute(f"SELECT p, b from {table1} where p = {p}")) == [(p, True)]
cql.execute(stmt, [p, '"false"'])
assert list(cql.execute(f"SELECT p, b from {table1} where p = {p}")) == [(p, False)]
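The leniency the xfail test expects (#7915) amounts to a boolean conversion that accepts both JSON booleans and the strings "true"/"false". A sketch of that conversion (illustrative model of Cassandra's prepared-path behavior):

```python
def json_to_boolean(value):
    """Sketch: lenient boolean conversion - accept JSON true/false, and
    also the strings "true"/"false", as Cassandra's prepared path does."""
    if isinstance(value, bool):
        return value
    if isinstance(value, str) and value.lower() in ("true", "false"):
        return value.lower() == "true"
    raise ValueError("cannot convert JSON value to boolean")

assert json_to_boolean("true") is True    # string form, accepted leniently
assert json_to_boolean(False) is False    # plain JSON constant
```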
# Test that null argument is allowed for fromJson(), with unprepared statement
# Reproduces issue #7912.


@@ -82,7 +82,7 @@ def test_create_keyspace_invalid_name(cql):
cql.execute('DROP KEYSPACE "123"')
# Test trying to CREATE a keyspace with invalid options.
# Reproduces #7595.
def test_create_keyspace_nonexistent_dc(cql):
with pytest.raises(ConfigurationException):
ks = unique_name()
@@ -124,7 +124,7 @@ def test_alter_keyspace_invalid(cql):
cql.execute(f"ALTER KEYSPACE {keyspace} WITH REPLICATION = {{ 'class' : 'SimpleStrategy', 'replication_factor' : 'foo' }}")
# Test trying to ALTER a keyspace with invalid options.
# Reproduces #7595.
def test_alter_keyspace_nonexistent_dc(cql):
with new_test_keyspace(cql, "WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }") as keyspace:
with pytest.raises(ConfigurationException):


@@ -20,8 +20,9 @@
import time
import pytest
from cassandra.protocol import SyntaxException, AlreadyExists, InvalidRequest, ConfigurationException, ReadFailure
from cassandra.query import SimpleStatement
from util import new_test_table, unique_name
# A reproducer for issue #7443: Normally, when the entire table is SELECTed,
# the partitions are returned sorted by the partitions' token. When there
@@ -64,3 +65,123 @@ def test_partition_order_with_si(cql, test_keyspace):
pass
time.sleep(0.1)
assert tokens2 == tokens
# Test that the paging state works properly for indexes on tables
# with descending clustering order. There was a problem with indexes
# created on clustering keys with DESC clustering order - they are represented
# as "reverse" types internally and Scylla assertions failed that the base type
# is different from the underlying view type, even though, from the perspective
# of deserialization, they're equal. Issue #8666
def test_paging_with_desc_clustering_order(cql, test_keyspace):
schema = 'p int, c int, primary key (p,c)'
extra = 'with clustering order by (c desc)'
with new_test_table(cql, test_keyspace, schema, extra) as table:
cql.execute(f"CREATE INDEX ON {table}(c)")
for i in range(3):
cql.execute(f"INSERT INTO {table}(p,c) VALUES ({i}, 42)")
stmt = SimpleStatement(f"SELECT * FROM {table} WHERE c = 42", fetch_size=1)
assert len([row for row in cql.execute(stmt)]) == 3
# Test which ensures that indexes for a query are picked by the order in which
# they appear in restrictions. That way, users can deterministically pick
# which indexes are used for which queries.
# Note that the order of picking indexes is not set in stone and may be
# subject to change - in which case this test case should be amended as well.
# The order tested in this case was decided as a good first step in issue
# #7969, but it's possible that it will eventually be implemented another
# way, e.g. dynamically based on estimated query selectivity statistics.
# Ref: #7969
@pytest.mark.xfail(reason="The order of picking indexes is currently arbitrary. Issue #7969")
def test_order_of_indexes(scylla_only, cql, test_keyspace):
schema = 'p int primary key, v1 int, v2 int, v3 int'
with new_test_table(cql, test_keyspace, schema) as table:
cql.execute(f"CREATE INDEX my_v3_idx ON {table}(v3)")
cql.execute(f"CREATE INDEX my_v1_idx ON {table}(v1)")
cql.execute(f"CREATE INDEX my_v2_idx ON {table}((p),v2)")
# All queries below should use the first index they find in the list
# of restrictions. Tracing information will be consulted to ensure
# it's true. Currently some of the cases below succeed, because the
# order is not well defined (and may, for instance, change upon
# server restart), but some of them fail. Once a proper ordering
# is implemented, all cases below should succeed.
def index_used(query, index_name):
assert any([index_name in event.description for event in cql.execute(query, trace=True).get_query_trace().events])
index_used(f"SELECT * FROM {table} WHERE v3 = 1", "my_v3_idx")
index_used(f"SELECT * FROM {table} WHERE v3 = 1 and v1 = 2 allow filtering", "my_v3_idx")
index_used(f"SELECT * FROM {table} WHERE p = 1 and v1 = 1 and v3 = 2 allow filtering", "my_v1_idx")
index_used(f"SELECT * FROM {table} WHERE p = 1 and v3 = 1 and v1 = 2 allow filtering", "my_v3_idx")
# Local indexes are still skipped if they cannot be used
index_used(f"SELECT * FROM {table} WHERE v2 = 1 and v1 = 2 allow filtering", "my_v1_idx")
index_used(f"SELECT * FROM {table} WHERE v2 = 1 and v3 = 2 and v1 = 3 allow filtering", "my_v3_idx")
index_used(f"SELECT * FROM {table} WHERE v1 = 1 and v2 = 2 and v3 = 3 allow filtering", "my_v1_idx")
# Local indexes are still preferred over global ones, if they can be used
index_used(f"SELECT * FROM {table} WHERE p = 1 and v1 = 1 and v3 = 2 and v2 = 2 allow filtering", "my_v2_idx")
index_used(f"SELECT * FROM {table} WHERE p = 1 and v2 = 1 and v1 = 2 allow filtering", "my_v2_idx")
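The deterministic policy this test encodes can be sketched as a toy model (hypothetical; Scylla's resolver is C++ and may end up implemented differently, e.g. selectivity-based): a local index is preferred when it is usable (the partition key is restricted), otherwise the first restricted column with a global index wins, in restriction order.

```python
def pick_index(restrictions, global_indexes, local_indexes, pk_restricted):
    """Toy model of the ordering #7969 asks for.

    restrictions: restricted columns in the order they appear in the query.
    local_indexes/global_indexes: column -> index-name mappings.
    A local index is only usable when the partition key is restricted,
    and usable local indexes are preferred over global ones.
    """
    if pk_restricted:
        for column in restrictions:
            if column in local_indexes:
                return local_indexes[column]
    for column in restrictions:
        if column in global_indexes:
            return global_indexes[column]
    return None

g = {"v1": "my_v1_idx", "v3": "my_v3_idx"}
l = {"v2": "my_v2_idx"}
assert pick_index(["v3", "v1"], g, l, False) == "my_v3_idx"  # first in order
assert pick_index(["v2", "v1"], g, l, False) == "my_v1_idx"  # local skipped, unusable
assert pick_index(["v1", "v3", "v2"], g, l, True) == "my_v2_idx"  # local preferred
```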
# Indexes can be created without an explicit name, in which case a default name is chosen.
# However, due to #8620 it was possible to break the index creation mechanism by creating
# a properly named regular table, which conflicts with the generated index name.
def test_create_unnamed_index_when_its_name_is_taken(cql, test_keyspace):
schema = 'p int primary key, v int'
with new_test_table(cql, test_keyspace, schema) as table:
try:
cql.execute(f"CREATE TABLE {table}_v_idx_index (i_do_not_exist_in_the_base_table int primary key)")
# Creating an index should succeed, even though its default name is taken
# by the table above
cql.execute(f"CREATE INDEX ON {table}(v)")
finally:
cql.execute(f"DROP TABLE {table}_v_idx_index")
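The two naming tests rely on the generated names; the convention, inferred from the `_v_idx` and `_index` suffixes used above (the exact format is an assumption, not taken from Scylla's source), can be sketched as:

```python
def default_index_name(table_name, column):
    """Default index name used when CREATE INDEX is given no name
    (assumed format, inferred from the tests above)."""
    return f"{table_name}_{column}_idx"

def index_view_name(index_name):
    """The materialized view backing an index is named <index-name>_index."""
    return f"{index_name}_index"

assert default_index_name("tbl", "v") == "tbl_v_idx"
assert index_view_name("tbl_v_idx") == "tbl_v_idx_index"
```

This is why creating a table named `{table}_v_idx_index` ahead of time can collide with the view an unnamed `CREATE INDEX ON {table}(v)` would generate (#8620).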
# Indexes created with an explicit name cause a materialized view to be created,
# and this view has a specific name - <index-name>_index. If there happens to be
# a regular table (or another view) named just like that, index creation should fail.
def test_create_named_index_when_its_name_is_taken(scylla_only, cql, test_keyspace):
schema = 'p int primary key, v int'
with new_test_table(cql, test_keyspace, schema) as table:
index_name = unique_name()
try:
cql.execute(f"CREATE TABLE {test_keyspace}.{index_name}_index (i_do_not_exist_in_the_base_table int primary key)")
# Creating an index should fail, because it's impossible to create
# its underlying materialized view, because its name is taken by a regular table
with pytest.raises(InvalidRequest, match="already exists"):
cql.execute(f"CREATE INDEX {index_name} ON {table}(v)")
finally:
cql.execute(f"DROP TABLE {test_keyspace}.{index_name}_index")
# Tests for CREATE INDEX IF NOT EXISTS
# Reproduces issue #8717.
def test_create_index_if_not_exists(cql, test_keyspace):
with new_test_table(cql, test_keyspace, 'p int primary key, v int') as table:
cql.execute(f"CREATE INDEX ON {table}(v)")
# Can't create the same index again without "IF NOT EXISTS", but can
# do it with "IF NOT EXISTS":
with pytest.raises(InvalidRequest, match="duplicate"):
cql.execute(f"CREATE INDEX ON {table}(v)")
cql.execute(f"CREATE INDEX IF NOT EXISTS ON {table}(v)")
cql.execute(f"DROP INDEX {test_keyspace}.{table.split('.')[1]}_v_idx")
# Now test the same thing for named indexes. This is what broke in #8717:
cql.execute(f"CREATE INDEX xyz ON {table}(v)")
with pytest.raises(InvalidRequest, match="already exists"):
cql.execute(f"CREATE INDEX xyz ON {table}(v)")
cql.execute(f"CREATE INDEX IF NOT EXISTS xyz ON {table}(v)")
cql.execute(f"DROP INDEX {test_keyspace}.xyz")
# Exactly the same with non-lower case name.
cql.execute(f'CREATE INDEX "CamelCase" ON {table}(v)')
with pytest.raises(InvalidRequest, match="already exists"):
cql.execute(f'CREATE INDEX "CamelCase" ON {table}(v)')
cql.execute(f'CREATE INDEX IF NOT EXISTS "CamelCase" ON {table}(v)')
cql.execute(f'DROP INDEX {test_keyspace}."CamelCase"')
# Trying to create an index for an attribute that's already indexed,
# but with a different name. The "IF NOT EXISTS" appears to succeed
# in this case, but does not actually create the new index name -
# only the old one remains.
cql.execute(f"CREATE INDEX xyz ON {table}(v)")
with pytest.raises(InvalidRequest, match="duplicate"):
cql.execute(f"CREATE INDEX abc ON {table}(v)")
cql.execute(f"CREATE INDEX IF NOT EXISTS abc ON {table}(v)")
with pytest.raises(InvalidRequest):
cql.execute(f"DROP INDEX {test_keyspace}.abc")
cql.execute(f"DROP INDEX {test_keyspace}.xyz")


@@ -1,25 +1,9 @@
--
-- https://github.com/scylladb/scylla/issues/5935
-- crash if all_replicas is empty
-- test we work well when the list of
-- natural endpoints is empty
-- https://github.com/scylladb/scylla/issues/7595
-- Fail on wrong DC name
--
CREATE KEYSPACE t WITH REPLICATION = { 'class' : 'NetworkTopologyStrategy', 'nosuchdc' : 3 } AND DURABLE_WRITES = true;
CREATE TABLE t.t (a INT PRIMARY KEY, b int);
INSERT INTO t.t (a, b) VALUES (1, 1);
INSERT INTO t.t (a, b) VALUES (2, 2);
SELECT * FROM t.t ALLOW FILTERING;
-- This statement used to trigger a crash
SELECT a FROM t.t WHERE a IN (1, 2);
DELETE FROM t.t WHERE a = 1;
DELETE FROM t.t WHERE a = 2;
CREATE INDEX b ON t.t (b);
SELECT * FROM t.t WHERE b=2;
INSERT INTO t.t (a) VALUES (1) IF NOT EXISTS;
DELETE FROM t.t WHERE a=1 IF EXISTS;
CREATE MATERIALIZED VIEW t.mv AS SELECT a, b FROM t.t WHERE b > 1 PRIMARY KEY (b, a);
SELECT * FROM t.mv WHERE b IN (2, 1);
DROP MATERIALIZED VIEW t.mv;
DROP TABLE t.t;
CREATE KEYSPACE t WITH REPLICATION = { 'class' : 'NetworkTopologyStrategy', 'nosuchdc' : 3 } AND DURABLE_WRITES = true;
CREATE KEYSPACE t WITH REPLICATION = { 'class' : 'NetworkTopologyStrategy', 'datacenter1' : 3 } AND DURABLE_WRITES = true;
DROP KEYSPACE t;
--
-- https://github.com/scylladb/scylla/issues/5962


@@ -1,80 +1,13 @@
--
-- https://github.com/scylladb/scylla/issues/5935
-- crash if all_replicas is empty
-- test we work well when the list of
-- natural endpoints is empty
-- https://github.com/scylladb/scylla/issues/7595
-- Fail on wrong DC name
--
CREATE KEYSPACE t WITH REPLICATION = { 'class' : 'NetworkTopologyStrategy', 'nosuchdc' : 3 } AND DURABLE_WRITES = true;
{
"status" : "ok"
}
CREATE TABLE t.t (a INT PRIMARY KEY, b int);
{
"status" : "ok"
}
INSERT INTO t.t (a, b) VALUES (1, 1);
{
"message" : "exceptions::configuration_exception (Unrecognized strategy option {nosuchdc} passed to org.apache.cassandra.locator.NetworkTopologyStrategy for keyspace t)",
"status" : "error"
}
INSERT INTO t.t (a, b) VALUES (2, 2);
{
"message" : "exceptions::unavailable_exception (Cannot achieve consistency level for cl ONE. Requires 1, alive 0)",
"status" : "error"
}
SELECT * FROM t.t ALLOW FILTERING;
{
"message" : "exceptions::unavailable_exception (Cannot achieve consistency level for cl ONE. Requires 1, alive 0)",
"status" : "error"
}
-- This statement used to trigger a crash
SELECT a FROM t.t WHERE a IN (1, 2);
{
"message" : "exceptions::unavailable_exception (Cannot achieve consistency level for cl ONE. Requires 1, alive 0)",
"status" : "error"
}
DELETE FROM t.t WHERE a = 1;
{
"message" : "exceptions::unavailable_exception (Cannot achieve consistency level for cl ONE. Requires 1, alive 0)",
"status" : "error"
}
DELETE FROM t.t WHERE a = 2;
{
"message" : "exceptions::unavailable_exception (Cannot achieve consistency level for cl ONE. Requires 1, alive 0)",
"status" : "error"
}
CREATE INDEX b ON t.t (b);
{
"status" : "ok"
}
SELECT * FROM t.t WHERE b=2;
{
"message" : "exceptions::unavailable_exception (Cannot achieve consistency level for cl ONE. Requires 1, alive 0)",
"status" : "error"
}
INSERT INTO t.t (a) VALUES (1) IF NOT EXISTS;
{
"message" : "exceptions::unavailable_exception (Cannot achieve consistency level for cl SERIAL. Requires 1, alive 0)",
"status" : "error"
}
DELETE FROM t.t WHERE a=1 IF EXISTS;
{
"message" : "exceptions::unavailable_exception (Cannot achieve consistency level for cl SERIAL. Requires 1, alive 0)",
"status" : "error"
}
CREATE MATERIALIZED VIEW t.mv AS SELECT a, b FROM t.t WHERE b > 1 PRIMARY KEY (b, a);
{
"status" : "ok"
}
SELECT * FROM t.mv WHERE b IN (2, 1);
{
"message" : "exceptions::unavailable_exception (Cannot achieve consistency level for cl ONE. Requires 1, alive 0)",
"status" : "error"
}
DROP MATERIALIZED VIEW t.mv;
{
"status" : "ok"
}
DROP TABLE t.t;
CREATE KEYSPACE t WITH REPLICATION = { 'class' : 'NetworkTopologyStrategy', 'datacenter1' : 3 } AND DURABLE_WRITES = true;
{
"status" : "ok"
}


@@ -2328,11 +2328,15 @@ void for_each_schema_change(std::function<void(schema_ptr, const std::vector<mut
test_mutated_schemas();
}
// Returns true iff the readers were non-empty.
static bool compare_readers(const schema& s, flat_mutation_reader& authority, flat_reader_assertions& tested) {
bool empty = true;
while (auto expected = authority(db::no_timeout).get()) {
tested.produces(s, *expected);
empty = false;
}
tested.produces_end_of_stream();
return !empty;
}
void compare_readers(const schema& s, flat_mutation_reader authority, flat_mutation_reader tested) {
@@ -2340,12 +2344,14 @@ void compare_readers(const schema& s, flat_mutation_reader authority, flat_mutat
compare_readers(s, authority, assertions);
}
// Assumes that the readers return fragments from (at most) a single (and the same) partition.
void compare_readers(const schema& s, flat_mutation_reader authority, flat_mutation_reader tested, const std::vector<position_range>& fwd_ranges) {
auto assertions = assert_that(std::move(tested));
if (compare_readers(s, authority, assertions)) {
for (auto& r: fwd_ranges) {
authority.fast_forward_to(r, db::no_timeout).get();
assertions.fast_forward_to(r);
compare_readers(s, authority, assertions);
}
}
}
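The control-flow change above - only fast-forward through the ranges when the first pass actually produced fragments - can be mirrored with a small Python model of the readers (illustrative only; the real readers are C++ objects driven by futures):

```python
def compare_readers(authority, tested):
    """Drain both readers, asserting they produce the same fragments.
    Returns True iff the readers were non-empty (mirrors the C++ fix,
    where fast-forwarding is skipped for an empty first pass)."""
    empty = True
    for expected in authority:
        assert next(tested) == expected
        empty = False
    # authority exhausted: tested must be at end-of-stream too
    assert next(tested, None) is None
    return not empty

assert compare_readers(iter([1, 2]), iter([1, 2])) is True
assert compare_readers(iter([]), iter([])) is False
```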


@@ -118,6 +118,8 @@ public:
return stop_iteration::no;
});
});
}).finally([&ir] () {
return ir->close();
});
}).then([l] {
return std::move(*l);


@@ -97,12 +97,18 @@ future<> controller::do_start_server() {
};
std::vector<listen_cfg> configs;
int native_port_idx = -1, native_shard_aware_port_idx = -1;
if (cfg.native_transport_port.is_set() ||
(!cfg.native_transport_port_ssl.is_set() && !cfg.native_transport_port.is_set())) {
// Non-SSL port is specified || neither SSL nor non-SSL ports are specified
configs.emplace_back(listen_cfg{ socket_address{ip, cfg.native_transport_port()}, false });
native_port_idx = 0;
}
if (cfg.native_shard_aware_transport_port.is_set() ||
(!cfg.native_shard_aware_transport_port_ssl.is_set() && !cfg.native_shard_aware_transport_port.is_set())) {
configs.emplace_back(listen_cfg{ socket_address{ip, cfg.native_shard_aware_transport_port()}, true });
native_shard_aware_port_idx = native_port_idx + 1;
}
// main should have made sure values are clean and neatish
@@ -127,15 +133,20 @@ future<> controller::do_start_server() {
logger.info("Enabling encrypted CQL connections between client and server");
if (cfg.native_transport_port_ssl.is_set() &&
(!cfg.native_transport_port.is_set() ||
cfg.native_transport_port_ssl() != cfg.native_transport_port())) {
// SSL port is specified && non-SSL port is either left out or set to a different value
configs.emplace_back(listen_cfg{{ip, cfg.native_transport_port_ssl()}, false, cred});
} else if (native_port_idx >= 0) {
configs[native_port_idx].cred = cred;
}
if (cfg.native_shard_aware_transport_port_ssl.is_set() &&
(!cfg.native_shard_aware_transport_port.is_set() ||
cfg.native_shard_aware_transport_port_ssl() != cfg.native_shard_aware_transport_port())) {
configs.emplace_back(listen_cfg{{ip, cfg.native_shard_aware_transport_port_ssl()}, true, std::move(cred)});
} else if (native_shard_aware_port_idx >= 0) {
configs[native_shard_aware_port_idx].cred = std::move(cred);
}
}
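The `is_set()` conditions above implement a simple rule: open the plaintext listener when its port is configured explicitly, or when neither the plaintext nor the SSL port is configured (so the default plaintext port applies). A sketch of just that predicate (hypothetical model of the C++ logic):

```python
def should_listen_plain(port_set, ssl_port_set):
    """Mirror of the condition above: listen on the non-SSL port if it
    was configured explicitly, or if neither port was configured at all
    (fall back to the default plain port). Written verbatim like the
    C++ expression, though it simplifies to: port_set or not ssl_port_set."""
    return port_set or (not ssl_port_set and not port_set)

assert should_listen_plain(True, True) is True    # plain port explicitly set
assert should_listen_plain(False, True) is False  # only SSL configured
assert should_listen_plain(False, False) is True  # nothing configured: default
```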


@@ -572,7 +572,17 @@ future<foreign_ptr<std::unique_ptr<cql_server::response>>>
} catch (const exceptions::prepared_query_not_found_exception& ex) {
try { ++_server._stats.errors[ex.code()]; } catch(...) {}
return make_unprepared_error(stream, ex.code(), ex.what(), ex.id, trace_state);
} catch (const exceptions::function_execution_exception& ex) {
try { ++_server._stats.errors[ex.code()]; } catch(...) {}
return make_function_failure_error(stream, ex.code(), ex.what(), ex.ks_name, ex.func_name, ex.args, trace_state);
} catch (const exceptions::cassandra_exception& ex) {
// Note: the CQL protocol specifies that many types of errors have
// mandatory parameters. These cassandra_exception subclasses MUST
// be handled above. This default "cassandra_exception" case is
// only appropriate for the specific types of errors which do not have
// additional information, such as invalid_request_exception.
// TODO: consider listing those types explicitly, instead of the
// catch-all type cassandra_exception.
try { ++_server._stats.errors[ex.code()]; } catch(...) {}
return make_error(stream, ex.code(), ex.what(), trace_state);
} catch (std::exception& ex) {
@@ -1334,6 +1344,17 @@ std::unique_ptr<cql_server::response> cql_server::connection::make_unprepared_er
return response;
}
std::unique_ptr<cql_server::response> cql_server::connection::make_function_failure_error(int16_t stream, exceptions::exception_code err, sstring msg, sstring ks_name, sstring func_name, std::vector<sstring> args, const tracing::trace_state_ptr& tr_state) const
{
auto response = std::make_unique<cql_server::response>(stream, cql_binary_opcode::ERROR, tr_state);
response->write_int(static_cast<int32_t>(err));
response->write_string(msg);
response->write_string(ks_name);
response->write_string(func_name);
response->write_string_list(args);
return response;
}
std::unique_ptr<cql_server::response> cql_server::connection::make_error(int16_t stream, exceptions::exception_code err, sstring msg, const tracing::trace_state_ptr& tr_state) const
{
auto response = std::make_unique<cql_server::response>(stream, cql_binary_opcode::ERROR, tr_state);
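Per the CQL binary protocol, a Function_failure ERROR body carries [int code][string message][string keyspace][string function][string list arg_types], which is exactly the sequence of write_* calls in make_function_failure_error above. A sketch of the wire encoding in Python (big-endian; [string] is a 2-byte length plus UTF-8 bytes; 0x1400 is the protocol's Function_failure code):

```python
import struct

def write_string(s):
    """[string]: unsigned 2-byte big-endian length, then UTF-8 bytes."""
    data = s.encode("utf-8")
    return struct.pack(">H", len(data)) + data

def make_function_failure_body(code, msg, ks, func, args):
    """Sketch of the ERROR body built above: the keyspace, function name
    and argument types are mandatory parameters for this error code."""
    body = struct.pack(">i", code)        # [int] error code
    body += write_string(msg)             # [string] message
    body += write_string(ks)              # [string] keyspace
    body += write_string(func)            # [string] function
    body += struct.pack(">H", len(args))  # [string list]: count, then strings
    for a in args:
        body += write_string(a)
    return body

body = make_function_failure_body(0x1400, "m", "ks", "f", ["int"])
# 4 (code) + 3 ("m") + 4 ("ks") + 3 ("f") + 2 (count) + 5 ("int") = 21 bytes
```

This is why the catch-all `cassandra_exception` branch is wrong for this error type: a plain `make_error` response would omit the mandatory keyspace/function/argument fields.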
