sstables: Fix sstable reshaping for STCS

The heuristic of STCS reshape is correct, and it built the compaction descriptor correctly, but forgot to return it to the caller, so no reshape was ever done on behalf of STCS even when the strategy needed it. Fixes #7774. Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com> Message-Id: <20201209175044.1609102-1-raphaelsc@scylladb.com> (cherry picked from commit e4b55f40f3)
scylla_ntp_setup: support 'pool' directive on ntp.conf
2021-11-15 13:28:52 +02:00 · 2021-10-10 19:42:14 +03:00 · 2021-10-05 16:20:30 +03:00 · 2021-10-03 14:09:37 +03:00 · 2021-10-03 13:11:30 +03:00 · 2021-09-23 15:18:22 +03:00
82 changed files with 1734 additions and 333 deletions
--- a/2
+++ b/2
@@ -1,7 +1,7 @@
 #!/bin/sh

 PRODUCT=scylla
-VERSION=4.3.2
+VERSION=4.3.7

 if test -f version
 then
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -123,7 +123,7 @@ struct rjson_engaged_ptr_comp {
 // as internally they're stored in an array, and the order of elements is
 // not important in set equality. See issue #5021
 static bool check_EQ_for_sets(const rjson::value& set1, const rjson::value& set2) {
-    if (set1.Size() != set2.Size()) {
+    if (!set1.IsArray() || !set2.IsArray() || set1.Size() != set2.Size()) {
        return false;
    }
    std::set<const rjson::value*, rjson_engaged_ptr_comp> set1_raw;
@@ -137,25 +137,70 @@ static bool check_EQ_for_sets(const rjson::value& set1, const rjson::value& set2
    }
    return true;
 }
+// Moreover, the JSON being compared can be a nested document with outer
+// layers of lists and maps and some inner set - and we need to get to that
+// inner set to compare it correctly with check_EQ_for_sets() (issue #8514).
+static bool check_EQ(const rjson::value* v1, const rjson::value& v2);
+static bool check_EQ_for_lists(const rjson::value& list1, const rjson::value& list2) {
+    if (!list1.IsArray() || !list2.IsArray() || list1.Size() != list2.Size()) {
+        return false;
+    }
+    auto it1 = list1.Begin();
+    auto it2 = list2.Begin();
+    while (it1 != list1.End()) {
+        // Note: Alternator limits an item's depth (rjson::parse() limits
+        // it to around 37 levels), so this recursion is safe.
+        if (!check_EQ(&*it1, *it2)) {
+            return false;
+        }
+        ++it1;
+        ++it2;
+    }
+    return true;
+}
+static bool check_EQ_for_maps(const rjson::value& list1, const rjson::value& list2) {
+    if (!list1.IsObject() || !list2.IsObject() || list1.MemberCount() != list2.MemberCount()) {
+        return false;
+    }
+    for (auto it1 = list1.MemberBegin(); it1 != list1.MemberEnd(); ++it1) {
+        auto it2 = list2.FindMember(it1->name);
+        if (it2 == list2.MemberEnd() || !check_EQ(&it1->value, it2->value)) {
+            return false;
+        }
+    }
+    return true;
+}

 // Check if two JSON-encoded values match with the EQ relation
 static bool check_EQ(const rjson::value* v1, const rjson::value& v2) {
-    if (!v1) {
-        return false;
-    }
-    if (v1->IsObject() && v1->MemberCount() == 1 && v2.IsObject() && v2.MemberCount() == 1) {
+    if (v1 && v1->IsObject() && v1->MemberCount() == 1 && v2.IsObject() && v2.MemberCount() == 1) {
        auto it1 = v1->MemberBegin();
        auto it2 = v2.MemberBegin();
-        if ((it1->name == "SS" && it2->name == "SS") || (it1->name == "NS" && it2->name == "NS") || (it1->name == "BS" && it2->name == "BS")) {
-            return check_EQ_for_sets(it1->value, it2->value);
+        if (it1->name != it2->name) {
+            return false;
        }
+        if (it1->name == "SS" || it1->name == "NS" || it1->name == "BS") {
+            return check_EQ_for_sets(it1->value, it2->value);
+        } else if(it1->name == "L") {
+            return check_EQ_for_lists(it1->value, it2->value);
+        } else if(it1->name == "M") {
+            return check_EQ_for_maps(it1->value, it2->value);
+        } else {
+            // Other, non-nested types (number, string, etc.) can be compared
+            // literally, comparing their JSON representation.
+            return it1->value == it2->value;
+        }
+    } else {
+        // If v1 and/or v2 are missing (IsNull()) the result should be false.
+        // In the unlikely case that the object is malformed (issue #8070),
+        // let's also return false.
+        return false;
    }
-    return *v1 == v2;
 }

 // Check if two JSON-encoded values match with the NE relation
 static bool check_NE(const rjson::value* v1, const rjson::value& v2) {
-    return !v1 || *v1 != v2; // null is unequal to anything.
+    return !check_EQ(v1, v2);
 }

 // Check if two JSON-encoded values match with the BEGINS_WITH relation
@@ -298,6 +343,8 @@ static bool check_NOT_NULL(const rjson::value* val) {

 // Only types S, N or B (string, number or bytes) may be compared by the
 // various comparion operators - lt, le, gt, ge, and between.
+// Note that in particular, if the value is missing (v->IsNull()), this
+// check returns false.
 static bool check_comparable_type(const rjson::value& v) {
    if (!v.IsObject() || v.MemberCount() != 1) {
        return false;
--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -331,15 +331,15 @@ void set_column_family(http_context& ctx, routes& r) {
    });

    cf::get_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, req->param["name"], 0, [](column_family& cf) {
+        return map_reduce_cf(ctx, req->param["name"], uint64_t{0}, [](column_family& cf) {
            return cf.active_memtable().partition_count();
-        }, std::plus<int>());
+        }, std::plus<>());
    });

    cf::get_all_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, 0, [](column_family& cf) {
+        return map_reduce_cf(ctx, uint64_t{0}, [](column_family& cf) {
            return cf.active_memtable().partition_count();
-        }, std::plus<int>());
+        }, std::plus<>());
    });

    cf::get_memtable_on_heap_size.set(r, [] (const_req req) {
--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -980,9 +980,9 @@ static bytes get_bytes(const atomic_cell_view& acv) {
    return acv.value().linearize();
 }

-static bytes_view get_bytes_view(const atomic_cell_view& acv, std::vector<bytes>& buf) {
+static bytes_view get_bytes_view(const atomic_cell_view& acv, std::forward_list<bytes>& buf) {
    return acv.value().is_fragmented()
-        ? bytes_view{buf.emplace_back(acv.value().linearize())}
+        ? bytes_view{buf.emplace_front(acv.value().linearize())}
        : acv.value().first_fragment();
 }

@@ -1137,9 +1137,9 @@ struct process_row_visitor {

                struct udt_visitor : public collection_visitor {
                    std::vector<bytes_opt> _added_cells;
-                    std::vector<bytes>& _buf;
+                    std::forward_list<bytes>& _buf;

-                    udt_visitor(ttl_opt& ttl_column, size_t num_keys, std::vector<bytes>& buf)
+                    udt_visitor(ttl_opt& ttl_column, size_t num_keys, std::forward_list<bytes>& buf)
                        : collection_visitor(ttl_column), _added_cells(num_keys), _buf(buf) {}

                    void live_collection_cell(bytes_view key, const atomic_cell_view& cell) {
@@ -1148,7 +1148,7 @@ struct process_row_visitor {
                    }
                };

-                std::vector<bytes> buf;
+                std::forward_list<bytes> buf;
                udt_visitor v(_ttl_column, type.size(), buf);

                visit_collection(v);
@@ -1167,9 +1167,9 @@ struct process_row_visitor {

                struct map_or_list_visitor : public collection_visitor {
                    std::vector<std::pair<bytes_view, bytes_view>> _added_cells;
-                    std::vector<bytes>& _buf;
+                    std::forward_list<bytes>& _buf;

-                    map_or_list_visitor(ttl_opt& ttl_column, std::vector<bytes>& buf)
+                    map_or_list_visitor(ttl_opt& ttl_column, std::forward_list<bytes>& buf)
                        : collection_visitor(ttl_column), _buf(buf) {}

                    void live_collection_cell(bytes_view key, const atomic_cell_view& cell) {
@@ -1178,7 +1178,7 @@ struct process_row_visitor {
                    }
                };

-                std::vector<bytes> buf;
+                std::forward_list<bytes> buf;
                map_or_list_visitor v(_ttl_column, buf);

                visit_collection(v);
--- a/configure.py
+++ b/configure.py
@@ -855,6 +855,7 @@ scylla_core = (['database.cc',
                'utils/error_injection.cc',
                'mutation_writer/timestamp_based_splitting_writer.cc',
                'mutation_writer/shard_based_splitting_writer.cc',
+                'mutation_writer/feed_writers.cc',
                'lua.cc',
                ] + [Antlr3Grammar('cql3/Cql.g')] + [Thrift('interface/cassandra.thrift', 'Cassandra')]
               )
--- a/cql3/constants.hh
+++ b/cql3/constants.hh
@@ -192,9 +192,12 @@ public:

        virtual ::shared_ptr<terminal> bind(const query_options& options) override {
            auto bytes = bind_and_get(options);
-            if (!bytes) {
+            if (bytes.is_null()) {
                return ::shared_ptr<terminal>{};
            }
+            if (bytes.is_unset_value()) {
+                return UNSET_VALUE;
+            }
            return ::make_shared<constants::value>(std::move(cql3::raw_value::make_value(to_bytes(*bytes))));
        }
    };
--- a/cql3/expr/expression.cc
+++ b/cql3/expr/expression.cc
@@ -27,7 +27,9 @@
 #include <fmt/ostream.h>
 #include <unordered_map>

+#include "cql3/constants.hh"
 #include "cql3/lists.hh"
+#include "cql3/statements/request_validations.hh"
 #include "cql3/tuples.hh"
 #include "index/secondary_index_manager.hh"
 #include "types/list.hh"
@@ -417,6 +419,8 @@ bool is_one_of(const column_value& col, term& rhs, const column_value_eval_bag&
    } else if (auto mkr = dynamic_cast<lists::marker*>(&rhs)) {
        // This is `a IN ?`.  RHS elements are values representable as bytes_opt.
        const auto values = static_pointer_cast<lists::value>(mkr->bind(bag.options));
+        statements::request_validations::check_not_null(
+                values, "Invalid null value for column %s", col.col->name_as_text());
        return boost::algorithm::any_of(values->get_elements(), [&] (const bytes_opt& b) {
                return equal(b, col, bag);
            });
@@ -568,7 +572,8 @@ const auto deref = boost::adaptors::transformed([] (const bytes_opt& b) { return

 /// Returns possible values from t, which must be RHS of IN.
 value_list get_IN_values(
-        const ::shared_ptr<term>& t, const query_options& options, const serialized_compare& comparator) {
+        const ::shared_ptr<term>& t, const query_options& options, const serialized_compare& comparator,
+        sstring_view column_name) {
    // RHS is prepared differently for different CQL cases.  Cast it dynamically to discern which case this is.
    if (auto dv = dynamic_pointer_cast<lists::delayed_value>(t)) {
        // Case `a IN (1,2,3)`.
@@ -578,8 +583,12 @@ value_list get_IN_values(
        return to_sorted_vector(std::move(result_range), comparator);
    } else if (auto mkr = dynamic_pointer_cast<lists::marker>(t)) {
        // Case `a IN ?`.  Collect all list-element values.
-        const auto val = static_pointer_cast<lists::value>(mkr->bind(options));
-        return to_sorted_vector(val->get_elements() | non_null | deref, comparator);
+        const auto val = mkr->bind(options);
+        if (val == constants::UNSET_VALUE) {
+            throw exceptions::invalid_request_exception(format("Invalid unset value for column {}", column_name));
+        }
+        statements::request_validations::check_not_null(val, "Invalid null value for IN tuple");
+        return to_sorted_vector(static_pointer_cast<lists::value>(val)->get_elements() | non_null | deref, comparator);
    }
    throw std::logic_error(format("get_IN_values(single column) on invalid term {}", *t));
 }
@@ -686,7 +695,7 @@ value_set possible_lhs_values(const column_definition* cdef, const expression& e
                                return oper.op == oper_t::EQ ? value_set(value_list{*val})
                                        : to_range(oper.op, *val);
                            } else if (oper.op == oper_t::IN) {
-                                return get_IN_values(oper.rhs, options, type->as_less_comparator());
+                                return get_IN_values(oper.rhs, options, type->as_less_comparator(), cdef->name_as_text());
                            }
                            throw std::logic_error(format("possible_lhs_values: unhandled operator {}", oper));
                        },
--- a/cql3/maps.cc
+++ b/cql3/maps.cc
@@ -305,6 +305,12 @@ maps::setter_by_key::execute(mutation& m, const clustering_key_prefix& prefix, c
    assert(column.type->is_multi_cell()); // "Attempted to set a value for a single key on a frozen map"m
    auto key = _k->bind_and_get(params._options);
    auto value = _t->bind_and_get(params._options);
+    if (value.is_unset_value()) {
+        return;
+    }
+    if (key.is_unset_value() || value.is_unset_value()) {
+        throw invalid_request_exception("Invalid unset map key");
+    }
    if (!key) {
        throw invalid_request_exception("Invalid null map key");
    }
--- a/cql3/sets.cc
+++ b/cql3/sets.cc
@@ -315,7 +315,7 @@ sets::discarder::execute(mutation& m, const clustering_key_prefix& row_key, cons
    assert(column.type->is_multi_cell()); // "Attempted to remove items from a frozen set";

    auto&& value = _t->bind(params._options);
-    if (!value) {
+    if (!value || value == constants::UNSET_VALUE) {
        return;
    }

--- a/cql3/statements/create_index_statement.cc
+++ b/cql3/statements/create_index_statement.cc
@@ -306,6 +306,13 @@ create_index_statement::announce_migration(service::storage_proxy& proxy, bool i
                    format("Index {} is a duplicate of existing index {}", index.name(), existing_index.value().name()));
        }
    }
+    auto index_table_name = secondary_index::index_table_name(accepted_name);
+    if (db.has_schema(keyspace(), index_table_name)) {
+        return make_exception_future<::shared_ptr<cql_transport::event::schema_change>>(
+            exceptions::invalid_request_exception(format("Index {} cannot be created, because table {} already exists",
+                accepted_name, index_table_name))
+        );
+    }
    ++_cql_stats->secondary_index_creates;
    schema_builder builder{schema};
    builder.with_index(index);
--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -456,7 +456,7 @@ generate_base_key_from_index_pk(const partition_key& index_pk, const std::option
        if (!view_col) {
            throw std::runtime_error(format("Base key column not found in the view: {}", base_col.name_as_text()));
        }
-        if (base_col.type != view_col->type) {
+        if (base_col.type->without_reversed() != *view_col->type) {
            throw std::runtime_error(format("Mismatched types for base and view columns {}: {} and {}",
                    base_col.name_as_text(), base_col.type->cql3_type_name(), view_col->type->cql3_type_name()));
        }
@@ -1120,7 +1120,11 @@ query::partition_slice indexed_table_select_statement::get_partition_slice_for_g
                if (single_ck_restrictions) {
                    auto prefix_restrictions = single_ck_restrictions->get_longest_prefix_restrictions();
                    auto clustering_restrictions_from_base = ::make_shared<restrictions::single_column_clustering_key_restrictions>(_view_schema, *prefix_restrictions);
+                    const auto indexed_column = _view_schema->get_column_definition(to_bytes(_index.target_column()));
                    for (auto restriction_it : clustering_restrictions_from_base->restrictions()) {
+                        if (restriction_it.first == indexed_column) {
+                            continue; // In the index table, the indexed column is the partition (not clustering) key.
+                        }
                        clustering_restrictions->merge_with(restriction_it.second);
                    }
                }
--- a/database.cc
+++ b/database.cc
@@ -572,9 +572,6 @@ void database::set_format_by_config() {
 }

 database::~database() {
-    _read_concurrency_sem.clear_inactive_reads();
-    _streaming_concurrency_sem.clear_inactive_reads();
-    _system_read_concurrency_sem.clear_inactive_reads();
 }

 void database::update_version(const utils::UUID& version) {
@@ -662,11 +659,22 @@ future<> database::parse_system_tables(distributed<service::storage_proxy>& prox
            });
    }).then([&proxy, &mm, this] {
        return do_parse_schema_tables(proxy, db::schema_tables::VIEWS, [this, &proxy, &mm] (schema_result_value_type &v) {
-            return create_views_from_schema_partition(proxy, v.second).then([this, &mm] (std::vector<view_ptr> views) {
-                return parallel_for_each(views.begin(), views.end(), [this, &mm] (auto&& v) {
-                    return this->add_column_family_and_make_directory(v).then([this, &mm, v] {
-                        return maybe_update_legacy_secondary_index_mv_schema(mm.local(), *this, v);
-                    });
+            return create_views_from_schema_partition(proxy, v.second).then([this, &mm, &proxy] (std::vector<view_ptr> views) {
+                return parallel_for_each(views.begin(), views.end(), [this, &mm, &proxy] (auto&& v) {
+                    // TODO: Remove once computed columns are guaranteed to be featured in the whole cluster.
+                    // we fix here the schema in place in order to avoid races (write commands coming from other coordinators).
+                    view_ptr fixed_v = maybe_fix_legacy_secondary_index_mv_schema(*this, v, nullptr, preserve_version::yes);
+                    view_ptr v_to_add = fixed_v ? fixed_v : v;
+                    future<> f = this->add_column_family_and_make_directory(v_to_add);
+                    if (bool(fixed_v)) {
+                        v_to_add = fixed_v;
+                        auto&& keyspace = find_keyspace(v->ks_name()).metadata();
+                        auto mutations = db::schema_tables::make_update_view_mutations(keyspace, view_ptr(v), fixed_v, api::new_timestamp(), true);
+                        f = f.then([this, &proxy, mutations = std::move(mutations)] {
+                            return db::schema_tables::merge_schema(proxy, _feat, std::move(mutations));
+                        });
+                    }
+                    return f;
                });
            });
        });
@@ -1743,7 +1751,11 @@ sstring database::get_available_index_name(const sstring &ks_name, const sstring
    auto base_name = index_metadata::get_default_index_name(cf_name, index_name_root);
    sstring accepted_name = base_name;
    int i = 0;
-    while (existing_names.contains(accepted_name)) {
+    auto name_accepted = [&] {
+        auto index_table_name = secondary_index::index_table_name(accepted_name);
+        return !has_schema(ks_name, index_table_name) && !existing_names.contains(accepted_name);
+    };
+    while (!name_accepted()) {
        accepted_name = base_name + "_" + std::to_string(++i);
    }
    return accepted_name;
@@ -1808,6 +1820,13 @@ future<>
 database::stop() {
    assert(!_large_data_handler->running());

+    // Inactive reads might hold on to sstables, blocking the
+    // `sstables_manager::close()` calls below. No one will come back for these
+    // reads at this point so clear them before proceeding with the shutdown.
+    _read_concurrency_sem.clear_inactive_reads();
+    _streaming_concurrency_sem.clear_inactive_reads();
+    _system_read_concurrency_sem.clear_inactive_reads();
+
    // try to ensure that CL has done disk flushing
    future<> maybe_shutdown_commitlog = _commitlog != nullptr ? _commitlog->shutdown() : make_ready_future<>();
    return maybe_shutdown_commitlog.then([this] {
@@ -1859,26 +1878,28 @@ future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_fun

        return cf.run_with_compaction_disabled([this, &cf, should_flush, auto_snapshot, tsf = std::move(tsf), low_mark]() mutable {
            future<> f = make_ready_future<>();
-            if (should_flush) {
+            bool did_flush = false;
+            if (should_flush && cf.can_flush()) {
                // TODO:
                // this is not really a guarantee at all that we've actually
                // gotten all things to disk. Again, need queue-ish or something.
                f = cf.flush();
+                did_flush = true;
            } else {
                f = cf.clear();
            }
-            return f.then([this, &cf, auto_snapshot, tsf = std::move(tsf), low_mark, should_flush] {
+            return f.then([this, &cf, auto_snapshot, tsf = std::move(tsf), low_mark, should_flush, did_flush] {
                dblog.debug("Discarding sstable data for truncated CF + indexes");
                // TODO: notify truncation

-                return tsf().then([this, &cf, auto_snapshot, low_mark, should_flush](db_clock::time_point truncated_at) {
+                return tsf().then([this, &cf, auto_snapshot, low_mark, should_flush, did_flush](db_clock::time_point truncated_at) {
                    future<> f = make_ready_future<>();
                    if (auto_snapshot) {
                        auto name = format("{:d}-{}", truncated_at.time_since_epoch().count(), cf.schema()->cf_name());
                        f = cf.snapshot(*this, name);
                    }
-                    return f.then([this, &cf, truncated_at, low_mark, should_flush] {
-                        return cf.discard_sstables(truncated_at).then([this, &cf, truncated_at, low_mark, should_flush](db::replay_position rp) {
+                    return f.then([this, &cf, truncated_at, low_mark, should_flush, did_flush] {
+                        return cf.discard_sstables(truncated_at).then([this, &cf, truncated_at, low_mark, should_flush, did_flush](db::replay_position rp) {
                            // TODO: indexes.
                            // Note: since discard_sstables was changed to only count tables owned by this shard,
                            // we can get zero rp back. Changed assert, and ensure we save at least low_mark.
@@ -1886,7 +1907,7 @@ future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_fun
                            // We nowadays do not flush tables with sstables but autosnapshot=false. This means
                            // the low_mark assertion does not hold, because we maybe/probably never got around to 
                            // creating the sstables that would create them.
-                            assert(!should_flush || low_mark <= rp || rp == db::replay_position());
+                            assert(!did_flush || low_mark <= rp || rp == db::replay_position());
                            rp = std::max(low_mark, rp);
                            return truncate_views(cf, truncated_at, should_flush).then([&cf, truncated_at, rp] {
                                // save_truncation_record() may actually fail after we cached the truncation time
--- a/database.hh
+++ b/database.hh
@@ -224,6 +224,10 @@ public:
        return bool(_seal_immediate_fn);
    }

+    bool can_flush() const {
+        return may_flush() && !empty();
+    }
+
    bool empty() const {
        for (auto& m : _memtables) {
           if (!m->empty()) {
@@ -782,6 +786,8 @@ public:
    // to them, and then pass that + 1 as "start".
    future<std::vector<sstables::entry_descriptor>> reshuffle_sstables(std::set<int64_t> all_generations, int64_t start);

+    bool can_flush() const;
+
    // FIXME: this is just an example, should be changed to something more
    // general. compact_all_sstables() starts a compaction of all sstables.
    // It doesn't flush the current memtable first. It's just a ad-hoc method,
--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -1208,7 +1208,42 @@ static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
        return create_table_from_mutations(proxy, std::move(sm));
    });
    auto views_diff = diff_table_or_view(proxy, std::move(views_before), std::move(views_after), [&] (schema_mutations sm) {
-        return create_view_from_mutations(proxy, std::move(sm));
+        // The view schema mutation should be created with reference to the base table schema because we definitely know it by now.
+        // If we don't do it we are leaving a window where write commands to this schema are illegal.
+        // There are 3 possibilities:
+        // 1. The table was altered - in this case we want the view to correspond to this new table schema.
+        // 2. The table was just created - the table is guaranteed to be published with the view in that case.
+        // 3. The view itself was altered - in that case we already know the base table so we can take it from
+        //    the database object.
+        view_ptr vp = create_view_from_mutations(proxy, std::move(sm));
+        schema_ptr base_schema;
+        for (auto&& s : tables_diff.altered) {
+            if (s.new_schema.get()->ks_name() == vp->ks_name() && s.new_schema.get()->cf_name() == vp->view_info()->base_name() ) {
+                base_schema = s.new_schema;
+                break;
+            }
+        }
+        if (!base_schema) {
+            for (auto&& s : tables_diff.created) {
+                if (s.get()->ks_name() == vp->ks_name() && s.get()->cf_name() == vp->view_info()->base_name() ) {
+                    base_schema = s;
+                    break;
+                }
+            }
+        }
+
+        if (!base_schema) {
+            base_schema = proxy.local().local_db().find_schema(vp->ks_name(), vp->view_info()->base_name());
+        }
+
+        // Now that we have a referenced base - just in case we are registering an old view (this can happen in a mixed cluster)
+        // let's make it write-enabled by updating its computed columns.
+        view_ptr fixed_vp = maybe_fix_legacy_secondary_index_mv_schema(proxy.local().get_db().local(), vp, base_schema, preserve_version::yes);
+        if(fixed_vp) {
+            vp = fixed_vp;
+        }
+        vp->view_info()->set_base_info(vp->view_info()->make_base_dependent_view_info(*base_schema));
+        return vp;
    });

    proxy.local().get_db().invoke_on_all([&] (database& db) {
@@ -3033,8 +3068,7 @@ std::vector<sstring> all_table_names(schema_features features) {
           boost::adaptors::transformed([] (auto schema) { return schema->cf_name(); }));
 }

-future<> maybe_update_legacy_secondary_index_mv_schema(service::migration_manager& mm, database& db, view_ptr v) {
-    // TODO(sarna): Remove once computed columns are guaranteed to be featured in the whole cluster.
+view_ptr maybe_fix_legacy_secondary_index_mv_schema(database& db, const view_ptr& v, schema_ptr base_schema, preserve_version preserve_version) {
    // Legacy format for a secondary index used a hardcoded "token" column, which ensured a proper
    // order for indexed queries. This "token" column is now implemented as a computed column,
    // but for the sake of compatibility we assume that there might be indexes created in the legacy
@@ -3042,26 +3076,32 @@ future<> maybe_update_legacy_secondary_index_mv_schema(service::migration_manage
    // columns marked as computed (because they were either created on a node that supports computed
    // columns or were fixed by this utility function), it's safe to remove this function altogether.
    if (v->clustering_key_size() == 0) {
-        return make_ready_future<>();
+        return view_ptr(nullptr);
    }
    const column_definition& first_view_ck = v->clustering_key_columns().front();
    if (first_view_ck.is_computed()) {
-        return make_ready_future<>();
+        return view_ptr(nullptr);
+    }
+
+    if (!base_schema) {
+        base_schema = db.find_schema(v->view_info()->base_id());
    }

-    table& base = db.find_column_family(v->view_info()->base_id());
-    schema_ptr base_schema = base.schema();
    // If the first clustering key part of a view is a column with name not found in base schema,
    // it implies it might be backing an index created before computed columns were introduced,
    // and as such it must be recreated properly.
    if (!base_schema->columns_by_name().contains(first_view_ck.name())) {
        schema_builder builder{schema_ptr(v)};
        builder.mark_column_computed(first_view_ck.name(), std::make_unique<token_column_computation>());
-        return mm.announce_view_update(view_ptr(builder.build()), true);
+        if (preserve_version) {
+            builder.with_version(v->version());
+        }
+        return view_ptr(builder.build());
    }
-    return make_ready_future<>();
+    return view_ptr(nullptr);
 }

+
 namespace legacy {

 table_schema_version schema_mutations::digest() const {
--- a/db/schema_tables.hh
+++ b/db/schema_tables.hh
@@ -238,7 +238,9 @@ std::vector<mutation> make_update_view_mutations(lw_shared_ptr<keyspace_metadata

 std::vector<mutation> make_drop_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp);

-future<> maybe_update_legacy_secondary_index_mv_schema(service::migration_manager& mm, database& db, view_ptr v);
+class preserve_version_tag {};
+using preserve_version = bool_class<preserve_version_tag>;
+view_ptr maybe_fix_legacy_secondary_index_mv_schema(database& db, const view_ptr& v, schema_ptr base_schema, preserve_version preserve_version);

 sstring serialize_kind(column_kind kind);
 column_kind deserialize_kind(sstring kind);
--- a/db/snapshot-ctl.cc
+++ b/db/snapshot-ctl.cc
@@ -43,9 +43,13 @@

 namespace db {

-future<> snapshot_ctl::check_snapshot_not_exist(sstring ks_name, sstring name) {
+future<> snapshot_ctl::check_snapshot_not_exist(sstring ks_name, sstring name, std::optional<std::vector<sstring>> filter) {
    auto& ks = _db.local().find_keyspace(ks_name);
-    return parallel_for_each(ks.metadata()->cf_meta_data(), [this, ks_name = std::move(ks_name), name = std::move(name)] (auto& pair) {
+    return parallel_for_each(ks.metadata()->cf_meta_data(), [this, ks_name = std::move(ks_name), name = std::move(name), filter = std::move(filter)] (auto& pair) {
+        auto& cf_name = pair.first;
+        if (filter && std::find(filter->begin(), filter->end(), cf_name) == filter->end()) {
+            return make_ready_future<>();
+        }        
        auto& cf = _db.local().find_column_family(pair.second);
        return cf.snapshot_exists(name).then([ks_name = std::move(ks_name), name] (bool exists) {
            if (exists) {
@@ -111,7 +115,7 @@ future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<
    }

    return run_snapshot_modify_operation([this, ks_name = std::move(ks_name), tables = std::move(tables), tag = std::move(tag)] {
-        return check_snapshot_not_exist(ks_name, tag).then([this, ks_name, tables = std::move(tables), tag] {
+        return check_snapshot_not_exist(ks_name, tag, tables).then([this, ks_name, tables, tag] {
            return do_with(std::vector<sstring>(std::move(tables)),[this, ks_name, tag](const std::vector<sstring>& tables) {
                return do_for_each(tables, [ks_name, tag, this] (const sstring& table_name) {
                    if (table_name.find(".") != sstring::npos) {
--- a/db/snapshot-ctl.hh
+++ b/db/snapshot-ctl.hh
@@ -40,6 +40,8 @@

 #pragma once

+#include <vector>
+
 #include <seastar/core/sharded.hh>
 #include <seastar/core/future.hh>
 #include "database.hh"
@@ -112,7 +114,7 @@ private:
    seastar::rwlock _lock;
    seastar::gate _ops;

-    future<> check_snapshot_not_exist(sstring ks_name, sstring name);
+    future<> check_snapshot_not_exist(sstring ks_name, sstring name, std::optional<std::vector<sstring>> filter = {});

    template <typename Func>
    std::result_of_t<Func()> run_snapshot_modify_operation(Func&&);
--- a/digester.hh
+++ b/digester.hh
@@ -58,7 +58,8 @@ public:

    template<typename T, typename... Args>
    void feed_hash(const T& value, Args&&... args) {
-        std::visit([&] (auto& hasher) noexcept -> void {
+        // FIXME uncomment the noexcept marking once clang bug 50994 is fixed or gcc compilation is turned on
+        std::visit([&] (auto& hasher) /* noexcept(noexcept(::feed_hash(hasher, value, args...))) */ -> void {
            ::feed_hash(hasher, value, std::forward<Args>(args)...);
        }, _impl);
    };
--- a/dist/common/scripts/node_exporter_install
+++ b/dist/common/scripts/node_exporter_install
@@ -24,6 +24,8 @@ import os
 import sys
 import tempfile
 import tarfile
+import shutil
+import glob
 from scylla_util import *
 import argparse

@@ -61,6 +63,9 @@ if __name__ == '__main__':
            f.write(data)
        with tarfile.open('/var/tmp/node_exporter-{version}.linux-amd64.tar.gz'.format(version=VERSION)) as tf:
            tf.extractall(INSTALL_DIR)
+        shutil.chown(f'{INSTALL_DIR}/node_exporter-{VERSION}.linux-amd64', 'root', 'root')
+        for f in glob.glob(f'{INSTALL_DIR}/node_exporter-{VERSION}.linux-amd64/*'):
+            shutil.chown(f, 'root', 'root')
        os.remove('/var/tmp/node_exporter-{version}.linux-amd64.tar.gz'.format(version=VERSION))
        if node_exporter_p.exists():
            node_exporter_p.unlink()
--- a/dist/common/scripts/scylla_coredump_setup
+++ b/dist/common/scripts/scylla_coredump_setup
@@ -87,7 +87,8 @@ WantedBy=multi-user.target
            run('sysctl -p /etc/sysctl.d/99-scylla-coredump.conf')

        fp = tempfile.NamedTemporaryFile()
-        fp.write(b'kill -SEGV $$')
+        fp.write(b'ulimit -c unlimited\n')
+        fp.write(b'kill -SEGV $$\n')
        fp.flush()
        p = subprocess.Popen(['/bin/bash', fp.name], stdout=subprocess.PIPE)
        pid = p.pid
--- a/dist/common/scripts/scylla_cpuscaling_setup
+++ b/dist/common/scripts/scylla_cpuscaling_setup
@@ -22,6 +22,7 @@

 import os
 import sys
+import argparse
 import shlex
 import distro
 from scylla_util import *
@@ -33,12 +34,22 @@ if __name__ == '__main__':
    if os.getuid() > 0:
        print('Requires root permission.')
        sys.exit(1)
-    if not os.path.exists('/sys/devices/system/cpu/cpufreq/policy0/scaling_governor'):
+    parser = argparse.ArgumentParser(description='CPU scaling setup script for Scylla.')
+    parser.add_argument('--force', dest='force', action='store_true',
+                        help='force running setup even CPU scaling unsupported')
+    args = parser.parse_args()
+
+    if not args.force and not os.path.exists('/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor'):
        print('This computer doesn\'t supported CPU scaling configuration.')
        sys.exit(0)
    if is_debian_variant():
        if not shutil.which('cpufreq-set'):
            apt_install('cpufrequtils')
+        try:
+            ondemand = systemd_unit('ondemand')
+            ondemand.disable()
+        except:
+            pass
        cfg = sysconfig_parser('/etc/default/cpufrequtils')
        cfg.set('GOVERNOR', 'performance')
        cfg.commit()
--- a/dist/common/scripts/scylla_ntp_setup
+++ b/dist/common/scripts/scylla_ntp_setup
@@ -91,12 +91,12 @@ if __name__ == '__main__':
            with open('/etc/ntp.conf') as f:
                conf = f.read()
            if args.subdomain:
-                conf2 = re.sub(r'server\s+([0-9]+)\.(\S+)\.pool\.ntp\.org', 'server \\1.{}.pool.ntp.org'.format(args.subdomain), conf, flags=re.MULTILINE)
+                conf2 = re.sub(r'(server|pool)\s+([0-9]+)\.(\S+)\.pool\.ntp\.org', '\\1 \\2.{}.pool.ntp.org'.format(args.subdomain), conf, flags=re.MULTILINE)
                with open('/etc/ntp.conf', 'w') as f:
                    f.write(conf2)
                conf = conf2
-            match = re.search(r'^server\s+(\S*)(\s+\S+)?', conf, flags=re.MULTILINE)
-            server = match.group(1)
+            match = re.search(r'^(server|pool)\s+(\S*)(\s+\S+)?', conf, flags=re.MULTILINE)
+            server = match.group(2)
            ntpd = systemd_unit('ntpd.service')
            ntpd.stop()
            # ignore error, ntpd may able to adjust clock later
--- a/dist/common/scripts/scylla_prepare
+++ b/dist/common/scripts/scylla_prepare
@@ -143,4 +143,3 @@ if __name__ == '__main__':
            print(f'Exception occurred while creating perftune.yaml: {e}')
            print('To fix the error, please re-run scylla_setup.')
            sys.exit(1)
-
--- a/dist/common/scripts/scylla_raid_setup
+++ b/dist/common/scripts/scylla_raid_setup
@@ -160,7 +160,7 @@ Before=scylla-server.service
 After={after}

 [Mount]
-What=UUID={uuid}
+What=/dev/disk/by-uuid/{uuid}
 Where={mount_at}
 Type=xfs
 Options=noatime
--- a/dist/common/scripts/scylla_util.py
+++ b/dist/common/scripts/scylla_util.py
@@ -34,6 +34,7 @@ from pathlib import Path

 import distro

+from multiprocessing import cpu_count

 def scriptsdir_p():
    p = Path(sys.argv[0]).resolve()
@@ -308,9 +309,10 @@ class gcp_instance:
                    logging.warning(
                        "This machine doesn't have enough CPUs for allocated number of NVMEs (at least 32 cpus for >=16 disks). Performance will suffer.")
                    return False
-                diskSize = self.firstNvmeSize
                if diskCount < 1:
+                    logging.warning("No ephemeral disks were found.")
                    return False
+                diskSize = self.firstNvmeSize
                max_disktoramratio = 105
                # 30:1 Disk/RAM ratio must be kept at least(AWS), we relax this a little bit
                # on GCP we are OK with {max_disktoramratio}:1 , n1-standard-2 can cope with 1 disk, not more
--- a/dist/common/sysctl.d/99-scylla-aio.conf
+++ b/dist/common/sysctl.d/99-scylla-aio.conf
@@ -1,2 +1,2 @@
 # Raise max AIO events
-fs.aio-max-nr = 1048576
+fs.aio-max-nr = 5578536
--- a/dist/common/systemd/scylla-fstrim.timer
+++ b/dist/common/systemd/scylla-fstrim.timer
@@ -1,7 +1,5 @@
 [Unit]
 Description=Run Scylla fstrim daily
-After=scylla-server.service
-BindsTo=scylla-server.service

 [Timer]
 OnCalendar=Sat *-*-* 00:00:00
--- a/dist/debian/debian/scylla-kernel-conf.postinst
+++ b/dist/debian/debian/scylla-kernel-conf.postinst
@@ -9,9 +9,9 @@ if [[ $KVER =~ 3\.13\.0\-([0-9]+)-generic ]]; then
 else
    # expect failures in virtualized environments
    sysctl -p/usr/lib/sysctl.d/99-scylla-sched.conf || :
-    sysctl -p/usr/lib/sysctl.d/99-scylla-aio.conf || :
    sysctl -p/usr/lib/sysctl.d/99-scylla-vm.conf || :
    sysctl -p/usr/lib/sysctl.d/99-scylla-inotify.conf || :
+    sysctl -p/usr/lib/sysctl.d/99-scylla-aio.conf || :
 fi

 #DEBHELPER#
--- a/dist/debian/debian/scylla-server.postrm
+++ b/dist/debian/debian/scylla-server.postrm
@@ -12,8 +12,6 @@ case "$1" in
        if [ "$1" = "purge" ]; then
            rm -rf /etc/systemd/system/scylla-server.service.d/
        fi
-        rm -f /etc/systemd/system/var-lib-systemd-coredump.mount
-        rm -f /etc/systemd/system/var-lib-scylla.mount
        ;;
 esac

--- a/dist/redhat/scylla.spec
+++ b/dist/redhat/scylla.spec
@@ -7,7 +7,7 @@ Group:          Applications/Databases
 License:        AGPLv3
 URL:            http://www.scylladb.com/
 Source0:        %{reloc_pkg}
-Requires:       %{product}-server = %{version} %{product}-conf = %{version} %{product}-kernel-conf = %{version} %{product}-jmx = %{version} %{product}-tools = %{version} %{product}-tools-core = %{version}
+Requires:       %{product}-server = %{version} %{product}-conf = %{version} %{product}-python3 = %{version} %{product}-kernel-conf = %{version} %{product}-jmx = %{version} %{product}-tools = %{version} %{product}-tools-core = %{version}
 Obsoletes:	scylla-server < 1.1

 %global _debugsource_template %{nil}
@@ -52,7 +52,7 @@ Summary:        The Scylla database server
 License:        AGPLv3
 URL:            http://www.scylladb.com/
 Requires: kernel >= 3.10.0-514
-Requires:       %{product}-conf %{product}-python3
+Requires:       %{product}-conf  = %{version} %{product}-python3 = %{version}
 Conflicts:      abrt
 AutoReqProv:    no

@@ -76,13 +76,18 @@ getent passwd scylla || /usr/sbin/useradd -g scylla -s /sbin/nologin -r -d %{_sh
 %post server
 /opt/scylladb/scripts/scylla_post_install.sh

-%systemd_post scylla-server.service
+if [ $1 -eq 1 ] ; then
+    /usr/bin/systemctl preset scylla-server.service ||:
+fi

 %preun server
-%systemd_preun scylla-server.service
+if [ $1 -eq 0 ] ; then
+    /usr/bin/systemctl --no-reload disable scylla-server.service ||:
+    /usr/bin/systemctl stop scylla-server.service ||:
+fi

 %postun server
-%systemd_postun scylla-server.service
+/usr/bin/systemctl daemon-reload ||:

 %posttrans server
 if  [ -d /tmp/%{name}-%{version}-%{release} ]; then
@@ -132,9 +137,9 @@ rm -rf $RPM_BUILD_ROOT
 %ghost /etc/systemd/system/scylla-server.service.d/capabilities.conf
 %ghost /etc/systemd/system/scylla-server.service.d/mounts.conf
 /etc/systemd/system/scylla-server.service.d/dependencies.conf
-%ghost /etc/systemd/system/var-lib-systemd-coredump.mount
+%ghost %config /etc/systemd/system/var-lib-systemd-coredump.mount
 %ghost /etc/systemd/system/scylla-cpupower.service
-%ghost /etc/systemd/system/var-lib-scylla.mount
+%ghost %config /etc/systemd/system/var-lib-scylla.mount

 %package conf
 Group:          Applications/Databases
@@ -200,9 +205,9 @@ if Scylla is the main application on your server and you wish to optimize its la
 # We cannot use the sysctl_apply rpm macro because it is not present in 7.0
 # following is a "manual" expansion
 /usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
-/usr/lib/systemd/systemd-sysctl 99-scylla-aio.conf >/dev/null 2>&1 || :
 /usr/lib/systemd/systemd-sysctl 99-scylla-vm.conf >/dev/null 2>&1 || :
 /usr/lib/systemd/systemd-sysctl 99-scylla-inotify.conf >/dev/null 2>&1 || :
+/usr/lib/systemd/systemd-sysctl 99-scylla-aio.conf >/dev/null 2>&1 || :

 %files kernel-conf
 %defattr(-,root,root)
--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -1774,6 +1774,8 @@ future<> gossiper::do_shadow_round(std::unordered_set<gms::inet_address> nodes,
                }).handle_exception_type([node, &fall_back_to_syn_msg] (seastar::rpc::unknown_verb_error&) {
                    logger.warn("Node {} does not support get_endpoint_states verb", node);
                    fall_back_to_syn_msg = true;
+                }).handle_exception_type([node, &nodes_down] (seastar::rpc::timeout_error&) {
+                    logger.warn("The get_endpoint_states verb to node {} was timeout", node);
                }).handle_exception_type([node, &nodes_down] (seastar::rpc::closed_error&) {
                    nodes_down++;
                    logger.warn("Node {} is down for get_endpoint_states verb", node);
--- a/hashing.hh
+++ b/hashing.hh
@@ -62,7 +62,7 @@ struct appending_hash;
 template<typename H, typename T, typename... Args>
 requires Hasher<H>
 inline
-void feed_hash(H& h, const T& value, Args&&... args) noexcept {
+void feed_hash(H& h, const T& value, Args&&... args) noexcept(noexcept(std::declval<appending_hash<T>>()(h, value, args...))) {
    appending_hash<T>()(h, value, std::forward<Args>(args)...);
 };

--- a/install.sh
+++ b/install.sh
@@ -147,6 +147,10 @@ EOF
    chmod +x "$install"
 }

+install() {
+    command install -Z "$@" # "command" skips this wrapper and runs the real install; every call in the script gets -Z added
+}
+
 installconfig() {
    local perm="$1"
    local src="$2"
@@ -197,13 +201,13 @@ if [ -z "$python3" ]; then
 fi
 rpython3=$(realpath -m "$root/$python3")
 if ! $nonroot; then
-    retc="$root/etc"
-    rsysconfdir="$root/$sysconfdir"
-    rusr="$root/usr"
-    rsystemd="$rusr/lib/systemd/system"
+    retc=$(realpath -m "$root/etc")
+    rsysconfdir=$(realpath -m "$root/$sysconfdir")
+    rusr=$(realpath -m "$root/usr")
+    rsystemd=$(realpath -m "$rusr/lib/systemd/system")
    rdoc="$rprefix/share/doc"
-    rdata="$root/var/lib/scylla"
-    rhkdata="$root/var/lib/scylla-housekeeping"
+    rdata=$(realpath -m "$root/var/lib/scylla")
+    rhkdata=$(realpath -m "$root/var/lib/scylla-housekeeping")
 else
    retc="$rprefix/etc"
    rsysconfdir="$rprefix/$sysconfdir"
--- a/mutation_reader.cc
+++ b/mutation_reader.cc
@@ -1151,6 +1151,9 @@ flat_mutation_reader evictable_reader::recreate_reader() {
    _range_override.reset();
    _slice_override.reset();

+    _drop_partition_start = false;
+    _drop_static_row = false;
+
    if (_last_pkey) {
        bool partition_range_is_inclusive = true;

@@ -1236,13 +1239,25 @@ void evictable_reader::maybe_validate_partition_start(const flat_mutation_reader
    // is in range.
    if (_last_pkey) {
        const auto cmp_res = tri_cmp(*_last_pkey, ps.key());
-        if (_drop_partition_start) { // should be the same partition
+        if (_drop_partition_start) { // we expect to continue from the same partition
+            // We cannot assume the partition we stopped the read at is still alive
+            // when we recreate the reader. It might have been compacted away in the
+            // meanwhile, so allow for a larger partition too.
            require(
-                    cmp_res == 0,
-                    "{}(): validation failed, expected partition with key equal to _last_pkey {} due to _drop_partition_start being set, but got {}",
+                    cmp_res <= 0,
+                    "{}(): validation failed, expected partition with key larger or equal to _last_pkey {} due to _drop_partition_start being set, but got {}",
                    __FUNCTION__,
                    *_last_pkey,
                    ps.key());
+            // Reset drop flags and next pos if we are not continuing from the same partition
+            if (cmp_res < 0) {
+                // Close previous partition, we are not going to continue it.
+                push_mutation_fragment(*_schema, _permit, partition_end{});
+                _drop_partition_start = false;
+                _drop_static_row = false;
+                _next_position_in_partition = position_in_partition::for_partition_start();
+                _trim_range_tombstones = false;
+            }
        } else { // should be a larger partition
            require(
                    cmp_res < 0,
@@ -1293,9 +1308,14 @@ bool evictable_reader::should_drop_fragment(const mutation_fragment& mf) {
        _drop_partition_start = false;
        return true;
    }
-    if (_drop_static_row && mf.is_static_row()) {
-        _drop_static_row = false;
-        return true;
+    // Unlike partition-start above, a partition is not guaranteed to have a
+    // static row fragment. So reset the flag regardless of whether we could
+    // drop one or not.
+    // We are guaranteed to get here only right after dropping a partition-start,
+    // so if we are not seeing a static row here, the partition doesn't have one.
+    if (_drop_static_row) {
+         _drop_static_row = false;
+        return mf.is_static_row();
    }
    return false;
 }
--- a/mutation_writer/feed_writers.cc
+++ b/mutation_writer/feed_writers.cc
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2021 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "feed_writers.hh"
+
+namespace mutation_writer {
+
+bucket_writer::bucket_writer(schema_ptr schema, std::pair<flat_mutation_reader, queue_reader_handle> queue_reader, reader_consumer& consumer)
+    : _schema(schema)
+    , _handle(std::move(queue_reader.second)) // keep the handle half of the pair; fragments are pushed through it
+    , _consume_fut(consumer(std::move(queue_reader.first))) // give the reader half to the consumer now and store the future it returns
+{ }
+
+bucket_writer::bucket_writer(schema_ptr schema, reader_permit permit, reader_consumer& consumer)
+    : bucket_writer(schema, make_queue_reader(schema, std::move(permit)), consumer) // delegate to the pair-taking constructor
+{ }
+
+future<> bucket_writer::consume(mutation_fragment mf) {
+    return _handle.push(std::move(mf)); // forward the fragment through the queue reader handle; the result is push()'s future
+}
+
+void bucket_writer::consume_end_of_stream() {
+    _handle.push_end_of_stream(); // only signals end of input; the consumer's future is handed out by close()
+}
+
+void bucket_writer::abort(std::exception_ptr ep) noexcept {
+    _handle.abort(std::move(ep)); // pass the failure on to the queue reader handle
+}
+
+future<> bucket_writer::close() noexcept {
+    return std::move(_consume_fut); // give the caller the future obtained from the consumer in the constructor
+}
+
+} // mutation_writer
--- a/mutation_writer/feed_writers.hh
+++ b/mutation_writer/feed_writers.hh
@@ -22,10 +22,31 @@
 #pragma once

 #include "flat_mutation_reader.hh"
+#include "mutation_reader.hh"

 namespace mutation_writer {
 using reader_consumer = noncopyable_function<future<> (flat_mutation_reader)>;

+class bucket_writer { // writer reused by the shard-based and timestamp-based splitting writers
+    schema_ptr _schema;
+    queue_reader_handle _handle; // handle used to push fragments, end-of-stream and aborts
+    future<> _consume_fut; // future returned by the consumer, started in the constructor
+
+private:
+    bucket_writer(schema_ptr schema, std::pair<flat_mutation_reader, queue_reader_handle> queue_reader, reader_consumer& consumer); // delegation target of the public constructor
+
+public:
+    bucket_writer(schema_ptr schema, reader_permit permit, reader_consumer& consumer); // makes a queue reader and starts the consumer on it
+
+    future<> consume(mutation_fragment mf); // pushes mf through _handle
+
+    void consume_end_of_stream(); // pushes end-of-stream through _handle; does not wait for the consumer
+
+    void abort(std::exception_ptr ep) noexcept; // forwards ep to _handle.abort()
+
+    future<> close() noexcept; // hands out the consumer's future (_consume_fut)
+};
+
 template <typename Writer>
 requires MutationFragmentConsumer<Writer, future<>>
 future<> feed_writer(flat_mutation_reader&& rd, Writer&& wr) {
@@ -40,9 +61,17 @@ future<> feed_writer(flat_mutation_reader&& rd, Writer&& wr) {
            if (f.failed()) {
                auto ex = f.get_exception();
                wr.abort(ex);
-                return make_exception_future<>(ex);
+                return wr.close().then_wrapped([ex = std::move(ex)] (future<> f) mutable {
+                    if (f.failed()) {
+                        // The consumer is expected to fail when aborted,
+                        // so just ignore any exception.
+                        (void)f.get_exception();
+                    }
+                    return make_exception_future<>(std::move(ex));
+                });
            } else {
-                return wr.consume_end_of_stream();
+                wr.consume_end_of_stream();
+                return wr.close();
            }
        });
    });
--- a/mutation_writer/shard_based_splitting_writer.cc
+++ b/mutation_writer/shard_based_splitting_writer.cc
@@ -31,36 +31,7 @@
 namespace mutation_writer {

 class shard_based_splitting_mutation_writer {
-    class shard_writer {
-        queue_reader_handle _handle;
-        future<> _consume_fut;
-    private:
-        shard_writer(schema_ptr schema, std::pair<flat_mutation_reader, queue_reader_handle> queue_reader, reader_consumer& consumer)
-            : _handle(std::move(queue_reader.second))
-            , _consume_fut(consumer(std::move(queue_reader.first))) {
-        }
-
-    public:
-        shard_writer(schema_ptr schema, reader_permit permit, reader_consumer& consumer)
-            : shard_writer(schema, make_queue_reader(schema, std::move(permit)), consumer) {
-        }
-        future<> consume(mutation_fragment mf) {
-            return _handle.push(std::move(mf));
-        }
-        future<> consume_end_of_stream() {
-            // consume_end_of_stream is always called from a finally block,
-            // and that's because we wait for _consume_fut to return. We
-            // don't want to generate another exception here if the read was
-            // aborted.
-            if (!_handle.is_terminated()) {
-                _handle.push_end_of_stream();
-            }
-            return std::move(_consume_fut);
-        }
-        void abort(std::exception_ptr ep) {
-            _handle.abort(ep);
-        }
-    };
+    using shard_writer = bucket_writer;

 private:
    schema_ptr _schema;
@@ -105,13 +76,12 @@ public:
        return write_to_shard(mutation_fragment(*_schema, _permit, std::move(pe)));
    }

-    future<> consume_end_of_stream() {
-        return parallel_for_each(_shards, [] (std::optional<shard_writer>& shard) {
-            if (!shard) {
-                return make_ready_future<>();
+    void consume_end_of_stream() {
+        for (auto& shard : _shards) {
+            if (shard) {
+                shard->consume_end_of_stream();
            }
-            return shard->consume_end_of_stream();
-        });
+        }
    }
    void abort(std::exception_ptr ep) {
        for (auto&& shard : _shards) {
@@ -120,6 +90,11 @@ public:
            }
        }
    }
+    future<> close() noexcept {
+        return parallel_for_each(_shards, [] (std::optional<shard_writer>& shard) { // close every shard writer
+            return shard ? shard->close() : make_ready_future<>(); // disengaged slots have nothing to close
+        });
+    }
 };

 future<> segregate_by_shard(flat_mutation_reader producer, reader_consumer consumer) {
--- a/mutation_writer/timestamp_based_splitting_writer.cc
+++ b/mutation_writer/timestamp_based_splitting_writer.cc
@@ -109,22 +109,12 @@ small_flat_map<Key, Value, Size>::find(const key_type& k) {
 class timestamp_based_splitting_mutation_writer {
    using bucket_id = int64_t;

-    class bucket_writer {
-        schema_ptr _schema;
-        queue_reader_handle _handle;
-        future<> _consume_fut;
+    class timestamp_bucket_writer : public bucket_writer {
        bool _has_current_partition = false;

-    private:
-        bucket_writer(schema_ptr schema, std::pair<flat_mutation_reader, queue_reader_handle> queue_reader, reader_consumer& consumer)
-            : _schema(std::move(schema))
-            , _handle(std::move(queue_reader.second))
-            , _consume_fut(consumer(std::move(queue_reader.first))) {
-        }
-
    public:
-        bucket_writer(schema_ptr schema, reader_permit permit, reader_consumer& consumer)
-            : bucket_writer(schema, make_queue_reader(schema, std::move(permit)), consumer) {
+        timestamp_bucket_writer(schema_ptr schema, reader_permit permit, reader_consumer& consumer)
+            : bucket_writer(schema, std::move(permit), consumer) {
        }
        void set_has_current_partition() {
            _has_current_partition = true;
@@ -135,18 +125,6 @@ class timestamp_based_splitting_mutation_writer {
        bool has_current_partition() const {
            return _has_current_partition;
        }
-        future<> consume(mutation_fragment mf) {
-            return _handle.push(std::move(mf));
-        }
-        future<> consume_end_of_stream() {
-            if (!_handle.is_terminated()) {
-                _handle.push_end_of_stream();
-            }
-            return std::move(_consume_fut);
-        }
-        void abort(std::exception_ptr ep) {
-            _handle.abort(ep);
-        }
    };

 private:
@@ -155,7 +133,7 @@ private:
    classify_by_timestamp _classifier;
    reader_consumer _consumer;
    partition_start _current_partition_start;
-    std::unordered_map<bucket_id, bucket_writer> _buckets;
+    std::unordered_map<bucket_id, timestamp_bucket_writer> _buckets;
    std::vector<bucket_id> _buckets_used_for_current_partition;

 private:
@@ -186,16 +164,21 @@ public:
    future<> consume(range_tombstone&& rt);
    future<> consume(partition_end&& pe);

-    future<> consume_end_of_stream() {
-        return parallel_for_each(_buckets, [] (std::pair<const bucket_id, bucket_writer>& bucket) {
-            return bucket.second.consume_end_of_stream();
-        });
+    void consume_end_of_stream() {
+        for (auto& b : _buckets) {
+            b.second.consume_end_of_stream();
+        }
    }
    void abort(std::exception_ptr ep) {
        for (auto&& b : _buckets) {
            b.second.abort(ep);
        }
    }
+    future<> close() noexcept {
+        return parallel_for_each(_buckets, [] (std::pair<const bucket_id, timestamp_bucket_writer>& b) { // one close() per bucket writer
+            return b.second.close();
+        });
+    }
 };

 future<> timestamp_based_splitting_mutation_writer::write_to_bucket(bucket_id bucket, mutation_fragment&& mf) {
--- a/query-result.hh
+++ b/query-result.hh
@@ -205,6 +205,10 @@ public:
            auto to_block = std::min(_used_memory - _blocked_bytes, n);
            _blocked_bytes += to_block;
            stop = (_limiter->update_and_check(to_block) && _stop_on_global_limit) || stop;
+            if (stop && !_short_read_allowed) {
+                // If we are here we stopped because of the global limit.
+                throw std::runtime_error("Maximum amount of memory for building query results is exhausted, unpaged query cannot be finished");
+            }
        }
        return stop;
    }
--- a/reader_concurrency_semaphore.cc
+++ b/reader_concurrency_semaphore.cc
@@ -75,7 +75,7 @@ class reader_permit::impl : public boost::intrusive::list_base_hook<boost::intru
    sstring _op_name;
    std::string_view _op_name_view;
    reader_resources _resources;
-    reader_permit::state _state = reader_permit::state::registered;
+    reader_permit::state _state = reader_permit::state::active;

 public:
    struct value_tag {};
@@ -123,22 +123,17 @@ public:
    }

    void on_admission() {
-        _state = reader_permit::state::admitted;
-        _semaphore.consume(_resources);
+        _state = reader_permit::state::active;
    }

    void consume(reader_resources res) {
        _resources += res;
-        if (_state == reader_permit::state::admitted) {
-            _semaphore.consume(res);
-        }
+        _semaphore.consume(res);
    }

    void signal(reader_resources res) {
        _resources -= res;
-        if (_state == reader_permit::state::admitted) {
-            _semaphore.signal(res);
-        }
+        _semaphore.signal(res);
    }

    reader_resources resources() const {
@@ -205,14 +200,11 @@ reader_resources reader_permit::consumed_resources() const {

 std::ostream& operator<<(std::ostream& os, reader_permit::state s) {
    switch (s) {
-        case reader_permit::state::registered:
-            os << "registered";
-            break;
        case reader_permit::state::waiting:
            os << "waiting";
            break;
-        case reader_permit::state::admitted:
-            os << "admitted";
+        case reader_permit::state::active:
+            os << "active";
            break;
    }
    return os;
@@ -249,7 +241,7 @@ struct permit_group_key_hash {

 using permit_groups = std::unordered_map<permit_group_key, permit_stats, permit_group_key_hash>;

-static permit_stats do_dump_reader_permit_diagnostics(std::ostream& os, const permit_groups& permits, reader_permit::state state, bool sort_by_memory) {
+static permit_stats do_dump_reader_permit_diagnostics(std::ostream& os, const permit_groups& permits, reader_permit::state state) {
    struct permit_summary {
        const schema* s;
        std::string_view op_name;
@@ -265,25 +257,17 @@ static permit_stats do_dump_reader_permit_diagnostics(std::ostream& os, const pe
        }
    }

-    std::ranges::sort(permit_summaries, [sort_by_memory] (const permit_summary& a, const permit_summary& b) {
-        if (sort_by_memory) {
-            return a.memory < b.memory;
-        } else {
-            return a.count < b.count;
-        }
+    std::ranges::sort(permit_summaries, [] (const permit_summary& a, const permit_summary& b) {
+        return a.memory < b.memory;
    });

    permit_stats total;

-    auto print_line = [&os, sort_by_memory] (auto col1, auto col2, auto col3) {
-        if (sort_by_memory) {
-            fmt::print(os, "{}\t{}\t{}\n", col2, col1, col3);
-        } else {
-            fmt::print(os, "{}\t{}\t{}\n", col1, col2, col3);
-        }
+    auto print_line = [&os] (auto col1, auto col2, auto col3) {
+        fmt::print(os, "{}\t{}\t{}\n", col2, col1, col3);
    };

-    fmt::print(os, "Permits with state {}, sorted by {}\n", state, sort_by_memory ? "memory" : "count");
+    fmt::print(os, "Permits with state {}\n", state);
    print_line("count", "memory", "name");
    for (const auto& summary : permit_summaries) {
        total.count += summary.count;
@@ -309,11 +293,9 @@ static void do_dump_reader_permit_diagnostics(std::ostream& os, const reader_con
    permit_stats total;

    fmt::print(os, "Semaphore {}: {}, dumping permit diagnostics:\n", semaphore.name(), problem);
-    total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::admitted, true);
+    total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::active);
    fmt::print(os, "\n");
-    total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::waiting, false);
-    fmt::print(os, "\n");
-    total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::registered, false);
+    total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::waiting);
    fmt::print(os, "\n");
    fmt::print(os, "Total: permits: {}, memory: {}\n", total.count, utils::to_hr_size(total.memory));
 }
@@ -374,7 +356,7 @@ reader_concurrency_semaphore::~reader_concurrency_semaphore() {
 reader_concurrency_semaphore::inactive_read_handle reader_concurrency_semaphore::register_inactive_read(std::unique_ptr<inactive_read> ir) {
    // Implies _inactive_reads.empty(), we don't queue new readers before
    // evicting all inactive reads.
-    if (_wait_list.empty()) {
+    if (_wait_list.empty() && _resources.memory > 0) {
        const auto [it, _] = _inactive_reads.emplace(_next_id++, std::move(ir));
        (void)_;
        ++_stats.inactive_reads;
@@ -424,13 +406,13 @@ bool reader_concurrency_semaphore::try_evict_one_inactive_read() {
 }

 bool reader_concurrency_semaphore::has_available_units(const resources& r) const {
-    return bool(_resources) && _resources >= r;
+    // Special case: when there is no active reader (based on count) admit one
+    // regardless of availability of memory.
+    return (bool(_resources) && _resources >= r) || _resources.count == _initial_resources.count;
 }

 bool reader_concurrency_semaphore::may_proceed(const resources& r) const {
-    // Special case: when there is no active reader (based on count) admit one
-    // regardless of availability of memory.
-    return _wait_list.empty() && (has_available_units(r) || _resources.count == _initial_resources.count);
+    return _wait_list.empty() && has_available_units(r);
 }

 future<reader_permit::resource_units> reader_concurrency_semaphore::do_wait_admission(reader_permit permit, size_t memory,
@@ -480,6 +462,12 @@ void reader_concurrency_semaphore::broken(std::exception_ptr ex) {
    }
 }

+std::string reader_concurrency_semaphore::dump_diagnostics() const {
+    std::ostringstream os; // collects the report so it can be returned as a string
+    do_dump_reader_permit_diagnostics(os, *this, *_permit_list, "user request"); // NOTE(review): last argument is presumably the `problem` text shown in the report header - confirm
+    return os.str();
+}
+
 // A file that tracks the memory usage of buffers resulting from read
 // operations.
 class tracking_file_impl : public file_impl {
--- a/reader_concurrency_semaphore.hh
+++ b/reader_concurrency_semaphore.hh
@@ -231,4 +231,6 @@ public:
    }

    void broken(std::exception_ptr ex);
+
+    std::string dump_diagnostics() const;
 };
--- a/reader_permit.hh
+++ b/reader_permit.hh
@@ -91,9 +91,8 @@ public:
    class resource_units;

    enum class state {
-        registered, // read is registered, but didn't attempt admission yet
        waiting, // waiting for admission
-        admitted,
+        active,
    };

    class impl;
--- a/repair/repair.cc
+++ b/repair/repair.cc
@@ -309,7 +309,7 @@ float node_ops_metrics::repair_finished_percentage() {
 tracker::tracker(size_t nr_shards, size_t max_repair_memory)
    : _shutdown(false)
    , _repairs(nr_shards) {
-    auto nr = std::max(size_t(1), size_t(max_repair_memory / max_repair_memory_per_range()));
+    auto nr = std::max(size_t(1), size_t(max_repair_memory / max_repair_memory_per_range() / 4));
    rlogger.info("Setting max_repair_memory={}, max_repair_memory_per_range={}, max_repair_ranges_in_parallel={}",
        max_repair_memory, max_repair_memory_per_range(), nr);
    _range_parallelism_semaphores.reserve(nr_shards);
--- a/repair/row_level.cc
+++ b/repair/row_level.cc
@@ -571,7 +571,7 @@ public:
        _mq[node_idx] = std::move(queue_handle);
        auto writer = shared_from_this();
        _writer_done[node_idx] = mutation_writer::distribute_reader_and_consume_on_shards(_schema, std::move(queue_reader),
-                [&db, reason = this->_reason, estimated_partitions = this->_estimated_partitions, writer] (flat_mutation_reader reader) {
+                [&db, reason = this->_reason, estimated_partitions = this->_estimated_partitions] (flat_mutation_reader reader) {
            auto& t = db.local().find_column_family(reader.schema());
            return db::view::check_needs_view_update_path(_sys_dist_ks->local(), t, reason).then([t = t.shared_from_this(), estimated_partitions, reader = std::move(reader)] (bool use_view_update_path) mutable {
                //FIXME: for better estimations this should be transmitted from remote
--- a/schema.cc
+++ b/schema.cc
@@ -456,6 +456,9 @@ schema::schema(const schema& o)
    rebuild();
    if (o.is_view()) {
        _view_info = std::make_unique<::view_info>(*this, o.view_info()->raw());
+        if (o.view_info()->base_info()) {
+            _view_info->set_base_info(o.view_info()->base_info());
+        }
    }
 }

@@ -859,7 +862,7 @@ std::ostream& schema::describe(database& db, std::ostream& os) const {
    os << "}";
    os << "\n    AND comment = '" << comment()<< "'";
    os << "\n    AND compaction = {'class': '" <<  sstables::compaction_strategy::name(compaction_strategy()) << "'";
-    map_as_cql_param(os, compaction_strategy_options()) << "}";
+    map_as_cql_param(os, compaction_strategy_options(), false) << "}";
    os << "\n    AND compression = {";
    map_as_cql_param(os,  get_compressor_params().get_options());
    os << "}";
--- a/schema_registry.cc
+++ b/schema_registry.cc
@@ -24,6 +24,7 @@
 #include "schema_registry.hh"
 #include "log.hh"
 #include "db/schema_tables.hh"
+#include "view_info.hh"

 static logging::logger slogger("schema_registry");

@@ -274,22 +275,43 @@ global_schema_ptr::global_schema_ptr(global_schema_ptr&& o) noexcept {
    assert(o._cpu_of_origin == current);
    _ptr = std::move(o._ptr);
    _cpu_of_origin = current;
+    _base_schema = std::move(o._base_schema);
 }

 schema_ptr global_schema_ptr::get() const {
    if (this_shard_id() == _cpu_of_origin) {
        return _ptr;
    } else {
-        // 'e' points to a foreign entry, but we know it won't be evicted
-        // because _ptr is preventing this.
-        const schema_registry_entry& e = *_ptr->registry_entry();
-        schema_ptr s = local_schema_registry().get_or_null(e.version());
-        if (!s) {
-            s = local_schema_registry().get_or_load(e.version(), [&e](table_schema_version) {
-                return e.frozen();
-            });
+        auto registered_schema = [](const schema_registry_entry& e) {
+            schema_ptr ret = local_schema_registry().get_or_null(e.version());
+            if (!ret) {
+                ret = local_schema_registry().get_or_load(e.version(), [&e](table_schema_version) {
+                    return e.frozen();
+                });
+            }
+            return ret;
+        };
+
+        schema_ptr registered_bs;
+        // The following code dereferences registry entries that belong to a foreign shard.
+        // However, it is guaranteed to succeed since we made sure in the constructor
+        // that _base_schema and _ptr have a registry entry on the foreign shard where this
+        // object originated, so as long as this object lives the registry entries live too
+        // and it is safe to reference them on foreign shards.
+        if (_base_schema) {
+            registered_bs = registered_schema(*_base_schema->registry_entry());
+            if (_base_schema->registry_entry()->is_synced()) {
+                registered_bs->registry_entry()->mark_synced();
+            }
        }
-        if (e.is_synced()) {
+        schema_ptr s = registered_schema(*_ptr->registry_entry());
+        if (s->is_view()) {
+            if (!s->view_info()->base_info()) {
+                // we know that registered_bs is valid here because we make sure of it in the constructors.
+                s->view_info()->set_base_info(s->view_info()->make_base_dependent_view_info(*registered_bs));
+            }
+        }
+        if (_ptr->registry_entry()->is_synced()) {
            s->registry_entry()->mark_synced();
        }
        return s;
@@ -297,16 +319,33 @@ schema_ptr global_schema_ptr::get() const {
 }

 global_schema_ptr::global_schema_ptr(const schema_ptr& ptr)
-    : _ptr([&ptr]() {
-        // _ptr must always have an associated registry entry,
-        // if ptr doesn't, we need to load it into the registry.
-        schema_registry_entry* e = ptr->registry_entry();
+        : _cpu_of_origin(this_shard_id()) {
+    // _ptr must always have an associated registry entry,
+    // if ptr doesn't, we need to load it into the registry.
+    auto ensure_registry_entry = [] (const schema_ptr& s) {
+        schema_registry_entry* e = s->registry_entry();
        if (e) {
-            return ptr;
-        }
-        return local_schema_registry().get_or_load(ptr->version(), [&ptr] (table_schema_version) {
-                return frozen_schema(ptr);
+            return s;
+        } else {
+            return local_schema_registry().get_or_load(s->version(), [&s] (table_schema_version) {
+                return frozen_schema(s);
            });
-        }())
-    , _cpu_of_origin(this_shard_id())
-{ }
+        }
+    };
+
+    schema_ptr s = ensure_registry_entry(ptr);
+    if (s->is_view()) {
+        if (s->view_info()->base_info()) {
+            _base_schema = ensure_registry_entry(s->view_info()->base_info()->base_schema());
+        } else if (ptr->view_info()->base_info()) {
+            _base_schema = ensure_registry_entry(ptr->view_info()->base_info()->base_schema());
+        } else {
+            on_internal_error(slogger, format("Tried to build a global schema for view {}.{} with an uninitialized base info", s->ks_name(), s->cf_name()));
+        }
+
+        if (!s->view_info()->base_info() || !s->view_info()->base_info()->base_schema()->registry_entry()) {
+            s->view_info()->set_base_info(s->view_info()->make_base_dependent_view_info(*_base_schema));
+        }
+    }
+    _ptr = s;
+}
--- a/schema_registry.hh
+++ b/schema_registry.hh
@@ -165,6 +165,7 @@ schema_registry& local_schema_registry();
 // chain will last.
 class global_schema_ptr {
    schema_ptr _ptr;
+    schema_ptr _base_schema;
    unsigned _cpu_of_origin;
 public:
    // Note: the schema_ptr must come from the current shard and can't be nullptr.
--- a/2
+++ b/2
--- a/service/migration_manager.cc
+++ b/service/migration_manager.cc
@@ -53,6 +53,7 @@
 #include "database.hh"
 #include "db/schema_tables.hh"
 #include "types/user.hh"
+#include "db/schema_tables.hh"

 namespace service {

@@ -1096,8 +1097,19 @@ future<schema_ptr> get_schema_definition(table_schema_version v, netw::messaging
            // referenced by the incoming request.
            // That means the column mapping for the schema should always be inserted
            // with TTL (refresh TTL in case column mapping already existed prior to that).
-            return db::schema_tables::store_column_mapping(proxy, s.unfreeze(db::schema_ctxt(proxy)), true).then([s] {
-                return s;
+            auto us = s.unfreeze(db::schema_ctxt(proxy));
+            // If this is a view, we might need to fix its schema before registering it.
+            if (us->is_view()) {
+                auto& db = proxy.local().local_db();
+                schema_ptr base_schema = db.find_schema(us->view_info()->base_id());
+                auto fixed_view = db::schema_tables::maybe_fix_legacy_secondary_index_mv_schema(db, view_ptr(us), base_schema,
+                        db::schema_tables::preserve_version::yes);
+                if (fixed_view) {
+                    us = fixed_view;
+                }
+            }
+            return db::schema_tables::store_column_mapping(proxy, us, true).then([us] {
+                return frozen_schema{us};
            });
        });
    }).then([] (schema_ptr s) {
@@ -1105,7 +1117,7 @@ future<schema_ptr> get_schema_definition(table_schema_version v, netw::messaging
        // table.
        if (s->is_view()) {
            if (!s->view_info()->base_info()) {
-                auto& db = service::get_local_storage_proxy().get_db().local();
+                auto& db = service::get_local_storage_proxy().local_db();
                // This line might throw a no_such_column_family
                // It should be fine since if we tried to register a view for which
                // we don't know the base table, our registry is broken.
--- a/service/storage_proxy.cc
+++ b/service/storage_proxy.cc
@@ -3624,6 +3624,11 @@ protected:

 public:
    virtual future<foreign_ptr<lw_shared_ptr<query::result>>> execute(storage_proxy::clock_type::time_point timeout) {
+        if (_targets.empty()) {
+            // We may have no targets to read from if a DC with zero replication is queried with LOCAL_QUORUM.
+            // Return an empty result in this case.
+            return make_ready_future<foreign_ptr<lw_shared_ptr<query::result>>>(make_foreign(make_lw_shared(query::result())));
+        }
        digest_resolver_ptr digest_resolver = ::make_shared<digest_read_resolver>(_schema, _cl, _block_for,
                db::is_datacenter_local(_cl) ? db::count_local_endpoints(_targets): _targets.size(), timeout);
        auto exec = shared_from_this();
--- a/service/storage_proxy.hh
+++ b/service/storage_proxy.hh
@@ -446,6 +446,12 @@ public:
    distributed<database>& get_db() {
        return _db;
    }
+    const database& local_db() const noexcept {
+        return _db.local();
+    }
+    database& local_db() noexcept {
+        return _db.local();
+    }

    void set_cdc_service(cdc::cdc_service* cdc) {
        _cdc = cdc;
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -2336,7 +2336,7 @@ future<> storage_service::rebuild(sstring source_dc) {
                    slogger.info("Streaming for rebuild successful");
                }).handle_exception([] (auto ep) {
                    // This is used exclusively through JMX, so log the full trace but only throw a simple RTE
-                    slogger.warn("Error while rebuilding node: {}", std::current_exception());
+                    slogger.warn("Error while rebuilding node: {}", ep);
                    return make_exception_future<>(std::move(ep));
                });
            });
--- a/sstables/compaction.cc
+++ b/sstables/compaction.cc
@@ -438,7 +438,6 @@ protected:
    mutation_source_metadata _ms_metadata = {};
    garbage_collected_sstable_writer::data _gc_sstable_writer_data;
    compaction_sstable_replacer_fn _replacer;
-    std::optional<compaction_weight_registration> _weight_registration;
    utils::UUID _run_identifier;
    ::io_priority_class _io_priority;
    // optional clone of sstable set to be used for expiration purposes, so it will be set if expiration is enabled.
@@ -457,7 +456,6 @@ protected:
        , _sstable_level(descriptor.level)
        , _gc_sstable_writer_data(*this)
        , _replacer(std::move(descriptor.replacer))
-        , _weight_registration(std::move(descriptor.weight_registration))
        , _run_identifier(descriptor.run_identifier)
        , _io_priority(descriptor.io_priority)
        , _sstable_set(std::move(descriptor.all_sstables_snapshot))
@@ -929,9 +927,6 @@ public:
    }

    virtual void on_end_of_compaction() override {
-        if (_weight_registration) {
-            _cf.get_compaction_manager().on_compaction_complete(*_weight_registration);
-        }
        replace_remaining_exhausted_sstables();
    }
 private:
--- a/sstables/compaction_descriptor.hh
+++ b/sstables/compaction_descriptor.hh
@@ -134,8 +134,6 @@ struct compaction_descriptor {
    uint64_t max_sstable_bytes;
    // Run identifier of output sstables.
    utils::UUID run_identifier;
-    // Holds ownership of a weight assigned to this compaction iff it's a regular one.
-    std::optional<compaction_weight_registration> weight_registration;
    // Calls compaction manager's task for this compaction to release reference to exhausted sstables.
    std::function<void(const std::vector<shared_sstable>& exhausted_sstables)> release_exhausted;
    // The options passed down to the compaction code.
--- a/sstables/compaction_manager.cc
+++ b/sstables/compaction_manager.cc
@@ -311,6 +311,7 @@ future<> compaction_manager::run_custom_job(column_family* cf, sstring name, non
            cmlog.info("{} was abruptly stopped, reason: {}", name, e.what());
        } catch (...) {
            cmlog.error("{} failed: {}", name, std::current_exception());
+            throw;
        }
    });
    return task->compaction_done.get_future().then([task] {});
@@ -435,7 +436,7 @@ void compaction_manager::reevaluate_postponed_compactions() {
 }

 void compaction_manager::postpone_compaction_for_column_family(column_family* cf) {
-    _postponed.push_back(cf);
+    _postponed.insert(cf);
 }

 future<> compaction_manager::stop_ongoing_compactions(sstring reason) {
@@ -575,7 +576,7 @@ void compaction_manager::submit(column_family* cf) {
                return make_ready_future<stop_iteration>(stop_iteration::yes);
            }
            auto compacting = make_lw_shared<compacting_sstable_registration>(this, descriptor.sstables);
-            descriptor.weight_registration = compaction_weight_registration(this, weight);
+            auto weight_r = compaction_weight_registration(this, weight);
            descriptor.release_exhausted = [compacting] (const std::vector<sstables::shared_sstable>& exhausted_sstables) {
                compacting->release_compacting(exhausted_sstables);
            };
@@ -585,7 +586,7 @@ void compaction_manager::submit(column_family* cf) {
            _stats.pending_tasks--;
            _stats.active_tasks++;
            task->compaction_running = true;
-            return cf.run_compaction(std::move(descriptor)).then_wrapped([this, task, compacting = std::move(compacting)] (future<> f) mutable {
+            return cf.run_compaction(std::move(descriptor)).then_wrapped([this, task, compacting = std::move(compacting), weight_r = std::move(weight_r)] (future<> f) mutable {
                _stats.active_tasks--;
                task->compaction_running = false;

@@ -629,10 +630,11 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
    _tasks.push_back(task);

    auto sstables = std::make_unique<std::vector<sstables::shared_sstable>>(get_func(*cf));
+    auto compacting = make_lw_shared<compacting_sstable_registration>(this, *sstables);
    auto sstables_ptr = sstables.get();
    _stats.pending_tasks += sstables->size();

-    task->compaction_done = do_until([sstables_ptr] { return sstables_ptr->empty(); }, [this, task, options, sstables_ptr] () mutable {
+    task->compaction_done = do_until([sstables_ptr] { return sstables_ptr->empty(); }, [this, task, options, sstables_ptr, compacting] () mutable {

        // FIXME: lock cf here
        if (!can_proceed(task)) {
@@ -642,7 +644,7 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
        auto sst = sstables_ptr->back();
        sstables_ptr->pop_back();

-        return repeat([this, task, options, sst = std::move(sst)] () mutable {
+        return repeat([this, task, options, sst = std::move(sst), compacting] () mutable {
            column_family& cf = *task->compacting_cf;
            auto sstable_level = sst->get_sstable_level();
            auto run_identifier = sst->run_identifier();
@@ -650,21 +652,22 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
            auto descriptor = sstables::compaction_descriptor({ sst }, cf.get_sstable_set(), service::get_local_compaction_priority(),
                sstable_level, sstables::compaction_descriptor::default_max_sstable_bytes, run_identifier, options);

-            auto compacting = make_lw_shared<compacting_sstable_registration>(this, descriptor.sstables);
            // Releases reference to cleaned sstable such that respective used disk space can be freed.
            descriptor.release_exhausted = [compacting] (const std::vector<sstables::shared_sstable>& exhausted_sstables) {
                compacting->release_compacting(exhausted_sstables);
            };

-            _stats.pending_tasks--;
-            _stats.active_tasks++;
-            task->compaction_running = true;
-            compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_compaction_controller.backlog_of_shares(200), _available_memory));
-            return do_with(std::move(user_initiated), [this, &cf, descriptor = std::move(descriptor)] (compaction_backlog_tracker& bt) mutable {
-                return with_scheduling_group(_scheduling_group, [this, &cf, descriptor = std::move(descriptor)] () mutable {
-                    return cf.run_compaction(std::move(descriptor));
+            return with_semaphore(_rewrite_sstables_sem, 1, [this, task, &cf, descriptor = std::move(descriptor)] () mutable {
+                _stats.pending_tasks--;
+                _stats.active_tasks++;
+                task->compaction_running = true;
+                compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_compaction_controller.backlog_of_shares(200), _available_memory));
+                return do_with(std::move(user_initiated), [this, &cf, descriptor = std::move(descriptor)] (compaction_backlog_tracker& bt) mutable {
+                    return with_scheduling_group(_scheduling_group, [this, &cf, descriptor = std::move(descriptor)]() mutable {
+                        return cf.run_compaction(std::move(descriptor));
+                    });
                });
-            }).then_wrapped([this, task, compacting = std::move(compacting)] (future<> f) mutable {
+            }).then_wrapped([this, task, compacting] (future<> f) mutable {
                task->compaction_running = false;
                _stats.active_tasks--;
                if (!can_proceed(task)) {
@@ -796,7 +799,7 @@ future<> compaction_manager::remove(column_family* cf) {
            task->stopping = true;
        }
    }
-    _postponed.erase(boost::remove(_postponed, cf), _postponed.end());
+    _postponed.erase(cf);

    // Wait for the termination of an ongoing compaction on cf, if any.
    return do_for_each(*tasks_to_stop, [this, cf] (auto& task) {
@@ -832,11 +835,6 @@ void compaction_manager::stop_compaction(sstring type) {
    }
 }

-void compaction_manager::on_compaction_complete(compaction_weight_registration& weight_registration) {
-    weight_registration.deregister();
-    reevaluate_postponed_compactions();
-}
-
 void compaction_manager::propagate_replacement(column_family* cf,
        const std::vector<sstables::shared_sstable>& removed, const std::vector<sstables::shared_sstable>& added) {
    for (auto& info : _compactions) {
--- a/sstables/compaction_manager.hh
+++ b/sstables/compaction_manager.hh
@@ -99,7 +99,7 @@ private:
    future<> _waiting_reevalution = make_ready_future<>();
    condition_variable _postponed_reevaluation;
    // column families that wait for compaction but had its submission postponed due to ongoing compaction.
-    std::vector<column_family*> _postponed;
+    std::unordered_set<column_family*> _postponed;
    // tracks taken weights of ongoing compactions, only one compaction per weight is allowed.
    // weight is value assigned to a compaction job that is log base N of total size of all input sstables.
    std::unordered_set<int> _weight_tracker;
@@ -111,6 +111,7 @@ private:
    std::unordered_map<column_family*, rwlock> _compaction_locks;

    semaphore _custom_job_sem{1};
+    seastar::named_semaphore _rewrite_sstables_sem = {1, named_semaphore_exception_factory{"rewrite sstables"}};

    std::function<void()> compaction_submission_callback();
    // all registered column families are submitted for compaction at a constant interval.
@@ -255,11 +256,6 @@ public:
    // Stops ongoing compaction of a given type.
    void stop_compaction(sstring type);

-    // Called by compaction procedure to release the weight lock assigned to it, such that
-    // another compaction waiting on same weight can start as soon as possible. That's usually
-    // called before compaction seals sstable and such and after all compaction work is done.
-    void on_compaction_complete(compaction_weight_registration& weight_registration);
-
    double backlog() {
        return _backlog_manager.backlog();
    }
--- a/sstables/index_reader.hh
+++ b/sstables/index_reader.hh
@@ -367,6 +367,7 @@ class index_reader {
    const io_priority_class& _pc;
    tracing::trace_state_ptr _trace_state;
    shared_index_lists _index_lists;
+    future<> _background_closes = make_ready_future<>();

    struct reader {
        index_consumer _consumer;
@@ -472,6 +473,16 @@ private:
        };

        return _index_lists.get_or_load(summary_idx, loader).then([this, &bound, summary_idx] (shared_index_lists::list_ptr ref) {
+            // to make sure list is not closed when another bound is still using it, index list will only be closed when there's only one owner holding it
+            if (bound.current_list && bound.current_list.use_count() == 1) {
+                // a new background close will only be initiated when previous ones terminate, so as to limit the concurrency.
+                _background_closes = _background_closes.then_wrapped([current_list = std::move(bound.current_list)] (future<>&& f) mutable {
+                    f.ignore_ready_future();
+                    return do_with(std::move(current_list), [] (shared_index_lists::list_ptr& current_list) mutable {
+                        return close_index_list(current_list);
+                    });
+                });
+            }
            bound.current_list = std::move(ref);
            bound.current_summary_idx = summary_idx;
            bound.current_index_idx = 0;
@@ -841,6 +852,8 @@ public:
                return close_index_list(_upper_bound->current_list);
            }
            return make_ready_future<>();
+        }).then([this] () mutable {
+            return std::move(_background_closes);
        });
    }
 };
--- a/sstables/kl/writer.cc
+++ b/sstables/kl/writer.cc
@@ -315,8 +315,8 @@ void sstable_writer_k_l::write_collection(file_writer& out, const composite& clu
 void sstable_writer_k_l::write_clustered_row(file_writer& out, const schema& schema, const clustering_row& clustered_row) {
    auto clustering_key = composite::from_clustering_element(schema, clustered_row.key());

-    maybe_write_row_marker(out, schema, clustered_row.marker(), clustering_key);
    maybe_write_row_tombstone(out, clustering_key, clustered_row);
+    maybe_write_row_marker(out, schema, clustered_row.marker(), clustering_key);

    _collector.update_min_max_components(clustered_row.key());

--- a/sstables/leveled_compaction_strategy.cc
+++ b/sstables/leveled_compaction_strategy.cc
@@ -147,7 +147,7 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input
        unsigned overlapping_sstables = 0;
        auto prev_last = dht::ring_position::min();
        for (auto& sst : sstables) {
-            if (dht::ring_position(sst->get_first_decorated_key()).less_compare(*schema, prev_last)) {
+            if (dht::ring_position(sst->get_first_decorated_key()).tri_compare(*schema, prev_last) <= 0) {
                overlapping_sstables++;
            }
            prev_last = dht::ring_position(sst->get_last_decorated_key());
@@ -178,7 +178,7 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input

    unsigned max_filled_level = 0;

-    size_t offstrategy_threshold = std::max(schema->min_compaction_threshold(), 4);
+    size_t offstrategy_threshold = (mode == reshape_mode::strict) ? std::max(schema->min_compaction_threshold(), 4) : std::max(schema->max_compaction_threshold(), 32);
    size_t max_sstables = std::max(schema->max_compaction_threshold(), int(offstrategy_threshold));
    auto tolerance = [mode] (unsigned level) -> unsigned {
        if (mode == reshape_mode::strict) {
@@ -189,10 +189,8 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input
    };

    if (level_info[0].size() > offstrategy_threshold) {
-        level_info[0].resize(std::min(level_info[0].size(), max_sstables));
-        compaction_descriptor desc(std::move(level_info[0]), std::optional<sstables::sstable_set>(), iop);
-        desc.options = compaction_options::make_reshape();
-        return desc;
+        size_tiered_compaction_strategy stcs(_stcs_options);
+        return stcs.get_reshaping_job(std::move(level_info[0]), schema, iop, mode);
    }

    for (unsigned level = leveled_manifest::MAX_LEVELS - 1; level > 0; --level) {
--- a/sstables/mp_row_consumer.hh
+++ b/sstables/mp_row_consumer.hh
@@ -1145,7 +1145,11 @@ public:
        setup_for_partition(pk);
        auto dk = dht::decorate_key(*_schema, pk);
        _reader->on_next_partition(std::move(dk), tombstone(deltime));
-        return proceed::yes;
+        // Only partition start will be consumed if processing a large run of partition tombstones,
+        // so let's stop the consumer if buffer is full.
+        // Otherwise, partition tombstones will keep accumulating in memory till other fragment type
+        // is found which can stop the consumer (perhaps there's none if sstable is full of tombstones).
+        return proceed(!_reader->is_buffer_full());
    }

    virtual consumer_m::row_processing_result consume_row_start(const std::vector<temporary_buffer<char>>& ecp) override {
--- a/sstables/size_tiered_compaction_strategy.cc
+++ b/sstables/size_tiered_compaction_strategy.cc
@@ -256,6 +256,7 @@ size_tiered_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> i
            bucket.resize(std::min(max_sstables, bucket.size()));
            compaction_descriptor desc(std::move(bucket), std::optional<sstables::sstable_set>(), iop);
            desc.options = compaction_options::make_reshape();
+            return desc;
        }
    }

--- a/sstables/time_window_compaction_strategy.hh
+++ b/sstables/time_window_compaction_strategy.hh
@@ -101,7 +101,8 @@ class time_window_compaction_strategy : public compaction_strategy_impl {
    time_window_compaction_strategy_options _options;
    int64_t _estimated_remaining_tasks = 0;
    db_clock::time_point _last_expired_check;
-    timestamp_type _highest_window_seen;
+    // As timestamp_type is an int64_t, a primitive type, it must be initialized here.
+    timestamp_type _highest_window_seen = 0;
    // Keep track of all recent active windows that still need to be compacted into a single SSTable
    std::unordered_set<timestamp_type> _recent_active_windows;
    size_tiered_compaction_strategy_options _stcs_options;
--- a/table.cc
+++ b/table.cc
@@ -1551,6 +1551,10 @@ future<> table::flush_streaming_mutations(utils::UUID plan_id, dht::partition_ra
    });
 }

+bool table::can_flush() const {
+    return _memtables->can_flush();
+}
+
 future<> table::clear() {
    if (_commitlog) {
        _commitlog->discard_completed_segments(_schema->id());
--- a/test/alternator/conftest.py
+++ b/test/alternator/conftest.py
@@ -80,7 +80,7 @@ def dynamodb(request):
        verify = not request.config.getoption('https')
        return boto3.resource('dynamodb', endpoint_url=local_url, verify=verify,
            region_name='us-east-1', aws_access_key_id='alternator', aws_secret_access_key='secret_pass',
-            config=botocore.client.Config(retries={"max_attempts": 3}))
+            config=botocore.client.Config(retries={"max_attempts": 0}, read_timeout=300))

@pytest.fixture(scope="session")
 def dynamodbstreams(request):
--- a/test/alternator/test_condition_expression.py
+++ b/test/alternator/test_condition_expression.py
@@ -154,6 +154,27 @@ def test_update_condition_eq_unequal(test_table_s):
            ConditionExpression='q = :oldval',
            ExpressionAttributeValues={':val1': 3, ':oldval': 2})

+# In test_update_condition_eq_unequal() above we saw that a non-existent
+# attribute is not "=" to a value. Here we check what happens when two
+# non-existent attributes are checked for equality. It turns out, they should
+# *not* be considered equal. In short, an unset attribute is never equal to
+# anything - not even to another unset attribute.
+# Reproduces issue #8511.
+def test_update_condition_eq_two_unset(test_table_s):
+    p = random_string()
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET a = :val1',
+            ConditionExpression='q = z',
+            ExpressionAttributeValues={':val1': 2})
+    test_table_s.update_item(Key={'p': p},
+        AttributeUpdates={'a': {'Value': 1, 'Action': 'PUT'}})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET a = :val1',
+            ConditionExpression='q = z',
+            ExpressionAttributeValues={':val1': 3})
+
 # Check that set equality is checked correctly. Unlike string equality (for
 # example), it cannot be done with just naive string comparison of the JSON
 # representation, and we need to allow for any order. (see issue #5021)
@@ -175,6 +196,39 @@ def test_update_condition_eq_set(test_table_s):
        ExpressionAttributeValues={':val1': 3, ':oldval': set(['chinchilla', 'cat', 'dog', 'mouse'])})
    assert 'b' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']

+# The above test (test_update_condition_eq_set()) checked equality of simple
+# set attributes. But an attribute can contain a nested document, where the
+# set sits in a deep level (the set itself is a leaf in this hierarchy because
+# it can only contain numbers, strings or bytes). We need to correctly support
+# equality check in that case too.
+# Reproduces issue #8514.
+@pytest.mark.skip(reason="test needs nested update not yet in branch 4.3")
+def test_update_condition_eq_nested_set(test_table_s):
+    p = random_string()
+    # Because boto3 sorts the set values we give it, in order to generate a
+    # set with a different order, we need to build it incrementally.
+    test_table_s.update_item(Key={'p': p},
+        AttributeUpdates={'a': {'Value': {'b': 'c', 'd': ['e', 'f', set(['g', 'h'])], 'i': set(['j', 'k'])}, 'Action': 'PUT'}})
+    test_table_s.update_item(Key={'p': p},
+        UpdateExpression='ADD a.d[2] :val1, a.i :val2',
+        ExpressionAttributeValues={':val1': set(['l', 'm']), ':val2': set(['n', 'o'])})
+    # Sanity check - the attribute contains the set we think it does
+    expected = {'b': 'c', 'd': ['e', 'f', set(['g', 'h', 'l', 'm'])], 'i': set(['j', 'k', 'n', 'o'])}
+    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['a'] == expected
+    # Now finally check that condition expression check knows the equality too.
+    test_table_s.update_item(Key={'p': p},
+        UpdateExpression='SET b = :val1',
+        ConditionExpression='a = :oldval',
+        ExpressionAttributeValues={':val1': 3, ':oldval': expected})
+    assert 'b' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']
+    # Check that equality can also fail, if the inner set differs
+    wrong = {'b': 'c', 'd': ['e', 'f', set(['g', 'h', 'l', 'bad'])], 'i': set(['j', 'k', 'n', 'o'])}
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET b = :val1',
+            ConditionExpression='a = :oldval',
+            ExpressionAttributeValues={':val1': 4, ':oldval': wrong})
+
 # Test for ConditionExpression with operator "<>" (non-equality),
 def test_update_condition_ne(test_table_s):
    p = random_string()
@@ -215,6 +269,54 @@ def test_update_condition_ne(test_table_s):
        ExpressionAttributeValues={':newval': 3, ':oldval': 1})
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['c'] == 3

+# Check that set inequality is checked correctly. This reproduces the same
+# bug #5021 that we reproduced above in test_update_condition_eq_set(), just
+# that here we check the inequality operator instead of equality.
+# Reproduces issue #8513.
+def test_update_condition_ne_set(test_table_s):
+    p = random_string()
+    # Because boto3 sorts the set values we give it, in order to generate a
+    # set with a different order, we need to build it incrementally.
+    test_table_s.update_item(Key={'p': p},
+        AttributeUpdates={'a': {'Value': set(['dog', 'chinchilla']), 'Action': 'PUT'}})
+    test_table_s.update_item(Key={'p': p},
+        UpdateExpression='ADD a :val1',
+        ExpressionAttributeValues={':val1': set(['cat', 'mouse'])})
+    # Sanity check - the attribute contains the set we think it does
+    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['a'] == set(['chinchilla', 'cat', 'dog', 'mouse'])
+    # Now check that condition expression check knows there is no inequality
+    # here.
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET b = :val1',
+            ConditionExpression='a <> :oldval',
+            ExpressionAttributeValues={':val1': 2, ':oldval': set(['chinchilla', 'cat', 'dog', 'mouse'])})
+    # As a sanity check, also check something which should be unequal:
+    test_table_s.update_item(Key={'p': p},
+        UpdateExpression='SET b = :val1',
+        ConditionExpression='a <> :oldval',
+        ExpressionAttributeValues={':val1': 3, ':oldval': set(['chinchilla', 'cat', 'dog', 'horse'])})
+    assert 'b' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']
+
+# In test_update_condition_ne() above we saw that a non-existent attribute is
+# "not equal" to any value. Here we check what happens when two non-existent
+# attributes are checked for non-equality. It turns out, they are also
+# considered "not equal". In short, an unset attribute is always "not equal" to
+# anything - even to another unset attribute.
+# Reproduces issue #8511.
+def test_update_condition_ne_two_unset(test_table_s):
+    p = random_string()
+    test_table_s.update_item(Key={'p': p},
+        UpdateExpression='SET a = :val1',
+        ConditionExpression='q <> z',
+        ExpressionAttributeValues={':val1': 2})
+    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['a'] == 2
+    test_table_s.update_item(Key={'p': p},
+        UpdateExpression='SET a = :val1',
+        ConditionExpression='q <> z',
+        ExpressionAttributeValues={':val1': 3})
+    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['a'] == 3
+
 # Test for ConditionExpression with operator "<"
 def test_update_condition_lt(test_table_s):
    p = random_string()
@@ -316,6 +418,45 @@ def test_update_condition_lt(test_table_s):
            ExpressionAttributeValues={':newval': 2, ':oldval': 1})
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 4

+# In test_update_condition_lt() above we saw that a non-existent attribute is
+# not "<" any value. Here we check what happens when two non-existent
+# attributes are compared with "<". It turns out that the result of such
+# comparison is also false.
+# The same is true for other order operators - any order comparison involving
+# one unset attribute should be false - even if the second operand is an
+# unset attribute as well. Note that the <> operator is different - it is
+# always true if one of the operands is an unset attribute (see
+# test_update_condition_ne_two_unset() above).
+# This test is related to issue #8511 (although it passed even before fixing
+# that issue).
+def test_update_condition_comparison_two_unset(test_table_s):
+    p = random_string()
+    ops = ['<', '<=', '>', '>=']
+    for op in ops:
+        with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+            test_table_s.update_item(Key={'p': p},
+                UpdateExpression='SET a = :val1',
+                ConditionExpression='q ' + op + ' z',
+                ExpressionAttributeValues={':val1': 2})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET a = :val1',
+            ConditionExpression='q between z and x',
+            ExpressionAttributeValues={':val1': 2})
+    test_table_s.update_item(Key={'p': p},
+        AttributeUpdates={'a': {'Value': 1, 'Action': 'PUT'}})
+    for op in ops:
+        with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+            test_table_s.update_item(Key={'p': p},
+                UpdateExpression='SET a = :val1',
+                ConditionExpression='q ' + op + ' z',
+                ExpressionAttributeValues={':val1': 3})
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            UpdateExpression='SET a = :val1',
+            ConditionExpression='q between z and x',
+            ExpressionAttributeValues={':val1': 2})
+
 # Test for ConditionExpression with operator "<="
 def test_update_condition_le(test_table_s):
    p = random_string()
--- a/test/boost/commitlog_test.cc
+++ b/test/boost/commitlog_test.cc
@@ -578,11 +578,14 @@ SEASTAR_TEST_CASE(test_allocation_failure){

            // Use us loads of memory so we can OOM at the appropriate place
            try {
+                assert(fragmented_temporary_buffer::default_fragment_size < size);
                for (;;) {
-                    junk->emplace_back(new char[size]);
+                    junk->emplace_back(new char[fragmented_temporary_buffer::default_fragment_size]);
                }
            } catch (std::bad_alloc&) {
            }
+            auto last = junk->end();
+            junk->erase(--last);
            return log.add_mutation(utils::UUID_gen::get_time_UUID(), size, db::commitlog::force_sync::no, [size](db::commitlog::output& dst) {
                        dst.fill(char(1), size);
                    }).then_wrapped([junk, size](future<db::rp_handle> f) {
--- a/test/boost/database_test.cc
+++ b/test/boost/database_test.cc
@@ -550,3 +550,71 @@ SEASTAR_THREAD_TEST_CASE(read_max_size) {
        }
    }).get();
 }
+
+// Check that mutation queries, those that are stopped when the memory
+// consumed by their results reaches the local/global limit, are aborted
+// instead of silently terminated when this happens.
+SEASTAR_THREAD_TEST_CASE(unpaged_mutation_read_global_limit) {
+    auto cfg = cql_test_config{};
+    cfg.dbcfg.emplace();
+    // The memory available to the result memory limiter (global limit) is
+    // configured based on the available memory, so give a small amount to
+    // the "node", so we don't have to work with large amounts of data.
+    cfg.dbcfg->available_memory = 2 * 1024 * 1024;
+    do_with_cql_env_thread([] (cql_test_env& e) {
+        e.execute_cql("CREATE TABLE test (pk text, ck int, v text, PRIMARY KEY (pk, ck));").get();
+        auto id = e.prepare("INSERT INTO test (pk, ck, v) VALUES (?, ?, ?);").get0();
+
+        auto& db = e.local_db();
+        auto& tab = db.find_column_family("ks", "test");
+        auto s = tab.schema();
+
+        auto pk = make_local_key(s);
+        const auto raw_pk = utf8_type->decompose(data_value(pk));
+        const auto cql3_pk = cql3::raw_value::make_value(raw_pk);
+
+        const auto value = sstring(1024, 'a');
+        const auto raw_value = utf8_type->decompose(data_value(value));
+        const auto cql3_value = cql3::raw_value::make_value(raw_value);
+
+        const int num_rows = 1024;
+        const auto max_size = 1024u * 1024u * 1024u; // 1 GiB, passed to the read command as its max result size
+
+        for (int i = 0; i != num_rows; ++i) {
+            const auto cql3_ck = cql3::raw_value::make_value(int32_type->decompose(data_value(i)));
+            e.execute_prepared(id, {cql3_pk, cql3_ck, cql3_value}).get();
+        }
+
+        const auto partition_ranges = std::vector<dht::partition_range>{query::full_partition_range};
+
+        const std::vector<std::pair<sstring, std::function<future<size_t>(schema_ptr, const query::read_command&)>>> query_methods{
+                {"query_mutations()", [&db, &partition_ranges] (schema_ptr s, const query::read_command& cmd) -> future<size_t> {
+                    return db.query_mutations(s, cmd, partition_ranges.front(), {}, db::no_timeout).then(
+                            [] (const std::tuple<reconcilable_result, cache_temperature>& res) {
+                        return std::get<0>(res).memory_usage();
+                    });
+                }},
+                {"query_mutations_on_all_shards()", [&e, &partition_ranges] (schema_ptr s, const query::read_command& cmd) -> future<size_t> {
+                    return query_mutations_on_all_shards(e.db(), s, cmd, partition_ranges, {}, db::no_timeout).then(
+                            [] (const std::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>& res) {
+                        return std::get<0>(res)->memory_usage();
+                    });
+                }}
+        };
+
+        for (const auto& [query_method_name, query_method] : query_methods) { // bind by reference: no per-iteration copy of the name and std::function
+            testlog.info("checking: query_method={}", query_method_name);
+            auto slice = s->full_slice();
+            slice.options.remove<query::partition_slice::option::allow_short_read>(); // the read must not be allowed to return a short result
+            query::read_command cmd(s->id(), s->version(), slice, query::max_result_size(max_size));
+            try {
+                auto size = query_method(s, cmd).get0();
+                // Just to ensure we are not interpreting empty results as success.
+                BOOST_REQUIRE(size != 0);
+                BOOST_FAIL("Expected exception, but none was thrown.");
+            } catch (const std::runtime_error& ex) { // named `ex` so it does not shadow the cql_test_env parameter `e`
+                testlog.trace("Exception thrown, as expected: {}", ex);
+            }
+        }
+    }, std::move(cfg)).get();
+}
--- a/test/boost/multishard_mutation_query_test.cc
+++ b/test/boost/multishard_mutation_query_test.cc
@@ -974,14 +974,7 @@ SEASTAR_THREAD_TEST_CASE(fuzzy_test) {

        const auto& partitions = pop_desc.partitions;
        smp::invoke_on_all([cfg, db = &env.db(), gs = global_schema_ptr(pop_desc.schema), &partitions] {
-            auto s = gs.get();
-            auto& sem = db->local().get_reader_concurrency_semaphore();
-
-            auto resources = sem.available_resources();
-            resources -= reader_concurrency_semaphore::resources{1, 0};
-            auto permit = sem.make_permit(s.get(), "fuzzy-test");
-
-            return run_fuzzy_test_workload(cfg, *db, std::move(s), partitions).finally([units = permit.consume_resources(resources)] {});
+            return run_fuzzy_test_workload(cfg, *db, gs.get(), partitions);
        }).handle_exception([seed] (std::exception_ptr e) {
            testlog.error("Test workload failed with exception {}."
                    " To repeat this particular run, replace the random seed of the test, with that of this run ({})."
--- a/test/boost/mutation_reader_test.cc
+++ b/test/boost/mutation_reader_test.cc
@@ -894,6 +894,232 @@ sstables::shared_sstable create_sstable(sstables::test_env& env, simple_schema&
        , mutations);
 }

+namespace {
+
+class generic_inactive_read : public reader_concurrency_semaphore::inactive_read {
+    flat_mutation_reader_opt _reader; // cleared by evict(), moved out by get_reader()
+    // Test-only inactive_read that merely holds a reader; use make()/get_reader().
+private: // the reader-taking constructor is reachable only through make()
+    explicit generic_inactive_read(flat_mutation_reader&& rd) : _reader(std::move(rd)) { }
+    // Eviction just drops the held reader.
+    virtual void evict() override {
+        _reader = {};
+    }
+    // make() builds a temporary with the private constructor and hands it to make_unique.
+public:
+    static std::unique_ptr<inactive_read> make(flat_mutation_reader&& rd) {
+        return std::make_unique<generic_inactive_read>(generic_inactive_read(std::move(rd)));
+    }
+    // get_reader() moves the held reader (possibly already cleared by evict()) out of `ir`.
+    static flat_mutation_reader_opt get_reader(std::unique_ptr<inactive_read>&& ir) {
+        if (!ir) { // no inactive read was handed back: nothing to return
+            return {};
+        }
+        auto gir = dynamic_cast<generic_inactive_read*>(ir.get());
+        BOOST_REQUIRE(gir); // `ir` must actually be a generic_inactive_read
+        return std::move(gir->_reader);
+    }
+};
+
+} // anonymous namespace
+
+// This unit test passes a read through admission again-and-again, just
+// like an evictable reader would be during its lifetime. When readmitted
+// the read sometimes has to wait and sometimes not. This is to check that
+// readmitting a previously admitted reader doesn't leak any units.
+SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_readmission_preserves_units) {
+    simple_schema s;
+    const auto initial_resources = reader_concurrency_semaphore::resources{10, 1024 * 1024};
+    reader_concurrency_semaphore semaphore(initial_resources.count, initial_resources.memory, get_name());
+
+    auto permit = semaphore.make_permit(s.schema().get(), get_name());
+
+    std::optional<reader_permit::resource_units> residue_units;
+
+    for (int i = 0; i < 10; ++i) {
+        const auto have_residue_units = bool(residue_units);
+
+        auto current_resources = initial_resources;
+        if (have_residue_units) {
+            current_resources -= residue_units->resources();
+        }
+        BOOST_REQUIRE(semaphore.available_resources() == current_resources);
+
+        std::optional<reader_permit::resource_units> admitted_units;
+        if (i % 2) {
+            const auto consumed_resources = semaphore.available_resources();
+            semaphore.consume(consumed_resources);
+
+            auto units_fut = permit.wait_admission(1024, db::no_timeout);
+            BOOST_REQUIRE(!units_fut.available());
+
+            semaphore.signal(consumed_resources);
+            admitted_units = units_fut.get();
+        } else {
+            admitted_units = permit.wait_admission(1024, db::no_timeout).get();
+        }
+
+        current_resources -= admitted_units->resources();
+        BOOST_REQUIRE(semaphore.available_resources() == current_resources);
+
+        residue_units.emplace(permit.consume_resources(reader_resources(0, 100)));
+        if (!have_residue_units) {
+            current_resources -= residue_units->resources();
+        }
+        BOOST_REQUIRE(semaphore.available_resources() == current_resources);
+
+        auto handle = semaphore.register_inactive_read(generic_inactive_read::make(make_empty_flat_reader(s.schema(), permit)));
+        (void)handle;
+        BOOST_REQUIRE(semaphore.try_evict_one_inactive_read());
+    }
+
+    BOOST_REQUIRE(semaphore.available_resources() == initial_resources - residue_units->resources());
+
+    residue_units.reset();
+
+    BOOST_REQUIRE(semaphore.available_resources() == initial_resources);
+}
+
+// This unit test checks that the semaphore doesn't get into a deadlock
+// when contended, in the presence of many memory-only reads (that don't
+// wait for admission). This is tested by simulating the 3 kinds of reads we
+// currently have in the system:
+// * memory-only: reads that don't pass admission and only own memory.
+// * admitted: reads that pass admission.
+// * evictable: admitted reads that are furthermore evictable.
+//
+// The test creates and runs a large number of these reads in parallel,
+// read kinds being selected randomly, then creates a watchdog which
+// kills the test if no progress is being made.
+SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_forward_progress) {
+    class reader {
+        class skeleton_reader : public flat_mutation_reader::impl {
+            reader_permit::resource_units _base_resources;
+            std::optional<reader_permit::resource_units> _resources;
+        public:
+            skeleton_reader(schema_ptr s, reader_permit permit, reader_permit::resource_units res)
+                : impl(std::move(s), std::move(permit)), _base_resources(std::move(res)) { }
+            virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override {
+                _resources.emplace(_permit.consume_resources(reader_resources(0, tests::random::get_int(1024, 2048))));
+                return make_ready_future<>();
+            }
+            virtual void next_partition() override { }
+            virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override { return make_ready_future<>(); }
+            virtual future<> fast_forward_to(position_range, db::timeout_clock::time_point timeout) override { return make_ready_future<>(); }
+        };
+        struct reader_visitor {
+            reader& r;
+            future<> operator()(std::monostate& ms) { return r.tick(ms); }
+            future<> operator()(flat_mutation_reader& reader) { return r.tick(reader); }
+            future<> operator()(reader_concurrency_semaphore::inactive_read_handle& handle) { return r.tick(handle); }
+        };
+
+    private:
+        schema_ptr _schema;
+        reader_permit _permit;
+        bool _memory_only = true;
+        bool _evictable = false;
+        std::optional<reader_permit::resource_units> _units;
+        std::variant<std::monostate, flat_mutation_reader, reader_concurrency_semaphore::inactive_read_handle> _reader;
+
+    private:
+        future<> make_reader() {
+          return async([this] {
+            auto res = _permit.consume_memory();
+            if (!_memory_only) {
+                res = _permit.wait_admission(1024, db::no_timeout).get0();
+            }
+            _reader = make_flat_mutation_reader<skeleton_reader>(_schema, _permit, std::move(res));
+          });
+        }
+        future<> tick(std::monostate&) {
+          return async([this] {
+            make_reader().get();
+            tick(std::get<flat_mutation_reader>(_reader)).get();
+          });
+        }
+        future<> tick(flat_mutation_reader& reader) {
+          return async([this, &reader] {
+            reader.fill_buffer(db::no_timeout).get();
+            if (_evictable) {
+                _reader = _permit.semaphore().register_inactive_read(generic_inactive_read::make(std::move(reader)));
+            }
+          });
+        }
+        future<> tick(reader_concurrency_semaphore::inactive_read_handle& handle) {
+          return async([this, &handle] () mutable {
+            if (auto reader = generic_inactive_read::get_reader(_permit.semaphore().unregister_inactive_read(std::move(handle))); reader) {
+                _reader = std::move(*reader);
+            } else {
+                make_reader().get();
+            }
+            tick(std::get<flat_mutation_reader>(_reader)).get();
+          });
+        }
+
+    public:
+        reader(schema_ptr s, reader_permit permit, bool memory_only, bool evictable)
+            : _schema(std::move(s))
+            , _permit(std::move(permit))
+            , _memory_only(memory_only)
+            , _evictable(evictable)
+            , _units(_permit.consume_memory(tests::random::get_int(128, 1024)))
+        {
+        }
+        future<> tick() {
+            return std::visit(reader_visitor{*this}, _reader);
+        }
+    };
+
+    const auto count = 10;
+    const auto num_readers = 512;
+    const auto ticks = 1000;
+
+    simple_schema s;
+    reader_concurrency_semaphore semaphore(count, count * 1024, get_name());
+
+    std::list<std::optional<reader>> readers;
+    unsigned nr_memory_only = 0;
+    unsigned nr_admitted = 0;
+    unsigned nr_evictable = 0;
+
+    for (auto i = 0; i <  num_readers; ++i) {
+        const auto memory_only = tests::random::get_bool();
+        const auto evictable = !memory_only && tests::random::get_bool();
+        if (memory_only) {
+            ++nr_memory_only;
+        } else if (evictable) {
+            ++nr_evictable;
+        } else {
+            ++nr_admitted;
+        }
+        readers.emplace_back(reader(s.schema(), semaphore.make_permit(s.schema().get(), fmt::format("reader{}", i)), memory_only, evictable));
+    }
+
+    testlog.info("Created {} readers, memory_only={}, admitted={}, evictable={}", readers.size(), nr_memory_only, nr_admitted, nr_evictable);
+
+    bool watchdog_touched = false;
+    auto watchdog = timer<db::timeout_clock>([&semaphore, &watchdog_touched] {
+        if (!watchdog_touched) {
+            testlog.error("Watchdog detected a deadlock, dumping diagnostics before killing the test: {}", semaphore.dump_diagnostics());
+            semaphore.broken(std::make_exception_ptr(std::runtime_error("test killed by watchdog")));
+        }
+        watchdog_touched = false;
+    });
+    watchdog.arm_periodic(std::chrono::seconds(30));
+
+    parallel_for_each(readers, [&] (std::optional<reader>& r) -> future<> {
+      return async([this, &watchdog_touched, &r] {
+        for (auto i = 0; i < ticks; ++i) {
+            watchdog_touched = true;
+            r->tick().get();
+        }
+        r.reset();
+        watchdog_touched = true;
+      });
+    }).get();
+}
+
 static
 sstables::shared_sstable create_sstable(sstables::test_env& env, schema_ptr s, std::vector<mutation> mutations) {
    static thread_local auto tmp = tmpdir();
@@ -3041,39 +3267,30 @@ flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(
        reader_permit permit,
        const dht::partition_range& prange,
        const query::partition_slice& slice,
-        std::deque<mutation_fragment> first_buffer,
-        position_in_partition_view last_fragment_position,
-        std::deque<mutation_fragment> second_buffer,
-        size_t max_buffer_size) {
+        std::list<std::deque<mutation_fragment>> buffers,
+        position_in_partition_view first_buf_last_fragment_position,
+        size_t max_buffer_size,
+        bool detach_buffer = true) {
    class factory {
        schema_ptr _schema;
        reader_permit _permit;
-        std::optional<std::deque<mutation_fragment>> _first_buffer;
-        std::optional<std::deque<mutation_fragment>> _second_buffer;
+        std::list<std::deque<mutation_fragment>> _buffers;
        size_t _max_buffer_size;

-    private:
-        std::optional<std::deque<mutation_fragment>> copy_buffer(const std::optional<std::deque<mutation_fragment>>& o) {
-            if (!o) {
-                return {};
-            }
-            return copy_fragments(*_schema, _permit, *o);
-        }
-
    public:
-        factory(schema_ptr schema, reader_permit permit, std::deque<mutation_fragment> first_buffer, std::deque<mutation_fragment> second_buffer, size_t max_buffer_size)
+        factory(schema_ptr schema, reader_permit permit, std::list<std::deque<mutation_fragment>> buffers, size_t max_buffer_size)
            : _schema(std::move(schema))
            , _permit(std::move(permit))
-            , _first_buffer(std::move(first_buffer))
-            , _second_buffer(std::move(second_buffer))
+            , _buffers(std::move(buffers))
            , _max_buffer_size(max_buffer_size) {
        }

        factory(const factory& o)
            : _schema(o._schema)
-            , _permit(o._permit)
-            , _first_buffer(copy_buffer(o._first_buffer))
-            , _second_buffer(copy_buffer(o._second_buffer)) {
+            , _permit(o._permit) {
+            for (const auto& buf : o._buffers) {
+                _buffers.emplace_back(copy_fragments(*_schema, _permit, buf));
+            }
        }
        factory(factory&& o) = default;

@@ -3087,14 +3304,9 @@ flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(
                streamed_mutation::forwarding fwd_sm,
                mutation_reader::forwarding fwd_mr) {
            BOOST_REQUIRE(s == _schema);
-            if (_first_buffer) {
-                auto buf = *std::exchange(_first_buffer, {});
-                auto rd = make_flat_mutation_reader_from_fragments(_schema, std::move(permit), std::move(buf));
-                rd.set_max_buffer_size(_max_buffer_size);
-                return rd;
-            }
-            if (_second_buffer) {
-                auto buf = *std::exchange(_second_buffer, {});
+            if (!_buffers.empty()) {
+                auto buf = std::move(_buffers.front());
+                _buffers.pop_front();
                auto rd = make_flat_mutation_reader_from_fragments(_schema, std::move(permit), std::move(buf));
                rd.set_max_buffer_size(_max_buffer_size);
                return rd;
@@ -3102,9 +3314,9 @@ flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(
            return make_empty_flat_reader(_schema, std::move(permit));
        }
    };
-    auto ms = mutation_source(factory(schema, permit, std::move(first_buffer), std::move(second_buffer), max_buffer_size));
+    auto ms = mutation_source(factory(schema, permit, std::move(buffers), max_buffer_size));

-    auto [rd, handle] = make_manually_paused_evictable_reader(
+    auto rd = make_auto_paused_evictable_reader(
            std::move(ms),
            schema,
            permit,
@@ -3120,18 +3332,42 @@ flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(

    const auto eq_cmp = position_in_partition::equal_compare(*schema);
    BOOST_REQUIRE(rd.is_buffer_full());
-    BOOST_REQUIRE(eq_cmp(rd.buffer().back().position(), last_fragment_position));
+    BOOST_REQUIRE(eq_cmp(rd.buffer().back().position(), first_buf_last_fragment_position));
    BOOST_REQUIRE(!rd.is_end_of_stream());

-    rd.detach_buffer();
-
-    handle.pause();
+    if (detach_buffer) {
+        rd.detach_buffer();
+    }

    while(permit.semaphore().try_evict_one_inactive_read());

    return std::move(rd);
 }

+// Two-buffer convenience overload: packs first_buffer and last_buffer, in
+// that order, into a list and forwards to the list-taking overload.
+flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(
+        schema_ptr schema,
+        reader_permit permit,
+        const dht::partition_range& prange,
+        const query::partition_slice& slice,
+        std::deque<mutation_fragment> first_buffer,
+        position_in_partition_view last_fragment_position,
+        std::deque<mutation_fragment> last_buffer,
+        size_t max_buffer_size,
+        bool detach_buffer = true) {
+    std::list<std::deque<mutation_fragment>> buffers;
+    buffers.emplace_back(std::move(first_buffer));
+    buffers.emplace_back(std::move(last_buffer));
+    return create_evictable_reader_and_evict_after_first_buffer(
+            std::move(schema), std::move(permit),
+            prange, slice,
+            std::move(buffers),
+            last_fragment_position,
+            max_buffer_size,
+            detach_buffer);
+}
+
 }

 SEASTAR_THREAD_TEST_CASE(test_evictable_reader_trim_range_tombstones) {
@@ -3433,7 +3669,7 @@ SEASTAR_THREAD_TEST_CASE(test_evictable_reader_self_validation) {

    check_evictable_reader_validation_is_triggered(
            "pkey > _last_pkey; pkey ∈ pkrange",
-            partition_error_prefix,
+            "",
            s.schema(),
            permit,
            prange,
@@ -3521,3 +3757,208 @@ SEASTAR_THREAD_TEST_CASE(test_evictable_reader_self_validation) {
            make_second_buffer(pkeys[3]),
            max_buffer_size);
 }
+
+SEASTAR_THREAD_TEST_CASE(test_evictable_reader_drop_flags) {
+    reader_concurrency_semaphore semaphore(1, 0, get_name());
+    simple_schema s;
+    auto permit = semaphore.make_permit(s.schema().get(), get_name());
+
+    auto pkeys = s.make_pkeys(2);
+    std::sort(pkeys.begin(), pkeys.end(), [&s] (const auto& pk1, const auto& pk2) {
+        return pk1.less_compare(*s.schema(), pk2);
+    });
+    const auto& pkey1 = pkeys[0];
+    const auto& pkey2 = pkeys[1];
+    const int second_buffer_ck = 10;
+
+    struct buffer { // accumulates fragments in `frags` and the mutations built from them in `muts`
+        simple_schema& s;
+        reader_permit permit;
+        std::deque<mutation_fragment> frags;
+        std::vector<mutation> muts;
+        size_t size = 0; // running total of memory_usage() of the fragments added to `frags`
+        std::optional<position_in_partition_view> last_pos; // set by save_position()/find_position()
+
+        buffer(simple_schema& s_, reader_permit permit_, dht::decorated_key key)
+            : s(s_), permit(std::move(permit_)) {
+            add_partition(key);
+        }
+        size_t add_partition(dht::decorated_key key) { // appends a partition start and opens a new mutation
+            size += frags.emplace_back(*s.schema(), permit, partition_start{key, {}}).memory_usage();
+            muts.emplace_back(s.schema(), key);
+            return size;
+        }
+        size_t add_mutation_fragment(mutation_fragment&& mf, bool only_to_frags = false) {
+            if (!only_to_frags) { // only_to_frags: fragment goes into the stream but not into the last mutation
+                muts.back().apply(mf);
+            }
+            size += frags.emplace_back(*s.schema(), permit, std::move(mf)).memory_usage();
+            return size;
+        }
+        size_t add_static_row(std::optional<mutation_fragment> sr = {}) { // uses `sr` if given, else a freshly made static row
+            auto srow = sr ? std::move(*sr) : s.make_static_row("s");
+            return add_mutation_fragment(std::move(srow));
+        }
+        size_t add_clustering_row(int i, bool only_to_frags = false) {
+            return add_mutation_fragment(mutation_fragment(*s.schema(), permit, s.make_row(s.make_ckey(i), "v")), only_to_frags);
+        }
+        size_t add_clustering_rows(int start, int end) { // adds rows for keys in [start, end)
+            for (int i = start; i < end; ++i) {
+                add_clustering_row(i);
+            }
+            return size;
+        }
+        size_t add_partition_end() { // appended to `frags` only; `muts` is not touched
+            size += frags.emplace_back(*s.schema(), permit, partition_end{}).memory_usage();
+            return size;
+        }
+        void save_position() { last_pos = frags.back().position(); }
+        void find_position(size_t buf_size) { // last_pos := position of the first fragment at which the cumulative size reaches buf_size
+            size_t accumulated = 0; // deliberately not named `s`, which would shadow the schema member
+            for (const auto& frag : frags) {
+                accumulated += frag.memory_usage();
+                if (accumulated >= buf_size) {
+                    last_pos = frag.position();
+                    break;
+                }
+            }
+            BOOST_REQUIRE(last_pos);
+        }
+    };
+
+    auto make_reader = [&] (const buffer& first_buffer, const buffer& second_buffer, const buffer* const third_buffer, size_t max_buffer_size) {
+        std::list<std::deque<mutation_fragment>> buffers;
+        buffers.emplace_back(copy_fragments(*s.schema(), permit, first_buffer.frags));
+        buffers.emplace_back(copy_fragments(*s.schema(), permit, second_buffer.frags));
+        if (third_buffer) {
+            buffers.emplace_back(copy_fragments(*s.schema(), permit, third_buffer->frags));
+        }
+        return create_evictable_reader_and_evict_after_first_buffer(
+                s.schema(),
+                permit,
+                query::full_partition_range,
+                s.schema()->full_slice(),
+                std::move(buffers),
+                *first_buffer.last_pos,
+                max_buffer_size,
+                false);
+    };
+
+    testlog.info("Same partition, with static row");
+    {
+        buffer first_buffer(s, permit, pkey1);
+        first_buffer.add_static_row();
+        auto srow = mutation_fragment(*s.schema(), permit, first_buffer.frags.back());
+        const auto buf_size = first_buffer.add_clustering_rows(0, second_buffer_ck);
+        first_buffer.save_position();
+        first_buffer.add_clustering_row(second_buffer_ck);
+
+        buffer second_buffer(s, permit, pkey1);
+        second_buffer.add_static_row(std::move(srow));
+        second_buffer.add_clustering_row(second_buffer_ck);
+        second_buffer.add_clustering_row(second_buffer_ck + 1);
+        second_buffer.add_partition_end();
+
+        assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
+            .has_monotonic_positions();
+
+        assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
+            .produces(first_buffer.muts[0] + second_buffer.muts[0])
+            .produces_end_of_stream();
+    }
+
+    testlog.info("Same partition, no static row");
+    {
+        buffer first_buffer(s, permit, pkey1);
+        const auto buf_size = first_buffer.add_clustering_rows(0, second_buffer_ck);
+        first_buffer.save_position();
+        first_buffer.add_clustering_row(second_buffer_ck);
+
+        buffer second_buffer(s, permit, pkey1);
+        second_buffer.add_clustering_row(second_buffer_ck);
+        second_buffer.add_clustering_row(second_buffer_ck + 1);
+        second_buffer.add_partition_end();
+
+        assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
+            .has_monotonic_positions();
+
+        assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
+            .produces(first_buffer.muts[0] + second_buffer.muts[0])
+            .produces_end_of_stream();
+    }
+
+    testlog.info("Same partition as expected, no static row, next partition has static row (#8923)");
+    {
+        buffer second_buffer(s, permit, pkey1);
+        second_buffer.add_clustering_rows(second_buffer_ck, second_buffer_ck + second_buffer_ck / 2);
+        // We want to end the buffer on the partition-start below, but since a
+        // partition start will be dropped from it, we have to use the size
+        // without it.
+        const auto buf_size = second_buffer.add_partition_end();
+        second_buffer.add_partition(pkey2);
+        second_buffer.add_static_row();
+        auto srow = mutation_fragment(*s.schema(), permit, second_buffer.frags.back());
+        second_buffer.add_clustering_rows(0, 2);
+
+        buffer first_buffer(s, permit, pkey1);
+        for (int i = 0; first_buffer.add_clustering_row(i) < buf_size; ++i);
+        first_buffer.save_position();
+        first_buffer.add_mutation_fragment(mutation_fragment(*s.schema(), permit, second_buffer.frags[1]));
+
+        buffer third_buffer(s, permit, pkey2);
+        third_buffer.add_static_row(std::move(srow));
+        third_buffer.add_clustering_rows(0, 2);
+        third_buffer.add_partition_end();
+
+        first_buffer.find_position(buf_size);
+
+        assert_that(make_reader(first_buffer, second_buffer, &third_buffer, buf_size))
+            .has_monotonic_positions();
+
+        assert_that(make_reader(first_buffer, second_buffer, &third_buffer, buf_size))
+            .produces(first_buffer.muts[0] + second_buffer.muts[0])
+            .produces(second_buffer.muts[1] + third_buffer.muts[0])
+            .produces_end_of_stream();
+    }
+
+    testlog.info("Next partition, with no static row");
+    {
+        buffer first_buffer(s, permit, pkey1);
+        const auto buf_size = first_buffer.add_clustering_rows(0, second_buffer_ck);
+        first_buffer.save_position();
+        first_buffer.add_clustering_row(second_buffer_ck + 1, true);
+
+        buffer second_buffer(s, permit, pkey2);
+        second_buffer.add_clustering_rows(0, second_buffer_ck / 2);
+        second_buffer.add_partition_end();
+
+        assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
+            .has_monotonic_positions();
+
+        assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
+            .produces(first_buffer.muts[0])
+            .produces(second_buffer.muts[0])
+            .produces_end_of_stream();
+    }
+
+    testlog.info("Next partition, with static row");
+    {
+        buffer first_buffer(s, permit, pkey1);
+        const auto buf_size = first_buffer.add_clustering_rows(0, second_buffer_ck);
+        first_buffer.save_position();
+        first_buffer.add_clustering_row(second_buffer_ck + 1, true);
+
+        buffer second_buffer(s, permit, pkey2);
+        second_buffer.add_static_row();
+        second_buffer.add_clustering_rows(0, second_buffer_ck / 2);
+        second_buffer.add_partition_end();
+
+        assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
+            .has_monotonic_positions();
+
+        assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
+            .produces(first_buffer.muts[0])
+            .produces(second_buffer.muts[0])
+            .produces_end_of_stream();
+    }
+}
--- a/test/boost/mutation_test.cc
+++ b/test/boost/mutation_test.cc
@@ -1805,12 +1805,16 @@ SEASTAR_TEST_CASE(test_mutation_diff_with_random_generator) {
                BOOST_FAIL(format("Partitions don't match, got: {}\n...and: {}", mutation_partition::printer(s, mp1), mutation_partition::printer(s, mp2)));
            }
        };
-        for_each_mutation_pair([&] (auto&& m1, auto&& m2, are_equal eq) {
+        const auto now = gc_clock::now();
+        can_gc_fn never_gc = [] (tombstone) { return false; };
+        for_each_mutation_pair([&] (auto m1, auto m2, are_equal eq) {
            mutation_application_stats app_stats;
            auto s = m1.schema();
            if (s != m2.schema()) {
                return;
            }
+            m1.partition().compact_for_compaction(*s, never_gc, now);
+            m2.partition().compact_for_compaction(*s, never_gc, now);
            auto m12 = m1;
            m12.apply(m2);
            auto m12_with_diff = m1;
--- a/test/boost/mutation_writer_test.cc
+++ b/test/boost/mutation_writer_test.cc
@@ -166,7 +166,7 @@ SEASTAR_TEST_CASE(test_multishard_writer_producer_aborts) {

 namespace {

-class bucket_writer {
+class test_bucket_writer {
    schema_ptr _schema;
    classify_by_timestamp _classify;
    std::unordered_map<int64_t, std::vector<mutation>>& _buckets;
@@ -175,6 +175,17 @@ class bucket_writer {
    mutation_opt _current_mutation;
    bool _is_first_mutation = true;

+    size_t _throw_after;
+    size_t _mutation_consumed = 0;
+
+public:
+    // Marker exception type that maybe_throw() raises to simulate a consumer failure.
+    struct expected_exception : std::exception {
+        const char* what() const noexcept override {
+            return "expected_exception";
+        }
+    };
+
 private:
    void check_timestamp(api::timestamp_type ts) {
        const auto bucket_id = _classify(ts);
@@ -223,40 +234,53 @@ private:
        check_timestamp(rt.tomb.timestamp);
    }

+    // Counts every call; once `_throw_after` calls have already been counted,
+    // this and every later call throws expected_exception.
+    void maybe_throw() {
+        const auto calls_before_this_one = _mutation_consumed++;
+        if (calls_before_this_one >= _throw_after) {
+            throw expected_exception();
+        }
+    }
+
 public:
-    bucket_writer(schema_ptr schema, classify_by_timestamp classify, std::unordered_map<int64_t, std::vector<mutation>>& buckets)
+    test_bucket_writer(schema_ptr schema, classify_by_timestamp classify, std::unordered_map<int64_t, std::vector<mutation>>& buckets, size_t throw_after = std::numeric_limits<size_t>::max())
        : _schema(std::move(schema))
        , _classify(std::move(classify))
-        , _buckets(buckets) {
-    }
+        , _buckets(buckets)
+        , _throw_after(throw_after)
+    { }
    void consume_new_partition(const dht::decorated_key& dk) {
+        maybe_throw();
        BOOST_REQUIRE(!_current_mutation);
        _current_mutation = mutation(_schema, dk);
    }
    void consume(tombstone partition_tombstone) {
+        maybe_throw();
        BOOST_REQUIRE(_current_mutation);
        verify_partition_tombstone(partition_tombstone);
        _current_mutation->partition().apply(partition_tombstone);
    }
    stop_iteration consume(static_row&& sr) {
+        maybe_throw();
        BOOST_REQUIRE(_current_mutation);
        verify_static_row(sr);
        _current_mutation->apply(mutation_fragment(*_schema, tests::make_permit(), std::move(sr)));
        return stop_iteration::no;
    }
    stop_iteration consume(clustering_row&& cr) {
+        maybe_throw();
        BOOST_REQUIRE(_current_mutation);
        verify_clustering_row(cr);
        _current_mutation->apply(mutation_fragment(*_schema, tests::make_permit(), std::move(cr)));
        return stop_iteration::no;
    }
    stop_iteration consume(range_tombstone&& rt) {
+        maybe_throw();
        BOOST_REQUIRE(_current_mutation);
        verify_range_tombstone(rt);
        _current_mutation->apply(mutation_fragment(*_schema, tests::make_permit(), std::move(rt)));
        return stop_iteration::no;
    }
    stop_iteration consume_end_of_partition() {
+        maybe_throw();
        BOOST_REQUIRE(_current_mutation);
        BOOST_REQUIRE(_bucket_id);
        auto& bucket = _buckets[*_bucket_id];
@@ -311,7 +335,7 @@ SEASTAR_THREAD_TEST_CASE(test_timestamp_based_splitting_mutation_writer) {

    auto consumer = [&] (flat_mutation_reader bucket_reader) {
        return do_with(std::move(bucket_reader), [&] (flat_mutation_reader& rd) {
-            return rd.consume(bucket_writer(random_schema.schema(), classify_fn, buckets), db::no_timeout);
+            return rd.consume(test_bucket_writer(random_schema.schema(), classify_fn, buckets), db::no_timeout);
        });
    };

@@ -342,3 +366,53 @@ SEASTAR_THREAD_TEST_CASE(test_timestamp_based_splitting_mutation_writer) {
    }

 }
+
+// Runs segregate_by_timestamp() over random mutations with a bucket consumer
+// that starts throwing after a randomly chosen number of consume calls.
+// The injected exception and seastar::broken_promise are tolerated; any other
+// exception propagates out of the test body.
+SEASTAR_THREAD_TEST_CASE(test_timestamp_based_splitting_mutation_writer_abort) {
+    auto spec = tests::make_random_schema_specification(
+            get_name(),
+            std::uniform_int_distribution<size_t>(1, 4),
+            std::uniform_int_distribution<size_t>(2, 4),
+            std::uniform_int_distribution<size_t>(2, 8),
+            std::uniform_int_distribution<size_t>(2, 8));
+    auto schema_gen = tests::random_schema{tests::random::get_int<uint32_t>(), *spec};
+
+    testlog.info("Random schema:\n{}", schema_gen.cql());
+
+    // For tombstone and row-marker destinations a non-zero draw from
+    // get_int(0, 10) yields api::missing_timestamp; everything else is
+    // delegated to the default timestamp generator.
+    auto ts_gen = [&, underlying = tests::default_timestamp_generator()] (std::mt19937& engine,
+            tests::timestamp_destination ts_dest, api::timestamp_type min_timestamp) -> api::timestamp_type {
+        using dest = tests::timestamp_destination;
+        const bool optional_timestamp = ts_dest == dest::partition_tombstone
+                || ts_dest == dest::row_marker
+                || ts_dest == dest::row_tombstone
+                || ts_dest == dest::collection_tombstone;
+        if (optional_timestamp && tests::random::get_int<int>(0, 10, engine)) {
+            return api::missing_timestamp;
+        }
+        return underlying(engine, ts_dest, min_timestamp);
+    };
+
+    auto mutations = tests::generate_random_mutations(schema_gen, ts_gen).get0();
+
+    // Two buckets, selected by timestamp parity.
+    auto classify_fn = [] (api::timestamp_type ts) {
+        return static_cast<int64_t>(ts % 2);
+    };
+
+    std::unordered_map<int64_t, std::vector<mutation>> buckets;
+
+    int throw_after = tests::random::get_int(mutations.size() - 1);
+    testlog.info("Will raise exception after {}/{} mutations", throw_after, mutations.size());
+    auto consumer = [&] (flat_mutation_reader bucket_reader) {
+        return do_with(std::move(bucket_reader), [&] (flat_mutation_reader& reader) {
+            return reader.consume(test_bucket_writer(schema_gen.schema(), classify_fn, buckets, throw_after), db::no_timeout);
+        });
+    };
+
+    try {
+        segregate_by_timestamp(flat_mutation_reader_from_mutations(tests::make_permit(), mutations), classify_fn, std::move(consumer)).get();
+    } catch (const test_bucket_writer::expected_exception&) {
+        BOOST_TEST_PASSPOINT();
+    } catch (const seastar::broken_promise&) {
+        // Tolerated until we properly abort readers
+        BOOST_TEST_PASSPOINT();
+    }
+}
--- a/test/boost/querier_cache_test.cc
+++ b/test/boost/querier_cache_test.cc
@@ -712,7 +712,10 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) {
                nullptr,
                db::no_timeout).get();

-        BOOST_CHECK_EQUAL(db.get_querier_cache_stats().resource_based_evictions, 1);
+        // The second read might be evicted too if it consumes more
+        // memory than the first and hence triggers memory control when
+        // saved in the querier cache.
+        BOOST_CHECK_GE(db.get_querier_cache_stats().resource_based_evictions, 1);

        // We want to read the entire partition so that the querier
        // is not saved at the end and thus ensure it is destroyed.
--- a/test/boost/restrictions_test.cc
+++ b/test/boost/restrictions_test.cc
@@ -674,6 +674,18 @@ SEASTAR_THREAD_TEST_CASE(scalar_in) {
        require_rows(e, stmt, {}, {LF({24.f, 25.f})}, {{F(24)}, {F(24)}, {F(25)}});
        require_rows(e, stmt, {}, {LF({25.f, data_value::make_null(float_type)})}, {{F(25)}});
        require_rows(e, stmt, {}, {LF({99.f, data_value::make_null(float_type)})}, {});
+
+        const auto in_null = [&] (const char* column) {
+            return e.execute_prepared(
+                    e.prepare(format("select * from t where {} in ? allow filtering", column)).get0(),
+                    {cql3::raw_value::make_null()})
+                    .get();
+        };
+        using ire = exceptions::invalid_request_exception;
+        using exception_predicate::message_contains;
+        BOOST_REQUIRE_EXCEPTION(in_null("p"), ire, message_contains("null value"));
+        BOOST_REQUIRE_EXCEPTION(in_null("c"), ire, message_contains("null value"));
+        BOOST_REQUIRE_EXCEPTION(in_null("r"), ire, message_contains("null value"));
    }).get();
 }

@@ -778,6 +790,10 @@ SEASTAR_THREAD_TEST_CASE(multi_col_in) {
        require_rows(e, stmt, {}, {bound_tuples({{13, 13}, {12, 22}})}, {{I(12), F(22)}});
        require_rows(e, stmt, {}, {bound_tuples({{12, 21}})}, {});
        require_rows(e, stmt, {}, {bound_tuples({{12, 21}, {12, 21}, {13, 21}, {14, 21}})}, {});
+        BOOST_REQUIRE_EXCEPTION(
+                e.execute_prepared(stmt, {cql3::raw_value::make_null()}).get(),
+                exceptions::invalid_request_exception,
+                exception_predicate::message_equals("Invalid null value for IN restriction"));
        stmt = e.prepare("select ck1 from t where (ck1,ck2) in (?) allow filtering").get0();
        auto tpl = [] (int32_t e1, float e2) {
            return make_tuple({int32_type, float_type}, {e1, e2});
--- a/test/boost/row_cache_test.cc
+++ b/test/boost/row_cache_test.cc
@@ -910,8 +910,20 @@ SEASTAR_TEST_CASE(test_eviction_from_invalidated) {

        std::vector<sstring> tmp;
        auto alloc_size = logalloc::segment_size * 10;
-        while (tracker.region().occupancy().total_space() > alloc_size) {
-            tmp.push_back(uninitialized_string(alloc_size));
+        /*
+         * Now allocate huge chunks on the region until it gives up
+         * with bad_alloc. At that point the region must not have more
+         * memory than the chunk size, nor may it contain rows
+         * or partitions (except for dummy entries).
+         */
+        try {
+            while (true) {
+                tmp.push_back(uninitialized_string(alloc_size));
+            }
+        } catch (const std::bad_alloc&) {
+            BOOST_REQUIRE(tracker.region().occupancy().total_space() < alloc_size);
+            BOOST_REQUIRE(tracker.get_stats().partitions == 0);
+            BOOST_REQUIRE(tracker.get_stats().rows == 0);
        }
    });
 }
--- a/test/boost/sstable_datafile_test.cc
+++ b/test/boost/sstable_datafile_test.cc
@@ -6685,3 +6685,135 @@ SEASTAR_TEST_CASE(test_zero_estimated_partitions) {
        return make_ready_future<>();
    });
 }
+
+// Creates 100 tables, each holding one sstable with a single expiring cell,
+// jumps the clocks forward, triggers compaction on every table and then polls
+// the compaction manager until all sstables are gone. The number of active
+// compaction tasks sampled while polling must peak at exactly 1, and the
+// manager must report no errors.
+SEASTAR_TEST_CASE(max_ongoing_compaction_test) {
+    return test_env::do_with_async([] (test_env& env) {
+        // Precondition: the test requires exactly one shard.
+        BOOST_REQUIRE(smp::count == 1);
+
+        // Schema factory for table "tests".<idx>: time-window compaction with
+        // 1-hour windows, expired-sstable check frequency 0, gc_grace_seconds 0.
+        auto make_schema = [] (auto idx) {
+            auto builder = schema_builder("tests", std::to_string(idx))
+                .with_column("id", utf8_type, column_kind::partition_key)
+                .with_column("cl", int32_type, column_kind::clustering_key)
+                .with_column("value", int32_type);
+            builder.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
+            std::map <sstring, sstring> opts = {
+                {time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY,                  "HOURS"},
+                {time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY,                  "1"},
+                {time_window_compaction_strategy_options::EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY, "0"},
+            };
+            builder.set_compaction_strategy_options(std::move(opts));
+            builder.set_gc_grace_seconds(0);
+            return builder.build();
+        };
+
+        // One compaction manager shared by every table; stopped on scope exit.
+        auto cm = make_lw_shared<compaction_manager>();
+        cm->enable();
+        auto stop_cm = defer([&cm] {
+            cm->stop().get();
+        });
+
+        auto tmp = tmpdir();
+        auto cl_stats = make_lw_shared<cell_locker_stats>();
+        auto tracker = make_lw_shared<cache_tracker>();
+        auto tokens = token_generation_for_shard(1, this_shard_id(), test_db_config.murmur3_partitioner_ignore_msb_bits(), smp::count);
+
+        // Returns gc_clock::now() minus `step` as a raw tick count.
+        auto next_timestamp = [] (auto step) {
+            using namespace std::chrono;
+            return (gc_clock::now().time_since_epoch() - duration_cast<microseconds>(step)).count();
+        };
+        // Builds a one-cell mutation on the first generated key. `value` persists
+        // across calls, so every call uses a different clustering key. The last
+        // argument of set_clustered_cell (presumably the TTL) is step + 5s.
+        auto make_expiring_cell = [&] (schema_ptr s, std::chrono::hours step) {
+            static thread_local int32_t value = 1;
+
+            auto key_str = tokens[0].first;
+            auto key = partition_key::from_exploded(*s, {to_bytes(key_str)});
+
+            mutation m(s, key);
+            auto c_key = clustering_key::from_exploded(*s, {int32_type->decompose(value++)});
+            m.set_clustered_cell(c_key, bytes("value"), data_value(int32_t(value)), next_timestamp(step), gc_clock::duration(step + 5s));
+            return m;
+        };
+
+        // Creates table #idx in its own directory under `tmp`, with commitlog and
+        // incremental backups disabled, and adds one sstable containing a single
+        // expiring cell to it.
+        auto make_table_with_single_fully_expired_sstable = [&] (auto idx) {
+            auto s = make_schema(idx);
+            column_family::config cfg = column_family_test_config(env.manager());
+            cfg.datadir = tmp.path().string() + "/" + std::to_string(idx);
+            touch_directory(cfg.datadir).get();
+            cfg.enable_commitlog = false;
+            cfg.enable_incremental_backups = false;
+
+            // Sstable factory with an increasing generation number, starting at 1.
+            auto sst_gen = [&env, s, dir = cfg.datadir, gen = make_lw_shared<unsigned>(1)] () mutable {
+                return env.make_sstable(s, dir, (*gen)++, sstables::sstable::version_types::md, big);
+            };
+
+            auto cf = make_lw_shared<column_family>(s, cfg, column_family::no_commitlog(), *cm, *cl_stats, *tracker);
+            cf->start();
+            cf->mark_ready_for_writes();
+
+            auto muts = { make_expiring_cell(s, std::chrono::hours(1)) };
+            auto sst = make_sstable_containing(sst_gen, muts);
+            column_family_test(cf).add_sstable(sst);
+            return cf;
+        };
+
+        // Every table created below is stopped on scope exit.
+        std::vector<lw_shared_ptr<column_family>> tables;
+        auto stop_tables = defer([&tables] {
+            for (auto& t : tables) {
+                t->stop().get();
+            }
+        });
+        for (auto i = 0; i < 100; i++) {
+            tables.push_back(make_table_with_single_fully_expired_sstable(i));
+        }
+
+        // Make sure everything is expired
+        forward_jump_clocks(std::chrono::hours(100));
+
+        for (auto& t : tables) {
+            BOOST_REQUIRE(t->sstables_count() == 1);
+            t->trigger_compaction();
+        }
+
+        // At least one task must be pending or active right after triggering.
+        BOOST_REQUIRE(cm->get_stats().pending_tasks >= 1 || cm->get_stats().active_tasks >= 1);
+
+        size_t max_ongoing_compaction = 0;
+
+        // wait for submitted jobs to finish.
+        // Done when the manager has no pending or active tasks and every table
+        // has zero sstables left.
+        auto end = [cm, &tables] {
+            return cm->get_stats().pending_tasks == 0 && cm->get_stats().active_tasks == 0
+                && boost::algorithm::all_of(tables, [] (auto& t) { return t->sstables_count() == 0; });
+        };
+        while (!end()) {
+            // The manager is idle but some tables still have sstables:
+            // trigger compaction on those tables again.
+            if (!cm->get_stats().pending_tasks && !cm->get_stats().active_tasks) {
+                for (auto& t : tables) {
+                    if (t->sstables_count()) {
+                        t->trigger_compaction();
+                    }
+                }
+            }
+            // Track the highest number of active tasks seen on any iteration.
+            max_ongoing_compaction = std::max(cm->get_stats().active_tasks, max_ongoing_compaction);
+            // NOTE(review): later().get() presumably yields so compaction can progress - confirm.
+            later().get();
+        }
+        BOOST_REQUIRE(cm->get_stats().errors == 0);
+        BOOST_REQUIRE(max_ongoing_compaction == 1);
+    });
+}
+
+// Builds max_compaction_threshold() sstables of data size 1 and checks that
+// the size-tiered strategy returns a non-empty reshaping job for them in both
+// the strict and the relaxed reshape mode.
+SEASTAR_TEST_CASE(stcs_reshape_test) {
+    return test_env::do_with_async([] (test_env& env) {
+        simple_schema ss;
+        auto s = ss.schema();
+        const auto threshold = s->max_compaction_threshold();
+
+        std::vector<shared_sstable> input;
+        input.reserve(threshold);
+        for (auto generation = 1; generation <= threshold; ++generation) {
+            auto sst = env.make_sstable(s, "", generation, la, big);
+            sstables::test(sst).set_data_file_size(1);
+            input.push_back(std::move(sst));
+        }
+
+        auto cs = sstables::make_compaction_strategy(sstables::compaction_strategy_type::size_tiered,
+                                                    s->compaction_strategy_options());
+
+        auto strict_job = cs.get_reshaping_job(input, s, default_priority_class(), reshape_mode::strict);
+        BOOST_REQUIRE(!strict_job.sstables.empty());
+        auto relaxed_job = cs.get_reshaping_job(input, s, default_priority_class(), reshape_mode::relaxed);
+        BOOST_REQUIRE(!relaxed_job.sstables.empty());
+    });
+}
--- a/test/cql-pytest/test_secondary_index.py
+++ b/test/cql-pytest/test_secondary_index.py
@@ -0,0 +1,186 @@
+# Copyright 2020 ScyllaDB
+#
+# This file is part of Scylla.
+#
+# Scylla is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Scylla is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+
+# Tests for secondary indexes
+
+import time
+import pytest
+from cassandra.protocol import SyntaxException, AlreadyExists, InvalidRequest, ConfigurationException, ReadFailure
+
+from util import new_test_table, unique_name
+
+# A reproducer for issue #7443: Normally, when the entire table is SELECTed,
+# the partitions are returned sorted by the partitions' token. When there
+# is filtering, this order is not expected to change. Furthermore, when this
+# filtering happens to use a secondary index, again the order is not expected
+# to change.
+def test_partition_order_with_si(cql, test_keyspace):
+    schema = 'pk int, x int, PRIMARY KEY ((pk))'
+    with new_test_table(cql, test_keyspace, schema) as table:
+        # Insert 20 partitions, all of them with x=1 so that filtering by x=1
+        # will yield the same 20 partitions:
+        N = 20
+        stmt = cql.prepare('INSERT INTO '+table+' (pk, x) VALUES (?, ?)')
+        for i in range(N):
+            cql.execute(stmt, [i, 1])
+        # SELECT all the rows, and verify they are returned in increasing
+        # partition token order (note that the token is a *signed* number):
+        tokens = [row.system_token_pk for row in cql.execute('SELECT token(pk) FROM '+table)]
+        assert len(tokens) == N
+        assert sorted(tokens) == tokens
+        # Now select all the partitions with filtering of x=1. Since all
+        # rows have x=1, this shouldn't change the list of matching rows, and
+        # also shouldn't change their order:
+        tokens1 = [row.system_token_pk for row in cql.execute('SELECT token(pk) FROM '+table+' WHERE x=1 ALLOW FILTERING')]
+        assert tokens1 == tokens
+        # Now add an index on x, which allows implementing the "x=1"
+        # restriction differently. With the index, "ALLOW FILTERING" is
+        # no longer necessary. But the order of the results should
+        # still not change. Issue #7443 is about the order changing here.
+        cql.execute('CREATE INDEX ON '+table+'(x)')
+        # "CREATE INDEX" does not wait until the index is actually available
+        # for use. Reads immediately after the CREATE INDEX may fail or return
+        # partial results. So let's retry until reads resume working.
+        # tokens2 starts as None so that, if every attempt raises ReadFailure,
+        # the final assert fails cleanly instead of raising UnboundLocalError.
+        tokens2 = None
+        for i in range(100):
+            try:
+                tokens2 = [row.system_token_pk for row in cql.execute('SELECT token(pk) FROM '+table+' WHERE x=1')]
+                if len(tokens2) == N:
+                    break
+            except ReadFailure:
+                pass
+            time.sleep(0.1)
+        assert tokens2 == tokens
+
+# Test which ensures that indexes for a query are picked by the order in which
+# they appear in restrictions. That way, users can deterministically pick
+# which indexes are used for which queries.
+# Note that the order of picking indexes is not set in stone and may be
+# subject to change - in which case this test case should be amended as well.
+# The order tested in this case was decided as a good first step in issue
+# #7969, but it's possible that it will eventually be implemented another
+# way, e.g. dynamically based on estimated query selectivity statistics.
+# Ref: #7969
+@pytest.mark.xfail(reason="The order of picking indexes is currently arbitrary. Issue #7969")
+def test_order_of_indexes(scylla_only, cql, test_keyspace):
+    with new_test_table(cql, test_keyspace, 'p int primary key, v1 int, v2 int, v3 int') as table:
+        cql.execute(f"CREATE INDEX my_v3_idx ON {table}(v3)")
+        cql.execute(f"CREATE INDEX my_v1_idx ON {table}(v1)")
+        cql.execute(f"CREATE INDEX my_v2_idx ON {table}((p),v2)")
+        # Each entry pairs a query with the index it is expected to use: the
+        # first usable index in the order of the restrictions. The query trace
+        # is inspected to find out which index was actually used. Today some
+        # entries pass only by luck, because the order is not well defined
+        # (it may even change across a server restart), and others fail. Once
+        # a proper ordering is implemented, all of them should pass.
+        expectations = [
+            (f"SELECT * FROM {table} WHERE v3 = 1", "my_v3_idx"),
+            (f"SELECT * FROM {table} WHERE v3 = 1 and v1 = 2 allow filtering", "my_v3_idx"),
+            (f"SELECT * FROM {table} WHERE p = 1 and v1 = 1 and v3 = 2 allow filtering", "my_v1_idx"),
+            (f"SELECT * FROM {table} WHERE p = 1 and v3 = 1 and v1 = 2 allow filtering", "my_v3_idx"),
+            # A local index is skipped when it cannot be used
+            (f"SELECT * FROM {table} WHERE v2 = 1 and v1 = 2 allow filtering", "my_v1_idx"),
+            (f"SELECT * FROM {table} WHERE v2 = 1 and v3 = 2 and v1 = 3 allow filtering", "my_v3_idx"),
+            (f"SELECT * FROM {table} WHERE v1 = 1 and v2 = 2 and v3 = 3 allow filtering", "my_v1_idx"),
+            # A usable local index wins over the global ones
+            (f"SELECT * FROM {table} WHERE p = 1 and v1 = 1 and v3 = 2 and v2 = 2 allow filtering", "my_v2_idx"),
+            (f"SELECT * FROM {table} WHERE p = 1 and v2 = 1 and v1 = 2 allow filtering", "my_v2_idx"),
+        ]
+        for query, index_name in expectations:
+            trace = cql.execute(query, trace=True).get_query_trace()
+            assert any(index_name in event.description for event in trace.events)
+
+# Indexes can be created without an explicit name, in which case a default name is chosen.
+# However, due to #8620 it was possible to break the index creation mechanism by creating
+# a properly named regular table, which conflicts with the generated index name.
+def test_create_unnamed_index_when_its_name_is_taken(cql, test_keyspace):
+    with new_test_table(cql, test_keyspace, 'p int primary key, v int') as table:
+        # A regular table that occupies the name the unnamed index on v would
+        # get by default.
+        create_squatter = f"CREATE TABLE {table}_v_idx_index (i_do_not_exist_in_the_base_table int primary key)"
+        drop_squatter = f"DROP TABLE {table}_v_idx_index"
+        try:
+            cql.execute(create_squatter)
+            # The unnamed index must still be created successfully, despite
+            # its default name being taken.
+            cql.execute(f"CREATE INDEX ON {table}(v)")
+        finally:
+            cql.execute(drop_squatter)
+
+# Indexes created with an explicit name cause a materialized view to be created,
+# and this view has a specific name - <index-name>_index. If there happens to be
+# a regular table (or another view) named just like that, index creation should fail.
+def test_create_named_index_when_its_name_is_taken(scylla_only, cql, test_keyspace):
+    with new_test_table(cql, test_keyspace, 'p int primary key, v int') as table:
+        index_name = unique_name()
+        # A regular table that occupies the name of the view backing the index.
+        create_squatter = f"CREATE TABLE {test_keyspace}.{index_name}_index (i_do_not_exist_in_the_base_table int primary key)"
+        drop_squatter = f"DROP TABLE {test_keyspace}.{index_name}_index"
+        try:
+            cql.execute(create_squatter)
+            # The backing materialized view cannot be created while a regular
+            # table holds its name, so creating the index has to be rejected.
+            with pytest.raises(InvalidRequest, match="already exists"):
+                cql.execute(f"CREATE INDEX {index_name} ON {table}(v)")
+        finally:
+            cql.execute(drop_squatter)
+
+# Tests for CREATE INDEX IF NOT EXISTS
+# Reproduces issue #8717.
+def test_create_index_if_not_exists(cql, test_keyspace):
+    with new_test_table(cql, test_keyspace, 'p int primary key, v int') as table:
+        # Local shorthand: the statement must fail with InvalidRequest and,
+        # when `reason` is given, the error message must match it.
+        def rejected(statement, reason=None):
+            with pytest.raises(InvalidRequest, match=reason):
+                cql.execute(statement)
+
+        # Unnamed index: repeating the plain CREATE INDEX is an error, while
+        # the "IF NOT EXISTS" form is accepted.
+        cql.execute(f"CREATE INDEX ON {table}(v)")
+        rejected(f"CREATE INDEX ON {table}(v)", "duplicate")
+        cql.execute(f"CREATE INDEX IF NOT EXISTS ON {table}(v)")
+        cql.execute(f"DROP INDEX {test_keyspace}.{table.split('.')[1]}_v_idx")
+
+        # Named index: the same sequence. This is the case that broke in #8717.
+        cql.execute(f"CREATE INDEX xyz ON {table}(v)")
+        rejected(f"CREATE INDEX xyz ON {table}(v)", "already exists")
+        cql.execute(f"CREATE INDEX IF NOT EXISTS xyz ON {table}(v)")
+        cql.execute(f"DROP INDEX {test_keyspace}.xyz")
+
+        # Named index again, this time with a quoted, case-sensitive name.
+        cql.execute(f'CREATE INDEX "CamelCase" ON {table}(v)')
+        rejected(f'CREATE INDEX "CamelCase" ON {table}(v)', "already exists")
+        cql.execute(f'CREATE INDEX IF NOT EXISTS "CamelCase" ON {table}(v)')
+        cql.execute(f'DROP INDEX {test_keyspace}."CamelCase"')
+
+        # A second, differently named index on an already-indexed column.
+        # With "IF NOT EXISTS" the statement appears to succeed but creates
+        # nothing: dropping the new name fails and only the old index remains.
+        cql.execute(f"CREATE INDEX xyz ON {table}(v)")
+        rejected(f"CREATE INDEX abc ON {table}(v)", "duplicate")
+        cql.execute(f"CREATE INDEX IF NOT EXISTS abc ON {table}(v)")
+        rejected(f"DROP INDEX {test_keyspace}.abc")
+        cql.execute(f"DROP INDEX {test_keyspace}.xyz")
+
+# Test that the paging state works properly for indexes on tables
+# with descending clustering order. There was a problem with indexes
+# created on clustering keys with DESC clustering order - they are represented
+# as "reverse" types internally and Scylla assertions failed that the base type
+# is different from the underlying view type, even though, from the perspective
+# of deserialization, they're equal. Issue #8666
+def test_paging_with_desc_clustering_order(cql, test_keyspace):
+    # SimpleStatement lives in cassandra.query, which this file does not
+    # import at module level; without this import the test dies with a
+    # NameError before it exercises paging at all.
+    from cassandra.query import SimpleStatement
+    schema = 'p int, c int, primary key (p,c)'
+    extra = 'with clustering order by (c desc)'
+    with new_test_table(cql, test_keyspace, schema, extra) as table:
+        cql.execute(f"CREATE INDEX ON {table}(c)")
+        for i in range(3):
+            cql.execute(f"INSERT INTO {table}(p,c) VALUES ({i}, 42)")
+        # fetch_size=1 makes the driver fetch one row per page, so reading all
+        # three rows goes through the paging state repeatedly.
+        stmt = SimpleStatement(f"SELECT * FROM {table} WHERE c = 42", fetch_size=1)
+        assert len([row for row in cql.execute(stmt)]) == 3
--- a/thrift/handler.cc
+++ b/thrift/handler.cc
@@ -1396,9 +1396,18 @@ private:
            return { };
        }
    }
+    static void validate_key(const schema& s, const clustering_key& ck, bytes_view v) {
+        auto ck_size = ck.size(s);
+        if (ck_size > s.clustering_key_size()) {
+            throw std::runtime_error(format("Cell name of {}.{} has too many components, expected {} but got {} in 0x{}",
+                s.ks_name(), s.cf_name(), s.clustering_key_size(), ck_size, to_hex(v)));
+        }
+    }
    static clustering_key_prefix make_clustering_prefix(const schema& s, bytes_view v) {
        auto composite = composite_view(v, s.thrift().has_compound_comparator());
-        return clustering_key_prefix::from_exploded(composite.values());
+        auto ck = clustering_key_prefix::from_exploded(composite.values());
+        validate_key(s, ck, v);
+        return ck;
    }
    static query::clustering_range::bound make_clustering_bound(const schema& s, bytes_view v, composite::eoc exclusiveness_marker) {
        auto composite = composite_view(v, s.thrift().has_compound_comparator());
@@ -1407,6 +1416,7 @@ private:
            last = c.second;
            return c.first;
        }));
+        validate_key(s, ck, v);
        return query::clustering_range::bound(std::move(ck), last != exclusiveness_marker);
    }
    static range<clustering_key_prefix> make_clustering_range(const schema& s, const std::string& start, const std::string& end) {
--- a/tools/java
+++ b/tools/java
--- a/tools/jmx
+++ b/tools/jmx
--- a/transport/server.cc
+++ b/transport/server.cc
@@ -616,16 +616,23 @@ future<> cql_server::connection::process_request() {
        auto op = f.opcode;
        auto stream = f.stream;
        auto mem_estimate = f.length * 2 + 8000; // Allow for extra copies and bookkeeping
-
        if (mem_estimate > _server._max_request_size) {
-            return make_exception_future<>(exceptions::invalid_request_exception(format("request size too large (frame size {:d}; estimate {:d}; allowed {:d}",
-                    f.length, mem_estimate, _server._max_request_size)));
+            return _read_buf.skip(f.length).then([length = f.length, stream = f.stream, mem_estimate, this] () {
+                write_response(make_error(stream, exceptions::exception_code::INVALID,
+                        format("request size too large (frame size {:d}; estimate {:d}; allowed {:d}", length, mem_estimate, _server._max_request_size),
+                        tracing::trace_state_ptr()));
+                return make_ready_future<>();
+            });
        }

        if (_server._requests_serving > _server._max_concurrent_requests) {
            ++_server._requests_shed;
-            return make_exception_future<>(
-                    exceptions::overloaded_exception(format("too many in-flight requests (configured via max_concurrent_requests_per_shard): {}", _server._requests_serving)));
+            return _read_buf.skip(f.length).then([this, stream = f.stream] {
+                write_response(make_error(stream, exceptions::exception_code::OVERLOADED,
+                        format("too many in-flight requests (configured via max_concurrent_requests_per_shard): {}", _server._requests_serving),
+                        tracing::trace_state_ptr()));
+                return make_ready_future<>();
+            });
        }

        auto fut = get_units(_server._memory_available, mem_estimate);
--- a/types.hh
+++ b/types.hh
@@ -588,6 +588,9 @@ public:
    cql3::cql3_type as_cql3_type() const;
    const sstring& cql3_type_name() const;
    virtual shared_ptr<const abstract_type> freeze() const { return shared_from_this(); }
+    const abstract_type& without_reversed() const {
+        return is_reversed() ? *underlying_type() : *this;
+    }
    friend class list_type_impl;
 private:
    mutable sstring _cql3_type_name;
--- a/utils/loading_shared_values.hh
+++ b/utils/loading_shared_values.hh
@@ -173,6 +173,10 @@ public:
            return res;
        }

+        long use_count() const noexcept {
+            return _e ? _e.use_count() : 0;
+        }
+
        friend class loading_shared_values;
    };