Update seastar submodule (json crash in describe_ring)

* seastar 0568c231cd...fd0d7c1c9a (1): > Merge 'stream_range_as_array: always close output stream' from Benny Halevy. Fixes #10592.
release: prepare for 4.5.7
2022-06-08 16:50:51 +03:00 · 2022-05-16 15:27:49 +03:00 · 2022-05-15 13:44:15 +03:00 · 2022-05-15 13:20:40 +03:00 · 2022-05-10 14:09:54 +02:00 · 2022-05-08 12:37:00 +03:00
149 changed files with 3389 additions and 949 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/DEDICATION.txt
+++ b/DEDICATION.txt
@@ -0,0 +1 @@
+Dedicated to the memory of Alberto José Araújo, a coworker and a friend.
--- a/2
+++ b/2
@@ -1,7 +1,7 @@
 #!/bin/sh

 PRODUCT=scylla
-VERSION=4.5.dev
+VERSION=4.5.7

 if test -f version
 then
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -123,7 +123,7 @@ struct rjson_engaged_ptr_comp {
 // as internally they're stored in an array, and the order of elements is
 // not important in set equality. See issue #5021
 static bool check_EQ_for_sets(const rjson::value& set1, const rjson::value& set2) {
-    if (set1.Size() != set2.Size()) {
+    if (!set1.IsArray() || !set2.IsArray() || set1.Size() != set2.Size()) {
        return false;
    }
    std::set<const rjson::value*, rjson_engaged_ptr_comp> set1_raw;
@@ -137,25 +137,70 @@ static bool check_EQ_for_sets(const rjson::value& set1, const rjson::value& set2
    }
    return true;
 }
+// Moreover, the JSON being compared can be a nested document with outer
+// layers of lists and maps and some inner set - and we need to get to that
+// inner set to compare it correctly with check_EQ_for_sets() (issue #8514).
+static bool check_EQ(const rjson::value* v1, const rjson::value& v2);
+static bool check_EQ_for_lists(const rjson::value& list1, const rjson::value& list2) {
+    if (!list1.IsArray() || !list2.IsArray() || list1.Size() != list2.Size()) {
+        return false;
+    }
+    auto it1 = list1.Begin();
+    auto it2 = list2.Begin();
+    while (it1 != list1.End()) {
+        // Note: Alternator limits an item's depth (rjson::parse() limits
+        // it to around 37 levels), so this recursion is safe.
+        if (!check_EQ(&*it1, *it2)) {
+            return false;
+        }
+        ++it1;
+        ++it2;
+    }
+    return true;
+}
+static bool check_EQ_for_maps(const rjson::value& list1, const rjson::value& list2) {
+    if (!list1.IsObject() || !list2.IsObject() || list1.MemberCount() != list2.MemberCount()) {
+        return false;
+    }
+    for (auto it1 = list1.MemberBegin(); it1 != list1.MemberEnd(); ++it1) {
+        auto it2 = list2.FindMember(it1->name);
+        if (it2 == list2.MemberEnd() || !check_EQ(&it1->value, it2->value)) {
+            return false;
+        }
+    }
+    return true;
+}

 // Check if two JSON-encoded values match with the EQ relation
 static bool check_EQ(const rjson::value* v1, const rjson::value& v2) {
-    if (!v1) {
-        return false;
-    }
-    if (v1->IsObject() && v1->MemberCount() == 1 && v2.IsObject() && v2.MemberCount() == 1) {
+    if (v1 && v1->IsObject() && v1->MemberCount() == 1 && v2.IsObject() && v2.MemberCount() == 1) {
        auto it1 = v1->MemberBegin();
        auto it2 = v2.MemberBegin();
-        if ((it1->name == "SS" && it2->name == "SS") || (it1->name == "NS" && it2->name == "NS") || (it1->name == "BS" && it2->name == "BS")) {
-            return check_EQ_for_sets(it1->value, it2->value);
+        if (it1->name != it2->name) {
+            return false;
        }
+        if (it1->name == "SS" || it1->name == "NS" || it1->name == "BS") {
+            return check_EQ_for_sets(it1->value, it2->value);
+        } else if(it1->name == "L") {
+            return check_EQ_for_lists(it1->value, it2->value);
+        } else if(it1->name == "M") {
+            return check_EQ_for_maps(it1->value, it2->value);
+        } else {
+            // Other, non-nested types (number, string, etc.) can be compared
+            // literally, comparing their JSON representation.
+            return it1->value == it2->value;
+        }
+    } else {
+        // If v1 and/or v2 are missing (nullptr or IsNull()) the result should be false.
+        // In the unlikely case that the object is malformed (issue #8070),
+        // let's also return false.
+        return false;
    }
-    return *v1 == v2;
 }

 // Check if two JSON-encoded values match with the NE relation
 static bool check_NE(const rjson::value* v1, const rjson::value& v2) {
-    return !v1 || *v1 != v2; // null is unequal to anything.
+    return !check_EQ(v1, v2);
 }

 // Check if two JSON-encoded values match with the BEGINS_WITH relation
@@ -298,6 +343,8 @@ static bool check_NOT_NULL(const rjson::value* val) {

 // Only types S, N or B (string, number or bytes) may be compared by the
 // various comparion operators - lt, le, gt, ge, and between.
+// Note that in particular, if the value is missing (v.IsNull()), this
+// check returns false.
 static bool check_comparable_type(const rjson::value& v) {
    if (!v.IsObject() || v.MemberCount() != 1) {
        return false;
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -2442,8 +2442,8 @@ static bool hierarchy_actions(
                    if (newv) {
                        rjson::set_with_string_name(v, attr, std::move(*newv));
                    } else {
-                        throw api_error::validation(format("Can't remove document path {} - not present in item",
-                            subh.get_value()._path));
+                        // Removing a.b when a is a map but a.b doesn't exist
+                        // is silently ignored. It's not considered an error.
                    }
                } else {
                    throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
@@ -2509,7 +2509,7 @@ update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::t
                          const attribute_path_map_node<parsed::update_expression::action>* h = nullptr) {
        any_updates = true;
        if (_returnvalues == returnvalues::ALL_NEW) {
-            rjson::set_with_string_name(_return_attributes,
+            rjson::replace_with_string_name(_return_attributes,
                to_sstring_view(column_name), rjson::copy(json_value));
        } else if (_returnvalues == returnvalues::UPDATED_NEW) {
            rjson::value&& v = rjson::copy(json_value);
--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -129,6 +129,10 @@ public:
                 [&] (const json::json_return_type& json_return_value) {
                     slogger.trace("api_handler success case");
                     if (json_return_value._body_writer) {
+                         // Unfortunately, write_body() forces us to choose
+                         // from a fixed and irrelevant list of "mime-types"
+                         // at this point. But handle() overrides it below
+                         // with the correct one (application/x-amz-json-1.0).
                         rep->write_body("json", std::move(json_return_value._body_writer));
                     } else {
                         rep->_content += json_return_value._res;
@@ -141,7 +145,7 @@ public:

             return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
         });
-    }), _type("json") { }
+    }) { }

    api_handler(const api_handler&) = default;
    future<std::unique_ptr<reply>> handle(const sstring& path,
@@ -149,7 +153,8 @@ public:
        handle_CORS(*req, *rep, false);
        return _f_handle(std::move(req), std::move(rep)).then(
                [this](std::unique_ptr<reply> rep) {
-                    rep->done(_type);
+                    rep->set_mime_type("application/x-amz-json-1.0");
+                    rep->done();
                    return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
                });
    }
@@ -163,7 +168,6 @@ protected:
    }

    future_handler_function _f_handle;
-    sstring _type;
 };

 class gated_handler : public handler_base {
@@ -246,24 +250,31 @@ future<> server::verify_signature(const request& req, const chunked_content& con
        throw api_error::missing_authentication_token("Authorization header is mandatory for signature verification");
    }
    std::string host = host_it->second;
-    std::vector<std::string_view> credentials_raw = split(authorization_it->second, ' ');
+    std::string_view authorization_header = authorization_it->second;
+    auto pos = authorization_header.find_first_of(' ');
+    if (pos == std::string_view::npos || authorization_header.substr(0, pos) != "AWS4-HMAC-SHA256") {
+        throw api_error::invalid_signature(format("Authorization header must use AWS4-HMAC-SHA256 algorithm: {}", authorization_header));
+    }
+    authorization_header.remove_prefix(pos+1);
    std::string credential;
    std::string user_signature;
    std::string signed_headers_str;
    std::vector<std::string_view> signed_headers;
-    for (std::string_view entry : credentials_raw) {
+    do {
+        // Either a comma or a space can mark the end of an entry
+        pos = authorization_header.find_first_of(" ,");
+        std::string_view entry = authorization_header.substr(0, pos);
+        if (pos != std::string_view::npos) {
+            authorization_header.remove_prefix(pos + 1);
+        }
+        if (entry.empty()) {
+            continue;
+        }
        std::vector<std::string_view> entry_split = split(entry, '=');
        if (entry_split.size() != 2) {
-            if (entry != "AWS4-HMAC-SHA256") {
-                throw api_error::invalid_signature(format("Only AWS4-HMAC-SHA256 algorithm is supported. Found: {}", entry));
-            }
            continue;
        }
        std::string_view auth_value = entry_split[1];
-        // Commas appear as an additional (quite redundant) delimiter
-        if (auth_value.back() == ',') {
-            auth_value.remove_suffix(1);
-        }
        if (entry_split[0] == "Credential") {
            credential = std::string(auth_value);
        } else if (entry_split[0] == "Signature") {
@@ -273,7 +284,8 @@ future<> server::verify_signature(const request& req, const chunked_content& con
            signed_headers = split(auth_value, ';');
            std::sort(signed_headers.begin(), signed_headers.end());
        }
-    }
+    } while (pos != std::string_view::npos);
+
    std::vector<std::string_view> credential_split = split(credential, '/');
    if (credential_split.size() != 5) {
        throw api_error::validation(format("Incorrect credential information format: {}", credential));
--- a/alternator/stats.cc
+++ b/alternator/stats.cc
@@ -38,6 +38,7 @@ stats::stats() : api_operations{} {
 #define OPERATION_LATENCY(name, CamelCaseName) \
                seastar::metrics::make_histogram("op_latency", \
                        seastar::metrics::description("Latency histogram of an operation via Alternator API"), {op(CamelCaseName)}, [this]{return to_metrics_histogram(api_operations.name);}),
+            OPERATION(batch_get_item, "BatchGetItem")
            OPERATION(batch_write_item, "BatchWriteItem")
            OPERATION(create_backup, "CreateBackup")
            OPERATION(create_global_table, "CreateGlobalTable")
--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -332,15 +332,15 @@ void set_column_family(http_context& ctx, routes& r) {
    });

    cf::get_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, req->param["name"], 0, [](column_family& cf) {
+        return map_reduce_cf(ctx, req->param["name"], uint64_t{0}, [](column_family& cf) {
            return cf.active_memtable().partition_count();
-        }, std::plus<int>());
+        }, std::plus<>());
    });

    cf::get_all_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, 0, [](column_family& cf) {
+        return map_reduce_cf(ctx, uint64_t{0}, [](column_family& cf) {
            return cf.active_memtable().partition_count();
-        }, std::plus<int>());
+        }, std::plus<>());
    });

    cf::get_memtable_on_heap_size.set(r, [] (const_req req) {
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -262,7 +262,7 @@ void set_repair(http_context& ctx, routes& r, sharded<netw::messaging_service>&
            try {
                res = fut.get0();
            } catch (std::exception& e) {
-                return make_exception_future<json::json_return_type>(httpd::server_error_exception(e.what()));
+                return make_exception_future<json::json_return_type>(httpd::bad_param_exception(e.what()));
            }
            return make_ready_future<json::json_return_type>(json::json_return_type(res));
        });
--- a/atomic_cell.cc
+++ b/atomic_cell.cc
@@ -79,6 +79,54 @@ atomic_cell::atomic_cell(const abstract_type& type, atomic_cell_view other)
    set_view(_data);
 }

+// Based on:
+//  - org.apache.cassandra.db.AbstractCell#reconcile()
+//  - org.apache.cassandra.db.BufferExpiringCell#reconcile()
+//  - org.apache.cassandra.db.BufferDeletedCell#reconcile()
+int
+compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
+    if (left.timestamp() != right.timestamp()) {
+        return left.timestamp() > right.timestamp() ? 1 : -1;
+    }
+    if (left.is_live() != right.is_live()) {
+        return left.is_live() ? -1 : 1;
+    }
+    if (left.is_live()) {
+        auto c = compare_unsigned(left.value(), right.value());
+        if (c != 0) {
+            return c;
+        }
+        if (left.is_live_and_has_ttl() != right.is_live_and_has_ttl()) {
+            // prefer expiring cells.
+            return left.is_live_and_has_ttl() ? 1 : -1;
+        }
+        if (left.is_live_and_has_ttl()) {
+            if (left.expiry() != right.expiry()) {
+                return left.expiry() < right.expiry() ? -1 : 1;
+            } else {
+                // prefer the cell that was written later,
+                // so it survives longer after it expires, until purged.
+                if (left.ttl() != right.ttl()) {
+                    return left.ttl() < right.ttl() ? 1 : -1;
+                } else {
+                    return 0;
+                }
+            }
+        }
+    } else {
+        // Both are deleted
+        if (left.deletion_time() != right.deletion_time()) {
+            // Origin compares big-endian serialized deletion time. That's because it
+            // delegates to AbstractCell.reconcile() which compares values after
+            // comparing timestamps, which in case of deleted cells will hold
+            // serialized expiry.
+            return (uint64_t) left.deletion_time().time_since_epoch().count()
+                   < (uint64_t) right.deletion_time().time_since_epoch().count() ? -1 : 1;
+        }
+    }
+    return 0;
+}
+
 atomic_cell_or_collection atomic_cell_or_collection::copy(const abstract_type& type) const {
    if (_data.empty()) {
        return atomic_cell_or_collection();
--- a/cache_flat_mutation_reader.hh
+++ b/cache_flat_mutation_reader.hh
@@ -267,6 +267,9 @@ future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_poin
        }
        _state = state::reading_from_underlying;
        _population_range_starts_before_all_rows = _lower_bound.is_before_all_clustered_rows(*_schema);
+        if (!_read_context->partition_exists()) {
+            return read_from_underlying(timeout);
+        }
        auto end = _next_row_in_range ? position_in_partition(_next_row.position())
                                      : position_in_partition(_upper_bound);
        return _underlying->fast_forward_to(position_range{_lower_bound, std::move(end)}, timeout).then([this, timeout] {
@@ -573,8 +576,8 @@ void cache_flat_mutation_reader::move_to_range(query::clustering_row_ranges::con
                clogger.trace("csm {}: insert dummy at {}", fmt::ptr(this), _lower_bound);
                auto it = with_allocator(_lsa_manager.region().allocator(), [&] {
                    auto& rows = _snp->version()->partition().clustered_rows();
-                    auto new_entry = current_allocator().construct<rows_entry>(*_schema, _lower_bound, is_dummy::yes, is_continuous::no);
-                    return rows.insert_before(_next_row.get_iterator_in_latest_version(), *new_entry);
+                    auto new_entry = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(*_schema, _lower_bound, is_dummy::yes, is_continuous::no));
+                    return rows.insert_before(_next_row.get_iterator_in_latest_version(), std::move(new_entry));
                });
                _snp->tracker()->insert(*it);
                _last_row = partition_snapshot_row_weakref(*_snp, it, true);
--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -74,7 +74,7 @@ using namespace std::chrono_literals;
 logging::logger cdc_log("cdc");

 namespace cdc {
-static schema_ptr create_log_schema(const schema&, std::optional<utils::UUID> = {});
+static schema_ptr create_log_schema(const schema&, std::optional<utils::UUID> = {}, schema_ptr = nullptr);
 }

 static constexpr auto cdc_group_name = "cdc";
@@ -221,7 +221,7 @@ public:
                return;
            }

-            auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt);
+            auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt, log_schema);

            auto log_mut = log_schema 
                ? db::schema_tables::make_update_table_mutations(db, keyspace.metadata(), log_schema, new_log_schema, timestamp, false)
@@ -490,7 +490,7 @@ bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name) {
    return to_bytes(cdc_deleted_elements_column_prefix) + column_name;
 }

-static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid) {
+static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid, schema_ptr old) {
    schema_builder b(s.ks_name(), log_name(s.cf_name()));
    b.with_partitioner("com.scylladb.dht.CDCPartitioner");
    b.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
@@ -571,6 +571,20 @@ static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID>
        b.set_uuid(*uuid);
    }

+    /**
+     * #10473 - if we are redefining the log table, we need to ensure any dropped
+     * columns are registered in "dropped_columns" table, otherwise clients will not
+     * be able to read data older than now.
+     */
+    if (old) {
+        // not super efficient, but we don't do this often.
+        for (auto& col : old->all_columns()) {
+            if (!b.has_column({col.name(), col.name_as_text() })) {
+                b.without_column(col.name_as_text(), col.type, api::new_timestamp());
+            }
+        }
+    }
+
    return b.build();
 }

@@ -716,16 +730,16 @@ private:
       }
       return false;
    }
-    bool compare(const T&, const value_type& v);
+    int32_t compare(const T&, const value_type& v);
 };

 template<>
-bool maybe_back_insert_iterator<std::vector<std::pair<bytes_view, bytes_view>>, bytes_view>::compare(const bytes_view& t, const value_type& v) {
+int32_t maybe_back_insert_iterator<std::vector<std::pair<bytes_view, bytes_view>>, bytes_view>::compare(const bytes_view& t, const value_type& v) {
    return _type.compare(t, v.first);
 }

 template<>
-bool maybe_back_insert_iterator<std::vector<bytes_view>, bytes_view>::compare(const bytes_view& t, const value_type& v) {
+int32_t maybe_back_insert_iterator<std::vector<bytes_view>, bytes_view>::compare(const bytes_view& t, const value_type& v) {
    return _type.compare(t, v);
 }

--- a/configure.py
+++ b/configure.py
@@ -302,7 +302,7 @@ scylla_tests = set([
    'test/boost/cdc_generation_test',
    'test/boost/aggregate_fcts_test',
    'test/boost/allocation_strategy_test',
-    'test/boost/alternator_base64_test',
+    'test/boost/alternator_unit_test',
    'test/boost/anchorless_list_test',
    'test/boost/auth_passwords_test',
    'test/boost/auth_resource_test',
@@ -1076,7 +1076,7 @@ pure_boost_tests = set([
 ])

 tests_not_using_seastar_test_framework = set([
-    'test/boost/alternator_base64_test',
+    'test/boost/alternator_unit_test',
    'test/boost/small_vector_test',
    'test/manual/gossip',
    'test/manual/message',
@@ -1152,7 +1152,7 @@ deps['test/boost/linearizing_input_stream_test'] = [
 ]

 deps['test/boost/duration_test'] += ['test/lib/exception_utils.cc']
-deps['test/boost/alternator_base64_test'] += ['alternator/base64.cc']
+deps['test/boost/alternator_unit_test'] += ['alternator/base64.cc']

 deps['test/raft/replication_test'] = ['test/raft/replication_test.cc'] + scylla_raft_dependencies
 deps['test/raft/fsm_test'] =  ['test/raft/fsm_test.cc', 'test/lib/log.cc'] + scylla_raft_dependencies
@@ -2018,7 +2018,7 @@ with open(buildfile_tmp, 'w') as f:
            command = ./dist/debian/debian_files_gen.py
        build $builddir/debian/debian: debian_files_gen | always
        rule extract_node_exporter
-            command = tar -C build -xvpf {node_exporter_filename} && rm -rfv build/node_exporter && mv -v build/{node_exporter_dirname} build/node_exporter
+            command = tar -C build -xvpf {node_exporter_filename} --no-same-owner && rm -rfv build/node_exporter && mv -v build/{node_exporter_dirname} build/node_exporter
        build $builddir/node_exporter: extract_node_exporter | always
        ''').format(**globals()))

--- a/cql3/authorized_prepared_statements_cache.hh
+++ b/cql3/authorized_prepared_statements_cache.hh
@@ -35,6 +35,28 @@ struct authorized_prepared_statements_cache_size {
 class authorized_prepared_statements_cache_key {
 public:
    using cache_key_type = std::pair<auth::authenticated_user, typename cql3::prepared_cache_key_type::cache_key_type>;
+
+    struct view {
+        const auth::authenticated_user& user_ref;
+        const cql3::prepared_cache_key_type& prep_cache_key_ref;
+    };
+
+    struct view_hasher {
+        size_t operator()(const view& kv) {
+            return cql3::authorized_prepared_statements_cache_key::hash(kv.user_ref, kv.prep_cache_key_ref.key());
+        }
+    };
+
+    struct view_equal {
+        bool operator()(const authorized_prepared_statements_cache_key& k1, const view& k2) {
+            return k1.key().first == k2.user_ref && k1.key().second == k2.prep_cache_key_ref.key();
+        }
+
+        bool operator()(const view& k2, const authorized_prepared_statements_cache_key& k1) {
+            return operator()(k1, k2);
+        }
+    };
+
 private:
    cache_key_type _key;

@@ -100,10 +122,12 @@ private:

 public:
    using key_type = cache_key_type;
+    using key_view_type = typename key_type::view;
+    using key_view_hasher = typename key_type::view_hasher;
+    using key_view_equal = typename key_type::view_equal;
    using value_type = checked_weak_ptr;
    using entry_is_too_big = typename cache_type::entry_is_too_big;
-    using iterator = typename cache_type::iterator;
-
+    using value_ptr = typename cache_type::value_ptr;
 private:
    cache_type _cache;
    logging::logger& _logger;
@@ -124,38 +148,12 @@ public:
        }).discard_result();
    }

-    iterator find(const auth::authenticated_user& user, const cql3::prepared_cache_key_type& prep_cache_key) {
-        struct key_view {
-            const auth::authenticated_user& user_ref;
-            const cql3::prepared_cache_key_type& prep_cache_key_ref;
-        };
-
-        struct hasher {
-            size_t operator()(const key_view& kv) {
-                return cql3::authorized_prepared_statements_cache_key::hash(kv.user_ref, kv.prep_cache_key_ref.key());
-            }
-        };
-
-        struct equal {
-            bool operator()(const key_type& k1, const key_view& k2) {
-                return k1.key().first == k2.user_ref && k1.key().second == k2.prep_cache_key_ref.key();
-            }
-
-            bool operator()(const key_view& k2, const key_type& k1) {
-                return operator()(k1, k2);
-            }
-        };
-
-        return _cache.find(key_view{user, prep_cache_key}, hasher(), equal());
-    }
-
-    iterator end() {
-        return _cache.end();
+    value_ptr find(const auth::authenticated_user& user, const cql3::prepared_cache_key_type& prep_cache_key) {
+        return _cache.find(key_view_type{user, prep_cache_key}, key_view_hasher(), key_view_equal());
    }

    void remove(const auth::authenticated_user& user, const cql3::prepared_cache_key_type& prep_cache_key) {
-        iterator it = find(user, prep_cache_key);
-        _cache.remove(it);
+        _cache.remove(key_view_type{user, prep_cache_key}, key_view_hasher(), key_view_equal());
    }

    size_t size() const {
--- a/cql3/cql_statement.hh
+++ b/cql3/cql_statement.hh
@@ -103,9 +103,7 @@ public:
    virtual future<::shared_ptr<cql_transport::messages::result_message>>
        execute(query_processor& qp, service::query_state& state, const query_options& options) const = 0;

-    virtual bool depends_on_keyspace(const sstring& ks_name) const = 0;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const = 0;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const = 0;

    virtual shared_ptr<const metadata> get_result_metadata() const = 0;

--- a/cql3/prepared_statements_cache.hh
+++ b/cql3/prepared_statements_cache.hh
@@ -102,13 +102,7 @@ private:
    using cache_key_type = typename prepared_cache_key_type::cache_key_type;
    using cache_type = utils::loading_cache<cache_key_type, prepared_cache_entry, utils::loading_cache_reload_enabled::no, prepared_cache_entry_size, utils::tuple_hash, std::equal_to<cache_key_type>, prepared_cache_stats_updater>;
    using cache_value_ptr = typename cache_type::value_ptr;
-    using cache_iterator = typename cache_type::iterator;
    using checked_weak_ptr = typename statements::prepared_statement::checked_weak_ptr;
-    struct value_extractor_fn {
-        checked_weak_ptr operator()(prepared_cache_entry& e) const {
-            return e->checked_weak_from_this();
-        }
-    };

 public:
    static const std::chrono::minutes entry_expiry;
@@ -116,12 +110,9 @@ public:
    using key_type = prepared_cache_key_type;
    using value_type = checked_weak_ptr;
    using statement_is_too_big = typename cache_type::entry_is_too_big;
-    /// \note both iterator::reference and iterator::value_type are checked_weak_ptr
-    using iterator = boost::transform_iterator<value_extractor_fn, cache_iterator>;

 private:
    cache_type _cache;
-    value_extractor_fn _value_extractor_fn;

 public:
    prepared_statements_cache(logging::logger& logger, size_t size)
@@ -135,16 +126,12 @@ public:
        });
    }

-    iterator find(const key_type& key) {
-        return boost::make_transform_iterator(_cache.find(key.key()), _value_extractor_fn);
-    }
-
-    iterator end() {
-        return boost::make_transform_iterator(_cache.end(), _value_extractor_fn);
-    }
-
-    iterator begin() {
-        return boost::make_transform_iterator(_cache.begin(), _value_extractor_fn);
+    value_type find(const key_type& key) {
+        cache_value_ptr vp = _cache.find(key.key());
+        if (vp) {
+            return (*vp)->checked_weak_from_this();
+        }
+        return value_type();
    }

    template <typename Pred>
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -943,7 +943,7 @@ bool query_processor::migration_subscriber::should_invalidate(
        sstring ks_name,
        std::optional<sstring> cf_name,
        ::shared_ptr<cql_statement> statement) {
-    return statement->depends_on_keyspace(ks_name) && (!cf_name || statement->depends_on_column_family(*cf_name));
+    return statement->depends_on(ks_name, cf_name);
 }

 future<> query_processor::query_internal(
--- a/cql3/query_processor.hh
+++ b/cql3/query_processor.hh
@@ -179,10 +179,10 @@ public:

    statements::prepared_statement::checked_weak_ptr get_prepared(const std::optional<auth::authenticated_user>& user, const prepared_cache_key_type& key) {
        if (user) {
-            auto it = _authorized_prepared_cache.find(*user, key);
-            if (it != _authorized_prepared_cache.end()) {
+            auto vp = _authorized_prepared_cache.find(*user, key);
+            if (vp) {
                try {
-                    return it->get()->checked_weak_from_this();
+                    return vp->get()->checked_weak_from_this();
                } catch (seastar::checked_ptr_is_null_exception&) {
                    // If the prepared statement got invalidated - remove the corresponding authorized_prepared_statements_cache entry as well.
                    _authorized_prepared_cache.remove(*user, key);
@@ -193,11 +193,7 @@ public:
    }

    statements::prepared_statement::checked_weak_ptr get_prepared(const prepared_cache_key_type& key) {
-        auto it = _prepared_cache.find(key);
-        if (it == _prepared_cache.end()) {
-            return statements::prepared_statement::checked_weak_ptr();
-        }
-        return *it;
+        return _prepared_cache.find(key);
    }

    future<::shared_ptr<cql_transport::messages::result_message>>
--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -133,6 +133,7 @@ statement_restrictions::statement_restrictions(schema_ptr schema, bool allow_fil
    , _partition_key_restrictions(get_initial_partition_key_restrictions(allow_filtering))
    , _clustering_columns_restrictions(get_initial_clustering_key_restrictions(allow_filtering))
    , _nonprimary_key_restrictions(::make_shared<single_column_restrictions>(schema))
+    , _partition_range_is_simple(true)
 { }
 #if 0
 static const column_definition*
@@ -335,7 +336,7 @@ statement_restrictions::statement_restrictions(database& db,
    }

    if (!_nonprimary_key_restrictions->empty()) {
-        if (_has_queriable_regular_index) {
+        if (_has_queriable_regular_index && _partition_range_is_simple) {
            _uses_secondary_indexing = true;
        } else if (!allow_filtering) {
            throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
@@ -377,6 +378,7 @@ void statement_restrictions::add_single_column_restriction(::shared_ptr<single_c
                    "Only EQ and IN relation are supported on the partition key (unless you use the token() function or allow filtering)");
        }
        _partition_key_restrictions = _partition_key_restrictions->merge_to(_schema, restriction);
+        _partition_range_is_simple &= !find(restriction->expression, expr::oper_t::IN);
    } else if (def.is_clustering_key()) {
        _clustering_columns_restrictions = _clustering_columns_restrictions->merge_to(_schema, restriction);
    } else {
@@ -1103,16 +1105,27 @@ bool statement_restrictions::need_filtering() const {
        // clustering restrictions.  Therefore, a continuous clustering range is guaranteed.
        return false;
    }
-    if (!_clustering_columns_restrictions->needs_filtering(*_schema)) { // Guaranteed continuous clustering range.
-        return false;
-    }
-    // Now we know there are some clustering-column restrictions that are out-of-order or not EQ.  A naive base-table
-    // query must be filtered.  What about an index-table query?  That can only avoid filtering if there is exactly one
-    // EQ supported by an index.
-    return !(_clustering_columns_restrictions->size() == 1 && _has_queriable_ck_index);

-    // TODO: it is also possible to avoid filtering here if a non-empty CK prefix is specified and token_known, plus
-    // there's exactly one out-of-order-but-index-supported clustering-column restriction.
+    if (_has_queriable_ck_index && _uses_secondary_indexing) {
+        // In cases where we use an index, clustering column restrictions might cause the need for filtering.
+        // TODO: This is overly conservative, there are some cases when this returns true but filtering
+        // is not needed. Because of that the database will sometimes perform filtering when it's not actually needed.
+        // Query performance shouldn't be affected much, at most we will filter rows that are all correct.
+        // Here are some cases to consider:
+        // On a table with primary key (p, c1, c2, c3) with an index on c3
+        // WHERE c3 = ? - doesn't require filtering
+        // WHERE c1 = ? AND c2 = ? AND c3 = ? - requires filtering
+        // WHERE p = ? AND c1 = ? AND c3 = ? - doesn't require filtering, but we conservatively report it does
+        // WHERE p = ? AND c1 LIKE ? AND c3 = ? - requires filtering
+        // WHERE p = ? AND c1 = ? AND c2 LIKE ? AND c3 = ? - requires filtering
+        // WHERE p = ? AND c1 = ? AND c2 = ? AND c3 = ? - doesn't use an index
+        // WHERE p = ? AND c1 = ? AND c2 < ? AND c3 = ? - doesn't require filtering, but we report it does
+        return _clustering_columns_restrictions->size() > 1;
+    }
+    // Now we know that the query doesn't use an index.
+
+    // The only thing that can still cause filtering is the clustering column restrictions.
+    return _clustering_columns_restrictions->needs_filtering(*_schema);
 }

 void statement_restrictions::validate_secondary_index_selections(bool selects_only_static_columns) {
--- a/cql3/restrictions/statement_restrictions.hh
+++ b/cql3/restrictions/statement_restrictions.hh
@@ -107,6 +107,8 @@ private:
    std::optional<expr::expression> _where; ///< The entire WHERE clause.
    std::vector<expr::expression> _clustering_prefix_restrictions; ///< Parts of _where defining the clustering slice.

+    bool _partition_range_is_simple; ///< False iff _partition_range_restrictions imply a Cartesian product.
+
 public:
    /**
     * Creates a new empty <code>StatementRestrictions</code>.
--- a/cql3/statements/authentication_statement.cc
+++ b/cql3/statements/authentication_statement.cc
@@ -46,13 +46,7 @@ uint32_t cql3::statements::authentication_statement::get_bound_terms() const {
    return 0;
 }

-bool cql3::statements::authentication_statement::depends_on_keyspace(
-                const sstring& ks_name) const {
-    return false;
-}
-
-bool cql3::statements::authentication_statement::depends_on_column_family(
-                const sstring& cf_name) const {
+bool cql3::statements::authentication_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
    return false;
 }

--- a/cql3/statements/authentication_statement.hh
+++ b/cql3/statements/authentication_statement.hh
@@ -56,9 +56,7 @@ public:

    uint32_t get_bound_terms() const override;

-    bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    bool depends_on_column_family(const sstring& cf_name) const override;
+    bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;

--- a/cql3/statements/authorization_statement.cc
+++ b/cql3/statements/authorization_statement.cc
@@ -46,13 +46,7 @@ uint32_t cql3::statements::authorization_statement::get_bound_terms() const {
    return 0;
 }

-bool cql3::statements::authorization_statement::depends_on_keyspace(
-                const sstring& ks_name) const {
-    return false;
-}
-
-bool cql3::statements::authorization_statement::depends_on_column_family(
-                const sstring& cf_name) const {
+bool cql3::statements::authorization_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
    return false;
 }

--- a/cql3/statements/authorization_statement.hh
+++ b/cql3/statements/authorization_statement.hh
@@ -60,9 +60,7 @@ public:

    uint32_t get_bound_terms() const override;

-    bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    bool depends_on_column_family(const sstring& cf_name) const override;
+    bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;

--- a/cql3/statements/batch_statement.cc
+++ b/cql3/statements/batch_statement.cc
@@ -93,14 +93,9 @@ batch_statement::batch_statement(type type_,
 {
 }

-bool batch_statement::depends_on_keyspace(const sstring& ks_name) const
+bool batch_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
 {
-    return false;
-}
-
-bool batch_statement::depends_on_column_family(const sstring& cf_name) const
-{
-    return false;
+    return boost::algorithm::any_of(_statements, [&ks_name, &cf_name] (auto&& s) { return s.statement->depends_on(ks_name, cf_name); });
 }

 uint32_t batch_statement::get_bound_terms() const
--- a/cql3/statements/batch_statement.hh
+++ b/cql3/statements/batch_statement.hh
@@ -120,9 +120,7 @@ public:
                    std::unique_ptr<attributes> attrs,
                    cql_stats& stats);

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual uint32_t get_bound_terms() const override;

--- a/cql3/statements/create_index_statement.cc
+++ b/cql3/statements/create_index_statement.cc
@@ -307,6 +307,13 @@ create_index_statement::announce_migration(query_processor& qp) const {
                    format("Index {} is a duplicate of existing index {}", index.name(), existing_index.value().name()));
        }
    }
+    auto index_table_name = secondary_index::index_table_name(accepted_name);
+    if (db.has_schema(keyspace(), index_table_name)) {
+        return make_exception_future<::shared_ptr<cql_transport::event::schema_change>>(
+            exceptions::invalid_request_exception(format("Index {} cannot be created, because table {} already exists",
+                accepted_name, index_table_name))
+        );
+    }
    ++_cql_stats->secondary_index_creates;
    schema_builder builder{schema};
    builder.with_index(index);
--- a/cql3/statements/modification_statement.cc
+++ b/cql3/statements/modification_statement.cc
@@ -540,12 +540,8 @@ modification_statement::validate(service::storage_proxy&, const service::client_
    }
 }

-bool modification_statement::depends_on_keyspace(const sstring& ks_name) const {
-    return keyspace() == ks_name;
-}
-
-bool modification_statement::depends_on_column_family(const sstring& cf_name) const {
-    return column_family() == cf_name;
+bool modification_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
+    return keyspace() == ks_name && (!cf_name || column_family() == *cf_name);
 }

 void modification_statement::add_operation(::shared_ptr<operation> op) {
--- a/cql3/statements/modification_statement.hh
+++ b/cql3/statements/modification_statement.hh
@@ -173,9 +173,7 @@ public:
    // Validate before execute, using client state and current schema
    void validate(service::storage_proxy&, const service::client_state& state) const override;

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    void add_operation(::shared_ptr<operation> op);

--- a/cql3/statements/schema_altering_statement.cc
+++ b/cql3/statements/schema_altering_statement.cc
@@ -67,12 +67,7 @@ future<> schema_altering_statement::grant_permissions_to_creator(const service::
    return make_ready_future<>();
 }

-bool schema_altering_statement::depends_on_keyspace(const sstring& ks_name) const
-{
-    return false;
-}
-
-bool schema_altering_statement::depends_on_column_family(const sstring& cf_name) const
+bool schema_altering_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
 {
    return false;
 }
--- a/cql3/statements/schema_altering_statement.hh
+++ b/cql3/statements/schema_altering_statement.hh
@@ -81,9 +81,7 @@ protected:
     */
    virtual future<> grant_permissions_to_creator(const service::client_state&) const;

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual uint32_t get_bound_terms() const override;

--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -190,12 +190,8 @@ void select_statement::validate(service::storage_proxy&, const service::client_s
    // Nothing to do, all validation has been done by raw_statemet::prepare()
 }

-bool select_statement::depends_on_keyspace(const sstring& ks_name) const {
-    return keyspace() == ks_name;
-}
-
-bool select_statement::depends_on_column_family(const sstring& cf_name) const {
-    return column_family() == cf_name;
+bool select_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
+    return keyspace() == ks_name && (!cf_name || column_family() == *cf_name);
 }

 const sstring& select_statement::keyspace() const {
@@ -461,7 +457,7 @@ generate_base_key_from_index_pk(const partition_key& index_pk, const std::option
        if (!view_col) {
            throw std::runtime_error(format("Base key column not found in the view: {}", base_col.name_as_text()));
        }
-        if (base_col.type != view_col->type) {
+        if (base_col.type->without_reversed() != *view_col->type) {
            throw std::runtime_error(format("Mismatched types for base and view columns {}: {} and {}",
                    base_col.name_as_text(), base_col.type->cql3_type_name(), view_col->type->cql3_type_name()));
        }
@@ -965,6 +961,7 @@ lw_shared_ptr<const service::pager::paging_state> indexed_table_select_statement
    }

    auto paging_state_copy = make_lw_shared<service::pager::paging_state>(service::pager::paging_state(*paging_state));
+    paging_state_copy->set_remaining(internal_paging_size);
    paging_state_copy->set_partition_key(std::move(index_pk));
    paging_state_copy->set_clustering_key(std::move(index_ck));
    return std::move(paging_state_copy);
--- a/cql3/statements/select_statement.hh
+++ b/cql3/statements/select_statement.hh
@@ -121,8 +121,7 @@ public:
    virtual uint32_t get_bound_terms() const override;
    virtual future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;
    virtual void validate(service::storage_proxy&, const service::client_state& state) const override;
-    virtual bool depends_on_keyspace(const sstring& ks_name) const;
-    virtual bool depends_on_column_family(const sstring& cf_name) const;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual future<::shared_ptr<cql_transport::messages::result_message>> execute(query_processor& qp,
        service::query_state& state, const query_options& options) const override;
--- a/cql3/statements/truncate_statement.cc
+++ b/cql3/statements/truncate_statement.cc
@@ -66,12 +66,7 @@ std::unique_ptr<prepared_statement> truncate_statement::prepare(database& db,cql
    return std::make_unique<prepared_statement>(::make_shared<truncate_statement>(*this));
 }

-bool truncate_statement::depends_on_keyspace(const sstring& ks_name) const
-{
-    return false;
-}
-
-bool truncate_statement::depends_on_column_family(const sstring& cf_name) const
+bool truncate_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
 {
    return false;
 }
--- a/cql3/statements/truncate_statement.hh
+++ b/cql3/statements/truncate_statement.hh
@@ -60,9 +60,7 @@ public:

    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;

--- a/cql3/statements/update_statement.cc
+++ b/cql3/statements/update_statement.cc
@@ -52,6 +52,7 @@
 #include "types/list.hh"
 #include "types/user.hh"
 #include "concrete_types.hh"
+#include "validation.hh"

 namespace cql3 {

@@ -252,6 +253,7 @@ insert_prepared_json_statement::build_partition_keys(const query_options& option
        exploded.emplace_back(json_value->second);
    }
    auto pkey = partition_key::from_optional_exploded(*s, std::move(exploded));
+    validation::validate_cql_key(*s, pkey);
    auto k = query::range<query::ring_position>::make_singular(dht::decorate_key(*s, std::move(pkey)));
    ranges.emplace_back(std::move(k));
    return ranges;
--- a/cql3/statements/use_statement.cc
+++ b/cql3/statements/use_statement.cc
@@ -73,12 +73,7 @@ std::unique_ptr<prepared_statement> use_statement::prepare(database& db, cql_sta

 }

-bool use_statement::depends_on_keyspace(const sstring& ks_name) const
-{
-    return false;
-}
-
-bool use_statement::depends_on_column_family(const sstring& cf_name) const
+bool use_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
 {
    return false;
 }
--- a/cql3/statements/use_statement.hh
+++ b/cql3/statements/use_statement.hh
@@ -61,9 +61,7 @@ public:

    virtual uint32_t get_bound_terms() const override;

-    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
-
-    virtual bool depends_on_column_family(const sstring& cf_name) const override;
+    virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;

    virtual future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;

--- a/database.cc
+++ b/database.cc
@@ -747,10 +747,8 @@ void database::set_format(sstables::sstable_version_types format) {
 void database::set_format_by_config() {
    if (_cfg.enable_sstables_md_format()) {
        set_format(sstables::sstable_version_types::md);
-    } else if (_cfg.enable_sstables_mc_format()) {
-        set_format(sstables::sstable_version_types::mc);
    } else {
-        set_format(sstables::sstable_version_types::la);
+        set_format(sstables::sstable_version_types::mc);
    }
 }

@@ -977,10 +975,9 @@ bool database::update_column_family(schema_ptr new_schema) {
    return columns_changed;
 }

-future<> database::remove(const column_family& cf) noexcept {
+void database::remove(const table& cf) noexcept {
    auto s = cf.schema();
    auto& ks = find_keyspace(s->ks_name());
-    _querier_cache.evict_all_for_table(s->id());
    _column_families.erase(s->id());
    ks.metadata()->remove_column_family(s);
    _ks_cf_to_uuid.erase(std::make_pair(s->ks_name(), s->cf_name()));
@@ -991,20 +988,26 @@ future<> database::remove(const column_family& cf) noexcept {
            // Drop view mutations received after base table drop.
        }
    }
-    co_return;
 }

 future<> database::drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func tsf, bool snapshot) {
    auto& ks = find_keyspace(ks_name);
    auto uuid = find_uuid(ks_name, cf_name);
    auto cf = _column_families.at(uuid);
-    co_await remove(*cf);
+    remove(*cf);
    cf->clear_views();
-    co_return co_await cf->await_pending_ops().then([this, &ks, cf, tsf = std::move(tsf), snapshot] {
-        return truncate(ks, *cf, std::move(tsf), snapshot).finally([this, cf] {
-            return cf->stop();
-        });
-    }).finally([cf] {});
+    co_await cf->await_pending_ops();
+    _querier_cache.evict_all_for_table(cf->schema()->id());
+    std::exception_ptr ex;
+    try {
+        co_await truncate(ks, *cf, std::move(tsf), snapshot);
+    } catch (...) {
+        ex = std::current_exception();
+    }
+    co_await cf->stop();
+    if (ex) {
+        std::rethrow_exception(std::move(ex));
+    }
 }

 const utils::UUID& database::find_uuid(std::string_view ks, std::string_view cf) const {
@@ -1386,44 +1389,6 @@ database::existing_index_names(const sstring& ks_name, const sstring& cf_to_excl
    return names;
 }

-// Based on:
-//  - org.apache.cassandra.db.AbstractCell#reconcile()
-//  - org.apache.cassandra.db.BufferExpiringCell#reconcile()
-//  - org.apache.cassandra.db.BufferDeletedCell#reconcile()
-int
-compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
-    if (left.timestamp() != right.timestamp()) {
-        return left.timestamp() > right.timestamp() ? 1 : -1;
-    }
-    if (left.is_live() != right.is_live()) {
-        return left.is_live() ? -1 : 1;
-    }
-    if (left.is_live()) {
-        auto c = compare_unsigned(left.value(), right.value());
-        if (c != 0) {
-            return c;
-        }
-        if (left.is_live_and_has_ttl() != right.is_live_and_has_ttl()) {
-            // prefer expiring cells.
-            return left.is_live_and_has_ttl() ? 1 : -1;
-        }
-        if (left.is_live_and_has_ttl() && left.expiry() != right.expiry()) {
-            return left.expiry() < right.expiry() ? -1 : 1;
-        }
-    } else {
-        // Both are deleted
-        if (left.deletion_time() != right.deletion_time()) {
-            // Origin compares big-endian serialized deletion time. That's because it
-            // delegates to AbstractCell.reconcile() which compares values after
-            // comparing timestamps, which in case of deleted cells will hold
-            // serialized expiry.
-            return (uint64_t) left.deletion_time().time_since_epoch().count()
-                   < (uint64_t) right.deletion_time().time_since_epoch().count() ? -1 : 1;
-        }
-    }
-    return 0;
-}
-
 future<std::tuple<lw_shared_ptr<query::result>, cache_temperature>>
 database::query(schema_ptr s, const query::read_command& cmd, query::result_options opts, const dht::partition_range_vector& ranges,
                tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) {
@@ -1957,7 +1922,11 @@ sstring database::get_available_index_name(const sstring &ks_name, const sstring
    auto base_name = index_metadata::get_default_index_name(cf_name, index_name_root);
    sstring accepted_name = base_name;
    int i = 0;
-    while (existing_names.contains(accepted_name)) {
+    auto name_accepted = [&] {
+        auto index_table_name = secondary_index::index_table_name(accepted_name);
+        return !has_schema(ks_name, index_table_name) && !existing_names.contains(accepted_name);
+    };
+    while (!name_accepted()) {
        accepted_name = base_name + "_" + std::to_string(++i);
    }
    return accepted_name;
--- a/database.hh
+++ b/database.hh
@@ -239,9 +239,13 @@ public:
        return _memtables.back();
    }

-    // The caller has to make sure the element exist before calling this.
+    // #8904 - this method is akin to std::set::erase(key_type), not
+    // erase(iterator): it must tolerate an element that is not present.
    void erase(const shared_memtable& element) {
-        _memtables.erase(boost::range::find(_memtables, element));
+        auto i = boost::range::find(_memtables, element);
+        if (i != _memtables.end()) {
+            _memtables.erase(i);
+        }
    }
    void clear() {
        _memtables.clear();
@@ -924,7 +928,7 @@ public:
        return _pending_writes_phaser.start();
    }

-    future<> await_pending_writes() {
+    future<> await_pending_writes() noexcept {
        return _pending_writes_phaser.advance_and_await();
    }

@@ -936,7 +940,7 @@ public:
        return _pending_reads_phaser.start();
    }

-    future<> await_pending_reads() {
+    future<> await_pending_reads() noexcept {
        return _pending_reads_phaser.advance_and_await();
    }

@@ -948,7 +952,7 @@ public:
        return _pending_streams_phaser.start();
    }

-    future<> await_pending_streams() {
+    future<> await_pending_streams() noexcept {
        return _pending_streams_phaser.advance_and_await();
    }

@@ -956,11 +960,11 @@ public:
        return _pending_streams_phaser.operations_in_progress();
    }

-    future<> await_pending_flushes() {
+    future<> await_pending_flushes() noexcept {
        return _pending_flushes_phaser.advance_and_await();
    }

-    future<> await_pending_ops() {
+    future<> await_pending_ops() noexcept {
        return when_all(await_pending_reads(), await_pending_writes(), await_pending_streams(), await_pending_flushes()).discard_result();
    }

@@ -1363,6 +1367,7 @@ private:
    Future update_write_metrics(Future&& f);
    void update_write_metrics_for_timed_out_write();
    future<> create_keyspace(const lw_shared_ptr<keyspace_metadata>&, bool is_bootstrap, system_keyspace system);
+    void remove(const table&) noexcept;
 public:
    static utils::UUID empty_version;

@@ -1546,7 +1551,6 @@ public:

    bool update_column_family(schema_ptr s);
    future<> drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func, bool with_snapshot = true);
-    future<> remove(const column_family&) noexcept;

    const logalloc::region_group& dirty_memory_region_group() const {
        return _dirty_memory_manager.region_group();
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -136,6 +136,7 @@ db::commitlog::config db::commitlog::config::from_db_config(const db::config& cf
    c.extensions = &cfg.extensions();
    c.reuse_segments = cfg.commitlog_reuse_segments();
    c.use_o_dsync = cfg.commitlog_use_o_dsync();
+    c.allow_going_over_size_limit = !cfg.commitlog_use_hard_size_limit();

    return c;
 }
@@ -316,6 +317,7 @@ public:
        uint64_t buffer_list_bytes = 0;
        // size on disk, actually used - i.e. containing data (allocate+cycle)
        uint64_t active_size_on_disk = 0;
+        uint64_t wasted_size_on_disk = 0;
        // size allocated on disk - i.e. files created (new, reserve, recycled)
        uint64_t total_size_on_disk = 0;
        uint64_t requests_blocked_memory = 0;
@@ -419,7 +421,11 @@ public:
    void flush_segments(uint64_t size_to_remove);

 private:
+    class shutdown_marker{};
+
    future<> clear_reserve_segments();
+    void abort_recycled_list(std::exception_ptr);
+    void abort_deletion_promise(std::exception_ptr);

    future<> rename_file(sstring, sstring) const;
    size_t max_request_controller_units() const;
@@ -433,6 +439,7 @@ private:
    timer<clock_type> _timer;
    future<> replenish_reserve();
    future<> _reserve_replenisher;
+    future<> _background_sync;
    seastar::gate _gate;
    uint64_t _new_counter = 0;
 };
@@ -541,6 +548,9 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c

    void end_flush() {
        _segment_manager->end_flush();
+        if (can_delete()) {
+            _segment_manager->discard_unused_segments();
+        }
    }

 public:
@@ -584,6 +594,7 @@ public:
            clogger.debug("Segment {} is no longer active and will submitted for delete now", *this);
            ++_segment_manager->totals.segments_destroyed;
            _segment_manager->totals.active_size_on_disk -= file_position();
+            _segment_manager->totals.wasted_size_on_disk -= (_size_on_disk - file_position());
            _segment_manager->add_file_to_delete(_file_name, _desc);
        } else if (_segment_manager->cfg.warn_about_segments_left_on_disk_after_shutdown) {
            clogger.warn("Segment {} is dirty and is left on disk.", *this);
@@ -695,7 +706,14 @@ public:
    }
    future<sseg_ptr> close() {
        _closed = true;
-        return sync().then([] (sseg_ptr s) { return s->flush(); }).then([] (sseg_ptr s) { return s->terminate(); });
+        return sync().then([] (sseg_ptr s) {
+            return s->flush();
+        }).then([](sseg_ptr s) {
+            return s->terminate();
+        }).then([](sseg_ptr s) {
+            s->_segment_manager->totals.wasted_size_on_disk += (s->_size_on_disk - s->file_position());
+            return s;
+        });
    }
    future<sseg_ptr> do_flush(uint64_t pos) {
        auto me = shared_from_this();
@@ -1137,13 +1155,15 @@ db::commitlog::segment_manager::segment_manager(config c)

        return cfg;
    }())
-    , max_size(std::min<size_t>(std::numeric_limits<position_type>::max(), std::max<size_t>(cfg.commitlog_segment_size_in_mb, 1) * 1024 * 1024))
+    , max_size(std::min<size_t>(std::numeric_limits<position_type>::max() / (1024 * 1024), std::max<size_t>(cfg.commitlog_segment_size_in_mb, 1)) * 1024 * 1024)
    , max_mutation_size(max_size >> 1)
    , max_disk_size(size_t(std::ceil(cfg.commitlog_total_space_in_mb / double(smp::count))) * 1024 * 1024)
    // our threshold for trying to force a flush. needs heristics, for now max - segment_size/2.
    , disk_usage_threshold(cfg.commitlog_flush_threshold_in_mb.has_value() 
        ? size_t(std::ceil(*cfg.commitlog_flush_threshold_in_mb / double(smp::count))) * 1024 * 1024 
-        : (max_disk_size - (max_disk_size > (max_size/2) ? (max_size/2) : 0)))
+        : (max_disk_size -
+            (max_disk_size >= (max_size*2) ? max_size
+                : (max_disk_size > (max_size/2) ? (max_size/2) : max_disk_size/3))))
    , _flush_semaphore(cfg.max_active_flushes)
    // That is enough concurrency to allow for our largest mutation (max_mutation_size), plus
    // an existing in-flight buffer. Since we'll force the cycling() of any buffer that is bigger
@@ -1153,6 +1173,7 @@ db::commitlog::segment_manager::segment_manager(config c)
    , _reserve_segments(1)
    , _recycled_segments(std::numeric_limits<size_t>::max())
    , _reserve_replenisher(make_ready_future<>())
+    , _background_sync(make_ready_future<>())
 {
    assert(max_size > 0);
    assert(max_mutation_size < segment::multi_entry_size_magic);
@@ -1190,6 +1211,12 @@ future<> db::commitlog::segment_manager::replenish_reserve() {
                    return make_ready_future<>();
                });
            }).handle_exception([](std::exception_ptr ep) {
+                try {
+                    std::rethrow_exception(ep);
+                } catch (shutdown_marker&) {
+                    return make_ready_future<>();
+                } catch (...) {
+                }
                clogger.warn("Exception in segment reservation: {}", ep);
                return sleep(100ms);
            });
@@ -1334,6 +1361,10 @@ void db::commitlog::segment_manager::create_counters(const sstring& metrics_cate
                       sm::description("Holds a size of disk space in bytes used for data so far. "
                                       "A too high value indicates that we have some bottleneck in the writing to sstables path.")),

+        sm::make_gauge("disk_slack_end_bytes", totals.wasted_size_on_disk,
+                       sm::description("Holds a size of disk space in bytes unused because of segment switching (end slack). "
+                                       "A too high value indicates that we do not write enough data to each segment.")),
+
        sm::make_gauge("memory_buffer_bytes", totals.buffer_list_bytes,
                       sm::description("Holds the total number of bytes in internal memory buffers.")),
    });
@@ -1370,7 +1401,8 @@ void db::commitlog::segment_manager::flush_segments(uint64_t size_to_remove) {

    // Now get a set of used CF ids:
    std::unordered_set<cf_id_type> ids;
-    std::for_each(_segments.begin(), _segments.end() - 1, [&ids](sseg_ptr& s) {
+    auto e = std::find_if(_segments.begin(), _segments.end(), std::mem_fn(&segment::is_still_allocating));
+    std::for_each(_segments.begin(), e, [&ids](sseg_ptr& s) {
        for (auto& id : s->_cf_dirty | boost::adaptors::map_keys) {
            ids.insert(id);
        }
@@ -1446,6 +1478,9 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
                        m += s;
                    }
                    auto s = co_await f.dma_write(max_size - rem, std::move(v), service::get_local_commitlog_priority());
+                    if (!s) [[unlikely]] {
+                        on_internal_error(clogger, format("dma_write returned 0: max_size={} rem={} iovec.n={}", max_size, rem, n));
+                    }
                    rem -= s;
                }

@@ -1466,6 +1501,8 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
                }
            }
        }
+
+        f = make_checked_file(commit_error_handler, std::move(f));
    } catch (...) {
        ep = std::current_exception();
        commit_error_handler(ep);
@@ -1511,7 +1548,19 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
    if (!cfg.allow_going_over_size_limit && max_disk_size != 0 && totals.total_size_on_disk >= max_disk_size) {
        clogger.debug("Disk usage ({} MB) exceeds maximum ({} MB) - allocation will wait...", totals.total_size_on_disk/(1024*1024), max_disk_size/(1024*1024));
        auto f = cfg.reuse_segments ? _recycled_segments.not_empty() :  _disk_deletions.get_shared_future();
-        return f.then([this] {
+        if (!f.available()) {
+            _new_counter = 0; // zero this so timer task does not duplicate the below flush
+            flush_segments(0); // force memtable flush already
+        }
+        return f.handle_exception([this](auto ep) {
+            try {
+                std::rethrow_exception(ep);
+            } catch (shutdown_marker&) {
+                throw;
+            } catch (...) {
+            }
+            clogger.warn("Exception while waiting for segments {}. Will retry allocation...", ep);
+        }).then([this] {
            return allocate_segment();
        });
    }
@@ -1533,7 +1582,8 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
            clogger.debug("Increased segment reserve count to {}", _reserve_segments.max_size());
        }
        // if we have no reserve and we're above/at limits, make background task a little more eager.
-        if (!_shutdown && totals.total_size_on_disk >= disk_usage_threshold) {
+        auto cur = totals.active_size_on_disk + totals.wasted_size_on_disk;
+        if (!_shutdown && cur >= disk_usage_threshold) {
            _timer.cancel();
            _timer.arm(std::chrono::milliseconds(0));
        }
@@ -1670,7 +1720,10 @@ future<> db::commitlog::segment_manager::clear_reserve_segments() {

 future<> db::commitlog::segment_manager::sync_all_segments() {
    clogger.debug("Issuing sync for all segments");
-    return parallel_for_each(_segments, [] (sseg_ptr s) {
+    // #8952 - calls that do sync/cycle can end up altering
+    // _segments (end_flush()->discard_unused())
+    auto def_copy = _segments;
+    return parallel_for_each(def_copy, [] (sseg_ptr s) {
        return s->sync().then([](sseg_ptr s) {
            clogger.debug("Synced segment {}", *s);
        });
@@ -1679,7 +1732,10 @@ future<> db::commitlog::segment_manager::sync_all_segments() {

 future<> db::commitlog::segment_manager::shutdown_all_segments() {
    clogger.debug("Issuing shutdown for all segments");
-    return parallel_for_each(_segments, [] (sseg_ptr s) {
+    // #8952 - calls that do sync/cycle can end up altering
+    // _segments (end_flush()->discard_unused())
+    auto def_copy = _segments;
+    return parallel_for_each(def_copy, [] (sseg_ptr s) {
        return s->shutdown().then([](sseg_ptr s) {
            clogger.debug("Shutdown segment {}", *s);
        });
@@ -1693,13 +1749,36 @@ future<> db::commitlog::segment_manager::shutdown() {
        // Wait for all pending requests to finish. Need to sync first because segments that are
        // alive may be holding semaphore permits.
        auto block_new_requests = get_units(_request_controller, max_request_controller_units());
-        return sync_all_segments().then([this, block_new_requests = std::move(block_new_requests)] () mutable {
+        return sync_all_segments().then_wrapped([this, block_new_requests = std::move(block_new_requests)] (future<> f) mutable {
+            if (f.failed()) {
+                clogger.error("Syncing all segments failed during shutdown: {}. Aborting.", f.get_exception());
+                abort();
+            }
            return std::move(block_new_requests).then([this] (auto permits) {
                _timer.cancel(); // no more timer calls
                _shutdown = true; // no re-arm, no create new segments.
+
+                // do a discard + delete sweep to force 
+                // gate holder (i.e. replenish) to wake up
+                discard_unused_segments();
+                auto f = do_pending_deletes().then([this] {
+                    auto ep = std::make_exception_ptr(shutdown_marker{});
+                    if (_recycled_segments.empty()) {
+                        abort_recycled_list(ep);
+                    }
+                    abort_deletion_promise(ep);
+                    return std::exchange(_background_sync, make_ready_future<>());
+                });
+
+
                // Now first wait for periodic task to finish, then sync and close all
                // segments, flushing out any remaining data.
-                return _gate.close().then(std::bind(&segment_manager::shutdown_all_segments, this)).finally([permits = std::move(permits)] { });
+                return _gate.close().then([this, f = std::move(f)]() mutable {
+                    return std::move(f).then(std::bind(&segment_manager::shutdown_all_segments, this)).handle_exception([](std::exception_ptr ex) {
+                        clogger.error("Shutting down all segments failed during shutdown: {}. Aborting.", ex);
+                        abort();
+                    });
+                }).finally([permits = std::move(permits)] { });
            });
        }).finally([this] {
            discard_unused_segments();
@@ -1741,41 +1820,89 @@ future<> db::commitlog::segment_manager::delete_file(const sstring& filename) {
 }

 future<> db::commitlog::segment_manager::delete_segments(std::vector<sstring> files) {
-    auto i = files.begin();
-    auto e = files.end();
+    if (files.empty()) {
+        co_return;
+    }

-    return parallel_for_each(i, e, [this](auto& filename) {
-        auto f = make_ready_future();
-        auto exts = cfg.extensions;
-        if (exts && !exts->commitlog_file_extensions().empty()) {
-            f = parallel_for_each(exts->commitlog_file_extensions(), [&](auto& ext) {
-                return ext->before_delete(filename);
-            });
-        }
-        return f.finally([&] {
-            // We allow reuse of the segment if the current disk size is less than shard max.
-            auto usage = totals.total_size_on_disk;
-            if (!_shutdown && cfg.reuse_segments && usage <= max_disk_size) {
-                descriptor d(next_id(), "Recycled-" + cfg.fname_prefix);
-                auto dst = this->filename(d);
+    clogger.debug("Delete segments {}", files);

-                clogger.debug("Recycling segment file {}", filename);
-                // must rename the file since we must ensure the
-                // data is not replayed. Changing the name will
-                // cause header ID to be invalid in the file -> ignored
-                return rename_file(filename, dst).then([this, dst]() mutable {
-                    auto b = _recycled_segments.push(std::move(dst));
-                    assert(b); // we set this to max_size_t so...
-                    return make_ready_future<>();
-                }).handle_exception([this, filename](auto&&) {
-                    return delete_file(filename);
-                });
+    std::exception_ptr recycle_error;
+
+    while (!files.empty()) {
+        auto filename = std::move(files.back());
+        files.pop_back();
+
+        try {
+            auto exts = cfg.extensions;
+            if (exts && !exts->commitlog_file_extensions().empty()) {
+                for (auto& ext : exts->commitlog_file_extensions()) {
+                    co_await ext->before_delete(filename);
+                }
            }
-            return delete_file(filename);
-        }).handle_exception([&filename](auto ep) {
-            clogger.error("Could not delete segment {}: {}", filename, ep);
-        });
-    }).finally([files = std::move(files)] {});
+
+            // We allow reuse of the segment if the current disk size is less than shard max.
+            if (cfg.reuse_segments) {
+                auto usage = totals.total_size_on_disk;
+                auto recycle = usage <= max_disk_size;
+
+                // if total size is not a multiple of segment size, we need
+                // to check if we are the overlap segment, and no one else
+                // can be recycled. If so, let this one live so allocation
+                // can proceed. We assume/hope a future delete will kill
+                // files down to under the threshold, but we should expect
+                // to stomp around nearest multiple of segment size, not 
+                // the actual limit.
+                if (!recycle && _recycled_segments.empty() && files.empty()) {
+                    auto size = co_await seastar::file_size(filename);
+                    recycle = (usage - size) <= max_disk_size;
+                }
+
+                if (recycle) {
+                    descriptor d(next_id(), "Recycled-" + cfg.fname_prefix);
+                    auto dst = this->filename(d);
+
+                    clogger.debug("Recycling segment file {}", filename);
+                    // must rename the file since we must ensure the
+                    // data is not replayed. Changing the name will
+                    // cause header ID to be invalid in the file -> ignored
+                    try {
+                        co_await rename_file(filename, dst);
+                        auto b = _recycled_segments.push(std::move(dst));
+                        assert(b); // we set this to max_size_t so...
+                        continue;
+                    } catch (...) {
+                        recycle_error = std::current_exception();
+                        // fallthrough
+                    }
+                }
+            }
+            co_await delete_file(filename);
+        } catch (...) {
+            clogger.error("Could not delete segment {}: {}", filename, std::current_exception());
+        }
+    }
+
+    // #8376 - if we had an error in recycling (disk rename?), and no elements
+    // are available, we could have waiters hoping they will get segments.
+    // abort the queue (wakes up any existing waiters - futures), and let them
+    // retry. Since we did deletions instead, disk footprint should allow
+    // for new allocs at least. Or more likely, everything is broken, but
+    // we will at least make more noise.
+    if (recycle_error && _recycled_segments.empty()) {
+        abort_recycled_list(recycle_error);
+    }
+}
+
+void db::commitlog::segment_manager::abort_recycled_list(std::exception_ptr ep) {
+    // must not be called while the list still holds elements; that would leak files.
+    assert(_recycled_segments.empty());
+    _recycled_segments.abort(ep);
+    // and ensure next lap(s) still has a queue
+    _recycled_segments = queue<sstring>(std::numeric_limits<size_t>::max());
+}
+
+void db::commitlog::segment_manager::abort_deletion_promise(std::exception_ptr ep) {
+    std::exchange(_disk_deletions, {}).set_exception(ep);
 }

 future<> db::commitlog::segment_manager::do_pending_deletes() {
@@ -1814,9 +1941,15 @@ future<> db::commitlog::segment_manager::clear() {
 * Called by timer in periodic mode.
 */
 void db::commitlog::segment_manager::sync() {
-    for (auto s : _segments) {
-        (void)s->sync(); // we do not care about waiting...
-    }
+    auto f = std::exchange(_background_sync, make_ready_future<>());
+    // #8952 - calls that do sync/cycle can end up altering
+    // _segments (end_flush()->discard_unused())
+    auto def_copy = _segments;
+    _background_sync = parallel_for_each(def_copy, [](sseg_ptr s) {
+        return s->sync().discard_result();
+    }).then([f = std::move(f)]() mutable {
+        return std::move(f);
+    });
 }

 void db::commitlog::segment_manager::on_timer() {
@@ -1831,10 +1964,11 @@ void db::commitlog::segment_manager::on_timer() {
        // above threshold, request flush.
        if (_new_counter > 0) {
            auto max = disk_usage_threshold;
-            auto cur = totals.active_size_on_disk;
+            auto cur = totals.active_size_on_disk + totals.wasted_size_on_disk;
+
            if (max != 0 && cur >= max) {
-                _new_counter = 0;
                clogger.debug("Used size on disk {} MB exceeds local threshold {} MB", cur / (1024 * 1024), max / (1024 * 1024));
+                _new_counter = 0;
                flush_segments(cur - max);
            }
        }
@@ -2449,8 +2583,19 @@ std::vector<sstring> db::commitlog::get_active_segment_names() const {
    return _segment_manager->get_active_names();
 }

+uint64_t db::commitlog::disk_limit() const {
+    return _segment_manager->max_disk_size;
+}
+
+uint64_t db::commitlog::disk_footprint() const {
+    return _segment_manager->totals.total_size_on_disk;
+}
+
 uint64_t db::commitlog::get_total_size() const {
-    return _segment_manager->totals.active_size_on_disk + _segment_manager->totals.buffer_list_bytes;
+    return _segment_manager->totals.active_size_on_disk
+        + _segment_manager->totals.wasted_size_on_disk
+        + _segment_manager->totals.buffer_list_bytes
+        ;
 }

 uint64_t db::commitlog::get_completed_tasks() const {
--- a/db/commitlog/commitlog.hh
+++ b/db/commitlog/commitlog.hh
@@ -140,7 +140,7 @@ public:
        bool reuse_segments = true;
        bool use_o_dsync = false;
        bool warn_about_segments_left_on_disk_after_shutdown = true;
-        bool allow_going_over_size_limit = false;
+        bool allow_going_over_size_limit = true;

        const db::extensions * extensions = nullptr;
    };
@@ -336,6 +336,16 @@ public:
     */
    uint64_t max_active_flushes() const;

+    /**
+     * Return disk footprint
+     */
+    uint64_t disk_footprint() const;
+
+    /**
+     * Return configured disk footprint limit
+     */
+    uint64_t disk_limit() const;
+
    future<> clear();

    const config& active_config() const;
--- a/db/config.cc
+++ b/db/config.cc
@@ -371,6 +371,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "Whether or not to re-use commitlog segments when finished instead of deleting them. Can improve commitlog latency on some file systems.\n")
    , commitlog_use_o_dsync(this, "commitlog_use_o_dsync", value_status::Used, true,
        "Whether or not to use O_DSYNC mode for commitlog segments IO. Can improve commitlog latency on some file systems.\n")
+    , commitlog_use_hard_size_limit(this, "commitlog_use_hard_size_limit", value_status::Used, false,
+        "Whether or not to use a hard size limit for commitlog disk usage. Default is false. Enabling this can cause latency spikes, whereas the default can lead to occasional disk usage peaks.\n")
    /* Compaction settings */
    /* Related information: Configuring compaction */
    , compaction_preheat_key_cache(this, "compaction_preheat_key_cache", value_status::Unused, true,
@@ -747,8 +749,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        " Performance is affected to some extent as a result. Useful to help debugging problems that may arise at another layers.")
    , cpu_scheduler(this, "cpu_scheduler", value_status::Used, true, "Enable cpu scheduling")
    , view_building(this, "view_building", value_status::Used, true, "Enable view building; should only be set to false when the node is experience issues due to view building")
-    , enable_sstables_mc_format(this, "enable_sstables_mc_format", value_status::Used, true, "Enable SSTables 'mc' format to be used as the default file format")
-    , enable_sstables_md_format(this, "enable_sstables_md_format", value_status::Used, true, "Enable SSTables 'md' format to be used as the default file format (requires enable_sstables_mc_format)")
+    , enable_sstables_mc_format(this, "enable_sstables_mc_format", value_status::Unused, true, "Enable SSTables 'mc' format to be used as the default file format")
+    , enable_sstables_md_format(this, "enable_sstables_md_format", value_status::Used, true, "Enable SSTables 'md' format to be used as the default file format")
    , enable_dangerous_direct_import_of_cassandra_counters(this, "enable_dangerous_direct_import_of_cassandra_counters", value_status::Used, false, "Only turn this option on if you want to import tables from Cassandra containing counters, and you are SURE that no counters in that table were created in a version earlier than Cassandra 2.1."
        " It is not enough to have ever since upgraded to newer versions of Cassandra. If you EVER used a version earlier than 2.1 in the cluster where these SSTables come from, DO NOT TURN ON THIS OPTION! You will corrupt your data. You have been warned.")
    , enable_shard_aware_drivers(this, "enable_shard_aware_drivers", value_status::Used, true, "Enable native transport drivers to use connection-per-shard for better performance")
@@ -906,8 +908,11 @@ db::fs::path db::config::get_conf_sub(db::fs::path sub) {
 }

 bool db::config::check_experimental(experimental_features_t::feature f) const {
-    if (experimental() && f != experimental_features_t::UNUSED && f != experimental_features_t::UNUSED_CDC) {
-        return true;
+    if (experimental()
+        && f != experimental_features_t::UNUSED
+        && f != experimental_features_t::UNUSED_CDC
+        && f != experimental_features_t::RAFT) {
+            return true;
    }
    const auto& optval = experimental_features();
    return find(begin(optval), end(optval), enum_option<experimental_features_t>{f}) != end(optval);
@@ -962,11 +967,17 @@ std::unordered_map<sstring, db::experimental_features_t::feature> db::experiment
    // to UNUSED switch for a while, then remove altogether.
    // Change Data Capture is no longer experimental. Map it
    // to UNUSED_CDC switch for a while, then remove altogether.
-    return {{"lwt", UNUSED}, {"udf", UDF}, {"cdc", UNUSED_CDC}, {"alternator-streams", ALTERNATOR_STREAMS}};
+    return {
+        {"lwt", UNUSED},
+        {"udf", UDF},
+        {"cdc", UNUSED_CDC},
+        {"alternator-streams", ALTERNATOR_STREAMS},
+        {"raft", RAFT}
+    };
 }

 std::vector<enum_option<db::experimental_features_t>> db::experimental_features_t::all() {
-    return {UDF, ALTERNATOR_STREAMS};
+    return {UDF, ALTERNATOR_STREAMS, RAFT};
 }

 template struct utils::config_file::named_value<seastar::log_level>;
--- a/db/config.hh
+++ b/db/config.hh
@@ -82,7 +82,9 @@ namespace db {

 /// Enumeration of all valid values for the `experimental` config entry.
 struct experimental_features_t {
-    enum feature { UNUSED, UDF, UNUSED_CDC, ALTERNATOR_STREAMS };
+    // NOTE: RAFT feature is not enabled via `experimental` umbrella flag.
+    // This option should be enabled explicitly.
+    enum feature { UNUSED, UDF, UNUSED_CDC, ALTERNATOR_STREAMS, RAFT };
    static std::unordered_map<sstring, feature> map(); // See enum_option.
    static std::vector<enum_option<experimental_features_t>> all();
 };
@@ -163,6 +165,7 @@ public:
    named_value<int64_t> commitlog_total_space_in_mb;
    named_value<bool> commitlog_reuse_segments;
    named_value<bool> commitlog_use_o_dsync;
+    named_value<bool> commitlog_use_hard_size_limit;
    named_value<bool> compaction_preheat_key_cache;
    named_value<uint32_t> concurrent_compactors;
    named_value<uint32_t> in_memory_compaction_limit_in_mb;
--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -441,30 +441,15 @@ bool manager::end_point_hints_manager::sender::can_send() noexcept {

    try {
        auto ep_state_ptr = _gossiper. get_endpoint_state_for_endpoint_ptr(end_point_key());
-        if (!ep_state_ptr || !ep_state_ptr->is_alive()) {
+        if (ep_state_ptr && ep_state_ptr->is_alive()) {
+            _state.remove(state::ep_state_left_the_ring);
+            return true;
+        } else {
            if (!_state.contains(state::ep_state_left_the_ring)) {
-                auto ep_gossip_state_val = _gossiper.get_gossip_status(end_point_key());
-                // If node has been removed from the ring it's going to be removed from the gossiper::endpoint_state
-                // map as well.
-                //
-                // However if it is still in the map then there are 3 possible STATE values for the node when it's in a DN/UN
-                // state:
-                //    - NORMAL
-                //    - SHUTDOWN
-                //    - "" - when node is in a DN state but was DOWN since the local node started up. In this case
-                //      gossiper::endpoint_state[node][STATUS] value is going to be not set at all.
-                _state.set_if<state::ep_state_left_the_ring>(
-                    !ep_state_ptr ||
-                    (ep_gossip_state_val != gms::versioned_value::STATUS_NORMAL &&
-                    ep_gossip_state_val != gms::versioned_value::SHUTDOWN &&
-                    ep_gossip_state_val != gms::versioned_value::STATUS_UNKNOWN)
-                );
+                _state.set_if<state::ep_state_left_the_ring>(!_shard_manager.local_db().get_token_metadata().is_member(end_point_key()));
            }
            // send the hints out if the destination Node is part of the ring - we will send to all new replicas in this case
            return _state.contains(state::ep_state_left_the_ring);
-        } else {
-            _state.remove(state::ep_state_left_the_ring);
-            return true;
        }
    } catch (...) {
        return false;
--- a/db/hints/manager.hh
+++ b/db/hints/manager.hh
@@ -235,7 +235,7 @@ public:
            bool send_one_file(const sstring& fname);

            /// \brief Checks if we can still send hints.
-            /// \return TRUE if the destination Node is either ALIVE or has left the NORMAL state (e.g. has been decommissioned).
+            /// \return TRUE if the destination Node is either ALIVE or has left the ring (e.g. after decommission or removenode).
            bool can_send() noexcept;

            /// \brief Restore a mutation object from the hints file entry.
--- a/db/large_data_handler.cc
+++ b/db/large_data_handler.cc
@@ -124,7 +124,7 @@ static future<> try_record(std::string_view large_table, const sstables::sstable
    const auto sstable_name = sst.get_filename();
    std::string pk_str = key_to_str(partition_key.to_partition_key(s), s);
    auto timestamp = db_clock::now();
-    large_data_logger.warn("Writing large {} {}/{}: {}{} ({} bytes)", desc, ks_name, cf_name, pk_str, extra_path, size);
+    large_data_logger.warn("Writing large {} {}/{}: {}{} ({} bytes) to {}", desc, ks_name, cf_name, pk_str, extra_path, size, sstable_name);
    return db::qctx->execute_cql(req, ks_name, cf_name, sstable_name, size, pk_str, timestamp, args...)
            .discard_result()
            .handle_exception([ks_name, cf_name, large_table, sstable_name] (std::exception_ptr ep) {
@@ -140,9 +140,10 @@ future<> cql_table_large_data_handler::record_large_partitions(const sstables::s
 void cql_table_large_data_handler::log_too_many_rows(const sstables::sstable& sst, const sstables::key& partition_key,
        uint64_t rows_count) const {
    const schema& s = *sst.get_schema();
-    large_data_logger.warn("Writing a partition with too many rows [{}/{}:{}] ({} rows)",
+    const auto sstable_name = sst.get_filename();
+    large_data_logger.warn("Writing a partition with too many rows [{}/{}:{}] ({} rows) to {}",
                           s.ks_name(), s.cf_name(), partition_key.to_partition_key(s).with_schema(s),
-                           rows_count);
+                           rows_count, sstable_name);
 }

 future<> cql_table_large_data_handler::record_large_cells(const sstables::sstable& sst, const sstables::key& partition_key,
--- a/db/system_keyspace.cc
+++ b/db/system_keyspace.cc
@@ -1751,7 +1751,7 @@ future<> set_bootstrap_state(bootstrap_state state) {
    });
 }

-std::vector<schema_ptr> all_tables() {
+std::vector<schema_ptr> all_tables(const db::config& cfg) {
    std::vector<schema_ptr> r;
    auto schema_tables = db::schema_tables::all_tables(schema_features::full());
    std::copy(schema_tables.begin(), schema_tables.end(), std::back_inserter(r));
@@ -1760,12 +1760,14 @@ std::vector<schema_ptr> all_tables() {
                    compactions_in_progress(), compaction_history(),
                    sstable_activity(), clients(), size_estimates(), large_partitions(), large_rows(), large_cells(),
                    scylla_local(), db::schema_tables::scylla_table_schema_history(),
-                    raft(), raft_snapshots(),
                    v3::views_builds_in_progress(), v3::built_views(),
                    v3::scylla_views_builds_in_progress(),
                    v3::truncated(),
                    v3::cdc_local(),
    });
+    if (cfg.check_experimental(db::experimental_features_t::RAFT)) {
+        r.insert(r.end(), {raft(), raft_snapshots()});
+    }
    // legacy schema
    r.insert(r.end(), {
                    // TODO: once we migrate hints/batchlog and add convertor
@@ -1797,7 +1799,7 @@ static bool maybe_write_in_user_memory(schema_ptr s, database& db) {
 future<> make(database& db) {
    auto enable_cache = db.get_config().enable_cache();
    bool durable = db.get_config().data_file_directories().size() > 0;
-    for (auto&& table : all_tables()) {
+    for (auto&& table : all_tables(db.get_config())) {
        auto ks_name = table->ks_name();
        if (!db.has_keyspace(ks_name)) {
            auto ksm = make_lw_shared<keyspace_metadata>(ks_name,
--- a/db/system_keyspace.hh
+++ b/db/system_keyspace.hh
@@ -82,6 +82,8 @@ namespace db {

 sstring system_keyspace_name();

+class config;
+
 namespace system_keyspace {

 static constexpr auto NAME = "system";
@@ -210,7 +212,7 @@ future<> remove_endpoint(gms::inet_address ep);
 future<> set_scylla_local_param(const sstring& key, const sstring& value);
 future<std::optional<sstring>> get_scylla_local_param(const sstring& key);

-std::vector<schema_ptr> all_tables();
+std::vector<schema_ptr> all_tables(const db::config& cfg);
 future<> make(database& db);

 /// overloads
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -1162,7 +1162,7 @@ get_view_natural_endpoint(const sstring& keyspace_name,
 }

 static future<> apply_to_remote_endpoints(gms::inet_address target, std::vector<gms::inet_address>&& pending_endpoints,
-        frozen_mutation_and_schema& mut, const dht::token& base_token, const dht::token& view_token,
+        frozen_mutation_and_schema&& mut, const dht::token& base_token, const dht::token& view_token,
        service::allow_hints allow_hints, tracing::trace_state_ptr tr_state) {

    tracing::trace(tr_state, "Sending view update for {}.{} to {}, with pending endpoints = {}; base token = {}; view token = {}",
@@ -1181,7 +1181,7 @@ static future<> apply_to_remote_endpoints(gms::inet_address target, std::vector<
 // appropriate paired replicas. This is done asynchronously - we do not wait
 // for the writes to complete.
 future<> mutate_MV(
-        const dht::token& base_token,
+        dht::token base_token,
        std::vector<frozen_mutation_and_schema> view_updates,
        db::view::stats& stats,
        cf_stats& cf_stats,
@@ -1197,28 +1197,7 @@ future<> mutate_MV(
        auto& keyspace_name = mut.s->ks_name();
        auto target_endpoint = get_view_natural_endpoint(keyspace_name, base_token, view_token);
        auto remote_endpoints = service::get_local_storage_service().get_token_metadata().pending_endpoints_for(view_token, keyspace_name);
-        auto maybe_account_failure = [s = mut.s, tr_state, &stats, &cf_stats, base_token, view_token, units = pending_view_updates.split(mut.fm.representation().size())] (
-                future<>&& f,
-                gms::inet_address target,
-                bool is_local,
-                size_t remotes) {
-            if (f.failed()) {
-                stats.view_updates_failed_local += is_local;
-                stats.view_updates_failed_remote += remotes;
-                cf_stats.total_view_updates_failed_local += is_local;
-                cf_stats.total_view_updates_failed_remote += remotes;
-                auto ep = f.get_exception();
-                tracing::trace(tr_state, "Failed to apply {}view update for {} and {} remote endpoints",
-                        seastar::value_of([is_local]{return is_local ? "local " : "";}), target, remotes);
-                vlogger.error("Error applying view update to {} (view: {}.{}, base token: {}, view token: {}): {}",
-                        target, s->ks_name(), s->cf_name(), base_token, view_token, ep);
-                return make_exception_future<>(std::move(ep));
-            } else {
-                tracing::trace(tr_state, "Successfully applied {}view update for {} and {} remote endpoints",
-                        seastar::value_of([is_local]{return is_local ? "local " : "";}), target, remotes);
-                return make_ready_future<>();
-            }
-        };
+        auto sem_units = pending_view_updates.split(mut.fm.representation().size());

        // First, find the local endpoint and ensure that if it exists,
        // it will be the target endpoint. That way, all endpoints in the
@@ -1255,11 +1234,20 @@ future<> mutate_MV(
            tracing::trace(tr_state, "Locally applying view update for {}.{}; base token = {}; view token = {}",
                    mut.s->ks_name(), mut.s->cf_name(), base_token, view_token);
            future<> local_view_update = service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr, std::move(tr_state), db::commitlog::force_sync::no).then_wrapped(
-                    [&stats,
-                     maybe_account_failure = std::move(maybe_account_failure),
-                     mut_ptr = std::move(mut_ptr)] (future<>&& f) {
+                    [s = mut.s, &stats, &cf_stats, tr_state, base_token, view_token, my_address, mut_ptr = std::move(mut_ptr),
+                            units = sem_units.split(sem_units.count())] (future<>&& f) {
                --stats.writes;
-                return maybe_account_failure(std::move(f), utils::fb_utilities::get_broadcast_address(), true, 0);
+                if (f.failed()) {
+                    ++stats.view_updates_failed_local;
+                    ++cf_stats.total_view_updates_failed_local;
+                    auto ep = f.get_exception();
+                    tracing::trace(tr_state, "Failed to apply local view update for {}", my_address);
+                    vlogger.error("Error applying view update to {} (view: {}.{}, base token: {}, view token: {}): {}",
+                            my_address, s->ks_name(), s->cf_name(), base_token, view_token, ep);
+                    return make_exception_future<>(std::move(ep));
+                }
+                tracing::trace(tr_state, "Successfully applied local view update for {}", my_address);
+                return make_ready_future<>();
            });
            fs->push_back(std::move(local_view_update));
            // We just applied a local update to the target endpoint, so it should now be removed
@@ -1281,11 +1269,23 @@ future<> mutate_MV(
            size_t updates_pushed_remote = remote_endpoints.size() + 1;
            stats.view_updates_pushed_remote += updates_pushed_remote;
            cf_stats.total_view_updates_pushed_remote += updates_pushed_remote;
-            future<> view_update = apply_to_remote_endpoints(*target_endpoint, std::move(remote_endpoints), mut, base_token, view_token, allow_hints, tr_state).then_wrapped(
-                    [target_endpoint,
-                     updates_pushed_remote,
-                     maybe_account_failure = std::move(maybe_account_failure)] (future<>&& f) mutable {
-                return maybe_account_failure(std::move(f), std::move(*target_endpoint), false, updates_pushed_remote);
+            schema_ptr s = mut.s;
+            future<> view_update = apply_to_remote_endpoints(*target_endpoint, std::move(remote_endpoints), std::move(mut), base_token, view_token, allow_hints, tr_state).then_wrapped(
+                    [s = std::move(s), &stats, &cf_stats, tr_state, base_token, view_token, target_endpoint, updates_pushed_remote,
+                            units = sem_units.split(sem_units.count())] (future<>&& f) mutable {
+                if (f.failed()) {
+                    stats.view_updates_failed_remote += updates_pushed_remote;
+                    cf_stats.total_view_updates_failed_remote += updates_pushed_remote;
+                    auto ep = f.get_exception();
+                    tracing::trace(tr_state, "Failed to apply view update for {} and {} remote endpoints",
+                            *target_endpoint, updates_pushed_remote);
+                    vlogger.error("Error applying view update to {} (view: {}.{}, base token: {}, view token: {}): {}",
+                            *target_endpoint, s->ks_name(), s->cf_name(), base_token, view_token, ep);
+                    return make_exception_future<>(std::move(ep));
+                }
+                tracing::trace(tr_state, "Successfully applied view update for {} and {} remote endpoints",
+                        *target_endpoint, updates_pushed_remote);
+                return make_ready_future<>();
            });
            if (wait_for_all) {
                fs->push_back(std::move(view_update));
--- a/db/view/view.hh
+++ b/db/view/view.hh
@@ -153,7 +153,7 @@ query::clustering_row_ranges calculate_affected_clustering_ranges(
 struct wait_for_all_updates_tag {};
 using wait_for_all_updates = bool_class<wait_for_all_updates_tag>;
 future<> mutate_MV(
-        const dht::token& base_token,
+        dht::token base_token,
        std::vector<frozen_mutation_and_schema> view_updates,
        db::view::stats& stats,
        cf_stats& cf_stats,
--- a/digester.hh
+++ b/digester.hh
@@ -58,7 +58,8 @@ public:

    template<typename T, typename... Args>
    void feed_hash(const T& value, Args&&... args) {
-        std::visit([&] (auto& hasher) noexcept -> void {
+        // FIXME uncomment the noexcept marking once clang bug 50994 is fixed or gcc compilation is turned on
+        std::visit([&] (auto& hasher) /* noexcept(noexcept(::feed_hash(hasher, value, args...))) */ -> void {
            ::feed_hash(hasher, value, std::forward<Args>(args)...);
        }, _impl);
    };
--- a/dirty_memory_manager.hh
+++ b/dirty_memory_manager.hh
@@ -215,6 +215,12 @@ public:
        });
    }

+    future<flush_permit> get_all_flush_permits() { // resolves to a flush permit obtained with _max_background_work serializer units
+        return get_units(_background_work_flush_serializer, _max_background_work).then([this] (auto&& units) { // presumably the serializer's full capacity (hence "all") -- TODO confirm
+            return this->get_flush_permit(std::move(units)); // the acquired units are handed over to get_flush_permit()
+        });
+    }
+
    bool has_extraneous_flushes_requested() const {
        return _extraneous_flushes > 0;
    }
--- a/dist/common/scripts/scylla_coredump_setup
+++ b/dist/common/scripts/scylla_coredump_setup
@@ -67,6 +67,7 @@ Description=Save coredump to scylla data directory
 Conflicts=umount.target
 Before=scylla-server.service
 After=local-fs.target
+DefaultDependencies=no

 [Mount]
 What=/var/lib/scylla/coredump
--- a/dist/common/scripts/scylla_cpuscaling_setup
+++ b/dist/common/scripts/scylla_cpuscaling_setup
@@ -22,6 +22,7 @@

 import os
 import sys
+import argparse
 import shlex
 import distro
 from scylla_util import *
@@ -46,7 +47,12 @@ if __name__ == '__main__':
    if os.getuid() > 0:
        print('Requires root permission.')
        sys.exit(1)
-    if not os.path.exists('/sys/devices/system/cpu/cpufreq/policy0/scaling_governor'):
+    parser = argparse.ArgumentParser(description='CPU scaling setup script for Scylla.')
+    parser.add_argument('--force', dest='force', action='store_true',
+                        help='force running setup even CPU scaling unsupported')
+    args = parser.parse_args()
+
+    if not args.force and not os.path.exists('/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor'):
        print('This computer doesn\'t supported CPU scaling configuration.')
        sys.exit(0)
    if not is_debian_variant():
@@ -56,6 +62,11 @@ if __name__ == '__main__':
        if not shutil.which('cpufreq-set'):
            pkg_install('cpufrequtils')
    if is_debian_variant():
+        try:
+            ondemand = systemd_unit('ondemand')
+            ondemand.disable()
+        except Exception:  # best effort: failing to disable 'ondemand' must not abort setup, but Ctrl-C/SystemExit should still propagate
+            pass
        cfg = sysconfig_parser('/etc/default/cpufrequtils')
        cfg.set('GOVERNOR', 'performance')
        cfg.commit()
--- a/dist/common/scripts/scylla_io_setup
+++ b/dist/common/scripts/scylla_io_setup
@@ -229,6 +229,52 @@ if __name__ == "__main__":
                disk_properties["read_bandwidth"] = 507338935 * nr_disks
                disk_properties["write_iops"] = 57100 * nr_disks
                disk_properties["write_bandwidth"] = 483141731 * nr_disks
+            elif idata.instance_class() in ("c6gd", "m6gd", "r6gd", "x2gd"):
+                if idata.instance_size() == "medium":
+                    disk_properties["read_iops"] = 14808
+                    disk_properties["read_bandwidth"] = 77869147
+                    disk_properties["write_iops"] = 5972
+                    disk_properties["write_bandwidth"] = 32820302
+                elif idata.instance_size() == "large":
+                    disk_properties["read_iops"] = 29690
+                    disk_properties["read_bandwidth"] = 157712240
+                    disk_properties["write_iops"] = 12148
+                    disk_properties["write_bandwidth"] = 65978069
+                elif idata.instance_size() == "xlarge":
+                    disk_properties["read_iops"] = 59688
+                    disk_properties["read_bandwidth"] = 318762880
+                    disk_properties["write_iops"] = 24449
+                    disk_properties["write_bandwidth"] = 133311808
+                elif idata.instance_size() == "2xlarge":
+                    disk_properties["read_iops"] = 119353
+                    disk_properties["read_bandwidth"] = 634795733
+                    disk_properties["write_iops"] = 49069
+                    disk_properties["write_bandwidth"] = 266841680
+                elif idata.instance_size() == "4xlarge":
+                    disk_properties["read_iops"] = 237196
+                    disk_properties["read_bandwidth"] = 1262309504
+                    disk_properties["write_iops"] = 98884
+                    disk_properties["write_bandwidth"] = 533938080
+                elif idata.instance_size() == "8xlarge":
+                    disk_properties["read_iops"] = 442945
+                    disk_properties["read_bandwidth"] = 2522688939
+                    disk_properties["write_iops"] = 166021
+                    disk_properties["write_bandwidth"] = 1063041152
+                elif idata.instance_size() == "12xlarge":
+                    disk_properties["read_iops"] = 353691 * nr_disks
+                    disk_properties["read_bandwidth"] = 1908192256 * nr_disks
+                    disk_properties["write_iops"] = 146732 * nr_disks
+                    disk_properties["write_bandwidth"] = 806399360 * nr_disks
+                elif idata.instance_size() == "16xlarge":
+                    disk_properties["read_iops"] = 426893 * nr_disks
+                    disk_properties["read_bandwidth"] = 2525781589 * nr_disks
+                    disk_properties["write_iops"] = 161740 * nr_disks
+                    disk_properties["write_bandwidth"] = 1063389952 * nr_disks
+                elif idata.instance_size() == "metal":
+                    disk_properties["read_iops"] = 416257 * nr_disks
+                    disk_properties["read_bandwidth"] = 2527296683 * nr_disks
+                    disk_properties["write_iops"] = 156326 * nr_disks
+                    disk_properties["write_bandwidth"] = 1063657088 * nr_disks
            properties_file = open(etcdir() + "/scylla.d/io_properties.yaml", "w")
            yaml.dump({ "disks": [ disk_properties ] }, properties_file,  default_flow_style=False)
            ioconf = open(etcdir() + "/scylla.d/io.conf", "w")
@@ -254,7 +300,7 @@ if __name__ == "__main__":
                    disk_properties["read_bandwidth"] = 2650 * mbs
                    disk_properties["write_iops"] = 360000
                    disk_properties["write_bandwidth"] = 1400 * mbs
-                elif nr_disks == "16":
+                elif nr_disks == 16:
                    disk_properties["read_iops"] = 1600000
                    disk_properties["read_bandwidth"] = 4521251328
                    #below is google, above is our measured
@@ -263,7 +309,7 @@ if __name__ == "__main__":
                    disk_properties["write_bandwidth"] = 2759452672
                    #below is google, above is our measured
                    #disk_properties["write_bandwidth"] = 3120 * mbs
-                elif nr_disks == "24":
+                elif nr_disks == 24:
                    disk_properties["read_iops"] = 2400000
                    disk_properties["read_bandwidth"] = 5921532416
                    #below is google, above is our measured
--- a/dist/common/scripts/scylla_prepare
+++ b/dist/common/scripts/scylla_prepare
@@ -28,7 +28,6 @@ import distro

 from scylla_util import *
 from subprocess import run
-from multiprocessing import cpu_count

 def get_mode_cpuset(nic, mode):
    mode_cpu_mask = run('/opt/scylladb/scripts/perftune.py --tune net --nic {} --mode {} --get-cpu-mask-quiet'.format(nic, mode), shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
@@ -100,16 +99,6 @@ def verify_cpu():
                    print('\nIf this is a virtual machine, please update its CPU feature configuration or upgrade to a newer hypervisor.')
                    sys.exit(1)

-def configure_aio_slots():
-    with open('/proc/sys/fs/aio-max-nr') as f:
-        aio_max_nr = int(f.read())
-    # (10000 + 1024 + 2) * ncpus for scylla,
-    # 65536 for other apps
-    required_aio_slots = cpu_count() * 11026 + 65536
-    if aio_max_nr < required_aio_slots:
-        with open('/proc/sys/fs/aio-max-nr', 'w') as f:
-            f.write(str(required_aio_slots))
-
 if __name__ == '__main__':
    verify_cpu()

@@ -124,8 +113,6 @@ if __name__ == '__main__':
        os.remove('/etc/scylla/ami_disabled')
        sys.exit(1)

-    configure_aio_slots()
-
    if mode == 'virtio':
        tap = cfg.get('TAP')
        user = cfg.get('USER')
@@ -155,4 +142,3 @@ if __name__ == '__main__':
            print(f'Exception occurred while creating perftune.yaml: {e}')
            print('To fix the error, please re-run scylla_setup.')
            sys.exit(1)
-
--- a/dist/common/scripts/scylla_raid_setup
+++ b/dist/common/scripts/scylla_raid_setup
@@ -115,10 +115,6 @@ if __name__ == '__main__':
        pkg_install('xfsprogs')
    if not shutil.which('mdadm'):
        pkg_install('mdadm')
-    try:
-        md_service = systemd_unit('mdmonitor.service')
-    except SystemdException:
-        md_service = systemd_unit('mdadm.service')

    print('Creating {type} for scylla using {nr_disk} disk(s): {disks}'.format(type='RAID0' if raid else 'XFS volume', nr_disk=len(disks), disks=args.disks))
    procs=[]
@@ -153,17 +149,15 @@ if __name__ == '__main__':
    os.makedirs(mount_at, exist_ok=True)

    uuid = run(f'blkid -s UUID -o value {fsdev}', shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
-    after = 'local-fs.target'
-    if raid:
-        after += f' {md_service}'
    unit_data = f'''
 [Unit]
 Description=Scylla data directory
 Before=scylla-server.service
-After={after}
+After=local-fs.target
+DefaultDependencies=no

 [Mount]
-What=UUID={uuid}
+What=/dev/disk/by-uuid/{uuid}
 Where={mount_at}
 Type=xfs
 Options=noatime
@@ -183,8 +177,6 @@ WantedBy=multi-user.target
            f.write(f'RequiresMountsFor={mount_at}\n')

    systemd_unit.reload()
-    md_service.enable()
-    md_service.start()
    mount = systemd_unit(mntunit_bn)
    mount.start()
    if args.enable_on_nextboot:
--- a/dist/common/scripts/scylla_util.py
+++ b/dist/common/scripts/scylla_util.py
@@ -36,6 +36,7 @@ from subprocess import run, DEVNULL
 import distro
 from scylla_sysconfdir import SYSCONFDIR

+from multiprocessing import cpu_count

 def scriptsdir_p():
    p = Path(sys.argv[0]).resolve()
@@ -146,6 +147,11 @@ class gcp_instance:
            if af == socket.AF_INET:
                addr, port = sa
                if addr == "169.254.169.254":
+                    # Make sure it is not on GKE
+                    try:
+                        gcp_instance().__instance_metadata("machine-type")
+                    except urllib.error.HTTPError:
+                        return False
                    return True
        return False

--- a/dist/common/sysctl.d/99-scylla-aio.conf
+++ b/dist/common/sysctl.d/99-scylla-aio.conf
@@ -0,0 +1,2 @@
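+# Raise max AIO events: 5578536 = 500 * 11026 + 65536, i.e. (10000 + 1024 + 2) slots per CPU for Scylla evaluated for 500 CPUs, plus 65536 for other apps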
+fs.aio-max-nr = 5578536
--- a/dist/common/systemd/scylla-fstrim.timer
+++ b/dist/common/systemd/scylla-fstrim.timer
@@ -1,7 +1,5 @@
 [Unit]
 Description=Run Scylla fstrim daily
-After=scylla-server.service
-BindsTo=scylla-server.service

 [Timer]
 OnCalendar=Sat *-*-* 00:00:00
--- a/dist/debian/control.template
+++ b/dist/debian/control.template
@@ -12,6 +12,8 @@ Architecture: any
 Description: Scylla database main configuration file
 Scylla is a highly scalable, eventually consistent, distributed,
 partitioned row DB.
+ .
+ Dedicated to the memory of Alberto José Araújo, a coworker and a friend.
 Replaces: %{product}-server (<< 1.1)
 Conflicts: %{product}-server (<< 1.1)

--- a/dist/debian/debian/scylla-kernel-conf.postinst
+++ b/dist/debian/debian/scylla-kernel-conf.postinst
@@ -11,6 +11,7 @@ else
    sysctl -p/usr/lib/sysctl.d/99-scylla-sched.conf || :
    sysctl -p/usr/lib/sysctl.d/99-scylla-vm.conf || :
    sysctl -p/usr/lib/sysctl.d/99-scylla-inotify.conf || :
+    sysctl -p/usr/lib/sysctl.d/99-scylla-aio.conf || :
 fi

 #DEBHELPER#
--- a/dist/debian/debian/scylla-server.postrm
+++ b/dist/debian/debian/scylla-server.postrm
@@ -12,8 +12,6 @@ case "$1" in
        if [ "$1" = "purge" ]; then
            rm -rf /etc/systemd/system/scylla-server.service.d/
        fi
-        rm -f /etc/systemd/system/var-lib-systemd-coredump.mount
-        rm -f /etc/systemd/system/var-lib-scylla.mount
        ;;
 esac

--- a/dist/docker/redhat/Dockerfile
+++ b/dist/docker/redhat/Dockerfile
@@ -5,8 +5,8 @@ MAINTAINER Avi Kivity <avi@cloudius-systems.com>
 ENV container docker

 # The SCYLLA_REPO_URL argument specifies the URL to the RPM repository this Docker image uses to install Scylla. The default value is the Scylla's unstable RPM repository, which contains the daily build.
-ARG SCYLLA_REPO_URL=http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo
-ARG VERSION=4.5.dev
+ARG SCYLLA_REPO_URL=downloads.scylladb.com/unstable/scylla/branch-4.5/rpm/centos/latest/
+ARG VERSION=4.5.7

 ADD scylla_bashrc /scylla_bashrc

--- a/dist/docker/redhat/etc/supervisord.conf.d/scylla-server.conf
+++ b/dist/docker/redhat/etc/supervisord.conf.d/scylla-server.conf
@@ -4,3 +4,4 @@ stdout_logfile=/dev/stdout
 stdout_logfile_maxbytes=0
 stderr_logfile=/dev/stderr
 stderr_logfile_maxbytes=0
+stopwaitsecs=900
--- a/dist/docker/redhat/scyllasetup.py
+++ b/dist/docker/redhat/scyllasetup.py
@@ -121,12 +121,13 @@ class ScyllaSetup:
        if self._apiAddress is not None:
            args += ["--api-address %s" % self._apiAddress]

-        if self._alternatorPort is not None:
+        if self._alternatorAddress is not None:
            args += ["--alternator-address %s" % self._alternatorAddress]
+
+        if self._alternatorPort is not None:
            args += ["--alternator-port %s" % self._alternatorPort]

        if self._alternatorHttpsPort is not None:
-            args += ["--alternator-address %s" % self._alternatorAddress]
            args += ["--alternator-https-port %s" % self._alternatorHttpsPort]

        if self._alternatorWriteIsolation is not None:
--- a/dist/redhat/scylla.spec
+++ b/dist/redhat/scylla.spec
@@ -7,7 +7,7 @@ Group:          Applications/Databases
 License:        AGPLv3
 URL:            http://www.scylladb.com/
 Source0:        %{reloc_pkg}
-Requires:       %{product}-server = %{version} %{product}-conf = %{version} %{product}-kernel-conf = %{version} %{product}-jmx = %{version} %{product}-tools = %{version} %{product}-tools-core = %{version} %{product}-node-exporter = %{version}
+Requires:       %{product}-server = %{version} %{product}-conf = %{version} %{product}-python3 = %{version} %{product}-kernel-conf = %{version} %{product}-jmx = %{version} %{product}-tools = %{version} %{product}-tools-core = %{version} %{product}-node-exporter = %{version}
 Obsoletes:	scylla-server < 1.1

 %global _debugsource_template %{nil}
@@ -30,6 +30,8 @@ partitioned row DB.
 This package installs all required packages for ScyllaDB,  including
 %{product}-server, %{product}-jmx, %{product}-tools, %{product}-tools-core %{product}-node-exporter.

+Dedicated to the memory of Alberto José Araújo, a coworker and a friend.
+
 # this is needed to prevent python compilation error on CentOS (#2235)
 %if 0%{?rhel}
 %global __os_install_post    \
@@ -54,7 +56,7 @@ Group:          Applications/Databases
 Summary:        The Scylla database server
 License:        AGPLv3
 URL:            http://www.scylladb.com/
-Requires:       %{product}-conf %{product}-python3
+Requires:       %{product}-conf  = %{version} %{product}-python3 = %{version}
 Conflicts:      abrt
 AutoReqProv:    no

@@ -141,9 +143,9 @@ rm -rf $RPM_BUILD_ROOT
 %ghost /etc/systemd/system/scylla-server.service.d/capabilities.conf
 %ghost /etc/systemd/system/scylla-server.service.d/mounts.conf
 /etc/systemd/system/scylla-server.service.d/dependencies.conf
-%ghost /etc/systemd/system/var-lib-systemd-coredump.mount
+%ghost %config /etc/systemd/system/var-lib-systemd-coredump.mount
 %ghost /etc/systemd/system/scylla-cpupower.service
-%ghost /etc/systemd/system/var-lib-scylla.mount
+%ghost %config /etc/systemd/system/var-lib-scylla.mount

 %package conf
 Group:          Applications/Databases
@@ -211,6 +213,7 @@ if Scylla is the main application on your server and you wish to optimize its la
 /usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
 /usr/lib/systemd/systemd-sysctl 99-scylla-vm.conf >/dev/null 2>&1 || :
 /usr/lib/systemd/systemd-sysctl 99-scylla-inotify.conf >/dev/null 2>&1 || :
+/usr/lib/systemd/systemd-sysctl 99-scylla-aio.conf >/dev/null 2>&1 || :

 %files kernel-conf
 %defattr(-,root,root)
@@ -228,13 +231,13 @@ Prometheus exporter for machine metrics, written in Go with pluggable metric col

 %post node-exporter
 if [ $1 -eq 1 ] ; then
-    /usr/bin/systemctl preset node-exporter.service ||:
+    /usr/bin/systemctl preset scylla-node-exporter.service ||:
 fi

 %preun node-exporter
 if [ $1 -eq 0 ] ; then
-    /usr/bin/systemctl --no-reload disable node-exporter.service ||:
-    /usr/bin/systemctl stop node-exporter.service ||:
+    /usr/bin/systemctl --no-reload disable scylla-node-exporter.service ||:
+    /usr/bin/systemctl stop scylla-node-exporter.service ||:
 fi

 %postun node-exporter
--- a/distributed_loader.cc
+++ b/distributed_loader.cc
@@ -478,7 +478,7 @@ distributed_loader::get_sstables_from_upload_dir(distributed<database>& db, sstr
            sstables_on_shards[this_shard_id()] = d.get_unsorted_sstables();
        }).get();

-        return std::make_tuple(table_id, sstables_on_shards);
+        return std::make_tuple(table_id, std::move(sstables_on_shards));
    });
 }

--- a/gms/feature_service.cc
+++ b/gms/feature_service.cc
@@ -98,14 +98,6 @@ feature_config feature_config_from_db_config(db::config& cfg, std::set<sstring>

    fcfg._disabled_features = std::move(disabled);

-    if (!cfg.enable_sstables_mc_format()) {
-        if (cfg.enable_sstables_md_format()) {
-            throw std::runtime_error(
-                    "You must use both enable_sstables_mc_format and enable_sstables_md_format "
-                    "to enable SSTables md format support");
-        }
-        fcfg._disabled_features.insert(sstring(gms::features::MC_SSTABLE));
-    }
    if (!cfg.enable_sstables_md_format()) {
        fcfg._disabled_features.insert(sstring(gms::features::MD_SSTABLE));
    }
--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -1448,7 +1448,7 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
    logger.trace("marking as alive {}", addr);

    // Do not mark a node with status shutdown as UP.
-    auto status = get_gossip_status(local_state);
+    auto status = sstring(get_gossip_status(local_state));
    if (status == sstring(versioned_value::SHUTDOWN)) {
        logger.warn("Skip marking node {} with status = {} as UP", addr, status);
        return;
@@ -1467,6 +1467,8 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
        return;
    }

+    // Make a copy of the endpoint_state because the code below can yield
+    endpoint_state state = local_state;
    _live_endpoints.push_back(addr);
    if (_endpoints_to_talk_with.empty()) {
        _endpoints_to_talk_with.push_back({addr});
@@ -1478,8 +1480,8 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
        logger.info("InetAddress {} is now UP, status = {}", addr, status);
    }

-    _subscribers.for_each([addr, local_state] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
-        subscriber->on_alive(addr, local_state);
+    _subscribers.for_each([addr, state] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
+        subscriber->on_alive(addr, state);
        logger.trace("Notified {}", fmt::ptr(subscriber.get()));
    });
 }
@@ -1488,11 +1490,12 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
 void gossiper::mark_dead(inet_address addr, endpoint_state& local_state) {
    logger.trace("marking as down {}", addr);
    local_state.mark_dead();
+    endpoint_state state = local_state; // snapshot taken right after mark_dead(); the log line and subscriber callbacks below use this copy, not the caller's reference
    _live_endpoints.resize(std::distance(_live_endpoints.begin(), std::remove(_live_endpoints.begin(), _live_endpoints.end(), addr)));
    _unreachable_endpoints[addr] = now();
-    logger.info("InetAddress {} is now DOWN, status = {}", addr, get_gossip_status(local_state));
-    _subscribers.for_each([addr, local_state] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
-        subscriber->on_dead(addr, local_state);
+    logger.info("InetAddress {} is now DOWN, status = {}", addr, get_gossip_status(state));
+    _subscribers.for_each([addr, state] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
+        subscriber->on_dead(addr, state);
        logger.trace("Notified {}", fmt::ptr(subscriber.get()));
    });
 }
@@ -2131,6 +2134,32 @@ bool gossiper::is_alive(inet_address ep) const {
    return false;
 }

+// Runs inside seastar::async context. Polls until every node in `nodes` is alive on all shards; throws std::runtime_error if `timeout` elapses first.
+void gossiper::wait_alive(std::vector<gms::inet_address> nodes, std::chrono::milliseconds timeout) {
+    auto start_time = std::chrono::steady_clock::now();
+    for (;;) {
+        std::vector<gms::inet_address> live_nodes;
+        for (const auto& node: nodes) {
+            size_t nr_alive = container().map_reduce0([node] (gossiper& g) -> size_t { // counts the gossiper instances that report `node` as alive
+                return g.is_alive(node) ? 1 : 0;
+            }, 0, std::plus<size_t>()).get0();
+            logger.debug("Marked node={} as alive on {} out of {} shards", node, nr_alive, smp::count);
+            if (nr_alive == smp::count) { // a node only counts as live once the tally equals smp::count
+                live_nodes.push_back(node);
+            }
+        }
+        logger.debug("Waited for marking node as up, replace_nodes={}, live_nodes={}", nodes, live_nodes);
+        if (live_nodes.size() == nodes.size()) {
+            break;
+        }
+        if (std::chrono::steady_clock::now() > timeout + start_time) { // deadline is checked after a poll, so at least one full pass always runs
+            throw std::runtime_error(format("Failed to mark node as alive in {} ms, nodes={}, live_nodes={}",
+                    timeout.count(), nodes, live_nodes));
+        }
+        sleep_abortable(std::chrono::milliseconds(100), _abort_source).get(); // wait 100ms before the next poll
+    }
+}
+
 const versioned_value* gossiper::get_application_state_ptr(inet_address endpoint, application_state appstate) const noexcept {
    auto* eps = get_endpoint_state_for_endpoint_ptr(std::move(endpoint));
    if (!eps) {
--- a/gms/gossiper.hh
+++ b/gms/gossiper.hh
@@ -442,6 +442,8 @@ private:
 public:
    bool is_alive(inet_address ep) const;
    bool is_dead_state(const endpoint_state& eps) const;
+    // Wait for nodes to be alive on all shards
+    void wait_alive(std::vector<gms::inet_address> nodes, std::chrono::milliseconds timeout);

    future<> apply_state_locally(std::map<inet_address, endpoint_state> map);

--- a/hashing.hh
+++ b/hashing.hh
@@ -62,7 +62,7 @@ struct appending_hash;
 template<typename H, typename T, typename... Args>
 requires Hasher<H>
 inline
-void feed_hash(H& h, const T& value, Args&&... args) noexcept {
+void feed_hash(H& h, const T& value, Args&&... args) noexcept(noexcept(std::declval<appending_hash<T>>()(h, value, std::forward<Args>(args)...))) { // noexcept iff the forwarded appending_hash<T> call below is noexcept
    appending_hash<T>()(h, value, std::forward<Args>(args)...);
 };

--- a/install.sh
+++ b/install.sh
@@ -150,6 +150,10 @@ EOF
    chmod +x "$install"
 }

+install() { # wrapper so that every later `install` call in this script also passes -Z
+    command install -Z "$@" # `command` skips this function (no recursion); -Z makes install(1) set the default SELinux context on what it creates
+}
+
 installconfig() {
    local perm="$1"
    local src="$2"
@@ -210,13 +214,13 @@ if [ -z "$python3" ]; then
 fi
 rpython3=$(realpath -m "$root/$python3")
 if ! $nonroot; then
-    retc="$root/etc"
-    rsysconfdir="$root/$sysconfdir"
-    rusr="$root/usr"
-    rsystemd="$rusr/lib/systemd/system"
+    retc=$(realpath -m "$root/etc")
+    rsysconfdir=$(realpath -m "$root/$sysconfdir")
+    rusr=$(realpath -m "$root/usr")
+    rsystemd=$(realpath -m "$rusr/lib/systemd/system")
    rdoc="$rprefix/share/doc"
-    rdata="$root/var/lib/scylla"
-    rhkdata="$root/var/lib/scylla-housekeeping"
+    rdata=$(realpath -m "$root/var/lib/scylla")
+    rhkdata=$(realpath -m "$root/var/lib/scylla-housekeeping")
 else
    retc="$rprefix/etc"
    rsysconfdir="$rprefix/$sysconfdir"
@@ -245,6 +249,7 @@ if ! $nonroot; then
    done
 fi
 # scylla-node-exporter
+install -d -m755 "$rsysconfdir" "$rsystemd"
 install -d -m755 "$rprefix"/node_exporter
 install -d -m755 "$rprefix"/node_exporter/licenses
 install -m755 node_exporter/node_exporter "$rprefix"/node_exporter
@@ -278,7 +283,6 @@ fi

 # scylla-server
 install -m755 -d "$rprefix"
-install -m755 -d "$rsysconfdir"
 install -m755 -d "$retc/scylla.d"
 installconfig 644 dist/common/sysconfig/scylla-housekeeping "$rsysconfdir"
 installconfig 644 dist/common/sysconfig/scylla-server "$rsysconfdir"
@@ -286,7 +290,7 @@ for file in dist/common/scylla.d/*.conf; do
    installconfig 644 "$file" "$retc"/scylla.d
 done

-install -d -m755 "$retc"/scylla "$rsystemd" "$rprefix/bin" "$rprefix/libexec" "$rprefix/libreloc" "$rprefix/scripts" "$rprefix/bin"
+install -d -m755 "$retc"/scylla "$rprefix/bin" "$rprefix/libexec" "$rprefix/libreloc" "$rprefix/scripts" "$rprefix/bin"
 install -m644 dist/common/systemd/scylla-fstrim.service -Dt "$rsystemd"
 install -m644 dist/common/systemd/scylla-housekeeping-daily.service -Dt "$rsystemd"
 install -m644 dist/common/systemd/scylla-housekeeping-restart.service -Dt "$rsystemd"
--- a/main.cc
+++ b/main.cc
@@ -388,11 +388,38 @@ static auto defer_verbose_shutdown(const char* what, Func&& func) {
        startlog.info("Shutting down {}", what);
        try {
            func();
+            startlog.info("Shutting down {} was successful", what);
        } catch (...) {
-            startlog.error("Unexpected error shutting down {}: {}", what, std::current_exception());
-            throw;
+            auto ex = std::current_exception();
+            bool do_abort = true;
+            try {
+                std::rethrow_exception(ex);
+            } catch (const std::system_error& e) {
+                // System error codes we consider "environmental",
+                // i.e. not scylla's fault, therefore there is no point in
+                // aborting and dumping core.
+                for (int i : {EIO, EACCES, ENOSPC}) {
+                    if (e.code() == std::error_code(i, std::system_category())) {
+                        do_abort = false;
+                        break;
+                    }
+                }
+            } catch (...) {
+            }
+            auto msg = fmt::format("Unexpected error shutting down {}: {}", what, ex);
+            if (do_abort) {
+                startlog.error("{}: aborting", msg);
+                abort();
+            } else {
+                startlog.error("{}: exiting, at {}", msg, current_backtrace());
+
+                // Call _exit() rather than exit() to exit immediately
+                // without calling exit handlers, avoiding
+                // boost::intrusive::detail::destructor_impl assert failure
+                // from ~segment_pool exit handler.
+                _exit(255);
+            }
        }
-        startlog.info("Shutting down {} was successful", what);
    };

    auto ret = deferred_action(std::move(vfunc));
@@ -591,6 +618,22 @@ int main(int ac, char** av) {
            };
            auto background_reclaim_scheduling_group = make_sched_group("background_reclaim", 50);
            auto maintenance_scheduling_group = make_sched_group("streaming", 200);
+
+            smp::invoke_on_all([&cfg, background_reclaim_scheduling_group] {
+                logalloc::tracker::config st_cfg;
+                st_cfg.defragment_on_idle = cfg->defragment_memory_on_idle();
+                st_cfg.abort_on_lsa_bad_alloc = cfg->abort_on_lsa_bad_alloc();
+                st_cfg.lsa_reclamation_step = cfg->lsa_reclamation_step();
+                st_cfg.background_reclaim_sched_group = background_reclaim_scheduling_group;
+                logalloc::shard_tracker().configure(st_cfg);
+            }).get();
+
+            auto stop_lsa_background_reclaim = defer([&] {
+                smp::invoke_on_all([&] {
+                    return logalloc::shard_tracker().stop();
+                }).get();
+            });
+
            uint16_t api_port = cfg->api_port();
            ctx.api_dir = cfg->api_ui_dir();
            ctx.api_doc = cfg->api_doc_dir();
@@ -716,7 +759,7 @@ int main(int ac, char** av) {
            tracing::backend_registry tracing_backend_registry;
            tracing::register_tracing_keyspace_backend(tracing_backend_registry);
            tracing::tracing::create_tracing(tracing_backend_registry, "trace_keyspace_helper").get();
-            auto stop_tracing = defer_verbose_shutdown("tracing", [] {
+            auto destroy_tracing = defer_verbose_shutdown("tracing instance", [] {
                tracing::tracing::tracing_instance().stop().get();
            });
            supervisor::notify("creating snitch");
@@ -777,13 +820,6 @@ int main(int ac, char** av) {
                mscfg.encrypt = netw::messaging_service::encrypt_what::rack;
            }

-            if (clauth && (mscfg.encrypt == netw::messaging_service::encrypt_what::dc || mscfg.encrypt == netw::messaging_service::encrypt_what::dc)) {
-                startlog.warn("Setting require_client_auth is incompatible with 'rack' and 'dc' internode_encryption values."
-                    " To ensure that mutual TLS authentication is enforced, please set internode_encryption to 'all'. Continuing with"
-                    " potentially insecure configuration."
-                );
-            }
-
            sstring compress_what = cfg->internode_compression();
            if (compress_what == "all") {
                mscfg.compress = netw::messaging_service::compress_what::all;
@@ -1046,12 +1082,20 @@ int main(int ac, char** av) {
            auto stop_proxy_handlers = defer_verbose_shutdown("storage proxy RPC verbs", [&proxy] {
                proxy.invoke_on_all(&service::storage_proxy::uninit_messaging_service).get();
            });
-            supervisor::notify("initializing Raft services");
-            raft_srvs.start(std::ref(messaging), std::ref(gossiper), std::ref(qp)).get();
-            raft_srvs.invoke_on_all(&raft_services::init).get();
+
+            const bool raft_enabled = cfg->check_experimental(db::experimental_features_t::RAFT);
+            if (raft_enabled) {
+                supervisor::notify("initializing Raft services");
+                raft_srvs.start(std::ref(messaging), std::ref(gossiper), std::ref(qp)).get();
+                raft_srvs.invoke_on_all(&raft_services::init).get();
+            }
            auto stop_raft_sc_handlers = defer_verbose_shutdown("Raft services", [&raft_srvs] {
                raft_srvs.invoke_on_all(&raft_services::uninit).get();
            });
+            if (!raft_enabled) {
+                stop_raft_sc_handlers->cancel();
+            }
+
            supervisor::notify("starting streaming service");
            streaming::stream_session::init_streaming_service(db, sys_dist_ks, view_update_generator, messaging).get();
            auto stop_streaming_service = defer_verbose_shutdown("streaming service", [] {
@@ -1172,13 +1216,9 @@ int main(int ac, char** av) {

            supervisor::notify("starting tracing");
            tracing::tracing::start_tracing(qp).get();
-            /*
-             * FIXME -- tracing is stopped inside drain_on_shutdown, which
-             * is deferred later on. If the start aborts before it, the
-             * tracing will remain started and will continue referencing
-             * the query processor. Nowadays the latter is not stopped
-             * either, but when it will, this place shold be fixed too.
-             */
+            auto stop_tracing = defer_verbose_shutdown("tracing", [] {
+                tracing::tracing::stop_tracing().get();
+            });

            startlog.info("SSTable data integrity checker is {}.",
                    cfg->enable_sstable_data_integrity_check() ? "enabled" : "disabled");
@@ -1409,21 +1449,6 @@ int main(int ac, char** av) {
                }).get();
            }

-            smp::invoke_on_all([&cfg, background_reclaim_scheduling_group] {
-                logalloc::tracker::config st_cfg;
-                st_cfg.defragment_on_idle = cfg->defragment_memory_on_idle();
-                st_cfg.abort_on_lsa_bad_alloc = cfg->abort_on_lsa_bad_alloc();
-                st_cfg.lsa_reclamation_step = cfg->lsa_reclamation_step();
-                st_cfg.background_reclaim_sched_group = background_reclaim_scheduling_group;
-                logalloc::shard_tracker().configure(st_cfg);
-            }).get();
-
-            auto stop_lsa_background_reclaim = defer([&] {
-                smp::invoke_on_all([&] {
-                    return logalloc::shard_tracker().stop();
-                }).get();
-            });
-
            seastar::set_abort_on_ebadf(cfg->abort_on_ebadf());
            api::set_server_done(ctx).get();
            supervisor::notify("serving");
--- a/message/messaging_service.cc
+++ b/message/messaging_service.cc
@@ -357,29 +357,9 @@ void messaging_service::do_start_listen() {
        cfg.sched_group = scheduling_group_for_isolation_cookie(isolation_cookie);
        return cfg;
    };
-    if (!_server[0] && _cfg.encrypt != encrypt_what::all) {
+    if (!_server[0]) {
        auto listen = [&] (const gms::inet_address& a, rpc::streaming_domain_type sdomain) {
            so.streaming_domain = sdomain;
-            so.filter_connection = {};
-            switch (_cfg.encrypt) {
-                default:
-                case encrypt_what::none:
-                    break;
-                case encrypt_what::dc:
-                    so.filter_connection = [](const seastar::socket_address& addr) {
-                        auto& snitch = locator::i_endpoint_snitch::get_local_snitch_ptr();
-                        return snitch->get_datacenter(addr) == snitch->get_datacenter(utils::fb_utilities::get_broadcast_address());
-                    };
-                    break;
-                case encrypt_what::rack:
-                    so.filter_connection = [](const seastar::socket_address& addr) {
-                        auto& snitch = locator::i_endpoint_snitch::get_local_snitch_ptr();
-                        return snitch->get_datacenter(addr) == snitch->get_datacenter(utils::fb_utilities::get_broadcast_address())
-                            && snitch->get_rack(addr) == snitch->get_rack(utils::fb_utilities::get_broadcast_address())
-                            ;
-                    };
-                    break;
-            }
            auto addr = socket_address{a, _cfg.port};
            return std::unique_ptr<rpc_protocol_server_wrapper>(new rpc_protocol_server_wrapper(_rpc->protocol(),
                    so, addr, limits));
@@ -389,10 +369,9 @@ void messaging_service::do_start_listen() {
            _server[1] = listen(utils::fb_utilities::get_broadcast_address(), rpc::streaming_domain_type(0x66BB));
        }
    }
-    
+
    if (!_server_tls[0]) {
        auto listen = [&] (const gms::inet_address& a, rpc::streaming_domain_type sdomain) {
-            so.filter_connection = {};
            so.streaming_domain = sdomain;
            return std::unique_ptr<rpc_protocol_server_wrapper>(
                    [this, &so, &a, limits] () -> std::unique_ptr<rpc_protocol_server_wrapper>{
@@ -713,7 +692,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
        remove_error_rpc_client(verb, id);
    }

-    auto must_encrypt = [&id, &verb, this] {
+    auto must_encrypt = [&id, this] {
        if (_cfg.encrypt == encrypt_what::none) {
            return false;
        }
@@ -721,23 +700,14 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
            return true;
        }

-        // if we have dc/rack encryption but this is gossip, we should
-        // use tls anyway, to avoid having mismatched ideas on which 
-        // group we/client are in. 
-        if (verb >= messaging_verb::GOSSIP_DIGEST_SYN && verb <= messaging_verb::GOSSIP_SHUTDOWN) {
-            return true;
-        }
-
        auto& snitch_ptr = locator::i_endpoint_snitch::get_local_snitch_ptr();

-        // either rack/dc need to be in same dc to use non-tls
-        if (snitch_ptr->get_datacenter(id.addr) != snitch_ptr->get_datacenter(utils::fb_utilities::get_broadcast_address())) {
-            return true;
+        if (_cfg.encrypt == encrypt_what::dc) {
+            return snitch_ptr->get_datacenter(id.addr)
+                            != snitch_ptr->get_datacenter(utils::fb_utilities::get_broadcast_address());
        }
-        // if cross-rack tls, check rack.
-        return _cfg.encrypt == encrypt_what::rack &&
-            snitch_ptr->get_rack(id.addr) != snitch_ptr->get_rack(utils::fb_utilities::get_broadcast_address())
-            ;
+        return snitch_ptr->get_rack(id.addr)
+                        != snitch_ptr->get_rack(utils::fb_utilities::get_broadcast_address());
    }();

    auto must_compress = [&id, this] {
@@ -781,12 +751,11 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
        opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;
    }

-    auto baddr = socket_address(utils::fb_utilities::get_broadcast_address(), 0);
    auto client = must_encrypt ?
                    ::make_shared<rpc_protocol_client_wrapper>(_rpc->protocol(), std::move(opts),
-                                    remote_addr, baddr, _credentials) :
+                                    remote_addr, socket_address(), _credentials) :
                    ::make_shared<rpc_protocol_client_wrapper>(_rpc->protocol(), std::move(opts),
-                                    remote_addr, baddr);
+                                    remote_addr);

    auto res = _clients[idx].emplace(id, shard_info(std::move(client)));
    assert(res.second);
--- a/mutation.hh
+++ b/mutation.hh
@@ -188,7 +188,7 @@ stop_iteration consume_clustering_fragments(const schema& s, mutation_partition&
            emit_rt = rts_it != rts_end;
        }
        if (emit_rt) {
-            stop = consumer.consume(std::move(*rts_it));
+            stop = consumer.consume(range_tombstone(std::move(*rts_it), range_tombstone::without_link{}));
            ++rts_it;
        } else {
            stop = consumer.consume(clustering_row(std::move(*crs_it)));
--- a/mutation_reader.cc
+++ b/mutation_reader.cc
@@ -1150,6 +1150,9 @@ flat_mutation_reader evictable_reader::recreate_reader() {
    _range_override.reset();
    _slice_override.reset();

+    _drop_partition_start = false;
+    _drop_static_row = false;
+
    if (_last_pkey) {
        bool partition_range_is_inclusive = true;

@@ -1235,13 +1238,25 @@ void evictable_reader::maybe_validate_partition_start(const flat_mutation_reader
    // is in range.
    if (_last_pkey) {
        const auto cmp_res = tri_cmp(*_last_pkey, ps.key());
-        if (_drop_partition_start) { // should be the same partition
+        if (_drop_partition_start) { // we expect to continue from the same partition
+            // We cannot assume the partition we stopped the read at is still alive
+            // when we recreate the reader. It might have been compacted away in the
+            // meantime, so allow for a larger partition too.
            require(
-                    cmp_res == 0,
-                    "{}(): validation failed, expected partition with key equal to _last_pkey {} due to _drop_partition_start being set, but got {}",
+                    cmp_res <= 0,
+                    "{}(): validation failed, expected partition with key larger or equal to _last_pkey {} due to _drop_partition_start being set, but got {}",
                    __FUNCTION__,
                    *_last_pkey,
                    ps.key());
+            // Reset drop flags and next pos if we are not continuing from the same partition
+            if (cmp_res < 0) {
+                // Close previous partition, we are not going to continue it.
+                push_mutation_fragment(*_schema, _permit, partition_end{});
+                _drop_partition_start = false;
+                _drop_static_row = false;
+                _next_position_in_partition = position_in_partition::for_partition_start();
+                _trim_range_tombstones = false;
+            }
        } else { // should be a larger partition
            require(
                    cmp_res < 0,
@@ -1292,9 +1307,14 @@ bool evictable_reader::should_drop_fragment(const mutation_fragment& mf) {
        _drop_partition_start = false;
        return true;
    }
-    if (_drop_static_row && mf.is_static_row()) {
-        _drop_static_row = false;
-        return true;
+    // Unlike partition-start above, a partition is not guaranteed to have a
+    // static row fragment. So reset the flag regardless of whether we could
+    // drop one or not.
+    // We are guaranteed to get here only right after dropping a partition-start,
+    // so if we are not seeing a static row here, the partition doesn't have one.
+    if (_drop_static_row) {
+         _drop_static_row = false;
+        return mf.is_static_row();
    }
    return false;
 }
@@ -1537,8 +1557,8 @@ class shard_reader : public enable_lw_shared_from_this<shard_reader>, public fla
 private:
    shared_ptr<reader_lifecycle_policy> _lifecycle_policy;
    const unsigned _shard;
-    const dht::partition_range* _pr;
-    const query::partition_slice& _ps;
+    dht::partition_range _pr;
+    query::partition_slice _ps;
    const io_priority_class& _pc;
    tracing::global_trace_state_ptr _trace_state;
    const mutation_reader::forwarding _fwd_mr;
@@ -1563,7 +1583,7 @@ public:
        : impl(std::move(schema), std::move(permit))
        , _lifecycle_policy(std::move(lifecycle_policy))
        , _shard(shard)
-        , _pr(&pr)
+        , _pr(pr)
        , _ps(ps)
        , _pc(pc)
        , _trace_state(std::move(trace_state))
@@ -1647,7 +1667,7 @@ future<> shard_reader::do_fill_buffer(db::timeout_clock::time_point timeout) {
            });
            auto s = gs.get();
            auto rreader = make_foreign(std::make_unique<evictable_reader>(evictable_reader::auto_pause::yes, std::move(ms),
-                        s, _lifecycle_policy->semaphore().make_permit(s.get(), "shard-reader"), *_pr, _ps, _pc, _trace_state, _fwd_mr));
+                        s, _lifecycle_policy->semaphore().make_permit(s.get(), "shard-reader"), _pr, _ps, _pc, _trace_state, _fwd_mr));
            tracing::trace(_trace_state, "Creating shard reader on shard: {}", this_shard_id());
            auto f = rreader->fill_buffer(timeout);
            return f.then([rreader = std::move(rreader)] () mutable {
@@ -1701,7 +1721,7 @@ future<> shard_reader::next_partition() {
 }

 future<> shard_reader::fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) {
-    _pr = &pr;
+    _pr = pr;

    if (!_reader && !_read_ahead) {
        // No need to fast-forward uncreated readers, they will be passed the new
@@ -1710,12 +1730,12 @@ future<> shard_reader::fast_forward_to(const dht::partition_range& pr, db::timeo
    }

    auto f = _read_ahead ? *std::exchange(_read_ahead, std::nullopt) : make_ready_future<>();
-    return f.then([this, &pr, timeout] {
+    return f.then([this, timeout] {
        _end_of_stream = false;
        clear_buffer();

-        return smp::submit_to(_shard, [this, &pr, timeout] {
-            return _reader->fast_forward_to(pr, timeout);
+        return smp::submit_to(_shard, [this, timeout] {
+            return _reader->fast_forward_to(_pr, timeout);
        });
    });
 }
@@ -2275,9 +2295,9 @@ position_reader_queue::~position_reader_queue() {}
 // are not implemented and throw an error; the reader is only used for single partition queries.
 //
 // Assumes that:
-// - the queue contains at least one reader,
 // - there are no static rows,
-// - the returned fragments do not contain partition tombstones.
+// - the returned fragments do not contain partition tombstones,
+// - the merged readers return fragments from the same partition (but some or even all of them may be empty).
 class clustering_order_reader_merger {
    const schema_ptr _schema;
    const reader_permit _permit;
@@ -2389,12 +2409,17 @@ class clustering_order_reader_merger {
            if (!mf) {
                // The reader returned end-of-stream before returning end-of-partition
                // (otherwise we would have removed it in a previous peek). This means that
-                // we are in forwarding mode and the reader won't return any more fragments in the current range.
+                // either the reader was empty from the beginning (not even returning a `partition_start`)
+                // or we are in forwarding mode and the reader won't return any more fragments in the current range.
                // If the reader's upper bound is smaller then the end of the current range then it won't
                // return any more fragments in later ranges as well (subsequent fast-forward-to ranges
                // are non-overlapping and strictly increasing), so we can remove it now.
-                // Otherwise it may start returning fragments later, so we save it for the moment
-                // in _halted_readers and will bring it back when we get fast-forwarded.
+                // Otherwise, if it previously returned a `partition_start`, it may start returning more fragments
+                // later (after we fast-forward) so we save it for the moment in _halted_readers and will bring it
+                // back when we get fast-forwarded.
+                // We also save the reader if it was empty from the beginning (no `partition_start`) since
+                // it makes the code simpler (to check for this here we would need additional state); it is a bit wasteful
+                // but completely empty readers should be rare.
                if (_cmp(it->upper_bound, _pr_end) < 0) {
                    _all_readers.erase(it);
                } else {
@@ -2524,19 +2549,6 @@ public:
                        : position_in_partition_view::after_all_clustered_rows())
        , _should_emit_partition_end(fwd_sm == streamed_mutation::forwarding::no)
    {
-        // The first call to `_reader_queue::pop` uses `after_all_clustered_rows`
-        // so we obtain at least one reader; we will return this reader's `partition_start`
-        // as the first fragment.
-        auto rs = _reader_queue->pop(position_in_partition_view::after_all_clustered_rows());
-        for (auto& r: rs) {
-            _all_readers.push_front(std::move(r));
-            _unpeeked_readers.push_back(_all_readers.begin());
-        }
-
-        if (rs.empty()) {
-            // No readers, no partition.
-            _should_emit_partition_end = false;
-        }
    }

    // We assume that operator() is called sequentially and that the caller doesn't use the batch
@@ -2553,8 +2565,22 @@ public:
            return peek_readers(timeout).then([this, timeout] { return (*this)(timeout); });
        }

-        auto next_peeked_pos = _peeked_readers.empty() ? _pr_end : _peeked_readers.front()->reader.peek_buffer().position();
-        // There might be queued readers containing fragments with positions <= next_peeked_pos:
+        // Before we return a batch of fragments using currently opened readers we must check the queue
+        // for potential new readers that must be opened. There are three cases which determine how ``far''
+        // we should look:
+        // - If there are some peeked readers in the heap, we must check for new readers
+        //   whose `min_position`s are <= the position of the first peeked reader; there is no need
+        //   to check for ``later'' readers (yet).
+        // - Otherwise, if we already fetched a partition start fragment, we need to look no further
+        //   than the end of the current position range (_pr_end).
+        // - Otherwise we need to look for any reader (by calling the queue with `after_all_clustered_rows`),
+        //   even for readers whose `min_position`s may be outside the current position range since they
+        //   may be the only readers which have a `partition_start` fragment which we need to return
+        //   before end-of-stream.
+        auto next_peeked_pos =
+            _peeked_readers.empty()
+                ? (_partition_start_fetched ? _pr_end : position_in_partition_view::after_all_clustered_rows())
+                : _peeked_readers.front()->reader.peek_buffer().position();
        if (!_reader_queue->empty(next_peeked_pos)) {
            auto rs = _reader_queue->pop(next_peeked_pos);
            for (auto& r: rs) {
@@ -2568,8 +2594,11 @@ public:
            // We are either in forwarding mode and waiting for a fast-forward,
            // or we've exhausted all the readers.
            if (_should_emit_partition_end) {
-                // Not forwarding, so all readers must be exhausted. Return the last fragment.
-                _current_batch.push_back(mutation_fragment(*_schema, _permit, partition_end()));
+                // Not forwarding, so all readers must be exhausted.
+                // Return a partition end fragment unless all readers have been empty from the beginning.
+                if (_partition_start_fetched) {
+                    _current_batch.push_back(mutation_fragment(*_schema, _permit, partition_end()));
+                }
                _should_emit_partition_end = false;
            }
            return make_ready_future<mutation_fragment_batch>(_current_batch);
--- a/mutation_writer/feed_writers.hh
+++ b/mutation_writer/feed_writers.hh
@@ -57,6 +57,8 @@ future<> feed_writer(flat_mutation_reader&& rd, Writer&& wr) {
                auto f2 = rd.is_buffer_empty() ? rd.fill_buffer(db::no_timeout) : make_ready_future<>();
                return when_all_succeed(std::move(f1), std::move(f2)).discard_result();
            });
+        }).then([&wr] {
+            wr.consume_end_of_stream();
        }).then_wrapped([&wr] (future<> f) {
            if (f.failed()) {
                auto ex = f.get_exception();
@@ -70,7 +72,6 @@ future<> feed_writer(flat_mutation_reader&& rd, Writer&& wr) {
                    return make_exception_future<>(std::move(ex));
                });
            } else {
-                wr.consume_end_of_stream();
                return wr.close();
            }
        });
--- a/partition_snapshot_row_cursor.hh
+++ b/partition_snapshot_row_cursor.hh
@@ -417,11 +417,11 @@ public:
        } else {
            // Copy row from older version because rows in evictable versions must
            // hold values which are independently complete to be consistent on eviction.
-            auto e = current_allocator().construct<rows_entry>(_schema, *_current_row[0].it);
+            auto e = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(_schema, *_current_row[0].it));
            e->set_continuous(latest_i != rows.end() && latest_i->continuous());
            _snp.tracker()->insert(*e);
-            rows.insert_before(latest_i, *e);
-            return {*e, true};
+            auto e_i = rows.insert_before(latest_i, std::move(e));
+            return ensure_result{*e_i, true};
        }
    }

@@ -453,11 +453,11 @@ public:
        }
        auto&& rows = _snp.version()->partition().clustered_rows();
        auto latest_i = get_iterator_in_latest_version();
-        auto e = current_allocator().construct<rows_entry>(_schema, pos, is_dummy(!pos.is_clustering_row()),
-            is_continuous(latest_i != rows.end() && latest_i->continuous()));
+        auto e = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(_schema, pos, is_dummy(!pos.is_clustering_row()),
+            is_continuous(latest_i != rows.end() && latest_i->continuous())));
        _snp.tracker()->insert(*e);
-        rows.insert_before(latest_i, *e);
-        return ensure_result{*e, true};
+        auto e_i = rows.insert_before(latest_i, std::move(e));
+        return ensure_result{*e_i, true};
    }

    // Brings the entry pointed to by the cursor to the front of the LRU
--- a/range_tombstone.hh
+++ b/range_tombstone.hh
@@ -267,9 +267,14 @@ public:
        return _current_tombstone;
    }

-    const std::deque<range_tombstone>& range_tombstones_for_row(const clustering_key_prefix& ck) {
+    std::vector<range_tombstone> range_tombstones_for_row(const clustering_key_prefix& ck) {
        drop_unneeded_tombstones(ck);
-        return _range_tombstones;
+        std::vector<range_tombstone> result(_range_tombstones.begin(), _range_tombstones.end());
+        auto cmp = [&] (const range_tombstone& rt1, const range_tombstone& rt2) {
+            return _cmp(rt1.start_bound(), rt2.start_bound());
+        };
+        std::sort(result.begin(), result.end(), cmp);
+        return result;
    }

    std::deque<range_tombstone> range_tombstones() && {
--- a/read_context.hh
+++ b/read_context.hh
@@ -142,6 +142,7 @@ class read_context final : public enable_lw_shared_from_this<read_context> {
    mutation_source_opt _underlying_snapshot;
    dht::partition_range _sm_range;
    std::optional<dht::decorated_key> _key;
+    bool _partition_exists;
    row_cache::phase_type _phase;
 public:
    read_context(row_cache& cache,
@@ -190,22 +191,34 @@ public:
    autoupdating_underlying_reader& underlying() { return _underlying; }
    row_cache::phase_type phase() const { return _phase; }
    const dht::decorated_key& key() const { return *_key; }
+    bool partition_exists() const { return _partition_exists; }
    void on_underlying_created() { ++_underlying_created; }
    bool digest_requested() const { return _slice.options.contains<query::partition_slice::option::with_digest>(); }
 public:
    future<> ensure_underlying(db::timeout_clock::time_point timeout) {
        if (_underlying_snapshot) {
-            return create_underlying(true, timeout);
+            return create_underlying(timeout).then([this, timeout] {
+                return _underlying.underlying()(timeout).then([this] (mutation_fragment_opt&& mfopt) {
+                    _partition_exists = bool(mfopt);
+                });
+            });
        }
+        // We know that the partition exists because all the callers of
+        // enter_partition(const dht::decorated_key&, row_cache::phase_type)
+        // check that, and there's no other way of setting _underlying_snapshot
+        // to empty except for calling create_underlying.
+        _partition_exists = true;
        return make_ready_future<>();
    }
 public:
-    future<> create_underlying(bool skip_first_fragment, db::timeout_clock::time_point timeout);
+    future<> create_underlying(db::timeout_clock::time_point timeout);
    void enter_partition(const dht::decorated_key& dk, mutation_source& snapshot, row_cache::phase_type phase) {
        _phase = phase;
        _underlying_snapshot = snapshot;
        _key = dk;
    }
+    // Precondition: each caller needs to make sure that a partition with key |dk|
+    //               exists in underlying before calling this function.
    void enter_partition(const dht::decorated_key& dk, row_cache::phase_type phase) {
        _phase = phase;
        _underlying_snapshot = {};
--- a/reader_concurrency_semaphore.cc
+++ b/reader_concurrency_semaphore.cc
@@ -77,7 +77,7 @@ class reader_permit::impl : public boost::intrusive::list_base_hook<boost::intru
    sstring _op_name;
    std::string_view _op_name_view;
    reader_resources _resources;
-    reader_permit::state _state = reader_permit::state::registered;
+    reader_permit::state _state = reader_permit::state::active;

 public:
    struct value_tag {};
@@ -126,40 +126,25 @@ public:
    }

    void on_admission() {
-        _state = reader_permit::state::admitted;
-        _semaphore.consume(_resources);
+        _state = reader_permit::state::active;
    }

    void on_register_as_inactive() {
-        if (_state != reader_permit::state::admitted) {
-            _state = reader_permit::state::inactive;
-            _semaphore.consume(_resources);
-        }
+        _state = reader_permit::state::inactive;
    }

    void on_unregister_as_inactive() {
-        if (_state == reader_permit::state::inactive) {
-            _state = reader_permit::state::registered;
-            _semaphore.signal(_resources);
-        }
-    }
-
-    bool should_forward_cost() const {
-        return _state == reader_permit::state::admitted || _state == reader_permit::state::inactive;
+        _state = reader_permit::state::active;
    }

    void consume(reader_resources res) {
        _resources += res;
-        if (should_forward_cost()) {
-            _semaphore.consume(res);
-        }
+        _semaphore.consume(res);
    }

    void signal(reader_resources res) {
        _resources -= res;
-        if (should_forward_cost()) {
-            _semaphore.signal(res);
-        }
+        _semaphore.signal(res);
    }

    reader_resources resources() const {
@@ -226,14 +211,11 @@ reader_resources reader_permit::consumed_resources() const {

 std::ostream& operator<<(std::ostream& os, reader_permit::state s) {
    switch (s) {
-        case reader_permit::state::registered:
-            os << "registered";
-            break;
        case reader_permit::state::waiting:
            os << "waiting";
            break;
-        case reader_permit::state::admitted:
-            os << "admitted";
+        case reader_permit::state::active:
+            os << "active";
            break;
        case reader_permit::state::inactive:
            os << "inactive";
@@ -273,7 +255,7 @@ struct permit_group_key_hash {

 using permit_groups = std::unordered_map<permit_group_key, permit_stats, permit_group_key_hash>;

-static permit_stats do_dump_reader_permit_diagnostics(std::ostream& os, const permit_groups& permits, reader_permit::state state, bool sort_by_memory) {
+static permit_stats do_dump_reader_permit_diagnostics(std::ostream& os, const permit_groups& permits, reader_permit::state state) {
    struct permit_summary {
        const schema* s;
        std::string_view op_name;
@@ -289,25 +271,17 @@ static permit_stats do_dump_reader_permit_diagnostics(std::ostream& os, const pe
        }
    }

-    std::ranges::sort(permit_summaries, [sort_by_memory] (const permit_summary& a, const permit_summary& b) {
-        if (sort_by_memory) {
-            return a.memory < b.memory;
-        } else {
-            return a.count < b.count;
-        }
+    std::ranges::sort(permit_summaries, [] (const permit_summary& a, const permit_summary& b) {
+        return a.memory < b.memory;
    });

    permit_stats total;

-    auto print_line = [&os, sort_by_memory] (auto col1, auto col2, auto col3) {
-        if (sort_by_memory) {
-            fmt::print(os, "{}\t{}\t{}\n", col2, col1, col3);
-        } else {
-            fmt::print(os, "{}\t{}\t{}\n", col1, col2, col3);
-        }
+    auto print_line = [&os] (auto col1, auto col2, auto col3) {
+        fmt::print(os, "{}\t{}\t{}\n", col2, col1, col3);
    };

-    fmt::print(os, "Permits with state {}, sorted by {}\n", state, sort_by_memory ? "memory" : "count");
+    fmt::print(os, "Permits with state {}\n", state);
    print_line("count", "memory", "name");
    for (const auto& summary : permit_summaries) {
        total.count += summary.count;
@@ -333,13 +307,11 @@ static void do_dump_reader_permit_diagnostics(std::ostream& os, const reader_con
    permit_stats total;

    fmt::print(os, "Semaphore {}: {}, dumping permit diagnostics:\n", semaphore.name(), problem);
-    total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::admitted, true);
+    total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::active);
    fmt::print(os, "\n");
-    total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::inactive, false);
+    total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::inactive);
    fmt::print(os, "\n");
-    total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::waiting, false);
-    fmt::print(os, "\n");
-    total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::registered, false);
+    total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::waiting);
    fmt::print(os, "\n");
    fmt::print(os, "Total: permits: {}, memory: {}\n", total.count, utils::to_hr_size(total.memory));
 }
@@ -417,11 +389,9 @@ reader_concurrency_semaphore::inactive_read_handle reader_concurrency_semaphore:
    auto& permit_impl = *reader.permit()._impl;
    // Implies _inactive_reads.empty(), we don't queue new readers before
    // evicting all inactive reads.
-    // FIXME: #4758, workaround for keeping tabs on un-admitted reads that are
-    // still registered as inactive. Without the below check, these can
-    // accumulate without limit. The real fix is #4758 -- that is to make all
-    // reads pass admission before getting started.
-    if (_wait_list.empty() && (permit_impl.get_state() == reader_permit::state::admitted || _resources >= permit_impl.resources())) {
+    // Checking the _wait_list covers the count resources only, so check memory
+    // separately.
+    if (_wait_list.empty() && _resources.memory > 0) {
      try {
        auto irp = std::make_unique<inactive_read>(std::move(reader));
        auto& ir = *irp;
@@ -514,13 +484,13 @@ void reader_concurrency_semaphore::evict(inactive_read& ir, evict_reason reason)
 }

 bool reader_concurrency_semaphore::has_available_units(const resources& r) const {
-    return bool(_resources) && _resources >= r;
+    // Special case: when there is no active reader (based on count) admit one
+    // regardless of availability of memory.
+    return (bool(_resources) && _resources >= r) || _resources.count == _initial_resources.count;
 }

 bool reader_concurrency_semaphore::may_proceed(const resources& r) const {
-    // Special case: when there is no active reader (based on count) admit one
-    // regardless of availability of memory.
-    return _wait_list.empty() && (has_available_units(r) || _resources.count == _initial_resources.count);
+    return _wait_list.empty() && has_available_units(r);
 }

 future<reader_permit::resource_units> reader_concurrency_semaphore::do_wait_admission(reader_permit permit, size_t memory,
@@ -567,6 +537,12 @@ void reader_concurrency_semaphore::broken(std::exception_ptr ex) {
    }
 }

+std::string reader_concurrency_semaphore::dump_diagnostics() const {
+    std::ostringstream os;
+    do_dump_reader_permit_diagnostics(os, *this, *_permit_list, "user request");
+    return os.str();
+}
+
 // A file that tracks the memory usage of buffers resulting from read
 // operations.
 class tracking_file_impl : public file_impl {
--- a/reader_concurrency_semaphore.hh
+++ b/reader_concurrency_semaphore.hh
@@ -293,4 +293,6 @@ public:
    }

    void broken(std::exception_ptr ex);
+
+    std::string dump_diagnostics() const;
 };
--- a/reader_permit.hh
+++ b/reader_permit.hh
@@ -91,10 +91,9 @@ public:
    class resource_units;

    enum class state {
-        registered, // read is registered, but didn't attempt admission yet
        waiting, // waiting for admission
-        admitted,
-        inactive, // un-admitted reads that are registered as inactive
+        active,
+        inactive,
    };

    class impl;
--- a/repair/repair.cc
+++ b/repair/repair.cc
@@ -326,7 +326,7 @@ float node_ops_metrics::repair_finished_percentage() {
 tracker::tracker(size_t nr_shards, size_t max_repair_memory)
    : _shutdown(false)
    , _repairs(nr_shards) {
-    auto nr = std::max(size_t(1), size_t(max_repair_memory / max_repair_memory_per_range()));
+    auto nr = std::max(size_t(1), size_t(max_repair_memory / max_repair_memory_per_range() / 4));
    rlogger.info("Setting max_repair_memory={}, max_repair_memory_per_range={}, max_repair_ranges_in_parallel={}",
        max_repair_memory, max_repair_memory_per_range(), nr);
    _range_parallelism_semaphores.reserve(nr_shards);
@@ -1578,6 +1578,7 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, seastar::sharded<
            auto& strat = ks.get_replication_strategy();
            dht::token_range_vector desired_ranges = strat.get_pending_address_ranges(tmptr, tokens, myip, utils::can_yield::yes);
            bool find_node_in_local_dc_only = strat.get_type() == locator::replication_strategy_type::network_topology;
+            bool everywhere_topology = strat.get_type() == locator::replication_strategy_type::everywhere_topology;

            //Active ranges
            auto metadata_clone = tmptr->clone_only_token_map().get0();
@@ -1655,7 +1656,9 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, seastar::sharded<
                        };
                        auto old_endpoints_in_local_dc = get_old_endpoints_in_local_dc();
                        auto rf_in_local_dc = get_rf_in_local_dc();
-                        if (old_endpoints.size() == strat.get_replication_factor()) {
+                        if (everywhere_topology) {
+                            neighbors = old_endpoints_in_local_dc;
+                        } else if (old_endpoints.size() == strat.get_replication_factor()) {
                            // For example, with RF = 3 and 3 nodes n1, n2, n3
                            // in the cluster, n4 is bootstrapped, old_replicas
                            // = {n1, n2, n3}, new_replicas = {n1, n2, n4}, n3
--- a/row_cache.cc
+++ b/row_cache.cc
@@ -332,7 +332,7 @@ public:
    }
 };

-future<> read_context::create_underlying(bool skip_first_fragment, db::timeout_clock::time_point timeout) {
+future<> read_context::create_underlying(db::timeout_clock::time_point timeout) {
    if (_range_query) {
        // FIXME: Singular-range mutation readers don't support fast_forward_to(), so need to use a wide range
        // here in case the same reader will need to be fast forwarded later.
@@ -340,13 +340,8 @@ future<> read_context::create_underlying(bool skip_first_fragment, db::timeout_c
    } else {
        _sm_range = dht::partition_range::make_singular({dht::ring_position(*_key)});
    }
-    return _underlying.fast_forward_to(std::move(_sm_range), *_underlying_snapshot, _phase, timeout).then([this, skip_first_fragment, timeout] {
+    return _underlying.fast_forward_to(std::move(_sm_range), *_underlying_snapshot, _phase, timeout).then([this] {
        _underlying_snapshot = {};
-        if (skip_first_fragment) {
-            return _underlying.underlying()(timeout).then([](auto &&mf) {});
-        } else {
-            return make_ready_future<>();
-        }
    });
 }

@@ -366,7 +361,7 @@ private:
        auto src_and_phase = _cache.snapshot_of(_read_context->range().start()->value());
        auto phase = src_and_phase.phase;
        _read_context->enter_partition(_read_context->range().start()->value().as_decorated_key(), src_and_phase.snapshot, phase);
-        return _read_context->create_underlying(false, timeout).then([this, phase, timeout] {
+        return _read_context->create_underlying(timeout).then([this, phase, timeout] {
          return _read_context->underlying().underlying()(timeout).then([this, phase] (auto&& mfopt) {
            if (!mfopt) {
                if (phase == _cache.phase_of(_read_context->range().start()->value())) {
@@ -728,7 +723,7 @@ row_cache::make_reader(schema_ptr s,
            auto&& pos = ctx->range().start()->value();
            partitions_type::bound_hint hint;
            auto i = _partitions.lower_bound(pos, cmp, hint);
-            if (i != _partitions.end() && hint.match) {
+            if (hint.match) {
                cache_entry& e = *i;
                upgrade_entry(e);
                on_partition_hit();
--- a/2
+++ b/2
--- a/serialization_visitors.hh
+++ b/serialization_visitors.hh
@@ -89,7 +89,7 @@ template<typename Input>
 size_type read_frame_size(Input& in) {
    auto sz = deserialize(in, boost::type<size_type>());
    if (sz < sizeof(size_type)) {
-        throw std::runtime_error("Truncated frame");
+        throw std::runtime_error(fmt::format("IDL frame truncated: expected to have at least {} bytes, got {}", sizeof(size_type), sz));
    }
    return sz - sizeof(size_type);
 }
--- a/service/migration_manager.cc
+++ b/service/migration_manager.cc
@@ -53,6 +53,7 @@
 #include "database.hh"
 #include "db/schema_tables.hh"
 #include "types/user.hh"
+#include "db/schema_tables.hh"

 namespace service {

@@ -1075,8 +1076,19 @@ future<schema_ptr> get_schema_definition(table_schema_version v, netw::messaging
            // referenced by the incoming request.
            // That means the column mapping for the schema should always be inserted
            // with TTL (refresh TTL in case column mapping already existed prior to that).
-            return db::schema_tables::store_column_mapping(proxy, s.unfreeze(db::schema_ctxt(proxy)), true).then([s] {
-                return s;
+            auto us = s.unfreeze(db::schema_ctxt(proxy));
+            // if this is a view - we might need to fix its schema before registering it.
+            if (us->is_view()) {
+                auto& db = proxy.local().local_db();
+                schema_ptr base_schema = db.find_schema(us->view_info()->base_id());
+                auto fixed_view = db::schema_tables::maybe_fix_legacy_secondary_index_mv_schema(db, view_ptr(us), base_schema,
+                        db::schema_tables::preserve_version::yes);
+                if (fixed_view) {
+                    us = fixed_view;
+                }
+            }
+            return db::schema_tables::store_column_mapping(proxy, us, true).then([us] {
+                return frozen_schema{us};
            });
        });
    }).then([] (schema_ptr s) {
@@ -1084,7 +1096,7 @@ future<schema_ptr> get_schema_definition(table_schema_version v, netw::messaging
        // table.
        if (s->is_view()) {
            if (!s->view_info()->base_info()) {
-                auto& db = service::get_local_storage_proxy().get_db().local();
+                auto& db = service::get_local_storage_proxy().local_db();
                // This line might throw a no_such_column_family
                // It should be fine since if we tried to register a view for which
                // we don't know the base table, our registry is broken.
--- a/service/storage_proxy.cc
+++ b/service/storage_proxy.cc
@@ -3643,7 +3643,12 @@ protected:
    }

 public:
-    virtual future<foreign_ptr<lw_shared_ptr<query::result>>> execute(storage_proxy::clock_type::time_point timeout) {
+    future<foreign_ptr<lw_shared_ptr<query::result>>> execute(storage_proxy::clock_type::time_point timeout) {
+        if (_targets.empty()) {
+            // We may have no targets to read from if a DC with zero replication is queried with LOCAL_QUORUM.
+            // Return an empty result in this case
+            return make_ready_future<foreign_ptr<lw_shared_ptr<query::result>>>(make_foreign(make_lw_shared(query::result())));
+        }
        digest_resolver_ptr digest_resolver = ::make_shared<digest_read_resolver>(_schema, _cl, _block_for,
                db::is_datacenter_local(_cl) ? db::count_local_endpoints(_targets): _targets.size(), timeout);
        auto exec = shared_from_this();
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -797,6 +797,19 @@ storage_service::get_range_to_address_map(const sstring& keyspace,
    return construct_range_to_endpoint_map(ks, get_all_ranges(sorted_tokens));
 }

+void storage_service::handle_state_replacing_update_pending_ranges(mutable_token_metadata_ptr tmptr, inet_address replacing_node) {
+    try {
+        slogger.info("handle_state_replacing: Waiting for replacing node {} to be alive on all shards", replacing_node);
+        _gossiper.wait_alive({replacing_node}, std::chrono::milliseconds(5 * 1000));
+        slogger.info("handle_state_replacing: Replacing node {} is now alive on all shards", replacing_node);
+    } catch (...) {
+        slogger.warn("handle_state_replacing: Failed to wait for replacing node {} to be alive on all shards: {}",
+                replacing_node, std::current_exception());
+    }
+    slogger.info("handle_state_replacing: Update pending ranges for replacing node {}", replacing_node);
+    update_pending_ranges(tmptr, format("handle_state_replacing {}", replacing_node)).get();
+}
+
 void storage_service::handle_state_replacing(inet_address replacing_node) {
    slogger.debug("endpoint={} handle_state_replacing", replacing_node);
    auto host_id = _gossiper.get_host_id(replacing_node);
@@ -817,7 +830,13 @@ void storage_service::handle_state_replacing(inet_address replacing_node) {
    slogger.info("Node {} is replacing existing node {} with host_id={}, existing_tokens={}, replacing_tokens={}",
            replacing_node, existing_node, host_id, existing_tokens, replacing_tokens);
    tmptr->add_replacing_endpoint(existing_node, replacing_node);
-    update_pending_ranges(tmptr, format("handle_state_replacing {}", replacing_node)).get();
+    if (_gossiper.is_alive(replacing_node)) {
+        slogger.info("handle_state_replacing: Replacing node {} is already alive, update pending ranges", replacing_node);
+        handle_state_replacing_update_pending_ranges(tmptr, replacing_node);
+    } else {
+        slogger.info("handle_state_replacing: Replacing node {} is not alive yet, delay update pending ranges", replacing_node);
+        _replacing_nodes_pending_ranges_updater.insert(replacing_node);
+    }
    replicate_to_all_cores(std::move(tmptr)).get();
 }

@@ -1127,6 +1146,14 @@ void storage_service::on_alive(gms::inet_address endpoint, gms::endpoint_state s
    if (get_token_metadata().is_member(endpoint)) {
        notify_up(endpoint);
    }
+    if (_replacing_nodes_pending_ranges_updater.contains(endpoint)) {
+        _replacing_nodes_pending_ranges_updater.erase(endpoint);
+        slogger.info("Trigger pending ranges updater for replacing node {}", endpoint);
+        auto tmlock = get_token_metadata_lock().get0();
+        auto tmptr = get_mutable_token_metadata_ptr().get0();
+        handle_state_replacing_update_pending_ranges(tmptr, endpoint);
+        replicate_to_all_cores(std::move(tmptr)).get();
+    }
 }

 void storage_service::before_change(gms::inet_address endpoint, gms::endpoint_state current_state, gms::application_state new_state_key, const gms::versioned_value& new_value) {
@@ -2301,7 +2328,13 @@ future<> storage_service::restore_replica_count(inet_address endpoint, inet_addr
    }
  return seastar::async([this, endpoint, notify_endpoint] {
    auto tmptr = get_token_metadata_ptr();
-    auto streamer = make_lw_shared<dht::range_streamer>(_db, tmptr, _abort_source, get_broadcast_address(), "Restore_replica_count", streaming::stream_reason::removenode);
+    abort_source as;
+    auto sub = _abort_source.subscribe([&as] () noexcept {
+        if (!as.abort_requested()) {
+            as.request_abort();
+        }
+    });
+    auto streamer = make_lw_shared<dht::range_streamer>(_db, tmptr, as, get_broadcast_address(), "Restore_replica_count", streaming::stream_reason::removenode);
    auto my_address = get_broadcast_address();
    auto non_system_keyspaces = _db.local().get_non_system_keyspaces();
    for (const auto& keyspace_name : non_system_keyspaces) {
@@ -2319,6 +2352,42 @@ future<> storage_service::restore_replica_count(inet_address endpoint, inet_addr
        }
        streamer->add_rx_ranges(keyspace_name, std::move(ranges_per_endpoint));
    }
+    auto status_checker = seastar::async([this, endpoint, &as] {
+        slogger.info("restore_replica_count: Started status checker for removing node {}", endpoint);
+        while (!as.abort_requested()) {
+            auto status = _gossiper.get_gossip_status(endpoint);
+            // If the node to be removed is already in removed status, it has
+            // probably been removed forcibly with `nodetool removenode force`.
+            // Abort the restore_replica_count in such a case to avoid a streaming
+            // attempt, since the user has removed the node forcibly.
+            if (status == sstring(versioned_value::REMOVED_TOKEN)) {
+                slogger.info("restore_replica_count: Detected node {} has left the cluster, status={}, abort restore_replica_count for removing node {}",
+                        endpoint, status, endpoint);
+                if (!as.abort_requested()) {
+                    as.request_abort();
+                }
+                return;
+            }
+            slogger.debug("restore_replica_count: Sleep and detect removing node {}, status={}", endpoint, status);
+            sleep_abortable(std::chrono::seconds(10), as).get();
+        }
+    });
+    auto stop_status_checker = defer([endpoint, &status_checker, &as] () mutable {
+        try {
+            slogger.info("restore_replica_count: Started to stop status checker for removing node {}", endpoint);
+            if (!as.abort_requested()) {
+                as.request_abort();
+            }
+            status_checker.get();
+        } catch (const seastar::sleep_aborted& ignored) {
+            slogger.debug("restore_replica_count: Got sleep_abort to stop status checker for removing node {}: {}", endpoint, ignored);
+        } catch (...) {
+            slogger.warn("restore_replica_count: Found error in status checker for removing node {}: {}",
+                    endpoint, std::current_exception());
+        }
+        slogger.info("restore_replica_count: Finished to stop status checker for removing node {}", endpoint);
+    });
+
    streamer->stream_async().then_wrapped([this, streamer, notify_endpoint] (auto&& f) {
        try {
            f.get();
@@ -2338,15 +2407,16 @@ void storage_service::excise(std::unordered_set<token> tokens, inet_address endp
    slogger.info("Removing tokens {} for {}", tokens, endpoint);
    // FIXME: HintedHandOffManager.instance.deleteHintsForEndpoint(endpoint);
    remove_endpoint(endpoint);
-    auto tmlock = get_token_metadata_lock().get0();
+    auto tmlock = std::make_optional(get_token_metadata_lock().get0());
    auto tmptr = get_mutable_token_metadata_ptr().get0();
    tmptr->remove_endpoint(endpoint);
    tmptr->remove_bootstrap_tokens(tokens);

-    notify_left(endpoint);
-
    update_pending_ranges(tmptr, format("excise {}", endpoint)).get();
    replicate_to_all_cores(std::move(tmptr)).get();
+    tmlock.reset();
+
+    notify_left(endpoint);
 }

 void storage_service::excise(std::unordered_set<token> tokens, inet_address endpoint, int64_t expire_time) {
@@ -2473,7 +2543,7 @@ private:
        int32_t status = 0;
        while (auto status_opt = co_await _source()) {
            status = std::get<0>(*status_opt);
-            slogger.debug("send_meta_data: got error code={}, from node={}, status={}", status, _node);
+            slogger.debug("send_meta_data: got error code={}, from node={}", status, _node);
            if (status == -1) {
                _error_from_peer = true;
            }
@@ -2553,7 +2623,7 @@ future<> storage_service::load_and_stream(sstring ks_name, sstring cf_name,
    auto& table = _db.local().find_column_family(table_id);
    auto s = table.schema();
    const auto cf_id = s->id();
-    const auto reason = streaming::stream_reason::rebuild;
+    const auto reason = streaming::stream_reason::repair;
    auto& rs = _db.local().find_keyspace(ks_name).get_replication_strategy();

    size_t nr_sst_total = sstables.size();
@@ -3278,7 +3348,7 @@ shared_ptr<node_ops_info> node_ops_meta_data::get_ops_info() {

 void storage_service::node_ops_update_heartbeat(utils::UUID ops_uuid) {
    slogger.debug("node_ops_update_heartbeat: ops_uuid={}", ops_uuid);
-    auto permit = seastar::get_units(_node_ops_abort_sem, 1);
+    auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
    auto it = _node_ops.find(ops_uuid);
    if (it != _node_ops.end()) {
        node_ops_meta_data& meta = it->second;
@@ -3288,7 +3358,7 @@ void storage_service::node_ops_update_heartbeat(utils::UUID ops_uuid) {

 void storage_service::node_ops_done(utils::UUID ops_uuid) {
    slogger.debug("node_ops_done: ops_uuid={}", ops_uuid);
-    auto permit = seastar::get_units(_node_ops_abort_sem, 1);
+    auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
    auto it = _node_ops.find(ops_uuid);
    if (it != _node_ops.end()) {
        node_ops_meta_data& meta = it->second;
@@ -3299,7 +3369,7 @@ void storage_service::node_ops_done(utils::UUID ops_uuid) {

 void storage_service::node_ops_abort(utils::UUID ops_uuid) {
    slogger.debug("node_ops_abort: ops_uuid={}", ops_uuid);
-    auto permit = seastar::get_units(_node_ops_abort_sem, 1);
+    auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
    auto it = _node_ops.find(ops_uuid);
    if (it != _node_ops.end()) {
        node_ops_meta_data& meta = it->second;
--- a/service/storage_service.hh
+++ b/service/storage_service.hh
@@ -587,6 +587,7 @@ private:
    sharded<db::view::view_update_generator>& _view_update_generator;
    locator::snitch_signal_slot_t _snitch_reconfigure;
    serialized_action _schema_version_publisher;
+    std::unordered_set<gms::inet_address> _replacing_nodes_pending_ranges_updater;
 private:
    /**
     * Handle node bootstrap
@@ -641,6 +642,8 @@ private:
     */
    void handle_state_replacing(inet_address endpoint);

+    void handle_state_replacing_update_pending_ranges(mutable_token_metadata_ptr tmptr, inet_address replacing_node);
+
 private:
    void excise(std::unordered_set<token> tokens, inet_address endpoint);
    void excise(std::unordered_set<token> tokens, inet_address endpoint, long expire_time);
--- a/sstables/compaction.cc
+++ b/sstables/compaction.cc
@@ -468,7 +468,6 @@ protected:
    mutation_source_metadata _ms_metadata = {};
    garbage_collected_sstable_writer::data _gc_sstable_writer_data;
    compaction_sstable_replacer_fn _replacer;
-    std::optional<compaction_weight_registration> _weight_registration;
    utils::UUID _run_identifier;
    ::io_priority_class _io_priority;
    // optional clone of sstable set to be used for expiration purposes, so it will be set if expiration is enabled.
@@ -487,7 +486,6 @@ protected:
        , _sstable_level(descriptor.level)
        , _gc_sstable_writer_data(*this)
        , _replacer(std::move(descriptor.replacer))
-        , _weight_registration(std::move(descriptor.weight_registration))
        , _run_identifier(descriptor.run_identifier)
        , _io_priority(descriptor.io_priority)
        , _sstable_set(std::move(descriptor.all_sstables_snapshot))
@@ -951,9 +949,6 @@ public:
    }

    virtual void on_end_of_compaction() override {
-        if (_weight_registration) {
-            _cf.get_compaction_manager().on_compaction_complete(*_weight_registration);
-        }
        replace_remaining_exhausted_sstables();
    }

--- a/sstables/compaction_descriptor.hh
+++ b/sstables/compaction_descriptor.hh
@@ -134,8 +134,6 @@ struct compaction_descriptor {
    uint64_t max_sstable_bytes;
    // Run identifier of output sstables.
    utils::UUID run_identifier;
-    // Holds ownership of a weight assigned to this compaction iff it's a regular one.
-    std::optional<compaction_weight_registration> weight_registration;
    // Calls compaction manager's task for this compaction to release reference to exhausted sstables.
    std::function<void(const std::vector<shared_sstable>& exhausted_sstables)> release_exhausted;
    // The options passed down to the compaction code.
--- a/sstables/compaction_manager.cc
+++ b/sstables/compaction_manager.cc
@@ -314,6 +314,7 @@ future<> compaction_manager::run_custom_job(column_family* cf, sstring name, non
            cmlog.info("{} was abruptly stopped, reason: {}", name, e.what());
        } catch (...) {
            cmlog.error("{} failed: {}", name, std::current_exception());
+            throw;
        }
    });
    return task->compaction_done.get_future().then([task] {});
@@ -438,7 +439,7 @@ void compaction_manager::reevaluate_postponed_compactions() {
 }

 void compaction_manager::postpone_compaction_for_column_family(column_family* cf) {
-    _postponed.push_back(cf);
+    _postponed.insert(cf);
 }

 future<> compaction_manager::stop_ongoing_compactions(sstring reason) {
@@ -578,7 +579,7 @@ void compaction_manager::submit(column_family* cf) {
                return make_ready_future<stop_iteration>(stop_iteration::yes);
            }
            auto compacting = make_lw_shared<compacting_sstable_registration>(this, descriptor.sstables);
-            descriptor.weight_registration = compaction_weight_registration(this, weight);
+            auto weight_r = compaction_weight_registration(this, weight);
            descriptor.release_exhausted = [compacting] (const std::vector<sstables::shared_sstable>& exhausted_sstables) {
                compacting->release_compacting(exhausted_sstables);
            };
@@ -588,7 +589,7 @@ void compaction_manager::submit(column_family* cf) {
            _stats.pending_tasks--;
            _stats.active_tasks++;
            task->compaction_running = true;
-            return cf.run_compaction(std::move(descriptor)).then_wrapped([this, task, compacting = std::move(compacting)] (future<> f) mutable {
+            return cf.run_compaction(std::move(descriptor)).then_wrapped([this, task, compacting = std::move(compacting), weight_r = std::move(weight_r)] (future<> f) mutable {
                _stats.active_tasks--;
                task->compaction_running = false;

@@ -853,12 +854,15 @@ future<> compaction_manager::remove(column_family* cf) {
            task->stopping = true;
        }
    }
-    _postponed.erase(boost::remove(_postponed, cf), _postponed.end());
+    _postponed.erase(cf);

    // Wait for the termination of an ongoing compaction on cf, if any.
    return do_for_each(*tasks_to_stop, [this, cf] (auto& task) {
        return this->task_stop(task);
    }).then([this, cf, tasks_to_stop] {
+#ifdef DEBUG
+        assert(std::find_if(_tasks.begin(), _tasks.end(), [cf] (auto& task) { return task->compacting_cf == cf; }) == _tasks.end());
+#endif
        _compaction_locks.erase(cf);
    });
 }
@@ -885,11 +889,6 @@ void compaction_manager::stop_compaction(sstring type) {
    }
 }

-void compaction_manager::on_compaction_complete(compaction_weight_registration& weight_registration) {
-    weight_registration.deregister();
-    reevaluate_postponed_compactions();
-}
-
 void compaction_manager::propagate_replacement(column_family* cf,
        const std::vector<sstables::shared_sstable>& removed, const std::vector<sstables::shared_sstable>& added) {
    for (auto& info : _compactions) {
--- a/sstables/compaction_manager.hh
+++ b/sstables/compaction_manager.hh
@@ -100,7 +100,7 @@ private:
    future<> _waiting_reevalution = make_ready_future<>();
    condition_variable _postponed_reevaluation;
    // column families that wait for compaction but had its submission postponed due to ongoing compaction.
-    std::vector<column_family*> _postponed;
+    std::unordered_set<column_family*> _postponed;
    // tracks taken weights of ongoing compactions, only one compaction per weight is allowed.
    // weight is value assigned to a compaction job that is log base N of total size of all input sstables.
    std::unordered_set<int> _weight_tracker;
@@ -257,11 +257,6 @@ public:
    // Stops ongoing compaction of a given type.
    void stop_compaction(sstring type);

-    // Called by compaction procedure to release the weight lock assigned to it, such that
-    // another compaction waiting on same weight can start as soon as possible. That's usually
-    // called before compaction seals sstable and such and after all compaction work is done.
-    void on_compaction_complete(compaction_weight_registration& weight_registration);
-
    double backlog() {
        return _backlog_manager.backlog();
    }
--- a/sstables/compaction_strategy.cc
+++ b/sstables/compaction_strategy.cc
@@ -503,7 +503,8 @@ date_tiered_manifest::get_compaction_candidates(column_family& cf, std::vector<s

 int64_t date_tiered_manifest::get_now(column_family& cf) {
    int64_t max_timestamp = 0;
-    for (auto& sst : *cf.get_sstables()) {
+    auto shared_set = cf.get_sstables();
+    for (auto& sst : *shared_set) {
        int64_t candidate = sst->get_stats_metadata().max_timestamp;
        max_timestamp = candidate > max_timestamp ? candidate : max_timestamp;
    }
--- a/sstables/kl/writer.cc
+++ b/sstables/kl/writer.cc
@@ -124,7 +124,7 @@ void sstable_writer_k_l::maybe_flush_pi_block(file_writer& out,
        // block includes them), but we set block_next_start_offset after - so
        // even if we wrote a lot of open tombstones, we still get a full
        // block size of new data.
-        auto& rts = _pi_write.tombstone_accumulator->range_tombstones_for_row(
+        auto rts = _pi_write.tombstone_accumulator->range_tombstones_for_row(
                clustering_key_prefix::from_range(clustering_key.values()));
        for (const auto& rt : rts) {
            auto start = composite::from_clustering_element(*_pi_write.schemap, rt.start);
--- a/sstables/leveled_compaction_strategy.cc
+++ b/sstables/leveled_compaction_strategy.cc
@@ -78,7 +78,11 @@ compaction_descriptor leveled_compaction_strategy::get_major_compaction_job(colu
 }

 void leveled_compaction_strategy::notify_completion(const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) {
-    if (removed.empty() || added.empty()) {
+    // The updates here are only relevant for regular compaction's round-robin picking policy. If
+    // last_compacted_keys wasn't generated by regular compaction, regular compaction has been disabled since
+    // the last restart, so we can skip the updates here until regular compaction runs for the first time.
+    // Once it runs, it will be able to generate last_compacted_keys correctly by looking at metadata of files.
+    if (removed.empty() || added.empty() || !_last_compacted_keys) {
        return;
    }
    auto min_level = std::numeric_limits<uint32_t>::max();
@@ -147,7 +151,7 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input
        unsigned overlapping_sstables = 0;
        auto prev_last = dht::ring_position::min();
        for (auto& sst : sstables) {
-            if (dht::ring_position(sst->get_first_decorated_key()).less_compare(*schema, prev_last)) {
+            if (dht::ring_position(sst->get_first_decorated_key()).tri_compare(*schema, prev_last) <= 0) {
                overlapping_sstables++;
            }
            prev_last = dht::ring_position(sst->get_last_decorated_key());
@@ -193,7 +197,7 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input
    // If there's only disjoint L0 sstables like on bootstrap, let's compact them all into a level L which has capacity to store the output.
    // The best possible level can be calculated with the formula: log (base fan_out) of (L0_total_bytes / max_sstable_size)
    auto [l0_disjoint, _] = is_disjoint(level_info[0], 0);
-    if (mode == reshape_mode::strict && level_info[0].size() == input.size() && l0_disjoint) {
+    if (mode == reshape_mode::strict && level_info[0].size() >= offstrategy_threshold && level_info[0].size() == input.size() && l0_disjoint) {
        auto log_fanout = [fanout = leveled_manifest::leveled_fan_out] (double x) {
            double inv_log_fanout = 1.0f / std::log(fanout);
            return log(x) * inv_log_fanout;
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				`Dedicated to the memory of Alberto José Araújo, a coworker and a friend.`