release: prepare for 4.2.1

Merge 'Move temporaries to value view' from Piotr S
" Issue https://github.com/scylladb/scylla/issues/7019 describes a problem of an ever-growing map of temporary values stored in query_options. In order to mitigate this kind of problem, the storage for temporary values is moved from an external data structure to the value views themselves. This way, the temporary lives only as long as it's accessible and is automatically destroyed once a request finishes. The downside is that each temporary is now allocated separately, while previously they were bundled in a single byte stream. Tests: unit(dev) Fixes https://github.com/scylladb/scylla/issues/7019 " 7055297649 ("cql3: remove query_options::linearize and _temporaries") is reverted from this backport since linearize() is still used in this branch. * psarna-move_temporaries_to_value_view: cql3: remove query_options::linearize and _temporaries cql3: remove make_temporary helper function cql3: store temporaries in-place instead of in query_options cql3: add temporary_value to value view cql3: allow moving data out of raw_value cql3: split values.hh into a .cc file (cherry picked from commit 2b308a973f)
2020-11-08 12:41:06 +02:00 · 2020-11-05 19:24:23 +02:00 · 2020-11-05 17:53:08 +02:00 · 2020-10-25 09:12:38 +02:00 · 2020-10-23 17:18:02 +02:00 · 2020-10-23 18:09:45 +03:00
136 changed files with 3437 additions and 749 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/2
+++ b/2
@@ -1,7 +1,7 @@
 #!/bin/sh

 PRODUCT=scylla
-VERSION=666.development
+VERSION=4.2.1

 if test -f version
 then
--- a/alternator/auth.cc
+++ b/alternator/auth.cc
@@ -129,7 +129,7 @@ future<std::string> get_key_from_roles(cql3::query_processor& qp, std::string us
            auth::meta::roles_table::qualified_name(), auth::meta::roles_table::role_col_name);

    auto cl = auth::password_authenticator::consistency_for_user(username);
-    auto timeout = auth::internal_distributed_timeout_config();
+    auto& timeout = auth::internal_distributed_timeout_config();
    return qp.execute_internal(query, cl, timeout, {sstring(username)}, true).then_wrapped([username = std::move(username)] (future<::shared_ptr<cql3::untyped_result_set>> f) {
        auto res = f.get0();
        auto salted_hash = std::optional<sstring>();
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -98,6 +98,11 @@ struct nonempty : public size_check {

 // Check that array has the expected number of elements
 static void verify_operand_count(const rjson::value* array, const size_check& expected, const rjson::value& op) {
+    if (!array && expected(0)) {
+        // If expected() allows an empty AttributeValueList, it is also fine
+        // that it is missing.
+        return;
+    }
    if (!array || !array->IsArray()) {
        throw api_error("ValidationException", "With ComparisonOperator, AttributeValueList must be given and an array");
    }
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -626,11 +626,8 @@ void rmw_operation::set_default_write_isolation(std::string_view value) {
    default_write_isolation = parse_write_isolation(value);
 }

-// FIXME: Updating tags currently relies on updating schema, which may be subject
-// to races during concurrent updates of the same table. Once Scylla schema updates
-// are fixed, this issue will automatically get fixed as well.
 enum class update_tags_action { add_tags, delete_tags };
-static future<> update_tags(service::migration_manager& mm, const rjson::value& tags, schema_ptr schema, std::map<sstring, sstring>&& tags_map, update_tags_action action) {
+static void update_tags_map(const rjson::value& tags, std::map<sstring, sstring>& tags_map, update_tags_action action) {
    if (action == update_tags_action::add_tags) {
        for (auto it = tags.Begin(); it != tags.End(); ++it) {
            const rjson::value& key = (*it)["Key"];
@@ -652,28 +649,20 @@ static future<> update_tags(service::migration_manager& mm, const rjson::value&
    }

    if (tags_map.size() > 50) {
-        return make_exception_future<>(api_error("ValidationException", "Number of Tags exceed the current limit for the provided ResourceArn"));
+        throw api_error("ValidationException", "Number of Tags exceed the current limit for the provided ResourceArn");
    }
    validate_tags(tags_map);
+}

+// FIXME: Updating tags currently relies on updating schema, which may be subject
+// to races during concurrent updates of the same table. Once Scylla schema updates
+// are fixed, this issue will automatically get fixed as well.
+static future<> update_tags(service::migration_manager& mm, schema_ptr schema, std::map<sstring, sstring>&& tags_map) {
    schema_builder builder(schema);
    builder.set_extensions(schema::extensions_map{{sstring(tags_extension::NAME), ::make_shared<tags_extension>(std::move(tags_map))}});
    return mm.announce_column_family_update(builder.build(), false, std::vector<view_ptr>(), false);
 }

-static future<> add_tags(service::migration_manager& mm, service::storage_proxy& proxy, schema_ptr schema, rjson::value& request_info) {
-    const rjson::value* tags = rjson::find(request_info, "Tags");
-    if (!tags || !tags->IsArray()) {
-        return make_exception_future<>(api_error("ValidationException", format("Cannot parse tags")));
-    }
-    if (tags->Size() < 1) {
-        return make_exception_future<>(api_error("ValidationException", "The number of tags must be at least 1"));
-    }
-
-    std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
-    return update_tags(mm, rjson::copy(*tags), schema, std::move(tags_map), update_tags_action::add_tags);
-}
-
 future<executor::request_return_type> executor::tag_resource(client_state& client_state, service_permit permit, rjson::value request) {
    _stats.api_operations.tag_resource++;

@@ -683,7 +672,16 @@ future<executor::request_return_type> executor::tag_resource(client_state& clien
            return api_error("AccessDeniedException", "Incorrect resource identifier");
        }
        schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn));
-        add_tags(_mm, _proxy, schema, request).get();
+        std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
+        const rjson::value* tags = rjson::find(request, "Tags");
+        if (!tags || !tags->IsArray()) {
+            return api_error("ValidationException", format("Cannot parse tags"));
+        }
+        if (tags->Size() < 1) {
+            return api_error("ValidationException", "The number of tags must be at least 1") ;
+        }
+        update_tags_map(*tags, tags_map,  update_tags_action::add_tags);
+        update_tags(_mm, schema, std::move(tags_map)).get();
        return json_string("");
    });
 }
@@ -704,7 +702,8 @@ future<executor::request_return_type> executor::untag_resource(client_state& cli
        schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn));

        std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
-        update_tags(_mm, *tags, schema, std::move(tags_map), update_tags_action::delete_tags).get();
+        update_tags_map(*tags, tags_map, update_tags_action::delete_tags);
+        update_tags(_mm, schema, std::move(tags_map)).get();
        return json_string("");
    });
 }
@@ -891,9 +890,22 @@ future<executor::request_return_type> executor::create_table(client_state& clien
            view_builders.emplace_back(std::move(view_builder));
        }
    }
-    if (rjson::find(request, "SSESpecification")) {
-        return make_ready_future<request_return_type>(api_error("ValidationException", "SSESpecification: configuring encryption-at-rest is not yet supported."));
+
+    // We don't yet support configuring server-side encryption (SSE) via the
+    // SSESpecification attribute, but an SSESpecification with Enabled=false
+    // is simply the default, and should be accepted:
+    rjson::value* sse_specification = rjson::find(request, "SSESpecification");
+    if (sse_specification && sse_specification->IsObject()) {
+        rjson::value* enabled = rjson::find(*sse_specification, "Enabled");
+        if (!enabled || !enabled->IsBool()) {
+            return make_ready_future<request_return_type>(api_error("ValidationException", "SSESpecification needs boolean Enabled"));
+        }
+        if (enabled->GetBool()) {
+            // TODO: full support for SSESpecification
+            return make_ready_future<request_return_type>(api_error("ValidationException", "SSESpecification: configuring encryption-at-rest is not yet supported."));
+        }
    }
+
    // We don't yet support streams (CDC), but a StreamSpecification asking
    // *not* to use streams should be accepted:
    rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
@@ -908,6 +920,14 @@ future<executor::request_return_type> executor::create_table(client_state& clien
        }
    }

+    // Parse the "Tags" parameter early, so we can avoid creating the table
+    // at all if this parsing failed.
+    const rjson::value* tags = rjson::find(request, "Tags");
+    std::map<sstring, sstring> tags_map;
+    if (tags && tags->IsArray()) {
+        update_tags_map(*tags, tags_map, update_tags_action::add_tags);
+    }
+
    builder.set_extensions(schema::extensions_map{{sstring(tags_extension::NAME), ::make_shared<tags_extension>()}});
    schema_ptr schema = builder.build();
    auto where_clause_it = where_clauses.begin();
@@ -928,14 +948,14 @@ future<executor::request_return_type> executor::create_table(client_state& clien

    return create_keyspace(keyspace_name).handle_exception_type([] (exceptions::already_exists_exception&) {
            // Ignore the fact that the keyspace may already exist. See discussion in #6340
-        }).then([this, table_name, request = std::move(request), schema, view_builders = std::move(view_builders)] () mutable {
-        return futurize_invoke([&] { return _mm.announce_new_column_family(schema, false); }).then([this, table_info = std::move(request), schema, view_builders = std::move(view_builders)] () mutable {
+        }).then([this, table_name, request = std::move(request), schema, view_builders = std::move(view_builders), tags_map = std::move(tags_map)] () mutable {
+        return futurize_invoke([&] { return _mm.announce_new_column_family(schema, false); }).then([this, table_info = std::move(request), schema, view_builders = std::move(view_builders), tags_map = std::move(tags_map)] () mutable {
            return parallel_for_each(std::move(view_builders), [this, schema] (schema_builder builder) {
                return _mm.announce_new_view(view_ptr(builder.build()));
-            }).then([this, table_info = std::move(table_info), schema] () mutable {
+            }).then([this, table_info = std::move(table_info), schema, tags_map = std::move(tags_map)] () mutable {
                future<> f = make_ready_future<>();
-                if (rjson::find(table_info, "Tags")) {
-                    f = add_tags(_mm, _proxy, schema, table_info);
+                if (!tags_map.empty()) {
+                    f = update_tags(_mm, schema, std::move(tags_map));
                }
                return f.then([this] {
                    return wait_for_schema_agreement(_mm, db::timeout_clock::now() + 10s);
@@ -963,15 +983,24 @@ class attribute_collector {
    void add(bytes&& name, atomic_cell&& cell) {
        collected.emplace(std::move(name), std::move(cell));
    }
+    void add(const bytes& name, atomic_cell&& cell) {
+        collected.emplace(name, std::move(cell));
+    }
 public:
    attribute_collector() : collected(attrs_type()->get_keys_type()->as_less_comparator()) { }
-    void put(bytes&& name, bytes&& val, api::timestamp_type ts) {
-        add(std::move(name), atomic_cell::make_live(*bytes_type, ts, std::move(val), atomic_cell::collection_member::yes));
+    void put(bytes&& name, const bytes& val, api::timestamp_type ts) {
+        add(std::move(name), atomic_cell::make_live(*bytes_type, ts, val, atomic_cell::collection_member::yes));

    }
+    void put(const bytes& name, const bytes& val, api::timestamp_type ts) {
+        add(name, atomic_cell::make_live(*bytes_type, ts, val, atomic_cell::collection_member::yes));
+    }
    void del(bytes&& name, api::timestamp_type ts) {
        add(std::move(name), atomic_cell::make_dead(ts, gc_clock::now()));
    }
+    void del(const bytes& name, api::timestamp_type ts) {
+        add(name, atomic_cell::make_dead(ts, gc_clock::now()));
+    }
    collection_mutation_description to_mut() {
        collection_mutation_description ret;
        for (auto&& e : collected) {
@@ -1048,7 +1077,7 @@ public:
    put_or_delete_item(const rjson::value& item, schema_ptr schema, put_item);
    // put_or_delete_item doesn't keep a reference to schema (so it can be
    // moved between shards for LWT) so it needs to be given again to build():
-    mutation build(schema_ptr schema, api::timestamp_type ts);
+    mutation build(schema_ptr schema, api::timestamp_type ts) const;
    const partition_key& pk() const { return _pk; }
    const clustering_key& ck() const { return _ck; }
 };
@@ -1077,7 +1106,7 @@ put_or_delete_item::put_or_delete_item(const rjson::value& item, schema_ptr sche
    }
 }

-mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) {
+mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) const {
    mutation m(schema, _pk);
    // If there's no clustering key, a tombstone should be created directly
    // on a partition, not on a clustering row - otherwise it will look like
@@ -1099,7 +1128,7 @@ mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) {
    for (auto& c : *_cells) {
        const column_definition* cdef = schema->get_column_definition(c.column_name);
        if (!cdef) {
-            attrs_collector.put(std::move(c.column_name), std::move(c.value), ts);
+            attrs_collector.put(c.column_name, c.value, ts);
        } else {
            row.cells().apply(*cdef, atomic_cell::make_live(*cdef->type, ts, std::move(c.value)));
        }
@@ -1410,7 +1439,7 @@ public:
               check_needs_read_before_write(_condition_expression) ||
               _returnvalues == returnvalues::ALL_OLD;
    }
-    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) override {
+    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override {
        if (!verify_expected(_request, previous_item.get()) ||
            !verify_condition_expression(_condition_expression, previous_item.get())) {
            // If the update is to be cancelled because of an unfulfilled Expected
@@ -1420,6 +1449,8 @@ public:
        }
        if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
            _return_attributes = std::move(*previous_item);
+        } else {
+            _return_attributes = {};
        }
        return _mutation_builder.build(_schema, ts);
    }
@@ -1493,7 +1524,7 @@ public:
                check_needs_read_before_write(_condition_expression) ||
                _returnvalues == returnvalues::ALL_OLD;
    }
-    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) override {
+    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override {
        if (!verify_expected(_request, previous_item.get()) ||
            !verify_condition_expression(_condition_expression, previous_item.get())) {
            // If the update is to be cancelled because of an unfulfilled Expected
@@ -1503,6 +1534,8 @@ public:
        }
        if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
            _return_attributes = std::move(*previous_item);
+        } else {
+            _return_attributes = {};
        }
        return _mutation_builder.build(_schema, ts);
    }
@@ -1577,7 +1610,7 @@ public:
    virtual ~put_or_delete_item_cas_request() = default;
    virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts) override {
        std::optional<mutation> ret;
-        for (put_or_delete_item& mutation_builder : _mutation_builders) {
+        for (const put_or_delete_item& mutation_builder : _mutation_builders) {
            // We assume all these builders have the same partition.
            if (ret) {
                ret->apply(mutation_builder.build(schema, ts));
@@ -1906,7 +1939,7 @@ public:

    update_item_operation(service::storage_proxy& proxy, rjson::value&& request);
    virtual ~update_item_operation() = default;
-    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) override;
+    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override;
    bool needs_read_before_write() const;
 };

@@ -1984,7 +2017,7 @@ update_item_operation::needs_read_before_write() const {
 }

 std::optional<mutation>
-update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) {
+update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const {
    if (!verify_expected(_request, previous_item.get()) ||
        !verify_condition_expression(_condition_expression, previous_item.get())) {
        // If the update is to be cancelled because of an unfulfilled
--- a/alternator/rmw_operation.hh
+++ b/alternator/rmw_operation.hh
@@ -87,7 +87,11 @@ protected:
    // When _returnvalues != NONE, apply() should store here, in JSON form,
    // the values which are to be returned in the "Attributes" field.
    // The default null JSON means do not return an Attributes field at all.
-    rjson::value _return_attributes;
+    // This field is marked "mutable" so that the const apply() can modify
+    // it (see explanation below), but note that because apply() may be
+    // called more than once, if apply() will sometimes set this field it
+    // must set it (even if just to the default empty value) every time.
+    mutable rjson::value _return_attributes;
 public:
    // The constructor of a rmw_operation subclass should parse the request
    // and try to discover as many input errors as it can before really
@@ -100,7 +104,12 @@ public:
    // conditional expression, apply() should return an empty optional.
    // apply() may throw if it encounters input errors not discovered during
    // the constructor.
-    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) = 0;
+    // apply() may be called more than once in case of contention, so it must
+    // not change the state saved in the object (issue #7218 was caused by
+    // violating this). We mark apply() "const" to let the compiler validate
+    // this for us. The output-only field _return_attributes is marked
+    // "mutable" above so that apply() can still write to it.
+    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const = 0;
    // Convert the above apply() into the signature needed by cas_request:
    virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts) override;
    virtual ~rmw_operation() = default;
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -322,8 +322,8 @@ void set_storage_service(http_context& ctx, routes& r) {
                for (auto cf : column_families) {
                    column_families_vec.push_back(&db.find_column_family(keyspace, cf));
                }
-                return parallel_for_each(column_families_vec, [&cm] (column_family* cf) {
-                    return cm.perform_cleanup(cf);
+                return parallel_for_each(column_families_vec, [&cm, &db] (column_family* cf) {
+                    return cm.perform_cleanup(db, cf);
                });
            }).then([]{
                return make_ready_future<json::json_return_type>(0);
--- a/configure.py
+++ b/configure.py
@@ -386,6 +386,7 @@ scylla_tests = set([
    'test/boost/view_schema_ckey_test',
    'test/boost/vint_serialization_test',
    'test/boost/virtual_reader_test',
+    'test/boost/stall_free_test',
    'test/manual/ec2_snitch_test',
    'test/manual/gce_snitch_test',
    'test/manual/gossip',
@@ -573,6 +574,7 @@ scylla_core = (['database.cc',
                'cql3/sets.cc',
                'cql3/tuples.cc',
                'cql3/maps.cc',
+                'cql3/values.cc',
                'cql3/functions/user_function.cc',
                'cql3/functions/functions.cc',
                'cql3/functions/aggregate_fcts.cc',
--- a/cql3/functions/aggregate_fcts.cc
+++ b/cql3/functions/aggregate_fcts.cc
@@ -267,10 +267,13 @@ public:
    }
 };

-/// The same as `impl_max_function_for' but without knowledge of `Type'.
+/// The same as `impl_max_function_for' but without compile-time dependency on `Type'.
 class impl_max_dynamic_function final : public aggregate_function::aggregate {
+    data_type _io_type;
    opt_bytes _max;
 public:
+    impl_max_dynamic_function(data_type io_type) : _io_type(std::move(io_type)) {}
+
    virtual void reset() override {
        _max = {};
    }
@@ -278,12 +281,11 @@ public:
        return _max.value_or(bytes{});
    }
    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
-        if (!values[0]) {
+        if (values.empty() || !values[0]) {
            return;
        }
-        const auto val = *values[0];
-        if (!_max || *_max < val) {
-            _max = val;
+        if (!_max || _io_type->less(*_max, *values[0])) {
+            _max = values[0];
        }
    }
 };
@@ -298,10 +300,13 @@ public:
 };

 class max_dynamic_function final : public native_aggregate_function {
+    data_type _io_type;
 public:
-    max_dynamic_function(data_type io_type) : native_aggregate_function("max", io_type, { io_type }) {}
+    max_dynamic_function(data_type io_type)
+            : native_aggregate_function("max", io_type, { io_type })
+            , _io_type(std::move(io_type)) {}
    virtual std::unique_ptr<aggregate> new_aggregate() override {
-        return std::make_unique<impl_max_dynamic_function>();
+        return std::make_unique<impl_max_dynamic_function>(_io_type);
    }
 };

@@ -358,10 +363,13 @@ public:
    }
 };

-/// The same as `impl_min_function_for' but without knowledge of `Type'.
+/// The same as `impl_min_function_for' but without compile-time dependency on `Type'.
 class impl_min_dynamic_function final : public aggregate_function::aggregate {
+    data_type _io_type;
    opt_bytes _min;
 public:
+    impl_min_dynamic_function(data_type io_type) : _io_type(std::move(io_type)) {}
+
    virtual void reset() override {
        _min = {};
    }
@@ -369,12 +377,11 @@ public:
        return _min.value_or(bytes{});
    }
    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
-        if (!values[0]) {
+        if (values.empty() || !values[0]) {
            return;
        }
-        const auto val = *values[0];
-        if (!_min || val < *_min) {
-            _min = val;
+        if (!_min || _io_type->less(*values[0], *_min)) {
+            _min = values[0];
        }
    }
 };
@@ -389,10 +396,13 @@ public:
 };

 class min_dynamic_function final : public native_aggregate_function {
+    data_type _io_type;
 public:
-    min_dynamic_function(data_type io_type) : native_aggregate_function("min", io_type, { io_type }) {}
+    min_dynamic_function(data_type io_type)
+            : native_aggregate_function("min", io_type, { io_type })
+            , _io_type(std::move(io_type)) {}
    virtual std::unique_ptr<aggregate> new_aggregate() override {
-        return std::make_unique<impl_min_dynamic_function>();
+        return std::make_unique<impl_min_dynamic_function>(_io_type);
    }
 };

--- a/cql3/functions/castas_fcts.cc
+++ b/cql3/functions/castas_fcts.cc
@@ -88,16 +88,13 @@ static data_value castas_fctn_simple(data_value from) {
 template<typename ToType>
 static data_value castas_fctn_from_decimal_to_float(data_value from) {
    auto val_from = value_cast<big_decimal>(from);
-    boost::multiprecision::cpp_int ten(10);
-    boost::multiprecision::cpp_rational r = val_from.unscaled_value();
-    r /= boost::multiprecision::pow(ten, val_from.scale());
-    return static_cast<ToType>(r);
+    return static_cast<ToType>(val_from.as_rational());
 }

 static utils::multiprecision_int from_decimal_to_cppint(const data_value& from) {
    const auto& val_from = value_cast<big_decimal>(from);
-    boost::multiprecision::cpp_int ten(10);
-    return boost::multiprecision::cpp_int(val_from.unscaled_value() / boost::multiprecision::pow(ten, val_from.scale()));
+    auto r = val_from.as_rational();
+    return utils::multiprecision_int(numerator(r)/denominator(r));
 }

 template<typename ToType>
--- a/cql3/functions/functions.cc
+++ b/cql3/functions/functions.cc
@@ -445,7 +445,7 @@ function_call::bind_and_get(const query_options& options) {
        buffers.push_back(std::move(to_bytes_opt(val)));
    }
    auto result = execute_internal(options.get_cql_serialization_format(), *_fun, std::move(buffers));
-    return options.make_temporary(cql3::raw_value::make_value(result));
+    return cql3::raw_value_view::make_temporary(cql3::raw_value::make_value(result));
 }

 bytes_opt
--- a/cql3/lists.cc
+++ b/cql3/lists.cc
@@ -357,7 +357,12 @@ lists::setter_by_uuid::execute(mutation& m, const clustering_key_prefix& prefix,

    collection_mutation_description mut;
    mut.cells.reserve(1);
-    mut.cells.emplace_back(to_bytes(*index), params.make_cell(*ltype->value_comparator(), *value, atomic_cell::collection_member::yes));
+
+    if (!value) {
+        mut.cells.emplace_back(to_bytes(*index), params.make_dead_cell());
+    } else {
+        mut.cells.emplace_back(to_bytes(*index), params.make_cell(*ltype->value_comparator(), *value, atomic_cell::collection_member::yes));
+    }

    m.set_cell(prefix, column, mut.serialize(*ltype));
 }
--- a/cql3/query_options.cc
+++ b/cql3/query_options.cc
@@ -161,17 +161,6 @@ query_options::query_options(std::vector<cql3::raw_value> values)
          db::consistency_level::ONE, infinite_timeout_config, std::move(values))
 {}

-cql3::raw_value_view query_options::make_temporary(cql3::raw_value value) const
-{
-    if (value) {
-        auto value_view = *value;
-        auto ptr = _temporaries.write_place_holder(value_view.size());
-        std::copy_n(value_view.data(), value_view.size(), ptr);
-        return cql3::raw_value_view::make_value(fragmented_temporary_buffer::view(bytes_view{ptr, value_view.size()}));
-    }
-    return cql3::raw_value_view::make_null();
-}
-
 bytes_view query_options::linearize(fragmented_temporary_buffer::view view) const
 {
    if (view.empty()) {
--- a/cql3/query_options.hh
+++ b/cql3/query_options.hh
@@ -178,7 +178,6 @@ public:
        return _value_views.size();
    }

-    cql3::raw_value_view make_temporary(cql3::raw_value value) const;
    bytes_view linearize(fragmented_temporary_buffer::view) const;

    bool skip_metadata() const {
--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -417,7 +417,7 @@ std::vector<const column_definition*> statement_restrictions::get_column_defs_fo
                    _clustering_columns_restrictions->num_prefix_columns_that_need_not_be_filtered();
            for (auto&& cdef : _clustering_columns_restrictions->get_column_defs()) {
                ::shared_ptr<single_column_restriction> restr;
-                if (single_pk_restrs) {
+                if (single_ck_restrs) {
                    auto it = single_ck_restrs->restrictions().find(cdef);
                    if (it != single_ck_restrs->restrictions().end()) {
                        restr = dynamic_pointer_cast<single_column_restriction>(it->second);
@@ -688,6 +688,11 @@ static query::range<bytes_view> to_range(const term_slice& slice, const query_op
        extract_bound(statements::bound::END));
 }

+static bool contains_without_wraparound(
+        const query::range<bytes_view>& range, bytes_view value, const serialized_tri_compare& cmp) {
+    return !range.is_wrap_around(cmp) && range.contains(value, cmp);
+}
+
 bool single_column_restriction::slice::is_satisfied_by(const schema& schema,
        const partition_key& key,
        const clustering_key_prefix& ckey,
@@ -702,13 +707,13 @@ bool single_column_restriction::slice::is_satisfied_by(const schema& schema,
        return false;
    }
    return cell_value->with_linearized([&] (bytes_view cell_value_bv) {
-        return to_range(_slice, options, _column_def.name_as_text()).contains(
+        return contains_without_wraparound(to_range(_slice, options, _column_def.name_as_text()),
                cell_value_bv, _column_def.type->as_tri_comparator());
    });
 }

 bool single_column_restriction::slice::is_satisfied_by(bytes_view data, const query_options& options) const {
-    return to_range(_slice, options, _column_def.name_as_text()).contains(
+    return contains_without_wraparound(to_range(_slice, options, _column_def.name_as_text()),
            data, _column_def.type->underlying_type()->as_tri_comparator());
 }

--- a/cql3/statements/alter_table_statement.cc
+++ b/cql3/statements/alter_table_statement.cc
@@ -207,6 +207,9 @@ void alter_table_statement::add_column(const schema& schema, const table& cf, sc
                "because a collection with the same name and a different type has already been used in the past", column_name));
        }
    }
+    if (type->is_counter() && !schema.is_counter()) {
+        throw exceptions::configuration_exception(format("Cannot add a counter column ({}) in a non counter column family", column_name));
+    }

    cfm.with_column(column_name.name(), type, is_static ? column_kind::static_column : column_kind::regular_column);

@@ -222,7 +225,7 @@ void alter_table_statement::add_column(const schema& schema, const table& cf, sc
            schema_builder builder(view);
            if (view->view_info()->include_all_columns()) {
                builder.with_column(column_name.name(), type);
-            } else if (view->view_info()->base_non_pk_columns_in_view_pk().empty()) {
+            } else if (!view->view_info()->has_base_non_pk_columns_in_view_pk()) {
                db::view::create_virtual_column(builder, column_name.name(), type);
            }
            view_updates.push_back(view_ptr(builder.build()));
--- a/cql3/term.hh
+++ b/cql3/term.hh
@@ -190,7 +190,7 @@ public:
    virtual cql3::raw_value get(const query_options& options) = 0;

    virtual cql3::raw_value_view bind_and_get(const query_options& options) override {
-        return options.make_temporary(get(options));
+        return raw_value_view::make_temporary(get(options));
    }

    virtual sstring to_string() const = 0;
@@ -227,7 +227,7 @@ public:
    virtual cql3::raw_value_view bind_and_get(const query_options& options) override {
        auto t = bind(options);
        if (t) {
-            return options.make_temporary(t->get(options));
+            return cql3::raw_value_view::make_temporary(t->get(options));
        }
        return cql3::raw_value_view::make_null();
    };
--- a/cql3/tuples.hh
+++ b/cql3/tuples.hh
@@ -184,7 +184,7 @@ public:

        virtual cql3::raw_value_view bind_and_get(const query_options& options) override {
            // We don't "need" that override but it saves us the allocation of a Value object if used
-            return options.make_temporary(cql3::raw_value::make_value(_type->build_value(bind_internal(options))));
+            return cql3::raw_value_view::make_temporary(cql3::raw_value::make_value(_type->build_value(bind_internal(options))));
        }
    };

--- a/cql3/user_types.cc
+++ b/cql3/user_types.cc
@@ -227,7 +227,7 @@ shared_ptr<terminal> user_types::delayed_value::bind(const query_options& option
 }

 cql3::raw_value_view user_types::delayed_value::bind_and_get(const query_options& options) {
-    return options.make_temporary(cql3::raw_value::make_value(user_type_impl::build_value(bind_internal(options))));
+    return cql3::raw_value_view::make_temporary(cql3::raw_value::make_value(user_type_impl::build_value(bind_internal(options))));
 }

 shared_ptr<terminal> user_types::marker::bind(const query_options& options) {
--- a/cql3/values.cc
+++ b/cql3/values.cc
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "cql3/values.hh"
+
+namespace cql3 {
+
+std::ostream& operator<<(std::ostream& os, const raw_value_view& value) {
+    seastar::visit(value._data, [&] (fragmented_temporary_buffer::view v) {
+        os << "{ value: ";
+        using boost::range::for_each;
+        for_each(v, [&os] (bytes_view bv) { os << bv; });
+        os << " }";
+    }, [&] (null_value) {
+        os << "{ null }";
+    }, [&] (unset_value) {
+        os << "{ unset }";
+    });
+    return os;
+}
+
+raw_value_view raw_value::to_view() const {
+    switch (_data.index()) {
+    case 0:  return raw_value_view::make_value(fragmented_temporary_buffer::view(bytes_view{std::get<bytes>(_data)}));
+    case 1:  return raw_value_view::make_null();
+    default: return raw_value_view::make_unset_value();
+    }
+}
+
+raw_value raw_value::make_value(const raw_value_view& view) {
+    if (view.is_null()) {
+        return make_null();
+    }
+    if (view.is_unset_value()) {
+        return make_unset_value();
+    }
+    return make_value(linearized(*view));
+}
+
+raw_value_view raw_value_view::make_temporary(raw_value&& value) {
+    if (!value) {
+        return raw_value_view::make_null();
+    }
+    return raw_value_view(std::move(value).extract_value());
+}
+
+raw_value_view::raw_value_view(bytes&& tmp) {
+    _temporary_storage = make_lw_shared<bytes>(std::move(tmp));
+    _data = fragmented_temporary_buffer::view(bytes_view(*_temporary_storage));
+}
+
+}
--- a/cql3/values.hh
+++ b/cql3/values.hh
@@ -39,11 +39,22 @@ struct null_value {
 struct unset_value {
 };

+class raw_value;
 /// \brief View to a raw CQL protocol value.
 ///
 /// \see raw_value
 struct raw_value_view {
    std::variant<fragmented_temporary_buffer::view, null_value, unset_value> _data;
+    // Temporary storage is only useful if a raw_value_view needs to be instantiated
+    // with a value whose lifetime is bound only to the view itself.
+    // This hack is introduced in order to avoid storing temporary storage
+    // in an external container, which may cause memory leaking problems.
+    // This pointer is disengaged for regular raw_value_view instances.
+    // Data is stored in a shared pointer for two reasons:
+    // - pointers are cheap to copy
+    // - it makes the view keep its semantics - it's safe to copy a view multiple times
+    //   and all copies still refer to the same underlying data.
+    lw_shared_ptr<bytes> _temporary_storage = nullptr;

    raw_value_view(null_value&& data)
        : _data{std::move(data)}
@@ -54,6 +65,9 @@ struct raw_value_view {
    raw_value_view(fragmented_temporary_buffer::view data)
        : _data{data}
    {}
+    // This constructor is only used by make_temporary() and it acquires ownership
+    // of the given buffer. The view created that way refers to its own temporary storage.
+    explicit raw_value_view(bytes&& temporary_storage);
 public:
    static raw_value_view make_null() {
        return raw_value_view{std::move(null_value{})};
@@ -64,6 +78,7 @@ public:
    static raw_value_view make_value(fragmented_temporary_buffer::view view) {
        return raw_value_view{view};
    }
+    static raw_value_view make_temporary(raw_value&& value);
    bool is_null() const {
        return std::holds_alternative<null_value>(_data);
    }
@@ -102,19 +117,7 @@ public:
        return !(*this == other);
    }

-    friend std::ostream& operator<<(std::ostream& os, const raw_value_view& value) {
-        seastar::visit(value._data, [&] (fragmented_temporary_buffer::view v) {
-            os << "{ value: ";
-            using boost::range::for_each;
-            for_each(v, [&os] (bytes_view bv) { os << bv; });
-            os << " }";
-        }, [&] (null_value) {
-            os << "{ null }";
-        }, [&] (unset_value) {
-            os << "{ unset }";
-        });
-        return os;
-    }
+    friend std::ostream& operator<<(std::ostream& os, const raw_value_view& value);
 };

 /// \brief Raw CQL protocol value.
@@ -144,15 +147,7 @@ public:
    static raw_value make_unset_value() {
        return raw_value{std::move(unset_value{})};
    }
-    static raw_value make_value(const raw_value_view& view) {
-        if (view.is_null()) {
-            return make_null();
-        }
-        if (view.is_unset_value()) {
-            return make_unset_value();
-        }
-        return make_value(linearized(*view));
-    }
+    static raw_value make_value(const raw_value_view& view);
    static raw_value make_value(bytes&& bytes) {
        return raw_value{std::move(bytes)};
    }
@@ -189,13 +184,12 @@ public:
    const bytes& operator*() const {
        return std::get<bytes>(_data);
    }
-    raw_value_view to_view() const {
-        switch (_data.index()) {
-        case 0:  return raw_value_view::make_value(fragmented_temporary_buffer::view(bytes_view{std::get<bytes>(_data)}));
-        case 1:  return raw_value_view::make_null();
-        default: return raw_value_view::make_unset_value();
-        }
+    bytes&& extract_value() && {
+        auto b = std::get_if<bytes>(&_data);
+        assert(b);
+        return std::move(*b);
    }
+    raw_value_view to_view() const;
 };

 }
--- a/database.cc
+++ b/database.cc
@@ -552,14 +552,14 @@ database::~database() {
 }

 void database::update_version(const utils::UUID& version) {
-    if (_version != version) {
+    if (_version.get() != version) {
        _schema_change_count++;
    }
-    _version = version;
+    _version.set(version);
 }

 const utils::UUID& database::get_version() const {
-    return _version;
+    return _version.get();
 }

 static future<>
@@ -1851,7 +1851,11 @@ future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_fun
                            // TODO: indexes.
                            // Note: since discard_sstables was changed to only count tables owned by this shard,
                            // we can get zero rp back. Changed assert, and ensure we save at least low_mark.
-                            assert(low_mark <= rp || rp == db::replay_position());
+                            // #6995 - the assert below was broken in c2c6c71 and remained so for many years.
+                            // Nowadays we do not flush tables with sstables if autosnapshot=false. This means
+                            // the low_mark assertion does not hold in that case, because we probably never got
+                            // around to creating the sstables that would create them.
+                            assert(!should_flush || low_mark <= rp || rp == db::replay_position());
                            rp = std::max(low_mark, rp);
                            return truncate_views(cf, truncated_at, should_flush).then([&cf, truncated_at, rp] {
                                // save_truncation_record() may actually fail after we cached the truncation time
@@ -1957,31 +1961,6 @@ future<> database::clear_snapshot(sstring tag, std::vector<sstring> keyspace_nam
    });
 }

-future<utils::UUID> update_schema_version(distributed<service::storage_proxy>& proxy, schema_features features)
-{
-    return db::schema_tables::calculate_schema_digest(proxy, features).then([&proxy] (utils::UUID uuid) {
-        return proxy.local().get_db().invoke_on_all([uuid] (database& db) {
-            db.update_version(uuid);
-        }).then([uuid] {
-            return db::system_keyspace::update_schema_version(uuid);
-        }).then([uuid] {
-            dblog.info("Schema version changed to {}", uuid);
-            return uuid;
-        });
-    });
-}
-
-future<> announce_schema_version(utils::UUID schema_version) {
-    return service::migration_manager::passive_announce(schema_version);
-}
-
-future<> update_schema_version_and_announce(distributed<service::storage_proxy>& proxy, schema_features features)
-{
-    return update_schema_version(proxy, features).then([] (utils::UUID uuid) {
-        return announce_schema_version(uuid);
-    });
-}
-
 std::ostream& operator<<(std::ostream& os, const user_types_metadata& m) {
    os << "org.apache.cassandra.config.UTMetaData@" << &m;
    return os;
--- a/database.hh
+++ b/database.hh
@@ -55,6 +55,7 @@
 #include <limits>
 #include <cstddef>
 #include "schema_fwd.hh"
+#include "db/view/view.hh"
 #include "db/schema_features.hh"
 #include "gms/feature.hh"
 #include "timestamp.hh"
@@ -95,6 +96,7 @@
 #include "user_types_metadata.hh"
 #include "query_class_config.hh"
 #include "absl-flat_hash_map.hh"
+#include "utils/updateable_value.hh"

 class cell_locker;
 class cell_locker_stats;
@@ -901,7 +903,7 @@ public:
    lw_shared_ptr<const sstable_list> get_sstables_including_compacted_undeleted() const;
    const std::vector<sstables::shared_sstable>& compacted_undeleted_sstables() const;
    std::vector<sstables::shared_sstable> select_sstables(const dht::partition_range& range) const;
-    std::vector<sstables::shared_sstable> candidates_for_compaction() const;
+    std::vector<sstables::shared_sstable> non_staging_sstables() const;
    std::vector<sstables::shared_sstable> sstables_need_rewrite() const;
    size_t sstables_count() const;
    std::vector<uint64_t> sstable_count_per_level() const;
@@ -1008,8 +1010,9 @@ public:
        return *_config.sstables_manager;
    }

+    // Reader's schema must be the same as the base schema of each of the views.
    future<> populate_views(
-            std::vector<view_ptr>,
+            std::vector<db::view::view_and_base>,
            dht::token base_token,
            flat_mutation_reader&&,
            gc_clock::time_point);
@@ -1027,7 +1030,7 @@ private:
            tracing::trace_state_ptr tr_state, reader_concurrency_semaphore& sem, const io_priority_class& io_priority, query::partition_slice::option_set custom_opts) const;
    std::vector<view_ptr> affected_views(const schema_ptr& base, const mutation& update, gc_clock::time_point now) const;
    future<> generate_and_propagate_view_updates(const schema_ptr& base,
-            std::vector<view_ptr>&& views,
+            std::vector<db::view::view_and_base>&& views,
            mutation&& m,
            flat_mutation_reader_opt existings,
            tracing::trace_state_ptr tr_state,
@@ -1099,6 +1102,10 @@ flat_mutation_reader make_local_shard_sstable_reader(schema_ptr s,
        mutation_reader::forwarding fwd_mr,
        sstables::read_monitor_generator& monitor_generator = sstables::default_read_monitor_generator());

+/// Read a range from the passed-in sstables.
+///
+/// The reader is unrestricted, but will account its resource usage on the
+/// semaphore belonging to the passed-in permit.
 flat_mutation_reader make_range_sstable_reader(schema_ptr s,
        reader_permit permit,
        lw_shared_ptr<sstables::sstable_set> sstables,
@@ -1110,6 +1117,21 @@ flat_mutation_reader make_range_sstable_reader(schema_ptr s,
        mutation_reader::forwarding fwd_mr,
        sstables::read_monitor_generator& monitor_generator = sstables::default_read_monitor_generator());

+/// Read a range from the passed-in sstables.
+///
+/// The reader is restricted, that is it will wait for admission on the semaphore
+/// belonging to the passed-in permit, before starting to read.
+flat_mutation_reader make_restricted_range_sstable_reader(schema_ptr s,
+        reader_permit permit,
+        lw_shared_ptr<sstables::sstable_set> sstables,
+        const dht::partition_range& pr,
+        const query::partition_slice& slice,
+        const io_priority_class& pc,
+        tracing::trace_state_ptr trace_state,
+        streamed_mutation::forwarding fwd,
+        mutation_reader::forwarding fwd_mr,
+        sstables::read_monitor_generator& monitor_generator = sstables::default_read_monitor_generator());
+
 class user_types_metadata;

 class keyspace_metadata final {
@@ -1376,7 +1398,7 @@ private:
        flat_hash_map<std::pair<sstring, sstring>, utils::UUID, utils::tuple_hash, string_pair_eq>;
    ks_cf_to_uuid_t _ks_cf_to_uuid;
    std::unique_ptr<db::commitlog> _commitlog;
-    utils::UUID _version;
+    utils::updateable_value_source<utils::UUID> _version;
    uint32_t _schema_change_count = 0;
    // compaction_manager object is referenced by all column families of a database.
    std::unique_ptr<compaction_manager> _compaction_manager;
@@ -1447,6 +1469,7 @@ public:
    void update_version(const utils::UUID& version);

    const utils::UUID& get_version() const;
+    utils::observable<utils::UUID>& observable_schema_version() const { return _version.as_observable(); }

    db::commitlog* commitlog() const {
        return _commitlog.get();
@@ -1664,10 +1687,6 @@ future<> stop_database(sharded<database>& db);
 flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db, schema_ptr schema,
        std::function<std::optional<dht::partition_range>()> range_generator);

-future<utils::UUID> update_schema_version(distributed<service::storage_proxy>& proxy, db::schema_features);
-future<> announce_schema_version(utils::UUID schema_version);
-future<> update_schema_version_and_announce(distributed<service::storage_proxy>& proxy, db::schema_features);
-
 bool is_internal_keyspace(const sstring& name);

 #endif /* DATABASE_HH_ */
--- a/db/batchlog_manager.cc
+++ b/db/batchlog_manager.cc
@@ -290,7 +290,7 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
            mutation m(schema, key);
            auto now = service::client_state(service::client_state::internal_tag()).get_timestamp();
            m.partition().apply_delete(*schema, clustering_key_prefix::make_empty(), tombstone(now, gc_clock::now()));
-            return _qp.proxy().mutate_locally(m, tracing::trace_state_ptr());
+            return _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
        });
    };

--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -521,7 +521,7 @@ public:
            _segment_manager->totals.total_size_on_disk -= size_on_disk();
            _segment_manager->totals.total_size -= (size_on_disk() + _buffer.size_bytes());
            _segment_manager->add_file_to_delete(_file_name, _desc);
-        } else {
+        } else if (_segment_manager->cfg.warn_about_segments_left_on_disk_after_shutdown) {
            clogger.warn("Segment {} is dirty and is left on disk.", *this);
        }
    }
--- a/db/commitlog/commitlog.hh
+++ b/db/commitlog/commitlog.hh
@@ -137,6 +137,7 @@ public:

        bool reuse_segments = true;
        bool use_o_dsync = false;
+        bool warn_about_segments_left_on_disk_after_shutdown = true;

        const db::extensions * extensions = nullptr;
    };
--- a/db/commitlog/commitlog_replayer.cc
+++ b/db/commitlog/commitlog_replayer.cc
@@ -304,7 +304,7 @@ future<> db::commitlog_replayer::impl::process(stats* s, commitlog::buffer_and_r
                mutation m(cf.schema(), fm.decorated_key(*cf.schema()));
                converting_mutation_partition_applier v(cm, *cf.schema(), m.partition());
                fm.partition().accept(cm, v);
-                return do_with(std::move(m), [&db, &cf] (mutation m) {
+                return do_with(std::move(m), [&db, &cf] (const mutation& m) {
                    return db.apply_in_memory(m, cf, db::rp_handle(), db::no_timeout);
                });
            } else {
--- a/db/config.cc
+++ b/db/config.cc
@@ -662,6 +662,15 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "\tpriority_string : GnuTLS priority string controlling TLS algorithms used/allowed.\n"
        "\trequire_client_auth : (Default: false ) Enables or disables certificate authentication.\n"
        "Related information: Client-to-node encryption")
+    , alternator_encryption_options(this, "alternator_encryption_options", value_status::Used, {/*none*/},
+        "When Alternator via HTTPS is enabled with alternator_https_port, where to take the key and certificate. The available options are:\n"
+        "\n"
+        "\tcertificate: (Default: conf/scylla.crt) The location of a PEM-encoded x509 certificate used to identify and encrypt the client/server communication.\n"
+        "\tkeyfile: (Default: conf/scylla.key) PEM Key file associated with certificate.\n"
+        "\n"
+        "The advanced settings are:\n"
+        "\n"
+        "\tpriority_string : GnuTLS priority string controlling TLS algorithms used/allowed.")
    , ssl_storage_port(this, "ssl_storage_port", value_status::Used, 7001,
        "The SSL port for encrypted communication. Unused unless enabled in encryption_options.")
    , enable_in_memory_data_store(this, "enable_in_memory_data_store", value_status::Used, false, "Enable in memory mode (system tables are always persisted)")
@@ -681,7 +690,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , replace_address(this, "replace_address", value_status::Used, "", "The listen_address or broadcast_address of the dead node to replace. Same as -Dcassandra.replace_address.")
    , replace_address_first_boot(this, "replace_address_first_boot", value_status::Used, "", "Like replace_address option, but if the node has been bootstrapped successfully it will be ignored. Same as -Dcassandra.replace_address_first_boot.")
    , override_decommission(this, "override_decommission", value_status::Used, false, "Set true to force a decommissioned node to join the cluster")
-    , enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, true, "Set true to use enable repair based node operations instead of streaming based")
+    , enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, false, "Set true to use enable repair based node operations instead of streaming based")
    , ring_delay_ms(this, "ring_delay_ms", value_status::Used, 30 * 1000, "Time a node waits to hear from other nodes before joining the ring in milliseconds. Same as -Dcassandra.ring_delay_ms in cassandra.")
    , shadow_round_ms(this, "shadow_round_ms", value_status::Used, 300 * 1000, "The maximum gossip shadow round time. Can be used to reduce the gossip feature check time during node boot up.")
    , fd_max_interval_ms(this, "fd_max_interval_ms", value_status::Used, 2 * 1000, "The maximum failure_detector interval time in milliseconds. Interval larger than the maximum will be ignored. Larger cluster may need to increase the default.")
--- a/db/config.hh
+++ b/db/config.hh
@@ -252,6 +252,7 @@ public:
    named_value<uint32_t> permissions_cache_max_entries;
    named_value<string_map> server_encryption_options;
    named_value<string_map> client_encryption_options;
+    named_value<string_map> alternator_encryption_options;
    named_value<uint32_t> ssl_storage_port;
    named_value<bool> enable_in_memory_data_store;
    named_value<bool> enable_cache;
--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -224,7 +224,9 @@ future<> manager::end_point_hints_manager::stop(drain should_drain) noexcept {
        with_lock(file_update_mutex(), [this] {
            if (_hints_store_anchor) {
                hints_store_ptr tmp = std::exchange(_hints_store_anchor, nullptr);
-                return tmp->shutdown().finally([tmp] {});
+                return tmp->shutdown().finally([tmp] {
+                    return tmp->release();
+                }).finally([tmp] {});
            }
            return make_ready_future<>();
        }).handle_exception([&eptr] (auto e) { eptr = std::move(e); }).get();
@@ -290,7 +292,7 @@ inline bool manager::have_ep_manager(ep_key_type ep) const noexcept {
 }

 bool manager::store_hint(ep_key_type ep, schema_ptr s, lw_shared_ptr<const frozen_mutation> fm, tracing::trace_state_ptr tr_state) noexcept {
-    if (stopping() || !started() || !can_hint_for(ep)) {
+    if (stopping() || draining_all() || !started() || !can_hint_for(ep)) {
        manager_logger.trace("Can't store a hint to {}", ep);
        ++_stats.dropped;
        return false;
@@ -326,6 +328,10 @@ future<db::commitlog> manager::end_point_hints_manager::add_store() noexcept {
            // HH doesn't utilize the flow that benefits from reusing segments.
            // Therefore let's simply disable it to avoid any possible confusion.
            cfg.reuse_segments = false;
+            // HH leaves segments on disk after commitlog shutdown, and later reads
+            // them when commitlog is re-created. This is expected to happen regularly
+            // during standard HH workload, so no need to print a warning about it.
+            cfg.warn_about_segments_left_on_disk_after_shutdown = false;

            return commitlog::create_commitlog(std::move(cfg)).then([this] (commitlog l) {
                // add_store() is triggered every time hint files are forcefully flushed to I/O (every hints_flush_period).
@@ -352,7 +358,9 @@ future<> manager::end_point_hints_manager::flush_current_hints() noexcept {
        return futurize_invoke([this] {
            return with_lock(file_update_mutex(), [this]() -> future<> {
                return get_or_load().then([] (hints_store_ptr cptr) {
-                    return cptr->shutdown();
+                    return cptr->shutdown().finally([cptr] {
+                        return cptr->release();
+                    }).finally([cptr] {});
                }).then([this] {
                    // Un-hold the commitlog object. Since we are under the exclusive _file_update_mutex lock there are no
                    // other hints_store_ptr copies and this would destroy the commitlog shared value.
@@ -529,7 +537,7 @@ bool manager::check_dc_for(ep_key_type ep) const noexcept {
 }

 void manager::drain_for(gms::inet_address endpoint) {
-    if (stopping()) {
+    if (stopping() || draining_all()) {
        return;
    }

@@ -540,6 +548,7 @@ void manager::drain_for(gms::inet_address endpoint) {
        return with_semaphore(drain_lock(), 1, [this, endpoint] {
            return futurize_invoke([this, endpoint] () {
                if (utils::fb_utilities::is_me(endpoint)) {
+                    set_draining_all();
                    return parallel_for_each(_ep_managers, [] (auto& pair) {
                        return pair.second.stop(drain::yes).finally([&pair] {
                            return with_file_update_mutex(pair.second, [&pair] {
--- a/db/hints/manager.hh
+++ b/db/hints/manager.hh
@@ -424,12 +424,14 @@ public:
    enum class state {
        started,                // hinting is currently allowed (start() call is complete)
        replay_allowed,         // replaying (hints sending) is allowed
+        draining_all,           // hinting is not allowed - all ep managers are being stopped because this node is leaving the cluster
        stopping                // hinting is not allowed - stopping is in progress (stop() method has been called)
    };

    using state_set = enum_set<super_enum<state,
        state::started,
        state::replay_allowed,
+        state::draining_all,
        state::stopping>>;

 private:
@@ -690,6 +692,14 @@ private:
        return _state.contains(state::replay_allowed);
    }

+    void set_draining_all() noexcept {
+        _state.set(state::draining_all);
+    }
+
+    bool draining_all() noexcept {
+        return _state.contains(state::draining_all);
+    }
+
 public:
    ep_managers_map_type::iterator find_ep_manager(ep_key_type ep_key) noexcept {
        return _ep_managers.find(ep_key);
--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -801,6 +801,19 @@ future<> merge_unlock() {
    return smp::submit_to(0, [] { the_merge_lock.signal(); });
 }

+static
+future<> update_schema_version_and_announce(distributed<service::storage_proxy>& proxy, schema_features features) {
+    return calculate_schema_digest(proxy, features).then([&proxy] (utils::UUID uuid) {
+        return db::system_keyspace::update_schema_version(uuid).then([&proxy, uuid] {
+            return proxy.local().get_db().invoke_on_all([uuid] (database& db) {
+                db.update_version(uuid);
+            });
+        }).then([uuid] {
+            slogger.info("Schema version changed to {}", uuid);
+        });
+    });
+}
+
 /**
 * Merge remote schema in form of mutations with local and mutate ks/cf metadata objects
 * (which also involves fs operations on add/drop ks/cf)
@@ -821,6 +834,14 @@ future<> merge_schema(distributed<service::storage_proxy>& proxy, gms::feature_s
    });
 }

+future<> recalculate_schema_version(distributed<service::storage_proxy>& proxy, gms::feature_service& feat) {
+    return merge_lock().then([&proxy, &feat] {
+        return update_schema_version_and_announce(proxy, feat.cluster_schema_features());
+    }).finally([] {
+        return merge_unlock();
+    });
+}
+
 future<> merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations, bool do_flush)
 {
    return merge_lock().then([&proxy, mutations = std::move(mutations), do_flush] () mutable {
@@ -2904,10 +2925,6 @@ future<> maybe_update_legacy_secondary_index_mv_schema(service::migration_manage
    // format, where "token" is not marked as computed. Once we're sure that all indexes have their
    // columns marked as computed (because they were either created on a node that supports computed
    // columns or were fixed by this utility function), it's safe to remove this function altogether.
-    if (!db.features().cluster_supports_computed_columns()) {
-        return make_ready_future<>();
-    }
-
    if (v->clustering_key_size() == 0) {
        return make_ready_future<>();
    }
--- a/db/schema_tables.hh
+++ b/db/schema_tables.hh
@@ -170,6 +170,13 @@ future<> merge_schema(distributed<service::storage_proxy>& proxy, gms::feature_s

 future<> merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations, bool do_flush);

+// Recalculates the local schema version.
+//
+// It is safe to call concurrently with recalculate_schema_version() and merge_schema() in which case it
+// is guaranteed that the schema version we end up with after all calls will reflect the most recent state
+// of feature_service and schema tables.
+future<> recalculate_schema_version(distributed<service::storage_proxy>& proxy, gms::feature_service& feat);
+
 future<std::set<sstring>> merge_keyspaces(distributed<service::storage_proxy>& proxy, schema_result&& before, schema_result&& after);

 std::vector<mutation> make_create_keyspace_mutations(lw_shared_ptr<keyspace_metadata> keyspace, api::timestamp_type timestamp, bool with_tables_and_types_and_functions = true);
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -58,6 +58,7 @@
 #include "cql3/util.hh"
 #include "db/view/view.hh"
 #include "db/view/view_builder.hh"
+#include "db/view/view_updating_consumer.hh"
 #include "db/system_keyspace_view_types.hh"
 #include "db/system_keyspace.hh"
 #include "frozen_mutation.hh"
@@ -136,17 +137,90 @@ const column_definition* view_info::view_column(const column_definition& base_de
    return _schema.get_column_definition(base_def.name());
 }

-const std::vector<column_id>& view_info::base_non_pk_columns_in_view_pk() const {
+void view_info::set_base_info(db::view::base_info_ptr base_info) {
+    _base_info = std::move(base_info);
+}
+
+// A constructor for a base info that can facilitate reads and writes from the materialized view.
+db::view::base_dependent_view_info::base_dependent_view_info(schema_ptr base_schema, std::vector<column_id>&& base_non_pk_columns_in_view_pk)
+        : _base_schema{std::move(base_schema)}
+        , _base_non_pk_columns_in_view_pk{std::move(base_non_pk_columns_in_view_pk)}
+        , has_base_non_pk_columns_in_view_pk{!_base_non_pk_columns_in_view_pk.empty()}
+        , use_only_for_reads{false} {
+
+}
+
+// A constructor for a base info that can facilitate only reads from the materialized view.
+db::view::base_dependent_view_info::base_dependent_view_info(bool has_base_non_pk_columns_in_view_pk)
+        : _base_schema{nullptr}
+        , has_base_non_pk_columns_in_view_pk{has_base_non_pk_columns_in_view_pk}
+        , use_only_for_reads{true} {
+}
+
+const std::vector<column_id>& db::view::base_dependent_view_info::base_non_pk_columns_in_view_pk() const {
+    if (use_only_for_reads) {
+        on_internal_error(vlogger, "base_non_pk_columns_in_view_pk(): operation unsupported when initialized only for view reads.");
+    }
    return _base_non_pk_columns_in_view_pk;
 }

-void view_info::initialize_base_dependent_fields(const schema& base) {
+const schema_ptr& db::view::base_dependent_view_info::base_schema() const {
+    if (use_only_for_reads) {
+        on_internal_error(vlogger, "base_schema(): operation unsupported when initialized only for view reads.");
+    }
+    return _base_schema;
+}
+
+db::view::base_info_ptr view_info::make_base_dependent_view_info(const schema& base) const {
+    std::vector<column_id> base_non_pk_columns_in_view_pk;
+    bool has_base_non_pk_columns_in_view_pk = false;
+    bool can_only_read_from_view = false;
+
    for (auto&& view_col : boost::range::join(_schema.partition_key_columns(), _schema.clustering_key_columns())) {
+        if (view_col.is_computed()) {
+            // we are not going to find it in the base table...
+            continue;
+        }
        auto* base_col = base.get_column_definition(view_col.name());
        if (base_col && !base_col->is_primary_key()) {
-            _base_non_pk_columns_in_view_pk.push_back(base_col->id);
+            base_non_pk_columns_in_view_pk.push_back(base_col->id);
+            has_base_non_pk_columns_in_view_pk = true;
+        } else if (!base_col) {
+            // If we didn't find the column in the base table then it must have been deleted
+            // or not yet added (by alter command), this means it is for sure not a pk column
+            // in the base table. This can happen if the version of the base schema is not the
+            // one that the view was created with. Setting this schema as the base can't harm since
+            // if we got to such a situation then it means it is only going to be used for reading
+            // (computation of shadowable tombstones) and in that case the existence of such a column
+            // is the only thing that is of interest to us.
+            has_base_non_pk_columns_in_view_pk = true;
+            can_only_read_from_view = true;
+
+            // We can break the loop here since we have the info we wanted and the list
+            // of columns is not going to be reliable anyhow.
+            break;
        }
    }
+
+    if (can_only_read_from_view) {
+        return make_lw_shared<db::view::base_dependent_view_info>(has_base_non_pk_columns_in_view_pk);
+    } else {
+        return make_lw_shared<db::view::base_dependent_view_info>(base.shared_from_this(), std::move(base_non_pk_columns_in_view_pk));
+    }
+
+}
+
+bool view_info::has_base_non_pk_columns_in_view_pk() const {
+    // The base info is not always available, this is because
+    // the base info initialization is separate from the view
+    // info construction. If we are trying to get this info without
+    // initializing the base information it means that we have a
+    // schema integrity problem as the creator of the owning view schema
+    // didn't make sure to initialize it with base information.
+    if (!_base_info) {
+        on_internal_error(vlogger, "Tried to perform a view query which is base info dependant without initializing it");
+    }
+    return _base_info->has_base_non_pk_columns_in_view_pk;
 }

 namespace db {
@@ -194,12 +268,12 @@ bool may_be_affected_by(const schema& base, const view_info& view, const dht::de
 }

 static bool update_requires_read_before_write(const schema& base,
-        const std::vector<view_ptr>& views,
+        const std::vector<view_and_base>& views,
        const dht::decorated_key& key,
        const rows_entry& update,
        gc_clock::time_point now) {
    for (auto&& v : views) {
-        view_info& vf = *v->view_info();
+        view_info& vf = *v.view->view_info();
        if (may_be_affected_by(base, vf, key, update, now)) {
            return true;
        }
@@ -246,12 +320,14 @@ class view_updates final {
    view_ptr _view;
    const view_info& _view_info;
    schema_ptr _base;
+    base_info_ptr _base_info;
    std::unordered_map<partition_key, mutation_partition, partition_key::hashing, partition_key::equality> _updates;
 public:
-    explicit view_updates(view_ptr view, schema_ptr base)
-            : _view(std::move(view))
+    explicit view_updates(view_and_base vab)
+            : _view(std::move(vab.view))
            , _view_info(*_view->view_info())
-            , _base(std::move(base))
+            , _base(vab.base->base_schema())
+            , _base_info(vab.base)
            , _updates(8, partition_key::hashing(*_view), partition_key::equality(*_view)) {
    }

@@ -313,7 +389,7 @@ row_marker view_updates::compute_row_marker(const clustering_row& base_row) cons
    // they share liveness information. It's true especially in the only case currently allowed by CQL,
    // which assumes there's up to one non-pk column in the view key. It's also true in alternator,
    // which does not carry TTL information.
-    const auto& col_ids = _view_info.base_non_pk_columns_in_view_pk();
+    const auto& col_ids = _base_info->base_non_pk_columns_in_view_pk();
    if (!col_ids.empty()) {
        auto& def = _base->regular_column_at(col_ids[0]);
        // Note: multi-cell columns can't be part of the primary key.
@@ -544,7 +620,7 @@ void view_updates::delete_old_entry(const partition_key& base_key, const cluster

 void view_updates::do_delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now) {
    auto& r = get_view_row(base_key, existing);
-    const auto& col_ids = _view_info.base_non_pk_columns_in_view_pk();
+    const auto& col_ids = _base_info->base_non_pk_columns_in_view_pk();
    if (!col_ids.empty()) {
        // We delete the old row using a shadowable row tombstone, making sure that
        // the tombstone deletes everything in the row (or it might still show up).
@@ -685,7 +761,7 @@ void view_updates::generate_update(
        return;
    }

-    const auto& col_ids = _view_info.base_non_pk_columns_in_view_pk();
+    const auto& col_ids = _base_info->base_non_pk_columns_in_view_pk();
    if (col_ids.empty()) {
        // The view key is necessarily the same pre and post update.
        if (existing && existing->is_live(*_base)) {
@@ -940,12 +1016,17 @@ future<stop_iteration> view_update_builder::on_results() {

 future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
        const schema_ptr& base,
-        std::vector<view_ptr>&& views_to_update,
+        std::vector<view_and_base>&& views_to_update,
        flat_mutation_reader&& updates,
        flat_mutation_reader_opt&& existings,
        gc_clock::time_point now) {
-    auto vs = boost::copy_range<std::vector<view_updates>>(views_to_update | boost::adaptors::transformed([&] (auto&& v) {
-        return view_updates(std::move(v), base);
+    auto vs = boost::copy_range<std::vector<view_updates>>(views_to_update | boost::adaptors::transformed([&] (view_and_base v) {
+        if (base->version() != v.base->base_schema()->version()) {
+            on_internal_error(vlogger, format("Schema version used for view updates ({}) does not match the current"
+                                              " base schema version of the view ({}) for view {}.{} of {}.{}",
+                base->version(), v.base->base_schema()->version(), v.view->ks_name(), v.view->cf_name(), base->ks_name(), base->cf_name()));
+        }
+        return view_updates(std::move(v));
    }));
    auto builder = std::make_unique<view_update_builder>(base, std::move(vs), std::move(updates), std::move(existings), now);
    auto f = builder->build();
@@ -955,7 +1036,7 @@ future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
 query::clustering_row_ranges calculate_affected_clustering_ranges(const schema& base,
        const dht::decorated_key& key,
        const mutation_partition& mp,
-        const std::vector<view_ptr>& views,
+        const std::vector<view_and_base>& views,
        gc_clock::time_point now) {
    std::vector<nonwrapping_range<clustering_key_prefix_view>> row_ranges;
    std::vector<nonwrapping_range<clustering_key_prefix_view>> view_row_ranges;
@@ -963,11 +1044,11 @@ query::clustering_row_ranges calculate_affected_clustering_ranges(const schema&
    if (mp.partition_tombstone() || !mp.row_tombstones().empty()) {
        for (auto&& v : views) {
            // FIXME: #2371
-            if (v->view_info()->select_statement().get_restrictions()->has_unrestricted_clustering_columns()) {
+            if (v.view->view_info()->select_statement().get_restrictions()->has_unrestricted_clustering_columns()) {
                view_row_ranges.push_back(nonwrapping_range<clustering_key_prefix_view>::make_open_ended_both_sides());
                break;
            }
-            for (auto&& r : v->view_info()->partition_slice().default_row_ranges()) {
+            for (auto&& r : v.view->view_info()->partition_slice().default_row_ranges()) {
                view_row_ranges.push_back(r.transform(std::mem_fn(&clustering_key_prefix::view)));
            }
        }
@@ -1210,25 +1291,61 @@ view_builder::view_builder(database& db, db::system_distributed_keyspace& sys_di
 }

 future<> view_builder::start(service::migration_manager& mm) {
-    _started = seastar::async([this, &mm] {
-        // Guard the whole startup routine with a semaphore,
-        // so that it's not intercepted by `on_drop_view`, `on_create_view`
-        // or `on_update_view` events.
-        auto units = get_units(_sem, 1).get0();
-        // Wait for schema agreement even if we're a seed node.
-        while (!mm.have_schema_agreement()) {
-            if (_as.abort_requested()) {
-                return;
+    _started = do_with(view_builder_init_state{}, [this, &mm] (view_builder_init_state& vbi) {
+        return seastar::async([this, &mm, &vbi] {
+            // Guard the whole startup routine with a semaphore,
+            // so that it's not intercepted by `on_drop_view`, `on_create_view`
+            // or `on_update_view` events.
+            auto units = get_units(_sem, 1).get0();
+            // Wait for schema agreement even if we're a seed node.
+            while (!mm.have_schema_agreement()) {
+                seastar::sleep_abortable(500ms, _as).get();
            }
-            seastar::sleep(500ms).get();
-        }
-        auto built = system_keyspace::load_built_views().get0();
-        auto in_progress = system_keyspace::load_view_build_progress().get0();
-        calculate_shard_build_step(std::move(built), std::move(in_progress)).get();
-        _mnotifier.register_listener(this);
-        _current_step = _base_to_build_step.begin();
-        // Waited on indirectly in stop().
-        (void)_build_step.trigger();
+            auto built = system_keyspace::load_built_views().get0();
+            auto in_progress = system_keyspace::load_view_build_progress().get0();
+            setup_shard_build_step(vbi, std::move(built), std::move(in_progress));
+        }).then_wrapped([this] (future<>&& f) {
+            // All shards need to arrive at the same decisions on whether or not to
+            // restart a view build at some common token (reshard), and which token
+            // to restart at. So we need to wait until all shards have read the view
+            // build statuses before they can all proceed to make the (same) decision.
+            // If we don't synchronize here, a fast shard may make a decision, start
+            // building and finish a build step - before the slowest shard even read
+            // the view build information.
+            std::exception_ptr eptr;
+            if (f.failed()) {
+                eptr = f.get_exception();
+            }
+
+            return container().invoke_on(0, [eptr = std::move(eptr)] (view_builder& builder) {
+                // The &builder is alive, because it can only be destroyed in
+                // sharded<view_builder>::stop(), which, in turn, waits for all
+                // view_builder::stop()-s to finish, and each stop() waits for
+                // the shard's current future (called _started) to resolve.
+                if (!eptr) {
+                    if (++builder._shards_finished_read == smp::count) {
+                        builder._shards_finished_read_promise.set_value();
+                    }
+                } else {
+                    if (builder._shards_finished_read < smp::count) {
+                        builder._shards_finished_read = smp::count;
+                        builder._shards_finished_read_promise.set_exception(std::move(eptr));
+                    }
+                }
+                return builder._shards_finished_read_promise.get_shared_future();
+            });
+        }).then([this, &vbi] {
+            return calculate_shard_build_step(vbi);
+        }).then([this] {
+            _mnotifier.register_listener(this);
+            _current_step = _base_to_build_step.begin();
+            // Waited on indirectly in stop().
+            (void)_build_step.trigger();
+            return make_ready_future<>();
+        });
+    }).handle_exception([] (std::exception_ptr eptr) {
+        vlogger.error("start failed: {}", eptr);
+        return make_ready_future<>();
    });
    return make_ready_future<>();
 }
@@ -1236,7 +1353,7 @@ future<> view_builder::start(service::migration_manager& mm) {
 future<> view_builder::stop() {
    vlogger.info("Stopping view builder");
    _as.request_abort();
-    return _started.finally([this] {
+    return _started.then([this] {
        return _mnotifier.unregister_listener(this).then([this] {
            return _sem.wait();
        }).then([this] {
@@ -1370,12 +1487,12 @@ void view_builder::reshard(
    }
 }

-future<> view_builder::calculate_shard_build_step(
+void view_builder::setup_shard_build_step(
+        view_builder_init_state& vbi,
        std::vector<system_keyspace::view_name> built,
        std::vector<system_keyspace::view_build_progress> in_progress) {
    // Shard 0 makes cleanup changes to the system tables, but none that could conflict
    // with the other shards; everyone is thus able to proceed independently.
-    auto bookkeeping_ops = std::make_unique<std::vector<future<>>>();
    auto base_table_exists = [this] (const view_ptr& view) {
        // This is a safety check in case this node missed a create MV statement
        // but got a drop table for the base, and another node didn't get the
@@ -1402,9 +1519,9 @@ future<> view_builder::calculate_shard_build_step(
            // Fall-through
        }
        if (this_shard_id() == 0) {
-            bookkeeping_ops->push_back(_sys_dist_ks.remove_view(name.first, name.second));
-            bookkeeping_ops->push_back(system_keyspace::remove_built_view(name.first, name.second));
-            bookkeeping_ops->push_back(
+            vbi.bookkeeping_ops.push_back(_sys_dist_ks.remove_view(name.first, name.second));
+            vbi.bookkeeping_ops.push_back(system_keyspace::remove_built_view(name.first, name.second));
+            vbi.bookkeeping_ops.push_back(
                    system_keyspace::remove_view_build_progress_across_all_shards(
                            std::move(name.first),
                            std::move(name.second)));
@@ -1412,50 +1529,48 @@ future<> view_builder::calculate_shard_build_step(
        return view_ptr(nullptr);
    };

-    auto built_views = boost::copy_range<std::unordered_set<utils::UUID>>(built
+    vbi.built_views = boost::copy_range<std::unordered_set<utils::UUID>>(built
            | boost::adaptors::transformed(maybe_fetch_view)
            | boost::adaptors::filtered([] (const view_ptr& v) { return bool(v); })
            | boost::adaptors::transformed([] (const view_ptr& v) { return v->id(); }));

-    std::vector<std::vector<view_build_status>> view_build_status_per_shard;
    for (auto& [view_name, first_token, next_token_opt, cpu_id] : in_progress) {
        if (auto view = maybe_fetch_view(view_name)) {
-            if (built_views.find(view->id()) != built_views.end()) {
+            if (vbi.built_views.find(view->id()) != vbi.built_views.end()) {
                if (this_shard_id() == 0) {
                    auto f = _sys_dist_ks.finish_view_build(std::move(view_name.first), std::move(view_name.second)).then([view = std::move(view)] {
                        return system_keyspace::remove_view_build_progress_across_all_shards(view->cf_name(), view->ks_name());
                    });
-                    bookkeeping_ops->push_back(std::move(f));
+                    vbi.bookkeeping_ops.push_back(std::move(f));
                }
                continue;
            }
-            view_build_status_per_shard.resize(std::max(view_build_status_per_shard.size(), size_t(cpu_id + 1)));
-            view_build_status_per_shard[cpu_id].emplace_back(view_build_status{
+            vbi.status_per_shard.resize(std::max(vbi.status_per_shard.size(), size_t(cpu_id + 1)));
+            vbi.status_per_shard[cpu_id].emplace_back(view_build_status{
                    std::move(view),
                    std::move(first_token),
                    std::move(next_token_opt)});
        }
    }
+}

-    // All shards need to arrive at the same decisions on whether or not to
-    // restart a view build at some common token (reshard), and which token
-    // to restart at. So we need to wait until all shards have read the view
-    // build statuses before they can all proceed to make the (same) decision.
-    // If we don't synchronoize here, a fast shard may make a decision, start
-    // building and finish a build step - before the slowest shard even read
-    // the view build information.
-    container().invoke_on(0, [] (view_builder& builder) {
-        if (++builder._shards_finished_read == smp::count) {
-            builder._shards_finished_read_promise.set_value();
+future<> view_builder::calculate_shard_build_step(view_builder_init_state& vbi) {
+    auto base_table_exists = [this] (const view_ptr& view) {
+        // This is a safety check in case this node missed a create MV statement
+        // but got a drop table for the base, and another node didn't get the
+        // drop notification and sent us the view schema.
+        try {
+            _db.find_schema(view->view_info()->base_id());
+            return true;
+        } catch (const no_such_column_family&) {
+            return false;
        }
-        return builder._shards_finished_read_promise.get_shared_future();
-    }).get();
-
+    };
    std::unordered_set<utils::UUID> loaded_views;
-    if (view_build_status_per_shard.size() != smp::count) {
-        reshard(std::move(view_build_status_per_shard), loaded_views);
-    } else if (!view_build_status_per_shard.empty()) {
-        for (auto& status : view_build_status_per_shard[this_shard_id()]) {
+    if (vbi.status_per_shard.size() != smp::count) {
+        reshard(std::move(vbi.status_per_shard), loaded_views);
+    } else if (!vbi.status_per_shard.empty()) {
+        for (auto& status : vbi.status_per_shard[this_shard_id()]) {
            load_view_status(std::move(status), loaded_views);
        }
    }
@@ -1472,18 +1587,18 @@ future<> view_builder::calculate_shard_build_step(
    auto all_views = _db.get_views();
    auto is_new = [&] (const view_ptr& v) {
        return base_table_exists(v) && loaded_views.find(v->id()) == loaded_views.end()
-                && built_views.find(v->id()) == built_views.end();
+                && vbi.built_views.find(v->id()) == vbi.built_views.end();
    };
    for (auto&& view : all_views | boost::adaptors::filtered(is_new)) {
-        bookkeeping_ops->push_back(add_new_view(view, get_or_create_build_step(view->view_info()->base_id())));
+        vbi.bookkeeping_ops.push_back(add_new_view(view, get_or_create_build_step(view->view_info()->base_id())));
    }

    for (auto& [_, build_step] : _base_to_build_step) {
        initialize_reader_at_current_token(build_step);
    }

-    auto f = seastar::when_all_succeed(bookkeeping_ops->begin(), bookkeeping_ops->end());
-    return f.handle_exception([this, bookkeeping_ops = std::move(bookkeeping_ops)] (std::exception_ptr ep) {
+    auto f = seastar::when_all_succeed(vbi.bookkeeping_ops.begin(), vbi.bookkeeping_ops.end());
+    return f.handle_exception([this] (std::exception_ptr ep) {
        log_level severity = _as.abort_requested() ? log_level::warn : log_level::error;
        vlogger.log(severity, "Failed to update materialized view bookkeeping ({}), continuing anyway.", ep);
    });
@@ -1732,7 +1847,7 @@ public:
            return stop_iteration::yes;
        }

-        _fragments_memory_usage += cr.memory_usage(*_step.base->schema());
+        _fragments_memory_usage += cr.memory_usage(*_step.reader.schema());
        _fragments.push_back(std::move(cr));
        if (_fragments_memory_usage > batch_memory_max) {
            // Although we have not yet completed the batch of base rows that
@@ -1754,10 +1869,14 @@ public:
        _builder._as.check();
        if (!_fragments.empty()) {
            _fragments.push_front(partition_start(_step.current_key, tombstone()));
+            auto base_schema = _step.base->schema();
+            auto views = with_base_info_snapshot(_views_to_build);
+            auto reader = make_flat_mutation_reader_from_fragments(_step.reader.schema(), std::move(_fragments));
+            reader.upgrade_schema(base_schema);
            _step.base->populate_views(
-                    _views_to_build,
+                    std::move(views),
                    _step.current_token(),
-                    make_flat_mutation_reader_from_fragments(_step.base->schema(), std::move(_fragments)),
+                    std::move(reader),
                    _now).get();
            _fragments.clear();
            _fragments_memory_usage = 0;
@@ -1909,5 +2028,54 @@ future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_d
    });
 }

+const size_t view_updating_consumer::buffer_size_soft_limit{1 * 1024 * 1024};
+const size_t view_updating_consumer::buffer_size_hard_limit{2 * 1024 * 1024};
+
+void view_updating_consumer::do_flush_buffer() {
+    _staging_reader_handle.pause();
+
+    if (_buffer.front().partition().empty()) { // NOTE(review): front() assumes a non-empty buffer - confirm every caller guarantees it
+        // A mid-partition flush leaves an empty mutation in the buffer; it is still
+        // empty if that flush happened right before the end-of-partition fragment.
+        _buffer.pop_front();
+    }
+
+    while (!_buffer.empty()) {
+        try {
+            auto lock_holder = _view_update_pusher(std::move(_buffer.front())).get();
+        } catch (...) { // a failed push is only logged; the mutation is still popped below
+            vlogger.warn("Failed to push replica updates for table {}.{}: {}", _schema->ks_name(), _schema->cf_name(), std::current_exception());
+        }
+        _buffer.pop_front();
+    }
+
+    _buffer_size = 0;
+    _m = nullptr; // the buffer is empty now, so there is no current mutation to point at
+}
+
+void view_updating_consumer::maybe_flush_buffer_mid_partition() {
+    if (_buffer_size >= buffer_size_hard_limit) {
+        auto m = mutation(_schema, _m->decorated_key(), mutation_partition(_schema)); // empty mutation for the current key, built before the flush resets _m
+        do_flush_buffer();
+        _buffer.emplace_back(std::move(m)); // the rest of the partition accumulates in this fresh entry
+        _m = &_buffer.back();
+    }
+}
+
+view_updating_consumer::view_updating_consumer(schema_ptr schema, table& table, std::vector<sstables::shared_sstable> excluded_sstables, const seastar::abort_source& as,
+        evictable_reader_handle& staging_reader_handle)
+    : view_updating_consumer(std::move(schema), as, staging_reader_handle, // delegates to the custom-pusher constructor
+            [table = table.shared_from_this(), excluded_sstables = std::move(excluded_sstables)] (mutation m) mutable {
+        auto s = m.schema(); // read the schema before m is moved into the call below
+        return table->stream_view_replica_updates(std::move(s), std::move(m), db::no_timeout, excluded_sstables);
+    })
+{ }
+
+std::vector<db::view::view_and_base> with_base_info_snapshot(std::vector<view_ptr> vs) {
+    return boost::copy_range<std::vector<db::view::view_and_base>>(vs | boost::adaptors::transformed([] (const view_ptr& v) {
+        return db::view::view_and_base{v, v->view_info()->base_info()}; // pair each view with the base info its view_info currently holds
+    }));
+}
+
 } // namespace view
 } // namespace db
--- a/db/view/view.hh
+++ b/db/view/view.hh
@@ -43,6 +43,46 @@ namespace db {

 namespace view {

+// Part of the view description which depends on the base schema version.
+//
+// This structure may change even though the view schema doesn't change, so
+// it needs to live outside view_ptr.
+struct base_dependent_view_info {
+private:
+    schema_ptr _base_schema;
+    // Ids of the regular base table columns included in the view's PK, if any.
+    // Scylla views only allow one such column, alternator can have up to two.
+    std::vector<column_id> _base_non_pk_columns_in_view_pk;
+public:
+    const std::vector<column_id>& base_non_pk_columns_in_view_pk() const;
+    const schema_ptr& base_schema() const;
+
+    // Indicates if the view has pk columns which are not part of the base
+    // pk. This may look equivalent to !base_non_pk_columns_in_view_pk.empty(),
+    // but there are cases where this boolean can be computed even though
+    // the column list itself cannot be built reliably.
+    const bool has_base_non_pk_columns_in_view_pk;
+
+    // If base_non_pk_columns_in_view_pk couldn't reliably be built, this base
+    // info can't be used for computing view updates, only for reading the materialized
+    // view.
+    const bool use_only_for_reads;
+
+    // A constructor for a base info that can facilitate reads and writes from the materialized view.
+    base_dependent_view_info(schema_ptr base_schema, std::vector<column_id>&& base_non_pk_columns_in_view_pk);
+    // A constructor for a base info that can facilitate only reads from the materialized view.
+    base_dependent_view_info(bool has_base_non_pk_columns_in_view_pk);
+};
+
+// Immutable snapshot of view's base-schema-dependent part.
+using base_info_ptr = lw_shared_ptr<const base_dependent_view_info>;
+
+// Snapshot of the view schema and its base-schema-dependent part.
+struct view_and_base {
+    view_ptr view; // the view schema
+    base_info_ptr base; // the base-schema-dependent part of the view description
+};
+
 /**
 * Whether the view filter considers the specified partition key.
 *
@@ -94,7 +134,7 @@ bool clustering_prefix_matches(const schema& base, const partition_key& key, con

 future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
        const schema_ptr& base,
-        std::vector<view_ptr>&& views_to_update,
+        std::vector<view_and_base>&& views_to_update,
        flat_mutation_reader&& updates,
        flat_mutation_reader_opt&& existings,
        gc_clock::time_point now);
@@ -103,7 +143,7 @@ query::clustering_row_ranges calculate_affected_clustering_ranges(
        const schema& base,
        const dht::decorated_key& key,
        const mutation_partition& mp,
-        const std::vector<view_ptr>& views,
+        const std::vector<view_and_base>& views,
        gc_clock::time_point now);

 struct wait_for_all_updates_tag {};
@@ -133,6 +173,13 @@ future<> mutate_MV(
 */
 void create_virtual_column(schema_builder& builder, const bytes& name, const data_type& type);

+/**
+ * Converts a collection of view schema snapshots into a collection of
+ * view_and_base objects, which are snapshots of both the view schema
+ * and the base-schema-dependent part of view description.
+ */
+std::vector<view_and_base> with_base_info_snapshot(std::vector<view_ptr>);
+
 }

 }
--- a/db/view/view_builder.hh
+++ b/db/view/view_builder.hh
@@ -165,6 +165,12 @@ class view_builder final : public service::migration_listener::only_view_notific
    // Used for testing.
    std::unordered_map<std::pair<sstring, sstring>, seastar::shared_promise<>, utils::tuple_hash> _build_notifiers;

+    struct view_builder_init_state {
+        std::vector<future<>> bookkeeping_ops; // pending bookkeeping updates, awaited together at the end of calculate_shard_build_step()
+        std::vector<std::vector<view_build_status>> status_per_shard; // in-progress view build statuses, indexed by the cpu_id they were recorded for
+        std::unordered_set<utils::UUID> built_views; // ids of the views that were loaded as already built
+    };
+
 public:
    // The view builder processes the base table in steps of batch_size rows.
    // However, if the individual rows are large, there is no real need to
@@ -201,7 +207,8 @@ private:
    void initialize_reader_at_current_token(build_step&);
    void load_view_status(view_build_status, std::unordered_set<utils::UUID>&);
    void reshard(std::vector<std::vector<view_build_status>>, std::unordered_set<utils::UUID>&);
-    future<> calculate_shard_build_step(std::vector<system_keyspace::view_name>, std::vector<system_keyspace::view_build_progress>);
+    void setup_shard_build_step(view_builder_init_state& vbi, std::vector<system_keyspace::view_name>, std::vector<system_keyspace::view_build_progress>);
+    future<> calculate_shard_build_step(view_builder_init_state& vbi);
    future<> add_new_view(view_ptr, build_step&);
    future<> do_build_step();
    void execute(build_step&, exponential_backoff_retry);
--- a/db/view/view_update_generator.cc
+++ b/db/view/view_update_generator.cc
@@ -42,35 +42,52 @@ future<> view_update_generator::start() {
                _pending_sstables.wait().get();
            }

+            // To ensure we don't race with updates, move the entire content
+            // into a local variable.
+            auto sstables_with_tables = std::exchange(_sstables_with_tables, {});
+
            // If we got here, we will process all tables we know about so far eventually so there
            // is no starvation
-            for (auto& t : _sstables_with_tables | boost::adaptors::map_keys) {
+            for (auto table_it = sstables_with_tables.begin(); table_it != sstables_with_tables.end(); table_it = sstables_with_tables.erase(table_it)) {
+                auto& [t, sstables] = *table_it;
                schema_ptr s = t->schema();

-                // Copy what we have so far so we don't miss new updates
-                auto sstables = std::exchange(_sstables_with_tables[t], {});
+                vug_logger.trace("Processing {}.{}: {} sstables", s->ks_name(), s->cf_name(), sstables.size());
+
+                const auto num_sstables = sstables.size();

                try {
-                    // temporary: need an sstable set for the flat mutation reader, but the
-                    // compaction_descriptor takes a vector. Soon this will become a compaction
-                    // so the transformation to the SSTable set will not be needed.
-                    auto ssts = make_lw_shared(t->get_compaction_strategy().make_sstable_set(s));
+                    // Exploit the fact that sstables in the staging directory
+                    // are usually non-overlapping and use a partitioned set for
+                    // the read.
+                    auto ssts = make_lw_shared(sstables::make_partitioned_sstable_set(s, make_lw_shared<sstable_list>(sstable_list{}), false));
                    for (auto& sst : sstables) {
                        ssts->insert(sst);
                    }

-                    flat_mutation_reader staging_sstable_reader = ::make_range_sstable_reader(s,
+                    auto ms = mutation_source([this, ssts] (
+                                schema_ptr s,
+                                reader_permit permit,
+                                const dht::partition_range& pr,
+                                const query::partition_slice& ps,
+                                const io_priority_class& pc,
+                                tracing::trace_state_ptr ts,
+                                streamed_mutation::forwarding fwd_ms,
+                                mutation_reader::forwarding fwd_mr) {
+                        return ::make_restricted_range_sstable_reader(s, std::move(permit), std::move(ssts), pr, ps, pc, std::move(ts), fwd_ms, fwd_mr);
+                    });
+                    auto [staging_sstable_reader, staging_sstable_reader_handle] = make_manually_paused_evictable_reader(
+                            std::move(ms),
+                            s,
                            _db.make_query_class_config().semaphore.make_permit(),
-                            std::move(ssts),
                            query::full_partition_range,
                            s->full_slice(),
                            service::get_local_streaming_priority(),
                            nullptr,
-                            ::streamed_mutation::forwarding::no,
                            ::mutation_reader::forwarding::no);

                    inject_failure("view_update_generator_consume_staging_sstable");
-                    auto result = staging_sstable_reader.consume_in_thread(view_updating_consumer(s, *t, sstables, _as), db::no_timeout);
+                    auto result = staging_sstable_reader.consume_in_thread(view_updating_consumer(s, *t, sstables, _as, staging_sstable_reader_handle), db::no_timeout);
                    if (result == stop_iteration::yes) {
                        break;
                    }
@@ -89,7 +106,7 @@ future<> view_update_generator::start() {
                    // Move from staging will be retried upon restart.
                    vug_logger.warn("Moving {} from staging failed: {}:{}. Ignoring...", s->ks_name(), s->cf_name(), std::current_exception());
                }
-                _registration_sem.signal();
+                _registration_sem.signal(num_sstables);
            }
            // For each table, move the processed staging sstables into the table's base dir.
            for (auto it = _sstables_to_move.begin(); it != _sstables_to_move.end(); ) {
--- a/db/view/view_update_generator.hh
+++ b/db/view/view_update_generator.hh
@@ -32,7 +32,10 @@
 namespace db::view {

 class view_update_generator {
+public:
    static constexpr size_t registration_queue_size = 5;
+
+private:
    database& _db;
    seastar::abort_source _as;
    future<> _started = make_ready_future<>();
@@ -51,6 +54,8 @@ public:
    future<> start();
    future<> stop();
    future<> register_staging_sstable(sstables::shared_sstable sst, lw_shared_ptr<table> table);
+
+    ssize_t available_register_units() const { return _registration_sem.available_units(); }
 private:
    bool should_throttle() const;
 };
--- a/db/view/view_updating_consumer.hh
+++ b/db/view/view_updating_consumer.hh
@@ -27,6 +27,8 @@
 #include "sstables/shared_sstable.hh"
 #include "database.hh"

+class evictable_reader_handle;
+
 namespace db::view {

 /*
@@ -34,22 +36,46 @@ namespace db::view {
 * It is expected to be run in seastar::async threaded context through consume_in_thread()
 */
 class view_updating_consumer {
-    schema_ptr _schema;
-    lw_shared_ptr<table> _table;
-    std::vector<sstables::shared_sstable> _excluded_sstables;
-    const seastar::abort_source* _as;
-    std::optional<mutation> _m;
 public:
-    view_updating_consumer(schema_ptr schema, table& table, std::vector<sstables::shared_sstable> excluded_sstables, const seastar::abort_source& as)
+    // We prefer flushing on partition boundaries, so at the end of a partition,
+    // we flush on reaching the soft limit. Otherwise we continue accumulating
+    // data. We flush mid-partition if we reach the hard limit.
+    static const size_t buffer_size_soft_limit;
+    static const size_t buffer_size_hard_limit;
+
+private:
+    schema_ptr _schema;
+    const seastar::abort_source* _as;
+    evictable_reader_handle& _staging_reader_handle;
+    circular_buffer<mutation> _buffer;
+    mutation* _m{nullptr};
+    size_t _buffer_size{0};
+    noncopyable_function<future<row_locker::lock_holder>(mutation)> _view_update_pusher;
+
+private:
+    void do_flush_buffer();
+    void maybe_flush_buffer_mid_partition();
+
+public:
+    // Push updates with a custom pusher. Mainly for tests.
+    view_updating_consumer(schema_ptr schema, const seastar::abort_source& as, evictable_reader_handle& staging_reader_handle,
+            noncopyable_function<future<row_locker::lock_holder>(mutation)> view_update_pusher)
            : _schema(std::move(schema))
-            , _table(table.shared_from_this())
-            , _excluded_sstables(std::move(excluded_sstables))
            , _as(&as)
-            , _m()
+            , _staging_reader_handle(staging_reader_handle)
+            , _view_update_pusher(std::move(view_update_pusher))
    { }

+    view_updating_consumer(schema_ptr schema, table& table, std::vector<sstables::shared_sstable> excluded_sstables, const seastar::abort_source& as,
+            evictable_reader_handle& staging_reader_handle);
+
+    view_updating_consumer(view_updating_consumer&&) = default;
+
+    view_updating_consumer& operator=(view_updating_consumer&&) = delete;
+
    void consume_new_partition(const dht::decorated_key& dk) {
-        _m = mutation(_schema, dk, mutation_partition(_schema));
+        _buffer.emplace_back(_schema, dk, mutation_partition(_schema));
+        _m = &_buffer.back();
    }

    void consume(tombstone t) {
@@ -60,7 +86,9 @@ public:
        if (_as->abort_requested()) {
            return stop_iteration::yes;
        }
+        _buffer_size += sr.memory_usage(*_schema);
        _m->partition().apply(*_schema, std::move(sr));
+        maybe_flush_buffer_mid_partition();
        return stop_iteration::no;
    }

@@ -68,7 +96,9 @@ public:
        if (_as->abort_requested()) {
            return stop_iteration::yes;
        }
+        _buffer_size += cr.memory_usage(*_schema);
        _m->partition().apply(*_schema, std::move(cr));
+        maybe_flush_buffer_mid_partition();
        return stop_iteration::no;
    }

@@ -76,14 +106,27 @@ public:
        if (_as->abort_requested()) {
            return stop_iteration::yes;
        }
+        _buffer_size += rt.memory_usage(*_schema);
        _m->partition().apply(*_schema, std::move(rt));
+        maybe_flush_buffer_mid_partition();
        return stop_iteration::no;
    }

    // Expected to be run in seastar::async threaded context (consume_in_thread())
-    stop_iteration consume_end_of_partition();
+    stop_iteration consume_end_of_partition() {
+        if (_as->abort_requested()) {
+            return stop_iteration::yes;
+        }
+        if (_buffer_size >= buffer_size_soft_limit) {
+            do_flush_buffer();
+        }
+        return stop_iteration::no;
+    }

    stop_iteration consume_end_of_stream() {
+        if (!_buffer.empty()) {
+            do_flush_buffer();
+        }
        return stop_iteration(_as->abort_requested());
    }
 };
--- a/dht/boot_strapper.cc
+++ b/dht/boot_strapper.cc
@@ -59,7 +59,12 @@ future<> boot_strapper::bootstrap(streaming::stream_reason reason) {
        return make_exception_future<>(std::runtime_error("Wrong stream_reason provided: it can only be replace or bootstrap"));
    }
    auto streamer = make_lw_shared<range_streamer>(_db, _token_metadata, _abort_source, _tokens, _address, description, reason);
-    streamer->add_source_filter(std::make_unique<range_streamer::failure_detector_source_filter>(gms::get_local_gossiper().get_unreachable_members()));
+    auto nodes_to_filter = gms::get_local_gossiper().get_unreachable_members();
+    if (reason == streaming::stream_reason::replace && _db.local().get_replace_address()) {
+        nodes_to_filter.insert(_db.local().get_replace_address().value());
+    }
+    blogger.debug("nodes_to_filter={}", nodes_to_filter);
+    streamer->add_source_filter(std::make_unique<range_streamer::failure_detector_source_filter>(nodes_to_filter));
    auto keyspaces = make_lw_shared<std::vector<sstring>>(_db.local().get_non_system_keyspaces());
    return do_for_each(*keyspaces, [this, keyspaces, streamer] (sstring& keyspace_name) {
        auto& ks = _db.local().find_keyspace(keyspace_name);
--- a/digest_algorithm.hh
+++ b/digest_algorithm.hh
@@ -28,7 +28,8 @@ namespace query {
 enum class digest_algorithm : uint8_t {
    none = 0,  // digest not required
    MD5 = 1,
-    xxHash = 2,// default algorithm
+    legacy_xxHash_without_null_digest = 2,
+    xxHash = 3, // default algorithm
 };

 }
--- a/digester.hh
+++ b/digester.hh
@@ -36,7 +36,7 @@ struct noop_hasher {
 };

 class digester final {
-    std::variant<noop_hasher, md5_hasher, xx_hasher> _impl;
+    std::variant<noop_hasher, md5_hasher, xx_hasher, legacy_xx_hasher_without_null_digest> _impl;

 public:
    explicit digester(digest_algorithm algo) {
@@ -47,6 +47,9 @@ public:
        case digest_algorithm::xxHash:
            _impl = xx_hasher();
            break;
+        case digest_algorithm::legacy_xxHash_without_null_digest:
+            _impl = legacy_xx_hasher_without_null_digest();
+            break;
        case digest_algorithm ::none:
            _impl = noop_hasher();
            break;
--- a/dist/common/scripts/scylla-housekeeping
+++ b/dist/common/scripts/scylla-housekeeping
@@ -61,7 +61,15 @@ def sh_command(*args):
    return out

 def get_url(path):
-    return urllib.request.urlopen(path).read().decode('utf-8')
+    # If the server returns any error, like 403 or 500, urllib.request throws an exception, which is not serializable.
+    # When the multiprocessing routines fail to serialize it, an ambiguous serialization exception is thrown
+    #   from get_json_from_url.
+    # In order to see the legitimate error, we catch it inside the process, convert it to a string and
+    #   pass it as part of the return value.
+    try:
+        return 0, urllib.request.urlopen(path).read().decode('utf-8')
+    except Exception as exc:
+        return 1, str(exc)

 def get_json_from_url(path):
    pool = mp.Pool(processes=1)
@@ -71,13 +79,16 @@ def get_json_from_url(path):
    # to enforce a wallclock timeout.
    result = pool.apply_async(get_url, args=(path,))
    try:
-        retval = result.get(timeout=5)
+        status, retval = result.get(timeout=5)
    except mp.TimeoutError as err:
        pool.terminate()
        pool.join()
        raise
+    if status == 1:
+        raise RuntimeError(f'Failed to get "{path}" due to the following error: {retval}')
    return json.loads(retval)

+
 def get_api(path):
    return get_json_from_url("http://" + api_address + path)

--- a/dist/common/scripts/scylla_setup
+++ b/dist/common/scripts/scylla_setup
@@ -27,6 +27,7 @@ import glob
 import shutil
 import io
 import stat
+import distro
 from scylla_util import *

 interactive = False
@@ -385,6 +386,9 @@ if __name__ == '__main__':
                if not stat.S_ISBLK(os.stat(dsk).st_mode):
                    print('{} is not block device'.format(dsk))
                    continue
+                if dsk in selected:
+                    print(f'{dsk} is already added')
+                    continue
                selected.append(dsk)
                devices.remove(dsk)
            disks = ','.join(selected)
@@ -468,5 +472,10 @@ if __name__ == '__main__':
            print('Please restart your machine before using ScyllaDB, as you have disabled')
            print(' SELinux.')

-        if dist_name() == 'Ubuntu':
-            run('apt-get install -y hugepages')
+        if distro.id() == 'ubuntu':
+            # Ubuntu version is 20.04 or later
+            if int(distro.major_version()) >= 20:
+                hugepkg = 'libhugetlbfs-bin'
+            else:
+                hugepkg = 'hugepages'
+            run(f'apt-get install -y {hugepkg}')
--- a/dist/common/scripts/scylla_swap_setup
+++ b/dist/common/scripts/scylla_swap_setup
@@ -40,6 +40,10 @@ if __name__ == '__main__':
        sys.exit(1)

    memtotal = get_memtotal_gb()
+    if memtotal == 0:
+        print('memory too small: {} KB'.format(get_memtotal()))
+        sys.exit(1)
+
    # Scylla document says 'swap size should be set to either total_mem/3 or
    # 16GB - lower of the two', so we need to compare 16g vs memtotal/3 and
    # choose lower one
--- a/dist/common/scripts/scylla_util.py
+++ b/dist/common/scripts/scylla_util.py
@@ -184,7 +184,7 @@ class aws_instance:
        instance_size = self.instance_size()
        if instance_class in ['c3', 'c4', 'd2', 'i2', 'r3']:
            return 'ixgbevf'
-        if instance_class in ['a1', 'c5', 'c5d', 'f1', 'g3', 'g4', 'h1', 'i3', 'i3en', 'inf1', 'm5', 'm5a', 'm5ad', 'm5d', 'm5dn', 'm5n', 'm6g', 'p2', 'p3', 'r4', 'r5', 'r5a', 'r5ad', 'r5d', 'r5dn', 'r5n', 't3', 't3a', 'u-6tb1', 'u-9tb1', 'u-12tb1', 'u-18tn1', 'u-24tb1', 'x1', 'x1e', 'z1d']:
+        if instance_class in ['a1', 'c5', 'c5a', 'c5d', 'c5n', 'c6g', 'c6gd', 'f1', 'g3', 'g4', 'h1', 'i3', 'i3en', 'inf1', 'm5', 'm5a', 'm5ad', 'm5d', 'm5dn', 'm5n', 'm6g', 'm6gd', 'p2', 'p3', 'r4', 'r5', 'r5a', 'r5ad', 'r5d', 'r5dn', 'r5n', 't3', 't3a', 'u-6tb1', 'u-9tb1', 'u-12tb1', 'u-18tn1', 'u-24tb1', 'x1', 'x1e', 'z1d']:
            return 'ena'
        if instance_class == 'm4':
            if instance_size == '16xlarge':
@@ -331,7 +331,7 @@ class scylla_cpuinfo:

 # When a CLI tool is not installed, use relocatable CLI tool provided by Scylla
 scylla_env = os.environ.copy()
-scylla_env['PATH'] =  '{}:{}'.format(scylla_env['PATH'], scyllabindir())
+scylla_env['PATH'] =  '{}:{}'.format(scyllabindir(), scylla_env['PATH'])

 def run(cmd, shell=False, silent=False, exception=True):
    stdout = subprocess.DEVNULL if silent else None
@@ -446,6 +446,19 @@ def dist_ver():
    return distro.version()


+SYSTEM_PARTITION_UUIDS = [
+        '21686148-6449-6e6f-744e-656564454649', # BIOS boot partition
+        'c12a7328-f81f-11d2-ba4b-00a0c93ec93b', # EFI system partition
+        '024dee41-33e7-11d3-9d69-0008c781f39f'  # MBR partition scheme
+]
+
+def get_partition_uuid(dev):
+    return out(f'lsblk -n -oPARTTYPE {dev}')
+
+def is_system_partition(dev):
+    uuid = get_partition_uuid(dev)
+    return (uuid in SYSTEM_PARTITION_UUIDS)
+
 def is_unused_disk(dev):
    # dev is not in /sys/class/block/, like /dev/nvme[0-9]+
    if not os.path.isdir('/sys/class/block/{dev}'.format(dev=dev.replace('/dev/', ''))):
@@ -453,7 +466,8 @@ def is_unused_disk(dev):
    try:
        fd = os.open(dev, os.O_EXCL)
        os.close(fd)
-        return True
+        # dev is not reserved for system
+        return not is_system_partition(dev)
    except OSError:
        return False

--- a/dist/debian/debian/rules
+++ b/dist/debian/debian/rules
@@ -39,6 +39,7 @@ override_dh_strip:
 	# The binaries (ethtool...patchelf) don't pass dh_strip after going through patchelf. Since they are
 	# already stripped, nothing is lost if we exclude them, so that's what we do.
 	dh_strip -Xlibprotobuf.so.15 -Xld.so -Xethtool -Xgawk -Xgzip -Xhwloc-calc -Xhwloc-distrib -Xifconfig -Xlscpu -Xnetstat -Xpatchelf --dbg-package=$(product)-server-dbg
+	find $(CURDIR)/debian/$(product)-server-dbg/usr/lib/debug/.build-id/ -name "*.debug" -exec objcopy --decompress-debug-sections {} \;

 override_dh_makeshlibs:

--- a/dist/docker/redhat/Dockerfile
+++ b/dist/docker/redhat/Dockerfile
@@ -5,8 +5,8 @@ MAINTAINER Avi Kivity <avi@cloudius-systems.com>
 ENV container docker

 # The SCYLLA_REPO_URL argument specifies the URL to the RPM repository this Docker image uses to install Scylla. The default value is the Scylla's unstable RPM repository, which contains the daily build.
-ARG SCYLLA_REPO_URL=http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo
-ARG VERSION=666.development
+ARG SCYLLA_REPO_URL=http://downloads.scylladb.com/rpm/unstable/centos/scylla-4.2/latest/scylla.repo
+ARG VERSION=4.2

 ADD scylla_bashrc /scylla_bashrc

--- a/docs/alternator/alternator.md
+++ b/docs/alternator/alternator.md
@@ -25,6 +25,15 @@ By default, Scylla listens on this port on all network interfaces.
 To listen only on a specific interface, pass also an "`alternator-address`"
 option.

+In addition to (or instead of) serving HTTP requests on `alternator-port`,
+Scylla can accept DynamoDB API requests over HTTPS (encrypted), on the port
+specified by `alternator-https-port`. As usual for HTTPS servers, the
+operator must specify certificate and key files. By default these should
+be placed in `/etc/scylla/scylla.crt` and `/etc/scylla/scylla.key`, but
+these default locations can be overridden by specifying
+`--alternator-encryption-options keyfile="..."` and
+`--alternator-encryption-options certificate="..."`.
+
 As we explain below in the "Write isolation policies", Alternator has
 four different choices for the implementation of writes, each with
 different advantages. You should consider which of the options makes
--- a/docs/protocols.md
+++ b/docs/protocols.md
@@ -137,10 +137,9 @@ TODO: is there an SSL version of Thrift?

 # DynamoDB client protocol

-Scylla also supports, as an experimental feature, Amazon's DynamoDB API.
-The DynamoDB API is a JSON over HTTP (unencrypted) or HTTPS (encrypted)
-protocol. Because Scylla's support for this protocol is experimental,
-it is not turned on by default, and must be turned on manually by setting
+Scylla also supports Amazon's DynamoDB API. The DynamoDB API is a JSON over
+HTTP (unencrypted) or HTTPS (encrypted) protocol. Support for this protocol
+is not turned on by default, and must be turned on manually by setting
 the `alternator_port` and/or `alternator_https_port` configuration option.
 "Alternator" is the codename of Scylla's DynamoDB API support, and is
 documented in more detail in [alternator.md](alternator/alternator.md).
@@ -153,6 +152,13 @@ There is also an `alternator_address` configuration option to set the IP
 address (and therefore network interface) on which Scylla should listen
 for the DynamoDB protocol. This address defaults to 0.0.0.0.

+When the HTTPS-based protocol is enabled, the server also needs to know
+the certificate and key files to use. The default locations of these files
+are `/etc/scylla/scylla.crt` and `/etc/scylla/scylla.key` respectively, but
+can be overridden by specifying in `alternator_encryption_options` the
+`keyfile` and `certificate` options. For example,
+`--alternator-encryption-options keyfile="..."`.
+
 # Redis client protocol

 Scylla also has partial and experimental support for the Redis API.
--- a/flat_mutation_reader.hh
+++ b/flat_mutation_reader.hh
@@ -468,6 +468,9 @@ public:
    size_t buffer_size() const {
        return _impl->buffer_size();
    }
+    const circular_buffer<mutation_fragment>& buffer() const {
+        return _impl->buffer();
+    }
    // Detach the internal buffer of the reader.
    // Roughly equivalent to depleting it by calling pop_mutation_fragment()
    // until is_buffer_empty() returns true.
--- a/gms/feature.hh
+++ b/gms/feature.hh
@@ -141,6 +141,7 @@ extern const std::string_view HINTED_HANDOFF_SEPARATE_CONNECTION;
 extern const std::string_view LWT;
 extern const std::string_view PER_TABLE_PARTITIONERS;
 extern const std::string_view PER_TABLE_CACHING;
+extern const std::string_view DIGEST_FOR_NULL_VALUES;

 }

--- a/gms/feature_service.cc
+++ b/gms/feature_service.cc
@@ -56,6 +56,7 @@ constexpr std::string_view features::HINTED_HANDOFF_SEPARATE_CONNECTION = "HINTE
 constexpr std::string_view features::LWT = "LWT";
 constexpr std::string_view features::PER_TABLE_PARTITIONERS = "PER_TABLE_PARTITIONERS";
 constexpr std::string_view features::PER_TABLE_CACHING = "PER_TABLE_CACHING";
+constexpr std::string_view features::DIGEST_FOR_NULL_VALUES = "DIGEST_FOR_NULL_VALUES";

 static logging::logger logger("features");

@@ -90,8 +91,9 @@ feature_service::feature_service(feature_config cfg) : _config(cfg)
        , _hinted_handoff_separate_connection(*this, features::HINTED_HANDOFF_SEPARATE_CONNECTION)
        , _lwt_feature(*this, features::LWT)
        , _per_table_partitioners_feature(*this, features::PER_TABLE_PARTITIONERS)
-        , _per_table_caching_feature(*this, features::PER_TABLE_CACHING) {
-}
+        , _per_table_caching_feature(*this, features::PER_TABLE_CACHING)
+        , _digest_for_null_values_feature(*this, features::DIGEST_FOR_NULL_VALUES)
+{}

 feature_config feature_config_from_db_config(db::config& cfg, std::set<sstring> disabled) {
    feature_config fcfg;
@@ -179,6 +181,7 @@ std::set<std::string_view> feature_service::known_feature_set() {
        gms::features::MC_SSTABLE,
        gms::features::UDF,
        gms::features::CDC,
+        gms::features::DIGEST_FOR_NULL_VALUES,
    };

    for (const sstring& s : _config._disabled_features) {
@@ -269,6 +272,7 @@ void feature_service::enable(const std::set<std::string_view>& list) {
        std::ref(_lwt_feature),
        std::ref(_per_table_partitioners_feature),
        std::ref(_per_table_caching_feature),
+        std::ref(_digest_for_null_values_feature),
    })
    {
        if (list.count(f.name())) {
--- a/gms/feature_service.hh
+++ b/gms/feature_service.hh
@@ -103,6 +103,7 @@ private:
    gms::feature _lwt_feature;
    gms::feature _per_table_partitioners_feature;
    gms::feature _per_table_caching_feature;
+    gms::feature _digest_for_null_values_feature;

 public:
    bool cluster_supports_range_tombstones() const {
@@ -177,6 +178,10 @@ public:
        return _per_table_caching_feature;
    }

+    const feature& cluster_supports_digest_for_null_values() const {
+        return _digest_for_null_values_feature;
+    }
+
    bool cluster_supports_row_level_repair() const {
        return bool(_row_level_repair_feature);
    }
--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -428,6 +428,7 @@ future<> gossiper::handle_shutdown_msg(inet_address from) {
        return make_ready_future<>();
    }
    return seastar::async([this, from] {
+        auto permit = this->lock_endpoint(from).get0();
        this->mark_as_shutdown(from);
    });
 }
--- a/install-dependencies.sh
+++ b/install-dependencies.sh
@@ -98,6 +98,7 @@ fedora_packages=(
    debhelper
    fakeroot
    file
+    dpkg-dev
 )

 centos_packages=(
--- a/install.sh
+++ b/install.sh
@@ -132,6 +132,7 @@ relocate_python3() {
    cp "$script" "$relocateddir"
    cat > "$install"<<EOF
 #!/usr/bin/env bash
+export LC_ALL=en_US.UTF-8
 x="\$(readlink -f "\$0")"
 b="\$(basename "\$x")"
 d="\$(dirname "\$x")"
--- a/locator/abstract_replication_strategy.cc
+++ b/locator/abstract_replication_strategy.cc
@@ -168,15 +168,33 @@ insert_token_range_to_sorted_container_while_unwrapping(

 dht::token_range_vector
 abstract_replication_strategy::get_ranges(inet_address ep) const {
-    return get_ranges(ep, _token_metadata);
+    return do_get_ranges(ep, _token_metadata, false);
+}
+
+dht::token_range_vector
+abstract_replication_strategy::get_ranges_in_thread(inet_address ep) const {
+    return do_get_ranges(ep, _token_metadata, true);
 }

 dht::token_range_vector
 abstract_replication_strategy::get_ranges(inet_address ep, token_metadata& tm) const {
+    return do_get_ranges(ep, tm, false);
+}
+
+dht::token_range_vector
+abstract_replication_strategy::get_ranges_in_thread(inet_address ep, token_metadata& tm) const {
+    return do_get_ranges(ep, tm, true);
+}
+
+dht::token_range_vector
+abstract_replication_strategy::do_get_ranges(inet_address ep, token_metadata& tm, bool can_yield) const {
    dht::token_range_vector ret;
    auto prev_tok = tm.sorted_tokens().back();
    for (auto tok : tm.sorted_tokens()) {
        for (inet_address a : calculate_natural_endpoints(tok, tm)) {
+            if (can_yield) {
+                seastar::thread::maybe_yield();
+            }
            if (a == ep) {
                insert_token_range_to_sorted_container_while_unwrapping(prev_tok, tok, ret);
                break;
--- a/locator/abstract_replication_strategy.hh
+++ b/locator/abstract_replication_strategy.hh
@@ -113,10 +113,15 @@ public:
    // It the analogue of Origin's getAddressRanges().get(endpoint).
    // This function is not efficient, and not meant for the fast path.
    dht::token_range_vector get_ranges(inet_address ep) const;
+    dht::token_range_vector get_ranges_in_thread(inet_address ep) const;

    // Use the token_metadata provided by the caller instead of _token_metadata
    dht::token_range_vector get_ranges(inet_address ep, token_metadata& tm) const;
+    dht::token_range_vector get_ranges_in_thread(inet_address ep, token_metadata& tm) const;
+private:
+    dht::token_range_vector do_get_ranges(inet_address ep, token_metadata& tm, bool can_yield) const;

+public:
    // get_primary_ranges() returns the list of "primary ranges" for the given
    // endpoint. "Primary ranges" are the ranges that the node is responsible
    // for storing replica primarily, which means this is the first node
--- a/lua.cc
+++ b/lua.cc
@@ -262,14 +262,12 @@ static auto visit_lua_raw_value(lua_State* l, int index, Func&& f) {

 template <typename Func>
 static auto visit_decimal(const big_decimal &v, Func&& f) {
-    boost::multiprecision::cpp_int ten(10);
-    const auto& dividend = v.unscaled_value();
-    auto divisor = boost::multiprecision::pow(ten, v.scale());
+    boost::multiprecision::cpp_rational r = v.as_rational();
+    const boost::multiprecision::cpp_int& dividend = numerator(r);
+    const boost::multiprecision::cpp_int& divisor = denominator(r);
    if (dividend % divisor == 0) {
-        return f(utils::multiprecision_int(boost::multiprecision::cpp_int(dividend/divisor)));
+        return f(utils::multiprecision_int(dividend/divisor));
    }
-    boost::multiprecision::cpp_rational r = dividend;
-    r /= divisor;
    return f(r.convert_to<double>());
 }

--- a/main.cc
+++ b/main.cc
@@ -830,6 +830,7 @@ int main(int ac, char** av) {
            storage_proxy_smp_service_group_config.max_nonlocal_requests = 5000;
            spcfg.read_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get0();
            spcfg.write_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get0();
+            spcfg.hints_write_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get0();
            spcfg.write_ack_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get0();
            static db::view::node_update_backlog node_backlog(smp::count, 10ms);
            scheduling_group_key_config storage_proxy_stats_cfg =
@@ -967,12 +968,16 @@ int main(int ac, char** av) {
                mm.init_messaging_service();
            }).get();
            supervisor::notify("initializing storage proxy RPC verbs");
-            proxy.invoke_on_all([] (service::storage_proxy& p) {
-                p.init_messaging_service();
-            }).get();
+            proxy.invoke_on_all(&service::storage_proxy::init_messaging_service).get();
+            auto stop_proxy_handlers = defer_verbose_shutdown("storage proxy RPC verbs", [&proxy] {
+                proxy.invoke_on_all(&service::storage_proxy::uninit_messaging_service).get();
+            });

            supervisor::notify("starting streaming service");
            streaming::stream_session::init_streaming_service(db, sys_dist_ks, view_update_generator).get();
+            auto stop_streaming_service = defer_verbose_shutdown("streaming service", [] {
+                streaming::stream_session::uninit_streaming_service().get();
+            });
            api::set_server_stream_manager(ctx).get();

            supervisor::notify("starting hinted handoff manager");
@@ -1005,6 +1010,9 @@ int main(int ac, char** av) {
                rs.stop().get();
            });
            repair_init_messaging_service_handler(rs, sys_dist_ks, view_update_generator).get();
+            auto stop_repair_messages = defer_verbose_shutdown("repair message handlers", [] {
+                repair_uninit_messaging_service_handler().get();
+            });
            supervisor::notify("starting storage service", true);
            auto& ss = service::get_local_storage_service();
            ss.init_messaging_service_part().get();
@@ -1196,22 +1204,30 @@ int main(int ac, char** av) {
                std::optional<uint16_t> alternator_https_port;
                std::optional<tls::credentials_builder> creds;
                if (cfg->alternator_https_port()) {
-                    creds.emplace();
                    alternator_https_port = cfg->alternator_https_port();
-                    creds->set_dh_level(tls::dh_params::level::MEDIUM);
-                    creds->set_x509_key_file(cert, key, tls::x509_crt_format::PEM).get();
-                    if (trust_store.empty()) {
-                        creds->set_system_trust().get();
-                    } else {
-                        creds->set_x509_trust_file(trust_store, tls::x509_crt_format::PEM).get();
+                    creds.emplace();
+                    auto opts = cfg->alternator_encryption_options();
+                    if (opts.empty()) {
+                        // Earlier versions mistakenly configured Alternator's
+                        // HTTPS parameters via the "server_encryption_options"
+                        // configuration parameter. We *temporarily* continue
+                        // to allow this, for backward compatibility.
+                        opts = cfg->server_encryption_options();
+                        if (!opts.empty()) {
+                            startlog.warn("Setting server_encryption_options to configure "
+                                    "Alternator's HTTPS encryption is deprecated. Please "
+                                    "switch to setting alternator_encryption_options instead.");
+                        }
                    }
+                    creds->set_dh_level(tls::dh_params::level::MEDIUM);
+                    auto cert = get_or_default(opts, "certificate", db::config::get_conf_sub("scylla.crt").string());
+                    auto key = get_or_default(opts, "keyfile", db::config::get_conf_sub("scylla.key").string());
+                    creds->set_x509_key_file(cert, key, tls::x509_crt_format::PEM).get();
+                    auto prio = get_or_default(opts, "priority_string", sstring());
                    creds->set_priority_string(db::config::default_tls_priority);
                    if (!prio.empty()) {
                        creds->set_priority_string(prio);
                    }
-                    if (clauth) {
-                        creds->set_client_auth(seastar::tls::client_auth::REQUIRE);
-                    }
                }
                bool alternator_enforce_authorization = cfg->alternator_enforce_authorization();
                with_scheduling_group(dbcfg.statement_scheduling_group,
--- a/message/messaging_service.cc
+++ b/message/messaging_service.cc
@@ -572,7 +572,12 @@ messaging_service::initial_scheduling_info() const {

 scheduling_group
 messaging_service::scheduling_group_for_verb(messaging_verb verb) const {
-    return _scheduling_info_for_connection_index[get_rpc_client_idx(verb)].sched_group;
+    // We are not using get_rpc_client_idx() because it figures out the client
+    // index based on the current scheduling group, which is relevant when
+    // selecting the right client for sending a message, but is not relevant
+    // when registering handlers.
+    const auto idx = s_rpc_client_idx_table[static_cast<size_t>(verb)];
+    return _scheduling_info_for_connection_index[idx].sched_group;
 }

 scheduling_group
@@ -791,6 +796,10 @@ void messaging_service::register_stream_mutation_fragments(std::function<future<
    register_handler(this, messaging_verb::STREAM_MUTATION_FRAGMENTS, std::move(func));
 }

+future<> messaging_service::unregister_stream_mutation_fragments() {
+    return unregister_handler(messaging_verb::STREAM_MUTATION_FRAGMENTS);
+}
+
 template<class SinkType, class SourceType>
 future<rpc::sink<SinkType>, rpc::source<SourceType>>
 do_make_sink_source(messaging_verb verb, uint32_t repair_meta_id, shared_ptr<messaging_service::rpc_protocol_client_wrapper> rpc_client, std::unique_ptr<messaging_service::rpc_protocol_wrapper>& rpc) {
@@ -822,6 +831,9 @@ rpc::sink<repair_row_on_wire_with_cmd> messaging_service::make_sink_for_repair_g
 void messaging_service::register_repair_get_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_row_on_wire_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_hash_with_cmd> source)>&& func) {
    register_handler(this, messaging_verb::REPAIR_GET_ROW_DIFF_WITH_RPC_STREAM, std::move(func));
 }
+future<> messaging_service::unregister_repair_get_row_diff_with_rpc_stream() {
+    return unregister_handler(messaging_verb::REPAIR_GET_ROW_DIFF_WITH_RPC_STREAM);
+}

 // Wrapper for REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM
 future<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_stream_cmd>>
@@ -841,6 +853,9 @@ rpc::sink<repair_stream_cmd> messaging_service::make_sink_for_repair_put_row_dif
 void messaging_service::register_repair_put_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_stream_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_row_on_wire_with_cmd> source)>&& func) {
    register_handler(this, messaging_verb::REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM, std::move(func));
 }
+future<> messaging_service::unregister_repair_put_row_diff_with_rpc_stream() {
+    return unregister_handler(messaging_verb::REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM);
+}

 // Wrapper for REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM
 future<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd>>
@@ -860,6 +875,9 @@ rpc::sink<repair_hash_with_cmd> messaging_service::make_sink_for_repair_get_full
 void messaging_service::register_repair_get_full_row_hashes_with_rpc_stream(std::function<future<rpc::sink<repair_hash_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_stream_cmd> source)>&& func) {
    register_handler(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM, std::move(func));
 }
+future<> messaging_service::unregister_repair_get_full_row_hashes_with_rpc_stream() {
+    return unregister_handler(messaging_verb::REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM);
+}

 // Send a message for verb
 template <typename MsgIn, typename... MsgOut>
@@ -943,6 +961,9 @@ future<streaming::prepare_message> messaging_service::send_prepare_message(msg_a
    return send_message<streaming::prepare_message>(this, messaging_verb::PREPARE_MESSAGE, id,
        std::move(msg), plan_id, std::move(description), reason);
 }
+future<> messaging_service::unregister_prepare_message() {
+    return unregister_handler(messaging_verb::PREPARE_MESSAGE);
+}

 // PREPARE_DONE_MESSAGE
 void messaging_service::register_prepare_done_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func) {
@@ -952,6 +973,9 @@ future<> messaging_service::send_prepare_done_message(msg_addr id, UUID plan_id,
    return send_message<void>(this, messaging_verb::PREPARE_DONE_MESSAGE, id,
        plan_id, dst_cpu_id);
 }
+future<> messaging_service::unregister_prepare_done_message() { // Unregisters the PREPARE_DONE_MESSAGE handler; returns unregister_handler()'s future.
+    return unregister_handler(messaging_verb::PREPARE_DONE_MESSAGE);
+}

 // STREAM_MUTATION
 void messaging_service::register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool> fragmented, rpc::optional<streaming::stream_reason> reason)>&& func) {
@@ -976,6 +1000,9 @@ future<> messaging_service::send_stream_mutation_done(msg_addr id, UUID plan_id,
    return send_message<void>(this, messaging_verb::STREAM_MUTATION_DONE, id,
        plan_id, std::move(ranges), cf_id, dst_cpu_id);
 }
+future<> messaging_service::unregister_stream_mutation_done() { // Unregisters the STREAM_MUTATION_DONE handler; returns unregister_handler()'s future.
+    return unregister_handler(messaging_verb::STREAM_MUTATION_DONE);
+}

 // COMPLETE_MESSAGE
 void messaging_service::register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id, rpc::optional<bool> failed)>&& func) {
@@ -985,6 +1012,9 @@ future<> messaging_service::send_complete_message(msg_addr id, UUID plan_id, uns
    return send_message<void>(this, messaging_verb::COMPLETE_MESSAGE, id,
        plan_id, dst_cpu_id, failed);
 }
+future<> messaging_service::unregister_complete_message() { // Unregisters the COMPLETE_MESSAGE handler; returns unregister_handler()'s future.
+    return unregister_handler(messaging_verb::COMPLETE_MESSAGE);
+}

 void messaging_service::register_gossip_echo(std::function<future<> ()>&& func) {
    register_handler(this, messaging_verb::GOSSIP_ECHO, std::move(func));
@@ -1199,14 +1229,14 @@ future<partition_checksum> messaging_service::send_repair_checksum_range(
 }

 // Wrapper for REPAIR_GET_FULL_ROW_HASHES
-void messaging_service::register_repair_get_full_row_hashes(std::function<future<std::unordered_set<repair_hash>> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func) {
+void messaging_service::register_repair_get_full_row_hashes(std::function<future<repair_hash_set> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func) {
    register_handler(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(func));
 }
 future<> messaging_service::unregister_repair_get_full_row_hashes() {
    return unregister_handler(messaging_verb::REPAIR_GET_FULL_ROW_HASHES);
 }
-future<std::unordered_set<repair_hash>> messaging_service::send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id) {
-    return send_message<future<std::unordered_set<repair_hash>>>(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(id), repair_meta_id);
+future<repair_hash_set> messaging_service::send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id) {
+    return send_message<future<repair_hash_set>>(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(id), repair_meta_id);
 }

 // Wrapper for REPAIR_GET_COMBINED_ROW_HASH
@@ -1231,13 +1261,13 @@ future<get_sync_boundary_response> messaging_service::send_repair_get_sync_bound
 }

 // Wrapper for REPAIR_GET_ROW_DIFF
-void messaging_service::register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows)>&& func) {
+void messaging_service::register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows)>&& func) {
    register_handler(this, messaging_verb::REPAIR_GET_ROW_DIFF, std::move(func));
 }
 future<> messaging_service::unregister_repair_get_row_diff() {
    return unregister_handler(messaging_verb::REPAIR_GET_ROW_DIFF);
 }
-future<repair_rows_on_wire> messaging_service::send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows) {
+future<repair_rows_on_wire> messaging_service::send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows) {
    return send_message<future<repair_rows_on_wire>>(this, messaging_verb::REPAIR_GET_ROW_DIFF, std::move(id), repair_meta_id, std::move(set_diff), needs_all_rows);
 }

--- a/message/messaging_service.hh
+++ b/message/messaging_service.hh
@@ -297,10 +297,12 @@ public:
            streaming::prepare_message msg, UUID plan_id, sstring description, rpc::optional<streaming::stream_reason> reason)>&& func);
    future<streaming::prepare_message> send_prepare_message(msg_addr id, streaming::prepare_message msg, UUID plan_id,
            sstring description, streaming::stream_reason);
+    future<> unregister_prepare_message();

    // Wrapper for PREPARE_DONE_MESSAGE verb
    void register_prepare_done_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func);
    future<> send_prepare_done_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id);
+    future<> unregister_prepare_done_message();

    // Wrapper for STREAM_MUTATION verb
    void register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool>, rpc::optional<streaming::stream_reason>)>&& func);
@@ -309,6 +311,7 @@ public:
    // Wrapper for STREAM_MUTATION_FRAGMENTS
    // The receiver of STREAM_MUTATION_FRAGMENTS sends status code to the sender to notify any error on the receiver side. The status code is of type int32_t. 0 means successful, -1 means error, other status code value are reserved for future use.
    void register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason> reason_opt, rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>> source)>&& func);
+    future<> unregister_stream_mutation_fragments();
    rpc::sink<int32_t> make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>>& source);
    future<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>> make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id);

@@ -316,22 +319,27 @@ public:
    future<rpc::sink<repair_hash_with_cmd>, rpc::source<repair_row_on_wire_with_cmd>> make_sink_and_source_for_repair_get_row_diff_with_rpc_stream(uint32_t repair_meta_id, msg_addr id);
    rpc::sink<repair_row_on_wire_with_cmd> make_sink_for_repair_get_row_diff_with_rpc_stream(rpc::source<repair_hash_with_cmd>& source);
    void register_repair_get_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_row_on_wire_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_hash_with_cmd> source)>&& func);
+    future<> unregister_repair_get_row_diff_with_rpc_stream();

    // Wrapper for REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM
    future<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_stream_cmd>> make_sink_and_source_for_repair_put_row_diff_with_rpc_stream(uint32_t repair_meta_id, msg_addr id);
    rpc::sink<repair_stream_cmd> make_sink_for_repair_put_row_diff_with_rpc_stream(rpc::source<repair_row_on_wire_with_cmd>& source);
    void register_repair_put_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_stream_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_row_on_wire_with_cmd> source)>&& func);
+    future<> unregister_repair_put_row_diff_with_rpc_stream();

    // Wrapper for REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM
    future<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd>> make_sink_and_source_for_repair_get_full_row_hashes_with_rpc_stream(uint32_t repair_meta_id, msg_addr id);
    rpc::sink<repair_hash_with_cmd> make_sink_for_repair_get_full_row_hashes_with_rpc_stream(rpc::source<repair_stream_cmd>& source);
    void register_repair_get_full_row_hashes_with_rpc_stream(std::function<future<rpc::sink<repair_hash_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_stream_cmd> source)>&& func);
+    future<> unregister_repair_get_full_row_hashes_with_rpc_stream();

    void register_stream_mutation_done(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id)>&& func);
    future<> send_stream_mutation_done(msg_addr id, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id);
+    future<> unregister_stream_mutation_done();

    void register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id, rpc::optional<bool> failed)>&& func);
    future<> send_complete_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id, bool failed = false);
+    future<> unregister_complete_message();

    // Wrapper for REPAIR_CHECKSUM_RANGE verb
    void register_repair_checksum_range(std::function<future<partition_checksum> (sstring keyspace, sstring cf, dht::token_range range, rpc::optional<repair_checksum> hash_version)>&& func);
@@ -339,9 +347,9 @@ public:
    future<partition_checksum> send_repair_checksum_range(msg_addr id, sstring keyspace, sstring cf, dht::token_range range, repair_checksum hash_version);

    // Wrapper for REPAIR_GET_FULL_ROW_HASHES
-    void register_repair_get_full_row_hashes(std::function<future<std::unordered_set<repair_hash>> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func);
+    void register_repair_get_full_row_hashes(std::function<future<repair_hash_set> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func);
    future<> unregister_repair_get_full_row_hashes();
-    future<std::unordered_set<repair_hash>> send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id);
+    future<repair_hash_set> send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id);

    // Wrapper for REPAIR_GET_COMBINED_ROW_HASH
    void register_repair_get_combined_row_hash(std::function<future<get_combined_row_hash_response> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::optional<repair_sync_boundary> common_sync_boundary)>&& func);
@@ -354,9 +362,9 @@ public:
    future<get_sync_boundary_response> send_repair_get_sync_boundary(msg_addr id, uint32_t repair_meta_id, std::optional<repair_sync_boundary> skipped_sync_boundary);

    // Wrapper for REPAIR_GET_ROW_DIFF
-    void register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows)>&& func);
+    void register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows)>&& func);
    future<> unregister_repair_get_row_diff();
-    future<repair_rows_on_wire> send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows);
+    future<repair_rows_on_wire> send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows);

    // Wrapper for REPAIR_PUT_ROW_DIFF
    void register_repair_put_row_diff(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_rows_on_wire row_diff)>&& func);
--- a/multishard_mutation_query.cc
+++ b/multishard_mutation_query.cc
@@ -300,10 +300,9 @@ flat_mutation_reader read_context::create_reader(
    }

    auto& table = _db.local().find_column_family(schema);
-    auto class_config = _db.local().make_query_class_config();

    if (!rm.rparts) {
-        rm.rparts = make_foreign(std::make_unique<reader_meta::remote_parts>(class_config.semaphore));
+        rm.rparts = make_foreign(std::make_unique<reader_meta::remote_parts>(semaphore()));
    }

    rm.rparts->range = std::make_unique<const dht::partition_range>(pr);
@@ -513,18 +512,28 @@ future<> read_context::lookup_readers() {
    }

    return parallel_for_each(boost::irange(0u, smp::count), [this] (shard_id shard) {
-        return _db.invoke_on(shard, [shard, cmd = &_cmd, ranges = &_ranges, gs = global_schema_ptr(_schema),
+        return _db.invoke_on(shard, [this, shard, cmd = &_cmd, ranges = &_ranges, gs = global_schema_ptr(_schema),
                gts = tracing::global_trace_state_ptr(_trace_state)] (database& db) mutable {
            auto schema = gs.get();
            auto querier_opt = db.get_querier_cache().lookup_shard_mutation_querier(cmd->query_uuid, *schema, *ranges, cmd->slice, gts.get());
            auto& table = db.find_column_family(schema);
-            auto& semaphore = db.make_query_class_config().semaphore;
+            auto& semaphore = this->semaphore();

            if (!querier_opt) {
                return reader_meta(reader_state::inexistent, reader_meta::remote_parts(semaphore));
            }

            auto& q = *querier_opt;
+
+            if (&q.permit().semaphore() != &semaphore) {
+                on_internal_error(mmq_log, format("looked-up reader belongs to different semaphore than the one appropriate for this query class: "
+                        "looked-up reader belongs to {} (0x{:x}) the query class appropriate is {} (0x{:x})",
+                        q.permit().semaphore().name(),
+                        reinterpret_cast<uintptr_t>(&q.permit().semaphore()),
+                        semaphore.name(),
+                        reinterpret_cast<uintptr_t>(&semaphore)));
+            }
+
            auto handle = pause(semaphore, std::move(q).reader());
            return reader_meta(
                    reader_state::successful_lookup,
--- a/mutation_partition.cc
+++ b/mutation_partition.cc
@@ -734,56 +734,78 @@ void write_counter_cell(RowWriter& w, const query::partition_slice& slice, ::ato
  });
 }

-// Used to return the timestamp of the latest update to the row
-struct max_timestamp {
-    api::timestamp_type max = api::missing_timestamp;
-
-    void update(api::timestamp_type ts) {
-        max = std::max(max, ts);
-    }
-};
-
-template<>
-struct appending_hash<row> {
-    template<typename Hasher>
-    void operator()(Hasher& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const {
-        for (auto id : columns) {
-            const cell_and_hash* cell_and_hash = cells.find_cell_and_hash(id);
-            if (!cell_and_hash) {
-                return;
-            }
-            auto&& def = s.column_at(kind, id);
-            if (def.is_atomic()) {
-                max_ts.update(cell_and_hash->cell.as_atomic_cell(def).timestamp());
-                if constexpr (query::using_hash_of_hash_v<Hasher>) {
-                    if (cell_and_hash->hash) {
-                        feed_hash(h, *cell_and_hash->hash);
-                    } else {
-                        query::default_hasher cellh;
-                        feed_hash(cellh, cell_and_hash->cell.as_atomic_cell(def), def);
-                        feed_hash(h, cellh.finalize_uint64());
-                    }
+template<typename Hasher>
+void appending_hash<row>::operator()(Hasher& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const {
+    for (auto id : columns) {
+        const cell_and_hash* cell_and_hash = cells.find_cell_and_hash(id);
+        if (!cell_and_hash) {
+            feed_hash(h, appending_hash<row>::null_hash_value);
+            continue;
+        }
+        auto&& def = s.column_at(kind, id);
+        if (def.is_atomic()) {
+            max_ts.update(cell_and_hash->cell.as_atomic_cell(def).timestamp());
+            if constexpr (query::using_hash_of_hash_v<Hasher>) {
+                if (cell_and_hash->hash) {
+                    feed_hash(h, *cell_and_hash->hash);
                } else {
-                    feed_hash(h, cell_and_hash->cell.as_atomic_cell(def), def);
+                    query::default_hasher cellh;
+                    feed_hash(cellh, cell_and_hash->cell.as_atomic_cell(def), def);
+                    feed_hash(h, cellh.finalize_uint64());
                }
            } else {
-                auto cm = cell_and_hash->cell.as_collection_mutation();
-                max_ts.update(cm.last_update(*def.type));
-                if constexpr (query::using_hash_of_hash_v<Hasher>) {
-                    if (cell_and_hash->hash) {
-                        feed_hash(h, *cell_and_hash->hash);
-                    } else {
-                        query::default_hasher cellh;
-                        feed_hash(cellh, cm, def);
-                        feed_hash(h, cellh.finalize_uint64());
-                    }
+                feed_hash(h, cell_and_hash->cell.as_atomic_cell(def), def);
+            }
+        } else {
+            auto cm = cell_and_hash->cell.as_collection_mutation();
+            max_ts.update(cm.last_update(*def.type));
+            if constexpr (query::using_hash_of_hash_v<Hasher>) {
+                if (cell_and_hash->hash) {
+                    feed_hash(h, *cell_and_hash->hash);
                } else {
-                    feed_hash(h, cm, def);
+                    query::default_hasher cellh;
+                    feed_hash(cellh, cm, def);
+                    feed_hash(h, cellh.finalize_uint64());
                }
+            } else {
+                feed_hash(h, cm, def);
            }
        }
    }
-};
+}
+// Instantiation for mutation_test.cc
+template void appending_hash<row>::operator()<xx_hasher>(xx_hasher& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const;
+
+template<> // Specialization for legacy_xx_hasher_without_null_digest: a column with no cell stops hashing instead of feeding null_hash_value.
+void appending_hash<row>::operator()<legacy_xx_hasher_without_null_digest>(legacy_xx_hasher_without_null_digest& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const {
+    for (auto id : columns) {
+        const cell_and_hash* cell_and_hash = cells.find_cell_and_hash(id);
+        if (!cell_and_hash) {
+            return; // nothing is fed for the missing cell, and the remaining columns are not hashed at all
+        }
+        auto&& def = s.column_at(kind, id);
+        if (def.is_atomic()) {
+            max_ts.update(cell_and_hash->cell.as_atomic_cell(def).timestamp()); // track the newest cell timestamp seen
+            if (cell_and_hash->hash) { // a hash stored alongside the cell is fed as-is
+                feed_hash(h, *cell_and_hash->hash);
+            } else { // otherwise hash the cell with a separate hasher and feed its 64-bit result
+                query::default_hasher cellh;
+                feed_hash(cellh, cell_and_hash->cell.as_atomic_cell(def), def);
+                feed_hash(h, cellh.finalize_uint64());
+            }
+        } else {
+            auto cm = cell_and_hash->cell.as_collection_mutation();
+            max_ts.update(cm.last_update(*def.type)); // for collections the timestamp comes from last_update()
+            if (cell_and_hash->hash) { // same stored-hash / separate-hasher split as for atomic cells
+                feed_hash(h, *cell_and_hash->hash);
+            } else {
+                query::default_hasher cellh;
+                feed_hash(cellh, cm, def);
+                feed_hash(h, cellh.finalize_uint64());
+            }
+        }
+    }
+}

 cell_hash_opt row::cell_hash_for(column_id id) const {
    if (_type == storage_type::vector) {
@@ -1721,7 +1743,7 @@ void row::apply_monotonically(const schema& s, column_kind kind, row&& other) {
 // we erase the live cells according to the shadowable_tombstone rules.
 static bool dead_marker_shadows_row(const schema& s, column_kind kind, const row_marker& marker) {
    return s.is_view()
-            && !s.view_info()->base_non_pk_columns_in_view_pk().empty()
+            && s.view_info()->has_base_non_pk_columns_in_view_pk()
            && !marker.is_live()
            && kind == column_kind::regular_column; // not applicable to static rows
 }
--- a/mutation_partition.hh
+++ b/mutation_partition.hh
@@ -649,6 +649,22 @@ public:
    };
 };

+// Used to return the timestamp of the latest update to the row
+struct max_timestamp {
+    api::timestamp_type max = api::missing_timestamp; // value reported when update() was never called
+
+    void update(api::timestamp_type ts) {
+        max = std::max(max, ts); // keep the larger of the stored and the offered timestamp
+    }
+};
+
+template<> // Hashes the cells of a row for a given list of columns; only declared here, the operator is defined in mutation_partition.cc.
+struct appending_hash<row> {
+    static constexpr int null_hash_value = 0xbeefcafe; // fed to the hasher by the generic operator() for a column that has no cell
+    template<typename Hasher>
+    void operator()(Hasher& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const; // max_ts is updated with the timestamps of the hashed cells
+};
+
 class row_marker;
 int compare_row_marker_for_merge(const row_marker& left, const row_marker& right) noexcept;

--- a/mutation_query.hh
+++ b/mutation_query.hh
@@ -114,9 +114,6 @@ class reconcilable_result_builder {
    const schema& _schema;
    const query::partition_slice& _slice;

-    utils::chunked_vector<partition> _result;
-    uint32_t _live_rows{};
-
    bool _return_static_content_on_partition_with_no_rows{};
    bool _static_row_is_alive{};
    uint32_t _total_live_rows = 0;
@@ -124,6 +121,10 @@ class reconcilable_result_builder {
    stop_iteration _stop;
    bool _short_read_allowed;
    std::optional<streamed_mutation_freezer> _mutation_consumer;
+
+    uint32_t _live_rows{};
+    // Keep this the last member so it is destroyed first (members are destroyed in reverse declaration order). #7240
+    utils::chunked_vector<partition> _result;
 public:
    reconcilable_result_builder(const schema& s, const query::partition_slice& slice,
                                query::result_memory_accounter&& accounter)
--- a/mutation_reader.cc
+++ b/mutation_reader.cc
@@ -30,6 +30,7 @@
 #include "schema_registry.hh"
 #include "mutation_compactor.hh"

+logging::logger mrlog("mutation_reader");

 static constexpr size_t merger_small_vector_size = 4;

@@ -659,6 +660,8 @@ flat_mutation_reader make_combined_reader(schema_ptr schema,
    return make_combined_reader(std::move(schema), std::move(v), fwd_sm, fwd_mr);
 }

+const ssize_t new_reader_base_cost{16 * 1024};
+
 class restricting_mutation_reader : public flat_mutation_reader::impl {
    struct mutation_source_and_params {
        mutation_source _ms;
@@ -685,8 +688,6 @@ class restricting_mutation_reader : public flat_mutation_reader::impl {
    };
    std::variant<pending_state, admitted_state> _state;

-    static const ssize_t new_reader_base_cost{16 * 1024};
-
    template<typename Function>
    requires std::is_move_constructible<Function>::value
        && requires(Function fn, flat_mutation_reader& reader) {
@@ -1026,6 +1027,13 @@ private:
    bool _reader_created = false;
    bool _drop_partition_start = false;
    bool _drop_static_row = false;
+    // Trim range tombstones at the start of the buffer to the start of the read
+    // range (_next_position_in_partition). Set after reader recreation.
+    // Also validate the position of the first untrimmed mutation fragment.
+    bool _trim_range_tombstones = false;
+    // Validate the partition key of the first emitted partition, set after the
+    // reader was recreated.
+    bool _validate_partition_key = false;
    position_in_partition::tri_compare _tri_cmp;

    std::optional<dht::decorated_key> _last_pkey;
@@ -1047,7 +1055,10 @@ private:
    void adjust_partition_slice();
    flat_mutation_reader recreate_reader();
    flat_mutation_reader resume_or_create_reader();
+    void maybe_validate_partition_start(const circular_buffer<mutation_fragment>& buffer);
+    void validate_position_in_partition(position_in_partition_view pos) const;
    bool should_drop_fragment(const mutation_fragment& mf);
+    bool maybe_trim_range_tombstone(mutation_fragment& mf) const;
    future<> do_fill_buffer(flat_mutation_reader& reader, db::timeout_clock::time_point timeout);
    future<> fill_buffer(flat_mutation_reader& reader, db::timeout_clock::time_point timeout);

@@ -1120,16 +1131,11 @@ void evictable_reader::update_next_position(flat_mutation_reader& reader) {
            _next_position_in_partition = position_in_partition::before_all_clustered_rows();
            break;
        case partition_region::clustered:
-            if (reader.is_buffer_empty()) {
-                _next_position_in_partition = position_in_partition::after_key(last_pos);
-            } else {
-               const auto& next_frag = reader.peek_buffer();
-               if (next_frag.is_end_of_partition()) {
+            if (!reader.is_buffer_empty() && reader.peek_buffer().is_end_of_partition()) {
                   push_mutation_fragment(reader.pop_mutation_fragment());
                   _next_position_in_partition = position_in_partition::for_partition_start();
-               } else {
-                   _next_position_in_partition = position_in_partition(next_frag.position());
-               }
+            } else {
+                _next_position_in_partition = position_in_partition::after_key(last_pos);
            }
            break;
        case partition_region::partition_end:
@@ -1154,6 +1160,9 @@ flat_mutation_reader evictable_reader::recreate_reader() {
    const dht::partition_range* range = _pr;
    const query::partition_slice* slice = &_ps;

+    _range_override.reset();
+    _slice_override.reset();
+
    if (_last_pkey) {
        bool partition_range_is_inclusive = true;

@@ -1190,6 +1199,9 @@ flat_mutation_reader evictable_reader::recreate_reader() {
        range = &*_range_override;
    }

+    _trim_range_tombstones = true;
+    _validate_partition_key = true;
+
    return _ms.make_reader(
            _schema,
            _permit,
@@ -1216,6 +1228,78 @@ flat_mutation_reader evictable_reader::resume_or_create_reader() {
    return recreate_reader();
 }

+template <typename... Arg>
+static void require(bool condition, const char* msg, const Arg&... arg) { // Validation helper: does nothing when `condition` holds.
+    if (!condition) {
+        on_internal_error(mrlog, format(msg, arg...)); // msg is a format string and arg... its arguments; reported through the mrlog logger
+    }
+}
+
+void evictable_reader::maybe_validate_partition_start(const circular_buffer<mutation_fragment>& buffer) { // One-shot check of the partition-start at the front of `buffer`; violations are reported through require().
+    if (!_validate_partition_key || buffer.empty()) {
+        return; // not armed, or nothing buffered yet (the flag is left untouched in that case)
+    }
+
+    // _validate_partition_key is set here, so we can assume the first fragment is a partition-start.
+    const auto& ps = buffer.front().as_partition_start();
+    const auto tri_cmp = dht::ring_position_comparator(*_schema);
+    // If we recreated the reader after fast-forwarding it we won't have
+    // _last_pkey set. In this case it is enough to check if the partition
+    // is in range.
+    if (_last_pkey) {
+        const auto cmp_res = tri_cmp(*_last_pkey, ps.key());
+        if (_drop_partition_start) { // should be the same partition
+            require(
+                    cmp_res == 0,
+                    "{}(): validation failed, expected partition with key equal to _last_pkey {} due to _drop_partition_start being set, but got {}",
+                    __FUNCTION__,
+                    *_last_pkey,
+                    ps.key());
+        } else { // should be a larger partition
+            require(
+                    cmp_res < 0,
+                    "{}(): validation failed, expected partition with key larger than _last_pkey {} due to _drop_partition_start being unset, but got {}",
+                    __FUNCTION__,
+                    *_last_pkey,
+                    ps.key());
+        }
+    }
+    const auto& prange = _range_override ? *_range_override : *_pr; // the override, when present, takes precedence over the original range
+    require(
+            // TODO: somehow avoid this copy
+            prange.contains(ps.key(), tri_cmp),
+            "{}(): validation failed, expected partition with key that falls into current range {}, but got {}",
+            __FUNCTION__,
+            prange,
+            ps.key());
+
+    _validate_partition_key = false; // validated once; recreate_reader() sets the flag again
+}
+
+void evictable_reader::validate_position_in_partition(position_in_partition_view pos) const { // Reports (via require()) a `pos` that is before _next_position_in_partition or, with a slice override, outside all of its clustering ranges.
+    require(
+            _tri_cmp(_next_position_in_partition, pos) <= 0,
+            "{}(): validation failed, expected position in partition that is larger-than-equal than _next_position_in_partition {}, but got {}",
+            __FUNCTION__,
+            _next_position_in_partition,
+            pos);
+
+    if (_slice_override && pos.region() == partition_region::clustered) { // the range check only applies to clustered positions
+        const auto ranges = _slice_override->row_ranges(*_schema, _last_pkey->key()); // NOTE(review): _last_pkey is dereferenced unchecked — presumably always set when _slice_override is; confirm in recreate_reader()
+        const bool any_contains = std::any_of(ranges.begin(), ranges.end(), [this, &pos] (const query::clustering_range& cr) {
+            // TODO: somehow avoid this copy
+            auto range = position_range(cr);
+            return range.contains(*_schema, pos);
+        });
+        require(
+                any_contains,
+                "{}(): validation failed, expected clustering fragment that is included in the slice {}, but got {}",
+                __FUNCTION__,
+                *_slice_override,
+                pos);
+    }
+}
+
 bool evictable_reader::should_drop_fragment(const mutation_fragment& mf) {
    if (_drop_partition_start && mf.is_partition_start()) {
        _drop_partition_start = false;
@@ -1228,12 +1312,50 @@ bool evictable_reader::should_drop_fragment(const mutation_fragment& mf) {
    return false;
 }

+bool evictable_reader::maybe_trim_range_tombstone(mutation_fragment& mf) const { // Returns true only when `mf` is a range tombstone whose start was trimmed; callers store the result in _trim_range_tombstones.
+    // We either didn't read a partition yet (evicted after fast-forwarding) or
+    // didn't stop in a clustering region. We don't need to trim range
+    // tombstones in either case.
+    if (!_last_pkey || _next_position_in_partition.region() != partition_region::clustered) {
+        return false;
+    }
+    if (!mf.is_range_tombstone()) {
+        validate_position_in_partition(mf.position()); // first fragment that is not a range tombstone: validate it, nothing to trim
+        return false;
+    }
+
+    if (_tri_cmp(mf.position(), _next_position_in_partition) >= 0) {
+        validate_position_in_partition(mf.position());
+        return false; // rt in range, no need to trim
+    }
+
+    auto& rt = mf.as_mutable_range_tombstone(); // from here on the tombstone starts before _next_position_in_partition
+
+    require(
+            _tri_cmp(_next_position_in_partition, rt.end_position()) <= 0, // note: equality passes, although the message below says "larger than"
+            "{}(): validation failed, expected range tombstone with end pos larger than _next_position_in_partition {}, but got {}",
+            __FUNCTION__,
+            _next_position_in_partition,
+            rt.end_position());
+
+    rt.set_start(*_schema, position_in_partition_view::before_key(_next_position_in_partition)); // new start is derived from the read position
+
+    return true;
+}
+
 future<> evictable_reader::do_fill_buffer(flat_mutation_reader& reader, db::timeout_clock::time_point timeout) {
    if (!_drop_partition_start && !_drop_static_row) {
-        return reader.fill_buffer(timeout);
+        auto fill_buf_fut = reader.fill_buffer(timeout);
+        if (_validate_partition_key) {
+            fill_buf_fut = fill_buf_fut.then([this, &reader] {
+                maybe_validate_partition_start(reader.buffer());
+            });
+        }
+        return fill_buf_fut;
    }
    return repeat([this, &reader, timeout] {
        return reader.fill_buffer(timeout).then([this, &reader] {
+            maybe_validate_partition_start(reader.buffer());
            while (!reader.is_buffer_empty() && should_drop_fragment(reader.peek_buffer())) {
                reader.pop_mutation_fragment();
            }
@@ -1247,6 +1369,11 @@ future<> evictable_reader::fill_buffer(flat_mutation_reader& reader, db::timeout
        if (reader.is_buffer_empty()) {
            return make_ready_future<>();
        }
+        while (_trim_range_tombstones && !reader.is_buffer_empty()) {
+            auto mf = reader.pop_mutation_fragment();
+            _trim_range_tombstones = maybe_trim_range_tombstone(mf);
+            push_mutation_fragment(std::move(mf));
+        }
        reader.move_buffer_content_to(*this);
        auto stop = [this, &reader] {
            // The only problematic fragment kind is the range tombstone.
@@ -1287,7 +1414,13 @@ future<> evictable_reader::fill_buffer(flat_mutation_reader& reader, db::timeout
            if (reader.is_buffer_empty()) {
                return do_fill_buffer(reader, timeout);
            }
-            push_mutation_fragment(reader.pop_mutation_fragment());
+            if (_trim_range_tombstones) {
+                auto mf = reader.pop_mutation_fragment();
+                _trim_range_tombstones = maybe_trim_range_tombstone(mf);
+                push_mutation_fragment(std::move(mf));
+            } else {
+                push_mutation_fragment(reader.pop_mutation_fragment());
+            }
            return make_ready_future<>();
        });
    }).then([this, &reader] {
--- a/mutation_reader.hh
+++ b/mutation_reader.hh
@@ -304,6 +304,8 @@ public:
 mutation_source make_empty_mutation_source();
 snapshot_source make_empty_snapshot_source();

+extern const ssize_t new_reader_base_cost;
+
 // Creates a restricted reader whose resource usages will be tracked
 // during it's lifetime. If there are not enough resources (dues to
 // existing readers) to create the new reader, it's construction will
--- a/position_in_partition.hh
+++ b/position_in_partition.hh
@@ -163,6 +163,11 @@ public:
        return {partition_region::clustered, bound_weight::before_all_prefixed, &ck};
    }

+    // Returns a clustered-region view over pos._ck: a bound weight of `equal` is replaced by before_all_prefixed (same result as before_key(pos._ck)), any other weight is kept. NOTE(review): the region is always set to clustered, so pos is presumably expected to be a clustered position - confirm with callers.
+    static position_in_partition_view before_key(position_in_partition_view pos) {
+        return {partition_region::clustered, pos._bound_weight == bound_weight::equal ? bound_weight::before_all_prefixed : pos._bound_weight, pos._ck};
+    }
+
    partition_region region() const { return _type; }
    bound_weight get_bound_weight() const { return _bound_weight; }
    bool is_partition_start() const { return _type == partition_region::partition_start; }
--- a/reader_concurrency_semaphore.cc
+++ b/reader_concurrency_semaphore.cc
@@ -28,6 +28,7 @@

 reader_permit::resource_units::resource_units(reader_concurrency_semaphore& semaphore, reader_resources res) noexcept
        : _semaphore(&semaphore), _resources(res) {
+    _semaphore->consume(res); // the units consume their own resources on construction, so code that creates them no longer deducts from the semaphore separately
 }

 reader_permit::resource_units::resource_units(resource_units&& o) noexcept
@@ -75,7 +76,6 @@ reader_permit::resource_units reader_permit::consume_memory(size_t memory) {
 }

 reader_permit::resource_units reader_permit::consume_resources(reader_resources res) {
-    _semaphore->consume(res);
    return resource_units(*_semaphore, res);
 }

@@ -83,7 +83,6 @@ void reader_concurrency_semaphore::signal(const resources& r) noexcept {
    _resources += r;
    while (!_wait_list.empty() && has_available_units(_wait_list.front().res)) {
        auto& x = _wait_list.front();
-        _resources -= x.res;
        try {
            x.pr.set_value(reader_permit::resource_units(*this, x.res));
        } catch (...) {
@@ -104,7 +103,7 @@ reader_concurrency_semaphore::inactive_read_handle reader_concurrency_semaphore:
        const auto [it, _] = _inactive_reads.emplace(_next_id++, std::move(ir));
        (void)_;
        ++_inactive_read_stats.population;
-        return inactive_read_handle(it->first);
+        return inactive_read_handle(*this, it->first);
    }

    // The evicted reader will release its permit, hopefully allowing us to
@@ -115,6 +114,17 @@ reader_concurrency_semaphore::inactive_read_handle reader_concurrency_semaphore:
 }

 std::unique_ptr<reader_concurrency_semaphore::inactive_read> reader_concurrency_semaphore::unregister_inactive_read(inactive_read_handle irh) {
+    if (irh && irh._sem != this) {
+        throw std::runtime_error(fmt::format(
+                    "reader_concurrency_semaphore::unregister_inactive_read(): "
+                    "attempted to unregister an inactive read with a handle belonging to another semaphore: "
+                    "this is {} (0x{:x}) but the handle belongs to {} (0x{:x})",
+                    name(),
+                    reinterpret_cast<uintptr_t>(this),
+                    irh._sem->name(),
+                    reinterpret_cast<uintptr_t>(irh._sem)));
+    }
+
    if (auto it = _inactive_reads.find(irh._id); it != _inactive_reads.end()) {
        auto ir = std::move(it->second);
        _inactive_reads.erase(it);
@@ -158,7 +168,6 @@ future<reader_permit::resource_units> reader_concurrency_semaphore::do_wait_admi
        --_inactive_read_stats.population;
    }
    if (may_proceed(r)) {
-        _resources -= r;
        return make_ready_future<reader_permit::resource_units>(reader_permit::resource_units(*this, r));
    }
    promise<reader_permit::resource_units> pr;
--- a/reader_concurrency_semaphore.hh
+++ b/reader_concurrency_semaphore.hh
@@ -60,18 +60,20 @@ public:
    };

    class inactive_read_handle {
+        reader_concurrency_semaphore* _sem = nullptr;
        uint64_t _id = 0;

        friend class reader_concurrency_semaphore;

-        explicit inactive_read_handle(uint64_t id)
-            : _id(id) {
+        explicit inactive_read_handle(reader_concurrency_semaphore& sem, uint64_t id)
+            : _sem(&sem), _id(id) {
        }
    public:
        inactive_read_handle() = default;
-        inactive_read_handle(inactive_read_handle&& o) : _id(std::exchange(o._id, 0)) {
+        inactive_read_handle(inactive_read_handle&& o) : _sem(std::exchange(o._sem, nullptr)), _id(std::exchange(o._id, 0)) {
        }
        inactive_read_handle& operator=(inactive_read_handle&& o) {
+            _sem = std::exchange(o._sem, nullptr);
            _id = std::exchange(o._id, 0);
            return *this;
        }
@@ -105,6 +107,7 @@ private:
    };

 private:
+    const resources _initial_resources;
    resources _resources;

    expiring_fifo<entry, expiry_handler, db::timeout_clock> _wait_list;
@@ -135,7 +138,8 @@ public:
            sstring name,
            size_t max_queue_length = std::numeric_limits<size_t>::max(),
            std::function<void()> prethrow_action = nullptr)
-        : _resources(count, memory)
+        : _initial_resources(count, memory)
+        , _resources(count, memory)
        , _wait_list(expiry_handler(name))
        , _name(std::move(name))
        , _max_queue_length(max_queue_length)
@@ -144,11 +148,11 @@ public:
    /// Create a semaphore with practically unlimited count and memory.
    ///
    /// And conversely, no queue limit either.
-    explicit reader_concurrency_semaphore(no_limits)
+    explicit reader_concurrency_semaphore(no_limits, sstring name = "unlimited reader_concurrency_semaphore")
        : reader_concurrency_semaphore(
                std::numeric_limits<int>::max(),
                std::numeric_limits<ssize_t>::max(),
-                "unlimited reader_concurrency_semaphore") {}
+                std::move(name)) {}

    ~reader_concurrency_semaphore();

@@ -158,6 +162,13 @@ public:
    reader_concurrency_semaphore(reader_concurrency_semaphore&&) = delete;
    reader_concurrency_semaphore& operator=(reader_concurrency_semaphore&&) = delete;

+    /// Returns the name of the semaphore
+    ///
+    /// If the semaphore has no name, "unnamed reader concurrency semaphore" is returned.
+    std::string_view name() const {
+        return _name.empty() ? "unnamed reader concurrency semaphore" : std::string_view(_name); // non-owning view: refers either to the string literal or to _name
+    }
+
    /// Register an inactive read.
    ///
    /// The semaphore will evict this read when there is a shortage of
@@ -193,6 +204,10 @@ public:

    reader_permit make_permit();

+    const resources initial_resources() const { // returns a copy of _initial_resources, the const member initialized from the constructor's (count, memory) arguments
+        return _initial_resources;
+    }
+
    const resources available_resources() const {
        return _resources;
    }
--- a/reader_permit.hh
+++ b/reader_permit.hh
@@ -42,12 +42,20 @@ struct reader_resources {
        return count >= other.count && memory >= other.memory;
    }

+    reader_resources operator-(const reader_resources& other) const { // member-wise difference of count and memory, returned as a new value; neither operand is modified
+        return reader_resources{count - other.count, memory - other.memory};
+    }
+
    reader_resources& operator-=(const reader_resources& other) {
        count -= other.count;
        memory -= other.memory;
        return *this;
    }

+    reader_resources operator+(const reader_resources& other) const { // member-wise sum of count and memory, returned as a new value; neither operand is modified
+        return reader_resources{count + other.count, memory + other.memory};
+    }
+
    reader_resources& operator+=(const reader_resources& other) {
        count += other.count;
        memory += other.memory;
--- a/redis/commands.cc
+++ b/redis/commands.cc
@@ -62,7 +62,7 @@ shared_ptr<abstract_command> exists::prepare(service::storage_proxy& proxy, requ
 }

 future<redis_message> exists::execute(service::storage_proxy& proxy, redis::redis_options& options, service_permit permit) {
-    return seastar::do_for_each(_keys, [&proxy, &options, &permit, this] (bytes key) {
+    return seastar::do_for_each(_keys, [&proxy, &options, permit, this] (bytes& key) {
        return redis::read_strings(proxy, options, key, permit).then([this] (lw_shared_ptr<strings_result> result) {
            if (result->has_result()) {
                _count++;
--- a/reloc/build_deb.sh
+++ b/reloc/build_deb.sh
@@ -44,15 +44,15 @@ mkdir -p $BUILDDIR/scylla-package
 tar -C $BUILDDIR/scylla-package -xpf $RELOC_PKG
 cd $BUILDDIR/scylla-package

-PRODUCT=$(cat scylla/SCYLLA-PRODUCT-FILE)
-SCYLLA_VERSION=$(cat scylla/SCYLLA-VERSION-FILE)
-SCYLLA_RELEASE=$(cat scylla/SCYLLA-RELEASE-FILE)
-
-ln -fv $RELOC_PKG ../$PRODUCT-server_$SCYLLA_VERSION-$SCYLLA_RELEASE.orig.tar.gz
-
 if $DIST; then
    export DEB_BUILD_OPTIONS="housekeeping"
 fi

 mv scylla/debian debian
+
+PKG_NAME=$(dpkg-parsechangelog --show-field Source)
+# XXX: Drop the revision number from the version string.
+#      Since it is always '1', this should be okay for now.
+PKG_VERSION=$(dpkg-parsechangelog --show-field Version |sed -e 's/-1$//')
+ln -fv $RELOC_PKG ../"$PKG_NAME"_"$PKG_VERSION".orig.tar.gz
 debuild -rfakeroot -us -uc
--- a/repair/repair.cc
+++ b/repair/repair.cc
@@ -1633,6 +1633,7 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, locator::token_me
            auto& ks = db.local().find_keyspace(keyspace_name);
            auto& strat = ks.get_replication_strategy();
            dht::token_range_vector desired_ranges = strat.get_pending_address_ranges(tm, tokens, myip);
+            bool find_node_in_local_dc_only = strat.get_type() == locator::replication_strategy_type::network_topology;

            //Active ranges
            auto metadata_clone = tm.clone_only_token_map();
@@ -1719,6 +1720,9 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, locator::token_me
                            mandatory_neighbors = get_node_losing_the_ranges(old_endpoints, new_endpoints);
                            neighbors = mandatory_neighbors;
                        } else if (old_endpoints.size() < strat.get_replication_factor()) {
+                          if (!find_node_in_local_dc_only) {
+                            neighbors = old_endpoints;
+                          } else {
                            if (old_endpoints_in_local_dc.size() == rf_in_local_dc) {
                                // Local DC has enough replica nodes.
                                mandatory_neighbors = get_node_losing_the_ranges(old_endpoints_in_local_dc, new_endpoints);
@@ -1746,6 +1750,7 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, locator::token_me
                                throw std::runtime_error(format("bootstrap_with_repair: keyspace={}, range={}, wrong number of old_endpoints_in_local_dc={}, rf_in_local_dc={}",
                                        keyspace_name, desired_range, old_endpoints_in_local_dc.size(), rf_in_local_dc));
                            }
+                          }
                        } else {
                            throw std::runtime_error(format("bootstrap_with_repair: keyspace={}, range={}, wrong number of old_endpoints={}, rf={}",
                                        keyspace_name, desired_range, old_endpoints, strat.get_replication_factor()));
--- a/repair/repair.hh
+++ b/repair/repair.hh
@@ -23,6 +23,7 @@

 #include <unordered_map>
 #include <exception>
+#include <absl/container/btree_set.h>

 #include <seastar/core/sstring.hh>
 #include <seastar/core/sharded.hh>
@@ -339,6 +340,8 @@ public:
    }
 };

+using repair_hash_set = absl::btree_set<repair_hash>;
+
 enum class repair_row_level_start_status: uint8_t {
    ok,
    no_such_column_family,
--- a/repair/row_level.cc
+++ b/repair/row_level.cc
@@ -47,6 +47,7 @@
 #include "gms/gossiper.hh"
 #include "repair/row_level.hh"
 #include "mutation_source_metadata.hh"
+#include "utils/stall_free.hh"

 extern logging::logger rlogger;

@@ -529,7 +530,7 @@ public:
                    sstables::shared_sstable sst = use_view_update_path ? t->make_streaming_staging_sstable() : t->make_streaming_sstable_for_write();
                    schema_ptr s = reader.schema();
                    auto& pc = service::get_local_streaming_priority();
-                    return sst->write_components(std::move(reader), std::max(1ul, adjusted_estimated_partitions), s,
+                    return sst->write_components(std::move(reader), adjusted_estimated_partitions, s,
                                                 t->get_sstables_manager().configure_writer(),
                                                 encoding_stats{}, pc).then([sst] {
                        return sst->open_data();
@@ -666,7 +667,7 @@ private:
    // Tracks current sync boundary
    std::optional<repair_sync_boundary> _current_sync_boundary;
    // Contains the hashes of rows in the _working_row_buffor for all peer nodes
-    std::vector<std::unordered_set<repair_hash>> _peer_row_hash_sets;
+    std::vector<repair_hash_set> _peer_row_hash_sets;
    // Gate used to make sure pending operation of meta data is done
    seastar::gate _gate;
    sink_source_for_get_full_row_hashes _sink_source_for_get_full_row_hashes;
@@ -754,11 +755,12 @@ public:
 public:
    future<> stop() {
        auto gate_future = _gate.close();
-        auto writer_future = _repair_writer.wait_for_writer_done();
        auto f1 = _sink_source_for_get_full_row_hashes.close();
        auto f2 = _sink_source_for_get_row_diff.close();
        auto f3 = _sink_source_for_put_row_diff.close();
-        return when_all_succeed(std::move(gate_future), std::move(writer_future), std::move(f1), std::move(f2), std::move(f3)).discard_result();
+        return when_all_succeed(std::move(gate_future), std::move(f1), std::move(f2), std::move(f3)).discard_result().finally([this] {
+            return _repair_writer.wait_for_writer_done(); // waiting for the writer starts only after the gate and the three sink/source closes have resolved; finally() runs it whether they succeeded or failed
+        });
    }

    static std::unordered_map<node_repair_meta_id, lw_shared_ptr<repair_meta>>& repair_meta_map() {
@@ -886,9 +888,9 @@ public:
    }

    // Must run inside a seastar thread
-    static std::unordered_set<repair_hash>
-    get_set_diff(const std::unordered_set<repair_hash>& x, const std::unordered_set<repair_hash>& y) {
-        std::unordered_set<repair_hash> set_diff;
+    static repair_hash_set
+    get_set_diff(const repair_hash_set& x, const repair_hash_set& y) {
+        repair_hash_set set_diff;
        // Note std::set_difference needs x and y are sorted.
        std::copy_if(x.begin(), x.end(), std::inserter(set_diff, set_diff.end()),
                [&y] (auto& item) { thread::maybe_yield(); return y.find(item) == y.end(); });
@@ -906,14 +908,14 @@ public:

    }

-    std::unordered_set<repair_hash>& peer_row_hash_sets(unsigned node_idx) {
+    repair_hash_set& peer_row_hash_sets(unsigned node_idx) {
        return _peer_row_hash_sets[node_idx];
    }

    // Get a list of row hashes in _working_row_buf
-    future<std::unordered_set<repair_hash>>
+    future<repair_hash_set>
    working_row_hashes() {
-        return do_with(std::unordered_set<repair_hash>(), [this] (std::unordered_set<repair_hash>& hashes) {
+        return do_with(repair_hash_set(), [this] (repair_hash_set& hashes) {
            return do_for_each(_working_row_buf, [&hashes] (repair_row& r) {
                hashes.emplace(r.hash());
            }).then([&hashes] {
@@ -1090,24 +1092,32 @@ private:
        });
    }

+    future<> clear_row_buf() { // empties _row_buf through utils::clear_gently(); NOTE(review): presumably yields between elements to avoid reactor stalls - confirm in utils/stall_free.hh
+        return utils::clear_gently(_row_buf);
+    }
+
+    future<> clear_working_row_buf() { // empties _working_row_buf through utils::clear_gently(), then resets the buffer's combined hash
+        return utils::clear_gently(_working_row_buf).then([this] {
+            _working_row_buf_combined_hash.clear(); // runs only once the clear_gently() future has resolved
+        });
+    }
+
    // Read rows from disk until _max_row_buf_size of rows are filled into _row_buf.
    // Calculate the combined checksum of the rows
    // Calculate the total size of the rows in _row_buf
    future<get_sync_boundary_response>
    get_sync_boundary(std::optional<repair_sync_boundary> skipped_sync_boundary) {
+        auto f = make_ready_future<>();
        if (skipped_sync_boundary) {
            _current_sync_boundary = skipped_sync_boundary;
-            _row_buf.clear();
-            _working_row_buf.clear();
-            _working_row_buf_combined_hash.clear();
-        } else {
-            _working_row_buf.clear();
-            _working_row_buf_combined_hash.clear();
+            f = clear_row_buf();
        }
        // Here is the place we update _last_sync_boundary
        rlogger.trace("SET _last_sync_boundary from {} to {}", _last_sync_boundary, _current_sync_boundary);
        _last_sync_boundary = _current_sync_boundary;
-        return row_buf_size().then([this, sb = std::move(skipped_sync_boundary)] (size_t cur_size) {
+      return f.then([this, sb = std::move(skipped_sync_boundary)] () mutable {
+       return clear_working_row_buf().then([this, sb = sb] () mutable {
+        return row_buf_size().then([this, sb = std::move(sb)] (size_t cur_size) {
            return read_rows_from_disk(cur_size).then([this, sb = std::move(sb)] (std::list<repair_row> new_rows, size_t new_rows_size) mutable {
                size_t new_rows_nr = new_rows.size();
                _row_buf.splice(_row_buf.end(), new_rows);
@@ -1124,6 +1134,8 @@ private:
                });
            });
        });
+       });
+      });
    }

    future<> move_row_buf_to_working_row_buf() {
@@ -1199,9 +1211,9 @@ private:
    }

    future<std::list<repair_row>>
-    copy_rows_from_working_row_buf_within_set_diff(std::unordered_set<repair_hash> set_diff) {
+    copy_rows_from_working_row_buf_within_set_diff(repair_hash_set set_diff) {
        return do_with(std::list<repair_row>(), std::move(set_diff),
-                [this] (std::list<repair_row>& rows, std::unordered_set<repair_hash>& set_diff) {
+                [this] (std::list<repair_row>& rows, repair_hash_set& set_diff) {
            return do_for_each(_working_row_buf, [this, &set_diff, &rows] (const repair_row& r) {
                if (set_diff.count(r.hash()) > 0) {
                    rows.push_back(r);
@@ -1216,7 +1228,7 @@ private:
    // Give a set of row hashes, return the corresponding rows
    // If needs_all_rows is set, return all the rows in _working_row_buf, ignore the set_diff
    future<std::list<repair_row>>
-    get_row_diff(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows = needs_all_rows_t::no) {
+    get_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows = needs_all_rows_t::no) {
        if (needs_all_rows) {
            if (!_repair_master || _nr_peer_nodes == 1) {
                return make_ready_future<std::list<repair_row>>(std::move(_working_row_buf));
@@ -1227,19 +1239,28 @@ private:
        }
    }

-    future<> do_apply_rows(std::list<repair_row>& row_diff, unsigned node_idx, update_working_row_buf update_buf) {
-        return with_semaphore(_repair_writer.sem(), 1, [this, node_idx, update_buf, &row_diff] {
-            _repair_writer.create_writer(_db, node_idx);
-            return do_for_each(row_diff, [this, node_idx, update_buf] (repair_row& r) {
-                if (update_buf) {
-                    _working_row_buf_combined_hash.add(r.hash());
-                }
-                // The repair_row here is supposed to have
-                // mutation_fragment attached because we have stored it in
-                // to_repair_rows_list above where the repair_row is created.
-                mutation_fragment mf = std::move(r.get_mutation_fragment());
-                auto dk_with_hash = r.get_dk_with_hash();
-                return _repair_writer.do_write(node_idx, std::move(dk_with_hash), std::move(mf));
+    future<> do_apply_rows(std::list<repair_row>&& row_diff, unsigned node_idx, update_working_row_buf update_buf) { // takes ownership of row_diff and writes its rows, front to back, through _repair_writer for node_idx
+        return do_with(std::move(row_diff), [this, node_idx, update_buf] (std::list<repair_row>& row_diff) { // do_with keeps the moved-in list alive until the returned future resolves
+            return with_semaphore(_repair_writer.sem(), 1, [this, node_idx, update_buf, &row_diff] { // one unit of the writer's semaphore is held for the whole list
+                _repair_writer.create_writer(_db, node_idx);
+                return repeat([this, node_idx, update_buf, &row_diff] () mutable {
+                    if (row_diff.empty()) {
+                        return make_ready_future<stop_iteration>(stop_iteration::yes);
+                    }
+                    repair_row& r = row_diff.front();
+                    if (update_buf) {
+                        _working_row_buf_combined_hash.add(r.hash());
+                    }
+                    // The repair_row here is supposed to have
+                    // mutation_fragment attached because we have stored it in
+                    // to_repair_rows_list above where the repair_row is created.
+                    mutation_fragment mf = std::move(r.get_mutation_fragment());
+                    auto dk_with_hash = r.get_dk_with_hash();
+                    return _repair_writer.do_write(node_idx, std::move(dk_with_hash), std::move(mf)).then([&row_diff] {
+                        row_diff.pop_front(); // drop each row as soon as its write has completed, so the list shrinks as the loop advances
+                        return make_ready_future<stop_iteration>(stop_iteration::no);
+                    });
+                });
+            });
+        });
+    }
@@ -1257,19 +1278,17 @@ private:
        stats().rx_row_nr += row_diff.size();
        stats().rx_row_nr_peer[from] += row_diff.size();
        if (update_buf) {
-            std::list<repair_row> tmp;
-            tmp.swap(_working_row_buf);
            // Both row_diff and _working_row_buf and are ordered, merging
            // two sored list to make sure the combination of row_diff
            // and _working_row_buf are ordered.
-            std::merge(tmp.begin(), tmp.end(), row_diff.begin(), row_diff.end(), std::back_inserter(_working_row_buf),
-                [this] (const repair_row& x, const repair_row& y) { thread::maybe_yield(); return _cmp(x.boundary(), y.boundary()) < 0; });
+            utils::merge_to_gently(_working_row_buf, row_diff,
+                 [this] (const repair_row& x, const repair_row& y) { return _cmp(x.boundary(), y.boundary()) < 0; });
        }
        if (update_hash_set) {
-            _peer_row_hash_sets[node_idx] = boost::copy_range<std::unordered_set<repair_hash>>(row_diff |
+            _peer_row_hash_sets[node_idx] = boost::copy_range<repair_hash_set>(row_diff |
                    boost::adaptors::transformed([] (repair_row& r) { thread::maybe_yield(); return r.hash(); }));
        }
-        do_apply_rows(row_diff, node_idx, update_buf).get();
+        do_apply_rows(std::move(row_diff), node_idx, update_buf).get();
    }

    future<>
@@ -1277,11 +1296,9 @@ private:
        if (rows.empty()) {
            return make_ready_future<>();
        }
-        return to_repair_rows_list(rows).then([this] (std::list<repair_row> row_diff) {
-            return do_with(std::move(row_diff), [this] (std::list<repair_row>& row_diff) {
-                unsigned node_idx = 0;
-                return do_apply_rows(row_diff, node_idx, update_working_row_buf::no);
-            });
+        return to_repair_rows_list(std::move(rows)).then([this] (std::list<repair_row> row_diff) {
+            unsigned node_idx = 0;
+            return do_apply_rows(std::move(row_diff), node_idx, update_working_row_buf::no);
        });
    }

@@ -1360,13 +1377,13 @@ private:
 public:
    // RPC API
    // Return the hashes of the rows in _working_row_buf
-    future<std::unordered_set<repair_hash>>
+    future<repair_hash_set>
    get_full_row_hashes(gms::inet_address remote_node) {
        if (remote_node == _myip) {
            return get_full_row_hashes_handler();
        }
        return netw::get_local_messaging_service().send_repair_get_full_row_hashes(msg_addr(remote_node),
-                _repair_meta_id).then([this, remote_node] (std::unordered_set<repair_hash> hashes) {
+                _repair_meta_id).then([this, remote_node] (repair_hash_set hashes) {
            rlogger.debug("Got full hashes from peer={}, nr_hashes={}", remote_node, hashes.size());
            _metrics.rx_hashes_nr += hashes.size();
            stats().rx_hashes_nr += hashes.size();
@@ -1377,7 +1394,7 @@ public:

 private:
    future<> get_full_row_hashes_source_op(
-            lw_shared_ptr<std::unordered_set<repair_hash>> current_hashes,
+            lw_shared_ptr<repair_hash_set> current_hashes,
            gms::inet_address remote_node,
            unsigned node_idx,
            rpc::source<repair_hash_with_cmd>& source) {
@@ -1415,12 +1432,12 @@ private:
    }

 public:
-    future<std::unordered_set<repair_hash>>
+    future<repair_hash_set>
    get_full_row_hashes_with_rpc_stream(gms::inet_address remote_node, unsigned node_idx) {
        if (remote_node == _myip) {
            return get_full_row_hashes_handler();
        }
-        auto current_hashes = make_lw_shared<std::unordered_set<repair_hash>>();
+        auto current_hashes = make_lw_shared<repair_hash_set>();
        return _sink_source_for_get_full_row_hashes.get_sink_source(remote_node, node_idx).then(
                [this, current_hashes, remote_node, node_idx]
                (rpc::sink<repair_stream_cmd>& sink, rpc::source<repair_hash_with_cmd>& source) mutable {
@@ -1435,7 +1452,7 @@ public:
    }

    // RPC handler
-    future<std::unordered_set<repair_hash>>
+    future<repair_hash_set>
    get_full_row_hashes_handler() {
        return with_gate(_gate, [this] {
            return working_row_hashes();
@@ -1585,7 +1602,7 @@ public:
    // RPC API
    // Return rows in the _working_row_buf with hash within the given sef_diff
    // Must run inside a seastar thread
-    void get_row_diff(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node, unsigned node_idx) {
+    void get_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node, unsigned node_idx) {
        if (needs_all_rows || !set_diff.empty()) {
            if (remote_node == _myip) {
                return;
@@ -1654,11 +1671,11 @@ private:
    }

    future<> get_row_diff_sink_op(
-            std::unordered_set<repair_hash> set_diff,
+            repair_hash_set set_diff,
            needs_all_rows_t needs_all_rows,
            rpc::sink<repair_hash_with_cmd>& sink,
            gms::inet_address remote_node) {
-        return do_with(std::move(set_diff), [needs_all_rows, remote_node, &sink] (std::unordered_set<repair_hash>& set_diff) mutable {
+        return do_with(std::move(set_diff), [needs_all_rows, remote_node, &sink] (repair_hash_set& set_diff) mutable {
            if (inject_rpc_stream_error) {
                return make_exception_future<>(std::runtime_error("get_row_diff: Inject sender error in sink loop"));
            }
@@ -1685,7 +1702,7 @@ private:
 public:
    // Must run inside a seastar thread
    void get_row_diff_with_rpc_stream(
-            std::unordered_set<repair_hash> set_diff,
+            repair_hash_set set_diff,
            needs_all_rows_t needs_all_rows,
            update_peer_row_hash_sets update_hash_set,
            gms::inet_address remote_node,
@@ -1711,7 +1728,7 @@ public:
    }

    // RPC handler
-    future<repair_rows_on_wire> get_row_diff_handler(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows) {
+    future<repair_rows_on_wire> get_row_diff_handler(repair_hash_set set_diff, needs_all_rows_t needs_all_rows) {
        return with_gate(_gate, [this, set_diff = std::move(set_diff), needs_all_rows] () mutable {
            return get_row_diff(std::move(set_diff), needs_all_rows).then([this] (std::list<repair_row> row_diff) {
                return to_repair_rows_on_wire(std::move(row_diff));
@@ -1721,15 +1738,16 @@ public:

    // RPC API
    // Send rows in the _working_row_buf with hash within the given sef_diff
-    future<> put_row_diff(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node) {
+    future<> put_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node) {
        if (!set_diff.empty()) {
            if (remote_node == _myip) {
                return make_ready_future<>();
            }
-            auto sz = set_diff.size();
+            size_t sz = set_diff.size();
            return get_row_diff(std::move(set_diff), needs_all_rows).then([this, remote_node, sz] (std::list<repair_row> row_diff) {
                if (row_diff.size() != sz) {
-                    throw std::runtime_error("row_diff.size() != set_diff.size()");
+                    rlogger.warn("Hash conflict detected, keyspace={}, table={}, range={}, row_diff.size={}, set_diff.size={}. It is recommended to compact the table and rerun repair for the range.",
+                            _schema->ks_name(), _schema->cf_name(), _range, row_diff.size(), sz);
                }
                return do_with(std::move(row_diff), [this, remote_node] (std::list<repair_row>& row_diff) {
                    return get_repair_rows_size(row_diff).then([this, remote_node, &row_diff] (size_t row_bytes) mutable {
@@ -1796,17 +1814,18 @@ private:

 public:
    future<> put_row_diff_with_rpc_stream(
-            std::unordered_set<repair_hash> set_diff,
+            repair_hash_set set_diff,
            needs_all_rows_t needs_all_rows,
            gms::inet_address remote_node, unsigned node_idx) {
        if (!set_diff.empty()) {
            if (remote_node == _myip) {
                return make_ready_future<>();
            }
-            auto sz = set_diff.size();
+            size_t sz = set_diff.size();
            return get_row_diff(std::move(set_diff), needs_all_rows).then([this, remote_node, node_idx, sz] (std::list<repair_row> row_diff) {
                if (row_diff.size() != sz) {
-                    throw std::runtime_error("row_diff.size() != set_diff.size()");
+                    rlogger.warn("Hash conflict detected, keyspace={}, table={}, range={}, row_diff.size={}, set_diff.size={}. It is recommended to compact the table and rerun repair for the range.",
+                            _schema->ks_name(), _schema->cf_name(), _range, row_diff.size(), sz);
                }
                return do_with(std::move(row_diff), [this, remote_node, node_idx] (std::list<repair_row>& row_diff) {
                    return get_repair_rows_size(row_diff).then([this, remote_node, node_idx, &row_diff] (size_t row_bytes) mutable {
@@ -1845,7 +1864,7 @@ static future<stop_iteration> repair_get_row_diff_with_rpc_stream_process_op(
        rpc::sink<repair_row_on_wire_with_cmd> sink,
        rpc::source<repair_hash_with_cmd> source,
        bool &error,
-        std::unordered_set<repair_hash>& current_set_diff,
+        repair_hash_set& current_set_diff,
        std::optional<std::tuple<repair_hash_with_cmd>> hash_cmd_opt) {
    repair_hash_with_cmd hash_cmd = std::get<0>(hash_cmd_opt.value());
    rlogger.trace("Got repair_hash_with_cmd from peer={}, hash={}, cmd={}", from, hash_cmd.hash, int(hash_cmd.cmd));
@@ -1858,7 +1877,7 @@ static future<stop_iteration> repair_get_row_diff_with_rpc_stream_process_op(
        }
        bool needs_all_rows = hash_cmd.cmd == repair_stream_cmd::needs_all_rows;
        _metrics.rx_hashes_nr += current_set_diff.size();
-        auto fp = make_foreign(std::make_unique<std::unordered_set<repair_hash>>(std::move(current_set_diff)));
+        auto fp = make_foreign(std::make_unique<repair_hash_set>(std::move(current_set_diff)));
        return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id, needs_all_rows, fp = std::move(fp)] {
            auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
            if (fp.get_owner_shard() == this_shard_id()) {
@@ -1936,12 +1955,12 @@ static future<stop_iteration> repair_get_full_row_hashes_with_rpc_stream_process
    if (status == repair_stream_cmd::get_full_row_hashes) {
        return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id] {
            auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
-            return rm->get_full_row_hashes_handler().then([] (std::unordered_set<repair_hash> hashes) {
+            return rm->get_full_row_hashes_handler().then([] (repair_hash_set hashes) {
                _metrics.tx_hashes_nr += hashes.size();
                return hashes;
            });
-        }).then([sink] (std::unordered_set<repair_hash> hashes) mutable {
-            return do_with(std::move(hashes), [sink] (std::unordered_set<repair_hash>& hashes) mutable {
+        }).then([sink] (repair_hash_set hashes) mutable {
+            return do_with(std::move(hashes), [sink] (repair_hash_set& hashes) mutable {
                return do_for_each(hashes, [sink] (const repair_hash& hash) mutable {
                    return sink(repair_hash_with_cmd{repair_stream_cmd::hash_data, hash});
                }).then([sink] () mutable {
@@ -1964,7 +1983,7 @@ static future<> repair_get_row_diff_with_rpc_stream_handler(
        uint32_t repair_meta_id,
        rpc::sink<repair_row_on_wire_with_cmd> sink,
        rpc::source<repair_hash_with_cmd> source) {
-    return do_with(false, std::unordered_set<repair_hash>(), [from, src_cpu_id, repair_meta_id, sink, source] (bool& error, std::unordered_set<repair_hash>& current_set_diff) mutable {
+    return do_with(false, repair_hash_set(), [from, src_cpu_id, repair_meta_id, sink, source] (bool& error, repair_hash_set& current_set_diff) mutable {
        return repeat([from, src_cpu_id, repair_meta_id, sink, source, &error, &current_set_diff] () mutable {
            return source().then([from, src_cpu_id, repair_meta_id, sink, source, &error, &current_set_diff] (std::optional<std::tuple<repair_hash_with_cmd>> hash_cmd_opt) mutable {
                if (hash_cmd_opt) {
@@ -2107,7 +2126,7 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
            auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
            return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id] {
                auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
-                return rm->get_full_row_hashes_handler().then([] (std::unordered_set<repair_hash> hashes) {
+                return rm->get_full_row_hashes_handler().then([] (repair_hash_set hashes) {
                    _metrics.tx_hashes_nr += hashes.size();
                    return hashes;
                });
@@ -2135,11 +2154,11 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
            });
        });
        ms.register_repair_get_row_diff([] (const rpc::client_info& cinfo, uint32_t repair_meta_id,
-                std::unordered_set<repair_hash> set_diff, bool needs_all_rows) {
+                repair_hash_set set_diff, bool needs_all_rows) {
            auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
            auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
            _metrics.rx_hashes_nr += set_diff.size();
-            auto fp = make_foreign(std::make_unique<std::unordered_set<repair_hash>>(std::move(set_diff)));
+            auto fp = make_foreign(std::make_unique<repair_hash_set>(std::move(set_diff)));
            return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id, fp = std::move(fp), needs_all_rows] () mutable {
                auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
                if (fp.get_owner_shard() == this_shard_id()) {
@@ -2207,6 +2226,25 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
    });
 }

+future<> repair_uninit_messaging_service_handler() { // Unregisters the repair RPC handlers listed below.
+    return netw::get_messaging_service().invoke_on_all([] (auto& ms) { // runs the lambda via invoke_on_all, once per messaging_service instance
+        return when_all_succeed( // combines all unregister futures into a single future
+            ms.unregister_repair_get_row_diff_with_rpc_stream(),
+            ms.unregister_repair_put_row_diff_with_rpc_stream(),
+            ms.unregister_repair_get_full_row_hashes_with_rpc_stream(),
+            ms.unregister_repair_get_full_row_hashes(),
+            ms.unregister_repair_get_combined_row_hash(),
+            ms.unregister_repair_get_sync_boundary(),
+            ms.unregister_repair_get_row_diff(),
+            ms.unregister_repair_put_row_diff(),
+            ms.unregister_repair_row_level_start(),
+            ms.unregister_repair_row_level_stop(),
+            ms.unregister_repair_get_estimated_partitions(),
+            ms.unregister_repair_set_estimated_partitions(),
+            ms.unregister_repair_get_diff_algorithms()).discard_result(); // drop the combined value so the lambda yields future<>
+    });
+}
+
 class row_level_repair {
    repair_info& _ri;
    sstring _cf_name;
@@ -2439,7 +2477,7 @@ private:
            // sequentially because the rows from repair follower 1 to
            // repair master might reduce the amount of missing data
            // between repair master and repair follower 2.
-            std::unordered_set<repair_hash> set_diff = repair_meta::get_set_diff(master.peer_row_hash_sets(node_idx), master.working_row_hashes().get0());
+            repair_hash_set set_diff = repair_meta::get_set_diff(master.peer_row_hash_sets(node_idx), master.working_row_hashes().get0());
            // Request missing sets from peer node
            rlogger.debug("Before get_row_diff to node {}, local={}, peer={}, set_diff={}",
                    node, master.working_row_hashes().get0().size(), master.peer_row_hash_sets(node_idx).size(), set_diff.size());
@@ -2462,9 +2500,9 @@ private:
        // So we can figure out which rows peer node are missing and send the missing rows to them
        check_in_shutdown();
        _ri.check_in_abort();
-        std::unordered_set<repair_hash> local_row_hash_sets = master.working_row_hashes().get0();
+        repair_hash_set local_row_hash_sets = master.working_row_hashes().get0();
        auto sz = _all_live_peer_nodes.size();
-        std::vector<std::unordered_set<repair_hash>> set_diffs(sz);
+        std::vector<repair_hash_set> set_diffs(sz);
        for (size_t idx : boost::irange(size_t(0), sz)) {
            set_diffs[idx] = repair_meta::get_set_diff(local_row_hash_sets, master.peer_row_hash_sets(idx));
        }
--- a/repair/row_level.hh
+++ b/repair/row_level.hh
@@ -45,6 +45,7 @@ private:
 };

 future<> repair_init_messaging_service_handler(repair_service& rs, distributed<db::system_distributed_keyspace>& sys_dist_ks, distributed<db::view::view_update_generator>& view_update_generator);
+future<> repair_uninit_messaging_service_handler();

 class repair_info;

--- a/schema.cc
+++ b/schema.cc
@@ -19,6 +19,7 @@
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

+#include <seastar/core/on_internal_error.hh>
 #include <map>
 #include "utils/UUID_gen.hh"
 #include "cql3/column_identifier.hh"
@@ -43,6 +44,8 @@

 constexpr int32_t schema::NAME_LENGTH;

+extern logging::logger dblog;
+
 sstring to_sstring(column_kind k) {
    switch (k) {
    case column_kind::partition_key:  return "PARTITION_KEY";
@@ -592,11 +595,15 @@ schema::get_column_definition(const bytes& name) const {

 const column_definition&
 schema::column_at(column_kind kind, column_id id) const {
-    return _raw._columns.at(column_offset(kind) + id);
+    return column_at(static_cast<ordinal_column_id>(column_offset(kind) + id));
 }

 const column_definition&
 schema::column_at(ordinal_column_id ordinal_id) const {
+    if (size_t(ordinal_id) >= _raw._columns.size()) [[unlikely]] {
+        on_internal_error(dblog, format("{}.{}@{}: column id {:d} >= {:d}",
+            ks_name(), cf_name(), version(), size_t(ordinal_id), _raw._columns.size()));
+    }
    return _raw._columns.at(static_cast<column_count_type>(ordinal_id));
 }

--- a/scripts/create-relocatable-package.py
+++ b/scripts/create-relocatable-package.py
@@ -92,7 +92,8 @@ executables = ['build/{}/scylla'.format(args.mode),
               '/usr/sbin/ethtool',
               '/usr/bin/netstat',
               '/usr/bin/hwloc-distrib',
-               '/usr/bin/hwloc-calc']
+               '/usr/bin/hwloc-calc',
+               '/usr/bin/lsblk']

 output = args.dest

--- a/scylla-gdb.py
+++ b/scylla-gdb.py
@@ -597,7 +597,7 @@ def current_shard():


 def find_db(shard=None):
-    if not shard:
+    if shard is None:
        shard = current_shard()
    return gdb.parse_and_eval('::debug::db')['_instances']['_M_impl']['_M_start'][shard]['service']['_p']

--- a/scylla_post_install.sh
+++ b/scylla_post_install.sh
@@ -63,6 +63,17 @@ MemoryHigh=1200M
 MemoryMax=1400M
 MemoryLimit=1400M
 EOS
+
+# On CentOS7, systemd does not support percentage-based parameters.
+# To apply the memory parameter on CentOS7, we need to override it
+# in bytes instead of as a percentage.
+elif [ "$RHEL" -a "$VERSION_ID" = "7" ]; then
+    MEMORY_LIMIT=$((MEMTOTAL_BYTES / 100 * 5))
+    mkdir -p /etc/systemd/system/scylla-helper.slice.d/
+    cat << EOS > /etc/systemd/system/scylla-helper.slice.d/memory.conf
+[Slice]
+MemoryLimit=$MEMORY_LIMIT
+EOS
 fi

 systemctl --system daemon-reload >/dev/null || true
--- a/2
+++ b/2
--- a/serializer_impl.hh
+++ b/serializer_impl.hh
@@ -25,6 +25,7 @@
 #include <seastar/util/bool_class.hh>
 #include <boost/range/algorithm/for_each.hpp>
 #include "utils/small_vector.hh"
+#include <absl/container/btree_set.h>

 namespace ser {

@@ -81,6 +82,17 @@ static inline void serialize_array(Output& out, const Container& v) {
 template<typename Container>
 struct container_traits;

+template<typename T>
+struct container_traits<absl::btree_set<T>> { // container_traits specialization for absl::btree_set
+    struct back_emplacer { // functor that inserts moved-in elements into the referenced set
+        absl::btree_set<T>& c; // target set, held by reference; it must outlive the emplacer
+        back_emplacer(absl::btree_set<T>& c_) : c(c_) {}
+        void operator()(T&& v) {
+            c.emplace(std::move(v)); // a set has no "back": emplace() lets the set choose the position
+        }
+    };
+};
+
 template<typename T>
 struct container_traits<std::unordered_set<T>> {
    struct back_emplacer {
@@ -253,6 +265,27 @@ struct serializer<std::list<T>> {
    }
 };

+template<typename T>
+struct serializer<absl::btree_set<T>> { // serializer specialization: a uint32 size prefix, then the elements via the array helpers
+    template<typename Input>
+    static absl::btree_set<T> read(Input& in) {
+        auto sz = deserialize(in, boost::type<uint32_t>()); // element count prefix
+        absl::btree_set<T> v;
+        deserialize_array_helper<false, T>::doit(in, v, sz); // presumably fills v with sz elements (helper is defined outside this hunk)
+        return v;
+    }
+    template<typename Output>
+    static void write(Output& out, const absl::btree_set<T>& v) {
+        safe_serialize_as_uint32(out, v.size()); // element count prefix, written as uint32
+        serialize_array_helper<false, T>::doit(out, v);
+    }
+    template<typename Input>
+    static void skip(Input& in) {
+        auto sz = deserialize(in, boost::type<uint32_t>()); // element count prefix
+        skip_array<T>(in, sz); // skips the elements without constructing a set
+    }
+};
+
 template<typename T>
 struct serializer<std::unordered_set<T>> {
    template<typename Input>
--- a/service/migration_manager.cc
+++ b/service/migration_manager.cc
@@ -92,7 +92,7 @@ void migration_manager::init_messaging_service()
        //FIXME: future discarded.
        (void)with_gate(_background_tasks, [this] {
            mlogger.debug("features changed, recalculating schema version");
-            return update_schema_version_and_announce(get_storage_proxy(), _feat.cluster_schema_features());
+            return db::schema_tables::recalculate_schema_version(get_storage_proxy(), _feat);
        });
    };

@@ -1104,6 +1104,20 @@ future<schema_ptr> get_schema_definition(table_schema_version v, netw::messaging
        mlogger.debug("Requesting schema {} from {}", v, dst);
        auto& ms = netw::get_local_messaging_service();
        return ms.send_get_schema_version(dst, v);
+    }).then([] (schema_ptr s) {
+        // If this schema is a view, it also needs a reference to its base
+        // table.
+        if (s->is_view()) {
+            if (!s->view_info()->base_info()) {
+                auto& db = service::get_local_storage_proxy().get_db().local();
+                // This line might throw a no_such_column_family
+                // It should be fine since if we tried to register a view for which
+                // we don't know the base table, our registry is broken.
+                schema_ptr base_schema = db.find_schema(s->view_info()->base_id());
+                s->view_info()->set_base_info(s->view_info()->make_base_dependent_view_info(*base_schema));
+            }
+        }
+        return s;
    });
 }

--- a/service/storage_proxy.cc
+++ b/service/storage_proxy.cc
@@ -120,9 +120,11 @@ using fbu = utils::fb_utilities;

 static inline
 query::digest_algorithm digest_algorithm(service::storage_proxy& proxy) {
-    return proxy.features().cluster_supports_xxhash_digest_algorithm()
-         ? query::digest_algorithm::xxHash
-         : query::digest_algorithm::MD5;
+    return proxy.features().cluster_supports_digest_for_null_values()
+            ? query::digest_algorithm::xxHash
+            : proxy.features().cluster_supports_xxhash_digest_algorithm()
+                    ? query::digest_algorithm::legacy_xxHash_without_null_digest
+                    : query::digest_algorithm::MD5;
 }

 static inline
@@ -1760,6 +1762,7 @@ storage_proxy::storage_proxy(distributed<database>& db, storage_proxy::config cf
    , _token_metadata(tm)
    , _read_smp_service_group(cfg.read_smp_service_group)
    , _write_smp_service_group(cfg.write_smp_service_group)
+    , _hints_write_smp_service_group(cfg.hints_write_smp_service_group)
    , _write_ack_smp_service_group(cfg.write_ack_smp_service_group)
    , _next_response_id(std::chrono::system_clock::now().time_since_epoch()/1ms)
    , _hints_resource_manager(cfg.available_memory / 10)
@@ -1803,39 +1806,48 @@ storage_proxy::response_id_type storage_proxy::unique_response_handler::release(
 }

 future<>
-storage_proxy::mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, clock_type::time_point timeout) {
+storage_proxy::mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout, smp_service_group smp_grp) {
    auto shard = _db.local().shard_of(m);
    get_stats().replica_cross_shard_ops += shard != this_shard_id();
-    return _db.invoke_on(shard, {_write_smp_service_group, timeout},
-            [s = global_schema_ptr(m.schema()), m = freeze(m), gtr = tracing::global_trace_state_ptr(std::move(tr_state)), timeout] (database& db) mutable -> future<> {
-        return db.apply(s, m, gtr.get(), db::commitlog::force_sync::no, timeout);
+    return _db.invoke_on(shard, {smp_grp, timeout},
+            [s = global_schema_ptr(m.schema()),
+             m = freeze(m),
+             gtr = tracing::global_trace_state_ptr(std::move(tr_state)),
+             timeout,
+             sync] (database& db) mutable -> future<> {
+        return db.apply(s, m, gtr.get(), sync, timeout);
    });
 }

 future<>
-storage_proxy::mutate_locally(const schema_ptr& s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout) {
+storage_proxy::mutate_locally(const schema_ptr& s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout,
+        smp_service_group smp_grp) {
    auto shard = _db.local().shard_of(m);
    get_stats().replica_cross_shard_ops += shard != this_shard_id();
-    return _db.invoke_on(shard, {_write_smp_service_group, timeout},
+    return _db.invoke_on(shard, {smp_grp, timeout},
            [&m, gs = global_schema_ptr(s), gtr = tracing::global_trace_state_ptr(std::move(tr_state)), timeout, sync] (database& db) mutable -> future<> {
        return db.apply(gs, m, gtr.get(), sync, timeout);
    });
 }

 future<>
-storage_proxy::mutate_locally(std::vector<mutation> mutations, tracing::trace_state_ptr tr_state, clock_type::time_point timeout) {
-    return do_with(std::move(mutations), [this, timeout, tr_state = std::move(tr_state)] (std::vector<mutation>& pmut) mutable {
-        return parallel_for_each(pmut.begin(), pmut.end(), [this, tr_state = std::move(tr_state), timeout] (const mutation& m) mutable {
-            return mutate_locally(m, tr_state, timeout);
+storage_proxy::mutate_locally(std::vector<mutation> mutations, tracing::trace_state_ptr tr_state, clock_type::time_point timeout, smp_service_group smp_grp) {
+    return do_with(std::move(mutations), [this, timeout, tr_state = std::move(tr_state), smp_grp] (std::vector<mutation>& pmut) mutable {
+        return parallel_for_each(pmut.begin(), pmut.end(), [this, tr_state = std::move(tr_state), timeout, smp_grp] (const mutation& m) mutable {
+            return mutate_locally(m, tr_state, db::commitlog::force_sync::no, timeout, smp_grp);
        });
    });
 }

+future<>
+storage_proxy::mutate_locally(std::vector<mutation> mutation, tracing::trace_state_ptr tr_state, clock_type::time_point timeout) {
+    return mutate_locally(std::move(mutation), std::move(tr_state), timeout, _write_smp_service_group); // forwards to the smp_grp overload with _write_smp_service_group; by-value args are moved, not copied
+}
 future<>
 storage_proxy::mutate_hint(const schema_ptr& s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, clock_type::time_point timeout) {
    auto shard = _db.local().shard_of(m);
    get_stats().replica_cross_shard_ops += shard != this_shard_id();
-    return _db.invoke_on(shard, {_write_smp_service_group, timeout}, [&m, gs = global_schema_ptr(s), tr_state = std::move(tr_state), timeout] (database& db) mutable -> future<> {
+    return _db.invoke_on(shard, {_hints_write_smp_service_group, timeout}, [&m, gs = global_schema_ptr(s), tr_state = std::move(tr_state), timeout] (database& db) mutable -> future<> {
        return db.apply_hint(gs, m, std::move(tr_state), timeout);
    });
 }
@@ -4488,6 +4500,12 @@ future<bool> storage_proxy::cas(schema_ptr schema, shared_ptr<cas_request> reque
                                    paxos::paxos_state::logger.debug("CAS[{}] successful", handler->id());
                                    tracing::trace(handler->tr_state, "CAS successful");
                                    return std::optional<bool>(condition_met);
+                                }).handle_exception_type([handler] (unavailable_exception& e) {
+                                    // If the learning stage encountered an unavailability error, re-map it to a write timeout error,
+                                    // since an unavailable error means that the operation never started, which is not the case here.
+                                    schema_ptr schema = handler->schema();
+                                    return make_exception_future<std::optional<bool>>(mutation_write_timeout_exception(schema->ks_name(), schema->cf_name(),
+                                                               e.consistency, e.alive, e.required, db::write_type::CAS));
                                });
                            }
                            paxos::paxos_state::logger.debug("CAS[{}] PAXOS proposal not accepted (pre-empted by a higher ballot)",
@@ -4849,7 +4867,7 @@ void storage_proxy::init_messaging_service() {
        });
    };

-    auto receive_mutation_handler = [] (const rpc::client_info& cinfo, rpc::opt_time_point t, frozen_mutation in, std::vector<gms::inet_address> forward,
+    auto receive_mutation_handler = [] (smp_service_group smp_grp, const rpc::client_info& cinfo, rpc::opt_time_point t, frozen_mutation in, std::vector<gms::inet_address> forward,
            gms::inet_address reply_to, unsigned shard, storage_proxy::response_id_type response_id, rpc::optional<std::optional<tracing::trace_info>> trace_info) {
        tracing::trace_state_ptr trace_state_ptr;
        auto src_addr = netw::messaging_service::get_source(cinfo);
@@ -4857,9 +4875,9 @@ void storage_proxy::init_messaging_service() {
        utils::UUID schema_version = in.schema_version();
        return handle_write(src_addr, t, schema_version, std::move(in), std::move(forward), reply_to, shard, response_id,
                trace_info ? *trace_info : std::nullopt,
-                /* apply_fn */ [] (shared_ptr<storage_proxy>& p, tracing::trace_state_ptr tr_state, schema_ptr s, const frozen_mutation& m,
+                /* apply_fn */ [smp_grp] (shared_ptr<storage_proxy>& p, tracing::trace_state_ptr tr_state, schema_ptr s, const frozen_mutation& m,
                        clock_type::time_point timeout) {
-                    return p->mutate_locally(std::move(s), m, std::move(tr_state), db::commitlog::force_sync::no, timeout);
+                    return p->mutate_locally(std::move(s), m, std::move(tr_state), db::commitlog::force_sync::no, timeout, smp_grp);
                },
                /* forward_fn */ [] (netw::messaging_service::msg_addr addr, clock_type::time_point timeout, const frozen_mutation& m,
                        gms::inet_address reply_to, unsigned shard, response_id_type response_id,
@@ -4868,8 +4886,8 @@ void storage_proxy::init_messaging_service() {
                    return ms.send_mutation(addr, timeout, m, {}, reply_to, shard, response_id, std::move(trace_info));
                });
    };
-    ms.register_mutation(receive_mutation_handler);
-    ms.register_hint_mutation(receive_mutation_handler);
+    ms.register_mutation(std::bind_front<>(receive_mutation_handler, _write_smp_service_group));
+    ms.register_hint_mutation(std::bind_front<>(receive_mutation_handler, _hints_write_smp_service_group));

    ms.register_paxos_learn([] (const rpc::client_info& cinfo, rpc::opt_time_point t, paxos::proposal decision,
            std::vector<gms::inet_address> forward, gms::inet_address reply_to, unsigned shard,
@@ -5112,18 +5130,22 @@ void storage_proxy::init_messaging_service() {
 future<> storage_proxy::uninit_messaging_service() {
    auto& ms = netw::get_local_messaging_service();
    return when_all_succeed(
+        ms.unregister_counter_mutation(),
        ms.unregister_mutation(),
+        ms.unregister_hint_mutation(),
        ms.unregister_mutation_done(),
        ms.unregister_mutation_failed(),
        ms.unregister_read_data(),
        ms.unregister_read_mutation_data(),
        ms.unregister_read_digest(),
        ms.unregister_truncate(),
+        ms.unregister_get_schema_version(),
        ms.unregister_paxos_prepare(),
        ms.unregister_paxos_accept(),
        ms.unregister_paxos_learn(),
        ms.unregister_paxos_prune()
    ).discard_result();
+
 }

 future<rpc::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>>
@@ -5217,8 +5239,7 @@ future<> storage_proxy::drain_on_shutdown() {

 future<>
 storage_proxy::stop() {
-    // FIXME: hints manager should be stopped here but it seems like this function is never called
-    return uninit_messaging_service();
+    return make_ready_future<>();
 }

 }
--- a/service/storage_proxy.hh
+++ b/service/storage_proxy.hh
@@ -166,6 +166,7 @@ public:
        size_t available_memory;
        smp_service_group read_smp_service_group = default_smp_service_group();
        smp_service_group write_smp_service_group = default_smp_service_group();
+        smp_service_group hints_write_smp_service_group = default_smp_service_group();
        // Write acknowledgments might not be received on the correct shard, and
        // they need a separate smp_service_group to prevent an ABBA deadlock
        // with writes.
@@ -256,6 +257,7 @@ private:
    locator::token_metadata& _token_metadata;
    smp_service_group _read_smp_service_group;
    smp_service_group _write_smp_service_group;
+    smp_service_group _hints_write_smp_service_group;
    smp_service_group _write_ack_smp_service_group;
    response_id_type _next_response_id;
    response_handlers_map _response_handlers;
@@ -314,7 +316,6 @@ private:

    cdc_stats _cdc_stats;
 private:
-    future<> uninit_messaging_service();
    future<coordinator_query_result> query_singular(lw_shared_ptr<query::read_command> cmd,
            dht::partition_range_vector&& partition_ranges,
            db::consistency_level cl,
@@ -469,13 +470,31 @@ public:
        return next;
    }
    void init_messaging_service();
+    future<> uninit_messaging_service();

+private:
    // Applies mutation on this node.
    // Resolves with timed_out_error when timeout is reached.
-    future<> mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, clock_type::time_point timeout = clock_type::time_point::max());
+    future<> mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout, smp_service_group smp_grp);
    // Applies mutation on this node.
    // Resolves with timed_out_error when timeout is reached.
-    future<> mutate_locally(const schema_ptr&, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout = clock_type::time_point::max());
+    future<> mutate_locally(const schema_ptr&, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout,
+            smp_service_group smp_grp);
+    // Applies mutations on this node.
+    // Resolves with timed_out_error when timeout is reached.
+    future<> mutate_locally(std::vector<mutation> mutation, tracing::trace_state_ptr tr_state, clock_type::time_point timeout, smp_service_group smp_grp);
+
+public:
+    // Applies mutation on this node.
+    // Resolves with timed_out_error when timeout is reached.
+    future<> mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout = clock_type::time_point::max()) {
+        return mutate_locally(m, tr_state, sync, timeout, _write_smp_service_group);
+    }
+    // Applies mutation on this node.
+    // Resolves with timed_out_error when timeout is reached.
+    future<> mutate_locally(const schema_ptr& s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout = clock_type::time_point::max()) {
+        return mutate_locally(s, m, tr_state, sync, timeout, _write_smp_service_group);
+    }
    // Applies mutations on this node.
    // Resolves with timed_out_error when timeout is reached.
    future<> mutate_locally(std::vector<mutation> mutation, tracing::trace_state_ptr tr_state, clock_type::time_point timeout = clock_type::time_point::max());
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -108,7 +108,8 @@ storage_service::storage_service(abort_source& abort_source, distributed<databas
        , _replicate_action([this] { return do_replicate_to_all_cores(); })
        , _update_pending_ranges_action([this] { return do_update_pending_ranges(); })
        , _sys_dist_ks(sys_dist_ks)
-        , _view_update_generator(view_update_generator) {
+        , _view_update_generator(view_update_generator)
+        , _schema_version_publisher([this] { return publish_schema_version(); }) {
    register_metrics();
    sstable_read_error.connect([this] { do_isolate_on_error(disk_error::regular); });
    sstable_write_error.connect([this] { do_isolate_on_error(disk_error::regular); });
@@ -209,6 +210,16 @@ bool storage_service::should_bootstrap() const {
    return is_auto_bootstrap() && !db::system_keyspace::bootstrap_complete() && !_gossiper.get_seeds().count(get_broadcast_address());
 }

+void storage_service::install_schema_version_change_listener() { // Observes the local database's schema version and triggers the publisher on each notification.
+    _listeners.emplace_back(make_lw_shared(_db.local().observable_schema_version().observe([this] (utils::UUID schema_version) { // observer handle is kept in _listeners; schema_version itself is unused here
+        (void)_schema_version_publisher.trigger(); // result deliberately discarded; stop() joins the publisher
+    })));
+}
+
+future<> storage_service::publish_schema_version() { // Hands the local database's current schema version to the migration manager.
+    return get_local_migration_manager().passive_announce(_db.local().get_version()); // the version is read when this runs, not taken from the observer argument
+}
+
 // Runs inside seastar::async context
 void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints, const std::unordered_map<gms::inet_address, sstring>& loaded_peer_features, bind_messaging_port do_bind) {
    std::map<gms::application_state, gms::versioned_value> app_states;
@@ -351,7 +362,7 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
    auto broadcast_rpc_address = utils::fb_utilities::get_broadcast_rpc_address();
    auto& proxy = service::get_storage_proxy();
    // Ensure we know our own actual Schema UUID in preparation for updates
-    auto schema_version = update_schema_version(proxy, _feature_service.cluster_schema_features()).get0();
+    db::schema_tables::recalculate_schema_version(proxy, _feature_service).get0();
    app_states.emplace(gms::application_state::NET_VERSION, versioned_value::network_version());
    app_states.emplace(gms::application_state::HOST_ID, versioned_value::host_id(local_host_id));
    app_states.emplace(gms::application_state::RPC_ADDRESS, versioned_value::rpcaddress(broadcast_rpc_address));
@@ -361,7 +372,7 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
    app_states.emplace(gms::application_state::SCHEMA_TABLES_VERSION, versioned_value(db::schema_tables::version));
    app_states.emplace(gms::application_state::RPC_READY, versioned_value::cql_ready(false));
    app_states.emplace(gms::application_state::VIEW_BACKLOG, versioned_value(""));
-    app_states.emplace(gms::application_state::SCHEMA, versioned_value::schema(schema_version));
+    app_states.emplace(gms::application_state::SCHEMA, versioned_value::schema(_db.local().get_version()));
    if (restarting_normal_node) {
        // Order is important: both the CDC streams timestamp and tokens must be known when a node handles our status.
        // Exception: there might be no CDC streams timestamp proposed by us if we're upgrading from a non-CDC version.
@@ -369,11 +380,16 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
        app_states.emplace(gms::application_state::CDC_STREAMS_TIMESTAMP, versioned_value::cdc_streams_timestamp(_cdc_streams_ts));
        app_states.emplace(gms::application_state::STATUS, versioned_value::normal(my_tokens));
    }
+    if (replacing_a_node_with_same_ip || replacing_a_node_with_diff_ip) {
+        app_states.emplace(gms::application_state::TOKENS, versioned_value::tokens(_bootstrap_tokens));
+    }
    slogger.info("Starting up server gossip");

    auto generation_number = db::system_keyspace::increment_and_get_generation().get0();
    _gossiper.start_gossiping(generation_number, app_states, gms::bind_messaging_port(bool(do_bind))).get();

+    install_schema_version_change_listener();
+
    // gossip snitch infos (local DC and rack)
    gossip_snitch_info().get();

@@ -698,6 +714,8 @@ bool storage_service::do_handle_cdc_generation_intercept_nonfatal_errors(db_cloc
        throw cdc_generation_handling_nonfatal_exception(e.what());
    } catch (exceptions::unavailable_exception& e) {
        throw cdc_generation_handling_nonfatal_exception(e.what());
+    } catch (exceptions::read_failure_exception& e) {
+        throw cdc_generation_handling_nonfatal_exception(e.what());
    } catch (...) {
        const auto ep = std::current_exception();
        if (is_timeout_exception(ep)) {
@@ -890,12 +908,14 @@ future<> storage_service::check_and_repair_cdc_streams() {
            cdc_log.error("Aborting CDC generation repair due to missing STATUS");
            return;
        }
+        // Update _cdc_streams_ts first, so that do_handle_cdc_generation (which will get called due to the status update)
+        // won't try to update the gossiper, which would result in a deadlock inside add_local_application_state
+        _cdc_streams_ts = new_streams_ts;
        _gossiper.add_local_application_state({
                { gms::application_state::CDC_STREAMS_TIMESTAMP, versioned_value::cdc_streams_timestamp(new_streams_ts) },
                { gms::application_state::STATUS, *status }
        }).get();
        db::system_keyspace::update_cdc_streams_timestamp(new_streams_ts).get();
-        _cdc_streams_ts = new_streams_ts;
    });
 }

@@ -1728,6 +1748,9 @@ future<> storage_service::gossip_sharder() {
 future<> storage_service::stop() {
    return uninit_messaging_service().then([this] {
        return _service_memory_limiter.wait(_service_memory_total); // make sure nobody uses the semaphore
+    }).finally([this] {
+        _listeners.clear();
+        return _schema_version_publisher.join();
    });
 }

@@ -1884,9 +1907,11 @@ future<std::map<gms::inet_address, float>> storage_service::effective_ownership(
        return do_with(dht::token::describe_ownership(ss._token_metadata.sorted_tokens()),
                ss._token_metadata.get_topology().get_datacenter_endpoints(),
                std::map<gms::inet_address, float>(),
-                [&ss, keyspace_name](const std::map<token, float>& token_ownership, std::unordered_map<sstring,
+                std::move(keyspace_name),
+                [&ss](const std::map<token, float>& token_ownership, std::unordered_map<sstring,
                        std::unordered_set<gms::inet_address>>& datacenter_endpoints,
-                        std::map<gms::inet_address, float>& final_ownership) {
+                        std::map<gms::inet_address, float>& final_ownership,
+                        sstring& keyspace_name) {
            return do_for_each(datacenter_endpoints, [&ss, &keyspace_name, &final_ownership, &token_ownership](std::pair<sstring,std::unordered_set<inet_address>>&& endpoints) mutable {
                return do_with(std::unordered_set<inet_address>(endpoints.second), [&ss, &keyspace_name, &final_ownership, &token_ownership](const std::unordered_set<inet_address>& endpoints_map) mutable {
                    return do_for_each(endpoints_map, [&ss, &keyspace_name, &final_ownership, &token_ownership](const gms::inet_address& endpoint) mutable {
--- a/service/storage_service.hh
+++ b/service/storage_service.hh
@@ -150,6 +150,7 @@ private:
    semaphore _service_memory_limiter;
    using client_shutdown_hook = noncopyable_function<void()>;
    std::vector<std::pair<std::string, client_shutdown_hook>> _client_shutdown_hooks;
+    std::vector<std::any> _listeners;

    /* For unit tests only.
     *
@@ -170,7 +171,8 @@ public:
 private:
    future<> do_update_pending_ranges();
    void register_metrics();
-
+    future<> publish_schema_version();
+    void install_schema_version_change_listener();
 public:
    future<> keyspace_changed(const sstring& ks_name);
    future<> update_pending_ranges();
@@ -545,6 +547,7 @@ private:
    serialized_action _update_pending_ranges_action;
    sharded<db::system_distributed_keyspace>& _sys_dist_ks;
    sharded<db::view::view_update_generator>& _view_update_generator;
+    serialized_action _schema_version_publisher;
 private:
    /**
     * Replicates token_metadata contents on shard0 instance to other shards.
--- a/sstables/compaction.cc
+++ b/sstables/compaction.cc
@@ -602,7 +602,7 @@ private:
        // - add support to merge summary (message: Partition merge counts were {%s}.).
        // - there is no easy way, currently, to know the exact number of total partitions.
        // By the time being, using estimated key count.
-        sstring formatted_msg = fmt::format("{} sstables to [{}]. {} to {} (~{} of original) in {}ms = {}. " \
+        sstring formatted_msg = fmt::format("{} sstables to [{}]. {} to {} (~{}% of original) in {}ms = {}. " \
            "~{} total partitions merged to {}.",
            _info->sstables, new_sstables_msg, pretty_printed_data_size(_info->start_size), pretty_printed_data_size(_info->end_size), int(ratio * 100),
            std::chrono::duration_cast<std::chrono::milliseconds>(duration).count(), pretty_printed_throughput(_info->end_size, duration),
@@ -1236,11 +1236,8 @@ private:
    // return estimated partitions per sstable for a given shard
    uint64_t partitions_per_sstable(shard_id s) const {
        uint64_t estimated_sstables = std::max(uint64_t(1), uint64_t(ceil(double(_estimation_per_shard[s].estimated_size) / _max_sstable_size)));
-        // As we adjust this estimate downwards from the compaction strategy, it can get to 0 so
-        // make sure we're returning at least 1.
-        return std::max(uint64_t(1),
-                std::min(uint64_t(ceil(double(_estimation_per_shard[s].estimated_partitions) / estimated_sstables)),
-                _cf.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimation_per_shard[s].estimated_partitions)));
+        return std::min(uint64_t(ceil(double(_estimation_per_shard[s].estimated_partitions) / estimated_sstables)),
+                _cf.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimation_per_shard[s].estimated_partitions));
    }
 public:
    resharding_compaction(column_family& cf, sstables::compaction_descriptor descriptor)
--- a/sstables/compaction_backlog_manager.hh
+++ b/sstables/compaction_backlog_manager.hh
@@ -92,6 +92,9 @@ public:
    void transfer_ongoing_charges(compaction_backlog_tracker& new_bt, bool move_read_charges = true);
    void revert_charges(sstables::shared_sstable sst);
 private:
+    // Returns true if this SSTable can be added or removed from the tracker.
+    bool sstable_belongs_to_tracker(const sstables::shared_sstable& sst);
+
    void disable() {
        _disabled = true;
        _ongoing_writes = {};
--- a/sstables/compaction_manager.cc
+++ b/sstables/compaction_manager.cc
@@ -218,7 +218,7 @@ std::vector<sstables::shared_sstable> compaction_manager::get_candidates(const c
    auto& cs = cf.get_compaction_strategy();

    // Filter out sstables that are being compacted.
-    for (auto& sst : cf.candidates_for_compaction()) {
+    for (auto& sst : cf.non_staging_sstables()) {
        if (_compacting_sstables.count(sst)) {
            continue;
        }
@@ -708,8 +708,8 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
    return task->compaction_done.get_future().then([task] {});
 }

-static bool needs_cleanup(const sstables::shared_sstable& sst,
-                   const dht::token_range_vector& owned_ranges,
+bool needs_cleanup(const sstables::shared_sstable& sst,
+                   const dht::token_range_vector& sorted_owned_ranges,
                   schema_ptr s) {
    auto first = sst->get_first_partition_key();
    auto last = sst->get_last_partition_key();
@@ -717,29 +717,40 @@ static bool needs_cleanup(const sstables::shared_sstable& sst,
    auto last_token = dht::get_token(*s, last);
    dht::token_range sst_token_range = dht::token_range::make(first_token, last_token);

+    auto r = std::lower_bound(sorted_owned_ranges.begin(), sorted_owned_ranges.end(), first_token,
+            [] (const range<dht::token>& a, const dht::token& b) {
+        // check that range a is before token b.
+        return a.after(b, dht::token_comparator());
+    });
+
    // return true iff sst partition range isn't fully contained in any of the owned ranges.
-    for (auto& r : owned_ranges) {
-        if (r.contains(sst_token_range, dht::token_comparator())) {
+    if (r != sorted_owned_ranges.end()) {
+        if (r->contains(sst_token_range, dht::token_comparator())) {
            return false;
        }
    }
    return true;
 }

-future<> compaction_manager::perform_cleanup(column_family* cf) {
+future<> compaction_manager::perform_cleanup(database& db, column_family* cf) {
    if (check_for_cleanup(cf)) {
        throw std::runtime_error(format("cleanup request failed: there is an ongoing cleanup on {}.{}",
            cf->schema()->ks_name(), cf->schema()->cf_name()));
    }
-    return rewrite_sstables(cf, sstables::compaction_options::make_cleanup(), [this] (const table& table) {
-        auto schema = table.schema();
-        auto owned_ranges = service::get_local_storage_service().get_local_ranges(schema->ks_name());
+    return seastar::async([this, cf, &db] {
+        auto schema = cf->schema();
+        auto& rs = db.find_keyspace(schema->ks_name()).get_replication_strategy();
+        auto sorted_owned_ranges = rs.get_ranges_in_thread(utils::fb_utilities::get_broadcast_address());
        auto sstables = std::vector<sstables::shared_sstable>{};
-        const auto candidates = table.candidates_for_compaction();
-        std::copy_if(candidates.begin(), candidates.end(), std::back_inserter(sstables), [&owned_ranges, schema] (const sstables::shared_sstable& sst) {
-            return owned_ranges.empty() || needs_cleanup(sst, owned_ranges, schema);
+        const auto candidates = get_candidates(*cf);
+        std::copy_if(candidates.begin(), candidates.end(), std::back_inserter(sstables), [&sorted_owned_ranges, schema] (const sstables::shared_sstable& sst) {
+            seastar::thread::maybe_yield();
+            return sorted_owned_ranges.empty() || needs_cleanup(sst, sorted_owned_ranges, schema);
        });
        return sstables;
+    }).then([this, cf] (std::vector<sstables::shared_sstable> sstables) {
+        return rewrite_sstables(cf, sstables::compaction_options::make_cleanup(),
+                [sstables = std::move(sstables)] (const table&) { return sstables; });
    });
 }

@@ -754,7 +765,7 @@ future<> compaction_manager::perform_sstable_upgrade(column_family* cf, bool exc
        return cf->run_with_compaction_disabled([this, cf, &tables, exclude_current_version] {
            auto last_version = cf->get_sstables_manager().get_highest_supported_format();

-            for (auto& sst : cf->candidates_for_compaction()) {
+            for (auto& sst : get_candidates(*cf)) {
                // if we are a "normal" upgrade, we only care about
                // tables with older versions, but potentially
                // we are to actually rewrite everything. (-a)
@@ -779,8 +790,8 @@ future<> compaction_manager::perform_sstable_upgrade(column_family* cf, bool exc

 // Submit a column family to be scrubbed and wait for its termination.
 future<> compaction_manager::perform_sstable_scrub(column_family* cf, bool skip_corrupted) {
-    return rewrite_sstables(cf, sstables::compaction_options::make_scrub(skip_corrupted), [] (const table& cf) {
-        return cf.candidates_for_compaction();
+    return rewrite_sstables(cf, sstables::compaction_options::make_scrub(skip_corrupted), [this] (const table& cf) {
+        return get_candidates(cf);
    });
 }

@@ -857,7 +868,7 @@ double compaction_backlog_tracker::backlog() const {
 }

 void compaction_backlog_tracker::add_sstable(sstables::shared_sstable sst) {
-    if (_disabled) {
+    if (_disabled || !sstable_belongs_to_tracker(sst)) {
        return;
    }
    _ongoing_writes.erase(sst);
@@ -870,7 +881,7 @@ void compaction_backlog_tracker::add_sstable(sstables::shared_sstable sst) {
 }

 void compaction_backlog_tracker::remove_sstable(sstables::shared_sstable sst) {
-    if (_disabled) {
+    if (_disabled || !sstable_belongs_to_tracker(sst)) {
        return;
    }

@@ -883,6 +894,10 @@ void compaction_backlog_tracker::remove_sstable(sstables::shared_sstable sst) {
    }
 }

+bool compaction_backlog_tracker::sstable_belongs_to_tracker(const sstables::shared_sstable& sst) {
+    return !sst->requires_view_building();
+}
+
 void compaction_backlog_tracker::register_partially_written_sstable(sstables::shared_sstable sst, backlog_write_progress_manager& wp) {
    if (_disabled) {
        return;
--- a/sstables/compaction_manager.hh
+++ b/sstables/compaction_manager.hh
@@ -205,7 +205,7 @@ public:
    // Cleanup is about discarding keys that are no longer relevant for a
    // given sstable, e.g. after node loses part of its token range because
    // of a newly added node.
-    future<> perform_cleanup(column_family* cf);
+    future<> perform_cleanup(database& db, column_family* cf);

    // Submit a column family to be upgraded and wait for its termination.
    future<> perform_sstable_upgrade(column_family* cf, bool exclude_current_version);
@@ -271,3 +271,5 @@ public:
    friend class compaction_weight_registration;
 };

+bool needs_cleanup(const sstables::shared_sstable& sst, const dht::token_range_vector& owned_ranges, schema_ptr s);
+
--- a/sstables/compaction_strategy.cc
+++ b/sstables/compaction_strategy.cc
@@ -438,8 +438,8 @@ std::unique_ptr<sstable_set_impl> leveled_compaction_strategy::make_sstable_set(
    return std::make_unique<partitioned_sstable_set>(std::move(schema));
 }

-std::unique_ptr<sstable_set_impl> make_partitioned_sstable_set(schema_ptr schema, bool use_level_metadata) {
-    return std::make_unique<partitioned_sstable_set>(std::move(schema), use_level_metadata);
+sstable_set make_partitioned_sstable_set(schema_ptr schema, lw_shared_ptr<sstable_list> all, bool use_level_metadata) {
+    return sstables::sstable_set(std::make_unique<partitioned_sstable_set>(schema, use_level_metadata), schema, std::move(all));
 }

 compaction_descriptor compaction_strategy_impl::get_major_compaction_job(column_family& cf, std::vector<sstables::shared_sstable> candidates) {
--- a/sstables/index_reader.hh
+++ b/sstables/index_reader.hh
@@ -453,9 +453,16 @@ private:
                    auto indexes = std::move(entries_reader->_consumer.indexes);
                    return entries_reader->_context.close().then([indexes = std::move(indexes), ex = std::move(ex)] () mutable {
                        if (ex) {
-                            std::rethrow_exception(std::move(ex));
+                            return do_with(std::move(indexes), [ex = std::move(ex)] (index_list& indexes) mutable {
+                                return parallel_for_each(indexes, [] (index_entry& ie) mutable {
+                                    return ie.close_pi_stream();
+                                }).then_wrapped([ex = std::move(ex)] (future<>&& fut) mutable {
+                                    fut.ignore_ready_future();
+                                    return make_exception_future<index_list>(std::move(ex));
+                                });
+                            });
                        }
-                        return std::move(indexes);
+                        return make_ready_future<index_list>(std::move(indexes));
                    });

                });
--- a/sstables/leveled_compaction_strategy.cc
+++ b/sstables/leveled_compaction_strategy.cc
@@ -178,7 +178,13 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input

    size_t offstrategy_threshold = std::max(schema->min_compaction_threshold(), 4);
    size_t max_sstables = std::max(schema->max_compaction_threshold(), int(offstrategy_threshold));
-    unsigned tolerance = mode == reshape_mode::strict ? 0 : leveled_manifest::leveled_fan_out * 2;
+    auto tolerance = [mode] (unsigned level) -> unsigned {
+        if (mode == reshape_mode::strict) {
+            return 0;
+        }
+        constexpr unsigned fan_out = leveled_manifest::leveled_fan_out;
+        return std::max(double(fan_out), std::ceil(std::pow(fan_out, level) * 0.1));
+    };

    if (level_info[0].size() > offstrategy_threshold) {
        level_info[0].resize(std::min(level_info[0].size(), max_sstables));
@@ -193,7 +199,7 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input
        }
        max_filled_level = std::max(max_filled_level, level);

-        if (!is_disjoint(level_info[level], tolerance)) {
+        if (!is_disjoint(level_info[level], tolerance(level))) {
            leveled_manifest::logger.warn("Turns out that level {} is not disjoint, so compacting everything on behalf of {}.{}", level, schema->ks_name(), schema->cf_name());
            // Unfortunately no good limit to limit input size to max_sstables for LCS major
            compaction_descriptor desc(std::move(input), std::optional<sstables::sstable_set>(), iop, max_filled_level, _max_sstable_size_in_mb * 1024 * 1024);
--- a/sstables/mc/writer.cc
+++ b/sstables/mc/writer.cc
@@ -741,6 +741,11 @@ public:
        , _run_identifier(cfg.run_identifier)
        , _write_regular_as_static(cfg.correctly_serialize_static_compact_in_mc && s.is_static_compact_table())
    {
+        // This can be 0 in some cases, which, albeit benign, can wreak havoc
+        // in lower-level writer code, so clamp it to [1, +inf) here, which is
+        // exactly what callers used to do anyway.
+        estimated_partitions = std::max(uint64_t(1), estimated_partitions);
+
        _sst.generate_toc(_schema.get_compressor_params().get_compressor(), _schema.bloom_filter_fp_chance());
        _sst.write_toc(_pc);
        _sst.create_data().get();
--- a/sstables/size_tiered_compaction_strategy.cc
+++ b/sstables/size_tiered_compaction_strategy.cc
@@ -27,7 +27,7 @@
 namespace sstables {

 std::vector<std::pair<sstables::shared_sstable, uint64_t>>
-size_tiered_compaction_strategy::create_sstable_and_length_pairs(const std::vector<sstables::shared_sstable>& sstables) const {
+size_tiered_compaction_strategy::create_sstable_and_length_pairs(const std::vector<sstables::shared_sstable>& sstables) {

    std::vector<std::pair<sstables::shared_sstable, uint64_t>> sstable_length_pairs;
    sstable_length_pairs.reserve(sstables.size());
@@ -43,7 +43,7 @@ size_tiered_compaction_strategy::create_sstable_and_length_pairs(const std::vect
 }

 std::vector<std::vector<sstables::shared_sstable>>
-size_tiered_compaction_strategy::get_buckets(const std::vector<sstables::shared_sstable>& sstables) const {
+size_tiered_compaction_strategy::get_buckets(const std::vector<sstables::shared_sstable>& sstables, size_tiered_compaction_strategy_options options) {
    // sstables sorted by size of its data file.
    auto sorted_sstables = create_sstable_and_length_pairs(sstables);

@@ -64,8 +64,8 @@ size_tiered_compaction_strategy::get_buckets(const std::vector<sstables::shared_
        for (auto it = buckets.begin(); it != buckets.end(); it++) {
            size_t old_average_size = it->first;

-            if ((size > (old_average_size * _options.bucket_low) && size < (old_average_size * _options.bucket_high)) ||
-                    (size < _options.min_sstable_size && old_average_size < _options.min_sstable_size)) {
+            if ((size > (old_average_size * options.bucket_low) && size < (old_average_size * options.bucket_high)) ||
+                    (size < options.min_sstable_size && old_average_size < options.min_sstable_size)) {
                auto bucket = std::move(it->second);
                size_t total_size = bucket.size() * old_average_size;
                size_t new_average_size = (total_size + size) / (bucket.size() + 1);
@@ -97,6 +97,11 @@ size_tiered_compaction_strategy::get_buckets(const std::vector<sstables::shared_
    return bucket_list;
 }

+std::vector<std::vector<sstables::shared_sstable>>
+size_tiered_compaction_strategy::get_buckets(const std::vector<sstables::shared_sstable>& sstables) const {
+    return get_buckets(sstables, _options);
+}
+
 std::vector<sstables::shared_sstable>
 size_tiered_compaction_strategy::most_interesting_bucket(std::vector<std::vector<sstables::shared_sstable>> buckets,
        unsigned min_threshold, unsigned max_threshold)
@@ -176,23 +181,28 @@ size_tiered_compaction_strategy::get_sstables_for_compaction(column_family& cfs,
    return sstables::compaction_descriptor();
 }

+int64_t size_tiered_compaction_strategy::estimated_pending_compactions(const std::vector<sstables::shared_sstable>& sstables,
+        int min_threshold, int max_threshold, size_tiered_compaction_strategy_options options) {
+    int64_t n = 0;
+    for (auto& bucket : get_buckets(sstables, options)) {
+        if (bucket.size() >= size_t(min_threshold)) {
+            n += std::ceil(double(bucket.size()) / max_threshold);
+        }
+    }
+    return n;
+}
+
 int64_t size_tiered_compaction_strategy::estimated_pending_compactions(column_family& cf) const {
    int min_threshold = cf.min_compaction_threshold();
    int max_threshold = cf.schema()->max_compaction_threshold();
    std::vector<sstables::shared_sstable> sstables;
-    int64_t n = 0;

    sstables.reserve(cf.sstables_count());
    for (auto& entry : *cf.get_sstables()) {
        sstables.push_back(entry);
    }

-    for (auto& bucket : get_buckets(sstables)) {
-        if (bucket.size() >= size_t(min_threshold)) {
-            n += std::ceil(double(bucket.size()) / max_threshold);
-        }
-    }
-    return n;
+    return estimated_pending_compactions(sstables, min_threshold, max_threshold, _options);
 }

 std::vector<sstables::shared_sstable>
--- a/sstables/size_tiered_compaction_strategy.hh
+++ b/sstables/size_tiered_compaction_strategy.hh
@@ -116,9 +116,11 @@ class size_tiered_compaction_strategy : public compaction_strategy_impl {
    compaction_backlog_tracker _backlog_tracker;

    // Return a list of pair of shared_sstable and its respective size.
-    std::vector<std::pair<sstables::shared_sstable, uint64_t>> create_sstable_and_length_pairs(const std::vector<sstables::shared_sstable>& sstables) const;
+    static std::vector<std::pair<sstables::shared_sstable, uint64_t>> create_sstable_and_length_pairs(const std::vector<sstables::shared_sstable>& sstables);

    // Group files of similar size into buckets.
+    static std::vector<std::vector<sstables::shared_sstable>> get_buckets(const std::vector<sstables::shared_sstable>& sstables, size_tiered_compaction_strategy_options options);
+
    std::vector<std::vector<sstables::shared_sstable>> get_buckets(const std::vector<sstables::shared_sstable>& sstables) const;

    // Maybe return a bucket of sstables to compact
@@ -154,6 +156,8 @@ public:

    virtual compaction_descriptor get_sstables_for_compaction(column_family& cfs, std::vector<sstables::shared_sstable> candidates) override;

+    static int64_t estimated_pending_compactions(const std::vector<sstables::shared_sstable>& sstables,
+        int min_threshold, int max_threshold, size_tiered_compaction_strategy_options options);
    virtual int64_t estimated_pending_compactions(column_family& cf) const override;

    virtual compaction_strategy_type type() const {
--- a/Show More
+++ b/Show More