schema.cc/describe: fix invalid compaction options in schema

There is a typo in schema.cql of snapshot, lack of comma after compaction strategy. It will fail to restore schema by the file. AND compaction = {'class': 'SizeTieredCompactionStrategy''max_compaction_threshold': '32'} map_as_cql_param() function has a `first` parameter to smartly add comma, the compaction_strategy_options is always not the first. Fixes #7741 Signed-off-by: Amos Kong <amos@scylladb.com> Closes #7734 (cherry picked from commit 6b1659ee80)
sstable: writer: ka/la: Write row marker cell after row tombstone
2021-03-24 12:58:11 +02:00 · 2021-03-24 10:42:11 +02:00 · 2021-03-21 10:51:36 +02:00 · 2021-03-18 19:20:10 +02:00 · 2021-03-18 14:29:38 +02:00 · 2021-03-11 08:24:56 +02:00
167 changed files with 4724 additions and 1011 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/2
+++ b/2
@@ -1,7 +1,7 @@
 #!/bin/sh

 PRODUCT=scylla
-VERSION=666.development
+VERSION=4.2.4

 if test -f version
 then
--- a/alternator/auth.cc
+++ b/alternator/auth.cc
@@ -129,7 +129,7 @@ future<std::string> get_key_from_roles(cql3::query_processor& qp, std::string us
            auth::meta::roles_table::qualified_name(), auth::meta::roles_table::role_col_name);

    auto cl = auth::password_authenticator::consistency_for_user(username);
-    auto timeout = auth::internal_distributed_timeout_config();
+    auto& timeout = auth::internal_distributed_timeout_config();
    return qp.execute_internal(query, cl, timeout, {sstring(username)}, true).then_wrapped([username = std::move(username)] (future<::shared_ptr<cql3::untyped_result_set>> f) {
        auto res = f.get0();
        auto salted_hash = std::optional<sstring>();
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -98,6 +98,11 @@ struct nonempty : public size_check {

 // Check that array has the expected number of elements
 static void verify_operand_count(const rjson::value* array, const size_check& expected, const rjson::value& op) {
+    if (!array && expected(0)) {
+        // If expected() allows an empty AttributeValueList, it is also fine
+        // that it is missing.
+        return;
+    }
    if (!array || !array->IsArray()) {
        throw api_error("ValidationException", "With ComparisonOperator, AttributeValueList must be given and an array");
    }
@@ -154,23 +159,40 @@ static bool check_NE(const rjson::value* v1, const rjson::value& v2) {
 }

 // Check if two JSON-encoded values match with the BEGINS_WITH relation
-static bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2) {
-    // BEGINS_WITH requires that its single operand (v2) be a string or
-    // binary - otherwise it's a validation error. However, problems with
-    // the stored attribute (v1) will just return false (no match).
-    if (!v2.IsObject() || v2.MemberCount() != 1) {
-        throw api_error("ValidationException", format("BEGINS_WITH operator encountered malformed AttributeValue: {}", v2));
-    }
-    auto it2 = v2.MemberBegin();
-    if (it2->name != "S" && it2->name != "B") {
-        throw api_error("ValidationException", format("BEGINS_WITH operator requires String or Binary type in AttributeValue, got {}", it2->name));
-    }
-
-
+bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2,
+                       bool v1_from_query, bool v2_from_query) {
+    bool bad = false;
    if (!v1 || !v1->IsObject() || v1->MemberCount() != 1) {
+        if (v1_from_query) {
+            throw api_error("ValidationException", "begins_with() encountered malformed argument");
+        } else {
+            bad = true;
+        }
+    } else if (v1->MemberBegin()->name != "S" && v1->MemberBegin()->name != "B") {
+        if (v1_from_query) {
+            throw api_error("ValidationException", format("begins_with supports only string or binary type, got: {}", *v1));
+        } else {
+            bad = true;
+        }
+    }
+    if (!v2.IsObject() || v2.MemberCount() != 1) {
+        if (v2_from_query) {
+            throw api_error("ValidationException", "begins_with() encountered malformed argument");
+        } else {
+            bad = true;
+        }
+    } else if (v2.MemberBegin()->name != "S" && v2.MemberBegin()->name != "B") {
+        if (v2_from_query) {
+            throw api_error("ValidationException", format("begins_with() supports only string or binary type, got: {}", v2));
+        } else {
+            bad = true;
+        }
+    }
+    if (bad) {
        return false;
    }
    auto it1 = v1->MemberBegin();
+    auto it2 = v2.MemberBegin();
    if (it1->name != it2->name) {
        return false;
    }
@@ -274,24 +296,38 @@ static bool check_NOT_NULL(const rjson::value* val) {
    return val != nullptr;
 }

+// Only types S, N or B (string, number or bytes) may be compared by the
+// various comparion operators - lt, le, gt, ge, and between.
+static bool check_comparable_type(const rjson::value& v) {
+    if (!v.IsObject() || v.MemberCount() != 1) {
+        return false;
+    }
+    const rjson::value& type = v.MemberBegin()->name;
+    return type == "S" || type == "N" || type == "B";
+}
+
 // Check if two JSON-encoded values match with cmp.
 template <typename Comparator>
-bool check_compare(const rjson::value* v1, const rjson::value& v2, const Comparator& cmp) {
-    if (!v2.IsObject() || v2.MemberCount() != 1) {
-        throw api_error("ValidationException",
-                        format("{} requires a single AttributeValue of type String, Number, or Binary",
-                               cmp.diagnostic));
+bool check_compare(const rjson::value* v1, const rjson::value& v2, const Comparator& cmp,
+                   bool v1_from_query, bool v2_from_query) {
+    bool bad = false;
+    if (!v1 || !check_comparable_type(*v1)) {
+        if (v1_from_query) {
+            throw api_error("ValidationException", format("{} allow only the types String, Number, or Binary", cmp.diagnostic));
+        }
+        bad = true;
    }
-    const auto& kv2 = *v2.MemberBegin();
-    if (kv2.name != "S" && kv2.name != "N" && kv2.name != "B") {
-        throw api_error("ValidationException",
-                        format("{} requires a single AttributeValue of type String, Number, or Binary",
-                               cmp.diagnostic));
+    if (!check_comparable_type(v2)) {
+        if (v2_from_query) {
+            throw api_error("ValidationException", format("{} allow only the types String, Number, or Binary", cmp.diagnostic));
+        }
+        bad = true;
    }
-    if (!v1 || !v1->IsObject() || v1->MemberCount() != 1) {
+    if (bad) {
        return false;
    }
    const auto& kv1 = *v1->MemberBegin();
+    const auto& kv2 = *v2.MemberBegin();
    if (kv1.name != kv2.name) {
        return false;
    }
@@ -305,7 +341,8 @@ bool check_compare(const rjson::value* v1, const rjson::value& v2, const Compara
    if (kv1.name == "B") {
        return cmp(base64_decode(kv1.value), base64_decode(kv2.value));
    }
-    clogger.error("check_compare panic: LHS type equals RHS type, but one is in {N,S,B} while the other isn't");
+    // cannot reach here, as check_comparable_type() verifies the type is one
+    // of the above options.
    return false;
 }

@@ -336,57 +373,71 @@ struct cmp_gt {
    static constexpr const char* diagnostic = "GT operator";
 };

-// True if v is between lb and ub, inclusive.  Throws if lb > ub.
+// True if v is between lb and ub, inclusive.  Throws or returns false
+// (depending on bounds_from_query parameter) if lb > ub.
 template <typename T>
-static bool check_BETWEEN(const T& v, const T& lb, const T& ub) {
+static bool check_BETWEEN(const T& v, const T& lb, const T& ub, bool bounds_from_query) {
    if (cmp_lt()(ub, lb)) {
-        throw api_error("ValidationException",
-                        format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
+        if (bounds_from_query) {
+            throw api_error("ValidationException",
+                format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
+        } else {
+            return false;
+        }
    }
    return cmp_ge()(v, lb) && cmp_le()(v, ub);
 }

-static bool check_BETWEEN(const rjson::value* v, const rjson::value& lb, const rjson::value& ub) {
-    if (!v) {
+static bool check_BETWEEN(const rjson::value* v, const rjson::value& lb, const rjson::value& ub,
+                          bool v_from_query, bool lb_from_query, bool ub_from_query) {
+    if ((v && v_from_query && !check_comparable_type(*v)) ||
+        (lb_from_query && !check_comparable_type(lb)) ||
+        (ub_from_query && !check_comparable_type(ub))) {
+        throw api_error("ValidationException", "between allow only the types String, Number, or Binary");
+
+    }
+    if (!v || !v->IsObject() || v->MemberCount() != 1 ||
+        !lb.IsObject() || lb.MemberCount() != 1 ||
+        !ub.IsObject() || ub.MemberCount() != 1) {
        return false;
    }
-    if (!v->IsObject() || v->MemberCount() != 1) {
-        throw api_error("ValidationException", format("BETWEEN operator encountered malformed AttributeValue: {}", *v));
-    }
-    if (!lb.IsObject() || lb.MemberCount() != 1) {
-        throw api_error("ValidationException", format("BETWEEN operator encountered malformed AttributeValue: {}", lb));
-    }
-    if (!ub.IsObject() || ub.MemberCount() != 1) {
-        throw api_error("ValidationException", format("BETWEEN operator encountered malformed AttributeValue: {}", ub));
-    }

    const auto& kv_v = *v->MemberBegin();
    const auto& kv_lb = *lb.MemberBegin();
    const auto& kv_ub = *ub.MemberBegin();
+    bool bounds_from_query = lb_from_query && ub_from_query;
    if (kv_lb.name != kv_ub.name) {
-        throw api_error(
-                "ValidationException",
+        if (bounds_from_query) {
+           throw api_error("ValidationException",
                format("BETWEEN operator requires the same type for lower and upper bound; instead got {} and {}",
                       kv_lb.name, kv_ub.name));
+        } else {
+            return false;
+        }
    }
    if (kv_v.name != kv_lb.name) { // Cannot compare different types, so v is NOT between lb and ub.
        return false;
    }
    if (kv_v.name == "N") {
        const char* diag = "BETWEEN operator";
-        return check_BETWEEN(unwrap_number(*v, diag), unwrap_number(lb, diag), unwrap_number(ub, diag));
+        return check_BETWEEN(unwrap_number(*v, diag), unwrap_number(lb, diag), unwrap_number(ub, diag), bounds_from_query);
    }
    if (kv_v.name == "S") {
        return check_BETWEEN(std::string_view(kv_v.value.GetString(), kv_v.value.GetStringLength()),
                             std::string_view(kv_lb.value.GetString(), kv_lb.value.GetStringLength()),
-                             std::string_view(kv_ub.value.GetString(), kv_ub.value.GetStringLength()));
+                             std::string_view(kv_ub.value.GetString(), kv_ub.value.GetStringLength()),
+                             bounds_from_query);
    }
    if (kv_v.name == "B") {
-        return check_BETWEEN(base64_decode(kv_v.value), base64_decode(kv_lb.value), base64_decode(kv_ub.value));
+        return check_BETWEEN(base64_decode(kv_v.value), base64_decode(kv_lb.value), base64_decode(kv_ub.value), bounds_from_query);
    }
-    throw api_error("ValidationException",
-        format("BETWEEN operator requires AttributeValueList elements to be of type String, Number, or Binary; instead got {}",
+    if (v_from_query) {
+        throw api_error("ValidationException",
+            format("BETWEEN operator requires AttributeValueList elements to be of type String, Number, or Binary; instead got {}",
               kv_lb.name));
+    } else {
+        return false;
+    }
 }

 // Verify one Expect condition on one attribute (whose content is "got")
@@ -433,19 +484,19 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
            return check_NE(got, (*attribute_value_list)[0]);
        case comparison_operator_type::LT:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_lt{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_lt{}, false, true);
        case comparison_operator_type::LE:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_le{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_le{}, false, true);
        case comparison_operator_type::GT:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_gt{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_gt{}, false, true);
        case comparison_operator_type::GE:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_ge{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_ge{}, false, true);
        case comparison_operator_type::BEGINS_WITH:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_BEGINS_WITH(got, (*attribute_value_list)[0]);
+            return check_BEGINS_WITH(got, (*attribute_value_list)[0], false, true);
        case comparison_operator_type::IN:
            verify_operand_count(attribute_value_list, nonempty(), *comparison_operator);
            return check_IN(got, *attribute_value_list);
@@ -457,7 +508,8 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
            return check_NOT_NULL(got);
        case comparison_operator_type::BETWEEN:
            verify_operand_count(attribute_value_list, exact_size(2), *comparison_operator);
-            return check_BETWEEN(got, (*attribute_value_list)[0], (*attribute_value_list)[1]);
+            return check_BETWEEN(got, (*attribute_value_list)[0], (*attribute_value_list)[1],
+                                 false, true, true);
        case comparison_operator_type::CONTAINS:
            {
                verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
@@ -569,7 +621,8 @@ static bool calculate_primitive_condition(const parsed::primitive_condition& con
            // Shouldn't happen unless we have a bug in the parser
            throw std::logic_error(format("Wrong number of values {} in BETWEEN primitive_condition", cond._values.size()));
        }
-        return check_BETWEEN(&calculated_values[0], calculated_values[1], calculated_values[2]);
+        return check_BETWEEN(&calculated_values[0], calculated_values[1], calculated_values[2],
+                             cond._values[0].is_constant(), cond._values[1].is_constant(), cond._values[2].is_constant());
    case parsed::primitive_condition::type::IN:
        return check_IN(calculated_values);
    case parsed::primitive_condition::type::VALUE:
@@ -600,13 +653,17 @@ static bool calculate_primitive_condition(const parsed::primitive_condition& con
    case parsed::primitive_condition::type::NE:
        return check_NE(&calculated_values[0], calculated_values[1]);
    case parsed::primitive_condition::type::GT:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_gt{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_gt{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    case parsed::primitive_condition::type::GE:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_ge{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_ge{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    case parsed::primitive_condition::type::LT:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_lt{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_lt{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    case parsed::primitive_condition::type::LE:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_le{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_le{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    default:
        // Shouldn't happen unless we have a bug in the parser
        throw std::logic_error(format("Unknown type {} in primitive_condition object", (int)(cond._op)));
--- a/alternator/conditions.hh
+++ b/alternator/conditions.hh
@@ -52,6 +52,7 @@ bool verify_expected(const rjson::value& req, const rjson::value* previous_item)
 bool verify_condition(const rjson::value& condition, bool require_all, const rjson::value* previous_item);

 bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2);
+bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2, bool v1_from_query, bool v2_from_query);

 bool verify_condition_expression(
        const parsed::condition_expression& condition_expression,
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -626,11 +626,8 @@ void rmw_operation::set_default_write_isolation(std::string_view value) {
    default_write_isolation = parse_write_isolation(value);
 }

-// FIXME: Updating tags currently relies on updating schema, which may be subject
-// to races during concurrent updates of the same table. Once Scylla schema updates
-// are fixed, this issue will automatically get fixed as well.
 enum class update_tags_action { add_tags, delete_tags };
-static future<> update_tags(service::migration_manager& mm, const rjson::value& tags, schema_ptr schema, std::map<sstring, sstring>&& tags_map, update_tags_action action) {
+static void update_tags_map(const rjson::value& tags, std::map<sstring, sstring>& tags_map, update_tags_action action) {
    if (action == update_tags_action::add_tags) {
        for (auto it = tags.Begin(); it != tags.End(); ++it) {
            const rjson::value& key = (*it)["Key"];
@@ -652,28 +649,20 @@ static future<> update_tags(service::migration_manager& mm, const rjson::value&
    }

    if (tags_map.size() > 50) {
-        return make_exception_future<>(api_error("ValidationException", "Number of Tags exceed the current limit for the provided ResourceArn"));
+        throw api_error("ValidationException", "Number of Tags exceed the current limit for the provided ResourceArn");
    }
    validate_tags(tags_map);
+}

+// FIXME: Updating tags currently relies on updating schema, which may be subject
+// to races during concurrent updates of the same table. Once Scylla schema updates
+// are fixed, this issue will automatically get fixed as well.
+static future<> update_tags(service::migration_manager& mm, schema_ptr schema, std::map<sstring, sstring>&& tags_map) {
    schema_builder builder(schema);
    builder.set_extensions(schema::extensions_map{{sstring(tags_extension::NAME), ::make_shared<tags_extension>(std::move(tags_map))}});
    return mm.announce_column_family_update(builder.build(), false, std::vector<view_ptr>(), false);
 }

-static future<> add_tags(service::migration_manager& mm, service::storage_proxy& proxy, schema_ptr schema, rjson::value& request_info) {
-    const rjson::value* tags = rjson::find(request_info, "Tags");
-    if (!tags || !tags->IsArray()) {
-        return make_exception_future<>(api_error("ValidationException", format("Cannot parse tags")));
-    }
-    if (tags->Size() < 1) {
-        return make_exception_future<>(api_error("ValidationException", "The number of tags must be at least 1"));
-    }
-
-    std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
-    return update_tags(mm, rjson::copy(*tags), schema, std::move(tags_map), update_tags_action::add_tags);
-}
-
 future<executor::request_return_type> executor::tag_resource(client_state& client_state, service_permit permit, rjson::value request) {
    _stats.api_operations.tag_resource++;

@@ -683,7 +672,16 @@ future<executor::request_return_type> executor::tag_resource(client_state& clien
            return api_error("AccessDeniedException", "Incorrect resource identifier");
        }
        schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn));
-        add_tags(_mm, _proxy, schema, request).get();
+        std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
+        const rjson::value* tags = rjson::find(request, "Tags");
+        if (!tags || !tags->IsArray()) {
+            return api_error("ValidationException", format("Cannot parse tags"));
+        }
+        if (tags->Size() < 1) {
+            return api_error("ValidationException", "The number of tags must be at least 1") ;
+        }
+        update_tags_map(*tags, tags_map,  update_tags_action::add_tags);
+        update_tags(_mm, schema, std::move(tags_map)).get();
        return json_string("");
    });
 }
@@ -704,7 +702,8 @@ future<executor::request_return_type> executor::untag_resource(client_state& cli
        schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn));

        std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
-        update_tags(_mm, *tags, schema, std::move(tags_map), update_tags_action::delete_tags).get();
+        update_tags_map(*tags, tags_map, update_tags_action::delete_tags);
+        update_tags(_mm, schema, std::move(tags_map)).get();
        return json_string("");
    });
 }
@@ -891,9 +890,22 @@ future<executor::request_return_type> executor::create_table(client_state& clien
            view_builders.emplace_back(std::move(view_builder));
        }
    }
-    if (rjson::find(request, "SSESpecification")) {
-        return make_ready_future<request_return_type>(api_error("ValidationException", "SSESpecification: configuring encryption-at-rest is not yet supported."));
+
+    // We don't yet support configuring server-side encryption (SSE) via the
+    // SSESpecifiction attribute, but an SSESpecification with Enabled=false
+    // is simply the default, and should be accepted:
+    rjson::value* sse_specification = rjson::find(request, "SSESpecification");
+    if (sse_specification && sse_specification->IsObject()) {
+        rjson::value* enabled = rjson::find(*sse_specification, "Enabled");
+        if (!enabled || !enabled->IsBool()) {
+            return make_ready_future<request_return_type>(api_error("ValidationException", "SSESpecification needs boolean Enabled"));
+        }
+        if (enabled->GetBool()) {
+            // TODO: full support for SSESpecification
+            return make_ready_future<request_return_type>(api_error("ValidationException", "SSESpecification: configuring encryption-at-rest is not yet supported."));
+        }
    }
+
    // We don't yet support streams (CDC), but a StreamSpecification asking
    // *not* to use streams should be accepted:
    rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
@@ -908,6 +920,14 @@ future<executor::request_return_type> executor::create_table(client_state& clien
        }
    }

+    // Parse the "Tags" parameter early, so we can avoid creating the table
+    // at all if this parsing failed.
+    const rjson::value* tags = rjson::find(request, "Tags");
+    std::map<sstring, sstring> tags_map;
+    if (tags && tags->IsArray()) {
+        update_tags_map(*tags, tags_map, update_tags_action::add_tags);
+    }
+
    builder.set_extensions(schema::extensions_map{{sstring(tags_extension::NAME), ::make_shared<tags_extension>()}});
    schema_ptr schema = builder.build();
    auto where_clause_it = where_clauses.begin();
@@ -928,14 +948,14 @@ future<executor::request_return_type> executor::create_table(client_state& clien

    return create_keyspace(keyspace_name).handle_exception_type([] (exceptions::already_exists_exception&) {
            // Ignore the fact that the keyspace may already exist. See discussion in #6340
-        }).then([this, table_name, request = std::move(request), schema, view_builders = std::move(view_builders)] () mutable {
-        return futurize_invoke([&] { return _mm.announce_new_column_family(schema, false); }).then([this, table_info = std::move(request), schema, view_builders = std::move(view_builders)] () mutable {
+        }).then([this, table_name, request = std::move(request), schema, view_builders = std::move(view_builders), tags_map = std::move(tags_map)] () mutable {
+        return futurize_invoke([&] { return _mm.announce_new_column_family(schema, false); }).then([this, table_info = std::move(request), schema, view_builders = std::move(view_builders), tags_map = std::move(tags_map)] () mutable {
            return parallel_for_each(std::move(view_builders), [this, schema] (schema_builder builder) {
                return _mm.announce_new_view(view_ptr(builder.build()));
-            }).then([this, table_info = std::move(table_info), schema] () mutable {
+            }).then([this, table_info = std::move(table_info), schema, tags_map = std::move(tags_map)] () mutable {
                future<> f = make_ready_future<>();
-                if (rjson::find(table_info, "Tags")) {
-                    f = add_tags(_mm, _proxy, schema, table_info);
+                if (!tags_map.empty()) {
+                    f = update_tags(_mm, schema, std::move(tags_map));
                }
                return f.then([this] {
                    return wait_for_schema_agreement(_mm, db::timeout_clock::now() + 10s);
@@ -963,15 +983,24 @@ class attribute_collector {
    void add(bytes&& name, atomic_cell&& cell) {
        collected.emplace(std::move(name), std::move(cell));
    }
+    void add(const bytes& name, atomic_cell&& cell) {
+        collected.emplace(name, std::move(cell));
+    }
 public:
    attribute_collector() : collected(attrs_type()->get_keys_type()->as_less_comparator()) { }
-    void put(bytes&& name, bytes&& val, api::timestamp_type ts) {
-        add(std::move(name), atomic_cell::make_live(*bytes_type, ts, std::move(val), atomic_cell::collection_member::yes));
+    void put(bytes&& name, const bytes& val, api::timestamp_type ts) {
+        add(std::move(name), atomic_cell::make_live(*bytes_type, ts, val, atomic_cell::collection_member::yes));

    }
+    void put(const bytes& name, const bytes& val, api::timestamp_type ts) {
+        add(name, atomic_cell::make_live(*bytes_type, ts, val, atomic_cell::collection_member::yes));
+    }
    void del(bytes&& name, api::timestamp_type ts) {
        add(std::move(name), atomic_cell::make_dead(ts, gc_clock::now()));
    }
+    void del(const bytes& name, api::timestamp_type ts) {
+        add(name, atomic_cell::make_dead(ts, gc_clock::now()));
+    }
    collection_mutation_description to_mut() {
        collection_mutation_description ret;
        for (auto&& e : collected) {
@@ -1048,7 +1077,7 @@ public:
    put_or_delete_item(const rjson::value& item, schema_ptr schema, put_item);
    // put_or_delete_item doesn't keep a reference to schema (so it can be
    // moved between shards for LWT) so it needs to be given again to build():
-    mutation build(schema_ptr schema, api::timestamp_type ts);
+    mutation build(schema_ptr schema, api::timestamp_type ts) const;
    const partition_key& pk() const { return _pk; }
    const clustering_key& ck() const { return _ck; }
 };
@@ -1077,7 +1106,7 @@ put_or_delete_item::put_or_delete_item(const rjson::value& item, schema_ptr sche
    }
 }

-mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) {
+mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) const {
    mutation m(schema, _pk);
    // If there's no clustering key, a tombstone should be created directly
    // on a partition, not on a clustering row - otherwise it will look like
@@ -1099,7 +1128,7 @@ mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) {
    for (auto& c : *_cells) {
        const column_definition* cdef = schema->get_column_definition(c.column_name);
        if (!cdef) {
-            attrs_collector.put(std::move(c.column_name), std::move(c.value), ts);
+            attrs_collector.put(c.column_name, c.value, ts);
        } else {
            row.cells().apply(*cdef, atomic_cell::make_live(*cdef->type, ts, std::move(c.value)));
        }
@@ -1410,7 +1439,7 @@ public:
               check_needs_read_before_write(_condition_expression) ||
               _returnvalues == returnvalues::ALL_OLD;
    }
-    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) override {
+    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override {
        if (!verify_expected(_request, previous_item.get()) ||
            !verify_condition_expression(_condition_expression, previous_item.get())) {
            // If the update is to be cancelled because of an unfulfilled Expected
@@ -1420,6 +1449,8 @@ public:
        }
        if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
            _return_attributes = std::move(*previous_item);
+        } else {
+            _return_attributes = {};
        }
        return _mutation_builder.build(_schema, ts);
    }
@@ -1493,7 +1524,7 @@ public:
                check_needs_read_before_write(_condition_expression) ||
                _returnvalues == returnvalues::ALL_OLD;
    }
-    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) override {
+    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override {
        if (!verify_expected(_request, previous_item.get()) ||
            !verify_condition_expression(_condition_expression, previous_item.get())) {
            // If the update is to be cancelled because of an unfulfilled Expected
@@ -1503,6 +1534,8 @@ public:
        }
        if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
            _return_attributes = std::move(*previous_item);
+        } else {
+            _return_attributes = {};
        }
        return _mutation_builder.build(_schema, ts);
    }
@@ -1577,7 +1610,7 @@ public:
    virtual ~put_or_delete_item_cas_request() = default;
    virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts) override {
        std::optional<mutation> ret;
-        for (put_or_delete_item& mutation_builder : _mutation_builders) {
+        for (const put_or_delete_item& mutation_builder : _mutation_builders) {
            // We assume all these builders have the same partition.
            if (ret) {
                ret->apply(mutation_builder.build(schema, ts));
@@ -1762,7 +1795,8 @@ static std::string get_item_type_string(const rjson::value& v) {

 // calculate_attrs_to_get() takes either AttributesToGet or
 // ProjectionExpression parameters (having both is *not* allowed),
-// and returns the list of cells we need to read.
+// and returns the list of cells we need to read, or an empty set when
+// *all* attributes are to be returned.
 // In our current implementation, only top-level attributes are stored
 // as cells, and nested documents are stored serialized as JSON.
 // So this function currently returns only the the top-level attributes
@@ -1906,7 +1940,7 @@ public:

    update_item_operation(service::storage_proxy& proxy, rjson::value&& request);
    virtual ~update_item_operation() = default;
-    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) override;
+    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override;
    bool needs_read_before_write() const;
 };

@@ -1984,7 +2018,7 @@ update_item_operation::needs_read_before_write() const {
 }

 std::optional<mutation>
-update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) {
+update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const {
    if (!verify_expected(_request, previous_item.get()) ||
        !verify_condition_expression(_condition_expression, previous_item.get())) {
        // If the update is to be cancelled because of an unfulfilled
@@ -2100,19 +2134,30 @@ update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::t
                    rjson::value v1 = calculate_value(base, calculate_value_caller::UpdateExpression, previous_item.get());
                    rjson::value v2 = calculate_value(addition, calculate_value_caller::UpdateExpression, previous_item.get());
                    rjson::value result;
-                    std::string v1_type = get_item_type_string(v1);
-                    if (v1_type == "N") {
-                        if (get_item_type_string(v2) != "N") {
-                            throw api_error("ValidationException", format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
+                    // An ADD can be used to create a new attribute (when
+                    // v1.IsNull()) or to add to a pre-existing attribute:
+                    if (v1.IsNull()) {
+                        std::string v2_type = get_item_type_string(v2);
+                        if (v2_type == "N" || v2_type == "SS" || v2_type == "NS" || v2_type == "BS") {
+                            result = v2;
+                        } else {
+                            throw api_error("ValidationException", format("An operand in the update expression has an incorrect data type: {}", v2));
                        }
-                        result = number_add(v1, v2);
-                    } else if (v1_type == "SS" || v1_type == "NS" || v1_type == "BS") {
-                        if (get_item_type_string(v2) != v1_type) {
-                            throw api_error("ValidationException", format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
-                        }
-                        result = set_sum(v1, v2);
                    } else {
-                        throw api_error("ValidationException", format("An operand in the update expression has an incorrect data type: {}", v1));
+                        std::string v1_type = get_item_type_string(v1);
+                        if (v1_type == "N") {
+                            if (get_item_type_string(v2) != "N") {
+                                throw api_error("ValidationException", format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
+                            }
+                            result = number_add(v1, v2);
+                        } else if (v1_type == "SS" || v1_type == "NS" || v1_type == "BS") {
+                            if (get_item_type_string(v2) != v1_type) {
+                                throw api_error("ValidationException", format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
+                            }
+                            result = set_sum(v1, v2);
+                        } else {
+                            throw api_error("ValidationException", format("An operand in the update expression has an incorrect data type: {}", v1));
+                        }
                    }
                    do_update(to_bytes(column_name), result);
                },
@@ -2426,6 +2471,10 @@ public:
            std::unordered_set<std::string>& used_attribute_values);
    bool check(const rjson::value& item) const;
    bool filters_on(std::string_view attribute) const;
+    // for_filters_on() runs the given function on the attributes that the
+    // filter works on. It may run for the same attribute more than once if
+    // used more than once in the filter.
+    void for_filters_on(const noncopyable_function<void(std::string_view)>& func) const;
    operator bool() const { return bool(_imp); }
 };

@@ -2506,10 +2555,26 @@ bool filter::filters_on(std::string_view attribute) const {
    }, *_imp);
 }

+void filter::for_filters_on(const noncopyable_function<void(std::string_view)>& func) const {
+    if (_imp) {
+        std::visit(overloaded_functor {
+            [&] (const conditions_filter& f) -> void {
+                for (auto it = f.conditions.MemberBegin(); it != f.conditions.MemberEnd(); ++it) {
+                    func(rjson::to_string_view(it->name));
+                }
+            },
+            [&] (const expression_filter& f) -> void {
+                return for_condition_expression_on(f.expression, func);
+            }
+        }, *_imp);
+    }
+}
+
 class describe_items_visitor {
    typedef std::vector<const column_definition*> columns_t;
    const columns_t& _columns;
    const std::unordered_set<std::string>& _attrs_to_get;
+    std::unordered_set<std::string> _extra_filter_attrs;
    const filter& _filter;
    typename columns_t::const_iterator _column_it;
    rjson::value _item;
@@ -2525,7 +2590,20 @@ public:
            , _item(rjson::empty_object())
            , _items(rjson::empty_array())
            , _scanned_count(0)
-    { }
+    {
+        // _filter.check() may need additional attributes not listed in
+        // _attrs_to_get (i.e., not requested as part of the output).
+        // We list those in _extra_filter_attrs. We will include them in
+        // the JSON but take them out before finally returning the JSON.
+        if (!_attrs_to_get.empty()) {
+            _filter.for_filters_on([&] (std::string_view attr) {
+                std::string a(attr); // no heterogenous maps searches :-(
+                if (!_attrs_to_get.contains(a)) {
+                    _extra_filter_attrs.emplace(std::move(a));
+                }
+            });
+        }
+    }

    void start_row() {
        _column_it = _columns.begin();
@@ -2539,7 +2617,7 @@ public:
        result_bytes_view->with_linearized([this] (bytes_view bv) {
            std::string column_name = (*_column_it)->name_as_text();
            if (column_name != executor::ATTRS_COLUMN_NAME) {
-                if (_attrs_to_get.empty() || _attrs_to_get.count(column_name) > 0) {
+                if (_attrs_to_get.empty() || _attrs_to_get.contains(column_name) || _extra_filter_attrs.contains(column_name)) {
                    if (!_item.HasMember(column_name.c_str())) {
                        rjson::set_with_string_name(_item, column_name, rjson::empty_object());
                    }
@@ -2551,7 +2629,7 @@ public:
                auto keys_and_values = value_cast<map_type_impl::native_type>(deserialized);
                for (auto entry : keys_and_values) {
                    std::string attr_name = value_cast<sstring>(entry.first);
-                    if (_attrs_to_get.empty() || _attrs_to_get.count(attr_name) > 0) {
+                    if (_attrs_to_get.empty() || _attrs_to_get.contains(attr_name) || _extra_filter_attrs.contains(attr_name)) {
                        bytes value = value_cast<bytes>(entry.second);
                        rjson::set_with_string_name(_item, attr_name, deserialize_item(value));
                    }
@@ -2563,6 +2641,11 @@ public:

    void end_row() {
        if (_filter.check(_item)) {
+            // Remove the extra attributes _extra_filter_attrs which we had
+            // to add just for the filter, and not requested to be returned:
+            for (const auto& attr : _extra_filter_attrs) {
+                rjson::remove_member(_item, attr);
+            }
            rjson::push_back(_items, std::move(_item));
        }
        _item = rjson::empty_object();
@@ -2597,7 +2680,7 @@ static rjson::value encode_paging_state(const schema& schema, const service::pag
    for (const column_definition& cdef : schema.partition_key_columns()) {
        rjson::set_with_string_name(last_evaluated_key, std::string_view(cdef.name_as_text()), rjson::empty_object());
        rjson::value& key_entry = last_evaluated_key[cdef.name_as_text()];
-        rjson::set_with_string_name(key_entry, type_to_string(cdef.type), rjson::parse(to_json_string(*cdef.type, *exploded_pk_it)));
+        rjson::set_with_string_name(key_entry, type_to_string(cdef.type), json_key_column_value(*exploded_pk_it, cdef));
        ++exploded_pk_it;
    }
    auto ck = paging_state.get_clustering_key();
@@ -2607,7 +2690,7 @@ static rjson::value encode_paging_state(const schema& schema, const service::pag
        for (const column_definition& cdef : schema.clustering_key_columns()) {
            rjson::set_with_string_name(last_evaluated_key, std::string_view(cdef.name_as_text()), rjson::empty_object());
            rjson::value& key_entry = last_evaluated_key[cdef.name_as_text()];
-            rjson::set_with_string_name(key_entry, type_to_string(cdef.type), rjson::parse(to_json_string(*cdef.type, *exploded_ck_it)));
+            rjson::set_with_string_name(key_entry, type_to_string(cdef.type), json_key_column_value(*exploded_ck_it, cdef));
            ++exploded_ck_it;
        }
    }
--- a/alternator/expressions.cc
+++ b/alternator/expressions.cc
@@ -348,6 +348,39 @@ bool condition_expression_on(const parsed::condition_expression& ce, std::string
    }, ce._expression);
 }

+// for_condition_expression_on() runs a given function over all the attributes
+// mentioned in the expression. If the same attribute is mentioned more than
+// once, the function will be called more than once for the same attribute.
+
+static void for_value_on(const parsed::value& v, const noncopyable_function<void(std::string_view)>& func) {
+    std::visit(overloaded_functor {
+        [&] (const parsed::constant& c) { },
+        [&] (const parsed::value::function_call& f) {
+            for (const parsed::value& value : f._parameters) {
+                for_value_on(value, func);
+            }
+        },
+        [&] (const parsed::path& p) {
+            func(p.root());
+        }
+    }, v._value);
+}
+
+void for_condition_expression_on(const parsed::condition_expression& ce, const noncopyable_function<void(std::string_view)>& func) {
+    std::visit(overloaded_functor {
+        [&] (const parsed::primitive_condition& cond) {
+            for (const parsed::value& value : cond._values) {
+                for_value_on(value, func);
+            }
+        },
+        [&] (const parsed::condition_expression::condition_list& list) {
+            for (const parsed::condition_expression& cond : list.conditions) {
+                for_condition_expression_on(cond, func);
+            }
+        }
+    }, ce._expression);
+}
+
 // The following calculate_value() functions calculate, or evaluate, a parsed
 // expression. The parsed expression is assumed to have been "resolved", with
 // the matching resolve_* function.
@@ -570,52 +603,8 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
            }
            rjson::value v1 = calculate_value(f._parameters[0], caller, previous_item);
            rjson::value v2 = calculate_value(f._parameters[1], caller, previous_item);
-            // TODO: There's duplication here with check_BEGINS_WITH().
-            // But unfortunately, the two functions differ a bit.
-
-            // If one of v1 or v2 is malformed or has an unsupported type
-            // (not B or S), what we do depends on whether it came from
-            // the user's query (is_constant()), or the item. Unsupported
-            // values in the query result in an error, but if they are in
-            // the item, we silently return false (no match).
-            bool bad = false;
-            if (!v1.IsObject() || v1.MemberCount() != 1) {
-                bad = true;
-                if (f._parameters[0].is_constant()) {
-                    throw api_error("ValidationException", format("{}: begins_with() encountered malformed AttributeValue: {}", caller, v1));
-                }
-            } else if (v1.MemberBegin()->name != "S" && v1.MemberBegin()->name != "B") {
-                bad = true;
-                if (f._parameters[0].is_constant()) {
-                    throw api_error("ValidationException", format("{}: begins_with() supports only string or binary in AttributeValue: {}", caller, v1));
-                }
-            }
-            if (!v2.IsObject() || v2.MemberCount() != 1) {
-                bad = true;
-                if (f._parameters[1].is_constant()) {
-                    throw api_error("ValidationException", format("{}: begins_with() encountered malformed AttributeValue: {}", caller, v2));
-                }
-            } else if (v2.MemberBegin()->name != "S" && v2.MemberBegin()->name != "B") {
-                bad = true;
-                if (f._parameters[1].is_constant()) {
-                    throw api_error("ValidationException", format("{}: begins_with() supports only string or binary in AttributeValue: {}", caller, v2));
-                }
-            }
-            bool ret = false;
-            if (!bad) {
-                auto it1 = v1.MemberBegin();
-                auto it2 = v2.MemberBegin();
-                if (it1->name == it2->name) {
-                    if (it2->name == "S") {
-                        std::string_view val1 = rjson::to_string_view(it1->value);
-                        std::string_view val2 = rjson::to_string_view(it2->value);
-                        ret = val1.starts_with(val2);
-                    } else /* it2->name == "B" */ {
-                        ret = base64_begins_with(rjson::to_string_view(it1->value), rjson::to_string_view(it2->value));
-                    }
-                }
-            }
-            return to_bool_json(ret);
+            return to_bool_json(check_BEGINS_WITH(v1.IsNull() ? nullptr : &v1,  v2,
+                                    f._parameters[0].is_constant(), f._parameters[1].is_constant()));
        }
    },
    {"contains", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
--- a/alternator/expressions.hh
+++ b/alternator/expressions.hh
@@ -27,6 +27,8 @@
 #include <unordered_set>
 #include <string_view>

+#include <seastar/util/noncopyable_function.hh>
+
 #include "expressions_types.hh"
 #include "rjson.hh"

@@ -59,6 +61,11 @@ void validate_value(const rjson::value& v, const char* caller);

 bool condition_expression_on(const parsed::condition_expression& ce, std::string_view attribute);

+// for_condition_expression_on() runs the given function on the attributes
+// that the expression uses. It may run for the same attribute more than once
+// if the same attribute is used more than once in the expression.
+void for_condition_expression_on(const parsed::condition_expression& ce, const noncopyable_function<void(std::string_view)>& func);
+
 // calculate_value() behaves slightly different (especially, different
 // functions supported) when used in different types of expressions, as
 // enumerated in this enum:
--- a/alternator/rmw_operation.hh
+++ b/alternator/rmw_operation.hh
@@ -87,7 +87,11 @@ protected:
    // When _returnvalues != NONE, apply() should store here, in JSON form,
    // the values which are to be returned in the "Attributes" field.
    // The default null JSON means do not return an Attributes field at all.
-    rjson::value _return_attributes;
+    // This field is marked "mutable" so that the const apply() can modify
+    // it (see explanation below), but note that because apply() may be
+    // called more than once, if apply() will sometimes set this field it
+    // must set it (even if just to the default empty value) every time.
+    mutable rjson::value _return_attributes;
 public:
    // The constructor of a rmw_operation subclass should parse the request
    // and try to discover as many input errors as it can before really
@@ -100,7 +104,12 @@ public:
    // conditional expression, apply() should return an empty optional.
    // apply() may throw if it encounters input errors not discovered during
    // the constructor.
-    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) = 0;
+    // apply() may be called more than once in case of contention, so it must
+    // not change the state saved in the object (issue #7218 was caused by
+    // violating this). We mark apply() "const" to let the compiler validate
+    // this for us. The output-only field _return_attributes is marked
+    // "mutable" above so that apply() can still write to it.
+    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const = 0;
    // Convert the above apply() into the signature needed by cas_request:
    virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts) override;
    virtual ~rmw_operation() = default;
--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -650,7 +650,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_bloom_filter_disk_space_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_size();
+                return s + sst->filter_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -658,7 +658,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_bloom_filter_disk_space_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_size();
+                return s + sst->filter_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -666,7 +666,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_bloom_filter_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_memory_size();
+                return s + sst->filter_memory_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -674,7 +674,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_bloom_filter_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_memory_size();
+                return s + sst->filter_memory_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -682,7 +682,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_index_summary_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->get_summary().memory_footprint();
+                return s + sst->get_summary().memory_footprint();
            });
        }, std::plus<uint64_t>());
    });
@@ -690,7 +690,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_index_summary_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->get_summary().memory_footprint();
+                return s + sst->get_summary().memory_footprint();
            });
        }, std::plus<uint64_t>());
    });
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -322,8 +322,8 @@ void set_storage_service(http_context& ctx, routes& r) {
                for (auto cf : column_families) {
                    column_families_vec.push_back(&db.find_column_family(keyspace, cf));
                }
-                return parallel_for_each(column_families_vec, [&cm] (column_family* cf) {
-                    return cm.perform_cleanup(cf);
+                return parallel_for_each(column_families_vec, [&cm, &db] (column_family* cf) {
+                    return cm.perform_cleanup(db, cf);
                });
            }).then([]{
                return make_ready_future<json::json_return_type>(0);
--- a/configure.py
+++ b/configure.py
@@ -386,6 +386,7 @@ scylla_tests = set([
    'test/boost/view_schema_ckey_test',
    'test/boost/vint_serialization_test',
    'test/boost/virtual_reader_test',
+    'test/boost/stall_free_test',
    'test/manual/ec2_snitch_test',
    'test/manual/gce_snitch_test',
    'test/manual/gossip',
--- a/cql3/functions/aggregate_fcts.cc
+++ b/cql3/functions/aggregate_fcts.cc
@@ -267,10 +267,13 @@ public:
    }
 };

-/// The same as `impl_max_function_for' but without knowledge of `Type'.
+/// The same as `impl_max_function_for' but without compile-time dependency on `Type'.
 class impl_max_dynamic_function final : public aggregate_function::aggregate {
+    data_type _io_type;
    opt_bytes _max;
 public:
+    impl_max_dynamic_function(data_type io_type) : _io_type(std::move(io_type)) {}
+
    virtual void reset() override {
        _max = {};
    }
@@ -278,12 +281,11 @@ public:
        return _max.value_or(bytes{});
    }
    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
-        if (!values[0]) {
+        if (values.empty() || !values[0]) {
            return;
        }
-        const auto val = *values[0];
-        if (!_max || *_max < val) {
-            _max = val;
+        if (!_max || _io_type->less(*_max, *values[0])) {
+            _max = values[0];
        }
    }
 };
@@ -298,10 +300,13 @@ public:
 };

 class max_dynamic_function final : public native_aggregate_function {
+    data_type _io_type;
 public:
-    max_dynamic_function(data_type io_type) : native_aggregate_function("max", io_type, { io_type }) {}
+    max_dynamic_function(data_type io_type)
+            : native_aggregate_function("max", io_type, { io_type })
+            , _io_type(std::move(io_type)) {}
    virtual std::unique_ptr<aggregate> new_aggregate() override {
-        return std::make_unique<impl_max_dynamic_function>();
+        return std::make_unique<impl_max_dynamic_function>(_io_type);
    }
 };

@@ -358,10 +363,13 @@ public:
    }
 };

-/// The same as `impl_min_function_for' but without knowledge of `Type'.
+/// The same as `impl_min_function_for' but without compile-time dependency on `Type'.
 class impl_min_dynamic_function final : public aggregate_function::aggregate {
+    data_type _io_type;
    opt_bytes _min;
 public:
+    impl_min_dynamic_function(data_type io_type) : _io_type(std::move(io_type)) {}
+
    virtual void reset() override {
        _min = {};
    }
@@ -369,12 +377,11 @@ public:
        return _min.value_or(bytes{});
    }
    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
-        if (!values[0]) {
+        if (values.empty() || !values[0]) {
            return;
        }
-        const auto val = *values[0];
-        if (!_min || val < *_min) {
-            _min = val;
+        if (!_min || _io_type->less(*values[0], *_min)) {
+            _min = values[0];
        }
    }
 };
@@ -389,10 +396,13 @@ public:
 };

 class min_dynamic_function final : public native_aggregate_function {
+    data_type _io_type;
 public:
-    min_dynamic_function(data_type io_type) : native_aggregate_function("min", io_type, { io_type }) {}
+    min_dynamic_function(data_type io_type)
+            : native_aggregate_function("min", io_type, { io_type })
+            , _io_type(std::move(io_type)) {}
    virtual std::unique_ptr<aggregate> new_aggregate() override {
-        return std::make_unique<impl_min_dynamic_function>();
+        return std::make_unique<impl_min_dynamic_function>(_io_type);
    }
 };

--- a/cql3/functions/castas_fcts.cc
+++ b/cql3/functions/castas_fcts.cc
@@ -88,16 +88,13 @@ static data_value castas_fctn_simple(data_value from) {
 template<typename ToType>
 static data_value castas_fctn_from_decimal_to_float(data_value from) {
    auto val_from = value_cast<big_decimal>(from);
-    boost::multiprecision::cpp_int ten(10);
-    boost::multiprecision::cpp_rational r = val_from.unscaled_value();
-    r /= boost::multiprecision::pow(ten, val_from.scale());
-    return static_cast<ToType>(r);
+    return static_cast<ToType>(val_from.as_rational());
 }

 static utils::multiprecision_int from_decimal_to_cppint(const data_value& from) {
    const auto& val_from = value_cast<big_decimal>(from);
-    boost::multiprecision::cpp_int ten(10);
-    return boost::multiprecision::cpp_int(val_from.unscaled_value() / boost::multiprecision::pow(ten, val_from.scale()));
+    auto r = val_from.as_rational();
+    return utils::multiprecision_int(numerator(r)/denominator(r));
 }

 template<typename ToType>
--- a/cql3/lists.cc
+++ b/cql3/lists.cc
@@ -357,7 +357,12 @@ lists::setter_by_uuid::execute(mutation& m, const clustering_key_prefix& prefix,

    collection_mutation_description mut;
    mut.cells.reserve(1);
-    mut.cells.emplace_back(to_bytes(*index), params.make_cell(*ltype->value_comparator(), *value, atomic_cell::collection_member::yes));
+
+    if (!value) {
+        mut.cells.emplace_back(to_bytes(*index), params.make_dead_cell());
+    } else {
+        mut.cells.emplace_back(to_bytes(*index), params.make_cell(*ltype->value_comparator(), *value, atomic_cell::collection_member::yes));
+    }

    m.set_cell(prefix, column, mut.serialize(*ltype));
 }
--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -417,7 +417,7 @@ std::vector<const column_definition*> statement_restrictions::get_column_defs_fo
                    _clustering_columns_restrictions->num_prefix_columns_that_need_not_be_filtered();
            for (auto&& cdef : _clustering_columns_restrictions->get_column_defs()) {
                ::shared_ptr<single_column_restriction> restr;
-                if (single_pk_restrs) {
+                if (single_ck_restrs) {
                    auto it = single_ck_restrs->restrictions().find(cdef);
                    if (it != single_ck_restrs->restrictions().end()) {
                        restr = dynamic_pointer_cast<single_column_restriction>(it->second);
@@ -688,6 +688,11 @@ static query::range<bytes_view> to_range(const term_slice& slice, const query_op
        extract_bound(statements::bound::END));
 }

+static bool contains_without_wraparound(
+        const query::range<bytes_view>& range, bytes_view value, const serialized_tri_compare& cmp) {
+    return !range.is_wrap_around(cmp) && range.contains(value, cmp);
+}
+
 bool single_column_restriction::slice::is_satisfied_by(const schema& schema,
        const partition_key& key,
        const clustering_key_prefix& ckey,
@@ -702,13 +707,13 @@ bool single_column_restriction::slice::is_satisfied_by(const schema& schema,
        return false;
    }
    return cell_value->with_linearized([&] (bytes_view cell_value_bv) {
-        return to_range(_slice, options, _column_def.name_as_text()).contains(
+        return contains_without_wraparound(to_range(_slice, options, _column_def.name_as_text()),
                cell_value_bv, _column_def.type->as_tri_comparator());
    });
 }

 bool single_column_restriction::slice::is_satisfied_by(bytes_view data, const query_options& options) const {
-    return to_range(_slice, options, _column_def.name_as_text()).contains(
+    return contains_without_wraparound(to_range(_slice, options, _column_def.name_as_text()),
            data, _column_def.type->underlying_type()->as_tri_comparator());
 }

--- a/cql3/statements/alter_table_statement.cc
+++ b/cql3/statements/alter_table_statement.cc
@@ -207,6 +207,9 @@ void alter_table_statement::add_column(const schema& schema, const table& cf, sc
                "because a collection with the same name and a different type has already been used in the past", column_name));
        }
    }
+    if (type->is_counter() && !schema.is_counter()) {
+        throw exceptions::configuration_exception(format("Cannot add a counter column ({}) in a non counter column family", column_name));
+    }

    cfm.with_column(column_name.name(), type, is_static ? column_kind::static_column : column_kind::regular_column);

@@ -222,7 +225,7 @@ void alter_table_statement::add_column(const schema& schema, const table& cf, sc
            schema_builder builder(view);
            if (view->view_info()->include_all_columns()) {
                builder.with_column(column_name.name(), type);
-            } else if (view->view_info()->base_non_pk_columns_in_view_pk().empty()) {
+            } else if (!view->view_info()->has_base_non_pk_columns_in_view_pk()) {
                db::view::create_virtual_column(builder, column_name.name(), type);
            }
            view_updates.push_back(view_ptr(builder.build()));
--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -59,6 +59,7 @@
 #include "db/timeout_clock.hh"
 #include "db/consistency_level_validations.hh"
 #include "database.hh"
+#include "test/lib/select_statement_utils.hh"
 #include <boost/algorithm/cxx11/any_of.hpp>

 bool is_system_keyspace(const sstring& name);
@@ -67,6 +68,8 @@ namespace cql3 {

 namespace statements {

+static constexpr int DEFAULT_INTERNAL_PAGING_SIZE = select_statement::DEFAULT_COUNT_PAGE_SIZE;
+thread_local int internal_paging_size = DEFAULT_INTERNAL_PAGING_SIZE;
 thread_local const lw_shared_ptr<const select_statement::parameters> select_statement::_default_parameters = make_lw_shared<select_statement::parameters>();

 select_statement::parameters::parameters()
@@ -333,7 +336,7 @@ select_statement::do_execute(service::storage_proxy& proxy,
    const bool aggregate = _selection->is_aggregate() || has_group_by();
    const bool nonpaged_filtering = restrictions_need_filtering && page_size <= 0;
    if (aggregate || nonpaged_filtering) {
-        page_size = DEFAULT_COUNT_PAGE_SIZE;
+        page_size = internal_paging_size;
    }

    auto key_ranges = _restrictions->get_partition_key_ranges(options);
@@ -530,13 +533,29 @@ indexed_table_select_statement::do_execute_base_query(
            if (old_paging_state && concurrency == 1) {
                auto base_pk = generate_base_key_from_index_pk<partition_key>(old_paging_state->get_partition_key(),
                        old_paging_state->get_clustering_key(), *_schema, *_view_schema);
+                auto row_ranges = command->slice.default_row_ranges();
                if (old_paging_state->get_clustering_key() && _schema->clustering_key_size() > 0) {
                    auto base_ck = generate_base_key_from_index_pk<clustering_key>(old_paging_state->get_partition_key(),
                            old_paging_state->get_clustering_key(), *_schema, *_view_schema);
-                    command->slice.set_range(*_schema, base_pk,
-                            std::vector<query::clustering_range>{query::clustering_range::make_starting_with(range_bound<clustering_key>(base_ck, false))});
+
+                    query::trim_clustering_row_ranges_to(*_schema, row_ranges, base_ck, false);
+                    command->slice.set_range(*_schema, base_pk, row_ranges);
                } else {
-                    command->slice.set_range(*_schema, base_pk, std::vector<query::clustering_range>{query::clustering_range::make_open_ended_both_sides()});
+                    // There is no clustering key in old_paging_state and/or no clustering key in 
+                    // _schema, therefore read an entire partition (whole clustering range).
+                    //
+                    // The only exception to applying no restrictions on clustering key
+                    // is a case when we have a secondary index on the first column
+                    // of clustering key. In such a case we should not read the
+                    // entire clustering range - only a range in which first column
+                    // of clustering key has the correct value. 
+                    //
+                    // This means that we should not set a open_ended_both_sides
+                    // clustering range on base_pk, instead intersect it with
+                    // _row_ranges (which contains the restrictions neccessary for the
+                    // case described above). The result of such intersection is just
+                    // _row_ranges, which we explicity set on base_pk.
+                    command->slice.set_range(*_schema, base_pk, row_ranges);
                }
            }
            concurrency *= 2;
@@ -974,12 +993,16 @@ indexed_table_select_statement::do_execute(service::storage_proxy& proxy,
    const bool aggregate = _selection->is_aggregate() || has_group_by();
    if (aggregate) {
        const bool restrictions_need_filtering = _restrictions->need_filtering();
-        return do_with(cql3::selection::result_set_builder(*_selection, now, options.get_cql_serialization_format()), std::make_unique<cql3::query_options>(cql3::query_options(options)),
+        return do_with(cql3::selection::result_set_builder(*_selection, now, options.get_cql_serialization_format(), *_group_by_cell_indices), std::make_unique<cql3::query_options>(cql3::query_options(options)),
                [this, &options, &proxy, &state, now, whole_partitions, partition_slices, restrictions_need_filtering] (cql3::selection::result_set_builder& builder, std::unique_ptr<cql3::query_options>& internal_options) {
            // page size is set to the internal count page size, regardless of the user-provided value
-            internal_options.reset(new cql3::query_options(std::move(internal_options), options.get_paging_state(), DEFAULT_COUNT_PAGE_SIZE));
+            internal_options.reset(new cql3::query_options(std::move(internal_options), options.get_paging_state(), internal_paging_size));
            return repeat([this, &builder, &options, &internal_options, &proxy, &state, now, whole_partitions, partition_slices, restrictions_need_filtering] () {
-                auto consume_results = [this, &builder, &options, &internal_options, restrictions_need_filtering] (foreign_ptr<lw_shared_ptr<query::result>> results, lw_shared_ptr<query::read_command> cmd) {
+                auto consume_results = [this, &builder, &options, &internal_options, &proxy, &state, restrictions_need_filtering] (foreign_ptr<lw_shared_ptr<query::result>> results, lw_shared_ptr<query::read_command> cmd, lw_shared_ptr<const service::pager::paging_state> paging_state) {
+                    if (paging_state) {
+                        paging_state = generate_view_paging_state_from_base_query_results(paging_state, results, proxy, state, options);
+                    }
+                    internal_options.reset(new cql3::query_options(std::move(internal_options), paging_state ? make_lw_shared<service::pager::paging_state>(*paging_state) : nullptr));
                    if (restrictions_need_filtering) {
                        _stats.filtered_rows_read_total += *results->row_count();
                        query::result_view::consume(*results, cmd->slice, cql3::selection::result_set_builder::visitor(builder, *_schema, *_selection,
@@ -987,24 +1010,24 @@ indexed_table_select_statement::do_execute(service::storage_proxy& proxy,
                    } else {
                        query::result_view::consume(*results, cmd->slice, cql3::selection::result_set_builder::visitor(builder, *_schema, *_selection));
                    }
+                    bool has_more_pages = paging_state && paging_state->get_remaining() > 0;
+                    return stop_iteration(!has_more_pages);
                };

                if (whole_partitions || partition_slices) {
                    return find_index_partition_ranges(proxy, state, *internal_options).then(
                            [this, now, &state, &internal_options, &proxy, consume_results = std::move(consume_results)] (dht::partition_range_vector partition_ranges, lw_shared_ptr<const service::pager::paging_state> paging_state) {
-                        bool has_more_pages = paging_state && paging_state->get_remaining() > 0;
-                        internal_options.reset(new cql3::query_options(std::move(internal_options), paging_state ? make_lw_shared<service::pager::paging_state>(*paging_state) : nullptr));
-                        return do_execute_base_query(proxy, std::move(partition_ranges), state, *internal_options, now, std::move(paging_state)).then(consume_results).then([has_more_pages] {
-                            return stop_iteration(!has_more_pages);
+                        return do_execute_base_query(proxy, std::move(partition_ranges), state, *internal_options, now, paging_state)
+                        .then([paging_state, consume_results = std::move(consume_results)](foreign_ptr<lw_shared_ptr<query::result>> results, lw_shared_ptr<query::read_command> cmd) {
+                            return consume_results(std::move(results), std::move(cmd), std::move(paging_state));
                        });
                    });
                } else {
                    return find_index_clustering_rows(proxy, state, *internal_options).then(
                            [this, now, &state, &internal_options, &proxy, consume_results = std::move(consume_results)] (std::vector<primary_key> primary_keys, lw_shared_ptr<const service::pager::paging_state> paging_state) {
-                        bool has_more_pages = paging_state && paging_state->get_remaining() > 0;
-                        internal_options.reset(new cql3::query_options(std::move(internal_options), paging_state ? make_lw_shared<service::pager::paging_state>(*paging_state) : nullptr));
-                        return this->do_execute_base_query(proxy, std::move(primary_keys), state, *internal_options, now, std::move(paging_state)).then(consume_results).then([has_more_pages] {
-                            return stop_iteration(!has_more_pages);
+                        return this->do_execute_base_query(proxy, std::move(primary_keys), state, *internal_options, now, paging_state)
+                        .then([paging_state, consume_results = std::move(consume_results)](foreign_ptr<lw_shared_ptr<query::result>> results, lw_shared_ptr<query::read_command> cmd) {
+                            return consume_results(std::move(results), std::move(cmd), std::move(paging_state));
                        });
                    });
                }
@@ -1661,6 +1684,16 @@ std::vector<size_t> select_statement::prepare_group_by(const schema& schema, sel

 }

+future<> set_internal_paging_size(int paging_size) {
+    return seastar::smp::invoke_on_all([paging_size] {
+        internal_paging_size = paging_size;
+    });
+}
+
+future<> reset_internal_paging_size() {
+    return set_internal_paging_size(DEFAULT_INTERNAL_PAGING_SIZE);
+}
+
 }

 namespace util {
--- a/database.cc
+++ b/database.cc
@@ -552,14 +552,14 @@ database::~database() {
 }

 void database::update_version(const utils::UUID& version) {
-    if (_version != version) {
+    if (_version.get() != version) {
        _schema_change_count++;
    }
-    _version = version;
+    _version.set(version);
 }

 const utils::UUID& database::get_version() const {
-    return _version;
+    return _version.get();
 }

 static future<>
@@ -767,7 +767,7 @@ future<> database::drop_column_family(const sstring& ks_name, const sstring& cf_
    remove(*cf);
    cf->clear_views();
    auto& ks = find_keyspace(ks_name);
-    return when_all_succeed(cf->await_pending_writes(), cf->await_pending_reads()).then_unpack([this, &ks, cf, tsf = std::move(tsf), snapshot] {
+    return cf->await_pending_ops().then([this, &ks, cf, tsf = std::move(tsf), snapshot] {
        return truncate(ks, *cf, std::move(tsf), snapshot).finally([this, cf] {
            return cf->stop();
        });
@@ -1851,7 +1851,11 @@ future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_fun
                            // TODO: indexes.
                            // Note: since discard_sstables was changed to only count tables owned by this shard,
                            // we can get zero rp back. Changed assert, and ensure we save at least low_mark.
-                            assert(low_mark <= rp || rp == db::replay_position());
+                            // #6995 - the assert below was broken in c2c6c71 and remained so for many years. 
+                            // We nowadays do not flush tables with sstables but autosnapshot=false. This means
+                            // the low_mark assertion does not hold, because we maybe/probably never got around to 
+                            // creating the sstables that would create them.
+                            assert(!should_flush || low_mark <= rp || rp == db::replay_position());
                            rp = std::max(low_mark, rp);
                            return truncate_views(cf, truncated_at, should_flush).then([&cf, truncated_at, rp] {
                                // save_truncation_record() may actually fail after we cached the truncation time
@@ -1957,31 +1961,6 @@ future<> database::clear_snapshot(sstring tag, std::vector<sstring> keyspace_nam
    });
 }

-future<utils::UUID> update_schema_version(distributed<service::storage_proxy>& proxy, schema_features features)
-{
-    return db::schema_tables::calculate_schema_digest(proxy, features).then([&proxy] (utils::UUID uuid) {
-        return proxy.local().get_db().invoke_on_all([uuid] (database& db) {
-            db.update_version(uuid);
-        }).then([uuid] {
-            return db::system_keyspace::update_schema_version(uuid);
-        }).then([uuid] {
-            dblog.info("Schema version changed to {}", uuid);
-            return uuid;
-        });
-    });
-}
-
-future<> announce_schema_version(utils::UUID schema_version) {
-    return service::migration_manager::passive_announce(schema_version);
-}
-
-future<> update_schema_version_and_announce(distributed<service::storage_proxy>& proxy, schema_features features)
-{
-    return update_schema_version(proxy, features).then([] (utils::UUID uuid) {
-        return announce_schema_version(uuid);
-    });
-}
-
 std::ostream& operator<<(std::ostream& os, const user_types_metadata& m) {
    os << "org.apache.cassandra.config.UTMetaData@" << &m;
    return os;
--- a/database.hh
+++ b/database.hh
@@ -55,6 +55,7 @@
 #include <limits>
 #include <cstddef>
 #include "schema_fwd.hh"
+#include "db/view/view.hh"
 #include "db/schema_features.hh"
 #include "gms/feature.hh"
 #include "timestamp.hh"
@@ -95,6 +96,7 @@
 #include "user_types_metadata.hh"
 #include "query_class_config.hh"
 #include "absl-flat_hash_map.hh"
+#include "utils/updateable_value.hh"

 class cell_locker;
 class cell_locker_stats;
@@ -539,6 +541,8 @@ private:
    utils::phased_barrier _pending_reads_phaser;
    // Corresponding phaser for in-progress streams
    utils::phased_barrier _pending_streams_phaser;
+    // Corresponding phaser for in-progress flushes
+    utils::phased_barrier _pending_flushes_phaser;

    // This field cashes the last truncation time for the table.
    // The master resides in system.truncated table
@@ -901,7 +905,7 @@ public:
    lw_shared_ptr<const sstable_list> get_sstables_including_compacted_undeleted() const;
    const std::vector<sstables::shared_sstable>& compacted_undeleted_sstables() const;
    std::vector<sstables::shared_sstable> select_sstables(const dht::partition_range& range) const;
-    std::vector<sstables::shared_sstable> candidates_for_compaction() const;
+    std::vector<sstables::shared_sstable> non_staging_sstables() const;
    std::vector<sstables::shared_sstable> sstables_need_rewrite() const;
    size_t sstables_count() const;
    std::vector<uint64_t> sstable_count_per_level() const;
@@ -984,6 +988,14 @@ public:
        return _pending_streams_phaser.advance_and_await();
    }

+    future<> await_pending_flushes() {
+        return _pending_flushes_phaser.advance_and_await();
+    }
+
+    future<> await_pending_ops() {
+        return when_all(await_pending_reads(), await_pending_writes(), await_pending_streams(), await_pending_flushes()).discard_result();
+    }
+
    void add_or_update_view(view_ptr v);
    void remove_view(view_ptr v);
    void clear_views();
@@ -1008,8 +1020,9 @@ public:
        return *_config.sstables_manager;
    }

+    // Reader's schema must be the same as the base schema of each of the views.
    future<> populate_views(
-            std::vector<view_ptr>,
+            std::vector<db::view::view_and_base>,
            dht::token base_token,
            flat_mutation_reader&&,
            gc_clock::time_point);
@@ -1027,7 +1040,7 @@ private:
            tracing::trace_state_ptr tr_state, reader_concurrency_semaphore& sem, const io_priority_class& io_priority, query::partition_slice::option_set custom_opts) const;
    std::vector<view_ptr> affected_views(const schema_ptr& base, const mutation& update, gc_clock::time_point now) const;
    future<> generate_and_propagate_view_updates(const schema_ptr& base,
-            std::vector<view_ptr>&& views,
+            std::vector<db::view::view_and_base>&& views,
            mutation&& m,
            flat_mutation_reader_opt existings,
            tracing::trace_state_ptr tr_state,
@@ -1099,6 +1112,10 @@ flat_mutation_reader make_local_shard_sstable_reader(schema_ptr s,
        mutation_reader::forwarding fwd_mr,
        sstables::read_monitor_generator& monitor_generator = sstables::default_read_monitor_generator());

+/// Read a range from the passed-in sstables.
+///
+/// The reader is unrestricted, but will account its resource usage on the
+/// semaphore belonging to the passed-in permit.
 flat_mutation_reader make_range_sstable_reader(schema_ptr s,
        reader_permit permit,
        lw_shared_ptr<sstables::sstable_set> sstables,
@@ -1110,6 +1127,21 @@ flat_mutation_reader make_range_sstable_reader(schema_ptr s,
        mutation_reader::forwarding fwd_mr,
        sstables::read_monitor_generator& monitor_generator = sstables::default_read_monitor_generator());

+/// Read a range from the passed-in sstables.
+///
+/// The reader is restricted, that is it will wait for admission on the semaphore
+/// belonging to the passed-in permit, before starting to read.
+flat_mutation_reader make_restricted_range_sstable_reader(schema_ptr s,
+        reader_permit permit,
+        lw_shared_ptr<sstables::sstable_set> sstables,
+        const dht::partition_range& pr,
+        const query::partition_slice& slice,
+        const io_priority_class& pc,
+        tracing::trace_state_ptr trace_state,
+        streamed_mutation::forwarding fwd,
+        mutation_reader::forwarding fwd_mr,
+        sstables::read_monitor_generator& monitor_generator = sstables::default_read_monitor_generator());
+
 class user_types_metadata;

 class keyspace_metadata final {
@@ -1376,7 +1408,7 @@ private:
        flat_hash_map<std::pair<sstring, sstring>, utils::UUID, utils::tuple_hash, string_pair_eq>;
    ks_cf_to_uuid_t _ks_cf_to_uuid;
    std::unique_ptr<db::commitlog> _commitlog;
-    utils::UUID _version;
+    utils::updateable_value_source<utils::UUID> _version;
    uint32_t _schema_change_count = 0;
    // compaction_manager object is referenced by all column families of a database.
    std::unique_ptr<compaction_manager> _compaction_manager;
@@ -1447,6 +1479,7 @@ public:
    void update_version(const utils::UUID& version);

    const utils::UUID& get_version() const;
+    utils::observable<utils::UUID>& observable_schema_version() const { return _version.as_observable(); }

    db::commitlog* commitlog() const {
        return _commitlog.get();
@@ -1664,10 +1697,6 @@ future<> stop_database(sharded<database>& db);
 flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db, schema_ptr schema,
        std::function<std::optional<dht::partition_range>()> range_generator);

-future<utils::UUID> update_schema_version(distributed<service::storage_proxy>& proxy, db::schema_features);
-future<> announce_schema_version(utils::UUID schema_version);
-future<> update_schema_version_and_announce(distributed<service::storage_proxy>& proxy, db::schema_features);
-
 bool is_internal_keyspace(const sstring& name);

 #endif /* DATABASE_HH_ */
--- a/db/batchlog_manager.cc
+++ b/db/batchlog_manager.cc
@@ -290,7 +290,7 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
            mutation m(schema, key);
            auto now = service::client_state(service::client_state::internal_tag()).get_timestamp();
            m.partition().apply_delete(*schema, clustering_key_prefix::make_empty(), tombstone(now, gc_clock::now()));
-            return _qp.proxy().mutate_locally(m, tracing::trace_state_ptr());
+            return _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
        });
    };

--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -521,7 +521,7 @@ public:
            _segment_manager->totals.total_size_on_disk -= size_on_disk();
            _segment_manager->totals.total_size -= (size_on_disk() + _buffer.size_bytes());
            _segment_manager->add_file_to_delete(_file_name, _desc);
-        } else {
+        } else if (_segment_manager->cfg.warn_about_segments_left_on_disk_after_shutdown) {
            clogger.warn("Segment {} is dirty and is left on disk.", *this);
        }
    }
--- a/db/commitlog/commitlog.hh
+++ b/db/commitlog/commitlog.hh
@@ -137,6 +137,7 @@ public:

        bool reuse_segments = true;
        bool use_o_dsync = false;
+        bool warn_about_segments_left_on_disk_after_shutdown = true;

        const db::extensions * extensions = nullptr;
    };
--- a/db/commitlog/commitlog_replayer.cc
+++ b/db/commitlog/commitlog_replayer.cc
@@ -304,7 +304,7 @@ future<> db::commitlog_replayer::impl::process(stats* s, commitlog::buffer_and_r
                mutation m(cf.schema(), fm.decorated_key(*cf.schema()));
                converting_mutation_partition_applier v(cm, *cf.schema(), m.partition());
                fm.partition().accept(cm, v);
-                return do_with(std::move(m), [&db, &cf] (mutation m) {
+                return do_with(std::move(m), [&db, &cf] (const mutation& m) {
                    return db.apply_in_memory(m, cf, db::rp_handle(), db::no_timeout);
                });
            } else {
--- a/db/config.cc
+++ b/db/config.cc
@@ -662,6 +662,15 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "\tpriority_string : GnuTLS priority string controlling TLS algorithms used/allowed.\n"
        "\trequire_client_auth : (Default: false ) Enables or disables certificate authentication.\n"
        "Related information: Client-to-node encryption")
+    , alternator_encryption_options(this, "alternator_encryption_options", value_status::Used, {/*none*/},
+        "When Alternator via HTTPS is enabled with alternator_https_port, where to take the key and certificate. The available options are:\n"
+        "\n"
+        "\tcertificate: (Default: conf/scylla.crt) The location of a PEM-encoded x509 certificate used to identify and encrypt the client/server communication.\n"
+        "\tkeyfile: (Default: conf/scylla.key) PEM Key file associated with certificate.\n"
+        "\n"
+        "The advanced settings are:\n"
+        "\n"
+        "\tpriority_string : GnuTLS priority string controlling TLS algorithms used/allowed.")
    , ssl_storage_port(this, "ssl_storage_port", value_status::Used, 7001,
        "The SSL port for encrypted communication. Unused unless enabled in encryption_options.")
    , enable_in_memory_data_store(this, "enable_in_memory_data_store", value_status::Used, false, "Enable in memory mode (system tables are always persisted)")
@@ -681,7 +690,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , replace_address(this, "replace_address", value_status::Used, "", "The listen_address or broadcast_address of the dead node to replace. Same as -Dcassandra.replace_address.")
    , replace_address_first_boot(this, "replace_address_first_boot", value_status::Used, "", "Like replace_address option, but if the node has been bootstrapped successfully it will be ignored. Same as -Dcassandra.replace_address_first_boot.")
    , override_decommission(this, "override_decommission", value_status::Used, false, "Set true to force a decommissioned node to join the cluster")
-    , enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, true, "Set true to use enable repair based node operations instead of streaming based")
+    , enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, false, "Set true to use enable repair based node operations instead of streaming based")
    , ring_delay_ms(this, "ring_delay_ms", value_status::Used, 30 * 1000, "Time a node waits to hear from other nodes before joining the ring in milliseconds. Same as -Dcassandra.ring_delay_ms in cassandra.")
    , shadow_round_ms(this, "shadow_round_ms", value_status::Used, 300 * 1000, "The maximum gossip shadow round time. Can be used to reduce the gossip feature check time during node boot up.")
    , fd_max_interval_ms(this, "fd_max_interval_ms", value_status::Used, 2 * 1000, "The maximum failure_detector interval time in milliseconds. Interval larger than the maximum will be ignored. Larger cluster may need to increase the default.")
--- a/db/config.hh
+++ b/db/config.hh
@@ -252,6 +252,7 @@ public:
    named_value<uint32_t> permissions_cache_max_entries;
    named_value<string_map> server_encryption_options;
    named_value<string_map> client_encryption_options;
+    named_value<string_map> alternator_encryption_options;
    named_value<uint32_t> ssl_storage_port;
    named_value<bool> enable_in_memory_data_store;
    named_value<bool> enable_cache;
--- a/db/consistency_level.cc
+++ b/db/consistency_level.cc
@@ -61,7 +61,8 @@ namespace db {
 logging::logger cl_logger("consistency");

 size_t quorum_for(const keyspace& ks) {
-    return (ks.get_replication_strategy().get_replication_factor() / 2) + 1;
+    size_t replication_factor = ks.get_replication_strategy().get_replication_factor();
+    return replication_factor ? (replication_factor / 2) + 1 : 0;
 }

 size_t local_quorum_for(const keyspace& ks, const sstring& dc) {
@@ -72,8 +73,8 @@ size_t local_quorum_for(const keyspace& ks, const sstring& dc) {
    if (rs.get_type() == replication_strategy_type::network_topology) {
        const network_topology_strategy* nrs =
            static_cast<const network_topology_strategy*>(&rs);
-
-        return (nrs->get_replication_factor(dc) / 2) + 1;
+        size_t replication_factor = nrs->get_replication_factor(dc);
+        return replication_factor ? (replication_factor / 2) + 1 : 0;
    }

    return quorum_for(ks);
--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -224,7 +224,9 @@ future<> manager::end_point_hints_manager::stop(drain should_drain) noexcept {
        with_lock(file_update_mutex(), [this] {
            if (_hints_store_anchor) {
                hints_store_ptr tmp = std::exchange(_hints_store_anchor, nullptr);
-                return tmp->shutdown().finally([tmp] {});
+                return tmp->shutdown().finally([tmp] {
+                    return tmp->release();
+                }).finally([tmp] {});
            }
            return make_ready_future<>();
        }).handle_exception([&eptr] (auto e) { eptr = std::move(e); }).get();
@@ -290,7 +292,7 @@ inline bool manager::have_ep_manager(ep_key_type ep) const noexcept {
 }

 bool manager::store_hint(ep_key_type ep, schema_ptr s, lw_shared_ptr<const frozen_mutation> fm, tracing::trace_state_ptr tr_state) noexcept {
-    if (stopping() || !started() || !can_hint_for(ep)) {
+    if (stopping() || draining_all() || !started() || !can_hint_for(ep)) {
        manager_logger.trace("Can't store a hint to {}", ep);
        ++_stats.dropped;
        return false;
@@ -326,6 +328,10 @@ future<db::commitlog> manager::end_point_hints_manager::add_store() noexcept {
            // HH doesn't utilize the flow that benefits from reusing segments.
            // Therefore let's simply disable it to avoid any possible confusion.
            cfg.reuse_segments = false;
+            // HH leaves segments on disk after commitlog shutdown, and later reads
+            // them when commitlog is re-created. This is expected to happen regularly
+            // during standard HH workload, so no need to print a warning about it.
+            cfg.warn_about_segments_left_on_disk_after_shutdown = false;

            return commitlog::create_commitlog(std::move(cfg)).then([this] (commitlog l) {
                // add_store() is triggered every time hint files are forcefully flushed to I/O (every hints_flush_period).
@@ -352,7 +358,9 @@ future<> manager::end_point_hints_manager::flush_current_hints() noexcept {
        return futurize_invoke([this] {
            return with_lock(file_update_mutex(), [this]() -> future<> {
                return get_or_load().then([] (hints_store_ptr cptr) {
-                    return cptr->shutdown();
+                    return cptr->shutdown().finally([cptr] {
+                        return cptr->release();
+                    }).finally([cptr] {});
                }).then([this] {
                    // Un-hold the commitlog object. Since we are under the exclusive _file_update_mutex lock there are no
                    // other hints_store_ptr copies and this would destroy the commitlog shared value.
@@ -529,7 +537,7 @@ bool manager::check_dc_for(ep_key_type ep) const noexcept {
 }

 void manager::drain_for(gms::inet_address endpoint) {
-    if (stopping()) {
+    if (stopping() || draining_all()) {
        return;
    }

@@ -540,6 +548,7 @@ void manager::drain_for(gms::inet_address endpoint) {
        return with_semaphore(drain_lock(), 1, [this, endpoint] {
            return futurize_invoke([this, endpoint] () {
                if (utils::fb_utilities::is_me(endpoint)) {
+                    set_draining_all();
                    return parallel_for_each(_ep_managers, [] (auto& pair) {
                        return pair.second.stop(drain::yes).finally([&pair] {
                            return with_file_update_mutex(pair.second, [&pair] {
@@ -808,7 +817,7 @@ void manager::end_point_hints_manager::sender::send_hints_maybe() noexcept {
    int replayed_segments_count = 0;

    try {
-        while (replay_allowed() && have_segments()) {
+        while (replay_allowed() && have_segments() && can_send()) {
            if (!send_one_file(*_segments_to_replay.begin())) {
                break;
            }
--- a/db/hints/manager.hh
+++ b/db/hints/manager.hh
@@ -424,12 +424,14 @@ public:
    enum class state {
        started,                // hinting is currently allowed (start() call is complete)
        replay_allowed,         // replaying (hints sending) is allowed
+        draining_all,           // hinting is not allowed - all ep managers are being stopped because this node is leaving the cluster
        stopping                // hinting is not allowed - stopping is in progress (stop() method has been called)
    };

    using state_set = enum_set<super_enum<state,
        state::started,
        state::replay_allowed,
+        state::draining_all,
        state::stopping>>;

 private:
@@ -690,6 +692,14 @@ private:
        return _state.contains(state::replay_allowed);
    }

+    void set_draining_all() noexcept {
+        _state.set(state::draining_all);
+    }
+
+    bool draining_all() noexcept {
+        return _state.contains(state::draining_all);
+    }
+
 public:
    ep_managers_map_type::iterator find_ep_manager(ep_key_type ep_key) noexcept {
        return _ep_managers.find(ep_key);
--- a/db/large_data_handler.cc
+++ b/db/large_data_handler.cc
@@ -113,7 +113,7 @@ future<> cql_table_large_data_handler::record_large_cells(const sstables::sstabl
        auto ck_str = key_to_str(*clustering_key, s);
        return try_record("cell", sst, partition_key, int64_t(cell_size), cell_type, format("{} {}", ck_str, column_name), extra_fields, ck_str, column_name);
    } else {
-        return try_record("cell", sst, partition_key, int64_t(cell_size), cell_type, column_name, extra_fields, nullptr, column_name);
+        return try_record("cell", sst, partition_key, int64_t(cell_size), cell_type, column_name, extra_fields, data_value::make_null(utf8_type), column_name);
    }
 }

@@ -125,7 +125,7 @@ future<> cql_table_large_data_handler::record_large_rows(const sstables::sstable
        std::string ck_str = key_to_str(*clustering_key, s);
        return try_record("row", sst, partition_key, int64_t(row_size), "row", ck_str, extra_fields,  ck_str);
    } else {
-        return try_record("row", sst, partition_key, int64_t(row_size), "static row", "", extra_fields, nullptr);
+        return try_record("row", sst, partition_key, int64_t(row_size), "static row", "", extra_fields, data_value::make_null(utf8_type));
    }
 }

--- a/db/large_data_handler.hh
+++ b/db/large_data_handler.hh
@@ -111,27 +111,12 @@ public:
        return make_ready_future<>();
    }

-    future<> maybe_delete_large_data_entries(const schema& s, sstring filename, uint64_t data_size) {
+    future<> maybe_delete_large_data_entries(const schema& /*s*/, sstring /*filename*/, uint64_t /*data_size*/) {
        assert(running());
-        future<> large_partitions = make_ready_future<>();
-        if (__builtin_expect(data_size > _partition_threshold_bytes, false)) {
-            large_partitions = with_sem([&s, filename, this] () mutable {
-                return delete_large_data_entries(s, std::move(filename), db::system_keyspace::LARGE_PARTITIONS);
-            });
-        }
-        future<> large_rows = make_ready_future<>();
-        if (__builtin_expect(data_size > _row_threshold_bytes, false)) {
-            large_rows = with_sem([&s, filename, this] () mutable {
-                return delete_large_data_entries(s, std::move(filename), db::system_keyspace::LARGE_ROWS);
-            });
-        }
-        future<> large_cells = make_ready_future<>();
-        if (__builtin_expect(data_size > _cell_threshold_bytes, false)) {
-            large_cells = with_sem([&s, filename, this] () mutable {
-                return delete_large_data_entries(s, std::move(filename), db::system_keyspace::LARGE_CELLS);
-            });
-        }
-        return when_all(std::move(large_partitions), std::move(large_rows), std::move(large_cells)).discard_result();
+
+        // Deletion of large data entries is disabled due to #7668
+        // They will evetually expire based on the 30 days TTL.
+        return make_ready_future<>();
    }

    const large_data_handler::stats& stats() const { return _stats; }
--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -801,6 +801,19 @@ future<> merge_unlock() {
    return smp::submit_to(0, [] { the_merge_lock.signal(); });
 }

+static
+future<> update_schema_version_and_announce(distributed<service::storage_proxy>& proxy, schema_features features) {
+    return calculate_schema_digest(proxy, features).then([&proxy] (utils::UUID uuid) {
+        return db::system_keyspace::update_schema_version(uuid).then([&proxy, uuid] {
+            return proxy.local().get_db().invoke_on_all([uuid] (database& db) {
+                db.update_version(uuid);
+            });
+        }).then([uuid] {
+            slogger.info("Schema version changed to {}", uuid);
+        });
+    });
+}
+
 /**
 * Merge remote schema in form of mutations with local and mutate ks/cf metadata objects
 * (which also involves fs operations on add/drop ks/cf)
@@ -821,6 +834,14 @@ future<> merge_schema(distributed<service::storage_proxy>& proxy, gms::feature_s
    });
 }

+future<> recalculate_schema_version(distributed<service::storage_proxy>& proxy, gms::feature_service& feat) {
+    return merge_lock().then([&proxy, &feat] {
+        return update_schema_version_and_announce(proxy, feat.cluster_schema_features());
+    }).finally([] {
+        return merge_unlock();
+    });
+}
+
 future<> merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations, bool do_flush)
 {
    return merge_lock().then([&proxy, mutations = std::move(mutations), do_flush] () mutable {
@@ -2904,10 +2925,6 @@ future<> maybe_update_legacy_secondary_index_mv_schema(service::migration_manage
    // format, where "token" is not marked as computed. Once we're sure that all indexes have their
    // columns marked as computed (because they were either created on a node that supports computed
    // columns or were fixed by this utility function), it's safe to remove this function altogether.
-    if (!db.features().cluster_supports_computed_columns()) {
-        return make_ready_future<>();
-    }
-
    if (v->clustering_key_size() == 0) {
        return make_ready_future<>();
    }
--- a/db/schema_tables.hh
+++ b/db/schema_tables.hh
@@ -170,6 +170,13 @@ future<> merge_schema(distributed<service::storage_proxy>& proxy, gms::feature_s

 future<> merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations, bool do_flush);

+// Recalculates the local schema version.
+//
+// It is safe to call concurrently with recalculate_schema_version() and merge_schema() in which case it
+// is guaranteed that the schema version we end up with after all calls will reflect the most recent state
+// of feature_service and schema tables.
+future<> recalculate_schema_version(distributed<service::storage_proxy>& proxy, gms::feature_service& feat);
+
 future<std::set<sstring>> merge_keyspaces(distributed<service::storage_proxy>& proxy, schema_result&& before, schema_result&& after);

 std::vector<mutation> make_create_keyspace_mutations(lw_shared_ptr<keyspace_metadata> keyspace, api::timestamp_type timestamp, bool with_tables_and_types_and_functions = true);
--- a/db/size_estimates_virtual_reader.cc
+++ b/db/size_estimates_virtual_reader.cc
@@ -201,10 +201,10 @@ static future<std::vector<token_range>> get_local_ranges(database& db) {
        // All queries will be on that table, where all entries are text and there's no notion of
        // token ranges form the CQL point of view.
        auto left_inf = boost::find_if(ranges, [] (auto&& r) {
-            return !r.start() || r.start()->value() == dht::minimum_token();
+            return r.end() && (!r.start() || r.start()->value() == dht::minimum_token());
        });
        auto right_inf = boost::find_if(ranges, [] (auto&& r) {
-            return !r.end() || r.start()->value() == dht::maximum_token();
+            return r.start() && (!r.end() || r.end()->value() == dht::maximum_token());
        });
        if (left_inf != right_inf && left_inf != ranges.end() && right_inf != ranges.end()) {
            local_ranges.push_back(token_range{to_bytes(right_inf->start()), to_bytes(left_inf->end())});
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -58,6 +58,7 @@
 #include "cql3/util.hh"
 #include "db/view/view.hh"
 #include "db/view/view_builder.hh"
+#include "db/view/view_updating_consumer.hh"
 #include "db/system_keyspace_view_types.hh"
 #include "db/system_keyspace.hh"
 #include "frozen_mutation.hh"
@@ -136,17 +137,90 @@ const column_definition* view_info::view_column(const column_definition& base_de
    return _schema.get_column_definition(base_def.name());
 }

-const std::vector<column_id>& view_info::base_non_pk_columns_in_view_pk() const {
+void view_info::set_base_info(db::view::base_info_ptr base_info) {
+    _base_info = std::move(base_info);
+}
+
+// A constructor for a base info that can facilitate reads and writes from the materialized view.
+db::view::base_dependent_view_info::base_dependent_view_info(schema_ptr base_schema, std::vector<column_id>&& base_non_pk_columns_in_view_pk)
+        : _base_schema{std::move(base_schema)}
+        , _base_non_pk_columns_in_view_pk{std::move(base_non_pk_columns_in_view_pk)}
+        , has_base_non_pk_columns_in_view_pk{!_base_non_pk_columns_in_view_pk.empty()}
+        , use_only_for_reads{false} {
+
+}
+
+// A constructor for a base info that can facilitate only reads from the materialized view.
+db::view::base_dependent_view_info::base_dependent_view_info(bool has_base_non_pk_columns_in_view_pk)
+        : _base_schema{nullptr}
+        , has_base_non_pk_columns_in_view_pk{has_base_non_pk_columns_in_view_pk}
+        , use_only_for_reads{true} {
+}
+
+const std::vector<column_id>& db::view::base_dependent_view_info::base_non_pk_columns_in_view_pk() const {
+    if (use_only_for_reads) {
+        on_internal_error(vlogger, "base_non_pk_columns_in_view_pk(): operation unsupported when initialized only for view reads.");
+    }
    return _base_non_pk_columns_in_view_pk;
 }

-void view_info::initialize_base_dependent_fields(const schema& base) {
+const schema_ptr& db::view::base_dependent_view_info::base_schema() const {
+    if (use_only_for_reads) {
+        on_internal_error(vlogger, "base_schema(): operation unsupported when initialized only for view reads.");
+    }
+    return _base_schema;
+}
+
+db::view::base_info_ptr view_info::make_base_dependent_view_info(const schema& base) const {
+    std::vector<column_id> base_non_pk_columns_in_view_pk;
+    bool has_base_non_pk_columns_in_view_pk = false;
+    bool can_only_read_from_view = false;
+
    for (auto&& view_col : boost::range::join(_schema.partition_key_columns(), _schema.clustering_key_columns())) {
+        if (view_col.is_computed()) {
+            // we are not going to find it in the base table...
+            continue;
+        }
        auto* base_col = base.get_column_definition(view_col.name());
        if (base_col && !base_col->is_primary_key()) {
-            _base_non_pk_columns_in_view_pk.push_back(base_col->id);
+            base_non_pk_columns_in_view_pk.push_back(base_col->id);
+            has_base_non_pk_columns_in_view_pk = true;
+        } else if (!base_col) {
+            // If we didn't find the column in the base column then it must have been deleted
+            // or not yet added (by alter command), this means it is for sure not a pk column
+            // in the base table. This can happen if the version of the base schema is not the
+            // one that the view was created with. Seting this schema as the base can't harm since
+            // if we got to such a situation then it means it is only going to be used for reading
+            // (computation of shadowable tombstones) and in that case the existence of such a column
+            // is the only thing that is of interest to us.
+            has_base_non_pk_columns_in_view_pk = true;
+            can_only_read_from_view = true;
+
+            // We can break the loop here since we have the info we wanted and the list
+            // of columns is not going to be reliable anyhow.
+            break;
        }
    }
+
+    if (can_only_read_from_view) {
+        return make_lw_shared<db::view::base_dependent_view_info>(has_base_non_pk_columns_in_view_pk);
+    } else {
+        return make_lw_shared<db::view::base_dependent_view_info>(base.shared_from_this(), std::move(base_non_pk_columns_in_view_pk));
+    }
+
+}
+
+bool view_info::has_base_non_pk_columns_in_view_pk() const {
+    // The base info is not always available, this is because
+    // the base info initialization is separate from the view
+    // info construction. If we are trying to get this info without
+    // initializing the base information it means that we have a
+    // schema integrity problem as the creator of owning view schema
+    // didn't make sure to initialize it with base information.
+    if (!_base_info) {
+        on_internal_error(vlogger, "Tried to perform a view query which is base info dependant without initializing it");
+    }
+    return _base_info->has_base_non_pk_columns_in_view_pk;
 }

 namespace db {
@@ -194,12 +268,12 @@ bool may_be_affected_by(const schema& base, const view_info& view, const dht::de
 }

 static bool update_requires_read_before_write(const schema& base,
-        const std::vector<view_ptr>& views,
+        const std::vector<view_and_base>& views,
        const dht::decorated_key& key,
        const rows_entry& update,
        gc_clock::time_point now) {
    for (auto&& v : views) {
-        view_info& vf = *v->view_info();
+        view_info& vf = *v.view->view_info();
        if (may_be_affected_by(base, vf, key, update, now)) {
            return true;
        }
@@ -246,12 +320,14 @@ class view_updates final {
    view_ptr _view;
    const view_info& _view_info;
    schema_ptr _base;
+    base_info_ptr _base_info;
    std::unordered_map<partition_key, mutation_partition, partition_key::hashing, partition_key::equality> _updates;
 public:
-    explicit view_updates(view_ptr view, schema_ptr base)
-            : _view(std::move(view))
+    explicit view_updates(view_and_base vab)
+            : _view(std::move(vab.view))
            , _view_info(*_view->view_info())
-            , _base(std::move(base))
+            , _base(vab.base->base_schema())
+            , _base_info(vab.base)
            , _updates(8, partition_key::hashing(*_view), partition_key::equality(*_view)) {
    }

@@ -313,7 +389,7 @@ row_marker view_updates::compute_row_marker(const clustering_row& base_row) cons
    // they share liveness information. It's true especially in the only case currently allowed by CQL,
    // which assumes there's up to one non-pk column in the view key. It's also true in alternator,
    // which does not carry TTL information.
-    const auto& col_ids = _view_info.base_non_pk_columns_in_view_pk();
+    const auto& col_ids = _base_info->base_non_pk_columns_in_view_pk();
    if (!col_ids.empty()) {
        auto& def = _base->regular_column_at(col_ids[0]);
        // Note: multi-cell columns can't be part of the primary key.
@@ -544,7 +620,7 @@ void view_updates::delete_old_entry(const partition_key& base_key, const cluster

 void view_updates::do_delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now) {
    auto& r = get_view_row(base_key, existing);
-    const auto& col_ids = _view_info.base_non_pk_columns_in_view_pk();
+    const auto& col_ids = _base_info->base_non_pk_columns_in_view_pk();
    if (!col_ids.empty()) {
        // We delete the old row using a shadowable row tombstone, making sure that
        // the tombstone deletes everything in the row (or it might still show up).
@@ -685,7 +761,7 @@ void view_updates::generate_update(
        return;
    }

-    const auto& col_ids = _view_info.base_non_pk_columns_in_view_pk();
+    const auto& col_ids = _base_info->base_non_pk_columns_in_view_pk();
    if (col_ids.empty()) {
        // The view key is necessarily the same pre and post update.
        if (existing && existing->is_live(*_base)) {
@@ -940,12 +1016,17 @@ future<stop_iteration> view_update_builder::on_results() {

 future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
        const schema_ptr& base,
-        std::vector<view_ptr>&& views_to_update,
+        std::vector<view_and_base>&& views_to_update,
        flat_mutation_reader&& updates,
        flat_mutation_reader_opt&& existings,
        gc_clock::time_point now) {
-    auto vs = boost::copy_range<std::vector<view_updates>>(views_to_update | boost::adaptors::transformed([&] (auto&& v) {
-        return view_updates(std::move(v), base);
+    auto vs = boost::copy_range<std::vector<view_updates>>(views_to_update | boost::adaptors::transformed([&] (view_and_base v) {
+        if (base->version() != v.base->base_schema()->version()) {
+            on_internal_error(vlogger, format("Schema version used for view updates ({}) does not match the current"
+                                              " base schema version of the view ({}) for view {}.{} of {}.{}",
+                base->version(), v.base->base_schema()->version(), v.view->ks_name(), v.view->cf_name(), base->ks_name(), base->cf_name()));
+        }
+        return view_updates(std::move(v));
    }));
    auto builder = std::make_unique<view_update_builder>(base, std::move(vs), std::move(updates), std::move(existings), now);
    auto f = builder->build();
@@ -955,7 +1036,7 @@ future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
 query::clustering_row_ranges calculate_affected_clustering_ranges(const schema& base,
        const dht::decorated_key& key,
        const mutation_partition& mp,
-        const std::vector<view_ptr>& views,
+        const std::vector<view_and_base>& views,
        gc_clock::time_point now) {
    std::vector<nonwrapping_range<clustering_key_prefix_view>> row_ranges;
    std::vector<nonwrapping_range<clustering_key_prefix_view>> view_row_ranges;
@@ -963,11 +1044,11 @@ query::clustering_row_ranges calculate_affected_clustering_ranges(const schema&
    if (mp.partition_tombstone() || !mp.row_tombstones().empty()) {
        for (auto&& v : views) {
            // FIXME: #2371
-            if (v->view_info()->select_statement().get_restrictions()->has_unrestricted_clustering_columns()) {
+            if (v.view->view_info()->select_statement().get_restrictions()->has_unrestricted_clustering_columns()) {
                view_row_ranges.push_back(nonwrapping_range<clustering_key_prefix_view>::make_open_ended_both_sides());
                break;
            }
-            for (auto&& r : v->view_info()->partition_slice().default_row_ranges()) {
+            for (auto&& r : v.view->view_info()->partition_slice().default_row_ranges()) {
                view_row_ranges.push_back(r.transform(std::mem_fn(&clustering_key_prefix::view)));
            }
        }
@@ -1210,25 +1291,61 @@ view_builder::view_builder(database& db, db::system_distributed_keyspace& sys_di
 }

 future<> view_builder::start(service::migration_manager& mm) {
-    _started = seastar::async([this, &mm] {
-        // Guard the whole startup routine with a semaphore,
-        // so that it's not intercepted by `on_drop_view`, `on_create_view`
-        // or `on_update_view` events.
-        auto units = get_units(_sem, 1).get0();
-        // Wait for schema agreement even if we're a seed node.
-        while (!mm.have_schema_agreement()) {
-            if (_as.abort_requested()) {
-                return;
+    _started = do_with(view_builder_init_state{}, [this, &mm] (view_builder_init_state& vbi) {
+        return seastar::async([this, &mm, &vbi] {
+            // Guard the whole startup routine with a semaphore,
+            // so that it's not intercepted by `on_drop_view`, `on_create_view`
+            // or `on_update_view` events.
+            auto units = get_units(_sem, 1).get0();
+            // Wait for schema agreement even if we're a seed node.
+            while (!mm.have_schema_agreement()) {
+                seastar::sleep_abortable(500ms, _as).get();
            }
-            seastar::sleep(500ms).get();
-        }
-        auto built = system_keyspace::load_built_views().get0();
-        auto in_progress = system_keyspace::load_view_build_progress().get0();
-        calculate_shard_build_step(std::move(built), std::move(in_progress)).get();
-        _mnotifier.register_listener(this);
-        _current_step = _base_to_build_step.begin();
-        // Waited on indirectly in stop().
-        (void)_build_step.trigger();
+            auto built = system_keyspace::load_built_views().get0();
+            auto in_progress = system_keyspace::load_view_build_progress().get0();
+            setup_shard_build_step(vbi, std::move(built), std::move(in_progress));
+        }).then_wrapped([this] (future<>&& f) {
+            // All shards need to arrive at the same decisions on whether or not to
+            // restart a view build at some common token (reshard), and which token
+            // to restart at. So we need to wait until all shards have read the view
+            // build statuses before they can all proceed to make the (same) decision.
+            // If we don't synchronize here, a fast shard may make a decision, start
+            // building and finish a build step - before the slowest shard even read
+            // the view build information.
+            std::exception_ptr eptr;
+            if (f.failed()) {
+                eptr = f.get_exception();
+            }
+
+            return container().invoke_on(0, [eptr = std::move(eptr)] (view_builder& builder) {
+                // The &builder is alive, because it can only be destroyed in
+                // sharded<view_builder>::stop(), which, in turn, waits for all
+                // view_builder::stop()-s to finish, and each stop() waits for
+                // the shard's current future (called _started) to resolve.
+                if (!eptr) {
+                    if (++builder._shards_finished_read == smp::count) {
+                        builder._shards_finished_read_promise.set_value();
+                    }
+                } else {
+                    if (builder._shards_finished_read < smp::count) {
+                        builder._shards_finished_read = smp::count;
+                        builder._shards_finished_read_promise.set_exception(std::move(eptr));
+                    }
+                }
+                return builder._shards_finished_read_promise.get_shared_future();
+            });
+        }).then([this, &vbi] {
+            return calculate_shard_build_step(vbi);
+        }).then([this] {
+            _mnotifier.register_listener(this);
+            _current_step = _base_to_build_step.begin();
+            // Waited on indirectly in stop().
+            (void)_build_step.trigger();
+            return make_ready_future<>();
+        });
+    }).handle_exception([] (std::exception_ptr eptr) {
+        vlogger.error("start failed: {}", eptr);
+        return make_ready_future<>();
    });
    return make_ready_future<>();
 }
@@ -1236,7 +1353,7 @@ future<> view_builder::start(service::migration_manager& mm) {
 future<> view_builder::stop() {
    vlogger.info("Stopping view builder");
    _as.request_abort();
-    return _started.finally([this] {
+    return _started.then([this] {
        return _mnotifier.unregister_listener(this).then([this] {
            return _sem.wait();
        }).then([this] {
@@ -1370,12 +1487,12 @@ void view_builder::reshard(
    }
 }

-future<> view_builder::calculate_shard_build_step(
+void view_builder::setup_shard_build_step(
+        view_builder_init_state& vbi,
        std::vector<system_keyspace::view_name> built,
        std::vector<system_keyspace::view_build_progress> in_progress) {
    // Shard 0 makes cleanup changes to the system tables, but none that could conflict
    // with the other shards; everyone is thus able to proceed independently.
-    auto bookkeeping_ops = std::make_unique<std::vector<future<>>>();
    auto base_table_exists = [this] (const view_ptr& view) {
        // This is a safety check in case this node missed a create MV statement
        // but got a drop table for the base, and another node didn't get the
@@ -1402,9 +1519,9 @@ future<> view_builder::calculate_shard_build_step(
            // Fall-through
        }
        if (this_shard_id() == 0) {
-            bookkeeping_ops->push_back(_sys_dist_ks.remove_view(name.first, name.second));
-            bookkeeping_ops->push_back(system_keyspace::remove_built_view(name.first, name.second));
-            bookkeeping_ops->push_back(
+            vbi.bookkeeping_ops.push_back(_sys_dist_ks.remove_view(name.first, name.second));
+            vbi.bookkeeping_ops.push_back(system_keyspace::remove_built_view(name.first, name.second));
+            vbi.bookkeeping_ops.push_back(
                    system_keyspace::remove_view_build_progress_across_all_shards(
                            std::move(name.first),
                            std::move(name.second)));
@@ -1412,50 +1529,48 @@ future<> view_builder::calculate_shard_build_step(
        return view_ptr(nullptr);
    };

-    auto built_views = boost::copy_range<std::unordered_set<utils::UUID>>(built
+    vbi.built_views = boost::copy_range<std::unordered_set<utils::UUID>>(built
            | boost::adaptors::transformed(maybe_fetch_view)
            | boost::adaptors::filtered([] (const view_ptr& v) { return bool(v); })
            | boost::adaptors::transformed([] (const view_ptr& v) { return v->id(); }));

-    std::vector<std::vector<view_build_status>> view_build_status_per_shard;
    for (auto& [view_name, first_token, next_token_opt, cpu_id] : in_progress) {
        if (auto view = maybe_fetch_view(view_name)) {
-            if (built_views.find(view->id()) != built_views.end()) {
+            if (vbi.built_views.find(view->id()) != vbi.built_views.end()) {
                if (this_shard_id() == 0) {
                    auto f = _sys_dist_ks.finish_view_build(std::move(view_name.first), std::move(view_name.second)).then([view = std::move(view)] {
                        return system_keyspace::remove_view_build_progress_across_all_shards(view->cf_name(), view->ks_name());
                    });
-                    bookkeeping_ops->push_back(std::move(f));
+                    vbi.bookkeeping_ops.push_back(std::move(f));
                }
                continue;
            }
-            view_build_status_per_shard.resize(std::max(view_build_status_per_shard.size(), size_t(cpu_id + 1)));
-            view_build_status_per_shard[cpu_id].emplace_back(view_build_status{
+            vbi.status_per_shard.resize(std::max(vbi.status_per_shard.size(), size_t(cpu_id + 1)));
+            vbi.status_per_shard[cpu_id].emplace_back(view_build_status{
                    std::move(view),
                    std::move(first_token),
                    std::move(next_token_opt)});
        }
    }
+}

-    // All shards need to arrive at the same decisions on whether or not to
-    // restart a view build at some common token (reshard), and which token
-    // to restart at. So we need to wait until all shards have read the view
-    // build statuses before they can all proceed to make the (same) decision.
-    // If we don't synchronoize here, a fast shard may make a decision, start
-    // building and finish a build step - before the slowest shard even read
-    // the view build information.
-    container().invoke_on(0, [] (view_builder& builder) {
-        if (++builder._shards_finished_read == smp::count) {
-            builder._shards_finished_read_promise.set_value();
+future<> view_builder::calculate_shard_build_step(view_builder_init_state& vbi) {
+    auto base_table_exists = [this] (const view_ptr& view) {
+        // This is a safety check in case this node missed a create MV statement
+        // but got a drop table for the base, and another node didn't get the
+        // drop notification and sent us the view schema.
+        try {
+            _db.find_schema(view->view_info()->base_id());
+            return true;
+        } catch (const no_such_column_family&) {
+            return false;
        }
-        return builder._shards_finished_read_promise.get_shared_future();
-    }).get();
-
+    };
    std::unordered_set<utils::UUID> loaded_views;
-    if (view_build_status_per_shard.size() != smp::count) {
-        reshard(std::move(view_build_status_per_shard), loaded_views);
-    } else if (!view_build_status_per_shard.empty()) {
-        for (auto& status : view_build_status_per_shard[this_shard_id()]) {
+    if (vbi.status_per_shard.size() != smp::count) {
+        reshard(std::move(vbi.status_per_shard), loaded_views);
+    } else if (!vbi.status_per_shard.empty()) {
+        for (auto& status : vbi.status_per_shard[this_shard_id()]) {
            load_view_status(std::move(status), loaded_views);
        }
    }
@@ -1472,18 +1587,18 @@ future<> view_builder::calculate_shard_build_step(
    auto all_views = _db.get_views();
    auto is_new = [&] (const view_ptr& v) {
        return base_table_exists(v) && loaded_views.find(v->id()) == loaded_views.end()
-                && built_views.find(v->id()) == built_views.end();
+                && vbi.built_views.find(v->id()) == vbi.built_views.end();
    };
    for (auto&& view : all_views | boost::adaptors::filtered(is_new)) {
-        bookkeeping_ops->push_back(add_new_view(view, get_or_create_build_step(view->view_info()->base_id())));
+        vbi.bookkeeping_ops.push_back(add_new_view(view, get_or_create_build_step(view->view_info()->base_id())));
    }

    for (auto& [_, build_step] : _base_to_build_step) {
        initialize_reader_at_current_token(build_step);
    }

-    auto f = seastar::when_all_succeed(bookkeeping_ops->begin(), bookkeeping_ops->end());
-    return f.handle_exception([this, bookkeeping_ops = std::move(bookkeeping_ops)] (std::exception_ptr ep) {
+    auto f = seastar::when_all_succeed(vbi.bookkeeping_ops.begin(), vbi.bookkeeping_ops.end());
+    return f.handle_exception([this] (std::exception_ptr ep) {
        log_level severity = _as.abort_requested() ? log_level::warn : log_level::error;
        vlogger.log(severity, "Failed to update materialized view bookkeeping ({}), continuing anyway.", ep);
    });
@@ -1732,7 +1847,7 @@ public:
            return stop_iteration::yes;
        }

-        _fragments_memory_usage += cr.memory_usage(*_step.base->schema());
+        _fragments_memory_usage += cr.memory_usage(*_step.reader.schema());
        _fragments.push_back(std::move(cr));
        if (_fragments_memory_usage > batch_memory_max) {
            // Although we have not yet completed the batch of base rows that
@@ -1754,10 +1869,14 @@ public:
        _builder._as.check();
        if (!_fragments.empty()) {
            _fragments.push_front(partition_start(_step.current_key, tombstone()));
+            auto base_schema = _step.base->schema();
+            auto views = with_base_info_snapshot(_views_to_build);
+            auto reader = make_flat_mutation_reader_from_fragments(_step.reader.schema(), std::move(_fragments));
+            reader.upgrade_schema(base_schema);
            _step.base->populate_views(
-                    _views_to_build,
+                    std::move(views),
                    _step.current_token(),
-                    make_flat_mutation_reader_from_fragments(_step.base->schema(), std::move(_fragments)),
+                    std::move(reader),
                    _now).get();
            _fragments.clear();
            _fragments_memory_usage = 0;
@@ -1909,5 +2028,54 @@ future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_d
    });
 }

+const size_t view_updating_consumer::buffer_size_soft_limit{1 * 1024 * 1024};
+const size_t view_updating_consumer::buffer_size_hard_limit{2 * 1024 * 1024};
+
+void view_updating_consumer::do_flush_buffer() {
+    _staging_reader_handle.pause();
+
+    if (_buffer.front().partition().empty()) {
+        // If we flushed mid-partition we can have an empty mutation if we
+        // flushed right before getting the end-of-partition fragment.
+        _buffer.pop_front();
+    }
+
+    while (!_buffer.empty()) {
+        try {
+            auto lock_holder = _view_update_pusher(std::move(_buffer.front())).get();
+        } catch (...) {
+            vlogger.warn("Failed to push replica updates for table {}.{}: {}", _schema->ks_name(), _schema->cf_name(), std::current_exception());
+        }
+        _buffer.pop_front();
+    }
+
+    _buffer_size = 0;
+    _m = nullptr;
+}
+
+void view_updating_consumer::maybe_flush_buffer_mid_partition() {
+    if (_buffer_size >= buffer_size_hard_limit) {
+        auto m = mutation(_schema, _m->decorated_key(), mutation_partition(_schema));
+        do_flush_buffer();
+        _buffer.emplace_back(std::move(m));
+        _m = &_buffer.back();
+    }
+}
+
+view_updating_consumer::view_updating_consumer(schema_ptr schema, table& table, std::vector<sstables::shared_sstable> excluded_sstables, const seastar::abort_source& as,
+        evictable_reader_handle& staging_reader_handle)
+    : view_updating_consumer(std::move(schema), as, staging_reader_handle,
+            [table = table.shared_from_this(), excluded_sstables = std::move(excluded_sstables)] (mutation m) mutable {
+        auto s = m.schema();
+        return table->stream_view_replica_updates(std::move(s), std::move(m), db::no_timeout, excluded_sstables);
+    })
+{ }
+
+std::vector<db::view::view_and_base> with_base_info_snapshot(std::vector<view_ptr> vs) {
+    return boost::copy_range<std::vector<db::view::view_and_base>>(vs | boost::adaptors::transformed([] (const view_ptr& v) {
+        return db::view::view_and_base{v, v->view_info()->base_info()};
+    }));
+}
+
 } // namespace view
 } // namespace db
--- a/db/view/view.hh
+++ b/db/view/view.hh
@@ -43,6 +43,46 @@ namespace db {

 namespace view {

+// Part of the view description which depends on the base schema version.
+//
+// This structure may change even though the view schema doesn't change, so
+// it needs to live outside view_ptr.
+struct base_dependent_view_info {
+private:
+    schema_ptr _base_schema;
+    // Id of a regular base table column included in the view's PK, if any.
+    // Scylla views only allow one such column, alternator can have up to two.
+    std::vector<column_id> _base_non_pk_columns_in_view_pk;
+public:
+    const std::vector<column_id>& base_non_pk_columns_in_view_pk() const;
+    const schema_ptr& base_schema() const;
+
+    // Indicates if the view hase pk columns which are not part of the base
+    // pk, it seems that !base_non_pk_columns_in_view_pk.empty() is the same,
+    // but actually there are cases where we can compute this boolean without
+    // succeeding to reliably build the former.
+    const bool has_base_non_pk_columns_in_view_pk;
+
+    // If base_non_pk_columns_in_view_pk couldn't reliably be built, this base
+    // info can't be used for computing view updates, only for reading the materialized
+    // view.
+    const bool use_only_for_reads;
+
+    // A constructor for a base info that can facilitate reads and writes from the materialized view.
+    base_dependent_view_info(schema_ptr base_schema, std::vector<column_id>&& base_non_pk_columns_in_view_pk);
+    // A constructor for a base info that can facilitate only reads from the materialized view.
+    base_dependent_view_info(bool has_base_non_pk_columns_in_view_pk);
+};
+
+// Immutable snapshot of view's base-schema-dependent part.
+using base_info_ptr = lw_shared_ptr<const base_dependent_view_info>;
+
+// Snapshot of the view schema and its base-schema-dependent part.
+struct view_and_base {
+    view_ptr view;
+    base_info_ptr base;
+};
+
 /**
 * Whether the view filter considers the specified partition key.
 *
@@ -94,7 +134,7 @@ bool clustering_prefix_matches(const schema& base, const partition_key& key, con

 future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
        const schema_ptr& base,
-        std::vector<view_ptr>&& views_to_update,
+        std::vector<view_and_base>&& views_to_update,
        flat_mutation_reader&& updates,
        flat_mutation_reader_opt&& existings,
        gc_clock::time_point now);
@@ -103,7 +143,7 @@ query::clustering_row_ranges calculate_affected_clustering_ranges(
        const schema& base,
        const dht::decorated_key& key,
        const mutation_partition& mp,
-        const std::vector<view_ptr>& views,
+        const std::vector<view_and_base>& views,
        gc_clock::time_point now);

 struct wait_for_all_updates_tag {};
@@ -133,6 +173,13 @@ future<> mutate_MV(
 */
 void create_virtual_column(schema_builder& builder, const bytes& name, const data_type& type);

+/**
+ * Converts a collection of view schema snapshots into a collection of
+ * view_and_base objects, which are snapshots of both the view schema
+ * and the base-schema-dependent part of view description.
+ */
+std::vector<view_and_base> with_base_info_snapshot(std::vector<view_ptr>);
+
 }

 }
--- a/db/view/view_builder.hh
+++ b/db/view/view_builder.hh
@@ -165,6 +165,12 @@ class view_builder final : public service::migration_listener::only_view_notific
    // Used for testing.
    std::unordered_map<std::pair<sstring, sstring>, seastar::shared_promise<>, utils::tuple_hash> _build_notifiers;

+    struct view_builder_init_state {
+        std::vector<future<>> bookkeeping_ops;
+        std::vector<std::vector<view_build_status>> status_per_shard;
+        std::unordered_set<utils::UUID> built_views;
+    };
+
 public:
    // The view builder processes the base table in steps of batch_size rows.
    // However, if the individual rows are large, there is no real need to
@@ -201,7 +207,8 @@ private:
    void initialize_reader_at_current_token(build_step&);
    void load_view_status(view_build_status, std::unordered_set<utils::UUID>&);
    void reshard(std::vector<std::vector<view_build_status>>, std::unordered_set<utils::UUID>&);
-    future<> calculate_shard_build_step(std::vector<system_keyspace::view_name>, std::vector<system_keyspace::view_build_progress>);
+    void setup_shard_build_step(view_builder_init_state& vbi, std::vector<system_keyspace::view_name>, std::vector<system_keyspace::view_build_progress>);
+    future<> calculate_shard_build_step(view_builder_init_state& vbi);
    future<> add_new_view(view_ptr, build_step&);
    future<> do_build_step();
    void execute(build_step&, exponential_backoff_retry);
--- a/db/view/view_update_generator.cc
+++ b/db/view/view_update_generator.cc
@@ -42,35 +42,52 @@ future<> view_update_generator::start() {
                _pending_sstables.wait().get();
            }

+            // To ensure we don't race with updates, move the entire content
+            // into a local variable.
+            auto sstables_with_tables = std::exchange(_sstables_with_tables, {});
+
            // If we got here, we will process all tables we know about so far eventually so there
            // is no starvation
-            for (auto& t : _sstables_with_tables | boost::adaptors::map_keys) {
+            for (auto table_it = sstables_with_tables.begin(); table_it != sstables_with_tables.end(); table_it = sstables_with_tables.erase(table_it)) {
+                auto& [t, sstables] = *table_it;
                schema_ptr s = t->schema();

-                // Copy what we have so far so we don't miss new updates
-                auto sstables = std::exchange(_sstables_with_tables[t], {});
+                vug_logger.trace("Processing {}.{}: {} sstables", s->ks_name(), s->cf_name(), sstables.size());
+
+                const auto num_sstables = sstables.size();

                try {
-                    // temporary: need an sstable set for the flat mutation reader, but the
-                    // compaction_descriptor takes a vector. Soon this will become a compaction
-                    // so the transformation to the SSTable set will not be needed.
-                    auto ssts = make_lw_shared(t->get_compaction_strategy().make_sstable_set(s));
+                    // Exploit the fact that sstables in the staging directory
+                    // are usually non-overlapping and use a partitioned set for
+                    // the read.
+                    auto ssts = make_lw_shared(sstables::make_partitioned_sstable_set(s, make_lw_shared<sstable_list>(sstable_list{}), false));
                    for (auto& sst : sstables) {
                        ssts->insert(sst);
                    }

-                    flat_mutation_reader staging_sstable_reader = ::make_range_sstable_reader(s,
+                    auto ms = mutation_source([this, ssts] (
+                                schema_ptr s,
+                                reader_permit permit,
+                                const dht::partition_range& pr,
+                                const query::partition_slice& ps,
+                                const io_priority_class& pc,
+                                tracing::trace_state_ptr ts,
+                                streamed_mutation::forwarding fwd_ms,
+                                mutation_reader::forwarding fwd_mr) {
+                        return ::make_restricted_range_sstable_reader(s, std::move(permit), std::move(ssts), pr, ps, pc, std::move(ts), fwd_ms, fwd_mr);
+                    });
+                    auto [staging_sstable_reader, staging_sstable_reader_handle] = make_manually_paused_evictable_reader(
+                            std::move(ms),
+                            s,
                            _db.make_query_class_config().semaphore.make_permit(),
-                            std::move(ssts),
                            query::full_partition_range,
                            s->full_slice(),
                            service::get_local_streaming_priority(),
                            nullptr,
-                            ::streamed_mutation::forwarding::no,
                            ::mutation_reader::forwarding::no);

                    inject_failure("view_update_generator_consume_staging_sstable");
-                    auto result = staging_sstable_reader.consume_in_thread(view_updating_consumer(s, *t, sstables, _as), db::no_timeout);
+                    auto result = staging_sstable_reader.consume_in_thread(view_updating_consumer(s, *t, sstables, _as, staging_sstable_reader_handle), db::no_timeout);
                    if (result == stop_iteration::yes) {
                        break;
                    }
@@ -89,7 +106,7 @@ future<> view_update_generator::start() {
                    // Move from staging will be retried upon restart.
                    vug_logger.warn("Moving {} from staging failed: {}:{}. Ignoring...", s->ks_name(), s->cf_name(), std::current_exception());
                }
-                _registration_sem.signal();
+                _registration_sem.signal(num_sstables);
            }
            // For each table, move the processed staging sstables into the table's base dir.
            for (auto it = _sstables_to_move.begin(); it != _sstables_to_move.end(); ) {
--- a/db/view/view_update_generator.hh
+++ b/db/view/view_update_generator.hh
@@ -32,7 +32,10 @@
 namespace db::view {

 class view_update_generator {
+public:
    static constexpr size_t registration_queue_size = 5;
+
+private:
    database& _db;
    seastar::abort_source _as;
    future<> _started = make_ready_future<>();
@@ -51,6 +54,8 @@ public:
    future<> start();
    future<> stop();
    future<> register_staging_sstable(sstables::shared_sstable sst, lw_shared_ptr<table> table);
+
+    ssize_t available_register_units() const { return _registration_sem.available_units(); }
 private:
    bool should_throttle() const;
 };
--- a/db/view/view_updating_consumer.hh
+++ b/db/view/view_updating_consumer.hh
@@ -27,6 +27,8 @@
 #include "sstables/shared_sstable.hh"
 #include "database.hh"

+class evictable_reader_handle;
+
 namespace db::view {

 /*
@@ -34,22 +36,46 @@ namespace db::view {
 * It is expected to be run in seastar::async threaded context through consume_in_thread()
 */
 class view_updating_consumer {
-    schema_ptr _schema;
-    lw_shared_ptr<table> _table;
-    std::vector<sstables::shared_sstable> _excluded_sstables;
-    const seastar::abort_source* _as;
-    std::optional<mutation> _m;
 public:
-    view_updating_consumer(schema_ptr schema, table& table, std::vector<sstables::shared_sstable> excluded_sstables, const seastar::abort_source& as)
+    // We prefer flushing on partition boundaries, so at the end of a partition,
+    // we flush on reaching the soft limit. Otherwise we continue accumulating
+    // data. We flush mid-partition if we reach the hard limit.
+    static const size_t buffer_size_soft_limit;
+    static const size_t buffer_size_hard_limit;
+
+private:
+    schema_ptr _schema;
+    const seastar::abort_source* _as;
+    evictable_reader_handle& _staging_reader_handle;
+    circular_buffer<mutation> _buffer;
+    mutation* _m{nullptr};
+    size_t _buffer_size{0};
+    noncopyable_function<future<row_locker::lock_holder>(mutation)> _view_update_pusher;
+
+private:
+    void do_flush_buffer();
+    void maybe_flush_buffer_mid_partition();
+
+public:
+    // Push updates with a custom pusher. Mainly for tests.
+    view_updating_consumer(schema_ptr schema, const seastar::abort_source& as, evictable_reader_handle& staging_reader_handle,
+            noncopyable_function<future<row_locker::lock_holder>(mutation)> view_update_pusher)
            : _schema(std::move(schema))
-            , _table(table.shared_from_this())
-            , _excluded_sstables(std::move(excluded_sstables))
            , _as(&as)
-            , _m()
+            , _staging_reader_handle(staging_reader_handle)
+            , _view_update_pusher(std::move(view_update_pusher))
    { }

+    view_updating_consumer(schema_ptr schema, table& table, std::vector<sstables::shared_sstable> excluded_sstables, const seastar::abort_source& as,
+            evictable_reader_handle& staging_reader_handle);
+
+    view_updating_consumer(view_updating_consumer&&) = default;
+
+    view_updating_consumer& operator=(view_updating_consumer&&) = delete;
+
    void consume_new_partition(const dht::decorated_key& dk) {
-        _m = mutation(_schema, dk, mutation_partition(_schema));
+        _buffer.emplace_back(_schema, dk, mutation_partition(_schema));
+        _m = &_buffer.back();
    }

    void consume(tombstone t) {
@@ -60,7 +86,9 @@ public:
        if (_as->abort_requested()) {
            return stop_iteration::yes;
        }
+        _buffer_size += sr.memory_usage(*_schema);
        _m->partition().apply(*_schema, std::move(sr));
+        maybe_flush_buffer_mid_partition();
        return stop_iteration::no;
    }

@@ -68,7 +96,9 @@ public:
        if (_as->abort_requested()) {
            return stop_iteration::yes;
        }
+        _buffer_size += cr.memory_usage(*_schema);
        _m->partition().apply(*_schema, std::move(cr));
+        maybe_flush_buffer_mid_partition();
        return stop_iteration::no;
    }

@@ -76,14 +106,27 @@ public:
        if (_as->abort_requested()) {
            return stop_iteration::yes;
        }
+        _buffer_size += rt.memory_usage(*_schema);
        _m->partition().apply(*_schema, std::move(rt));
+        maybe_flush_buffer_mid_partition();
        return stop_iteration::no;
    }

    // Expected to be run in seastar::async threaded context (consume_in_thread())
-    stop_iteration consume_end_of_partition();
+    stop_iteration consume_end_of_partition() {
+        if (_as->abort_requested()) {
+            return stop_iteration::yes;
+        }
+        if (_buffer_size >= buffer_size_soft_limit) {
+            do_flush_buffer();
+        }
+        return stop_iteration::no;
+    }

    stop_iteration consume_end_of_stream() {
+        if (!_buffer.empty()) {
+            do_flush_buffer();
+        }
        return stop_iteration(_as->abort_requested());
    }
 };
--- a/dht/boot_strapper.cc
+++ b/dht/boot_strapper.cc
@@ -59,7 +59,12 @@ future<> boot_strapper::bootstrap(streaming::stream_reason reason) {
        return make_exception_future<>(std::runtime_error("Wrong stream_reason provided: it can only be replace or bootstrap"));
    }
    auto streamer = make_lw_shared<range_streamer>(_db, _token_metadata, _abort_source, _tokens, _address, description, reason);
-    streamer->add_source_filter(std::make_unique<range_streamer::failure_detector_source_filter>(gms::get_local_gossiper().get_unreachable_members()));
+    auto nodes_to_filter = gms::get_local_gossiper().get_unreachable_members();
+    if (reason == streaming::stream_reason::replace && _db.local().get_replace_address()) {
+        nodes_to_filter.insert(_db.local().get_replace_address().value());
+    }
+    blogger.debug("nodes_to_filter={}", nodes_to_filter);
+    streamer->add_source_filter(std::make_unique<range_streamer::failure_detector_source_filter>(nodes_to_filter));
    auto keyspaces = make_lw_shared<std::vector<sstring>>(_db.local().get_non_system_keyspaces());
    return do_for_each(*keyspaces, [this, keyspaces, streamer] (sstring& keyspace_name) {
        auto& ks = _db.local().find_keyspace(keyspace_name);
--- a/digest_algorithm.hh
+++ b/digest_algorithm.hh
@@ -28,7 +28,8 @@ namespace query {
 enum class digest_algorithm : uint8_t {
    none = 0,  // digest not required
    MD5 = 1,
-    xxHash = 2,// default algorithm
+    legacy_xxHash_without_null_digest = 2,
+    xxHash = 3, // default algorithm
 };

 }
--- a/digester.hh
+++ b/digester.hh
@@ -36,7 +36,7 @@ struct noop_hasher {
 };

 class digester final {
-    std::variant<noop_hasher, md5_hasher, xx_hasher> _impl;
+    std::variant<noop_hasher, md5_hasher, xx_hasher, legacy_xx_hasher_without_null_digest> _impl;

 public:
    explicit digester(digest_algorithm algo) {
@@ -47,6 +47,9 @@ public:
        case digest_algorithm::xxHash:
            _impl = xx_hasher();
            break;
+        case digest_algorithm::legacy_xxHash_without_null_digest:
+            _impl = legacy_xx_hasher_without_null_digest();
+            break;
        case digest_algorithm ::none:
            _impl = noop_hasher();
            break;
--- a/dist/common/scripts/node_exporter_install
+++ b/dist/common/scripts/node_exporter_install
@@ -42,6 +42,11 @@ if __name__ == '__main__':
    if node_exporter_p.exists() or (bindir_p() / 'prometheus-node_exporter').exists():
        if force:
            print('node_exporter already installed, reinstalling')
+            try:
+                node_exporter = systemd_unit('node-exporter.service')
+                node_exporter.stop()
+            except:
+                pass
        else:
            print('node_exporter already installed, you can use `--force` to force reinstallation')
            sys.exit(1)
--- a/dist/common/scripts/scylla-housekeeping
+++ b/dist/common/scripts/scylla-housekeeping
@@ -61,7 +61,15 @@ def sh_command(*args):
    return out

 def get_url(path):
-    return urllib.request.urlopen(path).read().decode('utf-8')
+    # If server returns any error, like 403, or 500 urllib.request throws exception, which is not serializable.
+    # When multiprocessing routines fail to serialize it, it throws ambiguous serialization exception
+    #   from get_json_from_url.
+    # In order to see legit error we catch it from the inside of process, covert to string and
+    #   pass it as part of return value
+    try:
+        return 0, urllib.request.urlopen(path).read().decode('utf-8')
+    except Exception as exc:
+        return 1, str(exc)

 def get_json_from_url(path):
    pool = mp.Pool(processes=1)
@@ -71,13 +79,16 @@ def get_json_from_url(path):
    # to enforce a wallclock timeout.
    result = pool.apply_async(get_url, args=(path,))
    try:
-        retval = result.get(timeout=5)
+        status, retval = result.get(timeout=5)
    except mp.TimeoutError as err:
        pool.terminate()
        pool.join()
        raise
+    if status == 1:
+        raise RuntimeError(f'Failed to get "{path}" due to the following error: {retval}')
    return json.loads(retval)

+
 def get_api(path):
    return get_json_from_url("http://" + api_address + path)

--- a/dist/common/scripts/scylla_setup
+++ b/dist/common/scripts/scylla_setup
@@ -27,6 +27,7 @@ import glob
 import shutil
 import io
 import stat
+import distro
 from scylla_util import *

 interactive = False
@@ -385,6 +386,9 @@ if __name__ == '__main__':
                if not stat.S_ISBLK(os.stat(dsk).st_mode):
                    print('{} is not block device'.format(dsk))
                    continue
+                if dsk in selected:
+                    print(f'{dsk} is already added')
+                    continue
                selected.append(dsk)
                devices.remove(dsk)
            disks = ','.join(selected)
@@ -468,5 +472,10 @@ if __name__ == '__main__':
            print('Please restart your machine before using ScyllaDB, as you have disabled')
            print(' SELinux.')

-        if dist_name() == 'Ubuntu':
-            run('apt-get install -y hugepages')
+        if distro.id() == 'ubuntu':
+            # Ubuntu version is 20.04 or later
+            if int(distro.major_version()) >= 20:
+                hugepkg = 'libhugetlbfs-bin'
+            else:
+                hugepkg = 'hugepages'
+            run(f'apt-get install -y {hugepkg}')
--- a/dist/common/scripts/scylla_swap_setup
+++ b/dist/common/scripts/scylla_swap_setup
@@ -40,6 +40,10 @@ if __name__ == '__main__':
        sys.exit(1)

    memtotal = get_memtotal_gb()
+    if memtotal == 0:
+        print('memory too small: {} KB'.format(get_memtotal()))
+        sys.exit(1)
+
    # Scylla document says 'swap size should be set to either total_mem/3 or
    # 16GB - lower of the two', so we need to compare 16g vs memtotal/3 and
    # choose lower one
--- a/dist/common/scripts/scylla_util.py
+++ b/dist/common/scripts/scylla_util.py
@@ -98,7 +98,7 @@ def curl(url, byte=False):
                    return res.read()
                else:
                    return res.read().decode('utf-8')
-        except urllib.error.HTTPError:
+        except urllib.error.URLError:
            logging.warn("Failed to grab %s..." % url)
            time.sleep(5)
            retries += 1
@@ -133,6 +133,8 @@ class aws_instance:
            raise Exception("found more than one disk mounted at root'".format(root_dev_candidates))

        root_dev = root_dev_candidates[0].device
+        if root_dev == '/dev/root':
+            root_dev = run('findmnt -n -o SOURCE /', shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
        nvmes_present = list(filter(nvme_re.match, os.listdir("/dev")))
        return {"root": [ root_dev ], "ephemeral": [ x for x in nvmes_present if not root_dev.startswith(os.path.join("/dev/", x)) ] }

@@ -184,7 +186,7 @@ class aws_instance:
        instance_size = self.instance_size()
        if instance_class in ['c3', 'c4', 'd2', 'i2', 'r3']:
            return 'ixgbevf'
-        if instance_class in ['a1', 'c5', 'c5d', 'f1', 'g3', 'g4', 'h1', 'i3', 'i3en', 'inf1', 'm5', 'm5a', 'm5ad', 'm5d', 'm5dn', 'm5n', 'm6g', 'p2', 'p3', 'r4', 'r5', 'r5a', 'r5ad', 'r5d', 'r5dn', 'r5n', 't3', 't3a', 'u-6tb1', 'u-9tb1', 'u-12tb1', 'u-18tn1', 'u-24tb1', 'x1', 'x1e', 'z1d']:
+        if instance_class in ['a1', 'c5', 'c5a', 'c5d', 'c5n', 'c6g', 'c6gd', 'f1', 'g3', 'g4', 'h1', 'i3', 'i3en', 'inf1', 'm5', 'm5a', 'm5ad', 'm5d', 'm5dn', 'm5n', 'm6g', 'm6gd', 'p2', 'p3', 'r4', 'r5', 'r5a', 'r5ad', 'r5d', 'r5dn', 'r5n', 't3', 't3a', 'u-6tb1', 'u-9tb1', 'u-12tb1', 'u-18tn1', 'u-24tb1', 'x1', 'x1e', 'z1d']:
            return 'ena'
        if instance_class == 'm4':
            if instance_size == '16xlarge':
@@ -221,7 +223,7 @@ class aws_instance:

    def ebs_disks(self):
        """Returns all EBS disks"""
-        return set(self._disks["ephemeral"])
+        return set(self._disks["ebs"])

    def public_ipv4(self):
        """Returns the public IPv4 address of this instance"""
@@ -329,9 +331,7 @@ class scylla_cpuinfo:
            return len(self._cpu_data["system"])


-# When a CLI tool is not installed, use relocatable CLI tool provided by Scylla
 scylla_env = os.environ.copy()
-scylla_env['PATH'] =  '{}:{}'.format(scylla_env['PATH'], scyllabindir())

 def run(cmd, shell=False, silent=False, exception=True):
    stdout = subprocess.DEVNULL if silent else None
@@ -446,6 +446,19 @@ def dist_ver():
    return distro.version()


+SYSTEM_PARTITION_UUIDS = [
+        '21686148-6449-6e6f-744e-656564454649', # BIOS boot partition
+        'c12a7328-f81f-11d2-ba4b-00a0c93ec93b', # EFI system partition
+        '024dee41-33e7-11d3-9d69-0008c781f39f'  # MBR partition scheme
+]
+
+def get_partition_uuid(dev):
+    return out(f'lsblk -n -oPARTTYPE {dev}')
+
+def is_system_partition(dev):
+    uuid = get_partition_uuid(dev)
+    return (uuid in SYSTEM_PARTITION_UUIDS)
+
 def is_unused_disk(dev):
    # dev is not in /sys/class/block/, like /dev/nvme[0-9]+
    if not os.path.isdir('/sys/class/block/{dev}'.format(dev=dev.replace('/dev/', ''))):
@@ -453,7 +466,8 @@ def is_unused_disk(dev):
    try:
        fd = os.open(dev, os.O_EXCL)
        os.close(fd)
-        return True
+        # dev is not reserved for system
+        return not is_system_partition(dev)
    except OSError:
        return False

--- a/dist/common/sysctl.d/99-scylla-inotify.conf
+++ b/dist/common/sysctl.d/99-scylla-inotify.conf
@@ -0,0 +1,4 @@
+# allocate enough inotify instances for large machines
+# each tls instance needs 1 inotify instance, and there can be
+# multiple tls instances per shard.
+fs.inotify.max_user_instances = 1200
--- a/dist/debian/debian/rules
+++ b/dist/debian/debian/rules
@@ -39,6 +39,7 @@ override_dh_strip:
 	# The binaries (ethtool...patchelf) don't pass dh_strip after going through patchelf. Since they are
 	# already stripped, nothing is lost if we exclude them, so that's what we do.
 	dh_strip -Xlibprotobuf.so.15 -Xld.so -Xethtool -Xgawk -Xgzip -Xhwloc-calc -Xhwloc-distrib -Xifconfig -Xlscpu -Xnetstat -Xpatchelf --dbg-package=$(product)-server-dbg
+	find $(CURDIR)/debian/$(product)-server-dbg/usr/lib/debug/.build-id/ -name "*.debug" -exec objcopy --decompress-debug-sections {} \;

 override_dh_makeshlibs:

--- a/dist/debian/debian/scylla-kernel-conf.postinst
+++ b/dist/debian/debian/scylla-kernel-conf.postinst
@@ -11,6 +11,7 @@ else
    sysctl -p/usr/lib/sysctl.d/99-scylla-sched.conf || :
    sysctl -p/usr/lib/sysctl.d/99-scylla-aio.conf || :
    sysctl -p/usr/lib/sysctl.d/99-scylla-vm.conf || :
+    sysctl -p/usr/lib/sysctl.d/99-scylla-inotify.conf || :
 fi

 #DEBHELPER#
--- a/dist/docker/redhat/Dockerfile
+++ b/dist/docker/redhat/Dockerfile
@@ -5,8 +5,8 @@ MAINTAINER Avi Kivity <avi@cloudius-systems.com>
 ENV container docker

 # The SCYLLA_REPO_URL argument specifies the URL to the RPM repository this Docker image uses to install Scylla. The default value is the Scylla's unstable RPM repository, which contains the daily build.
-ARG SCYLLA_REPO_URL=http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo
-ARG VERSION=666.development
+ARG SCYLLA_REPO_URL=http://downloads.scylladb.com/rpm/unstable/centos/scylla-4.2/latest/scylla.repo
+ARG VERSION=4.2

 ADD scylla_bashrc /scylla_bashrc

--- a/dist/redhat/scylla.spec
+++ b/dist/redhat/scylla.spec
@@ -129,10 +129,9 @@ rm -rf $RPM_BUILD_ROOT
 %attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla-housekeeping
 %ghost /etc/systemd/system/scylla-helper.slice.d/
 %ghost /etc/systemd/system/scylla-helper.slice.d/memory.conf
-%ghost /etc/systemd/system/scylla-server.service.d/
 %ghost /etc/systemd/system/scylla-server.service.d/capabilities.conf
 %ghost /etc/systemd/system/scylla-server.service.d/mounts.conf
-%ghost /etc/systemd/system/scylla-server.service.d/dependencies.conf
+/etc/systemd/system/scylla-server.service.d/dependencies.conf
 %ghost /etc/systemd/system/var-lib-systemd-coredump.mount
 %ghost /etc/systemd/system/scylla-cpupower.service

@@ -189,6 +188,8 @@ Summary:        Scylla configuration package for the Linux kernel
 License:        AGPLv3
 URL:            http://www.scylladb.com/
 Requires:       kmod
+# tuned overwrites our sysctl settings
+Obsoletes:	tuned

 %description kernel-conf
 This package contains Linux kernel configuration changes for the Scylla database.  Install this package
@@ -200,6 +201,7 @@ if Scylla is the main application on your server and you wish to optimize its la
 /usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
 /usr/lib/systemd/systemd-sysctl 99-scylla-aio.conf >/dev/null 2>&1 || :
 /usr/lib/systemd/systemd-sysctl 99-scylla-vm.conf >/dev/null 2>&1 || :
+/usr/lib/systemd/systemd-sysctl 99-scylla-inotify.conf >/dev/null 2>&1 || :

 %files kernel-conf
 %defattr(-,root,root)
--- a/docs/alternator/alternator.md
+++ b/docs/alternator/alternator.md
@@ -25,6 +25,15 @@ By default, Scylla listens on this port on all network interfaces.
 To listen only on a specific interface, pass also an "`alternator-address`"
 option.

+In addition to (or instead of) serving HTTP requests on `alternator-port`,
+Scylla can accept DynamoDB API requests over HTTPS (encrypted), on the port
+specified by `alternator-https-port`. As usual for HTTPS servers, the
+operator must specify certificate and key files. By default these should
+be placed in `/etc/scylla/scylla.crt` and `/etc/scylla/scylla.key`, but
+these default locations can overridden by specifying
+`--alternator-encryption-options keyfile="..."` and
+`--alternator-encryption-options certificate="..."`.
+
 As we explain below in the "Write isolation policies", Alternator has
 four different choices for the implementation of writes, each with
 different advantages. You should consider which of the options makes
--- a/docs/protocols.md
+++ b/docs/protocols.md
@@ -137,10 +137,9 @@ TODO: is there an SSL version of Thrift?

 # DynamoDB client protocol

-Scylla also supports, as an experimental feature, Amazon's DynamoDB API.
-The DynamoDB API is a JSON over HTTP (unencrypted) or HTTPS (encrypted)
-protocol. Because Scylla's support for this protocol is experimental,
-it is not turned on by default, and must be turned on manually by setting
+Scylla also supports Amazon's DynamoDB API. The DynamoDB API is a JSON over
+HTTP (unencrypted) or HTTPS (encrypted) protocol. Support for this protocol
+is not turned on by default, and must be turned on manually by setting
 the `alternator_port` and/or `alternator_https_port` configuration option.
 "Alternator" is the codename of Scylla's DynamoDB API support, and is
 documented in more detail in [alternator.md](alternator/alternator.md).
@@ -153,6 +152,13 @@ There is also an `alternator_address` configuration option to set the IP
 address (and therefore network interface) on which Scylla should listen
 for the DynamoDB protocol. This address defaults to 0.0.0.0.

+When the HTTPS-based protocol is enabled, the server also needs to know
+the certificate and key files to use. The default locations of these files
+are `/etc/scylla/scylla.crt` and `/etc/scylla/scylla.key` respectively, but
+can be overridden by specifying in `alternator_encryption_options` the
+`keyfile` and `certificate` options. For example,
+`--alternator-encryption-options keyfile="..."`.
+
 # Redis client protocol

 Scylla also has partial and experimental support for the Redis API.
--- a/flat_mutation_reader.hh
+++ b/flat_mutation_reader.hh
@@ -468,6 +468,9 @@ public:
    size_t buffer_size() const {
        return _impl->buffer_size();
    }
+    const circular_buffer<mutation_fragment>& buffer() const {
+        return _impl->buffer();
+    }
    // Detach the internal buffer of the reader.
    // Roughly equivalent to depleting it by calling pop_mutation_fragment()
    // until is_buffer_empty() returns true.
--- a/gms/feature.hh
+++ b/gms/feature.hh
@@ -141,6 +141,7 @@ extern const std::string_view HINTED_HANDOFF_SEPARATE_CONNECTION;
 extern const std::string_view LWT;
 extern const std::string_view PER_TABLE_PARTITIONERS;
 extern const std::string_view PER_TABLE_CACHING;
+extern const std::string_view DIGEST_FOR_NULL_VALUES;

 }

--- a/gms/feature_service.cc
+++ b/gms/feature_service.cc
@@ -56,6 +56,7 @@ constexpr std::string_view features::HINTED_HANDOFF_SEPARATE_CONNECTION = "HINTE
 constexpr std::string_view features::LWT = "LWT";
 constexpr std::string_view features::PER_TABLE_PARTITIONERS = "PER_TABLE_PARTITIONERS";
 constexpr std::string_view features::PER_TABLE_CACHING = "PER_TABLE_CACHING";
+constexpr std::string_view features::DIGEST_FOR_NULL_VALUES = "DIGEST_FOR_NULL_VALUES";

 static logging::logger logger("features");

@@ -90,8 +91,9 @@ feature_service::feature_service(feature_config cfg) : _config(cfg)
        , _hinted_handoff_separate_connection(*this, features::HINTED_HANDOFF_SEPARATE_CONNECTION)
        , _lwt_feature(*this, features::LWT)
        , _per_table_partitioners_feature(*this, features::PER_TABLE_PARTITIONERS)
-        , _per_table_caching_feature(*this, features::PER_TABLE_CACHING) {
-}
+        , _per_table_caching_feature(*this, features::PER_TABLE_CACHING)
+        , _digest_for_null_values_feature(*this, features::DIGEST_FOR_NULL_VALUES)
+{}

 feature_config feature_config_from_db_config(db::config& cfg, std::set<sstring> disabled) {
    feature_config fcfg;
@@ -179,6 +181,7 @@ std::set<std::string_view> feature_service::known_feature_set() {
        gms::features::MC_SSTABLE,
        gms::features::UDF,
        gms::features::CDC,
+        gms::features::DIGEST_FOR_NULL_VALUES,
    };

    for (const sstring& s : _config._disabled_features) {
@@ -269,6 +272,7 @@ void feature_service::enable(const std::set<std::string_view>& list) {
        std::ref(_lwt_feature),
        std::ref(_per_table_partitioners_feature),
        std::ref(_per_table_caching_feature),
+        std::ref(_digest_for_null_values_feature),
    })
    {
        if (list.count(f.name())) {
--- a/gms/feature_service.hh
+++ b/gms/feature_service.hh
@@ -103,6 +103,7 @@ private:
    gms::feature _lwt_feature;
    gms::feature _per_table_partitioners_feature;
    gms::feature _per_table_caching_feature;
+    gms::feature _digest_for_null_values_feature;

 public:
    bool cluster_supports_range_tombstones() const {
@@ -177,6 +178,10 @@ public:
        return _per_table_caching_feature;
    }

+    const feature& cluster_supports_digest_for_null_values() const {
+        return _digest_for_null_values_feature;
+    }
+
    bool cluster_supports_row_level_repair() const {
        return bool(_row_level_repair_feature);
    }
--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -428,6 +428,7 @@ future<> gossiper::handle_shutdown_msg(inet_address from) {
        return make_ready_future<>();
    }
    return seastar::async([this, from] {
+        auto permit = this->lock_endpoint(from).get0();
        this->mark_as_shutdown(from);
    });
 }
--- a/install-dependencies.sh
+++ b/install-dependencies.sh
@@ -98,6 +98,7 @@ fedora_packages=(
    debhelper
    fakeroot
    file
+    dpkg-dev
 )

 centos_packages=(
--- a/install.sh
+++ b/install.sh
@@ -132,6 +132,7 @@ relocate_python3() {
    cp "$script" "$relocateddir"
    cat > "$install"<<EOF
 #!/usr/bin/env bash
+export LC_ALL=en_US.UTF-8
 x="\$(readlink -f "\$0")"
 b="\$(basename "\$x")"
 d="\$(dirname "\$x")"
@@ -143,7 +144,7 @@ DEBIAN_SSL_CERT_FILE="/etc/ssl/certs/ca-certificates.crt"
 if [ -f "\${DEBIAN_SSL_CERT_FILE}" ]; then
  c=\${DEBIAN_SSL_CERT_FILE}
 fi
-PYTHONPATH="\${d}:\${d}/libexec:\$PYTHONPATH" PATH="\${d}/$pythonpath:\${PATH}" SSL_CERT_FILE="\${c}" exec -a "\$0" "\${d}/libexec/\${b}" "\$@"
+PYTHONPATH="\${d}:\${d}/libexec:\$PYTHONPATH" PATH="\${d}/../bin:\${d}/$pythonpath:\${PATH}" SSL_CERT_FILE="\${c}" exec -a "\$0" "\${d}/libexec/\${b}" "\$@"
 EOF
    chmod +x "$install"
 }
@@ -378,5 +379,9 @@ elif ! $packaging; then
    chown -R scylla:scylla $rdata
    chown -R scylla:scylla $rhkdata

+    for file in dist/common/sysctl.d/*.conf; do
+        bn=$(basename "$file")
+        sysctl -p "$rusr"/lib/sysctl.d/"$bn"
+    done
    $rprefix/scripts/scylla_post_install.sh
 fi
--- a/locator/abstract_replication_strategy.cc
+++ b/locator/abstract_replication_strategy.cc
@@ -168,15 +168,33 @@ insert_token_range_to_sorted_container_while_unwrapping(

 dht::token_range_vector
 abstract_replication_strategy::get_ranges(inet_address ep) const {
-    return get_ranges(ep, _token_metadata);
+    return do_get_ranges(ep, _token_metadata, false);
+}
+
+dht::token_range_vector
+abstract_replication_strategy::get_ranges_in_thread(inet_address ep) const {
+    return do_get_ranges(ep, _token_metadata, true);
 }

 dht::token_range_vector
 abstract_replication_strategy::get_ranges(inet_address ep, token_metadata& tm) const {
+    return do_get_ranges(ep, tm, false);
+}
+
+dht::token_range_vector
+abstract_replication_strategy::get_ranges_in_thread(inet_address ep, token_metadata& tm) const {
+    return do_get_ranges(ep, tm, true);
+}
+
+dht::token_range_vector
+abstract_replication_strategy::do_get_ranges(inet_address ep, token_metadata& tm, bool can_yield) const {
    dht::token_range_vector ret;
    auto prev_tok = tm.sorted_tokens().back();
    for (auto tok : tm.sorted_tokens()) {
        for (inet_address a : calculate_natural_endpoints(tok, tm)) {
+            if (can_yield) {
+                seastar::thread::maybe_yield();
+            }
            if (a == ep) {
                insert_token_range_to_sorted_container_while_unwrapping(prev_tok, tok, ret);
                break;
--- a/locator/abstract_replication_strategy.hh
+++ b/locator/abstract_replication_strategy.hh
@@ -113,10 +113,15 @@ public:
    // It the analogue of Origin's getAddressRanges().get(endpoint).
    // This function is not efficient, and not meant for the fast path.
    dht::token_range_vector get_ranges(inet_address ep) const;
+    dht::token_range_vector get_ranges_in_thread(inet_address ep) const;

    // Use the token_metadata provided by the caller instead of _token_metadata
    dht::token_range_vector get_ranges(inet_address ep, token_metadata& tm) const;
+    dht::token_range_vector get_ranges_in_thread(inet_address ep, token_metadata& tm) const;
+private:
+    dht::token_range_vector do_get_ranges(inet_address ep, token_metadata& tm, bool can_yield) const;

+public:
    // get_primary_ranges() returns the list of "primary ranges" for the given
    // endpoint. "Primary ranges" are the ranges that the node is responsible
    // for storing replica primarily, which means this is the first node
--- a/lua.cc
+++ b/lua.cc
@@ -262,14 +262,12 @@ static auto visit_lua_raw_value(lua_State* l, int index, Func&& f) {

 template <typename Func>
 static auto visit_decimal(const big_decimal &v, Func&& f) {
-    boost::multiprecision::cpp_int ten(10);
-    const auto& dividend = v.unscaled_value();
-    auto divisor = boost::multiprecision::pow(ten, v.scale());
+    boost::multiprecision::cpp_rational r = v.as_rational();
+    const boost::multiprecision::cpp_int& dividend = numerator(r);
+    const boost::multiprecision::cpp_int& divisor = denominator(r);
    if (dividend % divisor == 0) {
-        return f(utils::multiprecision_int(boost::multiprecision::cpp_int(dividend/divisor)));
+        return f(utils::multiprecision_int(dividend/divisor));
    }
-    boost::multiprecision::cpp_rational r = dividend;
-    r /= divisor;
    return f(r.convert_to<double>());
 }

--- a/main.cc
+++ b/main.cc
@@ -830,6 +830,7 @@ int main(int ac, char** av) {
            storage_proxy_smp_service_group_config.max_nonlocal_requests = 5000;
            spcfg.read_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get0();
            spcfg.write_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get0();
+            spcfg.hints_write_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get0();
            spcfg.write_ack_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get0();
            static db::view::node_update_backlog node_backlog(smp::count, 10ms);
            scheduling_group_key_config storage_proxy_stats_cfg =
@@ -967,12 +968,16 @@ int main(int ac, char** av) {
                mm.init_messaging_service();
            }).get();
            supervisor::notify("initializing storage proxy RPC verbs");
-            proxy.invoke_on_all([] (service::storage_proxy& p) {
-                p.init_messaging_service();
-            }).get();
+            proxy.invoke_on_all(&service::storage_proxy::init_messaging_service).get();
+            auto stop_proxy_handlers = defer_verbose_shutdown("storage proxy RPC verbs", [&proxy] {
+                proxy.invoke_on_all(&service::storage_proxy::uninit_messaging_service).get();
+            });

            supervisor::notify("starting streaming service");
            streaming::stream_session::init_streaming_service(db, sys_dist_ks, view_update_generator).get();
+            auto stop_streaming_service = defer_verbose_shutdown("streaming service", [] {
+                streaming::stream_session::uninit_streaming_service().get();
+            });
            api::set_server_stream_manager(ctx).get();

            supervisor::notify("starting hinted handoff manager");
@@ -1005,6 +1010,9 @@ int main(int ac, char** av) {
                rs.stop().get();
            });
            repair_init_messaging_service_handler(rs, sys_dist_ks, view_update_generator).get();
+            auto stop_repair_messages = defer_verbose_shutdown("repair message handlers", [] {
+                repair_uninit_messaging_service_handler().get();
+            });
            supervisor::notify("starting storage service", true);
            auto& ss = service::get_local_storage_service();
            ss.init_messaging_service_part().get();
@@ -1196,22 +1204,30 @@ int main(int ac, char** av) {
                std::optional<uint16_t> alternator_https_port;
                std::optional<tls::credentials_builder> creds;
                if (cfg->alternator_https_port()) {
-                    creds.emplace();
                    alternator_https_port = cfg->alternator_https_port();
-                    creds->set_dh_level(tls::dh_params::level::MEDIUM);
-                    creds->set_x509_key_file(cert, key, tls::x509_crt_format::PEM).get();
-                    if (trust_store.empty()) {
-                        creds->set_system_trust().get();
-                    } else {
-                        creds->set_x509_trust_file(trust_store, tls::x509_crt_format::PEM).get();
+                    creds.emplace();
+                    auto opts = cfg->alternator_encryption_options();
+                    if (opts.empty()) {
+                        // Earlier versions mistakenly configured Alternator's
+                        // HTTPS parameters via the "server_encryption_option"
+                        // configuration parameter. We *temporarily* continue
+                        // to allow this, for backward compatibility.
+                        opts = cfg->server_encryption_options();
+                        if (!opts.empty()) {
+                            startlog.warn("Setting server_encryption_options to configure "
+                                    "Alternator's HTTPS encryption is deprecated. Please "
+                                    "switch to setting alternator_encryption_options instead.");
+                        }
                    }
+                    creds->set_dh_level(tls::dh_params::level::MEDIUM);
+                    auto cert = get_or_default(opts, "certificate", db::config::get_conf_sub("scylla.crt").string());
+                    auto key = get_or_default(opts, "keyfile", db::config::get_conf_sub("scylla.key").string());
+                    creds->set_x509_key_file(cert, key, tls::x509_crt_format::PEM).get();
+                    auto prio = get_or_default(opts, "priority_string", sstring());
                    creds->set_priority_string(db::config::default_tls_priority);
                    if (!prio.empty()) {
                        creds->set_priority_string(prio);
                    }
-                    if (clauth) {
-                        creds->set_client_auth(seastar::tls::client_auth::REQUIRE);
-                    }
                }
                bool alternator_enforce_authorization = cfg->alternator_enforce_authorization();
                with_scheduling_group(dbcfg.statement_scheduling_group,
--- a/message/messaging_service.cc
+++ b/message/messaging_service.cc
@@ -572,7 +572,12 @@ messaging_service::initial_scheduling_info() const {

 scheduling_group
 messaging_service::scheduling_group_for_verb(messaging_verb verb) const {
-    return _scheduling_info_for_connection_index[get_rpc_client_idx(verb)].sched_group;
+    // We are not using get_rpc_client_idx() because it figures out the client
+    // index based on the current scheduling group, which is relevant when
+    // selecting the right client for sending a message, but is not relevant
+    // when registering handlers.
+    const auto idx = s_rpc_client_idx_table[static_cast<size_t>(verb)];
+    return _scheduling_info_for_connection_index[idx].sched_group;
 }

 scheduling_group
@@ -791,6 +796,10 @@ void messaging_service::register_stream_mutation_fragments(std::function<future<
    register_handler(this, messaging_verb::STREAM_MUTATION_FRAGMENTS, std::move(func));
 }

+future<> messaging_service::unregister_stream_mutation_fragments() {
+    return unregister_handler(messaging_verb::STREAM_MUTATION_FRAGMENTS);
+}
+
 template<class SinkType, class SourceType>
 future<rpc::sink<SinkType>, rpc::source<SourceType>>
 do_make_sink_source(messaging_verb verb, uint32_t repair_meta_id, shared_ptr<messaging_service::rpc_protocol_client_wrapper> rpc_client, std::unique_ptr<messaging_service::rpc_protocol_wrapper>& rpc) {
@@ -822,6 +831,9 @@ rpc::sink<repair_row_on_wire_with_cmd> messaging_service::make_sink_for_repair_g
 void messaging_service::register_repair_get_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_row_on_wire_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_hash_with_cmd> source)>&& func) {
    register_handler(this, messaging_verb::REPAIR_GET_ROW_DIFF_WITH_RPC_STREAM, std::move(func));
 }
+future<> messaging_service::unregister_repair_get_row_diff_with_rpc_stream() {
+    return unregister_handler(messaging_verb::REPAIR_GET_ROW_DIFF_WITH_RPC_STREAM);
+}

 // Wrapper for REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM
 future<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_stream_cmd>>
@@ -841,6 +853,9 @@ rpc::sink<repair_stream_cmd> messaging_service::make_sink_for_repair_put_row_dif
 void messaging_service::register_repair_put_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_stream_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_row_on_wire_with_cmd> source)>&& func) {
    register_handler(this, messaging_verb::REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM, std::move(func));
 }
+future<> messaging_service::unregister_repair_put_row_diff_with_rpc_stream() {
+    return unregister_handler(messaging_verb::REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM);
+}

 // Wrapper for REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM
 future<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd>>
@@ -860,6 +875,9 @@ rpc::sink<repair_hash_with_cmd> messaging_service::make_sink_for_repair_get_full
 void messaging_service::register_repair_get_full_row_hashes_with_rpc_stream(std::function<future<rpc::sink<repair_hash_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_stream_cmd> source)>&& func) {
    register_handler(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM, std::move(func));
 }
+future<> messaging_service::unregister_repair_get_full_row_hashes_with_rpc_stream() {
+    return unregister_handler(messaging_verb::REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM);
+}

 // Send a message for verb
 template <typename MsgIn, typename... MsgOut>
@@ -943,6 +961,9 @@ future<streaming::prepare_message> messaging_service::send_prepare_message(msg_a
    return send_message<streaming::prepare_message>(this, messaging_verb::PREPARE_MESSAGE, id,
        std::move(msg), plan_id, std::move(description), reason);
 }
+future<> messaging_service::unregister_prepare_message() {
+    return unregister_handler(messaging_verb::PREPARE_MESSAGE);
+}

 // PREPARE_DONE_MESSAGE
 void messaging_service::register_prepare_done_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func) {
@@ -952,6 +973,9 @@ future<> messaging_service::send_prepare_done_message(msg_addr id, UUID plan_id,
    return send_message<void>(this, messaging_verb::PREPARE_DONE_MESSAGE, id,
        plan_id, dst_cpu_id);
 }
+future<> messaging_service::unregister_prepare_done_message() {
+    return unregister_handler(messaging_verb::PREPARE_DONE_MESSAGE);
+}

 // STREAM_MUTATION
 void messaging_service::register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool> fragmented, rpc::optional<streaming::stream_reason> reason)>&& func) {
@@ -976,6 +1000,9 @@ future<> messaging_service::send_stream_mutation_done(msg_addr id, UUID plan_id,
    return send_message<void>(this, messaging_verb::STREAM_MUTATION_DONE, id,
        plan_id, std::move(ranges), cf_id, dst_cpu_id);
 }
+future<> messaging_service::unregister_stream_mutation_done() {
+    return unregister_handler(messaging_verb::STREAM_MUTATION_DONE);
+}

 // COMPLETE_MESSAGE
 void messaging_service::register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id, rpc::optional<bool> failed)>&& func) {
@@ -985,6 +1012,9 @@ future<> messaging_service::send_complete_message(msg_addr id, UUID plan_id, uns
    return send_message<void>(this, messaging_verb::COMPLETE_MESSAGE, id,
        plan_id, dst_cpu_id, failed);
 }
+future<> messaging_service::unregister_complete_message() {
+    return unregister_handler(messaging_verb::COMPLETE_MESSAGE);
+}

 void messaging_service::register_gossip_echo(std::function<future<> ()>&& func) {
    register_handler(this, messaging_verb::GOSSIP_ECHO, std::move(func));
@@ -1199,14 +1229,14 @@ future<partition_checksum> messaging_service::send_repair_checksum_range(
 }

 // Wrapper for REPAIR_GET_FULL_ROW_HASHES
-void messaging_service::register_repair_get_full_row_hashes(std::function<future<std::unordered_set<repair_hash>> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func) {
+void messaging_service::register_repair_get_full_row_hashes(std::function<future<repair_hash_set> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func) {
    register_handler(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(func));
 }
 future<> messaging_service::unregister_repair_get_full_row_hashes() {
    return unregister_handler(messaging_verb::REPAIR_GET_FULL_ROW_HASHES);
 }
-future<std::unordered_set<repair_hash>> messaging_service::send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id) {
-    return send_message<future<std::unordered_set<repair_hash>>>(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(id), repair_meta_id);
+future<repair_hash_set> messaging_service::send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id) {
+    return send_message<future<repair_hash_set>>(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(id), repair_meta_id);
 }

 // Wrapper for REPAIR_GET_COMBINED_ROW_HASH
@@ -1231,13 +1261,13 @@ future<get_sync_boundary_response> messaging_service::send_repair_get_sync_bound
 }

 // Wrapper for REPAIR_GET_ROW_DIFF
-void messaging_service::register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows)>&& func) {
+void messaging_service::register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows)>&& func) {
    register_handler(this, messaging_verb::REPAIR_GET_ROW_DIFF, std::move(func));
 }
 future<> messaging_service::unregister_repair_get_row_diff() {
    return unregister_handler(messaging_verb::REPAIR_GET_ROW_DIFF);
 }
-future<repair_rows_on_wire> messaging_service::send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows) {
+future<repair_rows_on_wire> messaging_service::send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows) {
    return send_message<future<repair_rows_on_wire>>(this, messaging_verb::REPAIR_GET_ROW_DIFF, std::move(id), repair_meta_id, std::move(set_diff), needs_all_rows);
 }

--- a/message/messaging_service.hh
+++ b/message/messaging_service.hh
@@ -297,10 +297,12 @@ public:
            streaming::prepare_message msg, UUID plan_id, sstring description, rpc::optional<streaming::stream_reason> reason)>&& func);
    future<streaming::prepare_message> send_prepare_message(msg_addr id, streaming::prepare_message msg, UUID plan_id,
            sstring description, streaming::stream_reason);
+    future<> unregister_prepare_message();

    // Wrapper for PREPARE_DONE_MESSAGE verb
    void register_prepare_done_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func);
    future<> send_prepare_done_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id);
+    future<> unregister_prepare_done_message();

    // Wrapper for STREAM_MUTATION verb
    void register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool>, rpc::optional<streaming::stream_reason>)>&& func);
@@ -309,6 +311,7 @@ public:
    // Wrapper for STREAM_MUTATION_FRAGMENTS
    // The receiver of STREAM_MUTATION_FRAGMENTS sends status code to the sender to notify any error on the receiver side. The status code is of type int32_t. 0 means successful, -1 means error, other status code value are reserved for future use.
    void register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason> reason_opt, rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>> source)>&& func);
+    future<> unregister_stream_mutation_fragments();
    rpc::sink<int32_t> make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>>& source);
    future<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>> make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id);

@@ -316,22 +319,27 @@ public:
    future<rpc::sink<repair_hash_with_cmd>, rpc::source<repair_row_on_wire_with_cmd>> make_sink_and_source_for_repair_get_row_diff_with_rpc_stream(uint32_t repair_meta_id, msg_addr id);
    rpc::sink<repair_row_on_wire_with_cmd> make_sink_for_repair_get_row_diff_with_rpc_stream(rpc::source<repair_hash_with_cmd>& source);
    void register_repair_get_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_row_on_wire_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_hash_with_cmd> source)>&& func);
+    future<> unregister_repair_get_row_diff_with_rpc_stream();

    // Wrapper for REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM
    future<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_stream_cmd>> make_sink_and_source_for_repair_put_row_diff_with_rpc_stream(uint32_t repair_meta_id, msg_addr id);
    rpc::sink<repair_stream_cmd> make_sink_for_repair_put_row_diff_with_rpc_stream(rpc::source<repair_row_on_wire_with_cmd>& source);
    void register_repair_put_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_stream_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_row_on_wire_with_cmd> source)>&& func);
+    future<> unregister_repair_put_row_diff_with_rpc_stream();

    // Wrapper for REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM
    future<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd>> make_sink_and_source_for_repair_get_full_row_hashes_with_rpc_stream(uint32_t repair_meta_id, msg_addr id);
    rpc::sink<repair_hash_with_cmd> make_sink_for_repair_get_full_row_hashes_with_rpc_stream(rpc::source<repair_stream_cmd>& source);
    void register_repair_get_full_row_hashes_with_rpc_stream(std::function<future<rpc::sink<repair_hash_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_stream_cmd> source)>&& func);
+    future<> unregister_repair_get_full_row_hashes_with_rpc_stream();

    void register_stream_mutation_done(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id)>&& func);
    future<> send_stream_mutation_done(msg_addr id, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id);
+    future<> unregister_stream_mutation_done();

    void register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id, rpc::optional<bool> failed)>&& func);
    future<> send_complete_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id, bool failed = false);
+    future<> unregister_complete_message();

    // Wrapper for REPAIR_CHECKSUM_RANGE verb
    void register_repair_checksum_range(std::function<future<partition_checksum> (sstring keyspace, sstring cf, dht::token_range range, rpc::optional<repair_checksum> hash_version)>&& func);
@@ -339,9 +347,9 @@ public:
    future<partition_checksum> send_repair_checksum_range(msg_addr id, sstring keyspace, sstring cf, dht::token_range range, repair_checksum hash_version);

    // Wrapper for REPAIR_GET_FULL_ROW_HASHES
-    void register_repair_get_full_row_hashes(std::function<future<std::unordered_set<repair_hash>> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func);
+    void register_repair_get_full_row_hashes(std::function<future<repair_hash_set> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func);
    future<> unregister_repair_get_full_row_hashes();
-    future<std::unordered_set<repair_hash>> send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id);
+    future<repair_hash_set> send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id);

    // Wrapper for REPAIR_GET_COMBINED_ROW_HASH
    void register_repair_get_combined_row_hash(std::function<future<get_combined_row_hash_response> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::optional<repair_sync_boundary> common_sync_boundary)>&& func);
@@ -354,9 +362,9 @@ public:
    future<get_sync_boundary_response> send_repair_get_sync_boundary(msg_addr id, uint32_t repair_meta_id, std::optional<repair_sync_boundary> skipped_sync_boundary);

    // Wrapper for REPAIR_GET_ROW_DIFF
-    void register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows)>&& func);
+    void register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows)>&& func);
    future<> unregister_repair_get_row_diff();
-    future<repair_rows_on_wire> send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows);
+    future<repair_rows_on_wire> send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows);

    // Wrapper for REPAIR_PUT_ROW_DIFF
    void register_repair_put_row_diff(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_rows_on_wire row_diff)>&& func);
--- a/multishard_mutation_query.cc
+++ b/multishard_mutation_query.cc
@@ -300,10 +300,9 @@ flat_mutation_reader read_context::create_reader(
    }

    auto& table = _db.local().find_column_family(schema);
-    auto class_config = _db.local().make_query_class_config();

    if (!rm.rparts) {
-        rm.rparts = make_foreign(std::make_unique<reader_meta::remote_parts>(class_config.semaphore));
+        rm.rparts = make_foreign(std::make_unique<reader_meta::remote_parts>(semaphore()));
    }

    rm.rparts->range = std::make_unique<const dht::partition_range>(pr);
@@ -513,18 +512,28 @@ future<> read_context::lookup_readers() {
    }

    return parallel_for_each(boost::irange(0u, smp::count), [this] (shard_id shard) {
-        return _db.invoke_on(shard, [shard, cmd = &_cmd, ranges = &_ranges, gs = global_schema_ptr(_schema),
+        return _db.invoke_on(shard, [this, shard, cmd = &_cmd, ranges = &_ranges, gs = global_schema_ptr(_schema),
                gts = tracing::global_trace_state_ptr(_trace_state)] (database& db) mutable {
            auto schema = gs.get();
            auto querier_opt = db.get_querier_cache().lookup_shard_mutation_querier(cmd->query_uuid, *schema, *ranges, cmd->slice, gts.get());
            auto& table = db.find_column_family(schema);
-            auto& semaphore = db.make_query_class_config().semaphore;
+            auto& semaphore = this->semaphore();

            if (!querier_opt) {
                return reader_meta(reader_state::inexistent, reader_meta::remote_parts(semaphore));
            }

            auto& q = *querier_opt;
+
+            if (&q.permit().semaphore() != &semaphore) {
+                on_internal_error(mmq_log, format("looked-up reader belongs to different semaphore than the one appropriate for this query class: "
+                        "looked-up reader belongs to {} (0x{:x}) the query class appropriate is {} (0x{:x})",
+                        q.permit().semaphore().name(),
+                        reinterpret_cast<uintptr_t>(&q.permit().semaphore()),
+                        semaphore.name(),
+                        reinterpret_cast<uintptr_t>(&semaphore)));
+            }
+
            auto handle = pause(semaphore, std::move(q).reader());
            return reader_meta(
                    reader_state::successful_lookup,
--- a/mutation_partition.cc
+++ b/mutation_partition.cc
@@ -734,56 +734,78 @@ void write_counter_cell(RowWriter& w, const query::partition_slice& slice, ::ato
  });
 }

-// Used to return the timestamp of the latest update to the row
-struct max_timestamp {
-    api::timestamp_type max = api::missing_timestamp;
-
-    void update(api::timestamp_type ts) {
-        max = std::max(max, ts);
-    }
-};
-
-template<>
-struct appending_hash<row> {
-    template<typename Hasher>
-    void operator()(Hasher& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const {
-        for (auto id : columns) {
-            const cell_and_hash* cell_and_hash = cells.find_cell_and_hash(id);
-            if (!cell_and_hash) {
-                return;
-            }
-            auto&& def = s.column_at(kind, id);
-            if (def.is_atomic()) {
-                max_ts.update(cell_and_hash->cell.as_atomic_cell(def).timestamp());
-                if constexpr (query::using_hash_of_hash_v<Hasher>) {
-                    if (cell_and_hash->hash) {
-                        feed_hash(h, *cell_and_hash->hash);
-                    } else {
-                        query::default_hasher cellh;
-                        feed_hash(cellh, cell_and_hash->cell.as_atomic_cell(def), def);
-                        feed_hash(h, cellh.finalize_uint64());
-                    }
+template<typename Hasher>
+void appending_hash<row>::operator()(Hasher& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const {
+    for (auto id : columns) {
+        const cell_and_hash* cell_and_hash = cells.find_cell_and_hash(id);
+        if (!cell_and_hash) {
+            feed_hash(h, appending_hash<row>::null_hash_value);
+            continue;
+        }
+        auto&& def = s.column_at(kind, id);
+        if (def.is_atomic()) {
+            max_ts.update(cell_and_hash->cell.as_atomic_cell(def).timestamp());
+            if constexpr (query::using_hash_of_hash_v<Hasher>) {
+                if (cell_and_hash->hash) {
+                    feed_hash(h, *cell_and_hash->hash);
                } else {
-                    feed_hash(h, cell_and_hash->cell.as_atomic_cell(def), def);
+                    query::default_hasher cellh;
+                    feed_hash(cellh, cell_and_hash->cell.as_atomic_cell(def), def);
+                    feed_hash(h, cellh.finalize_uint64());
                }
            } else {
-                auto cm = cell_and_hash->cell.as_collection_mutation();
-                max_ts.update(cm.last_update(*def.type));
-                if constexpr (query::using_hash_of_hash_v<Hasher>) {
-                    if (cell_and_hash->hash) {
-                        feed_hash(h, *cell_and_hash->hash);
-                    } else {
-                        query::default_hasher cellh;
-                        feed_hash(cellh, cm, def);
-                        feed_hash(h, cellh.finalize_uint64());
-                    }
+                feed_hash(h, cell_and_hash->cell.as_atomic_cell(def), def);
+            }
+        } else {
+            auto cm = cell_and_hash->cell.as_collection_mutation();
+            max_ts.update(cm.last_update(*def.type));
+            if constexpr (query::using_hash_of_hash_v<Hasher>) {
+                if (cell_and_hash->hash) {
+                    feed_hash(h, *cell_and_hash->hash);
                } else {
-                    feed_hash(h, cm, def);
+                    query::default_hasher cellh;
+                    feed_hash(cellh, cm, def);
+                    feed_hash(h, cellh.finalize_uint64());
                }
+            } else {
+                feed_hash(h, cm, def);
            }
        }
    }
-};
+}
+// Instantiation for mutation_test.cc
+template void appending_hash<row>::operator()<xx_hasher>(xx_hasher& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const;
+
+template<>
+void appending_hash<row>::operator()<legacy_xx_hasher_without_null_digest>(legacy_xx_hasher_without_null_digest& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const {
+    for (auto id : columns) {
+        const cell_and_hash* cell_and_hash = cells.find_cell_and_hash(id);
+        if (!cell_and_hash) {
+            return;
+        }
+        auto&& def = s.column_at(kind, id);
+        if (def.is_atomic()) {
+            max_ts.update(cell_and_hash->cell.as_atomic_cell(def).timestamp());
+            if (cell_and_hash->hash) {
+                feed_hash(h, *cell_and_hash->hash);
+            } else {
+                query::default_hasher cellh;
+                feed_hash(cellh, cell_and_hash->cell.as_atomic_cell(def), def);
+                feed_hash(h, cellh.finalize_uint64());
+            }
+        } else {
+            auto cm = cell_and_hash->cell.as_collection_mutation();
+            max_ts.update(cm.last_update(*def.type));
+            if (cell_and_hash->hash) {
+                feed_hash(h, *cell_and_hash->hash);
+            } else {
+                query::default_hasher cellh;
+                feed_hash(cellh, cm, def);
+                feed_hash(h, cellh.finalize_uint64());
+            }
+        }
+    }
+}

 cell_hash_opt row::cell_hash_for(column_id id) const {
    if (_type == storage_type::vector) {
@@ -1721,7 +1743,7 @@ void row::apply_monotonically(const schema& s, column_kind kind, row&& other) {
 // we erase the live cells according to the shadowable_tombstone rules.
 static bool dead_marker_shadows_row(const schema& s, column_kind kind, const row_marker& marker) {
    return s.is_view()
-            && !s.view_info()->base_non_pk_columns_in_view_pk().empty()
+            && s.view_info()->has_base_non_pk_columns_in_view_pk()
            && !marker.is_live()
            && kind == column_kind::regular_column; // not applicable to static rows
 }
--- a/mutation_partition.hh
+++ b/mutation_partition.hh
@@ -649,6 +649,22 @@ public:
    };
 };

+// Used to return the timestamp of the latest update to the row
+struct max_timestamp {
+    api::timestamp_type max = api::missing_timestamp;
+
+    void update(api::timestamp_type ts) {
+        max = std::max(max, ts);
+    }
+};
+
+template<>
+struct appending_hash<row> {
+    static constexpr int null_hash_value = 0xbeefcafe;
+    template<typename Hasher>
+    void operator()(Hasher& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const;
+};
+
 class row_marker;
 int compare_row_marker_for_merge(const row_marker& left, const row_marker& right) noexcept;

--- a/mutation_query.hh
+++ b/mutation_query.hh
@@ -114,9 +114,6 @@ class reconcilable_result_builder {
    const schema& _schema;
    const query::partition_slice& _slice;

-    utils::chunked_vector<partition> _result;
-    uint32_t _live_rows{};
-
    bool _return_static_content_on_partition_with_no_rows{};
    bool _static_row_is_alive{};
    uint32_t _total_live_rows = 0;
@@ -124,6 +121,10 @@ class reconcilable_result_builder {
    stop_iteration _stop;
    bool _short_read_allowed;
    std::optional<streamed_mutation_freezer> _mutation_consumer;
+
+    uint32_t _live_rows{};
+    // make this the last member so it is destroyed first. #7240
+    utils::chunked_vector<partition> _result;
 public:
    reconcilable_result_builder(const schema& s, const query::partition_slice& slice,
                                query::result_memory_accounter&& accounter)
--- a/mutation_reader.cc
+++ b/mutation_reader.cc
@@ -30,6 +30,7 @@
 #include "schema_registry.hh"
 #include "mutation_compactor.hh"

+logging::logger mrlog("mutation_reader");

 static constexpr size_t merger_small_vector_size = 4;

@@ -659,6 +660,8 @@ flat_mutation_reader make_combined_reader(schema_ptr schema,
    return make_combined_reader(std::move(schema), std::move(v), fwd_sm, fwd_mr);
 }

+const ssize_t new_reader_base_cost{16 * 1024};
+
 class restricting_mutation_reader : public flat_mutation_reader::impl {
    struct mutation_source_and_params {
        mutation_source _ms;
@@ -685,8 +688,6 @@ class restricting_mutation_reader : public flat_mutation_reader::impl {
    };
    std::variant<pending_state, admitted_state> _state;

-    static const ssize_t new_reader_base_cost{16 * 1024};
-
    template<typename Function>
    requires std::is_move_constructible<Function>::value
        && requires(Function fn, flat_mutation_reader& reader) {
@@ -1026,6 +1027,13 @@ private:
    bool _reader_created = false;
    bool _drop_partition_start = false;
    bool _drop_static_row = false;
+    // Trim range tombstones on the start of the buffer to the start of the read
+    // range (_next_position_in_partition). Set after reader recreation.
+    // Also validate the first not-trimmed mutation fragment's position.
+    bool _trim_range_tombstones = false;
+    // Validate the partition key of the first emitted partition, set after the
+    // reader was recreated.
+    bool _validate_partition_key = false;
    position_in_partition::tri_compare _tri_cmp;

    std::optional<dht::decorated_key> _last_pkey;
@@ -1047,7 +1055,10 @@ private:
    void adjust_partition_slice();
    flat_mutation_reader recreate_reader();
    flat_mutation_reader resume_or_create_reader();
+    void maybe_validate_partition_start(const circular_buffer<mutation_fragment>& buffer);
+    void validate_position_in_partition(position_in_partition_view pos) const;
    bool should_drop_fragment(const mutation_fragment& mf);
+    bool maybe_trim_range_tombstone(mutation_fragment& mf) const;
    future<> do_fill_buffer(flat_mutation_reader& reader, db::timeout_clock::time_point timeout);
    future<> fill_buffer(flat_mutation_reader& reader, db::timeout_clock::time_point timeout);

@@ -1120,16 +1131,11 @@ void evictable_reader::update_next_position(flat_mutation_reader& reader) {
            _next_position_in_partition = position_in_partition::before_all_clustered_rows();
            break;
        case partition_region::clustered:
-            if (reader.is_buffer_empty()) {
-                _next_position_in_partition = position_in_partition::after_key(last_pos);
-            } else {
-               const auto& next_frag = reader.peek_buffer();
-               if (next_frag.is_end_of_partition()) {
+            if (!reader.is_buffer_empty() && reader.peek_buffer().is_end_of_partition()) {
                   push_mutation_fragment(reader.pop_mutation_fragment());
                   _next_position_in_partition = position_in_partition::for_partition_start();
-               } else {
-                   _next_position_in_partition = position_in_partition(next_frag.position());
-               }
+            } else {
+                _next_position_in_partition = position_in_partition::after_key(last_pos);
            }
            break;
        case partition_region::partition_end:
@@ -1154,6 +1160,9 @@ flat_mutation_reader evictable_reader::recreate_reader() {
    const dht::partition_range* range = _pr;
    const query::partition_slice* slice = &_ps;

+    _range_override.reset();
+    _slice_override.reset();
+
    if (_last_pkey) {
        bool partition_range_is_inclusive = true;

@@ -1190,6 +1199,9 @@ flat_mutation_reader evictable_reader::recreate_reader() {
        range = &*_range_override;
    }

+    _trim_range_tombstones = true;
+    _validate_partition_key = true;
+
    return _ms.make_reader(
            _schema,
            _permit,
@@ -1216,6 +1228,78 @@ flat_mutation_reader evictable_reader::resume_or_create_reader() {
    return recreate_reader();
 }

+template <typename... Arg>
+static void require(bool condition, const char* msg, const Arg&... arg) {
+    if (!condition) {
+        on_internal_error(mrlog, format(msg, arg...));
+    }
+}
+
+void evictable_reader::maybe_validate_partition_start(const circular_buffer<mutation_fragment>& buffer) {
+    if (!_validate_partition_key || buffer.empty()) {
+        return;
+    }
+
+    // If this is set we can assume the first fragment is a partition-start.
+    const auto& ps = buffer.front().as_partition_start();
+    const auto tri_cmp = dht::ring_position_comparator(*_schema);
+    // If we recreated the reader after fast-forwarding it we won't have
+    // _last_pkey set. In this case it is enough to check if the partition
+    // is in range.
+    if (_last_pkey) {
+        const auto cmp_res = tri_cmp(*_last_pkey, ps.key());
+        if (_drop_partition_start) { // should be the same partition
+            require(
+                    cmp_res == 0,
+                    "{}(): validation failed, expected partition with key equal to _last_pkey {} due to _drop_partition_start being set, but got {}",
+                    __FUNCTION__,
+                    *_last_pkey,
+                    ps.key());
+        } else { // should be a larger partition
+            require(
+                    cmp_res < 0,
+                    "{}(): validation failed, expected partition with key larger than _last_pkey {} due to _drop_partition_start being unset, but got {}",
+                    __FUNCTION__,
+                    *_last_pkey,
+                    ps.key());
+        }
+    }
+    const auto& prange = _range_override ? *_range_override : *_pr;
+    require(
+            // TODO: somehow avoid this copy
+            prange.contains(ps.key(), tri_cmp),
+            "{}(): validation failed, expected partition with key that falls into current range {}, but got {}",
+            __FUNCTION__,
+            prange,
+            ps.key());
+
+    _validate_partition_key = false;
+}
+
+void evictable_reader::validate_position_in_partition(position_in_partition_view pos) const {
+    require(
+            _tri_cmp(_next_position_in_partition, pos) <= 0,
+            "{}(): validation failed, expected position in partition that is larger-than-equal than _next_position_in_partition {}, but got {}",
+            __FUNCTION__,
+            _next_position_in_partition,
+            pos);
+
+    if (_slice_override && pos.region() == partition_region::clustered) {
+        const auto ranges = _slice_override->row_ranges(*_schema, _last_pkey->key());
+        const bool any_contains = std::any_of(ranges.begin(), ranges.end(), [this, &pos] (const query::clustering_range& cr) {
+            // TODO: somehow avoid this copy
+            auto range = position_range(cr);
+            return range.contains(*_schema, pos);
+        });
+        require(
+                any_contains,
+                "{}(): validation failed, expected clustering fragment that is included in the slice {}, but got {}",
+                __FUNCTION__,
+                *_slice_override,
+                pos);
+    }
+}
+
 bool evictable_reader::should_drop_fragment(const mutation_fragment& mf) {
    if (_drop_partition_start && mf.is_partition_start()) {
        _drop_partition_start = false;
@@ -1228,12 +1312,50 @@ bool evictable_reader::should_drop_fragment(const mutation_fragment& mf) {
    return false;
 }

+bool evictable_reader::maybe_trim_range_tombstone(mutation_fragment& mf) const {
+    // We either didn't read a partition yet (evicted after fast-forwarding) or
+    // didn't stop in a clustering region. We don't need to trim range
+    // tombstones in either case.
+    if (!_last_pkey || _next_position_in_partition.region() != partition_region::clustered) {
+        return false;
+    }
+    if (!mf.is_range_tombstone()) {
+        validate_position_in_partition(mf.position());
+        return false;
+    }
+
+    if (_tri_cmp(mf.position(), _next_position_in_partition) >= 0) {
+        validate_position_in_partition(mf.position());
+        return false; // rt in range, no need to trim
+    }
+
+    auto& rt = mf.as_mutable_range_tombstone();
+
+    require(
+            _tri_cmp(_next_position_in_partition, rt.end_position()) <= 0,
+            "{}(): validation failed, expected range tombstone with end pos larger than _next_position_in_partition {}, but got {}",
+            __FUNCTION__,
+            _next_position_in_partition,
+            rt.end_position());
+
+    rt.set_start(*_schema, position_in_partition_view::before_key(_next_position_in_partition));
+
+    return true;
+}
+
 future<> evictable_reader::do_fill_buffer(flat_mutation_reader& reader, db::timeout_clock::time_point timeout) {
    if (!_drop_partition_start && !_drop_static_row) {
-        return reader.fill_buffer(timeout);
+        auto fill_buf_fut = reader.fill_buffer(timeout);
+        if (_validate_partition_key) {
+            fill_buf_fut = fill_buf_fut.then([this, &reader] {
+                maybe_validate_partition_start(reader.buffer());
+            });
+        }
+        return fill_buf_fut;
    }
    return repeat([this, &reader, timeout] {
        return reader.fill_buffer(timeout).then([this, &reader] {
+            maybe_validate_partition_start(reader.buffer());
            while (!reader.is_buffer_empty() && should_drop_fragment(reader.peek_buffer())) {
                reader.pop_mutation_fragment();
            }
@@ -1247,6 +1369,11 @@ future<> evictable_reader::fill_buffer(flat_mutation_reader& reader, db::timeout
        if (reader.is_buffer_empty()) {
            return make_ready_future<>();
        }
+        while (_trim_range_tombstones && !reader.is_buffer_empty()) {
+            auto mf = reader.pop_mutation_fragment();
+            _trim_range_tombstones = maybe_trim_range_tombstone(mf);
+            push_mutation_fragment(std::move(mf));
+        }
        reader.move_buffer_content_to(*this);
        auto stop = [this, &reader] {
            // The only problematic fragment kind is the range tombstone.
@@ -1287,7 +1414,13 @@ future<> evictable_reader::fill_buffer(flat_mutation_reader& reader, db::timeout
            if (reader.is_buffer_empty()) {
                return do_fill_buffer(reader, timeout);
            }
-            push_mutation_fragment(reader.pop_mutation_fragment());
+            if (_trim_range_tombstones) {
+                auto mf = reader.pop_mutation_fragment();
+                _trim_range_tombstones = maybe_trim_range_tombstone(mf);
+                push_mutation_fragment(std::move(mf));
+            } else {
+                push_mutation_fragment(reader.pop_mutation_fragment());
+            }
            return make_ready_future<>();
        });
    }).then([this, &reader] {
@@ -1915,11 +2048,13 @@ public:
        }
    }
    void abort(std::exception_ptr ep) {
-        _end_of_stream = true;
        _ex = std::move(ep);
        if (_full) {
            _full->set_exception(_ex);
            _full.reset();
+        } else if (_not_full) {
+            _not_full->set_exception(_ex);
+            _not_full.reset();
        }
    }
 };
--- a/mutation_reader.hh
+++ b/mutation_reader.hh
@@ -304,6 +304,8 @@ public:
 mutation_source make_empty_mutation_source();
 snapshot_source make_empty_snapshot_source();

+extern const ssize_t new_reader_base_cost;
+
 // Creates a restricted reader whose resource usages will be tracked
 // during it's lifetime. If there are not enough resources (dues to
 // existing readers) to create the new reader, it's construction will
--- a/mutation_writer/feed_writers.hh
+++ b/mutation_writer/feed_writers.hh
@@ -36,8 +36,14 @@ future<> feed_writer(flat_mutation_reader&& rd, Writer&& wr) {
                auto f2 = rd.is_buffer_empty() ? rd.fill_buffer(db::no_timeout) : make_ready_future<>();
                return when_all_succeed(std::move(f1), std::move(f2)).discard_result();
            });
-        }).finally([&wr] {
-            return wr.consume_end_of_stream();
+        }).then_wrapped([&wr] (future<> f) {
+            if (f.failed()) {
+                auto ex = f.get_exception();
+                wr.abort(ex);
+                return make_exception_future<>(ex);
+            } else {
+                return wr.consume_end_of_stream();
+            }
        });
    });
 }
--- a/mutation_writer/shard_based_splitting_writer.cc
+++ b/mutation_writer/shard_based_splitting_writer.cc
@@ -57,6 +57,9 @@ class shard_based_splitting_mutation_writer {
            }
            return std::move(_consume_fut);
        }
+        void abort(std::exception_ptr ep) {
+            _handle.abort(ep);
+        }
    };

 private:
@@ -108,6 +111,13 @@ public:
            return shard->consume_end_of_stream();
        });
    }
+    void abort(std::exception_ptr ep) {
+        for (auto&& shard : _shards) {
+            if (shard) {
+                shard->abort(ep);
+            }
+        }
+    }
 };

 future<> segregate_by_shard(flat_mutation_reader producer, reader_consumer consumer) {
--- a/mutation_writer/timestamp_based_splitting_writer.cc
+++ b/mutation_writer/timestamp_based_splitting_writer.cc
@@ -144,6 +144,9 @@ class timestamp_based_splitting_mutation_writer {
            }
            return std::move(_consume_fut);
        }
+        void abort(std::exception_ptr ep) {
+            _handle.abort(ep);
+        }
    };

 private:
@@ -186,6 +189,11 @@ public:
            return bucket.second.consume_end_of_stream();
        });
    }
+    void abort(std::exception_ptr ep) {
+        for (auto&& b : _buckets) {
+            b.second.abort(ep);
+        }
+    }
 };

 future<> timestamp_based_splitting_mutation_writer::write_to_bucket(bucket_id bucket, mutation_fragment&& mf) {
--- a/partition_version.cc
+++ b/partition_version.cc
@@ -640,12 +640,12 @@ partition_snapshot_ptr partition_entry::read(logalloc::region& r,
    return partition_snapshot_ptr(std::move(snp));
 }

-std::vector<range_tombstone>
+partition_snapshot::range_tombstone_result
 partition_snapshot::range_tombstones(position_in_partition_view start, position_in_partition_view end)
 {
    partition_version* v = &*version();
    if (!v->next()) {
-        return boost::copy_range<std::vector<range_tombstone>>(
+        return boost::copy_range<range_tombstone_result>(
            v->partition().row_tombstones().slice(*_schema, start, end));
    }
    range_tombstone_list list(*_schema);
@@ -655,10 +655,10 @@ partition_snapshot::range_tombstones(position_in_partition_view start, position_
        }
        v = v->next();
    }
-    return boost::copy_range<std::vector<range_tombstone>>(list.slice(*_schema, start, end));
+    return boost::copy_range<range_tombstone_result>(list.slice(*_schema, start, end));
 }

-std::vector<range_tombstone>
+partition_snapshot::range_tombstone_result
 partition_snapshot::range_tombstones()
 {
    return range_tombstones(
--- a/partition_version.hh
+++ b/partition_version.hh
@@ -26,6 +26,7 @@
 #include "utils/anchorless_list.hh"
 #include "utils/logalloc.hh"
 #include "utils/coroutine.hh"
+#include "utils/chunked_vector.hh"

 #include <boost/intrusive/parent_from_member.hpp>
 #include <boost/intrusive/slist.hpp>
@@ -400,10 +401,13 @@ public:
    ::static_row static_row(bool digest_requested) const;
    bool static_row_continuous() const;
    mutation_partition squashed() const;
+
+    using range_tombstone_result = utils::chunked_vector<range_tombstone>;
+
    // Returns range tombstones overlapping with [start, end)
-    std::vector<range_tombstone> range_tombstones(position_in_partition_view start, position_in_partition_view end);
+    range_tombstone_result range_tombstones(position_in_partition_view start, position_in_partition_view end);
    // Returns all range tombstones
-    std::vector<range_tombstone> range_tombstones();
+    range_tombstone_result range_tombstones();
 };

 class partition_snapshot_ptr {
--- a/position_in_partition.hh
+++ b/position_in_partition.hh
@@ -163,6 +163,11 @@ public:
        return {partition_region::clustered, bound_weight::before_all_prefixed, &ck};
    }

+    // Returns a view to before_key(pos._ck) if pos.is_clustering_row() else returns pos as-is.
+    static position_in_partition_view before_key(position_in_partition_view pos) {
+        return {partition_region::clustered, pos._bound_weight == bound_weight::equal ? bound_weight::before_all_prefixed : pos._bound_weight, pos._ck};
+    }
+
    partition_region region() const { return _type; }
    bound_weight get_bound_weight() const { return _bound_weight; }
    bool is_partition_start() const { return _type == partition_region::partition_start; }
--- a/reader_concurrency_semaphore.cc
+++ b/reader_concurrency_semaphore.cc
@@ -28,6 +28,7 @@

 reader_permit::resource_units::resource_units(reader_concurrency_semaphore& semaphore, reader_resources res) noexcept
        : _semaphore(&semaphore), _resources(res) {
+    _semaphore->consume(res);
 }

 reader_permit::resource_units::resource_units(resource_units&& o) noexcept
@@ -75,7 +76,6 @@ reader_permit::resource_units reader_permit::consume_memory(size_t memory) {
 }

 reader_permit::resource_units reader_permit::consume_resources(reader_resources res) {
-    _semaphore->consume(res);
    return resource_units(*_semaphore, res);
 }

@@ -83,7 +83,6 @@ void reader_concurrency_semaphore::signal(const resources& r) noexcept {
    _resources += r;
    while (!_wait_list.empty() && has_available_units(_wait_list.front().res)) {
        auto& x = _wait_list.front();
-        _resources -= x.res;
        try {
            x.pr.set_value(reader_permit::resource_units(*this, x.res));
        } catch (...) {
@@ -104,7 +103,7 @@ reader_concurrency_semaphore::inactive_read_handle reader_concurrency_semaphore:
        const auto [it, _] = _inactive_reads.emplace(_next_id++, std::move(ir));
        (void)_;
        ++_inactive_read_stats.population;
-        return inactive_read_handle(it->first);
+        return inactive_read_handle(*this, it->first);
    }

    // The evicted reader will release its permit, hopefully allowing us to
@@ -115,6 +114,17 @@ reader_concurrency_semaphore::inactive_read_handle reader_concurrency_semaphore:
 }

 std::unique_ptr<reader_concurrency_semaphore::inactive_read> reader_concurrency_semaphore::unregister_inactive_read(inactive_read_handle irh) {
+    if (irh && irh._sem != this) {
+        throw std::runtime_error(fmt::format(
+                    "reader_concurrency_semaphore::unregister_inactive_read(): "
+                    "attempted to unregister an inactive read with a handle belonging to another semaphore: "
+                    "this is {} (0x{:x}) but the handle belongs to {} (0x{:x})",
+                    name(),
+                    reinterpret_cast<uintptr_t>(this),
+                    irh._sem->name(),
+                    reinterpret_cast<uintptr_t>(irh._sem)));
+    }
+
    if (auto it = _inactive_reads.find(irh._id); it != _inactive_reads.end()) {
        auto ir = std::move(it->second);
        _inactive_reads.erase(it);
@@ -158,7 +168,6 @@ future<reader_permit::resource_units> reader_concurrency_semaphore::do_wait_admi
        --_inactive_read_stats.population;
    }
    if (may_proceed(r)) {
-        _resources -= r;
        return make_ready_future<reader_permit::resource_units>(reader_permit::resource_units(*this, r));
    }
    promise<reader_permit::resource_units> pr;
--- a/reader_concurrency_semaphore.hh
+++ b/reader_concurrency_semaphore.hh
@@ -60,18 +60,20 @@ public:
    };

    class inactive_read_handle {
+        reader_concurrency_semaphore* _sem = nullptr;
        uint64_t _id = 0;

        friend class reader_concurrency_semaphore;

-        explicit inactive_read_handle(uint64_t id)
-            : _id(id) {
+        explicit inactive_read_handle(reader_concurrency_semaphore& sem, uint64_t id)
+            : _sem(&sem), _id(id) {
        }
    public:
        inactive_read_handle() = default;
-        inactive_read_handle(inactive_read_handle&& o) : _id(std::exchange(o._id, 0)) {
+        inactive_read_handle(inactive_read_handle&& o) : _sem(std::exchange(o._sem, nullptr)), _id(std::exchange(o._id, 0)) {
        }
        inactive_read_handle& operator=(inactive_read_handle&& o) {
+            _sem = std::exchange(o._sem, nullptr);
            _id = std::exchange(o._id, 0);
            return *this;
        }
@@ -105,6 +107,7 @@ private:
    };

 private:
+    const resources _initial_resources;
    resources _resources;

    expiring_fifo<entry, expiry_handler, db::timeout_clock> _wait_list;
@@ -135,7 +138,8 @@ public:
            sstring name,
            size_t max_queue_length = std::numeric_limits<size_t>::max(),
            std::function<void()> prethrow_action = nullptr)
-        : _resources(count, memory)
+        : _initial_resources(count, memory)
+        , _resources(count, memory)
        , _wait_list(expiry_handler(name))
        , _name(std::move(name))
        , _max_queue_length(max_queue_length)
@@ -144,11 +148,11 @@ public:
    /// Create a semaphore with practically unlimited count and memory.
    ///
    /// And conversely, no queue limit either.
-    explicit reader_concurrency_semaphore(no_limits)
+    explicit reader_concurrency_semaphore(no_limits, sstring name = "unlimited reader_concurrency_semaphore")
        : reader_concurrency_semaphore(
                std::numeric_limits<int>::max(),
                std::numeric_limits<ssize_t>::max(),
-                "unlimited reader_concurrency_semaphore") {}
+                std::move(name)) {}

    ~reader_concurrency_semaphore();

@@ -158,6 +162,13 @@ public:
    reader_concurrency_semaphore(reader_concurrency_semaphore&&) = delete;
    reader_concurrency_semaphore& operator=(reader_concurrency_semaphore&&) = delete;

+    /// Returns the name of the semaphore
+    ///
+    /// If the semaphore has no name, "unnamed reader concurrency semaphore" is returned.
+    std::string_view name() const {
+        return _name.empty() ? "unnamed reader concurrency semaphore" : std::string_view(_name);
+    }
+
    /// Register an inactive read.
    ///
    /// The semaphore will evict this read when there is a shortage of
@@ -193,6 +204,10 @@ public:

    reader_permit make_permit();

+    const resources initial_resources() const {
+        return _initial_resources;
+    }
+
    const resources available_resources() const {
        return _resources;
    }
--- a/reader_permit.hh
+++ b/reader_permit.hh
@@ -42,12 +42,20 @@ struct reader_resources {
        return count >= other.count && memory >= other.memory;
    }

+    reader_resources operator-(const reader_resources& other) const {
+        return reader_resources{count - other.count, memory - other.memory};
+    }
+
    reader_resources& operator-=(const reader_resources& other) {
        count -= other.count;
        memory -= other.memory;
        return *this;
    }

+    reader_resources operator+(const reader_resources& other) const {
+        return reader_resources{count + other.count, memory + other.memory};
+    }
+
    reader_resources& operator+=(const reader_resources& other) {
        count += other.count;
        memory += other.memory;
--- a/redis/commands.cc
+++ b/redis/commands.cc
@@ -62,7 +62,7 @@ shared_ptr<abstract_command> exists::prepare(service::storage_proxy& proxy, requ
 }

 future<redis_message> exists::execute(service::storage_proxy& proxy, redis::redis_options& options, service_permit permit) {
-    return seastar::do_for_each(_keys, [&proxy, &options, &permit, this] (bytes key) {
+    return seastar::do_for_each(_keys, [&proxy, &options, permit, this] (bytes& key) {
        return redis::read_strings(proxy, options, key, permit).then([this] (lw_shared_ptr<strings_result> result) {
            if (result->has_result()) {
                _count++;
--- a/reloc/build_deb.sh
+++ b/reloc/build_deb.sh
@@ -44,15 +44,15 @@ mkdir -p $BUILDDIR/scylla-package
 tar -C $BUILDDIR/scylla-package -xpf $RELOC_PKG
 cd $BUILDDIR/scylla-package

-PRODUCT=$(cat scylla/SCYLLA-PRODUCT-FILE)
-SCYLLA_VERSION=$(cat scylla/SCYLLA-VERSION-FILE)
-SCYLLA_RELEASE=$(cat scylla/SCYLLA-RELEASE-FILE)
-
-ln -fv $RELOC_PKG ../$PRODUCT-server_$SCYLLA_VERSION-$SCYLLA_RELEASE.orig.tar.gz
-
 if $DIST; then
    export DEB_BUILD_OPTIONS="housekeeping"
 fi

 mv scylla/debian debian
+
+PKG_NAME=$(dpkg-parsechangelog --show-field Source)
+# XXX: Drop revision number from version string.
+#      Since it always '1', this should be okay for now.
+PKG_VERSION=$(dpkg-parsechangelog --show-field Version |sed -e 's/-1$//')
+ln -fv $RELOC_PKG ../"$PKG_NAME"_"$PKG_VERSION".orig.tar.gz
 debuild -rfakeroot -us -uc
--- a/repair/repair.cc
+++ b/repair/repair.cc
@@ -1633,6 +1633,7 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, locator::token_me
            auto& ks = db.local().find_keyspace(keyspace_name);
            auto& strat = ks.get_replication_strategy();
            dht::token_range_vector desired_ranges = strat.get_pending_address_ranges(tm, tokens, myip);
+            bool find_node_in_local_dc_only = strat.get_type() == locator::replication_strategy_type::network_topology;

            //Active ranges
            auto metadata_clone = tm.clone_only_token_map();
@@ -1719,6 +1720,9 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, locator::token_me
                            mandatory_neighbors = get_node_losing_the_ranges(old_endpoints, new_endpoints);
                            neighbors = mandatory_neighbors;
                        } else if (old_endpoints.size() < strat.get_replication_factor()) {
+                          if (!find_node_in_local_dc_only) {
+                            neighbors = old_endpoints;
+                          } else {
                            if (old_endpoints_in_local_dc.size() == rf_in_local_dc) {
                                // Local DC has enough replica nodes.
                                mandatory_neighbors = get_node_losing_the_ranges(old_endpoints_in_local_dc, new_endpoints);
@@ -1746,6 +1750,7 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, locator::token_me
                                throw std::runtime_error(format("bootstrap_with_repair: keyspace={}, range={}, wrong number of old_endpoints_in_local_dc={}, rf_in_local_dc={}",
                                        keyspace_name, desired_range, old_endpoints_in_local_dc.size(), rf_in_local_dc));
                            }
+                          }
                        } else {
                            throw std::runtime_error(format("bootstrap_with_repair: keyspace={}, range={}, wrong number of old_endpoints={}, rf={}",
                                        keyspace_name, desired_range, old_endpoints, strat.get_replication_factor()));
--- a/repair/repair.hh
+++ b/repair/repair.hh
@@ -23,6 +23,7 @@

 #include <unordered_map>
 #include <exception>
+#include <absl/container/btree_set.h>

 #include <seastar/core/sstring.hh>
 #include <seastar/core/sharded.hh>
@@ -339,6 +340,8 @@ public:
    }
 };

+using repair_hash_set = absl::btree_set<repair_hash>;
+
 enum class repair_row_level_start_status: uint8_t {
    ok,
    no_such_column_family,
--- a/repair/row_level.cc
+++ b/repair/row_level.cc
@@ -47,6 +47,7 @@
 #include "gms/gossiper.hh"
 #include "repair/row_level.hh"
 #include "mutation_source_metadata.hh"
+#include "utils/stall_free.hh"

 extern logging::logger rlogger;

@@ -459,7 +460,7 @@ public:
    }
 };

-class repair_writer {
+class repair_writer : public enable_lw_shared_from_this<repair_writer> {
    schema_ptr _schema;
    uint64_t _estimated_partitions;
    size_t _nr_peer_nodes;
@@ -516,6 +517,7 @@ public:
        auto [queue_reader, queue_handle] = make_queue_reader(_schema);
        _mq[node_idx] = std::move(queue_handle);
        table& t = db.local().find_column_family(_schema->id());
+        auto writer = shared_from_this();
        _writer_done[node_idx] = mutation_writer::distribute_reader_and_consume_on_shards(_schema, std::move(queue_reader),
                [&db, reason = this->_reason, estimated_partitions = this->_estimated_partitions] (flat_mutation_reader reader) {
            auto& t = db.local().find_column_family(reader.schema());
@@ -529,7 +531,7 @@ public:
                    sstables::shared_sstable sst = use_view_update_path ? t->make_streaming_staging_sstable() : t->make_streaming_sstable_for_write();
                    schema_ptr s = reader.schema();
                    auto& pc = service::get_local_streaming_priority();
-                    return sst->write_components(std::move(reader), std::max(1ul, adjusted_estimated_partitions), s,
+                    return sst->write_components(std::move(reader), adjusted_estimated_partitions, s,
                                                 t->get_sstables_manager().configure_writer(),
                                                 encoding_stats{}, pc).then([sst] {
                        return sst->open_data();
@@ -545,13 +547,13 @@ public:
                return consumer(std::move(reader));
            });
        },
-        t.stream_in_progress()).then([this, node_idx] (uint64_t partitions) {
+        t.stream_in_progress()).then([node_idx, writer] (uint64_t partitions) {
            rlogger.debug("repair_writer: keyspace={}, table={}, managed to write partitions={} to sstable",
-                _schema->ks_name(), _schema->cf_name(), partitions);
-        }).handle_exception([this, node_idx] (std::exception_ptr ep) {
+                writer->_schema->ks_name(), writer->_schema->cf_name(), partitions);
+        }).handle_exception([node_idx, writer] (std::exception_ptr ep) {
            rlogger.warn("repair_writer: keyspace={}, table={}, multishard_writer failed: {}",
-                    _schema->ks_name(), _schema->cf_name(), ep);
-            _mq[node_idx]->abort(ep);
+                    writer->_schema->ks_name(), writer->_schema->cf_name(), ep);
+            writer->_mq[node_idx]->abort(ep);
            return make_exception_future<>(std::move(ep));
        });
    }
@@ -654,7 +656,7 @@ private:
    size_t _nr_peer_nodes= 1;
    repair_stats _stats;
    repair_reader _repair_reader;
-    repair_writer _repair_writer;
+    lw_shared_ptr<repair_writer> _repair_writer;
    // Contains rows read from disk
    std::list<repair_row> _row_buf;
    // Contains rows we are working on to sync between peers
@@ -666,7 +668,7 @@ private:
    // Tracks current sync boundary
    std::optional<repair_sync_boundary> _current_sync_boundary;
    // Contains the hashes of rows in the _working_row_buffor for all peer nodes
-    std::vector<std::unordered_set<repair_hash>> _peer_row_hash_sets;
+    std::vector<repair_hash_set> _peer_row_hash_sets;
    // Gate used to make sure pending operation of meta data is done
    seastar::gate _gate;
    sink_source_for_get_full_row_hashes _sink_source_for_get_full_row_hashes;
@@ -735,7 +737,7 @@ public:
                    _seed,
                    repair_reader::is_local_reader(_repair_master || _same_sharding_config)
              )
-            , _repair_writer(_schema, _estimated_partitions, _nr_peer_nodes, _reason)
+            , _repair_writer(make_lw_shared<repair_writer>(_schema, _estimated_partitions, _nr_peer_nodes, _reason))
            , _sink_source_for_get_full_row_hashes(_repair_meta_id, _nr_peer_nodes,
                    [] (uint32_t repair_meta_id, netw::messaging_service::msg_addr addr) {
                        return netw::get_local_messaging_service().make_sink_and_source_for_repair_get_full_row_hashes_with_rpc_stream(repair_meta_id, addr);
@@ -754,11 +756,12 @@ public:
 public:
    future<> stop() {
        auto gate_future = _gate.close();
-        auto writer_future = _repair_writer.wait_for_writer_done();
        auto f1 = _sink_source_for_get_full_row_hashes.close();
        auto f2 = _sink_source_for_get_row_diff.close();
        auto f3 = _sink_source_for_put_row_diff.close();
-        return when_all_succeed(std::move(gate_future), std::move(writer_future), std::move(f1), std::move(f2), std::move(f3)).discard_result();
+        return when_all_succeed(std::move(gate_future), std::move(f1), std::move(f2), std::move(f3)).discard_result().finally([this] {
+            return _repair_writer->wait_for_writer_done();
+        });
    }

    static std::unordered_map<node_repair_meta_id, lw_shared_ptr<repair_meta>>& repair_meta_map() {
@@ -886,9 +889,9 @@ public:
    }

    // Must run inside a seastar thread
-    static std::unordered_set<repair_hash>
-    get_set_diff(const std::unordered_set<repair_hash>& x, const std::unordered_set<repair_hash>& y) {
-        std::unordered_set<repair_hash> set_diff;
+    static repair_hash_set
+    get_set_diff(const repair_hash_set& x, const repair_hash_set& y) {
+        repair_hash_set set_diff;
        // Note std::set_difference needs x and y are sorted.
        std::copy_if(x.begin(), x.end(), std::inserter(set_diff, set_diff.end()),
                [&y] (auto& item) { thread::maybe_yield(); return y.find(item) == y.end(); });
@@ -906,14 +909,14 @@ public:

    }

-    std::unordered_set<repair_hash>& peer_row_hash_sets(unsigned node_idx) {
+    repair_hash_set& peer_row_hash_sets(unsigned node_idx) {
        return _peer_row_hash_sets[node_idx];
    }

    // Get a list of row hashes in _working_row_buf
-    future<std::unordered_set<repair_hash>>
+    future<repair_hash_set>
    working_row_hashes() {
-        return do_with(std::unordered_set<repair_hash>(), [this] (std::unordered_set<repair_hash>& hashes) {
+        return do_with(repair_hash_set(), [this] (repair_hash_set& hashes) {
            return do_for_each(_working_row_buf, [&hashes] (repair_row& r) {
                hashes.emplace(r.hash());
            }).then([&hashes] {
@@ -1090,24 +1093,32 @@ private:
        });
    }

+    future<> clear_row_buf() {
+        return utils::clear_gently(_row_buf);
+    }
+
+    future<> clear_working_row_buf() {
+        return utils::clear_gently(_working_row_buf).then([this] {
+            _working_row_buf_combined_hash.clear();
+        });
+    }
+
    // Read rows from disk until _max_row_buf_size of rows are filled into _row_buf.
    // Calculate the combined checksum of the rows
    // Calculate the total size of the rows in _row_buf
    future<get_sync_boundary_response>
    get_sync_boundary(std::optional<repair_sync_boundary> skipped_sync_boundary) {
+        auto f = make_ready_future<>();
        if (skipped_sync_boundary) {
            _current_sync_boundary = skipped_sync_boundary;
-            _row_buf.clear();
-            _working_row_buf.clear();
-            _working_row_buf_combined_hash.clear();
-        } else {
-            _working_row_buf.clear();
-            _working_row_buf_combined_hash.clear();
+            f = clear_row_buf();
        }
        // Here is the place we update _last_sync_boundary
        rlogger.trace("SET _last_sync_boundary from {} to {}", _last_sync_boundary, _current_sync_boundary);
        _last_sync_boundary = _current_sync_boundary;
-        return row_buf_size().then([this, sb = std::move(skipped_sync_boundary)] (size_t cur_size) {
+      return f.then([this, sb = std::move(skipped_sync_boundary)] () mutable {
+       return clear_working_row_buf().then([this, sb = sb] () mutable {
+        return row_buf_size().then([this, sb = std::move(sb)] (size_t cur_size) {
            return read_rows_from_disk(cur_size).then([this, sb = std::move(sb)] (std::list<repair_row> new_rows, size_t new_rows_size) mutable {
                size_t new_rows_nr = new_rows.size();
                _row_buf.splice(_row_buf.end(), new_rows);
@@ -1124,6 +1135,8 @@ private:
                });
            });
        });
+       });
+      });
    }

    future<> move_row_buf_to_working_row_buf() {
@@ -1199,9 +1212,9 @@ private:
    }

    future<std::list<repair_row>>
-    copy_rows_from_working_row_buf_within_set_diff(std::unordered_set<repair_hash> set_diff) {
+    copy_rows_from_working_row_buf_within_set_diff(repair_hash_set set_diff) {
        return do_with(std::list<repair_row>(), std::move(set_diff),
-                [this] (std::list<repair_row>& rows, std::unordered_set<repair_hash>& set_diff) {
+                [this] (std::list<repair_row>& rows, repair_hash_set& set_diff) {
            return do_for_each(_working_row_buf, [this, &set_diff, &rows] (const repair_row& r) {
                if (set_diff.count(r.hash()) > 0) {
                    rows.push_back(r);
@@ -1216,7 +1229,7 @@ private:
    // Give a set of row hashes, return the corresponding rows
    // If needs_all_rows is set, return all the rows in _working_row_buf, ignore the set_diff
    future<std::list<repair_row>>
-    get_row_diff(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows = needs_all_rows_t::no) {
+    get_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows = needs_all_rows_t::no) {
        if (needs_all_rows) {
            if (!_repair_master || _nr_peer_nodes == 1) {
                return make_ready_future<std::list<repair_row>>(std::move(_working_row_buf));
@@ -1227,19 +1240,28 @@ private:
        }
    }

-    future<> do_apply_rows(std::list<repair_row>& row_diff, unsigned node_idx, update_working_row_buf update_buf) {
-        return with_semaphore(_repair_writer.sem(), 1, [this, node_idx, update_buf, &row_diff] {
-            _repair_writer.create_writer(_db, node_idx);
-            return do_for_each(row_diff, [this, node_idx, update_buf] (repair_row& r) {
-                if (update_buf) {
-                    _working_row_buf_combined_hash.add(r.hash());
-                }
-                // The repair_row here is supposed to have
-                // mutation_fragment attached because we have stored it in
-                // to_repair_rows_list above where the repair_row is created.
-                mutation_fragment mf = std::move(r.get_mutation_fragment());
-                auto dk_with_hash = r.get_dk_with_hash();
-                return _repair_writer.do_write(node_idx, std::move(dk_with_hash), std::move(mf));
+    future<> do_apply_rows(std::list<repair_row>&& row_diff, unsigned node_idx, update_working_row_buf update_buf) {
+        return do_with(std::move(row_diff), [this, node_idx, update_buf] (std::list<repair_row>& row_diff) {
+            return with_semaphore(_repair_writer->sem(), 1, [this, node_idx, update_buf, &row_diff] {
+                _repair_writer->create_writer(_db, node_idx);
+                return repeat([this, node_idx, update_buf, &row_diff] () mutable {
+                    if (row_diff.empty()) {
+                        return make_ready_future<stop_iteration>(stop_iteration::yes);
+                    }
+                    repair_row& r = row_diff.front();
+                    if (update_buf) {
+                        _working_row_buf_combined_hash.add(r.hash());
+                    }
+                    // The repair_row here is supposed to have
+                    // mutation_fragment attached because we have stored it in
+                    // to_repair_rows_list above where the repair_row is created.
+                    mutation_fragment mf = std::move(r.get_mutation_fragment());
+                    auto dk_with_hash = r.get_dk_with_hash();
+                    return _repair_writer->do_write(node_idx, std::move(dk_with_hash), std::move(mf)).then([&row_diff] {
+                        row_diff.pop_front();
+                        return make_ready_future<stop_iteration>(stop_iteration::no);
+                    });
+                });
            });
        });
    }
@@ -1257,19 +1279,17 @@ private:
        stats().rx_row_nr += row_diff.size();
        stats().rx_row_nr_peer[from] += row_diff.size();
        if (update_buf) {
-            std::list<repair_row> tmp;
-            tmp.swap(_working_row_buf);
            // Both row_diff and _working_row_buf and are ordered, merging
            // two sored list to make sure the combination of row_diff
            // and _working_row_buf are ordered.
-            std::merge(tmp.begin(), tmp.end(), row_diff.begin(), row_diff.end(), std::back_inserter(_working_row_buf),
-                [this] (const repair_row& x, const repair_row& y) { thread::maybe_yield(); return _cmp(x.boundary(), y.boundary()) < 0; });
+            utils::merge_to_gently(_working_row_buf, row_diff,
+                 [this] (const repair_row& x, const repair_row& y) { return _cmp(x.boundary(), y.boundary()) < 0; });
        }
        if (update_hash_set) {
-            _peer_row_hash_sets[node_idx] = boost::copy_range<std::unordered_set<repair_hash>>(row_diff |
+            _peer_row_hash_sets[node_idx] = boost::copy_range<repair_hash_set>(row_diff |
                    boost::adaptors::transformed([] (repair_row& r) { thread::maybe_yield(); return r.hash(); }));
        }
-        do_apply_rows(row_diff, node_idx, update_buf).get();
+        do_apply_rows(std::move(row_diff), node_idx, update_buf).get();
    }

    future<>
@@ -1277,11 +1297,9 @@ private:
        if (rows.empty()) {
            return make_ready_future<>();
        }
-        return to_repair_rows_list(rows).then([this] (std::list<repair_row> row_diff) {
-            return do_with(std::move(row_diff), [this] (std::list<repair_row>& row_diff) {
-                unsigned node_idx = 0;
-                return do_apply_rows(row_diff, node_idx, update_working_row_buf::no);
-            });
+        return to_repair_rows_list(std::move(rows)).then([this] (std::list<repair_row> row_diff) {
+            unsigned node_idx = 0;
+            return do_apply_rows(std::move(row_diff), node_idx, update_working_row_buf::no);
        });
    }

@@ -1360,13 +1378,13 @@ private:
 public:
    // RPC API
    // Return the hashes of the rows in _working_row_buf
-    future<std::unordered_set<repair_hash>>
+    future<repair_hash_set>
    get_full_row_hashes(gms::inet_address remote_node) {
        if (remote_node == _myip) {
            return get_full_row_hashes_handler();
        }
        return netw::get_local_messaging_service().send_repair_get_full_row_hashes(msg_addr(remote_node),
-                _repair_meta_id).then([this, remote_node] (std::unordered_set<repair_hash> hashes) {
+                _repair_meta_id).then([this, remote_node] (repair_hash_set hashes) {
            rlogger.debug("Got full hashes from peer={}, nr_hashes={}", remote_node, hashes.size());
            _metrics.rx_hashes_nr += hashes.size();
            stats().rx_hashes_nr += hashes.size();
@@ -1377,7 +1395,7 @@ public:

 private:
    future<> get_full_row_hashes_source_op(
-            lw_shared_ptr<std::unordered_set<repair_hash>> current_hashes,
+            lw_shared_ptr<repair_hash_set> current_hashes,
            gms::inet_address remote_node,
            unsigned node_idx,
            rpc::source<repair_hash_with_cmd>& source) {
@@ -1415,12 +1433,12 @@ private:
    }

 public:
-    future<std::unordered_set<repair_hash>>
+    future<repair_hash_set>
    get_full_row_hashes_with_rpc_stream(gms::inet_address remote_node, unsigned node_idx) {
        if (remote_node == _myip) {
            return get_full_row_hashes_handler();
        }
-        auto current_hashes = make_lw_shared<std::unordered_set<repair_hash>>();
+        auto current_hashes = make_lw_shared<repair_hash_set>();
        return _sink_source_for_get_full_row_hashes.get_sink_source(remote_node, node_idx).then(
                [this, current_hashes, remote_node, node_idx]
                (rpc::sink<repair_stream_cmd>& sink, rpc::source<repair_hash_with_cmd>& source) mutable {
@@ -1435,7 +1453,7 @@ public:
    }

    // RPC handler
-    future<std::unordered_set<repair_hash>>
+    future<repair_hash_set>
    get_full_row_hashes_handler() {
        return with_gate(_gate, [this] {
            return working_row_hashes();
@@ -1585,7 +1603,7 @@ public:
    // RPC API
    // Return rows in the _working_row_buf with hash within the given sef_diff
    // Must run inside a seastar thread
-    void get_row_diff(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node, unsigned node_idx) {
+    void get_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node, unsigned node_idx) {
        if (needs_all_rows || !set_diff.empty()) {
            if (remote_node == _myip) {
                return;
@@ -1654,11 +1672,11 @@ private:
    }

    future<> get_row_diff_sink_op(
-            std::unordered_set<repair_hash> set_diff,
+            repair_hash_set set_diff,
            needs_all_rows_t needs_all_rows,
            rpc::sink<repair_hash_with_cmd>& sink,
            gms::inet_address remote_node) {
-        return do_with(std::move(set_diff), [needs_all_rows, remote_node, &sink] (std::unordered_set<repair_hash>& set_diff) mutable {
+        return do_with(std::move(set_diff), [needs_all_rows, remote_node, &sink] (repair_hash_set& set_diff) mutable {
            if (inject_rpc_stream_error) {
                return make_exception_future<>(std::runtime_error("get_row_diff: Inject sender error in sink loop"));
            }
@@ -1685,7 +1703,7 @@ private:
 public:
    // Must run inside a seastar thread
    void get_row_diff_with_rpc_stream(
-            std::unordered_set<repair_hash> set_diff,
+            repair_hash_set set_diff,
            needs_all_rows_t needs_all_rows,
            update_peer_row_hash_sets update_hash_set,
            gms::inet_address remote_node,
@@ -1711,7 +1729,7 @@ public:
    }

    // RPC handler
-    future<repair_rows_on_wire> get_row_diff_handler(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows) {
+    future<repair_rows_on_wire> get_row_diff_handler(repair_hash_set set_diff, needs_all_rows_t needs_all_rows) {
        return with_gate(_gate, [this, set_diff = std::move(set_diff), needs_all_rows] () mutable {
            return get_row_diff(std::move(set_diff), needs_all_rows).then([this] (std::list<repair_row> row_diff) {
                return to_repair_rows_on_wire(std::move(row_diff));
@@ -1721,15 +1739,16 @@ public:

    // RPC API
    // Send rows in the _working_row_buf with hash within the given sef_diff
-    future<> put_row_diff(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node) {
+    future<> put_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node) {
        if (!set_diff.empty()) {
            if (remote_node == _myip) {
                return make_ready_future<>();
            }
-            auto sz = set_diff.size();
+            size_t sz = set_diff.size();
            return get_row_diff(std::move(set_diff), needs_all_rows).then([this, remote_node, sz] (std::list<repair_row> row_diff) {
                if (row_diff.size() != sz) {
-                    throw std::runtime_error("row_diff.size() != set_diff.size()");
+                    rlogger.warn("Hash conflict detected, keyspace={}, table={}, range={}, row_diff.size={}, set_diff.size={}. It is recommended to compact the table and rerun repair for the range.",
+                            _schema->ks_name(), _schema->cf_name(), _range, row_diff.size(), sz);
                }
                return do_with(std::move(row_diff), [this, remote_node] (std::list<repair_row>& row_diff) {
                    return get_repair_rows_size(row_diff).then([this, remote_node, &row_diff] (size_t row_bytes) mutable {
@@ -1796,17 +1815,18 @@ private:

 public:
    future<> put_row_diff_with_rpc_stream(
-            std::unordered_set<repair_hash> set_diff,
+            repair_hash_set set_diff,
            needs_all_rows_t needs_all_rows,
            gms::inet_address remote_node, unsigned node_idx) {
        if (!set_diff.empty()) {
            if (remote_node == _myip) {
                return make_ready_future<>();
            }
-            auto sz = set_diff.size();
+            size_t sz = set_diff.size();
            return get_row_diff(std::move(set_diff), needs_all_rows).then([this, remote_node, node_idx, sz] (std::list<repair_row> row_diff) {
                if (row_diff.size() != sz) {
-                    throw std::runtime_error("row_diff.size() != set_diff.size()");
+                    rlogger.warn("Hash conflict detected, keyspace={}, table={}, range={}, row_diff.size={}, set_diff.size={}. It is recommended to compact the table and rerun repair for the range.",
+                            _schema->ks_name(), _schema->cf_name(), _range, row_diff.size(), sz);
                }
                return do_with(std::move(row_diff), [this, remote_node, node_idx] (std::list<repair_row>& row_diff) {
                    return get_repair_rows_size(row_diff).then([this, remote_node, node_idx, &row_diff] (size_t row_bytes) mutable {
@@ -1845,7 +1865,7 @@ static future<stop_iteration> repair_get_row_diff_with_rpc_stream_process_op(
        rpc::sink<repair_row_on_wire_with_cmd> sink,
        rpc::source<repair_hash_with_cmd> source,
        bool &error,
-        std::unordered_set<repair_hash>& current_set_diff,
+        repair_hash_set& current_set_diff,
        std::optional<std::tuple<repair_hash_with_cmd>> hash_cmd_opt) {
    repair_hash_with_cmd hash_cmd = std::get<0>(hash_cmd_opt.value());
    rlogger.trace("Got repair_hash_with_cmd from peer={}, hash={}, cmd={}", from, hash_cmd.hash, int(hash_cmd.cmd));
@@ -1858,7 +1878,7 @@ static future<stop_iteration> repair_get_row_diff_with_rpc_stream_process_op(
        }
        bool needs_all_rows = hash_cmd.cmd == repair_stream_cmd::needs_all_rows;
        _metrics.rx_hashes_nr += current_set_diff.size();
-        auto fp = make_foreign(std::make_unique<std::unordered_set<repair_hash>>(std::move(current_set_diff)));
+        auto fp = make_foreign(std::make_unique<repair_hash_set>(std::move(current_set_diff)));
        return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id, needs_all_rows, fp = std::move(fp)] {
            auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
            if (fp.get_owner_shard() == this_shard_id()) {
@@ -1936,12 +1956,12 @@ static future<stop_iteration> repair_get_full_row_hashes_with_rpc_stream_process
    if (status == repair_stream_cmd::get_full_row_hashes) {
        return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id] {
            auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
-            return rm->get_full_row_hashes_handler().then([] (std::unordered_set<repair_hash> hashes) {
+            return rm->get_full_row_hashes_handler().then([] (repair_hash_set hashes) {
                _metrics.tx_hashes_nr += hashes.size();
                return hashes;
            });
-        }).then([sink] (std::unordered_set<repair_hash> hashes) mutable {
-            return do_with(std::move(hashes), [sink] (std::unordered_set<repair_hash>& hashes) mutable {
+        }).then([sink] (repair_hash_set hashes) mutable {
+            return do_with(std::move(hashes), [sink] (repair_hash_set& hashes) mutable {
                return do_for_each(hashes, [sink] (const repair_hash& hash) mutable {
                    return sink(repair_hash_with_cmd{repair_stream_cmd::hash_data, hash});
                }).then([sink] () mutable {
@@ -1964,7 +1984,7 @@ static future<> repair_get_row_diff_with_rpc_stream_handler(
        uint32_t repair_meta_id,
        rpc::sink<repair_row_on_wire_with_cmd> sink,
        rpc::source<repair_hash_with_cmd> source) {
-    return do_with(false, std::unordered_set<repair_hash>(), [from, src_cpu_id, repair_meta_id, sink, source] (bool& error, std::unordered_set<repair_hash>& current_set_diff) mutable {
+    return do_with(false, repair_hash_set(), [from, src_cpu_id, repair_meta_id, sink, source] (bool& error, repair_hash_set& current_set_diff) mutable {
        return repeat([from, src_cpu_id, repair_meta_id, sink, source, &error, &current_set_diff] () mutable {
            return source().then([from, src_cpu_id, repair_meta_id, sink, source, &error, &current_set_diff] (std::optional<std::tuple<repair_hash_with_cmd>> hash_cmd_opt) mutable {
                if (hash_cmd_opt) {
@@ -2107,7 +2127,7 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
            auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
            return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id] {
                auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
-                return rm->get_full_row_hashes_handler().then([] (std::unordered_set<repair_hash> hashes) {
+                return rm->get_full_row_hashes_handler().then([] (repair_hash_set hashes) {
                    _metrics.tx_hashes_nr += hashes.size();
                    return hashes;
                });
@@ -2135,11 +2155,11 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
            });
        });
        ms.register_repair_get_row_diff([] (const rpc::client_info& cinfo, uint32_t repair_meta_id,
-                std::unordered_set<repair_hash> set_diff, bool needs_all_rows) {
+                repair_hash_set set_diff, bool needs_all_rows) {
            auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
            auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
            _metrics.rx_hashes_nr += set_diff.size();
-            auto fp = make_foreign(std::make_unique<std::unordered_set<repair_hash>>(std::move(set_diff)));
+            auto fp = make_foreign(std::make_unique<repair_hash_set>(std::move(set_diff)));
            return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id, fp = std::move(fp), needs_all_rows] () mutable {
                auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
                if (fp.get_owner_shard() == this_shard_id()) {
@@ -2207,6 +2227,25 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
    });
 }

+future<> repair_uninit_messaging_service_handler() {
+    return netw::get_messaging_service().invoke_on_all([] (auto& ms) {
+        return when_all_succeed(
+            ms.unregister_repair_get_row_diff_with_rpc_stream(),
+            ms.unregister_repair_put_row_diff_with_rpc_stream(),
+            ms.unregister_repair_get_full_row_hashes_with_rpc_stream(),
+            ms.unregister_repair_get_full_row_hashes(),
+            ms.unregister_repair_get_combined_row_hash(),
+            ms.unregister_repair_get_sync_boundary(),
+            ms.unregister_repair_get_row_diff(),
+            ms.unregister_repair_put_row_diff(),
+            ms.unregister_repair_row_level_start(),
+            ms.unregister_repair_row_level_stop(),
+            ms.unregister_repair_get_estimated_partitions(),
+            ms.unregister_repair_set_estimated_partitions(),
+            ms.unregister_repair_get_diff_algorithms()).discard_result();
+    });
+}
+
 class row_level_repair {
    repair_info& _ri;
    sstring _cf_name;
@@ -2439,7 +2478,7 @@ private:
            // sequentially because the rows from repair follower 1 to
            // repair master might reduce the amount of missing data
            // between repair master and repair follower 2.
-            std::unordered_set<repair_hash> set_diff = repair_meta::get_set_diff(master.peer_row_hash_sets(node_idx), master.working_row_hashes().get0());
+            repair_hash_set set_diff = repair_meta::get_set_diff(master.peer_row_hash_sets(node_idx), master.working_row_hashes().get0());
            // Request missing sets from peer node
            rlogger.debug("Before get_row_diff to node {}, local={}, peer={}, set_diff={}",
                    node, master.working_row_hashes().get0().size(), master.peer_row_hash_sets(node_idx).size(), set_diff.size());
@@ -2462,9 +2501,9 @@ private:
        // So we can figure out which rows peer node are missing and send the missing rows to them
        check_in_shutdown();
        _ri.check_in_abort();
-        std::unordered_set<repair_hash> local_row_hash_sets = master.working_row_hashes().get0();
+        repair_hash_set local_row_hash_sets = master.working_row_hashes().get0();
        auto sz = _all_live_peer_nodes.size();
-        std::vector<std::unordered_set<repair_hash>> set_diffs(sz);
+        std::vector<repair_hash_set> set_diffs(sz);
        for (size_t idx : boost::irange(size_t(0), sz)) {
            set_diffs[idx] = repair_meta::get_set_diff(local_row_hash_sets, master.peer_row_hash_sets(idx));
        }
--- a/repair/row_level.hh
+++ b/repair/row_level.hh
@@ -45,6 +45,7 @@ private:
 };

 future<> repair_init_messaging_service_handler(repair_service& rs, distributed<db::system_distributed_keyspace>& sys_dist_ks, distributed<db::view::view_update_generator>& view_update_generator);
+future<> repair_uninit_messaging_service_handler();

 class repair_info;

--- a/row_cache.cc
+++ b/row_cache.cc
@@ -1272,7 +1272,9 @@ flat_mutation_reader cache_entry::read(row_cache& rc, read_context& reader, row_
 // Assumes reader is in the corresponding partition
 flat_mutation_reader cache_entry::do_read(row_cache& rc, read_context& reader) {
    auto snp = _pe.read(rc._tracker.region(), rc._tracker.cleaner(), _schema, &rc._tracker, reader.phase());
-    auto ckr = query::clustering_key_filter_ranges::get_ranges(*_schema, reader.slice(), _key.key());
+    auto ckr = with_linearized_managed_bytes([&] {
+        return query::clustering_key_filter_ranges::get_ranges(*_schema, reader.slice(), _key.key());
+    });
    auto r = make_cache_flat_mutation_reader(_schema, _key, std::move(ckr), rc, reader.shared_from_this(), std::move(snp));
    r.upgrade_schema(rc.schema());
    r.upgrade_schema(reader.schema());
--- a/schema.cc
+++ b/schema.cc
@@ -19,6 +19,7 @@
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

+#include <seastar/core/on_internal_error.hh>
 #include <map>
 #include "utils/UUID_gen.hh"
 #include "cql3/column_identifier.hh"
@@ -43,6 +44,8 @@

 constexpr int32_t schema::NAME_LENGTH;

+extern logging::logger dblog;
+
 sstring to_sstring(column_kind k) {
    switch (k) {
    case column_kind::partition_key:  return "PARTITION_KEY";
@@ -592,11 +595,15 @@ schema::get_column_definition(const bytes& name) const {

 const column_definition&
 schema::column_at(column_kind kind, column_id id) const {
-    return _raw._columns.at(column_offset(kind) + id);
+    return column_at(static_cast<ordinal_column_id>(column_offset(kind) + id));
 }

 const column_definition&
 schema::column_at(ordinal_column_id ordinal_id) const {
+    if (size_t(ordinal_id) >= _raw._columns.size()) [[unlikely]] {
+        on_internal_error(dblog, format("{}.{}@{}: column id {:d} >= {:d}",
+            ks_name(), cf_name(), version(), size_t(ordinal_id), _raw._columns.size()));
+    }
    return _raw._columns.at(static_cast<column_count_type>(ordinal_id));
 }

@@ -820,7 +827,7 @@ std::ostream& schema::describe(std::ostream& os) const {
    os << "}";
    os << "\n    AND comment = '" << comment()<< "'";
    os << "\n    AND compaction = {'class': '" <<  sstables::compaction_strategy::name(compaction_strategy()) << "'";
-    map_as_cql_param(os, compaction_strategy_options()) << "}";
+    map_as_cql_param(os, compaction_strategy_options(), false) << "}";
    os << "\n    AND compression = {";
    map_as_cql_param(os,  get_compressor_params().get_options());
    os << "}";
--- a/scripts/create-relocatable-package.py
+++ b/scripts/create-relocatable-package.py
@@ -92,7 +92,8 @@ executables = ['build/{}/scylla'.format(args.mode),
               '/usr/sbin/ethtool',
               '/usr/bin/netstat',
               '/usr/bin/hwloc-distrib',
-               '/usr/bin/hwloc-calc']
+               '/usr/bin/hwloc-calc',
+               '/usr/bin/lsblk']

 output = args.dest

--- a/scylla-gdb.py
+++ b/scylla-gdb.py
@@ -597,7 +597,7 @@ def current_shard():


 def find_db(shard=None):
-    if not shard:
+    if shard is None:
        shard = current_shard()
    return gdb.parse_and_eval('::debug::db')['_instances']['_M_impl']['_M_start'][shard]['service']['_p']

--- a/scylla_post_install.sh
+++ b/scylla_post_install.sh
@@ -63,6 +63,17 @@ MemoryHigh=1200M
 MemoryMax=1400M
 MemoryLimit=1400M
 EOS
+
+# On CentOS7, systemd does not support percentage-based parameter.
+# To apply memory parameter on CentOS7, we need to override the parameter
+# in bytes, instead of percentage.
+elif [ "$RHEL" -a "$VERSION_ID" = "7" ]; then
+    MEMORY_LIMIT=$((MEMTOTAL_BYTES / 100 * 5))
+    mkdir -p /etc/systemd/system/scylla-helper.slice.d/
+    cat << EOS > /etc/systemd/system/scylla-helper.slice.d/memory.conf
+[Slice]
+MemoryLimit=$MEMORY_LIMIT
+EOS
 fi

 systemctl --system daemon-reload >/dev/null || true
--- a/2
+++ b/2
--- a/serializer_impl.hh
+++ b/serializer_impl.hh
@@ -25,6 +25,7 @@
 #include <seastar/util/bool_class.hh>
 #include <boost/range/algorithm/for_each.hpp>
 #include "utils/small_vector.hh"
+#include <absl/container/btree_set.h>

 namespace ser {

@@ -81,6 +82,17 @@ static inline void serialize_array(Output& out, const Container& v) {
 template<typename Container>
 struct container_traits;

+template<typename T>
+struct container_traits<absl::btree_set<T>> {
+    struct back_emplacer {
+        absl::btree_set<T>& c;
+        back_emplacer(absl::btree_set<T>& c_) : c(c_) {}
+        void operator()(T&& v) {
+            c.emplace(std::move(v));
+        }
+    };
+};
+
 template<typename T>
 struct container_traits<std::unordered_set<T>> {
    struct back_emplacer {
@@ -253,6 +265,27 @@ struct serializer<std::list<T>> {
    }
 };

+template<typename T>
+struct serializer<absl::btree_set<T>> {
+    template<typename Input>
+    static absl::btree_set<T> read(Input& in) {
+        auto sz = deserialize(in, boost::type<uint32_t>());
+        absl::btree_set<T> v;
+        deserialize_array_helper<false, T>::doit(in, v, sz);
+        return v;
+    }
+    template<typename Output>
+    static void write(Output& out, const absl::btree_set<T>& v) {
+        safe_serialize_as_uint32(out, v.size());
+        serialize_array_helper<false, T>::doit(out, v);
+    }
+    template<typename Input>
+    static void skip(Input& in) {
+        auto sz = deserialize(in, boost::type<uint32_t>());
+        skip_array<T>(in, sz);
+    }
+};
+
 template<typename T>
 struct serializer<std::unordered_set<T>> {
    template<typename Input>
--- a/service/migration_manager.cc
+++ b/service/migration_manager.cc
@@ -92,7 +92,7 @@ void migration_manager::init_messaging_service()
        //FIXME: future discarded.
        (void)with_gate(_background_tasks, [this] {
            mlogger.debug("features changed, recalculating schema version");
-            return update_schema_version_and_announce(get_storage_proxy(), _feat.cluster_schema_features());
+            return db::schema_tables::recalculate_schema_version(get_storage_proxy(), _feat);
        });
    };

@@ -1104,6 +1104,20 @@ future<schema_ptr> get_schema_definition(table_schema_version v, netw::messaging
        mlogger.debug("Requesting schema {} from {}", v, dst);
        auto& ms = netw::get_local_messaging_service();
        return ms.send_get_schema_version(dst, v);
+    }).then([] (schema_ptr s) {
+        // If this is a view so this schema also needs a reference to the base
+        // table.
+        if (s->is_view()) {
+            if (!s->view_info()->base_info()) {
+                auto& db = service::get_local_storage_proxy().get_db().local();
+                // This line might throw a no_such_column_family
+                // It should be fine since if we tried to register a view for which
+                // we don't know the base table, our registry is broken.
+                schema_ptr base_schema = db.find_schema(s->view_info()->base_id());
+                s->view_info()->set_base_info(s->view_info()->make_base_dependent_view_info(*base_schema));
+            }
+        }
+        return s;
    });
 }

--- a/service/pager/query_pagers.cc
+++ b/service/pager/query_pagers.cc
@@ -347,7 +347,7 @@ public:
            _max = _max - row_count;
            _exhausted = (row_count < page_size && !results->is_short_read()) || _max == 0;

-            if (!_exhausted || row_count > 0) {
+            if (!_exhausted && row_count > 0) {
                if (_last_pkey) {
                    update_slice(*_last_pkey);
                }
--- a/service/storage_proxy.cc
+++ b/service/storage_proxy.cc
@@ -120,9 +120,11 @@ using fbu = utils::fb_utilities;

 static inline
 query::digest_algorithm digest_algorithm(service::storage_proxy& proxy) {
-    return proxy.features().cluster_supports_xxhash_digest_algorithm()
-         ? query::digest_algorithm::xxHash
-         : query::digest_algorithm::MD5;
+    return proxy.features().cluster_supports_digest_for_null_values()
+            ? query::digest_algorithm::xxHash
+            : proxy.features().cluster_supports_xxhash_digest_algorithm()
+                    ? query::digest_algorithm::legacy_xxHash_without_null_digest
+                    : query::digest_algorithm::MD5;
 }

 static inline
@@ -1760,6 +1762,7 @@ storage_proxy::storage_proxy(distributed<database>& db, storage_proxy::config cf
    , _token_metadata(tm)
    , _read_smp_service_group(cfg.read_smp_service_group)
    , _write_smp_service_group(cfg.write_smp_service_group)
+    , _hints_write_smp_service_group(cfg.hints_write_smp_service_group)
    , _write_ack_smp_service_group(cfg.write_ack_smp_service_group)
    , _next_response_id(std::chrono::system_clock::now().time_since_epoch()/1ms)
    , _hints_resource_manager(cfg.available_memory / 10)
@@ -1803,39 +1806,48 @@ storage_proxy::response_id_type storage_proxy::unique_response_handler::release(
 }

 future<>
-storage_proxy::mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, clock_type::time_point timeout) {
+storage_proxy::mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout, smp_service_group smp_grp) {
    auto shard = _db.local().shard_of(m);
    get_stats().replica_cross_shard_ops += shard != this_shard_id();
-    return _db.invoke_on(shard, {_write_smp_service_group, timeout},
-            [s = global_schema_ptr(m.schema()), m = freeze(m), gtr = tracing::global_trace_state_ptr(std::move(tr_state)), timeout] (database& db) mutable -> future<> {
-        return db.apply(s, m, gtr.get(), db::commitlog::force_sync::no, timeout);
+    return _db.invoke_on(shard, {smp_grp, timeout},
+            [s = global_schema_ptr(m.schema()),
+             m = freeze(m),
+             gtr = tracing::global_trace_state_ptr(std::move(tr_state)),
+             timeout,
+             sync] (database& db) mutable -> future<> {
+        return db.apply(s, m, gtr.get(), sync, timeout);
    });
 }

 future<>
-storage_proxy::mutate_locally(const schema_ptr& s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout) {
+storage_proxy::mutate_locally(const schema_ptr& s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout,
+        smp_service_group smp_grp) {
    auto shard = _db.local().shard_of(m);
    get_stats().replica_cross_shard_ops += shard != this_shard_id();
-    return _db.invoke_on(shard, {_write_smp_service_group, timeout},
+    return _db.invoke_on(shard, {smp_grp, timeout},
            [&m, gs = global_schema_ptr(s), gtr = tracing::global_trace_state_ptr(std::move(tr_state)), timeout, sync] (database& db) mutable -> future<> {
        return db.apply(gs, m, gtr.get(), sync, timeout);
    });
 }

 future<>
-storage_proxy::mutate_locally(std::vector<mutation> mutations, tracing::trace_state_ptr tr_state, clock_type::time_point timeout) {
-    return do_with(std::move(mutations), [this, timeout, tr_state = std::move(tr_state)] (std::vector<mutation>& pmut) mutable {
-        return parallel_for_each(pmut.begin(), pmut.end(), [this, tr_state = std::move(tr_state), timeout] (const mutation& m) mutable {
-            return mutate_locally(m, tr_state, timeout);
+storage_proxy::mutate_locally(std::vector<mutation> mutations, tracing::trace_state_ptr tr_state, clock_type::time_point timeout, smp_service_group smp_grp) {
+    return do_with(std::move(mutations), [this, timeout, tr_state = std::move(tr_state), smp_grp] (std::vector<mutation>& pmut) mutable {
+        return parallel_for_each(pmut.begin(), pmut.end(), [this, tr_state = std::move(tr_state), timeout, smp_grp] (const mutation& m) mutable {
+            return mutate_locally(m, tr_state, db::commitlog::force_sync::no, timeout, smp_grp);
        });
    });
 }

+future<> 
+storage_proxy::mutate_locally(std::vector<mutation> mutation, tracing::trace_state_ptr tr_state, clock_type::time_point timeout) {
+        return mutate_locally(std::move(mutation), tr_state, timeout, _write_smp_service_group);
+}
 future<>
 storage_proxy::mutate_hint(const schema_ptr& s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, clock_type::time_point timeout) {
    auto shard = _db.local().shard_of(m);
    get_stats().replica_cross_shard_ops += shard != this_shard_id();
-    return _db.invoke_on(shard, {_write_smp_service_group, timeout}, [&m, gs = global_schema_ptr(s), tr_state = std::move(tr_state), timeout] (database& db) mutable -> future<> {
+    return _db.invoke_on(shard, {_hints_write_smp_service_group, timeout}, [&m, gs = global_schema_ptr(s), tr_state = std::move(tr_state), timeout] (database& db) mutable -> future<> {
        return db.apply_hint(gs, m, std::move(tr_state), timeout);
    });
 }
@@ -2573,14 +2585,13 @@ future<> storage_proxy::send_hint_to_endpoint(frozen_mutation_and_schema fm_a_s,
 }

 future<> storage_proxy::send_hint_to_all_replicas(frozen_mutation_and_schema fm_a_s) {
-    const auto timeout = db::timeout_clock::now() + 1h;
    if (!_features.cluster_supports_hinted_handoff_separate_connection()) {
        std::array<mutation, 1> ms{fm_a_s.fm.unfreeze(fm_a_s.s)};
-        return mutate_internal(std::move(ms), db::consistency_level::ALL, false, nullptr, empty_service_permit(), timeout);
+        return mutate_internal(std::move(ms), db::consistency_level::ALL, false, nullptr, empty_service_permit());
    }

    std::array<hint_wrapper, 1> ms{hint_wrapper { std::move(fm_a_s.fm.unfreeze(fm_a_s.s)) }};
-    return mutate_internal(std::move(ms), db::consistency_level::ALL, false, nullptr, empty_service_permit(), timeout);
+    return mutate_internal(std::move(ms), db::consistency_level::ALL, false, nullptr, empty_service_permit());
 }

 /**
@@ -4488,6 +4499,12 @@ future<bool> storage_proxy::cas(schema_ptr schema, shared_ptr<cas_request> reque
                                    paxos::paxos_state::logger.debug("CAS[{}] successful", handler->id());
                                    tracing::trace(handler->tr_state, "CAS successful");
                                    return std::optional<bool>(condition_met);
+                                }).handle_exception_type([handler] (unavailable_exception& e) {
+                                    // if learning stage encountered unavailablity error lets re-map it to a write error
+                                    // since unavailable error means that operation has never ever started which is not the case here
+                                    schema_ptr schema = handler->schema();
+                                    return make_exception_future<std::optional<bool>>(mutation_write_timeout_exception(schema->ks_name(), schema->cf_name(),
+                                                               e.consistency, e.alive, e.required, db::write_type::CAS));
                                });
                            }
                            paxos::paxos_state::logger.debug("CAS[{}] PAXOS proposal not accepted (pre-empted by a higher ballot)",
@@ -4849,7 +4866,7 @@ void storage_proxy::init_messaging_service() {
        });
    };

-    auto receive_mutation_handler = [] (const rpc::client_info& cinfo, rpc::opt_time_point t, frozen_mutation in, std::vector<gms::inet_address> forward,
+    auto receive_mutation_handler = [] (smp_service_group smp_grp, const rpc::client_info& cinfo, rpc::opt_time_point t, frozen_mutation in, std::vector<gms::inet_address> forward,
            gms::inet_address reply_to, unsigned shard, storage_proxy::response_id_type response_id, rpc::optional<std::optional<tracing::trace_info>> trace_info) {
        tracing::trace_state_ptr trace_state_ptr;
        auto src_addr = netw::messaging_service::get_source(cinfo);
@@ -4857,9 +4874,9 @@ void storage_proxy::init_messaging_service() {
        utils::UUID schema_version = in.schema_version();
        return handle_write(src_addr, t, schema_version, std::move(in), std::move(forward), reply_to, shard, response_id,
                trace_info ? *trace_info : std::nullopt,
-                /* apply_fn */ [] (shared_ptr<storage_proxy>& p, tracing::trace_state_ptr tr_state, schema_ptr s, const frozen_mutation& m,
+                /* apply_fn */ [smp_grp] (shared_ptr<storage_proxy>& p, tracing::trace_state_ptr tr_state, schema_ptr s, const frozen_mutation& m,
                        clock_type::time_point timeout) {
-                    return p->mutate_locally(std::move(s), m, std::move(tr_state), db::commitlog::force_sync::no, timeout);
+                    return p->mutate_locally(std::move(s), m, std::move(tr_state), db::commitlog::force_sync::no, timeout, smp_grp);
                },
                /* forward_fn */ [] (netw::messaging_service::msg_addr addr, clock_type::time_point timeout, const frozen_mutation& m,
                        gms::inet_address reply_to, unsigned shard, response_id_type response_id,
@@ -4868,8 +4885,8 @@ void storage_proxy::init_messaging_service() {
                    return ms.send_mutation(addr, timeout, m, {}, reply_to, shard, response_id, std::move(trace_info));
                });
    };
-    ms.register_mutation(receive_mutation_handler);
-    ms.register_hint_mutation(receive_mutation_handler);
+    ms.register_mutation(std::bind_front<>(receive_mutation_handler, _write_smp_service_group));
+    ms.register_hint_mutation(std::bind_front<>(receive_mutation_handler, _hints_write_smp_service_group));

    ms.register_paxos_learn([] (const rpc::client_info& cinfo, rpc::opt_time_point t, paxos::proposal decision,
            std::vector<gms::inet_address> forward, gms::inet_address reply_to, unsigned shard,
@@ -5112,18 +5129,22 @@ void storage_proxy::init_messaging_service() {
 future<> storage_proxy::uninit_messaging_service() {
    auto& ms = netw::get_local_messaging_service();
    return when_all_succeed(
+        ms.unregister_counter_mutation(),
        ms.unregister_mutation(),
+        ms.unregister_hint_mutation(),
        ms.unregister_mutation_done(),
        ms.unregister_mutation_failed(),
        ms.unregister_read_data(),
        ms.unregister_read_mutation_data(),
        ms.unregister_read_digest(),
        ms.unregister_truncate(),
+        ms.unregister_get_schema_version(),
        ms.unregister_paxos_prepare(),
        ms.unregister_paxos_accept(),
        ms.unregister_paxos_learn(),
        ms.unregister_paxos_prune()
    ).discard_result();
+
 }

 future<rpc::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>>
@@ -5217,8 +5238,7 @@ future<> storage_proxy::drain_on_shutdown() {

 future<>
 storage_proxy::stop() {
-    // FIXME: hints manager should be stopped here but it seems like this function is never called
-    return uninit_messaging_service();
+    return make_ready_future<>();
 }

 }
--- a/service/storage_proxy.hh
+++ b/service/storage_proxy.hh
@@ -166,6 +166,7 @@ public:
        size_t available_memory;
        smp_service_group read_smp_service_group = default_smp_service_group();
        smp_service_group write_smp_service_group = default_smp_service_group();
+        smp_service_group hints_write_smp_service_group = default_smp_service_group();
        // Write acknowledgments might not be received on the correct shard, and
        // they need a separate smp_service_group to prevent an ABBA deadlock
        // with writes.
@@ -256,6 +257,7 @@ private:
    locator::token_metadata& _token_metadata;
    smp_service_group _read_smp_service_group;
    smp_service_group _write_smp_service_group;
+    smp_service_group _hints_write_smp_service_group;
    smp_service_group _write_ack_smp_service_group;
    response_id_type _next_response_id;
    response_handlers_map _response_handlers;
@@ -314,7 +316,6 @@ private:

    cdc_stats _cdc_stats;
 private:
-    future<> uninit_messaging_service();
    future<coordinator_query_result> query_singular(lw_shared_ptr<query::read_command> cmd,
            dht::partition_range_vector&& partition_ranges,
            db::consistency_level cl,
@@ -469,13 +470,31 @@ public:
        return next;
    }
    void init_messaging_service();
+    future<> uninit_messaging_service();

+private:
    // Applies mutation on this node.
    // Resolves with timed_out_error when timeout is reached.
-    future<> mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, clock_type::time_point timeout = clock_type::time_point::max());
+    future<> mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout, smp_service_group smp_grp);
    // Applies mutation on this node.
    // Resolves with timed_out_error when timeout is reached.
-    future<> mutate_locally(const schema_ptr&, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout = clock_type::time_point::max());
+    future<> mutate_locally(const schema_ptr&, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout,
+            smp_service_group smp_grp);
+    // Applies mutations on this node.
+    // Resolves with timed_out_error when timeout is reached.
+    future<> mutate_locally(std::vector<mutation> mutation, tracing::trace_state_ptr tr_state, clock_type::time_point timeout, smp_service_group smp_grp);
+
+public:
+    // Applies mutation on this node.
+    // Resolves with timed_out_error when timeout is reached.
+    future<> mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout = clock_type::time_point::max()) {
+        return mutate_locally(m, tr_state, sync, timeout, _write_smp_service_group);
+    }
+    // Applies mutation on this node.
+    // Resolves with timed_out_error when timeout is reached.
+    future<> mutate_locally(const schema_ptr& s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout = clock_type::time_point::max()) {
+        return mutate_locally(s, m, tr_state, sync, timeout, _write_smp_service_group);
+    }
    // Applies mutations on this node.
    // Resolves with timed_out_error when timeout is reached.
    future<> mutate_locally(std::vector<mutation> mutation, tracing::trace_state_ptr tr_state, clock_type::time_point timeout = clock_type::time_point::max());
--- a/Show More
+++ b/Show More