release: prepare for 4.0.11

Update seastar submodule
* seastar 065a40b34a...748428930a (1): > append_challenged_posix_file_impl: allow destructing file with no queued work. Fixes #7285.
2020-10-26 18:12:47 +02:00 · 2020-10-19 15:06:24 +03:00 · 2020-10-19 15:05:13 +03:00 · 2020-10-18 15:03:04 +03:00 · 2020-10-06 17:12:28 +03:00 · 2020-10-04 18:05:00 +03:00
185 changed files with 4391 additions and 1295 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
@@ -15,3 +15,6 @@
 [submodule "zstd"]
 	path = zstd
 	url = ../zstd
+[submodule "abseil"]
+	path = abseil
+	url = ../abseil-cpp
--- a/2
+++ b/2
@@ -1,7 +1,7 @@
 #!/bin/sh

 PRODUCT=scylla
-VERSION=666.development
+VERSION=4.0.11

 if test -f version
 then
--- a/1
+++ b/1
--- a/alternator/auth.cc
+++ b/alternator/auth.cc
@@ -66,8 +66,9 @@ static std::string format_time_point(db_clock::time_point tp) {
    time_t time_point_repr = db_clock::to_time_t(tp);
    std::string time_point_str;
    time_point_str.resize(17);
+    ::tm time_buf;
    // strftime prints the terminating null character as well
-    std::strftime(time_point_str.data(), time_point_str.size(), "%Y%m%dT%H%M%SZ", std::gmtime(&time_point_repr));
+    std::strftime(time_point_str.data(), time_point_str.size(), "%Y%m%dT%H%M%SZ", ::gmtime_r(&time_point_repr, &time_buf));
    time_point_str.resize(16);
    return time_point_str;
 }
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -365,31 +365,35 @@ bool check_compare(const rjson::value* v1, const rjson::value& v2, const Compara

 struct cmp_lt {
    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs < rhs; }
+    // We cannot use the normal comparison operators like "<" on the bytes
+    // type, because they treat individual bytes as signed but we need to
+    // compare them as *unsigned*. So we need a specialization for bytes.
+    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) < 0; }
    static constexpr const char* diagnostic = "LT operator";
 };

 struct cmp_le {
-    // bytes only has <, so we cannot use <=.
-    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs < rhs || lhs == rhs; }
+    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs <= rhs; }
+    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) <= 0; }
    static constexpr const char* diagnostic = "LE operator";
 };

 struct cmp_ge {
-    // bytes only has <, so we cannot use >=.
-    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return rhs < lhs || lhs == rhs; }
+    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs >= rhs; }
+    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) >= 0; }
    static constexpr const char* diagnostic = "GE operator";
 };

 struct cmp_gt {
-    // bytes only has <, so we cannot use >.
-    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return rhs < lhs; }
+    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs > rhs; }
+    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) > 0; }
    static constexpr const char* diagnostic = "GT operator";
 };

 // True if v is between lb and ub, inclusive.  Throws if lb > ub.
 template <typename T>
 bool check_BETWEEN(const T& v, const T& lb, const T& ub) {
-    if (ub < lb) {
+    if (cmp_lt()(ub, lb)) {
        throw api_error("ValidationException",
                        format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
    }
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -208,12 +208,11 @@ get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) {
            throw api_error("ValidationException",
                    format("Non-string IndexName '{}'", index_name->GetString()));
        }
-    }
-
-    // If no tables for global indexes were found, the index may be local
-    if (!proxy.get_db().local().has_schema(keyspace_name, table_name)) {
-        type = table_or_view_type::lsi;
-        table_name = lsi_name(orig_table_name, index_name->GetString());
+        // If no tables for global indexes were found, the index may be local
+        if (!proxy.get_db().local().has_schema(keyspace_name, table_name)) {
+            type = table_or_view_type::lsi;
+            table_name = lsi_name(orig_table_name, index_name->GetString());
+        }
    }

    try {
@@ -566,7 +565,7 @@ static void validate_tags(const std::map<sstring, sstring>& tags) {
 // to races during concurrent updates of the same table. Once Scylla schema updates
 // are fixed, this issue will automatically get fixed as well.
 enum class update_tags_action { add_tags, delete_tags };
-static future<> update_tags(const rjson::value& tags, schema_ptr schema, std::map<sstring, sstring>&& tags_map, update_tags_action action) {
+static future<> update_tags(service::migration_manager& mm, const rjson::value& tags, schema_ptr schema, std::map<sstring, sstring>&& tags_map, update_tags_action action) {
    if (action == update_tags_action::add_tags) {
        for (auto it = tags.Begin(); it != tags.End(); ++it) {
            const rjson::value& key = (*it)["Key"];
@@ -593,24 +592,12 @@ static future<> update_tags(const rjson::value& tags, schema_ptr schema, std::ma
    }
    validate_tags(tags_map);

-    std::stringstream serialized_tags;
-    serialized_tags << '{';
-    for (auto& tag_entry : tags_map) {
-        serialized_tags << format("'{}':'{}',", tag_entry.first, tag_entry.second);
-    }
-    std::string serialized_tags_str = serialized_tags.str();
-    if (!tags_map.empty()) {
-        serialized_tags_str[serialized_tags_str.size() - 1] = '}'; // trims the last ',' delimiter
-    } else {
-        serialized_tags_str.push_back('}');
-    }
-
-    sstring req = format("ALTER TABLE \"{}\".\"{}\" WITH {} = {}",
-            schema->ks_name(), schema->cf_name(), tags_extension::NAME, serialized_tags_str);
-    return db::execute_cql(std::move(req)).discard_result();
+    schema_builder builder(schema);
+    builder.set_extensions(schema::extensions_map{{sstring(tags_extension::NAME), ::make_shared<tags_extension>(std::move(tags_map))}});
+    return mm.announce_column_family_update(builder.build(), false, std::vector<view_ptr>(), false);
 }

-static future<> add_tags(service::storage_proxy& proxy, schema_ptr schema, rjson::value& request_info) {
+static future<> add_tags(service::migration_manager& mm, service::storage_proxy& proxy, schema_ptr schema, rjson::value& request_info) {
    const rjson::value* tags = rjson::find(request_info, "Tags");
    if (!tags || !tags->IsArray()) {
        return make_exception_future<>(api_error("ValidationException", format("Cannot parse tags")));
@@ -620,7 +607,7 @@ static future<> add_tags(service::storage_proxy& proxy, schema_ptr schema, rjson
    }

    std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
-    return update_tags(rjson::copy(*tags), schema, std::move(tags_map), update_tags_action::add_tags);
+    return update_tags(mm, rjson::copy(*tags), schema, std::move(tags_map), update_tags_action::add_tags);
 }

 future<executor::request_return_type> executor::tag_resource(client_state& client_state, service_permit permit, rjson::value request) {
@@ -632,7 +619,7 @@ future<executor::request_return_type> executor::tag_resource(client_state& clien
            return api_error("AccessDeniedException", "Incorrect resource identifier");
        }
        schema_ptr schema = get_table_from_arn(_proxy, std::string_view(arn->GetString(), arn->GetStringLength()));
-        add_tags(_proxy, schema, request).get();
+        add_tags(_mm, _proxy, schema, request).get();
        return json_string("");
    });
 }
@@ -653,7 +640,7 @@ future<executor::request_return_type> executor::untag_resource(client_state& cli
        schema_ptr schema = get_table_from_arn(_proxy, std::string_view(arn->GetString(), arn->GetStringLength()));

        std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
-        update_tags(*tags, schema, std::move(tags_map), update_tags_action::delete_tags).get();
+        update_tags(_mm, *tags, schema, std::move(tags_map), update_tags_action::delete_tags).get();
        return json_string("");
    });
 }
@@ -870,7 +857,7 @@ future<executor::request_return_type> executor::create_table(client_state& clien
            }).then([this, table_info = std::move(table_info), schema] () mutable {
                future<> f = make_ready_future<>();
                if (rjson::find(table_info, "Tags")) {
-                    f = add_tags(_proxy, schema, table_info);
+                    f = add_tags(_mm, _proxy, schema, table_info);
                }
                return f.then([table_info = std::move(table_info), schema] () mutable {
                    rjson::value status = rjson::empty_object();
@@ -900,15 +887,24 @@ class attribute_collector {
    void add(bytes&& name, atomic_cell&& cell) {
        collected.emplace(std::move(name), std::move(cell));
    }
+    void add(const bytes& name, atomic_cell&& cell) {
+        collected.emplace(name, std::move(cell));
+    }
 public:
    attribute_collector() : collected(attrs_type()->get_keys_type()->as_less_comparator()) { }
-    void put(bytes&& name, bytes&& val, api::timestamp_type ts) {
-        add(std::move(name), atomic_cell::make_live(*bytes_type, ts, std::move(val), atomic_cell::collection_member::yes));
+    void put(bytes&& name, const bytes& val, api::timestamp_type ts) {
+        add(std::move(name), atomic_cell::make_live(*bytes_type, ts, val, atomic_cell::collection_member::yes));

    }
+    void put(const bytes& name, const bytes& val, api::timestamp_type ts) {
+        add(name, atomic_cell::make_live(*bytes_type, ts, val, atomic_cell::collection_member::yes));
+    }
    void del(bytes&& name, api::timestamp_type ts) {
        add(std::move(name), atomic_cell::make_dead(ts, gc_clock::now()));
    }
+    void del(const bytes& name, api::timestamp_type ts) {
+        add(name, atomic_cell::make_dead(ts, gc_clock::now()));
+    }
    collection_mutation_description to_mut() {
        collection_mutation_description ret;
        for (auto&& e : collected) {
@@ -988,7 +984,7 @@ public:
    put_or_delete_item(const rjson::value& item, schema_ptr schema, put_item);
    // put_or_delete_item doesn't keep a reference to schema (so it can be
    // moved between shards for LWT) so it needs to be given again to build():
-    mutation build(schema_ptr schema, api::timestamp_type ts);
+    mutation build(schema_ptr schema, api::timestamp_type ts) const;
    const partition_key& pk() const { return _pk; }
    const clustering_key& ck() const { return _ck; }
 };
@@ -1017,20 +1013,29 @@ put_or_delete_item::put_or_delete_item(const rjson::value& item, schema_ptr sche
    }
 }

-mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) {
+mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) const {
    mutation m(schema, _pk);
-    auto& row = m.partition().clustered_row(*schema, _ck);
+    // If there's no clustering key, a tombstone should be created directly
+    // on a partition, not on a clustering row - otherwise it will look like
+    // an open-ended range tombstone, which will crash on KA/LA sstable format.
+    // Ref: #6035
+    const bool use_partition_tombstone = schema->clustering_key_size() == 0;
    if (!_cells) {
-        // a DeleteItem operation:
-        row.apply(tombstone(ts, gc_clock::now()));
+        if (use_partition_tombstone) {
+            m.partition().apply(tombstone(ts, gc_clock::now()));
+        } else {
+            // a DeleteItem operation:
+            m.partition().clustered_row(*schema, _ck).apply(tombstone(ts, gc_clock::now()));
+        }
        return m;
    }
    // else, a PutItem operation:
+    auto& row = m.partition().clustered_row(*schema, _ck);
    attribute_collector attrs_collector;
    for (auto& c : *_cells) {
        const column_definition* cdef = schema->get_column_definition(c.column_name);
        if (!cdef) {
-            attrs_collector.put(std::move(c.column_name), std::move(c.value), ts);
+            attrs_collector.put(c.column_name, c.value, ts);
        } else {
            row.cells().apply(*cdef, atomic_cell::make_live(*cdef->type, ts, std::move(c.value)));
        }
@@ -1048,7 +1053,11 @@ mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) {
    // Scylla proper, to implement the operation to replace an entire
    // collection ("UPDATE .. SET x = ..") - see
    // cql3::update_parameters::make_tombstone_just_before().
-    row.apply(tombstone(ts-1, gc_clock::now()));
+    if (use_partition_tombstone) {
+        m.partition().apply(tombstone(ts-1, gc_clock::now()));
+    } else {
+        row.apply(tombstone(ts-1, gc_clock::now()));
+    }
    return m;
 }

@@ -1202,11 +1211,6 @@ std::optional<shard_id> rmw_operation::shard_for_execute(bool needs_read_before_
 // PutItem, DeleteItem). All these return nothing by default, but can
 // optionally return Attributes if requested via the ReturnValues option.
 static future<executor::request_return_type> rmw_operation_return(rjson::value&& attributes) {
-    // As an optimization, in the simple and common case that nothing is to be
-    // returned, quickly return an empty result:
-    if (attributes.IsNull()) {
-        return make_ready_future<executor::request_return_type>(json_string(""));
-    }
    rjson::value ret = rjson::empty_object();
    if (!attributes.IsNull()) {
        rjson::set(ret, "Attributes", std::move(attributes));
@@ -1331,7 +1335,7 @@ public:
               check_needs_read_before_write(_condition_expression) ||
               _returnvalues == returnvalues::ALL_OLD;
    }
-    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) override {
+    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override {
        std::unordered_set<std::string> used_attribute_values;
        std::unordered_set<std::string> used_attribute_names;
        if (!verify_expected(_request, previous_item) ||
@@ -1343,6 +1347,7 @@ public:
            // efficient than throwing an exception.
            return {};
        }
+        _return_attributes = {};
        if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
            // previous_item is supposed to have been created with
            // describe_item(), so has the "Item" attribute:
@@ -1409,7 +1414,7 @@ public:
                check_needs_read_before_write(_condition_expression) ||
                _returnvalues == returnvalues::ALL_OLD;
    }
-    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) override {
+    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override {
        std::unordered_set<std::string> used_attribute_values;
        std::unordered_set<std::string> used_attribute_names;
        if (!verify_expected(_request, previous_item) ||
@@ -1421,6 +1426,7 @@ public:
            // efficient than throwing an exception.
            return {};
        }
+        _return_attributes = {};
        if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
            rjson::value* item = rjson::find(*previous_item, "Item");
            if (item) {
@@ -1504,7 +1510,7 @@ public:
    virtual ~put_or_delete_item_cas_request() = default;
    virtual std::optional<mutation> apply(query::result& qr, const query::partition_slice& slice, api::timestamp_type ts) override {
        std::optional<mutation> ret;
-        for (put_or_delete_item& mutation_builder : _mutation_builders) {
+        for (const put_or_delete_item& mutation_builder : _mutation_builders) {
            // We assume all these builders have the same partition.
            if (ret) {
                ret->apply(mutation_builder.build(schema, ts));
@@ -2329,7 +2335,7 @@ public:

    update_item_operation(service::storage_proxy& proxy, rjson::value&& request);
    virtual ~update_item_operation() = default;
-    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) override;
+    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override;
    bool needs_read_before_write() const;
 };

@@ -2393,7 +2399,7 @@ update_item_operation::needs_read_before_write() const {
 }

 std::optional<mutation>
-update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) {
+update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const {
    std::unordered_set<std::string> used_attribute_values;
    std::unordered_set<std::string> used_attribute_names;
    if (!verify_expected(_request, previous_item) ||
@@ -2773,6 +2779,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
            [] (std::vector<std::tuple<std::string, std::optional<rjson::value>>> responses) {
        rjson::value response = rjson::empty_object();
        rjson::set(response, "Responses", rjson::empty_object());
+        rjson::set(response, "UnprocessedKeys", rjson::empty_object());
        for (auto& t : responses) {
            if (!response["Responses"].HasMember(std::get<0>(t).c_str())) {
                rjson::set_with_string_name(response["Responses"], std::get<0>(t), rjson::empty_array());
@@ -2889,6 +2896,7 @@ static future<executor::request_return_type> do_query(schema_ptr schema,
        uint32_t limit,
        db::consistency_level cl,
        ::shared_ptr<cql3::restrictions::statement_restrictions> filtering_restrictions,
+        query::partition_slice::option_set custom_opts,
        service::client_state& client_state,
        cql3::cql_stats& cql_stats,
        tracing::trace_state_ptr trace_state,
@@ -2909,7 +2917,9 @@ static future<executor::request_return_type> do_query(schema_ptr schema,
    auto regular_columns = boost::copy_range<query::column_id_vector>(
            schema->regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.id; }));
    auto selection = cql3::selection::selection::wildcard(schema);
-    auto partition_slice = query::partition_slice(std::move(ck_bounds), {}, std::move(regular_columns), selection->get_query_options());
+    query::partition_slice::option_set opts = selection->get_query_options();
+    opts.add(custom_opts);
+    auto partition_slice = query::partition_slice(std::move(ck_bounds), {}, std::move(regular_columns), opts);
    auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, query::max_partitions);

    auto query_state_ptr = std::make_unique<service::query_state>(client_state, trace_state, std::move(permit));
@@ -2939,11 +2949,38 @@ static future<executor::request_return_type> do_query(schema_ptr schema,
    });
 }

+static dht::token token_for_segment(int segment, int total_segments) {
+    assert(total_segments > 1 && segment >= 0 && segment < total_segments);
+    uint64_t delta = std::numeric_limits<uint64_t>::max() / total_segments;
+    return dht::token::from_int64(std::numeric_limits<int64_t>::min() + delta * segment);
+}
+
+static dht::partition_range get_range_for_segment(int segment, int total_segments) {
+    if (total_segments == 1) {
+        return dht::partition_range::make_open_ended_both_sides();
+    }
+    if (segment == 0) {
+        dht::token ending_token = token_for_segment(1, total_segments);
+        return dht::partition_range::make_ending_with(
+                dht::partition_range::bound(dht::ring_position::ending_at(ending_token), false));
+    } else if (segment == total_segments - 1) {
+        dht::token starting_token = token_for_segment(segment, total_segments);
+        return dht::partition_range::make_starting_with(
+                dht::partition_range::bound(dht::ring_position::starting_at(starting_token)));
+    } else {
+        dht::token starting_token = token_for_segment(segment, total_segments);
+        dht::token ending_token = token_for_segment(segment + 1, total_segments);
+        return dht::partition_range::make(
+            dht::partition_range::bound(dht::ring_position::starting_at(starting_token)),
+            dht::partition_range::bound(dht::ring_position::ending_at(ending_token), false)
+        );
+    }
+}
+
 // TODO(sarna):
 // 1. Paging must have 1MB boundary according to the docs. IIRC we do have a replica-side reply size limit though - verify.
 // 2. Filtering - by passing appropriately created restrictions to pager as a last parameter
 // 3. Proper timeouts instead of gc_clock::now() and db::no_timeout
-// 4. Implement parallel scanning via Segments
 future<executor::request_return_type> executor::scan(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.scan++;
    elogger.trace("Scanning {}", request);
@@ -2954,10 +2991,21 @@ future<executor::request_return_type> executor::scan(client_state& client_state,
        return make_ready_future<request_return_type>(api_error("ValidationException",
                "FilterExpression is not yet implemented in alternator"));
    }
-    if (get_int_attribute(request, "Segment") || get_int_attribute(request, "TotalSegments")) {
-        // FIXME: need to support parallel scan. See issue #5059.
-        return make_ready_future<request_return_type>(api_error("ValidationException",
-                "Scan Segment/TotalSegments is not yet implemented in alternator"));
+    auto segment = get_int_attribute(request, "Segment");
+    auto total_segments = get_int_attribute(request, "TotalSegments");
+    if (segment || total_segments) {
+        if (!segment || !total_segments) {
+            return make_ready_future<request_return_type>(api_error("ValidationException",
+                    "Both Segment and TotalSegments attributes need to be present for a parallel scan"));
+        }
+        if (*segment < 0 || *segment >= *total_segments) {
+            return make_ready_future<request_return_type>(api_error("ValidationException",
+                    "Segment must be non-negative and less than TotalSegments"));
+        }
+        if (*total_segments < 0 || *total_segments > 1000000) {
+            return make_ready_future<request_return_type>(api_error("ValidationException",
+                    "TotalSegments must be non-negative and less or equal to 1000000"));
+        }
    }

    rjson::value* exclusive_start_key = rjson::find(request, "ExclusiveStartKey");
@@ -2976,7 +3024,12 @@ future<executor::request_return_type> executor::scan(client_state& client_state,

    auto attrs_to_get = calculate_attrs_to_get(request);

-    dht::partition_range_vector partition_ranges{dht::partition_range::make_open_ended_both_sides()};
+    dht::partition_range_vector partition_ranges;
+    if (segment) {
+        partition_ranges.push_back(get_range_for_segment(*segment, *total_segments));
+    } else {
+        partition_ranges.push_back(dht::partition_range::make_open_ended_both_sides());
+    }
    std::vector<query::clustering_range> ck_bounds{query::clustering_range::make_open_ended_both_sides()};

    ::shared_ptr<cql3::restrictions::statement_restrictions> filtering_restrictions;
@@ -2986,14 +3039,15 @@ future<executor::request_return_type> executor::scan(client_state& client_state,
        partition_ranges = filtering_restrictions->get_partition_key_ranges(query_options);
        ck_bounds = filtering_restrictions->get_clustering_bounds(query_options);
    }
-    return do_query(schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl, std::move(filtering_restrictions), client_state, _stats.cql_stats, trace_state, std::move(permit));
+    return do_query(schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl,
+            std::move(filtering_restrictions), query::partition_slice::option_set(), client_state, _stats.cql_stats, trace_state, std::move(permit));
 }

 static dht::partition_range calculate_pk_bound(schema_ptr schema, const column_definition& pk_cdef, comparison_operator_type op, const rjson::value& attrs) {
    if (attrs.Size() != 1) {
        throw api_error("ValidationException", format("Only a single attribute is allowed for a hash key restriction: {}", attrs));
    }
-    bytes raw_value = pk_cdef.type->from_string(attrs[0][type_to_string(pk_cdef.type)].GetString());
+    bytes raw_value = get_key_from_typed_value(attrs[0], pk_cdef);
    partition_key pk = partition_key::from_singular(*schema, pk_cdef.type->deserialize(raw_value));
    auto decorated_key = dht::decorate_key(*schema, pk);
    if (op != comparison_operator_type::EQ) {
@@ -3018,7 +3072,7 @@ static query::clustering_range calculate_ck_bound(schema_ptr schema, const colum
    if (attrs.Size() != expected_attrs_size) {
        throw api_error("ValidationException", format("{} arguments expected for a sort key restriction: {}", expected_attrs_size, attrs));
    }
-    bytes raw_value = ck_cdef.type->from_string(attrs[0][type_to_string(ck_cdef.type)].GetString());
+    bytes raw_value = get_key_from_typed_value(attrs[0], ck_cdef);
    clustering_key ck = clustering_key::from_single_value(*schema, raw_value);
    switch (op) {
    case comparison_operator_type::EQ:
@@ -3032,7 +3086,7 @@ static query::clustering_range calculate_ck_bound(schema_ptr schema, const colum
    case comparison_operator_type::GT:
        return query::clustering_range::make_starting_with(query::clustering_range::bound(ck, false));
    case comparison_operator_type::BETWEEN: {
-        bytes raw_upper_limit = ck_cdef.type->from_string(attrs[1][type_to_string(ck_cdef.type)].GetString());
+        bytes raw_upper_limit = get_key_from_typed_value(attrs[1], ck_cdef);
        clustering_key upper_limit = clustering_key::from_single_value(*schema, raw_upper_limit);
        return query::clustering_range::make(query::clustering_range::bound(ck), query::clustering_range::bound(upper_limit));
    }
@@ -3045,9 +3099,7 @@ static query::clustering_range calculate_ck_bound(schema_ptr schema, const colum
        if (!ck_cdef.type->is_compatible_with(*utf8_type)) {
            throw api_error("ValidationException", format("BEGINS_WITH operator cannot be applied to type {}", type_to_string(ck_cdef.type)));
        }
-        std::string raw_upper_limit_str = attrs[0][type_to_string(ck_cdef.type)].GetString();
-        bytes raw_upper_limit = ck_cdef.type->from_string(raw_upper_limit_str);
-        return get_clustering_range_for_begins_with(std::move(raw_upper_limit), ck, schema, ck_cdef.type);
+        return get_clustering_range_for_begins_with(std::move(raw_value), ck, schema, ck_cdef.type);
    }
    default:
        throw api_error("ValidationException", format("Unknown primary key bound passed: {}", int(op)));
@@ -3429,11 +3481,7 @@ future<executor::request_return_type> executor::query(client_state& client_state
    if (rjson::find(request, "FilterExpression")) {
        return make_ready_future<request_return_type>(api_error("ValidationException", "FilterExpression is not yet implemented in alternator"));
    }
-    bool forward = get_bool_attribute(request, "ScanIndexForward", true);
-    if (!forward) {
-        // FIXME: need to support the !forward (i.e., reverse sort order) case. See issue #5153.
-        return make_ready_future<request_return_type>(api_error("ValidationException", "ScanIndexForward=false is not yet implemented in alternator"));
-    }
+    const bool forward = get_bool_attribute(request, "ScanIndexForward", true);

    rjson::value* key_conditions = rjson::find(request, "KeyConditions");
    rjson::value* key_condition_expression = rjson::find(request, "KeyConditionExpression");
@@ -3476,7 +3524,10 @@ future<executor::request_return_type> executor::query(client_state& client_state
    }
    verify_all_are_used(request, "ExpressionAttributeValues", used_attribute_values, "KeyConditionExpression");
    verify_all_are_used(request, "ExpressionAttributeNames", used_attribute_names, "KeyConditionExpression");
-    return do_query(schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl, std::move(filtering_restrictions), client_state, _stats.cql_stats, std::move(trace_state), std::move(permit));
+    query::partition_slice::option_set opts;
+    opts.set_if<query::partition_slice::option::reversed>(!forward);
+    return do_query(schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl,
+            std::move(filtering_restrictions), opts, client_state, _stats.cql_stats, std::move(trace_state), std::move(permit));
 }

 future<executor::request_return_type> executor::list_tables(client_state& client_state, service_permit permit, rjson::value request) {
@@ -3567,12 +3618,12 @@ static std::map<sstring, sstring> get_network_topology_options(int rf) {
 // manually create the keyspace to override this predefined behavior.
 future<> executor::create_keyspace(std::string_view keyspace_name) {
    sstring keyspace_name_str(keyspace_name);
-    return gms::get_up_endpoint_count().then([this, keyspace_name_str = std::move(keyspace_name_str)] (int up_endpoint_count) {
+    return gms::get_all_endpoint_count().then([this, keyspace_name_str = std::move(keyspace_name_str)] (int endpoint_count) {
        int rf = 3;
-        if (up_endpoint_count < rf) {
+        if (endpoint_count < rf) {
            rf = 1;
-            elogger.warn("Creating keyspace '{}' for Alternator with unsafe RF={} because cluster only has {} live nodes.",
-                    keyspace_name_str, rf, up_endpoint_count);
+            elogger.warn("Creating keyspace '{}' for Alternator with unsafe RF={} because cluster only has {} nodes.",
+                    keyspace_name_str, rf, endpoint_count);
        }
        auto opts = get_network_topology_options(rf);
        auto ksm = keyspace_metadata::new_keyspace(keyspace_name_str, "org.apache.cassandra.locator.NetworkTopologyStrategy", std::move(opts), true);
--- a/alternator/rmw_operation.hh
+++ b/alternator/rmw_operation.hh
@@ -83,7 +83,11 @@ protected:
    // When _returnvalues != NONE, apply() should store here, in JSON form,
    // the values which are to be returned in the "Attributes" field.
    // The default null JSON means do not return an Attributes field at all.
-    rjson::value _return_attributes;
+    // This field is marked "mutable" so that the const apply() can write
+    // to it (see explanation below). Because apply() may be called more
+    // than once, an apply() that ever sets this field must assign it on
+    // every call, even if only to the default empty value.
+    mutable rjson::value _return_attributes;
 public:
    // The constructor of a rmw_operation subclass should parse the request
    // and try to discover as many input errors as it can before really
@@ -96,7 +100,12 @@ public:
    // conditional expression, apply() should return an empty optional.
    // apply() may throw if it encounters input errors not discovered during
    // the constructor.
-    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) = 0;
+    // apply() may be called more than once in case of contention, so it must
+    // not change the state saved in the object (issue #7218 was caused by
+    // violating this). We mark apply() "const" to let the compiler validate
+    // this for us. The output-only field _return_attributes is marked
+    // "mutable" above so that apply() can still write to it.
+    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const = 0;
    // Convert the above apply() into the signature needed by cas_request:
    virtual std::optional<mutation> apply(query::result& qr, const query::partition_slice& slice, api::timestamp_type ts) override;
    virtual ~rmw_operation() = default;
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -54,26 +54,22 @@ static sstring validate_keyspace(http_context& ctx, const parameters& param) {
    throw bad_param_exception("Keyspace " + param["keyspace"] + " Does not exist");
 }

-static std::vector<ss::token_range> describe_ring(const sstring& keyspace) {
-    std::vector<ss::token_range> res;
-    for (auto d : service::get_local_storage_service().describe_ring(keyspace)) {
-        ss::token_range r;
-        r.start_token = d._start_token;
-        r.end_token = d._end_token;
-        r.endpoints = d._endpoints;
-        r.rpc_endpoints = d._rpc_endpoints;
-        for (auto det : d._endpoint_details) {
-            ss::endpoint_detail ed;
-            ed.host = det._host;
-            ed.datacenter = det._datacenter;
-            if (det._rack != "") {
-                ed.rack = det._rack;
-            }
-            r.endpoint_details.push(ed);
+static ss::token_range token_range_endpoints_to_json(const dht::token_range_endpoints& d) {
+    ss::token_range r; // JSON form of one ring range, filled field by field from d
+    r.start_token = d._start_token;
+    r.end_token = d._end_token;
+    r.endpoints = d._endpoints;
+    r.rpc_endpoints = d._rpc_endpoints;
+    for (const auto& det : d._endpoint_details) { // bind by reference: no per-endpoint copy
+        ss::endpoint_detail ed;
+        ed.host = det._host;
+        ed.datacenter = det._datacenter;
+        if (det._rack != "") { // an empty rack is left unset in the JSON object
+            ed.rack = det._rack;
         }
-        res.push_back(r);
+        r.endpoint_details.push(ed);
     }
-    return res;
+    return r;
 }

 using ks_cf_func = std::function<future<json::json_return_type>(http_context&, std::unique_ptr<request>, sstring, std::vector<sstring>)>;
@@ -175,13 +171,13 @@ void set_storage_service(http_context& ctx, routes& r) {
        return make_ready_future<json::json_return_type>(res);
    });

-    ss::describe_any_ring.set(r, [&ctx](const_req req) {
-        return describe_ring("");
+    ss::describe_any_ring.set(r, [&ctx](std::unique_ptr<request> req) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().describe_ring(""), token_range_endpoints_to_json));
    });

-    ss::describe_ring.set(r, [&ctx](const_req req) {
-        auto keyspace = validate_keyspace(ctx, req.param);
-        return describe_ring(keyspace);
+    ss::describe_ring.set(r, [&ctx](std::unique_ptr<request> req) {
+        auto keyspace = validate_keyspace(ctx, req->param);
+        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().describe_ring(keyspace), token_range_endpoints_to_json));
    });

    ss::get_host_id_map.set(r, [&ctx](const_req req) {
@@ -256,8 +252,8 @@ void set_storage_service(http_context& ctx, routes& r) {
                for (auto cf : column_families) {
                    column_families_vec.push_back(&db.find_column_family(keyspace, cf));
                }
-                return parallel_for_each(column_families_vec, [&cm] (column_family* cf) {
-                    return cm.perform_cleanup(cf);
+                return parallel_for_each(column_families_vec, [&cm, &db] (column_family* cf) {
+                    return cm.perform_cleanup(db, cf);
                });
            }).then([]{
                return make_ready_future<json::json_return_type>(0);
@@ -1000,6 +996,9 @@ void set_snapshot(http_context& ctx, routes& r) {
        if (column_family.empty()) {
            resp = service::get_local_storage_service().take_snapshot(tag, keynames);
        } else {
+            if (keynames.empty()) {
+                throw httpd::bad_param_exception("The keyspace of column families must be specified");
+            }
            if (keynames.size() > 1) {
                throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
            }
--- a/auth/role_manager.hh
+++ b/auth/role_manager.hh
@@ -33,6 +33,7 @@

 #include "auth/resource.hh"
 #include "seastarx.hh"
+#include "exceptions/exceptions.hh"

 namespace auth {

@@ -52,9 +53,9 @@ struct role_config_update final {
 ///
 /// A logical argument error for a role-management operation.
 ///
-class roles_argument_exception : public std::invalid_argument {
+class roles_argument_exception : public exceptions::invalid_request_exception {
 public:
-    using std::invalid_argument::invalid_argument;
+    using exceptions::invalid_request_exception::invalid_request_exception;
 };

 class role_already_exists : public roles_argument_exception {
--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -1135,7 +1135,7 @@ public:
                    if (r.row().deleted_at()) {
                        touched_parts.set<stats::part_type::ROW_DELETE>();
                        cdc_op = operation::row_delete;
-                        if (pirow) {
+                        if (pirow && pikey) {
                            for (const column_definition& column: _schema->regular_columns()) {
                                assert(pirow->has(column.name_as_text()));
                                auto& cdef = *_log_schema->get_column_definition(log_data_column_name_bytes(column.name()));
--- a/clocks-impl.cc
+++ b/clocks-impl.cc
@@ -30,10 +30,12 @@ std::atomic<int64_t> clocks_offset;

 std::ostream& operator<<(std::ostream& os, db_clock::time_point tp) {
    auto t = db_clock::to_time_t(tp);
-    return os << std::put_time(std::gmtime(&t), "%Y/%m/%d %T");
+    ::tm t_buf;
+    return os << std::put_time(::gmtime_r(&t, &t_buf), "%Y/%m/%d %T");
 }

 std::string format_timestamp(api::timestamp_type ts) {
    auto t = std::time_t(std::chrono::duration_cast<std::chrono::seconds>(api::timestamp_clock::duration(ts)).count());
-    return format("{}", std::put_time(std::gmtime(&t), "%Y/%m/%d %T"));
+    ::tm t_buf;
+    return format("{}", std::put_time(::gmtime_r(&t, &t_buf), "%Y/%m/%d %T"));
 }
--- a/configure.py
+++ b/configure.py
@@ -381,6 +381,7 @@ scylla_tests = set([
    'test/boost/view_schema_ckey_test',
    'test/boost/vint_serialization_test',
    'test/boost/virtual_reader_test',
+    'test/boost/stall_free_test',
    'test/manual/ec2_snitch_test',
    'test/manual/gce_snitch_test',
    'test/manual/gossip',
@@ -1265,9 +1266,9 @@ def query_seastar_flags(pc_file, link_static_cxx=False):
    return cflags, libs

 for mode in build_modes:
-    seastar_cflags, seastar_libs = query_seastar_flags(pc[mode], link_static_cxx=args.staticcxx)
-    modes[mode]['seastar_cflags'] = seastar_cflags
-    modes[mode]['seastar_libs'] = seastar_libs
+    seastar_pc_cflags, seastar_pc_libs = query_seastar_flags(pc[mode], link_static_cxx=args.staticcxx)
+    modes[mode]['seastar_cflags'] = seastar_pc_cflags
+    modes[mode]['seastar_libs'] = seastar_pc_libs

 # We need to use experimental features of the zstd library (to use our own allocators for the (de)compression context),
 # which are available only when the library is linked statically.
@@ -1288,6 +1289,46 @@ def configure_zstd(build_dir, mode):
    os.makedirs(zstd_build_dir, exist_ok=True)
    subprocess.check_call(zstd_cmd, shell=False, cwd=zstd_build_dir)

+def configure_abseil(build_dir, mode):
+    abseil_build_dir = os.path.join(build_dir, mode, 'abseil')
+
+    abseil_cflags = seastar_cflags + ' ' + modes[mode]['cxx_ld_flags']
+    cmake_mode = MODE_TO_CMAKE_BUILD_TYPE[mode]
+    abseil_cmake_args = [
+        '-DCMAKE_BUILD_TYPE={}'.format(cmake_mode),
+        '-DCMAKE_INSTALL_PREFIX={}'.format(build_dir + '/inst'), # just to avoid a warning from absl
+        '-DCMAKE_C_COMPILER={}'.format(args.cc),
+        '-DCMAKE_CXX_COMPILER={}'.format(args.cxx),
+        '-DCMAKE_CXX_FLAGS_{}={}'.format(cmake_mode.upper(), abseil_cflags),
+    ]
+
+    abseil_cmd = ['cmake', '-G', 'Ninja', os.path.relpath('abseil', abseil_build_dir)] + abseil_cmake_args
+
+    os.makedirs(abseil_build_dir, exist_ok=True)
+    subprocess.check_call(abseil_cmd, shell=False, cwd=abseil_build_dir)
+
+abseil_libs = ['absl/' + lib for lib in [
+    'container/libabsl_hashtablez_sampler.a',
+    'container/libabsl_raw_hash_set.a',
+    'synchronization/libabsl_synchronization.a',
+    'synchronization/libabsl_graphcycles_internal.a',
+    'debugging/libabsl_stacktrace.a',
+    'debugging/libabsl_symbolize.a',
+    'debugging/libabsl_debugging_internal.a',
+    'debugging/libabsl_demangle_internal.a',
+    'time/libabsl_time.a',
+    'time/libabsl_time_zone.a',
+    'numeric/libabsl_int128.a',
+    'hash/libabsl_city.a',
+    'hash/libabsl_hash.a',
+    'base/libabsl_malloc_internal.a',
+    'base/libabsl_spinlock_wait.a',
+    'base/libabsl_base.a',
+    'base/libabsl_dynamic_annotations.a',
+    'base/libabsl_raw_logging_internal.a',
+    'base/libabsl_exponential_biased.a',
+    'base/libabsl_throw_delegate.a']]
+
 args.user_cflags += " " + pkg_config('jsoncpp', '--cflags')
 args.user_cflags += ' -march=' + args.target
 libs = ' '.join([maybe_static(args.staticyamlcpp, '-lyaml-cpp'), '-latomic', '-llz4', '-lz', '-lsnappy', pkg_config('jsoncpp', '--libs'),
@@ -1316,6 +1357,7 @@ if any(filter(thrift_version.startswith, thrift_boost_versions)):
 for pkg in pkgs:
    args.user_cflags += ' ' + pkg_config(pkg, '--cflags')
    libs += ' ' + pkg_config(pkg, '--libs')
+args.user_cflags += ' -I abseil'  # leading space: user_cflags is a space-separated flag string
 user_cflags = args.user_cflags + ' -fvisibility=hidden'
 user_ldflags = args.user_ldflags + ' -fvisibility=hidden'
 if args.staticcxx:
@@ -1346,6 +1388,9 @@ else:
 for mode in build_modes:
    configure_zstd(outdir, mode)

+for mode in build_modes:
+    configure_abseil(outdir, mode)
+
 # configure.py may run automatically from an already-existing build.ninja.
 # If the user interrupts configure.py in the middle, we need build.ninja
 # to remain in a valid state.  So we write our output to a temporary
@@ -1480,6 +1525,8 @@ with open(buildfile_tmp, 'w') as f:
                objs.extend(['$builddir/' + mode + '/' + artifact for artifact in [
                    'libdeflate/libdeflate.a',
                    'zstd/lib/libzstd.a',
+                ] + [
+                    'abseil/' + x for x in abseil_libs
                ]])
                objs.append('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o')
                if binary in tests:
@@ -1621,6 +1668,12 @@ with open(buildfile_tmp, 'w') as f:
        f.write('  subdir = build/{mode}/zstd\n'.format(**locals()))
        f.write('  target = libzstd.a\n'.format(**locals()))

+        for lib in abseil_libs:
+            f.write('build build/{mode}/abseil/{lib}: ninja\n'.format(**locals()))
+            f.write('  pool = submodule_pool\n')
+            f.write('  subdir = build/{mode}/abseil\n'.format(**locals()))
+            f.write('  target = {lib}\n'.format(**locals()))
+
    mode = 'dev' if 'dev' in modes else modes[0]
    f.write('build checkheaders: phony || {}\n'.format(' '.join(['$builddir/{}/{}.o'.format(mode, hh) for hh in headers])))

--- a/cql3/functions/aggregate_fcts.cc
+++ b/cql3/functions/aggregate_fcts.cc
@@ -267,10 +267,13 @@ public:
    }
 };

-/// The same as `impl_max_function_for' but without knowledge of `Type'.
+/// The same as `impl_max_function_for' but without compile-time dependency on `Type'.
 class impl_max_dynamic_function final : public aggregate_function::aggregate {
+    data_type _io_type;
    opt_bytes _max;
 public:
+    impl_max_dynamic_function(data_type io_type) : _io_type(std::move(io_type)) {}
+
    virtual void reset() override {
        _max = {};
    }
@@ -278,12 +281,11 @@ public:
        return _max.value_or(bytes{});
    }
    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
-        if (!values[0]) {
+        if (values.empty() || !values[0]) {
            return;
        }
-        const auto val = *values[0];
-        if (!_max || *_max < val) {
-            _max = val;
+        if (!_max || _io_type->less(*_max, *values[0])) {
+            _max = values[0];
        }
    }
 };
@@ -298,10 +300,13 @@ public:
 };

 class max_dynamic_function final : public native_aggregate_function {
+    data_type _io_type;
 public:
-    max_dynamic_function(data_type io_type) : native_aggregate_function("max", io_type, { io_type }) {}
+    max_dynamic_function(data_type io_type)
+            : native_aggregate_function("max", io_type, { io_type })
+            , _io_type(std::move(io_type)) {}
    virtual std::unique_ptr<aggregate> new_aggregate() override {
-        return std::make_unique<impl_max_dynamic_function>();
+        return std::make_unique<impl_max_dynamic_function>(_io_type);
    }
 };

@@ -358,10 +363,13 @@ public:
    }
 };

-/// The same as `impl_min_function_for' but without knowledge of `Type'.
+/// The same as `impl_min_function_for' but without compile-time dependency on `Type'.
 class impl_min_dynamic_function final : public aggregate_function::aggregate {
+    data_type _io_type;
    opt_bytes _min;
 public:
+    impl_min_dynamic_function(data_type io_type) : _io_type(std::move(io_type)) {}
+
    virtual void reset() override {
        _min = {};
    }
@@ -369,12 +377,11 @@ public:
        return _min.value_or(bytes{});
    }
    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
-        if (!values[0]) {
+        if (values.empty() || !values[0]) {
            return;
        }
-        const auto val = *values[0];
-        if (!_min || val < *_min) {
-            _min = val;
+        if (!_min || _io_type->less(*values[0], *_min)) {
+            _min = values[0];
        }
    }
 };
@@ -389,10 +396,13 @@ public:
 };

 class min_dynamic_function final : public native_aggregate_function {
+    data_type _io_type;
 public:
-    min_dynamic_function(data_type io_type) : native_aggregate_function("min", io_type, { io_type }) {}
+    min_dynamic_function(data_type io_type)
+            : native_aggregate_function("min", io_type, { io_type })
+            , _io_type(std::move(io_type)) {}
    virtual std::unique_ptr<aggregate> new_aggregate() override {
-        return std::make_unique<impl_min_dynamic_function>();
+        return std::make_unique<impl_min_dynamic_function>(_io_type);
    }
 };

--- a/cql3/functions/castas_fcts.cc
+++ b/cql3/functions/castas_fcts.cc
@@ -87,17 +87,14 @@ template<typename ToType>
 std::function<data_value(data_value)> make_castas_fctn_from_decimal_to_float() {
    return [](data_value from) -> data_value {
        auto val_from = value_cast<big_decimal>(from);
-        boost::multiprecision::cpp_int ten(10);
-        boost::multiprecision::cpp_rational r = val_from.unscaled_value();
-        r /= boost::multiprecision::pow(ten, val_from.scale());
-        return static_cast<ToType>(r);
+        return static_cast<ToType>(val_from.as_rational());
    };
 }

 static utils::multiprecision_int from_decimal_to_cppint(const data_value& from) {
    const auto& val_from = value_cast<big_decimal>(from);
-    boost::multiprecision::cpp_int ten(10);
-    return boost::multiprecision::cpp_int(val_from.unscaled_value() / boost::multiprecision::pow(ten, val_from.scale()));
+    auto r = val_from.as_rational();
+    return utils::multiprecision_int(numerator(r)/denominator(r));
 }

 template<typename ToType>
--- a/cql3/lists.cc
+++ b/cql3/lists.cc
@@ -357,7 +357,12 @@ lists::setter_by_uuid::execute(mutation& m, const clustering_key_prefix& prefix,

    collection_mutation_description mut;
    mut.cells.reserve(1);
-    mut.cells.emplace_back(to_bytes(*index), params.make_cell(*ltype->value_comparator(), *value, atomic_cell::collection_member::yes));
+
+    if (!value) {
+        mut.cells.emplace_back(to_bytes(*index), params.make_dead_cell());
+    } else {
+        mut.cells.emplace_back(to_bytes(*index), params.make_cell(*ltype->value_comparator(), *value, atomic_cell::collection_member::yes));
+    }

    m.set_cell(prefix, column, mut.serialize(*ltype));
 }
--- a/cql3/relation.cc
+++ b/cql3/relation.cc
@@ -49,7 +49,7 @@ relation::to_column_definition(const schema& schema, const column_identifier::ra
    auto id = entity.prepare_column_identifier(schema);
    auto def = get_column_definition(schema, *id);
    if (!def || def->is_hidden_from_cql()) {
-        throw exceptions::unrecognized_entity_exception(id, shared_from_this());
+        throw exceptions::unrecognized_entity_exception(*id, to_string());
    }
    return *def;
 }
--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -417,7 +417,7 @@ std::vector<const column_definition*> statement_restrictions::get_column_defs_fo
                    _clustering_columns_restrictions->num_prefix_columns_that_need_not_be_filtered();
            for (auto&& cdef : _clustering_columns_restrictions->get_column_defs()) {
                ::shared_ptr<single_column_restriction> restr;
-                if (single_pk_restrs) {
+                if (single_ck_restrs) {
                    auto it = single_ck_restrs->restrictions().find(cdef);
                    if (it != single_ck_restrs->restrictions().end()) {
                        restr = dynamic_pointer_cast<single_column_restriction>(it->second);
@@ -624,9 +624,6 @@ bool single_column_restriction::EQ::is_satisfied_by(const schema& schema,
        const row& cells,
        const query_options& options,
        gc_clock::time_point now) const {
-    if (_column_def.type->is_counter()) {
-        fail(unimplemented::cause::COUNTERS);
-    }
    auto operand = value(options);
    if (operand) {
        auto cell_value = get_value(schema, key, ckey, cells, now);
@@ -641,9 +638,6 @@ bool single_column_restriction::EQ::is_satisfied_by(const schema& schema,
 }

 bool single_column_restriction::EQ::is_satisfied_by(bytes_view data, const query_options& options) const {
-    if (_column_def.type->is_counter()) {
-        fail(unimplemented::cause::COUNTERS);
-    }
    auto operand = value(options);
    return operand && _column_def.type->compare(*operand, data) == 0;
 }
@@ -654,9 +648,6 @@ bool single_column_restriction::IN::is_satisfied_by(const schema& schema,
        const row& cells,
        const query_options& options,
        gc_clock::time_point now) const {
-    if (_column_def.type->is_counter()) {
-        fail(unimplemented::cause::COUNTERS);
-    }
    auto cell_value = get_value(schema, key, ckey, cells, now);
    if (!cell_value) {
        return false;
@@ -670,9 +661,6 @@ bool single_column_restriction::IN::is_satisfied_by(const schema& schema,
 }

 bool single_column_restriction::IN::is_satisfied_by(bytes_view data, const query_options& options) const {
-    if (_column_def.type->is_counter()) {
-        fail(unimplemented::cause::COUNTERS);
-    }
    auto operands = values(options);
    return boost::algorithm::any_of(operands, [this, &data] (const bytes_opt& operand) {
        return operand && _column_def.type->compare(*operand, data) == 0;
@@ -697,6 +685,11 @@ static query::range<bytes_view> to_range(const term_slice& slice, const query_op
        extract_bound(statements::bound::END));
 }

+static bool contains_without_wraparound(
+        const query::range<bytes_view>& range, bytes_view value, const serialized_tri_compare& cmp) {
+    return !range.is_wrap_around(cmp) && range.contains(value, cmp); // a wrap-around range matches nothing
+}
+
 bool single_column_restriction::slice::is_satisfied_by(const schema& schema,
        const partition_key& key,
        const clustering_key_prefix& ckey,
@@ -711,15 +704,14 @@ bool single_column_restriction::slice::is_satisfied_by(const schema& schema,
        return false;
    }
    return cell_value->with_linearized([&] (bytes_view cell_value_bv) {
-        return to_range(_slice, options).contains(cell_value_bv, _column_def.type->as_tri_comparator());
+        return contains_without_wraparound(to_range(_slice, options),
+                cell_value_bv, _column_def.type->as_tri_comparator());
    });
 }

 bool single_column_restriction::slice::is_satisfied_by(bytes_view data, const query_options& options) const {
-    if (_column_def.type->is_counter()) {
-        fail(unimplemented::cause::COUNTERS);
-    }
-    return to_range(_slice, options).contains(data, _column_def.type->underlying_type()->as_tri_comparator());
+    return contains_without_wraparound(to_range(_slice, options),
+            data, _column_def.type->underlying_type()->as_tri_comparator());
 }

 bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
@@ -728,9 +720,6 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
        const row& cells,
        const query_options& options,
        gc_clock::time_point now) const {
-    if (_column_def.type->is_counter()) {
-        fail(unimplemented::cause::COUNTERS);
-    }
    if (!_column_def.type->is_collection()) {
        return false;
    }
--- a/cql3/statements/alter_table_statement.cc
+++ b/cql3/statements/alter_table_statement.cc
@@ -207,6 +207,9 @@ void alter_table_statement::add_column(const schema& schema, const table& cf, sc
                "because a collection with the same name and a different type has already been used in the past", column_name));
        }
    }
+    if (type->is_counter() && !schema.is_counter()) {
+        throw exceptions::configuration_exception(format("Cannot add a counter column ({}) in a non counter column family", column_name));
+    }

    cfm.with_column(column_name.name(), type, is_static ? column_kind::static_column : column_kind::regular_column);

@@ -222,7 +225,7 @@ void alter_table_statement::add_column(const schema& schema, const table& cf, sc
            schema_builder builder(view);
            if (view->view_info()->include_all_columns()) {
                builder.with_column(column_name.name(), type);
-            } else if (view->view_info()->base_non_pk_columns_in_view_pk().empty()) {
+            } else if (!view->view_info()->has_base_non_pk_columns_in_view_pk()) {
                db::view::create_virtual_column(builder, column_name.name(), type);
            }
            view_updates.push_back(view_ptr(builder.build()));
--- a/cql3/statements/batch_statement.cc
+++ b/cql3/statements/batch_statement.cc
@@ -68,6 +68,7 @@ batch_statement::batch_statement(int bound_terms, type type_,
    , _has_conditions(boost::algorithm::any_of(_statements, [] (auto&& s) { return s.statement->has_conditions(); }))
    , _stats(stats)
 {
+    validate();
    if (has_conditions()) {
        // A batch can be created not only by raw::batch_statement::prepare, but also by
        // cql_server::connection::process_batch, which doesn't call any methods of
@@ -448,7 +449,6 @@ batch_statement::prepare(database& db, cql_stats& stats) {
    prep_attrs->collect_marker_specification(bound_names);

    cql3::statements::batch_statement batch_statement_(bound_names.size(), _type, std::move(statements), std::move(prep_attrs), stats);
-    batch_statement_.validate();

    std::vector<uint16_t> partition_key_bind_indices;
    if (!have_multiple_cfs && batch_statement_.get_statements().size() > 0) {
--- a/cql3/statements/cf_prop_defs.cc
+++ b/cql3/statements/cf_prop_defs.cc
@@ -255,7 +255,9 @@ void cf_prop_defs::apply_to_builder(schema_builder& builder, schema::extensions_
        }
    }

-    builder.set_default_time_to_live(gc_clock::duration(get_int(KW_DEFAULT_TIME_TO_LIVE, DEFAULT_DEFAULT_TIME_TO_LIVE)));
+    if (has_property(KW_DEFAULT_TIME_TO_LIVE)) {
+        builder.set_default_time_to_live(gc_clock::duration(get_int(KW_DEFAULT_TIME_TO_LIVE, DEFAULT_DEFAULT_TIME_TO_LIVE)));
+    }

    if (has_property(KW_SPECULATIVE_RETRY)) {
        builder.set_speculative_retry(get_string(KW_SPECULATIVE_RETRY, builder.get_speculative_retry().to_sstring()));
--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -434,6 +434,12 @@ GCC6_CONCEPT(
 static KeyType
 generate_base_key_from_index_pk(const partition_key& index_pk, const std::optional<clustering_key>& index_ck, const schema& base_schema, const schema& view_schema) {
    const auto& base_columns = std::is_same_v<KeyType, partition_key> ? base_schema.partition_key_columns() : base_schema.clustering_key_columns();
+
+    // An empty key in the index paging state translates to an empty base key
+    if (index_pk.is_empty() && !index_ck) {
+        return KeyType::make_empty();
+    }
+
    std::vector<bytes_view> exploded_base_key;
    exploded_base_key.reserve(base_columns.size());

@@ -507,8 +513,7 @@ indexed_table_select_statement::do_execute_base_query(
            if (old_paging_state && concurrency == 1) {
                auto base_pk = generate_base_key_from_index_pk<partition_key>(old_paging_state->get_partition_key(),
                        old_paging_state->get_clustering_key(), *_schema, *_view_schema);
-                if (_schema->clustering_key_size() > 0) {
-                    assert(old_paging_state->get_clustering_key().has_value());
+                if (old_paging_state->get_clustering_key() && _schema->clustering_key_size() > 0) {
                    auto base_ck = generate_base_key_from_index_pk<clustering_key>(old_paging_state->get_partition_key(),
                            old_paging_state->get_clustering_key(), *_schema, *_view_schema);
                    command->slice.set_range(*_schema, base_pk,
@@ -1362,8 +1367,8 @@ select_statement::prepare_restrictions(database& db,
        return ::make_shared<restrictions::statement_restrictions>(db, schema, statement_type::SELECT, std::move(_where_clause), bound_names,
            selection->contains_only_static_columns(), selection->contains_a_collection(), for_view, allow_filtering);
    } catch (const exceptions::unrecognized_entity_exception& e) {
-        if (contains_alias(*e.entity)) {
-            throw exceptions::invalid_request_exception(format("Aliases aren't allowed in the where clause ('{}')", e.relation->to_string()));
+        if (contains_alias(e.entity)) {
+            throw exceptions::invalid_request_exception(format("Aliases aren't allowed in the where clause ('{}')", e.relation_str));
        }
        throw;
    }
--- a/database.cc
+++ b/database.cc
@@ -1323,7 +1323,7 @@ future<mutation> database::do_apply_counter_update(column_family& cf, const froz
            // counter state for each modified cell...

            tracing::trace(trace_state, "Reading counter values from the CF");
-            return counter_write_query(m_schema, cf.as_mutation_source(), m.decorated_key(), slice, trace_state)
+            return counter_write_query(m_schema, cf.as_mutation_source(), m.decorated_key(), slice, trace_state, timeout)
                    .then([this, &cf, &m, m_schema, timeout, trace_state] (auto mopt) {
                // ...now, that we got existing state of all affected counter
                // cells we can look for our shard in each of them, increment
@@ -1827,7 +1827,11 @@ future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_fun
                            // TODO: indexes.
                            // Note: since discard_sstables was changed to only count tables owned by this shard,
                            // we can get zero rp back. Changed assert, and ensure we save at least low_mark.
-                            assert(low_mark <= rp || rp == db::replay_position());
+                            // #6995 - the assert below was broken in c2c6c71 and remained so for many years.
+                            // We nowadays do not flush tables with sstables but autosnapshot=false. This means
+                            // the low_mark assertion does not hold in that case, because we probably never got
+                            // around to creating the sstables that would make it hold.
+                            assert(!should_flush || low_mark <= rp || rp == db::replay_position());
                            rp = std::max(low_mark, rp);
                            return truncate_views(cf, truncated_at, should_flush).then([&cf, truncated_at, rp] {
                                // save_truncation_record() may actually fail after we cached the truncation time
@@ -2005,9 +2009,10 @@ flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db,
            reader_concurrency_semaphore* semaphore;
        };
        distributed<database>& _db;
+        utils::UUID _table_id;
        std::vector<reader_context> _contexts;
    public:
-        explicit streaming_reader_lifecycle_policy(distributed<database>& db) : _db(db), _contexts(smp::count) {
+        streaming_reader_lifecycle_policy(distributed<database>& db, utils::UUID table_id) : _db(db), _table_id(table_id), _contexts(smp::count) {
        }
        virtual flat_mutation_reader create_reader(
                schema_ptr schema,
@@ -2036,7 +2041,12 @@ flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db,
            });
        }
         virtual reader_concurrency_semaphore& semaphore() override {
-            return *_contexts[engine().cpu_id()].semaphore;
+            const auto shard = engine().cpu_id();
+            if (!_contexts[shard].semaphore) { // not set yet for this shard: fetch it from the table
+                auto& cf = _db.local().find_column_family(_table_id);
+                _contexts[shard].semaphore = &cf.streaming_read_concurrency_semaphore();
+            }
+            return *_contexts[shard].semaphore;
         }
    };
    auto ms = mutation_source([&db] (schema_ptr s,
@@ -2047,7 +2057,8 @@ flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db,
            tracing::trace_state_ptr trace_state,
            streamed_mutation::forwarding,
            mutation_reader::forwarding fwd_mr) {
-        return make_multishard_combining_reader(make_shared<streaming_reader_lifecycle_policy>(db), std::move(s), pr, ps, pc,
+        auto table_id = s->id();
+        return make_multishard_combining_reader(make_shared<streaming_reader_lifecycle_policy>(db, table_id), std::move(s), pr, ps, pc,
                std::move(trace_state), fwd_mr);
    });
    auto&& full_slice = schema->full_slice();
--- a/database.hh
+++ b/database.hh
@@ -55,6 +55,7 @@
 #include <limits>
 #include <cstddef>
 #include "schema_fwd.hh"
+#include "db/view/view.hh"
 #include "db/schema_features.hh"
 #include "gms/feature.hh"
 #include "timestamp.hh"
@@ -885,7 +886,7 @@ public:
    lw_shared_ptr<const sstable_list> get_sstables_including_compacted_undeleted() const;
    const std::vector<sstables::shared_sstable>& compacted_undeleted_sstables() const;
    std::vector<sstables::shared_sstable> select_sstables(const dht::partition_range& range) const;
-    std::vector<sstables::shared_sstable> candidates_for_compaction() const;
+    std::vector<sstables::shared_sstable> non_staging_sstables() const;
    std::vector<sstables::shared_sstable> sstables_need_rewrite() const;
    size_t sstables_count() const;
    std::vector<uint64_t> sstable_count_per_level() const;
@@ -981,8 +982,9 @@ public:
        return *_config.sstables_manager;
    }

+    // Reader's schema must be the same as the base schema of each of the views.
    future<> populate_views(
-            std::vector<view_ptr>,
+            std::vector<db::view::view_and_base>,
            dht::token base_token,
            flat_mutation_reader&&);

@@ -998,7 +1000,7 @@ private:
    future<row_locker::lock_holder> do_push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, mutation_source&& source, const io_priority_class& io_priority) const;
    std::vector<view_ptr> affected_views(const schema_ptr& base, const mutation& update) const;
    future<> generate_and_propagate_view_updates(const schema_ptr& base,
-            std::vector<view_ptr>&& views,
+            std::vector<db::view::view_and_base>&& views,
            mutation&& m,
            flat_mutation_reader_opt existings) const;

--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -520,7 +520,7 @@ public:
            _segment_manager->totals.total_size_on_disk -= size_on_disk();
            _segment_manager->totals.total_size -= (size_on_disk() + _buffer.size_bytes());
            _segment_manager->add_file_to_delete(_file_name, _desc);
-        } else {
+        } else if (_segment_manager->cfg.warn_about_segments_left_on_disk_after_shutdown) {
            clogger.warn("Segment {} is dirty and is left on disk.", *this);
        }
    }
@@ -614,11 +614,17 @@ public:
    future<sseg_ptr> terminate() {
        assert(_closed);
        if (!std::exchange(_terminated, true)) {
-            clogger.trace("{} is closed but not terminated.", *this);
-            if (_buffer.empty()) {
-                new_buffer(0);
+            // Write a terminating zero block iff we are ending (a reused)
+            // block before the actual end of the file.
+            // We should only get here when all actual data is
+            // already flushed (see below, close()).
+            if (size_on_disk() < _segment_manager->max_size) {
+                clogger.trace("{} is closed but not terminated.", *this);
+                if (_buffer.empty()) {
+                    new_buffer(0);
+                }
+                return cycle(true, true);
            }
-            return cycle(true, true);
        }
        return make_ready_future<sseg_ptr>(shared_from_this());
    }
@@ -1287,7 +1293,7 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
        auto fut = open_file_dma(filename, flags, opt);
        if (cfg.extensions && !cfg.extensions->commitlog_file_extensions().empty()) {
            for (auto * ext : cfg.extensions->commitlog_file_extensions()) {
-                fut = fut.then([ext, filename, flags](file f) {
+                fut = close_on_failure(std::move(fut), [ext, filename, flags](file f) {
                   return ext->wrap_file(filename, f, flags).then([f](file nf) mutable {
                       return nf ? nf : std::move(f);
                   });
@@ -2127,8 +2133,9 @@ db::commitlog::read_log_file(const sstring& filename, const sstring& pfx, seasta
        }).handle_exception([w](auto ep) {
            w->s.set_exception(ep);
        });
-
-        return ret.done();
+        // #6265 - must keep subscription alive.
+        auto res = ret.done();
+        return res.finally([ret = std::move(ret)] {});
    });
 }

--- a/db/commitlog/commitlog.hh
+++ b/db/commitlog/commitlog.hh
@@ -137,6 +137,7 @@ public:

        bool reuse_segments = true;
        bool use_o_dsync = false;
+        bool warn_about_segments_left_on_disk_after_shutdown = true;

        const db::extensions * extensions = nullptr;
    };
--- a/db/commitlog/commitlog_replayer.cc
+++ b/db/commitlog/commitlog_replayer.cc
@@ -299,7 +299,7 @@ future<> db::commitlog_replayer::impl::process(stats* s, commitlog::buffer_and_r
                mutation m(cf.schema(), fm.decorated_key(*cf.schema()));
                converting_mutation_partition_applier v(cm, *cf.schema(), m.partition());
                fm.partition().accept(cm, v);
-                return do_with(std::move(m), [&db, &cf] (mutation m) {
+                return do_with(std::move(m), [&db, &cf] (const mutation& m) {
                    return db.apply_in_memory(m, cf, db::rp_handle(), db::no_timeout);
                });
            } else {
--- a/db/config.cc
+++ b/db/config.cc
@@ -681,7 +681,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , replace_address(this, "replace_address", value_status::Used, "", "The listen_address or broadcast_address of the dead node to replace. Same as -Dcassandra.replace_address.")
    , replace_address_first_boot(this, "replace_address_first_boot", value_status::Used, "", "Like replace_address option, but if the node has been bootstrapped successfully it will be ignored. Same as -Dcassandra.replace_address_first_boot.")
    , override_decommission(this, "override_decommission", value_status::Used, false, "Set true to force a decommissioned node to join the cluster")
-    , enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, true, "Set true to use enable repair based node operations instead of streaming based")
+    , enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, false, "Set true to use enable repair based node operations instead of streaming based")
    , ring_delay_ms(this, "ring_delay_ms", value_status::Used, 30 * 1000, "Time a node waits to hear from other nodes before joining the ring in milliseconds. Same as -Dcassandra.ring_delay_ms in cassandra.")
    , shadow_round_ms(this, "shadow_round_ms", value_status::Used, 300 * 1000, "The maximum gossip shadow round time. Can be used to reduce the gossip feature check time during node boot up.")
    , fd_max_interval_ms(this, "fd_max_interval_ms", value_status::Used, 2 * 1000, "The maximum failure_detector interval time in milliseconds. Interval larger than the maximum will be ignored. Larger cluster may need to increase the default.")
@@ -689,6 +689,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
    , shutdown_announce_in_ms(this, "shutdown_announce_in_ms", value_status::Used, 2 * 1000, "Time a node waits after sending gossip shutdown message in milliseconds. Same as -Dcassandra.shutdown_announce_in_ms in cassandra.")
    , developer_mode(this, "developer_mode", value_status::Used, false, "Relax environment checks. Setting to true can reduce performance and reliability significantly.")
    , skip_wait_for_gossip_to_settle(this, "skip_wait_for_gossip_to_settle", value_status::Used, -1, "An integer to configure the wait for gossip to settle. -1: wait normally, 0: do not wait at all, n: wait for at most n polls. Same as -Dcassandra.skip_wait_for_gossip_to_settle in cassandra.")
+    , force_gossip_generation(this, "force_gossip_generation", liveness::LiveUpdate, value_status::Used, -1 , "Force gossip to use the generation number provided by user")
    , experimental(this, "experimental", value_status::Used, false, "Set to true to unlock all experimental features.")
    , experimental_features(this, "experimental_features", value_status::Used, {}, "Unlock experimental features provided as the option arguments (possible values: 'lwt', 'cdc', 'udf'). Can be repeated.")
    , lsa_reclamation_step(this, "lsa_reclamation_step", value_status::Used, 1, "Minimum number of segments to reclaim in a single step")
@@ -859,7 +860,7 @@ db::fs::path db::config::get_conf_sub(db::fs::path sub) {
 }

 bool db::config::check_experimental(experimental_features_t::feature f) const {
-    if (experimental()) {
+    if (experimental() && f != experimental_features_t::UNUSED) {
        return true;
    }
    const auto& optval = experimental_features();
@@ -911,11 +912,13 @@ const db::extensions& db::config::extensions() const {
 std::unordered_map<sstring, db::experimental_features_t::feature> db::experimental_features_t::map() {
    // We decided against using the construct-on-first-use idiom here:
    // https://github.com/scylladb/scylla/pull/5369#discussion_r353614807
-    return {{"lwt", LWT}, {"udf", UDF}, {"cdc", CDC}};
+    // Lightweight transactions are no longer experimental. Map them
+    // to UNUSED switch for a while, then remove altogether.
+    return {{"lwt", UNUSED}, {"udf", UDF}, {"cdc", CDC}};
 }

 std::vector<enum_option<db::experimental_features_t>> db::experimental_features_t::all() {
-    return {LWT, UDF, CDC};
+    return {UDF, CDC};
 }

 template struct utils::config_file::named_value<seastar::log_level>;
--- a/db/config.hh
+++ b/db/config.hh
@@ -81,7 +81,7 @@ namespace db {

 /// Enumeration of all valid values for the `experimental` config entry.
 struct experimental_features_t {
-    enum feature { LWT, UDF, CDC };
+    enum feature { UNUSED, UDF, CDC };
    static std::unordered_map<sstring, feature> map(); // See enum_option.
    static std::vector<enum_option<experimental_features_t>> all();
 };
@@ -278,6 +278,7 @@ public:
    named_value<uint32_t> shutdown_announce_in_ms;
    named_value<bool> developer_mode;
    named_value<int32_t> skip_wait_for_gossip_to_settle;
+    named_value<int32_t> force_gossip_generation;
    named_value<bool> experimental;
    named_value<std::vector<enum_option<experimental_features_t>>> experimental_features;
    named_value<size_t> lsa_reclamation_step;
--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -224,7 +224,9 @@ future<> manager::end_point_hints_manager::stop(drain should_drain) noexcept {
        with_lock(file_update_mutex(), [this] {
            if (_hints_store_anchor) {
                hints_store_ptr tmp = std::exchange(_hints_store_anchor, nullptr);
-                return tmp->shutdown().finally([tmp] {});
+                return tmp->shutdown().finally([tmp] {
+                    return tmp->release();
+                }).finally([tmp] {});
            }
            return make_ready_future<>();
        }).handle_exception([&eptr] (auto e) { eptr = std::move(e); }).get();
@@ -326,6 +328,10 @@ future<db::commitlog> manager::end_point_hints_manager::add_store() noexcept {
            // HH doesn't utilize the flow that benefits from reusing segments.
            // Therefore let's simply disable it to avoid any possible confusion.
            cfg.reuse_segments = false;
+            // HH leaves segments on disk after commitlog shutdown, and later reads
+            // them when commitlog is re-created. This is expected to happen regularly
+            // during standard HH workload, so no need to print a warning about it.
+            cfg.warn_about_segments_left_on_disk_after_shutdown = false;

            return commitlog::create_commitlog(std::move(cfg)).then([this] (commitlog l) {
                // add_store() is triggered every time hint files are forcefully flushed to I/O (every hints_flush_period).
@@ -352,7 +358,9 @@ future<> manager::end_point_hints_manager::flush_current_hints() noexcept {
        return futurize_apply([this] {
            return with_lock(file_update_mutex(), [this]() -> future<> {
                return get_or_load().then([] (hints_store_ptr cptr) {
-                    return cptr->shutdown();
+                    return cptr->shutdown().finally([cptr] {
+                        return cptr->release();
+                    }).finally([cptr] {});
                }).then([this] {
                    // Un-hold the commitlog object. Since we are under the exclusive _file_update_mutex lock there are no
                    // other hints_store_ptr copies and this would destroy the commitlog shared value.
@@ -703,6 +711,7 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
                // Files are aggregated for at most manager::hints_timer_period therefore the oldest hint there is
                // (last_modification - manager::hints_timer_period) old.
                if (gc_clock::now().time_since_epoch() - secs_since_file_mod > gc_grace_sec - manager::hints_flush_period) {
+                    ctx_ptr->rps_set.erase(rp);
                    return make_ready_future<>();
                }

@@ -725,6 +734,7 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
                manager_logger.debug("send_hints(): {} at {}: {}", fname, rp, e.what());
                ++this->shard_stats().discarded;
            }
+            ctx_ptr->rps_set.erase(rp);
            return make_ready_future<>();
        }).finally([units = std::move(units), ctx_ptr] {});
    }).handle_exception([this, ctx_ptr] (auto eptr) {
--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -822,6 +822,14 @@ future<> merge_schema(distributed<service::storage_proxy>& proxy, gms::feature_s
    });
 }

+future<> recalculate_schema_version(distributed<service::storage_proxy>& proxy, gms::feature_service& feat) {
+    return merge_lock().then([&proxy, &feat] {
+        return update_schema_version_and_announce(proxy, feat.cluster_schema_features());
+    }).finally([] {
+        return merge_unlock();
+    });
+}
+
 future<> merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations, bool do_flush)
 {
    return merge_lock().then([&proxy, mutations = std::move(mutations), do_flush] () mutable {
--- a/db/schema_tables.hh
+++ b/db/schema_tables.hh
@@ -170,6 +170,13 @@ future<> merge_schema(distributed<service::storage_proxy>& proxy, gms::feature_s

 future<> merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations, bool do_flush);

+// Recalculates the local schema version and publishes it in gossip.
+//
+// It is safe to call concurrently with recalculate_schema_version() and merge_schema() in which case it
+// is guaranteed that the schema version we end up with after all calls will reflect the most recent state
+// of feature_service and schema tables.
+future<> recalculate_schema_version(distributed<service::storage_proxy>& proxy, gms::feature_service& feat);
+
 future<std::set<sstring>> merge_keyspaces(distributed<service::storage_proxy>& proxy, schema_result&& before, schema_result&& after);

 std::vector<mutation> make_create_keyspace_mutations(lw_shared_ptr<keyspace_metadata> keyspace, api::timestamp_type timestamp, bool with_tables_and_types_and_functions = true);
--- a/db/system_keyspace.cc
+++ b/db/system_keyspace.cc
@@ -187,7 +187,7 @@ schema_ptr batchlog() {
        {{"cf_id", uuid_type}},
        // regular columns
        {
-            {"in_progress_ballot", timeuuid_type},
+            {"promise", timeuuid_type},
            {"most_recent_commit", bytes_type}, // serialization format is defined by frozen_mutation idl
            {"most_recent_commit_at", timeuuid_type},
            {"proposal", bytes_type}, // serialization format is defined by frozen_mutation idl
@@ -203,6 +203,7 @@ schema_ptr batchlog() {
        // operations on resulting CFMetaData:
        //    .compactionStrategyClass(LeveledCompactionStrategy.class);
       )));
+       builder.set_gc_grace_seconds(0);
       builder.with_version(generate_schema_version(builder.uuid()));
       builder.set_wait_for_sync_to_commitlog(true);
       return builder.build(schema_builder::compact_storage::no);
@@ -226,6 +227,7 @@ schema_ptr built_indexes() {
        // comment
        "built column indexes"
       )));
+       builder.set_gc_grace_seconds(0);
       builder.with_version(generate_schema_version(builder.uuid()));
       return builder.build(schema_builder::compact_storage::yes);
    }();
@@ -272,6 +274,7 @@ schema_ptr built_indexes() {
        // comment
        "information about the local node"
       )));
+       builder.set_gc_grace_seconds(0);
       builder.with_version(generate_schema_version(builder.uuid()));
       builder.remove_column("scylla_cpu_sharding_algorithm");
       builder.remove_column("scylla_nr_shards");
@@ -307,6 +310,7 @@ schema_ptr built_indexes() {
        // comment
        "information about known peers in the cluster"
       )));
+       builder.set_gc_grace_seconds(0);
       builder.with_version(generate_schema_version(builder.uuid()));
       return builder.build(schema_builder::compact_storage::no);
    }();
@@ -331,6 +335,7 @@ schema_ptr built_indexes() {
        // comment
        "events related to peers"
       )));
+       builder.set_gc_grace_seconds(0);
       builder.with_version(generate_schema_version(builder.uuid()));
       return builder.build(schema_builder::compact_storage::no);
    }();
@@ -353,6 +358,7 @@ schema_ptr built_indexes() {
        // comment
        "ranges requested for transfer"
       )));
+       builder.set_gc_grace_seconds(0);
       builder.with_version(generate_schema_version(builder.uuid()));
       return builder.build(schema_builder::compact_storage::no);
    }();
@@ -490,6 +496,7 @@ schema_ptr size_estimates() {
        // comment
        "partitions larger than specified threshold"
        )));
+        builder.set_gc_grace_seconds(0);
        builder.with_version(generate_schema_version(builder.uuid()));
        return builder.build(schema_builder::compact_storage::no);
    }();
@@ -510,6 +517,7 @@ static schema_ptr large_rows() {
                .with_column("compaction_time", timestamp_type)
                .set_comment("rows larger than specified threshold")
                .with_version(generate_schema_version(id))
+                .set_gc_grace_seconds(0)
                .build();
    }();
    return large_rows;
@@ -530,6 +538,7 @@ static schema_ptr large_cells() {
                .with_column("compaction_time", timestamp_type)
                .set_comment("cells larger than specified threshold")
                .with_version(generate_schema_version(id))
+                .set_gc_grace_seconds(0)
                .build();
    }();
    return large_cells;
@@ -553,6 +562,7 @@ static schema_ptr large_cells() {
        // comment
        "Scylla specific information about the local node"
       )));
+       builder.set_gc_grace_seconds(0);
       builder.with_version(generate_schema_version(builder.uuid()));
       return builder.build(schema_builder::compact_storage::no);
    }();
@@ -666,6 +676,7 @@ schema_ptr local() {
        // comment
        "information about the local node"
       )));
+       builder.set_gc_grace_seconds(0);
       builder.with_version(generate_schema_version(builder.uuid()));
       return builder.build(schema_builder::compact_storage::no);
    }();
@@ -693,6 +704,7 @@ schema_ptr truncated() {
        // comment
        "information about table truncation"
       )));
+       builder.set_gc_grace_seconds(0);
       builder.with_version(generate_schema_version(builder.uuid()));
       return builder.build(schema_builder::compact_storage::no);
    }();
@@ -755,6 +767,7 @@ schema_ptr available_ranges() {
        // comment
        "available keyspace/ranges during bootstrap/replace that are ready to be served"
       )));
+       builder.set_gc_grace_seconds(0);
       builder.with_version(generate_schema_version(builder.uuid()));
       return builder.build();
    }();
@@ -777,6 +790,7 @@ schema_ptr views_builds_in_progress() {
        // comment
        "views builds current progress"
       )));
+       builder.set_gc_grace_seconds(0);
       builder.with_version(generate_schema_version(builder.uuid()));
       return builder.build();
    }();
@@ -799,6 +813,7 @@ schema_ptr built_views() {
        // comment
        "built views"
       )));
+       builder.set_gc_grace_seconds(0);
       builder.with_version(generate_schema_version(builder.uuid()));
       return builder.build();
    }();
@@ -842,6 +857,7 @@ schema_ptr scylla_views_builds_in_progress() {
        // comment
        "CDC-specific information that the local node stores"
       )));
+       builder.set_gc_grace_seconds(0);
       builder.with_version(generate_schema_version(builder.uuid()));
       return builder.build(schema_builder::compact_storage::no);
    }();
@@ -2196,13 +2212,13 @@ future<service::paxos::paxos_state> load_paxos_state(const partition_key& key, s
    // FIXME: we need execute_cql_with_now()
    (void)now;
    auto f = execute_cql_with_timeout(cql, timeout, to_legacy(*key.get_compound_type(*s), key.representation()), s->id());
-    return f.then([s] (shared_ptr<cql3::untyped_result_set> results) mutable {
+    return f.then([s, key] (shared_ptr<cql3::untyped_result_set> results) mutable {
        if (results->empty()) {
            return service::paxos::paxos_state();
        }
        auto& row = results->one();
-        auto promised = row.has("in_progress_ballot")
-                        ? row.get_as<utils::UUID>("in_progress_ballot") : utils::UUID_gen::min_time_UUID(0);
+        auto promised = row.has("promise")
+                        ? row.get_as<utils::UUID>("promise") : utils::UUID_gen::min_time_UUID(0);

        std::optional<service::paxos::proposal> accepted;
        if (row.has("proposal")) {
@@ -2211,9 +2227,14 @@ future<service::paxos::paxos_state> load_paxos_state(const partition_key& key, s
        }

        std::optional<service::paxos::proposal> most_recent;
-        if (row.has("most_recent_commit")) {
+        if (row.has("most_recent_commit_at")) {
+            // The value can be missing if it was pruned; supply an empty one since
+            // it is not going to be used anyway.
+            auto fm = row.has("most_recent_commit") ?
+                     ser::deserialize_from_buffer<>(row.get_blob("most_recent_commit"), boost::type<frozen_mutation>(), 0) :
+                     freeze(mutation(s, key));
            most_recent = service::paxos::proposal(row.get_as<utils::UUID>("most_recent_commit_at"),
-                    ser::deserialize_from_buffer<>(row.get_blob("most_recent_commit"), boost::type<frozen_mutation>(), 0));
+                    std::move(fm));
        }

        return service::paxos::paxos_state(promised, std::move(accepted), std::move(most_recent));
@@ -2228,7 +2249,7 @@ static int32_t paxos_ttl_sec(const schema& s) {
 }

 future<> save_paxos_promise(const schema& s, const partition_key& key, const utils::UUID& ballot, db::timeout_clock::time_point timeout) {
-    static auto cql = format("UPDATE system.{} USING TIMESTAMP ? AND TTL ? SET in_progress_ballot = ? WHERE row_key = ? AND cf_id = ?", PAXOS);
+    static auto cql = format("UPDATE system.{} USING TIMESTAMP ? AND TTL ? SET promise = ? WHERE row_key = ? AND cf_id = ?", PAXOS);
    return execute_cql_with_timeout(cql,
            timeout,
            utils::UUID_gen::micros_timestamp(ballot),
@@ -2240,13 +2261,14 @@ future<> save_paxos_promise(const schema& s, const partition_key& key, const uti
 }

 future<> save_paxos_proposal(const schema& s, const service::paxos::proposal& proposal, db::timeout_clock::time_point timeout) {
-    static auto cql = format("UPDATE system.{} USING TIMESTAMP ? AND TTL ? SET proposal_ballot = ?, proposal = ? WHERE row_key = ? AND cf_id = ?", PAXOS);
+    static auto cql = format("UPDATE system.{} USING TIMESTAMP ? AND TTL ? SET promise = ?, proposal_ballot = ?, proposal = ? WHERE row_key = ? AND cf_id = ?", PAXOS);
    partition_key_view key = proposal.update.key(s);
    return execute_cql_with_timeout(cql,
            timeout,
            utils::UUID_gen::micros_timestamp(proposal.ballot),
            paxos_ttl_sec(s),
            proposal.ballot,
+            proposal.ballot,
            ser::serialize_to_buffer<bytes>(proposal.update),
            to_legacy(*key.get_compound_type(s), key.representation()),
            s.id()
@@ -2274,6 +2296,20 @@ future<> save_paxos_decision(const schema& s, const service::paxos::proposal& de
        ).discard_result();
 }

+future<> delete_paxos_decision(const schema& s, const partition_key& key, const utils::UUID& ballot, db::timeout_clock::time_point timeout) {
+    // This should be called only if a learn stage succeeded on all replicas.
+    // In this case we can remove the learned paxos value using the ballot's timestamp,
+    // which guarantees that a more recent round, if there is one, will not be affected.
+    static auto cql = format("DELETE most_recent_commit FROM system.{} USING TIMESTAMP ?  WHERE row_key = ? AND cf_id = ?", PAXOS);
+
+    return execute_cql_with_timeout(cql,
+            timeout,
+            utils::UUID_gen::micros_timestamp(ballot),
+            to_legacy(*key.get_compound_type(s), key.representation()),
+            s.id()
+        ).discard_result();
+}
+
 } // namespace system_keyspace

 sstring system_keyspace_name() {
--- a/db/system_keyspace.hh
+++ b/db/system_keyspace.hh
@@ -647,6 +647,7 @@ future<service::paxos::paxos_state> load_paxos_state(const partition_key& key, s
 future<> save_paxos_promise(const schema& s, const partition_key& key, const utils::UUID& ballot, db::timeout_clock::time_point timeout);
 future<> save_paxos_proposal(const schema& s, const service::paxos::proposal& proposal, db::timeout_clock::time_point timeout);
 future<> save_paxos_decision(const schema& s, const service::paxos::proposal& decision, db::timeout_clock::time_point timeout);
+future<> delete_paxos_decision(const schema& s, const partition_key& key, const utils::UUID& ballot, db::timeout_clock::time_point timeout);

 } // namespace system_keyspace
 } // namespace db
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -130,17 +130,26 @@ const column_definition* view_info::view_column(const column_definition& base_de
    return _schema.get_column_definition(base_def.name());
 }

-const std::vector<column_id>& view_info::base_non_pk_columns_in_view_pk() const {
-    return _base_non_pk_columns_in_view_pk;
+void view_info::set_base_info(db::view::base_info_ptr base_info) {
+    _base_info = std::move(base_info);
 }

-void view_info::initialize_base_dependent_fields(const schema& base) {
+db::view::base_info_ptr view_info::make_base_dependent_view_info(const schema& base) const {
+    std::vector<column_id> base_non_pk_columns_in_view_pk;
    for (auto&& view_col : boost::range::join(_schema.partition_key_columns(), _schema.clustering_key_columns())) {
        auto* base_col = base.get_column_definition(view_col.name());
        if (base_col && !base_col->is_primary_key()) {
-            _base_non_pk_columns_in_view_pk.push_back(base_col->id);
+            base_non_pk_columns_in_view_pk.push_back(base_col->id);
        }
    }
+    return make_lw_shared<db::view::base_dependent_view_info>({
+        .base_schema = base.shared_from_this(),
+        .base_non_pk_columns_in_view_pk = std::move(base_non_pk_columns_in_view_pk)
+    });
+}
+
+bool view_info::has_base_non_pk_columns_in_view_pk() const {
+    return !_base_info->base_non_pk_columns_in_view_pk.empty();
 }

 namespace db {
@@ -188,11 +197,11 @@ bool may_be_affected_by(const schema& base, const view_info& view, const dht::de
 }

 static bool update_requires_read_before_write(const schema& base,
-        const std::vector<view_ptr>& views,
+        const std::vector<view_and_base>& views,
        const dht::decorated_key& key,
        const rows_entry& update) {
    for (auto&& v : views) {
-        view_info& vf = *v->view_info();
+        view_info& vf = *v.view->view_info();
        if (may_be_affected_by(base, vf, key, update)) {
            return true;
        }
@@ -239,12 +248,14 @@ class view_updates final {
    view_ptr _view;
    const view_info& _view_info;
    schema_ptr _base;
+    base_info_ptr _base_info;
    std::unordered_map<partition_key, mutation_partition, partition_key::hashing, partition_key::equality> _updates;
 public:
-    explicit view_updates(view_ptr view, schema_ptr base)
-            : _view(std::move(view))
+    explicit view_updates(view_and_base vab)
+            : _view(std::move(vab.view))
            , _view_info(*_view->view_info())
-            , _base(std::move(base))
+            , _base(vab.base->base_schema)
+            , _base_info(vab.base)
            , _updates(8, partition_key::hashing(*_view), partition_key::equality(*_view)) {
    }

@@ -306,7 +317,7 @@ row_marker view_updates::compute_row_marker(const clustering_row& base_row) cons
    // they share liveness information. It's true especially in the only case currently allowed by CQL,
    // which assumes there's up to one non-pk column in the view key. It's also true in alternator,
    // which does not carry TTL information.
-    const auto& col_ids = _view_info.base_non_pk_columns_in_view_pk();
+    const auto& col_ids = _base_info->base_non_pk_columns_in_view_pk;
    if (!col_ids.empty()) {
        auto& def = _base->regular_column_at(col_ids[0]);
        // Note: multi-cell columns can't be part of the primary key.
@@ -537,7 +548,7 @@ void view_updates::delete_old_entry(const partition_key& base_key, const cluster

 void view_updates::do_delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now) {
    auto& r = get_view_row(base_key, existing);
-    const auto& col_ids = _view_info.base_non_pk_columns_in_view_pk();
+    const auto& col_ids = _base_info->base_non_pk_columns_in_view_pk;
    if (!col_ids.empty()) {
        // We delete the old row using a shadowable row tombstone, making sure that
        // the tombstone deletes everything in the row (or it might still show up).
@@ -678,7 +689,7 @@ void view_updates::generate_update(
        return;
    }

-    const auto& col_ids = _view_info.base_non_pk_columns_in_view_pk();
+    const auto& col_ids = _base_info->base_non_pk_columns_in_view_pk;
    if (col_ids.empty()) {
        // The view key is necessarily the same pre and post update.
        if (existing && existing->is_live(*_base)) {
@@ -932,11 +943,16 @@ future<stop_iteration> view_update_builder::on_results() {

 future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
        const schema_ptr& base,
-        std::vector<view_ptr>&& views_to_update,
+        std::vector<view_and_base>&& views_to_update,
        flat_mutation_reader&& updates,
        flat_mutation_reader_opt&& existings) {
-    auto vs = boost::copy_range<std::vector<view_updates>>(views_to_update | boost::adaptors::transformed([&] (auto&& v) {
-        return view_updates(std::move(v), base);
+    auto vs = boost::copy_range<std::vector<view_updates>>(views_to_update | boost::adaptors::transformed([&] (view_and_base v) {
+        if (base->version() != v.base->base_schema->version()) {
+            on_internal_error(vlogger, format("Schema version used for view updates ({}) does not match the current"
+                                              " base schema version of the view ({}) for view {}.{} of {}.{}",
+                base->version(), v.base->base_schema->version(), v.view->ks_name(), v.view->cf_name(), base->ks_name(), base->cf_name()));
+        }
+        return view_updates(std::move(v));
    }));
    auto builder = std::make_unique<view_update_builder>(base, std::move(vs), std::move(updates), std::move(existings));
    auto f = builder->build();
@@ -946,18 +962,18 @@ future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
 query::clustering_row_ranges calculate_affected_clustering_ranges(const schema& base,
        const dht::decorated_key& key,
        const mutation_partition& mp,
-        const std::vector<view_ptr>& views) {
+        const std::vector<view_and_base>& views) {
    std::vector<nonwrapping_range<clustering_key_prefix_view>> row_ranges;
    std::vector<nonwrapping_range<clustering_key_prefix_view>> view_row_ranges;
    clustering_key_prefix_view::tri_compare cmp(base);
    if (mp.partition_tombstone() || !mp.row_tombstones().empty()) {
        for (auto&& v : views) {
            // FIXME: #2371
-            if (v->view_info()->select_statement().get_restrictions()->has_unrestricted_clustering_columns()) {
+            if (v.view->view_info()->select_statement().get_restrictions()->has_unrestricted_clustering_columns()) {
                view_row_ranges.push_back(nonwrapping_range<clustering_key_prefix_view>::make_open_ended_both_sides());
                break;
            }
-            for (auto&& r : v->view_info()->partition_slice().default_row_ranges()) {
+            for (auto&& r : v.view->view_info()->partition_slice().default_row_ranges()) {
                view_row_ranges.push_back(r.transform(std::mem_fn(&clustering_key_prefix::view)));
            }
        }
@@ -1101,6 +1117,8 @@ future<> mutate_MV(
            }
        };
        if (paired_endpoint) {
+            // If paired endpoint is present, remove it from the list of pending endpoints to avoid duplicates
+            pending_endpoints.erase(std::remove(pending_endpoints.begin(), pending_endpoints.end(), *paired_endpoint), pending_endpoints.end());
            // When paired endpoint is the local node, we can just apply
            // the mutation locally, unless there are pending endpoints, in
            // which case we want to do an ordinary write so the view mutation
@@ -1715,7 +1733,7 @@ public:
            return stop_iteration::yes;
        }

-        _fragments_memory_usage += cr.memory_usage(*_step.base->schema());
+        _fragments_memory_usage += cr.memory_usage(*_step.reader.schema());
        _fragments.push_back(std::move(cr));
        if (_fragments_memory_usage > batch_memory_max) {
            // Although we have not yet completed the batch of base rows that
@@ -1735,10 +1753,14 @@ public:
        _builder._as.check();
        if (!_fragments.empty()) {
            _fragments.push_front(partition_start(_step.current_key, tombstone()));
+            auto base_schema = _step.base->schema();
+            auto views = with_base_info_snapshot(_views_to_build);
+            auto reader = make_flat_mutation_reader_from_fragments(_step.reader.schema(), std::move(_fragments));
+            reader.upgrade_schema(base_schema);
            _step.base->populate_views(
-                    _views_to_build,
+                    std::move(views),
                    _step.current_token(),
-                    make_flat_mutation_reader_from_fragments(_step.base->schema(), std::move(_fragments))).get();
+                    std::move(reader)).get();
            _fragments.clear();
            _fragments_memory_usage = 0;
        }
@@ -1885,5 +1907,11 @@ future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_d
    });
 }

+std::vector<db::view::view_and_base> with_base_info_snapshot(std::vector<view_ptr> vs) {
+    return boost::copy_range<std::vector<db::view::view_and_base>>(vs | boost::adaptors::transformed([] (const view_ptr& v) {
+        return db::view::view_and_base{v, v->view_info()->base_info()};
+    }));
+}
+
 } // namespace view
 } // namespace db
--- a/db/view/view.hh
+++ b/db/view/view.hh
@@ -43,6 +43,27 @@ namespace db {

 namespace view {

+// Part of the view description which depends on the base schema version.
+//
+// This structure may change even though the view schema doesn't change, so
+// it needs to live outside view_ptr.
+struct base_dependent_view_info {
+    schema_ptr base_schema;
+
+    // Ids of the regular base table columns included in the view's PK, if any.
+    // Scylla views allow only one such column; Alternator can have up to two.
+    std::vector<column_id> base_non_pk_columns_in_view_pk;
+};
+
+// Immutable snapshot of view's base-schema-dependent part.
+using base_info_ptr = lw_shared_ptr<const base_dependent_view_info>;
+
+// Snapshot of the view schema and its base-schema-dependent part.
+struct view_and_base {
+    view_ptr view;
+    base_info_ptr base;
+};
+
 /**
 * Whether the view filter considers the specified partition key.
 *
@@ -92,7 +113,7 @@ bool clustering_prefix_matches(const schema& base, const partition_key& key, con

 future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
        const schema_ptr& base,
-        std::vector<view_ptr>&& views_to_update,
+        std::vector<view_and_base>&& views_to_update,
        flat_mutation_reader&& updates,
        flat_mutation_reader_opt&& existings);

@@ -100,7 +121,7 @@ query::clustering_row_ranges calculate_affected_clustering_ranges(
        const schema& base,
        const dht::decorated_key& key,
        const mutation_partition& mp,
-        const std::vector<view_ptr>& views);
+        const std::vector<view_and_base>& views);

 struct wait_for_all_updates_tag {};
 using wait_for_all_updates = bool_class<wait_for_all_updates_tag>;
@@ -128,6 +149,13 @@ future<> mutate_MV(
 */
 void create_virtual_column(schema_builder& builder, const bytes& name, const data_type& type);

+/**
+ * Converts a collection of view schema snapshots into a collection of
+ * view_and_base objects, which are snapshots of both the view schema
+ * and the base-schema-dependent part of view description.
+ */
+std::vector<view_and_base> with_base_info_snapshot(std::vector<view_ptr>);
+
 }

 }
--- a/dht/token.cc
+++ b/dht/token.cc
@@ -118,7 +118,7 @@ token token::midpoint(const token& t1, const token& t2) {
 }

 token token::get_random_token() {
-    return {kind::key, dht::get_random_number<int64_t>()};
+    return token(kind::key, dht::get_random_number<uint64_t>());
 }

 token token::from_sstring(const sstring& t) {
--- a/dht/token.hh
+++ b/dht/token.hh
@@ -58,19 +58,27 @@ public:
        , _data(normalize(d)) { }

    token(kind k, const bytes& b) : _kind(std::move(k)) {
-        if (b.size() != sizeof(_data)) {
-            throw std::runtime_error(fmt::format("Wrong token bytes size: expected {} but got {}", sizeof(_data), b.size()));
+        if (_kind != kind::key) {
+            _data = 0;
+        } else {
+            if (b.size() != sizeof(_data)) {
+                throw std::runtime_error(fmt::format("Wrong token bytes size: expected {} but got {}", sizeof(_data), b.size()));
+            }
+            std::copy_n(b.begin(), sizeof(_data), reinterpret_cast<int8_t *>(&_data));
+            _data = net::ntoh(_data);
        }
-        std::copy_n(b.begin(), sizeof(_data), reinterpret_cast<int8_t *>(&_data));
-        _data = net::ntoh(_data);
    }

    token(kind k, bytes_view b) : _kind(std::move(k)) {
-        if (b.size() != sizeof(_data)) {
-            throw std::runtime_error(fmt::format("Wrong token bytes size: expected {} but got {}", sizeof(_data), b.size()));
+        if (_kind != kind::key) {
+            _data = 0;
+        } else {
+            if (b.size() != sizeof(_data)) {
+                throw std::runtime_error(fmt::format("Wrong token bytes size: expected {} but got {}", sizeof(_data), b.size()));
+            }
+            std::copy_n(b.begin(), sizeof(_data), reinterpret_cast<int8_t *>(&_data));
+            _data = net::ntoh(_data);
        }
-        std::copy_n(b.begin(), sizeof(_data), reinterpret_cast<int8_t *>(&_data));
-        _data = net::ntoh(_data);
    }

    bool is_minimum() const {
--- a/dist/common/scripts/scylla-housekeeping
+++ b/dist/common/scripts/scylla-housekeeping
@@ -61,7 +61,15 @@ def sh_command(*args):
    return out

 def get_url(path):
-    return urllib.request.urlopen(path).read().decode('utf-8')
+    # If the server returns an error such as 403 or 500, urllib.request raises an exception that is not serializable.
+    # When the multiprocessing routines fail to serialize it, they raise an ambiguous serialization exception
+    #   from get_json_from_url.
+    # To surface the real error, we catch it inside the worker process, convert it to a string and
+    #   pass it back as part of the return value.
+    try:
+        return 0, urllib.request.urlopen(path).read().decode('utf-8')
+    except Exception as exc:
+        return 1, str(exc)

 def get_json_from_url(path):
    pool = mp.Pool(processes=1)
@@ -71,13 +79,16 @@ def get_json_from_url(path):
    # to enforce a wallclock timeout.
    result = pool.apply_async(get_url, args=(path,))
    try:
-        retval = result.get(timeout=5)
+        status, retval = result.get(timeout=5)
    except mp.TimeoutError as err:
        pool.terminate()
        pool.join()
        raise
+    if status == 1:
+        raise RuntimeError(f'Failed to get "{path}" due to the following error: {retval}')
    return json.loads(retval)

+
 def get_api(path):
    return get_json_from_url("http://" + api_address + path)

--- a/dist/common/scripts/scylla_fstrim_setup
+++ b/dist/common/scripts/scylla_fstrim_setup
@@ -31,5 +31,6 @@ if __name__ == '__main__':
        sys.exit(1)
    if is_systemd():
        systemd_unit('scylla-fstrim.timer').unmask()
+        systemd_unit('scylla-fstrim.timer').enable()
    if is_redhat_variant():
        systemd_unit('fstrim.timer').disable()
--- a/dist/common/scripts/scylla_setup
+++ b/dist/common/scripts/scylla_setup
@@ -371,6 +371,9 @@ if __name__ == '__main__':
                if not stat.S_ISBLK(os.stat(dsk).st_mode):
                    print('{} is not block device'.format(dsk))
                    continue
+                if dsk in selected:
+                    print(f'{dsk} is already added')
+                    continue
                selected.append(dsk)
                devices.remove(dsk)
            disks = ','.join(selected)
--- a/dist/common/scripts/scylla_util.py
+++ b/dist/common/scripts/scylla_util.py
@@ -182,7 +182,7 @@ class aws_instance:
        instance_size = self.instance_size()
        if instance_class in ['c3', 'c4', 'd2', 'i2', 'r3']:
            return 'ixgbevf'
-        if instance_class in ['c5', 'c5d', 'f1', 'g3', 'h1', 'i3', 'i3en', 'm5', 'm5d', 'p2', 'p3', 'r4', 'x1']:
+        if instance_class in ['a1', 'c5', 'c5a', 'c5d', 'c5n', 'c6g', 'c6gd', 'f1', 'g3', 'g4', 'h1', 'i3', 'i3en', 'inf1', 'm5', 'm5a', 'm5ad', 'm5d', 'm5dn', 'm5n', 'm6g', 'm6gd', 'p2', 'p3', 'r4', 'r5', 'r5a', 'r5ad', 'r5d', 'r5dn', 'r5n', 't3', 't3a', 'u-6tb1', 'u-9tb1', 'u-12tb1', 'u-18tn1', 'u-24tb1', 'x1', 'x1e', 'z1d']:
            return 'ena'
        if instance_class == 'm4':
            if instance_size == '16xlarge':
@@ -329,7 +329,7 @@ class scylla_cpuinfo:

 # When a CLI tool is not installed, use relocatable CLI tool provided by Scylla
 scylla_env = os.environ.copy()
-scylla_env['PATH'] =  '{}:{}'.format(scylla_env['PATH'], scyllabindir())
+scylla_env['PATH'] =  '{}:{}'.format(scyllabindir(), scylla_env['PATH'])

 def run(cmd, shell=False, silent=False, exception=True):
    stdout = subprocess.DEVNULL if silent else None
@@ -441,6 +441,19 @@ def dist_ver():
    return platform.dist()[1]


+SYSTEM_PARTITION_UUIDS = [
+        '21686148-6449-6e6f-744e-656564454649', # BIOS boot partition
+        'c12a7328-f81f-11d2-ba4b-00a0c93ec93b', # EFI system partition
+        '024dee41-33e7-11d3-9d69-0008c781f39f'  # MBR partition scheme
+]
+
+def get_partition_uuid(dev):
+    return out(f'lsblk -n -oPARTTYPE {dev}')
+
+def is_system_partition(dev):
+    uuid = get_partition_uuid(dev)
+    return (uuid in SYSTEM_PARTITION_UUIDS)
+
 def is_unused_disk(dev):
    # dev is not in /sys/class/block/, like /dev/nvme[0-9]+
    if not os.path.isdir('/sys/class/block/{dev}'.format(dev=dev.replace('/dev/', ''))):
@@ -448,7 +461,8 @@ def is_unused_disk(dev):
    try:
        fd = os.open(dev, os.O_EXCL)
        os.close(fd)
-        return True
+        # dev is not reserved for system
+        return not is_system_partition(dev)
    except OSError:
        return False

@@ -481,8 +495,8 @@ def parse_scylla_dirs_with_default(conf='/etc/scylla/scylla.yaml'):
        y['data_file_directories'] = [os.path.join(y['workdir'], 'data')]
    for t in [ "commitlog", "hints", "view_hints", "saved_caches" ]:
        key = "%s_directory" % t
-        if key not in y or not y[k]:
-            y[k] = os.path.join(y['workdir'], t)
+        if key not in y or not y[key]:
+            y[key] = os.path.join(y['workdir'], t)
    return y


--- a/dist/debian/control.mustache
+++ b/dist/debian/control.mustache
@@ -5,6 +5,7 @@ Section: database
 Priority: optional
 X-Python3-Version: >= 3.4
 Standards-Version: 3.9.5
+Rules-Requires-Root: no

 Package: {{product}}-conf
 Architecture: any
--- a/dist/debian/python3/control.mustache
+++ b/dist/debian/python3/control.mustache
@@ -5,6 +5,7 @@ Section: python
 Priority: optional
 X-Python3-Version: >= 3.4
 Standards-Version: 3.9.5
+Rules-Requires-Root: no

 Package: {{product}}-python3
 Architecture: amd64
--- a/dist/debian/rules.mustache
+++ b/dist/debian/rules.mustache
@@ -37,6 +37,7 @@ override_dh_strip:
 	# The binaries (ethtool...patchelf) don't pass dh_strip after going through patchelf. Since they are
 	# already stripped, nothing is lost if we exclude them, so that's what we do.
 	dh_strip -Xlibprotobuf.so.15 -Xld.so -Xethtool -Xgawk -Xgzip -Xhwloc-calc -Xhwloc-distrib -Xifconfig -Xlscpu -Xnetstat -Xpatchelf --dbg-package={{product}}-server-dbg
+	find $(CURDIR)/debian/{{product}}-server-dbg/usr/lib/debug/.build-id/ -name "*.debug" -exec objcopy --decompress-debug-sections {} \;

 override_dh_makeshlibs:

--- a/dist/docker/redhat/Dockerfile
+++ b/dist/docker/redhat/Dockerfile
@@ -5,8 +5,8 @@ MAINTAINER Avi Kivity <avi@cloudius-systems.com>
 ENV container docker

 # The SCYLLA_REPO_URL argument specifies the URL to the RPM repository this Docker image uses to install Scylla. The default value is the Scylla's unstable RPM repository, which contains the daily build.
-ARG SCYLLA_REPO_URL=http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo
-ARG VERSION=666.development
+ARG SCYLLA_REPO_URL=http://downloads.scylladb.com/rpm/unstable/centos/branch-4.0/latest/scylla.repo
+ARG VERSION=4.0.*

 ADD scylla_bashrc /scylla_bashrc

--- a/docs/alternator/alternator.md
+++ b/docs/alternator/alternator.md
@@ -21,10 +21,6 @@ DynamoDB API requests.
 For example., "`--alternator-port=8000`" on the command line will run
 Alternator on port 8000 - the traditional port used by DynamoDB.

-Alternator uses Scylla's LWT feature, which is currently considered
-experimental and needs to be seperately enabled as well, e.g. with the
-"`--experimental=on`" option.
-
 By default, Scylla listens on this port on all network interfaces.
 To listen only on a specific interface, pass also an "`alternator-address`"
 option.
@@ -55,9 +51,8 @@ Alternator's compatibility with DynamoDB, and will be updated as the work
 progresses and compatibility continues to improve.

 ### API Server
-* Transport: HTTP mostly supported, but small features like CRC header and
-  compression are still missing. HTTPS supported on top of HTTP, so small
-  features may still be missing.
+* Transport: HTTP and HTTPS are mostly supported, but small features like CRC
+  header and compression are still missing.
 * Authorization (verifying the originator of the request): implemented
  on top of system\_auth.roles table. The secret key used for authorization
  is the salted\_hash column from the roles table, selected with:
@@ -65,20 +60,19 @@ progresses and compatibility continues to improve.
  By default, authorization is not enforced at all. It can be turned on
  by providing an entry in Scylla configuration:
    alternator\_enforce\_authorization: true
-* DNS server for load balancing: Not yet supported. Client needs to pick
-  one of the live Scylla nodes and send a request to it.
+* Load balancing: Not a part of Alternator. One should use an external load
+  balancer or DNS server to balance the requests between the live Scylla
+  nodes. We plan to publish a reference example soon.
 ### Table Operations
-* CreateTable: Supported. Note our implementation is synchronous.
+* CreateTable and DeleteTable: Supported. Note our implementation is synchronous.
+* DescribeTable: Partial implementation. Missing creation date and size estimate.
 * UpdateTable: Not supported.
-* DescribeTable: Partial implementation. Missing creation date and size esitmate.
-* DeleteTable: Supported. Note our implementation is synchronous.
 * ListTables: Supported.
 ### Item Operations
 * GetItem: Support almost complete except that projection expressions can
  only ask for top-level attributes.
 * PutItem: Support almost complete except that condition expressions can
  only refer to to-level attributes.
-  pre-put content) not yet supported.
 * UpdateItem: Nested documents are supported but updates to nested attributes
  are not (e.g., `SET a.b[3].c=val`), and neither are nested attributes in
  condition expressions.
@@ -90,15 +84,14 @@ progresses and compatibility continues to improve.
 * BatchWriteItem: Supported. Doesn't limit the number of items (DynamoDB
  limits to 25) or size of items (400 KB) or total request size (16 MB).
 ### Scans
-* Scan: As usual, projection expressions only support top-level attributes.
-  Filter expressions (to filter some of the items) partially supported:
-  The ScanFilter syntax is supported but FilterExpression is not yet, and
-  only equality operator is supported so far.
-  The "Select" options which allows to count items instead of returning them
-  is not yet supported. Parallel scan is not yet supported.
-* Query: Same issues as Scan above. Additionally, missing support for
-  KeyConditionExpression (an alternative syntax replacing the older
-  KeyConditions parameter which we do support).
+Scan and Query are mostly supported, with the following limitations:
+* As above, projection expressions only support top-level attributes.
+* Filter expressions (to filter some of the items) are only partially
+  supported: The ScanFilter syntax currently supports only the equality
+  operator, and the FilterExpression syntax is not yet supported at all.
+* The "Select" option, which allows counting items instead of returning them,
+  is not yet supported.
+* Parallel scan is not yet supported.
 ### Secondary Indexes
 Global Secondary Indexes (GSI) and Local Secondary Indexes (LSI) are
 implemented, with the following limitations:
@@ -116,24 +109,28 @@ implemented, with the following limitations:
  Writes are done in LOCAL_QURUM and reads in LOCAL_ONE (eventual consistency)
  or LOCAL_QUORUM (strong consistency).
 ### Global Tables
-* Not yet supported: CreateGlobalTable, UpdateGlobalTable,
-  DescribeGlobalTable, ListGlobalTables, UpdateGlobalTableSettings,
-  DescribeGlobalTableSettings. Implementation will use Scylla's multi-DC
-  features.
+* Currently, *all* Alternator tables are created as "Global Tables", i.e., can
+  be accessed from all of Scylla's DCs.
+* We do not yet support the DynamoDB API calls to make some of the tables
+  global and others local to a particular DC: CreateGlobalTable,
+  UpdateGlobalTable, DescribeGlobalTable, ListGlobalTables,
+  UpdateGlobalTableSettings, DescribeGlobalTableSettings, and UpdateTable.
 ### Backup and Restore
-* On-demand backup: Not yet supported: CreateBackup, DescribeBackup,
-  DeleteBackup, ListBackups, RestoreTableFromBackup. Implementation will
-  use Scylla's snapshots
+* On-demand backup: the DynamoDB APIs are not yet supported: CreateBackup,
+  DescribeBackup, DeleteBackup, ListBackups, RestoreTableFromBackup.
+  Users can use Scylla's [snapshots](https://docs.scylladb.com/operating-scylla/procedures/backup-restore/)
+  or [Scylla Manager](https://docs.scylladb.com/operating-scylla/manager/2.0/backup/).
 * Continuous backup: Not yet supported: UpdateContinuousBackups,
  DescribeContinuousBackups, RestoreTableToPoinInTime.
-### Transations
+### Transactions
 * Not yet supported: TransactWriteItems, TransactGetItems.
  Note that this is a new DynamoDB feature - these are more powerful than
  the old conditional updates which were "lightweight transactions".
-### Streams (CDC)
-* Not yet supported
+### Streams
+* Scylla has experimental support for [CDC](https://docs.scylladb.com/using-scylla/cdc/)
+  (change data capture), but the "DynamoDB Streams" API is not yet supported.
 ### Encryption at rest
-* Supported natively by Scylla, but needs to be enabled by default.
+* Supported by Scylla Enterprise (not in open-source). Needs to be enabled.
 ### ARNs and tags
 * ARN is generated for every alternator table
 * Tagging can be used with the help of the following requests:
@@ -166,7 +163,9 @@ implemented, with the following limitations:
 * Not required. Scylla cache is rather advanced and there is no need to place
  a cache in front of the database: https://www.scylladb.com/2017/07/31/database-caches-not-good/
 ### Metrics
-* Several metrics are available through the Grafana/Promethues stack: https://docs.scylladb.com/operating-scylla/monitoring/   It is different than the expectations of the current DynamoDB implementation. However, our
+* Several metrics are available through the Grafana/Prometheus stack:
+  https://docs.scylladb.com/operating-scylla/monitoring/
+  Those are different from the current DynamoDB metrics, but Scylla's
  monitoring is rather advanced and provide more insights to the internals.

 ## Alternator design and implementation
@@ -229,8 +228,3 @@ one DynamoDB feature which we cannot support safely: we cannot modify
 a non-top-level attribute (e.g., a.b[3].c) directly without RMW. We plan
 to fix this in a future version by rethinking the data model we use for
 attributes, or rethinking our implementation of RMW (as explained above).
-
-For reasons explained above, the data model used by Alternator to store
-data on disk is still in a state of flux, and may change in future versions.
-Therefore, in this early stage it is not recommended to store important
-production data using Alternator.
--- a/docs/alternator/getting-started.md
+++ b/docs/alternator/getting-started.md
@@ -10,12 +10,10 @@ This section will guide you through the steps for setting up the cluster:
   nightly image by running: `docker pull scylladb/scylla-nightly:latest`
 2. Follow the steps in the [Scylla official download web page](https://www.scylladb.com/download/open-source/#docker)
   add to every "docker run" command: `-p 8000:8000` before the image name
-   and `--alternator-port=8000 --experimental 1` at the end. The
-   "alternator-port" option specifies on which port Scylla will listen for
-   the (unencrypted) DynamoDB API, and "--experimental 1" is required to
-   enable the experimental LWT feature which Alternator uses.
+   and `--alternator-port=8000` at the end. The "alternator-port" option
+   specifies on which port Scylla will listen for the (unencrypted) DynamoDB API.
   For example,
-   `docker run --name scylla -d -p 8000:8000 scylladb/scylla-nightly:latest --alternator-port=8000 --experimental 1`
+   `docker run --name scylla -d -p 8000:8000 scylladb/scylla-nightly:latest --alternator-port=8000`

 ## Testing Scylla's DynamoDB API support:
 ### Running AWS Tic Tac Toe demo app to test the cluster:
--- a/docs/sstable-scylla-format.md
+++ b/docs/sstable-scylla-format.md
@@ -76,6 +76,9 @@ Scylla with issue #4139 fixed)
 bit 4: CorrectEmptyCounters (if set, indicates the sstable was generated by
 Scylla with issue #4363 fixed)

+bit 5: CorrectUDTsInCollections (if set, indicates that the sstable was generated
+by Scylla with issue #6130 fixed)
+
 ## extension_attributes subcomponent

    extension_attributes = extension_attribute_count extension_attribute*
--- a/exceptions/unrecognized_entity_exception.hh
+++ b/exceptions/unrecognized_entity_exception.hh
@@ -56,22 +56,22 @@ public:
    /**
     * The unrecognized entity.
     */
-    ::shared_ptr<cql3::column_identifier> entity;
+    cql3::column_identifier entity;

    /**
-     * The entity relation.
+     * The entity relation in a stringified form.
     */
-    cql3::relation_ptr relation;
+    sstring relation_str;

    /**
     * Creates a new <code>UnrecognizedEntityException</code>.
     * @param entity the unrecognized entity
-     * @param relation the entity relation
+     * @param relation_str the entity relation string
     */
-    unrecognized_entity_exception(::shared_ptr<cql3::column_identifier> entity, cql3::relation_ptr relation)
-        : invalid_request_exception(format("Undefined name {} in where clause ('{}')", *entity, relation->to_string()))
-        , entity(entity)
-        , relation(relation)
+    unrecognized_entity_exception(cql3::column_identifier entity, sstring relation_str)
+        : invalid_request_exception(format("Undefined name {} in where clause ('{}')", entity, relation_str))
+        , entity(std::move(entity))
+        , relation_str(std::move(relation_str))
    { }
 };

--- a/flat_mutation_reader.hh
+++ b/flat_mutation_reader.hh
@@ -487,6 +487,9 @@ public:
    size_t buffer_size() const {
        return _impl->buffer_size();
    }
+    const circular_buffer<mutation_fragment>& buffer() const {
+        return _impl->buffer();
+    }
    // Detach the internal buffer of the reader.
    // Roughly equivalent to depleting it by calling pop_mutation_fragment()
    // until is_buffer_empty() returns true.
--- a/gms/feature_service.cc
+++ b/gms/feature_service.cc
@@ -110,10 +110,6 @@ feature_config feature_config_from_db_config(db::config& cfg) {
        fcfg.enable_cdc = true;
    }

-    if (cfg.check_experimental(db::experimental_features_t::LWT)) {
-        fcfg.enable_lwt = true;
-    }
-
    return fcfg;
 }

@@ -178,9 +174,7 @@ std::set<std::string_view> feature_service::known_feature_set() {
    if (_config.enable_cdc) {
        features.insert(gms::features::CDC);
    }
-    if (_config.enable_lwt) {
-        features.insert(gms::features::LWT);
-    }
+    features.insert(gms::features::LWT);

    for (const sstring& s : _config.disabled_features) {
        features.erase(s);
--- a/gms/feature_service.hh
+++ b/gms/feature_service.hh
@@ -41,7 +41,6 @@ struct feature_config {
    bool enable_sstables_mc_format = false;
    bool enable_user_defined_functions = false;
    bool enable_cdc = false;
-    bool enable_lwt = false;
    std::set<sstring> disabled_features;
    feature_config();
 };
--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -428,6 +428,7 @@ future<> gossiper::handle_shutdown_msg(inet_address from) {
        return make_ready_future<>();
    }
    return seastar::async([this, from] {
+        auto permit = this->lock_endpoint(from).get0();
        this->mark_as_shutdown(from);
    });
 }
@@ -632,7 +633,7 @@ void gossiper::remove_endpoint(inet_address endpoint) {
    // We can not run on_remove callbacks here becasue on_remove in
    // storage_service might take the gossiper::timer_callback_lock
    (void)seastar::async([this, endpoint] {
-        _subscribers.for_each([endpoint] (auto& subscriber) {
+        _subscribers.for_each([endpoint] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
            subscriber->on_remove(endpoint);
        });
    }).handle_exception([] (auto ep) {
@@ -1464,7 +1465,7 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
        logger.info("InetAddress {} is now UP, status = {}", addr, status);
    }

-    _subscribers.for_each([addr, local_state] (auto& subscriber) {
+    _subscribers.for_each([addr, local_state] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
        subscriber->on_alive(addr, local_state);
        logger.trace("Notified {}", subscriber.get());
    });
@@ -1478,7 +1479,7 @@ void gossiper::mark_dead(inet_address addr, endpoint_state& local_state) {
    _live_endpoints_just_added.remove(addr);
    _unreachable_endpoints[addr] = now();
    logger.info("InetAddress {} is now DOWN, status = {}", addr, get_gossip_status(local_state));
-    _subscribers.for_each([addr, local_state] (auto& subscriber) {
+    _subscribers.for_each([addr, local_state] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
        subscriber->on_dead(addr, local_state);
        logger.trace("Notified {}", subscriber.get());
    });
@@ -1510,7 +1511,7 @@ void gossiper::handle_major_state_change(inet_address ep, const endpoint_state&

    if (eps_old) {
        // the node restarted: it is up to the subscriber to take whatever action is necessary
-        _subscribers.for_each([ep, eps_old] (auto& subscriber) {
+        _subscribers.for_each([ep, eps_old] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
            subscriber->on_restart(ep, *eps_old);
        });
    }
@@ -1525,7 +1526,7 @@ void gossiper::handle_major_state_change(inet_address ep, const endpoint_state&

    auto* eps_new = get_endpoint_state_for_endpoint_ptr(ep);
    if (eps_new) {
-        _subscribers.for_each([ep, eps_new] (auto& subscriber) {
+        _subscribers.for_each([ep, eps_new] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
            subscriber->on_join(ep, *eps_new);
        });
    }
@@ -1618,14 +1619,14 @@ void gossiper::apply_new_states(inet_address addr, endpoint_state& local_state,

 // Runs inside seastar::async context
 void gossiper::do_before_change_notifications(inet_address addr, const endpoint_state& ep_state, const application_state& ap_state, const versioned_value& new_value) {
-    _subscribers.for_each([addr, ep_state, ap_state, new_value] (auto& subscriber) {
+    _subscribers.for_each([addr, ep_state, ap_state, new_value] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
        subscriber->before_change(addr, ep_state, ap_state, new_value);
    });
 }

 // Runs inside seastar::async context
 void gossiper::do_on_change_notifications(inet_address addr, const application_state& state, const versioned_value& value) {
-    _subscribers.for_each([addr, state, value] (auto& subscriber) {
+    _subscribers.for_each([addr, state, value] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
        subscriber->on_change(addr, state, value);
    });
 }
@@ -1725,8 +1726,12 @@ future<> gossiper::start_gossiping(int generation_nbr, std::map<application_stat
    // message on all cpus and forard them to cpu0 to process.
    return get_gossiper().invoke_on_all([do_bind] (gossiper& g) {
        g.init_messaging_service_handler(do_bind);
-    }).then([this, generation_nbr, preload_local_states] {
+    }).then([this, generation_nbr, preload_local_states] () mutable {
        build_seeds_list();
+        if (_cfg.force_gossip_generation() > 0) {
+            generation_nbr = _cfg.force_gossip_generation();
+            logger.warn("Use the generation number provided by user: generation = {}", generation_nbr);
+        }
        endpoint_state& local_state = endpoint_state_map[get_broadcast_address()];
        local_state.set_heart_beat_state_and_update_timestamp(heart_beat_state(generation_nbr));
        local_state.mark_alive();
--- a/gms/gossiper.hh
+++ b/gms/gossiper.hh
@@ -591,6 +591,7 @@ public:
    std::map<sstring, sstring> get_simple_states();
    int get_down_endpoint_count();
    int get_up_endpoint_count();
+    int get_all_endpoint_count();
    sstring get_endpoint_state(sstring address);
    failure_detector& fd() { return _fd; }
 };
@@ -637,6 +638,12 @@ inline future<int> get_up_endpoint_count() {
    });
 }

+inline future<int> get_all_endpoint_count() {
+    return smp::submit_to(0, [] {
+        return static_cast<int>(get_local_gossiper().get_endpoint_states().size());
+    });
+}
+
 inline future<> set_phi_convict_threshold(double phi) {
    return smp::submit_to(0, [phi] {
        get_local_gossiper().fd().set_phi_convict_threshold(phi);
--- a/gms/inet_address.cc
+++ b/gms/inet_address.cc
@@ -69,7 +69,8 @@ std::ostream& gms::operator<<(std::ostream& os, const inet_address& x) {
    auto&& bytes = x.bytes();
    auto i = 0u;
    auto acc = 0u;
-    for (auto b : bytes) {
+    // extra paranoid sign extension evasion - #5808
+    for (uint8_t b : bytes) {
        acc <<= 8;
        acc |= b;
        if ((++i & 1) == 0) {
--- a/install-dependencies.sh
+++ b/install-dependencies.sh
@@ -76,6 +76,8 @@ fedora_packages=(
    python3-psutil
    python3-cassandra-driver
    python3-colorama
+    python3-boto3
+    python3-pytest
    dnf-utils
    pigz
    net-tools
--- a/install.sh
+++ b/install.sh
@@ -126,6 +126,7 @@ relocate_python3() {
    cp "$script" "$relocateddir"
    cat > "$install"<<EOF
 #!/usr/bin/env bash
+export LC_ALL=en_US.UTF-8
 x="\$(readlink -f "\$0")"
 b="\$(basename "\$x")"
 d="\$(dirname "\$x")"
--- a/licenses/abseil-license.txt
+++ b/licenses/abseil-license.txt
@@ -0,0 +1,203 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        https://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       https://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
--- a/locator/abstract_replication_strategy.cc
+++ b/locator/abstract_replication_strategy.cc
@@ -144,10 +144,33 @@ insert_token_range_to_sorted_container_while_unwrapping(

 dht::token_range_vector
 abstract_replication_strategy::get_ranges(inet_address ep) const {
+    return do_get_ranges(ep, _token_metadata, false); // walks the strategy's own _token_metadata; can_yield=false, so no maybe_yield() calls
+}
+
+dht::token_range_vector
+abstract_replication_strategy::get_ranges_in_thread(inet_address ep) const { // yielding variant of get_ranges(ep)
+    return do_get_ranges(ep, _token_metadata, true); // can_yield=true lets do_get_ranges() call seastar::thread::maybe_yield(); presumably requires a seastar thread context - TODO confirm
+}
+
+dht::token_range_vector
+abstract_replication_strategy::get_ranges(inet_address ep, token_metadata& tm) const { // like get_ranges(ep), but computed against the caller-supplied tm
+    return do_get_ranges(ep, tm, false); // can_yield=false, so no maybe_yield() calls
+}
+
+dht::token_range_vector
+abstract_replication_strategy::get_ranges_in_thread(inet_address ep, token_metadata& tm) const { // yielding variant of get_ranges(ep, tm)
+    return do_get_ranges(ep, tm, true); // can_yield=true lets do_get_ranges() call seastar::thread::maybe_yield(); presumably requires a seastar thread context - TODO confirm
+}
+
+dht::token_range_vector
+abstract_replication_strategy::do_get_ranges(inet_address ep, token_metadata& tm, bool can_yield) const {
    dht::token_range_vector ret;
-    auto prev_tok = _token_metadata.sorted_tokens().back();
-    for (auto tok : _token_metadata.sorted_tokens()) {
-        for (inet_address a : calculate_natural_endpoints(tok, _token_metadata)) {
+    auto prev_tok = tm.sorted_tokens().back();
+    for (auto tok : tm.sorted_tokens()) {
+        for (inet_address a : calculate_natural_endpoints(tok, tm)) {
+            if (can_yield) {
+                seastar::thread::maybe_yield();
+            }
            if (a == ep) {
                insert_token_range_to_sorted_container_while_unwrapping(prev_tok, tok, ret);
                break;
--- a/locator/abstract_replication_strategy.hh
+++ b/locator/abstract_replication_strategy.hh
@@ -106,6 +106,15 @@ public:
    // It the analogue of Origin's getAddressRanges().get(endpoint).
    // This function is not efficient, and not meant for the fast path.
    dht::token_range_vector get_ranges(inet_address ep) const;
+    dht::token_range_vector get_ranges_in_thread(inet_address ep) const;
+
+    // Use the token_metadata provided by the caller instead of _token_metadata
+    dht::token_range_vector get_ranges(inet_address ep, token_metadata& tm) const;
+    dht::token_range_vector get_ranges_in_thread(inet_address ep, token_metadata& tm) const;
+private:
+    dht::token_range_vector do_get_ranges(inet_address ep, token_metadata& tm, bool can_yield) const;
+
+public:
    // get_primary_ranges() returns the list of "primary ranges" for the given
    // endpoint. "Primary ranges" are the ranges that the node is responsible
    // for storing replica primarily, which means this is the first node
--- a/lua.cc
+++ b/lua.cc
@@ -264,14 +264,12 @@ static auto visit_lua_raw_value(lua_State* l, int index, Func&& f) {

 template <typename Func>
 static auto visit_decimal(const big_decimal &v, Func&& f) {
-    boost::multiprecision::cpp_int ten(10);
-    const auto& dividend = v.unscaled_value();
-    auto divisor = boost::multiprecision::pow(ten, v.scale());
+    boost::multiprecision::cpp_rational r = v.as_rational(); // rational form of v; also feeds the double fallback at the end
+    const boost::multiprecision::cpp_int& dividend = numerator(r); // NOTE(review): if numerator() returns by value this relies on const-ref lifetime extension - confirm
+    const boost::multiprecision::cpp_int& divisor = denominator(r); // same remark as for the numerator
     if (dividend % divisor == 0) {
-        return f(utils::multiprecision_int(boost::multiprecision::cpp_int(dividend/divisor)));
+        return f(utils::multiprecision_int(dividend/divisor)); // no remainder: hand f the integer quotient
     }
-    boost::multiprecision::cpp_rational r = dividend;
-    r /= divisor;
     return f(r.convert_to<double>());
 }

--- a/main.cc
+++ b/main.cc
@@ -546,9 +546,13 @@ int main(int ac, char** av) {
            gms::feature_config fcfg = gms::feature_config_from_db_config(*cfg);

            feature_service.start(fcfg).get();
-            auto stop_feature_service = defer_verbose_shutdown("feature service", [&feature_service] {
-                feature_service.stop().get();
-            });
+            // FIXME: storage_proxy holds a reference to feature_service and is not stopped yet.
+            // The proxy also leaves range_slice_read_executor-s hanging around that may still
+            // want to check cluster_supports_digest_multipartition_reads, so the stop is disabled:
+            //
+            //auto stop_feature_service = defer_verbose_shutdown("feature service", [&feature_service] {
+            //    feature_service.stop().get();
+            //});

            schema::set_default_partitioner(cfg->partitioner(), cfg->murmur3_partitioner_ignore_msb_bits());
            auto make_sched_group = [&] (sstring name, unsigned shares) {
@@ -662,9 +666,17 @@ int main(int ac, char** av) {

            supervisor::notify("starting tokens manager");
            token_metadata.start().get();
-            auto stop_token_metadata = defer_verbose_shutdown("token metadata", [ &token_metadata ] {
-                token_metadata.stop().get();
-            });
+            // FIXME: storage_proxy holds a reference to token_metadata and is not stopped yet.
+            // What's worse, the call chain
+            //   storage_proxy::do_query
+            //                ::query_partition_key_range
+            //                ::query_partition_key_range_concurrent
+            // leaves unwaited futures on the reactor; when they eventually run,
+            // they access the token_metadata instance, so it must not be stopped here.
+            //
+            //auto stop_token_metadata = defer_verbose_shutdown("token metadata", [ &token_metadata ] {
+            //    token_metadata.stop().get();
+            //});

            supervisor::notify("starting migration manager notifier");
            mm_notifier.start().get();
@@ -935,12 +947,16 @@ int main(int ac, char** av) {
                mm.init_messaging_service();
            }).get();
            supervisor::notify("initializing storage proxy RPC verbs");
-            proxy.invoke_on_all([] (service::storage_proxy& p) {
-                p.init_messaging_service();
-            }).get();
+            proxy.invoke_on_all(&service::storage_proxy::init_messaging_service).get();
+            auto stop_proxy_handlers = defer_verbose_shutdown("storage proxy RPC verbs", [&proxy] {
+                proxy.invoke_on_all(&service::storage_proxy::uninit_messaging_service).get();
+            });

            supervisor::notify("starting streaming service");
            streaming::stream_session::init_streaming_service(db, sys_dist_ks, view_update_generator).get();
+            auto stop_streaming_service = defer_verbose_shutdown("streaming service", [] {
+                streaming::stream_session::uninit_streaming_service().get();
+            });
            api::set_server_stream_manager(ctx).get();

            supervisor::notify("starting hinted handoff manager");
@@ -973,6 +989,9 @@ int main(int ac, char** av) {
                rs.stop().get();
            });
            repair_init_messaging_service_handler(rs, sys_dist_ks, view_update_generator).get();
+            auto stop_repair_messages = defer_verbose_shutdown("repair message handlers", [] {
+                repair_uninit_messaging_service_handler().get();
+            });
            supervisor::notify("starting storage service", true);
            auto& ss = service::get_local_storage_service();
            ss.init_messaging_service_part().get();
@@ -1071,9 +1090,6 @@ int main(int ac, char** av) {
                static sharded<alternator::executor> alternator_executor;
                static sharded<alternator::server> alternator_server;

-                if (!cfg->check_experimental(db::experimental_features_t::LWT)) {
-                    throw std::runtime_error("Alternator enabled, but needs experimental LWT feature which wasn't enabled");
-                }
                net::inet_address addr;
                try {
                    addr = net::dns::get_host_by_name(cfg->alternator_address(), family).get0().addr_list.front();
--- a/message/messaging_service.cc
+++ b/message/messaging_service.cc
@@ -452,6 +452,7 @@ static constexpr unsigned do_get_rpc_client_idx(messaging_verb verb) {
    case messaging_verb::PAXOS_PREPARE:
    case messaging_verb::PAXOS_ACCEPT:
    case messaging_verb::PAXOS_LEARN:
+    case messaging_verb::PAXOS_PRUNE:
        return 0;
    // GET_SCHEMA_VERSION is sent from read/mutate verbs so should be
    // sent on a different connection to avoid potential deadlocks
@@ -717,6 +718,10 @@ void messaging_service::register_stream_mutation_fragments(std::function<future<
    register_handler(this, messaging_verb::STREAM_MUTATION_FRAGMENTS, std::move(func));
 }

+future<> messaging_service::unregister_stream_mutation_fragments() { // counterpart of register_stream_mutation_fragments()
+    return unregister_handler(messaging_verb::STREAM_MUTATION_FRAGMENTS); // the caller receives unregister_handler()'s future
+}
+
 template<class SinkType, class SourceType>
 future<rpc::sink<SinkType>, rpc::source<SourceType>>
 do_make_sink_source(messaging_verb verb, uint32_t repair_meta_id, shared_ptr<messaging_service::rpc_protocol_client_wrapper> rpc_client, std::unique_ptr<messaging_service::rpc_protocol_wrapper>& rpc) {
@@ -748,6 +753,9 @@ rpc::sink<repair_row_on_wire_with_cmd> messaging_service::make_sink_for_repair_g
 void messaging_service::register_repair_get_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_row_on_wire_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_hash_with_cmd> source)>&& func) {
    register_handler(this, messaging_verb::REPAIR_GET_ROW_DIFF_WITH_RPC_STREAM, std::move(func));
 }
+future<> messaging_service::unregister_repair_get_row_diff_with_rpc_stream() { // counterpart of register_repair_get_row_diff_with_rpc_stream()
+    return unregister_handler(messaging_verb::REPAIR_GET_ROW_DIFF_WITH_RPC_STREAM); // the caller receives unregister_handler()'s future
+}

 // Wrapper for REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM
 future<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_stream_cmd>>
@@ -767,6 +775,9 @@ rpc::sink<repair_stream_cmd> messaging_service::make_sink_for_repair_put_row_dif
 void messaging_service::register_repair_put_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_stream_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_row_on_wire_with_cmd> source)>&& func) {
    register_handler(this, messaging_verb::REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM, std::move(func));
 }
+future<> messaging_service::unregister_repair_put_row_diff_with_rpc_stream() { // counterpart of register_repair_put_row_diff_with_rpc_stream()
+    return unregister_handler(messaging_verb::REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM); // the caller receives unregister_handler()'s future
+}

 // Wrapper for REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM
 future<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd>>
@@ -786,6 +797,9 @@ rpc::sink<repair_hash_with_cmd> messaging_service::make_sink_for_repair_get_full
 void messaging_service::register_repair_get_full_row_hashes_with_rpc_stream(std::function<future<rpc::sink<repair_hash_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_stream_cmd> source)>&& func) {
    register_handler(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM, std::move(func));
 }
+future<> messaging_service::unregister_repair_get_full_row_hashes_with_rpc_stream() { // counterpart of register_repair_get_full_row_hashes_with_rpc_stream()
+    return unregister_handler(messaging_verb::REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM); // the caller receives unregister_handler()'s future
+}

 // Send a message for verb
 template <typename MsgIn, typename... MsgOut>
@@ -869,6 +883,9 @@ future<streaming::prepare_message> messaging_service::send_prepare_message(msg_a
    return send_message<streaming::prepare_message>(this, messaging_verb::PREPARE_MESSAGE, id,
        std::move(msg), plan_id, std::move(description), reason);
 }
+future<> messaging_service::unregister_prepare_message() { // removes the handler for the PREPARE_MESSAGE verb
+    return unregister_handler(messaging_verb::PREPARE_MESSAGE); // the caller receives unregister_handler()'s future
+}

 // PREPARE_DONE_MESSAGE
 void messaging_service::register_prepare_done_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func) {
@@ -878,6 +895,9 @@ future<> messaging_service::send_prepare_done_message(msg_addr id, UUID plan_id,
    return send_message<void>(this, messaging_verb::PREPARE_DONE_MESSAGE, id,
        plan_id, dst_cpu_id);
 }
+future<> messaging_service::unregister_prepare_done_message() { // counterpart of register_prepare_done_message()
+    return unregister_handler(messaging_verb::PREPARE_DONE_MESSAGE); // the caller receives unregister_handler()'s future
+}

 // STREAM_MUTATION
 void messaging_service::register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool> fragmented, rpc::optional<streaming::stream_reason> reason)>&& func) {
@@ -902,6 +922,9 @@ future<> messaging_service::send_stream_mutation_done(msg_addr id, UUID plan_id,
    return send_message<void>(this, messaging_verb::STREAM_MUTATION_DONE, id,
        plan_id, std::move(ranges), cf_id, dst_cpu_id);
 }
+future<> messaging_service::unregister_stream_mutation_done() { // removes the handler for the STREAM_MUTATION_DONE verb
+    return unregister_handler(messaging_verb::STREAM_MUTATION_DONE); // the caller receives unregister_handler()'s future
+}

 // COMPLETE_MESSAGE
 void messaging_service::register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id, rpc::optional<bool> failed)>&& func) {
@@ -911,6 +934,9 @@ future<> messaging_service::send_complete_message(msg_addr id, UUID plan_id, uns
    return send_message<void>(this, messaging_verb::COMPLETE_MESSAGE, id,
        plan_id, dst_cpu_id, failed);
 }
+future<> messaging_service::unregister_complete_message() { // counterpart of register_complete_message()
+    return unregister_handler(messaging_verb::COMPLETE_MESSAGE); // the caller receives unregister_handler()'s future
+}

 void messaging_service::register_gossip_echo(std::function<future<> ()>&& func) {
    register_handler(this, messaging_verb::GOSSIP_ECHO, std::move(func));
@@ -1125,14 +1151,14 @@ future<partition_checksum> messaging_service::send_repair_checksum_range(
 }

 // Wrapper for REPAIR_GET_FULL_ROW_HASHES
-void messaging_service::register_repair_get_full_row_hashes(std::function<future<std::unordered_set<repair_hash>> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func) {
+void messaging_service::register_repair_get_full_row_hashes(std::function<future<repair_hash_set> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func) { // installs func for REPAIR_GET_FULL_ROW_HASHES; the handler resolves to a repair_hash_set
     register_handler(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(func));
 }
 future<> messaging_service::unregister_repair_get_full_row_hashes() {
    return unregister_handler(messaging_verb::REPAIR_GET_FULL_ROW_HASHES);
 }
-future<std::unordered_set<repair_hash>> messaging_service::send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id) {
-    return send_message<future<std::unordered_set<repair_hash>>>(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(id), repair_meta_id);
+future<repair_hash_set> messaging_service::send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id) {
+    return send_message<future<repair_hash_set>>(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(id), repair_meta_id); // reply type mirrors the handler type taken by register_repair_get_full_row_hashes()
 }

 // Wrapper for REPAIR_GET_COMBINED_ROW_HASH
@@ -1157,13 +1183,13 @@ future<get_sync_boundary_response> messaging_service::send_repair_get_sync_bound
 }

 // Wrapper for REPAIR_GET_ROW_DIFF
-void messaging_service::register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows)>&& func) {
+void messaging_service::register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows)>&& func) { // installs func for REPAIR_GET_ROW_DIFF; set_diff arrives as a repair_hash_set
     register_handler(this, messaging_verb::REPAIR_GET_ROW_DIFF, std::move(func));
 }
 future<> messaging_service::unregister_repair_get_row_diff() {
    return unregister_handler(messaging_verb::REPAIR_GET_ROW_DIFF);
 }
-future<repair_rows_on_wire> messaging_service::send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows) {
+future<repair_rows_on_wire> messaging_service::send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows) { // set_diff is taken by value and moved into the message below
     return send_message<future<repair_rows_on_wire>>(this, messaging_verb::REPAIR_GET_ROW_DIFF, std::move(id), repair_meta_id, std::move(set_diff), needs_all_rows);
 }

@@ -1179,14 +1205,14 @@ future<> messaging_service::send_repair_put_row_diff(msg_addr id, uint32_t repai
 }

 // Wrapper for REPAIR_ROW_LEVEL_START
-void messaging_service::register_repair_row_level_start(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version)>&& func) {
+void messaging_service::register_repair_row_level_start(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version, rpc::optional<streaming::stream_reason> reason)>&& func) { // reason is rpc::optional on the receiving side; presumably so that peers which do not send it are still accepted - TODO confirm
     register_handler(this, messaging_verb::REPAIR_ROW_LEVEL_START, std::move(func));
 }
 future<> messaging_service::unregister_repair_row_level_start() {
    return unregister_handler(messaging_verb::REPAIR_ROW_LEVEL_START);
 }
-future<> messaging_service::send_repair_row_level_start(msg_addr id, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version) {
-    return send_message<void>(this, messaging_verb::REPAIR_ROW_LEVEL_START, std::move(id), repair_meta_id, std::move(keyspace_name), std::move(cf_name), std::move(range), algo, max_row_buf_size, seed, remote_shard, remote_shard_count, remote_ignore_msb, std::move(remote_partitioner_name), std::move(schema_version));
+future<> messaging_service::send_repair_row_level_start(msg_addr id, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version, streaming::stream_reason reason) {
+    return send_message<void>(this, messaging_verb::REPAIR_ROW_LEVEL_START, std::move(id), repair_meta_id, std::move(keyspace_name), std::move(cf_name), std::move(range), algo, max_row_buf_size, seed, remote_shard, remote_shard_count, remote_ignore_msb, std::move(remote_partitioner_name), std::move(schema_version), reason); // reason goes last, lining up with the trailing rpc::optional<streaming::stream_reason> of the registered handler
 }

 // Wrapper for REPAIR_ROW_LEVEL_STOP
@@ -1281,6 +1307,19 @@ future<> messaging_service::send_paxos_learn(msg_addr id, clock_type::time_point
        std::move(reply_to), shard, std::move(response_id), std::move(trace_info));
 }

+void messaging_service::register_paxos_prune(std::function<future<rpc::no_wait_type>(
+        const rpc::client_info&, rpc::opt_time_point, UUID schema_id, partition_key key, utils::UUID ballot, std::optional<tracing::trace_info>)>&& func) {
+    register_handler(this, messaging_verb::PAXOS_PRUNE, std::move(func)); // installs func for PAXOS_PRUNE; the handler resolves to rpc::no_wait_type, pairing with the one-way send in send_paxos_prune()
+}
+future<> messaging_service::unregister_paxos_prune() { // counterpart of register_paxos_prune()
+    return unregister_handler(netw::messaging_verb::PAXOS_PRUNE); // the caller receives unregister_handler()'s future
+}
+future<>
+messaging_service::send_paxos_prune(gms::inet_address peer, clock_type::time_point timeout, UUID schema_id,
+        const partition_key& key, utils::UUID ballot, std::optional<tracing::trace_info> trace_info) {
+    return send_message_oneway_timeout(this, timeout, messaging_verb::PAXOS_PRUNE, netw::msg_addr(peer), schema_id, key, ballot, std::move(trace_info)); // one-way send with `timeout`; peer is wrapped in a netw::msg_addr, trace_info is moved into the message
+}
+
 void messaging_service::register_hint_mutation(std::function<future<rpc::no_wait_type> (const rpc::client_info&, rpc::opt_time_point, frozen_mutation fm, std::vector<inet_address> forward,
        inet_address reply_to, unsigned shard, response_id_type response_id, rpc::optional<std::optional<tracing::trace_info>> trace_info)>&& func) {
    register_handler(this, netw::messaging_verb::HINT_MUTATION, std::move(func));
--- a/message/messaging_service.hh
+++ b/message/messaging_service.hh
@@ -139,7 +139,8 @@ enum class messaging_verb : int32_t {
    PAXOS_ACCEPT = 40,
    PAXOS_LEARN = 41,
    HINT_MUTATION = 42,
-    LAST = 43,
+    PAXOS_PRUNE = 43,
+    LAST = 44,
 };

 } // namespace netw
@@ -274,10 +275,12 @@ public:
            streaming::prepare_message msg, UUID plan_id, sstring description, rpc::optional<streaming::stream_reason> reason)>&& func);
    future<streaming::prepare_message> send_prepare_message(msg_addr id, streaming::prepare_message msg, UUID plan_id,
            sstring description, streaming::stream_reason);
+    future<> unregister_prepare_message();

    // Wrapper for PREPARE_DONE_MESSAGE verb
    void register_prepare_done_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func);
    future<> send_prepare_done_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id);
+    future<> unregister_prepare_done_message();

    // Wrapper for STREAM_MUTATION verb
    void register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool>, rpc::optional<streaming::stream_reason>)>&& func);
@@ -286,6 +289,7 @@ public:
    // Wrapper for STREAM_MUTATION_FRAGMENTS
    // The receiver of STREAM_MUTATION_FRAGMENTS sends status code to the sender to notify any error on the receiver side. The status code is of type int32_t. 0 means successful, -1 means error, other status code value are reserved for future use.
    void register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason> reason_opt, rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>> source)>&& func);
+    future<> unregister_stream_mutation_fragments();
    rpc::sink<int32_t> make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>>& source);
    future<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>> make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id);

@@ -293,22 +297,27 @@ public:
    future<rpc::sink<repair_hash_with_cmd>, rpc::source<repair_row_on_wire_with_cmd>> make_sink_and_source_for_repair_get_row_diff_with_rpc_stream(uint32_t repair_meta_id, msg_addr id);
    rpc::sink<repair_row_on_wire_with_cmd> make_sink_for_repair_get_row_diff_with_rpc_stream(rpc::source<repair_hash_with_cmd>& source);
    void register_repair_get_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_row_on_wire_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_hash_with_cmd> source)>&& func);
+    future<> unregister_repair_get_row_diff_with_rpc_stream();

    // Wrapper for REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM
    future<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_stream_cmd>> make_sink_and_source_for_repair_put_row_diff_with_rpc_stream(uint32_t repair_meta_id, msg_addr id);
    rpc::sink<repair_stream_cmd> make_sink_for_repair_put_row_diff_with_rpc_stream(rpc::source<repair_row_on_wire_with_cmd>& source);
    void register_repair_put_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_stream_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_row_on_wire_with_cmd> source)>&& func);
+    future<> unregister_repair_put_row_diff_with_rpc_stream();

    // Wrapper for REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM
    future<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd>> make_sink_and_source_for_repair_get_full_row_hashes_with_rpc_stream(uint32_t repair_meta_id, msg_addr id);
    rpc::sink<repair_hash_with_cmd> make_sink_for_repair_get_full_row_hashes_with_rpc_stream(rpc::source<repair_stream_cmd>& source);
    void register_repair_get_full_row_hashes_with_rpc_stream(std::function<future<rpc::sink<repair_hash_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_stream_cmd> source)>&& func);
+    future<> unregister_repair_get_full_row_hashes_with_rpc_stream();

    void register_stream_mutation_done(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id)>&& func);
    future<> send_stream_mutation_done(msg_addr id, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id);
+    future<> unregister_stream_mutation_done();

    void register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id, rpc::optional<bool> failed)>&& func);
    future<> send_complete_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id, bool failed = false);
+    future<> unregister_complete_message();

    // Wrapper for REPAIR_CHECKSUM_RANGE verb
    void register_repair_checksum_range(std::function<future<partition_checksum> (sstring keyspace, sstring cf, dht::token_range range, rpc::optional<repair_checksum> hash_version)>&& func);
@@ -316,9 +325,9 @@ public:
    future<partition_checksum> send_repair_checksum_range(msg_addr id, sstring keyspace, sstring cf, dht::token_range range, repair_checksum hash_version);

    // Wrapper for REPAIR_GET_FULL_ROW_HASHES
-    void register_repair_get_full_row_hashes(std::function<future<std::unordered_set<repair_hash>> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func);
+    void register_repair_get_full_row_hashes(std::function<future<repair_hash_set> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func);
    future<> unregister_repair_get_full_row_hashes();
-    future<std::unordered_set<repair_hash>> send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id);
+    future<repair_hash_set> send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id);

    // Wrapper for REPAIR_GET_COMBINED_ROW_HASH
    void register_repair_get_combined_row_hash(std::function<future<get_combined_row_hash_response> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::optional<repair_sync_boundary> common_sync_boundary)>&& func);
@@ -331,9 +340,9 @@ public:
    future<get_sync_boundary_response> send_repair_get_sync_boundary(msg_addr id, uint32_t repair_meta_id, std::optional<repair_sync_boundary> skipped_sync_boundary);

    // Wrapper for REPAIR_GET_ROW_DIFF
-    void register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows)>&& func);
+    void register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows)>&& func);
    future<> unregister_repair_get_row_diff();
-    future<repair_rows_on_wire> send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows);
+    future<repair_rows_on_wire> send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows);

    // Wrapper for REPAIR_PUT_ROW_DIFF
    void register_repair_put_row_diff(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_rows_on_wire row_diff)>&& func);
@@ -341,9 +350,9 @@ public:
    future<> send_repair_put_row_diff(msg_addr id, uint32_t repair_meta_id, repair_rows_on_wire row_diff);

    // Wrapper for REPAIR_ROW_LEVEL_START
-    void register_repair_row_level_start(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version)>&& func);
+    void register_repair_row_level_start(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version, rpc::optional<streaming::stream_reason> reason)>&& func);
    future<> unregister_repair_row_level_start();
-    future<> send_repair_row_level_start(msg_addr id, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version);
+    future<> send_repair_row_level_start(msg_addr id, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version, streaming::stream_reason reason);

    // Wrapper for REPAIR_ROW_LEVEL_STOP
    void register_repair_row_level_stop(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range)>&& func);
@@ -493,6 +502,14 @@ public:
            std::vector<inet_address> forward, inet_address reply_to, unsigned shard, response_id_type response_id,
            std::optional<tracing::trace_info> trace_info = std::nullopt);

+    void register_paxos_prune(std::function<future<rpc::no_wait_type>(const rpc::client_info&, rpc::opt_time_point, UUID schema_id, partition_key key,
+            utils::UUID ballot, std::optional<tracing::trace_info>)>&& func);
+
+    future<> unregister_paxos_prune();
+
+    future<> send_paxos_prune(gms::inet_address peer, clock_type::time_point timeout, UUID schema_id, const partition_key& key,
+            utils::UUID ballot, std::optional<tracing::trace_info> trace_info);
+
    void register_hint_mutation(std::function<future<rpc::no_wait_type> (const rpc::client_info&, rpc::opt_time_point, frozen_mutation fm, std::vector<inet_address> forward,
        inet_address reply_to, unsigned shard, response_id_type response_id, rpc::optional<std::optional<tracing::trace_info>> trace_info)>&& func);
    future<> unregister_hint_mutation();
--- a/multishard_mutation_query.cc
+++ b/multishard_mutation_query.cc
@@ -195,6 +195,7 @@ class read_context : public reader_lifecycle_policy {

    // One for each shard. Index is shard id.
    std::vector<reader_meta> _readers;
+    std::vector<reader_concurrency_semaphore*> _semaphores;

    gate _dismantling_gate;

@@ -211,7 +212,8 @@ public:
            , _schema(std::move(s))
            , _cmd(cmd)
            , _ranges(ranges)
-            , _trace_state(std::move(trace_state)) {
+            , _trace_state(std::move(trace_state))
+            , _semaphores(smp::count, nullptr) {
        _readers.resize(smp::count);
    }

@@ -236,7 +238,12 @@ public:
    virtual void destroy_reader(shard_id shard, future<stopped_reader> reader_fut) noexcept override;

    virtual reader_concurrency_semaphore& semaphore() override {
-        return _readers[engine().cpu_id()].rparts->semaphore;
+        const auto shard = engine().cpu_id();
+        if (!_semaphores[shard]) {
+            auto& table = _db.local().find_column_family(_schema);
+            _semaphores[shard] = &table.read_concurrency_semaphore();
+        }
+        return *_semaphores[shard];
    }

    future<> lookup_readers();
--- a/mutation_partition.cc
+++ b/mutation_partition.cc
@@ -1721,7 +1721,7 @@ void row::apply_monotonically(const schema& s, column_kind kind, row&& other) {
 // we erase the live cells according to the shadowable_tombstone rules.
 static bool dead_marker_shadows_row(const schema& s, column_kind kind, const row_marker& marker) {
    return s.is_view()
-            && !s.view_info()->base_non_pk_columns_in_view_pk().empty()
+            && s.view_info()->has_base_non_pk_columns_in_view_pk()
            && !marker.is_live()
            && kind == column_kind::regular_column; // not applicable to static rows
 }
@@ -2505,7 +2505,8 @@ mutation_partition::fully_discontinuous(const schema& s, const position_range& r
 future<mutation_opt> counter_write_query(schema_ptr s, const mutation_source& source,
                                         const dht::decorated_key& dk,
                                         const query::partition_slice& slice,
-                                         tracing::trace_state_ptr trace_ptr)
+                                         tracing::trace_state_ptr trace_ptr,
+                                         db::timeout_clock::time_point timeout)
 {
    struct range_and_reader {
        dht::partition_range range;
@@ -2530,7 +2531,7 @@ future<mutation_opt> counter_write_query(schema_ptr s, const mutation_source& so
    auto cwqrb = counter_write_query_result_builder(*s);
    auto cfq = make_stable_flattened_mutations_consumer<compact_for_query<emit_only_live_rows::yes, counter_write_query_result_builder>>(
            *s, gc_clock::now(), slice, query::max_rows, query::max_rows, std::move(cwqrb));
-    auto f = r_a_r->reader.consume(std::move(cfq), db::no_timeout);
+    auto f = r_a_r->reader.consume(std::move(cfq), timeout);
    return f.finally([r_a_r = std::move(r_a_r)] { });
 }

@@ -2605,7 +2606,7 @@ void mutation_cleaner_impl::start_worker() {
 stop_iteration mutation_cleaner_impl::merge_some(partition_snapshot& snp) noexcept {
    auto&& region = snp.region();
    return with_allocator(region.allocator(), [&] {
-        return with_linearized_managed_bytes([&] {
+        {
            // Allocating sections require the region to be reclaimable
            // which means that they cannot be nested.
            // It is, however, possible, that if the snapshot is taken
@@ -2617,13 +2618,15 @@ stop_iteration mutation_cleaner_impl::merge_some(partition_snapshot& snp) noexce
            }
            try {
                return _worker_state->alloc_section(region, [&] {
+                  return with_linearized_managed_bytes([&] {
                    return snp.merge_partition_versions(_app_stats);
+                  });
                });
            } catch (...) {
                // Merging failed, give up as there is no guarantee of forward progress.
                return stop_iteration::yes;
            }
-        });
+        }
    });
 }

--- a/mutation_query.hh
+++ b/mutation_query.hh
@@ -113,9 +113,6 @@ class reconcilable_result_builder {
    const schema& _schema;
    const query::partition_slice& _slice;

-    utils::chunked_vector<partition> _result;
-    uint32_t _live_rows{};
-
    bool _return_static_content_on_partition_with_no_rows{};
    bool _static_row_is_alive{};
    uint32_t _total_live_rows = 0;
@@ -123,6 +120,10 @@ class reconcilable_result_builder {
    stop_iteration _stop;
    bool _short_read_allowed;
    std::optional<streamed_mutation_freezer> _mutation_consumer;
+
+    uint32_t _live_rows{};
+    // make this the last member so it is destroyed first. #7240
+    utils::chunked_vector<partition> _result;
 public:
    reconcilable_result_builder(const schema& s, const query::partition_slice& slice,
                                query::result_memory_accounter&& accounter)
@@ -206,5 +207,6 @@ public:
 future<mutation_opt> counter_write_query(schema_ptr, const mutation_source&,
                                         const dht::decorated_key& dk,
                                         const query::partition_slice& slice,
-                                         tracing::trace_state_ptr trace_ptr);
+                                         tracing::trace_state_ptr trace_ptr,
+                                         db::timeout_clock::time_point timeout);

--- a/mutation_reader.cc
+++ b/mutation_reader.cc
--- a/mutation_reader.hh
+++ b/mutation_reader.hh
@@ -372,6 +372,64 @@ flat_mutation_reader make_foreign_reader(schema_ptr schema,
        foreign_ptr<std::unique_ptr<flat_mutation_reader>> reader,
        streamed_mutation::forwarding fwd_sm = streamed_mutation::forwarding::no);

+/// Make an auto-paused evictable reader.
+///
+/// The reader is paused after each use, that is, after each call to any of its
+/// members that cause actual reading to be done (`fill_buffer()` and
+/// `fast_forward_to()`). When paused, the reader is made evictable, that is,
+/// it is registered with reader concurrency semaphore as an inactive read.
+/// The reader is resumed automatically on the next use. If it was evicted, it
+/// will be recreated at the position it left off reading. This is all
+/// transparent to its user.
+/// Parameters passed by reference have to be kept alive while the reader is
+/// alive.
+flat_mutation_reader make_auto_paused_evictable_reader(
+        mutation_source ms,
+        schema_ptr schema,
+        reader_concurrency_semaphore& semaphore,
+        const dht::partition_range& pr,
+        const query::partition_slice& ps,
+        const io_priority_class& pc,
+        tracing::trace_state_ptr trace_state,
+        mutation_reader::forwarding fwd_mr);
+
+class evictable_reader;
+
+class evictable_reader_handle {
+    friend std::pair<flat_mutation_reader, evictable_reader_handle> make_manually_paused_evictable_reader(mutation_source, schema_ptr, reader_concurrency_semaphore&,
+            const dht::partition_range&, const query::partition_slice&, const io_priority_class&, tracing::trace_state_ptr, mutation_reader::forwarding);
+
+private:
+    evictable_reader* _r;
+
+private:
+    explicit evictable_reader_handle(evictable_reader& r);
+
+public:
+    void pause();
+};
+
+/// Make a manually-paused evictable reader.
+///
+/// The reader can be paused via the evictable reader handle when desired. The
+/// intended usage is subsequent reads done in bursts, after which the reader is
+/// not used for some time. When paused, the reader is made evictable, that is,
+/// it is registered with reader concurrency semaphore as an inactive read.
+/// The reader is resumed automatically on the next use. If it was evicted, it
+/// will be recreated at the position it left off reading. This is all
+/// transparent to its user.
+/// Parameters passed by reference have to be kept alive while the reader is
+/// alive.
+std::pair<flat_mutation_reader, evictable_reader_handle> make_manually_paused_evictable_reader(
+        mutation_source ms,
+        schema_ptr schema,
+        reader_concurrency_semaphore& semaphore,
+        const dht::partition_range& pr,
+        const query::partition_slice& ps,
+        const io_priority_class& pc,
+        tracing::trace_state_ptr trace_state,
+        mutation_reader::forwarding fwd_mr);
+
 /// Reader lifecycle policy for the mulitshard combining reader.
 ///
 /// This policy is expected to make sure any additional resource the readers
--- a/mutation_writer/multishard_writer.cc
+++ b/mutation_writer/multishard_writer.cc
@@ -173,6 +173,13 @@ future<> multishard_writer::distribute_mutation_fragments() {
                return handle_end_of_stream();
            }
        });
+    }).handle_exception([this] (std::exception_ptr ep) {
+        for (auto& q : _queue_reader_handles) {
+            if (q) {
+                q->abort(ep);
+            }
+        }
+        return make_exception_future<>(std::move(ep));
    });
 }

--- a/position_in_partition.hh
+++ b/position_in_partition.hh
@@ -163,6 +163,11 @@ public:
        return {partition_region::clustered, bound_weight::before_all_prefixed, &ck};
    }

+    // Returns a view to before_key(pos._ck) if pos.is_clustering_row() else returns pos as-is.
+    static position_in_partition_view before_key(position_in_partition_view pos) {
+        return {partition_region::clustered, pos._bound_weight == bound_weight::equal ? bound_weight::before_all_prefixed : pos._bound_weight, pos._ck};
+    }
+
    partition_region region() const { return _type; }
    bound_weight get_bound_weight() const { return _bound_weight; }
    bool is_partition_start() const { return _type == partition_region::partition_start; }
--- a/reader_concurrency_semaphore.cc
+++ b/reader_concurrency_semaphore.cc
@@ -27,6 +27,7 @@


 reader_permit::impl::impl(reader_concurrency_semaphore& semaphore, reader_resources base_cost) : semaphore(semaphore), base_cost(base_cost) {
+    semaphore.consume(base_cost);
 }

 reader_permit::impl::~impl() {
@@ -88,7 +89,6 @@ void reader_concurrency_semaphore::signal(const resources& r) noexcept {
    _resources += r;
    while (!_wait_list.empty() && has_available_units(_wait_list.front().res)) {
        auto& x = _wait_list.front();
-        _resources -= x.res;
        try {
            x.pr.set_value(reader_permit(*this, x.res));
        } catch (...) {
@@ -160,7 +160,6 @@ future<reader_permit> reader_concurrency_semaphore::wait_admission(size_t memory
        --_inactive_read_stats.population;
    }
    if (may_proceed(r)) {
-        _resources -= r;
        return make_ready_future<reader_permit>(reader_permit(*this, r));
    }
    promise<reader_permit> pr;
@@ -170,7 +169,6 @@ future<reader_permit> reader_concurrency_semaphore::wait_admission(size_t memory
 }

 reader_permit reader_concurrency_semaphore::consume_resources(resources r) {
-    _resources -= r;
    return reader_permit(*this, r);
 }

--- a/reader_concurrency_semaphore.hh
+++ b/reader_concurrency_semaphore.hh
@@ -128,6 +128,10 @@ private:
        return has_available_units(r) && _wait_list.empty();
    }

+    void consume(resources r) {
+        _resources -= r;
+    }
+
    void consume_memory(size_t memory) {
        _resources.memory -= memory;
    }
--- a/reloc/get-dynamic-linker.sh
+++ b/reloc/get-dynamic-linker.sh
@@ -12,7 +12,11 @@
 # At the end of the build we check that the build-id is indeed in the
 # first page. At install time we check that patchelf doesn't modify
 # the program headers.
+
+# gdb has a SO_NAME_MAX_PATH_SIZE of 512, so limit the path size to
+# that. The 512 includes the null at the end, hence the 511 below.
+
 ORIGINAL_DYNAMIC_LINKER=$(gcc -### /dev/null -o t 2>&1 | perl -n  -e '/-dynamic-linker ([^ ]*) / && print $1')
-DYNAMIC_LINKER=$(printf "%2000s$ORIGINAL_DYNAMIC_LINKER" | sed 's| |/|g')
+DYNAMIC_LINKER=$(printf "%511s$ORIGINAL_DYNAMIC_LINKER" | sed 's| |/|g')

 echo $DYNAMIC_LINKER
--- a/repair/repair.cc
+++ b/repair/repair.cc
@@ -672,7 +672,8 @@ repair_info::repair_info(seastar::sharded<database>& db_,
    const std::vector<sstring>& cfs_,
    int id_,
    const std::vector<sstring>& data_centers_,
-    const std::vector<sstring>& hosts_)
+    const std::vector<sstring>& hosts_,
+    streaming::stream_reason reason_)
    : db(db_)
    , partitioner(get_partitioner_for_tables(db_, keyspace_, cfs_))
    , keyspace(keyspace_)
@@ -682,6 +683,7 @@ repair_info::repair_info(seastar::sharded<database>& db_,
    , shard(engine().cpu_id())
    , data_centers(data_centers_)
    , hosts(hosts_)
+    , reason(reason_)
    , _row_level_repair(db.local().features().cluster_supports_row_level_repair()) {
 }

@@ -1462,7 +1464,7 @@ static int do_repair_start(seastar::sharded<database>& db, sstring keyspace,
                    data_centers = options.data_centers, hosts = options.hosts] (database& localdb) mutable {
                auto ri = make_lw_shared<repair_info>(db,
                        std::move(keyspace), std::move(ranges), std::move(cfs),
-                        id, std::move(data_centers), std::move(hosts));
+                        id, std::move(data_centers), std::move(hosts), streaming::stream_reason::repair);
                return repair_ranges(ri);
            });
            repair_results.push_back(std::move(f));
@@ -1524,14 +1526,15 @@ future<> repair_abort_all(seastar::sharded<database>& db) {
 future<> sync_data_using_repair(seastar::sharded<database>& db,
        sstring keyspace,
        dht::token_range_vector ranges,
-        std::unordered_map<dht::token_range, repair_neighbors> neighbors) {
+        std::unordered_map<dht::token_range, repair_neighbors> neighbors,
+        streaming::stream_reason reason) {
    if (ranges.empty()) {
        return make_ready_future<>();
    }
-    return smp::submit_to(0, [&db, keyspace = std::move(keyspace), ranges = std::move(ranges), neighbors = std::move(neighbors)] () mutable {
+    return smp::submit_to(0, [&db, keyspace = std::move(keyspace), ranges = std::move(ranges), neighbors = std::move(neighbors), reason] () mutable {
        int id = repair_tracker().next_repair_command();
        rlogger.info("repair id {} to sync data for keyspace={}, status=started", id, keyspace);
-        return repair_tracker().run(id, [id, &db, keyspace, ranges = std::move(ranges), neighbors = std::move(neighbors)] () mutable {
+        return repair_tracker().run(id, [id, &db, keyspace, ranges = std::move(ranges), neighbors = std::move(neighbors), reason] () mutable {
            auto cfs = list_column_families(db.local(), keyspace);
            if (cfs.empty()) {
                rlogger.warn("repair id {} to sync data for keyspace={}, no table in this keyspace", id, keyspace);
@@ -1540,12 +1543,12 @@ future<> sync_data_using_repair(seastar::sharded<database>& db,
            std::vector<future<>> repair_results;
            repair_results.reserve(smp::count);
            for (auto shard : boost::irange(unsigned(0), smp::count)) {
-                auto f = db.invoke_on(shard, [keyspace, cfs, id, ranges, neighbors] (database& localdb) mutable {
+                auto f = db.invoke_on(shard, [keyspace, cfs, id, ranges, neighbors, reason] (database& localdb) mutable {
                    auto data_centers = std::vector<sstring>();
                    auto hosts = std::vector<sstring>();
                    auto ri = make_lw_shared<repair_info>(service::get_local_storage_service().db(),
                            std::move(keyspace), std::move(ranges), std::move(cfs),
-                            id, std::move(data_centers), std::move(hosts));
+                            id, std::move(data_centers), std::move(hosts), reason);
                    ri->neighbors = std::move(neighbors);
                    return repair_ranges(ri);
                });
@@ -1584,6 +1587,7 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, locator::token_me
        auto keyspaces = db.local().get_non_system_keyspaces();
        rlogger.info("bootstrap_with_repair: started with keyspaces={}", keyspaces);
        auto myip = utils::fb_utilities::get_broadcast_address();
+        auto reason = streaming::stream_reason::bootstrap;
        for (auto& keyspace_name : keyspaces) {
            if (!db.local().has_keyspace(keyspace_name)) {
                rlogger.info("bootstrap_with_repair: keyspace={} does not exist any more, ignoring it", keyspace_name);
@@ -1716,7 +1720,7 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, locator::token_me
                }
            }
            auto nr_ranges = desired_ranges.size();
-            sync_data_using_repair(db, keyspace_name, std::move(desired_ranges), std::move(range_sources)).get();
+            sync_data_using_repair(db, keyspace_name, std::move(desired_ranges), std::move(range_sources), reason).get();
            rlogger.info("bootstrap_with_repair: finished with keyspace={}, nr_ranges={}", keyspace_name, nr_ranges);
        }
        rlogger.info("bootstrap_with_repair: finished with keyspaces={}", keyspaces);
@@ -1730,6 +1734,7 @@ future<> do_decommission_removenode_with_repair(seastar::sharded<database>& db,
        auto keyspaces = db.local().get_non_system_keyspaces();
        bool is_removenode = myip != leaving_node;
        auto op = is_removenode ? "removenode_with_repair" : "decommission_with_repair";
+        streaming::stream_reason reason = is_removenode ? streaming::stream_reason::removenode : streaming::stream_reason::decommission;
        rlogger.info("{}: started with keyspaces={}, leaving_node={}", op, keyspaces, leaving_node);
        for (auto& keyspace_name : keyspaces) {
            if (!db.local().has_keyspace(keyspace_name)) {
@@ -1867,7 +1872,7 @@ future<> do_decommission_removenode_with_repair(seastar::sharded<database>& db,
                ranges.swap(ranges_for_removenode);
            }
            auto nr_ranges_synced = ranges.size();
-            sync_data_using_repair(db, keyspace_name, std::move(ranges), std::move(range_sources)).get();
+            sync_data_using_repair(db, keyspace_name, std::move(ranges), std::move(range_sources), reason).get();
            rlogger.info("{}: finished with keyspace={}, leaving_node={}, nr_ranges={}, nr_ranges_synced={}, nr_ranges_skipped={}",
                op, keyspace_name, leaving_node, nr_ranges_total, nr_ranges_synced, nr_ranges_skipped);
        }
@@ -1883,8 +1888,8 @@ future<> removenode_with_repair(seastar::sharded<database>& db, locator::token_m
    return do_decommission_removenode_with_repair(db, std::move(tm), std::move(leaving_node));
 }

-future<> do_rebuild_replace_with_repair(seastar::sharded<database>& db, locator::token_metadata tm, sstring op, sstring source_dc) {
-    return seastar::async([&db, tm = std::move(tm), source_dc = std::move(source_dc), op = std::move(op)] () mutable {
+future<> do_rebuild_replace_with_repair(seastar::sharded<database>& db, locator::token_metadata tm, sstring op, sstring source_dc, streaming::stream_reason reason) {
+    return seastar::async([&db, tm = std::move(tm), source_dc = std::move(source_dc), op = std::move(op), reason] () mutable {
        auto keyspaces = db.local().get_non_system_keyspaces();
        rlogger.info("{}: started with keyspaces={}, source_dc={}", op, keyspaces, source_dc);
        auto myip = utils::fb_utilities::get_broadcast_address();
@@ -1921,7 +1926,7 @@ future<> do_rebuild_replace_with_repair(seastar::sharded<database>& db, locator:
                }
            }
            auto nr_ranges = ranges.size();
-            sync_data_using_repair(db, keyspace_name, std::move(ranges), std::move(range_sources)).get();
+            sync_data_using_repair(db, keyspace_name, std::move(ranges), std::move(range_sources), reason).get();
            rlogger.info("{}: finished with keyspace={}, source_dc={}, nr_ranges={}", op, keyspace_name, source_dc, nr_ranges);
        }
        rlogger.info("{}: finished with keyspaces={}, source_dc={}", op, keyspaces, source_dc);
@@ -1933,11 +1938,13 @@ future<> rebuild_with_repair(seastar::sharded<database>& db, locator::token_meta
    if (source_dc.empty()) {
        source_dc = get_local_dc();
    }
-    return do_rebuild_replace_with_repair(db, std::move(tm), std::move(op), std::move(source_dc));
+    auto reason = streaming::stream_reason::rebuild;
+    return do_rebuild_replace_with_repair(db, std::move(tm), std::move(op), std::move(source_dc), reason);
 }

 future<> replace_with_repair(seastar::sharded<database>& db, locator::token_metadata tm) {
    auto op = sstring("replace_with_repair");
    auto source_dc = get_local_dc();
-    return do_rebuild_replace_with_repair(db, std::move(tm), std::move(op), std::move(source_dc));
+    auto reason = streaming::stream_reason::bootstrap;
+    return do_rebuild_replace_with_repair(db, std::move(tm), std::move(op), std::move(source_dc), reason);
 }
--- a/repair/repair.hh
+++ b/repair/repair.hh
@@ -23,6 +23,7 @@

 #include <unordered_map>
 #include <exception>
+#include <absl/container/btree_set.h>

 #include <seastar/core/sstring.hh>
 #include <seastar/core/sharded.hh>
@@ -181,6 +182,7 @@ public:
    shard_id shard;
    std::vector<sstring> data_centers;
    std::vector<sstring> hosts;
+    streaming::stream_reason reason;
    std::unordered_map<dht::token_range, repair_neighbors> neighbors;
    size_t nr_failed_ranges = 0;
    bool aborted = false;
@@ -211,7 +213,8 @@ public:
            const std::vector<sstring>& cfs_,
            int id_,
            const std::vector<sstring>& data_centers_,
-            const std::vector<sstring>& hosts_);
+            const std::vector<sstring>& hosts_,
+            streaming::stream_reason reason_);
    future<> do_streaming();
    void check_failed_ranges();
    future<> request_transfer_ranges(const sstring& cf,
@@ -332,6 +335,8 @@ public:
    }
 };

+using repair_hash_set = absl::btree_set<repair_hash>;
+
 // Return value of the REPAIR_GET_SYNC_BOUNDARY RPC verb
 struct get_sync_boundary_response {
    std::optional<repair_sync_boundary> boundary;
--- a/repair/row_level.cc
+++ b/repair/row_level.cc
@@ -47,6 +47,7 @@
 #include "gms/gossiper.hh"
 #include "repair/row_level.hh"
 #include "mutation_source_metadata.hh"
+#include "utils/stall_free.hh"

 extern logging::logger rlogger;

@@ -373,6 +374,7 @@ private:
    std::optional<utils::phased_barrier::operation> _local_read_op;
    // Local reader or multishard reader to read the range
    flat_mutation_reader _reader;
+    std::optional<evictable_reader_handle> _reader_handle;
    // Current partition read from disk
    lw_shared_ptr<const decorated_key_with_hash> _current_dk;

@@ -392,32 +394,49 @@ public:
            , _sharder(remote_partitioner, range, remote_shard)
            , _seed(seed)
            , _local_read_op(local_reader ? std::optional(cf.read_in_progress()) : std::nullopt)
-            , _reader(make_reader(db, cf, local_reader)) {
-    }
-
-private:
-    flat_mutation_reader
-    make_reader(seastar::sharded<database>& db,
-            column_family& cf,
-            is_local_reader local_reader) {
+            , _reader(nullptr) {
        if (local_reader) {
-            return cf.make_streaming_reader(_schema, _range);
+            auto ms = mutation_source([&cf] (
+                        schema_ptr s,
+                        reader_permit,
+                        const dht::partition_range& pr,
+                        const query::partition_slice& ps,
+                        const io_priority_class& pc,
+                        tracing::trace_state_ptr,
+                        streamed_mutation::forwarding,
+                        mutation_reader::forwarding fwd_mr) {
+                return cf.make_streaming_reader(std::move(s), pr, ps, fwd_mr);
+            });
+            std::tie(_reader, _reader_handle) = make_manually_paused_evictable_reader(
+                    std::move(ms),
+                    _schema,
+                    cf.streaming_read_concurrency_semaphore(),
+                    _range,
+                    _schema->full_slice(),
+                    service::get_local_streaming_read_priority(),
+                    {},
+                    mutation_reader::forwarding::no);
+        } else {
+            _reader = make_multishard_streaming_reader(db, _schema, [this] {
+                auto shard_range = _sharder.next();
+                if (shard_range) {
+                    return std::optional<dht::partition_range>(dht::to_partition_range(*shard_range));
+                }
+                return std::optional<dht::partition_range>();
+            });
        }
-        return make_multishard_streaming_reader(db, _schema, [this] {
-            auto shard_range = _sharder.next();
-            if (shard_range) {
-                return std::optional<dht::partition_range>(dht::to_partition_range(*shard_range));
-            }
-            return std::optional<dht::partition_range>();
-        });
    }

-public:
    future<mutation_fragment_opt>
    read_mutation_fragment() {
        return _reader(db::no_timeout);
    }

+    void on_end_of_stream() { // Drop the underlying reader; called when the read loop sees an empty fragment or fails.
+        _reader = make_empty_flat_reader(_schema); // Replace the reader with an empty one built from the same schema.
+        _reader_handle.reset(); // Clear the optional handle so that pause() becomes a no-op.
+    }
+
    lw_shared_ptr<const decorated_key_with_hash>& get_current_dk() {
        return _current_dk;
    }
@@ -436,6 +455,11 @@ public:
        }
    }

+    void pause() { // Pause the reader through its handle, if a handle exists.
+        if (_reader_handle) { // The handle is only assigned on the local_reader path of the constructor.
+            _reader_handle->pause();
+        }
+    }
 };

 class repair_writer {
@@ -443,7 +467,7 @@ class repair_writer {
    uint64_t _estimated_partitions;
    size_t _nr_peer_nodes;
    // Needs more than one for repair master
-    std::vector<std::optional<future<uint64_t>>> _writer_done;
+    std::vector<std::optional<future<>>> _writer_done;
    std::vector<std::optional<seastar::queue<mutation_fragment_opt>>> _mq;
    // Current partition written to disk
    std::vector<lw_shared_ptr<const decorated_key_with_hash>> _current_dk_written_to_sstable;
@@ -451,14 +475,18 @@ class repair_writer {
    // partition_start is written and is closed when a partition_end is
    // written.
    std::vector<bool> _partition_opened;
+    streaming::stream_reason _reason;
+    named_semaphore _sem{1, named_semaphore_exception_factory{"repair_writer"}};
 public:
    repair_writer(
            schema_ptr schema,
            uint64_t estimated_partitions,
-            size_t nr_peer_nodes)
+            size_t nr_peer_nodes,
+            streaming::stream_reason reason) // Stream reason this writer operates under (replaces the previously hard-coded repair reason).
            : _schema(std::move(schema))
            , _estimated_partitions(estimated_partitions)
-            , _nr_peer_nodes(nr_peer_nodes) {
+            , _nr_peer_nodes(nr_peer_nodes)
+            , _reason(reason) { // Stored so it can later be passed to db::view::check_needs_view_update_path().
        init_writer();
    }

@@ -495,9 +523,9 @@ public:
        table& t = db.local().find_column_family(_schema->id());
        _writer_done[node_idx] = mutation_writer::distribute_reader_and_consume_on_shards(_schema,
                make_generating_reader(_schema, std::move(get_next_mutation_fragment)),
-                [&db, estimated_partitions = this->_estimated_partitions] (flat_mutation_reader reader) {
+                [&db, reason = this->_reason, estimated_partitions = this->_estimated_partitions] (flat_mutation_reader reader) {
            auto& t = db.local().find_column_family(reader.schema());
-            return db::view::check_needs_view_update_path(_sys_dist_ks->local(), t, streaming::stream_reason::repair).then([t = t.shared_from_this(), estimated_partitions, reader = std::move(reader)] (bool use_view_update_path) mutable {
+            return db::view::check_needs_view_update_path(_sys_dist_ks->local(), t, reason).then([t = t.shared_from_this(), estimated_partitions, reader = std::move(reader)] (bool use_view_update_path) mutable {
                //FIXME: for better estimations this should be transmitted from remote
                auto metadata = mutation_source_metadata{};
                auto& cs = t->get_compaction_strategy();
@@ -523,7 +551,15 @@ public:
                return consumer(std::move(reader));
            });
        },
-        t.stream_in_progress());
+        t.stream_in_progress()).then([this, node_idx] (uint64_t partitions) {
+            rlogger.debug("repair_writer: keyspace={}, table={}, managed to write partitions={} to sstable",
+                _schema->ks_name(), _schema->cf_name(), partitions);
+        }).handle_exception([this, node_idx] (std::exception_ptr ep) {
+            rlogger.warn("repair_writer: keyspace={}, table={}, multishard_writer failed: {}",
+                    _schema->ks_name(), _schema->cf_name(), ep);
+            _mq[node_idx]->abort(ep);
+            return make_exception_future<>(std::move(ep));
+        });
    }

    future<> write_partition_end(unsigned node_idx) {
@@ -550,23 +586,41 @@ public:
        }
    }

+    future<> write_end_of_stream(unsigned node_idx) { // Terminate the mutation fragment stream queued for peer node_idx.
+        if (_mq[node_idx]) { // Only act when the optional queue for this peer is engaged.
+          return with_semaphore(_sem, 1, [this, node_idx] { // Takes one unit of _sem, the same semaphore do_apply_rows() acquires through sem().
+            // Partition_end is never sent on wire, so we have to write one ourselves.
+            return write_partition_end(node_idx).then([this, node_idx] () mutable {
+                // Empty mutation_fragment_opt means no more data, so the writer can seal the sstables.
+                return _mq[node_idx]->push_eventually(mutation_fragment_opt());
+            });
+          });
+        } else {
+            return make_ready_future<>(); // No queue for this peer: nothing to terminate.
+        }
+    }
+
+    future<> do_wait_for_writer_done(unsigned node_idx) { // Return the completion future recorded for the writer of peer node_idx.
+        if (_writer_done[node_idx]) {
+            return std::move(*(_writer_done[node_idx])); // Moves the stored future out of the optional slot.
+        } else {
+            return make_ready_future<>(); // No writer future was recorded for this peer.
+        }
+    }
+
    future<> wait_for_writer_done() {
        return parallel_for_each(boost::irange(unsigned(0), unsigned(_nr_peer_nodes)), [this] (unsigned node_idx) {
-            if (_writer_done[node_idx] && _mq[node_idx]) {
-                // Partition_end is never sent on wire, so we have to write one ourselves.
-                return write_partition_end(node_idx).then([this, node_idx] () mutable {
-                    // Empty mutation_fragment_opt means no more data, so the writer can seal the sstables.
-                    return _mq[node_idx]->push_eventually(mutation_fragment_opt()).then([this, node_idx] () mutable {
-                        return (*_writer_done[node_idx]).then([] (uint64_t partitions) {
-                            rlogger.debug("Managed to write partitions={} to sstable", partitions);
-                            return make_ready_future<>();
-                        });
-                    });
-                });
-            }
-            return make_ready_future<>();
+            return when_all_succeed(write_end_of_stream(node_idx), do_wait_for_writer_done(node_idx)); // End the stream and collect the writer's completion future for this peer; both futures go to when_all_succeed.
+        }).handle_exception([this] (std::exception_ptr ep) {
+            rlogger.warn("repair_writer: keyspace={}, table={}, wait_for_writer_done failed: {}",
+                    _schema->ks_name(), _schema->cf_name(), ep);
+            return make_exception_future<>(std::move(ep)); // Log only; the original failure is still propagated to the caller.
        });
    }
+
+    named_semaphore& sem() { // Expose the writer semaphore (constructed with one unit) so callers can hold it around a batch of writes.
+        return _sem;
+    }
 };

 class repair_meta {
@@ -590,6 +644,7 @@ private:
    repair_master _repair_master;
    gms::inet_address _myip;
    uint32_t _repair_meta_id;
+    streaming::stream_reason _reason;
    // Repair master's sharding configuration
    shard_config _master_node_shard_config;
    // Partitioner of repair master
@@ -613,7 +668,7 @@ private:
    // Tracks current sync boundary
    std::optional<repair_sync_boundary> _current_sync_boundary;
    // Contains the hashes of rows in the _working_row_buffor for all peer nodes
-    std::vector<std::unordered_set<repair_hash>> _peer_row_hash_sets;
+    std::vector<repair_hash_set> _peer_row_hash_sets;
    // Gate used to make sure pending operation of meta data is done
    seastar::gate _gate;
    sink_source_for_get_full_row_hashes _sink_source_for_get_full_row_hashes;
@@ -653,6 +708,7 @@ public:
            uint64_t seed,
            repair_master master,
            uint32_t repair_meta_id,
+            streaming::stream_reason reason,
            shard_config master_node_shard_config,
            size_t nr_peer_nodes = 1)
            : _db(db)
@@ -666,6 +722,7 @@ public:
            , _repair_master(master)
            , _myip(utils::fb_utilities::get_broadcast_address())
            , _repair_meta_id(repair_meta_id)
+            , _reason(reason)
            , _master_node_shard_config(std::move(master_node_shard_config))
            , _remote_partitioner(make_remote_partitioner())
            , _same_sharding_config(is_same_sharding_config())
@@ -681,7 +738,7 @@ public:
                    _seed,
                    repair_reader::is_local_reader(_repair_master || _same_sharding_config)
              )
-            , _repair_writer(_schema, _estimated_partitions, _nr_peer_nodes)
+            , _repair_writer(_schema, _estimated_partitions, _nr_peer_nodes, _reason)
            , _sink_source_for_get_full_row_hashes(_repair_meta_id, _nr_peer_nodes,
                    [] (uint32_t repair_meta_id, netw::messaging_service::msg_addr addr) {
                        return netw::get_local_messaging_service().make_sink_and_source_for_repair_get_full_row_hashes_with_rpc_stream(repair_meta_id, addr);
@@ -700,11 +757,12 @@ public:
 public:
    future<> stop() {
        auto gate_future = _gate.close();
-        auto writer_future = _repair_writer.wait_for_writer_done();
        auto f1 = _sink_source_for_get_full_row_hashes.close();
        auto f2 = _sink_source_for_get_row_diff.close();
        auto f3 = _sink_source_for_put_row_diff.close();
-        return when_all_succeed(std::move(gate_future), std::move(writer_future), std::move(f1), std::move(f2), std::move(f3));
+        return when_all_succeed(std::move(gate_future), std::move(f1), std::move(f2), std::move(f3)).finally([this] { // Gate close and the three sink/source closes are awaited first.
+            return _repair_writer.wait_for_writer_done(); // Writers are waited for in the finally continuation, no longer concurrently with the gate close.
+        });
    }

    static std::unordered_map<node_repair_meta_id, lw_shared_ptr<repair_meta>>& repair_meta_map() {
@@ -731,7 +789,8 @@ public:
            uint64_t max_row_buf_size,
            uint64_t seed,
            shard_config master_node_shard_config,
-            table_schema_version schema_version) {
+            table_schema_version schema_version,
+            streaming::stream_reason reason) {
        return service::get_schema_for_write(schema_version, {from, src_cpu_id}).then([from,
                repair_meta_id,
                range,
@@ -739,7 +798,8 @@ public:
                max_row_buf_size,
                seed,
                master_node_shard_config,
-                schema_version] (schema_ptr s) {
+                schema_version,
+                reason] (schema_ptr s) {
            auto& db = service::get_local_storage_proxy().get_db();
            auto& cf = db.local().find_column_family(s->id());
            node_repair_meta_id id{from, repair_meta_id};
@@ -752,6 +812,7 @@ public:
                    seed,
                    repair_meta::repair_master::no,
                    repair_meta_id,
+                    reason,
                    std::move(master_node_shard_config));
            bool insertion = repair_meta_map().emplace(id, rm).second;
            if (!insertion) {
@@ -829,9 +890,9 @@ public:
    }

    // Must run inside a seastar thread
-    static std::unordered_set<repair_hash>
-    get_set_diff(const std::unordered_set<repair_hash>& x, const std::unordered_set<repair_hash>& y) {
-        std::unordered_set<repair_hash> set_diff;
+    static repair_hash_set
+    get_set_diff(const repair_hash_set& x, const repair_hash_set& y) {
+        repair_hash_set set_diff;
        // Note std::set_difference needs x and y are sorted.
        std::copy_if(x.begin(), x.end(), std::inserter(set_diff, set_diff.end()),
                [&y] (auto& item) { thread::maybe_yield(); return y.find(item) == y.end(); });
@@ -849,14 +910,14 @@ public:

    }

-    std::unordered_set<repair_hash>& peer_row_hash_sets(unsigned node_idx) {
+    repair_hash_set& peer_row_hash_sets(unsigned node_idx) {
        return _peer_row_hash_sets[node_idx];
    }

    // Get a list of row hashes in _working_row_buf
-    future<std::unordered_set<repair_hash>>
+    future<repair_hash_set>
    working_row_hashes() {
-        return do_with(std::unordered_set<repair_hash>(), [this] (std::unordered_set<repair_hash>& hashes) {
+        return do_with(repair_hash_set(), [this] (repair_hash_set& hashes) {
            return do_for_each(_working_row_buf, [&hashes] (repair_row& r) {
                hashes.emplace(r.hash());
            }).then([&hashes] {
@@ -982,11 +1043,7 @@ private:
        return repair_hash(h.finalize_uint64());
    }

-    stop_iteration handle_mutation_fragment(mutation_fragment_opt mfopt, size_t& cur_size, size_t& new_rows_size, std::list<repair_row>& cur_rows) {
-        if (!mfopt) {
-            return stop_iteration::yes;
-        }
-        mutation_fragment& mf = *mfopt;
+    stop_iteration handle_mutation_fragment(mutation_fragment& mf, size_t& cur_size, size_t& new_rows_size, std::list<repair_row>& cur_rows) {
        if (mf.is_partition_start()) {
            auto& start = mf.as_partition_start();
            _repair_reader.set_current_dk(start.key());
@@ -1021,32 +1078,49 @@ private:
                }
                _gate.check();
                return _repair_reader.read_mutation_fragment().then([this, &cur_size, &new_rows_size, &cur_rows] (mutation_fragment_opt mfopt) mutable {
-                    return handle_mutation_fragment(std::move(mfopt), cur_size, new_rows_size, cur_rows);
+                    if (!mfopt) {
+                        _repair_reader.on_end_of_stream();
+                        return stop_iteration::yes;
+                    }
+                    return handle_mutation_fragment(*mfopt, cur_size, new_rows_size, cur_rows);
                });
-            }).then([&cur_rows, &new_rows_size] () mutable {
+            }).then_wrapped([this, &cur_rows, &new_rows_size] (future<> fut) mutable {
+                if (fut.failed()) {
+                    _repair_reader.on_end_of_stream();
+                    return make_exception_future<std::list<repair_row>, size_t>(fut.get_exception());
+                }
+                _repair_reader.pause();
                return make_ready_future<std::list<repair_row>, size_t>(std::move(cur_rows), new_rows_size);
            });
        });
    }

+    future<> clear_row_buf() { // Clear _row_buf asynchronously; the returned future is the one produced by utils::clear_gently().
+        return utils::clear_gently(_row_buf); // NOTE(review): clear_gently() comes from utils/stall_free.hh; presumably it clears in small steps to avoid reactor stalls - confirm.
+    }
+
+    future<> clear_working_row_buf() { // Clear _working_row_buf, then reset the combined hash that describes it.
+        return utils::clear_gently(_working_row_buf).then([this] {
+            _working_row_buf_combined_hash.clear(); // Runs only after the buffer clear has completed.
+        });
+    }
+
    // Read rows from disk until _max_row_buf_size of rows are filled into _row_buf.
    // Calculate the combined checksum of the rows
    // Calculate the total size of the rows in _row_buf
    future<get_sync_boundary_response>
    get_sync_boundary(std::optional<repair_sync_boundary> skipped_sync_boundary) {
+        auto f = make_ready_future<>();
        if (skipped_sync_boundary) {
            _current_sync_boundary = skipped_sync_boundary;
-            _row_buf.clear();
-            _working_row_buf.clear();
-            _working_row_buf_combined_hash.clear();
-        } else {
-            _working_row_buf.clear();
-            _working_row_buf_combined_hash.clear();
+            f = clear_row_buf();
        }
        // Here is the place we update _last_sync_boundary
        rlogger.trace("SET _last_sync_boundary from {} to {}", _last_sync_boundary, _current_sync_boundary);
        _last_sync_boundary = _current_sync_boundary;
-        return row_buf_size().then([this, sb = std::move(skipped_sync_boundary)] (size_t cur_size) {
+      return f.then([this, sb = std::move(skipped_sync_boundary)] () mutable {
+       return clear_working_row_buf().then([this, sb = sb] () mutable {
+        return row_buf_size().then([this, sb = std::move(sb)] (size_t cur_size) {
            return read_rows_from_disk(cur_size).then([this, sb = std::move(sb)] (std::list<repair_row> new_rows, size_t new_rows_size) mutable {
                size_t new_rows_nr = new_rows.size();
                _row_buf.splice(_row_buf.end(), new_rows);
@@ -1063,6 +1137,8 @@ private:
                });
            });
        });
+       });
+      });
    }

    future<> move_row_buf_to_working_row_buf() {
@@ -1138,9 +1214,9 @@ private:
    }

    future<std::list<repair_row>>
-    copy_rows_from_working_row_buf_within_set_diff(std::unordered_set<repair_hash> set_diff) {
+    copy_rows_from_working_row_buf_within_set_diff(repair_hash_set set_diff) {
        return do_with(std::list<repair_row>(), std::move(set_diff),
-                [this] (std::list<repair_row>& rows, std::unordered_set<repair_hash>& set_diff) {
+                [this] (std::list<repair_row>& rows, repair_hash_set& set_diff) {
            return do_for_each(_working_row_buf, [this, &set_diff, &rows] (const repair_row& r) {
                if (set_diff.count(r.hash()) > 0) {
                    rows.push_back(r);
@@ -1155,7 +1231,7 @@ private:
    // Give a set of row hashes, return the corresponding rows
    // If needs_all_rows is set, return all the rows in _working_row_buf, ignore the set_diff
    future<std::list<repair_row>>
-    get_row_diff(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows = needs_all_rows_t::no) {
+    get_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows = needs_all_rows_t::no) {
        if (needs_all_rows) {
            if (!_repair_master || _nr_peer_nodes == 1) {
                return make_ready_future<std::list<repair_row>>(std::move(_working_row_buf));
@@ -1166,6 +1242,32 @@ private:
        }
    }

+    future<> do_apply_rows(std::list<repair_row>&& row_diff, unsigned node_idx, update_working_row_buf update_buf) { // Write every row of row_diff through the repair writer for node_idx, consuming the list as it goes.
+        return do_with(std::move(row_diff), [this, node_idx, update_buf] (std::list<repair_row>& row_diff) { // row_diff is moved into do_with so the lambdas below can refer to it by reference.
+            return with_semaphore(_repair_writer.sem(), 1, [this, node_idx, update_buf, &row_diff] { // Hold one unit of the writer semaphore for the whole batch.
+                _repair_writer.create_writer(_db, node_idx);
+                return repeat([this, node_idx, update_buf, &row_diff] () mutable { // One row per iteration, taken from the front of the list.
+                    if (row_diff.empty()) {
+                        return make_ready_future<stop_iteration>(stop_iteration::yes);
+                    }
+                    repair_row& r = row_diff.front();
+                    if (update_buf) {
+                        _working_row_buf_combined_hash.add(r.hash()); // Fold the row hash into the combined hash only when the caller asked for a buffer update.
+                    }
+                    // The repair_row here is supposed to have
+                    // mutation_fragment attached because we have stored it in
+                    // to_repair_rows_list above where the repair_row is created.
+                    mutation_fragment mf = std::move(r.get_mutation_fragment());
+                    auto dk_with_hash = r.get_dk_with_hash();
+                    return _repair_writer.do_write(node_idx, std::move(dk_with_hash), std::move(mf)).then([&row_diff] {
+                        row_diff.pop_front(); // The row is removed only after its write has completed.
+                        return make_ready_future<stop_iteration>(stop_iteration::no);
+                    });
+                });
+            });
+        });
+    }
+
    // Give a list of rows, apply the rows to disk and update the _working_row_buf and _peer_row_hash_sets if requested
    // Must run inside a seastar thread
    void apply_rows_on_master_in_thread(repair_rows_on_wire rows, gms::inet_address from, update_working_row_buf update_buf,
@@ -1179,30 +1281,17 @@ private:
        stats().rx_row_nr += row_diff.size();
        stats().rx_row_nr_peer[from] += row_diff.size();
        if (update_buf) {
-            std::list<repair_row> tmp;
-            tmp.swap(_working_row_buf);
            // Both row_diff and _working_row_buf and are ordered, merging
            // two sored list to make sure the combination of row_diff
            // and _working_row_buf are ordered.
-            std::merge(tmp.begin(), tmp.end(), row_diff.begin(), row_diff.end(), std::back_inserter(_working_row_buf),
-                [this] (const repair_row& x, const repair_row& y) { thread::maybe_yield(); return _cmp(x.boundary(), y.boundary()) < 0; });
+            utils::merge_to_gently(_working_row_buf, row_diff,
+                 [this] (const repair_row& x, const repair_row& y) { return _cmp(x.boundary(), y.boundary()) < 0; });
        }
        if (update_hash_set) {
-            _peer_row_hash_sets[node_idx] = boost::copy_range<std::unordered_set<repair_hash>>(row_diff |
+            _peer_row_hash_sets[node_idx] = boost::copy_range<repair_hash_set>(row_diff |
                    boost::adaptors::transformed([] (repair_row& r) { thread::maybe_yield(); return r.hash(); }));
        }
-        _repair_writer.create_writer(_db, node_idx);
-        for (auto& r : row_diff) {
-            if (update_buf) {
-                _working_row_buf_combined_hash.add(r.hash());
-            }
-            // The repair_row here is supposed to have
-            // mutation_fragment attached because we have stored it in
-            // to_repair_rows_list above where the repair_row is created.
-            mutation_fragment mf = std::move(r.get_mutation_fragment());
-            auto dk_with_hash = r.get_dk_with_hash();
-            _repair_writer.do_write(node_idx, std::move(dk_with_hash), std::move(mf)).get();
-        }
+        do_apply_rows(std::move(row_diff), node_idx, update_buf).get();
    }

    future<>
@@ -1210,19 +1299,9 @@ private:
        if (rows.empty()) {
            return make_ready_future<>();
        }
-        return to_repair_rows_list(rows).then([this] (std::list<repair_row> row_diff) {
-            return do_with(std::move(row_diff), [this] (std::list<repair_row>& row_diff) {
-                unsigned node_idx = 0;
-                _repair_writer.create_writer(_db, node_idx);
-                return do_for_each(row_diff, [this, node_idx] (repair_row& r) {
-                    // The repair_row here is supposed to have
-                    // mutation_fragment attached because we have stored it in
-                    // to_repair_rows_list above where the repair_row is created.
-                    mutation_fragment mf = std::move(r.get_mutation_fragment());
-                    auto dk_with_hash = r.get_dk_with_hash();
-                    return _repair_writer.do_write(node_idx, std::move(dk_with_hash), std::move(mf));
-                });
-            });
+        return to_repair_rows_list(std::move(rows)).then([this] (std::list<repair_row> row_diff) {
+            unsigned node_idx = 0;
+            return do_apply_rows(std::move(row_diff), node_idx, update_working_row_buf::no);
        });
    }

@@ -1301,13 +1380,13 @@ private:
 public:
    // RPC API
    // Return the hashes of the rows in _working_row_buf
-    future<std::unordered_set<repair_hash>>
+    future<repair_hash_set>
    get_full_row_hashes(gms::inet_address remote_node) {
        if (remote_node == _myip) {
            return get_full_row_hashes_handler();
        }
        return netw::get_local_messaging_service().send_repair_get_full_row_hashes(msg_addr(remote_node),
-                _repair_meta_id).then([this, remote_node] (std::unordered_set<repair_hash> hashes) {
+                _repair_meta_id).then([this, remote_node] (repair_hash_set hashes) {
            rlogger.debug("Got full hashes from peer={}, nr_hashes={}", remote_node, hashes.size());
            _metrics.rx_hashes_nr += hashes.size();
            stats().rx_hashes_nr += hashes.size();
@@ -1318,7 +1397,7 @@ public:

 private:
    future<> get_full_row_hashes_source_op(
-            lw_shared_ptr<std::unordered_set<repair_hash>> current_hashes,
+            lw_shared_ptr<repair_hash_set> current_hashes,
            gms::inet_address remote_node,
            unsigned node_idx,
            rpc::source<repair_hash_with_cmd>& source) {
@@ -1356,12 +1435,12 @@ private:
    }

 public:
-    future<std::unordered_set<repair_hash>>
+    future<repair_hash_set>
    get_full_row_hashes_with_rpc_stream(gms::inet_address remote_node, unsigned node_idx) {
        if (remote_node == _myip) {
            return get_full_row_hashes_handler();
        }
-        auto current_hashes = make_lw_shared<std::unordered_set<repair_hash>>();
+        auto current_hashes = make_lw_shared<repair_hash_set>();
        return _sink_source_for_get_full_row_hashes.get_sink_source(remote_node, node_idx).then(
                [this, current_hashes, remote_node, node_idx]
                (rpc::sink<repair_stream_cmd>& sink, rpc::source<repair_hash_with_cmd>& source) mutable {
@@ -1376,7 +1455,7 @@ public:
    }

    // RPC handler
-    future<std::unordered_set<repair_hash>>
+    future<repair_hash_set>
    get_full_row_hashes_handler() {
        return with_gate(_gate, [this] {
            return working_row_hashes();
@@ -1412,28 +1491,28 @@ public:

    // RPC API
    future<>
-    repair_row_level_start(gms::inet_address remote_node, sstring ks_name, sstring cf_name, dht::token_range range, table_schema_version schema_version) {
+    repair_row_level_start(gms::inet_address remote_node, sstring ks_name, sstring cf_name, dht::token_range range, table_schema_version schema_version, streaming::stream_reason reason) {
        if (remote_node == _myip) {
            return make_ready_future<>();
        }
        stats().rpc_call_nr++;
        return netw::get_local_messaging_service().send_repair_row_level_start(msg_addr(remote_node),
                _repair_meta_id, std::move(ks_name), std::move(cf_name), std::move(range), _algo, _max_row_buf_size, _seed,
-                _master_node_shard_config.shard, _master_node_shard_config.shard_count, _master_node_shard_config.ignore_msb, _master_node_shard_config.partitioner_name, std::move(schema_version));
+                _master_node_shard_config.shard, _master_node_shard_config.shard_count, _master_node_shard_config.ignore_msb, _master_node_shard_config.partitioner_name, std::move(schema_version), reason);
    }

    // RPC handler
    static future<>
    repair_row_level_start_handler(gms::inet_address from, uint32_t src_cpu_id, uint32_t repair_meta_id, sstring ks_name, sstring cf_name,
            dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size,
-            uint64_t seed, shard_config master_node_shard_config, table_schema_version schema_version) {
+            uint64_t seed, shard_config master_node_shard_config, table_schema_version schema_version, streaming::stream_reason reason) {
        if (!_sys_dist_ks->local_is_initialized() || !_view_update_generator->local_is_initialized()) {
            return make_exception_future<>(std::runtime_error(format("Node {} is not fully initialized for repair, try again later",
                    utils::fb_utilities::get_broadcast_address())));
        }
        rlogger.debug(">>> Started Row Level Repair (Follower): local={}, peers={}, repair_meta_id={}, keyspace={}, cf={}, schema_version={}, range={}, seed={}, max_row_buf_siz={}",
            utils::fb_utilities::get_broadcast_address(), from, repair_meta_id, ks_name, cf_name, schema_version, range, seed, max_row_buf_size);
-        return insert_repair_meta(from, src_cpu_id, repair_meta_id, std::move(range), algo, max_row_buf_size, seed, std::move(master_node_shard_config), std::move(schema_version));
+        return insert_repair_meta(from, src_cpu_id, repair_meta_id, std::move(range), algo, max_row_buf_size, seed, std::move(master_node_shard_config), std::move(schema_version), reason);
    }

    // RPC API
@@ -1509,7 +1588,7 @@ public:
    // RPC API
    // Return rows in the _working_row_buf with hash within the given sef_diff
    // Must run inside a seastar thread
-    void get_row_diff(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node, unsigned node_idx) {
+    void get_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node, unsigned node_idx) {
        if (needs_all_rows || !set_diff.empty()) {
            if (remote_node == _myip) {
                return;
@@ -1578,11 +1657,11 @@ private:
    }

    future<> get_row_diff_sink_op(
-            std::unordered_set<repair_hash> set_diff,
+            repair_hash_set set_diff,
            needs_all_rows_t needs_all_rows,
            rpc::sink<repair_hash_with_cmd>& sink,
            gms::inet_address remote_node) {
-        return do_with(std::move(set_diff), [needs_all_rows, remote_node, &sink] (std::unordered_set<repair_hash>& set_diff) mutable {
+        return do_with(std::move(set_diff), [needs_all_rows, remote_node, &sink] (repair_hash_set& set_diff) mutable {
            if (inject_rpc_stream_error) {
                return make_exception_future<>(std::runtime_error("get_row_diff: Inject sender error in sink loop"));
            }
@@ -1609,7 +1688,7 @@ private:
 public:
    // Must run inside a seastar thread
    void get_row_diff_with_rpc_stream(
-            std::unordered_set<repair_hash> set_diff,
+            repair_hash_set set_diff,
            needs_all_rows_t needs_all_rows,
            update_peer_row_hash_sets update_hash_set,
            gms::inet_address remote_node,
@@ -1635,7 +1714,7 @@ public:
    }

    // RPC handler
-    future<repair_rows_on_wire> get_row_diff_handler(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows) {
+    future<repair_rows_on_wire> get_row_diff_handler(repair_hash_set set_diff, needs_all_rows_t needs_all_rows) {
        return with_gate(_gate, [this, set_diff = std::move(set_diff), needs_all_rows] () mutable {
            return get_row_diff(std::move(set_diff), needs_all_rows).then([this] (std::list<repair_row> row_diff) {
                return to_repair_rows_on_wire(std::move(row_diff));
@@ -1645,15 +1724,16 @@ public:

    // RPC API
    // Send rows in the _working_row_buf with hash within the given sef_diff
-    future<> put_row_diff(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node) {
+    future<> put_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node) {
        if (!set_diff.empty()) {
            if (remote_node == _myip) {
                return make_ready_future<>();
            }
-            auto sz = set_diff.size();
+            size_t sz = set_diff.size();
            return get_row_diff(std::move(set_diff), needs_all_rows).then([this, remote_node, sz] (std::list<repair_row> row_diff) {
                if (row_diff.size() != sz) {
-                    throw std::runtime_error("row_diff.size() != set_diff.size()");
+                    rlogger.warn("Hash conflict detected, keyspace={}, table={}, range={}, row_diff.size={}, set_diff.size={}. It is recommended to compact the table and rerun repair for the range.",
+                            _schema->ks_name(), _schema->cf_name(), _range, row_diff.size(), sz);
                }
                return do_with(std::move(row_diff), [this, remote_node] (std::list<repair_row>& row_diff) {
                    return get_repair_rows_size(row_diff).then([this, remote_node, &row_diff] (size_t row_bytes) mutable {
@@ -1720,17 +1800,18 @@ private:

 public:
    future<> put_row_diff_with_rpc_stream(
-            std::unordered_set<repair_hash> set_diff,
+            repair_hash_set set_diff,
            needs_all_rows_t needs_all_rows,
            gms::inet_address remote_node, unsigned node_idx) {
        if (!set_diff.empty()) {
            if (remote_node == _myip) {
                return make_ready_future<>();
            }
-            auto sz = set_diff.size();
+            size_t sz = set_diff.size();
            return get_row_diff(std::move(set_diff), needs_all_rows).then([this, remote_node, node_idx, sz] (std::list<repair_row> row_diff) {
                if (row_diff.size() != sz) {
-                    throw std::runtime_error("row_diff.size() != set_diff.size()");
+                    rlogger.warn("Hash conflict detected, keyspace={}, table={}, range={}, row_diff.size={}, set_diff.size={}. It is recommended to compact the table and rerun repair for the range.",
+                            _schema->ks_name(), _schema->cf_name(), _range, row_diff.size(), sz);
                }
                return do_with(std::move(row_diff), [this, remote_node, node_idx] (std::list<repair_row>& row_diff) {
                    return get_repair_rows_size(row_diff).then([this, remote_node, node_idx, &row_diff] (size_t row_bytes) mutable {
@@ -1769,7 +1850,7 @@ static future<stop_iteration> repair_get_row_diff_with_rpc_stream_process_op(
        rpc::sink<repair_row_on_wire_with_cmd> sink,
        rpc::source<repair_hash_with_cmd> source,
        bool &error,
-        std::unordered_set<repair_hash>& current_set_diff,
+        repair_hash_set& current_set_diff,
        std::optional<std::tuple<repair_hash_with_cmd>> hash_cmd_opt) {
    repair_hash_with_cmd hash_cmd = std::get<0>(hash_cmd_opt.value());
    rlogger.trace("Got repair_hash_with_cmd from peer={}, hash={}, cmd={}", from, hash_cmd.hash, int(hash_cmd.cmd));
@@ -1782,7 +1863,7 @@ static future<stop_iteration> repair_get_row_diff_with_rpc_stream_process_op(
        }
        bool needs_all_rows = hash_cmd.cmd == repair_stream_cmd::needs_all_rows;
        _metrics.rx_hashes_nr += current_set_diff.size();
-        auto fp = make_foreign(std::make_unique<std::unordered_set<repair_hash>>(std::move(current_set_diff)));
+        auto fp = make_foreign(std::make_unique<repair_hash_set>(std::move(current_set_diff)));
        return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id, needs_all_rows, fp = std::move(fp)] {
            auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
            if (fp.get_owner_shard() == engine().cpu_id()) {
@@ -1860,12 +1941,12 @@ static future<stop_iteration> repair_get_full_row_hashes_with_rpc_stream_process
    if (status == repair_stream_cmd::get_full_row_hashes) {
        return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id] {
            auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
-            return rm->get_full_row_hashes_handler().then([] (std::unordered_set<repair_hash> hashes) {
+            return rm->get_full_row_hashes_handler().then([] (repair_hash_set hashes) {
                _metrics.tx_hashes_nr += hashes.size();
                return hashes;
            });
-        }).then([sink] (std::unordered_set<repair_hash> hashes) mutable {
-            return do_with(std::move(hashes), [sink] (std::unordered_set<repair_hash>& hashes) mutable {
+        }).then([sink] (repair_hash_set hashes) mutable {
+            return do_with(std::move(hashes), [sink] (repair_hash_set& hashes) mutable {
                return do_for_each(hashes, [sink] (const repair_hash& hash) mutable {
                    return sink(repair_hash_with_cmd{repair_stream_cmd::hash_data, hash});
                }).then([sink] () mutable {
@@ -1888,7 +1969,7 @@ static future<> repair_get_row_diff_with_rpc_stream_handler(
        uint32_t repair_meta_id,
        rpc::sink<repair_row_on_wire_with_cmd> sink,
        rpc::source<repair_hash_with_cmd> source) {
-    return do_with(false, std::unordered_set<repair_hash>(), [from, src_cpu_id, repair_meta_id, sink, source] (bool& error, std::unordered_set<repair_hash>& current_set_diff) mutable {
+    return do_with(false, repair_hash_set(), [from, src_cpu_id, repair_meta_id, sink, source] (bool& error, repair_hash_set& current_set_diff) mutable {
        return repeat([from, src_cpu_id, repair_meta_id, sink, source, &error, &current_set_diff] () mutable {
            return source().then([from, src_cpu_id, repair_meta_id, sink, source, &error, &current_set_diff] (std::optional<std::tuple<repair_hash_with_cmd>> hash_cmd_opt) mutable {
                if (hash_cmd_opt) {
@@ -1904,22 +1985,17 @@ static future<> repair_get_row_diff_with_rpc_stream_handler(
                            current_set_diff,
                            std::move(hash_cmd_opt)).handle_exception([sink, &error] (std::exception_ptr ep) mutable {
                        error = true;
-                        return sink(repair_row_on_wire_with_cmd{repair_stream_cmd::error, repair_row_on_wire()}).then([sink] ()  mutable {
-                            return sink.close();
-                        }).then([sink] {
+                        return sink(repair_row_on_wire_with_cmd{repair_stream_cmd::error, repair_row_on_wire()}).then([] {
                            return make_ready_future<stop_iteration>(stop_iteration::no);
                        });
                    });
                } else {
-                    if (error) {
-                        return make_ready_future<stop_iteration>(stop_iteration::yes);
-                    }
-                    return sink.close().then([sink] {
-                        return make_ready_future<stop_iteration>(stop_iteration::yes);
-                    });
+                    return make_ready_future<stop_iteration>(stop_iteration::yes);
                }
            });
        });
+    }).finally([sink] () mutable {
+        return sink.close().finally([sink] { });
    });
 }

@@ -1945,22 +2021,17 @@ static future<> repair_put_row_diff_with_rpc_stream_handler(
                            current_rows,
                            std::move(row_opt)).handle_exception([sink, &error] (std::exception_ptr ep) mutable {
                        error = true;
-                        return sink(repair_stream_cmd::error).then([sink] ()  mutable {
-                            return sink.close();
-                        }).then([sink] {
+                        return sink(repair_stream_cmd::error).then([] {
                            return make_ready_future<stop_iteration>(stop_iteration::no);
                        });
                    });
                } else {
-                    if (error) {
-                        return make_ready_future<stop_iteration>(stop_iteration::yes);
-                    }
-                    return sink.close().then([sink] {
-                        return make_ready_future<stop_iteration>(stop_iteration::yes);
-                    });
+                    return make_ready_future<stop_iteration>(stop_iteration::yes);
                }
            });
        });
+    }).finally([sink] () mutable {
+        return sink.close().finally([sink] { });
    });
 }

@@ -1985,22 +2056,17 @@ static future<> repair_get_full_row_hashes_with_rpc_stream_handler(
                            error,
                            std::move(status_opt)).handle_exception([sink, &error] (std::exception_ptr ep) mutable {
                        error = true;
-                        return sink(repair_hash_with_cmd{repair_stream_cmd::error, repair_hash()}).then([sink] ()  mutable {
-                            return sink.close();
-                        }).then([sink] {
+                        return sink(repair_hash_with_cmd{repair_stream_cmd::error, repair_hash()}).then([] () {
                            return make_ready_future<stop_iteration>(stop_iteration::no);
                        });
                    });
                } else {
-                    if (error) {
-                        return make_ready_future<stop_iteration>(stop_iteration::yes);
-                    }
-                    return sink.close().then([sink] {
-                        return make_ready_future<stop_iteration>(stop_iteration::yes);
-                    });
+                    return make_ready_future<stop_iteration>(stop_iteration::yes);
                }
            });
        });
+    }).finally([sink] () mutable {
+        return sink.close().finally([sink] { });
    });
 }

@@ -2046,7 +2112,7 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
            auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
            return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id] {
                auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
-                return rm->get_full_row_hashes_handler().then([] (std::unordered_set<repair_hash> hashes) {
+                return rm->get_full_row_hashes_handler().then([] (repair_hash_set hashes) {
                    _metrics.tx_hashes_nr += hashes.size();
                    return hashes;
                });
@@ -2074,11 +2140,11 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
            });
        });
        ms.register_repair_get_row_diff([] (const rpc::client_info& cinfo, uint32_t repair_meta_id,
-                std::unordered_set<repair_hash> set_diff, bool needs_all_rows) {
+                repair_hash_set set_diff, bool needs_all_rows) {
            auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
            auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
            _metrics.rx_hashes_nr += set_diff.size();
-            auto fp = make_foreign(std::make_unique<std::unordered_set<repair_hash>>(std::move(set_diff)));
+            auto fp = make_foreign(std::make_unique<repair_hash_set>(std::move(set_diff)));
            return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id, fp = std::move(fp), needs_all_rows] () mutable {
                auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
                if (fp.get_owner_shard() == engine().cpu_id()) {
@@ -2104,15 +2170,16 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
        });
        ms.register_repair_row_level_start([] (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring ks_name,
                sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed,
-                unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version) {
+                unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version, rpc::optional<streaming::stream_reason> reason) {
            auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
            auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
            return smp::submit_to(src_cpu_id % smp::count, [from, src_cpu_id, repair_meta_id, ks_name, cf_name,
-                    range, algo, max_row_buf_size, seed, remote_shard, remote_shard_count, remote_ignore_msb, remote_partitioner_name, schema_version] () mutable {
+                    range, algo, max_row_buf_size, seed, remote_shard, remote_shard_count, remote_ignore_msb, remote_partitioner_name, schema_version, reason] () mutable {
+                streaming::stream_reason r = reason ? *reason : streaming::stream_reason::repair;
                return repair_meta::repair_row_level_start_handler(from, src_cpu_id, repair_meta_id, std::move(ks_name),
                        std::move(cf_name), std::move(range), algo, max_row_buf_size, seed,
                        shard_config{remote_shard, remote_shard_count, remote_ignore_msb, std::move(remote_partitioner_name)},
-                        schema_version);
+                        schema_version, r);
            });
        });
        ms.register_repair_row_level_stop([] (const rpc::client_info& cinfo, uint32_t repair_meta_id,
@@ -2145,6 +2212,25 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
    });
 }

+future<> repair_uninit_messaging_service_handler() { // Unregisters the repair RPC handlers listed below from the messaging service.
+    return netw::get_messaging_service().invoke_on_all([] (auto& ms) { // invoke_on_all: run the lambda on every messaging_service instance
+        return when_all_succeed( // gather all unregister futures into a single future
+            ms.unregister_repair_get_row_diff_with_rpc_stream(),
+            ms.unregister_repair_put_row_diff_with_rpc_stream(),
+            ms.unregister_repair_get_full_row_hashes_with_rpc_stream(),
+            ms.unregister_repair_get_full_row_hashes(),
+            ms.unregister_repair_get_combined_row_hash(),
+            ms.unregister_repair_get_sync_boundary(),
+            ms.unregister_repair_get_row_diff(),
+            ms.unregister_repair_put_row_diff(),
+            ms.unregister_repair_row_level_start(),
+            ms.unregister_repair_row_level_stop(),
+            ms.unregister_repair_get_estimated_partitions(),
+            ms.unregister_repair_set_estimated_partitions(),
+            ms.unregister_repair_get_diff_algorithms()).discard_result(); // drop the combined result so the lambda yields future<>
+    });
+}
+
 class row_level_repair {
    repair_info& _ri;
    sstring _cf_name;
@@ -2374,7 +2460,7 @@ private:
            // sequentially because the rows from repair follower 1 to
            // repair master might reduce the amount of missing data
            // between repair master and repair follower 2.
-            std::unordered_set<repair_hash> set_diff = repair_meta::get_set_diff(master.peer_row_hash_sets(node_idx), master.working_row_hashes().get0());
+            repair_hash_set set_diff = repair_meta::get_set_diff(master.peer_row_hash_sets(node_idx), master.working_row_hashes().get0());
            // Request missing sets from peer node
            rlogger.debug("Before get_row_diff to node {}, local={}, peer={}, set_diff={}",
                    node, master.working_row_hashes().get0().size(), master.peer_row_hash_sets(node_idx).size(), set_diff.size());
@@ -2397,9 +2483,9 @@ private:
        // So we can figure out which rows peer node are missing and send the missing rows to them
        check_in_shutdown();
        _ri.check_in_abort();
-        std::unordered_set<repair_hash> local_row_hash_sets = master.working_row_hashes().get0();
+        repair_hash_set local_row_hash_sets = master.working_row_hashes().get0();
        auto sz = _all_live_peer_nodes.size();
-        std::vector<std::unordered_set<repair_hash>> set_diffs(sz);
+        std::vector<repair_hash_set> set_diffs(sz);
        for (size_t idx : boost::irange(size_t(0), sz)) {
            set_diffs[idx] = repair_meta::get_set_diff(local_row_hash_sets, master.peer_row_hash_sets(idx));
        }
@@ -2442,6 +2528,7 @@ public:
                    _seed,
                    repair_meta::repair_master::yes,
                    repair_meta_id,
+                    _ri.reason,
                    std::move(master_node_shard_config),
                    _all_live_peer_nodes.size());

@@ -2456,7 +2543,7 @@ public:
            nodes_to_stop.reserve(_all_nodes.size());
            try {
                parallel_for_each(_all_nodes, [&, this] (const gms::inet_address& node) {
-                    return master.repair_row_level_start(node, _ri.keyspace, _cf_name, _range, schema_version).then([&] () {
+                    return master.repair_row_level_start(node, _ri.keyspace, _cf_name, _range, schema_version, _ri.reason).then([&] () {
                        nodes_to_stop.push_back(node);
                        return master.repair_get_estimated_partitions(node).then([this, node] (uint64_t partitions) {
                            rlogger.trace("Get repair_get_estimated_partitions for node={}, estimated_partitions={}", node, partitions);
--- a/repair/row_level.hh
+++ b/repair/row_level.hh
@@ -45,6 +45,7 @@ private:
 };

 future<> repair_init_messaging_service_handler(repair_service& rs, distributed<db::system_distributed_keyspace>& sys_dist_ks, distributed<db::view::view_update_generator>& view_update_generator);
+future<> repair_uninit_messaging_service_handler();

 class repair_info;

--- a/row_cache.cc
+++ b/row_cache.cc
@@ -528,8 +528,12 @@ public:
        return _reader.move_to_next_partition(timeout).then([this] (auto&& mfopt) mutable {
            {
                if (!mfopt) {
-                    this->handle_end_of_stream();
-                    return make_ready_future<flat_mutation_reader_opt, mutation_fragment_opt>(std::nullopt, std::nullopt);
+                    return _cache._read_section(_cache._tracker.region(), [&] {
+                        return with_linearized_managed_bytes([&] {
+                            this->handle_end_of_stream();
+                            return make_ready_future<flat_mutation_reader_opt, mutation_fragment_opt>(std::nullopt, std::nullopt);
+                        });
+                    });
                }
                _cache.on_partition_miss();
                const partition_start& ps = mfopt->as_partition_start();
@@ -952,13 +956,15 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater)
                        // expensive and we need to amortize it somehow.
                        do {
                          STAP_PROBE(scylla, row_cache_update_partition_start);
-                          with_linearized_managed_bytes([&] {
+                          {
                            if (!update) {
                                _update_section(_tracker.region(), [&] {
+                                  with_linearized_managed_bytes([&] {
                                    memtable_entry& mem_e = *m.partitions.begin();
                                    size_entry = mem_e.size_in_allocator_without_rows(_tracker.allocator());
                                    auto cache_i = _partitions.lower_bound(mem_e.key(), cmp);
                                    update = updater(_update_section, cache_i, mem_e, is_present, real_dirty_acc);
+                                  });
                                });
                            }
                            // We use cooperative deferring instead of futures so that
@@ -970,14 +976,16 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater)
                            update = {};
                            real_dirty_acc.unpin_memory(size_entry);
                            _update_section(_tracker.region(), [&] {
+                              with_linearized_managed_bytes([&] {
                                auto i = m.partitions.begin();
                                memtable_entry& mem_e = *i;
                                m.partitions.erase(i);
                                mem_e.partition().evict(_tracker.memtable_cleaner());
                                current_allocator().destroy(&mem_e);
+                              });
                            });
                            ++partition_count;
-                          });
+                          }
                          STAP_PROBE(scylla, row_cache_update_partition_end);
                        } while (!m.partitions.empty() && !need_preempt());
                        with_allocator(standard_allocator(), [&] {
@@ -1124,8 +1132,8 @@ future<> row_cache::invalidate(external_updater eu, dht::partition_range_vector&
                seastar::thread::maybe_yield();

                while (true) {
-                    auto done = with_linearized_managed_bytes([&] {
-                        return _update_section(_tracker.region(), [&] {
+                    auto done = _update_section(_tracker.region(), [&] {
+                        return with_linearized_managed_bytes([&] {
                            auto cmp = cache_entry::compare(_schema);
                            auto it = _partitions.lower_bound(*_prev_snapshot_pos, cmp);
                            auto end = _partitions.lower_bound(dht::ring_position_view::for_range_end(range), cmp);
--- a/schema.cc
+++ b/schema.cc
@@ -42,6 +42,8 @@

 constexpr int32_t schema::NAME_LENGTH;

+extern logging::logger dblog;
+
 sstring to_sstring(column_kind k) {
    switch (k) {
    case column_kind::partition_key:  return "PARTITION_KEY";
@@ -319,10 +321,10 @@ schema::schema(const raw_schema& raw, std::optional<raw_view_info> raw_view_info
                    + column_offset(column_kind::regular_column),
            _raw._columns.end(), column_definition::name_comparator(regular_column_name_type()));

-    std::sort(_raw._columns.begin(),
+    std::stable_sort(_raw._columns.begin(),
              _raw._columns.begin() + column_offset(column_kind::clustering_key),
              [] (auto x, auto y) { return x.id < y.id; });
-    std::sort(_raw._columns.begin() + column_offset(column_kind::clustering_key),
+    std::stable_sort(_raw._columns.begin() + column_offset(column_kind::clustering_key),
              _raw._columns.begin() + column_offset(column_kind::static_column),
              [] (auto x, auto y) { return x.id < y.id; });

@@ -575,11 +577,15 @@ schema::get_column_definition(const bytes& name) const {

 const column_definition&
 schema::column_at(column_kind kind, column_id id) const {
-    return _raw._columns.at(column_offset(kind) + id);
+    return column_at(static_cast<ordinal_column_id>(column_offset(kind) + id)); // delegate to the ordinal overload so its range check also covers this path
 }

 const column_definition&
 schema::column_at(ordinal_column_id ordinal_id) const {
+    if (size_t(ordinal_id) >= _raw._columns.size()) { // explicit range check ahead of the at() call below
+        on_internal_error(dblog, format("{}.{}@{}: column id {:d} >= {:d}", // message layout: keyspace.table@schema-version: id >= column count
+            ks_name(), cf_name(), version(), size_t(ordinal_id), _raw._columns.size()));
+    }
     return _raw._columns.at(static_cast<column_count_type>(ordinal_id));
 }

--- a/scripts/create-relocatable-package.py
+++ b/scripts/create-relocatable-package.py
@@ -79,7 +79,8 @@ executables = ['build/{}/scylla'.format(args.mode),
               '/usr/sbin/ethtool',
               '/usr/bin/netstat',
               '/usr/bin/hwloc-distrib',
-               '/usr/bin/hwloc-calc']
+               '/usr/bin/hwloc-calc',
+               '/usr/bin/lsblk']

 output = args.dest

--- a/scripts/jobs
+++ b/scripts/jobs
@@ -33,9 +33,10 @@ import os
 procs = os.sysconf('SC_NPROCESSORS_ONLN')
 mem = os.sysconf('SC_PHYS_PAGES') * os.sysconf('SC_PAGESIZE')

+mem_reserve = 1000000000
 job_mem = 4000000000

-jobs = min(procs, mem // job_mem)
+jobs = min(procs, (mem-mem_reserve) // job_mem)  # subtract mem_reserve bytes before dividing memory among jobs; the max() below still guarantees at least 1
 jobs = max(jobs, 1)

 print(jobs)
--- a/scylla-gdb.py
+++ b/scylla-gdb.py
@@ -596,7 +596,7 @@ def current_shard():


 def find_db(shard=None):
-    if not shard:
+    if shard is None:  # test for None explicitly so that shard 0 is used as given instead of falling back to current_shard()
         shard = current_shard()
     return gdb.parse_and_eval('::debug::db')['_instances']['_M_impl']['_M_start'][shard]['service']['_p']

--- a/scylla_post_install.sh
+++ b/scylla_post_install.sh
@@ -63,6 +63,17 @@ MemoryHigh=1200M
 MemoryMax=1400M
 MemoryLimit=1400M
 EOS
+
+# On CentOS 7, systemd does not support percentage-based memory parameters.
+# To apply the memory limit on CentOS 7, we need to override the parameter
+# with a value in bytes instead of a percentage.
+elif [ "$RHEL" -a "$VERSION_ID" = "7" ]; then
+    MEMORY_LIMIT=$((MEMTOTAL_BYTES / 100 * 5))
+    mkdir -p /etc/systemd/system/scylla-helper.slice.d/
+    cat << EOS > /etc/systemd/system/scylla-helper.slice.d/memory.conf
+[Slice]
+MemoryLimit=$MEMORY_LIMIT
+EOS
 fi

 systemctl --system daemon-reload >/dev/null || true
--- a/2
+++ b/2
--- a/serializer_impl.hh
+++ b/serializer_impl.hh
@@ -25,6 +25,7 @@
 #include <seastar/util/bool_class.hh>
 #include <boost/range/algorithm/for_each.hpp>
 #include "utils/small_vector.hh"
+#include <absl/container/btree_set.h>

 namespace ser {

@@ -81,6 +82,17 @@ static inline void serialize_array(Output& out, const Container& v) {
 template<typename Container>
 struct container_traits;

+template<typename T>
+struct container_traits<absl::btree_set<T>> { // container_traits specialization for absl::btree_set<T>
+    struct back_emplacer { // functor that adds each value it is called with to the referenced set
+        absl::btree_set<T>& c; // target set, held by reference: it must outlive the emplacer
+        back_emplacer(absl::btree_set<T>& c_) : c(c_) {}
+        void operator()(T&& v) {
+            c.emplace(std::move(v)); // emplace() rather than a push_back-style call: the set decides where the element goes
+        }
+    };
+};
+
 template<typename T>
 struct container_traits<std::unordered_set<T>> {
    struct back_emplacer {
@@ -253,6 +265,27 @@ struct serializer<std::list<T>> {
    }
 };

+template<typename T>
+struct serializer<absl::btree_set<T>> { // Serializer for absl::btree_set<T>: a uint32_t element count followed by the elements.
+    template<typename Input>
+    static absl::btree_set<T> read(Input& in) { // Rebuilds a set from the input stream.
+        auto sz = deserialize(in, boost::type<uint32_t>()); // the element count comes first
+        absl::btree_set<T> v;
+        deserialize_array_helper<false, T>::doit(in, v, sz); // then fill v with sz deserialized elements
+        return v;
+    }
+    template<typename Output>
+    static void write(Output& out, const absl::btree_set<T>& v) { // Writes the set in the layout read() consumes.
+        safe_serialize_as_uint32(out, v.size()); // size goes out as uint32_t; presumably range-checked by the helper - TODO confirm
+        serialize_array_helper<false, T>::doit(out, v);
+    }
+    template<typename Input>
+    static void skip(Input& in) { // Advances past a serialized set without materializing it.
+        auto sz = deserialize(in, boost::type<uint32_t>());
+        skip_array<T>(in, sz); // skip sz serialized elements of type T
+    }
+};
+
 template<typename T>
 struct serializer<std::unordered_set<T>> {
    template<typename Input>
--- a/service/migration_manager.cc
+++ b/service/migration_manager.cc
@@ -92,7 +92,7 @@ void migration_manager::init_messaging_service()
        //FIXME: future discarded.
        (void)with_gate(_background_tasks, [this] {
            mlogger.debug("features changed, recalculating schema version");
-            return update_schema_version_and_announce(get_storage_proxy(), _feat.cluster_schema_features());
+            return db::schema_tables::recalculate_schema_version(get_storage_proxy(), _feat);
        });
    };

@@ -277,9 +277,9 @@ future<> migration_manager::maybe_schedule_schema_pull(const utils::UUID& their_
    }).finally([me = shared_from_this()] {});
 }

-future<> migration_manager::submit_migration_task(const gms::inet_address& endpoint)
+future<> migration_manager::submit_migration_task(const gms::inet_address& endpoint, bool can_ignore_down_node) // can_ignore_down_node is forwarded unchanged
 {
-    return service::migration_task::run_may_throw(endpoint);
+    return service::migration_task::run_may_throw(endpoint, can_ignore_down_node); // for a down endpoint: ready future if the flag is true, exceptional future otherwise
 }

 future<> migration_manager::do_merge_schema_from(netw::messaging_service::msg_addr id)
@@ -1132,7 +1132,8 @@ future<> migration_manager::sync_schema(const database& db, const std::vector<gm
        }).then([this, &schema_map] {
            return parallel_for_each(schema_map, [this] (auto& x) {
                mlogger.debug("Pulling schema {} from {}", x.first, x.second.front());
-                return submit_migration_task(x.second.front());
+                bool can_ignore_down_node = false;
+                return submit_migration_task(x.second.front(), can_ignore_down_node);
            });
        });
    });
--- a/service/migration_manager.hh
+++ b/service/migration_manager.hh
@@ -82,7 +82,7 @@ public:

    future<> maybe_schedule_schema_pull(const utils::UUID& their_version, const gms::inet_address& endpoint);

-    future<> submit_migration_task(const gms::inet_address& endpoint);
+    future<> submit_migration_task(const gms::inet_address& endpoint, bool can_ignore_down_node = true);

    // Makes sure that this node knows about all schema changes known by "nodes" that were made prior to this call.
    future<> sync_schema(const database& db, const std::vector<gms::inet_address>& nodes);
--- a/service/migration_task.cc
+++ b/service/migration_task.cc
@@ -51,11 +51,12 @@ namespace service {

 static logging::logger mlogger("migration_task");

-future<> migration_task::run_may_throw(const gms::inet_address& endpoint)
+future<> migration_task::run_may_throw(const gms::inet_address& endpoint, bool can_ignore_down_node)
 {
    if (!gms::get_local_gossiper().is_alive(endpoint)) {
-        mlogger.warn("Can't send migration request: node {} is down.", endpoint);
-        return make_ready_future<>();
+        auto msg = format("Can't send migration request: node {} is down.", endpoint);
+        mlogger.warn("{}", msg);
+        return can_ignore_down_node ? make_ready_future<>() : make_exception_future<>(std::runtime_error(msg));
    }
    netw::messaging_service::msg_addr id{endpoint, 0};
    return service::get_local_migration_manager().merge_schema_from(id).handle_exception([](std::exception_ptr e) {
--- a/service/migration_task.hh
+++ b/service/migration_task.hh
@@ -47,7 +47,7 @@ namespace service {

 class migration_task {
 public:
-    static future<> run_may_throw(const gms::inet_address& endpoint);
+    static future<> run_may_throw(const gms::inet_address& endpoint, bool can_ignore_down_node);
 };

 }
--- a/service/paxos/paxos_state.cc
+++ b/service/paxos/paxos_state.cc
@@ -190,4 +190,11 @@ future<> paxos_state::learn(schema_ptr schema, proposal decision, clock_type::ti
    });
 }

+future<> paxos_state::prune(schema_ptr schema, const partition_key& key, utils::UUID ballot, clock_type::time_point timeout,
+        tracing::trace_state_ptr tr_state) { // Logs and traces the request, then hands (key, ballot) to system_keyspace::delete_paxos_decision.
+    logger.debug("Delete paxos state for ballot {}", ballot);
+    tracing::trace(tr_state, "Delete paxos state for ballot {}", ballot); // the same message is recorded on the caller-supplied trace state
+    return db::system_keyspace::delete_paxos_decision(*schema, key, ballot, timeout); // the returned future is the delete operation's future
+}
+
 } // end of namespace "service::paxos"
--- a/service/paxos/paxos_state.hh
+++ b/service/paxos/paxos_state.hh
@@ -124,6 +124,9 @@ public:
            clock_type::time_point timeout);
    // Replica RPC endpoint for Paxos "learn".
    static future<> learn(schema_ptr schema, proposal decision, clock_type::time_point timeout, tracing::trace_state_ptr tr_state);
+    // Replica RPC endpoint for pruning Paxos table
+    static future<> prune(schema_ptr schema, const partition_key& key, utils::UUID ballot, clock_type::time_point timeout,
+            tracing::trace_state_ptr tr_state);
 };

 } // end of namespace "service::paxos"
--- a/service/storage_proxy.cc
+++ b/service/storage_proxy.cc
@@ -171,6 +171,7 @@ public:
    const schema_ptr& schema() {
        return _schema;
    }
+    // called only when all replicas replied
    virtual void release_mutation() = 0;
 };

@@ -300,9 +301,10 @@ public:

 class cas_mutation : public mutation_holder {
    lw_shared_ptr<paxos::proposal> _proposal;
+    shared_ptr<paxos_response_handler> _handler;
 public:
-    explicit cas_mutation(paxos::proposal proposal , schema_ptr s)
-            : _proposal(make_lw_shared<paxos::proposal>(std::move(proposal))) {
+    explicit cas_mutation(paxos::proposal proposal, schema_ptr s, shared_ptr<paxos_response_handler> handler)
+            : _proposal(make_lw_shared<paxos::proposal>(std::move(proposal))), _handler(std::move(handler)) {
        _size = _proposal->update.representation().size();
        _schema = std::move(s);
    }
@@ -327,6 +329,11 @@ public:
        return true;
    }
    virtual void release_mutation() override {
+        // The handler will be set for "learn", but not for PAXOS repair
+        // since repair may not include all replicas
+        if (_handler) {
+            _handler->prune(_proposal->ballot);
+        }
        _proposal.release();
    }
 };
@@ -1184,6 +1191,12 @@ future<bool> paxos_response_handler::accept_proposal(const paxos::proposal& prop
    return f;
 }

+// debug output in mutate_internal needs this
+std::ostream& operator<<(std::ostream& os, const paxos_response_handler& h) {
+    os << "paxos_response_handler{" << h.id() << "}";
+    return os;
+}
+
 // This function implements learning stage of Paxos protocol
 future<> paxos_response_handler::learn_decision(paxos::proposal decision, bool allow_hints) {
    tracing::trace(tr_state, "learn_decision: committing {} with cl={}", decision, _cl_for_learn);
@@ -1219,12 +1232,41 @@ future<> paxos_response_handler::learn_decision(paxos::proposal decision, bool a
    }

    // Path for the "base" mutations
-    std::array<std::tuple<paxos::proposal, schema_ptr, dht::token>, 1> m{std::make_tuple(std::move(decision), _schema, _key.token())};
+    std::array<std::tuple<paxos::proposal, schema_ptr, shared_ptr<paxos_response_handler>, dht::token>, 1> m{std::make_tuple(std::move(decision), _schema, shared_from_this(), _key.token())};
    future<> f_lwt = _proxy->mutate_internal(std::move(m), _cl_for_learn, false, tr_state, _permit, _timeout);

    return when_all_succeed(std::move(f_cdc), std::move(f_lwt));
 }

+void paxos_response_handler::prune(utils::UUID ballot) {
+    if (_has_dead_endpoints) {
+        return;
+    }
+    if ( _proxy->get_stats().cas_now_pruning >= pruning_limit) {
+        _proxy->get_stats().cas_coordinator_dropped_prune++;
+        return;
+    }
+     _proxy->get_stats().cas_now_pruning++;
+    _proxy->get_stats().cas_prune++;
+    // Runs in the background, but the number of background jobs is limited by pruning_limit.
+    // It is waited for by holding a shared pointer to storage_proxy, which guarantees
+    // that storage_proxy::stop() will wait for this to complete.
+    (void)parallel_for_each(_live_endpoints, [this, ballot] (gms::inet_address peer) mutable {
+        return futurize_apply([&] {
+            if (fbu::is_me(peer)) {
+                tracing::trace(tr_state, "prune: prune {} locally", ballot);
+                return paxos::paxos_state::prune(_schema, _key.key(), ballot, _timeout, tr_state);
+            } else {
+                tracing::trace(tr_state, "prune: send prune of {} to {}", ballot, peer);
+                netw::messaging_service& ms = netw::get_local_messaging_service();
+                return ms.send_paxos_prune(peer, _timeout, _schema->version(), _key.key(), ballot, tracing::make_trace_info(tr_state));
+            }
+        });
+    }).finally([h = shared_from_this()] {
+        h->_proxy->get_stats().cas_now_pruning--;
+    });
+}
+
 static std::vector<gms::inet_address>
 replica_ids_to_endpoints(locator::token_metadata& tm, const std::vector<utils::UUID>& replica_ids) {
    std::vector<gms::inet_address> endpoints;
@@ -1571,6 +1613,14 @@ void storage_proxy_stats::stats::register_stats() {
        sm::make_histogram("cas_write_contention", sm::description("how many contended writes were encountered"),
                       {storage_proxy_stats::current_scheduling_group_label()},
                       [this]{ return cas_write_contention.get_histogram(1, 8);}),
+
+        sm::make_total_operations("cas_prune", cas_prune,
+                       sm::description("how many times paxos prune was done after successful cas operation"),
+                       {storage_proxy_stats::current_scheduling_group_label()}),
+
+        sm::make_total_operations("cas_dropped_prune", cas_coordinator_dropped_prune,
+                       sm::description("how many times a coordinator did not perfom prune after cas"),
+                       {storage_proxy_stats::current_scheduling_group_label()}),
    });

    _metrics.add_group(REPLICA_STATS_CATEGORY, {
@@ -1606,19 +1656,28 @@ void storage_proxy_stats::stats::register_stats() {
                       sm::description("number of operations that crossed a shard boundary"),
                       {storage_proxy_stats::current_scheduling_group_label()}),

+        sm::make_total_operations("cas_dropped_prune", cas_replica_dropped_prune,
+                       sm::description("how many times a coordinator did not perfom prune after cas"),
+                       {storage_proxy_stats::current_scheduling_group_label()}),
    });
 }

-inline uint64_t& storage_proxy_stats::split_stats::get_ep_stat(gms::inet_address ep) {
+inline uint64_t& storage_proxy_stats::split_stats::get_ep_stat(gms::inet_address ep) noexcept {
    if (fbu::is_me(ep)) {
        return _local.val;
    }

-    sstring dc = get_dc(ep);
-    if (_auto_register_metrics) {
-        register_metrics_for(ep);
+    try {
+        sstring dc = get_dc(ep);
+        if (_auto_register_metrics) {
+            register_metrics_for(ep);
+        }
+        return _dc_stats[dc].val;
+    } catch (...) {
+        static thread_local uint64_t dummy_stat;
+        slogger.error("Failed to obtain stats ({}), fall-back to dummy", std::current_exception());
+        return dummy_stat;
    }
-    return _dc_stats[dc].val;
 }

 void storage_proxy_stats::split_stats::register_metrics_local() {
@@ -1879,11 +1938,11 @@ storage_proxy::create_write_response_handler(const std::unordered_map<gms::inet_
 }

 storage_proxy::response_id_type
-storage_proxy::create_write_response_handler(const std::tuple<paxos::proposal, schema_ptr, dht::token>& meta,
+storage_proxy::create_write_response_handler(const std::tuple<paxos::proposal, schema_ptr, shared_ptr<paxos_response_handler>, dht::token>& meta,
        db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit) {
-    auto& [commit, s, t] = meta;
+    auto& [commit, s, h, t] = meta;

-    return create_write_response_handler_helper(s, t, std::make_unique<cas_mutation>(std::move(commit), s), cl,
+    return create_write_response_handler_helper(s, t, std::make_unique<cas_mutation>(std::move(commit), s, std::move(h)), cl,
            db::write_type::CAS, tr_state, std::move(permit));
 }

@@ -1898,7 +1957,7 @@ storage_proxy::create_write_response_handler(const std::tuple<paxos::proposal, s
    auto keyspace_name = s->ks_name();
    keyspace& ks = _db.local().find_keyspace(keyspace_name);

-    return create_write_response_handler(ks, cl, db::write_type::CAS, std::make_unique<cas_mutation>(std::move(commit), s), std::move(endpoints),
+    return create_write_response_handler(ks, cl, db::write_type::CAS, std::make_unique<cas_mutation>(std::move(commit), s, nullptr), std::move(endpoints),
                    std::vector<gms::inet_address>(), std::vector<gms::inet_address>(), std::move(tr_state), get_stats(), std::move(permit));
 }

@@ -2146,6 +2205,8 @@ storage_proxy::get_paxos_participants(const sstring& ks_name, const dht::token &
                cl_for_paxos, participants + 1, live_endpoints.size());
    }

+    bool dead = participants != live_endpoints.size();
+
    // Apart from the ballot, paxos_state::prepare() also sends the current value of the requested key.
    // If the values received from different replicas match, we skip a separate query stage thus saving
    // one network round trip. To generate less traffic, only closest replicas send data, others send
@@ -2153,7 +2214,7 @@ storage_proxy::get_paxos_participants(const sstring& ks_name, const dht::token &
    // list of participants by proximity to this instance.
    sort_endpoints_by_proximity(live_endpoints);

-    return paxos_participants{std::move(live_endpoints), required_participants};
+    return paxos_participants{std::move(live_endpoints), required_participants, dead};
 }


@@ -3412,7 +3473,9 @@ protected:
    uint32_t original_partition_limit() const {
        return _cmd->partition_limit;
    }
+    virtual void adjust_targets_for_reconciliation() {}
    void reconcile(db::consistency_level cl, storage_proxy::clock_type::time_point timeout, lw_shared_ptr<query::read_command> cmd) {
+        adjust_targets_for_reconciliation();
        data_resolver_ptr data_resolver = ::make_shared<data_read_resolver>(_schema, cl, _targets.size(), timeout);
        auto exec = shared_from_this();

@@ -3639,6 +3702,9 @@ public:
    virtual void got_cl() override {
        _speculate_timer.cancel();
    }
+    virtual void adjust_targets_for_reconciliation() override {
+        _targets = used_targets();
+    }
 };

 class range_slice_read_executor : public never_speculating_read_executor {
@@ -4942,22 +5008,64 @@ void storage_proxy::init_messaging_service() {

        return f;
    });
+    ms.register_paxos_prune([this] (const rpc::client_info& cinfo, rpc::opt_time_point timeout,
+                utils::UUID schema_id, partition_key key, utils::UUID ballot, std::optional<tracing::trace_info> trace_info) {
+        static thread_local uint16_t pruning = 0;
+        static constexpr uint16_t pruning_limit = 1000; // since PRUNE verb is one way replica side has its own queue limit
+        auto src_addr = netw::messaging_service::get_source(cinfo);
+        auto src_ip = src_addr.addr;
+        tracing::trace_state_ptr tr_state;
+        if (trace_info) {
+            tr_state = tracing::tracing::get_local_tracing_instance().create_session(*trace_info);
+            tracing::begin(tr_state);
+            tracing::trace(tr_state, "paxos_prune: message received from /{} ballot {}", src_ip, ballot);
+        }
+
+        if (pruning >= pruning_limit) {
+            get_stats().cas_replica_dropped_prune++;
+            tracing::trace(tr_state, "paxos_prune: do not prune due to overload", src_ip);
+            return make_ready_future<seastar::rpc::no_wait_type>(netw::messaging_service::no_wait());
+        }
+
+        pruning++;
+        auto d = defer([] { pruning--; });
+        return get_schema_for_read(schema_id, src_addr).then([this, key = std::move(key), ballot,
+                         timeout, tr_state = std::move(tr_state), src_ip, d = std::move(d)] (schema_ptr schema) mutable {
+            dht::token token = dht::get_token(*schema, key);
+            unsigned shard = dht::shard_of(*schema, token);
+            bool local = shard == engine().cpu_id();
+            get_stats().replica_cross_shard_ops += !local;
+            return smp::submit_to(shard, _write_smp_service_group, [gs = global_schema_ptr(schema), gt = tracing::global_trace_state_ptr(std::move(tr_state)),
+                                     local,  key = std::move(key), ballot, timeout, src_ip, d = std::move(d)] () {
+                tracing::trace_state_ptr tr_state = gt;
+                return paxos::paxos_state::prune(gs, key, ballot,  *timeout, tr_state).then([src_ip, tr_state] () {
+                    tracing::trace(tr_state, "paxos_prune: handling is done, sending a response to /{}", src_ip);
+                    return netw::messaging_service::no_wait();
+                });
+            });
+        });
+    });
 }

 future<> storage_proxy::uninit_messaging_service() {
    auto& ms = netw::get_local_messaging_service();
    return when_all_succeed(
+        ms.unregister_counter_mutation(),
        ms.unregister_mutation(),
+        ms.unregister_hint_mutation(),
        ms.unregister_mutation_done(),
        ms.unregister_mutation_failed(),
        ms.unregister_read_data(),
        ms.unregister_read_mutation_data(),
        ms.unregister_read_digest(),
        ms.unregister_truncate(),
+        ms.unregister_get_schema_version(),
        ms.unregister_paxos_prepare(),
        ms.unregister_paxos_accept(),
-        ms.unregister_paxos_learn()
+        ms.unregister_paxos_learn(),
+        ms.unregister_paxos_prune()
    );
+
 }

 future<rpc::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>>
@@ -5050,8 +5158,7 @@ future<> storage_proxy::drain_on_shutdown() {

 future<>
 storage_proxy::stop() {
-    // FIXME: hints manager should be stopped here but it seems like this function is never called
-    return uninit_messaging_service();
+    return make_ready_future<>();
 }

 }
--- a/service/storage_proxy.hh
+++ b/service/storage_proxy.hh
@@ -242,6 +242,7 @@ public:
        std::vector<gms::inet_address> endpoints;
        // How many participants are required for a quorum (i.e. is it SERIAL or LOCAL_SERIAL).
        size_t required_participants;
+        bool has_dead_endpoints;
    };

    const gms::feature_service& features() const { return _features; }
@@ -297,7 +298,6 @@ private:
    cdc::cdc_service* _cdc = nullptr;
    cdc_stats _cdc_stats;
 private:
-    future<> uninit_messaging_service();
    future<coordinator_query_result> query_singular(lw_shared_ptr<query::read_command> cmd,
            dht::partition_range_vector&& partition_ranges,
            db::consistency_level cl,
@@ -317,7 +317,7 @@ private:
    response_id_type create_write_response_handler(const mutation&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit);
    response_id_type create_write_response_handler(const hint_wrapper&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit);
    response_id_type create_write_response_handler(const std::unordered_map<gms::inet_address, std::optional<mutation>>&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit);
-    response_id_type create_write_response_handler(const std::tuple<paxos::proposal, schema_ptr, dht::token>& proposal,
+    response_id_type create_write_response_handler(const std::tuple<paxos::proposal, schema_ptr, shared_ptr<paxos_response_handler>, dht::token>& proposal,
            db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit);
    response_id_type create_write_response_handler(const std::tuple<paxos::proposal, schema_ptr, dht::token, std::unordered_set<gms::inet_address>>& meta,
            db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit);
@@ -451,6 +451,7 @@ public:
        return next;
    }
    void init_messaging_service();
+    future<> uninit_messaging_service();

    // Applies mutation on this node.
    // Resolves with timed_out_error when timeout is reached.
@@ -634,6 +635,11 @@ private:
    db::consistency_level _cl_for_learn;
    // Live endpoints, as per get_paxos_participants()
    std::vector<gms::inet_address> _live_endpoints;
+    // True if there are dead endpoints
+    // We don't include endpoints known to be unavailable in pending
+    // endpoints list, but need to be aware of them to avoid pruning
+    // system.paxos data if some endpoint is missing a Paxos write.
+    bool _has_dead_endpoints;
    // How many endpoints need to respond favourably for the protocol to progress to the next step.
    size_t _required_participants;
    // A deadline when the entire CAS operation timeout expires, derived from write_request_timeout_in_ms
@@ -651,6 +657,9 @@ private:
    // Unique request id for logging purposes.
    const uint64_t _id = next_id++;

+    // max pruning operations to run in parallel
+    static constexpr uint16_t pruning_limit = 1000;
+
 public:
    tracing::trace_state_ptr tr_state;

@@ -674,6 +683,7 @@ public:
        storage_proxy::paxos_participants pp = _proxy->get_paxos_participants(_schema->ks_name(), _key.token(), _cl_for_paxos);
        _live_endpoints = std::move(pp.endpoints);
        _required_participants = pp.required_participants;
+        _has_dead_endpoints = pp.has_dead_endpoints;
        tracing::trace(tr_state, "Create paxos_response_handler for token {} with live: {} and required participants: {}",
                _key.token(), _live_endpoints, _required_participants);
    }
@@ -691,6 +701,7 @@ public:
    future<paxos::prepare_summary> prepare_ballot(utils::UUID ballot);
    future<bool> accept_proposal(const paxos::proposal& proposal, bool timeout_if_partially_accepted = true);
    future<> learn_decision(paxos::proposal decision, bool allow_hints = false);
+    void prune(utils::UUID ballot);
    uint64_t id() const {
        return _id;
    }
--- a/service/storage_proxy_stats.hh
+++ b/service/storage_proxy_stats.hh
@@ -74,7 +74,7 @@ public:
     *
     * @return a reference to the requested counter
     */
-    uint64_t& get_ep_stat(gms::inet_address ep);
+    uint64_t& get_ep_stat(gms::inet_address ep) noexcept;
 };

 struct write_stats {
@@ -116,6 +116,11 @@ struct write_stats {
    uint64_t cas_write_condition_not_met = 0;
    uint64_t cas_write_timeout_due_to_uncertainty = 0;
    uint64_t cas_failed_read_round_optimization = 0;
+    uint16_t cas_now_pruning = 0;
+    uint64_t cas_prune = 0;
+    uint64_t cas_coordinator_dropped_prune = 0;
+    uint64_t cas_replica_dropped_prune = 0;
+

    std::chrono::microseconds last_mv_flow_control_delay; // delay added for MV flow control in the last request
 public:
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -1007,12 +1007,16 @@ storage_service::is_local_dc(const inet_address& targetHost) const {
 std::unordered_map<dht::token_range, std::vector<inet_address>>
 storage_service::get_range_to_address_map(const sstring& keyspace,
        const std::vector<token>& sorted_tokens) const {
+    sstring ks = keyspace;
    // some people just want to get a visual representation of things. Allow null and set it to the first
    // non-system keyspace.
-    if (keyspace == "" && _db.local().get_non_system_keyspaces().empty()) {
-        throw std::runtime_error("No keyspace provided and no non system kespace exist");
+    if (keyspace == "") {
+        auto keyspaces = _db.local().get_non_system_keyspaces();
+        if (keyspaces.empty()) {
+            throw std::runtime_error("No keyspace provided and no non system kespace exist");
+        }
+        ks = keyspaces[0];
    }
-    const sstring& ks = (keyspace == "") ? _db.local().get_non_system_keyspaces()[0] : keyspace;
    return construct_range_to_endpoint_map(ks, get_all_ranges(sorted_tokens));
 }

@@ -2171,7 +2175,8 @@ storage_service::get_snapshot_details() {
 }

 future<int64_t> storage_service::true_snapshots_size() {
-    return _db.map_reduce(adder<int64_t>(), [] (database& db) {
+  return run_snapshot_list_operation([] {
+    return get_local_storage_service()._db.map_reduce(adder<int64_t>(), [] (database& db) {
        return do_with(int64_t(0), [&db] (auto& local_total) {
            return parallel_for_each(db.get_column_families(), [&local_total] (auto& cf_pair) {
                return cf_pair.second->get_snapshot_details().then([&local_total] (auto map) {
@@ -2185,6 +2190,7 @@ future<int64_t> storage_service::true_snapshots_size() {
            });
        });
    });
+  });
 }

 static std::atomic<bool> isolated = { false };
@@ -3409,10 +3415,13 @@ void feature_enabled_listener::on_enabled() {

 future<> read_sstables_format(distributed<storage_service>& ss) {
    return db::system_keyspace::get_scylla_local_param(SSTABLE_FORMAT_PARAM_NAME).then([&ss] (std::optional<sstring> format_opt) {
-        sstables::sstable_version_types format = sstables::from_string(format_opt.value_or("ka"));
-        return ss.invoke_on_all([format] (storage_service& s) {
-            s._sstables_format = format;
-        });
+        if (format_opt) {
+            sstables::sstable_version_types format = sstables::from_string(*format_opt);
+            return ss.invoke_on_all([format] (storage_service& s) {
+                s._sstables_format = format;
+            });
+        }
+        return make_ready_future<>();
    });
 }

--- a/service/storage_service.hh
+++ b/service/storage_service.hh
@@ -312,7 +312,13 @@ private:
     */
    std::optional<db_clock::time_point> _cdc_streams_ts;

-    sstables::sstable_version_types _sstables_format = sstables::sstable_version_types::ka;
+    // _sstables_format is the format used for writing new sstables.
+    // Here we set its default value, but if we discover that all the nodes
+    // in the cluster support a newer format, _sstables_format will be set to
+    // that format. read_sstables_format() also overwrites _sstables_format
+    // if an sstable format was chosen earlier (and this choice was persisted
+    // in the system table).
+    sstables::sstable_version_types _sstables_format = sstables::sstable_version_types::la;
    seastar::named_semaphore _feature_listeners_sem = {1, named_semaphore_exception_factory{"feature listeners"}};
    feature_enabled_listener _la_feature_listener;
    feature_enabled_listener _mc_feature_listener;
--- a/sstables/column_translation.hh
+++ b/sstables/column_translation.hh
@@ -72,47 +72,8 @@ private:
        static std::vector<column_info> build(
                const schema& s,
                const utils::chunked_vector<serialization_header::column_desc>& src,
-                bool is_static) {
-            std::vector<column_info> cols;
-            if (s.is_dense()) {
-                const column_definition& col = is_static ? *s.static_begin() : *s.regular_begin();
-                cols.push_back(column_info{
-                    &col.name(),
-                    col.type,
-                    col.id,
-                    col.type->value_length_if_fixed(),
-                    col.is_multi_cell(),
-                    col.is_counter(),
-                    false
-                });
-            } else {
-                cols.reserve(src.size());
-                for (auto&& desc : src) {
-                    const bytes& type_name = desc.type_name.value;
-                    data_type type = db::marshal::type_parser::parse(to_sstring_view(type_name));
-                    const column_definition* def = s.get_column_definition(desc.name.value);
-                    std::optional<column_id> id;
-                    bool schema_mismatch = false;
-                    if (def) {
-                        id = def->id;
-                        schema_mismatch = def->is_multi_cell() != type->is_multi_cell() ||
-                                          def->is_counter() != type->is_counter() ||
-                                          !def->type->is_value_compatible_with(*type);
-                    }
-                    cols.push_back(column_info{
-                        &desc.name.value,
-                        type,
-                        id,
-                        type->value_length_if_fixed(),
-                        type->is_multi_cell(),
-                        type->is_counter(),
-                        schema_mismatch
-                    });
-                }
-                boost::range::stable_partition(cols, [](const column_info& column) { return !column.is_collection; });
-            }
-            return cols;
-        }
+                const sstable_enabled_features& features,
+                bool is_static);

        utils::UUID schema_uuid;
        std::vector<column_info> regular_schema_columns_from_sstable;
@@ -125,10 +86,10 @@ private:
        state(state&&) = default;
        state& operator=(state&&) = default;

-        state(const schema& s, const serialization_header& header)
+        state(const schema& s, const serialization_header& header, const sstable_enabled_features& features)
            : schema_uuid(s.version())
-            , regular_schema_columns_from_sstable(build(s, header.regular_columns.elements, false))
-            , static_schema_columns_from_sstable(build(s, header.static_columns.elements, true))
+            , regular_schema_columns_from_sstable(build(s, header.regular_columns.elements, features, false))
+            , static_schema_columns_from_sstable(build(s, header.static_columns.elements, features, true))
            , clustering_column_value_fix_lengths (get_clustering_values_fixed_lengths(header))
        {}
    };
@@ -136,9 +97,10 @@ private:
    lw_shared_ptr<const state> _state = make_lw_shared<const state>();

 public:
-    column_translation get_for_schema(const schema& s, const serialization_header& header) {
+    column_translation get_for_schema(
+            const schema& s, const serialization_header& header, const sstable_enabled_features& features) {
        if (s.version() != _state->schema_uuid) {
-            _state = make_lw_shared(state(s, header));
+            _state = make_lw_shared(state(s, header, features));
        }
        return *this;
    }
--- a/sstables/compaction.cc
+++ b/sstables/compaction.cc
@@ -66,7 +66,6 @@
 #include "db_clock.hh"
 #include "mutation_compactor.hh"
 #include "leveled_manifest.hh"
-#include "utils/observable.hh"
 #include "dht/token.hh"

 namespace sstables {
@@ -284,68 +283,73 @@ public:
 // When compaction finishes, all the temporary sstables generated here will be deleted and removed
 // from table's sstable set.
 class garbage_collected_sstable_writer {
-    compaction* _c = nullptr;
-    std::vector<shared_sstable> _temp_sealed_gc_sstables;
-    std::deque<compaction_write_monitor> _active_write_monitors = {};
-    shared_sstable _sst;
-    std::optional<sstable_writer> _writer;
-    std::optional<utils::observer<>> _on_new_sstable_sealed_observer;
-    utils::UUID _run_identifier = utils::make_random_uuid();
-    bool _consuming_new_partition {};
-private:
-    void setup_on_new_sstable_sealed_handler();
-    void maybe_create_new_sstable_writer();
-    void finish_sstable_writer();
-    void on_end_of_stream();
 public:
-    garbage_collected_sstable_writer() = default;
-    explicit garbage_collected_sstable_writer(compaction& c) : _c(&c) {
-        setup_on_new_sstable_sealed_handler();
-    }
+    // Data for GC writer is stored separately to allow compaction class to communicate directly
+    // with garbage_collected_sstable_writer which is moved into mutation_compaction, making it
+    // unreachable after the compaction process has started.
+    class data {
+        compaction* _c = nullptr;
+        // Garbage collected sstables that are sealed but were not added to SSTable set yet.
+        std::vector<shared_sstable> _unused_garbage_collected_sstables;
+        // Garbage collected sstables that were added to SSTable set and should be eventually removed from it.
+        std::vector<shared_sstable> _used_garbage_collected_sstables;
+        std::deque<compaction_write_monitor> _active_write_monitors = {};
+        shared_sstable _sst;
+        std::optional<sstable_writer> _writer;
+        utils::UUID _run_identifier = utils::make_random_uuid();
+    public:
+        explicit data(compaction& c) : _c(&c) {
+        }
+
+        data& operator=(const data&) = delete;
+        data(const data&) = delete;
+
+        void maybe_create_new_sstable_writer();
+        void finish_sstable_writer();
+
+        // Retrieves all unused garbage collected sstables that will be subsequently added
+        // to the SSTable set, and marks them as used.
+        std::vector<shared_sstable> consume_unused_garbage_collected_sstables() {
+            auto unused = std::exchange(_unused_garbage_collected_sstables, {});
+            _used_garbage_collected_sstables.insert(_used_garbage_collected_sstables.end(), unused.begin(), unused.end());
+            return unused;
+        }
+
+        const std::vector<shared_sstable>& used_garbage_collected_sstables() const {
+            return _used_garbage_collected_sstables;
+        }
+
+        friend class garbage_collected_sstable_writer;
+    };
+private:
+    garbage_collected_sstable_writer::data* _data = nullptr;
+public:
+    explicit garbage_collected_sstable_writer() = default;
+    explicit garbage_collected_sstable_writer(garbage_collected_sstable_writer::data& data) : _data(&data) {}

    garbage_collected_sstable_writer& operator=(const garbage_collected_sstable_writer&) = delete;
    garbage_collected_sstable_writer(const garbage_collected_sstable_writer&) = delete;

-    garbage_collected_sstable_writer(garbage_collected_sstable_writer&& other)
-            : _c(other._c)
-            , _temp_sealed_gc_sstables(std::move(other._temp_sealed_gc_sstables))
-            , _active_write_monitors(std::move(other._active_write_monitors))
-            , _sst(std::move(other._sst))
-            , _writer(std::move(other._writer))
-            , _run_identifier(other._run_identifier)
-            , _consuming_new_partition(other._consuming_new_partition) {
-        other._on_new_sstable_sealed_observer->disconnect();
-        setup_on_new_sstable_sealed_handler();
-    }
-
-    garbage_collected_sstable_writer& operator=(garbage_collected_sstable_writer&& other) {
-        if (this != &other) {
-            this->~garbage_collected_sstable_writer();
-            new (this) garbage_collected_sstable_writer(std::move(other));
-        }
-        return *this;
-    }
+    garbage_collected_sstable_writer(garbage_collected_sstable_writer&& other) = default;
+    garbage_collected_sstable_writer& operator=(garbage_collected_sstable_writer&& other) = default;

    void consume_new_partition(const dht::decorated_key& dk) {
-        maybe_create_new_sstable_writer();
-        _writer->consume_new_partition(dk);
-        _consuming_new_partition = true;
+        _data->maybe_create_new_sstable_writer();
+        _data->_writer->consume_new_partition(dk);
    }

-    void consume(tombstone t) { _writer->consume(t); }
-    stop_iteration consume(static_row&& sr, tombstone, bool) { return _writer->consume(std::move(sr)); }
-    stop_iteration consume(clustering_row&& cr, row_tombstone, bool) { return _writer->consume(std::move(cr)); }
-    stop_iteration consume(range_tombstone&& rt) { return _writer->consume(std::move(rt)); }
+    void consume(tombstone t) { _data->_writer->consume(t); }
+    stop_iteration consume(static_row&& sr, tombstone, bool) { return _data->_writer->consume(std::move(sr)); }
+    stop_iteration consume(clustering_row&& cr, row_tombstone, bool) { return _data->_writer->consume(std::move(cr)); }
+    stop_iteration consume(range_tombstone&& rt) { return _data->_writer->consume(std::move(rt)); }

    stop_iteration consume_end_of_partition() {
-        _writer->consume_end_of_partition();
-        _consuming_new_partition = false;
+        _data->_writer->consume_end_of_partition();
        return stop_iteration::no;
    }

    void consume_end_of_stream() {
-        finish_sstable_writer();
-        on_end_of_stream();
+        _data->finish_sstable_writer();
    }
 };

@@ -380,6 +384,7 @@ public:
 class compaction {
 protected:
    column_family& _cf;
+    creator_fn _sstable_creator;
    schema_ptr _schema;
    std::vector<shared_sstable> _sstables;
    // Unused sstables are tracked because if compaction is interrupted we can only delete them.
@@ -393,15 +398,17 @@ protected:
    std::vector<unsigned long> _ancestors;
    db::replay_position _rp;
    encoding_stats_collector _stats_collector;
-    utils::observable<> _on_new_sstable_sealed;
    bool _contains_multi_fragment_runs = false;
+    garbage_collected_sstable_writer::data _gc_sstable_writer_data;
 protected:
-    compaction(column_family& cf, std::vector<shared_sstable> sstables, uint64_t max_sstable_size, uint32_t sstable_level)
+    compaction(column_family& cf, creator_fn creator, std::vector<shared_sstable> sstables, uint64_t max_sstable_size, uint32_t sstable_level)
        : _cf(cf)
+        , _sstable_creator(std::move(creator))
        , _schema(cf.schema())
        , _sstables(std::move(sstables))
        , _max_sstable_size(max_sstable_size)
        , _sstable_level(sstable_level)
+        , _gc_sstable_writer_data(*this)
    {
        _info->cf = &cf;
        for (auto& sst : _sstables) {
@@ -434,10 +441,6 @@ protected:
        writer = std::nullopt;
        sst->open_data().get0();
        _info->end_size += sst->bytes_on_disk();
-        // Notify GC'ed-data sstable writer's handler that an output sstable has just been sealed.
-        // The handler is responsible for making sure that deleting an input sstable will not
-        // result in resurrection on failure.
-        _on_new_sstable_sealed();
    }

    api::timestamp_type maximum_timestamp() const {
@@ -447,10 +450,6 @@ protected:
        return (*m)->get_stats_metadata().max_timestamp;
    }

-    utils::observer<> add_on_new_sstable_sealed_handler(std::function<void (void)> handler) noexcept {
-        return _on_new_sstable_sealed.observe(std::move(handler));
-    }
-
    encoding_stats get_encoding_stats() const {
        return _stats_collector.get();
    }
@@ -562,10 +561,9 @@ private:
        };
    }

-    virtual shared_sstable create_new_sstable() const = 0;
-
    // select a sstable writer based on decorated key.
    virtual sstable_writer* select_sstable_writer(const dht::decorated_key& dk) = 0;
+
    // stop current writer
    virtual void stop_sstable_writer() = 0;
    // finish all writers.
@@ -588,20 +586,9 @@ private:
            sst->mark_for_deletion();
        }
    }
-
-    void setup_garbage_collected_sstable(shared_sstable sst) {
-        // Add new sstable to table's set because expired tombstone should be available if compaction is abruptly stopped.
-        _cf.add_sstable_and_update_cache(std::move(sst)).get();
-    }
-
-    void eventually_delete_garbage_collected_sstable(shared_sstable sst) {
-        // Add sstable to compaction's input list for it to be eventually removed from table's set.
-        sst->mark_for_deletion();
-        _sstables.push_back(std::move(sst));
-    }
 public:
    garbage_collected_sstable_writer make_garbage_collected_sstable_writer() {
-        return garbage_collected_sstable_writer(*this);
+        return garbage_collected_sstable_writer(_gc_sstable_writer_data);
    }

    bool contains_multi_fragment_runs() const {
@@ -616,6 +603,7 @@ public:

    friend class compacting_sstable_writer;
    friend class garbage_collected_sstable_writer;
+    friend class garbage_collected_sstable_writer::data;
 };

 void compacting_sstable_writer::consume_new_partition(const dht::decorated_key& dk) {
@@ -642,22 +630,9 @@ void compacting_sstable_writer::consume_end_of_stream() {
    _c.finish_sstable_writer();
 }

-void garbage_collected_sstable_writer::setup_on_new_sstable_sealed_handler() {
-    _on_new_sstable_sealed_observer = _c->add_on_new_sstable_sealed_handler([this] {
-        // NOTE: This handler is called, BEFORE an input sstable is possibly deleted
-        // *AND* AFTER a new output sstable is sealed, to flush a garbage collected
-        // sstable being currently written.
-        // That way, data is resurrection is prevented by making sure that the
-        // GC'able data is still reachable in a temporary sstable.
-        assert(!_consuming_new_partition);
-        // Wait for current gc'ed-only-sstable to be flushed and added to table's set.
-        this->finish_sstable_writer();
-    });
-}
-
-void garbage_collected_sstable_writer::maybe_create_new_sstable_writer() {
+void garbage_collected_sstable_writer::data::maybe_create_new_sstable_writer() {
    if (!_writer) {
-        _sst = _c->create_new_sstable();
+        _sst = _c->_sstable_creator(this_shard_id());

        auto&& priority = service::get_local_compaction_priority();
        _active_write_monitors.emplace_back(_sst, _c->_cf, _c->maximum_timestamp(), _c->_sstable_level);
@@ -668,25 +643,16 @@ void garbage_collected_sstable_writer::maybe_create_new_sstable_writer() {
    }
 }

-void garbage_collected_sstable_writer::finish_sstable_writer() {
+void garbage_collected_sstable_writer::data::finish_sstable_writer() {
    if (_writer) {
        _writer->consume_end_of_stream();
        _writer = std::nullopt;
        _sst->open_data().get0();
-        _c->setup_garbage_collected_sstable(_sst);
-        _temp_sealed_gc_sstables.push_back(std::move(_sst));
-    }
-}
-
-void garbage_collected_sstable_writer::on_end_of_stream() {
-    for (auto&& sst : _temp_sealed_gc_sstables) {
-        clogger.debug("Asking for deletion of temporary tombstone-only sstable {}", sst->get_filename());
-        _c->eventually_delete_garbage_collected_sstable(std::move(sst));
+        _unused_garbage_collected_sstables.push_back(std::move(_sst));
    }
 }

 class regular_compaction : public compaction {
-    std::function<shared_sstable()> _creator;
    replacer_fn _replacer;
    std::unordered_set<shared_sstable> _compacting_for_max_purgeable_func;
    // store a clone of sstable set for column family, which needs to be alive for incremental selector.
@@ -701,10 +667,9 @@ class regular_compaction : public compaction {
    std::deque<compaction_write_monitor> _active_write_monitors = {};
    utils::UUID _run_identifier;
 public:
-    regular_compaction(column_family& cf, compaction_descriptor descriptor, std::function<shared_sstable()> creator, replacer_fn replacer)
-        : compaction(cf, std::move(descriptor.sstables), descriptor.max_sstable_bytes, descriptor.level)
-        , _creator(std::move(creator))
-        , _replacer(std::move(replacer))
+    regular_compaction(column_family& cf, compaction_descriptor descriptor)
+        : compaction(cf, std::move(descriptor.creator), std::move(descriptor.sstables), descriptor.max_sstable_bytes, descriptor.level)
+        , _replacer(std::move(descriptor.replacer))
        , _compacting_for_max_purgeable_func(std::unordered_set<shared_sstable>(_sstables.begin(), _sstables.end()))
        , _set(cf.get_sstable_set())
        , _selector(_set.make_incremental_selector())
@@ -755,13 +720,9 @@ public:
        };
    }

-    virtual shared_sstable create_new_sstable() const override {
-        return _creator();
-    }
-
    virtual sstable_writer* select_sstable_writer(const dht::decorated_key& dk) override {
        if (!_writer) {
-            _sst = _creator();
+            _sst = _sstable_creator(0);
            setup_new_sstable(_sst);

            _active_write_monitors.emplace_back(_sst, _cf, maximum_timestamp(), _sstable_level);
@@ -834,6 +795,15 @@ private:
                // Fully expired sstable is not actually compacted, therefore it's not present in the compacting set.
                _compacting->erase(sst);
            });
+            // Make sure the SSTable created by the garbage-collected writer is made available
+            // before any exhausted SSTable is released, so as to prevent data resurrection.
+            _gc_sstable_writer_data.finish_sstable_writer();
+            // Add garbage-collected SSTables to the list of unused SSTables that will be added
+            // to the SSTable set. GC SSTables should be added before compaction completes because
+            // a failure could result in data resurrection if data is not made available.
+            auto unused_gc_sstables = _gc_sstable_writer_data.consume_unused_garbage_collected_sstables();
+            _new_unused_sstables.insert(_new_unused_sstables.end(), unused_gc_sstables.begin(), unused_gc_sstables.end());
+
            auto exhausted_ssts = std::vector<shared_sstable>(exhausted, _sstables.end());
            _replacer(get_compaction_completion_desc(exhausted_ssts, std::move(_new_unused_sstables)));
            _sstables.erase(exhausted, _sstables.end());
@@ -842,11 +812,16 @@ private:
    }

    void replace_remaining_exhausted_sstables() {
-        if (!_sstables.empty()) {
-            std::vector<shared_sstable> sstables_compacted;
-            std::move(_sstables.begin(), _sstables.end(), std::back_inserter(sstables_compacted));
-            _replacer(get_compaction_completion_desc(std::move(sstables_compacted), std::move(_new_unused_sstables)));
-        }
+        if (!_sstables.empty() || !_gc_sstable_writer_data.used_garbage_collected_sstables().empty()) {
+            std::vector<shared_sstable> old_sstables;
+            std::move(_sstables.begin(), _sstables.end(), std::back_inserter(old_sstables));
+
+            // Remove garbage-collected SSTables from the SSTable set if any were previously added.
+            auto& used_garbage_collected_sstables = _gc_sstable_writer_data.used_garbage_collected_sstables();
+            old_sstables.insert(old_sstables.end(), used_garbage_collected_sstables.begin(), used_garbage_collected_sstables.end());
+
+            _replacer(get_compaction_completion_desc(std::move(old_sstables), std::move(_new_unused_sstables)));
+         }
    }

    void do_pending_replacements() {
@@ -909,8 +884,8 @@ protected:
        return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables), std::move(ranges_for_for_invalidation)};
    }
 public:
-    cleanup_compaction(column_family& cf, compaction_descriptor descriptor, std::function<shared_sstable()> creator, replacer_fn replacer)
-        : regular_compaction(cf, std::move(descriptor), std::move(creator), std::move(replacer))
+    cleanup_compaction(column_family& cf, compaction_descriptor descriptor)
+        : regular_compaction(cf, std::move(descriptor))
        , _owned_ranges(service::get_local_storage_service().get_local_ranges(_schema->ks_name()))
    {
        _info->type = compaction_type::Cleanup;
@@ -1114,9 +1089,8 @@ private:
    compaction_options::scrub _options;

 public:
-    scrub_compaction(column_family& cf, compaction_descriptor descriptor, compaction_options::scrub options, std::function<shared_sstable()> creator,
-            replacer_fn replacer)
-        : regular_compaction(cf, std::move(descriptor), std::move(creator), std::move(replacer))
+    scrub_compaction(column_family& cf, compaction_descriptor descriptor, compaction_options::scrub options)
+        : regular_compaction(cf, std::move(descriptor))
        , _options(options) {
        _info->type = compaction_type::Scrub;
    }
@@ -1143,7 +1117,6 @@ flat_mutation_reader make_scrubbing_reader(flat_mutation_reader rd, bool skip_co
 class resharding_compaction final : public compaction {
    std::vector<std::pair<shared_sstable, std::optional<sstable_writer>>> _output_sstables;
    shard_id _shard; // shard of current sstable writer
-    std::function<shared_sstable(shard_id)> _sstable_creator;
    compaction_backlog_tracker _resharding_backlog_tracker;

    // Partition count estimation for a shard S:
@@ -1168,11 +1141,9 @@ private:
        return ceil(double(_estimation_per_shard[s].estimated_partitions) / estimated_sstables);
    }
 public:
-    resharding_compaction(std::vector<shared_sstable> sstables, column_family& cf, std::function<shared_sstable(shard_id)> creator,
-            uint64_t max_sstable_size, uint32_t sstable_level)
-        : compaction(cf, std::move(sstables), max_sstable_size, sstable_level)
+    resharding_compaction(column_family& cf, sstables::compaction_descriptor descriptor)
+        : compaction(cf, std::move(descriptor.creator), std::move(descriptor.sstables), descriptor.max_sstable_bytes, descriptor.level)
        , _output_sstables(smp::count)
-        , _sstable_creator(std::move(creator))
        , _resharding_backlog_tracker(std::make_unique<resharding_backlog_tracker>())
        , _estimation_per_shard(smp::count)
        , _run_identifiers(smp::count)
@@ -1224,10 +1195,6 @@ public:

    void backlog_tracker_adjust_charges() override { }

-    shared_sstable create_new_sstable() const override {
-        return _sstable_creator(_shard);
-    }
-
    sstable_writer* select_sstable_writer(const dht::decorated_key& dk) override {
        _shard = dht::shard_of(*_schema, dk.token());
        auto& sst = _output_sstables[_shard].first;
@@ -1298,38 +1265,35 @@ compaction_type compaction_options::type() const {
    return index_to_type[_options.index()];
 }

-static std::unique_ptr<compaction> make_compaction(column_family& cf, sstables::compaction_descriptor descriptor,
-        std::function<shared_sstable()> creator, replacer_fn replacer) {
+static std::unique_ptr<compaction> make_compaction(column_family& cf, sstables::compaction_descriptor descriptor) {
    struct {
        column_family& cf;
        sstables::compaction_descriptor&& descriptor;
-        std::function<shared_sstable()>&& creator;
-        replacer_fn&& replacer;

        std::unique_ptr<compaction> operator()(compaction_options::regular) {
-            return std::make_unique<regular_compaction>(cf, std::move(descriptor), std::move(creator), std::move(replacer));
+            return std::make_unique<regular_compaction>(cf, std::move(descriptor));
        }
        std::unique_ptr<compaction> operator()(compaction_options::cleanup) {
-            return std::make_unique<cleanup_compaction>(cf, std::move(descriptor), std::move(creator), std::move(replacer));
+            return std::make_unique<cleanup_compaction>(cf, std::move(descriptor));
        }
        std::unique_ptr<compaction> operator()(compaction_options::upgrade) {
-            return std::make_unique<cleanup_compaction>(cf, std::move(descriptor), std::move(creator), std::move(replacer));
+            return std::make_unique<cleanup_compaction>(cf, std::move(descriptor));
        }
        std::unique_ptr<compaction> operator()(compaction_options::scrub scrub_options) {
-            return std::make_unique<scrub_compaction>(cf, std::move(descriptor), scrub_options, std::move(creator), std::move(replacer));
+            return std::make_unique<scrub_compaction>(cf, std::move(descriptor), scrub_options);
        }
-    } visitor_factory{cf, std::move(descriptor), std::move(creator), std::move(replacer)};
+    } visitor_factory{cf, std::move(descriptor)};

    return descriptor.options.visit(visitor_factory);
 }

 future<compaction_info>
-compact_sstables(sstables::compaction_descriptor descriptor, column_family& cf, std::function<shared_sstable()> creator, replacer_fn replacer) {
+compact_sstables(sstables::compaction_descriptor descriptor, column_family& cf) {
    if (descriptor.sstables.empty()) {
        throw std::runtime_error(format("Called {} compaction with empty set on behalf of {}.{}", compaction_name(descriptor.options.type()),
                cf.schema()->ks_name(), cf.schema()->cf_name()));
    }
-    auto c = make_compaction(cf, std::move(descriptor), std::move(creator), std::move(replacer));
+    auto c = make_compaction(cf, std::move(descriptor));
    if (c->contains_multi_fragment_runs()) {
        auto gc_writer = c->make_garbage_collected_sstable_writer();
        return compaction::run(std::move(c), std::move(gc_writer));
@@ -1343,7 +1307,10 @@ reshard_sstables(std::vector<shared_sstable> sstables, column_family& cf, std::f
    if (sstables.empty()) {
        throw std::runtime_error(format("Called resharding with empty set on behalf of {}.{}", cf.schema()->ks_name(), cf.schema()->cf_name()));
    }
-    auto c = std::make_unique<resharding_compaction>(std::move(sstables), cf, std::move(creator), max_sstable_size, sstable_level);
+    sstables::compaction_descriptor descriptor(std::move(sstables), sstable_level, max_sstable_size);
+    descriptor.creator = std::move(creator);
+
+    auto c = std::make_unique<resharding_compaction>(cf, std::move(descriptor));
    return compaction::run(std::move(c)).then([] (auto ret) {
        return std::move(ret.new_sstables);
    });
--- a/sstables/compaction.hh
+++ b/sstables/compaction.hh
@@ -84,6 +84,20 @@ namespace sstables {
        compaction_type type() const;
    };

+    struct compaction_completion_desc {
+        // Old, existing SSTables that should be deleted and removed from the SSTable set.
+        std::vector<shared_sstable> old_sstables;
+        // New, fresh SSTables that should be added to SSTable set, replacing the old ones.
+        std::vector<shared_sstable> new_sstables;
+        // Set of compacted partition ranges that should be invalidated in the cache.
+        dht::partition_range_vector ranges_for_cache_invalidation;
+    };
+
+    // Creates a new SSTable for a given shard.
+    using creator_fn = std::function<shared_sstable(shard_id shard)>;
+    // Replaces old sstable(s) by new one(s) which contain all non-expired data.
+    using replacer_fn = std::function<void(compaction_completion_desc)>;
+
    struct compaction_descriptor {
        // List of sstables to be compacted.
        std::vector<sstables::shared_sstable> sstables;
@@ -101,6 +115,9 @@ namespace sstables {
        // This also selects the kind of compaction to do.
        compaction_options options = compaction_options::make_regular();

+        creator_fn creator;
+        replacer_fn replacer;
+
        compaction_descriptor() = default;

        static constexpr int default_level = 0;
@@ -190,16 +207,6 @@ namespace sstables {
        }
    };

-    struct compaction_completion_desc {
-        std::vector<shared_sstable> input_sstables;
-        std::vector<shared_sstable> output_sstables;
-        // Set of compacted partition ranges that should be invalidated in the cache.
-        dht::partition_range_vector ranges_for_cache_invalidation;
-    };
-
-    // Replaces old sstable(s) by new one(s) which contain all non-expired data.
-    using replacer_fn = std::function<void(compaction_completion_desc)>;
-
    // Compact a list of N sstables into M sstables.
    // Returns info about the finished compaction, which includes vector to new sstables.
    //
@@ -212,8 +219,7 @@ namespace sstables {
    // If descriptor.cleanup is true, mutation that doesn't belong to current node will be
    // cleaned up, log messages will inform the user that compact_sstables runs for
    // cleaning operation, and compaction history will not be updated.
-    future<compaction_info> compact_sstables(sstables::compaction_descriptor descriptor, column_family& cf,
-        std::function<shared_sstable()> creator, replacer_fn replacer);
+    future<compaction_info> compact_sstables(sstables::compaction_descriptor descriptor, column_family& cf);

    // Compacts a set of N shared sstables into M sstables. For every shard involved,
    // i.e. which owns any of the sstables, a new unshared sstable is created.
--- a/Show More
+++ b/Show More