release: prepare for 4.1.11

compaction: compaction_writer: destroy shared_sstable after the sstable_writer
sstable_writer may depend on the sstable throughout its whole lifecycle. If the sstable is freed before the sstable_writer we might hit use-after-free as in the following case: ``` std::_Deque_iterator<sstables::compression::segmented_offsets::bucket, sstables::compression::segmented_offsets::bucket&, sstables::compression::segmented_offsets::bucket*>::operator+=(long) at /usr/include/c++/10/bits/stl_deque.h:240 (inlined by) std::operator+(std::_Deque_iterator<sstables::compression::segmented_offsets::bucket, sstables::compression::segmented_offsets::bucket&, sstables::compression::segmented_offsets::bucket*> const&, long) at /usr/include/c++/10/bits/stl_deque.h:378 (inlined by) std::_Deque_iterator<sstables::compression::segmented_offsets::bucket, sstables::compression::segmented_offsets::bucket&, sstables::compression::segmented_offsets::bucket*>::operator[](long) const at /usr/include/c++/10/bits/stl_deque.h:252 (inlined by) std::deque<sstables::compression::segmented_offsets::bucket, std::allocator<sstables::compression::segmented_offsets::bucket> >::operator[](unsigned long) at /usr/include/c++/10/bits/stl_deque.h:1327 (inlined by) sstables::compression::segmented_offsets::push_back(unsigned long, sstables::compression::segmented_offsets::state&) at ./sstables/compress.cc:214 sstables::compression::segmented_offsets::writer::push_back(unsigned long) at ./sstables/compress.hh:123 (inlined by) compressed_file_data_sink_impl<crc32_utils, (compressed_checksum_mode)1>::put(seastar::temporary_buffer<char>) at ./sstables/compress.cc:519 seastar::output_stream<char>::put(seastar::temporary_buffer<char>) at table.cc:? (inlined by) seastar::output_stream<char>::put(seastar::temporary_buffer<char>) at ././seastar/include/seastar/core/iostream-impl.hh:432 seastar::output_stream<char>::flush() at table.cc:? seastar::output_stream<char>::close() at table.cc:? sstables::file_writer::close() at sstables.cc:? sstables::mc::writer::~writer() at writer.cc:? 
(inlined by) sstables::mc::writer::~writer() at ./sstables/mx/writer.cc:790 sstables::mc::writer::~writer() at writer.cc:? flat_mutation_reader::impl::consumer_adapter<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> > >::~consumer_adapter() at compaction.cc:? (inlined by) std::_Optional_payload_base<sstables::compaction_writer>::_M_destroy() at /usr/include/c++/10/optional:260 (inlined by) std::_Optional_payload_base<sstables::compaction_writer>::_M_reset() at /usr/include/c++/10/optional:280 (inlined by) std::_Optional_payload<sstables::compaction_writer, false, false, false>::~_Optional_payload() at /usr/include/c++/10/optional:401 (inlined by) std::_Optional_base<sstables::compaction_writer, false, false>::~_Optional_base() at /usr/include/c++/10/optional:474 (inlined by) std::optional<sstables::compaction_writer>::~optional() at /usr/include/c++/10/optional:659 (inlined by) sstables::compacting_sstable_writer::~compacting_sstable_writer() at ./sstables/compaction.cc:229 (inlined by) compact_mutation<(emit_only_live_rows)0, (compact_for_sstables)1, sstables::compacting_sstable_writer, noop_compacted_fragments_consumer>::~compact_mutation() at ././mutation_compactor.hh:468 (inlined by) compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer>::~compact_for_compaction() at ././mutation_compactor.hh:538 (inlined by) std::default_delete<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >::operator()(compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer>*) const at /usr/include/c++/10/bits/unique_ptr.h:85 (inlined by) std::unique_ptr<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer>, std::default_delete<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> > >::~unique_ptr() at 
/usr/include/c++/10/bits/unique_ptr.h:361 (inlined by) stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >::~stable_flattened_mutations_consumer() at ././mutation_reader.hh:342 (inlined by) flat_mutation_reader::impl::consumer_adapter<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> > >::~consumer_adapter() at ././flat_mutation_reader.hh:201 auto flat_mutation_reader::impl::consume_in_thread<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, flat_mutation_reader::no_filter>(stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, flat_mutation_reader::no_filter, std::chrono::time_point<seastar::lowres_clock, std::chrono::duration<long, std::ratio<1l, 1000l> > >) at ././flat_mutation_reader.hh:272 (inlined by) auto flat_mutation_reader::consume_in_thread<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, flat_mutation_reader::no_filter>(stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, flat_mutation_reader::no_filter, std::chrono::time_point<seastar::lowres_clock, std::chrono::duration<long, std::ratio<1l, 1000l> > >) at ././flat_mutation_reader.hh:383 (inlined by) auto flat_mutation_reader::consume_in_thread<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> > >(stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, std::chrono::time_point<seastar::lowres_clock, std::chrono::duration<long, std::ratio<1l, 1000l> > >) at 
././flat_mutation_reader.hh:389 (inlined by) seastar::future<void> sstables::compaction::setup<noop_compacted_fragments_consumer>(noop_compacted_fragments_consumer)::{lambda(flat_mutation_reader)#1}::operator()(flat_mutation_reader)::{lambda()#1}::operator()() at ./sstables/compaction.cc:612 ``` What happens here is that: compressed_file_data_sink_impl(output_stream<char> out, sstables::compression* cm, sstables::local_compression lc) : _out(std::move(out)) , _compression_metadata(cm) , _offsets(_compression_metadata->offsets.get_writer()) , _compression(lc) , _full_checksum(ChecksumType::init_checksum()) _compression_metadata points to a buffer held by the sstable object, and _compression_metadata->offsets.get_writer returns a writer that keeps a reference to the segmented_offsets in the sstables::compression that is used in the ~writer -> close path. Fixes #7821 Signed-off-by: Benny Halevy <bhalevy@scylladb.com> Message-Id: <20201227145726.33319-1-bhalevy@scylladb.com> (cherry picked from commit 8a745a0ee0)
2026-05-13 11:22:01 +00:00 · 2021-01-05 10:13:34 +02:00 · 2021-01-04 15:12:33 +02:00 · 2020-12-24 12:42:42 +02:00 · 2020-12-16 17:20:32 +02:00 · 2020-12-16 11:59:12 +02:00
114 changed files with 3550 additions and 970 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -12,3 +12,6 @@
 [submodule "zstd"]
 	path = zstd
 	url = ../zstd
+[submodule "abseil"]
+	path = abseil
+	url = ../abseil-cpp
--- a/2
+++ b/2
@@ -1,7 +1,7 @@
 #!/bin/sh

 PRODUCT=scylla
-VERSION=4.1.3
+VERSION=4.1.11

 if test -f version
 then
--- a/1
+++ b/1
--- a/alternator/auth.cc
+++ b/alternator/auth.cc
@@ -129,7 +129,7 @@ future<std::string> get_key_from_roles(cql3::query_processor& qp, std::string us
            auth::meta::roles_table::qualified_name(), auth::meta::roles_table::role_col_name);

    auto cl = auth::password_authenticator::consistency_for_user(username);
-    auto timeout = auth::internal_distributed_timeout_config();
+    auto& timeout = auth::internal_distributed_timeout_config();
    return qp.execute_internal(query, cl, timeout, {sstring(username)}, true).then_wrapped([username = std::move(username)] (future<::shared_ptr<cql3::untyped_result_set>> f) {
        auto res = f.get0();
        auto salted_hash = std::optional<sstring>();
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -141,6 +141,11 @@ struct nonempty : public size_check {

 // Check that array has the expected number of elements
 static void verify_operand_count(const rjson::value* array, const size_check& expected, const rjson::value& op) {
+    if (!array && expected(0)) {
+        // If expected() allows an empty AttributeValueList, it is also fine
+        // that it is missing.
+        return;
+    }
    if (!array || !array->IsArray()) {
        throw api_error("ValidationException", "With ComparisonOperator, AttributeValueList must be given and an array");
    }
@@ -365,31 +370,35 @@ bool check_compare(const rjson::value* v1, const rjson::value& v2, const Compara

 struct cmp_lt {
    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs < rhs; }
+    // We cannot use the normal comparison operators like "<" on the bytes
+    // type, because they treat individual bytes as signed but we need to
+    // compare them as *unsigned*. So we need a specialization for bytes.
+    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) < 0; }
    static constexpr const char* diagnostic = "LT operator";
 };

 struct cmp_le {
-    // bytes only has <, so we cannot use <=.
-    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs < rhs || lhs == rhs; }
+    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs <= rhs; }
+    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) <= 0; }
    static constexpr const char* diagnostic = "LE operator";
 };

 struct cmp_ge {
-    // bytes only has <, so we cannot use >=.
-    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return rhs < lhs || lhs == rhs; }
+    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs >= rhs; }
+    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) >= 0; }
    static constexpr const char* diagnostic = "GE operator";
 };

 struct cmp_gt {
-    // bytes only has <, so we cannot use >.
-    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return rhs < lhs; }
+    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs > rhs; }
+    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) > 0; }
    static constexpr const char* diagnostic = "GT operator";
 };

 // True if v is between lb and ub, inclusive.  Throws if lb > ub.
 template <typename T>
 bool check_BETWEEN(const T& v, const T& lb, const T& ub) {
-    if (ub < lb) {
+    if (cmp_lt()(ub, lb)) {
        throw api_error("ValidationException",
                        format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
    }
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -971,15 +971,24 @@ class attribute_collector {
    void add(bytes&& name, atomic_cell&& cell) {
        collected.emplace(std::move(name), std::move(cell));
    }
+    void add(const bytes& name, atomic_cell&& cell) {
+        collected.emplace(name, std::move(cell));
+    }
 public:
    attribute_collector() : collected(attrs_type()->get_keys_type()->as_less_comparator()) { }
-    void put(bytes&& name, bytes&& val, api::timestamp_type ts) {
-        add(std::move(name), atomic_cell::make_live(*bytes_type, ts, std::move(val), atomic_cell::collection_member::yes));
+    void put(bytes&& name, const bytes& val, api::timestamp_type ts) {
+        add(std::move(name), atomic_cell::make_live(*bytes_type, ts, val, atomic_cell::collection_member::yes));

    }
+    void put(const bytes& name, const bytes& val, api::timestamp_type ts) {
+        add(name, atomic_cell::make_live(*bytes_type, ts, val, atomic_cell::collection_member::yes));
+    }
    void del(bytes&& name, api::timestamp_type ts) {
        add(std::move(name), atomic_cell::make_dead(ts, gc_clock::now()));
    }
+    void del(const bytes& name, api::timestamp_type ts) {
+        add(name, atomic_cell::make_dead(ts, gc_clock::now()));
+    }
    collection_mutation_description to_mut() {
        collection_mutation_description ret;
        for (auto&& e : collected) {
@@ -1059,7 +1068,7 @@ public:
    put_or_delete_item(const rjson::value& item, schema_ptr schema, put_item);
    // put_or_delete_item doesn't keep a reference to schema (so it can be
    // moved between shards for LWT) so it needs to be given again to build():
-    mutation build(schema_ptr schema, api::timestamp_type ts);
+    mutation build(schema_ptr schema, api::timestamp_type ts) const;
    const partition_key& pk() const { return _pk; }
    const clustering_key& ck() const { return _ck; }
 };
@@ -1088,7 +1097,7 @@ put_or_delete_item::put_or_delete_item(const rjson::value& item, schema_ptr sche
    }
 }

-mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) {
+mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) const {
    mutation m(schema, _pk);
    // If there's no clustering key, a tombstone should be created directly
    // on a partition, not on a clustering row - otherwise it will look like
@@ -1110,7 +1119,7 @@ mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) {
    for (auto& c : *_cells) {
        const column_definition* cdef = schema->get_column_definition(c.column_name);
        if (!cdef) {
-            attrs_collector.put(std::move(c.column_name), std::move(c.value), ts);
+            attrs_collector.put(c.column_name, c.value, ts);
        } else {
            row.cells().apply(*cdef, atomic_cell::make_live(*cdef->type, ts, std::move(c.value)));
        }
@@ -1390,7 +1399,7 @@ public:
               check_needs_read_before_write(_condition_expression) ||
               _returnvalues == returnvalues::ALL_OLD;
    }
-    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) override {
+    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override {
        std::unordered_set<std::string> used_attribute_values;
        std::unordered_set<std::string> used_attribute_names;
        if (!verify_expected(_request, previous_item) ||
@@ -1402,6 +1411,7 @@ public:
            // efficient than throwing an exception.
            return {};
        }
+        _return_attributes = {};
        if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
            // previous_item is supposed to have been created with
            // describe_item(), so has the "Item" attribute:
@@ -1468,7 +1478,7 @@ public:
                check_needs_read_before_write(_condition_expression) ||
                _returnvalues == returnvalues::ALL_OLD;
    }
-    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) override {
+    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override {
        std::unordered_set<std::string> used_attribute_values;
        std::unordered_set<std::string> used_attribute_names;
        if (!verify_expected(_request, previous_item) ||
@@ -1480,6 +1490,7 @@ public:
            // efficient than throwing an exception.
            return {};
        }
+        _return_attributes = {};
        if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
            rjson::value* item = rjson::find(*previous_item, "Item");
            if (item) {
@@ -1563,7 +1574,7 @@ public:
    virtual ~put_or_delete_item_cas_request() = default;
    virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts) override {
        std::optional<mutation> ret;
-        for (put_or_delete_item& mutation_builder : _mutation_builders) {
+        for (const put_or_delete_item& mutation_builder : _mutation_builders) {
            // We assume all these builders have the same partition.
            if (ret) {
                ret->apply(mutation_builder.build(schema, ts));
@@ -2387,7 +2398,7 @@ public:

    update_item_operation(service::storage_proxy& proxy, rjson::value&& request);
    virtual ~update_item_operation() = default;
-    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) override;
+    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override;
    bool needs_read_before_write() const;
 };

@@ -2451,7 +2462,7 @@ update_item_operation::needs_read_before_write() const {
 }

 std::optional<mutation>
-update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) {
+update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const {
    std::unordered_set<std::string> used_attribute_values;
    std::unordered_set<std::string> used_attribute_names;
    if (!verify_expected(_request, previous_item) ||
--- a/alternator/rmw_operation.hh
+++ b/alternator/rmw_operation.hh
@@ -87,7 +87,11 @@ protected:
    // When _returnvalues != NONE, apply() should store here, in JSON form,
    // the values which are to be returned in the "Attributes" field.
    // The default null JSON means do not return an Attributes field at all.
-    rjson::value _return_attributes;
+    // This field is marked "mutable" so that the const apply() can modify
+    // it (see explanation below), but note that because apply() may be
+    // called more than once, if apply() will sometimes set this field it
+    // must set it (even if just to the default empty value) every time.
+    mutable rjson::value _return_attributes;
 public:
    // The constructor of a rmw_operation subclass should parse the request
    // and try to discover as many input errors as it can before really
@@ -100,7 +104,12 @@ public:
    // conditional expression, apply() should return an empty optional.
    // apply() may throw if it encounters input errors not discovered during
    // the constructor.
-    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) = 0;
+    // apply() may be called more than once in case of contention, so it must
+    // not change the state saved in the object (issue #7218 was caused by
+    // violating this). We mark apply() "const" to let the compiler validate
+    // this for us. The output-only field _return_attributes is marked
+    // "mutable" above so that apply() can still write to it.
+    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const = 0;
    // Convert the above apply() into the signature needed by cas_request:
    virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts) override;
    virtual ~rmw_operation() = default;
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -269,8 +269,8 @@ void set_storage_service(http_context& ctx, routes& r) {
                for (auto cf : column_families) {
                    column_families_vec.push_back(&db.find_column_family(keyspace, cf));
                }
-                return parallel_for_each(column_families_vec, [&cm] (column_family* cf) {
-                    return cm.perform_cleanup(cf);
+                return parallel_for_each(column_families_vec, [&cm, &db] (column_family* cf) {
+                    return cm.perform_cleanup(db, cf);
                });
            }).then([]{
                return make_ready_future<json::json_return_type>(0);
--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -1146,7 +1146,7 @@ public:
                    if (r.row().deleted_at()) {
                        touched_parts.set<stats::part_type::ROW_DELETE>();
                        cdc_op = operation::row_delete;
-                        if (pirow) {
+                        if (pirow && pikey) {
                            for (const column_definition& column: _schema->regular_columns()) {
                                assert(pirow->has(column.name_as_text()));
                                auto& cdef = *_log_schema->get_column_definition(log_data_column_name_bytes(column.name()));
--- a/configure.py
+++ b/configure.py
@@ -381,6 +381,7 @@ scylla_tests = set([
    'test/boost/view_schema_ckey_test',
    'test/boost/vint_serialization_test',
    'test/boost/virtual_reader_test',
+    'test/boost/stall_free_test',
    'test/manual/ec2_snitch_test',
    'test/manual/gce_snitch_test',
    'test/manual/gossip',
@@ -540,6 +541,7 @@ scylla_core = (['database.cc',
                'sstables/compaction_strategy.cc',
                'sstables/size_tiered_compaction_strategy.cc',
                'sstables/leveled_compaction_strategy.cc',
+                'sstables/time_window_compaction_strategy.cc',
                'sstables/compaction_manager.cc',
                'sstables/integrity_checked_file_impl.cc',
                'sstables/prepended_input_stream.cc',
@@ -1265,9 +1267,9 @@ def query_seastar_flags(pc_file, link_static_cxx=False):
    return cflags, libs

 for mode in build_modes:
-    seastar_cflags, seastar_libs = query_seastar_flags(pc[mode], link_static_cxx=args.staticcxx)
-    modes[mode]['seastar_cflags'] = seastar_cflags
-    modes[mode]['seastar_libs'] = seastar_libs
+    seastar_pc_cflags, seastar_pc_libs = query_seastar_flags(pc[mode], link_static_cxx=args.staticcxx)
+    modes[mode]['seastar_cflags'] = seastar_pc_cflags
+    modes[mode]['seastar_libs'] = seastar_pc_libs

 # We need to use experimental features of the zstd library (to use our own allocators for the (de)compression context),
 # which are available only when the library is linked statically.
@@ -1288,6 +1290,46 @@ def configure_zstd(build_dir, mode):
    os.makedirs(zstd_build_dir, exist_ok=True)
    subprocess.check_call(zstd_cmd, shell=False, cwd=zstd_build_dir)

+def configure_abseil(build_dir, mode):
+    abseil_build_dir = os.path.join(build_dir, mode, 'abseil')
+
+    abseil_cflags = seastar_cflags + ' ' + modes[mode]['cxx_ld_flags']
+    cmake_mode = MODE_TO_CMAKE_BUILD_TYPE[mode]
+    abseil_cmake_args = [
+        '-DCMAKE_BUILD_TYPE={}'.format(cmake_mode),
+        '-DCMAKE_INSTALL_PREFIX={}'.format(build_dir + '/inst'), # just to avoid a warning from absl
+        '-DCMAKE_C_COMPILER={}'.format(args.cc),
+        '-DCMAKE_CXX_COMPILER={}'.format(args.cxx),
+        '-DCMAKE_CXX_FLAGS_{}={}'.format(cmake_mode.upper(), abseil_cflags),
+    ]
+
+    abseil_cmd = ['cmake', '-G', 'Ninja', os.path.relpath('abseil', abseil_build_dir)] + abseil_cmake_args
+
+    os.makedirs(abseil_build_dir, exist_ok=True)
+    subprocess.check_call(abseil_cmd, shell=False, cwd=abseil_build_dir)
+
+abseil_libs = ['absl/' + lib for lib in [
+    'container/libabsl_hashtablez_sampler.a',
+    'container/libabsl_raw_hash_set.a',
+    'synchronization/libabsl_synchronization.a',
+    'synchronization/libabsl_graphcycles_internal.a',
+    'debugging/libabsl_stacktrace.a',
+    'debugging/libabsl_symbolize.a',
+    'debugging/libabsl_debugging_internal.a',
+    'debugging/libabsl_demangle_internal.a',
+    'time/libabsl_time.a',
+    'time/libabsl_time_zone.a',
+    'numeric/libabsl_int128.a',
+    'hash/libabsl_city.a',
+    'hash/libabsl_hash.a',
+    'base/libabsl_malloc_internal.a',
+    'base/libabsl_spinlock_wait.a',
+    'base/libabsl_base.a',
+    'base/libabsl_dynamic_annotations.a',
+    'base/libabsl_raw_logging_internal.a',
+    'base/libabsl_exponential_biased.a',
+    'base/libabsl_throw_delegate.a']]
+
 args.user_cflags += " " + pkg_config('jsoncpp', '--cflags')
 args.user_cflags += ' -march=' + args.target
 libs = ' '.join([maybe_static(args.staticyamlcpp, '-lyaml-cpp'), '-latomic', '-llz4', '-lz', '-lsnappy', pkg_config('jsoncpp', '--libs'),
@@ -1318,6 +1360,7 @@ if any(filter(thrift_version.startswith, thrift_boost_versions)):
 for pkg in pkgs:
    args.user_cflags += ' ' + pkg_config(pkg, '--cflags')
    libs += ' ' + pkg_config(pkg, '--libs')
+args.user_cflags += '-I abseil'
 user_cflags = args.user_cflags + ' -fvisibility=hidden'
 user_ldflags = args.user_ldflags + ' -fvisibility=hidden'
 if args.staticcxx:
@@ -1348,6 +1391,9 @@ else:
 for mode in build_modes:
    configure_zstd(outdir, mode)

+for mode in build_modes:
+    configure_abseil(outdir, mode)
+
 # configure.py may run automatically from an already-existing build.ninja.
 # If the user interrupts configure.py in the middle, we need build.ninja
 # to remain in a valid state.  So we write our output to a temporary
@@ -1485,6 +1531,8 @@ with open(buildfile_tmp, 'w') as f:
                objs.extend(['$builddir/' + mode + '/' + artifact for artifact in [
                    'libdeflate/libdeflate.a',
                    'zstd/lib/libzstd.a',
+                ] + [
+                    'abseil/' + x for x in abseil_libs
                ]])
                objs.append('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o')
                if binary in tests:
@@ -1638,6 +1686,12 @@ with open(buildfile_tmp, 'w') as f:
        f.write('  subdir = build/{mode}/zstd\n'.format(**locals()))
        f.write('  target = libzstd.a\n'.format(**locals()))

+        for lib in abseil_libs:
+            f.write('build build/{mode}/abseil/{lib}: ninja\n'.format(**locals()))
+            f.write('  pool = submodule_pool\n')
+            f.write('  subdir = build/{mode}/abseil\n'.format(**locals()))
+            f.write('  target = {lib}\n'.format(**locals()))
+
    mode = 'dev' if 'dev' in modes else modes[0]
    f.write('build checkheaders: phony || {}\n'.format(' '.join(['$builddir/{}/{}.o'.format(mode, hh) for hh in headers])))

--- a/cql3/functions/aggregate_fcts.cc
+++ b/cql3/functions/aggregate_fcts.cc
@@ -267,10 +267,13 @@ public:
    }
 };

-/// The same as `impl_max_function_for' but without knowledge of `Type'.
+/// The same as `impl_max_function_for' but without compile-time dependency on `Type'.
 class impl_max_dynamic_function final : public aggregate_function::aggregate {
+    data_type _io_type;
    opt_bytes _max;
 public:
+    impl_max_dynamic_function(data_type io_type) : _io_type(std::move(io_type)) {}
+
    virtual void reset() override {
        _max = {};
    }
@@ -278,12 +281,11 @@ public:
        return _max.value_or(bytes{});
    }
    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
-        if (!values[0]) {
+        if (values.empty() || !values[0]) {
            return;
        }
-        const auto val = *values[0];
-        if (!_max || *_max < val) {
-            _max = val;
+        if (!_max || _io_type->less(*_max, *values[0])) {
+            _max = values[0];
        }
    }
 };
@@ -298,10 +300,13 @@ public:
 };

 class max_dynamic_function final : public native_aggregate_function {
+    data_type _io_type;
 public:
-    max_dynamic_function(data_type io_type) : native_aggregate_function("max", io_type, { io_type }) {}
+    max_dynamic_function(data_type io_type)
+            : native_aggregate_function("max", io_type, { io_type })
+            , _io_type(std::move(io_type)) {}
    virtual std::unique_ptr<aggregate> new_aggregate() override {
-        return std::make_unique<impl_max_dynamic_function>();
+        return std::make_unique<impl_max_dynamic_function>(_io_type);
    }
 };

@@ -358,10 +363,13 @@ public:
    }
 };

-/// The same as `impl_min_function_for' but without knowledge of `Type'.
+/// The same as `impl_min_function_for' but without compile-time dependency on `Type'.
 class impl_min_dynamic_function final : public aggregate_function::aggregate {
+    data_type _io_type;
    opt_bytes _min;
 public:
+    impl_min_dynamic_function(data_type io_type) : _io_type(std::move(io_type)) {}
+
    virtual void reset() override {
        _min = {};
    }
@@ -369,12 +377,11 @@ public:
        return _min.value_or(bytes{});
    }
    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
-        if (!values[0]) {
+        if (values.empty() || !values[0]) {
            return;
        }
-        const auto val = *values[0];
-        if (!_min || val < *_min) {
-            _min = val;
+        if (!_min || _io_type->less(*values[0], *_min)) {
+            _min = values[0];
        }
    }
 };
@@ -389,10 +396,13 @@ public:
 };

 class min_dynamic_function final : public native_aggregate_function {
+    data_type _io_type;
 public:
-    min_dynamic_function(data_type io_type) : native_aggregate_function("min", io_type, { io_type }) {}
+    min_dynamic_function(data_type io_type)
+            : native_aggregate_function("min", io_type, { io_type })
+            , _io_type(std::move(io_type)) {}
    virtual std::unique_ptr<aggregate> new_aggregate() override {
-        return std::make_unique<impl_min_dynamic_function>();
+        return std::make_unique<impl_min_dynamic_function>(_io_type);
    }
 };

--- a/cql3/lists.cc
+++ b/cql3/lists.cc
@@ -357,7 +357,12 @@ lists::setter_by_uuid::execute(mutation& m, const clustering_key_prefix& prefix,

    collection_mutation_description mut;
    mut.cells.reserve(1);
-    mut.cells.emplace_back(to_bytes(*index), params.make_cell(*ltype->value_comparator(), *value, atomic_cell::collection_member::yes));
+
+    if (!value) {
+        mut.cells.emplace_back(to_bytes(*index), params.make_dead_cell());
+    } else {
+        mut.cells.emplace_back(to_bytes(*index), params.make_cell(*ltype->value_comparator(), *value, atomic_cell::collection_member::yes));
+    }

    m.set_cell(prefix, column, mut.serialize(*ltype));
 }
--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -417,7 +417,7 @@ std::vector<const column_definition*> statement_restrictions::get_column_defs_fo
                    _clustering_columns_restrictions->num_prefix_columns_that_need_not_be_filtered();
            for (auto&& cdef : _clustering_columns_restrictions->get_column_defs()) {
                ::shared_ptr<single_column_restriction> restr;
-                if (single_pk_restrs) {
+                if (single_ck_restrs) {
                    auto it = single_ck_restrs->restrictions().find(cdef);
                    if (it != single_ck_restrs->restrictions().end()) {
                        restr = dynamic_pointer_cast<single_column_restriction>(it->second);
--- a/cql3/statements/alter_table_statement.cc
+++ b/cql3/statements/alter_table_statement.cc
@@ -207,6 +207,9 @@ void alter_table_statement::add_column(const schema& schema, const table& cf, sc
                "because a collection with the same name and a different type has already been used in the past", column_name));
        }
    }
+    if (type->is_counter() && !schema.is_counter()) {
+        throw exceptions::configuration_exception(format("Cannot add a counter column ({}) in a non counter column family", column_name));
+    }

    cfm.with_column(column_name.name(), type, is_static ? column_kind::static_column : column_kind::regular_column);

@@ -222,7 +225,7 @@ void alter_table_statement::add_column(const schema& schema, const table& cf, sc
            schema_builder builder(view);
            if (view->view_info()->include_all_columns()) {
                builder.with_column(column_name.name(), type);
-            } else if (view->view_info()->base_non_pk_columns_in_view_pk().empty()) {
+            } else if (!view->view_info()->has_base_non_pk_columns_in_view_pk()) {
                db::view::create_virtual_column(builder, column_name.name(), type);
            }
            view_updates.push_back(view_ptr(builder.build()));
--- a/database.cc
+++ b/database.cc
@@ -1823,7 +1823,11 @@ future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_fun
                            // TODO: indexes.
                            // Note: since discard_sstables was changed to only count tables owned by this shard,
                            // we can get zero rp back. Changed assert, and ensure we save at least low_mark.
-                            assert(low_mark <= rp || rp == db::replay_position());
+                            // #6995 - the assert below was broken in c2c6c71 and remained so for many years. 
+                            // We nowadays do not flush tables with sstables but autosnapshot=false. This means
+                            // the low_mark assertion does not hold, because we maybe/probably never got around to 
+                            // creating the sstables that would create them.
+                            assert(!should_flush || low_mark <= rp || rp == db::replay_position());
                            rp = std::max(low_mark, rp);
                            return truncate_views(cf, truncated_at, should_flush).then([&cf, truncated_at, rp] {
                                // save_truncation_record() may actually fail after we cached the truncation time
@@ -2001,9 +2005,10 @@ flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db,
            reader_concurrency_semaphore* semaphore;
        };
        distributed<database>& _db;
+        utils::UUID _table_id;
        std::vector<reader_context> _contexts;
    public:
-        explicit streaming_reader_lifecycle_policy(distributed<database>& db) : _db(db), _contexts(smp::count) {
+        streaming_reader_lifecycle_policy(distributed<database>& db, utils::UUID table_id) : _db(db), _table_id(table_id), _contexts(smp::count) {
        }
        virtual flat_mutation_reader create_reader(
                schema_ptr schema,
@@ -2032,7 +2037,12 @@ flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db,
            });
        }
        virtual reader_concurrency_semaphore& semaphore() override {
-            return *_contexts[this_shard_id()].semaphore;
+            const auto shard = this_shard_id();
+            if (!_contexts[shard].semaphore) {
+                auto& cf = _db.local().find_column_family(_table_id);
+                _contexts[shard].semaphore = &cf.streaming_read_concurrency_semaphore();
+            }
+            return *_contexts[shard].semaphore;
        }
    };
    auto ms = mutation_source([&db] (schema_ptr s,
@@ -2043,7 +2053,8 @@ flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db,
            tracing::trace_state_ptr trace_state,
            streamed_mutation::forwarding,
            mutation_reader::forwarding fwd_mr) {
-        return make_multishard_combining_reader(make_shared<streaming_reader_lifecycle_policy>(db), std::move(s), pr, ps, pc,
+        auto table_id = s->id();
+        return make_multishard_combining_reader(make_shared<streaming_reader_lifecycle_policy>(db, table_id), std::move(s), pr, ps, pc,
                std::move(trace_state), fwd_mr);
    });
    auto&& full_slice = schema->full_slice();
--- a/database.hh
+++ b/database.hh
@@ -55,6 +55,7 @@
 #include <limits>
 #include <cstddef>
 #include "schema_fwd.hh"
+#include "db/view/view.hh"
 #include "db/schema_features.hh"
 #include "gms/feature.hh"
 #include "timestamp.hh"
@@ -903,7 +904,7 @@ public:
    lw_shared_ptr<const sstable_list> get_sstables_including_compacted_undeleted() const;
    const std::vector<sstables::shared_sstable>& compacted_undeleted_sstables() const;
    std::vector<sstables::shared_sstable> select_sstables(const dht::partition_range& range) const;
-    std::vector<sstables::shared_sstable> candidates_for_compaction() const;
+    std::vector<sstables::shared_sstable> non_staging_sstables() const;
    std::vector<sstables::shared_sstable> sstables_need_rewrite() const;
    size_t sstables_count() const;
    std::vector<uint64_t> sstable_count_per_level() const;
@@ -1008,8 +1009,9 @@ public:
        return *_config.sstables_manager;
    }

+    // Reader's schema must be the same as the base schema of each of the views.
    future<> populate_views(
-            std::vector<view_ptr>,
+            std::vector<db::view::view_and_base>,
            dht::token base_token,
            flat_mutation_reader&&);

@@ -1026,7 +1028,7 @@ private:
            const io_priority_class& io_priority, query::partition_slice::option_set custom_opts) const;
    std::vector<view_ptr> affected_views(const schema_ptr& base, const mutation& update) const;
    future<> generate_and_propagate_view_updates(const schema_ptr& base,
-            std::vector<view_ptr>&& views,
+            std::vector<db::view::view_and_base>&& views,
            mutation&& m,
            flat_mutation_reader_opt existings) const;

--- a/db/batchlog_manager.cc
+++ b/db/batchlog_manager.cc
@@ -290,7 +290,7 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
            mutation m(schema, key);
            auto now = service::client_state(service::client_state::internal_tag()).get_timestamp();
            m.partition().apply_delete(*schema, clustering_key_prefix::make_empty(), tombstone(now, gc_clock::now()));
-            return _qp.proxy().mutate_locally(m);
+            return _qp.proxy().mutate_locally(m, db::commitlog::force_sync::no);
        });
    };

--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -521,7 +521,7 @@ public:
            _segment_manager->totals.total_size_on_disk -= size_on_disk();
            _segment_manager->totals.total_size -= (size_on_disk() + _buffer.size_bytes());
            _segment_manager->add_file_to_delete(_file_name, _desc);
-        } else {
+        } else if (_segment_manager->cfg.warn_about_segments_left_on_disk_after_shutdown) {
            clogger.warn("Segment {} is dirty and is left on disk.", *this);
        }
    }
--- a/db/commitlog/commitlog.hh
+++ b/db/commitlog/commitlog.hh
@@ -137,6 +137,7 @@ public:

        bool reuse_segments = true;
        bool use_o_dsync = false;
+        bool warn_about_segments_left_on_disk_after_shutdown = true;

        const db::extensions * extensions = nullptr;
    };
--- a/db/commitlog/commitlog_replayer.cc
+++ b/db/commitlog/commitlog_replayer.cc
@@ -299,7 +299,7 @@ future<> db::commitlog_replayer::impl::process(stats* s, commitlog::buffer_and_r
                mutation m(cf.schema(), fm.decorated_key(*cf.schema()));
                converting_mutation_partition_applier v(cm, *cf.schema(), m.partition());
                fm.partition().accept(cm, v);
-                return do_with(std::move(m), [&db, &cf] (mutation m) {
+                return do_with(std::move(m), [&db, &cf] (const mutation& m) {
                    return db.apply_in_memory(m, cf, db::rp_handle(), db::no_timeout);
                });
            } else {
--- a/db/consistency_level.cc
+++ b/db/consistency_level.cc
@@ -61,7 +61,8 @@ namespace db {
 logging::logger cl_logger("consistency");

 size_t quorum_for(const keyspace& ks) {
-    return (ks.get_replication_strategy().get_replication_factor() / 2) + 1;
+    size_t replication_factor = ks.get_replication_strategy().get_replication_factor();
+    return replication_factor ? (replication_factor / 2) + 1 : 0;
 }

 size_t local_quorum_for(const keyspace& ks, const sstring& dc) {
@@ -72,8 +73,8 @@ size_t local_quorum_for(const keyspace& ks, const sstring& dc) {
    if (rs.get_type() == replication_strategy_type::network_topology) {
        const network_topology_strategy* nrs =
            static_cast<const network_topology_strategy*>(&rs);
-
-        return (nrs->get_replication_factor(dc) / 2) + 1;
+        size_t replication_factor = nrs->get_replication_factor(dc);
+        return replication_factor ? (replication_factor / 2) + 1 : 0;
    }

    return quorum_for(ks);
--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -224,7 +224,9 @@ future<> manager::end_point_hints_manager::stop(drain should_drain) noexcept {
        with_lock(file_update_mutex(), [this] {
            if (_hints_store_anchor) {
                hints_store_ptr tmp = std::exchange(_hints_store_anchor, nullptr);
-                return tmp->shutdown().finally([tmp] {});
+                return tmp->shutdown().finally([tmp] {
+                    return tmp->release();
+                }).finally([tmp] {});
            }
            return make_ready_future<>();
        }).handle_exception([&eptr] (auto e) { eptr = std::move(e); }).get();
@@ -326,6 +328,10 @@ future<db::commitlog> manager::end_point_hints_manager::add_store() noexcept {
            // HH doesn't utilize the flow that benefits from reusing segments.
            // Therefore let's simply disable it to avoid any possible confusion.
            cfg.reuse_segments = false;
+            // HH leaves segments on disk after commitlog shutdown, and later reads
+            // them when commitlog is re-created. This is expected to happen regularly
+            // during standard HH workload, so no need to print a warning about it.
+            cfg.warn_about_segments_left_on_disk_after_shutdown = false;

            return commitlog::create_commitlog(std::move(cfg)).then([this] (commitlog l) {
                // add_store() is triggered every time hint files are forcefully flushed to I/O (every hints_flush_period).
@@ -352,7 +358,9 @@ future<> manager::end_point_hints_manager::flush_current_hints() noexcept {
        return futurize_invoke([this] {
            return with_lock(file_update_mutex(), [this]() -> future<> {
                return get_or_load().then([] (hints_store_ptr cptr) {
-                    return cptr->shutdown();
+                    return cptr->shutdown().finally([cptr] {
+                        return cptr->release();
+                    }).finally([cptr] {});
                }).then([this] {
                    // Un-hold the commitlog object. Since we are under the exclusive _file_update_mutex lock there are no
                    // other hints_store_ptr copies and this would destroy the commitlog shared value.
@@ -812,7 +820,7 @@ void manager::end_point_hints_manager::sender::send_hints_maybe() noexcept {
    int replayed_segments_count = 0;

    try {
-        while (replay_allowed() && have_segments()) {
+        while (replay_allowed() && have_segments() && can_send()) {
            if (!send_one_file(*_segments_to_replay.begin())) {
                break;
            }
--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -822,6 +822,14 @@ future<> merge_schema(distributed<service::storage_proxy>& proxy, gms::feature_s
    });
 }

+future<> recalculate_schema_version(distributed<service::storage_proxy>& proxy, gms::feature_service& feat) {
+    return merge_lock().then([&proxy, &feat] {
+        return update_schema_version_and_announce(proxy, feat.cluster_schema_features());
+    }).finally([] {
+        return merge_unlock();
+    });
+}
+
 future<> merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations, bool do_flush)
 {
    return merge_lock().then([&proxy, mutations = std::move(mutations), do_flush] () mutable {
@@ -2905,10 +2913,6 @@ future<> maybe_update_legacy_secondary_index_mv_schema(service::migration_manage
    // format, where "token" is not marked as computed. Once we're sure that all indexes have their
    // columns marked as computed (because they were either created on a node that supports computed
    // columns or were fixed by this utility function), it's safe to remove this function altogether.
-    if (!db.features().cluster_supports_computed_columns()) {
-        return make_ready_future<>();
-    }
-
    if (v->clustering_key_size() == 0) {
        return make_ready_future<>();
    }
--- a/db/schema_tables.hh
+++ b/db/schema_tables.hh
@@ -170,6 +170,13 @@ future<> merge_schema(distributed<service::storage_proxy>& proxy, gms::feature_s

 future<> merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations, bool do_flush);

+// Recalculates the local schema version and publishes it in gossip.
+//
+// It is safe to call concurrently with recalculate_schema_version() and merge_schema() in which case it
+// is guaranteed that the schema version we end up with after all calls will reflect the most recent state
+// of feature_service and schema tables.
+future<> recalculate_schema_version(distributed<service::storage_proxy>& proxy, gms::feature_service& feat);
+
 future<std::set<sstring>> merge_keyspaces(distributed<service::storage_proxy>& proxy, schema_result&& before, schema_result&& after);

 std::vector<mutation> make_create_keyspace_mutations(lw_shared_ptr<keyspace_metadata> keyspace, api::timestamp_type timestamp, bool with_tables_and_types_and_functions = true);
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -130,17 +130,90 @@ const column_definition* view_info::view_column(const column_definition& base_de
    return _schema.get_column_definition(base_def.name());
 }

-const std::vector<column_id>& view_info::base_non_pk_columns_in_view_pk() const {
+void view_info::set_base_info(db::view::base_info_ptr base_info) {
+    _base_info = std::move(base_info);
+}
+
+// A constructor for a base info that can facilitate reads and writes from the materialized view.
+db::view::base_dependent_view_info::base_dependent_view_info(schema_ptr base_schema, std::vector<column_id>&& base_non_pk_columns_in_view_pk)
+        : _base_schema{std::move(base_schema)}
+        , _base_non_pk_columns_in_view_pk{std::move(base_non_pk_columns_in_view_pk)}
+        , has_base_non_pk_columns_in_view_pk{!_base_non_pk_columns_in_view_pk.empty()}
+        , use_only_for_reads{false} {
+
+}
+
+// A constructor for a base info that can facilitate only reads from the materialized view.
+db::view::base_dependent_view_info::base_dependent_view_info(bool has_base_non_pk_columns_in_view_pk)
+        : _base_schema{nullptr}
+        , has_base_non_pk_columns_in_view_pk{has_base_non_pk_columns_in_view_pk}
+        , use_only_for_reads{true} {
+}
+
+const std::vector<column_id>& db::view::base_dependent_view_info::base_non_pk_columns_in_view_pk() const {
+    if (use_only_for_reads) {
+        on_internal_error(vlogger, "base_non_pk_columns_in_view_pk(): operation unsupported when initialized only for view reads.");
+    }
    return _base_non_pk_columns_in_view_pk;
 }

-void view_info::initialize_base_dependent_fields(const schema& base) {
+const schema_ptr& db::view::base_dependent_view_info::base_schema() const {
+    if (use_only_for_reads) {
+        on_internal_error(vlogger, "base_schema(): operation unsupported when initialized only for view reads.");
+    }
+    return _base_schema;
+}
+
+db::view::base_info_ptr view_info::make_base_dependent_view_info(const schema& base) const {
+    std::vector<column_id> base_non_pk_columns_in_view_pk;
+    bool has_base_non_pk_columns_in_view_pk = false;
+    bool can_only_read_from_view = false;
+
    for (auto&& view_col : boost::range::join(_schema.partition_key_columns(), _schema.clustering_key_columns())) {
+        if (view_col.is_computed()) {
+            // we are not going to find it in the base table...
+            continue;
+        }
        auto* base_col = base.get_column_definition(view_col.name());
        if (base_col && !base_col->is_primary_key()) {
-            _base_non_pk_columns_in_view_pk.push_back(base_col->id);
+            base_non_pk_columns_in_view_pk.push_back(base_col->id);
+            has_base_non_pk_columns_in_view_pk = true;
+        } else if (!base_col) {
+            // If we didn't find the column in the base table then it must have been deleted
+            // or not yet added (by alter command), this means it is for sure not a pk column
+            // in the base table. This can happen if the version of the base schema is not the
+            // one that the view was created with. Setting this schema as the base can't harm since
+            // if we got to such a situation then it means it is only going to be used for reading
+            // (computation of shadowable tombstones) and in that case the existence of such a column
+            // is the only thing that is of interest to us.
+            has_base_non_pk_columns_in_view_pk = true;
+            can_only_read_from_view = true;
+
+            // We can break the loop here since we have the info we wanted and the list
+            // of columns is not going to be reliable anyhow.
+            break;
        }
    }
+
+    if (can_only_read_from_view) {
+        return make_lw_shared<db::view::base_dependent_view_info>(has_base_non_pk_columns_in_view_pk);
+    } else {
+        return make_lw_shared<db::view::base_dependent_view_info>(base.shared_from_this(), std::move(base_non_pk_columns_in_view_pk));
+    }
+
+}
+
+bool view_info::has_base_non_pk_columns_in_view_pk() const {
+    // The base info is not always available, this is because
+    // the base info initialization is separate from the view
+    // info construction. If we are trying to get this info without
+    // initializing the base information it means that we have a
+    // schema integrity problem as the creator of owning view schema
+    // didn't make sure to initialize it with base information.
+    if (!_base_info) {
+        on_internal_error(vlogger, "Tried to perform a view query which is base info dependant without initializing it");
+    }
+    return _base_info->has_base_non_pk_columns_in_view_pk;
 }

 namespace db {
@@ -188,11 +261,11 @@ bool may_be_affected_by(const schema& base, const view_info& view, const dht::de
 }

 static bool update_requires_read_before_write(const schema& base,
-        const std::vector<view_ptr>& views,
+        const std::vector<view_and_base>& views,
        const dht::decorated_key& key,
        const rows_entry& update) {
    for (auto&& v : views) {
-        view_info& vf = *v->view_info();
+        view_info& vf = *v.view->view_info();
        if (may_be_affected_by(base, vf, key, update)) {
            return true;
        }
@@ -239,12 +312,14 @@ class view_updates final {
    view_ptr _view;
    const view_info& _view_info;
    schema_ptr _base;
+    base_info_ptr _base_info;
    std::unordered_map<partition_key, mutation_partition, partition_key::hashing, partition_key::equality> _updates;
 public:
-    explicit view_updates(view_ptr view, schema_ptr base)
-            : _view(std::move(view))
+    explicit view_updates(view_and_base vab)
+            : _view(std::move(vab.view))
            , _view_info(*_view->view_info())
-            , _base(std::move(base))
+            , _base(vab.base->base_schema())
+            , _base_info(vab.base)
            , _updates(8, partition_key::hashing(*_view), partition_key::equality(*_view)) {
    }

@@ -306,7 +381,7 @@ row_marker view_updates::compute_row_marker(const clustering_row& base_row) cons
    // they share liveness information. It's true especially in the only case currently allowed by CQL,
    // which assumes there's up to one non-pk column in the view key. It's also true in alternator,
    // which does not carry TTL information.
-    const auto& col_ids = _view_info.base_non_pk_columns_in_view_pk();
+    const auto& col_ids = _base_info->base_non_pk_columns_in_view_pk();
    if (!col_ids.empty()) {
        auto& def = _base->regular_column_at(col_ids[0]);
        // Note: multi-cell columns can't be part of the primary key.
@@ -537,7 +612,7 @@ void view_updates::delete_old_entry(const partition_key& base_key, const cluster

 void view_updates::do_delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now) {
    auto& r = get_view_row(base_key, existing);
-    const auto& col_ids = _view_info.base_non_pk_columns_in_view_pk();
+    const auto& col_ids = _base_info->base_non_pk_columns_in_view_pk();
    if (!col_ids.empty()) {
        // We delete the old row using a shadowable row tombstone, making sure that
        // the tombstone deletes everything in the row (or it might still show up).
@@ -678,7 +753,7 @@ void view_updates::generate_update(
        return;
    }

-    const auto& col_ids = _view_info.base_non_pk_columns_in_view_pk();
+    const auto& col_ids = _base_info->base_non_pk_columns_in_view_pk();
    if (col_ids.empty()) {
        // The view key is necessarily the same pre and post update.
        if (existing && existing->is_live(*_base)) {
@@ -932,11 +1007,16 @@ future<stop_iteration> view_update_builder::on_results() {

 future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
        const schema_ptr& base,
-        std::vector<view_ptr>&& views_to_update,
+        std::vector<view_and_base>&& views_to_update,
        flat_mutation_reader&& updates,
        flat_mutation_reader_opt&& existings) {
-    auto vs = boost::copy_range<std::vector<view_updates>>(views_to_update | boost::adaptors::transformed([&] (auto&& v) {
-        return view_updates(std::move(v), base);
+    auto vs = boost::copy_range<std::vector<view_updates>>(views_to_update | boost::adaptors::transformed([&] (view_and_base v) {
+        if (base->version() != v.base->base_schema()->version()) {
+            on_internal_error(vlogger, format("Schema version used for view updates ({}) does not match the current"
+                                              " base schema version of the view ({}) for view {}.{} of {}.{}",
+                base->version(), v.base->base_schema()->version(), v.view->ks_name(), v.view->cf_name(), base->ks_name(), base->cf_name()));
+        }
+        return view_updates(std::move(v));
    }));
    auto builder = std::make_unique<view_update_builder>(base, std::move(vs), std::move(updates), std::move(existings));
    auto f = builder->build();
@@ -946,18 +1026,18 @@ future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
 query::clustering_row_ranges calculate_affected_clustering_ranges(const schema& base,
        const dht::decorated_key& key,
        const mutation_partition& mp,
-        const std::vector<view_ptr>& views) {
+        const std::vector<view_and_base>& views) {
    std::vector<nonwrapping_range<clustering_key_prefix_view>> row_ranges;
    std::vector<nonwrapping_range<clustering_key_prefix_view>> view_row_ranges;
    clustering_key_prefix_view::tri_compare cmp(base);
    if (mp.partition_tombstone() || !mp.row_tombstones().empty()) {
        for (auto&& v : views) {
            // FIXME: #2371
-            if (v->view_info()->select_statement().get_restrictions()->has_unrestricted_clustering_columns()) {
+            if (v.view->view_info()->select_statement().get_restrictions()->has_unrestricted_clustering_columns()) {
                view_row_ranges.push_back(nonwrapping_range<clustering_key_prefix_view>::make_open_ended_both_sides());
                break;
            }
-            for (auto&& r : v->view_info()->partition_slice().default_row_ranges()) {
+            for (auto&& r : v.view->view_info()->partition_slice().default_row_ranges()) {
                view_row_ranges.push_back(r.transform(std::mem_fn(&clustering_key_prefix::view)));
            }
        }
@@ -1720,7 +1800,7 @@ public:
            return stop_iteration::yes;
        }

-        _fragments_memory_usage += cr.memory_usage(*_step.base->schema());
+        _fragments_memory_usage += cr.memory_usage(*_step.reader.schema());
        _fragments.push_back(std::move(cr));
        if (_fragments_memory_usage > batch_memory_max) {
            // Although we have not yet completed the batch of base rows that
@@ -1740,10 +1820,14 @@ public:
        _builder._as.check();
        if (!_fragments.empty()) {
            _fragments.push_front(partition_start(_step.current_key, tombstone()));
+            auto base_schema = _step.base->schema();
+            auto views = with_base_info_snapshot(_views_to_build);
+            auto reader = make_flat_mutation_reader_from_fragments(_step.reader.schema(), std::move(_fragments));
+            reader.upgrade_schema(base_schema);
            _step.base->populate_views(
-                    _views_to_build,
+                    std::move(views),
                    _step.current_token(),
-                    make_flat_mutation_reader_from_fragments(_step.base->schema(), std::move(_fragments))).get();
+                    std::move(reader)).get();
            _fragments.clear();
            _fragments_memory_usage = 0;
        }
@@ -1890,5 +1974,11 @@ future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_d
    });
 }

+std::vector<db::view::view_and_base> with_base_info_snapshot(std::vector<view_ptr> vs) {
+    return boost::copy_range<std::vector<db::view::view_and_base>>(vs | boost::adaptors::transformed([] (const view_ptr& v) {
+        return db::view::view_and_base{v, v->view_info()->base_info()};
+    }));
+}
+
 } // namespace view
 } // namespace db
--- a/db/view/view.hh
+++ b/db/view/view.hh
@@ -43,6 +43,46 @@ namespace db {

 namespace view {

+// Part of the view description which depends on the base schema version.
+//
+// This structure may change even though the view schema doesn't change, so
+// it needs to live outside view_ptr.
+struct base_dependent_view_info {
+private:
+    schema_ptr _base_schema;
+    // Id of a regular base table column included in the view's PK, if any.
+    // Scylla views only allow one such column, alternator can have up to two.
+    std::vector<column_id> _base_non_pk_columns_in_view_pk;
+public:
+    const std::vector<column_id>& base_non_pk_columns_in_view_pk() const;
+    const schema_ptr& base_schema() const;
+
+    // Indicates if the view has pk columns which are not part of the base
+    // pk. It seems that !base_non_pk_columns_in_view_pk.empty() is the same,
+    // but actually there are cases where we can compute this boolean without
+    // succeeding to reliably build the former.
+    const bool has_base_non_pk_columns_in_view_pk;
+
+    // If base_non_pk_columns_in_view_pk couldn't reliably be built, this base
+    // info can't be used for computing view updates, only for reading the materialized
+    // view.
+    const bool use_only_for_reads;
+
+    // A constructor for a base info that can facilitate reads and writes from the materialized view.
+    base_dependent_view_info(schema_ptr base_schema, std::vector<column_id>&& base_non_pk_columns_in_view_pk);
+    // A constructor for a base info that can facilitate only reads from the materialized view.
+    base_dependent_view_info(bool has_base_non_pk_columns_in_view_pk);
+};
+
+// Immutable snapshot of view's base-schema-dependent part.
+using base_info_ptr = lw_shared_ptr<const base_dependent_view_info>;
+
+// Snapshot of the view schema and its base-schema-dependent part.
+struct view_and_base {
+    view_ptr view;
+    base_info_ptr base;
+};
+
 /**
 * Whether the view filter considers the specified partition key.
 *
@@ -92,7 +132,7 @@ bool clustering_prefix_matches(const schema& base, const partition_key& key, con

 future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
        const schema_ptr& base,
-        std::vector<view_ptr>&& views_to_update,
+        std::vector<view_and_base>&& views_to_update,
        flat_mutation_reader&& updates,
        flat_mutation_reader_opt&& existings);

@@ -100,7 +140,7 @@ query::clustering_row_ranges calculate_affected_clustering_ranges(
        const schema& base,
        const dht::decorated_key& key,
        const mutation_partition& mp,
-        const std::vector<view_ptr>& views);
+        const std::vector<view_and_base>& views);

 struct wait_for_all_updates_tag {};
 using wait_for_all_updates = bool_class<wait_for_all_updates_tag>;
@@ -128,6 +168,13 @@ future<> mutate_MV(
 */
 void create_virtual_column(schema_builder& builder, const bytes& name, const data_type& type);

+/**
+ * Converts a collection of view schema snapshots into a collection of
+ * view_and_base objects, which are snapshots of both the view schema
+ * and the base-schema-dependent part of view description.
+ */
+std::vector<view_and_base> with_base_info_snapshot(std::vector<view_ptr>);
+
 }

 }
--- a/db/view/view_update_generator.cc
+++ b/db/view/view_update_generator.cc
@@ -36,16 +36,17 @@ future<> view_update_generator::start() {
                _pending_sstables.wait().get();
            }

+            // To ensure we don't race with updates, move the entire content
+            // into a local variable.
+            auto sstables_with_tables = std::exchange(_sstables_with_tables, {});
+
            // If we got here, we will process all tables we know about so far eventually so there
            // is no starvation
-            for (auto table_it = _sstables_with_tables.begin(); table_it != _sstables_with_tables.end(); table_it = _sstables_with_tables.erase(table_it)) {
-                auto& [t, t_sstables] = *table_it;
+            for (auto table_it = sstables_with_tables.begin(); table_it != sstables_with_tables.end(); table_it = sstables_with_tables.erase(table_it)) {
+                auto& [t, sstables] = *table_it;
                schema_ptr s = t->schema();

-                vug_logger.trace("Processing {}.{}: {} sstables", s->ks_name(), s->cf_name(), t_sstables.size());
-
-                // Copy what we have so far so we don't miss new updates
-                auto sstables = std::exchange(t_sstables, {});
+                vug_logger.trace("Processing {}.{}: {} sstables", s->ks_name(), s->cf_name(), sstables.size());

                const auto num_sstables = sstables.size();

--- a/digest_algorithm.hh
+++ b/digest_algorithm.hh
@@ -28,7 +28,8 @@ namespace query {
 enum class digest_algorithm : uint8_t {
    none = 0,  // digest not required
    MD5 = 1,
-    xxHash = 2,// default algorithm
+    legacy_xxHash_without_null_digest = 2,
+    xxHash = 3, // default algorithm
 };

 }
--- a/digester.hh
+++ b/digester.hh
@@ -36,7 +36,7 @@ struct noop_hasher {
 };

 class digester final {
-    std::variant<noop_hasher, md5_hasher, xx_hasher> _impl;
+    std::variant<noop_hasher, md5_hasher, xx_hasher, legacy_xx_hasher_without_null_digest> _impl;

 public:
    explicit digester(digest_algorithm algo) {
@@ -47,6 +47,9 @@ public:
        case digest_algorithm::xxHash:
            _impl = xx_hasher();
            break;
+        case digest_algorithm::legacy_xxHash_without_null_digest:
+            _impl = legacy_xx_hasher_without_null_digest();
+            break;
        case digest_algorithm ::none:
            _impl = noop_hasher();
            break;
--- a/dist/common/scripts/node_exporter_install
+++ b/dist/common/scripts/node_exporter_install
@@ -42,6 +42,11 @@ if __name__ == '__main__':
    if node_exporter_p.exists() or (bindir_p() / 'prometheus-node_exporter').exists():
        if force:
            print('node_exporter already installed, reinstalling')
+            try:
+                node_exporter = systemd_unit('node-exporter.service')
+                node_exporter.stop()
+            except:
+                pass
        else:
            print('node_exporter already installed, you can use `--force` to force reinstallation')
            sys.exit(1)
--- a/dist/common/scripts/scylla_util.py
+++ b/dist/common/scripts/scylla_util.py
@@ -96,7 +96,7 @@ def curl(url, byte=False):
                    return res.read()
                else:
                    return res.read().decode('utf-8')
-        except urllib.error.HTTPError:
+        except urllib.error.URLError:
            logging.warn("Failed to grab %s..." % url)
            time.sleep(5)
            retries += 1
@@ -182,7 +182,7 @@ class aws_instance:
        instance_size = self.instance_size()
        if instance_class in ['c3', 'c4', 'd2', 'i2', 'r3']:
            return 'ixgbevf'
-        if instance_class in ['a1', 'c5', 'c5d', 'f1', 'g3', 'g4', 'h1', 'i3', 'i3en', 'inf1', 'm5', 'm5a', 'm5ad', 'm5d', 'm5dn', 'm5n', 'm6g', 'p2', 'p3', 'r4', 'r5', 'r5a', 'r5ad', 'r5d', 'r5dn', 'r5n', 't3', 't3a', 'u-6tb1', 'u-9tb1', 'u-12tb1', 'u-18tn1', 'u-24tb1', 'x1', 'x1e', 'z1d']:
+        if instance_class in ['a1', 'c5', 'c5a', 'c5d', 'c5n', 'c6g', 'c6gd', 'f1', 'g3', 'g4', 'h1', 'i3', 'i3en', 'inf1', 'm5', 'm5a', 'm5ad', 'm5d', 'm5dn', 'm5n', 'm6g', 'm6gd', 'p2', 'p3', 'r4', 'r5', 'r5a', 'r5ad', 'r5d', 'r5dn', 'r5n', 't3', 't3a', 'u-6tb1', 'u-9tb1', 'u-12tb1', 'u-18tb1', 'u-24tb1', 'x1', 'x1e', 'z1d']:
            return 'ena'
        if instance_class == 'm4':
            if instance_size == '16xlarge':
@@ -219,7 +219,7 @@ class aws_instance:

    def ebs_disks(self):
        """Returns all EBS disks"""
-        return set(self._disks["ephemeral"])
+        return set(self._disks["ebs"])

    def public_ipv4(self):
        """Returns the public IPv4 address of this instance"""
@@ -327,9 +327,7 @@ class scylla_cpuinfo:
            return len(self._cpu_data["system"])


-# When a CLI tool is not installed, use relocatable CLI tool provided by Scylla
 scylla_env = os.environ.copy()
-scylla_env['PATH'] =  '{}:{}'.format(scylla_env['PATH'], scyllabindir())

 def run(cmd, shell=False, silent=False, exception=True):
    stdout = subprocess.DEVNULL if silent else None
@@ -441,6 +439,19 @@ def dist_ver():
    return platform.dist()[1]


+SYSTEM_PARTITION_UUIDS = [
+        '21686148-6449-6e6f-744e-656564454649', # BIOS boot partition
+        'c12a7328-f81f-11d2-ba4b-00a0c93ec93b', # EFI system partition
+        '024dee41-33e7-11d3-9d69-0008c781f39f'  # MBR partition scheme
+]
+
+def get_partition_uuid(dev):
+    return out(f'lsblk -n -oPARTTYPE {dev}')
+
+def is_system_partition(dev):
+    uuid = get_partition_uuid(dev)
+    return (uuid in SYSTEM_PARTITION_UUIDS)
+
 def is_unused_disk(dev):
    # dev is not in /sys/class/block/, like /dev/nvme[0-9]+
    if not os.path.isdir('/sys/class/block/{dev}'.format(dev=dev.replace('/dev/', ''))):
@@ -448,7 +459,8 @@ def is_unused_disk(dev):
    try:
        fd = os.open(dev, os.O_EXCL)
        os.close(fd)
-        return True
+        # dev is not reserved for system
+        return not is_system_partition(dev)
    except OSError:
        return False

--- a/dist/common/sysctl.d/99-scylla-inotify.conf
+++ b/dist/common/sysctl.d/99-scylla-inotify.conf
@@ -0,0 +1,4 @@
+# allocate enough inotify instances for large machines
+# each tls instance needs 1 inotify instance, and there can be
+# multiple tls instances per shard.
+fs.inotify.max_user_instances = 1200
--- a/dist/debian/control.mustache
+++ b/dist/debian/control.mustache
@@ -5,6 +5,7 @@ Section: database
 Priority: optional
 X-Python3-Version: >= 3.4
 Standards-Version: 3.9.5
+Rules-Requires-Root: no

 Package: {{product}}-conf
 Architecture: any
--- a/dist/debian/debian/scylla-kernel-conf.postinst
+++ b/dist/debian/debian/scylla-kernel-conf.postinst
@@ -11,6 +11,7 @@ else
    sysctl -p/usr/lib/sysctl.d/99-scylla-sched.conf || :
    sysctl -p/usr/lib/sysctl.d/99-scylla-aio.conf || :
    sysctl -p/usr/lib/sysctl.d/99-scylla-vm.conf || :
+    sysctl -p/usr/lib/sysctl.d/99-scylla-inotify.conf || :
 fi

 #DEBHELPER#
--- a/dist/debian/python3/control.mustache
+++ b/dist/debian/python3/control.mustache
@@ -5,6 +5,7 @@ Section: python
 Priority: optional
 X-Python3-Version: >= 3.4
 Standards-Version: 3.9.5
+Rules-Requires-Root: no

 Package: {{product}}-python3
 Architecture: amd64
--- a/dist/debian/rules.mustache
+++ b/dist/debian/rules.mustache
@@ -37,6 +37,7 @@ override_dh_strip:
 	# The binaries (ethtool...patchelf) don't pass dh_strip after going through patchelf. Since they are
 	# already stripped, nothing is lost if we exclude them, so that's what we do.
 	dh_strip -Xlibprotobuf.so.15 -Xld.so -Xethtool -Xgawk -Xgzip -Xhwloc-calc -Xhwloc-distrib -Xifconfig -Xlscpu -Xnetstat -Xpatchelf --dbg-package={{product}}-server-dbg
+	find $(CURDIR)/debian/{{product}}-server-dbg/usr/lib/debug/.build-id/ -name "*.debug" -exec objcopy --decompress-debug-sections {} \;

 override_dh_makeshlibs:

--- a/dist/redhat/scylla.spec.mustache
+++ b/dist/redhat/scylla.spec.mustache
@@ -130,10 +130,9 @@ rm -rf $RPM_BUILD_ROOT
 %attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla-housekeeping
 %ghost /etc/systemd/system/scylla-helper.slice.d/
 %ghost /etc/systemd/system/scylla-helper.slice.d/memory.conf
-%ghost /etc/systemd/system/scylla-server.service.d/
 %ghost /etc/systemd/system/scylla-server.service.d/capabilities.conf
 %ghost /etc/systemd/system/scylla-server.service.d/mounts.conf
-%ghost /etc/systemd/system/scylla-server.service.d/dependencies.conf
+/etc/systemd/system/scylla-server.service.d/dependencies.conf
 %ghost /etc/systemd/system/var-lib-systemd-coredump.mount

 %package conf
@@ -189,6 +188,8 @@ Summary:        Scylla configuration package for the Linux kernel
 License:        AGPLv3
 URL:            http://www.scylladb.com/
 Requires:       kmod
+# tuned overwrites our sysctl settings
+Obsoletes:     tuned

 %description kernel-conf
 This package contains Linux kernel configuration changes for the Scylla database.  Install this package
@@ -200,6 +201,7 @@ if Scylla is the main application on your server and you wish to optimize its la
 /usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
 /usr/lib/systemd/systemd-sysctl 99-scylla-aio.conf >/dev/null 2>&1 || :
 /usr/lib/systemd/systemd-sysctl 99-scylla-vm.conf >/dev/null 2>&1 || :
+/usr/lib/systemd/systemd-sysctl 99-scylla-inotify.conf >/dev/null 2>&1 || :

 %files kernel-conf
 %defattr(-,root,root)
--- a/flat_mutation_reader.hh
+++ b/flat_mutation_reader.hh
@@ -488,6 +488,9 @@ public:
    size_t buffer_size() const {
        return _impl->buffer_size();
    }
+    const circular_buffer<mutation_fragment>& buffer() const {
+        return _impl->buffer();
+    }
    // Detach the internal buffer of the reader.
    // Roughly equivalent to depleting it by calling pop_mutation_fragment()
    // until is_buffer_empty() returns true.
--- a/gms/feature.hh
+++ b/gms/feature.hh
@@ -140,6 +140,7 @@ extern const std::string_view NONFROZEN_UDTS;
 extern const std::string_view HINTED_HANDOFF_SEPARATE_CONNECTION;
 extern const std::string_view LWT;
 extern const std::string_view PER_TABLE_PARTITIONERS;
+extern const std::string_view DIGEST_FOR_NULL_VALUES;

 }

--- a/gms/feature_service.cc
+++ b/gms/feature_service.cc
@@ -55,6 +55,7 @@ constexpr std::string_view features::NONFROZEN_UDTS = "NONFROZEN_UDTS";
 constexpr std::string_view features::HINTED_HANDOFF_SEPARATE_CONNECTION = "HINTED_HANDOFF_SEPARATE_CONNECTION";
 constexpr std::string_view features::LWT = "LWT";
 constexpr std::string_view features::PER_TABLE_PARTITIONERS = "PER_TABLE_PARTITIONERS";
+constexpr std::string_view features::DIGEST_FOR_NULL_VALUES = "DIGEST_FOR_NULL_VALUES";

 static logging::logger logger("features");

@@ -88,8 +89,9 @@ feature_service::feature_service(feature_config cfg) : _config(cfg)
        , _nonfrozen_udts(*this, features::NONFROZEN_UDTS)
        , _hinted_handoff_separate_connection(*this, features::HINTED_HANDOFF_SEPARATE_CONNECTION)
        , _lwt_feature(*this, features::LWT)
-        , _per_table_partitioners_feature(*this, features::PER_TABLE_PARTITIONERS) {
-}
+        , _per_table_partitioners_feature(*this, features::PER_TABLE_PARTITIONERS)
+        , _digest_for_null_values_feature(*this, features::DIGEST_FOR_NULL_VALUES)
+{}

 feature_config feature_config_from_db_config(db::config& cfg) {
    feature_config fcfg;
@@ -163,6 +165,7 @@ std::set<std::string_view> feature_service::known_feature_set() {
        gms::features::UNBOUNDED_RANGE_TOMBSTONES,
        gms::features::HINTED_HANDOFF_SEPARATE_CONNECTION,
        gms::features::PER_TABLE_PARTITIONERS,
+        gms::features::DIGEST_FOR_NULL_VALUES,
    };

    if (_config.enable_sstables_mc_format) {
@@ -254,6 +257,7 @@ void feature_service::enable(const std::set<std::string_view>& list) {
        std::ref(_hinted_handoff_separate_connection),
        std::ref(_lwt_feature),
        std::ref(_per_table_partitioners_feature),
+        std::ref(_digest_for_null_values_feature),
    })
    {
        if (list.count(f.name())) {
--- a/gms/feature_service.hh
+++ b/gms/feature_service.hh
@@ -96,6 +96,7 @@ private:
    gms::feature _hinted_handoff_separate_connection;
    gms::feature _lwt_feature;
    gms::feature _per_table_partitioners_feature;
+    gms::feature _digest_for_null_values_feature;

 public:
    bool cluster_supports_range_tombstones() const {
@@ -166,6 +167,10 @@ public:
        return _per_table_partitioners_feature;
    }

+    const feature& cluster_supports_digest_for_null_values() const {
+        return _digest_for_null_values_feature;
+    }
+
    bool cluster_supports_row_level_repair() const {
        return bool(_row_level_repair_feature);
    }
--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -428,6 +428,7 @@ future<> gossiper::handle_shutdown_msg(inet_address from) {
        return make_ready_future<>();
    }
    return seastar::async([this, from] {
+        auto permit = this->lock_endpoint(from).get0();
        this->mark_as_shutdown(from);
    });
 }
--- a/install.sh
+++ b/install.sh
@@ -132,6 +132,7 @@ relocate_python3() {
    cp "$script" "$relocateddir"
    cat > "$install"<<EOF
 #!/usr/bin/env bash
+export LC_ALL=en_US.UTF-8
 x="\$(readlink -f "\$0")"
 b="\$(basename "\$x")"
 d="\$(dirname "\$x")"
@@ -143,7 +144,7 @@ DEBIAN_SSL_CERT_FILE="/etc/ssl/certs/ca-certificates.crt"
 if [ -f "\${DEBIAN_SSL_CERT_FILE}" ]; then
  c=\${DEBIAN_SSL_CERT_FILE}
 fi
-PYTHONPATH="\${d}:\${d}/libexec:\$PYTHONPATH" PATH="\${d}/$pythonpath:\${PATH}" SSL_CERT_FILE="\${c}" exec -a "\$0" "\${d}/libexec/\${b}" "\$@"
+PYTHONPATH="\${d}:\${d}/libexec:\$PYTHONPATH" PATH="\${d}/../bin:\${d}/$pythonpath:\${PATH}" SSL_CERT_FILE="\${c}" exec -a "\$0" "\${d}/libexec/\${b}" "\$@"
 EOF
    chmod +x "$install"
 }
@@ -378,5 +379,9 @@ elif ! $packaging; then
    chown -R scylla:scylla $rdata
    chown -R scylla:scylla $rhkdata

+    for file in dist/common/sysctl.d/*.conf; do
+        bn=$(basename "$file")
+        sysctl -p "$rusr"/lib/sysctl.d/"$bn"
+    done
    $rprefix/scripts/scylla_post_install.sh
 fi
--- a/licenses/abseil-license.txt
+++ b/licenses/abseil-license.txt
@@ -0,0 +1,203 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        https://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       https://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
--- a/locator/abstract_replication_strategy.cc
+++ b/locator/abstract_replication_strategy.cc
@@ -168,15 +168,33 @@ insert_token_range_to_sorted_container_while_unwrapping(

 dht::token_range_vector
 abstract_replication_strategy::get_ranges(inet_address ep) const {
-    return get_ranges(ep, _token_metadata);
+    return do_get_ranges(ep, _token_metadata, false);
+}
+
+dht::token_range_vector
+abstract_replication_strategy::get_ranges_in_thread(inet_address ep) const {
+    return do_get_ranges(ep, _token_metadata, true);
 }

 dht::token_range_vector
 abstract_replication_strategy::get_ranges(inet_address ep, token_metadata& tm) const {
+    return do_get_ranges(ep, tm, false);
+}
+
+dht::token_range_vector
+abstract_replication_strategy::get_ranges_in_thread(inet_address ep, token_metadata& tm) const {
+    return do_get_ranges(ep, tm, true);
+}
+
+dht::token_range_vector
+abstract_replication_strategy::do_get_ranges(inet_address ep, token_metadata& tm, bool can_yield) const {
    dht::token_range_vector ret;
    auto prev_tok = tm.sorted_tokens().back();
    for (auto tok : tm.sorted_tokens()) {
        for (inet_address a : calculate_natural_endpoints(tok, tm)) {
+            if (can_yield) {
+                seastar::thread::maybe_yield();
+            }
            if (a == ep) {
                insert_token_range_to_sorted_container_while_unwrapping(prev_tok, tok, ret);
                break;
--- a/locator/abstract_replication_strategy.hh
+++ b/locator/abstract_replication_strategy.hh
@@ -113,10 +113,15 @@ public:
    // It the analogue of Origin's getAddressRanges().get(endpoint).
    // This function is not efficient, and not meant for the fast path.
    dht::token_range_vector get_ranges(inet_address ep) const;
+    dht::token_range_vector get_ranges_in_thread(inet_address ep) const;

    // Use the token_metadata provided by the caller instead of _token_metadata
    dht::token_range_vector get_ranges(inet_address ep, token_metadata& tm) const;
+    dht::token_range_vector get_ranges_in_thread(inet_address ep, token_metadata& tm) const;
+private:
+    dht::token_range_vector do_get_ranges(inet_address ep, token_metadata& tm, bool can_yield) const;

+public:
    // get_primary_ranges() returns the list of "primary ranges" for the given
    // endpoint. "Primary ranges" are the ranges that the node is responsible
    // for storing replica primarily, which means this is the first node
--- a/locator/token_metadata.cc
+++ b/locator/token_metadata.cc
@@ -1975,9 +1975,7 @@ void topology::add_endpoint(const inet_address& ep)
        if (current->second.dc == dc && current->second.rack == rack) {
            return;
        }
-
-        _dc_racks[current->second.dc][current->second.rack].erase(ep);
-        _dc_endpoints[current->second.dc].erase(ep);
+        remove_endpoint(ep);
    }

    _dc_endpoints[dc].insert(ep);
@@ -2002,7 +2000,14 @@ void topology::remove_endpoint(inet_address ep)
    }

    _dc_endpoints[cur_dc_rack->second.dc].erase(ep);
-    _dc_racks[cur_dc_rack->second.dc][cur_dc_rack->second.rack].erase(ep);
+
+    auto& racks = _dc_racks[cur_dc_rack->second.dc];
+    auto& eps = racks[cur_dc_rack->second.rack];
+    eps.erase(ep);
+    if (eps.empty()) {
+        racks.erase(cur_dc_rack->second.rack);
+    }
+
    _current_locations.erase(cur_dc_rack);
 }

--- a/main.cc
+++ b/main.cc
@@ -821,6 +821,7 @@ int main(int ac, char** av) {
            storage_proxy_smp_service_group_config.max_nonlocal_requests = 5000;
            spcfg.read_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get0();
            spcfg.write_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get0();
+            spcfg.hints_write_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get0();
            spcfg.write_ack_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get0();
            static db::view::node_update_backlog node_backlog(smp::count, 10ms);
            scheduling_group_key_config storage_proxy_stats_cfg =
@@ -949,12 +950,16 @@ int main(int ac, char** av) {
                mm.init_messaging_service();
            }).get();
            supervisor::notify("initializing storage proxy RPC verbs");
-            proxy.invoke_on_all([] (service::storage_proxy& p) {
-                p.init_messaging_service();
-            }).get();
+            proxy.invoke_on_all(&service::storage_proxy::init_messaging_service).get();
+            auto stop_proxy_handlers = defer_verbose_shutdown("storage proxy RPC verbs", [&proxy] {
+                proxy.invoke_on_all(&service::storage_proxy::uninit_messaging_service).get();
+            });

            supervisor::notify("starting streaming service");
            streaming::stream_session::init_streaming_service(db, sys_dist_ks, view_update_generator).get();
+            auto stop_streaming_service = defer_verbose_shutdown("streaming service", [] {
+                streaming::stream_session::uninit_streaming_service().get();
+            });
            api::set_server_stream_manager(ctx).get();

            supervisor::notify("starting hinted handoff manager");
@@ -987,6 +992,9 @@ int main(int ac, char** av) {
                rs.stop().get();
            });
            repair_init_messaging_service_handler(rs, sys_dist_ks, view_update_generator).get();
+            auto stop_repair_messages = defer_verbose_shutdown("repair message handlers", [] {
+                repair_uninit_messaging_service_handler().get();
+            });
            supervisor::notify("starting storage service", true);
            auto& ss = service::get_local_storage_service();
            ss.init_messaging_service_part().get();
--- a/message/messaging_service.cc
+++ b/message/messaging_service.cc
@@ -731,6 +731,10 @@ void messaging_service::register_stream_mutation_fragments(std::function<future<
    register_handler(this, messaging_verb::STREAM_MUTATION_FRAGMENTS, std::move(func));
 }

+future<> messaging_service::unregister_stream_mutation_fragments() {
+    return unregister_handler(messaging_verb::STREAM_MUTATION_FRAGMENTS);
+}
+
 template<class SinkType, class SourceType>
 future<rpc::sink<SinkType>, rpc::source<SourceType>>
 do_make_sink_source(messaging_verb verb, uint32_t repair_meta_id, shared_ptr<messaging_service::rpc_protocol_client_wrapper> rpc_client, std::unique_ptr<messaging_service::rpc_protocol_wrapper>& rpc) {
@@ -762,6 +766,9 @@ rpc::sink<repair_row_on_wire_with_cmd> messaging_service::make_sink_for_repair_g
 void messaging_service::register_repair_get_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_row_on_wire_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_hash_with_cmd> source)>&& func) {
    register_handler(this, messaging_verb::REPAIR_GET_ROW_DIFF_WITH_RPC_STREAM, std::move(func));
 }
+future<> messaging_service::unregister_repair_get_row_diff_with_rpc_stream() {
+    return unregister_handler(messaging_verb::REPAIR_GET_ROW_DIFF_WITH_RPC_STREAM);
+}

 // Wrapper for REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM
 future<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_stream_cmd>>
@@ -781,6 +788,9 @@ rpc::sink<repair_stream_cmd> messaging_service::make_sink_for_repair_put_row_dif
 void messaging_service::register_repair_put_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_stream_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_row_on_wire_with_cmd> source)>&& func) {
    register_handler(this, messaging_verb::REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM, std::move(func));
 }
+future<> messaging_service::unregister_repair_put_row_diff_with_rpc_stream() {
+    return unregister_handler(messaging_verb::REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM);
+}

 // Wrapper for REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM
 future<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd>>
@@ -800,6 +810,9 @@ rpc::sink<repair_hash_with_cmd> messaging_service::make_sink_for_repair_get_full
 void messaging_service::register_repair_get_full_row_hashes_with_rpc_stream(std::function<future<rpc::sink<repair_hash_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_stream_cmd> source)>&& func) {
    register_handler(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM, std::move(func));
 }
+future<> messaging_service::unregister_repair_get_full_row_hashes_with_rpc_stream() {
+    return unregister_handler(messaging_verb::REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM);
+}

 // Send a message for verb
 template <typename MsgIn, typename... MsgOut>
@@ -883,6 +896,9 @@ future<streaming::prepare_message> messaging_service::send_prepare_message(msg_a
    return send_message<streaming::prepare_message>(this, messaging_verb::PREPARE_MESSAGE, id,
        std::move(msg), plan_id, std::move(description), reason);
 }
+future<> messaging_service::unregister_prepare_message() {
+    return unregister_handler(messaging_verb::PREPARE_MESSAGE);
+}

 // PREPARE_DONE_MESSAGE
 void messaging_service::register_prepare_done_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func) {
@@ -892,6 +908,9 @@ future<> messaging_service::send_prepare_done_message(msg_addr id, UUID plan_id,
    return send_message<void>(this, messaging_verb::PREPARE_DONE_MESSAGE, id,
        plan_id, dst_cpu_id);
 }
+future<> messaging_service::unregister_prepare_done_message() {
+    return unregister_handler(messaging_verb::PREPARE_DONE_MESSAGE);
+}

 // STREAM_MUTATION
 void messaging_service::register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool> fragmented, rpc::optional<streaming::stream_reason> reason)>&& func) {
@@ -916,6 +935,9 @@ future<> messaging_service::send_stream_mutation_done(msg_addr id, UUID plan_id,
    return send_message<void>(this, messaging_verb::STREAM_MUTATION_DONE, id,
        plan_id, std::move(ranges), cf_id, dst_cpu_id);
 }
+future<> messaging_service::unregister_stream_mutation_done() {
+    return unregister_handler(messaging_verb::STREAM_MUTATION_DONE);
+}

 // COMPLETE_MESSAGE
 void messaging_service::register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id, rpc::optional<bool> failed)>&& func) {
@@ -925,6 +947,9 @@ future<> messaging_service::send_complete_message(msg_addr id, UUID plan_id, uns
    return send_message<void>(this, messaging_verb::COMPLETE_MESSAGE, id,
        plan_id, dst_cpu_id, failed);
 }
+future<> messaging_service::unregister_complete_message() {
+    return unregister_handler(messaging_verb::COMPLETE_MESSAGE);
+}

 void messaging_service::register_gossip_echo(std::function<future<> ()>&& func) {
    register_handler(this, messaging_verb::GOSSIP_ECHO, std::move(func));
@@ -1139,14 +1164,14 @@ future<partition_checksum> messaging_service::send_repair_checksum_range(
 }

 // Wrapper for REPAIR_GET_FULL_ROW_HASHES
-void messaging_service::register_repair_get_full_row_hashes(std::function<future<std::unordered_set<repair_hash>> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func) {
+void messaging_service::register_repair_get_full_row_hashes(std::function<future<repair_hash_set> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func) {
    register_handler(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(func));
 }
 future<> messaging_service::unregister_repair_get_full_row_hashes() {
    return unregister_handler(messaging_verb::REPAIR_GET_FULL_ROW_HASHES);
 }
-future<std::unordered_set<repair_hash>> messaging_service::send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id) {
-    return send_message<future<std::unordered_set<repair_hash>>>(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(id), repair_meta_id);
+future<repair_hash_set> messaging_service::send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id) {
+    return send_message<future<repair_hash_set>>(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(id), repair_meta_id);
 }

 // Wrapper for REPAIR_GET_COMBINED_ROW_HASH
@@ -1171,13 +1196,13 @@ future<get_sync_boundary_response> messaging_service::send_repair_get_sync_bound
 }

 // Wrapper for REPAIR_GET_ROW_DIFF
-void messaging_service::register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows)>&& func) {
+void messaging_service::register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows)>&& func) {
    register_handler(this, messaging_verb::REPAIR_GET_ROW_DIFF, std::move(func));
 }
 future<> messaging_service::unregister_repair_get_row_diff() {
    return unregister_handler(messaging_verb::REPAIR_GET_ROW_DIFF);
 }
-future<repair_rows_on_wire> messaging_service::send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows) {
+future<repair_rows_on_wire> messaging_service::send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows) {
    return send_message<future<repair_rows_on_wire>>(this, messaging_verb::REPAIR_GET_ROW_DIFF, std::move(id), repair_meta_id, std::move(set_diff), needs_all_rows);
 }

--- a/message/messaging_service.hh
+++ b/message/messaging_service.hh
@@ -276,10 +276,12 @@ public:
            streaming::prepare_message msg, UUID plan_id, sstring description, rpc::optional<streaming::stream_reason> reason)>&& func);
    future<streaming::prepare_message> send_prepare_message(msg_addr id, streaming::prepare_message msg, UUID plan_id,
            sstring description, streaming::stream_reason);
+    future<> unregister_prepare_message();

    // Wrapper for PREPARE_DONE_MESSAGE verb
    void register_prepare_done_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func);
    future<> send_prepare_done_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id);
+    future<> unregister_prepare_done_message();

    // Wrapper for STREAM_MUTATION verb
    void register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool>, rpc::optional<streaming::stream_reason>)>&& func);
@@ -288,6 +290,7 @@ public:
    // Wrapper for STREAM_MUTATION_FRAGMENTS
    // The receiver of STREAM_MUTATION_FRAGMENTS sends status code to the sender to notify any error on the receiver side. The status code is of type int32_t. 0 means successful, -1 means error, other status code value are reserved for future use.
    void register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason> reason_opt, rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>> source)>&& func);
+    future<> unregister_stream_mutation_fragments();
    rpc::sink<int32_t> make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>>& source);
    future<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>> make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id);

@@ -295,22 +298,27 @@ public:
    future<rpc::sink<repair_hash_with_cmd>, rpc::source<repair_row_on_wire_with_cmd>> make_sink_and_source_for_repair_get_row_diff_with_rpc_stream(uint32_t repair_meta_id, msg_addr id);
    rpc::sink<repair_row_on_wire_with_cmd> make_sink_for_repair_get_row_diff_with_rpc_stream(rpc::source<repair_hash_with_cmd>& source);
    void register_repair_get_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_row_on_wire_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_hash_with_cmd> source)>&& func);
+    future<> unregister_repair_get_row_diff_with_rpc_stream();

    // Wrapper for REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM
    future<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_stream_cmd>> make_sink_and_source_for_repair_put_row_diff_with_rpc_stream(uint32_t repair_meta_id, msg_addr id);
    rpc::sink<repair_stream_cmd> make_sink_for_repair_put_row_diff_with_rpc_stream(rpc::source<repair_row_on_wire_with_cmd>& source);
    void register_repair_put_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_stream_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_row_on_wire_with_cmd> source)>&& func);
+    future<> unregister_repair_put_row_diff_with_rpc_stream();

    // Wrapper for REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM
    future<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd>> make_sink_and_source_for_repair_get_full_row_hashes_with_rpc_stream(uint32_t repair_meta_id, msg_addr id);
    rpc::sink<repair_hash_with_cmd> make_sink_for_repair_get_full_row_hashes_with_rpc_stream(rpc::source<repair_stream_cmd>& source);
    void register_repair_get_full_row_hashes_with_rpc_stream(std::function<future<rpc::sink<repair_hash_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_stream_cmd> source)>&& func);
+    future<> unregister_repair_get_full_row_hashes_with_rpc_stream();

    void register_stream_mutation_done(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id)>&& func);
    future<> send_stream_mutation_done(msg_addr id, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id);
+    future<> unregister_stream_mutation_done();

    void register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id, rpc::optional<bool> failed)>&& func);
    future<> send_complete_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id, bool failed = false);
+    future<> unregister_complete_message();

    // Wrapper for REPAIR_CHECKSUM_RANGE verb
    void register_repair_checksum_range(std::function<future<partition_checksum> (sstring keyspace, sstring cf, dht::token_range range, rpc::optional<repair_checksum> hash_version)>&& func);
@@ -318,9 +326,9 @@ public:
    future<partition_checksum> send_repair_checksum_range(msg_addr id, sstring keyspace, sstring cf, dht::token_range range, repair_checksum hash_version);

    // Wrapper for REPAIR_GET_FULL_ROW_HASHES
-    void register_repair_get_full_row_hashes(std::function<future<std::unordered_set<repair_hash>> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func);
+    void register_repair_get_full_row_hashes(std::function<future<repair_hash_set> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func);
    future<> unregister_repair_get_full_row_hashes();
-    future<std::unordered_set<repair_hash>> send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id);
+    future<repair_hash_set> send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id);

    // Wrapper for REPAIR_GET_COMBINED_ROW_HASH
    void register_repair_get_combined_row_hash(std::function<future<get_combined_row_hash_response> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::optional<repair_sync_boundary> common_sync_boundary)>&& func);
@@ -333,9 +341,9 @@ public:
    future<get_sync_boundary_response> send_repair_get_sync_boundary(msg_addr id, uint32_t repair_meta_id, std::optional<repair_sync_boundary> skipped_sync_boundary);

    // Wrapper for REPAIR_GET_ROW_DIFF
-    void register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows)>&& func);
+    void register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows)>&& func);
    future<> unregister_repair_get_row_diff();
-    future<repair_rows_on_wire> send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows);
+    future<repair_rows_on_wire> send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows);

    // Wrapper for REPAIR_PUT_ROW_DIFF
    void register_repair_put_row_diff(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_rows_on_wire row_diff)>&& func);
--- a/multishard_mutation_query.cc
+++ b/multishard_mutation_query.cc
@@ -195,6 +195,7 @@ class read_context : public reader_lifecycle_policy {

    // One for each shard. Index is shard id.
    std::vector<reader_meta> _readers;
+    std::vector<reader_concurrency_semaphore*> _semaphores;

    gate _dismantling_gate;

@@ -211,7 +212,8 @@ public:
            , _schema(std::move(s))
            , _cmd(cmd)
            , _ranges(ranges)
-            , _trace_state(std::move(trace_state)) {
+            , _trace_state(std::move(trace_state))
+            , _semaphores(smp::count, nullptr) {
        _readers.resize(smp::count);
    }

@@ -236,7 +238,12 @@ public:
    virtual void destroy_reader(shard_id shard, future<stopped_reader> reader_fut) noexcept override;

    virtual reader_concurrency_semaphore& semaphore() override {
-        return _readers[this_shard_id()].rparts->semaphore;
+        const auto shard = this_shard_id();
+        if (!_semaphores[shard]) {
+            auto& table = _db.local().find_column_family(_schema);
+            _semaphores[shard] = &table.read_concurrency_semaphore();
+        }
+        return *_semaphores[shard];
    }

    future<> lookup_readers();
--- a/mutation_partition.cc
+++ b/mutation_partition.cc
@@ -734,56 +734,78 @@ void write_counter_cell(RowWriter& w, const query::partition_slice& slice, ::ato
  });
 }

-// Used to return the timestamp of the latest update to the row
-struct max_timestamp {
-    api::timestamp_type max = api::missing_timestamp;
-
-    void update(api::timestamp_type ts) {
-        max = std::max(max, ts);
-    }
-};
-
-template<>
-struct appending_hash<row> {
-    template<typename Hasher>
-    void operator()(Hasher& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const {
-        for (auto id : columns) {
-            const cell_and_hash* cell_and_hash = cells.find_cell_and_hash(id);
-            if (!cell_and_hash) {
-                return;
-            }
-            auto&& def = s.column_at(kind, id);
-            if (def.is_atomic()) {
-                max_ts.update(cell_and_hash->cell.as_atomic_cell(def).timestamp());
-                if constexpr (query::using_hash_of_hash_v<Hasher>) {
-                    if (cell_and_hash->hash) {
-                        feed_hash(h, *cell_and_hash->hash);
-                    } else {
-                        query::default_hasher cellh;
-                        feed_hash(cellh, cell_and_hash->cell.as_atomic_cell(def), def);
-                        feed_hash(h, cellh.finalize_uint64());
-                    }
+template<typename Hasher>
+void appending_hash<row>::operator()(Hasher& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const {
+    for (auto id : columns) {
+        const cell_and_hash* cell_and_hash = cells.find_cell_and_hash(id);
+        if (!cell_and_hash) {
+            feed_hash(h, appending_hash<row>::null_hash_value);
+            continue;
+        }
+        auto&& def = s.column_at(kind, id);
+        if (def.is_atomic()) {
+            max_ts.update(cell_and_hash->cell.as_atomic_cell(def).timestamp());
+            if constexpr (query::using_hash_of_hash_v<Hasher>) {
+                if (cell_and_hash->hash) {
+                    feed_hash(h, *cell_and_hash->hash);
                } else {
-                    feed_hash(h, cell_and_hash->cell.as_atomic_cell(def), def);
+                    query::default_hasher cellh;
+                    feed_hash(cellh, cell_and_hash->cell.as_atomic_cell(def), def);
+                    feed_hash(h, cellh.finalize_uint64());
                }
            } else {
-                auto cm = cell_and_hash->cell.as_collection_mutation();
-                max_ts.update(cm.last_update(*def.type));
-                if constexpr (query::using_hash_of_hash_v<Hasher>) {
-                    if (cell_and_hash->hash) {
-                        feed_hash(h, *cell_and_hash->hash);
-                    } else {
-                        query::default_hasher cellh;
-                        feed_hash(cellh, cm, def);
-                        feed_hash(h, cellh.finalize_uint64());
-                    }
+                feed_hash(h, cell_and_hash->cell.as_atomic_cell(def), def);
+            }
+        } else {
+            auto cm = cell_and_hash->cell.as_collection_mutation();
+            max_ts.update(cm.last_update(*def.type));
+            if constexpr (query::using_hash_of_hash_v<Hasher>) {
+                if (cell_and_hash->hash) {
+                    feed_hash(h, *cell_and_hash->hash);
                } else {
-                    feed_hash(h, cm, def);
+                    query::default_hasher cellh;
+                    feed_hash(cellh, cm, def);
+                    feed_hash(h, cellh.finalize_uint64());
                }
+            } else {
+                feed_hash(h, cm, def);
            }
        }
    }
-};
+}
+// Instantiation for mutation_test.cc
+template void appending_hash<row>::operator()<xx_hasher>(xx_hasher& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const;
+
+template<>
+void appending_hash<row>::operator()<legacy_xx_hasher_without_null_digest>(legacy_xx_hasher_without_null_digest& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const {
+    for (auto id : columns) {
+        const cell_and_hash* cell_and_hash = cells.find_cell_and_hash(id);
+        if (!cell_and_hash) {
+            return;
+        }
+        auto&& def = s.column_at(kind, id);
+        if (def.is_atomic()) {
+            max_ts.update(cell_and_hash->cell.as_atomic_cell(def).timestamp());
+            if (cell_and_hash->hash) {
+                feed_hash(h, *cell_and_hash->hash);
+            } else {
+                query::default_hasher cellh;
+                feed_hash(cellh, cell_and_hash->cell.as_atomic_cell(def), def);
+                feed_hash(h, cellh.finalize_uint64());
+            }
+        } else {
+            auto cm = cell_and_hash->cell.as_collection_mutation();
+            max_ts.update(cm.last_update(*def.type));
+            if (cell_and_hash->hash) {
+                feed_hash(h, *cell_and_hash->hash);
+            } else {
+                query::default_hasher cellh;
+                feed_hash(cellh, cm, def);
+                feed_hash(h, cellh.finalize_uint64());
+            }
+        }
+    }
+}

 cell_hash_opt row::cell_hash_for(column_id id) const {
    if (_type == storage_type::vector) {
@@ -1721,7 +1743,7 @@ void row::apply_monotonically(const schema& s, column_kind kind, row&& other) {
 // we erase the live cells according to the shadowable_tombstone rules.
 static bool dead_marker_shadows_row(const schema& s, column_kind kind, const row_marker& marker) {
    return s.is_view()
-            && !s.view_info()->base_non_pk_columns_in_view_pk().empty()
+            && s.view_info()->has_base_non_pk_columns_in_view_pk()
            && !marker.is_live()
            && kind == column_kind::regular_column; // not applicable to static rows
 }
--- a/mutation_partition.hh
+++ b/mutation_partition.hh
@@ -650,6 +650,22 @@ public:
    };
 };

+// Used to return the timestamp of the latest update to the row
+struct max_timestamp {
+    api::timestamp_type max = api::missing_timestamp;
+
+    void update(api::timestamp_type ts) {
+        max = std::max(max, ts);
+    }
+};
+
+template<>
+struct appending_hash<row> {
+    static constexpr int null_hash_value = 0xbeefcafe;
+    template<typename Hasher>
+    void operator()(Hasher& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const;
+};
+
 class row_marker;
 int compare_row_marker_for_merge(const row_marker& left, const row_marker& right) noexcept;

--- a/mutation_query.hh
+++ b/mutation_query.hh
@@ -113,9 +113,6 @@ class reconcilable_result_builder {
    const schema& _schema;
    const query::partition_slice& _slice;

-    utils::chunked_vector<partition> _result;
-    uint32_t _live_rows{};
-
    bool _return_static_content_on_partition_with_no_rows{};
    bool _static_row_is_alive{};
    uint32_t _total_live_rows = 0;
@@ -123,6 +120,10 @@ class reconcilable_result_builder {
    stop_iteration _stop;
    bool _short_read_allowed;
    std::optional<streamed_mutation_freezer> _mutation_consumer;
+
+    uint32_t _live_rows{};
+    // make this the last member so it is destroyed first. #7240
+    utils::chunked_vector<partition> _result;
 public:
    reconcilable_result_builder(const schema& s, const query::partition_slice& slice,
                                query::result_memory_accounter&& accounter)
--- a/mutation_reader.cc
+++ b/mutation_reader.cc
--- a/mutation_reader.hh
+++ b/mutation_reader.hh
@@ -372,6 +372,64 @@ flat_mutation_reader make_foreign_reader(schema_ptr schema,
        foreign_ptr<std::unique_ptr<flat_mutation_reader>> reader,
        streamed_mutation::forwarding fwd_sm = streamed_mutation::forwarding::no);

+/// Make an auto-paused evictable reader.
+///
+/// The reader is paused after each use, that is after each call to any of its
+/// members that cause actual reading to be done (`fill_buffer()` and
+/// `fast_forward_to()`). When paused, the reader is made evictable, that is,
+/// it is registered with reader concurrency semaphore as an inactive read.
+/// The reader is resumed automatically on the next use. If it was evicted, it
+/// will be recreated at the position it left off reading. This is all
+/// transparent to its user.
+/// Parameters passed by reference have to be kept alive while the reader is
+/// alive.
+flat_mutation_reader make_auto_paused_evictable_reader(
+        mutation_source ms,
+        schema_ptr schema,
+        reader_concurrency_semaphore& semaphore,
+        const dht::partition_range& pr,
+        const query::partition_slice& ps,
+        const io_priority_class& pc,
+        tracing::trace_state_ptr trace_state,
+        mutation_reader::forwarding fwd_mr);
+
+class evictable_reader;
+
+class evictable_reader_handle {
+    friend std::pair<flat_mutation_reader, evictable_reader_handle> make_manually_paused_evictable_reader(mutation_source, schema_ptr, reader_concurrency_semaphore&,
+            const dht::partition_range&, const query::partition_slice&, const io_priority_class&, tracing::trace_state_ptr, mutation_reader::forwarding);
+
+private:
+    evictable_reader* _r;
+
+private:
+    explicit evictable_reader_handle(evictable_reader& r);
+
+public:
+    void pause();
+};
+
+/// Make a manually-paused evictable reader.
+///
+/// The reader can be paused via the evictable reader handle when desired. The
+/// intended usage is subsequent reads done in bursts, after which the reader is
+/// not used for some time. When paused, the reader is made evictable, that is,
+/// it is registered with reader concurrency semaphore as an inactive read.
+/// The reader is resumed automatically on the next use. If it was evicted, it
+/// will be recreated at the position it left off reading. This is all
+/// transparent to its user.
+/// Parameters passed by reference have to be kept alive while the reader is
+/// alive.
+std::pair<flat_mutation_reader, evictable_reader_handle> make_manually_paused_evictable_reader(
+        mutation_source ms,
+        schema_ptr schema,
+        reader_concurrency_semaphore& semaphore,
+        const dht::partition_range& pr,
+        const query::partition_slice& ps,
+        const io_priority_class& pc,
+        tracing::trace_state_ptr trace_state,
+        mutation_reader::forwarding fwd_mr);
+
 /// Reader lifecycle policy for the mulitshard combining reader.
 ///
 /// This policy is expected to make sure any additional resource the readers
--- a/mutation_writer/feed_writers.hh
+++ b/mutation_writer/feed_writers.hh
@@ -38,8 +38,14 @@ future<> feed_writer(flat_mutation_reader&& rd, Writer&& wr) {
                auto f2 = rd.is_buffer_empty() ? rd.fill_buffer(db::no_timeout) : make_ready_future<>();
                return when_all_succeed(std::move(f1), std::move(f2));
            });
-        }).finally([&wr] {
-            return wr.consume_end_of_stream();
+        }).then_wrapped([&wr] (future<> f) {
+            if (f.failed()) {
+                auto ex = f.get_exception();
+                wr.abort(ex);
+                return make_exception_future<>(ex);
+            } else {
+                return wr.consume_end_of_stream();
+            }
        });
    });
 }
--- a/mutation_writer/shard_based_splitting_writer.cc
+++ b/mutation_writer/shard_based_splitting_writer.cc
@@ -57,6 +57,9 @@ class shard_based_splitting_mutation_writer {
            }
            return std::move(_consume_fut);
        }
+        void abort(std::exception_ptr ep) {
+            _handle.abort(ep);
+        }
    };

 private:
@@ -108,6 +111,13 @@ public:
            return shard->consume_end_of_stream();
        });
    }
+    void abort(std::exception_ptr ep) {
+        for (auto&& shard : _shards) {
+            if (shard) {
+                shard->abort(ep);
+            }
+        }
+    }
 };

 future<> segregate_by_shard(flat_mutation_reader producer, reader_consumer consumer) {
--- a/mutation_writer/timestamp_based_splitting_writer.cc
+++ b/mutation_writer/timestamp_based_splitting_writer.cc
@@ -144,6 +144,9 @@ class timestamp_based_splitting_mutation_writer {
            }
            return std::move(_consume_fut);
        }
+        void abort(std::exception_ptr ep) {
+            _handle.abort(ep);
+        }
    };

 private:
@@ -186,6 +189,11 @@ public:
            return bucket.second.consume_end_of_stream();
        });
    }
+    void abort(std::exception_ptr ep) {
+        for (auto&& b : _buckets) {
+            b.second.abort(ep);
+        }
+    }
 };

 future<> timestamp_based_splitting_mutation_writer::write_to_bucket(bucket_id bucket, mutation_fragment&& mf) {
--- a/partition_version.cc
+++ b/partition_version.cc
@@ -650,12 +650,12 @@ partition_snapshot_ptr partition_entry::read(logalloc::region& r,
    return partition_snapshot_ptr(std::move(snp));
 }

-std::vector<range_tombstone>
+partition_snapshot::range_tombstone_result
 partition_snapshot::range_tombstones(position_in_partition_view start, position_in_partition_view end)
 {
    partition_version* v = &*version();
    if (!v->next()) {
-        return boost::copy_range<std::vector<range_tombstone>>(
+        return boost::copy_range<range_tombstone_result>(
            v->partition().row_tombstones().slice(*_schema, start, end));
    }
    range_tombstone_list list(*_schema);
@@ -665,10 +665,10 @@ partition_snapshot::range_tombstones(position_in_partition_view start, position_
        }
        v = v->next();
    }
-    return boost::copy_range<std::vector<range_tombstone>>(list.slice(*_schema, start, end));
+    return boost::copy_range<range_tombstone_result>(list.slice(*_schema, start, end));
 }

-std::vector<range_tombstone>
+partition_snapshot::range_tombstone_result
 partition_snapshot::range_tombstones()
 {
    return range_tombstones(
--- a/partition_version.hh
+++ b/partition_version.hh
@@ -26,6 +26,7 @@
 #include "utils/anchorless_list.hh"
 #include "utils/logalloc.hh"
 #include "utils/coroutine.hh"
+#include "utils/chunked_vector.hh"

 #include <boost/intrusive/parent_from_member.hpp>
 #include <boost/intrusive/slist.hpp>
@@ -400,10 +401,13 @@ public:
    ::static_row static_row(bool digest_requested) const;
    bool static_row_continuous() const;
    mutation_partition squashed() const;
+
+    using range_tombstone_result = utils::chunked_vector<range_tombstone>;
+
    // Returns range tombstones overlapping with [start, end)
-    std::vector<range_tombstone> range_tombstones(position_in_partition_view start, position_in_partition_view end);
+    range_tombstone_result range_tombstones(position_in_partition_view start, position_in_partition_view end);
    // Returns all range tombstones
-    std::vector<range_tombstone> range_tombstones();
+    range_tombstone_result range_tombstones();
 };

 class partition_snapshot_ptr {
--- a/position_in_partition.hh
+++ b/position_in_partition.hh
@@ -163,6 +163,11 @@ public:
        return {partition_region::clustered, bound_weight::before_all_prefixed, &ck};
    }

+    // Returns a view to before_key(pos._ck) if pos.is_clustering_row() else returns pos as-is.
+    static position_in_partition_view before_key(position_in_partition_view pos) {
+        return {partition_region::clustered, pos._bound_weight == bound_weight::equal ? bound_weight::before_all_prefixed : pos._bound_weight, pos._ck};
+    }
+
    partition_region region() const { return _type; }
    bound_weight get_bound_weight() const { return _bound_weight; }
    bool is_partition_start() const { return _type == partition_region::partition_start; }
--- a/reader_concurrency_semaphore.cc
+++ b/reader_concurrency_semaphore.cc
@@ -27,6 +27,7 @@


 reader_permit::impl::impl(reader_concurrency_semaphore& semaphore, reader_resources base_cost) : semaphore(semaphore), base_cost(base_cost) {
+    semaphore.consume(base_cost);
 }

 reader_permit::impl::~impl() {
@@ -88,7 +89,6 @@ void reader_concurrency_semaphore::signal(const resources& r) noexcept {
    _resources += r;
    while (!_wait_list.empty() && has_available_units(_wait_list.front().res)) {
        auto& x = _wait_list.front();
-        _resources -= x.res;
        try {
            x.pr.set_value(reader_permit(*this, x.res));
        } catch (...) {
@@ -160,7 +160,6 @@ future<reader_permit> reader_concurrency_semaphore::wait_admission(size_t memory
        --_inactive_read_stats.population;
    }
    if (may_proceed(r)) {
-        _resources -= r;
        return make_ready_future<reader_permit>(reader_permit(*this, r));
    }
    promise<reader_permit> pr;
@@ -170,7 +169,6 @@ future<reader_permit> reader_concurrency_semaphore::wait_admission(size_t memory
 }

 reader_permit reader_concurrency_semaphore::consume_resources(resources r) {
-    _resources -= r;
    return reader_permit(*this, r);
 }

--- a/reader_concurrency_semaphore.hh
+++ b/reader_concurrency_semaphore.hh
@@ -128,6 +128,10 @@ private:
        return has_available_units(r) && _wait_list.empty();
    }

+    void consume(resources r) {
+        _resources -= r;
+    }
+
    void consume_memory(size_t memory) {
        _resources.memory -= memory;
    }
--- a/redis/commands.cc
+++ b/redis/commands.cc
@@ -63,7 +63,7 @@ shared_ptr<abstract_command> exists::prepare(service::storage_proxy& proxy, requ
 }

 future<redis_message> exists::execute(service::storage_proxy& proxy, redis::redis_options& options, service_permit permit) {
-    return seastar::do_for_each(_keys, [&proxy, &options, &permit, this] (bytes key) {
+    return seastar::do_for_each(_keys, [&proxy, &options, permit, this] (bytes& key) {
        return redis::read_strings(proxy, options, key, permit).then([this] (lw_shared_ptr<strings_result> result) {
            if (result->has_result()) {
                _count++;
--- a/repair/repair.hh
+++ b/repair/repair.hh
@@ -23,6 +23,7 @@

 #include <unordered_map>
 #include <exception>
+#include <absl/container/btree_set.h>

 #include <seastar/core/sstring.hh>
 #include <seastar/core/sharded.hh>
@@ -334,6 +335,8 @@ public:
    }
 };

+using repair_hash_set = absl::btree_set<repair_hash>;
+
 // Return value of the REPAIR_GET_SYNC_BOUNDARY RPC verb
 struct get_sync_boundary_response {
    std::optional<repair_sync_boundary> boundary;
--- a/repair/row_level.cc
+++ b/repair/row_level.cc
@@ -47,6 +47,7 @@
 #include "gms/gossiper.hh"
 #include "repair/row_level.hh"
 #include "mutation_source_metadata.hh"
+#include "utils/stall_free.hh"

 extern logging::logger rlogger;

@@ -372,6 +373,7 @@ private:
    std::optional<utils::phased_barrier::operation> _local_read_op;
    // Local reader or multishard reader to read the range
    flat_mutation_reader _reader;
+    std::optional<evictable_reader_handle> _reader_handle;
    // Current partition read from disk
    lw_shared_ptr<const decorated_key_with_hash> _current_dk;

@@ -390,32 +392,49 @@ public:
            , _sharder(remote_sharder, range, remote_shard)
            , _seed(seed)
            , _local_read_op(local_reader ? std::optional(cf.read_in_progress()) : std::nullopt)
-            , _reader(make_reader(db, cf, local_reader)) {
-    }
-
-private:
-    flat_mutation_reader
-    make_reader(seastar::sharded<database>& db,
-            column_family& cf,
-            is_local_reader local_reader) {
+            , _reader(nullptr) {
        if (local_reader) {
-            return cf.make_streaming_reader(_schema, _range);
+            auto ms = mutation_source([&cf] (
+                        schema_ptr s,
+                        reader_permit,
+                        const dht::partition_range& pr,
+                        const query::partition_slice& ps,
+                        const io_priority_class& pc,
+                        tracing::trace_state_ptr,
+                        streamed_mutation::forwarding,
+                        mutation_reader::forwarding fwd_mr) {
+                return cf.make_streaming_reader(std::move(s), pr, ps, fwd_mr);
+            });
+            std::tie(_reader, _reader_handle) = make_manually_paused_evictable_reader(
+                    std::move(ms),
+                    _schema,
+                    cf.streaming_read_concurrency_semaphore(),
+                    _range,
+                    _schema->full_slice(),
+                    service::get_local_streaming_read_priority(),
+                    {},
+                    mutation_reader::forwarding::no);
+        } else {
+            _reader = make_multishard_streaming_reader(db, _schema, [this] {
+                auto shard_range = _sharder.next();
+                if (shard_range) {
+                    return std::optional<dht::partition_range>(dht::to_partition_range(*shard_range));
+                }
+                return std::optional<dht::partition_range>();
+            });
        }
-        return make_multishard_streaming_reader(db, _schema, [this] {
-            auto shard_range = _sharder.next();
-            if (shard_range) {
-                return std::optional<dht::partition_range>(dht::to_partition_range(*shard_range));
-            }
-            return std::optional<dht::partition_range>();
-        });
    }

-public:
    future<mutation_fragment_opt>
    read_mutation_fragment() {
        return _reader(db::no_timeout);
    }

+    void on_end_of_stream() { // Replaces the reader with an empty one; invoked on end of stream and on read failure.
+        _reader = make_empty_flat_reader(_schema); // the previous reader is overwritten here
+        _reader_handle.reset(); // empty the optional handle so pause() becomes a no-op
+    }
+
    lw_shared_ptr<const decorated_key_with_hash>& get_current_dk() {
        return _current_dk;
    }
@@ -434,9 +453,14 @@ public:
        }
    }

+    void pause() { // Pauses the reader through its handle, when a handle exists.
+        if (_reader_handle) { // empty if no handle was ever assigned, or after on_end_of_stream()
+            _reader_handle->pause();
+        }
+    }
 };

-class repair_writer {
+class repair_writer : public enable_lw_shared_from_this<repair_writer> {
    schema_ptr _schema;
    uint64_t _estimated_partitions;
    size_t _nr_peer_nodes;
@@ -495,9 +519,10 @@ public:
            return _mq[node_idx]->pop_eventually();
        };
        table& t = db.local().find_column_family(_schema->id());
+        auto writer = shared_from_this();
        _writer_done[node_idx] = mutation_writer::distribute_reader_and_consume_on_shards(_schema,
                make_generating_reader(_schema, std::move(get_next_mutation_fragment)),
-                [&db, reason = this->_reason, estimated_partitions = this->_estimated_partitions] (flat_mutation_reader reader) {
+                [&db, reason = this->_reason, estimated_partitions = this->_estimated_partitions, writer] (flat_mutation_reader reader) {
            auto& t = db.local().find_column_family(reader.schema());
            return db::view::check_needs_view_update_path(_sys_dist_ks->local(), t, reason).then([t = t.shared_from_this(), estimated_partitions, reader = std::move(reader)] (bool use_view_update_path) mutable {
                //FIXME: for better estimations this should be transmitted from remote
@@ -525,13 +550,13 @@ public:
                return consumer(std::move(reader));
            });
        },
-        t.stream_in_progress()).then([this, node_idx] (uint64_t partitions) {
+        t.stream_in_progress()).then([node_idx, writer] (uint64_t partitions) {
            rlogger.debug("repair_writer: keyspace={}, table={}, managed to write partitions={} to sstable",
-                _schema->ks_name(), _schema->cf_name(), partitions);
-        }).handle_exception([this, node_idx] (std::exception_ptr ep) {
+                writer->_schema->ks_name(), writer->_schema->cf_name(), partitions);
+        }).handle_exception([node_idx, writer] (std::exception_ptr ep) {
            rlogger.warn("repair_writer: keyspace={}, table={}, multishard_writer failed: {}",
-                    _schema->ks_name(), _schema->cf_name(), ep);
-            _mq[node_idx]->abort(ep);
+                    writer->_schema->ks_name(), writer->_schema->cf_name(), ep);
+            writer->_mq[node_idx]->abort(ep);
            return make_exception_future<>(std::move(ep));
        });
    }
@@ -635,7 +660,7 @@ private:
    size_t _nr_peer_nodes= 1;
    repair_stats _stats;
    repair_reader _repair_reader;
-    repair_writer _repair_writer;
+    lw_shared_ptr<repair_writer> _repair_writer;
    // Contains rows read from disk
    std::list<repair_row> _row_buf;
    // Contains rows we are working on to sync between peers
@@ -647,7 +672,7 @@ private:
    // Tracks current sync boundary
    std::optional<repair_sync_boundary> _current_sync_boundary;
    // Contains the hashes of rows in the _working_row_buffor for all peer nodes
-    std::vector<std::unordered_set<repair_hash>> _peer_row_hash_sets;
+    std::vector<repair_hash_set> _peer_row_hash_sets;
    // Gate used to make sure pending operation of meta data is done
    seastar::gate _gate;
    sink_source_for_get_full_row_hashes _sink_source_for_get_full_row_hashes;
@@ -716,7 +741,7 @@ public:
                    _seed,
                    repair_reader::is_local_reader(_repair_master || _same_sharding_config)
              )
-            , _repair_writer(_schema, _estimated_partitions, _nr_peer_nodes, _reason)
+            , _repair_writer(make_lw_shared<repair_writer>(_schema, _estimated_partitions, _nr_peer_nodes, _reason))
            , _sink_source_for_get_full_row_hashes(_repair_meta_id, _nr_peer_nodes,
                    [] (uint32_t repair_meta_id, netw::messaging_service::msg_addr addr) {
                        return netw::get_local_messaging_service().make_sink_and_source_for_repair_get_full_row_hashes_with_rpc_stream(repair_meta_id, addr);
@@ -735,11 +760,12 @@ public:
 public:
    future<> stop() {
        auto gate_future = _gate.close();
-        auto writer_future = _repair_writer.wait_for_writer_done();
        auto f1 = _sink_source_for_get_full_row_hashes.close();
        auto f2 = _sink_source_for_get_row_diff.close();
        auto f3 = _sink_source_for_put_row_diff.close();
-        return when_all_succeed(std::move(gate_future), std::move(writer_future), std::move(f1), std::move(f2), std::move(f3));
+        return when_all_succeed(std::move(gate_future), std::move(f1), std::move(f2), std::move(f3)).finally([this] { // finally: runs whether or not the closes above succeeded
+            return _repair_writer->wait_for_writer_done(); // writer is waited for only after the gate and sink/source closes have resolved
+        });
    }

    static std::unordered_map<node_repair_meta_id, lw_shared_ptr<repair_meta>>& repair_meta_map() {
@@ -867,9 +893,9 @@ public:
    }

    // Must run inside a seastar thread
-    static std::unordered_set<repair_hash>
-    get_set_diff(const std::unordered_set<repair_hash>& x, const std::unordered_set<repair_hash>& y) {
-        std::unordered_set<repair_hash> set_diff;
+    static repair_hash_set
+    get_set_diff(const repair_hash_set& x, const repair_hash_set& y) {
+        repair_hash_set set_diff;
        // Note std::set_difference needs x and y are sorted.
        std::copy_if(x.begin(), x.end(), std::inserter(set_diff, set_diff.end()),
                [&y] (auto& item) { thread::maybe_yield(); return y.find(item) == y.end(); });
@@ -887,14 +913,14 @@ public:

    }

-    std::unordered_set<repair_hash>& peer_row_hash_sets(unsigned node_idx) {
+    repair_hash_set& peer_row_hash_sets(unsigned node_idx) { // Mutable reference to the hash set stored for peer `node_idx`; the index is not range-checked.
        return _peer_row_hash_sets[node_idx];
    }

    // Get a list of row hashes in _working_row_buf
-    future<std::unordered_set<repair_hash>>
+    future<repair_hash_set>
    working_row_hashes() {
-        return do_with(std::unordered_set<repair_hash>(), [this] (std::unordered_set<repair_hash>& hashes) {
+        return do_with(repair_hash_set(), [this] (repair_hash_set& hashes) {
            return do_for_each(_working_row_buf, [&hashes] (repair_row& r) {
                hashes.emplace(r.hash());
            }).then([&hashes] {
@@ -1019,11 +1045,7 @@ private:
        return repair_hash(h.finalize_uint64());
    }

-    stop_iteration handle_mutation_fragment(mutation_fragment_opt mfopt, size_t& cur_size, size_t& new_rows_size, std::list<repair_row>& cur_rows) {
-        if (!mfopt) {
-            return stop_iteration::yes;
-        }
-        mutation_fragment& mf = *mfopt;
+    stop_iteration handle_mutation_fragment(mutation_fragment& mf, size_t& cur_size, size_t& new_rows_size, std::list<repair_row>& cur_rows) {
        if (mf.is_partition_start()) {
            auto& start = mf.as_partition_start();
            _repair_reader.set_current_dk(start.key());
@@ -1058,32 +1080,49 @@ private:
                }
                _gate.check();
                return _repair_reader.read_mutation_fragment().then([this, &cur_size, &new_rows_size, &cur_rows] (mutation_fragment_opt mfopt) mutable {
-                    return handle_mutation_fragment(std::move(mfopt), cur_size, new_rows_size, cur_rows);
+                    if (!mfopt) {
+                        _repair_reader.on_end_of_stream();
+                        return stop_iteration::yes;
+                    }
+                    return handle_mutation_fragment(*mfopt, cur_size, new_rows_size, cur_rows);
                });
-            }).then([&cur_rows, &new_rows_size] () mutable {
+            }).then_wrapped([this, &cur_rows, &new_rows_size] (future<> fut) mutable {
+                if (fut.failed()) {
+                    _repair_reader.on_end_of_stream();
+                    return make_exception_future<std::list<repair_row>, size_t>(fut.get_exception());
+                }
+                _repair_reader.pause();
                return make_ready_future<std::list<repair_row>, size_t>(std::move(cur_rows), new_rows_size);
            });
        });
    }

+    future<> clear_row_buf() { // Empties _row_buf asynchronously; the caller must wait on the returned future.
+        return utils::clear_gently(_row_buf); // NOTE(review): presumably clears in small steps to avoid stalls (helper comes from utils/stall_free.hh) — confirm
+    }
+
+    future<> clear_working_row_buf() { // Empties _working_row_buf, then resets the combined hash kept for it.
+        return utils::clear_gently(_working_row_buf).then([this] {
+            _working_row_buf_combined_hash.clear(); // runs only once the buffer clear has completed without error
+        });
+    }
+
    // Read rows from disk until _max_row_buf_size of rows are filled into _row_buf.
    // Calculate the combined checksum of the rows
    // Calculate the total size of the rows in _row_buf
    future<get_sync_boundary_response>
    get_sync_boundary(std::optional<repair_sync_boundary> skipped_sync_boundary) {
+        auto f = make_ready_future<>();
        if (skipped_sync_boundary) {
            _current_sync_boundary = skipped_sync_boundary;
-            _row_buf.clear();
-            _working_row_buf.clear();
-            _working_row_buf_combined_hash.clear();
-        } else {
-            _working_row_buf.clear();
-            _working_row_buf_combined_hash.clear();
+            f = clear_row_buf();
        }
        // Here is the place we update _last_sync_boundary
        rlogger.trace("SET _last_sync_boundary from {} to {}", _last_sync_boundary, _current_sync_boundary);
        _last_sync_boundary = _current_sync_boundary;
-        return row_buf_size().then([this, sb = std::move(skipped_sync_boundary)] (size_t cur_size) {
+      return f.then([this, sb = std::move(skipped_sync_boundary)] () mutable {
+       return clear_working_row_buf().then([this, sb = sb] () mutable {
+        return row_buf_size().then([this, sb = std::move(sb)] (size_t cur_size) {
            return read_rows_from_disk(cur_size).then([this, sb = std::move(sb)] (std::list<repair_row> new_rows, size_t new_rows_size) mutable {
                size_t new_rows_nr = new_rows.size();
                _row_buf.splice(_row_buf.end(), new_rows);
@@ -1100,6 +1139,8 @@ private:
                });
            });
        });
+       });
+      });
    }

    future<> move_row_buf_to_working_row_buf() {
@@ -1175,9 +1216,9 @@ private:
    }

    future<std::list<repair_row>>
-    copy_rows_from_working_row_buf_within_set_diff(std::unordered_set<repair_hash> set_diff) {
+    copy_rows_from_working_row_buf_within_set_diff(repair_hash_set set_diff) {
        return do_with(std::list<repair_row>(), std::move(set_diff),
-                [this] (std::list<repair_row>& rows, std::unordered_set<repair_hash>& set_diff) {
+                [this] (std::list<repair_row>& rows, repair_hash_set& set_diff) {
            return do_for_each(_working_row_buf, [this, &set_diff, &rows] (const repair_row& r) {
                if (set_diff.count(r.hash()) > 0) {
                    rows.push_back(r);
@@ -1192,7 +1233,7 @@ private:
    // Give a set of row hashes, return the corresponding rows
    // If needs_all_rows is set, return all the rows in _working_row_buf, ignore the set_diff
    future<std::list<repair_row>>
-    get_row_diff(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows = needs_all_rows_t::no) {
+    get_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows = needs_all_rows_t::no) {
        if (needs_all_rows) {
            if (!_repair_master || _nr_peer_nodes == 1) {
                return make_ready_future<std::list<repair_row>>(std::move(_working_row_buf));
@@ -1203,19 +1244,28 @@ private:
        }
    }

-    future<> do_apply_rows(std::list<repair_row>& row_diff, unsigned node_idx, update_working_row_buf update_buf) {
-        return with_semaphore(_repair_writer.sem(), 1, [this, node_idx, update_buf, &row_diff] {
-            _repair_writer.create_writer(_db, node_idx);
-            return do_for_each(row_diff, [this, node_idx, update_buf] (repair_row& r) {
-                if (update_buf) {
-                    _working_row_buf_combined_hash.add(r.hash());
-                }
-                // The repair_row here is supposed to have
-                // mutation_fragment attached because we have stored it in
-                // to_repair_rows_list above where the repair_row is created.
-                mutation_fragment mf = std::move(r.get_mutation_fragment());
-                auto dk_with_hash = r.get_dk_with_hash();
-                return _repair_writer.do_write(node_idx, std::move(dk_with_hash), std::move(mf));
+    future<> do_apply_rows(std::list<repair_row>&& row_diff, unsigned node_idx, update_working_row_buf update_buf) { // Writes every row of row_diff through the repair writer for node_idx; takes ownership of the list.
+        return do_with(std::move(row_diff), [this, node_idx, update_buf] (std::list<repair_row>& row_diff) { // do_with keeps the moved-in list alive until the returned future resolves
+            return with_semaphore(_repair_writer->sem(), 1, [this, node_idx, update_buf, &row_diff] { // hold one unit of the writer's semaphore for the whole batch
+                _repair_writer->create_writer(_db, node_idx);
+                return repeat([this, node_idx, update_buf, &row_diff] () mutable { // consume the list front to back, one row per iteration
+                    if (row_diff.empty()) {
+                        return make_ready_future<stop_iteration>(stop_iteration::yes);
+                    }
+                    repair_row& r = row_diff.front();
+                    if (update_buf) { // fold the row's hash into the working buffer's combined hash when requested
+                        _working_row_buf_combined_hash.add(r.hash());
+                    }
+                    // The repair_row here is supposed to have
+                    // mutation_fragment attached because we have stored it in
+                    // to_repair_rows_list above where the repair_row is created.
+                    mutation_fragment mf = std::move(r.get_mutation_fragment());
+                    auto dk_with_hash = r.get_dk_with_hash();
+                    return _repair_writer->do_write(node_idx, std::move(dk_with_hash), std::move(mf)).then([&row_diff] {
+                        row_diff.pop_front(); // drop the row only after its write completed, so the list shrinks as rows are written
+                        return make_ready_future<stop_iteration>(stop_iteration::no);
+                    });
+                });
            });
        });
    }
@@ -1233,19 +1283,17 @@ private:
        stats().rx_row_nr += row_diff.size();
        stats().rx_row_nr_peer[from] += row_diff.size();
        if (update_buf) {
-            std::list<repair_row> tmp;
-            tmp.swap(_working_row_buf);
            // Both row_diff and _working_row_buf and are ordered, merging
            // two sored list to make sure the combination of row_diff
            // and _working_row_buf are ordered.
-            std::merge(tmp.begin(), tmp.end(), row_diff.begin(), row_diff.end(), std::back_inserter(_working_row_buf),
-                [this] (const repair_row& x, const repair_row& y) { thread::maybe_yield(); return _cmp(x.boundary(), y.boundary()) < 0; });
+            utils::merge_to_gently(_working_row_buf, row_diff,
+                 [this] (const repair_row& x, const repair_row& y) { return _cmp(x.boundary(), y.boundary()) < 0; });
        }
        if (update_hash_set) {
-            _peer_row_hash_sets[node_idx] = boost::copy_range<std::unordered_set<repair_hash>>(row_diff |
+            _peer_row_hash_sets[node_idx] = boost::copy_range<repair_hash_set>(row_diff |
                    boost::adaptors::transformed([] (repair_row& r) { thread::maybe_yield(); return r.hash(); }));
        }
-        do_apply_rows(row_diff, node_idx, update_buf).get();
+        do_apply_rows(std::move(row_diff), node_idx, update_buf).get();
    }

    future<>
@@ -1253,11 +1301,9 @@ private:
        if (rows.empty()) {
            return make_ready_future<>();
        }
-        return to_repair_rows_list(rows).then([this] (std::list<repair_row> row_diff) {
-            return do_with(std::move(row_diff), [this] (std::list<repair_row>& row_diff) {
-                unsigned node_idx = 0;
-                return do_apply_rows(row_diff, node_idx, update_working_row_buf::no);
-            });
+        return to_repair_rows_list(std::move(rows)).then([this] (std::list<repair_row> row_diff) {
+            unsigned node_idx = 0;
+            return do_apply_rows(std::move(row_diff), node_idx, update_working_row_buf::no);
        });
    }

@@ -1336,13 +1382,13 @@ private:
 public:
    // RPC API
    // Return the hashes of the rows in _working_row_buf
-    future<std::unordered_set<repair_hash>>
+    future<repair_hash_set>
    get_full_row_hashes(gms::inet_address remote_node) {
        if (remote_node == _myip) {
            return get_full_row_hashes_handler();
        }
        return netw::get_local_messaging_service().send_repair_get_full_row_hashes(msg_addr(remote_node),
-                _repair_meta_id).then([this, remote_node] (std::unordered_set<repair_hash> hashes) {
+                _repair_meta_id).then([this, remote_node] (repair_hash_set hashes) {
            rlogger.debug("Got full hashes from peer={}, nr_hashes={}", remote_node, hashes.size());
            _metrics.rx_hashes_nr += hashes.size();
            stats().rx_hashes_nr += hashes.size();
@@ -1353,7 +1399,7 @@ public:

 private:
    future<> get_full_row_hashes_source_op(
-            lw_shared_ptr<std::unordered_set<repair_hash>> current_hashes,
+            lw_shared_ptr<repair_hash_set> current_hashes,
            gms::inet_address remote_node,
            unsigned node_idx,
            rpc::source<repair_hash_with_cmd>& source) {
@@ -1391,12 +1437,12 @@ private:
    }

 public:
-    future<std::unordered_set<repair_hash>>
+    future<repair_hash_set>
    get_full_row_hashes_with_rpc_stream(gms::inet_address remote_node, unsigned node_idx) {
        if (remote_node == _myip) {
            return get_full_row_hashes_handler();
        }
-        auto current_hashes = make_lw_shared<std::unordered_set<repair_hash>>();
+        auto current_hashes = make_lw_shared<repair_hash_set>();
        return _sink_source_for_get_full_row_hashes.get_sink_source(remote_node, node_idx).then(
                [this, current_hashes, remote_node, node_idx]
                (rpc::sink<repair_stream_cmd>& sink, rpc::source<repair_hash_with_cmd>& source) mutable {
@@ -1411,7 +1457,7 @@ public:
    }

    // RPC handler
-    future<std::unordered_set<repair_hash>>
+    future<repair_hash_set>
    get_full_row_hashes_handler() {
        return with_gate(_gate, [this] {
            return working_row_hashes();
@@ -1551,7 +1597,7 @@ public:
    // RPC API
    // Return rows in the _working_row_buf with hash within the given sef_diff
    // Must run inside a seastar thread
-    void get_row_diff(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node, unsigned node_idx) {
+    void get_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node, unsigned node_idx) {
        if (needs_all_rows || !set_diff.empty()) {
            if (remote_node == _myip) {
                return;
@@ -1620,11 +1666,11 @@ private:
    }

    future<> get_row_diff_sink_op(
-            std::unordered_set<repair_hash> set_diff,
+            repair_hash_set set_diff,
            needs_all_rows_t needs_all_rows,
            rpc::sink<repair_hash_with_cmd>& sink,
            gms::inet_address remote_node) {
-        return do_with(std::move(set_diff), [needs_all_rows, remote_node, &sink] (std::unordered_set<repair_hash>& set_diff) mutable {
+        return do_with(std::move(set_diff), [needs_all_rows, remote_node, &sink] (repair_hash_set& set_diff) mutable {
            if (inject_rpc_stream_error) {
                return make_exception_future<>(std::runtime_error("get_row_diff: Inject sender error in sink loop"));
            }
@@ -1651,7 +1697,7 @@ private:
 public:
    // Must run inside a seastar thread
    void get_row_diff_with_rpc_stream(
-            std::unordered_set<repair_hash> set_diff,
+            repair_hash_set set_diff,
            needs_all_rows_t needs_all_rows,
            update_peer_row_hash_sets update_hash_set,
            gms::inet_address remote_node,
@@ -1677,7 +1723,7 @@ public:
    }

    // RPC handler
-    future<repair_rows_on_wire> get_row_diff_handler(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows) {
+    future<repair_rows_on_wire> get_row_diff_handler(repair_hash_set set_diff, needs_all_rows_t needs_all_rows) {
        return with_gate(_gate, [this, set_diff = std::move(set_diff), needs_all_rows] () mutable {
            return get_row_diff(std::move(set_diff), needs_all_rows).then([this] (std::list<repair_row> row_diff) {
                return to_repair_rows_on_wire(std::move(row_diff));
@@ -1687,12 +1733,12 @@ public:

    // RPC API
    // Send rows in the _working_row_buf with hash within the given sef_diff
-    future<> put_row_diff(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node) {
+    future<> put_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node) {
        if (!set_diff.empty()) {
            if (remote_node == _myip) {
                return make_ready_future<>();
            }
-            auto sz = set_diff.size();
+            size_t sz = set_diff.size();
            return get_row_diff(std::move(set_diff), needs_all_rows).then([this, remote_node, sz] (std::list<repair_row> row_diff) {
                if (row_diff.size() != sz) {
                    rlogger.warn("Hash conflict detected, keyspace={}, table={}, range={}, row_diff.size={}, set_diff.size={}. It is recommended to compact the table and rerun repair for the range.",
@@ -1763,14 +1809,14 @@ private:

 public:
    future<> put_row_diff_with_rpc_stream(
-            std::unordered_set<repair_hash> set_diff,
+            repair_hash_set set_diff,
            needs_all_rows_t needs_all_rows,
            gms::inet_address remote_node, unsigned node_idx) {
        if (!set_diff.empty()) {
            if (remote_node == _myip) {
                return make_ready_future<>();
            }
-            auto sz = set_diff.size();
+            size_t sz = set_diff.size();
            return get_row_diff(std::move(set_diff), needs_all_rows).then([this, remote_node, node_idx, sz] (std::list<repair_row> row_diff) {
                if (row_diff.size() != sz) {
                    rlogger.warn("Hash conflict detected, keyspace={}, table={}, range={}, row_diff.size={}, set_diff.size={}. It is recommended to compact the table and rerun repair for the range.",
@@ -1813,7 +1859,7 @@ static future<stop_iteration> repair_get_row_diff_with_rpc_stream_process_op(
        rpc::sink<repair_row_on_wire_with_cmd> sink,
        rpc::source<repair_hash_with_cmd> source,
        bool &error,
-        std::unordered_set<repair_hash>& current_set_diff,
+        repair_hash_set& current_set_diff,
        std::optional<std::tuple<repair_hash_with_cmd>> hash_cmd_opt) {
    repair_hash_with_cmd hash_cmd = std::get<0>(hash_cmd_opt.value());
    rlogger.trace("Got repair_hash_with_cmd from peer={}, hash={}, cmd={}", from, hash_cmd.hash, int(hash_cmd.cmd));
@@ -1826,7 +1872,7 @@ static future<stop_iteration> repair_get_row_diff_with_rpc_stream_process_op(
        }
        bool needs_all_rows = hash_cmd.cmd == repair_stream_cmd::needs_all_rows;
        _metrics.rx_hashes_nr += current_set_diff.size();
-        auto fp = make_foreign(std::make_unique<std::unordered_set<repair_hash>>(std::move(current_set_diff)));
+        auto fp = make_foreign(std::make_unique<repair_hash_set>(std::move(current_set_diff)));
        return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id, needs_all_rows, fp = std::move(fp)] {
            auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
            if (fp.get_owner_shard() == this_shard_id()) {
@@ -1904,12 +1950,12 @@ static future<stop_iteration> repair_get_full_row_hashes_with_rpc_stream_process
    if (status == repair_stream_cmd::get_full_row_hashes) {
        return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id] {
            auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
-            return rm->get_full_row_hashes_handler().then([] (std::unordered_set<repair_hash> hashes) {
+            return rm->get_full_row_hashes_handler().then([] (repair_hash_set hashes) {
                _metrics.tx_hashes_nr += hashes.size();
                return hashes;
            });
-        }).then([sink] (std::unordered_set<repair_hash> hashes) mutable {
-            return do_with(std::move(hashes), [sink] (std::unordered_set<repair_hash>& hashes) mutable {
+        }).then([sink] (repair_hash_set hashes) mutable {
+            return do_with(std::move(hashes), [sink] (repair_hash_set& hashes) mutable {
                return do_for_each(hashes, [sink] (const repair_hash& hash) mutable {
                    return sink(repair_hash_with_cmd{repair_stream_cmd::hash_data, hash});
                }).then([sink] () mutable {
@@ -1932,7 +1978,7 @@ static future<> repair_get_row_diff_with_rpc_stream_handler(
        uint32_t repair_meta_id,
        rpc::sink<repair_row_on_wire_with_cmd> sink,
        rpc::source<repair_hash_with_cmd> source) {
-    return do_with(false, std::unordered_set<repair_hash>(), [from, src_cpu_id, repair_meta_id, sink, source] (bool& error, std::unordered_set<repair_hash>& current_set_diff) mutable {
+    return do_with(false, repair_hash_set(), [from, src_cpu_id, repair_meta_id, sink, source] (bool& error, repair_hash_set& current_set_diff) mutable {
        return repeat([from, src_cpu_id, repair_meta_id, sink, source, &error, &current_set_diff] () mutable {
            return source().then([from, src_cpu_id, repair_meta_id, sink, source, &error, &current_set_diff] (std::optional<std::tuple<repair_hash_with_cmd>> hash_cmd_opt) mutable {
                if (hash_cmd_opt) {
@@ -2075,7 +2121,7 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
            auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
            return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id] {
                auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
-                return rm->get_full_row_hashes_handler().then([] (std::unordered_set<repair_hash> hashes) {
+                return rm->get_full_row_hashes_handler().then([] (repair_hash_set hashes) {
                    _metrics.tx_hashes_nr += hashes.size();
                    return hashes;
                });
@@ -2103,11 +2149,11 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
            });
        });
        ms.register_repair_get_row_diff([] (const rpc::client_info& cinfo, uint32_t repair_meta_id,
-                std::unordered_set<repair_hash> set_diff, bool needs_all_rows) {
+                repair_hash_set set_diff, bool needs_all_rows) {
            auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
            auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
            _metrics.rx_hashes_nr += set_diff.size();
-            auto fp = make_foreign(std::make_unique<std::unordered_set<repair_hash>>(std::move(set_diff)));
+            auto fp = make_foreign(std::make_unique<repair_hash_set>(std::move(set_diff)));
            return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id, fp = std::move(fp), needs_all_rows] () mutable {
                auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
                if (fp.get_owner_shard() == this_shard_id()) {
@@ -2175,6 +2221,25 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
    });
 }

+future<> repair_uninit_messaging_service_handler() { // Counterpart of repair_init_messaging_service_handler: unregisters the repair RPC verbs listed below.
+    return netw::get_messaging_service().invoke_on_all([] (auto& ms) { // applied to every messaging_service instance
+        return when_all_succeed( // the unregister calls are all issued up front; the result fails if any of them fails
+            ms.unregister_repair_get_row_diff_with_rpc_stream(),
+            ms.unregister_repair_put_row_diff_with_rpc_stream(),
+            ms.unregister_repair_get_full_row_hashes_with_rpc_stream(),
+            ms.unregister_repair_get_full_row_hashes(),
+            ms.unregister_repair_get_combined_row_hash(),
+            ms.unregister_repair_get_sync_boundary(),
+            ms.unregister_repair_get_row_diff(),
+            ms.unregister_repair_put_row_diff(),
+            ms.unregister_repair_row_level_start(),
+            ms.unregister_repair_row_level_stop(),
+            ms.unregister_repair_get_estimated_partitions(),
+            ms.unregister_repair_set_estimated_partitions(),
+            ms.unregister_repair_get_diff_algorithms()).discard_result(); // collapse the combined result into a plain future<>
+    });
+}
+
 class row_level_repair {
    repair_info& _ri;
    sstring _cf_name;
@@ -2404,7 +2469,7 @@ private:
            // sequentially because the rows from repair follower 1 to
            // repair master might reduce the amount of missing data
            // between repair master and repair follower 2.
-            std::unordered_set<repair_hash> set_diff = repair_meta::get_set_diff(master.peer_row_hash_sets(node_idx), master.working_row_hashes().get0());
+            repair_hash_set set_diff = repair_meta::get_set_diff(master.peer_row_hash_sets(node_idx), master.working_row_hashes().get0());
            // Request missing sets from peer node
            rlogger.debug("Before get_row_diff to node {}, local={}, peer={}, set_diff={}",
                    node, master.working_row_hashes().get0().size(), master.peer_row_hash_sets(node_idx).size(), set_diff.size());
@@ -2427,9 +2492,9 @@ private:
        // So we can figure out which rows peer node are missing and send the missing rows to them
        check_in_shutdown();
        _ri.check_in_abort();
-        std::unordered_set<repair_hash> local_row_hash_sets = master.working_row_hashes().get0();
+        repair_hash_set local_row_hash_sets = master.working_row_hashes().get0();
        auto sz = _all_live_peer_nodes.size();
-        std::vector<std::unordered_set<repair_hash>> set_diffs(sz);
+        std::vector<repair_hash_set> set_diffs(sz);
        for (size_t idx : boost::irange(size_t(0), sz)) {
            set_diffs[idx] = repair_meta::get_set_diff(local_row_hash_sets, master.peer_row_hash_sets(idx));
        }
--- a/repair/row_level.hh
+++ b/repair/row_level.hh
@@ -45,6 +45,7 @@ private:
 };

 future<> repair_init_messaging_service_handler(repair_service& rs, distributed<db::system_distributed_keyspace>& sys_dist_ks, distributed<db::view::view_update_generator>& view_update_generator);
+future<> repair_uninit_messaging_service_handler();

 class repair_info;

--- a/schema.cc
+++ b/schema.cc
@@ -43,6 +43,8 @@

 constexpr int32_t schema::NAME_LENGTH;

+extern logging::logger dblog;
+
 sstring to_sstring(column_kind k) {
    switch (k) {
    case column_kind::partition_key:  return "PARTITION_KEY";
@@ -592,11 +594,15 @@ schema::get_column_definition(const bytes& name) const {

 const column_definition&
 schema::column_at(column_kind kind, column_id id) const {
-    return _raw._columns.at(column_offset(kind) + id);
+    return column_at(static_cast<ordinal_column_id>(column_offset(kind) + id)); // delegate to the ordinal overload so its bounds check covers this path too
 }

 const column_definition&
 schema::column_at(ordinal_column_id ordinal_id) const {
+    if (size_t(ordinal_id) >= _raw._columns.size()) { // explicit range check ahead of the .at() lookup below
+        on_internal_error(dblog, format("{}.{}@{}: column id {:d} >= {:d}", // message carries keyspace.table@version, the offending id and the column count
+            ks_name(), cf_name(), version(), size_t(ordinal_id), _raw._columns.size()));
+    }
    return _raw._columns.at(static_cast<column_count_type>(ordinal_id));
 }

--- a/scripts/create-relocatable-package.py
+++ b/scripts/create-relocatable-package.py
@@ -79,7 +79,8 @@ executables = ['build/{}/scylla'.format(args.mode),
               '/usr/sbin/ethtool',
               '/usr/bin/netstat',
               '/usr/bin/hwloc-distrib',
-               '/usr/bin/hwloc-calc']
+               '/usr/bin/hwloc-calc',
+               '/usr/bin/lsblk']

 output = args.dest

--- a/scylla-gdb.py
+++ b/scylla-gdb.py
@@ -597,7 +597,7 @@ def current_shard():


 def find_db(shard=None):
-    if not shard:
+    if shard is None:  # `not shard` is also true for shard 0, which would silently replace an explicit 0 with the current shard
        shard = current_shard()
    return gdb.parse_and_eval('::debug::db')['_instances']['_M_impl']['_M_start'][shard]['service']['_p']

--- a/scylla_post_install.sh
+++ b/scylla_post_install.sh
@@ -63,6 +63,17 @@ MemoryHigh=1200M
 MemoryMax=1400M
 MemoryLimit=1400M
 EOS
+
+# On CentOS7, systemd does not support percentage-based parameters.
+# To apply the memory limit on CentOS7, we need to override the parameter
+# in bytes instead of as a percentage.
+elif [ "$RHEL" -a "$VERSION_ID" = "7" ]; then
+    MEMORY_LIMIT=$((MEMTOTAL_BYTES / 100 * 5))
+    mkdir -p /etc/systemd/system/scylla-helper.slice.d/
+    cat << EOS > /etc/systemd/system/scylla-helper.slice.d/memory.conf
+[Slice]
+MemoryLimit=$MEMORY_LIMIT
+EOS
 fi

 systemctl --system daemon-reload >/dev/null || true
--- a/2
+++ b/2
--- a/serializer_impl.hh
+++ b/serializer_impl.hh
@@ -25,6 +25,7 @@
 #include <seastar/util/bool_class.hh>
 #include <boost/range/algorithm/for_each.hpp>
 #include "utils/small_vector.hh"
+#include <absl/container/btree_set.h>

 namespace ser {

@@ -81,6 +82,17 @@ static inline void serialize_array(Output& out, const Container& v) {
 template<typename Container>
 struct container_traits;

+template<typename T>
+struct container_traits<absl::btree_set<T>> { // absl::btree_set counterpart of the std::unordered_set specialization that follows
+    struct back_emplacer { // callable that inserts one element at a time into the referenced set
+        absl::btree_set<T>& c; // target set, held by reference (not owned)
+        back_emplacer(absl::btree_set<T>& c_) : c(c_) {}
+        void operator()(T&& v) {
+            c.emplace(std::move(v)); // plain emplace: the element is moved into the set
+        }
+    };
+};
+
 template<typename T>
 struct container_traits<std::unordered_set<T>> {
    struct back_emplacer {
@@ -253,6 +265,27 @@ struct serializer<std::list<T>> {
    }
 };

+template<typename T>
+struct serializer<absl::btree_set<T>> { // (de)serialization of absl::btree_set<T>: a uint32 element count followed by the elements
+    template<typename Input>
+    static absl::btree_set<T> read(Input& in) {
+        auto sz = deserialize(in, boost::type<uint32_t>()); // element count comes first
+        absl::btree_set<T> v;
+        deserialize_array_helper<false, T>::doit(in, v, sz); // reads sz elements into v
+        return v;
+    }
+    template<typename Output>
+    static void write(Output& out, const absl::btree_set<T>& v) {
+        safe_serialize_as_uint32(out, v.size()); // size is written as a uint32, matching what read() expects
+        serialize_array_helper<false, T>::doit(out, v);
+    }
+    template<typename Input>
+    static void skip(Input& in) {
+        auto sz = deserialize(in, boost::type<uint32_t>()); // the count must still be read to know how many elements to skip
+        skip_array<T>(in, sz);
+    }
+};
+
 template<typename T>
 struct serializer<std::unordered_set<T>> {
    template<typename Input>
--- a/service/migration_manager.cc
+++ b/service/migration_manager.cc
@@ -92,7 +92,7 @@ void migration_manager::init_messaging_service()
        //FIXME: future discarded.
        (void)with_gate(_background_tasks, [this] {
            mlogger.debug("features changed, recalculating schema version");
-            return update_schema_version_and_announce(get_storage_proxy(), _feat.cluster_schema_features());
+            return db::schema_tables::recalculate_schema_version(get_storage_proxy(), _feat);
        });
    };

@@ -1104,6 +1104,20 @@ future<schema_ptr> get_schema_definition(table_schema_version v, netw::messaging
        mlogger.debug("Requesting schema {} from {}", v, dst);
        auto& ms = netw::get_local_messaging_service();
        return ms.send_get_schema_version(dst, v);
+    }).then([] (schema_ptr s) {
+        // If this is a view, its schema also needs a reference to the base
+        // table.
+        if (s->is_view()) {
+            if (!s->view_info()->base_info()) {
+                auto& db = service::get_local_storage_proxy().get_db().local();
+                // This line might throw no_such_column_family.
+                // That should be fine: if we tried to register a view for which
+                // we don't know the base table, our registry is broken.
+                schema_ptr base_schema = db.find_schema(s->view_info()->base_id());
+                s->view_info()->set_base_info(s->view_info()->make_base_dependent_view_info(*base_schema));
+            }
+        }
+        return s;
    });
 }

--- a/service/pager/query_pagers.cc
+++ b/service/pager/query_pagers.cc
@@ -349,7 +349,7 @@ public:
            _max = _max - row_count;
            _exhausted = (row_count < page_size && !results->is_short_read()) || _max == 0;

-            if (!_exhausted || row_count > 0) {
+            if (!_exhausted && row_count > 0) {
                if (_last_pkey) {
                    update_slice(*_last_pkey);
                }
--- a/service/storage_proxy.cc
+++ b/service/storage_proxy.cc
@@ -119,9 +119,11 @@ using fbu = utils::fb_utilities;

 static inline
 query::digest_algorithm digest_algorithm(service::storage_proxy& proxy) {
-    return proxy.features().cluster_supports_xxhash_digest_algorithm()
-         ? query::digest_algorithm::xxHash
-         : query::digest_algorithm::MD5;
+    return proxy.features().cluster_supports_digest_for_null_values() // checked first: when set, plain xxHash is selected
+            ? query::digest_algorithm::xxHash
+            : proxy.features().cluster_supports_xxhash_digest_algorithm() // otherwise choose between the legacy xxHash variant and MD5
+                    ? query::digest_algorithm::legacy_xxHash_without_null_digest
+                    : query::digest_algorithm::MD5;
 }

 static inline
@@ -1728,6 +1730,7 @@ storage_proxy::storage_proxy(distributed<database>& db, storage_proxy::config cf
    , _token_metadata(tm)
    , _read_smp_service_group(cfg.read_smp_service_group)
    , _write_smp_service_group(cfg.write_smp_service_group)
+    , _hints_write_smp_service_group(cfg.hints_write_smp_service_group)
    , _write_ack_smp_service_group(cfg.write_ack_smp_service_group)
    , _next_response_id(std::chrono::system_clock::now().time_since_epoch()/1ms)
    , _hints_resource_manager(cfg.available_memory / 10)
@@ -1771,37 +1774,47 @@ storage_proxy::response_id_type storage_proxy::unique_response_handler::release(
 }

 future<>
-storage_proxy::mutate_locally(const mutation& m, clock_type::time_point timeout) {
+storage_proxy::mutate_locally(const mutation& m, db::commitlog::force_sync sync, clock_type::time_point timeout, smp_service_group smp_grp) { // applies m on the shard returned by shard_of(m)
    auto shard = _db.local().shard_of(m);
    get_stats().replica_cross_shard_ops += shard != this_shard_id();
-    return _db.invoke_on(shard, {_write_smp_service_group, timeout}, [s = global_schema_ptr(m.schema()), m = freeze(m), timeout] (database& db) -> future<> {
-        return db.apply(s, m, db::commitlog::force_sync::no, timeout);
+    return _db.invoke_on(shard, {smp_grp, timeout}, // the service group is chosen by the caller rather than fixed to _write_smp_service_group
+            [s = global_schema_ptr(m.schema()),
+             m = freeze(m), // the lambda owns a frozen copy of the mutation
+             timeout,
+             sync] (database& db) mutable -> future<> {
+        return db.apply(s, m, sync, timeout); // sync is forwarded as given by the caller
    });
 }

 future<>
-storage_proxy::mutate_locally(const schema_ptr& s, const frozen_mutation& m, db::commitlog::force_sync sync, clock_type::time_point timeout) {
+storage_proxy::mutate_locally(const schema_ptr& s, const frozen_mutation& m, db::commitlog::force_sync sync, clock_type::time_point timeout, // frozen-mutation variant
+        smp_service_group smp_grp) { // smp_grp: service group passed to invoke_on below
    auto shard = _db.local().shard_of(m);
    get_stats().replica_cross_shard_ops += shard != this_shard_id();
-    return _db.invoke_on(shard, {_write_smp_service_group, timeout}, [&m, gs = global_schema_ptr(s), timeout, sync] (database& db) -> future<> {
+    return _db.invoke_on(shard, {smp_grp, timeout},
+            [&m, gs = global_schema_ptr(s), timeout, sync] (database& db) mutable -> future<> { // NOTE(review): m is captured by reference; presumably the caller keeps it alive until the future resolves - confirm
        return db.apply(gs, m, sync, timeout);
    });
 }

 future<>
-storage_proxy::mutate_locally(std::vector<mutation> mutations, clock_type::time_point timeout) {
-    return do_with(std::move(mutations), [this, timeout] (std::vector<mutation>& pmut){
-        return parallel_for_each(pmut.begin(), pmut.end(), [this, timeout] (const mutation& m) {
-            return mutate_locally(m, timeout);
+storage_proxy::mutate_locally(std::vector<mutation> mutations, clock_type::time_point timeout, smp_service_group smp_grp) { // applies every mutation in the vector using smp_grp
+    return do_with(std::move(mutations), [this, timeout, smp_grp] (std::vector<mutation>& pmut) { // the vector is moved into do_with and handed back by reference
+        return parallel_for_each(pmut.begin(), pmut.end(), [this, timeout, smp_grp] (const mutation& m) {
+            return mutate_locally(m, db::commitlog::force_sync::no, timeout, smp_grp); // this overload always passes force_sync::no
        });
    });
 }

+future<> 
+storage_proxy::mutate_locally(std::vector<mutation> mutation, clock_type::time_point timeout) { // overload without a service-group argument
+        return mutate_locally(std::move(mutation), timeout, _write_smp_service_group); // forwards to the three-argument overload with the regular write group
+}
 future<>
 storage_proxy::mutate_hint(const schema_ptr& s, const frozen_mutation& m, clock_type::time_point timeout) {
    auto shard = _db.local().shard_of(m);
    get_stats().replica_cross_shard_ops += shard != this_shard_id();
-    return _db.invoke_on(shard, {_write_smp_service_group, timeout}, [&m, gs = global_schema_ptr(s), timeout] (database& db) -> future<> {
+    return _db.invoke_on(shard, {_hints_write_smp_service_group, timeout}, [&m, gs = global_schema_ptr(s), timeout] (database& db) mutable -> future<> { // hints go through their own smp service group instead of _write_smp_service_group
        return db.apply_hint(gs, m, timeout);
    });
 }
@@ -4440,6 +4453,12 @@ future<bool> storage_proxy::cas(schema_ptr schema, shared_ptr<cas_request> reque
                                    paxos::paxos_state::logger.debug("CAS[{}] successful", handler->id());
                                    tracing::trace(handler->tr_state, "CAS successful");
                                    return std::optional<bool>(condition_met);
+                                }).handle_exception_type([handler] (unavailable_exception& e) {
+                                    // If the learning stage hit an unavailability error, re-map it to a write error:
+                                    // "unavailable" means the operation never started, which is not the case here.
+                                    schema_ptr schema = handler->schema();
+                                    return make_exception_future<std::optional<bool>>(mutation_write_timeout_exception(schema->ks_name(), schema->cf_name(),
+                                                               e.consistency, e.alive, e.required, db::write_type::CAS));
                                });
                            }
                            paxos::paxos_state::logger.debug("CAS[{}] PAXOS proposal not accepted (pre-empted by a higher ballot)",
@@ -4800,7 +4819,7 @@ void storage_proxy::init_messaging_service() {
        });
    };

-    auto receive_mutation_handler = [] (const rpc::client_info& cinfo, rpc::opt_time_point t, frozen_mutation in, std::vector<gms::inet_address> forward,
+    auto receive_mutation_handler = [] (smp_service_group smp_grp, const rpc::client_info& cinfo, rpc::opt_time_point t, frozen_mutation in, std::vector<gms::inet_address> forward,
            gms::inet_address reply_to, unsigned shard, storage_proxy::response_id_type response_id, rpc::optional<std::optional<tracing::trace_info>> trace_info) {
        tracing::trace_state_ptr trace_state_ptr;
        auto src_addr = netw::messaging_service::get_source(cinfo);
@@ -4808,9 +4827,9 @@ void storage_proxy::init_messaging_service() {
        utils::UUID schema_version = in.schema_version();
        return handle_write(src_addr, t, schema_version, std::move(in), std::move(forward), reply_to, shard, response_id,
                trace_info ? *trace_info : std::nullopt,
-                /* apply_fn */ [] (shared_ptr<storage_proxy>& p, tracing::trace_state_ptr, schema_ptr s, const frozen_mutation& m,
+                /* apply_fn */ [smp_grp] (shared_ptr<storage_proxy>& p, tracing::trace_state_ptr, schema_ptr s, const frozen_mutation& m,
                        clock_type::time_point timeout) {
-                    return p->mutate_locally(std::move(s), m, db::commitlog::force_sync::no, timeout);
+                    return p->mutate_locally(std::move(s), m, db::commitlog::force_sync::no, timeout, smp_grp);
                },
                /* forward_fn */ [] (netw::messaging_service::msg_addr addr, clock_type::time_point timeout, const frozen_mutation& m,
                        gms::inet_address reply_to, unsigned shard, response_id_type response_id,
@@ -4819,8 +4838,14 @@ void storage_proxy::init_messaging_service() {
                    return ms.send_mutation(addr, timeout, m, {}, reply_to, shard, response_id, std::move(trace_info));
                });
    };
-    ms.register_mutation(receive_mutation_handler);
-    ms.register_hint_mutation(receive_mutation_handler);
+    auto make_receive_mutation_handler = [receive_mutation_handler] (smp_service_group grp) {
+        return [receive_mutation_handler, grp] (const rpc::client_info& cinfo, rpc::opt_time_point t, frozen_mutation in, std::vector<gms::inet_address> forward,
+            gms::inet_address reply_to, unsigned shard, storage_proxy::response_id_type response_id, rpc::optional<std::optional<tracing::trace_info>> trace_info) {
+            return receive_mutation_handler(grp, cinfo, t, std::move(in), std::move(forward), reply_to, shard, response_id, trace_info);
+        };
+    };
+    ms.register_mutation(make_receive_mutation_handler(_write_smp_service_group));
+    ms.register_hint_mutation(make_receive_mutation_handler(_hints_write_smp_service_group));

    ms.register_paxos_learn([] (const rpc::client_info& cinfo, rpc::opt_time_point t, paxos::proposal decision,
            std::vector<gms::inet_address> forward, gms::inet_address reply_to, unsigned shard,
@@ -5063,18 +5088,22 @@ void storage_proxy::init_messaging_service() {
 future<> storage_proxy::uninit_messaging_service() {
    auto& ms = netw::get_local_messaging_service();
    return when_all_succeed(
+        ms.unregister_counter_mutation(), // counter mutation handler is unregistered along with the others
        ms.unregister_mutation(),
+        ms.unregister_hint_mutation(), // pairs with the register_hint_mutation() call in init_messaging_service()
        ms.unregister_mutation_done(),
        ms.unregister_mutation_failed(),
        ms.unregister_read_data(),
        ms.unregister_read_mutation_data(),
        ms.unregister_read_digest(),
        ms.unregister_truncate(),
+        ms.unregister_get_schema_version(), // schema-version handler is unregistered here as well
        ms.unregister_paxos_prepare(),
        ms.unregister_paxos_accept(),
        ms.unregister_paxos_learn(),
        ms.unregister_paxos_prune()
    );
+
 }

 future<rpc::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>>
@@ -5167,8 +5196,7 @@ future<> storage_proxy::drain_on_shutdown() {

 future<>
 storage_proxy::stop() {
-    // FIXME: hints manager should be stopped here but it seems like this function is never called
-    return uninit_messaging_service();
+    return make_ready_future<>(); // stop() no longer unregisters handlers itself; NOTE(review): confirm the shutdown path calls uninit_messaging_service() explicitly
 }

 }
--- a/service/storage_proxy.hh
+++ b/service/storage_proxy.hh
@@ -166,6 +166,7 @@ public:
        size_t available_memory;
        smp_service_group read_smp_service_group = default_smp_service_group();
        smp_service_group write_smp_service_group = default_smp_service_group();
+        smp_service_group hints_write_smp_service_group = default_smp_service_group();
        // Write acknowledgments might not be received on the correct shard, and
        // they need a separate smp_service_group to prevent an ABBA deadlock
        // with writes.
@@ -256,6 +257,7 @@ private:
    locator::token_metadata& _token_metadata;
    smp_service_group _read_smp_service_group;
    smp_service_group _write_smp_service_group;
+    smp_service_group _hints_write_smp_service_group;
    smp_service_group _write_ack_smp_service_group;
    response_id_type _next_response_id;
    response_handlers_map _response_handlers;
@@ -299,7 +301,6 @@ private:
    cdc::cdc_service* _cdc = nullptr;
    cdc_stats _cdc_stats;
 private:
-    future<> uninit_messaging_service();
    future<coordinator_query_result> query_singular(lw_shared_ptr<query::read_command> cmd,
            dht::partition_range_vector&& partition_ranges,
            db::consistency_level cl,
@@ -453,13 +454,31 @@ public:
        return next;
    }
    void init_messaging_service();
+    future<> uninit_messaging_service();

+private:
    // Applies mutation on this node.
    // Resolves with timed_out_error when timeout is reached.
-    future<> mutate_locally(const mutation& m, clock_type::time_point timeout = clock_type::time_point::max());
+    future<> mutate_locally(const mutation& m, db::commitlog::force_sync sync, clock_type::time_point timeout, smp_service_group smp_grp);
    // Applies mutation on this node.
    // Resolves with timed_out_error when timeout is reached.
-    future<> mutate_locally(const schema_ptr&, const frozen_mutation& m, db::commitlog::force_sync sync, clock_type::time_point timeout = clock_type::time_point::max());
+    future<> mutate_locally(const schema_ptr&, const frozen_mutation& m, db::commitlog::force_sync sync, clock_type::time_point timeout,
+            smp_service_group smp_grp);
+    // Applies mutations on this node.
+    // Resolves with timed_out_error when timeout is reached.
+    future<> mutate_locally(std::vector<mutation> mutation, clock_type::time_point timeout, smp_service_group smp_grp);
+
+public:
+    // Applies mutation on this node, using _write_smp_service_group.
+    // Resolves with timed_out_error when timeout is reached.
+    future<> mutate_locally(const mutation& m, db::commitlog::force_sync sync, clock_type::time_point timeout = clock_type::time_point::max()) {
+        return mutate_locally(m, sync, timeout, _write_smp_service_group); // forwards to the private overload that takes an explicit service group
+    }
+    // Applies frozen mutation on this node, using _write_smp_service_group.
+    // Resolves with timed_out_error when timeout is reached.
+    future<> mutate_locally(const schema_ptr& s, const frozen_mutation& m, db::commitlog::force_sync sync, clock_type::time_point timeout = clock_type::time_point::max()) {
+        return mutate_locally(s, m, sync, timeout, _write_smp_service_group); // forwards to the private overload that takes an explicit service group
+    }
    // Applies mutations on this node.
    // Resolves with timed_out_error when timeout is reached.
    future<> mutate_locally(std::vector<mutation> mutation, clock_type::time_point timeout = clock_type::time_point::max());
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -420,6 +420,9 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
        app_states.emplace(gms::application_state::CDC_STREAMS_TIMESTAMP, versioned_value::cdc_streams_timestamp(_cdc_streams_ts));
        app_states.emplace(gms::application_state::STATUS, versioned_value::normal(my_tokens));
    }
+    if (replacing_a_node_with_same_ip || replacing_a_node_with_diff_ip) {
+        app_states.emplace(gms::application_state::TOKENS, versioned_value::tokens(_bootstrap_tokens));
+    }
    slogger.info("Starting up server gossip");

    auto generation_number = db::system_keyspace::increment_and_get_generation().get0();
--- a/sstables/compaction.cc
+++ b/sstables/compaction.cc
@@ -126,8 +126,8 @@ static std::vector<shared_sstable> get_uncompacting_sstables(column_family& cf,
 class compaction;

 struct compaction_writer {
-    sstable_writer writer;
    shared_sstable sst;
+    sstable_writer writer; // must stay declared after sst: members are destroyed in reverse declaration order, so the writer is destroyed before the sstable it may depend on
 };

 class compacting_sstable_writer {
@@ -541,10 +541,12 @@ private:
                                         std::move(gc_consumer));

            return seastar::async([cfc = std::move(cfc), reader = std::move(reader), this] () mutable {
-                reader.consume_in_thread(std::move(cfc), make_partition_filter(), db::no_timeout);
+                reader.consume_in_thread(std::move(cfc), db::no_timeout);
            });
        });
-        return consumer(make_sstable_reader());
+        // producer will filter out a partition before it reaches the consumer(s)
+        auto producer = make_filtering_reader(make_sstable_reader(), make_partition_filter());
+        return consumer(std::move(producer));
    }

    virtual reader_consumer make_interposer_consumer(reader_consumer end_consumer) = 0;
@@ -784,7 +786,8 @@ public:
        cfg.max_sstable_size = _max_sstable_size;
        cfg.monitor = &_active_write_monitors.back();
        cfg.run_identifier = _run_identifier;
-        return compaction_writer{sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats(), priority), sst};
+        auto writer = sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats(), priority);
+        return compaction_writer{std::move(sst), std::move(writer)};
    }

    virtual void stop_sstable_writer(compaction_writer* writer) override {
@@ -1266,7 +1269,8 @@ public:
        // sstables generated for a given shard will share the same run identifier.
        cfg.run_identifier = _run_identifiers.at(shard);
        auto&& priority = service::get_local_compaction_priority();
-        return compaction_writer{sst->get_writer(*_schema, partitions_per_sstable(shard), cfg, get_encoding_stats(), priority, shard), sst};
+        auto writer = sst->get_writer(*_schema, partitions_per_sstable(shard), cfg, get_encoding_stats(), priority, shard);
+        return compaction_writer{std::move(sst), std::move(writer)};
    }

    void on_new_partition() override {}
--- a/sstables/compaction_manager.cc
+++ b/sstables/compaction_manager.cc
@@ -218,7 +218,7 @@ std::vector<sstables::shared_sstable> compaction_manager::get_candidates(const c
    auto& cs = cf.get_compaction_strategy();

    // Filter out sstables that are being compacted.
-    for (auto& sst : cf.candidates_for_compaction()) {
+    for (auto& sst : cf.non_staging_sstables()) {
        if (_compacting_sstables.count(sst)) {
            continue;
        }
@@ -663,8 +663,8 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
    return task->compaction_done.get_future().then([task] {});
 }

-static bool needs_cleanup(const sstables::shared_sstable& sst,
-                   const dht::token_range_vector& owned_ranges,
+bool needs_cleanup(const sstables::shared_sstable& sst,
+                   const dht::token_range_vector& sorted_owned_ranges,
                   schema_ptr s) {
    auto first = sst->get_first_partition_key();
    auto last = sst->get_last_partition_key();
@@ -672,29 +672,40 @@ static bool needs_cleanup(const sstables::shared_sstable& sst,
    auto last_token = dht::get_token(*s, last);
    dht::token_range sst_token_range = dht::token_range::make(first_token, last_token);

+    auto r = std::lower_bound(sorted_owned_ranges.begin(), sorted_owned_ranges.end(), first_token,
+            [] (const range<dht::token>& a, const dht::token& b) {
+        // check that range a is before token b.
+        return a.after(b, dht::token_comparator());
+    });
+
    // return true iff sst partition range isn't fully contained in any of the owned ranges.
-    for (auto& r : owned_ranges) {
-        if (r.contains(sst_token_range, dht::token_comparator())) {
+    if (r != sorted_owned_ranges.end()) {
+        if (r->contains(sst_token_range, dht::token_comparator())) {
            return false;
        }
    }
    return true;
 }

-future<> compaction_manager::perform_cleanup(column_family* cf) {
+future<> compaction_manager::perform_cleanup(database& db, column_family* cf) { // db is used to look up the keyspace's replication strategy
    if (check_for_cleanup(cf)) {
        throw std::runtime_error(format("cleanup request failed: there is an ongoing cleanup on {}.{}",
            cf->schema()->ks_name(), cf->schema()->cf_name()));
    }
-    return rewrite_sstables(cf, sstables::compaction_options::make_cleanup(), [this] (const table& table) {
-        auto schema = table.schema();
-        auto owned_ranges = service::get_local_storage_service().get_local_ranges(schema->ks_name());
+    return seastar::async([this, cf, &db] { // selection runs inside seastar::async so the *_in_thread / maybe_yield calls below are usable
+        auto schema = cf->schema();
+        auto& rs = db.find_keyspace(schema->ks_name()).get_replication_strategy();
+        auto sorted_owned_ranges = rs.get_ranges_in_thread(utils::fb_utilities::get_broadcast_address()); // ranges for this node's broadcast address; NOTE(review): presumably returned sorted, as needs_cleanup() binary-searches them - confirm
        auto sstables = std::vector<sstables::shared_sstable>{};
-        const auto candidates = table.candidates_for_compaction();
-        std::copy_if(candidates.begin(), candidates.end(), std::back_inserter(sstables), [&owned_ranges, schema] (const sstables::shared_sstable& sst) {
-            return owned_ranges.empty() || needs_cleanup(sst, owned_ranges, schema);
+        const auto candidates = get_candidates(*cf); // candidates now come from get_candidates() rather than candidates_for_compaction()
+        std::copy_if(candidates.begin(), candidates.end(), std::back_inserter(sstables), [&sorted_owned_ranges, schema] (const sstables::shared_sstable& sst) {
+            seastar::thread::maybe_yield(); // give the scheduler a chance between sstables
+            return sorted_owned_ranges.empty() || needs_cleanup(sst, sorted_owned_ranges, schema); // with no owned ranges every candidate is selected
        });
        return sstables;
+    }).then([this, cf] (std::vector<sstables::shared_sstable> sstables) {
+        return rewrite_sstables(cf, sstables::compaction_options::make_cleanup(), // rewrite exactly the list computed above
+                [sstables = std::move(sstables)] (const table&) { return sstables; });
    });
 }

@@ -709,7 +720,7 @@ future<> compaction_manager::perform_sstable_upgrade(column_family* cf, bool exc
        return cf->run_with_compaction_disabled([this, cf, &tables, exclude_current_version] {
            auto last_version = cf->get_sstables_manager().get_highest_supported_format();

-            for (auto& sst : cf->candidates_for_compaction()) {
+            for (auto& sst : get_candidates(*cf)) {
                // if we are a "normal" upgrade, we only care about
                // tables with older versions, but potentially
                // we are to actually rewrite everything. (-a)
@@ -734,8 +745,8 @@ future<> compaction_manager::perform_sstable_upgrade(column_family* cf, bool exc

 // Submit a column family to be scrubbed and wait for its termination.
 future<> compaction_manager::perform_sstable_scrub(column_family* cf, bool skip_corrupted) {
-    return rewrite_sstables(cf, sstables::compaction_options::make_scrub(skip_corrupted), [] (const table& cf) {
-        return cf.candidates_for_compaction();
+    return rewrite_sstables(cf, sstables::compaction_options::make_scrub(skip_corrupted), [this] (const table& cf) { // `this` is captured because get_candidates() is a member
+        return get_candidates(cf); // get_candidates() filters out sstables that are already being compacted
    });
 }

--- a/sstables/compaction_manager.hh
+++ b/sstables/compaction_manager.hh
@@ -175,7 +175,7 @@ public:
    // Cleanup is about discarding keys that are no longer relevant for a
    // given sstable, e.g. after node loses part of its token range because
    // of a newly added node.
-    future<> perform_cleanup(column_family* cf);
+    future<> perform_cleanup(database& db, column_family* cf);

    // Submit a column family to be upgraded and wait for its termination.
    future<> perform_sstable_upgrade(column_family* cf, bool exclude_current_version);
@@ -243,3 +243,5 @@ public:
    friend class compaction_weight_registration;
 };

+bool needs_cleanup(const sstables::shared_sstable& sst, const dht::token_range_vector& owned_ranges, schema_ptr s);
+
--- a/sstables/compaction_strategy.cc
+++ b/sstables/compaction_strategy.cc
@@ -58,8 +58,6 @@
 #include "time_window_compaction_strategy.hh"
 #include "sstables/compaction_backlog_manager.hh"
 #include "sstables/size_tiered_backlog_tracker.hh"
-#include "mutation_source_metadata.hh"
-#include "mutation_writer/timestamp_based_splitting_writer.hh"

 logging::logger date_tiered_manifest::logger = logging::logger("DateTieredCompactionStrategy");
 logging::logger leveled_manifest::logger("LeveledManifest");
@@ -785,65 +783,6 @@ time_window_compaction_strategy::time_window_compaction_strategy(const std::map<
    _use_clustering_key_filter = true;
 }

-uint64_t time_window_compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) {
-    if (!ms_meta.min_timestamp || !ms_meta.max_timestamp) {
-        // Not enough information, we assume the worst
-        return partition_estimate / max_data_segregation_window_count;
-    }
-    const auto min_window = get_window_for(_options, *ms_meta.min_timestamp);
-    const auto max_window = get_window_for(_options, *ms_meta.max_timestamp);
-    const auto window_size = get_window_size(_options);
-
-    auto estimated_window_count = (max_window + (window_size - 1) - min_window) / window_size;
-
-    return partition_estimate / std::max(1UL, uint64_t(estimated_window_count));
-}
-
-namespace {
-
-class classify_by_timestamp {
-    time_window_compaction_strategy_options _options;
-    std::vector<int64_t> _known_windows;
-
-public:
-    explicit classify_by_timestamp(time_window_compaction_strategy_options options) : _options(std::move(options)) { }
-    int64_t operator()(api::timestamp_type ts) {
-        const auto window = time_window_compaction_strategy::get_window_for(_options, ts);
-        if (const auto it = boost::find(_known_windows, window); it != _known_windows.end()) {
-            std::swap(*it, _known_windows.front());
-            return window;
-        }
-        if (_known_windows.size() < time_window_compaction_strategy::max_data_segregation_window_count) {
-            _known_windows.push_back(window);
-            return window;
-        }
-        int64_t closest_window;
-        int64_t min_diff = std::numeric_limits<int64_t>::max();
-        for (const auto known_window : _known_windows) {
-            if (const auto diff = std::abs(known_window - window); diff < min_diff) {
-                min_diff = diff;
-                closest_window = known_window;
-            }
-        }
-        return closest_window;
-    };
-};
-
-} // anonymous namespace
-
-reader_consumer time_window_compaction_strategy::make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer end_consumer) {
-    if (ms_meta.min_timestamp && ms_meta.max_timestamp
-            && get_window_for(_options, *ms_meta.min_timestamp) == get_window_for(_options, *ms_meta.max_timestamp)) {
-        return end_consumer;
-    }
-    return [options = _options, end_consumer = std::move(end_consumer)] (flat_mutation_reader rd) mutable -> future<> {
-        return mutation_writer::segregate_by_timestamp(
-                std::move(rd),
-                classify_by_timestamp(std::move(options)),
-                std::move(end_consumer));
-    };
-}
-
 } // namespace sstables

 std::vector<sstables::shared_sstable>
--- a/sstables/index_reader.hh
+++ b/sstables/index_reader.hh
@@ -401,9 +401,16 @@ private:
                    auto indexes = std::move(entries_reader->_consumer.indexes);
                    return entries_reader->_context.close().then([indexes = std::move(indexes), ex = std::move(ex)] () mutable {
                        if (ex) {
-                            std::rethrow_exception(std::move(ex));
+                            return do_with(std::move(indexes), [ex = std::move(ex)] (index_list& indexes) mutable {
+                                return parallel_for_each(indexes, [] (index_entry& ie) mutable {
+                                    return ie.close_pi_stream();
+                                }).then_wrapped([ex = std::move(ex)] (future<>&& fut) mutable {
+                                    fut.ignore_ready_future();
+                                    return make_exception_future<index_list>(std::move(ex));
+                                });
+                            });
                        }
-                        return std::move(indexes);
+                        return make_ready_future<index_list>(std::move(indexes));
                    });

                });
--- a/sstables/mp_row_consumer.hh
+++ b/sstables/mp_row_consumer.hh
@@ -374,6 +374,7 @@ private:
        _fwd_end = _fwd ? position_in_partition::before_all_clustered_rows() : position_in_partition::after_all_clustered_rows();
        _out_of_range = false;
        _range_tombstones.reset();
+        _ready = {};
        _first_row_encountered = false;
    }
 public:
--- a/sstables/size_tiered_compaction_strategy.cc
+++ b/sstables/size_tiered_compaction_strategy.cc
@@ -27,7 +27,7 @@
 namespace sstables {

 std::vector<std::pair<sstables::shared_sstable, uint64_t>>
-size_tiered_compaction_strategy::create_sstable_and_length_pairs(const std::vector<sstables::shared_sstable>& sstables) const {
+size_tiered_compaction_strategy::create_sstable_and_length_pairs(const std::vector<sstables::shared_sstable>& sstables) {

    std::vector<std::pair<sstables::shared_sstable, uint64_t>> sstable_length_pairs;
    sstable_length_pairs.reserve(sstables.size());
@@ -43,7 +43,7 @@ size_tiered_compaction_strategy::create_sstable_and_length_pairs(const std::vect
 }

 std::vector<std::vector<sstables::shared_sstable>>
-size_tiered_compaction_strategy::get_buckets(const std::vector<sstables::shared_sstable>& sstables) const {
+size_tiered_compaction_strategy::get_buckets(const std::vector<sstables::shared_sstable>& sstables, size_tiered_compaction_strategy_options options) {
    // sstables sorted by size of its data file.
    auto sorted_sstables = create_sstable_and_length_pairs(sstables);

@@ -64,8 +64,8 @@ size_tiered_compaction_strategy::get_buckets(const std::vector<sstables::shared_
        for (auto it = buckets.begin(); it != buckets.end(); it++) {
            size_t old_average_size = it->first;

-            if ((size > (old_average_size * _options.bucket_low) && size < (old_average_size * _options.bucket_high)) ||
-                    (size < _options.min_sstable_size && old_average_size < _options.min_sstable_size)) {
+            if ((size > (old_average_size * options.bucket_low) && size < (old_average_size * options.bucket_high)) ||
+                    (size < options.min_sstable_size && old_average_size < options.min_sstable_size)) {
                auto bucket = std::move(it->second);
                size_t total_size = bucket.size() * old_average_size;
                size_t new_average_size = (total_size + size) / (bucket.size() + 1);
@@ -97,6 +97,11 @@ size_tiered_compaction_strategy::get_buckets(const std::vector<sstables::shared_
    return bucket_list;
 }

+// Convenience overload: buckets `sstables` using this strategy's own _options.
+std::vector<std::vector<sstables::shared_sstable>> size_tiered_compaction_strategy::get_buckets(const std::vector<sstables::shared_sstable>& sstables) const {
+    return get_buckets(sstables, _options);
+}
+
 std::vector<sstables::shared_sstable>
 size_tiered_compaction_strategy::most_interesting_bucket(std::vector<std::vector<sstables::shared_sstable>> buckets,
        unsigned min_threshold, unsigned max_threshold)
@@ -176,23 +181,28 @@ size_tiered_compaction_strategy::get_sstables_for_compaction(column_family& cfs,
    return sstables::compaction_descriptor();
 }

+// Sums ceil(bucket size / max_threshold) over every bucket that holds at least min_threshold sstables.
+int64_t size_tiered_compaction_strategy::estimated_pending_compactions(const std::vector<sstables::shared_sstable>& sstables,
+        int min_threshold, int max_threshold, size_tiered_compaction_strategy_options options) {
+    int64_t pending = 0;
+    for (auto& candidate : get_buckets(sstables, options)) {
+        if (candidate.size() < size_t(min_threshold)) { continue; }
+        pending += std::ceil(double(candidate.size()) / max_threshold);
+    }
+    return pending;
+}
+
 int64_t size_tiered_compaction_strategy::estimated_pending_compactions(column_family& cf) const {
    int min_threshold = cf.min_compaction_threshold();
    int max_threshold = cf.schema()->max_compaction_threshold();
    std::vector<sstables::shared_sstable> sstables;
-    int64_t n = 0;

    sstables.reserve(cf.sstables_count());
    for (auto& entry : *cf.get_sstables()) {
        sstables.push_back(entry);
    }

-    for (auto& bucket : get_buckets(sstables)) {
-        if (bucket.size() >= size_t(min_threshold)) {
-            n += std::ceil(double(bucket.size()) / max_threshold);
-        }
-    }
-    return n;
+    return estimated_pending_compactions(sstables, min_threshold, max_threshold, _options);
 }

 std::vector<sstables::shared_sstable>
--- a/sstables/size_tiered_compaction_strategy.hh
+++ b/sstables/size_tiered_compaction_strategy.hh
@@ -116,9 +116,11 @@ class size_tiered_compaction_strategy : public compaction_strategy_impl {
    compaction_backlog_tracker _backlog_tracker;

    // Return a list of pair of shared_sstable and its respective size.
-    std::vector<std::pair<sstables::shared_sstable, uint64_t>> create_sstable_and_length_pairs(const std::vector<sstables::shared_sstable>& sstables) const;
+    static std::vector<std::pair<sstables::shared_sstable, uint64_t>> create_sstable_and_length_pairs(const std::vector<sstables::shared_sstable>& sstables);

    // Group files of similar size into buckets.
+    static std::vector<std::vector<sstables::shared_sstable>> get_buckets(const std::vector<sstables::shared_sstable>& sstables, size_tiered_compaction_strategy_options options);
+
    std::vector<std::vector<sstables::shared_sstable>> get_buckets(const std::vector<sstables::shared_sstable>& sstables) const;

    // Maybe return a bucket of sstables to compact
@@ -154,6 +156,8 @@ public:

    virtual compaction_descriptor get_sstables_for_compaction(column_family& cfs, std::vector<sstables::shared_sstable> candidates) override;

+    static int64_t estimated_pending_compactions(const std::vector<sstables::shared_sstable>& sstables,
+        int min_threshold, int max_threshold, size_tiered_compaction_strategy_options options);
    virtual int64_t estimated_pending_compactions(column_family& cf) const override;

    virtual compaction_strategy_type type() const {
--- a/sstables/time_window_compaction_strategy.cc
+++ b/sstables/time_window_compaction_strategy.cc
@@ -0,0 +1,273 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "sstables/time_window_compaction_strategy.hh"
+#include "mutation_writer/timestamp_based_splitting_writer.hh"
+#include "mutation_source_metadata.hh"
+
+namespace sstables {
+
+// Assigns a timestamp to a time window, handing out at most
+// time_window_compaction_strategy::max_data_segregation_window_count distinct windows.
+class classify_by_timestamp {
+    time_window_compaction_strategy_options _options;
+    // Windows handed out so far; a window that is requested again is swapped to the front.
+    std::vector<int64_t> _known_windows;
+
+public:
+    explicit classify_by_timestamp(time_window_compaction_strategy_options options) : _options(std::move(options)) { }
+    // Returns the window of `ts` when it is already known or can still be registered,
+    // otherwise the known window closest to it.
+    int64_t operator()(api::timestamp_type ts) {
+        const auto window = time_window_compaction_strategy::get_window_for(_options, ts);
+        if (const auto it = boost::find(_known_windows, window); it != _known_windows.end()) {
+            std::swap(*it, _known_windows.front());
+            return window;
+        }
+        if (_known_windows.size() < time_window_compaction_strategy::max_data_segregation_window_count) {
+            _known_windows.push_back(window);
+            return window;
+        }
+        // Window limit reached (so _known_windows is non-empty): reuse the closest known window, first one wins ties.
+        return *boost::min_element(_known_windows, [window] (int64_t lhs, int64_t rhs) {
+            return std::abs(lhs - window) < std::abs(rhs - window);
+        });
+    }
+};
+
+// Divides a partition estimate by the number of time windows the source's timestamps are estimated to span.
+uint64_t time_window_compaction_strategy::adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) {
+    const bool has_timestamp_range = ms_meta.min_timestamp && ms_meta.max_timestamp;
+    if (!has_timestamp_range) {
+        // Without a timestamp range the maximum window count is assumed.
+        return partition_estimate / max_data_segregation_window_count;
+    }
+    const auto window_size = get_window_size(_options);
+    const auto first_window = get_window_for(_options, *ms_meta.min_timestamp);
+    const auto last_window = get_window_for(_options, *ms_meta.max_timestamp);
+    const auto window_span = (last_window + (window_size - 1) - first_window) / window_size;
+    return partition_estimate / std::max(1UL, uint64_t(window_span));
+}
+
+// Wraps end_consumer in a timestamp-segregating writer unless min and max timestamp share one window.
+reader_consumer time_window_compaction_strategy::make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer end_consumer) {
+    const bool single_window = ms_meta.min_timestamp && ms_meta.max_timestamp
+            && get_window_for(_options, *ms_meta.min_timestamp) == get_window_for(_options, *ms_meta.max_timestamp);
+    if (single_window) {
+        return end_consumer;
+    }
+    return [options = _options, end_consumer = std::move(end_consumer)] (flat_mutation_reader rd) mutable -> future<> {
+        return mutation_writer::segregate_by_timestamp(std::move(rd), classify_by_timestamp(std::move(options)),
+                std::move(end_consumer));
+    };
+}
+
+// Selects the sstables for the next compaction: the pick made among the non-expired candidates
+// plus every fully expired sstable found by the periodic expiry check.
+compaction_descriptor
+time_window_compaction_strategy::get_sstables_for_compaction(column_family& cf, std::vector<shared_sstable> candidates) {
+    auto gc_before = gc_clock::now() - cf.schema()->gc_grace_seconds();
+    if (candidates.empty()) {
+        return compaction_descriptor();
+    }
+    // Fully expired sstables are always included; they are only searched for when more than
+    // expired_sstable_check_frequency has passed since the previous search.
+    std::unordered_set<shared_sstable> expired;
+    const bool check_due = db_clock::now() - _last_expired_check > _options.expired_sstable_check_frequency;
+    if (!check_due) {
+        clogger.debug("TWCS skipping check for fully expired SSTables");
+    } else {
+        clogger.debug("TWCS expired check sufficiently far in the past, checking for fully expired SSTables");
+        expired = get_fully_expired_sstables(cf, candidates, gc_before);
+        _last_expired_check = db_clock::now();
+    }
+    const bool any_expired = !expired.empty();
+    if (any_expired) {
+        const auto first_expired = boost::remove_if(candidates, [&expired] (const shared_sstable& sst) { return expired.find(sst) != expired.end(); });
+        candidates.erase(first_expired, candidates.end());
+    }
+    auto selected = get_next_non_expired_sstables(cf, std::move(candidates), gc_before);
+    if (any_expired) {
+        selected.insert(selected.end(), expired.begin(), expired.end());
+    }
+    return compaction_descriptor(std::move(selected));
+}
+
+// Decides how a window bucket is compacted: a past window still tracked in _recent_active_windows
+// with at least two sstables gets a major compaction; any other bucket with at least min_threshold
+// sstables gets size-tiered compaction (this covers older windows too, which limits write and space
+// amplification when e.g. read repair adds small updates to them); everything else is left alone.
+time_window_compaction_strategy::bucket_compaction_mode
+time_window_compaction_strategy::compaction_mode(const bucket_t& bucket, timestamp_type bucket_key,
+        timestamp_type now, size_t min_threshold) const {
+    const auto sstable_count = bucket.size();
+    const bool needs_major = sstable_count >= 2 && !is_last_active_bucket(bucket_key, now) && _recent_active_windows.count(bucket_key);
+    if (needs_major) {
+        return bucket_compaction_mode::major;
+    }
+    return sstable_count >= size_t(min_threshold) ? bucket_compaction_mode::size_tiered : bucket_compaction_mode::none;
+}
+
+// Returns the regular compaction candidates when there are any; otherwise falls back to the single
+// sstable with the lowest min_timestamp among those worth compacting for their droppable tombstones.
+std::vector<shared_sstable>
+time_window_compaction_strategy::get_next_non_expired_sstables(column_family& cf,
+        std::vector<shared_sstable> non_expiring_sstables, gc_clock::time_point gc_before) {
+    auto regular_candidates = get_compaction_candidates(cf, non_expiring_sstables);
+    if (!regular_candidates.empty()) {
+        return regular_candidates;
+    }
+
+    // Nothing to compact the standard way: keep only sstables whose tombstones are worth dropping.
+    const auto keep_tombstones = [this, &gc_before] (const shared_sstable& sst) -> bool {
+        return !worth_dropping_tombstones(sst, gc_before);
+    };
+    non_expiring_sstables.erase(boost::range::remove_if(non_expiring_sstables, keep_tombstones), non_expiring_sstables.end());
+    if (non_expiring_sstables.empty()) {
+        return {};
+    }
+    const auto has_older_data = [] (auto& lhs, auto& rhs) {
+        return lhs->get_stats_metadata().min_timestamp < rhs->get_stats_metadata().min_timestamp;
+    };
+    return { *boost::min_element(non_expiring_sstables, has_older_data) };
+}
+
+// Buckets the candidates by time window, refreshes _highest_window_seen and the pending-task
+// estimate, then returns the sstables picked by newest_bucket().
+std::vector<shared_sstable>
+time_window_compaction_strategy::get_compaction_candidates(column_family& cf, std::vector<shared_sstable> candidate_sstables) {
+    const auto min_threshold = cf.min_compaction_threshold();
+    const auto max_threshold = cf.schema()->max_compaction_threshold();
+    auto [buckets, newest_window] = get_buckets(std::move(candidate_sstables), _options);
+    _highest_window_seen = std::max(_highest_window_seen, newest_window);
+    update_estimated_compaction_by_tasks(buckets, min_threshold, max_threshold);
+    return newest_bucket(std::move(buckets), min_threshold, max_threshold, _options.sstable_window_size, _highest_window_seen, _stcs_options);
+}
+
+// Maps a microsecond timestamp to the start of its window: the timestamp is truncated to whole
+// seconds and the remainder modulo the window size is removed. The result is in microseconds.
+timestamp_type
+time_window_compaction_strategy::get_window_lower_bound(std::chrono::seconds sstable_window_size, timestamp_type timestamp) {
+    namespace chr = std::chrono;
+    const auto whole_seconds = chr::duration_cast<chr::seconds>(chr::microseconds(timestamp)).count();
+    const chr::seconds window_start(whole_seconds - (whole_seconds % sstable_window_size.count()));
+    const auto window_start_us = chr::duration_cast<chr::microseconds>(window_start).count();
+    return timestamp_type(window_start_us);
+}
+
+// Groups sstables by the lower bound of the window that contains their max timestamp.
+// Returns the buckets together with the highest window lower bound seen, starting from 0.
+std::pair<std::map<timestamp_type, std::vector<shared_sstable>>, timestamp_type>
+time_window_compaction_strategy::get_buckets(std::vector<shared_sstable> files, time_window_compaction_strategy_options& options) {
+    std::map<timestamp_type, std::vector<shared_sstable>> by_window;
+    timestamp_type newest_window = 0;
+
+    for (auto&& sst : files) {
+        // Convert the sstable's max timestamp using the configured resolution, then locate its window.
+        const timestamp_type max_ts = to_timestamp_type(options.timestamp_resolution, sst->get_stats_metadata().max_timestamp);
+        const timestamp_type window_start = get_window_lower_bound(options.sstable_window_size, max_ts);
+        newest_window = std::max(newest_window, window_start);
+        by_window[window_start].push_back(std::move(sst));
+    }
+
+    return std::make_pair(std::move(by_window), newest_window);
+}
+
+// Debug printer: lists every bucket's key and sstable count, highest key first.
+static std::ostream& operator<<(std::ostream& os, const std::map<timestamp_type, std::vector<shared_sstable>>& buckets) {
+    os << "  buckets = {\n";
+    for (auto it = buckets.rbegin(); it != buckets.rend(); ++it) {
+        os << format("    key={}, size={}\n", it->first, it->second.size());
+    }
+    return os << "  }\n";
+}
+
+// Walks the window buckets from newest to oldest and returns the sstables of the first bucket that
+// has something to compact, or an empty vector when no bucket does.
+std::vector<shared_sstable>
+time_window_compaction_strategy::newest_bucket(std::map<timestamp_type, std::vector<shared_sstable>> buckets,
+        int min_threshold, int max_threshold, std::chrono::seconds sstable_window_size, timestamp_type now,
+        size_tiered_compaction_strategy_options& stcs_options) {
+    clogger.debug("time_window_compaction_strategy::newest_bucket:\n  now {}\n{}", now, buckets);
+
+    for (auto it = buckets.rbegin(); it != buckets.rend(); ++it) {
+        const auto window = it->first;
+        auto& window_sstables = it->second;
+
+        // Remember active windows so that they get a major compaction once they are no longer active.
+        if (is_last_active_bucket(window, now)) {
+            _recent_active_windows.insert(window);
+        }
+        const auto mode = compaction_mode(window_sstables, window, now, min_threshold);
+        if (mode == bucket_compaction_mode::major) {
+            _recent_active_windows.erase(window);
+            clogger.debug("bucket size {} >= 2 and not in current bucket, key {}, compacting what's here", window_sstables.size(), window);
+            return trim_to_threshold(std::move(window_sstables), max_threshold);
+        }
+        if (mode != bucket_compaction_mode::size_tiered) {
+            clogger.debug("No compaction necessary for bucket size {} , key {}, now {}", window_sstables.size(), window, now);
+            continue;
+        }
+
+        // Let STCS pick within this window; when it finds nothing eligible, move on to older windows.
+        auto stcs_pick = size_tiered_compaction_strategy::most_interesting_bucket(window_sstables, min_threshold, max_threshold, stcs_options);
+        if (!stcs_pick.empty()) {
+            clogger.debug("bucket size {} >= 2, key {}, performing STCS on what's here", window_sstables.size(), window);
+            return stcs_pick;
+        }
+    }
+    return {};
+}
+
+// Keeps at most max_threshold sstables from `bucket`, preferring those with the smallest on-disk data size.
+std::vector<shared_sstable>
+time_window_compaction_strategy::trim_to_threshold(std::vector<shared_sstable> bucket, int max_threshold) {
+    const auto keep = std::min(bucket.size(), size_t(max_threshold));
+    const auto smaller_on_disk = [] (auto& lhs, auto& rhs) { return lhs->ondisk_data_size() < rhs->ondisk_data_size(); };
+    // After the partial sort the first `keep` entries are the smallest ones; the rest is cut off.
+    boost::partial_sort(bucket, bucket.begin() + keep, smaller_on_disk);
+    bucket.resize(keep);
+    return bucket;
+}
+
+// Recomputes _estimated_remaining_tasks from the window buckets: a bucket due for size-tiered
+// compaction contributes its STCS estimate, a bucket due for major compaction contributes one task.
+void time_window_compaction_strategy::update_estimated_compaction_by_tasks(std::map<timestamp_type, std::vector<shared_sstable>>& tasks,
+                                                                           int min_threshold, int max_threshold) {
+    int64_t n = 0;
+    // The highest window seen so far is what compaction_mode() receives as "now".
+    const timestamp_type now = _highest_window_seen;
+    for (const auto& [bucket_key, bucket] : tasks) {
+        switch (compaction_mode(bucket, bucket_key, now, min_threshold)) {
+        case bucket_compaction_mode::size_tiered:
+            n += size_tiered_compaction_strategy::estimated_pending_compactions(bucket, min_threshold, max_threshold, _stcs_options);
+            break;
+        case bucket_compaction_mode::major:
+            n++;
+            break;
+        default:
+            break;
+        }
+    }
+    _estimated_remaining_tasks = n;
+}
+
+}
--- a/sstables/time_window_compaction_strategy.hh
+++ b/sstables/time_window_compaction_strategy.hh
@@ -140,6 +140,8 @@ class time_window_compaction_strategy : public compaction_strategy_impl {
    int64_t _estimated_remaining_tasks = 0;
    db_clock::time_point _last_expired_check;
    timestamp_type _highest_window_seen;
+    // Keep track of all recent active windows that still need to be compacted into a single SSTable
+    std::unordered_set<timestamp_type> _recent_active_windows;
    size_tiered_compaction_strategy_options _stcs_options;
    compaction_backlog_tracker _backlog_tracker;
 public:
@@ -148,37 +150,11 @@ public:
    // Better co-locate some windows into the same sstables than OOM.
    static constexpr uint64_t max_data_segregation_window_count = 100;

+    using bucket_t = std::vector<shared_sstable>;
+    enum class bucket_compaction_mode { none, size_tiered, major };
 public:
    time_window_compaction_strategy(const std::map<sstring, sstring>& options);
-    virtual compaction_descriptor get_sstables_for_compaction(column_family& cf, std::vector<shared_sstable> candidates) override {
-        auto gc_before = gc_clock::now() - cf.schema()->gc_grace_seconds();
-
-        if (candidates.empty()) {
-            return compaction_descriptor();
-        }
-
-        // Find fully expired SSTables. Those will be included no matter what.
-        std::unordered_set<shared_sstable> expired;
-
-        if (db_clock::now() - _last_expired_check > _options.expired_sstable_check_frequency) {
-            clogger.debug("TWCS expired check sufficiently far in the past, checking for fully expired SSTables");
-            expired = get_fully_expired_sstables(cf, candidates, gc_before);
-            _last_expired_check = db_clock::now();
-        } else {
-            clogger.debug("TWCS skipping check for fully expired SSTables");
-        }
-
-        if (!expired.empty()) {
-            auto is_expired = [&] (const shared_sstable& s) { return expired.find(s) != expired.end(); };
-            candidates.erase(boost::remove_if(candidates, is_expired), candidates.end());
-        }
-
-        auto compaction_candidates = get_next_non_expired_sstables(cf, std::move(candidates), gc_before);
-        if (!expired.empty()) {
-            compaction_candidates.insert(compaction_candidates.end(), expired.begin(), expired.end());
-        }
-        return compaction_descriptor(std::move(compaction_candidates));
-    }
+    virtual compaction_descriptor get_sstables_for_compaction(column_family& cf, std::vector<shared_sstable> candidates) override;
 private:
    static timestamp_type
    to_timestamp_type(time_window_compaction_strategy_options::timestamp_resolutions resolution, int64_t timestamp_from_sstable) {
@@ -192,114 +168,36 @@ private:
        };
    }

+    // Returns true if bucket is the last, most active one.
+    bool is_last_active_bucket(timestamp_type bucket_key, timestamp_type now) const {
+        return bucket_key >= now;
+    }
+
+    // Returns which compaction type should be performed on a given window bucket.
+    bucket_compaction_mode
+    compaction_mode(const bucket_t& bucket, timestamp_type bucket_key, timestamp_type now, size_t min_threshold) const;
+
    std::vector<shared_sstable>
-    get_next_non_expired_sstables(column_family& cf, std::vector<shared_sstable> non_expiring_sstables, gc_clock::time_point gc_before) {
-        auto most_interesting = get_compaction_candidates(cf, non_expiring_sstables);
+    get_next_non_expired_sstables(column_family& cf, std::vector<shared_sstable> non_expiring_sstables, gc_clock::time_point gc_before);

-        if (!most_interesting.empty()) {
-            return most_interesting;
-        }
-
-        // if there is no sstable to compact in standard way, try compacting single sstable whose droppable tombstone
-        // ratio is greater than threshold.
-        auto e = boost::range::remove_if(non_expiring_sstables, [this, &gc_before] (const shared_sstable& sst) -> bool {
-            return !worth_dropping_tombstones(sst, gc_before);
-        });
-        non_expiring_sstables.erase(e, non_expiring_sstables.end());
-        if (non_expiring_sstables.empty()) {
-            return {};
-        }
-        auto it = boost::min_element(non_expiring_sstables, [] (auto& i, auto& j) {
-            return i->get_stats_metadata().min_timestamp < j->get_stats_metadata().min_timestamp;
-        });
-        return { *it };
-    }
-
-    std::vector<shared_sstable> get_compaction_candidates(column_family& cf, std::vector<shared_sstable> candidate_sstables) {
-        auto p = get_buckets(std::move(candidate_sstables), _options);
-        // Update the highest window seen, if necessary
-        _highest_window_seen = std::max(_highest_window_seen, p.second);
-
-        update_estimated_compaction_by_tasks(p.first, cf.min_compaction_threshold());
-
-        return newest_bucket(std::move(p.first), cf.min_compaction_threshold(), cf.schema()->max_compaction_threshold(),
-            _options.sstable_window_size, _highest_window_seen, _stcs_options);
-    }
+    std::vector<shared_sstable> get_compaction_candidates(column_family& cf, std::vector<shared_sstable> candidate_sstables);
 public:
    // Find the lowest timestamp for window of given size
    static timestamp_type
-    get_window_lower_bound(std::chrono::seconds sstable_window_size, timestamp_type timestamp) {
-        using namespace std::chrono;
-        auto timestamp_in_sec = duration_cast<seconds>(microseconds(timestamp)).count();
-
-        // mask out window size from timestamp to get lower bound of its window
-        auto window_lower_bound_in_sec = seconds(timestamp_in_sec - (timestamp_in_sec % sstable_window_size.count()));
-
-        return timestamp_type(duration_cast<microseconds>(window_lower_bound_in_sec).count());
-    }
+    get_window_lower_bound(std::chrono::seconds sstable_window_size, timestamp_type timestamp);

    // Group files with similar max timestamp into buckets.
    // @return A pair, where the left element is the bucket representation (map of timestamp to sstablereader),
    // and the right is the highest timestamp seen
    static std::pair<std::map<timestamp_type, std::vector<shared_sstable>>, timestamp_type>
-    get_buckets(std::vector<shared_sstable> files, time_window_compaction_strategy_options& options) {
-        std::map<timestamp_type, std::vector<shared_sstable>> buckets;
+    get_buckets(std::vector<shared_sstable> files, time_window_compaction_strategy_options& options);

-        timestamp_type max_timestamp = 0;
-        // Create map to represent buckets
-        // For each sstable, add sstable to the time bucket
-        // Where the bucket is the file's max timestamp rounded to the nearest window bucket
-        for (auto&& f : files) {
-            timestamp_type ts = to_timestamp_type(options.timestamp_resolution, f->get_stats_metadata().max_timestamp);
-            timestamp_type lower_bound = get_window_lower_bound(options.sstable_window_size, ts);
-            buckets[lower_bound].push_back(std::move(f));
-            max_timestamp = std::max(max_timestamp, lower_bound);
-        }
-
-        return std::make_pair(std::move(buckets), max_timestamp);
-    }
-
-    static std::vector<shared_sstable>
+    std::vector<shared_sstable>
    newest_bucket(std::map<timestamp_type, std::vector<shared_sstable>> buckets, int min_threshold, int max_threshold,
-            std::chrono::seconds sstable_window_size, timestamp_type now, size_tiered_compaction_strategy_options& stcs_options) {
-        // If the current bucket has at least minThreshold SSTables, choose that one.
-        // For any other bucket, at least 2 SSTables is enough.
-        // In any case, limit to maxThreshold SSTables.
-
-        for (auto&& key_bucket : buckets | boost::adaptors::reversed) {
-            auto key = key_bucket.first;
-            auto& bucket = key_bucket.second;
-
-            clogger.trace("Key {}, now {}", key, now);
-
-            if (bucket.size() >= size_t(min_threshold) && key >= now) {
-                // If we're in the newest bucket, we'll use STCS to prioritize sstables
-                auto stcs_interesting_bucket = size_tiered_compaction_strategy::most_interesting_bucket(bucket, min_threshold, max_threshold, stcs_options);
-
-                // If the tables in the current bucket aren't eligible in the STCS strategy, we'll skip it and look for other buckets
-                if (!stcs_interesting_bucket.empty()) {
-                    return stcs_interesting_bucket;
-                }
-            } else if (bucket.size() >= 2 && key < now) {
-                clogger.debug("bucket size {} >= 2 and not in current bucket, compacting what's here", bucket.size());
-                return trim_to_threshold(std::move(bucket), max_threshold);
-            } else {
-                clogger.debug("No compaction necessary for bucket size {} , key {}, now {}", bucket.size(), key, now);
-            }
-        }
-        return {};
-    }
+            std::chrono::seconds sstable_window_size, timestamp_type now, size_tiered_compaction_strategy_options& stcs_options);

    static std::vector<shared_sstable>
-    trim_to_threshold(std::vector<shared_sstable> bucket, int max_threshold) {
-        auto n = std::min(bucket.size(), size_t(max_threshold));
-        // Trim the largest sstables off the end to meet the maxThreshold
-        boost::partial_sort(bucket, bucket.begin() + n, [] (auto& i, auto& j) {
-            return i->ondisk_data_size() < j->ondisk_data_size();
-        });
-        bucket.resize(n);
-        return bucket;
-    }
+    trim_to_threshold(std::vector<shared_sstable> bucket, int max_threshold);

    static int64_t
    get_window_for(const time_window_compaction_strategy_options& options, api::timestamp_type ts) {
@@ -311,23 +209,8 @@ public:
        return timestamp_type(std::chrono::duration_cast<std::chrono::microseconds>(options.get_sstable_window_size()).count());
    }
 private:
-    void update_estimated_compaction_by_tasks(std::map<timestamp_type, std::vector<shared_sstable>>& tasks, int min_threshold) {
-        int64_t n = 0;
-        timestamp_type now = _highest_window_seen;
-
-        for (auto task : tasks) {
-            auto key = task.first;
-
-            // For current window, make sure it's compactable
-            auto count = task.second.size();
-            if (key >= now && count >= size_t(min_threshold)) {
-                n++;
-            } else if (key < now && count >= 2) {
-                n++;
-            }
-        }
-        _estimated_remaining_tasks = n;
-    }
+    void update_estimated_compaction_by_tasks(std::map<timestamp_type, std::vector<shared_sstable>>& tasks,
+        int min_threshold, int max_threshold);

    friend class time_window_backlog_tracker;
 public:
--- a/streaming/stream_session.cc
+++ b/streaming/stream_session.cc
@@ -319,6 +319,15 @@ void stream_session::init_messaging_service_handler() {
    });
 }

+// Unregisters the five messaging-service handlers listed below; the returned future is the
+// combined result of all unregistrations with the values discarded.
+future<> stream_session::uninit_messaging_service_handler() {
+    return when_all_succeed(ms().unregister_prepare_message(), ms().unregister_prepare_done_message(),
+        ms().unregister_stream_mutation_fragments(),
+        ms().unregister_stream_mutation_done(),
+        ms().unregister_complete_message()).discard_result();
+}
+
 distributed<database>* stream_session::_db;
 distributed<db::system_distributed_keyspace>* stream_session::_sys_dist_ks;
 distributed<db::view::view_update_generator>* stream_session::_view_update_generator;
@@ -342,9 +351,13 @@ future<> stream_session::init_streaming_service(distributed<database>& db, distr
    // });
    return get_stream_manager().start().then([] {
        gms::get_local_gossiper().register_(get_local_stream_manager().shared_from_this());
-        return _db->invoke_on_all([] (auto& db) {
-            init_messaging_service_handler();
-        });
+        return smp::invoke_on_all([] { init_messaging_service_handler(); });
+    });
+}
+
+future<> stream_session::uninit_streaming_service() {
+    return smp::invoke_on_all([] {
+        return uninit_messaging_service_handler();
    });
 }

--- a/streaming/stream_session.hh
+++ b/streaming/stream_session.hh
@@ -142,6 +142,7 @@ private:
    using token = dht::token;
    using ring_position = dht::ring_position;
    static void init_messaging_service_handler();
+    static future<> uninit_messaging_service_handler();
    static distributed<database>* _db;
    static distributed<db::system_distributed_keyspace>* _sys_dist_ks;
    static distributed<db::view::view_update_generator>* _view_update_generator;
@@ -152,6 +153,7 @@ public:
    static database& get_local_db() { return _db->local(); }
    static distributed<database>& get_db() { return *_db; };
    static future<> init_streaming_service(distributed<database>& db, distributed<db::system_distributed_keyspace>& sys_dist_ks, distributed<db::view::view_update_generator>& view_update_generator);
+    static future<> uninit_streaming_service();
 public:
    /**
     * Streaming endpoint.
--- a/table.cc
+++ b/table.cc
@@ -1393,7 +1393,7 @@ future<std::unordered_set<sstring>> table::get_sstables_by_partition_key(const s
            [this] (std::unordered_set<sstring>& filenames, lw_shared_ptr<sstables::sstable_set::incremental_selector>& sel, partition_key& pk) {
        return do_with(dht::decorated_key(dht::decorate_key(*_schema, pk)),
                [this, &filenames, &sel, &pk](dht::decorated_key& dk) mutable {
-            auto sst = sel->select(dk).sstables;
+            const auto& sst = sel->select(dk).sstables;
            auto hk = sstables::sstable::make_hashed_key(*_schema, dk.key());

            return do_for_each(sst, [this, &filenames, &dk, hk = std::move(hk)] (std::vector<sstables::shared_sstable>::const_iterator::reference s) mutable {
@@ -1422,7 +1422,7 @@ std::vector<sstables::shared_sstable> table::select_sstables(const dht::partitio
    return _sstables->select(range);
 }

-std::vector<sstables::shared_sstable> table::candidates_for_compaction() const {
+std::vector<sstables::shared_sstable> table::non_staging_sstables() const {
    return boost::copy_range<std::vector<sstables::shared_sstable>>(*get_sstables()
            | boost::adaptors::filtered([this] (auto& sst) {
        return !_sstables_need_rewrite.count(sst->generation()) && !_sstables_staging.count(sst->generation());
@@ -1989,6 +1989,11 @@ void table::set_schema(schema_ptr s) {
    }
    _schema = std::move(s);

+    for (auto&& v : _views) {
+        v->view_info()->set_base_info(
+            v->view_info()->make_base_dependent_view_info(*_schema));
+    }
+
    set_compaction_strategy(_schema->compaction_strategy());
    trigger_compaction();
 }
@@ -2000,7 +2005,8 @@ static std::vector<view_ptr>::iterator find_view(std::vector<view_ptr>& views, c
 }

 void table::add_or_update_view(view_ptr v) {
-    v->view_info()->initialize_base_dependent_fields(*schema());
+    v->view_info()->set_base_info(
+        v->view_info()->make_base_dependent_view_info(*_schema));
    auto existing = find_view(_views, v);
    if (existing != _views.end()) {
        *existing = std::move(v);
@@ -2053,7 +2059,7 @@ static size_t memory_usage_of(const std::vector<frozen_mutation_and_schema>& ms)
 * @return a future resolving to the mutations to apply to the views, which can be empty.
 */
 future<> table::generate_and_propagate_view_updates(const schema_ptr& base,
-        std::vector<view_ptr>&& views,
+        std::vector<db::view::view_and_base>&& views,
        mutation&& m,
        flat_mutation_reader_opt existings) const {
    auto base_token = m.token();
@@ -2161,7 +2167,7 @@ table::local_base_lock(
 * @return a future that resolves when the updates have been acknowledged by the view replicas
 */
 future<> table::populate_views(
-        std::vector<view_ptr> views,
+        std::vector<db::view::view_and_base> views,
        dht::token base_token,
        flat_mutation_reader&& reader) {
    auto& schema = reader.schema();
@@ -2523,7 +2529,7 @@ future<row_locker::lock_holder> table::do_push_view_replica_updates(const schema
    }
    auto& base = schema();
    m.upgrade(base);
-    auto views = affected_views(base, m);
+    auto views = db::view::with_base_info_snapshot(affected_views(base, m));
    if (views.empty()) {
        return make_ready_future<row_locker::lock_holder>();
    }
--- a/test/alternator/run
+++ b/test/alternator/run
@@ -28,8 +28,8 @@ fi
 SCYLLA_IP=127.1.$(($$ >> 8 & 255)).$(($$ & 255))
 echo "Running Scylla on $SCYLLA_IP"

-tmp_dir=/tmp/alternator-test-$$
-mkdir $tmp_dir
+tmp_dir="$(readlink -e "${TMPDIR-/tmp}")"/alternator-test-$$  # inner quotes keep a TMPDIR containing spaces/globs as a single readlink argument
+mkdir "$tmp_dir"

 # We run the cleanup() function on exit for any reason - successful finish
 # of the script, an error (since we have "set -e"), or a signal.
--- a/test/alternator/test_condition_expression.py
+++ b/test/alternator/test_condition_expression.py
@@ -1351,3 +1351,37 @@ def test_condition_expression_with_forbidden_rmw(scylla_only, dynamodb, test_tab
    assert test_table_s.get_item(Key={'p': s}, ConsistentRead=True)['Item'] == {'p': s, 'regular': 'write'}
    test_table_s.update_item(Key={'p': s}, AttributeUpdates={'write': {'Value': 'regular', 'Action': 'PUT'}})
    assert test_table_s.get_item(Key={'p': s}, ConsistentRead=True)['Item'] == {'p': s, 'regular': 'write', 'write': 'regular'}
+
+# Reproducer for issue #6573: binary strings should be ordered as unsigned
+# bytes, i.e., byte 128 comes after 127, not before as with signed bytes.
+# Test the five ordering operators: <, <=, >, >=, between
+def test_condition_expression_unsigned_bytes(test_table_s):
+    p = random_string()
+    test_table_s.put_item(Item={'p': p, 'b': bytearray([127])})  # first half: stored value is the single byte 127
+    test_table_s.update_item(Key={'p': p},
+        UpdateExpression='SET z = :newval',
+        ConditionExpression='b < :oldval',  # 127 < 128 only holds when bytes compare as unsigned
+        ExpressionAttributeValues={':newval': 1, ':oldval': bytearray([128])})
+    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 1  # z changes only if the condition passed
+    test_table_s.update_item(Key={'p': p},
+        UpdateExpression='SET z = :newval',
+        ConditionExpression='b <= :oldval',
+        ExpressionAttributeValues={':newval': 2, ':oldval': bytearray([128])})
+    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 2
+    test_table_s.update_item(Key={'p': p},
+        UpdateExpression='SET z = :newval',
+        ConditionExpression='b between :oldval1 and :oldval2',  # 126 <= 127 <= 128
+        ExpressionAttributeValues={':newval': 3, ':oldval1': bytearray([126]), ':oldval2': bytearray([128])})
+    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 3
+
+    test_table_s.put_item(Item={'p': p, 'b': bytearray([128])})  # second half: stored value is now byte 128, compared against 127
+    test_table_s.update_item(Key={'p': p},
+        UpdateExpression='SET z = :newval',
+        ConditionExpression='b > :oldval',
+        ExpressionAttributeValues={':newval': 4, ':oldval': bytearray([127])})
+    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 4
+    test_table_s.update_item(Key={'p': p},
+        UpdateExpression='SET z = :newval',
+        ConditionExpression='b >= :oldval',
+        ExpressionAttributeValues={':newval': 5, ':oldval': bytearray([127])})
+    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 5
--- a/test/alternator/test_expected.py
+++ b/test/alternator/test_expected.py
@@ -522,6 +522,15 @@ def test_update_expected_1_null(test_table_s):
            Expected={'a': {'ComparisonOperator': 'NULL', 'AttributeValueList': [2]}}
        )

+# When ComparisonOperator = "NULL", AttributeValueList should be empty if it
+# exists, but as this test verifies, it may also be missing completely.
+def test_update_expected_1_null_missing_list(test_table_s):
+    p = random_string()  # fresh random key; presumably no item (and so no attribute 'a') exists for it yet
+    test_table_s.update_item(Key={'p': p},
+        AttributeUpdates={'a': {'Value': 2, 'Action': 'PUT'}},
+        Expected={'a': {'ComparisonOperator': 'NULL'}})  # AttributeValueList is deliberately omitted altogether
+    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['a'] == 2  # the write went through
+
 # Tests for Expected with ComparisonOperator = "CONTAINS":
 def test_update_expected_1_contains(test_table_s):
    # true cases. CONTAINS can be used for two unrelated things: check substrings
@@ -1077,3 +1086,42 @@ def test_put_item_expected(test_table_s):
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p, 'a': 2}
    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
        test_table_s.put_item(Item={'p': p, 'a': 3}, Expected={'a': {'Value': 1}})
+
+# Reproducer for issue #6573: binary strings should be ordered as unsigned
+# bytes, i.e., byte 128 comes after 127, not before as with signed bytes.
+# Test the five ordering operators: LT, LE, GT, GE, BETWEEN
+def test_update_expected_unsigned_bytes(test_table_s):
+    p = random_string()
+    test_table_s.put_item(Item={'p': p, 'b': bytearray([127])})  # first half: stored value is the single byte 127
+    test_table_s.update_item(Key={'p': p},
+        AttributeUpdates={'z': {'Value': 1, 'Action': 'PUT'}},
+        Expected={'b': {'ComparisonOperator': 'LT',  # 127 LT 128 only holds when bytes compare as unsigned
+                        'AttributeValueList': [bytearray([128])]}}
+    )
+    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 1  # z changes only if the condition passed
+    test_table_s.update_item(Key={'p': p},
+        AttributeUpdates={'z': {'Value': 2, 'Action': 'PUT'}},
+        Expected={'b': {'ComparisonOperator': 'LE',
+                        'AttributeValueList': [bytearray([128])]}}
+    )
+    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 2
+    test_table_s.update_item(Key={'p': p},
+        AttributeUpdates={'z': {'Value': 3, 'Action': 'PUT'}},
+        Expected={'b': {'ComparisonOperator': 'BETWEEN',  # 126 <= 127 <= 128
+                        'AttributeValueList': [bytearray([126]), bytearray([128])]}}
+    )
+    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 3
+
+    test_table_s.put_item(Item={'p': p, 'b': bytearray([128])})  # second half: stored value is now byte 128, compared against 127
+    test_table_s.update_item(Key={'p': p},
+        AttributeUpdates={'z': {'Value': 4, 'Action': 'PUT'}},
+        Expected={'b': {'ComparisonOperator': 'GT',
+                        'AttributeValueList': [bytearray([127])]}}
+    )
+    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 4
+    test_table_s.update_item(Key={'p': p},
+        AttributeUpdates={'z': {'Value': 5, 'Action': 'PUT'}},
+        Expected={'b': {'ComparisonOperator': 'GE',
+                        'AttributeValueList': [bytearray([127])]}}
+    )
+    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 5
--- a/test/alternator/test_key_condition_expression.py
+++ b/test/alternator/test_key_condition_expression.py
@@ -520,6 +520,42 @@ def test_key_condition_expression_and_conditions(test_table_sn_with_sorted_parti
                'ComparisonOperator': 'GT'}}
            )

+# Demonstrate that issue #6573 was not a bug for KeyConditionExpression:
+# binary strings are ordered as unsigned bytes, i.e., byte 128 comes after
+# 127, not as signed bytes.
+# Test the five ordering operators: <, <=, >, >=, between
+def test_key_condition_expression_unsigned_bytes(test_table_sb):
+    p = random_string()
+    items = [{'p': p, 'c': bytearray([i])} for i in range(126,129)]  # three items whose sort keys are the single bytes 126, 127, 128
+    with test_table_sb.batch_writer() as batch:
+        for item in items:
+            batch.put_item(item)
+    got_items = full_query(test_table_sb,
+        KeyConditionExpression='p=:p AND c<:c',
+        ExpressionAttributeValues={':p': p, ':c': bytearray([127])})
+    expected_items = [item for item in items if item['c'] < bytearray([127])]  # Python bytearray ordering (unsigned bytes) is the reference
+    assert(got_items == expected_items)
+    got_items = full_query(test_table_sb,
+        KeyConditionExpression='p=:p AND c<=:c',
+        ExpressionAttributeValues={':p': p, ':c': bytearray([127])})
+    expected_items = [item for item in items if item['c'] <= bytearray([127])]
+    assert(got_items == expected_items)
+    got_items = full_query(test_table_sb,
+        KeyConditionExpression='p=:p AND c>:c',
+        ExpressionAttributeValues={':p': p, ':c': bytearray([127])})
+    expected_items = [item for item in items if item['c'] > bytearray([127])]  # must include byte 128
+    assert(got_items == expected_items)
+    got_items = full_query(test_table_sb,
+        KeyConditionExpression='p=:p AND c>=:c',
+        ExpressionAttributeValues={':p': p, ':c': bytearray([127])})
+    expected_items = [item for item in items if item['c'] >= bytearray([127])]
+    assert(got_items == expected_items)
+    got_items = full_query(test_table_sb,
+        KeyConditionExpression='p=:p AND c BETWEEN :c1 AND :c2',
+        ExpressionAttributeValues={':p': p, ':c1': bytearray([127]), ':c2': bytearray([128])})
+    expected_items = [item for item in items if item['c'] >= bytearray([127]) and item['c'] <= bytearray([128])]  # BETWEEN is checked as inclusive on both ends
+    assert(got_items == expected_items)
+
 # The following is an older test we had, which test one arbitrary use case
 # for KeyConditionExpression. It uses filled_test_table (the one we also
 # use in test_scan.py) instead of the fixtures defined in this file.
--- a/test/boost/batchlog_manager_test.cc
+++ b/test/boost/batchlog_manager_test.cc
@@ -64,7 +64,7 @@ SEASTAR_TEST_CASE(test_execute_batch) {
            auto version = netw::messaging_service::current_version;
            auto bm = bp.get_batch_log_mutation_for({ m }, s->id(), version, db_clock::now() - db_clock::duration(3h));

-            return qp.proxy().mutate_locally(bm).then([&bp] () mutable {
+            return qp.proxy().mutate_locally(bm, db::commitlog::force_sync::no).then([&bp] () mutable {
                return bp.count_all_batches().then([](auto n) {
                    BOOST_CHECK_EQUAL(n, 1);
                }).then([&bp] () mutable {
--- a/test/boost/cql_query_test.cc
+++ b/test/boost/cql_query_test.cc
@@ -3451,10 +3451,13 @@ SEASTAR_TEST_CASE(test_select_with_mixed_order_table) {
 }

 uint64_t
-run_and_examine_cache_stat_change(cql_test_env& e, uint64_t cache_tracker::stats::*metric, std::function<void (cql_test_env& e)> func) {
+run_and_examine_cache_read_stats_change(cql_test_env& e, std::string_view cf_name, std::function<void (cql_test_env& e)> func) {
    auto read_stat = [&] {
-        auto local_read_metric = [metric] (database& db) { return db.row_cache_tracker().get_stats().*metric; };
-        return e.db().map_reduce0(local_read_metric, uint64_t(0), std::plus<uint64_t>()).get0();
+        return e.db().map_reduce0([&cf_name] (const database& db) {
+            auto& t = db.find_column_family("ks", sstring(cf_name));
+            auto& stats = t.get_row_cache().stats();
+            return stats.reads_with_misses.count() + stats.reads_with_no_misses.count();
+        }, uint64_t(0), std::plus<uint64_t>()).get0();
    };
    auto before = read_stat();
    func(e);
@@ -3465,11 +3468,11 @@ run_and_examine_cache_stat_change(cql_test_env& e, uint64_t cache_tracker::stats
 SEASTAR_TEST_CASE(test_cache_bypass) {
    return do_with_cql_env_thread([] (cql_test_env& e) {
        e.execute_cql("CREATE TABLE t (k int PRIMARY KEY)").get();
-        auto with_cache = run_and_examine_cache_stat_change(e, &cache_tracker::stats::reads, [] (cql_test_env& e) {
+        auto with_cache = run_and_examine_cache_read_stats_change(e, "t", [] (cql_test_env& e) {
            e.execute_cql("SELECT * FROM t").get();
        });
        BOOST_REQUIRE(with_cache >= smp::count);  // scan may make multiple passes per shard
-        auto without_cache = run_and_examine_cache_stat_change(e, &cache_tracker::stats::reads, [] (cql_test_env& e) {
+        auto without_cache = run_and_examine_cache_read_stats_change(e, "t", [] (cql_test_env& e) {
            e.execute_cql("SELECT * FROM t BYPASS CACHE").get();
        });
        BOOST_REQUIRE_EQUAL(without_cache, 0);
@@ -4563,3 +4566,12 @@ SEASTAR_TEST_CASE(test_impossible_where) {
        require_rows(e, "SELECT * FROM t2 WHERE c>=10 AND c<=0 ALLOW FILTERING", {});
    });
 }
+
+SEASTAR_TEST_CASE(test_counter_column_added_into_non_counter_table) { // ALTER TABLE ... ADD <counter column> on a table created without counters must be rejected.
+    return do_with_cql_env_thread([] (cql_test_env& e) {
+        cquery_nofail(e, "CREATE TABLE t (pk int, ck int, PRIMARY KEY(pk, ck))"); // table with key columns only, no counter columns
+
+        BOOST_REQUIRE_THROW(e.execute_cql("ALTER TABLE t ADD \"c\" counter;").get(), // .get() surfaces the failure as a thrown exception
+                exceptions::configuration_exception);
+    });
+}
--- a/test/boost/filtering_test.cc
+++ b/test/boost/filtering_test.cc
@@ -1134,6 +1134,9 @@ SEASTAR_TEST_CASE(test_filtering) {
                { int32_type->decompose(8), int32_type->decompose(3) },
                { int32_type->decompose(9), int32_type->decompose(3) },
            });
+            require_rows(e, "SELECT k FROM cf WHERE k=12 AND (m,n)>=(4,0) ALLOW FILTERING;", {
+                    { int32_type->decompose(12), int32_type->decompose(4), int32_type->decompose(5)},
+                });
        }

        // test filtering on clustering keys
--- a/test/boost/mutation_reader_test.cc
+++ b/test/boost/mutation_reader_test.cc
@@ -44,6 +44,7 @@
 #include "test/lib/make_random_string.hh"
 #include "test/lib/dummy_sharder.hh"
 #include "test/lib/reader_lifecycle_policy.hh"
+#include "test/lib/random_utils.hh"

 #include "dht/sharder.hh"
 #include "mutation_reader.hh"
@@ -2555,7 +2556,7 @@ SEASTAR_THREAD_TEST_CASE(test_queue_reader) {
        }
    }

-    // abort()
+    // abort() -- check that consumer is aborted
    {
        auto [reader, handle] = make_queue_reader(gen.schema());
        auto fill_buffer_fut = reader.fill_buffer(db::no_timeout);
@@ -2570,6 +2571,28 @@ SEASTAR_THREAD_TEST_CASE(test_queue_reader) {

        BOOST_REQUIRE_THROW(fill_buffer_fut.get(), std::runtime_error);
        BOOST_REQUIRE_THROW(handle.push(partition_end{}).get(), std::runtime_error);
+        BOOST_REQUIRE(!reader.is_end_of_stream());
+    }
+
+    // abort() -- check that producer is aborted
+    {
+        auto [reader, handle] = make_queue_reader(gen.schema());
+        reader.set_max_buffer_size(1);
+
+        auto expected_reader = flat_mutation_reader_from_mutations(expected_muts);
+
+        auto push_fut = make_ready_future<>();
+        while (push_fut.available()) {
+            push_fut = handle.push(std::move(*expected_reader(db::no_timeout).get0()));
+        }
+
+        BOOST_REQUIRE(!push_fut.available());
+
+        handle.abort(std::make_exception_ptr<std::runtime_error>(std::runtime_error("error")));
+
+        BOOST_REQUIRE_THROW(reader.fill_buffer(db::no_timeout).get(), std::runtime_error);
+        BOOST_REQUIRE_THROW(push_fut.get(), std::runtime_error);
+        BOOST_REQUIRE(!reader.is_end_of_stream());
    }

    // Detached handle
@@ -2738,3 +2761,597 @@ SEASTAR_THREAD_TEST_CASE(test_compacting_reader_next_partition) {
    }
    reader_assertions.produces_end_of_stream();
 }
+
+SEASTAR_THREAD_TEST_CASE(test_auto_paused_evictable_reader_is_mutation_source) { // Runs run_mutation_source_tests() against a memtable read through make_auto_paused_evictable_reader().
+    auto make_populate = [] (schema_ptr s, const std::vector<mutation>& mutations, gc_clock::time_point query_time) { // query_time is not used here
+        auto mt = make_lw_shared<memtable>(s);
+        for (auto& mut : mutations) {
+            mt->apply(mut);
+        }
+        auto sem = make_lw_shared<reader_concurrency_semaphore>(reader_concurrency_semaphore::no_limits());
+        return mutation_source([=] ( // [=] copies the mt and sem pointers into the closure, so the returned source carries its own handles
+                schema_ptr s,
+                reader_permit permit, // not used by this factory
+                const dht::partition_range& range,
+                const query::partition_slice& slice,
+                const io_priority_class& pc,
+                tracing::trace_state_ptr trace_state,
+                streamed_mutation::forwarding fwd_sm,
+                mutation_reader::forwarding fwd_mr) mutable {
+            auto mr = make_auto_paused_evictable_reader(mt->as_data_source(), std::move(s), *sem, range, slice, pc, std::move(trace_state), fwd_mr);
+            if (fwd_sm == streamed_mutation::forwarding::yes) { // fwd_sm is not passed to the evictable reader; forwarding is layered on with make_forwardable()
+                return make_forwardable(std::move(mr));
+            }
+            return mr;
+        });
+    };
+
+    run_mutation_source_tests(make_populate);
+}
+
+SEASTAR_THREAD_TEST_CASE(test_manual_paused_evictable_reader_is_mutation_source) { // Runs run_mutation_source_tests() against a manually paused evictable reader that is paused at random points.
+    class maybe_pausing_reader : public flat_mutation_reader::impl { // Wraps the evictable reader and randomly pauses it after fill_buffer()/fast_forward_to().
+        flat_mutation_reader _reader;
+        std::optional<evictable_reader_handle> _handle;
+
+    private:
+        void maybe_pause() {
+            if (!tests::random::get_int(0, 4)) { // pauses only when the random draw is 0
+                _handle->pause();
+            }
+        }
+
+    public:
+        maybe_pausing_reader(
+                memtable& mt,
+                reader_concurrency_semaphore& semaphore,
+                const dht::partition_range& pr,
+                const query::partition_slice& ps,
+                const io_priority_class& pc,
+                tracing::trace_state_ptr trace_state,
+                mutation_reader::forwarding fwd_mr)
+            : impl(mt.schema()), _reader(nullptr) { // _reader starts out null and is assigned by the std::tie below
+            std::tie(_reader, _handle) = make_manually_paused_evictable_reader(mt.as_data_source(), mt.schema(), semaphore, pr, ps, pc,
+                    std::move(trace_state), fwd_mr);
+        }
+        virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override {
+            return _reader.fill_buffer(timeout).then([this] {
+                _end_of_stream = _reader.is_end_of_stream(); // mirror the wrapped reader's end-of-stream flag
+                _reader.move_buffer_content_to(*this); // hand the fragments over to this reader's own buffer
+            }).then([this] {
+                maybe_pause();
+            });
+        }
+        virtual void next_partition() override {
+            clear_buffer_to_next_partition();
+            if (!is_buffer_empty()) { // buffer still holds fragments after the clear, so the wrapped reader is left alone
+                return;
+            }
+            _end_of_stream = false;
+            _reader.next_partition();
+        }
+        virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override {
+            clear_buffer();
+            _end_of_stream = false;
+            return _reader.fast_forward_to(pr, timeout).then([this] {
+                maybe_pause();
+            });
+        }
+        virtual future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) override { // position-range forwarding is not implemented here: always throws
+            throw_with_backtrace<std::bad_function_call>();
+        }
+        virtual size_t buffer_size() const override { // own buffer plus whatever the wrapped reader still buffers
+            return flat_mutation_reader::impl::buffer_size() + _reader.buffer_size();
+        }
+    };
+
+    auto make_populate = [this] (schema_ptr s, const std::vector<mutation>& mutations, gc_clock::time_point query_time) { // NOTE(review): neither `this` nor query_time is used in the body
+        auto mt = make_lw_shared<memtable>(s);
+        for (auto& mut : mutations) {
+            mt->apply(mut);
+        }
+        auto sem = make_lw_shared<reader_concurrency_semaphore>(reader_concurrency_semaphore::no_limits());
+        return mutation_source([=] ( // [=] copies the mt and sem pointers into the closure
+                schema_ptr s,
+                reader_permit permit, // not used by this factory
+                const dht::partition_range& range,
+                const query::partition_slice& slice,
+                const io_priority_class& pc,
+                tracing::trace_state_ptr trace_state,
+                streamed_mutation::forwarding fwd_sm,
+                mutation_reader::forwarding fwd_mr) mutable {
+            auto mr = make_flat_mutation_reader<maybe_pausing_reader>(*mt, *sem, range, slice, pc, std::move(trace_state), fwd_mr);
+            if (fwd_sm == streamed_mutation::forwarding::yes) { // forwarding within a partition is layered on with make_forwardable()
+                return make_forwardable(std::move(mr));
+            }
+            return mr;
+        });
+    };
+
+    run_mutation_source_tests(make_populate);
+}
+
+namespace {
+
+std::deque<mutation_fragment> copy_fragments(const schema& s, const std::deque<mutation_fragment>& o) { // Returns a new deque with one fragment per element of `o`, in the same order.
+    std::deque<mutation_fragment> buf;
+    for (const auto& mf : o) {
+        buf.emplace_back(s, mf); // each copy is built with the (schema, fragment) constructor of mutation_fragment
+    }
+    return buf;
+}
+
+flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer( // Returns a manually paused evictable reader that has read one buffer and has then been paused and evicted.
+        schema_ptr schema,
+        reader_concurrency_semaphore& semaphore,
+        const dht::partition_range& prange,
+        const query::partition_slice& slice,
+        std::deque<mutation_fragment> first_buffer, // served by the first reader the source creates
+        position_in_partition_view last_fragment_position, // required position of the last fragment in the first filled buffer
+        std::deque<mutation_fragment> second_buffer, // served by the second reader the source creates
+        size_t max_buffer_size) {
+    class factory { // Callable wrapped in a mutation_source: first call serves _first_buffer, second call _second_buffer, later calls an empty reader.
+        schema_ptr _schema;
+        std::optional<std::deque<mutation_fragment>> _first_buffer;
+        std::optional<std::deque<mutation_fragment>> _second_buffer;
+        size_t _max_buffer_size;
+
+    private:
+        std::optional<std::deque<mutation_fragment>> copy_buffer(const std::optional<std::deque<mutation_fragment>>& o) { // copies the fragments via copy_fragments(); empty stays empty
+            if (!o) {
+                return {};
+            }
+            return copy_fragments(*_schema, *o); // reads _schema, which is declared (and so initialized) before the buffers
+        }
+
+    public:
+        factory(schema_ptr schema, std::deque<mutation_fragment> first_buffer, std::deque<mutation_fragment> second_buffer, size_t max_buffer_size)
+            : _schema(std::move(schema)), _first_buffer(std::move(first_buffer)), _second_buffer(std::move(second_buffer)), _max_buffer_size(max_buffer_size) {
+        }
+
+        factory(const factory& o) // user-provided copy: the buffers are copied through copy_buffer()
+            : _schema(o._schema)
+            , _first_buffer(copy_buffer(o._first_buffer))
+            , _second_buffer(copy_buffer(o._second_buffer))
+            , _max_buffer_size(o._max_buffer_size) {} // fix: this member used to be left uninitialized in copies
+        factory(factory&& o) = default;
+
+        flat_mutation_reader operator()(
+                schema_ptr s,
+                reader_permit permit,
+                const dht::partition_range& range,
+                const query::partition_slice& slice,
+                const io_priority_class& pc,
+                tracing::trace_state_ptr trace_state,
+                streamed_mutation::forwarding fwd_sm,
+                mutation_reader::forwarding fwd_mr) { // only `s` is inspected; the other arguments are ignored
+            BOOST_REQUIRE(s == _schema);
+            if (_first_buffer) {
+                auto buf = *std::exchange(_first_buffer, {}); // take the fragments and leave the optional empty so the next call moves on
+                auto rd = make_flat_mutation_reader_from_fragments(_schema, std::move(buf));
+                rd.set_max_buffer_size(_max_buffer_size);
+                return rd;
+            }
+            if (_second_buffer) {
+                auto buf = *std::exchange(_second_buffer, {});
+                auto rd = make_flat_mutation_reader_from_fragments(_schema, std::move(buf));
+                rd.set_max_buffer_size(_max_buffer_size);
+                return rd;
+            }
+            return make_empty_flat_reader(_schema); // both buffers already handed out
+        }
+    };
+    auto ms = mutation_source(factory(schema, std::move(first_buffer), std::move(second_buffer), max_buffer_size));
+
+    auto [rd, handle] = make_manually_paused_evictable_reader(
+            std::move(ms),
+            schema,
+            semaphore,
+            prange,
+            slice,
+            seastar::default_priority_class(),
+            nullptr,
+            mutation_reader::forwarding::yes);
+
+    rd.set_max_buffer_size(max_buffer_size);
+
+    rd.fill_buffer(db::no_timeout).get0();
+
+    const auto eq_cmp = position_in_partition::equal_compare(*schema);
+    BOOST_REQUIRE(rd.is_buffer_full()); // preconditions on the caller-supplied buffers and max_buffer_size
+    BOOST_REQUIRE(eq_cmp(rd.buffer().back().position(), last_fragment_position));
+    BOOST_REQUIRE(!rd.is_end_of_stream());
+
+    rd.detach_buffer(); // the fragments read so far are dropped (return value unused)
+
+    handle.pause();
+
+    while(semaphore.try_evict_one_inactive_read()); // keep evicting until the semaphore reports nothing left to evict
+
+    return std::move(rd); // rd is a structured binding, so it is not implicitly moved on return
+}
+
+}
+
+SEASTAR_THREAD_TEST_CASE(test_evictable_reader_trim_range_tombstones) { // After eviction, the first fragment read again must lie strictly after the last position read before eviction.
+    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::no_limits{});
+    simple_schema s;
+
+    const auto pkey = s.make_pkey();
+    size_t max_buffer_size = 512; // placeholder; overwritten below with the measured size of the first buffer
+    const int first_ck = 100;
+    const int second_buffer_ck = first_ck + 100;
+
+    size_t mem_usage = 0;
+
+    std::deque<mutation_fragment> first_buffer;
+    first_buffer.emplace_back(partition_start{pkey, {}});
+    mem_usage = first_buffer.back().memory_usage(*s.schema());
+    for (int i = 0; i < second_buffer_ck; ++i) {
+        first_buffer.emplace_back(s.make_row(s.make_ckey(i++), "v")); // i advances twice per iteration (here and in the loop header), so only every other key gets a row
+        mem_usage += first_buffer.back().memory_usage(*s.schema());
+    }
+    const auto last_fragment_position = position_in_partition(first_buffer.back().position());
+    max_buffer_size = mem_usage; // buffer limit = measured memory usage of the fragments added so far
+    first_buffer.emplace_back(s.make_row(s.make_ckey(second_buffer_ck), "v")); // one extra row, added after the limit was measured
+
+    std::deque<mutation_fragment> second_buffer;
+    second_buffer.emplace_back(partition_start{pkey, {}});
+    mem_usage = second_buffer.back().memory_usage(*s.schema());
+    second_buffer.emplace_back(s.make_range_tombstone(query::clustering_range::make_ending_with(s.make_ckey(second_buffer_ck + 10)))); // range has no start bound; not counted in mem_usage
+    int ckey = second_buffer_ck;
+    while (mem_usage <= max_buffer_size) { // add rows until the measured usage exceeds the buffer limit
+        second_buffer.emplace_back(s.make_row(s.make_ckey(ckey++), "v"));
+        mem_usage += second_buffer.back().memory_usage(*s.schema());
+    }
+    second_buffer.emplace_back(partition_end{});
+
+    auto rd = create_evictable_reader_and_evict_after_first_buffer(s.schema(), semaphore, query::full_partition_range,
+            s.schema()->full_slice(), std::move(first_buffer), last_fragment_position, std::move(second_buffer), max_buffer_size);
+
+    rd.fill_buffer(db::no_timeout).get();
+
+    const auto tri_cmp = position_in_partition::tri_compare(*s.schema());
+
+    BOOST_REQUIRE(tri_cmp(last_fragment_position, rd.peek_buffer().position()) < 0); // first fragment after the refill is strictly after the last pre-eviction position
+}
+
+namespace {
+
+void check_evictable_reader_validation_is_triggered( // Builds an evicted reader from the two buffers, refills it and checks whether a std::runtime_error with the expected prefix is thrown.
+        std::string_view test_name,
+        std::string_view error_prefix, // empty str if no exception is expected
+        schema_ptr schema,
+        reader_concurrency_semaphore& semaphore,
+        const dht::partition_range& prange,
+        const query::partition_slice& slice,
+        std::deque<mutation_fragment> first_buffer,
+        position_in_partition_view last_fragment_position,
+        std::deque<mutation_fragment> second_buffer,
+        size_t max_buffer_size) {
+
+    testlog.info("check_evictable_reader_validation_is_triggered(): checking {} test case: {}", error_prefix.empty() ? "positive" : "negative", test_name);
+
+    auto rd = create_evictable_reader_and_evict_after_first_buffer(std::move(schema), semaphore, prange, slice, std::move(first_buffer),
+            last_fragment_position, std::move(second_buffer), max_buffer_size);
+
+    const bool fail = !error_prefix.empty(); // a non-empty prefix means the refill is expected to throw
+
+    try {
+        rd.fill_buffer(db::no_timeout).get0();
+    } catch (const std::runtime_error& e) { // other exception types are not handled here and propagate to the caller
+        if (fail) {
+            if (error_prefix == std::string_view(e.what()).substr(0, error_prefix.size())) { // substr() clamps to the message length, so a short what() is never read past its end
+                testlog.trace("Expected exception caught: {}", std::current_exception());
+                return;
+            } else {
+                BOOST_FAIL(fmt::format("Exception with unexpected message caught: {}", std::current_exception()));
+            }
+        } else {
+            BOOST_FAIL(fmt::format("Unexpected exception caught: {}", std::current_exception()));
+        }
+    }
+    if (fail) { // reached only when fill_buffer() returned normally
+        BOOST_FAIL(fmt::format("Expected exception not thrown"));
+    }
+}
+
+}
+
+// Exercises the evictable reader's self-validation of the data it reads after
+// having been evicted and re-created.
+//
+// Each scenario hands check_evictable_reader_validation_is_triggered() a
+// "first buffer" (what the reader saw before eviction), the position of the
+// last fragment it is considered to have emitted, and a "second buffer" (what
+// the re-created reader sees). A non-empty error prefix means a
+// std::runtime_error whose message starts with that prefix is expected; an
+// empty prefix means the buffer fill is expected to succeed.
+SEASTAR_THREAD_TEST_CASE(test_evictable_reader_self_validation) {
+    // Internal errors must not abort the process while this test runs; the
+    // deferred action switches aborting back on when the test scope is left.
+    set_abort_on_internal_error(false);
+    auto reset_on_internal_abort = defer([] {
+        set_abort_on_internal_error(true);
+    });
+
+    reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::no_limits{});
+    simple_schema s;
+
+    auto pkeys = s.make_pkeys(4);
+    boost::sort(pkeys, dht::decorated_key::less_comparator(s.schema()));
+
+    // Initial value only; overwritten below once the first buffer's memory
+    // footprint is known.
+    size_t max_buffer_size = 512;
+    const int first_ck = 100;
+    const int second_buffer_ck = first_ck + 100;
+    const int last_ck = second_buffer_ck + 100;
+
+    static const char partition_error_prefix[] = "maybe_validate_partition_start(): validation failed";
+    static const char position_in_partition_error_prefix[] = "validate_position_in_partition(): validation failed";
+    static const char trim_range_tombstones_error_prefix[] = "maybe_trim_range_tombstone(): validation failed";
+    // Passed as the error prefix by scenarios in which validation must pass.
+    const char* const no_error_expected = "";
+
+    // The queried partition range is [pkeys[1], pkeys[2]], both ends inclusive.
+    const auto prange = dht::partition_range::make(
+            dht::partition_range::bound(pkeys[1], true),
+            dht::partition_range::bound(pkeys[2], true));
+
+    // The queried clustering range is [first_ck, last_ck], both ends inclusive.
+    const auto ckrange = query::clustering_range::make(
+            query::clustering_range::bound(s.make_ckey(first_ck), true),
+            query::clustering_range::bound(s.make_ckey(last_ck), true));
+
+    const auto slice = partition_slice_builder(*s.schema()).with_range(ckrange).build();
+
+    std::deque<mutation_fragment> first_buffer;
+
+    // (Re)starts `first_buffer` with a partition-start for pkeys[1] followed by
+    // one row for every second clustering key in [0, second_buffer_ck), i.e.
+    // 0, 2, 4, ... Returns the accumulated memory usage of these fragments.
+    auto begin_first_partition = [&] {
+        first_buffer.clear();
+        first_buffer.emplace_back(partition_start{pkeys[1], {}});
+        size_t usage = first_buffer.back().memory_usage(*s.schema());
+        for (int ck = 0; ck < second_buffer_ck; ck += 2) {
+            first_buffer.emplace_back(s.make_row(s.make_ckey(ck), "v"));
+            usage += first_buffer.back().memory_usage(*s.schema());
+        }
+        return usage;
+    };
+
+    // Scenario group 1: the first buffer ends in the middle of partition
+    // pkeys[1]. The buffer size limit equals the footprint of everything up to
+    // (and including) the last counted row; one more row is appended after the
+    // recorded last-fragment position.
+    max_buffer_size = begin_first_partition();
+    auto last_fragment_position = position_in_partition(first_buffer.back().position());
+    first_buffer.emplace_back(s.make_row(s.make_ckey(second_buffer_ck), "v"));
+
+    // Builds a second buffer: a partition-start for `pkey`, optionally a range
+    // tombstone ending with last_ck, then consecutive rows starting at
+    // `first_ckey` (second_buffer_ck when not given) until the tracked memory
+    // usage exceeds the current max_buffer_size, and finally a partition-end.
+    // Note that the optional range tombstone is not added to the tracked usage.
+    auto make_second_buffer = [&s, &max_buffer_size, second_buffer_ck] (dht::decorated_key pkey, std::optional<int> first_ckey = {},
+            bool inject_range_tombstone = false) {
+        std::deque<mutation_fragment> fragments;
+        fragments.emplace_back(partition_start{std::move(pkey), {}});
+        size_t usage = fragments.back().memory_usage(*s.schema());
+        if (inject_range_tombstone) {
+            fragments.emplace_back(s.make_range_tombstone(query::clustering_range::make_ending_with(s.make_ckey(last_ck))));
+        }
+        for (int next_ck = first_ckey.value_or(second_buffer_ck); usage <= max_buffer_size; ++next_ck) {
+            fragments.emplace_back(s.make_row(s.make_ckey(next_ck), "v"));
+            usage += fragments.back().memory_usage(*s.schema());
+        }
+        fragments.emplace_back(partition_end{});
+        return fragments;
+    };
+
+    // Runs one scenario against the *current* first buffer, last fragment
+    // position and buffer size limit (all captured by reference, so the second
+    // scenario group below picks up their updated values).
+    auto run_case = [&] (const char* description, const char* expected_error_prefix, std::deque<mutation_fragment> second_buffer) {
+        check_evictable_reader_validation_is_triggered(
+                description,
+                expected_error_prefix,
+                s.schema(),
+                semaphore,
+                prange,
+                slice,
+                copy_fragments(*s.schema(), first_buffer),
+                last_fragment_position,
+                std::move(second_buffer),
+                max_buffer_size);
+    };
+
+    //
+    // Continuing the same partition
+    //
+
+    run_case("pkey < _last_pkey; pkey ∉ prange",
+            partition_error_prefix,
+            make_second_buffer(pkeys[0]));
+
+    run_case("pkey == _last_pkey",
+            no_error_expected,
+            make_second_buffer(pkeys[1]));
+
+    run_case("pkey == _last_pkey; position_in_partition ∉ ckrange (<)",
+            position_in_partition_error_prefix,
+            make_second_buffer(pkeys[1], first_ck - 10));
+
+    run_case("pkey == _last_pkey; position_in_partition ∉ ckrange (<); start with trimmable range-tombstone",
+            position_in_partition_error_prefix,
+            make_second_buffer(pkeys[1], first_ck - 10, true));
+
+    run_case("pkey == _last_pkey; position_in_partition ∉ ckrange; position_in_partition < _next_position_in_partition",
+            position_in_partition_error_prefix,
+            make_second_buffer(pkeys[1], second_buffer_ck - 2));
+
+    run_case("pkey == _last_pkey; position_in_partition ∉ ckrange; position_in_partition < _next_position_in_partition; start with trimmable range-tombstone",
+            position_in_partition_error_prefix,
+            make_second_buffer(pkeys[1], second_buffer_ck - 2, true));
+
+    // In the next three scenarios the fragment right after the partition-start
+    // is overwritten with a hand-picked range tombstone.
+    {
+        auto tombstone_buffer = make_second_buffer(pkeys[1], second_buffer_ck);
+        tombstone_buffer[1] = s.make_range_tombstone(query::clustering_range::make_ending_with(s.make_ckey(second_buffer_ck - 10)));
+        run_case("pkey == _last_pkey; end(range_tombstone) < _next_position_in_partition",
+                trim_range_tombstones_error_prefix,
+                std::move(tombstone_buffer));
+    }
+
+    {
+        auto tombstone_buffer = make_second_buffer(pkeys[1], second_buffer_ck);
+        tombstone_buffer[1] = s.make_range_tombstone(query::clustering_range::make_ending_with(s.make_ckey(second_buffer_ck + 10)));
+        run_case("pkey == _last_pkey; end(range_tombstone) > _next_position_in_partition",
+                no_error_expected,
+                std::move(tombstone_buffer));
+    }
+
+    {
+        auto tombstone_buffer = make_second_buffer(pkeys[1], second_buffer_ck);
+        tombstone_buffer[1] = s.make_range_tombstone(query::clustering_range::make_starting_with(s.make_ckey(last_ck + 10)));
+        run_case("pkey == _last_pkey; start(range_tombstone) ∉ ckrange (>)",
+                position_in_partition_error_prefix,
+                std::move(tombstone_buffer));
+    }
+
+    run_case("pkey == _last_pkey; position_in_partition ∈ ckrange",
+            no_error_expected,
+            make_second_buffer(pkeys[1], second_buffer_ck));
+
+    run_case("pkey == _last_pkey; position_in_partition ∉ ckrange (>)",
+            position_in_partition_error_prefix,
+            make_second_buffer(pkeys[1], last_ck + 10));
+
+    run_case("pkey > _last_pkey; pkey ∈ pkrange",
+            partition_error_prefix,
+            make_second_buffer(pkeys[2]));
+
+    run_case("pkey > _last_pkey; pkey ∉ pkrange",
+            partition_error_prefix,
+            make_second_buffer(pkeys[3]));
+
+    //
+    // Continuing from next partition
+    //
+
+    // Scenario group 2: the first buffer now holds the complete partition
+    // pkeys[1] (including its partition-end, which is the last counted
+    // fragment), followed by the start of partition pkeys[2].
+    size_t closed_partition_usage = begin_first_partition();
+    first_buffer.emplace_back(partition_end{});
+    closed_partition_usage += first_buffer.back().memory_usage(*s.schema());
+    last_fragment_position = position_in_partition(first_buffer.back().position());
+    max_buffer_size = closed_partition_usage;
+    first_buffer.emplace_back(partition_start{pkeys[2], {}});
+
+    run_case("pkey < _last_pkey; pkey ∉ pkrange",
+            partition_error_prefix,
+            make_second_buffer(pkeys[0]));
+
+    run_case("pkey == _last_pkey",
+            partition_error_prefix,
+            make_second_buffer(pkeys[1]));
+
+    run_case("pkey > _last_pkey; pkey ∈ pkrange",
+            no_error_expected,
+            make_second_buffer(pkeys[2]));
+
+    run_case("pkey > _last_pkey; pkey ∉ pkrange",
+            partition_error_prefix,
+            make_second_buffer(pkeys[3]));
+}
--- a/Show More
+++ b/Show More