release: prepare for 3.1.0

release: prepare for 3.1.0.rc9
querier_cache: correctly account entries evicted on insertion in the population
2019-10-12 08:45:49 +03:00 · 2019-10-06 10:51:37 +03:00 · 2019-10-05 12:36:21 +03:00 · 2019-10-05 09:50:05 +03:00 · 2019-10-03 14:42:38 +03:00 · 2019-10-03 14:41:34 +03:00
317 changed files with 4142 additions and 1097 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/2
+++ b/2
@@ -1,7 +1,7 @@
 #!/bin/sh

 PRODUCT=scylla
-VERSION=666.development
+VERSION=3.1.0

 if test -f version
 then
--- a/api/api.hh
+++ b/api/api.hh
@@ -22,6 +22,7 @@
 #pragma once

 #include <seastar/json/json_elements.hh>
+#include <type_traits>
 #include <boost/lexical_cast.hpp>
 #include <boost/algorithm/string/split.hpp>
 #include <boost/algorithm/string/classification.hpp>
@@ -231,7 +232,22 @@ public:
            return;
        }
        try {
-            value = T{boost::lexical_cast<Base>(param)};
+            // boost::lexical_cast does not use boolalpha, so converting
+            // "true"/"false" throws an exception. We don't want that.
+            if constexpr (std::is_same_v<Base, bool>) {
+                // Cannot use boolalpha because we (probably) want to
+                // accept 1 and 0 as well as true and false. And True. And fAlse.
+                std::transform(param.begin(), param.end(), param.begin(), ::tolower);
+                if (param == "true" || param == "1") {
+                    value = T(true);
+                } else if (param == "false" || param == "0") {
+                    value = T(false);
+                } else {
+                    throw boost::bad_lexical_cast{};
+                }
+            } else {
+                value = T{boost::lexical_cast<Base>(param)};
+            }
        } catch (boost::bad_lexical_cast&) {
            throw bad_param_exception(format("{} ({}): type error - should be {}", name, param, boost::units::detail::demangle(typeid(Base).name())));
        }
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -170,7 +170,9 @@ future<> service::start() {
    return once_among_shards([this] {
        return create_keyspace_if_missing();
    }).then([this] {
-        return when_all_succeed(_role_manager->start(), _authorizer->start(), _authenticator->start());
+        return _role_manager->start().then([this] {
+            return when_all_succeed(_authorizer->start(), _authenticator->start());
+        });
    }).then([this] {
        _permissions_cache = std::make_unique<permissions_cache>(_permissions_cache_config, *this, log);
    }).then([this] {
--- a/configure.py
+++ b/configure.py
@@ -596,6 +596,7 @@ scylla_core = (['database.cc',
                'db/consistency_level.cc',
                'db/system_keyspace.cc',
                'db/system_distributed_keyspace.cc',
+                'db/size_estimates_virtual_reader.cc',
                'db/schema_tables.cc',
                'db/cql_type_parser.cc',
                'db/legacy_schema_migrator.cc',
--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -222,11 +222,9 @@ statement_restrictions::statement_restrictions(database& db,
    auto& cf = db.find_column_family(schema);
    auto& sim = cf.get_index_manager();
    const allow_local_index allow_local(!_partition_key_restrictions->has_unrestricted_components(*_schema) && _partition_key_restrictions->is_all_eq());
-    bool has_queriable_clustering_column_index = _clustering_columns_restrictions->has_supporting_index(sim, allow_local);
-    bool has_queriable_pk_index = _partition_key_restrictions->has_supporting_index(sim, allow_local);
-    bool has_queriable_index = has_queriable_clustering_column_index
-            || has_queriable_pk_index
-            || _nonprimary_key_restrictions->has_supporting_index(sim, allow_local);
+    const bool has_queriable_clustering_column_index = _clustering_columns_restrictions->has_supporting_index(sim, allow_local);
+    const bool has_queriable_pk_index = _partition_key_restrictions->has_supporting_index(sim, allow_local);
+    const bool has_queriable_regular_index = _nonprimary_key_restrictions->has_supporting_index(sim, allow_local);

    // At this point, the select statement if fully constructed, but we still have a few things to validate
    process_partition_key_restrictions(has_queriable_pk_index, for_view, allow_filtering);
@@ -286,7 +284,7 @@ statement_restrictions::statement_restrictions(database& db,
    }

    if (!_nonprimary_key_restrictions->empty()) {
-        if (has_queriable_index) {
+        if (has_queriable_regular_index) {
            _uses_secondary_indexing = true;
        } else if (!allow_filtering) {
            throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
@@ -392,8 +390,9 @@ std::vector<const column_definition*> statement_restrictions::get_column_defs_fo
                }
            }
        }
-        if (_clustering_columns_restrictions->needs_filtering(*_schema)) {
-            column_id first_filtering_id = _schema->clustering_key_columns().begin()->id +
+        const bool pk_has_unrestricted_components = _partition_key_restrictions->has_unrestricted_components(*_schema);
+        if (pk_has_unrestricted_components || _clustering_columns_restrictions->needs_filtering(*_schema)) {
+            column_id first_filtering_id = pk_has_unrestricted_components ? 0 : _schema->clustering_key_columns().begin()->id +
                    _clustering_columns_restrictions->num_prefix_columns_that_need_not_be_filtered();
            for (auto&& cdef : _clustering_columns_restrictions->get_column_defs()) {
                if (cdef->id >= first_filtering_id && !column_uses_indexing(cdef)) {
@@ -507,10 +506,9 @@ bool statement_restrictions::need_filtering() const {
    int number_of_filtering_restrictions = _nonprimary_key_restrictions->size();
    // If the whole partition key is restricted, it does not imply filtering
    if (_partition_key_restrictions->has_unrestricted_components(*_schema) || !_partition_key_restrictions->is_all_eq()) {
-        number_of_filtering_restrictions += _partition_key_restrictions->size();
-        if (_clustering_columns_restrictions->has_unrestricted_components(*_schema)) {
-            number_of_filtering_restrictions += _clustering_columns_restrictions->size() - _clustering_columns_restrictions->prefix_size();
-        }
+        number_of_filtering_restrictions += _partition_key_restrictions->size() + _clustering_columns_restrictions->size();
+    } else if (_clustering_columns_restrictions->has_unrestricted_components(*_schema)) {
+        number_of_filtering_restrictions += _clustering_columns_restrictions->size() - _clustering_columns_restrictions->prefix_size();
    }
    return number_of_restricted_columns_for_indexing > 1
            || (number_of_restricted_columns_for_indexing == 0 && _partition_key_restrictions->empty() && !_clustering_columns_restrictions->empty())
--- a/cql3/restrictions/statement_restrictions.hh
+++ b/cql3/restrictions/statement_restrictions.hh
@@ -407,7 +407,7 @@ public:
    }

    bool ck_restrictions_need_filtering() const {
-        return _clustering_columns_restrictions->needs_filtering(*_schema);
+        return _partition_key_restrictions->has_unrestricted_components(*_schema) || _clustering_columns_restrictions->needs_filtering(*_schema); // also true whenever a partition key component is left unrestricted
    }

    /**
--- a/cql3/result_set.cc
+++ b/cql3/result_set.cc
@@ -83,6 +83,9 @@ void metadata::maybe_set_paging_state(::shared_ptr<const service::pager::paging_
    assert(paging_state);
    if (paging_state->get_remaining() > 0) {
        set_paging_state(std::move(paging_state));
+    } else {
+        _flags.remove<flag::HAS_MORE_PAGES>();
+        _paging_state = nullptr;
    }
 }

--- a/cql3/statements/raw/select_statement.hh
+++ b/cql3/statements/raw/select_statement.hh
@@ -76,7 +76,7 @@ public:
        const bool _is_distinct;
        const bool _allow_filtering;
        const bool _is_json;
-        bool _bypass_cache;
+        bool _bypass_cache = false;
    public:
        parameters();
        parameters(orderings_type orderings,
--- a/database.cc
+++ b/database.cc
@@ -1929,7 +1929,7 @@ flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db,
        virtual flat_mutation_reader create_reader(
                schema_ptr schema,
                const dht::partition_range& range,
-                const query::partition_slice&,
+                const query::partition_slice& slice,
                const io_priority_class& pc,
                tracing::trace_state_ptr,
                mutation_reader::forwarding fwd_mr) override {
@@ -1940,7 +1940,7 @@ flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db,
            _contexts[shard].read_operation = make_foreign(std::make_unique<utils::phased_barrier::operation>(cf.read_in_progress()));
            _contexts[shard].semaphore = &cf.streaming_read_concurrency_semaphore();

-            return cf.make_streaming_reader(std::move(schema), *_contexts[shard].range, fwd_mr);
+            return cf.make_streaming_reader(std::move(schema), *_contexts[shard].range, slice, fwd_mr);
        }
        virtual void destroy_reader(shard_id shard, future<stopped_reader> reader_fut) noexcept override {
            reader_fut.then([this, zis = shared_from_this(), shard] (stopped_reader&& reader) mutable {
--- a/database.hh
+++ b/database.hh
@@ -458,6 +458,7 @@ private:
    // This semaphore ensures that an operation like snapshot won't have its selected
    // sstables deleted by compaction in parallel, a race condition which could
    // easily result in failure.
+    // Locking order: must be acquired either independently or after _sstables_lock
    seastar::semaphore _sstable_deletion_sem = {1};
    // There are situations in which we need to stop writing sstables. Flushers will take
    // the read lock, and the ones that wish to stop that process will take the write lock.
@@ -679,8 +680,13 @@ public:

    // Single range overload.
    flat_mutation_reader make_streaming_reader(schema_ptr schema, const dht::partition_range& range,
+            const query::partition_slice& slice,
            mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::no) const;

+    flat_mutation_reader make_streaming_reader(schema_ptr schema, const dht::partition_range& range) const { // convenience overload reading the schema's full slice; const like the slice-taking overload it forwards to
+        return make_streaming_reader(schema, range, schema->full_slice()); // schema is copied rather than moved: it is still dereferenced for full_slice() in the same argument list
+    }
+
    sstables::shared_sstable make_streaming_sstable_for_write(std::optional<sstring> subdir = {});
    sstables::shared_sstable make_streaming_staging_sstable() {
        return make_streaming_sstable_for_write("staging");
@@ -759,13 +765,7 @@ public:

    // SSTable writes are now allowed again, and generation is updated to new_generation if != -1
    // returns the amount of microseconds elapsed since we disabled writes.
-    std::chrono::steady_clock::duration enable_sstable_write(int64_t new_generation) {
-        if (new_generation != -1) {
-            update_sstables_known_generation(new_generation);
-        }
-        _sstables_lock.write_unlock();
-        return std::chrono::steady_clock::now() - _sstable_writes_disabled_at;
-    }
+    std::chrono::steady_clock::duration enable_sstable_write(int64_t new_generation);

    // Make sure the generation numbers are sequential, starting from "start".
    // Generations before "start" are left untouched.
@@ -935,7 +935,7 @@ public:
    }

 private:
-    future<row_locker::lock_holder> do_push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, mutation_source&& source) const;
+    future<row_locker::lock_holder> do_push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, mutation_source&& source, const io_priority_class& io_priority) const;
    std::vector<view_ptr> affected_views(const schema_ptr& base, const mutation& update) const;
    future<> generate_and_propagate_view_updates(const schema_ptr& base,
            std::vector<view_ptr>&& views,
--- a/db/batchlog_manager.cc
+++ b/db/batchlog_manager.cc
@@ -396,10 +396,8 @@ std::unordered_set<gms::inet_address> db::batchlog_manager::endpoint_filter(cons

    // grab a random member of up to two racks
    for (auto& rack : racks) {
-        auto rack_members = validated.bucket(rack);
-        auto n = validated.bucket_size(rack_members);
        auto cpy = boost::copy_range<std::vector<gms::inet_address>>(validated.equal_range(rack) | boost::adaptors::map_values);
-        std::uniform_int_distribution<size_t> rdist(0, n - 1);
+        std::uniform_int_distribution<size_t> rdist(0, cpy.size() - 1);
        result.emplace(cpy[rdist(_e1)]);
    }

--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -148,9 +148,18 @@ db::commitlog::descriptor::descriptor(const sstring& filename, const std::string
        : descriptor([&filename, &fname_prefix]() {
            std::smatch m;
            // match both legacy and new version of commitlogs Ex: CommitLog-12345.log and CommitLog-4-12345.log.
-                std::regex rx("(?:.*/)?(?:Recycled-)?" + fname_prefix + "((\\d+)(" + SEPARATOR + "\\d+)?)" + FILENAME_EXTENSION);
+                std::regex rx("(?:Recycled-)?" + fname_prefix + "((\\d+)(" + SEPARATOR + "\\d+)?)" + FILENAME_EXTENSION);
                std::string sfilename = filename;
-                if (!std::regex_match(sfilename, m, rx)) {
+                auto cbegin = sfilename.cbegin();
+                // skip the leading path
+                // Note: the path is stripped with rfind instead of inside the regex,
+                // since the regex matcher may run out of stack in debug builds.
+                // See https://github.com/scylladb/scylla/issues/4464
+                auto pos = std::string(filename).rfind('/');
+                if (pos != std::string::npos) {
+                    cbegin += pos + 1;
+                }
+                if (!std::regex_match(cbegin, sfilename.cend(), m, rx)) {
                    throw std::domain_error("Cannot parse the version of the file: " + filename);
                }
                if (m[3].length() == 0) {
@@ -420,7 +429,11 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c

    uint64_t _file_pos = 0;
    uint64_t _flush_pos = 0;
+
    bool _closed = false;
+    // Not the same as _closed since files can be reused
+    bool _closed_file = false;
+
    bool _terminated = false;

    using buffer_type = segment_manager::buffer_type;
@@ -486,7 +499,7 @@ public:
        clogger.debug("Created new {} segment {}", active ? "active" : "reserve", *this);
    }
    ~segment() {
-        if (!_closed) {
+        if (!_closed_file) {
            _segment_manager->add_file_to_close(std::move(_file));
        }
        if (is_clean()) {
@@ -560,7 +573,7 @@ public:
                    // and we should have waited out all pending.
                    return me->_pending_ops.close().finally([me] {
                        return me->_file.truncate(me->_flush_pos).then([me] {
-                            return me->_file.close();
+                            return me->_file.close().finally([me] { me->_closed_file = true; });
                        });
                    });
                });
--- a/db/config.hh
+++ b/db/config.hh
@@ -756,6 +756,7 @@ public:
    val(enable_dangerous_direct_import_of_cassandra_counters, bool, false, Used, "Only turn this option on if you want to import tables from Cassandra containing counters, and you are SURE that no counters in that table were created in a version earlier than Cassandra 2.1." \
        " It is not enough to have ever since upgraded to newer versions of Cassandra. If you EVER used a version earlier than 2.1 in the cluster where these SSTables come from, DO NOT TURN ON THIS OPTION! You will corrupt your data. You have been warned.") \
    val(enable_shard_aware_drivers, bool, true, Used, "Enable native transport drivers to use connection-per-shard for better performance") \
+    val(abort_on_internal_error, bool, false, Used, "Abort the server instead of throwing exception when internal invariants are violated.") \
    /* done! */

 #define _make_value_member(name, type, deflt, status, desc, ...)    \
--- a/db/cql_type_parser.cc
+++ b/db/cql_type_parser.cc
@@ -57,9 +57,30 @@ static ::shared_ptr<cql3::cql3_type::raw> parse_raw(const sstring& str) {
 }

 data_type db::cql_type_parser::parse(const sstring& keyspace, const sstring& str, lw_shared_ptr<user_types_metadata> user_types) {
+    static const thread_local std::unordered_map<sstring, cql3::cql3_type> native_types = []{
+        std::unordered_map<sstring, cql3::cql3_type> res;
+        for (auto& nt : cql3::cql3_type::values()) {
+            res.emplace(nt.to_string(), nt);
+        }
+        return res;
+    }();
+
+    auto i = native_types.find(str);
+    if (i != native_types.end()) {
+        return i->second.get_type();
+    }
+
    if (!user_types && service::get_storage_proxy().local_is_initialized()) {
        user_types = service::get_storage_proxy().local().get_db().local().find_keyspace(keyspace).metadata()->user_types();
    }
+    // special-case top-level UDTs
+    if (user_types) {
+        auto& map = user_types->get_all_types();
+        auto i = map.find(utf8_type->decompose(str));
+        if (i != map.end()) {
+            return i->second;
+        }
+    }

    auto raw = parse_raw(str);
    auto cql = raw->prepare_internal(keyspace, user_types);
--- a/db/data_listeners.cc
+++ b/db/data_listeners.cc
@@ -57,7 +57,7 @@ void data_listeners::on_write(const schema_ptr& s, const frozen_mutation& m) {
    }
 }

-toppartitons_item_key::operator sstring() const {
+toppartitions_item_key::operator sstring() const {
    std::ostringstream oss;
    oss << key.key().with_schema(*schema);
    return oss.str();
@@ -84,8 +84,11 @@ flat_mutation_reader toppartitions_data_listener::on_read(const schema_ptr& s, c
        return std::move(rd);
    }
    dblog.trace("toppartitions_data_listener::on_read: {}.{}", s->ks_name(), s->cf_name());
-    return make_filtering_reader(std::move(rd), [this, &range, &slice, s = std::move(s)] (const dht::decorated_key& dk) {
-        _top_k_read.append(toppartitons_item_key{s, dk});
+    return make_filtering_reader(std::move(rd), [zis = this->weak_from_this(), &range, &slice, s = std::move(s)] (const dht::decorated_key& dk) {
+        // The data query may be executing after the toppartitions_data_listener object has been removed, so check that it is still alive
+        if (zis) {
+            zis->_top_k_read.append(toppartitions_item_key{s, dk});
+        }
        return true;
    });
 }
@@ -95,7 +98,27 @@ void toppartitions_data_listener::on_write(const schema_ptr& s, const frozen_mut
        return;
    }
    dblog.trace("toppartitions_data_listener::on_write: {}.{}", _ks, _cf);
-    _top_k_write.append(toppartitons_item_key{s, m.decorated_key(*s)});
+    _top_k_write.append(toppartitions_item_key{s, m.decorated_key(*s)});
+}
+
+toppartitions_data_listener::global_top_k::results
+toppartitions_data_listener::globalize(top_k::results&& r) { // rebuilds every entry of r around a toppartitions_global_item_key
+    toppartitions_data_listener::global_top_k::results n;
+    n.reserve(r.size()); // one output entry per input entry
+    for (auto&& e : r) {
+        n.emplace_back(global_top_k::results::value_type{toppartitions_global_item_key(std::move(e.item)), e.count, e.error}); // item is moved out of r; count and error are carried over unchanged
+    }
+    return n;
+}
+
+toppartitions_data_listener::top_k::results
+toppartitions_data_listener::localize(const global_top_k::results& r) { // inverse direction of globalize(): rebuilds every entry of r around a toppartitions_item_key
+    toppartitions_data_listener::top_k::results n;
+    n.reserve(r.size()); // one output entry per input entry
+    for (auto&& e : r) {
+        n.emplace_back(top_k::results::value_type{toppartitions_item_key(e.item), e.count, e.error}); // r is const: the key is converted from e.item, nothing is moved out
+    }
+    return n;
 }

 toppartitions_query::toppartitions_query(distributed<database>& xdb, sstring ks, sstring cf,
@@ -108,20 +131,20 @@ future<> toppartitions_query::scatter() {
    return _query.start(std::ref(_xdb), _ks, _cf);
 }

-using top_t = toppartitions_data_listener::top_k::results;
+using top_t = toppartitions_data_listener::global_top_k::results;

 future<toppartitions_query::results> toppartitions_query::gather(unsigned res_size) {
    dblog.debug("toppartitions_query::gather");

    auto map = [res_size, this] (toppartitions_data_listener& listener) {
        dblog.trace("toppartitions_query::map_reduce with listener {}", &listener);
-        top_t rd = listener._top_k_read.top(res_size);
-        top_t wr = listener._top_k_write.top(res_size);
-        return std::tuple<top_t, top_t>{std::move(rd), std::move(wr)};
+        top_t rd = toppartitions_data_listener::globalize(listener._top_k_read.top(res_size));
+        top_t wr = toppartitions_data_listener::globalize(listener._top_k_write.top(res_size));
+        return make_foreign(std::make_unique<std::tuple<top_t, top_t>>(std::move(rd), std::move(wr)));
    };
-    auto reduce = [this] (results res, std::tuple<top_t, top_t> rd_wr) {
-        res.read.append(std::get<0>(rd_wr));
-        res.write.append(std::get<1>(rd_wr));
+    auto reduce = [this] (results res, foreign_ptr<std::unique_ptr<std::tuple<top_t, top_t>>> rd_wr) {
+        res.read.append(toppartitions_data_listener::localize(std::get<0>(*rd_wr)));
+        res.write.append(toppartitions_data_listener::localize(std::get<1>(*rd_wr)));
        return std::move(res);
    };
    return _query.map_reduce0(map, results{res_size}, reduce)
--- a/db/data_listeners.hh
+++ b/db/data_listeners.hh
@@ -24,12 +24,14 @@
 #include <seastar/core/distributed.hh>
 #include <seastar/core/future.hh>
 #include <seastar/core/distributed.hh>
+#include <seastar/core/weak_ptr.hh>

 #include "schema.hh"
 #include "flat_mutation_reader.hh"
 #include "mutation_reader.hh"
 #include "frozen_mutation.hh"
 #include "utils/top_k.hh"
+#include "schema_registry.hh"

 #include <vector>
 #include <set>
@@ -75,29 +77,54 @@ public:
 };


-struct toppartitons_item_key {
+struct toppartitions_item_key { // key of a top-k entry: a decorated partition key together with its schema
    schema_ptr schema;
    dht::decorated_key key;

-    toppartitons_item_key(const schema_ptr& schema, const dht::decorated_key& key) : schema(schema), key(key) {}
-    toppartitons_item_key(const toppartitons_item_key& key) noexcept : schema(key.schema), key(key.key) {}
+    toppartitions_item_key(const schema_ptr& schema, const dht::decorated_key& key) : schema(schema), key(key) {}
+    toppartitions_item_key(const toppartitions_item_key& key) noexcept : schema(key.schema), key(key.key) {} // hand-written member-wise copy, declared noexcept
 
    struct hash {
-        size_t operator()(const toppartitons_item_key& k) const {
+        size_t operator()(const toppartitions_item_key& k) const { // hashes the key's token only; the schema is not mixed in
            return std::hash<dht::token>()(k.key.token());
        }
    };

    struct comp {
-        bool operator()(const toppartitons_item_key& k1, const toppartitons_item_key& k2) const {
-            return k1.schema == k2.schema && k1.key.equal(*k2.schema, k2.key);
+        bool operator()(const toppartitions_item_key& k1, const toppartitions_item_key& k2) const { // equality predicate (returns true when equal), not an ordering
+            return k1.schema->id() == k2.schema->id() && k1.key.equal(*k2.schema, k2.key); // schemas are matched by id(), then the keys are compared under k2's schema
        }
    };

    explicit operator sstring() const;
 };

-class toppartitions_data_listener : public data_listener {
+// Counterpart of toppartitions_item_key that holds a global_schema_ptr instead of a schema_ptr, so it can be safely transported across shards
+struct toppartitions_global_item_key {
+    global_schema_ptr schema;
+    dht::decorated_key key;
+
+    toppartitions_global_item_key(toppartitions_item_key&& tik) : schema(std::move(tik.schema)), key(std::move(tik.key)) {} // consumes tik: both members are moved out of it
+    operator toppartitions_item_key() const { // implicit conversion back to toppartitions_item_key
+        return toppartitions_item_key(schema, key); // builds a new key from this object's schema and key; *this is left untouched
+    }
+
+    struct hash {
+        size_t operator()(const toppartitions_global_item_key& k) const { // same token-only hash as toppartitions_item_key::hash
+            return std::hash<dht::token>()(k.key.token());
+        }
+    };
+
+    struct comp {
+        bool operator()(const toppartitions_global_item_key& k1, const toppartitions_global_item_key& k2) const { // equality predicate (returns true when equal), not an ordering
+            return k1.schema.get()->id() == k2.schema.get()->id() && k1.key.equal(*k2.schema.get(), k2.key); // schemas are matched by id() via schema.get(), then the keys are compared under k2's schema
+        }
+    };
+
+    explicit operator sstring() const; // NOTE(review): no definition of this operator is visible in this patch - confirm one exists before using it
+};
+
+class toppartitions_data_listener : public data_listener, public weakly_referencable<toppartitions_data_listener> {
    friend class toppartitions_query;

    database& _db;
@@ -105,7 +132,11 @@ class toppartitions_data_listener : public data_listener {
    sstring _cf;

 public:
-    using top_k = utils::space_saving_top_k<toppartitons_item_key, toppartitons_item_key::hash, toppartitons_item_key::comp>;
+    using top_k = utils::space_saving_top_k<toppartitions_item_key, toppartitions_item_key::hash, toppartitions_item_key::comp>;
+    using global_top_k = utils::space_saving_top_k<toppartitions_global_item_key, toppartitions_global_item_key::hash, toppartitions_global_item_key::comp>;
+public:
+    static global_top_k::results globalize(top_k::results&& r);
+    static top_k::results localize(const global_top_k::results& r);
 private:
    top_k _top_k_read;
    top_k _top_k_write;
--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -118,8 +118,8 @@ future<> manager::stop() {

    return _draining_eps_gate.close().finally([this] {
        return parallel_for_each(_ep_managers, [] (auto& pair) {
-                return pair.second.stop();
-            }).finally([this] {
+            return pair.second.stop();
+        }).finally([this] {
            _ep_managers.clear();
            manager_logger.info("Stopped");
        }).discard_result();
@@ -240,6 +240,8 @@ future<> manager::end_point_hints_manager::stop(drain should_drain) noexcept {
 manager::end_point_hints_manager::end_point_hints_manager(const key_type& key, manager& shard_manager)
    : _key(key)
    , _shard_manager(shard_manager)
+    , _file_update_mutex_ptr(make_lw_shared<seastar::shared_mutex>())
+    , _file_update_mutex(*_file_update_mutex_ptr)
    , _state(state_set::of<state::stopped>())
    , _hints_dir(_shard_manager.hints_dir() / format("{}", _key).c_str())
    , _sender(*this, _shard_manager.local_storage_proxy(), _shard_manager.local_db(), _shard_manager.local_gossiper())
@@ -248,6 +250,8 @@ manager::end_point_hints_manager::end_point_hints_manager(const key_type& key, m
 manager::end_point_hints_manager::end_point_hints_manager(end_point_hints_manager&& other)
    : _key(other._key)
    , _shard_manager(other._shard_manager)
+    , _file_update_mutex_ptr(std::move(other._file_update_mutex_ptr))
+    , _file_update_mutex(*_file_update_mutex_ptr)
    , _state(other._state)
    , _hints_dir(std::move(other._hints_dir))
    , _sender(other._sender, *this)
@@ -520,28 +524,35 @@ void manager::drain_for(gms::inet_address endpoint) {
    manager_logger.trace("on_leave_cluster: {} is removed/decommissioned", endpoint);

    with_gate(_draining_eps_gate, [this, endpoint] {
-        return futurize_apply([this, endpoint] () {
-            if (utils::fb_utilities::is_me(endpoint)) {
-                return parallel_for_each(_ep_managers, [] (auto& pair) {
-                    return pair.second.stop(drain::yes).finally([&pair] {
-                        return remove_file(pair.second.hints_dir().c_str());
+        return with_semaphore(drain_lock(), 1, [this, endpoint] {
+            return futurize_apply([this, endpoint] () {
+                if (utils::fb_utilities::is_me(endpoint)) {
+                    return parallel_for_each(_ep_managers, [] (auto& pair) {
+                        return pair.second.stop(drain::yes).finally([&pair] {
+                            return with_file_update_mutex(pair.second, [&pair] {
+                                return remove_file(pair.second.hints_dir().c_str());
+                            });
+                        });
+                    }).finally([this] {
+                        _ep_managers.clear();
                    });
-                }).finally([this] {
-                    _ep_managers.clear();
-                });
-            } else {
-                ep_managers_map_type::iterator ep_manager_it = find_ep_manager(endpoint);
-                if (ep_manager_it != ep_managers_end()) {
-                    return ep_manager_it->second.stop(drain::yes).finally([this, endpoint, hints_dir = ep_manager_it->second.hints_dir()] {
-                        _ep_managers.erase(endpoint);
-                        return remove_file(hints_dir.c_str());
-                    });
-                }
+                } else {
+                    ep_managers_map_type::iterator ep_manager_it = find_ep_manager(endpoint);
+                    if (ep_manager_it != ep_managers_end()) {
+                        return ep_manager_it->second.stop(drain::yes).finally([this, endpoint, &ep_man = ep_manager_it->second] {
+                            return with_file_update_mutex(ep_man, [&ep_man] {
+                                return remove_file(ep_man.hints_dir().c_str());
+                            }).finally([this, endpoint] {
+                                _ep_managers.erase(endpoint);
+                            });
+                        });
+                    }

-                return make_ready_future<>();
-            }
-        }).handle_exception([endpoint] (auto eptr) {
-            manager_logger.error("Exception when draining {}: {}", endpoint, eptr);
+                    return make_ready_future<>();
+                }
+            }).handle_exception([endpoint] (auto eptr) {
+                manager_logger.error("Exception when draining {}: {}", endpoint, eptr);
+            });
        });
    });
 }
--- a/db/hints/manager.hh
+++ b/db/hints/manager.hh
@@ -276,7 +276,8 @@ public:
        manager& _shard_manager;
        hints_store_ptr _hints_store_anchor;
        seastar::gate _store_gate;
-        seastar::shared_mutex _file_update_mutex;
+        lw_shared_ptr<seastar::shared_mutex> _file_update_mutex_ptr;
+        seastar::shared_mutex& _file_update_mutex;

        enum class state {
            can_hint,               // hinting is currently allowed (used by the space_watchdog)
@@ -378,8 +379,20 @@ public:
            return _state.contains(state::stopped);
        }

-        seastar::shared_mutex& file_update_mutex() {
-            return _file_update_mutex;
+        /// \brief Safely runs a given functor under the file_update_mutex of \ref ep_man
+        ///
+        /// Runs a given functor under the file_update_mutex of the given end_point_hints_manager instance.
+        /// This function is safe even if \ref ep_man gets destroyed before the future this function returns resolves
+        /// (as long as the \ref func call itself is safe).
+        ///
+        /// \tparam Func Functor type.
+        /// \param ep_man end_point_hints_manager instance whose file_update_mutex we want to lock.
+        /// \param func Functor to run under the lock.
+        /// \return A future for whatever \ref func returns.
+        template <typename Func>
+        friend inline auto with_file_update_mutex(end_point_hints_manager& ep_man, Func&& func) {
+            lw_shared_ptr<seastar::shared_mutex> lock_ptr = ep_man._file_update_mutex_ptr; // take our own reference to the mutex so it does not depend on ep_man staying alive
+            return with_lock(*lock_ptr, std::forward<Func>(func)).finally([lock_ptr] {}); // the empty continuation exists only to keep lock_ptr alive until with_lock() has finished
        }

        const fs::path& hints_dir() const noexcept {
@@ -387,6 +400,10 @@ public:
        }

    private:
+        seastar::shared_mutex& file_update_mutex() noexcept { // raw access to the mutex, kept private; outside code locks it through with_file_update_mutex()
+            return _file_update_mutex; // reference member bound to *_file_update_mutex_ptr in the constructors
+        }
+
        /// \brief Creates a new hints store object.
        ///
        /// - Creates a hints store directory if doesn't exist: <shard_hints_dir>/<ep_key>
@@ -453,6 +470,7 @@ private:
    stats _stats;
    seastar::metrics::metric_groups _metrics;
    std::unordered_set<ep_key_type> _eps_with_pending_hints;
+    seastar::semaphore _drain_lock = {1};

 public:
    manager(sstring hints_directory, std::vector<sstring> hinted_dcs, int64_t max_hint_window_ms, resource_manager&res_manager, distributed<database>& db);
@@ -531,6 +549,10 @@ public:
        return _hints_dir_device_id;
    }

+    seastar::semaphore& drain_lock() noexcept {
+        return _drain_lock;
+    }
+
    void allow_hints();
    void forbid_hints();
    void forbid_hints_for_eps_with_pending_hints();
--- a/db/hints/resource_manager.cc
+++ b/db/hints/resource_manager.cc
@@ -89,16 +89,27 @@ future<> space_watchdog::stop() noexcept {
    return std::move(_started);
 }

+// Called under the end_point_hints_manager::file_update_mutex() of the corresponding end_point_hints_manager instance.
 future<> space_watchdog::scan_one_ep_dir(fs::path path, manager& shard_manager, ep_key_type ep_key) {
-    return lister::scan_dir(path, { directory_entry_type::regular }, [this, ep_key, &shard_manager] (fs::path dir, directory_entry de) {
-        // Put the current end point ID to state.eps_with_pending_hints when we see the second hints file in its directory
-        if (_files_count == 1) {
-            shard_manager.add_ep_with_pending_hints(ep_key);
-        }
-        ++_files_count;
+    return do_with(std::move(path), [this, ep_key, &shard_manager] (fs::path& path) {
+        // It may happen that we get here and the directory has already been deleted in the context of manager::drain_for().
+        // In this case simply bail out.
+        return engine().file_exists(path.native()).then([this, ep_key, &shard_manager, &path] (bool exists) {
+            if (!exists) {
+                return make_ready_future<>();
+            } else {
+                return lister::scan_dir(path, { directory_entry_type::regular }, [this, ep_key, &shard_manager] (fs::path dir, directory_entry de) {
+                    // Put the current end point ID to state.eps_with_pending_hints when we see the second hints file in its directory
+                    if (_files_count == 1) {
+                        shard_manager.add_ep_with_pending_hints(ep_key);
+                    }
+                    ++_files_count;

-        return io_check(file_size, (dir / de.name.c_str()).c_str()).then([this] (uint64_t fsize) {
-            _total_size += fsize;
+                    return io_check(file_size, (dir / de.name.c_str()).c_str()).then([this] (uint64_t fsize) {
+                        _total_size += fsize;
+                    });
+                });
+            }
        });
    });
 }
@@ -136,7 +147,7 @@ void space_watchdog::on_timer() {
                // continue to enumeration - there is no one to change them.
                auto it = shard_manager.find_ep_manager(de.name);
                if (it != shard_manager.ep_managers_end()) {
-                    return with_lock(it->second.file_update_mutex(), [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)]() mutable {
+                    return with_file_update_mutex(it->second, [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)] () mutable {
                        return scan_one_ep_dir(dir / ep_name, shard_manager, ep_key_type(ep_name));
                    });
                } else {
--- a/db/schema_features.hh
+++ b/db/schema_features.hh
@@ -26,11 +26,17 @@
 namespace db {

 enum class schema_feature {
-    VIEW_VIRTUAL_COLUMNS
+    VIEW_VIRTUAL_COLUMNS,
+
+    // When set, the schema digest is calculated in a way such that it doesn't change after all
+    // tombstones in an empty partition expire.
+    // See https://github.com/scylladb/scylla/issues/4485
+    DIGEST_INSENSITIVE_TO_EXPIRY,
 };

 using schema_features = enum_set<super_enum<schema_feature,
-    schema_feature::VIEW_VIRTUAL_COLUMNS
+    schema_feature::VIEW_VIRTUAL_COLUMNS,
+    schema_feature::DIGEST_INSENSITIVE_TO_EXPIRY
    >>;

 }
--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -587,9 +587,9 @@ future<utils::UUID> calculate_schema_digest(distributed<service::storage_proxy>&
            return mutations;
        });
    };
-    auto reduce = [] (auto& hash, auto&& mutations) {
+    auto reduce = [features] (auto& hash, auto&& mutations) {
        for (const mutation& m : mutations) {
-            feed_hash_for_schema_digest(hash, m);
+            feed_hash_for_schema_digest(hash, m, features);
        }
    };
    return do_with(md5_hasher(), all_table_names(features), [features, map, reduce] (auto& hash, auto& tables) {
@@ -778,6 +778,13 @@ mutation compact_for_schema_digest(const mutation& m) {
    return m_compacted;
 }

+void feed_hash_for_schema_digest(hasher& h, const mutation& m, schema_features features) { // feeds the compacted form of `m` into `h`
+    auto compacted = compact_for_schema_digest(m); // compact once; reused for both the emptiness check and the hash
+    if (!features.contains<schema_feature::DIGEST_INSENSITIVE_TO_EXPIRY>() || !compacted.partition().empty()) {
+        feed_hash(h, compacted); // skipped only when the feature is set and the compacted partition is empty
+    }
+}
+
 // Applies deletion of the "version" column to a system_schema.scylla_tables mutation.
 static void delete_schema_version(mutation& m) {
    if (m.column_family_id() != scylla_tables()->id()) {
@@ -2727,8 +2734,9 @@ namespace legacy {

 table_schema_version schema_mutations::digest() const {
    md5_hasher h;
-    db::schema_tables::feed_hash_for_schema_digest(h, _columnfamilies);
-    db::schema_tables::feed_hash_for_schema_digest(h, _columns);
+    const db::schema_features no_features;
+    db::schema_tables::feed_hash_for_schema_digest(h, _columnfamilies, no_features);
+    db::schema_tables::feed_hash_for_schema_digest(h, _columns, no_features);
    return utils::UUID_gen::get_name_UUID(h.finalize());
 }

--- a/db/schema_tables.hh
+++ b/db/schema_tables.hh
@@ -215,10 +215,7 @@ index_metadata_kind deserialize_index_kind(sstring kind);

 mutation compact_for_schema_digest(const mutation& m);

-template<typename Hasher>
-void feed_hash_for_schema_digest(Hasher& h, const mutation& m) {
-    feed_hash(h, compact_for_schema_digest(m));
-}
+void feed_hash_for_schema_digest(hasher&, const mutation&, schema_features);

 } // namespace schema_tables
 } // namespace db
--- a/db/size_estimates_virtual_reader.cc
+++ b/db/size_estimates_virtual_reader.cc
@@ -0,0 +1,328 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <boost/range/adaptor/indirected.hpp>
+#include <boost/range/adaptor/map.hpp>
+#include <boost/range/adaptor/transformed.hpp>
+#include <boost/range/algorithm/find_if.hpp>
+
+#include "clustering_bounds_comparator.hh"
+#include "database_fwd.hh"
+#include "db/system_keyspace.hh"
+#include "dht/i_partitioner.hh"
+#include "partition_range_compat.hh"
+#include "range.hh"
+#include "service/storage_service.hh"
+#include "mutation_fragment.hh"
+#include "sstables/sstables.hh"
+#include "db/timeout_clock.hh"
+#include "database.hh"
+
+#include "db/size_estimates_virtual_reader.hh"
+
+namespace db {
+
+namespace size_estimates {
+
+struct virtual_row {
+    const bytes& cf_name;
+    const token_range& tokens;
+    clustering_key_prefix as_key() const {
+        return clustering_key_prefix::from_exploded(std::vector<bytes_view>{cf_name, tokens.start, tokens.end});
+    }
+};
+
+struct virtual_row_comparator { // less-than comparator over clustering key prefixes and virtual rows (a row is compared via as_key())
+    schema_ptr _schema;
+    virtual_row_comparator(schema_ptr schema) : _schema(std::move(schema)) { } // sink parameter: move instead of copying the pointer
+    bool operator()(const clustering_key_prefix& key1, const clustering_key_prefix& key2) const {
+        return clustering_key_prefix::prefix_equality_less_compare(*_schema)(key1, key2);
+    }
+    bool operator()(const virtual_row& row, const clustering_key_prefix& key) const {
+        return operator()(row.as_key(), key);
+    }
+    bool operator()(const clustering_key_prefix& key, const virtual_row& row) const {
+        return operator()(key, row.as_key());
+    }
+};
+
+// Iterating over the cartesian product of cf_names and token_ranges.
+class virtual_row_iterator : public std::iterator<std::input_iterator_tag, const virtual_row> {
+    std::reference_wrapper<const std::vector<bytes>> _cf_names;
+    std::reference_wrapper<const std::vector<token_range>> _ranges;
+    size_t _cf_names_idx = 0;
+    size_t _ranges_idx = 0;
+public:
+    struct end_iterator_tag {};
+    virtual_row_iterator(const std::vector<bytes>& cf_names, const std::vector<token_range>& ranges)
+            : _cf_names(std::ref(cf_names))
+            , _ranges(std::ref(ranges))
+    { }
+    virtual_row_iterator(const std::vector<bytes>& cf_names, const std::vector<token_range>& ranges, end_iterator_tag)
+            : _cf_names(std::ref(cf_names))
+            , _ranges(std::ref(ranges))
+            , _cf_names_idx(cf_names.size())
+            , _ranges_idx(ranges.size())
+    {
+        if (cf_names.empty() || ranges.empty()) {
+            // The product of an empty range with any range is an empty range.
+            // In this case we want the end iterator to be equal to the begin iterator,
+            // which has _ranges_idx = _cf_names_idx = 0.
+            _ranges_idx = _cf_names_idx = 0;
+        }
+    }
+    virtual_row_iterator& operator++() {
+        if (++_ranges_idx == _ranges.get().size() && ++_cf_names_idx < _cf_names.get().size()) {
+            _ranges_idx = 0;
+        }
+        return *this;
+    }
+    virtual_row_iterator operator++(int) {
+        virtual_row_iterator i(*this);
+        ++(*this);
+        return i;
+    }
+    const value_type operator*() const {
+        return { _cf_names.get()[_cf_names_idx], _ranges.get()[_ranges_idx] };
+    }
+    bool operator==(const virtual_row_iterator& i) const {
+        return _cf_names_idx == i._cf_names_idx
+            && _ranges_idx == i._ranges_idx;
+    }
+    bool operator!=(const virtual_row_iterator& i) const {
+        return !(*this == i);
+    }
+};
+
+/**
+ * Returns the keyspaces, ordered by name, as selected by the partition_range.
+ */
+static std::vector<sstring> get_keyspaces(const schema& s, const database& db, dht::partition_range range) {
+    struct keyspace_less_comparator {
+        const schema& _s;
+        keyspace_less_comparator(const schema& s) : _s(s) { }
+        dht::ring_position as_ring_position(const sstring& ks) {
+            auto pkey = partition_key::from_single_value(_s, utf8_type->decompose(ks));
+            return dht::global_partitioner().decorate_key(_s, std::move(pkey));
+        }
+        bool operator()(const sstring& ks1, const sstring& ks2) {
+            return as_ring_position(ks1).less_compare(_s, as_ring_position(ks2));
+        }
+        bool operator()(const sstring& ks, const dht::ring_position& rp) {
+            return as_ring_position(ks).less_compare(_s, rp);
+        }
+        bool operator()(const dht::ring_position& rp, const sstring& ks) {
+            return rp.less_compare(_s, as_ring_position(ks));
+        }
+    };
+    auto keyspaces = db.get_non_system_keyspaces();
+    auto cmp = keyspace_less_comparator(s);
+    boost::sort(keyspaces, cmp);
+    return boost::copy_range<std::vector<sstring>>(
+        range.slice(keyspaces, std::move(cmp)) | boost::adaptors::filtered([&s] (const auto& ks) {
+            // If this is a range query, results are divided between shards by the partition key (keyspace_name).
+            return shard_of(dht::global_partitioner().get_token(s,
+                        partition_key::from_single_value(s, utf8_type->decompose(ks))))
+                == engine().cpu_id();
+        })
+    );
+}
+
+/**
+ * Makes a wrapping range of ring_position from a nonwrapping range of token, used to select sstables.
+ */
+static dht::partition_range as_ring_position_range(dht::token_range& r) {
+    std::optional<range<dht::ring_position>::bound> start_bound, end_bound;
+    if (r.start()) {
+        start_bound = {{ dht::ring_position(r.start()->value(), dht::ring_position::token_bound::start), r.start()->is_inclusive() }};
+    }
+    if (r.end()) {
+        end_bound = {{ dht::ring_position(r.end()->value(), dht::ring_position::token_bound::end), r.end()->is_inclusive() }};
+    }
+    return dht::partition_range(std::move(start_bound), std::move(end_bound), r.is_singular());
+}
+
+/**
+ * Builds the range_estimates for the specified range, considering the sstables associated with `cf`.
+ */
+static system_keyspace::range_estimates estimate(const column_family& cf, const token_range& r) {
+    int64_t count{0};
+    utils::estimated_histogram hist{0};
+    auto from_bytes = [] (auto& b) { // converts a serialized (text) token back into a dht::token
+        return dht::global_partitioner().from_sstring(utf8_type->to_string(b));
+    };
+    dht::token_range_vector ranges;
+    ::compat::unwrap_into( // collect the unwrapped pieces of the (possibly wrapping) token range into `ranges`
+        wrapping_range<dht::token>({{ from_bytes(r.start), false }}, {{ from_bytes(r.end) }}),
+        dht::token_comparator(),
+        [&] (auto&& rng) { ranges.push_back(std::move(rng)); });
+    for (auto&& unwrapped : ranges) { // distinct name: must not shadow the parameter `r`, which is used again below
+        auto rp_range = as_ring_position_range(unwrapped);
+        for (auto&& sstable : cf.select_sstables(rp_range)) {
+            count += sstable->estimated_keys_for_range(unwrapped);
+            hist.merge(sstable->get_stats_metadata().estimated_partition_size);
+        }
+    }
+    return {cf.schema(), r.start, r.end, count, count > 0 ? hist.mean() : 0}; // mean is reported as 0 when no keys were counted
+}
+
+future<std::vector<token_range>> get_local_ranges() { // resolves to the local primary ranges as text-encoded bounds, sorted by start
+    auto& ss = service::get_local_storage_service();
+    return ss.get_local_tokens().then([&ss] (auto&& tokens) {
+        auto ranges = ss.get_token_metadata().get_primary_ranges_for(std::move(tokens));
+        std::vector<token_range> local_ranges;
+        auto to_bytes = [](const std::optional<dht::token_range::bound>& b) {
+            assert(b); // every bound serialized below must be present
+            return utf8_type->decompose(dht::global_partitioner().to_sstring(b->value()));
+        };
+        // We merge the ranges to be compatible with how Cassandra shows its size estimates table.
+        // All queries will be on that table, where all entries are text and there's no notion of
+        // token ranges from the CQL point of view.
+        auto left_inf = boost::find_if(ranges, [] (auto&& r) {
+            return !r.start() || r.start()->value() == dht::minimum_token();
+        });
+        auto right_inf = boost::find_if(ranges, [] (auto&& r) {
+            return !r.end() || r.end()->value() == dht::maximum_token(); // test the end bound; start() may be empty here
+        });
+        if (left_inf != right_inf && left_inf != ranges.end() && right_inf != ranges.end()) {
+            local_ranges.push_back(token_range{to_bytes(right_inf->start()), to_bytes(left_inf->end())});
+            ranges.erase(left_inf); // NOTE(review): if `ranges` is a std::vector, right_inf stays valid only when it precedes left_inf - confirm
+            ranges.erase(right_inf);
+        }
+        for (auto&& r : ranges) {
+            local_ranges.push_back(token_range{to_bytes(r.start()), to_bytes(r.end())});
+        }
+        boost::sort(local_ranges, [] (auto&& tr1, auto&& tr2) {
+            return utf8_type->less(tr1.start, tr2.start);
+        });
+        return local_ranges;
+    });
+}
+
+size_estimates_mutation_reader::size_estimates_mutation_reader(schema_ptr schema, const dht::partition_range& prange, const query::partition_slice& slice, streamed_mutation::forwarding fwd)
+            : impl(schema)
+            , _schema(std::move(schema))
+            , _prange(&prange)
+            , _slice(slice)
+            , _fwd(fwd)
+    { }
+
+future<> size_estimates_mutation_reader::get_next_partition() {
+    auto& db = service::get_local_storage_proxy().get_db().local();
+    if (!_keyspaces) {
+        _keyspaces = get_keyspaces(*_schema, db, *_prange);
+        _current_partition = _keyspaces->begin();
+    }
+    if (_current_partition == _keyspaces->end()) {
+        _end_of_stream = true;
+        return make_ready_future<>();
+    }
+    return get_local_ranges().then([&db, this] (auto&& ranges) {
+        auto estimates = this->estimates_for_current_keyspace(db, std::move(ranges));
+        auto mutations = db::system_keyspace::make_size_estimates_mutation(*_current_partition, std::move(estimates));
+        ++_current_partition;
+        std::vector<mutation> ms;
+        ms.emplace_back(std::move(mutations));
+        _partition_reader = flat_mutation_reader_from_mutations(std::move(ms), _fwd);
+    });
+}
+
+future<> size_estimates_mutation_reader::fill_buffer(db::timeout_clock::time_point timeout) {
+    return do_until([this, timeout] { return is_end_of_stream() || is_buffer_full(); }, [this, timeout] {
+        if (!_partition_reader) {
+            return get_next_partition();
+        }
+        return _partition_reader->consume_pausable([this] (mutation_fragment mf) {
+            push_mutation_fragment(std::move(mf));
+            return stop_iteration(is_buffer_full());
+        }, timeout).then([this] {
+            if (_partition_reader->is_end_of_stream() && _partition_reader->is_buffer_empty()) {
+                _partition_reader = std::nullopt;
+            }
+        });
+    });
+}
+
+void size_estimates_mutation_reader::next_partition() {
+    clear_buffer_to_next_partition();
+    if (is_buffer_empty()) {
+        _partition_reader = std::nullopt;
+    }
+}
+
+future<> size_estimates_mutation_reader::fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) {
+    clear_buffer();
+    _prange = &pr;
+    _keyspaces = std::nullopt;
+    _partition_reader = std::nullopt;
+    _end_of_stream = false;
+    return make_ready_future<>();
+}
+
+future<> size_estimates_mutation_reader::fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) {
+    forward_buffer_to(pr.start());
+    _end_of_stream = false;
+    if (_partition_reader) {
+        return _partition_reader->fast_forward_to(std::move(pr), timeout);
+    }
+    return make_ready_future<>();
+}
+
+size_t size_estimates_mutation_reader::buffer_size() const {
+    if (_partition_reader) {
+        return flat_mutation_reader::impl::buffer_size() + _partition_reader->buffer_size();
+    }
+    return flat_mutation_reader::impl::buffer_size();
+}
+
+std::vector<db::system_keyspace::range_estimates>
+size_estimates_mutation_reader::estimates_for_current_keyspace(const database& db, std::vector<token_range> local_ranges) const {
+    // For each specified range, estimate (crudely) mean partition size and partitions count.
+    auto pkey = partition_key::from_single_value(*_schema, utf8_type->decompose(*_current_partition));
+    auto cfs = db.find_keyspace(*_current_partition).metadata()->cf_meta_data();
+    auto cf_names = boost::copy_range<std::vector<bytes>>(cfs | boost::adaptors::transformed([] (auto&& cf) {
+        return utf8_type->decompose(cf.first);
+    }));
+    boost::sort(cf_names, [] (auto&& n1, auto&& n2) {
+        return utf8_type->less(n1, n2);
+    });
+    std::vector<db::system_keyspace::range_estimates> estimates;
+    for (auto& range : _slice.row_ranges(*_schema, pkey)) {
+        auto rows = boost::make_iterator_range(
+                virtual_row_iterator(cf_names, local_ranges),
+                virtual_row_iterator(cf_names, local_ranges, virtual_row_iterator::end_iterator_tag()));
+        auto rows_to_estimate = range.slice(rows, virtual_row_comparator(_schema));
+        for (auto&& r : rows_to_estimate) {
+            auto& cf = db.find_column_family(*_current_partition, utf8_type->to_string(r.cf_name));
+            estimates.push_back(estimate(cf, r.tokens));
+            if (estimates.size() >= _slice.partition_row_limit()) {
+                return estimates;
+            }
+        }
+    }
+    return estimates;
+}
+
+} // namespace size_estimates
+
+} // namespace db
--- a/db/size_estimates_virtual_reader.hh
+++ b/db/size_estimates_virtual_reader.hh
@@ -21,33 +21,18 @@
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

-#include <boost/range/adaptor/indirected.hpp>
-#include <boost/range/adaptor/map.hpp>
-#include <boost/range/adaptor/transformed.hpp>
-#include <boost/range/algorithm/find_if.hpp>
-
-#include "clustering_bounds_comparator.hh"
-#include "database_fwd.hh"
-#include "db/system_keyspace.hh"
-#include "dht/i_partitioner.hh"
 #include "mutation_reader.hh"
-#include "partition_range_compat.hh"
-#include "range.hh"
-#include "service/storage_service.hh"
-#include "mutation_fragment.hh"
-#include "sstables/sstables.hh"
-#include "db/timeout_clock.hh"
-#include "database.hh"

 namespace db {

 namespace size_estimates {

+struct token_range {
+    bytes start;
+    bytes end;
+};
+
 class size_estimates_mutation_reader final : public flat_mutation_reader::impl {
-    struct token_range {
-        bytes start;
-        bytes end;
-    };
    schema_ptr _schema;
    const dht::partition_range* _prange;
    const query::partition_slice& _slice;
@@ -57,267 +42,18 @@ class size_estimates_mutation_reader final : public flat_mutation_reader::impl {
    streamed_mutation::forwarding _fwd;
    flat_mutation_reader_opt _partition_reader;
 public:
-    size_estimates_mutation_reader(schema_ptr schema, const dht::partition_range& prange, const query::partition_slice& slice, streamed_mutation::forwarding fwd)
-            : impl(schema)
-            , _schema(std::move(schema))
-            , _prange(&prange)
-            , _slice(slice)
-            , _fwd(fwd)
-    { }
+    size_estimates_mutation_reader(schema_ptr, const dht::partition_range&, const query::partition_slice&, streamed_mutation::forwarding);

+    virtual future<> fill_buffer(db::timeout_clock::time_point) override;
+    virtual void next_partition() override;
+    virtual future<> fast_forward_to(const dht::partition_range&, db::timeout_clock::time_point) override;
+    virtual future<> fast_forward_to(position_range, db::timeout_clock::time_point) override;
+    virtual size_t buffer_size() const override;
 private:
-    future<> get_next_partition() {
-        // For each specified range, estimate (crudely) mean partition size and partitions count.
-        auto& db = service::get_local_storage_proxy().get_db().local();
-        if (!_keyspaces) {
-            _keyspaces = get_keyspaces(*_schema, db, *_prange);
-            _current_partition = _keyspaces->begin();
-        }
-        if (_current_partition == _keyspaces->end()) {
-            _end_of_stream = true;
-            return make_ready_future<>();
-        }
-        return get_local_ranges().then([&db, this] (auto&& ranges) {
-            auto estimates = this->estimates_for_current_keyspace(db, std::move(ranges));
-            auto mutations = db::system_keyspace::make_size_estimates_mutation(*_current_partition, std::move(estimates));
-            ++_current_partition;
-            std::vector<mutation> ms;
-            ms.emplace_back(std::move(mutations));
-            _partition_reader = flat_mutation_reader_from_mutations(std::move(ms), _fwd);
-        });
-    }
-public:
-    virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override {
-        return do_until([this, timeout] { return is_end_of_stream() || is_buffer_full(); }, [this, timeout] {
-            if (!_partition_reader) {
-                return get_next_partition();
-            }
-            return _partition_reader->consume_pausable([this] (mutation_fragment mf) {
-                push_mutation_fragment(std::move(mf));
-                return stop_iteration(is_buffer_full());
-            }, timeout).then([this] {
-                if (_partition_reader->is_end_of_stream() && _partition_reader->is_buffer_empty()) {
-                    _partition_reader = std::nullopt;
-                }
-            });
-        });
-    }
-    virtual void next_partition() override {
-        clear_buffer_to_next_partition();
-        if (is_buffer_empty()) {
-            _partition_reader = std::nullopt;
-        }
-    }
-    virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override {
-        clear_buffer();
-        _prange = &pr;
-        _keyspaces = std::nullopt;
-        _partition_reader = std::nullopt;
-        _end_of_stream = false;
-        return make_ready_future<>();
-    }
-    virtual future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) override {
-        forward_buffer_to(pr.start());
-        _end_of_stream = false;
-        if (_partition_reader) {
-            return _partition_reader->fast_forward_to(std::move(pr), timeout);
-        }
-        return make_ready_future<>();
-    }
-    virtual size_t buffer_size() const override {
-        if (_partition_reader) {
-            return flat_mutation_reader::impl::buffer_size() + _partition_reader->buffer_size();
-        }
-        return flat_mutation_reader::impl::buffer_size();
-    }
-    /**
-     * Returns the primary ranges for the local node.
-     * Used for testing as well.
-     */
-    static future<std::vector<token_range>> get_local_ranges() {
-        auto& ss = service::get_local_storage_service();
-        return ss.get_local_tokens().then([&ss] (auto&& tokens) {
-            auto ranges = ss.get_token_metadata().get_primary_ranges_for(std::move(tokens));
-            std::vector<token_range> local_ranges;
-            auto to_bytes = [](const std::optional<dht::token_range::bound>& b) {
-                assert(b);
-                return utf8_type->decompose(dht::global_partitioner().to_sstring(b->value()));
-            };
-            // We merge the ranges to be compatible with how Cassandra shows it's size estimates table.
-            // All queries will be on that table, where all entries are text and there's no notion of
-            // token ranges form the CQL point of view.
-            auto left_inf = boost::find_if(ranges, [] (auto&& r) {
-                return !r.start() || r.start()->value() == dht::minimum_token();
-            });
-            auto right_inf = boost::find_if(ranges, [] (auto&& r) {
-                return !r.end() || r.start()->value() == dht::maximum_token();
-            });
-            if (left_inf != right_inf && left_inf != ranges.end() && right_inf != ranges.end()) {
-                local_ranges.push_back(token_range{to_bytes(right_inf->start()), to_bytes(left_inf->end())});
-                ranges.erase(left_inf);
-                ranges.erase(right_inf);
-            }
-            for (auto&& r : ranges) {
-                local_ranges.push_back(token_range{to_bytes(r.start()), to_bytes(r.end())});
-            }
-            boost::sort(local_ranges, [] (auto&& tr1, auto&& tr2) {
-                return utf8_type->less(tr1.start, tr2.start);
-            });
-            return local_ranges;
-        });
-    }
-private:
-    struct virtual_row {
-        const bytes& cf_name;
-        const token_range& tokens;
-        clustering_key_prefix as_key() const {
-            return clustering_key_prefix::from_exploded(std::vector<bytes_view>{cf_name, tokens.start, tokens.end});
-        }
-    };
-    struct virtual_row_comparator {
-        schema_ptr _schema;
-        virtual_row_comparator(schema_ptr schema) : _schema(schema) { }
-        bool operator()(const clustering_key_prefix& key1, const clustering_key_prefix& key2) {
-            return clustering_key_prefix::prefix_equality_less_compare(*_schema)(key1, key2);
-        }
-        bool operator()(const virtual_row& row, const clustering_key_prefix& key) {
-            return operator()(row.as_key(), key);
-        }
-        bool operator()(const clustering_key_prefix& key, const virtual_row& row) {
-            return operator()(key, row.as_key());
-        }
-    };
-    class virtual_row_iterator : public std::iterator<std::input_iterator_tag, const virtual_row> {
-        std::reference_wrapper<const std::vector<bytes>> _cf_names;
-        std::reference_wrapper<const std::vector<token_range>> _ranges;
-        size_t _cf_names_idx = 0;
-        size_t _ranges_idx = 0;
-    public:
-        struct end_iterator_tag {};
-        virtual_row_iterator(const std::vector<bytes>& cf_names, const std::vector<token_range>& ranges)
-                : _cf_names(std::ref(cf_names))
-                , _ranges(std::ref(ranges))
-        { }
-        virtual_row_iterator(const std::vector<bytes>& cf_names, const std::vector<token_range>& ranges, end_iterator_tag)
-                : _cf_names(std::ref(cf_names))
-                , _ranges(std::ref(ranges))
-                , _cf_names_idx(cf_names.size())
-                , _ranges_idx(ranges.size())
-        { }
-        virtual_row_iterator& operator++() {
-            if (++_ranges_idx == _ranges.get().size() && ++_cf_names_idx < _cf_names.get().size()) {
-                _ranges_idx = 0;
-            }
-            return *this;
-        }
-        virtual_row_iterator operator++(int) {
-            virtual_row_iterator i(*this);
-            ++(*this);
-            return i;
-        }
-        const value_type operator*() const {
-            return { _cf_names.get()[_cf_names_idx], _ranges.get()[_ranges_idx] };
-        }
-        bool operator==(const virtual_row_iterator& i) const {
-            return _cf_names_idx == i._cf_names_idx
-                && _ranges_idx == i._ranges_idx;
-        }
-        bool operator!=(const virtual_row_iterator& i) const {
-            return !(*this == i);
-        }
-    };
+    future<> get_next_partition();

    std::vector<db::system_keyspace::range_estimates>
-    estimates_for_current_keyspace(const database& db, std::vector<token_range> local_ranges) const {
-        auto pkey = partition_key::from_single_value(*_schema, utf8_type->decompose(*_current_partition));
-        auto cfs = db.find_keyspace(*_current_partition).metadata()->cf_meta_data();
-        auto cf_names = boost::copy_range<std::vector<bytes>>(cfs | boost::adaptors::transformed([] (auto&& cf) {
-            return utf8_type->decompose(cf.first);
-        }));
-        boost::sort(cf_names, [] (auto&& n1, auto&& n2) {
-            return utf8_type->less(n1, n2);
-        });
-        std::vector<db::system_keyspace::range_estimates> estimates;
-        for (auto& range : _slice.row_ranges(*_schema, pkey)) {
-            auto rows = boost::make_iterator_range(
-                    virtual_row_iterator(cf_names, local_ranges),
-                    virtual_row_iterator(cf_names, local_ranges, virtual_row_iterator::end_iterator_tag()));
-            auto rows_to_estimate = range.slice(rows, virtual_row_comparator(_schema));
-            for (auto&& r : rows_to_estimate) {
-                auto& cf = db.find_column_family(*_current_partition, utf8_type->to_string(r.cf_name));
-                estimates.push_back(estimate(cf, r.tokens));
-                if (estimates.size() >= _slice.partition_row_limit()) {
-                    return estimates;
-                }
-            }
-        }
-        return estimates;
-    }
-
-    /**
-     * Returns the keyspaces, ordered by name, as selected by the partition_range.
-     */
-    static ks_range get_keyspaces(const schema& s, const database& db, dht::partition_range range) {
-        struct keyspace_less_comparator {
-            const schema& _s;
-            keyspace_less_comparator(const schema& s) : _s(s) { }
-            dht::ring_position as_ring_position(const sstring& ks) {
-                auto pkey = partition_key::from_single_value(_s, utf8_type->decompose(ks));
-                return dht::global_partitioner().decorate_key(_s, std::move(pkey));
-            }
-            bool operator()(const sstring& ks1, const sstring& ks2) {
-                return as_ring_position(ks1).less_compare(_s, as_ring_position(ks2));
-            }
-            bool operator()(const sstring& ks, const dht::ring_position& rp) {
-                return as_ring_position(ks).less_compare(_s, rp);
-            }
-            bool operator()(const dht::ring_position& rp, const sstring& ks) {
-                return rp.less_compare(_s, as_ring_position(ks));
-            }
-        };
-        auto keyspaces = db.get_non_system_keyspaces();
-        auto cmp = keyspace_less_comparator(s);
-        boost::sort(keyspaces, cmp);
-        return boost::copy_range<ks_range>(range.slice(keyspaces, std::move(cmp)));
-    }
-
-    /**
-     * Makes a wrapping range of ring_position from a nonwrapping range of token, used to select sstables.
-     */
-    static dht::partition_range as_ring_position_range(dht::token_range& r) {
-        std::optional<range<dht::ring_position>::bound> start_bound, end_bound;
-        if (r.start()) {
-            start_bound = {{ dht::ring_position(r.start()->value(), dht::ring_position::token_bound::start), r.start()->is_inclusive() }};
-        }
-        if (r.end()) {
-            end_bound = {{ dht::ring_position(r.end()->value(), dht::ring_position::token_bound::end), r.end()->is_inclusive() }};
-        }
-        return dht::partition_range(std::move(start_bound), std::move(end_bound), r.is_singular());
-    }
-
-    /**
-     * Add a new range_estimates for the specified range, considering the sstables associated with `cf`.
-     */
-    static system_keyspace::range_estimates estimate(const column_family& cf, const token_range& r) {
-        int64_t count{0};
-        utils::estimated_histogram hist{0};
-        auto from_bytes = [] (auto& b) {
-            return dht::global_partitioner().from_sstring(utf8_type->to_string(b));
-        };
-        dht::token_range_vector ranges;
-        ::compat::unwrap_into(
-            wrapping_range<dht::token>({{ from_bytes(r.start), false }}, {{ from_bytes(r.end) }}),
-            dht::token_comparator(),
-            [&] (auto&& rng) { ranges.push_back(std::move(rng)); });
-        for (auto&& r : ranges) {
-            auto rp_range = as_ring_position_range(r);
-            for (auto&& sstable : cf.select_sstables(rp_range)) {
-                count += sstable->estimated_keys_for_range(r);
-                hist.merge(sstable->get_stats_metadata().estimated_partition_size);
-            }
-        }
-        return {cf.schema(), r.start, r.end, count, count > 0 ? hist.mean() : 0};
-    }
+    estimates_for_current_keyspace(const database&, std::vector<token_range> local_ranges) const;
 };

 struct virtual_reader {
@@ -332,6 +68,12 @@ struct virtual_reader {
    }
 };

+/**
+ * Returns the primary ranges for the local node.
+ * Used for testing as well.
+ */
+future<std::vector<token_range>> get_local_ranges();
+
 } // namespace size_estimates

 } // namespace db
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -143,10 +143,9 @@ void view_info::initialize_base_dependent_fields(const schema& base) {
 }

 bool view_info::is_index() const {
-    if (!_is_index) {
-        _is_index = service::get_local_storage_service().db().local().find_column_family(base_id()).get_index_manager().is_index(_schema);
-    }
-    return *_is_index;
+    //TODO(sarna): result of this call can be cached instead of calling index_manager::is_index every time
+    column_family& base_cf = service::get_local_storage_service().db().local().find_column_family(base_id());
+    return base_cf.get_index_manager().is_index(view_ptr(_schema.shared_from_this()));
 }

 namespace db {
@@ -1158,6 +1157,10 @@ future<> view_builder::stop() {
        return _sem.wait().then([this] {
            _sem.broken();
            return _build_step.join();
+        }).handle_exception_type([] (const broken_semaphore&) {
+            // ignored
+        }).handle_exception_type([] (const semaphore_timed_out&) {
+            // ignored
        });
    });
 }
--- a/db/view/view_update_generator.cc
+++ b/db/view/view_update_generator.cc
@@ -24,7 +24,9 @@
 namespace db::view {

 future<> view_update_generator::start() {
-    _started = seastar::async([this]() mutable {
+    thread_attributes attr;
+    attr.sched_group = _db.get_streaming_scheduling_group();
+    _started = seastar::async(std::move(attr), [this]() mutable {
        while (!_as.abort_requested()) {
            if (_sstables_with_tables.empty()) {
                _pending_sstables.wait().get();
--- a/dist/ami/build_ami.sh
+++ b/dist/ami/build_ami.sh
@@ -1,6 +1,7 @@
 #!/bin/bash -e

-PRODUCT=$(cat SCYLLA-PRODUCT-FILE)
+./SCYLLA-VERSION-GEN
+PRODUCT=$(cat build/SCYLLA-PRODUCT-FILE)

 if [ ! -e dist/ami/build_ami.sh ]; then
    echo "run build_ami.sh in top of scylla dir"
@@ -16,6 +17,7 @@ print_usage() {
    exit 1
 }
 LOCALRPM=0
+REPO_FOR_INSTALL=
 while [ $# -gt 0 ]; do
    case "$1" in
        "--localrpm")
@@ -23,10 +25,12 @@ while [ $# -gt 0 ]; do
            shift 1
            ;;
        "--repo")
+            REPO_FOR_INSTALL=$2
            INSTALL_ARGS="$INSTALL_ARGS --repo $2"
            shift 2
            ;;
        "--repo-for-install")
+            REPO_FOR_INSTALL=$2
            INSTALL_ARGS="$INSTALL_ARGS --repo-for-install $2"
            shift 2
            ;;
@@ -123,6 +127,43 @@ if [ $LOCALRPM -eq 1 ]; then
        cd ../..
        cp build/$PRODUCT-ami/build/RPMS/noarch/$PRODUCT-ami-`cat build/$PRODUCT-ami/build/SCYLLA-VERSION-FILE`-`cat build/$PRODUCT-ami/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/$PRODUCT-ami.noarch.rpm
    fi
+    if [ ! -f dist/ami/files/$PRODUCT-python3.x86_64.rpm ]; then
+        reloc/python3/build_reloc.sh
+        reloc/python3/build_rpm.sh
+        cp build/redhat/RPMS/x86_64/$PRODUCT-python3*.x86_64.rpm dist/ami/files/$PRODUCT-python3.x86_64.rpm
+    fi
+
+    SCYLLA_VERSION=$(rpm -q --qf %{VERSION}-%{RELEASE} dist/ami/files/$PRODUCT.x86_64.rpm || true)
+    SCYLLA_AMI_VERSION=$(rpm -q --qf %{VERSION}-%{RELEASE} dist/ami/files/$PRODUCT-ami.noarch.rpm || true)
+    SCYLLA_JMX_VERSION=$(rpm -q --qf %{VERSION}-%{RELEASE} dist/ami/files/$PRODUCT-jmx.noarch.rpm || true)
+    SCYLLA_TOOLS_VERSION=$(rpm -q --qf %{VERSION}-%{RELEASE} dist/ami/files/$PRODUCT-tools.noarch.rpm || true)
+    SCYLLA_PYTHON3_VERSION=$(rpm -q --qf %{VERSION}-%{RELEASE} dist/ami/files/$PRODUCT-python3.x86_64.rpm || true)
+else
+    if [ -z "$REPO_FOR_INSTALL" ]; then
+        print_usage
+        exit 1
+    fi
+    if [ ! -f /usr/bin/yumdownloader ]; then
+        if is_redhat_variant; then
+            sudo yum install /usr/bin/yumdownloader
+        else
+            sudo apt-get install yum-utils
+        fi
+    fi
+    if [ ! -f /usr/bin/curl ]; then
+        pkg_install curl
+    fi
+    TMPREPO=$(mktemp -u -p /etc/yum.repos.d/ --suffix .repo)
+    sudo curl -o $TMPREPO $REPO_FOR_INSTALL
+    rm -rf build/ami_packages
+    mkdir -p build/ami_packages
+    yumdownloader --downloaddir build/ami_packages/ $PRODUCT $PRODUCT-kernel-conf $PRODUCT-conf $PRODUCT-server $PRODUCT-debuginfo $PRODUCT-ami $PRODUCT-jmx $PRODUCT-tools-core $PRODUCT-tools $PRODUCT-python3
+    sudo rm -f $TMPREPO
+    SCYLLA_VERSION=$(rpm -q --qf %{VERSION}-%{RELEASE} build/ami_packages/$PRODUCT-[0-9]*.rpm || true)
+    SCYLLA_AMI_VERSION=$(rpm -q --qf %{VERSION}-%{RELEASE} build/ami_packages/$PRODUCT-ami-*.rpm || true)
+    SCYLLA_JMX_VERSION=$(rpm -q --qf %{VERSION}-%{RELEASE} build/ami_packages/$PRODUCT-jmx-*.rpm || true)
+    SCYLLA_TOOLS_VERSION=$(rpm -q --qf %{VERSION}-%{RELEASE} build/ami_packages/$PRODUCT-tools-[0-9]*.rpm || true)
+    SCYLLA_PYTHON3_VERSION=$(rpm -q --qf %{VERSION}-%{RELEASE} build/ami_packages/$PRODUCT-python3-*.rpm || true)
 fi

 cd dist/ami
@@ -147,4 +188,4 @@ if [ ! -d packer ]; then
    cd -
 fi

-env PACKER_LOG=1 PACKER_LOG_PATH=../../build/ami.log packer/packer build -var-file=variables.json -var install_args="$INSTALL_ARGS" -var region="$REGION" -var source_ami="$AMI" -var ssh_username="$SSH_USERNAME" scylla.json
+env PACKER_LOG=1 PACKER_LOG_PATH=../../build/ami.log packer/packer build -var-file=variables.json -var install_args="$INSTALL_ARGS" -var region="$REGION" -var source_ami="$AMI" -var ssh_username="$SSH_USERNAME" -var scylla_version="$SCYLLA_VERSION" -var scylla_ami_version="$SCYLLA_AMI_VERSION" -var scylla_jmx_version="$SCYLLA_JMX_VERSION" -var scylla_tools_version="$SCYLLA_TOOLS_VERSION" -var scylla_python3_version="$SCYLLA_PYTHON3_VERSION" scylla.json
--- a/dist/ami/scylla.json
+++ b/dist/ami/scylla.json
@@ -56,7 +56,15 @@
      "ssh_username": "{{user `ssh_username`}}",
      "subnet_id": "{{user `subnet_id`}}",
      "type": "amazon-ebs",
-      "user_data_file": "user_data.txt"
+      "user_data_file": "user_data.txt",
+      "ami_description": "scylla-{{user `scylla_version`}} scylla-ami-{{user `scylla_ami_version`}} scylla-jmx-{{user `scylla_jmx_version`}} scylla-tools-{{user `scylla_tools_version`}} scylla-python3-{{user `scylla_python3_version`}}",
+      "tags": {
+          "ScyllaVersion": "{{user `scylla_version`}}",
+          "ScyllaAMIVersion": "{{user `scylla_ami_version`}}",
+          "ScyllaJMXVersion": "{{user `scylla_jmx_version`}}",
+          "ScyllaToolsVersion": "{{user `scylla_tools_version`}}",
+          "ScyllaPython3Version": "{{user `scylla_python3_version`}}"
+      }
    }
  ],
  "provisioners": [
--- a/dist/common/scripts/scylla_io_setup
+++ b/dist/common/scripts/scylla_io_setup
@@ -60,6 +60,17 @@ if __name__ == "__main__":
                disk_properties["read_bandwidth"] = 2015342735 * nr_disks
                disk_properties["write_iops"] = 181500 * nr_disks
                disk_properties["write_bandwidth"] = 808775652 * nr_disks
+            elif idata.instance_class() == "i3en":
+                if idata.instance() in ("i3en.large", "i3en.xlarge", "i3en.2xlarge"):  # values below are not scaled by nr_disks
+                    disk_properties["read_iops"] = 46489
+                    disk_properties["read_bandwidth"] = 353437280
+                    disk_properties["write_iops"] = 36680
+                    disk_properties["write_bandwidth"] = 164766656
+                else:
+                    disk_properties["read_iops"] = 278478 * nr_disks
+                    disk_properties["read_bandwidth"] = 3029172992 * nr_disks
+                    disk_properties["write_iops"] = 221909 * nr_disks
+                    disk_properties["write_bandwidth"] = 1020482432 * nr_disks
            elif idata.instance_class() == "i2":
                disk_properties["read_iops"] = 64000 * nr_disks
                disk_properties["read_bandwidth"] = 507338935 * nr_disks
--- a/dist/common/scripts/scylla_setup
+++ b/dist/common/scripts/scylla_setup
@@ -95,6 +95,9 @@ def do_verify_package(pkg):
        res = run('rpm -q {}'.format(pkg), silent=True, exception=False)
    elif is_gentoo_variant():
        res = 0 if len(glob.glob('/var/db/pkg/*/{}-*'.format(pkg))) else 1
+    else:
+        print("OS variant not recognized")
+        res = 1
    if res != 0:
        print('{} package is not installed.'.format(pkg))
        sys.exit(1)
@@ -252,22 +255,22 @@ if __name__ == '__main__':
    if not os.path.exists('/etc/scylla.d/housekeeping.cfg'):
        version_check = interactive_ask_service('Do you want to enable Scylla to check if there is a newer version of Scylla available?', 'Yes - start the Scylla-housekeeping service to check for a newer version. This check runs periodically. No - skips this step.', version_check)
        args.no_version_check = not version_check
-    if version_check:
-        with open('/etc/scylla.d/housekeeping.cfg', 'w') as f:
-            f.write('[housekeeping]\ncheck-version: True\n')
-        if is_systemd():
-            systemd_unit('scylla-housekeeping-daily.timer').unmask()
-            systemd_unit('scylla-housekeeping-restart.timer').unmask()
-    else:
-        with open('/etc/scylla.d/housekeeping.cfg', 'w') as f:
-            f.write('[housekeeping]\ncheck-version: False\n')
-        if is_systemd():
-            hk_daily = systemd_unit('scylla-housekeeping-daily.timer')
-            hk_daily.mask()
-            hk_daily.stop()
-            hk_restart = systemd_unit('scylla-housekeeping-restart.timer')
-            hk_restart.mask()
-            hk_restart.stop()
+        if version_check:
+            with open('/etc/scylla.d/housekeeping.cfg', 'w') as f:
+                f.write('[housekeeping]\ncheck-version: True\n')
+            if is_systemd():
+                systemd_unit('scylla-housekeeping-daily.timer').unmask()
+                systemd_unit('scylla-housekeeping-restart.timer').unmask()
+        else:
+            with open('/etc/scylla.d/housekeeping.cfg', 'w') as f:
+                f.write('[housekeeping]\ncheck-version: False\n')
+            if is_systemd():
+                hk_daily = systemd_unit('scylla-housekeeping-daily.timer')
+                hk_daily.mask()
+                hk_daily.stop()
+                hk_restart = systemd_unit('scylla-housekeeping-restart.timer')
+                hk_restart.mask()
+                hk_restart.stop()

    cur_version=out('scylla --version', exception=False)
    if len(cur_version) > 0:
--- a/dist/common/scripts/scylla_util.py
+++ b/dist/common/scripts/scylla_util.py
@@ -119,7 +119,7 @@ class aws_instance:
        return self._type.split(".")[0]

    def is_supported_instance_class(self):
-        if self.instance_class() in ['i2', 'i3']:
+        if self.instance_class() in ['i2', 'i3', 'i3en']:
            return True
        return False

@@ -128,7 +128,7 @@ class aws_instance:
        instance_size = self.instance_size()
        if instance_class in ['c3', 'c4', 'd2', 'i2', 'r3']:
            return 'ixgbevf'
-        if instance_class in ['c5', 'c5d', 'f1', 'g3', 'h1', 'i3', 'm5', 'm5d', 'p2', 'p3', 'r4', 'x1']:
+        if instance_class in ['c5', 'c5d', 'f1', 'g3', 'h1', 'i3', 'i3en', 'm5', 'm5d', 'p2', 'p3', 'r4', 'x1']:
            return 'ena'
        if instance_class == 'm4':
            if instance_size == '16xlarge':
@@ -304,7 +304,7 @@ def parse_os_release_line(line):
    val = shlex.split(data)[0]
    return (id, val.split(' ') if id == 'ID' or id == 'ID_LIKE' else val)

-os_release = dict([parse_os_release_line(x) for x in open('/etc/os-release').read().splitlines()])
+os_release = dict([parse_os_release_line(x) for x in open('/etc/os-release').read().splitlines() if re.match(r'\w+=', x) ])

 def is_debian_variant():
    d = os_release['ID_LIKE'] if 'ID_LIKE' in os_release else os_release['ID']
@@ -313,7 +313,7 @@ def is_debian_variant():

 def is_redhat_variant():
    d = os_release['ID_LIKE'] if 'ID_LIKE' in os_release else os_release['ID']
-    return ('rhel' in d) or ('fedora' in d)
+    return ('rhel' in d) or ('fedora' in d) or ('ol' in d)

 def is_gentoo_variant():
    return ('gentoo' in os_release['ID'])
--- a/dist/debian/control.mustache
+++ b/dist/debian/control.mustache
@@ -16,7 +16,7 @@ Conflicts: {{product}}-server (<< 1.1)

 Package: {{product}}-server
 Architecture: amd64
-Depends: ${shlibs:Depends}, ${misc:Depends}, adduser, hwloc-nox, {{product}}-conf, python-yaml, python-urwid, python-requests, curl, util-linux, python3-yaml, python3, uuid-runtime, pciutils, python3-pyudev, gzip, realpath | coreutils, num-utils, file
+Depends: ${shlibs:Depends}, ${misc:Depends}, adduser, hwloc-nox, {{product}}-conf, {{product}}-python3, curl, util-linux, uuid-runtime, pciutils, gzip, realpath | coreutils, num-utils, file
 Description: Scylla database server binaries 
 Scylla is a highly scalable, eventually consistent, distributed,
 partitioned row DB.
--- a/dist/debian/debian/adjust_bin
+++ b/dist/debian/debian/adjust_bin
@@ -0,0 +1,30 @@
+#!/bin/bash -ex
+
+root="$1"
+bin="$2"
+prefix="/opt/scylladb"
+
+[ "$bin" = patchelf ] && exit 0
+
+patchelf() {
+    # patchelf comes from the build system, so it needs the build system's ld.so and
+    # shared libraries. We can't use patchelf on patchelf itself, so invoke it via
+    # ld.so, with the staged libreloc directory as the library search path.
+    LD_LIBRARY_PATH="$root/$prefix/libreloc" "$root/$prefix"/libreloc/ld.so "$root/$prefix"/libexec/patchelf "$@"
+}
+
+# We could add --set-rpath too, but then debugedit (called by rpmbuild) barfs
+# on the result. So use LD_LIBRARY_PATH in the thunk, below.
+patchelf \
+    --set-interpreter "$prefix/libreloc/ld.so" \
+    "$root/$prefix/libexec/$bin"
+mkdir -p "$root/$prefix/bin"
+cat > "$root/$prefix/bin/$bin" <<EOF
+#!/bin/bash -e
+export GNUTLS_SYSTEM_PRIORITY_FILE="\${GNUTLS_SYSTEM_PRIORITY_FILE-$prefix/libreloc/gnutls.config}"
+export LD_LIBRARY_PATH="$prefix/libreloc"
+exec -a "\$0" "$prefix/libexec/$bin" "\$@"
+EOF
+
+chmod +x "$root/$prefix/bin/$bin"
+
--- a/dist/debian/python3/build_deb.sh
+++ b/dist/debian/python3/build_deb.sh
@@ -0,0 +1,140 @@
+#!/bin/bash -e
+
+PRODUCT=$(cat SCYLLA-PRODUCT-FILE)
+
+. /etc/os-release
+print_usage() {
+    echo "build_deb.sh --reloc-pkg build/release/scylla-python3-package.tar.gz"
+    echo "  --reloc-pkg specify relocatable package path"
+    exit 1
+}
+
+TARGET=stable
+RELOC_PKG=
+while [ $# -gt 0 ]; do
+    case "$1" in
+        "--reloc-pkg")
+            RELOC_PKG=$2
+            shift 2
+            ;;
+        *)
+            print_usage
+            ;;
+    esac
+done
+
+is_redhat_variant() {
+    [ -f /etc/redhat-release ]
+}
+is_debian_variant() {
+    [ -f /etc/debian_version ]
+}
+pkg_install() {
+    if is_redhat_variant; then
+        sudo yum install -y $1
+    elif is_debian_variant; then
+        sudo apt-get install -y $1
+    else
+        echo "Requires to install following command: $1"
+        exit 1
+    fi
+}
+
+if [ ! -e SCYLLA-RELOCATABLE-FILE ]; then
+    echo "do not directly execute build_deb.sh, use reloc/build_deb.sh instead."
+    exit 1
+fi
+
+if [ "$(arch)" != "x86_64" ]; then
+    echo "Unsupported architecture: $(arch)"
+    exit 1
+fi
+
+if [ -z "$RELOC_PKG" ]; then
+    print_usage
+    exit 1
+fi
+if [ ! -f "$RELOC_PKG" ]; then
+    echo "$RELOC_PKG is not found."
+    exit 1
+fi
+
+if [ -e debian ]; then
+    rm -rf debian
+fi
+if is_debian_variant; then
+    sudo apt-get -y update
+fi
+# This hack is needed because some environments install the 'git-core' package, which is
+# only a subset of the git command and doesn't work with our git-archive-all script.
+if is_redhat_variant && [ ! -f /usr/libexec/git-core/git-submodule ]; then
+    sudo yum install -y git
+fi
+if [ ! -f /usr/bin/git ]; then
+    pkg_install git
+fi
+if [ ! -f /usr/bin/python ]; then
+    pkg_install python
+fi
+if [ ! -f /usr/bin/debuild ]; then
+    pkg_install devscripts
+fi
+if [ ! -f /usr/bin/dh_testdir ]; then
+    pkg_install debhelper
+fi
+if [ ! -f /usr/bin/fakeroot ]; then
+    pkg_install fakeroot
+fi
+if [ ! -f /usr/bin/pystache ]; then
+    if is_redhat_variant; then
+        sudo yum install -y /usr/bin/pystache
+    elif is_debian_variant; then
+        sudo apt-get install -y python-pystache
+    fi
+fi
+if [ ! -f /usr/bin/file ]; then
+    pkg_install file
+fi
+if is_debian_variant && [ ! -f /usr/share/doc/python-pkg-resources/copyright ]; then
+    sudo apt-get install -y python-pkg-resources
+fi
+
+if [ "$ID" = "ubuntu" ] && [ ! -f /usr/share/keyrings/debian-archive-keyring.gpg ]; then
+    sudo apt-get install -y debian-archive-keyring
+fi
+if [ "$ID" = "debian" ] && [ ! -f /usr/share/keyrings/ubuntu-archive-keyring.gpg ]; then
+    sudo apt-get install -y ubuntu-archive-keyring
+fi
+
+if [ -z "$TARGET" ]; then
+    if is_debian_variant; then
+        if [ ! -f /usr/bin/lsb_release ]; then
+            pkg_install lsb-release
+        fi
+        TARGET=`lsb_release -c|awk '{print $2}'`
+    else
+        echo "Please specify target"
+        exit 1
+    fi
+fi
+RELOC_PKG_FULLPATH=$(readlink -f $RELOC_PKG)
+RELOC_PKG_BASENAME=$(basename $RELOC_PKG)
+SCYLLA_VERSION=$(cat SCYLLA-VERSION-FILE)
+SCYLLA_RELEASE=$(cat SCYLLA-RELEASE-FILE)
+
+ln -fv $RELOC_PKG_FULLPATH ../$PRODUCT-python3_$SCYLLA_VERSION-$SCYLLA_RELEASE.orig.tar.gz
+
+cp -al dist/debian/python3/debian debian
+if [ "$PRODUCT" != "scylla" ]; then
+    for i in debian/scylla-*;do
+        mv $i ${i/scylla-/$PRODUCT-}
+    done
+fi
+REVISION="1"
+MUSTACHE_DIST="\"debian\": true, \"product\": \"$PRODUCT\", \"$PRODUCT\": true"
+pystache dist/debian/python3/changelog.mustache "{ $MUSTACHE_DIST, \"version\": \"$SCYLLA_VERSION\", \"release\": \"$SCYLLA_RELEASE\", \"revision\": \"$REVISION\", \"codename\": \"$TARGET\" }" > debian/changelog
+pystache dist/debian/python3/rules.mustache "{ $MUSTACHE_DIST }" > debian/rules
+pystache dist/debian/python3/control.mustache "{ $MUSTACHE_DIST }" > debian/control
+chmod a+rx debian/rules
+
+debuild -rfakeroot -us -uc
--- a/dist/debian/python3/changelog.mustache
+++ b/dist/debian/python3/changelog.mustache
@@ -0,0 +1,5 @@
+{{product}}-python3 ({{version}}-{{release}}-{{revision}}) {{codename}}; urgency=medium
+
+  * Initial release.
+
+ -- Takuya ASADA <syuu@scylladb.com>  Mon, 24 Aug 2015 09:22:55 +0000
--- a/dist/debian/python3/control.mustache
+++ b/dist/debian/python3/control.mustache
@@ -0,0 +1,16 @@
+Source: {{product}}-python3
+Maintainer: Takuya ASADA <syuu@scylladb.com>
+Homepage: http://scylladb.com
+Section: python
+Priority: optional
+X-Python3-Version: >= 3.4
+Standards-Version: 3.9.5
+
+Package: {{product}}-python3
+Architecture: amd64
+Description: A standalone python3 interpreter that can be moved around different Linux machines
+ This is a self-contained python interpreter that can be moved around
+ different Linux machines as long as they run a new enough kernel (where
+ new enough is defined by whichever Python module uses any kernel
+ functionality). All shared libraries needed for the interpreter to
+ operate are shipped with it.
--- a/dist/debian/python3/debian/compat
+++ b/dist/debian/python3/debian/compat
@@ -0,0 +1 @@
+9
--- a/dist/debian/python3/debian/copyright
+++ b/dist/debian/python3/debian/copyright
@@ -0,0 +1,995 @@
+This package was put together by Klee Dienes <klee@debian.org> from 
+sources from ftp.python.org:/pub/python, based on the Debianization by 
+the previous maintainers Bernd S. Brentrup <bsb@uni-muenster.de> and 
+Bruce Perens. Current maintainer is Matthias Klose <doko@debian.org>. 
+
+It was downloaded from http://python.org/
+
+Copyright:
+
+Upstream Author: Guido van Rossum <guido@cwi.nl> and others.
+
+License:
+
+The following text includes the Python license and licenses and
+acknowledgements for incorporated software. The licenses can be read
+in the HTML and texinfo versions of the documentation as well, after
+installing the pythonx.y-doc package. Licenses for files not licensed
+under the Python Licenses are found at the end of this file.
+
+
+Python License
+==============
+
+A. HISTORY OF THE SOFTWARE
+==========================
+
+Python was created in the early 1990s by Guido van Rossum at Stichting
+Mathematisch Centrum (CWI, see http://www.cwi.nl) in the Netherlands
+as a successor of a language called ABC.  Guido remains Python's
+principal author, although it includes many contributions from others.
+
+In 1995, Guido continued his work on Python at the Corporation for
+National Research Initiatives (CNRI, see http://www.cnri.reston.va.us)
+in Reston, Virginia where he released several versions of the
+software.
+
+In May 2000, Guido and the Python core development team moved to
+BeOpen.com to form the BeOpen PythonLabs team.  In October of the same
+year, the PythonLabs team moved to Digital Creations (now Zope
+Corporation, see http://www.zope.com).  In 2001, the Python Software
+Foundation (PSF, see http://www.python.org/psf/) was formed, a
+non-profit organization created specifically to own Python-related
+Intellectual Property.  Zope Corporation is a sponsoring member of
+the PSF.
+
+All Python releases are Open Source (see http://www.opensource.org for
+the Open Source Definition).  Historically, most, but not all, Python
+releases have also been GPL-compatible; the table below summarizes
+the various releases.
+
+    Release         Derived     Year        Owner       GPL-
+                    from                                compatible? (1)
+
+    0.9.0 thru 1.2              1991-1995   CWI         yes
+    1.3 thru 1.5.2  1.2         1995-1999   CNRI        yes
+    1.6             1.5.2       2000        CNRI        no
+    2.0             1.6         2000        BeOpen.com  no
+    1.6.1           1.6         2001        CNRI        yes (2)
+    2.1             2.0+1.6.1   2001        PSF         no
+    2.0.1           2.0+1.6.1   2001        PSF         yes
+    2.1.1           2.1+2.0.1   2001        PSF         yes
+    2.2             2.1.1       2001        PSF         yes
+    2.1.2           2.1.1       2002        PSF         yes
+    2.1.3           2.1.2       2002        PSF         yes
+    2.2 and above   2.1.1       2001-now    PSF         yes
+
+Footnotes:
+
+(1) GPL-compatible doesn't mean that we're distributing Python under
+    the GPL.  All Python licenses, unlike the GPL, let you distribute
+    a modified version without making your changes open source.  The
+    GPL-compatible licenses make it possible to combine Python with
+    other software that is released under the GPL; the others don't.
+
+(2) According to Richard Stallman, 1.6.1 is not GPL-compatible,
+    because its license has a choice of law clause.  According to
+    CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1
+    is "not incompatible" with the GPL.
+
+Thanks to the many outside volunteers who have worked under Guido's
+direction to make these releases possible.
+
+
+B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON
+===============================================================
+
+PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
+--------------------------------------------
+
+1. This LICENSE AGREEMENT is between the Python Software Foundation
+("PSF"), and the Individual or Organization ("Licensee") accessing and
+otherwise using this software ("Python") in source or binary form and
+its associated documentation.
+
+2. Subject to the terms and conditions of this License Agreement, PSF
+hereby grants Licensee a nonexclusive, royalty-free, world-wide
+license to reproduce, analyze, test, perform and/or display publicly,
+prepare derivative works, distribute, and otherwise use Python alone
+or in any derivative version, provided, however, that PSF's License
+Agreement and PSF's notice of copyright, i.e., "Copyright (c) 2001,
+2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
+2013, 2014 Python Software Foundation; All Rights Reserved" are
+retained in Python alone or in any derivative version prepared by
+Licensee.
+
+3. In the event Licensee prepares a derivative work that is based on
+or incorporates Python or any part thereof, and wants to make
+the derivative work available to others as provided herein, then
+Licensee hereby agrees to include in any such work a brief summary of
+the changes made to Python.
+
+4. PSF is making Python available to Licensee on an "AS IS"
+basis.  PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
+IMPLIED.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND
+DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
+FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT
+INFRINGE ANY THIRD PARTY RIGHTS.
+
+5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
+FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
+A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON,
+OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
+
+6. This License Agreement will automatically terminate upon a material
+breach of its terms and conditions.
+
+7. Nothing in this License Agreement shall be deemed to create any
+relationship of agency, partnership, or joint venture between PSF and
+Licensee.  This License Agreement does not grant permission to use PSF
+trademarks or trade name in a trademark sense to endorse or promote
+products or services of Licensee, or any third party.
+
+8. By copying, installing or otherwise using Python, Licensee
+agrees to be bound by the terms and conditions of this License
+Agreement.
+
+
+BEOPEN.COM LICENSE AGREEMENT FOR PYTHON 2.0
+-------------------------------------------
+
+BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1
+
+1. This LICENSE AGREEMENT is between BeOpen.com ("BeOpen"), having an
+office at 160 Saratoga Avenue, Santa Clara, CA 95051, and the
+Individual or Organization ("Licensee") accessing and otherwise using
+this software in source or binary form and its associated
+documentation ("the Software").
+
+2. Subject to the terms and conditions of this BeOpen Python License
+Agreement, BeOpen hereby grants Licensee a non-exclusive,
+royalty-free, world-wide license to reproduce, analyze, test, perform
+and/or display publicly, prepare derivative works, distribute, and
+otherwise use the Software alone or in any derivative version,
+provided, however, that the BeOpen Python License is retained in the
+Software, alone or in any derivative version prepared by Licensee.
+
+3. BeOpen is making the Software available to Licensee on an "AS IS"
+basis.  BEOPEN MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
+IMPLIED.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, BEOPEN MAKES NO AND
+DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
+FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE WILL NOT
+INFRINGE ANY THIRD PARTY RIGHTS.
+
+4. BEOPEN SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE
+SOFTWARE FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS
+AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY
+DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
+
+5. This License Agreement will automatically terminate upon a material
+breach of its terms and conditions.
+
+6. This License Agreement shall be governed by and interpreted in all
+respects by the law of the State of California, excluding conflict of
+law provisions.  Nothing in this License Agreement shall be deemed to
+create any relationship of agency, partnership, or joint venture
+between BeOpen and Licensee.  This License Agreement does not grant
+permission to use BeOpen trademarks or trade names in a trademark
+sense to endorse or promote products or services of Licensee, or any
+third party.  As an exception, the "BeOpen Python" logos available at
+http://www.pythonlabs.com/logos.html may be used according to the
+permissions granted on that web page.
+
+7. By copying, installing or otherwise using the software, Licensee
+agrees to be bound by the terms and conditions of this License
+Agreement.
+
+
+CNRI LICENSE AGREEMENT FOR PYTHON 1.6.1
+---------------------------------------
+
+1. This LICENSE AGREEMENT is between the Corporation for National
+Research Initiatives, having an office at 1895 Preston White Drive,
+Reston, VA 20191 ("CNRI"), and the Individual or Organization
+("Licensee") accessing and otherwise using Python 1.6.1 software in
+source or binary form and its associated documentation.
+
+2. Subject to the terms and conditions of this License Agreement, CNRI
+hereby grants Licensee a nonexclusive, royalty-free, world-wide
+license to reproduce, analyze, test, perform and/or display publicly,
+prepare derivative works, distribute, and otherwise use Python 1.6.1
+alone or in any derivative version, provided, however, that CNRI's
+License Agreement and CNRI's notice of copyright, i.e., "Copyright (c)
+1995-2001 Corporation for National Research Initiatives; All Rights
+Reserved" are retained in Python 1.6.1 alone or in any derivative
+version prepared by Licensee.  Alternately, in lieu of CNRI's License
+Agreement, Licensee may substitute the following text (omitting the
+quotes): "Python 1.6.1 is made available subject to the terms and
+conditions in CNRI's License Agreement.  This Agreement together with
+Python 1.6.1 may be located on the Internet using the following
+unique, persistent identifier (known as a handle): 1895.22/1013.  This
+Agreement may also be obtained from a proxy server on the Internet
+using the following URL: http://hdl.handle.net/1895.22/1013".
+
+3. In the event Licensee prepares a derivative work that is based on
+or incorporates Python 1.6.1 or any part thereof, and wants to make
+the derivative work available to others as provided herein, then
+Licensee hereby agrees to include in any such work a brief summary of
+the changes made to Python 1.6.1.
+
+4. CNRI is making Python 1.6.1 available to Licensee on an "AS IS"
+basis.  CNRI MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
+IMPLIED.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, CNRI MAKES NO AND
+DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
+FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON 1.6.1 WILL NOT
+INFRINGE ANY THIRD PARTY RIGHTS.
+
+5. CNRI SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
+1.6.1 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
+A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 1.6.1,
+OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
+
+6. This License Agreement will automatically terminate upon a material
+breach of its terms and conditions.
+
+7. This License Agreement shall be governed by the federal
+intellectual property law of the United States, including without
+limitation the federal copyright law, and, to the extent such
+U.S. federal law does not apply, by the law of the Commonwealth of
+Virginia, excluding Virginia's conflict of law provisions.
+Notwithstanding the foregoing, with regard to derivative works based
+on Python 1.6.1 that incorporate non-separable material that was
+previously distributed under the GNU General Public License (GPL), the
+law of the Commonwealth of Virginia shall govern this License
+Agreement only as to issues arising under or with respect to
+Paragraphs 4, 5, and 7 of this License Agreement.  Nothing in this
+License Agreement shall be deemed to create any relationship of
+agency, partnership, or joint venture between CNRI and Licensee.  This
+License Agreement does not grant permission to use CNRI trademarks or
+trade name in a trademark sense to endorse or promote products or
+services of Licensee, or any third party.
+
+8. By clicking on the "ACCEPT" button where indicated, or by copying,
+installing or otherwise using Python 1.6.1, Licensee agrees to be
+bound by the terms and conditions of this License Agreement.
+
+        ACCEPT
+
+
+CWI LICENSE AGREEMENT FOR PYTHON 0.9.0 THROUGH 1.2
+--------------------------------------------------
+
+Copyright (c) 1991 - 1995, Stichting Mathematisch Centrum Amsterdam,
+The Netherlands.  All rights reserved.
+
+Permission to use, copy, modify, and distribute this software and its
+documentation for any purpose and without fee is hereby granted,
+provided that the above copyright notice appear in all copies and that
+both that copyright notice and this permission notice appear in
+supporting documentation, and that the name of Stichting Mathematisch
+Centrum or CWI not be used in advertising or publicity pertaining to
+distribution of the software without specific, written prior
+permission.
+
+STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
+THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
+FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+
+Licenses and Acknowledgements for Incorporated Software
+=======================================================
+
+Mersenne Twister
+----------------
+
+The `_random' module includes code based on a download from
+`http://www.math.keio.ac.jp/~matumoto/MT2002/emt19937ar.html'.  The
+following are the verbatim comments from the original code:
+
+     A C-program for MT19937, with initialization improved 2002/1/26.
+     Coded by Takuji Nishimura and Makoto Matsumoto.
+
+     Before using, initialize the state by using init_genrand(seed)
+     or init_by_array(init_key, key_length).
+
+     Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
+     All rights reserved.
+
+     Redistribution and use in source and binary forms, with or without
+     modification, are permitted provided that the following conditions
+     are met:
+
+      1. Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+
+      2. Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+
+      3. The names of its contributors may not be used to endorse or promote
+         products derived from this software without specific prior written
+         permission.
+
+     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+     A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
+     OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+     SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+     TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+     PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+     LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+     NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+     SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+     Any feedback is very welcome.
+     http://www.math.keio.ac.jp/matumoto/emt.html
+     email: matumoto@math.keio.ac.jp
+
+
+Sockets
+-------
+
+The `socket' module uses the functions, `getaddrinfo', and
+`getnameinfo', which are coded in separate source files from the WIDE
+Project, `http://www.wide.ad.jp/about/index.html'.
+
+     Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
+     All rights reserved.
+
+     Redistribution and use in source and binary forms, with or without
+     modification, are permitted provided that the following conditions
+     are met:
+     1. Redistributions of source code must retain the above copyright
+        notice, this list of conditions and the following disclaimer.
+     2. Redistributions in binary form must reproduce the above copyright
+        notice, this list of conditions and the following disclaimer in the
+        documentation and/or other materials provided with the distribution.
+     3. Neither the name of the project nor the names of its contributors
+        may be used to endorse or promote products derived from this software
+        without specific prior written permission.
+
+     THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+     ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+     ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+     FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+     IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+     OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+Floating point exception control
+--------------------------------
+
+The source for the `fpectl' module includes the following notice:
+
+     ---------------------------------------------------------------------  
+    /                       Copyright (c) 1996.                           \ 
+   |          The Regents of the University of California.                 |
+   |                        All rights reserved.                           |
+   |                                                                       |
+   |   Permission to use, copy, modify, and distribute this software for   |
+   |   any purpose without fee is hereby granted, provided that this en-   |
+   |   tire notice is included in all copies of any software which is or   |
+   |   includes  a  copy  or  modification  of  this software and in all   |
+   |   copies of the supporting documentation for such software.           |
+   |                                                                       |
+   |   This  work was produced at the University of California, Lawrence   |
+   |   Livermore National Laboratory under  contract  no.  W-7405-ENG-48   |
+   |   between  the  U.S.  Department  of  Energy and The Regents of the   |
+   |   University of California for the operation of UC LLNL.              |
+   |                                                                       |
+   |                              DISCLAIMER                               |
+   |                                                                       |
+   |   This  software was prepared as an account of work sponsored by an   |
+   |   agency of the United States Government. Neither the United States   |
+   |   Government  nor the University of California nor any of their em-   |
+   |   ployees, makes any warranty, express or implied, or  assumes  any   |
+   |   liability  or  responsibility  for the accuracy, completeness, or   |
+   |   usefulness of any information,  apparatus,  product,  or  process   |
+   |   disclosed,   or  represents  that  its  use  would  not  infringe   |
+   |   privately-owned rights. Reference herein to any specific  commer-   |
+   |   cial  products,  process,  or  service  by trade name, trademark,   |
+   |   manufacturer, or otherwise, does not  necessarily  constitute  or   |
+   |   imply  its endorsement, recommendation, or favoring by the United   |
+   |   States Government or the University of California. The views  and   |
+   |   opinions  of authors expressed herein do not necessarily state or   |
+   |   reflect those of the United States Government or  the  University   |
+   |   of  California,  and shall not be used for advertising or product   |
+    \  endorsement purposes.                                              / 
+     ---------------------------------------------------------------------
+
+
+Cookie management
+-----------------
+
+The `Cookie' module contains the following notice:
+
+      Copyright 2000 by Timothy O'Malley <timo@alum.mit.edu>
+
+                     All Rights Reserved
+
+      Permission to use, copy, modify, and distribute this software
+      and its documentation for any purpose and without fee is hereby
+      granted, provided that the above copyright notice appear in all
+      copies and that both that copyright notice and this permission
+      notice appear in supporting documentation, and that the name of
+      Timothy O'Malley  not be used in advertising or publicity
+      pertaining to distribution of the software without specific, written
+      prior permission.
+
+      Timothy O'Malley DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
+      SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
+      AND FITNESS, IN NO EVENT SHALL Timothy O'Malley BE LIABLE FOR
+      ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+      WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+      WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+      ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+      PERFORMANCE OF THIS SOFTWARE.
+
+
+Execution tracing
+-----------------
+
+The `trace' module contains the following notice:
+
+      portions copyright 2001, Autonomous Zones Industries, Inc., all rights...
+      err...  reserved and offered to the public under the terms of the
+      Python 2.2 license.
+      Author: Zooko O'Whielacronx
+      http://zooko.com/
+      mailto:zooko@zooko.com
+
+      Copyright 2000, Mojam Media, Inc., all rights reserved.
+      Author: Skip Montanaro
+
+      Copyright 1999, Bioreason, Inc., all rights reserved.
+      Author: Andrew Dalke
+
+      Copyright 1995-1997, Automatrix, Inc., all rights reserved.
+      Author: Skip Montanaro
+
+      Copyright 1991-1995, Stichting Mathematisch Centrum, all rights reserved.
+
+      Permission to use, copy, modify, and distribute this Python software and
+      its associated documentation for any purpose without fee is hereby
+      granted, provided that the above copyright notice appears in all copies,
+      and that both that copyright notice and this permission notice appear in
+      supporting documentation, and that the name of neither Automatrix,
+      Bioreason or Mojam Media be used in advertising or publicity pertaining
+      to distribution of the software without specific, written prior
+      permission.
+
+
+UUencode and UUdecode functions
+-------------------------------
+
+The `uu' module contains the following notice:
+
+      Copyright 1994 by Lance Ellinghouse
+      Cathedral City, California Republic, United States of America.
+                             All Rights Reserved
+      Permission to use, copy, modify, and distribute this software and its
+      documentation for any purpose and without fee is hereby granted,
+      provided that the above copyright notice appear in all copies and that
+      both that copyright notice and this permission notice appear in
+      supporting documentation, and that the name of Lance Ellinghouse
+      not be used in advertising or publicity pertaining to distribution
+      of the software without specific, written prior permission.
+      LANCE ELLINGHOUSE DISCLAIMS ALL WARRANTIES WITH REGARD TO
+      THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+      FITNESS, IN NO EVENT SHALL LANCE ELLINGHOUSE CENTRUM BE LIABLE
+      FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+      WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+      ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+      OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+      Modified by Jack Jansen, CWI, July 1995:
+      - Use binascii module to do the actual line-by-line conversion
+        between ascii and binary. This results in a 1000-fold speedup. The C
+        version is still 5 times faster, though.
+      - Arguments more compliant with python standard
+
+
+XML Remote Procedure Calls
+--------------------------
+
+The `xmlrpclib' module contains the following notice:
+
+          The XML-RPC client interface is
+
+      Copyright (c) 1999-2002 by Secret Labs AB
+      Copyright (c) 1999-2002 by Fredrik Lundh
+
+      By obtaining, using, and/or copying this software and/or its
+      associated documentation, you agree that you have read, understood,
+      and will comply with the following terms and conditions:
+
+      Permission to use, copy, modify, and distribute this software and
+      its associated documentation for any purpose and without fee is
+      hereby granted, provided that the above copyright notice appears in
+      all copies, and that both that copyright notice and this permission
+      notice appear in supporting documentation, and that the name of
+      Secret Labs AB or the author not be used in advertising or publicity
+      pertaining to distribution of the software without specific, written
+      prior permission.
+
+      SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
+      TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
+      ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
+      BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
+      DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+      WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+      ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+      OF THIS SOFTWARE.
+
+Licenses for Software linked to
+===============================
+
+Note that the choice of GPL compatibility outlined above doesn't extend
+to modules linked to particular libraries, since they change the
+effective License of the module binary.
+
+
+GNU Readline
+------------
+
+The 'readline' module makes use of GNU Readline.
+
+      The GNU Readline Library is free software; you can redistribute it
+      and/or modify it under the terms of the GNU General Public License as
+      published by the Free Software Foundation; either version 2, or (at
+      your option) any later version.
+
+      On Debian systems, you can find the complete statement in
+      `/usr/share/doc/readline-common/copyright'. A copy of the GNU General
+      Public License is available in `/usr/share/common-licenses/GPL-2'.
+
+
+OpenSSL
+-------
+
+The '_ssl' module makes use of OpenSSL.
+
+      The OpenSSL toolkit stays under a dual license, i.e. both the
+      conditions of the OpenSSL License and the original SSLeay license
+      apply to the toolkit. Actually both licenses are BSD-style Open
+      Source licenses. Note that both licenses are incompatible with
+      the GPL.
+
+      On Debian systems, you can find the complete license text in
+      `/usr/share/doc/openssl/copyright'.
+
+
+Files with other licenses than the Python License
+-------------------------------------------------
+
+Files: Include/dynamic_annotations.h
+Files: Python/dynamic_annotations.c
+Copyright: (c) 2008-2009, Google Inc.
+License: Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+ 
+      * Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+      * Neither the name of Google Inc. nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+ 
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Files: Include/unicodeobject.h
+Copyright: (c) Corporation for National Research Initiatives.
+Copyright: (c) 1999 by Secret Labs AB.
+Copyright: (c) 1999 by Fredrik Lundh.
+License: By obtaining, using, and/or copying this software and/or its
+  associated documentation, you agree that you have read, understood,
+  and will comply with the following terms and conditions:
+ 
+  Permission to use, copy, modify, and distribute this software and its
+  associated documentation for any purpose and without fee is hereby
+  granted, provided that the above copyright notice appears in all
+  copies, and that both that copyright notice and this permission notice
+  appear in supporting documentation, and that the name of Secret Labs
+  AB or the author not be used in advertising or publicity pertaining to
+  distribution of the software without specific, written prior
+  permission.
+ 
+  SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
+  THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+  FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
+  ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+  OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+Files: Lib/logging/*
+Copyright: 2001-2010 by Vinay Sajip. All Rights Reserved.
+License: Permission to use, copy, modify, and distribute this software and
+ its documentation for any purpose and without fee is hereby granted,
+ provided that the above copyright notice appear in all copies and that
+ both that copyright notice and this permission notice appear in
+ supporting documentation, and that the name of Vinay Sajip
+ not be used in advertising or publicity pertaining to distribution
+ of the software without specific, written prior permission.
+ VINAY SAJIP DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
+ ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
+ VINAY SAJIP BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR
+ ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
+ IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+Files: Lib/multiprocessing/*
+Files: Modules/_multiprocessing/*
+Copyright: (c) 2006-2008, R Oudkerk. All rights reserved.
+License: Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ 
+ 1. Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+ 3. Neither the name of author nor the names of any contributors may be
+    used to endorse or promote products derived from this software
+    without specific prior written permission.
+ 
+ THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ SUCH DAMAGE.
+
+Files: Lib/sqlite3/*
+Files: Modules/_sqlite/*
+Copyright: (C) 2004-2005 Gerhard Häring <gh@ghaering.de>
+License: This software is provided 'as-is', without any express or implied
+ warranty.  In no event will the authors be held liable for any damages
+ arising from the use of this software.
+ 
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+ 
+ 1. The origin of this software must not be misrepresented; you must not
+    claim that you wrote the original software. If you use this software
+    in a product, an acknowledgment in the product documentation would be
+    appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+    misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+Files: Lib/async*
+Copyright: Copyright 1996 by Sam Rushing
+License: Permission to use, copy, modify, and distribute this software and
+ its documentation for any purpose and without fee is hereby
+ granted, provided that the above copyright notice appear in all
+ copies and that both that copyright notice and this permission
+ notice appear in supporting documentation, and that the name of Sam
+ Rushing not be used in advertising or publicity pertaining to
+ distribution of the software without specific, written prior
+ permission.
+ 
+ SAM RUSHING DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN
+ NO EVENT SHALL SAM RUSHING BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
+ NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+Files: Lib/tarfile.py
+Copyright: (C) 2002 Lars Gustaebel <lars@gustaebel.de>
+License: Permission  is  hereby granted,  free  of charge,  to  any person
+ obtaining a  copy of  this software  and associated documentation
+ files  (the  "Software"),  to   deal  in  the  Software   without
+ restriction,  including  without limitation  the  rights to  use,
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies  of  the  Software,  and to  permit  persons  to  whom the
+ Software  is  furnished  to  do  so,  subject  to  the  following
+ conditions:
+ 
+ The above copyright  notice and this  permission notice shall  be
+ included in all copies or substantial portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS  IS", WITHOUT WARRANTY OF ANY  KIND,
+ EXPRESS OR IMPLIED, INCLUDING  BUT NOT LIMITED TO  THE WARRANTIES
+ OF  MERCHANTABILITY,  FITNESS   FOR  A  PARTICULAR   PURPOSE  AND
+ NONINFRINGEMENT.  IN  NO  EVENT SHALL  THE  AUTHORS  OR COPYRIGHT
+ HOLDERS  BE LIABLE  FOR ANY  CLAIM, DAMAGES  OR OTHER  LIABILITY,
+ WHETHER  IN AN  ACTION OF  CONTRACT, TORT  OR OTHERWISE,  ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ OTHER DEALINGS IN THE SOFTWARE.
+
+Files: Lib/turtle.py
+Copyright: (C) 2006 - 2010  Gregor Lingl
+License: This software is provided 'as-is', without any express or implied
+ warranty.  In no event will the authors be held liable for any damages
+ arising from the use of this software.
+ 
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+ 
+ 1. The origin of this software must not be misrepresented; you must not
+    claim that you wrote the original software. If you use this software
+    in a product, an acknowledgment in the product documentation would be
+    appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+    misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ Lib/turtle.py is copyright Gregor Lingl and licensed under a BSD-like license
+
+Files: Modules/_ctypes/libffi/*
+Copyright: Copyright (C) 1996-2011 Red Hat, Inc and others.
+    Copyright (C) 1996-2011 Anthony Green
+    Copyright (C) 1996-2010 Free Software Foundation, Inc
+    Copyright (c) 2003, 2004, 2006, 2007, 2008 Kaz Kojima
+    Copyright (c) 2010, 2011, Plausible Labs Cooperative , Inc.
+    Copyright (c) 2010 CodeSourcery
+    Copyright (c) 1998 Andreas Schwab
+    Copyright (c) 2000 Hewlett Packard Company
+    Copyright (c) 2009 Bradley Smith
+    Copyright (c) 2008 David Daney
+    Copyright (c) 2004 Simon Posnjak
+    Copyright (c) 2005 Axis Communications AB
+    Copyright (c) 1998 Cygnus Solutions
+    Copyright (c) 2004 Renesas Technology
+    Copyright (c) 2002, 2007  Bo Thorsen <bo@suse.de>
+    Copyright (c) 2002 Ranjit Mathew
+    Copyright (c) 2002 Roger Sayle
+    Copyright (c) 2000, 2007 Software AG
+    Copyright (c) 2003 Jakub Jelinek
+    Copyright (c) 2000, 2001 John Hornkvist
+    Copyright (c) 1998 Geoffrey Keating
+    Copyright (c) 2008 Björn König
+
+License: Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   ``Software''), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be included
+   in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+   DEALINGS IN THE SOFTWARE.
+
+   Documentation:
+   Permission is granted to copy, distribute and/or modify this document
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 2, or (at your option) any
+   later version.  A copy of the license is included in the
+   section entitled ``GNU General Public License''.
+
+Files: Modules/_gestalt.c
+Copyright: 1991-1997 by Stichting Mathematisch Centrum, Amsterdam.
+License: Permission to use, copy, modify, and distribute this software and its
+ documentation for any purpose and without fee is hereby granted,
+ provided that the above copyright notice appear in all copies and that
+ both that copyright notice and this permission notice appear in
+ supporting documentation, and that the names of Stichting Mathematisch
+ Centrum or CWI not be used in advertising or publicity pertaining to
+ distribution of the software without specific, written prior permission.
+ 
+ STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
+ THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
+ FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+Files: Modules/syslogmodule.c
+Copyright: 1994 by Lance Ellinghouse
+License: Permission to use, copy, modify, and distribute this software and its
+ documentation for any purpose and without fee is hereby granted,
+ provided that the above copyright notice appear in all copies and that
+ both that copyright notice and this permission notice appear in
+ supporting documentation, and that the name of Lance Ellinghouse
+ not be used in advertising or publicity pertaining to distribution
+ of the software without specific, written prior permission.
+ 
+ LANCE ELLINGHOUSE DISCLAIMS ALL WARRANTIES WITH REGARD TO
+ THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ FITNESS, IN NO EVENT SHALL LANCE ELLINGHOUSE BE LIABLE FOR ANY SPECIAL,
+ INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
+ FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
+ NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
+ WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+Files: Modules/zlib/*
+Copyright: (C) 1995-2010 Jean-loup Gailly and Mark Adler
+License: This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  Jean-loup Gailly        Mark Adler
+  jloup@gzip.org          madler@alumni.caltech.edu
+
+ If you use the zlib library in a product, we would appreciate *not* receiving
+ lengthy legal documents to sign.  The sources are provided for free but without
+ warranty of any kind.  The library has been entirely written by Jean-loup
+ Gailly and Mark Adler; it does not include third-party code.
+
+Files: Modules/expat/*
+Copyright: Copyright (c) 1998, 1999, 2000 Thai Open Source Software Center Ltd
+  and Clark Cooper
+  Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Expat maintainers
+License: Permission is hereby granted, free of charge, to any person obtaining
+  a copy of this software and associated documentation files (the
+  "Software"), to deal in the Software without restriction, including
+  without limitation the rights to use, copy, modify, merge, publish,
+  distribute, sublicense, and/or sell copies of the Software, and to
+  permit persons to whom the Software is furnished to do so, subject to
+  the following conditions:
+ 
+  The above copyright notice and this permission notice shall be included
+  in all copies or substantial portions of the Software.
+ 
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+Files: Modules/_decimal/libmpdec/*
+Copyright: Copyright (c) 2008-2012 Stefan Krah. All rights reserved.
+License: Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ .
+ 1. Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+ .
+ 2. Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+ .
+ THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ SUCH DAMAGE.
+
+Files: Misc/python-mode.el
+Copyright: Copyright (C) 1992,1993,1994  Tim Peters
+License: This software is provided as-is, without express or implied
+	warranty.  Permission to use, copy, modify, distribute or sell this
+	software, without fee, for any purpose and by any individual or
+	organization, is hereby granted, provided that the above copyright
+	notice and this paragraph appear in all copies.
+
+Files: Python/dtoa.c
+Copyright: (c) 1991, 2000, 2001 by Lucent Technologies.
+License: Permission to use, copy, modify, and distribute this software for any
+  purpose without fee is hereby granted, provided that this entire notice
+  is included in all copies of any software which is or includes a copy
+  or modification of this software and in all copies of the supporting
+  documentation for such software.
+  
+  THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
+  WARRANTY.  IN PARTICULAR, NEITHER THE AUTHOR NOR LUCENT MAKES ANY
+  REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+  OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
+
+Files: Python/getopt.c
+Copyright: 1992-1994, David Gottner
+License: Permission to use, copy, modify, and distribute this software and its
+  documentation for any purpose and without fee is hereby granted,
+  provided that the above copyright notice, this permission notice and
+  the following disclaimer notice appear unmodified in all copies.
+  
+  I DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.  IN NO EVENT SHALL I
+  BE LIABLE FOR ANY SPECIAL, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY
+  DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA, OR PROFITS, WHETHER
+  IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+  OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+Files: PC/_subprocess.c
+Copyright: Copyright (c) 2004 by Fredrik Lundh <fredrik@pythonware.com>
+	Copyright (c) 2004 by Secret Labs AB, http://www.pythonware.com
+	Copyright (c) 2004 by Peter Astrand <astrand@lysator.liu.se>
+License:
+ * Permission to use, copy, modify, and distribute this software and
+ * its associated documentation for any purpose and without fee is
+ * hereby granted, provided that the above copyright notice appears in
+ * all copies, and that both that copyright notice and this permission
+ * notice appear in supporting documentation, and that the name of the
+ * authors not be used in advertising or publicity pertaining to
+ * distribution of the software without specific, written prior
+ * permission.
+ *
+ * THE AUTHORS DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
+ * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
+ * WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+Files: PC/winsound.c
+Copyright: Copyright (c) 1999 Toby Dickenson
+License:  * Permission to use this software in any way is granted without
+ * fee, provided that the copyright notice above appears in all
+ * copies. This software is provided "as is" without any warranty.
+ */
+
+/* Modified by Guido van Rossum */
+/* Beep added by Mark Hammond */
+/* Win9X Beep and platform identification added by Uncle Timmy */
+
+Files: Tools/pybench/*
+Copyright: (c), 1997-2006, Marc-Andre Lemburg (mal@lemburg.com)
+  (c), 2000-2006, eGenix.com Software GmbH (info@egenix.com)
+License: Permission to use, copy, modify, and distribute this software and its
+  documentation for any purpose and without fee or royalty is hereby
+  granted, provided that the above copyright notice appear in all copies
+  and that both that copyright notice and this permission notice appear
+  in supporting documentation or portions thereof, including
+  modifications, that you make.
+  
+  THE AUTHOR MARC-ANDRE LEMBURG DISCLAIMS ALL WARRANTIES WITH REGARD TO
+  THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+  FITNESS, IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL,
+  INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
+  FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
+  NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
+  WITH THE USE OR PERFORMANCE OF THIS SOFTWARE !
--- a/dist/debian/python3/debian/scylla-python3.dirs
+++ b/dist/debian/python3/debian/scylla-python3.dirs
@@ -0,0 +1,3 @@
+opt/scylladb/python3/bin
+opt/scylladb/python3/lib64
+opt/scylladb/python3/libexec
--- a/dist/debian/python3/debian/scylla-python3.install
+++ b/dist/debian/python3/debian/scylla-python3.install
@@ -0,0 +1,3 @@
+bin/* opt/scylladb/python3/bin
+lib64/* opt/scylladb/python3/lib64
+libexec/* opt/scylladb/python3/libexec
--- a/dist/debian/python3/rules.mustache
+++ b/dist/debian/python3/rules.mustache
@@ -0,0 +1,22 @@
+#!/usr/bin/make -f
+
+export PYBUILD_DISABLE=1
+
+override_dh_auto_configure:
+
+override_dh_auto_build:
+
+override_dh_strip:
+
+override_dh_makeshlibs:
+
+override_dh_shlibdeps:
+
+override_dh_fixperms:
+	dh_fixperms
+	chmod 755 $(CURDIR)/debian/{{product}}-python3/opt/scylladb/python3/libexec/ld.so
+
+override_dh_strip_nondeterminism:
+
+%:
+	dh $@
--- a/dist/debian/rules.mustache
+++ b/dist/debian/rules.mustache
@@ -9,12 +9,21 @@ override_dh_auto_build:

 override_dh_auto_clean:

-override_dh_auto_install:
-	dh_auto_install
+override_dh_install:
+	dh_install
 	install -d $(CURDIR)/debian/scylla-server/usr/bin
+	for bin in debian/scylla-server/opt/scylladb/libexec/*; do debian/adjust_bin $(CURDIR)/debian/scylla-server "$${bin#*libexec/}"; done
 	ln -sf /opt/scylladb/bin/scylla $(CURDIR)/debian/scylla-server/usr/bin/scylla
 	ln -sf /opt/scylladb/bin/iotune $(CURDIR)/debian/scylla-server/usr/bin/iotune
 	ln -sf /usr/lib/scylla/scyllatop/scyllatop.py $(CURDIR)/debian/scylla-server/usr/bin/scyllatop
+	find ./dist/common/scripts -type f -exec ./relocate_python_scripts.py \
+	--installroot $(CURDIR)/debian/scylla-server/usr/lib/scylla/ --with-python3 "$(CURDIR)/debian/scylla-server/opt/scylladb/python3/bin/python3" {} +
+	./relocate_python_scripts.py \
+	--installroot $(CURDIR)/debian/scylla-server/usr/lib/scylla/ --with-python3 "$(CURDIR)/debian/scylla-server/opt/scylladb/python3/bin/python3" \
+	seastar/scripts/perftune.py seastar/scripts/seastar-addr2line
+	./relocate_python_scripts.py \
+	--installroot $(CURDIR)/debian/scylla-server/usr/lib/scylla/scyllatop/ --with-python3 "$(CURDIR)/debian/scylla-server/opt/scylladb/python3/bin/python3" \
+	tools/scyllatop/scyllatop.py

 override_dh_installinit:
 {{#scylla}}
@@ -29,7 +38,9 @@ override_dh_installinit:
 	dh_installinit --no-start --name node-exporter

 override_dh_strip:
-	dh_strip -Xlibprotobuf.so.15 -Xld.so --dbg-package={{product}}-server-dbg
+	# The binaries (ethtool...patchelf) don't pass dh_strip after going through patchelf. Since they are
+	# already stripped, nothing is lost if we exclude them, so that's what we do.
+	dh_strip -Xlibprotobuf.so.15 -Xld.so -Xethtool -Xgawk -Xgzip -Xhwloc-calc -Xhwloc-distrib -Xifconfig -Xlscpu -Xnetstat -Xpatchelf --dbg-package={{product}}-server-dbg

 override_dh_makeshlibs:

--- a/dist/debian/scylla-server.install.mustache
+++ b/dist/debian/scylla-server.install.mustache
@@ -1,14 +1,9 @@
 dist/common/limits.d/scylla.conf etc/security/limits.d
 dist/common/scylla.d/*.conf etc/scylla.d
 seastar/dpdk/usertools/dpdk-devbind.py usr/lib/scylla
-seastar/scripts/perftune.py usr/lib/scylla
-seastar/scripts/seastar-addr2line usr/lib/scylla
 seastar/scripts/seastar-cpu-map.sh usr/lib/scylla
-dist/common/scripts/* usr/lib/scylla
-tools/scyllatop usr/lib/scylla
 swagger-ui/dist usr/lib/scylla/swagger-ui
 api/api-doc usr/lib/scylla/api
-bin/* opt/scylladb/bin
 libreloc/* opt/scylladb/libreloc
 libexec/* opt/scylladb/libexec
 dist/common/sbin/* usr/sbin
--- a/dist/docker/redhat/Dockerfile
+++ b/dist/docker/redhat/Dockerfile
@@ -28,7 +28,7 @@ ADD commandlineparser.py /commandlineparser.py
 ADD docker-entrypoint.py /docker-entrypoint.py
 ADD node_exporter_install /node_exporter_install
 # Install Scylla:
-RUN curl http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo -o /etc/yum.repos.d/scylla.repo && \
+RUN curl http://downloads.scylladb.com/rpm/centos/scylla-3.1.repo -o /etc/yum.repos.d/scylla.repo && \
    yum -y install epel-release && \
    yum -y clean expire-cache && \
    yum -y update && \
--- a/distributed_loader.cc
+++ b/distributed_loader.cc
@@ -192,7 +192,11 @@ future<> verification_error(fs::path path, const char* fstr, Args&&... args) {
 // No other file types may exist.
 future<> distributed_loader::verify_owner_and_mode(fs::path path) {
    return file_stat(path.string(), follow_symlink::no).then([path = std::move(path)] (stat_data sd) {
-        if (sd.uid != geteuid()) {
+        // Under docker, we run with euid 0 and there is no reasonable way to enforce that the
+        // in-container uid will have the same uid as files mounted from outside the container. So
+        // just allow euid 0 as a special case. It should survive the file_accessible() checks below.
+        // See #4823.
+        if (geteuid() != 0 && sd.uid != geteuid()) {
            return verification_error(std::move(path), "File not owned by current euid: {}. Owner is: {}", geteuid(), sd.uid);
        }
        switch (sd.type) {
--- a/fix_system_distributed_tables.py
+++ b/fix_system_distributed_tables.py
@@ -151,7 +151,7 @@ if __name__ == '__main__':
    argp.add_argument('--user', '-u')
    argp.add_argument('--password', '-p', default='none')
    argp.add_argument('--node', default='127.0.0.1', help='Node to connect to.')
-    argp.add_argument('--port', default='9042', help='Port to connect to.')
+    argp.add_argument('--port', default=9042, help='Port to connect to.', type=int)

    args = argp.parse_args()
    res = validate_and_fix(args)
--- a/gc_clock.hh
+++ b/gc_clock.hh
@@ -22,6 +22,7 @@
 #pragma once

 #include "clocks-impl.hh"
+#include "hashing.hh"

 #include <seastar/core/lowres_clock.hh>

@@ -71,3 +72,17 @@ using ttl_opt = std::optional<gc_clock::duration>;
 static constexpr gc_clock::duration max_ttl = gc_clock::duration{20 * 365 * 24 * 60 * 60};

 std::ostream& operator<<(std::ostream& os, gc_clock::time_point tp);
+
+template<>
+struct appending_hash<gc_clock::time_point> {
+    template<typename Hasher>
+    void operator()(Hasher& h, gc_clock::time_point t) const {
+        // Remain backwards-compatible with the 32-bit duration::rep (refs #4460): always feed the low 32 bits, and feed the high 32 bits only when they are non-zero.
+        uint64_t d64 = t.time_since_epoch().count();
+        feed_hash(h, uint32_t(d64 & 0xffff'ffff));
+        uint32_t msb = d64 >> 32;
+        if (msb) {
+            feed_hash(h, msb);
+        }
+    }
+};
--- a/hashers.cc
+++ b/hashers.cc
@@ -29,7 +29,7 @@ template <typename T> struct hasher_traits;
 template <> struct hasher_traits<md5_hasher> { using impl_type = CryptoPP::Weak::MD5; };
 template <> struct hasher_traits<sha256_hasher> { using impl_type = CryptoPP::SHA256; };

-template <typename T, size_t size> struct hasher<T, size>::impl {
+template <typename T, size_t size> struct cryptopp_hasher<T, size>::impl {
    using impl_type = typename hasher_traits<T>::impl_type;

    impl_type hash{};
@@ -53,35 +53,35 @@ template <typename T, size_t size> struct hasher<T, size>::impl {
    }
 };

-template <typename T, size_t size> hasher<T, size>::hasher() : _impl(std::make_unique<impl>()) {}
+template <typename T, size_t size> cryptopp_hasher<T, size>::cryptopp_hasher() : _impl(std::make_unique<impl>()) {}

-template <typename T, size_t size> hasher<T, size>::~hasher() = default;
+template <typename T, size_t size> cryptopp_hasher<T, size>::~cryptopp_hasher() = default;

-template <typename T, size_t size> hasher<T, size>::hasher(hasher&& o) noexcept = default;
+template <typename T, size_t size> cryptopp_hasher<T, size>::cryptopp_hasher(cryptopp_hasher&& o) noexcept = default;

-template <typename T, size_t size> hasher<T, size>::hasher(const hasher& o) : _impl(std::make_unique<hasher<T, size>::impl>(*o._impl)) {}
+template <typename T, size_t size> cryptopp_hasher<T, size>::cryptopp_hasher(const cryptopp_hasher& o) : _impl(std::make_unique<cryptopp_hasher<T, size>::impl>(*o._impl)) {}

-template <typename T, size_t size> hasher<T, size>& hasher<T, size>::operator=(hasher&& o) noexcept = default;
+template <typename T, size_t size> cryptopp_hasher<T, size>& cryptopp_hasher<T, size>::operator=(cryptopp_hasher&& o) noexcept = default;

-template <typename T, size_t size> hasher<T, size>& hasher<T, size>::operator=(const hasher& o) {
-    _impl = std::make_unique<hasher<T, size>::impl>(*o._impl);
+template <typename T, size_t size> cryptopp_hasher<T, size>& cryptopp_hasher<T, size>::operator=(const cryptopp_hasher& o) {
+    _impl = std::make_unique<cryptopp_hasher<T, size>::impl>(*o._impl);
    return *this;
 }

-template <typename T, size_t size> bytes hasher<T, size>::finalize() { return _impl->finalize(); }
+template <typename T, size_t size> bytes cryptopp_hasher<T, size>::finalize() { return _impl->finalize(); }

-template <typename T, size_t size> std::array<uint8_t, size> hasher<T, size>::finalize_array() {
+template <typename T, size_t size> std::array<uint8_t, size> cryptopp_hasher<T, size>::finalize_array() {
    return _impl->finalize_array();
 }

-template <typename T, size_t size> void hasher<T, size>::update(const char* ptr, size_t length) { _impl->update(ptr, length); }
+template <typename T, size_t size> void cryptopp_hasher<T, size>::update(const char* ptr, size_t length) { _impl->update(ptr, length); }

-template <typename T, size_t size> bytes hasher<T, size>::calculate(const std::string_view& s) {
-    typename hasher<T, size>::impl::impl_type hash;
+template <typename T, size_t size> bytes cryptopp_hasher<T, size>::calculate(const std::string_view& s) {
+    typename cryptopp_hasher<T, size>::impl::impl_type hash;
    unsigned char digest[size];
    hash.CalculateDigest(digest, reinterpret_cast<const unsigned char*>(s.data()), s.size());
    return std::move(bytes{reinterpret_cast<const int8_t*>(digest), size});
 }

-template class hasher<md5_hasher, 16>;
-template class hasher<sha256_hasher, 32>;
+template class cryptopp_hasher<md5_hasher, 16>;
+template class cryptopp_hasher<sha256_hasher, 32>;
--- a/hashers.hh
+++ b/hashers.hh
@@ -22,29 +22,30 @@
 #pragma once

 #include "bytes.hh"
+#include "hashing.hh"

 class md5_hasher;

-template <typename T, size_t size> class hasher {
+template <typename T, size_t size> class cryptopp_hasher : public hasher {
    struct impl;
    std::unique_ptr<impl> _impl;

 public:
-    hasher();
-    ~hasher();
-    hasher(hasher&&) noexcept;
-    hasher(const hasher&);
-    hasher& operator=(hasher&&) noexcept;
-    hasher& operator=(const hasher&);
+    cryptopp_hasher();
+    ~cryptopp_hasher();
+    cryptopp_hasher(cryptopp_hasher&&) noexcept;
+    cryptopp_hasher(const cryptopp_hasher&);
+    cryptopp_hasher& operator=(cryptopp_hasher&&) noexcept;
+    cryptopp_hasher& operator=(const cryptopp_hasher&);

    bytes finalize();
    std::array<uint8_t, size> finalize_array();
-    void update(const char* ptr, size_t length);
+    void update(const char* ptr, size_t length) override;

    // Use update and finalize to compute the hash over the full view.
    static bytes calculate(const std::string_view& s);
 };

-class md5_hasher : public hasher<md5_hasher, 16> {};
+class md5_hasher final : public cryptopp_hasher<md5_hasher, 16> {};

-class sha256_hasher : public hasher<sha256_hasher, 32> {};
+class sha256_hasher final : public cryptopp_hasher<sha256_hasher, 32> {};
--- a/hashing.hh
+++ b/hashing.hh
@@ -27,6 +27,7 @@
 #include <seastar/core/byteorder.hh>
 #include <seastar/core/sstring.hh>
 #include "seastarx.hh"
+#include <seastar/util/gcc6-concepts.hh>

 //
 // This hashing differs from std::hash<> in that it decouples knowledge about
@@ -41,24 +42,38 @@
 // appending_hash<T> is machine-independent.
 //

-// The Hasher concept
-struct Hasher {
-    void update(const char* ptr, size_t size);
+GCC6_CONCEPT(
+    template<typename H>
+    concept bool Hasher() {
+        return requires(H& h, const char* ptr, size_t size) {
+            { h.update(ptr, size) } -> void
+        };
+    }
+)
+
+class hasher { // Abstract hasher interface; the static_assert below checks that it satisfies the Hasher concept.
+public:
+    virtual ~hasher() = default; // virtual so derived hashers can be destroyed through a hasher pointer
+    virtual void update(const char* ptr, size_t size) = 0; // feed `size` bytes starting at `ptr` to the hasher
 };

+GCC6_CONCEPT(static_assert(Hasher<hasher>());)
+
 template<typename T, typename Enable = void>
 struct appending_hash;

-template<typename Hasher, typename T, typename... Args>
+template<typename H, typename T, typename... Args>
+GCC6_CONCEPT(requires Hasher<H>())
 inline
-void feed_hash(Hasher& h, const T& value, Args&&... args) {
+void feed_hash(H& h, const T& value, Args&&... args) {
    appending_hash<T>()(h, value, std::forward<Args>(args)...);
 };

 template<typename T>
 struct appending_hash<T, std::enable_if_t<std::is_arithmetic<T>::value>> {
-    template<typename Hasher>
-    void operator()(Hasher& h, T value) const {
+    template<typename H>
+    GCC6_CONCEPT(requires Hasher<H>())
+    void operator()(H& h, T value) const {
        auto value_le = cpu_to_le(value);
        h.update(reinterpret_cast<const char*>(&value_le), sizeof(T));
    }
@@ -66,24 +81,27 @@ struct appending_hash<T, std::enable_if_t<std::is_arithmetic<T>::value>> {

 template<>
 struct appending_hash<bool> {
-    template<typename Hasher>
-    void operator()(Hasher& h, bool value) const {
+    template<typename H>
+    GCC6_CONCEPT(requires Hasher<H>())
+    void operator()(H& h, bool value) const {
        feed_hash(h, static_cast<uint8_t>(value));
    }
 };

 template<typename T>
 struct appending_hash<T, std::enable_if_t<std::is_enum<T>::value>> {
-    template<typename Hasher>
-    void operator()(Hasher& h, const T& value) const {
+    template<typename H>
+    GCC6_CONCEPT(requires Hasher<H>())
+    void operator()(H& h, const T& value) const {
        feed_hash(h, static_cast<std::underlying_type_t<T>>(value));
    }
 };

 template<typename T>
 struct appending_hash<std::optional<T>>  {
-    template<typename Hasher>
-    void operator()(Hasher& h, const std::optional<T>& value) const {
+    template<typename H>
+    GCC6_CONCEPT(requires Hasher<H>())
+    void operator()(H& h, const std::optional<T>& value) const {
        if (value) {
            feed_hash(h, true);
            feed_hash(h, *value);
@@ -95,8 +113,9 @@ struct appending_hash<std::optional<T>>  {

 template<size_t N>
 struct appending_hash<char[N]>  {
-    template<typename Hasher>
-    void operator()(Hasher& h, const char (&value) [N]) const {
+    template<typename H>
+    GCC6_CONCEPT(requires Hasher<H>())
+    void operator()(H& h, const char (&value) [N]) const {
        feed_hash(h, N);
        h.update(value, N);
    }
@@ -104,8 +123,9 @@ struct appending_hash<char[N]>  {

 template<typename T>
 struct appending_hash<std::vector<T>> {
-    template<typename Hasher>
-    void operator()(Hasher& h, const std::vector<T>& value) const {
+    template<typename H>
+    GCC6_CONCEPT(requires Hasher<H>())
+    void operator()(H& h, const std::vector<T>& value) const {
        feed_hash(h, value.size());
        for (auto&& v : value) {
            appending_hash<T>()(h, v);
@@ -115,8 +135,9 @@ struct appending_hash<std::vector<T>> {

 template<typename K, typename V>
 struct appending_hash<std::map<K, V>> {
-    template<typename Hasher>
-    void operator()(Hasher& h, const std::map<K, V>& value) const {
+    template<typename H>
+    GCC6_CONCEPT(requires Hasher<H>())
+    void operator()(H& h, const std::map<K, V>& value) const {
        feed_hash(h, value.size());
        for (auto&& e : value) {
            appending_hash<K>()(h, e.first);
@@ -127,8 +148,9 @@ struct appending_hash<std::map<K, V>> {

 template<>
 struct appending_hash<sstring> {
-    template<typename Hasher>
-    void operator()(Hasher& h, const sstring& v) const {
+    template<typename H>
+    GCC6_CONCEPT(requires Hasher<H>())
+    void operator()(H& h, const sstring& v) const {
        feed_hash(h, v.size());
        h.update(reinterpret_cast<const char*>(v.cbegin()), v.size() * sizeof(sstring::value_type));
    }
@@ -136,8 +158,9 @@ struct appending_hash<sstring> {

 template<>
 struct appending_hash<std::string> {
-    template<typename Hasher>
-    void operator()(Hasher& h, const std::string& v) const {
+    template<typename H>
+    GCC6_CONCEPT(requires Hasher<H>())
+    void operator()(H& h, const std::string& v) const {
        feed_hash(h, v.size());
        h.update(reinterpret_cast<const char*>(v.data()), v.size() * sizeof(std::string::value_type));
    }
@@ -145,16 +168,18 @@ struct appending_hash<std::string> {

 template<typename T, typename R>
 struct appending_hash<std::chrono::duration<T, R>> {
-    template<typename Hasher>
-    void operator()(Hasher& h, std::chrono::duration<T, R> v) const {
+    template<typename H>
+    GCC6_CONCEPT(requires Hasher<H>())
+    void operator()(H& h, std::chrono::duration<T, R> v) const {
        feed_hash(h, v.count());
    }
 };

 template<typename Clock, typename Duration>
 struct appending_hash<std::chrono::time_point<Clock, Duration>> {
-    template<typename Hasher>
-    void operator()(Hasher& h, std::chrono::time_point<Clock, Duration> v) const {
+    template<typename H>
+    GCC6_CONCEPT(requires Hasher<H>())
+    void operator()(H& h, std::chrono::time_point<Clock, Duration> v) const {
        feed_hash(h, v.time_since_epoch().count());
    }
 };
--- a/idl/streaming.idl.hh
+++ b/idl/streaming.idl.hh
@@ -51,4 +51,10 @@ enum class stream_reason : uint8_t {
    repair,
 };

+enum class stream_mutation_fragments_cmd : uint8_t {
+    error,
+    mutation_fragment_data,
+    end_of_stream,
+};
+
 }
--- a/init.cc
+++ b/init.cc
@@ -155,6 +155,10 @@ void init_ms_fd_gossiper(sharded<gms::gossiper>& gossiper
                to_string(seeds), listen_address_in, broadcast_address);
        throw std::runtime_error("Use broadcast_address for seeds list");
    }
+    if ((!cfg.replace_address_first_boot().empty() || !cfg.replace_address().empty()) && seeds.count(broadcast_address)) {
+        startlog.error("Bad configuration: replace-address and replace-address-first-boot are not allowed for seed nodes");
+        throw bad_configuration_error();
+    }
    gossiper.local().set_seeds(seeds);
    gossiper.invoke_on_all([cluster_name](gms::gossiper& g) {
        g.set_cluster_name(cluster_name);
--- a/install.sh
+++ b/install.sh
@@ -75,6 +75,29 @@ while [ $# -gt 0 ]; do
    esac
 done

+patchelf() {
+    # patchelf comes from the build system, so it needs the build system's ld.so and
+    # shared libraries. We can't use patchelf on patchelf itself, so invoke it via
+    # ld.so.
+    LD_LIBRARY_PATH="$PWD/libreloc" libreloc/ld.so libexec/patchelf "$@"
+}
+
+adjust_bin() {
+    local bin="$1"
+    # We could add --set-rpath too, but then debugedit (called by rpmbuild) barfs
+    # on the result. So use LD_LIBRARY_PATH in the thunk, below.
+    patchelf \
+	--set-interpreter "/opt/scylladb/libreloc/ld.so" \
+	"$root/opt/scylladb/libexec/$bin"
+    cat > "$root/opt/scylladb/bin/$bin" <<EOF
+#!/bin/bash -e
+export GNUTLS_SYSTEM_PRIORITY_FILE="\${GNUTLS_SYSTEM_PRIORITY_FILE-/opt/scylladb/libreloc/gnutls.config}"
+export LD_LIBRARY_PATH="/opt/scylladb/libreloc"
+exec -a "\$0" "/opt/scylladb/libexec/$bin" "\$@"
+EOF
+    chmod +x "$root/opt/scylladb/bin/$bin"
+}
+
 rprefix="$root/$prefix"
 retc="$root/etc"
 rdoc="$rprefix/share/doc"
@@ -105,16 +128,13 @@ install -m644 dist/common/systemd/*.service -Dt "$rprefix"/lib/systemd/system
 install -m644 dist/common/systemd/*.timer -Dt "$rprefix"/lib/systemd/system
 install -m755 seastar/scripts/seastar-cpu-map.sh -Dt "$rprefix"/lib/scylla/
 install -m755 seastar/dpdk/usertools/dpdk-devbind.py -Dt "$rprefix"/lib/scylla/
-install -m755 bin/* -Dt "$root/opt/scylladb/bin"
+install -m755 libreloc/* -Dt "$root/opt/scylladb/libreloc"
 # some files in libexec are symlinks, which "install" dereferences
 # use cp -P for the symlinks instead.
-install -m755 libexec/*.bin -Dt "$root/opt/scylladb/libexec"
-for f in libexec/*; do
-    if [[ "$f" != *.bin ]]; then
-        cp -P "$f" "$root/opt/scylladb/libexec"
-    fi
+install -m755 libexec/* -Dt "$root/opt/scylladb/libexec"
+for bin in libexec/*; do
+    adjust_bin "${bin#libexec/}"
 done
-install -m755 libreloc/* -Dt "$root/opt/scylladb/libreloc"
 ln -srf "$root/opt/scylladb/bin/scylla" "$rprefix/bin/scylla"
 ln -srf "$root/opt/scylladb/bin/iotune" "$rprefix/bin/iotune"
 ln -srf "$rprefix/lib/scylla/scyllatop/scyllatop.py" "$rprefix/bin/scyllatop"
--- a/main.cc
+++ b/main.cc
@@ -340,15 +340,7 @@ int main(int ac, char** av) {
    auto cfg = make_lw_shared<db::config>(ext);
    auto init = app.get_options_description().add_options();

-    // If --version is requested, print it out and exit immediately to avoid
-    // Seastar-specific warnings that may occur when running the app
    init("version", bpo::bool_switch(), "print version number and exit");
-    bpo::variables_map vm;
-    bpo::store(bpo::command_line_parser(ac, av).options(app.get_options_description()).allow_unregistered().run(), vm);
-    if (vm["version"].as<bool>()) {
-        fmt::print("{}\n", scylla_version());
-        return 0;
-    }

    bpo::options_description deprecated("Deprecated options - ignored");
    deprecated.add_options()
@@ -362,6 +354,15 @@ int main(int ac, char** av) {
    configurable::append_all(*cfg, init);
    cfg->add_options(init);

+    // If --version is requested, print it out and exit immediately to avoid
+    // Seastar-specific warnings that may occur when running the app
+    bpo::variables_map vm;
+    bpo::store(bpo::command_line_parser(ac, av).options(app.get_options_description()).allow_unregistered().run(), vm);
+    if (vm["version"].as<bool>()) {
+        fmt::print("{}\n", scylla_version());
+        return 0;
+    }
+
    distributed<database> db;
    seastar::sharded<service::cache_hitrate_calculator> cf_cache_hitrate_calculator;
    debug::db = &db;
@@ -526,6 +527,9 @@ int main(int ac, char** av) {
            if (opts.count("developer-mode")) {
                smp::invoke_on_all([] { engine().set_strict_dma(false); }).get();
            }
+
+            set_abort_on_internal_error(cfg->abort_on_internal_error());
+
            supervisor::notify("creating tracing");
            tracing::backend_registry tracing_backend_registry;
            tracing::register_tracing_keyspace_backend(tracing_backend_registry);
@@ -916,8 +920,10 @@ int main(int ac, char** av) {
                service::get_local_storage_service().drain_on_shutdown().get();
            });

-            auto stop_view_builder = defer([] {
-                view_builder.stop().get();
+            auto stop_view_builder = defer([cfg] {
+                if (cfg->view_building()) {
+                    view_builder.stop().get();
+                }
            });

            auto stop_compaction_manager = defer([&db] {
--- a/message/messaging_service.cc
+++ b/message/messaging_service.cc
@@ -89,6 +89,7 @@
 #include "frozen_mutation.hh"
 #include "flat_mutation_reader.hh"
 #include "streaming/stream_manager.hh"
+#include "streaming/stream_mutation_fragments_cmd.hh"

 namespace netw {

@@ -287,7 +288,6 @@ void messaging_service::start_listen() {
    if (_compress_what != compress_what::none) {
        so.compressor_factory = &compressor_factory;
    }
-    so.streaming_domain = rpc::streaming_domain_type(0x55AA);
    so.load_balancing_algorithm = server_socket::load_balancing_algorithm::port;

    // FIXME: we don't set so.tcp_nodelay, because we can't tell at this point whether the connection will come from a
@@ -295,19 +295,21 @@ void messaging_service::start_listen() {
    //        the first by wrapping its server_socket, but not the second.
    auto limits = rpc_resource_limits(_mcfg.rpc_memory_limit);
    if (!_server[0]) {
-        auto listen = [&] (const gms::inet_address& a) {
+        auto listen = [&] (const gms::inet_address& a, rpc::streaming_domain_type sdomain) {
+            so.streaming_domain = sdomain;
            auto addr = ipv4_addr{a.raw_addr(), _port};
            return std::unique_ptr<rpc_protocol_server_wrapper>(new rpc_protocol_server_wrapper(*_rpc,
                    so, addr, limits));
        };
-        _server[0] = listen(_listen_address);
+        _server[0] = listen(_listen_address, rpc::streaming_domain_type(0x55AA));
        if (listen_to_bc) {
-            _server[1] = listen(utils::fb_utilities::get_broadcast_address());
+            _server[1] = listen(utils::fb_utilities::get_broadcast_address(), rpc::streaming_domain_type(0x66BB));
        }
    }

    if (!_server_tls[0]) {
-        auto listen = [&] (const gms::inet_address& a) {
+        auto listen = [&] (const gms::inet_address& a, rpc::streaming_domain_type sdomain) {
+            so.streaming_domain = sdomain;
            return std::unique_ptr<rpc_protocol_server_wrapper>(
                    [this, &so, &a, limits] () -> std::unique_ptr<rpc_protocol_server_wrapper>{
                if (_encrypt_what == encrypt_what::none) {
@@ -321,9 +323,9 @@ void messaging_service::start_listen() {
                        so, seastar::tls::listen(_credentials, addr, lo), limits);
            }());
        };
-        _server_tls[0] = listen(_listen_address);
+        _server_tls[0] = listen(_listen_address, rpc::streaming_domain_type(0x77CC));
        if (listen_to_bc) {
-            _server_tls[1] = listen(utils::fb_utilities::get_broadcast_address());
+            _server_tls[1] = listen(utils::fb_utilities::get_broadcast_address(), rpc::streaming_domain_type(0x88DD));
        }
    }
    // Do this on just cpu 0, to avoid duplicate logs.
@@ -607,6 +609,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
        opts.compressor_factory = &compressor_factory;
    }
    opts.tcp_nodelay = must_tcp_nodelay;
+    opts.reuseaddr = true;

    auto client = must_encrypt ?
                    ::make_shared<rpc_protocol_client_wrapper>(*_rpc, std::move(opts),
@@ -668,24 +671,24 @@ std::unique_ptr<messaging_service::rpc_protocol_wrapper>& messaging_service::rpc
    return _rpc;
 }

-rpc::sink<int32_t> messaging_service::make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment>& source) {
+rpc::sink<int32_t> messaging_service::make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>>& source) {
    return source.make_sink<netw::serializer, int32_t>();
 }

-future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>>
+future<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>>
 messaging_service::make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id) {
    auto rpc_client = get_rpc_client(messaging_verb::STREAM_MUTATION_FRAGMENTS, id);
-    return rpc_client->make_stream_sink<netw::serializer, frozen_mutation_fragment>().then([this, plan_id, schema_id, cf_id, estimated_partitions, reason, rpc_client] (rpc::sink<frozen_mutation_fragment> sink) mutable {
-        auto rpc_handler = rpc()->make_client<rpc::source<int32_t> (utils::UUID, utils::UUID, utils::UUID, uint64_t, streaming::stream_reason, rpc::sink<frozen_mutation_fragment>)>(messaging_verb::STREAM_MUTATION_FRAGMENTS);
+    return rpc_client->make_stream_sink<netw::serializer, frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>().then([this, plan_id, schema_id, cf_id, estimated_partitions, reason, rpc_client] (rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd> sink) mutable {
+        auto rpc_handler = rpc()->make_client<rpc::source<int32_t> (utils::UUID, utils::UUID, utils::UUID, uint64_t, streaming::stream_reason, rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>)>(messaging_verb::STREAM_MUTATION_FRAGMENTS);
        return rpc_handler(*rpc_client , plan_id, schema_id, cf_id, estimated_partitions, reason, sink).then_wrapped([sink, rpc_client] (future<rpc::source<int32_t>> source) mutable {
            return (source.failed() ? sink.close() : make_ready_future<>()).then([sink = std::move(sink), source = std::move(source)] () mutable {
-                return make_ready_future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>>(std::move(sink), std::move(source.get0()));
+                return make_ready_future<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>>(std::move(sink), std::move(source.get0()));
            });
        });
    });
 }

-void messaging_service::register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason>, rpc::source<frozen_mutation_fragment> source)>&& func) {
+void messaging_service::register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason>, rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>> source)>&& func) {
    register_handler(this, messaging_verb::STREAM_MUTATION_FRAGMENTS, std::move(func));
 }

@@ -1077,14 +1080,14 @@ future<> messaging_service::send_repair_put_row_diff(msg_addr id, uint32_t repai
 }

 // Wrapper for REPAIR_ROW_LEVEL_START
-void messaging_service::register_repair_row_level_start(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name)>&& func) {
+void messaging_service::register_repair_row_level_start(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version)>&& func) {
    register_handler(this, messaging_verb::REPAIR_ROW_LEVEL_START, std::move(func));
 }
 void messaging_service::unregister_repair_row_level_start() {
    _rpc->unregister_handler(messaging_verb::REPAIR_ROW_LEVEL_START);
 }
-future<> messaging_service::send_repair_row_level_start(msg_addr id, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name) {
-    return send_message<void>(this, messaging_verb::REPAIR_ROW_LEVEL_START, std::move(id), repair_meta_id, std::move(keyspace_name), std::move(cf_name), std::move(range), algo, max_row_buf_size, seed, remote_shard, remote_shard_count, remote_ignore_msb, std::move(remote_partitioner_name));
+future<> messaging_service::send_repair_row_level_start(msg_addr id, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version) {
+    return send_message<void>(this, messaging_verb::REPAIR_ROW_LEVEL_START, std::move(id), repair_meta_id, std::move(keyspace_name), std::move(cf_name), std::move(range), algo, max_row_buf_size, seed, remote_shard, remote_shard_count, remote_ignore_msb, std::move(remote_partitioner_name), std::move(schema_version));
 }

 // Wrapper for REPAIR_ROW_LEVEL_STOP
--- a/message/messaging_service.hh
+++ b/message/messaging_service.hh
@@ -36,6 +36,7 @@
 #include "tracing/tracing.hh"
 #include "digest_algorithm.hh"
 #include "streaming/stream_reason.hh"
+#include "streaming/stream_mutation_fragments_cmd.hh"
 #include "cache_temperature.hh"

 #include <list>
@@ -270,9 +271,9 @@ public:

    // Wrapper for STREAM_MUTATION_FRAGMENTS
    // The receiver of STREAM_MUTATION_FRAGMENTS sends a status code to the sender to report any error on the receiver side. The status code is of type int32_t: 0 means success, -1 means error, and all other status code values are reserved for future use.
-    void register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason> reason_opt, rpc::source<frozen_mutation_fragment> source)>&& func);
-    rpc::sink<int32_t> make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment>& source);
-    future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>> make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id);
+    void register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason> reason_opt, rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>> source)>&& func);
+    rpc::sink<int32_t> make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>>& source);
+    future<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>> make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id);

    void register_stream_mutation_done(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id)>&& func);
    future<> send_stream_mutation_done(msg_addr id, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id);
@@ -311,9 +312,9 @@ public:
    future<> send_repair_put_row_diff(msg_addr id, uint32_t repair_meta_id, repair_rows_on_wire row_diff);

    // Wrapper for REPAIR_ROW_LEVEL_START
-    void register_repair_row_level_start(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name)>&& func);
+    void register_repair_row_level_start(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version)>&& func);
    void unregister_repair_row_level_start();
-    future<> send_repair_row_level_start(msg_addr id, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name);
+    future<> send_repair_row_level_start(msg_addr id, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version);

    // Wrapper for REPAIR_ROW_LEVEL_STOP
    void register_repair_row_level_stop(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range)>&& func);
--- a/mutation_partition.cc
+++ b/mutation_partition.cc
@@ -145,7 +145,14 @@ mutation_partition::mutation_partition(const schema& s, const mutation_partition
        , _static_row(s, column_kind::static_column, x._static_row)
        , _static_row_continuous(x._static_row_continuous)
        , _rows()
-        , _row_tombstones(x._row_tombstones) {
+        , _row_tombstones(x._row_tombstones)
+#ifdef SEASTAR_DEBUG
+        , _schema_version(s.version())
+#endif
+{
+#ifdef SEASTAR_DEBUG
+    assert(x._schema_version == _schema_version);
+#endif
    auto cloner = [&s] (const auto& x) {
        return current_allocator().construct<rows_entry>(s, x);
    };
@@ -158,7 +165,14 @@ mutation_partition::mutation_partition(const mutation_partition& x, const schema
        , _static_row(schema, column_kind::static_column, x._static_row)
        , _static_row_continuous(x._static_row_continuous)
        , _rows()
-        , _row_tombstones(x._row_tombstones, range_tombstone_list::copy_comparator_only()) {
+        , _row_tombstones(x._row_tombstones, range_tombstone_list::copy_comparator_only())
+#ifdef SEASTAR_DEBUG
+        , _schema_version(schema.version())
+#endif
+{
+#ifdef SEASTAR_DEBUG
+    assert(x._schema_version == _schema_version);
+#endif
    try {
        for(auto&& r : ck_ranges) {
            for (const rows_entry& e : x.range(schema, r)) {
@@ -181,7 +195,13 @@ mutation_partition::mutation_partition(mutation_partition&& x, const schema& sch
    , _static_row_continuous(x._static_row_continuous)
    , _rows(std::move(x._rows))
    , _row_tombstones(std::move(x._row_tombstones))
+#ifdef SEASTAR_DEBUG
+    , _schema_version(schema.version())
+#endif
 {
+#ifdef SEASTAR_DEBUG
+    assert(x._schema_version == _schema_version);
+#endif
    {
        auto deleter = current_deleter<rows_entry>();
        auto it = _rows.begin();
@@ -221,6 +241,7 @@ mutation_partition::operator=(mutation_partition&& x) noexcept {
 }

 void mutation_partition::ensure_last_dummy(const schema& s) {
+    check_schema(s);
    if (_rows.empty() || !_rows.rbegin()->is_last_dummy()) {
        _rows.insert_before(_rows.end(),
            *current_allocator().construct<rows_entry>(s, rows_entry::last_dummy_tag(), is_continuous::yes));
@@ -277,11 +298,16 @@ void deletable_row::apply(const schema& s, clustering_row cr) {

 void
 mutation_partition::apply(const schema& s, const mutation_fragment& mf) {
+    check_schema(s);
    mutation_fragment_applier applier{s, *this};
    mf.visit(applier);
 }

 stop_iteration mutation_partition::apply_monotonically(const schema& s, mutation_partition&& p, cache_tracker* tracker, is_preemptible preemptible) {
+#ifdef SEASTAR_DEBUG
+    assert(s.version() == _schema_version);
+    assert(p._schema_version == _schema_version);
+#endif
    _tombstone.apply(p._tombstone);
    _static_row.apply_monotonically(s, column_kind::static_column, std::move(p._static_row));
    _static_row_continuous |= p._static_row_continuous;
@@ -387,6 +413,7 @@ void mutation_partition::apply_weak(const schema& s, mutation_partition&& p) {

 tombstone
 mutation_partition::range_tombstone_for_row(const schema& schema, const clustering_key& key) const {
+    check_schema(schema);
    tombstone t = _tombstone;
    if (!_row_tombstones.empty()) {
        auto found = _row_tombstones.search_tombstone_covering(schema, key);
@@ -397,6 +424,7 @@ mutation_partition::range_tombstone_for_row(const schema& schema, const clusteri

 row_tombstone
 mutation_partition::tombstone_for_row(const schema& schema, const clustering_key& key) const {
+    check_schema(schema);
    row_tombstone t = row_tombstone(range_tombstone_for_row(schema, key));

    auto j = _rows.find(key, rows_entry::compare(schema));
@@ -409,6 +437,7 @@ mutation_partition::tombstone_for_row(const schema& schema, const clustering_key

 row_tombstone
 mutation_partition::tombstone_for_row(const schema& schema, const rows_entry& e) const {
+    check_schema(schema);
    row_tombstone t = e.row().deleted_at();
    t.apply(range_tombstone_for_row(schema, e.key()));
    return t;
@@ -416,6 +445,7 @@ mutation_partition::tombstone_for_row(const schema& schema, const rows_entry& e)

 void
 mutation_partition::apply_row_tombstone(const schema& schema, clustering_key_prefix prefix, tombstone t) {
+    check_schema(schema);
    assert(!prefix.is_full(schema));
    auto start = prefix;
    _row_tombstones.apply(schema, {std::move(start), std::move(prefix), std::move(t)});
@@ -423,11 +453,13 @@ mutation_partition::apply_row_tombstone(const schema& schema, clustering_key_pre

 void
 mutation_partition::apply_row_tombstone(const schema& schema, range_tombstone rt) {
+    check_schema(schema);
    _row_tombstones.apply(schema, std::move(rt));
 }

 void
 mutation_partition::apply_delete(const schema& schema, const clustering_key_prefix& prefix, tombstone t) {
+    check_schema(schema);
    if (prefix.is_empty(schema)) {
        apply(t);
    } else if (prefix.is_full(schema)) {
@@ -439,6 +471,7 @@ mutation_partition::apply_delete(const schema& schema, const clustering_key_pref

 void
 mutation_partition::apply_delete(const schema& schema, range_tombstone rt) {
+    check_schema(schema);
    if (range_tombstone::is_single_clustering_row_tombstone(schema, rt.start, rt.start_kind, rt.end, rt.end_kind)) {
        apply_delete(schema, std::move(rt.start), std::move(rt.tomb));
        return;
@@ -448,6 +481,7 @@ mutation_partition::apply_delete(const schema& schema, range_tombstone rt) {

 void
 mutation_partition::apply_delete(const schema& schema, clustering_key&& prefix, tombstone t) {
+    check_schema(schema);
    if (prefix.is_empty(schema)) {
        apply(t);
    } else if (prefix.is_full(schema)) {
@@ -459,6 +493,7 @@ mutation_partition::apply_delete(const schema& schema, clustering_key&& prefix,

 void
 mutation_partition::apply_delete(const schema& schema, clustering_key_prefix_view prefix, tombstone t) {
+    check_schema(schema);
    if (prefix.is_empty(schema)) {
        apply(t);
    } else if (prefix.is_full(schema)) {
@@ -484,6 +519,7 @@ void mutation_partition::insert_row(const schema& s, const clustering_key& key,
 }

 void mutation_partition::insert_row(const schema& s, const clustering_key& key, const deletable_row& row) {
+    check_schema(s);
    auto e = alloc_strategy_unique_ptr<rows_entry>(
        current_allocator().construct<rows_entry>(s, key, row));
    _rows.insert(_rows.end(), *e, rows_entry::compare(s));
@@ -492,6 +528,7 @@ void mutation_partition::insert_row(const schema& s, const clustering_key& key,

 const row*
 mutation_partition::find_row(const schema& s, const clustering_key& key) const {
+    check_schema(s);
    auto i = _rows.find(key, rows_entry::compare(s));
    if (i == _rows.end()) {
        return nullptr;
@@ -501,6 +538,7 @@ mutation_partition::find_row(const schema& s, const clustering_key& key) const {

 deletable_row&
 mutation_partition::clustered_row(const schema& s, clustering_key&& key) {
+    check_schema(s);
    auto i = _rows.find(key, rows_entry::compare(s));
    if (i == _rows.end()) {
        auto e = alloc_strategy_unique_ptr<rows_entry>(
@@ -513,6 +551,7 @@ mutation_partition::clustered_row(const schema& s, clustering_key&& key) {

 deletable_row&
 mutation_partition::clustered_row(const schema& s, const clustering_key& key) {
+    check_schema(s);
    auto i = _rows.find(key, rows_entry::compare(s));
    if (i == _rows.end()) {
        auto e = alloc_strategy_unique_ptr<rows_entry>(
@@ -525,6 +564,7 @@ mutation_partition::clustered_row(const schema& s, const clustering_key& key) {

 deletable_row&
 mutation_partition::clustered_row(const schema& s, clustering_key_view key) {
+    check_schema(s);
    auto i = _rows.find(key, rows_entry::compare(s));
    if (i == _rows.end()) {
        auto e = alloc_strategy_unique_ptr<rows_entry>(
@@ -537,6 +577,7 @@ mutation_partition::clustered_row(const schema& s, clustering_key_view key) {

 deletable_row&
 mutation_partition::clustered_row(const schema& s, position_in_partition_view pos, is_dummy dummy, is_continuous continuous) {
+    check_schema(s);
    auto i = _rows.find(pos, rows_entry::compare(s));
    if (i == _rows.end()) {
        auto e = alloc_strategy_unique_ptr<rows_entry>(
@@ -549,6 +590,7 @@ mutation_partition::clustered_row(const schema& s, position_in_partition_view po

 mutation_partition::rows_type::const_iterator
 mutation_partition::lower_bound(const schema& schema, const query::clustering_range& r) const {
+    check_schema(schema);
    if (!r.start()) {
        return std::cbegin(_rows);
    }
@@ -557,6 +599,7 @@ mutation_partition::lower_bound(const schema& schema, const query::clustering_ra

 mutation_partition::rows_type::const_iterator
 mutation_partition::upper_bound(const schema& schema, const query::clustering_range& r) const {
+    check_schema(schema);
    if (!r.end()) {
        return std::cend(_rows);
    }
@@ -565,6 +608,7 @@ mutation_partition::upper_bound(const schema& schema, const query::clustering_ra

 boost::iterator_range<mutation_partition::rows_type::const_iterator>
 mutation_partition::range(const schema& schema, const query::clustering_range& r) const {
+    check_schema(schema);
    return boost::make_iterator_range(lower_bound(schema, r), upper_bound(schema, r));
 }

@@ -601,6 +645,7 @@ mutation_partition::upper_bound(const schema& schema, const query::clustering_ra
 template<typename Func>
 void mutation_partition::for_each_row(const schema& schema, const query::clustering_range& row_range, bool reversed, Func&& func) const
 {
+    check_schema(schema);
    auto r = range(schema, row_range);
    if (!reversed) {
        for (const auto& e : r) {
@@ -817,6 +862,7 @@ bool has_any_live_data(const schema& s, column_kind kind, const row& cells, tomb

 void
 mutation_partition::query_compacted(query::result::partition_writer& pw, const schema& s, uint32_t limit) const {
+    check_schema(s);
    const query::partition_slice& slice = pw.slice();
    max_timestamp max_ts{pw.last_modified()};

@@ -1049,6 +1095,10 @@ bool mutation_partition::equal(const schema& s, const mutation_partition& p) con
 }

 bool mutation_partition::equal(const schema& this_schema, const mutation_partition& p, const schema& p_schema) const {
+#ifdef SEASTAR_DEBUG
+    assert(_schema_version == this_schema.version());
+    assert(p._schema_version == p_schema.version());
+#endif
    if (_tombstone != p._tombstone) {
        return false;
    }
@@ -1177,6 +1227,7 @@ row::apply_monotonically(const column_definition& column, atomic_cell_or_collect
 void
 row::append_cell(column_id id, atomic_cell_or_collection value) {
    if (_type == storage_type::vector && id < max_vector_size) {
+        assert(_storage.vector.v.size() <= id);
        _storage.vector.v.resize(id);
        _storage.vector.v.emplace_back(cell_and_hash{std::move(value), cell_hash_opt()});
        _storage.vector.present.set(id);
@@ -1241,6 +1292,7 @@ size_t rows_entry::memory_usage(const schema& s) const {
 }

 size_t mutation_partition::external_memory_usage(const schema& s) const {
+    check_schema(s);
    size_t sum = 0;
    sum += static_row().external_memory_usage(s, column_kind::static_column);
    for (auto& clr : clustered_rows()) {
@@ -1259,6 +1311,7 @@ void mutation_partition::trim_rows(const schema& s,
    const std::vector<query::clustering_range>& row_ranges,
    Func&& func)
 {
+    check_schema(s);
    static_assert(std::is_same<stop_iteration, std::result_of_t<Func(rows_entry&)>>::value, "Bad func signature");

    stop_iteration stop = stop_iteration::no;
@@ -1303,6 +1356,7 @@ uint32_t mutation_partition::do_compact(const schema& s,
    uint32_t row_limit,
    can_gc_fn& can_gc)
 {
+    check_schema(s);
    assert(row_limit > 0);

    auto gc_before = saturating_subtract(query_time, s.gc_grace_seconds());
@@ -1368,12 +1422,14 @@ mutation_partition::compact_for_query(
    bool reverse,
    uint32_t row_limit)
 {
+    check_schema(s);
    return do_compact(s, query_time, row_ranges, reverse, row_limit, always_gc);
 }

 void mutation_partition::compact_for_compaction(const schema& s,
    can_gc_fn& can_gc, gc_clock::time_point compaction_time)
 {
+    check_schema(s);
    static const std::vector<query::clustering_range> all_rows = {
        query::clustering_range::make_open_ended_both_sides()
    };
@@ -1407,11 +1463,13 @@ row::is_live(const schema& s, column_kind kind, tombstone base_tombstone, gc_clo

 bool
 mutation_partition::is_static_row_live(const schema& s, gc_clock::time_point query_time) const {
+    check_schema(s);
    return has_any_live_data(s, column_kind::static_column, static_row(), _tombstone, query_time);
 }

 size_t
 mutation_partition::live_row_count(const schema& s, gc_clock::time_point query_time) const {
+    check_schema(s);
    size_t count = 0;

    for (const rows_entry& e : non_dummy_rows()) {
@@ -1757,6 +1815,7 @@ row row::difference(const schema& s, column_kind kind, const row& other) const

 mutation_partition mutation_partition::difference(schema_ptr s, const mutation_partition& other) const
 {
+    check_schema(*s);
    mutation_partition mp(s);
    if (_tombstone > other._tombstone) {
        mp.apply(_tombstone);
@@ -1787,6 +1846,7 @@ mutation_partition mutation_partition::difference(schema_ptr s, const mutation_p
 }

 void mutation_partition::accept(const schema& s, mutation_partition_visitor& v) const {
+    check_schema(s);
    v.accept_partition_tombstone(_tombstone);
    _static_row.for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
        const column_definition& def = s.static_column_at(id);
@@ -2200,6 +2260,9 @@ mutation_partition::mutation_partition(mutation_partition::incomplete_tag, const
    , _static_row_continuous(!s.has_static_columns())
    , _rows()
    , _row_tombstones(s)
+#ifdef SEASTAR_DEBUG
+    , _schema_version(s.version())
+#endif
 {
    _rows.insert_before(_rows.end(),
        *current_allocator().construct<rows_entry>(s, rows_entry::last_dummy_tag(), is_continuous::no));
@@ -2265,6 +2328,7 @@ void mutation_partition::set_continuity(const schema& s, const position_range& p
 }

 clustering_interval_set mutation_partition::get_continuity(const schema& s, is_continuous cont) const {
+    check_schema(s);
    clustering_interval_set result;
    auto i = _rows.begin();
    auto prev_pos = position_in_partition::before_all_clustered_rows();
@@ -2314,6 +2378,7 @@ stop_iteration mutation_partition::clear_gently(cache_tracker* tracker) noexcept

 bool
 mutation_partition::check_continuity(const schema& s, const position_range& r, is_continuous cont) const {
+    check_schema(s);
    auto less = rows_entry::compare(s);
    auto i = _rows.lower_bound(r.start(), less);
    auto end = _rows.lower_bound(r.end(), less);
--- a/mutation_partition.hh
+++ b/mutation_partition.hh
@@ -940,6 +940,9 @@ private:
    // Contains only strict prefixes so that we don't have to lookup full keys
    // in both _row_tombstones and _rows.
    range_tombstone_list _row_tombstones;
+#ifdef SEASTAR_DEBUG
+    table_schema_version _schema_version;
+#endif

    friend class mutation_partition_applier;
    friend class converting_mutation_partition_applier;
@@ -954,10 +957,16 @@ public:
    mutation_partition(schema_ptr s)
        : _rows()
        , _row_tombstones(*s)
+#ifdef SEASTAR_DEBUG
+        , _schema_version(s->version())
+#endif
    { }
    mutation_partition(mutation_partition& other, copy_comparators_only)
        : _rows()
        , _row_tombstones(other._row_tombstones, range_tombstone_list::copy_comparator_only())
+#ifdef SEASTAR_DEBUG
+        , _schema_version(other._schema_version)
+#endif
    { }
    mutation_partition(mutation_partition&&) = default;
    mutation_partition(const schema& s, const mutation_partition&);
@@ -1181,6 +1190,12 @@ private:
    template<typename Func>
    void for_each_row(const schema& schema, const query::clustering_range& row_range, bool reversed, Func&& func) const;
    friend class counter_write_query_result_builder;
+
+    void check_schema(const schema& s) const {
+#ifdef SEASTAR_DEBUG
+        assert(s.version() == _schema_version);
+#endif
+    }
 };

 inline
--- a/mutation_reader.cc
+++ b/mutation_reader.cc
@@ -910,9 +910,10 @@ class shard_reader : public enable_lw_shared_from_this<shard_reader>, public fla
        bool _reader_created = false;
        bool _drop_partition_start = false;
        bool _drop_static_row = false;
+        position_in_partition::tri_compare _tri_cmp;

        std::optional<dht::decorated_key> _last_pkey;
-        std::optional<position_in_partition> _last_position_in_partition;
+        position_in_partition _next_position_in_partition = position_in_partition::for_partition_start();
        // These are used when the reader has to be recreated (after having been
        // evicted while paused) and the range and/or slice it is recreated with
        // differs from the original ones.
@@ -920,13 +921,13 @@ class shard_reader : public enable_lw_shared_from_this<shard_reader>, public fla
        std::optional<query::partition_slice> _slice_override;

    private:
-        void update_last_position(const circular_buffer<mutation_fragment>& buffer);
+        void update_next_position(flat_mutation_reader& reader, circular_buffer<mutation_fragment>& buffer);
        void adjust_partition_slice();
        flat_mutation_reader recreate_reader();
        flat_mutation_reader resume_or_create_reader();
+        bool should_drop_fragment(const mutation_fragment& mf);
        future<> do_fill_buffer(flat_mutation_reader& reader, db::timeout_clock::time_point timeout);
-        future<> ensure_buffer_contains_all_fragments_for_last_pos(flat_mutation_reader& reader, circular_buffer<mutation_fragment>& buffer,
-                db::timeout_clock::time_point timeout);
+        future<> fill_buffer(flat_mutation_reader& reader, circular_buffer<mutation_fragment>& buffer, db::timeout_clock::time_point timeout);

    public:
        remote_reader(
@@ -1024,7 +1025,7 @@ void shard_reader::stop() noexcept {
    }).finally([zis = shared_from_this()] {}));
 }

-void shard_reader::remote_reader::update_last_position(const circular_buffer<mutation_fragment>& buffer) {
+void shard_reader::remote_reader::update_next_position(flat_mutation_reader& reader, circular_buffer<mutation_fragment>& buffer) {
    if (buffer.empty()) {
        return;
    }
@@ -1035,7 +1036,31 @@ void shard_reader::remote_reader::update_last_position(const circular_buffer<mut
        _last_pkey = pk_it->as_partition_start().key();
    }

-    _last_position_in_partition.emplace(buffer.back().position());
+    const auto last_pos = buffer.back().position();
+    switch (last_pos.region()) {
+        case partition_region::partition_start:
+            _next_position_in_partition = position_in_partition::for_static_row();
+            break;
+        case partition_region::static_row:
+            _next_position_in_partition = position_in_partition::before_all_clustered_rows();
+            break;
+        case partition_region::clustered:
+            if (reader.is_buffer_empty()) {
+                _next_position_in_partition = position_in_partition::after_key(last_pos);
+            } else {
+               const auto& next_frag = reader.peek_buffer();
+               if (next_frag.is_end_of_partition()) {
+                   buffer.emplace_back(reader.pop_mutation_fragment());
+                   _next_position_in_partition = position_in_partition::for_partition_start();
+               } else {
+                   _next_position_in_partition = position_in_partition(next_frag.position());
+               }
+            }
+            break;
+        case partition_region::partition_end:
+           _next_position_in_partition = position_in_partition::for_partition_start();
+           break;
+    }
 }

 void shard_reader::remote_reader::adjust_partition_slice() {
@@ -1043,9 +1068,8 @@ void shard_reader::remote_reader::adjust_partition_slice() {
        _slice_override = _ps;
    }

-    auto& last_ckey = _last_position_in_partition->key();
    auto ranges = _slice_override->default_row_ranges();
-    query::trim_clustering_row_ranges_to(*_schema, ranges, last_ckey);
+    query::trim_clustering_row_ranges_to(*_schema, ranges, _next_position_in_partition);

    _slice_override->clear_ranges();
    _slice_override->set_range(*_schema, _last_pkey->key(), std::move(ranges));
@@ -1058,25 +1082,22 @@ flat_mutation_reader shard_reader::remote_reader::recreate_reader() {
    if (_last_pkey) {
        bool partition_range_is_inclusive = true;

-        if (_last_position_in_partition) {
-            switch (_last_position_in_partition->region()) {
-            case partition_region::partition_start:
-                _drop_partition_start = true;
-                break;
-            case partition_region::static_row:
-                _drop_partition_start = true;
-                _drop_static_row = true;
-                break;
-            case partition_region::clustered:
-                _drop_partition_start = true;
-                _drop_static_row = true;
-                adjust_partition_slice();
-                slice = &*_slice_override;
-                break;
-            case partition_region::partition_end:
-                partition_range_is_inclusive = false;
-                break;
-            }
+        switch (_next_position_in_partition.region()) {
+        case partition_region::partition_start:
+            partition_range_is_inclusive = false;
+            break;
+        case partition_region::static_row:
+            _drop_partition_start = true;
+            break;
+        case partition_region::clustered:
+            _drop_partition_start = true;
+            _drop_static_row = true;
+            adjust_partition_slice();
+            slice = &*_slice_override;
+            break;
+        case partition_region::partition_end:
+            partition_range_is_inclusive = false;
+            break;
        }

        // The original range contained a single partition and we've read it
@@ -1115,62 +1136,83 @@ flat_mutation_reader shard_reader::remote_reader::resume_or_create_reader() {
    return recreate_reader();
 }

+bool shard_reader::remote_reader::should_drop_fragment(const mutation_fragment& mf) {
+    // Pick the pending-drop flag matching this fragment's kind, if any.
+    bool* flag = mf.is_partition_start() ? &_drop_partition_start
+            : mf.is_static_row() ? &_drop_static_row : nullptr;
+    if (!flag || !*flag) {
+        return false;
+    }
+    // Each flag is one-shot: clear it once its fragment has been matched.
+    *flag = false;
+    return true;
+}
+
 future<> shard_reader::remote_reader::do_fill_buffer(flat_mutation_reader& reader, db::timeout_clock::time_point timeout) {
    if (!_drop_partition_start && !_drop_static_row) {
        return reader.fill_buffer(timeout);
    }
    return repeat([this, &reader, timeout] {
        return reader.fill_buffer(timeout).then([this, &reader] {
-            const auto eos = reader.is_end_of_stream();
-
-            if (reader.is_buffer_empty()) {
-                return stop_iteration(eos);
+            while (!reader.is_buffer_empty() && should_drop_fragment(reader.peek_buffer())) {
+                reader.pop_mutation_fragment();
            }
-            if (_drop_partition_start) {
-                _drop_partition_start = false;
-                if (reader.peek_buffer().is_partition_start()) {
-                    reader.pop_mutation_fragment();
-                }
-            }
-
-            if (reader.is_buffer_empty()) {
-                return stop_iteration(eos);
-            }
-            if (_drop_static_row) {
-                _drop_static_row = false;
-                if (reader.peek_buffer().is_static_row()) {
-                    reader.pop_mutation_fragment();
-                }
-            }
-
-            return stop_iteration(reader.is_buffer_full() || eos);
+            return stop_iteration(reader.is_buffer_full() || reader.is_end_of_stream());
        });
    });
 }

-future<> shard_reader::remote_reader::ensure_buffer_contains_all_fragments_for_last_pos(flat_mutation_reader& reader,
-        circular_buffer<mutation_fragment>& buffer, db::timeout_clock::time_point timeout) {
-    if (buffer.empty() || !buffer.back().is_range_tombstone()) {
-        return make_ready_future<>();
-    }
-
-    auto stop = [this, &reader, &buffer] {
+future<> shard_reader::remote_reader::fill_buffer(flat_mutation_reader& reader, circular_buffer<mutation_fragment>& buffer,
+        db::timeout_clock::time_point timeout) {
+    return do_fill_buffer(reader, timeout).then([this, &reader, &buffer, timeout] {
        if (reader.is_buffer_empty()) {
-            return reader.is_end_of_stream();
+            return make_ready_future<>();
        }
-        const auto& next_pos = reader.peek_buffer().position();
-        if (next_pos.region() != partition_region::clustered) {
-            return true;
-        }
-        return !next_pos.key().equal(*_schema, buffer.back().position().key());
-    };
-
-    return do_until(stop, [this, &reader, &buffer, timeout] {
-        if (reader.is_buffer_empty()) {
-            return do_fill_buffer(reader, timeout);
-        }
-        buffer.emplace_back(reader.pop_mutation_fragment());
-        return make_ready_future<>();
+        buffer = reader.detach_buffer();
+        auto stop = [this, &reader, &buffer] {
+            // The only problematic fragment kind is the range tombstone.
+            // All other fragment kinds are safe to end the buffer on, and
+            // are guaranteed to represent progress vs. the last buffer fill.
+            if (!buffer.back().is_range_tombstone()) {
+                return true;
+            }
+            if (reader.is_buffer_empty()) {
+                return reader.is_end_of_stream();
+            }
+            const auto& next_pos = reader.peek_buffer().position();
+            // To ensure safe progress we have to ensure the following:
+            //
+            // _next_position_in_partition < buffer.back().position() < next_pos
+            //
+            // * The first condition is to ensure we made progress since the
+            // last buffer fill. Otherwise we might get into an endless loop if
+            // the reader is recreated after each `fill_buffer()` call.
+            // * The second condition is to ensure we have seen all fragments
+            // with the same position. Otherwise we might jump over those
+            // remaining fragments with the same position as the last
+            // fragment's in the buffer when the reader is recreated.
+            return _tri_cmp(_next_position_in_partition, buffer.back().position()) < 0 && _tri_cmp(buffer.back().position(), next_pos) < 0;
+        };
+        // Read additional fragments until it is safe to stop, if needed.
+        // We have to ensure we stop at a fragment such that if the reader is
+        // evicted and recreated later, we won't be skipping any fragments.
+        // Practically, range tombstones are the only ones that are
+        // problematic to end the buffer on. This is because a range tombstone
+        // can share its position with multiple following range tombstones,
+        // or with a single following clustering row in the stream.
+        // When a range tombstone is the last in the buffer, we have to continue
+        // to read until we are sure we've read all fragments sharing the same
+        // position, so that we can safely continue reading from after said
+        // position.
+        return do_until(stop, [this, &reader, &buffer, timeout] {
+            if (reader.is_buffer_empty()) {
+                return do_fill_buffer(reader, timeout);
+            }
+            buffer.emplace_back(reader.pop_mutation_fragment());
+            return make_ready_future<>();
+        });
+    }).then([this, &reader, &buffer] {
+        update_next_position(reader, buffer);
    });
 }

@@ -1188,7 +1230,8 @@ shard_reader::remote_reader::remote_reader(
    , _ps(ps)
    , _pc(pc)
    , _trace_state(std::move(trace_state))
-    , _fwd_mr(fwd_mr) {
+    , _fwd_mr(fwd_mr)
+    , _tri_cmp(*_schema) {
 }

 future<shard_reader::fill_buffer_result> shard_reader::remote_reader::fill_buffer(const dht::partition_range& pr, bool pending_next_partition,
@@ -1196,7 +1239,7 @@ future<shard_reader::fill_buffer_result> shard_reader::remote_reader::fill_buffe
    // We could have missed a `fast_forward_to()` if the reader wasn't created yet.
    _pr = &pr;
    if (pending_next_partition) {
-        _last_position_in_partition = position_in_partition(position_in_partition::end_of_partition_tag_t{});
+        _next_position_in_partition = position_in_partition::for_partition_start();
    }
    return do_with(resume_or_create_reader(), circular_buffer<mutation_fragment>{},
            [this, pending_next_partition, timeout] (flat_mutation_reader& reader, circular_buffer<mutation_fragment>& buffer) mutable {
@@ -1204,22 +1247,8 @@ future<shard_reader::fill_buffer_result> shard_reader::remote_reader::fill_buffe
            reader.next_partition();
        }

-        return do_fill_buffer(reader, timeout).then([this, &reader, &buffer, timeout] {
-            buffer = reader.detach_buffer();
-            // When the reader is recreated (after having been evicted) we
-            // recreate it such that it starts reading from *after* the last
-            // seen fragment's position. If the last seen fragment is a range
-            // tombstone it is *not* guaranteed that the next fragments in the
-            // data stream have positions strictly greater than the range
-            // tombstone's. If the reader is evicted and has to be recreated,
-            // these fragments would be then skipped as the read would continue
-            // after their position.
-            // To avoid this ensure that the buffer contains *all* fragments for
-            // the last seen position.
-            return ensure_buffer_contains_all_fragments_for_last_pos(reader, buffer, timeout);
-        }).then([this, &reader, &buffer] {
+        return fill_buffer(reader, buffer, timeout).then([this, &reader, &buffer] {
            const auto eos = reader.is_end_of_stream() && reader.is_buffer_empty();
-            update_last_position(buffer);
            _irh = _lifecycle_policy.pause(std::move(reader));
            return fill_buffer_result(std::move(buffer), eos);
        });
@@ -1229,7 +1258,7 @@ future<shard_reader::fill_buffer_result> shard_reader::remote_reader::fill_buffe
 future<> shard_reader::remote_reader::fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) {
    _pr = &pr;
    _last_pkey.reset();
-    _last_position_in_partition.reset();
+    _next_position_in_partition = position_in_partition::for_partition_start();

    if (!_reader_created || !_irh) {
        return make_ready_future<>();
--- a/partition_version.cc
+++ b/partition_version.cc
@@ -338,7 +338,7 @@ partition_version& partition_entry::add_version(const schema& s, cache_tracker*

 void partition_entry::apply(const schema& s, const mutation_partition& mp, const schema& mp_schema)
 {
-    apply(s, mutation_partition(s, mp), mp_schema);
+    apply(s, mutation_partition(mp_schema, mp), mp_schema);
 }

 void partition_entry::apply(const schema& s, mutation_partition&& mp, const schema& mp_schema)
--- a/position_in_partition.hh
+++ b/position_in_partition.hh
@@ -129,6 +129,8 @@ public:
        : _type(partition_region::clustered), _ck(&ck) { }
    position_in_partition_view(range_tag_t, bound_view bv)
        : _type(partition_region::clustered), _bound_weight(position_weight(bv.kind())), _ck(&bv.prefix()) { }
+    position_in_partition_view(const clustering_key_prefix& ck, bound_weight w)
+        : _type(partition_region::clustered), _bound_weight(w), _ck(&ck) { }

    static position_in_partition_view for_range_start(const query::clustering_range& r) {
        return {position_in_partition_view::range_tag_t(), bound_view::from_range_start(r)};
@@ -159,6 +161,7 @@ public:
    }

    partition_region region() const { return _type; }
+    bound_weight get_bound_weight() const { return _bound_weight; }
    bool is_partition_start() const { return _type == partition_region::partition_start; }
    bool is_partition_end() const { return _type == partition_region::partition_end; }
    bool is_static_row() const { return _type == partition_region::static_row; }
@@ -271,6 +274,10 @@ public:
        return {clustering_row_tag_t(), std::move(ck)};
    }

+    static position_in_partition for_partition_start() {
+        return position_in_partition{partition_start_tag_t()};
+    }
+
    static position_in_partition for_static_row() {
        return position_in_partition{static_row_tag_t()};
    }
--- a/querier.cc
+++ b/querier.cc
@@ -286,11 +286,11 @@ static void insert_querier(

    auto& e = entries.emplace_back(key, std::move(q), expires);
    e.set_pos(--entries.end());
+    ++stats.population;

    if (auto irh = sem.register_inactive_read(std::make_unique<querier_inactive_read>(entries, e.pos(), stats))) {
        e.set_inactive_handle(std::move(irh));
        index.insert(e);
-        ++stats.population;
    }
 }

--- a/query-request.hh
+++ b/query-request.hh
@@ -31,6 +31,8 @@
 #include "tracing/tracing.hh"
 #include "utils/small_vector.hh"

+class position_in_partition_view;
+
 namespace query {

 using column_id_vector = utils::small_vector<column_id, 8>;
@@ -58,10 +60,20 @@ typedef std::vector<clustering_range> clustering_row_ranges;

 /// Trim the clustering ranges.
 ///
-/// Equivalent of intersecting each range with [key, +inf), or (-inf, key] if
+/// Equivalent of intersecting each clustering range with [pos, +inf) position
+/// in partition range, or (-inf, pos] position in partition range if
 /// reversed == true. Ranges that do not intersect are dropped. Ranges that
 /// partially overlap are trimmed.
-/// Result: each range will overlap fully with [key, +inf), or (-int, key] if
+/// Result: each range will overlap fully with [pos, +inf), or (-inf, pos] if
+/// reversed is true.
+void trim_clustering_row_ranges_to(const schema& s, clustering_row_ranges& ranges, position_in_partition_view pos, bool reversed = false);
+
+/// Trim the clustering ranges.
+///
+/// Equivalent of intersecting each clustering range with (key, +inf) clustering
+/// range, or (-inf, key) clustering range if reversed == true. Ranges that do
+/// not intersect are dropped. Ranges that partially overlap are trimmed.
+/// Result: each range will overlap fully with (key, +inf), or (-inf, key) if
 /// reversed is true.
 void trim_clustering_row_ranges_to(const schema& s, clustering_row_ranges& ranges, const clustering_key& key, bool reversed = false);

--- a/query.cc
+++ b/query.cc
@@ -71,34 +71,38 @@ std::ostream& operator<<(std::ostream& out, const specific_ranges& s) {
    return out << "{" << s._pk << " : " << join(", ", s._ranges) << "}";
 }

-void trim_clustering_row_ranges_to(const schema& s, clustering_row_ranges& ranges, const clustering_key& key, bool reversed) {
-    auto cmp = [reversed, bv_cmp = bound_view::compare(s)] (const auto& a, const auto& b) {
-        return reversed ? bv_cmp(b, a) : bv_cmp(a, b);
+void trim_clustering_row_ranges_to(const schema& s, clustering_row_ranges& ranges, position_in_partition_view pos, bool reversed) {
+    auto cmp = [reversed, cmp = position_in_partition::composite_tri_compare(s)] (const auto& a, const auto& b) {
+        return reversed ? cmp(b, a) : cmp(a, b);
    };
-    auto start_bound = [reversed] (const auto& range) -> const bound_view& {
-        return reversed ? range.second : range.first;
+    auto start_bound = [reversed] (const auto& range) -> position_in_partition_view {
+        return reversed ? position_in_partition_view::for_range_end(range) : position_in_partition_view::for_range_start(range);
    };
-    auto end_bound = [reversed] (const auto& range) -> const bound_view& {
-        return reversed ? range.first : range.second;
+    auto end_bound = [reversed] (const auto& range) -> position_in_partition_view {
+        return reversed ? position_in_partition_view::for_range_start(range) : position_in_partition_view::for_range_end(range);
    };
-    clustering_key_prefix::equality eq(s);

    auto it = ranges.begin();
    while (it != ranges.end()) {
-        auto range = bound_view::from_range(*it);
-        if (cmp(end_bound(range), key) || eq(end_bound(range).prefix(), key)) {
+        if (cmp(end_bound(*it), pos) <= 0) {
            it = ranges.erase(it);
            continue;
-        } else if (cmp(start_bound(range), key)) {
-            assert(cmp(key, end_bound(range)));
-            auto r = reversed ? clustering_range(it->start(), clustering_range::bound { key, false })
-                : clustering_range(clustering_range::bound { key, false }, it->end());
+        } else if (cmp(start_bound(*it), pos) <= 0) {
+            assert(cmp(pos, end_bound(*it)) < 0);
+            auto r = reversed ?
+                clustering_range(it->start(), clustering_range::bound(pos.key(), pos.get_bound_weight() != bound_weight::before_all_prefixed)) :
+                clustering_range(clustering_range::bound(pos.key(), pos.get_bound_weight() != bound_weight::after_all_prefixed), it->end());
            *it = std::move(r);
        }
        ++it;
    }
 }

+void trim_clustering_row_ranges_to(const schema& s, clustering_row_ranges& ranges, const clustering_key& key, bool reversed) {
+    const auto past_key = reversed ? bound_weight::before_all_prefixed : bound_weight::after_all_prefixed; // cut is exclusive of `key`
+    trim_clustering_row_ranges_to(s, ranges, position_in_partition_view(key, past_key), reversed);
+}
+
 partition_slice::partition_slice(clustering_row_ranges row_ranges,
    query::column_id_vector static_columns,
    query::column_id_vector regular_columns,
--- a/reloc/python3/build_deb.sh
+++ b/reloc/python3/build_deb.sh
@@ -0,0 +1,37 @@
+#!/bin/bash -e
+
+. /etc/os-release
+print_usage() {
+    echo "build_deb.sh --reloc-pkg build/release/scylla-python3-package.tar.gz"
+    echo "  --reloc-pkg specify relocatable package path"
+    exit 1
+}
+
+RELOC_PKG=build/release/scylla-python3-package.tar.gz
+OPTS=""
+while [ $# -gt 0 ]; do
+    case "$1" in
+        "--reloc-pkg")
+            OPTS="$OPTS $1 $(readlink -f $2)"
+            RELOC_PKG=$2
+            shift 2
+            ;;
+        *)
+            print_usage
+            ;;
+    esac
+done
+
+if [ ! -e $RELOC_PKG ]; then
+    echo "$RELOC_PKG does not exist."
+    echo "Run ./reloc/python3/build_reloc.sh first."
+    exit 1
+fi
+RELOC_PKG=$(readlink -f $RELOC_PKG)
+if [[ ! $OPTS =~ --reloc-pkg ]]; then
+    OPTS="$OPTS --reloc-pkg $RELOC_PKG"
+fi
+mkdir -p build/debian/scylla-python3-package
+tar -C build/debian/scylla-python3-package -xpf $RELOC_PKG
+cd build/debian/scylla-python3-package
+exec ./dist/debian/python3/build_deb.sh $OPTS
--- a/repair/repair.cc
+++ b/repair/repair.cc
@@ -940,8 +940,20 @@ static future<> repair_cf_range(repair_info& ri,
 // Comparable to RepairSession in Origin
 static future<> repair_range(repair_info& ri, const dht::token_range& range) {
    auto id = utils::UUID_gen::get_time_UUID();
-    return do_with(get_neighbors(ri.db.local(), ri.keyspace, range, ri.data_centers, ri.hosts), [&ri, range, id] (const auto& neighbors) {
-        rlogger.debug("[repair #{}] new session: will sync {} on range {} for {}.{}", id, neighbors, range, ri.keyspace, ri.cfs);
+    return do_with(get_neighbors(ri.db.local(), ri.keyspace, range, ri.data_centers, ri.hosts), [&ri, range, id] (std::vector<gms::inet_address>& neighbors) {
+      auto live_neighbors = boost::copy_range<std::vector<gms::inet_address>>(neighbors |
+                    boost::adaptors::filtered([] (const gms::inet_address& node) { return gms::get_local_gossiper().is_alive(node); }));
+      if (live_neighbors.size() != neighbors.size()) {
+            ri.nr_failed_ranges++;
+            auto status = live_neighbors.empty() ? "skipped" : "partial";
+            rlogger.warn("Repair {} out of {} ranges, id={}, shard={}, keyspace={}, table={}, range={}, peers={}, live_peers={}, status={}",
+            ri.ranges_index, ri.ranges.size(), ri.id, ri.shard, ri.keyspace, ri.cfs, range, neighbors, live_neighbors, status);
+            if (live_neighbors.empty()) {
+                return make_ready_future<>();
+            }
+            neighbors.swap(live_neighbors);
+      }
+      return ::service::get_local_migration_manager().sync_schema(ri.db.local(), neighbors).then([&neighbors, &ri, range, id] {
        return do_for_each(ri.cfs.begin(), ri.cfs.end(), [&ri, &neighbors, range] (auto&& cf) {
            ri._sub_ranges_nr++;
            if (ri.row_level_repair()) {
@@ -950,6 +962,7 @@ static future<> repair_range(repair_info& ri, const dht::token_range& range) {
                return repair_cf_range(ri, cf, range, neighbors);
            }
        });
+      });
    });
 }

--- a/repair/repair.hh
+++ b/repair/repair.hh
@@ -295,6 +295,7 @@ public:
    void push_mutation_fragment(frozen_mutation_fragment mf) { _mfs.push_back(std::move(mf)); }
 };

+using repair_row_on_wire = partition_key_and_mutation_fragments;
 using repair_rows_on_wire = std::list<partition_key_and_mutation_fragments>;

 enum class row_level_diff_detect_algorithm : uint8_t {
--- a/repair/row_level.cc
+++ b/repair/row_level.cc
@@ -152,8 +152,8 @@ class fragment_hasher {
    xx_hasher& _hasher;
 private:
    void consume_cell(const column_definition& col, const atomic_cell_or_collection& cell) {
-        feed_hash(_hasher, col.name());
-        feed_hash(_hasher, col.type->name());
+        feed_hash(_hasher, col.kind);
+        feed_hash(_hasher, col.id);
        feed_hash(_hasher, cell, col);
    }
 public:
@@ -220,43 +220,62 @@ private:
 };

 class repair_row {
-    frozen_mutation_fragment _fm;
+    std::optional<frozen_mutation_fragment> _fm;
    lw_shared_ptr<const decorated_key_with_hash> _dk_with_hash;
-    repair_sync_boundary _boundary;
-    repair_hash _hash;
+    std::optional<repair_sync_boundary> _boundary;
+    std::optional<repair_hash> _hash;
    lw_shared_ptr<mutation_fragment> _mf;
 public:
    repair_row() = delete;
-    repair_row(frozen_mutation_fragment fm,
-            position_in_partition pos,
+    repair_row(std::optional<frozen_mutation_fragment> fm,
+            std::optional<position_in_partition> pos,
            lw_shared_ptr<const decorated_key_with_hash> dk_with_hash,
-            repair_hash hash,
+            std::optional<repair_hash> hash,
            lw_shared_ptr<mutation_fragment> mf = {})
            : _fm(std::move(fm))
            , _dk_with_hash(std::move(dk_with_hash))
-            , _boundary({_dk_with_hash->dk, std::move(pos)})
+            , _boundary(pos ? std::optional<repair_sync_boundary>(repair_sync_boundary{_dk_with_hash->dk, std::move(*pos)}) : std::nullopt)
            , _hash(std::move(hash))
            , _mf(std::move(mf)) {
    }
    mutation_fragment& get_mutation_fragment() {
        if (!_mf) {
-            throw std::runtime_error("get empty mutation_fragment");
+            throw std::runtime_error("empty mutation_fragment");
        }
        return *_mf;
    }
-    frozen_mutation_fragment& get_frozen_mutation() { return _fm; }
-    const frozen_mutation_fragment& get_frozen_mutation() const { return _fm; }
+    frozen_mutation_fragment& get_frozen_mutation() {
+        if (!_fm) {
+            throw std::runtime_error("empty frozen_mutation_fragment");
+        }
+        return *_fm;
+    }
+    const frozen_mutation_fragment& get_frozen_mutation() const {
+        if (!_fm) {
+            throw std::runtime_error("empty frozen_mutation_fragment");
+        }
+        return *_fm;
+    }
    const lw_shared_ptr<const decorated_key_with_hash>& get_dk_with_hash() const {
        return _dk_with_hash;
    }
    size_t size() const {
-        return _fm.representation().size();
+        if (!_fm) {
+            throw std::runtime_error("empty size due to empty frozen_mutation_fragment");
+        }
+        return _fm->representation().size();
    }
    const repair_sync_boundary& boundary() const {
-        return _boundary;
+        if (!_boundary) {
+            throw std::runtime_error("empty repair_sync_boundary");
+        }
+        return *_boundary;
    }
    const repair_hash& hash() const {
-        return _hash;
+        if (!_hash) {
+            throw std::runtime_error("empty hash");
+        }
+        return *_hash;
    }
 };

@@ -284,13 +303,14 @@ public:
    repair_reader(
            seastar::sharded<database>& db,
            column_family& cf,
+            schema_ptr s,
            dht::token_range range,
            dht::i_partitioner& local_partitioner,
            dht::i_partitioner& remote_partitioner,
            unsigned remote_shard,
            uint64_t seed,
            is_local_reader local_reader)
-            : _schema(cf.schema())
+            : _schema(s)
            , _range(dht::to_partition_range(range))
            , _sharder(remote_partitioner, range, remote_shard)
            , _seed(seed)
@@ -458,8 +478,8 @@ public:
 private:
    seastar::sharded<database>& _db;
    column_family& _cf;
-    dht::token_range _range;
    schema_ptr _schema;
+    dht::token_range _range;
    repair_sync_boundary::tri_compare _cmp;
    // The algorithm used to find the row difference
    row_level_diff_detect_algorithm _algo;
@@ -519,6 +539,7 @@ public:
    repair_meta(
            seastar::sharded<database>& db,
            column_family& cf,
+            schema_ptr s,
            dht::token_range range,
            row_level_diff_detect_algorithm algo,
            size_t max_row_buf_size,
@@ -529,8 +550,8 @@ public:
            size_t nr_peer_nodes = 1)
            : _db(db)
            , _cf(cf)
+            , _schema(s)
            , _range(range)
-            , _schema(cf.schema())
            , _cmp(repair_sync_boundary::tri_compare(*_schema))
            , _algo(algo)
            , _max_row_buf_size(max_row_buf_size)
@@ -545,6 +566,7 @@ public:
            , _repair_reader(
                    _db,
                    _cf,
+                    _schema,
                    _range,
                    dht::global_partitioner(),
                    *_remote_partitioner,
@@ -577,35 +599,45 @@ public:
        }
    }

-    static void
+    static future<>
    insert_repair_meta(const gms::inet_address& from,
+            uint32_t src_cpu_id,
            uint32_t repair_meta_id,
-            sstring ks_name,
-            sstring cf_name,
            dht::token_range range,
            row_level_diff_detect_algorithm algo,
            uint64_t max_row_buf_size,
            uint64_t seed,
-            shard_config master_node_shard_config) {
-        node_repair_meta_id id{from, repair_meta_id};
-        auto& db = service::get_local_storage_proxy().get_db();
-        auto& cf = db.local().find_column_family(ks_name, cf_name);
-        auto rm = make_lw_shared<repair_meta>(db,
-                cf,
+            shard_config master_node_shard_config,
+            table_schema_version schema_version) {
+        return service::get_schema_for_write(schema_version, {from, src_cpu_id}).then([from,
+                repair_meta_id,
                range,
                algo,
                max_row_buf_size,
                seed,
-                repair_meta::repair_master::no,
-                repair_meta_id,
-                std::move(master_node_shard_config));
-        bool insertion = repair_meta_map().emplace(id, rm).second;
-        if (!insertion) {
-            rlogger.warn("insert_repair_meta: repair_meta_id {} for node {} already exists, replace existing one", id.repair_meta_id, id.ip);
-            repair_meta_map()[id] = rm;
-        } else {
-            rlogger.debug("insert_repair_meta: Inserted repair_meta_id {} for node {}", id.repair_meta_id, id.ip);
-        }
+                master_node_shard_config,
+                schema_version] (schema_ptr s) {
+            auto& db = service::get_local_storage_proxy().get_db();
+            auto& cf = db.local().find_column_family(s->id());
+            node_repair_meta_id id{from, repair_meta_id};
+            auto rm = make_lw_shared<repair_meta>(db,
+                    cf,
+                    s,
+                    range,
+                    algo,
+                    max_row_buf_size,
+                    seed,
+                    repair_meta::repair_master::no,
+                    repair_meta_id,
+                    std::move(master_node_shard_config));
+            bool insertion = repair_meta_map().emplace(id, rm).second;
+            if (!insertion) {
+                rlogger.warn("insert_repair_meta: repair_meta_id {} for node {} already exists, replace existing one", id.repair_meta_id, id.ip);
+                repair_meta_map()[id] = rm;
+            } else {
+                rlogger.debug("insert_repair_meta: Inserted repair_meta_id {} for node {}", id.repair_meta_id, id.ip);
+            }
+        });
    }

    static future<>
@@ -642,7 +674,11 @@ public:
            }
        }
        return parallel_for_each(*repair_metas, [repair_metas] (auto& rm) {
-            return rm->stop();
+            return rm->stop().then([&rm] {
+                rm = {};
+            });
+        }).then([repair_metas, from] {
+            rlogger.debug("Removed all repair_meta for single node {}", from);
        });
    }

@@ -654,7 +690,11 @@ public:
                | boost::adaptors::map_values));
        repair_meta_map().clear();
        return parallel_for_each(*repair_metas, [repair_metas] (auto& rm) {
-            return rm->stop();
+            return rm->stop().then([&rm] {
+                rm = {};
+            });
+        }).then([repair_metas] {
+            rlogger.debug("Removed all repair_meta for all nodes");
        });
    }

@@ -952,12 +992,12 @@ private:
        }
        return to_repair_rows_list(rows).then([this, from, node_idx, update_buf, update_hash_set] (std::list<repair_row> row_diff) {
            return do_with(std::move(row_diff), [this, from, node_idx, update_buf, update_hash_set] (std::list<repair_row>& row_diff) {
-                auto sz = get_repair_rows_size(row_diff);
-                stats().rx_row_bytes += sz;
-                stats().rx_row_nr += row_diff.size();
-                stats().rx_row_nr_peer[from] += row_diff.size();
-                _metrics.rx_row_nr += row_diff.size();
-                _metrics.rx_row_bytes += sz;
+                if (_repair_master) {
+                    auto sz = get_repair_rows_size(row_diff);
+                    stats().rx_row_bytes += sz;
+                    stats().rx_row_nr += row_diff.size();
+                    stats().rx_row_nr_peer[from] += row_diff.size();
+                }
                if (update_buf) {
                    std::list<repair_row> tmp;
                    tmp.swap(_working_row_buf);
@@ -993,11 +1033,16 @@ private:
        return do_with(repair_rows_on_wire(), std::move(row_list), [this] (repair_rows_on_wire& rows, std::list<repair_row>& row_list) {
            return do_for_each(row_list, [this, &rows] (repair_row& r) {
                auto pk = r.get_dk_with_hash()->dk.key();
-                auto it = std::find_if(rows.begin(), rows.end(), [&pk, s=_schema] (partition_key_and_mutation_fragments& row) { return pk.legacy_equal(*s, row.get_key()); });
-                if (it == rows.end()) {
-                    rows.push_back(partition_key_and_mutation_fragments(std::move(pk), {std::move(r.get_frozen_mutation())}));
+                // No need to search from the beginning of the rows. Looking at the end of repair_rows_on_wire is enough.
+                if (rows.empty()) {
+                    rows.push_back(repair_row_on_wire(std::move(pk), {std::move(r.get_frozen_mutation())}));
                } else {
-                    it->push_mutation_fragment(std::move(r.get_frozen_mutation()));
+                    auto& row = rows.back();
+                    if (pk.legacy_equal(*_schema, row.get_key())) {
+                        row.push_mutation_fragment(std::move(r.get_frozen_mutation()));
+                    } else {
+                        rows.push_back(repair_row_on_wire(std::move(pk), {std::move(r.get_frozen_mutation())}));
+                    }
                }
            }).then([&rows] {
                return std::move(rows);
@@ -1006,23 +1051,47 @@ private:
    };

    future<std::list<repair_row>> to_repair_rows_list(repair_rows_on_wire rows) {
-        return do_with(std::move(rows), std::list<repair_row>(), lw_shared_ptr<const decorated_key_with_hash>(),
-          [this] (repair_rows_on_wire& rows, std::list<repair_row>& row_list, lw_shared_ptr<const decorated_key_with_hash>& dk_ptr) mutable {
-            return do_for_each(rows, [this, &dk_ptr, &row_list] (partition_key_and_mutation_fragments& x) mutable {
+        return do_with(std::move(rows), std::list<repair_row>(), lw_shared_ptr<const decorated_key_with_hash>(), lw_shared_ptr<mutation_fragment>(), position_in_partition::tri_compare(*_schema),
+          [this] (repair_rows_on_wire& rows, std::list<repair_row>& row_list, lw_shared_ptr<const decorated_key_with_hash>& dk_ptr, lw_shared_ptr<mutation_fragment>& last_mf, position_in_partition::tri_compare& cmp) mutable {
+            return do_for_each(rows, [this, &dk_ptr, &row_list, &last_mf, &cmp] (partition_key_and_mutation_fragments& x) mutable {
                dht::decorated_key dk = dht::global_partitioner().decorate_key(*_schema, x.get_key());
                if (!(dk_ptr && dk_ptr->dk.equal(*_schema, dk))) {
                    dk_ptr = make_lw_shared<const decorated_key_with_hash>(*_schema, dk, _seed);
                }
-                return do_for_each(x.get_mutation_fragments(), [this, &dk_ptr, &row_list] (frozen_mutation_fragment& fmf) mutable {
-                    // Keep the mutation_fragment in repair_row as an
-                    // optimization to avoid unfreeze again when
-                    // mutation_fragment is needed by _repair_writer.do_write()
-                    // to apply the repair_row to disk
-                    auto mf = make_lw_shared<mutation_fragment>(fmf.unfreeze(*_schema));
-                    auto hash = do_hash_for_mf(*dk_ptr, *mf);
-                    position_in_partition pos(mf->position());
-                    row_list.push_back(repair_row(std::move(fmf), std::move(pos), dk_ptr, std::move(hash), std::move(mf)));
-                });
+                if (_repair_master) {
+                    return do_for_each(x.get_mutation_fragments(), [this, &dk_ptr, &row_list] (frozen_mutation_fragment& fmf) mutable {
+                        _metrics.rx_row_nr += 1;
+                        _metrics.rx_row_bytes += fmf.representation().size();
+                        // Keep the mutation_fragment in repair_row as an
+                        // optimization to avoid unfreeze again when
+                        // mutation_fragment is needed by _repair_writer.do_write()
+                        // to apply the repair_row to disk
+                        auto mf = make_lw_shared<mutation_fragment>(fmf.unfreeze(*_schema));
+                        auto hash = do_hash_for_mf(*dk_ptr, *mf);
+                        position_in_partition pos(mf->position());
+                        row_list.push_back(repair_row(std::move(fmf), std::move(pos), dk_ptr, std::move(hash), std::move(mf)));
+                    });
+                } else {
+                    last_mf = {};
+                    return do_for_each(x.get_mutation_fragments(), [this, &dk_ptr, &row_list, &last_mf, &cmp] (frozen_mutation_fragment& fmf) mutable {
+                        _metrics.rx_row_nr += 1;
+                        _metrics.rx_row_bytes += fmf.representation().size();
+                        auto mf = make_lw_shared<mutation_fragment>(fmf.unfreeze(*_schema));
+                        position_in_partition pos(mf->position());
+                        // If the mutation_fragment has the same position as
+                        // the last mutation_fragment, it means they are the
+                        // same row with different contents. We can not feed
+                        // such rows into the sstable writer. Instead we apply
+                        // the mutation_fragment into the previous one.
+                        if (last_mf && cmp(last_mf->position(), pos) == 0 && last_mf->mergeable_with(*mf)) {
+                            last_mf->apply(*_schema, std::move(*mf));
+                        } else {
+                            last_mf = mf;
+                            // On repair follower node, only decorated_key_with_hash and the mutation_fragment inside repair_row are used.
+                            row_list.push_back(repair_row({}, {}, dk_ptr, {}, std::move(mf)));
+                        }
+                    });
+                }
            }).then([&row_list] {
                return std::move(row_list);
            });
@@ -1084,29 +1153,28 @@ public:

    // RPC API
    future<>
-    repair_row_level_start(gms::inet_address remote_node, sstring ks_name, sstring cf_name, dht::token_range range) {
+    repair_row_level_start(gms::inet_address remote_node, sstring ks_name, sstring cf_name, dht::token_range range, table_schema_version schema_version) {
        if (remote_node == _myip) {
            return make_ready_future<>();
        }
        stats().rpc_call_nr++;
        return netw::get_local_messaging_service().send_repair_row_level_start(msg_addr(remote_node),
                _repair_meta_id, std::move(ks_name), std::move(cf_name), std::move(range), _algo, _max_row_buf_size, _seed,
-                _master_node_shard_config.shard, _master_node_shard_config.shard_count, _master_node_shard_config.ignore_msb, _master_node_shard_config.partitioner_name);
+                _master_node_shard_config.shard, _master_node_shard_config.shard_count, _master_node_shard_config.ignore_msb, _master_node_shard_config.partitioner_name, std::move(schema_version));
    }

    // RPC handler
    static future<>
-    repair_row_level_start_handler(gms::inet_address from, uint32_t repair_meta_id, sstring ks_name, sstring cf_name,
+    repair_row_level_start_handler(gms::inet_address from, uint32_t src_cpu_id, uint32_t repair_meta_id, sstring ks_name, sstring cf_name,
            dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size,
-            uint64_t seed, shard_config master_node_shard_config) {
+            uint64_t seed, shard_config master_node_shard_config, table_schema_version schema_version) {
        if (!_sys_dist_ks->local_is_initialized() || !_view_update_generator->local_is_initialized()) {
            return make_exception_future<>(std::runtime_error(format("Node {} is not fully initialized for repair, try again later",
                    utils::fb_utilities::get_broadcast_address())));
        }
-        rlogger.debug(">>> Started Row Level Repair (Follower): local={}, peers={}, repair_meta_id={}, keyspace={}, cf={}, range={}",
-            utils::fb_utilities::get_broadcast_address(), from, repair_meta_id, ks_name, cf_name, range);
-        insert_repair_meta(from, repair_meta_id, std::move(ks_name), std::move(cf_name), std::move(range), algo, max_row_buf_size, seed, std::move(master_node_shard_config));
-        return make_ready_future<>();
+        rlogger.debug(">>> Started Row Level Repair (Follower): local={}, peers={}, repair_meta_id={}, keyspace={}, cf={}, schema_version={}, range={}",
+            utils::fb_utilities::get_broadcast_address(), from, repair_meta_id, ks_name, cf_name, schema_version, range);
+        return insert_repair_meta(from, src_cpu_id, repair_meta_id, std::move(range), algo, max_row_buf_size, seed, std::move(master_node_shard_config), std::move(schema_version));
    }

    // RPC API
@@ -1313,14 +1381,15 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
        });
        ms.register_repair_row_level_start([] (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring ks_name,
                sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed,
-                unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name) {
+                unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version) {
            auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
            auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
-            return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id, ks_name, cf_name,
-                    range, algo, max_row_buf_size, seed, remote_shard, remote_shard_count, remote_ignore_msb, remote_partitioner_name] () mutable {
-                return repair_meta::repair_row_level_start_handler(from, repair_meta_id, std::move(ks_name),
+            return smp::submit_to(src_cpu_id % smp::count, [from, src_cpu_id, repair_meta_id, ks_name, cf_name,
+                    range, algo, max_row_buf_size, seed, remote_shard, remote_shard_count, remote_ignore_msb, remote_partitioner_name, schema_version] () mutable {
+                return repair_meta::repair_row_level_start_handler(from, src_cpu_id, repair_meta_id, std::move(ks_name),
                        std::move(cf_name), std::move(range), algo, max_row_buf_size, seed,
-                        shard_config{remote_shard, remote_shard_count, remote_ignore_msb, std::move(remote_partitioner_name)});
+                        shard_config{remote_shard, remote_shard_count, remote_ignore_msb, std::move(remote_partitioner_name)},
+                        schema_version);
            });
        });
        ms.register_repair_row_level_stop([] (const rpc::client_info& cinfo, uint32_t repair_meta_id,
@@ -1608,8 +1677,12 @@ public:
                    dht::global_partitioner().sharding_ignore_msb(),
                    dht::global_partitioner().name()
            };
+            auto s = _cf.schema();
+            auto schema_version = s->version();
+
            repair_meta master(_ri.db,
                    _cf,
+                    s,
                    _range,
                    algorithm,
                    _max_row_buf_size,
@@ -1622,12 +1695,13 @@ public:
            // All nodes including the node itself.
            _all_nodes.insert(_all_nodes.begin(), master.myip());

-            rlogger.debug(">>> Started Row Level Repair (Master): local={}, peers={}, repair_meta_id={}, keyspace={}, cf={}, range={}, seed={}",
-                    master.myip(), _all_live_peer_nodes, master.repair_meta_id(), _ri.keyspace, _cf_name, _range, _seed);
+            rlogger.debug(">>> Started Row Level Repair (Master): local={}, peers={}, repair_meta_id={}, keyspace={}, cf={}, schema_version={}, range={}, seed={}",
+                    master.myip(), _all_live_peer_nodes, master.repair_meta_id(), _ri.keyspace, _cf_name, schema_version, _range, _seed);
+

            try {
                parallel_for_each(_all_nodes, [&, this] (const gms::inet_address& node) {
-                    return master.repair_row_level_start(node, _ri.keyspace, _cf_name, _range).then([&] () {
+                    return master.repair_row_level_start(node, _ri.keyspace, _cf_name, _range, schema_version).then([&] () {
                        return master.repair_get_estimated_partitions(node).then([this, node] (uint64_t partitions) {
                            rlogger.trace("Get repair_get_estimated_partitions for node={}, estimated_partitions={}", node, partitions);
                            _estimated_partitions += partitions;
@@ -1677,19 +1751,7 @@ public:
 future<> repair_cf_range_row_level(repair_info& ri,
        sstring cf_name, dht::token_range range,
        const std::vector<gms::inet_address>& all_peer_nodes) {
-    auto all_live_peer_nodes = boost::copy_range<std::vector<gms::inet_address>>(all_peer_nodes |
-        boost::adaptors::filtered([] (const gms::inet_address& node) { return gms::get_local_gossiper().is_alive(node); }));
-    if (all_live_peer_nodes.size() != all_peer_nodes.size()) {
-        rlogger.warn("Repair for range={} is partial, peer nodes={}, live peer nodes={}",
-                range, all_peer_nodes, all_live_peer_nodes);
-        ri.nr_failed_ranges++;
-    }
-    if (all_live_peer_nodes.empty()) {
-        rlogger.info(">>> Skipped Row Level Repair (Master): local={}, peers={}, keyspace={}, cf={}, range={}",
-            utils::fb_utilities::get_broadcast_address(), all_peer_nodes, ri.keyspace, cf_name, range);
-        return make_ready_future<>();
-    }
-    return do_with(row_level_repair(ri, std::move(cf_name), std::move(range), std::move(all_live_peer_nodes)), [] (row_level_repair& repair) {
+    return do_with(row_level_repair(ri, std::move(cf_name), std::move(range), all_peer_nodes), [] (row_level_repair& repair) {
        return repair.run();
    });
 }
--- a/schema_mutations.cc
+++ b/schema_mutations.cc
@@ -69,19 +69,30 @@ table_schema_version schema_mutations::digest() const {
    }

    md5_hasher h;
-    db::schema_tables::feed_hash_for_schema_digest(h, _columnfamilies);
-    db::schema_tables::feed_hash_for_schema_digest(h, _columns);
+    db::schema_features sf = db::schema_features::full();
+
+    // Disable this feature so that the digest remains compatible with Scylla
+    // versions prior to this feature.
+    // This digest affects the table schema version calculation and it's important
+    // that all nodes arrive at the same table schema version to avoid needless schema version
+    // pulls. Table schema versions are calculated on boot when we don't yet
+    // know all the cluster features, so we could get different table versions after reboot
+    // in an already upgraded cluster.
+    sf.remove<db::schema_feature::DIGEST_INSENSITIVE_TO_EXPIRY>();
+
+    db::schema_tables::feed_hash_for_schema_digest(h, _columnfamilies, sf);
+    db::schema_tables::feed_hash_for_schema_digest(h, _columns, sf);
    if (_view_virtual_columns && !_view_virtual_columns->partition().empty()) {
-        db::schema_tables::feed_hash_for_schema_digest(h, *_view_virtual_columns);
+        db::schema_tables::feed_hash_for_schema_digest(h, *_view_virtual_columns, sf);
    }
    if (_indices && !_indices->partition().empty()) {
-        db::schema_tables::feed_hash_for_schema_digest(h, *_indices);
+        db::schema_tables::feed_hash_for_schema_digest(h, *_indices, sf);
    }
    if (_dropped_columns && !_dropped_columns->partition().empty()) {
-        db::schema_tables::feed_hash_for_schema_digest(h, *_dropped_columns);
+        db::schema_tables::feed_hash_for_schema_digest(h, *_dropped_columns, sf);
    }
    if (_scylla_tables) {
-        db::schema_tables::feed_hash_for_schema_digest(h, *_scylla_tables);
+        db::schema_tables::feed_hash_for_schema_digest(h, *_scylla_tables, sf);
    }
    return utils::UUID_gen::get_name_UUID(h.finalize());
 }
--- a/schema_registry.cc
+++ b/schema_registry.cc
@@ -263,11 +263,9 @@ global_schema_ptr::global_schema_ptr(const global_schema_ptr& o)
    : global_schema_ptr(o.get())
 { }

-global_schema_ptr::global_schema_ptr(global_schema_ptr&& o) {
+global_schema_ptr::global_schema_ptr(global_schema_ptr&& o) noexcept {
    auto current = engine().cpu_id();
-    if (o._cpu_of_origin != current) {
-        throw std::runtime_error("Attempted to move global_schema_ptr across shards");
-    }
+    assert(o._cpu_of_origin == current);
    _ptr = std::move(o._ptr);
    _cpu_of_origin = current;
 }
--- a/schema_registry.hh
+++ b/schema_registry.hh
@@ -173,7 +173,7 @@ public:
    // The other may come from a different shard.
    global_schema_ptr(const global_schema_ptr& other);
    // The other must come from current shard.
-    global_schema_ptr(global_schema_ptr&& other);
+    global_schema_ptr(global_schema_ptr&& other) noexcept;
    // May be invoked across shards. Always returns an engaged pointer.
    schema_ptr get() const;
    operator schema_ptr() const { return get(); }
--- a/scripts/create-relocatable-package-python3.py
+++ b/scripts/create-relocatable-package-python3.py
@@ -231,9 +231,15 @@ ar = tarfile.open(args.output, mode='w|gz')
 pathlib.Path('build/SCYLLA-RELOCATABLE-FILE').touch()
 ar.add('build/SCYLLA-RELOCATABLE-FILE', arcname='SCYLLA-RELOCATABLE-FILE')
 ar.add('dist/redhat/python3')
+ar.add('dist/debian/python3')
 ar.add('build/python3/SCYLLA-RELEASE-FILE', arcname='SCYLLA-RELEASE-FILE')
 ar.add('build/python3/SCYLLA-VERSION-FILE', arcname='SCYLLA-VERSION-FILE')
 ar.add('build/SCYLLA-PRODUCT-FILE', arcname='SCYLLA-PRODUCT-FILE')
+for p in ['python3-libs'] + packages:  # bundle each package's license files under licenses/<package>/
+    pdir = pathlib.Path('/usr/share/licenses/{}/'.format(p))
+    if pdir.exists():
+        for f in pdir.glob('*'):
+            ar.add(f, arcname='licenses/{}/{}'.format(p, f.name))

 for f in file_list:
    copy_file_to_python_env(ar, f)
--- a/scripts/create-relocatable-package.py
+++ b/scripts/create-relocatable-package.py
@@ -61,6 +61,7 @@ args = ap.parse_args()

 executables = ['build/{}/scylla'.format(args.mode),
               'build/{}/iotune'.format(args.mode),
+               '/usr/bin/patchelf',
               '/usr/bin/lscpu',
               '/usr/bin/gawk',
               '/usr/bin/gzip',
@@ -76,6 +77,9 @@ libs = {}
 for exe in executables:
    libs.update(ldd(exe))

+# manually add libthread_db, which is needed for debugging threads
+libs.update({'libthread_db-1.0.so': '/lib64/libthread_db-1.0.so'})
+
 ld_so = libs['ld.so']

 have_gnutls = any([lib.startswith('libgnutls.so')
@@ -93,56 +97,9 @@ ar = tarfile.open(fileobj=gzip_process.stdin, mode='w|')
 pathlib.Path('build/SCYLLA-RELOCATABLE-FILE').touch()
 ar.add('build/SCYLLA-RELOCATABLE-FILE', arcname='SCYLLA-RELOCATABLE-FILE')

-# This thunk is a shell script that arranges for the executable to be invoked,
-# under the following conditions:
-#
-#  - the same argument vector is passed to the executable, including argv[0]
-#  - the executable name (/proc/pid/comm, shown in top(1)) is the same
-#  - the dynamic linker is taken from this package rather than the executable's
-#    default (which is hardcoded to point to /lib64/ld-linux-x86_64.so or similar)
-#  - LD_LIBRARY_PATH points to the lib/ directory so shared library dependencies
-#    are satisified from there rather than the system default (e.g. /lib64)
-
-# To do that, the dynamic linker is invoked using a symbolic link named after the
-# executable, not its standard name. We use "bash -a" to set argv[0].
-
-# The full tangled web looks like:
-#
-# foobar/bin/scylla               a shell script invoking everything
-# foobar/libexec/scylla.bin       the real binary
-# foobar/libexec/scylla           a symlink to ../lib/ld.so
-# foobar/libreloc/ld.so                the dynamic linker
-# foobar/libreloc/lib...               all the other libraries
-
-# the transformations (done by the thunk and symlinks) are:
-#
-#    bin/scylla args -> libexec/scylla libexec/scylla.bin args -> lib/ld.so libexec/scylla.bin args
-
-thunk = b'''\
-#!/bin/bash
-
-x="$(readlink -f "$0")"
-b="$(basename "$x")"
-d="$(dirname "$x")/.."
-ldso="$d/libexec/$b"
-realexe="$d/libexec/$b.bin"
-export GNUTLS_SYSTEM_PRIORITY_FILE="${GNUTLS_SYSTEM_PRIORITY_FILE-$d/libreloc/gnutls.config}"
-LD_LIBRARY_PATH="$d/libreloc" exec -a "$0" "$ldso" "$realexe" "$@"
-'''
-
 for exe in executables:
    basename = os.path.basename(exe)
-    ar.add(exe, arcname='libexec/' + basename + '.bin')
-    ti = tarfile.TarInfo(name='bin/' + basename)
-    ti.size = len(thunk)
-    ti.mode = 0o755
-    ti.mtime = os.stat(exe).st_mtime
-    ar.addfile(ti, fileobj=io.BytesIO(thunk))
-    ti = tarfile.TarInfo(name='libexec/' + basename)
-    ti.type = tarfile.SYMTYPE
-    ti.linkname = '../libreloc/ld.so'
-    ti.mtime = os.stat(exe).st_mtime
-    ar.addfile(ti)
+    ar.add(exe, arcname='libexec/' + basename)
 for lib, libfile in libs.items():
    ar.add(libfile, arcname='libreloc/' + lib)
 if have_gnutls:
--- a/scripts/relocate_python_scripts.py
+++ b/scripts/relocate_python_scripts.py
@@ -34,7 +34,15 @@ class FilesystemFixup:
 x="$(readlink -f "$0")"
 b="$(basename "$x")"
 d="$(dirname "$x")"
-PYTHONPATH="${{d}}:${{d}}/libexec:$PYTHONPATH" PATH="${{d}}/{pythonpath}:${{PATH}}" exec -a "$0" "${{d}}/libexec/${{b}}" "$@"
+CENTOS_SSL_CERT_FILE="/etc/pki/tls/cert.pem"
+if [ -f "${{CENTOS_SSL_CERT_FILE}}" ]; then
+  c=${{CENTOS_SSL_CERT_FILE}}
+fi
+DEBIAN_SSL_CERT_FILE="/etc/ssl/certs/ca-certificates.crt"
+if [ -f "${{DEBIAN_SSL_CERT_FILE}}" ]; then
+  c=${{DEBIAN_SSL_CERT_FILE}}
+fi
+PYTHONPATH="${{d}}:${{d}}/libexec:$PYTHONPATH" PATH="${{d}}/{pythonpath}:${{PATH}}" SSL_CERT_FILE="${{c}}" exec -a "$0" "${{d}}/libexec/${{b}}" "$@"
 '''
        self.python_path = python_path
        self.installroot = installroot
--- a/2
+++ b/2
--- a/service/cache_hitrate_calculator.hh
+++ b/service/cache_hitrate_calculator.hh
@@ -30,11 +30,24 @@ using namespace seastar;
 namespace service {

 class cache_hitrate_calculator : public seastar::async_sharded_service<cache_hitrate_calculator> {
+    struct stat {
+        float h = 0;
+        float m = 0;
+        stat& operator+=(stat& o) {
+            h += o.h;
+            m += o.m;
+            return *this;
+        }
+    };
+
    seastar::sharded<database>& _db;
    seastar::sharded<cache_hitrate_calculator>& _me;
    timer<lowres_clock> _timer;
    bool _stopped = false;
    float _diff = 0;
+    std::unordered_map<utils::UUID, stat> _rates;
+    size_t _slen = 0;
+    std::string _gstate;
    future<> _done = make_ready_future();

    future<lowres_clock::duration> recalculate_hitrates();
--- a/service/migration_manager.cc
+++ b/service/migration_manager.cc
@@ -82,12 +82,16 @@ future<> migration_manager::stop()
 void migration_manager::init_messaging_service()
 {
    auto& ss = service::get_local_storage_service();
-    _feature_listeners.push_back(ss.cluster_supports_view_virtual_columns().when_enabled([this, &ss] {
+
+    auto update_schema = [this, &ss] {
        with_gate(_background_tasks, [this, &ss] {
-            mlogger.debug("view_virtual_columns feature enabled, recalculating schema version");
-            return update_schema_version(get_storage_proxy(), ss.cluster_schema_features());
+            mlogger.debug("features changed, recalculating schema version");
+            return update_schema_version_and_announce(get_storage_proxy(), ss.cluster_schema_features());
        });
-    }));
+    };
+
+    _feature_listeners.push_back(ss.cluster_supports_view_virtual_columns().when_enabled(update_schema));
+    _feature_listeners.push_back(ss.cluster_supports_digest_insensitive_to_expiry().when_enabled(update_schema));

    auto& ms = netw::get_local_messaging_service();
    ms.register_definitions_update([this] (const rpc::client_info& cinfo, std::vector<frozen_mutation> m) {
@@ -992,4 +996,22 @@ future<schema_ptr> get_schema_for_write(table_schema_version v, netw::messaging_
    });
 }

+future<> migration_manager::sync_schema(const database& db, const std::vector<gms::inet_address>& nodes) {
+    using schema_and_hosts = std::unordered_map<utils::UUID, std::vector<gms::inet_address>>;
+    return do_with(schema_and_hosts(), db.get_version(), [this, &nodes] (schema_and_hosts& schema_map, utils::UUID& my_version) {
+        return parallel_for_each(nodes, [this, &schema_map, &my_version] (const gms::inet_address& node) {
+            return netw::get_messaging_service().local().send_schema_check(netw::msg_addr(node)).then([node, &schema_map, &my_version] (utils::UUID remote_version) {
+                if (my_version != remote_version) {
+                    schema_map[remote_version].emplace_back(node);
+                }
+            });
+        }).then([this, &schema_map] {
+            return parallel_for_each(schema_map, [this] (auto& x) {
+                mlogger.debug("Pulling schema {} from {}", x.first, x.second.front());
+                return submit_migration_task(x.second.front());
+            });
+        });
+    });
+}
+
 }
--- a/service/migration_manager.hh
+++ b/service/migration_manager.hh
@@ -75,6 +75,9 @@ public:

    future<> submit_migration_task(const gms::inet_address& endpoint);

+    // Makes sure that this node knows about all schema changes known by "nodes" that were made prior to this call.
+    future<> sync_schema(const database& db, const std::vector<gms::inet_address>& nodes);
+
    // Fetches schema from remote node and applies it locally.
    // Differs from submit_migration_task() in that all errors are propagated.
    // Coalesces requests.
--- a/service/misc_services.cc
+++ b/service/misc_services.cc
@@ -113,16 +113,6 @@ void cache_hitrate_calculator::run_on(size_t master, lowres_clock::duration d) {
 }

 future<lowres_clock::duration> cache_hitrate_calculator::recalculate_hitrates() {
-    struct stat {
-        float h = 0;
-        float m = 0;
-        stat& operator+=(stat& o) {
-            h += o.h;
-            m += o.m;
-            return *this;
-        }
-    };
-
    auto non_system_filter = [&] (const std::pair<utils::UUID, lw_shared_ptr<column_family>>& cf) {
        return _db.local().find_keyspace(cf.second->schema()->ks_name()).get_replication_strategy().get_type() != locator::replication_strategy_type::local;
    };
@@ -144,15 +134,18 @@ future<lowres_clock::duration> cache_hitrate_calculator::recalculate_hitrates()

    return _db.map_reduce0(cf_to_cache_hit_stats, std::unordered_map<utils::UUID, stat>(), sum_stats_per_cf).then([this, non_system_filter] (std::unordered_map<utils::UUID, stat> rates) mutable {
        _diff = 0;
+        _gstate.reserve(_slen); // assume length did not change from previous iteration
+        _slen = 0;
+        _rates = std::move(rates);
        // set calculated rates on all shards
-        return _db.invoke_on_all([this, rates = std::move(rates), cpuid = engine().cpu_id(), non_system_filter] (database& db) {
-            sstring gstate;
-            for (auto& cf : db.get_column_families() | boost::adaptors::filtered(non_system_filter)) {
-                auto it = rates.find(cf.first);
-                if (it == rates.end()) { // a table may be added before map/reduce compltes and this code runs
-                    continue;
+        return _db.invoke_on_all([this, cpuid = engine().cpu_id(), non_system_filter] (database& db) {
+            return do_for_each(_rates, [this, cpuid, &db] (auto&& r) mutable {
+                auto it = db.get_column_families().find(r.first);
+                if (it == db.get_column_families().end()) { // a table may be added before map/reduce completes and this code runs
+                    return;
                }
-                stat s = it->second;
+                auto& cf = *it;
+                stat& s = r.second;
                float rate = 0;
                if (s.h) {
                    rate = s.h / (s.h + s.m);
@@ -160,24 +153,25 @@ future<lowres_clock::duration> cache_hitrate_calculator::recalculate_hitrates()
                if (engine().cpu_id() == cpuid) {
                    // calculate max difference between old rate and new one for all cfs
                    _diff = std::max(_diff, std::abs(float(cf.second->get_global_cache_hit_rate()) - rate));
-                    gstate += format("{}.{}:{:f};", cf.second->schema()->ks_name(), cf.second->schema()->cf_name(), rate);
+                    _gstate += format("{}.{}:{:0.6f};", cf.second->schema()->ks_name(), cf.second->schema()->cf_name(), rate);
                }
                cf.second->set_global_cache_hit_rate(cache_temperature(rate));
-            }
-            if (gstate.size()) {
-                auto& g = gms::get_local_gossiper();
-                auto& ss = get_local_storage_service();
-                return g.add_local_application_state(gms::application_state::CACHE_HITRATES, ss.value_factory.cache_hitrates(std::move(gstate)));
-            }
-            return make_ready_future<>();
+            });
        });
    }).then([this] {
+        auto& g = gms::get_local_gossiper();
+        auto& ss = get_local_storage_service();
+        _slen = _gstate.size();
+        g.add_local_application_state(gms::application_state::CACHE_HITRATES, ss.value_factory.cache_hitrates(_gstate));
        // if max difference during this round is big schedule next recalculate earlier
        if (_diff < 0.01) {
            return std::chrono::milliseconds(2000);
        } else {
            return std::chrono::milliseconds(500);
        }
+    }).finally([this] {
+        _gstate = std::string(); // free memory, do not trust clear() to do that for string
+        _rates.clear();
    });
 }

--- a/service/storage_proxy.cc
+++ b/service/storage_proxy.cc
@@ -481,17 +481,58 @@ public:
                        std::move(targets), pending_endpoints, std::move(dead_endpoints), std::move(tr_state), stats) {
        register_in_intrusive_list(*p);
    }
+    ~view_update_write_response_handler();
 private:
    void register_in_intrusive_list(storage_proxy& p);
 };

 class storage_proxy::view_update_handlers_list : public bi::list<view_update_write_response_handler, bi::base_hook<view_update_write_response_handler>, bi::constant_time_size<false>> {
+    // _live_iterators holds all iterators that point into the bi::list in the base class of this object.
+    // If we remove a view_update_write_response_handler from the list, and an iterator happens to point
+    // into it, we advance the iterator so it doesn't point at a removed object. See #4912.
+    std::vector<iterator*> _live_iterators;
+public:
+    view_update_handlers_list() {
+        _live_iterators.reserve(10); // We only expect 1.
+    }
+    void register_live_iterator(iterator* itp) noexcept { // We don't tolerate failure, so abort instead
+        _live_iterators.push_back(itp);
+    }
+    void unregister_live_iterator(iterator* itp) {
+        _live_iterators.erase(boost::remove(_live_iterators, itp), _live_iterators.end());
+    }
+    void update_live_iterators(view_update_write_response_handler* vuwrh) {
+        // vuwrh is being removed from the bi::list, so if any live iterator points at it,
+        // move it to the next object (this requires that the list is traversed in the forward
+        // direction).
+        for (auto& itp : _live_iterators) {
+            if (&**itp == vuwrh) {
+                ++*itp;
+            }
+        }
+    }
+    class iterator_guard {
+        view_update_handlers_list& _vuhl;
+        iterator* _itp;
+    public:
+        iterator_guard(view_update_handlers_list& vuhl, iterator& it) : _vuhl(vuhl), _itp(&it) {
+            _vuhl.register_live_iterator(_itp);
+        }
+        ~iterator_guard() {
+            _vuhl.unregister_live_iterator(_itp);
+        }
+    };
 };

 void view_update_write_response_handler::register_in_intrusive_list(storage_proxy& p) {
    p.get_view_update_handlers_list().push_back(*this);
 }

+
+view_update_write_response_handler::~view_update_write_response_handler() {
+    _proxy->_view_update_handlers_list->update_live_iterators(this);
+}
+
 class datacenter_sync_write_response_handler : public abstract_write_response_handler {
    struct dc_info {
        size_t acks;
@@ -604,17 +645,21 @@ storage_proxy::response_id_type storage_proxy::register_response_handler(shared_

 void storage_proxy::remove_response_handler(storage_proxy::response_id_type id) {
    auto entry = _response_handlers.find(id);
+    assert(entry != _response_handlers.end());
+    remove_response_handler_entry(std::move(entry));
+}
+
+void storage_proxy::remove_response_handler_entry(response_handlers_map::iterator entry) {
    entry->second->on_released();
    _response_handlers.erase(std::move(entry));
 }

-
 void storage_proxy::got_response(storage_proxy::response_id_type id, gms::inet_address from, std::optional<db::view::update_backlog> backlog) {
    auto it = _response_handlers.find(id);
    if (it != _response_handlers.end()) {
        tracing::trace(it->second->get_trace_state(), "Got a response from /{}", from);
        if (it->second->response(from)) {
-            remove_response_handler(id); // last one, remove entry. Will cancel expiration timer too.
+            remove_response_handler_entry(std::move(it)); // last one, remove entry. Will cancel expiration timer too.
        } else {
            it->second->check_for_early_completion();
        }
@@ -627,7 +672,7 @@ void storage_proxy::got_failure_response(storage_proxy::response_id_type id, gms
    if (it != _response_handlers.end()) {
        tracing::trace(it->second->get_trace_state(), "Got {} failures from /{}", count, from);
        if (it->second->failure_response(from, count)) {
-            remove_response_handler(id);
+            remove_response_handler_entry(std::move(it));
        } else {
            it->second->check_for_early_completion();
        }
@@ -1097,6 +1142,22 @@ future<> storage_proxy::mutate_begin(std::vector<unique_response_handler> ids, d
                                     std::optional<clock_type::time_point> timeout_opt) {
    return parallel_for_each(ids, [this, cl, timeout_opt] (unique_response_handler& protected_response) {
        auto response_id = protected_response.id;
+        // This function, mutate_begin(), is called after a preemption point
+        // so it's possible that other code besides our caller just ran. In
+        // particular, Scylla may have noticed that a remote node went down,
+        // called storage_proxy::on_down(), and removed some of the ongoing
+        // handlers, including this id. If this happens, we need to ignore
+        // this id - not try to look it up or start a send.
+        if (_response_handlers.find(response_id) == _response_handlers.end()) {
+            protected_response.release(); // Don't try to remove this id again
+            // Requests that time out normally below after response_wait()
+            // result in an exception (see ~abstract_write_response_handler()).
+            // However, here we no longer have the handler or its information
+            // to put in the exception. The exception is not needed for
+            // correctness (e.g., hints are written by timeout_cb(), not
+            // because of an exception here).
+            return make_exception_future<>(std::runtime_error("unstarted write cancelled"));
+        }
        // it is better to send first and hint afterwards to reduce latency
        // but request may complete before hint_to_dead_endpoints() is called and
        // response_id handler will be removed, so we will have to do hint with separate
@@ -2873,6 +2934,12 @@ storage_proxy::query_partition_key_range_concurrent(storage_proxy::clock_type::t
    dht::partition_range_vector ranges = ranges_to_vnodes(concurrency_factor);
    dht::partition_range_vector::iterator i = ranges.begin();

+    // query_ranges_to_vnodes_generator can return fewer results than requested. If the number of results
+    // is small enough or there are a lot of results, concurrency_factor, which is increased by shifting left, can
+    // eventually zero out, resulting in an infinite recursion. This line makes sure that the concurrency factor
+    // never gets stuck on 0 and is never increased too much if the number of results remains small.
+    concurrency_factor = std::max(size_t(1), ranges.size());
+
    while (i != ranges.end()) {
        dht::partition_range& range = *i;
        std::vector<gms::inet_address> live_endpoints = get_live_sorted_endpoints(ks, end_token(range));
@@ -3614,20 +3681,27 @@ void storage_proxy::on_up(const gms::inet_address& endpoint) {};

 void storage_proxy::on_down(const gms::inet_address& endpoint) {
    assert(thread::running_in_thread());
-    for (auto it = _view_update_handlers_list->begin(); it != _view_update_handlers_list->end(); ++it) {
+    auto it = _view_update_handlers_list->begin();
+    while (it != _view_update_handlers_list->end()) {
        auto guard = it->shared_from_this();
-        if (it->get_targets().count(endpoint) > 0) {
+        if (it->get_targets().count(endpoint) > 0 && _response_handlers.find(it->id()) != _response_handlers.end()) {
            it->timeout_cb();
        }
-        seastar::thread::yield();
+        ++it;
+        if (seastar::thread::should_yield()) {
+            view_update_handlers_list::iterator_guard ig{*_view_update_handlers_list, it};
+            seastar::thread::yield();
+        }
    }
 };

 future<> storage_proxy::drain_on_shutdown() {
    return do_with(::shared_ptr<abstract_write_response_handler>(), [this] (::shared_ptr<abstract_write_response_handler>& intrusive_list_guard) {
-        return do_for_each(*_view_update_handlers_list, [&intrusive_list_guard] (abstract_write_response_handler& handler) {
-            intrusive_list_guard = handler.shared_from_this();
-            handler.timeout_cb();
+        return do_for_each(*_view_update_handlers_list, [this, &intrusive_list_guard] (abstract_write_response_handler& handler) {
+            if (_response_handlers.find(handler.id()) != _response_handlers.end()) {
+                intrusive_list_guard = handler.shared_from_this();
+                handler.timeout_cb();
+            }
        });
    }).then([this] {
        return _hints_resource_manager.stop();
--- a/service/storage_proxy.hh
+++ b/service/storage_proxy.hh
@@ -79,6 +79,7 @@ namespace service {
 class abstract_write_response_handler;
 class abstract_read_executor;
 class mutation_holder;
+class view_update_write_response_handler;

 using replicas_per_token_range = std::unordered_map<dht::token_range, std::vector<utils::UUID>>;

@@ -131,6 +132,7 @@ private:
        ~unique_response_handler();
        response_id_type release();
    };
+    using response_handlers_map = std::unordered_map<response_id_type, ::shared_ptr<abstract_write_response_handler>>;

 public:
    static const sstring COORDINATOR_STATS_CATEGORY;
@@ -178,7 +180,7 @@ public:
 private:
    distributed<database>& _db;
    response_id_type _next_response_id;
-    std::unordered_map<response_id_type, ::shared_ptr<abstract_write_response_handler>> _response_handlers;
+    response_handlers_map _response_handlers;
    // This buffer hold ids of throttled writes in case resource consumption goes
    // below the threshold and we want to unthrottle some of them. Without this throttled
    // request with dead or slow replica may wait for up to timeout ms before replying
@@ -220,6 +222,7 @@ private:
            coordinator_query_options optional_params);
    response_id_type register_response_handler(shared_ptr<abstract_write_response_handler>&& h);
    void remove_response_handler(response_id_type id);
+    void remove_response_handler_entry(response_handlers_map::iterator entry);
    void got_response(response_id_type id, gms::inet_address from, std::optional<db::view::update_backlog> backlog);
    void got_failure_response(response_id_type id, gms::inet_address from, size_t count, std::optional<db::view::update_backlog> backlog);
    future<> response_wait(response_id_type id, clock_type::time_point timeout);
@@ -457,6 +460,7 @@ public:
    friend class abstract_write_response_handler;
    friend class speculating_read_executor;
    friend class view_update_backlog_broker;
+    friend class view_update_write_response_handler;
 };

 extern distributed<storage_proxy> _the_storage_proxy;
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -110,6 +110,7 @@ static const sstring TRUNCATION_TABLE = "TRUNCATION_TABLE";
 static const sstring CORRECT_STATIC_COMPACT_IN_MC = "CORRECT_STATIC_COMPACT_IN_MC";
 static const sstring UNBOUNDED_RANGE_TOMBSTONES_FEATURE = "UNBOUNDED_RANGE_TOMBSTONES";
 static const sstring VIEW_VIRTUAL_COLUMNS = "VIEW_VIRTUAL_COLUMNS";
+static const sstring DIGEST_INSENSITIVE_TO_EXPIRY = "DIGEST_INSENSITIVE_TO_EXPIRY";

 static const sstring SSTABLE_FORMAT_PARAM_NAME = "sstable_format";

@@ -162,6 +163,7 @@ storage_service::storage_service(distributed<database>& db, gms::gossiper& gossi
        , _correct_static_compact_in_mc(_feature_service, CORRECT_STATIC_COMPACT_IN_MC)
        , _unbounded_range_tombstones_feature(_feature_service, UNBOUNDED_RANGE_TOMBSTONES_FEATURE)
        , _view_virtual_columns(_feature_service, VIEW_VIRTUAL_COLUMNS)
+        , _digest_insensitive_to_expiry(_feature_service, DIGEST_INSENSITIVE_TO_EXPIRY)
        , _la_feature_listener(*this, _feature_listeners_sem, sstables::sstable_version_types::la)
        , _mc_feature_listener(*this, _feature_listeners_sem, sstables::sstable_version_types::mc)
        , _replicate_action([this] { return do_replicate_to_all_cores(); })
@@ -208,6 +210,7 @@ void storage_service::enable_all_features() {
        std::ref(_correct_static_compact_in_mc),
        std::ref(_unbounded_range_tombstones_feature),
        std::ref(_view_virtual_columns),
+        std::ref(_digest_insensitive_to_expiry),
    })
    {
        if (features.count(f.name())) {
@@ -311,6 +314,7 @@ std::set<sstring> storage_service::get_config_supported_features_set() {
        TRUNCATION_TABLE,
        CORRECT_STATIC_COMPACT_IN_MC,
        VIEW_VIRTUAL_COLUMNS,
+        DIGEST_INSENSITIVE_TO_EXPIRY,
    };

    // Do not respect config in the case database is not started
@@ -479,6 +483,14 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
        }
    }

+    // If this is a restarting node, we should update tokens before gossip starts
+    auto my_tokens = db::system_keyspace::get_saved_tokens().get0();
+    bool restarting_normal_node = db::system_keyspace::bootstrap_complete() && !db().local().is_replacing() && !my_tokens.empty();
+    if (restarting_normal_node) {
+        slogger.info("Restarting a node in NORMAL status");
+        _token_metadata.update_normal_tokens(my_tokens, get_broadcast_address());
+    }
+
    // have to start the gossip service before we can see any info on other nodes.  this is necessary
    // for bootstrap to get the load info it needs.
    // (we won't be part of the storage ring though until we add a counterId to our state, below.)
@@ -489,6 +501,12 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
    }).get();
    auto features = get_config_supported_features();
    _token_metadata.update_host_id(local_host_id, get_broadcast_address());
+
+    // Replicate the tokens early because once gossip runs other nodes
+    // might send reads/writes to this node. Replicate it early to make
+    // sure the tokens are valid on all the shards.
+    replicate_to_all_cores().get();
+
    auto broadcast_rpc_address = utils::fb_utilities::get_broadcast_rpc_address();
    auto& proxy = service::get_storage_proxy();
    // Ensure we know our own actual Schema UUID in preparation for updates
@@ -503,6 +521,10 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
    app_states.emplace(gms::application_state::RPC_READY, value_factory.cql_ready(false));
    app_states.emplace(gms::application_state::VIEW_BACKLOG, versioned_value(""));
    app_states.emplace(gms::application_state::SCHEMA, value_factory.schema(schema_version));
+    if (restarting_normal_node) {
+        app_states.emplace(gms::application_state::TOKENS, value_factory.tokens(my_tokens));
+        app_states.emplace(gms::application_state::STATUS, value_factory.normal(my_tokens));
+    }
    slogger.info("Starting up server gossip");

    _gossiper.register_(this->shared_from_this());
@@ -811,6 +833,7 @@ void storage_service::bootstrap(std::unordered_set<token> tokens) {
    } else {
        // Dont set any state for the node which is bootstrapping the existing token...
        _token_metadata.update_normal_tokens(tokens, get_broadcast_address());
+        replicate_to_all_cores().get();
        auto replace_addr = db().local().get_replace_address();
        if (replace_addr) {
            slogger.debug("Removing replaced endpoint {} from system.peers", *replace_addr);
@@ -1581,6 +1604,7 @@ future<> storage_service::init_server(int delay, bind_messaging_port do_bind) {
            auto tokens = db::system_keyspace::get_saved_tokens().get0();
            if (!tokens.empty()) {
                _token_metadata.update_normal_tokens(tokens, get_broadcast_address());
+                replicate_to_all_cores().get();
                // order is important here, the gossiper can fire in between adding these two states.  It's ok to send TOKENS without STATUS, but *not* vice versa.
                _gossiper.add_local_application_state({
                    { gms::application_state::TOKENS, value_factory.tokens(tokens) },
@@ -3479,6 +3503,7 @@ void storage_service::notify_cql_change(inet_address endpoint, bool ready)
 db::schema_features storage_service::cluster_schema_features() const {
    db::schema_features f;
    f.set_if<db::schema_feature::VIEW_VIRTUAL_COLUMNS>(bool(_view_virtual_columns));
+    f.set_if<db::schema_feature::DIGEST_INSENSITIVE_TO_EXPIRY>(bool(_digest_insensitive_to_expiry));
    return f;
 }

--- a/service/storage_service.hh
+++ b/service/storage_service.hh
@@ -323,6 +323,7 @@ private:
    gms::feature _correct_static_compact_in_mc;
    gms::feature _unbounded_range_tombstones_feature;
    gms::feature _view_virtual_columns;
+    gms::feature _digest_insensitive_to_expiry;

    sstables::sstable_version_types _sstables_format = sstables::sstable_version_types::ka;
    seastar::semaphore _feature_listeners_sem = {1};
@@ -2338,6 +2339,9 @@ public:
    const gms::feature& cluster_supports_view_virtual_columns() const {
        return _view_virtual_columns;
    }
+    const gms::feature& cluster_supports_digest_insensitive_to_expiry() const {
+        return _digest_insensitive_to_expiry;
+    }
    // Returns schema features which all nodes in the cluster advertise as supported.
    db::schema_features cluster_schema_features() const;
 private:
--- a/sstables/compaction.cc
+++ b/sstables/compaction.cc
@@ -104,16 +104,6 @@ static bool belongs_to_current_node(const dht::token& t, const dht::token_range_
    return false;
 }

-static void delete_sstables_for_interrupted_compaction(std::vector<shared_sstable>& new_sstables, sstring& ks, sstring& cf) {
-    // Delete either partially or fully written sstables of a compaction that
-    // was either stopped abruptly (e.g. out of disk space) or deliberately
-    // (e.g. nodetool stop COMPACTION).
-    for (auto& sst : new_sstables) {
-        clogger.debug("Deleting sstable {} of interrupted compaction for {}.{}", sst->get_filename(), ks, cf);
-        sst->mark_for_deletion();
-    }
-}
-
 static std::vector<shared_sstable> get_uncompacting_sstables(column_family& cf, std::vector<shared_sstable> sstables) {
    auto all_sstables = boost::copy_range<std::vector<shared_sstable>>(*cf.get_sstables_including_compacted_undeleted());
    boost::sort(all_sstables, [] (const shared_sstable& x, const shared_sstable& y) {
@@ -317,6 +307,9 @@ protected:
    column_family& _cf;
    schema_ptr _schema;
    std::vector<shared_sstable> _sstables;
+    // Unused sstables are tracked because, if compaction is interrupted, they are the only ones we can delete.
+    // Deleting used sstables could potentially result in data loss.
+    std::vector<shared_sstable> _new_unused_sstables;
    lw_shared_ptr<sstable_set> _compacting;
    uint64_t _max_sstable_size;
    uint32_t _sstable_level;
@@ -347,6 +340,7 @@ protected:

    void setup_new_sstable(shared_sstable& sst) {
        _info->new_sstables.push_back(sst);
+        _new_unused_sstables.push_back(sst);
        sst->get_metadata_collector().set_replay_position(_rp);
        sst->get_metadata_collector().sstable_level(_sstable_level);
        for (auto ancestor : _ancestors) {
@@ -488,6 +482,16 @@ private:
    const schema_ptr& schema() const {
        return _schema;
    }
+
+    void delete_sstables_for_interrupted_compaction() {
+        // Delete either partially or fully written sstables of a compaction that
+        // was either stopped abruptly (e.g. out of disk space) or deliberately
+        // (e.g. nodetool stop COMPACTION).
+        for (auto& sst : _new_unused_sstables) {
+            clogger.debug("Deleting sstable {} of interrupted compaction for {}.{}", sst->get_filename(), _info->ks_name, _info->cf_name);
+            sst->mark_for_deletion();
+        }
+    }
 public:
    static future<compaction_info> run(std::unique_ptr<compaction> c);

@@ -521,7 +525,6 @@ void compacting_sstable_writer::consume_end_of_stream() {
 class regular_compaction : public compaction {
    std::function<shared_sstable()> _creator;
    replacer_fn _replacer;
-    std::vector<shared_sstable> _unreplaced_new_tables;
    std::unordered_set<shared_sstable> _compacting_for_max_purgeable_func;
    // store a clone of sstable set for column family, which needs to be alive for incremental selector.
    sstable_set _set;
@@ -625,8 +628,6 @@ private:
    }

    void maybe_replace_exhausted_sstables() {
-        _unreplaced_new_tables.push_back(_sst);
-
        // Replace exhausted sstable(s), if any, by new one(s) in the column family.
        auto not_exhausted = [s = _schema, &dk = _sst->get_last_decorated_key()] (shared_sstable& sst) {
            return sst->get_last_decorated_key().tri_compare(*s, dk) > 0;
@@ -668,7 +669,7 @@ private:
                _compacting->erase(sst);
                _monitor_generator.remove_sstable(_info->tracking, sst);
            });
-            _replacer(std::vector<shared_sstable>(exhausted, _sstables.end()), std::move(_unreplaced_new_tables));
+            _replacer(std::vector<shared_sstable>(exhausted, _sstables.end()), std::move(_new_unused_sstables));
            _sstables.erase(exhausted, _sstables.end());
        }
    }
@@ -677,7 +678,7 @@ private:
        if (!_sstables.empty()) {
            std::vector<shared_sstable> sstables_compacted;
            std::move(_sstables.begin(), _sstables.end(), std::back_inserter(sstables_compacted));
-            _replacer(std::move(sstables_compacted), std::move(_unreplaced_new_tables));
+            _replacer(std::move(sstables_compacted), std::move(_new_unused_sstables));
        }
    }

@@ -877,7 +878,7 @@ future<compaction_info> compaction::run(std::unique_ptr<compaction> c) {
            auto r = std::move(reader);
            r.consume_in_thread(std::move(cfc), c->filter_func(), db::no_timeout);
        } catch (...) {
-            delete_sstables_for_interrupted_compaction(c->_info->new_sstables, c->_info->ks_name, c->_info->cf_name);
+            c->delete_sstables_for_interrupted_compaction();
            c = nullptr; // make sure writers are stopped while running in thread context
            throw;
        }
--- a/sstables/compaction_manager.cc
+++ b/sstables/compaction_manager.cc
@@ -639,7 +639,7 @@ future<> compaction_manager::perform_sstable_upgrade(column_family* cf, bool exc
        return cf->run_with_compaction_disabled([this, cf, &tables, exclude_current_version] {
            auto last_version = get_highest_supported_format();

-            for (auto& sst : *(cf->get_sstables())) {
+            for (auto& sst : cf->candidates_for_compaction()) {
                // if we are a "normal" upgrade, we only care about
                // tables with older versions, but potentially
                // we are to actually rewrite everything. (-a)
--- a/sstables/compaction_strategy.cc
+++ b/sstables/compaction_strategy.cc
@@ -479,11 +479,6 @@ public:
            auto itw = writes_per_window.find(bound);
            if (itw != writes_per_window.end()) {
                ow_this_window = &itw->second;
-                // We will erase here so we can keep track of which
-                // writes belong to existing windows. Writes that don't belong to any window
-                // are writes in progress to new windows and will be accounted in the final
-                // loop before we return
-                writes_per_window.erase(itw);
            }
            auto* oc_this_window = &no_oc;
            auto itc = compactions_per_window.find(bound);
@@ -491,6 +486,13 @@ public:
                oc_this_window = &itc->second;
            }
            b += windows.second.backlog(*ow_this_window, *oc_this_window);
+            if (itw != writes_per_window.end()) {
+                // We will erase here so we can keep track of which
+                // writes belong to existing windows. Writes that don't belong to any window
+                // are writes in progress to new windows and will be accounted in the final
+                // loop before we return
+                writes_per_window.erase(itw);
+            }
        }

        // Partial writes that don't belong to any window are accounted here.
--- a/sstables/index_reader.hh
+++ b/sstables/index_reader.hh
@@ -380,9 +380,17 @@ private:
            }

            return do_with(std::make_unique<reader>(_sstable, _pc, position, end, quantity), [this, summary_idx] (auto& entries_reader) {
-                return entries_reader->_context.consume_input().then([this, summary_idx, &entries_reader] {
+                return entries_reader->_context.consume_input().then_wrapped([this, summary_idx, &entries_reader] (future<> f) {
+                    std::exception_ptr ex;
+                    if (f.failed()) {
+                        ex = f.get_exception();
+                        sstlog.error("failed reading index for {}: {}", _sstable->get_filename(), ex);
+                    }
                    auto indexes = std::move(entries_reader->_consumer.indexes);
-                    return entries_reader->_context.close().then([indexes = std::move(indexes)] () mutable {
+                    return entries_reader->_context.close().then([indexes = std::move(indexes), ex = std::move(ex)] () mutable {
+                        if (ex) {
+                            std::rethrow_exception(std::move(ex));
+                        }
                        return std::move(indexes);
                    });

--- a/sstables/mc/writer.cc
+++ b/sstables/mc/writer.cc
@@ -29,6 +29,7 @@
 #include "sstables/mc/types.hh"
 #include "db/config.hh"
 #include "atomic_cell.hh"
+#include "utils/exceptions.hh"

 #include <functional>
 #include <boost/iterator/iterator_facade.hpp>
@@ -533,7 +534,7 @@ private:
    shard_id _shard; // Specifies which shard the new SStable will belong to.
    bool _compression_enabled = false;
    std::unique_ptr<file_writer> _data_writer;
-    std::optional<file_writer> _index_writer;
+    std::unique_ptr<file_writer> _index_writer;
    bool _tombstone_written = false;
    bool _static_row_written = false;
    // The length of partition header (partition key, partition deletion and static row, if present)
@@ -592,6 +593,10 @@ private:
    bool _write_regular_as_static; // See #4139

    void init_file_writers();
+
+    // Returns the closed writer
+    std::unique_ptr<file_writer> close_writer(std::unique_ptr<file_writer>& w);
+
    void close_data_writer();
    void ensure_tombstone_is_written() {
        if (!_tombstone_written) {
@@ -676,7 +681,7 @@ private:

    // Writes single atomic cell
    void write_cell(bytes_ostream& writer, const clustering_key_prefix* clustering_key, atomic_cell_view cell, const column_definition& cdef,
-        const row_time_properties& properties, bytes_view cell_path = {});
+        const row_time_properties& properties, std::optional<bytes_view> cell_path = {});

    // Writes information about row liveness (formerly 'row marker')
    void write_liveness_info(bytes_ostream& writer, const row_marker& marker);
@@ -836,13 +841,17 @@ void writer::init_file_writers() {
                &_sst._components->compression,
                _schema.get_compressor_params()));
    }
-    _index_writer.emplace(std::move(_sst._index_file), options);
+    _index_writer = std::make_unique<file_writer>(std::move(_sst._index_file), options);
+}
+
+std::unique_ptr<file_writer> writer::close_writer(std::unique_ptr<file_writer>& w) {
+    auto writer = std::move(w);
+    writer->close();
+    return writer;
 }

 void writer::close_data_writer() {
-    auto writer = std::move(_data_writer);
-    writer->close();
-
+    auto writer = close_writer(_data_writer);
    if (!_compression_enabled) {
        auto chksum_wr = static_cast<crc32_checksummed_file_writer*>(writer.get());
        _sst.write_digest(chksum_wr->full_checksum());
@@ -970,7 +979,7 @@ void writer::consume(tombstone t) {
 }

 void writer::write_cell(bytes_ostream& writer, const clustering_key_prefix* clustering_key, atomic_cell_view cell,
-         const column_definition& cdef, const row_time_properties& properties, bytes_view cell_path) {
+         const column_definition& cdef, const row_time_properties& properties, std::optional<bytes_view> cell_path) {

    uint64_t current_pos = writer.size();
    bool is_deleted = !cell.is_live();
@@ -983,7 +992,7 @@ void writer::write_cell(bytes_ostream& writer, const clustering_key_prefix* clus
                       properties.local_deletion_time == cell.deletion_time();

    cell_flags flags = cell_flags::none;
-    if (!has_value) {
+    if ((!has_value && !cdef.is_counter()) || is_deleted) {
        flags |= cell_flags::has_empty_value_mask;
    }
    if (is_deleted) {
@@ -1012,20 +1021,22 @@ void writer::write_cell(bytes_ostream& writer, const clustering_key_prefix* clus
        }
    }

-    if (!cell_path.empty()) {
-        write_vint(writer, cell_path.size());
-        write(_sst.get_version(), writer, cell_path);
+    if (bool(cell_path)) {
+        write_vint(writer, cell_path->size());
+        write(_sst.get_version(), writer, *cell_path);
    }

-    if (has_value) {
-        if (cdef.is_counter()) {
+    if (cdef.is_counter()) {
+        if (!is_deleted) {
            assert(!cell.is_counter_update());
            counter_cell_view::with_linearized(cell, [&] (counter_cell_view ccv) {
                write_counter_value(ccv, writer, sstable_version_types::mc, [] (bytes_ostream& out, uint32_t value) {
                    return write_vint(out, value);
                });
            });
-        } else {
+        }
+    } else {
+        if (has_value) {
            write_cell_value(writer, *cdef.type, cell.value());
        }
    }
@@ -1370,10 +1381,15 @@ stop_iteration writer::consume_end_of_partition() {
        _first_key = *_partition_key;
    }
    _last_key = std::move(*_partition_key);
+    _partition_key = std::nullopt;
    return get_data_offset() < _cfg.max_sstable_size ? stop_iteration::no : stop_iteration::yes;
 }

 void writer::consume_end_of_stream() {
+    if (_partition_key) {
+        on_internal_error(sstlog, "Mutation stream ends with unclosed partition during write");
+    }
+
    _cfg.monitor->on_data_write_completed();

    seal_summary(_sst._components->summary, std::move(_first_key), std::move(_last_key), _index_sampling_state);
@@ -1382,8 +1398,7 @@ void writer::consume_end_of_stream() {
        _sst.get_metadata_collector().add_compression_ratio(_sst._components->compression.compressed_file_length(), _sst._components->compression.uncompressed_file_length());
    }

-    _index_writer->close();
-    _index_writer.reset();
+    close_writer(_index_writer);
    _sst.set_first_and_last_keys();

    _sst._components->statistics.contents[metadata_type::Serialization] = std::make_unique<serialization_header>(std::move(_sst_schema.header));
--- a/sstables/mp_row_consumer.cc
+++ b/sstables/mp_row_consumer.cc
@@ -44,6 +44,14 @@ namespace sstables {
 atomic_cell make_counter_cell(api::timestamp_type timestamp, bytes_view value) {
    static constexpr size_t shard_size = 32;

+    if (value.empty()) {
+        // This will never happen in a correct MC sstable but
+        // we had a bug #4363 that caused empty counters
+        // to be incorrectly stored inside sstables.
+        counter_cell_builder ccb;
+        return ccb.build(timestamp);
+    }
+
    data_input in(value);

    auto header_size = in.read<int16_t>();
@@ -59,8 +67,6 @@ atomic_cell make_counter_cell(api::timestamp_type timestamp, bytes_view value) {
        throw marshal_exception("encountered remote shards in a counter cell");
    }

-    std::vector<counter_shard> shards;
-    shards.reserve(shard_count);
    counter_cell_builder ccb(shard_count);
    for (auto i = 0u; i < shard_count; i++) {
        auto id_hi = in.read<int64_t>();
--- a/sstables/mp_row_consumer.hh
+++ b/sstables/mp_row_consumer.hh
@@ -703,9 +703,12 @@ public:
    // Sets streamed_mutation::_end_of_range when there are no more fragments for the query range.
    // Returns information whether the parser should continue to parse more
    // input and produce more fragments or we have collected enough and should yield.
+    // Returns proceed::yes only when all pending fragments have been pushed.
    proceed push_ready_fragments() {
        if (_ready) {
-            return push_ready_fragments_with_ready_set();
+            if (push_ready_fragments_with_ready_set() == proceed::no) {
+                return proceed::no;
+            }
        }

        if (_out_of_range) {
--- a/sstables/sstables.cc
+++ b/sstables/sstables.cc
@@ -1060,9 +1060,26 @@ void sstable::write_simple(const T& component, const io_priority_class& pc) {
    options.buffer_size = sstable_buffer_size;
    options.io_priority_class = pc;
    auto w = file_writer(std::move(f), std::move(options));
-    write(_version, w, component);
-    w.flush();
-    w.close();
+    std::exception_ptr eptr;
+    try {
+        write(_version, w, component);
+        w.flush();
+    } catch (...) {
+        eptr = std::current_exception();
+    }
+    try {
+        w.close();
+    } catch (...) {
+        std::exception_ptr close_eptr = std::current_exception();
+        sstlog.warn("failed to close file_writer: {}", close_eptr);
+        // If write succeeded but close failed, we rethrow close's exception.
+        if (!eptr) {
+            eptr = close_eptr;
+        }
+    }
+    if (eptr) {
+        std::rethrow_exception(eptr);
+    }
 }

 template future<> sstable::read_simple<component_type::Filter>(sstables::filter& f, const io_priority_class& pc);
@@ -2133,11 +2150,15 @@ stop_iteration components_writer::consume_end_of_partition() {
        _first_key = *_partition_key;
    }
    _last_key = std::move(*_partition_key);
+    _partition_key = std::nullopt;

    return get_offset() < _max_sstable_size ? stop_iteration::no : stop_iteration::yes;
 }

 void components_writer::consume_end_of_stream() {
+    if (_partition_key) {
+        on_internal_error(sstlog, "Mutation stream ends with unclosed partition during write");
+    }
    // what if there is only one partition? what if it is empty?
    seal_summary(_sst._components->summary, std::move(_first_key), std::move(_last_key), _index_sampling_state);

@@ -3053,6 +3074,56 @@ std::optional<std::pair<uint64_t, uint64_t>> sstable::get_sample_indexes_for_ran
    return std::nullopt;
 }

+/**
+ * Returns a pair of positions [p1, p2) in the summary file corresponding to
+ * pages which may include keys covered by the specified range, or a disengaged
+ * optional if the sstable does not include any keys from the range.
+ */
+std::optional<std::pair<uint64_t, uint64_t>> sstable::get_index_pages_for_range(const dht::token_range& range) {
+    const auto& entries = _components->summary.entries;
+    auto entries_size = entries.size();
+    index_comparator cmp(*_schema);
+    dht::ring_position_comparator rp_cmp(*_schema);
+    uint64_t left = 0;
+    if (range.start()) {
+        dht::ring_position_view pos = range.start()->is_inclusive()
+            ? dht::ring_position_view::starting_at(range.start()->value())
+            : dht::ring_position_view::ending_at(range.start()->value());
+
+        // There is no summary entry for the last key, so in order to determine
+        // if pos overlaps with the sstable or not we have to compare with the
+        // last key.
+        if (rp_cmp(pos, get_last_decorated_key()) > 0) {
+            // left is past the end of the sampling.
+            return std::nullopt;
+        }
+
+        left = std::distance(std::begin(entries),
+            std::lower_bound(entries.begin(), entries.end(), pos, cmp));
+
+        if (left) {
+            --left;
+        }
+    }
+    uint64_t right = entries_size;
+    if (range.end()) {
+        dht::ring_position_view pos = range.end()->is_inclusive()
+                                      ? dht::ring_position_view::ending_at(range.end()->value())
+                                      : dht::ring_position_view::starting_at(range.end()->value());
+
+        right = std::distance(std::begin(entries),
+            std::lower_bound(entries.begin(), entries.end(), pos, cmp));
+        if (right == 0) {
+            // The first key is strictly greater than right.
+            return std::nullopt;
+        }
+    }
+    if (left < right) {
+        return std::optional<std::pair<uint64_t, uint64_t>>(std::in_place_t(), left, right);
+    }
+    return std::nullopt;
+}
+
 std::vector<dht::decorated_key> sstable::get_key_samples(const schema& s, const dht::token_range& range) {
    auto index_range = get_sample_indexes_for_range(range);
    std::vector<dht::decorated_key> res;
@@ -3066,10 +3137,15 @@ std::vector<dht::decorated_key> sstable::get_key_samples(const schema& s, const
 }

 uint64_t sstable::estimated_keys_for_range(const dht::token_range& range) {
-    auto sample_index_range = get_sample_indexes_for_range(range);
-    uint64_t sample_key_count = sample_index_range ? sample_index_range->second - sample_index_range->first : 0;
-    // adjust for the current sampling level
-    uint64_t estimated_keys = sample_key_count * ((downsampling::BASE_SAMPLING_LEVEL * _components->summary.header.min_index_interval) / _components->summary.header.sampling_level);
+    // Estimates the number of keys of this sstable that fall within `range`
+    // by scaling the sstable-wide key estimate by the fraction of summary
+    // entries (index pages) the range covers.
+    auto page_range = get_index_pages_for_range(range);
+    if (!page_range) {
+        // No index page overlaps the range.
+        return 0;
+    }
+    // Use a 128-bit intermediate so that range_pages * total_keys cannot
+    // overflow before the division.
+    using uint128_t = unsigned __int128;
+    uint64_t range_pages = page_range->second - page_range->first;
+    auto total_keys = get_estimated_key_count();
+    auto total_pages = _components->summary.entries.size();
+    uint64_t estimated_keys = (uint128_t)range_pages * total_keys / total_pages;
+    // At least one page overlaps here, so never report fewer than one key.
    return std::max(uint64_t(1), estimated_keys);
 }

@@ -3146,7 +3222,7 @@ sstable::unlink()
    });

    name = get_filename();
-    auto update_large_data_fut = get_large_data_handler().maybe_delete_large_data_entries(*get_schema(), std::move(name), data_size())
+    auto update_large_data_fut = get_large_data_handler().maybe_delete_large_data_entries(*get_schema(), name, data_size())
            .then_wrapped([name = std::move(name)] (future<> f) {
        if (f.failed()) {
            // Just log and ignore failures to delete large data entries.
--- a/sstables/sstables.hh
+++ b/sstables/sstables.hh
@@ -655,6 +655,7 @@ private:
            composite::eoc marker = composite::eoc::none);

    std::optional<std::pair<uint64_t, uint64_t>> get_sample_indexes_for_range(const dht::token_range& range);
+    std::optional<std::pair<uint64_t, uint64_t>> get_index_pages_for_range(const dht::token_range& range);

    std::vector<unsigned> compute_shards_for_this_sstable() const;
    template <typename Components>
--- a/sstables/types.hh
+++ b/sstables/types.hh
@@ -458,7 +458,8 @@ enum sstable_feature : uint8_t {
    NonCompoundRangeTombstones = 1, // See #2986
    ShadowableTombstones = 2, // See #3885
    CorrectStaticCompact = 3, // See #4139
-    End = 4,
+    CorrectEmptyCounters = 4, // See #4363
+    End = 5,
 };

 // Scylla-specific features enabled for a particular sstable.
--- a/streaming/stream_mutation_fragments_cmd.hh
+++ b/streaming/stream_mutation_fragments_cmd.hh
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace streaming {
+
+// Command tag that accompanies each frame of the stream_mutation_fragments
+// RPC stream and tells the receiver how to interpret that frame.
+//
+// NOTE(review): the value is exchanged between nodes as a uint8_t, so the
+// enumerators presumably must not be reordered or renumbered - confirm
+// against the RPC serializer before changing them.
+enum class stream_mutation_fragments_cmd : uint8_t {
+    // The sender failed while streaming; the receiver fails the stream.
+    error,
+    // The frame carries a mutation fragment to be applied.
+    mutation_fragment_data,
+    // The sender has finished; the accompanying fragment is a placeholder.
+    end_of_stream,
+};
+
+}
--- a/streaming/stream_session.cc
+++ b/streaming/stream_session.cc
@@ -65,6 +65,7 @@
 #include <boost/algorithm/cxx11/any_of.hpp>
 #include <boost/range/adaptor/map.hpp>
 #include "../db/view/view_update_generator.hh"
+#include "streaming/stream_mutation_fragments_cmd.hh"

 namespace streaming {

@@ -160,7 +161,7 @@ void stream_session::init_messaging_service_handler() {
            });
        });
    });
-    ms().register_stream_mutation_fragments([] (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<stream_reason> reason_opt, rpc::source<frozen_mutation_fragment> source) {
+    ms().register_stream_mutation_fragments([] (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<stream_reason> reason_opt, rpc::source<frozen_mutation_fragment, rpc::optional<stream_mutation_fragments_cmd>> source) {
        auto from = netw::messaging_service::get_source(cinfo);
        auto reason = reason_opt ? *reason_opt: stream_reason::unspecified;
        sslog.trace("Got stream_mutation_fragments from {} reason {}", from, int(reason));
@@ -172,15 +173,41 @@ void stream_session::init_messaging_service_handler() {
        return with_scheduling_group(service::get_local_storage_service().db().local().get_streaming_scheduling_group(), [from, estimated_partitions, plan_id, schema_id, &cf, source, reason] () mutable {
                return service::get_schema_for_write(schema_id, from).then([from, estimated_partitions, plan_id, schema_id, &cf, source, reason] (schema_ptr s) mutable {
                    auto sink = ms().make_sink_for_stream_mutation_fragments(source);
-                    auto get_next_mutation_fragment = [source, plan_id, from, s] () mutable {
-                        return source().then([plan_id, from, s] (std::optional<std::tuple<frozen_mutation_fragment>> fmf_opt) mutable {
-                            if (fmf_opt) {
-                                frozen_mutation_fragment& fmf = std::get<0>(fmf_opt.value());
+                    struct stream_mutation_fragments_cmd_status {
+                        bool got_cmd = false;
+                        bool got_end_of_stream = false;
+                    };
+                    auto cmd_status = make_lw_shared<stream_mutation_fragments_cmd_status>();
+                    auto get_next_mutation_fragment = [source, plan_id, from, s, cmd_status] () mutable {
+                        return source().then([plan_id, from, s, cmd_status] (std::optional<std::tuple<frozen_mutation_fragment, rpc::optional<stream_mutation_fragments_cmd>>> opt) mutable {
+                            // A disengaged `opt` means the peer closed the stream. Otherwise the
+                            // frame holds a fragment plus an optional command tag; the tag is
+                            // absent when the sender does not speak the command protocol.
+                            if (opt) {
+                                auto cmd = std::get<1>(*opt);
+                                if (cmd) {
+                                    // Remember that the sender uses commands so that a missing
+                                    // end_of_stream can be detected when the stream closes.
+                                    cmd_status->got_cmd = true;
+                                    switch (*cmd) {
+                                    case stream_mutation_fragments_cmd::mutation_fragment_data:
+                                        break;
+                                    case stream_mutation_fragments_cmd::error:
+                                        return make_exception_future<mutation_fragment_opt>(std::runtime_error("Sender failed"));
+                                    case stream_mutation_fragments_cmd::end_of_stream:
+                                        // The fragment in this frame is ignored; report end of data.
+                                        cmd_status->got_end_of_stream = true;
+                                        return make_ready_future<mutation_fragment_opt>();
+                                    default:
+                                        return make_exception_future<mutation_fragment_opt>(std::runtime_error("Sender sent wrong cmd"));
+                                    }
+                                }
+                                frozen_mutation_fragment& fmf = std::get<0>(*opt);
                                auto sz = fmf.representation().size();
                                auto mf = fmf.unfreeze(*s);
                                streaming::get_local_stream_manager().update_progress(plan_id, from.addr, progress_info::direction::IN, sz);
                                return make_ready_future<mutation_fragment_opt>(std::move(mf));
                            } else {
+                                // If the sender has sent a stream_mutation_fragments_cmd, it is
+                                // a node that understands the new protocol. It must send
+                                // end_of_stream before closing the stream.
+                                if (cmd_status->got_cmd && !cmd_status->got_end_of_stream) {
+                                    return make_exception_future<mutation_fragment_opt>(std::runtime_error("Sender did not send end_of_stream"));
+                                }
                                return make_ready_future<mutation_fragment_opt>();
                            }
                        });
@@ -211,7 +238,8 @@ void stream_session::init_messaging_service_handler() {
                        int32_t status = 0;
                        uint64_t received_partitions = 0;
                        if (f.failed()) {
-                            f.ignore_ready_future();
+                            sslog.error("[Stream #{}] Failed to handle STREAM_MUTATION_FRAGMENTS (receive and distribute phase) for ks={}, cf={}, peer={}: {}",
+                                    plan_id, s->ks_name(), s->cf_name(), from.addr, f.get_exception());
                            status = -1;
                        } else {
                            received_partitions = f.get0();
@@ -224,7 +252,8 @@ void stream_session::init_messaging_service_handler() {
                            return sink.close();
                        });
                    }).handle_exception([s, plan_id, from, sink] (std::exception_ptr ep) {
-                        sslog.error("[Stream #{}] Failed to handle STREAM_MUTATION_FRAGMENTS for ks={}, cf={}, peer={}: {}", plan_id, s->ks_name(), s->cf_name(), from.addr, ep);
+                        sslog.error("[Stream #{}] Failed to handle STREAM_MUTATION_FRAGMENTS (respond phase) for ks={}, cf={}, peer={}: {}",
+                                plan_id, s->ks_name(), s->cf_name(), from.addr, ep);
                    });
                    return make_ready_future<rpc::sink<int>>(sink);
                });
--- a/streaming/stream_transfer_task.cc
+++ b/streaming/stream_transfer_task.cc
@@ -42,6 +42,7 @@
 #include "streaming/stream_session.hh"
 #include "streaming/stream_manager.hh"
 #include "streaming/stream_reason.hh"
+#include "streaming/stream_mutation_fragments_cmd.hh"
 #include "mutation_reader.hh"
 #include "frozen_mutation.hh"
 #include "mutation.hh"
@@ -105,6 +106,21 @@ struct send_info {
        , prs(to_partition_ranges(ranges))
        , reader(cf.make_streaming_reader(cf.schema(), prs)) {
    }
+    // Resolves to true when the sharder yields at least one sub-range for the
+    // current shard out of `ranges`, and to false otherwise.
+    // NOTE(review): presumably dht::selective_token_range_sharder yields the
+    // parts of a range owned by the given shard - confirm.
+    future<bool> has_relevant_range_on_this_shard() {
+        return do_with(false, [this] (bool& found) {
+            auto check_range = [&found] (dht::token_range range) {
+                if (found) {
+                    // A relevant range was already seen; nothing left to check.
+                    return;
+                }
+                auto sharder = dht::selective_token_range_sharder(range, engine().cpu_id());
+                found = static_cast<bool>(sharder.next());
+            };
+            return do_for_each(ranges, std::move(check_range)).then([&found] {
+                return found;
+            });
+        });
+    }
    future<size_t> estimate_partitions() {
        return do_with(cf.get_sstables(), size_t(0), [this] (auto& sstables, size_t& partition_count) {
            return do_for_each(*sstables, [this, &partition_count] (auto& sst) {
@@ -161,7 +177,7 @@ future<> send_mutations(lw_shared_ptr<send_info> si) {
 future<> send_mutation_fragments(lw_shared_ptr<send_info> si) {
  return si->estimate_partitions().then([si] (size_t estimated_partitions) {
    sslog.info("[Stream #{}] Start sending ks={}, cf={}, estimated_partitions={}, with new rpc streaming", si->plan_id, si->cf.schema()->ks_name(), si->cf.schema()->cf_name(), estimated_partitions);
-    return netw::get_local_messaging_service().make_sink_and_source_for_stream_mutation_fragments(si->reader.schema()->version(), si->plan_id, si->cf_id, estimated_partitions, si->reason, si->id).then([si] (rpc::sink<frozen_mutation_fragment> sink, rpc::source<int32_t> source) mutable {
+    return netw::get_local_messaging_service().make_sink_and_source_for_stream_mutation_fragments(si->reader.schema()->version(), si->plan_id, si->cf_id, estimated_partitions, si->reason, si->id).then([si] (rpc::sink<frozen_mutation_fragment, stream_mutation_fragments_cmd> sink, rpc::source<int32_t> source) mutable {
        auto got_error_from_peer = make_lw_shared<bool>(false);

        auto source_op = [source, got_error_from_peer, si] () mutable -> future<> {
@@ -184,18 +200,25 @@ future<> send_mutation_fragments(lw_shared_ptr<send_info> si) {
        }();

        auto sink_op = [sink, si, got_error_from_peer] () mutable -> future<> {
-            return do_with(std::move(sink), [si, got_error_from_peer] (rpc::sink<frozen_mutation_fragment>& sink) {
+            return do_with(std::move(sink), [si, got_error_from_peer] (rpc::sink<frozen_mutation_fragment, stream_mutation_fragments_cmd>& sink) {
                return repeat([&sink, si, got_error_from_peer] () mutable {
                    return si->reader(db::no_timeout).then([&sink, si, s = si->reader.schema(), got_error_from_peer] (mutation_fragment_opt mf) mutable {
                        if (mf && !(*got_error_from_peer)) {
                            frozen_mutation_fragment fmf = freeze(*s, *mf);
                            auto size = fmf.representation().size();
                            streaming::get_local_stream_manager().update_progress(si->plan_id, si->id.addr, streaming::progress_info::direction::OUT, size);
-                            return sink(fmf).then([] { return stop_iteration::no; });
+                            return sink(fmf, stream_mutation_fragments_cmd::mutation_fragment_data).then([] { return stop_iteration::no; });
                        } else {
                            return make_ready_future<stop_iteration>(stop_iteration::yes);
                        }
                    });
+                }).then([&sink] () mutable {
+                    return sink(frozen_mutation_fragment(bytes_ostream()), stream_mutation_fragments_cmd::end_of_stream);
+                }).handle_exception([&sink] (std::exception_ptr ep) mutable {
+                    // Notify the receiver the sender has failed
+                    return sink(frozen_mutation_fragment(bytes_ostream()), stream_mutation_fragments_cmd::error).then([ep = std::move(ep)] () mutable {
+                        return make_exception_future<>(std::move(ep));
+                    });
                }).finally([&sink] () mutable {
                    return sink.close();
                });
@@ -222,11 +245,18 @@ future<> stream_transfer_task::execute() {
    auto reason = session->get_reason();
    return session->get_db().invoke_on_all([plan_id, cf_id, id, dst_cpu_id, ranges=this->_ranges, streaming_with_rpc_stream, reason] (database& db) {
        auto si = make_lw_shared<send_info>(db, plan_id, cf_id, std::move(ranges), id, dst_cpu_id, reason);
-        if (streaming_with_rpc_stream) {
-            return send_mutation_fragments(std::move(si));
-        } else {
-            return send_mutations(std::move(si));
-        }
+        return si->has_relevant_range_on_this_shard().then([si, plan_id, cf_id, streaming_with_rpc_stream] (bool has_relevant_range_on_this_shard) {
+            if (!has_relevant_range_on_this_shard) {
+                sslog.debug("[Stream #{}] stream_transfer_task: cf_id={}: ignore ranges on shard={}",
+                        plan_id, cf_id, engine().cpu_id());
+                return make_ready_future<>();
+            }
+            if (streaming_with_rpc_stream) {
+                return send_mutation_fragments(std::move(si));
+            } else {
+                return send_mutations(std::move(si));
+            }
+        });
    }).then([this, plan_id, cf_id, id, streaming_with_rpc_stream] {
        sslog.debug("[Stream #{}] SEND STREAM_MUTATION_DONE to {}, cf_id={}", plan_id, id, cf_id);
        return session->ms().send_stream_mutation_done(id, plan_id, _ranges,
--- a/table.cc
+++ b/table.cc
@@ -136,9 +136,12 @@ contains_rows(const sstables::sstable& sst, const schema_ptr& schema, const ck_f
 // of a range for each clustering component.
 static std::vector<sstables::shared_sstable>
 filter_sstable_for_reader(std::vector<sstables::shared_sstable>&& sstables, column_family& cf, const schema_ptr& schema,
-        const sstables::key& key, const query::partition_slice& slice) {
-    auto sstable_has_not_key = [&] (const sstables::shared_sstable& sst) {
-        return !sst->filter_has_key(key);
+        const dht::partition_range& pr, const sstables::key& key, const query::partition_slice& slice) {
+    const dht::ring_position& pr_key = pr.start()->value();
+    auto sstable_has_not_key = [&, cmp = dht::ring_position_comparator(*schema)] (const sstables::shared_sstable& sst) {
+        return cmp(pr_key, sst->get_first_decorated_key()) < 0 ||
+               cmp(pr_key, sst->get_last_decorated_key()) > 0 ||
+               !sst->filter_has_key(key);
    };
    sstables.erase(boost::remove_if(sstables, sstable_has_not_key), sstables.end());

@@ -286,7 +289,7 @@ create_single_key_sstable_reader(column_family* cf,
 {
    auto key = sstables::key::from_partition_key(*schema, *pr.start()->value().key());
    auto readers = boost::copy_range<std::vector<flat_mutation_reader>>(
-        filter_sstable_for_reader(sstables->select(pr), *cf, schema, key, slice)
+        filter_sstable_for_reader(sstables->select(pr), *cf, schema, pr, key, slice)
        | boost::adaptors::transformed([&] (const sstables::shared_sstable& sstable) {
            tracing::trace(trace_state, "Reading key {} from sstable {}", pr, seastar::value_of([&sstable] { return sstable->get_filename(); }));
            return sstable->read_row_flat(schema, pr.start()->value(), slice, pc, resource_tracker, fwd);
@@ -510,8 +513,8 @@ table::make_streaming_reader(schema_ptr s,
    return make_flat_multi_range_reader(s, std::move(source), ranges, slice, pc, nullptr, mutation_reader::forwarding::no);
 }

-flat_mutation_reader table::make_streaming_reader(schema_ptr schema, const dht::partition_range& range, mutation_reader::forwarding fwd_mr) const {
-    const auto& slice = schema->full_slice();
+flat_mutation_reader table::make_streaming_reader(schema_ptr schema, const dht::partition_range& range,
+        const query::partition_slice& slice, mutation_reader::forwarding fwd_mr) const {
    const auto& pc = service::get_local_streaming_read_priority();
    auto trace_state = tracing::trace_state_ptr();
    const auto fwd = streamed_mutation::forwarding::no;
@@ -1029,6 +1032,7 @@ table::reshuffle_sstables(std::set<int64_t> all_generations, int64_t start) {
    };

    return do_with(work(start, std::move(all_generations)), [this] (work& work) {
+        tlogger.info("Reshuffling SSTables in {}...", _config.datadir);
        return lister::scan_dir(_config.datadir, { directory_entry_type::regular }, [this, &work] (fs::path parent_dir, directory_entry de) {
            auto comps = sstables::entry_descriptor::make_descriptor(parent_dir.native(), de.name);
            if (comps.component != component_type::TOC) {
@@ -1345,7 +1349,8 @@ future<> table::cleanup_sstables(sstables::compaction_descriptor descriptor, boo
            return with_semaphore(sem, 1, [this, &sst, &release_fn, is_actual_cleanup] {
                // release reference to sstables cleaned up, otherwise space usage from their data and index
                // components cannot be reclaimed until all of them are cleaned.
-                auto descriptor = sstables::compaction_descriptor({ std::move(sst) }, sst->get_sstable_level());
+                auto sstable_level = sst->get_sstable_level();
+                auto descriptor = sstables::compaction_descriptor({ std::move(sst) }, sstable_level);
                descriptor.release_exhausted = release_fn;
                return this->compact_sstables(std::move(descriptor), is_actual_cleanup);
            });
@@ -1956,6 +1961,8 @@ future<int64_t>
 table::disable_sstable_write() {
    _sstable_writes_disabled_at = std::chrono::steady_clock::now();
    return _sstables_lock.write_lock().then([this] {
+      // _sstable_deletion_sem must be acquired after _sstables_lock.write_lock
+      return _sstable_deletion_sem.wait().then([this] {
        if (_sstables->all()->empty()) {
            return make_ready_future<int64_t>(0);
        }
@@ -1964,9 +1971,18 @@ table::disable_sstable_write() {
            max = std::max(max, s->generation());
        }
        return make_ready_future<int64_t>(max);
+      });
    });
 }

+// Counterpart of disable_sstable_write(): releases the deletion semaphore and
+// the sstables write lock, and returns how long writes were disabled.
+// A new_generation of -1 leaves the known sstable generation untouched.
+std::chrono::steady_clock::duration table::enable_sstable_write(int64_t new_generation) {
+    const bool has_new_generation = new_generation != -1;
+    if (has_new_generation) {
+        update_sstables_known_generation(new_generation);
+    }
+    // Release in the reverse order of acquisition: semaphore first, then lock.
+    _sstable_deletion_sem.signal();
+    _sstables_lock.write_unlock();
+    const auto reenabled_at = std::chrono::steady_clock::now();
+    return reenabled_at - _sstable_writes_disabled_at;
+}

 void table::set_schema(schema_ptr s) {
    tlogger.debug("Changing schema version of {}.{} ({}) from {} to {}",
@@ -2456,7 +2472,7 @@ future<row_locker::lock_holder> table::push_view_replica_updates(const schema_pt
    return push_view_replica_updates(s, std::move(m), timeout);
 }

-future<row_locker::lock_holder> table::do_push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, mutation_source&& source) const {
+future<row_locker::lock_holder> table::do_push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, mutation_source&& source, const io_priority_class& io_priority) const {
    if (!_config.view_update_concurrency_semaphore->current()) {
        // We don't have resources to generate view updates for this write. If we reached this point, we failed to
        // throttle the client. The memory queue is already full, waiting on the semaphore would cause this node to
@@ -2496,13 +2512,13 @@ future<row_locker::lock_holder> table::do_push_view_replica_updates(const schema
    // We'll return this lock to the caller, which will release it after
    // writing the base-table update.
    future<row_locker::lock_holder> lockf = local_base_lock(base, m.decorated_key(), slice.default_row_ranges(), timeout);
-    return lockf.then([m = std::move(m), slice = std::move(slice), views = std::move(views), base, this, timeout, source = std::move(source)] (row_locker::lock_holder lock) {
+    return lockf.then([m = std::move(m), slice = std::move(slice), views = std::move(views), base, this, timeout, source = std::move(source), &io_priority] (row_locker::lock_holder lock) {
      return do_with(
        dht::partition_range::make_singular(m.decorated_key()),
        std::move(slice),
        std::move(m),
-        [base, views = std::move(views), lock = std::move(lock), this, timeout, source = std::move(source)] (auto& pk, auto& slice, auto& m) mutable {
-            auto reader = source.make_reader(base, pk, slice, service::get_local_sstable_query_read_priority());
+        [base, views = std::move(views), lock = std::move(lock), this, timeout, source = std::move(source), &io_priority] (auto& pk, auto& slice, auto& m) mutable {
+            auto reader = source.make_reader(base, pk, slice, io_priority);
            return this->generate_and_propagate_view_updates(base, std::move(views), std::move(m), std::move(reader)).then([lock = std::move(lock)] () mutable {
                // return the local partition/row lock we have taken so it
                // remains locked until the caller is done modifying this
@@ -2514,11 +2530,11 @@ future<row_locker::lock_holder> table::do_push_view_replica_updates(const schema
 }

 future<row_locker::lock_holder> table::push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout) const {
-    return do_push_view_replica_updates(s, std::move(m), timeout, as_mutation_source());
+    // Reads existing data through the table's full mutation source, using
+    // the sstable query read I/O priority class.
+    return do_push_view_replica_updates(s, std::move(m), timeout, as_mutation_source(), service::get_local_sstable_query_read_priority());
 }

 future<row_locker::lock_holder> table::stream_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, sstables::shared_sstable excluded_sstable) const {
-    return do_push_view_replica_updates(s, std::move(m), timeout, as_mutation_source_excluding(std::move(excluded_sstable)));
+    // Reads existing data through a mutation source that leaves out
+    // `excluded_sstable`, using the streaming write I/O priority class
+    // instead of the query read one.
+    return do_push_view_replica_updates(s, std::move(m), timeout, as_mutation_source_excluding(std::move(excluded_sstable)), service::get_local_streaming_write_priority());
 }

 mutation_source
--- a/tests/cql_assertions.cc
+++ b/tests/cql_assertions.cc
@@ -178,3 +178,14 @@ rows_assertions rows_assertions::with_serialized_columns_count(size_t columns_co
    }
    return {*this};
 }
+
+// Executes `query` against `env` and returns its result. Any exception is
+// turned into a BOOST_FAIL that reports the query, the exception and the
+// source location passed in `loc`.
+shared_ptr<cql_transport::messages::result_message> cquery_nofail(
+        cql_test_env& env, const char* query, const std::experimental::source_location& loc) {
+    using result_ptr = shared_ptr<cql_transport::messages::result_message>;
+    try {
+        auto result = env.execute_cql(query).get0();
+        return result;
+    } catch (...) {
+        const auto failure = format("query '{}' failed: {}\n{}:{}: originally from here",
+                                    query, std::current_exception(), loc.file_name(), loc.line());
+        BOOST_FAIL(failure);
+    }
+    // Not reached when BOOST_FAIL throws; kept so every path returns a value.
+    return result_ptr(nullptr);
+}
--- a/tests/cql_assertions.hh
+++ b/tests/cql_assertions.hh
@@ -22,8 +22,10 @@

 #pragma once

+#include "tests/cql_test_env.hh"
 #include "transport/messages/result_message_base.hh"
 #include "bytes.hh"
+#include <experimental/source_location>
 #include <seastar/core/shared_ptr.hh>
 #include <seastar/core/future.hh>

@@ -76,3 +78,12 @@ void assert_that_failed(future<T...>&& f)
    catch (...) {
    }
 }
+
+/// Invokes env.execute_cql(query), awaits its result, and returns it.  If an exception is thrown,
+/// invokes BOOST_FAIL with useful diagnostics.
+///
+/// \param env   test environment the query is executed against.
+/// \param query CQL statement text to execute.
+/// \param loc   source location reported in the failure message; defaults to the call site.
+/// \return the result message produced by the query.
+///
+/// \note Should be called from a seastar::thread context, as it awaits the CQL result.
+shared_ptr<cql_transport::messages::result_message> cquery_nofail(
+        cql_test_env& env,
+        const char* query,
+        const std::experimental::source_location& loc = std::experimental::source_location::current());
--- a/Show More
+++ b/Show More