Compare commits

...

152 Commits

Author SHA1 Message Date
Hagit Segev
b0d122f9c5 release: prepare for 3.1.3 2020-01-28 14:09:57 +02:00
Asias He
9a10e4a245 repair: Avoid duplicated partition_end write
Consider this:

1) Write partition_start of p1
2) Write clustering_row of p1
3) Write partition_end of p1
4) Repair is stopped due to error before writing partition_start of p2
5) Repair calls repair_row_level_stop() to tear down, which calls
   wait_for_writer_done(), and a duplicate partition_end is written.

To fix, track the partition_start and partition_end that were written and
avoid unpaired writes.
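The tracking described above can be sketched in a few lines (hypothetical names; the real repair writer is considerably more involved):

```cpp
#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

// Sketch of the fix: remember whether a partition_start was emitted and
// make the partition_end write idempotent, so a teardown path such as
// wait_for_writer_done() cannot append a duplicate partition_end.
struct repair_writer_sketch {
    std::vector<std::string> log;
    bool partition_open = false;

    void write_partition_start() {
        assert(!partition_open);        // no unpaired partition_start
        partition_open = true;
        log.push_back("partition_start");
    }
    void write_clustering_row() {
        assert(partition_open);
        log.push_back("clustering_row");
    }
    void maybe_write_partition_end() {  // safe to call from teardown
        if (partition_open) {
            log.push_back("partition_end");
            partition_open = false;
        }
    }
};

// Replays the scenario above: p1 fully written, then repair stops with an
// error and teardown calls maybe_write_partition_end() a second time.
inline std::size_t interrupted_repair_stream_length() {
    repair_writer_sketch w;
    w.write_partition_start();
    w.write_clustering_row();
    w.maybe_write_partition_end();      // end of p1
    w.maybe_write_partition_end();      // teardown: no duplicate written
    return w.log.size();
}
```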

Backports: 3.1 and 3.2
Fixes: #5527
(cherry picked from commit 401854dbaf)
2020-01-21 13:39:19 +02:00
Piotr Sarna
871d1ebdd5 view: ignore duplicated key entries in progress virtual reader
The build progress virtual reader uses the Scylla-specific
scylla_views_builds_in_progress table in order to represent
legacy views_builds_in_progress rows. The Scylla-specific table contains
an additional cpu_id clustering key part, which is trimmed before returning
it to the user. That may cause duplicated clustering row fragments to be
emitted by the reader, which may cause undefined behaviour in consumers.
The solution is to keep track of previous clustering keys for each
partition and drop fragments that would cause duplication. That way if
any shard is still building a view, its progress will be returned,
and if many shards are still building, the returned value will indicate
the progress of a single arbitrary shard.
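A minimal sketch of the dedup idea, with hypothetical types standing in for the real fragment stream:

```cpp
#include <cassert>
#include <optional>
#include <string>
#include <utility>
#include <vector>

// Sketch: after trimming the cpu_id clustering key part, consecutive rows
// can share the same remaining key. Track the previously emitted key and
// drop fragments that would duplicate it, so one arbitrary shard's
// progress is returned for each key.
inline std::vector<std::string>
trim_and_dedup(const std::vector<std::pair<std::string, int>>& rows) {
    std::vector<std::string> out;
    std::optional<std::string> last_key;
    for (const auto& [view_name, cpu_id] : rows) {
        (void)cpu_id;  // cpu_id is trimmed before returning to the user
        if (last_key && *last_key == view_name) {
            continue;  // would emit a duplicated clustering row fragment
        }
        last_key = view_name;
        out.push_back(view_name);
    }
    return out;
}
```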

Fixes #4524
Tests:
unit(dev) + custom monotonicity checks from <tgrabiec@scylladb.com>

(cherry picked from commit 85a3a4b458)
2020-01-16 12:07:40 +01:00
Tomasz Grabiec
bff996959d cql: alter type: Format field name as text instead of hex
Fixes #4841

Message-Id: <1565702635-26214-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 64ff1b6405)
2020-01-05 18:51:53 +02:00
Gleb Natapov
1bdc83540b cache_hitrate_calculator: do not ignore a future returned from gossiper::add_local_application_state
We should wait for the future returned from add_local_application_state()
to resolve before issuing a new calculation; otherwise, two
add_local_application_state() calls may run simultaneously for the same state.
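The serialization this asks for can be modeled without Seastar as a simple in-flight guard (plain C++ sketch, not the actual gossiper API):

```cpp
#include <atomic>
#include <cassert>

// Sketch: refuse to start a new calculation while the previous call's
// future has not resolved, so two calls cannot overlap for the same state.
class calculation_guard {
    std::atomic<bool> _in_flight{false};
public:
    bool try_start() {                  // false if a call is still pending
        bool expected = false;
        return _in_flight.compare_exchange_strong(expected, true);
    }
    void finish() { _in_flight.store(false); }
};

inline bool overlapping_start_allowed() {
    calculation_guard g;
    g.try_start();                      // first calculation in flight
    return g.try_start();               // must be rejected
}

inline bool start_after_finish_allowed() {
    calculation_guard g;
    g.try_start();
    g.finish();                         // previous future resolved
    return g.try_start();               // now a new calculation may start
}
```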

Fixes #4838.

Message-Id: <20190812082158.GE17984@scylladb.com>
(cherry picked from commit 00c4078af3)
2020-01-05 18:50:13 +02:00
Takuya ASADA
478c35e07a dist/debian: fix missing scyllatop files
The Debian package build script runs relocate_python_scripts.py for scyllatop,
but mistakenly forgets to install tools/scyllatop/*.py.
We need to install them using scylla-server.install.

Fixes #5518

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20191227025750.434407-1-syuu@scylladb.com>
2019-12-30 19:38:34 +02:00
Benny Halevy
ba968ab9ec tracing: one_session_records: keep local tracing ptr
Similar to trace_state, keep a shared_ptr<tracing> _local_tracing_ptr
in one_session_records when it is constructed, so it can be used
during shutdown.

Fixes #5243

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 7aef39e400)
2019-12-24 18:42:21 +02:00
Avi Kivity
883b5e8395 database: fix schema use-after-move in make_multishard_streaming_reader
On aarch64, asan detected a use-after-move. It doesn't happen on x86_64,
likely due to different argument evaluation order.

Fix by evaluating full_slice before moving the schema.

Note: I used "auto&&" and "std::move()" even though full_slice()
returns a reference. I think this is safer in case full_slice()
changes, and works just as well with a reference.

Fixes #5419.

(cherry picked from commit 85822c7786)
2019-12-24 18:35:01 +02:00
Rafael Ávila de Espíndola
b47033676a types: recreate dependent user types.
In the system.types table a user type refers to another by name. When
a user type is modified, only its entry in the table is changed.

At runtime a user type has a direct pointer to the types it uses. To
handle the discrepancy we need to recreate any dependent types when an
entry in system.types changes.

Fixes #5049

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
(cherry picked from commit 5af8b1e4a3)
2019-12-23 17:58:26 +02:00
Tomasz Grabiec
67e45b73f0 types: Fix abort on type alter which affects a compact storage table with no regular columns
Fixes #4837

Message-Id: <1565702247-23800-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 34cff6ed6b)
2019-12-23 17:34:06 +02:00
Rafael Ávila de Espíndola
37eac75b6f cql: Fix use of UDT in reversed columns
We were missing calls to underlying_type in a few locations, so the
insert would consider the given literal invalid and the select would
refuse to fetch a UDT field.

Fixes #4672

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20190708200516.59841-1-espindola@scylladb.com>
(cherry picked from commit 4e7ffb80c0)
2019-12-23 15:54:36 +02:00
Piotr Sarna
e8431a3474 table: Reduce read amplification in view update generation
This commit makes sure that single-partition readers for
read-before-write do not have fast-forwarding enabled,
as it may lead to huge read amplification. The observed case was:
1. Creating an index:
     CREATE INDEX index1 ON myks2.standard1 ("C1");
2. Running cassandra-stress in order to generate view updates:
     cassandra-stress write no-warmup n=1000000 cl=ONE -schema \
       'replication(factor=2) compaction(strategy=LeveledCompactionStrategy)' \
       keyspace=myks2 -pop seq=4000000..8000000 -rate threads=100 -errors \
       skip-read-validation -node 127.0.0.1;

Without disabling fast-forwarding, single-partition readers
were turned into scanning readers in cache, which resulted
in reading 36GB (sic!) on a workload which generates less
than 1GB of view updates. After applying the fix, the number
dropped down to less than 1GB, as expected.

Refs #5409
Fixes #4615
Fixes #5418

(cherry picked from commit 79c3a508f4)
2019-12-05 22:36:20 +02:00
Yaron Kaikov
9d78d848e6 release: prepare for 3.1.2 2019-11-27 10:24:43 +02:00
Rafael Ávila de Espíndola
32aa6ddd7e commitlog: make sure a file is closed
If allocate or truncate throws, we have to close the file.

Fixes #4877

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20191114174810.49004-1-espindola@scylladb.com>
(cherry picked from commit 6160b9017d)
2019-11-24 17:48:24 +02:00
Tomasz Grabiec
74cc9477af row_cache: Fix abort on bad_alloc during cache update
Since 90d6c0b, the cache will abort when trying to detach partition
entries while they're being updated. This should never happen. It can
happen, though, when the update fails on bad_alloc, because the cleanup
guard invalidates the cache before it releases the partition snapshots
(held by the "update" coroutine).

Fix by destroying the coroutine first.

Fixes #5327.

Tests:
  - row_cache_test (dev)

Message-Id: <1574360259-10132-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit e3d025d014)
2019-11-24 17:44:07 +02:00
Nadav Har'El
95acf71680 merge: row_marker: correct row expiry condition
Merged patch set by Piotr Dulikowski:

This change corrects condition on which a row was considered expired by its
TTL.

The logic that decides when a row becomes expired was inconsistent with the
logic that decides if a single cell is expired. A single cell becomes expired
when expiry_timestamp <= now, while a row became expired when
expiry_timestamp < now (notice the strict inequality). For rows inserted
with TTL, this caused non-key cells to expire (change their values to null)
one second before the row disappeared. Now, row expiry logic uses non-strict
inequality.
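The two conditions side by side, as a sketch:

```cpp
#include <cassert>
#include <cstdint>

// Sketch: cells and rows must agree on when TTL'd data dies. A cell is
// dead once expiry <= now; before the fix a row used the strict
// expiry < now, so at expiry == now the non-key cells already read as
// null while the row itself survived for one more second.
inline bool cell_is_expired(int64_t expiry, int64_t now)      { return expiry <= now; }
inline bool row_is_expired_old(int64_t expiry, int64_t now)   { return expiry <  now; }
inline bool row_is_expired_fixed(int64_t expiry, int64_t now) { return expiry <= now; }
```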

Fixes #4263,
Fixes #5290.

Tests:

    unit(dev)
    python test described in issue #5290

(cherry picked from commit 9b9609c65b)
2019-11-20 21:40:11 +02:00
Asias He
921f8baf00 gossip: Fix max generation drift measure
Assume n1 and n2 are in a cluster with generation numbers g1 and g2. The
cluster runs for more than 1 year (MAX_GENERATION_DIFFERENCE). When n1
reboots with generation g1', which is time-based, n2 will see
g1' > g2 + MAX_GENERATION_DIFFERENCE and reject n1's gossip update.

To fix, check the generation drift against the generation value this node
would get if it were restarted.
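A sketch of the changed check (the constant's value is an assumption based on the 1-year figure above):

```cpp
#include <cassert>
#include <cstdint>

// Assumed value: one year of seconds, matching the commit's description.
constexpr int64_t MAX_GENERATION_DIFFERENCE = 365LL * 24 * 3600;

// Buggy check: compares against the peer's last stored generation, which
// goes stale once the cluster outlives MAX_GENERATION_DIFFERENCE.
inline bool reject_generation_old(int64_t incoming, int64_t stored_peer_gen) {
    return incoming > stored_peer_gen + MAX_GENERATION_DIFFERENCE;
}

// Fixed check: compares against the generation this node would get if it
// restarted now, which is also time-based and therefore stays close.
inline bool reject_generation_fixed(int64_t incoming, int64_t local_restart_gen) {
    return incoming > local_restart_gen + MAX_GENERATION_DIFFERENCE;
}
```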

This is a backport of CASSANDRA-10969.

Fixes #5164

(cherry picked from commit 0a52ecb6df)
2019-11-20 11:39:16 +02:00
Avi Kivity
071d7d9210 reloc: do not install dependencies when building the relocatable package
The dependencies are provided by the frozen toolchain. If a dependency
is missing, we must update the toolchain rather than rely on build-time
installation, which is not reproducible (as different package versions
are available at different times).

Luckily "dnf install" does not update an already-installed package. Had
that been the case, none of our builds would have been reproducible, since
packages would be updated to the latest version as of the build time rather
than the version selected by the frozen toolchain.

So, to prevent missing packages in the frozen toolchain translating to
an unreproducible build, remove the support for installing dependencies
from reloc/build_reloc.sh. We still parse the --nodeps option in case some
script uses it.

Fixes #5222.

Tests: reloc/build_reloc.sh.
(cherry picked from commit cd075e9132)
2019-11-18 14:58:24 +02:00
Kamil Braun
769b9bbe59 view: fix bug in virtual columns.
When creating a virtual column of non-frozen map type,
the wrong type was used for the map's keys.

Fixes #5165.

(cherry picked from commit ef9d5750c8)
2019-11-18 14:55:17 +02:00
Benny Halevy
d4e553c153 sstables: delete_atomically: fix misplaced parenthesis in pending_delete_log warning message
Fixes #4861.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20190818064637.9207-1-bhalevy@scylladb.com>
(cherry picked from commit 20083be9f6)
2019-11-17 17:57:00 +02:00
Avi Kivity
d983411488 build: adjust libthread_db file name to match gdb expectations
gdb searches for libthread_db.so using its canonical name of libthread_db.so.1 rather
than the file name of libthread_db-1.0.so, so use that name to store the file in the
archive.

Fixes #4996.

(cherry picked from commit d77171e10e)
2019-11-17 17:57:00 +02:00
Avi Kivity
27de1bb8e6 reconcilable_result: use chunked_vector to hold partitions
Usually, a reconcilable_result holds very few partitions (1 is common),
since the page size is limited to 1MB. But if we have paging disabled or
if we are reconciling a range full of tombstones, we may see many more.
This can cause large allocations.

Change to chunked_vector to prevent those large allocations, as they
can be quite expensive.

Fixes #4780.

(cherry picked from commit 093d2cd7e5)
2019-11-17 17:57:00 +02:00
Avi Kivity
854f8ccb40 utils::chunked_vector: add rbegin() and related iterators
Needed as an std::vector replacement.

(cherry picked from commit eaa9a5b0d7)

Prerequisite for #4780.
2019-11-17 17:57:00 +02:00
Avi Kivity
a68170c9a3 utils: chunked_vector: make begin()/end() const correct
begin() of a const vector should return a const_iterator, to avoid
giving the caller the ability to mutate it.

This slipped through since iterator's constructor does a const_cast.

Noticed by code inspection.

(cherry picked from commit df6faae980)

Prerequisite for #4780.
2019-11-17 17:57:00 +02:00
Glauber Costa
7e4bcf2c0f do not crash in user-defined operations if the controller is disabled
Scylla currently crashes if we run manual operations like nodetool
compact with the controller disabled. While we neither like nor
recommend running with the controller disabled, due to some corner cases
in the controller algorithm we are not yet at the point in which we can
deprecate this and are sometimes forced to disable it.

The reason for the crash is that manual operations will invoke
_backlog_of_shares, which returns what is the backlog needed to
create a certain number of shares. That scan the existing control
points, but when we run without the controller there are no control
points and we crash.

Backlog doesn't matter if the controller is disabled, and the return
value of this function will be immaterial in this case. So to avoid the
crash, we return something right away if the controller is disabled.

Fixes #5016

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit c9f2d1d105)
2019-11-17 12:33:23 +02:00
Avi Kivity
a74b3a182e Merge "Add proper aggregation for paged indexing" from Piotr
"
Fixes #4540

This series adds proper handling of aggregation for paged indexed queries.
Before this series, returned results were presented to the user in a
partial per-page manner, while they should have been returned as a single
aggregated value.

Tests: unit(dev)
"

* 'add_proper_aggregation_for_paged_indexing_for_3.1' of https://github.com/psarna/scylla:
  test: add 'eventually' block to index paging test
  tests: add indexing+paging test case for clustering keys
  tests: add indexing + paging + aggregation test case
  tests: add query_options to cquery_nofail
  cql3: make DEFAULT_COUNT_PAGE_SIZE constant public
  cql3: add proper aggregation to paged indexing
  cql3: add a query options constructor with explicit page size
  cql3: enable explicit copying of query_options
  cql3: split execute_base_query implementation
2019-11-17 12:23:30 +02:00
Piotr Sarna
e9bc579565 test: add 'eventually' block to index paging test
Without 'eventually', the test is flaky because the index may still
be out of date while its conditions are checked.

Fixes #4670

(cherry picked from commit ebbe038d19)
2019-11-15 09:12:40 +01:00
Piotr Sarna
ad46bf06a7 tests: add indexing+paging test case for clustering keys
Indexing a non-prefix part of the clustering key has a separate
code path (see issue #3405), so it deserves a separate test case.
2019-11-14 10:26:49 +01:00
Piotr Sarna
1ff21a28b7 tests: add indexing + paging + aggregation test case
Indexed queries used to erroneously return partial per-page results
for aggregation queries. This test case used to reproduce the problem
and now guards against regressions.

Refs #4540
2019-11-14 10:26:49 +01:00
Piotr Sarna
fb3dfaa736 tests: add query_options to cquery_nofail
The cquery_nofail utility is extended, so it can accept custom
query options, just like execute_cql does.
2019-11-14 10:26:49 +01:00
Piotr Sarna
5a02e6976f cql3: make DEFAULT_COUNT_PAGE_SIZE constant public
The constant will be later used in test scenarios.
2019-11-14 10:26:49 +01:00
Piotr Sarna
5202eea7a7 cql3: add proper aggregation to paged indexing
Aggregated and paged filtering needs to aggregate the results
from all pages in order to avoid returning partial per-page
results. It's a little bit more complicated than regular aggregation,
because each paging state needs to be translated between the base
table and the underlying view. The routine keeps fetching pages
from the underlying view, which are then used to fetch base rows,
which go straight to the result set builder.

Fixes #4540
2019-11-14 10:26:48 +01:00
Gleb Natapov
038733f1a5 storage_proxy: do not release mutation if not all replies were received
The MV backpressure code frees the mutation before delayed client replies
arrive in order to save memory. Commit 2d7c026d6e, which
introduced the logic, claimed to do so only when all replies are received,
but this is not the case. Fix the code to free the mutation only when all
replies have actually been received.

Fixes #5242

Message-Id: <20191113142117.GA14484@scylladb.com>
(cherry picked from commit 552c56633e)
2019-11-14 11:04:27 +02:00
Piotr Sarna
0ed2e90925 cql3: add a query options constructor with explicit page size
For internal use, there already exists a query_options constructor
that copies data from another query_options with overwritten paging
state. This commit adds an option to overwrite page size as well.
2019-11-14 09:58:35 +01:00
Piotr Sarna
9ee6d2bc15 cql3: enable explicit copying of query_options 2019-11-14 09:58:28 +01:00
Piotr Sarna
23582a2ce9 cql3: split execute_base_query implementation
In order to handle aggregation queries correctly, the function that
returns base query results is split into two, so it's possible to
access raw query results, before they're converted into end-user
CQL message.
2019-11-14 09:58:05 +01:00
Takuya ASADA
5ddf0ec1df dist/common/scripts/scylla_setup: don't proceed with empty NIC name
Currently, the NIC selection prompt in scylla_setup proceeds with setup
even when the user just presses Enter at the prompt.
The prompt should ask for the NIC name again until the user enters a
valid NIC name.

Fixes #4517
Message-Id: <20190617124925.11559-1-syuu@scylladb.com>

(cherry picked from commit 7320c966bc)
2019-11-13 17:27:21 +02:00
Avi Kivity
e6eb54af90 Update seastar submodule
* seastar 75488f6ef2...cfc082207c (2):
  > core: fix a race in execution stages
  > execution_stage: prevent unbounded growth

Fixes #4749.
Fixes #4856.
2019-11-13 13:14:27 +02:00
Piotr Sarna
f5a869966a view: fix view_info select statement for local indexes
Calculating the select statement for a given view_info structure
used to work fine, but once local indexes were introduced, a subtle
bug appeared: the legacy token column does not exist in local indexes,
and a valid clustering key column was omitted instead.
That results in potentially incorrect partition slices being used later
in read-before-write.
There's a long-term plan for removing select_statement from
view_info altogether, but nonetheless the bug needs to be fixed first.

cherry picked from commit 9e98b51aaa

Fixes #5241
Message-Id: <cb2e863e8e993e00ec7329505f737a9ce4b752ae.1572432826.git.sarna@scylladb.com>
2019-11-01 08:06:30 +02:00
Piotr Sarna
0c70cd626b index: add is_global_index() utility
The helper function is useful for determining whether a given schema
represents a global index.

cherry picked from commit 2ee8c6f595
Message-Id: <db5c9383e426fb2e55e5dbeebc7b8127afc91158.1572432826.git.sarna@scylladb.com>
2019-11-01 08:06:25 +02:00
Botond Dénes
0928aa4791 repair: repair_cf_range(): extract result of local checksum calculation only once
The loop collects the results of the checksum calculations and logs
any errors. The error logging includes `checksums[0]`, which corresponds
to the checksum calculation on the local node. This violates the
assumption of the code following the loop, which assumes that the future
of `checksums[0]` is intact after the loop terminates. However, this is
only true when the checksum calculation is successful; it is false when
it fails, as in that case the loop extracts the error and logs it. When
the code after the loop checks again whether said calculation failed, it
will get a false negative and will go ahead and attempt to extract the
value, triggering an assert failure.
Fix by making sure that even in the case of a failed checksum calculation,
the result of `checksums[0]` is extracted only once.

Fixes: #5238
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20191029151709.90986-1-bdenes@scylladb.com>
(cherry picked from commit e48f301e95)
2019-10-29 20:43:30 +02:00
Yaron Kaikov
f32ec885c4 release: prepare for 3.1.1 2019-10-24 21:55:50 +03:00
Tomasz Grabiec
762eec2bc6 Merge "Fix TTL serialization breakage" from Avi
Commit 93270dd changed gc_clock to be 64-bit, to fix the Y2038
problem. While 64-bit tombstone::deletion_time is serialized in a
compatible way, TTLs (gc_clock::duration) were not.

This patchset reverts TTL serialization to the 32-bit serialization
format, and also allows opting-in to the 64-bit format in case a
cluster was installed with the broken code. Only Scylla 3.1.0 is
vulnerable.

Fixes #4855

Tests: unit (dev)
(cherry picked from commit e621db591e)
2019-10-24 08:55:34 +03:00
Avi Kivity
3f4d9f210f Merge "Fix handling of schema alters and eviction in cache" from Tomasz
"
Fixes #5134, Eviction concurrent with preempted partition entry update after
  memtable flush may allow stale data to be populated into cache.

Fixes #5135, Cache reads may miss some writes if schema alter followed by a
  read happened concurrently with preempted partition entry update.

Fixes #5127, Cache populating read concurrent with schema alter may use the
  wrong schema version to interpret sstable data.

Fixes #5128, Reads of multi-row partitions concurrent with memtable flush may
  fail or cause a node crash after schema alter.
"

* tag 'fix-cache-issues-with-schema-alter-and-eviction-v2' of github.com:tgrabiec/scylla:
  tests: row_cache: Introduce test_alter_then_preempted_update_then_memtable_read
  tests: row_cache_stress_test: Verify all entries are evictable at the end
  tests: row_cache_stress_test: Exercise single-partition reads
  tests: row_cache_stress_test: Add periodic schema alters
  tests: memtable_snapshot_source: Allow changing the schema
  tests: simple_schema: Prepare for schema altering
  row_cache: Record upgraded schema in memtable entries during update
  memtable: Extract memtable_entry::upgrade_schema()
  row_cache, mvcc: Prevent locked snapshots from being evicted
  row_cache: Make evict() not use invalidate_unwrapped()
  mvcc: Introduce partition_snapshot::touch()
  row_cache, mvcc: Do not upgrade schema of entries which are being updated
  row_cache: Use the correct schema version to populate the partition entry
  delegating_reader: Optimize fill_buffer()
  row_cache, memtable: Use upgrade_schema()
  flat_mutation_reader: Introduce upgrade_schema()

(cherry picked from commit 8ed6f94a16)
2019-10-18 13:59:40 +02:00
Yaron Kaikov
9c3cdded9e release: prepare for 3.1.0 2019-10-12 08:45:49 +03:00
yaronkaikov
05272c53ed release: prepare for 3.1.0.rc9 2019-10-06 10:51:37 +03:00
Botond Dénes
393b2abdc9 querier_cache: correctly account entries evicted on insertion in the population
Currently, the population stat is not increased for entries that are
evicted immediately on insert, however the code that does the eviction
still decreases the population stat, leading to an imbalance and in some
cases the underflow of the population stat. To fix, unconditionally
increase the population stat upon inserting an entry, regardless of
whether it is immediately evicted or not.
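The accounting invariant can be sketched as follows (hypothetical names, not the real querier_cache code):

```cpp
#include <cassert>
#include <cstdint>

// Sketch: the eviction path always decrements the population stat, so the
// insert path must increment it unconditionally, even for entries that are
// evicted immediately on insert; otherwise the stat underflows.
struct population_stat {
    int64_t value = 0;
    void on_insert() { ++value; }  // unconditional after the fix
    void on_evict()  { --value; }
};

inline int64_t insert_immediately_evicted() {
    population_stat s;
    s.on_insert();   // count the entry even though...
    s.on_evict();    // ...it is evicted right away
    return s.value;  // balanced at 0, no underflow
}
```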

Fixes: #5123

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20191001153215.82997-1-bdenes@scylladb.com>
(cherry picked from commit 00b432b61d)
2019-10-05 12:36:21 +03:00
Avi Kivity
d9dc8f92cc Merge " hinted handoff: fix races during shutdown and draining" from Vlad
"
Fix races that may lead to use-after-free events and file system level exceptions
during shutdown and drain.

The root cause of the use-after-free events in question is that space_watchdog
blocks on end_point_hints_manager::file_update_mutex(), and we need to make sure
this mutex stays alive for as long as it's accessed, even if the corresponding
end_point_hints_manager instance is destroyed in the context of manager::drain_for().

File system exceptions may occur when space_watchdog attempts to scan a
directory while it's being deleted from the drain_for() context.
In case of such an exception, new hint generation is going to be blocked,
including for materialized views, until the next space_watchdog round (in 1s).

Issues that are fixed are #4685 and #4836.

Tested as follows:
 1) Patched the code in order to trigger the race with (a lot) higher
    probability and running slightly modified hinted handoff replace
    dtest with a debug binary for 100 times. Side effect of this
    testing was discovering of #4836.
 2) Using the same patch as above tested that there are no crashes and
    nodes survive stop/start sequences (they were not without this series)
    in the context of all hinted handoff dtests. Ran the whole set of
    tests with dev binary for 10 times.
"

Fixes #4685
Fixes #4836

* 'hinted_handoff_race_between_drain_for_and_space_watchdog_no_global_lock-v2' of https://github.com/vladzcloudius/scylla:
  hinted handoff: fix a race on a directory removal between space_watchdog and drain_for()
  hinted handoff: make taking file_update_mutex safe
  db::hints::manager::drain_for(): fix alignment
  db::hints::manager: serialize calls to drain_for()
  db::hints: cosmetics: indentation and missing method qualifier

(cherry picked from commit 3cb081eb84)
2019-10-05 09:50:05 +03:00
Gleb Natapov
c009f7b182 messaging_service: enable reuseaddr on messaging service rpc
Fixes #4943

Message-Id: <20190918152405.GV21540@scylladb.com>
(cherry picked from commit 73e3d0a283)
2019-10-03 14:42:38 +03:00
Avi Kivity
303a56f2bd Update seastar submodule
* seastar 7dfcf334c4...75488f6ef2 (2):
  > net: socket::{set,get}_reuseaddr() should not be virtual
  > Merge "fix some tcp connection bugs and add reuseaddr option to a client socket" from Gleb

Prerequisite for #4943.
2019-10-03 14:41:34 +03:00
Tomasz Grabiec
57512d3df9 db: read: Filter out sstables using their first and last keys
Affects single-partition reads only.

Refs #5113

When executing a query on the replica we do several things in order to
narrow down the sstable set we read from.

For tables which use LeveledCompactionStrategy, we store sstables in
an interval set and we select only sstables whose partition ranges
overlap with the queried range. Other compaction strategies don't
organize the sstables and will select all sstables at this stage. The
reasoning behind this is that for non-LCS compaction strategies the
sstables' ranges will typically overlap and using interval sets in
this case would not be effective and would result in quadratic (in
sstable count) memory consumption.

The assumption for overlap does not hold if the sstables come from
repair or streaming, which generates non-overlapping sstables.

At a later stage, for single-partition queries, we use the sstables'
bloom filter (kept in memory) to drop sstables which surely don't
contain given partition. Then we proceed to sstable indexes to narrow
down the data file range.

Tables which don't use LCS will do unnecessary I/O to read index pages
for single-partition reads if the partition is outside of the
sstable's range and the bloom filter is ineffective (Refs #5112).

This patch fixes the problem by consulting sstable's partition range
in addition to the bloom filter, so that the non-overlapping sstables
will be filtered out with certainty and not depend on bloom filter's
efficiency.

It's also faster to drop sstables based on the keys than the bloom
filter.

Tests:
  - unit (dev)
  - manual using cqlsh

Reviewed-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20190927122505.21932-1-tgrabiec@scylladb.com>
(cherry picked from commit b0e0f29b06)
2019-09-29 10:57:58 +03:00
Tomasz Grabiec
a894868298 sstables: Fix partition key count estimation for a range
The method sstable::estimated_keys_for_range() was severely
under-estimating the number of partitions in an sstable for a given
token range.

The first reason is that it underestimated the number of sstable index
pages covered by the range by one. In the extreme, if the requested range
falls into a single index page, we will assume 0 pages, and report 1
partition. The reason is that we were using
get_sample_indexes_for_range(), which returns entries with the keys
falling into the range, not entries for pages which may contain the
keys.

A single page can have a lot of partitions though. By default, there
is a 1:20000 ratio between summary entry size and the data file size
covered by it. If partitions are small, that can be many hundreds of
partitions.

Another reason is that we underestimate the number of partitions in an
index page. We multiply the number of pages by:

   (downsampling::BASE_SAMPLING_LEVEL * _components->summary.header.min_index_interval)
     / _components->summary.header.sampling_level

Using defaults, that means multiplying by 128. In the cassandra-stress
workload a single partition takes about 300 bytes in the data file and
summary entry is 22 bytes. That means a single page covers 22 * 20'000
= 440'000 bytes of the data file, which contains about 1'466
partitions. So we underestimate by an order of magnitude.

Underestimating the number of partitions will result in too small
bloom filters being generated for the sstables which are the output of
repair or streaming. This will make the bloom filters ineffective
which results in reads selecting more sstables than necessary.

The fix is to base the estimation on the number of index pages which
may contain keys for the range, and multiply that by the average key
count per index page.
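The arithmetic from the paragraphs above, spelled out (the values are the assumed defaults quoted in the message, not code from the fix):

```cpp
#include <cassert>
#include <cstdint>

// Sketch of the estimation arithmetic: a summary entry of ~22 bytes covers
// 20'000x its size in the data file; with ~300-byte partitions a single
// index page therefore holds ~1'466 partitions, and the estimate is the
// number of pages that may contain keys for the range times that average.
inline int64_t data_bytes_per_index_page(int64_t entry_bytes, int64_t ratio) {
    return entry_bytes * ratio;              // e.g. 22 * 20'000 = 440'000
}
inline int64_t keys_per_index_page(int64_t page_bytes, int64_t partition_bytes) {
    return page_bytes / partition_bytes;     // e.g. 440'000 / 300 = 1'466
}
inline int64_t estimated_keys_for_range(int64_t pages_covering_range,
                                        int64_t avg_keys_per_page) {
    return pages_covering_range * avg_keys_per_page;
}
```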

Fixes #5112.
Refs #4994.

The output of test_key_count_estimation:

Before:

count = 10000
est = 10112
est([-inf; +inf]) = 512
est([0; 0]) = 128
est([0; 63]) = 128
est([0; 255]) = 128
est([0; 511]) = 128
est([0; 1023]) = 128
est([0; 4095]) = 256
est([0; 9999]) = 512
est([5000; 5000]) = 1
est([5000; 5063]) = 1
est([5000; 5255]) = 1
est([5000; 5511]) = 1
est([5000; 6023]) = 128
est([5000; 9095]) = 256
est([5000; 9999]) = 256
est(non-overlapping to the left) = 1
est(non-overlapping to the right) = 1

After:

count = 10000
est = 10112
est([-inf; +inf]) = 10112
est([0; 0]) = 2528
est([0; 63]) = 2528
est([0; 255]) = 2528
est([0; 511]) = 2528
est([0; 1023]) = 2528
est([0; 4095]) = 5056
est([0; 9999]) = 10112
est([5000; 5000]) = 2528
est([5000; 5063]) = 2528
est([5000; 5255]) = 2528
est([5000; 5511]) = 2528
est([5000; 6023]) = 5056
est([5000; 9095]) = 7584
est([5000; 9999]) = 7584
est(non-overlapping to the left) = 0
est(non-overlapping to the right) = 0

Tests:
  - unit (dev)

Reviewed-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20190927141339.31315-1-tgrabiec@scylladb.com>
(cherry picked from commit b93cc21a94)
2019-09-28 22:12:04 +03:00
Raphael S. Carvalho
a5d385d702 sstables/compaction_manager: Don't perform upgrade on shared SSTables
compaction_manager::perform_sstable_upgrade() fails when it feeds
compaction mechanism with shared sstables. Shared sstables should
be ignored when performing upgrade and so wait for reshard to pick
them up in parallel. Whenever a shared sstable is brought up either
on restart or via refresh, reshard procedure kicks in.
Reshard picks the highest supported format so the upgrade for
shared sstable will naturally take place.

Fixes #5056.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20190925042414.4330-1-raphaelsc@scylladb.com>
(cherry picked from commit 571fa94eb5)
2019-09-28 17:39:09 +03:00
Avi Kivity
6413063b1b Merge "mvcc: Fix incorrect schema version being used to copy the mutation when applying (#5099)" from Tomasz
"
Currently affects only counter tables.

Introduced in 27014a2.

mutation_partition(s, mp) is incorrect because it uses s to interpret
mp, while it should use mp_schema.

We may hit this if the current node has a newer schema than the
incoming mutation. This can happen during table schema altering when we receive the
mutation from a node which hasn't processed the schema change yet.

This is undefined behavior in general. If the alter was adding or
removing columns, this may result in corruption of the write where
values of one column are inserted into a different column.

Fixes #5095.
"

* 'fix-schema-alter-counter-tables' of https://github.com/tgrabiec/scylla:
  mvcc: Fix incorrect schema version being used to copy the mutation when applying
  mutation_partition: Track and validate schema version in debug builds
  tests: Use the correct schema to access mutation_partition

(cherry picked from commit 83bc59a89f)
2019-09-28 17:38:04 +03:00
Tomasz Grabiec
0d31c6da62 Merge "storage_proxy: tolerate view_update_write_response_handler id not found on shutdown" from Benny
1. Add an assert in remove_response_handler to make crashes like the one in #5032 easier to understand.
2. Look up the view_update_write_response_handler id before calling timeout_cb and tolerate it not being found.
   Just log a warning if this happens.

Fixes #5032

(cherry picked from commit 06b9818e98)
2019-09-28 17:37:40 +03:00
Tomasz Grabiec
b62bb036ed Merge "toppartitions: don't transport schema_ptr across shards" from Avi
When the toppartitions operation gathers results, it copies partition
keys with their schema_ptr:s. When these schema_ptr:s are copied
or destroyed, they can cause leaks or premature frees of the schema
in its original shard, since reference count operations are not atomic.

Fix that by converting the schema_ptr to a global_schema_ptr during
transportation.

Fixes #5104 (direct bug)
Fixes #5018 (schema prematurely freed, toppartitions previously executed on that node)
Fixes #4973 (corrupted memory pool of the same size class as schema, toppartitions previously executed on that node)

Tests: new test added that fails with the existing code in debug mode,
manual toppartitions test

(cherry picked from commit 5b0e48f25b)
2019-09-28 17:35:19 +03:00
Glauber Costa
bdabd2e7a4 toppartitions: fix typo
toppartitons -> toppartitions

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20190627160937.7842-1-glauber@scylladb.com>
(cherry picked from commit d916601ea4)

Ref #5104 (prerequisite for patch)
2019-09-28 17:34:24 +03:00
Benny Halevy
d7fc7bcf9f commitlog: descriptor: skip leading path from filename
std::regex_match of the leading path may run out of stack
with long paths in debug build.

Using rfind instead to look up the last '/' in the pathname
and skip it if found.
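The rfind approach can be sketched as follows (a minimal sketch; the function name is illustrative, not the actual descriptor code): find the last '/' and keep only the filename component, with no regex and no risk of running out of stack on long paths.

```cpp
#include <cassert>
#include <string>

// Illustrative helper (name assumed): strip the leading directory part
// of a commitlog pathname by locating the last '/' with rfind.
std::string strip_leading_path(const std::string& pathname) {
    auto pos = pathname.rfind('/');
    return pos == std::string::npos ? pathname : pathname.substr(pos + 1);
}
```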

Fixes #4464

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20190505144133.4333-1-bhalevy@scylladb.com>
(cherry picked from commit d9136f96f3)
2019-09-23 11:29:26 +03:00
Hagit Segev
21aec9c7ef release: prepare for 3.1.0.rc8 2019-09-23 07:01:02 +03:00
Asias He
02ce19e851 storage_service: Replicate and advertise tokens early in the boot up process
When a node is restarted, there is a race between gossip starts (other
nodes will mark this node up again and send requests) and the tokens are
replicated to other shards. Here is an example:

- n1, n2
- n2 is down, n1 think n2 is down
- n2 starts again, n2 starts gossip service, n1 thinks n2 is up and sends
  reads/writes to n2, but n2 hasn't replicated the token_metadata to all
  the shards.
- n2 complains:
  token_metadata - sorted_tokens is empty in first_token_index!
  token_metadata - sorted_tokens is empty in first_token_index!
  token_metadata - sorted_tokens is empty in first_token_index!
  token_metadata - sorted_tokens is empty in first_token_index!
  token_metadata - sorted_tokens is empty in first_token_index!
  token_metadata - sorted_tokens is empty in first_token_index!
  storage_proxy - Failed to apply mutation from $ip#4: std::runtime_error
  (sorted_tokens is empty in first_token_index!)

The code path looks like below:

0 storage_service::init_server
1    prepare_to_join()
2          add gossip application state of NET_VERSION, SCHEMA and so on.
3         _gossiper.start_gossiping().get()
4    join_token_ring()
5           _token_metadata.update_normal_tokens(tokens, get_broadcast_address());
6           replicate_to_all_cores().get()
7           storage_service::set_gossip_tokens() which adds the gossip application state of TOKENS and STATUS

The race talked above is at line 3 and line 6.

To fix, we can replicate the token_metadata early after it is filled
with the tokens read from system table before gossip starts. So that
when other nodes think this restarting node is up, the tokens are
already replicated to all the shards.
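The reordering can be sketched as a sequence check (step names here are made up for illustration, not the actual function names): token_metadata is replicated to all shards before gossip starts, so a peer that marks this node up never observes empty sorted_tokens on any shard.

```cpp
#include <cassert>
#include <string>
#include <vector>

// Sketch of the fixed boot ordering (illustrative step names):
// replication of token_metadata is moved before starting gossip.
std::vector<std::string> boot_steps_fixed() {
    return {
        "load_tokens_from_system_table",
        "replicate_token_metadata_to_all_shards", // moved earlier by the fix
        "start_gossiping",                        // peers may now send reads/writes
        "join_token_ring",
    };
}
```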

In addition, this patch also fixes the issue that other nodes might see
a node missing the TOKENS and STATUS application state in gossip if that
node failed in the middle of a restarting process, i.e., it is killed
after line 3 and before line 7. As a result we could not replace the
node.

Tests: update_cluster_layout_tests.py
Fixes: #4709
Fixes: #4723
(cherry picked from commit 3b39a59135)
2019-09-22 12:45:22 +03:00
Eliran Sinvani
37c4be5e74 Storage proxy: protect against infinite recursion in query_partition_key_range_concurrent
A recent fix to #3767 limited the amount of ranges that
can return from query_ranges_to_vnodes_generator. This with
the combination of a large amount of token ranges can lead to
an infinite recursion. The algorithm multiplies by factor of
2 (actualy a shift left by one)  the amount of requested
tokens in each recursion iteration. As long as the requested
number of ranges is greater than 0, the recursion is implicit,
and each call is scheduled separately since the call is inside
a continuation of a map reduce.
But if the amount of iterations is large enough (~32) the
counter for requested ranges zeros out and from that moment on
two things will happen:
1. The counter will remain 0 forever (0*2 == 0)
2. The map reduce future will be immediately available and this
will result in the continuation being invoked immediately.
The latter causes the recursive call to be a "regular" recursive call,
i.e. through the stack and not the task queue of the scheduler, and
the former causes this recursion to be infinite.
The combination creates a stack that keeps growing and eventually
overflows resulting in undefined behavior (due to memory overrun).

This patch prevents the problem from happening: it limits the growth of
the concurrency counter to twice the last number of tokens returned
by the query_ranges_to_vnodes_generator, and also makes sure it does not
get stuck at zero.
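The clamp can be sketched as follows (function and parameter names are assumed for illustration, not the actual Scylla code): the counter doubles each iteration but is capped at twice the number of ranges actually returned last time, and can never collapse to zero.

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>

// Sketch of the guard (names assumed): doubling is clamped to twice the
// last returned count, and the result is never allowed to be zero,
// avoiding the 0 * 2 == 0 trap described above.
std::size_t next_concurrency(std::size_t current, std::size_t last_returned) {
    std::size_t doubled = current * 2;
    std::size_t cap = last_returned * 2;
    return std::max<std::size_t>(1, std::min(doubled, cap));
}
```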

Testing: * Unit test in dev mode.
         * Modified add 50 dtest that reproduce the problem

Fixes #4944

Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>
Message-Id: <20190922072838.14957-1-eliransin@scylladb.com>
(cherry picked from commit 280715ad45)
2019-09-22 11:59:19 +03:00
Avi Kivity
d81ac93728 Update seastar submodule
* seastar b314eb21b1...7dfcf334c4 (1):
  > iotune: fix exception handling in case test file creation fails

Fixes #5001.
2019-09-18 18:36:13 +03:00
Tomasz Grabiec
024d1563ad Revert "Simplify db::cql_type_parser::parse"
This reverts commit 7f64a6ec4b.

Fixes #5011

The reverted commit exposes #3760 for all schemas, not only those
which have UDTs.

The problem is that table schema deserialization now requires keyspace
to be present. If the replica hasn't received schema changes which
introduce the keyspace yet, the write will fail.

(cherry picked from commit 8517eecc28)
2019-09-12 20:17:39 +03:00
yaronkaikov
4a1a281e84 release: prepare for 3.1.0.rc7 2019-09-11 15:15:38 +03:00
Piotr Sarna
d61dd1a933 main: make sure view_builder doesn't propagate semaphore errors
Stopping services which occurs in a destructor of deferred_action
should not throw, or it will end the program with
terminate(). View builder breaks a semaphore during its shutdown,
which results in propagating a broken_semaphore exception,
which in turn results in throwing an exception during stop().get().
In order to fix that issue, semaphore exceptions are explicitly
ignored, since they're expected to appear during shutdown.

Fixes #4875
Fixes #4995.

(cherry picked from commit 23c891923e)
2019-09-10 16:34:46 +03:00
Gleb Natapov
447c1e3bcc messaging_service: configure different streaming domain for each rpc server
A streaming domain identifies a server across shards. Each server should
have different one.

Fixes: #4953

Message-Id: <20190908085327.GR21540@scylladb.com>
(cherry picked from commit 9e9f64d90e)
2019-09-09 20:36:11 +03:00
Botond Dénes
834b92b3d7 stream_session: STREAM_MUTATION_FRAGMENTS: print errors in receive and distribute phase
Currently when an error happens during the receive and distribute phase
it is swallowed and we just return a -1 status to the remote. We only
log errors that happen during responding with the status. This means
that when streaming fails, we only know that something went wrong, but
the node on which the failure happened doesn't log anything.

Fix by also logging errors happening in the receive and distribute
phase. Also mention the phase in which the error happened in both error
log messages.

Refs: #4901
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20190903115735.49915-1-bdenes@scylladb.com>
(cherry picked from commit 783277fb02)
2019-09-09 14:34:33 +03:00
Hagit Segev
2ec036f50c release: prepare for 3.1.0.rc6 2019-09-08 10:32:22 +03:00
Avi Kivity
958fe2024f Update seastar submodule
* seastar c59d019d6b...b314eb21b1 (2):
  > reactor: fix false positives in the stall detector due to large task queue
  > reactor: remove unused _tasks_processed variable

Ref #4955, #4951, #4899, #4898.
2019-09-05 14:39:53 +03:00
Rafael Ávila de Espíndola
cd998b949a sstable: close file_writer if an exception is thrown
The previous code was not exception safe and would eventually cause a
file to be destroyed without being closed, causing an assert failure.

Unfortunately it doesn't seem to be possible to test this without
error injection, since using an invalid directory fails before this
code is executed.

Fixes #4948

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20190904002314.79591-1-espindola@scylladb.com>
(cherry picked from commit 000514e7cc)
2019-09-05 10:14:36 +03:00
Avi Kivity
2e1e1392ea storage_proxy: protect _view_update_handlers_list iterators from invalidation
on_down() iterates over _view_update_handlers_list, but it yields during iteration,
and while it yields, elements in that list can be removed, resulting in a
use-after-free.

Prevent this by registering iterators that can be potentially invalidated, and
any time we remove an element from the list, check whether we're removing an element
that is being pointed to by a live iterator. If that is the case, advance the iterator
so that it points at a valid element (or at the end of the list).
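The protection scheme can be sketched with a heavily simplified container (types and names assumed, not the actual intrusive-list code): live iterators register themselves, and erasing an element first advances any registered iterator that points at it.

```cpp
#include <cassert>
#include <iterator>
#include <list>
#include <vector>

// Simplified sketch: erase() advances any registered live iterator that
// points at the element being removed, so it stays valid.
struct safe_list {
    std::list<int> elems;
    std::vector<std::list<int>::iterator*> live; // registered iterators

    void erase(std::list<int>::iterator it) {
        for (auto* p : live) {
            if (*p == it) {
                ++*p; // skip past the element about to disappear
            }
        }
        elems.erase(it);
    }
};
```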

Fixes #4912.

Tests: unit (dev)
(cherry picked from commit 301246f6c0)
2019-09-05 09:42:00 +03:00
yaronkaikov
623ea5e3d9 release: prepare for 3.1.0.rc5 2019-09-02 14:42:47 +03:00
Avi Kivity
f92a7ca2bf tools: toolchain: fix dbuild in interactive mode regression
Before ede1d248af, running "tools/toolchain/dbuild -it -- bash" was
a nice way to play in the toolchain environment, for example to start
a debugger. But that commit caused containers to run in detached mode,
which is incompatible with interactive mode.

To restore the old behavior, detect that the user wants interactive mode,
and run the container in non-detached mode instead. Add the --rm flag
so the container is removed after execution (as it was before ede1d248af).

Fixes #4930.

Message-Id: <20190506175942.27361-1-avi@scylladb.com>

(cherry picked from commit db536776d9)
2019-08-29 18:33:44 +03:00
Tomasz Grabiec
d70c2db09c service: Announce the new schema version when features are enabled
Introduced in c96ee98.

We call update_schema_version() after features are enabled and we
recalculate the schema version. This method is not updating gossip
though. The node will still use its database::version() to decide on
syncing, so it will not sync and stay inconsistent in gossip until the
next schema change.

We should call update_schema_version_and_announce() instead so that
the gossip state is also updated.

There is no actual schema inconsistency, but the joining node will
think there is and will wait indefinitely. Making a random schema
change would unblock it.

Fixes #4647.

Message-Id: <1566825684-18000-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit ac5ff4994a)
2019-08-27 08:35:58 +03:00
Paweł Dziepak
e4a39ed319 mutation_partition: verify row::append_cell() precondition
row::append_cell() has a precondition that the new cell column id needs
to be larger than that of any other already existing cell. If this
precondition is violated the row will end up in an invalid state. This
patch adds assertion to make sure we fail early in such cases.
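The precondition can be sketched with a heavily simplified row (the real row stores cells, not bare column ids): each appended cell's id must be strictly greater than the last one, so violations fail early rather than silently corrupting the row.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Simplified sketch: cells are kept ordered by column id, and
// append_cell asserts that ordering on every append.
struct row {
    std::vector<uint32_t> ids;
    void append_cell(uint32_t id) {
        assert(ids.empty() || id > ids.back()); // fail early on violation
        ids.push_back(id);
    }
};
```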

(cherry picked from commit 060e3f8ac2)
2019-08-23 15:05:59 +02:00
Hagit Segev
bb70b9ed56 release: prepare for 3.1.0.rc4 2019-08-22 21:12:42 +03:00
Avi Kivity
e06e795031 Merge "database: assign proper io priority for streaming view updates" from Piotr
"
Streamed view updates parasitized on writing io priority, which is
reserved for user writes - it's now properly bound to streaming
write priority.

Verified manually by checking appropriate io metrics: scylla_io_queue_total_bytes{class="streaming_write" ...} vs scylla_io_queue_total_bytes{class="query" ...}

Tests: unit(dev)
"

Fixes #4615.

* 'assign_proper_io_priority_to_streaming_view_updates' of https://github.com/psarna/scylla:
  db,view: wrap view update generation in stream scheduling group
  database: assign proper io priority for streaming view updates

(cherry picked from commit 2c7435418a)
2019-08-22 16:20:19 +03:00
Piotr Sarna
7d56e8e5bb storage_proxy: fix iterator liveness issue in on_down (#4876)
The loop over view update handlers used a guard in order to ensure
that the object is not prematurely destroyed (thus invalidating
the iterator), but the guard itself was not in the right scope.
Fixed by replacing a 'for' loop with a 'while' loop, which moves
the iterator incrementation inside the scope in which it's still
guarded and valid.

Fixes #4866

(cherry picked from commit 526f4c42aa)
2019-08-21 19:04:56 +03:00
Avi Kivity
417250607b relocatable: switch from run-time relocation to install-time relocation
Our current relocation works by invoking the dynamic linker with the
executable as an argument. This confuses gdb since the kernel records
the dynamic linker as the executable, not the real executable.

Switch to install-time relocation with patchelf: when installing the
executable and libraries, all paths are known, and we can update the
path to the dynamic loader and to the dynamic libraries.

Since patchelf itself is dynamically linked, we have to relocate it
dynamically (with the old method of invoking it via the dynamic linker).
This is okay since it's a one-time operation and since we don't expect
to debug core dumps of patchelf crashes.

We lose the ability to run scylla directly from the uninstalled
tarball, but since the nonroot installer is already moving in the
direction of requiring install.sh, that is not a great loss, and
certainly the ability to debug is more important.

dh_strip barfs on some binaries which were treated with patchelf,
so exclude them from dh_strip. This doesn't lose any functionality,
since these binaries didn't have debug information to begin with
(they are already-stripped Fedora executables).

Fixes #4673.

(cherry-picked from commit 698b72b501)

Backport notes:
 - 3.1 doesn't call install.sh from the debian packager, so add an adjust_bin
   and call it from the debian rules file directly
 - adjusted install.sh for 3.1 prefix (/usr) compared to master prefix (/opt/scylladb)
2019-08-20 17:08:49 +03:00
Pekka Enberg
d06bcef3b7 Merge "docker: relax permission checks" from Avi
"Commit e3f7fe4 added file owner validation to prevent Scylla from
 crashing when it tries to touch a file it doesn't own. However, under
 docker, we cannot expect to pass this check since user IDs are from
 different namespaces: the process runs in a container namespace, but the
 data files usually come from a mounted volume, and so their uids are
 from the host namespace.

 So we need to relax the check. We do this by reverting b1226fb, which
 causes Scylla to run as euid 0 in docker, and by special-casing euid 0
 in the ownership verification step.

 Fixes #4823."

* 'docker-euid-0' of git://github.com/avikivity/scylla:
  main: relax file ownership checks if running under euid 0
  Revert "dist/docker/redhat: change user of scylla services to 'scylla'"

(cherry picked from commit 595434a554)
2019-08-14 08:31:10 +03:00
Tomasz Grabiec
50c5cb6861 Merge "Multishard combining reader more robust reader recreation" from Botond
Make the reader recreation logic more robust, by moving away from
deciding which fragments have to be dropped based on a bunch of
special cases, instead replacing this with a general logic which just
drops all already seen fragments (based on their position).  Special
handling is added for the case when the last position is a range
tombstone with a non full prefix starting position.  Reproducer unit
tests are added for both cases.

Refs #4695
Fixes #4733

(cherry picked from commit 0cf4fab2ca)
2019-08-14 08:30:53 +03:00
Kamil Braun
70f5154109 Fix command line argument parsing in main.
Command line arguments are parsed twice in Scylla: once in main and once in Seastar's app_template::run.
The first parse is there to check if the "--version" flag is present --- in this case the version is printed
and the program exits. The second parsing is correct; however, most of the arguments were improperly treated
as positional arguments during the first parsing (e.g., "--network host" would treat "host" as a positional argument).
This happened because the arguments weren't known to the command line parser.
This commit fixes the issue by moving the parsing code to after the arguments are registered.
Resolves #4141.

Signed-off-by: Kamil Braun <kbraun@scylladb.com>
(cherry picked from commit f155a2d334)
2019-08-13 20:11:26 +03:00
Rafael Ávila de Espíndola
329c419c30 Always close commitlog files
We were using segment::_closed to decide whether _file was already
closed. Unfortunately they are not exactly the same thing. As far as
I understand it, segments can be closed and reused without actually
closing the file.

Found with a seastar patch that asserts on destroying an open
append_challenged_posix_file_impl.

Fixes #4745.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20190721171332.7995-1-espindola@scylladb.com>
(cherry picked from commit 636e2470b1)
2019-08-13 19:59:13 +03:00
Avi Kivity
062d43c76e Merge "Unbreak the Unbreakable Linux" from Glauber
"
scylla_setup is currently broken for OEL. This happens because the
OS detection code checks for RHEL and Fedora. CentOS returns itself
as RHEL, but OEL does not.
"

Fixes #4842.

* 'unbreakable' of github.com:glommer/scylla:
  scylla_setup: be nicer about unrecognized OS
  scylla_util: recognize OEL as part of the RHEL family

(cherry picked from commit 1cf72b39a5)
2019-08-13 16:52:05 +03:00
Avi Kivity
cf4c238b28 Merge "Catch unclosed partition sstable write #4794" from Tomasz
"
Not emitting partition_end for a partition is incorrect. SStable
writer assumes that it is emitted. If it's not, the sstable will not
be written correctly. The partition index entry for the last partition
will be left partially written, which will result in errors during
reads. Also, statistics and sstable key ranges will not include the
last partition.

It's better to catch this problem at the time of writing, and not
generate bad sstables.

Another way of handling this would be to implicitly generate a
partition_end, but I don't think that we should do this. We cannot
trust the mutation stream when invariants are violated, we don't know
if this was really the last partition which was supposed to be
written. So it's safer to fail the write.

Enabled for both mc and la/ka.

Passing --abort-on-internal-error on the command line will switch to
aborting instead of throwing an exception.

The reason we don't abort by default is that it may bring the whole
cluster down and cause unavailability, while it may not be necessary
to do so. It's safer to fail just the affected operation,
e.g. repair. However, failing the operation with an exception leaves
little information for debugging the root cause. So the idea is that the
user would enable aborts on only one of the nodes in the cluster to
get a core dump and not bring the whole cluster down.
"

* 'catch-unclosed-partition-sstable-write' of https://github.com/tgrabiec/scylla:
  sstables: writer: Validate that partition is closed when the input mutation stream ends
  config, exceptions: Add helper for handling internal errors
  utils: config_file: Introduce named_value::observe()

(cherry picked from commit 95c0804731)
2019-08-08 13:13:42 +02:00
Amnon Heiman
20090c1992 init: do not allow replace-address for seeds
If a node is a seed node, it can not be started with
replace-address-first-boot or the replace-address flag.

The issue is that, as a seed node, it will generate new tokens instead of
replacing the existing ones the user expects it to replace when supplying
the flags.

This patch will throw a bad_configuration_error exception
in this case.
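The check can be sketched as follows (a minimal sketch; the exception type is simplified to std::runtime_error, whereas the real code throws bad_configuration_error): a seed node must not be started with either replace flag.

```cpp
#include <cassert>
#include <stdexcept>

// Sketch of the startup guard (names assumed): reject the combination
// of seed status and either replace-address flag.
void check_replace_flags(bool is_seed, bool replace_address, bool replace_address_first_boot) {
    if (is_seed && (replace_address || replace_address_first_boot)) {
        throw std::runtime_error("replace-address is not allowed on a seed node");
    }
}
```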

Fixes #3889

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
(cherry picked from commit 399d79fc6f)
2019-08-07 22:04:58 +03:00
Raphael S. Carvalho
8ffb567474 table: do not rely on undefined behavior in cleanup_sstables
It shouldn't rely on argument evaluation order, which is ub.

Fixes #4718.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
(cherry picked from commit 0e732ed1cf)
2019-08-07 21:48:44 +03:00
Tomasz Grabiec
710ec83d12 Merge "Fix the system.size_estimates table" from Kamil
Fixes a segfault when querying for an empty keyspace.

Also, fixes an infinite loop on smp > 1. Queries to
system.size_estimates table which are not single-partition queries
caused Scylla to go into an infinite loop inside
multishard_combining_reader::fill_buffer. This happened because
multishard_combining_reader assumes that shards return rows belonging
to separate partitions, which was not the case for
size_estimates_mutation_reader.

Fixes #4689.

(cherry picked from commit 14700c2ac4)
2019-08-07 21:38:38 +03:00
Asias He
8d7c489436 streaming: Move stream_mutation_fragments_cmd to a new file (#4812)
Avoid including the lengthy stream_session.hh in messaging_service.

More importantly, fix the build because currently messaging_service.cc
and messaging_service.hh do not include stream_mutation_fragments_cmd.
I am not sure why it builds on my machine. Spotted this when backporting
the "streaming: Send error code from the sender to receiver" to 3.0
branch.

Refs: #4789
(cherry picked from commit 49a73aa2fc)
2019-08-07 19:11:51 +02:00
Asias He
6ec558e3a0 streaming: Send error code from the sender to receiver
In case of error on the sender side, the sender does not propagate the
error to the receiver. The sender will close the stream. As a result,
the receiver will get nullopt from the source in
get_next_mutation_fragment and pass mutation_fragment_opt with no value
to the generating_reader. In turn, the generating_reader generates end
of stream. However, the last element that the generating_reader has
generated can be any type of mutation_fragment. This makes the sstable
that consumes the generating_reader violate the mutation_fragment
stream rule.

To fix, we need to propagate the error. However, RPC streaming does not
support propagating the error in the framework. The user has to send an
error code explicitly.

Fixes: #4789
(cherry picked from commit bac987e32a)
(cherry picked from commit 288371ce75)
2019-08-07 19:11:33 +02:00
Tomasz Grabiec
b1e2842c8c sstables: ka/la: reader: Make sure push_ready_fragments() does not miss to emit partition_end
Currently, if there is a fragment in _ready and _out_of_range was set
after the row end was consumed, push_ready_fragments() would return
without emitting partition_end.

This is problematic once we make consume_row_start() emit
partition_start directly, because we will want to assume that all
fragments for the previous partition are emitted by then. If they're
not, then we'd emit partition_start before partition_end for the
previous partition. The fix is to make sure that
push_ready_fragments() emits everything.
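The invariant can be sketched with a heavily simplified drain function (fragment kinds shown as strings; the real code works on mutation_fragment objects): every queued fragment is flushed first, and only then, if the reader ran out of range, the partition is closed, so partition_end is never skipped.

```cpp
#include <cassert>
#include <deque>
#include <string>
#include <vector>

// Simplified sketch: drain everything that is ready before emitting the
// trailing partition_end when the reader is out of range.
std::vector<std::string> push_ready_fragments(std::deque<std::string>& ready, bool out_of_range) {
    std::vector<std::string> out;
    while (!ready.empty()) {          // flush all queued fragments first...
        out.push_back(ready.front());
        ready.pop_front();
    }
    if (out_of_range) {               // ...and only then end the partition
        out.push_back("partition_end");
    }
    return out;
}
```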

Fixes #4786

(cherry picked from commit 9b8ac5ecbc)
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2019-08-01 13:06:08 +03:00
Avi Kivity
5a273737e3 Update seastar submodule
* seastar 82637dcab4...c59d019d6b (1):
  > reactor: fix deadlock of stall detector vs dlopen

Fixes #4759.
2019-07-31 18:31:40 +03:00
Avi Kivity
b0d2312623 toppartitions: fix race between listener removal and reads
Data listener reads are implemented as flat_mutation_readers, which
take a reference to the listener and then execute asynchronously.
The listener can be removed between the time when the reference is
taken and actual execution, resulting in a dangling pointer
dereference.

Fix by using a weak_ptr to avoid writing to a destroyed object. Note that writes
don't need protection because they execute atomically.

Fixes #4661.

Tests: unit (dev)
(cherry picked from commit e03c7003f1)
2019-07-28 13:53:40 +03:00
Avi Kivity
2f007d8e6b sstable: index_reader: close index_reader::reader more robustly
If we had an error while reading, then we would have failed to close
the reader, which in turn can cause memory corruption. Make the
closing more robust by using then_wrapped (that doesn't skip on
exception) and log the error for analysis.

Fixes #4761.

(cherry picked from commit b272db368f)
2019-07-27 18:19:57 +03:00
yaronkaikov
bebfd7b26c release: prepare for 3.1.0.rc3 2019-07-25 12:15:55 +03:00
Tomasz Grabiec
03b48b2caf database: Add missing partition slicing on streaming reader recreation
streaming_reader_lifecycle_policy::create_reader() was ignoring the
partition_slice passed to it and always creating the reader for the
full slice.

That's wrong because create_reader() is called when recreating a
reader after it's evicted. If the reader stopped in the middle of
partition we need to start from that point. Otherwise, fragments in
the mutation stream will appear duplicated or out of order, violating
assumptions of the consumers.

This was observed to result in repair writing incorrect sstables with
duplicated clustering rows, which results in
malformed_sstable_exception on read from those sstables.

Fixes #4659.

In v2:

  - Added an overload without partition_slice to avoid changing existing users which never slice

Tests:

  - unit (dev)
  - manual (3 node ccm + repair)

Backport: 3.1
Reviewed-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <1563451506-8871-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 7604980d63)
2019-07-22 15:08:09 +03:00
Avi Kivity
95362624bc Merge "Fix disable_sstable_write synchronization with on_compaction_completion" from Benny
"
disable_sstable_write needs to acquire _sstable_deletion_sem to properly synchronize
with background deletions done by on_compaction_completion to ensure no sstables will
be created or deleted during reshuffle_sstables after
storage_service::load_new_sstables disables sstable writes.

Fixes #4622

Test: unit(dev), nodetool_additional_test.py migration_test.py
"

* 'scylla-4622-fix-disable-sstable-write' of https://github.com/bhalevy/scylla:
  table: document _sstables_lock/_sstable_deletion_sem locking order
  table: disable_sstable_write: acquire _sstable_deletion_sem
  table: uninline enable_sstable_write
  table: reshuffle_sstables: add log message

(cherry picked from commit 43690ecbdf)
2019-07-22 13:47:25 +03:00
Asias He
7865c314a5 repair: Avoid deadlock in remove_repair_meta
Start n1, n2
Create ks with rf = 2
Run repair on n2
Stop n2 in the middle of repair
n1 will notice n2 is DOWN, gossip handler will remove repair instance
with n2 which calls remove_repair_meta().

Inside remove_repair_meta(), we have:

```
1        return parallel_for_each(*repair_metas, [repair_metas] (auto& rm) {
2            return rm->stop();
3        }).then([repair_metas, from] {
4            rlogger.debug("Removed all repair_meta for single node {}", from);
5        });
```

Since 3.1, we start 16 repair instances in parallel which will create 16
readers. The reader semaphore is 10.

At line 2, it calls

```
6    future<> stop() {
7       auto gate_future = _gate.close();
8       auto writer_future = _repair_writer.wait_for_writer_done();
9       return when_all_succeed(std::move(gate_future), std::move(writer_future));
10    }
```

The gate protects the reader to read data from disk:

```
11 with_gate(_gate, [] {
12   read_rows_from_disk
13        return _repair_reader.read_mutation_fragment() --> calls reader() to read data
14 })
```

So line 7 won't return until all the 16 readers return from the call of
reader().

The problem is, the reader won't release the reader semaphore until the
reader is destroyed!
So, even if 10 out of the 16 readers have finished reading, they won't
release the semaphore. As a result, the stop() hangs forever.

To fix in the short term, we can delete the reader, i.e., drop the
repair_meta object once it is stopped.

Refs: #4693
(cherry picked from commit 8774adb9d0)
2019-07-21 13:31:08 +03:00
Asias He
0e6b62244c streaming: Do not open rpc stream connection if ranges are not relevant to a shard
Given a list of ranges to stream, stream_transfer_task will create a
reader with the ranges and create an rpc stream connection on all the shards.

When the user provides ranges to repair with the -st/-et options, e.g.,
using scylla-manager, such ranges can belong to only one shard; repair
will pass such ranges to streaming.

As a result, only one shard will have data to send while the rpc stream
connections are created on all the shards, which can cause the kernel
to run out of ports on some systems.

To mitigate the problem, do not open the connection if the ranges do not
belong to the shard at all.

Refs: #4708
(cherry picked from commit 64a4c0ede2)
2019-07-21 10:23:49 +03:00
Kamil Braun
9d722a56b3 Fix timestamp_type_impl::timestamp_from_string.
Now it accepts the 'z' or 'Z' timezone, denoting UTC+00:00.
Fixes #4641.
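The accepted normalization can be sketched as follows (the helper name is illustrative, not the actual parser code): a trailing 'z' or 'Z' designator is rewritten to the "+0000" offset form before further parsing.

```cpp
#include <cassert>
#include <string>

// Illustrative helper (name assumed): map a trailing 'z'/'Z' timezone
// designator to the equivalent "+0000" offset.
std::string normalize_utc_suffix(std::string ts) {
    if (!ts.empty() && (ts.back() == 'z' || ts.back() == 'Z')) {
        ts.pop_back();
        ts += "+0000";
    }
    return ts;
}
```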

Signed-off-by: Kamil Braun <kbraun@scylladb.com>
(cherry picked from commit 4417e78125)
2019-07-17 21:54:47 +03:00
Eliran Sinvani
7009d5fb23 auth: Prevent race between role_manager and password_authenticator
When scylla is started for the first time with PasswordAuthenticator
enabled, it can happen that a record of the default superuser
is created in the table with the can_login and is_superuser
columns set to null. This happens because the module in charge of
creating the row is the role manager, while the module in charge of
setting the default password salted hash value is the password
authenticator. Those two modules are started together; in the case
where the password authenticator finishes its initialization first,
the row contains those null columns until the role manager completes
its initialization, and any login attempt in this period
will cause a memory access violation, since those columns are never
expected to be null. This patch removes the race by starting
the password authenticator and authorizer only after the role manager
has finished its initialization.

Tests:
  1. Unit tests (release)
  2. Auth and cqlsh auth related dtests.

Fixes #4226

Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>
Message-Id: <20190714124839.8392-1-eliransin@scylladb.com>
(cherry picked from commit 997a146c7f)
2019-07-15 21:18:05 +03:00
Takuya ASADA
eb49fae020 reloc: provide libthread_db.so.1 to debug thread on gdb
In scylla-debuginfo package, we have /usr/lib/debug/opt/scylladb/libreloc/libthread_db-1.0.so-666.development-0.20190711.73a1978fb.el7.x86_64.debug
but we actually do not have libthread_db.so.1 in /opt/scylladb/libreloc
since it does not appear in the ldd result for the scylla binary.

To debug threads, we need to add the library to the relocatable package manually.

Fixes #4673

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20190711111058.7454-1-syuu@scylladb.com>
(cherry picked from commit 842f75d066)
2019-07-15 14:50:56 +03:00
Asias He
92bf928170 repair: Allow repair when a replica is down
Since commit bb56653 (repair: Sync schema from follower nodes before
repair), the behaviour of handling down node during repair has been
changed.  That is, if a repair follower is down, it will fail to sync
schema with it and the repair of the range will be skipped. This means
a range can not be repaired unless all the nodes for the replicas are up.

To fix, we filter out the nodes that are down, mark the repair as
partial, and repair with the nodes that are still up.

Tests: repair_additional_test:RepairAdditionalTest.repair_with_down_nodes_2b_test
Fixes: #4616
Backports: 3.1

Message-Id: <621572af40335cf5ad222c149345281e669f7116.1562568434.git.asias@scylladb.com>
(cherry picked from commit 39ca044dab)
2019-07-11 11:44:49 +03:00
Rafael Ávila de Espíndola
deac0b0e94 mc writer: Fix exception safety when closing _index_writer
This fixes a possible cause of #4614.

From the backtrace in that issue, it looks like a file is being closed
twice. The first point in the backtrace where that seems likely is in
the MC writer.

My first idea was to add a writer::close and make it the responsibility
of the code using the writer to call it. That way we would move work
out of the destructor.

That is a bit hard since the writer is destroyed from
flat_mutation_reader::impl::~consumer_adapter and that would need to
get a close function too.

This patch instead just fixes an exception safety issue. If
_index_writer->close() throws, _index_writer is still valid and
~writer will try to close it again.

If the exception was thrown after _completed.set_value(), that would
explain the assert about _completed.set_value() being called twice.

With this patch the path outside of the destructor now moves the
writer to a local variable before trying to close it.

Fixes #4614
Message-Id: <20190710171747.27337-1-espindola@scylladb.com>

(cherry picked from commit 281f3a69f8)
2019-07-11 11:42:48 +03:00
kbr-
c294000113 Implement tuple_type_impl::to_string_impl. (#4645)
Resolves #4633.

Signed-off-by: Kamil Braun <kbraun@scylladb.com>
(cherry picked from commit 8995945052)
2019-07-08 11:08:10 +03:00
Avi Kivity
18bb2045aa Update seastar submodule
* seastar 4cdccae53b...82637dcab4 (1):
  > perftune.py: fix the i3 metal detection pattern

Ref #4057.
2019-07-02 13:49:21 +03:00
Avi Kivity
5e3276d08f Update seastar submodule to point to scylla-seastar.git
This allows us to add 3.1 specific patches to Seastar.
2019-07-02 13:47:50 +03:00
Piotr Sarna
acff367ea8 main: stop view builder conditionally
The view builder is started only if it's enabled in config,
via the view_building=true variable. Unfortunately, stopping
the builder was unconditional, which may result in failed
assertions during shutdown. To remedy this, view building
is stopped only if it was previously started.

Fixes #4589

(cherry picked from commit efa7951ea5)
2019-06-26 10:45:50 +03:00
Tomasz Grabiec
e39724a343 Merge "Sync schema before repair" from Asias
This series makes sure the new schema is propagated to the repair master
and follower nodes before repair.

Fixes #4575

* dev.git asias/repair_pull_schema_v2:
  migration_manager: Add sync_schema
  repair: Sync schema from follower nodes before repair

(cherry picked from commit 269e65a8db)
2019-06-26 09:35:42 +02:00
Asias He
31c4db83d8 repair: Avoid searching all the rows in to_repair_rows_on_wire
The repair_rows in row_list are sorted, so the current repair_row can
only share its partition key with the last repair_row inserted into
repair_rows_on_wire. There is therefore no need to search from the
beginning of repair_rows_on_wire, which would make the complexity
quadratic. To fix, look only at the last item in repair_rows_on_wire.

Fixes #4580
Message-Id: <08a8bfe90d1a6cf16b67c210151245879418c042.1561001271.git.asias@scylladb.com>

(cherry picked from commit b99c75429a)
2019-06-25 12:48:37 +02:00
Tomasz Grabiec
433cb93f7a Merge "Use same schema version for repair nodes" from Asias
This patch set fixes repair nodes using different schema versions, and
optimizes the hashing thanks to the fact that all nodes now use the
same schema version.

Fixes: #4549

* seastar-dev.git asias/repair_use_same_schema.v3:
  repair: Use the same schema version for repair master and followers
  repair: Hash column kind and id instead of column name and type name

(cherry picked from commit cd1ff1fe02)
2019-06-23 20:57:16 +03:00
Avi Kivity
f553819919 Merge "Fix infinite paging for indexed queries" from Piotr
"
Fixes #4569

This series fixes the infinite paging for indexed queries issue.
Before this fix, paging indexes tended to end up in an infinite loop
of returning pages with 0 results, but has_more_pages flag set to true,
which confused the drivers.

Tests: unit(dev)
Branches: 3.0, 3.1
"

* 'fix_infinite_paging_for_indexed_queries' of https://github.com/psarna/scylla:
  tests: add test case for finishing index paging
  cql3: fix infinite paging for indexed queries

(cherry picked from commit 9229afe64f)
2019-06-23 20:54:27 +03:00
Nadav Har'El
48c34e7635 storage_proxy: fix race and crash in case of MV and other node shutdown
Recently, in merge commit 2718c90448,
we added the ability to cancel pending view-update requests when we detect
that the target node went down. This is important for view updates because
these have a very long timeout (5 minutes), and we wanted to make this
timeout even longer.

However, the implementation caused a race: Between *creating* the update's
request handler (create_write_response_handler()) and actually starting
the request with this handler (mutate_begin()), there is a preemption point
and we may end up deleting the request handler before starting the request.
So mutate_begin() must gracefully handle the case of a missing request
handler, and not crash with a segmentation fault as it did before this patch.

Eventually the lifetime management of request handlers could be refactored
to avoid this delicate fix (which requires more comments to explain than
code), or even better, it would be more correct to cancel individual writes
when a node goes down, not drop the entire handler (see issue #4523).
However, for now, let's not do such invasive changes and just fix the
bug that we set out to fix.

Fixes #4386.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190620123949.22123-1-nyh@scylladb.com>
(cherry picked from commit 6e87bca65d)
2019-06-23 20:53:01 +03:00
Nadav Har'El
7f85b30941 Fix deciding whether a query uses indexing
The code that decides whether a query should use indexing was buggy: a
partition key index might have influenced the decision even if the whole
partition key was passed in the query (which effectively means that
using the index is not necessary).

Fixes #4539

Closes https://github.com/scylladb/scylla/pull/4544

Merged from branch 'fix_deciding_whether_a_query_uses_indexing' of git://github.com/psarna/scylla
  tests: add case for partition key index and filtering
  cql3: fix deciding if a query uses indexing

(cherry picked from commit 6aab1a61be)
2019-06-18 13:25:18 +03:00
Hagit Segev
7d14514b8a release: prepare for 3.1.0.rc2 2019-06-16 20:28:31 +03:00
Piotr Sarna
35f906f06f tests: add a test case for filtering clustering key
The test case makes sure that clustering key restriction
columns are fetched for filtering if they form a clustering key prefix,
but not a primary key prefix (partition key columns are missing).

Ref #4541
Message-Id: <3612dc1c6c22c59ac9184220a2e7f24e8d18407c.1560410018.git.sarna@scylladb.com>

(cherry picked from commit 2c2122e057)
2019-06-16 14:36:52 +03:00
Piotr Sarna
2c50a484f5 cql3: fix qualifying clustering key restrictions for filtering
Clustering key restrictions can sometimes avoid filtering if they form
a prefix, but that can happen only if the whole partition key is
restricted as well.

Ref #4541
Message-Id: <9656396ee831e29c2b8d3ad4ef90c4a16ab71f4b.1560410018.git.sarna@scylladb.com>

(cherry picked from commit c4b935780b)
2019-06-16 14:36:52 +03:00
Piotr Sarna
24ddb46707 cql3: fix fetching clustering key columns for filtering
When a column is not present in the select clause, but used for
filtering, it usually needs to be fetched from replicas.
Sometimes it can be avoided, e.g. if primary key columns form a valid
prefix - then, they will be optimized out before filtering itself.
However, a clustering key prefix can only qualify for this
optimization if the whole partition key is restricted - otherwise
the clustering columns still need to be present for filtering.

This commit also fixes tests in cql_query_test suite, because they now
expect more values - columns fetched for filtering will be present as
well (only internally, the clients receive only data they asked for).

Fixes #4541
Message-Id: <f08ebae5562d570ece2bb7ee6c84e647345dfe48.1560410018.git.sarna@scylladb.com>

(cherry picked from commit adeea0a022)
2019-06-16 14:36:52 +03:00
Dejan Mircevski
f2fc3f32af tests: Add cquery_nofail() utility
Most tests await the result of cql_test_env::execute_cql().  Most
would also benefit from reporting errors with top-level location
included.

Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
(cherry picked from commit a9849ecba7)
2019-06-16 14:36:52 +03:00
Asias He
c9f488ddc2 repair: Avoid writing row with same partition key and clustering key more than once
Consider

   master: row(pk=1, ck=1, col=10)
follower1: row(pk=1, ck=1, col=20)
follower2: row(pk=1, ck=1, col=30)

When repair runs, the master fetches row(pk=1, ck=1, col=20) and
row(pk=1, ck=1, col=30) from follower1 and follower2.

Then the repair master sends row(pk=1, ck=1, col=10) and row(pk=1, ck=1,
col=30) to follower1; follower1 will write the row with the same
pk=1, ck=1 twice, which violates the uniqueness constraint.

To fix, we apply a row with the same pk and ck onto the previous row.
We only need this on the repair follower, because there the rows can
come from multiple nodes. On the repair master we have an sstable writer
per follower, so the rows fed into each sstable writer come from only a
single node.

Tests: repair_additional_test.py:RepairAdditionalTest.repair_same_row_diff_value_3nodes_test
Fixes: #4510
Message-Id: <cb4fbba1e10fb0018116ffe5649c0870cda34575.1560405722.git.asias@scylladb.com>
(cherry picked from commit 9079790f85)
2019-06-16 10:23:58 +03:00
Asias He
46498e77b8 repair: Allow repair_row to initialize partially
On the repair follower node, only the decorated_key_with_hash and the
mutation_fragment inside repair_row are used in apply_rows() to apply
the rows to disk. Allow repair_row to be initialized partially, and, to
be safe, throw if an uninitialized member is accessed.
Message-Id: <b4e5cc050c11b1bafcf997076a3e32f20d059045.1560405722.git.asias@scylladb.com>

(cherry picked from commit 912ce53fc5)
2019-06-16 10:23:50 +03:00
Piotr Jastrzebski
440f33709e sstables: distinguish empty and missing cellpath
Before this patch mc sstables writer was ignoring
empty cellpaths. This is a wrong behaviour because
it is possible to have empty key in a map. In such case,
our writer creats a wrong sstable that we can't read back.
This is becaus a complex cell expects cellpath for each
simple cell it has. When writer ignores empty cellpath
it writes nothing and instead it should write a length
of zero to the file so that we know there's an empty cellpath.

Fixes #4533

Tests: unit(release)

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Message-Id: <46242906c691a56a915ca5994b36baf87ee633b7.1560532790.git.piotr@scylladb.com>
(cherry picked from commit a41c9763a9)
2019-06-16 09:04:24 +03:00
Pekka Enberg
34696e1582 dist/docker: Switch to 3.1 release repository 2019-06-14 08:10:02 +03:00
Takuya ASADA
43bb290705 dist/docker/redhat: change user of scylla services to 'scylla'
On branch-3.1 / master, we are getting the following errors:

ERROR 2019-06-11 10:58:49,156 [shard 0] database - /var/lib/scylla/data: File not owned by current euid: 0. Owner is: 999
ERROR 2019-06-11 10:58:49,156 [shard 0] init - Failed owner and mode verification: std::runtime_error (File not owned by current euid: 0. Owner is: 999)
ERROR 2019-06-11 10:58:49,156 [shard 0] database - /var/lib/scylla/hints: File not owned by current euid: 0. Owner is: 999
ERROR 2019-06-11 10:58:49,156 [shard 0] init - Failed owner and mode verification: std::runtime_error (File not owned by current euid: 0. Owner is: 999)
ERROR 2019-06-11 10:58:49,156 [shard 0] database - /var/lib/scylla/commitlog: File not owned by current euid: 0. Owner is: 999
ERROR 2019-06-11 10:58:49,156 [shard 0] init - Failed owner and mode verification: std::runtime_error (File not owned by current euid: 0. Owner is: 999)
ERROR 2019-06-11 10:58:49,156 [shard 0] database - /var/lib/scylla/view_hints: File not owned by current euid: 0. Owner is: 999
ERROR 2019-06-11 10:58:49,156 [shard 0] init - Failed owner and mode verification: std::runtime_error (File not owned by current euid: 0. Owner is: 999)

It seems that owner verification of the data directory fails because
the scylla-server process runs as root while the data directory is
owned by scylla, so we should run the services as the scylla user.

Fixes #4536
Message-Id: <20190611113142.23599-1-syuu@scylladb.com>

(cherry picked from commit b1226fb15a)
2019-06-14 08:02:45 +03:00
Calle Wilund
53980816de api.hh: Fix bool parsing in req_param
Fixes #4525

req_param uses boost::lexical_cast to convert text to the target type.
However, lexical_cast does not handle textual booleans,
so param=true causes not only wrong values but exceptions.
Message-Id: <20190610140511.15478-1-calle@scylladb.com>
(cherry picked from commit 26702612f3)
2019-06-13 11:56:27 +03:00
Vlad Zolotarov
c1f4617530 fix_system_distributed_tables.py: declare the 'port' argument as 'int'
If a port value is passed as a string, cluster.connect() fails
with Python 3.4.

Let's fix this by explicitly declaring the 'port' argument as 'int'.

Fixes #4527

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <20190606133321.28225-1-vladz@scylladb.com>
(cherry picked from commit 20a610f6bc)
2019-06-13 11:45:54 +03:00
Raphael S. Carvalho
efde9416ed sstables: fix log of failure on large data entry deletion by fixing use-after-move
Fixes #4532.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20190527200828.25339-1-raphaelsc@scylladb.com>
(cherry picked from commit 62aa0ea3fa)
2019-06-13 11:44:22 +03:00
Hagit Segev
224f9cee7e release: prepare for 3.1.0.rc1 2019-06-06 18:16:06 +03:00
Hagit Segev
cd1d13f805 release: prepare for 3.1.rc1 2019-06-06 15:32:54 +03:00
Pekka Enberg
899291bc9b relocate_python_scripts.py: Fix node-exporter install on Debian variants
The relocatable Python is built from Fedora packages. Unfortunately TLS
certificates are in a different location on Debian variants, which
causes "node_exporter_install" to fail as follows:

  Traceback (most recent call last):
    File "/usr/lib/scylla/libexec/node_exporter_install", line 58, in <module>
      data = curl('https://github.com/prometheus/node_exporter/releases/download/v{version}/node_exporter-{version}.linux-amd64.tar.gz'.format(version=VERSION), byte=True)
    File "/usr/lib/scylla/scylla_util.py", line 40, in curl
      with urllib.request.urlopen(req) as res:
    File "/opt/scylladb/python3/lib64/python3.7/urllib/request.py", line 222, in urlopen
      return opener.open(url, data, timeout)
    File "/opt/scylladb/python3/lib64/python3.7/urllib/request.py", line 525, in open
      response = self._open(req, data)
    File "/opt/scylladb/python3/lib64/python3.7/urllib/request.py", line 543, in _open
      '_open', req)
    File "/opt/scylladb/python3/lib64/python3.7/urllib/request.py", line 503, in _call_chain
      result = func(*args)
    File "/opt/scylladb/python3/lib64/python3.7/urllib/request.py", line 1360, in https_open
      context=self._context, check_hostname=self._check_hostname)
    File "/opt/scylladb/python3/lib64/python3.7/urllib/request.py", line 1319, in do_open
      raise URLError(err)
  urllib.error.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1056)>
  Unable to retrieve version information
  node exporter setup failed.

Fix the problem by overriding the SSL_CERT_FILE environment variable to
point to the correct location of the TLS bundle.

Message-Id: <20190604175434.24534-1-penberg@scylladb.com>
(cherry picked from commit eb00095bca)
2019-06-05 22:20:06 +03:00
Paweł Dziepak
4130973f51 tests/perf_fast_forward: report average number of aio operations
perf_fast_forward is used to detect performance regressions. The two
main metrics used for this are fargments per second and the number of
the IO operations. The former is a median of a several runs, but the
latter is just the actual number of asynchronous IO operations performed
in the run that happened to be picked as a median frag/s-wise. There's
no always a direct correlation between frag/s and aio and the latter can
vary which makes the latter hard to compare.

In order to make this easier a new metric was introduced: "average aio"
which reports the average number of asynchronous IO operations performed
in a run. This should produce much more stable results and therefore
make the comparison more meaningful.
Message-Id: <20190430134401.19238-1-pdziepak@scylladb.com>

(cherry picked from commit 51e98e0e11)
2019-06-05 16:36:09 +03:00
Takuya ASADA
24e2c72888 dist/debian: support relocatable python3 on Debian variants
Unlike CentOS, Debian variants have a python3 package in the official
repository, so we don't have to use relocatable python3 on these
distributions. However, the official python3 version differs between
distributions, which may cause issues. Also, our scripts and packaging
implementation increasingly presuppose the existence of relocatable
python3, which causes issues on Debian variants.

Switching to relocatable python3 on Debian variants avoids these issues
and makes it easier to manage Scylla python3 environments across
multiple distributions.

Fixes #4495

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20190531112707.20082-1-syuu@scylladb.com>
(cherry picked from commit 25112408a7)
2019-06-03 17:42:26 +03:00
Raphael S. Carvalho
69cc7d89c8 compaction: do not unconditionally delete a new sstable in interrupted compaction
After incremental compaction, new sstables may have already replaced old
sstables at any point, meaning that a new sstable may be in use by the
table and an old sstable already deleted while the compaction itself is
UNFINISHED. Therefore, we should *NEVER* unconditionally delete a new
sstable for an interrupted compaction, or data loss could happen.
To fix it, we'll only delete new sstables that didn't replace anything
in the table, meaning they are unused.

Found the problem while auditing the code.

Fixes #4479.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20190506134723.16639-1-raphaelsc@scylladb.com>
(cherry picked from commit ef5681486f)
2019-06-03 16:00:20 +03:00
Avi Kivity
5f6c5d566a Revert "dist/debian: support relocatable python3 on Debian variants"
This reverts commit 1fbab82553. Breaks build_deb.sh:

18:39:56 +	seastar/scripts/perftune.py seastar/scripts/seastar-addr2line seastar/scripts/perftune.py
18:39:56 Traceback (most recent call last):
18:39:56   File "./relocate_python_scripts.py", line 116, in <module>
18:39:56     fixup_scripts(archive, args.scripts)
18:39:56   File "./relocate_python_scripts.py", line 104, in fixup_scripts
18:39:56     fixup_script(output, script)
18:39:56   File "./relocate_python_scripts.py", line 79, in fixup_script
18:39:56     orig_stat = os.stat(script)
18:39:56 FileNotFoundError: [Errno 2] No such file or directory: '/data/jenkins/workspace/scylla-master/unified-deb/scylla/build/debian/scylla-package/+'
18:39:56 make[1]: *** [debian/rules:19: override_dh_auto_install] Error 1
2019-05-29 14:00:29 +03:00
Takuya ASADA
f32aea3834 reloc/python3: add license files on relocatable python3 package
It's better to have license files on our python3 distribution.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20190516094329.13273-1-syuu@scylladb.com>
(cherry picked from commit 4b08a3f906)
2019-05-29 13:59:38 +03:00
Takuya ASADA
933260cb53 dist/ami: output scylla version information to AMI tags and description
Users may want to know which versions of the packages are used for the
AMI; it's good to have them in the AMI tags and description.

To do this, we need to download the .rpm from the specified .repo and
extract the version information from it.

Fixes #4499

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20190520123924.14060-2-syuu@scylladb.com>
(cherry picked from commit a55330a10b)
2019-05-29 13:59:38 +03:00
Takuya ASADA
f8ff0e1993 dist/ami: build scylla-python3 when specified --localrpm
Since we switched to relocatable python3, we need to build it for AMI too.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20190520123924.14060-1-syuu@scylladb.com>
(cherry picked from commit abe44c28c5)
2019-05-29 13:59:38 +03:00
Takuya ASADA
1fbab82553 dist/debian: support relocatable python3 on Debian variants
Unlike CentOS, Debian variants have a python3 package in the official
repository, so we don't have to use relocatable python3 on these
distributions. However, the official python3 version differs between
distributions, which may cause issues. Also, our scripts and packaging
implementation increasingly presuppose the existence of relocatable
python3, which causes issues on Debian variants.

Switching to relocatable python3 on Debian variants avoids these issues
and makes it easier to manage Scylla python3 environments across
multiple distributions.

Fixes #4495

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20190526105138.677-1-syuu@scylladb.com>
(cherry picked from commit 4d119cbd6d)
2019-05-26 15:40:56 +03:00
Paweł Dziepak
c664615960 Merge "Fix empty counters handling in MC" from Piotr
"
Before this patchset, empty counters were incorrectly persisted in the
MC format: no value was written to disk for them. The correct way
is to still write a header that indicates the counter is empty.

We also need to make sure that reading wrongly persisted empty
counters works, because customers may have sstables with such
counters.

Fixes #4363
"

* 'haaawk/4363/v3' of github.com:scylladb/seastar-dev:
  sstables: add test for empty counters
  docs: add CorrectEmptyCounters to sstable-scylla-format
  sstables: Add a feature for empty counters in Scylla.db.
  sstables: Write header for empty counters
  sstables: Remove unused variables in make_counter_cell
  sstables: Handle empty counter value in read path

(cherry picked from commit 899ebe483a)
2019-05-23 22:15:00 +03:00
Benny Halevy
6a682dc5a2 cql3: select_statement: provide default initializer for parameters::_bypass_cache
Fixes #4503

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20190521143300.22753-1-bhalevy@scylladb.com>
(cherry picked from commit fae4ca756c)
2019-05-23 08:28:01 +03:00
Gleb Natapov
c1271d08d3 cache_hitrate_calculator: make cache hitrate calculation preemptable
The calculation is done in a non-preemptable loop over all tables, so
if the number of tables is very large it may take a while, since we
also build a string for gossiper state. Make the loop preemptable, and
also make the string calculation more efficient by preallocating memory
for it.
Message-Id: <20190516132748.6469-3-gleb@scylladb.com>

(cherry picked from commit 31bf4cfb5e)
2019-05-17 12:38:34 +02:00
Gleb Natapov
0d5c2501b3 cache_hitrate_calculator: do not copy stats map for each cpu
invoke_on_all() copies the provided function for each shard it is
executed on, so by moving the stats map into the capture we copy it for
each shard too. Avoid this by putting it into the top-level object,
which is already captured by reference.
Message-Id: <20190516132748.6469-2-gleb@scylladb.com>

(cherry picked from commit 4517c56a57)
2019-05-17 12:38:30 +02:00
Asias He
0dd84898ee repair: Fix use after free in remove_repair_meta for repair_metas
We should capture repair_metas so that it will not be freed until the
parallel_for_each is finished.

Fixes: #4333
Tests: repair_additional_test.py:RepairAdditionalTest.repair_kill_1_test
Message-Id: <237b20a359122a639330f9f78c67568410aef014.1557922403.git.asias@scylladb.com>
(cherry picked from commit 51c4f8cc47)
2019-05-16 11:12:09 +03:00
Avi Kivity
d568270d7f Merge "gc_clock: Fix hashing to be backwards-compatible" from Tomasz
"
Commit d0f9e00 changed the representation of the gc_clock::duration
from int32_t to int64_t.

Mutation hashing uses appending_hash<gc_clock::time_point>, which by
default feeds duration::count() into the hasher. duration::rep changed
from int32_t to int64_t, which changes the value of the hash.

This affects schema digest and query digests, resulting in mismatches
between nodes during a rolling upgrade.

Fixes #4460.
Refs #4485.
"

* tag 'fix-gc_clock-digest-v2.1' of github.com:tgrabiec/scylla:
  tests: Add test which verifies that schema digest stays the same
  tests: Add sstables for the schema digest test
  schema_tables, storage_service: Make schema digest insensitive to expired tombstones in empty partition
  db/schema_tables: Move feed_hash_for_schema_digest() to .cc file
  hashing: Introduce type-erased interface for the hasher
  hashing: Introduce C++ concept for the hasher
  hashers: Rename hasher to cryptopp_hasher
  gc_clock: Fix hashing to be backwards-compatible

(cherry picked from commit 82b91c1511)
2019-05-15 09:48:05 +03:00
Takuya ASADA
78c57f18c4 dist/ami: fix wrong path of SCYLLA-PRODUCT-FILE
Since the other build_*.sh scripts run inside an extracted relocatable
package, they have SCYLLA-PRODUCT-FILE at the top of the directory,
but build_ami.sh does not run in such conditions; we need to run
SCYLLA-VERSION-GEN first, then refer to build/SCYLLA-PRODUCT-FILE.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20190509110621.27468-1-syuu@scylladb.com>
(cherry picked from commit 19a973cd05)
2019-05-13 16:45:25 +03:00
Glauber Costa
ce27949797 Support AWS i3en instances
AWS just released their new instances, the i3en instances. The instance
is already verified to work well with Scylla; the only adjustments we
need are to advertise that we support it and to pre-fill the disk
information according to the performance numbers obtained by running
the instance.

Fixes #4486
Branches: 3.1

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20190508170831.6003-1-glauber@scylladb.com>
(cherry picked from commit a23531ebd5)
2019-05-13 16:45:25 +03:00
Hagit Segev
6b47e23d29 release: prepare for 3.1.0.rc0 2019-05-13 15:03:34 +03:00
Piotr Sarna
1cb6cc0ac4 Revert "view: cache is_index for view pointer"
This reverts commit dbe8491655.
Caching the value was not done in a correct manner, which resulted
in longevity tests failures.

Fixes #4478

Branches: 3.1

Message-Id: <762ca9db618ca2ed7702372fbafe8ecd193dcf4d.1557129652.git.sarna@scylladb.com>
(cherry picked from commit cf8d2a5141)
2019-05-08 11:14:11 +03:00
Benny Halevy
67435eff15 time_window_backlog_tracker: fix use after free
Fixes #4465

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20190430094209.13958-1-bhalevy@scylladb.com>
(cherry picked from commit 3a2fa82d6e)
2019-05-06 09:38:08 +03:00
Gleb Natapov
086ce13fb9 batchlog_manager: fix array out of bound access
endpoint_filter() function assumes that each bucket of
std::unordered_multimap contains elements with the same key only, so
its size can be used to know how many elements with a particular key
are there.  But this is not the case, elements with multiple keys may
share a bucket. Fix it by counting keys in other way.

Fixes #3229

Message-Id: <20190501133127.GE21208@scylladb.com>
(cherry picked from commit 95c6d19f6c)
2019-05-03 11:59:09 +03:00
Glauber Costa
eb9a8f4442 scylla_setup: respect user's decision not to call housekeeping
The setup script asks the user whether or not housekeeping should
be called, and the first time the script is executed this decision
is respected.

However, if the script is invoked again, that decision is not respected.

This is because the check has the form:

 if (housekeeping_cfg_file_exists) {
    version_check = ask_user();
 }
 if (version_check) { do_version_check() } else { dont_do_it() }

When it should have the form:

 if (housekeeping_cfg_file_exists) {
    version_check = ask_user();
    if (version_check) { do_version_check() } else { dont_do_it() }
 }

(Thanks python)

This is problematic on systems that are not connected to the internet,
since housekeeping will fail to run and crash the setup script.

Fixes #4462

Branches: master, branch-3.1
Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20190502034211.18435-1-glauber@scylladb.com>
(cherry picked from commit 47d04e49e8)
2019-05-03 09:57:31 +03:00
Glauber Costa
178fb5fe5f make scylla_util OS detection robust against empty lines
Newer versions of RHEL ship the os-release file with blank lines at the
end, which our script was not prepared to handle. As such, scylla_setup
would fail.

This patch makes our OS detection robust against that.

Fixes #4473

Branches: master, branch-3.1
Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20190502152224.31307-1-glauber@scylladb.com>
(cherry picked from commit 99c00547ad)
2019-05-03 09:57:21 +03:00
355 changed files with 5055 additions and 1267 deletions

.gitmodules

@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui


@@ -1,7 +1,7 @@
 #!/bin/sh
 PRODUCT=scylla
-VERSION=666.development
+VERSION=3.1.3
 if test -f version
 then


@@ -22,6 +22,7 @@
 #pragma once
 #include <seastar/json/json_elements.hh>
 #include <type_traits>
+#include <boost/lexical_cast.hpp>
 #include <boost/algorithm/string/split.hpp>
 #include <boost/algorithm/string/classification.hpp>
@@ -231,7 +232,22 @@ public:
             return;
         }
         try {
-            value = T{boost::lexical_cast<Base>(param)};
+            // boost::lexical_cast does not use boolalpha. Converting a
+            // true/false throws exceptions. We don't want that.
+            if constexpr (std::is_same_v<Base, bool>) {
+                // Cannot use boolalpha because we (probably) want to
+                // accept 1 and 0 as well as true and false. And True. And fAlse.
+                std::transform(param.begin(), param.end(), param.begin(), ::tolower);
+                if (param == "true" || param == "1") {
+                    value = T(true);
+                } else if (param == "false" || param == "0") {
+                    value = T(false);
+                } else {
+                    throw boost::bad_lexical_cast{};
+                }
+            } else {
+                value = T{boost::lexical_cast<Base>(param)};
+            }
         } catch (boost::bad_lexical_cast&) {
             throw bad_param_exception(format("{} ({}): type error - should be {}", name, param, boost::units::detail::demangle(typeid(Base).name())));
         }


@@ -170,7 +170,9 @@ future<> service::start() {
     return once_among_shards([this] {
         return create_keyspace_if_missing();
     }).then([this] {
-        return when_all_succeed(_role_manager->start(), _authorizer->start(), _authenticator->start());
+        return _role_manager->start().then([this] {
+            return when_all_succeed(_authorizer->start(), _authenticator->start());
+        });
     }).then([this] {
         _permissions_cache = std::make_unique<permissions_cache>(_permissions_cache_config, *this, log);
     }).then([this] {


@@ -61,6 +61,7 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
     // - _last_row points at a direct predecessor of the next row which is going to be read.
     //   Used for populating continuity.
     // - _population_range_starts_before_all_rows is set accordingly
+    // - _underlying is engaged and fast-forwarded
     reading_from_underlying,
     end_of_stream
@@ -99,7 +100,13 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
     // forward progress is not guaranteed in case iterators are getting constantly invalidated.
     bool _lower_bound_changed = false;
+    // Points to the underlying reader conforming to _schema,
+    // either to *_underlying_holder or _read_context->underlying().underlying().
+    flat_mutation_reader* _underlying = nullptr;
+    std::optional<flat_mutation_reader> _underlying_holder;
     future<> do_fill_buffer(db::timeout_clock::time_point);
+    future<> ensure_underlying(db::timeout_clock::time_point);
     void copy_from_cache_to_buffer();
     future<> process_static_row(db::timeout_clock::time_point);
     void move_to_end();
@@ -186,23 +193,22 @@ future<> cache_flat_mutation_reader::process_static_row(db::timeout_clock::time_
         return make_ready_future<>();
     } else {
         _read_context->cache().on_row_miss();
-        return _read_context->get_next_fragment(timeout).then([this] (mutation_fragment_opt&& sr) {
-            if (sr) {
-                assert(sr->is_static_row());
-                maybe_add_to_cache(sr->as_static_row());
-                push_mutation_fragment(std::move(*sr));
-            }
-            maybe_set_static_row_continuous();
+        return ensure_underlying(timeout).then([this, timeout] {
+            return (*_underlying)(timeout).then([this] (mutation_fragment_opt&& sr) {
+                if (sr) {
+                    assert(sr->is_static_row());
+                    maybe_add_to_cache(sr->as_static_row());
+                    push_mutation_fragment(std::move(*sr));
+                }
+                maybe_set_static_row_continuous();
+            });
         });
     }
 }

 inline
 void cache_flat_mutation_reader::touch_partition() {
-    if (_snp->at_latest_version()) {
-        rows_entry& last_dummy = *_snp->version()->partition().clustered_rows().rbegin();
-        _snp->tracker()->touch(last_dummy);
-    }
+    _snp->touch();
 }
inline
@@ -232,14 +238,36 @@ future<> cache_flat_mutation_reader::fill_buffer(db::timeout_clock::time_point t
});
}
inline
future<> cache_flat_mutation_reader::ensure_underlying(db::timeout_clock::time_point timeout) {
if (_underlying) {
return make_ready_future<>();
}
return _read_context->ensure_underlying(timeout).then([this, timeout] {
flat_mutation_reader& ctx_underlying = _read_context->underlying().underlying();
if (ctx_underlying.schema() != _schema) {
_underlying_holder = make_delegating_reader(ctx_underlying);
_underlying_holder->upgrade_schema(_schema);
_underlying = &*_underlying_holder;
} else {
_underlying = &ctx_underlying;
}
});
}
inline
future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_point timeout) {
if (_state == state::move_to_underlying) {
if (!_underlying) {
return ensure_underlying(timeout).then([this, timeout] {
return do_fill_buffer(timeout);
});
}
_state = state::reading_from_underlying;
_population_range_starts_before_all_rows = _lower_bound.is_before_all_clustered_rows(*_schema);
auto end = _next_row_in_range ? position_in_partition(_next_row.position())
: position_in_partition(_upper_bound);
- return _read_context->fast_forward_to(position_range{_lower_bound, std::move(end)}, timeout).then([this, timeout] {
+ return _underlying->fast_forward_to(position_range{_lower_bound, std::move(end)}, timeout).then([this, timeout] {
return read_from_underlying(timeout);
});
}
@@ -280,7 +308,7 @@ future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_poin
inline
future<> cache_flat_mutation_reader::read_from_underlying(db::timeout_clock::time_point timeout) {
- return consume_mutation_fragments_until(_read_context->underlying().underlying(),
+ return consume_mutation_fragments_until(*_underlying,
[this] { return _state != state::reading_from_underlying || is_buffer_full(); },
[this] (mutation_fragment mf) {
_read_context->cache().on_row_miss();

View File

@@ -596,6 +596,7 @@ scylla_core = (['database.cc',
'db/consistency_level.cc',
'db/system_keyspace.cc',
'db/system_distributed_keyspace.cc',
'db/size_estimates_virtual_reader.cc',
'db/schema_tables.cc',
'db/cql_type_parser.cc',
'db/legacy_schema_migrator.cc',

View File

@@ -130,6 +130,18 @@ query_options::query_options(std::unique_ptr<query_options> qo, ::shared_ptr<ser
}
query_options::query_options(std::unique_ptr<query_options> qo, ::shared_ptr<service::pager::paging_state> paging_state, int32_t page_size)
: query_options(qo->_consistency,
qo->get_timeout_config(),
std::move(qo->_names),
std::move(qo->_values),
std::move(qo->_value_views),
qo->_skip_metadata,
std::move(query_options::specific_options{page_size, paging_state, qo->_options.serial_consistency, qo->_options.timestamp}),
qo->_cql_serialization_format) {
}
query_options::query_options(std::vector<cql3::raw_value> values)
: query_options(
db::consistency_level::ONE, infinite_timeout_config, std::move(values))

View File

@@ -102,7 +102,7 @@ private:
public:
query_options(query_options&&) = default;
- query_options(const query_options&) = delete;
+ explicit query_options(const query_options&) = default;
explicit query_options(db::consistency_level consistency,
const timeout_config& timeouts,
@@ -155,6 +155,7 @@ public:
explicit query_options(db::consistency_level, const timeout_config& timeouts,
std::vector<cql3::raw_value> values, specific_options options = specific_options::DEFAULT);
explicit query_options(std::unique_ptr<query_options>, ::shared_ptr<service::pager::paging_state> paging_state);
explicit query_options(std::unique_ptr<query_options>, ::shared_ptr<service::pager::paging_state> paging_state, int32_t page_size);
const timeout_config& get_timeout_config() const { return _timeout_config; }

View File

@@ -222,11 +222,9 @@ statement_restrictions::statement_restrictions(database& db,
auto& cf = db.find_column_family(schema);
auto& sim = cf.get_index_manager();
const allow_local_index allow_local(!_partition_key_restrictions->has_unrestricted_components(*_schema) && _partition_key_restrictions->is_all_eq());
- bool has_queriable_clustering_column_index = _clustering_columns_restrictions->has_supporting_index(sim, allow_local);
- bool has_queriable_pk_index = _partition_key_restrictions->has_supporting_index(sim, allow_local);
- bool has_queriable_index = has_queriable_clustering_column_index
- || has_queriable_pk_index
- || _nonprimary_key_restrictions->has_supporting_index(sim, allow_local);
+ const bool has_queriable_clustering_column_index = _clustering_columns_restrictions->has_supporting_index(sim, allow_local);
+ const bool has_queriable_pk_index = _partition_key_restrictions->has_supporting_index(sim, allow_local);
+ const bool has_queriable_regular_index = _nonprimary_key_restrictions->has_supporting_index(sim, allow_local);
// At this point, the select statement is fully constructed, but we still have a few things to validate
process_partition_key_restrictions(has_queriable_pk_index, for_view, allow_filtering);
@@ -286,7 +284,7 @@ statement_restrictions::statement_restrictions(database& db,
}
if (!_nonprimary_key_restrictions->empty()) {
- if (has_queriable_index) {
+ if (has_queriable_regular_index) {
_uses_secondary_indexing = true;
} else if (!allow_filtering) {
throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
@@ -392,8 +390,9 @@ std::vector<const column_definition*> statement_restrictions::get_column_defs_fo
}
}
}
- if (_clustering_columns_restrictions->needs_filtering(*_schema)) {
- column_id first_filtering_id = _schema->clustering_key_columns().begin()->id +
+ const bool pk_has_unrestricted_components = _partition_key_restrictions->has_unrestricted_components(*_schema);
+ if (pk_has_unrestricted_components || _clustering_columns_restrictions->needs_filtering(*_schema)) {
+ column_id first_filtering_id = pk_has_unrestricted_components ? 0 : _schema->clustering_key_columns().begin()->id +
_clustering_columns_restrictions->num_prefix_columns_that_need_not_be_filtered();
for (auto&& cdef : _clustering_columns_restrictions->get_column_defs()) {
if (cdef->id >= first_filtering_id && !column_uses_indexing(cdef)) {
@@ -507,10 +506,9 @@ bool statement_restrictions::need_filtering() const {
int number_of_filtering_restrictions = _nonprimary_key_restrictions->size();
// If the whole partition key is restricted, it does not imply filtering
if (_partition_key_restrictions->has_unrestricted_components(*_schema) || !_partition_key_restrictions->is_all_eq()) {
- number_of_filtering_restrictions += _partition_key_restrictions->size();
- if (_clustering_columns_restrictions->has_unrestricted_components(*_schema)) {
- number_of_filtering_restrictions += _clustering_columns_restrictions->size() - _clustering_columns_restrictions->prefix_size();
- }
+ number_of_filtering_restrictions += _partition_key_restrictions->size() + _clustering_columns_restrictions->size();
+ } else if (_clustering_columns_restrictions->has_unrestricted_components(*_schema)) {
+ number_of_filtering_restrictions += _clustering_columns_restrictions->size() - _clustering_columns_restrictions->prefix_size();
}
return number_of_restricted_columns_for_indexing > 1
|| (number_of_restricted_columns_for_indexing == 0 && _partition_key_restrictions->empty() && !_clustering_columns_restrictions->empty())

View File

@@ -407,7 +407,7 @@ public:
}
bool ck_restrictions_need_filtering() const {
- return _clustering_columns_restrictions->needs_filtering(*_schema);
+ return _partition_key_restrictions->has_unrestricted_components(*_schema) || _clustering_columns_restrictions->needs_filtering(*_schema);
}
/**

View File

@@ -83,6 +83,9 @@ void metadata::maybe_set_paging_state(::shared_ptr<const service::pager::paging_
assert(paging_state);
if (paging_state->get_remaining() > 0) {
set_paging_state(std::move(paging_state));
} else {
_flags.remove<flag::HAS_MORE_PAGES>();
_paging_state = nullptr;
}
}
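[editor's note] The hunk above only advertises HAS_MORE_PAGES when the paging state still has rows remaining; otherwise both the flag and the stale state are dropped. A minimal, hedged sketch of that pattern, with hypothetical simplified stand-ins for the real `metadata`/`paging_state` types:

```cpp
#include <cassert>
#include <memory>

// Hypothetical, simplified stand-ins; the real classes carry much more state.
struct paging_state {
    int remaining;
};

struct metadata {
    bool has_more_pages = true;
    std::shared_ptr<const paging_state> state;

    // Only keep the paging state when rows remain; otherwise clear both
    // the flag and the state so clients stop requesting further pages.
    void maybe_set_paging_state(std::shared_ptr<const paging_state> ps) {
        if (ps->remaining > 0) {
            state = std::move(ps);
        } else {
            has_more_pages = false;
            state = nullptr;
        }
    }
};
```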

View File

@@ -142,7 +142,7 @@ shared_ptr<selector::factory>
selectable::with_field_selection::new_selector_factory(database& db, schema_ptr s, std::vector<const column_definition*>& defs) {
auto&& factory = _selected->new_selector_factory(db, s, defs);
auto&& type = factory->new_instance()->get_type();
- auto&& ut = dynamic_pointer_cast<const user_type_impl>(std::move(type));
+ auto&& ut = dynamic_pointer_cast<const user_type_impl>(type->underlying_type());
if (!ut) {
throw exceptions::invalid_request_exception(
format("Invalid field selection: {} of type {} is not a user type",

View File

@@ -166,7 +166,8 @@ alter_type_statement::add_or_alter::add_or_alter(const ut_name& name, bool is_ad
user_type alter_type_statement::add_or_alter::do_add(database& db, user_type to_update) const
{
if (get_idx_of_field(to_update, _field_name)) {
- throw exceptions::invalid_request_exception(format("Cannot add new field {} to type {}: a field of the same name already exists", _field_name->name(), _name.to_string()));
+ throw exceptions::invalid_request_exception(format("Cannot add new field {} to type {}: a field of the same name already exists",
+ _field_name->to_string(), _name.to_string()));
}
std::vector<bytes> new_names(to_update->field_names());
@@ -174,7 +175,7 @@ user_type alter_type_statement::add_or_alter::do_add(database& db, user_type to_
std::vector<data_type> new_types(to_update->field_types());
auto&& add_type = _field_type->prepare(db, keyspace()).get_type();
if (add_type->references_user_type(to_update->_keyspace, to_update->_name)) {
- throw exceptions::invalid_request_exception(format("Cannot add new field {} of type {} to type {} as this would create a circular reference", _field_name->name(), _field_type->to_string(), _name.to_string()));
+ throw exceptions::invalid_request_exception(format("Cannot add new field {} of type {} to type {} as this would create a circular reference", _field_name->to_string(), _field_type->to_string(), _name.to_string()));
}
new_types.push_back(std::move(add_type));
return user_type_impl::get_instance(to_update->_keyspace, to_update->_name, std::move(new_names), std::move(new_types));
@@ -184,13 +185,14 @@ user_type alter_type_statement::add_or_alter::do_alter(database& db, user_type t
{
std::optional<uint32_t> idx = get_idx_of_field(to_update, _field_name);
if (!idx) {
- throw exceptions::invalid_request_exception(format("Unknown field {} in type {}", _field_name->name(), _name.to_string()));
+ throw exceptions::invalid_request_exception(format("Unknown field {} in type {}", _field_name->to_string(), _name.to_string()));
}
auto previous = to_update->field_types()[*idx];
auto new_type = _field_type->prepare(db, keyspace()).get_type();
if (!new_type->is_compatible_with(*previous)) {
- throw exceptions::invalid_request_exception(format("Type {} in incompatible with previous type {} of field {} in user type {}", _field_type->to_string(), previous->as_cql3_type().to_string(), _field_name->name(), _name.to_string()));
+ throw exceptions::invalid_request_exception(format("Type {} in incompatible with previous type {} of field {} in user type {}",
+ _field_type->to_string(), previous->as_cql3_type().to_string(), _field_name->to_string(), _name.to_string()));
}
std::vector<data_type> new_types(to_update->field_types());

View File

@@ -76,7 +76,7 @@ public:
const bool _is_distinct;
const bool _allow_filtering;
const bool _is_json;
- bool _bypass_cache;
+ bool _bypass_cache = false;
public:
parameters();
parameters(orderings_type orderings,

View File

@@ -440,8 +440,8 @@ indexed_table_select_statement::prepare_command_for_base_query(const query_optio
return cmd;
}
- future<shared_ptr<cql_transport::messages::result_message>>
- indexed_table_select_statement::execute_base_query(
+ future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>
+ indexed_table_select_statement::do_execute_base_query(
service::storage_proxy& proxy,
dht::partition_range_vector&& partition_ranges,
service::query_state& state,
@@ -492,22 +492,27 @@ indexed_table_select_statement::execute_base_query(
}).then([&merger]() {
return merger.get();
});
- }).then([this, &proxy, &state, &options, now, cmd, paging_state = std::move(paging_state)] (foreign_ptr<lw_shared_ptr<query::result>> result) mutable {
- return this->process_base_query_results(std::move(result), cmd, proxy, state, options, now, std::move(paging_state));
+ }).then([cmd] (foreign_ptr<lw_shared_ptr<query::result>> result) mutable {
+ return make_ready_future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>(std::move(result), std::move(cmd));
});
}
// Function for fetching the selected columns from a list of clustering rows.
// It is currently used only in our Secondary Index implementation - ordinary
// CQL SELECT statements do not have the syntax to request a list of rows.
// FIXME: The current implementation is very inefficient - it requests each
// row separately (and, incrementally, in parallel). Even multiple rows from a single
// partition are requested separately. This last case can be easily improved,
// but to implement the general case (multiple rows from multiple partitions)
// efficiently, we will need more support from other layers.
// Keys are ordered in token order (see #3423)
future<shared_ptr<cql_transport::messages::result_message>>
indexed_table_select_statement::execute_base_query(
service::storage_proxy& proxy,
dht::partition_range_vector&& partition_ranges,
service::query_state& state,
const query_options& options,
gc_clock::time_point now,
::shared_ptr<const service::pager::paging_state> paging_state) {
return do_execute_base_query(proxy, std::move(partition_ranges), state, options, now, paging_state).then(
[this, &proxy, &state, &options, now, paging_state = std::move(paging_state)] (foreign_ptr<lw_shared_ptr<query::result>> result, lw_shared_ptr<query::read_command> cmd) {
return process_base_query_results(std::move(result), std::move(cmd), proxy, state, options, now, std::move(paging_state));
});
}
future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>
indexed_table_select_statement::do_execute_base_query(
service::storage_proxy& proxy,
std::vector<primary_key>&& primary_keys,
service::query_state& state,
@@ -562,9 +567,23 @@ indexed_table_select_statement::execute_base_query(
});
}).then([&merger] () {
return merger.get();
- }).then([this, &proxy, &state, &options, now, cmd, paging_state = std::move(paging_state)] (foreign_ptr<lw_shared_ptr<query::result>> result) mutable {
- return this->process_base_query_results(std::move(result), cmd, proxy, state, options, now, std::move(paging_state));
- });
+ }).then([cmd] (foreign_ptr<lw_shared_ptr<query::result>> result) mutable {
+ return make_ready_future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>(std::move(result), std::move(cmd));
+ });
}
future<shared_ptr<cql_transport::messages::result_message>>
indexed_table_select_statement::execute_base_query(
service::storage_proxy& proxy,
std::vector<primary_key>&& primary_keys,
service::query_state& state,
const query_options& options,
gc_clock::time_point now,
::shared_ptr<const service::pager::paging_state> paging_state) {
return do_execute_base_query(proxy, std::move(primary_keys), state, options, now, paging_state).then(
[this, &proxy, &state, &options, now, paging_state = std::move(paging_state)] (foreign_ptr<lw_shared_ptr<query::result>> result, lw_shared_ptr<query::read_command> cmd) {
return process_base_query_results(std::move(result), std::move(cmd), proxy, state, options, now, std::move(paging_state));
});
}
@@ -868,6 +887,60 @@ indexed_table_select_statement::do_execute(service::storage_proxy& proxy,
}
}
// Aggregated and paged filtering needs to aggregate the results from all pages
// in order to avoid returning partial per-page results (issue #4540).
// It's a little bit more complicated than regular aggregation, because each paging state
// needs to be translated between the base table and the underlying view.
// The routine below keeps fetching pages from the underlying view, which are then
// used to fetch base rows, which go straight to the result set builder.
// A local, internal copy of query_options is kept in order to keep updating
// the paging state between requesting data from replicas.
const bool aggregate = _selection->is_aggregate();
if (aggregate) {
const bool restrictions_need_filtering = _restrictions->need_filtering();
return do_with(cql3::selection::result_set_builder(*_selection, now, options.get_cql_serialization_format()), std::make_unique<cql3::query_options>(cql3::query_options(options)),
[this, &options, &proxy, &state, now, whole_partitions, partition_slices, restrictions_need_filtering] (cql3::selection::result_set_builder& builder, std::unique_ptr<cql3::query_options>& internal_options) {
// page size is set to the internal count page size, regardless of the user-provided value
internal_options.reset(new cql3::query_options(std::move(internal_options), options.get_paging_state(), DEFAULT_COUNT_PAGE_SIZE));
return repeat([this, &builder, &options, &internal_options, &proxy, &state, now, whole_partitions, partition_slices, restrictions_need_filtering] () {
auto consume_results = [this, &builder, &options, &internal_options, restrictions_need_filtering] (foreign_ptr<lw_shared_ptr<query::result>> results, lw_shared_ptr<query::read_command> cmd) {
if (restrictions_need_filtering) {
query::result_view::consume(*results, cmd->slice, cql3::selection::result_set_builder::visitor(builder, *_schema, *_selection,
cql3::selection::result_set_builder::restrictions_filter(_restrictions, options, cmd->row_limit, _schema, cmd->slice.partition_row_limit())));
} else {
query::result_view::consume(*results, cmd->slice, cql3::selection::result_set_builder::visitor(builder, *_schema, *_selection));
}
};
if (whole_partitions || partition_slices) {
return find_index_partition_ranges(proxy, state, *internal_options).then(
[this, now, &state, &internal_options, &proxy, consume_results = std::move(consume_results)] (dht::partition_range_vector partition_ranges, ::shared_ptr<const service::pager::paging_state> paging_state) {
bool has_more_pages = paging_state && paging_state->get_remaining() > 0;
internal_options.reset(new cql3::query_options(std::move(internal_options), paging_state ? ::make_shared<service::pager::paging_state>(*paging_state) : nullptr));
return do_execute_base_query(proxy, std::move(partition_ranges), state, *internal_options, now, std::move(paging_state)).then(consume_results).then([has_more_pages] {
return stop_iteration(!has_more_pages);
});
});
} else {
return find_index_clustering_rows(proxy, state, *internal_options).then(
[this, now, &state, &internal_options, &proxy, consume_results = std::move(consume_results)] (std::vector<primary_key> primary_keys, ::shared_ptr<const service::pager::paging_state> paging_state) {
bool has_more_pages = paging_state && paging_state->get_remaining() > 0;
internal_options.reset(new cql3::query_options(std::move(internal_options), paging_state ? ::make_shared<service::pager::paging_state>(*paging_state) : nullptr));
return this->do_execute_base_query(proxy, std::move(primary_keys), state, *internal_options, now, std::move(paging_state)).then(consume_results).then([has_more_pages] {
return stop_iteration(!has_more_pages);
});
});
}
}).then([this, &builder, restrictions_need_filtering] () {
auto rs = builder.build();
update_stats_rows_read(rs->size());
_stats.filtered_rows_matched_total += restrictions_need_filtering ? rs->size() : 0;
auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
});
});
}
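[editor's note] The aggregate branch above keeps fetching pages into a single result-set builder and only builds the final result once the paging state reports nothing remaining, so a paged aggregate never returns partial per-page answers (issue #4540). A synchronous, hedged sketch of that loop, with hypothetical simplified types in place of the real builder and paging machinery:

```cpp
#include <cassert>
#include <functional>
#include <vector>

// Hypothetical page: some rows plus how many rows remain server-side.
struct page {
    std::vector<int> rows;
    int remaining;
};

// Accumulate rows from *all* pages before producing the result,
// mirroring the repeat()/stop_iteration structure above.
std::vector<int> aggregate_all_pages(const std::function<page()>& fetch_next) {
    std::vector<int> builder;
    while (true) {
        page p = fetch_next();
        builder.insert(builder.end(), p.rows.begin(), p.rows.end());
        if (p.remaining <= 0) {
            return builder; // no more pages: build the final result
        }
    }
}
```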
if (whole_partitions || partition_slices) {
// In this case, can use our normal query machinery, which retrieves
// entire partitions or the same slice for many partitions.

View File

@@ -68,8 +68,8 @@ class select_statement : public cql_statement {
public:
using parameters = raw::select_statement::parameters;
using ordering_comparator_type = raw::select_statement::ordering_comparator_type;
- protected:
static constexpr int DEFAULT_COUNT_PAGE_SIZE = 10000;
+ protected:
static thread_local const ::shared_ptr<parameters> _default_parameters;
schema_ptr _schema;
uint32_t _bound_terms;
@@ -229,6 +229,14 @@ private:
lw_shared_ptr<query::read_command>
prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging);
future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>
do_execute_base_query(
service::storage_proxy& proxy,
dht::partition_range_vector&& partition_ranges,
service::query_state& state,
const query_options& options,
gc_clock::time_point now,
::shared_ptr<const service::pager::paging_state> paging_state);
future<shared_ptr<cql_transport::messages::result_message>>
execute_base_query(
service::storage_proxy& proxy,
@@ -238,6 +246,23 @@ private:
gc_clock::time_point now,
::shared_ptr<const service::pager::paging_state> paging_state);
// Function for fetching the selected columns from a list of clustering rows.
// It is currently used only in our Secondary Index implementation - ordinary
// CQL SELECT statements do not have the syntax to request a list of rows.
// FIXME: The current implementation is very inefficient - it requests each
// row separately (and, incrementally, in parallel). Even multiple rows from a single
// partition are requested separately. This last case can be easily improved,
// but to implement the general case (multiple rows from multiple partitions)
// efficiently, we will need more support from other layers.
// Keys are ordered in token order (see #3423)
future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>
do_execute_base_query(
service::storage_proxy& proxy,
std::vector<primary_key>&& primary_keys,
service::query_state& state,
const query_options& options,
gc_clock::time_point now,
::shared_ptr<const service::pager::paging_state> paging_state);
future<shared_ptr<cql_transport::messages::result_message>>
execute_base_query(
service::storage_proxy& proxy,

View File

@@ -32,7 +32,7 @@ tuples::component_spec_of(shared_ptr<column_specification> column, size_t compon
column->ks_name,
column->cf_name,
::make_shared<column_identifier>(format("{}[{:d}]", column->name, component), true),
- static_pointer_cast<const tuple_type_impl>(column->type)->type(component));
+ static_pointer_cast<const tuple_type_impl>(column->type->underlying_type())->type(component));
}
shared_ptr<term>

View File

@@ -70,7 +70,7 @@ public:
private:
void validate_assignable_to(database& db, const sstring& keyspace, shared_ptr<column_specification> receiver) {
- auto tt = dynamic_pointer_cast<const tuple_type_impl>(receiver->type);
+ auto tt = dynamic_pointer_cast<const tuple_type_impl>(receiver->type->underlying_type());
if (!tt) {
throw exceptions::invalid_request_exception(format("Invalid tuple type literal for {} of type {}", receiver->name, receiver->type->as_cql3_type()));
}

View File

@@ -260,6 +260,10 @@ void backlog_controller::adjust() {
float backlog_controller::backlog_of_shares(float shares) const {
size_t idx = 1;
// No control points means the controller is disabled.
if (_control_points.size() == 0) {
return 1.0f;
}
while ((idx < _control_points.size() - 1) && (_control_points[idx].output < shares)) {
idx++;
}
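[editor's note] The guard added above matters because the vector's `size()` is unsigned: with zero control points, `size() - 1` wraps around to a huge value and the loop would index out of bounds. A hedged sketch of just that hazard, with a hypothetical `control_point` type:

```cpp
#include <cstddef>
#include <vector>

// Hypothetical control point; the real backlog_controller interpolates
// between (shares -> backlog) pairs.
struct control_point { float output; float backlog; };

// Index search guarded against the empty case. Without the early
// return, points.size() - 1 underflows (size_t) to SIZE_MAX and the
// while loop walks off the end of the vector.
std::size_t find_segment(const std::vector<control_point>& points, float shares) {
    if (points.empty()) {
        return 0; // caller treats this as "controller disabled"
    }
    std::size_t idx = 1;
    while (idx < points.size() - 1 && points[idx].output < shares) {
        ++idx;
    }
    return idx;
}
```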
@@ -1929,7 +1933,7 @@ flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db,
virtual flat_mutation_reader create_reader(
schema_ptr schema,
const dht::partition_range& range,
- const query::partition_slice&,
+ const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr,
mutation_reader::forwarding fwd_mr) override {
@@ -1940,7 +1944,7 @@ flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db,
_contexts[shard].read_operation = make_foreign(std::make_unique<utils::phased_barrier::operation>(cf.read_in_progress()));
_contexts[shard].semaphore = &cf.streaming_read_concurrency_semaphore();
- return cf.make_streaming_reader(std::move(schema), *_contexts[shard].range, fwd_mr);
+ return cf.make_streaming_reader(std::move(schema), *_contexts[shard].range, slice, fwd_mr);
}
virtual void destroy_reader(shard_id shard, future<stopped_reader> reader_fut) noexcept override {
reader_fut.then([this, zis = shared_from_this(), shard] (stopped_reader&& reader) mutable {
@@ -1963,7 +1967,8 @@ flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db,
return make_multishard_combining_reader(make_shared<streaming_reader_lifecycle_policy>(db), partitioner, std::move(s), pr, ps, pc,
std::move(trace_state), fwd_mr);
});
- return make_flat_multi_range_reader(std::move(schema), std::move(ms), std::move(range_generator), schema->full_slice(),
+ auto&& full_slice = schema->full_slice();
+ return make_flat_multi_range_reader(std::move(schema), std::move(ms), std::move(range_generator), std::move(full_slice),
service::get_local_streaming_read_priority(), {}, mutation_reader::forwarding::no);
}

View File

@@ -458,6 +458,7 @@ private:
// This semaphore ensures that an operation like snapshot won't have its selected
// sstables deleted by compaction in parallel, a race condition which could
// easily result in failure.
// Locking order: must be acquired either independently or after _sstables_lock
seastar::semaphore _sstable_deletion_sem = {1};
// There are situations in which we need to stop writing sstables. Flushers will take
// the read lock, and the ones that wish to stop that process will take the write lock.
@@ -679,8 +680,13 @@ public:
// Single range overload.
flat_mutation_reader make_streaming_reader(schema_ptr schema, const dht::partition_range& range,
const query::partition_slice& slice,
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::no) const;
flat_mutation_reader make_streaming_reader(schema_ptr schema, const dht::partition_range& range) {
return make_streaming_reader(schema, range, schema->full_slice());
}
sstables::shared_sstable make_streaming_sstable_for_write(std::optional<sstring> subdir = {});
sstables::shared_sstable make_streaming_staging_sstable() {
return make_streaming_sstable_for_write("staging");
@@ -759,13 +765,7 @@ public:
// SSTable writes are now allowed again, and generation is updated to new_generation if != -1
// returns the amount of microseconds elapsed since we disabled writes.
- std::chrono::steady_clock::duration enable_sstable_write(int64_t new_generation) {
- if (new_generation != -1) {
- update_sstables_known_generation(new_generation);
- }
- _sstables_lock.write_unlock();
- return std::chrono::steady_clock::now() - _sstable_writes_disabled_at;
- }
+ std::chrono::steady_clock::duration enable_sstable_write(int64_t new_generation);
// Make sure the generation numbers are sequential, starting from "start".
// Generations before "start" are left untouched.
@@ -935,7 +935,7 @@ public:
}
private:
- future<row_locker::lock_holder> do_push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, mutation_source&& source) const;
+ future<row_locker::lock_holder> do_push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, mutation_source&& source, const io_priority_class& io_priority) const;
std::vector<view_ptr> affected_views(const schema_ptr& base, const mutation& update) const;
future<> generate_and_propagate_view_updates(const schema_ptr& base,
std::vector<view_ptr>&& views,

View File

@@ -396,10 +396,8 @@ std::unordered_set<gms::inet_address> db::batchlog_manager::endpoint_filter(cons
// grab a random member of up to two racks
for (auto& rack : racks) {
- auto rack_members = validated.bucket(rack);
- auto n = validated.bucket_size(rack_members);
+ auto cpy = boost::copy_range<std::vector<gms::inet_address>>(validated.equal_range(rack) | boost::adaptors::map_values);
- std::uniform_int_distribution<size_t> rdist(0, n - 1);
+ std::uniform_int_distribution<size_t> rdist(0, cpy.size() - 1);
result.emplace(cpy[rdist(_e1)]);
}
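[editor's note] The fix above materializes the rack's members into a vector before sampling, so the distribution's bound is guaranteed to match the range actually indexed. A hedged sketch of that pattern with simplified types (plain strings instead of `gms::inet_address`, no Boost ranges):

```cpp
#include <random>
#include <string>
#include <unordered_map>
#include <vector>

// Pick one value uniformly at random among all entries with the given
// key. Copying equal_range() into a vector first gives random access
// and a size that matches the sampled range exactly.
std::string pick_random_member(const std::unordered_multimap<std::string, std::string>& m,
                               const std::string& key, std::mt19937& rng) {
    std::vector<std::string> members;
    auto [first, last] = m.equal_range(key);
    for (auto it = first; it != last; ++it) {
        members.push_back(it->second);
    }
    if (members.empty()) {
        return {}; // guard: uniform_int_distribution(0, -1) would be UB
    }
    std::uniform_int_distribution<std::size_t> dist(0, members.size() - 1);
    return members[dist(rng)];
}
```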

View File

@@ -148,9 +148,18 @@ db::commitlog::descriptor::descriptor(const sstring& filename, const std::string
: descriptor([&filename, &fname_prefix]() {
std::smatch m;
// match both legacy and new version of commitlogs Ex: CommitLog-12345.log and CommitLog-4-12345.log.
- std::regex rx("(?:.*/)?(?:Recycled-)?" + fname_prefix + "((\\d+)(" + SEPARATOR + "\\d+)?)" + FILENAME_EXTENSION);
+ std::regex rx("(?:Recycled-)?" + fname_prefix + "((\\d+)(" + SEPARATOR + "\\d+)?)" + FILENAME_EXTENSION);
std::string sfilename = filename;
- if (!std::regex_match(sfilename, m, rx)) {
+ auto cbegin = sfilename.cbegin();
+ // skip the leading path
+ // Note: we're using rfind rather than the regex above
+ // since it may run out of stack in debug builds.
+ // See https://github.com/scylladb/scylla/issues/4464
+ auto pos = std::string(filename).rfind('/');
+ if (pos != std::string::npos) {
+ cbegin += pos + 1;
+ }
+ if (!std::regex_match(cbegin, sfilename.cend(), m, rx)) {
throw std::domain_error("Cannot parse the version of the file: " + filename);
}
if (m[3].length() == 0) {
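[editor's note] The workaround in this hunk strips the directory prefix with `rfind('/')` instead of matching it with a leading `(?:.*/)?` group, which can exhaust the stack in some `std::regex` implementations on long paths (scylladb/scylla#4464). A hedged, self-contained sketch with the prefix/separator/extension hard-coded (the real code builds them from `fname_prefix`, `SEPARATOR`, and `FILENAME_EXTENSION`):

```cpp
#include <regex>
#include <string>

// Strip the directory part before regex matching, rather than letting
// the regex engine backtrack over the whole path.
bool parse_commitlog_name(const std::string& path, std::smatch& m) {
    static const std::regex rx("(?:Recycled-)?CommitLog-((\\d+)(-\\d+)?)\\.log");
    auto begin = path.cbegin();
    auto pos = path.rfind('/');
    if (pos != std::string::npos) {
        begin += pos + 1; // skip the leading path
    }
    return std::regex_match(begin, path.cend(), m, rx);
}
```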
@@ -420,7 +429,11 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
uint64_t _file_pos = 0;
uint64_t _flush_pos = 0;
bool _closed = false;
// Not the same as _closed since files can be reused
bool _closed_file = false;
bool _terminated = false;
using buffer_type = segment_manager::buffer_type;
@@ -486,7 +499,7 @@ public:
clogger.debug("Created new {} segment {}", active ? "active" : "reserve", *this);
}
~segment() {
- if (!_closed) {
+ if (!_closed_file) {
_segment_manager->add_file_to_close(std::move(_file));
}
if (is_clean()) {
@@ -560,7 +573,7 @@ public:
// and we should have waited out all pending.
return me->_pending_ops.close().finally([me] {
return me->_file.truncate(me->_flush_pos).then([me] {
- return me->_file.close();
+ return me->_file.close().finally([me] { me->_closed_file = true; });
});
});
});
@@ -1223,6 +1236,34 @@ void db::commitlog::segment_manager::flush_segments(bool force) {
}
}
/// \brief Helper for ensuring a file is closed if an exception is thrown.
///
/// The file provided by the file_fut future is passed to func.
/// * If func throws an exception E, the file is closed and we return
/// a failed future with E.
/// * If func returns a value V, the file is not closed and we return
/// a future with V.
/// Note that when an exception is not thrown, it is the
/// responsibility of func to make sure the file will be closed. It
/// can close the file itself, return it, or store it somewhere.
///
/// \tparam Func The type of function this wraps
/// \param file_fut A future that produces a file
/// \param func A function that uses a file
/// \return A future that passes the file produced by file_fut to func
/// and closes it if func fails
template <typename Func>
static auto close_on_failure(future<file> file_fut, Func func) {
return file_fut.then([func = std::move(func)](file f) {
return futurize_apply(func, f).handle_exception([f] (std::exception_ptr e) mutable {
return f.close().then_wrapped([f, e = std::move(e)] (future<> x) {
using futurator = futurize<std::result_of_t<Func(file)>>;
return futurator::make_exception_future(e);
});
});
});
}
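[editor's note] The Seastar helper above is asynchronous, but the underlying idea is the classic "close on failure, otherwise hand ownership to the callback" contract its doc comment describes. A synchronous, hedged sketch with a hypothetical resource type standing in for `seastar::file`:

```cpp
#include <stdexcept>

// Hypothetical resource standing in for seastar::file.
struct resource {
    bool* closed;
    void close() { *closed = true; }
};

// If func throws, close the resource and rethrow; if func succeeds,
// func itself is responsible for eventually closing it -- the same
// contract as close_on_failure() above, minus the futures.
template <typename Func>
auto with_close_on_failure(resource r, Func func) {
    try {
        return func(r);
    } catch (...) {
        r.close();
        throw;
    }
}
```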
future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::allocate_segment_ex(const descriptor& d, sstring filename, open_flags flags, bool active) {
file_open_options opt;
opt.extent_allocation_size_hint = max_size;
@@ -1249,7 +1290,7 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
return fut;
});
- return fut.then([this, d, active, filename](file f) {
+ return close_on_failure(std::move(fut), [this, d, active, filename] (file f) {
f = make_checked_file(commit_error_handler, f);
// xfs doesn't like files extended beyond eof, so enlarge the file
return f.truncate(max_size).then([this, d, active, f, filename] () mutable {


@@ -756,6 +756,8 @@ public:
val(enable_dangerous_direct_import_of_cassandra_counters, bool, false, Used, "Only turn this option on if you want to import tables from Cassandra containing counters, and you are SURE that no counters in that table were created in a version earlier than Cassandra 2.1." \
" It is not enough to have ever since upgraded to newer versions of Cassandra. If you EVER used a version earlier than 2.1 in the cluster where these SSTables come from, DO NOT TURN ON THIS OPTION! You will corrupt your data. You have been warned.") \
val(enable_shard_aware_drivers, bool, true, Used, "Enable native transport drivers to use connection-per-shard for better performance") \
val(abort_on_internal_error, bool, false, Used, "Abort the server instead of throwing exception when internal invariants are violated.") \
val(enable_3_1_0_compatibility_mode, bool, false, Used, "Set to true if the cluster was initially installed from 3.1.0. If it was upgraded from an earlier version, or installed from a later version, leave this set to false. This adjusts the communication protocol to work around a bug in Scylla 3.1.0") \
/* done! */
#define _make_value_member(name, type, deflt, status, desc, ...) \


@@ -57,9 +57,30 @@ static ::shared_ptr<cql3::cql3_type::raw> parse_raw(const sstring& str) {
}
data_type db::cql_type_parser::parse(const sstring& keyspace, const sstring& str, lw_shared_ptr<user_types_metadata> user_types) {
static const thread_local std::unordered_map<sstring, cql3::cql3_type> native_types = []{
std::unordered_map<sstring, cql3::cql3_type> res;
for (auto& nt : cql3::cql3_type::values()) {
res.emplace(nt.to_string(), nt);
}
return res;
}();
auto i = native_types.find(str);
if (i != native_types.end()) {
return i->second.get_type();
}
if (!user_types && service::get_storage_proxy().local_is_initialized()) {
user_types = service::get_storage_proxy().local().get_db().local().find_keyspace(keyspace).metadata()->user_types();
}
// special-case top-level UDTs
if (user_types) {
auto& map = user_types->get_all_types();
auto i = map.find(utf8_type->decompose(str));
if (i != map.end()) {
return i->second;
}
}
auto raw = parse_raw(str);
auto cql = raw->prepare_internal(keyspace, user_types);


@@ -57,7 +57,7 @@ void data_listeners::on_write(const schema_ptr& s, const frozen_mutation& m) {
}
}
toppartitons_item_key::operator sstring() const {
toppartitions_item_key::operator sstring() const {
std::ostringstream oss;
oss << key.key().with_schema(*schema);
return oss.str();
@@ -84,8 +84,11 @@ flat_mutation_reader toppartitions_data_listener::on_read(const schema_ptr& s, c
return std::move(rd);
}
dblog.trace("toppartitions_data_listener::on_read: {}.{}", s->ks_name(), s->cf_name());
return make_filtering_reader(std::move(rd), [this, &range, &slice, s = std::move(s)] (const dht::decorated_key& dk) {
_top_k_read.append(toppartitons_item_key{s, dk});
return make_filtering_reader(std::move(rd), [zis = this->weak_from_this(), &range, &slice, s = std::move(s)] (const dht::decorated_key& dk) {
// The data query may be executing after the toppartitions_data_listener object has been removed, so check the weak pointer before using it
if (zis) {
zis->_top_k_read.append(toppartitions_item_key{s, dk});
}
return true;
});
}
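The fix above captures a weak reference (`weak_from_this`) instead of a raw `this`, so the filtering callback tolerates the listener being destroyed while the query is still running. The same idiom with standard C++ smart pointers, using a hypothetical `Listener` type in place of `toppartitions_data_listener`:

```cpp
#include <cassert>
#include <functional>
#include <memory>
#include <vector>

// Hypothetical listener that records observed keys.
struct Listener {
    std::vector<int> seen;
};

// Build a callback that only touches the listener if it is still alive,
// mirroring the `if (zis) { ... }` guard in the diff.
std::function<bool(int)> make_filter(const std::shared_ptr<Listener>& l) {
    return [weak = std::weak_ptr<Listener>(l)](int key) {
        if (auto alive = weak.lock()) {  // listener may already be gone
            alive->seen.push_back(key);
        }
        return true;                     // never filter rows out
    };
}
```

As in the diff, the callback keeps returning `true` after the listener dies: the query still sees all rows, it just stops feeding the top-k accumulator.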
@@ -95,7 +98,27 @@ void toppartitions_data_listener::on_write(const schema_ptr& s, const frozen_mut
return;
}
dblog.trace("toppartitions_data_listener::on_write: {}.{}", _ks, _cf);
_top_k_write.append(toppartitons_item_key{s, m.decorated_key(*s)});
_top_k_write.append(toppartitions_item_key{s, m.decorated_key(*s)});
}
toppartitions_data_listener::global_top_k::results
toppartitions_data_listener::globalize(top_k::results&& r) {
toppartitions_data_listener::global_top_k::results n;
n.reserve(r.size());
for (auto&& e : r) {
n.emplace_back(global_top_k::results::value_type{toppartitions_global_item_key(std::move(e.item)), e.count, e.error});
}
return n;
}
toppartitions_data_listener::top_k::results
toppartitions_data_listener::localize(const global_top_k::results& r) {
toppartitions_data_listener::top_k::results n;
n.reserve(r.size());
for (auto&& e : r) {
n.emplace_back(top_k::results::value_type{toppartitions_item_key(e.item), e.count, e.error});
}
return n;
}
toppartitions_query::toppartitions_query(distributed<database>& xdb, sstring ks, sstring cf,
@@ -108,20 +131,20 @@ future<> toppartitions_query::scatter() {
return _query.start(std::ref(_xdb), _ks, _cf);
}
using top_t = toppartitions_data_listener::top_k::results;
using top_t = toppartitions_data_listener::global_top_k::results;
future<toppartitions_query::results> toppartitions_query::gather(unsigned res_size) {
dblog.debug("toppartitions_query::gather");
auto map = [res_size, this] (toppartitions_data_listener& listener) {
dblog.trace("toppartitions_query::map_reduce with listener {}", &listener);
top_t rd = listener._top_k_read.top(res_size);
top_t wr = listener._top_k_write.top(res_size);
return std::tuple<top_t, top_t>{std::move(rd), std::move(wr)};
top_t rd = toppartitions_data_listener::globalize(listener._top_k_read.top(res_size));
top_t wr = toppartitions_data_listener::globalize(listener._top_k_write.top(res_size));
return make_foreign(std::make_unique<std::tuple<top_t, top_t>>(std::move(rd), std::move(wr)));
};
auto reduce = [this] (results res, std::tuple<top_t, top_t> rd_wr) {
res.read.append(std::get<0>(rd_wr));
res.write.append(std::get<1>(rd_wr));
auto reduce = [this] (results res, foreign_ptr<std::unique_ptr<std::tuple<top_t, top_t>>> rd_wr) {
res.read.append(toppartitions_data_listener::localize(std::get<0>(*rd_wr)));
res.write.append(toppartitions_data_listener::localize(std::get<1>(*rd_wr)));
return std::move(res);
};
return _query.map_reduce0(map, results{res_size}, reduce)


@@ -24,12 +24,14 @@
#include <seastar/core/distributed.hh>
#include <seastar/core/future.hh>
#include <seastar/core/distributed.hh>
#include <seastar/core/weak_ptr.hh>
#include "schema.hh"
#include "flat_mutation_reader.hh"
#include "mutation_reader.hh"
#include "frozen_mutation.hh"
#include "utils/top_k.hh"
#include "schema_registry.hh"
#include <vector>
#include <set>
@@ -75,29 +77,54 @@ public:
};
struct toppartitons_item_key {
struct toppartitions_item_key {
schema_ptr schema;
dht::decorated_key key;
toppartitons_item_key(const schema_ptr& schema, const dht::decorated_key& key) : schema(schema), key(key) {}
toppartitons_item_key(const toppartitons_item_key& key) noexcept : schema(key.schema), key(key.key) {}
toppartitions_item_key(const schema_ptr& schema, const dht::decorated_key& key) : schema(schema), key(key) {}
toppartitions_item_key(const toppartitions_item_key& key) noexcept : schema(key.schema), key(key.key) {}
struct hash {
size_t operator()(const toppartitons_item_key& k) const {
size_t operator()(const toppartitions_item_key& k) const {
return std::hash<dht::token>()(k.key.token());
}
};
struct comp {
bool operator()(const toppartitons_item_key& k1, const toppartitons_item_key& k2) const {
return k1.schema == k2.schema && k1.key.equal(*k2.schema, k2.key);
bool operator()(const toppartitions_item_key& k1, const toppartitions_item_key& k2) const {
return k1.schema->id() == k2.schema->id() && k1.key.equal(*k2.schema, k2.key);
}
};
explicit operator sstring() const;
};
class toppartitions_data_listener : public data_listener {
// Like toppartitions_item_key, but uses global_schema_ptr, so can be safely transported across shards
struct toppartitions_global_item_key {
global_schema_ptr schema;
dht::decorated_key key;
toppartitions_global_item_key(toppartitions_item_key&& tik) : schema(std::move(tik.schema)), key(std::move(tik.key)) {}
operator toppartitions_item_key() const {
return toppartitions_item_key(schema, key);
}
struct hash {
size_t operator()(const toppartitions_global_item_key& k) const {
return std::hash<dht::token>()(k.key.token());
}
};
struct comp {
bool operator()(const toppartitions_global_item_key& k1, const toppartitions_global_item_key& k2) const {
return k1.schema.get()->id() == k2.schema.get()->id() && k1.key.equal(*k2.schema.get(), k2.key);
}
};
explicit operator sstring() const;
};
class toppartitions_data_listener : public data_listener, public weakly_referencable<toppartitions_data_listener> {
friend class toppartitions_query;
database& _db;
@@ -105,7 +132,11 @@ class toppartitions_data_listener : public data_listener {
sstring _cf;
public:
using top_k = utils::space_saving_top_k<toppartitons_item_key, toppartitons_item_key::hash, toppartitons_item_key::comp>;
using top_k = utils::space_saving_top_k<toppartitions_item_key, toppartitions_item_key::hash, toppartitions_item_key::comp>;
using global_top_k = utils::space_saving_top_k<toppartitions_global_item_key, toppartitions_global_item_key::hash, toppartitions_global_item_key::comp>;
public:
static global_top_k::results globalize(top_k::results&& r);
static top_k::results localize(const global_top_k::results& r);
private:
top_k _top_k_read;
top_k _top_k_write;


@@ -118,8 +118,8 @@ future<> manager::stop() {
return _draining_eps_gate.close().finally([this] {
return parallel_for_each(_ep_managers, [] (auto& pair) {
return pair.second.stop();
}).finally([this] {
return pair.second.stop();
}).finally([this] {
_ep_managers.clear();
manager_logger.info("Stopped");
}).discard_result();
@@ -240,6 +240,8 @@ future<> manager::end_point_hints_manager::stop(drain should_drain) noexcept {
manager::end_point_hints_manager::end_point_hints_manager(const key_type& key, manager& shard_manager)
: _key(key)
, _shard_manager(shard_manager)
, _file_update_mutex_ptr(make_lw_shared<seastar::shared_mutex>())
, _file_update_mutex(*_file_update_mutex_ptr)
, _state(state_set::of<state::stopped>())
, _hints_dir(_shard_manager.hints_dir() / format("{}", _key).c_str())
, _sender(*this, _shard_manager.local_storage_proxy(), _shard_manager.local_db(), _shard_manager.local_gossiper())
@@ -248,6 +250,8 @@ manager::end_point_hints_manager::end_point_hints_manager(const key_type& key, m
manager::end_point_hints_manager::end_point_hints_manager(end_point_hints_manager&& other)
: _key(other._key)
, _shard_manager(other._shard_manager)
, _file_update_mutex_ptr(std::move(other._file_update_mutex_ptr))
, _file_update_mutex(*_file_update_mutex_ptr)
, _state(other._state)
, _hints_dir(std::move(other._hints_dir))
, _sender(other._sender, *this)
@@ -520,28 +524,35 @@ void manager::drain_for(gms::inet_address endpoint) {
manager_logger.trace("on_leave_cluster: {} is removed/decommissioned", endpoint);
with_gate(_draining_eps_gate, [this, endpoint] {
return futurize_apply([this, endpoint] () {
if (utils::fb_utilities::is_me(endpoint)) {
return parallel_for_each(_ep_managers, [] (auto& pair) {
return pair.second.stop(drain::yes).finally([&pair] {
return remove_file(pair.second.hints_dir().c_str());
return with_semaphore(drain_lock(), 1, [this, endpoint] {
return futurize_apply([this, endpoint] () {
if (utils::fb_utilities::is_me(endpoint)) {
return parallel_for_each(_ep_managers, [] (auto& pair) {
return pair.second.stop(drain::yes).finally([&pair] {
return with_file_update_mutex(pair.second, [&pair] {
return remove_file(pair.second.hints_dir().c_str());
});
});
}).finally([this] {
_ep_managers.clear();
});
}).finally([this] {
_ep_managers.clear();
});
} else {
ep_managers_map_type::iterator ep_manager_it = find_ep_manager(endpoint);
if (ep_manager_it != ep_managers_end()) {
return ep_manager_it->second.stop(drain::yes).finally([this, endpoint, hints_dir = ep_manager_it->second.hints_dir()] {
_ep_managers.erase(endpoint);
return remove_file(hints_dir.c_str());
});
}
} else {
ep_managers_map_type::iterator ep_manager_it = find_ep_manager(endpoint);
if (ep_manager_it != ep_managers_end()) {
return ep_manager_it->second.stop(drain::yes).finally([this, endpoint, &ep_man = ep_manager_it->second] {
return with_file_update_mutex(ep_man, [&ep_man] {
return remove_file(ep_man.hints_dir().c_str());
}).finally([this, endpoint] {
_ep_managers.erase(endpoint);
});
});
}
return make_ready_future<>();
}
}).handle_exception([endpoint] (auto eptr) {
manager_logger.error("Exception when draining {}: {}", endpoint, eptr);
return make_ready_future<>();
}
}).handle_exception([endpoint] (auto eptr) {
manager_logger.error("Exception when draining {}: {}", endpoint, eptr);
});
});
});
}


@@ -276,7 +276,8 @@ public:
manager& _shard_manager;
hints_store_ptr _hints_store_anchor;
seastar::gate _store_gate;
seastar::shared_mutex _file_update_mutex;
lw_shared_ptr<seastar::shared_mutex> _file_update_mutex_ptr;
seastar::shared_mutex& _file_update_mutex;
enum class state {
can_hint, // hinting is currently allowed (used by the space_watchdog)
@@ -378,8 +379,20 @@ public:
return _state.contains(state::stopped);
}
seastar::shared_mutex& file_update_mutex() {
return _file_update_mutex;
/// \brief Safely runs a given functor under the file_update_mutex of \ref ep_man
///
/// Runs a given functor under the file_update_mutex of the given end_point_hints_manager instance.
/// This function is safe even if \ref ep_man gets destroyed before the future this function returns resolves
/// (as long as the \ref func call itself is safe).
///
/// \tparam Func Functor type.
/// \param ep_man end_point_hints_manager instance which file_update_mutex we want to lock.
/// \param func Functor to run under the lock.
/// \return Whatever \ref func returns.
template <typename Func>
friend inline auto with_file_update_mutex(end_point_hints_manager& ep_man, Func&& func) {
lw_shared_ptr<seastar::shared_mutex> lock_ptr = ep_man._file_update_mutex_ptr;
return with_lock(*lock_ptr, std::forward<Func>(func)).finally([lock_ptr] {});
}
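`with_file_update_mutex` above copies the `lw_shared_ptr` into the continuation (the `.finally([lock_ptr] {})`), so the mutex stays alive even if the owning `end_point_hints_manager` is destroyed before the future resolves. A plain-C++ sketch of the same lifetime trick, with `std::shared_ptr` in place of `lw_shared_ptr` and a synchronous callable in place of a future chain:

```cpp
#include <cassert>
#include <memory>
#include <mutex>

// Run func under *lock_ptr. Taking the shared_ptr by value keeps the
// mutex alive for the whole critical section, even if the original
// owner drops its reference while func runs.
template <typename Func>
auto with_shared_mutex(std::shared_ptr<std::mutex> lock_ptr, Func func) {
    std::lock_guard<std::mutex> guard(*lock_ptr);
    return func();
}
```

Had the helper taken a bare `std::mutex&` instead, destroying the owner mid-section would leave the guard unlocking a dead mutex, which is exactly the hazard the diff removes.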
const fs::path& hints_dir() const noexcept {
@@ -387,6 +400,10 @@ public:
}
private:
seastar::shared_mutex& file_update_mutex() noexcept {
return _file_update_mutex;
}
/// \brief Creates a new hints store object.
///
/// - Creates a hints store directory if doesn't exist: <shard_hints_dir>/<ep_key>
@@ -453,6 +470,7 @@ private:
stats _stats;
seastar::metrics::metric_groups _metrics;
std::unordered_set<ep_key_type> _eps_with_pending_hints;
seastar::semaphore _drain_lock = {1};
public:
manager(sstring hints_directory, std::vector<sstring> hinted_dcs, int64_t max_hint_window_ms, resource_manager& res_manager, distributed<database>& db);
@@ -531,6 +549,10 @@ public:
return _hints_dir_device_id;
}
seastar::semaphore& drain_lock() noexcept {
return _drain_lock;
}
void allow_hints();
void forbid_hints();
void forbid_hints_for_eps_with_pending_hints();


@@ -89,16 +89,27 @@ future<> space_watchdog::stop() noexcept {
return std::move(_started);
}
// Called under the end_point_hints_manager::file_update_mutex() of the corresponding end_point_hints_manager instance.
future<> space_watchdog::scan_one_ep_dir(fs::path path, manager& shard_manager, ep_key_type ep_key) {
return lister::scan_dir(path, { directory_entry_type::regular }, [this, ep_key, &shard_manager] (fs::path dir, directory_entry de) {
// Put the current end point ID to state.eps_with_pending_hints when we see the second hints file in its directory
if (_files_count == 1) {
shard_manager.add_ep_with_pending_hints(ep_key);
}
++_files_count;
return do_with(std::move(path), [this, ep_key, &shard_manager] (fs::path& path) {
// It may happen that we get here and the directory has already been deleted in the context of manager::drain_for().
// In this case simply bail out.
return engine().file_exists(path.native()).then([this, ep_key, &shard_manager, &path] (bool exists) {
if (!exists) {
return make_ready_future<>();
} else {
return lister::scan_dir(path, { directory_entry_type::regular }, [this, ep_key, &shard_manager] (fs::path dir, directory_entry de) {
// Put the current end point ID to state.eps_with_pending_hints when we see the second hints file in its directory
if (_files_count == 1) {
shard_manager.add_ep_with_pending_hints(ep_key);
}
++_files_count;
return io_check(file_size, (dir / de.name.c_str()).c_str()).then([this] (uint64_t fsize) {
_total_size += fsize;
return io_check(file_size, (dir / de.name.c_str()).c_str()).then([this] (uint64_t fsize) {
_total_size += fsize;
});
});
}
});
});
}
@@ -136,7 +147,7 @@ void space_watchdog::on_timer() {
// continue to enumeration - there is no one to change them.
auto it = shard_manager.find_ep_manager(de.name);
if (it != shard_manager.ep_managers_end()) {
return with_lock(it->second.file_update_mutex(), [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)]() mutable {
return with_file_update_mutex(it->second, [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)] () mutable {
return scan_one_ep_dir(dir / ep_name, shard_manager, ep_key_type(ep_name));
});
} else {


@@ -26,11 +26,17 @@
namespace db {
enum class schema_feature {
VIEW_VIRTUAL_COLUMNS
VIEW_VIRTUAL_COLUMNS,
// When set, the schema digest is calculated in a way such that it doesn't change after all
// tombstones in an empty partition expire.
// See https://github.com/scylladb/scylla/issues/4485
DIGEST_INSENSITIVE_TO_EXPIRY,
};
using schema_features = enum_set<super_enum<schema_feature,
schema_feature::VIEW_VIRTUAL_COLUMNS
schema_feature::VIEW_VIRTUAL_COLUMNS,
schema_feature::DIGEST_INSENSITIVE_TO_EXPIRY
>>;
}


@@ -587,9 +587,9 @@ future<utils::UUID> calculate_schema_digest(distributed<service::storage_proxy>&
return mutations;
});
};
auto reduce = [] (auto& hash, auto&& mutations) {
auto reduce = [features] (auto& hash, auto&& mutations) {
for (const mutation& m : mutations) {
feed_hash_for_schema_digest(hash, m);
feed_hash_for_schema_digest(hash, m, features);
}
};
return do_with(md5_hasher(), all_table_names(features), [features, map, reduce] (auto& hash, auto& tables) {
@@ -778,6 +778,13 @@ mutation compact_for_schema_digest(const mutation& m) {
return m_compacted;
}
void feed_hash_for_schema_digest(hasher& h, const mutation& m, schema_features features) {
auto compacted = compact_for_schema_digest(m);
if (!features.contains<schema_feature::DIGEST_INSENSITIVE_TO_EXPIRY>() || !compacted.partition().empty()) {
feed_hash(h, compact_for_schema_digest(m));
}
}
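With `DIGEST_INSENSITIVE_TO_EXPIRY` enabled, `feed_hash_for_schema_digest` above skips mutations whose compacted partition is empty, so a digest computed before the tombstones of a dropped object expire matches the one computed after. A toy model of that property, using strings as "partitions" (an empty string plays a fully-expired partition) and a simple hash combiner as the hasher:

```cpp
#include <cassert>
#include <functional>
#include <string>
#include <vector>

// Toy digest: combine hashes of all partitions, optionally skipping
// partitions that compacted down to nothing (all tombstones expired).
size_t digest(const std::vector<std::string>& parts, bool skip_empty) {
    size_t h = 0;
    for (const auto& p : parts) {
        if (skip_empty && p.empty()) {
            continue;  // an expired-away partition must not affect the digest
        }
        h ^= std::hash<std::string>{}(p) + 0x9e3779b9 + (h << 6) + (h >> 2);
    }
    return h;
}
```

Without the skip, two nodes whose tombstones expired at different times would disagree on the schema digest and trigger needless schema pulls, which is the symptom described in issue #4485.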
// Applies deletion of the "version" column to a system_schema.scylla_tables mutation.
static void delete_schema_version(mutation& m) {
if (m.column_family_id() != scylla_tables()->id()) {
@@ -1085,10 +1092,31 @@ static std::vector<V> get_list(const query::result_set_row& row, const sstring&
// Create types for a given keyspace. This takes care of topologically sorting user defined types.
template <typename T> static std::vector<user_type> create_types(keyspace_metadata& ks, T&& range) {
cql_type_parser::raw_builder builder(ks);
std::unordered_set<bytes> names;
for (const query::result_set_row& row : range) {
builder.add(row.get_nonnull<sstring>("type_name"),
get_list<sstring>(row, "field_names"),
get_list<sstring>(row, "field_types"));
auto name = row.get_nonnull<sstring>("type_name");
names.insert(to_bytes(name));
builder.add(std::move(name), get_list<sstring>(row, "field_names"), get_list<sstring>(row, "field_types"));
}
// Add user types that use any of the above types. From the
// database point of view they haven't changed since the content
// of system.types is the same for them. The runtime objects in
// the other hand now point to out of date types, so we need to
// recreate them.
for (const auto& p : ks.user_types()->get_all_types()) {
const user_type& t = p.second;
if (names.count(t->_name) != 0) {
continue;
}
for (const auto& name : names) {
if (t->references_user_type(t->_keyspace, name)) {
std::vector<sstring> field_types;
for (const data_type& f : t->field_types()) {
field_types.push_back(f->as_cql3_type().to_string());
}
builder.add(t->get_name_as_string(), t->string_field_names(), std::move(field_types));
}
}
}
return builder.build();
}
@@ -2727,8 +2755,9 @@ namespace legacy {
table_schema_version schema_mutations::digest() const {
md5_hasher h;
db::schema_tables::feed_hash_for_schema_digest(h, _columnfamilies);
db::schema_tables::feed_hash_for_schema_digest(h, _columns);
const db::schema_features no_features;
db::schema_tables::feed_hash_for_schema_digest(h, _columnfamilies, no_features);
db::schema_tables::feed_hash_for_schema_digest(h, _columns, no_features);
return utils::UUID_gen::get_name_UUID(h.finalize());
}


@@ -215,10 +215,7 @@ index_metadata_kind deserialize_index_kind(sstring kind);
mutation compact_for_schema_digest(const mutation& m);
template<typename Hasher>
void feed_hash_for_schema_digest(Hasher& h, const mutation& m) {
feed_hash(h, compact_for_schema_digest(m));
}
void feed_hash_for_schema_digest(hasher&, const mutation&, schema_features);
} // namespace schema_tables
} // namespace db


@@ -0,0 +1,328 @@
/*
* Copyright (C) 2019 ScyllaDB
*
* Modified by ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include <boost/range/adaptor/indirected.hpp>
#include <boost/range/adaptor/map.hpp>
#include <boost/range/adaptor/transformed.hpp>
#include <boost/range/algorithm/find_if.hpp>
#include "clustering_bounds_comparator.hh"
#include "database_fwd.hh"
#include "db/system_keyspace.hh"
#include "dht/i_partitioner.hh"
#include "partition_range_compat.hh"
#include "range.hh"
#include "service/storage_service.hh"
#include "mutation_fragment.hh"
#include "sstables/sstables.hh"
#include "db/timeout_clock.hh"
#include "database.hh"
#include "db/size_estimates_virtual_reader.hh"
namespace db {
namespace size_estimates {
struct virtual_row {
const bytes& cf_name;
const token_range& tokens;
clustering_key_prefix as_key() const {
return clustering_key_prefix::from_exploded(std::vector<bytes_view>{cf_name, tokens.start, tokens.end});
}
};
struct virtual_row_comparator {
schema_ptr _schema;
virtual_row_comparator(schema_ptr schema) : _schema(schema) { }
bool operator()(const clustering_key_prefix& key1, const clustering_key_prefix& key2) {
return clustering_key_prefix::prefix_equality_less_compare(*_schema)(key1, key2);
}
bool operator()(const virtual_row& row, const clustering_key_prefix& key) {
return operator()(row.as_key(), key);
}
bool operator()(const clustering_key_prefix& key, const virtual_row& row) {
return operator()(key, row.as_key());
}
};
// Iterating over the cartesian product of cf_names and token_ranges.
class virtual_row_iterator : public std::iterator<std::input_iterator_tag, const virtual_row> {
std::reference_wrapper<const std::vector<bytes>> _cf_names;
std::reference_wrapper<const std::vector<token_range>> _ranges;
size_t _cf_names_idx = 0;
size_t _ranges_idx = 0;
public:
struct end_iterator_tag {};
virtual_row_iterator(const std::vector<bytes>& cf_names, const std::vector<token_range>& ranges)
: _cf_names(std::ref(cf_names))
, _ranges(std::ref(ranges))
{ }
virtual_row_iterator(const std::vector<bytes>& cf_names, const std::vector<token_range>& ranges, end_iterator_tag)
: _cf_names(std::ref(cf_names))
, _ranges(std::ref(ranges))
, _cf_names_idx(cf_names.size())
, _ranges_idx(ranges.size())
{
if (cf_names.empty() || ranges.empty()) {
// The product of an empty range with any range is an empty range.
// In this case we want the end iterator to be equal to the begin iterator,
// which has _ranges_idx = _cf_names_idx = 0.
_ranges_idx = _cf_names_idx = 0;
}
}
virtual_row_iterator& operator++() {
if (++_ranges_idx == _ranges.get().size() && ++_cf_names_idx < _cf_names.get().size()) {
_ranges_idx = 0;
}
return *this;
}
virtual_row_iterator operator++(int) {
virtual_row_iterator i(*this);
++(*this);
return i;
}
const value_type operator*() const {
return { _cf_names.get()[_cf_names_idx], _ranges.get()[_ranges_idx] };
}
bool operator==(const virtual_row_iterator& i) const {
return _cf_names_idx == i._cf_names_idx
&& _ranges_idx == i._ranges_idx;
}
bool operator!=(const virtual_row_iterator& i) const {
return !(*this == i);
}
};
/**
* Returns the keyspaces, ordered by name, as selected by the partition_range.
*/
static std::vector<sstring> get_keyspaces(const schema& s, const database& db, dht::partition_range range) {
struct keyspace_less_comparator {
const schema& _s;
keyspace_less_comparator(const schema& s) : _s(s) { }
dht::ring_position as_ring_position(const sstring& ks) {
auto pkey = partition_key::from_single_value(_s, utf8_type->decompose(ks));
return dht::global_partitioner().decorate_key(_s, std::move(pkey));
}
bool operator()(const sstring& ks1, const sstring& ks2) {
return as_ring_position(ks1).less_compare(_s, as_ring_position(ks2));
}
bool operator()(const sstring& ks, const dht::ring_position& rp) {
return as_ring_position(ks).less_compare(_s, rp);
}
bool operator()(const dht::ring_position& rp, const sstring& ks) {
return rp.less_compare(_s, as_ring_position(ks));
}
};
auto keyspaces = db.get_non_system_keyspaces();
auto cmp = keyspace_less_comparator(s);
boost::sort(keyspaces, cmp);
return boost::copy_range<std::vector<sstring>>(
range.slice(keyspaces, std::move(cmp)) | boost::adaptors::filtered([&s] (const auto& ks) {
// If this is a range query, results are divided between shards by the partition key (keyspace_name).
return shard_of(dht::global_partitioner().get_token(s,
partition_key::from_single_value(s, utf8_type->decompose(ks))))
== engine().cpu_id();
})
);
}
/**
* Makes a wrapping range of ring_position from a nonwrapping range of token, used to select sstables.
*/
static dht::partition_range as_ring_position_range(dht::token_range& r) {
std::optional<range<dht::ring_position>::bound> start_bound, end_bound;
if (r.start()) {
start_bound = {{ dht::ring_position(r.start()->value(), dht::ring_position::token_bound::start), r.start()->is_inclusive() }};
}
if (r.end()) {
end_bound = {{ dht::ring_position(r.end()->value(), dht::ring_position::token_bound::end), r.end()->is_inclusive() }};
}
return dht::partition_range(std::move(start_bound), std::move(end_bound), r.is_singular());
}
/**
* Add a new range_estimates for the specified range, considering the sstables associated with `cf`.
*/
static system_keyspace::range_estimates estimate(const column_family& cf, const token_range& r) {
int64_t count{0};
utils::estimated_histogram hist{0};
auto from_bytes = [] (auto& b) {
return dht::global_partitioner().from_sstring(utf8_type->to_string(b));
};
dht::token_range_vector ranges;
::compat::unwrap_into(
wrapping_range<dht::token>({{ from_bytes(r.start), false }}, {{ from_bytes(r.end) }}),
dht::token_comparator(),
[&] (auto&& rng) { ranges.push_back(std::move(rng)); });
for (auto&& r : ranges) {
auto rp_range = as_ring_position_range(r);
for (auto&& sstable : cf.select_sstables(rp_range)) {
count += sstable->estimated_keys_for_range(r);
hist.merge(sstable->get_stats_metadata().estimated_partition_size);
}
}
return {cf.schema(), r.start, r.end, count, count > 0 ? hist.mean() : 0};
}
future<std::vector<token_range>> get_local_ranges() {
auto& ss = service::get_local_storage_service();
return ss.get_local_tokens().then([&ss] (auto&& tokens) {
auto ranges = ss.get_token_metadata().get_primary_ranges_for(std::move(tokens));
std::vector<token_range> local_ranges;
auto to_bytes = [](const std::optional<dht::token_range::bound>& b) {
assert(b);
return utf8_type->decompose(dht::global_partitioner().to_sstring(b->value()));
};
// We merge the ranges to be compatible with how Cassandra shows its size estimates table.
// All queries will be on that table, where all entries are text and there's no notion of
// token ranges from the CQL point of view.
auto left_inf = boost::find_if(ranges, [] (auto&& r) {
return !r.start() || r.start()->value() == dht::minimum_token();
});
auto right_inf = boost::find_if(ranges, [] (auto&& r) {
return !r.end() || r.end()->value() == dht::maximum_token();
});
if (left_inf != right_inf && left_inf != ranges.end() && right_inf != ranges.end()) {
local_ranges.push_back(token_range{to_bytes(right_inf->start()), to_bytes(left_inf->end())});
ranges.erase(left_inf);
ranges.erase(right_inf);
}
for (auto&& r : ranges) {
local_ranges.push_back(token_range{to_bytes(r.start()), to_bytes(r.end())});
}
boost::sort(local_ranges, [] (auto&& tr1, auto&& tr2) {
return utf8_type->less(tr1.start, tr2.start);
});
return local_ranges;
});
}
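`get_local_ranges` above stitches the range ending at the ring's maximum token to the range starting at the minimum token, so the wrap-around point shows up as a single entry, matching Cassandra's presentation. A toy sketch of that merge over integer ranges on an assumed ring spanning `MIN_TOK..MAX_TOK`:

```cpp
#include <algorithm>
#include <cassert>
#include <utility>
#include <vector>

constexpr int MIN_TOK = 0, MAX_TOK = 100;  // assumed toy ring bounds

// Merge the range starting at MIN_TOK with the range ending at MAX_TOK
// into one wrap-around entry {start-of-last, end-of-first}, as
// get_local_ranges does for the token ring.
std::vector<std::pair<int, int>> merge_wrap(std::vector<std::pair<int, int>> rs) {
    auto left = std::find_if(rs.begin(), rs.end(),
                             [](const std::pair<int, int>& r) { return r.first == MIN_TOK; });
    auto right = std::find_if(rs.begin(), rs.end(),
                              [](const std::pair<int, int>& r) { return r.second == MAX_TOK; });
    if (left != right && left != rs.end() && right != rs.end()) {
        std::pair<int, int> merged{right->first, left->second};
        // Erase the later element first so the earlier iterator stays valid.
        if (left < right) { rs.erase(right); rs.erase(left); }
        else { rs.erase(left); rs.erase(right); }
        rs.push_back(merged);
    }
    return rs;
}
```

For example, primary ranges `(0,25] (25,60] (60,100]` collapse to `(25,60]` plus the wrapped entry `(60,25]`.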
size_estimates_mutation_reader::size_estimates_mutation_reader(schema_ptr schema, const dht::partition_range& prange, const query::partition_slice& slice, streamed_mutation::forwarding fwd)
: impl(schema)
, _schema(std::move(schema))
, _prange(&prange)
, _slice(slice)
, _fwd(fwd)
{ }
future<> size_estimates_mutation_reader::get_next_partition() {
auto& db = service::get_local_storage_proxy().get_db().local();
if (!_keyspaces) {
_keyspaces = get_keyspaces(*_schema, db, *_prange);
_current_partition = _keyspaces->begin();
}
if (_current_partition == _keyspaces->end()) {
_end_of_stream = true;
return make_ready_future<>();
}
return get_local_ranges().then([&db, this] (auto&& ranges) {
auto estimates = this->estimates_for_current_keyspace(db, std::move(ranges));
auto mutations = db::system_keyspace::make_size_estimates_mutation(*_current_partition, std::move(estimates));
++_current_partition;
std::vector<mutation> ms;
ms.emplace_back(std::move(mutations));
_partition_reader = flat_mutation_reader_from_mutations(std::move(ms), _fwd);
});
}
future<> size_estimates_mutation_reader::fill_buffer(db::timeout_clock::time_point timeout) {
return do_until([this, timeout] { return is_end_of_stream() || is_buffer_full(); }, [this, timeout] {
if (!_partition_reader) {
return get_next_partition();
}
return _partition_reader->consume_pausable([this] (mutation_fragment mf) {
push_mutation_fragment(std::move(mf));
return stop_iteration(is_buffer_full());
}, timeout).then([this] {
if (_partition_reader->is_end_of_stream() && _partition_reader->is_buffer_empty()) {
_partition_reader = std::nullopt;
}
});
});
}
void size_estimates_mutation_reader::next_partition() {
clear_buffer_to_next_partition();
if (is_buffer_empty()) {
_partition_reader = std::nullopt;
}
}
future<> size_estimates_mutation_reader::fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) {
clear_buffer();
_prange = &pr;
_keyspaces = std::nullopt;
_partition_reader = std::nullopt;
_end_of_stream = false;
return make_ready_future<>();
}
future<> size_estimates_mutation_reader::fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) {
forward_buffer_to(pr.start());
_end_of_stream = false;
if (_partition_reader) {
return _partition_reader->fast_forward_to(std::move(pr), timeout);
}
return make_ready_future<>();
}
size_t size_estimates_mutation_reader::buffer_size() const {
if (_partition_reader) {
return flat_mutation_reader::impl::buffer_size() + _partition_reader->buffer_size();
}
return flat_mutation_reader::impl::buffer_size();
}
std::vector<db::system_keyspace::range_estimates>
size_estimates_mutation_reader::estimates_for_current_keyspace(const database& db, std::vector<token_range> local_ranges) const {
// For each specified range, estimate (crudely) mean partition size and partitions count.
auto pkey = partition_key::from_single_value(*_schema, utf8_type->decompose(*_current_partition));
auto cfs = db.find_keyspace(*_current_partition).metadata()->cf_meta_data();
auto cf_names = boost::copy_range<std::vector<bytes>>(cfs | boost::adaptors::transformed([] (auto&& cf) {
return utf8_type->decompose(cf.first);
}));
boost::sort(cf_names, [] (auto&& n1, auto&& n2) {
return utf8_type->less(n1, n2);
});
std::vector<db::system_keyspace::range_estimates> estimates;
for (auto& range : _slice.row_ranges(*_schema, pkey)) {
auto rows = boost::make_iterator_range(
virtual_row_iterator(cf_names, local_ranges),
virtual_row_iterator(cf_names, local_ranges, virtual_row_iterator::end_iterator_tag()));
auto rows_to_estimate = range.slice(rows, virtual_row_comparator(_schema));
for (auto&& r : rows_to_estimate) {
auto& cf = db.find_column_family(*_current_partition, utf8_type->to_string(r.cf_name));
estimates.push_back(estimate(cf, r.tokens));
if (estimates.size() >= _slice.partition_row_limit()) {
return estimates;
}
}
}
return estimates;
}
} // namespace size_estimates
} // namespace db
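The fill_buffer() loop above alternates between opening a reader for the next keyspace partition and draining fragments from it until the output buffer fills or the stream ends; when the sub-reader is exhausted it is dropped so the next iteration opens the next partition. A minimal sketch of that pull loop in Python (hypothetical reader model, no Seastar futures):

```python
def fill_buffer(reader):
    """Pull fragments until the buffer is full or the stream ends.

    `reader` mimics size_estimates_mutation_reader: it owns an optional
    per-partition sub-reader (an iterator here) and a bounded buffer.
    """
    while not (reader.end_of_stream or reader.buffer_full()):
        if reader.partition_reader is None:
            reader.get_next_partition()  # may set end_of_stream instead
            continue
        for frag in reader.partition_reader:
            reader.buffer.append(frag)
            if reader.buffer_full():
                break
        else:
            # Sub-reader exhausted: drop it so the next loop iteration
            # opens the next partition.
            reader.partition_reader = None
```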

@@ -21,33 +21,18 @@
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include <boost/range/adaptor/indirected.hpp>
#include <boost/range/adaptor/map.hpp>
#include <boost/range/adaptor/transformed.hpp>
#include <boost/range/algorithm/find_if.hpp>
#include "clustering_bounds_comparator.hh"
#include "database_fwd.hh"
#include "db/system_keyspace.hh"
#include "dht/i_partitioner.hh"
#include "mutation_reader.hh"
#include "partition_range_compat.hh"
#include "range.hh"
#include "service/storage_service.hh"
#include "mutation_fragment.hh"
#include "sstables/sstables.hh"
#include "db/timeout_clock.hh"
#include "database.hh"
namespace db {
namespace size_estimates {
struct token_range {
bytes start;
bytes end;
};
class size_estimates_mutation_reader final : public flat_mutation_reader::impl {
struct token_range {
bytes start;
bytes end;
};
schema_ptr _schema;
const dht::partition_range* _prange;
const query::partition_slice& _slice;
@@ -57,267 +42,18 @@ class size_estimates_mutation_reader final : public flat_mutation_reader::impl {
streamed_mutation::forwarding _fwd;
flat_mutation_reader_opt _partition_reader;
public:
size_estimates_mutation_reader(schema_ptr schema, const dht::partition_range& prange, const query::partition_slice& slice, streamed_mutation::forwarding fwd)
: impl(schema)
, _schema(std::move(schema))
, _prange(&prange)
, _slice(slice)
, _fwd(fwd)
{ }
size_estimates_mutation_reader(schema_ptr, const dht::partition_range&, const query::partition_slice&, streamed_mutation::forwarding);
virtual future<> fill_buffer(db::timeout_clock::time_point) override;
virtual void next_partition() override;
virtual future<> fast_forward_to(const dht::partition_range&, db::timeout_clock::time_point) override;
virtual future<> fast_forward_to(position_range, db::timeout_clock::time_point) override;
virtual size_t buffer_size() const override;
private:
future<> get_next_partition() {
// For each specified range, estimate (crudely) mean partition size and partitions count.
auto& db = service::get_local_storage_proxy().get_db().local();
if (!_keyspaces) {
_keyspaces = get_keyspaces(*_schema, db, *_prange);
_current_partition = _keyspaces->begin();
}
if (_current_partition == _keyspaces->end()) {
_end_of_stream = true;
return make_ready_future<>();
}
return get_local_ranges().then([&db, this] (auto&& ranges) {
auto estimates = this->estimates_for_current_keyspace(db, std::move(ranges));
auto mutations = db::system_keyspace::make_size_estimates_mutation(*_current_partition, std::move(estimates));
++_current_partition;
std::vector<mutation> ms;
ms.emplace_back(std::move(mutations));
_partition_reader = flat_mutation_reader_from_mutations(std::move(ms), _fwd);
});
}
public:
virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override {
return do_until([this, timeout] { return is_end_of_stream() || is_buffer_full(); }, [this, timeout] {
if (!_partition_reader) {
return get_next_partition();
}
return _partition_reader->consume_pausable([this] (mutation_fragment mf) {
push_mutation_fragment(std::move(mf));
return stop_iteration(is_buffer_full());
}, timeout).then([this] {
if (_partition_reader->is_end_of_stream() && _partition_reader->is_buffer_empty()) {
_partition_reader = std::nullopt;
}
});
});
}
virtual void next_partition() override {
clear_buffer_to_next_partition();
if (is_buffer_empty()) {
_partition_reader = std::nullopt;
}
}
virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override {
clear_buffer();
_prange = &pr;
_keyspaces = std::nullopt;
_partition_reader = std::nullopt;
_end_of_stream = false;
return make_ready_future<>();
}
virtual future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) override {
forward_buffer_to(pr.start());
_end_of_stream = false;
if (_partition_reader) {
return _partition_reader->fast_forward_to(std::move(pr), timeout);
}
return make_ready_future<>();
}
virtual size_t buffer_size() const override {
if (_partition_reader) {
return flat_mutation_reader::impl::buffer_size() + _partition_reader->buffer_size();
}
return flat_mutation_reader::impl::buffer_size();
}
/**
* Returns the primary ranges for the local node.
* Used for testing as well.
*/
static future<std::vector<token_range>> get_local_ranges() {
auto& ss = service::get_local_storage_service();
return ss.get_local_tokens().then([&ss] (auto&& tokens) {
auto ranges = ss.get_token_metadata().get_primary_ranges_for(std::move(tokens));
std::vector<token_range> local_ranges;
auto to_bytes = [](const std::optional<dht::token_range::bound>& b) {
assert(b);
return utf8_type->decompose(dht::global_partitioner().to_sstring(b->value()));
};
// We merge the ranges to be compatible with how Cassandra shows its size estimates table.
// All queries will be on that table, where all entries are text and there's no notion of
// token ranges from the CQL point of view.
auto left_inf = boost::find_if(ranges, [] (auto&& r) {
return !r.start() || r.start()->value() == dht::minimum_token();
});
auto right_inf = boost::find_if(ranges, [] (auto&& r) {
return !r.end() || r.end()->value() == dht::maximum_token();
});
if (left_inf != right_inf && left_inf != ranges.end() && right_inf != ranges.end()) {
local_ranges.push_back(token_range{to_bytes(right_inf->start()), to_bytes(left_inf->end())});
ranges.erase(left_inf);
ranges.erase(right_inf);
}
for (auto&& r : ranges) {
local_ranges.push_back(token_range{to_bytes(r.start()), to_bytes(r.end())});
}
boost::sort(local_ranges, [] (auto&& tr1, auto&& tr2) {
return utf8_type->less(tr1.start, tr2.start);
});
return local_ranges;
});
}
private:
struct virtual_row {
const bytes& cf_name;
const token_range& tokens;
clustering_key_prefix as_key() const {
return clustering_key_prefix::from_exploded(std::vector<bytes_view>{cf_name, tokens.start, tokens.end});
}
};
struct virtual_row_comparator {
schema_ptr _schema;
virtual_row_comparator(schema_ptr schema) : _schema(schema) { }
bool operator()(const clustering_key_prefix& key1, const clustering_key_prefix& key2) {
return clustering_key_prefix::prefix_equality_less_compare(*_schema)(key1, key2);
}
bool operator()(const virtual_row& row, const clustering_key_prefix& key) {
return operator()(row.as_key(), key);
}
bool operator()(const clustering_key_prefix& key, const virtual_row& row) {
return operator()(key, row.as_key());
}
};
class virtual_row_iterator : public std::iterator<std::input_iterator_tag, const virtual_row> {
std::reference_wrapper<const std::vector<bytes>> _cf_names;
std::reference_wrapper<const std::vector<token_range>> _ranges;
size_t _cf_names_idx = 0;
size_t _ranges_idx = 0;
public:
struct end_iterator_tag {};
virtual_row_iterator(const std::vector<bytes>& cf_names, const std::vector<token_range>& ranges)
: _cf_names(std::ref(cf_names))
, _ranges(std::ref(ranges))
{ }
virtual_row_iterator(const std::vector<bytes>& cf_names, const std::vector<token_range>& ranges, end_iterator_tag)
: _cf_names(std::ref(cf_names))
, _ranges(std::ref(ranges))
, _cf_names_idx(cf_names.size())
, _ranges_idx(ranges.size())
{ }
virtual_row_iterator& operator++() {
if (++_ranges_idx == _ranges.get().size() && ++_cf_names_idx < _cf_names.get().size()) {
_ranges_idx = 0;
}
return *this;
}
virtual_row_iterator operator++(int) {
virtual_row_iterator i(*this);
++(*this);
return i;
}
const value_type operator*() const {
return { _cf_names.get()[_cf_names_idx], _ranges.get()[_ranges_idx] };
}
bool operator==(const virtual_row_iterator& i) const {
return _cf_names_idx == i._cf_names_idx
&& _ranges_idx == i._ranges_idx;
}
bool operator!=(const virtual_row_iterator& i) const {
return !(*this == i);
}
};
future<> get_next_partition();
std::vector<db::system_keyspace::range_estimates>
estimates_for_current_keyspace(const database& db, std::vector<token_range> local_ranges) const {
auto pkey = partition_key::from_single_value(*_schema, utf8_type->decompose(*_current_partition));
auto cfs = db.find_keyspace(*_current_partition).metadata()->cf_meta_data();
auto cf_names = boost::copy_range<std::vector<bytes>>(cfs | boost::adaptors::transformed([] (auto&& cf) {
return utf8_type->decompose(cf.first);
}));
boost::sort(cf_names, [] (auto&& n1, auto&& n2) {
return utf8_type->less(n1, n2);
});
std::vector<db::system_keyspace::range_estimates> estimates;
for (auto& range : _slice.row_ranges(*_schema, pkey)) {
auto rows = boost::make_iterator_range(
virtual_row_iterator(cf_names, local_ranges),
virtual_row_iterator(cf_names, local_ranges, virtual_row_iterator::end_iterator_tag()));
auto rows_to_estimate = range.slice(rows, virtual_row_comparator(_schema));
for (auto&& r : rows_to_estimate) {
auto& cf = db.find_column_family(*_current_partition, utf8_type->to_string(r.cf_name));
estimates.push_back(estimate(cf, r.tokens));
if (estimates.size() >= _slice.partition_row_limit()) {
return estimates;
}
}
}
return estimates;
}
/**
* Returns the keyspaces, ordered by name, as selected by the partition_range.
*/
static ks_range get_keyspaces(const schema& s, const database& db, dht::partition_range range) {
struct keyspace_less_comparator {
const schema& _s;
keyspace_less_comparator(const schema& s) : _s(s) { }
dht::ring_position as_ring_position(const sstring& ks) {
auto pkey = partition_key::from_single_value(_s, utf8_type->decompose(ks));
return dht::global_partitioner().decorate_key(_s, std::move(pkey));
}
bool operator()(const sstring& ks1, const sstring& ks2) {
return as_ring_position(ks1).less_compare(_s, as_ring_position(ks2));
}
bool operator()(const sstring& ks, const dht::ring_position& rp) {
return as_ring_position(ks).less_compare(_s, rp);
}
bool operator()(const dht::ring_position& rp, const sstring& ks) {
return rp.less_compare(_s, as_ring_position(ks));
}
};
auto keyspaces = db.get_non_system_keyspaces();
auto cmp = keyspace_less_comparator(s);
boost::sort(keyspaces, cmp);
return boost::copy_range<ks_range>(range.slice(keyspaces, std::move(cmp)));
}
/**
* Makes a wrapping range of ring_position from a nonwrapping range of token, used to select sstables.
*/
static dht::partition_range as_ring_position_range(dht::token_range& r) {
std::optional<range<dht::ring_position>::bound> start_bound, end_bound;
if (r.start()) {
start_bound = {{ dht::ring_position(r.start()->value(), dht::ring_position::token_bound::start), r.start()->is_inclusive() }};
}
if (r.end()) {
end_bound = {{ dht::ring_position(r.end()->value(), dht::ring_position::token_bound::end), r.end()->is_inclusive() }};
}
return dht::partition_range(std::move(start_bound), std::move(end_bound), r.is_singular());
}
/**
* Add a new range_estimates for the specified range, considering the sstables associated with `cf`.
*/
static system_keyspace::range_estimates estimate(const column_family& cf, const token_range& r) {
int64_t count{0};
utils::estimated_histogram hist{0};
auto from_bytes = [] (auto& b) {
return dht::global_partitioner().from_sstring(utf8_type->to_string(b));
};
dht::token_range_vector ranges;
::compat::unwrap_into(
wrapping_range<dht::token>({{ from_bytes(r.start), false }}, {{ from_bytes(r.end) }}),
dht::token_comparator(),
[&] (auto&& rng) { ranges.push_back(std::move(rng)); });
for (auto&& r : ranges) {
auto rp_range = as_ring_position_range(r);
for (auto&& sstable : cf.select_sstables(rp_range)) {
count += sstable->estimated_keys_for_range(r);
hist.merge(sstable->get_stats_metadata().estimated_partition_size);
}
}
return {cf.schema(), r.start, r.end, count, count > 0 ? hist.mean() : 0};
}
estimates_for_current_keyspace(const database&, std::vector<token_range> local_ranges) const;
};
struct virtual_reader {
@@ -332,6 +68,12 @@ struct virtual_reader {
}
};
/**
* Returns the primary ranges for the local node.
* Used for testing as well.
*/
future<std::vector<token_range>> get_local_ranges();
} // namespace size_estimates
} // namespace db
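get_local_ranges() above merges the range starting at the minimum token with the range ending at the maximum token into one wrap-around entry, so that the text-only table matches how Cassandra presents its size estimates. The merge step in isolation (Python sketch, hypothetical (start, end) tuples with None for an open bound):

```python
def merge_wraparound(ranges):
    """Merge the ranges touching ring start/end into one wrap-around range.

    Each range is a (start, end) token pair; None marks an open bound,
    mirroring the left_inf/right_inf handling in get_local_ranges().
    """
    left = next((r for r in ranges if r[0] is None), None)   # starts at ring minimum
    right = next((r for r in ranges if r[1] is None), None)  # ends at ring maximum
    out = [r for r in ranges if r is not left and r is not right]
    if left is not None and right is not None and left is not right:
        # (right.start, left.end] wraps around the ring.
        out.append((right[0], left[1]))
    elif left is not None:
        out.append(left)
    elif right is not None:
        out.append(right)
    return sorted(out, key=lambda r: r[0])
```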

@@ -44,6 +44,11 @@ namespace db::view {
// columns. When reading the results from the scylla_views_builds_in_progress
// table, we adjust the clustering key (we shed the cpu_id column) and map
// back the regular columns.
// Since mutation fragment consumers expect clustering_row fragments
// not to be duplicated for given primary key, previous clustering key
// is stored between mutation fragments. If the clustering key becomes
// the same as the previous one (as a result of trimming cpu_id),
// the duplicated fragment is ignored.
class build_progress_virtual_reader {
database& _db;
@@ -55,6 +60,7 @@ class build_progress_virtual_reader {
const query::partition_slice& _legacy_slice;
query::partition_slice _slice;
flat_mutation_reader _underlying;
std::optional<clustering_key> _previous_clustering_key;
build_progress_reader(
schema_ptr legacy_schema,
@@ -79,7 +85,8 @@ class build_progress_virtual_reader {
pc,
std::move(trace_state),
fwd,
fwd_mr)) {
fwd_mr))
, _previous_clustering_key() {
}
const schema& underlying_schema() const {
@@ -127,8 +134,13 @@ class build_progress_virtual_reader {
legacy_in_progress_row.append_cell(_legacy_generation_number_col, std::move(c));
}
});
auto ck = adjust_ckey(scylla_in_progress_row.key());
if (_previous_clustering_key && ck.equal(*_schema, *_previous_clustering_key)) {
continue;
}
_previous_clustering_key = ck;
mf = clustering_row(
adjust_ckey(scylla_in_progress_row.key()),
std::move(ck),
std::move(scylla_in_progress_row.tomb()),
std::move(scylla_in_progress_row.marker()),
std::move(legacy_in_progress_row));
@@ -140,6 +152,8 @@ class build_progress_virtual_reader {
adjust_ckey(scylla_in_progress_rt.end),
scylla_in_progress_rt.end_kind,
scylla_in_progress_rt.tomb);
} else if (mf.is_end_of_partition()) {
_previous_clustering_key.reset();
}
push_mutation_fragment(std::move(mf));
}
@@ -192,4 +206,4 @@ public:
}
};
}
}
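The reader change above suppresses the duplicate clustering rows that appear once the cpu_id key component is trimmed: it remembers the previous clustering key and drops any row whose trimmed key repeats it, resetting the state at each partition end. The same filter reduced to a Python sketch (hypothetical fragment model):

```python
def dedup_rows(fragments):
    """Drop clustering rows whose (trimmed) key repeats the previous one.

    `fragments` is a list of ('row', key) and ('end_of_partition', None)
    pairs; the previous-key state resets at each partition end, just
    like _previous_clustering_key in build_progress_reader.
    """
    prev = None
    out = []
    for kind, key in fragments:
        if kind == 'row':
            if prev is not None and key == prev:
                continue  # duplicate after trimming cpu_id: ignored
            prev = key
        elif kind == 'end_of_partition':
            prev = None
        out.append((kind, key))
    return out
```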

@@ -83,7 +83,7 @@ view_info::view_info(const schema& schema, const raw_view_info& raw_view_info)
cql3::statements::select_statement& view_info::select_statement() const {
if (!_select_statement) {
shared_ptr<cql3::statements::raw::select_statement> raw;
if (is_index()) {
if (service::get_local_storage_service().db().local().find_column_family(base_id()).get_index_manager().is_global_index(_schema)) {
// Token column is the first clustering column
auto token_column_it = boost::range::find_if(_schema.all_columns(), std::mem_fn(&column_definition::is_clustering_key));
auto real_columns = _schema.all_columns() | boost::adaptors::filtered([this, token_column_it](const column_definition& cdef) {
@@ -143,10 +143,9 @@ void view_info::initialize_base_dependent_fields(const schema& base) {
}
bool view_info::is_index() const {
if (!_is_index) {
_is_index = service::get_local_storage_service().db().local().find_column_family(base_id()).get_index_manager().is_index(_schema);
}
return *_is_index;
//TODO(sarna): result of this call can be cached instead of calling index_manager::is_index every time
column_family& base_cf = service::get_local_storage_service().db().local().find_column_family(base_id());
return base_cf.get_index_manager().is_index(view_ptr(_schema.shared_from_this()));
}
namespace db {
@@ -450,7 +449,7 @@ void create_virtual_column(schema_builder& builder, const bytes& name, const dat
// A map has keys and values. We don't need these values,
// and can use empty values instead.
auto mtype = dynamic_pointer_cast<const map_type_impl>(type);
builder.with_column(name, map_type_impl::get_instance(mtype->get_values_type(), empty_type, true), column_kind::regular_column, column_view_virtual::yes);
builder.with_column(name, map_type_impl::get_instance(mtype->get_keys_type(), empty_type, true), column_kind::regular_column, column_view_virtual::yes);
} else if (ctype->is_set()) {
// A set's cell has nothing beyond the keys, so the
// virtual version of a set is, unfortunately, a complete
@@ -1158,6 +1157,10 @@ future<> view_builder::stop() {
return _sem.wait().then([this] {
_sem.broken();
return _build_step.join();
}).handle_exception_type([] (const broken_semaphore&) {
// ignored
}).handle_exception_type([] (const semaphore_timed_out&) {
// ignored
});
});
}
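view_builder::stop() above breaks the semaphore to wake the build step and then ignores the broken_semaphore / semaphore_timed_out exceptions that the teardown itself provokes, while still letting anything unexpected propagate. The shutdown pattern, as a Python sketch (hypothetical worker type):

```python
def stop(worker, expected_exceptions):
    """Tear down a worker, swallowing only the exceptions the teardown
    is expected to provoke (cf. broken_semaphore / semaphore_timed_out
    in view_builder::stop()); anything else propagates.
    """
    worker.abort()  # analogous to breaking the semaphore
    try:
        worker.join()
    except expected_exceptions:
        pass  # expected during shutdown: ignored
```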

@@ -24,7 +24,9 @@
namespace db::view {
future<> view_update_generator::start() {
_started = seastar::async([this]() mutable {
thread_attributes attr;
attr.sched_group = _db.get_streaming_scheduling_group();
_started = seastar::async(std::move(attr), [this]() mutable {
while (!_as.abort_requested()) {
if (_sstables_with_tables.empty()) {
_pending_sstables.wait().get();

dist/ami/build_ami.sh vendored
@@ -1,6 +1,7 @@
#!/bin/bash -e
PRODUCT=$(cat SCYLLA-PRODUCT-FILE)
./SCYLLA-VERSION-GEN
PRODUCT=$(cat build/SCYLLA-PRODUCT-FILE)
if [ ! -e dist/ami/build_ami.sh ]; then
echo "run build_ami.sh in top of scylla dir"
@@ -16,6 +17,7 @@ print_usage() {
exit 1
}
LOCALRPM=0
REPO_FOR_INSTALL=
while [ $# -gt 0 ]; do
case "$1" in
"--localrpm")
@@ -23,10 +25,12 @@ while [ $# -gt 0 ]; do
shift 1
;;
"--repo")
REPO_FOR_INSTALL=$2
INSTALL_ARGS="$INSTALL_ARGS --repo $2"
shift 2
;;
"--repo-for-install")
REPO_FOR_INSTALL=$2
INSTALL_ARGS="$INSTALL_ARGS --repo-for-install $2"
shift 2
;;
@@ -123,6 +127,43 @@ if [ $LOCALRPM -eq 1 ]; then
cd ../..
cp build/$PRODUCT-ami/build/RPMS/noarch/$PRODUCT-ami-`cat build/$PRODUCT-ami/build/SCYLLA-VERSION-FILE`-`cat build/$PRODUCT-ami/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/$PRODUCT-ami.noarch.rpm
fi
if [ ! -f dist/ami/files/$PRODUCT-python3.x86_64.rpm ]; then
reloc/python3/build_reloc.sh
reloc/python3/build_rpm.sh
cp build/redhat/RPMS/x86_64/$PRODUCT-python3*.x86_64.rpm dist/ami/files/$PRODUCT-python3.x86_64.rpm
fi
SCYLLA_VERSION=$(rpm -q --qf %{VERSION}-%{RELEASE} dist/ami/files/$PRODUCT.x86_64.rpm || true)
SCYLLA_AMI_VERSION=$(rpm -q --qf %{VERSION}-%{RELEASE} dist/ami/files/$PRODUCT-ami.noarch.rpm || true)
SCYLLA_JMX_VERSION=$(rpm -q --qf %{VERSION}-%{RELEASE} dist/ami/files/$PRODUCT-jmx.noarch.rpm || true)
SCYLLA_TOOLS_VERSION=$(rpm -q --qf %{VERSION}-%{RELEASE} dist/ami/files/$PRODUCT-tools.noarch.rpm || true)
SCYLLA_PYTHON3_VERSION=$(rpm -q --qf %{VERSION}-%{RELEASE} dist/ami/files/$PRODUCT-python3.x86_64.rpm || true)
else
if [ -z "$REPO_FOR_INSTALL" ]; then
print_usage
exit 1
fi
if [ ! -f /usr/bin/yumdownloader ]; then
if is_redhat_variant; then
sudo yum install /usr/bin/yumdownloader
else
sudo apt-get install yum-utils
fi
fi
if [ ! -f /usr/bin/curl ]; then
pkg_install curl
fi
TMPREPO=$(mktemp -u -p /etc/yum.repos.d/ --suffix .repo)
sudo curl -o $TMPREPO $REPO_FOR_INSTALL
rm -rf build/ami_packages
mkdir -p build/ami_packages
yumdownloader --downloaddir build/ami_packages/ $PRODUCT $PRODUCT-kernel-conf $PRODUCT-conf $PRODUCT-server $PRODUCT-debuginfo $PRODUCT-ami $PRODUCT-jmx $PRODUCT-tools-core $PRODUCT-tools $PRODUCT-python3
sudo rm -f $TMPREPO
SCYLLA_VERSION=$(rpm -q --qf %{VERSION}-%{RELEASE} build/ami_packages/$PRODUCT-[0-9]*.rpm || true)
SCYLLA_AMI_VERSION=$(rpm -q --qf %{VERSION}-%{RELEASE} build/ami_packages/$PRODUCT-ami-*.rpm || true)
SCYLLA_JMX_VERSION=$(rpm -q --qf %{VERSION}-%{RELEASE} build/ami_packages/$PRODUCT-jmx-*.rpm || true)
SCYLLA_TOOLS_VERSION=$(rpm -q --qf %{VERSION}-%{RELEASE} build/ami_packages/$PRODUCT-tools-[0-9]*.rpm || true)
SCYLLA_PYTHON3_VERSION=$(rpm -q --qf %{VERSION}-%{RELEASE} build/ami_packages/$PRODUCT-python3-*.rpm || true)
fi
cd dist/ami
@@ -147,4 +188,4 @@ if [ ! -d packer ]; then
cd -
fi
env PACKER_LOG=1 PACKER_LOG_PATH=../../build/ami.log packer/packer build -var-file=variables.json -var install_args="$INSTALL_ARGS" -var region="$REGION" -var source_ami="$AMI" -var ssh_username="$SSH_USERNAME" scylla.json
env PACKER_LOG=1 PACKER_LOG_PATH=../../build/ami.log packer/packer build -var-file=variables.json -var install_args="$INSTALL_ARGS" -var region="$REGION" -var source_ami="$AMI" -var ssh_username="$SSH_USERNAME" -var scylla_version="$SCYLLA_VERSION" -var scylla_ami_version="$SCYLLA_AMI_VERSION" -var scylla_jmx_version="$SCYLLA_JMX_VERSION" -var scylla_tools_version="$SCYLLA_TOOLS_VERSION" -var scylla_python3_version="$SCYLLA_PYTHON3_VERSION" scylla.json

dist/ami/scylla.json vendored
@@ -56,7 +56,15 @@
"ssh_username": "{{user `ssh_username`}}",
"subnet_id": "{{user `subnet_id`}}",
"type": "amazon-ebs",
"user_data_file": "user_data.txt"
"user_data_file": "user_data.txt",
"ami_description": "scylla-{{user `scylla_version`}} scylla-ami-{{user `scylla_ami_version`}} scylla-jmx-{{user `scylla_jmx_version`}} scylla-tools-{{user `scylla_tools_version`}} scylla-python3-{{user `scylla_python3_version`}}",
"tags": {
"ScyllaVersion": "{{user `scylla_version`}}",
"ScyllaAMIVersion": "{{user `scylla_ami_version`}}",
"ScyllaJMXVersion": "{{user `scylla_jmx_version`}}",
"ScyllaToolsVersion": "{{user `scylla_tools_version`}}",
"ScyllaPython3Version": "{{user `scylla_python3_version`}}"
}
}
],
"provisioners": [

@@ -60,6 +60,17 @@ if __name__ == "__main__":
disk_properties["read_bandwidth"] = 2015342735 * nr_disks
disk_properties["write_iops"] = 181500 * nr_disks
disk_properties["write_bandwidth"] = 808775652 * nr_disks
elif idata.instance_class() == "i3en":
if idata.instance() in ("i3en.large", "i3en.xlarge", "i3en.2xlarge"):
disk_properties["read_iops"] = 46489
disk_properties["read_bandwidth"] = 353437280
disk_properties["write_iops"] = 36680
disk_properties["write_bandwidth"] = 164766656
else:
disk_properties["read_iops"] = 278478 * nr_disks
disk_properties["read_bandwidth"] = 3029172992 * nr_disks
disk_properties["write_iops"] = 221909 * nr_disks
disk_properties["write_bandwidth"] = 1020482432 * nr_disks
elif idata.instance_class() == "i2":
disk_properties["read_iops"] = 64000 * nr_disks
disk_properties["read_bandwidth"] = 507338935 * nr_disks

@@ -95,6 +95,9 @@ def do_verify_package(pkg):
res = run('rpm -q {}'.format(pkg), silent=True, exception=False)
elif is_gentoo_variant():
res = 0 if len(glob.glob('/var/db/pkg/*/{}-*'.format(pkg))) else 1
else:
print("OS variant not recognized")
res = 1
if res != 0:
print('{} package is not installed.'.format(pkg))
sys.exit(1)
@@ -252,22 +255,22 @@ if __name__ == '__main__':
if not os.path.exists('/etc/scylla.d/housekeeping.cfg'):
version_check = interactive_ask_service('Do you want to enable Scylla to check if there is a newer version of Scylla available?', 'Yes - start the Scylla-housekeeping service to check for a newer version. This check runs periodically. No - skips this step.', version_check)
args.no_version_check = not version_check
if version_check:
with open('/etc/scylla.d/housekeeping.cfg', 'w') as f:
f.write('[housekeeping]\ncheck-version: True\n')
if is_systemd():
systemd_unit('scylla-housekeeping-daily.timer').unmask()
systemd_unit('scylla-housekeeping-restart.timer').unmask()
else:
with open('/etc/scylla.d/housekeeping.cfg', 'w') as f:
f.write('[housekeeping]\ncheck-version: False\n')
if is_systemd():
hk_daily = systemd_unit('scylla-housekeeping-daily.timer')
hk_daily.mask()
hk_daily.stop()
hk_restart = systemd_unit('scylla-housekeeping-restart.timer')
hk_restart.mask()
hk_restart.stop()
if version_check:
with open('/etc/scylla.d/housekeeping.cfg', 'w') as f:
f.write('[housekeeping]\ncheck-version: True\n')
if is_systemd():
systemd_unit('scylla-housekeeping-daily.timer').unmask()
systemd_unit('scylla-housekeeping-restart.timer').unmask()
else:
with open('/etc/scylla.d/housekeeping.cfg', 'w') as f:
f.write('[housekeeping]\ncheck-version: False\n')
if is_systemd():
hk_daily = systemd_unit('scylla-housekeeping-daily.timer')
hk_daily.mask()
hk_daily.stop()
hk_restart = systemd_unit('scylla-housekeeping-restart.timer')
hk_restart.mask()
hk_restart.stop()
cur_version=out('scylla --version', exception=False)
if len(cur_version) > 0:

@@ -119,7 +119,7 @@ class aws_instance:
return self._type.split(".")[0]
def is_supported_instance_class(self):
if self.instance_class() in ['i2', 'i3']:
if self.instance_class() in ['i2', 'i3', 'i3en']:
return True
return False
@@ -128,7 +128,7 @@ class aws_instance:
instance_size = self.instance_size()
if instance_class in ['c3', 'c4', 'd2', 'i2', 'r3']:
return 'ixgbevf'
if instance_class in ['c5', 'c5d', 'f1', 'g3', 'h1', 'i3', 'm5', 'm5d', 'p2', 'p3', 'r4', 'x1']:
if instance_class in ['c5', 'c5d', 'f1', 'g3', 'h1', 'i3', 'i3en', 'm5', 'm5d', 'p2', 'p3', 'r4', 'x1']:
return 'ena'
if instance_class == 'm4':
if instance_size == '16xlarge':
@@ -304,7 +304,7 @@ def parse_os_release_line(line):
val = shlex.split(data)[0]
return (id, val.split(' ') if id == 'ID' or id == 'ID_LIKE' else val)
os_release = dict([parse_os_release_line(x) for x in open('/etc/os-release').read().splitlines()])
os_release = dict([parse_os_release_line(x) for x in open('/etc/os-release').read().splitlines() if re.match(r'\w+=', x) ])
def is_debian_variant():
d = os_release['ID_LIKE'] if 'ID_LIKE' in os_release else os_release['ID']
@@ -313,7 +313,7 @@ def is_debian_variant():
def is_redhat_variant():
d = os_release['ID_LIKE'] if 'ID_LIKE' in os_release else os_release['ID']
return ('rhel' in d) or ('fedora' in d)
return ('rhel' in d) or ('fedora' in d) or ('ol' in d)
def is_gentoo_variant():
return ('gentoo' in os_release['ID'])
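The scylla_setup change above hardens /etc/os-release parsing by skipping any line that does not look like a KEY=value assignment (the `re.match(r'\w+=', x)` filter), so blank lines and comments no longer break the parser. A standalone sketch of the same parsing logic:

```python
import re
import shlex

def parse_os_release(text):
    """Parse /etc/os-release content into a dict.

    ID and ID_LIKE are split into word lists, as scylla_setup does;
    lines without a KEY= prefix (comments, blanks) are skipped.
    """
    release = {}
    for line in text.splitlines():
        if not re.match(r'\w+=', line):
            continue
        key, _, data = line.partition('=')
        val = shlex.split(data)[0] if data else ''
        release[key] = val.split(' ') if key in ('ID', 'ID_LIKE') else val
    return release
```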
@@ -476,6 +476,8 @@ def create_perftune_conf(nic='eth0'):
def is_valid_nic(nic):
if len(nic) == 0:
return False
return os.path.exists('/sys/class/net/{}'.format(nic))
# Remove this when we do not support SET_NIC configuration value anymore

@@ -16,7 +16,7 @@ Conflicts: {{product}}-server (<< 1.1)
Package: {{product}}-server
Architecture: amd64
Depends: ${shlibs:Depends}, ${misc:Depends}, adduser, hwloc-nox, {{product}}-conf, python-yaml, python-urwid, python-requests, curl, util-linux, python3-yaml, python3, uuid-runtime, pciutils, python3-pyudev, gzip, realpath | coreutils, num-utils, file
Depends: ${shlibs:Depends}, ${misc:Depends}, adduser, hwloc-nox, {{product}}-conf, {{product}}-python3, curl, util-linux, uuid-runtime, pciutils, gzip, realpath | coreutils, num-utils, file
Description: Scylla database server binaries
Scylla is a highly scalable, eventually consistent, distributed,
partitioned row DB.

dist/debian/debian/adjust_bin vendored Executable file
@@ -0,0 +1,30 @@
#!/bin/bash -ex
root="$1"
bin="$2"
prefix="/opt/scylladb"
[ "$bin" = patchelf ] && exit 0
patchelf() {
# patchelf comes from the build system, so it needs the build system's ld.so and
# shared libraries. We can't use patchelf on patchelf itself, so invoke it via
# ld.so.
LD_LIBRARY_PATH="$root/$prefix/bin/libreloc" "$root/$prefix"/libreloc/ld.so "$root/$prefix"/libexec/patchelf "$@"
}
# We could add --set-rpath too, but then debugedit (called by rpmbuild) barfs
# on the result. So use LD_LIBRARY_PATH in the thunk, below.
patchelf \
--set-interpreter "$prefix/libreloc/ld.so" \
"$root/$prefix/libexec/$bin"
mkdir -p "$root/$prefix/bin"
cat > "$root/$prefix/bin/$bin" <<EOF
#!/bin/bash -e
export GNUTLS_SYSTEM_PRIORITY_FILE="\${GNUTLS_SYSTEM_PRIORITY_FILE-$prefix/libreloc/gnutls.config}"
export LD_LIBRARY_PATH="$prefix/libreloc"
exec -a "\$0" "$prefix/libexec/$bin" "\$@"
EOF
chmod +x "$root/$prefix/bin/$bin"

dist/debian/python3/build_deb.sh vendored Executable file
@@ -0,0 +1,140 @@
#!/bin/bash -e
PRODUCT=$(cat SCYLLA-PRODUCT-FILE)
. /etc/os-release
print_usage() {
echo "build_deb.sh --reloc-pkg build/release/scylla-python3-package.tar.gz"
echo " --reloc-pkg specify relocatable package path"
exit 1
}
TARGET=stable
RELOC_PKG=
while [ $# -gt 0 ]; do
case "$1" in
"--reloc-pkg")
RELOC_PKG=$2
shift 2
;;
*)
print_usage
;;
esac
done
is_redhat_variant() {
[ -f /etc/redhat-release ]
}
is_debian_variant() {
[ -f /etc/debian_version ]
}
pkg_install() {
if is_redhat_variant; then
sudo yum install -y $1
elif is_debian_variant; then
sudo apt-get install -y $1
else
echo "Requires to install following command: $1"
exit 1
fi
}
if [ ! -e SCYLLA-RELOCATABLE-FILE ]; then
echo "do not directly execute build_deb.sh, use reloc/build_deb.sh instead."
exit 1
fi
if [ "$(arch)" != "x86_64" ]; then
echo "Unsupported architecture: $(arch)"
exit 1
fi
if [ -z "$RELOC_PKG" ]; then
print_usage
exit 1
fi
if [ ! -f "$RELOC_PKG" ]; then
echo "$RELOC_PKG is not found."
exit 1
fi
if [ -e debian ]; then
rm -rf debian
fi
if is_debian_variant; then
sudo apt-get -y update
fi
# this hack is needed since some environments install the 'git-core' package; it's
# a subset of the git command and doesn't work for our git-archive-all script.
if is_redhat_variant && [ ! -f /usr/libexec/git-core/git-submodule ]; then
sudo yum install -y git
fi
if [ ! -f /usr/bin/git ]; then
pkg_install git
fi
if [ ! -f /usr/bin/python ]; then
pkg_install python
fi
if [ ! -f /usr/bin/debuild ]; then
pkg_install devscripts
fi
if [ ! -f /usr/bin/dh_testdir ]; then
pkg_install debhelper
fi
if [ ! -f /usr/bin/fakeroot ]; then
pkg_install fakeroot
fi
if [ ! -f /usr/bin/pystache ]; then
if is_redhat_variant; then
sudo yum install -y /usr/bin/pystache
elif is_debian_variant; then
sudo apt-get install -y python-pystache
fi
fi
if [ ! -f /usr/bin/file ]; then
pkg_install file
fi
if is_debian_variant && [ ! -f /usr/share/doc/python-pkg-resources/copyright ]; then
sudo apt-get install -y python-pkg-resources
fi
if [ "$ID" = "ubuntu" ] && [ ! -f /usr/share/keyrings/debian-archive-keyring.gpg ]; then
sudo apt-get install -y debian-archive-keyring
fi
if [ "$ID" = "debian" ] && [ ! -f /usr/share/keyrings/ubuntu-archive-keyring.gpg ]; then
sudo apt-get install -y ubuntu-archive-keyring
fi
if [ -z "$TARGET" ]; then
if is_debian_variant; then
if [ ! -f /usr/bin/lsb_release ]; then
pkg_install lsb-release
fi
TARGET=`lsb_release -c|awk '{print $2}'`
else
echo "Please specify target"
exit 1
fi
fi
RELOC_PKG_FULLPATH=$(readlink -f $RELOC_PKG)
RELOC_PKG_BASENAME=$(basename $RELOC_PKG)
SCYLLA_VERSION=$(cat SCYLLA-VERSION-FILE)
SCYLLA_RELEASE=$(cat SCYLLA-RELEASE-FILE)
ln -fv $RELOC_PKG_FULLPATH ../$PRODUCT-python3_$SCYLLA_VERSION-$SCYLLA_RELEASE.orig.tar.gz
cp -al dist/debian/python3/debian debian
if [ "$PRODUCT" != "scylla" ]; then
for i in debian/scylla-*;do
mv $i ${i/scylla-/$PRODUCT-}
done
fi
REVISION="1"
MUSTACHE_DIST="\"debian\": true, \"product\": \"$PRODUCT\", \"$PRODUCT\": true"
pystache dist/debian/python3/changelog.mustache "{ $MUSTACHE_DIST, \"version\": \"$SCYLLA_VERSION\", \"release\": \"$SCYLLA_RELEASE\", \"revision\": \"$REVISION\", \"codename\": \"$TARGET\" }" > debian/changelog
pystache dist/debian/python3/rules.mustache "{ $MUSTACHE_DIST }" > debian/rules
pystache dist/debian/python3/control.mustache "{ $MUSTACHE_DIST }" > debian/control
chmod a+rx debian/rules
debuild -rfakeroot -us -uc

@@ -0,0 +1,5 @@
{{product}}-python3 ({{version}}-{{release}}-{{revision}}) {{codename}}; urgency=medium
* Initial release.
-- Takuya ASADA <syuu@scylladb.com> Mon, 24 Aug 2015 09:22:55 +0000

dist/debian/python3/control.mustache vendored Normal file
@@ -0,0 +1,16 @@
Source: {{product}}-python3
Maintainer: Takuya ASADA <syuu@scylladb.com>
Homepage: http://scylladb.com
Section: python
Priority: optional
X-Python3-Version: >= 3.4
Standards-Version: 3.9.5

Package: {{product}}-python3
Architecture: amd64
Description: A standalone python3 interpreter that can be moved around different Linux machines
 This is a self-contained python interpreter that can be moved around
 different Linux machines as long as they run a new enough kernel (where
 new enough is defined by whichever Python module uses any kernel
 functionality). All shared libraries needed for the interpreter to
 operate are shipped with it.

1
dist/debian/python3/debian/compat vendored Normal file

@@ -0,0 +1 @@
9

995
dist/debian/python3/debian/copyright vendored Normal file

@@ -0,0 +1,995 @@
This package was put together by Klee Dienes <klee@debian.org> from
sources from ftp.python.org:/pub/python, based on the Debianization by
the previous maintainers Bernd S. Brentrup <bsb@uni-muenster.de> and
Bruce Perens. Current maintainer is Matthias Klose <doko@debian.org>.
It was downloaded from http://python.org/
Copyright:
Upstream Author: Guido van Rossum <guido@cwi.nl> and others.
License:
The following text includes the Python license and licenses and
acknowledgements for incorporated software. The licenses can be read
in the HTML and texinfo versions of the documentation as well, after
installing the pythonx.y-doc package. Licenses for files not licensed
under the Python Licenses are found at the end of this file.
Python License
==============
A. HISTORY OF THE SOFTWARE
==========================
Python was created in the early 1990s by Guido van Rossum at Stichting
Mathematisch Centrum (CWI, see http://www.cwi.nl) in the Netherlands
as a successor of a language called ABC. Guido remains Python's
principal author, although it includes many contributions from others.
In 1995, Guido continued his work on Python at the Corporation for
National Research Initiatives (CNRI, see http://www.cnri.reston.va.us)
in Reston, Virginia where he released several versions of the
software.
In May 2000, Guido and the Python core development team moved to
BeOpen.com to form the BeOpen PythonLabs team. In October of the same
year, the PythonLabs team moved to Digital Creations (now Zope
Corporation, see http://www.zope.com). In 2001, the Python Software
Foundation (PSF, see http://www.python.org/psf/) was formed, a
non-profit organization created specifically to own Python-related
Intellectual Property. Zope Corporation is a sponsoring member of
the PSF.
All Python releases are Open Source (see http://www.opensource.org for
the Open Source Definition). Historically, most, but not all, Python
releases have also been GPL-compatible; the table below summarizes
the various releases.
    Release         Derived from    Year        Owner       GPL-compatible? (1)

    0.9.0 thru 1.2                  1991-1995   CWI         yes
    1.3 thru 1.5.2  1.2             1995-1999   CNRI        yes
    1.6             1.5.2           2000        CNRI        no
    2.0             1.6             2000        BeOpen.com  no
    1.6.1           1.6             2001        CNRI        yes (2)
    2.1             2.0+1.6.1       2001        PSF         no
    2.0.1           2.0+1.6.1       2001        PSF         yes
    2.1.1           2.1+2.0.1       2001        PSF         yes
    2.2             2.1.1           2001        PSF         yes
    2.1.2           2.1.1           2002        PSF         yes
    2.1.3           2.1.2           2002        PSF         yes
    2.2 and above   2.1.1           2001-now    PSF         yes
Footnotes:
(1) GPL-compatible doesn't mean that we're distributing Python under
the GPL. All Python licenses, unlike the GPL, let you distribute
a modified version without making your changes open source. The
GPL-compatible licenses make it possible to combine Python with
other software that is released under the GPL; the others don't.
(2) According to Richard Stallman, 1.6.1 is not GPL-compatible,
because its license has a choice of law clause. According to
CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1
is "not incompatible" with the GPL.
Thanks to the many outside volunteers who have worked under Guido's
direction to make these releases possible.
B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON
===============================================================
PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
--------------------------------------------
1. This LICENSE AGREEMENT is between the Python Software Foundation
("PSF"), and the Individual or Organization ("Licensee") accessing and
otherwise using this software ("Python") in source or binary form and
its associated documentation.
2. Subject to the terms and conditions of this License Agreement, PSF
hereby grants Licensee a nonexclusive, royalty-free, world-wide
license to reproduce, analyze, test, perform and/or display publicly,
prepare derivative works, distribute, and otherwise use Python alone
or in any derivative version, provided, however, that PSF's License
Agreement and PSF's notice of copyright, i.e., "Copyright (c) 2001,
2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
2013, 2014 Python Software Foundation; All Rights Reserved" are
retained in Python alone or in any derivative version prepared by
Licensee.
3. In the event Licensee prepares a derivative work that is based on
or incorporates Python or any part thereof, and wants to make
the derivative work available to others as provided herein, then
Licensee hereby agrees to include in any such work a brief summary of
the changes made to Python.
4. PSF is making Python available to Licensee on an "AS IS"
basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND
DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT
INFRINGE ANY THIRD PARTY RIGHTS.
5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON,
OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
6. This License Agreement will automatically terminate upon a material
breach of its terms and conditions.
7. Nothing in this License Agreement shall be deemed to create any
relationship of agency, partnership, or joint venture between PSF and
Licensee. This License Agreement does not grant permission to use PSF
trademarks or trade name in a trademark sense to endorse or promote
products or services of Licensee, or any third party.
8. By copying, installing or otherwise using Python, Licensee
agrees to be bound by the terms and conditions of this License
Agreement.
BEOPEN.COM LICENSE AGREEMENT FOR PYTHON 2.0
-------------------------------------------
BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1
1. This LICENSE AGREEMENT is between BeOpen.com ("BeOpen"), having an
office at 160 Saratoga Avenue, Santa Clara, CA 95051, and the
Individual or Organization ("Licensee") accessing and otherwise using
this software in source or binary form and its associated
documentation ("the Software").
2. Subject to the terms and conditions of this BeOpen Python License
Agreement, BeOpen hereby grants Licensee a non-exclusive,
royalty-free, world-wide license to reproduce, analyze, test, perform
and/or display publicly, prepare derivative works, distribute, and
otherwise use the Software alone or in any derivative version,
provided, however, that the BeOpen Python License is retained in the
Software, alone or in any derivative version prepared by Licensee.
3. BeOpen is making the Software available to Licensee on an "AS IS"
basis. BEOPEN MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, BEOPEN MAKES NO AND
DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE WILL NOT
INFRINGE ANY THIRD PARTY RIGHTS.
4. BEOPEN SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE
SOFTWARE FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS
AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY
DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
5. This License Agreement will automatically terminate upon a material
breach of its terms and conditions.
6. This License Agreement shall be governed by and interpreted in all
respects by the law of the State of California, excluding conflict of
law provisions. Nothing in this License Agreement shall be deemed to
create any relationship of agency, partnership, or joint venture
between BeOpen and Licensee. This License Agreement does not grant
permission to use BeOpen trademarks or trade names in a trademark
sense to endorse or promote products or services of Licensee, or any
third party. As an exception, the "BeOpen Python" logos available at
http://www.pythonlabs.com/logos.html may be used according to the
permissions granted on that web page.
7. By copying, installing or otherwise using the software, Licensee
agrees to be bound by the terms and conditions of this License
Agreement.
CNRI LICENSE AGREEMENT FOR PYTHON 1.6.1
---------------------------------------
1. This LICENSE AGREEMENT is between the Corporation for National
Research Initiatives, having an office at 1895 Preston White Drive,
Reston, VA 20191 ("CNRI"), and the Individual or Organization
("Licensee") accessing and otherwise using Python 1.6.1 software in
source or binary form and its associated documentation.
2. Subject to the terms and conditions of this License Agreement, CNRI
hereby grants Licensee a nonexclusive, royalty-free, world-wide
license to reproduce, analyze, test, perform and/or display publicly,
prepare derivative works, distribute, and otherwise use Python 1.6.1
alone or in any derivative version, provided, however, that CNRI's
License Agreement and CNRI's notice of copyright, i.e., "Copyright (c)
1995-2001 Corporation for National Research Initiatives; All Rights
Reserved" are retained in Python 1.6.1 alone or in any derivative
version prepared by Licensee. Alternately, in lieu of CNRI's License
Agreement, Licensee may substitute the following text (omitting the
quotes): "Python 1.6.1 is made available subject to the terms and
conditions in CNRI's License Agreement. This Agreement together with
Python 1.6.1 may be located on the Internet using the following
unique, persistent identifier (known as a handle): 1895.22/1013. This
Agreement may also be obtained from a proxy server on the Internet
using the following URL: http://hdl.handle.net/1895.22/1013".
3. In the event Licensee prepares a derivative work that is based on
or incorporates Python 1.6.1 or any part thereof, and wants to make
the derivative work available to others as provided herein, then
Licensee hereby agrees to include in any such work a brief summary of
the changes made to Python 1.6.1.
4. CNRI is making Python 1.6.1 available to Licensee on an "AS IS"
basis. CNRI MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, CNRI MAKES NO AND
DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON 1.6.1 WILL NOT
INFRINGE ANY THIRD PARTY RIGHTS.
5. CNRI SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
1.6.1 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 1.6.1,
OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
6. This License Agreement will automatically terminate upon a material
breach of its terms and conditions.
7. This License Agreement shall be governed by the federal
intellectual property law of the United States, including without
limitation the federal copyright law, and, to the extent such
U.S. federal law does not apply, by the law of the Commonwealth of
Virginia, excluding Virginia's conflict of law provisions.
Notwithstanding the foregoing, with regard to derivative works based
on Python 1.6.1 that incorporate non-separable material that was
previously distributed under the GNU General Public License (GPL), the
law of the Commonwealth of Virginia shall govern this License
Agreement only as to issues arising under or with respect to
Paragraphs 4, 5, and 7 of this License Agreement. Nothing in this
License Agreement shall be deemed to create any relationship of
agency, partnership, or joint venture between CNRI and Licensee. This
License Agreement does not grant permission to use CNRI trademarks or
trade name in a trademark sense to endorse or promote products or
services of Licensee, or any third party.
8. By clicking on the "ACCEPT" button where indicated, or by copying,
installing or otherwise using Python 1.6.1, Licensee agrees to be
bound by the terms and conditions of this License Agreement.
ACCEPT
CWI LICENSE AGREEMENT FOR PYTHON 0.9.0 THROUGH 1.2
--------------------------------------------------
Copyright (c) 1991 - 1995, Stichting Mathematisch Centrum Amsterdam,
The Netherlands. All rights reserved.
Permission to use, copy, modify, and distribute this software and its
documentation for any purpose and without fee is hereby granted,
provided that the above copyright notice appear in all copies and that
both that copyright notice and this permission notice appear in
supporting documentation, and that the name of Stichting Mathematisch
Centrum or CWI not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.
STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
Licenses and Acknowledgements for Incorporated Software
=======================================================
Mersenne Twister
----------------
The `_random' module includes code based on a download from
`http://www.math.keio.ac.jp/~matumoto/MT2002/emt19937ar.html'. The
following are the verbatim comments from the original code:
A C-program for MT19937, with initialization improved 2002/1/26.
Coded by Takuji Nishimura and Makoto Matsumoto.
Before using, initialize the state by using init_genrand(seed)
or init_by_array(init_key, key_length).
Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. The names of its contributors may not be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Any feedback is very welcome.
http://www.math.keio.ac.jp/matumoto/emt.html
email: matumoto@math.keio.ac.jp
Sockets
-------
The `socket' module uses the functions, `getaddrinfo', and
`getnameinfo', which are coded in separate source files from the WIDE
Project, `http://www.wide.ad.jp/about/index.html'.
Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the project nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
GAI_ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
FOR GAI_ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON GAI_ANY THEORY OF LIABILITY, WHETHER
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN GAI_ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
Floating point exception control
--------------------------------
The source for the `fpectl' module includes the following notice:
---------------------------------------------------------------------
/ Copyright (c) 1996. \
| The Regents of the University of California. |
| All rights reserved. |
| |
| Permission to use, copy, modify, and distribute this software for |
| any purpose without fee is hereby granted, provided that this en- |
| tire notice is included in all copies of any software which is or |
| includes a copy or modification of this software and in all |
| copies of the supporting documentation for such software. |
| |
| This work was produced at the University of California, Lawrence |
| Livermore National Laboratory under contract no. W-7405-ENG-48 |
| between the U.S. Department of Energy and The Regents of the |
| University of California for the operation of UC LLNL. |
| |
| DISCLAIMER |
| |
| This software was prepared as an account of work sponsored by an |
| agency of the United States Government. Neither the United States |
| Government nor the University of California nor any of their em- |
| ployees, makes any warranty, express or implied, or assumes any |
| liability or responsibility for the accuracy, completeness, or |
| usefulness of any information, apparatus, product, or process |
| disclosed, or represents that its use would not infringe |
| privately-owned rights. Reference herein to any specific commer- |
| cial products, process, or service by trade name, trademark, |
| manufacturer, or otherwise, does not necessarily constitute or |
| imply its endorsement, recommendation, or favoring by the United |
| States Government or the University of California. The views and |
| opinions of authors expressed herein do not necessarily state or |
| reflect those of the United States Government or the University |
| of California, and shall not be used for advertising or product |
\ endorsement purposes. /
---------------------------------------------------------------------
Cookie management
-----------------
The `Cookie' module contains the following notice:
Copyright 2000 by Timothy O'Malley <timo@alum.mit.edu>
All Rights Reserved
Permission to use, copy, modify, and distribute this software
and its documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appear in all
copies and that both that copyright notice and this permission
notice appear in supporting documentation, and that the name of
Timothy O'Malley not be used in advertising or publicity
pertaining to distribution of the software without specific, written
prior permission.
Timothy O'Malley DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS, IN NO EVENT SHALL Timothy O'Malley BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
Execution tracing
-----------------
The `trace' module contains the following notice:
portions copyright 2001, Autonomous Zones Industries, Inc., all rights...
err... reserved and offered to the public under the terms of the
Python 2.2 license.
Author: Zooko O'Whielacronx
http://zooko.com/
mailto:zooko@zooko.com
Copyright 2000, Mojam Media, Inc., all rights reserved.
Author: Skip Montanaro
Copyright 1999, Bioreason, Inc., all rights reserved.
Author: Andrew Dalke
Copyright 1995-1997, Automatrix, Inc., all rights reserved.
Author: Skip Montanaro
Copyright 1991-1995, Stichting Mathematisch Centrum, all rights reserved.
Permission to use, copy, modify, and distribute this Python software and
its associated documentation for any purpose without fee is hereby
granted, provided that the above copyright notice appears in all copies,
and that both that copyright notice and this permission notice appear in
supporting documentation, and that the name of neither Automatrix,
Bioreason or Mojam Media be used in advertising or publicity pertaining
to distribution of the software without specific, written prior
permission.
UUencode and UUdecode functions
-------------------------------
The `uu' module contains the following notice:
Copyright 1994 by Lance Ellinghouse
Cathedral City, California Republic, United States of America.
All Rights Reserved
Permission to use, copy, modify, and distribute this software and its
documentation for any purpose and without fee is hereby granted,
provided that the above copyright notice appear in all copies and that
both that copyright notice and this permission notice appear in
supporting documentation, and that the name of Lance Ellinghouse
not be used in advertising or publicity pertaining to distribution
of the software without specific, written prior permission.
LANCE ELLINGHOUSE DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS, IN NO EVENT SHALL LANCE ELLINGHOUSE CENTRUM BE LIABLE
FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
Modified by Jack Jansen, CWI, July 1995:
- Use binascii module to do the actual line-by-line conversion
between ascii and binary. This results in a 1000-fold speedup. The C
version is still 5 times faster, though.
- Arguments more compliant with python standard
XML Remote Procedure Calls
--------------------------
The `xmlrpclib' module contains the following notice:
The XML-RPC client interface is
Copyright (c) 1999-2002 by Secret Labs AB
Copyright (c) 1999-2002 by Fredrik Lundh
By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:
Permission to use, copy, modify, and distribute this software and
its associated documentation for any purpose and without fee is
hereby granted, provided that the above copyright notice appears in
all copies, and that both that copyright notice and this permission
notice appear in supporting documentation, and that the name of
Secret Labs AB or the author not be used in advertising or publicity
pertaining to distribution of the software without specific, written
prior permission.
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
OF THIS SOFTWARE.
Licenses for Software linked to
===============================
Note that the choice of GPL compatibility outlined above doesn't extend
to modules linked to particular libraries, since they change the
effective License of the module binary.
GNU Readline
------------
The 'readline' module makes use of GNU Readline.
The GNU Readline Library is free software; you can redistribute it
and/or modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2, or (at
your option) any later version.
On Debian systems, you can find the complete statement in
/usr/share/doc/readline-common/copyright'. A copy of the GNU General
Public License is available in /usr/share/common-licenses/GPL-2'.
OpenSSL
-------
The '_ssl' module makes use of OpenSSL.
The OpenSSL toolkit stays under a dual license, i.e. both the
conditions of the OpenSSL License and the original SSLeay license
apply to the toolkit. Actually both licenses are BSD-style Open
Source licenses. Note that both licenses are incompatible with
the GPL.
On Debian systems, you can find the complete license text in
/usr/share/doc/openssl/copyright'.
Files with other licenses than the Python License
-------------------------------------------------
Files: Include/dynamic_annotations.h
Files: Python/dynamic_annotations.c
Copyright: (c) 2008-2009, Google Inc.
License: Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Files: Include/unicodeobject.h
Copyright: (c) Corporation for National Research Initiatives.
Copyright: (c) 1999 by Secret Labs AB.
Copyright: (c) 1999 by Fredrik Lundh.
License: By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:
Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
Files: Lib/logging/*
Copyright: 2001-2010 by Vinay Sajip. All Rights Reserved.
License: Permission to use, copy, modify, and distribute this software and
its documentation for any purpose and without fee is hereby granted,
provided that the above copyright notice appear in all copies and that
both that copyright notice and this permission notice appear in
supporting documentation, and that the name of Vinay Sajip
not be used in advertising or publicity pertaining to distribution
of the software without specific, written prior permission.
VINAY SAJIP DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
VINAY SAJIP BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR
ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
Files: Lib/multiprocessing/*
Files: Modules/_multiprocessing/*
Copyright: (c) 2006-2008, R Oudkerk. All rights reserved.
License: Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of author nor the names of any contributors may be
used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
Files: Lib/sqlite3/*
Files: Modules/_sqlite/*
Copyright: (C) 2004-2005 Gerhard Häring <gh@ghaering.de>
License: This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
Files: Lib/async*
Copyright: Copyright 1996 by Sam Rushing
License: Permission to use, copy, modify, and distribute this software and
its documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appear in all
copies and that both that copyright notice and this permission
notice appear in supporting documentation, and that the name of Sam
Rushing not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.
SAM RUSHING DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN
NO EVENT SHALL SAM RUSHING BE LIABLE FOR ANY SPECIAL, INDIRECT OR
CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
Files: Lib/tarfile.py
Copyright: (C) 2002 Lars Gustaebel <lars@gustaebel.de>
License: Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
Files: Lib/turtle.py
Copyright: (C) 2006 - 2010 Gregor Lingl
License: This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
is copyright Gregor Lingl and licensed under a BSD-like license
Files: Modules/_ctypes/libffi/*
Copyright: Copyright (C) 1996-2011 Red Hat, Inc and others.
Copyright (C) 1996-2011 Anthony Green
Copyright (C) 1996-2010 Free Software Foundation, Inc
Copyright (c) 2003, 2004, 2006, 2007, 2008 Kaz Kojima
Copyright (c) 2010, 2011, Plausible Labs Cooperative, Inc.
Copyright (c) 2010 CodeSourcery
Copyright (c) 1998 Andreas Schwab
Copyright (c) 2000 Hewlett Packard Company
Copyright (c) 2009 Bradley Smith
Copyright (c) 2008 David Daney
Copyright (c) 2004 Simon Posnjak
Copyright (c) 2005 Axis Communications AB
Copyright (c) 1998 Cygnus Solutions
Copyright (c) 2004 Renesas Technology
Copyright (c) 2002, 2007 Bo Thorsen <bo@suse.de>
Copyright (c) 2002 Ranjit Mathew
Copyright (c) 2002 Roger Sayle
Copyright (c) 2000, 2007 Software AG
Copyright (c) 2003 Jakub Jelinek
Copyright (c) 2000, 2001 John Hornkvist
Copyright (c) 1998 Geoffrey Keating
Copyright (c) 2008 Björn König
License: Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
``Software''), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
Documentation:
Permission is granted to copy, distribute and/or modify this document
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2, or (at your option) any
later version. A copy of the license is included in the
section entitled ``GNU General Public License''.
Files: Modules/_gestalt.c
Copyright: 1991-1997 by Stichting Mathematisch Centrum, Amsterdam.
License: Permission to use, copy, modify, and distribute this software and its
documentation for any purpose and without fee is hereby granted,
provided that the above copyright notice appear in all copies and that
both that copyright notice and this permission notice appear in
supporting documentation, and that the names of Stichting Mathematisch
Centrum or CWI not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior permission.
STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
Files: Modules/syslogmodule.c
Copyright: 1994 by Lance Ellinghouse
License: Permission to use, copy, modify, and distribute this software and its
documentation for any purpose and without fee is hereby granted,
provided that the above copyright notice appear in all copies and that
both that copyright notice and this permission notice appear in
supporting documentation, and that the name of Lance Ellinghouse
not be used in advertising or publicity pertaining to distribution
of the software without specific, written prior permission.
LANCE ELLINGHOUSE DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS, IN NO EVENT SHALL LANCE ELLINGHOUSE BE LIABLE FOR ANY SPECIAL,
INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
Files: Modules/zlib/*
Copyright: (C) 1995-2010 Jean-loup Gailly and Mark Adler
License: This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
Jean-loup Gailly Mark Adler
jloup@gzip.org madler@alumni.caltech.edu
If you use the zlib library in a product, we would appreciate *not* receiving
lengthy legal documents to sign. The sources are provided for free but without
warranty of any kind. The library has been entirely written by Jean-loup
Gailly and Mark Adler; it does not include third-party code.
Files: Modules/expat/*
Copyright: Copyright (c) 1998, 1999, 2000 Thai Open Source Software Center Ltd
and Clark Cooper
Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Expat maintainers
License: Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
Files: Modules/_decimal/libmpdec/*
Copyright: Copyright (c) 2008-2012 Stefan Krah. All rights reserved.
License: Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
.
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
Files: Misc/python-mode.el
Copyright: Copyright (C) 1992,1993,1994 Tim Peters
License: This software is provided as-is, without express or implied
warranty. Permission to use, copy, modify, distribute or sell this
software, without fee, for any purpose and by any individual or
organization, is hereby granted, provided that the above copyright
notice and this paragraph appear in all copies.
Files: Python/dtoa.c
Copyright: (c) 1991, 2000, 2001 by Lucent Technologies.
License: Permission to use, copy, modify, and distribute this software for any
purpose without fee is hereby granted, provided that this entire notice
is included in all copies of any software which is or includes a copy
or modification of this software and in all copies of the supporting
documentation for such software.
THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
WARRANTY. IN PARTICULAR, NEITHER THE AUTHOR NOR LUCENT MAKES ANY
REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
Files: Python/getopt.c
Copyright: 1992-1994, David Gottner
License: Permission to use, copy, modify, and distribute this software and its
documentation for any purpose and without fee is hereby granted,
provided that the above copyright notice, this permission notice and
the following disclaimer notice appear unmodified in all copies.
I DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL I
BE LIABLE FOR ANY SPECIAL, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY
DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA, OR PROFITS, WHETHER
IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
Files: PC/_subprocess.c
Copyright: Copyright (c) 2004 by Fredrik Lundh <fredrik@pythonware.com>
Copyright (c) 2004 by Secret Labs AB, http://www.pythonware.com
Copyright (c) 2004 by Peter Astrand <astrand@lysator.liu.se>
License:
* Permission to use, copy, modify, and distribute this software and
* its associated documentation for any purpose and without fee is
* hereby granted, provided that the above copyright notice appears in
* all copies, and that both that copyright notice and this permission
* notice appear in supporting documentation, and that the name of the
* authors not be used in advertising or publicity pertaining to
* distribution of the software without specific, written prior
* permission.
*
* THE AUTHORS DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
* INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
* IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
* CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
* NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
* WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
Files: PC/winsound.c
Copyright: Copyright (c) 1999 Toby Dickenson
License: * Permission to use this software in any way is granted without
* fee, provided that the copyright notice above appears in all
* copies. This software is provided "as is" without any warranty.
*/
/* Modified by Guido van Rossum */
/* Beep added by Mark Hammond */
/* Win9X Beep and platform identification added by Uncle Timmy */
Files: Tools/pybench/*
Copyright: (c), 1997-2006, Marc-Andre Lemburg (mal@lemburg.com)
(c), 2000-2006, eGenix.com Software GmbH (info@egenix.com)
License: Permission to use, copy, modify, and distribute this software and its
documentation for any purpose and without fee or royalty is hereby
granted, provided that the above copyright notice appear in all copies
and that both that copyright notice and this permission notice appear
in supporting documentation or portions thereof, including
modifications, that you make.
THE AUTHOR MARC-ANDRE LEMBURG DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS, IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL,
INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
WITH THE USE OR PERFORMANCE OF THIS SOFTWARE !


@@ -0,0 +1,3 @@
opt/scylladb/python3/bin
opt/scylladb/python3/lib64
opt/scylladb/python3/libexec


@@ -0,0 +1,3 @@
bin/* opt/scylladb/python3/bin
lib64/* opt/scylladb/python3/lib64
libexec/* opt/scylladb/python3/libexec

dist/debian/python3/rules.mustache vendored Executable file

@@ -0,0 +1,22 @@
#!/usr/bin/make -f
export PYBUILD_DISABLE=1
override_dh_auto_configure:
override_dh_auto_build:
override_dh_strip:
override_dh_makeshlibs:
override_dh_shlibdeps:
override_dh_fixperms:
dh_fixperms
chmod 755 $(CURDIR)/debian/{{product}}-python3/opt/scylladb/python3/libexec/ld.so
override_dh_strip_nondeterminism:
%:
dh $@


@@ -9,12 +9,21 @@ override_dh_auto_build:
override_dh_auto_clean:
override_dh_auto_install:
dh_auto_install
override_dh_install:
dh_install
install -d $(CURDIR)/debian/scylla-server/usr/bin
for bin in debian/scylla-server/opt/scylladb/libexec/*; do debian/adjust_bin $(CURDIR)/debian/scylla-server "$${bin#*libexec/}"; done
ln -sf /opt/scylladb/bin/scylla $(CURDIR)/debian/scylla-server/usr/bin/scylla
ln -sf /opt/scylladb/bin/iotune $(CURDIR)/debian/scylla-server/usr/bin/iotune
ln -sf /usr/lib/scylla/scyllatop/scyllatop.py $(CURDIR)/debian/scylla-server/usr/bin/scyllatop
find ./dist/common/scripts -type f -exec ./relocate_python_scripts.py \
--installroot $(CURDIR)/debian/scylla-server/usr/lib/scylla/ --with-python3 "$(CURDIR)/debian/scylla-server/opt/scylladb/python3/bin/python3" {} +
./relocate_python_scripts.py \
--installroot $(CURDIR)/debian/scylla-server/usr/lib/scylla/ --with-python3 "$(CURDIR)/debian/scylla-server/opt/scylladb/python3/bin/python3" \
seastar/scripts/perftune.py seastar/scripts/seastar-addr2line seastar/scripts/perftune.py
./relocate_python_scripts.py \
--installroot $(CURDIR)/debian/scylla-server/usr/lib/scylla/scyllatop/ --with-python3 "$(CURDIR)/debian/scylla-server/opt/scylladb/python3/bin/python3" \
tools/scyllatop/scyllatop.py
override_dh_installinit:
{{#scylla}}
@@ -29,7 +38,9 @@ override_dh_installinit:
dh_installinit --no-start --name node-exporter
override_dh_strip:
dh_strip -Xlibprotobuf.so.15 -Xld.so --dbg-package={{product}}-server-dbg
# The binaries (ethtool...patchelf) don't pass dh_strip after going through patchelf. Since they are
# already stripped, nothing is lost if we exclude them, so that's what we do.
dh_strip -Xlibprotobuf.so.15 -Xld.so -Xethtool -Xgawk -Xgzip -Xhwloc-calc -Xhwloc-distrib -Xifconfig -Xlscpu -Xnetstat -Xpatchelf --dbg-package={{product}}-server-dbg
override_dh_makeshlibs:


@@ -1,14 +1,9 @@
dist/common/limits.d/scylla.conf etc/security/limits.d
dist/common/scylla.d/*.conf etc/scylla.d
seastar/dpdk/usertools/dpdk-devbind.py usr/lib/scylla
seastar/scripts/perftune.py usr/lib/scylla
seastar/scripts/seastar-addr2line usr/lib/scylla
seastar/scripts/seastar-cpu-map.sh usr/lib/scylla
dist/common/scripts/* usr/lib/scylla
tools/scyllatop usr/lib/scylla
swagger-ui/dist usr/lib/scylla/swagger-ui
api/api-doc usr/lib/scylla/api
bin/* opt/scylladb/bin
libreloc/* opt/scylladb/libreloc
libexec/* opt/scylladb/libexec
dist/common/sbin/* usr/sbin
@@ -20,3 +15,4 @@ dist/common/systemd/scylla-housekeeping-restart.timer /lib/systemd/system
dist/common/systemd/scylla-fstrim.timer /lib/systemd/system
dist/debian/scripts/scylla_save_coredump usr/lib/scylla
dist/debian/scripts/scylla_delay_fstrim usr/lib/scylla
tools/scyllatop usr/lib/scylla


@@ -28,7 +28,7 @@ ADD commandlineparser.py /commandlineparser.py
ADD docker-entrypoint.py /docker-entrypoint.py
ADD node_exporter_install /node_exporter_install
# Install Scylla:
RUN curl http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo -o /etc/yum.repos.d/scylla.repo && \
RUN curl http://downloads.scylladb.com/rpm/centos/scylla-3.1.repo -o /etc/yum.repos.d/scylla.repo && \
yum -y install epel-release && \
yum -y clean expire-cache && \
yum -y update && \


@@ -192,7 +192,11 @@ future<> verification_error(fs::path path, const char* fstr, Args&&... args) {
// No other file types may exist.
future<> distributed_loader::verify_owner_and_mode(fs::path path) {
return file_stat(path.string(), follow_symlink::no).then([path = std::move(path)] (stat_data sd) {
if (sd.uid != geteuid()) {
// Under docker, we run with euid 0 and there is no reasonable way to enforce that the
// in-container uid will have the same uid as files mounted from outside the container. So
// just allow euid 0 as a special case. It should survive the file_accessible() checks below.
// See #4823.
if (geteuid() != 0 && sd.uid != geteuid()) {
return verification_error(std::move(path), "File not owned by current euid: {}. Owner is: {}", geteuid(), sd.uid);
}
switch (sd.type) {
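The comment in the hunk above explains the docker special case: inside a container the process runs with euid 0, and there is no reasonable way to make in-container uids match the uids of files mounted from outside, so euid 0 bypasses the ownership check. A minimal Python sketch of the same rule (`verify_owner` is a hypothetical stand-in, not Scylla's function):

```python
def verify_owner(file_uid: int, euid: int) -> None:
    """Reject files not owned by the effective uid, except for euid 0:
    under docker we run as root, and host-mounted files cannot be
    expected to carry a matching in-container uid (see #4823)."""
    if euid != 0 and file_uid != euid:
        raise PermissionError(
            f"File not owned by current euid: {euid}. Owner is: {file_uid}")

verify_owner(file_uid=1000, euid=0)     # root: allowed as a special case
verify_owner(file_uid=1000, euid=1000)  # matching owner: allowed
```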


@@ -151,7 +151,7 @@ if __name__ == '__main__':
argp.add_argument('--user', '-u')
argp.add_argument('--password', '-p', default='none')
argp.add_argument('--node', default='127.0.0.1', help='Node to connect to.')
argp.add_argument('--port', default='9042', help='Port to connect to.')
argp.add_argument('--port', default=9042, help='Port to connect to.', type=int)
args = argp.parse_args()
res = validate_and_fix(args)
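The hunk above switches `--port` from a string default (`'9042'`) to an integer default with `type=int`; without `type=int`, argparse hands any user-supplied value to the script as a string even when the default is an int. A minimal standalone demonstration:

```python
import argparse

argp = argparse.ArgumentParser()
# type=int makes both the default and any user-supplied value integers;
# otherwise "--port 9142" would arrive as the string "9142".
argp.add_argument('--port', default=9042, help='Port to connect to.', type=int)

args = argp.parse_args(['--port', '9142'])
assert args.port == 9142 and isinstance(args.port, int)
```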


@@ -23,6 +23,7 @@
#include "mutation_reader.hh"
#include "seastar/util/reference_wrapper.hh"
#include "clustering_ranges_walker.hh"
#include "schema_upgrader.hh"
#include <algorithm>
#include <boost/range/adaptor/transformed.hpp>
@@ -908,3 +909,7 @@ public:
flat_mutation_reader make_generating_reader(schema_ptr s, std::function<future<mutation_fragment_opt> ()> get_next_fragment) {
return make_flat_mutation_reader<generating_reader>(std::move(s), std::move(get_next_fragment));
}
void flat_mutation_reader::do_upgrade_schema(const schema_ptr& s) {
*this = transform(std::move(*this), schema_upgrader(s));
}


@@ -326,6 +326,7 @@ private:
flat_mutation_reader() = default;
explicit operator bool() const noexcept { return bool(_impl); }
friend class optimized_optional<flat_mutation_reader>;
void do_upgrade_schema(const schema_ptr&);
public:
// Documented in mutation_reader::forwarding in mutation_reader.hh.
class partition_range_forwarding_tag;
@@ -474,6 +475,14 @@ public:
void move_buffer_content_to(impl& other) {
_impl->move_buffer_content_to(other);
}
// Causes this reader to conform to s.
// Multiple calls of upgrade_schema() compose, effects of prior calls on the stream are preserved.
void upgrade_schema(const schema_ptr& s) {
if (__builtin_expect(s != schema(), false)) {
do_upgrade_schema(s);
}
}
};
using flat_mutation_reader_opt = optimized_optional<flat_mutation_reader>;
@@ -576,8 +585,12 @@ class delegating_reader : public flat_mutation_reader::impl {
public:
delegating_reader(Underlying&& r) : impl(to_reference(r).schema()), _underlying(std::forward<Underlying>(r)) { }
virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override {
return fill_buffer_from(to_reference(_underlying), timeout).then([this] (bool underlying_finished) {
_end_of_stream = underlying_finished;
if (is_buffer_full()) {
return make_ready_future<>();
}
return to_reference(_underlying).fill_buffer(timeout).then([this] {
_end_of_stream = to_reference(_underlying).is_end_of_stream();
to_reference(_underlying).move_buffer_content_to(*this);
});
}
virtual future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) override {
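The rewritten `fill_buffer` above delegates to `fill_buffer_from`, which drains the underlying reader into this reader's buffer until the buffer is full or the underlying stream ends, and reports whether the underlying reader finished. A synchronous Python analogue of that contract (`Reader` and `fill_buffer_from` here are illustrative, not Scylla's API):

```python
class Reader:
    def __init__(self, items, max_buffer=4):
        self._items = list(items)   # fragments still "on disk"
        self.buffer = []            # fragments ready for the consumer
        self.max_buffer = max_buffer

    def is_buffer_full(self):
        return len(self.buffer) >= self.max_buffer

    @property
    def end_of_stream(self):
        return not self._items

    def fill_buffer(self):
        while self._items and not self.is_buffer_full():
            self.buffer.append(self._items.pop(0))


def fill_buffer_from(dest, underlying):
    """Fill dest's buffer from `underlying` until dest is full or the
    underlying reader is exhausted; return True when the underlying
    stream finished (mirrors the fill_buffer_from call in the hunk)."""
    while not dest.is_buffer_full():
        if underlying.buffer:
            dest.buffer.append(underlying.buffer.pop(0))
        elif underlying.end_of_stream:
            return True
        else:
            underlying.fill_buffer()
    return underlying.end_of_stream and not underlying.buffer


src = Reader(range(3))
dst = Reader([], max_buffer=8)
assert fill_buffer_from(dst, src) is True   # underlying exhausted
assert dst.buffer == [0, 1, 2]
```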


@@ -22,6 +22,7 @@
#pragma once
#include "clocks-impl.hh"
#include "hashing.hh"
#include <seastar/core/lowres_clock.hh>
@@ -71,3 +72,51 @@ using ttl_opt = std::optional<gc_clock::duration>;
static constexpr gc_clock::duration max_ttl = gc_clock::duration{20 * 365 * 24 * 60 * 60};
std::ostream& operator<<(std::ostream& os, gc_clock::time_point tp);
template<>
struct appending_hash<gc_clock::time_point> {
template<typename Hasher>
void operator()(Hasher& h, gc_clock::time_point t) const {
// Remain backwards-compatible with the 32-bit duration::rep (refs #4460).
uint64_t d64 = t.time_since_epoch().count();
feed_hash(h, uint32_t(d64 & 0xffff'ffff));
uint32_t msb = d64 >> 32;
if (msb) {
feed_hash(h, msb);
}
}
};
namespace ser {
// Forward-declaration - defined in serializer.hh, to avoid including it here.
template <typename Output>
void serialize_gc_clock_duration_value(Output& out, int64_t value);
template <typename Input>
int64_t deserialize_gc_clock_duration_value(Input& in);
template <typename T>
struct serializer;
template <>
struct serializer<gc_clock::duration> {
template <typename Input>
static gc_clock::duration read(Input& in) {
return gc_clock::duration(deserialize_gc_clock_duration_value(in));
}
template <typename Output>
static void write(Output& out, gc_clock::duration d) {
serialize_gc_clock_duration_value(out, d.count());
}
template <typename Input>
static void skip(Input& in) {
read(in);
}
};
}
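The `appending_hash` specialization in the hunk above keeps hashes stable for time points that fit the old 32-bit duration representation (refs #4460): it always feeds the low 32 bits and feeds the high 32 bits only when they are non-zero. A Python sketch of the fed sequence (the hash itself is elided; `feed_gc_clock_time` is a hypothetical name):

```python
def feed_gc_clock_time(feed, seconds_since_epoch):
    """Feed a 64-bit time point the way the C++ hunk does: low word
    always, high word only when non-zero, so values representable in
    the old 32-bit duration hash exactly as before (refs #4460)."""
    d64 = seconds_since_epoch & 0xFFFFFFFFFFFFFFFF
    feed(d64 & 0xFFFFFFFF)   # low 32 bits, always fed
    msb = d64 >> 32
    if msb:
        feed(msb)            # high 32 bits only when present

fed = []
feed_gc_clock_time(fed.append, 1_579_000_000)   # fits in 32 bits
assert fed == [1_579_000_000]                   # identical to the old hash input
```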


@@ -481,8 +481,7 @@ future<> gossiper::apply_state_locally(std::map<inet_address, endpoint_state> ma
int local_generation = local_ep_state_ptr.get_heart_beat_state().get_generation();
int remote_generation = remote_state.get_heart_beat_state().get_generation();
logger.trace("{} local generation {}, remote generation {}", ep, local_generation, remote_generation);
// A node was removed with nodetool removenode can have a generation of 2
if (local_generation > 2 && remote_generation > local_generation + MAX_GENERATION_DIFFERENCE) {
if (remote_generation > service::get_generation_number() + MAX_GENERATION_DIFFERENCE) {
// assume some peer has corrupted memory and is broadcasting an unbelievable generation about another peer (or itself)
logger.warn("received an invalid gossip generation for peer {}; local generation = {}, received generation = {}",
ep, local_generation, remote_generation);
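The replacement check above bounds the remote generation against `service::get_generation_number()` — the generation a freshly restarted node would claim, conventionally the start time in epoch seconds — rather than against the peer's stored local generation, which can legitimately be tiny after `nodetool removenode`. A hedged Python sketch of the new rule (helper names hypothetical):

```python
import time

MAX_GENERATION_DIFFERENCE = 86400 * 365   # one year, as in the header hunk

def generation_number(now=None):
    """Stand-in for service::get_generation_number(): gossip generations
    are conventionally the node's start time in epoch seconds."""
    return int(now if now is not None else time.time())

def is_generation_believable(remote_generation, now=None):
    """Accept a remote generation only if it is not absurdly far ahead of
    what any freshly restarted node could legitimately broadcast."""
    return remote_generation <= generation_number(now) + MAX_GENERATION_DIFFERENCE

now = 1_579_000_000
assert is_generation_believable(now + 3600, now=now)        # normal restart
assert not is_generation_believable(now + 10 * 86400 * 365, now=now)  # corrupt peer
```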


@@ -160,7 +160,9 @@ public:
static constexpr std::chrono::milliseconds INTERVAL{1000};
static constexpr std::chrono::hours A_VERY_LONG_TIME{24 * 3};
/** Maximum difference in generation and version values we are willing to accept about a peer */
// Maximum difference between remote generation value and generation
// value this node would get if this node were restarted that we are
// willing to accept about a peer.
static constexpr int64_t MAX_GENERATION_DIFFERENCE = 86400 * 365;
std::chrono::milliseconds fat_client_timeout;


@@ -29,7 +29,7 @@ template <typename T> struct hasher_traits;
template <> struct hasher_traits<md5_hasher> { using impl_type = CryptoPP::Weak::MD5; };
template <> struct hasher_traits<sha256_hasher> { using impl_type = CryptoPP::SHA256; };
template <typename T, size_t size> struct hasher<T, size>::impl {
template <typename T, size_t size> struct cryptopp_hasher<T, size>::impl {
using impl_type = typename hasher_traits<T>::impl_type;
impl_type hash{};
@@ -53,35 +53,35 @@ template <typename T, size_t size> struct hasher<T, size>::impl {
}
};
template <typename T, size_t size> hasher<T, size>::hasher() : _impl(std::make_unique<impl>()) {}
template <typename T, size_t size> cryptopp_hasher<T, size>::cryptopp_hasher() : _impl(std::make_unique<impl>()) {}
template <typename T, size_t size> hasher<T, size>::~hasher() = default;
template <typename T, size_t size> cryptopp_hasher<T, size>::~cryptopp_hasher() = default;
template <typename T, size_t size> hasher<T, size>::hasher(hasher&& o) noexcept = default;
template <typename T, size_t size> cryptopp_hasher<T, size>::cryptopp_hasher(cryptopp_hasher&& o) noexcept = default;
template <typename T, size_t size> hasher<T, size>::hasher(const hasher& o) : _impl(std::make_unique<hasher<T, size>::impl>(*o._impl)) {}
template <typename T, size_t size> cryptopp_hasher<T, size>::cryptopp_hasher(const cryptopp_hasher& o) : _impl(std::make_unique<cryptopp_hasher<T, size>::impl>(*o._impl)) {}
template <typename T, size_t size> hasher<T, size>& hasher<T, size>::operator=(hasher&& o) noexcept = default;
template <typename T, size_t size> cryptopp_hasher<T, size>& cryptopp_hasher<T, size>::operator=(cryptopp_hasher&& o) noexcept = default;
template <typename T, size_t size> hasher<T, size>& hasher<T, size>::operator=(const hasher& o) {
_impl = std::make_unique<hasher<T, size>::impl>(*o._impl);
template <typename T, size_t size> cryptopp_hasher<T, size>& cryptopp_hasher<T, size>::operator=(const cryptopp_hasher& o) {
_impl = std::make_unique<cryptopp_hasher<T, size>::impl>(*o._impl);
return *this;
}
template <typename T, size_t size> bytes hasher<T, size>::finalize() { return _impl->finalize(); }
template <typename T, size_t size> bytes cryptopp_hasher<T, size>::finalize() { return _impl->finalize(); }
template <typename T, size_t size> std::array<uint8_t, size> hasher<T, size>::finalize_array() {
template <typename T, size_t size> std::array<uint8_t, size> cryptopp_hasher<T, size>::finalize_array() {
return _impl->finalize_array();
}
template <typename T, size_t size> void hasher<T, size>::update(const char* ptr, size_t length) { _impl->update(ptr, length); }
template <typename T, size_t size> void cryptopp_hasher<T, size>::update(const char* ptr, size_t length) { _impl->update(ptr, length); }
template <typename T, size_t size> bytes hasher<T, size>::calculate(const std::string_view& s) {
typename hasher<T, size>::impl::impl_type hash;
template <typename T, size_t size> bytes cryptopp_hasher<T, size>::calculate(const std::string_view& s) {
typename cryptopp_hasher<T, size>::impl::impl_type hash;
unsigned char digest[size];
hash.CalculateDigest(digest, reinterpret_cast<const unsigned char*>(s.data()), s.size());
return std::move(bytes{reinterpret_cast<const int8_t*>(digest), size});
}
template class hasher<md5_hasher, 16>;
template class hasher<sha256_hasher, 32>;
template class cryptopp_hasher<md5_hasher, 16>;
template class cryptopp_hasher<sha256_hasher, 32>;


@@ -22,29 +22,30 @@
#pragma once
#include "bytes.hh"
#include "hashing.hh"
class md5_hasher;
template <typename T, size_t size> class hasher {
template <typename T, size_t size> class cryptopp_hasher : public hasher {
struct impl;
std::unique_ptr<impl> _impl;
public:
hasher();
~hasher();
hasher(hasher&&) noexcept;
hasher(const hasher&);
hasher& operator=(hasher&&) noexcept;
hasher& operator=(const hasher&);
cryptopp_hasher();
~cryptopp_hasher();
cryptopp_hasher(cryptopp_hasher&&) noexcept;
cryptopp_hasher(const cryptopp_hasher&);
cryptopp_hasher& operator=(cryptopp_hasher&&) noexcept;
cryptopp_hasher& operator=(const cryptopp_hasher&);
bytes finalize();
std::array<uint8_t, size> finalize_array();
void update(const char* ptr, size_t length);
void update(const char* ptr, size_t length) override;
// Use update and finalize to compute the hash over the full view.
static bytes calculate(const std::string_view& s);
};
class md5_hasher : public hasher<md5_hasher, 16> {};
class md5_hasher final : public cryptopp_hasher<md5_hasher, 16> {};
class sha256_hasher : public hasher<sha256_hasher, 32> {};
class sha256_hasher final : public cryptopp_hasher<sha256_hasher, 32> {};


@@ -27,6 +27,7 @@
#include <seastar/core/byteorder.hh>
#include <seastar/core/sstring.hh>
#include "seastarx.hh"
#include <seastar/util/gcc6-concepts.hh>
//
// This hashing differs from std::hash<> in that it decouples knowledge about
@@ -41,24 +42,38 @@
// appending_hash<T> is machine-independent.
//
// The Hasher concept
struct Hasher {
void update(const char* ptr, size_t size);
GCC6_CONCEPT(
template<typename H>
concept bool Hasher() {
return requires(H& h, const char* ptr, size_t size) {
{ h.update(ptr, size) } -> void
};
}
)
class hasher {
public:
virtual ~hasher() = default;
virtual void update(const char* ptr, size_t size) = 0;
};
GCC6_CONCEPT(static_assert(Hasher<hasher>());)
template<typename T, typename Enable = void>
struct appending_hash;
template<typename Hasher, typename T, typename... Args>
template<typename H, typename T, typename... Args>
GCC6_CONCEPT(requires Hasher<H>())
inline
void feed_hash(Hasher& h, const T& value, Args&&... args) {
void feed_hash(H& h, const T& value, Args&&... args) {
appending_hash<T>()(h, value, std::forward<Args>(args)...);
};
template<typename T>
struct appending_hash<T, std::enable_if_t<std::is_arithmetic<T>::value>> {
template<typename Hasher>
void operator()(Hasher& h, T value) const {
template<typename H>
GCC6_CONCEPT(requires Hasher<H>())
void operator()(H& h, T value) const {
auto value_le = cpu_to_le(value);
h.update(reinterpret_cast<const char*>(&value_le), sizeof(T));
}
@@ -66,24 +81,27 @@ struct appending_hash<T, std::enable_if_t<std::is_arithmetic<T>::value>> {
template<>
struct appending_hash<bool> {
template<typename Hasher>
void operator()(Hasher& h, bool value) const {
template<typename H>
GCC6_CONCEPT(requires Hasher<H>())
void operator()(H& h, bool value) const {
feed_hash(h, static_cast<uint8_t>(value));
}
};
template<typename T>
struct appending_hash<T, std::enable_if_t<std::is_enum<T>::value>> {
template<typename Hasher>
void operator()(Hasher& h, const T& value) const {
template<typename H>
GCC6_CONCEPT(requires Hasher<H>())
void operator()(H& h, const T& value) const {
feed_hash(h, static_cast<std::underlying_type_t<T>>(value));
}
};
template<typename T>
struct appending_hash<std::optional<T>> {
template<typename Hasher>
void operator()(Hasher& h, const std::optional<T>& value) const {
template<typename H>
GCC6_CONCEPT(requires Hasher<H>())
void operator()(H& h, const std::optional<T>& value) const {
if (value) {
feed_hash(h, true);
feed_hash(h, *value);
@@ -95,8 +113,9 @@ struct appending_hash<std::optional<T>> {
template<size_t N>
struct appending_hash<char[N]> {
template<typename Hasher>
void operator()(Hasher& h, const char (&value) [N]) const {
template<typename H>
GCC6_CONCEPT(requires Hasher<H>())
void operator()(H& h, const char (&value) [N]) const {
feed_hash(h, N);
h.update(value, N);
}
@@ -104,8 +123,9 @@ struct appending_hash<char[N]> {
template<typename T>
struct appending_hash<std::vector<T>> {
template<typename Hasher>
void operator()(Hasher& h, const std::vector<T>& value) const {
template<typename H>
GCC6_CONCEPT(requires Hasher<H>())
void operator()(H& h, const std::vector<T>& value) const {
feed_hash(h, value.size());
for (auto&& v : value) {
appending_hash<T>()(h, v);
@@ -115,8 +135,9 @@ struct appending_hash<std::vector<T>> {
template<typename K, typename V>
struct appending_hash<std::map<K, V>> {
template<typename Hasher>
void operator()(Hasher& h, const std::map<K, V>& value) const {
template<typename H>
GCC6_CONCEPT(requires Hasher<H>())
void operator()(H& h, const std::map<K, V>& value) const {
feed_hash(h, value.size());
for (auto&& e : value) {
appending_hash<K>()(h, e.first);
@@ -127,8 +148,9 @@ struct appending_hash<std::map<K, V>> {
template<>
struct appending_hash<sstring> {
template<typename Hasher>
void operator()(Hasher& h, const sstring& v) const {
template<typename H>
GCC6_CONCEPT(requires Hasher<H>())
void operator()(H& h, const sstring& v) const {
feed_hash(h, v.size());
h.update(reinterpret_cast<const char*>(v.cbegin()), v.size() * sizeof(sstring::value_type));
}
@@ -136,8 +158,9 @@ struct appending_hash<sstring> {
template<>
struct appending_hash<std::string> {
template<typename Hasher>
void operator()(Hasher& h, const std::string& v) const {
template<typename H>
GCC6_CONCEPT(requires Hasher<H>())
void operator()(H& h, const std::string& v) const {
feed_hash(h, v.size());
h.update(reinterpret_cast<const char*>(v.data()), v.size() * sizeof(std::string::value_type));
}
@@ -145,16 +168,18 @@ struct appending_hash<std::string> {
template<typename T, typename R>
struct appending_hash<std::chrono::duration<T, R>> {
template<typename Hasher>
void operator()(Hasher& h, std::chrono::duration<T, R> v) const {
template<typename H>
GCC6_CONCEPT(requires Hasher<H>())
void operator()(H& h, std::chrono::duration<T, R> v) const {
feed_hash(h, v.count());
}
};
template<typename Clock, typename Duration>
struct appending_hash<std::chrono::time_point<Clock, Duration>> {
template<typename Hasher>
void operator()(Hasher& h, std::chrono::time_point<Clock, Duration> v) const {
template<typename H>
GCC6_CONCEPT(requires Hasher<H>())
void operator()(H& h, std::chrono::time_point<Clock, Duration> v) const {
feed_hash(h, v.time_since_epoch().count());
}
};


@@ -26,6 +26,6 @@ class partition {
class reconcilable_result {
uint32_t row_count();
std::vector<partition> partitions();
utils::chunked_vector<partition> partitions();
query::short_read is_short_read() [[version 1.6]] = query::short_read::no;
};


@@ -51,4 +51,10 @@ enum class stream_reason : uint8_t {
repair,
};
enum class stream_mutation_fragments_cmd : uint8_t {
error,
mutation_fragment_data,
end_of_stream,
};
}


@@ -181,4 +181,10 @@ bool secondary_index_manager::is_index(const schema& s) const {
});
}
bool secondary_index_manager::is_global_index(const schema& s) const {
return boost::algorithm::any_of(_indices | boost::adaptors::map_values, [&s] (const index& i) {
return !i.metadata().local() && s.cf_name() == index_table_name(i.metadata().name());
});
}
}


@@ -77,6 +77,7 @@ public:
std::vector<index> list_indexes() const;
bool is_index(view_ptr) const;
bool is_index(const schema& s) const;
bool is_global_index(const schema& s) const;
private:
void add_index(const index_metadata& im);
};


@@ -155,6 +155,10 @@ void init_ms_fd_gossiper(sharded<gms::gossiper>& gossiper
to_string(seeds), listen_address_in, broadcast_address);
throw std::runtime_error("Use broadcast_address for seeds list");
}
if ((!cfg.replace_address_first_boot().empty() || !cfg.replace_address().empty()) && seeds.count(broadcast_address)) {
startlog.error("Bad configuration: replace-address and replace-address-first-boot are not allowed for seed nodes");
throw bad_configuration_error();
}
gossiper.local().set_seeds(seeds);
gossiper.invoke_on_all([cluster_name](gms::gossiper& g) {
g.set_cluster_name(cluster_name);


@@ -75,6 +75,29 @@ while [ $# -gt 0 ]; do
esac
done
patchelf() {
# patchelf comes from the build system, so it needs the build system's ld.so and
# shared libraries. We can't use patchelf on patchelf itself, so invoke it via
# ld.so.
LD_LIBRARY_PATH="$PWD/libreloc" libreloc/ld.so libexec/patchelf "$@"
}
adjust_bin() {
local bin="$1"
# We could add --set-rpath too, but then debugedit (called by rpmbuild) barfs
# on the result. So use LD_LIBRARY_PATH in the thunk, below.
patchelf \
--set-interpreter "/opt/scylladb/libreloc/ld.so" \
"$root/opt/scylladb/libexec/$bin"
cat > "$root/opt/scylladb/bin/$bin" <<EOF
#!/bin/bash -e
export GNUTLS_SYSTEM_PRIORITY_FILE="\${GNUTLS_SYSTEM_PRIORITY_FILE-/opt/scylladb/libreloc/gnutls.config}"
export LD_LIBRARY_PATH="/opt/scylladb/libreloc"
exec -a "\$0" "/opt/scylladb/libexec/$bin" "\$@"
EOF
chmod +x "$root/opt/scylladb/bin/$bin"
}
rprefix="$root/$prefix"
retc="$root/etc"
rdoc="$rprefix/share/doc"
@@ -105,16 +128,13 @@ install -m644 dist/common/systemd/*.service -Dt "$rprefix"/lib/systemd/system
install -m644 dist/common/systemd/*.timer -Dt "$rprefix"/lib/systemd/system
install -m755 seastar/scripts/seastar-cpu-map.sh -Dt "$rprefix"/lib/scylla/
install -m755 seastar/dpdk/usertools/dpdk-devbind.py -Dt "$rprefix"/lib/scylla/
install -m755 bin/* -Dt "$root/opt/scylladb/bin"
install -m755 libreloc/* -Dt "$root/opt/scylladb/libreloc"
# some files in libexec are symlinks, which "install" dereferences
# use cp -P for the symlinks instead.
install -m755 libexec/*.bin -Dt "$root/opt/scylladb/libexec"
for f in libexec/*; do
if [[ "$f" != *.bin ]]; then
cp -P "$f" "$root/opt/scylladb/libexec"
fi
install -m755 libexec/* -Dt "$root/opt/scylladb/libexec"
for bin in libexec/*; do
adjust_bin "${bin#libexec/}"
done
install -m755 libreloc/* -Dt "$root/opt/scylladb/libreloc"
ln -srf "$root/opt/scylladb/bin/scylla" "$rprefix/bin/scylla"
ln -srf "$root/opt/scylladb/bin/iotune" "$rprefix/bin/iotune"
ln -srf "$rprefix/lib/scylla/scyllatop/scyllatop.py" "$rprefix/bin/scyllatop"

main.cc

@@ -69,6 +69,7 @@
#include "sstables/sstables.hh"
#include "gms/feature_service.hh"
#include "distributed_loader.hh"
#include "serializer.hh"
namespace fs = std::filesystem;
@@ -340,15 +341,7 @@ int main(int ac, char** av) {
auto cfg = make_lw_shared<db::config>(ext);
auto init = app.get_options_description().add_options();
// If --version is requested, print it out and exit immediately to avoid
// Seastar-specific warnings that may occur when running the app
init("version", bpo::bool_switch(), "print version number and exit");
bpo::variables_map vm;
bpo::store(bpo::command_line_parser(ac, av).options(app.get_options_description()).allow_unregistered().run(), vm);
if (vm["version"].as<bool>()) {
fmt::print("{}\n", scylla_version());
return 0;
}
bpo::options_description deprecated("Deprecated options - ignored");
deprecated.add_options()
@@ -362,6 +355,15 @@ int main(int ac, char** av) {
configurable::append_all(*cfg, init);
cfg->add_options(init);
// If --version is requested, print it out and exit immediately to avoid
// Seastar-specific warnings that may occur when running the app
bpo::variables_map vm;
bpo::store(bpo::command_line_parser(ac, av).options(app.get_options_description()).allow_unregistered().run(), vm);
if (vm["version"].as<bool>()) {
fmt::print("{}\n", scylla_version());
return 0;
}
distributed<database> db;
seastar::sharded<service::cache_hitrate_calculator> cf_cache_hitrate_calculator;
debug::db = &db;
@@ -407,6 +409,11 @@ int main(int ac, char** av) {
read_config(opts, *cfg).get();
configurable::init_all(opts, *cfg, *ext).get();
// We're writing to a non-atomic variable here. But bool writes are atomic
// in all supported architectures, and some broadcast or other below
// will apply the required memory barriers anyway.
ser::gc_clock_using_3_1_0_serialization = cfg->enable_3_1_0_compatibility_mode();
logalloc::prime_segment_pool(memory::stats().total_memory(), memory::min_free_memory()).get();
logging::apply_settings(cfg->logging_settings(opts));
@@ -526,6 +533,9 @@ int main(int ac, char** av) {
if (opts.count("developer-mode")) {
smp::invoke_on_all([] { engine().set_strict_dma(false); }).get();
}
set_abort_on_internal_error(cfg->abort_on_internal_error());
supervisor::notify("creating tracing");
tracing::backend_registry tracing_backend_registry;
tracing::register_tracing_keyspace_backend(tracing_backend_registry);
@@ -916,8 +926,10 @@ int main(int ac, char** av) {
service::get_local_storage_service().drain_on_shutdown().get();
});
auto stop_view_builder = defer([] {
view_builder.stop().get();
auto stop_view_builder = defer([cfg] {
if (cfg->view_building()) {
view_builder.stop().get();
}
});
auto stop_compaction_manager = defer([&db] {


@@ -23,7 +23,6 @@
#include "database.hh"
#include "frozen_mutation.hh"
#include "partition_snapshot_reader.hh"
#include "schema_upgrader.hh"
#include "partition_builder.hh"
void memtable::memtable_encoding_stats_collector::update_timestamp(api::timestamp_type ts) {
@@ -429,11 +428,8 @@ public:
bool digest_requested = _slice.options.contains<query::partition_slice::option::with_digest>();
auto mpsr = make_partition_snapshot_flat_reader(snp_schema, std::move(key_and_snp->first), std::move(cr),
std::move(key_and_snp->second), digest_requested, region(), read_section(), mtbl(), streamed_mutation::forwarding::no);
if (snp_schema->version() != schema()->version()) {
_delegate = transform(std::move(mpsr), schema_upgrader(schema()));
} else {
_delegate = std::move(mpsr);
}
mpsr.upgrade_schema(schema());
_delegate = std::move(mpsr);
} else {
_end_of_stream = true;
}
@@ -588,11 +584,8 @@ private:
auto snp_schema = key_and_snp->second->schema();
auto mpsr = make_partition_snapshot_flat_reader<partition_snapshot_accounter>(snp_schema, std::move(key_and_snp->first), std::move(cr),
std::move(key_and_snp->second), false, region(), read_section(), mtbl(), streamed_mutation::forwarding::no, *snp_schema, _flushed_memory);
if (snp_schema->version() != schema()->version()) {
_partition_reader = transform(std::move(mpsr), schema_upgrader(schema()));
} else {
_partition_reader = std::move(mpsr);
}
mpsr.upgrade_schema(schema());
_partition_reader = std::move(mpsr);
}
}
public:
@@ -668,11 +661,8 @@ memtable::make_flat_reader(schema_ptr s,
bool digest_requested = slice.options.contains<query::partition_slice::option::with_digest>();
auto rd = make_partition_snapshot_flat_reader(snp_schema, std::move(dk), std::move(cr), std::move(snp), digest_requested,
*this, _read_section, shared_from_this(), fwd);
if (snp_schema->version() != s->version()) {
return transform(std::move(rd), schema_upgrader(s));
} else {
return rd;
}
rd.upgrade_schema(s);
return rd;
} else {
auto res = make_flat_mutation_reader<scanning_reader>(std::move(s), shared_from_this(), range, slice, pc, fwd_mr);
if (fwd == streamed_mutation::forwarding::yes) {
@@ -787,13 +777,19 @@ bool memtable::is_flushed() const {
return bool(_underlying);
}
void memtable_entry::upgrade_schema(const schema_ptr& s, mutation_cleaner& cleaner) {
if (_schema != s) {
partition().upgrade(_schema, s, cleaner, no_cache_tracker);
_schema = s;
}
}
void memtable::upgrade_entry(memtable_entry& e) {
if (e._schema != _schema) {
assert(!reclaiming_enabled());
with_allocator(allocator(), [this, &e] {
with_linearized_managed_bytes([&] {
e.partition().upgrade(e._schema, _schema, cleaner(), no_cache_tracker);
e._schema = _schema;
e.upgrade_schema(_schema, cleaner());
});
});
}


@@ -69,6 +69,10 @@ public:
schema_ptr& schema() { return _schema; }
partition_snapshot_ptr snapshot(memtable& mtbl);
// Makes the entry conform to given schema.
// Must be called under allocating section of the region which owns the entry.
void upgrade_schema(const schema_ptr&, mutation_cleaner&);
size_t external_memory_usage_without_rows() const {
return _key.key().external_memory_usage();
}


@@ -89,6 +89,7 @@
#include "frozen_mutation.hh"
#include "flat_mutation_reader.hh"
#include "streaming/stream_manager.hh"
#include "streaming/stream_mutation_fragments_cmd.hh"
namespace netw {
@@ -287,7 +288,6 @@ void messaging_service::start_listen() {
if (_compress_what != compress_what::none) {
so.compressor_factory = &compressor_factory;
}
so.streaming_domain = rpc::streaming_domain_type(0x55AA);
so.load_balancing_algorithm = server_socket::load_balancing_algorithm::port;
// FIXME: we don't set so.tcp_nodelay, because we can't tell at this point whether the connection will come from a
@@ -295,19 +295,21 @@ void messaging_service::start_listen() {
// the first by wrapping its server_socket, but not the second.
auto limits = rpc_resource_limits(_mcfg.rpc_memory_limit);
if (!_server[0]) {
auto listen = [&] (const gms::inet_address& a) {
auto listen = [&] (const gms::inet_address& a, rpc::streaming_domain_type sdomain) {
so.streaming_domain = sdomain;
auto addr = ipv4_addr{a.raw_addr(), _port};
return std::unique_ptr<rpc_protocol_server_wrapper>(new rpc_protocol_server_wrapper(*_rpc,
so, addr, limits));
};
_server[0] = listen(_listen_address);
_server[0] = listen(_listen_address, rpc::streaming_domain_type(0x55AA));
if (listen_to_bc) {
_server[1] = listen(utils::fb_utilities::get_broadcast_address());
_server[1] = listen(utils::fb_utilities::get_broadcast_address(), rpc::streaming_domain_type(0x66BB));
}
}
if (!_server_tls[0]) {
auto listen = [&] (const gms::inet_address& a) {
auto listen = [&] (const gms::inet_address& a, rpc::streaming_domain_type sdomain) {
so.streaming_domain = sdomain;
return std::unique_ptr<rpc_protocol_server_wrapper>(
[this, &so, &a, limits] () -> std::unique_ptr<rpc_protocol_server_wrapper>{
if (_encrypt_what == encrypt_what::none) {
@@ -321,9 +323,9 @@ void messaging_service::start_listen() {
so, seastar::tls::listen(_credentials, addr, lo), limits);
}());
};
_server_tls[0] = listen(_listen_address);
_server_tls[0] = listen(_listen_address, rpc::streaming_domain_type(0x77CC));
if (listen_to_bc) {
_server_tls[1] = listen(utils::fb_utilities::get_broadcast_address());
_server_tls[1] = listen(utils::fb_utilities::get_broadcast_address(), rpc::streaming_domain_type(0x88DD));
}
}
// Do this on just cpu 0, to avoid duplicate logs.
@@ -607,6 +609,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
opts.compressor_factory = &compressor_factory;
}
opts.tcp_nodelay = must_tcp_nodelay;
opts.reuseaddr = true;
auto client = must_encrypt ?
::make_shared<rpc_protocol_client_wrapper>(*_rpc, std::move(opts),
@@ -668,24 +671,24 @@ std::unique_ptr<messaging_service::rpc_protocol_wrapper>& messaging_service::rpc
return _rpc;
}
rpc::sink<int32_t> messaging_service::make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment>& source) {
rpc::sink<int32_t> messaging_service::make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>>& source) {
return source.make_sink<netw::serializer, int32_t>();
}
future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>>
future<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>>
messaging_service::make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id) {
auto rpc_client = get_rpc_client(messaging_verb::STREAM_MUTATION_FRAGMENTS, id);
return rpc_client->make_stream_sink<netw::serializer, frozen_mutation_fragment>().then([this, plan_id, schema_id, cf_id, estimated_partitions, reason, rpc_client] (rpc::sink<frozen_mutation_fragment> sink) mutable {
auto rpc_handler = rpc()->make_client<rpc::source<int32_t> (utils::UUID, utils::UUID, utils::UUID, uint64_t, streaming::stream_reason, rpc::sink<frozen_mutation_fragment>)>(messaging_verb::STREAM_MUTATION_FRAGMENTS);
return rpc_client->make_stream_sink<netw::serializer, frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>().then([this, plan_id, schema_id, cf_id, estimated_partitions, reason, rpc_client] (rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd> sink) mutable {
auto rpc_handler = rpc()->make_client<rpc::source<int32_t> (utils::UUID, utils::UUID, utils::UUID, uint64_t, streaming::stream_reason, rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>)>(messaging_verb::STREAM_MUTATION_FRAGMENTS);
return rpc_handler(*rpc_client , plan_id, schema_id, cf_id, estimated_partitions, reason, sink).then_wrapped([sink, rpc_client] (future<rpc::source<int32_t>> source) mutable {
return (source.failed() ? sink.close() : make_ready_future<>()).then([sink = std::move(sink), source = std::move(source)] () mutable {
return make_ready_future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>>(std::move(sink), std::move(source.get0()));
return make_ready_future<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>>(std::move(sink), std::move(source.get0()));
});
});
});
}
void messaging_service::register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason>, rpc::source<frozen_mutation_fragment> source)>&& func) {
void messaging_service::register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason>, rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>> source)>&& func) {
register_handler(this, messaging_verb::STREAM_MUTATION_FRAGMENTS, std::move(func));
}
@@ -1077,14 +1080,14 @@ future<> messaging_service::send_repair_put_row_diff(msg_addr id, uint32_t repai
}
// Wrapper for REPAIR_ROW_LEVEL_START
void messaging_service::register_repair_row_level_start(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name)>&& func) {
void messaging_service::register_repair_row_level_start(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version)>&& func) {
register_handler(this, messaging_verb::REPAIR_ROW_LEVEL_START, std::move(func));
}
void messaging_service::unregister_repair_row_level_start() {
_rpc->unregister_handler(messaging_verb::REPAIR_ROW_LEVEL_START);
}
future<> messaging_service::send_repair_row_level_start(msg_addr id, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name) {
return send_message<void>(this, messaging_verb::REPAIR_ROW_LEVEL_START, std::move(id), repair_meta_id, std::move(keyspace_name), std::move(cf_name), std::move(range), algo, max_row_buf_size, seed, remote_shard, remote_shard_count, remote_ignore_msb, std::move(remote_partitioner_name));
future<> messaging_service::send_repair_row_level_start(msg_addr id, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version) {
return send_message<void>(this, messaging_verb::REPAIR_ROW_LEVEL_START, std::move(id), repair_meta_id, std::move(keyspace_name), std::move(cf_name), std::move(range), algo, max_row_buf_size, seed, remote_shard, remote_shard_count, remote_ignore_msb, std::move(remote_partitioner_name), std::move(schema_version));
}
// Wrapper for REPAIR_ROW_LEVEL_STOP


@@ -36,6 +36,7 @@
#include "tracing/tracing.hh"
#include "digest_algorithm.hh"
#include "streaming/stream_reason.hh"
#include "streaming/stream_mutation_fragments_cmd.hh"
#include "cache_temperature.hh"
#include <list>
@@ -270,9 +271,9 @@ public:
// Wrapper for STREAM_MUTATION_FRAGMENTS
// The receiver of STREAM_MUTATION_FRAGMENTS sends status code to the sender to notify any error on the receiver side. The status code is of type int32_t. 0 means successful, -1 means error, other status code value are reserved for future use.
void register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason> reason_opt, rpc::source<frozen_mutation_fragment> source)>&& func);
rpc::sink<int32_t> make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment>& source);
future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>> make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id);
void register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason> reason_opt, rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>> source)>&& func);
rpc::sink<int32_t> make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>>& source);
future<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>> make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id);
void register_stream_mutation_done(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id)>&& func);
future<> send_stream_mutation_done(msg_addr id, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id);
@@ -311,9 +312,9 @@ public:
future<> send_repair_put_row_diff(msg_addr id, uint32_t repair_meta_id, repair_rows_on_wire row_diff);
// Wrapper for REPAIR_ROW_LEVEL_START
void register_repair_row_level_start(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name)>&& func);
void register_repair_row_level_start(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version)>&& func);
void unregister_repair_row_level_start();
future<> send_repair_row_level_start(msg_addr id, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name);
future<> send_repair_row_level_start(msg_addr id, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version);
// Wrapper for REPAIR_ROW_LEVEL_STOP
void register_repair_row_level_stop(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range)>&& func);


@@ -145,7 +145,14 @@ mutation_partition::mutation_partition(const schema& s, const mutation_partition
, _static_row(s, column_kind::static_column, x._static_row)
, _static_row_continuous(x._static_row_continuous)
, _rows()
, _row_tombstones(x._row_tombstones) {
, _row_tombstones(x._row_tombstones)
#ifdef SEASTAR_DEBUG
, _schema_version(s.version())
#endif
{
#ifdef SEASTAR_DEBUG
assert(x._schema_version == _schema_version);
#endif
auto cloner = [&s] (const auto& x) {
return current_allocator().construct<rows_entry>(s, x);
};
@@ -158,7 +165,14 @@ mutation_partition::mutation_partition(const mutation_partition& x, const schema
, _static_row(schema, column_kind::static_column, x._static_row)
, _static_row_continuous(x._static_row_continuous)
, _rows()
, _row_tombstones(x._row_tombstones, range_tombstone_list::copy_comparator_only()) {
, _row_tombstones(x._row_tombstones, range_tombstone_list::copy_comparator_only())
#ifdef SEASTAR_DEBUG
, _schema_version(schema.version())
#endif
{
#ifdef SEASTAR_DEBUG
assert(x._schema_version == _schema_version);
#endif
try {
for(auto&& r : ck_ranges) {
for (const rows_entry& e : x.range(schema, r)) {
@@ -181,7 +195,13 @@ mutation_partition::mutation_partition(mutation_partition&& x, const schema& sch
, _static_row_continuous(x._static_row_continuous)
, _rows(std::move(x._rows))
, _row_tombstones(std::move(x._row_tombstones))
#ifdef SEASTAR_DEBUG
, _schema_version(schema.version())
#endif
{
#ifdef SEASTAR_DEBUG
assert(x._schema_version == _schema_version);
#endif
{
auto deleter = current_deleter<rows_entry>();
auto it = _rows.begin();
@@ -221,6 +241,7 @@ mutation_partition::operator=(mutation_partition&& x) noexcept {
}
void mutation_partition::ensure_last_dummy(const schema& s) {
check_schema(s);
if (_rows.empty() || !_rows.rbegin()->is_last_dummy()) {
_rows.insert_before(_rows.end(),
*current_allocator().construct<rows_entry>(s, rows_entry::last_dummy_tag(), is_continuous::yes));
@@ -277,11 +298,16 @@ void deletable_row::apply(const schema& s, clustering_row cr) {
void
mutation_partition::apply(const schema& s, const mutation_fragment& mf) {
check_schema(s);
mutation_fragment_applier applier{s, *this};
mf.visit(applier);
}
stop_iteration mutation_partition::apply_monotonically(const schema& s, mutation_partition&& p, cache_tracker* tracker, is_preemptible preemptible) {
#ifdef SEASTAR_DEBUG
assert(s.version() == _schema_version);
assert(p._schema_version == _schema_version);
#endif
_tombstone.apply(p._tombstone);
_static_row.apply_monotonically(s, column_kind::static_column, std::move(p._static_row));
_static_row_continuous |= p._static_row_continuous;
@@ -387,6 +413,7 @@ void mutation_partition::apply_weak(const schema& s, mutation_partition&& p) {
tombstone
mutation_partition::range_tombstone_for_row(const schema& schema, const clustering_key& key) const {
check_schema(schema);
tombstone t = _tombstone;
if (!_row_tombstones.empty()) {
auto found = _row_tombstones.search_tombstone_covering(schema, key);
@@ -397,6 +424,7 @@ mutation_partition::range_tombstone_for_row(const schema& schema, const clusteri
row_tombstone
mutation_partition::tombstone_for_row(const schema& schema, const clustering_key& key) const {
check_schema(schema);
row_tombstone t = row_tombstone(range_tombstone_for_row(schema, key));
auto j = _rows.find(key, rows_entry::compare(schema));
@@ -409,6 +437,7 @@ mutation_partition::tombstone_for_row(const schema& schema, const clustering_key
row_tombstone
mutation_partition::tombstone_for_row(const schema& schema, const rows_entry& e) const {
check_schema(schema);
row_tombstone t = e.row().deleted_at();
t.apply(range_tombstone_for_row(schema, e.key()));
return t;
@@ -416,6 +445,7 @@ mutation_partition::tombstone_for_row(const schema& schema, const rows_entry& e)
void
mutation_partition::apply_row_tombstone(const schema& schema, clustering_key_prefix prefix, tombstone t) {
check_schema(schema);
assert(!prefix.is_full(schema));
auto start = prefix;
_row_tombstones.apply(schema, {std::move(start), std::move(prefix), std::move(t)});
@@ -423,11 +453,13 @@ mutation_partition::apply_row_tombstone(const schema& schema, clustering_key_pre
void
mutation_partition::apply_row_tombstone(const schema& schema, range_tombstone rt) {
check_schema(schema);
_row_tombstones.apply(schema, std::move(rt));
}
void
mutation_partition::apply_delete(const schema& schema, const clustering_key_prefix& prefix, tombstone t) {
check_schema(schema);
if (prefix.is_empty(schema)) {
apply(t);
} else if (prefix.is_full(schema)) {
@@ -439,6 +471,7 @@ mutation_partition::apply_delete(const schema& schema, const clustering_key_pref
void
mutation_partition::apply_delete(const schema& schema, range_tombstone rt) {
check_schema(schema);
if (range_tombstone::is_single_clustering_row_tombstone(schema, rt.start, rt.start_kind, rt.end, rt.end_kind)) {
apply_delete(schema, std::move(rt.start), std::move(rt.tomb));
return;
@@ -448,6 +481,7 @@ mutation_partition::apply_delete(const schema& schema, range_tombstone rt) {
void
mutation_partition::apply_delete(const schema& schema, clustering_key&& prefix, tombstone t) {
check_schema(schema);
if (prefix.is_empty(schema)) {
apply(t);
} else if (prefix.is_full(schema)) {
@@ -459,6 +493,7 @@ mutation_partition::apply_delete(const schema& schema, clustering_key&& prefix,
void
mutation_partition::apply_delete(const schema& schema, clustering_key_prefix_view prefix, tombstone t) {
check_schema(schema);
if (prefix.is_empty(schema)) {
apply(t);
} else if (prefix.is_full(schema)) {
@@ -484,6 +519,7 @@ void mutation_partition::insert_row(const schema& s, const clustering_key& key,
}
void mutation_partition::insert_row(const schema& s, const clustering_key& key, const deletable_row& row) {
check_schema(s);
auto e = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(s, key, row));
_rows.insert(_rows.end(), *e, rows_entry::compare(s));
@@ -492,6 +528,7 @@ void mutation_partition::insert_row(const schema& s, const clustering_key& key,
const row*
mutation_partition::find_row(const schema& s, const clustering_key& key) const {
check_schema(s);
auto i = _rows.find(key, rows_entry::compare(s));
if (i == _rows.end()) {
return nullptr;
@@ -501,6 +538,7 @@ mutation_partition::find_row(const schema& s, const clustering_key& key) const {
deletable_row&
mutation_partition::clustered_row(const schema& s, clustering_key&& key) {
check_schema(s);
auto i = _rows.find(key, rows_entry::compare(s));
if (i == _rows.end()) {
auto e = alloc_strategy_unique_ptr<rows_entry>(
@@ -513,6 +551,7 @@ mutation_partition::clustered_row(const schema& s, clustering_key&& key) {
deletable_row&
mutation_partition::clustered_row(const schema& s, const clustering_key& key) {
check_schema(s);
auto i = _rows.find(key, rows_entry::compare(s));
if (i == _rows.end()) {
auto e = alloc_strategy_unique_ptr<rows_entry>(
@@ -525,6 +564,7 @@ mutation_partition::clustered_row(const schema& s, const clustering_key& key) {
deletable_row&
mutation_partition::clustered_row(const schema& s, clustering_key_view key) {
check_schema(s);
auto i = _rows.find(key, rows_entry::compare(s));
if (i == _rows.end()) {
auto e = alloc_strategy_unique_ptr<rows_entry>(
@@ -537,6 +577,7 @@ mutation_partition::clustered_row(const schema& s, clustering_key_view key) {
deletable_row&
mutation_partition::clustered_row(const schema& s, position_in_partition_view pos, is_dummy dummy, is_continuous continuous) {
check_schema(s);
auto i = _rows.find(pos, rows_entry::compare(s));
if (i == _rows.end()) {
auto e = alloc_strategy_unique_ptr<rows_entry>(
@@ -549,6 +590,7 @@ mutation_partition::clustered_row(const schema& s, position_in_partition_view po
mutation_partition::rows_type::const_iterator
mutation_partition::lower_bound(const schema& schema, const query::clustering_range& r) const {
check_schema(schema);
if (!r.start()) {
return std::cbegin(_rows);
}
@@ -557,6 +599,7 @@ mutation_partition::lower_bound(const schema& schema, const query::clustering_ra
mutation_partition::rows_type::const_iterator
mutation_partition::upper_bound(const schema& schema, const query::clustering_range& r) const {
check_schema(schema);
if (!r.end()) {
return std::cend(_rows);
}
@@ -565,6 +608,7 @@ mutation_partition::upper_bound(const schema& schema, const query::clustering_ra
boost::iterator_range<mutation_partition::rows_type::const_iterator>
mutation_partition::range(const schema& schema, const query::clustering_range& r) const {
check_schema(schema);
return boost::make_iterator_range(lower_bound(schema, r), upper_bound(schema, r));
}
@@ -601,6 +645,7 @@ mutation_partition::upper_bound(const schema& schema, const query::clustering_ra
template<typename Func>
void mutation_partition::for_each_row(const schema& schema, const query::clustering_range& row_range, bool reversed, Func&& func) const
{
check_schema(schema);
auto r = range(schema, row_range);
if (!reversed) {
for (const auto& e : r) {
@@ -817,6 +862,7 @@ bool has_any_live_data(const schema& s, column_kind kind, const row& cells, tomb
void
mutation_partition::query_compacted(query::result::partition_writer& pw, const schema& s, uint32_t limit) const {
check_schema(s);
const query::partition_slice& slice = pw.slice();
max_timestamp max_ts{pw.last_modified()};
@@ -1049,6 +1095,10 @@ bool mutation_partition::equal(const schema& s, const mutation_partition& p) con
}
bool mutation_partition::equal(const schema& this_schema, const mutation_partition& p, const schema& p_schema) const {
#ifdef SEASTAR_DEBUG
assert(_schema_version == this_schema.version());
assert(p._schema_version == p_schema.version());
#endif
if (_tombstone != p._tombstone) {
return false;
}
@@ -1177,6 +1227,7 @@ row::apply_monotonically(const column_definition& column, atomic_cell_or_collect
void
row::append_cell(column_id id, atomic_cell_or_collection value) {
if (_type == storage_type::vector && id < max_vector_size) {
assert(_storage.vector.v.size() <= id);
_storage.vector.v.resize(id);
_storage.vector.v.emplace_back(cell_and_hash{std::move(value), cell_hash_opt()});
_storage.vector.present.set(id);
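The `append_cell` path above grows the dense cell vector up to the new column id and flags the slot in a presence bitmap. A stripped-down sketch of that storage scheme, with a fixed capacity and `std::string` cells (all names here are illustrative, not Scylla's types):

```cpp
#include <bitset>
#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

// Stripped-down sketch of row storage as a dense vector plus a presence
// bitmap. Appending at column id `id` requires the vector to currently
// end at or before `id`, mirroring the assert added in the diff above.
constexpr std::size_t max_vector_size = 32;

struct toy_row {
    std::vector<std::string> cells;
    std::bitset<max_vector_size> present;

    void append_cell(std::size_t id, std::string value) {
        assert(cells.size() <= id);   // may only append at or past the end
        cells.resize(id);             // pad unset slots up to `id`
        cells.emplace_back(std::move(value));
        present.set(id);
    }

    bool has_cell(std::size_t id) const {
        return id < cells.size() && present.test(id);
    }
};
```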
@@ -1241,6 +1292,7 @@ size_t rows_entry::memory_usage(const schema& s) const {
}
size_t mutation_partition::external_memory_usage(const schema& s) const {
check_schema(s);
size_t sum = 0;
sum += static_row().external_memory_usage(s, column_kind::static_column);
for (auto& clr : clustered_rows()) {
@@ -1259,6 +1311,7 @@ void mutation_partition::trim_rows(const schema& s,
const std::vector<query::clustering_range>& row_ranges,
Func&& func)
{
check_schema(s);
static_assert(std::is_same<stop_iteration, std::result_of_t<Func(rows_entry&)>>::value, "Bad func signature");
stop_iteration stop = stop_iteration::no;
@@ -1303,6 +1356,7 @@ uint32_t mutation_partition::do_compact(const schema& s,
uint32_t row_limit,
can_gc_fn& can_gc)
{
check_schema(s);
assert(row_limit > 0);
auto gc_before = saturating_subtract(query_time, s.gc_grace_seconds());
@@ -1368,12 +1422,14 @@ mutation_partition::compact_for_query(
bool reverse,
uint32_t row_limit)
{
check_schema(s);
return do_compact(s, query_time, row_ranges, reverse, row_limit, always_gc);
}
void mutation_partition::compact_for_compaction(const schema& s,
can_gc_fn& can_gc, gc_clock::time_point compaction_time)
{
check_schema(s);
static const std::vector<query::clustering_range> all_rows = {
query::clustering_range::make_open_ended_both_sides()
};
@@ -1407,11 +1463,13 @@ row::is_live(const schema& s, column_kind kind, tombstone base_tombstone, gc_clo
bool
mutation_partition::is_static_row_live(const schema& s, gc_clock::time_point query_time) const {
check_schema(s);
return has_any_live_data(s, column_kind::static_column, static_row(), _tombstone, query_time);
}
size_t
mutation_partition::live_row_count(const schema& s, gc_clock::time_point query_time) const {
check_schema(s);
size_t count = 0;
for (const rows_entry& e : non_dummy_rows()) {
@@ -1757,6 +1815,7 @@ row row::difference(const schema& s, column_kind kind, const row& other) const
mutation_partition mutation_partition::difference(schema_ptr s, const mutation_partition& other) const
{
check_schema(*s);
mutation_partition mp(s);
if (_tombstone > other._tombstone) {
mp.apply(_tombstone);
@@ -1787,6 +1846,7 @@ mutation_partition mutation_partition::difference(schema_ptr s, const mutation_p
}
void mutation_partition::accept(const schema& s, mutation_partition_visitor& v) const {
check_schema(s);
v.accept_partition_tombstone(_tombstone);
_static_row.for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
const column_definition& def = s.static_column_at(id);
@@ -2200,6 +2260,9 @@ mutation_partition::mutation_partition(mutation_partition::incomplete_tag, const
, _static_row_continuous(!s.has_static_columns())
, _rows()
, _row_tombstones(s)
#ifdef SEASTAR_DEBUG
, _schema_version(s.version())
#endif
{
_rows.insert_before(_rows.end(),
*current_allocator().construct<rows_entry>(s, rows_entry::last_dummy_tag(), is_continuous::no));
@@ -2265,6 +2328,7 @@ void mutation_partition::set_continuity(const schema& s, const position_range& p
}
clustering_interval_set mutation_partition::get_continuity(const schema& s, is_continuous cont) const {
check_schema(s);
clustering_interval_set result;
auto i = _rows.begin();
auto prev_pos = position_in_partition::before_all_clustered_rows();
@@ -2314,6 +2378,7 @@ stop_iteration mutation_partition::clear_gently(cache_tracker* tracker) noexcept
bool
mutation_partition::check_continuity(const schema& s, const position_range& r, is_continuous cont) const {
check_schema(s);
auto less = rows_entry::compare(s);
auto i = _rows.lower_bound(r.start(), less);
auto end = _rows.lower_bound(r.end(), less);

View File

@@ -397,7 +397,7 @@ public:
if (is_missing() || _ttl == dead) {
return false;
}
- if (_ttl != no_ttl && _expiry < now) {
+ if (_ttl != no_ttl && _expiry <= now) {
return false;
}
return _timestamp > t.timestamp;
@@ -407,7 +407,7 @@ public:
if (_ttl == dead) {
return true;
}
- return _ttl != no_ttl && _expiry < now;
+ return _ttl != no_ttl && _expiry <= now;
}
// Can be called only when is_live().
bool is_expiring() const {
@@ -447,7 +447,7 @@ public:
_timestamp = api::missing_timestamp;
return false;
}
- if (_ttl > no_ttl && _expiry < now) {
+ if (_ttl > no_ttl && _expiry <= now) {
_expiry -= _ttl;
_ttl = dead;
}
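The three hunks above all change the expiry comparison from `<` to `<=`: a cell whose expiry timestamp equals the query time is already dead. A minimal sketch of the boundary condition, with plain integers standing in for `gc_clock` time points (`is_dead_at` and the `no_ttl` sentinel are illustrative, not Scylla's API):

```cpp
#include <cassert>
#include <cstdint>

// Toy model of the boundary condition fixed above; names are
// illustrative. Plain integers stand in for gc_clock time points.
using time_point = std::int64_t;
constexpr std::int64_t no_ttl = 0; // sentinel: no TTL set

// Dead if the cell has a TTL and its expiry time has been reached.
// Note `<=`: a cell expiring exactly at `now` no longer counts as live.
inline bool is_dead_at(std::int64_t ttl, time_point expiry, time_point now) {
    return ttl != no_ttl && expiry <= now;
}
```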
@@ -940,6 +940,9 @@ private:
// Contains only strict prefixes so that we don't have to lookup full keys
// in both _row_tombstones and _rows.
range_tombstone_list _row_tombstones;
#ifdef SEASTAR_DEBUG
table_schema_version _schema_version;
#endif
friend class mutation_partition_applier;
friend class converting_mutation_partition_applier;
@@ -954,10 +957,16 @@ public:
mutation_partition(schema_ptr s)
: _rows()
, _row_tombstones(*s)
#ifdef SEASTAR_DEBUG
, _schema_version(s->version())
#endif
{ }
mutation_partition(mutation_partition& other, copy_comparators_only)
: _rows()
, _row_tombstones(other._row_tombstones, range_tombstone_list::copy_comparator_only())
#ifdef SEASTAR_DEBUG
, _schema_version(other._schema_version)
#endif
{ }
mutation_partition(mutation_partition&&) = default;
mutation_partition(const schema& s, const mutation_partition&);
@@ -1181,6 +1190,12 @@ private:
template<typename Func>
void for_each_row(const schema& schema, const query::clustering_range& row_range, bool reversed, Func&& func) const;
friend class counter_write_query_result_builder;
void check_schema(const schema& s) const {
#ifdef SEASTAR_DEBUG
assert(s.version() == _schema_version);
#endif
}
};
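The pattern added throughout this file, stash the schema version at construction and call `check_schema()` at every entry point, is a debug-only invariant check that compiles away in release builds. A self-contained sketch of the idiom (the `DEBUG_CHECKS` macro, `schema_t`, and `versioned_container` are illustrative stand-ins for `SEASTAR_DEBUG` and the real types):

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>

#define DEBUG_CHECKS 1 // stand-in for SEASTAR_DEBUG

struct schema_t { std::uint64_t version; };

// Remember which schema version this object was built against, and
// assert that every caller passes the same one. With DEBUG_CHECKS off,
// both the field and the check vanish.
class versioned_container {
#if DEBUG_CHECKS
    std::uint64_t _schema_version;
#endif
public:
    explicit versioned_container(const schema_t& s)
#if DEBUG_CHECKS
        : _schema_version(s.version)
#endif
    { }

    void check_schema(const schema_t& s) const {
#if DEBUG_CHECKS
        assert(s.version == _schema_version);
#endif
        (void)s;
    }

    std::size_t size(const schema_t& s) const {
        check_schema(s); // every schema-taking method validates first
        return 0;
    }
};
```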
inline

View File

@@ -31,7 +31,7 @@ reconcilable_result::reconcilable_result()
: _row_count(0)
{ }
- reconcilable_result::reconcilable_result(uint32_t row_count, std::vector<partition> p, query::short_read short_read,
+ reconcilable_result::reconcilable_result(uint32_t row_count, utils::chunked_vector<partition> p, query::short_read short_read,
query::result_memory_tracker memory_tracker)
: _row_count(row_count)
, _short_read(short_read)
@@ -39,11 +39,11 @@ reconcilable_result::reconcilable_result(uint32_t row_count, std::vector<partiti
, _partitions(std::move(p))
{ }
- const std::vector<partition>& reconcilable_result::partitions() const {
+ const utils::chunked_vector<partition>& reconcilable_result::partitions() const {
return _partitions;
}
- std::vector<partition>& reconcilable_result::partitions() {
+ utils::chunked_vector<partition>& reconcilable_result::partitions() {
return _partitions;
}

View File

@@ -27,6 +27,7 @@
#include "frozen_mutation.hh"
#include "db/timeout_clock.hh"
#include "querier.hh"
#include "utils/chunked_vector.hh"
#include <seastar/core/execution_stage.hh>
class reconcilable_result;
@@ -72,17 +73,17 @@ class reconcilable_result {
uint32_t _row_count;
query::short_read _short_read;
query::result_memory_tracker _memory_tracker;
- std::vector<partition> _partitions;
+ utils::chunked_vector<partition> _partitions;
public:
~reconcilable_result();
reconcilable_result();
reconcilable_result(reconcilable_result&&) = default;
reconcilable_result& operator=(reconcilable_result&&) = default;
- reconcilable_result(uint32_t row_count, std::vector<partition> partitions, query::short_read short_read,
+ reconcilable_result(uint32_t row_count, utils::chunked_vector<partition> partitions, query::short_read short_read,
query::result_memory_tracker memory_tracker = { });
- const std::vector<partition>& partitions() const;
- std::vector<partition>& partitions();
+ const utils::chunked_vector<partition>& partitions() const;
+ utils::chunked_vector<partition>& partitions();
uint32_t row_count() const {
return _row_count;
@@ -112,7 +113,7 @@ class reconcilable_result_builder {
const schema& _schema;
const query::partition_slice& _slice;
- std::vector<partition> _result;
+ utils::chunked_vector<partition> _result;
uint32_t _live_rows{};
bool _has_ck_selector{};

View File

@@ -910,9 +910,10 @@ class shard_reader : public enable_lw_shared_from_this<shard_reader>, public fla
bool _reader_created = false;
bool _drop_partition_start = false;
bool _drop_static_row = false;
+ position_in_partition::tri_compare _tri_cmp;
std::optional<dht::decorated_key> _last_pkey;
- std::optional<position_in_partition> _last_position_in_partition;
+ position_in_partition _next_position_in_partition = position_in_partition::for_partition_start();
// These are used when the reader has to be recreated (after having been
// evicted while paused) and the range and/or slice it is recreated with
// differs from the original ones.
@@ -920,13 +921,13 @@ class shard_reader : public enable_lw_shared_from_this<shard_reader>, public fla
std::optional<query::partition_slice> _slice_override;
private:
- void update_last_position(const circular_buffer<mutation_fragment>& buffer);
+ void update_next_position(flat_mutation_reader& reader, circular_buffer<mutation_fragment>& buffer);
void adjust_partition_slice();
flat_mutation_reader recreate_reader();
flat_mutation_reader resume_or_create_reader();
bool should_drop_fragment(const mutation_fragment& mf);
future<> do_fill_buffer(flat_mutation_reader& reader, db::timeout_clock::time_point timeout);
- future<> ensure_buffer_contains_all_fragments_for_last_pos(flat_mutation_reader& reader, circular_buffer<mutation_fragment>& buffer,
- db::timeout_clock::time_point timeout);
+ future<> fill_buffer(flat_mutation_reader& reader, circular_buffer<mutation_fragment>& buffer, db::timeout_clock::time_point timeout);
public:
remote_reader(
@@ -1024,7 +1025,7 @@ void shard_reader::stop() noexcept {
}).finally([zis = shared_from_this()] {}));
}
- void shard_reader::remote_reader::update_last_position(const circular_buffer<mutation_fragment>& buffer) {
+ void shard_reader::remote_reader::update_next_position(flat_mutation_reader& reader, circular_buffer<mutation_fragment>& buffer) {
if (buffer.empty()) {
return;
}
@@ -1035,7 +1036,31 @@ void shard_reader::remote_reader::update_last_position(const circular_buffer<mut
_last_pkey = pk_it->as_partition_start().key();
}
- _last_position_in_partition.emplace(buffer.back().position());
+ const auto last_pos = buffer.back().position();
+ switch (last_pos.region()) {
+ case partition_region::partition_start:
+ _next_position_in_partition = position_in_partition::for_static_row();
+ break;
+ case partition_region::static_row:
+ _next_position_in_partition = position_in_partition::before_all_clustered_rows();
+ break;
+ case partition_region::clustered:
+ if (reader.is_buffer_empty()) {
+ _next_position_in_partition = position_in_partition::after_key(last_pos);
+ } else {
+ const auto& next_frag = reader.peek_buffer();
+ if (next_frag.is_end_of_partition()) {
+ buffer.emplace_back(reader.pop_mutation_fragment());
+ _next_position_in_partition = position_in_partition::for_partition_start();
+ } else {
+ _next_position_in_partition = position_in_partition(next_frag.position());
+ }
+ }
+ break;
+ case partition_region::partition_end:
+ _next_position_in_partition = position_in_partition::for_partition_start();
+ break;
+ }
}
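`update_next_position` boils down to a small state machine over `partition_region`: given the region of the last buffered fragment, where does the stream resume? A toy transition table for that logic (the clustered case is simplified, since the real code also peeks at the next fragment to choose between `after_key` and the peeked position):

```cpp
#include <cassert>

// Toy transition table for the "resume position" bookkeeping above.
// The clustered case is simplified: the real reader distinguishes
// between resuming after the last key and adopting the peeked
// fragment's position.
enum class partition_region { partition_start, static_row, clustered, partition_end };

inline partition_region next_region(partition_region last) {
    switch (last) {
    case partition_region::partition_start: return partition_region::static_row;
    case partition_region::static_row:      return partition_region::clustered; // before_all_clustered_rows
    case partition_region::clustered:       return partition_region::clustered; // after_key(last)
    case partition_region::partition_end:   return partition_region::partition_start;
    }
    return partition_region::partition_start; // unreachable
}
```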
void shard_reader::remote_reader::adjust_partition_slice() {
@@ -1043,9 +1068,8 @@ void shard_reader::remote_reader::adjust_partition_slice() {
_slice_override = _ps;
}
- auto& last_ckey = _last_position_in_partition->key();
auto ranges = _slice_override->default_row_ranges();
- query::trim_clustering_row_ranges_to(*_schema, ranges, last_ckey);
+ query::trim_clustering_row_ranges_to(*_schema, ranges, _next_position_in_partition);
_slice_override->clear_ranges();
_slice_override->set_range(*_schema, _last_pkey->key(), std::move(ranges));
@@ -1058,25 +1082,22 @@ flat_mutation_reader shard_reader::remote_reader::recreate_reader() {
if (_last_pkey) {
bool partition_range_is_inclusive = true;
- if (_last_position_in_partition) {
- switch (_last_position_in_partition->region()) {
- case partition_region::partition_start:
- _drop_partition_start = true;
- break;
- case partition_region::static_row:
- _drop_partition_start = true;
- _drop_static_row = true;
- break;
- case partition_region::clustered:
- _drop_partition_start = true;
- _drop_static_row = true;
- adjust_partition_slice();
- slice = &*_slice_override;
- break;
- case partition_region::partition_end:
- partition_range_is_inclusive = false;
- break;
- }
+ switch (_next_position_in_partition.region()) {
+ case partition_region::partition_start:
+ partition_range_is_inclusive = false;
+ break;
+ case partition_region::static_row:
+ _drop_partition_start = true;
+ break;
+ case partition_region::clustered:
+ _drop_partition_start = true;
+ _drop_static_row = true;
+ adjust_partition_slice();
+ slice = &*_slice_override;
+ break;
+ case partition_region::partition_end:
+ partition_range_is_inclusive = false;
+ break;
}
// The original range contained a single partition and we've read it
@@ -1115,62 +1136,83 @@ flat_mutation_reader shard_reader::remote_reader::resume_or_create_reader() {
return recreate_reader();
}
bool shard_reader::remote_reader::should_drop_fragment(const mutation_fragment& mf) {
if (_drop_partition_start && mf.is_partition_start()) {
_drop_partition_start = false;
return true;
}
if (_drop_static_row && mf.is_static_row()) {
_drop_static_row = false;
return true;
}
return false;
}
future<> shard_reader::remote_reader::do_fill_buffer(flat_mutation_reader& reader, db::timeout_clock::time_point timeout) {
if (!_drop_partition_start && !_drop_static_row) {
return reader.fill_buffer(timeout);
}
return repeat([this, &reader, timeout] {
return reader.fill_buffer(timeout).then([this, &reader] {
- const auto eos = reader.is_end_of_stream();
- if (reader.is_buffer_empty()) {
- return stop_iteration(eos);
+ while (!reader.is_buffer_empty() && should_drop_fragment(reader.peek_buffer())) {
+ reader.pop_mutation_fragment();
}
- if (_drop_partition_start) {
- _drop_partition_start = false;
- if (reader.peek_buffer().is_partition_start()) {
- reader.pop_mutation_fragment();
- }
- }
- if (reader.is_buffer_empty()) {
- return stop_iteration(eos);
- }
- if (_drop_static_row) {
- _drop_static_row = false;
- if (reader.peek_buffer().is_static_row()) {
- reader.pop_mutation_fragment();
- }
- }
- return stop_iteration(reader.is_buffer_full() || eos);
+ return stop_iteration(reader.is_buffer_full() || reader.is_end_of_stream());
});
});
}
- future<> shard_reader::remote_reader::ensure_buffer_contains_all_fragments_for_last_pos(flat_mutation_reader& reader,
- circular_buffer<mutation_fragment>& buffer, db::timeout_clock::time_point timeout) {
- if (buffer.empty() || !buffer.back().is_range_tombstone()) {
- return make_ready_future<>();
- }
- auto stop = [this, &reader, &buffer] {
+ future<> shard_reader::remote_reader::fill_buffer(flat_mutation_reader& reader, circular_buffer<mutation_fragment>& buffer,
+ db::timeout_clock::time_point timeout) {
+ return do_fill_buffer(reader, timeout).then([this, &reader, &buffer, timeout] {
if (reader.is_buffer_empty()) {
- return reader.is_end_of_stream();
+ return make_ready_future<>();
}
- const auto& next_pos = reader.peek_buffer().position();
- if (next_pos.region() != partition_region::clustered) {
- return true;
- }
- return !next_pos.key().equal(*_schema, buffer.back().position().key());
- };
- return do_until(stop, [this, &reader, &buffer, timeout] {
- if (reader.is_buffer_empty()) {
- return do_fill_buffer(reader, timeout);
- }
- buffer.emplace_back(reader.pop_mutation_fragment());
- return make_ready_future<>();
+ buffer = reader.detach_buffer();
+ auto stop = [this, &reader, &buffer] {
+ // The only problematic fragment kind is the range tombstone.
+ // All other fragment kinds are safe to end the buffer on, and
+ // are guaranteed to represent progress vs. the last buffer fill.
+ if (!buffer.back().is_range_tombstone()) {
+ return true;
+ }
+ if (reader.is_buffer_empty()) {
+ return reader.is_end_of_stream();
+ }
+ const auto& next_pos = reader.peek_buffer().position();
+ // To ensure safe progress we have to ensure the following:
+ //
+ // _next_position_in_partition < buffer.back().position() < next_pos
+ //
+ // * The first condition ensures we made progress since the
+ // last buffer fill. Otherwise we might get into an endless loop if
+ // the reader is recreated after each `fill_buffer()` call.
+ // * The second condition ensures we have seen all fragments
+ // with the same position. Otherwise we might jump over the
+ // remaining fragments with the same position as the last
+ // fragment in the buffer when the reader is recreated.
+ return _tri_cmp(_next_position_in_partition, buffer.back().position()) < 0 && _tri_cmp(buffer.back().position(), next_pos) < 0;
+ };
+ // Read additional fragments until it is safe to stop, if needed.
+ // We have to ensure we stop at a fragment such that if the reader is
+ // evicted and recreated later, we won't be skipping any fragments.
+ // Practically, range tombstones are the only ones that are
+ // problematic to end the buffer on. This is because a range
+ // tombstone can have the same position as multiple following range
+ // tombstones, or as a single following clustering row in the stream.
+ // When a range tombstone is the last in the buffer, we have to continue
+ // to read until we are sure we've read all fragments sharing the same
+ // position, so that we can safely continue reading from after said
+ // position.
+ return do_until(stop, [this, &reader, &buffer, timeout] {
+ if (reader.is_buffer_empty()) {
+ return do_fill_buffer(reader, timeout);
+ }
+ buffer.emplace_back(reader.pop_mutation_fragment());
+ return make_ready_future<>();
+ });
});
+ }).then([this, &reader, &buffer] {
+ update_next_position(reader, buffer);
+ });
}
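The stop condition's two strict comparisons can be seen in isolation: stopping on the last buffered position is only safe if we moved past the previous resume position (so a recreated reader cannot loop forever) and the upcoming fragment's position is strictly greater (so every fragment sharing the last position is already buffered and a recreated reader cannot skip one). A toy version with integer positions (`safe_to_stop` and the flat `tri_cmp` are illustrative stand-ins for the real comparator):

```cpp
#include <cassert>

// Toy version of the buffer stop condition above, with plain ints for
// positions and a three-way compare standing in for
// position_in_partition::tri_compare.
inline int tri_cmp(int a, int b) { return (a > b) - (a < b); }

// prev: resume position recorded after the previous fill
// last: position of the last fragment currently in the buffer
// next: position of the upcoming (peeked) fragment
inline bool safe_to_stop(int prev, int last, int next) {
    return tri_cmp(prev, last) < 0 && tri_cmp(last, next) < 0;
}
```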
@@ -1188,7 +1230,8 @@ shard_reader::remote_reader::remote_reader(
, _ps(ps)
, _pc(pc)
, _trace_state(std::move(trace_state))
, _fwd_mr(fwd_mr) {
, _fwd_mr(fwd_mr)
, _tri_cmp(*_schema) {
}
future<shard_reader::fill_buffer_result> shard_reader::remote_reader::fill_buffer(const dht::partition_range& pr, bool pending_next_partition,
@@ -1196,7 +1239,7 @@ future<shard_reader::fill_buffer_result> shard_reader::remote_reader::fill_buffe
// We could have missed a `fast_forward_to()` if the reader wasn't created yet.
_pr = &pr;
if (pending_next_partition) {
- _last_position_in_partition = position_in_partition(position_in_partition::end_of_partition_tag_t{});
+ _next_position_in_partition = position_in_partition::for_partition_start();
}
return do_with(resume_or_create_reader(), circular_buffer<mutation_fragment>{},
[this, pending_next_partition, timeout] (flat_mutation_reader& reader, circular_buffer<mutation_fragment>& buffer) mutable {
@@ -1204,22 +1247,8 @@ future<shard_reader::fill_buffer_result> shard_reader::remote_reader::fill_buffe
reader.next_partition();
}
- return do_fill_buffer(reader, timeout).then([this, &reader, &buffer, timeout] {
- buffer = reader.detach_buffer();
- // When the reader is recreated (after having been evicted) we
- // recreate it such that it starts reading from *after* the last
- // seen fragment's position. If the last seen fragment is a range
- // tombstone it is *not* guaranteed that the next fragments in the
- // data stream have positions strictly greater than the range
- // tombstone's. If the reader is evicted and has to be recreated,
- // these fragments would be then skipped as the read would continue
- // after their position.
- // To avoid this ensure that the buffer contains *all* fragments for
- // the last seen position.
- return ensure_buffer_contains_all_fragments_for_last_pos(reader, buffer, timeout);
- }).then([this, &reader, &buffer] {
+ return fill_buffer(reader, buffer, timeout).then([this, &reader, &buffer] {
const auto eos = reader.is_end_of_stream() && reader.is_buffer_empty();
- update_last_position(buffer);
_irh = _lifecycle_policy.pause(std::move(reader));
return fill_buffer_result(std::move(buffer), eos);
});
@@ -1229,7 +1258,7 @@ future<shard_reader::fill_buffer_result> shard_reader::remote_reader::fill_buffe
future<> shard_reader::remote_reader::fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) {
_pr = &pr;
_last_pkey.reset();
- _last_position_in_partition.reset();
+ _next_position_in_partition = position_in_partition::for_partition_start();
if (!_reader_created || !_irh) {
return make_ready_future<>();

View File

@@ -172,6 +172,9 @@ tombstone partition_entry::partition_tombstone() const {
partition_snapshot::~partition_snapshot() {
with_allocator(region().allocator(), [this] {
if (_locked) {
touch();
}
if (_version && _version.is_unique_owner()) {
auto v = &*_version;
_version = {};
@@ -268,6 +271,7 @@ partition_entry::~partition_entry() {
return;
}
if (_snapshot) {
assert(!_snapshot->is_locked());
_snapshot->_version = std::move(_version);
_snapshot->_version.mark_as_unique_owner();
_snapshot->_entry = nullptr;
@@ -284,6 +288,7 @@ stop_iteration partition_entry::clear_gently(cache_tracker* tracker) noexcept {
}
if (_snapshot) {
assert(!_snapshot->is_locked());
_snapshot->_version = std::move(_version);
_snapshot->_version.mark_as_unique_owner();
_snapshot->_entry = nullptr;
@@ -311,6 +316,7 @@ stop_iteration partition_entry::clear_gently(cache_tracker* tracker) noexcept {
void partition_entry::set_version(partition_version* new_version)
{
if (_snapshot) {
assert(!_snapshot->is_locked());
_snapshot->_version = std::move(_version);
_snapshot->_entry = nullptr;
}
@@ -338,7 +344,7 @@ partition_version& partition_entry::add_version(const schema& s, cache_tracker*
void partition_entry::apply(const schema& s, const mutation_partition& mp, const schema& mp_schema)
{
- apply(s, mutation_partition(s, mp), mp_schema);
+ apply(s, mutation_partition(mp_schema, mp), mp_schema);
}
void partition_entry::apply(const schema& s, mutation_partition&& mp, const schema& mp_schema)
@@ -459,7 +465,6 @@ public:
coroutine partition_entry::apply_to_incomplete(const schema& s,
partition_entry&& pe,
const schema& pe_schema,
- mutation_cleaner& pe_cleaner,
logalloc::allocating_section& alloc,
logalloc::region& reg,
@@ -479,10 +484,6 @@ coroutine partition_entry::apply_to_incomplete(const schema& s,
// partitions where I saw 40% slow down.
const bool preemptible = s.clustering_key_size() > 0;
- if (s.version() != pe_schema.version()) {
- pe.upgrade(pe_schema.shared_from_this(), s.shared_from_this(), pe_cleaner, no_cache_tracker);
- }
// When preemptible, later memtable reads could start using the snapshot before
// snapshot's writes are made visible in cache, which would cause them to miss those writes.
// So we cannot allow erasing when preemptible.
@@ -496,6 +497,7 @@ coroutine partition_entry::apply_to_incomplete(const schema& s,
prev_snp = read(reg, tracker.cleaner(), s.shared_from_this(), &tracker, phase - 1);
}
auto dst_snp = read(reg, tracker.cleaner(), s.shared_from_this(), &tracker, phase);
dst_snp->lock();
// Once we start updating the partition, we must keep all snapshots until the update completes,
// otherwise partial writes would be published. So the scope of snapshots must enclose the scope
@@ -570,6 +572,7 @@ coroutine partition_entry::apply_to_incomplete(const schema& s,
auto has_next = src_cur.erase_and_advance();
acc.unpin_memory(size);
if (!has_next) {
dst_snp->unlock();
return stop_iteration::yes;
}
} while (!preemptible || !need_preempt());
@@ -661,6 +664,18 @@ partition_snapshot::range_tombstones()
position_in_partition_view::after_all_clustered_rows());
}
void partition_snapshot::touch() noexcept {
// Eviction assumes that older versions are evicted before newer so only the latest snapshot
// can be touched.
if (_tracker && at_latest_version()) {
auto&& rows = version()->partition().clustered_rows();
assert(!rows.empty());
rows_entry& last_dummy = *rows.rbegin();
assert(last_dummy.is_last_dummy());
_tracker->touch(last_dummy);
}
}
std::ostream& operator<<(std::ostream& out, const partition_entry::printer& p) {
auto& e = p._partition_entry;
out << "{";
@@ -688,6 +703,7 @@ void partition_entry::evict(mutation_cleaner& cleaner) noexcept {
return;
}
if (_snapshot) {
assert(!_snapshot->is_locked());
_snapshot->_version = std::move(_version);
_snapshot->_version.mark_as_unique_owner();
_snapshot->_entry = nullptr;
@@ -707,3 +723,18 @@ partition_snapshot_ptr::~partition_snapshot_ptr() {
}
}
}
void partition_snapshot::lock() noexcept {
// partition_entry::is_locked() assumes that if there is a locked snapshot,
// it can be found attached directly to it.
assert(at_latest_version());
_locked = true;
}
void partition_snapshot::unlock() noexcept {
// Locked snapshots must always be latest, is_locked() assumes that.
// Also, touch() is only effective when this snapshot is latest.
assert(at_latest_version());
_locked = false;
touch(); // Make the entry evictable again in case it was fully unlinked by eviction attempt.
}

View File

@@ -303,6 +303,7 @@ private:
mutation_cleaner* _cleaner;
cache_tracker* _tracker;
boost::intrusive::slist_member_hook<> _cleaner_hook;
bool _locked = false;
friend class partition_entry;
friend class mutation_cleaner_impl;
public:
@@ -318,6 +319,22 @@ public:
partition_snapshot& operator=(const partition_snapshot&) = delete;
partition_snapshot& operator=(partition_snapshot&&) = delete;
// Makes the snapshot locked.
// See is_locked() for meaning.
// Can be called only when at_latest_version(). The snapshot must remain latest as long as it's locked.
void lock() noexcept;
// Makes the snapshot no longer locked.
// See is_locked() for meaning.
void unlock() noexcept;
// Tells whether the snapshot is locked.
// Locking the snapshot prevents it from getting detached from the partition entry.
// It also prevents the partition entry from being evicted.
bool is_locked() const {
return _locked;
}
static partition_snapshot& container_of(partition_version_ref* ref) {
return *boost::intrusive::get_parent_from_member(ref, &partition_snapshot::_version);
}
@@ -344,6 +361,9 @@ public:
// to the latest version.
stop_iteration slide_to_oldest() noexcept;
// Brings the snapshot to the front of the LRU.
void touch() noexcept;
// Must be called after snapshot's original region is merged into a different region
// before the original region is destroyed, unless the snapshot is destroyed earlier.
void migrate(logalloc::region* region, mutation_cleaner* cleaner) noexcept {
@@ -503,9 +523,18 @@ public:
return _version->all_elements_reversed();
}
// Tells whether this entry is locked.
// Locked entries are undergoing an update and should not have their snapshots
// detached from the entry.
// Certain methods can only be called when !is_locked().
bool is_locked() const {
return _snapshot && _snapshot->is_locked();
}
// Strong exception guarantees.
// Assumes this instance and mp are fully continuous.
// Use only on non-evictable entries.
// Must not be called when is_locked().
void apply(const schema& s, const mutation_partition& mp, const schema& mp_schema);
void apply(const schema& s, mutation_partition&& mp, const schema& mp_schema);
@@ -526,11 +555,14 @@ public:
// such that if the operation is retried (possibly many times) and eventually
// succeeds the result will be as if the first attempt didn't fail.
//
// The schema of pe must conform to s.
//
// Returns a coroutine object representing the operation.
// The coroutine must be resumed with the region being unlocked.
//
// The coroutine cannot run concurrently with other apply() calls.
coroutine apply_to_incomplete(const schema& s,
partition_entry&& pe,
const schema& pe_schema,
- mutation_cleaner& pe_cleaner,
logalloc::allocating_section&,
logalloc::region&,
@@ -539,6 +571,7 @@ public:
real_dirty_memory_accounter&);
// If this entry is evictable, cache_tracker must be provided.
// Must not be called when is_locked().
partition_version& add_version(const schema& s, cache_tracker*);
// Returns a reference to existing version with an active snapshot of given phase
@@ -568,9 +601,11 @@ public:
tombstone partition_tombstone() const;
// needs to be called with reclaiming disabled
// Must not be called when is_locked().
void upgrade(schema_ptr from, schema_ptr to, mutation_cleaner&, cache_tracker*);
// Snapshots with different values of phase will point to different partition_version objects.
// When is_locked(), read() can only be called with a phase which is <= the phase of the current snapshot.
partition_snapshot_ptr read(logalloc::region& region,
mutation_cleaner&,
schema_ptr entry_schema,

View File

@@ -129,6 +129,8 @@ public:
: _type(partition_region::clustered), _ck(&ck) { }
position_in_partition_view(range_tag_t, bound_view bv)
: _type(partition_region::clustered), _bound_weight(position_weight(bv.kind())), _ck(&bv.prefix()) { }
position_in_partition_view(const clustering_key_prefix& ck, bound_weight w)
: _type(partition_region::clustered), _bound_weight(w), _ck(&ck) { }
static position_in_partition_view for_range_start(const query::clustering_range& r) {
return {position_in_partition_view::range_tag_t(), bound_view::from_range_start(r)};
@@ -159,6 +161,7 @@ public:
}
partition_region region() const { return _type; }
bound_weight get_bound_weight() const { return _bound_weight; }
bool is_partition_start() const { return _type == partition_region::partition_start; }
bool is_partition_end() const { return _type == partition_region::partition_end; }
bool is_static_row() const { return _type == partition_region::static_row; }
@@ -271,6 +274,10 @@ public:
return {clustering_row_tag_t(), std::move(ck)};
}
static position_in_partition for_partition_start() {
return position_in_partition{partition_start_tag_t()};
}
static position_in_partition for_static_row() {
return position_in_partition{static_row_tag_t()};
}


@@ -286,11 +286,11 @@ static void insert_querier(
auto& e = entries.emplace_back(key, std::move(q), expires);
e.set_pos(--entries.end());
++stats.population;
if (auto irh = sem.register_inactive_read(std::make_unique<querier_inactive_read>(entries, e.pos(), stats))) {
e.set_inactive_handle(std::move(irh));
index.insert(e);
++stats.population;
}
}


@@ -31,6 +31,8 @@
#include "tracing/tracing.hh"
#include "utils/small_vector.hh"
class position_in_partition_view;
namespace query {
using column_id_vector = utils::small_vector<column_id, 8>;
@@ -58,10 +60,20 @@ typedef std::vector<clustering_range> clustering_row_ranges;
/// Trim the clustering ranges.
///
/// Equivalent of intersecting each range with [key, +inf), or (-inf, key] if
/// Equivalent of intersecting each clustering range with [pos, +inf) position
/// in partition range, or (-inf, pos] position in partition range if
/// reversed == true. Ranges that do not intersect are dropped. Ranges that
/// partially overlap are trimmed.
/// Result: each range will overlap fully with [key, +inf), or (-inf, key] if
/// Result: each range will overlap fully with [pos, +inf), or (-inf, pos] if
/// reversed is true.
void trim_clustering_row_ranges_to(const schema& s, clustering_row_ranges& ranges, position_in_partition_view pos, bool reversed = false);
/// Trim the clustering ranges.
///
/// Equivalent of intersecting each clustering range with (key, +inf) clustering
/// range, or (-inf, key) clustering range if reversed == true. Ranges that do
/// not intersect are dropped. Ranges that partially overlap are trimmed.
/// Result: each range will overlap fully with (key, +inf), or (-inf, key) if
/// reversed is true.
void trim_clustering_row_ranges_to(const schema& s, clustering_row_ranges& ranges, const clustering_key& key, bool reversed = false);


@@ -71,34 +71,38 @@ std::ostream& operator<<(std::ostream& out, const specific_ranges& s) {
return out << "{" << s._pk << " : " << join(", ", s._ranges) << "}";
}
void trim_clustering_row_ranges_to(const schema& s, clustering_row_ranges& ranges, const clustering_key& key, bool reversed) {
auto cmp = [reversed, bv_cmp = bound_view::compare(s)] (const auto& a, const auto& b) {
return reversed ? bv_cmp(b, a) : bv_cmp(a, b);
void trim_clustering_row_ranges_to(const schema& s, clustering_row_ranges& ranges, position_in_partition_view pos, bool reversed) {
auto cmp = [reversed, cmp = position_in_partition::composite_tri_compare(s)] (const auto& a, const auto& b) {
return reversed ? cmp(b, a) : cmp(a, b);
};
auto start_bound = [reversed] (const auto& range) -> const bound_view& {
return reversed ? range.second : range.first;
auto start_bound = [reversed] (const auto& range) -> position_in_partition_view {
return reversed ? position_in_partition_view::for_range_end(range) : position_in_partition_view::for_range_start(range);
};
auto end_bound = [reversed] (const auto& range) -> const bound_view& {
return reversed ? range.first : range.second;
auto end_bound = [reversed] (const auto& range) -> position_in_partition_view {
return reversed ? position_in_partition_view::for_range_start(range) : position_in_partition_view::for_range_end(range);
};
clustering_key_prefix::equality eq(s);
auto it = ranges.begin();
while (it != ranges.end()) {
auto range = bound_view::from_range(*it);
if (cmp(end_bound(range), key) || eq(end_bound(range).prefix(), key)) {
if (cmp(end_bound(*it), pos) <= 0) {
it = ranges.erase(it);
continue;
} else if (cmp(start_bound(range), key)) {
assert(cmp(key, end_bound(range)));
auto r = reversed ? clustering_range(it->start(), clustering_range::bound { key, false })
: clustering_range(clustering_range::bound { key, false }, it->end());
} else if (cmp(start_bound(*it), pos) <= 0) {
assert(cmp(pos, end_bound(*it)) < 0);
auto r = reversed ?
clustering_range(it->start(), clustering_range::bound(pos.key(), pos.get_bound_weight() != bound_weight::before_all_prefixed)) :
clustering_range(clustering_range::bound(pos.key(), pos.get_bound_weight() != bound_weight::after_all_prefixed), it->end());
*it = std::move(r);
}
++it;
}
}
void trim_clustering_row_ranges_to(const schema& s, clustering_row_ranges& ranges, const clustering_key& key, bool reversed) {
return trim_clustering_row_ranges_to(s, ranges,
position_in_partition_view(key, reversed ? bound_weight::before_all_prefixed : bound_weight::after_all_prefixed), reversed);
}
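The trimming semantics above can be sketched without the Scylla types: a minimal, hypothetical model where keys are plain ints and each range is a closed interval. Trimming to position `key` (exclusive, i.e. intersecting with `(key, +inf)`) drops ranges that end at or before `key` and shortens ranges that straddle it; `bound_weight` and reversed ranges are deliberately left out.

```cpp
#include <utility>
#include <vector>

// Hypothetical simplification of trim_clustering_row_ranges_to: keys are
// plain ints and ranges are closed intervals [first, second]. Trimming to
// `key` keeps only the part of each range lying in (key, +inf).
using range = std::pair<int, int>;

void trim_ranges_to(std::vector<range>& ranges, int key) {
    auto it = ranges.begin();
    while (it != ranges.end()) {
        if (it->second <= key) {   // range ends at or before key: drop it
            it = ranges.erase(it);
            continue;
        }
        if (it->first <= key) {    // range straddles key: trim the front
            it->first = key + 1;
        }
        ++it;
    }
}
```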
partition_slice::partition_slice(clustering_row_ranges row_ranges,
query::column_id_vector static_columns,
query::column_id_vector regular_columns,


@@ -187,7 +187,7 @@ public:
const dht::decorated_key& key() const { return *_key; }
void on_underlying_created() { ++_underlying_created; }
bool digest_requested() const { return _slice.options.contains<query::partition_slice::option::with_digest>(); }
private:
public:
future<> ensure_underlying(db::timeout_clock::time_point timeout) {
if (_underlying_snapshot) {
return create_underlying(true, timeout);
@@ -206,18 +206,6 @@ public:
_underlying_snapshot = {};
_key = dk;
}
// Fast forwards the underlying streamed_mutation to given range.
future<> fast_forward_to(position_range range, db::timeout_clock::time_point timeout) {
return ensure_underlying(timeout).then([this, range = std::move(range), timeout] {
return _underlying.underlying().fast_forward_to(std::move(range), timeout);
});
}
// Gets the next fragment from the underlying reader
future<mutation_fragment_opt> get_next_fragment(db::timeout_clock::time_point timeout) {
return ensure_underlying(timeout).then([this, timeout] {
return _underlying.underlying()(timeout);
});
}
};
}


@@ -8,7 +8,6 @@ print_usage() {
echo " --clean clean build directory"
echo " --compiler C++ compiler path"
echo " --c-compiler C compiler path"
echo " --nodeps skip installing dependencies"
exit 1
}
@@ -16,7 +15,6 @@ JOBS=
CLEAN=
COMPILER=
CCOMPILER=
NODEPS=
while [ $# -gt 0 ]; do
case "$1" in
"--jobs")
@@ -36,7 +34,6 @@ while [ $# -gt 0 ]; do
shift 2
;;
"--nodeps")
NODEPS=yes
shift 1
;;
*)
@@ -66,10 +63,6 @@ if [ -f build/release/scylla-package.tar.gz ]; then
rm build/release/scylla-package.tar.gz
fi
if [ -z "$NODEPS" ]; then
sudo ./install-dependencies.sh
fi
NINJA=$(which ninja-build) &&:
if [ -z "$NINJA" ]; then
NINJA=$(which ninja) &&:

reloc/python3/build_deb.sh Executable file

@@ -0,0 +1,37 @@
#!/bin/bash -e
. /etc/os-release
print_usage() {
echo "build_deb.sh --reloc-pkg build/release/scylla-python3-package.tar.gz"
echo " --reloc-pkg specify relocatable package path"
exit 1
}
RELOC_PKG=build/release/scylla-python3-package.tar.gz
OPTS=""
while [ $# -gt 0 ]; do
case "$1" in
"--reloc-pkg")
OPTS="$OPTS $1 $(readlink -f $2)"
RELOC_PKG=$2
shift 2
;;
*)
print_usage
;;
esac
done
if [ ! -e $RELOC_PKG ]; then
echo "$RELOC_PKG does not exist."
echo "Run ./reloc/python3/build_reloc.sh first."
exit 1
fi
RELOC_PKG=$(readlink -f $RELOC_PKG)
if [[ ! $OPTS =~ --reloc-pkg ]]; then
OPTS="$OPTS --reloc-pkg $RELOC_PKG"
fi
mkdir -p build/debian/scylla-python3-package
tar -C build/debian/scylla-python3-package -xpf $RELOC_PKG
cd build/debian/scylla-python3-package
exec ./dist/debian/python3/build_deb.sh $OPTS


@@ -780,8 +780,10 @@ static future<> repair_cf_range(repair_info& ri,
// still do our best to repair available replicas.
std::vector<gms::inet_address> live_neighbors;
std::vector<partition_checksum> live_neighbors_checksum;
bool local_checksum_failed = false;
for (unsigned i = 0; i < checksums.size(); i++) {
if (checksums[i].failed()) {
local_checksum_failed |= (i == 0);
rlogger.warn(
"Checksum of ks={}, table={}, range={} on {} failed: {}",
ri.keyspace, cf, range,
@@ -797,7 +799,7 @@ static future<> repair_cf_range(repair_info& ri,
live_neighbors_checksum.push_back(checksums[i].get0());
}
}
if (checksums[0].failed() || live_neighbors.empty()) {
if (local_checksum_failed || live_neighbors.empty()) {
return make_ready_future<>();
}
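The `local_checksum_failed` change above can be condensed into one predicate: index 0 holds the local node's checksum, the rest belong to peers; a failed peer checksum only excludes that peer, while a failed local checksum (or no live peer checksum at all) abandons the range. This is a sketch with a plain bool vector standing in for the real checksum futures.

```cpp
#include <cstddef>
#include <vector>

// Hypothetical condensation of the repair_cf_range() early-exit logic.
// checksum_failed[0] is the local node; the rest are peers.
bool should_skip_range(const std::vector<bool>& checksum_failed) {
    bool local_checksum_failed = checksum_failed.at(0);
    std::size_t live_neighbors = 0;
    for (std::size_t i = 1; i < checksum_failed.size(); ++i) {
        if (!checksum_failed[i]) {
            ++live_neighbors;   // this peer's checksum is usable
        }
    }
    // Skip when the local checksum failed or no peer checksum survived.
    return local_checksum_failed || live_neighbors == 0;
}
```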
// If one of the available checksums is different, repair
@@ -940,8 +942,20 @@ static future<> repair_cf_range(repair_info& ri,
// Comparable to RepairSession in Origin
static future<> repair_range(repair_info& ri, const dht::token_range& range) {
auto id = utils::UUID_gen::get_time_UUID();
return do_with(get_neighbors(ri.db.local(), ri.keyspace, range, ri.data_centers, ri.hosts), [&ri, range, id] (const auto& neighbors) {
rlogger.debug("[repair #{}] new session: will sync {} on range {} for {}.{}", id, neighbors, range, ri.keyspace, ri.cfs);
return do_with(get_neighbors(ri.db.local(), ri.keyspace, range, ri.data_centers, ri.hosts), [&ri, range, id] (std::vector<gms::inet_address>& neighbors) {
auto live_neighbors = boost::copy_range<std::vector<gms::inet_address>>(neighbors |
boost::adaptors::filtered([] (const gms::inet_address& node) { return gms::get_local_gossiper().is_alive(node); }));
if (live_neighbors.size() != neighbors.size()) {
ri.nr_failed_ranges++;
auto status = live_neighbors.empty() ? "skipped" : "partial";
rlogger.warn("Repair {} out of {} ranges, id={}, shard={}, keyspace={}, table={}, range={}, peers={}, live_peers={}, status={}",
ri.ranges_index, ri.ranges.size(), ri.id, ri.shard, ri.keyspace, ri.cfs, range, neighbors, live_neighbors, status);
if (live_neighbors.empty()) {
return make_ready_future<>();
}
neighbors.swap(live_neighbors);
}
return ::service::get_local_migration_manager().sync_schema(ri.db.local(), neighbors).then([&neighbors, &ri, range, id] {
return do_for_each(ri.cfs.begin(), ri.cfs.end(), [&ri, &neighbors, range] (auto&& cf) {
ri._sub_ranges_nr++;
if (ri.row_level_repair()) {
@@ -950,6 +964,7 @@ static future<> repair_range(repair_info& ri, const dht::token_range& range) {
return repair_cf_range(ri, cf, range, neighbors);
}
});
});
});
}
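The liveness filtering that `repair_range()` now performs up front can be sketched in isolation: dead peers are dropped, repair proceeds with the survivors, and the range is reported as "partial" (some peers dead) or "skipped" (all peers dead). The alive-set here stands in for `gossiper::is_alive()`, which this sketch does not have access to.

```cpp
#include <set>
#include <string>
#include <vector>

// Sketch of the live-neighbor filtering added to repair_range().
struct filter_result {
    std::vector<std::string> live;
    std::string status;  // "ok", "partial" or "skipped"
};

filter_result filter_live_neighbors(const std::vector<std::string>& neighbors,
                                    const std::set<std::string>& alive) {
    filter_result r;
    for (const auto& n : neighbors) {
        if (alive.count(n)) {
            r.live.push_back(n);  // keep only peers the gossiper sees as up
        }
    }
    r.status = r.live.size() == neighbors.size() ? "ok"
             : r.live.empty() ? "skipped" : "partial";
    return r;
}
```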


@@ -295,6 +295,7 @@ public:
void push_mutation_fragment(frozen_mutation_fragment mf) { _mfs.push_back(std::move(mf)); }
};
using repair_row_on_wire = partition_key_and_mutation_fragments;
using repair_rows_on_wire = std::list<partition_key_and_mutation_fragments>;
enum class row_level_diff_detect_algorithm : uint8_t {


@@ -152,8 +152,8 @@ class fragment_hasher {
xx_hasher& _hasher;
private:
void consume_cell(const column_definition& col, const atomic_cell_or_collection& cell) {
feed_hash(_hasher, col.name());
feed_hash(_hasher, col.type->name());
feed_hash(_hasher, col.kind);
feed_hash(_hasher, col.id);
feed_hash(_hasher, cell, col);
}
public:
@@ -220,43 +220,62 @@ private:
};
class repair_row {
frozen_mutation_fragment _fm;
std::optional<frozen_mutation_fragment> _fm;
lw_shared_ptr<const decorated_key_with_hash> _dk_with_hash;
repair_sync_boundary _boundary;
repair_hash _hash;
std::optional<repair_sync_boundary> _boundary;
std::optional<repair_hash> _hash;
lw_shared_ptr<mutation_fragment> _mf;
public:
repair_row() = delete;
repair_row(frozen_mutation_fragment fm,
position_in_partition pos,
repair_row(std::optional<frozen_mutation_fragment> fm,
std::optional<position_in_partition> pos,
lw_shared_ptr<const decorated_key_with_hash> dk_with_hash,
repair_hash hash,
std::optional<repair_hash> hash,
lw_shared_ptr<mutation_fragment> mf = {})
: _fm(std::move(fm))
, _dk_with_hash(std::move(dk_with_hash))
, _boundary({_dk_with_hash->dk, std::move(pos)})
, _boundary(pos ? std::optional<repair_sync_boundary>(repair_sync_boundary{_dk_with_hash->dk, std::move(*pos)}) : std::nullopt)
, _hash(std::move(hash))
, _mf(std::move(mf)) {
}
mutation_fragment& get_mutation_fragment() {
if (!_mf) {
throw std::runtime_error("get empty mutation_fragment");
throw std::runtime_error("empty mutation_fragment");
}
return *_mf;
}
frozen_mutation_fragment& get_frozen_mutation() { return _fm; }
const frozen_mutation_fragment& get_frozen_mutation() const { return _fm; }
frozen_mutation_fragment& get_frozen_mutation() {
if (!_fm) {
throw std::runtime_error("empty frozen_mutation_fragment");
}
return *_fm;
}
const frozen_mutation_fragment& get_frozen_mutation() const {
if (!_fm) {
throw std::runtime_error("empty frozen_mutation_fragment");
}
return *_fm;
}
const lw_shared_ptr<const decorated_key_with_hash>& get_dk_with_hash() const {
return _dk_with_hash;
}
size_t size() const {
return _fm.representation().size();
if (!_fm) {
throw std::runtime_error("empty size due to empty frozen_mutation_fragment");
}
return _fm->representation().size();
}
const repair_sync_boundary& boundary() const {
return _boundary;
if (!_boundary) {
throw std::runtime_error("empty repair_sync_boundary");
}
return *_boundary;
}
const repair_hash& hash() const {
return _hash;
if (!_hash) {
throw std::runtime_error("empty hash");
}
return *_hash;
}
};
@@ -284,13 +303,14 @@ public:
repair_reader(
seastar::sharded<database>& db,
column_family& cf,
schema_ptr s,
dht::token_range range,
dht::i_partitioner& local_partitioner,
dht::i_partitioner& remote_partitioner,
unsigned remote_shard,
uint64_t seed,
is_local_reader local_reader)
: _schema(cf.schema())
: _schema(s)
, _range(dht::to_partition_range(range))
, _sharder(remote_partitioner, range, remote_shard)
, _seed(seed)
@@ -351,6 +371,10 @@ class repair_writer {
std::vector<std::optional<seastar::queue<mutation_fragment_opt>>> _mq;
// Current partition written to disk
std::vector<lw_shared_ptr<const decorated_key_with_hash>> _current_dk_written_to_sstable;
// Whether the current partition is still open. A partition is opened
// when a partition_start is written and closed when a partition_end
// is written.
std::vector<bool> _partition_opened;
public:
repair_writer(
schema_ptr schema,
@@ -365,10 +389,13 @@ public:
future<> write_start_and_mf(lw_shared_ptr<const decorated_key_with_hash> dk, mutation_fragment mf, unsigned node_idx) {
_current_dk_written_to_sstable[node_idx] = dk;
if (mf.is_partition_start()) {
return _mq[node_idx]->push_eventually(mutation_fragment_opt(std::move(mf)));
return _mq[node_idx]->push_eventually(mutation_fragment_opt(std::move(mf))).then([this, node_idx] {
_partition_opened[node_idx] = true;
});
} else {
auto start = mutation_fragment(partition_start(dk->dk, tombstone()));
return _mq[node_idx]->push_eventually(mutation_fragment_opt(std::move(start))).then([this, node_idx, mf = std::move(mf)] () mutable {
_partition_opened[node_idx] = true;
return _mq[node_idx]->push_eventually(mutation_fragment_opt(std::move(mf)));
});
}
@@ -378,6 +405,7 @@ public:
_writer_done.resize(_nr_peer_nodes);
_mq.resize(_nr_peer_nodes);
_current_dk_written_to_sstable.resize(_nr_peer_nodes);
_partition_opened.resize(_nr_peer_nodes, false);
}
void create_writer(unsigned node_idx) {
@@ -414,12 +442,21 @@ public:
t.stream_in_progress());
}
future<> write_partition_end(unsigned node_idx) {
if (_partition_opened[node_idx]) {
return _mq[node_idx]->push_eventually(mutation_fragment(partition_end())).then([this, node_idx] {
_partition_opened[node_idx] = false;
});
}
return make_ready_future<>();
}
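The `_partition_opened` bookkeeping above is the fix for the duplicated partition_end described in the commit message: partition_end is emitted only when a matching partition_start was written, so tearing down mid-partition can never produce an unpaired fragment. A minimal model, with strings standing in for the real mutation fragment types:

```cpp
#include <string>
#include <vector>

// Minimal model of repair_writer's paired-write tracking. Fragment names
// are illustrative, not the real mutation_fragment kinds.
class writer_model {
    std::vector<std::string> _out;
    bool _partition_opened = false;
public:
    void write_partition_start() {
        _out.push_back("partition_start");
        _partition_opened = true;
    }
    void write_row() { _out.push_back("clustering_row"); }
    // Idempotent: writes partition_end only while a partition is open,
    // so the teardown path cannot append a duplicate.
    void write_partition_end() {
        if (_partition_opened) {
            _out.push_back("partition_end");
            _partition_opened = false;
        }
    }
    const std::vector<std::string>& output() const { return _out; }
};
```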
future<> do_write(unsigned node_idx, lw_shared_ptr<const decorated_key_with_hash> dk, mutation_fragment mf) {
if (_current_dk_written_to_sstable[node_idx]) {
if (_current_dk_written_to_sstable[node_idx]->dk.equal(*_schema, dk->dk)) {
return _mq[node_idx]->push_eventually(mutation_fragment_opt(std::move(mf)));
} else {
return _mq[node_idx]->push_eventually(mutation_fragment(partition_end())).then([this,
return write_partition_end(node_idx).then([this,
node_idx, dk = std::move(dk), mf = std::move(mf)] () mutable {
return write_start_and_mf(std::move(dk), std::move(mf), node_idx);
});
@@ -433,7 +470,7 @@ public:
return parallel_for_each(boost::irange(unsigned(0), unsigned(_nr_peer_nodes)), [this] (unsigned node_idx) {
if (_writer_done[node_idx] && _mq[node_idx]) {
// Partition_end is never sent on wire, so we have to write one ourselves.
return _mq[node_idx]->push_eventually(mutation_fragment(partition_end())).then([this, node_idx] () mutable {
return write_partition_end(node_idx).then([this, node_idx] () mutable {
// Empty mutation_fragment_opt means no more data, so the writer can seal the sstables.
return _mq[node_idx]->push_eventually(mutation_fragment_opt()).then([this, node_idx] () mutable {
return (*_writer_done[node_idx]).then([] (uint64_t partitions) {
@@ -458,8 +495,8 @@ public:
private:
seastar::sharded<database>& _db;
column_family& _cf;
dht::token_range _range;
schema_ptr _schema;
dht::token_range _range;
repair_sync_boundary::tri_compare _cmp;
// The algorithm used to find the row difference
row_level_diff_detect_algorithm _algo;
@@ -519,6 +556,7 @@ public:
repair_meta(
seastar::sharded<database>& db,
column_family& cf,
schema_ptr s,
dht::token_range range,
row_level_diff_detect_algorithm algo,
size_t max_row_buf_size,
@@ -529,8 +567,8 @@ public:
size_t nr_peer_nodes = 1)
: _db(db)
, _cf(cf)
, _schema(s)
, _range(range)
, _schema(cf.schema())
, _cmp(repair_sync_boundary::tri_compare(*_schema))
, _algo(algo)
, _max_row_buf_size(max_row_buf_size)
@@ -545,6 +583,7 @@ public:
, _repair_reader(
_db,
_cf,
_schema,
_range,
dht::global_partitioner(),
*_remote_partitioner,
@@ -577,35 +616,45 @@ public:
}
}
static void
static future<>
insert_repair_meta(const gms::inet_address& from,
uint32_t src_cpu_id,
uint32_t repair_meta_id,
sstring ks_name,
sstring cf_name,
dht::token_range range,
row_level_diff_detect_algorithm algo,
uint64_t max_row_buf_size,
uint64_t seed,
shard_config master_node_shard_config) {
node_repair_meta_id id{from, repair_meta_id};
auto& db = service::get_local_storage_proxy().get_db();
auto& cf = db.local().find_column_family(ks_name, cf_name);
auto rm = make_lw_shared<repair_meta>(db,
cf,
shard_config master_node_shard_config,
table_schema_version schema_version) {
return service::get_schema_for_write(schema_version, {from, src_cpu_id}).then([from,
repair_meta_id,
range,
algo,
max_row_buf_size,
seed,
repair_meta::repair_master::no,
repair_meta_id,
std::move(master_node_shard_config));
bool insertion = repair_meta_map().emplace(id, rm).second;
if (!insertion) {
rlogger.warn("insert_repair_meta: repair_meta_id {} for node {} already exists, replace existing one", id.repair_meta_id, id.ip);
repair_meta_map()[id] = rm;
} else {
rlogger.debug("insert_repair_meta: Inserted repair_meta_id {} for node {}", id.repair_meta_id, id.ip);
}
master_node_shard_config,
schema_version] (schema_ptr s) {
auto& db = service::get_local_storage_proxy().get_db();
auto& cf = db.local().find_column_family(s->id());
node_repair_meta_id id{from, repair_meta_id};
auto rm = make_lw_shared<repair_meta>(db,
cf,
s,
range,
algo,
max_row_buf_size,
seed,
repair_meta::repair_master::no,
repair_meta_id,
std::move(master_node_shard_config));
bool insertion = repair_meta_map().emplace(id, rm).second;
if (!insertion) {
rlogger.warn("insert_repair_meta: repair_meta_id {} for node {} already exists, replace existing one", id.repair_meta_id, id.ip);
repair_meta_map()[id] = rm;
} else {
rlogger.debug("insert_repair_meta: Inserted repair_meta_id {} for node {}", id.repair_meta_id, id.ip);
}
});
}
static future<>
@@ -642,7 +691,11 @@ public:
}
}
return parallel_for_each(*repair_metas, [repair_metas] (auto& rm) {
return rm->stop();
return rm->stop().then([&rm] {
rm = {};
});
}).then([repair_metas, from] {
rlogger.debug("Removed all repair_meta for single node {}", from);
});
}
@@ -654,7 +707,11 @@ public:
| boost::adaptors::map_values));
repair_meta_map().clear();
return parallel_for_each(*repair_metas, [repair_metas] (auto& rm) {
return rm->stop();
return rm->stop().then([&rm] {
rm = {};
});
}).then([repair_metas] {
rlogger.debug("Removed all repair_meta for all nodes");
});
}
@@ -952,12 +1009,12 @@ private:
}
return to_repair_rows_list(rows).then([this, from, node_idx, update_buf, update_hash_set] (std::list<repair_row> row_diff) {
return do_with(std::move(row_diff), [this, from, node_idx, update_buf, update_hash_set] (std::list<repair_row>& row_diff) {
auto sz = get_repair_rows_size(row_diff);
stats().rx_row_bytes += sz;
stats().rx_row_nr += row_diff.size();
stats().rx_row_nr_peer[from] += row_diff.size();
_metrics.rx_row_nr += row_diff.size();
_metrics.rx_row_bytes += sz;
if (_repair_master) {
auto sz = get_repair_rows_size(row_diff);
stats().rx_row_bytes += sz;
stats().rx_row_nr += row_diff.size();
stats().rx_row_nr_peer[from] += row_diff.size();
}
if (update_buf) {
std::list<repair_row> tmp;
tmp.swap(_working_row_buf);
@@ -993,11 +1050,16 @@ private:
return do_with(repair_rows_on_wire(), std::move(row_list), [this] (repair_rows_on_wire& rows, std::list<repair_row>& row_list) {
return do_for_each(row_list, [this, &rows] (repair_row& r) {
auto pk = r.get_dk_with_hash()->dk.key();
auto it = std::find_if(rows.begin(), rows.end(), [&pk, s=_schema] (partition_key_and_mutation_fragments& row) { return pk.legacy_equal(*s, row.get_key()); });
if (it == rows.end()) {
rows.push_back(partition_key_and_mutation_fragments(std::move(pk), {std::move(r.get_frozen_mutation())}));
// No need to search from the beginning of the rows. Looking at the end of repair_rows_on_wire is enough.
if (rows.empty()) {
rows.push_back(repair_row_on_wire(std::move(pk), {std::move(r.get_frozen_mutation())}));
} else {
it->push_mutation_fragment(std::move(r.get_frozen_mutation()));
auto& row = rows.back();
if (pk.legacy_equal(*_schema, row.get_key())) {
row.push_mutation_fragment(std::move(r.get_frozen_mutation()));
} else {
rows.push_back(repair_row_on_wire(std::move(pk), {std::move(r.get_frozen_mutation())}));
}
}
}).then([&rows] {
return std::move(rows);
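The tail-only lookup above relies on the repair rows arriving ordered by partition key: instead of scanning the whole list for a matching partition per row, it is enough to compare against the last entry and either append the fragment there or open a new partition. A sketch with types simplified to ints and strings:

```cpp
#include <list>
#include <string>
#include <utility>
#include <vector>

// Simplified stand-in for repair_rows_on_wire: key -> fragments.
using partition = std::pair<int, std::vector<std::string>>;

void add_row(std::list<partition>& rows, int pk, std::string fragment) {
    if (!rows.empty() && rows.back().first == pk) {
        // Same partition as the tail: append the fragment there.
        rows.back().second.push_back(std::move(fragment));
    } else {
        // New partition key: start a new entry at the tail.
        rows.push_back({pk, {std::move(fragment)}});
    }
}
```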
@@ -1006,23 +1068,47 @@ private:
};
future<std::list<repair_row>> to_repair_rows_list(repair_rows_on_wire rows) {
return do_with(std::move(rows), std::list<repair_row>(), lw_shared_ptr<const decorated_key_with_hash>(),
[this] (repair_rows_on_wire& rows, std::list<repair_row>& row_list, lw_shared_ptr<const decorated_key_with_hash>& dk_ptr) mutable {
return do_for_each(rows, [this, &dk_ptr, &row_list] (partition_key_and_mutation_fragments& x) mutable {
return do_with(std::move(rows), std::list<repair_row>(), lw_shared_ptr<const decorated_key_with_hash>(), lw_shared_ptr<mutation_fragment>(), position_in_partition::tri_compare(*_schema),
[this] (repair_rows_on_wire& rows, std::list<repair_row>& row_list, lw_shared_ptr<const decorated_key_with_hash>& dk_ptr, lw_shared_ptr<mutation_fragment>& last_mf, position_in_partition::tri_compare& cmp) mutable {
return do_for_each(rows, [this, &dk_ptr, &row_list, &last_mf, &cmp] (partition_key_and_mutation_fragments& x) mutable {
dht::decorated_key dk = dht::global_partitioner().decorate_key(*_schema, x.get_key());
if (!(dk_ptr && dk_ptr->dk.equal(*_schema, dk))) {
dk_ptr = make_lw_shared<const decorated_key_with_hash>(*_schema, dk, _seed);
}
return do_for_each(x.get_mutation_fragments(), [this, &dk_ptr, &row_list] (frozen_mutation_fragment& fmf) mutable {
// Keep the mutation_fragment in repair_row as an
// optimization to avoid unfreezing again when the
// mutation_fragment is needed by _repair_writer.do_write()
// to apply the repair_row to disk
auto mf = make_lw_shared<mutation_fragment>(fmf.unfreeze(*_schema));
auto hash = do_hash_for_mf(*dk_ptr, *mf);
position_in_partition pos(mf->position());
row_list.push_back(repair_row(std::move(fmf), std::move(pos), dk_ptr, std::move(hash), std::move(mf)));
});
if (_repair_master) {
return do_for_each(x.get_mutation_fragments(), [this, &dk_ptr, &row_list] (frozen_mutation_fragment& fmf) mutable {
_metrics.rx_row_nr += 1;
_metrics.rx_row_bytes += fmf.representation().size();
// Keep the mutation_fragment in repair_row as an
// optimization to avoid unfreezing again when the
// mutation_fragment is needed by _repair_writer.do_write()
// to apply the repair_row to disk
auto mf = make_lw_shared<mutation_fragment>(fmf.unfreeze(*_schema));
auto hash = do_hash_for_mf(*dk_ptr, *mf);
position_in_partition pos(mf->position());
row_list.push_back(repair_row(std::move(fmf), std::move(pos), dk_ptr, std::move(hash), std::move(mf)));
});
} else {
last_mf = {};
return do_for_each(x.get_mutation_fragments(), [this, &dk_ptr, &row_list, &last_mf, &cmp] (frozen_mutation_fragment& fmf) mutable {
_metrics.rx_row_nr += 1;
_metrics.rx_row_bytes += fmf.representation().size();
auto mf = make_lw_shared<mutation_fragment>(fmf.unfreeze(*_schema));
position_in_partition pos(mf->position());
// If the mutation_fragment has the same position as
// the last mutation_fragment, it means they are the
// same row with different contents. We cannot feed
// such rows into the sstable writer. Instead we apply
// the mutation_fragment to the previous one.
if (last_mf && cmp(last_mf->position(), pos) == 0 && last_mf->mergeable_with(*mf)) {
last_mf->apply(*_schema, std::move(*mf));
} else {
last_mf = mf;
// On repair follower node, only decorated_key_with_hash and the mutation_fragment inside repair_row are used.
row_list.push_back(repair_row({}, {}, dk_ptr, {}, std::move(mf)));
}
});
}
}).then([&row_list] {
return std::move(row_list);
});
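The follower-side deduplication in `to_repair_rows_list()` above merges consecutive fragments that land on the same position into the previous one, since the sstable writer cannot accept two rows at one position. A sketch where a "fragment" is just a (position, content) pair and merging is content concatenation, purely for illustration:

```cpp
#include <string>
#include <utility>
#include <vector>

using fragment = std::pair<int, std::string>;  // (position, content)

// Merge consecutive fragments that share a position, keeping the stream
// strictly increasing in position, as the sstable writer requires.
std::vector<fragment> merge_same_position(const std::vector<fragment>& in) {
    std::vector<fragment> out;
    for (const auto& f : in) {
        if (!out.empty() && out.back().first == f.first) {
            out.back().second += f.second;  // apply onto the previous one
        } else {
            out.push_back(f);
        }
    }
    return out;
}
```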
@@ -1084,29 +1170,28 @@ public:
// RPC API
future<>
repair_row_level_start(gms::inet_address remote_node, sstring ks_name, sstring cf_name, dht::token_range range) {
repair_row_level_start(gms::inet_address remote_node, sstring ks_name, sstring cf_name, dht::token_range range, table_schema_version schema_version) {
if (remote_node == _myip) {
return make_ready_future<>();
}
stats().rpc_call_nr++;
return netw::get_local_messaging_service().send_repair_row_level_start(msg_addr(remote_node),
_repair_meta_id, std::move(ks_name), std::move(cf_name), std::move(range), _algo, _max_row_buf_size, _seed,
_master_node_shard_config.shard, _master_node_shard_config.shard_count, _master_node_shard_config.ignore_msb, _master_node_shard_config.partitioner_name);
_master_node_shard_config.shard, _master_node_shard_config.shard_count, _master_node_shard_config.ignore_msb, _master_node_shard_config.partitioner_name, std::move(schema_version));
}
// RPC handler
static future<>
repair_row_level_start_handler(gms::inet_address from, uint32_t repair_meta_id, sstring ks_name, sstring cf_name,
repair_row_level_start_handler(gms::inet_address from, uint32_t src_cpu_id, uint32_t repair_meta_id, sstring ks_name, sstring cf_name,
dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size,
uint64_t seed, shard_config master_node_shard_config) {
uint64_t seed, shard_config master_node_shard_config, table_schema_version schema_version) {
if (!_sys_dist_ks->local_is_initialized() || !_view_update_generator->local_is_initialized()) {
return make_exception_future<>(std::runtime_error(format("Node {} is not fully initialized for repair, try again later",
utils::fb_utilities::get_broadcast_address())));
}
rlogger.debug(">>> Started Row Level Repair (Follower): local={}, peers={}, repair_meta_id={}, keyspace={}, cf={}, range={}",
utils::fb_utilities::get_broadcast_address(), from, repair_meta_id, ks_name, cf_name, range);
insert_repair_meta(from, repair_meta_id, std::move(ks_name), std::move(cf_name), std::move(range), algo, max_row_buf_size, seed, std::move(master_node_shard_config));
return make_ready_future<>();
rlogger.debug(">>> Started Row Level Repair (Follower): local={}, peers={}, repair_meta_id={}, keyspace={}, cf={}, schema_version={}, range={}",
utils::fb_utilities::get_broadcast_address(), from, repair_meta_id, ks_name, cf_name, schema_version, range);
return insert_repair_meta(from, src_cpu_id, repair_meta_id, std::move(range), algo, max_row_buf_size, seed, std::move(master_node_shard_config), std::move(schema_version));
}
// RPC API
@@ -1313,14 +1398,15 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
});
ms.register_repair_row_level_start([] (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring ks_name,
sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed,
unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name) {
unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version) {
auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id, ks_name, cf_name,
range, algo, max_row_buf_size, seed, remote_shard, remote_shard_count, remote_ignore_msb, remote_partitioner_name] () mutable {
return repair_meta::repair_row_level_start_handler(from, repair_meta_id, std::move(ks_name),
return smp::submit_to(src_cpu_id % smp::count, [from, src_cpu_id, repair_meta_id, ks_name, cf_name,
range, algo, max_row_buf_size, seed, remote_shard, remote_shard_count, remote_ignore_msb, remote_partitioner_name, schema_version] () mutable {
return repair_meta::repair_row_level_start_handler(from, src_cpu_id, repair_meta_id, std::move(ks_name),
std::move(cf_name), std::move(range), algo, max_row_buf_size, seed,
shard_config{remote_shard, remote_shard_count, remote_ignore_msb, std::move(remote_partitioner_name)});
shard_config{remote_shard, remote_shard_count, remote_ignore_msb, std::move(remote_partitioner_name)},
schema_version);
});
});
ms.register_repair_row_level_stop([] (const rpc::client_info& cinfo, uint32_t repair_meta_id,
@@ -1608,8 +1694,12 @@ public:
dht::global_partitioner().sharding_ignore_msb(),
dht::global_partitioner().name()
};
auto s = _cf.schema();
auto schema_version = s->version();
repair_meta master(_ri.db,
_cf,
s,
_range,
algorithm,
_max_row_buf_size,
@@ -1622,12 +1712,13 @@ public:
// All nodes including the node itself.
_all_nodes.insert(_all_nodes.begin(), master.myip());
rlogger.debug(">>> Started Row Level Repair (Master): local={}, peers={}, repair_meta_id={}, keyspace={}, cf={}, range={}, seed={}",
master.myip(), _all_live_peer_nodes, master.repair_meta_id(), _ri.keyspace, _cf_name, _range, _seed);
rlogger.debug(">>> Started Row Level Repair (Master): local={}, peers={}, repair_meta_id={}, keyspace={}, cf={}, schema_version={}, range={}, seed={}",
master.myip(), _all_live_peer_nodes, master.repair_meta_id(), _ri.keyspace, _cf_name, schema_version, _range, _seed);
try {
parallel_for_each(_all_nodes, [&, this] (const gms::inet_address& node) {
return master.repair_row_level_start(node, _ri.keyspace, _cf_name, _range).then([&] () {
return master.repair_row_level_start(node, _ri.keyspace, _cf_name, _range, schema_version).then([&] () {
return master.repair_get_estimated_partitions(node).then([this, node] (uint64_t partitions) {
rlogger.trace("Get repair_get_estimated_partitions for node={}, estimated_partitions={}", node, partitions);
_estimated_partitions += partitions;
@@ -1677,19 +1768,7 @@ public:
future<> repair_cf_range_row_level(repair_info& ri,
sstring cf_name, dht::token_range range,
const std::vector<gms::inet_address>& all_peer_nodes) {
auto all_live_peer_nodes = boost::copy_range<std::vector<gms::inet_address>>(all_peer_nodes |
boost::adaptors::filtered([] (const gms::inet_address& node) { return gms::get_local_gossiper().is_alive(node); }));
if (all_live_peer_nodes.size() != all_peer_nodes.size()) {
rlogger.warn("Repair for range={} is partial, peer nodes={}, live peer nodes={}",
range, all_peer_nodes, all_live_peer_nodes);
ri.nr_failed_ranges++;
}
if (all_live_peer_nodes.empty()) {
rlogger.info(">>> Skipped Row Level Repair (Master): local={}, peers={}, keyspace={}, cf={}, range={}",
utils::fb_utilities::get_broadcast_address(), all_peer_nodes, ri.keyspace, cf_name, range);
return make_ready_future<>();
}
return do_with(row_level_repair(ri, std::move(cf_name), std::move(range), std::move(all_live_peer_nodes)), [] (row_level_repair& repair) {
return do_with(row_level_repair(ri, std::move(cf_name), std::move(range), all_peer_nodes), [] (row_level_repair& repair) {
return repair.run();
});
}
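The `repair_cf_range_row_level` logic above — filter the peer list down to live nodes, count the range as partially failed if any peer is down, and skip the repair entirely when no peer is reachable — can be sketched in Python. This is an illustration only; the names (`select_live_peers`, the `ri` dict, `is_alive`) are hypothetical stand-ins for Scylla's C++ internals, not its API.

```python
# Illustrative sketch (not Scylla code) of the live-peer filtering in
# repair_cf_range_row_level: keep only live peers, flag a partial repair
# when some peers are down, and signal a skip when none are reachable.

def select_live_peers(all_peers, is_alive, ri, log=print):
    live = [p for p in all_peers if is_alive(p)]
    if len(live) != len(all_peers):
        # Some peers are down: the repair of this range is only partial.
        log("Repair is partial, peers={}, live peers={}".format(all_peers, live))
        ri["nr_failed_ranges"] += 1
    if not live:
        # No live peers: the caller returns a ready future, i.e. does nothing.
        log("Skipped row-level repair: no live peers")
        return None
    return live
```

In the C++ code the same decision is expressed as returning `make_ready_future<>()` early; here `None` plays that role.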


@@ -31,7 +31,6 @@
#include <boost/version.hpp>
#include <sys/sdt.h>
#include "read_context.hh"
#include "schema_upgrader.hh"
#include "dirty_memory_manager.hh"
#include "cache_flat_mutation_reader.hh"
#include "real_dirty_memory_accounter.hh"
@@ -349,9 +348,7 @@ future<> read_context::create_underlying(bool skip_first_fragment, db::timeout_c
static flat_mutation_reader read_directly_from_underlying(read_context& reader) {
flat_mutation_reader res = make_delegating_reader(reader.underlying().underlying());
if (reader.schema()->version() != reader.underlying().underlying().schema()->version()) {
res = transform(std::move(res), schema_upgrader(reader.schema()));
}
res.upgrade_schema(reader.schema());
return make_nonforwardable(std::move(res), true);
}
@@ -928,7 +925,6 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater)
});
return seastar::async([this, &m, updater = std::move(updater), real_dirty_acc = std::move(real_dirty_acc)] () mutable {
coroutine update;
size_t size_entry;
// In case updater fails, we must bring the cache to consistency without deferring.
auto cleanup = defer([&m, this] {
@@ -936,6 +932,7 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater)
_prev_snapshot_pos = {};
_prev_snapshot = {};
});
coroutine update; // Destroy before cleanup to release snapshots before invalidating.
partition_presence_checker is_present = _prev_snapshot->make_partition_presence_checker();
while (!m.partitions.empty()) {
with_allocator(_tracker.allocator(), [&] () {
@@ -1007,8 +1004,10 @@ future<> row_cache::update(external_updater eu, memtable& m) {
if (cache_i != partitions_end() && cache_i->key().equal(*_schema, mem_e.key())) {
cache_entry& entry = *cache_i;
upgrade_entry(entry);
assert(entry._schema == _schema);
_tracker.on_partition_merge();
return entry.partition().apply_to_incomplete(*_schema, std::move(mem_e.partition()), *mem_e.schema(), _tracker.memtable_cleaner(),
mem_e.upgrade_schema(_schema, _tracker.memtable_cleaner());
return entry.partition().apply_to_incomplete(*_schema, std::move(mem_e.partition()), _tracker.memtable_cleaner(),
alloc, _tracker.region(), _tracker, _underlying_phase, acc);
} else if (cache_i->continuous()
|| with_allocator(standard_allocator(), [&] { return is_present(mem_e.key()); })
@@ -1020,7 +1019,8 @@ future<> row_cache::update(external_updater eu, memtable& m) {
entry->set_continuous(cache_i->continuous());
_tracker.insert(*entry);
_partitions.insert_before(cache_i, *entry);
return entry->partition().apply_to_incomplete(*_schema, std::move(mem_e.partition()), *mem_e.schema(), _tracker.memtable_cleaner(),
mem_e.upgrade_schema(_schema, _tracker.memtable_cleaner());
return entry->partition().apply_to_incomplete(*_schema, std::move(mem_e.partition()), _tracker.memtable_cleaner(),
alloc, _tracker.region(), _tracker, _underlying_phase, acc);
} else {
return make_empty_coroutine();
@@ -1117,8 +1117,8 @@ future<> row_cache::invalidate(external_updater eu, dht::partition_range_vector&
});
}
void row_cache::evict(const dht::partition_range& range) {
invalidate_unwrapped(range);
void row_cache::evict() {
while (_tracker.region().evict_some() == memory::reclaiming_result::reclaimed_something) {}
}
void row_cache::invalidate_unwrapped(const dht::partition_range& range) {
@@ -1205,8 +1205,11 @@ void rows_entry::on_evicted(cache_tracker& tracker) noexcept {
partition_version& pv = partition_version::container_of(mutation_partition::container_of(
mutation_partition::rows_type::container_of_only_member(*it)));
if (pv.is_referenced_from_entry()) {
cache_entry& ce = cache_entry::container_of(partition_entry::container_of(pv));
ce.on_evicted(tracker);
partition_entry& pe = partition_entry::container_of(pv);
if (!pe.is_locked()) {
cache_entry& ce = cache_entry::container_of(pe);
ce.on_evicted(tracker);
}
}
}
}
@@ -1227,9 +1230,8 @@ flat_mutation_reader cache_entry::do_read(row_cache& rc, read_context& reader) {
auto snp = _pe.read(rc._tracker.region(), rc._tracker.cleaner(), _schema, &rc._tracker, reader.phase());
auto ckr = query::clustering_key_filter_ranges::get_ranges(*_schema, reader.slice(), _key.key());
auto r = make_cache_flat_mutation_reader(_schema, _key, std::move(ckr), rc, reader.shared_from_this(), std::move(snp));
if (reader.schema()->version() != _schema->version()) {
r = transform(std::move(r), schema_upgrader(reader.schema()));
}
r.upgrade_schema(rc.schema());
r.upgrade_schema(reader.schema());
return r;
}
@@ -1238,7 +1240,7 @@ const schema_ptr& row_cache::schema() const {
}
void row_cache::upgrade_entry(cache_entry& e) {
if (e._schema != _schema) {
if (e._schema != _schema && !e.partition().is_locked()) {
auto& r = _tracker.region();
assert(!r.reclaiming_enabled());
with_allocator(r.allocator(), [this, &e] {


@@ -549,12 +549,12 @@ public:
future<> invalidate(external_updater, const dht::partition_range& = query::full_partition_range);
future<> invalidate(external_updater, dht::partition_range_vector&&);
// Evicts entries from given range in cache.
// Evicts entries from cache.
//
// Note that this does not synchronize with the underlying source,
// it is assumed that the underlying source didn't change.
// If it did, use invalidate() instead.
void evict(const dht::partition_range& = query::full_partition_range);
void evict();
size_t partitions() const {
return _partitions.size();


@@ -69,19 +69,30 @@ table_schema_version schema_mutations::digest() const {
}
md5_hasher h;
db::schema_tables::feed_hash_for_schema_digest(h, _columnfamilies);
db::schema_tables::feed_hash_for_schema_digest(h, _columns);
db::schema_features sf = db::schema_features::full();
// Disable this feature so that the digest remains compatible with Scylla
// versions prior to this feature.
// This digest affects the table schema version calculation and it's important
// that all nodes arrive at the same table schema version to avoid needless schema version
// pulls. Table schema versions are calculated on boot when we don't yet
// know all the cluster features, so we could get different table versions after reboot
// in an already upgraded cluster.
sf.remove<db::schema_feature::DIGEST_INSENSITIVE_TO_EXPIRY>();
db::schema_tables::feed_hash_for_schema_digest(h, _columnfamilies, sf);
db::schema_tables::feed_hash_for_schema_digest(h, _columns, sf);
if (_view_virtual_columns && !_view_virtual_columns->partition().empty()) {
db::schema_tables::feed_hash_for_schema_digest(h, *_view_virtual_columns);
db::schema_tables::feed_hash_for_schema_digest(h, *_view_virtual_columns, sf);
}
if (_indices && !_indices->partition().empty()) {
db::schema_tables::feed_hash_for_schema_digest(h, *_indices);
db::schema_tables::feed_hash_for_schema_digest(h, *_indices, sf);
}
if (_dropped_columns && !_dropped_columns->partition().empty()) {
db::schema_tables::feed_hash_for_schema_digest(h, *_dropped_columns);
db::schema_tables::feed_hash_for_schema_digest(h, *_dropped_columns, sf);
}
if (_scylla_tables) {
db::schema_tables::feed_hash_for_schema_digest(h, *_scylla_tables);
db::schema_tables::feed_hash_for_schema_digest(h, *_scylla_tables, sf);
}
return utils::UUID_gen::get_name_UUID(h.finalize());
}
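The hunk above hashes the schema mutations with a feature set from which `DIGEST_INSENSITIVE_TO_EXPIRY` has been removed, so that an upgraded node computes the same table schema version as a node that has never heard of the feature. A minimal Python sketch of that masking pattern, assuming hypothetical `feed_hash`/`schema_digest` helpers (the real serialization is far more involved):

```python
# Illustrative sketch (not Scylla code): mask a digest-changing feature
# before hashing so nodes on different versions derive the same digest.
import hashlib

DIGEST_INSENSITIVE_TO_EXPIRY = "digest_insensitive_to_expiry"

def feed_hash(h, mutation_rows, features):
    # Stand-in for feed_hash_for_schema_digest: when the feature is
    # enabled, expiry info (everything after '@' here) is excluded.
    for row in mutation_rows:
        payload = row.split("@")[0] if DIGEST_INSENSITIVE_TO_EXPIRY in features else row
        h.update(payload.encode())

def schema_digest(tables_rows, columns_rows):
    sf = {"full", DIGEST_INSENSITIVE_TO_EXPIRY}   # all known features
    # Remove the feature so the digest stays stable across versions,
    # mirroring sf.remove<db::schema_feature::DIGEST_INSENSITIVE_TO_EXPIRY>().
    sf.discard(DIGEST_INSENSITIVE_TO_EXPIRY)
    h = hashlib.md5()
    feed_hash(h, tables_rows, sf)
    feed_hash(h, columns_rows, sf)
    return h.hexdigest()
```

With the feature masked, the new node hashes exactly the bytes an old node would, which is the property the comment in the diff is after.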


@@ -263,11 +263,9 @@ global_schema_ptr::global_schema_ptr(const global_schema_ptr& o)
: global_schema_ptr(o.get())
{ }
global_schema_ptr::global_schema_ptr(global_schema_ptr&& o) {
global_schema_ptr::global_schema_ptr(global_schema_ptr&& o) noexcept {
auto current = engine().cpu_id();
if (o._cpu_of_origin != current) {
throw std::runtime_error("Attempted to move global_schema_ptr across shards");
}
assert(o._cpu_of_origin == current);
_ptr = std::move(o._ptr);
_cpu_of_origin = current;
}


@@ -173,7 +173,7 @@ public:
// The other may come from a different shard.
global_schema_ptr(const global_schema_ptr& other);
// The other must come from current shard.
global_schema_ptr(global_schema_ptr&& other);
global_schema_ptr(global_schema_ptr&& other) noexcept;
// May be invoked across shards. Always returns an engaged pointer.
schema_ptr get() const;
operator schema_ptr() const { return get(); }


@@ -231,9 +231,15 @@ ar = tarfile.open(args.output, mode='w|gz')
pathlib.Path('build/SCYLLA-RELOCATABLE-FILE').touch()
ar.add('build/SCYLLA-RELOCATABLE-FILE', arcname='SCYLLA-RELOCATABLE-FILE')
ar.add('dist/redhat/python3')
ar.add('dist/debian/python3')
ar.add('build/python3/SCYLLA-RELEASE-FILE', arcname='SCYLLA-RELEASE-FILE')
ar.add('build/python3/SCYLLA-VERSION-FILE', arcname='SCYLLA-VERSION-FILE')
ar.add('build/SCYLLA-PRODUCT-FILE', arcname='SCYLLA-PRODUCT-FILE')
for p in ['python3-libs'] + packages:
pdir = pathlib.Path('/usr/share/licenses/{}/'.format(p))
if pdir.exists():
for f in pdir.glob('*'):
ar.add(f, arcname='licenses/{}/{}'.format(p, f.name))
for f in file_list:
copy_file_to_python_env(ar, f)


@@ -61,6 +61,7 @@ args = ap.parse_args()
executables = ['build/{}/scylla'.format(args.mode),
'build/{}/iotune'.format(args.mode),
'/usr/bin/patchelf',
'/usr/bin/lscpu',
'/usr/bin/gawk',
'/usr/bin/gzip',
@@ -76,6 +77,9 @@ libs = {}
for exe in executables:
libs.update(ldd(exe))
# manually add libthread_db for debugging thread
libs.update({'libthread_db.so.1': '/lib64/libthread_db-1.0.so'})
ld_so = libs['ld.so']
have_gnutls = any([lib.startswith('libgnutls.so')
@@ -93,56 +97,9 @@ ar = tarfile.open(fileobj=gzip_process.stdin, mode='w|')
pathlib.Path('build/SCYLLA-RELOCATABLE-FILE').touch()
ar.add('build/SCYLLA-RELOCATABLE-FILE', arcname='SCYLLA-RELOCATABLE-FILE')
# This thunk is a shell script that arranges for the executable to be invoked,
# under the following conditions:
#
# - the same argument vector is passed to the executable, including argv[0]
# - the executable name (/proc/pid/comm, shown in top(1)) is the same
# - the dynamic linker is taken from this package rather than the executable's
# default (which is hardcoded to point to /lib64/ld-linux-x86_64.so or similar)
# - LD_LIBRARY_PATH points to the lib/ directory so shared library dependencies
# are satisfied from there rather than the system default (e.g. /lib64)
# To do that, the dynamic linker is invoked using a symbolic link named after the
# executable, not its standard name. We use bash's "exec -a" to set argv[0].
# The full tangled web looks like:
#
# foobar/bin/scylla a shell script invoking everything
# foobar/libexec/scylla.bin the real binary
# foobar/libexec/scylla a symlink to ../libreloc/ld.so
# foobar/libreloc/ld.so the dynamic linker
# foobar/libreloc/lib... all the other libraries
# the transformations (done by the thunk and symlinks) are:
#
# bin/scylla args -> libexec/scylla libexec/scylla.bin args -> libreloc/ld.so libexec/scylla.bin args
thunk = b'''\
#!/bin/bash
x="$(readlink -f "$0")"
b="$(basename "$x")"
d="$(dirname "$x")/.."
ldso="$d/libexec/$b"
realexe="$d/libexec/$b.bin"
export GNUTLS_SYSTEM_PRIORITY_FILE="${GNUTLS_SYSTEM_PRIORITY_FILE-$d/libreloc/gnutls.config}"
LD_LIBRARY_PATH="$d/libreloc" exec -a "$0" "$ldso" "$realexe" "$@"
'''
for exe in executables:
basename = os.path.basename(exe)
ar.add(exe, arcname='libexec/' + basename + '.bin')
ti = tarfile.TarInfo(name='bin/' + basename)
ti.size = len(thunk)
ti.mode = 0o755
ti.mtime = os.stat(exe).st_mtime
ar.addfile(ti, fileobj=io.BytesIO(thunk))
ti = tarfile.TarInfo(name='libexec/' + basename)
ti.type = tarfile.SYMTYPE
ti.linkname = '../libreloc/ld.so'
ti.mtime = os.stat(exe).st_mtime
ar.addfile(ti)
ar.add(exe, arcname='libexec/' + basename)
for lib, libfile in libs.items():
ar.add(libfile, arcname='libreloc/' + lib)
if have_gnutls:

Some files were not shown because too many files have changed in this diff.