materialized_views: propagate "view virtual columns" between nodes

db::schema_tables::ALL and db::schema_tables::all_tables() are both supposed to list the same schema tables - the former is the list of their names, and the latter is the list of their schemas. This code duplication makes it easy to forget to update one of them, and indeed recently the new "view_virtual_columns" table was added to all_tables() but not to ALL. What this patch does is make ALL a function instead of a constant vector. The newly named all_table_names() function uses all_tables() so the list of schema tables only appears once. So that nobody worries about the performance impact, all_table_names() caches the list in a per-thread vector that is only prepared once per thread. Because after this patch all_table_names() has the "view_virtual_columns" entry that was previously missing, this patch also fixes #4339, which was about virtual columns in materialized views not being propagated to other nodes. Unfortunately, to test the fix for #4339 we need a test with multiple nodes, so we cannot test it here in a unit test, and will instead use the dtest framework, in a separate patch. Fixes #4339 Branches: 3.0 Tests: all unit tests (release and debug mode), new dtest for #4339. The unit test mutation_reader_test failed in debug mode but not in release mode, but this probably has nothing to do with this patch (?). Signed-off-by: Nadav Har'El <nyh@scylladb.com> Message-Id: <20190320063437.32731-1-nyh@scylladb.com> (cherry picked from commit 7c874057f5)
cql: alter type: Format field name as text instead of hex
2020-01-06 00:37:59 +02:00 · 2020-01-05 18:55:40 +02:00 · 2020-01-05 18:50:27 +02:00 · 2019-12-24 18:42:33 +02:00 · 2019-12-24 17:44:40 +02:00 · 2019-12-24 17:44:40 +02:00
154 changed files with 4163 additions and 1799 deletions
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 #!/bin/sh

-VERSION=3.0.2
+VERSION=3.0.11

 if test -f version
 then
--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -241,7 +241,11 @@ future<authenticated_user> password_authenticator::authenticate(
    }).then_wrapped([=](future<::shared_ptr<cql3::untyped_result_set>> f) {
        try {
            auto res = f.get0();
-            if (res->empty() || !passwords::check(password, res->one().get_as<sstring>(SALTED_HASH))) {
+            auto salted_hash = std::experimental::optional<sstring>();
+            if (!res->empty()) {
+                salted_hash = res->one().get_opt<sstring>(SALTED_HASH);
+            }
+            if (!salted_hash || !passwords::check(password, *salted_hash)) {
                throw exceptions::authentication_exception("Username and/or password are incorrect");
            }
            return make_ready_future<authenticated_user>(username);
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -184,7 +184,9 @@ future<> service::start() {
    return once_among_shards([this] {
        return create_keyspace_if_missing();
    }).then([this] {
-        return when_all_succeed(_role_manager->start(), _authorizer->start(), _authenticator->start());
+        return _role_manager->start().then([this] {
+            return when_all_succeed(_authorizer->start(), _authenticator->start());
+        });
    }).then([this] {
        _permissions_cache = std::make_unique<permissions_cache>(_permissions_cache_config, *this, log);
    }).then([this] {
--- a/cache_flat_mutation_reader.hh
+++ b/cache_flat_mutation_reader.hh
@@ -61,6 +61,7 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
        // - _last_row points at a direct predecessor of the next row which is going to be read.
        //   Used for populating continuity.
        // - _population_range_starts_before_all_rows is set accordingly
+        // - _underlying is engaged and fast-forwarded
        reading_from_underlying,

        end_of_stream
@@ -99,7 +100,13 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
    // forward progress is not guaranteed in case iterators are getting constantly invalidated.
    bool _lower_bound_changed = false;

+    // Points to the underlying reader conforming to _schema,
+    // either to *_underlying_holder or _read_context->underlying().underlying().
+    flat_mutation_reader* _underlying = nullptr;
+    std::optional<flat_mutation_reader> _underlying_holder;
+
    future<> do_fill_buffer(db::timeout_clock::time_point);
+    future<> ensure_underlying(db::timeout_clock::time_point);
    void copy_from_cache_to_buffer();
    future<> process_static_row(db::timeout_clock::time_point);
    void move_to_end();
@@ -186,23 +193,22 @@ future<> cache_flat_mutation_reader::process_static_row(db::timeout_clock::time_
        return make_ready_future<>();
    } else {
        _read_context->cache().on_row_miss();
-        return _read_context->get_next_fragment(timeout).then([this] (mutation_fragment_opt&& sr) {
-            if (sr) {
-                assert(sr->is_static_row());
-                maybe_add_to_cache(sr->as_static_row());
-                push_mutation_fragment(std::move(*sr));
-            }
-            maybe_set_static_row_continuous();
+        return ensure_underlying(timeout).then([this, timeout] {
+            return (*_underlying)(timeout).then([this] (mutation_fragment_opt&& sr) {
+                if (sr) {
+                    assert(sr->is_static_row());
+                    maybe_add_to_cache(sr->as_static_row());
+                    push_mutation_fragment(std::move(*sr));
+                }
+                maybe_set_static_row_continuous();
+            });
        });
    }
 }

 inline
 void cache_flat_mutation_reader::touch_partition() {
-    if (_snp->at_latest_version()) {
-        rows_entry& last_dummy = *_snp->version()->partition().clustered_rows().rbegin();
-        _snp->tracker()->touch(last_dummy);
-    }
+    _snp->touch();
 }

 inline
@@ -232,14 +238,36 @@ future<> cache_flat_mutation_reader::fill_buffer(db::timeout_clock::time_point t
    });
 }

+inline
+future<> cache_flat_mutation_reader::ensure_underlying(db::timeout_clock::time_point timeout) {
+    if (_underlying) {
+        return make_ready_future<>();
+    }
+    return _read_context->ensure_underlying(timeout).then([this, timeout] {
+        flat_mutation_reader& ctx_underlying = _read_context->underlying().underlying();
+        if (ctx_underlying.schema() != _schema) {
+            _underlying_holder = make_delegating_reader(ctx_underlying);
+            _underlying_holder->upgrade_schema(_schema);
+            _underlying = &*_underlying_holder;
+        } else {
+            _underlying = &ctx_underlying;
+        }
+    });
+}
+
 inline
 future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_point timeout) {
    if (_state == state::move_to_underlying) {
+        if (!_underlying) {
+            return ensure_underlying(timeout).then([this, timeout] {
+                return do_fill_buffer(timeout);
+            });
+        }
        _state = state::reading_from_underlying;
        _population_range_starts_before_all_rows = _lower_bound.is_before_all_clustered_rows(*_schema);
        auto end = _next_row_in_range ? position_in_partition(_next_row.position())
                                      : position_in_partition(_upper_bound);
-        return _read_context->fast_forward_to(position_range{_lower_bound, std::move(end)}, timeout).then([this, timeout] {
+        return _underlying->fast_forward_to(position_range{_lower_bound, std::move(end)}, timeout).then([this, timeout] {
            return read_from_underlying(timeout);
        });
    }
@@ -280,7 +308,7 @@ future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_poin

 inline
 future<> cache_flat_mutation_reader::read_from_underlying(db::timeout_clock::time_point timeout) {
-    return consume_mutation_fragments_until(_read_context->underlying().underlying(),
+    return consume_mutation_fragments_until(*_underlying,
        [this] { return _state != state::reading_from_underlying || is_buffer_full(); },
        [this] (mutation_fragment mf) {
            _read_context->cache().on_row_miss();
--- a/configure.py
+++ b/configure.py
@@ -273,6 +273,7 @@ scylla_tests = [
    'tests/perf/perf_sstable',
    'tests/cql_query_test',
    'tests/secondary_index_test',
+    'tests/json_cql_query_test',
    'tests/filtering_test',
    'tests/storage_proxy_test',
    'tests/schema_change_test',
@@ -570,6 +571,7 @@ scylla_core = (['database.cc',
                'db/consistency_level.cc',
                'db/system_keyspace.cc',
                'db/system_distributed_keyspace.cc',
+                'db/size_estimates_virtual_reader.cc',
                'db/schema_tables.cc',
                'db/cql_type_parser.cc',
                'db/legacy_schema_migrator.cc',
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -470,6 +470,7 @@ insertStatement returns [::shared_ptr<raw::modification_statement> expr]
        std::vector<::shared_ptr<cql3::column_identifier::raw>> column_names;
        std::vector<::shared_ptr<cql3::term::raw>> values;
        bool if_not_exists = false;
+        bool default_unset = false;
        ::shared_ptr<cql3::term::raw> json_value;
    }
    : K_INSERT K_INTO cf=columnFamilyName
@@ -487,13 +488,15 @@ insertStatement returns [::shared_ptr<raw::modification_statement> expr]
              }
        | K_JSON
          json_token=jsonValue { json_value = $json_token.value; }
+            ( K_DEFAULT K_UNSET { default_unset = true; } | K_DEFAULT K_NULL )?
            ( K_IF K_NOT K_EXISTS { if_not_exists = true; } )?
            ( usingClause[attrs] )?
              {
              $expr = ::make_shared<raw::insert_json_statement>(std::move(cf),
                                                       std::move(attrs),
                                                       std::move(json_value),
-                                                       if_not_exists);
+                                                       if_not_exists,
+                                                       default_unset);
              }
        )
    ;
@@ -1835,6 +1838,8 @@ K_OR:          O R;
 K_REPLACE:     R E P L A C E;
 K_DETERMINISTIC: D E T E R M I N I S T I C;
 K_JSON:        J S O N;
+K_DEFAULT:     D E F A U L T;
+K_UNSET:       U N S E T;

 K_EMPTY:       E M P T Y;

--- a/cql3/query_options.cc
+++ b/cql3/query_options.cc
@@ -130,6 +130,18 @@ query_options::query_options(std::unique_ptr<query_options> qo, ::shared_ptr<ser

 }

+query_options::query_options(std::unique_ptr<query_options> qo, ::shared_ptr<service::pager::paging_state> paging_state, int32_t page_size)
+        : query_options(qo->_consistency,
+        qo->get_timeout_config(),
+        std::move(qo->_names),
+        std::move(qo->_values),
+        std::move(qo->_value_views),
+        qo->_skip_metadata,
+        std::move(query_options::specific_options{page_size, paging_state, qo->_options.serial_consistency, qo->_options.timestamp}),
+        qo->_cql_serialization_format) {
+
+}
+
 query_options::query_options(std::vector<cql3::raw_value> values)
    : query_options(
          db::consistency_level::ONE, infinite_timeout_config, std::move(values))
--- a/cql3/query_options.hh
+++ b/cql3/query_options.hh
@@ -102,7 +102,7 @@ private:

 public:
    query_options(query_options&&) = default;
-    query_options(const query_options&) = delete;
+    explicit query_options(const query_options&) = default;

    explicit query_options(db::consistency_level consistency,
                           const timeout_config& timeouts,
@@ -155,6 +155,7 @@ public:
    explicit query_options(db::consistency_level, const timeout_config& timeouts,
            std::vector<cql3::raw_value> values, specific_options options = specific_options::DEFAULT);
    explicit query_options(std::unique_ptr<query_options>, ::shared_ptr<service::pager::paging_state> paging_state);
+    explicit query_options(std::unique_ptr<query_options>, ::shared_ptr<service::pager::paging_state> paging_state, int32_t page_size);

    const timeout_config& get_timeout_config() const { return _timeout_config; }

--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -214,11 +214,9 @@ statement_restrictions::statement_restrictions(database& db,
    }
    auto& cf = db.find_column_family(schema);
    auto& sim = cf.get_index_manager();
-    bool has_queriable_clustering_column_index = _clustering_columns_restrictions->has_supporting_index(sim);
-    bool has_queriable_pk_index = _partition_key_restrictions->has_supporting_index(sim);
-    bool has_queriable_index = has_queriable_clustering_column_index
-            || has_queriable_pk_index
-            || _nonprimary_key_restrictions->has_supporting_index(sim);
+    const bool has_queriable_clustering_column_index = _clustering_columns_restrictions->has_supporting_index(sim);
+    const bool has_queriable_pk_index = _partition_key_restrictions->has_supporting_index(sim);
+    const bool has_queriable_regular_index = _nonprimary_key_restrictions->has_supporting_index(sim);

    // At this point, the select statement if fully constructed, but we still have a few things to validate
    process_partition_key_restrictions(has_queriable_pk_index, for_view, allow_filtering);
@@ -279,7 +277,7 @@ statement_restrictions::statement_restrictions(database& db,
    }

    if (!_nonprimary_key_restrictions->empty()) {
-        if (has_queriable_index) {
+        if (has_queriable_regular_index) {
            _uses_secondary_indexing = true;
        } else if (!allow_filtering) {
            throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
@@ -365,8 +363,9 @@ std::vector<const column_definition*> statement_restrictions::get_column_defs_fo
                }
            }
        }
-        if (_clustering_columns_restrictions->needs_filtering(*_schema)) {
-            column_id first_filtering_id = _schema->clustering_key_columns().begin()->id +
+        const bool pk_has_unrestricted_components = _partition_key_restrictions->has_unrestricted_components(*_schema);
+        if (pk_has_unrestricted_components || _clustering_columns_restrictions->needs_filtering(*_schema)) {
+            column_id first_filtering_id = pk_has_unrestricted_components ? 0 : _schema->clustering_key_columns().begin()->id +
                    _clustering_columns_restrictions->num_prefix_columns_that_need_not_be_filtered();
            for (auto&& cdef : _clustering_columns_restrictions->get_column_defs()) {
                if (cdef->id >= first_filtering_id && !column_uses_indexing(cdef)) {
@@ -481,10 +480,9 @@ bool statement_restrictions::need_filtering() const {
    int number_of_filtering_restrictions = _nonprimary_key_restrictions->size();
    // If the whole partition key is restricted, it does not imply filtering
    if (_partition_key_restrictions->has_unrestricted_components(*_schema) || !_partition_key_restrictions->is_all_eq()) {
-        number_of_filtering_restrictions += _partition_key_restrictions->size();
-        if (_clustering_columns_restrictions->has_unrestricted_components(*_schema)) {
-            number_of_filtering_restrictions += _clustering_columns_restrictions->size() - _clustering_columns_restrictions->prefix_size();
-        }
+        number_of_filtering_restrictions += _partition_key_restrictions->size() + _clustering_columns_restrictions->size();
+    } else if (_clustering_columns_restrictions->has_unrestricted_components(*_schema)) {
+        number_of_filtering_restrictions += _clustering_columns_restrictions->size() - _clustering_columns_restrictions->prefix_size();
    }

    if (_partition_key_restrictions->is_multi_column() || _clustering_columns_restrictions->is_multi_column()) {
--- a/cql3/restrictions/statement_restrictions.hh
+++ b/cql3/restrictions/statement_restrictions.hh
@@ -395,6 +395,14 @@ public:
        return !_nonprimary_key_restrictions->empty();
    }

+    bool pk_restrictions_need_filtering() const {
+        return _partition_key_restrictions->needs_filtering(*_schema);
+    }
+
+    bool ck_restrictions_need_filtering() const {
+        return _partition_key_restrictions->has_unrestricted_components(*_schema) || _clustering_columns_restrictions->needs_filtering(*_schema);
+    }
+
    /**
     * @return true if column is restricted by some restriction, false otherwise
     */
--- a/cql3/result_set.cc
+++ b/cql3/result_set.cc
@@ -83,6 +83,9 @@ void metadata::maybe_set_paging_state(::shared_ptr<const service::pager::paging_
    assert(paging_state);
    if (paging_state->get_remaining() > 0) {
        set_paging_state(std::move(paging_state));
+    } else {
+        _flags.remove<flag::HAS_MORE_PAGES>();
+        _paging_state = nullptr;
    }
 }

--- a/cql3/selection/selectable.cc
+++ b/cql3/selection/selectable.cc
@@ -142,7 +142,7 @@ shared_ptr<selector::factory>
 selectable::with_field_selection::new_selector_factory(database& db, schema_ptr s, std::vector<const column_definition*>& defs) {
    auto&& factory = _selected->new_selector_factory(db, s, defs);
    auto&& type = factory->new_instance()->get_type();
-    auto&& ut = dynamic_pointer_cast<const user_type_impl>(std::move(type));
+    auto&& ut = dynamic_pointer_cast<const user_type_impl>(type->underlying_type());
    if (!ut) {
        throw exceptions::invalid_request_exception(
                sprint("Invalid field selection: %s of type %s is not a user type",
--- a/cql3/statements/alter_type_statement.cc
+++ b/cql3/statements/alter_type_statement.cc
@@ -165,7 +165,7 @@ alter_type_statement::add_or_alter::add_or_alter(const ut_name& name, bool is_ad
 user_type alter_type_statement::add_or_alter::do_add(database& db, user_type to_update) const
 {
    if (get_idx_of_field(to_update, _field_name)) {
-        throw exceptions::invalid_request_exception(sprint("Cannot add new field %s to type %s: a field of the same name already exists", _field_name->name(), _name.to_string()));
+        throw exceptions::invalid_request_exception(sprint("Cannot add new field %s to type %s: a field of the same name already exists", _field_name->to_string(), _name.to_string()));
    }

    std::vector<bytes> new_names(to_update->field_names());
@@ -173,7 +173,7 @@ user_type alter_type_statement::add_or_alter::do_add(database& db, user_type to_
    std::vector<data_type> new_types(to_update->field_types());
    auto&& add_type = _field_type->prepare(db, keyspace())->get_type();
    if (add_type->references_user_type(to_update->_keyspace, to_update->_name)) {
-        throw exceptions::invalid_request_exception(sprint("Cannot add new field %s of type %s to type %s as this would create a circular reference", _field_name->name(), _field_type->to_string(), _name.to_string()));
+        throw exceptions::invalid_request_exception(sprint("Cannot add new field %s of type %s to type %s as this would create a circular reference", _field_name->to_string(), _field_type->to_string(), _name.to_string()));
    }
    new_types.push_back(std::move(add_type));
    return user_type_impl::get_instance(to_update->_keyspace, to_update->_name, std::move(new_names), std::move(new_types));
@@ -183,13 +183,13 @@ user_type alter_type_statement::add_or_alter::do_alter(database& db, user_type t
 {
    stdx::optional<uint32_t> idx = get_idx_of_field(to_update, _field_name);
    if (!idx) {
-        throw exceptions::invalid_request_exception(sprint("Unknown field %s in type %s", _field_name->name(), _name.to_string()));
+        throw exceptions::invalid_request_exception(sprint("Unknown field %s in type %s", _field_name->to_string(), _name.to_string()));
    }

    auto previous = to_update->field_types()[*idx];
    auto new_type = _field_type->prepare(db, keyspace())->get_type();
    if (!new_type->is_compatible_with(*previous)) {
-        throw exceptions::invalid_request_exception(sprint("Type %s in incompatible with previous type %s of field %s in user type %s", _field_type->to_string(), previous->as_cql3_type()->to_string(), _field_name->name(), _name.to_string()));
+        throw exceptions::invalid_request_exception(sprint("Type %s in incompatible with previous type %s of field %s in user type %s", _field_type->to_string(), previous->as_cql3_type()->to_string(), _field_name->to_string(), _name.to_string()));
    }

    std::vector<data_type> new_types(to_update->field_types());
--- a/cql3/statements/raw/insert_statement.hh
+++ b/cql3/statements/raw/insert_statement.hh
@@ -87,6 +87,7 @@ private:
    ::shared_ptr<attributes::raw> _attrs;
    ::shared_ptr<term::raw> _json_value;
    bool _if_not_exists;
+    bool _default_unset;
 public:
    /**
     * A parsed <code>INSERT JSON</code> statement.
@@ -95,7 +96,7 @@ public:
     * @param json_value JSON string representing names and values
     * @param attrs additional attributes for statement (CL, timestamp, timeToLive)
     */
-    insert_json_statement(::shared_ptr<cf_name> name, ::shared_ptr<attributes::raw> attrs, ::shared_ptr<term::raw> json_value, bool if_not_exists);
+    insert_json_statement(::shared_ptr<cf_name> name, ::shared_ptr<attributes::raw> attrs, ::shared_ptr<term::raw> json_value, bool if_not_exists, bool default_unset);

    virtual ::shared_ptr<cql3::statements::modification_statement> prepare_internal(database& db, schema_ptr schema,
                ::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs, cql_stats& stats) override;
--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -522,8 +522,8 @@ indexed_table_select_statement::prepare_command_for_base_query(const query_optio
    return cmd;
 }

-future<shared_ptr<cql_transport::messages::result_message>>
-indexed_table_select_statement::execute_base_query(
+future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>
+indexed_table_select_statement::do_execute_base_query(
        service::storage_proxy& proxy,
        dht::partition_range_vector&& partition_ranges,
        service::query_state& state,
@@ -582,22 +582,27 @@ indexed_table_select_statement::execute_base_query(
        }).then([&merger]() {
            return merger.get();
        });
-    }).then([this, &proxy, &state, &options, now, cmd, paging_state = std::move(paging_state)] (foreign_ptr<lw_shared_ptr<query::result>> result) mutable {
-        return this->process_base_query_results(std::move(result), cmd, proxy, state, options, now, std::move(paging_state));
+    }).then([cmd] (foreign_ptr<lw_shared_ptr<query::result>> result) mutable {
+        return make_ready_future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>(std::move(result), std::move(cmd));
    });
 }

-// Function for fetching the selected columns from a list of clustering rows.
-// It is currently used only in our Secondary Index implementation - ordinary
-// CQL SELECT statements do not have the syntax to request a list of rows.
-// FIXME: The current implementation is very inefficient - it requests each
-// row separately (and, incrementally, in parallel). Even multiple rows from a single
-// partition are requested separately. This last case can be easily improved,
-// but to implement the general case (multiple rows from multiple partitions)
-// efficiently, we will need more support from other layers.
-// Keys are ordered in token order (see #3423)
 future<shared_ptr<cql_transport::messages::result_message>>
 indexed_table_select_statement::execute_base_query(
+        service::storage_proxy& proxy,
+        dht::partition_range_vector&& partition_ranges,
+        service::query_state& state,
+        const query_options& options,
+        gc_clock::time_point now,
+        ::shared_ptr<const service::pager::paging_state> paging_state) {
+    return do_execute_base_query(proxy, std::move(partition_ranges), state, options, now, paging_state).then(
+            [this, &proxy, &state, &options, now, paging_state = std::move(paging_state)] (foreign_ptr<lw_shared_ptr<query::result>> result, lw_shared_ptr<query::read_command> cmd) {
+        return process_base_query_results(std::move(result), std::move(cmd), proxy, state, options, now, std::move(paging_state));
+    });
+}
+
+future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>
+indexed_table_select_statement::do_execute_base_query(
        service::storage_proxy& proxy,
        std::vector<primary_key>&& primary_keys,
        service::query_state& state,
@@ -652,9 +657,23 @@ indexed_table_select_statement::execute_base_query(
            });
        }).then([&merger] () {
            return merger.get();
+        }).then([cmd] (foreign_ptr<lw_shared_ptr<query::result>> result) mutable {
+            return make_ready_future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>(std::move(result), std::move(cmd));
        });
-    }).then([this, &proxy, &state, &options, now, cmd, paging_state = std::move(paging_state)] (foreign_ptr<lw_shared_ptr<query::result>> result) mutable {
-        return this->process_base_query_results(std::move(result), cmd, proxy, state, options, now, std::move(paging_state));
+    });
+}
+
+future<shared_ptr<cql_transport::messages::result_message>>
+indexed_table_select_statement::execute_base_query(
+        service::storage_proxy& proxy,
+        std::vector<primary_key>&& primary_keys,
+        service::query_state& state,
+        const query_options& options,
+        gc_clock::time_point now,
+        ::shared_ptr<const service::pager::paging_state> paging_state) {
+    return do_execute_base_query(proxy, std::move(primary_keys), state, options, now, paging_state).then(
+            [this, &proxy, &state, &options, now, paging_state = std::move(paging_state)] (foreign_ptr<lw_shared_ptr<query::result>> result, lw_shared_ptr<query::read_command> cmd) {
+        return process_base_query_results(std::move(result), std::move(cmd), proxy, state, options, now, std::move(paging_state));
    });
 }

@@ -929,6 +948,60 @@ indexed_table_select_statement::do_execute(service::storage_proxy& proxy,
        }
    }

+    // Aggregated and paged filtering needs to aggregate the results from all pages
+    // in order to avoid returning partial per-page results (issue #4540).
+    // It's a little bit more complicated than regular aggregation, because each paging state
+    // needs to be translated between the base table and the underlying view.
+    // The routine below keeps fetching pages from the underlying view, which are then
+    // used to fetch base rows, which go straight to the result set builder.
+    // A local, internal copy of query_options is kept in order to keep updating
+    // the paging state between requesting data from replicas.
+    const bool aggregate = _selection->is_aggregate();
+    if (aggregate) {
+        const bool restrictions_need_filtering = _restrictions->need_filtering();
+        return do_with(cql3::selection::result_set_builder(*_selection, now, options.get_cql_serialization_format()), std::make_unique<cql3::query_options>(cql3::query_options(options)),
+                [this, &options, &proxy, &state, now, whole_partitions, partition_slices, restrictions_need_filtering] (cql3::selection::result_set_builder& builder, std::unique_ptr<cql3::query_options>& internal_options) {
+            // page size is set to the internal count page size, regardless of the user-provided value
+            internal_options.reset(new cql3::query_options(std::move(internal_options), options.get_paging_state(), DEFAULT_COUNT_PAGE_SIZE));
+            return repeat([this, &builder, &options, &internal_options, &proxy, &state, now, whole_partitions, partition_slices, restrictions_need_filtering] () {
+                auto consume_results = [this, &builder, &options, &internal_options, restrictions_need_filtering] (foreign_ptr<lw_shared_ptr<query::result>> results, lw_shared_ptr<query::read_command> cmd) {
+                    if (restrictions_need_filtering) {
+                        query::result_view::consume(*results, cmd->slice, cql3::selection::result_set_builder::visitor(builder, *_schema, *_selection,
+                                cql3::selection::result_set_builder::restrictions_filter(_restrictions, options, cmd->row_limit)));
+                    } else {
+                        query::result_view::consume(*results, cmd->slice, cql3::selection::result_set_builder::visitor(builder, *_schema, *_selection));
+                    }
+                };
+
+                if (whole_partitions || partition_slices) {
+                    return find_index_partition_ranges(proxy, state, *internal_options).then(
+                            [this, now, &state, &internal_options, &proxy, consume_results = std::move(consume_results)] (dht::partition_range_vector partition_ranges, ::shared_ptr<const service::pager::paging_state> paging_state) {
+                        bool has_more_pages = paging_state && paging_state->get_remaining() > 0;
+                        internal_options.reset(new cql3::query_options(std::move(internal_options), paging_state ? ::make_shared<service::pager::paging_state>(*paging_state) : nullptr));
+                        return do_execute_base_query(proxy, std::move(partition_ranges), state, *internal_options, now, std::move(paging_state)).then(consume_results).then([has_more_pages] {
+                            return stop_iteration(!has_more_pages);
+                        });
+                    });
+                } else {
+                    return find_index_clustering_rows(proxy, state, *internal_options).then(
+                            [this, now, &state, &internal_options, &proxy, consume_results = std::move(consume_results)] (std::vector<primary_key> primary_keys, ::shared_ptr<const service::pager::paging_state> paging_state) {
+                        bool has_more_pages = paging_state && paging_state->get_remaining() > 0;
+                        internal_options.reset(new cql3::query_options(std::move(internal_options), paging_state ? ::make_shared<service::pager::paging_state>(*paging_state) : nullptr));
+                        return this->do_execute_base_query(proxy, std::move(primary_keys), state, *internal_options, now, std::move(paging_state)).then(consume_results).then([has_more_pages] {
+                            return stop_iteration(!has_more_pages);
+                        });
+                    });
+                }
+            }).then([this, &builder, restrictions_need_filtering] () {
+                auto rs = builder.build();
+                update_stats_rows_read(rs->size());
+                _stats.filtered_rows_matched_total += restrictions_need_filtering ? rs->size() : 0;
+                auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
+                return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
+            });
+        });
+    }
+
    if (whole_partitions || partition_slices) {
        // In this case, can use our normal query machinery, which retrieves
        // entire partitions or the same slice for many partitions.
--- a/cql3/statements/select_statement.hh
+++ b/cql3/statements/select_statement.hh
@@ -67,8 +67,8 @@ class select_statement : public cql_statement {
 public:
    using parameters = raw::select_statement::parameters;
    using ordering_comparator_type = raw::select_statement::ordering_comparator_type;
-protected:
    static constexpr int DEFAULT_COUNT_PAGE_SIZE = 10000;
+protected:
    static thread_local const ::shared_ptr<parameters> _default_parameters;
    schema_ptr _schema;
    uint32_t _bound_terms;
@@ -213,6 +213,14 @@ private:
    lw_shared_ptr<query::read_command>
    prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging);

+    future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>
+    do_execute_base_query(
+            service::storage_proxy& proxy,
+            dht::partition_range_vector&& partition_ranges,
+            service::query_state& state,
+            const query_options& options,
+            gc_clock::time_point now,
+            ::shared_ptr<const service::pager::paging_state> paging_state);
    future<shared_ptr<cql_transport::messages::result_message>>
    execute_base_query(
            service::storage_proxy& proxy,
@@ -222,6 +230,23 @@ private:
            gc_clock::time_point now,
            ::shared_ptr<const service::pager::paging_state> paging_state);

+    // Function for fetching the selected columns from a list of clustering rows.
+    // It is currently used only in our Secondary Index implementation - ordinary
+    // CQL SELECT statements do not have the syntax to request a list of rows.
+    // FIXME: The current implementation is very inefficient - it requests each
+    // row separately (and, incrementally, in parallel). Even multiple rows from a single
+    // partition are requested separately. This last case can be easily improved,
+    // but to implement the general case (multiple rows from multiple partitions)
+    // efficiently, we will need more support from other layers.
+    // Keys are ordered in token order (see #3423)
+    future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>
+    do_execute_base_query(
+            service::storage_proxy& proxy,
+            std::vector<primary_key>&& primary_keys,
+            service::query_state& state,
+            const query_options& options,
+            gc_clock::time_point now,
+            ::shared_ptr<const service::pager::paging_state> paging_state);
    future<shared_ptr<cql_transport::messages::result_message>>
    execute_base_query(
            service::storage_proxy& proxy,
--- a/cql3/statements/update_statement.cc
+++ b/cql3/statements/update_statement.cc
@@ -84,8 +84,11 @@ parse(const sstring& json_string, const std::vector<column_definition>& expected
    for (const auto& def : expected_receivers) {
        sstring cql_name = def.name_as_text();
        auto value_it = prepared_map.find(cql_name);
-        if (value_it == prepared_map.end() || value_it->second.isNull()) {
+        if (value_it == prepared_map.end()) {
+            continue;
+        } else if (value_it->second.isNull()) {
            json_map.emplace(std::move(cql_name), bytes_opt{});
+            prepared_map.erase(value_it);
        } else {
            json_map.emplace(std::move(cql_name), def.type->from_json_object(value_it->second, sf));
            prepared_map.erase(value_it);
@@ -255,8 +258,12 @@ void insert_prepared_json_statement::execute_operations_for_key(mutation& m, con
            throw exceptions::invalid_request_exception(sprint("Cannot set the value of counter column %s in JSON", def.name_as_text()));
        }

-        auto value = json_cache->at(def.name_as_text());
-        execute_set_value(m, prefix, params, def, value);
+        auto it = json_cache->find(def.name_as_text());
+        if (it != json_cache->end()) {
+            execute_set_value(m, prefix, params, def, it->second);
+        } else if (!_default_unset) {
+            execute_set_value(m, prefix, params, def, bytes_opt{});
+        }
    }
 }

@@ -322,12 +329,14 @@ insert_statement::prepare_internal(database& db, schema_ptr schema,
 insert_json_statement::insert_json_statement(  ::shared_ptr<cf_name> name,
                                               ::shared_ptr<attributes::raw> attrs,
                                               ::shared_ptr<term::raw> json_value,
-                                               bool if_not_exists)
+                                               bool if_not_exists,
+                                               bool default_unset)
    : raw::modification_statement{name, attrs, conditions_vector{}, if_not_exists, false}
    , _name(name)
    , _attrs(attrs)
    , _json_value(json_value)
-    , _if_not_exists(if_not_exists) { }
+    , _if_not_exists(if_not_exists)
+    , _default_unset(default_unset) { }

 ::shared_ptr<cql3::statements::modification_statement>
 insert_json_statement::prepare_internal(database& db, schema_ptr schema,
@@ -337,7 +346,7 @@ insert_json_statement::prepare_internal(database& db, schema_ptr schema,
    auto json_column_placeholder = ::make_shared<column_identifier>("", true);
    auto prepared_json_value = _json_value->prepare(db, "", ::make_shared<column_specification>("", "", json_column_placeholder, utf8_type));
    prepared_json_value->collect_marker_specification(bound_names);
-    return ::make_shared<cql3::statements::insert_prepared_json_statement>(bound_names->size(), schema, std::move(attrs), &stats.inserts, std::move(prepared_json_value));
+    return ::make_shared<cql3::statements::insert_prepared_json_statement>(bound_names->size(), schema, std::move(attrs), &stats.inserts, std::move(prepared_json_value), _default_unset);
 }

 update_statement::update_statement(            ::shared_ptr<cf_name> name,
--- a/cql3/statements/update_statement.hh
+++ b/cql3/statements/update_statement.hh
@@ -82,9 +82,10 @@ private:
 */
 class insert_prepared_json_statement : public update_statement {
    ::shared_ptr<term> _term;
+    bool _default_unset;
 public:
-    insert_prepared_json_statement(uint32_t bound_terms, schema_ptr s, std::unique_ptr<attributes> attrs, uint64_t* cql_stats_counter_ptr, ::shared_ptr<term> t)
-        : update_statement(statement_type::INSERT, bound_terms, s, std::move(attrs), cql_stats_counter_ptr), _term(t) {
+    insert_prepared_json_statement(uint32_t bound_terms, schema_ptr s, std::unique_ptr<attributes> attrs, uint64_t* cql_stats_counter_ptr, ::shared_ptr<term> t, bool default_unset)
+        : update_statement(statement_type::INSERT, bound_terms, s, std::move(attrs), cql_stats_counter_ptr), _term(t), _default_unset(default_unset) {
        _restrictions = ::make_shared<restrictions::statement_restrictions>(s, false);
    }
 private:
--- a/cql3/tuples.hh
+++ b/cql3/tuples.hh
@@ -54,7 +54,7 @@ public:
                column->ks_name,
                column->cf_name,
                ::make_shared<column_identifier>(sprint("%s[%d]", column->name, component), true),
-                static_pointer_cast<const tuple_type_impl>(column->type)->type(component));
+                static_pointer_cast<const tuple_type_impl>(column->type->underlying_type())->type(component));
    }

    /**
@@ -112,7 +112,7 @@ public:

    private:
        void validate_assignable_to(database& db, const sstring& keyspace, shared_ptr<column_specification> receiver) {
-            auto tt = dynamic_pointer_cast<const tuple_type_impl>(receiver->type);
+            auto tt = dynamic_pointer_cast<const tuple_type_impl>(receiver->type->underlying_type());
            if (!tt) {
                throw exceptions::invalid_request_exception(sprint("Invalid tuple type literal for %s of type %s", receiver->name, receiver->type->as_cql3_type()));
            }
--- a/database.cc
+++ b/database.cc
@@ -1513,7 +1513,8 @@ future<> table::cleanup_sstables(sstables::compaction_descriptor descriptor) {
            return with_semaphore(sem, 1, [this, &sst] {
                // release reference to sstables cleaned up, otherwise space usage from their data and index
                // components cannot be reclaimed until all of them are cleaned.
-                return this->compact_sstables(sstables::compaction_descriptor({ std::move(sst) }, sst->get_sstable_level()), true);
+                auto sstable_level = sst->get_sstable_level();
+                return this->compact_sstables(sstables::compaction_descriptor({ std::move(sst) }, sstable_level), true);
            });
        });
    });
@@ -2232,6 +2233,10 @@ void backlog_controller::adjust() {

 float backlog_controller::backlog_of_shares(float shares) const {
    size_t idx = 1;
+    // No control points means the controller is disabled.
+    if (_control_points.size() == 0) {
+            return 1.0f;
+    }
    while ((idx < _control_points.size() - 1) && (_control_points[idx].output < shares)) {
        idx++;
    }
@@ -4356,6 +4361,8 @@ future<int64_t>
 table::disable_sstable_write() {
    _sstable_writes_disabled_at = std::chrono::steady_clock::now();
    return _sstables_lock.write_lock().then([this] {
+      // _sstable_deletion_sem must be acquired after _sstables_lock.write_lock
+      return _sstable_deletion_sem.wait().then([this] {
        if (_sstables->all()->empty()) {
            return make_ready_future<int64_t>(0);
        }
@@ -4364,9 +4371,19 @@ table::disable_sstable_write() {
            max = std::max(max, s->generation());
        }
        return make_ready_future<int64_t>(max);
+      });
    });
 }

+std::chrono::steady_clock::duration table::enable_sstable_write(int64_t new_generation) {
+    const bool has_new_generation = (new_generation != -1); // -1: leave the known generation untouched
+    if (has_new_generation) { update_sstables_known_generation(new_generation); }
+    // Undo disable_sstable_write(): release in the reverse order of acquisition.
+    _sstable_deletion_sem.signal();
+    _sstables_lock.write_unlock();
+    return std::chrono::steady_clock::now() - _sstable_writes_disabled_at;
+}
+
 std::ostream& operator<<(std::ostream& os, const user_types_metadata& m) {
    os << "org.apache.cassandra.config.UTMetaData@" << &m;
    return os;
--- a/database.hh
+++ b/database.hh
@@ -447,6 +447,7 @@ private:
    // This semaphore ensures that an operation like snapshot won't have its selected
    // sstables deleted by compaction in parallel, a race condition which could
    // easily result in failure.
+    // Locking order: must be acquired either independently or after _sstables_lock
    seastar::semaphore _sstable_deletion_sem = {1};
    // There are situations in which we need to stop writing sstables. Flushers will take
    // the read lock, and the ones that wish to stop that process will take the write lock.
@@ -737,13 +738,7 @@ public:

    // SSTable writes are now allowed again, and generation is updated to new_generation if != -1
    // returns the amount of microseconds elapsed since we disabled writes.
-    std::chrono::steady_clock::duration enable_sstable_write(int64_t new_generation) {
-        if (new_generation != -1) {
-            update_sstables_known_generation(new_generation);
-        }
-        _sstables_lock.write_unlock();
-        return std::chrono::steady_clock::now() - _sstable_writes_disabled_at;
-    }
+    std::chrono::steady_clock::duration enable_sstable_write(int64_t new_generation);

    // Make sure the generation numbers are sequential, starting from "start".
    // Generations before "start" are left untouched.
@@ -897,7 +892,7 @@ public:
    }

 private:
-    future<row_locker::lock_holder> do_push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, mutation_source&& source) const;
+    future<row_locker::lock_holder> do_push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, mutation_source&& source, const io_priority_class& io_priority) const;
    std::vector<view_ptr> affected_views(const schema_ptr& base, const mutation& update) const;
    future<> generate_and_propagate_view_updates(const schema_ptr& base,
            std::vector<view_ptr>&& views,
--- a/db/batchlog_manager.cc
+++ b/db/batchlog_manager.cc
@@ -395,10 +395,8 @@ std::unordered_set<gms::inet_address> db::batchlog_manager::endpoint_filter(cons

    // grab a random member of up to two racks
    for (auto& rack : racks) {
-        auto rack_members = validated.bucket(rack);
-        auto n = validated.bucket_size(rack_members);
        auto cpy = boost::copy_range<std::vector<gms::inet_address>>(validated.equal_range(rack) | boost::adaptors::map_values);
-        std::uniform_int_distribution<size_t> rdist(0, n - 1);
+        std::uniform_int_distribution<size_t> rdist(0, cpy.size() - 1);
        result.emplace(cpy[rdist(_e1)]);
    }

--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -689,6 +689,8 @@ public:
        // but all previous write/flush pairs.
        return _pending_ops.run_with_ordered_post_op(rp, [this, size, off, buf = std::move(buf)]() mutable { ///////////////////////////////////////////////////
            auto view = fragmented_temporary_buffer::view(buf);
+            view.remove_suffix(buf.size_bytes() - size);
+            assert(size == view.size_bytes());
            return do_with(off, view, [&] (uint64_t& off, fragmented_temporary_buffer::view& view) {
                if (view.empty()) {
                    return make_ready_future<>();
@@ -1187,6 +1189,34 @@ void db::commitlog::segment_manager::flush_segments(bool force) {
    }
 }

+/// \brief Helper for ensuring a file is closed if an exception is thrown.
+///
+/// The file provided by the file_fut future is passed to func.
+/// * If func throws an exception E, the file is closed and we return
+///   a failed future with E (a failure of close() itself is discarded).
+/// * If func returns a value V, the file is not closed and we return
+///   a future with V.
+/// Note that when an exception is not thrown, it is the
+/// responsibility of func to make sure the file will be closed. It
+/// can close the file itself, return it, or store it somewhere.
+///
+/// \tparam Func The type of function this wraps
+/// \param file_fut A future that produces a file
+/// \param func A function that uses a file
+/// \return A future that passes the file produced by file_fut to func
+///         and closes it if func fails
+template <typename Func>
+static auto close_on_failure(future<file> file_fut, Func func) {
+    return file_fut.then([func = std::move(func)](file f) {
+        return futurize_apply(func, f).handle_exception([f] (std::exception_ptr e) mutable {
+            return f.close().then_wrapped([f, e = std::move(e)] (future<> close_result) {
+                close_result.ignore_ready_future(); // E takes precedence; mark a failed close() as handled
+                return futurize<std::result_of_t<Func(file)>>::make_exception_future(e);
+            });
+        });
+    });
+}
+
 future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::allocate_segment(bool active) {
    static const auto flags = open_flags::wo | open_flags::create;

@@ -1217,7 +1247,7 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
        return fut;
    });

-    return fut.then([this, d, active, filename](file f) {
+    return close_on_failure(std::move(fut), [this, d, active, filename] (file f) {
        f = make_checked_file(commit_error_handler, f);
        // xfs doesn't like files extended betond eof, so enlarge the file
        return f.truncate(max_size).then([this, d, active, f, filename] () mutable {
@@ -1755,7 +1785,7 @@ db::commitlog::read_log_file(const sstring& filename, seastar::io_priority_class
                }

                if (magic != segment::segment_magic) {
-                    throw std::invalid_argument("Not a scylla format commitlog file");
+                    throw invalid_segment_format();
                }
                crc32_nbo crc;
                crc.process(ver);
@@ -1764,7 +1794,7 @@ db::commitlog::read_log_file(const sstring& filename, seastar::io_priority_class

                auto cs = crc.checksum();
                if (cs != checksum) {
-                    throw std::runtime_error("Checksum error in file header");
+                    throw header_checksum_error();
                }

                this->id = id;
--- a/db/commitlog/commitlog.hh
+++ b/db/commitlog/commitlog.hh
@@ -342,18 +342,40 @@ public:

    typedef std::function<future<>(temporary_buffer<char>, replay_position)> commit_load_reader_func;

-    class segment_data_corruption_error: public std::runtime_error {
+    class segment_error : public std::exception {};
+
+    class segment_data_corruption_error: public segment_error {
+        std::string _msg;
    public:
        segment_data_corruption_error(std::string msg, uint64_t s)
-                : std::runtime_error(msg), _bytes(s) {
+                : _msg(std::move(msg)), _bytes(s) {
        }
        uint64_t bytes() const {
            return _bytes;
        }
+        const char* what() const noexcept override { // message supplied at construction
+            return _msg.c_str();
+        }
    private:
        uint64_t _bytes;
    };

+    class invalid_segment_format : public segment_error { // raised by read_log_file() on a segment magic mismatch
+        static constexpr const char* _msg = "Not a scylla format commitlog file";
+    public:
+        const char* what() const noexcept override {
+            return _msg;
+        }
+    };
+
+    class header_checksum_error : public segment_error { // raised by read_log_file() when the header CRC does not match
+        static constexpr const char* _msg = "Checksum error in file header";
+    public:
+        const char* what() const noexcept override {
+            return _msg;
+        }
+    };
+
    static future<std::unique_ptr<subscription<temporary_buffer<char>, replay_position>>> read_log_file(
            const sstring&, seastar::io_priority_class read_io_prio_class, commit_load_reader_func, position_type = 0, const db::extensions* = nullptr);
 private:
--- a/db/commitlog/commitlog_replayer.cc
+++ b/db/commitlog/commitlog_replayer.cc
@@ -164,7 +164,7 @@ future<> db::commitlog_replayer::impl::init() {
                // Get all truncation records for the CF and initialize max rps if
                // present. Cannot do this on demand, as there may be no sstables to
                // mark the CF as "needed".
-                return db::system_keyspace::get_truncated_position(uuid).then([&map, &uuid](std::vector<db::replay_position> tpps) {
+                return db::system_keyspace::get_truncated_position(uuid).then([&map, uuid](std::vector<db::replay_position> tpps) {
                    for (auto& p : tpps) {
                        rlogger.trace("CF {} truncated at {}", uuid, p);
                        auto& pp = map[p.shard_id()][uuid];
--- a/db/config.cc
+++ b/db/config.cc
@@ -102,6 +102,8 @@ db::config::config()
 db::config::~config()
 {}

+const sstring db::config::default_tls_priority("SECURE128:-VERS-TLS1.0");
+
 namespace utils {

 template<>
--- a/db/config.hh
+++ b/db/config.hh
@@ -743,6 +743,7 @@ public:
    val(cpu_scheduler, bool, true, Used, "Enable cpu scheduling") \
    val(view_building, bool, true, Used, "Enable view building; should only be set to false when the node is experience issues due to view building") \
    val(enable_sstables_mc_format, bool, false, Used, "Enable SSTables 'mc' format to be used as the default file format") \
+    val(abort_on_internal_error, bool, false, Used, "Abort the server instead of throwing exception when internal invariants are violated.") \
    /* done! */

 #define _make_value_member(name, type, deflt, status, desc, ...)    \
@@ -756,6 +757,8 @@ public:
    add_options(boost::program_options::options_description_easy_init&);

    const db::extensions& extensions() const;
+
+    static const sstring default_tls_priority;
 private:
    template<typename T>
    struct log_legacy_value : public named_value<T, value_status::Used> {
--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -82,6 +82,9 @@ void manager::register_metrics(const sstring& group_name) {

        sm::make_derive("discarded", _stats.discarded,
                        sm::description("Number of hints that were discarded during sending (too old, schema changed, etc.).")),
+
+        sm::make_derive("corrupted_files", _stats.corrupted_files,
+                        sm::description("Number of hints files that were discarded during sending because the file was corrupted.")),
    });
 }

@@ -114,8 +117,8 @@ future<> manager::stop() {

    return _draining_eps_gate.close().finally([this] {
        return parallel_for_each(_ep_managers, [] (auto& pair) {
-                return pair.second.stop();
-            }).finally([this] {
+            return pair.second.stop();
+        }).finally([this] {
            _ep_managers.clear();
            manager_logger.info("Stopped");
        }).discard_result();
@@ -236,6 +239,8 @@ future<> manager::end_point_hints_manager::stop(drain should_drain) noexcept {
 manager::end_point_hints_manager::end_point_hints_manager(const key_type& key, manager& shard_manager)
    : _key(key)
    , _shard_manager(shard_manager)
+    , _file_update_mutex_ptr(make_lw_shared<seastar::shared_mutex>())
+    , _file_update_mutex(*_file_update_mutex_ptr)
    , _state(state_set::of<state::stopped>())
    , _hints_dir(_shard_manager.hints_dir() / format("{}", _key).c_str())
    , _sender(*this, _shard_manager.local_storage_proxy(), _shard_manager.local_db(), _shard_manager.local_gossiper())
@@ -244,6 +249,8 @@ manager::end_point_hints_manager::end_point_hints_manager(const key_type& key, m
 manager::end_point_hints_manager::end_point_hints_manager(end_point_hints_manager&& other)
    : _key(other._key)
    , _shard_manager(other._shard_manager)
+    , _file_update_mutex_ptr(std::move(other._file_update_mutex_ptr))
+    , _file_update_mutex(*_file_update_mutex_ptr)
    , _state(other._state)
    , _hints_dir(std::move(other._hints_dir))
    , _sender(other._sender, *this)
@@ -513,28 +520,35 @@ void manager::drain_for(gms::inet_address endpoint) {
    manager_logger.trace("on_leave_cluster: {} is removed/decommissioned", endpoint);

    with_gate(_draining_eps_gate, [this, endpoint] {
-        return futurize_apply([this, endpoint] () {
-            if (utils::fb_utilities::is_me(endpoint)) {
-                return parallel_for_each(_ep_managers, [] (auto& pair) {
-                    return pair.second.stop(drain::yes).finally([&pair] {
-                        return remove_file(pair.second.hints_dir().c_str());
+        return with_semaphore(drain_lock(), 1, [this, endpoint] {
+            return futurize_apply([this, endpoint] () {
+                if (utils::fb_utilities::is_me(endpoint)) {
+                    return parallel_for_each(_ep_managers, [] (auto& pair) {
+                        return pair.second.stop(drain::yes).finally([&pair] {
+                            return with_file_update_mutex(pair.second, [&pair] {
+                                return remove_file(pair.second.hints_dir().c_str());
+                            });
+                        });
+                    }).finally([this] {
+                        _ep_managers.clear();
                    });
-                }).finally([this] {
-                    _ep_managers.clear();
-                });
-            } else {
-                ep_managers_map_type::iterator ep_manager_it = find_ep_manager(endpoint);
-                if (ep_manager_it != ep_managers_end()) {
-                    return ep_manager_it->second.stop(drain::yes).finally([this, endpoint, hints_dir = ep_manager_it->second.hints_dir()] {
-                        _ep_managers.erase(endpoint);
-                        return remove_file(hints_dir.c_str());
-                    });
-                }
+                } else {
+                    ep_managers_map_type::iterator ep_manager_it = find_ep_manager(endpoint);
+                    if (ep_manager_it != ep_managers_end()) {
+                        return ep_manager_it->second.stop(drain::yes).finally([this, endpoint, &ep_man = ep_manager_it->second] {
+                            return with_file_update_mutex(ep_man, [&ep_man] {
+                                return remove_file(ep_man.hints_dir().c_str());
+                            }).finally([this, endpoint] {
+                                _ep_managers.erase(endpoint);
+                            });
+                        });
+                    }

-                return make_ready_future<>();
-            }
-        }).handle_exception([endpoint] (auto eptr) {
-            manager_logger.error("Exception when draining {}: {}", endpoint, eptr);
+                    return make_ready_future<>();
+                }
+            }).handle_exception([endpoint] (auto eptr) {
+                manager_logger.error("Exception when draining {}: {}", endpoint, eptr);
+            });
        });
    });
 }
@@ -725,6 +739,10 @@ bool manager::end_point_hints_manager::sender::send_one_file(const sstring& fnam
        }, _last_not_complete_rp.pos, &_db.get_config().extensions()).get0();

        s->done().get();
+    } catch (db::commitlog::segment_error& ex) {
+        manager_logger.error("{}: {}. Dropping...", fname, ex.what());
+        ctx_ptr->state.remove(send_state::segment_replay_failed);
+        ++this->shard_stats().corrupted_files;
    } catch (...) {
        manager_logger.trace("sending of {} failed: {}", fname, std::current_exception());
        ctx_ptr->state.set(send_state::segment_replay_failed);
@@ -959,8 +977,6 @@ future<> manager::rebalance(sstring hints_directory) {
 }

 void manager::update_backlog(size_t backlog, size_t max_backlog) {
-    _backlog_size = backlog;
-    _max_backlog_size = max_backlog;
    if (backlog < max_backlog) {
        allow_hints();
    } else {
--- a/db/hints/manager.hh
+++ b/db/hints/manager.hh
@@ -60,6 +60,7 @@ private:
        uint64_t dropped = 0;
        uint64_t sent = 0;
        uint64_t discarded = 0;
+        uint64_t corrupted_files = 0;
    };

    // map: shard -> segments
@@ -274,7 +275,8 @@ public:
        manager& _shard_manager;
        hints_store_ptr _hints_store_anchor;
        seastar::gate _store_gate;
-        seastar::shared_mutex _file_update_mutex;
+        lw_shared_ptr<seastar::shared_mutex> _file_update_mutex_ptr;
+        seastar::shared_mutex& _file_update_mutex;

        enum class state {
            can_hint,               // hinting is currently allowed (used by the space_watchdog)
@@ -376,8 +378,20 @@ public:
            return _state.contains(state::stopped);
        }

-        seastar::shared_mutex& file_update_mutex() {
-            return _file_update_mutex;
+        /// \brief Safely runs a given functor under the file_update_mutex of \ref ep_man
+        ///
+        /// Runs a given functor under the file_update_mutex of the given end_point_hints_manager instance.
+        /// This function is safe even if \ref ep_man gets destroyed before the future this function returns resolves
+        /// (as long as the \ref func call itself is safe).
+        ///
+        /// \tparam Func Functor type.
+        /// \param ep_man end_point_hints_manager instance whose file_update_mutex we want to lock.
+        /// \param func Functor to run under the lock.
+        /// \return Whatever \ref func returns.
+        template <typename Func>
+        friend inline auto with_file_update_mutex(end_point_hints_manager& ep_man, Func&& func) {
+            lw_shared_ptr<seastar::shared_mutex> lock_ptr = ep_man._file_update_mutex_ptr;
+            return with_lock(*lock_ptr, std::forward<Func>(func)).finally([lock_ptr] {});
        }

        const boost::filesystem::path& hints_dir() const noexcept {
@@ -385,6 +399,10 @@ public:
        }

    private:
+        seastar::shared_mutex& file_update_mutex() noexcept { // private accessor; other classes lock via with_file_update_mutex()
+            return _file_update_mutex;
+        }
+
        /// \brief Creates a new hints store object.
        ///
        /// - Creates a hints store directory if doesn't exist: <shard_hints_dir>/<ep_key>
@@ -451,9 +469,7 @@ private:
    stats _stats;
    seastar::metrics::metric_groups _metrics;
    std::unordered_set<ep_key_type> _eps_with_pending_hints;
-
-    size_t _max_backlog_size = 1;
-    size_t _backlog_size = 0;
+    seastar::semaphore _drain_lock = {1};

 public:
    manager(sstring hints_directory, std::vector<sstring> hinted_dcs, int64_t max_hint_window_ms, resource_manager&res_manager, distributed<database>& db);
@@ -532,18 +548,14 @@ public:
        return _hints_dir_device_id;
    }

+    seastar::semaphore& drain_lock() noexcept { // single-unit semaphore that drain_for() holds while it drains
+        return _drain_lock;
+    }
+
    void allow_hints();
    void forbid_hints();
    void forbid_hints_for_eps_with_pending_hints();

-    size_t max_backlog_size() const {
-        return _max_backlog_size;
-    }
-
-    size_t backlog_size() const {
-        return _backlog_size;
-    }
-
    void allow_replaying() noexcept {
        _state.set(state::replay_allowed);
    }
--- a/db/hints/resource_manager.cc
+++ b/db/hints/resource_manager.cc
@@ -90,16 +90,27 @@ future<> space_watchdog::stop() noexcept {
    return std::move(_started);
 }

+// Called under the end_point_hints_manager::file_update_mutex() of the corresponding end_point_hints_manager instance.
 future<> space_watchdog::scan_one_ep_dir(boost::filesystem::path path, manager& shard_manager, ep_key_type ep_key) {
-    return lister::scan_dir(path, { directory_entry_type::regular }, [this, ep_key, &shard_manager] (lister::path dir, directory_entry de) {
-        // Put the current end point ID to state.eps_with_pending_hints when we see the second hints file in its directory
-        if (_files_count == 1) {
-            shard_manager.add_ep_with_pending_hints(ep_key);
-        }
-        ++_files_count;
+    return do_with(std::move(path), [this, ep_key, &shard_manager] (boost::filesystem::path& path) {
+        // It may happen that we get here and the directory has already been deleted in the context of manager::drain_for().
+        // In this case simply bail out.
+        return engine().file_exists(path.native()).then([this, ep_key, &shard_manager, &path] (bool exists) {
+            if (!exists) {
+                return make_ready_future<>();
+            } else {
+                return lister::scan_dir(path, { directory_entry_type::regular }, [this, ep_key, &shard_manager] (lister::path dir, directory_entry de) {
+                    // Put the current end point ID to state.eps_with_pending_hints when we see the second hints file in its directory
+                    if (_files_count == 1) {
+                        shard_manager.add_ep_with_pending_hints(ep_key);
+                    }
+                    ++_files_count;

-        return io_check(file_size, (dir / de.name.c_str()).c_str()).then([this] (uint64_t fsize) {
-            _total_size += fsize;
+                    return io_check(file_size, (dir / de.name.c_str()).c_str()).then([this] (uint64_t fsize) {
+                        _total_size += fsize;
+                    });
+                });
+            }
        });
    });
 }
@@ -137,7 +148,7 @@ void space_watchdog::on_timer() {
                // continue to enumeration - there is no one to change them.
                auto it = shard_manager.find_ep_manager(de.name);
                if (it != shard_manager.ep_managers_end()) {
-                    return with_lock(it->second.file_update_mutex(), [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)]() mutable {
+                    return with_file_update_mutex(it->second, [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)] () mutable {
                        return scan_one_ep_dir(dir / ep_name.c_str(), shard_manager, ep_key_type(ep_name));
                    });
                } else {
--- a/db/legacy_schema_migrator.cc
+++ b/db/legacy_schema_migrator.cc
@@ -598,7 +598,7 @@ public:

    future<> flush_schemas() {
        return _qp.proxy().get_db().invoke_on_all([this] (database& db) {
-            return parallel_for_each(db::schema_tables::ALL, [this, &db](const sstring& cf_name) {
+            return parallel_for_each(db::schema_tables::all_table_names(), [this, &db](const sstring& cf_name) {
                auto& cf = db.find_column_family(db::schema_tables::NAME, cf_name);
                return cf.flush();
            });
--- a/db/schema_tables.cc
+++ b/db/schema_tables.cc
@@ -143,10 +143,10 @@ struct qualified_name {
 static future<schema_mutations> read_table_mutations(distributed<service::storage_proxy>& proxy, const qualified_name& table, schema_ptr s);

 static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
-    std::map<qualified_name, schema_mutations>&& tables_before,
-    std::map<qualified_name, schema_mutations>&& tables_after,
-    std::map<qualified_name, schema_mutations>&& views_before,
-    std::map<qualified_name, schema_mutations>&& views_after);
+    std::map<utils::UUID, schema_mutations>&& tables_before,
+    std::map<utils::UUID, schema_mutations>&& tables_after,
+    std::map<utils::UUID, schema_mutations>&& views_before,
+    std::map<utils::UUID, schema_mutations>&& views_after);

 struct user_types_to_drop final {
    seastar::noncopyable_function<void()> drop;
@@ -194,8 +194,6 @@ static void prepare_builder_from_table_row(const schema_ctxt&, schema_builder&,

 using namespace v3;

-std::vector<const char*> ALL { KEYSPACES, TABLES, SCYLLA_TABLES, COLUMNS, DROPPED_COLUMNS, TRIGGERS, VIEWS, TYPES, FUNCTIONS, AGGREGATES, INDEXES };
-
 using days = std::chrono::duration<int, std::ratio<24 * 3600>>;

 future<> save_system_schema(const sstring & ksname) {
@@ -203,7 +201,7 @@ future<> save_system_schema(const sstring & ksname) {
    auto ksm = ks.metadata();

    // delete old, possibly obsolete entries in schema tables
-    return parallel_for_each(ALL, [ksm] (sstring cf) {
+    return parallel_for_each(all_table_names(), [ksm] (sstring cf) {
        auto deletion_timestamp = schema_creation_timestamp() - 1;
        return db::execute_cql(sprint("DELETE FROM %s.%s USING TIMESTAMP %s WHERE keyspace_name = ?", NAME, cf,
            deletion_timestamp), ksm->name()).discard_result();
@@ -598,7 +596,7 @@ future<utils::UUID> calculate_schema_digest(distributed<service::storage_proxy>&
        }
    };
    return do_with(md5_hasher(), [map, reduce] (auto& hash) {
-        return do_for_each(ALL.begin(), ALL.end(), [&hash, map, reduce] (auto& table) {
+        return do_for_each(all_table_names(), [&hash, map, reduce] (auto& table) {
            return map(table).then([&hash, reduce] (auto&& mutations) {
                reduce(hash, mutations);
            });
@@ -629,7 +627,7 @@ future<std::vector<frozen_mutation>> convert_schema_to_mutations(distributed<ser
        std::move(mutations.begin(), mutations.end(), std::back_inserter(result));
        return std::move(result);
    };
-    return map_reduce(ALL.begin(), ALL.end(), map, std::vector<frozen_mutation>{}, reduce);
+    return map_reduce(all_table_names(), map, std::vector<frozen_mutation>{}, reduce);
 }

 future<schema_result>
@@ -703,33 +701,7 @@ read_keyspace_mutation(distributed<service::storage_proxy>& proxy, const sstring
 static semaphore the_merge_lock {1};

 future<> merge_lock() {
-    // ref:  #1088
-    // to avoid deadlocks, we don't want long-standing calls to the shard 0
-    // as they can cause a deadlock:
-    //
-    //   fiber1                fiber2
-    //   merge_lock()                         (succeeds)
-    //                         merge_lock()   (waits)
-    //   invoke_on_all()                      (waits on merge_lock to relinquish smp::submit_to slot)
-    //
-    // so we issue the lock calls with a timeout; the slot will be relinquished, and invoke_on_all()
-    // can complete
-    return repeat([] () mutable {
-        return smp::submit_to(0, [] {
-            return the_merge_lock.try_wait();
-        }).then([] (bool result) {
-            if (result) {
-                return make_ready_future<stop_iteration>(stop_iteration::yes);
-            } else {
-                static thread_local auto rand_engine = std::default_random_engine();
-                auto dist = std::uniform_int_distribution<int>(0, 100);
-                auto to = std::chrono::microseconds(dist(rand_engine));
-                return sleep(to).then([] {
-                    return make_ready_future<stop_iteration>(stop_iteration::no);
-                });
-            }
-        });
-    });
+    return smp::submit_to(0, [] { return the_merge_lock.wait(); }); // runs the_merge_lock.wait() on shard 0 and returns its future
 }

 future<> merge_unlock() {
@@ -777,16 +749,24 @@ static read_table_names_of_keyspace(distributed<service::storage_proxy>& proxy,
    });
 }

+static utils::UUID table_id_from_mutations(const schema_mutations& sm) { // Reads the "id" column from the first row of the table's columnfamilies mutation.
+    auto table_rs = query::result_set(sm.columnfamilies_mutation());
+    query::result_set_row table_row = table_rs.row(0); // NOTE(review): assumes the result set has at least one row — confirm
+    return table_row.get_nonnull<utils::UUID>("id");
+}
+
 // Call inside a seastar thread
 static
-std::map<qualified_name, schema_mutations>
+std::map<utils::UUID, schema_mutations>
 read_tables_for_keyspaces(distributed<service::storage_proxy>& proxy, const std::set<sstring>& keyspace_names, schema_ptr s)
 {
-    std::map<qualified_name, schema_mutations> result;
+    std::map<utils::UUID, schema_mutations> result;
    for (auto&& keyspace_name : keyspace_names) {
        for (auto&& table_name : read_table_names_of_keyspace(proxy, keyspace_name, s).get0()) {
            auto qn = qualified_name(keyspace_name, table_name);
-            result.emplace(qn, read_table_mutations(proxy, qn, s).get0());
+            auto muts = read_table_mutations(proxy, qn, s).get0();
+            auto id = table_id_from_mutations(muts);
+            result.emplace(std::move(id), std::move(muts));
        }
    }
    return result;
@@ -956,14 +936,14 @@ struct schema_diff {

 template<typename CreateSchema>
 static schema_diff diff_table_or_view(distributed<service::storage_proxy>& proxy,
-    std::map<qualified_name, schema_mutations>&& before,
-    std::map<qualified_name, schema_mutations>&& after,
+    std::map<utils::UUID, schema_mutations>&& before,
+    std::map<utils::UUID, schema_mutations>&& after,
    CreateSchema&& create_schema)
 {
    schema_diff d;
    auto diff = difference(before, after);
    for (auto&& key : diff.entries_only_on_left) {
-        auto&& s = proxy.local().get_db().local().find_schema(key.keyspace_name, key.table_name);
+        auto&& s = proxy.local().get_db().local().find_schema(key);
        slogger.info("Dropping {}.{} id={} version={}", s->ks_name(), s->cf_name(), s->id(), s->version());
        d.dropped.emplace_back(schema_diff::dropped_schema{s});
    }
@@ -986,10 +966,10 @@ static schema_diff diff_table_or_view(distributed<service::storage_proxy>& proxy
 // upon an alter table or alter type statement), then they are published together
 // as well, without any deferring in-between.
 static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
-    std::map<qualified_name, schema_mutations>&& tables_before,
-    std::map<qualified_name, schema_mutations>&& tables_after,
-    std::map<qualified_name, schema_mutations>&& views_before,
-    std::map<qualified_name, schema_mutations>&& views_after)
+    std::map<utils::UUID, schema_mutations>&& tables_before,
+    std::map<utils::UUID, schema_mutations>&& tables_after,
+    std::map<utils::UUID, schema_mutations>&& views_before,
+    std::map<utils::UUID, schema_mutations>&& views_after)
 {
    auto tables_diff = diff_table_or_view(proxy, std::move(tables_before), std::move(tables_after), [&] (auto&& sm) {
        return create_table_from_mutations(proxy, std::move(sm));
@@ -1000,6 +980,10 @@ static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,

    proxy.local().get_db().invoke_on_all([&] (database& db) {
        return seastar::async([&] {
+            parallel_for_each(boost::range::join(tables_diff.dropped, views_diff.dropped), [&] (schema_diff::dropped_schema& dt) {
+                auto& s = *dt.schema.get();
+                return db.drop_column_family(s.ks_name(), s.cf_name(), [&] { return dt.jp.value(); });
+            }).get();
            parallel_for_each(boost::range::join(tables_diff.created, views_diff.created), [&] (global_schema_ptr& gs) {
                return db.add_column_family_and_make_directory(gs);
            }).get();
@@ -1011,10 +995,6 @@ static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
            for (auto&& gs : boost::range::join(tables_diff.altered, views_diff.altered)) {
                columns_changed.push_back(db.update_column_family(gs));
            }
-            parallel_for_each(boost::range::join(tables_diff.dropped, views_diff.dropped), [&] (schema_diff::dropped_schema& dt) {
-                auto& s = *dt.schema.get();
-                return db.drop_column_family(s.ks_name(), s.cf_name(), [&] { return dt.jp.value(); });
-            }).get();

            auto& mm = service::get_local_migration_manager();
            auto it = columns_changed.begin();
@@ -2681,12 +2661,22 @@ data_type parse_type(sstring str)
 }

 std::vector<schema_ptr> all_tables() {
+    // Don't forget to update this list when new schema tables are added.
+    // The listed schema tables are the ones synchronized between nodes,
+    // and forgetting one of them in this list can cause bugs like #4339.
    return {
        keyspaces(), tables(), scylla_tables(), columns(), dropped_columns(), triggers(),
        views(), indexes(), types(), functions(), aggregates(), view_virtual_columns()
    };
 }

+const std::vector<sstring>& all_table_names() { // Returns the cf_name() of every schema listed by all_tables().
+    static thread_local std::vector<sstring> all = // initialized once per thread on first call, then reused
+            boost::copy_range<std::vector<sstring>>(all_tables() |
+            boost::adaptors::transformed([] (auto schema) { return schema->cf_name(); }));
+    return all;
+}
+
 namespace legacy {

 table_schema_version schema_mutations::digest() const {
--- a/db/schema_tables.hh
+++ b/db/schema_tables.hh
@@ -127,9 +127,8 @@ using namespace v3;
 // Replication of schema between nodes with different version is inhibited.
 extern const sstring version;

-extern std::vector<const char*> ALL;
-
 std::vector<schema_ptr> all_tables();
+const std::vector<sstring>& all_table_names();

 // saves/creates "ks" + all tables etc, while first deleting all old schema entries (will be rewritten)
 future<> save_system_schema(const sstring & ks);
--- a/db/size_estimates_virtual_reader.cc
+++ b/db/size_estimates_virtual_reader.cc
@@ -0,0 +1,329 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <boost/range/adaptor/indirected.hpp>
+#include <boost/range/adaptor/map.hpp>
+#include <boost/range/adaptor/transformed.hpp>
+#include <boost/range/algorithm/find_if.hpp>
+
+#include "clustering_bounds_comparator.hh"
+#include "database.hh"
+#include "db/system_keyspace.hh"
+#include "dht/i_partitioner.hh"
+#include "partition_range_compat.hh"
+#include "range.hh"
+#include "service/storage_service.hh"
+#include "stdx.hh"
+#include "mutation_fragment.hh"
+#include "sstables/sstables.hh"
+#include "db/timeout_clock.hh"
+#include "database.hh"
+
+#include "db/size_estimates_virtual_reader.hh"
+
+namespace db {
+
+namespace size_estimates {
+
+struct virtual_row { // One (table name, token range) pair; both members are references, not copies.
+    const bytes& cf_name;
+    const token_range& tokens;
+    clustering_key_prefix as_key() const { // key exploded as (cf_name, range start, range end)
+        return clustering_key_prefix::from_exploded(std::vector<bytes_view>{cf_name, tokens.start, tokens.end});
+    }
+};
+
+struct virtual_row_comparator { // Compares clustering key prefixes and virtual rows using the schema's prefix_equality_less_compare.
+    schema_ptr _schema;
+    virtual_row_comparator(schema_ptr schema) : _schema(schema) { }
+    bool operator()(const clustering_key_prefix& key1, const clustering_key_prefix& key2) {
+        return clustering_key_prefix::prefix_equality_less_compare(*_schema)(key1, key2);
+    }
+    bool operator()(const virtual_row& row, const clustering_key_prefix& key) { // the row is converted with as_key() first
+        return operator()(row.as_key(), key);
+    }
+    bool operator()(const clustering_key_prefix& key, const virtual_row& row) { // same conversion, key on the left-hand side
+        return operator()(key, row.as_key());
+    }
+};
+
+// Iterating over the cartesian product of cf_names and token_ranges.
+class virtual_row_iterator : public std::iterator<std::input_iterator_tag, const virtual_row> {
+    std::reference_wrapper<const std::vector<bytes>> _cf_names;
+    std::reference_wrapper<const std::vector<token_range>> _ranges;
+    size_t _cf_names_idx = 0;
+    size_t _ranges_idx = 0;
+public:
+    struct end_iterator_tag {};
+    virtual_row_iterator(const std::vector<bytes>& cf_names, const std::vector<token_range>& ranges) // begin iterator: both indices stay 0
+            : _cf_names(std::ref(cf_names))
+            , _ranges(std::ref(ranges))
+    { }
+    virtual_row_iterator(const std::vector<bytes>& cf_names, const std::vector<token_range>& ranges, end_iterator_tag) // end iterator: both indices equal the sizes
+            : _cf_names(std::ref(cf_names))
+            , _ranges(std::ref(ranges))
+            , _cf_names_idx(cf_names.size())
+            , _ranges_idx(ranges.size())
+    {
+        if (cf_names.empty() || ranges.empty()) {
+            // The product of an empty range with any range is an empty range.
+            // In this case we want the end iterator to be equal to the begin iterator,
+            // which has _ranges_idx = _cf_names_idx = 0.
+            _ranges_idx = _cf_names_idx = 0;
+        }
+    }
+    virtual_row_iterator& operator++() {
+        if (++_ranges_idx == _ranges.get().size() && ++_cf_names_idx < _cf_names.get().size()) { // ranges vary fastest; wrap to the next cf name
+            _ranges_idx = 0; // not reset after the last cf name, so both indices then match the end iterator
+        }
+        return *this;
+    }
+    virtual_row_iterator operator++(int) {
+        virtual_row_iterator i(*this);
+        ++(*this);
+        return i;
+    }
+    const value_type operator*() const { // returns a virtual_row of references into the two vectors
+        return { _cf_names.get()[_cf_names_idx], _ranges.get()[_ranges_idx] };
+    }
+    bool operator==(const virtual_row_iterator& i) const { // only the indices are compared, not the referenced vectors
+        return _cf_names_idx == i._cf_names_idx
+            && _ranges_idx == i._ranges_idx;
+    }
+    bool operator!=(const virtual_row_iterator& i) const {
+        return !(*this == i);
+    }
+};
+
+/**
+ * Returns the non-system keyspaces selected by the partition_range whose token maps to this shard, sorted by the ring position of the keyspace name.
+ */
+static std::vector<sstring> get_keyspaces(const schema& s, const database& db, dht::partition_range range) {
+    struct keyspace_less_comparator {
+        const schema& _s;
+        keyspace_less_comparator(const schema& s) : _s(s) { }
+        dht::ring_position as_ring_position(const sstring& ks) { // keyspace name -> partition key -> decorate_key()
+            auto pkey = partition_key::from_single_value(_s, utf8_type->decompose(ks));
+            return dht::global_partitioner().decorate_key(_s, std::move(pkey));
+        }
+        bool operator()(const sstring& ks1, const sstring& ks2) {
+            return as_ring_position(ks1).less_compare(_s, as_ring_position(ks2));
+        }
+        bool operator()(const sstring& ks, const dht::ring_position& rp) {
+            return as_ring_position(ks).less_compare(_s, rp);
+        }
+        bool operator()(const dht::ring_position& rp, const sstring& ks) {
+            return rp.less_compare(_s, as_ring_position(ks));
+        }
+    };
+    auto keyspaces = db.get_non_system_keyspaces();
+    auto cmp = keyspace_less_comparator(s);
+    boost::sort(keyspaces, cmp); // ordered by the comparator above, i.e. by ring position
+    return boost::copy_range<std::vector<sstring>>(
+        range.slice(keyspaces, std::move(cmp)) | boost::adaptors::filtered([&s] (const auto& ks) {
+            // If this is a range query, results are divided between shards by the partition key (keyspace_name).
+            return shard_of(dht::global_partitioner().get_token(s,
+                        partition_key::from_single_value(s, utf8_type->decompose(ks))))
+                == engine().cpu_id();
+        })
+    );
+}
+
+/**
+ * Converts a range of tokens into a dht::partition_range with ring_position bounds, used to select sstables.
+ */
+static dht::partition_range as_ring_position_range(dht::token_range& r) {
+    stdx::optional<range<dht::ring_position>::bound> start_bound, end_bound; // left empty when r has no bound on that side
+    if (r.start()) {
+        start_bound = {{ dht::ring_position(r.start()->value(), dht::ring_position::token_bound::start), r.start()->is_inclusive() }};
+    }
+    if (r.end()) {
+        end_bound = {{ dht::ring_position(r.end()->value(), dht::ring_position::token_bound::end), r.end()->is_inclusive() }};
+    }
+    return dht::partition_range(std::move(start_bound), std::move(end_bound), r.is_singular());
+}
+
+/**
+ * Builds the range_estimates for the specified range, considering the sstables associated with `cf`.
+ */
+static system_keyspace::range_estimates estimate(const column_family& cf, const token_range& r) {
+    int64_t count{0};
+    utils::estimated_histogram hist{0};
+    auto from_bytes = [] (auto& b) { // utf8 bytes -> string -> dht::token
+        return dht::global_partitioner().from_sstring(utf8_type->to_string(b));
+    };
+    dht::token_range_vector ranges;
+    ::compat::unwrap_into( // every range produced by unwrapping is appended to `ranges`
+        wrapping_range<dht::token>({{ from_bytes(r.start), false }}, {{ from_bytes(r.end) }}),
+        dht::token_comparator(),
+        [&] (auto&& rng) { ranges.push_back(std::move(rng)); });
+    for (auto&& r : ranges) { // note: this `r` shadows the function parameter inside the loop
+        auto rp_range = as_ring_position_range(r);
+        for (auto&& sstable : cf.select_sstables(rp_range)) {
+            count += sstable->estimated_keys_for_range(r);
+            hist.merge(sstable->get_stats_metadata().estimated_row_size);
+        }
+    }
+    return {cf.schema(), r.start, r.end, count, count > 0 ? hist.mean() : 0}; // here `r` is the parameter again; mean is 0 when no keys were counted
+}
+
+future<std::vector<token_range>> get_local_ranges() { // Returns the local node's primary token ranges with utf8-encoded bounds, sorted by start.
+    auto& ss = service::get_local_storage_service();
+    return ss.get_local_tokens().then([&ss] (auto&& tokens) {
+        auto ranges = ss.get_token_metadata().get_primary_ranges_for(std::move(tokens));
+        std::vector<token_range> local_ranges;
+        auto to_bytes = [](const stdx::optional<dht::token_range::bound>& b) { // bound must be present; its token is serialized as utf8 text
+            assert(b);
+            return utf8_type->decompose(dht::global_partitioner().to_sstring(b->value()));
+        };
+        // We merge the left-unbounded and right-unbounded ranges into a single entry to be compatible with how
+        // Cassandra shows its size estimates table. All queries will be on that table, where all entries are
+        // text and there's no notion of token ranges from the CQL point of view.
+        auto left_inf = boost::find_if(ranges, [] (auto&& r) {
+            return !r.start() || r.start()->value() == dht::minimum_token();
+        });
+        auto right_inf = boost::find_if(ranges, [] (auto&& r) {
+            return !r.end() || r.end()->value() == dht::maximum_token(); // test the end bound; start() may be empty for this range
+        });
+        if (left_inf != right_inf && left_inf != ranges.end() && right_inf != ranges.end()) {
+            local_ranges.push_back(token_range{to_bytes(right_inf->start()), to_bytes(left_inf->end())});
+            ranges.erase(left_inf); // NOTE(review): if right_inf is positioned after left_inf this erase may invalidate it — confirm ordering
+            ranges.erase(right_inf);
+        }
+        for (auto&& r : ranges) {
+            local_ranges.push_back(token_range{to_bytes(r.start()), to_bytes(r.end())});
+        }
+        boost::sort(local_ranges, [] (auto&& tr1, auto&& tr2) {
+            return utf8_type->less(tr1.start, tr2.start);
+        });
+        return local_ranges;
+    });
+}
+
+size_estimates_mutation_reader::size_estimates_mutation_reader(schema_ptr schema, const dht::partition_range& prange, const query::partition_slice& slice, streamed_mutation::forwarding fwd)
+            : impl(schema)
+            , _schema(std::move(schema))
+            , _prange(&prange) // keeps a pointer to the caller's range, not a copy
+            , _slice(slice)
+            , _fwd(fwd)
+    { }
+
+future<> size_estimates_mutation_reader::get_next_partition() { // Sets up _partition_reader for the next keyspace, or marks end of stream.
+    auto& db = service::get_local_storage_proxy().get_db().local();
+    if (!_keyspaces) { // the keyspace list is computed lazily from the current partition range
+        _keyspaces = get_keyspaces(*_schema, db, *_prange);
+        _current_partition = _keyspaces->begin();
+    }
+    if (_current_partition == _keyspaces->end()) { // every selected keyspace has been handled
+        _end_of_stream = true;
+        return make_ready_future<>();
+    }
+    return get_local_ranges().then([&db, this] (auto&& ranges) {
+        auto estimates = this->estimates_for_current_keyspace(db, std::move(ranges));
+        auto mutations = db::system_keyspace::make_size_estimates_mutation(*_current_partition, std::move(estimates));
+        ++_current_partition; // advance only after the current keyspace's mutation has been built
+        std::vector<mutation> ms;
+        ms.emplace_back(std::move(mutations));
+        _partition_reader = flat_mutation_reader_from_mutations(std::move(ms), _fwd);
+    });
+}
+
+future<> size_estimates_mutation_reader::fill_buffer(db::timeout_clock::time_point timeout) { // Repeats until the buffer is full or the stream has ended.
+    return do_until([this, timeout] { return is_end_of_stream() || is_buffer_full(); }, [this, timeout] {
+        if (!_partition_reader) { // no active partition reader: prepare the next keyspace (or reach end of stream)
+            return get_next_partition();
+        }
+        return _partition_reader->consume_pausable([this] (mutation_fragment mf) {
+            push_mutation_fragment(std::move(mf));
+            return stop_iteration(is_buffer_full()); // stop consuming as soon as our buffer is full
+        }, timeout).then([this] {
+            if (_partition_reader->is_end_of_stream() && _partition_reader->is_buffer_empty()) { // reader fully drained
+                _partition_reader = stdx::nullopt;
+            }
+        });
+    });
+}
+
+void size_estimates_mutation_reader::next_partition() { // Calls clear_buffer_to_next_partition() and drops the partition reader if nothing is left buffered.
+    clear_buffer_to_next_partition();
+    if (is_buffer_empty()) { // buffer is empty after the clear, so the current partition reader is discarded too
+        _partition_reader = stdx::nullopt;
+    }
+}
+
+future<> size_estimates_mutation_reader::fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) { // Resets the reader state for a new partition range.
+    clear_buffer();
+    _prange = &pr; // stores a pointer to the caller's range, not a copy
+    _keyspaces = stdx::nullopt; // makes get_next_partition() recompute the keyspace list for the new range
+    _partition_reader = stdx::nullopt;
+    _end_of_stream = false;
+    return make_ready_future<>(); // `timeout` is not used in this overload
+}
+
+future<> size_estimates_mutation_reader::fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) { // Forwards the buffer and the active partition reader (if any) to the given position range.
+    forward_buffer_to(pr.start());
+    _end_of_stream = false;
+    if (_partition_reader) { // delegate to the per-keyspace reader when one is active
+        return _partition_reader->fast_forward_to(std::move(pr), timeout);
+    }
+    return make_ready_future<>();
+}
+
+size_t size_estimates_mutation_reader::buffer_size() const { // Own buffer size plus that of the active partition reader, if any.
+    if (_partition_reader) {
+        return flat_mutation_reader::impl::buffer_size() + _partition_reader->buffer_size();
+    }
+    return flat_mutation_reader::impl::buffer_size();
+}
+
+std::vector<db::system_keyspace::range_estimates>
+size_estimates_mutation_reader::estimates_for_current_keyspace(const database& db, std::vector<token_range> local_ranges) const {
+    // For each specified range, estimate (crudely) mean partition size and partitions count.
+    auto pkey = partition_key::from_single_value(*_schema, utf8_type->decompose(*_current_partition));
+    auto cfs = db.find_keyspace(*_current_partition).metadata()->cf_meta_data();
+    auto cf_names = boost::copy_range<std::vector<bytes>>(cfs | boost::adaptors::transformed([] (auto&& cf) { // keys of cf_meta_data() as utf8 bytes
+        return utf8_type->decompose(cf.first);
+    }));
+    boost::sort(cf_names, [] (auto&& n1, auto&& n2) {
+        return utf8_type->less(n1, n2);
+    });
+    std::vector<db::system_keyspace::range_estimates> estimates;
+    for (auto& range : _slice.row_ranges(*_schema, pkey)) {
+        auto rows = boost::make_iterator_range( // every (cf name, local range) combination
+                virtual_row_iterator(cf_names, local_ranges),
+                virtual_row_iterator(cf_names, local_ranges, virtual_row_iterator::end_iterator_tag()));
+        auto rows_to_estimate = range.slice(rows, virtual_row_comparator(_schema)); // restrict the rows to this clustering range
+        for (auto&& r : rows_to_estimate) {
+            auto& cf = db.find_column_family(*_current_partition, utf8_type->to_string(r.cf_name));
+            estimates.push_back(estimate(cf, r.tokens));
+            if (estimates.size() >= _slice.partition_row_limit()) { // stop early once the slice's row limit is reached
+                return estimates;
+            }
+        }
+    }
+    return estimates;
+}
+
+} // namespace size_estimates
+
+} // namespace db
--- a/db/size_estimates_virtual_reader.hh
+++ b/db/size_estimates_virtual_reader.hh
@@ -21,33 +21,19 @@
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

-#include <boost/range/adaptor/indirected.hpp>
-#include <boost/range/adaptor/map.hpp>
-#include <boost/range/adaptor/transformed.hpp>
-#include <boost/range/algorithm/find_if.hpp>
-
-#include "clustering_bounds_comparator.hh"
-#include "database.hh"
 #include "db/system_keyspace.hh"
-#include "dht/i_partitioner.hh"
 #include "mutation_reader.hh"
-#include "partition_range_compat.hh"
-#include "range.hh"
-#include "service/storage_service.hh"
-#include "stdx.hh"
-#include "mutation_fragment.hh"
-#include "sstables/sstables.hh"
-#include "db/timeout_clock.hh"

 namespace db {

 namespace size_estimates {

+struct token_range { // A token range whose two bounds are stored as serialized bytes.
+    bytes start;
+    bytes end;
+};
+
 class size_estimates_mutation_reader final : public flat_mutation_reader::impl {
-    struct token_range {
-        bytes start;
-        bytes end;
-    };
    schema_ptr _schema;
    const dht::partition_range* _prange;
    const query::partition_slice& _slice;
@@ -57,267 +43,18 @@ class size_estimates_mutation_reader final : public flat_mutation_reader::impl {
    streamed_mutation::forwarding _fwd;
    flat_mutation_reader_opt _partition_reader;
 public:
-    size_estimates_mutation_reader(schema_ptr schema, const dht::partition_range& prange, const query::partition_slice& slice, streamed_mutation::forwarding fwd)
-            : impl(schema)
-            , _schema(std::move(schema))
-            , _prange(&prange)
-            , _slice(slice)
-            , _fwd(fwd)
-    { }
+    size_estimates_mutation_reader(schema_ptr, const dht::partition_range&, const query::partition_slice&, streamed_mutation::forwarding);

+    virtual future<> fill_buffer(db::timeout_clock::time_point) override;
+    virtual void next_partition() override;
+    virtual future<> fast_forward_to(const dht::partition_range&, db::timeout_clock::time_point) override;
+    virtual future<> fast_forward_to(position_range, db::timeout_clock::time_point) override;
+    virtual size_t buffer_size() const override;
 private:
-    future<> get_next_partition() {
-        // For each specified range, estimate (crudely) mean partition size and partitions count.
-        auto& db = service::get_local_storage_proxy().get_db().local();
-        if (!_keyspaces) {
-            _keyspaces = get_keyspaces(*_schema, db, *_prange);
-            _current_partition = _keyspaces->begin();
-        }
-        if (_current_partition == _keyspaces->end()) {
-            _end_of_stream = true;
-            return make_ready_future<>();
-        }
-        return get_local_ranges().then([&db, this] (auto&& ranges) {
-            auto estimates = this->estimates_for_current_keyspace(db, std::move(ranges));
-            auto mutations = db::system_keyspace::make_size_estimates_mutation(*_current_partition, std::move(estimates));
-            ++_current_partition;
-            std::vector<mutation> ms;
-            ms.emplace_back(std::move(mutations));
-            _partition_reader = flat_mutation_reader_from_mutations(std::move(ms), _fwd);
-        });
-    }
-public:
-    virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override {
-        return do_until([this, timeout] { return is_end_of_stream() || is_buffer_full(); }, [this, timeout] {
-            if (!_partition_reader) {
-                return get_next_partition();
-            }
-            return _partition_reader->consume_pausable([this] (mutation_fragment mf) {
-                push_mutation_fragment(std::move(mf));
-                return stop_iteration(is_buffer_full());
-            }, timeout).then([this] {
-                if (_partition_reader->is_end_of_stream() && _partition_reader->is_buffer_empty()) {
-                    _partition_reader = stdx::nullopt;
-                }
-            });
-        });
-    }
-    virtual void next_partition() override {
-        clear_buffer_to_next_partition();
-        if (is_buffer_empty()) {
-            _partition_reader = stdx::nullopt;
-        }
-    }
-    virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override {
-        clear_buffer();
-        _prange = &pr;
-        _keyspaces = stdx::nullopt;
-        _partition_reader = stdx::nullopt;
-        _end_of_stream = false;
-        return make_ready_future<>();
-    }
-    virtual future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) override {
-        forward_buffer_to(pr.start());
-        _end_of_stream = false;
-        if (_partition_reader) {
-            return _partition_reader->fast_forward_to(std::move(pr), timeout);
-        }
-        return make_ready_future<>();
-    }
-    virtual size_t buffer_size() const override {
-        if (_partition_reader) {
-            return flat_mutation_reader::impl::buffer_size() + _partition_reader->buffer_size();
-        }
-        return flat_mutation_reader::impl::buffer_size();
-    }
-    /**
-     * Returns the primary ranges for the local node.
-     * Used for testing as well.
-     */
-    static future<std::vector<token_range>> get_local_ranges() {
-        auto& ss = service::get_local_storage_service();
-        return ss.get_local_tokens().then([&ss] (auto&& tokens) {
-            auto ranges = ss.get_token_metadata().get_primary_ranges_for(std::move(tokens));
-            std::vector<token_range> local_ranges;
-            auto to_bytes = [](const stdx::optional<dht::token_range::bound>& b) {
-                assert(b);
-                return utf8_type->decompose(dht::global_partitioner().to_sstring(b->value()));
-            };
-            // We merge the ranges to be compatible with how Cassandra shows it's size estimates table.
-            // All queries will be on that table, where all entries are text and there's no notion of
-            // token ranges form the CQL point of view.
-            auto left_inf = boost::find_if(ranges, [] (auto&& r) {
-                return !r.start() || r.start()->value() == dht::minimum_token();
-            });
-            auto right_inf = boost::find_if(ranges, [] (auto&& r) {
-                return !r.end() || r.start()->value() == dht::maximum_token();
-            });
-            if (left_inf != right_inf && left_inf != ranges.end() && right_inf != ranges.end()) {
-                local_ranges.push_back(token_range{to_bytes(right_inf->start()), to_bytes(left_inf->end())});
-                ranges.erase(left_inf);
-                ranges.erase(right_inf);
-            }
-            for (auto&& r : ranges) {
-                local_ranges.push_back(token_range{to_bytes(r.start()), to_bytes(r.end())});
-            }
-            boost::sort(local_ranges, [] (auto&& tr1, auto&& tr2) {
-                return utf8_type->less(tr1.start, tr2.start);
-            });
-            return local_ranges;
-        });
-    }
-private:
-    struct virtual_row {
-        const bytes& cf_name;
-        const token_range& tokens;
-        clustering_key_prefix as_key() const {
-            return clustering_key_prefix::from_exploded(std::vector<bytes_view>{cf_name, tokens.start, tokens.end});
-        }
-    };
-    struct virtual_row_comparator {
-        schema_ptr _schema;
-        virtual_row_comparator(schema_ptr schema) : _schema(schema) { }
-        bool operator()(const clustering_key_prefix& key1, const clustering_key_prefix& key2) {
-            return clustering_key_prefix::prefix_equality_less_compare(*_schema)(key1, key2);
-        }
-        bool operator()(const virtual_row& row, const clustering_key_prefix& key) {
-            return operator()(row.as_key(), key);
-        }
-        bool operator()(const clustering_key_prefix& key, const virtual_row& row) {
-            return operator()(key, row.as_key());
-        }
-    };
-    class virtual_row_iterator : public std::iterator<std::input_iterator_tag, const virtual_row> {
-        std::reference_wrapper<const std::vector<bytes>> _cf_names;
-        std::reference_wrapper<const std::vector<token_range>> _ranges;
-        size_t _cf_names_idx = 0;
-        size_t _ranges_idx = 0;
-    public:
-        struct end_iterator_tag {};
-        virtual_row_iterator(const std::vector<bytes>& cf_names, const std::vector<token_range>& ranges)
-                : _cf_names(std::ref(cf_names))
-                , _ranges(std::ref(ranges))
-        { }
-        virtual_row_iterator(const std::vector<bytes>& cf_names, const std::vector<token_range>& ranges, end_iterator_tag)
-                : _cf_names(std::ref(cf_names))
-                , _ranges(std::ref(ranges))
-                , _cf_names_idx(cf_names.size())
-                , _ranges_idx(ranges.size())
-        { }
-        virtual_row_iterator& operator++() {
-            if (++_ranges_idx == _ranges.get().size() && ++_cf_names_idx < _cf_names.get().size()) {
-                _ranges_idx = 0;
-            }
-            return *this;
-        }
-        virtual_row_iterator operator++(int) {
-            virtual_row_iterator i(*this);
-            ++(*this);
-            return i;
-        }
-        const value_type operator*() const {
-            return { _cf_names.get()[_cf_names_idx], _ranges.get()[_ranges_idx] };
-        }
-        bool operator==(const virtual_row_iterator& i) const {
-            return _cf_names_idx == i._cf_names_idx
-                && _ranges_idx == i._ranges_idx;
-        }
-        bool operator!=(const virtual_row_iterator& i) const {
-            return !(*this == i);
-        }
-    };
+    future<> get_next_partition();

    std::vector<db::system_keyspace::range_estimates>
-    estimates_for_current_keyspace(const database& db, std::vector<token_range> local_ranges) const {
-        auto pkey = partition_key::from_single_value(*_schema, utf8_type->decompose(*_current_partition));
-        auto cfs = db.find_keyspace(*_current_partition).metadata()->cf_meta_data();
-        auto cf_names = boost::copy_range<std::vector<bytes>>(cfs | boost::adaptors::transformed([] (auto&& cf) {
-            return utf8_type->decompose(cf.first);
-        }));
-        boost::sort(cf_names, [] (auto&& n1, auto&& n2) {
-            return utf8_type->less(n1, n2);
-        });
-        std::vector<db::system_keyspace::range_estimates> estimates;
-        for (auto& range : _slice.row_ranges(*_schema, pkey)) {
-            auto rows = boost::make_iterator_range(
-                    virtual_row_iterator(cf_names, local_ranges),
-                    virtual_row_iterator(cf_names, local_ranges, virtual_row_iterator::end_iterator_tag()));
-            auto rows_to_estimate = range.slice(rows, virtual_row_comparator(_schema));
-            for (auto&& r : rows_to_estimate) {
-                auto& cf = db.find_column_family(*_current_partition, utf8_type->to_string(r.cf_name));
-                estimates.push_back(estimate(cf, r.tokens));
-                if (estimates.size() >= _slice.partition_row_limit()) {
-                    return estimates;
-                }
-            }
-        }
-        return estimates;
-    }
-
-    /**
-     * Returns the keyspaces, ordered by name, as selected by the partition_range.
-     */
-    static ks_range get_keyspaces(const schema& s, const database& db, dht::partition_range range) {
-        struct keyspace_less_comparator {
-            const schema& _s;
-            keyspace_less_comparator(const schema& s) : _s(s) { }
-            dht::ring_position as_ring_position(const sstring& ks) {
-                auto pkey = partition_key::from_single_value(_s, utf8_type->decompose(ks));
-                return dht::global_partitioner().decorate_key(_s, std::move(pkey));
-            }
-            bool operator()(const sstring& ks1, const sstring& ks2) {
-                return as_ring_position(ks1).less_compare(_s, as_ring_position(ks2));
-            }
-            bool operator()(const sstring& ks, const dht::ring_position& rp) {
-                return as_ring_position(ks).less_compare(_s, rp);
-            }
-            bool operator()(const dht::ring_position& rp, const sstring& ks) {
-                return rp.less_compare(_s, as_ring_position(ks));
-            }
-        };
-        auto keyspaces = db.get_non_system_keyspaces();
-        auto cmp = keyspace_less_comparator(s);
-        boost::sort(keyspaces, cmp);
-        return boost::copy_range<ks_range>(range.slice(keyspaces, std::move(cmp)));
-    }
-
-    /**
-     * Makes a wrapping range of ring_position from a nonwrapping range of token, used to select sstables.
-     */
-    static dht::partition_range as_ring_position_range(dht::token_range& r) {
-        stdx::optional<range<dht::ring_position>::bound> start_bound, end_bound;
-        if (r.start()) {
-            start_bound = {{ dht::ring_position(r.start()->value(), dht::ring_position::token_bound::start), r.start()->is_inclusive() }};
-        }
-        if (r.end()) {
-            end_bound = {{ dht::ring_position(r.end()->value(), dht::ring_position::token_bound::end), r.end()->is_inclusive() }};
-        }
-        return dht::partition_range(std::move(start_bound), std::move(end_bound), r.is_singular());
-    }
-
-    /**
-     * Add a new range_estimates for the specified range, considering the sstables associated with `cf`.
-     */
-    static system_keyspace::range_estimates estimate(const column_family& cf, const token_range& r) {
-        int64_t count{0};
-        utils::estimated_histogram hist{0};
-        auto from_bytes = [] (auto& b) {
-            return dht::global_partitioner().from_sstring(utf8_type->to_string(b));
-        };
-        dht::token_range_vector ranges;
-        ::compat::unwrap_into(
-            wrapping_range<dht::token>({{ from_bytes(r.start), false }}, {{ from_bytes(r.end) }}),
-            dht::token_comparator(),
-            [&] (auto&& rng) { ranges.push_back(std::move(rng)); });
-        for (auto&& r : ranges) {
-            auto rp_range = as_ring_position_range(r);
-            for (auto&& sstable : cf.select_sstables(rp_range)) {
-                count += sstable->estimated_keys_for_range(r);
-                hist.merge(sstable->get_stats_metadata().estimated_row_size);
-            }
-        }
-        return {cf.schema(), r.start, r.end, count, count > 0 ? hist.mean() : 0};
-    }
+    estimates_for_current_keyspace(const database&, std::vector<token_range> local_ranges) const;
 };

 struct virtual_reader {
@@ -332,6 +69,12 @@ struct virtual_reader {
    }
 };

+/**
+ * Returns the primary ranges for the local node.
+ * Used for testing as well.
+ */
+future<std::vector<token_range>> get_local_ranges();
+
 } // namespace size_estimates

 } // namespace db
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -445,7 +445,7 @@ void create_virtual_column(schema_builder& builder, const bytes& name, const dat
        // A map has keys and values. We don't need these values,
        // and can use empty values instead.
        auto mtype = dynamic_pointer_cast<const map_type_impl>(type);
-        builder.with_column(name, map_type_impl::get_instance(mtype->get_values_type(), empty_type, true), column_kind::regular_column, column_view_virtual::yes);
+        builder.with_column(name, map_type_impl::get_instance(mtype->get_keys_type(), empty_type, true), column_kind::regular_column, column_view_virtual::yes);
    } else if (ctype->is_set()) {
        // A set's cell has nothing beyond the keys, so the
        // virtual version of a set is, unfortunately, a complete
@@ -781,6 +781,7 @@ future<stop_iteration> view_update_builder::on_results() {
    // If we have updates and it's a range tombstone, it removes nothing pre-exisiting, so we can ignore it
    if (_update && !_update->is_end_of_partition()) {
        if (_update->is_clustering_row()) {
+            apply_tracked_tombstones(_update_tombstone_tracker, _update->as_mutable_clustering_row());
            generate_update(std::move(*_update).as_clustering_row(), { });
        }
        return advance_updates();
@@ -1464,7 +1465,16 @@ private:
    built_views _built_views;
    std::vector<view_ptr> _views_to_build;
    std::deque<mutation_fragment> _fragments;
-
+    // The compact_for_query<> that feeds this consumer is already configured
+    // to feed us up to view_builder::batchsize (128) rows and not an entire
+    // partition. Still, if rows contain large blobs, saving 128 of them in
+    // _fragments may be too much. So we want to track _fragments' memory
+    // usage, and flush _fragments if it has grown too large.
+    // Additionally, limiting _fragments' size also solves issue #4213:
+    // A single view mutation can be as large as the size of the base rows
+    // used to build it, and we cannot allow its serialized size to grow
+    // beyond our limit on mutation size (by default 32 MB).
+    size_t _fragments_memory_usage = 0;
 public:
    consumer(view_builder& builder, build_step& step)
            : _builder(builder)
@@ -1527,7 +1537,15 @@ public:
            return stop_iteration::yes;
        }

+        _fragments_memory_usage += cr.memory_usage(*_step.base->schema());
        _fragments.push_back(std::move(cr));
+        if (_fragments_memory_usage > 1024*1024) {
+            // Although we have not yet completed the batch of base rows that
+            // compact_for_query<> planned for us (view_builder::batchsize),
+            // we've still collected enough rows to reach sizeable memory use,
+            // so let's flush these rows now.
+            flush_fragments();
+        }
        return stop_iteration::no;
    }

@@ -1535,7 +1553,7 @@ public:
        return stop_iteration::no;
    }

-    stop_iteration consume_end_of_partition() {
+    void flush_fragments() {
        _builder._as.check();
        if (!_fragments.empty()) {
            _fragments.push_front(partition_start(_step.current_key, tombstone()));
@@ -1544,7 +1562,12 @@ public:
                    _step.current_token(),
                    make_flat_mutation_reader_from_fragments(_step.base->schema(), std::move(_fragments))).get();
            _fragments.clear();
+            _fragments_memory_usage = 0;
        }
+    }
+
+    stop_iteration consume_end_of_partition() {
+        flush_fragments();
        return stop_iteration(_step.build_status.empty());
    }

--- a/db/view/view_update_from_staging_generator.cc
+++ b/db/view/view_update_from_staging_generator.cc
@@ -24,7 +24,9 @@
 namespace db::view {

 future<> view_update_from_staging_generator::start() {
-    _started = seastar::async([this]() mutable {
+    thread_attributes attr;
+    attr.sched_group = _db.get_streaming_scheduling_group();
+    _started = seastar::async(std::move(attr), [this]() mutable {
        while (!_as.abort_requested()) {
            if (_sstables_with_tables.empty()) {
                _pending_sstables.wait().get();
--- a/dht/boot_strapper.cc
+++ b/dht/boot_strapper.cc
@@ -51,20 +51,22 @@ future<> boot_strapper::bootstrap() {

    auto streamer = make_lw_shared<range_streamer>(_db, _token_metadata, _tokens, _address, "Bootstrap", streaming::stream_reason::bootstrap);
    streamer->add_source_filter(std::make_unique<range_streamer::failure_detector_source_filter>(gms::get_local_failure_detector()));
-    for (const auto& keyspace_name : _db.local().get_non_system_keyspaces()) {
+    auto keyspaces = make_lw_shared<std::vector<sstring>>(_db.local().get_non_system_keyspaces());
+    return do_for_each(*keyspaces, [this, keyspaces, streamer] (sstring& keyspace_name) {
        auto& ks = _db.local().find_keyspace(keyspace_name);
        auto& strategy = ks.get_replication_strategy();
        dht::token_range_vector ranges = strategy.get_pending_address_ranges(_token_metadata, _tokens, _address);
        blogger.debug("Will stream keyspace={}, ranges={}", keyspace_name, ranges);
-        streamer->add_ranges(keyspace_name, ranges);
-    }
-
-    return streamer->stream_async().then([streamer] () {
-        service::get_local_storage_service().finish_bootstrapping();
-    }).handle_exception([streamer] (std::exception_ptr eptr) {
-        blogger.warn("Error during bootstrap: {}", eptr);
-        return make_exception_future<>(std::move(eptr));
+        return streamer->add_ranges(keyspace_name, ranges);
+    }).then([this, streamer] {
+        return streamer->stream_async().then([streamer] () {
+            service::get_local_storage_service().finish_bootstrapping();
+        }).handle_exception([streamer] (std::exception_ptr eptr) {
+            blogger.warn("Error during bootstrap: {}", eptr);
+            return make_exception_future<>(std::move(eptr));
+        });
    });
+
 }

 std::unordered_set<token> boot_strapper::get_bootstrap_tokens(token_metadata metadata, database& db) {
--- a/dht/range_streamer.cc
+++ b/dht/range_streamer.cc
@@ -114,6 +114,9 @@ range_streamer::get_all_ranges_with_sources_for(const sstring& keyspace_name, dh
    for (auto& desired_range : desired_ranges) {
        auto found = false;
        for (auto& x : range_addresses) {
+            if (need_preempt()) {
+                seastar::thread::yield();
+            }
            const range<token>& src_range = x.first;
            if (src_range.contains(desired_range, dht::tri_compare)) {
                std::vector<inet_address>& addresses = x.second;
@@ -157,6 +160,9 @@ range_streamer::get_all_ranges_with_strict_sources_for(const sstring& keyspace_n
    for (auto& desired_range : desired_ranges) {
        for (auto& x : range_addresses) {
            const range<token>& src_range = x.first;
+            if (need_preempt()) {
+                seastar::thread::yield();
+            }
            if (src_range.contains(desired_range, dht::tri_compare)) {
                std::vector<inet_address> old_endpoints(x.second.begin(), x.second.end());
                auto it = pending_range_addresses.find(desired_range);
@@ -226,7 +232,8 @@ void range_streamer::add_rx_ranges(const sstring& keyspace_name, std::unordered_
 }

 // TODO: This is the legacy range_streamer interface, it is add_rx_ranges which adds rx ranges.
-void range_streamer::add_ranges(const sstring& keyspace_name, dht::token_range_vector ranges) {
+future<> range_streamer::add_ranges(const sstring& keyspace_name, dht::token_range_vector ranges) {
+  return seastar::async([this, keyspace_name, ranges= std::move(ranges)] () mutable {
    if (_nr_tx_added) {
        throw std::runtime_error("Mixed sending and receiving is not supported");
    }
@@ -249,6 +256,7 @@ void range_streamer::add_ranges(const sstring& keyspace_name, dht::token_range_v
        }
    }
    _to_stream.emplace(keyspace_name, std::move(range_fetch_map));
+  });
 }

 future<> range_streamer::stream_async() {
--- a/dht/range_streamer.hh
+++ b/dht/range_streamer.hh
@@ -120,7 +120,7 @@ public:
        _source_filters.emplace(std::move(filter));
    }

-    void add_ranges(const sstring& keyspace_name, dht::token_range_vector ranges);
+    future<> add_ranges(const sstring& keyspace_name, dht::token_range_vector ranges);
    void add_tx_ranges(const sstring& keyspace_name, std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint);
    void add_rx_ranges(const sstring& keyspace_name, std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint);
 private:
--- a/dist/ami/scylla.json
+++ b/dist/ami/scylla.json
@@ -68,7 +68,7 @@
      "type": "shell",
      "inline": [
         "sudo yum install -y epel-release",
-         "sudo yum install -y python34",
+         "sudo yum install -y python36",
         "sudo /home/{{user `ssh_username`}}/scylla_install_ami {{ user `install_args` }}"
       ]
    }
--- a/dist/common/scripts/scylla_util.py
+++ b/dist/common/scripts/scylla_util.py
@@ -449,6 +449,8 @@ def create_perftune_conf(nic='eth0'):


 def is_valid_nic(nic):
+    if len(nic) == 0:
+        return False
    return os.path.exists('/sys/class/net/{}'.format(nic))

 # Remove this when we do not support SET_NIC configuration value anymore
--- a/dist/docker/redhat/Dockerfile
+++ b/dist/docker/redhat/Dockerfile
@@ -33,7 +33,7 @@ RUN curl http://downloads.scylladb.com/rpm/centos/scylla-3.0.repo -o /etc/yum.re
    yum -y remove boost-thread boost-system && \
    yum -y install scylla hostname supervisor && \
    yum clean all && \
-    yum -y install python34 python34-PyYAML && \
+    yum -y install python36 python36-PyYAML && \
    cat /scylla_bashrc >> /etc/bashrc && \
    mkdir -p /etc/supervisor.conf.d && \
    mkdir -p /var/log/scylla && \
--- a/dist/redhat/scylla.spec.mustache
+++ b/dist/redhat/scylla.spec.mustache
@@ -56,9 +56,9 @@ License:        AGPLv3
 URL:            http://www.scylladb.com/
 BuildRequires:  libaio-devel libstdc++-devel cryptopp-devel hwloc-devel numactl-devel libpciaccess-devel libxml2-devel zlib-devel thrift-devel yaml-cpp-devel lz4-devel snappy-devel jsoncpp-devel systemd-devel xz-devel pcre-devel elfutils-libelf-devel bzip2-devel keyutils-libs-devel xfsprogs-devel make gnutls-devel systemd-devel lksctp-tools-devel protobuf-devel protobuf-compiler systemtap-sdt-devel ninja-build cmake python ragel grep kernel-headers
 %{?fedora:BuildRequires: boost-devel antlr3-tool antlr3-C++-devel python3 gcc-c++ libasan libubsan python3-pyparsing dnf-yum python2-pystache}
-%{?rhel:BuildRequires: scylla-libstdc++73-static scylla-libatomic73-static scylla-boost163-devel scylla-boost163-static scylla-antlr35-tool scylla-antlr35-C++-devel python34 scylla-gcc73-c++, scylla-python34-pyparsing20 yaml-cpp-static pystache python-setuptools}
+%{?rhel:BuildRequires: scylla-libstdc++73-static scylla-libatomic73-static scylla-boost163-devel scylla-boost163-static scylla-antlr35-tool scylla-antlr35-C++-devel python36 scylla-gcc73-c++, scylla-python36-pyparsing20 yaml-cpp-static pystache python-setuptools}
 Requires:       {{product}}-conf systemd-libs hwloc PyYAML python-urwid pciutils pyparsing python-requests curl util-linux python-setuptools pciutils python3-pyudev mdadm xfsprogs
-%{?rhel:Requires: python34 python34-PyYAML kernel >= 3.10.0-514}
+%{?rhel:Requires: python36 python36-PyYAML kernel >= 3.10.0-514}
 %{?fedora:Requires: python3 python3-PyYAML}
 Conflicts:      abrt
 %ifarch x86_64
@@ -97,7 +97,7 @@ cflags="--cflags=${defines[*]}"
 %endif
 %if 0%{?rhel}
 . /etc/profile.d/scylla.sh
-python3.4 ./configure.py %{?configure_opt} --with=scylla --with=iotune --mode=release "$cflags" --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7.3 --c-compiler=/opt/scylladb/bin/gcc-7.3 --python python3.4 --ldflag=-Wl,-rpath=/opt/scylladb/lib64
+python3.6 ./configure.py %{?configure_opt} --with=scylla --with=iotune --mode=release "$cflags" --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7.3 --c-compiler=/opt/scylladb/bin/gcc-7.3 --python python3.6 --ldflag=-Wl,-rpath=/opt/scylladb/lib64
 %endif
 ninja-build %{?_smp_mflags} build/release/scylla build/release/iotune

--- a/flat_mutation_reader.cc
+++ b/flat_mutation_reader.cc
@@ -22,6 +22,8 @@
 #include "flat_mutation_reader.hh"
 #include "mutation_reader.hh"
 #include "seastar/util/reference_wrapper.hh"
+#include "clustering_ranges_walker.hh"
+#include "schema_upgrader.hh"
 #include <algorithm>

 #include <boost/range/adaptor/transformed.hpp>
@@ -347,6 +349,7 @@ flat_mutation_reader make_empty_flat_reader(schema_ptr s) {

 flat_mutation_reader
 flat_mutation_reader_from_mutations(std::vector<mutation> ms,
+                                    const dht::partition_range& pr,
                                    const query::partition_slice& slice,
                                    streamed_mutation::forwarding fwd) {
    std::vector<mutation> sliced_ms;
@@ -355,7 +358,12 @@ flat_mutation_reader_from_mutations(std::vector<mutation> ms,
        auto mp = mutation_partition(std::move(m.partition()), *m.schema(), std::move(ck_ranges));
        sliced_ms.emplace_back(m.schema(), m.decorated_key(), std::move(mp));
    }
-    return flat_mutation_reader_from_mutations(sliced_ms, query::full_partition_range, fwd);
+    return flat_mutation_reader_from_mutations(sliced_ms, pr, fwd);
+}
+
+flat_mutation_reader
+flat_mutation_reader_from_mutations(std::vector<mutation> ms, const query::partition_slice& slice, streamed_mutation::forwarding fwd) {
+    return flat_mutation_reader_from_mutations(std::move(ms), query::full_partition_range, slice, fwd);
 }

 flat_mutation_reader
@@ -487,11 +495,11 @@ flat_mutation_reader_from_mutations(std::vector<mutation> mutations, const dht::
        }
    public:
        reader(schema_ptr s, std::vector<mutation>&& mutations, const dht::partition_range& pr)
-            : impl(std::move(s))
+            : impl(s)
            , _mutations(std::move(mutations))
            , _cur(find_first_partition(_mutations, pr))
            , _end(find_last_partition(_mutations, pr))
-            , _cmp(*_cur->schema())
+            , _cmp(*s)
        {
            _end_of_stream = _cur == _end;
            if (!_end_of_stream) {
@@ -509,6 +517,7 @@ flat_mutation_reader_from_mutations(std::vector<mutation> mutations, const dht::
            // clear_and_dispose() used by mutation_partition destructor won't
            // work properly.

+            _cur = _mutations.begin();
            while (_cur != _end) {
                destroy_current_mutation();
                ++_cur;
@@ -779,15 +788,32 @@ make_flat_multi_range_reader(

 flat_mutation_reader
 make_flat_mutation_reader_from_fragments(schema_ptr schema, std::deque<mutation_fragment> fragments) {
+    return make_flat_mutation_reader_from_fragments(std::move(schema), std::move(fragments), query::full_partition_range);
+}
+
+flat_mutation_reader
+make_flat_mutation_reader_from_fragments(schema_ptr schema, std::deque<mutation_fragment> fragments, const dht::partition_range& pr) {
    class reader : public flat_mutation_reader::impl {
        std::deque<mutation_fragment> _fragments;
+        const dht::partition_range* _pr;
+        dht::ring_position_comparator _cmp;
+
+    private:
+        bool end_of_range() const {
+            return _fragments.empty() ||
+                (_fragments.front().is_partition_start() && _pr->after(_fragments.front().as_partition_start().key(), _cmp));
+        }
+
    public:
-        reader(schema_ptr schema, std::deque<mutation_fragment> fragments)
+        reader(schema_ptr schema, std::deque<mutation_fragment> fragments, const dht::partition_range& pr)
                : flat_mutation_reader::impl(std::move(schema))
-                , _fragments(std::move(fragments)) {
+                , _fragments(std::move(fragments))
+                , _pr(&pr)
+                , _cmp(*_schema) {
+            fast_forward_to(*_pr, db::no_timeout);
        }
        virtual future<> fill_buffer(db::timeout_clock::time_point) override {
-            while (!(_end_of_stream = _fragments.empty()) && !is_buffer_full()) {
+            while (!(_end_of_stream = end_of_range()) && !is_buffer_full()) {
                push_mutation_fragment(std::move(_fragments.front()));
                _fragments.pop_front();
            }
@@ -796,7 +822,7 @@ make_flat_mutation_reader_from_fragments(schema_ptr schema, std::deque<mutation_
        virtual void next_partition() override {
            clear_buffer_to_next_partition();
            if (is_buffer_empty()) {
-                while (!(_end_of_stream = _fragments.empty()) && !_fragments.front().is_partition_start()) {
+                while (!(_end_of_stream = end_of_range()) && !_fragments.front().is_partition_start()) {
                    _fragments.pop_front();
                }
            }
@@ -805,8 +831,48 @@ make_flat_mutation_reader_from_fragments(schema_ptr schema, std::deque<mutation_
            throw std::runtime_error("This reader can't be fast forwarded to another range.");
        }
        virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override {
-            throw std::runtime_error("This reader can't be fast forwarded to another position.");
+            clear_buffer();
+            _pr = &pr;
+            _fragments.erase(_fragments.begin(), std::find_if(_fragments.begin(), _fragments.end(), [this] (const mutation_fragment& mf) {
+                return mf.is_partition_start() && !_pr->before(mf.as_partition_start().key(), _cmp);
+            }));
+            _end_of_stream = end_of_range();
+            return make_ready_future<>();
        }
    };
-    return make_flat_mutation_reader<reader>(std::move(schema), std::move(fragments));
+    return make_flat_mutation_reader<reader>(std::move(schema), std::move(fragments), pr);
+}
+
+flat_mutation_reader
+make_flat_mutation_reader_from_fragments(schema_ptr schema, std::deque<mutation_fragment> fragments, const dht::partition_range& pr, const query::partition_slice& slice) {
+    std::optional<clustering_ranges_walker> ranges_walker; // re-created at every partition_start
+    for (auto it = fragments.begin(); it != fragments.end();) { // erase the fragments for which advance_to() returns false
+        switch (it->mutation_fragment_kind()) {
+            case mutation_fragment::kind::partition_start:
+                ranges_walker.emplace(*schema, slice.row_ranges(*schema, it->as_partition_start().key().key()), false); // fall-through
+            case mutation_fragment::kind::static_row: // fall-through
+            case mutation_fragment::kind::partition_end: // these three kinds are always kept
+                ++it;
+                break;
+            case mutation_fragment::kind::clustering_row: // assumes a partition_start was seen first, so ranges_walker is engaged
+                if (ranges_walker->advance_to(it->position())) {
+                    ++it;
+                } else {
+                    it = fragments.erase(it);
+                }
+                break;
+            case mutation_fragment::kind::range_tombstone: // same assumption; kept or erased whole, not trimmed here
+                if (ranges_walker->advance_to(it->as_range_tombstone().position(), it->as_range_tombstone().end_position())) {
+                    ++it;
+                } else {
+                    it = fragments.erase(it);
+                }
+                break;
+        }
+    }
+    return make_flat_mutation_reader_from_fragments(std::move(schema), std::move(fragments), pr); // partition-range filtering is left to this overload
+}
+
+void flat_mutation_reader::do_upgrade_schema(const schema_ptr& s) {
+    *this = transform(std::move(*this), schema_upgrader(s));
 }
--- a/flat_mutation_reader.hh
+++ b/flat_mutation_reader.hh
@@ -328,6 +328,7 @@ private:
    flat_mutation_reader() = default;
    explicit operator bool() const noexcept { return bool(_impl); }
    friend class optimized_optional<flat_mutation_reader>;
+    void do_upgrade_schema(const schema_ptr&);
 public:
    // Documented in mutation_reader::forwarding in mutation_reader.hh.
    class partition_range_forwarding_tag;
@@ -466,6 +467,14 @@ public:
    void move_buffer_content_to(impl& other) {
        _impl->move_buffer_content_to(other);
    }
+
+    // Causes this reader to conform to s.
+    // Multiple calls of upgrade_schema() compose: effects of prior calls on the stream are preserved.
+    void upgrade_schema(const schema_ptr& s) {
+        if (__builtin_expect(s != schema(), false)) {
+            do_upgrade_schema(s);
+        }
+    }
 };

 using flat_mutation_reader_opt = optimized_optional<flat_mutation_reader>;
@@ -568,8 +577,12 @@ class delegating_reader : public flat_mutation_reader::impl {
 public:
    delegating_reader(Underlying&& r) : impl(to_reference(r).schema()), _underlying(std::forward<Underlying>(r)) { }
    virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override {
-        return fill_buffer_from(to_reference(_underlying), timeout).then([this] (bool underlying_finished) {
-            _end_of_stream = underlying_finished;
+        if (is_buffer_full()) {
+            return make_ready_future<>();
+        }
+        return to_reference(_underlying).fill_buffer(timeout).then([this] {
+            _end_of_stream = to_reference(_underlying).is_end_of_stream();
+            to_reference(_underlying).move_buffer_content_to(*this);
        });
    }
    virtual future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) override {
@@ -609,6 +622,11 @@ flat_mutation_reader
 flat_mutation_reader_from_mutations(std::vector<mutation> ms,
                                    const query::partition_slice& slice,
                                    streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no);
+flat_mutation_reader
+flat_mutation_reader_from_mutations(std::vector<mutation> ms,
+                                    const dht::partition_range& pr,
+                                    const query::partition_slice& slice,
+                                    streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no);

 /// Make a reader that enables the wrapped reader to work with multiple ranges.
 ///
@@ -642,6 +660,12 @@ make_flat_multi_range_reader(
 flat_mutation_reader
 make_flat_mutation_reader_from_fragments(schema_ptr, std::deque<mutation_fragment>);

+flat_mutation_reader
+make_flat_mutation_reader_from_fragments(schema_ptr, std::deque<mutation_fragment>, const dht::partition_range& pr);
+
+flat_mutation_reader
+make_flat_mutation_reader_from_fragments(schema_ptr, std::deque<mutation_fragment>, const dht::partition_range& pr, const query::partition_slice& slice);
+
 // Calls the consumer for each element of the reader's stream until end of stream
 // is reached or the consumer requests iteration to stop by returning stop_iteration::yes.
 // The consumer should accept mutation as the argument and return stop_iteration.
--- a/gms/feature.hh
+++ b/gms/feature.hh
@@ -25,6 +25,8 @@

 namespace gms {

+class feature_service;
+
 /**
 * A gossip feature tracks whether all the nodes the current one is
 * aware of support the specified feature.
@@ -32,12 +34,13 @@ namespace gms {
 * A feature should only be created once the gossiper is available.
 */
 class feature final {
+    feature_service* _service = nullptr;
    sstring _name;
    bool _enabled = false;
    mutable shared_promise<> _pr;
    friend class gossiper;
 public:
-    explicit feature(sstring name, bool enabled = false);
+    explicit feature(feature_service& service, sstring name, bool enabled = false);
    feature() = default;
    ~feature();
    feature(const feature& other) = delete;
--- a/gms/feature_service.hh
+++ b/gms/feature_service.hh
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2018 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <seastar/core/sstring.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_future.hh>
+#include <unordered_map>
+#include <vector>
+#include "seastarx.hh"
+
+namespace gms {
+
+class feature;
+
+/**
+ * Registry of gossip features (see gms::feature), keyed by feature name;
+ * it records registered feature objects and exposes enable() by name.
+ */
+class feature_service final {
+    std::unordered_map<sstring, std::vector<feature*>> _registered_features;
+public:
+    feature_service();
+    ~feature_service();
+    future<> stop();
+    void register_feature(feature* f);
+    void unregister_feature(feature* f);
+    void enable(const sstring& name);
+};
+
+} // namespace gms
--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -44,6 +44,7 @@
 #include "gms/gossip_digest_ack2.hh"
 #include "gms/versioned_value.hh"
 #include "gms/gossiper.hh"
+#include "gms/feature_service.hh"
 #include "gms/application_state.hh"
 #include "gms/failure_detector.hh"
 #include "gms/i_failure_detection_event_listener.hh"
@@ -53,6 +54,7 @@
 #include "message/messaging_service.hh"
 #include "dht/i_partitioner.hh"
 #include "log.hh"
+#include "db/system_keyspace.hh"
 #include <seastar/core/sleep.hh>
 #include <seastar/core/thread.hh>
 #include <seastar/core/metrics.hh>
@@ -126,7 +128,8 @@ public:
    void on_restart(inet_address, endpoint_state) override {}
 };

-gossiper::gossiper() {
+gossiper::gossiper(feature_service& features)
+        : _feature_service(features) {
    // Gossiper's stuff below runs only on CPU0
    if (engine().cpu_id() != 0) {
        return;
@@ -480,8 +483,7 @@ future<> gossiper::apply_state_locally(std::map<inet_address, endpoint_state> ma
                    int local_generation = local_ep_state_ptr.get_heart_beat_state().get_generation();
                    int remote_generation = remote_state.get_heart_beat_state().get_generation();
                    logger.trace("{} local generation {}, remote generation {}", ep, local_generation, remote_generation);
-                    // A node was removed with nodetool removenode can have a generation of 2
-                    if (local_generation > 2 && remote_generation > local_generation + MAX_GENERATION_DIFFERENCE) {
+                    if (remote_generation > service::get_generation_number() + MAX_GENERATION_DIFFERENCE) {
                        // assume some peer has corrupted memory and is broadcasting an unbelievable generation about another peer (or itself)
                        logger.warn("received an invalid gossip generation for peer {}; local generation = {}, received generation = {}",
                            ep, local_generation, remote_generation);
@@ -2031,14 +2033,21 @@ future<> gossiper::wait_for_gossip(std::chrono::milliseconds initial_delay, stdx

 future<> gossiper::wait_for_gossip_to_settle() {
    static constexpr std::chrono::milliseconds GOSSIP_SETTLE_MIN_WAIT_MS{5000};
-
    auto& cfg = service::get_local_storage_service().db().local().get_config();
    auto force_after = cfg.skip_wait_for_gossip_to_settle();
+    auto do_enable_features = [this] { // helper: enable features once gossip is considered settled
+        return async([this] { // maybe_enable_features() blocks on futures, so it is run inside async()
+            if (!std::exchange(_gossip_settled, true)) { // only the call that flips the flag proceeds
+               maybe_enable_features();
+            }
+        });
+    };
    if (force_after == 0) {
-        return make_ready_future<>();
+        return do_enable_features();
    }
-    logger.info("Waiting for gossip to settle before accepting client requests...");
-    return wait_for_gossip(GOSSIP_SETTLE_MIN_WAIT_MS, force_after);
+    return wait_for_gossip(GOSSIP_SETTLE_MIN_WAIT_MS, force_after).then([this, do_enable_features] {
+        return do_enable_features();
+    });
 }

 future<> gossiper::wait_for_range_setup() {
@@ -2084,20 +2093,45 @@ std::set<sstring> gossiper::get_supported_features(inet_address endpoint) const
    return to_feature_set(app_state->value);
 }

-std::set<sstring> gossiper::get_supported_features() const {
-    std::unordered_map<inet_address, std::set<sstring>> features_map;
+std::set<sstring> gossiper::get_supported_features(const std::unordered_map<gms::inet_address, sstring>& loaded_peer_features, ignore_features_of_local_node ignore_local_node) const {
+    std::unordered_map<gms::inet_address, std::set<sstring>> features_map;
    std::set<sstring> common_features;

+    for (auto& x : loaded_peer_features) {
+        auto features = to_feature_set(x.second);
+        if (features.empty()) {
+            logger.warn("Loaded empty features for peer node {}", x.first);
+        } else {
+            features_map.emplace(x.first, std::move(features));
+        }
+    }
+
    for (auto& x : endpoint_state_map) {
        auto endpoint = x.first;
        auto features = get_supported_features(endpoint);
+        if (ignore_local_node && endpoint == get_broadcast_address()) {
+            logger.debug("Ignore SUPPORTED_FEATURES of local node: features={}", features);
+            continue;
+        }
        if (features.empty()) {
-            return std::set<sstring>();
+            auto it = loaded_peer_features.find(endpoint);
+            if (it != loaded_peer_features.end()) {
+                logger.info("Node {} does not contain SUPPORTED_FEATURES in gossip, using features saved in system table, features={}", endpoint, to_feature_set(it->second));
+            } else {
+                logger.warn("Node {} does not contain SUPPORTED_FEATURES in gossip or system table", endpoint);
+            }
+        } else {
+            // Replace the features with live info
+            features_map[endpoint] = std::move(features);
        }
-        if (common_features.empty()) {
-            common_features = features;
-        }
-        features_map.emplace(endpoint, std::move(features));
+    }
+
+    if (ignore_local_node) {
+        features_map.erase(get_broadcast_address());
+    }
+
+    if (!features_map.empty()) {
+        common_features = features_map.begin()->second;
    }

    for (auto& x : features_map) {
@@ -2112,37 +2146,10 @@ std::set<sstring> gossiper::get_supported_features() const {
    return common_features;
 }

-std::set<sstring> gossiper::get_supported_features(std::unordered_map<gms::inet_address, sstring> peer_features_string) {
-    std::set<sstring> common_features;
-    // Convert feature string split by "," to std::set
-    std::unordered_map<gms::inet_address, std::set<sstring>> features_map;
-    for (auto& x : peer_features_string) {
-        std::set<sstring> features = to_feature_set(x.second);
-        if (features.empty()) {
-            return std::set<sstring>();
-        }
-        if (common_features.empty()) {
-            common_features = features;
-        }
-        features_map.emplace(x.first, features);
-    }
-
-    for (auto& x : features_map) {
-        auto& features = x.second;
-        std::set<sstring> result;
-        std::set_intersection(features.begin(), features.end(),
-                common_features.begin(), common_features.end(),
-                std::inserter(result, result.end()));
-        common_features = std::move(result);
-    }
-    common_features.erase("");
-    return common_features;
-}
-
-void gossiper::check_knows_remote_features(sstring local_features_string) const {
+void gossiper::check_knows_remote_features(sstring local_features_string, const std::unordered_map<inet_address, sstring>& loaded_peer_features) const {
    std::set<sstring> local_features = to_feature_set(local_features_string);
    auto local_endpoint = get_broadcast_address();
-    auto common_features = get_supported_features();
+    auto common_features = get_supported_features(loaded_peer_features, ignore_features_of_local_node::yes);
    if (boost::range::includes(local_features, common_features)) {
        logger.info("Feature check passed. Local node {} features = {}, Remote common_features = {}",
                local_endpoint, local_features, common_features);
@@ -2151,44 +2158,19 @@ void gossiper::check_knows_remote_features(sstring local_features_string) const
    }
 }

-void gossiper::check_knows_remote_features(sstring local_features_string, std::unordered_map<inet_address, sstring> peer_features_string) const {
-    std::set<sstring> local_features = to_feature_set(local_features_string);
-    auto local_endpoint = get_broadcast_address();
-    auto common_features = get_supported_features(peer_features_string);
-    if (boost::range::includes(local_features, common_features)) {
-        logger.info("Feature check passed. Local node {} features = {}, Remote common_features = {}",
-                local_endpoint, local_features, common_features);
-    } else {
-        throw std::runtime_error(sprint("Feature check failed. This node can not join the cluster because it does not understand the feature. Local node %s features = %s, Remote common_features = %s", local_endpoint, local_features, common_features));
-    }
+feature_service::feature_service() = default;
+
+feature_service::~feature_service() = default;
+
+future<> feature_service::stop() {
+    return make_ready_future<>();
 }

-static bool check_features(std::set<sstring> features, std::set<sstring> need_features) {
-    logger.debug("Checking if need_features {} in features {}", need_features, features);
-    return boost::range::includes(features, need_features);
+void feature_service::register_feature(feature* f) {
+    _registered_features.emplace(f->name(), std::vector<feature*>()).first->second.emplace_back(f);
 }

-future<> gossiper::wait_for_feature_on_all_node(std::set<sstring> features) {
-    return _features_condvar.wait([this, features = std::move(features)] {
-        return check_features(get_supported_features(), features);
-    });
-}
-
-future<> gossiper::wait_for_feature_on_node(std::set<sstring> features, inet_address endpoint) {
-    return _features_condvar.wait([this, features = std::move(features), endpoint = std::move(endpoint)] {
-        return check_features(get_supported_features(endpoint), features);
-    });
-}
-
-void gossiper::register_feature(feature* f) {
-    if (check_features(get_local_gossiper().get_supported_features(), {f->name()})) {
-        f->enable();
-    } else {
-        _registered_features.emplace(f->name(), std::vector<feature*>()).first->second.emplace_back(f);
-    }
-}
-
-void gossiper::unregister_feature(feature* f) {
+void feature_service::unregister_feature(feature* f) {
    auto&& fsit = _registered_features.find(f->name());
    if (fsit == _registered_features.end()) {
        return;
@@ -2200,66 +2182,61 @@ void gossiper::unregister_feature(feature* f) {
    }
 }

+
+void feature_service::enable(const sstring& name) { // enables every feature registered under name
+    if (auto it = _registered_features.find(name); it != _registered_features.end()) { // names with no registered features are ignored
+        for (auto&& f : it->second) {
+            f->enable();
+        }
+    }
+}
+
 // Runs inside seastar::async context
 void gossiper::maybe_enable_features() {
-    if (_registered_features.empty()) {
-        _features_condvar.broadcast();
+    if (!_gossip_settled) {
        return;
    }
-
-    auto&& features = get_supported_features();
+    auto loaded_peer_features = db::system_keyspace::load_peer_features().get0();
+    auto&& features = get_supported_features(loaded_peer_features, ignore_features_of_local_node::no);
    container().invoke_on_all([&features] (gossiper& g) {
-        for (auto it = g._registered_features.begin(); it != g._registered_features.end();) {
-            if (features.find(it->first) != features.end()) {
-                for (auto&& f : it->second) {
-                    f->enable();
-                }
-                it = g._registered_features.erase(it);
-            } else {
-                ++it;
-            }
+        for (auto&& name : features) {
+            g._feature_service.enable(name);
        }
        g._features_condvar.broadcast();
    }).get();
 }

-feature::feature(sstring name, bool enabled)
-        : _name(name)
+feature::feature(feature_service& service, sstring name, bool enabled)
+        : _service(&service)
+        , _name(name)
         , _enabled(enabled) {
-    if (!_enabled) {
-        get_local_gossiper().register_feature(this);
-    } else {
+    _service->register_feature(this); // registered regardless of the initial enabled state
+    if (_enabled) { // already enabled at construction: resolve the promise right away
         _pr.set_value();
     }
 }

 feature::~feature() {
-    if (!_enabled) {
-        auto& gossiper = get_gossiper();
-        if (gossiper.local_is_initialized()) {
-            gossiper.local().unregister_feature(this);
-        }
+    if (_service) { // null for a default-constructed feature or one that was moved from
+        _service->unregister_feature(this);
     }
 }

 feature& feature::operator=(feature&& other) {
-    if (!_enabled) {
-        get_local_gossiper().unregister_feature(this);
-    }
+    if (_service) { _service->unregister_feature(this); } // _service is null for a default-constructed or moved-from feature
+    if (other._service) { other._service->unregister_feature(&other); } _service = std::exchange(other._service, nullptr); // drop other's registration before it loses its service pointer, else the registry keeps a dangling entry
     _name = other._name;
     _enabled = other._enabled;
     _pr = std::move(other._pr);
-    if (!_enabled) {
-        get_local_gossiper().register_feature(this);
-    }
+    if (_service) { _service->register_feature(this); } // re-register under the adopted name; skipped when other had no service
     return *this;
 }

 void feature::enable() {
-    if (engine().cpu_id() == 0) {
-        logger.info("Feature {} is enabled", name());
-    }
    if (!_enabled) {
+        if (engine().cpu_id() == 0) {
+            logger.info("Feature {} is enabled", name());
+        }
        _enabled = true;
        _pr.set_value();
    }
--- a/gms/gossiper.hh
+++ b/gms/gossiper.hh
@@ -70,6 +70,8 @@ class inet_address;
 class i_endpoint_state_change_subscriber;
 class i_failure_detector;

+class feature_service;
+
 struct bind_messaging_port_tag {};
 using bind_messaging_port = bool_class<bind_messaging_port_tag>;

@@ -88,6 +90,7 @@ using bind_messaging_port = bool_class<bind_messaging_port_tag>;
 class gossiper : public i_failure_detection_event_listener, public seastar::async_sharded_service<gossiper>, public seastar::peering_sharded_service<gossiper> {
 public:
    using clk = seastar::lowres_system_clock;
+    using ignore_features_of_local_node = bool_class<class ignore_features_of_local_node_tag>;
 private:
    using messaging_verb = netw::messaging_verb;
    using messaging_service = netw::messaging_service;
@@ -153,7 +156,9 @@ public:
    static constexpr std::chrono::milliseconds INTERVAL{1000};
    static constexpr std::chrono::hours A_VERY_LONG_TIME{24 * 3};

-    /** Maximimum difference in generation and version values we are willing to accept about a peer */
+    // Maximum difference between remote generation value and generation
+    // value this node would get if this node were restarted that we are
+    // willing to accept about a peer.
    static constexpr int64_t MAX_GENERATION_DIFFERENCE = 86400 * 365;
    std::chrono::milliseconds fat_client_timeout;

@@ -236,7 +241,7 @@ private:
    // The value must be kept alive until completes and not change.
    future<> replicate(inet_address, application_state key, const versioned_value& value);
 public:
-    gossiper();
+    explicit gossiper(feature_service& features);

    void set_last_processed_message_at();
    void set_last_processed_message_at(clk::time_point tp);
@@ -565,29 +570,20 @@ private:
    uint64_t _msg_processing = 0;
    bool _ms_registered = false;
    bool _gossiped_to_seed = false;
+    bool _gossip_settled = false;

    class msg_proc_guard;
 private:
    condition_variable _features_condvar;
-    std::unordered_map<sstring, std::vector<feature*>> _registered_features;
+    feature_service& _feature_service;
    friend class feature;
    // Get features supported by a particular node
    std::set<sstring> get_supported_features(inet_address endpoint) const;
    // Get features supported by all the nodes this node knows about
-    std::set<sstring> get_supported_features() const;
-    // Get features supported by all the nodes listed in the address/feature map
-    static std::set<sstring> get_supported_features(std::unordered_map<gms::inet_address, sstring> peer_features_string);
-    // Wait for features are available on all nodes this node knows about
-    future<> wait_for_feature_on_all_node(std::set<sstring> features);
-    // Wait for features are available on a particular node
-    future<> wait_for_feature_on_node(std::set<sstring> features, inet_address endpoint);
+    std::set<sstring> get_supported_features(const std::unordered_map<gms::inet_address, sstring>& loaded_peer_features, ignore_features_of_local_node ignore_local_node) const;
 public:
-    void check_knows_remote_features(sstring local_features_string) const;
-    void check_knows_remote_features(sstring local_features_string, std::unordered_map<inet_address, sstring> peer_features_string) const;
+    void check_knows_remote_features(sstring local_features_string, const std::unordered_map<inet_address, sstring>& loaded_peer_features) const;
    void maybe_enable_features();
-private:
-    void register_feature(feature* f);
-    void unregister_feature(feature* f);
 private:
    seastar::metrics::metric_groups _metrics;
 };
--- a/idl/reconcilable_result.idl.hh
+++ b/idl/reconcilable_result.idl.hh
@@ -26,6 +26,6 @@ class partition {

 class reconcilable_result {
    uint32_t row_count();
-    std::vector<partition> partitions();
+    utils::chunked_vector<partition> partitions();
    query::short_read is_short_read() [[version 1.6]] = query::short_read::no;
 };
--- a/idl/streaming.idl.hh
+++ b/idl/streaming.idl.hh
@@ -51,4 +51,10 @@ enum class stream_reason : uint8_t {
    repair,
 };

+enum class stream_mutation_fragments_cmd : uint8_t {
+    error,
+    mutation_fragment_data,
+    end_of_stream,
+};
+
 }
--- a/index/secondary_index_manager.cc
+++ b/index/secondary_index_manager.cc
@@ -134,6 +134,11 @@ view_ptr secondary_index_manager::create_view_for_index(const index_metadata& im
        }
        builder.with_column(col.name(), col.type, column_kind::clustering_key);
    }
+    if (index_target->is_primary_key()) {
+        for (auto& def : schema->regular_columns()) {
+            db::view::create_virtual_column(builder, def.name(), def.type);
+        }
+    }
    const sstring where_clause = sprint("%s IS NOT NULL", cql3::util::maybe_quote(index_target_name));
    builder.with_view_info(*schema, false, where_clause);
    return view_ptr{builder.build()};
--- a/init.cc
+++ b/init.cc
@@ -26,6 +26,8 @@
 #include "service/storage_service.hh"
 #include "to_string.hh"
 #include "gms/inet_address.hh"
+#include "gms/feature_service.hh"
+#include "seastarx.hh"

 logging::logger startlog("init");

@@ -34,13 +36,16 @@ logging::logger startlog("init");
 // duplicated in cql_test_env.cc
 // until proper shutdown is done.

-void init_storage_service(distributed<database>& db, sharded<auth::service>& auth_service, sharded<db::system_distributed_keyspace>& sys_dist_ks) {
-    service::init_storage_service(db, auth_service, sys_dist_ks).get();
+void init_storage_service(distributed<database>& db, sharded<auth::service>& auth_service, sharded<db::system_distributed_keyspace>& sys_dist_ks,
+        sharded<gms::feature_service>& feature_service) {
+    service::init_storage_service(db, auth_service, sys_dist_ks, feature_service).get();
    // #293 - do not stop anything
    //engine().at_exit([] { return service::deinit_storage_service(); });
 }

-void init_ms_fd_gossiper(sstring listen_address_in
+void init_ms_fd_gossiper(sharded<gms::feature_service>& features
+                , db::config& cfg
+                , sstring listen_address_in
                , uint16_t storage_port
                , uint16_t ssl_storage_port
                , bool tcp_nodelay_inter_dc
@@ -100,6 +105,8 @@ void init_ms_fd_gossiper(sstring listen_address_in
            creds->set_x509_trust_file(ms_trust_store, x509_crt_format::PEM).get();
        }

+        creds->set_priority_string(db::config::default_tls_priority);
+
        if (!ms_tls_prio.empty()) {
            creds->set_priority_string(ms_tls_prio);
        }
@@ -150,7 +157,11 @@ void init_ms_fd_gossiper(sstring listen_address_in
                to_string(seeds), listen_address_in, broadcast_address);
        throw std::runtime_error("Use broadcast_address for seeds list");
    }
-    gms::get_gossiper().start().get();
+    if ((!cfg.replace_address_first_boot().empty() || !cfg.replace_address().empty()) && seeds.count(broadcast_address)) {
+        startlog.error("Bad configuration: replace-address and replace-address-first-boot are not allowed for seed nodes");
+        throw bad_configuration_error();
+    }
+    gms::get_gossiper().start(std::ref(features)).get();
    auto& gossiper = gms::get_local_gossiper();
    gossiper.set_seeds(seeds);
    // #293 - do not stop anything
--- a/init.hh
+++ b/init.hh
@@ -28,16 +28,21 @@
 #include "db/system_distributed_keyspace.hh"
 #include "database.hh"
 #include "log.hh"
+#include "seastarx.hh"

 namespace db {
 class extensions;
 }

+namespace gms {
+class feature_service;
+}
+
 extern logging::logger startlog;

 class bad_configuration_error : public std::exception {};

-void init_storage_service(distributed<database>& db, sharded<auth::service>&, sharded<db::system_distributed_keyspace>&);
+void init_storage_service(distributed<database>& db, sharded<auth::service>&, sharded<db::system_distributed_keyspace>&, sharded<gms::feature_service>&);

 struct init_scheduling_config {
    scheduling_group streaming;
@@ -45,7 +50,9 @@ struct init_scheduling_config {
    scheduling_group gossip;
 };

-void init_ms_fd_gossiper(sstring listen_address
+void init_ms_fd_gossiper(sharded<gms::feature_service>& features
+                , db::config& config
+                , sstring listen_address
                , uint16_t storage_port
                , uint16_t ssl_storage_port
                , bool tcp_nodelay_inter_dc
--- a/main.cc
+++ b/main.cc
@@ -64,6 +64,7 @@
 #include "sstables/compaction_manager.hh"
 #include "sstables/sstables.hh"
 #include <db/view/view_update_from_staging_generator.hh>
+#include "gms/feature_service.hh"

 seastar::metrics::metric_groups app_metrics;

@@ -301,15 +302,7 @@ int main(int ac, char** av) {
    auto cfg = make_lw_shared<db::config>(ext);
    auto init = app.get_options_description().add_options();

-    // If --version is requested, print it out and exit immediately to avoid
-    // Seastar-specific warnings that may occur when running the app
    init("version", bpo::bool_switch(), "print version number and exit");
-    bpo::variables_map vm;
-    bpo::store(bpo::command_line_parser(ac, av).options(app.get_options_description()).allow_unregistered().run(), vm);
-    if (vm["version"].as<bool>()) {
-        print("%s\n", scylla_version());
-        return 0;
-    }

    bpo::options_description deprecated("Deprecated options - ignored");
    deprecated.add_options()
@@ -323,6 +316,15 @@ int main(int ac, char** av) {
    configurable::append_all(*cfg, init);
    cfg->add_options(init);

+    // If --version is requested, print it out and exit immediately to avoid
+    // Seastar-specific warnings that may occur when running the app
+    bpo::variables_map vm;
+    bpo::store(bpo::command_line_parser(ac, av).options(app.get_options_description()).allow_unregistered().run(), vm);
+    if (vm["version"].as<bool>()) {
+        print("%s\n", scylla_version());
+        return 0;
+    }
+
    distributed<database> db;
    seastar::sharded<service::cache_hitrate_calculator> cf_cache_hitrate_calculator;
    debug::db = &db;
@@ -333,6 +335,7 @@ int main(int ac, char** av) {
    httpd::http_server_control prometheus_server;
    prometheus::config pctx;
    directories dirs;
+    sharded<gms::feature_service> feature_service;

    return app.run_deprecated(ac, av, [&] {

@@ -360,7 +363,8 @@ int main(int ac, char** av) {

        tcp_syncookies_sanity();

-        return seastar::async([cfg, ext, &db, &qp, &proxy, &mm, &ctx, &opts, &dirs, &pctx, &prometheus_server, &return_value, &cf_cache_hitrate_calculator] {
+        return seastar::async([cfg, ext, &db, &qp, &proxy, &mm, &ctx, &opts, &dirs, &pctx, &prometheus_server, &return_value, &cf_cache_hitrate_calculator,
+                               &feature_service] {
            read_config(opts, *cfg).get();
            configurable::init_all(opts, *cfg, *ext).get();

@@ -380,6 +384,8 @@ int main(int ac, char** av) {
                    throw bad_configuration_error();
                }
            }
+            feature_service.start().get();
+            // FIXME: feature_service.stop(), when we fix up shutdown
            dht::set_global_partitioner(cfg->partitioner(), cfg->murmur3_partitioner_ignore_msb_bits());
            auto make_sched_group = [&] (sstring name, unsigned shares) {
                if (cfg->cpu_scheduler()) {
@@ -478,6 +484,9 @@ int main(int ac, char** av) {
            if (opts.count("developer-mode")) {
                smp::invoke_on_all([] { engine().set_strict_dma(false); }).get();
            }
+
+            set_abort_on_internal_error(cfg->abort_on_internal_error());
+
            supervisor::notify("creating tracing");
            tracing::tracing::create_tracing("trace_keyspace_helper").get();
            supervisor::notify("creating snitch");
@@ -503,7 +512,7 @@ int main(int ac, char** av) {
            static sharded<auth::service> auth_service;
            static sharded<db::system_distributed_keyspace> sys_dist_ks;
            supervisor::notify("initializing storage service");
-            init_storage_service(db, auth_service, sys_dist_ks);
+            init_storage_service(db, auth_service, sys_dist_ks, feature_service);
            supervisor::notify("starting per-shard database core");

            // Note: changed from using a move here, because we want the config object intact.
@@ -599,7 +608,9 @@ int main(int ac, char** av) {
            scfg.statement = dbcfg.statement_scheduling_group;
            scfg.streaming = dbcfg.streaming_scheduling_group;
            scfg.gossip = scheduling_group();
-            init_ms_fd_gossiper(listen_address
+            init_ms_fd_gossiper(feature_service
+                    , *cfg
+                    , listen_address
                    , storage_port
                    , ssl_storage_port
                    , tcp_nodelay_inter_dc
@@ -780,6 +791,7 @@ int main(int ac, char** av) {
            });

            api::set_server_cache(ctx);
+            startlog.info("Waiting for gossip to settle before accepting client requests...");
            gms::get_local_gossiper().wait_for_gossip_to_settle().get();
            api::set_server_gossip_settle(ctx).get();

@@ -839,8 +851,11 @@ int main(int ac, char** av) {
                return service::get_local_storage_service().drain_on_shutdown();
            });

-            engine().at_exit([] {
-                return view_builder.stop();
+            engine().at_exit([cfg] {
+                if (cfg->view_building()) {
+                    return view_builder.stop();
+                }
+                return make_ready_future<>();
            });

            engine().at_exit([&db] {
--- a/memtable.cc
+++ b/memtable.cc
@@ -24,7 +24,6 @@
 #include "frozen_mutation.hh"
 #include "stdx.hh"
 #include "partition_snapshot_reader.hh"
-#include "schema_upgrader.hh"
 #include "partition_builder.hh"

 memtable::memtable(schema_ptr schema, dirty_memory_manager& dmm, memtable_list* memtable_list,
@@ -343,11 +342,8 @@ public:
                        bool digest_requested = _slice.options.contains<query::partition_slice::option::with_digest>();
                        auto mpsr = make_partition_snapshot_flat_reader(snp_schema, std::move(key_and_snp->first), std::move(cr),
                                        std::move(key_and_snp->second), digest_requested, region(), read_section(), mtbl(), streamed_mutation::forwarding::no);
-                        if (snp_schema->version() != schema()->version()) {
-                            _delegate = transform(std::move(mpsr), schema_upgrader(schema()));
-                        } else {
-                            _delegate = std::move(mpsr);
-                        }
+                        mpsr.upgrade_schema(schema());
+                        _delegate = std::move(mpsr);
                    } else {
                        _end_of_stream = true;
                    }
@@ -502,11 +498,8 @@ private:
            auto snp_schema = key_and_snp->second->schema();
            auto mpsr = make_partition_snapshot_flat_reader<partition_snapshot_accounter>(snp_schema, std::move(key_and_snp->first), std::move(cr),
                            std::move(key_and_snp->second), false, region(), read_section(), mtbl(), streamed_mutation::forwarding::no, *snp_schema, _flushed_memory);
-            if (snp_schema->version() != schema()->version()) {
-                _partition_reader = transform(std::move(mpsr), schema_upgrader(schema()));
-            } else {
-                _partition_reader = std::move(mpsr);
-            }
+            mpsr.upgrade_schema(schema());
+            _partition_reader = std::move(mpsr);
        }
    }
 public:
@@ -582,11 +575,8 @@ memtable::make_flat_reader(schema_ptr s,
        bool digest_requested = slice.options.contains<query::partition_slice::option::with_digest>();
        auto rd = make_partition_snapshot_flat_reader(snp_schema, std::move(dk), std::move(cr), std::move(snp), digest_requested,
                                                      *this, _read_section, shared_from_this(), fwd);
-        if (snp_schema->version() != s->version()) {
-            return transform(std::move(rd), schema_upgrader(s));
-        } else {
-            return rd;
-        }
+        rd.upgrade_schema(s);
+        return rd;
    } else {
        auto res = make_flat_mutation_reader<scanning_reader>(std::move(s), shared_from_this(), range, slice, pc, fwd_mr);
        if (fwd == streamed_mutation::forwarding::yes) {
@@ -701,13 +691,19 @@ bool memtable::is_flushed() const {
    return bool(_underlying);
 }

+void memtable_entry::upgrade_schema(const schema_ptr& s, mutation_cleaner& cleaner) { // brings the entry's partition and recorded schema to s
+    if (_schema != s) { // nothing to do when the entry already uses s
+        partition().upgrade(_schema, s, cleaner, no_cache_tracker);
+        _schema = s; // record the new schema only after the partition has been upgraded
+    }
+}
+
 void memtable::upgrade_entry(memtable_entry& e) {
    if (e._schema != _schema) {
        assert(!reclaiming_enabled());
        with_allocator(allocator(), [this, &e] {
          with_linearized_managed_bytes([&] {
-            e.partition().upgrade(e._schema, _schema, cleaner(), no_cache_tracker);
-            e._schema = _schema;
+            e.upgrade_schema(_schema, cleaner());
          });
        });
    }
--- a/memtable.hh
+++ b/memtable.hh
@@ -68,6 +68,10 @@ public:
    schema_ptr& schema() { return _schema; }
    partition_snapshot_ptr snapshot(memtable& mtbl);

+    // Makes the entry conform to given schema.
+    // Must be called under allocating section of the region which owns the entry.
+    void upgrade_schema(const schema_ptr&, mutation_cleaner&);
+
    size_t external_memory_usage_without_rows() const {
        return _key.key().external_memory_usage();
    }
--- a/message/messaging_service.cc
+++ b/message/messaging_service.cc
@@ -87,6 +87,7 @@
 #include "frozen_mutation.hh"
 #include "flat_mutation_reader.hh"
 #include "streaming/stream_manager.hh"
+#include "streaming/stream_mutation_fragments_cmd.hh"

 namespace netw {

@@ -281,25 +282,26 @@ void messaging_service::start_listen() {
    if (_compress_what != compress_what::none) {
        so.compressor_factory = &compressor_factory;
    }
-    so.streaming_domain = rpc::streaming_domain_type(0x55AA);
    // FIXME: we don't set so.tcp_nodelay, because we can't tell at this point whether the connection will come from a
    //        local or remote datacenter, and whether or not the connection will be used for gossip. We can fix
    //        the first by wrapping its server_socket, but not the second.
    auto limits = rpc_resource_limits(_mcfg.rpc_memory_limit);
    if (!_server[0]) {
-        auto listen = [&] (const gms::inet_address& a) {
+        auto listen = [&] (const gms::inet_address& a, rpc::streaming_domain_type sdomain) {
+            so.streaming_domain = sdomain;
            auto addr = ipv4_addr{a.raw_addr(), _port};
            return std::unique_ptr<rpc_protocol_server_wrapper>(new rpc_protocol_server_wrapper(*_rpc,
                    so, addr, limits));
        };
-        _server[0] = listen(_listen_address);
+        _server[0] = listen(_listen_address, rpc::streaming_domain_type(0x55AA));
        if (listen_to_bc) {
-            _server[1] = listen(utils::fb_utilities::get_broadcast_address());
+            _server[1] = listen(utils::fb_utilities::get_broadcast_address(), rpc::streaming_domain_type(0x66BB));
        }
    }

    if (!_server_tls[0]) {
-        auto listen = [&] (const gms::inet_address& a) {
+        auto listen = [&] (const gms::inet_address& a, rpc::streaming_domain_type sdomain) {
+            so.streaming_domain = sdomain;
            return std::unique_ptr<rpc_protocol_server_wrapper>(
                    [this, &so, &a, limits] () -> std::unique_ptr<rpc_protocol_server_wrapper>{
                if (_encrypt_what == encrypt_what::none) {
@@ -312,9 +314,9 @@ void messaging_service::start_listen() {
                        so, seastar::tls::listen(_credentials, addr, lo), limits);
            }());
        };
-        _server_tls[0] = listen(_listen_address);
+        _server_tls[0] = listen(_listen_address, rpc::streaming_domain_type(0x77CC));
        if (listen_to_bc) {
-            _server_tls[1] = listen(utils::fb_utilities::get_broadcast_address());
+            _server_tls[1] = listen(utils::fb_utilities::get_broadcast_address(), rpc::streaming_domain_type(0x88DD));
        }
    }
    // Do this on just cpu 0, to avoid duplicate logs.
@@ -592,6 +594,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
        opts.compressor_factory = &compressor_factory;
    }
    opts.tcp_nodelay = must_tcp_nodelay;
+    opts.reuseaddr = true;

    auto client = must_encrypt ?
                    ::make_shared<rpc_protocol_client_wrapper>(*_rpc, std::move(opts),
@@ -651,23 +654,27 @@ std::unique_ptr<messaging_service::rpc_protocol_wrapper>& messaging_service::rpc
    return _rpc;
 }

-rpc::sink<int32_t> messaging_service::make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment>& source) {
+rpc::sink<int32_t> messaging_service::make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>>& source) {
    return source.make_sink<netw::serializer, int32_t>();
 }

-future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>>
+future<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>>
 messaging_service::make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id) {
-    auto wrapper = get_rpc_client(messaging_verb::STREAM_MUTATION_FRAGMENTS, id);
-    rpc_protocol::client& rpc_client = *wrapper;
-    return wrapper->make_stream_sink<netw::serializer, frozen_mutation_fragment>().then([this, plan_id, schema_id, cf_id, estimated_partitions, reason, &rpc_client] (rpc::sink<frozen_mutation_fragment> sink) mutable {
-        auto rpc_handler = rpc()->make_client<rpc::source<int32_t> (utils::UUID, utils::UUID, utils::UUID, uint64_t, streaming::stream_reason, rpc::sink<frozen_mutation_fragment>)>(messaging_verb::STREAM_MUTATION_FRAGMENTS);
-        return rpc_handler(rpc_client , plan_id, schema_id, cf_id, estimated_partitions, reason, sink).then([sink] (rpc::source<int32_t> source) mutable {
-            return make_ready_future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>>(std::move(sink), std::move(source));
+    if (is_stopping()) {
+        return make_exception_future<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>>(rpc::closed_error());
+    }
+    auto rpc_client = get_rpc_client(messaging_verb::STREAM_MUTATION_FRAGMENTS, id);
+    return rpc_client->make_stream_sink<netw::serializer, frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>().then([this, plan_id, schema_id, cf_id, estimated_partitions, reason, rpc_client] (rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd> sink) mutable {
+        auto rpc_handler = rpc()->make_client<rpc::source<int32_t> (utils::UUID, utils::UUID, utils::UUID, uint64_t, streaming::stream_reason, rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>)>(messaging_verb::STREAM_MUTATION_FRAGMENTS);
+        return rpc_handler(*rpc_client , plan_id, schema_id, cf_id, estimated_partitions, reason, sink).then_wrapped([sink, rpc_client] (future<rpc::source<int32_t>> source) mutable {
+            return (source.failed() ? sink.close() : make_ready_future<>()).then([sink = std::move(sink), source = std::move(source)] () mutable {
+                return make_ready_future<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>>(std::move(sink), std::move(source.get0()));
+            });
        });
    });
 }

-void messaging_service::register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason>, rpc::source<frozen_mutation_fragment> source)>&& func) {
+void messaging_service::register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason>, rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>> source)>&& func) {
    register_handler(this, messaging_verb::STREAM_MUTATION_FRAGMENTS, std::move(func));
 }

--- a/message/messaging_service.hh
+++ b/message/messaging_service.hh
@@ -36,6 +36,7 @@
 #include "tracing/tracing.hh"
 #include "digest_algorithm.hh"
 #include "streaming/stream_reason.hh"
+#include "streaming/stream_mutation_fragments_cmd.hh"

 #include <seastar/net/tls.hh>

@@ -256,9 +257,9 @@ public:

    // Wrapper for STREAM_MUTATION_FRAGMENTS
    // The receiver of STREAM_MUTATION_FRAGMENTS sends status code to the sender to notify any error on the receiver side. The status code is of type int32_t. 0 means successful, -1 means error, other status code value are reserved for future use.
-    void register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason> reason_opt, rpc::source<frozen_mutation_fragment> source)>&& func);
-    rpc::sink<int32_t> make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment>& source);
-    future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>> make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id);
+    void register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason> reason_opt, rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>> source)>&& func);
+    rpc::sink<int32_t> make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>>& source);
+    future<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>> make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id);

    void register_stream_mutation_done(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id)>&& func);
    future<> send_stream_mutation_done(msg_addr id, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id);
--- a/mutation_partition.cc
+++ b/mutation_partition.cc
@@ -1162,6 +1162,7 @@ row::apply_monotonically(const column_definition& column, atomic_cell_or_collect
 void
 row::append_cell(column_id id, atomic_cell_or_collection value) {
    if (_type == storage_type::vector && id < max_vector_size) {
+        assert(_storage.vector.v.size() <= id);
        _storage.vector.v.resize(id);
        _storage.vector.v.emplace_back(cell_and_hash{std::move(value), cell_hash_opt()});
        _storage.vector.present.set(id);
--- a/mutation_partition.hh
+++ b/mutation_partition.hh
@@ -387,7 +387,7 @@ public:
        if (is_missing() || _ttl == dead) {
            return false;
        }
-        if (_ttl != no_ttl && _expiry < now) {
+        if (_ttl != no_ttl && _expiry <= now) {
            return false;
        }
        return _timestamp > t.timestamp;
@@ -397,7 +397,7 @@ public:
        if (_ttl == dead) {
            return true;
        }
-        return _ttl != no_ttl && _expiry < now;
+        return _ttl != no_ttl && _expiry <= now;
    }
    // Can be called only when is_live().
    bool is_expiring() const {
@@ -435,7 +435,7 @@ public:
            _timestamp = api::missing_timestamp;
            return false;
        }
-        if (_ttl > no_ttl && _expiry < now) {
+        if (_ttl > no_ttl && _expiry <= now) {
            _expiry -= _ttl;
            _ttl = dead;
        }
--- a/mutation_query.cc
+++ b/mutation_query.cc
@@ -31,7 +31,7 @@ reconcilable_result::reconcilable_result()
    : _row_count(0)
 { }

-reconcilable_result::reconcilable_result(uint32_t row_count, std::vector<partition> p, query::short_read short_read,
+reconcilable_result::reconcilable_result(uint32_t row_count, utils::chunked_vector<partition> p, query::short_read short_read,
                                         query::result_memory_tracker memory_tracker)
    : _row_count(row_count)
    , _short_read(short_read)
@@ -39,11 +39,11 @@ reconcilable_result::reconcilable_result(uint32_t row_count, std::vector<partiti
    , _partitions(std::move(p))
 { }

-const std::vector<partition>& reconcilable_result::partitions() const {
+const utils::chunked_vector<partition>& reconcilable_result::partitions() const {
    return _partitions;
 }

-std::vector<partition>& reconcilable_result::partitions() {
+utils::chunked_vector<partition>& reconcilable_result::partitions() {
    return _partitions;
 }

--- a/mutation_query.hh
+++ b/mutation_query.hh
@@ -27,6 +27,7 @@
 #include "frozen_mutation.hh"
 #include "db/timeout_clock.hh"
 #include "querier.hh"
+#include "utils/chunked_vector.hh"
 #include <seastar/core/execution_stage.hh>

 class reconcilable_result;
@@ -72,17 +73,17 @@ class reconcilable_result {
    uint32_t _row_count;
    query::short_read _short_read;
    query::result_memory_tracker _memory_tracker;
-    std::vector<partition> _partitions;
+    utils::chunked_vector<partition> _partitions;
 public:
    ~reconcilable_result();
    reconcilable_result();
    reconcilable_result(reconcilable_result&&) = default;
    reconcilable_result& operator=(reconcilable_result&&) = default;
-    reconcilable_result(uint32_t row_count, std::vector<partition> partitions, query::short_read short_read,
+    reconcilable_result(uint32_t row_count, utils::chunked_vector<partition> partitions, query::short_read short_read,
                        query::result_memory_tracker memory_tracker = { });

-    const std::vector<partition>& partitions() const;
-    std::vector<partition>& partitions();
+    const utils::chunked_vector<partition>& partitions() const;
+    utils::chunked_vector<partition>& partitions();

    uint32_t row_count() const {
        return _row_count;
@@ -112,7 +113,7 @@ class reconcilable_result_builder {
    const schema& _schema;
    const query::partition_slice& _slice;

-    std::vector<partition> _result;
+    utils::chunked_vector<partition> _result;
    uint32_t _live_rows{};

    bool _has_ck_selector{};
--- a/mutation_reader.cc
+++ b/mutation_reader.cc
@@ -764,6 +764,8 @@ class foreign_reader : public flat_mutation_reader::impl {
    }

    void update_buffer_with(foreign_unique_ptr<fragment_buffer> buffer, bool end_of_steam);
+
+    static future<> ensure_buffer_contains_all_fragments_for_last_pos(flat_mutation_reader& reader, fragment_buffer& buffer);
 public:
    foreign_reader(schema_ptr schema,
            foreign_unique_ptr<flat_mutation_reader> reader,
@@ -799,6 +801,39 @@ void foreign_reader::update_buffer_with(foreign_unique_ptr<fragment_buffer> buff
    }
 }

+future<> foreign_reader::ensure_buffer_contains_all_fragments_for_last_pos(flat_mutation_reader& reader, fragment_buffer& buffer) {
+    // Tops up `buffer` from `reader` until the buffer may safely end; only a trailing range tombstone needs this.
+    if (buffer.empty() || !buffer.back().is_range_tombstone()) {
+        return make_ready_future<>();
+    }
+    auto stop = [&reader, &buffer] {
+        if (reader.is_buffer_empty()) {
+            return reader.is_end_of_stream();
+        }
+        if (!buffer.back().is_range_tombstone()) {
+            return true;
+        }
+        const auto next_pos = reader.peek_buffer().position();
+        const auto& last_key = buffer.back().key();
+        // Never end the buffer on a non-full prefix key: a recreated reader
+        // continues from *after* the last key seen, which for a prefix would
+        // exclude all clustering positions that fall into the prefix.
+        if (!last_key.is_full(*reader.schema())) {
+            return false;
+        }
+        // NOTE(review): a non-clustered next position (e.g. partition end) is assumed to carry no key, so skip key().
+        return next_pos.region() != partition_region::clustered || !next_pos.key().equal(*reader.schema(), last_key);
+    };
+
+    return do_until(stop, [&reader, &buffer] {
+        if (reader.is_buffer_empty()) {
+            return reader.fill_buffer(db::no_timeout);
+        }
+        buffer.emplace_back(reader.pop_mutation_fragment());
+        return make_ready_future<>();
+    });
+}
+
 foreign_reader::foreign_reader(schema_ptr schema,
        foreign_unique_ptr<flat_mutation_reader> reader,
        streamed_mutation::forwarding fwd_sm)
@@ -896,9 +931,29 @@ future<foreign_ptr<std::unique_ptr<flat_mutation_reader>>> foreign_reader::pause
            if (pending_next_partition) {
                reader->next_partition();
            }
-            return make_ready_future<foreign_unique_ptr<fragment_buffer>, bool>(
-                    std::make_unique<fragment_buffer>(reader->detach_buffer()),
-                    reader->is_end_of_stream());
+            auto buffer = reader->detach_buffer();
+            if (buffer.empty() || !buffer.back().is_range_tombstone()) {
+                return make_ready_future<foreign_unique_ptr<fragment_buffer>, bool>(
+                        std::make_unique<fragment_buffer>(std::move(buffer)),
+                        reader->is_end_of_stream());
+            }
+            // When the reader is recreated (after having been evicted) we
+            // recreate it such that it starts reading from *after* the last
+            // seen fragment's position. If the last seen fragment is a range
+            // tombstone it is *not* guaranteed that the next fragments in the
+            // data stream have positions strictly greater than the range
+            // tombstone's. If the reader is evicted and has to be recreated,
+            // these fragments would be then skipped as the read would continue
+            // after their position.
+            // To avoid this ensure that the buffer contains *all* fragments for
+            // the last seen position.
+            return do_with(std::move(buffer), [reader] (fragment_buffer& buffer) mutable {
+                return ensure_buffer_contains_all_fragments_for_last_pos(*reader, buffer).then([reader, &buffer] () mutable {
+                    return make_ready_future<foreign_unique_ptr<fragment_buffer>, bool>(
+                            std::make_unique<fragment_buffer>(std::move(buffer)),
+                            reader->is_end_of_stream() && reader->is_buffer_empty());
+                });
+            });
        });
    }).then([this] (foreign_unique_ptr<fragment_buffer>&& buffer, bool end_of_stream) mutable {
        update_buffer_with(std::move(buffer), end_of_stream);
--- a/partition_version.cc
+++ b/partition_version.cc
@@ -172,6 +172,9 @@ tombstone partition_entry::partition_tombstone() const {

 partition_snapshot::~partition_snapshot() {
    with_allocator(region().allocator(), [this] {
+        if (_locked) {
+            touch();
+        }
        if (_version && _version.is_unique_owner()) {
            auto v = &*_version;
            _version = {};
@@ -268,6 +271,7 @@ partition_entry::~partition_entry() {
        return;
    }
    if (_snapshot) {
+        assert(!_snapshot->is_locked());
        _snapshot->_version = std::move(_version);
        _snapshot->_version.mark_as_unique_owner();
        _snapshot->_entry = nullptr;
@@ -284,6 +288,7 @@ stop_iteration partition_entry::clear_gently(cache_tracker* tracker) noexcept {
    }

    if (_snapshot) {
+        assert(!_snapshot->is_locked());
        _snapshot->_version = std::move(_version);
        _snapshot->_version.mark_as_unique_owner();
        _snapshot->_entry = nullptr;
@@ -311,6 +316,7 @@ stop_iteration partition_entry::clear_gently(cache_tracker* tracker) noexcept {
 void partition_entry::set_version(partition_version* new_version)
 {
    if (_snapshot) {
+        assert(!_snapshot->is_locked());
        _snapshot->_version = std::move(_version);
        _snapshot->_entry = nullptr;
    }
@@ -459,7 +465,6 @@ public:

 coroutine partition_entry::apply_to_incomplete(const schema& s,
    partition_entry&& pe,
-    const schema& pe_schema,
    mutation_cleaner& pe_cleaner,
    logalloc::allocating_section& alloc,
    logalloc::region& reg,
@@ -479,10 +484,6 @@ coroutine partition_entry::apply_to_incomplete(const schema& s,
    // partitions where I saw 40% slow down.
    const bool preemptible = s.clustering_key_size() > 0;

-    if (s.version() != pe_schema.version()) {
-        pe.upgrade(pe_schema.shared_from_this(), s.shared_from_this(), pe_cleaner, no_cache_tracker);
-    }
-
    // When preemptible, later memtable reads could start using the snapshot before
    // snapshot's writes are made visible in cache, which would cause them to miss those writes.
    // So we cannot allow erasing when preemptible.
@@ -496,6 +497,7 @@ coroutine partition_entry::apply_to_incomplete(const schema& s,
        prev_snp = read(reg, tracker.cleaner(), s.shared_from_this(), &tracker, phase - 1);
    }
    auto dst_snp = read(reg, tracker.cleaner(), s.shared_from_this(), &tracker, phase);
+    dst_snp->lock();

    // Once we start updating the partition, we must keep all snapshots until the update completes,
    // otherwise partial writes would be published. So the scope of snapshots must enclose the scope
@@ -570,6 +572,7 @@ coroutine partition_entry::apply_to_incomplete(const schema& s,
                    auto has_next = src_cur.erase_and_advance();
                    acc.unpin_memory(size);
                    if (!has_next) {
+                        dst_snp->unlock();
                        return stop_iteration::yes;
                    }
                } while (!preemptible || !need_preempt());
@@ -661,6 +664,18 @@ partition_snapshot::range_tombstones()
        position_in_partition_view::after_all_clustered_rows());
 }

+void partition_snapshot::touch() noexcept {
+    // Eviction assumes older versions go before newer ones, so only a snapshot at the latest version is touched.
+    if (!_tracker || !at_latest_version()) {
+        return;
+    }
+    auto&& clustered = version()->partition().clustered_rows();
+    assert(!clustered.empty());
+    rows_entry& tail = *clustered.rbegin();
+    assert(tail.is_last_dummy());
+    _tracker->touch(tail);
+}
+
 std::ostream& operator<<(std::ostream& out, const partition_entry& e) {
    out << "{";
    bool first = true;
@@ -687,6 +702,7 @@ void partition_entry::evict(mutation_cleaner& cleaner) noexcept {
        return;
    }
    if (_snapshot) {
+        assert(!_snapshot->is_locked());
        _snapshot->_version = std::move(_version);
        _snapshot->_version.mark_as_unique_owner();
        _snapshot->_entry = nullptr;
@@ -706,3 +722,18 @@ partition_snapshot_ptr::~partition_snapshot_ptr() {
        }
    }
 }
+
+void partition_snapshot::lock() noexcept {
+    // partition_entry::is_locked() assumes that if there is a locked snapshot,
+    // it is the one attached directly to the entry.
+    assert(at_latest_version());
+    _locked = true;
+}
+
+void partition_snapshot::unlock() noexcept {
+    // Locked snapshots must always be latest, is_locked() assumes that.
+    // Also, touch() is only effective when this snapshot is latest.
+    assert(at_latest_version());
+    _locked = false;
+    touch(); // Make the entry evictable again in case it was fully unlinked by eviction attempt.
+}
--- a/partition_version.hh
+++ b/partition_version.hh
@@ -303,6 +303,7 @@ private:
    mutation_cleaner* _cleaner;
    cache_tracker* _tracker;
    boost::intrusive::slist_member_hook<> _cleaner_hook;
+    bool _locked = false;
    friend class partition_entry;
    friend class mutation_cleaner_impl;
 public:
@@ -318,6 +319,22 @@ public:
    partition_snapshot& operator=(const partition_snapshot&) = delete;
    partition_snapshot& operator=(partition_snapshot&&) = delete;

+    // Makes the snapshot locked.
+    // See is_locked() for meaning.
+    // Can be called only when at_latest_version(). The snapshot must remain latest as long as it's locked.
+    void lock() noexcept;
+
+    // Makes the snapshot no longer locked.
+    // See is_locked() for meaning.
+    void unlock() noexcept;
+
+    // Tells whether the snapshot is locked.
+    // Locking the snapshot prevents it from getting detached from the partition entry.
+    // It also prevents the partition entry from being evicted.
+    bool is_locked() const {
+        return _locked;
+    }
+
    static partition_snapshot& container_of(partition_version_ref* ref) {
        return *boost::intrusive::get_parent_from_member(ref, &partition_snapshot::_version);
    }
@@ -344,6 +361,9 @@ public:
    // to the latest version.
    stop_iteration slide_to_oldest() noexcept;

+    // Brings the snapshot to the front of the LRU.
+    void touch() noexcept;
+
    // Must be called after snapshot's original region is merged into a different region
    // before the original region is destroyed, unless the snapshot is destroyed earlier.
    void migrate(logalloc::region* region, mutation_cleaner* cleaner) noexcept {
@@ -503,9 +523,18 @@ public:
        return _version->all_elements_reversed();
    }

+    // Tells whether this entry is locked.
+    // Locked entries are undergoing an update and should not have their snapshots
+    // detached from the entry.
+    // Certain methods can only be called when !is_locked().
+    bool is_locked() const {
+        return _snapshot && _snapshot->is_locked();
+    }
+
    // Strong exception guarantees.
    // Assumes this instance and mp are fully continuous.
    // Use only on non-evictable entries.
+    // Must not be called when is_locked().
    void apply(const schema& s, const mutation_partition& mp, const schema& mp_schema);
    void apply(const schema& s, mutation_partition&& mp, const schema& mp_schema);

@@ -526,11 +555,14 @@ public:
    // such that if the operation is retried (possibly many times) and eventually
    // succeeds the result will be as if the first attempt didn't fail.
    //
+    // The schema of pe must conform to s.
+    //
    // Returns a coroutine object representing the operation.
    // The coroutine must be resumed with the region being unlocked.
+    //
+    // The coroutine cannot run concurrently with other apply() calls.
    coroutine apply_to_incomplete(const schema& s,
        partition_entry&& pe,
-        const schema& pe_schema,
        mutation_cleaner& pe_cleaner,
        logalloc::allocating_section&,
        logalloc::region&,
@@ -539,6 +571,7 @@ public:
        real_dirty_memory_accounter&);

    // If this entry is evictable, cache_tracker must be provided.
+    // Must not be called when is_locked().
    partition_version& add_version(const schema& s, cache_tracker*);

    // Returns a reference to existing version with an active snapshot of given phase
@@ -568,9 +601,11 @@ public:
    tombstone partition_tombstone() const;

    // needs to be called with reclaiming disabled
+    // Must not be called when is_locked().
    void upgrade(schema_ptr from, schema_ptr to, mutation_cleaner&, cache_tracker*);

    // Snapshots with different values of phase will point to different partition_version objects.
+    // When is_locked(), read() can only be called with a phase which is <= the phase of the current snapshot.
    partition_snapshot_ptr read(logalloc::region& region,
        mutation_cleaner&,
        schema_ptr entry_schema,
--- a/position_in_partition.hh
+++ b/position_in_partition.hh
@@ -151,6 +151,7 @@ public:
        return {partition_region::clustered, 1, &ck};
    }

+    partition_region region() const { return _type; }
    bool is_partition_start() const { return _type == partition_region::partition_start; }
    bool is_partition_end() const { return _type == partition_region::partition_end; }
    bool is_static_row() const { return _type == partition_region::static_row; }
--- a/querier.cc
+++ b/querier.cc
@@ -288,11 +288,11 @@ static void insert_querier(

    auto& e = entries.emplace_back(key, std::move(q), expires);
    e.set_pos(--entries.end());
+    ++stats.population;

    if (auto irh = sem.register_inactive_read(std::make_unique<querier_inactive_read>(entries, e.pos(), stats))) {
        e.set_inactive_handle(irh);
        index.insert(e);
-        ++stats.population;
    }
 }

--- a/read_context.hh
+++ b/read_context.hh
@@ -38,7 +38,7 @@ class autoupdating_underlying_reader final {
    row_cache& _cache;
    read_context& _read_context;
    stdx::optional<flat_mutation_reader> _reader;
-    utils::phased_barrier::phase_type _reader_creation_phase;
+    utils::phased_barrier::phase_type _reader_creation_phase = 0;
    dht::partition_range _range = { };
    stdx::optional<dht::decorated_key> _last_key;
    stdx::optional<dht::decorated_key> _new_last_key;
@@ -105,7 +105,6 @@ public:
        return make_ready_future<>();
    }
    utils::phased_barrier::phase_type creation_phase() const {
-        assert(_reader);
        return _reader_creation_phase;
    }
    const dht::partition_range& range() const {
@@ -192,7 +191,7 @@ public:
    const dht::decorated_key& key() const { return *_key; }
    void on_underlying_created() { ++_underlying_created; }
    bool digest_requested() const { return _slice.options.contains<query::partition_slice::option::with_digest>(); }
-private:
+public:
    future<> ensure_underlying(db::timeout_clock::time_point timeout) {
        if (_underlying_snapshot) {
            return create_underlying(true, timeout);
@@ -211,18 +210,6 @@ public:
        _underlying_snapshot = {};
        _key = dk;
    }
-    // Fast forwards the underlying streamed_mutation to given range.
-    future<> fast_forward_to(position_range range, db::timeout_clock::time_point timeout) {
-        return ensure_underlying(timeout).then([this, range = std::move(range), timeout] {
-            return _underlying.underlying().fast_forward_to(std::move(range), timeout);
-        });
-    }
-    // Gets the next fragment from the underlying reader
-    future<mutation_fragment_opt> get_next_fragment(db::timeout_clock::time_point timeout) {
-        return ensure_underlying(timeout).then([this, timeout] {
-            return _underlying.underlying()(timeout);
-        });
-    }
 };

 }
--- a/repair/repair.cc
+++ b/repair/repair.cc
@@ -814,8 +814,10 @@ static future<> repair_cf_range(repair_info& ri,
                    // still do our best to repair available replicas.
                    std::vector<gms::inet_address> live_neighbors;
                    std::vector<partition_checksum> live_neighbors_checksum;
+                    bool local_checksum_failed = false;
                    for (unsigned i = 0; i < checksums.size(); i++) {
                        if (checksums[i].failed()) {
+                            local_checksum_failed |= (i == 0);
                            rlogger.warn(
                                "Checksum of range {} on {} failed: {}",
                                range,
@@ -831,7 +833,7 @@ static future<> repair_cf_range(repair_info& ri,
                            live_neighbors_checksum.push_back(checksums[i].get0());
                        }
                    }
-                    if (checksums[0].failed() || live_neighbors.empty()) {
+                    if (local_checksum_failed || live_neighbors.empty()) {
                        return make_ready_future<>();
                    }
                    // If one of the available checksums is different, repair
--- a/row_cache.cc
+++ b/row_cache.cc
@@ -32,7 +32,6 @@
 #include <sys/sdt.h>
 #include "stdx.hh"
 #include "read_context.hh"
-#include "schema_upgrader.hh"
 #include "dirty_memory_manager.hh"
 #include "cache_flat_mutation_reader.hh"
 #include "real_dirty_memory_accounter.hh"
@@ -350,13 +349,11 @@ future<> read_context::create_underlying(bool skip_first_fragment, db::timeout_c

 static flat_mutation_reader read_directly_from_underlying(read_context& reader) {
    flat_mutation_reader res = make_delegating_reader(reader.underlying().underlying());
-    if (reader.schema()->version() != reader.underlying().underlying().schema()->version()) {
-        res = transform(std::move(res), schema_upgrader(reader.schema()));
-    }
    if (reader.fwd() == streamed_mutation::forwarding::no) {
        res = make_nonforwardable(std::move(res), true);
    }
-    return std::move(res);
+    res.upgrade_schema(reader.schema());
+    return res;
 }

 // Reader which populates the cache using data from the delegate.
@@ -947,7 +944,6 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater)
    });

    return seastar::async([this, &m, updater = std::move(updater), real_dirty_acc = std::move(real_dirty_acc)] () mutable {
-        coroutine update;
        size_t size_entry;
        // In case updater fails, we must bring the cache to consistency without deferring.
        auto cleanup = defer([&m, this] {
@@ -955,6 +951,7 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater)
            _prev_snapshot_pos = {};
            _prev_snapshot = {};
        });
+        coroutine update; // Destroy before cleanup to release snapshots before invalidating.
        partition_presence_checker is_present = _prev_snapshot->make_partition_presence_checker();
        while (!m.partitions.empty()) {
            with_allocator(_tracker.allocator(), [&] () {
@@ -1026,8 +1023,10 @@ future<> row_cache::update(external_updater eu, memtable& m) {
        if (cache_i != partitions_end() && cache_i->key().equal(*_schema, mem_e.key())) {
            cache_entry& entry = *cache_i;
            upgrade_entry(entry);
+            assert(entry._schema == _schema);
            _tracker.on_partition_merge();
-            return entry.partition().apply_to_incomplete(*_schema, std::move(mem_e.partition()), *mem_e.schema(), _tracker.memtable_cleaner(),
+            mem_e.upgrade_schema(_schema, _tracker.memtable_cleaner());
+            return entry.partition().apply_to_incomplete(*_schema, std::move(mem_e.partition()), _tracker.memtable_cleaner(),
                alloc, _tracker.region(), _tracker, _underlying_phase, acc);
        } else if (cache_i->continuous()
                   || with_allocator(standard_allocator(), [&] { return is_present(mem_e.key()); })
@@ -1039,7 +1038,8 @@ future<> row_cache::update(external_updater eu, memtable& m) {
            entry->set_continuous(cache_i->continuous());
            _tracker.insert(*entry);
            _partitions.insert_before(cache_i, *entry);
-            return entry->partition().apply_to_incomplete(*_schema, std::move(mem_e.partition()), *mem_e.schema(), _tracker.memtable_cleaner(),
+            mem_e.upgrade_schema(_schema, _tracker.memtable_cleaner());
+            return entry->partition().apply_to_incomplete(*_schema, std::move(mem_e.partition()), _tracker.memtable_cleaner(),
                alloc, _tracker.region(), _tracker, _underlying_phase, acc);
        } else {
            return make_empty_coroutine();
@@ -1136,8 +1136,8 @@ future<> row_cache::invalidate(external_updater eu, dht::partition_range_vector&
    });
 }

-void row_cache::evict(const dht::partition_range& range) {
-    invalidate_unwrapped(range);
+void row_cache::evict() {
+    while (_tracker.region().evict_some() == memory::reclaiming_result::reclaimed_something) {}
 }

 void row_cache::invalidate_unwrapped(const dht::partition_range& range) {
@@ -1224,8 +1224,11 @@ void rows_entry::on_evicted(cache_tracker& tracker) noexcept {
        partition_version& pv = partition_version::container_of(mutation_partition::container_of(
            mutation_partition::rows_type::container_of_only_member(*it)));
        if (pv.is_referenced_from_entry()) {
-            cache_entry& ce = cache_entry::container_of(partition_entry::container_of(pv));
-            ce.on_evicted(tracker);
+            partition_entry& pe = partition_entry::container_of(pv);
+            if (!pe.is_locked()) {
+                cache_entry& ce = cache_entry::container_of(pe);
+                ce.on_evicted(tracker);
+            }
        }
    }
 }
@@ -1246,13 +1249,12 @@ flat_mutation_reader cache_entry::do_read(row_cache& rc, read_context& reader) {
    auto snp = _pe.read(rc._tracker.region(), rc._tracker.cleaner(), _schema, &rc._tracker, reader.phase());
    auto ckr = query::clustering_key_filter_ranges::get_ranges(*_schema, reader.slice(), _key.key());
    auto r = make_cache_flat_mutation_reader(_schema, _key, std::move(ckr), rc, reader.shared_from_this(), std::move(snp));
-    if (reader.schema()->version() != _schema->version()) {
-        r = transform(std::move(r), schema_upgrader(reader.schema()));
-    }
    if (reader.fwd() == streamed_mutation::forwarding::yes) {
        r = make_forwardable(std::move(r));
    }
-    return std::move(r);
+    r.upgrade_schema(rc.schema());
+    r.upgrade_schema(reader.schema());
+    return r;
 }

 const schema_ptr& row_cache::schema() const {
@@ -1260,7 +1262,7 @@ const schema_ptr& row_cache::schema() const {
 }

 void row_cache::upgrade_entry(cache_entry& e) {
-    if (e._schema != _schema) {
+    if (e._schema != _schema && !e.partition().is_locked()) {
        auto& r = _tracker.region();
        assert(!r.reclaiming_enabled());
        with_allocator(r.allocator(), [this, &e] {
--- a/row_cache.hh
+++ b/row_cache.hh
@@ -549,12 +549,12 @@ public:
    future<> invalidate(external_updater, const dht::partition_range& = query::full_partition_range);
    future<> invalidate(external_updater, dht::partition_range_vector&&);

-    // Evicts entries from given range in cache.
+    // Evicts entries from cache.
    //
    // Note that this does not synchronize with the underlying source,
    // it is assumed that the underlying source didn't change.
    // If it did, use invalidate() instead.
-    void evict(const dht::partition_range& = query::full_partition_range);
+    void evict();

    size_t partitions() const {
        return _partitions.size();
--- a/2
+++ b/2
@@ -59,7 +59,7 @@ def sh_command(*args):

 def get_json_from_url(path):
    data = sh_command("curl", "-s", "-X", "GET", path)
-    return json.loads(data)
+    return json.loads(data.decode('utf-8'))

 def get_api(path):
    return get_json_from_url("http://" + api_address + path)
--- a/2
+++ b/2
--- a/service/cache_hitrate_calculator.hh
+++ b/service/cache_hitrate_calculator.hh
@@ -28,11 +28,25 @@
 namespace service {

 class cache_hitrate_calculator : public seastar::async_sharded_service<cache_hitrate_calculator> {
+    struct stat {
+        float h = 0;
+        float m = 0;
+        stat& operator+=(stat& o) {
+            h += o.h;
+            m += o.m;
+            return *this;
+        }
+    };
+
    seastar::sharded<database>& _db;
    seastar::sharded<cache_hitrate_calculator>& _me;
    timer<lowres_clock> _timer;
    bool _stopped = false;
    float _diff = 0;
+    std::unordered_map<utils::UUID, stat> _rates;
+    size_t _slen = 0;
+    std::string _gstate;
+    future<> _done = make_ready_future();

    future<lowres_clock::duration> recalculate_hitrates();
    void recalculate_timer();
--- a/service/client_state.cc
+++ b/service/client_state.cc
@@ -181,7 +181,7 @@ future<> service::client_state::has_access(const sstring& ks, auth::permission p
        for (auto cf : { db::system_keyspace::LOCAL, db::system_keyspace::PEERS }) {
            tmp.insert(auth::make_data_resource(db::system_keyspace::NAME, cf));
        }
-        for (auto cf : db::schema_tables::ALL) {
+        for (auto cf : db::schema_tables::all_table_names()) {
            tmp.insert(auth::make_data_resource(db::schema_tables::NAME, cf));
        }
        return tmp;
--- a/service/migration_manager.cc
+++ b/service/migration_manager.cc
@@ -204,6 +204,10 @@ future<> migration_manager::maybe_schedule_schema_pull(const utils::UUID& their_
                return make_ready_future<>();
            }
            const auto* value = ep_state->get_application_state_ptr(gms::application_state::SCHEMA);
+            if (!value) {
+                mlogger.debug("application_state::SCHEMA does not exist for {}, not submitting migration task", endpoint);
+                return make_ready_future<>();
+            }
            utils::UUID current_version{value->value};
            auto& db = proxy.get_db().local();
            if (db.get_version() == current_version) {
@@ -529,6 +533,10 @@ future<> migration_manager::announce_new_column_family(schema_ptr cfm, api::time
        if (db.has_schema(cfm->ks_name(), cfm->cf_name())) {
            throw exceptions::already_exists_exception(cfm->ks_name(), cfm->cf_name());
        }
+        if (db.column_family_exists(cfm->id())) {
+            throw exceptions::invalid_request_exception(sprint("Table with ID %s already exists: %s", cfm->id(), db.find_schema(cfm->id())));
+        }
+
        mlogger.info("Create new ColumnFamily: {}", cfm);
        return db::schema_tables::make_create_table_mutations(keyspace.metadata(), cfm, timestamp)
            .then([announce_locally, this] (auto&& mutations) {
--- a/service/misc_services.cc
+++ b/service/misc_services.cc
@@ -92,7 +92,7 @@ cache_hitrate_calculator::cache_hitrate_calculator(seastar::sharded<database>& d
 {}

 void cache_hitrate_calculator::recalculate_timer() {
-    recalculate_hitrates().then_wrapped([p = shared_from_this()] (future<lowres_clock::duration> f) {
+    _done = recalculate_hitrates().then_wrapped([p = shared_from_this()] (future<lowres_clock::duration> f) {
        lowres_clock::duration d;
        if (f.failed()) {
            d = std::chrono::milliseconds(2000);
@@ -112,21 +112,11 @@ void cache_hitrate_calculator::run_on(size_t master, lowres_clock::duration d) {
 }

 future<lowres_clock::duration> cache_hitrate_calculator::recalculate_hitrates() {
-    struct stat {
-        float h = 0;
-        float m = 0;
-        stat& operator+=(stat& o) {
-            h += o.h;
-            m += o.m;
-            return *this;
-        }
-    };
-
-    static auto non_system_filter = [&] (const std::pair<utils::UUID, lw_shared_ptr<column_family>>& cf) {
+    auto non_system_filter = [&] (const std::pair<utils::UUID, lw_shared_ptr<column_family>>& cf) {
        return _db.local().find_keyspace(cf.second->schema()->ks_name()).get_replication_strategy().get_type() != locator::replication_strategy_type::local;
    };

-    auto cf_to_cache_hit_stats = [] (database& db) {
+    auto cf_to_cache_hit_stats = [non_system_filter] (database& db) {
        return boost::copy_range<std::unordered_map<utils::UUID, stat>>(db.get_column_families() | boost::adaptors::filtered(non_system_filter) |
                boost::adaptors::transformed([]  (const std::pair<utils::UUID, lw_shared_ptr<column_family>>& cf) {
            auto& stats = cf.second->get_row_cache().stats();
@@ -141,17 +131,20 @@ future<lowres_clock::duration> cache_hitrate_calculator::recalculate_hitrates()
        return std::move(a);
    };

-    return _db.map_reduce0(cf_to_cache_hit_stats, std::unordered_map<utils::UUID, stat>(), sum_stats_per_cf).then([this] (std::unordered_map<utils::UUID, stat> rates) mutable {
+    return _db.map_reduce0(cf_to_cache_hit_stats, std::unordered_map<utils::UUID, stat>(), sum_stats_per_cf).then([this, non_system_filter] (std::unordered_map<utils::UUID, stat> rates) mutable {
        _diff = 0;
+        _gstate.reserve(_slen); // assume length did not change from previous iteration
+        _slen = 0;
+        _rates = std::move(rates);
        // set calculated rates on all shards
-        return _db.invoke_on_all([this, rates = std::move(rates), cpuid = engine().cpu_id()] (database& db) {
-            sstring gstate;
-            for (auto& cf : db.get_column_families() | boost::adaptors::filtered(non_system_filter)) {
-                auto it = rates.find(cf.first);
-                if (it == rates.end()) { // a table may be added before map/reduce compltes and this code runs
-                    continue;
+        return _db.invoke_on_all([this, cpuid = engine().cpu_id(), non_system_filter] (database& db) {
+            return do_for_each(_rates, [this, cpuid, &db] (auto&& r) mutable {
+                auto it = db.get_column_families().find(r.first);
+                if (it == db.get_column_families().end()) { // a table may be added before map/reduce completes and this code runs
+                    return;
                }
-                stat s = it->second;
+                auto& cf = *it;
+                stat& s = r.second;
                float rate = 0;
                if (s.h) {
                    rate = s.h / (s.h + s.m);
@@ -159,31 +152,33 @@ future<lowres_clock::duration> cache_hitrate_calculator::recalculate_hitrates()
                if (engine().cpu_id() == cpuid) {
                    // calculate max difference between old rate and new one for all cfs
                    _diff = std::max(_diff, std::abs(float(cf.second->get_global_cache_hit_rate()) - rate));
-                    gstate += sprint("%s.%s:%f;", cf.second->schema()->ks_name(), cf.second->schema()->cf_name(), rate);
+                    _gstate += sprint("%s.%s:%.6f;", cf.second->schema()->ks_name(), cf.second->schema()->cf_name(), rate);
                }
                cf.second->set_global_cache_hit_rate(cache_temperature(rate));
-            }
-            if (gstate.size()) {
-                auto& g = gms::get_local_gossiper();
-                auto& ss = get_local_storage_service();
-                return g.add_local_application_state(gms::application_state::CACHE_HITRATES, ss.value_factory.cache_hitrates(std::move(gstate)));
-            }
-            return make_ready_future<>();
+            });
        });
    }).then([this] {
-        // if max difference during this round is big schedule next recalculate earlier
-        if (_diff < 0.01) {
-            return std::chrono::milliseconds(2000);
-        } else {
-            return std::chrono::milliseconds(500);
-        }
+        auto& g = gms::get_local_gossiper();
+        auto& ss = get_local_storage_service();
+        _slen = _gstate.size();
+        return g.add_local_application_state(gms::application_state::CACHE_HITRATES, ss.value_factory.cache_hitrates(_gstate)).then([this] {
+            // if max difference during this round is big schedule next recalculate earlier
+            if (_diff < 0.01) {
+                return std::chrono::milliseconds(2000);
+            } else {
+                return std::chrono::milliseconds(500);
+            }
+        });
+    }).finally([this] {
+        _gstate = std::string(); // free memory, do not trust clear() to do that for string
+        _rates.clear();
    });
 }

 future<> cache_hitrate_calculator::stop() {
    _timer.cancel();
    _stopped = true;
-    return make_ready_future<>();
+    return std::move(_done);
 }


--- a/service/storage_proxy.cc
+++ b/service/storage_proxy.cc
@@ -382,7 +382,7 @@ public:
    std::chrono::microseconds calculate_delay(db::view::update_backlog backlog) {
        constexpr auto delay_limit_us = 1000000;
        auto adjust = [] (float x) { return x * x * x; };
-        auto budget = std::min(std::chrono::microseconds(0), std::chrono::microseconds(_expire_timer.get_timeout() - storage_proxy::clock_type::now()));
+        auto budget = std::max(std::chrono::microseconds(0), std::chrono::microseconds(_expire_timer.get_timeout() - storage_proxy::clock_type::now()));
        return std::min(
                budget,
                std::chrono::microseconds(uint32_t(adjust(backlog.relative_size()) * delay_limit_us)));
@@ -616,9 +616,7 @@ void storage_proxy::maybe_update_view_backlog_of(gms::inet_address replica, stdx
 }

 db::view::update_backlog storage_proxy::get_view_update_backlog() const {
-    auto memory_backlog = get_db().local().get_view_update_backlog();
-    auto hints_backlog = db::view::update_backlog{_hints_for_views_manager.backlog_size(), _hints_for_views_manager.max_backlog_size()};
-    return _max_view_update_backlog.add_fetch(engine().cpu_id(), std::max(memory_backlog, hints_backlog));
+    return _max_view_update_backlog.add_fetch(engine().cpu_id(), get_db().local().get_view_update_backlog());
 }

 db::view::update_backlog storage_proxy::get_backlog_of(gms::inet_address ep) const {
@@ -1449,6 +1447,22 @@ future<> storage_proxy::mutate_begin(std::vector<unique_response_handler> ids, d
                                     stdx::optional<clock_type::time_point> timeout_opt) {
    return parallel_for_each(ids, [this, cl, timeout_opt] (unique_response_handler& protected_response) {
        auto response_id = protected_response.id;
+        // This function, mutate_begin(), is called after a preemption point
+        // so it's possible that other code besides our caller just ran. In
+        // particular, Scylla may have noticed that a remote node went down,
+        // called storage_proxy::on_down(), and removed some of the ongoing
+        // handlers, including this id. If this happens, we need to ignore
+        // this id - not try to look it up or start a send.
+        if (_response_handlers.find(response_id) == _response_handlers.end()) {
+            protected_response.release(); // Don't try to remove this id again
+            // Requests that time-out normally below after response_wait()
+            // result in an exception (see ~abstract_write_response_handler())
+            // However, here we no longer have the handler or its information
+            // to put in the exception. The exception is not needed for
+            // correctness (e.g., hints are written by timeout_cb(), not
+            // because of an exception here).
+            return make_exception_future<>(std::runtime_error("unstarted write cancelled"));
+        }
        // it is better to send first and hint afterwards to reduce latency
        // but request may complete before hint_to_dead_endpoints() is called and
        // response_id handler will be removed, so we will have to do hint with separate
@@ -2737,8 +2751,8 @@ public:

        // build reconcilable_result from reconciled data
        // traverse backwards since large keys are at the start
-        std::vector<partition> vec;
-        auto r = boost::accumulate(reconciled_partitions | boost::adaptors::reversed, std::ref(vec), [] (std::vector<partition>& a, const mutation_and_live_row_count& m_a_rc) {
+        utils::chunked_vector<partition> vec;
+        auto r = boost::accumulate(reconciled_partitions | boost::adaptors::reversed, std::ref(vec), [] (utils::chunked_vector<partition>& a, const mutation_and_live_row_count& m_a_rc) {
            a.emplace_back(partition(m_a_rc.live_row_count, freeze(m_a_rc.mut)));
            return std::ref(a);
        });
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -101,6 +101,7 @@ static const sstring ROLES_FEATURE = "ROLES";
 static const sstring LA_SSTABLE_FEATURE = "LA_SSTABLE_FORMAT";
 static const sstring STREAM_WITH_RPC_STREAM = "STREAM_WITH_RPC_STREAM";
 static const sstring MC_SSTABLE_FEATURE = "MC_SSTABLE_FORMAT";
+static const sstring CORRECT_STATIC_COMPACT_IN_MC = "CORRECT_STATIC_COMPACT_IN_MC";

 distributed<storage_service> _the_storage_service;

@@ -124,9 +125,27 @@ int get_generation_number() {
    return generation_number;
 }

-storage_service::storage_service(distributed<database>& db, sharded<auth::service>& auth_service, sharded<db::system_distributed_keyspace>& sys_dist_ks)
-        : _db(db)
+storage_service::storage_service(distributed<database>& db, sharded<auth::service>& auth_service, sharded<db::system_distributed_keyspace>& sys_dist_ks,
+        gms::feature_service& feature_service)
+        : _feature_service(feature_service)
+        , _db(db)
        , _auth_service(auth_service)
+        , _range_tombstones_feature(_feature_service, RANGE_TOMBSTONES_FEATURE)
+        , _large_partitions_feature(_feature_service, LARGE_PARTITIONS_FEATURE)
+        , _materialized_views_feature(_feature_service, MATERIALIZED_VIEWS_FEATURE)
+        , _counters_feature(_feature_service, COUNTERS_FEATURE)
+        , _indexes_feature(_feature_service, INDEXES_FEATURE)
+        , _digest_multipartition_read_feature(_feature_service, DIGEST_MULTIPARTITION_READ_FEATURE)
+        , _correct_counter_order_feature(_feature_service, CORRECT_COUNTER_ORDER_FEATURE)
+        , _schema_tables_v3(_feature_service, SCHEMA_TABLES_V3)
+        , _correct_non_compound_range_tombstones(_feature_service, CORRECT_NON_COMPOUND_RANGE_TOMBSTONES)
+        , _write_failure_reply_feature(_feature_service, WRITE_FAILURE_REPLY_FEATURE)
+        , _xxhash_feature(_feature_service, XXHASH_FEATURE)
+        , _roles_feature(_feature_service, ROLES_FEATURE)
+        , _la_sstable_feature(_feature_service, LA_SSTABLE_FEATURE)
+        , _stream_with_rpc_stream_feature(_feature_service, STREAM_WITH_RPC_STREAM)
+        , _mc_sstable_feature(_feature_service, MC_SSTABLE_FEATURE)
+        , _correct_static_compact_in_mc(_feature_service, CORRECT_STATIC_COMPACT_IN_MC)
        , _replicate_action([this] { return do_replicate_to_all_cores(); })
        , _update_pending_ranges_action([this] { return do_update_pending_ranges(); })
        , _sys_dist_ks(sys_dist_ks) {
@@ -137,6 +156,25 @@ storage_service::storage_service(distributed<database>& db, sharded<auth::servic
    commit_error.connect([this] { isolate_on_commit_error(); });
 }

+void storage_service::enable_all_features() {
+    _range_tombstones_feature.enable();
+    _large_partitions_feature.enable();
+    _materialized_views_feature.enable();
+    _counters_feature.enable();
+    _indexes_feature.enable();
+    _digest_multipartition_read_feature.enable();
+    _correct_counter_order_feature.enable();
+    _schema_tables_v3.enable();
+    _correct_non_compound_range_tombstones.enable();
+    _write_failure_reply_feature.enable();
+    _xxhash_feature.enable();
+    _roles_feature.enable();
+    _la_sstable_feature.enable();
+    _stream_with_rpc_stream_feature.enable();
+    _mc_sstable_feature.enable();
+    _correct_static_compact_in_mc.enable();
+}
+
 enum class node_external_status {
    UNKNOWN        = 0,
    STARTING       = 1,
@@ -210,7 +248,8 @@ sstring storage_service::get_config_supported_features() {
        LA_SSTABLE_FEATURE,
        STREAM_WITH_RPC_STREAM,
        MATERIALIZED_VIEWS_FEATURE,
-        INDEXES_FEATURE
+        INDEXES_FEATURE,
+        CORRECT_STATIC_COMPACT_IN_MC,
    };
    auto& config = service::get_local_storage_service()._db.local().get_config();
    if (config.enable_sstables_mc_format()) {
@@ -276,7 +315,7 @@ bool storage_service::should_bootstrap() {
 }

 // Runs inside seastar::async context
-void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints, bind_messaging_port do_bind) {
+void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints, const std::unordered_map<gms::inet_address, sstring>& loaded_peer_features, bind_messaging_port do_bind) {
    if (_joined) {
        return;
    }
@@ -306,25 +345,20 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
        if (!is_auto_bootstrap()) {
            throw std::runtime_error("Trying to replace_address with auto_bootstrap disabled will not work, check your configuration");
        }
-        _bootstrap_tokens = prepare_replacement_info().get0();
+        _bootstrap_tokens = prepare_replacement_info(loaded_peer_features).get0();
        app_states.emplace(gms::application_state::TOKENS, value_factory.tokens(_bootstrap_tokens));
        app_states.emplace(gms::application_state::STATUS, value_factory.hibernate(true));
    } else if (should_bootstrap()) {
-        check_for_endpoint_collision().get();
+        check_for_endpoint_collision(loaded_peer_features).get();
    } else {
        auto& gossiper = gms::get_local_gossiper();
        auto seeds = gms::get_local_gossiper().get_seeds();
        auto my_ep = get_broadcast_address();
-        auto peer_features = db::system_keyspace::load_peer_features().get0();
-        slogger.info("load_peer_features: peer_features size={}", peer_features.size());
-        for (auto& x : peer_features) {
-            slogger.info("load_peer_features: peer={}, supported_features={}", x.first, x.second);
-        }
        auto local_features = get_config_supported_features();

        if (seeds.count(my_ep)) {
            // This node is a seed node
-            if (peer_features.empty()) {
+            if (loaded_peer_features.empty()) {
                // This is a competely new seed node, skip the check
                slogger.info("Checking remote features skipped, since this node is a new seed node which knows nothing about the cluster");
            } else {
@@ -332,7 +366,7 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
                if (seeds.size() == 1) {
                    // This node is the only seed node, check features with system table
                    slogger.info("Checking remote features with system table, since this node is the only seed node");
-                    gossiper.check_knows_remote_features(local_features, peer_features);
+                    gossiper.check_knows_remote_features(local_features, loaded_peer_features);
                } else {
                    // More than one seed node in the seed list, do shadow round with other seed nodes
                    bool ok;
@@ -347,11 +381,11 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
                    }

                    if (ok) {
-                        gossiper.check_knows_remote_features(local_features);
+                        gossiper.check_knows_remote_features(local_features, loaded_peer_features);
                    } else {
                        // Check features with system table
                        slogger.info("Checking remote features with gossip failed, fallback to check with system table");
-                        gossiper.check_knows_remote_features(local_features, peer_features);
+                        gossiper.check_knows_remote_features(local_features, loaded_peer_features);
                    }

                    gossiper.reset_endpoint_state_map().get();
@@ -367,7 +401,7 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
            // (missing features) to join the cluser.
            slogger.info("Checking remote features with gossip");
            gossiper.do_shadow_round().get();
-            gossiper.check_knows_remote_features(local_features);
+            gossiper.check_knows_remote_features(local_features, loaded_peer_features);
            gossiper.reset_endpoint_state_map().get();
            for (auto ep : loaded_endpoints) {
                gossiper.add_saved_endpoint(ep);
@@ -375,6 +409,14 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
        }
    }

+    // If this is a restarting node, we should update tokens before gossip starts
+    auto my_tokens = db::system_keyspace::get_saved_tokens().get0();
+    bool restarting_normal_node = db::system_keyspace::bootstrap_complete() && !db().local().is_replacing() && !my_tokens.empty();
+    if (restarting_normal_node) {
+        slogger.info("Restarting a node in NORMAL status");
+        _token_metadata.update_normal_tokens(my_tokens, get_broadcast_address());
+    }
+
    // have to start the gossip service before we can see any info on other nodes.  this is necessary
    // for bootstrap to get the load info it needs.
    // (we won't be part of the storage ring though until we add a counterId to our state, below.)
@@ -385,6 +427,12 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
    }).get();
    auto features = get_config_supported_features();
    _token_metadata.update_host_id(local_host_id, get_broadcast_address());
+
+    // Replicate the tokens early because once gossip runs other nodes
+    // might send reads/writes to this node. Replicate it early to make
+    // sure the tokens are valid on all the shards.
+    replicate_to_all_cores().get();
+
    auto broadcast_rpc_address = utils::fb_utilities::get_broadcast_rpc_address();
    app_states.emplace(gms::application_state::NET_VERSION, value_factory.network_version());
    app_states.emplace(gms::application_state::HOST_ID, value_factory.host_id(local_host_id));
@@ -395,6 +443,10 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
    app_states.emplace(gms::application_state::SCHEMA_TABLES_VERSION, versioned_value(db::schema_tables::version));
    app_states.emplace(gms::application_state::RPC_READY, value_factory.cql_ready(false));
    app_states.emplace(gms::application_state::VIEW_BACKLOG, versioned_value(""));
+    if (restarting_normal_node) {
+        app_states.emplace(gms::application_state::TOKENS, value_factory.tokens(my_tokens));
+        app_states.emplace(gms::application_state::STATUS, value_factory.normal(my_tokens));
+    }
    slogger.info("Starting up server gossip");

    auto& gossiper = gms::get_local_gossiper();
@@ -408,9 +460,6 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
    auto& proxy = service::get_storage_proxy();
    // gossip Schema.emptyVersion forcing immediate check for schema updates (see MigrationManager#maybeScheduleSchemaPull)
    update_schema_version_and_announce(proxy).get();// Ensure we know our own actual Schema UUID in preparation for updates
-    get_storage_service().invoke_on_all([] (auto& ss) {
-        ss.register_features();
-    }).get();
 #if 0
    if (!MessagingService.instance().isListening())
        MessagingService.instance().listen(FBUtilities.getLocalAddress());
@@ -419,24 +468,10 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
    HintedHandOffManager.instance.start();
    BatchlogManager.instance.start();
 #endif
-}
-
-void storage_service::register_features() {
-    _range_tombstones_feature = gms::feature(RANGE_TOMBSTONES_FEATURE);
-    _large_partitions_feature = gms::feature(LARGE_PARTITIONS_FEATURE);
-    _counters_feature = gms::feature(COUNTERS_FEATURE);
-    _digest_multipartition_read_feature = gms::feature(DIGEST_MULTIPARTITION_READ_FEATURE);
-    _correct_counter_order_feature = gms::feature(CORRECT_COUNTER_ORDER_FEATURE);
-    _schema_tables_v3 = gms::feature(SCHEMA_TABLES_V3);
-    _correct_non_compound_range_tombstones = gms::feature(CORRECT_NON_COMPOUND_RANGE_TOMBSTONES);
-    _write_failure_reply_feature = gms::feature(WRITE_FAILURE_REPLY_FEATURE);
-    _xxhash_feature = gms::feature(XXHASH_FEATURE);
-    _roles_feature = gms::feature(ROLES_FEATURE);
-    _la_sstable_feature = gms::feature(LA_SSTABLE_FEATURE);
-    _stream_with_rpc_stream_feature = gms::feature(STREAM_WITH_RPC_STREAM);
-    _mc_sstable_feature = gms::feature(MC_SSTABLE_FEATURE);
-    _materialized_views_feature = gms::feature(MATERIALIZED_VIEWS_FEATURE);
-    _indexes_feature = gms::feature(INDEXES_FEATURE);
+    // Wait for gossip to settle so that the features will be enabled
+    if (do_bind) {
+        gms::get_local_gossiper().wait_for_gossip_to_settle().get();
+    }
 }

 // Runs inside seastar::async context
@@ -478,13 +513,9 @@ void storage_service::join_token_ring(int delay) {
            db::system_keyspace::set_bootstrap_state(db::system_keyspace::bootstrap_state::IN_PROGRESS).get();
        }
        set_mode(mode::JOINING, "waiting for ring information", true);
-        // first sleep the delay to make sure we see all our peers
-        for (int i = 0; i < delay; i += 1000) {
-            // if we see schema, we can proceed to the next check directly
-            if (_db.local().get_version() != database::empty_version) {
-                slogger.debug("got schema: {}", _db.local().get_version());
-                break;
-            }
+        auto& gossiper = gms::get_gossiper().local();
+        // first sleep the delay to make sure we see *at least* one other node
+        for (int i = 0; i < delay && gossiper.get_live_members().size() < 2; i += 1000) {
            sleep(std::chrono::seconds(1)).get();
        }
        // if our schema hasn't matched yet, keep sleeping until it does
@@ -541,7 +572,6 @@ void storage_service::join_token_ring(int delay) {
                for (auto token : _bootstrap_tokens) {
                    auto existing = _token_metadata.get_endpoint(token);
                    if (existing) {
-                        auto& gossiper = gms::get_local_gossiper();
                        auto* eps = gossiper.get_endpoint_state_for_endpoint_ptr(*existing);
                        if (eps && eps->get_update_timestamp() > gms::gossiper::clk::now() - std::chrono::milliseconds(delay)) {
                            throw std::runtime_error("Cannot replace a live node...");
@@ -685,6 +715,7 @@ void storage_service::bootstrap(std::unordered_set<token> tokens) {
    } else {
        // Dont set any state for the node which is bootstrapping the existing token...
        _token_metadata.update_normal_tokens(tokens, get_broadcast_address());
+        replicate_to_all_cores().get();
        auto replace_addr = db().local().get_replace_address();
        if (replace_addr) {
            slogger.debug("Removing replaced endpoint {} from system.peers", *replace_addr);
@@ -1441,7 +1472,13 @@ future<> storage_service::init_server(int delay, bind_messaging_port do_bind) {
            }
        }

-        prepare_to_join(std::move(loaded_endpoints), do_bind);
+        auto loaded_peer_features = db::system_keyspace::load_peer_features().get0();
+        slogger.info("loaded_peer_features: peer_features size={}", loaded_peer_features.size());
+        for (auto& x : loaded_peer_features) {
+            slogger.info("loaded_peer_features: peer={}, supported_features={}", x.first, x.second);
+        }
+
+        prepare_to_join(std::move(loaded_endpoints), loaded_peer_features, do_bind);
 #if 0
        // Has to be called after the host id has potentially changed in prepareToJoin().
        for (ColumnFamilyStore cfs : ColumnFamilyStore.all())
@@ -1455,6 +1492,7 @@ future<> storage_service::init_server(int delay, bind_messaging_port do_bind) {
            auto tokens = db::system_keyspace::get_saved_tokens().get0();
            if (!tokens.empty()) {
                _token_metadata.update_normal_tokens(tokens, get_broadcast_address());
+                replicate_to_all_cores().get();
                // order is important here, the gossiper can fire in between adding these two states.  It's ok to send TOKENS without STATUS, but *not* vice versa.
                gossiper.add_local_application_state({
                    { gms::application_state::TOKENS, value_factory.tokens(tokens) },
@@ -1518,20 +1556,21 @@ future<> storage_service::stop() {
    return make_ready_future<>();
 }

-future<> storage_service::check_for_endpoint_collision() {
+future<> storage_service::check_for_endpoint_collision(const std::unordered_map<gms::inet_address, sstring>& loaded_peer_features) {
    slogger.debug("Starting shadow gossip round to check for endpoint collision");
 #if 0
    if (!MessagingService.instance().isListening())
        MessagingService.instance().listen(FBUtilities.getLocalAddress());
 #endif
-    return seastar::async([this] {
+    return seastar::async([this, loaded_peer_features] {
        auto& gossiper = gms::get_local_gossiper();
        auto t = gms::gossiper::clk::now();
        bool found_bootstrapping_node = false;
+        auto local_features = get_config_supported_features();
        do {
            slogger.info("Checking remote features with gossip");
            gossiper.do_shadow_round().get();
-            gossiper.check_knows_remote_features(get_config_supported_features());
+            gossiper.check_knows_remote_features(local_features, loaded_peer_features);
            auto addr = get_broadcast_address();
            if (!gossiper.is_safe_for_bootstrap(addr)) {
                throw std::runtime_error(sprint("A node with address %s already exists, cancelling join. "
@@ -1583,7 +1622,7 @@ void storage_service::remove_endpoint(inet_address endpoint) {
    }).get();
 }

-future<std::unordered_set<token>> storage_service::prepare_replacement_info() {
+future<std::unordered_set<token>> storage_service::prepare_replacement_info(const std::unordered_map<gms::inet_address, sstring>& loaded_peer_features) {
    if (!db().local().get_replace_address()) {
        throw std::runtime_error(sprint("replace_address is empty"));
    }
@@ -1599,9 +1638,10 @@ future<std::unordered_set<token>> storage_service::prepare_replacement_info() {

    // make magic happen
    slogger.info("Checking remote features with gossip");
-    return gms::get_local_gossiper().do_shadow_round().then([this, replace_address] {
+    return gms::get_local_gossiper().do_shadow_round().then([this, loaded_peer_features, replace_address] {
        auto& gossiper = gms::get_local_gossiper();
-        gossiper.check_knows_remote_features(get_config_supported_features());
+        auto local_features = get_config_supported_features();
+        gossiper.check_knows_remote_features(local_features, loaded_peer_features);
        // now that we've gossiped at least once, we should be able to find the node we're replacing
        auto* state = gossiper.get_endpoint_state_for_endpoint_ptr(replace_address);
        if (!state) {
@@ -2106,6 +2146,7 @@ future<> storage_service::start_native_transport() {
                    auto cred = std::make_shared<seastar::tls::credentials_builder>();

                    cred->set_dh_level(seastar::tls::dh_params::level::MEDIUM);
+                    cred->set_priority_string(db::config::default_tls_priority);

                    if (ceo.count("priority_string")) {
                        cred->set_priority_string(ceo.at("priority_string"));
@@ -2466,15 +2507,17 @@ future<> storage_service::rebuild(sstring source_dc) {
        if (source_dc != "") {
            streamer->add_source_filter(std::make_unique<dht::range_streamer::single_datacenter_filter>(source_dc));
        }
-        for (const auto& keyspace_name : ss._db.local().get_non_system_keyspaces()) {
-            streamer->add_ranges(keyspace_name, ss.get_local_ranges(keyspace_name));
-        }
-        return streamer->stream_async().then([streamer] {
-            slogger.info("Streaming for rebuild successful");
-        }).handle_exception([] (auto ep) {
-            // This is used exclusively through JMX, so log the full trace but only throw a simple RTE
-            slogger.warn("Error while rebuilding node: {}", std::current_exception());
-            return make_exception_future<>(std::move(ep));
+        auto keyspaces = make_lw_shared<std::vector<sstring>>(ss._db.local().get_non_system_keyspaces());
+        return do_for_each(*keyspaces, [keyspaces, streamer, &ss] (sstring& keyspace_name) {
+            return streamer->add_ranges(keyspace_name, ss.get_local_ranges(keyspace_name));
+        }).then([streamer] {
+            return streamer->stream_async().then([streamer] {
+                slogger.info("Streaming for rebuild successful");
+            }).handle_exception([] (auto ep) {
+                // This is used exclusively through JMX, so log the full trace but only throw a simple RTE
+                slogger.warn("Error while rebuilding node: {}", std::current_exception());
+                return make_exception_future<>(std::move(ep));
+            });
        });
    });
 }
@@ -3306,5 +3349,14 @@ void storage_service::notify_cql_change(inet_address endpoint, bool ready)
    }
 }

+future<> init_storage_service(distributed<database>& db, sharded<auth::service>& auth_service, sharded<db::system_distributed_keyspace>& sys_dist_ks,
+        sharded<gms::feature_service>& feature_service) {
+    return service::get_storage_service().start(std::ref(db), std::ref(auth_service), std::ref(sys_dist_ks), std::ref(feature_service));
+}
+
+future<> deinit_storage_service() {
+    return service::get_storage_service().stop();
+}
+
 } // namespace service

--- a/service/storage_service.hh
+++ b/service/storage_service.hh
@@ -71,6 +71,10 @@ namespace dht {
 class boot_strapper;
 }

+namespace gms {
+class feature_service;
+};
+
 namespace service {

 class load_broadcaster;
@@ -120,6 +124,7 @@ private:
    /* JMX notification serial number counter */
    private final AtomicLong notificationSerialNumber = new AtomicLong();
 #endif
+    gms::feature_service& _feature_service;
    distributed<database>& _db;
    sharded<auth::service>& _auth_service;
    int _update_jobs{0};
@@ -139,7 +144,7 @@ private:
    bool _stream_manager_stopped = false;
    seastar::metrics::metric_groups _metrics;
 public:
-    storage_service(distributed<database>& db, sharded<auth::service>&, sharded<db::system_distributed_keyspace>&);
+    storage_service(distributed<database>& db, sharded<auth::service>&, sharded<db::system_distributed_keyspace>&, gms::feature_service& feature_service);
    void isolate_on_error();
    void isolate_on_commit_error();

@@ -290,24 +295,9 @@ private:
    gms::feature _la_sstable_feature;
    gms::feature _stream_with_rpc_stream_feature;
    gms::feature _mc_sstable_feature;
+    gms::feature _correct_static_compact_in_mc;
 public:
-    void enable_all_features() {
-        _range_tombstones_feature.enable();
-        _large_partitions_feature.enable();
-        _materialized_views_feature.enable();
-        _counters_feature.enable();
-        _indexes_feature.enable();
-        _digest_multipartition_read_feature.enable();
-        _correct_counter_order_feature.enable();
-        _schema_tables_v3.enable();
-        _correct_non_compound_range_tombstones.enable();
-        _write_failure_reply_feature.enable();
-        _xxhash_feature.enable();
-        _roles_feature.enable();
-        _la_sstable_feature.enable();
-        _stream_with_rpc_stream_feature.enable();
-        _mc_sstable_feature.enable();
-    }
+    void enable_all_features();

    void finish_bootstrapping() {
        _is_bootstrap_mode = false;
@@ -400,9 +390,9 @@ public:
    }
 #endif
 public:
-    future<std::unordered_set<token>> prepare_replacement_info();
+    future<std::unordered_set<token>> prepare_replacement_info(const std::unordered_map<gms::inet_address, sstring>& loaded_peer_features);

-    future<> check_for_endpoint_collision();
+    future<> check_for_endpoint_collision(const std::unordered_map<gms::inet_address, sstring>& loaded_peer_features);
 #if 0

    // for testing only
@@ -464,8 +454,7 @@ public:
 #endif
 private:
    bool should_bootstrap();
-    void prepare_to_join(std::vector<inet_address> loaded_endpoints, bind_messaging_port do_bind = bind_messaging_port::yes);
-    void register_features();
+    void prepare_to_join(std::vector<inet_address> loaded_endpoints, const std::unordered_map<gms::inet_address, sstring>& loaded_peer_features, bind_messaging_port do_bind = bind_messaging_port::yes);
    void join_token_ring(int delay);
 public:
    future<> join_ring();
@@ -2293,6 +2282,10 @@ public:
    bool cluster_supports_mc_sstable() const {
        return bool(_mc_sstable_feature);
    }
+
+    const gms::feature& cluster_supports_correct_static_compact_in_mc() const {
+        return _correct_static_compact_in_mc;
+    }
 private:
    future<> set_cql_ready(bool ready);
 private:
@@ -2303,12 +2296,8 @@ private:
    void notify_cql_change(inet_address endpoint, bool ready);
 };

-inline future<> init_storage_service(distributed<database>& db, sharded<auth::service>& auth_service, sharded<db::system_distributed_keyspace>& sys_dist_ks) {
-    return service::get_storage_service().start(std::ref(db), std::ref(auth_service), std::ref(sys_dist_ks));
-}
-
-inline future<> deinit_storage_service() {
-    return service::get_storage_service().stop();
-}
+future<> init_storage_service(distributed<database>& db, sharded<auth::service>& auth_service, sharded<db::system_distributed_keyspace>& sys_dist_ks,
+        sharded<gms::feature_service>& feature_service);
+future<> deinit_storage_service();

 }
--- a/sstables/compaction.cc
+++ b/sstables/compaction.cc
@@ -179,6 +179,8 @@ struct compaction_read_monitor_generator final : public read_monitor_generator {
        void remove_sstable(bool is_tracking) {
            if (is_tracking) {
                _cf.get_compaction_strategy().get_backlog_tracker().remove_sstable(_sst);
+            } else if (_sst) {
+                _cf.get_compaction_strategy().get_backlog_tracker().revert_charges(_sst);
            }
            _sst = {};
        }
@@ -303,6 +305,7 @@ public:
 class compaction {
 protected:
    column_family& _cf;
+    schema_ptr _schema;
    std::vector<shared_sstable> _sstables;
    uint64_t _max_sstable_size;
    uint32_t _sstable_level;
@@ -313,6 +316,7 @@ protected:
 protected:
    compaction(column_family& cf, std::vector<shared_sstable> sstables, uint64_t max_sstable_size, uint32_t sstable_level)
        : _cf(cf)
+        , _schema(cf.schema())
        , _sstables(std::move(sstables))
        , _max_sstable_size(max_sstable_size)
        , _sstable_level(sstable_level)
@@ -361,10 +365,9 @@ private:
    virtual flat_mutation_reader make_sstable_reader(lw_shared_ptr<sstables::sstable_set> ssts) const = 0;

    flat_mutation_reader setup() {
-        auto ssts = make_lw_shared<sstables::sstable_set>(_cf.get_compaction_strategy().make_sstable_set(_cf.schema()));
-        auto schema = _cf.schema();
+        auto ssts = make_lw_shared<sstables::sstable_set>(_cf.get_compaction_strategy().make_sstable_set(_schema));
        sstring formatted_msg = "[";
-        auto fully_expired = get_fully_expired_sstables(_cf, _sstables, gc_clock::now() - schema->gc_grace_seconds());
+        auto fully_expired = get_fully_expired_sstables(_cf, _sstables, gc_clock::now() - _schema->gc_grace_seconds());

        for (auto& sst : _sstables) {
            // Compacted sstable keeps track of its ancestors.
@@ -396,8 +399,8 @@ private:
        }
        formatted_msg += "]";
        _info->sstables = _sstables.size();
-        _info->ks = schema->ks_name();
-        _info->cf = schema->cf_name();
+        _info->ks = _schema->ks_name();
+        _info->cf = _schema->cf_name();
        report_start(formatted_msg);

        return make_sstable_reader(std::move(ssts));
@@ -462,7 +465,7 @@ private:
    }

    const schema_ptr& schema() const {
-        return _cf.schema();
+        return _schema;
    }
 public:
    static future<compaction_info> run(std::unique_ptr<compaction> c);
@@ -518,10 +521,10 @@ public:
    }

    flat_mutation_reader make_sstable_reader(lw_shared_ptr<sstables::sstable_set> ssts) const override {
-        return ::make_local_shard_sstable_reader(_cf.schema(),
+        return ::make_local_shard_sstable_reader(_schema,
                std::move(ssts),
                query::full_partition_range,
-                _cf.schema()->full_slice(),
+                _schema->full_slice(),
                service::get_local_compaction_priority(),
                no_resource_tracking(),
                nullptr,
@@ -570,7 +573,7 @@ public:
            cfg.monitor = &_active_write_monitors.back();
            cfg.large_partition_handler = _cf.get_large_partition_handler();
            // TODO: calculate encoding_stats based on statistics of compacted sstables
-            _writer.emplace(_sst->get_writer(*_cf.schema(), partitions_per_sstable(), cfg, encoding_stats{}, priority));
+            _writer.emplace(_sst->get_writer(*_schema, partitions_per_sstable(), cfg, encoding_stats{}, priority));
        }
        return &*_writer;
    }
@@ -610,7 +613,7 @@ public:
    }

    std::function<bool(const dht::decorated_key&)> filter_func() const override {
-        dht::token_range_vector owned_ranges = service::get_local_storage_service().get_local_ranges(_cf.schema()->ks_name());
+        dht::token_range_vector owned_ranges = service::get_local_storage_service().get_local_ranges(_schema->ks_name());

        return [this, owned_ranges = std::move(owned_ranges)] (const dht::decorated_key& dk) {
            if (dht::shard_of(dk.token()) != engine().cpu_id()) {
@@ -684,10 +687,10 @@ public:

    // Use reader that makes sure no non-local mutation will not be filtered out.
    flat_mutation_reader make_sstable_reader(lw_shared_ptr<sstables::sstable_set> ssts) const override {
-        return ::make_range_sstable_reader(_cf.schema(),
+        return ::make_range_sstable_reader(_schema,
                std::move(ssts),
                query::full_partition_range,
-                _cf.schema()->full_slice(),
+                _schema->full_slice(),
                service::get_local_compaction_priority(),
                no_resource_tracking(),
                nullptr,
@@ -719,7 +722,7 @@ public:
            cfg.large_partition_handler = _cf.get_large_partition_handler();
            auto&& priority = service::get_local_compaction_priority();
            // TODO: calculate encoding_stats based on statistics of compacted sstables
-            writer.emplace(sst->get_writer(*_cf.schema(), partitions_per_sstable(_shard), cfg, encoding_stats{}, priority, _shard));
+            writer.emplace(sst->get_writer(*_schema, partitions_per_sstable(_shard), cfg, encoding_stats{}, priority, _shard));
        }
        return &*writer;
    }
--- a/sstables/compaction_manager.cc
+++ b/sstables/compaction_manager.cc
@@ -66,6 +66,14 @@ public:
            _cm->deregister_compacting_sstables(_compacting);
        }
    }
+
+    // Explicitly release compacting sstables
+    void release_compacting(const std::vector<sstables::shared_sstable>& sstables) {
+        _cm->deregister_compacting_sstables(sstables);
+        for (auto& sst : sstables) {
+            _compacting.erase(boost::remove(_compacting, sst), _compacting.end());
+        }
+    }
 };

 compaction_weight_registration::compaction_weight_registration(compaction_manager* cm, int weight)
@@ -564,18 +572,24 @@ future<> compaction_manager::perform_cleanup(column_family* cf) {
            return make_ready_future<stop_iteration>(stop_iteration::yes);
        }
        column_family& cf = *task->compacting_cf;
-        sstables::compaction_descriptor descriptor = sstables::compaction_descriptor(get_candidates(cf));
-        auto compacting = compacting_sstable_registration(this, descriptor.sstables);
+        auto sstables = get_candidates(cf);
+        auto compacting = make_lw_shared<compacting_sstable_registration>(this, sstables);

        _stats.pending_tasks--;
        _stats.active_tasks++;
        task->compaction_running = true;
        compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_compaction_controller.backlog_of_shares(200), _available_memory));
-        return do_with(std::move(user_initiated), [this, &cf, descriptor = std::move(descriptor)] (compaction_backlog_tracker& bt) mutable {
-            return with_scheduling_group(_scheduling_group, [this, &cf, descriptor = std::move(descriptor)] () mutable {
-                return cf.cleanup_sstables(std::move(descriptor));
+        return do_with(std::move(user_initiated), std::move(sstables), [this, &cf, compacting] (compaction_backlog_tracker& bt,
+                std::vector<sstables::shared_sstable>& sstables) mutable {
+            return with_scheduling_group(_scheduling_group, [this, &cf, &sstables, compacting] () mutable {
+                return do_for_each(sstables, [this, &cf, compacting] (auto& sst) {
+                    return cf.cleanup_sstables(sstables::compaction_descriptor({sst})).then([&sst, compacting] {
+                        // Release the reference to the cleaned sstable so that the disk space it occupies can be freed.
+                        compacting->release_compacting({std::move(sst)});
+                    });
+                });
            });
-        }).then_wrapped([this, task, compacting = std::move(compacting)] (future<> f) mutable {
+        }).then_wrapped([this, task, compacting] (future<> f) mutable {
            task->compaction_running = false;
            _stats.active_tasks--;
            if (!can_proceed(task)) {
--- a/sstables/compaction_strategy.cc
+++ b/sstables/compaction_strategy.cc
@@ -170,7 +170,10 @@ public:
        _sstables.push_back(std::move(sst));
    }
    virtual void erase(shared_sstable sst) override {
-        _sstables.erase(boost::range::find(_sstables, sst));
+        auto it = boost::range::find(_sstables, sst);
+        if (it != _sstables.end()){
+            _sstables.erase(it);
+        }
    }
    virtual std::unique_ptr<incremental_selector_impl> make_incremental_selector() const override;
    class incremental_selector;
@@ -420,11 +423,6 @@ public:
            auto itw = writes_per_window.find(bound);
            if (itw != writes_per_window.end()) {
                ow_this_window = &itw->second;
-                // We will erase here so we can keep track of which
-                // writes belong to existing windows. Writes that don't belong to any window
-                // are writes in progress to new windows and will be accounted in the final
-                // loop before we return
-                writes_per_window.erase(itw);
            }
            auto* oc_this_window = &no_oc;
            auto itc = compactions_per_window.find(bound);
@@ -432,6 +430,13 @@ public:
                oc_this_window = &itc->second;
            }
            b += windows.second.backlog(*ow_this_window, *oc_this_window);
+            if (itw != writes_per_window.end()) {
+                // We erase the entry here so we can keep track of which
+                // writes belong to existing windows. Writes that don't belong to any window
+                // are writes in progress to new windows and will be accounted for in the final
+                // loop before we return.
+                writes_per_window.erase(itw);
+            }
        }

        // Partial writes that don't belong to any window are accounted here.
--- a/sstables/index_reader.hh
+++ b/sstables/index_reader.hh
@@ -390,9 +390,17 @@ private:
            }

            return do_with(std::make_unique<reader>(_sstable, _pc, position, end, quantity), [this, summary_idx] (auto& entries_reader) {
-                return entries_reader->_context.consume_input().then([this, summary_idx, &entries_reader] {
+                return entries_reader->_context.consume_input().then_wrapped([this, summary_idx, &entries_reader] (future<> f) {
+                    std::exception_ptr ex;
+                    if (f.failed()) {
+                        ex = f.get_exception();
+                        sstlog.error("failed reading index for {}: {}", _sstable->get_filename(), ex);
+                    }
                    auto indexes = std::move(entries_reader->_consumer.indexes);
-                    return entries_reader->_context.close().then([indexes = std::move(indexes)] () mutable {
+                    return entries_reader->_context.close().then([indexes = std::move(indexes), ex = std::move(ex)] () mutable {
+                        if (ex) {
+                            std::rethrow_exception(std::move(ex));
+                        }
                        return std::move(indexes);
                    });

--- a/sstables/m_format_read_helpers.hh
+++ b/sstables/m_format_read_helpers.hh
@@ -72,8 +72,11 @@ inline gc_clock::duration parse_ttl(int32_t value) {

 inline gc_clock::duration parse_ttl(const serialization_header& header,
                                    uint64_t delta) {
-    int32_t _delta = static_cast<int32_t>(delta);
-    return parse_ttl(header.get_min_ttl() + _delta);
+    // sign-extend min_ttl back to 64 bits and
+    // add the delta using unsigned arithmetic
+    // to prevent signed integer overflow
+    uint64_t min_ttl = static_cast<uint64_t>(static_cast<int64_t>(header.get_min_ttl()));
+    return parse_ttl(static_cast<int32_t>(min_ttl + delta));
 }

 inline gc_clock::time_point parse_expiry(int32_t value) {
@@ -85,8 +88,11 @@ inline gc_clock::time_point parse_expiry(int32_t value) {

 inline gc_clock::time_point parse_expiry(const serialization_header& header,
                                   uint64_t delta) {
-    int32_t _delta = static_cast<int32_t>(delta);
-    return parse_expiry(header.get_min_local_deletion_time() + _delta);
+    // sign-extend min_local_deletion_time back to 64 bits and
+    // add the delta using unsigned arithmetic
+    // to prevent signed integer overflow
+    uint64_t min_local_deletion_time = static_cast<uint64_t>(static_cast<int64_t>(header.get_min_local_deletion_time()));
+    return parse_expiry(static_cast<int32_t>(min_local_deletion_time + delta));
 }

 };   // namespace sstables
--- a/sstables/mc/writer.cc
+++ b/sstables/mc/writer.cc
@@ -29,6 +29,7 @@
 #include "sstables/mc/types.hh"
 #include "db/config.hh"
 #include "atomic_cell.hh"
+#include "utils/exceptions.hh"

 #include <functional>
 #include <boost/iterator/iterator_facade.hpp>
@@ -308,9 +309,11 @@ void write_missing_columns(W& out, const indexed_columns& columns, const row& ro
 template <typename T, typename W>
 GCC6_CONCEPT(requires Writer<W>())
 void write_unsigned_delta_vint(W& out, T value, T base) {
+    using unsigned_type = std::make_unsigned_t<T>;
+    unsigned_type unsigned_delta = static_cast<unsigned_type>(value) - static_cast<unsigned_type>(base);
    // sign-extend to 64-bits
    using signed_type = std::make_signed_t<T>;
-    int64_t delta = static_cast<signed_type>(value) - static_cast<signed_type>(base);
+    int64_t delta = static_cast<int64_t>(static_cast<signed_type>(unsigned_delta));
    // write as unsigned 64-bit varint
    write_vint(out, static_cast<uint64_t>(delta));
 }
@@ -370,12 +373,21 @@ static sstring pk_type_to_string(const schema& s) {
    }
 }

-serialization_header make_serialization_header(const schema& s, const encoding_stats& enc_stats) {
+struct sstable_schema {
    serialization_header header;
+    indexed_columns regular_columns;
+    indexed_columns static_columns;
+};
+
+sstable_schema make_sstable_schema(const schema& s, const encoding_stats& enc_stats, const sstable_writer_config& cfg) {
+    sstable_schema sst_sch;
+    serialization_header& header = sst_sch.header;
    // mc serialization header minimum values are delta-encoded based on the default timestamp epoch times
-    header.min_timestamp_base.value = static_cast<uint64_t>(enc_stats.min_timestamp - encoding_stats::timestamp_epoch);
-    header.min_local_deletion_time_base.value = static_cast<uint64_t>(enc_stats.min_local_deletion_time - encoding_stats::deletion_time_epoch);
-    header.min_ttl_base.value = static_cast<uint64_t>(enc_stats.min_ttl - encoding_stats::ttl_epoch);
+    // Note: We rely on the implicit conversion to uint64_t when subtracting the signed epoch values below
+    // to prevent signed integer overflow.
+    header.min_timestamp_base.value = static_cast<uint64_t>(enc_stats.min_timestamp) - encoding_stats::timestamp_epoch;
+    header.min_local_deletion_time_base.value = static_cast<uint64_t>(enc_stats.min_local_deletion_time) - encoding_stats::deletion_time_epoch;
+    header.min_ttl_base.value = static_cast<uint64_t>(enc_stats.min_ttl) - encoding_stats::ttl_epoch;

    header.pk_type_name = to_bytes_array_vint_size(pk_type_to_string(s));

@@ -385,23 +397,36 @@ serialization_header make_serialization_header(const schema& s, const encoding_s
        header.clustering_key_types_names.elements.push_back(std::move(ck_type_name));
    }

-    header.static_columns.elements.reserve(s.static_columns_count());
-    for (const auto& static_column : s.static_columns()) {
+    auto add_column = [&] (const column_definition& column) {
        serialization_header::column_desc cd;
-        cd.name = to_bytes_array_vint_size(static_column.name());
-        cd.type_name = to_bytes_array_vint_size(type_name_with_udt_frozen(static_column.type));
-        header.static_columns.elements.push_back(std::move(cd));
+        cd.name = to_bytes_array_vint_size(column.name());
+        cd.type_name = to_bytes_array_vint_size(type_name_with_udt_frozen(column.type));
+        if (column.is_static()) {
+            header.static_columns.elements.push_back(std::move(cd));
+            sst_sch.static_columns.push_back(column);
+        } else if (column.is_regular()) {
+            header.regular_columns.elements.push_back(std::move(cd));
+            sst_sch.regular_columns.push_back(column);
+        }
+    };
+
+    if (cfg.correctly_serialize_static_compact_in_mc) {
+        for (const auto& column : s.v3().all_columns()) {
+            add_column(column);
+        }
+    } else {
+        for (const auto& column : s.all_columns()) {
+            add_column(column);
+        }
    }

-    header.regular_columns.elements.reserve(s.regular_columns_count());
-    for (const auto& regular_column : s.regular_columns()) {
-        serialization_header::column_desc cd;
-        cd.name = to_bytes_array_vint_size(regular_column.name());
-        cd.type_name = to_bytes_array_vint_size(type_name_with_udt_frozen(regular_column.type));
-        header.regular_columns.elements.push_back(std::move(cd));
-    }
+    // For static and regular columns, we write all simple columns first, followed by collections.
+    // These containers hold the columns partitioned by atomicity.
+    auto pred = [] (const std::reference_wrapper<const column_definition>& column) { return column.get().is_atomic(); };
+    boost::range::stable_partition(sst_sch.regular_columns, pred);
+    boost::range::stable_partition(sst_sch.static_columns, pred);

-    return header;
+    return sst_sch;
 }

 enum class cell_flags : uint8_t {
@@ -507,18 +532,6 @@ GCC6_CONCEPT(
    };
 )

-static indexed_columns get_indexed_columns_partitioned_by_atomicity(schema::const_iterator_range_type columns) {
-    indexed_columns result;
-    result.reserve(columns.size());
-    for (const auto& col: columns) {
-        result.emplace_back(col);
-    }
-    boost::range::stable_partition(
-            result,
-            [](const std::reference_wrapper<const column_definition>& column) { return column.get().is_atomic();});
-    return result;
-}
-
 // Used for writing SSTables in 'mc' format.
 class writer : public sstable_writer::writer_impl {
 private:
@@ -526,7 +539,7 @@ private:
    shard_id _shard; // Specifies which shard the new SStable will belong to.
    bool _compression_enabled = false;
    std::unique_ptr<file_writer> _data_writer;
-    std::optional<file_writer> _index_writer;
+    std::unique_ptr<file_writer> _index_writer;
    bool _tombstone_written = false;
    bool _static_row_written = false;
    // The length of partition header (partition key, partition deletion and static row, if present)
@@ -540,10 +553,7 @@ private:
    range_tombstone_stream _range_tombstones;
    bytes_ostream _tmp_bufs;

-    // For static and regular columns, we write all simple columns first followed by collections
-    // These containers have columns partitioned by atomicity
-    const indexed_columns _static_columns;
-    const indexed_columns _regular_columns;
+    const sstable_schema _sst_schema;

    struct cdef_and_collection {
        const column_definition* cdef;
@@ -571,7 +581,11 @@ private:
    struct {
        // Unfortunately we cannot output the promoted index directly to the
        // index file because it needs to be prepended by its size.
-        seastar::circular_buffer<pi_block> promoted_index;
+        // first_entry is used for deferring serialization into blocks for small partitions.
+        std::optional<pi_block> first_entry;
+        bytes_ostream blocks; // Serialized pi_blocks.
+        bytes_ostream offsets; // Serialized block offsets (uint32_t) relative to the start of "blocks".
+        uint64_t promoted_index_size = 0; // Number of pi_blocks inside blocks and first_entry.
        tombstone tomb;
        uint64_t block_start_offset;
        uint64_t block_next_start_offset;
@@ -580,8 +594,13 @@ private:
        size_t desired_block_size;
    } _pi_write_m;
    column_stats _c_stats;
+    bool _write_regular_as_static; // See #4139

    void init_file_writers();
+
+    // Returns the closed writer
+    std::unique_ptr<file_writer> close_writer(std::unique_ptr<file_writer>& w);
+
    void close_data_writer();
    void ensure_tombstone_is_written() {
        if (!_tombstone_written) {
@@ -590,7 +609,7 @@ private:
    }

    void ensure_static_row_is_written_if_needed() {
-        if (!_static_columns.empty() && !_static_row_written) {
+        if (!_sst_schema.static_columns.empty() && !_static_row_written) {
            consume(static_row{});
        }
    }
@@ -606,6 +625,7 @@ private:
    void maybe_set_pi_first_clustering(const clustering_info& info);
    void maybe_add_pi_block();
    void add_pi_block();
+    void write_pi_block(const pi_block&);

    void update_deletion_time_stats(deletion_time dt) {
        _c_stats.update_timestamp(dt.marked_for_delete_at);
@@ -643,7 +663,7 @@ private:

    // Writes single atomic cell
    void write_cell(bytes_ostream& writer, atomic_cell_view cell, const column_definition& cdef,
-                    const row_time_properties& properties, bytes_view cell_path = {});
+                    const row_time_properties& properties, std::optional<bytes_view> cell_path = {});

    // Writes information about row liveness (formerly 'row marker')
    void write_liveness_info(bytes_ostream& writer, const row_marker& marker);
@@ -654,7 +674,7 @@ private:

    void write_cells(bytes_ostream& writer, column_kind kind, const row& row_body, const row_time_properties& properties, bool has_complex_deletion);
    void write_row_body(bytes_ostream& writer, const clustering_row& row, bool has_complex_deletion);
-    void write_static_row(const row& static_row);
+    void write_static_row(const row&, column_kind);

    // Clustered is a term used to denote an entity that has a clustering key prefix
    // and constitutes an entry of a partition.
@@ -675,15 +695,19 @@ private:
        _prev_row_start = pos;
        maybe_add_pi_block();
    }
-    void write_promoted_index(file_writer& writer);
+    void write_promoted_index();
    void consume(rt_marker&& marker);

-    void flush_tmp_bufs() {
+    void flush_tmp_bufs(file_writer& writer) {
        for (auto&& buf : _tmp_bufs) {
-            _data_writer->write(buf);
+            writer.write(buf);
        }
        _tmp_bufs.clear();
    }
+
+    void flush_tmp_bufs() {
+        flush_tmp_bufs(*_data_writer);
+    }
 public:

    writer(sstable& sst, const schema& s, uint64_t estimated_partitions,
@@ -694,8 +718,8 @@ public:
        , _shard(shard)
        , _range_tombstones(_schema)
        , _tmp_bufs(_sst.sstable_buffer_size)
-        , _static_columns(get_indexed_columns_partitioned_by_atomicity(s.static_columns()))
-        , _regular_columns(get_indexed_columns_partitioned_by_atomicity(s.regular_columns()))
+        , _sst_schema(make_sstable_schema(s, _enc_stats, _cfg))
+        , _write_regular_as_static(cfg.correctly_serialize_static_compact_in_mc && s.is_static_compact_table())
    {
        _sst.generate_toc(_schema.get_compressor_params().get_compressor(), _schema.bloom_filter_fp_chance());
        _sst.write_toc(_pc);
@@ -760,12 +784,25 @@ static deletion_time to_deletion_time(tombstone t) {
 }

 void writer::add_pi_block() {
-    _pi_write_m.promoted_index.push_back({
+    auto block = pi_block{
        *_pi_write_m.first_clustering,
        *_pi_write_m.last_clustering,
        _pi_write_m.block_start_offset - _c_stats.start_offset,
        _data_writer->offset() - _pi_write_m.block_start_offset,
-        (_end_open_marker ? std::make_optional(_end_open_marker->tomb) : std::optional<tombstone>{})});
+        (_end_open_marker ? std::make_optional(_end_open_marker->tomb) : std::optional<tombstone>{})};
+
+    if (_pi_write_m.blocks.empty()) {
+        if (!_pi_write_m.first_entry) {
+            _pi_write_m.first_entry.emplace(std::move(block));
+            ++_pi_write_m.promoted_index_size;
+            return;
+        } else {
+            write_pi_block(*_pi_write_m.first_entry);
+        }
+    }
+
+    write_pi_block(block);
+    ++_pi_write_m.promoted_index_size;
 }

 void writer::maybe_add_pi_block() {
@@ -793,13 +830,17 @@ void writer::init_file_writers() {
                    &_sst._components->compression,
                    _schema.get_compressor_params()));
    }
-    _index_writer.emplace(std::move(_sst._index_file), options);
+    _index_writer = std::make_unique<file_writer>(std::move(_sst._index_file), options);
+}
+
+std::unique_ptr<file_writer> writer::close_writer(std::unique_ptr<file_writer>& w) { // Closes the writer owned by `w` and hands it back to the caller.
+    auto writer = std::move(w); // `w` is empty from here on, even if close() throws
+    writer->close();
+    return writer;
 }

 void writer::close_data_writer() {
-    auto writer = std::move(_data_writer);
-    writer->close();
-
+    auto writer = close_writer(_data_writer);
    if (!_compression_enabled) {
        auto chksum_wr = static_cast<crc32_checksummed_file_writer*>(writer.get());
        _sst.write_digest(chksum_wr->full_checksum());
@@ -900,7 +941,10 @@ void writer::consume_new_partition(const dht::decorated_key& dk) {
    write(_sst.get_version(), *_index_writer, p_key);
    write_vint(*_index_writer, _data_writer->offset());

-    _pi_write_m.promoted_index = {};
+    _pi_write_m.first_entry.reset();
+    _pi_write_m.blocks.clear();
+    _pi_write_m.offsets.clear();
+    _pi_write_m.promoted_index_size = 0;
    _pi_write_m.tomb = {};
    _pi_write_m.first_clustering.reset();
    _pi_write_m.last_clustering.reset();
@@ -926,7 +970,7 @@ void writer::consume(tombstone t) {
 }

 void writer::write_cell(bytes_ostream& writer, atomic_cell_view cell, const column_definition& cdef,
-        const row_time_properties& properties, bytes_view cell_path) {
+        const row_time_properties& properties, std::optional<bytes_view> cell_path) {

    bool is_deleted = !cell.is_live();
    bool has_value = !is_deleted && !cell.value().empty();
@@ -938,7 +982,7 @@ void writer::write_cell(bytes_ostream& writer, atomic_cell_view cell, const colu
                       properties.local_deletion_time == cell.deletion_time().time_since_epoch().count();

    cell_flags flags = cell_flags::none;
-    if (!has_value) {
+    if ((!has_value && !cdef.is_counter()) || is_deleted) {
        flags |= cell_flags::has_empty_value_mask;
    }
    if (is_deleted) {
@@ -967,20 +1011,22 @@ void writer::write_cell(bytes_ostream& writer, atomic_cell_view cell, const colu
        }
    }

-    if (!cell_path.empty()) {
-        write_vint(writer, cell_path.size());
-        write(_sst.get_version(), writer, cell_path);
+    if (bool(cell_path)) {
+        write_vint(writer, cell_path->size());
+        write(_sst.get_version(), writer, *cell_path);
    }

-    if (has_value) {
-        if (cdef.is_counter()) {
+    if (cdef.is_counter()) {
+        if (!is_deleted) {
            assert(!cell.is_counter_update());
          counter_cell_view::with_linearized(cell, [&] (counter_cell_view ccv) {
            write_counter_value(ccv, writer, sstable_version_types::mc, [] (bytes_ostream& out, uint32_t value) {
                return write_vint(out, value);
            });
          });
-        } else {
+        }
+    } else {
+        if (has_value) {
            write_cell_value(writer, *cdef.type, cell.value());
        }
    }
@@ -1061,7 +1107,7 @@ void writer::write_cells(bytes_ostream& writer, column_kind kind, const row& row
    // This differs from Origin where all updated columns are tracked and the set of filled columns of a row
    // is compared with the set of all columns filled in the memtable. So our encoding may be less optimal in some cases
    // but still valid.
-    write_missing_columns(writer, kind == column_kind::static_column ? _static_columns : _regular_columns, row_body);
+    write_missing_columns(writer, kind == column_kind::static_column ? _sst_schema.static_columns : _sst_schema.regular_columns, row_body);
    row_body.for_each_cell([this, &writer, kind, &properties, has_complex_deletion] (column_id id, const atomic_cell_or_collection& c) {
        auto&& column_definition = _schema.column_at(kind, id);
        if (!column_definition.is_atomic()) {
@@ -1105,18 +1151,6 @@ void writer::write_row_body(bytes_ostream& writer, const clustering_row& row, bo
    return write_cells(writer, column_kind::regular_column, row.cells(), properties, has_complex_deletion);
 }

-template<typename Func>
-uint64_t calculate_write_size(Func&& func) {
-    uint64_t written_size = 0;
-    {
-        auto counting_writer = file_writer(make_sizing_output_stream(written_size));
-        func(counting_writer);
-        counting_writer.flush();
-        counting_writer.close();
-    }
-    return written_size;
-}
-
 // Find if any collection in the row contains a collection-wide tombstone
 static bool row_has_complex_deletion(const schema& s, const row& r, column_kind kind) {
    bool result = false;
@@ -1138,16 +1172,14 @@ static bool row_has_complex_deletion(const schema& s, const row& r, column_kind
    return result;
 }

-void writer::write_static_row(const row& static_row) {
-    assert(_schema.is_compound());
-
+void writer::write_static_row(const row& static_row, column_kind kind) {
    uint64_t current_pos = _data_writer->offset();
    // Static row flag is stored in extended flags so extension_flag is always set for static rows
    row_flags flags = row_flags::extension_flag;
-    if (static_row.size() == _schema.static_columns_count()) {
+    if (static_row.size() == _sst_schema.static_columns.size()) {
        flags |= row_flags::has_all_columns;
    }
-    bool has_complex_deletion = row_has_complex_deletion(_schema, static_row, column_kind::static_column);
+    bool has_complex_deletion = row_has_complex_deletion(_schema, static_row, kind);
    if (has_complex_deletion) {
        flags |= row_flags::has_complex_deletion;
    }
@@ -1161,14 +1193,13 @@ void writer::write_static_row(const row& static_row) {

    _partition_header_length += (_data_writer->offset() - current_pos);

-    // Collect statistics
    ++_c_stats.rows_count;
+    _static_row_written = true;
 }

 stop_iteration writer::consume(static_row&& sr) {
    ensure_tombstone_is_written();
-    write_static_row(sr.cells());
-    _static_row_written = true;
+    write_static_row(sr.cells(), column_kind::static_column);
    return stop_iteration::no;
 }

@@ -1191,7 +1222,7 @@ void writer::write_clustered(const clustering_row& clustered_row, uint64_t prev_
        ext_flags = row_extended_flags::has_shadowable_deletion_scylla;
    }

-    if (clustered_row.cells().size() == _schema.regular_columns_count()) {
+    if (clustered_row.cells().size() == _sst_schema.regular_columns.size()) {
        flags |= row_flags::has_all_columns;
    }
    bool has_complex_deletion = row_has_complex_deletion(_schema, clustered_row.cells(), column_kind::regular_column);
@@ -1221,6 +1252,11 @@ void writer::write_clustered(const clustering_row& clustered_row, uint64_t prev_
 }

 stop_iteration writer::consume(clustering_row&& cr) {
+    if (_write_regular_as_static) {
+        ensure_tombstone_is_written();
+        write_static_row(cr.cells(), column_kind::regular_column);
+        return stop_iteration::no;
+    }
    drain_tombstones(position_in_partition_view::after_key(cr.key()));
    write_clustered(cr);
    return stop_iteration::no;
@@ -1242,28 +1278,33 @@ static void write_clustering_prefix(W& writer, bound_kind_m kind,
    write_clustering_prefix(writer, s, clustering, is_ephemerally_full);
 }

-void writer::write_promoted_index(file_writer& writer) {
-    static constexpr size_t width_base = 65536;
-    write_vint(writer, _partition_header_length);
-    write(_sst.get_version(), writer, to_deletion_time(_pi_write_m.tomb));
-    write_vint(writer, _pi_write_m.promoted_index.size());
-    std::vector<uint32_t> offsets;
-    offsets.reserve(_pi_write_m.promoted_index.size());
-    uint64_t start = writer.offset();
-    for (const pi_block& block: _pi_write_m.promoted_index) {
-        offsets.push_back(writer.offset() - start);
-        write_clustering_prefix(writer, block.first.kind, _schema, block.first.clustering);
-        write_clustering_prefix(writer, block.last.kind, _schema, block.last.clustering);
-        write_vint(writer, block.offset);
-        write_signed_vint(writer, block.width - width_base);
-        write(_sst.get_version(), writer, static_cast<std::byte>(block.open_marker ? 1 : 0));
-        if (block.open_marker) {
-            write(sstable_version_types::mc, writer, to_deletion_time(*block.open_marker));
-        }
+void writer::write_promoted_index() { // Writes the current partition's promoted index to *_index_writer.
+    if (_pi_write_m.promoted_index_size < 2) { // fewer than two blocks: only a zero size is written, no index body
+        write_vint(*_index_writer, uint64_t(0));
+        return;
    }
+    write_vint(_tmp_bufs, _partition_header_length); // header fields are staged in _tmp_bufs so their encoded size is known
+    write(_sst.get_version(), _tmp_bufs, to_deletion_time(_pi_write_m.tomb));
+    write_vint(_tmp_bufs, _pi_write_m.promoted_index_size);
+    uint64_t pi_size = _tmp_bufs.size() + _pi_write_m.blocks.size() + _pi_write_m.offsets.size(); // header + serialized blocks + offsets
+    write_vint(*_index_writer, pi_size); // size prefix goes first
+    flush_tmp_bufs(*_index_writer); // then the staged header
+    write(_sst.get_version(), *_index_writer, _pi_write_m.blocks); // then the blocks
+    write(_sst.get_version(), *_index_writer, _pi_write_m.offsets); // and finally the per-block offsets
+}

-    for (uint32_t offset: offsets) {
-        write(_sst.get_version(), writer, offset);
+void writer::write_pi_block(const pi_block& block) { // Serializes one promoted-index block into _pi_write_m.blocks and records its start in _pi_write_m.offsets.
+    static constexpr size_t width_base = 65536; // block width is stored as a signed delta from this base
+    bytes_ostream& blocks = _pi_write_m.blocks;
+    uint32_t offset = blocks.size(); // this entry starts at the number of bytes already serialized
+    write(_sst.get_version(), _pi_write_m.offsets, offset);
+    write_clustering_prefix(blocks, block.first.kind, _schema, block.first.clustering);
+    write_clustering_prefix(blocks, block.last.kind, _schema, block.last.clustering);
+    write_vint(blocks, block.offset);
+    write_signed_vint(blocks, block.width - width_base);
+    write(_sst.get_version(), blocks, static_cast<std::byte>(block.open_marker ? 1 : 0)); // one-byte flag: 1 when an open marker follows
+    if (block.open_marker) {
+        write(sstable_version_types::mc, blocks, to_deletion_time(*block.open_marker));
    }
 }

@@ -1307,21 +1348,11 @@ stop_iteration writer::consume_end_of_partition() {

    write(_sst.get_version(), *_data_writer, row_flags::end_of_partition);

-    if (!_pi_write_m.promoted_index.empty() && _pi_write_m.first_clustering) {
+    if (_pi_write_m.promoted_index_size && _pi_write_m.first_clustering) {
        add_pi_block();
    }

-    auto write_pi = [this] (file_writer& writer) {
-        return write_promoted_index(writer);
-    };
-
-    if (_pi_write_m.promoted_index.size() < 2) {
-        write_vint(*_index_writer, uint64_t(0));
-    } else {
-        uint64_t pi_size = calculate_write_size(write_pi);
-        write_vint(*_index_writer, pi_size);
-        write_pi(*_index_writer);
-    }
+    write_promoted_index();

    // compute size of the current row.
    _c_stats.partition_size = _data_writer->offset() - _c_stats.start_offset;
@@ -1336,10 +1367,15 @@ stop_iteration writer::consume_end_of_partition() {
        _first_key = *_partition_key;
    }
    _last_key = std::move(*_partition_key);
+    _partition_key = std::nullopt;
    return get_data_offset() < _cfg.max_sstable_size ? stop_iteration::no : stop_iteration::yes;
 }

 void writer::consume_end_of_stream() {
+    if (_partition_key) {
+        on_internal_error(sstlog, "Mutation stream ends with unclosed partition during write");
+    }
+
    _cfg.monitor->on_data_write_completed();

    seal_summary(_sst._components->summary, std::move(_first_key), std::move(_last_key), _index_sampling_state);
@@ -1348,9 +1384,10 @@ void writer::consume_end_of_stream() {
        _sst.get_metadata_collector().add_compression_ratio(_sst._components->compression.compressed_file_length(), _sst._components->compression.uncompressed_file_length());
    }

-    _index_writer->close();
-    _index_writer.reset();
+    close_writer(_index_writer);
    _sst.set_first_and_last_keys();
+
+    _sst._components->statistics.contents[metadata_type::Serialization] = std::make_unique<serialization_header>(std::move(_sst_schema.header));
    seal_statistics(_sst.get_version(), _sst._components->statistics, _sst.get_metadata_collector(),
            dht::global_partitioner().name(), _schema.bloom_filter_fp_chance(),
            _sst._schema, _sst.get_first_decorated_key(), _sst.get_last_decorated_key(), _enc_stats);
@@ -1363,6 +1400,9 @@ void writer::consume_end_of_stream() {
    if (!_cfg.correctly_serialize_non_compound_range_tombstones) {
        features.disable(sstable_feature::NonCompoundRangeTombstones);
    }
+    if (!_cfg.correctly_serialize_static_compact_in_mc) {
+        features.disable(sstable_feature::CorrectStaticCompact);
+    }
    _sst.write_scylla_metadata(_pc, _shard, std::move(features));
    _cfg.monitor->on_write_completed();
    if (!_cfg.leave_unsealed) {
--- a/sstables/mc/writer.hh
+++ b/sstables/mc/writer.hh
@@ -36,7 +36,5 @@ std::unique_ptr<sstable_writer::writer_impl> make_writer(sstable& sst,
    const io_priority_class& pc,
    shard_id shard);

-serialization_header make_serialization_header(const schema&, const encoding_stats&);
-
 }
 }
--- a/sstables/mp_row_consumer.cc
+++ b/sstables/mp_row_consumer.cc
@@ -44,6 +44,14 @@ namespace sstables {
 atomic_cell make_counter_cell(api::timestamp_type timestamp, bytes_view value) {
    static constexpr size_t shard_size = 32;

+    if (value.empty()) {
+        // This will never happen in a correct MC sstable but
+        // we had a bug #4363 that caused empty counters
+        // to be incorrectly stored inside sstables.
+        counter_cell_builder ccb;
+        return ccb.build(timestamp);
+    }
+
    data_input in(value);

    auto header_size = in.read<int16_t>();
@@ -53,13 +61,12 @@ atomic_cell make_counter_cell(api::timestamp_type timestamp, bytes_view value) {
            throw marshal_exception("encountered a local shard in a counter cell");
        }
    }
-    auto shard_count = value.size() / shard_size;
+    auto header_length = (size_t(header_size) + 1) * sizeof(int16_t);
+    auto shard_count = (value.size() - header_length) / shard_size;
    if (shard_count != size_t(header_size)) {
        throw marshal_exception("encountered remote shards in a counter cell");
    }

-    std::vector<counter_shard> shards;
-    shards.reserve(shard_count);
    counter_cell_builder ccb(shard_count);
    for (auto i = 0u; i < shard_count; i++) {
        auto id_hi = in.read<int64_t>();
--- a/sstables/mp_row_consumer.hh
+++ b/sstables/mp_row_consumer.hh
@@ -702,9 +702,12 @@ public:
    // Sets streamed_mutation::_end_of_range when there are no more fragments for the query range.
    // Returns information whether the parser should continue to parse more
    // input and produce more fragments or we have collected enough and should yield.
+    // Returns proceed::yes only when all pending fragments have been pushed.
    proceed push_ready_fragments() {
        if (_ready) {
-            return push_ready_fragments_with_ready_set();
+            if (push_ready_fragments_with_ready_set() == proceed::no) {
+                return proceed::no;
+            }
        }

        if (_out_of_range) {
@@ -808,6 +811,8 @@ class mp_row_consumer_m : public consumer_m {
    std::optional<new_mutation> _mutation;
    bool _is_mutation_end = true;
    streamed_mutation::forwarding _fwd;
+    // For static-compact tables C* stores the only row in the static row but in our representation they're regular rows.
+    const bool _treat_static_row_as_regular;

    std::optional<clustering_row> _in_progress_row;
    std::optional<range_tombstone> _stored_tombstone;
@@ -949,6 +954,8 @@ public:
        , _schema(schema)
        , _slice(slice)
        , _fwd(fwd)
+        , _treat_static_row_as_regular(_schema->is_static_compact_table()
+            && (!sst->has_scylla_component() || sst->features().is_enabled(sstable_feature::CorrectStaticCompact))) // See #4139
    {
        _cells.reserve(std::max(_schema->static_columns_count(), _schema->regular_columns_count()));
    }
@@ -1123,6 +1130,9 @@ public:

    virtual consumer_m::row_processing_result consume_static_row_start() override {
        sstlog.trace("mp_row_consumer_m {}: consume_static_row_start()", this);
+        if (_treat_static_row_as_regular) {
+            return consume_row_start({});
+        }
        _inside_static_row = true;
        _in_progress_static_row = static_row();
        return consumer_m::row_processing_result::do_proceed;
--- a/sstables/sstables.cc
+++ b/sstables/sstables.cc
@@ -1023,9 +1023,26 @@ void sstable::write_simple(const T& component, const io_priority_class& pc) {
    options.buffer_size = sstable_buffer_size;
    options.io_priority_class = pc;
    auto w = file_writer(std::move(f), std::move(options));
-    write(_version, w, component);
-    w.flush();
-    w.close();
+    std::exception_ptr eptr;
+    try {
+        write(_version, w, component);
+        w.flush();
+    } catch (...) {
+        eptr = std::current_exception();
+    }
+    try {
+        w.close();
+    } catch (...) {
+        std::exception_ptr close_eptr = std::current_exception();
+        sstlog.warn("failed to close file_writer: {}", close_eptr);
+        // If write succeeded but close failed, we rethrow close's exception.
+        if (!eptr) {
+            eptr = close_eptr;
+        }
+    }
+    if (eptr) {
+        std::rethrow_exception(eptr);
+    }
 }

 template future<> sstable::read_simple<component_type::Filter>(sstables::filter& f, const io_priority_class& pc);
@@ -1816,11 +1833,6 @@ void seal_statistics(sstable_version_types v, statistics& s, metadata_collector&
    collector.construct_stats(stats);
    s.contents[metadata_type::Stats] = std::make_unique<stats_metadata>(std::move(stats));

-    if (v == sstable_version_types::mc) {
-        auto header = mc::make_serialization_header(*schema, enc_stats);
-        s.contents[metadata_type::Serialization] = std::make_unique<serialization_header>(std::move(header));
-    }
-
    populate_statistics_offsets(v, s);
 }

@@ -2082,11 +2094,15 @@ stop_iteration components_writer::consume_end_of_partition() {
        _first_key = *_partition_key;
    }
    _last_key = std::move(*_partition_key);
+    _partition_key = stdx::nullopt;

    return get_offset() < _max_sstable_size ? stop_iteration::no : stop_iteration::yes;
 }

 void components_writer::consume_end_of_stream() {
+    if (_partition_key) {
+        on_internal_error(sstlog, "Mutation stream ends with unclosed partition during write");
+    }
    // what if there is only one partition? what if it is empty?
    seal_summary(_sst._components->summary, std::move(_first_key), std::move(_last_key), _index_sampling_state);

@@ -3084,6 +3100,10 @@ bool supports_correct_non_compound_range_tombstones() {
    return service::get_local_storage_service().cluster_supports_reading_correctly_serialized_range_tombstones();
 }

+bool supports_correct_static_compact_in_mc() { // Reports the local storage service's cluster_supports_correct_static_compact_in_mc() result as a bool.
+    return bool(service::get_local_storage_service().cluster_supports_correct_static_compact_in_mc());
+}
+
 }

 std::ostream& operator<<(std::ostream& out, const sstables::component_type& comp_type) {
--- a/sstables/sstables.hh
+++ b/sstables/sstables.hh
@@ -104,6 +104,7 @@ class data_consume_context;
 class index_reader;

 bool supports_correct_non_compound_range_tombstones();
+bool supports_correct_static_compact_in_mc();

 struct sstable_writer_config {
    std::experimental::optional<size_t> promoted_index_block_size;
@@ -113,6 +114,7 @@ struct sstable_writer_config {
    stdx::optional<db::replay_position> replay_position;
    write_monitor* monitor = &default_write_monitor();
    bool correctly_serialize_non_compound_range_tombstones = supports_correct_non_compound_range_tombstones();
+    bool correctly_serialize_static_compact_in_mc = supports_correct_static_compact_in_mc();
    db::large_partition_handler* large_partition_handler;
 };

@@ -624,6 +626,13 @@ public:
        return has_scylla_component() && _components->scylla_metadata->has_feature(sstable_feature::ShadowableTombstones);
    }

+    sstable_enabled_features features() const { // Feature set stored in this sstable's Scylla metadata.
+        if (!has_scylla_component()) {
+            return {}; // no Scylla component: value-initialized feature set
+        }
+        return _components->scylla_metadata->get_features();
+    }
+
    bool has_correct_max_deletion_time() const {
        return (_version == sstable_version_types::mc) || has_scylla_component();
    }
--- a/sstables/types.hh
+++ b/sstables/types.hh
@@ -410,16 +410,17 @@ struct serialization_header : public metadata_base<serialization_header> {
    }

    // mc serialization header minimum values are delta-encoded based on the default timestamp epoch times
+    // Note: following conversions rely on min_*_base.value being unsigned to prevent signed integer overflow
    api::timestamp_type get_min_timestamp() const {
        return static_cast<api::timestamp_type>(min_timestamp_base.value + encoding_stats::timestamp_epoch);
    }

    int32_t get_min_ttl() const {
-        return static_cast<int32_t>(min_ttl_base.value) + encoding_stats::ttl_epoch;
+        return static_cast<int32_t>(min_ttl_base.value + encoding_stats::ttl_epoch);
    }

    int32_t get_min_local_deletion_time() const {
-        return static_cast<int32_t>(min_local_deletion_time_base.value) + encoding_stats::deletion_time_epoch;
+        return static_cast<int32_t>(min_local_deletion_time_base.value + encoding_stats::deletion_time_epoch);
    }
 };

@@ -455,7 +456,9 @@ enum sstable_feature : uint8_t {
    NonCompoundPIEntries = 0,       // See #2993
    NonCompoundRangeTombstones = 1, // See #2986
    ShadowableTombstones = 2, // See #3885
-    End = 4,
+    CorrectStaticCompact = 3, // See #4139
+    CorrectEmptyCounters = 4, // See #4363
+    End = 5,
 };

 // Scylla-specific features enabled for a particular sstable.
@@ -504,9 +507,15 @@ struct scylla_metadata {
            disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::ExtensionAttributes, extension_attributes>
            > data;

-    bool has_feature(sstable_feature f) const {
+    sstable_enabled_features get_features() const {
        auto features = data.get<scylla_metadata_type::Features, sstable_enabled_features>();
-        return features && features->is_enabled(f);
+        if (!features) {
+            return sstable_enabled_features{};
+        }
+        return *features;
+    }
+    bool has_feature(sstable_feature f) const {
+        return get_features().is_enabled(f);
    }
    const extension_attributes* get_extension_attributes() const {
        return data.get<scylla_metadata_type::ExtensionAttributes, extension_attributes>();
--- a/sstables/writer.hh
+++ b/sstables/writer.hh
@@ -136,6 +136,9 @@ public:
            , _full_checksum(full_file_checksum)
            {}

+    virtual temporary_buffer<char> allocate_buffer(size_t size) override { // Forwards buffer allocation to _out.
+        return _out.allocate_buffer(size); // preserve alignment requirements
+    }
    future<> put(net::packet data) { abort(); }
    virtual future<> put(temporary_buffer<char> buf) override {
        // bufs will usually be a multiple of chunk size, but this won't be the case for
--- a/streaming/stream_manager.cc
+++ b/streaming/stream_manager.cc
@@ -292,7 +292,7 @@ void stream_manager::on_restart(inet_address endpoint, endpoint_state ep_state)
 }

 void stream_manager::on_dead(inet_address endpoint, endpoint_state ep_state) {
-    if (has_peer(endpoint) && ep_state.is_shutdown()) {
+    if (has_peer(endpoint)) {
        sslog.info("stream_manager: Close all stream_session with peer = {} in on_dead", endpoint);
        get_stream_manager().invoke_on_all([endpoint] (auto& sm) {
            sm.fail_sessions(endpoint);
--- a/streaming/stream_mutation_fragments_cmd.hh
+++ b/streaming/stream_mutation_fragments_cmd.hh
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+namespace streaming {
+
+enum class stream_mutation_fragments_cmd : uint8_t { // Optional command sent alongside each frozen_mutation_fragment on the stream_mutation_fragments RPC source.
+    error, // receiver fails the stream with "Sender failed"
+    mutation_fragment_data, // receiver unfreezes and returns the accompanying fragment
+    end_of_stream, // receiver records a clean end of stream and returns no fragment
+};
+
+
+}
--- a/streaming/stream_session.cc
+++ b/streaming/stream_session.cc
@@ -63,6 +63,7 @@
 #include "db/system_keyspace.hh"
 #include <boost/algorithm/cxx11/any_of.hpp>
 #include <boost/range/adaptor/map.hpp>
+#include "streaming/stream_mutation_fragments_cmd.hh"

 namespace streaming {

@@ -214,22 +215,52 @@ void stream_session::init_messaging_service_handler() {
            });
        });
    });
-    ms().register_stream_mutation_fragments([] (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<stream_reason> reason_opt, rpc::source<frozen_mutation_fragment> source) {
+    ms().register_stream_mutation_fragments([] (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<stream_reason> reason_opt, rpc::source<frozen_mutation_fragment, rpc::optional<stream_mutation_fragments_cmd>> source) {
        auto from = netw::messaging_service::get_source(cinfo);
        auto reason = reason_opt ? *reason_opt: stream_reason::unspecified;
        sslog.trace("Got stream_mutation_fragments from {} reason {}", from, int(reason));
+        if (!_sys_dist_ks->local_is_initialized() || !_view_update_generator->local_is_initialized()) {
+            return make_exception_future<rpc::sink<int>>(std::runtime_error(format("Node {} is not fully initialized for streaming, try again later",
+                    utils::fb_utilities::get_broadcast_address())));
+        }
        return with_scheduling_group(service::get_local_storage_service().db().local().get_streaming_scheduling_group(), [from, estimated_partitions, plan_id, schema_id, cf_id, source, reason] () mutable {
                return service::get_schema_for_write(schema_id, from).then([from, estimated_partitions, plan_id, schema_id, cf_id, source, reason] (schema_ptr s) mutable {
                    auto sink = ms().make_sink_for_stream_mutation_fragments(source);
-                    auto get_next_mutation_fragment = [source, plan_id, from, s] () mutable {
-                        return source().then([plan_id, from, s] (stdx::optional<std::tuple<frozen_mutation_fragment>> fmf_opt) mutable {
-                            if (fmf_opt) {
-                                frozen_mutation_fragment& fmf = std::get<0>(fmf_opt.value());
+                    struct stream_mutation_fragments_cmd_status {
+                        bool got_cmd = false;
+                        bool got_end_of_stream = false;
+                    };
+                    auto cmd_status = make_lw_shared<stream_mutation_fragments_cmd_status>();
+                    auto get_next_mutation_fragment = [source, plan_id, from, s, cmd_status] () mutable {
+                        return source().then([plan_id, from, s, cmd_status] (stdx::optional<std::tuple<frozen_mutation_fragment, rpc::optional<stream_mutation_fragments_cmd>>> opt) mutable {
+                            if (opt) {
+                                auto cmd = std::get<1>(*opt);
+                                if (cmd) {
+                                    cmd_status->got_cmd = true;
+                                    switch (*cmd) {
+                                    case stream_mutation_fragments_cmd::mutation_fragment_data:
+                                        break;
+                                    case stream_mutation_fragments_cmd::error:
+                                        return make_exception_future<mutation_fragment_opt>(std::runtime_error("Sender failed"));
+                                    case stream_mutation_fragments_cmd::end_of_stream:
+                                        cmd_status->got_end_of_stream = true;
+                                        return make_ready_future<mutation_fragment_opt>();
+                                    default:
+                                        return make_exception_future<mutation_fragment_opt>(std::runtime_error("Sender sent wrong cmd"));
+                                    }
+                                }
+                                frozen_mutation_fragment& fmf = std::get<0>(*opt);
                                auto sz = fmf.representation().size();
                                auto mf = fmf.unfreeze(*s);
                                streaming::get_local_stream_manager().update_progress(plan_id, from.addr, progress_info::direction::IN, sz);
                                return make_ready_future<mutation_fragment_opt>(std::move(mf));
                            } else {
+                                // If the sender has sent stream_mutation_fragments_cmd it means it is
+                                // a node that understands the new protocol. It must send end_of_stream
+                                // before closing the stream.
+                                if (cmd_status->got_cmd && !cmd_status->got_end_of_stream) {
+                                    return make_exception_future<mutation_fragment_opt>(std::runtime_error("Sender did not sent end_of_stream"));
+                                }
                                return make_ready_future<mutation_fragment_opt>();
                            }
                        });
@@ -644,8 +675,7 @@ void stream_session::close_session(stream_session_state final_state) {
            _stream_result->handle_session_complete(shared_from_this());
        }

-        sslog.debug("[Stream #{}] close_session session={}, state={}, cancel keep_alive timer", plan_id(), this, final_state);
-        _keep_alive.cancel();
+        sslog.debug("[Stream #{}] close_session session={}, state={}", plan_id(), this, final_state);
    }
 }

@@ -672,41 +702,6 @@ bool stream_session::is_initialized() const {

 void stream_session::init(shared_ptr<stream_result_future> stream_result_) {
    _stream_result = stream_result_;
-    _keep_alive.set_callback([this] {
-        auto plan_id = this->plan_id();
-        auto peer = this->peer;
-        get_local_stream_manager().get_progress_on_all_shards(plan_id, peer).then([this, peer, plan_id] (stream_bytes sbytes) {
-            if (this->_is_aborted) {
-                sslog.info("[Stream #{}] The session {} is closed, keep alive timer will do nothing", plan_id, this);
-                return;
-            }
-            auto now = lowres_clock::now();
-            sslog.debug("[Stream #{}] keep alive timer callback sbytes old: tx={}, rx={} new: tx={} rx={}",
-                    plan_id, this->_last_stream_bytes.bytes_sent, this->_last_stream_bytes.bytes_received,
-                    sbytes.bytes_sent, sbytes.bytes_received);
-            if (sbytes.bytes_sent > this->_last_stream_bytes.bytes_sent ||
-                sbytes.bytes_received > this->_last_stream_bytes.bytes_received) {
-                sslog.debug("[Stream #{}] The session {} made progress with peer {}", plan_id, this, peer);
-                // Progress has been made
-                this->_last_stream_bytes = sbytes;
-                this->_last_stream_progress = now;
-                this->start_keep_alive_timer();
-            } else if (now - this->_last_stream_progress >= this->_keep_alive_timeout) {
-                // Timeout
-                sslog.info("[Stream #{}] The session {} is idle for {} seconds, the peer {} is probably gone, close it",
-                        plan_id, this, this->_keep_alive_timeout.count(), peer);
-                this->on_error();
-            } else {
-                // Start the timer to check again
-                sslog.info("[Stream #{}] The session {} made no progress with peer {}", plan_id, this, peer);
-                this->start_keep_alive_timer();
-            }
-        }).handle_exception([plan_id, peer, session = this->shared_from_this()] (auto ep) {
-           sslog.info("[Stream #{}] keep alive timer callback fails with peer {}: {}", plan_id, peer, ep);
-        });
-    });
-    _last_stream_progress = lowres_clock::now();
-    start_keep_alive_timer();
 }

 utils::UUID stream_session::plan_id() {
--- a/streaming/stream_session.hh
+++ b/streaming/stream_session.hh
@@ -180,14 +180,6 @@ private:
    bool _complete_sent = false;
    bool _received_failed_complete_message = false;

-    // If the session is idle for 10 minutes, close the session
-    std::chrono::seconds _keep_alive_timeout{60 * 10};
-    // Check every 1 minutes
-    std::chrono::seconds _keep_alive_interval{60};
-    timer<lowres_clock> _keep_alive;
-    stream_bytes _last_stream_bytes;
-    lowres_clock::time_point _last_stream_progress;
-
    session_info _session_info;

    stream_reason _reason = stream_reason::unspecified;
@@ -198,9 +190,6 @@ public:
    void set_reason(stream_reason reason) {
        _reason = reason;
    }
-    void start_keep_alive_timer() {
-        _keep_alive.rearm(lowres_clock::now() + _keep_alive_interval);
-    }

    void add_bytes_sent(int64_t bytes) {
        _bytes_sent += bytes;
--- a/streaming/stream_transfer_task.cc
+++ b/streaming/stream_transfer_task.cc
@@ -42,6 +42,7 @@
 #include "streaming/stream_session.hh"
 #include "streaming/stream_manager.hh"
 #include "streaming/stream_reason.hh"
+#include "streaming/stream_mutation_fragments_cmd.hh"
 #include "mutation_reader.hh"
 #include "frozen_mutation.hh"
 #include "mutation.hh"
@@ -104,6 +105,21 @@ struct send_info {
        , prs(to_partition_ranges(ranges))
        , reader(cf.make_streaming_reader(cf.schema(), prs)) {
    }
+    // Resolves to true if, for at least one entry of `ranges`, the first next() of a
+    // selective_token_range_sharder built for engine().cpu_id() is non-empty.
+    future<bool> has_relevant_range_on_this_shard() {
+        return do_with(false, [this] (bool& any_local) {
+            return do_for_each(ranges, [this, &any_local] (dht::token_range range) {
+                if (any_local) {
+                    return; // a match was already recorded; skip the sharder for the rest
+                }
+                auto sharder = dht::selective_token_range_sharder(range, engine().cpu_id());
+                if (sharder.next()) {
+                    any_local = true;
+                }
+            }).then([&any_local] { return any_local; });
+        });
+    }
    future<size_t> estimate_partitions() {
        return do_with(cf.get_sstables(), size_t(0), [this] (auto& sstables, size_t& partition_count) {
            return do_for_each(*sstables, [this, &partition_count] (auto& sst) {
@@ -160,7 +176,7 @@ future<> send_mutations(lw_shared_ptr<send_info> si) {
 future<> send_mutation_fragments(lw_shared_ptr<send_info> si) {
  return si->estimate_partitions().then([si] (size_t estimated_partitions) {
    sslog.info("[Stream #{}] Start sending ks={}, cf={}, estimated_partitions={}, with new rpc streaming", si->plan_id, si->cf.schema()->ks_name(), si->cf.schema()->cf_name(), estimated_partitions);
-    return netw::get_local_messaging_service().make_sink_and_source_for_stream_mutation_fragments(si->reader.schema()->version(), si->plan_id, si->cf_id, estimated_partitions, si->reason, si->id).then([si] (rpc::sink<frozen_mutation_fragment> sink, rpc::source<int32_t> source) mutable {
+    return netw::get_local_messaging_service().make_sink_and_source_for_stream_mutation_fragments(si->reader.schema()->version(), si->plan_id, si->cf_id, estimated_partitions, si->reason, si->id).then([si] (rpc::sink<frozen_mutation_fragment, stream_mutation_fragments_cmd> sink, rpc::source<int32_t> source) mutable {
        auto got_error_from_peer = make_lw_shared<bool>(false);

        auto source_op = [source, got_error_from_peer, si] () mutable -> future<> {
@@ -183,18 +199,25 @@ future<> send_mutation_fragments(lw_shared_ptr<send_info> si) {
        }();

        auto sink_op = [sink, si, got_error_from_peer] () mutable -> future<> {
-            return do_with(std::move(sink), [si, got_error_from_peer] (rpc::sink<frozen_mutation_fragment>& sink) {
+            return do_with(std::move(sink), [si, got_error_from_peer] (rpc::sink<frozen_mutation_fragment, stream_mutation_fragments_cmd>& sink) {
                return repeat([&sink, si, got_error_from_peer] () mutable {
                    return si->reader(db::no_timeout).then([&sink, si, s = si->reader.schema(), got_error_from_peer] (mutation_fragment_opt mf) mutable {
                        if (mf && !(*got_error_from_peer)) {
                            frozen_mutation_fragment fmf = freeze(*s, *mf);
                            auto size = fmf.representation().size();
                            streaming::get_local_stream_manager().update_progress(si->plan_id, si->id.addr, streaming::progress_info::direction::OUT, size);
-                            return sink(fmf).then([] { return stop_iteration::no; });
+                            return sink(fmf, stream_mutation_fragments_cmd::mutation_fragment_data).then([] { return stop_iteration::no; });
                        } else {
                            return make_ready_future<stop_iteration>(stop_iteration::yes);
                        }
                    });
+                }).then([&sink] () mutable {
+                    return sink(frozen_mutation_fragment(bytes_ostream()), stream_mutation_fragments_cmd::end_of_stream);
+                }).handle_exception([&sink] (std::exception_ptr ep) mutable {
+                    // Notify the receiver the sender has failed
+                    return sink(frozen_mutation_fragment(bytes_ostream()), stream_mutation_fragments_cmd::error).then([ep = std::move(ep)] () mutable {
+                        return make_exception_future<>(std::move(ep));
+                    });
                }).finally([&sink] () mutable {
                    return sink.close();
                });
@@ -221,11 +244,18 @@ future<> stream_transfer_task::execute() {
    auto reason = session->get_reason();
    return session->get_db().invoke_on_all([plan_id, cf_id, id, dst_cpu_id, ranges=this->_ranges, streaming_with_rpc_stream, reason] (database& db) {
        auto si = make_lw_shared<send_info>(db, plan_id, cf_id, std::move(ranges), id, dst_cpu_id, reason);
-        if (streaming_with_rpc_stream) {
-            return send_mutation_fragments(std::move(si));
-        } else {
-            return send_mutations(std::move(si));
-        }
+        return si->has_relevant_range_on_this_shard().then([si, plan_id, cf_id, streaming_with_rpc_stream] (bool has_relevant_range_on_this_shard) {
+            if (!has_relevant_range_on_this_shard) {
+                sslog.debug("[Stream #{}] stream_transfer_task: cf_id={}: ignore ranges on shard={}",
+                        plan_id, cf_id, engine().cpu_id());
+                return make_ready_future<>();
+            }
+            if (streaming_with_rpc_stream) {
+                return send_mutation_fragments(std::move(si));
+            } else {
+                return send_mutations(std::move(si));
+            }
+        });
    }).then([this, plan_id, cf_id, id, streaming_with_rpc_stream] {
        sslog.debug("[Stream #{}] SEND STREAM_MUTATION_DONE to {}, cf_id={}", plan_id, id, cf_id);
        return session->ms().send_stream_mutation_done(id, plan_id, _ranges,
@@ -235,7 +265,6 @@ future<> stream_transfer_task::execute() {
        });
    }).then([this, id, plan_id, cf_id] {
        sslog.debug("[Stream #{}] GOT STREAM_MUTATION_DONE Reply from {}", plan_id, id.addr);
-        session->start_keep_alive_timer();
    }).handle_exception([this, plan_id, id] (auto ep){
        sslog.warn("[Stream #{}] stream_transfer_task: Fail to send to {}: {}", plan_id, id, ep);
        std::rethrow_exception(ep);
--- a/Show More
+++ b/Show More