build: pass C compiler configuration in dist package build

Just like we allow customizing the C++ compiler, we should allow customizing the C compiler. Ref #3978 Message-Id: <20181211172821.30830-1-avi@scylladb.com> (cherry picked from commit fa96e07e6b)
sstables: index_reader: Avoid schema copy in advance_to()
2018-12-12 14:41:38 +02:00 · 2018-12-12 14:38:49 +02:00 · 2018-12-12 00:32:35 +00:00 · 2018-12-12 00:32:35 +00:00 · 2018-12-11 19:24:24 +02:00 · 2018-12-11 14:53:30 +00:00
283 changed files with 6837 additions and 2705 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
@@ -9,3 +9,6 @@
 [submodule "xxHash"]
 	path = xxHash
 	url = ../xxHash
+[submodule "libdeflate"]
+	path = libdeflate
+	url = ../libdeflate
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -138,4 +138,5 @@ target_include_directories(scylla PUBLIC
        ${SEASTAR_INCLUDE_DIRS}
        ${Boost_INCLUDE_DIRS}
        xxhash
+        libdeflate
        build/release/gen)
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 #!/bin/sh

-VERSION=666.development
+VERSION=3.0.rc2

 if test -f version
 then
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -2228,11 +2228,11 @@
               "description":"The column family"
            },
            "total":{
-               "type":"int",
+               "type":"long",
               "description":"The total snapshot size"
            },
            "live":{
-               "type":"int",
+               "type":"long",
               "description":"The live snapshot size"
            }
         }
--- a/auth/common.cc
+++ b/auth/common.cc
@@ -87,11 +87,17 @@ future<> create_metadata_table_if_missing(
    return mm.announce_new_column_family(b.build(), false);
 }

-future<> wait_for_schema_agreement(::service::migration_manager& mm, const database& db) {
+future<> wait_for_schema_agreement(::service::migration_manager& mm, const database& db, seastar::abort_source& as) {
    static const auto pause = [] { return sleep(std::chrono::milliseconds(500)); };

-    return do_until([&db] { return db.get_version() != database::empty_version; }, pause).then([&mm] {
-        return do_until([&mm] { return mm.have_schema_agreement(); }, pause);
+    return do_until([&db, &as] {
+        as.check();
+        return db.get_version() != database::empty_version;
+    }, pause).then([&mm, &as] {
+        return do_until([&mm, &as] {
+            as.check();
+            return mm.have_schema_agreement();
+        }, pause);
    });
 }

--- a/auth/common.hh
+++ b/auth/common.hh
@@ -81,7 +81,7 @@ future<> create_metadata_table_if_missing(
        stdx::string_view cql,
        ::service::migration_manager&);

-future<> wait_for_schema_agreement(::service::migration_manager&, const database&);
+future<> wait_for_schema_agreement(::service::migration_manager&, const database&, seastar::abort_source&);

 ///
 /// Time-outs for internal, non-local CQL queries.
--- a/auth/default_authorizer.cc
+++ b/auth/default_authorizer.cc
@@ -160,7 +160,7 @@ future<> default_authorizer::start() {
                _migration_manager).then([this] {
            _finished = do_after_system_ready(_as, [this] {
                return async([this] {
-                    wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
+                    wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();

                    if (legacy_metadata_exists()) {
                        if (!any_granted().get0()) {
@@ -178,7 +178,7 @@ future<> default_authorizer::start() {

 future<> default_authorizer::stop() {
    _as.request_abort();
-    return _finished.handle_exception_type([](const sleep_aborted&) {});
+    return _finished.handle_exception_type([](const sleep_aborted&) {}).handle_exception_type([](const abort_requested_exception&) {});
 }

 future<permission_set>
--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -157,7 +157,7 @@ future<> password_authenticator::start() {

         _stopped = do_after_system_ready(_as, [this] {
             return async([this] {
-                 wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
+                 wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();

                 if (any_nondefault_role_row_satisfies(_qp, &has_salted_hash).get0()) {
                     if (legacy_metadata_exists()) {
@@ -182,7 +182,7 @@ future<> password_authenticator::start() {

 future<> password_authenticator::stop() {
    _as.request_abort();
-    return _stopped.handle_exception_type([] (const sleep_aborted&) { });
+    return _stopped.handle_exception_type([] (const sleep_aborted&) { }).handle_exception_type([](const abort_requested_exception&) {});
 }

 db::consistency_level password_authenticator::consistency_for_user(stdx::string_view role_name) {
--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -227,7 +227,7 @@ future<> standard_role_manager::start() {
        return this->create_metadata_tables_if_missing().then([this] {
            _stopped = auth::do_after_system_ready(_as, [this] {
                return seastar::async([this] {
-                    wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
+                    wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();

                    if (any_nondefault_role_row_satisfies(_qp, &has_can_login).get0()) {
                        if (this->legacy_metadata_exists()) {
@@ -251,7 +251,7 @@ future<> standard_role_manager::start() {

 future<> standard_role_manager::stop() {
    _as.request_abort();
-    return _stopped.handle_exception_type([] (const sleep_aborted&) { });
+    return _stopped.handle_exception_type([] (const sleep_aborted&) { }).handle_exception_type([](const abort_requested_exception&) {}); // swallow both abort-related exception types
 }

 future<> standard_role_manager::create_or_replace(stdx::string_view role_name, const role_config& c) const {
--- a/clustering_ranges_walker.hh
+++ b/clustering_ranges_walker.hh
@@ -200,8 +200,9 @@ public:
        return _current_start;
    }

-    position_in_partition_view upper_bound() const {
-        return _current_end;
+    // Returns the upper bound of the last range in the provided set of ranges
+    position_in_partition_view uppermost_bound() const {
+        return position_in_partition_view::for_range_end(_ranges.back());
    }

    // When lower_bound() changes, this also does
--- a/compress.cc
+++ b/compress.cc
@@ -112,7 +112,7 @@ const sstring compression_parameters::CHUNK_LENGTH_KB = "chunk_length_kb";
 const sstring compression_parameters::CRC_CHECK_CHANCE = "crc_check_chance";

 compression_parameters::compression_parameters()
-    : compression_parameters(nullptr)
+    : compression_parameters(compressor::lz4)
 {}

 compression_parameters::~compression_parameters()
--- a/compress.hh
+++ b/compress.hh
@@ -118,6 +118,10 @@ public:
    std::map<sstring, sstring> get_options() const;
    bool operator==(const compression_parameters& other) const;
    bool operator!=(const compression_parameters& other) const;
+
+    static compression_parameters no_compression() {
+        return compression_parameters(nullptr);
+    }
 private:
    void validate_options(const std::map<sstring, sstring>&);
 };
--- a/configure.py
+++ b/configure.py
@@ -197,7 +197,9 @@ class Thrift(object):

 def default_target_arch():
    if platform.machine() in ['i386', 'i686', 'x86_64']:
-        return 'nehalem'
+        return 'westmere'   # support PCLMUL
+    elif platform.machine() == 'aarch64':
+        return 'armv8-a+crc+crypto'
    else:
        return ''

@@ -271,6 +273,7 @@ scylla_tests = [
    'tests/perf/perf_sstable',
    'tests/cql_query_test',
    'tests/secondary_index_test',
+    'tests/filtering_test',
    'tests/storage_proxy_test',
    'tests/schema_change_test',
    'tests/mutation_reader_test',
@@ -306,6 +309,7 @@ scylla_tests = [
    'tests/log_heap_test',
    'tests/managed_vector_test',
    'tests/crc_test',
+    'tests/checksum_utils_test',
    'tests/flush_queue_test',
    'tests/dynamic_bitset_test',
    'tests/auth_test',
@@ -356,6 +360,7 @@ scylla_tests = [

 perf_tests = [
    'tests/perf/perf_mutation_readers',
+    'tests/perf/perf_checksum',
    'tests/perf/perf_mutation_fragment',
    'tests/perf/perf_idl',
 ]
@@ -431,6 +436,7 @@ extra_cxxflags = {}
 cassandra_interface = Thrift(source='interface/cassandra.thrift', service='Cassandra')

 scylla_core = (['database.cc',
+                'table.cc',
                'atomic_cell.cc',
                'schema.cc',
                'frozen_schema.cc',
@@ -579,6 +585,7 @@ scylla_core = (['database.cc',
                'db/marshal/type_parser.cc',
                'db/batchlog_manager.cc',
                'db/view/view.cc',
+                'db/view/view_update_from_staging_generator.cc',
                'db/view/row_locking.cc',
                'index/secondary_index_manager.cc',
                'index/secondary_index.cc',
@@ -592,6 +599,7 @@ scylla_core = (['database.cc',
                'utils/managed_bytes.cc',
                'utils/exceptions.cc',
                'utils/config_file.cc',
+                'utils/gz/crc_combine.cc',
                'gms/version_generator.cc',
                'gms/versioned_value.cc',
                'gms/gossiper.cc',
@@ -682,6 +690,7 @@ scylla_core = (['database.cc',
                'data/cell.cc',
                'multishard_writer.cc',
                'multishard_mutation_query.cc',
+                'reader_concurrency_semaphore.cc',
                ] + [Antlr3Grammar('cql3/Cql.g')] + [Thrift('interface/cassandra.thrift', 'Cassandra')]
               )

@@ -773,6 +782,7 @@ pure_boost_tests = set([
    'tests/test-serialization',
    'tests/range_test',
    'tests/crc_test',
+    'tests/checksum_utils_test',
    'tests/managed_vector_test',
    'tests/dynamic_bitset_test',
    'tests/idl_test',
@@ -1001,6 +1011,8 @@ seastar_ldflags = args.user_ldflags
 seastar_flags += ['--compiler', args.cxx, '--c-compiler', args.cc, '--cflags=%s' % (seastar_cflags), '--ldflags=%s' % (seastar_ldflags),
                  '--c++-dialect=gnu++1z', '--optflags=%s' % (modes['release']['opt']), ]

+libdeflate_cflags = seastar_cflags
+
 status = subprocess.call([args.python, './configure.py'] + seastar_flags, cwd='seastar')

 if status != 0:
@@ -1100,6 +1112,9 @@ with open(buildfile, 'w') as f:
            command = {ninja} -C $subdir $target
            restat = 1
            description = NINJA $out
+        rule run
+            command = $in > $out
+            description = GEN $out
        rule copy
            command = cp $in $out
            description = COPY $out
@@ -1172,6 +1187,10 @@ with open(buildfile, 'w') as f:
            if binary.endswith('.a'):
                f.write('build $builddir/{}/{}: ar.{} {}\n'.format(mode, binary, mode, str.join(' ', objs)))
            else:
+                objs.extend(['$builddir/' + mode + '/' + artifact for artifact in [
+                    'libdeflate/libdeflate.a'
+                ]])
+                objs.append('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o')
                if binary.startswith('tests/'):
                    local_libs = '$libs'
                    if binary not in tests_not_using_seastar_test_framework or binary in pure_boost_tests:
@@ -1213,6 +1232,12 @@ with open(buildfile, 'w') as f:
                    antlr3_grammars.add(src)
                else:
                    raise Exception('No rule for ' + src)
+        compiles['$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o'] = '$builddir/' + mode + '/gen/utils/gz/crc_combine_table.cc'
+        compiles['$builddir/' + mode + '/utils/gz/gen_crc_combine_table.o'] = 'utils/gz/gen_crc_combine_table.cc'
+        f.write('build {}: run {}\n'.format('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.cc',
+                                            '$builddir/' + mode + '/utils/gz/gen_crc_combine_table'))
+        f.write('build {}: link.{} {}\n'.format('$builddir/' + mode + '/utils/gz/gen_crc_combine_table', mode,
+                                                '$builddir/' + mode + '/utils/gz/gen_crc_combine_table.o'))
        for obj in compiles:
            src = compiles[obj]
            gen_headers = list(ragels.keys())
@@ -1262,6 +1287,10 @@ with open(buildfile, 'w') as f:
            ''').format(**locals()))
        f.write('build build/$mode/scylla-package.tar: package build/{mode}/scylla build/{mode}/iotune\n'.format(**locals()))
        f.write('    mode = {mode}\n'.format(**locals()))
+        f.write('rule libdeflate.{mode}\n'.format(**locals()))
+        f.write('    command = make -C libdeflate BUILD_DIR=../build/{mode}/libdeflate/ CFLAGS="{libdeflate_cflags}" CC={args.cc}\n'.format(**locals()))
+        f.write('build build/{mode}/libdeflate/libdeflate.a: libdeflate.{mode}\n'.format(**locals()))
+
    f.write('build {}: phony\n'.format(seastar_deps))
    f.write(textwrap.dedent('''\
        rule configure
--- a/cql3/error_collector.hh
+++ b/cql3/error_collector.hh
@@ -67,6 +67,12 @@ class error_collector : public error_listener<RecognizerType, ExceptionBaseType>
     */
    const sstring_view _query;

+    /**
+     * An empty bitset to be used as a workaround for AntLR null dereference
+     * bug.
+     */
+    static typename ExceptionBaseType::BitsetListType _empty_bit_list;
+
 public:

    /**
@@ -144,6 +150,14 @@ private:
            break;
        }
        default:
+            // The ANTLR exception class has a bug: displayRecognitionError()
+            // may dereference a null pointer. To avoid that, make sure the
+            // expecting set is non-null (substituting an empty one) before
+            // calling displayRecognitionError().
+            // bug reference: https://github.com/antlr/antlr3/issues/191
+            if (!ex->get_expectingSet()) {
+                ex->set_expectingSet(&_empty_bit_list);
+            }
            ex->displayRecognitionError(token_names, msg);
        }
        return msg.str();
@@ -345,4 +359,8 @@ private:
 #endif
 };

+template<typename RecognizerType, typename TokenType, typename ExceptionBaseType>
+typename ExceptionBaseType::BitsetListType
+error_collector<RecognizerType,TokenType,ExceptionBaseType>::_empty_bit_list = typename ExceptionBaseType::BitsetListType();
+
 }
--- a/cql3/restrictions/primary_key_restrictions.hh
+++ b/cql3/restrictions/primary_key_restrictions.hh
@@ -106,6 +106,11 @@ public:
    virtual size_t prefix_size() const {
        return 0;
    }
+
+    size_t prefix_size(const schema_ptr schema) const {
+        return 0;
+    }
+
 };

 template<>
@@ -129,5 +134,23 @@ inline bool primary_key_restrictions<clustering_key>::needs_filtering(const sche
    return false;
 }

+// Counts the restricted columns that sit, one after another, at the schema's clustering column positions starting from the first.
+template<>
+inline size_t primary_key_restrictions<clustering_key>::prefix_size(const schema_ptr schema) const {
+    if (schema->clustering_key_columns().empty()) {
+        return 0;
+    }
+    size_t matched = 0;
+    column_id next_id = schema->clustering_key_columns().begin()->id;
+    for (auto&& cdef : get_column_defs()) {
+        if (schema->position(*cdef) != next_id) {
+            break;
+        }
+        ++next_id;
+        ++matched;
+    }
+    return matched;
+}
+
 }
 }
--- a/cql3/restrictions/single_column_primary_key_restrictions.hh
+++ b/cql3/restrictions/single_column_primary_key_restrictions.hh
@@ -166,19 +166,7 @@ public:
    }

    virtual size_t prefix_size() const override {
-        size_t count = 0;
-        if (_schema->clustering_key_columns().empty()) {
-            return count;
-        }
-        column_id expected_column_id = _schema->clustering_key_columns().begin()->id;
-        for (const auto& restriction_entry : _restrictions->restrictions()) {
-            if (_schema->position(*restriction_entry.first) != expected_column_id) {
-                return count;
-            }
-            expected_column_id++;
-            count++;
-        }
-        return count;
+        return primary_key_restrictions<ValueType>::prefix_size(_schema);
    }

    ::shared_ptr<single_column_primary_key_restrictions<clustering_key>> get_longest_prefix_restrictions() {
--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -337,6 +337,52 @@ const std::vector<::shared_ptr<restrictions>>& statement_restrictions::index_res
    return _index_restrictions;
 }

+std::optional<secondary_index::index> statement_restrictions::find_idx(secondary_index::secondary_index_manager& sim) const {
+    for (const auto& restriction : index_restrictions()) { // bind by reference: no shared_ptr copy per restriction
+        for (const auto& cdef : restriction->get_column_defs()) {
+            for (auto&& index : sim.list_indexes()) { // bind to each element instead of copying every index
+                if (index.depends_on(*cdef)) {
+                    return std::make_optional<secondary_index::index>(index); // first index depending on a restricted column wins
+                }
+            }
+        }
+    }
+    return std::nullopt;
+}
+
+// Gathers the restricted columns needed for filtering, leaving out any column the index picked by find_idx() depends on.
+std::vector<const column_definition*> statement_restrictions::get_column_defs_for_filtering(database& db) const {
+    std::vector<const column_definition*> filtering_defs;
+    if (!need_filtering()) {
+        return filtering_defs;
+    }
+    auto& sim = db.find_column_family(_schema).get_index_manager();
+    std::optional<secondary_index::index> opt_idx = find_idx(sim);
+    auto add_unless_indexed = [&] (const column_definition* cdef) {
+        if (!(opt_idx && opt_idx->depends_on(*cdef))) {
+            filtering_defs.emplace_back(cdef);
+        }
+    };
+    if (_partition_key_restrictions->needs_filtering(*_schema)) {
+        for (auto&& cdef : _partition_key_restrictions->get_column_defs()) {
+            add_unless_indexed(cdef);
+        }
+    }
+    if (_clustering_columns_restrictions->needs_filtering(*_schema)) {
+        column_id first_non_prefix_id = _schema->clustering_key_columns().begin()->id +
+                _clustering_columns_restrictions->prefix_size(_schema);
+        for (auto&& cdef : _clustering_columns_restrictions->get_column_defs()) {
+            if (cdef->id >= first_non_prefix_id) {
+                add_unless_indexed(cdef);
+            }
+        }
+    }
+    for (auto&& cdef : _nonprimary_key_restrictions->get_column_defs()) {
+        add_unless_indexed(cdef);
+    }
+    return filtering_defs;
+}
+
 void statement_restrictions::process_partition_key_restrictions(bool has_queriable_index, bool for_view, bool allow_filtering) {
    // If there is a queriable index, no special condition are required on the other restrictions.
    // But we still need to know 2 things:
--- a/cql3/restrictions/statement_restrictions.hh
+++ b/cql3/restrictions/statement_restrictions.hh
@@ -163,6 +163,20 @@ public:
        return _clustering_columns_restrictions;
    }

+    /**
+     * Builds a possibly empty collection of column definitions that will be used for filtering
+     * @param db - the database context
+     * @return A list with the column definitions needed for filtering.
+     */
+    std::vector<const column_definition*> get_column_defs_for_filtering(database& db) const;
+
+    /**
+     * Determines the index to be used with the restriction.
+     * @param sim - the secondary index manager whose indexes are considered
+     * @return If an index can be used, an optional containing this index, otherwise an empty optional.
+     */
+    std::optional<secondary_index::index> find_idx(secondary_index::secondary_index_manager& sim) const;
+
    /**
     * Checks if the partition key has some unrestricted components.
     * @return <code>true</code> if the partition key has some unrestricted components, <code>false</code> otherwise.
--- a/cql3/selection/selection.cc
+++ b/cql3/selection/selection.cc
@@ -156,9 +156,9 @@ public:
        return _factories->uses_function(ks_name, function_name);
    }

-    virtual uint32_t add_column_for_ordering(const column_definition& c) override {
-        uint32_t index = selection::add_column_for_ordering(c);
-        _factories->add_selector_for_ordering(c, index);
+    virtual uint32_t add_column_for_post_processing(const column_definition& c) override {
+        uint32_t index = selection::add_column_for_post_processing(c);
+        _factories->add_selector_for_post_processing(c, index);
        return index;
    }

@@ -227,7 +227,7 @@ protected:
    return simple_selection::make(schema, std::move(columns), false);
 }

-uint32_t selection::add_column_for_ordering(const column_definition& c) {
+uint32_t selection::add_column_for_post_processing(const column_definition& c) {
    _columns.push_back(&c);
    _metadata->add_non_serialized_column(c.column_specification);
    return _columns.size() - 1;
@@ -339,7 +339,7 @@ std::unique_ptr<result_set> result_set_builder::build() {
    return std::move(_result_set);
 }

-bool result_set_builder::restrictions_filter::operator()(const selection& selection,
+bool result_set_builder::restrictions_filter::do_filter(const selection& selection,
                                                         const std::vector<bytes>& partition_key,
                                                         const std::vector<bytes>& clustering_key,
                                                         const query::result_row_view& static_row,
@@ -427,6 +427,18 @@ bool result_set_builder::restrictions_filter::operator()(const selection& select
    return true;
 }

+// Applies do_filter() to the row and returns its verdict unchanged; every
+// rejected row is additionally counted in _rows_dropped.
+bool result_set_builder::restrictions_filter::operator()(const selection& selection,
+                                                         const std::vector<bytes>& partition_key,
+                                                         const std::vector<bytes>& clustering_key,
+                                                         const query::result_row_view& static_row,
+                                                         const query::result_row_view& row) const {
+    const bool row_matches = do_filter(selection, partition_key, clustering_key, static_row, row);
+    _rows_dropped += row_matches ? 0 : 1;
+    return row_matches;
+}
+
 api::timestamp_type result_set_builder::timestamp_of(size_t idx) {
    return _timestamps[idx];
 }
--- a/cql3/selection/selection.hh
+++ b/cql3/selection/selection.hh
@@ -176,7 +176,7 @@ public:
    static ::shared_ptr<selection> wildcard(schema_ptr schema);
    static ::shared_ptr<selection> for_columns(schema_ptr schema, std::vector<const column_definition*> columns);

-    virtual uint32_t add_column_for_ordering(const column_definition& c);
+    virtual uint32_t add_column_for_post_processing(const column_definition& c);

    virtual bool uses_function(const sstring &ks_name, const sstring& function_name) const {
        return false;
@@ -259,12 +259,16 @@ public:
        }
        void reset() {
        }
+        uint32_t get_rows_dropped() const {
+            return 0;
+        }
    };
    class restrictions_filter {
        ::shared_ptr<restrictions::statement_restrictions> _restrictions;
        const query_options& _options;
        mutable bool _current_partition_key_does_not_match = false;
        mutable bool _current_static_row_does_not_match = false;
+        mutable uint32_t _rows_dropped = 0;
    public:
        restrictions_filter() = default;
        explicit restrictions_filter(::shared_ptr<restrictions::statement_restrictions> restrictions, const query_options& options) : _restrictions(restrictions), _options(options) {}
@@ -272,7 +276,13 @@ public:
        void reset() {
            _current_partition_key_does_not_match = false;
            _current_static_row_does_not_match = false;
+            _rows_dropped = 0;
        }
+        uint32_t get_rows_dropped() const {
+            return _rows_dropped;
+        }
+    private:
+        bool do_filter(const selection& selection, const std::vector<bytes>& pk, const std::vector<bytes>& ck, const query::result_row_view& static_row, const query::result_row_view& row) const;
    };

    result_set_builder(const selection& s, gc_clock::time_point now, cql_serialization_format sf);
@@ -372,7 +382,7 @@ public:
            }
        }

-        void accept_partition_end(const query::result_row_view& static_row) {
+        uint32_t accept_partition_end(const query::result_row_view& static_row) {
            if (_row_count == 0) {
                _builder.new_row();
                auto static_row_iterator = static_row.iterator();
@@ -386,6 +396,7 @@ public:
                    }
                }
            }
+            return _filter.get_rows_dropped();
        }
    };

--- a/cql3/selection/selector_factories.cc
+++ b/cql3/selection/selector_factories.cc
@@ -53,6 +53,7 @@ selector_factories::selector_factories(std::vector<::shared_ptr<selectable>> sel
    : _contains_write_time_factory(false)
    , _contains_ttl_factory(false)
    , _number_of_aggregate_factories(0)
+    , _number_of_factories_for_post_processing(0)
 {
    _factories.reserve(selectables.size());

@@ -76,8 +77,9 @@ bool selector_factories::uses_function(const sstring& ks_name, const sstring& fu
    return false;
 }

-void selector_factories::add_selector_for_ordering(const column_definition& def, uint32_t index) {
+void selector_factories::add_selector_for_post_processing(const column_definition& def, uint32_t index) {
    _factories.emplace_back(simple_selector::new_factory(def.name_as_text(), index, def.type));
+    ++_number_of_factories_for_post_processing;
 }

 std::vector<::shared_ptr<selector>> selector_factories::new_instances() const {
--- a/cql3/selection/selector_factories.hh
+++ b/cql3/selection/selector_factories.hh
@@ -74,6 +74,11 @@ private:
     */
    uint32_t _number_of_aggregate_factories;

+    /**
+     * The number of factories that are only for post processing.
+     */
+    uint32_t _number_of_factories_for_post_processing;
+
 public:
    /**
     * Creates a new <code>SelectorFactories</code> instance and collect the column definitions.
@@ -97,11 +102,12 @@ public:
    bool uses_function(const sstring& ks_name, const sstring& function_name) const;

    /**
-     * Adds a new <code>Selector.Factory</code> for a column that is needed only for ORDER BY purposes.
+     * Adds a new <code>Selector.Factory</code> for a column that is needed only for ORDER BY or post
+     * processing purposes.
     * @param def the column that is needed for ordering
     * @param index the index of the column definition in the Selection's list of columns
     */
-    void add_selector_for_ordering(const column_definition& def, uint32_t index);
+    void add_selector_for_post_processing(const column_definition& def, uint32_t index);

    /**
     * Checks if this <code>SelectorFactories</code> contains only factories for aggregates.
@@ -111,7 +117,7 @@ public:
     */
    bool contains_only_aggregate_functions() const {
        auto size = _factories.size();
-        return size != 0 && _number_of_aggregate_factories == size;
+        return size != 0 && _number_of_aggregate_factories  == (size - _number_of_factories_for_post_processing);
    }

    /**
--- a/cql3/statements/create_index_statement.cc
+++ b/cql3/statements/create_index_statement.cc
@@ -137,10 +137,15 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c

        bool is_map = dynamic_cast<const collection_type_impl *>(cd->type.get()) != nullptr
                      && dynamic_cast<const collection_type_impl *>(cd->type.get())->is_map();
-        bool is_frozen_collection = cd->type->is_collection() && !cd->type->is_multi_cell();
+        bool is_collection = cd->type->is_collection();
+        bool is_frozen_collection = is_collection && !cd->type->is_multi_cell();

        if (is_frozen_collection) {
            validate_for_frozen_collection(target);
+        } else if (is_collection) {
+            // NOTICE(sarna): should be lifted after #2962 (indexes on non-frozen collections) is implemented
+            throw exceptions::invalid_request_exception(
+                    sprint("Cannot create secondary index on non-frozen collection column %s", cd->name_as_text()));
        } else {
            validate_not_full_index(target);
            validate_is_values_index_if_target_column_not_collection(cd, target);
--- a/cql3/statements/create_view_statement.cc
+++ b/cql3/statements/create_view_statement.cc
@@ -84,7 +84,6 @@ create_view_statement::create_view_statement(
    , _clustering_keys{clustering_keys}
    , _if_not_exists{if_not_exists}
 {
-    service::get_local_storage_proxy().get_db().local().get_config().check_experimental("Creating materialized views");
    if (!service::get_local_storage_service().cluster_supports_materialized_views()) {
        throw exceptions::invalid_request_exception("Can't create materialized views until the whole cluster has been upgraded");
    }
@@ -315,6 +314,27 @@ future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::a
        throw exceptions::invalid_request_exception(sprint("No columns are defined for Materialized View other than primary key"));
    }

+    // The unique feature of a filter by a non-key column is that the
+    // value of such column can be updated - and also be expired with TTL
+    // and cause the view row to appear and disappear. We don't currently
+    // support this case - see issue #3430, and neither does
+    // Cassandra - see CASSANDRA-13798 and CASSANDRA-13832.
+    // Actually, as CASSANDRA-13798 explains, the problem is "the liveness of
+    // view row is now depending on multiple base columns (multiple filtered
+    // non-pk base column + base column used in view pk)". When the filtered
+    // column *is* the base column added to the view pk, we don't have this
+    // problem. And this case actually works correctly.
+    auto non_pk_restrictions = restrictions->get_non_pk_restriction();
+    if (non_pk_restrictions.size() == 1 && has_non_pk_column &&
+            std::find(target_primary_keys.begin(), target_primary_keys.end(), non_pk_restrictions.cbegin()->first) != target_primary_keys.end()) {
+        // This case (filter by new PK column of the view) works, as explained above
+    } else if (!non_pk_restrictions.empty()) {
+        auto column_names = ::join(", ", non_pk_restrictions | boost::adaptors::map_keys | boost::adaptors::transformed(std::mem_fn(&column_definition::name_as_text)));
+        throw exceptions::invalid_request_exception(sprint(
+                "Non-primary key columns cannot be restricted in the SELECT statement used for materialized view %s creation (got restrictions on: %s)",
+                column_family(), column_names));
+    }
+
    schema_builder builder{keyspace(), column_family()};
    auto add_columns = [this, &builder] (std::vector<const column_definition*>& defs, column_kind kind) mutable {
        for (auto* def : defs) {
--- a/cql3/statements/index_prop_defs.cc
+++ b/cql3/statements/index_prop_defs.cc
@@ -49,7 +49,7 @@ void cql3::statements::index_prop_defs::validate() {
    property_definitions::validate(keywords);

    if (is_custom && !custom_class) {
-        throw exceptions::invalid_request_exception("CUSTOM index requires specifiying the index class");
+        throw exceptions::invalid_request_exception("CUSTOM index requires specifying the index class");
    }

    if (!is_custom && custom_class) {
@@ -64,6 +64,16 @@ void cql3::statements::index_prop_defs::validate() {
                sprint("Cannot specify %s as a CUSTOM option",
                        db::index::secondary_index::custom_index_option_name));
    }
+
+    // Currently, Scylla does not support *any* class of custom index
+    // implementation. If in the future we do (e.g., SASI, or something
+    // new), we'll need to check for valid values here.
+    if (is_custom && custom_class) {
+        throw exceptions::invalid_request_exception(
+                format("Unsupported CUSTOM INDEX class {}. Note that currently, Scylla does not support SASI or any other CUSTOM INDEX class.",
+                        *custom_class));
+
+    }
 }

 index_options_map
--- a/cql3/statements/raw/select_statement.hh
+++ b/cql3/statements/raw/select_statement.hh
@@ -141,6 +141,10 @@ private:
    /** If ALLOW FILTERING was not specified, this verifies that it is not needed */
    void check_needs_filtering(::shared_ptr<restrictions::statement_restrictions> restrictions);

+    void ensure_filtering_columns_retrieval(database& db,
+                                            ::shared_ptr<selection::selection> selection,
+                                            ::shared_ptr<restrictions::statement_restrictions> restrictions);
+
    bool contains_alias(::shared_ptr<column_identifier> name);

    ::shared_ptr<column_specification> limit_receiver();
--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -383,8 +383,9 @@ select_statement::do_execute(service::storage_proxy& proxy,
    int32_t limit = get_limit(options);
    auto now = gc_clock::now();

+    const bool restrictions_need_filtering = _restrictions->need_filtering();
    ++_stats.reads;
-    _stats.filtered_reads += _restrictions->need_filtering();
+    _stats.filtered_reads += restrictions_need_filtering;

    auto command = ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(),
        make_partition_slice(options), limit, now, tracing::make_trace_info(state.get_trace_state()), query::max_partitions, utils::UUID(), options.get_timestamp(state));
@@ -396,37 +397,42 @@ select_statement::do_execute(service::storage_proxy& proxy,
    // An aggregation query will never be paged for the user, but we always page it internally to avoid OOM.
    // If we user provided a page_size we'll use that to page internally (because why not), otherwise we use our default
    // Note that if there are some nodes in the cluster with a version less than 2.0, we can't use paging (CASSANDRA-6707).
-    auto aggregate = _selection->is_aggregate();
-    if (aggregate && page_size <= 0) {
+    const bool aggregate = _selection->is_aggregate();
+    const bool nonpaged_filtering = restrictions_need_filtering && page_size <= 0;
+    if (aggregate || nonpaged_filtering) {
        page_size = DEFAULT_COUNT_PAGE_SIZE;
    }

    auto key_ranges = _restrictions->get_partition_key_ranges(options);

-    if (!aggregate && (page_size <= 0
+    if (!aggregate && !restrictions_need_filtering && (page_size <= 0
            || !service::pager::query_pagers::may_need_paging(*_schema, page_size,
                    *command, key_ranges))) {
        return execute(proxy, command, std::move(key_ranges), state, options, now);
    }

    command->slice.options.set<query::partition_slice::option::allow_short_read>();
-    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
+    auto timeout_duration = options.get_timeout_config().*get_timeout_config_selector();
    auto p = service::pager::query_pagers::pager(_schema, _selection,
-            state, options, command, std::move(key_ranges), _stats, _restrictions->need_filtering() ? _restrictions : nullptr);
+            state, options, command, std::move(key_ranges), _stats, restrictions_need_filtering ? _restrictions : nullptr);

-    if (aggregate) {
+    if (aggregate || nonpaged_filtering) {
        return do_with(
                cql3::selection::result_set_builder(*_selection, now,
                        options.get_cql_serialization_format()),
-                [this, p, page_size, now, timeout](auto& builder) {
+                [this, p, page_size, now, timeout_duration, restrictions_need_filtering, limit](auto& builder) {
                    return do_until([p] {return p->is_exhausted();},
-                            [p, &builder, page_size, now, timeout] {
+                            [p, &builder, page_size, now, timeout_duration] {
+                                auto timeout = db::timeout_clock::now() + timeout_duration;
                                return p->fetch_page(builder, page_size, now, timeout);
                            }
-                    ).then([this, &builder] {
+                    ).then([this, &builder, restrictions_need_filtering, limit] {
                                auto rs = builder.build();
+                                if (restrictions_need_filtering) {
+                                    rs->trim(limit);
+                                    _stats.filtered_rows_matched_total += rs->size();
+                                }
                                update_stats_rows_read(rs->size());
-                                _stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
                                auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
                                return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
                            });
@@ -439,7 +445,8 @@ select_statement::do_execute(service::storage_proxy& proxy,
                        " you must either remove the ORDER BY or the IN and sort client side, or disable paging for this query");
    }

-    if (_selection->is_trivial() && !_restrictions->need_filtering()) {
+    auto timeout = db::timeout_clock::now() + timeout_duration;
+    if (_selection->is_trivial() && !restrictions_need_filtering) {
        return p->fetch_page_generator(page_size, now, timeout, _stats).then([this, p, limit] (result_generator generator) {
            auto meta = [&] () -> shared_ptr<const cql3::metadata> {
                if (!p->is_exhausted()) {
@@ -458,14 +465,17 @@ select_statement::do_execute(service::storage_proxy& proxy,
    }

    return p->fetch_page(page_size, now, timeout).then(
-            [this, p, &options, limit, now](std::unique_ptr<cql3::result_set> rs) {
+            [this, p, &options, limit, now, restrictions_need_filtering](std::unique_ptr<cql3::result_set> rs) {

                if (!p->is_exhausted()) {
                    rs->get_metadata().set_paging_state(p->state());
                }

+                if (restrictions_need_filtering) {
+                    rs->trim(limit);
+                    _stats.filtered_rows_matched_total += rs->size();
+                }
                update_stats_rows_read(rs->size());
-                _stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
                auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
                return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
            });
@@ -492,15 +502,9 @@ generate_base_key_from_index_pk(const partition_key& index_pk, const clustering_
    return KeyType::from_range(exploded_base_key);
 }

-future<shared_ptr<cql_transport::messages::result_message>>
-indexed_table_select_statement::execute_base_query(
-        service::storage_proxy& proxy,
-        dht::partition_range_vector&& partition_ranges,
-        service::query_state& state,
-        const query_options& options,
-        gc_clock::time_point now,
-        ::shared_ptr<const service::pager::paging_state> paging_state) {
-    auto cmd = ::make_lw_shared<query::read_command>(
+lw_shared_ptr<query::read_command>
+indexed_table_select_statement::prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging) {
+    lw_shared_ptr<query::read_command> cmd = ::make_lw_shared<query::read_command>(
            _schema->id(),
            _schema->version(),
            make_partition_slice(options),
@@ -510,9 +514,25 @@ indexed_table_select_statement::execute_base_query(
            query::max_partitions,
            utils::UUID(),
            options.get_timestamp(state));
-    if (options.get_page_size() > 0) {
+    if (use_paging) {
        cmd->slice.options.set<query::partition_slice::option::allow_short_read>();
+        cmd->slice.options.set<query::partition_slice::option::send_partition_key>();
+        if (_schema->clustering_key_size() > 0) {
+            cmd->slice.options.set<query::partition_slice::option::send_clustering_key>();
+        }
    }
+    return cmd;
+}
+
+future<shared_ptr<cql_transport::messages::result_message>>
+indexed_table_select_statement::execute_base_query(
+        service::storage_proxy& proxy,
+        dht::partition_range_vector&& partition_ranges,
+        service::query_state& state,
+        const query_options& options,
+        gc_clock::time_point now,
+        ::shared_ptr<const service::pager::paging_state> paging_state) {
+    auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state));
    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
    dht::partition_range_vector per_vnode_ranges;
    per_vnode_ranges.reserve(partition_ranges.size());
@@ -586,19 +606,7 @@ indexed_table_select_statement::execute_base_query(
        const query_options& options,
        gc_clock::time_point now,
        ::shared_ptr<const service::pager::paging_state> paging_state) {
-    auto cmd = make_lw_shared<query::read_command>(
-            _schema->id(),
-            _schema->version(),
-            make_partition_slice(options),
-            get_limit(options),
-            now,
-            tracing::make_trace_info(state.get_trace_state()),
-            query::max_partitions,
-            utils::UUID(),
-            options.get_timestamp(state));
-    if (options.get_page_size() > 0) {
-        cmd->slice.options.set<query::partition_slice::option::allow_short_read>();
-    }
+    auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state));
    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();

    struct base_query_state {
@@ -714,7 +722,8 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
                                  const query_options& options,
                                  gc_clock::time_point now)
 {
-    bool fast_path = !needs_post_query_ordering() && _selection->is_trivial() && !_restrictions->need_filtering();
+    const bool restrictions_need_filtering = _restrictions->need_filtering();
+    const bool fast_path = !needs_post_query_ordering() && _selection->is_trivial() && !restrictions_need_filtering;
    if (fast_path) {
        return make_shared<cql_transport::messages::result_message::rows>(result(
            result_generator(_schema, std::move(results), std::move(cmd), _selection, _stats),
@@ -724,7 +733,7 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu

    cql3::selection::result_set_builder builder(*_selection, now,
            options.get_cql_serialization_format());
-    if (_restrictions->need_filtering()) {
+    if (restrictions_need_filtering) {
        results->ensure_counts();
        _stats.filtered_rows_read_total += *results->row_count();
        query::result_view::consume(*results, cmd->slice,
@@ -743,9 +752,11 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
            rs->reverse();
        }
        rs->trim(cmd->row_limit);
+    } else if (restrictions_need_filtering) {
+        rs->trim(cmd->row_limit);
    }
    update_stats_rows_read(rs->size());
-    _stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
+    _stats.filtered_rows_matched_total += restrictions_need_filtering ? rs->size() : 0;
    return ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
 }

@@ -774,7 +785,8 @@ indexed_table_select_statement::prepare(database& db,
                                        ordering_comparator_type ordering_comparator,
                                        ::shared_ptr<term> limit, cql_stats &stats)
 {
-    auto index_opt = find_idx(db, schema, restrictions);
+    auto& sim = db.find_column_family(schema).get_index_manager();
+    auto index_opt = restrictions->find_idx(sim);
    if (!index_opt) {
        throw std::runtime_error("No index found.");
    }
@@ -798,24 +810,6 @@ indexed_table_select_statement::prepare(database& db,

 }

-
-stdx::optional<secondary_index::index> indexed_table_select_statement::find_idx(database& db,
-                                                                                schema_ptr schema,
-                                                                                ::shared_ptr<restrictions::statement_restrictions> restrictions)
-{
-    auto& sim = db.find_column_family(schema).get_index_manager();
-    for (::shared_ptr<cql3::restrictions::restrictions> restriction : restrictions->index_restrictions()) {
-        for (const auto& cdef : restriction->get_column_defs()) {
-            for (auto index : sim.list_indexes()) {
-                if (index.depends_on(*cdef)) {
-                    return stdx::make_optional<secondary_index::index>(std::move(index));
-                }
-            }
-        }
-    }
-    return stdx::nullopt;
-}
-
 indexed_table_select_statement::indexed_table_select_statement(schema_ptr schema, uint32_t bound_terms,
                                                           ::shared_ptr<parameters> parameters,
                                                           ::shared_ptr<selection::selection> selection,
@@ -1219,6 +1213,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(database& db, cql_
    }

    check_needs_filtering(restrictions);
+    ensure_filtering_columns_retrieval(db, selection, restrictions);

    ::shared_ptr<cql3::statements::select_statement> stmt;
    if (restrictions->uses_secondary_indexing()) {
@@ -1357,7 +1352,7 @@ select_statement::get_ordering_comparator(schema_ptr schema,
        }
        auto index = selection->index_of(*def);
        if (index < 0) {
-            index = selection->add_column_for_ordering(*def);
+            index = selection->add_column_for_post_processing(*def);
        }

        sorters.emplace_back(index, def->type);
@@ -1444,6 +1439,23 @@ void select_statement::check_needs_filtering(::shared_ptr<restrictions::statemen
    }
 }

+/**
+ * Adds columns that are needed for the purpose of filtering to the selection.
+ * The columns that are added to the selection are columns that
+ * are needed for filtering on the coordinator but are not part of the selection.
+ * The columns are added with metadata indicating that they are not to be returned
+ * to the user.
+ */
+void select_statement::ensure_filtering_columns_retrieval(database& db,
+                                        ::shared_ptr<selection::selection> selection,
+                                        ::shared_ptr<restrictions::statement_restrictions> restrictions) {
+    for (auto&& cdef : restrictions->get_column_defs_for_filtering(db)) {
+        if (!selection->has_column(*cdef)) {
+            selection->add_column_for_post_processing(*cdef);
+        }
+    }
+}
+
 bool select_statement::contains_alias(::shared_ptr<column_identifier> name) {
    return std::any_of(_select_clause.begin(), _select_clause.end(), [name] (auto raw) {
        return raw->alias && *name == *raw->alias;
--- a/cql3/statements/select_statement.hh
+++ b/cql3/statements/select_statement.hh
@@ -186,10 +186,6 @@ public:
                                   schema_ptr view_schema);

 private:
-    static stdx::optional<secondary_index::index> find_idx(database& db,
-                                                           schema_ptr schema,
-                                                           ::shared_ptr<restrictions::statement_restrictions> restrictions);
-
    virtual future<::shared_ptr<cql_transport::messages::result_message>> do_execute(service::storage_proxy& proxy,
                                                                                     service::query_state& state, const query_options& options) override;

@@ -214,6 +210,9 @@ private:
            gc_clock::time_point now,
            ::shared_ptr<const service::pager::paging_state> paging_state);

+    lw_shared_ptr<query::read_command>
+    prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging);
+
    future<shared_ptr<cql_transport::messages::result_message>>
    execute_base_query(
            service::storage_proxy& proxy,
--- a/database.cc
+++ b/database.cc
@@ -76,6 +76,8 @@
 #include "sstables/compaction_manager.hh"
 #include "sstables/compaction_backlog_manager.hh"
 #include "sstables/progress_monitor.hh"
+#include "auth/common.hh"
+#include "tracing/trace_keyspace_helper.hh"

 #include "checked-file-impl.hh"
 #include "disk-error-handler.hh"
@@ -178,6 +180,18 @@ bool is_system_keyspace(const sstring& name) {
    return system_keyspaces.find(name) != system_keyspaces.end();
 }

+static const std::unordered_set<sstring> internal_keyspaces = {
+        db::system_distributed_keyspace::NAME,
+        db::system_keyspace::NAME,
+        db::schema_tables::NAME,
+        auth::meta::AUTH_KS,
+        tracing::trace_keyspace_helper::KEYSPACE_NAME
+};
+
+bool is_internal_keyspace(const sstring& name) {
+    return internal_keyspaces.find(name) != internal_keyspaces.end();
+}
+
 // Used for tests where the CF exists without a database object. We need to pass a valid
 // dirty_memory manager in that case.
 thread_local dirty_memory_manager default_dirty_memory_manager;
@@ -684,9 +698,11 @@ table::make_reader(schema_ptr s,
    return make_combined_reader(s, std::move(readers), fwd, fwd_mr);
 }

-sstables::shared_sstable
-table::make_streaming_sstable_for_write() {
+sstables::shared_sstable table::make_streaming_sstable_for_write(std::optional<sstring> subdir) {
    sstring dir = _config.datadir;
+    if (subdir) {
+        dir += "/" + *subdir;
+    }
    auto newtab = sstables::make_sstable(_schema,
            dir, calculate_generation_for_new_table(),
            get_highest_supported_format(),
@@ -826,7 +842,11 @@ void table::add_sstable(sstables::shared_sstable sstable, const std::vector<unsi
    new_sstables->insert(sstable);
    _sstables = std::move(new_sstables);
    update_stats_for_new_sstable(sstable->bytes_on_disk(), shards_for_the_sstable);
-    _compaction_strategy.get_backlog_tracker().add_sstable(sstable);
+    if (sstable->is_staging()) {
+        _sstables_staging.emplace(sstable->generation(), sstable);
+    } else {
+        _compaction_strategy.get_backlog_tracker().add_sstable(sstable);
+    }
 }

 future<>
@@ -1613,7 +1633,9 @@ std::vector<sstables::shared_sstable> table::select_sstables(const dht::partitio

 std::vector<sstables::shared_sstable> table::candidates_for_compaction() const {
    return boost::copy_range<std::vector<sstables::shared_sstable>>(*get_sstables()
-        | boost::adaptors::filtered([this] (auto& sst) { return !_sstables_need_rewrite.count(sst->generation()); }));
+            | boost::adaptors::filtered([this] (auto& sst) {
+        return !_sstables_need_rewrite.count(sst->generation()) && !_sstables_staging.count(sst->generation());
+    }));
 }

 std::vector<sstables::shared_sstable> table::sstables_need_rewrite() const {
@@ -1671,9 +1693,9 @@ future<> distributed_loader::open_sstable(distributed<database>& db, sstables::e
    // to distribute evenly the resource usage among all shards.

    return db.invoke_on(column_family::calculate_shard_from_sstable_generation(comps.generation),
-            [&db, comps = std::move(comps), func = std::move(func), pc] (database& local) {
+            [&db, comps = std::move(comps), func = std::move(func), &pc] (database& local) {

-        return with_semaphore(local.sstable_load_concurrency_sem(), 1, [&db, &local, comps = std::move(comps), func = std::move(func), pc] {
+        return with_semaphore(local.sstable_load_concurrency_sem(), 1, [&db, &local, comps = std::move(comps), func = std::move(func), &pc] {
            auto& cf = local.find_column_family(comps.ks, comps.cf);

            auto f = sstables::sstable::load_shared_components(cf.schema(), comps.sstdir, comps.generation, comps.version, comps.format, pc);
@@ -1969,6 +1991,12 @@ future<sstables::entry_descriptor> distributed_loader::probe_file(distributed<da
    }
    auto cf_sstable_open = [sstdir, comps, fname] (column_family& cf, sstables::foreign_sstable_open_info info) {
        cf.update_sstables_known_generation(comps.generation);
+        if (shared_sstable sst = cf.get_staging_sstable(comps.generation)) {
+            dblog.warn("SSTable {} is already present in staging/ directory. Moving from staging will be retried.", sst->get_filename());
+            return seastar::async([sst = std::move(sst), comps = std::move(comps)] () {
+                sst->move_to_new_dir_in_thread(comps.sstdir, comps.generation);
+            });
+        }
        {
            auto i = boost::range::find_if(*cf._sstables->all(), [gen = comps.generation] (sstables::shared_sstable sst) { return sst->generation() == gen; });
            if (i != cf._sstables->all()->end()) {
@@ -2154,9 +2182,6 @@ database::database(const db::config& cfg, database_config dbcfg)
        [this] {
            ++_stats->sstable_read_queue_overloaded;
            return std::make_exception_ptr(std::runtime_error("sstable inactive read queue overloaded"));
-        },
-        [this] {
-            return _querier_cache.evict_one();
        })
    // No timeouts or queue length limits - a failure here can kill an entire repair.
    // Trust the caller to limit concurrency.
@@ -2168,7 +2193,7 @@ database::database(const db::config& cfg, database_config dbcfg)
    , _version(empty_version)
    , _compaction_manager(make_compaction_manager(*_cfg, dbcfg))
    , _enable_incremental_backups(cfg.incremental_backups())
-    , _querier_cache(dbcfg.available_memory * 0.04)
+    , _querier_cache(_read_concurrency_sem, dbcfg.available_memory * 0.04)
    , _large_partition_handler(std::make_unique<db::cql_table_large_partition_handler>(_cfg->compaction_large_partition_warning_threshold_mb()*1024*1024))
    , _result_memory_limiter(dbcfg.available_memory / 10)
 {
@@ -2420,6 +2445,9 @@ database::setup_metrics() {
 }

 database::~database() {
+    _read_concurrency_sem.clear_inactive_reads();
+    _streaming_concurrency_sem.clear_inactive_reads();
+    _system_read_concurrency_sem.clear_inactive_reads();
 }

 void database::update_version(const utils::UUID& version) {
@@ -2450,6 +2478,8 @@ future<> distributed_loader::populate_keyspace(distributed<database>& db, sstrin
                auto sstdir = ks.column_family_directory(ksdir, cfname, uuid);
                dblog.info("Keyspace {}: Reading CF {} id={} version={}", ks_name, cfname, uuid, s->version());
                return ks.make_directory_for_column_family(cfname, uuid).then([&db, sstdir, uuid, ks_name, cfname] {
+                    return distributed_loader::populate_column_family(db, sstdir + "/staging", ks_name, cfname);
+                }).then([&db, sstdir, uuid, ks_name, cfname] {
                    return distributed_loader::populate_column_family(db, sstdir, ks_name, cfname);
                }).handle_exception([ks_name, cfname, sstdir](std::exception_ptr eptr) {
                    std::string msg =
@@ -2930,6 +2960,7 @@ keyspace::make_directory_for_column_family(const sstring& name, utils::UUID uuid
            io_check(recursive_touch_directory, cfdir).get();
        }
        io_check(touch_directory, cfdirs[0] + "/upload").get();
+        io_check(touch_directory, cfdirs[0] + "/staging").get();
    });
 }

@@ -4239,6 +4270,7 @@ future<> table::fail_streaming_mutations(utils::UUID plan_id) {
    _streaming_memtables_big.erase(it);
    return entry->flush_in_progress.close().then([this, entry] {
        for (auto&& sst : entry->sstables) {
+            sst.monitor->write_failed();
            sst.sstable->mark_for_deletion();
        }
    });
@@ -4447,64 +4479,6 @@ future<> table::generate_and_propagate_view_updates(const schema_ptr& base,
    });
 }

-/**
- * Given an update for the base table, calculates the set of potentially affected views,
- * generates the relevant updates, and sends them to the paired view replicas.
- */
-future<row_locker::lock_holder> table::push_view_replica_updates(const schema_ptr& s, const frozen_mutation& fm, db::timeout_clock::time_point timeout) const {
-    //FIXME: Avoid unfreezing here.
-    auto m = fm.unfreeze(s);
-    auto& base = schema();
-    m.upgrade(base);
-    auto views = affected_views(base, m);
-    if (views.empty()) {
-        return make_ready_future<row_locker::lock_holder>();
-    }
-    auto cr_ranges = db::view::calculate_affected_clustering_ranges(*base, m.decorated_key(), m.partition(), views);
-    if (cr_ranges.empty()) {
-        return generate_and_propagate_view_updates(base, std::move(views), std::move(m), { }, timeout).then([] {
-                // In this case we are not doing a read-before-write, just a
-                // write, so no lock is needed.
-                return make_ready_future<row_locker::lock_holder>();
-        });
-    }
-    // We read the whole set of regular columns in case the update now causes a base row to pass
-    // a view's filters, and a view happens to include columns that have no value in this update.
-    // Also, one of those columns can determine the lifetime of the base row, if it has a TTL.
-    auto columns = boost::copy_range<std::vector<column_id>>(
-            base->regular_columns() | boost::adaptors::transformed(std::mem_fn(&column_definition::id)));
-    query::partition_slice::option_set opts;
-    opts.set(query::partition_slice::option::send_partition_key);
-    opts.set(query::partition_slice::option::send_clustering_key);
-    opts.set(query::partition_slice::option::send_timestamp);
-    opts.set(query::partition_slice::option::send_ttl);
-    auto slice = query::partition_slice(
-            std::move(cr_ranges), { }, std::move(columns), std::move(opts), { }, cql_serialization_format::internal(), query::max_rows);
-    // Take the shard-local lock on the base-table row or partition as needed.
-    // We'll return this lock to the caller, which will release it after
-    // writing the base-table update.
-    future<row_locker::lock_holder> lockf = local_base_lock(base, m.decorated_key(), slice.default_row_ranges(), timeout);
-    return lockf.then([m = std::move(m), slice = std::move(slice), views = std::move(views), base, this, timeout] (row_locker::lock_holder lock) {
-      return do_with(
-        dht::partition_range::make_singular(m.decorated_key()),
-        std::move(slice),
-        std::move(m),
-        [base, views = std::move(views), lock = std::move(lock), this, timeout] (auto& pk, auto& slice, auto& m) mutable {
-            auto reader = this->make_reader(
-                base,
-                pk,
-                slice,
-                service::get_local_sstable_query_read_priority());
-            return this->generate_and_propagate_view_updates(base, std::move(views), std::move(m), std::move(reader), timeout).then([lock = std::move(lock)] () mutable {
-                // return the local partition/row lock we have taken so it
-                // remains locked until the caller is done modifying this
-                // partition/row and destroys the lock object.
-                return std::move(lock);
-            });
-      });
-    });
-}
-
 /**
 * Shard-local locking of clustering rows or entire partitions of the base
 * table during a Materialized-View read-modify-update:
--- a/database.hh
+++ b/database.hh
@@ -298,6 +298,8 @@ public:
 class table;
 using column_family = table;

+class database_sstable_write_monitor;
+
 class table : public enable_lw_shared_from_this<table> {
 public:
    struct config {
@@ -395,7 +397,7 @@ private:
    // plan memtables and the resulting sstables are not made visible until
    // the streaming is complete.
    struct monitored_sstable {
-        std::unique_ptr<sstables::write_monitor> monitor;
+        std::unique_ptr<database_sstable_write_monitor> monitor;
        sstables::shared_sstable sstable;
    };

@@ -432,6 +434,9 @@ private:
    // but for correct compaction we need to start the compaction only after
    // reading all sstables.
    std::unordered_map<uint64_t, sstables::shared_sstable> _sstables_need_rewrite;
+    // sstables that should not be compacted (e.g. because they need to be used
+    // to generate view updates later)
+    std::unordered_map<uint64_t, sstables::shared_sstable> _sstables_staging;
    // Control background fibers waiting for sstables to be deleted
    seastar::gate _sstable_deletion_gate;
    // There are situations in which we need to stop writing sstables. Flushers will take
@@ -485,6 +490,11 @@ private:
    utils::phased_barrier _pending_reads_phaser;
 public:
    future<> add_sstable_and_update_cache(sstables::shared_sstable sst);
+    void move_sstable_from_staging_in_thread(sstables::shared_sstable sst);
+    sstables::shared_sstable get_staging_sstable(uint64_t generation) {
+        auto it = _sstables_staging.find(generation);
+        return it != _sstables_staging.end() ? it->second : nullptr;
+    }
 private:
    void update_stats_for_new_sstable(uint64_t disk_space_used_by_sstable, const std::vector<unsigned>& shards_for_the_sstable) noexcept;
    // Adds new sstable to the set of sstables
@@ -618,6 +628,14 @@ public:
            tracing::trace_state_ptr trace_state = nullptr,
            streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
            mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const;
+    flat_mutation_reader make_reader_excluding_sstable(schema_ptr schema,
+            sstables::shared_sstable sst,
+            const dht::partition_range& range,
+            const query::partition_slice& slice,
+            const io_priority_class& pc = default_priority_class(),
+            tracing::trace_state_ptr trace_state = nullptr,
+            streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
+            mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const;

    flat_mutation_reader make_reader(schema_ptr schema, const dht::partition_range& range = query::full_partition_range) const {
        auto& full_slice = schema->full_slice();
@@ -632,9 +650,13 @@ public:
    flat_mutation_reader make_streaming_reader(schema_ptr schema,
            const dht::partition_range_vector& ranges) const;

-    sstables::shared_sstable make_streaming_sstable_for_write();
+    sstables::shared_sstable make_streaming_sstable_for_write(std::optional<sstring> subdir = {});
+    sstables::shared_sstable make_streaming_staging_sstable() {
+        return make_streaming_sstable_for_write("staging");
+    }

    mutation_source as_mutation_source() const;
+    mutation_source as_mutation_source_excluding(sstables::shared_sstable sst) const;

    void set_virtual_reader(mutation_source virtual_reader) {
        _virtual_reader = std::move(virtual_reader);
@@ -842,6 +864,8 @@ public:
    void clear_views();
    const std::vector<view_ptr>& views() const;
    future<row_locker::lock_holder> push_view_replica_updates(const schema_ptr& s, const frozen_mutation& fm, db::timeout_clock::time_point timeout) const;
+    future<row_locker::lock_holder> push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout) const;
+    future<row_locker::lock_holder> stream_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, sstables::shared_sstable excluded_sstable) const;
    void add_coordinator_read_latency(utils::estimated_histogram::duration latency);
    std::chrono::milliseconds get_coordinator_read_latency_percentile(double percentile);

@@ -860,6 +884,7 @@ public:
            flat_mutation_reader&&);

 private:
+    future<row_locker::lock_holder> do_push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, mutation_source&& source) const;
    std::vector<view_ptr> affected_views(const schema_ptr& base, const mutation& update) const;
    future<> generate_and_propagate_view_updates(const schema_ptr& base,
            std::vector<view_ptr>&& views,
@@ -1399,6 +1424,12 @@ public:
    std::unordered_set<sstring> get_initial_tokens();
    std::experimental::optional<gms::inet_address> get_replace_address();
    bool is_replacing();
+    reader_concurrency_semaphore& user_read_concurrency_sem() { // returned by reference: callers operate on this object's own semaphore, not a copy
+        return _read_concurrency_sem;
+    }
+    reader_concurrency_semaphore& streaming_read_concurrency_sem() { // same, for the semaphore held in _streaming_concurrency_sem
+        return _streaming_concurrency_sem;
+    }
    reader_concurrency_semaphore& system_keyspace_read_concurrency_sem() {
        return _system_read_concurrency_sem;
    }
@@ -1428,6 +1459,8 @@ public:

 future<> update_schema_version_and_announce(distributed<service::storage_proxy>& proxy);

+bool is_internal_keyspace(const sstring& name);
+
 class distributed_loader {
 public:
    static void reshard(distributed<database>& db, sstring ks_name, sstring cf_name);
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -1673,14 +1673,14 @@ const db::commitlog::config& db::commitlog::active_config() const {
 // No commit_io_check needed in the log reader since the database will fail
 // on error at startup if required
 future<std::unique_ptr<subscription<temporary_buffer<char>, db::replay_position>>>
-db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func next, position_type off, const db::extensions* exts) {
+db::commitlog::read_log_file(const sstring& filename, seastar::io_priority_class read_io_prio_class, commit_load_reader_func next, position_type off, const db::extensions* exts) {
    struct work {
    private:
-        file_input_stream_options make_file_input_stream_options() {
+        file_input_stream_options make_file_input_stream_options(seastar::io_priority_class read_io_prio_class) {
            file_input_stream_options fo;
            fo.buffer_size = db::commitlog::segment::default_size;
            fo.read_ahead = 10;
-            fo.io_priority_class = service::get_local_commitlog_priority();
+            fo.io_priority_class = read_io_prio_class;
            return fo;
        }
    public:
@@ -1699,8 +1699,8 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne
        bool header = true;
        bool failed = false;

-        work(file f, position_type o = 0)
-                : f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options())), start_off(o) {
+        work(file f, seastar::io_priority_class read_io_prio_class, position_type o = 0)
+                : f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options(read_io_prio_class))), start_off(o) {
        }
        work(work&&) = default;

@@ -1918,9 +1918,9 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne
        return fut;
    });

-    return fut.then([off, next](file f) {
+    return fut.then([off, next, read_io_prio_class] (file f) {
        f = make_checked_file(commit_error_handler, std::move(f));
-        auto w = make_lw_shared<work>(std::move(f), off);
+        auto w = make_lw_shared<work>(std::move(f), read_io_prio_class, off);
        auto ret = w->s.listen(next);

        w->s.started().then(std::bind(&work::read_file, w.get())).then([w] {
--- a/db/commitlog/commitlog.hh
+++ b/db/commitlog/commitlog.hh
@@ -355,7 +355,7 @@ public:
    };

    static future<std::unique_ptr<subscription<temporary_buffer<char>, replay_position>>> read_log_file(
-            const sstring&, commit_load_reader_func, position_type = 0, const db::extensions* = nullptr);
+            const sstring&, seastar::io_priority_class read_io_prio_class, commit_load_reader_func, position_type = 0, const db::extensions* = nullptr);
 private:
    commitlog(config);

--- a/db/commitlog/commitlog_entry.hh
+++ b/db/commitlog/commitlog_entry.hh
@@ -34,7 +34,8 @@ public:
    commitlog_entry(stdx::optional<column_mapping> mapping, frozen_mutation&& mutation)
        : _mapping(std::move(mapping)), _mutation(std::move(mutation)) { }
    const stdx::optional<column_mapping>& mapping() const { return _mapping; }
-    const frozen_mutation& mutation() const { return _mutation; }
+    const frozen_mutation& mutation() const & { return _mutation; } // chosen for lvalue entries: read-only borrow
+    frozen_mutation&& mutation() && { return std::move(_mutation); } // chosen for rvalue entries: lets the caller move the mutation out
 };

 class commitlog_entry_writer {
@@ -80,5 +81,6 @@ public:
    commitlog_entry_reader(const temporary_buffer<char>& buffer);

    const stdx::optional<column_mapping>& get_column_mapping() const { return _ce.mapping(); }
-    const frozen_mutation& mutation() const { return _ce.mutation(); }
+    const frozen_mutation& mutation() const & { return _ce.mutation(); } // chosen for lvalue readers: read-only borrow from _ce
+    frozen_mutation&& mutation() && { return std::move(_ce).mutation(); } // chosen for rvalue readers: _ce is cast to an rvalue so its &&-qualified mutation() is picked
 };
--- a/db/commitlog/commitlog_replayer.cc
+++ b/db/commitlog/commitlog_replayer.cc
@@ -58,6 +58,7 @@
 #include "converting_mutation_partition_applier.hh"
 #include "schema_registry.hh"
 #include "commitlog_entry.hh"
+#include "service/priority_manager.hh"

 static logging::logger rlogger("commitlog_replayer");

@@ -223,7 +224,7 @@ db::commitlog_replayer::impl::recover(sstring file, const sstring& fname_prefix)
    auto s = make_lw_shared<stats>();
    auto& exts = _qp.local().db().local().get_config().extensions();

-    return db::commitlog::read_log_file(file,
+    return db::commitlog::read_log_file(file, service::get_local_commitlog_priority(),
            std::bind(&impl::process, this, s.get(), std::placeholders::_1,
                    std::placeholders::_2), p, &exts).then([](auto s) {
        auto f = s->done();
--- a/db/config.hh
+++ b/db/config.hh
@@ -453,7 +453,7 @@ public:
            "The maximum number of tombstones a query can scan before aborting."  \
    )   \
    /* Network timeout settings */  \
-    val(range_request_timeout_in_ms, uint32_t, 10000, Unused,     \
+    val(range_request_timeout_in_ms, uint32_t, 10000, Used,     \
            "The time in milliseconds that the coordinator waits for sequential or index scans to complete."  \
    )   \
    val(read_request_timeout_in_ms, uint32_t, 5000, Used,     \
@@ -472,7 +472,7 @@ public:
            "The time in milliseconds that the coordinator waits for write operations to complete.\n"  \
            "Related information: About hinted handoff writes"  \
    )   \
-    val(request_timeout_in_ms, uint32_t, 10000, Unused,     \
+    val(request_timeout_in_ms, uint32_t, 10000, Used,     \
            "The default timeout for other, miscellaneous operations.\n"  \
            "Related information: About hinted handoff writes"  \
    )   \
@@ -578,7 +578,7 @@ public:
    val(dynamic_snitch_update_interval_in_ms, uint32_t, 100, Unused,     \
            "The time interval for how often the snitch calculates node scores. Because score calculation is CPU intensive, be careful when reducing this interval."  \
    )   \
-    val(hinted_handoff_enabled, sstring, "false", Used,     \
+    val(hinted_handoff_enabled, sstring, "true", Used,     \
            "Enable or disable hinted handoff. To enable per data center, add data center list. For example: hinted_handoff_enabled: DC1,DC2. A hint indicates that the write needs to be replayed to an unavailable node. " \
            "Related information: About hinted handoff writes"  \
    )   \
@@ -621,7 +621,7 @@ public:
    val(thrift_framed_transport_size_in_mb, uint32_t, 15, Unused,     \
            "Frame size (maximum field length) for Thrift. The frame is the row or part of the row the application is inserting."  \
    )   \
-    val(thrift_max_message_length_in_mb, uint32_t, 16, Unused,     \
+    val(thrift_max_message_length_in_mb, uint32_t, 16, Used,     \
            "The maximum length of a Thrift message in megabytes, including all fields and internal Thrift overhead (1 byte of overhead for each frame). Message length is usually used in conjunction with batches. A frame length greater than or equal to 24 accommodates a batch with four inserts, each of which is 24 bytes. The required message length is greater than or equal to 24+24+24+24+4 (number of frames)."  \
    )   \
    /* Security properties */   \
--- a/db/hints/manager.cc
+++ b/db/hints/manager.cc
@@ -35,6 +35,7 @@
 #include "disk-error-handler.hh"
 #include "lister.hh"
 #include "db/timeout_clock.hh"
+#include "service/priority_manager.hh"

 using namespace std::literals::chrono_literals;

@@ -95,6 +96,7 @@ future<> manager::start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr
        return compute_hints_dir_device_id();
    }).then([this] {
        _strorage_service_anchor->register_subscriber(this);
+        set_started();
    });
 }

@@ -105,7 +107,7 @@ future<> manager::stop() {
        _strorage_service_anchor->unregister_subscriber(this);
    }

-    _stopping = true;
+    set_stopping();

    return _draining_eps_gate.close().finally([this] {
        return parallel_for_each(_ep_managers, [] (auto& pair) {
@@ -277,7 +279,7 @@ inline bool manager::have_ep_manager(ep_key_type ep) const noexcept {
 }

 bool manager::store_hint(ep_key_type ep, schema_ptr s, lw_shared_ptr<const frozen_mutation> fm, tracing::trace_state_ptr tr_state) noexcept {
-    if (_stopping || !can_hint_for(ep)) {
+    if (stopping() || !started() || !can_hint_for(ep)) {
        manager_logger.trace("Can't store a hint to {}", ep);
        ++_stats.dropped;
        return false;
@@ -380,7 +382,7 @@ future<timespec> manager::end_point_hints_manager::sender::get_last_file_modific
    });
 }

-future<> manager::end_point_hints_manager::sender::do_send_one_mutation(mutation m, const std::vector<gms::inet_address>& natural_endpoints) noexcept {
+future<> manager::end_point_hints_manager::sender::do_send_one_mutation(frozen_mutation_and_schema m, const std::vector<gms::inet_address>& natural_endpoints) noexcept {
    return futurize_apply([this, m = std::move(m), &natural_endpoints] () mutable -> future<> {
        // The fact that we send with CL::ALL in both cases below ensures that new hints are not going
        // to be generated as a result of hints sending.
@@ -392,7 +394,8 @@ future<> manager::end_point_hints_manager::sender::do_send_one_mutation(mutation
            // FIXME: using 1h as infinite timeout. If a node is down, we should get an
            // unavailable exception.
            auto timeout = db::timeout_clock::now() + 1h;
-            return _proxy.mutate({std::move(m)}, consistency_level::ALL, timeout, nullptr);
+            //FIXME: Add required frozen_mutation overloads
+            return _proxy.mutate({m.fm.unfreeze(m.s)}, consistency_level::ALL, timeout, nullptr);
        }
    });
 }
@@ -418,21 +421,19 @@ bool manager::end_point_hints_manager::sender::can_send() noexcept {
    }
 }

-mutation manager::end_point_hints_manager::sender::get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf) {
+frozen_mutation_and_schema manager::end_point_hints_manager::sender::get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf) {
    hint_entry_reader hr(buf);
    auto& fm = hr.mutation();
    auto& cm = get_column_mapping(std::move(ctx_ptr), fm, hr);
-    auto& cf = _db.find_column_family(fm.column_family_id());
+    auto schema = _db.find_schema(fm.column_family_id());

-    if (cf.schema()->version() != fm.schema_version()) {
-        mutation m(cf.schema(), fm.decorated_key(*cf.schema()));
-        converting_mutation_partition_applier v(cm, *cf.schema(), m.partition());
+    if (schema->version() != fm.schema_version()) {
+        mutation m(schema, fm.decorated_key(*schema));
+        converting_mutation_partition_applier v(cm, *schema, m.partition());
        fm.partition().accept(cm, v);
-
-        return std::move(m);
-    } else {
-        return fm.unfreeze(cf.schema());
+        return {freeze(m), std::move(schema)};
    }
+    return {std::move(hr).mutation(), std::move(schema)};
 }

 const column_mapping& manager::end_point_hints_manager::sender::get_column_mapping(lw_shared_ptr<send_one_file_ctx> ctx_ptr, const frozen_mutation& fm, const hint_entry_reader& hr) {
@@ -502,7 +503,7 @@ bool manager::check_dc_for(ep_key_type ep) const noexcept {
 }

 void manager::drain_for(gms::inet_address endpoint) {
-    if (_stopping) {
+    if (stopping()) {
        return;
    }

@@ -543,6 +544,7 @@ manager::end_point_hints_manager::sender::sender(end_point_hints_manager& parent
    , _resource_manager(_shard_manager._resource_manager)
    , _proxy(local_storage_proxy)
    , _db(local_db)
+    , _hints_cpu_sched_group(_db.get_streaming_scheduling_group())
    , _gossiper(local_gossiper)
    , _file_update_mutex(_ep_manager.file_update_mutex())
 {}
@@ -555,6 +557,7 @@ manager::end_point_hints_manager::sender::sender(const sender& other, end_point_
    , _resource_manager(_shard_manager._resource_manager)
    , _proxy(other._proxy)
    , _db(other._db)
+    , _hints_cpu_sched_group(other._hints_cpu_sched_group)
    , _gossiper(other._gossiper)
    , _file_update_mutex(_ep_manager.file_update_mutex())
 {}
@@ -610,7 +613,10 @@ manager::end_point_hints_manager::sender::clock::duration manager::end_point_hin
 }

 void manager::end_point_hints_manager::sender::start() {
-    _stopped = seastar::async([this] {
+    seastar::thread_attributes attr;
+
+    attr.sched_group = _hints_cpu_sched_group;
+    _stopped = seastar::async(std::move(attr), [this] {
        manager_logger.trace("ep_manager({})::sender: started", end_point_key());
        while (!stopping()) {
            try {
@@ -630,10 +636,11 @@ void manager::end_point_hints_manager::sender::start() {
    });
 }

-future<> manager::end_point_hints_manager::sender::send_one_mutation(mutation m) {
-    keyspace& ks = _db.find_keyspace(m.schema()->ks_name());
+future<> manager::end_point_hints_manager::sender::send_one_mutation(frozen_mutation_and_schema m) {
+    keyspace& ks = _db.find_keyspace(m.s->ks_name());
    auto& rs = ks.get_replication_strategy();
-    std::vector<gms::inet_address> natural_endpoints = rs.get_natural_endpoints(m.token());
+    auto token = dht::global_partitioner().get_token(*m.s, m.fm.key(*m.s));
+    std::vector<gms::inet_address> natural_endpoints = rs.get_natural_endpoints(std::move(token));

    return do_send_one_mutation(std::move(m), natural_endpoints);
 }
@@ -651,8 +658,8 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
                    return make_ready_future<>();
                }

-                mutation m = this->get_mutation(ctx_ptr, buf);
-                gc_clock::duration gc_grace_sec = m.schema()->gc_grace_seconds();
+                auto m = this->get_mutation(ctx_ptr, buf);
+                gc_clock::duration gc_grace_sec = m.s->gc_grace_seconds();

                // The hint is too old - drop it.
                //
@@ -693,7 +700,7 @@ bool manager::end_point_hints_manager::sender::send_one_file(const sstring& fnam
    lw_shared_ptr<send_one_file_ctx> ctx_ptr = make_lw_shared<send_one_file_ctx>();

    try {
-        auto s = commitlog::read_log_file(fname, [this, secs_since_file_mod, &fname, ctx_ptr] (temporary_buffer<char> buf, db::replay_position rp) mutable {
+        auto s = commitlog::read_log_file(fname, service::get_local_streaming_read_priority(), [this, secs_since_file_mod, &fname, ctx_ptr] (temporary_buffer<char> buf, db::replay_position rp) mutable {
            // Check that we can still send the next hint. Don't try to send it if the destination host
            // is DOWN or if we have already failed to send some of the previous hints.
            if (!draining() && ctx_ptr->state.contains(send_state::segment_replay_failed)) {
@@ -759,7 +766,7 @@ void manager::end_point_hints_manager::sender::send_hints_maybe() noexcept {
    int replayed_segments_count = 0;

    try {
-        while (have_segments()) {
+        while (replay_allowed() && have_segments()) {
            if (!send_one_file(*_segments_to_replay.begin())) {
                break;
            }
@@ -784,14 +791,33 @@ void manager::end_point_hints_manager::sender::send_hints_maybe() noexcept {
    manager_logger.trace("send_hints(): we handled {} segments", replayed_segments_count);
 }

+// Scans hints_directory for directory entries and calls f(dir, de, shard_id) for each one whose
+// name parses as an integer; any other entry is logged at debug level and skipped.
+// f must be callable with (lister::path, directory_entry, int) and return future<>.
+template<typename Func>
+static future<> scan_for_hints_dirs(const sstring& hints_directory, Func&& f) {
+    return lister::scan_dir(hints_directory, { directory_entry_type::directory }, [f = std::forward<Func>(f)] (lister::path dir, directory_entry de) {
+        // Parse the name before calling f(): 'de' is moved into that call, and since the evaluation
+        // order of function arguments is unspecified, parsing de.name inside the argument list could
+        // read a moved-from entry.
+        int shard_id;
+        try {
+            shard_id = std::stoi(de.name.c_str());
+        } catch (const std::logic_error&) {
+            // std::stoi() throws std::invalid_argument or std::out_of_range, both std::logic_error.
+            manager_logger.debug("Ignore invalid directory {}", de.name);
+            return make_ready_future<>();
+        }
+        return f(std::move(dir), std::move(de), shard_id);
+    });
+}
+
 // runs in seastar::async context
 manager::hints_segments_map manager::get_current_hints_segments(const sstring& hints_directory) {
    hints_segments_map current_hints_segments;

    // shards level
-    lister::scan_dir(hints_directory, { directory_entry_type::directory }, [&current_hints_segments] (lister::path dir, directory_entry de) {
-        unsigned shard_id = std::stoi(de.name.c_str());
-
+    scan_for_hints_dirs(hints_directory, [&current_hints_segments] (lister::path dir, directory_entry de, unsigned shard_id) {
        manager_logger.trace("shard_id = {}", shard_id);
        // IPs level
        return lister::scan_dir(dir / de.name.c_str(), { directory_entry_type::directory }, [&current_hints_segments, shard_id] (lister::path dir, directory_entry de) {
@@ -908,9 +925,7 @@ void manager::rebalance_segments_for(
 // runs in seastar::async context
 void manager::remove_irrelevant_shards_directories(const sstring& hints_directory) {
    // shards level
-    lister::scan_dir(hints_directory, { directory_entry_type::directory }, [] (lister::path dir, directory_entry de) {
-        unsigned shard_id = std::stoi(de.name.c_str());
-
+    scan_for_hints_dirs(hints_directory, [] (lister::path dir, directory_entry de, unsigned shard_id) {
        if (shard_id >= smp::count) {
            // IPs level
            return lister::scan_dir(dir / de.name.c_str(), { directory_entry_type::directory, directory_entry_type::regular }, lister::show_hidden::yes, [] (lister::path dir, directory_entry de) {
@@ -936,5 +951,15 @@ future<> manager::rebalance(sstring hints_directory) {
    });
 }

+void manager::update_backlog(size_t backlog, size_t max_backlog) {
+    // Record the latest figures; hinting is allowed only while the backlog is below its limit.
+    _max_backlog_size = max_backlog;
+    _backlog_size = backlog;
+    if (backlog >= max_backlog) {
+        return forbid_hints_for_eps_with_pending_hints();
+    }
+    allow_hints();
+}
+
 }
 }
--- a/db/hints/manager.hh
+++ b/db/hints/manager.hh
@@ -69,6 +69,8 @@ private:
    class drain_tag {};
    using drain = seastar::bool_class<drain_tag>;

+    friend class space_watchdog;
+
 public:
    class end_point_hints_manager {
    public:
@@ -119,6 +121,7 @@ public:
            resource_manager& _resource_manager;
            service::storage_proxy& _proxy;
            database& _db;
+            seastar::scheduling_group _hints_cpu_sched_group;
            gms::gossiper& _gossiper;
            seastar::shared_mutex& _file_update_mutex;

@@ -179,6 +182,10 @@ public:
                return _state.contains(state::stopping);
            }

+            bool replay_allowed() const noexcept { // the sender keeps no flag of its own: it asks _ep_manager
+                return _ep_manager.replay_allowed();
+            }
+
            /// \brief Try to send one hint read from the file.
            ///  - Limit the maximum memory size of hints "in the air" and the maximum total number of hints "in the air".
            ///  - Discard the hints that are older than the grace seconds value of the corresponding table.
@@ -210,7 +217,7 @@ public:
            /// \param ctx_ptr pointer to the send context
            /// \param buf hints file entry
            /// \return The mutation object representing the original mutation stored in the hints file.
-            mutation get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf);
+            frozen_mutation_and_schema get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf);

            /// \brief Get a reference to the column_mapping object for a given frozen mutation.
            /// \param ctx_ptr pointer to the send context
@@ -227,13 +234,13 @@ public:
            /// \param m mutation to send
            /// \param natural_endpoints current replicas for the given mutation
            /// \return future that resolves when the operation is complete
-            future<> do_send_one_mutation(mutation m, const std::vector<gms::inet_address>& natural_endpoints) noexcept;
+            future<> do_send_one_mutation(frozen_mutation_and_schema m, const std::vector<gms::inet_address>& natural_endpoints) noexcept;

            /// \brief Send one mutation out.
            ///
            /// \param m mutation to send
            /// \return future that resolves when the mutation sending processing is complete.
-            future<> send_one_mutation(mutation m);
+            future<> send_one_mutation(frozen_mutation_and_schema m);

            /// \brief Get the last modification time stamp for a given file.
            /// \param fname File name
@@ -328,6 +335,10 @@ public:
            return _hints_in_progress;
        }

+        bool replay_allowed() const noexcept { // forwarded to the shard-level manager held in _shard_manager
+            return _shard_manager.replay_allowed();
+        }
+
        bool can_hint() const noexcept {
            return _state.contains(state::can_hint);
        }
@@ -393,6 +404,17 @@ public:
        }
    };

+    enum class state { // flags describing the manager's lifecycle; several may be set at once (see state_set)
+        started,                // hinting is currently allowed (start() call is complete)
+        replay_allowed,         // replaying (hints sending) is allowed
+        stopping                // hinting is not allowed - stopping is in progress (stop() method has been called)
+    };
+
+    using state_set = enum_set<super_enum<state,
+        state::started,
+        state::replay_allowed,
+        state::stopping>>; // type of the _state member
+
 private:
    using ep_key_type = typename end_point_hints_manager::key_type;
    using ep_managers_map_type = std::unordered_map<ep_key_type, end_point_hints_manager>;
@@ -403,6 +425,7 @@ public:
    static const std::chrono::seconds hint_file_write_timeout;

 private:
+    state_set _state;
    const boost::filesystem::path _hints_dir;
    dev_t _hints_dir_device_id = 0;

@@ -414,7 +437,7 @@ private:
    locator::snitch_ptr& _local_snitch_ptr;
    int64_t _max_hint_window_us = 0;
    database& _local_db;
-    bool _stopping = false;
+
    seastar::gate _draining_eps_gate; // gate used to control the progress of ep_managers stopping not in the context of manager::stop() call

    resource_manager& _resource_manager;
@@ -424,9 +447,14 @@ private:
    seastar::metrics::metric_groups _metrics;
    std::unordered_set<ep_key_type> _eps_with_pending_hints;

+    size_t _max_backlog_size = 0; // limit from the last update_backlog() call; zero-initialized so the getter never reads an indeterminate value
+    size_t _backlog_size = 0;     // backlog from the last update_backlog() call; zero until the first update
+
 public:
    manager(sstring hints_directory, std::vector<sstring> hinted_dcs, int64_t max_hint_window_ms, resource_manager&res_manager, distributed<database>& db);
    virtual ~manager();
+    manager(manager&&) = delete;
+    manager& operator=(manager&&) = delete;
    void register_metrics(const sstring& group_name);
    future<> start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
    future<> stop();
@@ -503,6 +531,18 @@ public:
    void forbid_hints();
    void forbid_hints_for_eps_with_pending_hints();

+    size_t max_backlog_size() const { // limit stored by update_backlog()
+        return _max_backlog_size;
+    }
+
+    size_t backlog_size() const { // backlog stored by update_backlog()
+        return _backlog_size;
+    }
+
+    void allow_replaying() noexcept { // raises state::replay_allowed, the flag replay_allowed() reports
+        _state.set(state::replay_allowed);
+    }
+
    /// \brief Rebalance hints segments among all present shards.
    ///
    /// The difference between the number of segments on every two shard will be not greater than 1 after the
@@ -616,6 +656,28 @@ private:
    /// \param endpoint node that left the cluster
    void drain_for(gms::inet_address endpoint);

+    void update_backlog(size_t backlog, size_t max_backlog);
+
+    bool stopping() const noexcept { // reports the state::stopping flag
+        return _state.contains(state::stopping);
+    }
+
+    void set_stopping() noexcept { // called by stop()
+        _state.set(state::stopping);
+    }
+
+    bool started() const noexcept { // reports the state::started flag
+        return _state.contains(state::started);
+    }
+
+    void set_started() noexcept { // called by start() once the subscriber has been registered
+        _state.set(state::started);
+    }
+
+    bool replay_allowed() const noexcept { // reports the state::replay_allowed flag (raised by allow_replaying())
+        return _state.contains(state::replay_allowed);
+    }
+
 public:
    ep_managers_map_type::iterator find_ep_manager(ep_key_type ep_key) noexcept {
        return _ep_managers.find(ep_key);
--- a/db/hints/resource_manager.cc
+++ b/db/hints/resource_manager.cc
@@ -27,6 +27,7 @@
 #include "lister.hh"
 #include "disk-error-handler.hh"
 #include "seastarx.hh"
+#include <seastar/core/sleep.hh>

 namespace db {
 namespace hints {
@@ -65,19 +66,28 @@ const std::chrono::seconds space_watchdog::_watchdog_period = std::chrono::secon
 space_watchdog::space_watchdog(shard_managers_set& managers, per_device_limits_map& per_device_limits_map)
    : _shard_managers(managers)
    , _per_device_limits_map(per_device_limits_map)
-    , _timer([this] { on_timer(); })
 {}

 void space_watchdog::start() {
-    _timer.arm(timer_clock_type::now());
+    _started = seastar::async([this] {
+        while (!_as.abort_requested()) {
+            try {
+                on_timer();
+            } catch (...) {
+                resource_manager_logger.trace("space_watchdog: unexpected exception - stop all hints generators");
+                // Stop all hint generators if space_watchdog callback failed
+                for (manager& shard_manager : _shard_managers) {
+                    shard_manager.forbid_hints();
+                }
+            }
+            seastar::sleep_abortable(_watchdog_period, _as).get();
+        }
+    }).handle_exception_type([] (const seastar::sleep_aborted& ignored) { });
 }

 future<> space_watchdog::stop() noexcept {
-    try {
-        return _gate.close().finally([this] { _timer.cancel(); });
-    } catch (...) {
-        return make_exception_future<>(std::current_exception());
-    }
+    _as.request_abort();
+    return std::move(_started);
 }

 future<> space_watchdog::scan_one_ep_dir(boost::filesystem::path path, manager& shard_manager, ep_key_type ep_key) {
@@ -94,83 +104,62 @@ future<> space_watchdog::scan_one_ep_dir(boost::filesystem::path path, manager&
    });
 }

+// Called from the context of a seastar::thread.
 void space_watchdog::on_timer() {
-    with_gate(_gate, [this] {
-        return futurize_apply([this] {
-            _total_size = 0;
+    // The hints directories are organized as follows:
+    // <hints root>
+    //    |- <shard1 ID>
+    //    |  |- <EP1 address>
+    //    |     |- <hints file1>
+    //    |     |- <hints file2>
+    //    |     |- ...
+    //    |  |- <EP2 address>
+    //    |     |- ...
+    //    |  |-...
+    //    |- <shard2 ID>
+    //    |  |- ...
+    //    ...
+    //    |- <shardN ID>
+    //    |  |- ...
+    //

-            return do_for_each(_shard_managers, [this] (manager& shard_manager) {
-                shard_manager.clear_eps_with_pending_hints();
-
-                // The hints directories are organized as follows:
-                // <hints root>
-                //    |- <shard1 ID>
-                //    |  |- <EP1 address>
-                //    |     |- <hints file1>
-                //    |     |- <hints file2>
-                //    |     |- ...
-                //    |  |- <EP2 address>
-                //    |     |- ...
-                //    |  |-...
-                //    |- <shard2 ID>
-                //    |  |- ...
-                //    ...
-                //    |- <shardN ID>
-                //    |  |- ...
+    for (auto& per_device_limits : _per_device_limits_map | boost::adaptors::map_values) {
+        _total_size = 0;
+        for (manager& shard_manager : per_device_limits.managers) {
+            shard_manager.clear_eps_with_pending_hints();
+            lister::scan_dir(shard_manager.hints_dir(), {directory_entry_type::directory}, [this, &shard_manager] (lister::path dir, directory_entry de) {
+                _files_count = 0;
+                // Let's scan per-end-point directories and enumerate hints files...
                //
-                return lister::scan_dir(shard_manager.hints_dir(), {directory_entry_type::directory}, [this, &shard_manager] (lister::path dir, directory_entry de) {
-                    _files_count = 0;
-                    // Let's scan per-end-point directories and enumerate hints files...
-                    //
-                    // Let's check if there is a corresponding end point manager (may not exist if the corresponding DC is
-                    // not hintable).
-                    // If exists - let's take a file update lock so that files are not changed under our feet. Otherwise, simply
-                    // continue to enumeration - there is no one to change them.
-                    auto it = shard_manager.find_ep_manager(de.name);
-                    if (it != shard_manager.ep_managers_end()) {
-                        return with_lock(it->second.file_update_mutex(), [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)]() mutable {
-                             return scan_one_ep_dir(dir / ep_name.c_str(), shard_manager, ep_key_type(ep_name));
-                        });
-                    } else {
-                        return scan_one_ep_dir(dir / de.name.c_str(), shard_manager, ep_key_type(de.name));
-                    }
-                });
-            }).then([this] {
-                return do_for_each(_per_device_limits_map, [this](per_device_limits_map::value_type& per_device_limits_entry) {
-                    space_watchdog::per_device_limits& per_device_limits = per_device_limits_entry.second;
-
-                    size_t adjusted_quota = 0;
-                    size_t delta = boost::accumulate(per_device_limits.managers, 0, [] (size_t sum, manager& shard_manager) {
-                        return sum + shard_manager.ep_managers_size() * resource_manager::hint_segment_size_in_mb * 1024 * 1024;
+                // Let's check if there is a corresponding end point manager (may not exist if the corresponding DC is
+                // not hintable).
+                // If exists - let's take a file update lock so that files are not changed under our feet. Otherwise, simply
+                // continue to enumeration - there is no one to change them.
+                auto it = shard_manager.find_ep_manager(de.name);
+                if (it != shard_manager.ep_managers_end()) {
+                    return with_lock(it->second.file_update_mutex(), [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)]() mutable {
+                        return scan_one_ep_dir(dir / ep_name.c_str(), shard_manager, ep_key_type(ep_name));
                    });
-                    if (per_device_limits.max_shard_disk_space_size > delta) {
-                        adjusted_quota = per_device_limits.max_shard_disk_space_size - delta;
-                    }
+                } else {
+                    return scan_one_ep_dir(dir / de.name.c_str(), shard_manager, ep_key_type(de.name));
+                }
+            }).get();
+        }

-                    bool can_hint = _total_size < adjusted_quota;
-                    resource_manager_logger.trace("space_watchdog: total_size ({}) {} max_shard_disk_space_size ({})", _total_size, can_hint ? "<" : ">=", adjusted_quota);
-
-                    if (!can_hint) {
-                        for (manager& shard_manager : per_device_limits.managers) {
-                            shard_manager.forbid_hints_for_eps_with_pending_hints();
-                        }
-                    } else {
-                        for (manager& shard_manager : per_device_limits.managers) {
-                            shard_manager.allow_hints();
-                        }
-    }
-                });
-            });
-        }).handle_exception([this] (auto eptr) {
-            resource_manager_logger.trace("space_watchdog: unexpected exception - stop all hints generators");
-            // Stop all hint generators if space_watchdog callback failed
-            for (manager& shard_manager : _shard_managers) {
-                shard_manager.forbid_hints();
-            }
-        }).finally([this] {
-            _timer.arm(_watchdog_period);
+        // Adjust the quota to take into account the space we guarantee to every end point manager.
+        // The accumulator is seeded with size_t(0): a plain 0 would make it an int and truncate large sums.
+        size_t delta = boost::accumulate(per_device_limits.managers, size_t(0), [] (size_t sum, manager& shard_manager) {
+            return sum + shard_manager.ep_managers_size() * resource_manager::hint_segment_size_in_mb * 1024 * 1024;
+        });
+        size_t adjusted_quota = 0;
+        if (per_device_limits.max_shard_disk_space_size > delta) {
+            adjusted_quota = per_device_limits.max_shard_disk_space_size - delta;
+        }
+
+        resource_manager_logger.trace("space_watchdog: consuming {}/{} bytes", _total_size, adjusted_quota);
+        for (manager& shard_manager : per_device_limits.managers) {
+            shard_manager.update_backlog(_total_size, adjusted_quota);
+        }
+    }
 }

 future<> resource_manager::start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr) {
@@ -183,6 +172,10 @@ future<> resource_manager::start(shared_ptr<service::storage_proxy> proxy_ptr, s
    });
 }

+void resource_manager::allow_replaying() noexcept {
+    for (manager& shard_manager : _shard_managers) { shard_manager.allow_replaying(); } // forward the call to every registered shard manager
+}
+
 future<> resource_manager::stop() noexcept {
    return parallel_for_each(_shard_managers, [](manager& m) {
        return m.stop();
@@ -201,14 +194,18 @@ future<> resource_manager::prepare_per_device_limits() {
        auto it = _per_device_limits_map.find(device_id);
        if (it == _per_device_limits_map.end()) {
            return is_mountpoint(shard_manager.hints_dir().parent_path()).then([this, device_id, &shard_manager](bool is_mountpoint) {
-                // By default, give each group of managers 10% of the available disk space. Give each shard an equal share of the available space.
-                size_t max_size = boost::filesystem::space(shard_manager.hints_dir().c_str()).capacity / (10 * smp::count);
-                // If hints directory is a mountpoint, we assume it's on dedicated (i.e. not shared with data/commitlog/etc) storage.
-                // Then, reserve 90% of all space instead of 10% above.
-                if (is_mountpoint) {
-                    max_size *= 9;
+                auto [it, inserted] = _per_device_limits_map.emplace(device_id, space_watchdog::per_device_limits{});
+                // Since we possibly deferred, we need to recheck the _per_device_limits_map.
+                if (inserted) {
+                    // By default, give each group of managers 10% of the available disk space. Give each shard an equal share of the available space.
+                    it->second.max_shard_disk_space_size = boost::filesystem::space(shard_manager.hints_dir().c_str()).capacity / (10 * smp::count);
+                    // If hints directory is a mountpoint, we assume it's on dedicated (i.e. not shared with data/commitlog/etc) storage.
+                    // Then, reserve 90% of all space instead of 10% above.
+                    if (is_mountpoint) {
+                        it->second.max_shard_disk_space_size *= 9;
+                    }
                }
-                _per_device_limits_map.emplace(device_id, space_watchdog::per_device_limits{{std::ref(shard_manager)}, max_size});
+                it->second.managers.emplace_back(std::ref(shard_manager));
            });
        } else {
            it->second.managers.emplace_back(std::ref(shard_manager));
--- a/db/hints/resource_manager.hh
+++ b/db/hints/resource_manager.hh
@@ -22,6 +22,7 @@
 #pragma once

 #include <cstdint>
+#include <seastar/core/abort_source.hh>
 #include <seastar/core/semaphore.hh>
 #include <seastar/core/gate.hh>
 #include <seastar/core/memory.hh>
@@ -78,8 +79,8 @@ private:
    shard_managers_set& _shard_managers;
    per_device_limits_map& _per_device_limits_map;

-    seastar::gate _gate;
-    seastar::timer<timer_clock_type> _timer;
+    future<> _started = make_ready_future<>();
+    seastar::abort_source _as;
    int _files_count = 0;

 public:
@@ -137,6 +138,9 @@ public:
        , _space_watchdog(_shard_managers, _per_device_limits_map)
    {}

+    resource_manager(resource_manager&&) = delete;
+    resource_manager& operator=(resource_manager&&) = delete;
+
    future<semaphore_units<semaphore_default_exception_factory>> get_send_units_for(size_t buf_size);

    bool too_many_hints_in_progress() const {
@@ -156,6 +160,7 @@ public:
    }

    future<> start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
+    void allow_replaying() noexcept;
    future<> stop() noexcept;
    void register_manager(manager& m);
    future<> prepare_per_device_limits();
--- a/db/system_distributed_keyspace.cc
+++ b/db/system_distributed_keyspace.cc
@@ -87,7 +87,7 @@ future<> system_distributed_keyspace::start() {
        return do_with(all_tables(), [this] (std::vector<schema_ptr>& tables) {
            return do_for_each(tables, [this] (schema_ptr table) {
                return ignore_existing([this, table = std::move(table)] {
-                    return _mm.announce_new_column_family(std::move(table), false);
+                    return _mm.announce_new_column_family(std::move(table), api::min_timestamp, false);
                });
            });
        });
--- a/db/view/view.cc
+++ b/db/view/view.cc
@@ -931,7 +931,7 @@ future<> mutate_MV(const dht::token& base_token, std::vector<mutation> mutations
    auto fs = std::make_unique<std::vector<future<>>>();
    for (auto& mut : mutations) {
        auto view_token = mut.token();
-        auto keyspace_name = mut.schema()->ks_name();
+        auto& keyspace_name = mut.schema()->ks_name();
        auto paired_endpoint = get_view_natural_endpoint(keyspace_name, base_token, view_token);
        auto pending_endpoints = service::get_local_storage_service().get_token_metadata().pending_endpoints_for(view_token, keyspace_name);
        if (paired_endpoint) {
@@ -951,10 +951,19 @@ future<> mutate_MV(const dht::token& base_token, std::vector<mutation> mutations
                // do not wait for it to complete.
                // Note also that mutate_locally(mut) copies mut (in
                // frozen form) so don't need to increase its lifetime.
-                fs->push_back(service::get_local_storage_proxy().mutate_locally(mut).handle_exception([&stats] (auto ep) {
-                    vlogger.error("Error applying local view update: {}", ep);
-                    stats.view_updates_failed_local++;
-                    return make_exception_future<>(std::move(ep));
+                // send_to_endpoint() below updates statistics on pending
+                // writes but mutate_locally() doesn't, so we need to do that here.
+                ++stats.writes;
+                fs->push_back(service::get_local_storage_proxy().mutate_locally(mut).then_wrapped([&stats] (auto&& fut) {
+                    --stats.writes;
+                    if (fut.failed()) {
+                        auto ep = fut.get_exception();
+                        vlogger.error("Error applying local view update: {}", ep);
+                        ++stats.view_updates_failed_local;
+                        return make_exception_future<>(std::move(ep));
+                    } else {
+                        return make_ready_future<>();
+                    }
                }));
            } else {
                vlogger.debug("Sending view update to endpoint {}, with pending endpoints = {}", *paired_endpoint, pending_endpoints);
@@ -1226,6 +1235,20 @@ future<> view_builder::calculate_shard_build_step(
        }
    }

+    // All shards need to arrive at the same decisions on whether or not to
+    // restart a view build at some common token (reshard), and which token
+    // to restart at. So we need to wait until all shards have read the view
+    // build statuses before they can all proceed to make the (same) decision.
+    // If we don't synchronize here, a fast shard may make a decision, start
+    // building and finish a build step - before the slowest shard even read
+    // the view build information.
+    container().invoke_on(0, [] (view_builder& builder) {
+        if (++builder._shards_finished_read == smp::count) {
+            builder._shards_finished_read_promise.set_value();
+        }
+        return builder._shards_finished_read_promise.get_shared_future();
+    }).get();
+
    std::unordered_set<utils::UUID> loaded_views;
    if (view_build_status_per_shard.size() != smp::count) {
        reshard(std::move(view_build_status_per_shard), loaded_views);
@@ -1591,10 +1614,10 @@ future<> view_builder::maybe_mark_view_as_built(view_ptr view, dht::token next_t
    });
 }

-future<> view_builder::wait_until_built(const sstring& ks_name, const sstring& view_name, lowres_clock::time_point timeout) {
-    return container().invoke_on(0, [ks_name, view_name, timeout] (view_builder& builder) {
+future<> view_builder::wait_until_built(const sstring& ks_name, const sstring& view_name) {
+    return container().invoke_on(0, [ks_name, view_name] (view_builder& builder) {
        auto v = std::pair(std::move(ks_name), std::move(view_name));
-        return builder._build_notifiers[std::move(v)].get_shared_future(timeout);
+        return builder._build_notifiers[std::move(v)].get_shared_future();
    });
 }

--- a/db/view/view_builder.hh
+++ b/db/view/view_builder.hh
@@ -151,6 +151,10 @@ class view_builder final : public service::migration_listener::only_view_notific
    future<> _started = make_ready_future<>();
    // Used to coordinate between shards the conclusion of the build process for a particular view.
    std::unordered_set<utils::UUID> _built_views;
+    // Counter and promise (both on shard 0 only!) allowing to wait for all
+    // shards to have read the view build statuses
+    unsigned _shards_finished_read = 0;
+    seastar::shared_promise<> _shards_finished_read_promise;
    // Used for testing.
    std::unordered_map<std::pair<sstring, sstring>, seastar::shared_promise<>, utils::tuple_hash> _build_notifiers;

@@ -178,7 +182,7 @@ public:
    virtual void on_drop_view(const sstring& ks_name, const sstring& view_name) override;

    // For tests
-    future<> wait_until_built(const sstring& ks_name, const sstring& view_name, lowres_clock::time_point timeout);
+    future<> wait_until_built(const sstring& ks_name, const sstring& view_name);

 private:
    build_step& get_or_create_build_step(utils::UUID);
--- a/db/view/view_update_from_staging_generator.cc
+++ b/db/view/view_update_from_staging_generator.cc
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2018 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "view_update_from_staging_generator.hh"
+
+namespace db::view {
+
+
+future<> view_update_from_staging_generator::start() { // launches the worker thread; its future is kept in _started, the returned future is already ready
+    _started = seastar::async([this]() mutable {
+        while (!_sstables_with_tables.empty()) { // drain the queue front to back
+            auto& entry = _sstables_with_tables.front();
+            schema_ptr s = entry.t->schema();
+            if (_as.abort_requested()) {
+                return; // abort requested: leave the remaining entries queued and end the thread
+            }
+            flat_mutation_reader staging_sstable_reader = entry.sst->read_rows_flat(s);
+            auto result = staging_sstable_reader.consume_in_thread(view_updating_consumer(s, _proxy, entry.sst, _as), db::no_timeout);
+            if (result == stop_iteration::no) { // NOTE(review): presumably the consumer's end-of-stream result, i.e. "not aborted" - confirm
+                entry.t->move_sstable_from_staging_in_thread(entry.sst);
+                _registration_sem.signal(); // give one unit back for register_staging_sstable() waiters
+                _sstables_with_tables.pop_front(); // the entry is dropped only after it was fully processed
+            }
+        }
+    });
+    return make_ready_future<>();
+}
+
+future<> view_update_from_staging_generator::stop() {
+    _as.request_abort(); // observed by the loop in start() and by view_updating_consumer through abort_requested()
+    return std::move(_started); // hands out the future of the thread launched by start()
+}
+
+future<> view_update_from_staging_generator::register_staging_sstable(sstables::shared_sstable sst, lw_shared_ptr<table> table) {
+    _sstables_with_tables.emplace_back(std::move(sst), std::move(table)); // queued even when abort was already requested
+    if (_as.abort_requested()) {
+        return make_ready_future<>(); // aborting: neither start a worker nor wait on the semaphore
+    }
+    future<> restart = make_ready_future<>();
+    if (_started.available()) { // no worker thread is running (never started, or the previous one finished)
+        restart = start();
+    }
+    return restart.then([this] () {
+        return _registration_sem.wait(1); // NOTE(review): looks like back-pressure bounded by registration_queue_size - confirm
+    });
+}
+
+}
--- a/db/view/view_update_from_staging_generator.hh
+++ b/db/view/view_update_from_staging_generator.hh
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2018 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "database.hh"
+#include "sstables/sstables.hh"
+#include "db/view/view_updating_consumer.hh"
+
+namespace db::view {
+
+class view_update_from_staging_generator { // queues staging sstables (register_staging_sstable) and processes them in a thread (start)
+    static constexpr size_t registration_queue_size = 5; // initial number of units in _registration_sem
+    database& _db;
+    service::storage_proxy& _proxy;
+    seastar::abort_source _as; // abort is requested by stop()
+    future<> _started = make_ready_future<>(); // future of the thread launched by start(); ready until then
+    semaphore _registration_sem{registration_queue_size};
+    struct sstable_with_table { // queue entry; the by-value constructor arguments are moved, not copied, into the members
+        sstables::shared_sstable sst;
+        lw_shared_ptr<table> t;
+        sstable_with_table(sstables::shared_sstable sst, lw_shared_ptr<table> t) : sst(std::move(sst)), t(std::move(t)) { }
+    };
+    std::deque<sstable_with_table> _sstables_with_tables; // pending entries, consumed from the front
+public:
+    view_update_from_staging_generator(database& db, service::storage_proxy& proxy) : _db(db), _proxy(proxy) { }
+
+    future<> start();
+    future<> stop();
+    future<> register_staging_sstable(sstables::shared_sstable sst, lw_shared_ptr<table> table);
+};
+
+}
--- a/db/view/view_updating_consumer.hh
+++ b/db/view/view_updating_consumer.hh
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2018 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "service/storage_proxy.hh"
+#include "dht/i_partitioner.hh"
+#include "schema.hh"
+#include "mutation_fragment.hh"
+#include "sstables/shared_sstable.hh"
+
+namespace db::view {
+
+/*
+ * A consumer that pushes materialized view updates for each consumed mutation.
+ * It is expected to be run in seastar::async threaded context through consume_in_thread()
+ */
+class view_updating_consumer {
+    schema_ptr _schema;
+    lw_shared_ptr<table> _table;
+    sstables::shared_sstable _excluded_sstable;
+    const seastar::abort_source& _as; // checked before applying rows/range tombstones and at end of stream
+    std::optional<mutation> _m; // mutation being accumulated; (re)created by consume_new_partition()
+public:
+    view_updating_consumer(schema_ptr schema, service::storage_proxy& proxy, sstables::shared_sstable excluded_sstable, const seastar::abort_source& as)
+            : _schema(std::move(schema))
+            , _table(proxy.get_db().local().find_column_family(_schema->id()).shared_from_this()) // reads the member (the parameter is moved-from); relies on _schema being declared before _table
+            , _excluded_sstable(std::move(excluded_sstable)) // taken by value, so move instead of copying the shared pointer again
+            , _as(as)
+            , _m()
+    { }
+
+    void consume_new_partition(const dht::decorated_key& dk) {
+        _m = mutation(_schema, dk, mutation_partition(_schema)); // start a fresh, empty mutation for this partition key
+    }
+
+    void consume(tombstone t) {
+        _m->partition().apply(std::move(t));
+    }
+
+    stop_iteration consume(static_row&& sr) {
+        if (_as.abort_requested()) {
+            return stop_iteration::yes; // aborted: the row is not applied
+        }
+        _m->partition().apply(*_schema, std::move(sr));
+        return stop_iteration::no;
+    }
+
+    stop_iteration consume(clustering_row&& cr) {
+        if (_as.abort_requested()) {
+            return stop_iteration::yes; // aborted: the row is not applied
+        }
+        _m->partition().apply(*_schema, std::move(cr));
+        return stop_iteration::no;
+    }
+
+    stop_iteration consume(range_tombstone&& rt) {
+        if (_as.abort_requested()) {
+            return stop_iteration::yes; // aborted: the range tombstone is not applied
+        }
+        _m->partition().apply(*_schema, std::move(rt));
+        return stop_iteration::no;
+    }
+
+    // Expected to be run in seastar::async threaded context (consume_in_thread())
+    stop_iteration consume_end_of_partition();
+
+    stop_iteration consume_end_of_stream() {
+        return stop_iteration(_as.abort_requested()); // yes when abort was requested, no otherwise
+    }
+};
+
+}
+
--- a/dht/boot_strapper.cc
+++ b/dht/boot_strapper.cc
@@ -49,7 +49,7 @@ namespace dht {
 future<> boot_strapper::bootstrap() {
    blogger.debug("Beginning bootstrap process: sorted_tokens={}", _token_metadata.sorted_tokens());

-    auto streamer = make_lw_shared<range_streamer>(_db, _token_metadata, _tokens, _address, "Bootstrap");
+    auto streamer = make_lw_shared<range_streamer>(_db, _token_metadata, _tokens, _address, "Bootstrap", streaming::stream_reason::bootstrap);
    streamer->add_source_filter(std::make_unique<range_streamer::failure_detector_source_filter>(gms::get_local_failure_detector()));
    for (const auto& keyspace_name : _db.local().get_non_system_keyspaces()) {
        auto& ks = _db.local().find_keyspace(keyspace_name);
--- a/dht/range_streamer.cc
+++ b/dht/range_streamer.cc
@@ -294,7 +294,7 @@ future<> range_streamer::do_stream_async() {
                size_t nr_ranges_per_stream_plan = nr_ranges_total / 10;
                dht::token_range_vector ranges_to_stream;
                auto do_streaming = [&] {
-                    auto sp = stream_plan(sprint("%s-%s-index-%d", description, keyspace, sp_index++));
+                    auto sp = stream_plan(sprint("%s-%s-index-%d", description, keyspace, sp_index++), _reason);
                    logger.info("{} with {} for keyspace={}, {} out of {} ranges: ranges = {}",
                            description, source, keyspace, nr_ranges_streamed, nr_ranges_total, ranges_to_stream.size());
                    if (_nr_rx_added) {
--- a/dht/range_streamer.hh
+++ b/dht/range_streamer.hh
@@ -42,6 +42,7 @@
 #include "locator/snitch_base.hh"
 #include "streaming/stream_plan.hh"
 #include "streaming/stream_state.hh"
+#include "streaming/stream_reason.hh"
 #include "gms/inet_address.hh"
 #include "gms/i_failure_detector.hh"
 #include "range.hh"
@@ -101,17 +102,18 @@ public:
        }
    };

-    range_streamer(distributed<database>& db, token_metadata& tm, std::unordered_set<token> tokens, inet_address address, sstring description)
+    range_streamer(distributed<database>& db, token_metadata& tm, std::unordered_set<token> tokens, inet_address address, sstring description, streaming::stream_reason reason)
        : _db(db)
        , _metadata(tm)
        , _tokens(std::move(tokens))
        , _address(address)
        , _description(std::move(description))
+        , _reason(reason)
        , _stream_plan(_description) {
    }

-    range_streamer(distributed<database>& db, token_metadata& tm, inet_address address, sstring description)
-        : range_streamer(db, tm, std::unordered_set<token>(), address, description) {
+    range_streamer(distributed<database>& db, token_metadata& tm, inet_address address, sstring description, streaming::stream_reason reason)
+        : range_streamer(db, tm, std::unordered_set<token>(), address, description, reason) {
    }

    void add_source_filter(std::unique_ptr<i_source_filter> filter) {
@@ -166,6 +168,7 @@ private:
    std::unordered_set<token> _tokens;
    inet_address _address;
    sstring _description;
+    streaming::stream_reason _reason;
    std::unordered_multimap<sstring, std::unordered_map<inet_address, dht::token_range_vector>> _to_stream;
    std::unordered_set<std::unique_ptr<i_source_filter>> _source_filters;
    stream_plan _stream_plan;
--- a/dist/common/scripts/scylla_prepare
+++ b/dist/common/scripts/scylla_prepare
@@ -62,10 +62,9 @@ if __name__ == '__main__':
            run('hugeadm --create-mounts')
        fi
    else:
-        set_nic = cfg.get('SET_NIC')
+        set_nic_and_disks = get_set_nic_and_disks_config_value(cfg)
        ifname = cfg.get('IFNAME')
-        if set_nic  == 'yes':
+        if set_nic_and_disks == 'yes':
            create_perftune_conf(ifname)
-            run('/usr/lib/scylla/posix_net_conf.sh {IFNAME} --options-file /etc/scylla.d/perftune.yaml'.format(IFNAME=ifname))
+            run("{} --options-file /etc/scylla.d/perftune.yaml".format(perftune_base_command()))

-    run('/usr/lib/scylla/scylla-blocktune')
--- a/dist/common/scripts/scylla_setup
+++ b/dist/common/scripts/scylla_setup
@@ -122,8 +122,8 @@ if __name__ == '__main__':
                        help='specify NTP domain')
    parser.add_argument('--ami', action='store_true', default=False,
                        help='setup AMI instance')
-    parser.add_argument('--setup-nic', action='store_true', default=False,
-                        help='optimize NIC queue')
+    parser.add_argument('--setup-nic-and-disks', action='store_true', default=False,
+                        help='optimize NIC and disks')
    parser.add_argument('--developer-mode', action='store_true', default=False,
                        help='enable developer mode')
    parser.add_argument('--no-ec2-check', action='store_true', default=False,
@@ -173,7 +173,7 @@ if __name__ == '__main__':

    disks = args.disks
    nic = args.nic
-    set_nic = args.setup_nic
+    set_nic_and_disks = args.setup_nic_and_disks
    ec2_check = not args.no_ec2_check
    kernel_check = not args.no_kernel_check
    verify_package = not args.no_verify_package
@@ -336,11 +336,11 @@ if __name__ == '__main__':
    if interactive:
        sysconfig_setup = interactive_ask_service('Do you want to setup a system-wide customized configuration for Scylla?', 'Yes - setup the sysconfig file. No - skips this step.', 'yes')
    if sysconfig_setup:
-        nic = interactive_choose_nic()
        if interactive:
-            set_nic = interactive_ask_service('Do you want to enable Network Interface Card (NIC) optimization?', 'Yes - optimize the NIC queue settings. Selecting Yes greatly improves performance. No - skip this step.', 'yes')
+            nic = interactive_choose_nic()
+            set_nic_and_disks = interactive_ask_service('Do you want to enable Network Interface Card (NIC) and disk(s) optimization?', 'Yes - optimize the NIC queue and disks settings. Selecting Yes greatly improves performance. No - skip this step.', 'yes')
    if sysconfig_setup:
-        setup_args = '--setup-nic' if set_nic else ''
+        setup_args = '--setup-nic-and-disks' if set_nic_and_disks else ''
        run_setup_script('NIC queue', '/usr/lib/scylla/scylla_sysconfig_setup --nic {nic} {setup_args}'.format(nic=nic, setup_args=setup_args))

    if interactive:
--- a/dist/common/scripts/scylla_sysconfig_setup
+++ b/dist/common/scripts/scylla_sysconfig_setup
@@ -40,7 +40,7 @@ if __name__ == '__main__':
        cfg = sysconfig_parser('/etc/sysconfig/scylla-server')
    else:
        cfg = sysconfig_parser('/etc/default/scylla-server')
-    set_nic = str2bool(cfg.get('SET_NIC'))
+    set_nic_and_disks = str2bool(get_set_nic_and_disks_config_value(cfg))
    ami = str2bool(cfg.get('AMI'))

    parser = argparse.ArgumentParser(description='Setting parameters on Scylla sysconfig file.')
@@ -58,8 +58,8 @@ if __name__ == '__main__':
                        help='scylla home directory')
    parser.add_argument('--confdir',
                        help='scylla config directory')
-    parser.add_argument('--setup-nic', action='store_true', default=set_nic,
-                        help='setup NIC\'s interrupts, RPS, XPS')
+    parser.add_argument('--setup-nic-and-disks', action='store_true', default=set_nic_and_disks,
+                        help='setup NIC\'s and disks\' interrupts, RPS, XPS, nomerges and I/O scheduler')
    parser.add_argument('--ami', action='store_true', default=ami,
                        help='AMI instance mode')
    args = parser.parse_args()
@@ -71,8 +71,8 @@ if __name__ == '__main__':
    ifname = args.nic if args.nic else cfg.get('IFNAME')
    network_mode = args.mode if args.mode else cfg.get('NETWORK_MODE')

-    if args.setup_nic:
-        rps_cpus = out('/usr/lib/scylla/posix_net_conf.sh --cpu-mask {}'.format(ifname))
+    if args.setup_nic_and_disks:
+        rps_cpus = out('{} --tune net --nic {} --get-cpu-mask'.format(perftune_base_command(), ifname))
        if len(rps_cpus) > 0:
            cpuset = hex2list(rps_cpus)
            run('/usr/lib/scylla/scylla_cpuset_setup --cpuset {}'.format(cpuset))
@@ -104,8 +104,13 @@ if __name__ == '__main__':
        cfg.set('SCYLLA_HOME', args.homedir)
    if args.confdir:
        cfg.set('SCYLLA_CONF', args.confdir)
-    if str2bool(cfg.get('SET_NIC')) != args.setup_nic:
-        cfg.set('SET_NIC', bool2str(args.setup_nic))
+
+    if str2bool(get_set_nic_and_disks_config_value(cfg)) != args.setup_nic_and_disks:
+        if cfg.has_option('SET_NIC'):
+            cfg.set('SET_NIC', bool2str(args.setup_nic_and_disks))
+        else:
+            cfg.set('SET_NIC_AND_DISKS', bool2str(args.setup_nic_and_disks))
+
    if str2bool(cfg.get('AMI')) != args.ami:
        cfg.set('AMI', bool2str(args.ami))
    cfg.commit()
--- a/dist/common/scripts/scylla_util.py
+++ b/dist/common/scripts/scylla_util.py
@@ -28,6 +28,7 @@ import time
 import urllib.error
 import urllib.parse
 import urllib.request
+import yaml


 def curl(url, byte=False):
@@ -384,6 +385,35 @@ def get_mode_cpuset(nic, mode):
    except subprocess.CalledProcessError:
        return '-1'

+def get_scylla_dirs():
+    """
+    Returns a list of scylla directories configured in /etc/scylla/scylla.yaml.
+    Verifies that mandatory parameters are set.
+    """
+    scylla_yaml_name = '/etc/scylla/scylla.yaml'
+    with open(scylla_yaml_name) as f:  # close the file deterministically
+        y = yaml.safe_load(f)  # safe_load: plain config data only, never constructs arbitrary Python objects
+
+    # Check that mandatory fields are set (an empty list is falsy, so no separate len() check is needed)
+    if 'data_file_directories' not in y or \
+            not y['data_file_directories'] or \
+            not " ".join(y['data_file_directories']).strip():
+        raise Exception("{}: at least one directory has to be set in 'data_file_directories'".format(scylla_yaml_name))
+    if 'commitlog_directory' not in y or not y['commitlog_directory']:
+        raise Exception("{}: 'commitlog_directory' has to be set".format(scylla_yaml_name))
+
+    dirs = []
+    dirs.extend(y['data_file_directories'])
+    dirs.append(y['commitlog_directory'])
+
+    if 'hints_directory' in y and y['hints_directory']:
+        dirs.append(y['hints_directory'])
+
+    return [d for d in dirs if d is not None]
+
+def perftune_base_command():
+    dir_args = " ".join("--dir {}".format(d) for d in get_scylla_dirs())  # one --dir argument per configured scylla directory
+    return '/usr/lib/scylla/perftune.py {}'.format("--tune disks " + dir_args)

 def get_cur_cpuset():
    cfg = sysconfig_parser('/etc/scylla.d/cpuset.conf')
@@ -419,6 +449,25 @@ def create_perftune_conf(nic='eth0'):
 def is_valid_nic(nic):
    return os.path.exists('/sys/class/net/{}'.format(nic))

+# Remove this when we do not support SET_NIC configuration value anymore
+def get_set_nic_and_disks_config_value(cfg):
+    """
+    Get the SET_NIC_AND_DISKS configuration value.
+    Return the SET_NIC configuration value if SET_NIC_AND_DISKS is not found (old releases case).
+    :param cfg: sysconfig_parser object
+    :return configuration value
+    :except If the configuration value is not found
+    """
+
+    # Sanity check
+    if cfg.has_option('SET_NIC_AND_DISKS') and cfg.has_option('SET_NIC'):
+        raise Exception("Only one of 'SET_NIC_AND_DISKS' and 'SET_NIC' is allowed to be present")
+
+    # Test for the new option explicitly instead of a catch-all except that would also hide unrelated errors
+    if cfg.has_option('SET_NIC_AND_DISKS'):
+        return cfg.get('SET_NIC_AND_DISKS')
+    # For backwards compatibility
+    return cfg.get('SET_NIC')

 class SystemdException(Exception):
    pass
@@ -483,8 +532,11 @@ class sysconfig_parser:
    def get(self, key):
        return self._cfg.get('global', key).strip('"')

+    def has_option(self, key):  # True when `key` is present in the 'global' section of the parsed config
+        return self._cfg.has_option('global', key)
+
    def set(self, key, val):
-        if not self._cfg.has_option('global', key):
+        if not self.has_option(key):
            return self.__add(key, val)
        self._data = re.sub('^{}=[^\n]*$'.format(key), '{}="{}"'.format(key, self.__escape(val)), self._data, flags=re.MULTILINE)
        self.__load()
--- a/dist/common/sysconfig/scylla-server
+++ b/dist/common/sysconfig/scylla-server
@@ -10,8 +10,8 @@ BRIDGE=virbr0
 # ethernet device name
 IFNAME=eth0

-# setup NIC's interrupts, RPS, XPS (posix)
-SET_NIC=no
+# setup NIC's and disks' interrupts, RPS, XPS, nomerges and I/O scheduler (posix)
+SET_NIC_AND_DISKS=no

 # ethernet device driver (dpdk)
 ETHDRV=
--- a/dist/common/sysctl.d/99-scylla-aio.conf
+++ b/dist/common/sysctl.d/99-scylla-aio.conf
@@ -0,0 +1,2 @@
+# Raise max AIO events
+fs.aio-max-nr = 1048576
--- a/dist/common/systemd/scylla-housekeeping-restart.service.mustache
+++ b/dist/common/systemd/scylla-housekeeping-restart.service.mustache
@@ -6,7 +6,12 @@ After=network.target
 Type=simple
 User=scylla
 Group=scylla
+{{#debian}}
+ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files '/etc/apt/sources.list.d/scylla*.list' version --mode r
+{{/debian}}
+{{#redhat}}
 ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files '/etc/yum.repos.d/scylla*.repo' version --mode r
+{{/redhat}}

 [Install]
 WantedBy=multi-user.target
--- a/dist/debian/debian/scylla-kernel-conf.install
+++ b/dist/debian/debian/scylla-kernel-conf.install
@@ -1 +1,2 @@
 dist/common/sysctl.d/99-scylla-sched.conf /etc/sysctl.d
+dist/common/sysctl.d/99-scylla-aio.conf /etc/sysctl.d
--- a/dist/debian/debian/scylla-kernel-conf.postinst
+++ b/dist/debian/debian/scylla-kernel-conf.postinst
@@ -9,6 +9,7 @@ if [[ $KVER =~ 3\.13\.0\-([0-9]+)-generic ]]; then
 else
    # expect failures in virtualized environments
    sysctl -p/etc/sysctl.d/99-scylla-sched.conf || :
+    sysctl -p/etc/sysctl.d/99-scylla-aio.conf || :
 fi

 #DEBHELPER#
--- a/dist/debian/rules.mustache
+++ b/dist/debian/rules.mustache
@@ -4,7 +4,7 @@ export PYBUILD_DISABLE=1
 jobs := $(shell echo $$DEB_BUILD_OPTIONS | sed -r "s/.*parallel=([0-9]+).*/-j\1/")

 override_dh_auto_configure:
-	./configure.py --with=scylla --with=iotune --enable-dpdk --mode=release --static-thrift --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7 --cflags="-I/opt/scylladb/include -L/opt/scylladb/lib/x86-linux-gnu/" --ldflags="-Wl,-rpath=/opt/scylladb/lib"
+	./configure.py --with=scylla --with=iotune --enable-dpdk --mode=release --static-thrift --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7 --c-compiler=/opt/scylladb/bin/gcc-7 --cflags="-I/opt/scylladb/include -L/opt/scylladb/lib/x86-linux-gnu/" --ldflags="-Wl,-rpath=/opt/scylladb/lib"

 override_dh_auto_build:
 	PATH="/opt/scylladb/bin:$$PATH" ninja $(jobs)
--- a/dist/debian/scylla-server.install.mustache
+++ b/dist/debian/scylla-server.install.mustache
@@ -1,7 +1,6 @@
 dist/common/limits.d/scylla.conf etc/security/limits.d
 dist/common/scylla.d/*.conf etc/scylla.d
 seastar/dpdk/usertools/dpdk-devbind.py usr/lib/scylla
-seastar/scripts/posix_net_conf.sh usr/lib/scylla
 seastar/scripts/perftune.py usr/lib/scylla
 dist/common/scripts/* usr/lib/scylla
 scylla-housekeeping usr/lib/scylla
--- a/dist/docker/redhat/Dockerfile
+++ b/dist/docker/redhat/Dockerfile
@@ -26,7 +26,7 @@ ADD commandlineparser.py /commandlineparser.py
 ADD docker-entrypoint.py /docker-entrypoint.py

 # Install Scylla:
-RUN curl http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo -o /etc/yum.repos.d/scylla.repo && \
+RUN curl http://downloads.scylladb.com/rpm/centos/scylla-3.0.repo -o /etc/yum.repos.d/scylla.repo && \
    yum -y install epel-release && \
    yum -y clean expire-cache && \
    yum -y update && \
--- a/dist/docker/redhat/etc/sysconfig/scylla-server
+++ b/dist/docker/redhat/etc/sysconfig/scylla-server
@@ -10,8 +10,8 @@ BRIDGE=virbr0
 # ethernet device name
 IFNAME=eth0

-# setup NIC's interrupts, RPS, XPS (posix)
-SET_NIC=no
+# setup NIC's and disks' interrupts, RPS, XPS, nomerges and I/O scheduler (posix)
+SET_NIC_AND_DISKS=no

 # ethernet device driver (dpdk)
 ETHDRV=
--- a/dist/offline_installer/redhat/build_offline_installer.sh
+++ b/dist/offline_installer/redhat/build_offline_installer.sh
@@ -91,7 +91,27 @@ mkdir -p build/offline_installer
 cp dist/offline_installer/redhat/header build/offline_installer
 sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve scylla
 # XXX: resolve option doesn't fetch some dependencies, need to manually fetch them
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve sudo.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve ntp.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libedit.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve ntpdate.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve net-tools.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve kernel
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve grubby.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve linux-firmware
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve initscripts.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve iproute.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve iptables.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libnfnetlink.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libnetfilter_conntrack.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libmnl.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve sysvinit-tools.x86_64
 sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve yajl.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve mdadm.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libreport-filesystem.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve xfsprogs.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve PyYAML.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libyaml.x86_64
 sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libjpeg-turbo.x86_64
 sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libaio.x86_64
 sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve snappy.x86_64
--- a/dist/redhat/scylla.spec.mustache
+++ b/dist/redhat/scylla.spec.mustache
@@ -97,7 +97,7 @@ cflags="--cflags=${defines[*]}"
 %endif
 %if 0%{?rhel}
 . /etc/profile.d/scylla.sh
-python3.4 ./configure.py %{?configure_opt} --with=scylla --with=iotune --mode=release "$cflags" --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7.3 --python python3.4 --ldflag=-Wl,-rpath=/opt/scylladb/lib64
+python3.4 ./configure.py %{?configure_opt} --with=scylla --with=iotune --mode=release "$cflags" --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7.3 --c-compiler=/opt/scylladb/bin/gcc-7.3 --python python3.4 --ldflag=-Wl,-rpath=/opt/scylladb/lib64
 %endif
 ninja-build %{?_smp_mflags} build/release/scylla build/release/iotune

@@ -193,7 +193,6 @@ rm -rf $RPM_BUILD_ROOT
 %{_prefix}/lib/scylla/scylla_cpuscaling_setup
 %{_prefix}/lib/scylla/scylla_fstrim
 %{_prefix}/lib/scylla/scylla_fstrim_setup
-%{_prefix}/lib/scylla/posix_net_conf.sh
 %{_prefix}/lib/scylla/perftune.py
 %{_prefix}/lib/scylla/dpdk-devbind.py
 %{_prefix}/lib/scylla/hex2list.py
@@ -283,6 +282,7 @@ if Scylla is the main application on your server and you wish to optimize its la
 # We cannot use the sysctl_apply rpm macro because it is not present in 7.0
 # following is a "manual" expansion
 /usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
+/usr/lib/systemd/systemd-sysctl 99-scylla-aio.conf >/dev/null 2>&1 || :

 %files kernel-conf
 %defattr(-,root,root)
--- a/frozen_mutation.hh
+++ b/frozen_mutation.hh
@@ -78,6 +78,11 @@ public:

 frozen_mutation freeze(const mutation& m);

+struct frozen_mutation_and_schema {
+    frozen_mutation fm;
+    schema_ptr s;
+};
+
 // Can receive streamed_mutation in reversed order.
 class streamed_mutation_freezer {
    const schema& _schema;
--- a/gms/endpoint_state.hh
+++ b/gms/endpoint_state.hh
@@ -129,26 +129,8 @@ public:
        update_is_normal();
    }

-    void apply_application_state(application_state key, versioned_value&& value) {
-        auto&& e = _application_state[key];
-        if (e.version < value.version) {
-            e = std::move(value);
-        }
-        update_is_normal();
-    }
-
-    void apply_application_state(application_state key, const versioned_value& value) {
-        auto&& e = _application_state[key];
-        if (e.version < value.version) {
-            e = value;
-        }
-        update_is_normal();
-    }
-
-    void apply_application_state(const endpoint_state& es) {
-        for (auto&& e : es._application_state) {
-            apply_application_state(e.first, e.second);
-        }
+    void add_application_state(const endpoint_state& es) {
+        _application_state = es._application_state;
        update_is_normal();
    }

--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -930,7 +930,7 @@ void gossiper::make_random_gossip_digest(utils::chunked_vector<gossip_digest>& g
 future<> gossiper::replicate(inet_address ep, const endpoint_state& es) {
    return container().invoke_on_all([ep, es, orig = engine().cpu_id(), self = shared_from_this()] (gossiper& g) {
        if (engine().cpu_id() != orig) {
-            g.endpoint_state_map[ep].apply_application_state(es);
+            g.endpoint_state_map[ep].add_application_state(es);
        }
    });
 }
@@ -939,7 +939,7 @@ future<> gossiper::replicate(inet_address ep, const std::map<application_state,
    return container().invoke_on_all([ep, &src, &changed, orig = engine().cpu_id(), self = shared_from_this()] (gossiper& g) {
        if (engine().cpu_id() != orig) {
            for (auto&& key : changed) {
-                g.endpoint_state_map[ep].apply_application_state(key, src.at(key));
+                g.endpoint_state_map[ep].add_application_state(key, src.at(key));
            }
        }
    });
@@ -948,7 +948,7 @@ future<> gossiper::replicate(inet_address ep, const std::map<application_state,
 future<> gossiper::replicate(inet_address ep, application_state key, const versioned_value& value) {
    return container().invoke_on_all([ep, key, &value, orig = engine().cpu_id(), self = shared_from_this()] (gossiper& g) {
        if (engine().cpu_id() != orig) {
-            g.endpoint_state_map[ep].apply_application_state(key, value);
+            g.endpoint_state_map[ep].add_application_state(key, value);
        }
    });
 }
@@ -1175,11 +1175,13 @@ stdx::optional<endpoint_state> gossiper::get_endpoint_state_for_endpoint(inet_ad
    }
 }

-void gossiper::reset_endpoint_state_map() {
-    endpoint_state_map.clear();
+future<> gossiper::reset_endpoint_state_map() {
    _unreachable_endpoints.clear();
    _live_endpoints.clear();
    _live_endpoints_just_added.clear();
+    return container().invoke_on_all([] (gossiper& g) {
+        g.endpoint_state_map.clear();
+    });
 }

 std::unordered_map<inet_address, endpoint_state>& gms::gossiper::get_endpoint_states() {
@@ -1298,6 +1300,14 @@ void gossiper::mark_alive(inet_address addr, endpoint_state& local_state) {
 // Runs inside seastar::async context
 void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
    logger.trace("marking as alive {}", addr);
+
+    // Do not mark a node with status shutdown as UP.
+    auto status = get_gossip_status(local_state);
+    if (status == sstring(versioned_value::SHUTDOWN)) {
+        logger.warn("Skip marking node {} with status = {} as UP", addr, status);
+        return;
+    }
+
    local_state.mark_alive();
    local_state.update_timestamp(); // prevents do_status_check from racing us and evicting if it was down > A_VERY_LONG_TIME

@@ -1319,7 +1329,7 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
    }

    if (!_in_shadow_round) {
-        logger.info("InetAddress {} is now UP, status = {}", addr, get_gossip_status(local_state));
+        logger.info("InetAddress {} is now UP, status = {}", addr, status);
    }

    _subscribers.for_each([addr, local_state] (auto& subscriber) {
@@ -1662,6 +1672,7 @@ void gossiper::maybe_initialize_local_state(int generation_nbr) {
    }
 }

+// Runs inside seastar::async context
 void gossiper::add_saved_endpoint(inet_address ep) {
    if (ep == get_broadcast_address()) {
        logger.debug("Attempt to add self as saved endpoint");
@@ -1687,6 +1698,7 @@ void gossiper::add_saved_endpoint(inet_address ep) {
    }
    ep_state.mark_dead();
    endpoint_state_map[ep] = ep_state;
+    replicate(ep, ep_state).get();
    _unreachable_endpoints[ep] = now();
    logger.trace("Adding saved endpoint {} {}", ep, ep_state.get_heart_beat_state().get_generation());
 }
@@ -1924,6 +1936,7 @@ void gossiper::mark_as_shutdown(const inet_address& endpoint) {
        auto& ep_state = *es;
        ep_state.add_application_state(application_state::STATUS, storage_service_value_factory().shutdown(true));
        ep_state.get_heart_beat_state().force_highest_possible_version_unsafe();
+        replicate(endpoint, ep_state).get();
        mark_dead(endpoint, ep_state);
        get_local_failure_detector().force_conviction(endpoint);
    }
--- a/gms/gossiper.hh
+++ b/gms/gossiper.hh
@@ -417,7 +417,7 @@ public:
    stdx::optional<endpoint_state> get_endpoint_state_for_endpoint(inet_address ep) const;

    // removes ALL endpoint states; should only be called after shadow gossip
-    void reset_endpoint_state_map();
+    future<> reset_endpoint_state_map();

    std::unordered_map<inet_address, endpoint_state>& get_endpoint_states();

--- a/idl/streaming.idl.hh
+++ b/idl/streaming.idl.hh
@@ -42,4 +42,13 @@ class prepare_message {
    uint32_t dst_cpu_id;
 };

+enum class stream_reason : uint8_t {
+    unspecified,
+    bootstrap,
+    decommission,
+    removenode,
+    rebuild,
+    repair,
+};
+
 }
--- a/install.sh
+++ b/install.sh
@@ -93,7 +93,6 @@ install -m644 build/*.service -Dt "$rprefix"/lib/systemd/system
 install -m644 dist/common/systemd/*.service -Dt "$rprefix"/lib/systemd/system
 install -m644 dist/common/systemd/*.timer -Dt "$rprefix"/lib/systemd/system
 install -m755 dist/common/scripts/* -Dt "$rprefix"/lib/scylla/
-install -m755 seastar/scripts/posix_net_conf.sh "$rprefix"/lib/scylla/
 install -m755 seastar/scripts/perftune.py -Dt "$rprefix"/lib/scylla/
 install -m755 seastar/dpdk/usertools/dpdk-devbind.py -Dt "$rprefix"/lib/scylla/
 install -m755 build/release/scylla -Dt "$rprefix/bin"
--- a/1
+++ b/1
--- a/licenses/libdeflate-license.txt
+++ b/licenses/libdeflate-license.txt
@@ -0,0 +1,21 @@
+Copyright 2016 Eric Biggers
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation files
+(the "Software"), to deal in the Software without restriction,
+including without limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of the Software,
+and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/locator/abstract_replication_strategy.cc
+++ b/locator/abstract_replication_strategy.cc
@@ -119,9 +119,17 @@ insert_token_range_to_sorted_container_while_unwrapping(
        const dht::token& tok,
        dht::token_range_vector& ret) {
    if (prev_tok < tok) {
-        ret.emplace_back(
-                dht::token_range::bound(prev_tok, false),
-                dht::token_range::bound(tok, true));
+        auto pos = ret.end();
+        if (!ret.empty() && !std::prev(pos)->end()) {
+            // We inserted a wrapped range (a, b] previously as
+            // (-inf, b], (a, +inf). So now we insert in the next-to-last
+            // position to keep the last range (a, +inf) at the end.
+            pos = std::prev(pos);
+        }
+        ret.insert(pos,
+                dht::token_range{
+                        dht::token_range::bound(prev_tok, false),
+                        dht::token_range::bound(tok, true)});
    } else {
        ret.emplace_back(
                dht::token_range::bound(prev_tok, false),
--- a/main.cc
+++ b/main.cc
@@ -62,6 +62,7 @@
 #include "service/cache_hitrate_calculator.hh"
 #include "sstables/compaction_manager.hh"
 #include "sstables/sstables.hh"
+#include <db/view/view_update_from_staging_generator.hh>

 seastar::metrics::metric_groups app_metrics;

@@ -647,6 +648,21 @@ int main(int ac, char** av) {

            supervisor::notify("loading sstables");
            distributed_loader::init_non_system_keyspaces(db, proxy).get();
+
+            static sharded<db::view::view_update_from_staging_generator> view_update_from_staging_generator;
+            view_update_from_staging_generator.start(std::ref(db), std::ref(proxy)).get();
+            supervisor::notify("discovering staging sstables");
+            db.invoke_on_all([] (database& db) {
+                for (auto& x : db.get_column_families()) {
+                    table& t = *(x.second);
+                    for (sstables::shared_sstable sst : *t.get_sstables()) {
+                        if (sst->is_staging()) {
+                            view_update_from_staging_generator.local().register_staging_sstable(std::move(sst), t.shared_from_this());
+                        }
+                    }
+                }
+            }).get();
+
            // register connection drop notification to update cf's cache hit rate data
            db.invoke_on_all([] (database& db) {
                db.register_connection_drop_notifier(netw::get_local_messaging_service());
@@ -700,9 +716,21 @@ int main(int ac, char** av) {
            proxy.invoke_on_all([] (service::storage_proxy& p) {
                p.init_messaging_service();
            }).get();
+
            supervisor::notify("starting streaming service");
-            streaming::stream_session::init_streaming_service(db).get();
+            streaming::stream_session::init_streaming_service(db, sys_dist_ks, view_update_from_staging_generator).get();
            api::set_server_stream_manager(ctx).get();
+
+            supervisor::notify("starting hinted handoff manager");
+            if (hinted_handoff_enabled) {
+                db::hints::manager::rebalance(cfg->hints_directory()).get();
+            }
+            db::hints::manager::rebalance(cfg->data_file_directories()[0] + "/view_pending_updates").get();
+
+            proxy.invoke_on_all([] (service::storage_proxy& local_proxy) {
+                local_proxy.start_hints_manager(gms::get_local_gossiper().shared_from_this(), service::get_local_storage_service().shared_from_this());
+            }).get();
+
            supervisor::notify("starting messaging service");
            // Start handling REPAIR_CHECKSUM_RANGE messages
            netw::get_messaging_service().invoke_on_all([&db] (auto& ms) {
@@ -739,16 +767,16 @@ int main(int ac, char** av) {
            gms::get_local_gossiper().wait_for_gossip_to_settle().get();
            api::set_server_gossip_settle(ctx).get();

-            supervisor::notify("starting hinted handoff manager");
-            if (hinted_handoff_enabled) {
-                db::hints::manager::rebalance(cfg->hints_directory()).get();
-            }
-            db::hints::manager::rebalance(cfg->data_file_directories()[0] + "/view_pending_updates").get();
-
+            supervisor::notify("allow replaying hints");
            proxy.invoke_on_all([] (service::storage_proxy& local_proxy) {
-                local_proxy.start_hints_manager(gms::get_local_gossiper().shared_from_this(), service::get_local_storage_service().shared_from_this());
+                local_proxy.allow_replaying_hints();
            }).get();

+            if (cfg->view_building()) {
+                supervisor::notify("Launching generate_mv_updates for non system tables");
+                view_update_from_staging_generator.invoke_on_all(&db::view::view_update_from_staging_generator::start).get();
+            }
+
            static sharded<db::view::view_builder> view_builder;
            if (cfg->view_building()) {
                supervisor::notify("starting the view builder");
@@ -786,6 +814,11 @@ int main(int ac, char** av) {
            engine().at_exit([] {
                return repair_shutdown(service::get_local_storage_service().db());
            });
+
+            engine().at_exit([] {
+                return view_update_from_staging_generator.stop();
+            });
+
            engine().at_exit([] {
                return service::get_local_storage_service().drain_on_shutdown();
            });
--- a/memtable.hh
+++ b/memtable.hh
@@ -214,7 +214,9 @@ private:

        void update(const schema& s, const deletable_row& dr) {
            update(dr.marker());
-            update(dr.deleted_at().tomb());
+            row_tombstone row_tomb = dr.deleted_at();
+            update(row_tomb.regular());
+            update(row_tomb.tomb());
            update(s, dr.cells(), column_kind::regular_column);
        }

--- a/message/messaging_service.cc
+++ b/message/messaging_service.cc
@@ -135,12 +135,14 @@ struct messaging_service::rpc_protocol_wrapper : public rpc_protocol { using rpc
 // This should be integrated into messaging_service proper.
 class messaging_service::rpc_protocol_client_wrapper {
    std::unique_ptr<rpc_protocol::client> _p;
+    ::shared_ptr<seastar::tls::server_credentials> _credentials;
 public:
    rpc_protocol_client_wrapper(rpc_protocol& proto, rpc::client_options opts, ipv4_addr addr, ipv4_addr local = ipv4_addr())
            : _p(std::make_unique<rpc_protocol::client>(proto, std::move(opts), addr, local)) {
    }
    rpc_protocol_client_wrapper(rpc_protocol& proto, rpc::client_options opts, ipv4_addr addr, ipv4_addr local, ::shared_ptr<seastar::tls::server_credentials> c)
            : _p(std::make_unique<rpc_protocol::client>(proto, std::move(opts), seastar::tls::socket(c), addr, local))
+            , _credentials(c)
    {}
    auto get_stats() const { return _p->get_stats(); }
    future<> stop() { return _p->stop(); }
@@ -148,6 +150,19 @@ public:
        return _p->error();
    }
    operator rpc_protocol::client&() { return *_p; }
+
+    /**
+     * #3787 Must ensure we use the right type of socket, i.e. tls or not.
+     * The constructor above retains the credentials object, so here we
+     * can tell whether this client is tls or not.
+     */
+    template<typename Serializer, typename... Out>
+    future<rpc::sink<Out...>> make_stream_sink() {
+        if (_credentials) {
+            return _p->make_stream_sink<Serializer, Out...>(seastar::tls::socket(_credentials));
+        }
+        return _p->make_stream_sink<Serializer, Out...>();
+    }
 };

 struct messaging_service::rpc_protocol_server_wrapper : public rpc_protocol::server { using rpc_protocol::server::server; };
@@ -638,17 +653,18 @@ rpc::sink<int32_t> messaging_service::make_sink_for_stream_mutation_fragments(rp
 }

 future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>>
-messaging_service::make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, msg_addr id) {
-    rpc_protocol::client& rpc_client = *get_rpc_client(messaging_verb::STREAM_MUTATION_FRAGMENTS, id);
-    return rpc_client.make_stream_sink<netw::serializer, frozen_mutation_fragment>().then([this, plan_id, schema_id, cf_id, estimated_partitions, &rpc_client] (rpc::sink<frozen_mutation_fragment> sink) mutable {
-        auto rpc_handler = rpc()->make_client<rpc::source<int32_t> (utils::UUID, utils::UUID, utils::UUID, uint64_t, rpc::sink<frozen_mutation_fragment>)>(messaging_verb::STREAM_MUTATION_FRAGMENTS);
-        return rpc_handler(rpc_client , plan_id, schema_id, cf_id, estimated_partitions, sink).then([sink] (rpc::source<int32_t> source) mutable {
+messaging_service::make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id) {
+    auto wrapper = get_rpc_client(messaging_verb::STREAM_MUTATION_FRAGMENTS, id);
+    rpc_protocol::client& rpc_client = *wrapper;
+    return wrapper->make_stream_sink<netw::serializer, frozen_mutation_fragment>().then([this, plan_id, schema_id, cf_id, estimated_partitions, reason, &rpc_client] (rpc::sink<frozen_mutation_fragment> sink) mutable {
+        auto rpc_handler = rpc()->make_client<rpc::source<int32_t> (utils::UUID, utils::UUID, utils::UUID, uint64_t, streaming::stream_reason, rpc::sink<frozen_mutation_fragment>)>(messaging_verb::STREAM_MUTATION_FRAGMENTS);
+        return rpc_handler(rpc_client , plan_id, schema_id, cf_id, estimated_partitions, reason, sink).then([sink] (rpc::source<int32_t> source) mutable {
            return make_ready_future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>>(std::move(sink), std::move(source));
        });
    });
 }

-void messaging_service::register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::source<frozen_mutation_fragment> source)>&& func) {
+void messaging_service::register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason>, rpc::source<frozen_mutation_fragment> source)>&& func) {
    register_handler(this, messaging_verb::STREAM_MUTATION_FRAGMENTS, std::move(func));
 }

@@ -726,13 +742,13 @@ auto send_message_oneway_timeout(messaging_service* ms, Timeout timeout, messagi

 // PREPARE_MESSAGE
 void messaging_service::register_prepare_message(std::function<future<streaming::prepare_message> (const rpc::client_info& cinfo,
-        streaming::prepare_message msg, UUID plan_id, sstring description)>&& func) {
+        streaming::prepare_message msg, UUID plan_id, sstring description, rpc::optional<streaming::stream_reason> reason)>&& func) {
    register_handler(this, messaging_verb::PREPARE_MESSAGE, std::move(func));
 }
 future<streaming::prepare_message> messaging_service::send_prepare_message(msg_addr id, streaming::prepare_message msg, UUID plan_id,
-        sstring description) {
+        sstring description, streaming::stream_reason reason) {
    return send_message<streaming::prepare_message>(this, messaging_verb::PREPARE_MESSAGE, id,
-        std::move(msg), plan_id, std::move(description));
+        std::move(msg), plan_id, std::move(description), reason);
 }

 // PREPARE_DONE_MESSAGE
@@ -745,12 +761,12 @@ future<> messaging_service::send_prepare_done_message(msg_addr id, UUID plan_id,
 }

 // STREAM_MUTATION
-void messaging_service::register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool> fragmented)>&& func) {
+void messaging_service::register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool> fragmented, rpc::optional<streaming::stream_reason> reason)>&& func) {
    register_handler(this, messaging_verb::STREAM_MUTATION, std::move(func));
 }
-future<> messaging_service::send_stream_mutation(msg_addr id, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, bool fragmented) {
+future<> messaging_service::send_stream_mutation(msg_addr id, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, bool fragmented, streaming::stream_reason reason) {
    return send_message<void>(this, messaging_verb::STREAM_MUTATION, id,
-        plan_id, std::move(fm), dst_cpu_id, fragmented);
+        plan_id, std::move(fm), dst_cpu_id, fragmented, reason);
 }

 // STREAM_MUTATION_DONE
--- a/message/messaging_service.hh
+++ b/message/messaging_service.hh
@@ -35,6 +35,7 @@
 #include "repair/repair.hh"
 #include "tracing/tracing.hh"
 #include "digest_algorithm.hh"
+#include "streaming/stream_reason.hh"

 #include <seastar/net/tls.hh>

@@ -237,23 +238,23 @@ public:

    // Wrapper for PREPARE_MESSAGE verb
    void register_prepare_message(std::function<future<streaming::prepare_message> (const rpc::client_info& cinfo,
-            streaming::prepare_message msg, UUID plan_id, sstring description)>&& func);
+            streaming::prepare_message msg, UUID plan_id, sstring description, rpc::optional<streaming::stream_reason> reason)>&& func);
    future<streaming::prepare_message> send_prepare_message(msg_addr id, streaming::prepare_message msg, UUID plan_id,
-            sstring description);
+            sstring description, streaming::stream_reason);

    // Wrapper for PREPARE_DONE_MESSAGE verb
    void register_prepare_done_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func);
    future<> send_prepare_done_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id);

    // Wrapper for STREAM_MUTATION verb
-    void register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool>)>&& func);
-    future<> send_stream_mutation(msg_addr id, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, bool fragmented);
+    void register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool>, rpc::optional<streaming::stream_reason>)>&& func);
+    future<> send_stream_mutation(msg_addr id, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, bool fragmented, streaming::stream_reason reason);

    // Wrapper for STREAM_MUTATION_FRAGMENTS
    // The receiver of STREAM_MUTATION_FRAGMENTS sends status code to the sender to notify any error on the receiver side. The status code is of type int32_t. 0 means successful, -1 means error, other status code value are reserved for future use.
-    void register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::source<frozen_mutation_fragment> source)>&& func);
+    void register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason> reason_opt, rpc::source<frozen_mutation_fragment> source)>&& func);
    rpc::sink<int32_t> make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment>& source);
-    future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>> make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, msg_addr id);
+    future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>> make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id);

    void register_stream_mutation_done(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id)>&& func);
    future<> send_stream_mutation_done(msg_addr id, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id);
--- a/multishard_mutation_query.cc
+++ b/multishard_mutation_query.cc
@@ -60,7 +60,7 @@ using foreign_unique_ptr = foreign_ptr<std::unique_ptr<T>>;
 /// 3) Both, `read_context::lookup_readers()` and `read_context::save_readers()`
 ///    knows to do nothing when the query is not stateful and just short
 ///    circuit.
-class read_context {
+class read_context : public reader_lifecycle_policy {
    struct reader_params {
        std::unique_ptr<const dht::partition_range> range;
        std::unique_ptr<const query::partition_slice> slice;
@@ -80,6 +80,20 @@ class read_context {
        foreign_unique_ptr<utils::phased_barrier::operation> read_operation;
        foreign_unique_ptr<flat_mutation_reader> reader;
    };
+    struct paused_reader {
+        shard_id shard;
+        reader_concurrency_semaphore::inactive_read_handle handle;
+        bool has_pending_next_partition;
+    };
+    struct inactive_read : public reader_concurrency_semaphore::inactive_read {
+        foreign_unique_ptr<flat_mutation_reader> reader;
+        explicit inactive_read(foreign_unique_ptr<flat_mutation_reader> reader)
+            : reader(std::move(reader)) {
+        }
+        virtual void evict() override {
+            reader.reset();
+        }
+    };

    using inexistent_state = std::monostate;
    struct successful_lookup_state {
@@ -94,61 +108,64 @@ class read_context {
    struct dismantling_state {
        foreign_unique_ptr<reader_params> params;
        foreign_unique_ptr<utils::phased_barrier::operation> read_operation;
-        future<stopped_foreign_reader> reader_fut;
+        std::variant<foreign_unique_ptr<flat_mutation_reader>, paused_reader> reader;
        circular_buffer<mutation_fragment> buffer;
    };
    struct ready_to_save_state {
        foreign_unique_ptr<reader_params> params;
        foreign_unique_ptr<utils::phased_barrier::operation> read_operation;
-        foreign_unique_ptr<flat_mutation_reader> reader;
+        std::variant<foreign_unique_ptr<flat_mutation_reader>, paused_reader> reader;
        circular_buffer<mutation_fragment> buffer;
    };
-    struct future_used_state {
-        future<used_state> fut;
+    struct paused_state {
+        foreign_unique_ptr<reader_params> params;
+        foreign_unique_ptr<utils::phased_barrier::operation> read_operation;
+        reader_concurrency_semaphore::inactive_read_handle handle;
    };
-    struct future_dismantling_state {
-        future<dismantling_state> fut;
+    struct evicted_state {
    };

-    //                           ( )
+    //              ( )    (O)
+    //               |      ^
+    //               |      |
+    //         +--- inexistent ---+
+    //         |                  |
+    //     (1) |              (3) |    (3)
+    //         |                  |  +------ evicted -> (O)
+    //  successful_lookup         |  |          ^
+    //     |         |            |  |  (7)     |
+    //     |         |            |  +-------+  | (8)
+    //     |         |    (4)     |  |       |  |
+    //     |         +----------> used      paused
+    //     |                      |  |  (6)  ^  |
+    // (2) |                      |  +-------+  |
+    //     |                  (5) |             | (5)
+    //     |                      |             |
+    //     |                      |             |
+    //     |                 dismantling <------+
+    //     |                      |
+    //     |                  (2) |
+    //     |                      |
+    //     +---------------> ready_to_save
    //                            |
-    //            +------ inexistent_state -----+
-    //            |                             |
-    //        (1) |                         (6) |
-    //            |                             |
-    //  successful_lookup_state         future_used_state
-    //     |              |               |           |
-    // (2) |          (3) |           (7) |       (8) |
-    //     |              |               |           |
-    //     |         used_state <---------+  future_dismantling_state
-    //     |              |                           |
-    //     |          (4) |                       (9) |
-    //     |              |                           |
-    //     |      dismantling_state <-----------------+
-    //     |              |
-    //     |          (5) |
-    //     |              |
-    //     +----> ready_to_save_state
-    //                    |
-    //                   (O)
+    //                           (O)
    //
    //  1) lookup_readers()
    //  2) save_readers()
-    //  3) make_remote_reader()
-    //  4) dismantle_reader()
-    //  5) prepare_reader_for_saving()
-    //  6) do_make_remote_reader()
-    //  7) reader is created
-    //  8) dismantle_reader()
-    //  9) reader is created
+    //  3) do_make_remote_reader()
+    //  4) make_remote_reader()
+    //  5) dismantle_reader()
+    //  6) pause_reader()
+    //  7) try_resume() - success
+    //  8) try_resume() - failure
    using reader_state = std::variant<
        inexistent_state,
        successful_lookup_state,
        used_state,
+        paused_state,
+        evicted_state,
        dismantling_state,
-        ready_to_save_state,
-        future_used_state,
-        future_dismantling_state>;
+        ready_to_save_state>;

    struct dismantle_buffer_stats {
        size_t partitions = 0;
@@ -184,6 +201,8 @@ class read_context {
    // One for each shard. Index is shard id.
    std::vector<reader_state> _readers;

+    gate _dismantling_gate;
+
    static future<bundled_remote_reader> do_make_remote_reader(
            distributed<database>& db,
            shard_id shard,
@@ -200,13 +219,10 @@ class read_context {
            const query::partition_slice& ps,
            const io_priority_class& pc,
            tracing::trace_state_ptr trace_state,
-            streamed_mutation::forwarding fwd_sm,
            mutation_reader::forwarding fwd_mr);

-    void dismantle_reader(shard_id shard, future<stopped_foreign_reader>&& stopped_reader_fut);
+    void dismantle_reader(shard_id shard, future<paused_or_stopped_reader>&& reader_fut);

-    ready_to_save_state* prepare_reader_for_saving(dismantling_state& current_state, future<stopped_foreign_reader>&& stopped_reader_fut,
-            const dht::decorated_key& last_pkey, const std::optional<clustering_key_prefix>& last_ckey);
    dismantle_buffer_stats dismantle_combined_buffer(circular_buffer<mutation_fragment> combined_buffer, const dht::decorated_key& pkey);
    dismantle_buffer_stats dismantle_compaction_state(detached_compaction_state compaction_state);
    future<> save_reader(ready_to_save_state& current_state, const dht::decorated_key& last_pkey,
@@ -229,26 +245,24 @@ public:
    read_context& operator=(read_context&&) = delete;
    read_context& operator=(const read_context&) = delete;

-    remote_reader_factory factory() {
-        return [this] (
-                shard_id shard,
-                schema_ptr schema,
-                const dht::partition_range& pr,
-                const query::partition_slice& ps,
-                const io_priority_class& pc,
-                tracing::trace_state_ptr trace_state,
-                streamed_mutation::forwarding fwd_sm,
-                mutation_reader::forwarding fwd_mr) {
-            return make_remote_reader(shard, std::move(schema), pr, ps, pc, std::move(trace_state), fwd_sm, fwd_mr);
-        };
+    virtual future<foreign_unique_ptr<flat_mutation_reader>> create_reader(
+            shard_id shard,
+            schema_ptr schema,
+            const dht::partition_range& pr,
+            const query::partition_slice& ps,
+            const io_priority_class& pc,
+            tracing::trace_state_ptr trace_state,
+            mutation_reader::forwarding fwd_mr) override {
+        return make_remote_reader(shard, std::move(schema), pr, ps, pc, std::move(trace_state), fwd_mr);
    }

-    foreign_reader_dismantler dismantler() {
-        return [this] (shard_id shard, future<stopped_foreign_reader>&& stopped_reader_fut) {
-            dismantle_reader(shard, std::move(stopped_reader_fut));
-        };
+    virtual void destroy_reader(shard_id shard, future<paused_or_stopped_reader> reader_fut) noexcept override {
+        dismantle_reader(shard, std::move(reader_fut));
    }

+    virtual future<> pause(foreign_unique_ptr<flat_mutation_reader> reader) override;
+    virtual future<foreign_unique_ptr<flat_mutation_reader>> try_resume(shard_id shard) override;
+
    future<> lookup_readers();

    future<> save_readers(circular_buffer<mutation_fragment> unconsumed_buffer, detached_compaction_state compaction_state,
@@ -289,7 +303,6 @@ future<foreign_unique_ptr<flat_mutation_reader>> read_context::make_remote_reade
        const query::partition_slice& ps,
        const io_priority_class& pc,
        tracing::trace_state_ptr trace_state,
-        streamed_mutation::forwarding,
        mutation_reader::forwarding) {
    auto& rs = _readers[shard];

@@ -306,100 +319,71 @@ future<foreign_unique_ptr<flat_mutation_reader>> read_context::make_remote_reade
        return make_ready_future<foreign_unique_ptr<flat_mutation_reader>>(std::move(reader));
    }

-    auto created = promise<used_state>();
-    rs = future_used_state{created.get_future()};
-    return do_make_remote_reader(_db, shard, std::move(schema), pr, ps, pc, std::move(trace_state)).then_wrapped([this, &rs,
-            created = std::move(created)] (future<bundled_remote_reader>&& bundled_reader_fut) mutable {
-        if (bundled_reader_fut.failed()) {
-            auto ex = bundled_reader_fut.get_exception();
-            if (!std::holds_alternative<future_used_state>(rs)) {
-                created.set_exception(ex);
-            }
-            return make_exception_future<foreign_unique_ptr<flat_mutation_reader>>(std::move(ex));
-        }
-
-        auto bundled_reader = bundled_reader_fut.get0();
-        auto new_state = used_state{std::move(bundled_reader.params), std::move(bundled_reader.read_operation)};
-        if (std::holds_alternative<future_used_state>(rs)) {
-            rs = std::move(new_state);
-        } else {
-            created.set_value(std::move(new_state));
-        }
+    return do_make_remote_reader(_db, shard, std::move(schema), pr, ps, pc, std::move(trace_state)).then(
+            [this, &rs] (bundled_remote_reader&& bundled_reader) mutable {
+        rs = used_state{std::move(bundled_reader.params), std::move(bundled_reader.read_operation)};
        return make_ready_future<foreign_unique_ptr<flat_mutation_reader>>(std::move(bundled_reader.reader));
    });
 }

-void read_context::dismantle_reader(shard_id shard, future<stopped_foreign_reader>&& stopped_reader_fut) {
-    auto& rs = _readers[shard];
+void read_context::dismantle_reader(shard_id shard, future<paused_or_stopped_reader>&& reader_fut) {
+    with_gate(_dismantling_gate, [this, shard, reader_fut = std::move(reader_fut)] () mutable {
+        return reader_fut.then_wrapped([this, shard] (future<paused_or_stopped_reader>&& reader_fut) {
+            if (reader_fut.failed()) {
+                mmq_log.debug("Failed to stop reader on shard {}: {}", shard, reader_fut.get_exception());
+                ++_db.local().get_stats().multishard_query_failed_reader_stops;
+                return;
+            }

-    if (auto* maybe_used_state = std::get_if<used_state>(&rs)) {
-        auto read_operation = std::move(maybe_used_state->read_operation);
-        auto params = std::move(maybe_used_state->params);
-        rs = dismantling_state{std::move(params), std::move(read_operation), std::move(stopped_reader_fut), circular_buffer<mutation_fragment>{}};
-    } else if (auto* maybe_future_used_state = std::get_if<future_used_state>(&rs)) {
-        auto f = maybe_future_used_state->fut.then([stopped_reader_fut = std::move(stopped_reader_fut)] (used_state&& current_state) mutable {
-            auto read_operation = std::move(current_state.read_operation);
-            auto params = std::move(current_state.params);
-            return dismantling_state{std::move(params), std::move(read_operation), std::move(stopped_reader_fut),
-                circular_buffer<mutation_fragment>{}};
+            auto reader = reader_fut.get0();
+            auto& rs = _readers[shard];
+            if (auto* maybe_used_state = std::get_if<used_state>(&rs)) {
+                auto read_operation = std::move(maybe_used_state->read_operation);
+                auto params = std::move(maybe_used_state->params);
+                rs = dismantling_state{std::move(params), std::move(read_operation), std::move(reader.remote_reader),
+                        std::move(reader.unconsumed_fragments)};
+            } else if (auto* maybe_paused_state = std::get_if<paused_state>(&rs)) {
+                auto read_operation = std::move(maybe_paused_state->read_operation);
+                auto params = std::move(maybe_paused_state->params);
+                auto handle = maybe_paused_state->handle;
+                rs = dismantling_state{std::move(params), std::move(read_operation), paused_reader{shard, handle, reader.has_pending_next_partition},
+                        std::move(reader.unconsumed_fragments)};
+            // Do nothing for evicted readers.
+            } else if (!std::holds_alternative<evicted_state>(rs)) {
+                mmq_log.warn(
+                        "Unexpected request to dismantle reader in state {} for shard {}."
+                        " Reader was not created nor is in the process of being created.",
+                        rs.index(),
+                        shard);
+            }
        });
-        rs = future_dismantling_state{std::move(f)};
-    } else {
-        mmq_log.warn("Unexpected request to dismantle reader for shard {}. Reader was not created nor is in the process of being created.", shard);
-    }
+    });
 }

 future<> read_context::stop() {
-    auto cleanup = [db = &_db.local()] (shard_id shard, dismantling_state state) {
-        return state.reader_fut.then_wrapped([db, shard, params = std::move(state.params),
-                read_operation = std::move(state.read_operation)] (future<stopped_foreign_reader>&& fut) mutable {
-            if (fut.failed()) {
-                mmq_log.debug("Failed to stop reader on shard {}: {}", shard, fut.get_exception());
-                ++db->get_stats().multishard_query_failed_reader_stops;
-            } else {
-                smp::submit_to(shard, [reader = fut.get0().remote_reader, params = std::move(params),
-                        read_operation = std::move(read_operation)] () mutable {
-                    reader.release();
+    auto pr = promise<>();
+    auto fut = pr.get_future();
+    auto gate_fut = _dismantling_gate.is_closed() ? make_ready_future<>() : _dismantling_gate.close();
+    gate_fut.then([this] {
+        for (shard_id shard = 0; shard != smp::count; ++shard) {
+            if (auto* maybe_dismantling_state = std::get_if<dismantling_state>(&_readers[shard])) {
+                _db.invoke_on(shard, [reader = std::move(maybe_dismantling_state->reader),
+                        params = std::move(maybe_dismantling_state->params),
+                        read_operation = std::move(maybe_dismantling_state->read_operation)] (database& db) mutable {
+                    if (auto* maybe_stopped_reader = std::get_if<foreign_unique_ptr<flat_mutation_reader>>(&reader)) {
+                        maybe_stopped_reader->release();
+                    } else {
+                        db.user_read_concurrency_sem().unregister_inactive_read(std::get<paused_reader>(reader).handle);
+                    }
                    params.release();
                    read_operation.release();
                });
            }
-        });
-    };
-
-    std::vector<future<>> futures;
-    auto immediate_cleanup = size_t(0);
-    auto future_cleanup = size_t(0);
-
-    // Wait for pending read-aheads in the background.
-    for (shard_id shard = 0; shard != smp::count; ++shard) {
-        auto& rs = _readers[shard];
-
-        if (auto maybe_dismantling_state = std::get_if<dismantling_state>(&rs)) {
-            ++immediate_cleanup;
-            cleanup(shard, std::move(*maybe_dismantling_state));
-        } else if (auto maybe_future_dismantling_state = std::get_if<future_dismantling_state>(&rs)) {
-            ++future_cleanup;
-            futures.emplace_back(maybe_future_dismantling_state->fut.then_wrapped([=] (future<dismantling_state>&& current_state_fut) {
-                if (current_state_fut.failed()) {
-                    mmq_log.debug("Failed to stop reader on shard {}: {}", shard, current_state_fut.get_exception());
-                    ++_db.local().get_stats().multishard_query_failed_reader_stops;
-                } else {
-                    cleanup(shard, current_state_fut.get0());
-                }
-            }));
        }
-    }
-
-    if (const auto total = immediate_cleanup + future_cleanup) {
-        tracing::trace(_trace_state,
-                "Stopping {} shard readers, {} ready for immediate cleanup, {} will be cleaned up after finishes read-ahead",
-                total,
-                immediate_cleanup,
-                future_cleanup);
-    }
-
-    return when_all(futures.begin(), futures.end()).discard_result();
+    }).finally([pr = std::move(pr)] () mutable {
+        pr.set_value();
+    });
+    return fut;
 }

 read_context::dismantle_buffer_stats read_context::dismantle_combined_buffer(circular_buffer<mutation_fragment> combined_buffer,
@@ -459,46 +443,35 @@ read_context::dismantle_buffer_stats read_context::dismantle_compaction_state(de
    return stats;
 }

-read_context::ready_to_save_state* read_context::prepare_reader_for_saving(
-        dismantling_state& current_state,
-        future<stopped_foreign_reader>&& stopped_reader_fut,
-        const dht::decorated_key& last_pkey,
-        const std::optional<clustering_key_prefix>& last_ckey) {
-    const auto shard = current_state.params.get_owner_shard();
-    auto& rs = _readers[shard];
-
-    if (stopped_reader_fut.failed()) {
-        mmq_log.debug("Failed to stop reader on shard {}: {}", shard, stopped_reader_fut.get_exception());
-        ++_db.local().get_stats().multishard_query_failed_reader_stops;
-        return nullptr;
-    }
-
-    auto stopped_reader = stopped_reader_fut.get0();
-
-    // If the buffer is empty just overwrite it.
-    // If it has some data in it append the fragments to the back.
-    // The unconsumed fragments appended here come from the
-    // foreign_reader which is at the lowest layer, hence its
-    // fragments need to be at the back of the buffer.
-    if (current_state.buffer.empty()) {
-        current_state.buffer = std::move(stopped_reader.unconsumed_fragments);
-    } else {
-        std::move(stopped_reader.unconsumed_fragments.begin(), stopped_reader.unconsumed_fragments.end(), std::back_inserter(current_state.buffer));
-    }
-    rs = ready_to_save_state{std::move(current_state.params), std::move(current_state.read_operation), std::move(stopped_reader.remote_reader),
-        std::move(current_state.buffer)};
-    return &std::get<ready_to_save_state>(rs);
-}
-
 future<> read_context::save_reader(ready_to_save_state& current_state, const dht::decorated_key& last_pkey,
        const std::optional<clustering_key_prefix>& last_ckey) {
-    const auto shard = current_state.reader.get_owner_shard();
+    auto* maybe_stopped_reader = std::get_if<foreign_unique_ptr<flat_mutation_reader>>(&current_state.reader);
+    const auto shard = maybe_stopped_reader
+        ? maybe_stopped_reader->get_owner_shard()
+        : std::get<paused_reader>(current_state.reader).shard;
+
    return _db.invoke_on(shard, [shard, query_uuid = _cmd.query_uuid, query_ranges = _ranges, &current_state, &last_pkey, &last_ckey,
            gts = tracing::global_trace_state_ptr(_trace_state)] (database& db) mutable {
        try {
            auto params = current_state.params.release();
            auto read_operation = current_state.read_operation.release();
-            auto reader = current_state.reader.release();
+
+            flat_mutation_reader_opt reader;
+            if (auto* maybe_paused_reader = std::get_if<paused_reader>(&current_state.reader)) {
+                if (auto inactive_read_ptr = db.user_read_concurrency_sem().unregister_inactive_read(maybe_paused_reader->handle)) {
+                    reader = std::move(*static_cast<inactive_read&>(*inactive_read_ptr).reader);
+                    if (maybe_paused_reader->has_pending_next_partition) {
+                        reader->next_partition();
+                    }
+                }
+            } else {
+                reader = std::move(*std::get<foreign_unique_ptr<flat_mutation_reader>>(current_state.reader));
+            }
+
+            if (!reader) {
+                return;
+            }
+
            auto& buffer = current_state.buffer;
            const auto fragments = buffer.size();
            const auto size_before = reader->buffer_size();
@@ -541,6 +514,33 @@ future<> read_context::save_reader(ready_to_save_state& current_state, const dht
    });
 }

+future<> read_context::pause(foreign_unique_ptr<flat_mutation_reader> reader) {
+    const auto shard = reader.get_owner_shard();
+    return _db.invoke_on(shard, [reader = std::move(reader)] (database& db) mutable {
+        return db.user_read_concurrency_sem().register_inactive_read(std::make_unique<inactive_read>(std::move(reader)));
+    }).then([this, shard] (reader_concurrency_semaphore::inactive_read_handle handle) {
+        auto& current_state = std::get<used_state>(_readers[shard]);
+        _readers[shard] = paused_state{std::move(current_state.params), std::move(current_state.read_operation), handle};
+    });
+}
+
+future<foreign_unique_ptr<flat_mutation_reader>> read_context::try_resume(shard_id shard) {
+    return _db.invoke_on(shard, [handle = std::get<paused_state>(_readers[shard]).handle] (database& db) mutable {
+        if (auto inactive_read_ptr = db.user_read_concurrency_sem().unregister_inactive_read(handle)) {
+            return std::move(static_cast<inactive_read&>(*inactive_read_ptr).reader);
+        }
+        return foreign_unique_ptr<flat_mutation_reader>();
+    }).then([this, shard] (foreign_unique_ptr<flat_mutation_reader> reader) {
+        if (reader) {
+            auto& current_state = std::get<paused_state>(_readers[shard]);
+            _readers[shard] = used_state{std::move(current_state.params), std::move(current_state.read_operation)};
+        } else {
+            _readers[shard] = evicted_state{};
+        }
+        return std::move(reader);
+    });
+}
+
 future<> read_context::lookup_readers() {
    if (_cmd.query_uuid == utils::UUID{} || _cmd.is_first_page) {
        return make_ready_future<>();
@@ -574,49 +574,37 @@ future<> read_context::save_readers(circular_buffer<mutation_fragment> unconsume
        return make_ready_future<>();
    }

-    auto last_pkey = compaction_state.partition_start.key();
+    return _dismantling_gate.close().then([this, unconsumed_buffer = std::move(unconsumed_buffer), compaction_state = std::move(compaction_state),
+          last_ckey = std::move(last_ckey)] () mutable {
+        auto last_pkey = compaction_state.partition_start.key();

-    const auto cb_stats = dismantle_combined_buffer(std::move(unconsumed_buffer), last_pkey);
-    tracing::trace(_trace_state, "Dismantled combined buffer: {} partitions/{} fragments/{} bytes", cb_stats.partitions, cb_stats.fragments,
-            cb_stats.bytes);
+        const auto cb_stats = dismantle_combined_buffer(std::move(unconsumed_buffer), last_pkey);
+        tracing::trace(_trace_state, "Dismantled combined buffer: {} partitions/{} fragments/{} bytes", cb_stats.partitions, cb_stats.fragments,
+                cb_stats.bytes);

-    const auto cs_stats = dismantle_compaction_state(std::move(compaction_state));
-    tracing::trace(_trace_state, "Dismantled compaction state: {} partitions/{} fragments/{} bytes", cs_stats.partitions, cs_stats.fragments,
-            cs_stats.bytes);
+        const auto cs_stats = dismantle_compaction_state(std::move(compaction_state));
+        tracing::trace(_trace_state, "Dismantled compaction state: {} partitions/{} fragments/{} bytes", cs_stats.partitions, cs_stats.fragments,
+                cs_stats.bytes);

-    return do_with(std::move(last_pkey), std::move(last_ckey), [this] (const dht::decorated_key& last_pkey,
+        return do_with(std::move(last_pkey), std::move(last_ckey), [this] (const dht::decorated_key& last_pkey,
                const std::optional<clustering_key_prefix>& last_ckey) {
-        return parallel_for_each(_readers, [this, &last_pkey, &last_ckey] (reader_state& rs) {
-            if (auto* maybe_successful_lookup_state = std::get_if<successful_lookup_state>(&rs)) {
-                auto& current_state = *maybe_successful_lookup_state;
-                rs = ready_to_save_state{std::move(current_state.params), std::move(current_state.read_operation),
-                        std::move(current_state.reader), circular_buffer<mutation_fragment>{}};
-                return save_reader(std::get<ready_to_save_state>(rs), last_pkey, last_ckey);
-            }
+            return parallel_for_each(_readers, [this, &last_pkey, &last_ckey] (reader_state& rs) {
+                if (auto* maybe_successful_lookup_state = std::get_if<successful_lookup_state>(&rs)) {
+                    auto& current_state = *maybe_successful_lookup_state;
+                    rs = ready_to_save_state{std::move(current_state.params), std::move(current_state.read_operation),
+                            std::move(current_state.reader), circular_buffer<mutation_fragment>{}};
+                    return save_reader(std::get<ready_to_save_state>(rs), last_pkey, last_ckey);
+                }

-            auto finish_saving = [this, &last_pkey, &last_ckey] (dismantling_state& current_state) {
-                return current_state.reader_fut.then_wrapped([this, &current_state, &last_pkey, &last_ckey] (
-                            future<stopped_foreign_reader>&& stopped_reader_fut) mutable {
-                    if (auto* ready_state = prepare_reader_for_saving(current_state, std::move(stopped_reader_fut), last_pkey, last_ckey)) {
-                        return save_reader(*ready_state, last_pkey, last_ckey);
-                    }
-                    return make_ready_future<>();
-                });
-            };
+                if (auto* maybe_dismantling_state = std::get_if<dismantling_state>(&rs)) {
+                    auto& current_state = *maybe_dismantling_state;
+                    rs = ready_to_save_state{std::move(current_state.params), std::move(current_state.read_operation),
+                            std::move(current_state.reader), std::move(current_state.buffer)};
+                    return save_reader(std::get<ready_to_save_state>(rs), last_pkey, last_ckey);
+                }

-            if (auto* maybe_dismantling_state = std::get_if<dismantling_state>(&rs)) {
-                return finish_saving(*maybe_dismantling_state);
-            }
-
-            if (auto* maybe_future_dismantling_state = std::get_if<future_dismantling_state>(&rs)) {
-                return maybe_future_dismantling_state->fut.then([this, &rs,
-                        finish_saving = std::move(finish_saving)] (dismantling_state&& next_state) mutable {
-                    rs = std::move(next_state);
-                    return finish_saving(std::get<dismantling_state>(rs));
-                });
-            }
-
-            return make_ready_future<>();
+                return make_ready_future<>();
+            });
        });
    });
 }
@@ -629,8 +617,8 @@ static future<reconcilable_result> do_query_mutations(
        tracing::trace_state_ptr trace_state,
        db::timeout_clock::time_point timeout,
        query::result_memory_accounter&& accounter) {
-    return do_with(std::make_unique<read_context>(db, s, cmd, ranges, trace_state), [s, &cmd, &ranges, trace_state, timeout,
-            accounter = std::move(accounter)] (std::unique_ptr<read_context>& ctx) mutable {
+    return do_with(seastar::make_shared<read_context>(db, s, cmd, ranges, trace_state), [s, &cmd, &ranges, trace_state, timeout,
+            accounter = std::move(accounter)] (shared_ptr<read_context>& ctx) mutable {
        return ctx->lookup_readers().then([&ctx, s = std::move(s), &cmd, &ranges, trace_state, timeout,
                accounter = std::move(accounter)] () mutable {
            auto ms = mutation_source([&] (schema_ptr s,
@@ -638,10 +626,9 @@ static future<reconcilable_result> do_query_mutations(
                    const query::partition_slice& ps,
                    const io_priority_class& pc,
                    tracing::trace_state_ptr trace_state,
-                    streamed_mutation::forwarding fwd_sm,
+                    streamed_mutation::forwarding,
                    mutation_reader::forwarding fwd_mr) {
-                return make_multishard_combining_reader(std::move(s), pr, ps, pc, dht::global_partitioner(), ctx->factory(), std::move(trace_state),
-                        fwd_sm, fwd_mr, ctx->dismantler());
+                return make_multishard_combining_reader(ctx, dht::global_partitioner(), std::move(s), pr, ps, pc, std::move(trace_state), fwd_mr);
            });
            auto reader = make_flat_multi_range_reader(s, std::move(ms), ranges, cmd.slice, service::get_local_sstable_query_read_priority(),
                    trace_state, mutation_reader::forwarding::no);
--- a/mutation_reader.cc
+++ b/mutation_reader.cc
@@ -556,128 +556,6 @@ flat_mutation_reader make_combined_reader(schema_ptr schema,
    return make_combined_reader(std::move(schema), std::move(v), fwd_sm, fwd_mr);
 }

-void reader_concurrency_semaphore::signal(const resources& r) {
-    _resources += r;
-    while (!_wait_list.empty() && has_available_units(_wait_list.front().res)) {
-        auto& x = _wait_list.front();
-        _resources -= x.res;
-        x.pr.set_value(make_lw_shared<reader_permit>(*this, x.res));
-        _wait_list.pop_front();
-    }
-}
-
-future<lw_shared_ptr<reader_concurrency_semaphore::reader_permit>> reader_concurrency_semaphore::wait_admission(size_t memory,
-        db::timeout_clock::time_point timeout) {
-    if (_wait_list.size() >= _max_queue_length) {
-        return make_exception_future<lw_shared_ptr<reader_permit>>(_make_queue_overloaded_exception());
-    }
-    auto r = resources(1, static_cast<ssize_t>(memory));
-    if (!may_proceed(r) && _evict_an_inactive_reader) {
-        while (_evict_an_inactive_reader() && !may_proceed(r));
-    }
-    if (may_proceed(r)) {
-        _resources -= r;
-        return make_ready_future<lw_shared_ptr<reader_permit>>(make_lw_shared<reader_permit>(*this, r));
-    }
-    promise<lw_shared_ptr<reader_permit>> pr;
-    auto fut = pr.get_future();
-    _wait_list.push_back(entry(std::move(pr), r), timeout);
-    return fut;
-}
-
-// A file that tracks the memory usage of buffers resulting from read
-// operations.
-class tracking_file_impl : public file_impl {
-    file _tracked_file;
-    lw_shared_ptr<reader_concurrency_semaphore::reader_permit> _permit;
-
-    // Shouldn't be called if semaphore is NULL.
-    temporary_buffer<uint8_t> make_tracked_buf(temporary_buffer<uint8_t> buf) {
-        return seastar::temporary_buffer<uint8_t>(buf.get_write(),
-                buf.size(),
-                make_deleter(buf.release(), std::bind(&reader_concurrency_semaphore::reader_permit::signal_memory, _permit, buf.size())));
-    }
-
-public:
-    tracking_file_impl(file file, reader_resource_tracker resource_tracker)
-        : _tracked_file(std::move(file))
-        , _permit(resource_tracker.get_permit()) {
-    }
-
-    tracking_file_impl(const tracking_file_impl&) = delete;
-    tracking_file_impl& operator=(const tracking_file_impl&) = delete;
-    tracking_file_impl(tracking_file_impl&&) = default;
-    tracking_file_impl& operator=(tracking_file_impl&&) = default;
-
-    virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override {
-        return get_file_impl(_tracked_file)->write_dma(pos, buffer, len, pc);
-    }
-
-    virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override {
-        return get_file_impl(_tracked_file)->write_dma(pos, std::move(iov), pc);
-    }
-
-    virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) override {
-        return get_file_impl(_tracked_file)->read_dma(pos, buffer, len, pc);
-    }
-
-    virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override {
-        return get_file_impl(_tracked_file)->read_dma(pos, iov, pc);
-    }
-
-    virtual future<> flush(void) override {
-        return get_file_impl(_tracked_file)->flush();
-    }
-
-    virtual future<struct stat> stat(void) override {
-        return get_file_impl(_tracked_file)->stat();
-    }
-
-    virtual future<> truncate(uint64_t length) override {
-        return get_file_impl(_tracked_file)->truncate(length);
-    }
-
-    virtual future<> discard(uint64_t offset, uint64_t length) override {
-        return get_file_impl(_tracked_file)->discard(offset, length);
-    }
-
-    virtual future<> allocate(uint64_t position, uint64_t length) override {
-        return get_file_impl(_tracked_file)->allocate(position, length);
-    }
-
-    virtual future<uint64_t> size(void) override {
-        return get_file_impl(_tracked_file)->size();
-    }
-
-    virtual future<> close() override {
-        return get_file_impl(_tracked_file)->close();
-    }
-
-    virtual std::unique_ptr<file_handle_impl> dup() override {
-        return get_file_impl(_tracked_file)->dup();
-    }
-
-    virtual subscription<directory_entry> list_directory(std::function<future<> (directory_entry de)> next) override {
-        return get_file_impl(_tracked_file)->list_directory(std::move(next));
-    }
-
-    virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc) override {
-        return get_file_impl(_tracked_file)->dma_read_bulk(offset, range_size, pc).then([this] (temporary_buffer<uint8_t> buf) {
-            if (_permit) {
-                buf = make_tracked_buf(std::move(buf));
-                _permit->consume_memory(buf.size());
-            }
-            return make_ready_future<temporary_buffer<uint8_t>>(std::move(buf));
-        });
-    }
-};
-
-
-file reader_resource_tracker::track(file f) const {
-    return file(make_shared<tracking_file_impl>(f, *this));
-}
-
-
 class restricting_mutation_reader : public flat_mutation_reader::impl {
    struct mutation_source_and_params {
        mutation_source _ms;
@@ -840,12 +718,14 @@ class foreign_reader : public flat_mutation_reader::impl {
    template <typename T>
    using foreign_unique_ptr = foreign_ptr<std::unique_ptr<T>>;

+    using fragment_buffer = circular_buffer<mutation_fragment>;
+
    foreign_unique_ptr<flat_mutation_reader> _reader;
    foreign_unique_ptr<future<>> _read_ahead_future;
-    // Increase this counter every time next_partition() is called.
-    // These pending calls will be executed the next time we go to the remote
+    // Set this flag when next_partition() is called.
+    // This pending call will be executed the next time we go to the remote
    // reader (a fill_buffer() or a fast_forward_to() call).
-    unsigned _pending_next_partition = 0;
+    bool _pending_next_partition = false;
    streamed_mutation::forwarding _fwd_sm;

    // Forward an operation to the reader on the remote shard.
@@ -859,12 +739,11 @@ class foreign_reader : public flat_mutation_reader::impl {
    Result forward_operation(db::timeout_clock::time_point timeout, Operation op) {
        return smp::submit_to(_reader.get_owner_shard(), [reader = _reader.get(),
                read_ahead_future = std::exchange(_read_ahead_future, nullptr),
-                pending_next_partition = std::exchange(_pending_next_partition, 0),
+                pending_next_partition = std::exchange(_pending_next_partition, false),
                timeout,
                op = std::move(op)] () mutable {
            auto exec_op_and_read_ahead = [=] () mutable {
-                while (pending_next_partition) {
-                    --pending_next_partition;
+                if (pending_next_partition) {
                    reader->next_partition();
                }
                return op().then([=] (auto... results) {
@@ -883,6 +762,8 @@ class foreign_reader : public flat_mutation_reader::impl {
            return make_ready_future<decltype(results)...>(std::move(results)...);
        });
    }
+
+    void update_buffer_with(foreign_unique_ptr<fragment_buffer> buffer, bool end_of_steam);
 public:
    foreign_reader(schema_ptr schema,
            foreign_unique_ptr<flat_mutation_reader> reader,
@@ -902,10 +783,22 @@ public:
    virtual future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) override;

    const mutation_fragment& peek_buffer() const { return buffer().front(); }
+    const circular_buffer<mutation_fragment>& get_buffer() const { return buffer(); }

-    future<stopped_foreign_reader> stop();
+    future<foreign_unique_ptr<flat_mutation_reader>> pause();
+    void resume(foreign_unique_ptr<flat_mutation_reader> reader);
+
+    future<reader_lifecycle_policy::paused_or_stopped_reader> stop();
 };

+void foreign_reader::update_buffer_with(foreign_unique_ptr<fragment_buffer> buffer, bool end_of_steam) {
+    _end_of_stream = end_of_steam;
+    for (const auto& mf : *buffer) {
+        // Need a copy since the mf is on the remote shard.
+        push_mutation_fragment(mutation_fragment(*_schema, mf));
+    }
+}
+
 foreign_reader::foreign_reader(schema_ptr schema,
        foreign_unique_ptr<flat_mutation_reader> reader,
        streamed_mutation::forwarding fwd_sm)
@@ -931,8 +824,6 @@ future<> foreign_reader::fill_buffer(db::timeout_clock::time_point timeout) {
        return make_ready_future();
    }

-    using fragment_buffer = circular_buffer<mutation_fragment>;
-
    return forward_operation(timeout, [reader = _reader.get(), timeout] () {
        auto f = reader->is_buffer_empty() ? reader->fill_buffer(timeout) : make_ready_future<>();
        return f.then([=] {
@@ -940,12 +831,8 @@ future<> foreign_reader::fill_buffer(db::timeout_clock::time_point timeout) {
                    std::make_unique<fragment_buffer>(reader->detach_buffer()),
                    reader->is_end_of_stream());
        });
-    }).then([this] (foreign_unique_ptr<fragment_buffer> buffer, bool end_of_steam) mutable {
-        _end_of_stream = end_of_steam;
-        for (const auto& mf : *buffer) {
-            // Need a copy since the mf is on the remote shard.
-            push_mutation_fragment(mutation_fragment(*_schema, mf));
-        }
+    }).then([this] (foreign_unique_ptr<fragment_buffer> buffer, bool end_of_stream) mutable {
+        update_buffer_with(std::move(buffer), end_of_stream);
    });
 }

@@ -953,12 +840,12 @@ void foreign_reader::next_partition() {
    if (_fwd_sm == streamed_mutation::forwarding::yes) {
        clear_buffer();
        _end_of_stream = false;
-        ++_pending_next_partition;
+        _pending_next_partition = true;
    } else {
        clear_buffer_to_next_partition();
        if (is_buffer_empty()) {
            _end_of_stream = false;
-            ++_pending_next_partition;
+            _pending_next_partition = true;
        }
    }
 }
@@ -979,26 +866,61 @@ future<> foreign_reader::fast_forward_to(position_range pr, db::timeout_clock::t
    });
 }

-future<stopped_foreign_reader> foreign_reader::stop() {
-    if (_read_ahead_future || _pending_next_partition) {
+future<reader_lifecycle_policy::paused_or_stopped_reader> foreign_reader::stop() {
+    if (_reader && (_read_ahead_future || _pending_next_partition)) {
        const auto owner_shard = _reader.get_owner_shard();
        return smp::submit_to(owner_shard, [reader = _reader.get(),
                read_ahead_future = std::exchange(_read_ahead_future, nullptr),
-                pending_next_partition = std::exchange(_pending_next_partition, 0)] () mutable {
+                pending_next_partition = std::exchange(_pending_next_partition, false)] () mutable {
            auto fut = read_ahead_future ? std::move(*read_ahead_future) : make_ready_future<>();
            return fut.then([=] () mutable {
-                for (;pending_next_partition > 0; --pending_next_partition) {
+                if (pending_next_partition) {
                    reader->next_partition();
                }
            });
        }).then([this] {
-            return stopped_foreign_reader{std::move(_reader), detach_buffer()};
+            return reader_lifecycle_policy::paused_or_stopped_reader{std::move(_reader), detach_buffer(), false};
        });
    } else {
-        return make_ready_future<stopped_foreign_reader>(stopped_foreign_reader{std::move(_reader), detach_buffer()});
+        return make_ready_future<reader_lifecycle_policy::paused_or_stopped_reader>(
+                reader_lifecycle_policy::paused_or_stopped_reader{std::move(_reader), detach_buffer(), _pending_next_partition});
    }
 }

+future<foreign_ptr<std::unique_ptr<flat_mutation_reader>>> foreign_reader::pause() {
+    return smp::submit_to(_reader.get_owner_shard(), [reader = _reader.get(),
+            read_ahead_future = std::exchange(_read_ahead_future, nullptr),
+            pending_next_partition = std::exchange(_pending_next_partition, false)] () mutable {
+        auto fut = read_ahead_future ? std::move(*read_ahead_future) : make_ready_future<>();
+        return fut.then([=] () mutable {
+            if (pending_next_partition) {
+                reader->next_partition();
+            }
+            return make_ready_future<foreign_unique_ptr<fragment_buffer>, bool>(
+                    std::make_unique<fragment_buffer>(reader->detach_buffer()),
+                    reader->is_end_of_stream());
+        });
+    }).then([this] (foreign_unique_ptr<fragment_buffer>&& buffer, bool end_of_stream) mutable {
+        update_buffer_with(std::move(buffer), end_of_stream);
+
+        // An ongoing pause() might overlap with a next_partition() call.
+        // So if there is a pending next partition, try to execute it again
+        // after the remote buffer was transferred. This is required for
+        // correctness, otherwise some fragments belonging to the to-be-skipped
+        // partition can escape the next_partition() call, both on the local and
+        // the remote shard.
+        if (_pending_next_partition) {
+            _pending_next_partition = false;
+            next_partition();
+        }
+        return std::move(_reader);
+    });
+}
+
+void foreign_reader::resume(foreign_ptr<std::unique_ptr<flat_mutation_reader>> reader) {
+    _reader = std::move(reader);
+}
+
 flat_mutation_reader make_foreign_reader(schema_ptr schema,
            foreign_ptr<std::unique_ptr<flat_mutation_reader>> reader,
            streamed_mutation::forwarding fwd_sm) {
@@ -1010,14 +932,12 @@ flat_mutation_reader make_foreign_reader(schema_ptr schema,

 // See make_multishard_combining_reader() for description.
 class multishard_combining_reader : public flat_mutation_reader::impl {
+    shared_ptr<reader_lifecycle_policy> _lifecycle_policy;
    const dht::i_partitioner& _partitioner;
    const dht::partition_range* _pr;
    const query::partition_slice& _ps;
    const io_priority_class& _pc;
-    remote_reader_factory _reader_factory;
-    foreign_reader_dismantler _reader_dismantler;
    tracing::trace_state_ptr _trace_state;
-    const streamed_mutation::forwarding _fwd_sm;
    const mutation_reader::forwarding _fwd_mr;

    // Thin wrapper around a flat_mutation_reader (foreign_reader) that
@@ -1035,14 +955,30 @@ class multishard_combining_reader : public flat_mutation_reader::impl {
    class shard_reader {
        struct state {
            std::unique_ptr<foreign_reader> reader;
-            unsigned pending_next_partition = 0;
            bool stopped = false;
-            promise<> reader_promise;
+            bool drop_partition_start = false;
+            bool drop_static_row = false;
        };
        const multishard_combining_reader& _parent;
        const unsigned _shard;
        lw_shared_ptr<state> _state;
        std::optional<future<>> _read_ahead;
+        std::optional<future<>> _pause;
+
+        std::optional<dht::decorated_key> _last_pkey;
+        std::optional<position_in_partition> _last_position_in_partition;
+        // These are used when the reader has to be recreated (after having been
+        // evicted while paused) and the range and/or slice it is recreated with
+        // differs from the original ones.
+        std::optional<dht::partition_range> _range_override;
+        std::optional<query::partition_slice> _slice_override;
+
+    private:
+        void update_last_position();
+        void adjust_partition_slice();
+        future<foreign_ptr<std::unique_ptr<flat_mutation_reader>>> recreate_reader();
+        future<> resume();
+        future<> do_fill_buffer(db::timeout_clock::time_point timeout);

    public:
        shard_reader(multishard_combining_reader& parent, unsigned shard)
@@ -1057,11 +993,7 @@ class multishard_combining_reader : public flat_mutation_reader::impl {
        shard_reader(const shard_reader&) = delete;
        shard_reader& operator=(const shard_reader&) = delete;

-        ~shard_reader() {
-            if (!_state->stopped) {
-                stop();
-            }
-        }
+        ~shard_reader();

        // These methods assume the reader is already created.
        bool is_end_of_stream() const {
@@ -1081,7 +1013,6 @@ class multishard_combining_reader : public flat_mutation_reader::impl {
        // These methods don't assume the reader is already created.
        void next_partition();
        future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout);
-        future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout);
        future<> create_reader();
        explicit operator bool() const {
            return bool(_state->reader);
@@ -1093,7 +1024,7 @@ class multishard_combining_reader : public flat_mutation_reader::impl {
        bool is_read_ahead_in_progress() const {
            return _read_ahead.has_value();
        }
-        future<stopped_foreign_reader> stop();
+        void pause();
    };

    std::vector<shard_reader> _shard_readers;
@@ -1106,18 +1037,15 @@ class multishard_combining_reader : public flat_mutation_reader::impl {
    future<> handle_empty_reader_buffer(db::timeout_clock::time_point timeout);

 public:
-    multishard_combining_reader(schema_ptr s,
-        const dht::partition_range& pr,
-        const query::partition_slice& ps,
-        const io_priority_class& pc,
-        const dht::i_partitioner& partitioner,
-        remote_reader_factory reader_factory,
-        tracing::trace_state_ptr trace_state,
-        streamed_mutation::forwarding fwd_sm,
-        mutation_reader::forwarding fwd_mr,
-        foreign_reader_dismantler reader_dismantler);
-
-    ~multishard_combining_reader();
+    multishard_combining_reader(
+            shared_ptr<reader_lifecycle_policy> lifecycle_policy,
+            const dht::i_partitioner& partitioner,
+            schema_ptr s,
+            const dht::partition_range& pr,
+            const query::partition_slice& ps,
+            const io_priority_class& pc,
+            tracing::trace_state_ptr trace_state,
+            mutation_reader::forwarding fwd_mr);

    // this is captured.
    multishard_combining_reader(const multishard_combining_reader&) = delete;
@@ -1131,94 +1059,289 @@ public:
    virtual future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) override;
 };

+multishard_combining_reader::shard_reader::~shard_reader() {
+    // Nothing to do if there was no reader created, nor is there a background
+    // read ahead in progress which will create one.
+    if (!_state->reader && !_read_ahead) {
+        return;
+    }
+
+    _state->stopped = true;
+
+    auto f = [this] {
+        if (_read_ahead) {
+            return std::move(*_read_ahead);
+        } else if (_pause) {
+            return std::move(*_pause);
+        } else {
+            return make_ready_future<>();
+        }
+    }();
+
+    _parent._lifecycle_policy->destroy_reader(_shard, f.then([state = _state.get()] {
+        return state->reader->stop();
+    }).finally([state = _state] {}));
+}
+
+void multishard_combining_reader::shard_reader::update_last_position() {
+    auto& reader = *_state->reader;
+    if (reader.is_buffer_empty()) {
+        return;
+    }
+
+    auto rbegin = std::reverse_iterator(reader.get_buffer().end());
+    auto rend = std::reverse_iterator(reader.get_buffer().begin());
+    if (auto pk_it = std::find_if(rbegin, rend, std::mem_fn(&mutation_fragment::is_partition_start)); pk_it != rend) {
+        _last_pkey = pk_it->as_partition_start().key();
+    }
+
+    _last_position_in_partition.emplace(reader.get_buffer().back().position());
+}
+
+void multishard_combining_reader::shard_reader::adjust_partition_slice() {
+    if (!_slice_override) {
+        _slice_override = _parent._ps;
+    }
+
+    const auto& schema = *_parent._schema;
+    _slice_override->clear_range(schema, _last_pkey->key());
+    auto& last_ckey = _last_position_in_partition->key();
+
+    auto cmp = bound_view::compare(schema);
+    auto eq = clustering_key_prefix::equality(schema);
+
+    auto ranges = _slice_override->default_row_ranges();
+    auto it = ranges.begin();
+    while (it != ranges.end()) {
+        auto range = bound_view::from_range(*it);
+        if (cmp(range.second, last_ckey) || eq(range.second.prefix(), last_ckey)) {
+            it = ranges.erase(it);
+        } else {
+            if (cmp(range.first, last_ckey)) {
+                assert(cmp(last_ckey, range.second));
+                *it = query::clustering_range(query::clustering_range::bound{last_ckey, false}, it->end());
+            }
+            ++it;
+        }
+    }
+
+    _slice_override->clear_ranges();
+    _slice_override->set_range(schema, _last_pkey->key(), std::move(ranges));
+}
+
+future<foreign_ptr<std::unique_ptr<flat_mutation_reader>>> multishard_combining_reader::shard_reader::recreate_reader() {
+    const dht::partition_range* range = _parent._pr;
+    const query::partition_slice* slice = &_parent._ps;
+
+    if (_last_pkey) {
+        bool partition_range_is_inclusive = true;
+
+        if (_last_position_in_partition) {
+            switch (_last_position_in_partition->region()) {
+            case partition_region::partition_start:
+                _state->drop_partition_start = true;
+                break;
+            case partition_region::static_row:
+                _state->drop_partition_start = true;
+                _state->drop_static_row = true;
+                break;
+            case partition_region::clustered:
+                _state->drop_partition_start = true;
+                _state->drop_static_row = true;
+                adjust_partition_slice();
+                slice = &*_slice_override;
+                break;
+            case partition_region::partition_end:
+                partition_range_is_inclusive = false;
+                break;
+            }
+        }
+
+        // The original range contained a single partition and we've read it
+        // all. We'd have to create a reader with an empty range that would
+        // immediately be at EOS. This is not possible so just don't recreate
+        // the reader.
+        // This should be extremely rare (who'd create a multishard reader to
+        // read a single partition) but still, let's make sure we handle it
+        // correctly.
+        if (_parent._pr->is_singular() && !partition_range_is_inclusive) {
+            return make_ready_future<foreign_ptr<std::unique_ptr<flat_mutation_reader>>>();
+        }
+
+        _range_override = dht::partition_range({dht::partition_range::bound(*_last_pkey, partition_range_is_inclusive)}, _parent._pr->end());
+        range = &*_range_override;
+    }
+
+    return _parent._lifecycle_policy->create_reader(
+            _shard,
+            _parent._schema,
+            *range,
+            *slice,
+            _parent._pc,
+            _parent._trace_state,
+            _parent._fwd_mr);
+}
+
+future<> multishard_combining_reader::shard_reader::resume() {
+    return std::exchange(_pause, std::nullopt)->then([this, state = _state] {
+        if (state->stopped) {
+            return make_ready_future<>();
+        }
+        return _parent._lifecycle_policy->try_resume(_shard).then(
+                [this, state = std::move(state)] (foreign_ptr<std::unique_ptr<flat_mutation_reader>> reader) mutable {
+            if (reader) {
+                state->reader->resume(std::move(reader));
+                return make_ready_future<>();
+            } else if (state->stopped) {
+                return make_ready_future<>();
+            } else {
+                return recreate_reader().then([this, state = std::move(state)] (foreign_ptr<std::unique_ptr<flat_mutation_reader>> reader) {
+                    state->reader->resume(std::move(reader));
+                });
+            }
+        });
+    });
+}
+
+future<> multishard_combining_reader::shard_reader::do_fill_buffer(db::timeout_clock::time_point timeout) {
+    return _state->reader->fill_buffer(timeout).then([this, state = _state] {
+        auto& reader = *state->reader;
+
+        if (reader.is_buffer_empty()) {
+            return;
+        }
+        if (state->drop_partition_start) {
+            state->drop_partition_start = false;
+            if (reader.peek_buffer().is_partition_start()) {
+                reader.pop_mutation_fragment();
+            }
+        }
+
+        if (reader.is_buffer_empty()) {
+            return;
+        }
+        if (state->drop_static_row) {
+            state->drop_static_row = false;
+            if (reader.peek_buffer().is_static_row()) {
+                reader.pop_mutation_fragment();
+            }
+        }
+
+        if (!state->stopped) {
+            update_last_position();
+        }
+    });
+}
+
 future<> multishard_combining_reader::shard_reader::fill_buffer(db::timeout_clock::time_point timeout) {
    if (_read_ahead) {
        return *std::exchange(_read_ahead, std::nullopt);
    }
-    return _state->reader->fill_buffer(timeout);
+    if (!_state->reader->is_buffer_empty()) {
+        return make_ready_future<>();
+    }
+    if (_pause) {
+        return resume().then([this, timeout] {
+            return fill_buffer(timeout);
+        });
+    }
+    return do_fill_buffer(timeout);
 }

 void multishard_combining_reader::shard_reader::next_partition() {
+    _last_position_in_partition = position_in_partition(position_in_partition::end_of_partition_tag_t{});
+
+    // The only case this can be called with an uncreated reader is when
+    // `next_partition()` is called on the multishard reader before the
+    // first `fill_buffer()` call. In this case we are right before the first
+    // partition so this call has no effect, hence we can ignore it.
    if (_state->reader) {
        _state->reader->next_partition();
-    } else {
-        ++_state->pending_next_partition;
    }
 }

 future<> multishard_combining_reader::shard_reader::fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) {
    if (_state->reader) {
-        return _state->reader->fast_forward_to(pr, timeout);
+        _last_pkey.reset();
+        _last_position_in_partition.reset();
+
+        auto do_fast_forward = [this, &pr, timeout] {
+            return _state->reader->fast_forward_to(pr, timeout);
+        };
+
+        if (_pause) {
+            return resume().then(std::move(do_fast_forward));
+        }
+
+        if (_read_ahead) {
+            return std::exchange(_read_ahead, std::nullopt)->then(std::move(do_fast_forward));
+        }
+
+        return do_fast_forward();
    }
    // No need to fast-forward uncreated readers, they will be passed the new
    // range when created.
    return make_ready_future<>();
 }

-future<> multishard_combining_reader::shard_reader::fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) {
-    if (_state->reader) {
-        return _state->reader->fast_forward_to(pr, timeout);
-    }
-    return create_reader().then([this, pr = std::move(pr), timeout] {
-        return _state->reader->fast_forward_to(pr, timeout);
-    });
-}
-
 future<> multishard_combining_reader::shard_reader::create_reader() {
    if (_state->reader) {
        return make_ready_future<>();
    }
    if (_read_ahead) {
-        return _state->reader_promise.get_future();
+        return *std::exchange(_read_ahead, std::nullopt);
    }
-    return _parent._reader_factory(_shard, _parent._schema, *_parent._pr, _parent._ps, _parent._pc, _parent._trace_state,
-            _parent._fwd_sm, _parent._fwd_mr).then(
-            [schema = _parent._schema, state = _state, fwd_sm = _parent._fwd_sm] (foreign_ptr<std::unique_ptr<flat_mutation_reader>>&& r) mutable {
-        state->reader = std::make_unique<foreign_reader>(std::move(schema), std::move(r), fwd_sm);
-        for (;state->pending_next_partition; --state->pending_next_partition) {
-            state->reader->next_partition();
-        }
-
-        if (!state->stopped) {
-            state->reader_promise.set_value();
-        }
+    return _parent._lifecycle_policy->create_reader(_shard, _parent._schema, *_parent._pr, _parent._ps, _parent._pc, _parent._trace_state,
+            _parent._fwd_mr).then(
+            [schema = _parent._schema, state = _state] (foreign_ptr<std::unique_ptr<flat_mutation_reader>>&& r) mutable {
+        state->reader = std::make_unique<foreign_reader>(std::move(schema), std::move(r));
    });
 }

 void multishard_combining_reader::shard_reader::read_ahead(db::timeout_clock::time_point timeout) {
-    if (_read_ahead) {
+    if (_read_ahead || (_state->reader && (_state->reader->is_end_of_stream() || !_state->reader->is_buffer_empty()))) {
        return;
    }

-    if (_state->reader) {
-        _read_ahead.emplace(_state->reader->fill_buffer(timeout));
-    } else {
-        _read_ahead.emplace(create_reader().then([state = _state, timeout] () mutable {
-            if (state->stopped) {
-                return make_ready_future<>();
+    auto f = _state->reader
+        ? (_pause ? resume() : make_ready_future<>())
+        : create_reader();
+
+    _read_ahead.emplace(f.then([this, state = _state, timeout] () mutable {
+        if (state->stopped) {
+            return make_ready_future<>();
+        }
+        return do_fill_buffer(timeout).then([this, state = std::move(state)] {
+            // Read ahead is still in the background, so pause the reader.
+            if (!state->stopped && _read_ahead) {
+                pause();
            }
-            return state->reader->fill_buffer(timeout);
-        }));
-    }
+        });
+    }));
 }

-future<stopped_foreign_reader> multishard_combining_reader::shard_reader::stop() {
-    _state->stopped = true;
-
-    if (!_state->reader && !_read_ahead) {
-        return make_ready_future<stopped_foreign_reader>(stopped_foreign_reader{nullptr, circular_buffer<mutation_fragment>{}});
+void multishard_combining_reader::shard_reader::pause() {
+    if (_pause) {
+        return;
    }
-
-    auto f = [this] {
-        if (_read_ahead) {
-            return _read_ahead->then([state = _state.get()] () mutable {
-                return state->reader->stop();
-            });
-        } else {
-            return _state->reader->stop();
+    auto f = _read_ahead ? *std::exchange(_read_ahead, std::nullopt) : make_ready_future<>();
+    _pause = f.then([this, state = _state] () mutable {
+        if (state->stopped) {
+            return make_ready_future<>();
        }
-    }();
-    return f.finally([state = _state] {});
+        return state->reader->pause().then([this, state = std::move(state)] (foreign_ptr<std::unique_ptr<flat_mutation_reader>> reader) {
+            if (state->stopped) {
+                state->reader->resume(std::move(reader));
+                return make_ready_future<>();
+            }
+
+            // When pausing, the content of the remote reader's buffer is transferred to
+            // the foreign reader, so we might need to update the last position.
+            update_last_position();
+
+            return _parent._lifecycle_policy->pause(std::move(reader));
+        });
+    });
 }

 void multishard_combining_reader::move_to_next_shard() {
@@ -1231,11 +1354,12 @@ future<> multishard_combining_reader::handle_empty_reader_buffer(db::timeout_clo
    auto& reader = _shard_readers[_current_shard];

    if (reader.is_end_of_stream()) {
-        if (_fwd_sm || std::all_of(_shard_readers.begin(), _shard_readers.end(), std::mem_fn(&shard_reader::done))) {
+        if (std::all_of(_shard_readers.begin(), _shard_readers.end(), std::mem_fn(&shard_reader::done))) {
            _end_of_stream = true;
        } else {
            move_to_next_shard();
        }
+        reader.pause();
        return make_ready_future<>();
    } else if (reader.is_read_ahead_in_progress()) {
        return reader.fill_buffer(timeout);
@@ -1257,25 +1381,22 @@ future<> multishard_combining_reader::handle_empty_reader_buffer(db::timeout_clo
    }
 }

-multishard_combining_reader::multishard_combining_reader(schema_ptr s,
+multishard_combining_reader::multishard_combining_reader(
+        shared_ptr<reader_lifecycle_policy> lifecycle_policy,
+        const dht::i_partitioner& partitioner,
+        schema_ptr s,
        const dht::partition_range& pr,
        const query::partition_slice& ps,
        const io_priority_class& pc,
-        const dht::i_partitioner& partitioner,
-        remote_reader_factory reader_factory,
        tracing::trace_state_ptr trace_state,
-        streamed_mutation::forwarding fwd_sm,
-        mutation_reader::forwarding fwd_mr,
-        foreign_reader_dismantler reader_dismantler)
+        mutation_reader::forwarding fwd_mr)
    : impl(s)
+    , _lifecycle_policy(std::move(lifecycle_policy))
    , _partitioner(partitioner)
    , _pr(&pr)
    , _ps(ps)
    , _pc(pc)
-    , _reader_factory(std::move(reader_factory))
-    , _reader_dismantler(std::move(reader_dismantler))
    , _trace_state(std::move(trace_state))
-    , _fwd_sm(fwd_sm)
    , _fwd_mr(fwd_mr)
    , _current_shard(pr.start() ? _partitioner.shard_of(pr.start()->value().token()) : _partitioner.shard_of_minimum_token())
    , _next_token(_partitioner.token_for_next_shard(pr.start() ? pr.start()->value().token() : dht::minimum_token(),
@@ -1286,25 +1407,6 @@ multishard_combining_reader::multishard_combining_reader(schema_ptr s,
    }
 }

-multishard_combining_reader::~multishard_combining_reader() {
-    for (shard_id shard = 0; shard < smp::count; ++shard) {
-        auto& reader = _shard_readers[shard];
-
-        // Readers might also be created by background read-aheads, so it's not
-        // enough to check whether the reader is created at the moment, we also
-        // need to check whether there is a read-ahead in progress. If there is,
-        // it will surely create a reader which also needs to be dismantled.
-        if (!reader && !reader.is_read_ahead_in_progress()) {
-            continue;
-        }
-
-        auto fut = reader.stop();
-        if (_reader_dismantler) {
-            _reader_dismantler(shard, std::move(fut));
-        }
-    }
-}
-
 future<> multishard_combining_reader::fill_buffer(db::timeout_clock::time_point timeout) {
    _crossed_shards = false;
    return do_until([this] { return is_buffer_full() || is_end_of_stream(); }, [this, timeout] {
@@ -1320,6 +1422,7 @@ future<> multishard_combining_reader::fill_buffer(db::timeout_clock::time_point
        while (!reader.is_buffer_empty() && !is_buffer_full()) {
            if (const auto& mf = reader.peek_buffer(); mf.is_partition_start() && mf.as_partition_start().key().token() >= _next_token) {
                move_to_next_shard();
+                reader.pause();
                return make_ready_future<>();
            }
            push_mutation_fragment(reader.pop_mutation_fragment());
@@ -1329,15 +1432,9 @@ future<> multishard_combining_reader::fill_buffer(db::timeout_clock::time_point
 }

 void multishard_combining_reader::next_partition() {
-    if (_fwd_sm == streamed_mutation::forwarding::yes) {
-        clear_buffer();
-        _end_of_stream = false;
+    clear_buffer_to_next_partition();
+    if (is_buffer_empty()) {
        _shard_readers[_current_shard].next_partition();
-    } else {
-        clear_buffer_to_next_partition();
-        if (is_buffer_empty()) {
-            _shard_readers[_current_shard].next_partition();
-        }
    }
 }

@@ -1359,24 +1456,18 @@ future<> multishard_combining_reader::fast_forward_to(const dht::partition_range
 }

 future<> multishard_combining_reader::fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) {
-    forward_buffer_to(pr.start());
-    _end_of_stream = false;
-    if (is_buffer_empty()) {
-        return _shard_readers[_current_shard].fast_forward_to(std::move(pr), timeout);
-    }
-    return make_ready_future<>();
+    return make_exception_future<>(std::bad_function_call());
 }

-flat_mutation_reader make_multishard_combining_reader(schema_ptr schema,
+flat_mutation_reader make_multishard_combining_reader(
+        shared_ptr<reader_lifecycle_policy> lifecycle_policy,
+        const dht::i_partitioner& partitioner,
+        schema_ptr schema,
        const dht::partition_range& pr,
        const query::partition_slice& ps,
        const io_priority_class& pc,
-        const dht::i_partitioner& partitioner,
-        remote_reader_factory reader_factory,
        tracing::trace_state_ptr trace_state,
-        streamed_mutation::forwarding fwd_sm,
-        mutation_reader::forwarding fwd_mr,
-        foreign_reader_dismantler reader_dismantler) {
-    return make_flat_mutation_reader<multishard_combining_reader>(std::move(schema), pr, ps, pc, partitioner, std::move(reader_factory),
-            std::move(trace_state), fwd_sm, fwd_mr, std::move(reader_dismantler));
+        mutation_reader::forwarding fwd_mr) {
+    return make_flat_mutation_reader<multishard_combining_reader>(std::move(lifecycle_policy), partitioner, std::move(schema), pr, ps, pc,
+            std::move(trace_state), fwd_mr);
 }
--- a/mutation_reader.hh
+++ b/mutation_reader.hh
@@ -388,27 +388,81 @@ flat_mutation_reader make_foreign_reader(schema_ptr schema,
        foreign_ptr<std::unique_ptr<flat_mutation_reader>> reader,
        streamed_mutation::forwarding fwd_sm = streamed_mutation::forwarding::no);

-using remote_reader_factory = noncopyable_function<future<foreign_ptr<std::unique_ptr<flat_mutation_reader>>>(unsigned,
-        schema_ptr,
-        const dht::partition_range&,
-        const query::partition_slice&,
-        const io_priority_class&,
-        tracing::trace_state_ptr,
-        streamed_mutation::forwarding,
-        mutation_reader::forwarding)>;
+/// Reader lifecycle policy for the multishard combining reader.
+///
+/// This policy is expected to make sure any additional resource the readers
+/// might need is kept alive for the lifetime of the readers, not that
+/// of the multishard reader. This is a very important distinction. As
+/// destructors cannot return futures, the multishard reader will be
+/// destroyed before all its shard readers could stop properly. Hence it
+/// is the duty of this policy to make sure all objects the shard readers
+/// depend on stay alive until they are properly destroyed on their home
+/// shards. Note that this also includes the passed in `range` and `slice`
+/// parameters because although client code is required to keep them alive as
+/// long as the top level reader lives, the shard readers might outlive the
+/// multishard reader itself.
+class reader_lifecycle_policy {
+public:
+    struct paused_or_stopped_reader {
+        // Null when the reader is paused.
+        foreign_ptr<std::unique_ptr<flat_mutation_reader>> remote_reader;
+        circular_buffer<mutation_fragment> unconsumed_fragments;
+        // Only set for paused readers.
+        bool has_pending_next_partition;
+    };

-struct stopped_foreign_reader {
-    foreign_ptr<std::unique_ptr<flat_mutation_reader>> remote_reader;
-    circular_buffer<mutation_fragment> unconsumed_fragments;
+public:
+    /// Create an appropriate reader on the specified shard.
+    ///
+    /// Will be called when the multishard reader visits a shard for the
+    /// first time. This method should also enter gates, take locks or
+    /// whatever is appropriate to make sure resources it is using on the
+    /// remote shard stay alive, during the lifetime of the created reader.
+    virtual future<foreign_ptr<std::unique_ptr<flat_mutation_reader>>> create_reader(
+            shard_id shard,
+            schema_ptr schema,
+            const dht::partition_range& range,
+            const query::partition_slice& slice,
+            const io_priority_class& pc,
+            tracing::trace_state_ptr trace_state,
+            mutation_reader::forwarding fwd_mr) = 0;
+
+    /// Wait on the shard reader to stop then destroy it.
+    ///
+    /// Will be called when the multishard reader is being destroyed. It will be
+    /// called for each of the shard readers. The future resolves when the
+    /// reader is stopped, that is, it finishes all background and/or pending
+    /// work.
+    /// This method is expected to do a proper cleanup, that is, leave any gates,
+    /// release any locks or whatever is appropriate for the shard reader.
+    ///
+    /// The multishard reader cannot wait on the cleanup started by this
+    /// method (as it is called from the destructor), so waiting on
+    /// all the readers being cleaned up is up to the implementation.
+    ///
+    /// This method will be called from a destructor so it cannot throw.
+    virtual void destroy_reader(shard_id shard, future<paused_or_stopped_reader> reader) noexcept = 0;
+
+    /// Pause the reader.
+    ///
+    /// The purpose of pausing a reader is making it evictable while it is
+    /// otherwise inactive. This allows freeing up resources that are in-demand
+    /// by evicting these paused readers. Most notably, this allows freeing up
+    /// reader permits when the node is overloaded with reads.
+    virtual future<> pause(foreign_ptr<std::unique_ptr<flat_mutation_reader>> reader) = 0;
+
+    /// Try to resume the reader.
+    ///
+    /// The pointer returned will be null when resuming fails. This can happen
+    /// if the reader was evicted while paused.
+    virtual future<foreign_ptr<std::unique_ptr<flat_mutation_reader>>> try_resume(shard_id shard) = 0;
 };
-using foreign_reader_dismantler = noncopyable_function<void(shard_id, future<stopped_foreign_reader>)>;

 /// Make a multishard_combining_reader.
 ///
 /// multishard_combining_reader takes care of reading a range from all shards
-/// that own a subrange in the range. Readers are created on-demand with the
-/// supplied reader_factory. This factory function is expected to create an
-/// appropriate reader on the specified shard and return a foreign_ptr to it.
+/// that own a subrange in the range. Shard readers are created on-demand, when
+/// the shard is visited for the first time.
 ///
 /// The read starts with a concurrency of one, that is the reader reads from a
 /// single shard at a time. The concurrency is exponentially increased (to a
@@ -421,19 +475,13 @@ using foreign_reader_dismantler = noncopyable_function<void(shard_id, future<sto
 /// For dense tables (where we rarely cross shards) we rely on the
 /// foreign_reader to issue sufficient read-aheads on its own to avoid blocking.
 ///
-/// Optionally a dismantler function can be passed to the multishard
-/// reader. When the multishard reader is destroyed it will invoke the
-/// dismantler functor for each of its foreign (shard) readers, passing a future
-/// to a `stopped_foreign_reader`. The future becomes available when the foreign
-/// reader has stopped, that is, it finished all of its in-progress read aheads
-/// and/or any pending `next_partition()` calls.
-flat_mutation_reader make_multishard_combining_reader(schema_ptr schema,
+/// The readers' life-cycles are managed through the supplied lifecycle policy.
+flat_mutation_reader make_multishard_combining_reader(
+        shared_ptr<reader_lifecycle_policy> lifecycle_policy,
+        const dht::i_partitioner& partitioner,
+        schema_ptr schema,
        const dht::partition_range& pr,
        const query::partition_slice& ps,
        const io_priority_class& pc,
-        const dht::i_partitioner& partitioner,
-        remote_reader_factory reader_factory,
        tracing::trace_state_ptr trace_state = nullptr,
-        streamed_mutation::forwarding fwd_sm = streamed_mutation::forwarding::no,
-        mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::no,
-        foreign_reader_dismantler reader_dismantler = {});
+        mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::no);
--- a/position_in_partition.hh
+++ b/position_in_partition.hh
@@ -270,6 +270,7 @@ public:
    static position_in_partition for_range_start(const query::clustering_range&);
    static position_in_partition for_range_end(const query::clustering_range&);

+    partition_region region() const { return _type; }
    bool is_partition_start() const { return _type == partition_region::partition_start; }
    bool is_partition_end() const { return _type == partition_region::partition_end; }
    bool is_static_row() const { return _type == partition_region::static_row; }
--- a/querier.cc
+++ b/querier.cc
@@ -216,16 +216,43 @@ static querier_cache::entries::iterator find_querier(querier_cache::entries& ent
    return it->pos();
 }

-querier_cache::querier_cache(size_t max_cache_size, std::chrono::seconds entry_ttl)
-    : _expiry_timer([this] { scan_cache_entries(); })
+querier_cache::querier_cache(reader_concurrency_semaphore& sem, size_t max_cache_size, std::chrono::seconds entry_ttl)
+    : _sem(sem)
+    , _expiry_timer([this] { scan_cache_entries(); })
    , _entry_ttl(entry_ttl)
    , _max_queriers_memory_usage(max_cache_size) {
    _expiry_timer.arm_periodic(entry_ttl / 2);
 }

+class querier_inactive_read : public reader_concurrency_semaphore::inactive_read {
+    querier_cache::entries& _entries;
+    querier_cache::entries::iterator _pos;
+    querier_cache::stats& _stats;
+
+public:
+    querier_inactive_read(querier_cache::entries& entries, querier_cache::entries::iterator pos, querier_cache::stats& stats)
+        : _entries(entries)
+        , _pos(pos)
+        , _stats(stats) {
+    }
+    virtual void evict() override {
+        _entries.erase(_pos);
+        ++_stats.resource_based_evictions;
+        --_stats.population;
+    }
+};
+
 template <typename Querier>
-static void insert_querier(querier_cache::entries& entries, querier_cache::index& index, querier_cache::stats& stats,
-        size_t max_queriers_memory_usage, utils::UUID key, Querier&& q, lowres_clock::time_point expires, tracing::trace_state_ptr trace_state) {
+static void insert_querier(
+        reader_concurrency_semaphore& sem,
+        querier_cache::entries& entries,
+        querier_cache::index& index,
+        querier_cache::stats& stats,
+        size_t max_queriers_memory_usage,
+        utils::UUID key,
+        Querier&& q,
+        lowres_clock::time_point expires,
+        tracing::trace_state_ptr trace_state) {
    // FIXME: see #3159
    // In reverse mode flat_mutation_reader drops any remaining rows of the
    // current partition when the page ends so it cannot be reused across
@@ -258,27 +285,30 @@ static void insert_querier(querier_cache::entries& entries, querier_cache::index

    auto& e = entries.emplace_back(key, std::move(q), expires);
    e.set_pos(--entries.end());
+    e.set_inactive_handle(sem.register_inactive_read(std::make_unique<querier_inactive_read>(entries, e.pos(), stats)));
    index.insert(e);
    ++stats.population;
 }

 void querier_cache::insert(utils::UUID key, data_querier&& q, tracing::trace_state_ptr trace_state) {
-    insert_querier(_entries, _data_querier_index, _stats, _max_queriers_memory_usage, key, std::move(q), lowres_clock::now() + _entry_ttl,
+    insert_querier(_sem, _entries, _data_querier_index, _stats, _max_queriers_memory_usage, key, std::move(q), lowres_clock::now() + _entry_ttl,
            std::move(trace_state));
 }

 void querier_cache::insert(utils::UUID key, mutation_querier&& q, tracing::trace_state_ptr trace_state) {
-    insert_querier(_entries, _mutation_querier_index, _stats, _max_queriers_memory_usage, key, std::move(q), lowres_clock::now() + _entry_ttl,
+    insert_querier(_sem, _entries, _mutation_querier_index, _stats, _max_queriers_memory_usage, key, std::move(q), lowres_clock::now() + _entry_ttl,
            std::move(trace_state));
 }

 void querier_cache::insert(utils::UUID key, shard_mutation_querier&& q, tracing::trace_state_ptr trace_state) {
-    insert_querier(_entries, _shard_mutation_querier_index, _stats, _max_queriers_memory_usage, key, std::move(q), lowres_clock::now() + _entry_ttl,
+    insert_querier(_sem, _entries, _shard_mutation_querier_index, _stats, _max_queriers_memory_usage, key, std::move(q), lowres_clock::now() + _entry_ttl,
            std::move(trace_state));
 }

 template <typename Querier>
-static std::optional<Querier> lookup_querier(querier_cache::entries& entries,
+static std::optional<Querier> lookup_querier(
+        reader_concurrency_semaphore& sem,
+        querier_cache::entries& entries,
        querier_cache::index& index,
        querier_cache::stats& stats,
        utils::UUID key,
@@ -294,6 +324,7 @@ static std::optional<Querier> lookup_querier(querier_cache::entries& entries,
    }

    auto q = std::move(*it).template value<Querier>();
+    sem.unregister_inactive_read(it->get_inactive_handle());
    entries.erase(it);
    --stats.population;

@@ -313,7 +344,7 @@ std::optional<data_querier> querier_cache::lookup_data_querier(utils::UUID key,
        const dht::partition_range& range,
        const query::partition_slice& slice,
        tracing::trace_state_ptr trace_state) {
-    return lookup_querier<data_querier>(_entries, _data_querier_index, _stats, key, s, range, slice, std::move(trace_state));
+    return lookup_querier<data_querier>(_sem, _entries, _data_querier_index, _stats, key, s, range, slice, std::move(trace_state));
 }

 std::optional<mutation_querier> querier_cache::lookup_mutation_querier(utils::UUID key,
@@ -321,7 +352,7 @@ std::optional<mutation_querier> querier_cache::lookup_mutation_querier(utils::UU
        const dht::partition_range& range,
        const query::partition_slice& slice,
        tracing::trace_state_ptr trace_state) {
-    return lookup_querier<mutation_querier>(_entries, _mutation_querier_index, _stats, key, s, range, slice, std::move(trace_state));
+    return lookup_querier<mutation_querier>(_sem, _entries, _mutation_querier_index, _stats, key, s, range, slice, std::move(trace_state));
 }

 std::optional<shard_mutation_querier> querier_cache::lookup_shard_mutation_querier(utils::UUID key,
@@ -329,7 +360,8 @@ std::optional<shard_mutation_querier> querier_cache::lookup_shard_mutation_queri
        const dht::partition_range_vector& ranges,
        const query::partition_slice& slice,
        tracing::trace_state_ptr trace_state) {
-    return lookup_querier<shard_mutation_querier>(_entries, _shard_mutation_querier_index, _stats, key, s, ranges, slice, std::move(trace_state));
+    return lookup_querier<shard_mutation_querier>(_sem, _entries, _shard_mutation_querier_index, _stats, key, s, ranges, slice,
+            std::move(trace_state));
 }

 void querier_cache::set_entry_ttl(std::chrono::seconds entry_ttl) {
@@ -344,6 +376,7 @@ bool querier_cache::evict_one() {

    ++_stats.resource_based_evictions;
    --_stats.population;
+    _sem.unregister_inactive_read(_entries.front().get_inactive_handle());
    _entries.pop_front();

    return true;
@@ -355,6 +388,7 @@ void querier_cache::evict_all_for_table(const utils::UUID& schema_id) {
    while (it != end) {
        if (it->schema().id() == schema_id) {
            --_stats.population;
+            _sem.unregister_inactive_read(it->get_inactive_handle());
            it = _entries.erase(it);
        } else {
            ++it;
--- a/querier.hh
+++ b/querier.hh
@@ -291,9 +291,8 @@ public:
 /// Inserted queriers will have a TTL. When this expires the querier is
 /// evicted. This is to avoid excess and unnecessary resource usage due to
 /// abandoned queriers.
-/// Provides a way to evict readers one-by-one via `evict_one()`. This can be
-/// used by the concurrency-limiting code to evict cached readers to free up
-/// resources for admitting new ones.
+/// Registers cached readers with the reader concurrency semaphore, as inactive
+/// readers, so the latter can evict them if needed.
 /// Keeps the total memory consumption of cached queriers
 /// below max_queriers_memory_usage by evicting older entries upon inserting
 /// new ones if the the memory consupmtion would go above the limit.
@@ -327,6 +326,7 @@ public:
        const utils::UUID _key;
        const lowres_clock::time_point _expires;
        std::variant<data_querier, mutation_querier, shard_mutation_querier> _value;
+        std::optional<reader_concurrency_semaphore::inactive_read_handle> _handle;

    public:
        template <typename Querier>
@@ -344,6 +344,14 @@ public:
            _pos = pos;
        }

+        void set_inactive_handle(reader_concurrency_semaphore::inactive_read_handle handle) {
+            _handle = std::move(handle);
+        }
+
+        reader_concurrency_semaphore::inactive_read_handle get_inactive_handle() const {
+            return *_handle;
+        }
+
        const utils::UUID& key() const {
            return _key;
        }
@@ -391,6 +399,7 @@ public:
          boost::intrusive::constant_time_size<false>>;

 private:
+    reader_concurrency_semaphore& _sem;
    entries _entries;
    index _data_querier_index;
    index _mutation_querier_index;
@@ -403,7 +412,7 @@ private:
    void scan_cache_entries();

 public:
-    explicit querier_cache(size_t max_cache_size = 1'000'000, std::chrono::seconds entry_ttl = default_entry_ttl);
+    explicit querier_cache(reader_concurrency_semaphore& sem, size_t max_cache_size = 1'000'000, std::chrono::seconds entry_ttl = default_entry_ttl);

    querier_cache(const querier_cache&) = delete;
    querier_cache& operator=(const querier_cache&) = delete;
--- a/query-request.hh
+++ b/query-request.hh
@@ -136,6 +136,9 @@ public:
    const clustering_row_ranges& row_ranges(const schema&, const partition_key&) const;
    void set_range(const schema&, const partition_key&, clustering_row_ranges);
    void clear_range(const schema&, const partition_key&);
+    void clear_ranges() {
+        _specific_ranges = nullptr;
+    }
    // FIXME: possibly make this function return a const ref instead.
    clustering_row_ranges get_all_ranges() const;

--- a/reader_concurrency_semaphore.cc
+++ b/reader_concurrency_semaphore.cc
@@ -0,0 +1,181 @@
+/*
+ * Copyright (C) 2018 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <seastar/core/reactor.hh>
+
+#include "reader_concurrency_semaphore.hh"
+
+void reader_concurrency_semaphore::signal(const resources& r) {
+    _resources += r;
+    while (!_wait_list.empty() && has_available_units(_wait_list.front().res)) {
+        auto& x = _wait_list.front();
+        _resources -= x.res;
+        x.pr.set_value(make_lw_shared<reader_permit>(*this, x.res));
+        _wait_list.pop_front();
+    }
+}
+
+reader_concurrency_semaphore::inactive_read_handle reader_concurrency_semaphore::register_inactive_read(std::unique_ptr<inactive_read> ir) {
+    // A non-empty _wait_list implies _inactive_reads.empty(): we don't queue
+    // new readers before evicting all inactive reads. So register only when nobody waits.
+    if (_wait_list.empty()) {
+        const auto [it, _] = _inactive_reads.emplace(_next_id++, std::move(ir));
+        (void)_;
+        return inactive_read_handle(it->first);
+    }
+
+    // The evicted reader will release its permit, hopefully allowing us to
+    // admit some readers from the _wait_list.
+    ir->evict();
+    return inactive_read_handle();
+}
+
+std::unique_ptr<reader_concurrency_semaphore::inactive_read> reader_concurrency_semaphore::unregister_inactive_read(inactive_read_handle irh) {
+    if (auto it = _inactive_reads.find(irh._id); it != _inactive_reads.end()) {
+        auto ir = std::move(it->second);
+        _inactive_reads.erase(it);
+        return ir;
+    }
+    return {};
+}
+
+bool reader_concurrency_semaphore::try_evict_one_inactive_read() {
+    if (_inactive_reads.empty()) {
+        return false;
+    }
+    auto it = _inactive_reads.begin();
+    it->second->evict();
+    _inactive_reads.erase(it);
+    return true;
+}
+
+future<lw_shared_ptr<reader_concurrency_semaphore::reader_permit>> reader_concurrency_semaphore::wait_admission(size_t memory,
+        db::timeout_clock::time_point timeout) {
+    if (_wait_list.size() >= _max_queue_length) {
+        return make_exception_future<lw_shared_ptr<reader_permit>>(_make_queue_overloaded_exception());
+    }
+    auto r = resources(1, static_cast<ssize_t>(memory));
+    auto it = _inactive_reads.begin();
+    while (!may_proceed(r) && it != _inactive_reads.end()) {
+        auto ir = std::move(it->second);
+        it = _inactive_reads.erase(it);
+        ir->evict();
+    }
+    if (may_proceed(r)) {
+        _resources -= r;
+        return make_ready_future<lw_shared_ptr<reader_permit>>(make_lw_shared<reader_permit>(*this, r));
+    }
+    promise<lw_shared_ptr<reader_permit>> pr;
+    auto fut = pr.get_future();
+    _wait_list.push_back(entry(std::move(pr), r), timeout);
+    return fut;
+}
+
+// A file wrapper that charges buffers returned by dma_read_bulk() to a reader
+// permit; every other operation is forwarded unchanged to the wrapped file.
+class tracking_file_impl : public file_impl {
+    file _tracked_file;
+    lw_shared_ptr<reader_concurrency_semaphore::reader_permit> _permit;
+
+    // Re-wraps buf with a deleter bound to signal_memory(buf.size()) on _permit. Must not be called when _permit is null.
+    temporary_buffer<uint8_t> make_tracked_buf(temporary_buffer<uint8_t> buf) {
+        return seastar::temporary_buffer<uint8_t>(buf.get_write(),
+                buf.size(),
+                make_deleter(buf.release(), std::bind(&reader_concurrency_semaphore::reader_permit::signal_memory, _permit, buf.size())));
+    }
+
+public:
+    tracking_file_impl(file file, reader_resource_tracker resource_tracker)
+        : _tracked_file(std::move(file))
+        , _permit(resource_tracker.get_permit()) {
+    }
+
+    tracking_file_impl(const tracking_file_impl&) = delete;
+    tracking_file_impl& operator=(const tracking_file_impl&) = delete;
+    tracking_file_impl(tracking_file_impl&&) = default;
+    tracking_file_impl& operator=(tracking_file_impl&&) = default;
+
+    virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override {
+        return get_file_impl(_tracked_file)->write_dma(pos, buffer, len, pc);
+    }
+
+    virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override {
+        return get_file_impl(_tracked_file)->write_dma(pos, std::move(iov), pc);
+    }
+
+    virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) override {
+        return get_file_impl(_tracked_file)->read_dma(pos, buffer, len, pc);
+    }
+
+    virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override {
+        return get_file_impl(_tracked_file)->read_dma(pos, std::move(iov), pc); // by-value sink: move, as write_dma does
+    }
+
+    virtual future<> flush(void) override {
+        return get_file_impl(_tracked_file)->flush();
+    }
+
+    virtual future<struct stat> stat(void) override {
+        return get_file_impl(_tracked_file)->stat();
+    }
+
+    virtual future<> truncate(uint64_t length) override {
+        return get_file_impl(_tracked_file)->truncate(length);
+    }
+
+    virtual future<> discard(uint64_t offset, uint64_t length) override {
+        return get_file_impl(_tracked_file)->discard(offset, length);
+    }
+
+    virtual future<> allocate(uint64_t position, uint64_t length) override {
+        return get_file_impl(_tracked_file)->allocate(position, length);
+    }
+
+    virtual future<uint64_t> size(void) override {
+        return get_file_impl(_tracked_file)->size();
+    }
+
+    virtual future<> close() override {
+        return get_file_impl(_tracked_file)->close();
+    }
+
+    virtual std::unique_ptr<file_handle_impl> dup() override {
+        return get_file_impl(_tracked_file)->dup();
+    }
+
+    virtual subscription<directory_entry> list_directory(std::function<future<> (directory_entry de)> next) override {
+        return get_file_impl(_tracked_file)->list_directory(std::move(next));
+    }
+
+    virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc) override {
+        return get_file_impl(_tracked_file)->dma_read_bulk(offset, range_size, pc).then([this] (temporary_buffer<uint8_t> buf) {
+            if (_permit) {
+                buf = make_tracked_buf(std::move(buf));
+                _permit->consume_memory(buf.size());
+            }
+            return make_ready_future<temporary_buffer<uint8_t>>(std::move(buf));
+        });
+    }
+};
+
+file reader_resource_tracker::track(file f) const {
+    return file(make_shared<tracking_file_impl>(f, *this));
+}
--- a/reader_concurrency_semaphore.hh
+++ b/reader_concurrency_semaphore.hh
@@ -21,10 +21,14 @@

 #pragma once

-#include <core/file.hh>
-#include <core/semaphore.hh>
+#include <map>
+#include <seastar/core/file.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/semaphore.hh>
 #include "db/timeout_clock.hh"

+using namespace seastar;
+
 /// Specific semaphore for controlling reader concurrency
 ///
 /// Before creating a reader one should obtain a permit by calling
@@ -109,13 +113,24 @@ public:
        }
    };

+    class inactive_read {
+    public:
+        virtual void evict() = 0;
+        virtual ~inactive_read() = default;
+    };
+
+    class inactive_read_handle {
+        uint64_t _id = 0;
+
+        friend class reader_concurrency_semaphore;
+
+        inactive_read_handle() = default;
+        explicit inactive_read_handle(uint64_t id)
+            : _id(id) {
+        }
+    };
+
 private:
-    static std::exception_ptr default_make_queue_overloaded_exception() {
-        return std::make_exception_ptr(std::runtime_error("restricted mutation reader queue overload"));
-    }
-
-    resources _resources;
-
    struct entry {
        promise<lw_shared_ptr<reader_permit>> pr;
        resources res;
@@ -126,11 +141,21 @@ private:
            e.pr.set_exception(semaphore_timed_out());
        }
    };
+
+private:
+    resources _resources;
+
    expiring_fifo<entry, expiry_handler, db::timeout_clock> _wait_list;

    size_t _max_queue_length = std::numeric_limits<size_t>::max();
-    std::function<std::exception_ptr()> _make_queue_overloaded_exception = default_make_queue_overloaded_exception;
-    std::function<bool()> _evict_an_inactive_reader;
+    std::function<std::exception_ptr()> _make_queue_overloaded_exception;
+    uint64_t _next_id = 1;
+    std::map<uint64_t, std::unique_ptr<inactive_read>> _inactive_reads;
+
+private:
+    static std::exception_ptr default_make_queue_overloaded_exception() {
+        return std::make_exception_ptr(std::runtime_error("restricted mutation reader queue overload"));
+    }

    bool has_available_units(const resources& r) const {
        return bool(_resources) && _resources >= r;
@@ -153,12 +178,10 @@ public:
    reader_concurrency_semaphore(unsigned count,
            size_t memory,
            size_t max_queue_length = std::numeric_limits<size_t>::max(),
-            std::function<std::exception_ptr()> raise_queue_overloaded_exception = default_make_queue_overloaded_exception,
-            std::function<bool()> evict_an_inactive_reader = {})
+            std::function<std::exception_ptr()> raise_queue_overloaded_exception = default_make_queue_overloaded_exception)
        : _resources(count, memory)
        , _max_queue_length(max_queue_length)
-        , _make_queue_overloaded_exception(raise_queue_overloaded_exception)
-        , _evict_an_inactive_reader(std::move(evict_an_inactive_reader)) {
+        , _make_queue_overloaded_exception(raise_queue_overloaded_exception) {
    }

    reader_concurrency_semaphore(const reader_concurrency_semaphore&) = delete;
@@ -167,6 +190,35 @@ public:
    reader_concurrency_semaphore(reader_concurrency_semaphore&&) = delete;
    reader_concurrency_semaphore& operator=(reader_concurrency_semaphore&&) = delete;

+    /// Register an inactive read.
+    ///
+    /// The semaphore will evict this read when there is a shortage of
+    /// permits. This might be immediate, during this register call.
+    /// Clients can use the returned handle to unregister the read, when it
+    /// stops being inactive and hence evictable.
+    ///
+    /// An inactive read is an object implementing the `inactive_read`
+    /// interface.
+    /// The semaphore takes ownership of the created object and destroys it if
+    /// it is evicted.
+    inactive_read_handle register_inactive_read(std::unique_ptr<inactive_read> ir);
+
+    /// Unregister the previously registered inactive read.
+    ///
+    /// If the read was not evicted, the inactive read object, passed in to the
+    /// register call, will be returned. Otherwise a nullptr is returned.
+    std::unique_ptr<inactive_read> unregister_inactive_read(inactive_read_handle irh);
+
+    /// Try to evict an inactive read.
+    ///
+    /// Return true if an inactive read was evicted and false otherwise
+    /// (if there was no reader to evict).
+    bool try_evict_one_inactive_read();
+
+    void clear_inactive_reads() {
+        _inactive_reads.clear();
+    }
+
    future<lw_shared_ptr<reader_permit>> wait_admission(size_t memory, db::timeout_clock::time_point timeout = db::no_timeout);

    const resources available_resources() const {
--- a/repair/repair.cc
+++ b/repair/repair.cc
@@ -25,6 +25,7 @@
 #include "atomic_cell_hash.hh"
 #include "streaming/stream_plan.hh"
 #include "streaming/stream_state.hh"
+#include "streaming/stream_reason.hh"
 #include "gms/inet_address.hh"
 #include "db/config.hh"
 #include "service/storage_service.hh"
@@ -95,8 +96,8 @@ public:
    future<> do_streaming() {
        size_t ranges_in = 0;
        size_t ranges_out = 0;
-        _sp_in = make_lw_shared<streaming::stream_plan>(sprint("repair-in-id-%d-shard-%d-index-%d", id, shard, sp_index));
-        _sp_out = make_lw_shared<streaming::stream_plan>(sprint("repair-out-id-%d-shard-%d-index-%d", id, shard, sp_index));
+        _sp_in = make_lw_shared<streaming::stream_plan>(sprint("repair-in-id-%d-shard-%d-index-%d", id, shard, sp_index), streaming::stream_reason::repair);
+        _sp_out = make_lw_shared<streaming::stream_plan>(sprint("repair-out-id-%d-shard-%d-index-%d", id, shard, sp_index), streaming::stream_reason::repair);

        for (auto& x : ranges_need_repair_in) {
            auto& peer = x.first;
--- a/schema.cc
+++ b/schema.cc
@@ -193,7 +193,9 @@ const std::vector<column_definition>& v3_columns::all_columns() const {
 void schema::rebuild() {
    _partition_key_type = make_lw_shared<compound_type<>>(get_column_types(partition_key_columns()));
    _clustering_key_type = make_lw_shared<compound_prefix>(get_column_types(clustering_key_columns()));
-
+    _clustering_key_size = column_offset(column_kind::static_column) - column_offset(column_kind::clustering_key);
+    _regular_column_count = _raw._columns.size() - column_offset(column_kind::regular_column);
+    _static_column_count = column_offset(column_kind::regular_column) - column_offset(column_kind::static_column);
    _columns_by_name.clear();

    for (const column_definition& def : all_columns()) {
@@ -1121,26 +1123,26 @@ schema::has_static_columns() const {
    return !static_columns().empty();
 }

+column_count_type
+schema::columns_count(column_kind kind) const {
+    switch (kind) {
+    case column_kind::partition_key:
+        return partition_key_size();
+    case column_kind::clustering_key:
+        return clustering_key_size();
+    case column_kind::static_column:
+        return static_columns_count();
+    case column_kind::regular_column:
+        return regular_columns_count();
+    default:
+        std::abort();
+    }
+}
 column_count_type
 schema::partition_key_size() const {
    return column_offset(column_kind::clustering_key);
 }

-column_count_type
-schema::clustering_key_size() const {
-    return column_offset(column_kind::static_column) - column_offset(column_kind::clustering_key);
-}
-
-column_count_type
-schema::static_columns_count() const {
-    return column_offset(column_kind::regular_column) - column_offset(column_kind::static_column);
-}
-
-column_count_type
-schema::regular_columns_count() const {
-    return _raw._columns.size() - column_offset(column_kind::regular_column);
-}
-
 schema::const_iterator_range_type
 schema::partition_key_columns() const {
    return boost::make_iterator_range(_raw._columns.begin() + column_offset(column_kind::partition_key)
--- a/schema.hh
+++ b/schema.hh
@@ -529,6 +529,9 @@ private:
    lw_shared_ptr<compound_type<allow_prefixes::yes>> _clustering_key_type;
    column_mapping _column_mapping;
    shared_ptr<query::partition_slice> _full_slice;
+    column_count_type _clustering_key_size;
+    column_count_type _regular_column_count;
+    column_count_type _static_column_count;

    extensions_map& extensions() {
        return _raw._extensions;
@@ -701,10 +704,11 @@ public:
    bool is_last_partition_key(const column_definition& def) const;
    bool has_multi_cell_collections() const;
    bool has_static_columns() const;
+    column_count_type columns_count(column_kind kind) const;
    column_count_type partition_key_size() const;
-    column_count_type clustering_key_size() const;
-    column_count_type static_columns_count() const;
-    column_count_type regular_columns_count() const;
+    column_count_type clustering_key_size() const { return _clustering_key_size; } // cached; recomputed by schema::rebuild()
+    column_count_type static_columns_count() const { return _static_column_count; } // cached; recomputed by schema::rebuild()
+    column_count_type regular_columns_count() const { return _regular_column_count; } // cached; recomputed by schema::rebuild()
    // Returns a range of column definitions
    const_iterator_range_type partition_key_columns() const;
    // Returns a range of column definitions
--- a/2
+++ b/2
--- a/service/migration_manager.cc
+++ b/service/migration_manager.cc
@@ -514,7 +514,12 @@ future<> migration_manager::announce_new_keyspace(lw_shared_ptr<keyspace_metadat
    return announce(std::move(mutations), announce_locally);
 }

-future<> migration_manager::announce_new_column_family(schema_ptr cfm, bool announce_locally) {
+future<> migration_manager::announce_new_column_family(schema_ptr cfm, bool announce_locally)
+{ // convenience overload: delegates to the timestamp-taking overload with api::new_timestamp()
+    return announce_new_column_family(std::move(cfm), api::new_timestamp(), announce_locally);
+}
+
+future<> migration_manager::announce_new_column_family(schema_ptr cfm, api::timestamp_type timestamp, bool announce_locally) {
 #if 0
    cfm.validate();
 #endif
@@ -525,7 +530,7 @@ future<> migration_manager::announce_new_column_family(schema_ptr cfm, bool anno
            throw exceptions::already_exists_exception(cfm->ks_name(), cfm->cf_name());
        }
        mlogger.info("Create new ColumnFamily: {}", cfm);
-        return db::schema_tables::make_create_table_mutations(keyspace.metadata(), cfm, api::new_timestamp())
+        return db::schema_tables::make_create_table_mutations(keyspace.metadata(), cfm, timestamp)
            .then([announce_locally, this] (auto&& mutations) {
                return announce(std::move(mutations), announce_locally);
            });
--- a/service/migration_manager.hh
+++ b/service/migration_manager.hh
@@ -111,6 +111,8 @@ public:

    future<> announce_new_column_family(schema_ptr cfm, bool announce_locally = false);

+    future<> announce_new_column_family(schema_ptr cfm, api::timestamp_type timestamp, bool announce_locally = false);
+
    future<> announce_new_type(user_type new_type, bool announce_locally = false);

    future<> announce_type_update(user_type updated_type, bool announce_locally = false);
--- a/service/pager/query_pager.hh
+++ b/service/pager/query_pager.hh
@@ -151,6 +151,10 @@ protected:
    void handle_result(Visitor&& visitor,
                      const foreign_ptr<lw_shared_ptr<query::result>>& results,
                      uint32_t page_size, gc_clock::time_point now);
+
+    virtual uint32_t max_rows_to_fetch(uint32_t page_size) { // row limit used for one page fetch; virtual so derived pagers can override it
+        return std::min(_max, page_size); // never more than _max, even when the page is larger
+    }
 };

 }
--- a/service/pager/query_pagers.cc
+++ b/service/pager/query_pagers.cc
@@ -55,7 +55,7 @@ struct noop_visitor {
    void accept_new_partition(const partition_key& key, uint32_t row_count) { }
    void accept_new_row(const clustering_key& key, const query::result_row_view& static_row, const query::result_row_view& row) { }
    void accept_new_row(const query::result_row_view& static_row, const query::result_row_view& row) { }
-    void accept_partition_end(const query::result_row_view& static_row) { }
+    uint32_t accept_partition_end(const query::result_row_view& static_row) { return 0; } // always 0; query_result_visitor adds this return value to dropped_rows
 };

 static bool has_clustering_keys(const schema& s, const query::read_command& cmd) {
@@ -202,7 +202,7 @@ static bool has_clustering_keys(const schema& s, const query::read_command& cmd)
            }
        }

-        auto max_rows = std::min(_max, page_size);
+        auto max_rows = max_rows_to_fetch(page_size);

        // We always need PK so we can determine where to start next.
        _cmd->slice.options.set<query::partition_slice::option::send_partition_key>();
@@ -284,6 +284,10 @@ public:
                          std::move(qr.query_result), page_size, now);
        });
    }
+protected:
+    virtual uint32_t max_rows_to_fetch(uint32_t page_size) override { // unlike query_pager's version, does not cap the request by _max
+        return page_size; // always ask for a full page
+    }
 };

 template<typename Base>
@@ -291,6 +295,7 @@ class query_pager::query_result_visitor : public Base {
    using visitor = Base;
 public:
    uint32_t total_rows = 0;
+    uint32_t dropped_rows = 0;
    std::experimental::optional<partition_key> last_pkey;
    std::experimental::optional<clustering_key> last_ckey;

@@ -317,7 +322,7 @@ public:
        visitor::accept_new_row(static_row, row);
    }
    void accept_partition_end(const query::result_row_view& static_row) {
-        visitor::accept_partition_end(static_row);
+        dropped_rows += visitor::accept_partition_end(static_row); // accumulate the count returned by the wrapped visitor
    }
 };

@@ -348,9 +353,9 @@ public:
                update_slice(*_last_pkey);
            }

-            row_count = v.total_rows;
+            row_count = v.total_rows - v.dropped_rows;
            _max = _max - row_count;
-            _exhausted = (v.total_rows < page_size && !results->is_short_read()) || _max == 0;
+            _exhausted = (v.total_rows < page_size && !results->is_short_read() && v.dropped_rows == 0) || _max == 0;
            _last_pkey = v.last_pkey;
            _last_ckey = v.last_ckey;
        } else {
@@ -379,7 +384,7 @@ public:
    }

    ::shared_ptr<const paging_state> query_pager::state() const {
-        return ::make_shared<paging_state>(*_last_pkey, _last_ckey, _exhausted ? 0 : _max, _cmd->query_uuid, _last_replicas, _query_read_repair_decision);
+        return ::make_shared<paging_state>(_last_pkey.value_or(partition_key::make_empty()), _last_ckey, _exhausted ? 0 : _max, _cmd->query_uuid, _last_replicas, _query_read_repair_decision); // _last_pkey may be unset: use an empty key instead of dereferencing it
    }

 }
--- a/service/storage_proxy.cc
+++ b/service/storage_proxy.cc
@@ -64,6 +64,7 @@
 #include <boost/range/adaptors.hpp>
 #include <boost/algorithm/cxx11/any_of.hpp>
 #include <boost/algorithm/cxx11/none_of.hpp>
+#include <boost/algorithm/cxx11/partition_copy.hpp>
 #include <boost/range/algorithm/count_if.hpp>
 #include <boost/range/algorithm/find.hpp>
 #include <boost/range/algorithm/find_if.hpp>
@@ -170,10 +171,13 @@ public:
 class shared_mutation : public mutation_holder {
    lw_shared_ptr<const frozen_mutation> _mutation;
 public:
-    shared_mutation(const mutation& m) : _mutation(make_lw_shared<const frozen_mutation>(freeze(m))) {
+    explicit shared_mutation(frozen_mutation_and_schema&& fm_a_s)
+            : _mutation(make_lw_shared<const frozen_mutation>(std::move(fm_a_s.fm))) { // adopts the already-frozen mutation; freeze() is not called here
        _size = _mutation->representation().size();
-        _schema = m.schema();
-    };
+        _schema = std::move(fm_a_s.s); // the schema is taken from the same pair as the frozen mutation
+    }
+    explicit shared_mutation(const mutation& m) : shared_mutation(frozen_mutation_and_schema{freeze(m), m.schema()}) { // freezes `m`, then delegates to the frozen_mutation_and_schema constructor
+    }
    lw_shared_ptr<const frozen_mutation> get_mutation_for(gms::inet_address ep) override {
        return _mutation;
    }
@@ -206,7 +210,8 @@ protected:
        FAILURE,
    };
    error _error = error::NONE;
-    size_t _failed = 0;
+    size_t _failed = 0; // only failures that may impact consistency
+    size_t _all_failures = 0; // total amount of failures
    size_t _total_endpoints = 0;
    storage_proxy::write_stats& _stats;

@@ -295,6 +300,50 @@ public:
        }
        return _targets.size() == 0;
    }
+    // Records `count` write failures reported by replica `from`.
+    // Returns true if the handler is no longer needed because CL cannot be reached.
+    bool failure_response(gms::inet_address from, size_t count) {
+        auto it = _targets.find(from);
+        if (it == _targets.end()) {
+            // There is a small chance that we get an outdated reply here
+            // if the coordinator was restarted after sending a request and
+            // getting a reply back. The chance is low though, since the initial
+            // request id is initialized to the server starting time.
+            slogger.warn("Receive outdated write failure from {}", from);
+            return false;
+        }
+        _all_failures += count; // counted for every CL, including ANY
+        // We should not fail CL=ANY requests, since they may still succeed after
+        // writing hints.
+        return _cl != db::consistency_level::ANY && failure(from, count);
+    }
+    void check_for_early_completion() { // called after a response or failure that did not finish the handler
+        if (_all_failures == _targets.size()) {
+            // all leftover targets have reported an error, so there is nothing left to wait for
+            timeout_cb();
+        }
+    }
+    void timeout_cb() { // expiry path: run by rh_entry's expire_timer and by check_for_early_completion()
+        if (_cl_achieved || _cl == db::consistency_level::ANY) {
+            // We are here either because CL was achieved but the targets left in the handler are not
+            // responding, so a hint should be written for them, or because cl == ANY, in which case
+            // hints count towards consistency, so we need to write hints and count how many were written.
+            auto hints = _proxy->hint_to_dead_endpoints(_mutation_holder, get_targets(), _type, get_trace_state());
+            signal(hints);
+            if (_cl == db::consistency_level::ANY && hints) {
+                slogger.trace("Wrote hint to satisfy CL.ANY after no replicas acknowledged the write");
+            }
+            if (_cl_achieved) { // For CL=ANY this can still be false
+                for (auto&& ep : get_targets()) {
+                    ++stats().background_replica_writes_failed.get_ep_stat(ep);
+                }
+                stats().background_writes_failed += int(!_targets.empty()); // adds 1 per call, and only if some target is still in _targets
+            }
+        }
+
+        on_timeout();
+        _proxy->remove_response_handler(_id); // NOTE(review): presumably releases this handler; keep it the last statement - confirm
+    }
    future<> wait() {
        return _ready.get_future();
    }
@@ -319,6 +368,9 @@ public:
    const tracing::trace_state_ptr& get_trace_state() const {
        return _trace_state;
    }
+    storage_proxy::write_stats& stats() { // accessor for the write_stats reference held in _stats
+        return _stats;
+    }
    friend storage_proxy;
 };

@@ -458,22 +510,7 @@ void storage_proxy::unthrottle() {

 storage_proxy::response_id_type storage_proxy::register_response_handler(shared_ptr<abstract_write_response_handler>&& h) {
    auto id = h->id();
-    auto e = _response_handlers.emplace(id, rh_entry(std::move(h), [this, id] {
-        auto& e = _response_handlers.find(id)->second;
-        if (e.handler->_cl_achieved || e.handler->_cl == db::consistency_level::ANY) {
-            // we are here because either cl was achieved, but targets left in the handler are not
-            // responding, so a hint should be written for them, or cl == any in which case
-            // hints are counted towards consistency, so we need to write hints and count how much was written
-            auto hints = hint_to_dead_endpoints(e.handler->_mutation_holder, e.handler->get_targets(), e.handler->_type, e.handler->get_trace_state());
-            e.handler->signal(hints);
-            if (e.handler->_cl == db::consistency_level::ANY && hints) {
-                slogger.trace("Wrote hint to satisfy CL.ANY after no replicas acknowledged the write");
-            }
-        }
-
-        e.handler->on_timeout();
-        remove_response_handler(id);
-    }));
+    auto e = _response_handlers.emplace(id, std::move(h)); // rh_entry is built in place from the handler; its timer callback is now handler->timeout_cb()
    assert(e.second);
    return id;
 }
@@ -488,6 +525,8 @@ void storage_proxy::got_response(storage_proxy::response_id_type id, gms::inet_a
        tracing::trace(it->second.handler->get_trace_state(), "Got a response from /{}", from);
        if (it->second.handler->response(from)) {
            remove_response_handler(id); // last one, remove entry. Will cancel expiration timer too.
+        } else {
+            it->second.handler->check_for_early_completion();
        }
    }
 }
@@ -496,8 +535,10 @@ void storage_proxy::got_failure_response(storage_proxy::response_id_type id, gms
    auto it = _response_handlers.find(id);
    if (it != _response_handlers.end()) {
        tracing::trace(it->second.handler->get_trace_state(), "Got {} failures from /{}", count, from);
-        if (it->second.handler->failure(from, count)) {
-            remove_response_handler(id); // last one, remove entry. Will cancel expiration timer too.
+        if (it->second.handler->failure_response(from, count)) {
+            remove_response_handler(id);
+        } else {
+            it->second.handler->check_for_early_completion();
        }
    }
 }
@@ -544,22 +585,26 @@ storage_proxy_stats::split_stats::split_stats(const sstring& category, const sst
 storage_proxy_stats::write_stats::write_stats()
 : writes_attempts(storage_proxy::COORDINATOR_STATS_CATEGORY, "total_write_attempts", "total number of write requests", "mutation_data")
 , writes_errors(storage_proxy::COORDINATOR_STATS_CATEGORY, "write_errors", "number of write requests that failed", "mutation_data")
+, background_replica_writes_failed(storage_proxy::COORDINATOR_STATS_CATEGORY, "background_replica_writes_failed", "number of replica writes that timed out or failed after CL was reached", "mutation_data")
 , read_repair_write_attempts(storage_proxy::COORDINATOR_STATS_CATEGORY, "read_repair_write_attempts", "number of write operations in a read repair context", "mutation_data") { }

 storage_proxy_stats::write_stats::write_stats(const sstring& category, bool auto_register_stats)
        : writes_attempts(category, "total_write_attempts", "total number of write requests", "mutation_data", auto_register_stats)
        , writes_errors(category, "write_errors", "number of write requests that failed", "mutation_data", auto_register_stats)
+        , background_replica_writes_failed(category, "background_replica_writes_failed", "number of replica writes that timed out or failed after CL was reached", "mutation_data", auto_register_stats)
        , read_repair_write_attempts(category, "read_repair_write_attempts", "number of write operations in a read repair context", "mutation_data", auto_register_stats) { }

 void storage_proxy_stats::write_stats::register_metrics_local() {
    writes_attempts.register_metrics_local();
    writes_errors.register_metrics_local();
+    background_replica_writes_failed.register_metrics_local();
    read_repair_write_attempts.register_metrics_local();
 }

 void storage_proxy_stats::write_stats::register_metrics_for(gms::inet_address ep) {
    writes_attempts.register_metrics_for(ep);
    writes_errors.register_metrics_for(ep);
+    background_replica_writes_failed.register_metrics_for(ep);
    read_repair_write_attempts.register_metrics_for(ep);
 }

@@ -709,6 +754,9 @@ storage_proxy::storage_proxy(distributed<database>& db, storage_proxy::config cf

        sm::make_total_operations("speculative_data_reads", [this] { return _stats.speculative_data_reads; },
                       sm::description("number of speculative data read requests that were sent")),
+
+        sm::make_total_operations("background_writes_failed", [this] { return _stats.background_writes_failed; },
+                       sm::description("number of write requests that failed after CL was reached")),
    });

    _metrics.add_group(REPLICA_STATS_CATEGORY, {
@@ -753,7 +801,7 @@ storage_proxy::storage_proxy(distributed<database>& db, storage_proxy::config cf
    _hints_resource_manager.register_manager(_hints_for_views_manager);
 }

-storage_proxy::rh_entry::rh_entry(shared_ptr<abstract_write_response_handler>&& h, std::function<void()>&& cb) : handler(std::move(h)), expire_timer(std::move(cb)) {}
+storage_proxy::rh_entry::rh_entry(shared_ptr<abstract_write_response_handler>&& h) : handler(std::move(h)), expire_timer([this] { handler->timeout_cb(); }) {} // the timer callback captures `this`, so the entry must keep its address (its move constructor is deleted)

 storage_proxy::unique_response_handler::unique_response_handler(storage_proxy& p_, response_id_type id_) : id(id_), p(p_) {}
 storage_proxy::unique_response_handler::unique_response_handler(unique_response_handler&& x) : id(x.id), p(x.p) { x.id = 0; };
@@ -1292,28 +1340,28 @@ storage_proxy::hint_to_dead_endpoints(response_id_type id, db::consistency_level
 }

 template<typename Range, typename CreateWriteHandler>
-future<std::vector<storage_proxy::unique_response_handler>> storage_proxy::mutate_prepare(const Range& mutations, db::consistency_level cl, db::write_type type, CreateWriteHandler create_handler) {
+future<std::vector<storage_proxy::unique_response_handler>> storage_proxy::mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, CreateWriteHandler create_handler) {
    // apply is used to convert exceptions to exceptional future
-    return futurize<std::vector<storage_proxy::unique_response_handler>>::apply([this] (const Range& mutations, db::consistency_level cl, db::write_type type, CreateWriteHandler create_handler) {
+    return futurize<std::vector<storage_proxy::unique_response_handler>>::apply([this] (Range&& mutations, db::consistency_level cl, db::write_type type, CreateWriteHandler create_handler) {
        std::vector<unique_response_handler> ids;
        ids.reserve(std::distance(std::begin(mutations), std::end(mutations)));
        for (auto& m : mutations) {
            ids.emplace_back(*this, create_handler(m, cl, type));
        }
        return make_ready_future<std::vector<unique_response_handler>>(std::move(ids));
-    }, mutations, cl, type, std::move(create_handler));
+    }, std::forward<Range>(mutations), cl, type, std::move(create_handler));
 }

 template<typename Range>
-future<std::vector<storage_proxy::unique_response_handler>> storage_proxy::mutate_prepare(const Range& mutations, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state) {
-    return mutate_prepare<>(mutations, cl, type, [this, tr_state = std::move(tr_state)] (const typename Range::value_type& m, db::consistency_level cl, db::write_type type) mutable {
+future<std::vector<storage_proxy::unique_response_handler>> storage_proxy::mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state) {
+    return mutate_prepare<>(std::forward<Range>(mutations), cl, type, [this, tr_state = std::move(tr_state)] (const typename std::decay_t<Range>::value_type& m, db::consistency_level cl, db::write_type type) mutable {
        return create_write_response_handler(m, cl, type, tr_state);
    });
 }

 future<> storage_proxy::mutate_begin(std::vector<unique_response_handler> ids, db::consistency_level cl,
-                                     write_stats& stats, stdx::optional<clock_type::time_point> timeout_opt) {
-    return parallel_for_each(ids, [this, cl, timeout_opt, &stats] (unique_response_handler& protected_response) {
+                                     stdx::optional<clock_type::time_point> timeout_opt) {
+    return parallel_for_each(ids, [this, cl, timeout_opt] (unique_response_handler& protected_response) {
        auto response_id = protected_response.id;
        // it is better to send first and hint afterwards to reduce latency
        // but request may complete before hint_to_dead_endpoints() is called and
@@ -1324,7 +1372,7 @@ future<> storage_proxy::mutate_begin(std::vector<unique_response_handler> ids, d
        auto timeout = timeout_opt.value_or(clock_type::now() + std::chrono::milliseconds(_db.local().get_config().write_request_timeout_in_ms()));
        // call before send_to_live_endpoints() for the same reason as above
        auto f = response_wait(response_id, timeout);
-        send_to_live_endpoints(protected_response.release(), timeout, stats); // response is now running and it will either complete or timeout
+        send_to_live_endpoints(protected_response.release(), timeout); // response is now running and it will either complete or timeout
        return std::move(f);
    });
 }
@@ -1503,7 +1551,7 @@ storage_proxy::mutate_internal(Range mutations, db::consistency_level cl, bool c
    lc.start();

    return mutate_prepare(mutations, cl, type, tr_state).then([this, cl, timeout_opt] (std::vector<storage_proxy::unique_response_handler> ids) {
-        return mutate_begin(std::move(ids), cl, _stats, timeout_opt);
+        return mutate_begin(std::move(ids), cl, timeout_opt);
    }).then_wrapped([this, p = shared_from_this(), lc, tr_state] (future<> f) mutable {
        return p->mutate_end(std::move(f), lc, _stats, std::move(tr_state));
    });
@@ -1590,7 +1638,7 @@ storage_proxy::mutate_atomically(std::vector<mutation> mutations, db::consistenc
                auto& ks = _p._db.local().find_keyspace(m.schema()->ks_name());
                return _p.create_write_response_handler(ks, cl, type, std::make_unique<shared_mutation>(m), _batchlog_endpoints, {}, {}, _trace_state, _stats);
            }).then([this, cl] (std::vector<unique_response_handler> ids) {
-                return _p.mutate_begin(std::move(ids), cl, _stats, _timeout);
+                return _p.mutate_begin(std::move(ids), cl, _timeout);
            });
        }
        future<> sync_write_to_batchlog() {
@@ -1616,7 +1664,7 @@ storage_proxy::mutate_atomically(std::vector<mutation> mutations, db::consistenc
            return _p.mutate_prepare(_mutations, _cl, db::write_type::BATCH, _trace_state).then([this] (std::vector<unique_response_handler> ids) {
                return sync_write_to_batchlog().then([this, ids = std::move(ids)] () mutable {
                    tracing::trace(_trace_state, "Sending batch mutations");
-                    return _p.mutate_begin(std::move(ids), _cl, _stats, _timeout);
+                    return _p.mutate_begin(std::move(ids), _cl, _timeout);
                }).then(std::bind(&context::async_remove_from_batchlog, this));
            });
        }
@@ -1644,7 +1692,7 @@ bool storage_proxy::cannot_hint(const Range& targets, db::write_type type) {
 }

 future<> storage_proxy::send_to_endpoint(
-        mutation m,
+        std::unique_ptr<mutation_holder> m,
        gms::inet_address target,
        std::vector<gms::inet_address> pending_endpoints,
        db::write_type type,
@@ -1654,29 +1702,78 @@ future<> storage_proxy::send_to_endpoint(

    // View updates use consistency level ANY in order to fall back to hinted handoff in case of a failed update
    db::consistency_level cl = (type == db::write_type::VIEW) ? db::consistency_level::ANY : db::consistency_level::ONE;
-    std::unordered_set<gms::inet_address> targets(pending_endpoints.begin(), pending_endpoints.end());
-    targets.insert(std::move(target));
-    return mutate_prepare(std::array<mutation, 1>{std::move(m)}, cl, type,
-        [this, targets = std::move(targets), pending_endpoints = std::move(pending_endpoints), &stats] (
-                const mutation& m,
+    return mutate_prepare(std::array{std::move(m)}, cl, type,
+            [this, target = std::array{target}, pending_endpoints = std::move(pending_endpoints), &stats] (
+                std::unique_ptr<mutation_holder>& m,
                db::consistency_level cl,
                db::write_type type) mutable {
-            auto& ks = _db.local().find_keyspace(m.schema()->ks_name());
-            return create_write_response_handler(
-                    ks,
-                    cl,
-                    type,
-                    std::make_unique<shared_mutation>(m),
-                    std::move(targets),
-                    pending_endpoints,
-                    { },
-                    nullptr,
-                    stats);
-    }).then([this, &stats, cl] (std::vector<unique_response_handler> ids) {
-        return mutate_begin(std::move(ids), cl, stats);
+        std::unordered_set<gms::inet_address> targets;
+        targets.reserve(pending_endpoints.size() + 1);
+        std::vector<gms::inet_address> dead_endpoints;
+        boost::algorithm::partition_copy(
+                boost::range::join(pending_endpoints, target),
+                std::inserter(targets, targets.begin()),
+                std::back_inserter(dead_endpoints),
+                [] (gms::inet_address ep) { return gms::get_local_failure_detector().is_alive(ep); });
+        auto& ks = _db.local().find_keyspace(m->schema()->ks_name());
+        slogger.trace("Creating write handler with live: {}; dead: {}", targets, dead_endpoints);
+        db::assure_sufficient_live_nodes(cl, ks, targets, pending_endpoints);
+        return create_write_response_handler(
+            ks,
+            cl,
+            type,
+            std::move(m),
+            std::move(targets),
+            pending_endpoints,
+            std::move(dead_endpoints),
+            nullptr,
+            stats);
+    }).then([this, cl] (std::vector<unique_response_handler> ids) {
+        return mutate_begin(std::move(ids), cl);
    }).then_wrapped([p = shared_from_this(), lc, &stats] (future<>&& f) {
        return p->mutate_end(std::move(f), lc, stats, nullptr);
-        });
+    });
+}
+
+future<> storage_proxy::send_to_endpoint( // frozen-mutation overload that passes this proxy's own _stats
+        frozen_mutation_and_schema fm_a_s,
+        gms::inet_address target,
+        std::vector<gms::inet_address> pending_endpoints,
+        db::write_type type) {
+    return send_to_endpoint(
+            std::make_unique<shared_mutation>(std::move(fm_a_s)), // wrap in a holder without re-freezing
+            std::move(target),
+            std::move(pending_endpoints),
+            type,
+            _stats);
+}
+
+future<> storage_proxy::send_to_endpoint( // frozen-mutation overload that passes the caller-supplied `stats`
+        frozen_mutation_and_schema fm_a_s,
+        gms::inet_address target,
+        std::vector<gms::inet_address> pending_endpoints,
+        db::write_type type,
+        write_stats& stats) {
+    return send_to_endpoint(
+            std::make_unique<shared_mutation>(std::move(fm_a_s)), // wrap in a holder without re-freezing
+            std::move(target),
+            std::move(pending_endpoints),
+            type,
+            stats);
+}
+
+future<> storage_proxy::send_to_endpoint( // mutation overload: wraps `m` in a holder and forwards to the mutation_holder overload
+        mutation m,
+        gms::inet_address target,
+        std::vector<gms::inet_address> pending_endpoints,
+        db::write_type type,
+        write_stats& stats) {
+    return send_to_endpoint(
+            std::make_unique<shared_mutation>(m), // shared_mutation(const mutation&) freezes `m`
+            std::move(target),
+            std::move(pending_endpoints),
+            type,
+            stats);
 }

 future<> storage_proxy::send_to_endpoint(
@@ -1684,7 +1781,12 @@ future<> storage_proxy::send_to_endpoint(
        gms::inet_address target,
        std::vector<gms::inet_address> pending_endpoints,
        db::write_type type) {
-    return send_to_endpoint(std::move(m), std::move(target), std::move(pending_endpoints), type, _stats);
+    return send_to_endpoint(
+            std::make_unique<shared_mutation>(m),
+            std::move(target),
+            std::move(pending_endpoints),
+            type,
+            _stats);
 }

 /**
@@ -1702,7 +1804,7 @@ future<> storage_proxy::send_to_endpoint(
 * @throws OverloadedException if the hints cannot be written/enqueued
 */
 // returned future is ready when sent is complete, not when mutation is executed on all (or any) targets!
-void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type response_id, clock_type::time_point timeout, write_stats& stats)
+void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type response_id, clock_type::time_point timeout)
 {
    // extra-datacenter replicas, grouped by dc
    std::unordered_map<sstring, std::vector<gms::inet_address>> dc_groups;
@@ -1710,6 +1812,7 @@ void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type respo
    local.reserve(3);

    auto handler_ptr = get_write_response_handler(response_id);
+    auto& stats = handler_ptr->stats();
    auto& handler = *handler_ptr;

    for(auto dest: handler.get_targets()) {
@@ -3074,8 +3177,9 @@ storage_proxy::query_result_local(schema_ptr s, lw_shared_ptr<query::read_comman
        unsigned shard = _db.local().shard_of(pr.start()->value().token());
        _stats.replica_cross_shard_ops += shard != engine().cpu_id();
        return _db.invoke_on(shard, [max_size, gs = global_schema_ptr(s), prv = dht::partition_range_vector({pr}) /* FIXME: pr is copied */, cmd, opts, timeout, gt = tracing::global_trace_state_ptr(std::move(trace_state))] (database& db) mutable {
-            tracing::trace(gt, "Start querying the token range that starts with {}", seastar::value_of([&prv] { return prv.begin()->start()->value().token(); }));
-            return db.query(gs, *cmd, opts, prv, gt, max_size, timeout).then([trace_state = gt.get()](auto&& f, cache_temperature ht) {
+            auto trace_state = gt.get();
+            tracing::trace(trace_state, "Start querying the token range that starts with {}", seastar::value_of([&prv] { return prv.begin()->start()->value().token(); }));
+            return db.query(gs, *cmd, opts, prv, trace_state, max_size, timeout).then([trace_state](auto&& f, cache_temperature ht) {
                tracing::trace(trace_state, "Querying is done");
                return make_ready_future<foreign_ptr<lw_shared_ptr<query::result>>, cache_temperature>(make_foreign(std::move(f)), ht);
            });
@@ -4218,6 +4322,10 @@ future<> storage_proxy::start_hints_manager(shared_ptr<gms::gossiper> gossiper_p
    return _hints_resource_manager.start(shared_from_this(), gossiper_ptr, ss_ptr);
 }

+void storage_proxy::allow_replaying_hints() noexcept { // thin forwarder to the hints resource manager
+    return _hints_resource_manager.allow_replaying(); // `return <expr>` in a void function compiles only if allow_replaying() itself returns void
+}
+
 future<> storage_proxy::stop_hints_manager() {
    return _hints_resource_manager.stop();
 }
--- a/service/storage_proxy.hh
+++ b/service/storage_proxy.hh
@@ -83,7 +83,8 @@ private:
    struct rh_entry {
        ::shared_ptr<abstract_write_response_handler> handler;
        timer<clock_type> expire_timer;
-        rh_entry(::shared_ptr<abstract_write_response_handler>&& h, std::function<void()>&& cb);
+        rh_entry(::shared_ptr<abstract_write_response_handler>&& h); // sets handler->timeout_cb() as the expire_timer callback
+        rh_entry(rh_entry&&) = delete; // the timer callback captures `this`; moving the entry would leave that pointer dangling
    };

    using response_id_type = uint64_t;
@@ -187,7 +188,7 @@ private:
            const std::vector<gms::inet_address>& pending_endpoints, std::vector<gms::inet_address>, tracing::trace_state_ptr tr_state, storage_proxy::write_stats& stats);
    response_id_type create_write_response_handler(const mutation&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state);
    response_id_type create_write_response_handler(const std::unordered_map<gms::inet_address, std::experimental::optional<mutation>>&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state);
-    void send_to_live_endpoints(response_id_type response_id, clock_type::time_point timeout, write_stats& stats);
+    void send_to_live_endpoints(response_id_type response_id, clock_type::time_point timeout);
    template<typename Range>
    size_t hint_to_dead_endpoints(std::unique_ptr<mutation_holder>& mh, const Range& targets, db::write_type type, tracing::trace_state_ptr tr_state) noexcept;
    void hint_to_dead_endpoints(response_id_type, db::consistency_level);
@@ -239,10 +240,10 @@ private:
        db::consistency_level cl,
        coordinator_query_options optional_params);
    template<typename Range, typename CreateWriteHandler>
-    future<std::vector<unique_response_handler>> mutate_prepare(const Range& mutations, db::consistency_level cl, db::write_type type, CreateWriteHandler handler);
+    future<std::vector<unique_response_handler>> mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, CreateWriteHandler handler);
    template<typename Range>
-    future<std::vector<unique_response_handler>> mutate_prepare(const Range& mutations, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state);
-    future<> mutate_begin(std::vector<unique_response_handler> ids, db::consistency_level cl, write_stats& stats, stdx::optional<clock_type::time_point> timeout_opt = { });
+    future<std::vector<unique_response_handler>> mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state);
+    future<> mutate_begin(std::vector<unique_response_handler> ids, db::consistency_level cl, stdx::optional<clock_type::time_point> timeout_opt = { });
    future<> mutate_end(future<> mutate_result, utils::latency_counter, write_stats& stats, tracing::trace_state_ptr trace_state);
    future<> schedule_repair(std::unordered_map<dht::token, std::unordered_map<gms::inet_address, std::experimental::optional<mutation>>> diffs, db::consistency_level cl, tracing::trace_state_ptr trace_state);
    bool need_throttle_writes() const;
@@ -254,10 +255,6 @@ private:
            schema_ptr s, lw_shared_ptr<query::read_command> cmd, const dht::partition_range_vector&& pr, tracing::trace_state_ptr trace_state,
            uint64_t max_size, clock_type::time_point timeout);

-    struct frozen_mutation_and_schema {
-        frozen_mutation fm;
-        schema_ptr s;
-    };
    future<> mutate_counters_on_leader(std::vector<frozen_mutation_and_schema> mutations, db::consistency_level cl, clock_type::time_point timeout,
                                       tracing::trace_state_ptr trace_state);
    future<> mutate_counter_on_leader_and_replicate(const schema_ptr& s, frozen_mutation m, db::consistency_level cl, clock_type::time_point timeout,
@@ -266,6 +263,13 @@ private:
    gms::inet_address find_leader_for_counter_update(const mutation& m, db::consistency_level cl);

    future<> do_mutate(std::vector<mutation> mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, bool);
+
+    future<> send_to_endpoint(
+            std::unique_ptr<mutation_holder> m,
+            gms::inet_address target,
+            std::vector<gms::inet_address> pending_endpoints,
+            db::write_type type,
+            write_stats& stats);
 public:
    storage_proxy(distributed<database>& db, config cfg);
    ~storage_proxy();
@@ -338,6 +342,8 @@ public:
    // send_to_live_endpoints() - another take on the same original function.
    future<> send_to_endpoint(mutation m, gms::inet_address target, std::vector<gms::inet_address> pending_endpoints, db::write_type type, write_stats& stats);
    future<> send_to_endpoint(mutation m, gms::inet_address target, std::vector<gms::inet_address> pending_endpoints, db::write_type type);
+    future<> send_to_endpoint(frozen_mutation_and_schema fm_a_s, gms::inet_address target, std::vector<gms::inet_address> pending_endpoints, db::write_type type, write_stats& stats);
+    future<> send_to_endpoint(frozen_mutation_and_schema fm_a_s, gms::inet_address target, std::vector<gms::inet_address> pending_endpoints, db::write_type type);

    /**
     * Performs the truncate operatoin, which effectively deletes all data from
@@ -390,6 +396,7 @@ public:
    future<> stop();
    future<> stop_hints_manager();
    future<> start_hints_manager(shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
+    void allow_replaying_hints() noexcept;

    const stats& get_stats() const {
        return _stats;
--- a/service/storage_proxy_stats.hh
+++ b/service/storage_proxy_stats.hh
@@ -81,6 +81,7 @@ struct write_stats {
    // total write attempts
    split_stats writes_attempts;
    split_stats writes_errors;
+    split_stats background_replica_writes_failed;

    // write attempts due to Read Repair logic
    split_stats read_repair_write_attempts;
@@ -96,6 +97,7 @@ struct write_stats {
    uint64_t background_write_bytes = 0;
    uint64_t queued_write_bytes = 0;
    uint64_t throttled_writes = 0; // total number of writes ever delayed due to throttling
+    uint64_t background_writes_failed = 0;
 public:
    write_stats();
    write_stats(const sstring& category, bool auto_register_stats);
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -209,14 +209,15 @@ sstring storage_service::get_config_supported_features() {
        ROLES_FEATURE,
        LA_SSTABLE_FEATURE,
        STREAM_WITH_RPC_STREAM,
+        MATERIALIZED_VIEWS_FEATURE,
+        INDEXES_FEATURE
    };
    auto& config = service::get_local_storage_service()._db.local().get_config();
    if (config.enable_sstables_mc_format()) {
        features.push_back(MC_SSTABLE_FEATURE);
    }
    if (config.experimental()) {
-        features.push_back(MATERIALIZED_VIEWS_FEATURE);
-        features.push_back(INDEXES_FEATURE);
+        // push additional experimental features
    }
    return join(",", features);
 }
@@ -353,7 +354,7 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
                        gossiper.check_knows_remote_features(local_features, peer_features);
                    }

-                    gossiper.reset_endpoint_state_map();
+                    gossiper.reset_endpoint_state_map().get();
                    for (auto ep : loaded_endpoints) {
                        gossiper.add_saved_endpoint(ep);
                    }
@@ -367,7 +368,7 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
            slogger.info("Checking remote features with gossip");
            gossiper.do_shadow_round().get();
            gossiper.check_knows_remote_features(local_features);
-            gossiper.reset_endpoint_state_map();
+            gossiper.reset_endpoint_state_map().get();
            for (auto ep : loaded_endpoints) {
                gossiper.add_saved_endpoint(ep);
            }
@@ -432,11 +433,8 @@ void storage_service::register_features() {
    _la_sstable_feature = gms::feature(LA_SSTABLE_FEATURE);
    _stream_with_rpc_stream_feature = gms::feature(STREAM_WITH_RPC_STREAM);
    _mc_sstable_feature = gms::feature(MC_SSTABLE_FEATURE);
-
-    if (_db.local().get_config().experimental()) {
-        _materialized_views_feature = gms::feature(MATERIALIZED_VIEWS_FEATURE);
-        _indexes_feature = gms::feature(INDEXES_FEATURE);
-    }
+    _materialized_views_feature = gms::feature(MATERIALIZED_VIEWS_FEATURE);
+    _indexes_feature = gms::feature(INDEXES_FEATURE);
 }

 // Runs inside seastar::async context
@@ -446,6 +444,13 @@ void storage_service::join_token_ring(int delay) {
    get_storage_service().invoke_on_all([] (auto&& ss) {
        ss._joined = true;
    }).get();
+    if (!_is_survey_mode) {
+        supervisor::notify("starting system distributed keyspace");
+        _sys_dist_ks.start(
+                std::ref(cql3::get_query_processor()),
+                std::ref(service::get_migration_manager())).get();
+        _sys_dist_ks.invoke_on_all(&db::system_distributed_keyspace::start).get();
+    }
    // We bootstrap if we haven't successfully bootstrapped before, as long as we are not a seed.
    // If we are a seed, or if the user manually sets auto_bootstrap to false,
    // we'll skip streaming data from other nodes and jump directly into the ring.
@@ -618,12 +623,6 @@ void storage_service::join_token_ring(int delay) {

        supervisor::notify("starting tracing");
        tracing::tracing::start_tracing().get();
-
-        supervisor::notify("starting system distributed keyspace");
-        _sys_dist_ks.start(
-                std::ref(cql3::get_query_processor()),
-                std::ref(service::get_migration_manager())).get();
-        _sys_dist_ks.invoke_on_all(&db::system_distributed_keyspace::start).get();
    } else {
        slogger.info("Startup complete, but write survey mode is active, not becoming an active ring member. Use JMX (StorageService->joinRing()) to finalize ring joining.");
    }
@@ -1570,7 +1569,7 @@ future<> storage_service::check_for_endpoint_collision() {
                            throw std::runtime_error("Other bootstrapping/leaving/moving nodes detected, cannot bootstrap while consistent_rangemovement is true (check_for_endpoint_collision)");
                        } else {
                            gossiper.goto_shadow_round();
-                            gossiper.reset_endpoint_state_map();
+                            gossiper.reset_endpoint_state_map().get();
                            found_bootstrapping_node = true;
                            auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(gms::gossiper::clk::now() - t).count();
                            slogger.info("Checking bootstrapping/leaving/moving nodes: node={}, status={}, sleep 1 second and check again ({} seconds elapsed) (check_for_endpoint_collision)", addr, state, elapsed);
@@ -1582,7 +1581,7 @@ future<> storage_service::check_for_endpoint_collision() {
            }
        } while (found_bootstrapping_node);
        slogger.info("Checking bootstrapping/leaving/moving nodes: ok (check_for_endpoint_collision)");
-        gossiper.reset_endpoint_state_map();
+        gossiper.reset_endpoint_state_map().get();
    });
 }

@@ -1632,8 +1631,9 @@ future<std::unordered_set<token>> storage_service::prepare_replacement_info() {
        auto tokens = get_tokens_for(replace_address);
        // use the replacee's host Id as our own so we receive hints, etc
        return db::system_keyspace::set_local_host_id(host_id).discard_result().then([replace_address, tokens = std::move(tokens)] {
-            gms::get_local_gossiper().reset_endpoint_state_map(); // clean up since we have what we need
-            return make_ready_future<std::unordered_set<token>>(std::move(tokens));
+            return gms::get_local_gossiper().reset_endpoint_state_map().then([tokens = std::move(tokens)] { // clean up since we have what we need
+                return make_ready_future<std::unordered_set<token>>(std::move(tokens));
+            });
        });
    });
 }
@@ -2046,6 +2046,7 @@ future<> storage_service::start_rpc_server() {
        auto keepalive = cfg.rpc_keepalive();
        thrift_server_config tsc;
        tsc.timeout_config = make_timeout_config(cfg);
+        tsc.max_request_size = cfg.thrift_max_message_length_in_mb() * (uint64_t(1) << 20);
        return seastar::net::dns::resolve_name(addr).then([&ss, tserver, addr, port, keepalive, tsc] (seastar::net::inet_address ip) {
            return tserver->start(std::ref(ss._db), std::ref(cql3::get_query_processor()), std::ref(ss._auth_service), tsc).then([tserver, port, addr, ip, keepalive] {
                // #293 - do not stop anything
@@ -2473,7 +2474,7 @@ future<std::map<sstring, double>> storage_service::get_load_map() {
 future<> storage_service::rebuild(sstring source_dc) {
    return run_with_api_lock(sstring("rebuild"), [source_dc] (storage_service& ss) {
        slogger.info("rebuild from dc: {}", source_dc == "" ? "(any dc)" : source_dc);
-        auto streamer = make_lw_shared<dht::range_streamer>(ss._db, ss._token_metadata, ss.get_broadcast_address(), "Rebuild");
+        auto streamer = make_lw_shared<dht::range_streamer>(ss._db, ss._token_metadata, ss.get_broadcast_address(), "Rebuild", streaming::stream_reason::rebuild);
        streamer->add_source_filter(std::make_unique<dht::range_streamer::failure_detector_source_filter>(gms::get_local_failure_detector()));
        if (source_dc != "") {
            streamer->add_source_filter(std::make_unique<dht::range_streamer::single_datacenter_filter>(source_dc));
@@ -2610,7 +2611,7 @@ void storage_service::unbootstrap() {
 }

 future<> storage_service::restore_replica_count(inet_address endpoint, inet_address notify_endpoint) {
-    auto streamer = make_lw_shared<dht::range_streamer>(_db, get_token_metadata(), get_broadcast_address(), "Restore_replica_count");
+    auto streamer = make_lw_shared<dht::range_streamer>(_db, get_token_metadata(), get_broadcast_address(), "Restore_replica_count", streaming::stream_reason::removenode);
    auto my_address = get_broadcast_address();
    auto non_system_keyspaces = _db.local().get_non_system_keyspaces();
    for (const auto& keyspace_name : non_system_keyspaces) {
@@ -2729,7 +2730,7 @@ void storage_service::leave_ring() {

 future<>
 storage_service::stream_ranges(std::unordered_map<sstring, std::unordered_multimap<dht::token_range, inet_address>> ranges_to_stream_by_keyspace) {
-    auto streamer = make_lw_shared<dht::range_streamer>(_db, get_token_metadata(), get_broadcast_address(), "Unbootstrap");
+    auto streamer = make_lw_shared<dht::range_streamer>(_db, get_token_metadata(), get_broadcast_address(), "Unbootstrap", streaming::stream_reason::decommission);
    for (auto& entry : ranges_to_stream_by_keyspace) {
        const auto& keyspace = entry.first;
        auto& ranges_with_endpoints = entry.second;
--- a/sstables/checksum_utils.hh
+++ b/sstables/checksum_utils.hh
@@ -22,6 +22,9 @@
 #pragma once

 #include <zlib.h>
+#include <seastar/util/gcc6-concepts.hh>
+#include "libdeflate/libdeflate.h"
+#include "utils/gz/crc_combine.hh"

 GCC6_CONCEPT(
 template<typename Checksum>
@@ -30,6 +33,11 @@ concept bool ChecksumUtils = requires(const char* input, size_t size, uint32_t c
    { Checksum::checksum(input, size) } -> uint32_t;
    { Checksum::checksum(checksum, input, size) } -> uint32_t;
    { Checksum::checksum_combine(checksum, checksum, size) } -> uint32_t;
+
+    // Tells whether checksum_combine() should be preferred over checksum().
+    // For some checksummers it's faster to re-feed the buffer to checksum() than to
+    // combine the checksum of the buffer.
+    { Checksum::prefer_combine() } -> bool;
 };
 )

@@ -52,9 +60,11 @@ struct adler32_utils {
    inline static uint32_t checksum_combine(uint32_t first, uint32_t second, size_t input_len2) {
        return adler32_combine(first, second, input_len2);
    }
+
+    static constexpr bool prefer_combine() { return true; }
 };

-struct crc32_utils {
+struct zlib_crc32_checksummer {
    inline static uint32_t init_checksum() {
        return crc32(0, Z_NULL, 0);
    }
@@ -73,5 +83,55 @@ struct crc32_utils {
    inline static uint32_t checksum_combine(uint32_t first, uint32_t second, size_t input_len2) {
        return crc32_combine(first, second, input_len2);
    }
+
+    static constexpr bool prefer_combine() { return false; } // crc32_combine() is very slow
 };

+struct libdeflate_crc32_checksummer { // CRC32 checksummer whose checksum() is computed by libdeflate_crc32().
+    static uint32_t init_checksum() {
+        return 0;
+    }
+    // Checksum of a single buffer, seeded with init_checksum().
+    static uint32_t checksum(const char* input, size_t input_len) {
+        return checksum(init_checksum(), input, input_len);
+    }
+    // Continues the running checksum `prev` over `input` via libdeflate_crc32().
+    static uint32_t checksum(uint32_t prev, const char* input, size_t input_len) {
+        return libdeflate_crc32(prev, input, input_len);
+    }
+    // Combining is delegated to zlib_crc32_checksummer::checksum_combine(), i.e. zlib's crc32_combine().
+    static uint32_t checksum_combine(uint32_t first, uint32_t second, size_t input_len2) {
+        return zlib_crc32_checksummer::checksum_combine(first, second, input_len2);
+    }
+    // Same answer as zlib_crc32_checksummer, whose combine is reused here and is noted there as very slow.
+    static constexpr bool prefer_combine() { return false; }
+};
+
+template<typename Checksum> // Checksum must provide prefer_combine(), checksum_combine() and checksum(prev, input, len).
+inline uint32_t checksum_combine_or_feed(uint32_t first, uint32_t second, const char* input, size_t input_len) {
+    if constexpr (Checksum::prefer_combine()) { // resolved at compile time; only one branch is instantiated
+        return Checksum::checksum_combine(first, second, input_len); // `input` is not read on this path
+    } else {
+        return Checksum::checksum(first, input, input_len); // `second` is ignored; presumably it is the checksum of `input` - confirm with callers
+    }
+}
+
+struct crc32_utils { // CRC32 helpers: checksumming forwards to libdeflate_crc32_checksummer, combining uses fast_crc32_combine().
+    static uint32_t init_checksum() { return libdeflate_crc32_checksummer::init_checksum(); }
+    // Checksum of a single buffer; forwards to libdeflate_crc32_checksummer.
+    static uint32_t checksum(const char* input, size_t input_len) {
+        return libdeflate_crc32_checksummer::checksum(input, input_len);
+    }
+    // Continues the running checksum `prev` over `input`; forwards to libdeflate_crc32_checksummer.
+    static uint32_t checksum(uint32_t prev, const char* input, size_t input_len) {
+        return libdeflate_crc32_checksummer::checksum(prev, input, input_len);
+    }
+    // Unlike libdeflate_crc32_checksummer, combining goes through fast_crc32_combine() rather than zlib.
+    static uint32_t checksum_combine(uint32_t first, uint32_t second, size_t input_len2) {
+        return fast_crc32_combine(first, second, input_len2);
+    }
+    // NOTE(review): presumably true only when an optimized fast_crc32_combine() is available - confirm in utils/gz/crc_combine.hh.
+    static constexpr bool prefer_combine() {
+        return fast_crc32_combine_optimized();
+    }
+};
--- a/sstables/column_translation.hh
+++ b/sstables/column_translation.hh
@@ -53,63 +53,76 @@ inline column_values_fixed_lengths get_clustering_values_fixed_lengths(const ser
 * This way we don't need to looku them up by column name every time.
 */
 class column_translation {
+public:
+    struct column_info { // One entry per column, as produced by state::build().
+        // Disengaged 'id' means the column is missing from the current schema
+        std::optional<column_id> id;
+        std::optional<uint32_t> value_length; // filled from the column type's value_length_if_fixed()
+        bool is_collection; // filled from is_multi_cell()
+        bool is_counter; // filled from is_counter()
+    };
+
+private:

    struct state {

-        static std::tuple<std::vector<std::optional<column_id>>,
-                          std::vector<std::optional<uint32_t>>,
-                          std::vector<bool>,
-                          std::vector<bool>> build(
+        static std::vector<column_info> build(
                const schema& s,
                const utils::chunked_vector<serialization_header::column_desc>& src,
                bool is_static) {
-            std::vector<std::optional<column_id>> ids;
-            std::vector<std::optional<column_id>> lens;
-            std::vector<bool> is_collection;
-            std::vector<bool> is_counter;
+            std::vector<column_info> cols;
            if (s.is_dense()) {
                if (is_static) {
-                    ids.push_back(s.static_begin()->id);
-                    lens.push_back(s.static_begin()->type->value_length_if_fixed());
-                    is_collection.push_back(s.static_begin()->is_multi_cell());
-                    is_counter.push_back(s.static_begin()->is_counter());
+                    cols.push_back(column_info{
+                        s.static_begin()->id,
+                        s.static_begin()->type->value_length_if_fixed(),
+                        s.static_begin()->is_multi_cell(),
+                        s.static_begin()->is_counter()
+                    });
                } else {
-                    ids.push_back(s.regular_begin()->id);
-                    lens.push_back(s.regular_begin()->type->value_length_if_fixed());
-                    is_collection.push_back(s.regular_begin()->is_multi_cell());
-                    is_counter.push_back(s.regular_begin()->is_counter());
+                    cols.push_back(column_info{
+                        s.regular_begin()->id,
+                        s.regular_begin()->type->value_length_if_fixed(),
+                        s.regular_begin()->is_multi_cell(),
+                        s.regular_begin()->is_counter()
+                    });
                }
            } else {
-                ids.reserve(src.size());
-                lens.reserve(src.size());
+                cols.reserve(src.size());
                for (auto&& desc : src) {
+                    const bytes& type_name = desc.type_name.value;
+                    data_type type = db::marshal::type_parser::parse(to_sstring_view(type_name));
                    const column_definition* def = s.get_column_definition(desc.name.value);
+                    std::optional<column_id> id;
                    if (def) {
-                        ids.push_back(def->id);
-                        lens.push_back(def->type->value_length_if_fixed());
-                        is_collection.push_back(def->is_multi_cell());
-                        is_counter.push_back(def->is_counter());
-                    } else {
-                        ids.push_back(std::nullopt);
-                        lens.push_back(std::nullopt);
-                        is_collection.push_back(false);
-                        is_counter.push_back(false);
+                        if (def->is_multi_cell() != type->is_multi_cell() || def->is_counter() != type->is_counter()) {
+                            throw malformed_sstable_exception(sprint(
+                                    "{} definition in serialization header does not match schema. "
+                                    "Schema collection = {}, counter = {}. Header collection = {}, counter = {}",
+                                    def->name(),
+                                    def->is_multi_cell(),
+                                    def->is_counter(),
+                                    type->is_multi_cell(),
+                                    type->is_counter()));
+                        }
+                        id = def->id;
                    }
+                    cols.push_back(column_info{
+                        id,
+                        type->value_length_if_fixed(),
+                        type->is_multi_cell(),
+                        type->is_counter()
+                    });
                }
+                boost::range::stable_partition(cols, [](const column_info& column) { return !column.is_collection; });
            }
-            return std::make_tuple(std::move(ids), std::move(lens), std::move(is_collection), std::move(is_counter));
+            return cols;
        }

        utils::UUID schema_uuid;
-        std::vector<std::optional<column_id>> regular_schema_column_id_from_sstable;
-        std::vector<std::optional<column_id>> static_schema_column_id_from_sstable;
-        column_values_fixed_lengths regular_column_value_fix_lengths;
-        column_values_fixed_lengths static_column_value_fix_lengths;
+        std::vector<column_info> regular_schema_columns_from_sstable;
+        std::vector<column_info> static_schema_columns_from_sstable;
        column_values_fixed_lengths clustering_column_value_fix_lengths;
-        std::vector<bool> static_column_is_collection;
-        std::vector<bool> regular_column_is_collection;
-        std::vector<bool> static_column_is_counter;
-        std::vector<bool> regular_column_is_counter;

        state() = default;
        state(const state&) = delete;
@@ -118,19 +131,11 @@ class column_translation {
        state& operator=(state&&) = default;

        state(const schema& s, const serialization_header& header)
-                : schema_uuid(s.version()) {
-            std::tie(regular_schema_column_id_from_sstable,
-                     regular_column_value_fix_lengths,
-                     regular_column_is_collection,
-                     regular_column_is_counter) =
-                    build(s, header.regular_columns.elements, false);
-            std::tie(static_schema_column_id_from_sstable,
-                     static_column_value_fix_lengths,
-                     static_column_is_collection,
-                     static_column_is_counter) =
-                    build(s, header.static_columns.elements, true);
-            clustering_column_value_fix_lengths = get_clustering_values_fixed_lengths(header);
-        }
+            : schema_uuid(s.version())
+            , regular_schema_columns_from_sstable(build(s, header.regular_columns.elements, false))
+            , static_schema_columns_from_sstable(build(s, header.static_columns.elements, true))
+            , clustering_column_value_fix_lengths (get_clustering_values_fixed_lengths(header))
+        {}
    };

    lw_shared_ptr<const state> _state = make_lw_shared<const state>();
@@ -143,33 +148,15 @@ public:
        return *this;
    }

-    const std::vector<std::optional<column_id>>& regular_columns() const {
-        return _state->regular_schema_column_id_from_sstable;
+    const std::vector<column_info>& regular_columns() const {
+        return _state->regular_schema_columns_from_sstable;
    }
-    const std::vector<std::optional<column_id>>& static_columns() const {
-        return _state->static_schema_column_id_from_sstable;
-    }
-    const std::vector<std::optional<uint32_t>>& regular_column_value_fix_legths() const {
-        return _state->regular_column_value_fix_lengths;
-    }
-    const std::vector<std::optional<uint32_t>>& static_column_value_fix_legths() const {
-        return _state->static_column_value_fix_lengths;
+    const std::vector<column_info>& static_columns() const {
+        return _state->static_schema_columns_from_sstable;
    }
    const std::vector<std::optional<uint32_t>>& clustering_column_value_fix_legths() const {
        return _state->clustering_column_value_fix_lengths;
    }
-    const std::vector<bool>& static_column_is_collection() const {
-        return _state->static_column_is_collection;
-    }
-    const std::vector<bool>& regular_column_is_collection() const {
-        return _state->regular_column_is_collection;
-    }
-    const std::vector<bool>& static_column_is_counter() const {
-        return _state->static_column_is_counter;
-    }
-    const std::vector<bool>& regular_column_is_counter() const {
-        return _state->regular_column_is_counter;
-    }
 };

 };   // namespace sstables
--- a/sstables/compaction.cc
+++ b/sstables/compaction.cc
@@ -531,11 +531,11 @@ public:
    }

    void report_start(const sstring& formatted_msg) const override {
-        clogger.debug("Compacting {}", formatted_msg);
+        clogger.info("Compacting {}", formatted_msg);
    }

    void report_finish(const sstring& formatted_msg, std::chrono::time_point<db_clock> ended_at) const override {
-        clogger.debug("Compacted {}", formatted_msg);
+        clogger.info("Compacted {}", formatted_msg);
    }

    void backlog_tracker_adjust_charges() override {
@@ -818,7 +818,10 @@ get_fully_expired_sstables(column_family& cf, const std::vector<sstables::shared
    auto compacted_undeleted_gens = boost::copy_range<std::unordered_set<int64_t>>(cf.compacted_undeleted_sstables()
        | boost::adaptors::transformed(std::mem_fn(&sstables::sstable::generation)));
    auto has_undeleted_ancestor = [&compacted_undeleted_gens] (auto& candidate) {
-        return boost::algorithm::any_of(candidate->ancestors(), [&compacted_undeleted_gens] (auto gen) {
+        // Get ancestors from metadata collector which is empty after restart. It works for this purpose because
+        // we only need to check that a sstable compacted *in this instance* hasn't an ancestor undeleted.
+        // Not getting it from sstable metadata because mc format hasn't it available.
+        return boost::algorithm::any_of(candidate->get_metadata_collector().ancestors(), [&compacted_undeleted_gens] (auto gen) {
            return compacted_undeleted_gens.count(gen);
        });
    };
--- a/Show More
+++ b/Show More